224 files changed, 6908 insertions, 3276 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index a396261b79..82c814a919 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -63,6 +63,17 @@ W: http://wiki.qemu.org/SecurityProcess
 M: Michael S. Tsirkin <mst@redhat.com>
 L: secalert@redhat.com
 
+Trivial patches
+---------------
+Trivial patches
+M: Michael Tokarev <mjt@tls.msk.ru>
+M: Laurent Vivier <laurent@vivier.eu>
+S: Maintained
+L: qemu-trivial@nongnu.org
+K: ^Subject:.*(?i)trivial
+T: git git://git.corpit.ru/qemu.git trivial-patches
+T: git git://github.com/vivier/qemu.git trivial-patches
+
 Guest CPU cores (TCG):
 ----------------------
 Overall
@@ -1428,6 +1439,14 @@ F: util/uuid.c
 F: include/qemu/uuid.h
 F: tests/test-uuid.c
 
+COLO Framework
+M: zhanghailiang <zhang.zhanghailiang@huawei.com>
+S: Maintained
+F: migration/colo*
+F: include/migration/colo.h
+F: include/migration/failover.h
+F: docs/COLO-FT.txt
+
 COLO Proxy
 M: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
 M: Li Zhijian <lizhijian@cn.fujitsu.com>
@@ -1498,8 +1517,8 @@ F: tcg/mips/
 F: disas/mips.c
 
 PPC
-M: Vassili Karpov (malc) <av1474@comtv.ru>
-S: Maintained
+M: Richard Henderson <rth@twiddle.net>
+S: Odd Fixes
 F: tcg/ppc/
 F: disas/ppc.c
 
diff --git a/Makefile b/Makefile
index 11f5154c81..474cc5e66a 100644
--- a/Makefile
+++ b/Makefile
@@ -695,7 +695,7 @@ help:
 	@echo  ''
 ifdef CONFIG_WIN32
 	@echo  'Windows targets:'
-	@echo  '  installer       - Build NSIS-based installer for qemu-ga'
+	@echo  '  installer       - Build NSIS-based installer for QEMU'
 ifdef QEMU_GA_MSI_ENABLED
 	@echo  '  msi             - Build MSI-based installer for qemu-ga'
 endif
diff --git a/accel.c b/accel.c
index 403eb5e94d..664bb88422 100644
--- a/accel.c
+++ b/accel.c
@@ -33,7 +33,6 @@
 #include "sysemu/qtest.h"
 #include "hw/xen/xen.h"
 #include "qom/object.h"
-#include "hw/boards.h"
 
 int tcg_tb_size;
 static bool tcg_allowed = true;
diff --git a/async.c b/async.c
index f30d011ebc..b2de360c23 100644
--- a/async.c
+++ b/async.c
@@ -61,6 +61,7 @@ void aio_bh_schedule_oneshot(AioContext *ctx, QEMUBHFunc *cb, void *opaque)
     smp_wmb();
     ctx->first_bh = bh;
     qemu_mutex_unlock(&ctx->bh_lock);
+    aio_notify(ctx);
 }
 
 QEMUBH *aio_bh_new(AioContext *ctx, QEMUBHFunc *cb, void *opaque)
@@ -106,8 +107,8 @@ int aio_bh_poll(AioContext *ctx)
          * aio_notify again if necessary.
          */
         if (atomic_xchg(&bh->scheduled, 0)) {
-            /* Idle BHs and the notify BH don't count as progress */
-            if (!bh->idle && bh != ctx->notify_dummy_bh) {
+            /* Idle BHs don't count as progress */
+            if (!bh->idle) {
                 ret = 1;
             }
             bh->idle = 0;
@@ -259,7 +260,6 @@ aio_ctx_finalize(GSource     *source)
 {
     AioContext *ctx = (AioContext *) source;
 
-    qemu_bh_delete(ctx->notify_dummy_bh);
     thread_pool_free(ctx->thread_pool);
 
 #ifdef CONFIG_LINUX_AIO
@@ -284,7 +284,7 @@ aio_ctx_finalize(GSource     *source)
 
     aio_set_event_notifier(ctx, &ctx->notifier, false, NULL);
     event_notifier_cleanup(&ctx->notifier);
-    rfifolock_destroy(&ctx->lock);
+    qemu_rec_mutex_destroy(&ctx->lock);
     qemu_mutex_destroy(&ctx->bh_lock);
     timerlistgroup_deinit(&ctx->tlg);
 }
@@ -345,19 +345,6 @@ static void aio_timerlist_notify(void *opaque)
     aio_notify(opaque);
 }
 
-static void aio_rfifolock_cb(void *opaque)
-{
-    AioContext *ctx = opaque;
-
-    /* Kick owner thread in case they are blocked in aio_poll() */
-    qemu_bh_schedule(ctx->notify_dummy_bh);
-}
-
-static void notify_dummy_bh(void *opaque)
-{
-    /* Do nothing, we were invoked just to force the event loop to iterate */
-}
-
 static void event_notifier_dummy_cb(EventNotifier *e)
 {
 }
@@ -385,11 +372,9 @@ AioContext *aio_context_new(Error **errp)
 #endif
     ctx->thread_pool = NULL;
     qemu_mutex_init(&ctx->bh_lock);
-    rfifolock_init(&ctx->lock, aio_rfifolock_cb, ctx);
+    qemu_rec_mutex_init(&ctx->lock);
     timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx);
 
-    ctx->notify_dummy_bh = aio_bh_new(ctx, notify_dummy_bh, NULL);
-
     return ctx;
 fail:
     g_source_destroy(&ctx->source);
@@ -408,10 +393,10 @@ void aio_context_unref(AioContext *ctx)
 
 void aio_context_acquire(AioContext *ctx)
 {
-    rfifolock_lock(&ctx->lock);
+    qemu_rec_mutex_lock(&ctx->lock);
 }
 
 void aio_context_release(AioContext *ctx)
 {
-    rfifolock_unlock(&ctx->lock);
+    qemu_rec_mutex_unlock(&ctx->lock);
 }
diff --git a/block.c b/block.c
index 7f3e7bcdc3..c19c6c6d42 100644
--- a/block.c
+++ b/block.c
@@ -1428,9 +1428,11 @@ void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
             backing_hd->drv ? backing_hd->drv->format_name : "");
 
     bdrv_op_block_all(backing_hd, bs->backing_blocker);
-    /* Otherwise we won't be able to commit due to check in bdrv_commit */
+    /* Otherwise we won't be able to commit or stream */
     bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_COMMIT_TARGET,
                     bs->backing_blocker);
+    bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_STREAM,
+                    bs->backing_blocker);
     /*
      * We do backup in 3 ways:
      * 1. drive backup
@@ -2082,7 +2084,7 @@ BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
  * to all devices.
  *
  */
-int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
+int bdrv_reopen_multiple(AioContext *ctx, BlockReopenQueue *bs_queue, Error **errp)
 {
     int ret = -1;
     BlockReopenQueueEntry *bs_entry, *next;
@@ -2090,7 +2092,9 @@ int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
 
     assert(bs_queue != NULL);
 
-    bdrv_drain_all();
+    aio_context_release(ctx);
+    bdrv_drain_all_begin();
+    aio_context_acquire(ctx);
 
     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
         if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
@@ -2120,6 +2124,9 @@ cleanup:
         g_free(bs_entry);
     }
     g_free(bs_queue);
+
+    bdrv_drain_all_end();
+
     return ret;
 }
 
@@ -2131,7 +2138,7 @@ int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
     Error *local_err = NULL;
     BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, NULL, bdrv_flags);
 
-    ret = bdrv_reopen_multiple(queue, &local_err);
+    ret = bdrv_reopen_multiple(bdrv_get_aio_context(bs), queue, &local_err);
     if (local_err != NULL) {
         error_propagate(errp, local_err);
     }
diff --git a/block/backup.c b/block/backup.c
index 02dbe48035..7b5d8a3757 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -16,7 +16,7 @@
 #include "trace.h"
 #include "block/block.h"
 #include "block/block_int.h"
-#include "block/blockjob.h"
+#include "block/blockjob_int.h"
 #include "block/block_backup.h"
 #include "qapi/error.h"
 #include "qapi/qmp/qerror.h"
@@ -300,6 +300,21 @@ void backup_cow_request_end(CowRequest *req)
     cow_request_end(req);
 }
 
+static void backup_drain(BlockJob *job)
+{
+    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
+
+    /* Need to keep a reference in case blk_drain triggers execution
+     * of backup_complete...
+     */
+    if (s->target) {
+        BlockBackend *target = s->target;
+        blk_ref(target);
+        blk_drain(target);
+        blk_unref(target);
+    }
+}
+
 static const BlockJobDriver backup_job_driver = {
     .instance_size          = sizeof(BackupBlockJob),
     .job_type               = BLOCK_JOB_TYPE_BACKUP,
@@ -307,6 +322,7 @@ static const BlockJobDriver backup_job_driver = {
     .commit                 = backup_commit,
     .abort                  = backup_abort,
     .attached_aio_context   = backup_attached_aio_context,
+    .drain                  = backup_drain,
 };
 
 static BlockErrorAction backup_error_action(BackupBlockJob *job,
@@ -331,6 +347,7 @@ static void backup_complete(BlockJob *job, void *opaque)
     BackupCompleteData *data = opaque;
 
     blk_unref(s->target);
+    s->target = NULL;
 
     block_job_completed(job, data->ret);
     g_free(data);
@@ -429,7 +446,6 @@ static void coroutine_fn backup_run(void *opaque)
     BackupBlockJob *job = opaque;
     BackupCompleteData *data;
     BlockDriverState *bs = blk_bs(job->common.blk);
-    BlockBackend *target = job->target;
     int64_t start, end;
     int64_t sectors_per_cluster = cluster_size_sectors(job);
     int ret = 0;
@@ -516,8 +532,6 @@ static void coroutine_fn backup_run(void *opaque)
     qemu_co_rwlock_unlock(&job->flush_rwlock);
     g_free(job->done_bitmap);
 
-    bdrv_op_unblock_all(blk_bs(target), job->common.blocker);
-
     data = g_malloc(sizeof(*data));
     data->ret = ret;
     block_job_defer_to_main_loop(&job->common, backup_complete, data);
@@ -529,6 +543,7 @@ void backup_start(const char *job_id, BlockDriverState *bs,
                   bool compress,
                   BlockdevOnError on_source_error,
                   BlockdevOnError on_target_error,
+                  int creation_flags,
                   BlockCompletionFunc *cb, void *opaque,
                   BlockJobTxn *txn, Error **errp)
 {
@@ -598,7 +613,7 @@ void backup_start(const char *job_id, BlockDriverState *bs,
     }
 
     job = block_job_create(job_id, &backup_job_driver, bs, speed,
-                           cb, opaque, errp);
+                           creation_flags, cb, opaque, errp);
     if (!job) {
         goto error;
     }
@@ -631,7 +646,7 @@ void backup_start(const char *job_id, BlockDriverState *bs,
         job->cluster_size = MAX(BACKUP_CLUSTER_SIZE_DEFAULT, bdi.cluster_size);
     }
 
-    bdrv_op_block_all(target, job->common.blocker);
+    block_job_add_bdrv(&job->common, target);
     job->common.len = len;
     job->common.co = qemu_coroutine_create(backup_run, job);
     block_job_txn_add_job(txn, &job->common);
diff --git a/block/block-backend.c b/block/block-backend.c
index c53ca30000..27a7f6f523 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -799,20 +799,25 @@ int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
                                BdrvRequestFlags flags)
 {
     int ret;
+    BlockDriverState *bs = blk_bs(blk);
 
-    trace_blk_co_preadv(blk, blk_bs(blk), offset, bytes, flags);
+    trace_blk_co_preadv(blk, bs, offset, bytes, flags);
 
     ret = blk_check_byte_request(blk, offset, bytes);
     if (ret < 0) {
         return ret;
     }
 
+    bdrv_inc_in_flight(bs);
+
     /* throttling disk I/O */
     if (blk->public.throttle_state) {
         throttle_group_co_io_limits_intercept(blk, bytes, false);
     }
 
-    return bdrv_co_preadv(blk->root, offset, bytes, qiov, flags);
+    ret = bdrv_co_preadv(blk->root, offset, bytes, qiov, flags);
+    bdrv_dec_in_flight(bs);
+    return ret;
 }
 
 int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
@@ -820,14 +825,17 @@ int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
                                 BdrvRequestFlags flags)
 {
     int ret;
+    BlockDriverState *bs = blk_bs(blk);
 
-    trace_blk_co_pwritev(blk, blk_bs(blk), offset, bytes, flags);
+    trace_blk_co_pwritev(blk, bs, offset, bytes, flags);
 
     ret = blk_check_byte_request(blk, offset, bytes);
     if (ret < 0) {
         return ret;
     }
 
+    bdrv_inc_in_flight(bs);
+
     /* throttling disk I/O */
     if (blk->public.throttle_state) {
         throttle_group_co_io_limits_intercept(blk, bytes, true);
@@ -837,7 +845,9 @@ int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
         flags |= BDRV_REQ_FUA;
     }
 
-    return bdrv_co_pwritev(blk->root, offset, bytes, qiov, flags);
+    ret = bdrv_co_pwritev(blk->root, offset, bytes, qiov, flags);
+    bdrv_dec_in_flight(bs);
+    return ret;
 }
 
 typedef struct BlkRwCo {
@@ -868,7 +878,6 @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
                    int64_t bytes, CoroutineEntry co_entry,
                    BdrvRequestFlags flags)
 {
-    AioContext *aio_context;
     QEMUIOVector qiov;
     struct iovec iov;
     Coroutine *co;
@@ -890,11 +899,7 @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
 
     co = qemu_coroutine_create(co_entry, &rwco);
     qemu_coroutine_enter(co);
-
-    aio_context = blk_get_aio_context(blk);
-    while (rwco.ret == NOT_DONE) {
-        aio_poll(aio_context, true);
-    }
+    BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
 
     return rwco.ret;
 }
@@ -930,6 +935,8 @@ int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
 static void error_callback_bh(void *opaque)
 {
     struct BlockBackendAIOCB *acb = opaque;
+
+    bdrv_dec_in_flight(acb->common.bs);
     acb->common.cb(acb->common.opaque, acb->ret);
     qemu_aio_unref(acb);
 }
@@ -940,6 +947,7 @@ BlockAIOCB *blk_abort_aio_request(BlockBackend *blk,
 {
     struct BlockBackendAIOCB *acb;
 
+    bdrv_inc_in_flight(blk_bs(blk));
     acb = blk_aio_get(&block_backend_aiocb_info, blk, cb, opaque);
     acb->blk = blk;
     acb->ret = ret;
@@ -962,6 +970,7 @@ static const AIOCBInfo blk_aio_em_aiocb_info = {
 static void blk_aio_complete(BlkAioEmAIOCB *acb)
 {
     if (acb->has_returned) {
+        bdrv_dec_in_flight(acb->common.bs);
         acb->common.cb(acb->common.opaque, acb->rwco.ret);
         qemu_aio_unref(acb);
     }
@@ -983,6 +992,7 @@ static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
     BlkAioEmAIOCB *acb;
     Coroutine *co;
 
+    bdrv_inc_in_flight(blk_bs(blk));
     acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
     acb->rwco = (BlkRwCo) {
         .blk    = blk,
diff --git a/block/commit.c b/block/commit.c
index 9f67a8b121..e1eda8908b 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -15,7 +15,7 @@
 #include "qemu/osdep.h"
 #include "trace.h"
 #include "block/block_int.h"
-#include "block/blockjob.h"
+#include "block/blockjob_int.h"
 #include "qapi/error.h"
 #include "qapi/qmp/qerror.h"
 #include "qemu/ratelimit.h"
@@ -209,13 +209,14 @@ static const BlockJobDriver commit_job_driver = {
 
 void commit_start(const char *job_id, BlockDriverState *bs,
                   BlockDriverState *base, BlockDriverState *top, int64_t speed,
-                  BlockdevOnError on_error, BlockCompletionFunc *cb,
-                  void *opaque, const char *backing_file_str, Error **errp)
+                  BlockdevOnError on_error, const char *backing_file_str,
+                  Error **errp)
 {
     CommitBlockJob *s;
     BlockReopenQueue *reopen_queue = NULL;
     int orig_overlay_flags;
     int orig_base_flags;
+    BlockDriverState *iter;
     BlockDriverState *overlay_bs;
     Error *local_err = NULL;
 
@@ -233,7 +234,7 @@ void commit_start(const char *job_id, BlockDriverState *bs,
     }
 
     s = block_job_create(job_id, &commit_job_driver, bs, speed,
-                         cb, opaque, errp);
+                         BLOCK_JOB_DEFAULT, NULL, NULL, errp);
     if (!s) {
         return;
     }
@@ -251,7 +252,7 @@ void commit_start(const char *job_id, BlockDriverState *bs,
                                          orig_overlay_flags | BDRV_O_RDWR);
     }
     if (reopen_queue) {
-        bdrv_reopen_multiple(reopen_queue, &local_err);
+        bdrv_reopen_multiple(bdrv_get_aio_context(bs), reopen_queue, &local_err);
         if (local_err != NULL) {
             error_propagate(errp, local_err);
             block_job_unref(&s->common);
@@ -260,6 +261,19 @@ void commit_start(const char *job_id, BlockDriverState *bs,
     }
 
 
+    /* Block all nodes between top and base, because they will
+     * disappear from the chain after this operation. */
+    assert(bdrv_chain_contains(top, base));
+    for (iter = top; iter != backing_bs(base); iter = backing_bs(iter)) {
+        block_job_add_bdrv(&s->common, iter);
+    }
+    /* overlay_bs must be blocked because it needs to be modified to
+     * update the backing image string, but if it's the root node then
+     * don't block it again */
+    if (bs != overlay_bs) {
+        block_job_add_bdrv(&s->common, overlay_bs);
+    }
+
     s->base = blk_new();
     blk_insert_bs(s->base, base);
 
@@ -276,7 +290,7 @@ void commit_start(const char *job_id, BlockDriverState *bs,
     s->on_error = on_error;
     s->common.co = qemu_coroutine_create(commit_run, s);
 
-    trace_commit_start(bs, base, top, s, s->common.co, opaque);
+    trace_commit_start(bs, base, top, s, s->common.co);
     qemu_coroutine_enter(s->common.co);
 }
 
diff --git a/block/gluster.c b/block/gluster.c
index af76d7d59a..0ce15f7adc 100644
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -14,6 +14,7 @@
 #include "qapi/qmp/qerror.h"
 #include "qemu/uri.h"
 #include "qemu/error-report.h"
+#include "qemu/cutils.h"
 
 #define GLUSTER_OPT_FILENAME        "filename"
 #define GLUSTER_OPT_VOLUME          "volume"
@@ -56,6 +57,19 @@ typedef struct BDRVGlusterReopenState {
 } BDRVGlusterReopenState;
 
 
+typedef struct GlfsPreopened {
+    char *volume;
+    glfs_t *fs;
+    int ref;
+} GlfsPreopened;
+
+typedef struct ListElement {
+    QLIST_ENTRY(ListElement) list;
+    GlfsPreopened saved;
+} ListElement;
+
+static QLIST_HEAD(glfs_list, ListElement) glfs_list;
+
 static QemuOptsList qemu_gluster_create_opts = {
     .name = "qemu-gluster-create-opts",
     .head = QTAILQ_HEAD_INITIALIZER(qemu_gluster_create_opts.head),
@@ -172,7 +186,7 @@ static QemuOptsList runtime_tcp_opts = {
         },
         {
             .name = GLUSTER_OPT_PORT,
-            .type = QEMU_OPT_NUMBER,
+            .type = QEMU_OPT_STRING,
             .help = "port number on which glusterd is listening (default 24007)",
         },
         {
@@ -194,6 +208,57 @@ static QemuOptsList runtime_tcp_opts = {
     },
 };
 
+static void glfs_set_preopened(const char *volume, glfs_t *fs)
+{
+    ListElement *entry = NULL;
+
+    entry = g_new(ListElement, 1);
+
+    entry->saved.volume = g_strdup(volume);
+
+    entry->saved.fs = fs;
+    entry->saved.ref = 1;
+
+    QLIST_INSERT_HEAD(&glfs_list, entry, list);
+}
+
+static glfs_t *glfs_find_preopened(const char *volume)
+{
+    ListElement *entry = NULL;
+
+     QLIST_FOREACH(entry, &glfs_list, list) {
+        if (strcmp(entry->saved.volume, volume) == 0) {
+            entry->saved.ref++;
+            return entry->saved.fs;
+        }
+     }
+
+    return NULL;
+}
+
+static void glfs_clear_preopened(glfs_t *fs)
+{
+    ListElement *entry = NULL;
+
+    if (fs == NULL) {
+        return;
+    }
+
+    QLIST_FOREACH(entry, &glfs_list, list) {
+        if (entry->saved.fs == fs) {
+            if (--entry->saved.ref) {
+                return;
+            }
+
+            QLIST_REMOVE(entry, list);
+
+            glfs_fini(entry->saved.fs);
+            g_free(entry->saved.volume);
+            g_free(entry);
+        }
+    }
+}
+
 static int parse_volume_options(BlockdevOptionsGluster *gconf, char *path)
 {
     char *p, *q;
@@ -330,22 +395,37 @@ static struct glfs *qemu_gluster_glfs_init(BlockdevOptionsGluster *gconf,
     int ret;
     int old_errno;
     GlusterServerList *server;
+    unsigned long long port;
+
+    glfs = glfs_find_preopened(gconf->volume);
+    if (glfs) {
+        return glfs;
+    }
 
     glfs = glfs_new(gconf->volume);
     if (!glfs) {
         goto out;
     }
 
+    glfs_set_preopened(gconf->volume, glfs);
+
     for (server = gconf->server; server; server = server->next) {
         if (server->value->type  == GLUSTER_TRANSPORT_UNIX) {
             ret = glfs_set_volfile_server(glfs,
                                    GlusterTransport_lookup[server->value->type],
                                    server->value->u.q_unix.path, 0);
         } else {
+            if (parse_uint_full(server->value->u.tcp.port, &port, 10) < 0 ||
+                port > 65535) {
+                error_setg(errp, "'%s' is not a valid port number",
+                           server->value->u.tcp.port);
+                errno = EINVAL;
+                goto out;
+            }
             ret = glfs_set_volfile_server(glfs,
                                    GlusterTransport_lookup[server->value->type],
                                    server->value->u.tcp.host,
-                                   atoi(server->value->u.tcp.port));
+                                   (int)port);
         }
 
         if (ret < 0) {
@@ -387,7 +467,7 @@ static struct glfs *qemu_gluster_glfs_init(BlockdevOptionsGluster *gconf,
 out:
     if (glfs) {
         old_errno = errno;
-        glfs_fini(glfs);
+        glfs_clear_preopened(glfs);
         errno = old_errno;
     }
     return NULL;
@@ -668,7 +748,10 @@ static void qemu_gluster_parse_flags(int bdrv_flags, int *open_flags)
  */
 static bool qemu_gluster_test_seek(struct glfs_fd *fd)
 {
-    off_t ret, eof;
+    off_t ret = 0;
+
+#if defined SEEK_HOLE && defined SEEK_DATA
+    off_t eof;
 
     eof = glfs_lseek(fd, 0, SEEK_END);
     if (eof < 0) {
@@ -678,6 +761,8 @@ static bool qemu_gluster_test_seek(struct glfs_fd *fd)
 
     /* this should always fail with ENXIO if SEEK_DATA is supported */
     ret = glfs_lseek(fd, eof, SEEK_DATA);
+#endif
+
     return (ret < 0) && (errno == ENXIO);
 }
 
@@ -762,9 +847,9 @@ out:
     if (s->fd) {
         glfs_close(s->fd);
     }
-    if (s->glfs) {
-        glfs_fini(s->glfs);
-    }
+
+    glfs_clear_preopened(s->glfs);
+
     return ret;
 }
 
@@ -831,9 +916,8 @@ static void qemu_gluster_reopen_commit(BDRVReopenState *state)
     if (s->fd) {
         glfs_close(s->fd);
     }
-    if (s->glfs) {
-        glfs_fini(s->glfs);
-    }
+
+    glfs_clear_preopened(s->glfs);
 
     /* use the newly opened image / connection */
     s->fd         = reop_s->fd;
@@ -858,9 +942,7 @@ static void qemu_gluster_reopen_abort(BDRVReopenState *state)
         glfs_close(reop_s->fd);
     }
 
-    if (reop_s->glfs) {
-        glfs_fini(reop_s->glfs);
-    }
+    glfs_clear_preopened(reop_s->glfs);
 
     g_free(state->opaque);
     state->opaque = NULL;
@@ -984,9 +1066,7 @@ static int qemu_gluster_create(const char *filename,
 out:
     g_free(tmp);
     qapi_free_BlockdevOptionsGluster(gconf);
-    if (glfs) {
-        glfs_fini(glfs);
-    }
+    glfs_clear_preopened(glfs);
     return ret;
 }
 
@@ -1059,7 +1139,7 @@ static void qemu_gluster_close(BlockDriverState *bs)
         glfs_close(s->fd);
         s->fd = NULL;
     }
-    glfs_fini(s->glfs);
+    glfs_clear_preopened(s->glfs);
 }
 
 static coroutine_fn int qemu_gluster_co_flush_to_disk(BlockDriverState *bs)
@@ -1178,12 +1258,14 @@ static int find_allocation(BlockDriverState *bs, off_t start,
                            off_t *data, off_t *hole)
 {
     BDRVGlusterState *s = bs->opaque;
-    off_t offs;
 
     if (!s->supports_seek_data) {
-        return -ENOTSUP;
+        goto exit;
     }
 
+#if defined SEEK_HOLE && defined SEEK_DATA
+    off_t offs;
+
     /*
      * SEEK_DATA cases:
      * D1. offs == start: start is in data
@@ -1247,6 +1329,10 @@ static int find_allocation(BlockDriverState *bs, off_t start,
 
     /* D1 and H1 */
     return -EBUSY;
+#endif
+
+exit:
+    return -ENOTSUP;
 }
 
 /*
diff --git a/block/io.c b/block/io.c
index 79cbbdf769..37749b680a 100644
--- a/block/io.c
+++ b/block/io.c
@@ -143,7 +143,7 @@ bool bdrv_requests_pending(BlockDriverState *bs)
 {
     BdrvChild *child;
 
-    if (!QLIST_EMPTY(&bs->tracked_requests)) {
+    if (atomic_read(&bs->in_flight)) {
         return true;
     }
 
@@ -156,16 +156,22 @@ bool bdrv_requests_pending(BlockDriverState *bs)
     return false;
 }
 
-static void bdrv_drain_recurse(BlockDriverState *bs)
+static bool bdrv_drain_recurse(BlockDriverState *bs)
 {
     BdrvChild *child;
+    bool waited;
+
+    waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0);
 
     if (bs->drv && bs->drv->bdrv_drain) {
         bs->drv->bdrv_drain(bs);
     }
+
     QLIST_FOREACH(child, &bs->children, next) {
-        bdrv_drain_recurse(child->bs);
+        waited |= bdrv_drain_recurse(child->bs);
     }
+
+    return waited;
 }
 
 typedef struct {
@@ -174,23 +180,14 @@ typedef struct {
     bool done;
 } BdrvCoDrainData;
 
-static void bdrv_drain_poll(BlockDriverState *bs)
-{
-    bool busy = true;
-
-    while (busy) {
-        /* Keep iterating */
-        busy = bdrv_requests_pending(bs);
-        busy |= aio_poll(bdrv_get_aio_context(bs), busy);
-    }
-}
-
 static void bdrv_co_drain_bh_cb(void *opaque)
 {
     BdrvCoDrainData *data = opaque;
     Coroutine *co = data->co;
+    BlockDriverState *bs = data->bs;
 
-    bdrv_drain_poll(data->bs);
+    bdrv_dec_in_flight(bs);
+    bdrv_drained_begin(bs);
     data->done = true;
     qemu_coroutine_enter(co);
 }
@@ -209,6 +206,7 @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs)
         .bs = bs,
         .done = false,
     };
+    bdrv_inc_in_flight(bs);
     aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
                             bdrv_co_drain_bh_cb, &data);
 
@@ -220,6 +218,11 @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs)
 
 void bdrv_drained_begin(BlockDriverState *bs)
 {
+    if (qemu_in_coroutine()) {
+        bdrv_co_yield_to_drain(bs);
+        return;
+    }
+
     if (!bs->quiesce_counter++) {
         aio_disable_external(bdrv_get_aio_context(bs));
         bdrv_parent_drained_begin(bs);
@@ -227,11 +230,6 @@ void bdrv_drained_begin(BlockDriverState *bs)
 
     bdrv_io_unplugged_begin(bs);
     bdrv_drain_recurse(bs);
-    if (qemu_in_coroutine()) {
-        bdrv_co_yield_to_drain(bs);
-    } else {
-        bdrv_drain_poll(bs);
-    }
     bdrv_io_unplugged_end(bs);
 }
 
@@ -275,11 +273,17 @@ void bdrv_drain(BlockDriverState *bs)
  *
  * This function does not flush data to disk, use bdrv_flush_all() for that
  * after calling this function.
+ *
+ * This pauses all block jobs and disables external clients. It must
+ * be paired with bdrv_drain_all_end().
+ *
+ * NOTE: no new block jobs or BlockDriverStates can be created between
+ * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
  */
-void bdrv_drain_all(void)
+void bdrv_drain_all_begin(void)
 {
     /* Always run first iteration so any pending completion BHs run */
-    bool busy = true;
+    bool waited = true;
     BlockDriverState *bs;
     BdrvNextIterator it;
     BlockJob *job = NULL;
@@ -299,7 +303,7 @@ void bdrv_drain_all(void)
         aio_context_acquire(aio_context);
         bdrv_parent_drained_begin(bs);
         bdrv_io_unplugged_begin(bs);
-        bdrv_drain_recurse(bs);
+        aio_disable_external(aio_context);
         aio_context_release(aio_context);
 
         if (!g_slist_find(aio_ctxs, aio_context)) {
@@ -313,8 +317,8 @@ void bdrv_drain_all(void)
      * request completion.  Therefore we must keep looping until there was no
      * more activity rather than simply draining each device independently.
      */
-    while (busy) {
-        busy = false;
+    while (waited) {
+        waited = false;
 
         for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
             AioContext *aio_context = ctx->data;
@@ -322,28 +326,32 @@ void bdrv_drain_all(void)
             aio_context_acquire(aio_context);
             for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
                 if (aio_context == bdrv_get_aio_context(bs)) {
-                    if (bdrv_requests_pending(bs)) {
-                        busy = true;
-                        aio_poll(aio_context, busy);
-                    }
+                    waited |= bdrv_drain_recurse(bs);
                 }
             }
-            busy |= aio_poll(aio_context, false);
             aio_context_release(aio_context);
         }
     }
 
+    g_slist_free(aio_ctxs);
+}
+
+void bdrv_drain_all_end(void)
+{
+    BlockDriverState *bs;
+    BdrvNextIterator it;
+    BlockJob *job = NULL;
+
     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
         AioContext *aio_context = bdrv_get_aio_context(bs);
 
         aio_context_acquire(aio_context);
+        aio_enable_external(aio_context);
         bdrv_io_unplugged_end(bs);
         bdrv_parent_drained_end(bs);
         aio_context_release(aio_context);
     }
-    g_slist_free(aio_ctxs);
 
-    job = NULL;
     while ((job = block_job_next(job))) {
         AioContext *aio_context = blk_get_aio_context(job->blk);
 
@@ -353,6 +361,12 @@ void bdrv_drain_all(void)
     }
 }
 
+void bdrv_drain_all(void)
+{
+    bdrv_drain_all_begin();
+    bdrv_drain_all_end();
+}
+
 /**
  * Remove an active request from the tracked requests list
  *
@@ -476,6 +490,28 @@ static bool tracked_request_overlaps(BdrvTrackedRequest *req,
     return true;
 }
 
+void bdrv_inc_in_flight(BlockDriverState *bs)
+{
+    atomic_inc(&bs->in_flight);
+}
+
+static void dummy_bh_cb(void *opaque)
+{
+}
+
+void bdrv_wakeup(BlockDriverState *bs)
+{
+    if (bs->wakeup) {
+        aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL);
+    }
+}
+
+void bdrv_dec_in_flight(BlockDriverState *bs)
+{
+    atomic_dec(&bs->in_flight);
+    bdrv_wakeup(bs);
+}
+
 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
 {
     BlockDriverState *bs = self->bs;
@@ -583,13 +619,9 @@ static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
         /* Fast-path if already in coroutine context */
         bdrv_rw_co_entry(&rwco);
     } else {
-        AioContext *aio_context = bdrv_get_aio_context(child->bs);
-
         co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco);
         qemu_coroutine_enter(co);
-        while (rwco.ret == NOT_DONE) {
-            aio_poll(aio_context, true);
-        }
+        BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
     }
     return rwco.ret;
 }
@@ -1097,6 +1129,8 @@ int coroutine_fn bdrv_co_preadv(BdrvChild *child,
         return ret;
     }
 
+    bdrv_inc_in_flight(bs);
+
     /* Don't do copy-on-read if we read data before write operation */
     if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) {
         flags |= BDRV_REQ_COPY_ON_READ;
@@ -1132,6 +1166,7 @@ int coroutine_fn bdrv_co_preadv(BdrvChild *child,
                               use_local_qiov ? &local_qiov : qiov,
                               flags);
     tracked_request_end(&req);
+    bdrv_dec_in_flight(bs);
 
     if (use_local_qiov) {
         qemu_iovec_destroy(&local_qiov);
@@ -1480,6 +1515,7 @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
         return ret;
     }
 
+    bdrv_inc_in_flight(bs);
     /*
      * Align write if necessary by performing a read-modify-write cycle.
      * Pad qiov with the read parts and be sure to have a tracked request not
@@ -1581,6 +1617,7 @@ fail:
     qemu_vfree(tail_buf);
 out:
     tracked_request_end(&req);
+    bdrv_dec_in_flight(bs);
     return ret;
 }
 
@@ -1705,17 +1742,19 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
     }
 
     *file = NULL;
+    bdrv_inc_in_flight(bs);
     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum,
                                             file);
     if (ret < 0) {
         *pnum = 0;
-        return ret;
+        goto out;
     }
 
     if (ret & BDRV_BLOCK_RAW) {
         assert(ret & BDRV_BLOCK_OFFSET_VALID);
-        return bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS,
-                                     *pnum, pnum, file);
+        ret = bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS,
+                                    *pnum, pnum, file);
+        goto out;
     }
 
     if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
@@ -1757,6 +1796,8 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
         }
     }
 
+out:
+    bdrv_dec_in_flight(bs);
     return ret;
 }
 
@@ -1822,14 +1863,10 @@ int64_t bdrv_get_block_status_above(BlockDriverState *bs,
         /* Fast-path if already in coroutine context */
         bdrv_get_block_status_above_co_entry(&data);
     } else {
-        AioContext *aio_context = bdrv_get_aio_context(bs);
-
         co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry,
                                    &data);
         qemu_coroutine_enter(co);
-        while (!data.done) {
-            aio_poll(aio_context, true);
-        }
+        BDRV_POLL_WHILE(bs, !data.done);
     }
     return data.ret;
 }
@@ -2102,6 +2139,7 @@ static const AIOCBInfo bdrv_em_co_aiocb_info = {
 static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
 {
     if (!acb->need_bh) {
+        bdrv_dec_in_flight(acb->common.bs);
         acb->common.cb(acb->common.opaque, acb->req.error);
         qemu_aio_unref(acb);
     }
@@ -2152,6 +2190,9 @@ static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child,
     Coroutine *co;
     BlockAIOCBCoroutine *acb;
 
+    /* Matched by bdrv_co_complete's bdrv_dec_in_flight.  */
+    bdrv_inc_in_flight(child->bs);
+
     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, child->bs, cb, opaque);
     acb->child = child;
     acb->need_bh = true;
@@ -2185,6 +2226,9 @@ BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
     Coroutine *co;
     BlockAIOCBCoroutine *acb;
 
+    /* Matched by bdrv_co_complete's bdrv_dec_in_flight.  */
+    bdrv_inc_in_flight(bs);
+
     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
     acb->need_bh = true;
     acb->req.error = -EINPROGRESS;
@@ -2244,23 +2288,22 @@ static void coroutine_fn bdrv_flush_co_entry(void *opaque)
 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
 {
     int ret;
-    BdrvTrackedRequest req;
 
     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
         bdrv_is_sg(bs)) {
         return 0;
     }
 
-    tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH);
+    bdrv_inc_in_flight(bs);
 
     int current_gen = bs->write_gen;
 
     /* Wait until any previous flushes are completed */
-    while (bs->active_flush_req != NULL) {
+    while (bs->active_flush_req) {
         qemu_co_queue_wait(&bs->flush_queue);
     }
 
-    bs->active_flush_req = &req;
+    bs->active_flush_req = true;
 
     /* Write back all layers by calling one driver function */
     if (bs->drv->bdrv_co_flush) {
@@ -2330,11 +2373,11 @@ flush_parent:
 out:
     /* Notify any pending flushes that we have completed */
     bs->flushed_gen = current_gen;
-    bs->active_flush_req = NULL;
+    bs->active_flush_req = false;
     /* Return value is ignored - it's ok if wait queue is empty */
     qemu_co_queue_next(&bs->flush_queue);
 
-    tracked_request_end(&req);
+    bdrv_dec_in_flight(bs);
     return ret;
 }
 
@@ -2350,13 +2393,9 @@ int bdrv_flush(BlockDriverState *bs)
         /* Fast-path if already in coroutine context */
         bdrv_flush_co_entry(&flush_co);
     } else {
-        AioContext *aio_context = bdrv_get_aio_context(bs);
-
         co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co);
         qemu_coroutine_enter(co);
-        while (flush_co.ret == NOT_DONE) {
-            aio_poll(aio_context, true);
-        }
+        BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE);
     }
 
     return flush_co.ret;
@@ -2417,6 +2456,7 @@ int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
         return 0;
     }
 
+    bdrv_inc_in_flight(bs);
     tracked_request_begin(&req, bs, offset, count, BDRV_TRACKED_DISCARD);
 
     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
@@ -2463,6 +2503,7 @@ out:
     bdrv_set_dirty(bs, req.offset >> BDRV_SECTOR_BITS,
                    req.bytes >> BDRV_SECTOR_BITS);
     tracked_request_end(&req);
+    bdrv_dec_in_flight(bs);
     return ret;
 }
 
@@ -2480,13 +2521,9 @@ int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int count)
         /* Fast-path if already in coroutine context */
         bdrv_pdiscard_co_entry(&rwco);
     } else {
-        AioContext *aio_context = bdrv_get_aio_context(bs);
-
         co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco);
         qemu_coroutine_enter(co);
-        while (rwco.ret == NOT_DONE) {
-            aio_poll(aio_context, true);
-        }
+        BDRV_POLL_WHILE(bs, rwco.ret == NOT_DONE);
     }
 
     return rwco.ret;
@@ -2495,13 +2532,12 @@ int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int count)
 int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
 {
     BlockDriver *drv = bs->drv;
-    BdrvTrackedRequest tracked_req;
     CoroutineIOCompletion co = {
         .coroutine = qemu_coroutine_self(),
     };
     BlockAIOCB *acb;
 
-    tracked_request_begin(&tracked_req, bs, 0, 0, BDRV_TRACKED_IOCTL);
+    bdrv_inc_in_flight(bs);
     if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
         co.ret = -ENOTSUP;
         goto out;
@@ -2518,7 +2554,7 @@ int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
         qemu_coroutine_yield();
     }
 out:
-    tracked_request_end(&tracked_req);
+    bdrv_dec_in_flight(bs);
     return co.ret;
 }
 
diff --git a/block/mirror.c b/block/mirror.c
index a433e6848c..b2c1fb855b 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -13,7 +13,7 @@
 
 #include "qemu/osdep.h"
 #include "trace.h"
-#include "block/blockjob.h"
+#include "block/blockjob_int.h"
 #include "block/block_int.h"
 #include "sysemu/block-backend.h"
 #include "qapi/error.h"
@@ -469,7 +469,11 @@ static void mirror_free_init(MirrorBlockJob *s)
     }
 }
 
-static void mirror_drain(MirrorBlockJob *s)
+/* This is also used for the .pause callback. There is no matching
+ * mirror_resume() because mirror_run() will begin iterating again
+ * when the job is resumed.
+ */
+static void mirror_wait_for_all_io(MirrorBlockJob *s)
 {
     while (s->in_flight > 0) {
         mirror_wait_for_io(s);
@@ -526,8 +530,8 @@ static void mirror_exit(BlockJob *job, void *opaque)
         aio_context_release(replace_aio_context);
     }
     g_free(s->replaces);
-    bdrv_op_unblock_all(target_bs, s->common.blocker);
     blk_unref(s->target);
+    s->target = NULL;
     block_job_completed(&s->common, data->ret);
     g_free(data);
     bdrv_drained_end(src);
@@ -582,7 +586,7 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
             sector_num += nb_sectors;
         }
 
-        mirror_drain(s);
+        mirror_wait_for_all_io(s);
     }
 
     /* First part, loop on the sectors and initialize the dirty bitmap.  */
@@ -617,6 +621,7 @@ static void coroutine_fn mirror_run(void *opaque)
     MirrorExitData *data;
     BlockDriverState *bs = blk_bs(s->common.blk);
     BlockDriverState *target_bs = blk_bs(s->target);
+    bool need_drain = true;
     int64_t length;
     BlockDriverInfo bdi;
     char backing_filename[2]; /* we only need 2 characters because we are only
@@ -752,11 +757,26 @@ static void coroutine_fn mirror_run(void *opaque)
              * source has dirty data to copy!
              *
              * Note that I/O can be submitted by the guest while
-             * mirror_populate runs.
+             * mirror_populate runs, so pause it now.  Before deciding
+             * whether to switch to target check one last time if I/O has
+             * come in the meanwhile, and if not flush the data to disk.
              */
             trace_mirror_before_drain(s, cnt);
-            bdrv_co_drain(bs);
+
+            bdrv_drained_begin(bs);
             cnt = bdrv_get_dirty_count(s->dirty_bitmap);
+            if (cnt > 0) {
+                bdrv_drained_end(bs);
+                continue;
+            }
+
+            /* The two disks are in sync.  Exit and report successful
+             * completion.
+             */
+            assert(QLIST_EMPTY(&bs->tracked_requests));
+            s->common.cancelled = false;
+            need_drain = false;
+            break;
         }
 
         ret = 0;
@@ -769,13 +789,6 @@ static void coroutine_fn mirror_run(void *opaque)
         } else if (!should_complete) {
             delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0);
             block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
-        } else if (cnt == 0) {
-            /* The two disks are in sync.  Exit and report successful
-             * completion.
-             */
-            assert(QLIST_EMPTY(&bs->tracked_requests));
-            s->common.cancelled = false;
-            break;
         }
         s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
     }
@@ -787,7 +800,8 @@ immediate_exit:
          * the target is a copy of the source.
          */
         assert(ret < 0 || (!s->synced && block_job_is_cancelled(&s->common)));
-        mirror_drain(s);
+        assert(need_drain);
+        mirror_wait_for_all_io(s);
     }
 
     assert(s->in_flight == 0);
@@ -799,9 +813,10 @@ immediate_exit:
 
     data = g_malloc(sizeof(*data));
     data->ret = ret;
-    /* Before we switch to target in mirror_exit, make sure data doesn't
-     * change. */
-    bdrv_drained_begin(bs);
+
+    if (need_drain) {
+        bdrv_drained_begin(bs);
+    }
     block_job_defer_to_main_loop(&s->common, mirror_exit, data);
 }
 
@@ -872,14 +887,11 @@ static void mirror_complete(BlockJob *job, Error **errp)
     block_job_enter(&s->common);
 }
 
-/* There is no matching mirror_resume() because mirror_run() will begin
- * iterating again when the job is resumed.
- */
-static void coroutine_fn mirror_pause(BlockJob *job)
+static void mirror_pause(BlockJob *job)
 {
     MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
 
-    mirror_drain(s);
+    mirror_wait_for_all_io(s);
 }
 
 static void mirror_attached_aio_context(BlockJob *job, AioContext *new_context)
@@ -889,6 +901,21 @@ static void mirror_attached_aio_context(BlockJob *job, AioContext *new_context)
     blk_set_aio_context(s->target, new_context);
 }
 
+static void mirror_drain(BlockJob *job)
+{
+    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
+
+    /* Need to keep a reference in case blk_drain triggers execution
+     * of mirror_complete...
+     */
+    if (s->target) {
+        BlockBackend *target = s->target;
+        blk_ref(target);
+        blk_drain(target);
+        blk_unref(target);
+    }
+}
+
 static const BlockJobDriver mirror_job_driver = {
     .instance_size          = sizeof(MirrorBlockJob),
     .job_type               = BLOCK_JOB_TYPE_MIRROR,
@@ -896,6 +923,7 @@ static const BlockJobDriver mirror_job_driver = {
     .complete               = mirror_complete,
     .pause                  = mirror_pause,
     .attached_aio_context   = mirror_attached_aio_context,
+    .drain                  = mirror_drain,
 };
 
 static const BlockJobDriver commit_active_job_driver = {
@@ -905,12 +933,13 @@ static const BlockJobDriver commit_active_job_driver = {
     .complete               = mirror_complete,
     .pause                  = mirror_pause,
     .attached_aio_context   = mirror_attached_aio_context,
+    .drain                  = mirror_drain,
 };
 
 static void mirror_start_job(const char *job_id, BlockDriverState *bs,
-                             BlockDriverState *target, const char *replaces,
-                             int64_t speed, uint32_t granularity,
-                             int64_t buf_size,
+                             int creation_flags, BlockDriverState *target,
+                             const char *replaces, int64_t speed,
+                             uint32_t granularity, int64_t buf_size,
                              BlockMirrorBackingMode backing_mode,
                              BlockdevOnError on_source_error,
                              BlockdevOnError on_target_error,
@@ -938,7 +967,8 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,
         buf_size = DEFAULT_MIRROR_BUF_SIZE;
     }
 
-    s = block_job_create(job_id, driver, bs, speed, cb, opaque, errp);
+    s = block_job_create(job_id, driver, bs, speed, creation_flags,
+                         cb, opaque, errp);
     if (!s) {
         return;
     }
@@ -967,7 +997,15 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs,
         return;
     }
 
-    bdrv_op_block_all(target, s->common.blocker);
+    block_job_add_bdrv(&s->common, target);
+    /* In commit_active_start() all intermediate nodes disappear, so
+     * any jobs in them must be blocked */
+    if (bdrv_chain_contains(bs, target)) {
+        BlockDriverState *iter;
+        for (iter = backing_bs(bs); iter != target; iter = backing_bs(iter)) {
+            block_job_add_bdrv(&s->common, iter);
+        }
+    }
 
     s->common.co = qemu_coroutine_create(mirror_run, s);
     trace_mirror_start(bs, s, s->common.co, opaque);
@@ -980,9 +1018,7 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
                   MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
                   BlockdevOnError on_source_error,
                   BlockdevOnError on_target_error,
-                  bool unmap,
-                  BlockCompletionFunc *cb,
-                  void *opaque, Error **errp)
+                  bool unmap, Error **errp)
 {
     bool is_none_mode;
     BlockDriverState *base;
@@ -993,17 +1029,16 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
     }
     is_none_mode = mode == MIRROR_SYNC_MODE_NONE;
     base = mode == MIRROR_SYNC_MODE_TOP ? backing_bs(bs) : NULL;
-    mirror_start_job(job_id, bs, target, replaces,
+    mirror_start_job(job_id, bs, BLOCK_JOB_DEFAULT, target, replaces,
                      speed, granularity, buf_size, backing_mode,
-                     on_source_error, on_target_error, unmap, cb, opaque, errp,
+                     on_source_error, on_target_error, unmap, NULL, NULL, errp,
                      &mirror_job_driver, is_none_mode, base, false);
 }
 
 void commit_active_start(const char *job_id, BlockDriverState *bs,
-                         BlockDriverState *base, int64_t speed,
-                         BlockdevOnError on_error,
-                         BlockCompletionFunc *cb,
-                         void *opaque, Error **errp,
+                         BlockDriverState *base, int creation_flags,
+                         int64_t speed, BlockdevOnError on_error,
+                         BlockCompletionFunc *cb, void *opaque, Error **errp,
                          bool auto_complete)
 {
     int64_t length, base_length;
@@ -1042,9 +1077,9 @@ void commit_active_start(const char *job_id, BlockDriverState *bs,
         }
     }
 
-    mirror_start_job(job_id, bs, base, NULL, speed, 0, 0,
+    mirror_start_job(job_id, bs, creation_flags, base, NULL, speed, 0, 0,
                      MIRROR_LEAVE_BACKING_CHAIN,
-                     on_error, on_error, false, cb, opaque, &local_err,
+                     on_error, on_error, true, cb, opaque, &local_err,
                      &commit_active_job_driver, false, base, auto_complete);
     if (local_err) {
         error_propagate(errp, local_err);
diff --git a/block/nfs.c b/block/nfs.c
index c3db2ec58d..55c4e0b073 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -35,8 +35,15 @@
 #include "qemu/uri.h"
 #include "qemu/cutils.h"
 #include "sysemu/sysemu.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qint.h"
+#include "qapi/qmp/qstring.h"
+#include "qapi-visit.h"
+#include "qapi/qobject-input-visitor.h"
+#include "qapi/qobject-output-visitor.h"
 #include <nfsc/libnfs.h>
 
+
 #define QEMU_NFS_MAX_READAHEAD_SIZE 1048576
 #define QEMU_NFS_MAX_PAGECACHE_SIZE (8388608 / NFS_BLKSIZE)
 #define QEMU_NFS_MAX_DEBUG_LEVEL 2
@@ -49,9 +56,13 @@ typedef struct NFSClient {
     AioContext *aio_context;
     blkcnt_t st_blocks;
     bool cache_used;
+    NFSServer *server;
+    char *path;
+    int64_t uid, gid, tcp_syncnt, readahead, pagecache, debug;
 } NFSClient;
 
 typedef struct NFSRPC {
+    BlockDriverState *bs;
     int ret;
     int complete;
     QEMUIOVector *iov;
@@ -60,6 +71,122 @@ typedef struct NFSRPC {
     NFSClient *client;
 } NFSRPC;
 
+static int nfs_parse_uri(const char *filename, QDict *options, Error **errp)
+{
+    URI *uri = NULL;
+    QueryParams *qp = NULL;
+    int ret = -EINVAL, i;
+
+    uri = uri_parse(filename);
+    if (!uri) {
+        error_setg(errp, "Invalid URI specified");
+        goto out;
+    }
+    if (strcmp(uri->scheme, "nfs") != 0) {
+        error_setg(errp, "URI scheme must be 'nfs'");
+        goto out;
+    }
+
+    if (!uri->server) {
+        error_setg(errp, "missing hostname in URI");
+        goto out;
+    }
+
+    if (!uri->path) {
+        error_setg(errp, "missing file path in URI");
+        goto out;
+    }
+
+    qp = query_params_parse(uri->query);
+    if (!qp) {
+        error_setg(errp, "could not parse query parameters");
+        goto out;
+    }
+
+    qdict_put(options, "server.host", qstring_from_str(uri->server));
+    qdict_put(options, "server.type", qstring_from_str("inet"));
+    qdict_put(options, "path", qstring_from_str(uri->path));
+
+    for (i = 0; i < qp->n; i++) {
+        if (!qp->p[i].value) {
+            error_setg(errp, "Value for NFS parameter expected: %s",
+                       qp->p[i].name);
+            goto out;
+        }
+        if (parse_uint_full(qp->p[i].value, NULL, 0)) {
+            error_setg(errp, "Illegal value for NFS parameter: %s",
+                       qp->p[i].name);
+            goto out;
+        }
+        if (!strcmp(qp->p[i].name, "uid")) {
+            qdict_put(options, "user",
+                      qstring_from_str(qp->p[i].value));
+        } else if (!strcmp(qp->p[i].name, "gid")) {
+            qdict_put(options, "group",
+                      qstring_from_str(qp->p[i].value));
+        } else if (!strcmp(qp->p[i].name, "tcp-syncnt")) {
+            qdict_put(options, "tcp-syn-count",
+                      qstring_from_str(qp->p[i].value));
+        } else if (!strcmp(qp->p[i].name, "readahead")) {
+            qdict_put(options, "readahead-size",
+                      qstring_from_str(qp->p[i].value));
+        } else if (!strcmp(qp->p[i].name, "pagecache")) {
+            qdict_put(options, "page-cache-size",
+                      qstring_from_str(qp->p[i].value));
+        } else if (!strcmp(qp->p[i].name, "debug")) {
+            qdict_put(options, "debug-level",
+                      qstring_from_str(qp->p[i].value));
+        } else {
+            error_setg(errp, "Unknown NFS parameter name: %s",
+                       qp->p[i].name);
+            goto out;
+        }
+    }
+    ret = 0;
+out:
+    if (qp) {
+        query_params_free(qp);
+    }
+    if (uri) {
+        uri_free(uri);
+    }
+    return ret;
+}
+
+static bool nfs_has_filename_options_conflict(QDict *options, Error **errp)
+{
+    const QDictEntry *qe;
+
+    for (qe = qdict_first(options); qe; qe = qdict_next(options, qe)) {
+        if (!strcmp(qe->key, "host") ||
+            !strcmp(qe->key, "path") ||
+            !strcmp(qe->key, "user") ||
+            !strcmp(qe->key, "group") ||
+            !strcmp(qe->key, "tcp-syn-count") ||
+            !strcmp(qe->key, "readahead-size") ||
+            !strcmp(qe->key, "page-cache-size") ||
+            !strcmp(qe->key, "debug-level") ||
+            strstart(qe->key, "server.", NULL))
+        {
+            error_setg(errp, "Option %s cannot be used with a filename",
+                       qe->key);
+            return true;
+        }
+    }
+
+    return false;
+}
+
+static void nfs_parse_filename(const char *filename, QDict *options,
+                               Error **errp)
+{
+    if (nfs_has_filename_options_conflict(options, errp)) {
+        return;
+    }
+
+    nfs_parse_uri(filename, options, errp);
+}
+
 static void nfs_process_read(void *arg);
 static void nfs_process_write(void *arg);
 
@@ -90,11 +217,12 @@ static void nfs_process_write(void *arg)
     nfs_set_events(client);
 }
 
-static void nfs_co_init_task(NFSClient *client, NFSRPC *task)
+static void nfs_co_init_task(BlockDriverState *bs, NFSRPC *task)
 {
     *task = (NFSRPC) {
         .co             = qemu_coroutine_self(),
-        .client         = client,
+        .bs             = bs,
+        .client         = bs->opaque,
     };
 }
 
@@ -111,6 +239,7 @@ nfs_co_generic_cb(int ret, struct nfs_context *nfs, void *data,
 {
     NFSRPC *task = private_data;
     task->ret = ret;
+    assert(!task->st);
     if (task->ret > 0 && task->iov) {
         if (task->ret <= task->iov->size) {
             qemu_iovec_from_buf(task->iov, 0, data, task->ret);
@@ -118,18 +247,11 @@ nfs_co_generic_cb(int ret, struct nfs_context *nfs, void *data,
             task->ret = -EIO;
         }
     }
-    if (task->ret == 0 && task->st) {
-        memcpy(task->st, data, sizeof(struct stat));
-    }
     if (task->ret < 0) {
         error_report("NFS Error: %s", nfs_get_error(nfs));
     }
-    if (task->co) {
-        aio_bh_schedule_oneshot(task->client->aio_context,
-                                nfs_co_generic_bh_cb, task);
-    } else {
-        task->complete = 1;
-    }
+    aio_bh_schedule_oneshot(task->client->aio_context,
+                            nfs_co_generic_bh_cb, task);
 }
 
 static int coroutine_fn nfs_co_readv(BlockDriverState *bs,
@@ -139,7 +261,7 @@ static int coroutine_fn nfs_co_readv(BlockDriverState *bs,
     NFSClient *client = bs->opaque;
     NFSRPC task;
 
-    nfs_co_init_task(client, &task);
+    nfs_co_init_task(bs, &task);
     task.iov = iov;
 
     if (nfs_pread_async(client->context, client->fh,
@@ -149,8 +271,8 @@ static int coroutine_fn nfs_co_readv(BlockDriverState *bs,
         return -ENOMEM;
     }
 
+    nfs_set_events(client);
     while (!task.complete) {
-        nfs_set_events(client);
         qemu_coroutine_yield();
     }
 
@@ -174,7 +296,7 @@ static int coroutine_fn nfs_co_writev(BlockDriverState *bs,
     NFSRPC task;
     char *buf = NULL;
 
-    nfs_co_init_task(client, &task);
+    nfs_co_init_task(bs, &task);
 
     buf = g_try_malloc(nb_sectors * BDRV_SECTOR_SIZE);
     if (nb_sectors && buf == NULL) {
@@ -191,8 +313,8 @@ static int coroutine_fn nfs_co_writev(BlockDriverState *bs,
         return -ENOMEM;
     }
 
+    nfs_set_events(client);
     while (!task.complete) {
-        nfs_set_events(client);
         qemu_coroutine_yield();
     }
 
@@ -210,30 +332,59 @@ static int coroutine_fn nfs_co_flush(BlockDriverState *bs)
     NFSClient *client = bs->opaque;
     NFSRPC task;
 
-    nfs_co_init_task(client, &task);
+    nfs_co_init_task(bs, &task);
 
     if (nfs_fsync_async(client->context, client->fh, nfs_co_generic_cb,
                         &task) != 0) {
         return -ENOMEM;
     }
 
+    nfs_set_events(client);
     while (!task.complete) {
-        nfs_set_events(client);
         qemu_coroutine_yield();
     }
 
     return task.ret;
 }
 
-/* TODO Convert to fine grained options */
 static QemuOptsList runtime_opts = {
     .name = "nfs",
     .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
     .desc = {
         {
-            .name = "filename",
+            .name = "path",
             .type = QEMU_OPT_STRING,
-            .help = "URL to the NFS file",
+            .help = "Path of the image on the host",
+        },
+        {
+            .name = "uid",
+            .type = QEMU_OPT_NUMBER,
+            .help = "UID value to use when talking to the server",
+        },
+        {
+            .name = "gid",
+            .type = QEMU_OPT_NUMBER,
+            .help = "GID value to use when talking to the server",
+        },
+        {
+            .name = "tcp-syncnt",
+            .type = QEMU_OPT_NUMBER,
+            .help = "Number of SYNs to send during the session establish",
+        },
+        {
+            .name = "readahead",
+            .type = QEMU_OPT_NUMBER,
+            .help = "Set the readahead size in bytes",
+        },
+        {
+            .name = "pagecache",
+            .type = QEMU_OPT_NUMBER,
+            .help = "Set the pagecache size in bytes",
+        },
+        {
+            .name = "debug",
+            .type = QEMU_OPT_NUMBER,
+            .help = "Set the NFS debug level (max 2)",
         },
         { /* end of list */ }
     },
@@ -276,25 +427,65 @@ static void nfs_file_close(BlockDriverState *bs)
     nfs_client_close(client);
 }
 
-static int64_t nfs_client_open(NFSClient *client, const char *filename,
+static NFSServer *nfs_config(QDict *options, Error **errp)
+{
+    NFSServer *server = NULL;
+    QDict *addr = NULL;
+    QObject *crumpled_addr = NULL;
+    Visitor *iv = NULL;
+    Error *local_error = NULL;
+
+    qdict_extract_subqdict(options, &addr, "server.");
+    if (!qdict_size(addr)) {
+        error_setg(errp, "NFS server address missing");
+        goto out;
+    }
+
+    crumpled_addr = qdict_crumple(addr, errp);
+    if (!crumpled_addr) {
+        goto out;
+    }
+
+    iv = qobject_input_visitor_new(crumpled_addr, true);
+    visit_type_NFSServer(iv, NULL, &server, &local_error);
+    if (local_error) {
+        error_propagate(errp, local_error);
+        goto out;
+    }
+
+out:
+    QDECREF(addr);
+    qobject_decref(crumpled_addr);
+    visit_free(iv);
+    return server;
+}
+
+
+static int64_t nfs_client_open(NFSClient *client, QDict *options,
                                int flags, Error **errp, int open_flags)
 {
-    int ret = -EINVAL, i;
+    int ret = -EINVAL;
+    QemuOpts *opts = NULL;
+    Error *local_err = NULL;
     struct stat st;
-    URI *uri;
-    QueryParams *qp = NULL;
     char *file = NULL, *strp = NULL;
 
-    uri = uri_parse(filename);
-    if (!uri) {
-        error_setg(errp, "Invalid URL specified");
+    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
+    qemu_opts_absorb_qdict(opts, options, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        ret = -EINVAL;
         goto fail;
     }
-    if (!uri->server) {
-        error_setg(errp, "Invalid URL specified");
+
+    client->path = g_strdup(qemu_opt_get(opts, "path"));
+    if (!client->path) {
+        ret = -EINVAL;
+        error_setg(errp, "No path was specified");
         goto fail;
     }
-    strp = strrchr(uri->path, '/');
+
+    strp = strrchr(client->path, '/');
     if (strp == NULL) {
         error_setg(errp, "Invalid URL specified");
         goto fail;
@@ -302,85 +493,89 @@ static int64_t nfs_client_open(NFSClient *client, const char *filename,
     file = g_strdup(strp);
     *strp = 0;
 
+    /* Pop the config into our state object, Exit if invalid */
+    client->server = nfs_config(options, errp);
+    if (!client->server) {
+        ret = -EINVAL;
+        goto fail;
+    }
+
     client->context = nfs_init_context();
     if (client->context == NULL) {
         error_setg(errp, "Failed to init NFS context");
         goto fail;
     }
 
-    qp = query_params_parse(uri->query);
-    for (i = 0; i < qp->n; i++) {
-        unsigned long long val;
-        if (!qp->p[i].value) {
-            error_setg(errp, "Value for NFS parameter expected: %s",
-                       qp->p[i].name);
+    if (qemu_opt_get(opts, "uid")) {
+        client->uid = qemu_opt_get_number(opts, "uid", 0);
+        nfs_set_uid(client->context, client->uid);
+    }
+
+    if (qemu_opt_get(opts, "gid")) {
+        client->gid = qemu_opt_get_number(opts, "gid", 0);
+        nfs_set_gid(client->context, client->gid);
+    }
+
+    if (qemu_opt_get(opts, "tcp-syncnt")) {
+        client->tcp_syncnt = qemu_opt_get_number(opts, "tcp-syncnt", 0);
+        nfs_set_tcp_syncnt(client->context, client->tcp_syncnt);
+    }
+
+#ifdef LIBNFS_FEATURE_READAHEAD
+    if (qemu_opt_get(opts, "readahead")) {
+        if (open_flags & BDRV_O_NOCACHE) {
+            error_setg(errp, "Cannot enable NFS readahead "
+                             "if cache.direct = on");
             goto fail;
         }
-        if (parse_uint_full(qp->p[i].value, &val, 0)) {
-            error_setg(errp, "Illegal value for NFS parameter: %s",
-                       qp->p[i].name);
-            goto fail;
+        client->readahead = qemu_opt_get_number(opts, "readahead", 0);
+        if (client->readahead > QEMU_NFS_MAX_READAHEAD_SIZE) {
+            error_report("NFS Warning: Truncating NFS readahead "
+                         "size to %d", QEMU_NFS_MAX_READAHEAD_SIZE);
+            client->readahead = QEMU_NFS_MAX_READAHEAD_SIZE;
         }
-        if (!strcmp(qp->p[i].name, "uid")) {
-            nfs_set_uid(client->context, val);
-        } else if (!strcmp(qp->p[i].name, "gid")) {
-            nfs_set_gid(client->context, val);
-        } else if (!strcmp(qp->p[i].name, "tcp-syncnt")) {
-            nfs_set_tcp_syncnt(client->context, val);
-#ifdef LIBNFS_FEATURE_READAHEAD
-        } else if (!strcmp(qp->p[i].name, "readahead")) {
-            if (open_flags & BDRV_O_NOCACHE) {
-                error_setg(errp, "Cannot enable NFS readahead "
-                                 "if cache.direct = on");
-                goto fail;
-            }
-            if (val > QEMU_NFS_MAX_READAHEAD_SIZE) {
-                error_report("NFS Warning: Truncating NFS readahead"
-                             " size to %d", QEMU_NFS_MAX_READAHEAD_SIZE);
-                val = QEMU_NFS_MAX_READAHEAD_SIZE;
-            }
-            nfs_set_readahead(client->context, val);
+        nfs_set_readahead(client->context, client->readahead);
 #ifdef LIBNFS_FEATURE_PAGECACHE
-            nfs_set_pagecache_ttl(client->context, 0);
+        nfs_set_pagecache_ttl(client->context, 0);
 #endif
-            client->cache_used = true;
+        client->cache_used = true;
+    }
 #endif
+
 #ifdef LIBNFS_FEATURE_PAGECACHE
-            nfs_set_pagecache_ttl(client->context, 0);
-        } else if (!strcmp(qp->p[i].name, "pagecache")) {
-            if (open_flags & BDRV_O_NOCACHE) {
-                error_setg(errp, "Cannot enable NFS pagecache "
-                                 "if cache.direct = on");
-                goto fail;
-            }
-            if (val > QEMU_NFS_MAX_PAGECACHE_SIZE) {
-                error_report("NFS Warning: Truncating NFS pagecache"
-                             " size to %d pages", QEMU_NFS_MAX_PAGECACHE_SIZE);
-                val = QEMU_NFS_MAX_PAGECACHE_SIZE;
-            }
-            nfs_set_pagecache(client->context, val);
-            nfs_set_pagecache_ttl(client->context, 0);
-            client->cache_used = true;
+    if (qemu_opt_get(opts, "pagecache")) {
+        if (open_flags & BDRV_O_NOCACHE) {
+            error_setg(errp, "Cannot enable NFS pagecache "
+                             "if cache.direct = on");
+            goto fail;
+        }
+        client->pagecache = qemu_opt_get_number(opts, "pagecache", 0);
+        if (client->pagecache > QEMU_NFS_MAX_PAGECACHE_SIZE) {
+            error_report("NFS Warning: Truncating NFS pagecache "
+                         "size to %d pages", QEMU_NFS_MAX_PAGECACHE_SIZE);
+            client->pagecache = QEMU_NFS_MAX_PAGECACHE_SIZE;
+        }
+        nfs_set_pagecache(client->context, client->pagecache);
+        nfs_set_pagecache_ttl(client->context, 0);
+        client->cache_used = true;
+    }
 #endif
+
 #ifdef LIBNFS_FEATURE_DEBUG
-        } else if (!strcmp(qp->p[i].name, "debug")) {
-            /* limit the maximum debug level to avoid potential flooding
-             * of our log files. */
-            if (val > QEMU_NFS_MAX_DEBUG_LEVEL) {
-                error_report("NFS Warning: Limiting NFS debug level"
-                             " to %d", QEMU_NFS_MAX_DEBUG_LEVEL);
-                val = QEMU_NFS_MAX_DEBUG_LEVEL;
-            }
-            nfs_set_debug(client->context, val);
-#endif
-        } else {
-            error_setg(errp, "Unknown NFS parameter name: %s",
-                       qp->p[i].name);
-            goto fail;
+    if (qemu_opt_get(opts, "debug")) {
+        client->debug = qemu_opt_get_number(opts, "debug", 0);
+        /* limit the maximum debug level to avoid potential flooding
+         * of our log files. */
+        if (client->debug > QEMU_NFS_MAX_DEBUG_LEVEL) {
+            error_report("NFS Warning: Limiting NFS debug level "
+                         "to %d", QEMU_NFS_MAX_DEBUG_LEVEL);
+            client->debug = QEMU_NFS_MAX_DEBUG_LEVEL;
         }
+        nfs_set_debug(client->context, client->debug);
     }
+#endif
 
-    ret = nfs_mount(client->context, uri->server, uri->path);
+    ret = nfs_mount(client->context, client->server->host, client->path);
     if (ret < 0) {
         error_setg(errp, "Failed to mount nfs share: %s",
                    nfs_get_error(client->context));
@@ -413,14 +608,13 @@ static int64_t nfs_client_open(NFSClient *client, const char *filename,
     ret = DIV_ROUND_UP(st.st_size, BDRV_SECTOR_SIZE);
     client->st_blocks = st.st_blocks;
     client->has_zero_init = S_ISREG(st.st_mode);
+    *strp = '/';
     goto out;
+
 fail:
     nfs_client_close(client);
 out:
-    if (qp) {
-        query_params_free(qp);
-    }
-    uri_free(uri);
+    qemu_opts_del(opts);
     g_free(file);
     return ret;
 }
@@ -429,28 +623,17 @@ static int nfs_file_open(BlockDriverState *bs, QDict *options, int flags,
                          Error **errp) {
     NFSClient *client = bs->opaque;
     int64_t ret;
-    QemuOpts *opts;
-    Error *local_err = NULL;
 
     client->aio_context = bdrv_get_aio_context(bs);
 
-    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
-    qemu_opts_absorb_qdict(opts, options, &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
-        ret = -EINVAL;
-        goto out;
-    }
-    ret = nfs_client_open(client, qemu_opt_get(opts, "filename"),
+    ret = nfs_client_open(client, options,
                           (flags & BDRV_O_RDWR) ? O_RDWR : O_RDONLY,
                           errp, bs->open_flags);
     if (ret < 0) {
-        goto out;
+        return ret;
     }
     bs->total_sectors = ret;
     ret = 0;
-out:
-    qemu_opts_del(opts);
     return ret;
 }
 
@@ -472,6 +655,7 @@ static int nfs_file_create(const char *url, QemuOpts *opts, Error **errp)
     int ret = 0;
     int64_t total_size = 0;
     NFSClient *client = g_new0(NFSClient, 1);
+    QDict *options = NULL;
 
     client->aio_context = qemu_get_aio_context();
 
@@ -479,7 +663,13 @@ static int nfs_file_create(const char *url, QemuOpts *opts, Error **errp)
     total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
                           BDRV_SECTOR_SIZE);
 
-    ret = nfs_client_open(client, url, O_CREAT, errp, 0);
+    options = qdict_new();
+    ret = nfs_parse_uri(url, options, errp);
+    if (ret < 0) {
+        goto out;
+    }
+
+    ret = nfs_client_open(client, options, O_CREAT, errp, 0);
     if (ret < 0) {
         goto out;
     }
@@ -496,6 +686,22 @@ static int nfs_has_zero_init(BlockDriverState *bs)
     return client->has_zero_init;
 }
 
+static void
+nfs_get_allocated_file_size_cb(int ret, struct nfs_context *nfs, void *data,
+                               void *private_data)
+{
+    NFSRPC *task = private_data;
+    task->ret = ret;
+    if (task->ret == 0) {
+        memcpy(task->st, data, sizeof(struct stat));
+    }
+    if (task->ret < 0) {
+        error_report("NFS Error: %s", nfs_get_error(nfs));
+    }
+    task->complete = 1;
+    bdrv_wakeup(task->bs);
+}
+
 static int64_t nfs_get_allocated_file_size(BlockDriverState *bs)
 {
     NFSClient *client = bs->opaque;
@@ -507,16 +713,15 @@ static int64_t nfs_get_allocated_file_size(BlockDriverState *bs)
         return client->st_blocks * 512;
     }
 
+    task.bs = bs;
     task.st = &st;
-    if (nfs_fstat_async(client->context, client->fh, nfs_co_generic_cb,
+    if (nfs_fstat_async(client->context, client->fh, nfs_get_allocated_file_size_cb,
                         &task) != 0) {
         return -ENOMEM;
     }
 
-    while (!task.complete) {
-        nfs_set_events(client);
-        aio_poll(client->aio_context, true);
-    }
+    nfs_set_events(client);
+    BDRV_POLL_WHILE(bs, !task.complete);
 
     return (task.ret < 0 ? task.ret : st.st_blocks * 512);
 }
@@ -561,6 +766,67 @@ static int nfs_reopen_prepare(BDRVReopenState *state,
     return 0;
 }
 
+static void nfs_refresh_filename(BlockDriverState *bs, QDict *options)
+{
+    NFSClient *client = bs->opaque;
+    QDict *opts = qdict_new();
+    QObject *server_qdict;
+    Visitor *ov;
+
+    qdict_put(opts, "driver", qstring_from_str("nfs"));
+
+    if (client->uid && !client->gid) {
+        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
+                 "nfs://%s%s?uid=%" PRId64, client->server->host, client->path,
+                 client->uid);
+    } else if (!client->uid && client->gid) {
+        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
+                 "nfs://%s%s?gid=%" PRId64, client->server->host, client->path,
+                 client->gid);
+    } else if (client->uid && client->gid) {
+        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
+                 "nfs://%s%s?uid=%" PRId64 "&gid=%" PRId64,
+                 client->server->host, client->path, client->uid, client->gid);
+    } else {
+        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
+                 "nfs://%s%s", client->server->host, client->path);
+    }
+
+    ov = qobject_output_visitor_new(&server_qdict);
+    visit_type_NFSServer(ov, NULL, &client->server, &error_abort);
+    visit_complete(ov, &server_qdict);
+    assert(qobject_type(server_qdict) == QTYPE_QDICT);
+
+    qdict_put_obj(opts, "server", server_qdict);
+    qdict_put(opts, "path", qstring_from_str(client->path));
+
+    if (client->uid) {
+        qdict_put(opts, "uid", qint_from_int(client->uid));
+    }
+    if (client->gid) {
+        qdict_put(opts, "gid", qint_from_int(client->gid));
+    }
+    if (client->tcp_syncnt) {
+        qdict_put(opts, "tcp-syncnt",
+                      qint_from_int(client->tcp_syncnt));
+    }
+    if (client->readahead) {
+        qdict_put(opts, "readahead",
+                      qint_from_int(client->readahead));
+    }
+    if (client->pagecache) {
+        qdict_put(opts, "pagecache",
+                      qint_from_int(client->pagecache));
+    }
+    if (client->debug) {
+        qdict_put(opts, "debug", qint_from_int(client->debug));
+    }
+
+    visit_free(ov);
+    qdict_flatten(opts);
+    bs->full_open_options = opts;
+}
+
 #ifdef LIBNFS_FEATURE_PAGECACHE
 static void nfs_invalidate_cache(BlockDriverState *bs,
                                  Error **errp)
@@ -575,7 +841,7 @@ static BlockDriver bdrv_nfs = {
     .protocol_name                  = "nfs",
 
     .instance_size                  = sizeof(NFSClient),
-    .bdrv_needs_filename            = true,
+    .bdrv_parse_filename            = nfs_parse_filename,
     .create_opts                    = &nfs_create_opts,
 
     .bdrv_has_zero_init             = nfs_has_zero_init,
@@ -593,6 +859,7 @@ static BlockDriver bdrv_nfs = {
 
     .bdrv_detach_aio_context        = nfs_detach_aio_context,
     .bdrv_attach_aio_context        = nfs_attach_aio_context,
+    .bdrv_refresh_filename          = nfs_refresh_filename,
 
 #ifdef LIBNFS_FEATURE_PAGECACHE
     .bdrv_invalidate_cache          = nfs_invalidate_cache,
diff --git a/block/qed-table.c b/block/qed-table.c
index 1a731dff51..ed443e2b70 100644
--- a/block/qed-table.c
+++ b/block/qed-table.c
@@ -174,9 +174,7 @@ int qed_read_l1_table_sync(BDRVQEDState *s)
 
     qed_read_table(s, s->header.l1_table_offset,
                    s->l1_table, qed_sync_cb, &ret);
-    while (ret == -EINPROGRESS) {
-        aio_poll(bdrv_get_aio_context(s->bs), true);
-    }
+    BDRV_POLL_WHILE(s->bs, ret == -EINPROGRESS);
 
     return ret;
 }
@@ -195,9 +193,7 @@ int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index,
     int ret = -EINPROGRESS;
 
     qed_write_l1_table(s, index, n, qed_sync_cb, &ret);
-    while (ret == -EINPROGRESS) {
-        aio_poll(bdrv_get_aio_context(s->bs), true);
-    }
+    BDRV_POLL_WHILE(s->bs, ret == -EINPROGRESS);
 
     return ret;
 }
@@ -268,9 +264,7 @@ int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset
     int ret = -EINPROGRESS;
 
     qed_read_l2_table(s, request, offset, qed_sync_cb, &ret);
-    while (ret == -EINPROGRESS) {
-        aio_poll(bdrv_get_aio_context(s->bs), true);
-    }
+    BDRV_POLL_WHILE(s->bs, ret == -EINPROGRESS);
 
     return ret;
 }
@@ -290,9 +284,7 @@ int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
     int ret = -EINPROGRESS;
 
     qed_write_l2_table(s, request, index, n, flush, qed_sync_cb, &ret);
-    while (ret == -EINPROGRESS) {
-        aio_poll(bdrv_get_aio_context(s->bs), true);
-    }
+    BDRV_POLL_WHILE(s->bs, ret == -EINPROGRESS);
 
     return ret;
 }
diff --git a/block/qed.c b/block/qed.c
index 3ee879b52e..1a7ef0a9ce 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -336,7 +336,7 @@ static void qed_need_check_timer_cb(void *opaque)
     qed_plug_allocating_write_reqs(s);
 
     /* Ensure writes are on disk before clearing flag */
-    bdrv_aio_flush(s->bs, qed_clear_need_check, s);
+    bdrv_aio_flush(s->bs->file->bs, qed_clear_need_check, s);
 }
 
 static void qed_start_need_check_timer(BDRVQEDState *s)
@@ -378,6 +378,19 @@ static void bdrv_qed_attach_aio_context(BlockDriverState *bs,
     }
 }
 
+static void bdrv_qed_drain(BlockDriverState *bs)
+{
+    BDRVQEDState *s = bs->opaque;
+
+    /* Fire the timer immediately in order to start doing I/O as soon as the
+     * header is flushed.
+     */
+    if (s->need_check_timer && timer_pending(s->need_check_timer)) {
+        qed_cancel_need_check_timer(s);
+        qed_need_check_timer_cb(s);
+    }
+}
+
 static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
                          Error **errp)
 {
@@ -1668,6 +1681,7 @@ static BlockDriver bdrv_qed = {
     .bdrv_check               = bdrv_qed_check,
     .bdrv_detach_aio_context  = bdrv_qed_detach_aio_context,
     .bdrv_attach_aio_context  = bdrv_qed_attach_aio_context,
+    .bdrv_drain               = bdrv_qed_drain,
 };
 
 static void bdrv_qed_init(void)
diff --git a/block/raw_bsd.c b/block/raw_bsd.c
index fc16ec1f74..7c9bebb507 100644
--- a/block/raw_bsd.c
+++ b/block/raw_bsd.c
@@ -31,6 +31,30 @@
 #include "qapi/error.h"
 #include "qemu/option.h"
 
+typedef struct BDRVRawState {
+    uint64_t offset;
+    uint64_t size;
+    bool has_size;
+} BDRVRawState;
+
+static QemuOptsList raw_runtime_opts = {
+    .name = "raw",
+    .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head),
+    .desc = {
+        {
+            .name = "offset",
+            .type = QEMU_OPT_SIZE,
+            .help = "offset in the disk where the image starts",
+        },
+        {
+            .name = "size",
+            .type = QEMU_OPT_SIZE,
+            .help = "virtual disk size",
+        },
+        { /* end of list */ }
+    },
+};
+
 static QemuOptsList raw_create_opts = {
     .name = "raw-create-opts",
     .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head),
@@ -44,16 +68,108 @@ static QemuOptsList raw_create_opts = {
     }
 };
 
+static int raw_read_options(QDict *options, BlockDriverState *bs,
+    BDRVRawState *s, Error **errp)
+{
+    Error *local_err = NULL;
+    QemuOpts *opts = NULL;
+    int64_t real_size = 0;
+    int ret;
+
+    real_size = bdrv_getlength(bs->file->bs);
+    if (real_size < 0) {
+        error_setg_errno(errp, -real_size, "Could not get image size");
+        return real_size;
+    }
+
+    opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
+    qemu_opts_absorb_qdict(opts, options, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        ret = -EINVAL;
+        goto end;
+    }
+
+    s->offset = qemu_opt_get_size(opts, "offset", 0);
+    if (qemu_opt_find(opts, "size") != NULL) {
+        s->size = qemu_opt_get_size(opts, "size", 0);
+        s->has_size = true;
+    } else {
+        s->has_size = false;
+        s->size = real_size - s->offset;
+    }
+
+    /* Check size and offset */
+    if (real_size < s->offset || (real_size - s->offset) < s->size) {
+        error_setg(errp, "The sum of offset (%" PRIu64 ") and size "
+            "(%" PRIu64 ") has to be smaller or equal to the "
+            " actual size of the containing file (%" PRId64 ")",
+            s->offset, s->size, real_size);
+        ret = -EINVAL;
+        goto end;
+    }
+
+    /* Make sure size is multiple of BDRV_SECTOR_SIZE to prevent rounding
+     * up and leaking out of the specified area. */
+    if (!QEMU_IS_ALIGNED(s->size, BDRV_SECTOR_SIZE)) {
+        error_setg(errp, "Specified size is not multiple of %llu",
+            BDRV_SECTOR_SIZE);
+        ret = -EINVAL;
+        goto end;
+    }
+
+    ret = 0;
+
+end:
+
+    qemu_opts_del(opts);
+
+    return ret;
+}
+
 static int raw_reopen_prepare(BDRVReopenState *reopen_state,
                               BlockReopenQueue *queue, Error **errp)
 {
-    return 0;
+    assert(reopen_state != NULL);
+    assert(reopen_state->bs != NULL);
+
+    reopen_state->opaque = g_new0(BDRVRawState, 1);
+
+    return raw_read_options(
+        reopen_state->options,
+        reopen_state->bs,
+        reopen_state->opaque,
+        errp);
+}
+
+static void raw_reopen_commit(BDRVReopenState *state)
+{
+    BDRVRawState *new_s = state->opaque;
+    BDRVRawState *s = state->bs->opaque;
+
+    memcpy(s, new_s, sizeof(BDRVRawState));
+
+    g_free(state->opaque);
+    state->opaque = NULL;
+}
+
+static void raw_reopen_abort(BDRVReopenState *state)
+{
+    g_free(state->opaque);
+    state->opaque = NULL;
 }
 
 static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset,
                                       uint64_t bytes, QEMUIOVector *qiov,
                                       int flags)
 {
+    BDRVRawState *s = bs->opaque;
+
+    if (offset > UINT64_MAX - s->offset) {
+        return -EINVAL;
+    }
+    offset += s->offset;
+
     BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
     return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
 }
@@ -62,11 +178,23 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
                                        uint64_t bytes, QEMUIOVector *qiov,
                                        int flags)
 {
+    BDRVRawState *s = bs->opaque;
     void *buf = NULL;
     BlockDriver *drv;
     QEMUIOVector local_qiov;
     int ret;
 
+    if (s->has_size && (offset > s->size || bytes > (s->size - offset))) {
+        /* There's not enough space for the data. Don't write anything and just
+         * fail to prevent leaking out of the size specified in options. */
+        return -ENOSPC;
+    }
+
+    if (offset > UINT64_MAX - s->offset) {
+        ret = -EINVAL;
+        goto fail;
+    }
+
     if (bs->probed && offset < BLOCK_PROBE_BUF_SIZE && bytes) {
         /* Handling partial writes would be a pain - so we just
          * require that guests have 512-byte request alignment if
@@ -101,6 +229,8 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
         qiov = &local_qiov;
     }
 
+    offset += s->offset;
+
     BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
     ret = bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
 
@@ -117,8 +247,10 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
                                             int nb_sectors, int *pnum,
                                             BlockDriverState **file)
 {
+    BDRVRawState *s = bs->opaque;
     *pnum = nb_sectors;
     *file = bs->file->bs;
+    sector_num += s->offset / BDRV_SECTOR_SIZE;
     return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_DATA |
            (sector_num << BDRV_SECTOR_BITS);
 }
@@ -127,18 +259,49 @@ static int coroutine_fn raw_co_pwrite_zeroes(BlockDriverState *bs,
                                              int64_t offset, int count,
                                              BdrvRequestFlags flags)
 {
+    BDRVRawState *s = bs->opaque;
+    if (offset > UINT64_MAX - s->offset) {
+        return -EINVAL;
+    }
+    offset += s->offset;
     return bdrv_co_pwrite_zeroes(bs->file, offset, count, flags);
 }
 
 static int coroutine_fn raw_co_pdiscard(BlockDriverState *bs,
                                         int64_t offset, int count)
 {
+    BDRVRawState *s = bs->opaque;
+    if (offset > UINT64_MAX - s->offset) {
+        return -EINVAL;
+    }
+    offset += s->offset;
     return bdrv_co_pdiscard(bs->file->bs, offset, count);
 }
 
 static int64_t raw_getlength(BlockDriverState *bs)
 {
-    return bdrv_getlength(bs->file->bs);
+    int64_t len;
+    BDRVRawState *s = bs->opaque;
+
+    /* Update size. It should not change unless the file was externally
+     * modified. */
+    len = bdrv_getlength(bs->file->bs);
+    if (len < 0) {
+        return len;
+    }
+
+    if (len < s->offset) {
+        s->size = 0;
+    } else {
+        if (s->has_size) {
+            /* Try to honour the size */
+            s->size = MIN(s->size, len - s->offset);
+        } else {
+            s->size = len - s->offset;
+        }
+    }
+
+    return s->size;
 }
 
 static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
@@ -158,6 +321,18 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
 
 static int raw_truncate(BlockDriverState *bs, int64_t offset)
 {
+    BDRVRawState *s = bs->opaque;
+
+    if (s->has_size) {
+        return -ENOTSUP;
+    }
+
+    if (INT64_MAX - offset < s->offset) {
+        return -EINVAL;
+    }
+
+    s->size = offset;
+    offset += s->offset;
     return bdrv_truncate(bs->file->bs, offset);
 }
 
@@ -178,6 +353,10 @@ static void raw_lock_medium(BlockDriverState *bs, bool locked)
 
 static int raw_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
 {
+    BDRVRawState *s = bs->opaque;
+    if (s->offset || s->has_size) {
+        return -ENOTSUP;
+    }
     return bdrv_co_ioctl(bs->file->bs, req, buf);
 }
 
@@ -194,6 +373,9 @@ static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
 static int raw_open(BlockDriverState *bs, QDict *options, int flags,
                     Error **errp)
 {
+    BDRVRawState *s = bs->opaque;
+    int ret;
+
     bs->sg = bs->file->bs->sg;
     bs->supported_write_flags = BDRV_REQ_FUA &
         bs->file->bs->supported_write_flags;
@@ -211,6 +393,16 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
                 bs->file->bs->filename);
     }
 
+    ret = raw_read_options(options, bs, s, errp);
+    if (ret < 0) {
+        return ret;
+    }
+
+    if (bs->sg && (s->offset || s->has_size)) {
+        error_setg(errp, "Cannot use offset/size with SCSI generic devices");
+        return -EINVAL;
+    }
+
     return 0;
 }
 
@@ -228,18 +420,37 @@ static int raw_probe(const uint8_t *buf, int buf_size, const char *filename)
 
 static int raw_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
 {
-    return bdrv_probe_blocksizes(bs->file->bs, bsz);
+    BDRVRawState *s = bs->opaque;
+    int ret;
+
+    ret = bdrv_probe_blocksizes(bs->file->bs, bsz);
+    if (ret < 0) {
+        return ret;
+    }
+
+    if (!QEMU_IS_ALIGNED(s->offset, MAX(bsz->log, bsz->phys))) {
+        return -ENOTSUP;
+    }
+
+    return 0;
 }
 
 static int raw_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
 {
+    BDRVRawState *s = bs->opaque;
+    if (s->offset || s->has_size) {
+        return -ENOTSUP;
+    }
     return bdrv_probe_geometry(bs->file->bs, geo);
 }
 
 BlockDriver bdrv_raw = {
     .format_name          = "raw",
+    .instance_size        = sizeof(BDRVRawState),
     .bdrv_probe           = &raw_probe,
     .bdrv_reopen_prepare  = &raw_reopen_prepare,
+    .bdrv_reopen_commit   = &raw_reopen_commit,
+    .bdrv_reopen_abort    = &raw_reopen_abort,
     .bdrv_open            = &raw_open,
     .bdrv_close           = &raw_close,
     .bdrv_create          = &raw_create,
diff --git a/block/rbd.c b/block/rbd.c
index f6e1d4bc11..a57b3e3c5d 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -365,45 +365,44 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp)
         rados_conf_read_file(cluster, NULL);
     } else if (conf[0] != '\0' &&
                qemu_rbd_set_conf(cluster, conf, true, &local_err) < 0) {
-        rados_shutdown(cluster);
         error_propagate(errp, local_err);
-        return -EIO;
+        ret = -EIO;
+        goto shutdown;
     }
 
     if (conf[0] != '\0' &&
         qemu_rbd_set_conf(cluster, conf, false, &local_err) < 0) {
-        rados_shutdown(cluster);
         error_propagate(errp, local_err);
-        return -EIO;
+        ret = -EIO;
+        goto shutdown;
     }
 
     if (qemu_rbd_set_auth(cluster, secretid, errp) < 0) {
-        rados_shutdown(cluster);
-        return -EIO;
+        ret = -EIO;
+        goto shutdown;
     }
 
     ret = rados_connect(cluster);
     if (ret < 0) {
         error_setg_errno(errp, -ret, "error connecting");
-        rados_shutdown(cluster);
-        return ret;
+        goto shutdown;
     }
 
     ret = rados_ioctx_create(cluster, pool, &io_ctx);
     if (ret < 0) {
         error_setg_errno(errp, -ret, "error opening pool %s", pool);
-        rados_shutdown(cluster);
-        return ret;
+        goto shutdown;
     }
 
     ret = rbd_create(io_ctx, name, bytes, &obj_order);
-    rados_ioctx_destroy(io_ctx);
-    rados_shutdown(cluster);
     if (ret < 0) {
         error_setg_errno(errp, -ret, "error rbd create");
-        return ret;
     }
 
+    rados_ioctx_destroy(io_ctx);
+
+shutdown:
+    rados_shutdown(cluster);
     return ret;
 }
 
diff --git a/block/replication.c b/block/replication.c
index 8bbfc8f870..d5e2b0f497 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -138,6 +138,9 @@ static void replication_close(BlockDriverState *bs)
     if (s->replication_state == BLOCK_REPLICATION_RUNNING) {
         replication_stop(s->rs, false, NULL);
     }
+    if (s->replication_state == BLOCK_REPLICATION_FAILOVER) {
+        block_job_cancel_sync(s->active_disk->bs->job);
+    }
 
     if (s->mode == REPLICATION_MODE_SECONDARY) {
         g_free(s->top_id);
@@ -319,9 +322,10 @@ static void secondary_do_checkpoint(BDRVReplicationState *s, Error **errp)
     }
 }
 
-static void reopen_backing_file(BDRVReplicationState *s, bool writable,
+static void reopen_backing_file(BlockDriverState *bs, bool writable,
                                 Error **errp)
 {
+    BDRVReplicationState *s = bs->opaque;
     BlockReopenQueue *reopen_queue = NULL;
     int orig_hidden_flags, orig_secondary_flags;
     int new_hidden_flags, new_secondary_flags;
@@ -356,13 +360,15 @@ static void reopen_backing_file(BDRVReplicationState *s, bool writable,
     }
 
     if (reopen_queue) {
-        bdrv_reopen_multiple(reopen_queue, &local_err);
+        bdrv_reopen_multiple(bdrv_get_aio_context(bs),
+                             reopen_queue, &local_err);
         error_propagate(errp, local_err);
     }
 }
 
-static void backup_job_cleanup(BDRVReplicationState *s)
+static void backup_job_cleanup(BlockDriverState *bs)
 {
+    BDRVReplicationState *s = bs->opaque;
     BlockDriverState *top_bs;
 
     top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL);
@@ -371,19 +377,20 @@ static void backup_job_cleanup(BDRVReplicationState *s)
     }
     bdrv_op_unblock_all(top_bs, s->blocker);
     error_free(s->blocker);
-    reopen_backing_file(s, false, NULL);
+    reopen_backing_file(bs, false, NULL);
 }
 
 static void backup_job_completed(void *opaque, int ret)
 {
-    BDRVReplicationState *s = opaque;
+    BlockDriverState *bs = opaque;
+    BDRVReplicationState *s = bs->opaque;
 
     if (s->replication_state != BLOCK_REPLICATION_FAILOVER) {
         /* The backup job is cancelled unexpectedly */
         s->error = -EIO;
     }
 
-    backup_job_cleanup(s);
+    backup_job_cleanup(bs);
 }
 
 static bool check_top_bs(BlockDriverState *top_bs, BlockDriverState *bs)
@@ -479,7 +486,7 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
         }
 
         /* reopen the backing file in r/w mode */
-        reopen_backing_file(s, true, &local_err);
+        reopen_backing_file(bs, true, &local_err);
         if (local_err) {
             error_propagate(errp, local_err);
             aio_context_release(aio_context);
@@ -494,20 +501,21 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
         if (!top_bs || !bdrv_is_root_node(top_bs) ||
             !check_top_bs(top_bs, bs)) {
             error_setg(errp, "No top_bs or it is invalid");
-            reopen_backing_file(s, false, NULL);
+            reopen_backing_file(bs, false, NULL);
             aio_context_release(aio_context);
             return;
         }
         bdrv_op_block_all(top_bs, s->blocker);
         bdrv_op_unblock(top_bs, BLOCK_OP_TYPE_DATAPLANE, s->blocker);
 
-        backup_start("replication-backup", s->secondary_disk->bs,
-                     s->hidden_disk->bs, 0, MIRROR_SYNC_MODE_NONE, NULL, false,
+        backup_start(NULL, s->secondary_disk->bs, s->hidden_disk->bs, 0,
+                     MIRROR_SYNC_MODE_NONE, NULL, false,
                      BLOCKDEV_ON_ERROR_REPORT, BLOCKDEV_ON_ERROR_REPORT,
-                     backup_job_completed, s, NULL, &local_err);
+                     BLOCK_JOB_INTERNAL, backup_job_completed, bs,
+                     NULL, &local_err);
         if (local_err) {
             error_propagate(errp, local_err);
-            backup_job_cleanup(s);
+            backup_job_cleanup(bs);
             aio_context_release(aio_context);
             return;
         }
@@ -626,10 +634,9 @@ static void replication_stop(ReplicationState *rs, bool failover, Error **errp)
         }
 
         s->replication_state = BLOCK_REPLICATION_FAILOVER;
-        commit_active_start("replication-commit", s->active_disk->bs,
-                            s->secondary_disk->bs, 0, BLOCKDEV_ON_ERROR_REPORT,
-                            replication_done,
-                            bs, errp, true);
+        commit_active_start(NULL, s->active_disk->bs, s->secondary_disk->bs,
+                            BLOCK_JOB_INTERNAL, 0, BLOCKDEV_ON_ERROR_REPORT,
+                            replication_done, bs, errp, true);
         break;
     default:
         aio_context_release(aio_context);
diff --git a/block/sheepdog.c b/block/sheepdog.c
index ccbf7e1fa6..1fb917343a 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -641,6 +641,7 @@ static void restart_co_req(void *opaque)
 
 typedef struct SheepdogReqCo {
     int sockfd;
+    BlockDriverState *bs;
     AioContext *aio_context;
     SheepdogReq *hdr;
     void *data;
@@ -701,6 +702,9 @@ out:
 
     srco->ret = ret;
     srco->finished = true;
+    if (srco->bs) {
+        bdrv_wakeup(srco->bs);
+    }
 }
 
 /*
@@ -708,13 +712,14 @@ out:
  *
  * Return 0 on success, -errno in case of error.
  */
-static int do_req(int sockfd, AioContext *aio_context, SheepdogReq *hdr,
+static int do_req(int sockfd, BlockDriverState *bs, SheepdogReq *hdr,
                   void *data, unsigned int *wlen, unsigned int *rlen)
 {
     Coroutine *co;
     SheepdogReqCo srco = {
         .sockfd = sockfd,
-        .aio_context = aio_context,
+        .aio_context = bs ? bdrv_get_aio_context(bs) : qemu_get_aio_context(),
+        .bs = bs,
         .hdr = hdr,
         .data = data,
         .wlen = wlen,
@@ -727,9 +732,14 @@ static int do_req(int sockfd, AioContext *aio_context, SheepdogReq *hdr,
         do_co_req(&srco);
     } else {
         co = qemu_coroutine_create(do_co_req, &srco);
-        qemu_coroutine_enter(co);
-        while (!srco.finished) {
-            aio_poll(aio_context, true);
+        if (bs) {
+            qemu_coroutine_enter(co);
+            BDRV_POLL_WHILE(bs, !srco.finished);
+        } else {
+            qemu_coroutine_enter(co);
+            while (!srco.finished) {
+                aio_poll(qemu_get_aio_context(), true);
+            }
         }
     }
 
@@ -1125,7 +1135,7 @@ static int find_vdi_name(BDRVSheepdogState *s, const char *filename,
     hdr.snapid = snapid;
     hdr.flags = SD_FLAG_CMD_WRITE;
 
-    ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
+    ret = do_req(fd, s->bs, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
     if (ret) {
         error_setg_errno(errp, -ret, "cannot get vdi info");
         goto out;
@@ -1240,7 +1250,7 @@ out:
     qemu_co_mutex_unlock(&s->lock);
 }
 
-static int read_write_object(int fd, AioContext *aio_context, char *buf,
+static int read_write_object(int fd, BlockDriverState *bs, char *buf,
                              uint64_t oid, uint8_t copies,
                              unsigned int datalen, uint64_t offset,
                              bool write, bool create, uint32_t cache_flags)
@@ -1274,7 +1284,7 @@ static int read_write_object(int fd, AioContext *aio_context, char *buf,
     hdr.offset = offset;
     hdr.copies = copies;
 
-    ret = do_req(fd, aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
+    ret = do_req(fd, bs, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
     if (ret) {
         error_report("failed to send a request to the sheep");
         return ret;
@@ -1289,22 +1299,22 @@ static int read_write_object(int fd, AioContext *aio_context, char *buf,
     }
 }
 
-static int read_object(int fd, AioContext *aio_context, char *buf,
+static int read_object(int fd, BlockDriverState *bs, char *buf,
                        uint64_t oid, uint8_t copies,
                        unsigned int datalen, uint64_t offset,
                        uint32_t cache_flags)
 {
-    return read_write_object(fd, aio_context, buf, oid, copies,
+    return read_write_object(fd, bs, buf, oid, copies,
                              datalen, offset, false,
                              false, cache_flags);
 }
 
-static int write_object(int fd, AioContext *aio_context, char *buf,
+static int write_object(int fd, BlockDriverState *bs, char *buf,
                         uint64_t oid, uint8_t copies,
                         unsigned int datalen, uint64_t offset, bool create,
                         uint32_t cache_flags)
 {
-    return read_write_object(fd, aio_context, buf, oid, copies,
+    return read_write_object(fd, bs, buf, oid, copies,
                              datalen, offset, true,
                              create, cache_flags);
 }
@@ -1331,7 +1341,7 @@ static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag)
         goto out;
     }
 
-    ret = read_object(fd, s->aio_context, (char *)inode, vid_to_vdi_oid(vid),
+    ret = read_object(fd, s->bs, (char *)inode, vid_to_vdi_oid(vid),
                       s->inode.nr_copies, SD_INODE_HEADER_SIZE, 0,
                       s->cache_flags);
     if (ret < 0) {
@@ -1489,7 +1499,7 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags,
     }
 
     buf = g_malloc(SD_INODE_SIZE);
-    ret = read_object(fd, s->aio_context, buf, vid_to_vdi_oid(vid),
+    ret = read_object(fd, s->bs, buf, vid_to_vdi_oid(vid),
                       0, SD_INODE_SIZE, 0, s->cache_flags);
 
     closesocket(fd);
@@ -1618,7 +1628,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
     hdr.copies = s->inode.nr_copies;
     hdr.block_size_shift = s->inode.block_size_shift;
 
-    ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
+    ret = do_req(fd, NULL, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
 
     closesocket(fd);
 
@@ -1886,7 +1896,7 @@ static int sd_create(const char *filename, QemuOpts *opts,
         hdr.opcode = SD_OP_GET_CLUSTER_DEFAULT;
         hdr.proto_ver = SD_PROTO_VER;
 
-        ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr,
+        ret = do_req(fd, NULL, (SheepdogReq *)&hdr,
                      NULL, &wlen, &rlen);
         closesocket(fd);
         if (ret) {
@@ -1951,7 +1961,7 @@ static void sd_close(BlockDriverState *bs)
     hdr.data_length = wlen;
     hdr.flags = SD_FLAG_CMD_WRITE;
 
-    ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr,
+    ret = do_req(fd, s->bs, (SheepdogReq *)&hdr,
                  s->name, &wlen, &rlen);
 
     closesocket(fd);
@@ -2000,7 +2010,7 @@ static int sd_truncate(BlockDriverState *bs, int64_t offset)
     /* we don't need to update entire object */
     datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
     s->inode.vdi_size = offset;
-    ret = write_object(fd, s->aio_context, (char *)&s->inode,
+    ret = write_object(fd, s->bs, (char *)&s->inode,
                        vid_to_vdi_oid(s->inode.vdi_id), s->inode.nr_copies,
                        datalen, 0, false, s->cache_flags);
     close(fd);
@@ -2070,7 +2080,7 @@ static bool sd_delete(BDRVSheepdogState *s)
         return false;
     }
 
-    ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr,
+    ret = do_req(fd, s->bs, (SheepdogReq *)&hdr,
                  s->name, &wlen, &rlen);
     closesocket(fd);
     if (ret) {
@@ -2126,7 +2136,7 @@ static int sd_create_branch(BDRVSheepdogState *s)
         goto out;
     }
 
-    ret = read_object(fd, s->aio_context, buf, vid_to_vdi_oid(vid),
+    ret = read_object(fd, s->bs, buf, vid_to_vdi_oid(vid),
                       s->inode.nr_copies, SD_INODE_SIZE, 0, s->cache_flags);
 
     closesocket(fd);
@@ -2411,7 +2421,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
         goto cleanup;
     }
 
-    ret = write_object(fd, s->aio_context, (char *)&s->inode,
+    ret = write_object(fd, s->bs, (char *)&s->inode,
                        vid_to_vdi_oid(s->inode.vdi_id), s->inode.nr_copies,
                        datalen, 0, false, s->cache_flags);
     if (ret < 0) {
@@ -2426,7 +2436,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
         goto cleanup;
     }
 
-    ret = read_object(fd, s->aio_context, (char *)inode,
+    ret = read_object(fd, s->bs, (char *)inode,
                       vid_to_vdi_oid(new_vid), s->inode.nr_copies, datalen, 0,
                       s->cache_flags);
 
@@ -2528,7 +2538,7 @@ static bool remove_objects(BDRVSheepdogState *s)
             i++;
         }
 
-        ret = write_object(fd, s->aio_context,
+        ret = write_object(fd, s->bs,
                            (char *)&inode->data_vdi_id[start_idx],
                            vid_to_vdi_oid(s->inode.vdi_id), inode->nr_copies,
                            (i - start_idx) * sizeof(uint32_t),
@@ -2600,7 +2610,7 @@ static int sd_snapshot_delete(BlockDriverState *bs,
         return -1;
     }
 
-    ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr,
+    ret = do_req(fd, s->bs, (SheepdogReq *)&hdr,
                  buf, &wlen, &rlen);
     closesocket(fd);
     if (ret) {
@@ -2652,8 +2662,7 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
     req.opcode = SD_OP_READ_VDIS;
     req.data_length = max;
 
-    ret = do_req(fd, s->aio_context, &req,
-                 vdi_inuse, &wlen, &rlen);
+    ret = do_req(fd, s->bs, &req, vdi_inuse, &wlen, &rlen);
 
     closesocket(fd);
     if (ret) {
@@ -2679,7 +2688,7 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
         }
 
         /* we don't need to read entire object */
-        ret = read_object(fd, s->aio_context, (char *)&inode,
+        ret = read_object(fd, s->bs, (char *)&inode,
                           vid_to_vdi_oid(vid),
                           0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0,
                           s->cache_flags);
@@ -2745,11 +2754,11 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
 
         create = (offset == 0);
         if (load) {
-            ret = read_object(fd, s->aio_context, (char *)data, vmstate_oid,
+            ret = read_object(fd, s->bs, (char *)data, vmstate_oid,
                               s->inode.nr_copies, data_len, offset,
                               s->cache_flags);
         } else {
-            ret = write_object(fd, s->aio_context, (char *)data, vmstate_oid,
+            ret = write_object(fd, s->bs, (char *)data, vmstate_oid,
                                s->inode.nr_copies, data_len, offset, create,
                                s->cache_flags);
         }
diff --git a/block/ssh.c b/block/ssh.c
index 5ce12b633a..ca071c569c 100644
--- a/block/ssh.c
+++ b/block/ssh.c
@@ -30,10 +30,14 @@
 #include "block/block_int.h"
 #include "qapi/error.h"
 #include "qemu/error-report.h"
+#include "qemu/cutils.h"
 #include "qemu/sockets.h"
 #include "qemu/uri.h"
+#include "qapi-visit.h"
 #include "qapi/qmp/qint.h"
 #include "qapi/qmp/qstring.h"
+#include "qapi/qobject-input-visitor.h"
+#include "qapi/qobject-output-visitor.h"
 
 /* DEBUG_SSH=1 enables the DPRINTF (debugging printf) statements in
  * this block driver code.
@@ -74,8 +78,9 @@ typedef struct BDRVSSHState {
      */
     LIBSSH2_SFTP_ATTRIBUTES attrs;
 
+    InetSocketAddress *inet;
+
     /* Used to warn if 'flush' is not supported. */
-    char *hostport;
     bool unsafe_flush_warning;
 } BDRVSSHState;
 
@@ -89,7 +94,6 @@ static void ssh_state_init(BDRVSSHState *s)
 
 static void ssh_state_free(BDRVSSHState *s)
 {
-    g_free(s->hostport);
     if (s->sftp_handle) {
         libssh2_sftp_close(s->sftp_handle);
     }
@@ -193,6 +197,7 @@ static int parse_uri(const char *filename, QDict *options, Error **errp)
 {
     URI *uri = NULL;
     QueryParams *qp;
+    char *port_str;
     int i;
 
     uri = uri_parse(filename);
@@ -225,11 +230,11 @@ static int parse_uri(const char *filename, QDict *options, Error **errp)
         qdict_put(options, "user", qstring_from_str(uri->user));
     }
 
-    qdict_put(options, "host", qstring_from_str(uri->server));
+    qdict_put(options, "server.host", qstring_from_str(uri->server));
 
-    if (uri->port) {
-        qdict_put(options, "port", qint_from_int(uri->port));
-    }
+    port_str = g_strdup_printf("%d", uri->port ?: 22);
+    qdict_put(options, "server.port", qstring_from_str(port_str));
+    g_free(port_str);
 
     qdict_put(options, "path", qstring_from_str(uri->path));
 
@@ -254,15 +259,31 @@ static int parse_uri(const char *filename, QDict *options, Error **errp)
     return -EINVAL;
 }
 
+static bool ssh_has_filename_options_conflict(QDict *options, Error **errp)
+{
+    const QDictEntry *qe;
+
+    for (qe = qdict_first(options); qe; qe = qdict_next(options, qe)) {
+        if (!strcmp(qe->key, "host") ||
+            !strcmp(qe->key, "port") ||
+            !strcmp(qe->key, "path") ||
+            !strcmp(qe->key, "user") ||
+            !strcmp(qe->key, "host_key_check") ||
+            strstart(qe->key, "server.", NULL))
+        {
+            error_setg(errp, "Option '%s' cannot be used with a file name",
+                       qe->key);
+            return true;
+        }
+    }
+
+    return false;
+}
+
 static void ssh_parse_filename(const char *filename, QDict *options,
                                Error **errp)
 {
-    if (qdict_haskey(options, "user") ||
-        qdict_haskey(options, "host") ||
-        qdict_haskey(options, "port") ||
-        qdict_haskey(options, "path") ||
-        qdict_haskey(options, "host_key_check")) {
-        error_setg(errp, "user, host, port, path, host_key_check cannot be used at the same time as a file option");
+    if (ssh_has_filename_options_conflict(options, errp)) {
         return;
     }
 
@@ -540,14 +561,69 @@ static QemuOptsList ssh_runtime_opts = {
     },
 };
 
+static bool ssh_process_legacy_socket_options(QDict *output_opts,
+                                              QemuOpts *legacy_opts,
+                                              Error **errp)
+{
+    const char *host = qemu_opt_get(legacy_opts, "host");
+    const char *port = qemu_opt_get(legacy_opts, "port");
+
+    if (!host && port) {
+        error_setg(errp, "port may not be used without host");
+        return false;
+    }
+
+    if (host) {
+        qdict_put(output_opts, "server.host", qstring_from_str(host));
+        qdict_put(output_opts, "server.port",
+                  qstring_from_str(port ?: stringify(22)));
+    }
+
+    return true;
+}
+
+static InetSocketAddress *ssh_config(BDRVSSHState *s, QDict *options,
+                                     Error **errp)
+{
+    InetSocketAddress *inet = NULL;
+    QDict *addr = NULL;
+    QObject *crumpled_addr = NULL;
+    Visitor *iv = NULL;
+    Error *local_error = NULL;
+
+    qdict_extract_subqdict(options, &addr, "server.");
+    if (!qdict_size(addr)) {
+        error_setg(errp, "SSH server address missing");
+        goto out;
+    }
+
+    crumpled_addr = qdict_crumple(addr, errp);
+    if (!crumpled_addr) {
+        goto out;
+    }
+
+    iv = qobject_input_visitor_new(crumpled_addr, true);
+    visit_type_InetSocketAddress(iv, NULL, &inet, &local_error);
+    if (local_error) {
+        error_propagate(errp, local_error);
+        goto out;
+    }
+
+out:
+    QDECREF(addr);
+    qobject_decref(crumpled_addr);
+    visit_free(iv);
+    return inet;
+}
+
 static int connect_to_ssh(BDRVSSHState *s, QDict *options,
                           int ssh_flags, int creat_mode, Error **errp)
 {
     int r, ret;
     QemuOpts *opts = NULL;
     Error *local_err = NULL;
-    const char *host, *user, *path, *host_key_check;
-    int port;
+    const char *user, *path, *host_key_check;
+    long port = 0;
 
     opts = qemu_opts_create(&ssh_runtime_opts, NULL, 0, &error_abort);
     qemu_opts_absorb_qdict(opts, options, &local_err);
@@ -557,15 +633,11 @@ static int connect_to_ssh(BDRVSSHState *s, QDict *options,
         goto err;
     }
 
-    host = qemu_opt_get(opts, "host");
-    if (!host) {
+    if (!ssh_process_legacy_socket_options(options, opts, errp)) {
         ret = -EINVAL;
-        error_setg(errp, "No hostname was specified");
         goto err;
     }
 
-    port = qemu_opt_get_number(opts, "port", 22);
-
     path = qemu_opt_get(opts, "path");
     if (!path) {
         ret = -EINVAL;
@@ -588,12 +660,21 @@ static int connect_to_ssh(BDRVSSHState *s, QDict *options,
         host_key_check = "yes";
     }
 
-    /* Construct the host:port name for inet_connect. */
-    g_free(s->hostport);
-    s->hostport = g_strdup_printf("%s:%d", host, port);
+    /* Pop the config into our state object, Exit if invalid */
+    s->inet = ssh_config(s, options, errp);
+    if (!s->inet) {
+        ret = -EINVAL;
+        goto err;
+    }
+
+    if (qemu_strtol(s->inet->port, NULL, 10, &port) < 0) {
+        error_setg(errp, "Use only numeric port value");
+        ret = -EINVAL;
+        goto err;
+    }
 
     /* Open the socket and connect. */
-    s->sock = inet_connect(s->hostport, errp);
+    s->sock = inet_connect_saddr(s->inet, errp, NULL, NULL);
     if (s->sock < 0) {
         ret = -EIO;
         goto err;
@@ -619,7 +700,8 @@ static int connect_to_ssh(BDRVSSHState *s, QDict *options,
     }
 
     /* Check the remote host's key against known_hosts. */
-    ret = check_host_key(s, host, port, host_key_check, errp);
+    ret = check_host_key(s, s->inet->host, port, host_key_check,
+                         errp);
     if (ret < 0) {
         goto err;
     }
@@ -1040,7 +1122,7 @@ static void unsafe_flush_warning(BDRVSSHState *s, const char *what)
 {
     if (!s->unsafe_flush_warning) {
         error_report("warning: ssh server %s does not support fsync",
-                     s->hostport);
+                     s->inet->host);
         if (what) {
             error_report("to support fsync, you need %s", what);
         }
diff --git a/block/stream.c b/block/stream.c
index 31874817c2..b05856bd65 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -14,7 +14,7 @@
 #include "qemu/osdep.h"
 #include "trace.h"
 #include "block/block_int.h"
-#include "block/blockjob.h"
+#include "block/blockjob_int.h"
 #include "qapi/error.h"
 #include "qapi/qmp/qerror.h"
 #include "qemu/ratelimit.h"
@@ -37,6 +37,7 @@ typedef struct StreamBlockJob {
     BlockDriverState *base;
     BlockdevOnError on_error;
     char *backing_file_str;
+    int bs_flags;
 } StreamBlockJob;
 
 static int coroutine_fn stream_populate(BlockBackend *blk,
@@ -81,6 +82,11 @@ static void stream_complete(BlockJob *job, void *opaque)
         bdrv_set_backing_hd(bs, base);
     }
 
+    /* Reopen the image back in read-only mode if necessary */
+    if (s->bs_flags != bdrv_get_flags(bs)) {
+        bdrv_reopen(bs, s->bs_flags, NULL);
+    }
+
     g_free(s->backing_file_str);
     block_job_completed(&s->common, data->ret);
     g_free(data);
@@ -216,22 +222,39 @@ static const BlockJobDriver stream_job_driver = {
 
 void stream_start(const char *job_id, BlockDriverState *bs,
                   BlockDriverState *base, const char *backing_file_str,
-                  int64_t speed, BlockdevOnError on_error,
-                  BlockCompletionFunc *cb, void *opaque, Error **errp)
+                  int64_t speed, BlockdevOnError on_error, Error **errp)
 {
     StreamBlockJob *s;
+    BlockDriverState *iter;
+    int orig_bs_flags;
 
     s = block_job_create(job_id, &stream_job_driver, bs, speed,
-                         cb, opaque, errp);
+                         BLOCK_JOB_DEFAULT, NULL, NULL, errp);
     if (!s) {
         return;
     }
 
+    /* Make sure that the image is opened in read-write mode */
+    orig_bs_flags = bdrv_get_flags(bs);
+    if (!(orig_bs_flags & BDRV_O_RDWR)) {
+        if (bdrv_reopen(bs, orig_bs_flags | BDRV_O_RDWR, errp) != 0) {
+            block_job_unref(&s->common);
+            return;
+        }
+    }
+
+    /* Block all intermediate nodes between bs and base, because they
+     * will disappear from the chain after this operation */
+    for (iter = backing_bs(bs); iter && iter != base; iter = backing_bs(iter)) {
+        block_job_add_bdrv(&s->common, iter);
+    }
+
     s->base = base;
     s->backing_file_str = g_strdup(backing_file_str);
+    s->bs_flags = orig_bs_flags;
 
     s->on_error = on_error;
     s->common.co = qemu_coroutine_create(stream_run, s);
-    trace_stream_start(bs, base, s, s->common.co, opaque);
+    trace_stream_start(bs, base, s, s->common.co);
     qemu_coroutine_enter(s->common.co);
 }
diff --git a/block/trace-events b/block/trace-events
index aff8a9674d..882c9034c2 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -19,11 +19,11 @@ bdrv_co_do_copy_on_readv(void *bs, int64_t offset, unsigned int bytes, int64_t c
 
 # block/stream.c
 stream_one_iteration(void *s, int64_t sector_num, int nb_sectors, int is_allocated) "s %p sector_num %"PRId64" nb_sectors %d is_allocated %d"
-stream_start(void *bs, void *base, void *s, void *co, void *opaque) "bs %p base %p s %p co %p opaque %p"
+stream_start(void *bs, void *base, void *s, void *co) "bs %p base %p s %p co %p"
 
 # block/commit.c
 commit_one_iteration(void *s, int64_t sector_num, int nb_sectors, int is_allocated) "s %p sector_num %"PRId64" nb_sectors %d is_allocated %d"
-commit_start(void *bs, void *base, void *top, void *s, void *co, void *opaque) "bs %p base %p top %p s %p co %p opaque %p"
+commit_start(void *bs, void *base, void *top, void *s, void *co) "bs %p base %p top %p s %p co %p"
 
 # block/mirror.c
 mirror_start(void *bs, void *s, void *co, void *opaque) "bs %p s %p co %p opaque %p"
@@ -51,7 +51,6 @@ qmp_block_job_cancel(void *job) "job %p"
 qmp_block_job_pause(void *job) "job %p"
 qmp_block_job_resume(void *job) "job %p"
 qmp_block_job_complete(void *job) "job %p"
-block_job_cb(void *bs, void *job, int ret) "bs %p job %p ret %d"
 qmp_block_stream(void *bs, void *job) "bs %p job %p"
 
 # block/raw-win32.c
diff --git a/blockdev.c b/blockdev.c
index d11a74f837..102ca9fe01 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -2905,39 +2905,15 @@ out:
     aio_context_release(aio_context);
 }
 
-static void block_job_cb(void *opaque, int ret)
-{
-    /* Note that this function may be executed from another AioContext besides
-     * the QEMU main loop.  If you need to access anything that assumes the
-     * QEMU global mutex, use a BH or introduce a mutex.
-     */
-
-    BlockDriverState *bs = opaque;
-    const char *msg = NULL;
-
-    trace_block_job_cb(bs, bs->job, ret);
-
-    assert(bs->job);
-
-    if (ret < 0) {
-        msg = strerror(-ret);
-    }
-
-    if (block_job_is_cancelled(bs->job)) {
-        block_job_event_cancelled(bs->job);
-    } else {
-        block_job_event_completed(bs->job, msg);
-    }
-}
-
 void qmp_block_stream(bool has_job_id, const char *job_id, const char *device,
                       bool has_base, const char *base,
+                      bool has_base_node, const char *base_node,
                       bool has_backing_file, const char *backing_file,
                       bool has_speed, int64_t speed,
                       bool has_on_error, BlockdevOnError on_error,
                       Error **errp)
 {
-    BlockDriverState *bs;
+    BlockDriverState *bs, *iter;
     BlockDriverState *base_bs = NULL;
     AioContext *aio_context;
     Error *local_err = NULL;
@@ -2947,7 +2923,7 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device,
         on_error = BLOCKDEV_ON_ERROR_REPORT;
     }
 
-    bs = qmp_get_root_bs(device, errp);
+    bs = bdrv_lookup_bs(device, device, errp);
     if (!bs) {
         return;
     }
@@ -2955,7 +2931,9 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device,
     aio_context = bdrv_get_aio_context(bs);
     aio_context_acquire(aio_context);
 
-    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_STREAM, errp)) {
+    if (has_base && has_base_node) {
+        error_setg(errp, "'base' and 'base-node' cannot be specified "
+                   "at the same time");
         goto out;
     }
 
@@ -2969,6 +2947,27 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device,
         base_name = base;
     }
 
+    if (has_base_node) {
+        base_bs = bdrv_lookup_bs(NULL, base_node, errp);
+        if (!base_bs) {
+            goto out;
+        }
+        if (bs == base_bs || !bdrv_chain_contains(bs, base_bs)) {
+            error_setg(errp, "Node '%s' is not a backing image of '%s'",
+                       base_node, device);
+            goto out;
+        }
+        assert(bdrv_get_aio_context(base_bs) == aio_context);
+        base_name = base_bs->filename;
+    }
+
+    /* Check for op blockers in the whole chain between bs and base */
+    for (iter = bs; iter && iter != base_bs; iter = backing_bs(iter)) {
+        if (bdrv_op_is_blocked(iter, BLOCK_OP_TYPE_STREAM, errp)) {
+            goto out;
+        }
+    }
+
     /* if we are streaming the entire chain, the result will have no backing
      * file, and specifying one is therefore an error */
     if (base_bs == NULL && has_backing_file) {
@@ -2981,7 +2980,7 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device,
     base_name = has_backing_file ? backing_file : base_name;
 
     stream_start(has_job_id ? job_id : NULL, bs, base_bs, base_name,
-                 has_speed ? speed : 0, on_error, block_job_cb, bs, &local_err);
+                 has_speed ? speed : 0, on_error, &local_err);
     if (local_err) {
         error_propagate(errp, local_err);
         goto out;
@@ -3001,6 +3000,7 @@ void qmp_block_commit(bool has_job_id, const char *job_id, const char *device,
                       Error **errp)
 {
     BlockDriverState *bs;
+    BlockDriverState *iter;
     BlockDriverState *base_bs, *top_bs;
     AioContext *aio_context;
     Error *local_err = NULL;
@@ -3067,8 +3067,10 @@ void qmp_block_commit(bool has_job_id, const char *job_id, const char *device,
 
     assert(bdrv_get_aio_context(base_bs) == aio_context);
 
-    if (bdrv_op_is_blocked(base_bs, BLOCK_OP_TYPE_COMMIT_TARGET, errp)) {
-        goto out;
+    for (iter = top_bs; iter != backing_bs(base_bs); iter = backing_bs(iter)) {
+        if (bdrv_op_is_blocked(iter, BLOCK_OP_TYPE_COMMIT_TARGET, errp)) {
+            goto out;
+        }
     }
 
     /* Do not allow attempts to commit an image into itself */
@@ -3083,12 +3085,17 @@ void qmp_block_commit(bool has_job_id, const char *job_id, const char *device,
                              " but 'top' is the active layer");
             goto out;
         }
-        commit_active_start(has_job_id ? job_id : NULL, bs, base_bs, speed,
-                            on_error, block_job_cb, bs, &local_err, false);
+        commit_active_start(has_job_id ? job_id : NULL, bs, base_bs,
+                            BLOCK_JOB_DEFAULT, speed, on_error, NULL, NULL,
+                            &local_err, false);
     } else {
+        BlockDriverState *overlay_bs = bdrv_find_overlay(bs, top_bs);
+        if (bdrv_op_is_blocked(overlay_bs, BLOCK_OP_TYPE_COMMIT_TARGET, errp)) {
+            goto out;
+        }
         commit_start(has_job_id ? job_id : NULL, bs, base_bs, top_bs, speed,
-                     on_error, block_job_cb, bs,
-                     has_backing_file ? backing_file : NULL, &local_err);
+                     on_error, has_backing_file ? backing_file : NULL,
+                     &local_err);
     }
     if (local_err != NULL) {
         error_propagate(errp, local_err);
@@ -3208,7 +3215,8 @@ static void do_drive_backup(DriveBackup *backup, BlockJobTxn *txn, Error **errp)
 
     backup_start(backup->job_id, bs, target_bs, backup->speed, backup->sync,
                  bmap, backup->compress, backup->on_source_error,
-                 backup->on_target_error, block_job_cb, bs, txn, &local_err);
+                 backup->on_target_error, BLOCK_JOB_DEFAULT,
+                 NULL, NULL, txn, &local_err);
     bdrv_unref(target_bs);
     if (local_err != NULL) {
         error_propagate(errp, local_err);
@@ -3278,7 +3286,8 @@ void do_blockdev_backup(BlockdevBackup *backup, BlockJobTxn *txn, Error **errp)
     }
     backup_start(backup->job_id, bs, target_bs, backup->speed, backup->sync,
                  NULL, backup->compress, backup->on_source_error,
-                 backup->on_target_error, block_job_cb, bs, txn, &local_err);
+                 backup->on_target_error, BLOCK_JOB_DEFAULT,
+                 NULL, NULL, txn, &local_err);
     if (local_err != NULL) {
         error_propagate(errp, local_err);
     }
@@ -3357,8 +3366,7 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
     mirror_start(job_id, bs, target,
                  has_replaces ? replaces : NULL,
                  speed, granularity, buf_size, sync, backing_mode,
-                 on_source_error, on_target_error, unmap,
-                 block_job_cb, bs, errp);
+                 on_source_error, on_target_error, unmap, errp);
 }
 
 void qmp_drive_mirror(DriveMirror *arg, Error **errp)
@@ -3602,7 +3610,7 @@ void qmp_block_job_cancel(const char *device,
         force = false;
     }
 
-    if (job->user_paused && !force) {
+    if (block_job_user_paused(job) && !force) {
         error_setg(errp, "The block job for device '%s' is currently paused",
                    device);
         goto out;
@@ -3619,13 +3627,12 @@ void qmp_block_job_pause(const char *device, Error **errp)
     AioContext *aio_context;
     BlockJob *job = find_block_job(device, &aio_context, errp);
 
-    if (!job || job->user_paused) {
+    if (!job || block_job_user_paused(job)) {
         return;
     }
 
-    job->user_paused = true;
     trace_qmp_block_job_pause(job);
-    block_job_pause(job);
+    block_job_user_pause(job);
     aio_context_release(aio_context);
 }
 
@@ -3634,14 +3641,13 @@ void qmp_block_job_resume(const char *device, Error **errp)
     AioContext *aio_context;
     BlockJob *job = find_block_job(device, &aio_context, errp);
 
-    if (!job || !job->user_paused) {
+    if (!job || !block_job_user_paused(job)) {
         return;
     }
 
-    job->user_paused = false;
     trace_qmp_block_job_resume(job);
     block_job_iostatus_reset(job);
-    block_job_resume(job);
+    block_job_user_resume(job);
     aio_context_release(aio_context);
 }
 
@@ -3915,13 +3921,22 @@ BlockJobInfoList *qmp_query_block_jobs(Error **errp)
     BlockJob *job;
 
     for (job = block_job_next(NULL); job; job = block_job_next(job)) {
-        BlockJobInfoList *elem = g_new0(BlockJobInfoList, 1);
-        AioContext *aio_context = blk_get_aio_context(job->blk);
+        BlockJobInfoList *elem;
+        AioContext *aio_context;
 
+        if (block_job_is_internal(job)) {
+            continue;
+        }
+        elem = g_new0(BlockJobInfoList, 1);
+        aio_context = blk_get_aio_context(job->blk);
         aio_context_acquire(aio_context);
-        elem->value = block_job_query(job);
+        elem->value = block_job_query(job, errp);
         aio_context_release(aio_context);
-
+        if (!elem->value) {
+            g_free(elem);
+            qapi_free_BlockJobInfoList(head);
+            return NULL;
+        }
         *p_next = elem;
         p_next = &elem->next;
     }
diff --git a/blockjob.c b/blockjob.c
index 43fecbe13e..4aa14a4974 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -27,7 +27,7 @@
 #include "qemu-common.h"
 #include "trace.h"
 #include "block/block.h"
-#include "block/blockjob.h"
+#include "block/blockjob_int.h"
 #include "block/block_int.h"
 #include "sysemu/block-backend.h"
 #include "qapi/qmp/qerror.h"
@@ -38,6 +38,9 @@
 #include "qemu/timer.h"
 #include "qapi-event.h"
 
+static void block_job_event_cancelled(BlockJob *job);
+static void block_job_event_completed(BlockJob *job, const char *msg);
+
 /* Transactional group of block jobs */
 struct BlockJobTxn {
 
@@ -66,7 +69,7 @@ BlockJob *block_job_get(const char *id)
     BlockJob *job;
 
     QLIST_FOREACH(job, &block_jobs, job_list) {
-        if (!strcmp(id, job->id)) {
+        if (job->id && !strcmp(id, job->id)) {
             return job;
         }
     }
@@ -74,17 +77,6 @@ BlockJob *block_job_get(const char *id)
     return NULL;
 }
 
-/* Normally the job runs in its BlockBackend's AioContext.  The exception is
- * block_job_defer_to_main_loop() where it runs in the QEMU main loop.  Code
- * that supports both cases uses this helper function.
- */
-static AioContext *block_job_get_aio_context(BlockJob *job)
-{
-    return job->deferred_to_main_loop ?
-           qemu_get_aio_context() :
-           blk_get_aio_context(job->blk);
-}
-
 static void block_job_attached_aio_context(AioContext *new_context,
                                            void *opaque)
 {
@@ -97,6 +89,17 @@ static void block_job_attached_aio_context(AioContext *new_context,
     block_job_resume(job);
 }
 
+static void block_job_drain(BlockJob *job)
+{
+    /* If job is !job->busy this kicks it into the next pause point. */
+    block_job_enter(job);
+
+    blk_drain(job->blk);
+    if (job->driver->drain) {
+        job->driver->drain(job);
+    }
+}
+
 static void block_job_detach_aio_context(void *opaque)
 {
     BlockJob *job = opaque;
@@ -106,31 +109,33 @@ static void block_job_detach_aio_context(void *opaque)
 
     block_job_pause(job);
 
-    if (!job->paused) {
-        /* If job is !job->busy this kicks it into the next pause point. */
-        block_job_enter(job);
-    }
     while (!job->paused && !job->completed) {
-        aio_poll(block_job_get_aio_context(job), true);
+        block_job_drain(job);
     }
 
     block_job_unref(job);
 }
 
+void block_job_add_bdrv(BlockJob *job, BlockDriverState *bs)
+{
+    job->nodes = g_slist_prepend(job->nodes, bs);
+    bdrv_ref(bs);
+    bdrv_op_block_all(bs, job->blocker);
+}
+
 void *block_job_create(const char *job_id, const BlockJobDriver *driver,
-                       BlockDriverState *bs, int64_t speed,
+                       BlockDriverState *bs, int64_t speed, int flags,
                        BlockCompletionFunc *cb, void *opaque, Error **errp)
 {
     BlockBackend *blk;
     BlockJob *job;
 
-    assert(cb);
     if (bs->job) {
         error_setg(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs));
         return NULL;
     }
 
-    if (job_id == NULL) {
+    if (job_id == NULL && !(flags & BLOCK_JOB_INTERNAL)) {
         job_id = bdrv_get_device_name(bs);
         if (!*job_id) {
             error_setg(errp, "An explicit job ID is required for this node");
@@ -138,14 +143,21 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
         }
     }
 
-    if (!id_wellformed(job_id)) {
-        error_setg(errp, "Invalid job ID '%s'", job_id);
-        return NULL;
-    }
+    if (job_id) {
+        if (flags & BLOCK_JOB_INTERNAL) {
+            error_setg(errp, "Cannot specify job ID for internal block job");
+            return NULL;
+        }
 
-    if (block_job_get(job_id)) {
-        error_setg(errp, "Job ID '%s' already in use", job_id);
-        return NULL;
+        if (!id_wellformed(job_id)) {
+            error_setg(errp, "Invalid job ID '%s'", job_id);
+            return NULL;
+        }
+
+        if (block_job_get(job_id)) {
+            error_setg(errp, "Job ID '%s' already in use", job_id);
+            return NULL;
+        }
     }
 
     blk = blk_new();
@@ -154,7 +166,7 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
     job = g_malloc0(driver->instance_size);
     error_setg(&job->blocker, "block device is in use by block job: %s",
                BlockJobType_lookup[driver->job_type]);
-    bdrv_op_block_all(bs, job->blocker);
+    block_job_add_bdrv(job, bs);
     bdrv_op_unblock(bs, BLOCK_OP_TYPE_DATAPLANE, job->blocker);
 
     job->driver        = driver;
@@ -185,6 +197,11 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
     return job;
 }
 
+bool block_job_is_internal(BlockJob *job)
+{
+    return (job->id == NULL);
+}
+
 void block_job_ref(BlockJob *job)
 {
     ++job->refcnt;
@@ -193,9 +210,15 @@ void block_job_ref(BlockJob *job)
 void block_job_unref(BlockJob *job)
 {
     if (--job->refcnt == 0) {
+        GSList *l;
         BlockDriverState *bs = blk_bs(job->blk);
         bs->job = NULL;
-        bdrv_op_unblock_all(bs, job->blocker);
+        for (l = job->nodes; l; l = l->next) {
+            bs = l->data;
+            bdrv_op_unblock_all(bs, job->blocker);
+            bdrv_unref(bs);
+        }
+        g_slist_free(job->nodes);
         blk_remove_aio_context_notifier(job->blk,
                                         block_job_attached_aio_context,
                                         block_job_detach_aio_context, job);
@@ -218,7 +241,20 @@ static void block_job_completed_single(BlockJob *job)
             job->driver->abort(job);
         }
     }
-    job->cb(job->opaque, job->ret);
+
+    if (job->cb) {
+        job->cb(job->opaque, job->ret);
+    }
+    if (block_job_is_cancelled(job)) {
+        block_job_event_cancelled(job);
+    } else {
+        const char *msg = NULL;
+        if (job->ret < 0) {
+            msg = strerror(-job->ret);
+        }
+        block_job_event_completed(job, msg);
+    }
+
     if (job->txn) {
         block_job_txn_unref(job->txn);
     }
@@ -321,6 +357,8 @@ void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
 
 void block_job_complete(BlockJob *job, Error **errp)
 {
+    /* Should not be reachable via external interface for internal jobs */
+    assert(job->id);
     if (job->pause_count || job->cancelled || !job->driver->complete) {
         error_setg(errp, "The active block job '%s' cannot be completed",
                    job->id);
@@ -335,11 +373,22 @@ void block_job_pause(BlockJob *job)
     job->pause_count++;
 }
 
+void block_job_user_pause(BlockJob *job)
+{
+    job->user_paused = true;
+    block_job_pause(job);
+}
+
 static bool block_job_should_pause(BlockJob *job)
 {
     return job->pause_count > 0;
 }
 
+bool block_job_user_paused(BlockJob *job)
+{
+    return job ? job->user_paused : 0;
+}
+
 void coroutine_fn block_job_pause_point(BlockJob *job)
 {
     if (!block_job_should_pause(job)) {
@@ -376,6 +425,14 @@ void block_job_resume(BlockJob *job)
     block_job_enter(job);
 }
 
+void block_job_user_resume(BlockJob *job)
+{
+    if (job && job->user_paused && job->pause_count > 0) {
+        job->user_paused = false;
+        block_job_resume(job);
+    }
+}
+
 void block_job_enter(BlockJob *job)
 {
     if (job->co && !job->busy) {
@@ -413,14 +470,21 @@ static int block_job_finish_sync(BlockJob *job,
     assert(blk_bs(job->blk)->job == job);
 
     block_job_ref(job);
+
     finish(job, &local_err);
     if (local_err) {
         error_propagate(errp, local_err);
         block_job_unref(job);
         return -EBUSY;
     }
+    /* block_job_drain calls block_job_enter, and it should be enough to
+     * induce progress until the job completes or moves to the main thread.
+    */
+    while (!job->deferred_to_main_loop && !job->completed) {
+        block_job_drain(job);
+    }
     while (!job->completed) {
-        aio_poll(block_job_get_aio_context(job), true);
+        aio_poll(qemu_get_aio_context(), true);
     }
     ret = (job->cancelled && job->ret == 0) ? -ECANCELED : job->ret;
     block_job_unref(job);
@@ -494,9 +558,15 @@ void block_job_yield(BlockJob *job)
     block_job_pause_point(job);
 }
 
-BlockJobInfo *block_job_query(BlockJob *job)
+BlockJobInfo *block_job_query(BlockJob *job, Error **errp)
 {
-    BlockJobInfo *info = g_new0(BlockJobInfo, 1);
+    BlockJobInfo *info;
+
+    if (block_job_is_internal(job)) {
+        error_setg(errp, "Cannot query QEMU internal jobs");
+        return NULL;
+    }
+    info = g_new0(BlockJobInfo, 1);
     info->type      = g_strdup(BlockJobType_lookup[job->driver->job_type]);
     info->device    = g_strdup(job->id);
     info->len       = job->len;
@@ -517,8 +587,12 @@ static void block_job_iostatus_set_err(BlockJob *job, int error)
     }
 }
 
-void block_job_event_cancelled(BlockJob *job)
+static void block_job_event_cancelled(BlockJob *job)
 {
+    if (block_job_is_internal(job)) {
+        return;
+    }
+
     qapi_event_send_block_job_cancelled(job->driver->job_type,
                                         job->id,
                                         job->len,
@@ -527,8 +601,12 @@ void block_job_event_cancelled(BlockJob *job)
                                         &error_abort);
 }
 
-void block_job_event_completed(BlockJob *job, const char *msg)
+static void block_job_event_completed(BlockJob *job, const char *msg)
 {
+    if (block_job_is_internal(job)) {
+        return;
+    }
+
     qapi_event_send_block_job_completed(job->driver->job_type,
                                         job->id,
                                         job->len,
@@ -543,6 +621,10 @@ void block_job_event_ready(BlockJob *job)
 {
     job->ready = true;
 
+    if (block_job_is_internal(job)) {
+        return;
+    }
+
     qapi_event_send_block_job_ready(job->driver->job_type,
                                     job->id,
                                     job->len,
@@ -573,14 +655,15 @@ BlockErrorAction block_job_error_action(BlockJob *job, BlockdevOnError on_err,
     default:
         abort();
     }
-    qapi_event_send_block_job_error(job->id,
-                                    is_read ? IO_OPERATION_TYPE_READ :
-                                    IO_OPERATION_TYPE_WRITE,
-                                    action, &error_abort);
+    if (!block_job_is_internal(job)) {
+        qapi_event_send_block_job_error(job->id,
+                                        is_read ? IO_OPERATION_TYPE_READ :
+                                        IO_OPERATION_TYPE_WRITE,
+                                        action, &error_abort);
+    }
     if (action == BLOCK_ERROR_ACTION_STOP) {
         /* make the pause user visible, which will be resumed from QMP. */
-        job->user_paused = true;
-        block_job_pause(job);
+        block_job_user_pause(job);
         block_job_iostatus_set_err(job, error);
     }
     return action;
diff --git a/bsd-user/mmap.c b/bsd-user/mmap.c
index 610f91b285..ee5907330f 100644
--- a/bsd-user/mmap.c
+++ b/bsd-user/mmap.c
@@ -42,6 +42,11 @@ void mmap_unlock(void)
     }
 }
 
+bool have_mmap_lock(void)
+{
+    return mmap_lock_count > 0 ? true : false;
+}
+
 /* Grab lock to make sure things are in a consistent state after fork().  */
 void mmap_fork_start(void)
 {
diff --git a/configure b/configure
index f83cdf8876..fd6f898cbc 100755
--- a/configure
+++ b/configure
@@ -230,6 +230,7 @@ vhost_net="no"
 vhost_scsi="no"
 vhost_vsock="no"
 kvm="no"
+colo="yes"
 rdma=""
 gprof="no"
 debug_tcg="no"
@@ -918,6 +919,10 @@ for opt do
   ;;
   --enable-kvm) kvm="yes"
   ;;
+  --disable-colo) colo="no"
+  ;;
+  --enable-colo) colo="yes"
+  ;;
   --disable-tcg-interpreter) tcg_interpreter="no"
   ;;
   --enable-tcg-interpreter) tcg_interpreter="yes"
@@ -1366,6 +1371,7 @@ disabled with --disable-FEATURE, default is enabled if available:
   fdt             fdt device tree
   bluez           bluez stack connectivity
   kvm             KVM acceleration support
+  colo            COarse-grain LOck-stepping VM for Non-stop Service
   rdma            RDMA-based migration support
   vde             support for vde network
   netmap          support for netmap network
@@ -4668,6 +4674,33 @@ if compile_prog "" "" ; then
     have_rtnetlink=yes
 fi
 
+##########################################
+# check for usable AF_VSOCK environment
+have_af_vsock=no
+cat > $TMPC << EOF
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#if !defined(AF_VSOCK)
+# error missing AF_VSOCK flag
+#endif
+#include <linux/vm_sockets.h>
+int main(void) {
+    int sock, ret;
+    struct sockaddr_vm svm;
+    socklen_t len = sizeof(svm);
+    sock = socket(AF_VSOCK, SOCK_STREAM, 0);
+    ret = getpeername(sock, (struct sockaddr *)&svm, &len);
+    if ((ret == -1) && (errno == ENOTCONN)) {
+        return 0;
+    }
+    return -1;
+}
+EOF
+if compile_prog "" "" ; then
+    have_af_vsock=yes
+fi
+
 #################################################
 # Sparc implicitly links with --relax, which is
 # incompatible with -r, so --no-relax should be
@@ -5004,6 +5037,7 @@ echo "Linux AIO support $linux_aio"
 echo "ATTR/XATTR support $attr"
 echo "Install blobs     $blobs"
 echo "KVM support       $kvm"
+echo "COLO support      $colo"
 echo "RDMA support      $rdma"
 echo "TCG interpreter   $tcg_interpreter"
 echo "fdt support       $fdt"
@@ -5639,6 +5673,10 @@ if have_backend "syslog"; then
 fi
 echo "CONFIG_TRACE_FILE=$trace_file" >> $config_host_mak
 
+if test "$colo" = "yes"; then
+  echo "CONFIG_COLO=y" >> $config_host_mak
+fi
+
 if test "$rdma" = "yes" ; then
   echo "CONFIG_RDMA=y" >> $config_host_mak
 fi
@@ -5651,6 +5689,10 @@ if test "$replication" = "yes" ; then
   echo "CONFIG_REPLICATION=y" >> $config_host_mak
 fi
 
+if test "$have_af_vsock" = "yes" ; then
+  echo "CONFIG_AF_VSOCK=y" >> $config_host_mak
+fi
+
 # Hold two types of flag:
 #   CONFIG_THREAD_SETNAME_BYTHREAD  - we've got a way of setting the name on
 #                                     a thread we have a handle to
diff --git a/cpu-exec.c b/cpu-exec.c
index 9400732137..4188fed3c6 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -143,17 +143,20 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, TranslationBlock *itb)
     uint8_t *tb_ptr = itb->tc_ptr;
 
     qemu_log_mask_and_addr(CPU_LOG_EXEC, itb->pc,
-                           "Trace %p [" TARGET_FMT_lx "] %s\n",
-                           itb->tc_ptr, itb->pc, lookup_symbol(itb->pc));
+                           "Trace %p [%d: " TARGET_FMT_lx "] %s\n",
+                           itb->tc_ptr, cpu->cpu_index, itb->pc,
+                           lookup_symbol(itb->pc));
 
 #if defined(DEBUG_DISAS)
     if (qemu_loglevel_mask(CPU_LOG_TB_CPU)
         && qemu_log_in_addr_range(itb->pc)) {
+        qemu_log_lock();
 #if defined(TARGET_I386)
         log_cpu_state(cpu, CPU_DUMP_CCOP);
 #else
         log_cpu_state(cpu, 0);
 #endif
+        qemu_log_unlock();
     }
 #endif /* DEBUG_DISAS */
 
@@ -204,15 +207,21 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
     if (max_cycles > CF_COUNT_MASK)
         max_cycles = CF_COUNT_MASK;
 
+    tb_lock();
     tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base, orig_tb->flags,
                      max_cycles | CF_NOCACHE
                          | (ignore_icount ? CF_IGNORE_ICOUNT : 0));
     tb->orig_tb = orig_tb;
+    tb_unlock();
+
     /* execute the generated code */
     trace_exec_tb_nocache(tb, tb->pc);
     cpu_tb_exec(cpu, tb);
+
+    tb_lock();
     tb_phys_invalidate(tb, -1);
     tb_free(tb);
+    tb_unlock();
 }
 #endif
 
diff --git a/cpus-common.c b/cpus-common.c
index 3e114529c9..59f751ecf9 100644
--- a/cpus-common.c
+++ b/cpus-common.c
@@ -109,7 +109,7 @@ void cpu_list_remove(CPUState *cpu)
 struct qemu_work_item {
     struct qemu_work_item *next;
     run_on_cpu_func func;
-    void *data;
+    run_on_cpu_data data;
     bool free, exclusive, done;
 };
 
@@ -129,7 +129,7 @@ static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi)
     qemu_cpu_kick(cpu);
 }
 
-void do_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data,
+void do_run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data,
                    QemuMutex *mutex)
 {
     struct qemu_work_item wi;
@@ -154,7 +154,7 @@ void do_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data,
     }
 }
 
-void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
+void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
 {
     struct qemu_work_item *wi;
 
@@ -296,7 +296,8 @@ void cpu_exec_end(CPUState *cpu)
     }
 }
 
-void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
+void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func,
+                           run_on_cpu_data data)
 {
     struct qemu_work_item *wi;
 
diff --git a/cpus.c b/cpus.c
index cfd5cdc0e3..5213351c6d 100644
--- a/cpus.c
+++ b/cpus.c
@@ -69,7 +69,6 @@
 
 #endif /* CONFIG_LINUX */
 
-static CPUState *next_cpu;
 int64_t max_delay;
 int64_t max_advance;
 
@@ -557,7 +556,7 @@ static const VMStateDescription vmstate_timers = {
     }
 };
 
-static void cpu_throttle_thread(CPUState *cpu, void *opaque)
+static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
 {
     double pct;
     double throttle_ratio;
@@ -588,7 +587,8 @@ static void cpu_throttle_timer_tick(void *opaque)
     }
     CPU_FOREACH(cpu) {
         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
-            async_run_on_cpu(cpu, cpu_throttle_thread, NULL);
+            async_run_on_cpu(cpu, cpu_throttle_thread,
+                             RUN_ON_CPU_NULL);
         }
     }
 
@@ -915,7 +915,7 @@ void qemu_init_cpu_loop(void)
     qemu_thread_get_self(&io_thread);
 }
 
-void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
+void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
 {
     do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
 }
@@ -1055,12 +1055,102 @@ static void *qemu_dummy_cpu_thread_fn(void *arg)
 #endif
 }
 
-static void tcg_exec_all(void);
+static int64_t tcg_get_icount_limit(void)
+{
+    int64_t deadline;
+
+    if (replay_mode != REPLAY_MODE_PLAY) {
+        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
+
+        /* Maintain prior (possibly buggy) behaviour where if no deadline
+         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
+         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
+         * nanoseconds.
+         */
+        if ((deadline < 0) || (deadline > INT32_MAX)) {
+            deadline = INT32_MAX;
+        }
+
+        return qemu_icount_round(deadline);
+    } else {
+        return replay_get_instructions();
+    }
+}
+
+static void handle_icount_deadline(void)
+{
+    if (use_icount) {
+        int64_t deadline =
+            qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
+
+        if (deadline == 0) {
+            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
+        }
+    }
+}
+
+static int tcg_cpu_exec(CPUState *cpu)
+{
+    int ret;
+#ifdef CONFIG_PROFILER
+    int64_t ti;
+#endif
+
+#ifdef CONFIG_PROFILER
+    ti = profile_getclock();
+#endif
+    if (use_icount) {
+        int64_t count;
+        int decr;
+        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
+                                    + cpu->icount_extra);
+        cpu->icount_decr.u16.low = 0;
+        cpu->icount_extra = 0;
+        count = tcg_get_icount_limit();
+        timers_state.qemu_icount += count;
+        decr = (count > 0xffff) ? 0xffff : count;
+        count -= decr;
+        cpu->icount_decr.u16.low = decr;
+        cpu->icount_extra = count;
+    }
+    cpu_exec_start(cpu);
+    ret = cpu_exec(cpu);
+    cpu_exec_end(cpu);
+#ifdef CONFIG_PROFILER
+    tcg_time += profile_getclock() - ti;
+#endif
+    if (use_icount) {
+        /* Fold pending instructions back into the
+           instruction counter, and clear the interrupt flag.  */
+        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
+                        + cpu->icount_extra);
+        cpu->icount_decr.u32 = 0;
+        cpu->icount_extra = 0;
+        replay_account_executed_instructions();
+    }
+    return ret;
+}
+
+/* Destroy any remaining vCPUs which have been unplugged and have
+ * finished running
+ */
+static void deal_with_unplugged_cpus(void)
+{
+    CPUState *cpu;
+
+    CPU_FOREACH(cpu) {
+        if (cpu->unplug && !cpu_can_run(cpu)) {
+            qemu_tcg_destroy_vcpu(cpu);
+            cpu->created = false;
+            qemu_cond_signal(&qemu_cpu_cond);
+            break;
+        }
+    }
+}
 
 static void *qemu_tcg_cpu_thread_fn(void *arg)
 {
     CPUState *cpu = arg;
-    CPUState *remove_cpu = NULL;
 
     rcu_register_thread();
 
@@ -1087,29 +1177,44 @@ static void *qemu_tcg_cpu_thread_fn(void *arg)
     /* process any pending work */
     atomic_mb_set(&exit_request, 1);
 
-    while (1) {
-        tcg_exec_all();
+    cpu = first_cpu;
 
-        if (use_icount) {
-            int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
+    while (1) {
+        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
+        qemu_account_warp_timer();
 
-            if (deadline == 0) {
-                qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
-            }
+        if (!cpu) {
+            cpu = first_cpu;
         }
-        qemu_tcg_wait_io_event(QTAILQ_FIRST(&cpus));
-        CPU_FOREACH(cpu) {
-            if (cpu->unplug && !cpu_can_run(cpu)) {
-                remove_cpu = cpu;
+
+        for (; cpu != NULL && !exit_request; cpu = CPU_NEXT(cpu)) {
+
+            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
+                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
+
+            if (cpu_can_run(cpu)) {
+                int r;
+                r = tcg_cpu_exec(cpu);
+                if (r == EXCP_DEBUG) {
+                    cpu_handle_guest_debug(cpu);
+                    break;
+                }
+            } else if (cpu->stop || cpu->stopped) {
+                if (cpu->unplug) {
+                    cpu = CPU_NEXT(cpu);
+                }
                 break;
             }
-        }
-        if (remove_cpu) {
-            qemu_tcg_destroy_vcpu(remove_cpu);
-            cpu->created = false;
-            qemu_cond_signal(&qemu_cpu_cond);
-            remove_cpu = NULL;
-        }
+
+        } /* for cpu.. */
+
+        /* Pairs with smp_wmb in qemu_cpu_kick.  */
+        atomic_mb_set(&exit_request, 0);
+
+        handle_icount_deadline();
+
+        qemu_tcg_wait_io_event(QTAILQ_FIRST(&cpus));
+        deal_with_unplugged_cpus();
     }
 
     return NULL;
@@ -1207,17 +1312,17 @@ void qemu_mutex_unlock_iothread(void)
     qemu_mutex_unlock(&qemu_global_mutex);
 }
 
-static int all_vcpus_paused(void)
+static bool all_vcpus_paused(void)
 {
     CPUState *cpu;
 
     CPU_FOREACH(cpu) {
         if (!cpu->stopped) {
-            return 0;
+            return false;
         }
     }
 
-    return 1;
+    return true;
 }
 
 void pause_all_vcpus(void)
@@ -1412,106 +1517,6 @@ int vm_stop_force_state(RunState state)
     }
 }
 
-static int64_t tcg_get_icount_limit(void)
-{
-    int64_t deadline;
-
-    if (replay_mode != REPLAY_MODE_PLAY) {
-        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
-
-        /* Maintain prior (possibly buggy) behaviour where if no deadline
-         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
-         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
-         * nanoseconds.
-         */
-        if ((deadline < 0) || (deadline > INT32_MAX)) {
-            deadline = INT32_MAX;
-        }
-
-        return qemu_icount_round(deadline);
-    } else {
-        return replay_get_instructions();
-    }
-}
-
-static int tcg_cpu_exec(CPUState *cpu)
-{
-    int ret;
-#ifdef CONFIG_PROFILER
-    int64_t ti;
-#endif
-
-#ifdef CONFIG_PROFILER
-    ti = profile_getclock();
-#endif
-    if (use_icount) {
-        int64_t count;
-        int decr;
-        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
-                                    + cpu->icount_extra);
-        cpu->icount_decr.u16.low = 0;
-        cpu->icount_extra = 0;
-        count = tcg_get_icount_limit();
-        timers_state.qemu_icount += count;
-        decr = (count > 0xffff) ? 0xffff : count;
-        count -= decr;
-        cpu->icount_decr.u16.low = decr;
-        cpu->icount_extra = count;
-    }
-    cpu_exec_start(cpu);
-    ret = cpu_exec(cpu);
-    cpu_exec_end(cpu);
-#ifdef CONFIG_PROFILER
-    tcg_time += profile_getclock() - ti;
-#endif
-    if (use_icount) {
-        /* Fold pending instructions back into the
-           instruction counter, and clear the interrupt flag.  */
-        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
-                        + cpu->icount_extra);
-        cpu->icount_decr.u32 = 0;
-        cpu->icount_extra = 0;
-        replay_account_executed_instructions();
-    }
-    return ret;
-}
-
-static void tcg_exec_all(void)
-{
-    int r;
-
-    /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
-    qemu_account_warp_timer();
-
-    if (next_cpu == NULL) {
-        next_cpu = first_cpu;
-    }
-    for (; next_cpu != NULL && !exit_request; next_cpu = CPU_NEXT(next_cpu)) {
-        CPUState *cpu = next_cpu;
-
-        qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
-                          (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
-
-        if (cpu_can_run(cpu)) {
-            r = tcg_cpu_exec(cpu);
-            if (r == EXCP_DEBUG) {
-                cpu_handle_guest_debug(cpu);
-                break;
-            } else if (r == EXCP_ATOMIC) {
-                cpu_exec_step_atomic(cpu);
-            }
-        } else if (cpu->stop || cpu->stopped) {
-            if (cpu->unplug) {
-                next_cpu = CPU_NEXT(cpu);
-            }
-            break;
-        }
-    }
-
-    /* Pairs with smp_wmb in qemu_cpu_kick.  */
-    atomic_mb_set(&exit_request, 0);
-}
-
 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
 {
     /* XXX: implement xxx_cpu_list for targets that still miss it */
diff --git a/cputlb.c b/cputlb.c
index cc4da4d7eb..813279f3bc 100644
--- a/cputlb.c
+++ b/cputlb.c
@@ -26,7 +26,6 @@
 #include "exec/cputlb.h"
 #include "exec/memory-internal.h"
 #include "exec/ram_addr.h"
-#include "exec/exec-all.h"
 #include "tcg/tcg.h"
 #include "qemu/error-report.h"
 #include "exec/log.h"
diff --git a/docs/COLO-FT.txt b/docs/COLO-FT.txt
new file mode 100644
index 0000000000..628293824c
--- /dev/null
+++ b/docs/COLO-FT.txt
@@ -0,0 +1,189 @@
+COarse-grained LOck-stepping Virtual Machines for Non-stop Service
+----------------------------------------
+Copyright (c) 2016 Intel Corporation
+Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+Copyright (c) 2016 Fujitsu, Corp.
+
+This work is licensed under the terms of the GNU GPL, version 2 or later.
+See the COPYING file in the top-level directory.
+
+This document gives an overview of COLO's design and how to use it.
+
+== Background ==
+Virtual machine (VM) replication is a well known technique for providing
+application-agnostic software-implemented hardware fault tolerance,
+also known as "non-stop service".
+
+COLO (COarse-grained LOck-stepping) is a high availability solution.
+Both primary VM (PVM) and secondary VM (SVM) run in parallel. They receive the
+same request from client, and generate response in parallel too.
+If the response packets from PVM and SVM are identical, they are released
+immediately. Otherwise, a VM checkpoint (on demand) is conducted.
+
+== Architecture ==
+
+The architecture of COLO is shown in the diagram below.
+It consists of a pair of networked physical nodes:
+The primary node running the PVM, and the secondary node running the SVM
+to maintain a valid replica of the PVM.
+PVM and SVM execute in parallel and generate output of response packets for
+client requests according to the application semantics.
+
+The incoming packets from the client or external network are received by the
+primary node, and then forwarded to the secondary node, so that both the PVM
+and the SVM are stimulated with the same requests.
+
+COLO receives the outbound packets from both the PVM and SVM and compares them
+before allowing the output to be sent to clients.
+
+The SVM is qualified as a valid replica of the PVM, as long as it generates
+identical responses to all client requests. Once the differences in the outputs
+are detected between the PVM and SVM, COLO withholds transmission of the
+outbound packets until it has successfully synchronized the PVM state to the SVM.
+
+   Primary Node                                                            Secondary Node
+ +------------+  +-----------------------+       +------------------------+  +------------+
+ |            |  |       HeartBeat       |<----->|       HeartBeat        |  |            |
+ | Primary VM |  +-----------|-----------+       +-----------|------------+  |Secondary VM|
+ |            |              |                               |               |            |
+ |            |  +-----------|-----------+       +-----------|------------+  |            |
+ |            |  |QEMU   +---v----+      |       |QEMU  +----v---+        |  |            |
+ |            |  |       |Failover|      |       |      |Failover|        |  |            |
+ |            |  |       +--------+      |       |      +--------+        |  |            |
+ |            |  |   +---------------+   |       |   +---------------+    |  |            |
+ |            |  |   | VM Checkpoint |-------------->| VM Checkpoint |    |  |            |
+ |            |  |   +---------------+   |       |   +---------------+    |  |            |
+ |            |  |                       |       |                        |  |            |
+ |Requests<---------------------------^------------------------------------------>Requests|
+ |Responses----------------------\ /--|--------------\  /------------------------Responses|
+ |            |  |               | |  |  |       |   |  |                 |  |            |
+ |            |  | +-----------+ | |  |  |       |   |  |  +------------+ |  |            |
+ |            |  | | COLO disk | | |  |  |       |   |  |  | COLO disk  | |  |            |
+ |            |  | |   Manager |-|-|--|--------------|--|->| Manager    | |  |            |
+ |            |  | +|----------+ | |  |  |       |   |  |  +-----------|+ |  |            |
+ |            |  |  |            | |  |  |       |   |  |              |  |  |            |
+ +------------+  +--|------------|-|--|--+       +---|--|--------------|--+  +------------+
+                    |            | |  |              |  |              |
+ +-------------+    | +----------v-v--|--+       +---|--v-----------+  |    +-------------+
+ |  VM Monitor |    | |  COLO Proxy      |       |    COLO Proxy    |  |    | VM Monitor  |
+ |             |    | |(compare packet)  |       | (adjust sequence)|  |    |             |
+ +-------------+    | +----------|----^--+       +------------------+  |    +-------------+
+                    |            |    |                                |
+ +------------------|------------|----|--+       +---------------------|------------------+
+ |   Kernel         |            |    |  |       |   Kernel            |                  |
+ +------------------|------------|----|--+       +---------------------|------------------+
+                    |            |    |                                |
+     +--------------v+  +--------v----|--+       +------------------+ +v-------------+
+     |   Storage     |  |External Network|       | External Network | |   Storage    |
+     +---------------+  +----------------+       +------------------+ +--------------+
+
+== Components introduction ==
+
+You can see there are several components in COLO's diagram of architecture.
+Their functions are described below.
+
+HeartBeat:
+Runs on both the primary and secondary nodes, to periodically check platform
+availability. When the primary node suffers a hardware fail-stop failure,
+the heartbeat stops responding, the secondary node will trigger a failover
+as soon as it determines the absence.
+
+COLO disk Manager:
+When primary VM writes data into image, the colo disk manger captures this data
+and sends it to secondary VM's which makes sure the context of secondary VM's
+image is consistent with the context of primary VM 's image.
+For more details, please refer to docs/block-replication.txt.
+
+Checkpoint/Failover Controller:
+Modifications of save/restore flow to realize continuous migration,
+to make sure the state of VM in Secondary side is always consistent with VM in
+Primary side.
+
+COLO Proxy:
+Delivers packets to Primary and Seconday, and then compare the responses from
+both side. Then decide whether to start a checkpoint according to some rules.
+Please refer to docs/colo-proxy.txt for more informations.
+
+Note:
+HeartBeat has not been implemented yet, so you need to trigger failover process
+by using 'x-colo-lost-heartbeat' command.
+
+== Test procedure ==
+1. Startup qemu
+Primary:
+# qemu-kvm -enable-kvm -m 2048 -smp 2 -qmp stdio -vnc :7 -name primary \
+  -device piix3-usb-uhci \
+  -device usb-tablet -netdev tap,id=hn0,vhost=off \
+  -device virtio-net-pci,id=net-pci0,netdev=hn0 \
+  -drive if=virtio,id=primary-disk0,driver=quorum,read-pattern=fifo,vote-threshold=1,\
+         children.0.file.filename=1.raw,\
+         children.0.driver=raw -S
+Secondary:
+# qemu-kvm -enable-kvm -m 2048 -smp 2 -qmp stdio -vnc :7 -name secondary \
+  -device piix3-usb-uhci \
+  -device usb-tablet -netdev tap,id=hn0,vhost=off \
+  -device virtio-net-pci,id=net-pci0,netdev=hn0 \
+  -drive if=none,id=secondary-disk0,file.filename=1.raw,driver=raw,node-name=node0 \
+  -drive if=virtio,id=active-disk0,driver=replication,mode=secondary,\
+         file.driver=qcow2,top-id=active-disk0,\
+         file.file.filename=/mnt/ramfs/active_disk.img,\
+         file.backing.driver=qcow2,\
+         file.backing.file.filename=/mnt/ramfs/hidden_disk.img,\
+         file.backing.backing=secondary-disk0 \
+  -incoming tcp:0:8888
+
+2. On Secondary VM's QEMU monitor, issue command
+{'execute':'qmp_capabilities'}
+{ 'execute': 'nbd-server-start',
+  'arguments': {'addr': {'type': 'inet', 'data': {'host': 'xx.xx.xx.xx', 'port': '8889'} } }
+}
+{'execute': 'nbd-server-add', 'arguments': {'device': 'secondeary-disk0', 'writable': true } }
+
+Note:
+  a. The qmp command nbd-server-start and nbd-server-add must be run
+     before running the qmp command migrate on primary QEMU
+  b. Active disk, hidden disk and nbd target's length should be the
+     same.
+  c. It is better to put active disk and hidden disk in ramdisk.
+
+3. On Primary VM's QEMU monitor, issue command:
+{'execute':'qmp_capabilities'}
+{ 'execute': 'human-monitor-command',
+  'arguments': {'command-line': 'drive_add -n buddy driver=replication,mode=primary,file.driver=nbd,file.host=xx.xx.xx.xx,file.port=8889,file.export=secondary-disk0,node-name=nbd_client0'}}
+{ 'execute':'x-blockdev-change', 'arguments':{'parent': 'primary-disk0', 'node': 'nbd_client0' } }
+{ 'execute': 'migrate-set-capabilities',
+      'arguments': {'capabilities': [ {'capability': 'x-colo', 'state': true } ] } }
+{ 'execute': 'migrate', 'arguments': {'uri': 'tcp:xx.xx.xx.xx:8888' } }
+
+  Note:
+  a. There should be only one NBD Client for each primary disk.
+  b. xx.xx.xx.xx is the secondary physical machine's hostname or IP
+  c. The qmp command line must be run after running qmp command line in
+     secondary qemu.
+
+4. After the above steps, you will see, whenever you make changes to PVM, SVM will be synced.
+You can issue command '{ "execute": "migrate-set-parameters" , "arguments":{ "x-checkpoint-delay": 2000 } }'
+to change the checkpoint period time
+
+5. Failover test
+You can kill Primary VM and run 'x_colo_lost_heartbeat' in Secondary VM's
+monitor at the same time, then SVM will failover and client will not detect this
+change.
+
+Before issuing '{ "execute": "x-colo-lost-heartbeat" }' command, we have to
+issue block related command to stop block replication.
+Primary:
+  Remove the nbd child from the quorum:
+  { 'execute': 'x-blockdev-change', 'arguments': {'parent': 'colo-disk0', 'child': 'children.1'}}
+  { 'execute': 'human-monitor-command','arguments': {'command-line': 'drive_del blk-buddy0'}}
+  Note: there is no qmp command to remove the blockdev now
+
+Secondary:
+  The primary host is down, so we should do the following thing:
+  { 'execute': 'nbd-server-stop' }
+
+== TODO ==
+1. Support continuous VM replication.
+2. Support shared storage.
+3. Develop the heartbeat part.
+4. Reduce checkpoint VM’s downtime while doing checkpoint.
diff --git a/docs/live-block-ops.txt b/docs/live-block-ops.txt
index a257087401..2211d14428 100644
--- a/docs/live-block-ops.txt
+++ b/docs/live-block-ops.txt
@@ -4,15 +4,20 @@ LIVE BLOCK OPERATIONS
 High level description of live block operations. Note these are not
 supported for use with the raw format at the moment.
 
+Note also that this document is incomplete and it currently only
+covers the 'stream' operation. Other operations supported by QEMU such
+as 'commit', 'mirror' and 'backup' are not described here yet. Please
+refer to the qapi/block-core.json file for an overview of those.
+
 Snapshot live merge
 ===================
 
 Given a snapshot chain, described in this document in the following
 format:
 
-[A] -> [B] -> [C] -> [D]
+[A] <- [B] <- [C] <- [D] <- [E]
 
-Where the rightmost object ([D] in the example) described is the current
+Where the rightmost object ([E] in the example) described is the current
 image which the guest OS has write access to. To the left of it is its base
 image, and so on accordingly until the leftmost image, which has no
 base.
@@ -21,11 +26,14 @@ The snapshot live merge operation transforms such a chain into a
 smaller one with fewer elements, such as this transformation relative
 to the first example:
 
-[A] -> [D]
+[A] <- [E]
+
+Data is copied in the right direction with destination being the
+rightmost image, but any other intermediate image can be specified
+instead. In this example data is copied from [C] into [D], so [D] can
+be backed by [B]:
 
-Currently only forward merge with target being the active image is
-supported, that is, data copy is performed in the right direction with
-destination being the rightmost image.
+[A] <- [B] <- [D] <- [E]
 
 The operation is implemented in QEMU through image streaming facilities.
 
@@ -35,14 +43,20 @@ streaming operation completes it raises a QMP event. 'block_stream'
 copies data from the backing file(s) into the active image. When finished,
 it adjusts the backing file pointer.
 
-The 'base' parameter specifies an image which data need not be streamed from.
-This image will be used as the backing file for the active image when the
-operation is finished.
+The 'base' parameter specifies an image which data need not be
+streamed from. This image will be used as the backing file for the
+destination image when the operation is finished.
+
+In the first example above, the command would be:
+
+(qemu) block_stream virtio0 file-A.img
 
-In the example above, the command would be:
+In order to specify a destination image different from the active
+(rightmost) one we can use its node name instead.
 
-(qemu) block_stream virtio0 A
+In the second example above, the command would be:
 
+(qemu) block_stream node-D file-B.img
 
 Live block copy
 ===============
diff --git a/docs/multiple-iothreads.txt b/docs/multiple-iothreads.txt
index 40b8419916..0e7cdb2c28 100644
--- a/docs/multiple-iothreads.txt
+++ b/docs/multiple-iothreads.txt
@@ -105,13 +105,10 @@ a BH in the target AioContext beforehand and then call qemu_bh_schedule().  No
 acquire/release or locking is needed for the qemu_bh_schedule() call.  But be
 sure to acquire the AioContext for aio_bh_new() if necessary.
 
-The relationship between AioContext and the block layer
--------------------------------------------------------
-The AioContext originates from the QEMU block layer because it provides a
-scoped way of running event loop iterations until all work is done.  This
-feature is used to complete all in-flight block I/O requests (see
-bdrv_drain_all()).  Nowadays AioContext is a generic event loop that can be
-used by any QEMU subsystem.
+AioContext and the block layer
+------------------------------
+The AioContext originates from the QEMU block layer, even though nowadays
+AioContext is a generic event loop that can be used by any QEMU subsystem.
 
 The block layer has support for AioContext integrated.  Each BlockDriverState
 is associated with an AioContext using bdrv_set_aio_context() and
@@ -122,13 +119,22 @@ Block layer code must therefore expect to run in an IOThread and avoid using
 old APIs that implicitly use the main loop.  See the "How to program for
 IOThreads" above for information on how to do that.
 
-If main loop code such as a QMP function wishes to access a BlockDriverState it
-must first call aio_context_acquire(bdrv_get_aio_context(bs)) to ensure the
-IOThread does not run in parallel.
-
-Long-running jobs (usually in the form of coroutines) are best scheduled in the
-BlockDriverState's AioContext to avoid the need to acquire/release around each
-bdrv_*() call.  Be aware that there is currently no mechanism to get notified
-when bdrv_set_aio_context() moves this BlockDriverState to a different
-AioContext (see bdrv_detach_aio_context()/bdrv_attach_aio_context()), so you
-may need to add this if you want to support long-running jobs.
+If main loop code such as a QMP function wishes to access a BlockDriverState
+it must first call aio_context_acquire(bdrv_get_aio_context(bs)) to ensure
+that callbacks in the IOThread do not run in parallel.
+
+Code running in the monitor typically needs to ensure that past
+requests from the guest are completed.  When a block device is running
+in an IOThread, the IOThread can also process requests from the guest
+(via ioeventfd).  To achieve both objects, wrap the code between
+bdrv_drained_begin() and bdrv_drained_end(), thus creating a "drained
+section".  The functions must be called between aio_context_acquire()
+and aio_context_release().  You can freely release and re-acquire the
+AioContext within a drained section.
+
+Long-running jobs (usually in the form of coroutines) are best scheduled in
+the BlockDriverState's AioContext to avoid the need to acquire/release around
+each bdrv_*() call.  The functions bdrv_add/remove_aio_context_notifier,
+or alternatively blk_add/remove_aio_context_notifier if you use BlockBackends,
+can be used to get a notification whenever bdrv_set_aio_context() moves a
+BlockDriverState to a different AioContext.
diff --git a/docs/qmp-commands.txt b/docs/qmp-commands.txt
index 284576d795..6afa87298d 100644
--- a/docs/qmp-commands.txt
+++ b/docs/qmp-commands.txt
@@ -554,6 +554,16 @@ Example:
 -> { "execute": "migrate_set_downtime", "arguments": { "value": 0.1 } }
 <- { "return": {} }
 
+x-colo-lost-heartbeat
+--------------------
+
+Tell COLO that heartbeat is lost, a failover or takeover is needed.
+
+Example:
+
+-> { "execute": "x-colo-lost-heartbeat" }
+<- { "return": {} }
+
 client_migrate_info
 -------------------
 
@@ -740,8 +750,11 @@ Arguments:
 - "job-id": Identifier for the newly-created block job. If omitted,
             the device name will be used. (json-string, optional)
 - "device": The device name or node-name of a root node (json-string)
-- "base": The file name of the backing image above which copying starts
-          (json-string, optional)
+- "base": The file name of the backing image above which copying starts.
+          It cannot be set if 'base-node' is also set (json-string, optional)
+- "base-node": the node name of the backing image above which copying starts.
+               It cannot be set if 'base' is also set.
+               (json-string, optional) (Since 2.8)
 - "backing-file": The backing file string to write into the active layer. This
                   filename is not validated.
 
@@ -2861,6 +2874,7 @@ Enable/Disable migration capabilities
 - "compress": use multiple compression threads to accelerate live migration
 - "events": generate events for each migration state change
 - "postcopy-ram": postcopy mode for live migration
+- "x-colo": COarse-Grain LOck Stepping (COLO) for Non-stop Service
 
 Arguments:
 
@@ -2882,6 +2896,7 @@ Query current migration capabilities
          - "compress": Multiple compression threads state (json-bool)
          - "events": Migration state change event state (json-bool)
          - "postcopy-ram": postcopy ram state (json-bool)
+         - "x-colo": COarse-Grain LOck Stepping for Non-stop Service (json-bool)
 
 Arguments:
 
@@ -2895,7 +2910,8 @@ Example:
      {"state": false, "capability": "zero-blocks"},
      {"state": false, "capability": "compress"},
      {"state": true, "capability": "events"},
-     {"state": false, "capability": "postcopy-ram"}
+     {"state": false, "capability": "postcopy-ram"},
+     {"state": false, "capability": "x-colo"}
    ]}
 
 migrate-set-parameters
@@ -2913,6 +2929,8 @@ Set migration parameters
 - "max-bandwidth": set maximum speed for migrations (in bytes/sec) (json-int)
 - "downtime-limit": set maximum tolerated downtime (in milliseconds) for
                     migrations (json-int)
+- "x-checkpoint-delay": set the delay time for periodic checkpoint (json-int)
+
 Arguments:
 
 Example:
diff --git a/exec.c b/exec.c
index 4c84389b56..b1094c0cd2 100644
--- a/exec.c
+++ b/exec.c
@@ -687,7 +687,11 @@ void cpu_exec_realizefn(CPUState *cpu, Error **errp)
 #if defined(CONFIG_USER_ONLY)
 static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
 {
+    mmap_lock();
+    tb_lock();
     tb_invalidate_phys_page_range(pc, pc + 1, 0);
+    tb_unlock();
+    mmap_unlock();
 }
 #else
 static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
@@ -696,6 +700,7 @@ static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
     hwaddr phys = cpu_get_phys_page_attrs_debug(cpu, pc, &attrs);
     int asidx = cpu_asidx_from_attrs(cpu, attrs);
     if (phys != -1) {
+        /* Locks grabbed by tb_invalidate_phys_addr */
         tb_invalidate_phys_addr(cpu->cpu_ases[asidx].as,
                                 phys | (pc & ~TARGET_PAGE_MASK));
     }
@@ -906,11 +911,13 @@ void cpu_abort(CPUState *cpu, const char *fmt, ...)
     fprintf(stderr, "\n");
     cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU | CPU_DUMP_CCOP);
     if (qemu_log_separate()) {
+        qemu_log_lock();
         qemu_log("qemu: fatal: ");
         qemu_log_vprintf(fmt, ap2);
         qemu_log("\n");
         log_cpu_state(cpu, CPU_DUMP_FPU | CPU_DUMP_CCOP);
         qemu_log_flush();
+        qemu_log_unlock();
         qemu_log_close();
     }
     va_end(ap2);
@@ -1988,7 +1995,11 @@ ram_addr_t qemu_ram_addr_from_host(void *ptr)
 static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
                                uint64_t val, unsigned size)
 {
+    bool locked = false;
+
     if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
+        locked = true;
+        tb_lock();
         tb_invalidate_phys_page_fast(ram_addr, size);
     }
     switch (size) {
@@ -2004,6 +2015,11 @@ static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
     default:
         abort();
     }
+
+    if (locked) {
+        tb_unlock();
+    }
+
     /* Set both VGA and migration bits for simplicity and to remove
      * the notdirty callback faster.
      */
@@ -2064,6 +2080,12 @@ static void check_watchpoint(int offset, int len, MemTxAttrs attrs, int flags)
                     continue;
                 }
                 cpu->watchpoint_hit = wp;
+
+                /* The tb_lock will be reset when cpu_loop_exit or
+                 * cpu_loop_exit_noexc longjmp back into the cpu_exec
+                 * main loop.
+                 */
+                tb_lock();
                 tb_check_watchpoint(cpu);
                 if (wp->flags & BP_STOP_BEFORE_ACCESS) {
                     cpu->exception_index = EXCP_DEBUG;
@@ -2471,7 +2493,9 @@ static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr,
             cpu_physical_memory_range_includes_clean(addr, length, dirty_log_mask);
     }
     if (dirty_log_mask & (1 << DIRTY_MEMORY_CODE)) {
+        tb_lock();
         tb_invalidate_phys_range(addr, addr + length);
+        tb_unlock();
         dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
     }
     cpu_physical_memory_set_dirty_range(addr, length, dirty_log_mask);
diff --git a/gdbstub.c b/gdbstub.c
index b2e1b79ca3..de62d26096 100644
--- a/gdbstub.c
+++ b/gdbstub.c
@@ -31,7 +31,6 @@
 
 #define MAX_PACKET_LENGTH 4096
 
-#include "cpu.h"
 #include "qemu/sockets.h"
 #include "sysemu/kvm.h"
 #include "exec/semihost.h"
diff --git a/hmp-commands.hx b/hmp-commands.hx
index 06bef470b9..88192817b2 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1040,6 +1040,21 @@ migration (or once already in postcopy).
 ETEXI
 
     {
+        .name       = "x_colo_lost_heartbeat",
+        .args_type  = "",
+        .params     = "",
+        .help       = "Tell COLO that heartbeat is lost,\n\t\t\t"
+                      "a failover or takeover is needed.",
+        .cmd = hmp_x_colo_lost_heartbeat,
+    },
+
+STEXI
+@item x_colo_lost_heartbeat
+@findex x_colo_lost_heartbeat
+Tell COLO that heartbeat is lost, a failover or takeover is needed.
+ETEXI
+
+    {
         .name       = "client_migrate_info",
         .args_type  = "protocol:s,hostname:s,port:i?,tls-port:i?,cert-subject:s?",
         .params     = "protocol hostname port tls-port cert-subject",
diff --git a/hmp.c b/hmp.c
index 3d602594a2..b5e3f54534 100644
--- a/hmp.c
+++ b/hmp.c
@@ -318,6 +318,9 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict)
         monitor_printf(mon, " %s: %" PRId64 " milliseconds",
             MigrationParameter_lookup[MIGRATION_PARAMETER_DOWNTIME_LIMIT],
             params->downtime_limit);
+        monitor_printf(mon, " %s: %" PRId64,
+            MigrationParameter_lookup[MIGRATION_PARAMETER_X_CHECKPOINT_DELAY],
+            params->x_checkpoint_delay);
         monitor_printf(mon, "\n");
     }
 
@@ -1386,6 +1389,10 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
                 p.has_downtime_limit = true;
                 use_int_value = true;
                 break;
+            case MIGRATION_PARAMETER_X_CHECKPOINT_DELAY:
+                p.has_x_checkpoint_delay = true;
+                use_int_value = true;
+                break;
             }
 
             if (use_int_value) {
@@ -1402,6 +1409,7 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
                 p.cpu_throttle_initial = valueint;
                 p.cpu_throttle_increment = valueint;
                 p.downtime_limit = valueint;
+                p.x_checkpoint_delay = valueint;
             }
 
             qmp_migrate_set_parameters(&p, &err);
@@ -1443,6 +1451,14 @@ void hmp_migrate_start_postcopy(Monitor *mon, const QDict *qdict)
     hmp_handle_error(mon, &err);
 }
 
+void hmp_x_colo_lost_heartbeat(Monitor *mon, const QDict *qdict)
+{
+    Error *err = NULL;
+
+    qmp_x_colo_lost_heartbeat(&err);
+    hmp_handle_error(mon, &err);
+}
+
 void hmp_set_password(Monitor *mon, const QDict *qdict)
 {
     const char *protocol  = qdict_get_str(qdict, "protocol");
@@ -1555,7 +1571,7 @@ void hmp_block_stream(Monitor *mon, const QDict *qdict)
     int64_t speed = qdict_get_try_int(qdict, "speed", 0);
 
     qmp_block_stream(false, NULL, device, base != NULL, base, false, NULL,
-                     qdict_haskey(qdict, "speed"), speed,
+                     false, NULL, qdict_haskey(qdict, "speed"), speed,
                      true, BLOCKDEV_ON_ERROR_REPORT, &error);
 
     hmp_handle_error(mon, &error);
diff --git a/hmp.h b/hmp.h
index 184769c13f..05daf7cd5c 100644
--- a/hmp.h
+++ b/hmp.h
@@ -72,6 +72,7 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict);
 void hmp_migrate_set_cache_size(Monitor *mon, const QDict *qdict);
 void hmp_client_migrate_info(Monitor *mon, const QDict *qdict);
 void hmp_migrate_start_postcopy(Monitor *mon, const QDict *qdict);
+void hmp_x_colo_lost_heartbeat(Monitor *mon, const QDict *qdict);
 void hmp_set_password(Monitor *mon, const QDict *qdict);
 void hmp_expire_password(Monitor *mon, const QDict *qdict);
 void hmp_eject(Monitor *mon, const QDict *qdict);
diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
index e88cf257a2..aea7e9d392 100644
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -325,7 +325,7 @@ static int coroutine_fn v9fs_xattr_fid_clunk(V9fsPDU *pdu, V9fsFidState *fidp)
 {
     int retval = 0;
 
-    if (fidp->fs.xattr.copied_len == -1) {
+    if (fidp->fs.xattr.xattrwalk_fid) {
         /* getxattr/listxattr fid */
         goto free_value;
     }
@@ -535,7 +535,7 @@ static int coroutine_fn v9fs_mark_fids_unreclaim(V9fsPDU *pdu, V9fsPath *path)
 static void coroutine_fn virtfs_reset(V9fsPDU *pdu)
 {
     V9fsState *s = pdu->s;
-    V9fsFidState *fidp = NULL;
+    V9fsFidState *fidp;
 
     /* Free all fids */
     while (s->fid_list) {
@@ -548,11 +548,6 @@ static void coroutine_fn virtfs_reset(V9fsPDU *pdu)
             free_fid(pdu, fidp);
         }
     }
-    if (fidp) {
-        /* One or more unclunked fids found... */
-        error_report("9pfs:%s: One or more uncluncked fids "
-                     "found during reset", __func__);
-    }
 }
 
 #define P9_QID_TYPE_DIR         0x80
@@ -1361,7 +1356,10 @@ static void coroutine_fn v9fs_walk(void *opaque)
         memcpy(&qids[name_idx], &qid, sizeof(qid));
     }
     if (fid == newfid) {
-        BUG_ON(fidp->fid_type != P9_FID_NONE);
+        if (fidp->fid_type != P9_FID_NONE) {
+            err = -EINVAL;
+            goto out;
+        }
         v9fs_path_copy(&fidp->path, &path);
     } else {
         newfidp = alloc_fid(s, newfid);
@@ -1443,7 +1441,10 @@ static void coroutine_fn v9fs_open(void *opaque)
         err = -ENOENT;
         goto out_nofid;
     }
-    BUG_ON(fidp->fid_type != P9_FID_NONE);
+    if (fidp->fid_type != P9_FID_NONE) {
+        err = -EINVAL;
+        goto out;
+    }
 
     err = v9fs_co_lstat(pdu, &fidp->path, &stbuf);
     if (err < 0) {
@@ -1637,20 +1638,17 @@ static int v9fs_xattr_read(V9fsState *s, V9fsPDU *pdu, V9fsFidState *fidp,
 {
     ssize_t err;
     size_t offset = 7;
-    int read_count;
-    int64_t xattr_len;
+    uint64_t read_count;
     V9fsVirtioState *v = container_of(s, V9fsVirtioState, state);
     VirtQueueElement *elem = v->elems[pdu->idx];
 
-    xattr_len = fidp->fs.xattr.len;
-    read_count = xattr_len - off;
+    if (fidp->fs.xattr.len < off) {
+        read_count = 0;
+    } else {
+        read_count = fidp->fs.xattr.len - off;
+    }
     if (read_count > max_count) {
         read_count = max_count;
-    } else if (read_count < 0) {
-        /*
-         * read beyond XATTR value
-         */
-        read_count = 0;
     }
     err = pdu_marshal(pdu, offset, "d", read_count);
     if (err < 0) {
@@ -1979,23 +1977,18 @@ static int v9fs_xattr_write(V9fsState *s, V9fsPDU *pdu, V9fsFidState *fidp,
 {
     int i, to_copy;
     ssize_t err = 0;
-    int write_count;
-    int64_t xattr_len;
+    uint64_t write_count;
     size_t offset = 7;
 
 
-    xattr_len = fidp->fs.xattr.len;
-    write_count = xattr_len - off;
-    if (write_count > count) {
-        write_count = count;
-    } else if (write_count < 0) {
-        /*
-         * write beyond XATTR value len specified in
-         * xattrcreate
-         */
+    if (fidp->fs.xattr.len < off) {
         err = -ENOSPC;
         goto out;
     }
+    write_count = fidp->fs.xattr.len - off;
+    if (write_count > count) {
+        write_count = count;
+    }
     err = pdu_marshal(pdu, offset, "d", write_count);
     if (err < 0) {
         return err;
@@ -2548,7 +2541,10 @@ static int coroutine_fn v9fs_complete_rename(V9fsPDU *pdu, V9fsFidState *fidp,
             err = -ENOENT;
             goto out_nofid;
         }
-        BUG_ON(dirfidp->fid_type != P9_FID_NONE);
+        if (fidp->fid_type != P9_FID_NONE) {
+            err = -EINVAL;
+            goto out;
+        }
         v9fs_co_name_to_path(pdu, &dirfidp->path, name->data, &new_path);
     } else {
         old_name = fidp->path.data;
@@ -2620,7 +2616,10 @@ static void coroutine_fn v9fs_rename(void *opaque)
         err = -ENOENT;
         goto out_nofid;
     }
-    BUG_ON(fidp->fid_type != P9_FID_NONE);
+    if (fidp->fid_type != P9_FID_NONE) {
+        err = -EINVAL;
+        goto out;
+    }
     /* if fs driver is not path based, return EOPNOTSUPP */
     if (!(pdu->s->ctx.export_flags & V9FS_PATHNAME_FSCONTEXT)) {
         err = -EOPNOTSUPP;
@@ -3190,7 +3189,7 @@ static void coroutine_fn v9fs_xattrwalk(void *opaque)
          */
         xattr_fidp->fs.xattr.len = size;
         xattr_fidp->fid_type = P9_FID_XATTR;
-        xattr_fidp->fs.xattr.copied_len = -1;
+        xattr_fidp->fs.xattr.xattrwalk_fid = true;
         if (size) {
             xattr_fidp->fs.xattr.value = g_malloc(size);
             err = v9fs_co_llistxattr(pdu, &xattr_fidp->path,
@@ -3223,7 +3222,7 @@ static void coroutine_fn v9fs_xattrwalk(void *opaque)
          */
         xattr_fidp->fs.xattr.len = size;
         xattr_fidp->fid_type = P9_FID_XATTR;
-        xattr_fidp->fs.xattr.copied_len = -1;
+        xattr_fidp->fs.xattr.xattrwalk_fid = true;
         if (size) {
             xattr_fidp->fs.xattr.value = g_malloc(size);
             err = v9fs_co_lgetxattr(pdu, &xattr_fidp->path,
@@ -3255,7 +3254,7 @@ static void coroutine_fn v9fs_xattrcreate(void *opaque)
 {
     int flags;
     int32_t fid;
-    int64_t size;
+    uint64_t size;
     ssize_t err = 0;
     V9fsString name;
     size_t offset = 7;
@@ -3270,22 +3269,33 @@ static void coroutine_fn v9fs_xattrcreate(void *opaque)
     }
     trace_v9fs_xattrcreate(pdu->tag, pdu->id, fid, name.data, size, flags);
 
+    if (size > XATTR_SIZE_MAX) {
+        err = -E2BIG;
+        goto out_nofid;
+    }
+
     file_fidp = get_fid(pdu, fid);
     if (file_fidp == NULL) {
         err = -EINVAL;
         goto out_nofid;
     }
+    if (file_fidp->fid_type != P9_FID_NONE) {
+        err = -EINVAL;
+        goto out_put_fid;
+    }
+
     /* Make the file fid point to xattr */
     xattr_fidp = file_fidp;
     xattr_fidp->fid_type = P9_FID_XATTR;
     xattr_fidp->fs.xattr.copied_len = 0;
+    xattr_fidp->fs.xattr.xattrwalk_fid = false;
     xattr_fidp->fs.xattr.len = size;
     xattr_fidp->fs.xattr.flags = flags;
     v9fs_string_init(&xattr_fidp->fs.xattr.name);
     v9fs_string_copy(&xattr_fidp->fs.xattr.name, &name);
-    g_free(xattr_fidp->fs.xattr.value);
     xattr_fidp->fs.xattr.value = g_malloc0(size);
     err = offset;
+out_put_fid:
     put_fid(pdu, file_fidp);
 out_nofid:
     pdu_complete(pdu, err);
diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
index 2523a445f8..3976b7fe3d 100644
--- a/hw/9pfs/9p.h
+++ b/hw/9pfs/9p.h
@@ -159,11 +159,12 @@ typedef struct V9fsConf
 
 typedef struct V9fsXattr
 {
-    int64_t copied_len;
-    int64_t len;
+    uint64_t copied_len;
+    uint64_t len;
     void *value;
     V9fsString name;
     int flags;
+    bool xattrwalk_fid;
 } V9fsXattr;
 
 typedef struct V9fsDir {
diff --git a/hw/9pfs/trace-events b/hw/9pfs/trace-events
index 48d3d8abed..fb4de3d465 100644
--- a/hw/9pfs/trace-events
+++ b/hw/9pfs/trace-events
@@ -42,6 +42,6 @@ v9fs_mkdir(uint16_t tag, uint8_t id, int32_t fid, char* name, int mode, uint32_t
 v9fs_mkdir_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path, int err) "tag %u id %u qid={type %d version %d path %"PRId64"} err %d"
 v9fs_xattrwalk(uint16_t tag, uint8_t id, int32_t fid, int32_t newfid, char* name) "tag %d id %d fid %d newfid %d name %s"
 v9fs_xattrwalk_return(uint16_t tag, uint8_t id, int64_t size) "tag %d id %d size %"PRId64
-v9fs_xattrcreate(uint16_t tag, uint8_t id, int32_t fid, char* name, int64_t size, int flags) "tag %d id %d fid %d name %s size %"PRId64" flags %d"
+v9fs_xattrcreate(uint16_t tag, uint8_t id, int32_t fid, char* name, uint64_t size, int flags) "tag %d id %d fid %d name %s size %"PRIu64" flags %d"
 v9fs_readlink(uint16_t tag, uint8_t id, int32_t fid) "tag %d id %d fid %d"
 v9fs_readlink_return(uint16_t tag, uint8_t id, char* target) "tag %d id %d name %s"
diff --git a/hw/arm/cubieboard.c b/hw/arm/cubieboard.c
index fbd78ed01c..dd19ba3c99 100644
--- a/hw/arm/cubieboard.c
+++ b/hw/arm/cubieboard.c
@@ -74,6 +74,7 @@ static void cubieboard_init(MachineState *machine)
     cubieboard_binfo.ram_size = machine->ram_size;
     cubieboard_binfo.kernel_filename = machine->kernel_filename;
     cubieboard_binfo.kernel_cmdline = machine->kernel_cmdline;
+    cubieboard_binfo.initrd_filename = machine->initrd_filename;
     arm_load_kernel(&s->a10->cpu, &cubieboard_binfo);
 }
 
diff --git a/hw/arm/pxa2xx.c b/hw/arm/pxa2xx.c
index 42cdde0478..21ea1d6210 100644
--- a/hw/arm/pxa2xx.c
+++ b/hw/arm/pxa2xx.c
@@ -2267,7 +2267,9 @@ PXA2xxState *pxa255_init(MemoryRegion *address_space, unsigned int sdram_size)
                     qdev_get_gpio_in(s->pic, PXA2XX_PIC_LCD));
 
     s->cm_base = 0x41300000;
-    s->cm_regs[CCCR >> 2] = 0x02000210;	/* 416.0 MHz */
+    s->cm_regs[CCCR >> 2] = 0x00000121;         /* from datasheet */
+    s->cm_regs[CKEN >> 2] = 0x00017def;         /* from datasheet */
+
     s->clkcfg = 0x00000009;		/* Turbo mode active */
     memory_region_init_io(&s->cm_iomem, NULL, &pxa2xx_cm_ops, s, "pxa2xx-cm", 0x1000);
     memory_region_add_subregion(address_space, s->cm_base, &s->cm_iomem);
diff --git a/hw/arm/spitz.c b/hw/arm/spitz.c
index 41cc2eeeb1..949a15ae64 100644
--- a/hw/arm/spitz.c
+++ b/hw/arm/spitz.c
@@ -29,6 +29,7 @@
 #include "sysemu/block-backend.h"
 #include "hw/sysbus.h"
 #include "exec/address-spaces.h"
+#include "sysemu/sysemu.h"
 
 #undef REG_FMT
 #define REG_FMT			"0x%02lx"
@@ -844,9 +845,18 @@ static void spitz_lcd_hsync_handler(void *opaque, int line, int level)
     spitz_hsync ^= 1;
 }
 
+static void spitz_reset(void *opaque, int line, int level)
+{
+    if (level) {
+        qemu_system_reset_request();
+    }
+}
+
 static void spitz_gpio_setup(PXA2xxState *cpu, int slots)
 {
     qemu_irq lcd_hsync;
+    qemu_irq reset;
+
     /*
      * Bad hack: We toggle the LCD hsync GPIO on every GPIO status
      * read to satisfy broken guests that poll-wait for hsync.
@@ -867,7 +877,8 @@ static void spitz_gpio_setup(PXA2xxState *cpu, int slots)
     qemu_irq_raise(qdev_get_gpio_in(cpu->gpio, SPITZ_GPIO_BAT_COVER));
 
     /* Handle reset */
-    qdev_connect_gpio_out(cpu->gpio, SPITZ_GPIO_ON_RESET, cpu->reset);
+    reset = qemu_allocate_irq(spitz_reset, cpu, 0);
+    qdev_connect_gpio_out(cpu->gpio, SPITZ_GPIO_ON_RESET, reset);
 
     /* PCMCIA signals: card's IRQ and Card-Detect */
     if (slots >= 1)
diff --git a/hw/arm/tosa.c b/hw/arm/tosa.c
index 2db66508b5..1ee12f49b3 100644
--- a/hw/arm/tosa.c
+++ b/hw/arm/tosa.c
@@ -25,6 +25,7 @@
 #include "sysemu/block-backend.h"
 #include "hw/sysbus.h"
 #include "exec/address-spaces.h"
+#include "sysemu/sysemu.h"
 
 #define TOSA_RAM    0x04000000
 #define TOSA_ROM	0x00800000
@@ -86,6 +87,12 @@ static void tosa_out_switch(void *opaque, int line, int level)
     }
 }
 
+static void tosa_reset(void *opaque, int line, int level)
+{
+    if (level) {
+        qemu_system_reset_request();
+    }
+}
 
 static void tosa_gpio_setup(PXA2xxState *cpu,
                 DeviceState *scp0,
@@ -93,13 +100,16 @@ static void tosa_gpio_setup(PXA2xxState *cpu,
                 TC6393xbState *tmio)
 {
     qemu_irq *outsignals = qemu_allocate_irqs(tosa_out_switch, cpu, 4);
+    qemu_irq reset;
+
     /* MMC/SD host */
     pxa2xx_mmci_handlers(cpu->mmc,
                     qdev_get_gpio_in(scp0, TOSA_GPIO_SD_WP),
                     qemu_irq_invert(qdev_get_gpio_in(cpu->gpio, TOSA_GPIO_nSD_DETECT)));
 
     /* Handle reset */
-    qdev_connect_gpio_out(cpu->gpio, TOSA_GPIO_ON_RESET, cpu->reset);
+    reset = qemu_allocate_irq(tosa_reset, cpu, 0);
+    qdev_connect_gpio_out(cpu->gpio, TOSA_GPIO_ON_RESET, reset);
 
     /* PCMCIA signals: card's IRQ and Card-Detect */
     pxa2xx_pcmcia_set_irq_cb(cpu->pcmcia[0],
diff --git a/hw/arm/versatilepb.c b/hw/arm/versatilepb.c
index 8ae5392bcc..7b5cb36d5a 100644
--- a/hw/arm/versatilepb.c
+++ b/hw/arm/versatilepb.c
@@ -198,6 +198,15 @@ static void versatile_init(MachineState *machine, int board_id)
     int done_smc = 0;
     DriveInfo *dinfo;
 
+    if (machine->ram_size > 0x10000000) {
+        /* Device starting at address 0x10000000,
+         * and memory cannot overlap with devices.
+         * Refuse to run rather than behaving very confusingly.
+         */
+        error_report("versatilepb: memory size must not exceed 256MB");
+        exit(1);
+    }
+
     if (!machine->cpu_model) {
         machine->cpu_model = "arm926";
     }
diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
index 5fc10df08e..f953610018 100644
--- a/hw/arm/virt-acpi-build.c
+++ b/hw/arm/virt-acpi-build.c
@@ -594,7 +594,7 @@ build_madt(GArray *table_data, BIOSLinker *linker, VirtGuestInfo *guest_info)
         gicc->uid = i;
         gicc->flags = cpu_to_le32(ACPI_GICC_ENABLED);
 
-        if (armcpu->has_pmu) {
+        if (arm_feature(&armcpu->env, ARM_FEATURE_PMU)) {
             gicc->performance_interrupt = cpu_to_le32(PPI(VIRTUAL_PMU_IRQ));
         }
     }
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 070bbf89d4..54a8b28a58 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -85,6 +85,7 @@ typedef struct {
     VirtBoardInfo *daughterboard;
     bool disallow_affinity_adjustment;
     bool no_its;
+    bool no_pmu;
 } VirtMachineClass;
 
 typedef struct {
@@ -490,7 +491,7 @@ static void fdt_add_pmu_nodes(const VirtBoardInfo *vbi, int gictype)
 
     CPU_FOREACH(cpu) {
         armcpu = ARM_CPU(cpu);
-        if (!armcpu->has_pmu ||
+        if (!arm_feature(&armcpu->env, ARM_FEATURE_PMU) ||
             !kvm_arm_pmu_create(cpu, PPI(VIRTUAL_PMU_IRQ))) {
             return;
         }
@@ -1353,6 +1354,10 @@ static void machvirt_init(MachineState *machine)
             }
         }
 
+        if (vmc->no_pmu && object_property_find(cpuobj, "pmu", NULL)) {
+            object_property_set_bool(cpuobj, false, "pmu", NULL);
+        }
+
         if (object_property_find(cpuobj, "reset-cbar", NULL)) {
             object_property_set_int(cpuobj, vbi->memmap[VIRT_CPUPERIPHS].base,
                                     "reset-cbar", &error_abort);
@@ -1592,5 +1597,7 @@ static void virt_machine_2_6_options(MachineClass *mc)
     virt_machine_2_7_options(mc);
     SET_MACHINE_COMPAT(mc, VIRT_COMPAT_2_6);
     vmc->disallow_affinity_adjustment = true;
+    /* Disable PMU for 2.6 as PMU support was first introduced in 2.7 */
+    vmc->no_pmu = true;
 }
 DEFINE_VIRT_MACHINE(2, 6)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index b380142028..d479fd22f5 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -375,7 +375,7 @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd)
     if (!cqid || nvme_check_cqid(n, cqid)) {
         return NVME_INVALID_CQID | NVME_DNR;
     }
-    if (!sqid || (sqid && !nvme_check_sqid(n, sqid))) {
+    if (!sqid || !nvme_check_sqid(n, sqid)) {
         return NVME_INVALID_QID | NVME_DNR;
     }
     if (!qsize || qsize > NVME_CAP_MQES(n->bar.cap)) {
@@ -449,7 +449,7 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
     uint16_t qflags = le16_to_cpu(c->cq_flags);
     uint64_t prp1 = le64_to_cpu(c->prp1);
 
-    if (!cqid || (cqid && !nvme_check_cqid(n, cqid))) {
+    if (!cqid || !nvme_check_cqid(n, cqid)) {
         return NVME_INVALID_CQID | NVME_DNR;
     }
     if (!qsize || qsize > NVME_CAP_MQES(n->bar.cap)) {
diff --git a/hw/block/xen_disk.c b/hw/block/xen_disk.c
index 1292a4b459..3a7dc194e2 100644
--- a/hw/block/xen_disk.c
+++ b/hw/block/xen_disk.c
@@ -167,12 +167,12 @@ static void destroy_grant(gpointer pgnt)
     xengnttab_handle *gnt = grant->blkdev->xendev.gnttabdev;
 
     if (xengnttab_unmap(gnt, grant->page, 1) != 0) {
-        xen_be_printf(&grant->blkdev->xendev, 0,
+        xen_pv_printf(&grant->blkdev->xendev, 0,
                       "xengnttab_unmap failed: %s\n",
                       strerror(errno));
     }
     grant->blkdev->persistent_gnt_count--;
-    xen_be_printf(&grant->blkdev->xendev, 3,
+    xen_pv_printf(&grant->blkdev->xendev, 3,
                   "unmapped grant %p\n", grant->page);
     g_free(grant);
 }
@@ -184,11 +184,11 @@ static void remove_persistent_region(gpointer data, gpointer dev)
     xengnttab_handle *gnt = blkdev->xendev.gnttabdev;
 
     if (xengnttab_unmap(gnt, region->addr, region->num) != 0) {
-        xen_be_printf(&blkdev->xendev, 0,
+        xen_pv_printf(&blkdev->xendev, 0,
                       "xengnttab_unmap region %p failed: %s\n",
                       region->addr, strerror(errno));
     }
-    xen_be_printf(&blkdev->xendev, 3,
+    xen_pv_printf(&blkdev->xendev, 3,
                   "unmapped grant region %p with %d pages\n",
                   region->addr, region->num);
     g_free(region);
@@ -255,7 +255,7 @@ static int ioreq_parse(struct ioreq *ioreq)
     size_t len;
     int i;
 
-    xen_be_printf(&blkdev->xendev, 3,
+    xen_pv_printf(&blkdev->xendev, 3,
                   "op %d, nr %d, handle %d, id %" PRId64 ", sector %" PRId64 "\n",
                   ioreq->req.operation, ioreq->req.nr_segments,
                   ioreq->req.handle, ioreq->req.id, ioreq->req.sector_number);
@@ -275,28 +275,28 @@ static int ioreq_parse(struct ioreq *ioreq)
     case BLKIF_OP_DISCARD:
         return 0;
     default:
-        xen_be_printf(&blkdev->xendev, 0, "error: unknown operation (%d)\n",
+        xen_pv_printf(&blkdev->xendev, 0, "error: unknown operation (%d)\n",
                       ioreq->req.operation);
         goto err;
     };
 
     if (ioreq->req.operation != BLKIF_OP_READ && blkdev->mode[0] != 'w') {
-        xen_be_printf(&blkdev->xendev, 0, "error: write req for ro device\n");
+        xen_pv_printf(&blkdev->xendev, 0, "error: write req for ro device\n");
         goto err;
     }
 
     ioreq->start = ioreq->req.sector_number * blkdev->file_blk;
     for (i = 0; i < ioreq->req.nr_segments; i++) {
         if (i == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
-            xen_be_printf(&blkdev->xendev, 0, "error: nr_segments too big\n");
+            xen_pv_printf(&blkdev->xendev, 0, "error: nr_segments too big\n");
             goto err;
         }
         if (ioreq->req.seg[i].first_sect > ioreq->req.seg[i].last_sect) {
-            xen_be_printf(&blkdev->xendev, 0, "error: first > last sector\n");
+            xen_pv_printf(&blkdev->xendev, 0, "error: first > last sector\n");
             goto err;
         }
         if (ioreq->req.seg[i].last_sect * BLOCK_SIZE >= XC_PAGE_SIZE) {
-            xen_be_printf(&blkdev->xendev, 0, "error: page crossing\n");
+            xen_pv_printf(&blkdev->xendev, 0, "error: page crossing\n");
             goto err;
         }
 
@@ -308,7 +308,7 @@ static int ioreq_parse(struct ioreq *ioreq)
         qemu_iovec_add(&ioreq->v, (void*)mem, len);
     }
     if (ioreq->start + ioreq->v.size > blkdev->file_size) {
-        xen_be_printf(&blkdev->xendev, 0, "error: access beyond end of file\n");
+        xen_pv_printf(&blkdev->xendev, 0, "error: access beyond end of file\n");
         goto err;
     }
     return 0;
@@ -331,7 +331,7 @@ static void ioreq_unmap(struct ioreq *ioreq)
             return;
         }
         if (xengnttab_unmap(gnt, ioreq->pages, ioreq->num_unmap) != 0) {
-            xen_be_printf(&ioreq->blkdev->xendev, 0,
+            xen_pv_printf(&ioreq->blkdev->xendev, 0,
                           "xengnttab_unmap failed: %s\n",
                           strerror(errno));
         }
@@ -343,7 +343,7 @@ static void ioreq_unmap(struct ioreq *ioreq)
                 continue;
             }
             if (xengnttab_unmap(gnt, ioreq->page[i], 1) != 0) {
-                xen_be_printf(&ioreq->blkdev->xendev, 0,
+                xen_pv_printf(&ioreq->blkdev->xendev, 0,
                               "xengnttab_unmap failed: %s\n",
                               strerror(errno));
             }
@@ -381,7 +381,7 @@ static int ioreq_map(struct ioreq *ioreq)
 
             if (grant != NULL) {
                 page[i] = grant->page;
-                xen_be_printf(&ioreq->blkdev->xendev, 3,
+                xen_pv_printf(&ioreq->blkdev->xendev, 3,
                               "using persistent-grant %" PRIu32 "\n",
                               ioreq->refs[i]);
             } else {
@@ -410,7 +410,7 @@ static int ioreq_map(struct ioreq *ioreq)
         ioreq->pages = xengnttab_map_grant_refs
             (gnt, new_maps, domids, refs, ioreq->prot);
         if (ioreq->pages == NULL) {
-            xen_be_printf(&ioreq->blkdev->xendev, 0,
+            xen_pv_printf(&ioreq->blkdev->xendev, 0,
                           "can't map %d grant refs (%s, %d maps)\n",
                           new_maps, strerror(errno), ioreq->blkdev->cnt_map);
             return -1;
@@ -426,7 +426,7 @@ static int ioreq_map(struct ioreq *ioreq)
             ioreq->page[i] = xengnttab_map_grant_ref
                 (gnt, domids[i], refs[i], ioreq->prot);
             if (ioreq->page[i] == NULL) {
-                xen_be_printf(&ioreq->blkdev->xendev, 0,
+                xen_pv_printf(&ioreq->blkdev->xendev, 0,
                               "can't map grant ref %d (%s, %d maps)\n",
                               refs[i], strerror(errno), ioreq->blkdev->cnt_map);
                 ioreq->mapped = 1;
@@ -474,7 +474,7 @@ static int ioreq_map(struct ioreq *ioreq)
                 grant->page = ioreq->page[new_maps];
             }
             grant->blkdev = ioreq->blkdev;
-            xen_be_printf(&ioreq->blkdev->xendev, 3,
+            xen_pv_printf(&ioreq->blkdev->xendev, 3,
                           "adding grant %" PRIu32 " page: %p\n",
                           refs[new_maps], grant->page);
             g_tree_insert(ioreq->blkdev->persistent_gnts,
@@ -557,7 +557,7 @@ static int ioreq_grant_copy(struct ioreq *ioreq)
     rc = xengnttab_grant_copy(gnt, count, segs);
 
     if (rc) {
-        xen_be_printf(&ioreq->blkdev->xendev, 0,
+        xen_pv_printf(&ioreq->blkdev->xendev, 0,
                       "failed to copy data %d\n", rc);
         ioreq->aio_errors++;
         return -1;
@@ -565,7 +565,7 @@ static int ioreq_grant_copy(struct ioreq *ioreq)
 
     for (i = 0; i < count; i++) {
         if (segs[i].status != GNTST_okay) {
-            xen_be_printf(&ioreq->blkdev->xendev, 3,
+            xen_pv_printf(&ioreq->blkdev->xendev, 3,
                           "failed to copy data %d for gref %d, domid %d\n",
                           segs[i].status, ioreq->refs[i], ioreq->domids[i]);
             ioreq->aio_errors++;
@@ -599,7 +599,7 @@ static void qemu_aio_complete(void *opaque, int ret)
     struct ioreq *ioreq = opaque;
 
     if (ret != 0) {
-        xen_be_printf(&ioreq->blkdev->xendev, 0, "%s I/O error\n",
+        xen_pv_printf(&ioreq->blkdev->xendev, 0, "%s I/O error\n",
                       ioreq->req.operation == BLKIF_OP_READ ? "read" : "write");
         ioreq->aio_errors++;
     }
@@ -796,7 +796,7 @@ static void blk_send_response_all(struct XenBlkDev *blkdev)
         ioreq_release(ioreq, true);
     }
     if (send_notify) {
-        xen_be_send_notify(&blkdev->xendev);
+        xen_pv_send_notify(&blkdev->xendev);
     }
 }
 
@@ -866,7 +866,7 @@ static void blk_handle_requests(struct XenBlkDev *blkdev)
             };
 
             if (blk_send_response_one(ioreq)) {
-                xen_be_send_notify(&blkdev->xendev);
+                xen_pv_send_notify(&blkdev->xendev);
             }
             ioreq_release(ioreq, false);
             continue;
@@ -910,7 +910,7 @@ static void blk_alloc(struct XenDevice *xendev)
     }
     if (xengnttab_set_max_grants(xendev->gnttabdev,
             MAX_GRANTS(max_requests, BLKIF_MAX_SEGMENTS_PER_REQUEST)) < 0) {
-        xen_be_printf(xendev, 0, "xengnttab_set_max_grants failed: %s\n",
+        xen_pv_printf(xendev, 0, "xengnttab_set_max_grants failed: %s\n",
                       strerror(errno));
     }
 }
@@ -1056,11 +1056,11 @@ static int blk_connect(struct XenDevice *xendev)
         }
 
         /* setup via xenbus -> create new block driver instance */
-        xen_be_printf(&blkdev->xendev, 2, "create new bdrv (xenbus setup)\n");
+        xen_pv_printf(&blkdev->xendev, 2, "create new bdrv (xenbus setup)\n");
         blkdev->blk = blk_new_open(blkdev->filename, NULL, options,
                                    qflags, &local_err);
         if (!blkdev->blk) {
-            xen_be_printf(&blkdev->xendev, 0, "error: %s\n",
+            xen_pv_printf(&blkdev->xendev, 0, "error: %s\n",
                           error_get_pretty(local_err));
             error_free(local_err);
             return -1;
@@ -1068,10 +1068,11 @@ static int blk_connect(struct XenDevice *xendev)
         blk_set_enable_write_cache(blkdev->blk, !writethrough);
     } else {
         /* setup via qemu cmdline -> already setup for us */
-        xen_be_printf(&blkdev->xendev, 2, "get configured bdrv (cmdline setup)\n");
+        xen_pv_printf(&blkdev->xendev, 2,
+                      "get configured bdrv (cmdline setup)\n");
         blkdev->blk = blk_by_legacy_dinfo(blkdev->dinfo);
         if (blk_is_read_only(blkdev->blk) && !readonly) {
-            xen_be_printf(&blkdev->xendev, 0, "Unexpected read-only drive");
+            xen_pv_printf(&blkdev->xendev, 0, "Unexpected read-only drive");
             blkdev->blk = NULL;
             return -1;
         }
@@ -1084,13 +1085,13 @@ static int blk_connect(struct XenDevice *xendev)
     if (blkdev->file_size < 0) {
         BlockDriverState *bs = blk_bs(blkdev->blk);
         const char *drv_name = bs ? bdrv_get_format_name(bs) : NULL;
-        xen_be_printf(&blkdev->xendev, 1, "blk_getlength: %d (%s) | drv %s\n",
+        xen_pv_printf(&blkdev->xendev, 1, "blk_getlength: %d (%s) | drv %s\n",
                       (int)blkdev->file_size, strerror(-blkdev->file_size),
                       drv_name ?: "-");
         blkdev->file_size = 0;
     }
 
-    xen_be_printf(xendev, 1, "type \"%s\", fileproto \"%s\", filename \"%s\","
+    xen_pv_printf(xendev, 1, "type \"%s\", fileproto \"%s\", filename \"%s\","
                   " size %" PRId64 " (%" PRId64 " MB)\n",
                   blkdev->type, blkdev->fileproto, blkdev->filename,
                   blkdev->file_size, blkdev->file_size >> 20);
@@ -1174,10 +1175,10 @@ static int blk_connect(struct XenDevice *xendev)
     blkdev->feature_grant_copy =
                 (xengnttab_grant_copy(blkdev->xendev.gnttabdev, 0, NULL) == 0);
 
-    xen_be_printf(&blkdev->xendev, 3, "grant copy operation %s\n",
+    xen_pv_printf(&blkdev->xendev, 3, "grant copy operation %s\n",
                   blkdev->feature_grant_copy ? "enabled" : "disabled");
 
-    xen_be_printf(&blkdev->xendev, 1, "ok: proto %s, ring-ref %d, "
+    xen_pv_printf(&blkdev->xendev, 1, "ok: proto %s, ring-ref %d, "
                   "remote port %d, local port %d\n",
                   blkdev->xendev.protocol, blkdev->ring_ref,
                   blkdev->xendev.remote_port, blkdev->xendev.local_port);
@@ -1193,7 +1194,7 @@ static void blk_disconnect(struct XenDevice *xendev)
         blk_unref(blkdev->blk);
         blkdev->blk = NULL;
     }
-    xen_be_unbind_evtchn(&blkdev->xendev);
+    xen_pv_unbind_evtchn(&blkdev->xendev);
 
     if (blkdev->sring) {
         xengnttab_unmap(blkdev->xendev.gnttabdev, blkdev->sring, 1);
diff --git a/hw/char/cadence_uart.c b/hw/char/cadence_uart.c
index c2b9154305..def34cd0d2 100644
--- a/hw/char/cadence_uart.c
+++ b/hw/char/cadence_uart.c
@@ -450,7 +450,8 @@ static void cadence_uart_reset(DeviceState *dev)
     s->r[R_IMR] = 0;
     s->r[R_CISR] = 0;
     s->r[R_RTRIG] = 0x00000020;
-    s->r[R_BRGR] = 0x0000000F;
+    s->r[R_BRGR] = 0x0000028B;
+    s->r[R_BDIV] = 0x0000000F;
     s->r[R_TTRIG] = 0x00000020;
 
     uart_rx_reset(s);
diff --git a/hw/char/xen_console.c b/hw/char/xen_console.c
index 86cdc529a3..c01f41090e 100644
--- a/hw/char/xen_console.c
+++ b/hw/char/xen_console.c
@@ -74,7 +74,7 @@ static void buffer_append(struct XenConsole *con)
 
     xen_mb();
     intf->out_cons = cons;
-    xen_be_send_notify(&con->xendev);
+    xen_pv_send_notify(&con->xendev);
 
     if (buffer->max_capacity &&
 	buffer->size > buffer->max_capacity) {
@@ -142,7 +142,7 @@ static void xencons_receive(void *opaque, const uint8_t *buf, int len)
     }
     xen_wmb();
     intf->in_prod = prod;
-    xen_be_send_notify(&con->xendev);
+    xen_pv_send_notify(&con->xendev);
 }
 
 static void xencons_send(struct XenConsole *con)
@@ -158,16 +158,17 @@ static void xencons_send(struct XenConsole *con)
         len = size;
     }
     if (len < 1) {
-	if (!con->backlog) {
-	    con->backlog = 1;
-	    xen_be_printf(&con->xendev, 1, "backlog piling up, nobody listening?\n");
-	}
+        if (!con->backlog) {
+            con->backlog = 1;
+            xen_pv_printf(&con->xendev, 1,
+                          "backlog piling up, nobody listening?\n");
+        }
     } else {
-	buffer_advance(&con->buffer, len);
-	if (con->backlog && len == size) {
-	    con->backlog = 0;
-	    xen_be_printf(&con->xendev, 1, "backlog is gone\n");
-	}
+        buffer_advance(&con->buffer, len);
+        if (con->backlog && len == size) {
+            con->backlog = 0;
+            xen_pv_printf(&con->xendev, 1, "backlog is gone\n");
+        }
     }
 }
 
@@ -191,7 +192,7 @@ static int con_init(struct XenDevice *xendev)
 
     type = xenstore_read_str(con->console, "type");
     if (!type || strcmp(type, "ioemu") != 0) {
-	xen_be_printf(xendev, 1, "not for me (type=%s)\n", type);
+        xen_pv_printf(xendev, 1, "not for me (type=%s)\n", type);
         ret = -1;
         goto out;
     }
@@ -247,7 +248,8 @@ static int con_initialise(struct XenDevice *xendev)
     qemu_chr_fe_set_handlers(&con->chr, xencons_can_receive,
                              xencons_receive, NULL, con, NULL, true);
 
-    xen_be_printf(xendev, 1, "ring mfn %d, remote port %d, local port %d, limit %zd\n",
+    xen_pv_printf(xendev, 1,
+                  "ring mfn %d, remote port %d, local port %d, limit %zd\n",
 		  con->ring_ref,
 		  con->xendev.remote_port,
 		  con->xendev.local_port,
@@ -260,7 +262,7 @@ static void con_disconnect(struct XenDevice *xendev)
     struct XenConsole *con = container_of(xendev, struct XenConsole, xendev);
 
     qemu_chr_fe_deinit(&con->chr);
-    xen_be_unbind_evtchn(&con->xendev);
+    xen_pv_unbind_evtchn(&con->xendev);
 
     if (con->sring) {
         if (!xendev->dev) {
diff --git a/hw/display/milkymist-tmu2.c b/hw/display/milkymist-tmu2.c
index 9c0018448a..5c666f9b24 100644
--- a/hw/display/milkymist-tmu2.c
+++ b/hw/display/milkymist-tmu2.c
@@ -213,7 +213,7 @@ static void tmu2_start(MilkymistTMU2State *s)
     /* Read the QEMU source framebuffer into an OpenGL texture */
     glGenTextures(1, &texture);
     glBindTexture(GL_TEXTURE_2D, texture);
-    fb_len = 2*s->regs[R_TEXHRES]*s->regs[R_TEXVRES];
+    fb_len = 2ULL * s->regs[R_TEXHRES] * s->regs[R_TEXVRES];
     fb = cpu_physical_memory_map(s->regs[R_TEXFBUF], &fb_len, 0);
     if (fb == NULL) {
         glDeleteTextures(1, &texture);
diff --git a/hw/display/xenfb.c b/hw/display/xenfb.c
index 46b7d5eded..7a8727aa21 100644
--- a/hw/display/xenfb.c
+++ b/hw/display/xenfb.c
@@ -90,28 +90,29 @@ static int common_bind(struct common *c)
     xen_pfn_t mfn;
 
     if (xenstore_read_fe_uint64(&c->xendev, "page-ref", &val) == -1)
-	return -1;
+        return -1;
     mfn = (xen_pfn_t)val;
     assert(val == mfn);
 
     if (xenstore_read_fe_int(&c->xendev, "event-channel", &c->xendev.remote_port) == -1)
-	return -1;
+        return -1;
 
     c->page = xenforeignmemory_map(xen_fmem, c->xendev.dom,
                                    PROT_READ | PROT_WRITE, 1, &mfn, NULL);
     if (c->page == NULL)
-	return -1;
+        return -1;
 
     xen_be_bind_evtchn(&c->xendev);
-    xen_be_printf(&c->xendev, 1, "ring mfn %"PRI_xen_pfn", remote-port %d, local-port %d\n",
-		  mfn, c->xendev.remote_port, c->xendev.local_port);
+    xen_pv_printf(&c->xendev, 1,
+                  "ring mfn %"PRI_xen_pfn", remote-port %d, local-port %d\n",
+                  mfn, c->xendev.remote_port, c->xendev.local_port);
 
     return 0;
 }
 
 static void common_unbind(struct common *c)
 {
-    xen_be_unbind_evtchn(&c->xendev);
+    xen_pv_unbind_evtchn(&c->xendev);
     if (c->page) {
         xenforeignmemory_unmap(xen_fmem, c->page, 1);
 	c->page = NULL;
@@ -214,7 +215,7 @@ static int xenfb_kbd_event(struct XenInput *xenfb,
     XENKBD_IN_RING_REF(page, prod) = *event;
     xen_wmb();		/* ensure ring contents visible */
     page->in_prod = prod + 1;
-    return xen_be_send_notify(&xenfb->c.xendev);
+    return xen_pv_send_notify(&xenfb->c.xendev);
 }
 
 /* Send a keyboard (or mouse button) event */
@@ -345,7 +346,7 @@ static int input_initialise(struct XenDevice *xendev)
     int rc;
 
     if (!in->c.con) {
-        xen_be_printf(xendev, 1, "ds not set (yet)\n");
+        xen_pv_printf(xendev, 1, "ds not set (yet)\n");
         return -1;
     }
 
@@ -396,7 +397,7 @@ static void input_event(struct XenDevice *xendev)
     if (page->out_prod == page->out_cons)
 	return;
     page->out_cons = page->out_prod;
-    xen_be_send_notify(&xenfb->c.xendev);
+    xen_pv_send_notify(&xenfb->c.xendev);
 }
 
 /* -------------------------------------------------------------------- */
@@ -500,8 +501,8 @@ out:
 }
 
 static int xenfb_configure_fb(struct XenFB *xenfb, size_t fb_len_lim,
-			      int width, int height, int depth,
-			      size_t fb_len, int offset, int row_stride)
+                              int width, int height, int depth,
+                              size_t fb_len, int offset, int row_stride)
 {
     size_t mfn_sz = sizeof(*((struct xenfb_page *)0)->pd);
     size_t pd_len = sizeof(((struct xenfb_page *)0)->pd) / mfn_sz;
@@ -510,40 +511,47 @@ static int xenfb_configure_fb(struct XenFB *xenfb, size_t fb_len_lim,
     int max_width, max_height;
 
     if (fb_len_lim > fb_len_max) {
-	xen_be_printf(&xenfb->c.xendev, 0, "fb size limit %zu exceeds %zu, corrected\n",
-		      fb_len_lim, fb_len_max);
-	fb_len_lim = fb_len_max;
+        xen_pv_printf(&xenfb->c.xendev, 0,
+                      "fb size limit %zu exceeds %zu, corrected\n",
+                      fb_len_lim, fb_len_max);
+        fb_len_lim = fb_len_max;
     }
     if (fb_len_lim && fb_len > fb_len_lim) {
-	xen_be_printf(&xenfb->c.xendev, 0, "frontend fb size %zu limited to %zu\n",
-		      fb_len, fb_len_lim);
-	fb_len = fb_len_lim;
+        xen_pv_printf(&xenfb->c.xendev, 0,
+                      "frontend fb size %zu limited to %zu\n",
+                      fb_len, fb_len_lim);
+        fb_len = fb_len_lim;
     }
     if (depth != 8 && depth != 16 && depth != 24 && depth != 32) {
-	xen_be_printf(&xenfb->c.xendev, 0, "can't handle frontend fb depth %d\n",
-		      depth);
-	return -1;
+        xen_pv_printf(&xenfb->c.xendev, 0,
+                      "can't handle frontend fb depth %d\n",
+                      depth);
+        return -1;
     }
     if (row_stride <= 0 || row_stride > fb_len) {
-	xen_be_printf(&xenfb->c.xendev, 0, "invalid frontend stride %d\n", row_stride);
-	return -1;
+        xen_pv_printf(&xenfb->c.xendev, 0, "invalid frontend stride %d\n",
+                      row_stride);
+        return -1;
     }
     max_width = row_stride / (depth / 8);
     if (width < 0 || width > max_width) {
-	xen_be_printf(&xenfb->c.xendev, 0, "invalid frontend width %d limited to %d\n",
-		      width, max_width);
-	width = max_width;
+        xen_pv_printf(&xenfb->c.xendev, 0,
+                      "invalid frontend width %d limited to %d\n",
+                      width, max_width);
+        width = max_width;
     }
     if (offset < 0 || offset >= fb_len) {
-	xen_be_printf(&xenfb->c.xendev, 0, "invalid frontend offset %d (max %zu)\n",
-		      offset, fb_len - 1);
-	return -1;
+        xen_pv_printf(&xenfb->c.xendev, 0,
+                      "invalid frontend offset %d (max %zu)\n",
+                      offset, fb_len - 1);
+        return -1;
     }
     max_height = (fb_len - offset) / row_stride;
     if (height < 0 || height > max_height) {
-	xen_be_printf(&xenfb->c.xendev, 0, "invalid frontend height %d limited to %d\n",
-		      height, max_height);
-	height = max_height;
+        xen_pv_printf(&xenfb->c.xendev, 0,
+                      "invalid frontend height %d limited to %d\n",
+                      height, max_height);
+        height = max_height;
     }
     xenfb->fb_len = fb_len;
     xenfb->row_stride = row_stride;
@@ -553,8 +561,9 @@ static int xenfb_configure_fb(struct XenFB *xenfb, size_t fb_len_lim,
     xenfb->offset = offset;
     xenfb->up_fullscreen = 1;
     xenfb->do_resize = 1;
-    xen_be_printf(&xenfb->c.xendev, 1, "framebuffer %dx%dx%d offset %d stride %d\n",
-		  width, height, depth, offset, row_stride);
+    xen_pv_printf(&xenfb->c.xendev, 1,
+                  "framebuffer %dx%dx%d offset %d stride %d\n",
+                  width, height, depth, offset, row_stride);
     return 0;
 }
 
@@ -631,7 +640,7 @@ static void xenfb_guest_copy(struct XenFB *xenfb, int x, int y, int w, int h)
 	}
     }
     if (oops) /* should not happen */
-        xen_be_printf(&xenfb->c.xendev, 0, "%s: oops: convert %d -> %d bpp?\n",
+        xen_pv_printf(&xenfb->c.xendev, 0, "%s: oops: convert %d -> %d bpp?\n",
                       __FUNCTION__, xenfb->depth, bpp);
 
     dpy_gfx_update(xenfb->c.con, x, y, w, h);
@@ -663,7 +672,7 @@ static void xenfb_send_event(struct XenFB *xenfb, union xenfb_in_event *event)
     xen_wmb();                  /* ensure ring contents visible */
     page->in_prod = prod + 1;
 
-    xen_be_send_notify(&xenfb->c.xendev);
+    xen_pv_send_notify(&xenfb->c.xendev);
 }
 
 static void xenfb_send_refresh_period(struct XenFB *xenfb, int period)
@@ -696,9 +705,9 @@ static void xenfb_update(void *opaque)
         return;
 
     if (!xenfb->feature_update) {
-	/* we don't get update notifications, thus use the
-	 * sledge hammer approach ... */
-	xenfb->up_fullscreen = 1;
+        /* we don't get update notifications, thus use the
+         * sledge hammer approach ... */
+        xenfb->up_fullscreen = 1;
     }
 
     /* resize if needed */
@@ -721,7 +730,8 @@ static void xenfb_update(void *opaque)
             break;
         }
         dpy_gfx_replace_surface(xenfb->c.con, surface);
-        xen_be_printf(&xenfb->c.xendev, 1, "update: resizing: %dx%d @ %d bpp%s\n",
+        xen_pv_printf(&xenfb->c.xendev, 1,
+                      "update: resizing: %dx%d @ %d bpp%s\n",
                       xenfb->width, xenfb->height, xenfb->depth,
                       is_buffer_shared(surface) ? " (shared)" : "");
         xenfb->up_fullscreen = 1;
@@ -729,18 +739,19 @@ static void xenfb_update(void *opaque)
 
     /* run queued updates */
     if (xenfb->up_fullscreen) {
-	xen_be_printf(&xenfb->c.xendev, 3, "update: fullscreen\n");
-	xenfb_guest_copy(xenfb, 0, 0, xenfb->width, xenfb->height);
+        xen_pv_printf(&xenfb->c.xendev, 3, "update: fullscreen\n");
+        xenfb_guest_copy(xenfb, 0, 0, xenfb->width, xenfb->height);
     } else if (xenfb->up_count) {
-	xen_be_printf(&xenfb->c.xendev, 3, "update: %d rects\n", xenfb->up_count);
-	for (i = 0; i < xenfb->up_count; i++)
-	    xenfb_guest_copy(xenfb,
-			     xenfb->up_rects[i].x,
-			     xenfb->up_rects[i].y,
-			     xenfb->up_rects[i].w,
-			     xenfb->up_rects[i].h);
+        xen_pv_printf(&xenfb->c.xendev, 3, "update: %d rects\n",
+                      xenfb->up_count);
+        for (i = 0; i < xenfb->up_count; i++)
+            xenfb_guest_copy(xenfb,
+                             xenfb->up_rects[i].x,
+                             xenfb->up_rects[i].y,
+                             xenfb->up_rects[i].w,
+                             xenfb->up_rects[i].h);
     } else {
-	xen_be_printf(&xenfb->c.xendev, 3, "update: nothing\n");
+        xen_pv_printf(&xenfb->c.xendev, 3, "update: nothing\n");
     }
     xenfb->up_count = 0;
     xenfb->up_fullscreen = 0;
@@ -794,14 +805,14 @@ static void xenfb_handle_events(struct XenFB *xenfb)
 	    w = MIN(event->update.width, xenfb->width - x);
 	    h = MIN(event->update.height, xenfb->height - y);
 	    if (w < 0 || h < 0) {
-                xen_be_printf(&xenfb->c.xendev, 1, "bogus update ignored\n");
+                xen_pv_printf(&xenfb->c.xendev, 1, "bogus update ignored\n");
 		break;
 	    }
 	    if (x != event->update.x ||
                 y != event->update.y ||
 		w != event->update.width ||
 		h != event->update.height) {
-                xen_be_printf(&xenfb->c.xendev, 1, "bogus update clipped\n");
+                xen_pv_printf(&xenfb->c.xendev, 1, "bogus update clipped\n");
 	    }
 	    if (w == xenfb->width && h > xenfb->height / 2) {
 		/* scroll detector: updated more than 50% of the lines,
@@ -883,7 +894,7 @@ static int fb_initialise(struct XenDevice *xendev)
     if (fb->feature_update)
 	xenstore_write_be_int(xendev, "request-update", 1);
 
-    xen_be_printf(xendev, 1, "feature-update=%d, videoram=%d\n",
+    xen_pv_printf(xendev, 1, "feature-update=%d, videoram=%d\n",
 		  fb->feature_update, videoram);
     return 0;
 }
@@ -902,7 +913,7 @@ static void fb_disconnect(struct XenDevice *xendev)
                       PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON,
                       -1, 0);
     if (fb->pixels == MAP_FAILED) {
-        xen_be_printf(xendev, 0,
+        xen_pv_printf(xendev, 0,
                 "Couldn't replace the framebuffer with anonymous memory errno=%d\n",
                 errno);
     }
@@ -923,7 +934,7 @@ static void fb_frontend_changed(struct XenDevice *xendev, const char *node)
     if (fb->bug_trigger == 0 && strcmp(node, "state") == 0 &&
         xendev->fe_state == XenbusStateConnected &&
         xendev->be_state == XenbusStateConnected) {
-        xen_be_printf(xendev, 2, "re-trigger connected (frontend bug)\n");
+        xen_pv_printf(xendev, 2, "re-trigger connected (frontend bug)\n");
         xen_be_set_state(xendev, XenbusStateConnected);
         fb->bug_trigger = 1; /* only once */
     }
@@ -934,7 +945,7 @@ static void fb_event(struct XenDevice *xendev)
     struct XenFB *xenfb = container_of(xendev, struct XenFB, c.xendev);
 
     xenfb_handle_events(xenfb);
-    xen_be_send_notify(&xenfb->c.xendev);
+    xen_pv_send_notify(&xenfb->c.xendev);
 }
 
 /* -------------------------------------------------------------------- */
@@ -977,14 +988,14 @@ void xen_init_display(int domid)
 wait_more:
     i++;
     main_loop_wait(true);
-    xfb = xen_be_find_xendev("vfb", domid, 0);
-    xin = xen_be_find_xendev("vkbd", domid, 0);
+    xfb = xen_pv_find_xendev("vfb", domid, 0);
+    xin = xen_pv_find_xendev("vkbd", domid, 0);
     if (!xfb || !xin) {
         if (i < 256) {
             usleep(10000);
             goto wait_more;
         }
-        xen_be_printf(NULL, 1, "displaystate setup failed\n");
+        xen_pv_printf(NULL, 1, "displaystate setup failed\n");
         return;
     }
 
diff --git a/hw/gpio/imx_gpio.c b/hw/gpio/imx_gpio.c
index f3574aa8f3..c36c394fda 100644
--- a/hw/gpio/imx_gpio.c
+++ b/hw/gpio/imx_gpio.c
@@ -237,7 +237,7 @@ static void imx_gpio_write(void *opaque, hwaddr offset, uint64_t value,
         break;
 
     case ISR_ADDR:
-        s->isr |= ~value;
+        s->isr &= ~value;
         imx_gpio_set_all_int_lines(s);
         break;
 
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 7aaa07a177..ce9cc93af4 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -53,7 +53,6 @@
 #include "hw/pci/pci_bus.h"
 #include "hw/pci-host/q35.h"
 #include "hw/i386/x86-iommu.h"
-#include "hw/timer/hpet.h"
 
 #include "hw/acpi/aml-build.h"
 
diff --git a/hw/i386/kvm/apic.c b/hw/i386/kvm/apic.c
index 39b73e7b3d..01cbaa88d2 100644
--- a/hw/i386/kvm/apic.c
+++ b/hw/i386/kvm/apic.c
@@ -133,9 +133,9 @@ static void kvm_apic_vapic_base_update(APICCommonState *s)
     }
 }
 
-static void kvm_apic_put(CPUState *cs, void *data)
+static void kvm_apic_put(CPUState *cs, run_on_cpu_data data)
 {
-    APICCommonState *s = data;
+    APICCommonState *s = data.host_ptr;
     struct kvm_lapic_state kapic;
     int ret;
 
@@ -151,12 +151,12 @@ static void kvm_apic_put(CPUState *cs, void *data)
 
 static void kvm_apic_post_load(APICCommonState *s)
 {
-    run_on_cpu(CPU(s->cpu), kvm_apic_put, s);
+    run_on_cpu(CPU(s->cpu), kvm_apic_put, RUN_ON_CPU_HOST_PTR(s));
 }
 
-static void do_inject_external_nmi(CPUState *cpu, void *data)
+static void do_inject_external_nmi(CPUState *cpu, run_on_cpu_data data)
 {
-    APICCommonState *s = data;
+    APICCommonState *s = data.host_ptr;
     uint32_t lvt;
     int ret;
 
@@ -174,7 +174,7 @@ static void do_inject_external_nmi(CPUState *cpu, void *data)
 
 static void kvm_apic_external_nmi(APICCommonState *s)
 {
-    run_on_cpu(CPU(s->cpu), do_inject_external_nmi, s);
+    run_on_cpu(CPU(s->cpu), do_inject_external_nmi, RUN_ON_CPU_HOST_PTR(s));
 }
 
 static void kvm_send_msi(MSIMessage *msg)
@@ -213,7 +213,7 @@ static void kvm_apic_reset(APICCommonState *s)
     /* Not used by KVM, which uses the CPU mp_state instead.  */
     s->wait_for_sipi = 0;
 
-    run_on_cpu(CPU(s->cpu), kvm_apic_put, s);
+    run_on_cpu(CPU(s->cpu), kvm_apic_put, RUN_ON_CPU_HOST_PTR(s));
 }
 
 static void kvm_apic_realize(DeviceState *dev, Error **errp)
diff --git a/hw/i386/kvmvapic.c b/hw/i386/kvmvapic.c
index 74a549becf..b30d1b90c6 100644
--- a/hw/i386/kvmvapic.c
+++ b/hw/i386/kvmvapic.c
@@ -17,6 +17,7 @@
 #include "sysemu/kvm.h"
 #include "hw/i386/apic_internal.h"
 #include "hw/sysbus.h"
+#include "tcg/tcg.h"
 
 #define VAPIC_IO_PORT           0x7e
 
@@ -449,6 +450,9 @@ static void patch_instruction(VAPICROMState *s, X86CPU *cpu, target_ulong ip)
     resume_all_vcpus();
 
     if (!kvm_enabled()) {
+        /* tb_lock will be reset when cpu_loop_exit_noexc longjmps
+         * back into the cpu_exec loop. */
+        tb_lock();
         tb_gen_code(cs, current_pc, current_cs_base, current_flags, 1);
         cpu_loop_exit_noexc(cs);
     }
@@ -483,10 +487,9 @@ typedef struct VAPICEnableTPRReporting {
     bool enable;
 } VAPICEnableTPRReporting;
 
-static void vapic_do_enable_tpr_reporting(CPUState *cpu, void *data)
+static void vapic_do_enable_tpr_reporting(CPUState *cpu, run_on_cpu_data data)
 {
-    VAPICEnableTPRReporting *info = data;
-
+    VAPICEnableTPRReporting *info = data.host_ptr;
     apic_enable_tpr_access_reporting(info->apic, info->enable);
 }
 
@@ -501,7 +504,7 @@ static void vapic_enable_tpr_reporting(bool enable)
     CPU_FOREACH(cs) {
         cpu = X86_CPU(cs);
         info.apic = cpu->apic_state;
-        run_on_cpu(cs, vapic_do_enable_tpr_reporting, &info);
+        run_on_cpu(cs, vapic_do_enable_tpr_reporting, RUN_ON_CPU_HOST_PTR(&info));
     }
 }
 
@@ -734,9 +737,9 @@ static void vapic_realize(DeviceState *dev, Error **errp)
     nb_option_roms++;
 }
 
-static void do_vapic_enable(CPUState *cs, void *data)
+static void do_vapic_enable(CPUState *cs, run_on_cpu_data data)
 {
-    VAPICROMState *s = data;
+    VAPICROMState *s = data.host_ptr;
     X86CPU *cpu = X86_CPU(cs);
 
     static const uint8_t enabled = 1;
@@ -758,7 +761,7 @@ static void kvmvapic_vm_state_change(void *opaque, int running,
 
     if (s->state == VAPIC_ACTIVE) {
         if (smp_cpus == 1) {
-            run_on_cpu(first_cpu, do_vapic_enable, s);
+            run_on_cpu(first_cpu, do_vapic_enable, RUN_ON_CPU_HOST_PTR(s));
         } else {
             zero = g_malloc0(s->rom_state.vapic_size);
             cpu_physical_memory_write(s->vapic_paddr, zero,
diff --git a/hw/microblaze/boot.c b/hw/microblaze/boot.c
index 9eebb1a521..1834d22a61 100644
--- a/hw/microblaze/boot.c
+++ b/hw/microblaze/boot.c
@@ -30,7 +30,6 @@
 #include "qemu/option.h"
 #include "qemu/config-file.h"
 #include "qemu/error-report.h"
-#include "qemu-common.h"
 #include "sysemu/device_tree.h"
 #include "sysemu/sysemu.h"
 #include "hw/loader.h"
diff --git a/hw/mips/mips_malta.c b/hw/mips/mips_malta.c
index cf9bd3eb45..cf48f420cc 100644
--- a/hw/mips/mips_malta.c
+++ b/hw/mips/mips_malta.c
@@ -47,7 +47,6 @@
 #include "elf.h"
 #include "hw/timer/mc146818rtc.h"
 #include "hw/timer/i8254.h"
-#include "sysemu/block-backend.h"
 #include "sysemu/blockdev.h"
 #include "exec/address-spaces.h"
 #include "hw/sysbus.h"             /* SysBusDevice */
diff --git a/hw/misc/milkymist-pfpu.c b/hw/misc/milkymist-pfpu.c
index 1da21a643e..3ca25894f1 100644
--- a/hw/misc/milkymist-pfpu.c
+++ b/hw/misc/milkymist-pfpu.c
@@ -137,7 +137,7 @@ struct MilkymistPFPUState {
 };
 typedef struct MilkymistPFPUState MilkymistPFPUState;
 
-static inline hwaddr
+static inline uint32_t
 get_dma_address(uint32_t base, uint32_t x, uint32_t y)
 {
     return base + 8 * (128 * y + x);
diff --git a/hw/net/xen_nic.c b/hw/net/xen_nic.c
index 6856b52999..20c43a61b3 100644
--- a/hw/net/xen_nic.c
+++ b/hw/net/xen_nic.c
@@ -69,7 +69,7 @@ static void net_tx_response(struct XenNetDev *netdev, netif_tx_request_t *txp, i
     netdev->tx_ring.rsp_prod_pvt = ++i;
     RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netdev->tx_ring, notify);
     if (notify) {
-        xen_be_send_notify(&netdev->xendev);
+        xen_pv_send_notify(&netdev->xendev);
     }
 
     if (i == netdev->tx_ring.req_cons) {
@@ -128,30 +128,32 @@ static void net_tx_packets(struct XenNetDev *netdev)
             /* should not happen in theory, we don't announce the *
              * feature-{sg,gso,whatelse} flags in xenstore (yet?) */
             if (txreq.flags & NETTXF_extra_info) {
-                xen_be_printf(&netdev->xendev, 0, "FIXME: extra info flag\n");
+                xen_pv_printf(&netdev->xendev, 0, "FIXME: extra info flag\n");
                 net_tx_error(netdev, &txreq, rc);
                 continue;
             }
             if (txreq.flags & NETTXF_more_data) {
-                xen_be_printf(&netdev->xendev, 0, "FIXME: more data flag\n");
+                xen_pv_printf(&netdev->xendev, 0, "FIXME: more data flag\n");
                 net_tx_error(netdev, &txreq, rc);
                 continue;
             }
 #endif
 
             if (txreq.size < 14) {
-                xen_be_printf(&netdev->xendev, 0, "bad packet size: %d\n", txreq.size);
+                xen_pv_printf(&netdev->xendev, 0, "bad packet size: %d\n",
+                              txreq.size);
                 net_tx_error(netdev, &txreq, rc);
                 continue;
             }
 
             if ((txreq.offset + txreq.size) > XC_PAGE_SIZE) {
-                xen_be_printf(&netdev->xendev, 0, "error: page crossing\n");
+                xen_pv_printf(&netdev->xendev, 0, "error: page crossing\n");
                 net_tx_error(netdev, &txreq, rc);
                 continue;
             }
 
-            xen_be_printf(&netdev->xendev, 3, "tx packet ref %d, off %d, len %d, flags 0x%x%s%s%s%s\n",
+            xen_pv_printf(&netdev->xendev, 3,
+                          "tx packet ref %d, off %d, len %d, flags 0x%x%s%s%s%s\n",
                           txreq.gref, txreq.offset, txreq.size, txreq.flags,
                           (txreq.flags & NETTXF_csum_blank)     ? " csum_blank"     : "",
                           (txreq.flags & NETTXF_data_validated) ? " data_validated" : "",
@@ -162,8 +164,9 @@ static void net_tx_packets(struct XenNetDev *netdev)
                                            netdev->xendev.dom,
                                            txreq.gref, PROT_READ);
             if (page == NULL) {
-                xen_be_printf(&netdev->xendev, 0, "error: tx gref dereference failed (%d)\n",
-                              txreq.gref);
+                xen_pv_printf(&netdev->xendev, 0,
+                              "error: tx gref dereference failed (%d)\n",
+                             txreq.gref);
                 net_tx_error(netdev, &txreq, rc);
                 continue;
             }
@@ -211,13 +214,14 @@ static void net_rx_response(struct XenNetDev *netdev,
         resp->status = (int16_t)st;
     }
 
-    xen_be_printf(&netdev->xendev, 3, "rx response: idx %d, status %d, flags 0x%x\n",
+    xen_pv_printf(&netdev->xendev, 3,
+                  "rx response: idx %d, status %d, flags 0x%x\n",
                   i, resp->status, resp->flags);
 
     netdev->rx_ring.rsp_prod_pvt = ++i;
     RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netdev->rx_ring, notify);
     if (notify) {
-        xen_be_send_notify(&netdev->xendev);
+        xen_pv_send_notify(&netdev->xendev);
     }
 }
 
@@ -242,7 +246,7 @@ static ssize_t net_rx_packet(NetClientState *nc, const uint8_t *buf, size_t size
         return 0;
     }
     if (size > XC_PAGE_SIZE - NET_IP_ALIGN) {
-        xen_be_printf(&netdev->xendev, 0, "packet too big (%lu > %ld)",
+        xen_pv_printf(&netdev->xendev, 0, "packet too big (%lu > %ld)",
                       (unsigned long)size, XC_PAGE_SIZE - NET_IP_ALIGN);
         return -1;
     }
@@ -254,7 +258,8 @@ static ssize_t net_rx_packet(NetClientState *nc, const uint8_t *buf, size_t size
                                    netdev->xendev.dom,
                                    rxreq.gref, PROT_WRITE);
     if (page == NULL) {
-        xen_be_printf(&netdev->xendev, 0, "error: rx gref dereference failed (%d)\n",
+        xen_pv_printf(&netdev->xendev, 0,
+                      "error: rx gref dereference failed (%d)\n",
                       rxreq.gref);
         net_rx_response(netdev, &rxreq, NETIF_RSP_ERROR, 0, 0, 0);
         return -1;
@@ -328,7 +333,8 @@ static int net_connect(struct XenDevice *xendev)
         rx_copy = 0;
     }
     if (rx_copy == 0) {
-        xen_be_printf(&netdev->xendev, 0, "frontend doesn't support rx-copy.\n");
+        xen_pv_printf(&netdev->xendev, 0,
+                      "frontend doesn't support rx-copy.\n");
         return -1;
     }
 
@@ -353,7 +359,7 @@ static int net_connect(struct XenDevice *xendev)
 
     xen_be_bind_evtchn(&netdev->xendev);
 
-    xen_be_printf(&netdev->xendev, 1, "ok: tx-ring-ref %d, rx-ring-ref %d, "
+    xen_pv_printf(&netdev->xendev, 1, "ok: tx-ring-ref %d, rx-ring-ref %d, "
                   "remote port %d, local port %d\n",
                   netdev->tx_ring_ref, netdev->rx_ring_ref,
                   netdev->xendev.remote_port, netdev->xendev.local_port);
@@ -366,7 +372,7 @@ static void net_disconnect(struct XenDevice *xendev)
 {
     struct XenNetDev *netdev = container_of(xendev, struct XenNetDev, xendev);
 
-    xen_be_unbind_evtchn(&netdev->xendev);
+    xen_pv_unbind_evtchn(&netdev->xendev);
 
     if (netdev->txs) {
         xengnttab_unmap(netdev->xendev.gnttabdev, netdev->txs, 1);
diff --git a/hw/nvram/fw_cfg.c b/hw/nvram/fw_cfg.c
index 92aa563929..1f0c3e9910 100644
--- a/hw/nvram/fw_cfg.c
+++ b/hw/nvram/fw_cfg.c
@@ -29,7 +29,6 @@
 #include "hw/isa/isa.h"
 #include "hw/nvram/fw_cfg.h"
 #include "hw/sysbus.h"
-#include "hw/boards.h"
 #include "trace.h"
 #include "qemu/error-report.h"
 #include "qemu/config-file.h"
diff --git a/hw/pci-bridge/pci_expander_bridge.c b/hw/pci-bridge/pci_expander_bridge.c
index 1cc598f7e9..6ac187fa32 100644
--- a/hw/pci-bridge/pci_expander_bridge.c
+++ b/hw/pci-bridge/pci_expander_bridge.c
@@ -15,7 +15,6 @@
 #include "hw/pci/pci.h"
 #include "hw/pci/pci_bus.h"
 #include "hw/pci/pci_host.h"
-#include "hw/pci/pci_bus.h"
 #include "hw/pci/pci_bridge.h"
 #include "hw/i386/pc.h"
 #include "qemu/range.h"
diff --git a/hw/ppc/ppc405_boards.c b/hw/ppc/ppc405_boards.c
index 4b2f07aecb..d01798f245 100644
--- a/hw/ppc/ppc405_boards.c
+++ b/hw/ppc/ppc405_boards.c
@@ -37,7 +37,6 @@
 #include "qemu/log.h"
 #include "qemu/error-report.h"
 #include "hw/loader.h"
-#include "sysemu/block-backend.h"
 #include "sysemu/blockdev.h"
 #include "exec/address-spaces.h"
 
diff --git a/hw/ppc/ppce500_spin.c b/hw/ppc/ppce500_spin.c
index 8e16f651ea..cf958a9e00 100644
--- a/hw/ppc/ppce500_spin.c
+++ b/hw/ppc/ppce500_spin.c
@@ -84,11 +84,11 @@ static void mmubooke_create_initial_mapping(CPUPPCState *env,
     env->tlb_dirty = true;
 }
 
-static void spin_kick(CPUState *cs, void *data)
+static void spin_kick(CPUState *cs, run_on_cpu_data data)
 {
     PowerPCCPU *cpu = POWERPC_CPU(cs);
     CPUPPCState *env = &cpu->env;
-    SpinInfo *curspin = data;
+    SpinInfo *curspin = data.host_ptr;
     hwaddr map_size = 64 * 1024 * 1024;
     hwaddr map_start;
 
@@ -147,7 +147,7 @@ static void spin_write(void *opaque, hwaddr addr, uint64_t value,
 
     if (!(ldq_p(&curspin->addr) & 1)) {
         /* run CPU */
-        run_on_cpu(cpu, spin_kick, curspin);
+        run_on_cpu(cpu, spin_kick, RUN_ON_CPU_HOST_PTR(curspin));
     }
 }
 
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index c8e29212cb..0cbab24c91 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -37,7 +37,6 @@
 #include "sysemu/block-backend.h"
 #include "sysemu/cpus.h"
 #include "sysemu/kvm.h"
-#include "sysemu/device_tree.h"
 #include "kvm_ppc.h"
 #include "migration/migration.h"
 #include "mmu-hash64.h"
@@ -2187,7 +2186,7 @@ static void spapr_machine_finalizefn(Object *obj)
     g_free(spapr->kvm_type);
 }
 
-static void ppc_cpu_do_nmi_on_cpu(CPUState *cs, void *arg)
+static void ppc_cpu_do_nmi_on_cpu(CPUState *cs, run_on_cpu_data arg)
 {
     cpu_synchronize_state(cs);
     ppc_cpu_do_system_reset(cs);
@@ -2198,7 +2197,7 @@ static void spapr_nmi(NMIState *n, int cpu_index, Error **errp)
     CPUState *cs;
 
     CPU_FOREACH(cs) {
-        async_run_on_cpu(cs, ppc_cpu_do_nmi_on_cpu, NULL);
+        async_run_on_cpu(cs, ppc_cpu_do_nmi_on_cpu, RUN_ON_CPU_NULL);
     }
 }
 
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index 7c46d4625b..9a9bedf1bd 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -19,9 +19,9 @@ struct SPRSyncState {
     target_ulong mask;
 };
 
-static void do_spr_sync(CPUState *cs, void *arg)
+static void do_spr_sync(CPUState *cs, run_on_cpu_data arg)
 {
-    struct SPRSyncState *s = arg;
+    struct SPRSyncState *s = arg.host_ptr;
     PowerPCCPU *cpu = POWERPC_CPU(cs);
     CPUPPCState *env = &cpu->env;
 
@@ -38,7 +38,7 @@ static void set_spr(CPUState *cs, int spr, target_ulong value,
         .value = value,
         .mask = mask
     };
-    run_on_cpu(cs, do_spr_sync, &s);
+    run_on_cpu(cs, do_spr_sync, RUN_ON_CPU_HOST_PTR(&s));
 }
 
 static bool has_spr(PowerPCCPU *cpu, int spr)
@@ -886,10 +886,10 @@ typedef struct {
     Error *err;
 } SetCompatState;
 
-static void do_set_compat(CPUState *cs, void *arg)
+static void do_set_compat(CPUState *cs, run_on_cpu_data arg)
 {
     PowerPCCPU *cpu = POWERPC_CPU(cs);
-    SetCompatState *s = arg;
+    SetCompatState *s = arg.host_ptr;
 
     cpu_synchronize_state(cs);
     ppc_set_compat(cpu, s->cpu_version, &s->err);
@@ -990,7 +990,7 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu_,
                 .err = NULL,
             };
 
-            run_on_cpu(cs, do_set_compat, &s);
+            run_on_cpu(cs, do_set_compat, RUN_ON_CPU_HOST_PTR(&s));
 
             if (s.err) {
                 error_report_err(s.err);
diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
index b7f8bca1fd..63f6248f1d 100644
--- a/hw/s390x/s390-pci-bus.c
+++ b/hw/s390x/s390-pci-bus.c
@@ -463,7 +463,6 @@ static void s390_msi_ctrl_write(void *opaque, hwaddr addr, uint64_t data,
                                 unsigned int size)
 {
     S390PCIBusDevice *pbdev = opaque;
-    uint32_t io_int_word;
     uint32_t idx = data >> ZPCI_MSI_VEC_BITS;
     uint32_t vec = data & ZPCI_MSI_VEC_MASK;
     uint64_t ind_bit;
@@ -489,8 +488,7 @@ static void s390_msi_ctrl_write(void *opaque, hwaddr addr, uint64_t data,
                    0x80 >> ((ind_bit + vec) % 8));
     if (!set_ind_atomic(pbdev->routes.adapter.summary_addr + sum_bit / 8,
                                        0x80 >> (sum_bit % 8))) {
-        io_int_word = (pbdev->isc << 27) | IO_INT_WORD_AI;
-        s390_io_interrupt(0, 0, 0, io_int_word);
+        css_adapter_interrupt(pbdev->isc);
     }
 }
 
@@ -809,17 +807,11 @@ static uint32_t s390_pci_generate_fid(Error **errp)
 {
     uint32_t fid = 0;
 
-    while (fid <= ZPCI_MAX_FID) {
+    do {
         if (!s390_pci_find_dev_by_fid(fid)) {
             return fid;
         }
-
-        if (fid == ZPCI_MAX_FID) {
-            break;
-        }
-
-        fid++;
-    }
+    } while (fid++ != ZPCI_MAX_FID);
 
     error_setg(errp, "no free fid could be found");
     return 0;
diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c
index 80a51049ca..0864d9be12 100644
--- a/hw/s390x/s390-pci-inst.c
+++ b/hw/s390x/s390-pci-inst.c
@@ -316,6 +316,7 @@ int pcilg_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2)
     uint64_t offset;
     uint64_t data;
     MemoryRegion *mr;
+    MemTxResult result;
     uint8_t len;
     uint32_t fh;
     uint8_t pcias;
@@ -365,8 +366,12 @@ int pcilg_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2)
             return 0;
         }
         mr = pbdev->pdev->io_regions[pcias].memory;
-        memory_region_dispatch_read(mr, offset, &data, len,
-                                    MEMTXATTRS_UNSPECIFIED);
+        result = memory_region_dispatch_read(mr, offset, &data, len,
+                                             MEMTXATTRS_UNSPECIFIED);
+        if (result != MEMTX_OK) {
+            program_interrupt(env, PGM_OPERAND, 4);
+            return 0;
+        }
     } else if (pcias == 15) {
         if ((4 - (offset & 0x3)) < len) {
             program_interrupt(env, PGM_OPERAND, 4);
@@ -444,6 +449,7 @@ int pcistg_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2)
     uint64_t offset, data;
     S390PCIBusDevice *pbdev;
     MemoryRegion *mr;
+    MemTxResult result;
     uint8_t len;
     uint32_t fh;
     uint8_t pcias;
@@ -502,8 +508,12 @@ int pcistg_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2)
             mr = pbdev->pdev->io_regions[pcias].memory;
         }
 
-        memory_region_dispatch_write(mr, offset, data, len,
+        result = memory_region_dispatch_write(mr, offset, data, len,
                                      MEMTXATTRS_UNSPECIFIED);
+        if (result != MEMTX_OK) {
+            program_interrupt(env, PGM_OPERAND, 4);
+            return 0;
+        }
     } else if (pcias == 15) {
         if ((4 - (offset & 0x3)) < len) {
             program_interrupt(env, PGM_OPERAND, 4);
@@ -633,6 +643,7 @@ int pcistb_service_call(S390CPU *cpu, uint8_t r1, uint8_t r3, uint64_t gaddr,
     CPUS390XState *env = &cpu->env;
     S390PCIBusDevice *pbdev;
     MemoryRegion *mr;
+    MemTxResult result;
     int i;
     uint32_t fh;
     uint8_t pcias;
@@ -690,7 +701,7 @@ int pcistb_service_call(S390CPU *cpu, uint8_t r1, uint8_t r3, uint64_t gaddr,
 
     mr = pbdev->pdev->io_regions[pcias].memory;
     if (!memory_region_access_valid(mr, env->regs[r3], len, true)) {
-        program_interrupt(env, PGM_ADDRESSING, 6);
+        program_interrupt(env, PGM_OPERAND, 6);
         return 0;
     }
 
@@ -699,9 +710,13 @@ int pcistb_service_call(S390CPU *cpu, uint8_t r1, uint8_t r3, uint64_t gaddr,
     }
 
     for (i = 0; i < len / 8; i++) {
-        memory_region_dispatch_write(mr, env->regs[r3] + i * 8,
+        result = memory_region_dispatch_write(mr, env->regs[r3] + i * 8,
                                      ldq_p(buffer + i * 8), 8,
                                      MEMTXATTRS_UNSPECIFIED);
+        if (result != MEMTX_OK) {
+            program_interrupt(env, PGM_OPERAND, 6);
+            return 0;
+        }
     }
 
     setcc(cpu, ZPCI_PCI_LS_OK);
diff --git a/hw/scsi/virtio-scsi-dataplane.c b/hw/scsi/virtio-scsi-dataplane.c
index aa6be541ec..f2ea29dbc3 100644
--- a/hw/scsi/virtio-scsi-dataplane.c
+++ b/hw/scsi/virtio-scsi-dataplane.c
@@ -201,13 +201,11 @@ void virtio_scsi_dataplane_stop(VirtIODevice *vdev)
     s->dataplane_stopping = true;
 
     aio_context_acquire(s->ctx);
-
     virtio_scsi_clear_aio(s);
+    aio_context_release(s->ctx);
 
     blk_drain_all(); /* ensure there are no in-flight requests */
 
-    aio_context_release(s->ctx);
-
     for (i = 0; i < vs->conf.num_queues + 2; i++) {
         virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, false);
     }
diff --git a/hw/timer/grlib_gptimer.c b/hw/timer/grlib_gptimer.c
index 712d1aece5..4ed96e970a 100644
--- a/hw/timer/grlib_gptimer.c
+++ b/hw/timer/grlib_gptimer.c
@@ -26,7 +26,6 @@
 #include "hw/sysbus.h"
 #include "qemu/timer.h"
 #include "hw/ptimer.h"
-#include "qemu/timer.h"
 #include "qemu/main-loop.h"
 
 #include "trace.h"
diff --git a/hw/tpm/tpm_passthrough.c b/hw/tpm/tpm_passthrough.c
index e88c0d20bc..9234eb3459 100644
--- a/hw/tpm/tpm_passthrough.c
+++ b/hw/tpm/tpm_passthrough.c
@@ -165,8 +165,7 @@ static int tpm_passthrough_unix_tx_bufs(TPMPassthruState *tpm_pt,
 
     ret = tpm_passthrough_unix_write(tpm_pt->tpm_fd, in, in_len);
     if (ret != in_len) {
-        if (!tpm_pt->tpm_op_canceled ||
-            (tpm_pt->tpm_op_canceled && errno != ECANCELED)) {
+        if (!tpm_pt->tpm_op_canceled || errno != ECANCELED) {
             error_report("tpm_passthrough: error while transmitting data "
                          "to TPM: %s (%i)",
                          strerror(errno), errno);
@@ -178,8 +177,7 @@ static int tpm_passthrough_unix_tx_bufs(TPMPassthruState *tpm_pt,
 
     ret = tpm_passthrough_unix_read(tpm_pt->tpm_fd, out, out_len);
     if (ret < 0) {
-        if (!tpm_pt->tpm_op_canceled ||
-            (tpm_pt->tpm_op_canceled && errno != ECANCELED)) {
+        if (!tpm_pt->tpm_op_canceled || errno != ECANCELED) {
             error_report("tpm_passthrough: error while reading data from "
                          "TPM: %s (%i)",
                          strerror(errno), errno);
diff --git a/hw/tpm/tpm_tis.c b/hw/tpm/tpm_tis.c
index 381e7266ea..a6440fef91 100644
--- a/hw/tpm/tpm_tis.c
+++ b/hw/tpm/tpm_tis.c
@@ -34,7 +34,6 @@
 #include "qapi/error.h"
 #include "qemu-common.h"
 #include "qemu/main-loop.h"
-#include "sysemu/tpm_backend.h"
 
 #define DEBUG_TIS 0
 
diff --git a/hw/unicore32/puv3.c b/hw/unicore32/puv3.c
index 31cd171016..032078fd3e 100644
--- a/hw/unicore32/puv3.c
+++ b/hw/unicore32/puv3.c
@@ -13,7 +13,6 @@
 #include "qapi/error.h"
 #include "qemu-common.h"
 #include "cpu.h"
-#include "qemu-common.h"
 #include "ui/console.h"
 #include "elf.h"
 #include "exec/address-spaces.h"
diff --git a/hw/usb/ccid-card-emulated.c b/hw/usb/ccid-card-emulated.c
index 3213f9f8af..eceb5f3ee2 100644
--- a/hw/usb/ccid-card-emulated.c
+++ b/hw/usb/ccid-card-emulated.c
@@ -547,7 +547,7 @@ static int emulated_initfn(CCIDCardState *base)
     return 0;
 }
 
-static int emulated_exitfn(CCIDCardState *base)
+static void emulated_exitfn(CCIDCardState *base)
 {
     EmulatedState *card = EMULATED_CCID_CARD(base);
     VEvent *vevent = vevent_new(VEVENT_LAST, NULL, NULL);
@@ -564,7 +564,6 @@ static int emulated_exitfn(CCIDCardState *base)
     qemu_mutex_destroy(&card->handle_apdu_mutex);
     qemu_mutex_destroy(&card->vreader_mutex);
     qemu_mutex_destroy(&card->event_list_mutex);
-    return 0;
 }
 
 static Property emulated_card_properties[] = {
diff --git a/hw/usb/ccid-card-passthru.c b/hw/usb/ccid-card-passthru.c
index 325129a2f6..88cb6d8978 100644
--- a/hw/usb/ccid-card-passthru.c
+++ b/hw/usb/ccid-card-passthru.c
@@ -365,11 +365,6 @@ static int passthru_initfn(CCIDCardState *base)
     return 0;
 }
 
-static int passthru_exitfn(CCIDCardState *base)
-{
-    return 0;
-}
-
 static VMStateDescription passthru_vmstate = {
     .name = "ccid-card-passthru",
     .version_id = 1,
@@ -396,7 +391,6 @@ static void passthru_class_initfn(ObjectClass *klass, void *data)
     CCIDCardClass *cc = CCID_CARD_CLASS(klass);
 
     cc->initfn = passthru_initfn;
-    cc->exitfn = passthru_exitfn;
     cc->get_atr = passthru_get_atr;
     cc->apdu_from_guest = passthru_apdu_from_guest;
     set_bit(DEVICE_CATEGORY_INPUT, dc->categories);
diff --git a/hw/usb/ccid.h b/hw/usb/ccid.h
index 9334da8acd..1f070116d6 100644
--- a/hw/usb/ccid.h
+++ b/hw/usb/ccid.h
@@ -33,7 +33,7 @@ typedef struct CCIDCardClass {
     void (*apdu_from_guest)(CCIDCardState *card,
                             const uint8_t *apdu,
                             uint32_t len);
-    int (*exitfn)(CCIDCardState *card);
+    void (*exitfn)(CCIDCardState *card);
     int (*initfn)(CCIDCardState *card);
 } CCIDCardClass;
 
diff --git a/hw/usb/dev-mtp.c b/hw/usb/dev-mtp.c
index 58d95fffb2..9cb0f50750 100644
--- a/hw/usb/dev-mtp.c
+++ b/hw/usb/dev-mtp.c
@@ -17,7 +17,6 @@
 #include <sys/statvfs.h>
 #ifdef CONFIG_INOTIFY1
 #include <sys/inotify.h>
-#include "qapi/error.h"
 #include "qemu/main-loop.h"
 #endif
 
diff --git a/hw/usb/dev-smartcard-reader.c b/hw/usb/dev-smartcard-reader.c
index af4b851356..89e11b68c4 100644
--- a/hw/usb/dev-smartcard-reader.c
+++ b/hw/usb/dev-smartcard-reader.c
@@ -508,14 +508,14 @@ static void ccid_card_apdu_from_guest(CCIDCardState *card,
     }
 }
 
-static int ccid_card_exitfn(CCIDCardState *card)
+static void ccid_card_exitfn(CCIDCardState *card)
 {
     CCIDCardClass *cc = CCID_CARD_GET_CLASS(card);
 
     if (cc->exitfn) {
-        return cc->exitfn(card);
+        cc->exitfn(card);
     }
-    return 0;
+
 }
 
 static int ccid_card_initfn(CCIDCardState *card)
@@ -1279,7 +1279,6 @@ void ccid_card_card_inserted(CCIDCardState *card)
 
 static int ccid_card_exit(DeviceState *qdev)
 {
-    int ret = 0;
     CCIDCardState *card = CCID_CARD(qdev);
     USBDevice *dev = USB_DEVICE(qdev->parent_bus->parent);
     USBCCIDState *s = USB_CCID_DEV(dev);
@@ -1287,9 +1286,9 @@ static int ccid_card_exit(DeviceState *qdev)
     if (ccid_card_inserted(s)) {
         ccid_card_card_removed(card);
     }
-    ret = ccid_card_exitfn(card);
+    ccid_card_exitfn(card);
     s->card = NULL;
-    return ret;
+    return 0;
 }
 
 static int ccid_card_init(DeviceState *qdev)
diff --git a/hw/usb/xen-usb.c b/hw/usb/xen-usb.c
index de2ebd6210..1b3c2fb3c7 100644
--- a/hw/usb/xen-usb.c
+++ b/hw/usb/xen-usb.c
@@ -47,7 +47,7 @@
         struct timeval tv;                                          \
                                                                     \
         gettimeofday(&tv, NULL);                                    \
-        xen_be_printf(xendev, lvl, "%8ld.%06ld xen-usb(%s):" fmt,   \
+        xen_pv_printf(xendev, lvl, "%8ld.%06ld xen-usb(%s):" fmt,   \
                       tv.tv_sec, tv.tv_usec, __func__, ##args);     \
     }
 #define TR_BUS(xendev, fmt, args...) TR(xendev, 2, fmt, ##args)
@@ -153,7 +153,7 @@ static int usbback_gnttab_map(struct usbback_req *usbback_req)
     }
 
     if (nr_segs > USBIF_MAX_SEGMENTS_PER_REQUEST) {
-        xen_be_printf(xendev, 0, "bad number of segments in request (%d)\n",
+        xen_pv_printf(xendev, 0, "bad number of segments in request (%d)\n",
                       nr_segs);
         return -EINVAL;
     }
@@ -161,7 +161,7 @@ static int usbback_gnttab_map(struct usbback_req *usbback_req)
     for (i = 0; i < nr_segs; i++) {
         if ((unsigned)usbback_req->req.seg[i].offset +
             (unsigned)usbback_req->req.seg[i].length > XC_PAGE_SIZE) {
-            xen_be_printf(xendev, 0, "segment crosses page boundary\n");
+            xen_pv_printf(xendev, 0, "segment crosses page boundary\n");
             return -EINVAL;
         }
     }
@@ -199,7 +199,7 @@ static int usbback_gnttab_map(struct usbback_req *usbback_req)
      */
 
     if (!usbback_req->nr_extra_segs) {
-        xen_be_printf(xendev, 0, "iso request without descriptor segments\n");
+        xen_pv_printf(xendev, 0, "iso request without descriptor segments\n");
         return -EINVAL;
     }
 
@@ -314,7 +314,7 @@ static void usbback_do_response(struct usbback_req *usbback_req, int32_t status,
         RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&usbif->urb_ring, notify);
 
         if (notify) {
-            xen_be_send_notify(xendev);
+            xen_pv_send_notify(xendev);
         }
     }
 
@@ -551,14 +551,14 @@ static void usbback_dispatch(struct usbback_req *usbback_req)
 
     ret = usbback_init_packet(usbback_req);
     if (ret) {
-        xen_be_printf(&usbif->xendev, 0, "invalid request\n");
+        xen_pv_printf(&usbif->xendev, 0, "invalid request\n");
         ret = -ESHUTDOWN;
         goto fail_free_urb;
     }
 
     ret = usbback_gnttab_map(usbback_req);
     if (ret) {
-        xen_be_printf(&usbif->xendev, 0, "invalid buffer, ret=%d\n", ret);
+        xen_pv_printf(&usbif->xendev, 0, "invalid buffer, ret=%d\n", ret);
         ret = -ESHUTDOWN;
         goto fail_free_urb;
     }
@@ -590,7 +590,7 @@ static void usbback_hotplug_notify(struct usbback_info *usbif)
 
     /* Check for full ring. */
     if ((RING_SIZE(ring) - ring->rsp_prod_pvt - ring->req_cons) == 0) {
-        xen_be_send_notify(&usbif->xendev);
+        xen_pv_send_notify(&usbif->xendev);
         return;
     }
 
@@ -609,7 +609,7 @@ static void usbback_hotplug_notify(struct usbback_info *usbif)
     RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(ring, notify);
 
     if (notify) {
-        xen_be_send_notify(&usbif->xendev);
+        xen_pv_send_notify(&usbif->xendev);
     }
 
     TR_BUS(&usbif->xendev, "hotplug port %d speed %d\n", usb_hp->port,
@@ -646,7 +646,7 @@ static void usbback_bh(void *opaque)
 
     if (RING_REQUEST_PROD_OVERFLOW(urb_ring, rp)) {
         rc = urb_ring->rsp_prod_pvt;
-        xen_be_printf(&usbif->xendev, 0, "domU provided bogus ring requests "
+        xen_pv_printf(&usbif->xendev, 0, "domU provided bogus ring requests "
                       "(%#x - %#x = %u). Halting ring processing.\n",
                       rp, rc, rp - rc);
         usbif->ring_error = true;
@@ -744,7 +744,7 @@ static void usbback_portid_add(struct usbback_info *usbif, unsigned port,
 
     portname = strchr(busid, '-');
     if (!portname) {
-        xen_be_printf(&usbif->xendev, 0, "device %s illegal specification\n",
+        xen_pv_printf(&usbif->xendev, 0, "device %s illegal specification\n",
                       busid);
         return;
     }
@@ -783,7 +783,7 @@ static void usbback_portid_add(struct usbback_info *usbif, unsigned port,
         break;
     }
     if (speed == USBIF_SPEED_NONE) {
-        xen_be_printf(&usbif->xendev, 0, "device %s wrong speed\n", busid);
+        xen_pv_printf(&usbif->xendev, 0, "device %s wrong speed\n", busid);
         object_unparent(OBJECT(usbif->ports[port - 1].dev));
         usbif->ports[port - 1].dev = NULL;
         return;
@@ -800,7 +800,7 @@ static void usbback_portid_add(struct usbback_info *usbif, unsigned port,
 err:
     QDECREF(qdict);
     snprintf(p->path, sizeof(p->path), "%d", 99);
-    xen_be_printf(&usbif->xendev, 0, "device %s could not be opened\n", busid);
+    xen_pv_printf(&usbif->xendev, 0, "device %s could not be opened\n", busid);
 }
 
 static void usbback_process_port(struct usbback_info *usbif, unsigned port)
@@ -811,7 +811,7 @@ static void usbback_process_port(struct usbback_info *usbif, unsigned port)
     snprintf(node, sizeof(node), "port/%d", port);
     busid = xenstore_read_be_str(&usbif->xendev, node);
     if (busid == NULL) {
-        xen_be_printf(&usbif->xendev, 0, "xenstore_read %s failed\n", node);
+        xen_pv_printf(&usbif->xendev, 0, "xenstore_read %s failed\n", node);
         return;
     }
 
@@ -834,7 +834,7 @@ static void usbback_disconnect(struct XenDevice *xendev)
 
     usbif = container_of(xendev, struct usbback_info, xendev);
 
-    xen_be_unbind_evtchn(xendev);
+    xen_pv_unbind_evtchn(xendev);
 
     if (usbif->urb_sring) {
         xengnttab_unmap(xendev->gnttabdev, usbif->urb_sring, 1);
@@ -868,15 +868,15 @@ static int usbback_connect(struct XenDevice *xendev)
     usbif = container_of(xendev, struct usbback_info, xendev);
 
     if (xenstore_read_fe_int(xendev, "urb-ring-ref", &urb_ring_ref)) {
-        xen_be_printf(xendev, 0, "error reading urb-ring-ref\n");
+        xen_pv_printf(xendev, 0, "error reading urb-ring-ref\n");
         return -1;
     }
     if (xenstore_read_fe_int(xendev, "conn-ring-ref", &conn_ring_ref)) {
-        xen_be_printf(xendev, 0, "error reading conn-ring-ref\n");
+        xen_pv_printf(xendev, 0, "error reading conn-ring-ref\n");
         return -1;
     }
     if (xenstore_read_fe_int(xendev, "event-channel", &xendev->remote_port)) {
-        xen_be_printf(xendev, 0, "error reading event-channel\n");
+        xen_pv_printf(xendev, 0, "error reading event-channel\n");
         return -1;
     }
 
@@ -887,7 +887,7 @@ static int usbback_connect(struct XenDevice *xendev)
                                                 conn_ring_ref,
                                                 PROT_READ | PROT_WRITE);
     if (!usbif->urb_sring || !usbif->conn_sring) {
-        xen_be_printf(xendev, 0, "error mapping rings\n");
+        xen_pv_printf(xendev, 0, "error mapping rings\n");
         usbback_disconnect(xendev);
         return -1;
     }
@@ -899,7 +899,7 @@ static int usbback_connect(struct XenDevice *xendev)
 
     xen_be_bind_evtchn(xendev);
 
-    xen_be_printf(xendev, 1, "urb-ring-ref %d, conn-ring-ref %d, "
+    xen_pv_printf(xendev, 1, "urb-ring-ref %d, conn-ring-ref %d, "
                   "remote port %d, local port %d\n", urb_ring_ref,
                   conn_ring_ref, xendev->remote_port, xendev->local_port);
 
@@ -935,12 +935,12 @@ static int usbback_init(struct XenDevice *xendev)
 
     if (xenstore_read_be_int(xendev, "num-ports", &usbif->num_ports) ||
         usbif->num_ports < 1 || usbif->num_ports > USBBACK_MAXPORTS) {
-        xen_be_printf(xendev, 0, "num-ports not readable or out of bounds\n");
+        xen_pv_printf(xendev, 0, "num-ports not readable or out of bounds\n");
         return -1;
     }
     if (xenstore_read_be_int(xendev, "usb-ver", &usbif->usb_ver) ||
         (usbif->usb_ver != USB_VER_USB11 && usbif->usb_ver != USB_VER_USB20)) {
-        xen_be_printf(xendev, 0, "usb-ver not readable or out of bounds\n");
+        xen_pv_printf(xendev, 0, "usb-ver not readable or out of bounds\n");
         return -1;
     }
 
@@ -1028,7 +1028,7 @@ static void usbback_alloc(struct XenDevice *xendev)
     /* max_grants: for each request and for the rings (request and connect). */
     max_grants = USBIF_MAX_SEGMENTS_PER_REQUEST * USB_URB_RING_SIZE + 2;
     if (xengnttab_set_max_grants(xendev->gnttabdev, max_grants) < 0) {
-        xen_be_printf(xendev, 0, "xengnttab_set_max_grants failed: %s\n",
+        xen_pv_printf(xendev, 0, "xengnttab_set_max_grants failed: %s\n",
                       strerror(errno));
     }
 }
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 9505fb3040..801578b4b9 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -610,16 +610,16 @@ vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id)
     return NULL;
 }
 
-static void vfio_setup_region_sparse_mmaps(VFIORegion *region,
-                                           struct vfio_region_info *info)
+static int vfio_setup_region_sparse_mmaps(VFIORegion *region,
+                                          struct vfio_region_info *info)
 {
     struct vfio_info_cap_header *hdr;
     struct vfio_region_info_cap_sparse_mmap *sparse;
-    int i;
+    int i, j;
 
     hdr = vfio_get_region_info_cap(info, VFIO_REGION_INFO_CAP_SPARSE_MMAP);
     if (!hdr) {
-        return;
+        return -ENODEV;
     }
 
     sparse = container_of(hdr, struct vfio_region_info_cap_sparse_mmap, header);
@@ -627,16 +627,24 @@ static void vfio_setup_region_sparse_mmaps(VFIORegion *region,
     trace_vfio_region_sparse_mmap_header(region->vbasedev->name,
                                          region->nr, sparse->nr_areas);
 
-    region->nr_mmaps = sparse->nr_areas;
-    region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
+    region->mmaps = g_new0(VFIOMmap, sparse->nr_areas);
 
-    for (i = 0; i < region->nr_mmaps; i++) {
-        region->mmaps[i].offset = sparse->areas[i].offset;
-        region->mmaps[i].size = sparse->areas[i].size;
-        trace_vfio_region_sparse_mmap_entry(i, region->mmaps[i].offset,
-                                            region->mmaps[i].offset +
-                                            region->mmaps[i].size);
+    for (i = 0, j = 0; i < sparse->nr_areas; i++) {
+        trace_vfio_region_sparse_mmap_entry(i, sparse->areas[i].offset,
+                                            sparse->areas[i].offset +
+                                            sparse->areas[i].size);
+
+        if (sparse->areas[i].size) {
+            region->mmaps[j].offset = sparse->areas[i].offset;
+            region->mmaps[j].size = sparse->areas[i].size;
+            j++;
+        }
     }
+
+    region->nr_mmaps = j;
+    region->mmaps = g_realloc(region->mmaps, j * sizeof(VFIOMmap));
+
+    return 0;
 }
 
 int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
@@ -662,12 +670,11 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
                               region, name, region->size);
 
         if (!vbasedev->no_mmap &&
-            region->flags & VFIO_REGION_INFO_FLAG_MMAP &&
-            !(region->size & ~qemu_real_host_page_mask)) {
+            region->flags & VFIO_REGION_INFO_FLAG_MMAP) {
 
-            vfio_setup_region_sparse_mmaps(region, info);
+            ret = vfio_setup_region_sparse_mmaps(region, info);
 
-            if (!region->nr_mmaps) {
+            if (ret) {
                 region->nr_mmaps = 1;
                 region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
                 region->mmaps[0].offset = 0;
@@ -724,12 +731,11 @@ int vfio_region_mmap(VFIORegion *region)
 
         name = g_strdup_printf("%s mmaps[%d]",
                                memory_region_name(region->mem), i);
-        memory_region_init_ram_ptr(&region->mmaps[i].mem,
-                                   memory_region_owner(region->mem),
-                                   name, region->mmaps[i].size,
-                                   region->mmaps[i].mmap);
+        memory_region_init_ram_device_ptr(&region->mmaps[i].mem,
+                                          memory_region_owner(region->mem),
+                                          name, region->mmaps[i].size,
+                                          region->mmaps[i].mmap);
         g_free(name);
-        memory_region_set_skip_dump(&region->mmaps[i].mem);
         memory_region_add_subregion(region->mem, region->mmaps[i].offset,
                                     &region->mmaps[i].mem);
 
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 65d30fdef9..d7dbe0e3e0 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1071,6 +1071,55 @@ static const MemoryRegionOps vfio_vga_ops = {
 };
 
 /*
+ * Expand memory region of sub-page(size < PAGE_SIZE) MMIO BAR to page
+ * size if the BAR is in an exclusive page in host so that we could map
+ * this BAR to guest. But this sub-page BAR may not occupy an exclusive
+ * page in guest. So we should set the priority of the expanded memory
+ * region to zero in case of overlap with BARs which share the same page
+ * with the sub-page BAR in guest. Besides, we should also recover the
+ * size of this sub-page BAR when its base address is changed in guest
+ * and not page aligned any more.
+ */
+static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar)
+{
+    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
+    VFIORegion *region = &vdev->bars[bar].region;
+    MemoryRegion *mmap_mr, *mr;
+    PCIIORegion *r;
+    pcibus_t bar_addr;
+    uint64_t size = region->size;
+
+    /* Make sure that the whole region is allowed to be mmapped */
+    if (region->nr_mmaps != 1 || !region->mmaps[0].mmap ||
+        region->mmaps[0].size != region->size) {
+        return;
+    }
+
+    r = &pdev->io_regions[bar];
+    bar_addr = r->addr;
+    mr = region->mem;
+    mmap_mr = &region->mmaps[0].mem;
+
+    /* If BAR is mapped and page aligned, update to fill PAGE_SIZE */
+    if (bar_addr != PCI_BAR_UNMAPPED &&
+        !(bar_addr & ~qemu_real_host_page_mask)) {
+        size = qemu_real_host_page_size;
+    }
+
+    memory_region_transaction_begin();
+
+    memory_region_set_size(mr, size);
+    memory_region_set_size(mmap_mr, size);
+    if (size != region->size && memory_region_is_mapped(mr)) {
+        memory_region_del_subregion(r->address_space, mr);
+        memory_region_add_subregion_overlap(r->address_space,
+                                            bar_addr, mr, 0);
+    }
+
+    memory_region_transaction_commit();
+}
+
+/*
  * PCI config space
  */
 uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
@@ -1153,6 +1202,24 @@ void vfio_pci_write_config(PCIDevice *pdev,
         } else if (was_enabled && !is_enabled) {
             vfio_msix_disable(vdev);
         }
+    } else if (ranges_overlap(addr, len, PCI_BASE_ADDRESS_0, 24) ||
+        range_covers_byte(addr, len, PCI_COMMAND)) {
+        pcibus_t old_addr[PCI_NUM_REGIONS - 1];
+        int bar;
+
+        for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
+            old_addr[bar] = pdev->io_regions[bar].addr;
+        }
+
+        pci_default_write_config(pdev, addr, val, len);
+
+        for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
+            if (old_addr[bar] != pdev->io_regions[bar].addr &&
+                pdev->io_regions[bar].size > 0 &&
+                pdev->io_regions[bar].size < qemu_real_host_page_size) {
+                vfio_sub_page_bar_update_mapping(pdev, bar);
+            }
+        }
     } else {
         /* Write everything to QEMU to keep emulated bits correct */
         pci_default_write_config(pdev, addr, val, len);
@@ -1922,11 +1989,23 @@ static void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
 static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
 {
     Error *err = NULL;
+    int nr;
 
     vfio_intx_enable(vdev, &err);
     if (err) {
         error_reportf_err(err, ERR_PREFIX, vdev->vbasedev.name);
     }
+
+    for (nr = 0; nr < PCI_NUM_REGIONS - 1; ++nr) {
+        off_t addr = vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr);
+        uint32_t val = 0;
+        uint32_t len = sizeof(val);
+
+        if (pwrite(vdev->vbasedev.fd, &val, len, addr) != len) {
+            error_report("%s(%s) reset bar %d failed: %m", __func__,
+                         vdev->vbasedev.name, nr);
+        }
+    }
 }
 
 static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
index 7443d348d9..4409bcc0d7 100644
--- a/hw/vfio/spapr.c
+++ b/hw/vfio/spapr.c
@@ -25,7 +25,7 @@ static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section)
     }
 
     return !memory_region_is_ram(section->mr) ||
-            memory_region_is_skip_dump(section->mr);
+            memory_region_is_ram_device(section->mr);
 }
 
 static void *vfio_prereg_gpa_to_vaddr(MemoryRegionSection *section, hwaddr gpa)
diff --git a/hw/xen/Makefile.objs b/hw/xen/Makefile.objs
index d3670940b7..591cdc229d 100644
--- a/hw/xen/Makefile.objs
+++ b/hw/xen/Makefile.objs
@@ -1,5 +1,5 @@
 # xen backend driver support
-common-obj-$(CONFIG_XEN_BACKEND) += xen_backend.o xen_devconfig.o
+common-obj-$(CONFIG_XEN_BACKEND) += xen_backend.o xen_devconfig.o xen_pvdev.o
 
 obj-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen-host-pci-device.o
 obj-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen_pt.o xen_pt_config_init.o xen_pt_graphics.o xen_pt_msi.o
diff --git a/hw/xen/xen_backend.c b/hw/xen/xen_backend.c
index 69a238817e..41ba5c585a 100644
--- a/hw/xen/xen_backend.c
+++ b/hw/xen/xen_backend.c
@@ -30,6 +30,7 @@
 #include "sysemu/char.h"
 #include "qemu/log.h"
 #include "hw/xen/xen_backend.h"
+#include "hw/xen/xen_pvdev.h"
 
 #include <xen/grant_table.h>
 
@@ -46,129 +47,7 @@ struct xs_handle *xenstore = NULL;
 const char *xen_protocol;
 
 /* private */
-struct xs_dirs {
-    char *xs_dir;
-    QTAILQ_ENTRY(xs_dirs) list;
-};
-static QTAILQ_HEAD(xs_dirs_head, xs_dirs) xs_cleanup =
-    QTAILQ_HEAD_INITIALIZER(xs_cleanup);
-
-static QTAILQ_HEAD(XenDeviceHead, XenDevice) xendevs = QTAILQ_HEAD_INITIALIZER(xendevs);
-static int debug = 0;
-
-/* ------------------------------------------------------------- */
-
-static void xenstore_cleanup_dir(char *dir)
-{
-    struct xs_dirs *d;
-
-    d = g_malloc(sizeof(*d));
-    d->xs_dir = dir;
-    QTAILQ_INSERT_TAIL(&xs_cleanup, d, list);
-}
-
-void xen_config_cleanup(void)
-{
-    struct xs_dirs *d;
-
-    QTAILQ_FOREACH(d, &xs_cleanup, list) {
-        xs_rm(xenstore, 0, d->xs_dir);
-    }
-}
-
-int xenstore_write_str(const char *base, const char *node, const char *val)
-{
-    char abspath[XEN_BUFSIZE];
-
-    snprintf(abspath, sizeof(abspath), "%s/%s", base, node);
-    if (!xs_write(xenstore, 0, abspath, val, strlen(val))) {
-        return -1;
-    }
-    return 0;
-}
-
-char *xenstore_read_str(const char *base, const char *node)
-{
-    char abspath[XEN_BUFSIZE];
-    unsigned int len;
-    char *str, *ret = NULL;
-
-    snprintf(abspath, sizeof(abspath), "%s/%s", base, node);
-    str = xs_read(xenstore, 0, abspath, &len);
-    if (str != NULL) {
-        /* move to qemu-allocated memory to make sure
-         * callers can savely g_free() stuff. */
-        ret = g_strdup(str);
-        free(str);
-    }
-    return ret;
-}
-
-int xenstore_mkdir(char *path, int p)
-{
-    struct xs_permissions perms[2] = {
-        {
-            .id    = 0, /* set owner: dom0 */
-        }, {
-            .id    = xen_domid,
-            .perms = p,
-        }
-    };
-
-    if (!xs_mkdir(xenstore, 0, path)) {
-        xen_be_printf(NULL, 0, "xs_mkdir %s: failed\n", path);
-        return -1;
-    }
-    xenstore_cleanup_dir(g_strdup(path));
-
-    if (!xs_set_permissions(xenstore, 0, path, perms, 2)) {
-        xen_be_printf(NULL, 0, "xs_set_permissions %s: failed\n", path);
-        return -1;
-    }
-    return 0;
-}
-
-int xenstore_write_int(const char *base, const char *node, int ival)
-{
-    char val[12];
-
-    snprintf(val, sizeof(val), "%d", ival);
-    return xenstore_write_str(base, node, val);
-}
-
-int xenstore_write_int64(const char *base, const char *node, int64_t ival)
-{
-    char val[21];
-
-    snprintf(val, sizeof(val), "%"PRId64, ival);
-    return xenstore_write_str(base, node, val);
-}
-
-int xenstore_read_int(const char *base, const char *node, int *ival)
-{
-    char *val;
-    int rc = -1;
-
-    val = xenstore_read_str(base, node);
-    if (val && 1 == sscanf(val, "%d", ival)) {
-        rc = 0;
-    }
-    g_free(val);
-    return rc;
-}
-
-int xenstore_read_uint64(const char *base, const char *node, uint64_t *uval)
-{
-    char *val;
-    int rc = -1;
-
-    val = xenstore_read_str(base, node);
-    if (val && 1 == sscanf(val, "%"SCNu64, uval)) {
-        rc = 0;
-    }
-    g_free(val);
-    return rc;
-}
+static int debug;
 
 int xenstore_write_be_str(struct XenDevice *xendev, const char *node, const char *val)
 {
@@ -205,27 +84,14 @@ int xenstore_read_fe_int(struct XenDevice *xendev, const char *node, int *ival)
     return xenstore_read_int(xendev->fe, node, ival);
 }
 
-int xenstore_read_fe_uint64(struct XenDevice *xendev, const char *node, uint64_t *uval)
+int xenstore_read_fe_uint64(struct XenDevice *xendev, const char *node,
+                            uint64_t *uval)
 {
     return xenstore_read_uint64(xendev->fe, node, uval);
 }
 
 /* ------------------------------------------------------------- */
 
-const char *xenbus_strstate(enum xenbus_state state)
-{
-    static const char *const name[] = {
-        [ XenbusStateUnknown      ] = "Unknown",
-        [ XenbusStateInitialising ] = "Initialising",
-        [ XenbusStateInitWait     ] = "InitWait",
-        [ XenbusStateInitialised  ] = "Initialised",
-        [ XenbusStateConnected    ] = "Connected",
-        [ XenbusStateClosing      ] = "Closing",
-        [ XenbusStateClosed       ] = "Closed",
-    };
-    return (state < ARRAY_SIZE(name)) ? name[state] : "INVALID";
-}
-
 int xen_be_set_state(struct XenDevice *xendev, enum xenbus_state state)
 {
     int rc;
@@ -234,33 +100,12 @@ int xen_be_set_state(struct XenDevice *xendev, enum xenbus_state state)
     if (rc < 0) {
         return rc;
     }
-    xen_be_printf(xendev, 1, "backend state: %s -> %s\n",
+    xen_pv_printf(xendev, 1, "backend state: %s -> %s\n",
                   xenbus_strstate(xendev->be_state), xenbus_strstate(state));
     xendev->be_state = state;
     return 0;
 }
 
-/* ------------------------------------------------------------- */
-
-struct XenDevice *xen_be_find_xendev(const char *type, int dom, int dev)
-{
-    struct XenDevice *xendev;
-
-    QTAILQ_FOREACH(xendev, &xendevs, next) {
-        if (xendev->dom != dom) {
-            continue;
-        }
-        if (xendev->dev != dev) {
-            continue;
-        }
-        if (strcmp(xendev->type, type) != 0) {
-            continue;
-        }
-        return xendev;
-    }
-    return NULL;
-}
-
 /*
  * get xen backend device, allocate a new one if it doesn't exist.
  */
@@ -269,7 +114,7 @@ static struct XenDevice *xen_be_get_xendev(const char *type, int dom, int dev,
 {
     struct XenDevice *xendev;
 
-    xendev = xen_be_find_xendev(type, dom, dev);
+    xendev = xen_pv_find_xendev(type, dom, dev);
     if (xendev) {
         return xendev;
     }
@@ -291,7 +136,7 @@ static struct XenDevice *xen_be_get_xendev(const char *type, int dom, int dev,
 
     xendev->evtchndev = xenevtchn_open(NULL, 0);
     if (xendev->evtchndev == NULL) {
-        xen_be_printf(NULL, 0, "can't open evtchn device\n");
+        xen_pv_printf(NULL, 0, "can't open evtchn device\n");
         g_free(xendev);
         return NULL;
     }
@@ -300,7 +145,7 @@ static struct XenDevice *xen_be_get_xendev(const char *type, int dom, int dev,
     if (ops->flags & DEVOPS_FLAG_NEED_GNTDEV) {
         xendev->gnttabdev = xengnttab_open(NULL, 0);
         if (xendev->gnttabdev == NULL) {
-            xen_be_printf(NULL, 0, "can't open gnttab device\n");
+            xen_pv_printf(NULL, 0, "can't open gnttab device\n");
             xenevtchn_close(xendev->evtchndev);
             g_free(xendev);
             return NULL;
@@ -309,7 +154,7 @@ static struct XenDevice *xen_be_get_xendev(const char *type, int dom, int dev,
         xendev->gnttabdev = NULL;
     }
 
-    QTAILQ_INSERT_TAIL(&xendevs, xendev, next);
+    xen_pv_insert_xendev(xendev);
 
     if (xendev->ops->alloc) {
         xendev->ops->alloc(xendev);
@@ -318,32 +163,6 @@ static struct XenDevice *xen_be_get_xendev(const char *type, int dom, int dev,
     return xendev;
 }
 
-/*
- * release xen backend device.
- */
-static void xen_be_del_xendev(struct XenDevice *xendev)
-{
-    if (xendev->ops->free) {
-        xendev->ops->free(xendev);
-    }
-
-    if (xendev->fe) {
-        char token[XEN_BUFSIZE];
-        snprintf(token, sizeof(token), "fe:%p", xendev);
-        xs_unwatch(xenstore, xendev->fe, token);
-        g_free(xendev->fe);
-    }
-
-    if (xendev->evtchndev != NULL) {
-        xenevtchn_close(xendev->evtchndev);
-    }
-    if (xendev->gnttabdev != NULL) {
-        xengnttab_close(xendev->gnttabdev);
-    }
-
-    QTAILQ_REMOVE(&xendevs, xendev, next);
-    g_free(xendev);
-}
 
 /*
  * Sync internal data structures on xenstore updates.
@@ -359,7 +178,7 @@ static void xen_be_backend_changed(struct XenDevice *xendev, const char *node)
     }
 
     if (node) {
-        xen_be_printf(xendev, 2, "backend update: %s\n", node);
+        xen_pv_printf(xendev, 2, "backend update: %s\n", node);
         if (xendev->ops->backend_changed) {
             xendev->ops->backend_changed(xendev, node);
         }
@@ -375,7 +194,7 @@ static void xen_be_frontend_changed(struct XenDevice *xendev, const char *node)
             fe_state = XenbusStateUnknown;
         }
         if (xendev->fe_state != fe_state) {
-            xen_be_printf(xendev, 1, "frontend state: %s -> %s\n",
+            xen_pv_printf(xendev, 1, "frontend state: %s -> %s\n",
                           xenbus_strstate(xendev->fe_state),
                           xenbus_strstate(fe_state));
         }
@@ -385,12 +204,13 @@ static void xen_be_frontend_changed(struct XenDevice *xendev, const char *node)
         g_free(xendev->protocol);
         xendev->protocol = xenstore_read_fe_str(xendev, "protocol");
         if (xendev->protocol) {
-            xen_be_printf(xendev, 1, "frontend protocol: %s\n", xendev->protocol);
+            xen_pv_printf(xendev, 1, "frontend protocol: %s\n",
+                          xendev->protocol);
         }
     }
 
     if (node) {
-        xen_be_printf(xendev, 2, "frontend update: %s\n", node);
+        xen_pv_printf(xendev, 2, "frontend update: %s\n", node);
         if (xendev->ops->frontend_changed) {
             xendev->ops->frontend_changed(xendev, node);
         }
@@ -414,26 +234,26 @@ static int xen_be_try_setup(struct XenDevice *xendev)
     int be_state;
 
     if (xenstore_read_be_int(xendev, "state", &be_state) == -1) {
-        xen_be_printf(xendev, 0, "reading backend state failed\n");
+        xen_pv_printf(xendev, 0, "reading backend state failed\n");
         return -1;
     }
 
     if (be_state != XenbusStateInitialising) {
-        xen_be_printf(xendev, 0, "initial backend state is wrong (%s)\n",
+        xen_pv_printf(xendev, 0, "initial backend state is wrong (%s)\n",
                       xenbus_strstate(be_state));
         return -1;
     }
 
     xendev->fe = xenstore_read_be_str(xendev, "frontend");
     if (xendev->fe == NULL) {
-        xen_be_printf(xendev, 0, "reading frontend path failed\n");
+        xen_pv_printf(xendev, 0, "reading frontend path failed\n");
         return -1;
     }
 
     /* setup frontend watch */
     snprintf(token, sizeof(token), "fe:%p", xendev);
     if (!xs_watch(xenstore, xendev->fe, token)) {
-        xen_be_printf(xendev, 0, "watching frontend path (%s) failed\n",
+        xen_pv_printf(xendev, 0, "watching frontend path (%s) failed\n",
                       xendev->fe);
         return -1;
     }
@@ -457,7 +277,7 @@ static int xen_be_try_init(struct XenDevice *xendev)
     int rc = 0;
 
     if (!xendev->online) {
-        xen_be_printf(xendev, 1, "not online\n");
+        xen_pv_printf(xendev, 1, "not online\n");
         return -1;
     }
 
@@ -465,7 +285,7 @@ static int xen_be_try_init(struct XenDevice *xendev)
         rc = xendev->ops->init(xendev);
     }
     if (rc != 0) {
-        xen_be_printf(xendev, 1, "init() failed\n");
+        xen_pv_printf(xendev, 1, "init() failed\n");
         return rc;
     }
 
@@ -488,9 +308,9 @@ static int xen_be_try_initialise(struct XenDevice *xendev)
     if (xendev->fe_state != XenbusStateInitialised  &&
         xendev->fe_state != XenbusStateConnected) {
         if (xendev->ops->flags & DEVOPS_FLAG_IGNORE_STATE) {
-            xen_be_printf(xendev, 2, "frontend not ready, ignoring\n");
+            xen_pv_printf(xendev, 2, "frontend not ready, ignoring\n");
         } else {
-            xen_be_printf(xendev, 2, "frontend not ready (yet)\n");
+            xen_pv_printf(xendev, 2, "frontend not ready (yet)\n");
             return -1;
         }
     }
@@ -499,7 +319,7 @@ static int xen_be_try_initialise(struct XenDevice *xendev)
         rc = xendev->ops->initialise(xendev);
     }
     if (rc != 0) {
-        xen_be_printf(xendev, 0, "initialise() failed\n");
+        xen_pv_printf(xendev, 0, "initialise() failed\n");
         return rc;
     }
 
@@ -520,9 +340,9 @@ static void xen_be_try_connected(struct XenDevice *xendev)
 
     if (xendev->fe_state != XenbusStateConnected) {
         if (xendev->ops->flags & DEVOPS_FLAG_IGNORE_STATE) {
-            xen_be_printf(xendev, 2, "frontend not ready, ignoring\n");
+            xen_pv_printf(xendev, 2, "frontend not ready, ignoring\n");
         } else {
-            xen_be_printf(xendev, 2, "frontend not ready (yet)\n");
+            xen_pv_printf(xendev, 2, "frontend not ready (yet)\n");
             return;
         }
     }
@@ -556,7 +376,7 @@ static int xen_be_try_reset(struct XenDevice *xendev)
         return -1;
     }
 
-    xen_be_printf(xendev, 1, "device reset (for re-connect)\n");
+    xen_pv_printf(xendev, 1, "device reset (for re-connect)\n");
     xen_be_set_state(xendev, XenbusStateInitialising);
     return 0;
 }
@@ -617,7 +437,8 @@ static int xenstore_scan(const char *type, int dom, struct XenDevOps *ops)
     snprintf(token, sizeof(token), "be:%p:%d:%p", type, dom, ops);
     snprintf(path, sizeof(path), "backend/%s/%d", type, dom);
     if (!xs_watch(xenstore, path, token)) {
-        xen_be_printf(NULL, 0, "xen be: watching backend path (%s) failed\n", path);
+        xen_pv_printf(NULL, 0, "xen be: watching backend path (%s) failed\n",
+                      path);
         return -1;
     }
 
@@ -637,8 +458,8 @@ static int xenstore_scan(const char *type, int dom, struct XenDevOps *ops)
     return 0;
 }
 
-static void xenstore_update_be(char *watch, char *type, int dom,
-                               struct XenDevOps *ops)
+void xenstore_update_be(char *watch, char *type, int dom,
+                        struct XenDevOps *ops)
 {
     struct XenDevice *xendev;
     char path[XEN_BUFSIZE], *bepath;
@@ -662,7 +483,7 @@ static void xenstore_update_be(char *watch, char *type, int dom,
     if (xendev != NULL) {
         bepath = xs_read(xenstore, 0, xendev->be, &len);
         if (bepath == NULL) {
-            xen_be_del_xendev(xendev);
+            xen_pv_del_xendev(xendev);
         } else {
             free(bepath);
             xen_be_backend_changed(xendev, path);
@@ -671,7 +492,7 @@ static void xenstore_update_be(char *watch, char *type, int dom,
     }
 }
 
-static void xenstore_update_fe(char *watch, struct XenDevice *xendev)
+void xenstore_update_fe(char *watch, struct XenDevice *xendev)
 {
     char *node;
     unsigned int len;
@@ -688,56 +509,13 @@ static void xenstore_update_fe(char *watch, struct XenDevice *xendev)
     xen_be_frontend_changed(xendev, node);
     xen_be_check_state(xendev);
 }
-
-static void xenstore_update(void *unused)
-{
-    char **vec = NULL;
-    intptr_t type, ops, ptr;
-    unsigned int dom, count;
-
-    vec = xs_read_watch(xenstore, &count);
-    if (vec == NULL) {
-        goto cleanup;
-    }
-
-    if (sscanf(vec[XS_WATCH_TOKEN], "be:%" PRIxPTR ":%d:%" PRIxPTR,
-               &type, &dom, &ops) == 3) {
-        xenstore_update_be(vec[XS_WATCH_PATH], (void*)type, dom, (void*)ops);
-    }
-    if (sscanf(vec[XS_WATCH_TOKEN], "fe:%" PRIxPTR, &ptr) == 1) {
-        xenstore_update_fe(vec[XS_WATCH_PATH], (void*)ptr);
-    }
-
-cleanup:
-    free(vec);
-}
-
-static void xen_be_evtchn_event(void *opaque)
-{
-    struct XenDevice *xendev = opaque;
-    evtchn_port_t port;
-
-    port = xenevtchn_pending(xendev->evtchndev);
-    if (port != xendev->local_port) {
-        xen_be_printf(xendev, 0,
-                      "xenevtchn_pending returned %d (expected %d)\n",
-                      port, xendev->local_port);
-        return;
-    }
-    xenevtchn_unmask(xendev->evtchndev, port);
-
-    if (xendev->ops->event) {
-        xendev->ops->event(xendev);
-    }
-}
-
 /* -------------------------------------------------------------------- */
 
 int xen_be_init(void)
 {
     xenstore = xs_daemon_open();
     if (!xenstore) {
-        xen_be_printf(NULL, 0, "can't connect to xenstored\n");
+        xen_pv_printf(NULL, 0, "can't connect to xenstored\n");
         return -1;
     }
 
@@ -798,69 +576,15 @@ int xen_be_bind_evtchn(struct XenDevice *xendev)
     xendev->local_port = xenevtchn_bind_interdomain
         (xendev->evtchndev, xendev->dom, xendev->remote_port);
     if (xendev->local_port == -1) {
-        xen_be_printf(xendev, 0, "xenevtchn_bind_interdomain failed\n");
+        xen_pv_printf(xendev, 0, "xenevtchn_bind_interdomain failed\n");
         return -1;
     }
-    xen_be_printf(xendev, 2, "bind evtchn port %d\n", xendev->local_port);
+    xen_pv_printf(xendev, 2, "bind evtchn port %d\n", xendev->local_port);
     qemu_set_fd_handler(xenevtchn_fd(xendev->evtchndev),
-                        xen_be_evtchn_event, NULL, xendev);
+                        xen_pv_evtchn_event, NULL, xendev);
     return 0;
 }
 
-void xen_be_unbind_evtchn(struct XenDevice *xendev)
-{
-    if (xendev->local_port == -1) {
-        return;
-    }
-    qemu_set_fd_handler(xenevtchn_fd(xendev->evtchndev), NULL, NULL, NULL);
-    xenevtchn_unbind(xendev->evtchndev, xendev->local_port);
-    xen_be_printf(xendev, 2, "unbind evtchn port %d\n", xendev->local_port);
-    xendev->local_port = -1;
-}
-
-int xen_be_send_notify(struct XenDevice *xendev)
-{
-    return xenevtchn_notify(xendev->evtchndev, xendev->local_port);
-}
-
-/*
- * msg_level:
- *  0 == errors (stderr + logfile).
- *  1 == informative debug messages (logfile only).
- *  2 == noisy debug messages (logfile only).
- *  3 == will flood your log (logfile only).
- */
-void xen_be_printf(struct XenDevice *xendev, int msg_level, const char *fmt, ...)
-{
-    va_list args;
-
-    if (xendev) {
-        if (msg_level > xendev->debug) {
-            return;
-        }
-        qemu_log("xen be: %s: ", xendev->name);
-        if (msg_level == 0) {
-            fprintf(stderr, "xen be: %s: ", xendev->name);
-        }
-    } else {
-        if (msg_level > debug) {
-            return;
-        }
-        qemu_log("xen be core: ");
-        if (msg_level == 0) {
-            fprintf(stderr, "xen be core: ");
-        }
-    }
-    va_start(args, fmt);
-    qemu_log_vprintf(fmt, args);
-    va_end(args);
-    if (msg_level == 0) {
-        va_start(args, fmt);
-        vfprintf(stderr, fmt, args);
-        va_end(args);
-    }
-    qemu_log_flush();
-}
 
 static int xen_sysdev_init(SysBusDevice *dev)
 {
diff --git a/hw/xen/xen_devconfig.c b/hw/xen/xen_devconfig.c
index b7d290df6c..a80e78c0dc 100644
--- a/hw/xen/xen_devconfig.c
+++ b/hw/xen/xen_devconfig.c
@@ -55,7 +55,7 @@ int xen_config_dev_blk(DriveInfo *disk)
     const char *filename = qemu_opt_get(disk->opts, "file");
 
     snprintf(device_name, sizeof(device_name), "xvd%c", 'a' + disk->unit);
-    xen_be_printf(NULL, 1, "config disk %d [%s]: %s\n",
+    xen_pv_printf(NULL, 1, "config disk %d [%s]: %s\n",
                   disk->unit, device_name, filename);
     xen_config_dev_dirs("vbd", "qdisk", vdev, fe, be, sizeof(fe));
 
@@ -83,7 +83,7 @@ int xen_config_dev_nic(NICInfo *nic)
     snprintf(mac, sizeof(mac), "%02x:%02x:%02x:%02x:%02x:%02x",
              nic->macaddr.a[0], nic->macaddr.a[1], nic->macaddr.a[2],
              nic->macaddr.a[3], nic->macaddr.a[4], nic->macaddr.a[5]);
-    xen_be_printf(NULL, 1, "config nic %d: mac=\"%s\"\n", vlan_id, mac);
+    xen_pv_printf(NULL, 1, "config nic %d: mac=\"%s\"\n", vlan_id, mac);
     xen_config_dev_dirs("vif", "qnic", vlan_id, fe, be, sizeof(fe));
 
     /* frontend */
diff --git a/hw/xen/xen_pvdev.c b/hw/xen/xen_pvdev.c
new file mode 100644
index 0000000000..405e15484c
--- /dev/null
+++ b/hw/xen/xen_pvdev.c
@@ -0,0 +1,316 @@
+/*
+ * Xen para-virtualization device
+ *
+ *  (c) 2008 Gerd Hoffmann <kraxel@redhat.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>
+ */
+
+#include "qemu/osdep.h"
+
+#include "hw/xen/xen_backend.h"
+#include "hw/xen/xen_pvdev.h"
+
+/* private */
+static int debug;
+
+struct xs_dirs {
+    char *xs_dir;
+    QTAILQ_ENTRY(xs_dirs) list;
+};
+
+static QTAILQ_HEAD(xs_dirs_head, xs_dirs) xs_cleanup =
+    QTAILQ_HEAD_INITIALIZER(xs_cleanup);
+
+static QTAILQ_HEAD(XenDeviceHead, XenDevice) xendevs =
+    QTAILQ_HEAD_INITIALIZER(xendevs);
+
+/* ------------------------------------------------------------- */
+
+static void xenstore_cleanup_dir(char *dir)
+{
+    struct xs_dirs *d;
+
+    d = g_malloc(sizeof(*d));
+    d->xs_dir = dir;
+    QTAILQ_INSERT_TAIL(&xs_cleanup, d, list);
+}
+
+void xen_config_cleanup(void)
+{
+    struct xs_dirs *d;
+
+    QTAILQ_FOREACH(d, &xs_cleanup, list) {
+        xs_rm(xenstore, 0, d->xs_dir);
+    }
+}
+
+int xenstore_mkdir(char *path, int p)
+{
+    struct xs_permissions perms[2] = {
+        {
+            .id    = 0, /* set owner: dom0 */
+        }, {
+            .id    = xen_domid,
+            .perms = p,
+        }
+    };
+
+    if (!xs_mkdir(xenstore, 0, path)) {
+        xen_pv_printf(NULL, 0, "xs_mkdir %s: failed\n", path);
+        return -1;
+    }
+    xenstore_cleanup_dir(g_strdup(path));
+
+    if (!xs_set_permissions(xenstore, 0, path, perms, 2)) {
+        xen_pv_printf(NULL, 0, "xs_set_permissions %s: failed\n", path);
+        return -1;
+    }
+    return 0;
+}
+
+int xenstore_write_str(const char *base, const char *node, const char *val)
+{
+    char abspath[XEN_BUFSIZE];
+
+    snprintf(abspath, sizeof(abspath), "%s/%s", base, node);
+    if (!xs_write(xenstore, 0, abspath, val, strlen(val))) {
+        return -1;
+    }
+    return 0;
+}
+
+char *xenstore_read_str(const char *base, const char *node)
+{
+    char abspath[XEN_BUFSIZE];
+    unsigned int len;
+    char *str, *ret = NULL;
+
+    snprintf(abspath, sizeof(abspath), "%s/%s", base, node);
+    str = xs_read(xenstore, 0, abspath, &len);
+    if (str != NULL) {
+        /* move to qemu-allocated memory to make sure
+         * callers can savely g_free() stuff. */
+        ret = g_strdup(str);
+        free(str);
+    }
+    return ret;
+}
+
+int xenstore_write_int(const char *base, const char *node, int ival)
+{
+    char val[12];
+
+    snprintf(val, sizeof(val), "%d", ival);
+    return xenstore_write_str(base, node, val);
+}
+
+int xenstore_write_int64(const char *base, const char *node, int64_t ival)
+{
+    char val[21];
+
+    snprintf(val, sizeof(val), "%"PRId64, ival);
+    return xenstore_write_str(base, node, val);
+}
+
+int xenstore_read_int(const char *base, const char *node, int *ival)
+{
+    char *val;
+    int rc = -1;
+
+    val = xenstore_read_str(base, node);
+    if (val && 1 == sscanf(val, "%d", ival)) {
+        rc = 0;
+    }
+    g_free(val);
+    return rc;
+}
+
+int xenstore_read_uint64(const char *base, const char *node, uint64_t *uval)
+{
+    char *val;
+    int rc = -1;
+
+    val = xenstore_read_str(base, node);
+    if (val && 1 == sscanf(val, "%"SCNu64, uval)) {
+        rc = 0;
+    }
+    g_free(val);
+    return rc;
+}
+
+void xenstore_update(void *unused)
+{
+    char **vec = NULL;
+    intptr_t type, ops, ptr;
+    unsigned int dom, count;
+
+    vec = xs_read_watch(xenstore, &count);
+    if (vec == NULL) {
+        goto cleanup;
+    }
+
+    if (sscanf(vec[XS_WATCH_TOKEN], "be:%" PRIxPTR ":%d:%" PRIxPTR,
+               &type, &dom, &ops) == 3) {
+        xenstore_update_be(vec[XS_WATCH_PATH], (void *)type, dom, (void*)ops);
+    }
+    if (sscanf(vec[XS_WATCH_TOKEN], "fe:%" PRIxPTR, &ptr) == 1) {
+        xenstore_update_fe(vec[XS_WATCH_PATH], (void *)ptr);
+    }
+
+cleanup:
+    free(vec);
+}
+
+const char *xenbus_strstate(enum xenbus_state state)
+{
+    static const char *const name[] = {
+        [XenbusStateUnknown]       = "Unknown",
+        [XenbusStateInitialising]  = "Initialising",
+        [XenbusStateInitWait]      = "InitWait",
+        [XenbusStateInitialised]   = "Initialised",
+        [XenbusStateConnected]     = "Connected",
+        [XenbusStateClosing]       = "Closing",
+        [XenbusStateClosed]        = "Closed",
+    };
+    return (state < ARRAY_SIZE(name)) ? name[state] : "INVALID";
+}
+
+/*
+ * msg_level:
+ *  0 == errors (stderr + logfile).
+ *  1 == informative debug messages (logfile only).
+ *  2 == noisy debug messages (logfile only).
+ *  3 == will flood your log (logfile only).
+ */
+void xen_pv_printf(struct XenDevice *xendev, int msg_level,
+                   const char *fmt, ...)
+{
+    va_list args;
+
+    if (xendev) {
+        if (msg_level > xendev->debug) {
+            return;
+        }
+        qemu_log("xen be: %s: ", xendev->name);
+        if (msg_level == 0) {
+            fprintf(stderr, "xen be: %s: ", xendev->name);
+        }
+    } else {
+        if (msg_level > debug) {
+            return;
+        }
+        qemu_log("xen be core: ");
+        if (msg_level == 0) {
+            fprintf(stderr, "xen be core: ");
+        }
+    }
+    va_start(args, fmt);
+    qemu_log_vprintf(fmt, args);
+    va_end(args);
+    if (msg_level == 0) {
+        va_start(args, fmt);
+        vfprintf(stderr, fmt, args);
+        va_end(args);
+    }
+    qemu_log_flush();
+}
+
+void xen_pv_evtchn_event(void *opaque)
+{
+    struct XenDevice *xendev = opaque;
+    evtchn_port_t port;
+
+    port = xenevtchn_pending(xendev->evtchndev);
+    if (port != xendev->local_port) {
+        xen_pv_printf(xendev, 0,
+                      "xenevtchn_pending returned %d (expected %d)\n",
+                      port, xendev->local_port);
+        return;
+    }
+    xenevtchn_unmask(xendev->evtchndev, port);
+
+    if (xendev->ops->event) {
+        xendev->ops->event(xendev);
+    }
+}
+
+void xen_pv_unbind_evtchn(struct XenDevice *xendev)
+{
+    if (xendev->local_port == -1) {
+        return;
+    }
+    qemu_set_fd_handler(xenevtchn_fd(xendev->evtchndev), NULL, NULL, NULL);
+    xenevtchn_unbind(xendev->evtchndev, xendev->local_port);
+    xen_pv_printf(xendev, 2, "unbind evtchn port %d\n", xendev->local_port);
+    xendev->local_port = -1;
+}
+
+int xen_pv_send_notify(struct XenDevice *xendev)
+{
+    return xenevtchn_notify(xendev->evtchndev, xendev->local_port);
+}
+
+/* ------------------------------------------------------------- */
+
+struct XenDevice *xen_pv_find_xendev(const char *type, int dom, int dev)
+{
+    struct XenDevice *xendev;
+
+    QTAILQ_FOREACH(xendev, &xendevs, next) {
+        if (xendev->dom != dom) {
+            continue;
+        }
+        if (xendev->dev != dev) {
+            continue;
+        }
+        if (strcmp(xendev->type, type) != 0) {
+            continue;
+        }
+        return xendev;
+    }
+    return NULL;
+}
+
+/*
+ * release xen backend device.
+ */
+void xen_pv_del_xendev(struct XenDevice *xendev)
+{
+    if (xendev->ops->free) {
+        xendev->ops->free(xendev);
+    }
+
+    if (xendev->fe) {
+        char token[XEN_BUFSIZE];
+        snprintf(token, sizeof(token), "fe:%p", xendev);
+        xs_unwatch(xenstore, xendev->fe, token);
+        g_free(xendev->fe);
+    }
+
+    if (xendev->evtchndev != NULL) {
+        xenevtchn_close(xendev->evtchndev);
+    }
+    if (xendev->gnttabdev != NULL) {
+        xengnttab_close(xendev->gnttabdev);
+    }
+
+    QTAILQ_REMOVE(&xendevs, xendev, next);
+    g_free(xendev);
+}
+
+void xen_pv_insert_xendev(struct XenDevice *xendev)
+{
+    QTAILQ_INSERT_TAIL(&xendevs, xendev, next);
+}
diff --git a/include/block/aio.h b/include/block/aio.h
index b9fe2cb37e..c7ae27c91c 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -18,7 +18,6 @@
 #include "qemu/queue.h"
 #include "qemu/event_notifier.h"
 #include "qemu/thread.h"
-#include "qemu/rfifolock.h"
 #include "qemu/timer.h"
 
 typedef struct BlockAIOCB BlockAIOCB;
@@ -54,7 +53,7 @@ struct AioContext {
     GSource source;
 
     /* Protects all fields from multi-threaded access */
-    RFifoLock lock;
+    QemuRecMutex lock;
 
     /* The list of registered AIO handlers */
     QLIST_HEAD(, AioHandler) aio_handlers;
@@ -116,9 +115,6 @@ struct AioContext {
     bool notified;
     EventNotifier notifier;
 
-    /* Scheduling this BH forces the event loop it iterate */
-    QEMUBH *notify_dummy_bh;
-
     /* Thread pool for performing work and receiving completion callbacks */
     struct ThreadPool *thread_pool;
 
@@ -453,6 +449,24 @@ static inline bool aio_node_check(AioContext *ctx, bool is_external)
 }
 
 /**
+ * Return the AioContext whose event loop runs in the current thread.
+ *
+ * If called from an IOThread this will be the IOThread's AioContext.  If
+ * called from another thread it will be the main loop AioContext.
+ */
+AioContext *qemu_get_current_aio_context(void);
+
+/**
+ * @ctx: the aio context
+ *
+ * Return whether we are running in the I/O thread that manages @ctx.
+ */
+static inline bool aio_context_in_iothread(AioContext *ctx)
+{
+    return ctx == qemu_get_current_aio_context();
+}
+
+/**
  * aio_context_setup:
  * @ctx: the aio context
  *
diff --git a/include/block/block.h b/include/block/block.h
index 398a050176..49bb0b239a 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -7,16 +7,15 @@
 #include "qemu/coroutine.h"
 #include "block/accounting.h"
 #include "block/dirty-bitmap.h"
+#include "block/blockjob.h"
 #include "qapi/qmp/qobject.h"
 #include "qapi-types.h"
 #include "qemu/hbitmap.h"
 
 /* block.c */
 typedef struct BlockDriver BlockDriver;
-typedef struct BlockJob BlockJob;
 typedef struct BdrvChild BdrvChild;
 typedef struct BdrvChildRole BdrvChildRole;
-typedef struct BlockJobTxn BlockJobTxn;
 
 typedef struct BlockDriverInfo {
     /* in bytes, 0 if irrelevant */
@@ -218,7 +217,7 @@ BlockDriverState *bdrv_open(const char *filename, const char *reference,
 BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                     BlockDriverState *bs,
                                     QDict *options, int flags);
-int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp);
+int bdrv_reopen_multiple(AioContext *ctx, BlockReopenQueue *bs_queue, Error **errp);
 int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp);
 int bdrv_reopen_prepare(BDRVReopenState *reopen_state,
                         BlockReopenQueue *queue, Error **errp);
@@ -332,8 +331,39 @@ int bdrv_flush_all(void);
 void bdrv_close_all(void);
 void bdrv_drain(BlockDriverState *bs);
 void coroutine_fn bdrv_co_drain(BlockDriverState *bs);
+void bdrv_drain_all_begin(void);
+void bdrv_drain_all_end(void);
 void bdrv_drain_all(void);
 
+#define BDRV_POLL_WHILE(bs, cond) ({                       \
+    bool waited_ = false;                                  \
+    BlockDriverState *bs_ = (bs);                          \
+    AioContext *ctx_ = bdrv_get_aio_context(bs_);          \
+    if (aio_context_in_iothread(ctx_)) {                   \
+        while ((cond)) {                                   \
+            aio_poll(ctx_, true);                          \
+            waited_ = true;                                \
+        }                                                  \
+    } else {                                               \
+        assert(qemu_get_current_aio_context() ==           \
+               qemu_get_aio_context());                    \
+        /* Ask bdrv_dec_in_flight to wake up the main      \
+         * QEMU AioContext.  Extra I/O threads never take  \
+         * other I/O threads' AioContexts (see for example \
+         * block_job_defer_to_main_loop for how to do it). \
+         */                                                \
+        assert(!bs_->wakeup);                              \
+        bs_->wakeup = true;                                \
+        while ((cond)) {                                   \
+            aio_context_release(ctx_);                     \
+            aio_poll(qemu_get_aio_context(), true);        \
+            aio_context_acquire(ctx_);                     \
+            waited_ = true;                                \
+        }                                                  \
+        bs_->wakeup = false;                               \
+    }                                                      \
+    waited_; })
+
 int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int count);
 int bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, int count);
 int bdrv_has_zero_init_1(BlockDriverState *bs);
diff --git a/include/block/block_int.h b/include/block/block_int.h
index e96e9ada57..b02abbd618 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -62,8 +62,6 @@
 enum BdrvTrackedRequestType {
     BDRV_TRACKED_READ,
     BDRV_TRACKED_WRITE,
-    BDRV_TRACKED_FLUSH,
-    BDRV_TRACKED_IOCTL,
     BDRV_TRACKED_DISCARD,
 };
 
@@ -445,7 +443,7 @@ struct BlockDriverState {
                          note this is a reference count */
 
     CoQueue flush_queue;            /* Serializing flush queue */
-    BdrvTrackedRequest *active_flush_req; /* Flush request in flight */
+    bool active_flush_req;          /* Flush request in flight? */
     unsigned int write_gen;         /* Current data generation */
     unsigned int flushed_gen;       /* Flushed write generation */
 
@@ -473,9 +471,12 @@ struct BlockDriverState {
     /* Callback before write request is processed */
     NotifierWithReturnList before_write_notifiers;
 
-    /* number of in-flight serialising requests */
+    /* number of in-flight requests; overall and serialising */
+    unsigned int in_flight;
     unsigned int serialising_in_flight;
 
+    bool wakeup;
+
     /* Offset after the highest byte written to */
     uint64_t wr_highest_offset;
 
@@ -634,6 +635,21 @@ void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
                                       void (*aio_context_detached)(void *),
                                       void *opaque);
 
+/**
+ * bdrv_wakeup:
+ * @bs: The BlockDriverState for which an I/O operation has been completed.
+ *
+ * Wake up the main thread if it is waiting on BDRV_POLL_WHILE.  During
+ * synchronous I/O on a BlockDriverState that is attached to another
+ * I/O thread, the main thread lets the I/O thread's event loop run,
+ * waiting for the I/O operation to complete.  A bdrv_wakeup will wake
+ * up the main thread if necessary.
+ *
+ * Manual calls to bdrv_wakeup are rarely necessary, because
+ * bdrv_dec_in_flight already calls it.
+ */
+void bdrv_wakeup(BlockDriverState *bs);
+
 #ifdef _WIN32
 int is_windows_drive(const char *filename);
 #endif
@@ -649,8 +665,6 @@ int is_windows_drive(const char *filename);
  * the new backing file if the job completes. Ignored if @base is %NULL.
  * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
  * @on_error: The action to take upon error.
- * @cb: Completion function for the job.
- * @opaque: Opaque pointer value passed to @cb.
  * @errp: Error object.
  *
  * Start a streaming operation on @bs.  Clusters that are unallocated
@@ -662,8 +676,7 @@ int is_windows_drive(const char *filename);
  */
 void stream_start(const char *job_id, BlockDriverState *bs,
                   BlockDriverState *base, const char *backing_file_str,
-                  int64_t speed, BlockdevOnError on_error,
-                  BlockCompletionFunc *cb, void *opaque, Error **errp);
+                  int64_t speed, BlockdevOnError on_error, Error **errp);
 
 /**
  * commit_start:
@@ -674,22 +687,22 @@ void stream_start(const char *job_id, BlockDriverState *bs,
  * @base: Block device that will be written into, and become the new top.
  * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
  * @on_error: The action to take upon error.
- * @cb: Completion function for the job.
- * @opaque: Opaque pointer value passed to @cb.
  * @backing_file_str: String to use as the backing file in @top's overlay
  * @errp: Error object.
  *
  */
 void commit_start(const char *job_id, BlockDriverState *bs,
                   BlockDriverState *base, BlockDriverState *top, int64_t speed,
-                  BlockdevOnError on_error, BlockCompletionFunc *cb,
-                  void *opaque, const char *backing_file_str, Error **errp);
+                  BlockdevOnError on_error, const char *backing_file_str,
+                  Error **errp);
 /**
  * commit_active_start:
  * @job_id: The id of the newly-created job, or %NULL to use the
  * device name of @bs.
  * @bs: Active block device to be committed.
  * @base: Block device that will be written into, and become the new top.
+ * @creation_flags: Flags that control the behavior of the Job lifetime.
+ *                  See @BlockJobCreateFlags
  * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
  * @on_error: The action to take upon error.
  * @cb: Completion function for the job.
@@ -699,8 +712,8 @@ void commit_start(const char *job_id, BlockDriverState *bs,
  *
  */
 void commit_active_start(const char *job_id, BlockDriverState *bs,
-                         BlockDriverState *base, int64_t speed,
-                         BlockdevOnError on_error,
+                         BlockDriverState *base, int creation_flags,
+                         int64_t speed, BlockdevOnError on_error,
                          BlockCompletionFunc *cb,
                          void *opaque, Error **errp, bool auto_complete);
 /*
@@ -719,8 +732,6 @@ void commit_active_start(const char *job_id, BlockDriverState *bs,
  * @on_source_error: The action to take upon error reading from the source.
  * @on_target_error: The action to take upon error writing to the target.
  * @unmap: Whether to unmap target where source sectors only contain zeroes.
- * @cb: Completion function for the job.
- * @opaque: Opaque pointer value passed to @cb.
  * @errp: Error object.
  *
  * Start a mirroring operation on @bs.  Clusters that are allocated
@@ -734,9 +745,7 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
                   MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
                   BlockdevOnError on_source_error,
                   BlockdevOnError on_target_error,
-                  bool unmap,
-                  BlockCompletionFunc *cb,
-                  void *opaque, Error **errp);
+                  bool unmap, Error **errp);
 
 /*
  * backup_start:
@@ -749,6 +758,8 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
  * @sync_bitmap: The dirty bitmap if sync_mode is MIRROR_SYNC_MODE_INCREMENTAL.
  * @on_source_error: The action to take upon error reading from the source.
  * @on_target_error: The action to take upon error writing to the target.
+ * @creation_flags: Flags that control the behavior of the Job lifetime.
+ *                  See @BlockJobCreateFlags
  * @cb: Completion function for the job.
  * @opaque: Opaque pointer value passed to @cb.
  * @txn: Transaction that this job is part of (may be NULL).
@@ -762,6 +773,7 @@ void backup_start(const char *job_id, BlockDriverState *bs,
                   bool compress,
                   BlockdevOnError on_source_error,
                   BlockdevOnError on_target_error,
+                  int creation_flags,
                   BlockCompletionFunc *cb, void *opaque,
                   BlockJobTxn *txn, Error **errp);
 
@@ -787,6 +799,9 @@ bool bdrv_requests_pending(BlockDriverState *bs);
 void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out);
 void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in);
 
+void bdrv_inc_in_flight(BlockDriverState *bs);
+void bdrv_dec_in_flight(BlockDriverState *bs);
+
 void blockdev_close_all_bdrv_states(void);
 
 #endif /* BLOCK_INT_H */
diff --git a/include/block/blockjob.h b/include/block/blockjob.h
index 4ddb4ae2e1..356cacf004 100644
--- a/include/block/blockjob.h
+++ b/include/block/blockjob.h
@@ -28,78 +28,15 @@
 
 #include "block/block.h"
 
-/**
- * BlockJobDriver:
- *
- * A class type for block job driver.
- */
-typedef struct BlockJobDriver {
-    /** Derived BlockJob struct size */
-    size_t instance_size;
-
-    /** String describing the operation, part of query-block-jobs QMP API */
-    BlockJobType job_type;
-
-    /** Optional callback for job types that support setting a speed limit */
-    void (*set_speed)(BlockJob *job, int64_t speed, Error **errp);
-
-    /** Optional callback for job types that need to forward I/O status reset */
-    void (*iostatus_reset)(BlockJob *job);
-
-    /**
-     * Optional callback for job types whose completion must be triggered
-     * manually.
-     */
-    void (*complete)(BlockJob *job, Error **errp);
-
-    /**
-     * If the callback is not NULL, it will be invoked when all the jobs
-     * belonging to the same transaction complete; or upon this job's
-     * completion if it is not in a transaction. Skipped if NULL.
-     *
-     * All jobs will complete with a call to either .commit() or .abort() but
-     * never both.
-     */
-    void (*commit)(BlockJob *job);
-
-    /**
-     * If the callback is not NULL, it will be invoked when any job in the
-     * same transaction fails; or upon this job's failure (due to error or
-     * cancellation) if it is not in a transaction. Skipped if NULL.
-     *
-     * All jobs will complete with a call to either .commit() or .abort() but
-     * never both.
-     */
-    void (*abort)(BlockJob *job);
-
-    /**
-     * If the callback is not NULL, it will be invoked when the job transitions
-     * into the paused state.  Paused jobs must not perform any asynchronous
-     * I/O or event loop activity.  This callback is used to quiesce jobs.
-     */
-    void coroutine_fn (*pause)(BlockJob *job);
-
-    /**
-     * If the callback is not NULL, it will be invoked when the job transitions
-     * out of the paused state.  Any asynchronous I/O or event loop activity
-     * should be restarted from this callback.
-     */
-    void coroutine_fn (*resume)(BlockJob *job);
-
-    /*
-     * If the callback is not NULL, it will be invoked before the job is
-     * resumed in a new AioContext.  This is the place to move any resources
-     * besides job->blk to the new AioContext.
-     */
-    void (*attached_aio_context)(BlockJob *job, AioContext *new_context);
-} BlockJobDriver;
+typedef struct BlockJobDriver BlockJobDriver;
+typedef struct BlockJobTxn BlockJobTxn;
 
 /**
  * BlockJob:
  *
  * Long-running operation on a BlockDriverState.
  */
-struct BlockJob {
+typedef struct BlockJob {
     /** The job type, including the job vtable.  */
     const BlockJobDriver *driver;
 
@@ -107,7 +44,7 @@ struct BlockJob {
     BlockBackend *blk;
 
     /**
-     * The ID of the block job.
+     * The ID of the block job. May be NULL for internal jobs.
      */
     char *id;
 
@@ -181,6 +118,9 @@ struct BlockJob {
     /** Block other operations when block job is running */
     Error *blocker;
 
+    /** BlockDriverStates that are involved in this block job */
+    GSList *nodes;
+
     /** The opaque value that is passed to the completion function.  */
     void *opaque;
 
@@ -198,7 +138,12 @@ struct BlockJob {
     /** Non-NULL if this job is part of a transaction */
     BlockJobTxn *txn;
     QLIST_ENTRY(BlockJob) txn_list;
-};
+} BlockJob;
+
+typedef enum BlockJobCreateFlags {
+    BLOCK_JOB_DEFAULT = 0x00,
+    BLOCK_JOB_INTERNAL = 0x01,
+} BlockJobCreateFlags;
 
 /**
  * block_job_next:
@@ -222,74 +167,15 @@ BlockJob *block_job_next(BlockJob *job);
 BlockJob *block_job_get(const char *id);
 
 /**
- * block_job_create:
- * @job_id: The id of the newly-created job, or %NULL to have one
- * generated automatically.
- * @job_type: The class object for the newly-created job.
- * @bs: The block
- * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
- * @cb: Completion function for the job.
- * @opaque: Opaque pointer value passed to @cb.
- * @errp: Error object.
- *
- * Create a new long-running block device job and return it.  The job
- * will call @cb asynchronously when the job completes.  Note that
- * @bs may have been closed at the time the @cb it is called.  If
- * this is the case, the job may be reported as either cancelled or
- * completed.
- *
- * This function is not part of the public job interface; it should be
- * called from a wrapper that is specific to the job type.
- */
-void *block_job_create(const char *job_id, const BlockJobDriver *driver,
-                       BlockDriverState *bs, int64_t speed,
-                       BlockCompletionFunc *cb, void *opaque, Error **errp);
-
-/**
- * block_job_sleep_ns:
- * @job: The job that calls the function.
- * @clock: The clock to sleep on.
- * @ns: How many nanoseconds to stop for.
- *
- * Put the job to sleep (assuming that it wasn't canceled) for @ns
- * nanoseconds.  Canceling the job will interrupt the wait immediately.
- */
-void block_job_sleep_ns(BlockJob *job, QEMUClockType type, int64_t ns);
-
-/**
- * block_job_yield:
- * @job: The job that calls the function.
- *
- * Yield the block job coroutine.
- */
-void block_job_yield(BlockJob *job);
-
-/**
- * block_job_ref:
- * @bs: The block device.
- *
- * Grab a reference to the block job. Should be paired with block_job_unref.
- */
-void block_job_ref(BlockJob *job);
-
-/**
- * block_job_unref:
- * @bs: The block device.
- *
- * Release reference to the block job and release resources if it is the last
- * reference.
- */
-void block_job_unref(BlockJob *job);
-
-/**
- * block_job_completed:
- * @job: The job being completed.
- * @ret: The status code.
+ * block_job_add_bdrv:
+ * @job: A block job
+ * @bs: A BlockDriverState that is involved in @job
  *
- * Call the completion function that was registered at creation time, and
- * free @job.
+ * Add @bs to the list of BlockDriverState that are involved in
+ * @job. This means that all operations will be blocked on @bs while
+ * @job exists.
  */
-void block_job_completed(BlockJob *job, int ret);
+void block_job_add_bdrv(BlockJob *job, BlockDriverState *bs);
 
 /**
  * block_job_set_speed:
@@ -320,29 +206,12 @@ void block_job_cancel(BlockJob *job);
 void block_job_complete(BlockJob *job, Error **errp);
 
 /**
- * block_job_is_cancelled:
- * @job: The job being queried.
- *
- * Returns whether the job is scheduled for cancellation.
- */
-bool block_job_is_cancelled(BlockJob *job);
-
-/**
  * block_job_query:
  * @job: The job to get information about.
  *
  * Return information about a job.
  */
-BlockJobInfo *block_job_query(BlockJob *job);
-
-/**
- * block_job_pause_point:
- * @job: The job that is ready to pause.
- *
- * Pause now if block_job_pause() has been called.  Block jobs that perform
- * lots of I/O must call this between requests so that the job can be paused.
- */
-void coroutine_fn block_job_pause_point(BlockJob *job);
+BlockJobInfo *block_job_query(BlockJob *job, Error **errp);
 
 /**
  * block_job_pause:
@@ -353,45 +222,38 @@ void coroutine_fn block_job_pause_point(BlockJob *job);
 void block_job_pause(BlockJob *job);
 
 /**
- * block_job_resume:
- * @job: The job to be resumed.
- *
- * Resume the specified job.  Must be paired with a preceding block_job_pause.
- */
-void block_job_resume(BlockJob *job);
-
-/**
- * block_job_enter:
- * @job: The job to enter.
+ * block_job_user_pause:
+ * @job: The job to be paused.
  *
- * Continue the specified job by entering the coroutine.
+ * Asynchronously pause the specified job.
+ * Do not allow a resume until a matching call to block_job_user_resume.
  */
-void block_job_enter(BlockJob *job);
+void block_job_user_pause(BlockJob *job);
 
 /**
- * block_job_event_cancelled:
- * @job: The job whose information is requested.
+ * block_job_paused:
+ * @job: The job to query.
  *
- * Send a BLOCK_JOB_CANCELLED event for the specified job.
+ * Returns true if the job is user-paused.
  */
-void block_job_event_cancelled(BlockJob *job);
+bool block_job_user_paused(BlockJob *job);
 
 /**
- * block_job_ready:
- * @job: The job which is now ready to complete.
- * @msg: Error message. Only present on failure.
+ * block_job_resume:
+ * @job: The job to be resumed.
  *
- * Send a BLOCK_JOB_COMPLETED event for the specified job.
+ * Resume the specified job.  Must be paired with a preceding block_job_pause.
  */
-void block_job_event_completed(BlockJob *job, const char *msg);
+void block_job_resume(BlockJob *job);
 
 /**
- * block_job_ready:
- * @job: The job which is now ready to complete.
+ * block_job_user_resume:
+ * @job: The job to be resumed.
  *
- * Send a BLOCK_JOB_READY event for the specified job.
+ * Resume the specified job.
+ * Must be paired with a preceding block_job_user_pause.
  */
-void block_job_event_ready(BlockJob *job);
+void block_job_user_resume(BlockJob *job);
 
 /**
  * block_job_cancel_sync:
@@ -439,37 +301,6 @@ int block_job_complete_sync(BlockJob *job, Error **errp);
 void block_job_iostatus_reset(BlockJob *job);
 
 /**
- * block_job_error_action:
- * @job: The job to signal an error for.
- * @on_err: The error action setting.
- * @is_read: Whether the operation was a read.
- * @error: The error that was reported.
- *
- * Report an I/O error for a block job and possibly stop the VM.  Return the
- * action that was selected based on @on_err and @error.
- */
-BlockErrorAction block_job_error_action(BlockJob *job, BlockdevOnError on_err,
-                                        int is_read, int error);
-
-typedef void BlockJobDeferToMainLoopFn(BlockJob *job, void *opaque);
-
-/**
- * block_job_defer_to_main_loop:
- * @job: The job
- * @fn: The function to run in the main loop
- * @opaque: The opaque value that is passed to @fn
- *
- * Execute a given function in the main loop with the BlockDriverState
- * AioContext acquired.  Block jobs must call bdrv_unref(), bdrv_close(), and
- * anything that uses bdrv_drain_all() in the main loop.
- *
- * The @job AioContext is held while @fn executes.
- */
-void block_job_defer_to_main_loop(BlockJob *job,
-                                  BlockJobDeferToMainLoopFn *fn,
-                                  void *opaque);
-
-/**
  * block_job_txn_new:
  *
  * Allocate and return a new block job transaction.  Jobs can be added to the
@@ -504,4 +335,12 @@ void block_job_txn_unref(BlockJobTxn *txn);
  */
 void block_job_txn_add_job(BlockJobTxn *txn, BlockJob *job);
 
+/**
+ * block_job_is_internal:
+ * @job: The job to determine if it is user-visible or not.
+ *
+ * Returns true if the job should not be visible to the management layer.
+ */
+bool block_job_is_internal(BlockJob *job);
+
 #endif
diff --git a/include/block/blockjob_int.h b/include/block/blockjob_int.h
new file mode 100644
index 0000000000..40275e4437
--- /dev/null
+++ b/include/block/blockjob_int.h
@@ -0,0 +1,239 @@
+/*
+ * Declarations for long-running block device operations
+ *
+ * Copyright (c) 2011 IBM Corp.
+ * Copyright (c) 2012 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef BLOCKJOB_INT_H
+#define BLOCKJOB_INT_H
+
+#include "block/blockjob.h"
+#include "block/block.h"
+
+/**
+ * BlockJobDriver:
+ *
+ * A class type for block job driver.
+ */
+struct BlockJobDriver {
+    /** Derived BlockJob struct size */
+    size_t instance_size;
+
+    /** String describing the operation, part of query-block-jobs QMP API */
+    BlockJobType job_type;
+
+    /** Optional callback for job types that support setting a speed limit */
+    void (*set_speed)(BlockJob *job, int64_t speed, Error **errp);
+
+    /** Optional callback for job types that need to forward I/O status reset */
+    void (*iostatus_reset)(BlockJob *job);
+
+    /**
+     * Optional callback for job types whose completion must be triggered
+     * manually.
+     */
+    void (*complete)(BlockJob *job, Error **errp);
+
+    /**
+     * If the callback is not NULL, it will be invoked when all the jobs
+     * belonging to the same transaction complete; or upon this job's
+     * completion if it is not in a transaction. Skipped if NULL.
+     *
+     * All jobs will complete with a call to either .commit() or .abort() but
+     * never both.
+     */
+    void (*commit)(BlockJob *job);
+
+    /**
+     * If the callback is not NULL, it will be invoked when any job in the
+     * same transaction fails; or upon this job's failure (due to error or
+     * cancellation) if it is not in a transaction. Skipped if NULL.
+     *
+     * All jobs will complete with a call to either .commit() or .abort() but
+     * never both.
+     */
+    void (*abort)(BlockJob *job);
+
+    /**
+     * If the callback is not NULL, it will be invoked when the job transitions
+     * into the paused state.  Paused jobs must not perform any asynchronous
+     * I/O or event loop activity.  This callback is used to quiesce jobs.
+     */
+    void coroutine_fn (*pause)(BlockJob *job);
+
+    /**
+     * If the callback is not NULL, it will be invoked when the job transitions
+     * out of the paused state.  Any asynchronous I/O or event loop activity
+     * should be restarted from this callback.
+     */
+    void coroutine_fn (*resume)(BlockJob *job);
+
+    /*
+     * If the callback is not NULL, it will be invoked before the job is
+     * resumed in a new AioContext.  This is the place to move any resources
+     * besides job->blk to the new AioContext.
+     */
+    void (*attached_aio_context)(BlockJob *job, AioContext *new_context);
+
+    /*
+     * If the callback is not NULL, it will be invoked when the job has to be
+     * synchronously cancelled or completed; it should drain BlockDriverStates
+     * as required to ensure progress.
+     */
+    void (*drain)(BlockJob *job);
+};
+
+/**
+ * block_job_create:
+ * @job_id: The id of the newly-created job, or %NULL to have one
+ * generated automatically.
+ * @job_type: The class object for the newly-created job.
+ * @bs: The block
+ * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
+ * @cb: Completion function for the job.
+ * @opaque: Opaque pointer value passed to @cb.
+ * @errp: Error object.
+ *
+ * Create a new long-running block device job and return it.  The job
+ * will call @cb asynchronously when the job completes.  Note that
+ * @bs may have been closed at the time the @cb it is called.  If
+ * this is the case, the job may be reported as either cancelled or
+ * completed.
+ *
+ * This function is not part of the public job interface; it should be
+ * called from a wrapper that is specific to the job type.
+ */
+void *block_job_create(const char *job_id, const BlockJobDriver *driver,
+                       BlockDriverState *bs, int64_t speed, int flags,
+                       BlockCompletionFunc *cb, void *opaque, Error **errp);
+
+/**
+ * block_job_sleep_ns:
+ * @job: The job that calls the function.
+ * @clock: The clock to sleep on.
+ * @ns: How many nanoseconds to stop for.
+ *
+ * Put the job to sleep (assuming that it wasn't canceled) for @ns
+ * nanoseconds.  Canceling the job will interrupt the wait immediately.
+ */
+void block_job_sleep_ns(BlockJob *job, QEMUClockType type, int64_t ns);
+
+/**
+ * block_job_yield:
+ * @job: The job that calls the function.
+ *
+ * Yield the block job coroutine.
+ */
+void block_job_yield(BlockJob *job);
+
+/**
+ * block_job_ref:
+ * @bs: The block device.
+ *
+ * Grab a reference to the block job. Should be paired with block_job_unref.
+ */
+void block_job_ref(BlockJob *job);
+
+/**
+ * block_job_unref:
+ * @bs: The block device.
+ *
+ * Release reference to the block job and release resources if it is the last
+ * reference.
+ */
+void block_job_unref(BlockJob *job);
+
+/**
+ * block_job_completed:
+ * @job: The job being completed.
+ * @ret: The status code.
+ *
+ * Call the completion function that was registered at creation time, and
+ * free @job.
+ */
+void block_job_completed(BlockJob *job, int ret);
+
+/**
+ * block_job_is_cancelled:
+ * @job: The job being queried.
+ *
+ * Returns whether the job is scheduled for cancellation.
+ */
+bool block_job_is_cancelled(BlockJob *job);
+
+/**
+ * block_job_pause_point:
+ * @job: The job that is ready to pause.
+ *
+ * Pause now if block_job_pause() has been called.  Block jobs that perform
+ * lots of I/O must call this between requests so that the job can be paused.
+ */
+void coroutine_fn block_job_pause_point(BlockJob *job);
+
+/**
+ * block_job_enter:
+ * @job: The job to enter.
+ *
+ * Continue the specified job by entering the coroutine.
+ */
+void block_job_enter(BlockJob *job);
+
+/**
+ * block_job_event_ready:
+ * @job: The job which is now ready to be completed.
+ *
+ * Send a BLOCK_JOB_READY event for the specified job.
+ */
+void block_job_event_ready(BlockJob *job);
+
+/**
+ * block_job_error_action:
+ * @job: The job to signal an error for.
+ * @on_err: The error action setting.
+ * @is_read: Whether the operation was a read.
+ * @error: The error that was reported.
+ *
+ * Report an I/O error for a block job and possibly stop the VM.  Return the
+ * action that was selected based on @on_err and @error.
+ */
+BlockErrorAction block_job_error_action(BlockJob *job, BlockdevOnError on_err,
+                                        int is_read, int error);
+
+typedef void BlockJobDeferToMainLoopFn(BlockJob *job, void *opaque);
+
+/**
+ * block_job_defer_to_main_loop:
+ * @job: The job
+ * @fn: The function to run in the main loop
+ * @opaque: The opaque value that is passed to @fn
+ *
+ * Execute a given function in the main loop with the BlockDriverState
+ * AioContext acquired.  Block jobs must call bdrv_unref(), bdrv_close(), and
+ * anything that uses bdrv_drain_all() in the main loop.
+ *
+ * The @job AioContext is held while @fn executes.
+ */
+void block_job_defer_to_main_loop(BlockJob *job,
+                                  BlockJobDeferToMainLoopFn *fn,
+                                  void *opaque);
+
+#endif
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index cb624e4acc..a8c13cee66 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -316,6 +316,7 @@ static inline void tb_set_jmp_target(TranslationBlock *tb,
 
 #endif
 
+/* Called with tb_lock held.  */
 static inline void tb_add_jump(TranslationBlock *tb, int n,
                                TranslationBlock *tb_next)
 {
@@ -369,6 +370,7 @@ void tlb_fill(CPUState *cpu, target_ulong addr, MMUAccessType access_type,
 #if defined(CONFIG_USER_ONLY)
 void mmap_lock(void);
 void mmap_unlock(void);
+bool have_mmap_lock(void);
 
 static inline tb_page_addr_t get_page_addr_code(CPUArchState *env1, target_ulong addr)
 {
diff --git a/include/exec/memory.h b/include/exec/memory.h
index 79ccaaba1f..9728a2fb1a 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -209,7 +209,7 @@ struct MemoryRegion {
     void (*destructor)(MemoryRegion *mr);
     uint64_t align;
     bool terminates;
-    bool skip_dump;
+    bool ram_device;
     bool enabled;
     bool warning_printed; /* For reservations */
     uint8_t vga_logging_count;
@@ -449,6 +449,30 @@ void memory_region_init_ram_ptr(MemoryRegion *mr,
                                 void *ptr);
 
 /**
+ * memory_region_init_ram_device_ptr:  Initialize RAM device memory region from
+ *                                     a user-provided pointer.
+ *
+ * A RAM device represents a mapping to a physical device, such as to a PCI
+ * MMIO BAR of an vfio-pci assigned device.  The memory region may be mapped
+ * into the VM address space and access to the region will modify memory
+ * directly.  However, the memory region should not be included in a memory
+ * dump (device may not be enabled/mapped at the time of the dump), and
+ * operations incompatible with manipulating MMIO should be avoided.  Replaces
+ * skip_dump flag.
+ *
+ * @mr: the #MemoryRegion to be initialized.
+ * @owner: the object that tracks the region's reference count
+ * @name: the name of the region.
+ * @size: size of the region.
+ * @ptr: memory to be mapped; must contain at least @size bytes.
+ */
+void memory_region_init_ram_device_ptr(MemoryRegion *mr,
+                                       struct Object *owner,
+                                       const char *name,
+                                       uint64_t size,
+                                       void *ptr);
+
+/**
  * memory_region_init_alias: Initialize a memory region that aliases all or a
  *                           part of another memory region.
  *
@@ -574,22 +598,13 @@ static inline bool memory_region_is_ram(MemoryRegion *mr)
 }
 
 /**
- * memory_region_is_skip_dump: check whether a memory region should not be
- *                             dumped
- *
- * Returns %true is a memory region should not be dumped(e.g. VFIO BAR MMAP).
+ * memory_region_is_ram_device: check whether a memory region is a ram device
  *
- * @mr: the memory region being queried
- */
-bool memory_region_is_skip_dump(MemoryRegion *mr);
-
-/**
- * memory_region_set_skip_dump: Set skip_dump flag, dump will ignore this memory
- *                              region
+ * Returns %true is a memory region is a device backed ram region
  *
  * @mr: the memory region being queried
  */
-void memory_region_set_skip_dump(MemoryRegion *mr);
+bool memory_region_is_ram_device(MemoryRegion *mr);
 
 /**
  * memory_region_is_romd: check whether a memory region is in ROMD mode
@@ -1465,9 +1480,11 @@ void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr);
 static inline bool memory_access_is_direct(MemoryRegion *mr, bool is_write)
 {
     if (is_write) {
-        return memory_region_is_ram(mr) && !mr->readonly;
+        return memory_region_is_ram(mr) &&
+               !mr->readonly && !memory_region_is_ram_device(mr);
     } else {
-        return memory_region_is_ram(mr) || memory_region_is_romd(mr);
+        return (memory_region_is_ram(mr) && !memory_region_is_ram_device(mr)) ||
+               memory_region_is_romd(mr);
     }
 }
 
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 17fff80c8a..98dc7722c3 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -13,7 +13,6 @@
 #include "qemu/bitmap.h"
 #include "sysemu/sysemu.h"
 #include "hw/pci/pci.h"
-#include "hw/boards.h"
 #include "hw/compat.h"
 #include "hw/mem/pc-dimm.h"
 #include "hw/mem/nvdimm.h"
diff --git a/include/hw/xen/xen_backend.h b/include/hw/xen/xen_backend.h
index 0df282ab5f..cbda40ee53 100644
--- a/include/hw/xen/xen_backend.h
+++ b/include/hw/xen/xen_backend.h
@@ -2,60 +2,10 @@
 #define QEMU_HW_XEN_BACKEND_H
 
 #include "hw/xen/xen_common.h"
+#include "hw/xen/xen_pvdev.h"
 #include "sysemu/sysemu.h"
 #include "net/net.h"
 
-/* ------------------------------------------------------------- */
-
-#define XEN_BUFSIZE 1024
-
-struct XenDevice;
-
-/* driver uses grant tables  ->  open gntdev device (xendev->gnttabdev) */
-#define DEVOPS_FLAG_NEED_GNTDEV   1
-/* don't expect frontend doing correct state transitions (aka console quirk) */
-#define DEVOPS_FLAG_IGNORE_STATE  2
-
-struct XenDevOps {
-    size_t    size;
-    uint32_t  flags;
-    void      (*alloc)(struct XenDevice *xendev);
-    int       (*init)(struct XenDevice *xendev);
-    int       (*initialise)(struct XenDevice *xendev);
-    void      (*connected)(struct XenDevice *xendev);
-    void      (*event)(struct XenDevice *xendev);
-    void      (*disconnect)(struct XenDevice *xendev);
-    int       (*free)(struct XenDevice *xendev);
-    void      (*backend_changed)(struct XenDevice *xendev, const char *node);
-    void      (*frontend_changed)(struct XenDevice *xendev, const char *node);
-    int       (*backend_register)(void);
-};
-
-struct XenDevice {
-    const char         *type;
-    int                dom;
-    int                dev;
-    char               name[64];
-    int                debug;
-
-    enum xenbus_state  be_state;
-    enum xenbus_state  fe_state;
-    int                online;
-    char               be[XEN_BUFSIZE];
-    char               *fe;
-    char               *protocol;
-    int                remote_port;
-    int                local_port;
-
-    xenevtchn_handle   *evtchndev;
-    xengnttab_handle   *gnttabdev;
-
-    struct XenDevOps   *ops;
-    QTAILQ_ENTRY(XenDevice) next;
-};
-
-/* ------------------------------------------------------------- */
-
 /* variables */
 extern xc_interface *xen_xc;
 extern xenforeignmemory_handle *xen_fmem;
@@ -63,26 +13,20 @@ extern struct xs_handle *xenstore;
 extern const char *xen_protocol;
 extern DeviceState *xen_sysdev;
 
-/* xenstore helper functions */
 int xenstore_mkdir(char *path, int p);
-int xenstore_write_str(const char *base, const char *node, const char *val);
-int xenstore_write_int(const char *base, const char *node, int ival);
-int xenstore_write_int64(const char *base, const char *node, int64_t ival);
-char *xenstore_read_str(const char *base, const char *node);
-int xenstore_read_int(const char *base, const char *node, int *ival);
-
 int xenstore_write_be_str(struct XenDevice *xendev, const char *node, const char *val);
 int xenstore_write_be_int(struct XenDevice *xendev, const char *node, int ival);
 int xenstore_write_be_int64(struct XenDevice *xendev, const char *node, int64_t ival);
 char *xenstore_read_be_str(struct XenDevice *xendev, const char *node);
 int xenstore_read_be_int(struct XenDevice *xendev, const char *node, int *ival);
+void xenstore_update_fe(char *watch, struct XenDevice *xendev);
+void xenstore_update_be(char *watch, char *type, int dom,
+                        struct XenDevOps *ops);
 char *xenstore_read_fe_str(struct XenDevice *xendev, const char *node);
 int xenstore_read_fe_int(struct XenDevice *xendev, const char *node, int *ival);
-int xenstore_read_uint64(const char *base, const char *node, uint64_t *uval);
-int xenstore_read_fe_uint64(struct XenDevice *xendev, const char *node, uint64_t *uval);
+int xenstore_read_fe_uint64(struct XenDevice *xendev, const char *node,
+                            uint64_t *uval);
 
-const char *xenbus_strstate(enum xenbus_state state);
-struct XenDevice *xen_be_find_xendev(const char *type, int dom, int dev);
 void xen_be_check_state(struct XenDevice *xendev);
 
 /* xen backend driver bits */
@@ -91,10 +35,6 @@ void xen_be_register_common(void);
 int xen_be_register(const char *type, struct XenDevOps *ops);
 int xen_be_set_state(struct XenDevice *xendev, enum xenbus_state state);
 int xen_be_bind_evtchn(struct XenDevice *xendev);
-void xen_be_unbind_evtchn(struct XenDevice *xendev);
-int xen_be_send_notify(struct XenDevice *xendev);
-void xen_be_printf(struct XenDevice *xendev, int msg_level, const char *fmt, ...)
-    GCC_FMT_ATTR(3, 4);
 
 /* actual backend drivers */
 extern struct XenDevOps xen_console_ops;      /* xen_console.c     */
diff --git a/include/hw/xen/xen_pvdev.h b/include/hw/xen/xen_pvdev.h
new file mode 100644
index 0000000000..083f0a9cc7
--- /dev/null
+++ b/include/hw/xen/xen_pvdev.h
@@ -0,0 +1,78 @@
+#ifndef QEMU_HW_XEN_PVDEV_H
+#define QEMU_HW_XEN_PVDEV_H
+
+#include "hw/xen/xen_common.h"
+/* ------------------------------------------------------------- */
+
+#define XEN_BUFSIZE 1024
+
+struct XenDevice;
+
+/* driver uses grant tables  ->  open gntdev device (xendev->gnttabdev) */
+#define DEVOPS_FLAG_NEED_GNTDEV   1
+/* don't expect frontend doing correct state transitions (aka console quirk) */
+#define DEVOPS_FLAG_IGNORE_STATE  2
+
+struct XenDevOps {
+    size_t    size;
+    uint32_t  flags;
+    void      (*alloc)(struct XenDevice *xendev);
+    int       (*init)(struct XenDevice *xendev);
+    int       (*initialise)(struct XenDevice *xendev);
+    void      (*connected)(struct XenDevice *xendev);
+    void      (*event)(struct XenDevice *xendev);
+    void      (*disconnect)(struct XenDevice *xendev);
+    int       (*free)(struct XenDevice *xendev);
+    void      (*backend_changed)(struct XenDevice *xendev, const char *node);
+    void      (*frontend_changed)(struct XenDevice *xendev, const char *node);
+    int       (*backend_register)(void);
+};
+
+struct XenDevice {
+    const char         *type;
+    int                dom;
+    int                dev;
+    char               name[64];
+    int                debug;
+
+    enum xenbus_state  be_state;
+    enum xenbus_state  fe_state;
+    int                online;
+    char               be[XEN_BUFSIZE];
+    char               *fe;
+    char               *protocol;
+    int                remote_port;
+    int                local_port;
+
+    xenevtchn_handle   *evtchndev;
+    xengnttab_handle   *gnttabdev;
+
+    struct XenDevOps   *ops;
+    QTAILQ_ENTRY(XenDevice) next;
+};
+
+/* ------------------------------------------------------------- */
+
+/* xenstore helper functions */
+int xenstore_write_str(const char *base, const char *node, const char *val);
+int xenstore_write_int(const char *base, const char *node, int ival);
+int xenstore_write_int64(const char *base, const char *node, int64_t ival);
+char *xenstore_read_str(const char *base, const char *node);
+int xenstore_read_int(const char *base, const char *node, int *ival);
+int xenstore_read_uint64(const char *base, const char *node, uint64_t *uval);
+void xenstore_update(void *unused);
+
+const char *xenbus_strstate(enum xenbus_state state);
+
+void xen_pv_evtchn_event(void *opaque);
+void xen_pv_insert_xendev(struct XenDevice *xendev);
+void xen_pv_del_xendev(struct XenDevice *xendev);
+struct XenDevice *xen_pv_find_xendev(const char *type, int dom, int dev);
+
+void xen_pv_unbind_evtchn(struct XenDevice *xendev);
+int xen_pv_send_notify(struct XenDevice *xendev);
+
+void xen_pv_printf(struct XenDevice *xendev, int msg_level,
+                   const char *fmt, ...)  GCC_FMT_ATTR(3, 4);
+
+#endif /* QEMU_HW_XEN_PVDEV_H */
diff --git a/include/migration/colo.h b/include/migration/colo.h
new file mode 100644
index 0000000000..e32eef4763
--- /dev/null
+++ b/include/migration/colo.h
@@ -0,0 +1,38 @@
+/*
+ * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
+ * (a.k.a. Fault Tolerance or Continuous Replication)
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 FUJITSU LIMITED
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_COLO_H
+#define QEMU_COLO_H
+
+#include "qemu-common.h"
+#include "migration/migration.h"
+#include "qemu/coroutine_int.h"
+#include "qemu/thread.h"
+#include "qemu/main-loop.h"
+
+bool colo_supported(void);
+void colo_info_init(void);
+
+void migrate_start_colo_process(MigrationState *s);
+bool migration_in_colo_state(void);
+
+/* loadvm */
+bool migration_incoming_enable_colo(void);
+void migration_incoming_exit_colo(void);
+void *colo_process_incoming_thread(void *opaque);
+bool migration_incoming_in_colo_state(void);
+
+COLOMode get_colo_mode(void);
+
+/* failover */
+void colo_do_failover(MigrationState *s);
+#endif
diff --git a/include/migration/failover.h b/include/migration/failover.h
new file mode 100644
index 0000000000..ad91ef2381
--- /dev/null
+++ b/include/migration/failover.h
@@ -0,0 +1,26 @@
+/*
+ *  COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
+ *  (a.k.a. Fault Tolerance or Continuous Replication)
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO.,LTD.
+ * Copyright (c) 2016 FUJITSU LIMITED
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_FAILOVER_H
+#define QEMU_FAILOVER_H
+
+#include "qemu-common.h"
+#include "qapi-types.h"
+
+void failover_init_state(void);
+FailoverStatus failover_set_state(FailoverStatus old_state,
+                                     FailoverStatus new_state);
+FailoverStatus failover_get_state(void);
+void failover_request_active(Error **errp);
+bool failover_request_is_active(void);
+
+#endif
diff --git a/include/migration/migration.h b/include/migration/migration.h
index 2791b90c00..c309d23370 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -21,6 +21,7 @@
 #include "migration/vmstate.h"
 #include "qapi-types.h"
 #include "exec/cpu-common.h"
+#include "qemu/coroutine_int.h"
 
 #define QEMU_VM_FILE_MAGIC           0x5145564d
 #define QEMU_VM_FILE_VERSION_COMPAT  0x00000002
@@ -107,6 +108,12 @@ struct MigrationIncomingState {
     QEMUBH *bh;
 
     int state;
+
+    bool have_colo_incoming_thread;
+    QemuThread colo_incoming_thread;
+    /* The coroutine we should enter (back) after failover */
+    Coroutine *migration_incoming_co;
+
     /* See savevm.c */
     LoadStateEntry_Head loadvm_handlers;
 };
@@ -298,6 +305,7 @@ int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen);
 
 int migrate_use_xbzrle(void);
 int64_t migrate_xbzrle_cache_size(void);
+bool migrate_colo_enabled(void);
 
 int64_t xbzrle_cache_resize(int64_t new_size);
 
diff --git a/include/monitor/monitor.h b/include/monitor/monitor.h
index a714d8ef80..8cc532ec0e 100644
--- a/include/monitor/monitor.h
+++ b/include/monitor/monitor.h
@@ -9,7 +9,7 @@
 extern Monitor *cur_mon;
 
 /* flags for monitor_init */
-#define MONITOR_IS_DEFAULT    0x01
+/* 0x01 unused */
 #define MONITOR_USE_READLINE  0x02
 #define MONITOR_USE_CONTROL   0x04
 #define MONITOR_USE_PRETTY    0x08
diff --git a/include/qemu/log.h b/include/qemu/log.h
index 00bf37fc0f..a50e994c21 100644
--- a/include/qemu/log.h
+++ b/include/qemu/log.h
@@ -51,6 +51,22 @@ static inline bool qemu_loglevel_mask(int mask)
     return (qemu_loglevel & mask) != 0;
 }
 
+/* Lock output for a series of related logs.  Since this is not needed
+ * for a single qemu_log / qemu_log_mask / qemu_log_mask_and_addr, we
+ * assume that qemu_loglevel_mask has already been tested, and that
+ * qemu_loglevel is never set when qemu_logfile is unset.
+ */
+
+static inline void qemu_log_lock(void)
+{
+    qemu_flockfile(qemu_logfile);
+}
+
+static inline void qemu_log_unlock(void)
+{
+    qemu_funlockfile(qemu_logfile);
+}
+
 /* Logging functions: */
 
 /* main logging function
diff --git a/include/qemu/rfifolock.h b/include/qemu/rfifolock.h
deleted file mode 100644
index b23ab538a6..0000000000
--- a/include/qemu/rfifolock.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Recursive FIFO lock
- *
- * Copyright Red Hat, Inc. 2013
- *
- * Authors:
- *  Stefan Hajnoczi   <stefanha@redhat.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2 or later.
- * See the COPYING file in the top-level directory.
- *
- */
-
-#ifndef QEMU_RFIFOLOCK_H
-#define QEMU_RFIFOLOCK_H
-
-#include "qemu/thread.h"
-
-/* Recursive FIFO lock
- *
- * This lock provides more features than a plain mutex:
- *
- * 1. Fairness - enforces FIFO order.
- * 2. Nesting - can be taken recursively.
- * 3. Contention callback - optional, called when thread must wait.
- *
- * The recursive FIFO lock is heavyweight so prefer other synchronization
- * primitives if you do not need its features.
- */
-typedef struct {
-    QemuMutex lock;             /* protects all fields */
-
-    /* FIFO order */
-    unsigned int head;          /* active ticket number */
-    unsigned int tail;          /* waiting ticket number */
-    QemuCond cond;              /* used to wait for our ticket number */
-
-    /* Nesting */
-    QemuThread owner_thread;    /* thread that currently has ownership */
-    unsigned int nesting;       /* amount of nesting levels */
-
-    /* Contention callback */
-    void (*cb)(void *);         /* called when thread must wait, with ->lock
-                                 * held so it may not recursively lock/unlock
-                                 */
-    void *cb_opaque;
-} RFifoLock;
-
-void rfifolock_init(RFifoLock *r, void (*cb)(void *), void *opaque);
-void rfifolock_destroy(RFifoLock *r);
-void rfifolock_lock(RFifoLock *r);
-void rfifolock_unlock(RFifoLock *r);
-
-#endif /* QEMU_RFIFOLOCK_H */
diff --git a/include/qemu/sockets.h b/include/qemu/sockets.h
index 9eb24707df..5589e6842b 100644
--- a/include/qemu/sockets.h
+++ b/include/qemu/sockets.h
@@ -34,6 +34,8 @@ typedef void NonBlockingConnectHandler(int fd, Error *err, void *opaque);
 
 InetSocketAddress *inet_parse(const char *str, Error **errp);
 int inet_connect(const char *str, Error **errp);
+int inet_connect_saddr(InetSocketAddress *saddr, Error **errp,
+                       NonBlockingConnectHandler *callback, void *opaque);
 
 NetworkAddressFamily inet_netfamily(int family);
 
diff --git a/include/qemu/thread-posix.h b/include/qemu/thread-posix.h
index aa03567e5e..09d1e15728 100644
--- a/include/qemu/thread-posix.h
+++ b/include/qemu/thread-posix.h
@@ -4,6 +4,12 @@
 #include <pthread.h>
 #include <semaphore.h>
 
+typedef QemuMutex QemuRecMutex;
+#define qemu_rec_mutex_destroy qemu_mutex_destroy
+#define qemu_rec_mutex_lock qemu_mutex_lock
+#define qemu_rec_mutex_try_lock qemu_mutex_try_lock
+#define qemu_rec_mutex_unlock qemu_mutex_unlock
+
 struct QemuMutex {
     pthread_mutex_t lock;
 };
diff --git a/include/qemu/thread-win32.h b/include/qemu/thread-win32.h
index c7ce8dcd45..5fb6541ae9 100644
--- a/include/qemu/thread-win32.h
+++ b/include/qemu/thread-win32.h
@@ -8,6 +8,16 @@ struct QemuMutex {
     LONG owner;
 };
 
+typedef struct QemuRecMutex QemuRecMutex;
+struct QemuRecMutex {
+    CRITICAL_SECTION lock;
+};
+
+void qemu_rec_mutex_destroy(QemuRecMutex *mutex);
+void qemu_rec_mutex_lock(QemuRecMutex *mutex);
+int qemu_rec_mutex_trylock(QemuRecMutex *mutex);
+void qemu_rec_mutex_unlock(QemuRecMutex *mutex);
+
 struct QemuCond {
     LONG waiters, target;
     HANDLE sema;
diff --git a/include/qemu/thread.h b/include/qemu/thread.h
index 31237e93ee..e8e665f020 100644
--- a/include/qemu/thread.h
+++ b/include/qemu/thread.h
@@ -25,6 +25,9 @@ void qemu_mutex_lock(QemuMutex *mutex);
 int qemu_mutex_trylock(QemuMutex *mutex);
 void qemu_mutex_unlock(QemuMutex *mutex);
 
+/* Prototypes for other functions are in thread-posix.h/thread-win32.h.  */
+void qemu_rec_mutex_init(QemuRecMutex *mutex);
+
 void qemu_cond_init(QemuCond *cond);
 void qemu_cond_destroy(QemuCond *cond);
 
diff --git a/include/qom/cpu.h b/include/qom/cpu.h
index 633c3fc124..3f79a8e955 100644
--- a/include/qom/cpu.h
+++ b/include/qom/cpu.h
@@ -231,7 +231,25 @@ struct kvm_run;
 #define TB_JMP_CACHE_SIZE (1 << TB_JMP_CACHE_BITS)
 
 /* work queue */
-typedef void (*run_on_cpu_func)(CPUState *cpu, void *data);
+
+/* The union type allows passing of 64 bit target pointers on 32 bit
+ * hosts in a single parameter
+ */
+typedef union {
+    int           host_int;
+    unsigned long host_ulong;
+    void         *host_ptr;
+    vaddr         target_ptr;
+} run_on_cpu_data;
+
+#define RUN_ON_CPU_HOST_PTR(p)    ((run_on_cpu_data){.host_ptr = (p)})
+#define RUN_ON_CPU_HOST_INT(i)    ((run_on_cpu_data){.host_int = (i)})
+#define RUN_ON_CPU_HOST_ULONG(ul) ((run_on_cpu_data){.host_ulong = (ul)})
+#define RUN_ON_CPU_TARGET_PTR(v)  ((run_on_cpu_data){.target_ptr = (v)})
+#define RUN_ON_CPU_NULL           RUN_ON_CPU_HOST_PTR(NULL)
+
+typedef void (*run_on_cpu_func)(CPUState *cpu, run_on_cpu_data data);
+
 struct qemu_work_item;
 
 /**
@@ -319,7 +337,10 @@ struct CPUState {
     MemoryRegion *memory;
 
     void *env_ptr; /* CPUArchState */
+
+    /* Writes protected by tb_lock, reads not thread-safe  */
     struct TranslationBlock *tb_jmp_cache[TB_JMP_CACHE_SIZE];
+
     struct GDBRegisterState *gdb_regs;
     int gdb_num_regs;
     int gdb_num_g_regs;
@@ -634,7 +655,7 @@ bool cpu_is_stopped(CPUState *cpu);
  *
  * Used internally in the implementation of run_on_cpu.
  */
-void do_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data,
+void do_run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data,
                    QemuMutex *mutex);
 
 /**
@@ -645,7 +666,7 @@ void do_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data,
  *
  * Schedules the function @func for execution on the vCPU @cpu.
  */
-void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data);
+void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data);
 
 /**
  * async_run_on_cpu:
@@ -655,7 +676,7 @@ void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data);
  *
  * Schedules the function @func for execution on the vCPU @cpu asynchronously.
  */
-void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data);
+void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data);
 
 /**
  * async_safe_run_on_cpu:
@@ -669,7 +690,7 @@ void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data);
  * Unlike run_on_cpu and async_run_on_cpu, the function is run outside the
  * BQL.
  */
-void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data);
+void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data);
 
 /**
  * qemu_get_cpu:
diff --git a/include/sysemu/os-posix.h b/include/sysemu/os-posix.h
index 3cfedbc28b..b0a6c0695b 100644
--- a/include/sysemu/os-posix.h
+++ b/include/sysemu/os-posix.h
@@ -87,4 +87,16 @@ void *qemu_alloc_stack(size_t *sz);
  */
 void qemu_free_stack(void *stack, size_t sz);
 
+/* POSIX and Mingw32 differ in the name of the stdio lock functions.  */
+
+static inline void qemu_flockfile(FILE *f)
+{
+    flockfile(f);
+}
+
+static inline void qemu_funlockfile(FILE *f)
+{
+    funlockfile(f);
+}
+
 #endif
diff --git a/include/sysemu/os-win32.h b/include/sysemu/os-win32.h
index 17aad3b20f..ff18b23db1 100644
--- a/include/sysemu/os-win32.h
+++ b/include/sysemu/os-win32.h
@@ -103,6 +103,21 @@ static inline char *realpath(const char *path, char *resolved_path)
     return resolved_path;
 }
 
+/* ??? Mingw appears to export _lock_file and _unlock_file as the functions
+ * with which to lock a stdio handle.  But something is wrong in the markup,
+ * either in the header or the library, such that we get undefined references
+ * to "_imp___lock_file" etc when linking.  Since we seem to have no other
+ * alternative, and the usage within the logging functions isn't critical,
+ * ignore FILE locking.
+ */
+
+static inline void qemu_flockfile(FILE *f)
+{
+}
+
+static inline void qemu_funlockfile(FILE *f)
+{
+}
 
 /* We wrap all the sockets functions so that we can
  * set errno based on WSAGetLastError()
diff --git a/iothread.c b/iothread.c
index fbeb8deb38..bd70344811 100644
--- a/iothread.c
+++ b/iothread.c
@@ -16,10 +16,12 @@
 #include "qom/object_interfaces.h"
 #include "qemu/module.h"
 #include "block/aio.h"
+#include "block/block.h"
 #include "sysemu/iothread.h"
 #include "qmp-commands.h"
 #include "qemu/error-report.h"
 #include "qemu/rcu.h"
+#include "qemu/main-loop.h"
 
 typedef ObjectClass IOThreadClass;
 
@@ -28,26 +30,27 @@ typedef ObjectClass IOThreadClass;
 #define IOTHREAD_CLASS(klass) \
    OBJECT_CLASS_CHECK(IOThreadClass, klass, TYPE_IOTHREAD)
 
+static __thread IOThread *my_iothread;
+
+AioContext *qemu_get_current_aio_context(void)
+{
+    return my_iothread ? my_iothread->ctx : qemu_get_aio_context();
+}
+
 static void *iothread_run(void *opaque)
 {
     IOThread *iothread = opaque;
-    bool blocking;
 
     rcu_register_thread();
 
+    my_iothread = iothread;
     qemu_mutex_lock(&iothread->init_done_lock);
     iothread->thread_id = qemu_get_thread_id();
     qemu_cond_signal(&iothread->init_done_cond);
     qemu_mutex_unlock(&iothread->init_done_lock);
 
-    while (!iothread->stopping) {
-        aio_context_acquire(iothread->ctx);
-        blocking = true;
-        while (!iothread->stopping && aio_poll(iothread->ctx, blocking)) {
-            /* Progress was made, keep going */
-            blocking = false;
-        }
-        aio_context_release(iothread->ctx);
+    while (!atomic_read(&iothread->stopping)) {
+        aio_poll(iothread->ctx, true);
     }
 
     rcu_unregister_thread();
@@ -190,6 +193,18 @@ IOThreadInfoList *qmp_query_iothreads(Error **errp)
 void iothread_stop_all(void)
 {
     Object *container = object_get_objects_root();
+    BlockDriverState *bs;
+    BdrvNextIterator it;
+
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
+        AioContext *ctx = bdrv_get_aio_context(bs);
+        if (ctx == qemu_get_aio_context()) {
+            continue;
+        }
+        aio_context_acquire(ctx);
+        bdrv_set_aio_context(bs, qemu_get_aio_context());
+        aio_context_release(ctx);
+    }
 
     object_child_foreach(container, iothread_stop, NULL);
 }
diff --git a/kvm-all.c b/kvm-all.c
index 3dcce16ca5..330219e9dc 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -1856,7 +1856,7 @@ void kvm_flush_coalesced_mmio_buffer(void)
     s->coalesced_flush_in_progress = false;
 }
 
-static void do_kvm_cpu_synchronize_state(CPUState *cpu, void *arg)
+static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
 {
     if (!cpu->kvm_vcpu_dirty) {
         kvm_arch_get_registers(cpu);
@@ -1867,11 +1867,11 @@ static void do_kvm_cpu_synchronize_state(CPUState *cpu, void *arg)
 void kvm_cpu_synchronize_state(CPUState *cpu)
 {
     if (!cpu->kvm_vcpu_dirty) {
-        run_on_cpu(cpu, do_kvm_cpu_synchronize_state, NULL);
+        run_on_cpu(cpu, do_kvm_cpu_synchronize_state, RUN_ON_CPU_NULL);
     }
 }
 
-static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, void *arg)
+static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg)
 {
     kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE);
     cpu->kvm_vcpu_dirty = false;
@@ -1879,10 +1879,10 @@ static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, void *arg)
 
 void kvm_cpu_synchronize_post_reset(CPUState *cpu)
 {
-    run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, NULL);
+    run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
 }
 
-static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, void *arg)
+static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg)
 {
     kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE);
     cpu->kvm_vcpu_dirty = false;
@@ -1890,7 +1890,7 @@ static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, void *arg)
 
 void kvm_cpu_synchronize_post_init(CPUState *cpu)
 {
-    run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, NULL);
+    run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
 }
 
 int kvm_cpu_exec(CPUState *cpu)
@@ -2218,9 +2218,10 @@ struct kvm_set_guest_debug_data {
     int err;
 };
 
-static void kvm_invoke_set_guest_debug(CPUState *cpu, void *data)
+static void kvm_invoke_set_guest_debug(CPUState *cpu, run_on_cpu_data data)
 {
-    struct kvm_set_guest_debug_data *dbg_data = data;
+    struct kvm_set_guest_debug_data *dbg_data =
+        (struct kvm_set_guest_debug_data *) data.host_ptr;
 
     dbg_data->err = kvm_vcpu_ioctl(cpu, KVM_SET_GUEST_DEBUG,
                                    &dbg_data->dbg);
@@ -2237,7 +2238,8 @@ int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
     }
     kvm_arch_update_guest_debug(cpu, &data.dbg);
 
-    run_on_cpu(cpu, kvm_invoke_set_guest_debug, &data);
+    run_on_cpu(cpu, kvm_invoke_set_guest_debug,
+               RUN_ON_CPU_HOST_PTR(&data));
     return data.err;
 }
 
diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index 816272aa32..547053c27a 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -1842,6 +1842,8 @@ static void load_elf_image(const char *image_name, int image_fd,
     info->pt_dynamic_addr = 0;
 #endif
 
+    mmap_lock();
+
     /* Find the maximum size of the image and allocate an appropriate
        amount of memory to handle that.  */
     loaddr = -1, hiaddr = 0;
@@ -2002,6 +2004,8 @@ static void load_elf_image(const char *image_name, int image_fd,
         load_symbols(ehdr, image_fd, load_bias);
     }
 
+    mmap_unlock();
+
     close(image_fd);
     return;
 
diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index ffd099dfe7..61685bf79e 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -41,6 +41,11 @@ void mmap_unlock(void)
     }
 }
 
+bool have_mmap_lock(void)
+{
+    return mmap_lock_count > 0 ? true : false;
+}
+
 /* Grab lock to make sure things are in a consistent state after fork().  */
 void mmap_fork_start(void)
 {
diff --git a/memory.c b/memory.c
index edbc7012b6..33110e9698 100644
--- a/memory.c
+++ b/memory.c
@@ -1128,6 +1128,71 @@ const MemoryRegionOps unassigned_mem_ops = {
     .endianness = DEVICE_NATIVE_ENDIAN,
 };
 
+static uint64_t memory_region_ram_device_read(void *opaque,
+                                              hwaddr addr, unsigned size)
+{
+    MemoryRegion *mr = opaque;
+    uint64_t data = (uint64_t)~0;
+
+    switch (size) {
+    case 1:
+        data = *(uint8_t *)(mr->ram_block->host + addr);
+        break;
+    case 2:
+        data = *(uint16_t *)(mr->ram_block->host + addr);
+        break;
+    case 4:
+        data = *(uint32_t *)(mr->ram_block->host + addr);
+        break;
+    case 8:
+        data = *(uint64_t *)(mr->ram_block->host + addr);
+        break;
+    }
+
+    trace_memory_region_ram_device_read(get_cpu_index(), mr, addr, data, size);
+
+    return data;
+}
+
+static void memory_region_ram_device_write(void *opaque, hwaddr addr,
+                                           uint64_t data, unsigned size)
+{
+    MemoryRegion *mr = opaque;
+
+    trace_memory_region_ram_device_write(get_cpu_index(), mr, addr, data, size);
+
+    switch (size) {
+    case 1:
+        *(uint8_t *)(mr->ram_block->host + addr) = (uint8_t)data;
+        break;
+    case 2:
+        *(uint16_t *)(mr->ram_block->host + addr) = (uint16_t)data;
+        break;
+    case 4:
+        *(uint32_t *)(mr->ram_block->host + addr) = (uint32_t)data;
+        break;
+    case 8:
+        *(uint64_t *)(mr->ram_block->host + addr) = data;
+        break;
+    }
+}
+
+static const MemoryRegionOps ram_device_mem_ops = {
+    .read = memory_region_ram_device_read,
+    .write = memory_region_ram_device_write,
+    .endianness = DEVICE_NATIVE_ENDIAN,
+    .valid = {
+        .min_access_size = 1,
+        .max_access_size = 8,
+        .unaligned = true,
+    },
+    .impl = {
+        .min_access_size = 1,
+        .max_access_size = 8,
+        .unaligned = true,
+    },
+};
+
 bool memory_region_access_valid(MemoryRegion *mr,
                                 hwaddr addr,
                                 unsigned size,
@@ -1355,9 +1420,16 @@ void memory_region_init_ram_ptr(MemoryRegion *mr,
     mr->ram_block = qemu_ram_alloc_from_ptr(size, ptr, mr, &error_fatal);
 }
 
-void memory_region_set_skip_dump(MemoryRegion *mr)
+void memory_region_init_ram_device_ptr(MemoryRegion *mr,
+                                       Object *owner,
+                                       const char *name,
+                                       uint64_t size,
+                                       void *ptr)
 {
-    mr->skip_dump = true;
+    memory_region_init_ram_ptr(mr, owner, name, size, ptr);
+    mr->ram_device = true;
+    mr->ops = &ram_device_mem_ops;
+    mr->opaque = mr;
 }
 
 void memory_region_init_alias(MemoryRegion *mr,
@@ -1491,9 +1563,9 @@ const char *memory_region_name(const MemoryRegion *mr)
     return mr->name;
 }
 
-bool memory_region_is_skip_dump(MemoryRegion *mr)
+bool memory_region_is_ram_device(MemoryRegion *mr)
 {
-    return mr->skip_dump;
+    return mr->ram_device;
 }
 
 uint8_t memory_region_get_dirty_log_mask(MemoryRegion *mr)
diff --git a/memory_mapping.c b/memory_mapping.c
index e3e0d95172..6a39d71da2 100644
--- a/memory_mapping.c
+++ b/memory_mapping.c
@@ -206,7 +206,7 @@ static void guest_phys_blocks_region_add(MemoryListener *listener,
 
     /* we only care about RAM */
     if (!memory_region_is_ram(section->mr) ||
-        memory_region_is_skip_dump(section->mr)) {
+        memory_region_is_ram_device(section->mr)) {
         return;
     }
 
diff --git a/migration/Makefile.objs b/migration/Makefile.objs
index 30ad945918..3f3e237142 100644
--- a/migration/Makefile.objs
+++ b/migration/Makefile.objs
@@ -1,5 +1,7 @@
 common-obj-y += migration.o socket.o fd.o exec.o
 common-obj-y += tls.o
+common-obj-y += colo-comm.o
+common-obj-$(CONFIG_COLO) += colo.o colo-failover.o
 common-obj-y += vmstate.o
 common-obj-y += qemu-file.o
 common-obj-y += qemu-file-channel.o
diff --git a/migration/colo-comm.c b/migration/colo-comm.c
new file mode 100644
index 0000000000..20b60ec384
--- /dev/null
+++ b/migration/colo-comm.c
@@ -0,0 +1,72 @@
+/*
+ * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
+ * (a.k.a. Fault Tolerance or Continuous Replication)
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 FUJITSU LIMITED
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include <migration/colo.h>
+#include "trace.h"
+
+typedef struct {
+     bool colo_requested;
+} COLOInfo;
+
+static COLOInfo colo_info;
+
+COLOMode get_colo_mode(void)
+{
+    if (migration_in_colo_state()) {
+        return COLO_MODE_PRIMARY;
+    } else if (migration_incoming_in_colo_state()) {
+        return COLO_MODE_SECONDARY;
+    } else {
+        return COLO_MODE_UNKNOWN;
+    }
+}
+
+static void colo_info_pre_save(void *opaque)
+{
+    COLOInfo *s = opaque;
+
+    s->colo_requested = migrate_colo_enabled();
+}
+
+static bool colo_info_need(void *opaque)
+{
+   return migrate_colo_enabled();
+}
+
+static const VMStateDescription colo_state = {
+    .name = "COLOState",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .pre_save = colo_info_pre_save,
+    .needed = colo_info_need,
+    .fields = (VMStateField[]) {
+        VMSTATE_BOOL(colo_requested, COLOInfo),
+        VMSTATE_END_OF_LIST()
+    },
+};
+
+void colo_info_init(void)
+{
+    vmstate_register(NULL, 0, &colo_state, &colo_info);
+}
+
+bool migration_incoming_enable_colo(void)
+{
+    return colo_info.colo_requested;
+}
+
+void migration_incoming_exit_colo(void)
+{
+    colo_info.colo_requested = false;
+}
diff --git a/migration/colo-failover.c b/migration/colo-failover.c
new file mode 100644
index 0000000000..cc229f5ab1
--- /dev/null
+++ b/migration/colo-failover.c
@@ -0,0 +1,83 @@
+/*
+ * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
+ * (a.k.a. Fault Tolerance or Continuous Replication)
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 FUJITSU LIMITED
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "migration/colo.h"
+#include "migration/failover.h"
+#include "qmp-commands.h"
+#include "qapi/qmp/qerror.h"
+#include "qemu/error-report.h"
+#include "trace.h"
+
+static QEMUBH *failover_bh;
+static FailoverStatus failover_state;
+
+static void colo_failover_bh(void *opaque)
+{
+    int old_state;
+
+    qemu_bh_delete(failover_bh);
+    failover_bh = NULL;
+
+    old_state = failover_set_state(FAILOVER_STATUS_REQUIRE,
+                                   FAILOVER_STATUS_ACTIVE);
+    if (old_state != FAILOVER_STATUS_REQUIRE) {
+        error_report("Unknown error for failover, old_state = %s",
+                    FailoverStatus_lookup[old_state]);
+        return;
+    }
+
+    colo_do_failover(NULL);
+}
+
+void failover_request_active(Error **errp)
+{
+   if (failover_set_state(FAILOVER_STATUS_NONE,
+        FAILOVER_STATUS_REQUIRE) != FAILOVER_STATUS_NONE) {
+        error_setg(errp, "COLO failover is already actived");
+        return;
+    }
+    failover_bh = qemu_bh_new(colo_failover_bh, NULL);
+    qemu_bh_schedule(failover_bh);
+}
+
+void failover_init_state(void)
+{
+    failover_state = FAILOVER_STATUS_NONE;
+}
+
+FailoverStatus failover_set_state(FailoverStatus old_state,
+                    FailoverStatus new_state)
+{
+    FailoverStatus old;
+
+    old = atomic_cmpxchg(&failover_state, old_state, new_state);
+    if (old == old_state) {
+        trace_colo_failover_set_state(FailoverStatus_lookup[new_state]);
+    }
+    return old;
+}
+
+FailoverStatus failover_get_state(void)
+{
+    return atomic_read(&failover_state);
+}
+
+void qmp_x_colo_lost_heartbeat(Error **errp)
+{
+    if (get_colo_mode() == COLO_MODE_UNKNOWN) {
+        error_setg(errp, QERR_FEATURE_DISABLED, "colo");
+        return;
+    }
+
+    failover_request_active(errp);
+}
diff --git a/migration/colo.c b/migration/colo.c
new file mode 100644
index 0000000000..93c85c538b
--- /dev/null
+++ b/migration/colo.c
@@ -0,0 +1,529 @@
+/*
+ * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
+ * (a.k.a. Fault Tolerance or Continuous Replication)
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 FUJITSU LIMITED
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/timer.h"
+#include "sysemu/sysemu.h"
+#include "migration/colo.h"
+#include "io/channel-buffer.h"
+#include "trace.h"
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+#include "migration/failover.h"
+
+#define COLO_BUFFER_BASE_SIZE (4 * 1024 * 1024)
+
+bool colo_supported(void)
+{
+    return true;
+}
+
+bool migration_in_colo_state(void)
+{
+    MigrationState *s = migrate_get_current();
+
+    return (s->state == MIGRATION_STATUS_COLO);
+}
+
+bool migration_incoming_in_colo_state(void)
+{
+    MigrationIncomingState *mis = migration_incoming_get_current();
+
+    return mis && (mis->state == MIGRATION_STATUS_COLO);
+}
+
+static bool colo_runstate_is_stopped(void)
+{
+    return runstate_check(RUN_STATE_COLO) || !runstate_is_running();
+}
+
+static void secondary_vm_do_failover(void)
+{
+    int old_state;
+    MigrationIncomingState *mis = migration_incoming_get_current();
+
+    migrate_set_state(&mis->state, MIGRATION_STATUS_COLO,
+                      MIGRATION_STATUS_COMPLETED);
+
+    if (!autostart) {
+        error_report("\"-S\" qemu option will be ignored in secondary side");
+        /* recover runstate to normal migration finish state */
+        autostart = true;
+    }
+
+    old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
+                                   FAILOVER_STATUS_COMPLETED);
+    if (old_state != FAILOVER_STATUS_ACTIVE) {
+        error_report("Incorrect state (%s) while doing failover for "
+                     "secondary VM", FailoverStatus_lookup[old_state]);
+        return;
+    }
+    /* For Secondary VM, jump to incoming co */
+    if (mis->migration_incoming_co) {
+        qemu_coroutine_enter(mis->migration_incoming_co);
+    }
+}
+
+static void primary_vm_do_failover(void)
+{
+    MigrationState *s = migrate_get_current();
+    int old_state;
+
+    migrate_set_state(&s->state, MIGRATION_STATUS_COLO,
+                      MIGRATION_STATUS_COMPLETED);
+
+    old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
+                                   FAILOVER_STATUS_COMPLETED);
+    if (old_state != FAILOVER_STATUS_ACTIVE) {
+        error_report("Incorrect state (%s) while doing failover for Primary VM",
+                     FailoverStatus_lookup[old_state]);
+        return;
+    }
+}
+
+void colo_do_failover(MigrationState *s)
+{
+    /* Make sure VM stopped while failover happened. */
+    if (!colo_runstate_is_stopped()) {
+        vm_stop_force_state(RUN_STATE_COLO);
+    }
+
+    if (get_colo_mode() == COLO_MODE_PRIMARY) {
+        primary_vm_do_failover();
+    } else {
+        secondary_vm_do_failover();
+    }
+}
+
+static void colo_send_message(QEMUFile *f, COLOMessage msg,
+                              Error **errp)
+{
+    int ret;
+
+    if (msg >= COLO_MESSAGE__MAX) {
+        error_setg(errp, "%s: Invalid message", __func__);
+        return;
+    }
+    qemu_put_be32(f, msg);
+    qemu_fflush(f);
+
+    ret = qemu_file_get_error(f);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Can't send COLO message");
+    }
+    trace_colo_send_message(COLOMessage_lookup[msg]);
+}
+
+static void colo_send_message_value(QEMUFile *f, COLOMessage msg,
+                                    uint64_t value, Error **errp)
+{
+    Error *local_err = NULL;
+    int ret;
+
+    colo_send_message(f, msg, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+    qemu_put_be64(f, value);
+    qemu_fflush(f);
+
+    ret = qemu_file_get_error(f);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Failed to send value for message:%s",
+                         COLOMessage_lookup[msg]);
+    }
+}
+
+static COLOMessage colo_receive_message(QEMUFile *f, Error **errp)
+{
+    COLOMessage msg;
+    int ret;
+
+    msg = qemu_get_be32(f);
+    ret = qemu_file_get_error(f);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Can't receive COLO message");
+        return msg;
+    }
+    if (msg >= COLO_MESSAGE__MAX) {
+        error_setg(errp, "%s: Invalid message", __func__);
+        return msg;
+    }
+    trace_colo_receive_message(COLOMessage_lookup[msg]);
+    return msg;
+}
+
+static void colo_receive_check_message(QEMUFile *f, COLOMessage expect_msg,
+                                       Error **errp)
+{
+    COLOMessage msg;
+    Error *local_err = NULL;
+
+    msg = colo_receive_message(f, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+    if (msg != expect_msg) {
+        error_setg(errp, "Unexpected COLO message %d, expected %d",
+                          msg, expect_msg);
+    }
+}
+
+static uint64_t colo_receive_message_value(QEMUFile *f, uint32_t expect_msg,
+                                           Error **errp)
+{
+    Error *local_err = NULL;
+    uint64_t value;
+    int ret;
+
+    colo_receive_check_message(f, expect_msg, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return 0;
+    }
+
+    value = qemu_get_be64(f);
+    ret = qemu_file_get_error(f);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Failed to get value for COLO message: %s",
+                         COLOMessage_lookup[expect_msg]);
+    }
+    return value;
+}
+
+static int colo_do_checkpoint_transaction(MigrationState *s,
+                                          QIOChannelBuffer *bioc,
+                                          QEMUFile *fb)
+{
+    Error *local_err = NULL;
+    int ret = -1;
+
+    colo_send_message(s->to_dst_file, COLO_MESSAGE_CHECKPOINT_REQUEST,
+                      &local_err);
+    if (local_err) {
+        goto out;
+    }
+
+    colo_receive_check_message(s->rp_state.from_dst_file,
+                    COLO_MESSAGE_CHECKPOINT_REPLY, &local_err);
+    if (local_err) {
+        goto out;
+    }
+    /* Reset channel-buffer directly */
+    qio_channel_io_seek(QIO_CHANNEL(bioc), 0, 0, NULL);
+    bioc->usage = 0;
+
+    qemu_mutex_lock_iothread();
+    if (failover_get_state() != FAILOVER_STATUS_NONE) {
+        qemu_mutex_unlock_iothread();
+        goto out;
+    }
+    vm_stop_force_state(RUN_STATE_COLO);
+    qemu_mutex_unlock_iothread();
+    trace_colo_vm_state_change("run", "stop");
+    /*
+     * Failover request bh could be called after vm_stop_force_state(),
+     * So we need check failover_request_is_active() again.
+     */
+    if (failover_get_state() != FAILOVER_STATUS_NONE) {
+        goto out;
+    }
+
+    /* Disable block migration */
+    s->params.blk = 0;
+    s->params.shared = 0;
+    qemu_savevm_state_header(fb);
+    qemu_savevm_state_begin(fb, &s->params);
+    qemu_mutex_lock_iothread();
+    qemu_savevm_state_complete_precopy(fb, false);
+    qemu_mutex_unlock_iothread();
+
+    qemu_fflush(fb);
+
+    colo_send_message(s->to_dst_file, COLO_MESSAGE_VMSTATE_SEND, &local_err);
+    if (local_err) {
+        goto out;
+    }
+    /*
+     * We need the size of the VMstate data in Secondary side,
+     * With which we can decide how much data should be read.
+     */
+    colo_send_message_value(s->to_dst_file, COLO_MESSAGE_VMSTATE_SIZE,
+                            bioc->usage, &local_err);
+    if (local_err) {
+        goto out;
+    }
+
+    qemu_put_buffer(s->to_dst_file, bioc->data, bioc->usage);
+    qemu_fflush(s->to_dst_file);
+    ret = qemu_file_get_error(s->to_dst_file);
+    if (ret < 0) {
+        goto out;
+    }
+
+    colo_receive_check_message(s->rp_state.from_dst_file,
+                       COLO_MESSAGE_VMSTATE_RECEIVED, &local_err);
+    if (local_err) {
+        goto out;
+    }
+
+    colo_receive_check_message(s->rp_state.from_dst_file,
+                       COLO_MESSAGE_VMSTATE_LOADED, &local_err);
+    if (local_err) {
+        goto out;
+    }
+
+    ret = 0;
+
+    qemu_mutex_lock_iothread();
+    vm_start();
+    qemu_mutex_unlock_iothread();
+    trace_colo_vm_state_change("stop", "run");
+
+out:
+    if (local_err) {
+        error_report_err(local_err);
+    }
+    return ret;
+}
+
+static void colo_process_checkpoint(MigrationState *s)
+{
+    QIOChannelBuffer *bioc;
+    QEMUFile *fb = NULL;
+    int64_t current_time, checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
+    Error *local_err = NULL;
+    int ret;
+
+    failover_init_state();
+
+    s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file);
+    if (!s->rp_state.from_dst_file) {
+        error_report("Open QEMUFile from_dst_file failed");
+        goto out;
+    }
+
+    /*
+     * Wait for Secondary finish loading VM states and enter COLO
+     * restore.
+     */
+    colo_receive_check_message(s->rp_state.from_dst_file,
+                       COLO_MESSAGE_CHECKPOINT_READY, &local_err);
+    if (local_err) {
+        goto out;
+    }
+    bioc = qio_channel_buffer_new(COLO_BUFFER_BASE_SIZE);
+    fb = qemu_fopen_channel_output(QIO_CHANNEL(bioc));
+    object_unref(OBJECT(bioc));
+
+    qemu_mutex_lock_iothread();
+    vm_start();
+    qemu_mutex_unlock_iothread();
+    trace_colo_vm_state_change("stop", "run");
+
+    while (s->state == MIGRATION_STATUS_COLO) {
+        if (failover_get_state() != FAILOVER_STATUS_NONE) {
+            error_report("failover request");
+            goto out;
+        }
+
+        current_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
+        if (current_time - checkpoint_time <
+            s->parameters.x_checkpoint_delay) {
+            int64_t delay_ms;
+
+            delay_ms = s->parameters.x_checkpoint_delay -
+                       (current_time - checkpoint_time);
+            g_usleep(delay_ms * 1000);
+        }
+        ret = colo_do_checkpoint_transaction(s, bioc, fb);
+        if (ret < 0) {
+            goto out;
+        }
+        checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
+    }
+
+out:
+    /* Throw the unreported error message after exited from loop */
+    if (local_err) {
+        error_report_err(local_err);
+    }
+
+    if (fb) {
+        qemu_fclose(fb);
+    }
+
+    if (s->rp_state.from_dst_file) {
+        qemu_fclose(s->rp_state.from_dst_file);
+    }
+}
+
+void migrate_start_colo_process(MigrationState *s)
+{
+    qemu_mutex_unlock_iothread();
+    migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
+                      MIGRATION_STATUS_COLO);
+    colo_process_checkpoint(s);
+    qemu_mutex_lock_iothread();
+}
+
+static void colo_wait_handle_message(QEMUFile *f, int *checkpoint_request,
+                                     Error **errp)
+{
+    COLOMessage msg;
+    Error *local_err = NULL;
+
+    msg = colo_receive_message(f, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    switch (msg) {
+    case COLO_MESSAGE_CHECKPOINT_REQUEST:
+        *checkpoint_request = 1;
+        break;
+    default:
+        *checkpoint_request = 0;
+        error_setg(errp, "Got unknown COLO message: %d", msg);
+        break;
+    }
+}
+
+void *colo_process_incoming_thread(void *opaque)
+{
+    MigrationIncomingState *mis = opaque;
+    QEMUFile *fb = NULL;
+    QIOChannelBuffer *bioc = NULL; /* Cache incoming device state */
+    uint64_t total_size;
+    uint64_t value;
+    Error *local_err = NULL;
+
+    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
+                      MIGRATION_STATUS_COLO);
+
+    failover_init_state();
+
+    mis->to_src_file = qemu_file_get_return_path(mis->from_src_file);
+    if (!mis->to_src_file) {
+        error_report("COLO incoming thread: Open QEMUFile to_src_file failed");
+        goto out;
+    }
+    /*
+     * Note: the communication between Primary side and Secondary side
+     * should be sequential, we set the fd to unblocked in migration incoming
+     * coroutine, and here we are in the COLO incoming thread, so it is ok to
+     * set the fd back to blocked.
+     */
+    qemu_file_set_blocking(mis->from_src_file, true);
+
+    bioc = qio_channel_buffer_new(COLO_BUFFER_BASE_SIZE);
+    fb = qemu_fopen_channel_input(QIO_CHANNEL(bioc));
+    object_unref(OBJECT(bioc));
+
+    colo_send_message(mis->to_src_file, COLO_MESSAGE_CHECKPOINT_READY,
+                      &local_err);
+    if (local_err) {
+        goto out;
+    }
+
+    while (mis->state == MIGRATION_STATUS_COLO) {
+        int request = 0;
+
+        colo_wait_handle_message(mis->from_src_file, &request, &local_err);
+        if (local_err) {
+            goto out;
+        }
+        assert(request);
+        if (failover_get_state() != FAILOVER_STATUS_NONE) {
+            error_report("failover request");
+            goto out;
+        }
+
+        /* FIXME: This is unnecessary for periodic checkpoint mode */
+        colo_send_message(mis->to_src_file, COLO_MESSAGE_CHECKPOINT_REPLY,
+                     &local_err);
+        if (local_err) {
+            goto out;
+        }
+
+        colo_receive_check_message(mis->from_src_file,
+                           COLO_MESSAGE_VMSTATE_SEND, &local_err);
+        if (local_err) {
+            goto out;
+        }
+
+        value = colo_receive_message_value(mis->from_src_file,
+                                 COLO_MESSAGE_VMSTATE_SIZE, &local_err);
+        if (local_err) {
+            goto out;
+        }
+
+        /*
+         * Read VM device state data into channel buffer,
+         * It's better to re-use the memory allocated.
+         * Here we need to handle the channel buffer directly.
+         */
+        if (value > bioc->capacity) {
+            bioc->capacity = value;
+            bioc->data = g_realloc(bioc->data, bioc->capacity);
+        }
+        total_size = qemu_get_buffer(mis->from_src_file, bioc->data, value);
+        if (total_size != value) {
+            error_report("Got %" PRIu64 " VMState data, less than expected"
+                        " %" PRIu64, total_size, value);
+            goto out;
+        }
+        bioc->usage = total_size;
+        qio_channel_io_seek(QIO_CHANNEL(bioc), 0, 0, NULL);
+
+        colo_send_message(mis->to_src_file, COLO_MESSAGE_VMSTATE_RECEIVED,
+                     &local_err);
+        if (local_err) {
+            goto out;
+        }
+
+        qemu_mutex_lock_iothread();
+        qemu_system_reset(VMRESET_SILENT);
+        if (qemu_loadvm_state(fb) < 0) {
+            error_report("COLO: loadvm failed");
+            qemu_mutex_unlock_iothread();
+            goto out;
+        }
+        qemu_mutex_unlock_iothread();
+
+        colo_send_message(mis->to_src_file, COLO_MESSAGE_VMSTATE_LOADED,
+                     &local_err);
+        if (local_err) {
+            goto out;
+        }
+    }
+
+out:
+    /* Throw the unreported error message after exited from loop */
+    if (local_err) {
+        error_report_err(local_err);
+    }
+
+    if (fb) {
+        qemu_fclose(fb);
+    }
+
+    if (mis->to_src_file) {
+        qemu_fclose(mis->to_src_file);
+    }
+    migration_incoming_exit_colo();
+
+    return NULL;
+}
diff --git a/migration/migration.c b/migration/migration.c
index 156e70791a..e331f28382 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -36,6 +36,7 @@
 #include "exec/address-spaces.h"
 #include "io/channel-buffer.h"
 #include "io/channel-tls.h"
+#include "migration/colo.h"
 
 #define MAX_THROTTLE  (32 << 20)      /* Migration transfer speed throttling */
 
@@ -62,6 +63,11 @@
 /* Migration XBZRLE default cache size */
 #define DEFAULT_MIGRATE_CACHE_SIZE (64 * 1024 * 1024)
 
+/* The delay time (in ms) between two COLO checkpoints
+ * Note: Please change this default value to 10000 when we support hybrid mode.
+ */
+#define DEFAULT_MIGRATE_X_CHECKPOINT_DELAY 200
+
 static NotifierList migration_state_notifiers =
     NOTIFIER_LIST_INITIALIZER(migration_state_notifiers);
 
@@ -94,6 +100,7 @@ MigrationState *migrate_get_current(void)
             .cpu_throttle_increment = DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT,
             .max_bandwidth = MAX_THROTTLE,
             .downtime_limit = DEFAULT_MIGRATE_SET_DOWNTIME,
+            .x_checkpoint_delay = DEFAULT_MIGRATE_X_CHECKPOINT_DELAY,
         },
     };
 
@@ -406,6 +413,18 @@ static void process_incoming_migration_co(void *opaque)
         /* Else if something went wrong then just fall out of the normal exit */
     }
 
+    /* we get COLO info, and know if we are in COLO mode */
+    if (!ret && migration_incoming_enable_colo()) {
+        mis->migration_incoming_co = qemu_coroutine_self();
+        qemu_thread_create(&mis->colo_incoming_thread, "COLO incoming",
+             colo_process_incoming_thread, mis, QEMU_THREAD_JOINABLE);
+        mis->have_colo_incoming_thread = true;
+        qemu_coroutine_yield();
+
+        /* Wait checkpoint incoming thread exit before free resource */
+        qemu_thread_join(&mis->colo_incoming_thread);
+    }
+
     qemu_fclose(f);
     free_xbzrle_decoded_buf();
 
@@ -531,6 +550,9 @@ MigrationCapabilityStatusList *qmp_query_migrate_capabilities(Error **errp)
 
     caps = NULL; /* silence compiler warning */
     for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
+        if (i == MIGRATION_CAPABILITY_X_COLO && !colo_supported()) {
+            continue;
+        }
         if (head == NULL) {
             head = g_malloc0(sizeof(*caps));
             caps = head;
@@ -571,6 +593,7 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp)
     params->max_bandwidth = s->parameters.max_bandwidth;
     params->has_downtime_limit = true;
     params->downtime_limit = s->parameters.downtime_limit;
+    params->x_checkpoint_delay = s->parameters.x_checkpoint_delay;
 
     return params;
 }
@@ -691,6 +714,10 @@ MigrationInfo *qmp_query_migrate(Error **errp)
 
         get_xbzrle_cache_stats(info);
         break;
+    case MIGRATION_STATUS_COLO:
+        info->has_status = true;
+        /* TODO: display COLO specific information (checkpoint info etc.) */
+        break;
     case MIGRATION_STATUS_COMPLETED:
         get_xbzrle_cache_stats(info);
 
@@ -733,6 +760,14 @@ void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
     }
 
     for (cap = params; cap; cap = cap->next) {
+        if (cap->value->capability == MIGRATION_CAPABILITY_X_COLO) {
+            if (!colo_supported()) {
+                error_setg(errp, "COLO is not currently supported, please"
+                             " configure with --enable-colo option in order to"
+                             " support COLO feature");
+                continue;
+            }
+        }
         s->enabled_capabilities[cap->value->capability] = cap->value->state;
     }
 
@@ -817,6 +852,11 @@ void qmp_migrate_set_parameters(MigrationParameters *params, Error **errp)
                    "an integer in the range of 0 to 2000000 milliseconds");
         return;
     }
+    if (params->has_x_checkpoint_delay && (params->x_checkpoint_delay < 0)) {
+        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
+                    "x_checkpoint_delay",
+                    "is invalid, it should be positive");
+    }
 
     if (params->has_compress_level) {
         s->parameters.compress_level = params->compress_level;
@@ -851,6 +891,10 @@ void qmp_migrate_set_parameters(MigrationParameters *params, Error **errp)
     if (params->has_downtime_limit) {
         s->parameters.downtime_limit = params->downtime_limit;
     }
+
+    if (params->has_x_checkpoint_delay) {
+        s->parameters.x_checkpoint_delay = params->x_checkpoint_delay;
+    }
 }
 
 
@@ -922,7 +966,7 @@ static void migrate_fd_cleanup(void *opaque)
 
 void migrate_fd_error(MigrationState *s, const Error *error)
 {
-    trace_migrate_fd_error(error ? error_get_pretty(error) : "");
+    trace_migrate_fd_error(error_get_pretty(error));
     assert(s->to_dst_file == NULL);
     migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
                       MIGRATION_STATUS_FAILED);
@@ -1101,7 +1145,8 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
     params.shared = has_inc && inc;
 
     if (migration_is_setup_or_active(s->state) ||
-        s->state == MIGRATION_STATUS_CANCELLING) {
+        s->state == MIGRATION_STATUS_CANCELLING ||
+        s->state == MIGRATION_STATUS_COLO) {
         error_setg(errp, QERR_MIGRATION_ACTIVE);
         return;
     }
@@ -1649,7 +1694,11 @@ static void migration_completion(MigrationState *s, int current_active_state,
 
         if (!ret) {
             ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
-            if (ret >= 0) {
+            /*
+             * Don't mark the image with BDRV_O_INACTIVE flag if
+             * we will go into COLO stage later.
+             */
+            if (ret >= 0 && !migrate_colo_enabled()) {
                 ret = bdrv_inactivate_all();
             }
             if (ret >= 0) {
@@ -1691,8 +1740,11 @@ static void migration_completion(MigrationState *s, int current_active_state,
         goto fail_invalidate;
     }
 
-    migrate_set_state(&s->state, current_active_state,
-                      MIGRATION_STATUS_COMPLETED);
+    if (!migrate_colo_enabled()) {
+        migrate_set_state(&s->state, current_active_state,
+                          MIGRATION_STATUS_COMPLETED);
+    }
+
     return;
 
 fail_invalidate:
@@ -1713,6 +1765,12 @@ fail:
                       MIGRATION_STATUS_FAILED);
 }
 
+bool migrate_colo_enabled(void)
+{
+    MigrationState *s = migrate_get_current();
+    return s->enabled_capabilities[MIGRATION_CAPABILITY_X_COLO];
+}
+
 /*
  * Master migration thread on the source VM.
  * It drives the migration and pumps the data down the outgoing channel.
@@ -1731,6 +1789,7 @@ static void *migration_thread(void *opaque)
     bool entered_postcopy = false;
     /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
     enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE;
+    bool enable_colo = migrate_colo_enabled();
 
     rcu_register_thread();
 
@@ -1839,7 +1898,13 @@ static void *migration_thread(void *opaque)
     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 
     qemu_mutex_lock_iothread();
-    qemu_savevm_state_cleanup();
+    /*
+     * The resource has been allocated by migration will be reused in COLO
+     * process, so don't release them.
+     */
+    if (!enable_colo) {
+        qemu_savevm_state_cleanup();
+    }
     if (s->state == MIGRATION_STATUS_COMPLETED) {
         uint64_t transferred_bytes = qemu_ftell(s->to_dst_file);
         s->total_time = end_time - s->total_time;
@@ -1852,6 +1917,15 @@ static void *migration_thread(void *opaque)
         }
         runstate_set(RUN_STATE_POSTMIGRATE);
     } else {
+        if (s->state == MIGRATION_STATUS_ACTIVE && enable_colo) {
+            migrate_start_colo_process(s);
+            qemu_savevm_state_cleanup();
+            /*
+            * Fixme: we will run VM in COLO no matter its old running state.
+            * After exited COLO, we will keep running.
+            */
+            old_vm_running = true;
+        }
         if (old_vm_running && !entered_postcopy) {
             vm_start();
         } else {
diff --git a/migration/ram.c b/migration/ram.c
index d032d389c4..fb9252d722 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -43,6 +43,7 @@
 #include "trace.h"
 #include "exec/ram_addr.h"
 #include "qemu/rcu_queue.h"
+#include "migration/colo.h"
 
 #ifdef DEBUG_MIGRATION_RAM
 #define DPRINTF(fmt, ...) \
@@ -1871,16 +1872,8 @@ err:
     return ret;
 }
 
-
-/* Each of ram_save_setup, ram_save_iterate and ram_save_complete has
- * long-running RCU critical section.  When rcu-reclaims in the code
- * start to become numerous it will be necessary to reduce the
- * granularity of these critical sections.
- */
-
-static int ram_save_setup(QEMUFile *f, void *opaque)
+static int ram_save_init_globals(void)
 {
-    RAMBlock *block;
     int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
 
     dirty_rate_high_cnt = 0;
@@ -1947,6 +1940,29 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
     migration_bitmap_sync();
     qemu_mutex_unlock_ramlist();
     qemu_mutex_unlock_iothread();
+    rcu_read_unlock();
+
+    return 0;
+}
+
+/* Each of ram_save_setup, ram_save_iterate and ram_save_complete has
+ * long-running RCU critical section.  When rcu-reclaims in the code
+ * start to become numerous it will be necessary to reduce the
+ * granularity of these critical sections.
+ */
+
+static int ram_save_setup(QEMUFile *f, void *opaque)
+{
+    RAMBlock *block;
+
+    /* migration has already setup the bitmap, reuse it. */
+    if (!migration_in_colo_state()) {
+        if (ram_save_init_globals() < 0) {
+            return -1;
+         }
+    }
+
+    rcu_read_lock();
 
     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
 
@@ -2048,7 +2064,8 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
     while (true) {
         int pages;
 
-        pages = ram_find_and_save_block(f, true, &bytes_transferred);
+        pages = ram_find_and_save_block(f, !migration_in_colo_state(),
+                                        &bytes_transferred);
         /* no more blocks to sent */
         if (pages == 0) {
             break;
diff --git a/migration/trace-events b/migration/trace-events
index dfee75abf4..94134f700b 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -207,3 +207,9 @@ migration_tls_outgoing_handshake_complete(void) ""
 migration_tls_incoming_handshake_start(void) ""
 migration_tls_incoming_handshake_error(const char *err) "err=%s"
 migration_tls_incoming_handshake_complete(void) ""
+
+# migration/colo.c
+colo_vm_state_change(const char *old, const char *new) "Change '%s' => '%s'"
+colo_send_message(const char *msg) "Send '%s' message"
+colo_receive_message(const char *msg) "Receive '%s' message"
+colo_failover_set_state(const char *new_state) "new state %s"
diff --git a/monitor.c b/monitor.c
index 21dcfb28c1..7b963ad1ad 100644
--- a/monitor.c
+++ b/monitor.c
@@ -59,7 +59,6 @@
 #include "qapi/qmp/json-streamer.h"
 #include "qapi/qmp/json-parser.h"
 #include "qom/object_interfaces.h"
-#include "cpu.h"
 #include "trace.h"
 #include "trace/control.h"
 #include "monitor/hmp-target.h"
@@ -76,7 +75,6 @@
 #include "qapi/qmp-event.h"
 #include "qapi-event.h"
 #include "qmp-introspect.h"
-#include "sysemu/block-backend.h"
 #include "sysemu/qtest.h"
 #include "qemu/cutils.h"
 #include "qapi/qmp/dispatch.h"
@@ -4094,7 +4092,7 @@ QemuOptsList qemu_mon_opts = {
             .name = "chardev",
             .type = QEMU_OPT_STRING,
         },{
-            .name = "default",
+            .name = "default",  /* deprecated */
             .type = QEMU_OPT_BOOL,
         },{
             .name = "pretty",
diff --git a/net/colo-compare.c b/net/colo-compare.c
index f791383dbc..9bfc736f55 100644
--- a/net/colo-compare.c
+++ b/net/colo-compare.c
@@ -92,10 +92,6 @@ typedef struct CompareClass {
     ObjectClass parent_class;
 } CompareClass;
 
-typedef struct CompareChardevProps {
-    bool is_socket;
-} CompareChardevProps;
-
 enum {
     PRIMARY_IN = 0,
     SECONDARY_IN,
@@ -218,16 +214,17 @@ static int colo_packet_compare_tcp(Packet *spkt, Packet *ppkt)
                 (spkt->size - ETH_HLEN));
 
     if (res != 0 && trace_event_get_state(TRACE_COLO_COMPARE_MISCOMPARE)) {
-        trace_colo_compare_pkt_info(inet_ntoa(ppkt->ip->ip_src),
-                                    inet_ntoa(ppkt->ip->ip_dst),
-                                    ntohl(ptcp->th_seq),
-                                    ntohl(ptcp->th_ack),
-                                    ntohl(stcp->th_seq),
-                                    ntohl(stcp->th_ack),
-                                    res, ptcp->th_flags,
-                                    stcp->th_flags,
-                                    ppkt->size,
-                                    spkt->size);
+        trace_colo_compare_pkt_info_src(inet_ntoa(ppkt->ip->ip_src),
+                                        ntohl(stcp->th_seq),
+                                        ntohl(stcp->th_ack),
+                                        res, stcp->th_flags,
+                                        spkt->size);
+
+        trace_colo_compare_pkt_info_dst(inet_ntoa(ppkt->ip->ip_dst),
+                                        ntohl(ptcp->th_seq),
+                                        ntohl(ptcp->th_ack),
+                                        res, ptcp->th_flags,
+                                        ppkt->size);
 
         qemu_hexdump((char *)ppkt->data, stderr,
                      "colo-compare ppkt", ppkt->size);
@@ -571,8 +568,6 @@ static int find_and_check_chardev(CharDriverState **chr,
                                   char *chr_name,
                                   Error **errp)
 {
-    CompareChardevProps props;
-
     *chr = qemu_chr_find(chr_name);
     if (*chr == NULL) {
         error_setg(errp, "Device '%s' not found",
@@ -580,8 +575,6 @@ static int find_and_check_chardev(CharDriverState **chr,
         return 1;
     }
 
-    memset(&props, 0, sizeof(props));
-
     if (!qemu_chr_has_feature(*chr, QEMU_CHAR_FEATURE_RECONNECTABLE)) {
         error_setg(errp, "chardev \"%s\" is not reconnectable",
                    chr_name);
diff --git a/net/trace-events b/net/trace-events
index b1913a6666..35198bc742 100644
--- a/net/trace-events
+++ b/net/trace-events
@@ -13,7 +13,8 @@ colo_compare_icmp_miscompare(const char *sta, int size) ": %s = %d"
 colo_compare_ip_info(int psize, const char *sta, const char *stb, int ssize, const char *stc, const char *std) "ppkt size = %d, ip_src = %s, ip_dst = %s, spkt size = %d, ip_src = %s, ip_dst = %s"
 colo_old_packet_check_found(int64_t old_time) "%" PRId64
 colo_compare_miscompare(void) ""
-colo_compare_pkt_info(const char *src, const char *dst, uint32_t pseq, uint32_t pack, uint32_t sseq, uint32_t sack, int res, uint32_t pflag, uint32_t sflag, int psize, int ssize) "src/dst: %s/%s p: seq/ack=%u/%u   s: seq/ack=%u/%u res=%d flags=%x/%x ppkt_size: %d spkt_size: %d\n"
+colo_compare_pkt_info_src(const char *src, uint32_t sseq, uint32_t sack, int res, uint32_t sflag, int ssize) "src/dst: %s s: seq/ack=%u/%u res=%d flags=%x spkt_size: %d\n"
+colo_compare_pkt_info_dst(const char *dst, uint32_t dseq, uint32_t dack, int res, uint32_t dflag, int dsize) "src/dst: %s d: seq/ack=%u/%u res=%d flags=%x dpkt_size: %d\n"
 
 # net/filter-rewriter.c
 colo_filter_rewriter_debug(void) ""
diff --git a/pc-bios/openbios-ppc b/pc-bios/openbios-ppc
index 9b1876b157..9847e7b30f 100644
--- a/pc-bios/openbios-ppc
+++ b/pc-bios/openbios-ppc
diff --git a/pc-bios/openbios-sparc32 b/pc-bios/openbios-sparc32
index 9ee908ff8f..adc7b8abeb 100644
--- a/pc-bios/openbios-sparc32
+++ b/pc-bios/openbios-sparc32
diff --git a/pc-bios/openbios-sparc64 b/pc-bios/openbios-sparc64
index 92ccdf554f..1a7059bf6a 100644
--- a/pc-bios/openbios-sparc64
+++ b/pc-bios/openbios-sparc64
diff --git a/qapi-schema.json b/qapi-schema.json
index d6a43a108c..5dc96af469 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -175,12 +175,15 @@
 # @watchdog: the watchdog action is configured to pause and has been triggered
 #
 # @guest-panicked: guest has been panicked as a result of guest OS panic
+#
+# @colo: guest is paused to save/restore VM state under colo checkpoint (since
+# 2.8)
 ##
 { 'enum': 'RunState',
   'data': [ 'debug', 'inmigrate', 'internal-error', 'io-error', 'paused',
             'postmigrate', 'prelaunch', 'finish-migrate', 'restore-vm',
             'running', 'save-vm', 'shutdown', 'suspended', 'watchdog',
-            'guest-panicked' ] }
+            'guest-panicked', 'colo' ] }
 
 ##
 # @StatusInfo:
@@ -459,12 +462,14 @@
 #
 # @failed: some error occurred during migration process.
 #
+# @colo: VM is in the process of fault tolerance. (since 2.8)
+#
 # Since: 2.3
 #
 ##
 { 'enum': 'MigrationStatus',
   'data': [ 'none', 'setup', 'cancelling', 'cancelled',
-            'active', 'postcopy-active', 'completed', 'failed' ] }
+            'active', 'postcopy-active', 'completed', 'failed', 'colo' ] }
 
 ##
 # @MigrationInfo
@@ -574,11 +579,16 @@
 #          been migrated, pulling the remaining pages along as needed. NOTE: If
 #          the migration fails during postcopy the VM will fail.  (since 2.6)
 #
+# @x-colo: If enabled, migration will never end, and the state of the VM on the
+#        primary side will be migrated continuously to the VM on secondary
+#        side, this process is called COarse-Grain LOck Stepping (COLO) for
+#        Non-stop Service. (since 2.8)
+#
 # Since: 1.2
 ##
 { 'enum': 'MigrationCapability',
   'data': ['xbzrle', 'rdma-pin-all', 'auto-converge', 'zero-blocks',
-           'compress', 'events', 'postcopy-ram'] }
+           'compress', 'events', 'postcopy-ram', 'x-colo'] }
 
 ##
 # @MigrationCapabilityStatus
@@ -664,19 +674,24 @@
 # @downtime-limit: set maximum tolerated downtime for migration. maximum
 #                  downtime in milliseconds (Since 2.8)
 #
+# @x-checkpoint-delay: The delay time (in ms) between two COLO checkpoints in
+#          periodic mode. (Since 2.8)
+#
 # Since: 2.4
 ##
 { 'enum': 'MigrationParameter',
   'data': ['compress-level', 'compress-threads', 'decompress-threads',
            'cpu-throttle-initial', 'cpu-throttle-increment',
            'tls-creds', 'tls-hostname', 'max-bandwidth',
-           'downtime-limit'] }
+           'downtime-limit', 'x-checkpoint-delay' ] }
 
 #
 # @migrate-set-parameters
 #
 # Set various migration parameters.  See MigrationParameters for details.
 #
+# @x-checkpoint-delay: the delay time between two checkpoints. (Since 2.8)
+#
 # Since: 2.4
 ##
 { 'command': 'migrate-set-parameters', 'boxed': true,
@@ -725,6 +740,8 @@
 # @downtime-limit: set maximum tolerated downtime for migration. maximum
 #                  downtime in milliseconds (Since 2.8)
 #
+# @x-checkpoint-delay: the delay time between two COLO checkpoints. (Since 2.8)
+#
 # Since: 2.4
 ##
 { 'struct': 'MigrationParameters',
@@ -736,7 +753,8 @@
             '*tls-creds': 'str',
             '*tls-hostname': 'str',
             '*max-bandwidth': 'int',
-            '*downtime-limit': 'int'} }
+            '*downtime-limit': 'int',
+            '*x-checkpoint-delay': 'int'} }
 
 ##
 # @query-migrate-parameters
@@ -780,6 +798,78 @@
 { 'command': 'migrate-start-postcopy' }
 
 ##
+# @COLOMessage
+#
+# The message transmission between Primary side and Secondary side.
+#
+# @checkpoint-ready: Secondary VM (SVM) is ready for checkpointing
+#
+# @checkpoint-request: Primary VM (PVM) tells SVM to prepare for checkpointing
+#
+# @checkpoint-reply: SVM gets PVM's checkpoint request
+#
+# @vmstate-send: VM's state will be sent by PVM.
+#
+# @vmstate-size: The total size of VMstate.
+#
+# @vmstate-received: VM's state has been received by SVM.
+#
+# @vmstate-loaded: VM's state has been loaded by SVM.
+#
+# Since: 2.8
+##
+{ 'enum': 'COLOMessage',
+  'data': [ 'checkpoint-ready', 'checkpoint-request', 'checkpoint-reply',
+            'vmstate-send', 'vmstate-size', 'vmstate-received',
+            'vmstate-loaded' ] }
+
+##
+# @COLOMode
+#
+# The colo mode
+#
+# @unknown: unknown mode
+#
+# @primary: master side
+#
+# @secondary: slave side
+#
+# Since: 2.8
+##
+{ 'enum': 'COLOMode',
+  'data': [ 'unknown', 'primary', 'secondary'] }
+
+##
+# @FailoverStatus
+#
+# An enumeration of COLO failover status
+#
+# @none: no failover has ever happened
+#
+# @require: got failover requirement but not handled
+#
+# @active: in the process of doing failover
+#
+# @completed: finish the process of failover
+#
+# Since: 2.8
+##
+{ 'enum': 'FailoverStatus',
+  'data': [ 'none', 'require', 'active', 'completed'] }
+
+##
+# @x-colo-lost-heartbeat
+#
+# Tell qemu that heartbeat is lost, request it to do takeover procedures.
+# If this command is sent to the PVM, the Primary side will exit COLO mode.
+# If sent to the Secondary, the Secondary side will run failover work,
+# then takes over server operation to become the service VM.
+#
+# Since: 2.8
+##
+{ 'command': 'x-colo-lost-heartbeat' }
+
+##
 # @MouseInfo:
 #
 # Information about a mouse device.
@@ -973,12 +1063,14 @@
 #
 # @unix: unix socket
 #
+# @vsock: vsock family (since 2.8)
+#
 # @unknown: otherwise
 #
 # Since: 2.1
 ##
 { 'enum': 'NetworkAddressFamily',
-  'data': [ 'ipv4', 'ipv6', 'unix', 'unknown' ] }
+  'data': [ 'ipv4', 'ipv6', 'unix', 'vsock', 'unknown' ] }
 
 ##
 # @VncBasicInfo
@@ -3005,6 +3097,24 @@
     'path': 'str' } }
 
 ##
+# @VsockSocketAddress
+#
+# Captures a socket address in the vsock namespace.
+#
+# @cid: unique host identifier
+# @port: port
+#
+# Note that string types are used to allow for possible future hostname or
+# service resolution support.
+#
+# Since 2.8
+##
+{ 'struct': 'VsockSocketAddress',
+  'data': {
+    'cid': 'str',
+    'port': 'str' } }
+
+##
 # @SocketAddress
 #
 # Captures the address of a socket, which could also be a named file descriptor
@@ -3015,6 +3125,7 @@
   'data': {
     'inet': 'InetSocketAddress',
     'unix': 'UnixSocketAddress',
+    'vsock': 'VsockSocketAddress',
     'fd': 'String' } }
 
 ##
diff --git a/qapi/block-core.json b/qapi/block-core.json
index cd1fa7ba07..bcd3b9effe 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -1464,6 +1464,13 @@
 # with query-block-jobs.  The operation can be stopped before it has completed
 # using the block-job-cancel command.
 #
+# The node that receives the data is called the top image, can be located in
+# any part of the chain (but always above the base image; see below) and can be
+# specified using its device or node name. Earlier qemu versions only allowed
+# 'device' to name the top level node; presence of the 'base-node' parameter
+# during introspection can be used as a witness of the enhanced semantics
+# of 'device'.
+#
 # If a base file is specified then sectors are not copied from that base file and
 # its backing chain.  When streaming completes the image file will have the base
 # file as its backing file.  This can be used to stream a subset of the backing
@@ -1475,12 +1482,16 @@
 # @job-id: #optional identifier for the newly-created block job. If
 #          omitted, the device name will be used. (Since 2.7)
 #
-# @device: the device name or node-name of a root node
+# @device: the device or node name of the top image
+#
+# @base:   #optional the common backing file name.
+#                    It cannot be set if @base-node is also set.
 #
-# @base:   #optional the common backing file name
+# @base-node: #optional the node name of the backing file.
+#                       It cannot be set if @base is also set. (Since 2.8)
 #
-# @backing-file: #optional The backing file string to write into the active
-#                          layer. This filename is not validated.
+# @backing-file: #optional The backing file string to write into the top
+#                          image. This filename is not validated.
 #
 #                          If a pathname string is such that it cannot be
 #                          resolved by QEMU, that means that subsequent QMP or
@@ -1504,7 +1515,7 @@
 ##
 { 'command': 'block-stream',
   'data': { '*job-id': 'str', 'device': 'str', '*base': 'str',
-            '*backing-file': 'str', '*speed': 'int',
+            '*base-node': 'str', '*backing-file': 'str', '*speed': 'int',
             '*on-error': 'BlockdevOnError' } }
 
 ##
@@ -1703,16 +1714,17 @@
 #
 # @host_device, @host_cdrom: Since 2.1
 # @gluster: Since 2.7
-# @nbd: Since 2.8
+# @nbd, @nfs, @replication, @ssh: Since 2.8
 #
 # Since: 2.0
 ##
 { 'enum': 'BlockdevDriver',
   'data': [ 'archipelago', 'blkdebug', 'blkverify', 'bochs', 'cloop',
             'dmg', 'file', 'ftp', 'ftps', 'gluster', 'host_cdrom',
-            'host_device', 'http', 'https', 'luks', 'nbd', 'null-aio',
+            'host_device', 'http', 'https', 'luks', 'nbd', 'nfs', 'null-aio',
             'null-co', 'parallels', 'qcow', 'qcow2', 'qed', 'quorum', 'raw',
-            'replication', 'tftp', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
+            'replication', 'ssh', 'tftp', 'vdi', 'vhdx', 'vmdk', 'vpc',
+            'vvfat' ] }
 
 ##
 # @BlockdevOptionsFile
@@ -1949,6 +1961,25 @@
             '*vport': 'int',
             '*segment': 'str' } }
 
+##
+# @BlockdevOptionsSsh
+#
+# @server:              host address
+#
+# @path:                path to the image on the host
+#
+# @user:                #optional user as which to connect, defaults to current
+#                       local user name
+#
+# TODO: Expose the host_key_check option in QMP
+#
+# Since 2.8
+##
+{ 'struct': 'BlockdevOptionsSsh',
+  'data': { 'server': 'InetSocketAddress',
+            'path': 'str',
+            '*user': 'str' } }
+
 
 ##
 # @BlkdebugEvent
@@ -2166,7 +2197,7 @@
 #
 # @debug-level: #optional libgfapi log level (default '4' which is Error)
 #
-# @logfile:     #optional libgfapi log file (default /dev/stderr)
+# @logfile:     #optional libgfapi log file (default /dev/stderr) (Since 2.8)
 #
 # Since: 2.7
 ##
@@ -2209,6 +2240,74 @@
             '*top-id': 'str' } }
 
 ##
+# @NFSTransport
+#
+# An enumeration of NFS transport types
+#
+# @inet:        TCP transport
+#
+# Since 2.8
+##
+{ 'enum': 'NFSTransport',
+  'data': [ 'inet' ] }
+
+##
+# @NFSServer
+#
+# Captures the address of the socket
+#
+# @type:        transport type used for NFS (only TCP supported)
+#
+# @host:        host address for NFS server
+#
+# Since 2.8
+##
+{ 'struct': 'NFSServer',
+  'data': { 'type': 'NFSTransport',
+            'host': 'str' } }
+
+##
+# @BlockdevOptionsNfs
+#
+# Driver specific block device option for NFS
+#
+# @server:                  host address
+#
+# @path:                    path of the image on the host
+#
+# @user:                    #optional UID value to use when talking to the
+#                           server (defaults to 65534 on Windows and getuid()
+#                           on unix)
+#
+# @group:                   #optional GID value to use when talking to the
+#                           server (defaults to 65534 on Windows and getgid()
+#                           in unix)
+#
+# @tcp-syn-count:           #optional number of SYNs during the session
+#                           establishment (defaults to libnfs default)
+#
+# @readahead-size:          #optional set the readahead size in bytes (defaults
+#                           to libnfs default)
+#
+# @page-cache-size:         #optional set the pagecache size in bytes (defaults
+#                           to libnfs default)
+#
+# @debug-level:             #optional set the NFS debug level (max 2) (defaults
+#                           to libnfs default)
+#
+# Since 2.8
+##
+{ 'struct': 'BlockdevOptionsNfs',
+  'data': { 'server': 'NFSServer',
+            'path': 'str',
+            '*user': 'int',
+            '*group': 'int',
+            '*tcp-syn-count': 'int',
+            '*readahead-size': 'int',
+            '*page-cache-size': 'int',
+            '*debug-level': 'int' } }
+
+##
 # @BlockdevOptionsCurl
 #
 # Driver specific block device options for the curl backend.
@@ -2239,6 +2338,20 @@
             '*tls-creds': 'str' } }
 
 ##
+# @BlockdevOptionsRaw
+#
+# Driver specific block device options for the raw driver.
+#
+# @offset:      #optional position where the block device starts
+# @size:        #optional the assumed size of the device
+#
+# Since: 2.8
+##
+{ 'struct': 'BlockdevOptionsRaw',
+  'base': 'BlockdevOptionsGenericFormat',
+  'data': { '*offset': 'int', '*size': 'int' } }
+
+##
 # @BlockdevOptions
 #
 # Options for creating a block device.  Many options are available for all
@@ -2284,7 +2397,7 @@
 # TODO iscsi: Wait for structured options
       'luks':       'BlockdevOptionsLUKS',
       'nbd':        'BlockdevOptionsNbd',
-# TODO nfs: Wait for structured options
+      'nfs':        'BlockdevOptionsNfs',
       'null-aio':   'BlockdevOptionsNull',
       'null-co':    'BlockdevOptionsNull',
       'parallels':  'BlockdevOptionsGenericFormat',
@@ -2292,11 +2405,11 @@
       'qcow':       'BlockdevOptionsGenericCOWFormat',
       'qed':        'BlockdevOptionsGenericCOWFormat',
       'quorum':     'BlockdevOptionsQuorum',
-      'raw':        'BlockdevOptionsGenericFormat',
+      'raw':        'BlockdevOptionsRaw',
 # TODO rbd: Wait for structured options
       'replication':'BlockdevOptionsReplication',
 # TODO sheepdog: Wait for structured options
-# TODO ssh: Should take InetSocketAddress for 'host'?
+      'ssh':        'BlockdevOptionsSsh',
       'tftp':       'BlockdevOptionsCurl',
       'vdi':        'BlockdevOptionsGenericFormat',
       'vhdx':       'BlockdevOptionsGenericFormat',
diff --git a/qemu-ga.texi b/qemu-ga.texi
index 0e53bf6b2c..4c7a8fd163 100644
--- a/qemu-ga.texi
+++ b/qemu-ga.texi
@@ -30,7 +30,7 @@ set user's password
 @end itemize
 
 qemu-ga will read a system configuration file on startup (located at
-q@file{/etc/qemu/qemu-ga.conf} by default), then parse remaining
+@file{/etc/qemu/qemu-ga.conf} by default), then parse remaining
 configuration options on the command line. For the same key, the last
 option wins, but the lists accumulate (see below for configuration
 file format).
diff --git a/qemu-img.c b/qemu-img.c
index afcd51ff18..6949b73ca5 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -795,6 +795,7 @@ static void run_block_job(BlockJob *job, Error **errp)
 {
     AioContext *aio_context = blk_get_aio_context(job->blk);
 
+    aio_context_acquire(aio_context);
     do {
         aio_poll(aio_context, true);
         qemu_progress_print(job->len ?
@@ -802,6 +803,7 @@ static void run_block_job(BlockJob *job, Error **errp)
     } while (!job->ready);
 
     block_job_complete_sync(job, errp);
+    aio_context_release(aio_context);
 
     /* A block job may finish instantaneously without publishing any progress,
      * so just signal completion here */
@@ -819,6 +821,7 @@ static int img_commit(int argc, char **argv)
     Error *local_err = NULL;
     CommonBlockJobCBInfo cbi;
     bool image_opts = false;
+    AioContext *aio_context;
 
     fmt = NULL;
     cache = BDRV_DEFAULT_CACHE;
@@ -928,8 +931,12 @@ static int img_commit(int argc, char **argv)
         .bs   = bs,
     };
 
-    commit_active_start("commit", bs, base_bs, 0, BLOCKDEV_ON_ERROR_REPORT,
-                        common_block_job_cb, &cbi, &local_err, false);
+    aio_context = bdrv_get_aio_context(bs);
+    aio_context_acquire(aio_context);
+    commit_active_start("commit", bs, base_bs, BLOCK_JOB_DEFAULT, 0,
+                        BLOCKDEV_ON_ERROR_REPORT, common_block_job_cb, &cbi,
+                        &local_err, false);
+    aio_context_release(aio_context);
     if (local_err) {
         goto done;
     }
diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index 3a3838a079..95bcde1d88 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -18,7 +18,6 @@
 #include "qemu/error-report.h"
 #include "qemu/main-loop.h"
 #include "qemu/timer.h"
-#include "sysemu/block-backend.h"
 #include "qemu/cutils.h"
 
 #define CMD_NOFILE_OK   0x01
@@ -1956,7 +1955,7 @@ static int reopen_f(BlockBackend *blk, int argc, char **argv)
     qemu_opts_reset(&reopen_opts);
 
     brq = bdrv_reopen_queue(NULL, bs, opts, flags);
-    bdrv_reopen_multiple(brq, &local_err);
+    bdrv_reopen_multiple(bdrv_get_aio_context(bs), brq, &local_err);
     if (local_err) {
         error_report_err(local_err);
     } else {
@@ -2216,6 +2215,7 @@ static const cmdinfo_t help_cmd = {
 
 bool qemuio_command(BlockBackend *blk, const char *cmd)
 {
+    AioContext *ctx;
     char *input;
     const cmdinfo_t *ct;
     char **v;
@@ -2227,7 +2227,10 @@ bool qemuio_command(BlockBackend *blk, const char *cmd)
     if (c) {
         ct = find_command(v[0]);
         if (ct) {
+            ctx = blk ? blk_get_aio_context(blk) : qemu_get_aio_context();
+            aio_context_acquire(ctx);
             done = command(blk, ct, c, v);
+            aio_context_release(ctx);
         } else {
             fprintf(stderr, "command \"%s\" not found\n", v[0]);
         }
diff --git a/qemu-options.hx b/qemu-options.hx
index 66f1d28b09..4536e18ac0 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -172,7 +172,7 @@ DEF("set", HAS_ARG, QEMU_OPTION_set,
 STEXI
 @item -set @var{group}.@var{id}.@var{arg}=@var{value}
 @findex -set
-Set parameter @var{arg} for item @var{id} of type @var{group}\n"
+Set parameter @var{arg} for item @var{id} of type @var{group}
 ETEXI
 
 DEF("global", HAS_ARG, QEMU_OPTION_global,
@@ -2239,7 +2239,7 @@ two serial ports and the QEMU monitor:
 
 @example
 -chardev stdio,mux=on,id=char0 \
--mon chardev=char0,mode=readline,default \
+-mon chardev=char0,mode=readline \
 -serial chardev:char0 \
 -serial chardev:char0
 @end example
@@ -2250,7 +2250,7 @@ multiplexed between the QEMU monitor and a parallel port:
 
 @example
 -chardev stdio,mux=on,id=char0 \
--mon chardev=char0,mode=readline,default \
+-mon chardev=char0,mode=readline \
 -parallel chardev:char0 \
 -chardev tcp,...,mux=on,id=char1 \
 -serial chardev:char1 \
@@ -3112,9 +3112,9 @@ Like -qmp but uses pretty JSON formatting.
 ETEXI
 
 DEF("mon", HAS_ARG, QEMU_OPTION_mon, \
-    "-mon [chardev=]name[,mode=readline|control][,default]\n", QEMU_ARCH_ALL)
+    "-mon [chardev=]name[,mode=readline|control]\n", QEMU_ARCH_ALL)
 STEXI
-@item -mon [chardev=]name[,mode=readline|control][,default]
+@item -mon [chardev=]name[,mode=readline|control]
 @findex -mon
 Setup monitor on chardev @var{name}.
 ETEXI
@@ -3902,7 +3902,7 @@ colo secondary:
 -object filter-redirector,id=f2,netdev=hn0,queue=rx,outdev=red1
 -object filter-rewriter,id=rew0,netdev=hn0,queue=all
 
-@item -object filter-dump,id=@var{id},netdev=@var{dev},file=@var{filename}][,maxlen=@var{len}]
+@item -object filter-dump,id=@var{id},netdev=@var{dev}[,file=@var{filename}][,maxlen=@var{len}]
 
 Dump the network traffic on netdev @var{dev} to the file specified by
 @var{filename}. At most @var{len} bytes (64k by default) per packet are stored.
diff --git a/qga/channel-posix.c b/qga/channel-posix.c
index bb65d8ba17..71582e0c38 100644
--- a/qga/channel-posix.c
+++ b/qga/channel-posix.c
@@ -26,13 +26,10 @@ static gboolean ga_channel_listen_accept(GIOChannel *channel,
     GAChannel *c = data;
     int ret, client_fd;
     bool accepted = false;
-    struct sockaddr_un addr;
-    socklen_t addrlen = sizeof(addr);
 
     g_assert(channel != NULL);
 
-    client_fd = qemu_accept(g_io_channel_unix_get_fd(channel),
-                            (struct sockaddr *)&addr, &addrlen);
+    client_fd = qemu_accept(g_io_channel_unix_get_fd(channel), NULL, NULL);
     if (client_fd == -1) {
         g_warning("error converting fd to gsocket: %s", strerror(errno));
         goto out;
@@ -64,7 +61,6 @@ static void ga_channel_listen_add(GAChannel *c, int listen_fd, bool create)
 
 static void ga_channel_listen_close(GAChannel *c)
 {
-    g_assert(c->method == GA_CHANNEL_UNIX_LISTEN);
     g_assert(c->listen_channel);
     g_io_channel_shutdown(c->listen_channel, true, NULL);
     g_io_channel_unref(c->listen_channel);
@@ -80,7 +76,7 @@ static void ga_channel_client_close(GAChannel *c)
     g_io_channel_shutdown(c->client_channel, true, NULL);
     g_io_channel_unref(c->client_channel);
     c->client_channel = NULL;
-    if (c->method == GA_CHANNEL_UNIX_LISTEN && c->listen_channel) {
+    if (c->listen_channel) {
         ga_channel_listen_add(c, 0, false);
     }
 }
@@ -197,6 +193,31 @@ static gboolean ga_channel_open(GAChannel *c, const gchar *path, GAChannelMethod
         ga_channel_listen_add(c, fd, true);
         break;
     }
+    case GA_CHANNEL_VSOCK_LISTEN: {
+        Error *local_err = NULL;
+        SocketAddress *addr;
+        char *addr_str;
+        int fd;
+
+        addr_str = g_strdup_printf("vsock:%s", path);
+        addr = socket_parse(addr_str, &local_err);
+        g_free(addr_str);
+        if (local_err != NULL) {
+            g_critical("%s", error_get_pretty(local_err));
+            error_free(local_err);
+            return false;
+        }
+
+        fd = socket_listen(addr, &local_err);
+        qapi_free_SocketAddress(addr);
+        if (local_err != NULL) {
+            g_critical("%s", error_get_pretty(local_err));
+            error_free(local_err);
+            return false;
+        }
+        ga_channel_listen_add(c, fd, true);
+        break;
+    }
     default:
         g_critical("error binding/listening to specified socket");
         return false;
@@ -258,8 +279,7 @@ GAChannel *ga_channel_new(GAChannelMethod method, const gchar *path,
 
 void ga_channel_free(GAChannel *c)
 {
-    if (c->method == GA_CHANNEL_UNIX_LISTEN
-        && c->listen_channel) {
+    if (c->listen_channel) {
         ga_channel_listen_close(c);
     }
     if (c->client_channel) {
diff --git a/qga/channel.h b/qga/channel.h
index ae8cf0f7e0..8fd0c8f72c 100644
--- a/qga/channel.h
+++ b/qga/channel.h
@@ -19,6 +19,7 @@ typedef enum {
     GA_CHANNEL_VIRTIO_SERIAL,
     GA_CHANNEL_ISA_SERIAL,
     GA_CHANNEL_UNIX_LISTEN,
+    GA_CHANNEL_VSOCK_LISTEN,
 } GAChannelMethod;
 
 typedef gboolean (*GAChannelCallback)(GIOCondition condition, gpointer opaque);
diff --git a/qga/commands-win32.c b/qga/commands-win32.c
index 9c9be12116..19d72b2411 100644
--- a/qga/commands-win32.c
+++ b/qga/commands-win32.c
@@ -840,8 +840,99 @@ static void guest_fsfreeze_cleanup(void)
 GuestFilesystemTrimResponse *
 qmp_guest_fstrim(bool has_minimum, int64_t minimum, Error **errp)
 {
-    error_setg(errp, QERR_UNSUPPORTED);
-    return NULL;
+    GuestFilesystemTrimResponse *resp;
+    HANDLE handle;
+    WCHAR guid[MAX_PATH] = L"";
+
+    handle = FindFirstVolumeW(guid, ARRAYSIZE(guid));
+    if (handle == INVALID_HANDLE_VALUE) {
+        error_setg_win32(errp, GetLastError(), "failed to find any volume");
+        return NULL;
+    }
+
+    resp = g_new0(GuestFilesystemTrimResponse, 1);
+
+    do {
+        GuestFilesystemTrimResult *res;
+        GuestFilesystemTrimResultList *list;
+        PWCHAR uc_path;
+        DWORD char_count = 0;
+        char *path, *out;
+        GError *gerr = NULL;
+        gchar * argv[4];
+
+        GetVolumePathNamesForVolumeNameW(guid, NULL, 0, &char_count);
+
+        if (GetLastError() != ERROR_MORE_DATA) {
+            continue;
+        }
+        if (GetDriveTypeW(guid) != DRIVE_FIXED) {
+            continue;
+        }
+
+        uc_path = g_malloc(sizeof(WCHAR) * char_count);
+        if (!GetVolumePathNamesForVolumeNameW(guid, uc_path, char_count,
+                                              &char_count) || !*uc_path) {
+            /* strange, but this condition could be faced even with size == 2 */
+            g_free(uc_path);
+            continue;
+        }
+
+        res = g_new0(GuestFilesystemTrimResult, 1);
+
+        path = g_utf16_to_utf8(uc_path, char_count, NULL, NULL, &gerr);
+
+        g_free(uc_path);
+
+        if (!path) {
+            res->has_error = true;
+            res->error = g_strdup(gerr->message);
+            g_error_free(gerr);
+            break;
+        }
+
+        res->path = path;
+
+        list = g_new0(GuestFilesystemTrimResultList, 1);
+        list->value = res;
+        list->next = resp->paths;
+
+        resp->paths = list;
+
+        memset(argv, 0, sizeof(argv));
+        argv[0] = (gchar *)"defrag.exe";
+        argv[1] = (gchar *)"/L";
+        argv[2] = path;
+
+        if (!g_spawn_sync(NULL, argv, NULL, G_SPAWN_SEARCH_PATH, NULL, NULL,
+                          &out /* stdout */, NULL /* stdin */,
+                          NULL, &gerr)) {
+            res->has_error = true;
+            res->error = g_strdup(gerr->message);
+            g_error_free(gerr);
+        } else {
+            /* defrag.exe is UGLY. Exit code is ALWAYS zero.
+               Error is reported in the output with something like
+               (x89000020) etc code in the stdout */
+
+            int i;
+            gchar **lines = g_strsplit(out, "\r\n", 0);
+            g_free(out);
+
+            for (i = 0; lines[i] != NULL; i++) {
+                if (g_strstr_len(lines[i], -1, "(0x") == NULL) {
+                    continue;
+                }
+                res->has_error = true;
+                res->error = g_strdup(lines[i]);
+                break;
+            }
+            g_strfreev(lines);
+        }
+    } while (FindNextVolumeW(handle, guid, ARRAYSIZE(guid)));
+
+    FindVolumeClose(handle);
+    return resp;
 }
 
 typedef enum {
@@ -1416,7 +1507,7 @@ GList *ga_command_blacklist_init(GList *blacklist)
         "guest-get-memory-blocks", "guest-set-memory-blocks",
         "guest-get-memory-block-size",
         "guest-fsfreeze-freeze-list",
-        "guest-fstrim", NULL};
+        NULL};
     char **p = (char **)list_unsupported;
 
     while (*p) {
diff --git a/qga/main.c b/qga/main.c
index 0b9d04ea04..6caf215575 100644
--- a/qga/main.c
+++ b/qga/main.c
@@ -190,8 +190,8 @@ static void usage(const char *cmd)
 "Usage: %s [-m <method> -p <path>] [<options>]\n"
 "QEMU Guest Agent %s\n"
 "\n"
-"  -m, --method      transport method: one of unix-listen, virtio-serial, or\n"
-"                    isa-serial (virtio-serial is the default)\n"
+"  -m, --method      transport method: one of unix-listen, virtio-serial,\n"
+"                    isa-serial, or vsock-listen (virtio-serial is the default)\n"
 "  -p, --path        device/socket path (the default for virtio-serial is:\n"
 "                    %s,\n"
 "                    the default for isa-serial is:\n"
@@ -659,6 +659,8 @@ static gboolean channel_init(GAState *s, const gchar *method, const gchar *path)
         channel_method = GA_CHANNEL_ISA_SERIAL;
     } else if (strcmp(method, "unix-listen") == 0) {
         channel_method = GA_CHANNEL_UNIX_LISTEN;
+    } else if (strcmp(method, "vsock-listen") == 0) {
+        channel_method = GA_CHANNEL_VSOCK_LISTEN;
     } else {
         g_critical("unsupported channel method/type: %s", method);
         return false;
diff --git a/qmp.c b/qmp.c
index a06cb7ba62..0028f0b30e 100644
--- a/qmp.c
+++ b/qmp.c
@@ -36,7 +36,6 @@
 #include "qom/object_interfaces.h"
 #include "hw/mem/pc-dimm.h"
 #include "hw/acpi/acpi_dev_interface.h"
-#include "qemu/uuid.h"
 
 NameInfo *qmp_query_name(Error **errp)
 {
diff --git a/roms/openbios b/roms/openbios
-Subproject c5542f226c0d3d61e7bb578b70e591097d57547
+Subproject 1dc4f162efc0f00a36126cab8e7b906335f6b70
diff --git a/scripts/clean-includes b/scripts/clean-includes
index 4412a5590a..dd938daa3e 100755
--- a/scripts/clean-includes
+++ b/scripts/clean-includes
@@ -14,15 +14,18 @@
 # the top-level directory.
 
 # Usage:
-#   clean-includes [--git subjectprefix] file ...
+#   clean-includes [--git subjectprefix] [--check-dup-head] file ...
 # or
-#   clean-includes [--git subjectprefix] --all
+#   clean-includes [--git subjectprefix] [--check-dup-head] --all
 #
 # If the --git subjectprefix option is given, then after making
 # the changes to the files this script will create a git commit
 # with the subject line "subjectprefix: Clean up includes"
 # and a boilerplate commit message.
 #
+# If --check-dup-head is specified, additionally check for duplicate
+# header includes.
+#
 # Using --all will cause clean-includes to run on the whole source
 # tree (excluding certain directories which are known not to need
 # handling).
@@ -45,23 +48,40 @@
 
 
 GIT=no
+DUPHEAD=no
 
 # Extended regular expression defining files to ignore when using --all
 XDIRREGEX='^(tests/tcg|tests/multiboot|pc-bios|disas/libvixl)'
 
-if [ $# -ne 0 ] && [ "$1" = "--git" ]; then
-    if [ $# -eq 1 ]; then
-        echo "--git option requires an argument"
-        exit 1
-    fi
-    GITSUBJ="$2"
-    GIT=yes
-    shift
-    shift
-fi
+while true
+do
+    case $1 in
+    "--git")
+         if [ $# -eq 1 ]; then
+             echo "--git option requires an argument"
+             exit 1
+         fi
+         GITSUBJ="$2"
+         GIT=yes
+         shift
+         shift
+         ;;
+    "--check-dup-head")
+        DUPHEAD=yes
+        shift
+        ;;
+    "--")
+        shift
+        break
+        ;;
+    *)
+        break
+        ;;
+   esac
+done
 
 if [ $# -eq 0 ]; then
-    echo "Usage: clean-includes [--git subjectprefix] [--all | foo.c ...]"
+    echo "Usage: clean-includes [--git subjectprefix] [--check-dup-head] [--all | foo.c ...]"
     echo "(modifies the files in place)"
     exit 1
 fi
@@ -91,7 +111,6 @@ cat >"$COCCIFILE" <<EOT
 )
 EOT
 
-
 for f in "$@"; do
   case "$f" in
     *.inc.c)
@@ -154,6 +173,15 @@ for f in "$@"; do
 
 done
 
+if [ "$DUPHEAD" = "yes" ]; then
+    egrep "^[[:space:]]*#[[:space:]]*include" "$@" | tr -d '[:blank:]' \
+        | sort | uniq -c | awk '{if ($1 > 1) print $0}'
+    if [ $? -eq 0 ]; then
+        echo "Found duplicate header file includes. Please check the above files manually."
+        exit 1
+    fi
+fi
+
 if [ "$GIT" = "yes" ]; then
     git add -- "$@"
     git commit --signoff -F - <<EOF
diff --git a/scripts/hxtool b/scripts/hxtool
index 995bb7f08c..04f7d7b0ed 100644
--- a/scripts/hxtool
+++ b/scripts/hxtool
@@ -26,32 +26,32 @@ hxtotexi()
             ;;
             STEXI*)
             if test $flag -eq 1 ; then
-                echo "line $line: syntax error: expected ETEXI, found $str" >&2
+                printf "line %d: syntax error: expected ETEXI, found '%s'\n" "$line" "$str" >&2
                 exit 1
             fi
             flag=1
             ;;
             ETEXI*)
             if test $flag -ne 1 ; then
-                echo "line $line: syntax error: expected STEXI, found $str" >&2
+                printf "line %d: syntax error: expected STEXI, found '%s'\n" "$line" "$str" >&2
                 exit 1
             fi
             flag=0
             ;;
             SQMP*|EQMP*)
             if test $flag -eq 1 ; then
-                echo "line $line: syntax error: expected ETEXI, found $str" >&2
+                printf "line %d: syntax error: expected ETEXI, found '%s'\n" "$line" "$str" >&2
                 exit 1
             fi
             ;;
             DEFHEADING*)
-            echo "$(expr "$str" : "DEFHEADING(\(.*\))")"
+            printf '%s\n' "$(expr "$str" : "DEFHEADING(\(.*\))")"
             ;;
             ARCHHEADING*)
-            echo "$(expr "$str" : "ARCHHEADING(\(.*\),.*)")"
+            printf '%s\n' "$(expr "$str" : "ARCHHEADING(\(.*\),.*)")"
             ;;
             *)
-            test $flag -eq 1 && echo "$str"
+            test $flag -eq 1 && printf '%s\n' "$str"
             ;;
         esac
         line=$((line+1))
@@ -69,26 +69,26 @@ hxtoqmp()
             ;;
             SQMP*)
             if test $flag -eq 1 ; then
-                echo "line $line: syntax error: expected EQMP, found $str" >&2
+                printf "line %d: syntax error: expected EQMP, found '%s'\n" "$line" "$str" >&2
                 exit 1
             fi
             flag=1
             ;;
             EQMP*)
             if test $flag -ne 1 ; then
-                echo "line $line: syntax error: expected SQMP, found $str" >&2
+                printf "line %d: syntax error: expected SQMP, found '%s'\n" "$line" "$str" >&2
                 exit 1
             fi
             flag=0
             ;;
             STEXI*|ETEXI*)
             if test $flag -eq 1 ; then
-                echo "line $line: syntax error: expected EQMP, found $str" >&2
+                printf "line %d: syntax error: expected EQMP, found '%s'\n" "$line" "$str" >&2
                 exit 1
             fi
             ;;
             *)
-            test $flag -eq 1 && echo "$str"
+            test $flag -eq 1 && printf '%s\n' "$str"
             ;;
         esac
         line=$((line+1))
diff --git a/scripts/tracetool.py b/scripts/tracetool.py
index 629b2593c8..fe9c9e904b 100755
--- a/scripts/tracetool.py
+++ b/scripts/tracetool.py
@@ -70,7 +70,7 @@ def make_group_name(filename):
 
     if dirname == "":
         return "common"
-    return re.sub(r"/|-", "_", dirname)
+    return re.sub(r"[^A-Za-z0-9]", "_", dirname)
 
 def main(args):
     global _SCRIPT
diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs
index c5850e858e..7f236a7c1f 100644
--- a/stubs/Makefile.objs
+++ b/stubs/Makefile.objs
@@ -17,6 +17,7 @@ stub-obj-y += gdbstub.o
 stub-obj-y += get-fd.o
 stub-obj-y += get-next-serial.o
 stub-obj-y += get-vm-name.o
+stub-obj-y += iothread.o
 stub-obj-y += iothread-lock.o
 stub-obj-y += is-daemonized.o
 stub-obj-y += machine-init-done.o
@@ -48,3 +49,4 @@ stub-obj-y += iohandler.o
 stub-obj-y += smbios_type_38.o
 stub-obj-y += ipmi.o
 stub-obj-y += pc_madt_cpu_entry.o
+stub-obj-y += migration-colo.o
diff --git a/stubs/iothread.c b/stubs/iothread.c
new file mode 100644
index 0000000000..8cc9e28c55
--- /dev/null
+++ b/stubs/iothread.c
@@ -0,0 +1,8 @@
+#include "qemu/osdep.h"
+#include "block/aio.h"
+#include "qemu/main-loop.h"
+
+AioContext *qemu_get_current_aio_context(void)
+{
+    return qemu_get_aio_context();
+}
diff --git a/stubs/migration-colo.c b/stubs/migration-colo.c
new file mode 100644
index 0000000000..7811764c4b
--- /dev/null
+++ b/stubs/migration-colo.c
@@ -0,0 +1,46 @@
+/*
+ * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
+ * (a.k.a. Fault Tolerance or Continuous Replication)
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 FUJITSU LIMITED
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "migration/colo.h"
+#include "qmp-commands.h"
+
+bool colo_supported(void)
+{
+    return false;
+}
+
+bool migration_in_colo_state(void)
+{
+    return false;
+}
+
+bool migration_incoming_in_colo_state(void)
+{
+    return false;
+}
+
+void migrate_start_colo_process(MigrationState *s)
+{
+}
+
+void *colo_process_incoming_thread(void *opaque)
+{
+    return NULL;
+}
+
+void qmp_x_colo_lost_heartbeat(Error **errp)
+{
+    error_setg(errp, "COLO is not supported, please rerun configure"
+                     " with --enable-colo option in order to support"
+                     " COLO feature");
+}
diff --git a/target-alpha/translate.c b/target-alpha/translate.c
index 03e47765ed..114927b751 100644
--- a/target-alpha/translate.c
+++ b/target-alpha/translate.c
@@ -2994,9 +2994,11 @@ void gen_intermediate_code(CPUAlphaState *env, struct TranslationBlock *tb)
 #ifdef DEBUG_DISAS
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)
         && qemu_log_in_addr_range(pc_start)) {
+        qemu_log_lock();
         qemu_log("IN: %s\n", lookup_symbol(pc_start));
         log_target_disas(cs, pc_start, ctx.pc - pc_start, 1);
         qemu_log("\n");
+        qemu_log_unlock();
     }
 #endif
 }
diff --git a/target-arm/Makefile.objs b/target-arm/Makefile.objs
index f20641163c..847fb52ee0 100644
--- a/target-arm/Makefile.objs
+++ b/target-arm/Makefile.objs
@@ -9,4 +9,4 @@ obj-y += neon_helper.o iwmmxt_helper.o
 obj-y += gdbstub.o
 obj-$(TARGET_AARCH64) += cpu64.o translate-a64.o helper-a64.o gdbstub64.o
 obj-y += crypto_helper.o
-obj-y += arm-powerctl.o
+obj-$(CONFIG_SOFTMMU) += arm-powerctl.o
diff --git a/target-arm/arm-powerctl.c b/target-arm/arm-powerctl.c
index 6519d52cae..fbb7a15daa 100644
--- a/target-arm/arm-powerctl.c
+++ b/target-arm/arm-powerctl.c
@@ -166,6 +166,8 @@ int arm_set_cpu_on(uint64_t cpuid, uint64_t entry, uint64_t context_id,
     /* Start the new CPU at the requested address */
     cpu_set_pc(target_cpu_state, entry);
 
+    qemu_cpu_kick(target_cpu_state);
+
     /* We are good to go */
     return QEMU_ARM_POWERCTL_RET_SUCCESS;
 }
diff --git a/target-arm/cpu.c b/target-arm/cpu.c
index 2439ca57d0..99f0dbebb9 100644
--- a/target-arm/cpu.c
+++ b/target-arm/cpu.c
@@ -19,6 +19,7 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/error-report.h"
 #include "qapi/error.h"
 #include "cpu.h"
 #include "internals.h"
@@ -496,6 +497,10 @@ static Property arm_cpu_rvbar_property =
 static Property arm_cpu_has_el3_property =
             DEFINE_PROP_BOOL("has_el3", ARMCPU, has_el3, true);
 
+/* use property name "pmu" to match other archs and virt tools */
+static Property arm_cpu_has_pmu_property =
+            DEFINE_PROP_BOOL("pmu", ARMCPU, has_pmu, true);
+
 static Property arm_cpu_has_mpu_property =
             DEFINE_PROP_BOOL("has-mpu", ARMCPU, has_mpu, true);
 
@@ -539,6 +544,11 @@ static void arm_cpu_post_init(Object *obj)
 #endif
     }
 
+    if (arm_feature(&cpu->env, ARM_FEATURE_PMU)) {
+        qdev_property_add_static(DEVICE(obj), &arm_cpu_has_pmu_property,
+                                 &error_abort);
+    }
+
     if (arm_feature(&cpu->env, ARM_FEATURE_MPU)) {
         qdev_property_add_static(DEVICE(obj), &arm_cpu_has_mpu_property,
                                  &error_abort);
@@ -677,6 +687,11 @@ static void arm_cpu_realizefn(DeviceState *dev, Error **errp)
         cpu->id_aa64pfr0 &= ~0xf000;
     }
 
+    if (!cpu->has_pmu || !kvm_enabled()) {
+        cpu->has_pmu = false;
+        unset_feature(env, ARM_FEATURE_PMU);
+    }
+
     if (!arm_feature(env, ARM_FEATURE_EL2)) {
         /* Disable the hypervisor feature bits in the processor feature
          * registers if we don't have EL2. These are id_pfr1[15:12] and
diff --git a/target-arm/cpu.h b/target-arm/cpu.h
index 19d967b69e..ca5c849ed6 100644
--- a/target-arm/cpu.h
+++ b/target-arm/cpu.h
@@ -1124,6 +1124,7 @@ enum arm_features {
     ARM_FEATURE_V8_SHA256, /* implements SHA256 part of v8 Crypto Extensions */
     ARM_FEATURE_V8_PMULL, /* implements PMULL part of v8 Crypto Extensions */
     ARM_FEATURE_THUMB_DSP, /* DSP insns supported in the Thumb encodings */
+    ARM_FEATURE_PMU, /* has PMU support */
 };
 
 static inline int arm_feature(CPUARMState *env, int feature)
diff --git a/target-arm/cpu64.c b/target-arm/cpu64.c
index 1635debc1a..549cb1ee93 100644
--- a/target-arm/cpu64.c
+++ b/target-arm/cpu64.c
@@ -111,6 +111,7 @@ static void aarch64_a57_initfn(Object *obj)
     set_feature(&cpu->env, ARM_FEATURE_V8_PMULL);
     set_feature(&cpu->env, ARM_FEATURE_CRC);
     set_feature(&cpu->env, ARM_FEATURE_EL3);
+    set_feature(&cpu->env, ARM_FEATURE_PMU);
     cpu->kvm_target = QEMU_KVM_ARM_TARGET_CORTEX_A57;
     cpu->midr = 0x411fd070;
     cpu->revidr = 0x00000000;
@@ -166,6 +167,7 @@ static void aarch64_a53_initfn(Object *obj)
     set_feature(&cpu->env, ARM_FEATURE_V8_PMULL);
     set_feature(&cpu->env, ARM_FEATURE_CRC);
     set_feature(&cpu->env, ARM_FEATURE_EL3);
+    set_feature(&cpu->env, ARM_FEATURE_PMU);
     cpu->kvm_target = QEMU_KVM_ARM_TARGET_CORTEX_A53;
     cpu->midr = 0x410fd034;
     cpu->revidr = 0x00000000;
diff --git a/target-arm/kvm64.c b/target-arm/kvm64.c
index 5faa76c57e..61111091ad 100644
--- a/target-arm/kvm64.c
+++ b/target-arm/kvm64.c
@@ -428,6 +428,11 @@ static inline void set_feature(uint64_t *features, int feature)
     *features |= 1ULL << feature;
 }
 
+static inline void unset_feature(uint64_t *features, int feature)
+{
+    *features &= ~(1ULL << feature);
+}
+
 bool kvm_arm_get_host_cpu_features(ARMHostCPUClass *ahcc)
 {
     /* Identify the feature bits corresponding to the host CPU, and
@@ -469,6 +474,7 @@ bool kvm_arm_get_host_cpu_features(ARMHostCPUClass *ahcc)
     set_feature(&features, ARM_FEATURE_VFP4);
     set_feature(&features, ARM_FEATURE_NEON);
     set_feature(&features, ARM_FEATURE_AARCH64);
+    set_feature(&features, ARM_FEATURE_PMU);
 
     ahcc->features = features;
 
@@ -482,6 +488,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
     int ret;
     uint64_t mpidr;
     ARMCPU *cpu = ARM_CPU(cs);
+    CPUARMState *env = &cpu->env;
 
     if (cpu->kvm_target == QEMU_KVM_ARM_TARGET_NONE ||
         !object_dynamic_cast(OBJECT(cpu), TYPE_AARCH64_CPU)) {
@@ -501,10 +508,14 @@ int kvm_arch_init_vcpu(CPUState *cs)
     if (!arm_feature(&cpu->env, ARM_FEATURE_AARCH64)) {
         cpu->kvm_init_features[0] |= 1 << KVM_ARM_VCPU_EL1_32BIT;
     }
-    if (kvm_irqchip_in_kernel() &&
-        kvm_check_extension(cs->kvm_state, KVM_CAP_ARM_PMU_V3)) {
-        cpu->has_pmu = true;
+    if (!kvm_irqchip_in_kernel() ||
+        !kvm_check_extension(cs->kvm_state, KVM_CAP_ARM_PMU_V3)) {
+            cpu->has_pmu = false;
+    }
+    if (cpu->has_pmu) {
         cpu->kvm_init_features[0] |= 1 << KVM_ARM_VCPU_PMU_V3;
+    } else {
+        unset_feature(&env->features, ARM_FEATURE_PMU);
     }
 
     /* Do KVM_ARM_VCPU_INIT ioctl */
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index ded924a0a9..de48747376 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -11420,11 +11420,13 @@ done_generating:
 #ifdef DEBUG_DISAS
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM) &&
         qemu_log_in_addr_range(pc_start)) {
+        qemu_log_lock();
         qemu_log("----------------\n");
         qemu_log("IN: %s\n", lookup_symbol(pc_start));
         log_target_disas(cs, pc_start, dc->pc - pc_start,
                          4 | (bswap_code(dc->sctlr_b) ? 2 : 0));
         qemu_log("\n");
+        qemu_log_unlock();
     }
 #endif
     tb->size = dc->pc - pc_start;
diff --git a/target-arm/translate.c b/target-arm/translate.c
index 718f7d06f3..0ad9070b45 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -11963,11 +11963,13 @@ done_generating:
 #ifdef DEBUG_DISAS
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM) &&
         qemu_log_in_addr_range(pc_start)) {
+        qemu_log_lock();
         qemu_log("----------------\n");
         qemu_log("IN: %s\n", lookup_symbol(pc_start));
         log_target_disas(cs, pc_start, dc->pc - pc_start,
                          dc->thumb | (dc->sctlr_b << 1));
         qemu_log("\n");
+        qemu_log_unlock();
     }
 #endif
     tb->size = dc->pc - pc_start;
diff --git a/target-cris/translate.c b/target-cris/translate.c
index b5ab0a5fb2..b91042743f 100644
--- a/target-cris/translate.c
+++ b/target-cris/translate.c
@@ -3135,29 +3135,6 @@ void gen_intermediate_code(CPUCRISState *env, struct TranslationBlock *tb)
 
     dc->cpustate_changed = 0;
 
-    if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)) {
-        qemu_log(
-                "pc=%x %x flg=%" PRIx64 " bt=%x ds=%u ccs=%x\n"
-                "pid=%x usp=%x\n"
-                "%x.%x.%x.%x\n"
-                "%x.%x.%x.%x\n"
-                "%x.%x.%x.%x\n"
-                "%x.%x.%x.%x\n",
-                dc->pc, dc->ppc,
-                (uint64_t)tb->flags,
-                env->btarget, (unsigned)tb->flags & 7,
-                env->pregs[PR_CCS],
-                env->pregs[PR_PID], env->pregs[PR_USP],
-                env->regs[0], env->regs[1], env->regs[2], env->regs[3],
-                env->regs[4], env->regs[5], env->regs[6], env->regs[7],
-                env->regs[8], env->regs[9],
-                env->regs[10], env->regs[11],
-                env->regs[12], env->regs[13],
-                env->regs[14], env->regs[15]);
-        qemu_log("--------------\n");
-        qemu_log("IN: %s\n", lookup_symbol(pc_start));
-    }
-
     next_page_start = (pc_start & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE;
     num_insns = 0;
     max_insns = tb->cflags & CF_COUNT_MASK;
@@ -3313,10 +3290,14 @@ void gen_intermediate_code(CPUCRISState *env, struct TranslationBlock *tb)
 #if !DISAS_CRIS
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)
         && qemu_log_in_addr_range(pc_start)) {
+        qemu_log_lock();
+        qemu_log("--------------\n");
+        qemu_log("IN: %s\n", lookup_symbol(pc_start));
         log_target_disas(cs, pc_start, dc->pc - pc_start,
                          env->pregs[PR_VR]);
         qemu_log("\nisize=%d osize=%d\n",
                  dc->pc - pc_start, tcg_op_buf_count());
+        qemu_log_unlock();
     }
 #endif
 #endif
diff --git a/target-i386/cpu.c b/target-i386/cpu.c
index 83998a85c1..0f8a8fbd3b 100644
--- a/target-i386/cpu.c
+++ b/target-i386/cpu.c
@@ -1973,6 +1973,11 @@ static const char *x86_cpu_feature_name(FeatureWord w, int bitnr)
  */
 static GList *plus_features, *minus_features;
 
+static gint compare_string(gconstpointer a, gconstpointer b)
+{
+    return g_strcmp0(a, b);
+}
+
 /* Parse "+feature,-feature,feature=foo" CPU feature string
  */
 static void x86_cpu_parse_featurestr(const char *typename, char *features,
@@ -1981,6 +1986,7 @@ static void x86_cpu_parse_featurestr(const char *typename, char *features,
     char *featurestr; /* Single 'key=value" string being parsed */
     Error *local_err = NULL;
     static bool cpu_globals_initialized;
+    bool ambiguous = false;
 
     if (cpu_globals_initialized) {
         return;
@@ -2022,6 +2028,19 @@ static void x86_cpu_parse_featurestr(const char *typename, char *features,
         feat2prop(featurestr);
         name = featurestr;
 
+        if (g_list_find_custom(plus_features, name, compare_string)) {
+            error_report("warning: Ambiguous CPU model string. "
+                         "Don't mix both \"+%s\" and \"%s=%s\"",
+                         name, name, val);
+            ambiguous = true;
+        }
+        if (g_list_find_custom(minus_features, name, compare_string)) {
+            error_report("warning: Ambiguous CPU model string. "
+                         "Don't mix both \"-%s\" and \"%s=%s\"",
+                         name, name, val);
+            ambiguous = true;
+        }
+
         /* Special case: */
         if (!strcmp(name, "tsc-freq")) {
             int64_t tsc_freq;
@@ -2046,6 +2065,11 @@ static void x86_cpu_parse_featurestr(const char *typename, char *features,
         qdev_prop_register_global(prop);
     }
 
+    if (ambiguous) {
+        error_report("warning: Compatibility of ambiguous CPU model "
+                     "strings won't be kept on future QEMU versions");
+    }
+
     if (local_err) {
         error_propagate(errp, local_err);
     }
diff --git a/target-i386/helper.c b/target-i386/helper.c
index 9bc961bff3..4ecc0912a4 100644
--- a/target-i386/helper.c
+++ b/target-i386/helper.c
@@ -1121,9 +1121,9 @@ typedef struct MCEInjectionParams {
     int flags;
 } MCEInjectionParams;
 
-static void do_inject_x86_mce(CPUState *cs, void *data)
+static void do_inject_x86_mce(CPUState *cs, run_on_cpu_data data)
 {
-    MCEInjectionParams *params = data;
+    MCEInjectionParams *params = data.host_ptr;
     X86CPU *cpu = X86_CPU(cs);
     CPUX86State *cenv = &cpu->env;
     uint64_t *banks = cenv->mce_banks + 4 * params->bank;
@@ -1230,7 +1230,7 @@ void cpu_x86_inject_mce(Monitor *mon, X86CPU *cpu, int bank,
         return;
     }
 
-    run_on_cpu(cs, do_inject_x86_mce, &params);
+    run_on_cpu(cs, do_inject_x86_mce, RUN_ON_CPU_HOST_PTR(&params));
     if (flags & MCE_INJECT_BROADCAST) {
         CPUState *other_cs;
 
@@ -1243,7 +1243,7 @@ void cpu_x86_inject_mce(Monitor *mon, X86CPU *cpu, int bank,
             if (other_cs == cs) {
                 continue;
             }
-            run_on_cpu(other_cs, do_inject_x86_mce, &params);
+            run_on_cpu(other_cs, do_inject_x86_mce, RUN_ON_CPU_HOST_PTR(&params));
         }
     }
 }
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 86b41a9c21..1c0864ed16 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -183,7 +183,7 @@ static int kvm_get_tsc(CPUState *cs)
     return 0;
 }
 
-static inline void do_kvm_synchronize_tsc(CPUState *cpu, void *arg)
+static inline void do_kvm_synchronize_tsc(CPUState *cpu, run_on_cpu_data arg)
 {
     kvm_get_tsc(cpu);
 }
@@ -194,7 +194,7 @@ void kvm_synchronize_all_tsc(void)
 
     if (kvm_enabled()) {
         CPU_FOREACH(cpu) {
-            run_on_cpu(cpu, do_kvm_synchronize_tsc, NULL);
+            run_on_cpu(cpu, do_kvm_synchronize_tsc, RUN_ON_CPU_NULL);
         }
     }
 }
diff --git a/target-i386/machine.c b/target-i386/machine.c
index 71c0e4dc47..48037f1575 100644
--- a/target-i386/machine.c
+++ b/target-i386/machine.c
@@ -7,10 +7,7 @@
 #include "hw/i386/pc.h"
 #include "hw/isa/isa.h"
 #include "migration/cpu.h"
-#include "exec/exec-all.h"
 
-#include "cpu.h"
-#include "exec/exec-all.h"
 #include "sysemu/kvm.h"
 
 #include "qemu/error-report.h"
diff --git a/target-i386/translate.c b/target-i386/translate.c
index 927b366534..324103c885 100644
--- a/target-i386/translate.c
+++ b/target-i386/translate.c
@@ -2432,11 +2432,13 @@ static void gen_unknown_opcode(CPUX86State *env, DisasContext *s)
 
     if (qemu_loglevel_mask(LOG_UNIMP)) {
         target_ulong pc = s->pc_start, end = s->pc;
+        qemu_log_lock();
         qemu_log("ILLOPC: " TARGET_FMT_lx ":", pc);
         for (; pc < end; ++pc) {
             qemu_log(" %02x", cpu_ldub_code(env, pc));
         }
         qemu_log("\n");
+        qemu_log_unlock();
     }
 }
 
@@ -8470,6 +8472,7 @@ done_generating:
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)
         && qemu_log_in_addr_range(pc_start)) {
         int disas_flags;
+        qemu_log_lock();
         qemu_log("----------------\n");
         qemu_log("IN: %s\n", lookup_symbol(pc_start));
 #ifdef TARGET_X86_64
@@ -8480,6 +8483,7 @@ done_generating:
             disas_flags = !dc->code32;
         log_target_disas(cs, pc_start, pc_ptr - pc_start, disas_flags);
         qemu_log("\n");
+        qemu_log_unlock();
     }
 #endif
 
diff --git a/target-lm32/translate.c b/target-lm32/translate.c
index 2d8caebbfc..692882f447 100644
--- a/target-lm32/translate.c
+++ b/target-lm32/translate.c
@@ -33,12 +33,14 @@
 #include "exec/log.h"
 
 
-#define DISAS_LM32 1
-#if DISAS_LM32
-#  define LOG_DIS(...) qemu_log_mask(CPU_LOG_TB_IN_ASM, ## __VA_ARGS__)
-#else
-#  define LOG_DIS(...) do { } while (0)
-#endif
+#define DISAS_LM32 0
+
+#define LOG_DIS(...) \
+    do { \
+        if (DISAS_LM32) { \
+            qemu_log_mask(CPU_LOG_TB_IN_ASM, ## __VA_ARGS__); \
+        } \
+    } while (0)
 
 #define EXTRACT_FIELD(src, start, end) \
             (((src) >> start) & ((1 << (end - start + 1)) - 1))
@@ -211,7 +213,7 @@ static void dec_and(DisasContext *dc)
 
 static void dec_andhi(DisasContext *dc)
 {
-    LOG_DIS("andhi r%d, r%d, %d\n", dc->r2, dc->r0, dc->imm16);
+    LOG_DIS("andhi r%d, r%d, %d\n", dc->r1, dc->r0, dc->imm16);
 
     tcg_gen_andi_tl(cpu_R[dc->r1], cpu_R[dc->r0], (dc->imm16 << 16));
 }
@@ -274,7 +276,7 @@ static inline void gen_cond_branch(DisasContext *dc, int cond)
 
 static void dec_be(DisasContext *dc)
 {
-    LOG_DIS("be r%d, r%d, %d\n", dc->r0, dc->r1,
+    LOG_DIS("be r%d, r%d, %d\n", dc->r1, dc->r0,
             sign_extend(dc->imm16, 16) * 4);
 
     gen_cond_branch(dc, TCG_COND_EQ);
@@ -282,7 +284,7 @@ static void dec_be(DisasContext *dc)
 
 static void dec_bg(DisasContext *dc)
 {
-    LOG_DIS("bg r%d, r%d, %d\n", dc->r0, dc->r1,
+    LOG_DIS("bg r%d, r%d, %d\n", dc->r1, dc->r0,
             sign_extend(dc->imm16, 16 * 4));
 
     gen_cond_branch(dc, TCG_COND_GT);
@@ -290,7 +292,7 @@ static void dec_bg(DisasContext *dc)
 
 static void dec_bge(DisasContext *dc)
 {
-    LOG_DIS("bge r%d, r%d, %d\n", dc->r0, dc->r1,
+    LOG_DIS("bge r%d, r%d, %d\n", dc->r1, dc->r0,
             sign_extend(dc->imm16, 16) * 4);
 
     gen_cond_branch(dc, TCG_COND_GE);
@@ -298,7 +300,7 @@ static void dec_bge(DisasContext *dc)
 
 static void dec_bgeu(DisasContext *dc)
 {
-    LOG_DIS("bgeu r%d, r%d, %d\n", dc->r0, dc->r1,
+    LOG_DIS("bgeu r%d, r%d, %d\n", dc->r1, dc->r0,
             sign_extend(dc->imm16, 16) * 4);
 
     gen_cond_branch(dc, TCG_COND_GEU);
@@ -306,7 +308,7 @@ static void dec_bgeu(DisasContext *dc)
 
 static void dec_bgu(DisasContext *dc)
 {
-    LOG_DIS("bgu r%d, r%d, %d\n", dc->r0, dc->r1,
+    LOG_DIS("bgu r%d, r%d, %d\n", dc->r1, dc->r0,
             sign_extend(dc->imm16, 16) * 4);
 
     gen_cond_branch(dc, TCG_COND_GTU);
@@ -314,7 +316,7 @@ static void dec_bgu(DisasContext *dc)
 
 static void dec_bne(DisasContext *dc)
 {
-    LOG_DIS("bne r%d, r%d, %d\n", dc->r0, dc->r1,
+    LOG_DIS("bne r%d, r%d, %d\n", dc->r1, dc->r0,
             sign_extend(dc->imm16, 16) * 4);
 
     gen_cond_branch(dc, TCG_COND_NE);
@@ -342,9 +344,6 @@ static void dec_calli(DisasContext *dc)
 
 static inline void gen_compare(DisasContext *dc, int cond)
 {
-    int rX = (dc->format == OP_FMT_RR) ? dc->r2 : dc->r1;
-    int rY = (dc->format == OP_FMT_RR) ? dc->r0 : dc->r0;
-    int rZ = (dc->format == OP_FMT_RR) ? dc->r1 : -1;
     int i;
 
     if (dc->format == OP_FMT_RI) {
@@ -358,16 +357,16 @@ static inline void gen_compare(DisasContext *dc, int cond)
             break;
         }
 
-        tcg_gen_setcondi_tl(cond, cpu_R[rX], cpu_R[rY], i);
+        tcg_gen_setcondi_tl(cond, cpu_R[dc->r1], cpu_R[dc->r0], i);
     } else {
-        tcg_gen_setcond_tl(cond, cpu_R[rX], cpu_R[rY], cpu_R[rZ]);
+        tcg_gen_setcond_tl(cond, cpu_R[dc->r2], cpu_R[dc->r0], cpu_R[dc->r1]);
     }
 }
 
 static void dec_cmpe(DisasContext *dc)
 {
     if (dc->format == OP_FMT_RI) {
-        LOG_DIS("cmpei r%d, r%d, %d\n", dc->r0, dc->r1,
+        LOG_DIS("cmpei r%d, r%d, %d\n", dc->r1, dc->r0,
                 sign_extend(dc->imm16, 16));
     } else {
         LOG_DIS("cmpe r%d, r%d, r%d\n", dc->r2, dc->r0, dc->r1);
@@ -379,7 +378,7 @@ static void dec_cmpe(DisasContext *dc)
 static void dec_cmpg(DisasContext *dc)
 {
     if (dc->format == OP_FMT_RI) {
-        LOG_DIS("cmpgi r%d, r%d, %d\n", dc->r0, dc->r1,
+        LOG_DIS("cmpgi r%d, r%d, %d\n", dc->r1, dc->r0,
                 sign_extend(dc->imm16, 16));
     } else {
         LOG_DIS("cmpg r%d, r%d, r%d\n", dc->r2, dc->r0, dc->r1);
@@ -391,7 +390,7 @@ static void dec_cmpg(DisasContext *dc)
 static void dec_cmpge(DisasContext *dc)
 {
     if (dc->format == OP_FMT_RI) {
-        LOG_DIS("cmpgei r%d, r%d, %d\n", dc->r0, dc->r1,
+        LOG_DIS("cmpgei r%d, r%d, %d\n", dc->r1, dc->r0,
                 sign_extend(dc->imm16, 16));
     } else {
         LOG_DIS("cmpge r%d, r%d, r%d\n", dc->r2, dc->r0, dc->r1);
@@ -403,7 +402,7 @@ static void dec_cmpge(DisasContext *dc)
 static void dec_cmpgeu(DisasContext *dc)
 {
     if (dc->format == OP_FMT_RI) {
-        LOG_DIS("cmpgeui r%d, r%d, %d\n", dc->r0, dc->r1,
+        LOG_DIS("cmpgeui r%d, r%d, %d\n", dc->r1, dc->r0,
                 zero_extend(dc->imm16, 16));
     } else {
         LOG_DIS("cmpgeu r%d, r%d, r%d\n", dc->r2, dc->r0, dc->r1);
@@ -415,7 +414,7 @@ static void dec_cmpgeu(DisasContext *dc)
 static void dec_cmpgu(DisasContext *dc)
 {
     if (dc->format == OP_FMT_RI) {
-        LOG_DIS("cmpgui r%d, r%d, %d\n", dc->r0, dc->r1,
+        LOG_DIS("cmpgui r%d, r%d, %d\n", dc->r1, dc->r0,
                 zero_extend(dc->imm16, 16));
     } else {
         LOG_DIS("cmpgu r%d, r%d, r%d\n", dc->r2, dc->r0, dc->r1);
@@ -427,7 +426,7 @@ static void dec_cmpgu(DisasContext *dc)
 static void dec_cmpne(DisasContext *dc)
 {
     if (dc->format == OP_FMT_RI) {
-        LOG_DIS("cmpnei r%d, r%d, %d\n", dc->r0, dc->r1,
+        LOG_DIS("cmpnei r%d, r%d, %d\n", dc->r1, dc->r0,
                 sign_extend(dc->imm16, 16));
     } else {
         LOG_DIS("cmpne r%d, r%d, r%d\n", dc->r2, dc->r0, dc->r1);
@@ -539,7 +538,7 @@ static void dec_modu(DisasContext *dc)
 static void dec_mul(DisasContext *dc)
 {
     if (dc->format == OP_FMT_RI) {
-        LOG_DIS("muli r%d, r%d, %d\n", dc->r0, dc->r1,
+        LOG_DIS("muli r%d, r%d, %d\n", dc->r1, dc->r0,
                 sign_extend(dc->imm16, 16));
     } else {
         LOG_DIS("mul r%d, r%d, r%d\n", dc->r2, dc->r0, dc->r1);
@@ -563,7 +562,7 @@ static void dec_mul(DisasContext *dc)
 static void dec_nor(DisasContext *dc)
 {
     if (dc->format == OP_FMT_RI) {
-        LOG_DIS("nori r%d, r%d, %d\n", dc->r0, dc->r1,
+        LOG_DIS("nori r%d, r%d, %d\n", dc->r1, dc->r0,
                 zero_extend(dc->imm16, 16));
     } else {
         LOG_DIS("nor r%d, r%d, r%d\n", dc->r2, dc->r0, dc->r1);
@@ -865,7 +864,7 @@ static void dec_wcsr(DisasContext *dc)
 {
     int no;
 
-    LOG_DIS("wcsr r%d, %d\n", dc->r1, dc->csr);
+    LOG_DIS("wcsr %d, r%d\n", dc->csr, dc->r1);
 
     switch (dc->csr) {
     case CSR_IE:
@@ -959,7 +958,7 @@ static void dec_wcsr(DisasContext *dc)
 static void dec_xnor(DisasContext *dc)
 {
     if (dc->format == OP_FMT_RI) {
-        LOG_DIS("xnori r%d, r%d, %d\n", dc->r0, dc->r1,
+        LOG_DIS("xnori r%d, r%d, %d\n", dc->r1, dc->r0,
                 zero_extend(dc->imm16, 16));
     } else {
         if (dc->r1 == R_R0) {
@@ -981,7 +980,7 @@ static void dec_xnor(DisasContext *dc)
 static void dec_xor(DisasContext *dc)
 {
     if (dc->format == OP_FMT_RI) {
-        LOG_DIS("xori r%d, r%d, %d\n", dc->r0, dc->r1,
+        LOG_DIS("xori r%d, r%d, %d\n", dc->r1, dc->r0,
                 zero_extend(dc->imm16, 16));
     } else {
         LOG_DIS("xor r%d, r%d, r%d\n", dc->r2, dc->r0, dc->r1);
@@ -1149,10 +1148,12 @@ void gen_intermediate_code(CPULM32State *env, struct TranslationBlock *tb)
 #ifdef DEBUG_DISAS
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)
         && qemu_log_in_addr_range(pc_start)) {
+        qemu_log_lock();
         qemu_log("\n");
         log_target_disas(cs, pc_start, dc->pc - pc_start, 0);
         qemu_log("\nisize=%d osize=%d\n",
                  dc->pc - pc_start, tcg_op_buf_count());
+        qemu_log_unlock();
     }
 #endif
 }
diff --git a/target-m68k/translate.c b/target-m68k/translate.c
index ee0ffe3e07..9ad974f86a 100644
--- a/target-m68k/translate.c
+++ b/target-m68k/translate.c
@@ -3549,10 +3549,12 @@ void gen_intermediate_code(CPUM68KState *env, TranslationBlock *tb)
 #ifdef DEBUG_DISAS
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)
         && qemu_log_in_addr_range(pc_start)) {
+        qemu_log_lock();
         qemu_log("----------------\n");
         qemu_log("IN: %s\n", lookup_symbol(pc_start));
         log_target_disas(cs, pc_start, dc->pc - pc_start, 0);
         qemu_log("\n");
+        qemu_log_unlock();
     }
 #endif
     tb->size = dc->pc - pc_start;
diff --git a/target-microblaze/translate.c b/target-microblaze/translate.c
index 80098ece15..de2090ac71 100644
--- a/target-microblaze/translate.c
+++ b/target-microblaze/translate.c
@@ -581,50 +581,10 @@ static void dec_msr(DisasContext *dc)
     }
 }
 
-/* 64-bit signed mul, lower result in d and upper in d2.  */
-static void t_gen_muls(TCGv d, TCGv d2, TCGv a, TCGv b)
-{
-    TCGv_i64 t0, t1;
-
-    t0 = tcg_temp_new_i64();
-    t1 = tcg_temp_new_i64();
-
-    tcg_gen_ext_i32_i64(t0, a);
-    tcg_gen_ext_i32_i64(t1, b);
-    tcg_gen_mul_i64(t0, t0, t1);
-
-    tcg_gen_extrl_i64_i32(d, t0);
-    tcg_gen_shri_i64(t0, t0, 32);
-    tcg_gen_extrl_i64_i32(d2, t0);
-
-    tcg_temp_free_i64(t0);
-    tcg_temp_free_i64(t1);
-}
-
-/* 64-bit unsigned muls, lower result in d and upper in d2.  */
-static void t_gen_mulu(TCGv d, TCGv d2, TCGv a, TCGv b)
-{
-    TCGv_i64 t0, t1;
-
-    t0 = tcg_temp_new_i64();
-    t1 = tcg_temp_new_i64();
-
-    tcg_gen_extu_i32_i64(t0, a);
-    tcg_gen_extu_i32_i64(t1, b);
-    tcg_gen_mul_i64(t0, t0, t1);
-
-    tcg_gen_extrl_i64_i32(d, t0);
-    tcg_gen_shri_i64(t0, t0, 32);
-    tcg_gen_extrl_i64_i32(d2, t0);
-
-    tcg_temp_free_i64(t0);
-    tcg_temp_free_i64(t1);
-}
-
 /* Multiplier unit.  */
 static void dec_mul(DisasContext *dc)
 {
-    TCGv d[2];
+    TCGv tmp;
     unsigned int subcode;
 
     if ((dc->tb_flags & MSR_EE_FLAG)
@@ -636,13 +596,11 @@ static void dec_mul(DisasContext *dc)
     }
 
     subcode = dc->imm & 3;
-    d[0] = tcg_temp_new();
-    d[1] = tcg_temp_new();
 
     if (dc->type_b) {
         LOG_DIS("muli r%d r%d %x\n", dc->rd, dc->ra, dc->imm);
-        t_gen_mulu(cpu_R[dc->rd], d[1], cpu_R[dc->ra], *(dec_alu_op_b(dc)));
-        goto done;
+        tcg_gen_mul_tl(cpu_R[dc->rd], cpu_R[dc->ra], *(dec_alu_op_b(dc)));
+        return;
     }
 
     /* mulh, mulhsu and mulhu are not available if C_USE_HW_MUL is < 2.  */
@@ -651,30 +609,29 @@ static void dec_mul(DisasContext *dc)
         /* nop??? */
     }
 
+    tmp = tcg_temp_new();
     switch (subcode) {
         case 0:
             LOG_DIS("mul r%d r%d r%d\n", dc->rd, dc->ra, dc->rb);
-            t_gen_mulu(cpu_R[dc->rd], d[1], cpu_R[dc->ra], cpu_R[dc->rb]);
+            tcg_gen_mul_tl(cpu_R[dc->rd], cpu_R[dc->ra], cpu_R[dc->rb]);
             break;
         case 1:
             LOG_DIS("mulh r%d r%d r%d\n", dc->rd, dc->ra, dc->rb);
-            t_gen_muls(d[0], cpu_R[dc->rd], cpu_R[dc->ra], cpu_R[dc->rb]);
+            tcg_gen_muls2_tl(tmp, cpu_R[dc->rd], cpu_R[dc->ra], cpu_R[dc->rb]);
             break;
         case 2:
             LOG_DIS("mulhsu r%d r%d r%d\n", dc->rd, dc->ra, dc->rb);
-            t_gen_muls(d[0], cpu_R[dc->rd], cpu_R[dc->ra], cpu_R[dc->rb]);
+            tcg_gen_mulsu2_tl(tmp, cpu_R[dc->rd], cpu_R[dc->ra], cpu_R[dc->rb]);
             break;
         case 3:
             LOG_DIS("mulhu r%d r%d r%d\n", dc->rd, dc->ra, dc->rb);
-            t_gen_mulu(d[0], cpu_R[dc->rd], cpu_R[dc->ra], cpu_R[dc->rb]);
+            tcg_gen_mulu2_tl(tmp, cpu_R[dc->rd], cpu_R[dc->ra], cpu_R[dc->rb]);
             break;
         default:
             cpu_abort(CPU(dc->cpu), "unknown MUL insn %x\n", subcode);
             break;
     }
-done:
-    tcg_temp_free(d[0]);
-    tcg_temp_free(d[1]);
+    tcg_temp_free(tmp);
 }
 
 /* Div unit.  */
@@ -1670,13 +1627,6 @@ void gen_intermediate_code(CPUMBState *env, struct TranslationBlock *tb)
         cpu_abort(cs, "Microblaze: unaligned PC=%x\n", pc_start);
     }
 
-    if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)) {
-#if !SIM_COMPAT
-        qemu_log("--------------\n");
-        log_cpu_state(CPU(cpu), 0);
-#endif
-    }
-
     next_page_start = (pc_start & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE;
     num_insns = 0;
     max_insns = tb->cflags & CF_COUNT_MASK;
@@ -1820,12 +1770,14 @@ void gen_intermediate_code(CPUMBState *env, struct TranslationBlock *tb)
 #if !SIM_COMPAT
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)
         && qemu_log_in_addr_range(pc_start)) {
-        qemu_log("\n");
+        qemu_log_lock();
+        qemu_log("--------------\n");
 #if DISAS_GNU
         log_target_disas(cs, pc_start, dc->pc - pc_start, 0);
 #endif
         qemu_log("\nisize=%d osize=%d\n",
                  dc->pc - pc_start, tcg_op_buf_count());
+        qemu_log_unlock();
     }
 #endif
 #endif
diff --git a/target-mips/machine.c b/target-mips/machine.c
index a27f2f156d..d20d948457 100644
--- a/target-mips/machine.c
+++ b/target-mips/machine.c
@@ -2,7 +2,6 @@
 #include "qemu-common.h"
 #include "cpu.h"
 #include "hw/hw.h"
-#include "cpu.h"
 #include "migration/cpu.h"
 
 static int cpu_post_load(void *opaque, int version_id)
diff --git a/target-mips/translate.c b/target-mips/translate.c
index 55c2ca0c7b..d8dde7a2f5 100644
--- a/target-mips/translate.c
+++ b/target-mips/translate.c
@@ -20043,9 +20043,11 @@ done_generating:
     LOG_DISAS("\n");
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)
         && qemu_log_in_addr_range(pc_start)) {
+        qemu_log_lock();
         qemu_log("IN: %s\n", lookup_symbol(pc_start));
         log_target_disas(cs, pc_start, ctx.pc - pc_start, 0);
         qemu_log("\n");
+        qemu_log_unlock();
     }
 #endif
 }
diff --git a/target-openrisc/translate.c b/target-openrisc/translate.c
index 28c944657c..229361aed1 100644
--- a/target-openrisc/translate.c
+++ b/target-openrisc/translate.c
@@ -1651,10 +1651,6 @@ void gen_intermediate_code(CPUOpenRISCState *env, struct TranslationBlock *tb)
     dc->synced_flags = dc->tb_flags = tb->flags;
     dc->delayed_branch = !!(dc->tb_flags & D_FLAG);
     dc->singlestep_enabled = cs->singlestep_enabled;
-    if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)) {
-        qemu_log("-----------------------------------------\n");
-        log_cpu_state(CPU(cpu), 0);
-    }
 
     next_page_start = (pc_start & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE;
     num_insns = 0;
@@ -1754,10 +1750,13 @@ void gen_intermediate_code(CPUOpenRISCState *env, struct TranslationBlock *tb)
 #ifdef DEBUG_DISAS
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)
         && qemu_log_in_addr_range(pc_start)) {
-        qemu_log("\n");
+        qemu_log_lock();
+        qemu_log("----------------\n");
+        qemu_log("IN: %s\n", lookup_symbol(pc_start));
         log_target_disas(cs, pc_start, dc->pc - pc_start, 0);
         qemu_log("\nisize=%d osize=%d\n",
                  dc->pc - pc_start, tcg_op_buf_count());
+        qemu_log_unlock();
     }
 #endif
 }
diff --git a/target-ppc/machine.c b/target-ppc/machine.c
index 4820f22377..e43cb6c39d 100644
--- a/target-ppc/machine.c
+++ b/target-ppc/machine.c
@@ -8,7 +8,6 @@
 #include "helper_regs.h"
 #include "mmu-hash64.h"
 #include "migration/cpu.h"
-#include "exec/exec-all.h"
 
 static int cpu_load_old(QEMUFile *f, void *opaque, int version_id)
 {
diff --git a/target-ppc/mem_helper.c b/target-ppc/mem_helper.c
index 6548715831..1ab8a6eab4 100644
--- a/target-ppc/mem_helper.c
+++ b/target-ppc/mem_helper.c
@@ -23,7 +23,6 @@
 #include "exec/helper-proto.h"
 
 #include "helper_regs.h"
-#include "exec/exec-all.h"
 #include "exec/cpu_ldst.h"
 
 //#define DEBUG_OP
diff --git a/target-ppc/translate.c b/target-ppc/translate.c
index 43505a936c..54f35e9904 100644
--- a/target-ppc/translate.c
+++ b/target-ppc/translate.c
@@ -7211,9 +7211,11 @@ void gen_intermediate_code(CPUPPCState *env, struct TranslationBlock *tb)
         int flags;
         flags = env->bfd_mach;
         flags |= ctx.le_mode << 16;
+        qemu_log_lock();
         qemu_log("IN: %s\n", lookup_symbol(pc_start));
         log_target_disas(cs, pc_start, ctx.nip - pc_start, flags);
         qemu_log("\n");
+        qemu_log_unlock();
     }
 #endif
 }
diff --git a/target-s390x/cpu.c b/target-s390x/cpu.c
index 9e2f239cf1..0a39d31237 100644
--- a/target-s390x/cpu.c
+++ b/target-s390x/cpu.c
@@ -164,7 +164,7 @@ static void s390_cpu_machine_reset_cb(void *opaque)
 {
     S390CPU *cpu = opaque;
 
-    run_on_cpu(CPU(cpu), s390_do_cpu_full_reset, NULL);
+    run_on_cpu(CPU(cpu), s390_do_cpu_full_reset, RUN_ON_CPU_NULL);
 }
 #endif
 
@@ -220,7 +220,7 @@ static void s390_cpu_realizefn(DeviceState *dev, Error **errp)
     s390_cpu_gdb_init(cs);
     qemu_init_vcpu(cs);
 #if !defined(CONFIG_USER_ONLY)
-    run_on_cpu(cs, s390_do_cpu_full_reset, NULL);
+    run_on_cpu(cs, s390_do_cpu_full_reset, RUN_ON_CPU_NULL);
 #else
     cpu_reset(cs);
 #endif
diff --git a/target-s390x/cpu.h b/target-s390x/cpu.h
index 4e58cdee3e..fd36a25cf5 100644
--- a/target-s390x/cpu.h
+++ b/target-s390x/cpu.h
@@ -502,13 +502,13 @@ static inline hwaddr decode_basedisp_s(CPUS390XState *env, uint32_t ipb,
 #define decode_basedisp_rs decode_basedisp_s
 
 /* helper functions for run_on_cpu() */
-static inline void s390_do_cpu_reset(CPUState *cs, void *arg)
+static inline void s390_do_cpu_reset(CPUState *cs, run_on_cpu_data arg)
 {
     S390CPUClass *scc = S390_CPU_GET_CLASS(cs);
 
     scc->cpu_reset(cs);
 }
-static inline void s390_do_cpu_full_reset(CPUState *cs, void *arg)
+static inline void s390_do_cpu_full_reset(CPUState *cs, run_on_cpu_data arg)
 {
     cpu_reset(cs);
 }
diff --git a/target-s390x/kvm.c b/target-s390x/kvm.c
index 7f745726bd..36b4847717 100644
--- a/target-s390x/kvm.c
+++ b/target-s390x/kvm.c
@@ -1607,7 +1607,7 @@ int kvm_s390_cpu_restart(S390CPU *cpu)
 {
     SigpInfo si = {};
 
-    run_on_cpu(CPU(cpu), sigp_restart, &si);
+    run_on_cpu(CPU(cpu), sigp_restart, RUN_ON_CPU_HOST_PTR(&si));
     DPRINTF("DONE: KVM cpu restart: %p\n", &cpu->env);
     return 0;
 }
@@ -1683,31 +1683,31 @@ static int handle_sigp_single_dst(S390CPU *dst_cpu, uint8_t order,
 
     switch (order) {
     case SIGP_START:
-        run_on_cpu(CPU(dst_cpu), sigp_start, &si);
+        run_on_cpu(CPU(dst_cpu), sigp_start, RUN_ON_CPU_HOST_PTR(&si));
         break;
     case SIGP_STOP:
-        run_on_cpu(CPU(dst_cpu), sigp_stop, &si);
+        run_on_cpu(CPU(dst_cpu), sigp_stop, RUN_ON_CPU_HOST_PTR(&si));
         break;
     case SIGP_RESTART:
-        run_on_cpu(CPU(dst_cpu), sigp_restart, &si);
+        run_on_cpu(CPU(dst_cpu), sigp_restart, RUN_ON_CPU_HOST_PTR(&si));
         break;
     case SIGP_STOP_STORE_STATUS:
-        run_on_cpu(CPU(dst_cpu), sigp_stop_and_store_status, &si);
+        run_on_cpu(CPU(dst_cpu), sigp_stop_and_store_status, RUN_ON_CPU_HOST_PTR(&si));
         break;
     case SIGP_STORE_STATUS_ADDR:
-        run_on_cpu(CPU(dst_cpu), sigp_store_status_at_address, &si);
+        run_on_cpu(CPU(dst_cpu), sigp_store_status_at_address, RUN_ON_CPU_HOST_PTR(&si));
         break;
     case SIGP_STORE_ADTL_STATUS:
-        run_on_cpu(CPU(dst_cpu), sigp_store_adtl_status, &si);
+        run_on_cpu(CPU(dst_cpu), sigp_store_adtl_status, RUN_ON_CPU_HOST_PTR(&si));
         break;
     case SIGP_SET_PREFIX:
-        run_on_cpu(CPU(dst_cpu), sigp_set_prefix, &si);
+        run_on_cpu(CPU(dst_cpu), sigp_set_prefix, RUN_ON_CPU_HOST_PTR(&si));
         break;
     case SIGP_INITIAL_CPU_RESET:
-        run_on_cpu(CPU(dst_cpu), sigp_initial_cpu_reset, &si);
+        run_on_cpu(CPU(dst_cpu), sigp_initial_cpu_reset, RUN_ON_CPU_HOST_PTR(&si));
         break;
     case SIGP_CPU_RESET:
-        run_on_cpu(CPU(dst_cpu), sigp_cpu_reset, &si);
+        run_on_cpu(CPU(dst_cpu), sigp_cpu_reset, RUN_ON_CPU_HOST_PTR(&si));
         break;
     default:
         DPRINTF("KVM: unknown SIGP: 0x%x\n", order);
diff --git a/target-s390x/misc_helper.c b/target-s390x/misc_helper.c
index 4df2ec6c7d..c9604ea9c7 100644
--- a/target-s390x/misc_helper.c
+++ b/target-s390x/misc_helper.c
@@ -126,7 +126,7 @@ static int modified_clear_reset(S390CPU *cpu)
     pause_all_vcpus();
     cpu_synchronize_all_states();
     CPU_FOREACH(t) {
-        run_on_cpu(t, s390_do_cpu_full_reset, NULL);
+        run_on_cpu(t, s390_do_cpu_full_reset, RUN_ON_CPU_NULL);
     }
     s390_cmma_reset();
     subsystem_reset();
@@ -145,7 +145,7 @@ static int load_normal_reset(S390CPU *cpu)
     pause_all_vcpus();
     cpu_synchronize_all_states();
     CPU_FOREACH(t) {
-        run_on_cpu(t, s390_do_cpu_reset, NULL);
+        run_on_cpu(t, s390_do_cpu_reset, RUN_ON_CPU_NULL);
     }
     s390_cmma_reset();
     subsystem_reset();
diff --git a/target-s390x/translate.c b/target-s390x/translate.c
index 1a07d70b21..02bc7058fd 100644
--- a/target-s390x/translate.c
+++ b/target-s390x/translate.c
@@ -5432,9 +5432,11 @@ void gen_intermediate_code(CPUS390XState *env, struct TranslationBlock *tb)
 #if defined(S390X_DEBUG_DISAS)
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)
         && qemu_log_in_addr_range(pc_start)) {
+        qemu_log_lock();
         qemu_log("IN: %s\n", lookup_symbol(pc_start));
         log_target_disas(cs, pc_start, dc.pc - pc_start, 1);
         qemu_log("\n");
+        qemu_log_unlock();
     }
 #endif
 }
diff --git a/target-sh4/translate.c b/target-sh4/translate.c
index ca80cf70ca..c89a14733f 100644
--- a/target-sh4/translate.c
+++ b/target-sh4/translate.c
@@ -1927,9 +1927,11 @@ void gen_intermediate_code(CPUSH4State * env, struct TranslationBlock *tb)
 #ifdef DEBUG_DISAS
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)
         && qemu_log_in_addr_range(pc_start)) {
+        qemu_log_lock();
 	qemu_log("IN:\n");	/* , lookup_symbol(pc_start)); */
         log_target_disas(cs, pc_start, ctx.pc - pc_start, 0);
 	qemu_log("\n");
+        qemu_log_unlock();
     }
 #endif
 }
diff --git a/target-sparc/cpu.h b/target-sparc/cpu.h
index 646a103513..5fb0ed1aad 100644
--- a/target-sparc/cpu.h
+++ b/target-sparc/cpu.h
@@ -225,9 +225,9 @@ enum {
 #define MAX_NWINDOWS 32
 
 #if !defined(TARGET_SPARC64)
-#define NB_MMU_MODES 2
+#define NB_MMU_MODES 3
 #else
-#define NB_MMU_MODES 6
+#define NB_MMU_MODES 7
 typedef struct trap_state {
     uint64_t tpc;
     uint64_t tnpc;
@@ -549,6 +549,7 @@ void QEMU_NORETURN sparc_cpu_do_unaligned_access(CPUState *cpu, vaddr addr,
                                                  MMUAccessType access_type,
                                                  int mmu_idx,
                                                  uintptr_t retaddr);
+void cpu_raise_exception_ra(CPUSPARCState *, int, uintptr_t) QEMU_NORETURN;
 
 #ifndef NO_CPU_IO_DEFS
 /* cpu_init.c */
@@ -637,22 +638,16 @@ int cpu_sparc_signal_handler(int host_signum, void *pinfo, void *puc);
 /* MMU modes definitions */
 #if defined (TARGET_SPARC64)
 #define MMU_USER_IDX   0
-#define MMU_MODE0_SUFFIX _user
 #define MMU_USER_SECONDARY_IDX   1
-#define MMU_MODE1_SUFFIX _user_secondary
 #define MMU_KERNEL_IDX 2
-#define MMU_MODE2_SUFFIX _kernel
 #define MMU_KERNEL_SECONDARY_IDX 3
-#define MMU_MODE3_SUFFIX _kernel_secondary
 #define MMU_NUCLEUS_IDX 4
-#define MMU_MODE4_SUFFIX _nucleus
 #define MMU_HYPV_IDX   5
-#define MMU_MODE5_SUFFIX _hypv
+#define MMU_PHYS_IDX   6
 #else
 #define MMU_USER_IDX   0
-#define MMU_MODE0_SUFFIX _user
 #define MMU_KERNEL_IDX 1
-#define MMU_MODE1_SUFFIX _kernel
+#define MMU_PHYS_IDX   2
 #endif
 
 #if defined (TARGET_SPARC64)
@@ -672,18 +667,27 @@ static inline int cpu_supervisor_mode(CPUSPARCState *env1)
 }
 #endif
 
-static inline int cpu_mmu_index(CPUSPARCState *env1, bool ifetch)
+static inline int cpu_mmu_index(CPUSPARCState *env, bool ifetch)
 {
 #if defined(CONFIG_USER_ONLY)
     return MMU_USER_IDX;
 #elif !defined(TARGET_SPARC64)
-    return env1->psrs;
+    if ((env->mmuregs[0] & MMU_E) == 0) { /* MMU disabled */
+        return MMU_PHYS_IDX;
+    } else {
+        return env->psrs;
+    }
 #else
-    if (env1->tl > 0) {
+    /* IMMU or DMMU disabled.  */
+    if (ifetch
+        ? (env->lsu & IMMU_E) == 0 || (env->pstate & PS_RED) != 0
+        : (env->lsu & DMMU_E) == 0) {
+        return MMU_PHYS_IDX;
+    } else if (env->tl > 0) {
         return MMU_NUCLEUS_IDX;
-    } else if (cpu_hypervisor_mode(env1)) {
+    } else if (cpu_hypervisor_mode(env)) {
         return MMU_HYPV_IDX;
-    } else if (cpu_supervisor_mode(env1)) {
+    } else if (cpu_supervisor_mode(env)) {
         return MMU_KERNEL_IDX;
     } else {
         return MMU_USER_IDX;
diff --git a/target-sparc/helper.c b/target-sparc/helper.c
index bedc6722a1..359b0b15ed 100644
--- a/target-sparc/helper.c
+++ b/target-sparc/helper.c
@@ -24,6 +24,14 @@
 #include "exec/helper-proto.h"
 #include "sysemu/sysemu.h"
 
+void cpu_raise_exception_ra(CPUSPARCState *env, int tt, uintptr_t ra)
+{
+    CPUState *cs = CPU(sparc_env_get_cpu(env));
+
+    cs->exception_index = tt;
+    cpu_loop_exit_restore(cs, ra);
+}
+
 void helper_raise_exception(CPUSPARCState *env, int tt)
 {
     CPUState *cs = CPU(sparc_env_get_cpu(env));
@@ -59,7 +67,7 @@ uint64_t helper_tick_get_count(CPUSPARCState *env, void *opaque, int mem_idx)
     CPUTimer *timer = opaque;
 
     if (timer->npt && mem_idx < MMU_KERNEL_IDX) {
-        helper_raise_exception(env, TT_PRIV_INSN);
+        cpu_raise_exception_ra(env, TT_PRIV_INSN, GETPC());
     }
 
     return cpu_tick_get_count(timer);
@@ -76,10 +84,9 @@ void helper_tick_set_limit(void *opaque, uint64_t limit)
 }
 #endif
 
-static target_ulong helper_udiv_common(CPUSPARCState *env, target_ulong a,
-                                       target_ulong b, int cc)
+static target_ulong do_udiv(CPUSPARCState *env, target_ulong a,
+                            target_ulong b, int cc, uintptr_t ra)
 {
-    SPARCCPU *cpu = sparc_env_get_cpu(env);
     int overflow = 0;
     uint64_t x0;
     uint32_t x1;
@@ -88,8 +95,7 @@ static target_ulong helper_udiv_common(CPUSPARCState *env, target_ulong a,
     x1 = (b & 0xffffffff);
 
     if (x1 == 0) {
-        cpu_restore_state(CPU(cpu), GETPC());
-        helper_raise_exception(env, TT_DIV_ZERO);
+        cpu_raise_exception_ra(env, TT_DIV_ZERO, ra);
     }
 
     x0 = x0 / x1;
@@ -108,18 +114,17 @@ static target_ulong helper_udiv_common(CPUSPARCState *env, target_ulong a,
 
 target_ulong helper_udiv(CPUSPARCState *env, target_ulong a, target_ulong b)
 {
-    return helper_udiv_common(env, a, b, 0);
+    return do_udiv(env, a, b, 0, GETPC());
 }
 
 target_ulong helper_udiv_cc(CPUSPARCState *env, target_ulong a, target_ulong b)
 {
-    return helper_udiv_common(env, a, b, 1);
+    return do_udiv(env, a, b, 1, GETPC());
 }
 
-static target_ulong helper_sdiv_common(CPUSPARCState *env, target_ulong a,
-                                       target_ulong b, int cc)
+static target_ulong do_sdiv(CPUSPARCState *env, target_ulong a,
+                            target_ulong b, int cc, uintptr_t ra)
 {
-    SPARCCPU *cpu = sparc_env_get_cpu(env);
     int overflow = 0;
     int64_t x0;
     int32_t x1;
@@ -128,8 +133,7 @@ static target_ulong helper_sdiv_common(CPUSPARCState *env, target_ulong a,
     x1 = (b & 0xffffffff);
 
     if (x1 == 0) {
-        cpu_restore_state(CPU(cpu), GETPC());
-        helper_raise_exception(env, TT_DIV_ZERO);
+        cpu_raise_exception_ra(env, TT_DIV_ZERO, ra);
     } else if (x1 == -1 && x0 == INT64_MIN) {
         x0 = INT32_MAX;
         overflow = 1;
@@ -151,12 +155,12 @@ static target_ulong helper_sdiv_common(CPUSPARCState *env, target_ulong a,
 
 target_ulong helper_sdiv(CPUSPARCState *env, target_ulong a, target_ulong b)
 {
-    return helper_sdiv_common(env, a, b, 0);
+    return do_sdiv(env, a, b, 0, GETPC());
 }
 
 target_ulong helper_sdiv_cc(CPUSPARCState *env, target_ulong a, target_ulong b)
 {
-    return helper_sdiv_common(env, a, b, 1);
+    return do_sdiv(env, a, b, 1, GETPC());
 }
 
 #ifdef TARGET_SPARC64
@@ -164,10 +168,7 @@ int64_t helper_sdivx(CPUSPARCState *env, int64_t a, int64_t b)
 {
     if (b == 0) {
         /* Raise divide by zero trap.  */
-        SPARCCPU *cpu = sparc_env_get_cpu(env);
-
-        cpu_restore_state(CPU(cpu), GETPC());
-        helper_raise_exception(env, TT_DIV_ZERO);
+        cpu_raise_exception_ra(env, TT_DIV_ZERO, GETPC());
     } else if (b == -1) {
         /* Avoid overflow trap with i386 divide insn.  */
         return -a;
@@ -180,10 +181,7 @@ uint64_t helper_udivx(CPUSPARCState *env, uint64_t a, uint64_t b)
 {
     if (b == 0) {
         /* Raise divide by zero trap.  */
-        SPARCCPU *cpu = sparc_env_get_cpu(env);
-
-        cpu_restore_state(CPU(cpu), GETPC());
-        helper_raise_exception(env, TT_DIV_ZERO);
+        cpu_raise_exception_ra(env, TT_DIV_ZERO, GETPC());
     }
     return a / b;
 }
@@ -192,7 +190,6 @@ uint64_t helper_udivx(CPUSPARCState *env, uint64_t a, uint64_t b)
 target_ulong helper_taddcctv(CPUSPARCState *env, target_ulong src1,
                              target_ulong src2)
 {
-    SPARCCPU *cpu = sparc_env_get_cpu(env);
     target_ulong dst;
 
     /* Tag overflow occurs if either input has bits 0 or 1 set.  */
@@ -215,14 +212,12 @@ target_ulong helper_taddcctv(CPUSPARCState *env, target_ulong src1,
     return dst;
 
  tag_overflow:
-    cpu_restore_state(CPU(cpu), GETPC());
-    helper_raise_exception(env, TT_TOVF);
+    cpu_raise_exception_ra(env, TT_TOVF, GETPC());
 }
 
 target_ulong helper_tsubcctv(CPUSPARCState *env, target_ulong src1,
                              target_ulong src2)
 {
-    SPARCCPU *cpu = sparc_env_get_cpu(env);
     target_ulong dst;
 
     /* Tag overflow occurs if either input has bits 0 or 1 set.  */
@@ -245,8 +240,7 @@ target_ulong helper_tsubcctv(CPUSPARCState *env, target_ulong src1,
     return dst;
 
  tag_overflow:
-    cpu_restore_state(CPU(cpu), GETPC());
-    helper_raise_exception(env, TT_TOVF);
+    cpu_raise_exception_ra(env, TT_TOVF, GETPC());
 }
 
 #ifndef TARGET_SPARC64
diff --git a/target-sparc/helper.h b/target-sparc/helper.h
index caa2a895d0..0cf1bfb73a 100644
--- a/target-sparc/helper.h
+++ b/target-sparc/helper.h
@@ -17,8 +17,6 @@ DEF_HELPER_1(rdcwp, tl, env)
 DEF_HELPER_2(wrcwp, void, env, tl)
 DEF_HELPER_FLAGS_2(array8, TCG_CALL_NO_RWG_SE, tl, tl, tl)
 DEF_HELPER_FLAGS_1(popc, TCG_CALL_NO_RWG_SE, tl, tl)
-DEF_HELPER_FLAGS_3(ldda_asi, TCG_CALL_NO_WG, void, env, tl, int)
-DEF_HELPER_FLAGS_5(casx_asi, TCG_CALL_NO_WG, tl, env, tl, tl, tl, i32)
 DEF_HELPER_FLAGS_2(set_softint, TCG_CALL_NO_RWG, void, env, i64)
 DEF_HELPER_FLAGS_2(clear_softint, TCG_CALL_NO_RWG, void, env, i64)
 DEF_HELPER_FLAGS_2(write_softint, TCG_CALL_NO_RWG, void, env, i64)
@@ -26,9 +24,6 @@ DEF_HELPER_FLAGS_2(tick_set_count, TCG_CALL_NO_RWG, void, ptr, i64)
 DEF_HELPER_FLAGS_3(tick_get_count, TCG_CALL_NO_WG, i64, env, ptr, int)
 DEF_HELPER_FLAGS_2(tick_set_limit, TCG_CALL_NO_RWG, void, ptr, i64)
 #endif
-#if !defined(CONFIG_USER_ONLY) || defined(TARGET_SPARC64)
-DEF_HELPER_FLAGS_5(cas_asi, TCG_CALL_NO_WG, tl, env, tl, tl, tl, i32)
-#endif
 DEF_HELPER_FLAGS_3(check_align, TCG_CALL_NO_WG, void, env, tl, i32)
 DEF_HELPER_1(debug, void, env)
 DEF_HELPER_1(save, void, env)
@@ -43,8 +38,6 @@ DEF_HELPER_3(tsubcctv, tl, env, tl, tl)
 DEF_HELPER_FLAGS_3(sdivx, TCG_CALL_NO_WG, s64, env, s64, s64)
 DEF_HELPER_FLAGS_3(udivx, TCG_CALL_NO_WG, i64, env, i64, i64)
 #endif
-DEF_HELPER_FLAGS_3(ldqf, TCG_CALL_NO_WG, void, env, tl, int)
-DEF_HELPER_FLAGS_3(stqf, TCG_CALL_NO_WG, void, env, tl, int)
 #if !defined(CONFIG_USER_ONLY) || defined(TARGET_SPARC64)
 DEF_HELPER_FLAGS_4(ld_asi, TCG_CALL_NO_WG, i64, env, tl, int, i32)
 DEF_HELPER_FLAGS_5(st_asi, TCG_CALL_NO_WG, void, env, tl, i64, int, i32)
diff --git a/target-sparc/ldst_helper.c b/target-sparc/ldst_helper.c
index 6ce5ccc37f..de7d53ae20 100644
--- a/target-sparc/ldst_helper.c
+++ b/target-sparc/ldst_helper.c
@@ -254,18 +254,6 @@ static void replace_tlb_1bit_lru(SparcTLBEntry *tlb,
 
 #endif
 
-#if defined(TARGET_SPARC64) || defined(CONFIG_USER_ONLY)
-static inline target_ulong address_mask(CPUSPARCState *env1, target_ulong addr)
-{
-#ifdef TARGET_SPARC64
-    if (AM_CHECK(env1)) {
-        addr &= 0xffffffffULL;
-    }
-#endif
-    return addr;
-}
-#endif
-
 #ifdef TARGET_SPARC64
 /* returns true if access using this ASI is to have address translated by MMU
    otherwise access is to raw physical address */
@@ -290,28 +278,41 @@ static inline int is_translating_asi(int asi)
     }
 }
 
+static inline target_ulong address_mask(CPUSPARCState *env1, target_ulong addr)
+{
+    if (AM_CHECK(env1)) {
+        addr &= 0xffffffffULL;
+    }
+    return addr;
+}
+
 static inline target_ulong asi_address_mask(CPUSPARCState *env,
                                             int asi, target_ulong addr)
 {
     if (is_translating_asi(asi)) {
-        return address_mask(env, addr);
-    } else {
-        return addr;
+        addr = address_mask(env, addr);
     }
+    return addr;
 }
 #endif
 
-void helper_check_align(CPUSPARCState *env, target_ulong addr, uint32_t align)
+static void do_check_align(CPUSPARCState *env, target_ulong addr,
+                           uint32_t align, uintptr_t ra)
 {
     if (addr & align) {
 #ifdef DEBUG_UNALIGNED
         printf("Unaligned access to 0x" TARGET_FMT_lx " from 0x" TARGET_FMT_lx
                "\n", addr, env->pc);
 #endif
-        helper_raise_exception(env, TT_UNALIGNED);
+        cpu_raise_exception_ra(env, TT_UNALIGNED, ra);
     }
 }
 
+void helper_check_align(CPUSPARCState *env, target_ulong addr, uint32_t align)
+{
+    do_check_align(env, addr, align, GETPC());
+}
+
 #if !defined(TARGET_SPARC64) && !defined(CONFIG_USER_ONLY) &&   \
     defined(DEBUG_MXCC)
 static void dump_mxcc(CPUSPARCState *env)
@@ -440,7 +441,7 @@ uint64_t helper_ld_asi(CPUSPARCState *env, target_ulong addr,
     uint32_t last_addr = addr;
 #endif
 
-    helper_check_align(env, addr, size - 1);
+    do_check_align(env, addr, size - 1, GETPC());
     switch (asi) {
     case ASI_M_MXCC: /* SuperSparc MXCC registers, or... */
     /* case ASI_LEON_CACHEREGS:  Leon3 cache control */
@@ -554,64 +555,11 @@ uint64_t helper_ld_asi(CPUSPARCState *env, target_ulong addr,
             break;
         }
         break;
-    case ASI_USERDATA: /* User data access */
-        switch (size) {
-        case 1:
-            ret = cpu_ldub_user(env, addr);
-            break;
-        case 2:
-            ret = cpu_lduw_user(env, addr);
-            break;
-        default:
-        case 4:
-            ret = cpu_ldl_user(env, addr);
-            break;
-        case 8:
-            ret = cpu_ldq_user(env, addr);
-            break;
-        }
-        break;
-    case ASI_KERNELDATA: /* Supervisor data access */
-    case ASI_P: /* Implicit primary context data access (v9 only?) */
-        switch (size) {
-        case 1:
-            ret = cpu_ldub_kernel(env, addr);
-            break;
-        case 2:
-            ret = cpu_lduw_kernel(env, addr);
-            break;
-        default:
-        case 4:
-            ret = cpu_ldl_kernel(env, addr);
-            break;
-        case 8:
-            ret = cpu_ldq_kernel(env, addr);
-            break;
-        }
-        break;
     case ASI_M_TXTC_TAG:   /* SparcStation 5 I-cache tag */
     case ASI_M_TXTC_DATA:  /* SparcStation 5 I-cache data */
     case ASI_M_DATAC_TAG:  /* SparcStation 5 D-cache tag */
     case ASI_M_DATAC_DATA: /* SparcStation 5 D-cache data */
         break;
-    case ASI_M_BYPASS:    /* MMU passthrough */
-    case ASI_LEON_BYPASS: /* LEON MMU passthrough */
-        switch (size) {
-        case 1:
-            ret = ldub_phys(cs->as, addr);
-            break;
-        case 2:
-            ret = lduw_phys(cs->as, addr);
-            break;
-        default:
-        case 4:
-            ret = ldl_phys(cs->as, addr);
-            break;
-        case 8:
-            ret = ldq_phys(cs->as, addr);
-            break;
-        }
-        break;
     case 0x21 ... 0x2f: /* MMU passthrough, 0x100000000 to 0xfffffffff */
         switch (size) {
         case 1:
@@ -679,6 +627,14 @@ uint64_t helper_ld_asi(CPUSPARCState *env, target_ulong addr,
         cpu_unassigned_access(cs, addr, false, false, asi, size);
         ret = 0;
         break;
+
+    case ASI_USERDATA: /* User data access */
+    case ASI_KERNELDATA: /* Supervisor data access */
+    case ASI_P: /* Implicit primary context data access (v9 only?) */
+    case ASI_M_BYPASS:    /* MMU passthrough */
+    case ASI_LEON_BYPASS: /* LEON MMU passthrough */
+        /* These are always handled inline.  */
+        g_assert_not_reached();
     }
     if (sign) {
         switch (size) {
@@ -708,7 +664,7 @@ void helper_st_asi(CPUSPARCState *env, target_ulong addr, uint64_t val,
     SPARCCPU *cpu = sparc_env_get_cpu(env);
     CPUState *cs = CPU(cpu);
 
-    helper_check_align(env, addr, size - 1);
+    do_check_align(env, addr, size - 1, GETPC());
     switch (asi) {
     case ASI_M_MXCC: /* SuperSparc MXCC registers, or... */
     /* case ASI_LEON_CACHEREGS:  Leon3 cache control */
@@ -881,10 +837,10 @@ void helper_st_asi(CPUSPARCState *env, target_ulong addr, uint64_t val,
             case 0: /* Control Register */
                 env->mmuregs[reg] = (env->mmuregs[reg] & 0xff000000) |
                     (val & 0x00ffffff);
-                /* Mappings generated during no-fault mode or MMU
-                   disabled mode are invalid in normal mode */
-                if ((oldreg & (MMU_E | MMU_NF | env->def->mmu_bm)) !=
-                    (env->mmuregs[reg] & (MMU_E | MMU_NF | env->def->mmu_bm))) {
+                /* Mappings generated during no-fault mode
+                   are invalid in normal mode.  */
+                if ((oldreg ^ env->mmuregs[reg])
+                    & (MMU_NF | env->def->mmu_bm)) {
                     tlb_flush(CPU(cpu), 1);
                 }
                 break;
@@ -929,41 +885,6 @@ void helper_st_asi(CPUSPARCState *env, target_ulong addr, uint64_t val,
     case ASI_M_DIAGS:   /* Turbosparc DTLB Diagnostic */
     case ASI_M_IODIAG:  /* Turbosparc IOTLB Diagnostic */
         break;
-    case ASI_USERDATA: /* User data access */
-        switch (size) {
-        case 1:
-            cpu_stb_user(env, addr, val);
-            break;
-        case 2:
-            cpu_stw_user(env, addr, val);
-            break;
-        default:
-        case 4:
-            cpu_stl_user(env, addr, val);
-            break;
-        case 8:
-            cpu_stq_user(env, addr, val);
-            break;
-        }
-        break;
-    case ASI_KERNELDATA: /* Supervisor data access */
-    case ASI_P:
-        switch (size) {
-        case 1:
-            cpu_stb_kernel(env, addr, val);
-            break;
-        case 2:
-            cpu_stw_kernel(env, addr, val);
-            break;
-        default:
-        case 4:
-            cpu_stl_kernel(env, addr, val);
-            break;
-        case 8:
-            cpu_stq_kernel(env, addr, val);
-            break;
-        }
-        break;
     case ASI_M_TXTC_TAG:   /* I-cache tag */
     case ASI_M_TXTC_DATA:  /* I-cache data */
     case ASI_M_DATAC_TAG:  /* D-cache tag */
@@ -974,52 +895,6 @@ void helper_st_asi(CPUSPARCState *env, target_ulong addr, uint64_t val,
     case ASI_M_FLUSH_CTX:    /* I/D-cache flush context */
     case ASI_M_FLUSH_USER:   /* I/D-cache flush user */
         break;
-    case ASI_M_BCOPY: /* Block copy, sta access */
-        {
-            /* val = src
-               addr = dst
-               copy 32 bytes */
-            unsigned int i;
-            uint32_t src = val & ~3, dst = addr & ~3, temp;
-
-            for (i = 0; i < 32; i += 4, src += 4, dst += 4) {
-                temp = cpu_ldl_kernel(env, src);
-                cpu_stl_kernel(env, dst, temp);
-            }
-        }
-        break;
-    case ASI_M_BFILL: /* Block fill, stda access */
-        {
-            /* addr = dst
-               fill 32 bytes with val */
-            unsigned int i;
-            uint32_t dst = addr & ~7;
-
-            for (i = 0; i < 32; i += 8, dst += 8) {
-                cpu_stq_kernel(env, dst, val);
-            }
-        }
-        break;
-    case ASI_M_BYPASS:    /* MMU passthrough */
-    case ASI_LEON_BYPASS: /* LEON MMU passthrough */
-        {
-            switch (size) {
-            case 1:
-                stb_phys(cs->as, addr, val);
-                break;
-            case 2:
-                stw_phys(cs->as, addr, val);
-                break;
-            case 4:
-            default:
-                stl_phys(cs->as, addr, val);
-                break;
-            case 8:
-                stq_phys(cs->as, addr, val);
-                break;
-            }
-        }
-        break;
     case 0x21 ... 0x2f: /* MMU passthrough, 0x100000000 to 0xfffffffff */
         {
             switch (size) {
@@ -1091,6 +966,16 @@ void helper_st_asi(CPUSPARCState *env, target_ulong addr, uint64_t val,
         cpu_unassigned_access(CPU(sparc_env_get_cpu(env)),
                               addr, true, false, asi, size);
         break;
+
+    case ASI_USERDATA: /* User data access */
+    case ASI_KERNELDATA: /* Supervisor data access */
+    case ASI_P:
+    case ASI_M_BYPASS:    /* MMU passthrough */
+    case ASI_LEON_BYPASS: /* LEON MMU passthrough */
+    case ASI_M_BCOPY: /* Block copy, sta access */
+    case ASI_M_BFILL: /* Block fill, stda access */
+        /* These are always handled inline.  */
+        g_assert_not_reached();
     }
 #ifdef DEBUG_ASI
     dump_asi("write", addr, asi, size, val);
@@ -1107,68 +992,54 @@ uint64_t helper_ld_asi(CPUSPARCState *env, target_ulong addr,
     int size = 1 << (memop & MO_SIZE);
     int sign = memop & MO_SIGN;
     uint64_t ret = 0;
-#if defined(DEBUG_ASI)
-    target_ulong last_addr = addr;
-#endif
 
     if (asi < 0x80) {
-        helper_raise_exception(env, TT_PRIV_ACT);
+        cpu_raise_exception_ra(env, TT_PRIV_ACT, GETPC());
     }
-
-    helper_check_align(env, addr, size - 1);
+    do_check_align(env, addr, size - 1, GETPC());
     addr = asi_address_mask(env, asi, addr);
 
     switch (asi) {
     case ASI_PNF:  /* Primary no-fault */
     case ASI_PNFL: /* Primary no-fault LE */
-        if (page_check_range(addr, size, PAGE_READ) == -1) {
-#ifdef DEBUG_ASI
-            dump_asi("read ", last_addr, asi, size, ret);
-#endif
-            return 0;
-        }
-        /* Fall through */
-    case ASI_P: /* Primary */
-    case ASI_PL: /* Primary LE */
-        {
-            switch (size) {
-            case 1:
-                ret = cpu_ldub_data(env, addr);
-                break;
-            case 2:
-                ret = cpu_lduw_data(env, addr);
-                break;
-            case 4:
-                ret = cpu_ldl_data(env, addr);
-                break;
-            default:
-            case 8:
-                ret = cpu_ldq_data(env, addr);
-                break;
-            }
-        }
-        break;
     case ASI_SNF:  /* Secondary no-fault */
     case ASI_SNFL: /* Secondary no-fault LE */
         if (page_check_range(addr, size, PAGE_READ) == -1) {
-#ifdef DEBUG_ASI
-            dump_asi("read ", last_addr, asi, size, ret);
-#endif
-            return 0;
+            ret = 0;
+            break;
+        }
+        switch (size) {
+        case 1:
+            ret = cpu_ldub_data(env, addr);
+            break;
+        case 2:
+            ret = cpu_lduw_data(env, addr);
+            break;
+        case 4:
+            ret = cpu_ldl_data(env, addr);
+            break;
+        case 8:
+            ret = cpu_ldq_data(env, addr);
+            break;
+        default:
+            g_assert_not_reached();
         }
-        /* Fall through */
+        break;
+        break;
+
+    case ASI_P: /* Primary */
+    case ASI_PL: /* Primary LE */
     case ASI_S:  /* Secondary */
     case ASI_SL: /* Secondary LE */
-        /* XXX */
-        break;
+        /* These are always handled inline.  */
+        g_assert_not_reached();
+
     default:
-        break;
+        cpu_raise_exception_ra(env, TT_DATA_ACCESS, GETPC());
     }
 
     /* Convert from little endian */
     switch (asi) {
-    case ASI_PL:   /* Primary LE */
-    case ASI_SL:   /* Secondary LE */
     case ASI_PNFL: /* Primary no-fault LE */
     case ASI_SNFL: /* Secondary no-fault LE */
         switch (size) {
@@ -1181,11 +1052,7 @@ uint64_t helper_ld_asi(CPUSPARCState *env, target_ulong addr,
         case 8:
             ret = bswap64(ret);
             break;
-        default:
-            break;
         }
-    default:
-        break;
     }
 
     /* Convert to signed number */
@@ -1200,12 +1067,10 @@ uint64_t helper_ld_asi(CPUSPARCState *env, target_ulong addr,
         case 4:
             ret = (int32_t) ret;
             break;
-        default:
-            break;
         }
     }
 #ifdef DEBUG_ASI
-    dump_asi("read ", last_addr, asi, size, ret);
+    dump_asi("read", addr, asi, size, ret);
 #endif
     return ret;
 }
@@ -1218,66 +1083,24 @@ void helper_st_asi(CPUSPARCState *env, target_ulong addr, target_ulong val,
     dump_asi("write", addr, asi, size, val);
 #endif
     if (asi < 0x80) {
-        helper_raise_exception(env, TT_PRIV_ACT);
-    }
-
-    helper_check_align(env, addr, size - 1);
-    addr = asi_address_mask(env, asi, addr);
-
-    /* Convert to little endian */
-    switch (asi) {
-    case ASI_PL: /* Primary LE */
-    case ASI_SL: /* Secondary LE */
-        switch (size) {
-        case 2:
-            val = bswap16(val);
-            break;
-        case 4:
-            val = bswap32(val);
-            break;
-        case 8:
-            val = bswap64(val);
-            break;
-        default:
-            break;
-        }
-    default:
-        break;
+        cpu_raise_exception_ra(env, TT_PRIV_ACT, GETPC());
     }
+    do_check_align(env, addr, size - 1, GETPC());
 
     switch (asi) {
     case ASI_P:  /* Primary */
     case ASI_PL: /* Primary LE */
-        {
-            switch (size) {
-            case 1:
-                cpu_stb_data(env, addr, val);
-                break;
-            case 2:
-                cpu_stw_data(env, addr, val);
-                break;
-            case 4:
-                cpu_stl_data(env, addr, val);
-                break;
-            case 8:
-            default:
-                cpu_stq_data(env, addr, val);
-                break;
-            }
-        }
-        break;
     case ASI_S:  /* Secondary */
     case ASI_SL: /* Secondary LE */
-        /* XXX */
-        return;
+        /* These are always handled inline.  */
+        g_assert_not_reached();
 
     case ASI_PNF:  /* Primary no-fault, RO */
     case ASI_SNF:  /* Secondary no-fault, RO */
     case ASI_PNFL: /* Primary no-fault LE, RO */
     case ASI_SNFL: /* Secondary no-fault LE, RO */
     default:
-        helper_raise_exception(env, TT_DATA_ACCESS);
-        return;
+        cpu_raise_exception_ra(env, TT_DATA_ACCESS, GETPC());
     }
 }
 
@@ -1300,176 +1123,94 @@ uint64_t helper_ld_asi(CPUSPARCState *env, target_ulong addr,
         || (cpu_has_hypervisor(env)
             && asi >= 0x30 && asi < 0x80
             && !(env->hpstate & HS_PRIV))) {
-        helper_raise_exception(env, TT_PRIV_ACT);
+        cpu_raise_exception_ra(env, TT_PRIV_ACT, GETPC());
     }
 
-    helper_check_align(env, addr, size - 1);
+    do_check_align(env, addr, size - 1, GETPC());
     addr = asi_address_mask(env, asi, addr);
 
-    /* process nonfaulting loads first */
-    if ((asi & 0xf6) == 0x82) {
-        int mmu_idx;
-
-        /* secondary space access has lowest asi bit equal to 1 */
-        if (env->pstate & PS_PRIV) {
-            mmu_idx = (asi & 1) ? MMU_KERNEL_SECONDARY_IDX : MMU_KERNEL_IDX;
-        } else {
-            mmu_idx = (asi & 1) ? MMU_USER_SECONDARY_IDX : MMU_USER_IDX;
-        }
+    switch (asi) {
+    case ASI_PNF:
+    case ASI_PNFL:
+    case ASI_SNF:
+    case ASI_SNFL:
+        {
+            TCGMemOpIdx oi;
+            int idx = (env->pstate & PS_PRIV
+                       ? (asi & 1 ? MMU_KERNEL_SECONDARY_IDX : MMU_KERNEL_IDX)
+                       : (asi & 1 ? MMU_USER_SECONDARY_IDX : MMU_USER_IDX));
 
-        if (cpu_get_phys_page_nofault(env, addr, mmu_idx) == -1ULL) {
+            if (cpu_get_phys_page_nofault(env, addr, idx) == -1ULL) {
 #ifdef DEBUG_ASI
-            dump_asi("read ", last_addr, asi, size, ret);
+                dump_asi("read ", last_addr, asi, size, ret);
 #endif
-            /* env->exception_index is set in get_physical_address_data(). */
-            helper_raise_exception(env, cs->exception_index);
-        }
-
-        /* convert nonfaulting load ASIs to normal load ASIs */
-        asi &= ~0x02;
-    }
-
-    switch (asi) {
-    case ASI_AIUP:  /* As if user primary */
-    case ASI_AIUS:  /* As if user secondary */
-    case ASI_AIUPL: /* As if user primary LE */
-    case ASI_AIUSL: /* As if user secondary LE */
-    case ASI_P:  /* Primary */
-    case ASI_S:  /* Secondary */
-    case ASI_PL: /* Primary LE */
-    case ASI_SL: /* Secondary LE */
-        if ((asi & 0x80) && (env->pstate & PS_PRIV)) {
-            if (cpu_hypervisor_mode(env)) {
-                switch (size) {
-                case 1:
-                    ret = cpu_ldub_hypv(env, addr);
-                    break;
-                case 2:
-                    ret = cpu_lduw_hypv(env, addr);
-                    break;
-                case 4:
-                    ret = cpu_ldl_hypv(env, addr);
-                    break;
-                default:
-                case 8:
-                    ret = cpu_ldq_hypv(env, addr);
-                    break;
-                }
-            } else {
-                /* secondary space access has lowest asi bit equal to 1 */
-                if (asi & 1) {
-                    switch (size) {
-                    case 1:
-                        ret = cpu_ldub_kernel_secondary(env, addr);
-                        break;
-                    case 2:
-                        ret = cpu_lduw_kernel_secondary(env, addr);
-                        break;
-                    case 4:
-                        ret = cpu_ldl_kernel_secondary(env, addr);
-                        break;
-                    default:
-                    case 8:
-                        ret = cpu_ldq_kernel_secondary(env, addr);
-                        break;
-                    }
-                } else {
-                    switch (size) {
-                    case 1:
-                        ret = cpu_ldub_kernel(env, addr);
-                        break;
-                    case 2:
-                        ret = cpu_lduw_kernel(env, addr);
-                        break;
-                    case 4:
-                        ret = cpu_ldl_kernel(env, addr);
-                        break;
-                    default:
-                    case 8:
-                        ret = cpu_ldq_kernel(env, addr);
-                        break;
-                    }
-                }
+                /* exception_index is set in get_physical_address_data. */
+                cpu_raise_exception_ra(env, cs->exception_index, GETPC());
             }
-        } else {
-            /* secondary space access has lowest asi bit equal to 1 */
-            if (asi & 1) {
-                switch (size) {
-                case 1:
-                    ret = cpu_ldub_user_secondary(env, addr);
-                    break;
-                case 2:
-                    ret = cpu_lduw_user_secondary(env, addr);
-                    break;
-                case 4:
-                    ret = cpu_ldl_user_secondary(env, addr);
-                    break;
-                default:
-                case 8:
-                    ret = cpu_ldq_user_secondary(env, addr);
-                    break;
-                }
-            } else {
-                switch (size) {
-                case 1:
-                    ret = cpu_ldub_user(env, addr);
-                    break;
-                case 2:
-                    ret = cpu_lduw_user(env, addr);
-                    break;
-                case 4:
-                    ret = cpu_ldl_user(env, addr);
-                    break;
-                default:
-                case 8:
-                    ret = cpu_ldq_user(env, addr);
-                    break;
-                }
-            }
-        }
-        break;
-    case ASI_REAL:        /* Bypass */
-    case ASI_REAL_IO:   /* Bypass, non-cacheable */
-    case ASI_REAL_L:      /* Bypass LE */
-    case ASI_REAL_IO_L: /* Bypass, non-cacheable LE */
-        {
+            oi = make_memop_idx(memop, idx);
             switch (size) {
             case 1:
-                ret = ldub_phys(cs->as, addr);
+                ret = helper_ret_ldub_mmu(env, addr, oi, GETPC());
                 break;
             case 2:
-                ret = lduw_phys(cs->as, addr);
+                if (asi & 8) {
+                    ret = helper_le_lduw_mmu(env, addr, oi, GETPC());
+                } else {
+                    ret = helper_be_lduw_mmu(env, addr, oi, GETPC());
+                }
                 break;
             case 4:
-                ret = ldl_phys(cs->as, addr);
+                if (asi & 8) {
+                    ret = helper_le_ldul_mmu(env, addr, oi, GETPC());
+                } else {
+                    ret = helper_be_ldul_mmu(env, addr, oi, GETPC());
+                }
                 break;
-            default:
             case 8:
-                ret = ldq_phys(cs->as, addr);
+                if (asi & 8) {
+                    ret = helper_le_ldq_mmu(env, addr, oi, GETPC());
+                } else {
+                    ret = helper_be_ldq_mmu(env, addr, oi, GETPC());
+                }
                 break;
+            default:
+                g_assert_not_reached();
             }
-            break;
         }
+        break;
+
+    case ASI_AIUP:  /* As if user primary */
+    case ASI_AIUS:  /* As if user secondary */
+    case ASI_AIUPL: /* As if user primary LE */
+    case ASI_AIUSL: /* As if user secondary LE */
+    case ASI_P:  /* Primary */
+    case ASI_S:  /* Secondary */
+    case ASI_PL: /* Primary LE */
+    case ASI_SL: /* Secondary LE */
+    case ASI_REAL:      /* Bypass */
+    case ASI_REAL_IO:   /* Bypass, non-cacheable */
+    case ASI_REAL_L:    /* Bypass LE */
+    case ASI_REAL_IO_L: /* Bypass, non-cacheable LE */
     case ASI_N:  /* Nucleus */
     case ASI_NL: /* Nucleus Little Endian (LE) */
-        {
-            switch (size) {
-            case 1:
-                ret = cpu_ldub_nucleus(env, addr);
-                break;
-            case 2:
-                ret = cpu_lduw_nucleus(env, addr);
-                break;
-            case 4:
-                ret = cpu_ldl_nucleus(env, addr);
-                break;
-            default:
-            case 8:
-                ret = cpu_ldq_nucleus(env, addr);
-                break;
-            }
-            break;
-        }
+    case ASI_NUCLEUS_QUAD_LDD:   /* Nucleus quad LDD 128 bit atomic */
+    case ASI_NUCLEUS_QUAD_LDD_L: /* Nucleus quad LDD 128 bit atomic LE */
+    case ASI_TWINX_AIUP:   /* As if user primary, twinx */
+    case ASI_TWINX_AIUS:   /* As if user secondary, twinx */
+    case ASI_TWINX_REAL:   /* Real address, twinx */
+    case ASI_TWINX_AIUP_L: /* As if user primary, twinx, LE */
+    case ASI_TWINX_AIUS_L: /* As if user secondary, twinx, LE */
+    case ASI_TWINX_REAL_L: /* Real address, twinx, LE */
+    case ASI_TWINX_N:  /* Nucleus, twinx */
+    case ASI_TWINX_NL: /* Nucleus, twinx, LE */
+    /* ??? From the UA2011 document; overlaps BLK_INIT_QUAD_LDD_* */
+    case ASI_TWINX_P:  /* Primary, twinx */
+    case ASI_TWINX_PL: /* Primary, twinx, LE */
+    case ASI_TWINX_S:  /* Secondary, twinx */
+    case ASI_TWINX_SL: /* Secondary, twinx, LE */
+        /* These are always handled inline.  */
+        g_assert_not_reached();
+
     case ASI_UPA_CONFIG: /* UPA config */
         /* XXX */
         break;
@@ -1597,51 +1338,6 @@ uint64_t helper_ld_asi(CPUSPARCState *env, target_ulong addr,
         cpu_unassigned_access(cs, addr, false, false, 1, size);
         ret = 0;
         break;
-
-    case ASI_NUCLEUS_QUAD_LDD:   /* Nucleus quad LDD 128 bit atomic */
-    case ASI_NUCLEUS_QUAD_LDD_L: /* Nucleus quad LDD 128 bit atomic LE */
-    case ASI_TWINX_AIUP:   /* As if user primary, twinx */
-    case ASI_TWINX_AIUS:   /* As if user secondary, twinx */
-    case ASI_TWINX_REAL:   /* Real address, twinx */
-    case ASI_TWINX_AIUP_L: /* As if user primary, twinx, LE */
-    case ASI_TWINX_AIUS_L: /* As if user secondary, twinx, LE */
-    case ASI_TWINX_REAL_L: /* Real address, twinx, LE */
-    case ASI_TWINX_N:  /* Nucleus, twinx */
-    case ASI_TWINX_NL: /* Nucleus, twinx, LE */
-    /* ??? From the UA2011 document; overlaps BLK_INIT_QUAD_LDD_* */
-    case ASI_TWINX_P:  /* Primary, twinx */
-    case ASI_TWINX_PL: /* Primary, twinx, LE */
-    case ASI_TWINX_S:  /* Secondary, twinx */
-    case ASI_TWINX_SL: /* Secondary, twinx, LE */
-        /* These are all 128-bit atomic; only ldda (now ldtxa) allowed */
-        helper_raise_exception(env, TT_ILL_INSN);
-        return 0;
-    }
-
-    /* Convert from little endian */
-    switch (asi) {
-    case ASI_NL: /* Nucleus Little Endian (LE) */
-    case ASI_AIUPL: /* As if user primary LE */
-    case ASI_AIUSL: /* As if user secondary LE */
-    case ASI_REAL_L:      /* Bypass LE */
-    case ASI_REAL_IO_L: /* Bypass, non-cacheable LE */
-    case ASI_PL: /* Primary LE */
-    case ASI_SL: /* Secondary LE */
-        switch(size) {
-        case 2:
-            ret = bswap16(ret);
-            break;
-        case 4:
-            ret = bswap32(ret);
-            break;
-        case 8:
-            ret = bswap64(ret);
-            break;
-        default:
-            break;
-        }
-    default:
-        break;
     }
 
     /* Convert to signed number */
@@ -1683,38 +1379,12 @@ void helper_st_asi(CPUSPARCState *env, target_ulong addr, target_ulong val,
         || (cpu_has_hypervisor(env)
             && asi >= 0x30 && asi < 0x80
             && !(env->hpstate & HS_PRIV))) {
-        helper_raise_exception(env, TT_PRIV_ACT);
+        cpu_raise_exception_ra(env, TT_PRIV_ACT, GETPC());
     }
 
-    helper_check_align(env, addr, size - 1);
+    do_check_align(env, addr, size - 1, GETPC());
     addr = asi_address_mask(env, asi, addr);
 
-    /* Convert to little endian */
-    switch (asi) {
-    case ASI_NL: /* Nucleus Little Endian (LE) */
-    case ASI_AIUPL: /* As if user primary LE */
-    case ASI_AIUSL: /* As if user secondary LE */
-    case ASI_REAL_L: /* Bypass LE */
-    case ASI_REAL_IO_L: /* Bypass, non-cacheable LE */
-    case ASI_PL: /* Primary LE */
-    case ASI_SL: /* Secondary LE */
-        switch (size) {
-        case 2:
-            val = bswap16(val);
-            break;
-        case 4:
-            val = bswap32(val);
-            break;
-        case 8:
-            val = bswap64(val);
-            break;
-        default:
-            break;
-        }
-    default:
-        break;
-    }
-
     switch (asi) {
     case ASI_AIUP:  /* As if user primary */
     case ASI_AIUS:  /* As if user secondary */
@@ -1724,160 +1394,36 @@ void helper_st_asi(CPUSPARCState *env, target_ulong addr, target_ulong val,
     case ASI_S:  /* Secondary */
     case ASI_PL: /* Primary LE */
     case ASI_SL: /* Secondary LE */
-        if ((asi & 0x80) && (env->pstate & PS_PRIV)) {
-            if (cpu_hypervisor_mode(env)) {
-                switch (size) {
-                case 1:
-                    cpu_stb_hypv(env, addr, val);
-                    break;
-                case 2:
-                    cpu_stw_hypv(env, addr, val);
-                    break;
-                case 4:
-                    cpu_stl_hypv(env, addr, val);
-                    break;
-                case 8:
-                default:
-                    cpu_stq_hypv(env, addr, val);
-                    break;
-                }
-            } else {
-                /* secondary space access has lowest asi bit equal to 1 */
-                if (asi & 1) {
-                    switch (size) {
-                    case 1:
-                        cpu_stb_kernel_secondary(env, addr, val);
-                        break;
-                    case 2:
-                        cpu_stw_kernel_secondary(env, addr, val);
-                        break;
-                    case 4:
-                        cpu_stl_kernel_secondary(env, addr, val);
-                        break;
-                    case 8:
-                    default:
-                        cpu_stq_kernel_secondary(env, addr, val);
-                        break;
-                    }
-                } else {
-                    switch (size) {
-                    case 1:
-                        cpu_stb_kernel(env, addr, val);
-                        break;
-                    case 2:
-                        cpu_stw_kernel(env, addr, val);
-                        break;
-                    case 4:
-                        cpu_stl_kernel(env, addr, val);
-                        break;
-                    case 8:
-                    default:
-                        cpu_stq_kernel(env, addr, val);
-                        break;
-                    }
-                }
-            }
-        } else {
-            /* secondary space access has lowest asi bit equal to 1 */
-            if (asi & 1) {
-                switch (size) {
-                case 1:
-                    cpu_stb_user_secondary(env, addr, val);
-                    break;
-                case 2:
-                    cpu_stw_user_secondary(env, addr, val);
-                    break;
-                case 4:
-                    cpu_stl_user_secondary(env, addr, val);
-                    break;
-                case 8:
-                default:
-                    cpu_stq_user_secondary(env, addr, val);
-                    break;
-                }
-            } else {
-                switch (size) {
-                case 1:
-                    cpu_stb_user(env, addr, val);
-                    break;
-                case 2:
-                    cpu_stw_user(env, addr, val);
-                    break;
-                case 4:
-                    cpu_stl_user(env, addr, val);
-                    break;
-                case 8:
-                default:
-                    cpu_stq_user(env, addr, val);
-                    break;
-                }
-            }
-        }
-        break;
     case ASI_REAL:      /* Bypass */
     case ASI_REAL_IO:   /* Bypass, non-cacheable */
     case ASI_REAL_L:    /* Bypass LE */
     case ASI_REAL_IO_L: /* Bypass, non-cacheable LE */
-        {
-            switch (size) {
-            case 1:
-                stb_phys(cs->as, addr, val);
-                break;
-            case 2:
-                stw_phys(cs->as, addr, val);
-                break;
-            case 4:
-                stl_phys(cs->as, addr, val);
-                break;
-            case 8:
-            default:
-                stq_phys(cs->as, addr, val);
-                break;
-            }
-        }
-        return;
     case ASI_N:  /* Nucleus */
     case ASI_NL: /* Nucleus Little Endian (LE) */
-        {
-            switch (size) {
-            case 1:
-                cpu_stb_nucleus(env, addr, val);
-                break;
-            case 2:
-                cpu_stw_nucleus(env, addr, val);
-                break;
-            case 4:
-                cpu_stl_nucleus(env, addr, val);
-                break;
-            default:
-            case 8:
-                cpu_stq_nucleus(env, addr, val);
-                break;
-            }
-            break;
-        }
+    case ASI_NUCLEUS_QUAD_LDD:   /* Nucleus quad LDD 128 bit atomic */
+    case ASI_NUCLEUS_QUAD_LDD_L: /* Nucleus quad LDD 128 bit atomic LE */
+    case ASI_TWINX_AIUP:   /* As if user primary, twinx */
+    case ASI_TWINX_AIUS:   /* As if user secondary, twinx */
+    case ASI_TWINX_REAL:   /* Real address, twinx */
+    case ASI_TWINX_AIUP_L: /* As if user primary, twinx, LE */
+    case ASI_TWINX_AIUS_L: /* As if user secondary, twinx, LE */
+    case ASI_TWINX_REAL_L: /* Real address, twinx, LE */
+    case ASI_TWINX_N:  /* Nucleus, twinx */
+    case ASI_TWINX_NL: /* Nucleus, twinx, LE */
+    /* ??? From the UA2011 document; overlaps BLK_INIT_QUAD_LDD_* */
+    case ASI_TWINX_P:  /* Primary, twinx */
+    case ASI_TWINX_PL: /* Primary, twinx, LE */
+    case ASI_TWINX_S:  /* Secondary, twinx */
+    case ASI_TWINX_SL: /* Secondary, twinx, LE */
+        /* These are always handled inline.  */
+        g_assert_not_reached();
 
     case ASI_UPA_CONFIG: /* UPA config */
         /* XXX */
         return;
     case ASI_LSU_CONTROL: /* LSU */
-        {
-            uint64_t oldreg;
-
-            oldreg = env->lsu;
-            env->lsu = val & (DMMU_E | IMMU_E);
-            /* Mappings generated during D/I MMU disabled mode are
-               invalid in normal mode */
-            if (oldreg != env->lsu) {
-                DPRINTF_MMU("LSU change: 0x%" PRIx64 " -> 0x%" PRIx64 "\n",
-                            oldreg, env->lsu);
-#ifdef DEBUG_MMU
-                dump_mmu(stdout, fprintf, env);
-#endif
-                tlb_flush(CPU(cpu), 1);
-            }
-            return;
-        }
+        env->lsu = val & (DMMU_E | IMMU_E);
+        return;
     case ASI_IMMU: /* I-MMU regs */
         {
             int reg = (addr >> 3) & 0xf;
@@ -2016,24 +1562,6 @@ void helper_st_asi(CPUSPARCState *env, target_ulong addr, target_ulong val,
     case ASI_INTR_RECEIVE: /* Interrupt data receive */
         env->ivec_status = val & 0x20;
         return;
-    case ASI_NUCLEUS_QUAD_LDD:   /* Nucleus quad LDD 128 bit atomic */
-    case ASI_NUCLEUS_QUAD_LDD_L: /* Nucleus quad LDD 128 bit atomic LE */
-    case ASI_TWINX_AIUP:   /* As if user primary, twinx */
-    case ASI_TWINX_AIUS:   /* As if user secondary, twinx */
-    case ASI_TWINX_REAL:   /* Real address, twinx */
-    case ASI_TWINX_AIUP_L: /* As if user primary, twinx, LE */
-    case ASI_TWINX_AIUS_L: /* As if user secondary, twinx, LE */
-    case ASI_TWINX_REAL_L: /* Real address, twinx, LE */
-    case ASI_TWINX_N:  /* Nucleus, twinx */
-    case ASI_TWINX_NL: /* Nucleus, twinx, LE */
-    /* ??? From the UA2011 document; overlaps BLK_INIT_QUAD_LDD_* */
-    case ASI_TWINX_P:  /* Primary, twinx */
-    case ASI_TWINX_PL: /* Primary, twinx, LE */
-    case ASI_TWINX_S:  /* Secondary, twinx */
-    case ASI_TWINX_SL: /* Secondary, twinx, LE */
-        /* Only stda allowed */
-        helper_raise_exception(env, TT_ILL_INSN);
-        return;
     case ASI_DCACHE_DATA: /* D-cache data */
     case ASI_DCACHE_TAG: /* D-cache tag access */
     case ASI_ESTATE_ERROR_EN: /* E-cache error enable */
@@ -2066,203 +1594,8 @@ void helper_st_asi(CPUSPARCState *env, target_ulong addr, target_ulong val,
     }
 }
 #endif /* CONFIG_USER_ONLY */
-
-/* 128-bit LDDA; result returned in QT0.  */
-void helper_ldda_asi(CPUSPARCState *env, target_ulong addr, int asi)
-{
-    uint64_t h, l;
-
-    if ((asi < 0x80 && (env->pstate & PS_PRIV) == 0)
-        || (cpu_has_hypervisor(env)
-            && asi >= 0x30 && asi < 0x80
-            && !(env->hpstate & HS_PRIV))) {
-        helper_raise_exception(env, TT_PRIV_ACT);
-    }
-
-    addr = asi_address_mask(env, asi, addr);
-
-    switch (asi) {
-#if !defined(CONFIG_USER_ONLY)
-    case ASI_TWINX_AIUP:   /* As if user primary, twinx */
-    case ASI_TWINX_AIUP_L: /* As if user primary, twinx, LE */
-        helper_check_align(env, addr, 0xf);
-        h = cpu_ldq_user(env, addr);
-        l = cpu_ldq_user(env, addr + 8);
-        break;
-    case ASI_TWINX_AIUS:   /* As if user secondary, twinx */
-    case ASI_TWINX_AIUS_L: /* As if user secondary, twinx, LE */
-        helper_check_align(env, addr, 0xf);
-        h = cpu_ldq_user_secondary(env, addr);
-        l = cpu_ldq_user_secondary(env, addr + 8);
-        break;
-    case ASI_TWINX_REAL:   /* Real address, twinx */
-    case ASI_TWINX_REAL_L: /* Real address, twinx, LE */
-        helper_check_align(env, addr, 0xf);
-        {
-            CPUState *cs = CPU(sparc_env_get_cpu(env));
-            h = ldq_phys(cs->as, addr);
-            l = ldq_phys(cs->as, addr + 8);
-        }
-        break;
-    case ASI_NUCLEUS_QUAD_LDD:
-    case ASI_NUCLEUS_QUAD_LDD_L:
-    case ASI_TWINX_N:  /* Nucleus, twinx */
-    case ASI_TWINX_NL: /* Nucleus, twinx, LE */
-        helper_check_align(env, addr, 0xf);
-        h = cpu_ldq_nucleus(env, addr);
-        l = cpu_ldq_nucleus(env, addr + 8);
-        break;
-    case ASI_TWINX_S: /* Secondary, twinx */
-    case ASI_TWINX_SL: /* Secondary, twinx, LE */
-        if (!cpu_hypervisor_mode(env)) {
-            helper_check_align(env, addr, 0xf);
-            if (env->pstate & PS_PRIV) {
-                h = cpu_ldq_kernel_secondary(env, addr);
-                l = cpu_ldq_kernel_secondary(env, addr + 8);
-            } else {
-                h = cpu_ldq_user_secondary(env, addr);
-                l = cpu_ldq_user_secondary(env, addr + 8);
-            }
-            break;
-        }
-        /* fallthru */
-    case ASI_TWINX_P:  /* Primary, twinx */
-    case ASI_TWINX_PL: /* Primary, twinx, LE */
-        helper_check_align(env, addr, 0xf);
-        h = cpu_ldq_data(env, addr);
-        l = cpu_ldq_data(env, addr + 8);
-        break;
-#else
-    case ASI_TWINX_P:  /* Primary, twinx */
-    case ASI_TWINX_PL: /* Primary, twinx, LE */
-    case ASI_TWINX_S:  /* Primary, twinx */
-    case ASI_TWINX_SL: /* Primary, twinx, LE */
-        /* ??? Should be available, but we need to implement
-           an atomic 128-bit load.  */
-        helper_raise_exception(env, TT_PRIV_ACT);
-#endif
-    default:
-        /* Non-twinx asi, so this is the legacy ldda insn, which
-           performs two word sized operations.  */
-        /* ??? The UA2011 manual recommends emulating this with
-           a single 64-bit load.  However, LE asis *are* treated
-           as two 32-bit loads individually byte swapped.  */
-        helper_check_align(env, addr, 0x7);
-        QT0.high = (uint32_t)helper_ld_asi(env, addr, asi, MO_UL);
-        QT0.low = (uint32_t)helper_ld_asi(env, addr + 4, asi, MO_UL);
-        return;
-    }
-
-    if (asi & 8) {
-        h = bswap64(h);
-        l = bswap64(l);
-    }
-    QT0.high = h;
-    QT0.low = l;
-}
-
-target_ulong helper_casx_asi(CPUSPARCState *env, target_ulong addr,
-                             target_ulong val1, target_ulong val2,
-                             uint32_t asi)
-{
-    target_ulong ret;
-
-    ret = helper_ld_asi(env, addr, asi, MO_Q);
-    if (val2 == ret) {
-        helper_st_asi(env, addr, val1, asi, MO_Q);
-    }
-    return ret;
-}
 #endif /* TARGET_SPARC64 */
 
-#if !defined(CONFIG_USER_ONLY) || defined(TARGET_SPARC64)
-target_ulong helper_cas_asi(CPUSPARCState *env, target_ulong addr,
-                            target_ulong val1, target_ulong val2, uint32_t asi)
-{
-    target_ulong ret;
-
-    val2 &= 0xffffffffUL;
-    ret = helper_ld_asi(env, addr, asi, MO_UL);
-    ret &= 0xffffffffUL;
-    if (val2 == ret) {
-        helper_st_asi(env, addr, val1 & 0xffffffffUL, asi, MO_UL);
-    }
-    return ret;
-}
-#endif /* !defined(CONFIG_USER_ONLY) || defined(TARGET_SPARC64) */
-
-void helper_ldqf(CPUSPARCState *env, target_ulong addr, int mem_idx)
-{
-    /* XXX add 128 bit load */
-    CPU_QuadU u;
-
-    helper_check_align(env, addr, 7);
-#if !defined(CONFIG_USER_ONLY)
-    switch (mem_idx) {
-    case MMU_USER_IDX:
-        u.ll.upper = cpu_ldq_user(env, addr);
-        u.ll.lower = cpu_ldq_user(env, addr + 8);
-        QT0 = u.q;
-        break;
-    case MMU_KERNEL_IDX:
-        u.ll.upper = cpu_ldq_kernel(env, addr);
-        u.ll.lower = cpu_ldq_kernel(env, addr + 8);
-        QT0 = u.q;
-        break;
-#ifdef TARGET_SPARC64
-    case MMU_HYPV_IDX:
-        u.ll.upper = cpu_ldq_hypv(env, addr);
-        u.ll.lower = cpu_ldq_hypv(env, addr + 8);
-        QT0 = u.q;
-        break;
-#endif
-    default:
-        DPRINTF_MMU("helper_ldqf: need to check MMU idx %d\n", mem_idx);
-        break;
-    }
-#else
-    u.ll.upper = cpu_ldq_data(env, address_mask(env, addr));
-    u.ll.lower = cpu_ldq_data(env, address_mask(env, addr + 8));
-    QT0 = u.q;
-#endif
-}
-
-void helper_stqf(CPUSPARCState *env, target_ulong addr, int mem_idx)
-{
-    /* XXX add 128 bit store */
-    CPU_QuadU u;
-
-    helper_check_align(env, addr, 7);
-#if !defined(CONFIG_USER_ONLY)
-    switch (mem_idx) {
-    case MMU_USER_IDX:
-        u.q = QT0;
-        cpu_stq_user(env, addr, u.ll.upper);
-        cpu_stq_user(env, addr + 8, u.ll.lower);
-        break;
-    case MMU_KERNEL_IDX:
-        u.q = QT0;
-        cpu_stq_kernel(env, addr, u.ll.upper);
-        cpu_stq_kernel(env, addr + 8, u.ll.lower);
-        break;
-#ifdef TARGET_SPARC64
-    case MMU_HYPV_IDX:
-        u.q = QT0;
-        cpu_stq_hypv(env, addr, u.ll.upper);
-        cpu_stq_hypv(env, addr + 8, u.ll.lower);
-        break;
-#endif
-    default:
-        DPRINTF_MMU("helper_stqf: need to check MMU idx %d\n", mem_idx);
-        break;
-    }
-#else
-    u.q = QT0;
-    cpu_stq_data(env, address_mask(env, addr), u.ll.upper);
-    cpu_stq_data(env, address_mask(env, addr + 8), u.ll.lower);
-#endif
-}
-
 #if !defined(CONFIG_USER_ONLY)
 #ifndef TARGET_SPARC64
 void sparc_cpu_unassigned_access(CPUState *cs, hwaddr addr,
@@ -2314,11 +1647,8 @@ void sparc_cpu_unassigned_access(CPUState *cs, hwaddr addr,
     }
 
     if ((env->mmuregs[0] & MMU_E) && !(env->mmuregs[0] & MMU_NF)) {
-        if (is_exec) {
-            helper_raise_exception(env, TT_CODE_ACCESS);
-        } else {
-            helper_raise_exception(env, TT_DATA_ACCESS);
-        }
+        int tt = is_exec ? TT_CODE_ACCESS : TT_DATA_ACCESS;
+        cpu_raise_exception_ra(env, tt, GETPC());
     }
 
     /* flush neverland mappings created during no-fault mode,
@@ -2334,17 +1664,14 @@ void sparc_cpu_unassigned_access(CPUState *cs, hwaddr addr,
 {
     SPARCCPU *cpu = SPARC_CPU(cs);
     CPUSPARCState *env = &cpu->env;
+    int tt = is_exec ? TT_CODE_ACCESS : TT_DATA_ACCESS;
 
 #ifdef DEBUG_UNASSIGNED
     printf("Unassigned mem access to " TARGET_FMT_plx " from " TARGET_FMT_lx
            "\n", addr, env->pc);
 #endif
 
-    if (is_exec) {
-        helper_raise_exception(env, TT_CODE_ACCESS);
-    } else {
-        helper_raise_exception(env, TT_DATA_ACCESS);
-    }
+    cpu_raise_exception_ra(env, tt, GETPC());
 }
 #endif
 #endif
@@ -2362,10 +1689,7 @@ void QEMU_NORETURN sparc_cpu_do_unaligned_access(CPUState *cs, vaddr addr,
     printf("Unaligned access to 0x" TARGET_FMT_lx " from 0x" TARGET_FMT_lx
            "\n", addr, env->pc);
 #endif
-    if (retaddr) {
-        cpu_restore_state(CPU(cpu), retaddr);
-    }
-    helper_raise_exception(env, TT_UNALIGNED);
+    cpu_raise_exception_ra(env, TT_UNALIGNED, retaddr);
 }
 
 /* try to fill the TLB and return an exception if error. If retaddr is
@@ -2379,10 +1703,7 @@ void tlb_fill(CPUState *cs, target_ulong addr, MMUAccessType access_type,
 
     ret = sparc_cpu_handle_mmu_fault(cs, addr, access_type, mmu_idx);
     if (ret) {
-        if (retaddr) {
-            cpu_restore_state(cs, retaddr);
-        }
-        cpu_loop_exit(cs);
+        cpu_loop_exit_restore(cs, retaddr);
     }
 }
 #endif
diff --git a/target-sparc/machine.c b/target-sparc/machine.c
index 59c92f7582..aea6397861 100644
--- a/target-sparc/machine.c
+++ b/target-sparc/machine.c
@@ -6,10 +6,7 @@
 #include "hw/boards.h"
 #include "qemu/timer.h"
 
-#include "cpu.h"
-#include "exec/exec-all.h"
 #include "migration/cpu.h"
-#include "exec/exec-all.h"
 
 #ifdef TARGET_SPARC64
 static const VMStateDescription vmstate_cpu_timer = {
diff --git a/target-sparc/mmu_helper.c b/target-sparc/mmu_helper.c
index 32b629fb0d..044e88c4c5 100644
--- a/target-sparc/mmu_helper.c
+++ b/target-sparc/mmu_helper.c
@@ -92,7 +92,7 @@ static int get_physical_address(CPUSPARCState *env, hwaddr *physical,
 
     is_user = mmu_idx == MMU_USER_IDX;
 
-    if ((env->mmuregs[0] & MMU_E) == 0) { /* MMU disabled */
+    if (mmu_idx == MMU_PHYS_IDX) {
         *page_size = TARGET_PAGE_SIZE;
         /* Boot mode: instruction fetches are taken from PROM */
         if (rw == 2 && (env->mmuregs[0] & env->def->mmu_bm)) {
@@ -494,23 +494,21 @@ static int get_physical_address_data(CPUSPARCState *env,
     unsigned int i;
     uint64_t context;
     uint64_t sfsr = 0;
-
-    int is_user = (mmu_idx == MMU_USER_IDX ||
-                   mmu_idx == MMU_USER_SECONDARY_IDX);
-
-    if ((env->lsu & DMMU_E) == 0) { /* DMMU disabled */
-        *physical = ultrasparc_truncate_physical(address);
-        *prot = PAGE_READ | PAGE_WRITE;
-        return 0;
-    }
+    bool is_user = false;
 
     switch (mmu_idx) {
+    case MMU_PHYS_IDX:
+        g_assert_not_reached();
     case MMU_USER_IDX:
+        is_user = true;
+        /* fallthru */
     case MMU_KERNEL_IDX:
         context = env->dmmu.mmu_primary_context & 0x1fff;
         sfsr |= SFSR_CT_PRIMARY;
         break;
     case MMU_USER_SECONDARY_IDX:
+        is_user = true;
+        /* fallthru */
     case MMU_KERNEL_SECONDARY_IDX:
         context = env->dmmu.mmu_secondary_context & 0x1fff;
         sfsr |= SFSR_CT_SECONDARY;
@@ -613,15 +611,22 @@ static int get_physical_address_code(CPUSPARCState *env,
     CPUState *cs = CPU(sparc_env_get_cpu(env));
     unsigned int i;
     uint64_t context;
+    bool is_user = false;
 
-    int is_user = (mmu_idx == MMU_USER_IDX ||
-                   mmu_idx == MMU_USER_SECONDARY_IDX);
-
-    if ((env->lsu & IMMU_E) == 0 || (env->pstate & PS_RED) != 0) {
-        /* IMMU disabled */
-        *physical = ultrasparc_truncate_physical(address);
-        *prot = PAGE_EXEC;
-        return 0;
+    switch (mmu_idx) {
+    case MMU_PHYS_IDX:
+    case MMU_USER_SECONDARY_IDX:
+    case MMU_KERNEL_SECONDARY_IDX:
+        g_assert_not_reached();
+    case MMU_USER_IDX:
+        is_user = true;
+        /* fallthru */
+    case MMU_KERNEL_IDX:
+        context = env->dmmu.mmu_primary_context & 0x1fff;
+        break;
+    default:
+        context = 0;
+        break;
     }
 
     if (env->tl == 0) {
@@ -700,6 +705,12 @@ static int get_physical_address(CPUSPARCState *env, hwaddr *physical,
         }
     }
 
+    if (mmu_idx == MMU_PHYS_IDX) {
+        *physical = ultrasparc_truncate_physical(address);
+        *prot = PAGE_READ | PAGE_WRITE | PAGE_EXEC;
+        return 0;
+    }
+
     if (rw == 2) {
         return get_physical_address_code(env, physical, prot, address,
                                          mmu_idx);
diff --git a/target-sparc/translate.c b/target-sparc/translate.c
index e7691e4458..2205f89837 100644
--- a/target-sparc/translate.c
+++ b/target-sparc/translate.c
@@ -242,7 +242,29 @@ static void gen_op_store_QT0_fpr(unsigned int dst)
                    offsetof(CPU_QuadU, ll.lower));
 }
 
+static void gen_store_fpr_Q(DisasContext *dc, unsigned int dst,
+                            TCGv_i64 v1, TCGv_i64 v2)
+{
+    dst = QFPREG(dst);
+
+    tcg_gen_mov_i64(cpu_fpr[dst / 2], v1);
+    tcg_gen_mov_i64(cpu_fpr[dst / 2 + 1], v2);
+    gen_update_fprs_dirty(dc, dst);
+}
+
 #ifdef TARGET_SPARC64
+static TCGv_i64 gen_load_fpr_Q0(DisasContext *dc, unsigned int src)
+{
+    src = QFPREG(src);
+    return cpu_fpr[src / 2];
+}
+
+static TCGv_i64 gen_load_fpr_Q1(DisasContext *dc, unsigned int src)
+{
+    src = QFPREG(src);
+    return cpu_fpr[src / 2 + 1];
+}
+
 static void gen_move_Q(DisasContext *dc, unsigned int rd, unsigned int rs)
 {
     rd = QFPREG(rd);
@@ -2001,6 +2023,21 @@ static inline void gen_ne_fop_QD(DisasContext *dc, int rd, int rs,
     gen_update_fprs_dirty(dc, QFPREG(rd));
 }
 
+static void gen_swap(DisasContext *dc, TCGv dst, TCGv src,
+                     TCGv addr, int mmu_idx, TCGMemOp memop)
+{
+    gen_address_mask(dc, addr);
+    tcg_gen_atomic_xchg_tl(dst, addr, src, mmu_idx, memop);
+}
+
+static void gen_ldstub(DisasContext *dc, TCGv dst, TCGv addr, int mmu_idx)
+{
+    TCGv m1 = tcg_const_tl(0xff);
+    gen_address_mask(dc, addr);
+    tcg_gen_atomic_xchg_tl(dst, addr, m1, mmu_idx, MO_UB);
+    tcg_temp_free(m1);
+}
+
 /* asi moves */
 #if !defined(CONFIG_USER_ONLY) || defined(TARGET_SPARC64)
 typedef enum {
@@ -2010,6 +2047,8 @@ typedef enum {
     GET_ASI_DTWINX,
     GET_ASI_BLOCK,
     GET_ASI_SHORT,
+    GET_ASI_BCOPY,
+    GET_ASI_BFILL,
 } ASIType;
 
 typedef struct {
@@ -2046,6 +2085,19 @@ static DisasASI get_asi(DisasContext *dc, int insn, TCGMemOp memop)
             mem_idx = MMU_KERNEL_IDX;
             type = GET_ASI_DIRECT;
             break;
+        case ASI_M_BYPASS:    /* MMU passthrough */
+        case ASI_LEON_BYPASS: /* LEON MMU passthrough */
+            mem_idx = MMU_PHYS_IDX;
+            type = GET_ASI_DIRECT;
+            break;
+        case ASI_M_BCOPY: /* Block copy, sta access */
+            mem_idx = MMU_KERNEL_IDX;
+            type = GET_ASI_BCOPY;
+            break;
+        case ASI_M_BFILL: /* Block fill, stda access */
+            mem_idx = MMU_KERNEL_IDX;
+            type = GET_ASI_BFILL;
+            break;
         }
     } else {
         gen_exception(dc, TT_PRIV_INSN);
@@ -2066,10 +2118,22 @@ static DisasASI get_asi(DisasContext *dc, int insn, TCGMemOp memop)
         type = GET_ASI_EXCP;
     } else {
         switch (asi) {
+        case ASI_REAL:      /* Bypass */
+        case ASI_REAL_IO:   /* Bypass, non-cacheable */
+        case ASI_REAL_L:    /* Bypass LE */
+        case ASI_REAL_IO_L: /* Bypass, non-cacheable LE */
+        case ASI_TWINX_REAL:   /* Real address, twinx */
+        case ASI_TWINX_REAL_L: /* Real address, twinx, LE */
+        case ASI_QUAD_LDD_PHYS:
+        case ASI_QUAD_LDD_PHYS_L:
+            mem_idx = MMU_PHYS_IDX;
+            break;
         case ASI_N:  /* Nucleus */
         case ASI_NL: /* Nucleus LE */
         case ASI_TWINX_N:
         case ASI_TWINX_NL:
+        case ASI_NUCLEUS_QUAD_LDD:
+        case ASI_NUCLEUS_QUAD_LDD_L:
             mem_idx = MMU_NUCLEUS_IDX;
             break;
         case ASI_AIUP:  /* As if user primary */
@@ -2123,6 +2187,10 @@ static DisasASI get_asi(DisasContext *dc, int insn, TCGMemOp memop)
             break;
         }
         switch (asi) {
+        case ASI_REAL:
+        case ASI_REAL_IO:
+        case ASI_REAL_L:
+        case ASI_REAL_IO_L:
         case ASI_N:
         case ASI_NL:
         case ASI_AIUP:
@@ -2135,6 +2203,8 @@ static DisasASI get_asi(DisasContext *dc, int insn, TCGMemOp memop)
         case ASI_PL:
             type = GET_ASI_DIRECT;
             break;
+        case ASI_TWINX_REAL:
+        case ASI_TWINX_REAL_L:
         case ASI_TWINX_N:
         case ASI_TWINX_NL:
         case ASI_TWINX_AIUP:
@@ -2145,6 +2215,10 @@ static DisasASI get_asi(DisasContext *dc, int insn, TCGMemOp memop)
         case ASI_TWINX_PL:
         case ASI_TWINX_S:
         case ASI_TWINX_SL:
+        case ASI_QUAD_LDD_PHYS:
+        case ASI_QUAD_LDD_PHYS_L:
+        case ASI_NUCLEUS_QUAD_LDD:
+        case ASI_NUCLEUS_QUAD_LDD_L:
             type = GET_ASI_DTWINX;
             break;
         case ASI_BLK_COMMIT_P:
@@ -2241,6 +2315,38 @@ static void gen_st_asi(DisasContext *dc, TCGv src, TCGv addr,
         gen_address_mask(dc, addr);
         tcg_gen_qemu_st_tl(src, addr, da.mem_idx, da.memop);
         break;
+#if !defined(TARGET_SPARC64) && !defined(CONFIG_USER_ONLY)
+    case GET_ASI_BCOPY:
+        /* Copy 32 bytes from the address in SRC to ADDR.  */
+        /* ??? The original qemu code suggests 4-byte alignment, dropping
+           the low bits, but the only place I can see this used is in the
+           Linux kernel with 32 byte alignment, which would make more sense
+           as a cacheline-style operation.  */
+        {
+            TCGv saddr = tcg_temp_new();
+            TCGv daddr = tcg_temp_new();
+            TCGv four = tcg_const_tl(4);
+            TCGv_i32 tmp = tcg_temp_new_i32();
+            int i;
+
+            tcg_gen_andi_tl(saddr, src, -4);
+            tcg_gen_andi_tl(daddr, addr, -4);
+            for (i = 0; i < 32; i += 4) {
+                /* Since the loads and stores are paired, allow the
+                   copy to happen in the host endianness.  */
+                tcg_gen_qemu_ld_i32(tmp, saddr, da.mem_idx, MO_UL);
+                tcg_gen_qemu_st_i32(tmp, daddr, da.mem_idx, MO_UL);
+                tcg_gen_add_tl(saddr, saddr, four);
+                tcg_gen_add_tl(daddr, daddr, four);
+            }
+
+            tcg_temp_free(saddr);
+            tcg_temp_free(daddr);
+            tcg_temp_free(four);
+            tcg_temp_free_i32(tmp);
+        }
+        break;
+#endif
     default:
         {
             TCGv_i32 r_asi = tcg_const_i32(da.asi);
@@ -2275,48 +2381,37 @@ static void gen_swap_asi(DisasContext *dc, TCGv dst, TCGv src,
     switch (da.type) {
     case GET_ASI_EXCP:
         break;
+    case GET_ASI_DIRECT:
+        gen_swap(dc, dst, src, addr, da.mem_idx, da.memop);
+        break;
     default:
-        {
-            TCGv_i32 r_asi = tcg_const_i32(da.asi);
-            TCGv_i32 r_mop = tcg_const_i32(MO_UL);
-            TCGv_i64 s64, t64;
-
-            save_state(dc);
-            t64 = tcg_temp_new_i64();
-            gen_helper_ld_asi(t64, cpu_env, addr, r_asi, r_mop);
-
-            s64 = tcg_temp_new_i64();
-            tcg_gen_extu_tl_i64(s64, src);
-            gen_helper_st_asi(cpu_env, addr, s64, r_asi, r_mop);
-            tcg_temp_free_i64(s64);
-            tcg_temp_free_i32(r_mop);
-            tcg_temp_free_i32(r_asi);
-
-            tcg_gen_trunc_i64_tl(dst, t64);
-            tcg_temp_free_i64(t64);
-        }
+        /* ??? Should be DAE_invalid_asi.  */
+        gen_exception(dc, TT_DATA_ACCESS);
         break;
     }
 }
 
-static void gen_cas_asi(DisasContext *dc, TCGv addr, TCGv val2,
+static void gen_cas_asi(DisasContext *dc, TCGv addr, TCGv cmpv,
                         int insn, int rd)
 {
     DisasASI da = get_asi(dc, insn, MO_TEUL);
-    TCGv val1, dst;
-    TCGv_i32 r_asi;
+    TCGv oldv;
 
-    if (da.type == GET_ASI_EXCP) {
+    switch (da.type) {
+    case GET_ASI_EXCP:
         return;
+    case GET_ASI_DIRECT:
+        oldv = tcg_temp_new();
+        tcg_gen_atomic_cmpxchg_tl(oldv, addr, cmpv, gen_load_gpr(dc, rd),
+                                  da.mem_idx, da.memop);
+        gen_store_gpr(dc, rd, oldv);
+        tcg_temp_free(oldv);
+        break;
+    default:
+        /* ??? Should be DAE_invalid_asi.  */
+        gen_exception(dc, TT_DATA_ACCESS);
+        break;
     }
-
-    save_state(dc);
-    val1 = gen_load_gpr(dc, rd);
-    dst = gen_dest_gpr(dc, rd);
-    r_asi = tcg_const_i32(da.asi);
-    gen_helper_cas_asi(dst, cpu_env, addr, val1, val2, r_asi);
-    tcg_temp_free_i32(r_asi);
-    gen_store_gpr(dc, rd, dst);
 }
 
 static void gen_ldstub_asi(DisasContext *dc, TCGv dst, TCGv addr, int insn)
@@ -2326,25 +2421,12 @@ static void gen_ldstub_asi(DisasContext *dc, TCGv dst, TCGv addr, int insn)
     switch (da.type) {
     case GET_ASI_EXCP:
         break;
+    case GET_ASI_DIRECT:
+        gen_ldstub(dc, dst, addr, da.mem_idx);
+        break;
     default:
-        {
-            TCGv_i32 r_asi = tcg_const_i32(da.asi);
-            TCGv_i32 r_mop = tcg_const_i32(MO_UB);
-            TCGv_i64 s64, t64;
-
-            save_state(dc);
-            t64 = tcg_temp_new_i64();
-            gen_helper_ld_asi(t64, cpu_env, addr, r_asi, r_mop);
-
-            s64 = tcg_const_i64(0xff);
-            gen_helper_st_asi(cpu_env, addr, s64, r_asi, r_mop);
-            tcg_temp_free_i64(s64);
-            tcg_temp_free_i32(r_mop);
-            tcg_temp_free_i32(r_asi);
-
-            tcg_gen_trunc_i64_tl(dst, t64);
-            tcg_temp_free_i64(t64);
-        }
+        /* ??? Should be DAE_invalid_asi.  */
+        gen_exception(dc, TT_DATA_ACCESS);
         break;
     }
 }
@@ -2356,6 +2438,7 @@ static void gen_ldf_asi(DisasContext *dc, TCGv addr,
 {
     DisasASI da = get_asi(dc, insn, (size == 4 ? MO_TEUL : MO_TEQ));
     TCGv_i32 d32;
+    TCGv_i64 d64;
 
     switch (da.type) {
     case GET_ASI_EXCP:
@@ -2370,12 +2453,17 @@ static void gen_ldf_asi(DisasContext *dc, TCGv addr,
             gen_store_fpr_F(dc, rd, d32);
             break;
         case 8:
-            tcg_gen_qemu_ld_i64(cpu_fpr[rd / 2], addr, da.mem_idx, da.memop);
+            tcg_gen_qemu_ld_i64(cpu_fpr[rd / 2], addr, da.mem_idx,
+                                da.memop | MO_ALIGN_4);
             break;
         case 16:
-            tcg_gen_qemu_ld_i64(cpu_fpr[rd / 2], addr, da.mem_idx, da.memop);
+            d64 = tcg_temp_new_i64();
+            tcg_gen_qemu_ld_i64(d64, addr, da.mem_idx, da.memop | MO_ALIGN_4);
             tcg_gen_addi_tl(addr, addr, 8);
-            tcg_gen_qemu_ld_i64(cpu_fpr[rd/2+1], addr, da.mem_idx, da.memop);
+            tcg_gen_qemu_ld_i64(cpu_fpr[rd/2+1], addr, da.mem_idx,
+                                da.memop | MO_ALIGN_4);
+            tcg_gen_mov_i64(cpu_fpr[rd / 2], d64);
+            tcg_temp_free_i64(d64);
             break;
         default:
             g_assert_not_reached();
@@ -2385,20 +2473,23 @@ static void gen_ldf_asi(DisasContext *dc, TCGv addr,
     case GET_ASI_BLOCK:
         /* Valid for lddfa on aligned registers only.  */
         if (size == 8 && (rd & 7) == 0) {
+            TCGMemOp memop;
             TCGv eight;
             int i;
 
-            gen_check_align(addr, 0x3f);
             gen_address_mask(dc, addr);
 
+            /* The first operation checks required alignment.  */
+            memop = da.memop | MO_ALIGN_64;
             eight = tcg_const_tl(8);
             for (i = 0; ; ++i) {
                 tcg_gen_qemu_ld_i64(cpu_fpr[rd / 2 + i], addr,
-                                    da.mem_idx, da.memop);
+                                    da.mem_idx, memop);
                 if (i == 7) {
                     break;
                 }
                 tcg_gen_add_tl(addr, addr, eight);
+                memop = da.memop;
             }
             tcg_temp_free(eight);
         } else {
@@ -2428,22 +2519,23 @@ static void gen_ldf_asi(DisasContext *dc, TCGv addr,
                but we can just use the integer asi helper for them.  */
             switch (size) {
             case 4:
-                {
-                    TCGv d64 = tcg_temp_new_i64();
-                    gen_helper_ld_asi(d64, cpu_env, addr, r_asi, r_mop);
-                    d32 = gen_dest_fpr_F(dc);
-                    tcg_gen_extrl_i64_i32(d32, d64);
-                    tcg_temp_free_i64(d64);
-                    gen_store_fpr_F(dc, rd, d32);
-                }
+                d64 = tcg_temp_new_i64();
+                gen_helper_ld_asi(d64, cpu_env, addr, r_asi, r_mop);
+                d32 = gen_dest_fpr_F(dc);
+                tcg_gen_extrl_i64_i32(d32, d64);
+                tcg_temp_free_i64(d64);
+                gen_store_fpr_F(dc, rd, d32);
                 break;
             case 8:
                 gen_helper_ld_asi(cpu_fpr[rd / 2], cpu_env, addr, r_asi, r_mop);
                 break;
             case 16:
-                gen_helper_ld_asi(cpu_fpr[rd / 2], cpu_env, addr, r_asi, r_mop);
+                d64 = tcg_temp_new_i64();
+                gen_helper_ld_asi(d64, cpu_env, addr, r_asi, r_mop);
                 tcg_gen_addi_tl(addr, addr, 8);
                 gen_helper_ld_asi(cpu_fpr[rd/2+1], cpu_env, addr, r_asi, r_mop);
+                tcg_gen_mov_i64(cpu_fpr[rd / 2], d64);
+                tcg_temp_free_i64(d64);
                 break;
             default:
                 g_assert_not_reached();
@@ -2473,10 +2565,17 @@ static void gen_stf_asi(DisasContext *dc, TCGv addr,
             tcg_gen_qemu_st_i32(d32, addr, da.mem_idx, da.memop);
             break;
         case 8:
-            tcg_gen_qemu_st_i64(cpu_fpr[rd / 2], addr, da.mem_idx, da.memop);
+            tcg_gen_qemu_st_i64(cpu_fpr[rd / 2], addr, da.mem_idx,
+                                da.memop | MO_ALIGN_4);
             break;
         case 16:
-            tcg_gen_qemu_st_i64(cpu_fpr[rd / 2], addr, da.mem_idx, da.memop);
+            /* Only 4-byte alignment required.  However, it is legal for the
+               cpu to signal the alignment fault, and the OS trap handler is
+               required to fix it up.  Requiring 16-byte alignment here avoids
+               having to probe the second page before performing the first
+               write.  */
+            tcg_gen_qemu_st_i64(cpu_fpr[rd / 2], addr, da.mem_idx,
+                                da.memop | MO_ALIGN_16);
             tcg_gen_addi_tl(addr, addr, 8);
             tcg_gen_qemu_st_i64(cpu_fpr[rd/2+1], addr, da.mem_idx, da.memop);
             break;
@@ -2488,20 +2587,23 @@ static void gen_stf_asi(DisasContext *dc, TCGv addr,
     case GET_ASI_BLOCK:
         /* Valid for stdfa on aligned registers only.  */
         if (size == 8 && (rd & 7) == 0) {
+            TCGMemOp memop;
             TCGv eight;
             int i;
 
-            gen_check_align(addr, 0x3f);
             gen_address_mask(dc, addr);
 
+            /* The first operation checks required alignment.  */
+            memop = da.memop | MO_ALIGN_64;
             eight = tcg_const_tl(8);
             for (i = 0; ; ++i) {
                 tcg_gen_qemu_st_i64(cpu_fpr[rd / 2 + i], addr,
-                                    da.mem_idx, da.memop);
+                                    da.mem_idx, memop);
                 if (i == 7) {
                     break;
                 }
                 tcg_gen_add_tl(addr, addr, eight);
+                memop = da.memop;
             }
             tcg_temp_free(eight);
         } else {
@@ -2539,9 +2641,8 @@ static void gen_ldda_asi(DisasContext *dc, TCGv addr, int insn, int rd)
         return;
 
     case GET_ASI_DTWINX:
-        gen_check_align(addr, 15);
         gen_address_mask(dc, addr);
-        tcg_gen_qemu_ld_i64(hi, addr, da.mem_idx, da.memop);
+        tcg_gen_qemu_ld_i64(hi, addr, da.mem_idx, da.memop | MO_ALIGN_16);
         tcg_gen_addi_tl(addr, addr, 8);
         tcg_gen_qemu_ld_i64(lo, addr, da.mem_idx, da.memop);
         break;
@@ -2566,15 +2667,27 @@ static void gen_ldda_asi(DisasContext *dc, TCGv addr, int insn, int rd)
         break;
 
     default:
+        /* ??? In theory we've handled all of the ASIs that are valid
+           for ldda, and this should raise DAE_invalid_asi.  However,
+           real hardware allows others.  This can be seen with e.g.
+           FreeBSD 10.3 wrt ASI_IC_TAG.  */
         {
             TCGv_i32 r_asi = tcg_const_i32(da.asi);
+            TCGv_i32 r_mop = tcg_const_i32(da.memop);
+            TCGv_i64 tmp = tcg_temp_new_i64();
 
             save_state(dc);
-            gen_helper_ldda_asi(cpu_env, addr, r_asi);
+            gen_helper_ld_asi(tmp, cpu_env, addr, r_asi, r_mop);
             tcg_temp_free_i32(r_asi);
+            tcg_temp_free_i32(r_mop);
 
-            tcg_gen_ld_i64(hi, cpu_env, offsetof(CPUSPARCState, qt0.high));
-            tcg_gen_ld_i64(lo, cpu_env, offsetof(CPUSPARCState, qt0.low));
+            /* See above.  */
+            if ((da.memop & MO_BSWAP) == MO_TE) {
+                tcg_gen_extr32_i64(lo, hi, tmp);
+            } else {
+                tcg_gen_extr32_i64(hi, lo, tmp);
+            }
+            tcg_temp_free_i64(tmp);
         }
         break;
     }
@@ -2594,9 +2707,8 @@ static void gen_stda_asi(DisasContext *dc, TCGv hi, TCGv addr,
         break;
 
     case GET_ASI_DTWINX:
-        gen_check_align(addr, 15);
         gen_address_mask(dc, addr);
-        tcg_gen_qemu_st_i64(hi, addr, da.mem_idx, da.memop);
+        tcg_gen_qemu_st_i64(hi, addr, da.mem_idx, da.memop | MO_ALIGN_16);
         tcg_gen_addi_tl(addr, addr, 8);
         tcg_gen_qemu_st_i64(lo, addr, da.mem_idx, da.memop);
         break;
@@ -2620,15 +2732,21 @@ static void gen_stda_asi(DisasContext *dc, TCGv hi, TCGv addr,
         break;
 
     default:
+        /* ??? In theory we've handled all of the ASIs that are valid
+           for stda, and this should raise DAE_invalid_asi.  */
         {
             TCGv_i32 r_asi = tcg_const_i32(da.asi);
-            TCGv_i32 r_mop = tcg_const_i32(MO_Q);
-            TCGv_i64 t64;
+            TCGv_i32 r_mop = tcg_const_i32(da.memop);
+            TCGv_i64 t64 = tcg_temp_new_i64();
 
-            save_state(dc);
+            /* See above.  */
+            if ((da.memop & MO_BSWAP) == MO_TE) {
+                tcg_gen_concat32_i64(t64, lo, hi);
+            } else {
+                tcg_gen_concat32_i64(t64, hi, lo);
+            }
 
-            t64 = tcg_temp_new_i64();
-            tcg_gen_concat_tl_i64(t64, lo, hi);
+            save_state(dc);
             gen_helper_st_asi(cpu_env, addr, t64, r_asi, r_mop);
             tcg_temp_free_i32(r_mop);
             tcg_temp_free_i32(r_asi);
@@ -2638,23 +2756,27 @@ static void gen_stda_asi(DisasContext *dc, TCGv hi, TCGv addr,
     }
 }
 
-static void gen_casx_asi(DisasContext *dc, TCGv addr, TCGv val2,
+static void gen_casx_asi(DisasContext *dc, TCGv addr, TCGv cmpv,
                          int insn, int rd)
 {
     DisasASI da = get_asi(dc, insn, MO_TEQ);
-    TCGv val1 = gen_load_gpr(dc, rd);
-    TCGv dst = gen_dest_gpr(dc, rd);
-    TCGv_i32 r_asi;
+    TCGv oldv;
 
-    if (da.type == GET_ASI_EXCP) {
+    switch (da.type) {
+    case GET_ASI_EXCP:
         return;
+    case GET_ASI_DIRECT:
+        oldv = tcg_temp_new();
+        tcg_gen_atomic_cmpxchg_tl(oldv, addr, cmpv, gen_load_gpr(dc, rd),
+                                  da.mem_idx, da.memop);
+        gen_store_gpr(dc, rd, oldv);
+        tcg_temp_free(oldv);
+        break;
+    default:
+        /* ??? Should be DAE_invalid_asi.  */
+        gen_exception(dc, TT_DATA_ACCESS);
+        break;
     }
-
-    save_state(dc);
-    r_asi = tcg_const_i32(da.asi);
-    gen_helper_casx_asi(dst, cpu_env, addr, val1, val2, r_asi);
-    tcg_temp_free_i32(r_asi);
-    gen_store_gpr(dc, rd, dst);
 }
 
 #elif !defined(CONFIG_USER_ONLY)
@@ -2712,6 +2834,27 @@ static void gen_stda_asi(DisasContext *dc, TCGv hi, TCGv addr,
         gen_address_mask(dc, addr);
         tcg_gen_qemu_st_i64(t64, addr, da.mem_idx, da.memop);
         break;
+    case GET_ASI_BFILL:
+        /* Store 32 bytes of T64 to ADDR.  */
+        /* ??? The original qemu code suggests 8-byte alignment, dropping
+           the low bits, but the only place I can see this used is in the
+           Linux kernel with 32 byte alignment, which would make more sense
+           as a cacheline-style operation.  */
+        {
+            TCGv d_addr = tcg_temp_new();
+            TCGv eight = tcg_const_tl(8);
+            int i;
+
+            tcg_gen_andi_tl(d_addr, addr, -8);
+            for (i = 0; i < 32; i += 8) {
+                tcg_gen_qemu_st_i64(t64, d_addr, da.mem_idx, da.memop);
+                tcg_gen_add_tl(d_addr, d_addr, eight);
+            }
+
+            tcg_temp_free(d_addr);
+            tcg_temp_free(eight);
+        }
+        break;
     default:
         {
             TCGv_i32 r_asi = tcg_const_i32(da.asi);
@@ -3454,7 +3597,6 @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                 break;
             } else if (xop == 0x2b) { /* rdtbr / V9 flushw */
 #ifdef TARGET_SPARC64
-                save_state(dc);
                 gen_helper_flushw(cpu_env);
 #else
                 if (!supervisor(dc))
@@ -5058,12 +5200,10 @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                     /* nop */
                     break;
                 case 0x3c:      /* save */
-                    save_state(dc);
                     gen_helper_save(cpu_env);
                     gen_store_gpr(dc, rd, cpu_tmp0);
                     break;
                 case 0x3d:      /* restore */
-                    save_state(dc);
                     gen_helper_restore(cpu_env);
                     gen_store_gpr(dc, rd, cpu_tmp0);
                     break;
@@ -5163,31 +5303,15 @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                     gen_address_mask(dc, cpu_addr);
                     tcg_gen_qemu_ld16s(cpu_val, cpu_addr, dc->mem_idx);
                     break;
-                case 0xd:       /* ldstub -- XXX: should be atomically */
-                    {
-                        TCGv r_const;
-                        TCGv tmp = tcg_temp_new();
-
-                        gen_address_mask(dc, cpu_addr);
-                        tcg_gen_qemu_ld8u(tmp, cpu_addr, dc->mem_idx);
-                        r_const = tcg_const_tl(0xff);
-                        tcg_gen_qemu_st8(r_const, cpu_addr, dc->mem_idx);
-                        tcg_gen_mov_tl(cpu_val, tmp);
-                        tcg_temp_free(r_const);
-                        tcg_temp_free(tmp);
-                    }
+                case 0xd:       /* ldstub */
+                    gen_ldstub(dc, cpu_val, cpu_addr, dc->mem_idx);
                     break;
                 case 0x0f:
                     /* swap, swap register with memory. Also atomically */
-                    {
-                        TCGv t0 = get_temp_tl(dc);
-                        CHECK_IU_FEATURE(dc, SWAP);
-                        cpu_src1 = gen_load_gpr(dc, rd);
-                        gen_address_mask(dc, cpu_addr);
-                        tcg_gen_qemu_ld32u(t0, cpu_addr, dc->mem_idx);
-                        tcg_gen_qemu_st32(cpu_src1, cpu_addr, dc->mem_idx);
-                        tcg_gen_mov_tl(cpu_val, t0);
-                    }
+                    CHECK_IU_FEATURE(dc, SWAP);
+                    cpu_src1 = gen_load_gpr(dc, rd);
+                    gen_swap(dc, cpu_val, cpu_src1, cpu_addr,
+                             dc->mem_idx, MO_TEUL);
                     break;
 #if !defined(CONFIG_USER_ONLY) || defined(TARGET_SPARC64)
                 case 0x10:      /* lda, V9 lduwa, load word alternate */
@@ -5278,18 +5402,15 @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
             skip_move: ;
 #endif
             } else if (xop >= 0x20 && xop < 0x24) {
-                TCGv t0;
-
                 if (gen_trap_ifnofpu(dc)) {
                     goto jmp_insn;
                 }
                 switch (xop) {
                 case 0x20:      /* ldf, load fpreg */
                     gen_address_mask(dc, cpu_addr);
-                    t0 = get_temp_tl(dc);
-                    tcg_gen_qemu_ld32u(t0, cpu_addr, dc->mem_idx);
                     cpu_dst_32 = gen_dest_fpr_F(dc);
-                    tcg_gen_trunc_tl_i32(cpu_dst_32, t0);
+                    tcg_gen_qemu_ld_i32(cpu_dst_32, cpu_addr,
+                                        dc->mem_idx, MO_TEUL);
                     gen_store_fpr_F(dc, rd, cpu_dst_32);
                     break;
                 case 0x21:      /* ldfsr, V9 ldxfsr */
@@ -5297,35 +5418,37 @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                     gen_address_mask(dc, cpu_addr);
                     if (rd == 1) {
                         TCGv_i64 t64 = tcg_temp_new_i64();
-                        tcg_gen_qemu_ld64(t64, cpu_addr, dc->mem_idx);
+                        tcg_gen_qemu_ld_i64(t64, cpu_addr,
+                                            dc->mem_idx, MO_TEQ);
                         gen_helper_ldxfsr(cpu_fsr, cpu_env, cpu_fsr, t64);
                         tcg_temp_free_i64(t64);
                         break;
                     }
 #endif
                     cpu_dst_32 = get_temp_i32(dc);
-                    t0 = get_temp_tl(dc);
-                    tcg_gen_qemu_ld32u(t0, cpu_addr, dc->mem_idx);
-                    tcg_gen_trunc_tl_i32(cpu_dst_32, t0);
+                    tcg_gen_qemu_ld_i32(cpu_dst_32, cpu_addr,
+                                        dc->mem_idx, MO_TEUL);
                     gen_helper_ldfsr(cpu_fsr, cpu_env, cpu_fsr, cpu_dst_32);
                     break;
                 case 0x22:      /* ldqf, load quad fpreg */
-                    {
-                        TCGv_i32 r_const;
-
-                        CHECK_FPU_FEATURE(dc, FLOAT128);
-                        r_const = tcg_const_i32(dc->mem_idx);
-                        gen_address_mask(dc, cpu_addr);
-                        gen_helper_ldqf(cpu_env, cpu_addr, r_const);
-                        tcg_temp_free_i32(r_const);
-                        gen_op_store_QT0_fpr(QFPREG(rd));
-                        gen_update_fprs_dirty(dc, QFPREG(rd));
-                    }
+                    CHECK_FPU_FEATURE(dc, FLOAT128);
+                    gen_address_mask(dc, cpu_addr);
+                    cpu_src1_64 = tcg_temp_new_i64();
+                    tcg_gen_qemu_ld_i64(cpu_src1_64, cpu_addr, dc->mem_idx,
+                                        MO_TEQ | MO_ALIGN_4);
+                    tcg_gen_addi_tl(cpu_addr, cpu_addr, 8);
+                    cpu_src2_64 = tcg_temp_new_i64();
+                    tcg_gen_qemu_ld_i64(cpu_src2_64, cpu_addr, dc->mem_idx,
+                                        MO_TEQ | MO_ALIGN_4);
+                    gen_store_fpr_Q(dc, rd, cpu_src1_64, cpu_src2_64);
+                    tcg_temp_free_i64(cpu_src1_64);
+                    tcg_temp_free_i64(cpu_src2_64);
                     break;
                 case 0x23:      /* lddf, load double fpreg */
                     gen_address_mask(dc, cpu_addr);
                     cpu_dst_64 = gen_dest_fpr_D(dc, rd);
-                    tcg_gen_qemu_ld64(cpu_dst_64, cpu_addr, dc->mem_idx);
+                    tcg_gen_qemu_ld_i64(cpu_dst_64, cpu_addr, dc->mem_idx,
+                                        MO_TEQ | MO_ALIGN_4);
                     gen_store_fpr_D(dc, rd, cpu_dst_64);
                     break;
                 default:
@@ -5398,13 +5521,10 @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                 }
                 switch (xop) {
                 case 0x24: /* stf, store fpreg */
-                    {
-                        TCGv t = get_temp_tl(dc);
-                        gen_address_mask(dc, cpu_addr);
-                        cpu_src1_32 = gen_load_fpr_F(dc, rd);
-                        tcg_gen_ext_i32_tl(t, cpu_src1_32);
-                        tcg_gen_qemu_st32(t, cpu_addr, dc->mem_idx);
-                    }
+                    gen_address_mask(dc, cpu_addr);
+                    cpu_src1_32 = gen_load_fpr_F(dc, rd);
+                    tcg_gen_qemu_st_i32(cpu_src1_32, cpu_addr,
+                                        dc->mem_idx, MO_TEUL);
                     break;
                 case 0x25: /* stfsr, V9 stxfsr */
                     {
@@ -5421,16 +5541,20 @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                 case 0x26:
 #ifdef TARGET_SPARC64
                     /* V9 stqf, store quad fpreg */
-                    {
-                        TCGv_i32 r_const;
-
-                        CHECK_FPU_FEATURE(dc, FLOAT128);
-                        gen_op_load_fpr_QT0(QFPREG(rd));
-                        r_const = tcg_const_i32(dc->mem_idx);
-                        gen_address_mask(dc, cpu_addr);
-                        gen_helper_stqf(cpu_env, cpu_addr, r_const);
-                        tcg_temp_free_i32(r_const);
-                    }
+                    CHECK_FPU_FEATURE(dc, FLOAT128);
+                    gen_address_mask(dc, cpu_addr);
+                    /* ??? While stqf only requires 4-byte alignment, it is
+                       legal for the cpu to signal the unaligned exception.
+                       The OS trap handler is then required to fix it up.
+                       For qemu, this avoids having to probe the second page
+                       before performing the first write.  */
+                    cpu_src1_64 = gen_load_fpr_Q0(dc, rd);
+                    tcg_gen_qemu_st_i64(cpu_src1_64, cpu_addr,
+                                        dc->mem_idx, MO_TEQ | MO_ALIGN_16);
+                    tcg_gen_addi_tl(cpu_addr, cpu_addr, 8);
+                    cpu_src2_64 = gen_load_fpr_Q1(dc, rd);
+                    tcg_gen_qemu_st_i64(cpu_src1_64, cpu_addr,
+                                        dc->mem_idx, MO_TEQ);
                     break;
 #else /* !TARGET_SPARC64 */
                     /* stdfq, store floating point queue */
@@ -5448,7 +5572,8 @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                 case 0x27: /* stdf, store double fpreg */
                     gen_address_mask(dc, cpu_addr);
                     cpu_src1_64 = gen_load_fpr_D(dc, rd);
-                    tcg_gen_qemu_st64(cpu_src1_64, cpu_addr, dc->mem_idx);
+                    tcg_gen_qemu_st_i64(cpu_src1_64, cpu_addr, dc->mem_idx,
+                                        MO_TEQ | MO_ALIGN_4);
                     break;
                 default:
                     goto illegal_insn;
@@ -5468,7 +5593,6 @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                         if (gen_trap_ifnofpu(dc)) {
                             goto jmp_insn;
                         }
-                        gen_check_align(cpu_addr, 7);
                         gen_stf_asi(dc, cpu_addr, insn, 16, QFPREG(rd));
                     }
                     break;
@@ -5672,10 +5796,12 @@ void gen_intermediate_code(CPUSPARCState * env, TranslationBlock * tb)
 #ifdef DEBUG_DISAS
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)
         && qemu_log_in_addr_range(pc_start)) {
+        qemu_log_lock();
         qemu_log("--------------\n");
         qemu_log("IN: %s\n", lookup_symbol(pc_start));
         log_target_disas(cs, pc_start, last_pc + 4 - pc_start, 0);
         qemu_log("\n");
+        qemu_log_unlock();
     }
 #endif
 }
diff --git a/target-sparc/win_helper.c b/target-sparc/win_helper.c
index a8a6c0cfc4..2d5b5469a9 100644
--- a/target-sparc/win_helper.c
+++ b/target-sparc/win_helper.c
@@ -19,6 +19,7 @@
 
 #include "qemu/osdep.h"
 #include "cpu.h"
+#include "exec/exec-all.h"
 #include "exec/helper-proto.h"
 #include "trace.h"
 
@@ -111,13 +112,13 @@ void helper_rett(CPUSPARCState *env)
     unsigned int cwp;
 
     if (env->psret == 1) {
-        helper_raise_exception(env, TT_ILL_INSN);
+        cpu_raise_exception_ra(env, TT_ILL_INSN, GETPC());
     }
 
     env->psret = 1;
     cwp = cpu_cwp_inc(env, env->cwp + 1) ;
     if (env->wim & (1 << cwp)) {
-        helper_raise_exception(env, TT_WIN_UNF);
+        cpu_raise_exception_ra(env, TT_WIN_UNF, GETPC());
     }
     cpu_set_cwp(env, cwp);
     env->psrs = env->psrps;
@@ -131,7 +132,7 @@ void helper_save(CPUSPARCState *env)
 
     cwp = cpu_cwp_dec(env, env->cwp - 1);
     if (env->wim & (1 << cwp)) {
-        helper_raise_exception(env, TT_WIN_OVF);
+        cpu_raise_exception_ra(env, TT_WIN_OVF, GETPC());
     }
     cpu_set_cwp(env, cwp);
 }
@@ -142,7 +143,7 @@ void helper_restore(CPUSPARCState *env)
 
     cwp = cpu_cwp_inc(env, env->cwp + 1);
     if (env->wim & (1 << cwp)) {
-        helper_raise_exception(env, TT_WIN_UNF);
+        cpu_raise_exception_ra(env, TT_WIN_UNF, GETPC());
     }
     cpu_set_cwp(env, cwp);
 }
@@ -150,7 +151,7 @@ void helper_restore(CPUSPARCState *env)
 void helper_wrpsr(CPUSPARCState *env, target_ulong new_psr)
 {
     if ((new_psr & PSR_CWP) >= env->nwindows) {
-        helper_raise_exception(env, TT_ILL_INSN);
+        cpu_raise_exception_ra(env, TT_ILL_INSN, GETPC());
     } else {
         cpu_put_psr(env, new_psr);
     }
@@ -170,14 +171,14 @@ void helper_save(CPUSPARCState *env)
 
     cwp = cpu_cwp_dec(env, env->cwp - 1);
     if (env->cansave == 0) {
-        helper_raise_exception(env, TT_SPILL | (env->otherwin != 0 ?
-                                                (TT_WOTHER |
-                                                 ((env->wstate & 0x38) >> 1)) :
-                                                ((env->wstate & 0x7) << 2)));
+        int tt = TT_SPILL | (env->otherwin != 0
+                             ? (TT_WOTHER | ((env->wstate & 0x38) >> 1))
+                             : ((env->wstate & 0x7) << 2));
+        cpu_raise_exception_ra(env, tt, GETPC());
     } else {
         if (env->cleanwin - env->canrestore == 0) {
             /* XXX Clean windows without trap */
-            helper_raise_exception(env, TT_CLRWIN);
+            cpu_raise_exception_ra(env, TT_CLRWIN, GETPC());
         } else {
             env->cansave--;
             env->canrestore++;
@@ -192,10 +193,10 @@ void helper_restore(CPUSPARCState *env)
 
     cwp = cpu_cwp_inc(env, env->cwp + 1);
     if (env->canrestore == 0) {
-        helper_raise_exception(env, TT_FILL | (env->otherwin != 0 ?
-                                               (TT_WOTHER |
-                                                ((env->wstate & 0x38) >> 1)) :
-                                               ((env->wstate & 0x7) << 2)));
+        int tt = TT_FILL | (env->otherwin != 0
+                            ? (TT_WOTHER | ((env->wstate & 0x38) >> 1))
+                            : ((env->wstate & 0x7) << 2));
+        cpu_raise_exception_ra(env, tt, GETPC());
     } else {
         env->cansave++;
         env->canrestore--;
@@ -206,10 +207,10 @@ void helper_restore(CPUSPARCState *env)
 void helper_flushw(CPUSPARCState *env)
 {
     if (env->cansave != env->nwindows - 2) {
-        helper_raise_exception(env, TT_SPILL | (env->otherwin != 0 ?
-                                                (TT_WOTHER |
-                                                 ((env->wstate & 0x38) >> 1)) :
-                                                ((env->wstate & 0x7) << 2)));
+        int tt = TT_SPILL | (env->otherwin != 0
+                             ? (TT_WOTHER | ((env->wstate & 0x38) >> 1))
+                             : ((env->wstate & 0x7) << 2));
+        cpu_raise_exception_ra(env, tt, GETPC());
     }
 }
 
diff --git a/target-tilegx/translate.c b/target-tilegx/translate.c
index 11c9732389..9c734eeba3 100644
--- a/target-tilegx/translate.c
+++ b/target-tilegx/translate.c
@@ -2391,6 +2391,7 @@ void gen_intermediate_code(CPUTLGState *env, struct TranslationBlock *tb)
     TCGV_UNUSED_I64(dc->zero);
 
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)) {
+        qemu_log_lock();
         qemu_log("IN: %s\n", lookup_symbol(pc_start));
     }
     if (!max_insns) {
@@ -2429,7 +2430,10 @@ void gen_intermediate_code(CPUTLGState *env, struct TranslationBlock *tb)
     tb->size = dc->pc - pc_start;
     tb->icount = num_insns;
 
-    qemu_log_mask(CPU_LOG_TB_IN_ASM, "\n");
+    if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)) {
+        qemu_log("\n");
+        qemu_log_unlock();
+    }
 }
 
 void restore_state_to_opc(CPUTLGState *env, TranslationBlock *tb,
diff --git a/target-tricore/translate.c b/target-tricore/translate.c
index 9a50df9a88..36f734a662 100644
--- a/target-tricore/translate.c
+++ b/target-tricore/translate.c
@@ -8789,9 +8789,11 @@ void gen_intermediate_code(CPUTriCoreState *env, struct TranslationBlock *tb)
 #ifdef DEBUG_DISAS
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)
         && qemu_log_in_addr_range(pc_start)) {
+        qemu_log_lock();
         qemu_log("IN: %s\n", lookup_symbol(pc_start));
         log_target_disas(cs, pc_start, ctx.pc - pc_start, 0);
         qemu_log("\n");
+        qemu_log_unlock();
     }
 #endif
 }
diff --git a/target-unicore32/translate.c b/target-unicore32/translate.c
index 09354f92d2..514d460408 100644
--- a/target-unicore32/translate.c
+++ b/target-unicore32/translate.c
@@ -2024,10 +2024,12 @@ done_generating:
 #ifdef DEBUG_DISAS
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)
         && qemu_log_in_addr_range(pc_start)) {
+        qemu_log_lock();
         qemu_log("----------------\n");
         qemu_log("IN: %s\n", lookup_symbol(pc_start));
         log_target_disas(cs, pc_start, dc->pc - pc_start, 0);
         qemu_log("\n");
+        qemu_log_unlock();
     }
 #endif
     tb->size = dc->pc - pc_start;
diff --git a/target-xtensa/translate.c b/target-xtensa/translate.c
index 4c1e48748b..0858c296ea 100644
--- a/target-xtensa/translate.c
+++ b/target-xtensa/translate.c
@@ -36,7 +36,6 @@
 #include "tcg-op.h"
 #include "qemu/log.h"
 #include "sysemu/sysemu.h"
-#include "exec/exec-all.h"
 #include "exec/cpu_ldst.h"
 #include "exec/semihost.h"
 
@@ -3156,10 +3155,12 @@ void gen_intermediate_code(CPUXtensaState *env, TranslationBlock *tb)
 #ifdef DEBUG_DISAS
     if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)
         && qemu_log_in_addr_range(pc_start)) {
+        qemu_log_lock();
         qemu_log("----------------\n");
         qemu_log("IN: %s\n", lookup_symbol(pc_start));
         log_target_disas(cs, pc_start, dc.pc - pc_start, 0);
         qemu_log("\n");
+        qemu_log_unlock();
     }
 #endif
     tb->size = dc.pc - pc_start;
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index bb2bfeef3c..6e2fb3522f 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -678,6 +678,33 @@ void tcg_gen_muls2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2)
     }
 }
 
+void tcg_gen_mulsu2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    if (TCG_TARGET_REG_BITS == 32) {
+        TCGv_i32 t0 = tcg_temp_new_i32();
+        TCGv_i32 t1 = tcg_temp_new_i32();
+        TCGv_i32 t2 = tcg_temp_new_i32();
+        tcg_gen_mulu2_i32(t0, t1, arg1, arg2);
+        /* Adjust for negative input for the signed arg1.  */
+        tcg_gen_sari_i32(t2, arg1, 31);
+        tcg_gen_and_i32(t2, t2, arg2);
+        tcg_gen_sub_i32(rh, t1, t2);
+        tcg_gen_mov_i32(rl, t0);
+        tcg_temp_free_i32(t0);
+        tcg_temp_free_i32(t1);
+        tcg_temp_free_i32(t2);
+    } else {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        TCGv_i64 t1 = tcg_temp_new_i64();
+        tcg_gen_ext_i32_i64(t0, arg1);
+        tcg_gen_extu_i32_i64(t1, arg2);
+        tcg_gen_mul_i64(t0, t0, t1);
+        tcg_gen_extr_i64_i32(rl, rh, t0);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+    }
+}
+
 void tcg_gen_ext8s_i32(TCGv_i32 ret, TCGv_i32 arg)
 {
     if (TCG_TARGET_HAS_ext8s_i32) {
@@ -790,7 +817,7 @@ void tcg_gen_ld8u_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset)
 void tcg_gen_ld8s_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset)
 {
     tcg_gen_ld8s_i32(TCGV_LOW(ret), arg2, offset);
-    tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_HIGH(ret), 31);
+    tcg_gen_sari_i32(TCGV_HIGH(ret), TCGV_LOW(ret), 31);
 }
 
 void tcg_gen_ld16u_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset)
@@ -1748,6 +1775,22 @@ void tcg_gen_muls2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 arg1, TCGv_i64 arg2)
     }
 }
 
+void tcg_gen_mulsu2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    TCGv_i64 t0 = tcg_temp_new_i64();
+    TCGv_i64 t1 = tcg_temp_new_i64();
+    TCGv_i64 t2 = tcg_temp_new_i64();
+    tcg_gen_mulu2_i64(t0, t1, arg1, arg2);
+    /* Adjust for negative input for the signed arg1.  */
+    tcg_gen_sari_i64(t2, arg1, 63);
+    tcg_gen_and_i64(t2, t2, arg2);
+    tcg_gen_sub_i64(rh, t1, t2);
+    tcg_gen_mov_i64(rl, t0);
+    tcg_temp_free_i64(t0);
+    tcg_temp_free_i64(t1);
+    tcg_temp_free_i64(t2);
+}
+
 /* Size changing operations.  */
 
 void tcg_gen_extrl_i64_i32(TCGv_i32 ret, TCGv_i64 arg)
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index 89b59e867a..6d044b7c5b 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -306,6 +306,7 @@ void tcg_gen_sub2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 al,
                       TCGv_i32 ah, TCGv_i32 bl, TCGv_i32 bh);
 void tcg_gen_mulu2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2);
 void tcg_gen_muls2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_mulsu2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2);
 void tcg_gen_ext8s_i32(TCGv_i32 ret, TCGv_i32 arg);
 void tcg_gen_ext16s_i32(TCGv_i32 ret, TCGv_i32 arg);
 void tcg_gen_ext8u_i32(TCGv_i32 ret, TCGv_i32 arg);
@@ -482,6 +483,7 @@ void tcg_gen_sub2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 al,
                       TCGv_i64 ah, TCGv_i64 bl, TCGv_i64 bh);
 void tcg_gen_mulu2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 arg1, TCGv_i64 arg2);
 void tcg_gen_muls2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_mulsu2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 arg1, TCGv_i64 arg2);
 void tcg_gen_not_i64(TCGv_i64 ret, TCGv_i64 arg);
 void tcg_gen_ext8s_i64(TCGv_i64 ret, TCGv_i64 arg);
 void tcg_gen_ext16s_i64(TCGv_i64 ret, TCGv_i64 arg);
@@ -956,6 +958,7 @@ void tcg_gen_atomic_xor_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
 #define tcg_gen_sub2_tl tcg_gen_sub2_i64
 #define tcg_gen_mulu2_tl tcg_gen_mulu2_i64
 #define tcg_gen_muls2_tl tcg_gen_muls2_i64
+#define tcg_gen_mulsu2_tl tcg_gen_mulsu2_i64
 #define tcg_gen_atomic_cmpxchg_tl tcg_gen_atomic_cmpxchg_i64
 #define tcg_gen_atomic_xchg_tl tcg_gen_atomic_xchg_i64
 #define tcg_gen_atomic_fetch_add_tl tcg_gen_atomic_fetch_add_i64
@@ -1043,6 +1046,7 @@ void tcg_gen_atomic_xor_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
 #define tcg_gen_sub2_tl tcg_gen_sub2_i32
 #define tcg_gen_mulu2_tl tcg_gen_mulu2_i32
 #define tcg_gen_muls2_tl tcg_gen_muls2_i32
+#define tcg_gen_mulsu2_tl tcg_gen_mulsu2_i32
 #define tcg_gen_atomic_cmpxchg_tl tcg_gen_atomic_cmpxchg_i32
 #define tcg_gen_atomic_xchg_tl tcg_gen_atomic_xchg_i32
 #define tcg_gen_atomic_fetch_add_tl tcg_gen_atomic_fetch_add_i32
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 2d3e498bc2..aabf94f365 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -412,10 +412,12 @@ void tcg_prologue_init(TCGContext *s)
 
 #ifdef DEBUG_DISAS
     if (qemu_loglevel_mask(CPU_LOG_TB_OUT_ASM)) {
+        qemu_log_lock();
         qemu_log("PROLOGUE: [size=%zu]\n", prologue_size);
         log_disas(buf0, prologue_size);
         qemu_log("\n");
         qemu_log_flush();
+        qemu_log_unlock();
     }
 #endif
 }
@@ -2542,9 +2544,11 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
 #ifdef DEBUG_DISAS
     if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP)
                  && qemu_log_in_addr_range(tb->pc))) {
+        qemu_log_lock();
         qemu_log("OP:\n");
         tcg_dump_ops(s);
         qemu_log("\n");
+        qemu_log_unlock();
     }
 #endif
 
@@ -2570,9 +2574,11 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
 #ifdef DEBUG_DISAS
             if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP_IND)
                          && qemu_log_in_addr_range(tb->pc))) {
+                qemu_log_lock();
                 qemu_log("OP before indirect lowering:\n");
                 tcg_dump_ops(s);
                 qemu_log("\n");
+                qemu_log_unlock();
             }
 #endif
             /* Replace indirect temps with direct temps.  */
@@ -2590,9 +2596,11 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
 #ifdef DEBUG_DISAS
     if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP_OPT)
                  && qemu_log_in_addr_range(tb->pc))) {
+        qemu_log_lock();
         qemu_log("OP after optimization and liveness analysis:\n");
         tcg_dump_ops(s);
         qemu_log("\n");
+        qemu_log_unlock();
     }
 #endif
 
diff --git a/tcg/tcg.h b/tcg/tcg.h
index b34b5fbc2f..a35e4c4fd4 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -376,14 +376,36 @@ static inline unsigned get_alignment_bits(TCGMemOp memop)
 
 typedef tcg_target_ulong TCGArg;
 
-/* Define a type and accessor macros for variables.  Using pointer types
-   is nice because it gives some level of type safely.  Converting to and
-   from intptr_t rather than int reduces the number of sign-extension
-   instructions that get implied on 64-bit hosts.  Users of tcg_gen_* don't
-   need to know about any of this, and should treat TCGv as an opaque type.
-   In addition we do typechecking for different types of variables.  TCGv_i32
-   and TCGv_i64 are 32/64-bit variables respectively.  TCGv and TCGv_ptr
-   are aliases for target_ulong and host pointer sized values respectively.  */
+/* Define type and accessor macros for TCG variables.
+
+   TCG variables are the inputs and outputs of TCG ops, as described
+   in tcg/README. Target CPU front-end code uses these types to deal
+   with TCG variables as it emits TCG code via the tcg_gen_* functions.
+   They come in several flavours:
+    * TCGv_i32 : 32 bit integer type
+    * TCGv_i64 : 64 bit integer type
+    * TCGv_ptr : a host pointer type
+    * TCGv : an integer type the same size as target_ulong
+             (an alias for either TCGv_i32 or TCGv_i64)
+   The compiler's type checking will complain if you mix them
+   up and pass the wrong sized TCGv to a function.
+
+   Users of tcg_gen_* don't need to know about any of the internal
+   details of these, and should treat them as opaque types.
+   You won't be able to look inside them in a debugger either.
+
+   Internal implementation details follow:
+
+   Note that there is no definition of the structs TCGv_i32_d etc anywhere.
+   This is deliberate, because the values we store in variables of type
+   TCGv_i32 are not really pointers-to-structures. They're just small
+   integers, but keeping them in pointer types like this means that the
+   compiler will complain if you accidentally pass a TCGv_i32 to a
+   function which takes a TCGv_i64, and so on. Only the internals of
+   TCG need to care about the actual contents of the types, and they always
+   box and unbox via the MAKE_TCGV_* and GET_TCGV_* functions.
+   Converting to and from intptr_t rather than int reduces the number
+   of sign-extension instructions that get implied on 64-bit hosts.  */
 
 typedef struct TCGv_i32_d *TCGv_i32;
 typedef struct TCGv_i64_d *TCGv_i64;
@@ -726,6 +748,7 @@ static inline bool tcg_op_buf_full(void)
 
 /* pool based memory allocation */
 
+/* tb_lock must be held for tcg_malloc_internal. */
 void *tcg_malloc_internal(TCGContext *s, int size);
 void tcg_pool_reset(TCGContext *s);
 
@@ -733,6 +756,7 @@ void tb_lock(void);
 void tb_unlock(void);
 void tb_lock_reset(void);
 
+/* Called with tb_lock held.  */
 static inline void *tcg_malloc(int size)
 {
     TCGContext *s = &tcg_ctx;
diff --git a/tests/.gitignore b/tests/.gitignore
index 64e050e859..c0d7857538 100644
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -67,7 +67,6 @@ test-qmp-marshal.c
 test-qobject-output-visitor
 test-rcu-list
 test-replication
-test-rfifolock
 test-string-input-visitor
 test-string-output-visitor
 test-thread-pool
diff --git a/tests/Makefile.include b/tests/Makefile.include
index 1a135d2340..de516341fd 100644
--- a/tests/Makefile.include
+++ b/tests/Makefile.include
@@ -45,7 +45,6 @@ check-unit-y += tests/test-visitor-serialization$(EXESUF)
 check-unit-y += tests/test-iov$(EXESUF)
 gcov-files-test-iov-y = util/iov.c
 check-unit-y += tests/test-aio$(EXESUF)
-check-unit-$(CONFIG_POSIX) += tests/test-rfifolock$(EXESUF)
 check-unit-y += tests/test-throttle$(EXESUF)
 gcov-files-test-aio-$(CONFIG_WIN32) = aio-win32.c
 gcov-files-test-aio-$(CONFIG_POSIX) = aio-posix.c
@@ -491,7 +490,6 @@ tests/check-qom-proplist$(EXESUF): tests/check-qom-proplist.o $(test-qom-obj-y)
 tests/test-char$(EXESUF): tests/test-char.o qemu-char.o qemu-timer.o $(test-util-obj-y) $(qtest-obj-y) $(test-io-obj-y)
 tests/test-coroutine$(EXESUF): tests/test-coroutine.o $(test-block-obj-y)
 tests/test-aio$(EXESUF): tests/test-aio.o $(test-block-obj-y)
-tests/test-rfifolock$(EXESUF): tests/test-rfifolock.o $(test-util-obj-y)
 tests/test-throttle$(EXESUF): tests/test-throttle.o $(test-block-obj-y)
 tests/test-blockjob$(EXESUF): tests/test-blockjob.o $(test-block-obj-y) $(test-util-obj-y)
 tests/test-blockjob-txn$(EXESUF): tests/test-blockjob-txn.o $(test-block-obj-y) $(test-util-obj-y)
diff --git a/tests/crypto-tls-x509-helpers.h b/tests/crypto-tls-x509-helpers.h
index a8faa92bc0..921341c649 100644
--- a/tests/crypto-tls-x509-helpers.h
+++ b/tests/crypto-tls-x509-helpers.h
@@ -21,9 +21,6 @@
 #include <gnutls/gnutls.h>
 #include <gnutls/x509.h>
 
-#include <gnutls/gnutls.h>
-#include <gnutls/x509.h>
-
 #if !(defined WIN32) && \
     defined(CONFIG_TASN1) && \
     (LIBGNUTLS_VERSION_NUMBER >= 0x020600)
diff --git a/tests/ide-test.c b/tests/ide-test.c
index 67c7df0c8d..fb541f88b5 100644
--- a/tests/ide-test.c
+++ b/tests/ide-test.c
@@ -620,7 +620,6 @@ static void test_retry_flush(const char *machine)
     prepare_blkdebug_script(debug_path, "flush_to_disk");
 
     ide_test_start(
-        "-vnc none "
         "-drive file=blkdebug:%s:%s,if=ide,cache=writeback,format=raw,"
         "rerror=stop,werror=stop",
         debug_path, tmp_path);
diff --git a/tests/ipmi-bt-test.c b/tests/ipmi-bt-test.c
index 65d05b3d43..e84dd6889b 100644
--- a/tests/ipmi-bt-test.c
+++ b/tests/ipmi-bt-test.c
@@ -415,7 +415,7 @@ int main(int argc, char **argv)
     /* Run the tests */
     g_test_init(&argc, &argv, NULL);
 
-    cmdline = g_strdup_printf("-vnc none"
+    cmdline = g_strdup_printf(
           " -chardev socket,id=ipmi0,host=localhost,port=%d,reconnect=10"
           " -device ipmi-bmc-extern,chardev=ipmi0,id=bmc0"
           " -device isa-ipmi-bt,bmc=bmc0", emu_port);
diff --git a/tests/ipmi-kcs-test.c b/tests/ipmi-kcs-test.c
index 3750389651..9cf0b34a33 100644
--- a/tests/ipmi-kcs-test.c
+++ b/tests/ipmi-kcs-test.c
@@ -276,7 +276,7 @@ int main(int argc, char **argv)
     /* Run the tests */
     g_test_init(&argc, &argv, NULL);
 
-    cmdline = g_strdup_printf("-vnc none -device ipmi-bmc-sim,id=bmc0"
+    cmdline = g_strdup_printf("-device ipmi-bmc-sim,id=bmc0"
                               " -device isa-ipmi-kcs,bmc=bmc0");
     qtest_start(cmdline);
     qtest_irq_intercept_in(global_qtest, "ioapic");
diff --git a/tests/qemu-iotests/030 b/tests/qemu-iotests/030
index 107049b50f..54db54a1ea 100755
--- a/tests/qemu-iotests/030
+++ b/tests/qemu-iotests/030
@@ -36,7 +36,7 @@ class TestSingleDrive(iotests.QMPTestCase):
         qemu_img('create', '-f', iotests.imgfmt, '-o', 'backing_file=%s' % mid_img, test_img)
         qemu_io('-f', 'raw', '-c', 'write -P 0x1 0 512', backing_img)
         qemu_io('-f', iotests.imgfmt, '-c', 'write -P 0x1 524288 512', mid_img)
-        self.vm = iotests.VM().add_drive("blkdebug::" + test_img)
+        self.vm = iotests.VM().add_drive("blkdebug::" + test_img, "backing.node-name=mid")
         self.vm.launch()
 
     def tearDown(self):
@@ -60,6 +60,25 @@ class TestSingleDrive(iotests.QMPTestCase):
                          qemu_io('-f', iotests.imgfmt, '-c', 'map', test_img),
                          'image file map does not match backing file after streaming')
 
+    def test_stream_intermediate(self):
+        self.assert_no_active_block_jobs()
+
+        self.assertNotEqual(qemu_io('-f', 'raw', '-c', 'map', backing_img),
+                            qemu_io('-f', iotests.imgfmt, '-c', 'map', mid_img),
+                            'image file map matches backing file before streaming')
+
+        result = self.vm.qmp('block-stream', device='mid', job_id='stream-mid')
+        self.assert_qmp(result, 'return', {})
+
+        self.wait_until_completed(drive='stream-mid')
+
+        self.assert_no_active_block_jobs()
+        self.vm.shutdown()
+
+        self.assertEqual(qemu_io('-f', 'raw', '-c', 'map', backing_img),
+                         qemu_io('-f', iotests.imgfmt, '-c', 'map', mid_img),
+                         'image file map does not match backing file after streaming')
+
     def test_stream_pause(self):
         self.assert_no_active_block_jobs()
 
@@ -129,6 +148,298 @@ class TestSingleDrive(iotests.QMPTestCase):
         self.assert_qmp(result, 'error/class', 'GenericError')
 
 
+class TestParallelOps(iotests.QMPTestCase):
+    num_ops = 4 # Number of parallel block-stream operations
+    num_imgs = num_ops * 2 + 1
+    image_len = num_ops * 1024 * 1024
+    imgs = []
+
+    def setUp(self):
+        opts = []
+        self.imgs = []
+
+        # Initialize file names and command-line options
+        for i in range(self.num_imgs):
+            img_depth = self.num_imgs - i - 1
+            opts.append("backing." * img_depth + "node-name=node%d" % i)
+            self.imgs.append(os.path.join(iotests.test_dir, 'img-%d.img' % i))
+
+        # Create all images
+        iotests.create_image(self.imgs[0], self.image_len)
+        for i in range(1, self.num_imgs):
+            qemu_img('create', '-f', iotests.imgfmt,
+                     '-o', 'backing_file=%s' % self.imgs[i-1], self.imgs[i])
+
+        # Put data into the images we are copying data from
+        for i in range(self.num_imgs / 2):
+            img_index = i * 2 + 1
+            # Alternate between 512k and 1M.
+            # This way jobs will not finish in the same order they were created
+            num_kb = 512 + 512 * (i % 2)
+            qemu_io('-f', iotests.imgfmt,
+                    '-c', 'write -P %d %d %d' % (i, i*1024*1024, num_kb * 1024),
+                    self.imgs[img_index])
+
+        # Attach the drive to the VM
+        self.vm = iotests.VM()
+        self.vm.add_drive(self.imgs[-1], ','.join(opts))
+        self.vm.launch()
+
+    def tearDown(self):
+        self.vm.shutdown()
+        for img in self.imgs:
+            os.remove(img)
+
+    # Test that it's possible to run several block-stream operations
+    # in parallel in the same snapshot chain
+    def test_stream_parallel(self):
+        self.assert_no_active_block_jobs()
+
+        # Check that the maps don't match before the streaming operations
+        for i in range(2, self.num_imgs, 2):
+            self.assertNotEqual(qemu_io('-f', iotests.imgfmt, '-c', 'map', self.imgs[i]),
+                                qemu_io('-f', iotests.imgfmt, '-c', 'map', self.imgs[i-1]),
+                                'image file map matches backing file before streaming')
+
+        # Create all streaming jobs
+        pending_jobs = []
+        for i in range(2, self.num_imgs, 2):
+            node_name = 'node%d' % i
+            job_id = 'stream-%s' % node_name
+            pending_jobs.append(job_id)
+            result = self.vm.qmp('block-stream', device=node_name, job_id=job_id, base=self.imgs[i-2], speed=512*1024)
+            self.assert_qmp(result, 'return', {})
+
+        # Wait for all jobs to be finished.
+        while len(pending_jobs) > 0:
+            for event in self.vm.get_qmp_events(wait=True):
+                if event['event'] == 'BLOCK_JOB_COMPLETED':
+                    job_id = self.dictpath(event, 'data/device')
+                    self.assertTrue(job_id in pending_jobs)
+                    self.assert_qmp_absent(event, 'data/error')
+                    pending_jobs.remove(job_id)
+
+        self.assert_no_active_block_jobs()
+        self.vm.shutdown()
+
+        # Check that all maps match now
+        for i in range(2, self.num_imgs, 2):
+            self.assertEqual(qemu_io('-f', iotests.imgfmt, '-c', 'map', self.imgs[i]),
+                             qemu_io('-f', iotests.imgfmt, '-c', 'map', self.imgs[i-1]),
+                             'image file map does not match backing file after streaming')
+
+    # Test that it's not possible to perform two block-stream
+    # operations if there are nodes involved in both.
+    def test_overlapping_1(self):
+        self.assert_no_active_block_jobs()
+
+        # Set a speed limit to make sure that this job blocks the rest
+        result = self.vm.qmp('block-stream', device='node4', job_id='stream-node4', base=self.imgs[1], speed=1024*1024)
+        self.assert_qmp(result, 'return', {})
+
+        result = self.vm.qmp('block-stream', device='node5', job_id='stream-node5', base=self.imgs[2])
+        self.assert_qmp(result, 'error/class', 'GenericError')
+
+        result = self.vm.qmp('block-stream', device='node3', job_id='stream-node3', base=self.imgs[2])
+        self.assert_qmp(result, 'error/class', 'GenericError')
+
+        result = self.vm.qmp('block-stream', device='node4', job_id='stream-node4-v2')
+        self.assert_qmp(result, 'error/class', 'GenericError')
+
+        # block-commit should also fail if it touches nodes used by the stream job
+        result = self.vm.qmp('block-commit', device='drive0', base=self.imgs[4], job_id='commit-node4')
+        self.assert_qmp(result, 'error/class', 'GenericError')
+
+        result = self.vm.qmp('block-commit', device='drive0', base=self.imgs[1], top=self.imgs[3], job_id='commit-node1')
+        self.assert_qmp(result, 'error/class', 'GenericError')
+
+        # This fails because it needs to modify the backing string in node2, which is blocked
+        result = self.vm.qmp('block-commit', device='drive0', base=self.imgs[0], top=self.imgs[1], job_id='commit-node0')
+        self.assert_qmp(result, 'error/class', 'GenericError')
+
+        self.wait_until_completed(drive='stream-node4')
+        self.assert_no_active_block_jobs()
+
+    # Similar to test_overlapping_1, but with block-commit
+    # blocking the other jobs
+    def test_overlapping_2(self):
+        self.assertLessEqual(9, self.num_imgs)
+        self.assert_no_active_block_jobs()
+
+        # Set a speed limit to make sure that this job blocks the rest
+        result = self.vm.qmp('block-commit', device='drive0', top=self.imgs[5], base=self.imgs[3], job_id='commit-node3', speed=1024*1024)
+        self.assert_qmp(result, 'return', {})
+
+        result = self.vm.qmp('block-stream', device='node3', job_id='stream-node3')
+        self.assert_qmp(result, 'error/class', 'GenericError')
+
+        result = self.vm.qmp('block-stream', device='node6', base=self.imgs[2], job_id='stream-node6')
+        self.assert_qmp(result, 'error/class', 'GenericError')
+
+        result = self.vm.qmp('block-stream', device='node4', base=self.imgs[2], job_id='stream-node4')
+        self.assert_qmp(result, 'error/class', 'GenericError')
+
+        result = self.vm.qmp('block-stream', device='node6', base=self.imgs[4], job_id='stream-node6-v2')
+        self.assert_qmp(result, 'error/class', 'GenericError')
+
+        # This fails because block-commit needs to block node6, the overlay of the 'top' image
+        result = self.vm.qmp('block-stream', device='node7', base=self.imgs[5], job_id='stream-node6-v3')
+        self.assert_qmp(result, 'error/class', 'GenericError')
+
+        # This fails because block-commit currently blocks the active layer even if it's not used
+        result = self.vm.qmp('block-stream', device='drive0', base=self.imgs[5], job_id='stream-drive0')
+        self.assert_qmp(result, 'error/class', 'GenericError')
+
+        self.wait_until_completed(drive='commit-node3')
+
+    # Similar to test_overlapping_2, but here block-commit doesn't use the 'top' parameter.
+    # Internally this uses a mirror block job, hence the separate test case.
+    def test_overlapping_3(self):
+        self.assertLessEqual(8, self.num_imgs)
+        self.assert_no_active_block_jobs()
+
+        # Set a speed limit to make sure that this job blocks the rest
+        result = self.vm.qmp('block-commit', device='drive0', base=self.imgs[3], job_id='commit-drive0', speed=1024*1024)
+        self.assert_qmp(result, 'return', {})
+
+        result = self.vm.qmp('block-stream', device='node5', base=self.imgs[3], job_id='stream-node6')
+        self.assert_qmp(result, 'error/class', 'GenericError')
+
+        event = self.vm.get_qmp_event(wait=True)
+        self.assertEqual(event['event'], 'BLOCK_JOB_READY')
+        self.assert_qmp(event, 'data/device', 'commit-drive0')
+        self.assert_qmp(event, 'data/type', 'commit')
+        self.assert_qmp_absent(event, 'data/error')
+
+        result = self.vm.qmp('block-job-complete', device='commit-drive0')
+        self.assert_qmp(result, 'return', {})
+
+        self.wait_until_completed(drive='commit-drive0')
+
+    # Test a block-stream and a block-commit job in parallel
+    def test_stream_commit(self):
+        self.assertLessEqual(8, self.num_imgs)
+        self.assert_no_active_block_jobs()
+
+        # Stream from node0 into node2
+        result = self.vm.qmp('block-stream', device='node2', job_id='node2')
+        self.assert_qmp(result, 'return', {})
+
+        # Commit from the active layer into node3
+        result = self.vm.qmp('block-commit', device='drive0', base=self.imgs[3])
+        self.assert_qmp(result, 'return', {})
+
+        # Wait for all jobs to be finished.
+        pending_jobs = ['node2', 'drive0']
+        while len(pending_jobs) > 0:
+            for event in self.vm.get_qmp_events(wait=True):
+                if event['event'] == 'BLOCK_JOB_COMPLETED':
+                    node_name = self.dictpath(event, 'data/device')
+                    self.assertTrue(node_name in pending_jobs)
+                    self.assert_qmp_absent(event, 'data/error')
+                    pending_jobs.remove(node_name)
+                if event['event'] == 'BLOCK_JOB_READY':
+                    self.assert_qmp(event, 'data/device', 'drive0')
+                    self.assert_qmp(event, 'data/type', 'commit')
+                    self.assert_qmp_absent(event, 'data/error')
+                    self.assertTrue('drive0' in pending_jobs)
+                    self.vm.qmp('block-job-complete', device='drive0')
+
+        self.assert_no_active_block_jobs()
+
+    # Test the base_node parameter
+    def test_stream_base_node_name(self):
+        self.assert_no_active_block_jobs()
+
+        self.assertNotEqual(qemu_io('-f', iotests.imgfmt, '-c', 'map', self.imgs[4]),
+                            qemu_io('-f', iotests.imgfmt, '-c', 'map', self.imgs[3]),
+                            'image file map matches backing file before streaming')
+
+        # Error: the base node does not exist
+        result = self.vm.qmp('block-stream', device='node4', base_node='none', job_id='stream')
+        self.assert_qmp(result, 'error/class', 'GenericError')
+
+        # Error: the base node is not a backing file of the top node
+        result = self.vm.qmp('block-stream', device='node4', base_node='node6', job_id='stream')
+        self.assert_qmp(result, 'error/class', 'GenericError')
+
+        # Error: the base node is the same as the top node
+        result = self.vm.qmp('block-stream', device='node4', base_node='node4', job_id='stream')
+        self.assert_qmp(result, 'error/class', 'GenericError')
+
+        # Error: cannot specify 'base' and 'base-node' at the same time
+        result = self.vm.qmp('block-stream', device='node4', base=self.imgs[2], base_node='node2', job_id='stream')
+        self.assert_qmp(result, 'error/class', 'GenericError')
+
+        # Success: the base node is a backing file of the top node
+        result = self.vm.qmp('block-stream', device='node4', base_node='node2', job_id='stream')
+        self.assert_qmp(result, 'return', {})
+
+        self.wait_until_completed(drive='stream')
+
+        self.assert_no_active_block_jobs()
+        self.vm.shutdown()
+
+        self.assertEqual(qemu_io('-f', iotests.imgfmt, '-c', 'map', self.imgs[4]),
+                         qemu_io('-f', iotests.imgfmt, '-c', 'map', self.imgs[3]),
+                         'image file map matches backing file after streaming')
+
+class TestQuorum(iotests.QMPTestCase):
+    num_children = 3
+    children = []
+    backing = []
+
+    def setUp(self):
+        opts = ['driver=quorum', 'vote-threshold=2']
+
+        # Initialize file names and command-line options
+        for i in range(self.num_children):
+            child_img = os.path.join(iotests.test_dir, 'img-%d.img' % i)
+            backing_img = os.path.join(iotests.test_dir, 'backing-%d.img' % i)
+            self.children.append(child_img)
+            self.backing.append(backing_img)
+            qemu_img('create', '-f', iotests.imgfmt, backing_img, '1M')
+            qemu_io('-f', iotests.imgfmt,
+                    '-c', 'write -P 0x55 0 1024', backing_img)
+            qemu_img('create', '-f', iotests.imgfmt,
+                     '-o', 'backing_file=%s' % backing_img, child_img)
+            opts.append("children.%d.file.filename=%s" % (i, child_img))
+            opts.append("children.%d.node-name=node%d" % (i, i))
+
+        # Attach the drive to the VM
+        self.vm = iotests.VM()
+        self.vm.add_drive(path = None, opts = ','.join(opts))
+        self.vm.launch()
+
+    def tearDown(self):
+        self.vm.shutdown()
+        for img in self.children:
+            os.remove(img)
+        for img in self.backing:
+            os.remove(img)
+
+    def test_stream_quorum(self):
+        if not iotests.supports_quorum():
+            return
+
+        self.assertNotEqual(qemu_io('-f', iotests.imgfmt, '-c', 'map', self.children[0]),
+                            qemu_io('-f', iotests.imgfmt, '-c', 'map', self.backing[0]),
+                            'image file map matches backing file before streaming')
+
+        self.assert_no_active_block_jobs()
+
+        result = self.vm.qmp('block-stream', device='node0', job_id='stream-node0')
+        self.assert_qmp(result, 'return', {})
+
+        self.wait_until_completed(drive='stream-node0')
+
+        self.assert_no_active_block_jobs()
+        self.vm.shutdown()
+
+        self.assertEqual(qemu_io('-f', iotests.imgfmt, '-c', 'map', self.children[0]),
+                         qemu_io('-f', iotests.imgfmt, '-c', 'map', self.backing[0]),
+                         'image file map does not match backing file after streaming')
+
 class TestSmallerBackingFile(iotests.QMPTestCase):
     backing_len = 1 * 1024 * 1024 # MB
     image_len = 2 * backing_len
diff --git a/tests/qemu-iotests/030.out b/tests/qemu-iotests/030.out
index 6323079e08..84bfd63fba 100644
--- a/tests/qemu-iotests/030.out
+++ b/tests/qemu-iotests/030.out
@@ -1,5 +1,5 @@
-..............
+......................
 ----------------------------------------------------------------------
-Ran 14 tests
+Ran 22 tests
 
 OK
diff --git a/tests/qemu-iotests/041 b/tests/qemu-iotests/041
index 30e628f0f7..bc6cf782fe 100755
--- a/tests/qemu-iotests/041
+++ b/tests/qemu-iotests/041
@@ -760,9 +760,6 @@ class TestRepairQuorum(iotests.QMPTestCase):
     image_len = 1 * 1024 * 1024 # MB
     IMAGES = [ quorum_img1, quorum_img2, quorum_img3 ]
 
-    def has_quorum(self):
-        return 'quorum' in iotests.qemu_img_pipe('--help')
-
     def setUp(self):
         self.vm = iotests.VM()
 
@@ -783,7 +780,7 @@ class TestRepairQuorum(iotests.QMPTestCase):
         #assemble the quorum block device from the individual files
         args = { "driver": "quorum", "node-name": "quorum0",
                  "vote-threshold": 2, "children": [ "img0", "img1", "img2" ] }
-        if self.has_quorum():
+        if iotests.supports_quorum():
             result = self.vm.qmp("blockdev-add", **args)
             self.assert_qmp(result, 'return', {})
 
@@ -798,7 +795,7 @@ class TestRepairQuorum(iotests.QMPTestCase):
                 pass
 
     def test_complete(self):
-        if not self.has_quorum():
+        if not iotests.supports_quorum():
             return
 
         self.assert_no_active_block_jobs()
@@ -817,7 +814,7 @@ class TestRepairQuorum(iotests.QMPTestCase):
                         'target image does not match source after mirroring')
 
     def test_cancel(self):
-        if not self.has_quorum():
+        if not iotests.supports_quorum():
             return
 
         self.assert_no_active_block_jobs()
@@ -834,7 +831,7 @@ class TestRepairQuorum(iotests.QMPTestCase):
         self.vm.shutdown()
 
     def test_cancel_after_ready(self):
-        if not self.has_quorum():
+        if not iotests.supports_quorum():
             return
 
         self.assert_no_active_block_jobs()
@@ -853,7 +850,7 @@ class TestRepairQuorum(iotests.QMPTestCase):
                         'target image does not match source after mirroring')
 
     def test_pause(self):
-        if not self.has_quorum():
+        if not iotests.supports_quorum():
             return
 
         self.assert_no_active_block_jobs()
@@ -883,7 +880,7 @@ class TestRepairQuorum(iotests.QMPTestCase):
                         'target image does not match source after mirroring')
 
     def test_medium_not_found(self):
-        if not self.has_quorum():
+        if not iotests.supports_quorum():
             return
 
         if iotests.qemu_default_machine != 'pc':
@@ -897,7 +894,7 @@ class TestRepairQuorum(iotests.QMPTestCase):
         self.assert_qmp(result, 'error/class', 'GenericError')
 
     def test_image_not_found(self):
-        if not self.has_quorum():
+        if not iotests.supports_quorum():
             return
 
         result = self.vm.qmp('drive-mirror', job_id='job0', device='quorum0',
@@ -907,7 +904,7 @@ class TestRepairQuorum(iotests.QMPTestCase):
         self.assert_qmp(result, 'error/class', 'GenericError')
 
     def test_device_not_found(self):
-        if not self.has_quorum():
+        if not iotests.supports_quorum():
             return
 
         result = self.vm.qmp('drive-mirror', job_id='job0',
@@ -918,7 +915,7 @@ class TestRepairQuorum(iotests.QMPTestCase):
         self.assert_qmp(result, 'error/class', 'GenericError')
 
     def test_wrong_sync_mode(self):
-        if not self.has_quorum():
+        if not iotests.supports_quorum():
             return
 
         result = self.vm.qmp('drive-mirror', device='quorum0', job_id='job0',
@@ -928,7 +925,7 @@ class TestRepairQuorum(iotests.QMPTestCase):
         self.assert_qmp(result, 'error/class', 'GenericError')
 
     def test_no_node_name(self):
-        if not self.has_quorum():
+        if not iotests.supports_quorum():
             return
 
         result = self.vm.qmp('drive-mirror', job_id='job0', device='quorum0',
@@ -937,7 +934,7 @@ class TestRepairQuorum(iotests.QMPTestCase):
         self.assert_qmp(result, 'error/class', 'GenericError')
 
     def test_nonexistent_replaces(self):
-        if not self.has_quorum():
+        if not iotests.supports_quorum():
             return
 
         result = self.vm.qmp('drive-mirror', job_id='job0', device='quorum0',
@@ -946,7 +943,7 @@ class TestRepairQuorum(iotests.QMPTestCase):
         self.assert_qmp(result, 'error/class', 'GenericError')
 
     def test_after_a_quorum_snapshot(self):
-        if not self.has_quorum():
+        if not iotests.supports_quorum():
             return
 
         result = self.vm.qmp('blockdev-snapshot-sync', node_name='img1',
diff --git a/tests/qemu-iotests/139 b/tests/qemu-iotests/139
index 6a0f6ca569..6d98e4f879 100644
--- a/tests/qemu-iotests/139
+++ b/tests/qemu-iotests/139
@@ -336,8 +336,9 @@ class TestBlockdevDel(iotests.QMPTestCase):
         self.checkBlockDriverState('node1', False)
 
     def testQuorum(self):
-        if not 'quorum' in iotests.qemu_img_pipe('--help'):
+        if not iotests.supports_quorum():
             return
+
         self.addQuorum('quorum0', 'node0', 'node1')
         # We cannot remove the children of a Quorum device
         self.delBlockDriverState('node0', expect_error = True)
diff --git a/tests/qemu-iotests/171 b/tests/qemu-iotests/171
new file mode 100755
index 0000000000..257be10a0e
--- /dev/null
+++ b/tests/qemu-iotests/171
@@ -0,0 +1,212 @@
+#!/bin/bash
+#
+# Test 'offset' and 'size' options of the raw driver. Make sure we can't
+# (or can) read and write outside of the image size.
+#
+# Copyright (C) 2016 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+# creator
+owner=tgolembi@redhat.com
+
+seq=`basename $0`
+echo "QA output created by $seq"
+
+here=`pwd`
+status=1	# failure is the default!
+
+_cleanup()
+{
+    _cleanup_test_img
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+
+_supported_fmt raw
+_supported_proto file
+_supported_os Linux
+
+
+# Create JSON with options
+img_json() {
+    echo -n 'json:{"driver":"raw", '
+    echo -n "\"offset\":\"$img_offset\", "
+    if [ "$img_size" -ne -1 ] ; then
+        echo -n "\"size\":\"$img_size\", "
+    fi
+    echo -n '"file": {'
+    echo -n    '"driver":"file", '
+    echo -n    "\"filename\":\"$TEST_IMG\" "
+    echo -n "} }"
+}
+
+do_general_test() {
+    if [ "$img_size" -ge 0 ] ; then
+        test_size=$img_size
+    else
+        test_size=$((size-img_offset))
+    fi
+
+    echo
+    echo "write to image"
+    $QEMU_IO -c "write -P 0x0a 0 $test_size" "$(img_json)" | _filter_qemu_io
+
+    echo
+    echo "read the image"
+    $QEMU_IO -c "read -P 0x0a 0 $test_size" "$(img_json)" | _filter_qemu_io
+
+    echo
+    echo "check that offset is respected"
+    $QEMU_IO -c "read -v $((img_offset-2)) 4" $TEST_IMG | _filter_qemu_io
+
+    echo
+    echo "write before image boundary"
+    $QEMU_IO -c "write $((test_size-1)) 1" "$(img_json)" | _filter_qemu_io
+
+    echo
+    echo "write across image boundary"
+    $QEMU_IO -c "write $((test_size-1)) 2" "$(img_json)" | _filter_qemu_io
+
+    echo
+    echo "write at image boundary"
+    $QEMU_IO -c "write $test_size 1" "$(img_json)" | _filter_qemu_io
+
+    echo
+    echo "write after image boundary"
+    $QEMU_IO -c "write $((test_size+512)) 1" "$(img_json)" | _filter_qemu_io
+
+    echo
+    echo "writev before/after image boundary"
+    $QEMU_IO -c "writev $((test_size-512)) 512 512" "$(img_json)" | _filter_qemu_io
+
+    echo
+    echo "read before image boundary"
+    $QEMU_IO -c "read $((test_size-1)) 1" "$(img_json)" | _filter_qemu_io
+
+    echo
+    echo "read across image boundary"
+    $QEMU_IO -c "read $((test_size-1)) 2" "$(img_json)" | _filter_qemu_io
+
+    echo
+    echo "read at image boundary"
+    $QEMU_IO -c "read $test_size 1" "$(img_json)" | _filter_qemu_io
+
+    echo
+    echo "read after image boundary"
+    $QEMU_IO -c "read $((test_size+512)) 1" "$(img_json)" | _filter_qemu_io
+
+    echo
+    echo "readv before/after image boundary"
+    $QEMU_IO -c "readv $((test_size-512)) 512 512" "$(img_json)" | _filter_qemu_io
+
+    echo
+    echo "fill image with pattern"
+    $QEMU_IO -c "write -P 0x0a 0 $size" $TEST_IMG | _filter_qemu_io
+
+    echo
+    echo "write zeroes and check"
+    $QEMU_IO -c "write -z 0 512" "$(img_json)" | _filter_qemu_io
+    $QEMU_IO -c "read -v $((img_offset-2)) 4" $TEST_IMG | _filter_qemu_io
+
+    echo
+    echo "write zeroes across image boundary"
+    $QEMU_IO -c "write -z $((test_size-1)) 2" "$(img_json)" | _filter_qemu_io
+
+    echo
+    echo "write zeroes at image boundary and check"
+    $QEMU_IO -c "write -z $((test_size-2)) 2" "$(img_json)" | _filter_qemu_io
+    $QEMU_IO -c "read -v $((img_offset+test_size-2)) 2" $TEST_IMG | _filter_qemu_io
+    $QEMU_IO -c "read -v $((img_offset+test_size)) 2" $TEST_IMG | _filter_qemu_io
+
+    echo
+    echo "fill image with pattern"
+    $QEMU_IO -c "write -P 0x0a 0 $size" $TEST_IMG | _filter_qemu_io
+
+    echo
+    echo "discard and check"
+    $QEMU_IO -c "discard 0 512" "$(img_json)" | _filter_qemu_io
+    $QEMU_IO -c "read -v $((img_offset-2)) 4" $TEST_IMG | _filter_qemu_io
+
+    echo
+    echo "discard across image boundary"
+    $QEMU_IO -c "discard $((test_size-1)) 2" "$(img_json)" | _filter_qemu_io
+
+    echo
+    echo "discard at image boundary and check"
+    $QEMU_IO -c "discard $((test_size-2)) 2" "$(img_json)" | _filter_qemu_io
+    $QEMU_IO -c "read -v $((img_offset+test_size-2)) 2" $TEST_IMG | _filter_qemu_io
+    $QEMU_IO -c "read -v $((img_offset+test_size)) 2" $TEST_IMG | _filter_qemu_io
+}
+
+echo
+echo "== test 'offset' option =="
+size=4096
+img_offset=512
+img_size=-1
+_make_test_img $size
+do_general_test
+_cleanup_test_img
+
+echo
+echo "== test 'offset' and 'size' options =="
+size=4096
+img_offset=512
+img_size=2048
+_make_test_img $size
+do_general_test
+_cleanup_test_img
+
+echo
+echo "== test misaligned 'offset' =="
+size=4096
+img_offset=10
+img_size=2048
+_make_test_img $size
+do_general_test
+_cleanup_test_img
+
+echo
+echo "== test reopen =="
+size=4096
+img_offset=512
+img_size=512
+_make_test_img $size
+(
+$QEMU_IO "$(img_json)"  <<EOT
+write -P 0x0a 0 512
+write -P 0x0a 511 1
+write -P 0x0a 512 1
+reopen -o driver=raw,offset=1536,size=1024
+write -P 0x0a 0 1024
+write -P 0x0a 1023 1
+write -P 0x0a 1024 1
+EOT
+) | _filter_qemu_io
+echo "checking boundaries"
+$QEMU_IO -c "read -v 510 4" $TEST_IMG | _filter_qemu_io
+$QEMU_IO -c "read -v 1022 4" $TEST_IMG | _filter_qemu_io
+$QEMU_IO -c "read -v 1534 4" $TEST_IMG | _filter_qemu_io
+$QEMU_IO -c "read -v 2558 4" $TEST_IMG | _filter_qemu_io
+_cleanup_test_img
+
+# success, all done
+echo
+echo "*** done"
+rm -f $seq.full
+status=0
diff --git a/tests/qemu-iotests/171.out b/tests/qemu-iotests/171.out
new file mode 100644
index 0000000000..ec3363b4f4
--- /dev/null
+++ b/tests/qemu-iotests/171.out
@@ -0,0 +1,313 @@
+QA output created by 171
+
+== test 'offset' option ==
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=4096
+
+write to image
+wrote 3584/3584 bytes at offset 0
+3.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+read the image
+read 3584/3584 bytes at offset 0
+3.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+check that offset is respected
+000001fe:  00 00 0a 0a  ....
+read 4/4 bytes at offset 510
+4 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+write before image boundary
+wrote 1/1 bytes at offset 3583
+1 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+write across image boundary
+write failed: Input/output error
+
+write at image boundary
+write failed: Input/output error
+
+write after image boundary
+write failed: Input/output error
+
+writev before/after image boundary
+writev failed: Input/output error
+
+read before image boundary
+read 1/1 bytes at offset 3583
+1 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+read across image boundary
+read failed: Input/output error
+
+read at image boundary
+read failed: Input/output error
+
+read after image boundary
+read failed: Input/output error
+
+readv before/after image boundary
+readv failed: Input/output error
+
+fill image with pattern
+wrote 4096/4096 bytes at offset 0
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+write zeroes and check
+wrote 512/512 bytes at offset 0
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+000001fe:  0a 0a 00 00  ....
+read 4/4 bytes at offset 510
+4 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+write zeroes across image boundary
+write failed: Input/output error
+
+write zeroes at image boundary and check
+wrote 2/2 bytes at offset 3582
+2 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+00000ffe:  00 00  ..
+read 2/2 bytes at offset 4094
+2 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read failed: Input/output error
+
+fill image with pattern
+wrote 4096/4096 bytes at offset 0
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+discard and check
+discard 512/512 bytes at offset 0
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+000001fe:  0a 0a 00 00  ....
+read 4/4 bytes at offset 510
+4 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+discard across image boundary
+discard failed: Input/output error
+
+discard at image boundary and check
+discard 2/2 bytes at offset 3582
+2 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+00000ffe:  00 00  ..
+read 2/2 bytes at offset 4094
+2 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read failed: Input/output error
+
+== test 'offset' and 'size' options ==
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=4096
+
+write to image
+wrote 2048/2048 bytes at offset 0
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+read the image
+read 2048/2048 bytes at offset 0
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+check that offset is respected
+000001fe:  00 00 0a 0a  ....
+read 4/4 bytes at offset 510
+4 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+write before image boundary
+wrote 1/1 bytes at offset 2047
+1 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+write across image boundary
+write failed: Input/output error
+
+write at image boundary
+write failed: Input/output error
+
+write after image boundary
+write failed: Input/output error
+
+writev before/after image boundary
+writev failed: Input/output error
+
+read before image boundary
+read 1/1 bytes at offset 2047
+1 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+read across image boundary
+read failed: Input/output error
+
+read at image boundary
+read failed: Input/output error
+
+read after image boundary
+read failed: Input/output error
+
+readv before/after image boundary
+readv failed: Input/output error
+
+fill image with pattern
+wrote 4096/4096 bytes at offset 0
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+write zeroes and check
+wrote 512/512 bytes at offset 0
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+000001fe:  0a 0a 00 00  ....
+read 4/4 bytes at offset 510
+4 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+write zeroes across image boundary
+write failed: Input/output error
+
+write zeroes at image boundary and check
+wrote 2/2 bytes at offset 2046
+2 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+000009fe:  00 00  ..
+read 2/2 bytes at offset 2558
+2 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+00000a00:  0a 0a  ..
+read 2/2 bytes at offset 2560
+2 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+fill image with pattern
+wrote 4096/4096 bytes at offset 0
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+discard and check
+discard 512/512 bytes at offset 0
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+000001fe:  0a 0a 00 00  ....
+read 4/4 bytes at offset 510
+4 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+discard across image boundary
+discard failed: Input/output error
+
+discard at image boundary and check
+discard 2/2 bytes at offset 2046
+2 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+000009fe:  00 00  ..
+read 2/2 bytes at offset 2558
+2 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+00000a00:  0a 0a  ..
+read 2/2 bytes at offset 2560
+2 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== test misaligned 'offset' ==
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=4096
+
+write to image
+wrote 2048/2048 bytes at offset 0
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+read the image
+read 2048/2048 bytes at offset 0
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+check that offset is respected
+00000008:  00 00 0a 0a  ....
+read 4/4 bytes at offset 8
+4 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+write before image boundary
+wrote 1/1 bytes at offset 2047
+1 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+write across image boundary
+write failed: Input/output error
+
+write at image boundary
+write failed: Input/output error
+
+write after image boundary
+write failed: Input/output error
+
+writev before/after image boundary
+writev failed: Input/output error
+
+read before image boundary
+read 1/1 bytes at offset 2047
+1 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+read across image boundary
+read failed: Input/output error
+
+read at image boundary
+read failed: Input/output error
+
+read after image boundary
+read failed: Input/output error
+
+readv before/after image boundary
+readv failed: Input/output error
+
+fill image with pattern
+wrote 4096/4096 bytes at offset 0
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+write zeroes and check
+wrote 512/512 bytes at offset 0
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+00000008:  0a 0a 00 00  ....
+read 4/4 bytes at offset 8
+4 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+write zeroes across image boundary
+write failed: Input/output error
+
+write zeroes at image boundary and check
+wrote 2/2 bytes at offset 2046
+2 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+00000808:  00 00  ..
+read 2/2 bytes at offset 2056
+2 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+0000080a:  0a 0a  ..
+read 2/2 bytes at offset 2058
+2 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+fill image with pattern
+wrote 4096/4096 bytes at offset 0
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+discard and check
+discard 512/512 bytes at offset 0
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+00000008:  0a 0a 00 00  ....
+read 4/4 bytes at offset 8
+4 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+discard across image boundary
+discard failed: Input/output error
+
+discard at image boundary and check
+discard 2/2 bytes at offset 2046
+2 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+00000808:  00 00  ..
+read 2/2 bytes at offset 2056
+2 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+0000080a:  0a 0a  ..
+read 2/2 bytes at offset 2058
+2 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== test reopen ==
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=4096
+wrote 512/512 bytes at offset 0
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 1/1 bytes at offset 511
+1 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+write failed: Input/output error
+wrote 1024/1024 bytes at offset 0
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 1/1 bytes at offset 1023
+1 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+write failed: Input/output error
+checking boundaries
+000001fe:  00 00 0a 0a  ....
+read 4/4 bytes at offset 510
+4 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+000003fe:  0a 0a 00 00  ....
+read 4/4 bytes at offset 1022
+4 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+000005fe:  00 00 0a 0a  ....
+read 4/4 bytes at offset 1534
+4 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+000009fe:  0a 0a 00 00  ....
+read 4/4 bytes at offset 2558
+4 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+*** done
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index 597fc2c952..866c1a032d 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -163,4 +163,5 @@
 160 rw auto quick
 162 auto quick
 170 rw auto quick
+171 rw auto quick
 172 auto
diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
index 1f30cfcc75..bec8eb4b8d 100644
--- a/tests/qemu-iotests/iotests.py
+++ b/tests/qemu-iotests/iotests.py
@@ -348,9 +348,12 @@ def verify_platform(supported_oses=['linux']):
     if True not in [sys.platform.startswith(x) for x in supported_oses]:
         notrun('not suitable for this OS: %s' % sys.platform)
 
+def supports_quorum():
+    return 'quorum' in qemu_img_pipe('--help')
+
 def verify_quorum():
     '''Skip test suite if quorum support is not available'''
-    if 'quorum' not in qemu_img_pipe('--help'):
+    if not supports_quorum():
         notrun('quorum support missing')
 
 def main(supported_fmts=[], supported_oses=['linux']):
diff --git a/tests/test-aio.c b/tests/test-aio.c
index 03aa846970..5be99f8287 100644
--- a/tests/test-aio.c
+++ b/tests/test-aio.c
@@ -100,6 +100,7 @@ static void event_ready_cb(EventNotifier *e)
 
 typedef struct {
     QemuMutex start_lock;
+    EventNotifier notifier;
     bool thread_acquired;
 } AcquireTestData;
 
@@ -111,6 +112,11 @@ static void *test_acquire_thread(void *opaque)
     qemu_mutex_lock(&data->start_lock);
     qemu_mutex_unlock(&data->start_lock);
 
+    /* event_notifier_set might be called either before or after
+     * the main thread's call to poll().  The test case's outcome
+     * should be the same in either case.
+     */
+    event_notifier_set(&data->notifier);
     aio_context_acquire(ctx);
     aio_context_release(ctx);
 
@@ -125,20 +131,19 @@ static void set_event_notifier(AioContext *ctx, EventNotifier *notifier,
     aio_set_event_notifier(ctx, notifier, false, handler);
 }
 
-static void dummy_notifier_read(EventNotifier *unused)
+static void dummy_notifier_read(EventNotifier *n)
 {
-    g_assert(false); /* should never be invoked */
+    event_notifier_test_and_clear(n);
 }
 
 static void test_acquire(void)
 {
     QemuThread thread;
-    EventNotifier notifier;
     AcquireTestData data;
 
     /* Dummy event notifier ensures aio_poll() will block */
-    event_notifier_init(&notifier, false);
-    set_event_notifier(ctx, &notifier, dummy_notifier_read);
+    event_notifier_init(&data.notifier, false);
+    set_event_notifier(ctx, &data.notifier, dummy_notifier_read);
     g_assert(!aio_poll(ctx, false)); /* consume aio_notify() */
 
     qemu_mutex_init(&data.start_lock);
@@ -152,12 +157,13 @@ static void test_acquire(void)
     /* Block in aio_poll(), let other thread kick us and acquire context */
     aio_context_acquire(ctx);
     qemu_mutex_unlock(&data.start_lock); /* let the thread run */
-    g_assert(!aio_poll(ctx, true));
+    g_assert(aio_poll(ctx, true));
+    g_assert(!data.thread_acquired);
     aio_context_release(ctx);
 
     qemu_thread_join(&thread);
-    set_event_notifier(ctx, &notifier, NULL);
-    event_notifier_cleanup(&notifier);
+    set_event_notifier(ctx, &data.notifier, NULL);
+    event_notifier_cleanup(&data.notifier);
 
     g_assert(data.thread_acquired);
 }
diff --git a/tests/test-blockjob-txn.c b/tests/test-blockjob-txn.c
index d049cba8a3..f9afc3be41 100644
--- a/tests/test-blockjob-txn.c
+++ b/tests/test-blockjob-txn.c
@@ -13,7 +13,7 @@
 #include "qemu/osdep.h"
 #include "qapi/error.h"
 #include "qemu/main-loop.h"
-#include "block/blockjob.h"
+#include "block/blockjob_int.h"
 #include "sysemu/block-backend.h"
 
 typedef struct {
@@ -98,7 +98,8 @@ static BlockJob *test_block_job_start(unsigned int iterations,
     bs = bdrv_new();
     snprintf(job_id, sizeof(job_id), "job%u", counter++);
     s = block_job_create(job_id, &test_block_job_driver, bs, 0,
-                         test_block_job_cb, data, &error_abort);
+                         BLOCK_JOB_DEFAULT, test_block_job_cb,
+                         data, &error_abort);
     s->iterations = iterations;
     s->use_timer = use_timer;
     s->rc = rc;
diff --git a/tests/test-blockjob.c b/tests/test-blockjob.c
index 5b0e934a0c..60b78a3342 100644
--- a/tests/test-blockjob.c
+++ b/tests/test-blockjob.c
@@ -13,7 +13,7 @@
 #include "qemu/osdep.h"
 #include "qapi/error.h"
 #include "qemu/main-loop.h"
-#include "block/blockjob.h"
+#include "block/blockjob_int.h"
 #include "sysemu/block-backend.h"
 
 static const BlockJobDriver test_block_job_driver = {
@@ -31,7 +31,7 @@ static BlockJob *do_test_id(BlockBackend *blk, const char *id,
     Error *errp = NULL;
 
     job = block_job_create(id, &test_block_job_driver, blk_bs(blk), 0,
-                           block_job_cb, NULL, &errp);
+                           BLOCK_JOB_DEFAULT, block_job_cb, NULL, &errp);
     if (should_succeed) {
         g_assert_null(errp);
         g_assert_nonnull(job);
diff --git a/tests/test-rfifolock.c b/tests/test-rfifolock.c
deleted file mode 100644
index 471a81114d..0000000000
--- a/tests/test-rfifolock.c
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * RFifoLock tests
- *
- * Copyright Red Hat, Inc. 2013
- *
- * Authors:
- *  Stefan Hajnoczi    <stefanha@redhat.com>
- *
- * This work is licensed under the terms of the GNU LGPL, version 2 or later.
- * See the COPYING.LIB file in the top-level directory.
- */
-
-#include "qemu/osdep.h"
-#include "qemu-common.h"
-#include "qemu/rfifolock.h"
-
-static void test_nesting(void)
-{
-    RFifoLock lock;
-
-    /* Trivial test, ensure the lock is recursive */
-    rfifolock_init(&lock, NULL, NULL);
-    rfifolock_lock(&lock);
-    rfifolock_lock(&lock);
-    rfifolock_lock(&lock);
-    rfifolock_unlock(&lock);
-    rfifolock_unlock(&lock);
-    rfifolock_unlock(&lock);
-    rfifolock_destroy(&lock);
-}
-
-typedef struct {
-    RFifoLock lock;
-    int fd[2];
-} CallbackTestData;
-
-static void rfifolock_cb(void *opaque)
-{
-    CallbackTestData *data = opaque;
-    int ret;
-    char c = 0;
-
-    ret = write(data->fd[1], &c, sizeof(c));
-    g_assert(ret == 1);
-}
-
-static void *callback_thread(void *opaque)
-{
-    CallbackTestData *data = opaque;
-
-    /* The other thread holds the lock so the contention callback will be
-     * invoked...
-     */
-    rfifolock_lock(&data->lock);
-    rfifolock_unlock(&data->lock);
-    return NULL;
-}
-
-static void test_callback(void)
-{
-    CallbackTestData data;
-    QemuThread thread;
-    int ret;
-    char c;
-
-    rfifolock_init(&data.lock, rfifolock_cb, &data);
-    ret = qemu_pipe(data.fd);
-    g_assert(ret == 0);
-
-    /* Hold lock but allow the callback to kick us by writing to the pipe */
-    rfifolock_lock(&data.lock);
-    qemu_thread_create(&thread, "callback_thread",
-                       callback_thread, &data, QEMU_THREAD_JOINABLE);
-    ret = read(data.fd[0], &c, sizeof(c));
-    g_assert(ret == 1);
-    rfifolock_unlock(&data.lock);
-    /* If we got here then the callback was invoked, as expected */
-
-    qemu_thread_join(&thread);
-    close(data.fd[0]);
-    close(data.fd[1]);
-    rfifolock_destroy(&data.lock);
-}
-
-int main(int argc, char **argv)
-{
-    g_test_init(&argc, &argv, NULL);
-    g_test_add_func("/nesting", test_nesting);
-    g_test_add_func("/callback", test_callback);
-    return g_test_run();
-}
diff --git a/tests/test-x86-cpuid-compat.c b/tests/test-x86-cpuid-compat.c
index 260dd2795c..79a2e69a28 100644
--- a/tests/test-x86-cpuid-compat.c
+++ b/tests/test-x86-cpuid-compat.c
@@ -35,6 +35,7 @@ static QObject *qom_get(const char *path, const char *prop)
     return ret;
 }
 
+#ifdef CONFIG_HAS_GLIB_SUBPROCESS_TESTS
 static bool qom_get_bool(const char *path, const char *prop)
 {
     QBool *value = qobject_to_qbool(qom_get(path, prop));
@@ -43,6 +44,7 @@ static bool qom_get_bool(const char *path, const char *prop)
     QDECREF(value);
     return b;
 }
+#endif
 
 typedef struct CpuidTestArgs {
     const char *cmdline;
@@ -76,7 +78,8 @@ static void add_cpuid_test(const char *name, const char *cmdline,
     qtest_add_data_func(name, args, test_cpuid_prop);
 }
 
-static void test_plus_minus(void)
+#ifdef CONFIG_HAS_GLIB_SUBPROCESS_TESTS
+static void test_plus_minus_subprocess(void)
 {
     char *path;
 
@@ -86,9 +89,8 @@ static void test_plus_minus(void)
      * 3) Old feature names with underscores (e.g. "sse4_2")
      *    should keep working
      *
-     * Note: rules 1 and 2 are planned to be removed soon, but we
-     * need to keep compatibility for a while until we start
-     * warning users about it.
+     * Note: rules 1 and 2 are planned to be removed soon, and
+     * should generate a warning.
      */
     qtest_start("-cpu pentium,-fpu,+fpu,-mce,mce=on,+cx8,cx8=off,+sse4_1,sse4_2=on");
     path = get_cpu0_qom_path();
@@ -108,11 +110,27 @@ static void test_plus_minus(void)
     g_free(path);
 }
 
+static void test_plus_minus(void)
+{
+    g_test_trap_subprocess("/x86/cpuid/parsing-plus-minus/subprocess", 0, 0);
+    g_test_trap_assert_passed();
+    g_test_trap_assert_stderr("*Ambiguous CPU model string. "
+                              "Don't mix both \"-mce\" and \"mce=on\"*");
+    g_test_trap_assert_stderr("*Ambiguous CPU model string. "
+                              "Don't mix both \"+cx8\" and \"cx8=off\"*");
+    g_test_trap_assert_stdout("");
+}
+#endif
+
 int main(int argc, char **argv)
 {
     g_test_init(&argc, &argv, NULL);
 
-    qtest_add_func("x86/cpuid/parsing-plus-minus", test_plus_minus);
+#ifdef CONFIG_HAS_GLIB_SUBPROCESS_TESTS
+    g_test_add_func("/x86/cpuid/parsing-plus-minus/subprocess",
+                    test_plus_minus_subprocess);
+    g_test_add_func("/x86/cpuid/parsing-plus-minus", test_plus_minus);
+#endif
 
     /* Original level values for CPU models: */
     add_cpuid_test("x86/cpuid/phenom/level",
diff --git a/tests/vhost-user-test.c b/tests/vhost-user-test.c
index e4b2900898..96bf00eefa 100644
--- a/tests/vhost-user-test.c
+++ b/tests/vhost-user-test.c
@@ -22,8 +22,6 @@
 #include "libqos/virtio-pci.h"
 #include "qapi/error.h"
 
-#include "libqos/pci-pc.h"
-#include "libqos/virtio-pci.h"
 #include "libqos/malloc-pc.h"
 #include "hw/virtio/virtio-net.h"
 
diff --git a/trace-events b/trace-events
index 8ecded5150..f74e1d3d22 100644
--- a/trace-events
+++ b/trace-events
@@ -121,6 +121,8 @@ memory_region_subpage_read(int cpu_index, void *mr, uint64_t offset, uint64_t va
 memory_region_subpage_write(int cpu_index, void *mr, uint64_t offset, uint64_t value, unsigned size) "cpu %d mr %p offset %#"PRIx64" value %#"PRIx64" size %u"
 memory_region_tb_read(int cpu_index, uint64_t addr, uint64_t value, unsigned size) "cpu %d addr %#"PRIx64" value %#"PRIx64" size %u"
 memory_region_tb_write(int cpu_index, uint64_t addr, uint64_t value, unsigned size) "cpu %d addr %#"PRIx64" value %#"PRIx64" size %u"
+memory_region_ram_device_read(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size) "cpu %d mr %p addr %#"PRIx64" value %#"PRIx64" size %u"
+memory_region_ram_device_write(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size) "cpu %d mr %p addr %#"PRIx64" value %#"PRIx64" size %u"
 
 ### Guest events, keep at bottom
 
diff --git a/translate-all.c b/translate-all.c
index 76fc18c98f..3dd9214904 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -31,6 +31,7 @@
 #include "tcg.h"
 #if defined(CONFIG_USER_ONLY)
 #include "qemu.h"
+#include "exec/exec-all.h"
 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
 #include <sys/param.h>
 #if __FreeBSD_version >= 700104
@@ -56,16 +57,39 @@
 #include "qemu/timer.h"
 #include "exec/log.h"
 
-//#define DEBUG_TB_INVALIDATE
-//#define DEBUG_FLUSH
+/* #define DEBUG_TB_INVALIDATE */
+/* #define DEBUG_TB_FLUSH */
+/* #define DEBUG_LOCKING */
 /* make various TB consistency checks */
-//#define DEBUG_TB_CHECK
+/* #define DEBUG_TB_CHECK */
 
 #if !defined(CONFIG_USER_ONLY)
 /* TB consistency checks only implemented for usermode emulation.  */
 #undef DEBUG_TB_CHECK
 #endif
 
+/* Access to the various translations structures need to be serialised via locks
+ * for consistency. This is automatic for SoftMMU based system
+ * emulation due to its single threaded nature. In user-mode emulation
+ * access to the memory related structures are protected with the
+ * mmap_lock.
+ */
+#ifdef DEBUG_LOCKING
+#define DEBUG_MEM_LOCKS 1
+#else
+#define DEBUG_MEM_LOCKS 0
+#endif
+
+#ifdef CONFIG_SOFTMMU
+#define assert_memory_lock() do { /* nothing */ } while (0)
+#else
+#define assert_memory_lock() do {               \
+        if (DEBUG_MEM_LOCKS) {                  \
+            g_assert(have_mmap_lock());         \
+        }                                       \
+    } while (0)
+#endif
+
 #define SMC_BITMAP_USE_THRESHOLD 10
 
 typedef struct PageDesc {
@@ -173,6 +197,23 @@ void tb_lock_reset(void)
 #endif
 }
 
+#ifdef DEBUG_LOCKING
+#define DEBUG_TB_LOCKS 1
+#else
+#define DEBUG_TB_LOCKS 0
+#endif
+
+#ifdef CONFIG_SOFTMMU
+#define assert_tb_lock() do { /* nothing */ } while (0)
+#else
+#define assert_tb_lock() do {               \
+        if (DEBUG_TB_LOCKS) {               \
+            g_assert(have_tb_lock);         \
+        }                                   \
+    } while (0)
+#endif
+
+
 static TranslationBlock *tb_find_pc(uintptr_t tc_ptr);
 
 void cpu_gen_init(void)
@@ -267,7 +308,9 @@ static int encode_search(TranslationBlock *tb, uint8_t *block)
     return p - block;
 }
 
-/* The cpu state corresponding to 'searched_pc' is restored.  */
+/* The cpu state corresponding to 'searched_pc' is restored.
+ * Called with tb_lock held.
+ */
 static int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
                                      uintptr_t searched_pc)
 {
@@ -320,7 +363,9 @@ static int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
 bool cpu_restore_state(CPUState *cpu, uintptr_t retaddr)
 {
     TranslationBlock *tb;
+    bool r = false;
 
+    tb_lock();
     tb = tb_find_pc(retaddr);
     if (tb) {
         cpu_restore_state_from_tb(cpu, tb, retaddr);
@@ -329,9 +374,11 @@ bool cpu_restore_state(CPUState *cpu, uintptr_t retaddr)
             tb_phys_invalidate(tb, -1);
             tb_free(tb);
         }
-        return true;
+        r = true;
     }
-    return false;
+    tb_unlock();
+
+    return r;
 }
 
 void page_size_init(void)
@@ -421,6 +468,7 @@ static void page_init(void)
 }
 
 /* If alloc=1:
+ * Called with tb_lock held for system emulation.
  * Called with mmap_lock held for user-mode emulation.
  */
 static PageDesc *page_find_alloc(tb_page_addr_t index, int alloc)
@@ -429,6 +477,10 @@ static PageDesc *page_find_alloc(tb_page_addr_t index, int alloc)
     void **lp;
     int i;
 
+    if (alloc) {
+        assert_memory_lock();
+    }
+
     /* Level 1.  Always allocated.  */
     lp = l1_map + ((index >> v_l1_shift) & (v_l1_size - 1));
 
@@ -785,12 +837,18 @@ bool tcg_enabled(void)
     return tcg_ctx.code_gen_buffer != NULL;
 }
 
-/* Allocate a new translation block. Flush the translation buffer if
-   too many translation blocks or too much generated code. */
+/*
+ * Allocate a new translation block. Flush the translation buffer if
+ * too many translation blocks or too much generated code.
+ *
+ * Called with tb_lock held.
+ */
 static TranslationBlock *tb_alloc(target_ulong pc)
 {
     TranslationBlock *tb;
 
+    assert_tb_lock();
+
     if (tcg_ctx.tb_ctx.nb_tbs >= tcg_ctx.code_gen_max_blocks) {
         return NULL;
     }
@@ -801,8 +859,11 @@ static TranslationBlock *tb_alloc(target_ulong pc)
     return tb;
 }
 
+/* Called with tb_lock held.  */
 void tb_free(TranslationBlock *tb)
 {
+    assert_tb_lock();
+
     /* In practice this is mostly used for single use temporary TB
        Ignore the hard cases and just back up if this TB happens to
        be the last one generated.  */
@@ -856,20 +917,18 @@ static void page_flush_tb(void)
 }
 
 /* flush all the translation blocks */
-static void do_tb_flush(CPUState *cpu, void *data)
+static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count)
 {
-    unsigned tb_flush_req = (unsigned) (uintptr_t) data;
-
     tb_lock();
 
-    /* If it's already been done on request of another CPU,
+    /* If it is already been done on request of another CPU,
      * just retry.
      */
-    if (tcg_ctx.tb_ctx.tb_flush_count != tb_flush_req) {
+    if (tcg_ctx.tb_ctx.tb_flush_count != tb_flush_count.host_int) {
         goto done;
     }
 
-#if defined(DEBUG_FLUSH)
+#if defined(DEBUG_TB_FLUSH)
     printf("qemu: flush code_size=%ld nb_tbs=%d avg_tb_size=%ld\n",
            (unsigned long)(tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer),
            tcg_ctx.tb_ctx.nb_tbs, tcg_ctx.tb_ctx.nb_tbs > 0 ?
@@ -906,8 +965,9 @@ done:
 void tb_flush(CPUState *cpu)
 {
     if (tcg_enabled()) {
-        uintptr_t tb_flush_req = atomic_mb_read(&tcg_ctx.tb_ctx.tb_flush_count);
-        async_safe_run_on_cpu(cpu, do_tb_flush, (void *) tb_flush_req);
+        unsigned tb_flush_count = atomic_mb_read(&tcg_ctx.tb_ctx.tb_flush_count);
+        async_safe_run_on_cpu(cpu, do_tb_flush,
+                              RUN_ON_CPU_HOST_INT(tb_flush_count));
     }
 }
 
@@ -925,6 +985,10 @@ do_tb_invalidate_check(struct qht *ht, void *p, uint32_t hash, void *userp)
     }
 }
 
+/* verify that all the pages have correct rights for code
+ *
+ * Called with tb_lock held.
+ */
 static void tb_invalidate_check(target_ulong address)
 {
     address &= TARGET_PAGE_MASK;
@@ -1029,7 +1093,10 @@ static inline void tb_jmp_unlink(TranslationBlock *tb)
     }
 }
 
-/* invalidate one TB */
+/* invalidate one TB
+ *
+ * Called with tb_lock held.
+ */
 void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr)
 {
     CPUState *cpu;
@@ -1037,6 +1104,8 @@ void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr)
     uint32_t h;
     tb_page_addr_t phys_pc;
 
+    assert_tb_lock();
+
     atomic_set(&tb->invalid, true);
 
     /* remove the TB from the hash list */
@@ -1094,7 +1163,7 @@ static void build_page_bitmap(PageDesc *p)
             tb_end = tb_start + tb->size;
             if (tb_end > TARGET_PAGE_SIZE) {
                 tb_end = TARGET_PAGE_SIZE;
-            }
+             }
         } else {
             tb_start = 0;
             tb_end = ((tb->pc + tb->size) & ~TARGET_PAGE_MASK);
@@ -1117,6 +1186,8 @@ static inline void tb_alloc_page(TranslationBlock *tb,
     bool page_already_protected;
 #endif
 
+    assert_memory_lock();
+
     tb->page_addr[n] = page_addr;
     p = page_find_alloc(page_addr >> TARGET_PAGE_BITS, 1);
     tb->page_next[n] = p->first_tb;
@@ -1173,6 +1244,8 @@ static void tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
 {
     uint32_t h;
 
+    assert_memory_lock();
+
     /* add in the page list */
     tb_alloc_page(tb, 0, phys_pc & TARGET_PAGE_MASK);
     if (phys_page2 != -1) {
@@ -1204,6 +1277,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
 #ifdef CONFIG_PROFILER
     int64_t ti;
 #endif
+    assert_memory_lock();
 
     phys_pc = get_page_addr_code(env, pc);
     if (use_icount && !(cflags & CF_IGNORE_ICOUNT)) {
@@ -1281,10 +1355,12 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
 #ifdef DEBUG_DISAS
     if (qemu_loglevel_mask(CPU_LOG_TB_OUT_ASM) &&
         qemu_log_in_addr_range(tb->pc)) {
+        qemu_log_lock();
         qemu_log("OUT: [size=%d]\n", gen_code_size);
         log_disas(tb->tc_ptr, gen_code_size);
         qemu_log("\n");
         qemu_log_flush();
+        qemu_log_unlock();
     }
 #endif
 
@@ -1328,9 +1404,10 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
  * access: the virtual CPU will exit the current TB if code is modified inside
  * this TB.
  *
- * Called with mmap_lock held for user-mode emulation
+ * Called with mmap_lock held for user-mode emulation, grabs tb_lock
+ * Called with tb_lock held for system-mode emulation
  */
-void tb_invalidate_phys_range(tb_page_addr_t start, tb_page_addr_t end)
+static void tb_invalidate_phys_range_1(tb_page_addr_t start, tb_page_addr_t end)
 {
     while (start < end) {
         tb_invalidate_phys_page_range(start, end, 0);
@@ -1339,6 +1416,21 @@ void tb_invalidate_phys_range(tb_page_addr_t start, tb_page_addr_t end)
     }
 }
 
+#ifdef CONFIG_SOFTMMU
+void tb_invalidate_phys_range(tb_page_addr_t start, tb_page_addr_t end)
+{
+    assert_tb_lock();
+    tb_invalidate_phys_range_1(start, end);
+}
+#else
+void tb_invalidate_phys_range(tb_page_addr_t start, tb_page_addr_t end)
+{
+    assert_memory_lock();
+    tb_lock();
+    tb_invalidate_phys_range_1(start, end);
+    tb_unlock();
+}
+#endif
 /*
  * Invalidate all TBs which intersect with the target physical address range
  * [start;end[. NOTE: start and end must refer to the *same* physical page.
@@ -1346,7 +1438,8 @@ void tb_invalidate_phys_range(tb_page_addr_t start, tb_page_addr_t end)
  * access: the virtual CPU will exit the current TB if code is modified inside
  * this TB.
  *
- * Called with mmap_lock held for user-mode emulation
+ * Called with tb_lock/mmap_lock held for user-mode emulation
+ * Called with tb_lock held for system-mode emulation
  */
 void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end,
                                    int is_cpu_write_access)
@@ -1368,6 +1461,9 @@ void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end,
     uint32_t current_flags = 0;
 #endif /* TARGET_HAS_PRECISE_SMC */
 
+    assert_memory_lock();
+    assert_tb_lock();
+
     p = page_find(start >> TARGET_PAGE_BITS);
     if (!p) {
         return;
@@ -1443,7 +1539,10 @@ void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end,
 }
 
 #ifdef CONFIG_SOFTMMU
-/* len must be <= 8 and start must be a multiple of len */
+/* len must be <= 8 and start must be a multiple of len.
+ * Called via softmmu_template.h when code areas are written to with
+ * tb_lock held.
+ */
 void tb_invalidate_phys_page_fast(tb_page_addr_t start, int len)
 {
     PageDesc *p;
@@ -1457,13 +1556,17 @@ void tb_invalidate_phys_page_fast(tb_page_addr_t start, int len)
                   (intptr_t)cpu_single_env->segs[R_CS].base);
     }
 #endif
+    assert_memory_lock();
+
     p = page_find(start >> TARGET_PAGE_BITS);
     if (!p) {
         return;
     }
     if (!p->code_bitmap &&
         ++p->code_write_count >= SMC_BITMAP_USE_THRESHOLD) {
-        /* build code bitmap */
+        /* build code bitmap.  FIXME: writes should be protected by
+         * tb_lock, reads by tb_lock or RCU.
+         */
         build_page_bitmap(p);
     }
     if (p->code_bitmap) {
@@ -1502,11 +1605,15 @@ static bool tb_invalidate_phys_page(tb_page_addr_t addr, uintptr_t pc)
     uint32_t current_flags = 0;
 #endif
 
+    assert_memory_lock();
+
     addr &= TARGET_PAGE_MASK;
     p = page_find(addr >> TARGET_PAGE_BITS);
     if (!p) {
         return false;
     }
+
+    tb_lock();
     tb = p->first_tb;
 #ifdef TARGET_HAS_PRECISE_SMC
     if (tb && pc != 0) {
@@ -1544,9 +1651,13 @@ static bool tb_invalidate_phys_page(tb_page_addr_t addr, uintptr_t pc)
            modifying the memory. It will ensure that it cannot modify
            itself */
         tb_gen_code(cpu, current_pc, current_cs_base, current_flags, 1);
+        /* tb_lock will be reset after cpu_loop_exit_noexc longjmps
+         * back into the cpu_exec loop. */
         return true;
     }
 #endif
+    tb_unlock();
+
     return false;
 }
 #endif
@@ -1599,11 +1710,14 @@ void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr)
         return;
     }
     ram_addr = memory_region_get_ram_addr(mr) + addr;
+    tb_lock();
     tb_invalidate_phys_page_range(ram_addr, ram_addr + 1, 0);
+    tb_unlock();
     rcu_read_unlock();
 }
 #endif /* !defined(CONFIG_USER_ONLY) */
 
+/* Called with tb_lock held.  */
 void tb_check_watchpoint(CPUState *cpu)
 {
     TranslationBlock *tb;
@@ -1640,6 +1754,7 @@ void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr)
     target_ulong pc, cs_base;
     uint32_t flags;
 
+    tb_lock();
     tb = tb_find_pc(retaddr);
     if (!tb) {
         cpu_abort(cpu, "cpu_io_recompile: could not find TB for pc=%p",
@@ -1691,11 +1806,16 @@ void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr)
     /* FIXME: In theory this could raise an exception.  In practice
        we have already translated the block once so it's probably ok.  */
     tb_gen_code(cpu, pc, cs_base, flags, cflags);
+
     /* TODO: If env->pc != tb->pc (i.e. the faulting instruction was not
-       the first in the TB) then we end up generating a whole new TB and
-       repeating the fault, which is horribly inefficient.
-       Better would be to execute just this insn uncached, or generate a
-       second new TB.  */
+     * the first in the TB) then we end up generating a whole new TB and
+     *  repeating the fault, which is horribly inefficient.
+     *  Better would be to execute just this insn uncached, or generate a
+     *  second new TB.
+     *
+     * cpu_loop_exit_noexc will longjmp back to cpu_exec where the
+     * tb_lock gets reset.
+     */
     cpu_loop_exit_noexc(cpu);
 }
 
@@ -1759,6 +1879,8 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf)
     TranslationBlock *tb;
     struct qht_stats hst;
 
+    tb_lock();
+
     target_code_size = 0;
     max_target_code_size = 0;
     cross_page = 0;
@@ -1820,6 +1942,8 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf)
             tcg_ctx.tb_ctx.tb_phys_invalidate_count);
     cpu_fprintf(f, "TLB flush count     %d\n", tlb_flush_count);
     tcg_dump_info(f, cpu_fprintf);
+
+    tb_unlock();
 }
 
 void dump_opcount_info(FILE *f, fprintf_function cpu_fprintf)
@@ -1972,6 +2096,7 @@ void page_set_flags(target_ulong start, target_ulong end, int flags)
     assert(end < ((target_ulong)1 << L1_MAP_ADDR_SPACE_BITS));
 #endif
     assert(start < end);
+    assert_memory_lock();
 
     start = start & TARGET_PAGE_MASK;
     end = TARGET_PAGE_ALIGN(end);
diff --git a/util/Makefile.objs b/util/Makefile.objs
index 36c7dcc1fa..ad0f9c7fe4 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -25,7 +25,6 @@ util-obj-y += uuid.o
 util-obj-y += throttle.o
 util-obj-y += getauxval.o
 util-obj-y += readline.o
-util-obj-y += rfifolock.o
 util-obj-y += rcu.o
 util-obj-y += qemu-coroutine.o qemu-coroutine-lock.o qemu-coroutine-io.o
 util-obj-y += qemu-coroutine-sleep.o
diff --git a/util/oslib-posix.c b/util/oslib-posix.c
index 8ec99ccb4f..67c65893a4 100644
--- a/util/oslib-posix.c
+++ b/util/oslib-posix.c
@@ -28,7 +28,6 @@
 
 #include "qemu/osdep.h"
 #include <termios.h>
-#include <termios.h>
 
 #include <glib/gprintf.h>
 
diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c
index 4cef549d90..fe1d07aaef 100644
--- a/util/qemu-sockets.c
+++ b/util/qemu-sockets.c
@@ -17,6 +17,10 @@
  */
 #include "qemu/osdep.h"
 
+#ifdef CONFIG_AF_VSOCK
+#include <linux/vm_sockets.h>
+#endif /* CONFIG_AF_VSOCK */
+
 #include "monitor/monitor.h"
 #include "qapi/error.h"
 #include "qemu/sockets.h"
@@ -75,6 +79,9 @@ NetworkAddressFamily inet_netfamily(int family)
     case PF_INET6: return NETWORK_ADDRESS_FAMILY_IPV6;
     case PF_INET:  return NETWORK_ADDRESS_FAMILY_IPV4;
     case PF_UNIX:  return NETWORK_ADDRESS_FAMILY_UNIX;
+#ifdef CONFIG_AF_VSOCK
+    case PF_VSOCK: return NETWORK_ADDRESS_FAMILY_VSOCK;
+#endif /* CONFIG_AF_VSOCK */
     }
     return NETWORK_ADDRESS_FAMILY_UNKNOWN;
 }
@@ -412,8 +419,8 @@ static struct addrinfo *inet_parse_connect_saddr(InetSocketAddress *saddr,
  * function succeeds, callback will be called when the connection
  * completes, with the file descriptor on success, or -1 on error.
  */
-static int inet_connect_saddr(InetSocketAddress *saddr, Error **errp,
-                              NonBlockingConnectHandler *callback, void *opaque)
+int inet_connect_saddr(InetSocketAddress *saddr, Error **errp,
+                       NonBlockingConnectHandler *callback, void *opaque)
 {
     Error *local_err = NULL;
     struct addrinfo *res, *e;
@@ -650,6 +657,181 @@ int inet_connect(const char *str, Error **errp)
     return sock;
 }
 
+#ifdef CONFIG_AF_VSOCK
+static bool vsock_parse_vaddr_to_sockaddr(const VsockSocketAddress *vaddr,
+                                          struct sockaddr_vm *svm,
+                                          Error **errp)
+{
+    unsigned long long val;
+
+    memset(svm, 0, sizeof(*svm));
+    svm->svm_family = AF_VSOCK;
+
+    if (parse_uint_full(vaddr->cid, &val, 10) < 0 ||
+        val > UINT32_MAX) {
+        error_setg(errp, "Failed to parse cid '%s'", vaddr->cid);
+        return false;
+    }
+    svm->svm_cid = val;
+
+    if (parse_uint_full(vaddr->port, &val, 10) < 0 ||
+        val > UINT32_MAX) {
+        error_setg(errp, "Failed to parse port '%s'", vaddr->port);
+        return false;
+    }
+    svm->svm_port = val;
+
+    return true;
+}
+
+static int vsock_connect_addr(const struct sockaddr_vm *svm, bool *in_progress,
+                              ConnectState *connect_state, Error **errp)
+{
+    int sock, rc;
+
+    *in_progress = false;
+
+    sock = qemu_socket(AF_VSOCK, SOCK_STREAM, 0);
+    if (sock < 0) {
+        error_setg_errno(errp, errno, "Failed to create socket");
+        return -1;
+    }
+    if (connect_state != NULL) {
+        qemu_set_nonblock(sock);
+    }
+    /* connect to peer */
+    do {
+        rc = 0;
+        if (connect(sock, (const struct sockaddr *)svm, sizeof(*svm)) < 0) {
+            rc = -errno;
+        }
+    } while (rc == -EINTR);
+
+    if (connect_state != NULL && QEMU_SOCKET_RC_INPROGRESS(rc)) {
+        connect_state->fd = sock;
+        qemu_set_fd_handler(sock, NULL, wait_for_connect, connect_state);
+        *in_progress = true;
+    } else if (rc < 0) {
+        error_setg_errno(errp, errno, "Failed to connect socket");
+        closesocket(sock);
+        return -1;
+    }
+    return sock;
+}
+
+static int vsock_connect_saddr(VsockSocketAddress *vaddr, Error **errp,
+                               NonBlockingConnectHandler *callback,
+                               void *opaque)
+{
+    struct sockaddr_vm svm;
+    int sock = -1;
+    bool in_progress;
+    ConnectState *connect_state = NULL;
+
+    if (!vsock_parse_vaddr_to_sockaddr(vaddr, &svm, errp)) {
+        return -1;
+    }
+
+    if (callback != NULL) {
+        connect_state = g_malloc0(sizeof(*connect_state));
+        connect_state->callback = callback;
+        connect_state->opaque = opaque;
+    }
+
+    sock = vsock_connect_addr(&svm, &in_progress, connect_state, errp);
+    if (sock < 0) {
+        /* do nothing */
+    } else if (in_progress) {
+        /* wait_for_connect() will do the rest */
+        return sock;
+    } else {
+        if (callback) {
+            callback(sock, NULL, opaque);
+        }
+    }
+    g_free(connect_state);
+    return sock;
+}
+
+static int vsock_listen_saddr(VsockSocketAddress *vaddr,
+                              Error **errp)
+{
+    struct sockaddr_vm svm;
+    int slisten;
+
+    if (!vsock_parse_vaddr_to_sockaddr(vaddr, &svm, errp)) {
+        return -1;
+    }
+
+    slisten = qemu_socket(AF_VSOCK, SOCK_STREAM, 0);
+    if (slisten < 0) {
+        error_setg_errno(errp, errno, "Failed to create socket");
+        return -1;
+    }
+
+    if (bind(slisten, (const struct sockaddr *)&svm, sizeof(svm)) != 0) {
+        error_setg_errno(errp, errno, "Failed to bind socket");
+        closesocket(slisten);
+        return -1;
+    }
+
+    if (listen(slisten, 1) != 0) {
+        error_setg_errno(errp, errno, "Failed to listen on socket");
+        closesocket(slisten);
+        return -1;
+    }
+    return slisten;
+}
+
+static VsockSocketAddress *vsock_parse(const char *str, Error **errp)
+{
+    VsockSocketAddress *addr = NULL;
+    char cid[33];
+    char port[33];
+    int n;
+
+    if (sscanf(str, "%32[^:]:%32[^,]%n", cid, port, &n) != 2) {
+        error_setg(errp, "error parsing address '%s'", str);
+        return NULL;
+    }
+    if (str[n] != '\0') {
+        error_setg(errp, "trailing characters in address '%s'", str);
+        return NULL;
+    }
+
+    addr = g_new0(VsockSocketAddress, 1);
+    addr->cid = g_strdup(cid);
+    addr->port = g_strdup(port);
+    return addr;
+}
+#else
+static void vsock_unsupported(Error **errp)
+{
+    error_setg(errp, "socket family AF_VSOCK unsupported");
+}
+
+static int vsock_connect_saddr(VsockSocketAddress *vaddr, Error **errp,
+                               NonBlockingConnectHandler *callback,
+                               void *opaque)
+{
+    vsock_unsupported(errp);
+    return -1;
+}
+
+static int vsock_listen_saddr(VsockSocketAddress *vaddr,
+                              Error **errp)
+{
+    vsock_unsupported(errp);
+    return -1;
+}
+
+static VsockSocketAddress *vsock_parse(const char *str, Error **errp)
+{
+    vsock_unsupported(errp);
+    return NULL;
+}
+#endif /* CONFIG_AF_VSOCK */
+
 #ifndef _WIN32
 
 static int unix_listen_saddr(UnixSocketAddress *saddr,
@@ -864,6 +1046,12 @@ SocketAddress *socket_parse(const char *str, Error **errp)
             addr->u.fd.data = g_new(String, 1);
             addr->u.fd.data->str = g_strdup(str + 3);
         }
+    } else if (strstart(str, "vsock:", NULL)) {
+        addr->type = SOCKET_ADDRESS_KIND_VSOCK;
+        addr->u.vsock.data = vsock_parse(str + strlen("vsock:"), errp);
+        if (addr->u.vsock.data == NULL) {
+            goto fail;
+        }
     } else {
         addr->type = SOCKET_ADDRESS_KIND_INET;
         addr->u.inet.data = inet_parse(str, errp);
@@ -900,6 +1088,10 @@ int socket_connect(SocketAddress *addr, Error **errp,
         }
         break;
 
+    case SOCKET_ADDRESS_KIND_VSOCK:
+        fd = vsock_connect_saddr(addr->u.vsock.data, errp, callback, opaque);
+        break;
+
     default:
         abort();
     }
@@ -923,6 +1115,10 @@ int socket_listen(SocketAddress *addr, Error **errp)
         fd = monitor_get_fd(cur_mon, addr->u.fd.data->str, errp);
         break;
 
+    case SOCKET_ADDRESS_KIND_VSOCK:
+        fd = vsock_listen_saddr(addr->u.vsock.data, errp);
+        break;
+
     default:
         abort();
     }
@@ -1022,6 +1218,26 @@ socket_sockaddr_to_address_unix(struct sockaddr_storage *sa,
 }
 #endif /* WIN32 */
 
+#ifdef CONFIG_AF_VSOCK
+static SocketAddress *
+socket_sockaddr_to_address_vsock(struct sockaddr_storage *sa,
+                                 socklen_t salen,
+                                 Error **errp)
+{
+    SocketAddress *addr;
+    VsockSocketAddress *vaddr;
+    struct sockaddr_vm *svm = (struct sockaddr_vm *)sa;
+
+    addr = g_new0(SocketAddress, 1);
+    addr->type = SOCKET_ADDRESS_KIND_VSOCK;
+    addr->u.vsock.data = vaddr = g_new0(VsockSocketAddress, 1);
+    vaddr->cid = g_strdup_printf("%u", svm->svm_cid);
+    vaddr->port = g_strdup_printf("%u", svm->svm_port);
+
+    return addr;
+}
+#endif /* CONFIG_AF_VSOCK */
+
 SocketAddress *
 socket_sockaddr_to_address(struct sockaddr_storage *sa,
                            socklen_t salen,
@@ -1037,6 +1253,11 @@ socket_sockaddr_to_address(struct sockaddr_storage *sa,
         return socket_sockaddr_to_address_unix(sa, salen, errp);
 #endif /* WIN32 */
 
+#ifdef CONFIG_AF_VSOCK
+    case AF_VSOCK:
+        return socket_sockaddr_to_address_vsock(sa, salen, errp);
+#endif
+
     default:
         error_setg(errp, "socket family %d unsupported",
                    sa->ss_family);
@@ -1103,6 +1324,12 @@ char *socket_address_to_string(struct SocketAddress *addr, Error **errp)
         buf = g_strdup(addr->u.fd.data->str);
         break;
 
+    case SOCKET_ADDRESS_KIND_VSOCK:
+        buf = g_strdup_printf("%s:%s",
+                              addr->u.vsock.data->cid,
+                              addr->u.vsock.data->port);
+        break;
+
     default:
         error_setg(errp, "socket family %d unsupported",
                    addr->type);
diff --git a/util/qemu-thread-posix.c b/util/qemu-thread-posix.c
index ce51b37c1d..d20cddec0c 100644
--- a/util/qemu-thread-posix.c
+++ b/util/qemu-thread-posix.c
@@ -80,6 +80,20 @@ void qemu_mutex_unlock(QemuMutex *mutex)
         error_exit(err, __func__);
 }
 
+void qemu_rec_mutex_init(QemuRecMutex *mutex)
+{
+    int err;
+    pthread_mutexattr_t attr;
+
+    pthread_mutexattr_init(&attr);
+    pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
+    err = pthread_mutex_init(&mutex->lock, &attr);
+    pthread_mutexattr_destroy(&attr);
+    if (err) {
+        error_exit(err, __func__);
+    }
+}
+
 void qemu_cond_init(QemuCond *cond)
 {
     int err;
diff --git a/util/qemu-thread-win32.c b/util/qemu-thread-win32.c
index 072806f792..728e76b5b2 100644
--- a/util/qemu-thread-win32.c
+++ b/util/qemu-thread-win32.c
@@ -79,6 +79,31 @@ void qemu_mutex_unlock(QemuMutex *mutex)
     LeaveCriticalSection(&mutex->lock);
 }
 
+void qemu_rec_mutex_init(QemuRecMutex *mutex)
+{
+    InitializeCriticalSection(&mutex->lock);
+}
+
+void qemu_rec_mutex_destroy(QemuRecMutex *mutex)
+{
+    DeleteCriticalSection(&mutex->lock);
+}
+
+void qemu_rec_mutex_lock(QemuRecMutex *mutex)
+{
+    EnterCriticalSection(&mutex->lock);
+}
+
+int qemu_rec_mutex_trylock(QemuRecMutex *mutex)
+{
+    return !TryEnterCriticalSection(&mutex->lock);
+}
+
+void qemu_rec_mutex_unlock(QemuRecMutex *mutex)
+{
+    LeaveCriticalSection(&mutex->lock);
+}
+
 void qemu_cond_init(QemuCond *cond)
 {
     memset(cond, 0, sizeof(*cond));
diff --git a/util/rfifolock.c b/util/rfifolock.c
deleted file mode 100644
index 084c2f0ea1..0000000000
--- a/util/rfifolock.c
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Recursive FIFO lock
- *
- * Copyright Red Hat, Inc. 2013
- *
- * Authors:
- *  Stefan Hajnoczi   <stefanha@redhat.com>
- *
- * This work is licensed under the terms of the GNU LGPL, version 2 or later.
- * See the COPYING.LIB file in the top-level directory.
- *
- */
-
-#include "qemu/osdep.h"
-#include "qemu/rfifolock.h"
-
-void rfifolock_init(RFifoLock *r, void (*cb)(void *), void *opaque)
-{
-    qemu_mutex_init(&r->lock);
-    r->head = 0;
-    r->tail = 0;
-    qemu_cond_init(&r->cond);
-    r->nesting = 0;
-    r->cb = cb;
-    r->cb_opaque = opaque;
-}
-
-void rfifolock_destroy(RFifoLock *r)
-{
-    qemu_cond_destroy(&r->cond);
-    qemu_mutex_destroy(&r->lock);
-}
-
-/*
- * Theory of operation:
- *
- * In order to ensure FIFO ordering, implement a ticketlock.  Threads acquiring
- * the lock enqueue themselves by incrementing the tail index.  When the lock
- * is unlocked, the head is incremented and waiting threads are notified.
- *
- * Recursive locking does not take a ticket since the head is only incremented
- * when the outermost recursive caller unlocks.
- */
-void rfifolock_lock(RFifoLock *r)
-{
-    qemu_mutex_lock(&r->lock);
-
-    /* Take a ticket */
-    unsigned int ticket = r->tail++;
-
-    if (r->nesting > 0 && qemu_thread_is_self(&r->owner_thread)) {
-        r->tail--; /* put ticket back, we're nesting */
-    } else {
-        while (ticket != r->head) {
-            /* Invoke optional contention callback */
-            if (r->cb) {
-                r->cb(r->cb_opaque);
-            }
-            qemu_cond_wait(&r->cond, &r->lock);
-        }
-        qemu_thread_get_self(&r->owner_thread);
-    }
-
-    r->nesting++;
-    qemu_mutex_unlock(&r->lock);
-}
-
-void rfifolock_unlock(RFifoLock *r)
-{
-    qemu_mutex_lock(&r->lock);
-    assert(r->nesting > 0);
-    assert(qemu_thread_is_self(&r->owner_thread));
-    if (--r->nesting == 0) {
-        r->head++;
-        qemu_cond_broadcast(&r->cond);
-    }
-    qemu_mutex_unlock(&r->lock);
-}
diff --git a/vl.c b/vl.c
index 74dfe4eef9..368510fd8c 100644
--- a/vl.c
+++ b/vl.c
@@ -90,6 +90,7 @@ int main(int argc, char **argv)
 #include "audio/audio.h"
 #include "migration/migration.h"
 #include "sysemu/cpus.h"
+#include "migration/colo.h"
 #include "sysemu/kvm.h"
 #include "qapi/qmp/qjson.h"
 #include "qemu/option.h"
@@ -110,7 +111,6 @@ int main(int argc, char **argv)
 #include "trace.h"
 #include "trace/control.h"
 #include "qemu/queue.h"
-#include "sysemu/cpus.h"
 #include "sysemu/arch_init.h"
 
 #include "ui/qemu-spice.h"
@@ -613,6 +613,7 @@ static const RunStateTransition runstate_transitions_def[] = {
     { RUN_STATE_INMIGRATE, RUN_STATE_FINISH_MIGRATE },
     { RUN_STATE_INMIGRATE, RUN_STATE_PRELAUNCH },
     { RUN_STATE_INMIGRATE, RUN_STATE_POSTMIGRATE },
+    { RUN_STATE_INMIGRATE, RUN_STATE_COLO },
 
     { RUN_STATE_INTERNAL_ERROR, RUN_STATE_PAUSED },
     { RUN_STATE_INTERNAL_ERROR, RUN_STATE_FINISH_MIGRATE },
@@ -625,6 +626,7 @@ static const RunStateTransition runstate_transitions_def[] = {
     { RUN_STATE_PAUSED, RUN_STATE_RUNNING },
     { RUN_STATE_PAUSED, RUN_STATE_FINISH_MIGRATE },
     { RUN_STATE_PAUSED, RUN_STATE_PRELAUNCH },
+    { RUN_STATE_PAUSED, RUN_STATE_COLO},
 
     { RUN_STATE_POSTMIGRATE, RUN_STATE_RUNNING },
     { RUN_STATE_POSTMIGRATE, RUN_STATE_FINISH_MIGRATE },
@@ -637,10 +639,13 @@ static const RunStateTransition runstate_transitions_def[] = {
     { RUN_STATE_FINISH_MIGRATE, RUN_STATE_RUNNING },
     { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE },
     { RUN_STATE_FINISH_MIGRATE, RUN_STATE_PRELAUNCH },
+    { RUN_STATE_FINISH_MIGRATE, RUN_STATE_COLO},
 
     { RUN_STATE_RESTORE_VM, RUN_STATE_RUNNING },
     { RUN_STATE_RESTORE_VM, RUN_STATE_PRELAUNCH },
 
+    { RUN_STATE_COLO, RUN_STATE_RUNNING },
+
     { RUN_STATE_RUNNING, RUN_STATE_DEBUG },
     { RUN_STATE_RUNNING, RUN_STATE_INTERNAL_ERROR },
     { RUN_STATE_RUNNING, RUN_STATE_IO_ERROR },
@@ -651,6 +656,7 @@ static const RunStateTransition runstate_transitions_def[] = {
     { RUN_STATE_RUNNING, RUN_STATE_SHUTDOWN },
     { RUN_STATE_RUNNING, RUN_STATE_WATCHDOG },
     { RUN_STATE_RUNNING, RUN_STATE_GUEST_PANICKED },
+    { RUN_STATE_RUNNING, RUN_STATE_COLO},
 
     { RUN_STATE_SAVE_VM, RUN_STATE_RUNNING },
 
@@ -663,10 +669,12 @@ static const RunStateTransition runstate_transitions_def[] = {
     { RUN_STATE_SUSPENDED, RUN_STATE_RUNNING },
     { RUN_STATE_SUSPENDED, RUN_STATE_FINISH_MIGRATE },
     { RUN_STATE_SUSPENDED, RUN_STATE_PRELAUNCH },
+    { RUN_STATE_SUSPENDED, RUN_STATE_COLO},
 
     { RUN_STATE_WATCHDOG, RUN_STATE_RUNNING },
     { RUN_STATE_WATCHDOG, RUN_STATE_FINISH_MIGRATE },
     { RUN_STATE_WATCHDOG, RUN_STATE_PRELAUNCH },
+    { RUN_STATE_WATCHDOG, RUN_STATE_COLO},
 
     { RUN_STATE_GUEST_PANICKED, RUN_STATE_RUNNING },
     { RUN_STATE_GUEST_PANICKED, RUN_STATE_FINISH_MIGRATE },
@@ -2408,8 +2416,9 @@ static int mon_init_func(void *opaque, QemuOpts *opts, Error **errp)
     if (qemu_opt_get_bool(opts, "pretty", 0))
         flags |= MONITOR_USE_PRETTY;
 
-    if (qemu_opt_get_bool(opts, "default", 0))
-        flags |= MONITOR_IS_DEFAULT;
+    if (qemu_opt_get_bool(opts, "default", 0)) {
+        error_report("option 'default' does nothing and is deprecated");
+    }
 
     chardev = qemu_opt_get(opts, "chardev");
     chr = qemu_chr_find(chardev);
@@ -2428,16 +2437,12 @@ static void monitor_parse(const char *optarg, const char *mode, bool pretty)
     QemuOpts *opts;
     const char *p;
     char label[32];
-    int def = 0;
 
     if (strstart(optarg, "chardev:", &p)) {
         snprintf(label, sizeof(label), "%s", p);
     } else {
         snprintf(label, sizeof(label), "compat_monitor%d",
                  monitor_device_index);
-        if (monitor_device_index == 0) {
-            def = 1;
-        }
         opts = qemu_chr_parse_compat(label, optarg);
         if (!opts) {
             error_report("parse error: %s", optarg);
@@ -2449,8 +2454,6 @@ static void monitor_parse(const char *optarg, const char *mode, bool pretty)
     qemu_opt_set(opts, "mode", mode, &error_abort);
     qemu_opt_set(opts, "chardev", label, &error_abort);
     qemu_opt_set_bool(opts, "pretty", pretty, &error_abort);
-    if (def)
-        qemu_opt_set(opts, "default", "on", &error_abort);
     monitor_device_index++;
 }
 
@@ -4426,6 +4429,8 @@ int main(int argc, char **argv, char **envp)
 #endif
     }
 
+    colo_info_init();
+
     if (net_init_clients() < 0) {
         exit(1);
     }
diff --git a/xen-common.c b/xen-common.c
index e641ad1aef..909976071c 100644
--- a/xen-common.c
+++ b/xen-common.c
@@ -116,12 +116,12 @@ static int xen_init(MachineState *ms)
 {
     xen_xc = xc_interface_open(0, 0, 0);
     if (xen_xc == NULL) {
-        xen_be_printf(NULL, 0, "can't open xen interface\n");
+        xen_pv_printf(NULL, 0, "can't open xen interface\n");
         return -1;
     }
     xen_fmem = xenforeignmemory_open(0, 0);
     if (xen_fmem == NULL) {
-        xen_be_printf(NULL, 0, "can't open xen fmem interface\n");
+        xen_pv_printf(NULL, 0, "can't open xen fmem interface\n");
         xc_interface_close(xen_xc);
         return -1;
     }