178 files changed, 6607 insertions, 5036 deletions
diff --git a/.gitignore b/.gitignore
index 88a80ff4a5..38ee1c5681 100644
--- a/.gitignore
+++ b/.gitignore
@@ -108,4 +108,5 @@
 cscope.*
 tags
 TAGS
+docker-src.*
 *~
diff --git a/MAINTAINERS b/MAINTAINERS
index ea812b3284..df990a8ff6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -165,6 +165,7 @@ F: hw/openrisc/
 F: tests/tcg/openrisc/
 
 PowerPC
+M: David Gibson <david@gibson.dropbear.id.au>
 M: Alexander Graf <agraf@suse.de>
 L: qemu-ppc@nongnu.org
 S: Maintained
@@ -597,7 +598,7 @@ F: hw/pci-host/grackle.c
 F: hw/misc/macio/
 
 PReP
-M: Andreas Färber <andreas.faerber@web.de>
+L: qemu-devel@nongnu.org
 L: qemu-ppc@nongnu.org
 S: Odd Fixes
 F: hw/ppc/prep.c
@@ -1064,7 +1065,7 @@ S: Supported
 F: scripts/coverity-model.c
 
 CPU
-M: Andreas Färber <afaerber@suse.de>
+L: qemu-devel@nongnu.org
 S: Supported
 F: qom/cpu.c
 F: include/qom/cpu.h
@@ -1123,7 +1124,6 @@ F: ui/
 F: include/ui/
 
 Cocoa graphics
-M: Andreas Färber <andreas.faerber@web.de>
 M: Peter Maydell <peter.maydell@linaro.org>
 S: Odd Fixes
 F: ui/cocoa.m
@@ -1418,9 +1418,8 @@ S: Orphan
 
 Stable 0.15
 L: qemu-stable@nongnu.org
-M: Andreas Färber <afaerber@suse.de>
 T: git git://git.qemu-project.org/qemu-stable-0.15.git
-S: Supported
+S: Orphan
 
 Stable 0.14
 L: qemu-stable@nongnu.org
@@ -1634,3 +1633,10 @@ Build system architecture
 M: Daniel P. Berrange <berrange@redhat.com>
 S: Odd Fixes
 F: docs/build-system.txt
+
+Docker testing
+--------------
+Docker based testing framework and cases
+M: Fam Zheng <famz@redhat.com>
+S: Maintained
+F: tests/docker/
diff --git a/Makefile b/Makefile
index a5d7e62626..b8563db686 100644
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,7 @@ BUILD_DIR=$(CURDIR)
 # Before including a proper config-host.mak, assume we are in the source tree
 SRC_PATH=.
 
-UNCHECKED_GOALS := %clean TAGS cscope ctags
+UNCHECKED_GOALS := %clean TAGS cscope ctags docker docker-%
 
 # All following code might depend on configuration variables
 ifneq ($(wildcard config-host.mak),)
@@ -30,7 +30,6 @@ CONFIG_ALL=y
 -include config-all-devices.mak
 -include config-all-disas.mak
 
-include $(SRC_PATH)/rules.mak
 config-host.mak: $(SRC_PATH)/configure
 	@echo $@ is out-of-date, running configure
 	@# TODO: The next lines include code which supports a smooth
@@ -49,6 +48,8 @@ ifneq ($(filter-out $(UNCHECKED_GOALS),$(MAKECMDGOALS)),$(if $(MAKECMDGOALS),,fa
 endif
 endif
 
+include $(SRC_PATH)/rules.mak
+
 GENERATED_HEADERS = config-host.h qemu-options.def
 GENERATED_HEADERS += qmp-commands.h qapi-types.h qapi-visit.h qapi-event.h
 GENERATED_SOURCES += qmp-marshal.c qapi-types.c qapi-visit.c qapi-event.c
@@ -92,9 +93,6 @@ HELPERS-$(CONFIG_LINUX) = qemu-bridge-helper$(EXESUF)
 ifdef BUILD_DOCS
 DOCS=qemu-doc.html qemu-tech.html qemu.1 qemu-img.1 qemu-nbd.8 qemu-ga.8
 DOCS+=qmp-commands.txt
-ifdef CONFIG_LINUX
-DOCS+=kvm_stat.1
-endif
 ifdef CONFIG_VIRTFS
 DOCS+=fsdev/virtfs-proxy-helper.1
 endif
@@ -571,12 +569,6 @@ qemu-ga.8: qemu-ga.texi
 	  $(POD2MAN) --section=8 --center=" " --release=" " qemu-ga.pod > $@, \
 	  "  GEN   $@")
 
-kvm_stat.1: scripts/kvm/kvm_stat.texi
-	$(call quiet-command, \
-	  perl -Ww -- $(SRC_PATH)/scripts/texi2pod.pl $< kvm_stat.pod && \
-	  $(POD2MAN) --section=1 --center=" " --release=" " kvm_stat.pod > $@, \
-	  "  GEN   $@")
-
 dvi: qemu-doc.dvi qemu-tech.dvi
 html: qemu-doc.html qemu-tech.html
 info: qemu-doc.info qemu-tech.info
@@ -652,3 +644,5 @@ endif
 # Include automatically generated dependency files
 # Dependencies in Makefile.objs files come from our recursive subdir rules
 -include $(wildcard *.d tests/*.d)
+
+include $(SRC_PATH)/tests/docker/Makefile.include
diff --git a/Makefile.target b/Makefile.target
index 34ddb7e762..5b80dd7fc9 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -108,7 +108,12 @@ obj-$(CONFIG_LIBDECNUMBER) += libdecnumber/dpd/decimal128.o
 
 ifdef CONFIG_LINUX_USER
 
-QEMU_CFLAGS+=-I$(SRC_PATH)/linux-user/$(TARGET_ABI_DIR) -I$(SRC_PATH)/linux-user
+# Note that we only add linux-user/host/$ARCH if it exists, and
+# that it must come before linux-user/host/generic in the search path.
+QEMU_CFLAGS+=-I$(SRC_PATH)/linux-user/$(TARGET_ABI_DIR) \
+             $(patsubst %,-I%,$(wildcard $(SRC_PATH)/linux-user/host/$(ARCH))) \
+             -I$(SRC_PATH)/linux-user/host/generic \
+             -I$(SRC_PATH)/linux-user
 
 obj-y += linux-user/
 obj-y += gdbstub.o thunk.o user-exec.o
diff --git a/block.c b/block.c
index 1205ef8860..736432f67e 100644
--- a/block.c
+++ b/block.c
@@ -64,16 +64,16 @@ static QTAILQ_HEAD(, BlockDriverState) all_bdrv_states =
 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
     QLIST_HEAD_INITIALIZER(bdrv_drivers);
 
-static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
-                             const char *reference, QDict *options, int flags,
-                             BlockDriverState *parent,
-                             const BdrvChildRole *child_role, Error **errp);
+static BlockDriverState *bdrv_open_inherit(const char *filename,
+                                           const char *reference,
+                                           QDict *options, int flags,
+                                           BlockDriverState *parent,
+                                           const BdrvChildRole *child_role,
+                                           Error **errp);
 
 /* If non-zero, use only whitelisted block drivers */
 static int use_bdrv_whitelist;
 
-static void bdrv_close(BlockDriverState *bs);
-
 #ifdef _WIN32
 static int is_windows_drive_prefix(const char *filename)
 {
@@ -220,11 +220,6 @@ void bdrv_register(BlockDriver *bdrv)
     QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
 }
 
-BlockDriverState *bdrv_new_root(void)
-{
-    return bdrv_new();
-}
-
 BlockDriverState *bdrv_new(void)
 {
     BlockDriverState *bs;
@@ -664,6 +659,18 @@ int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough)
     return 0;
 }
 
+static void bdrv_child_cb_drained_begin(BdrvChild *child)
+{
+    BlockDriverState *bs = child->opaque;
+    bdrv_drained_begin(bs);
+}
+
+static void bdrv_child_cb_drained_end(BdrvChild *child)
+{
+    BlockDriverState *bs = child->opaque;
+    bdrv_drained_end(bs);
+}
+
 /*
  * Returns the options and flags that a temporary snapshot should get, based on
  * the originally requested flags (the originally requested image will have
@@ -710,6 +717,8 @@ static void bdrv_inherited_options(int *child_flags, QDict *child_options,
 
 const BdrvChildRole child_file = {
     .inherit_options = bdrv_inherited_options,
+    .drained_begin   = bdrv_child_cb_drained_begin,
+    .drained_end     = bdrv_child_cb_drained_end,
 };
 
 /*
@@ -728,6 +737,8 @@ static void bdrv_inherited_fmt_options(int *child_flags, QDict *child_options,
 
 const BdrvChildRole child_format = {
     .inherit_options = bdrv_inherited_fmt_options,
+    .drained_begin   = bdrv_child_cb_drained_begin,
+    .drained_end     = bdrv_child_cb_drained_end,
 };
 
 /*
@@ -755,6 +766,8 @@ static void bdrv_backing_options(int *child_flags, QDict *child_options,
 
 static const BdrvChildRole child_backing = {
     .inherit_options = bdrv_backing_options,
+    .drained_begin   = bdrv_child_cb_drained_begin,
+    .drained_end     = bdrv_child_cb_drained_end,
 };
 
 static int bdrv_open_flags(BlockDriverState *bs, int flags)
@@ -1155,18 +1168,41 @@ static int bdrv_fill_options(QDict **options, const char *filename,
     return 0;
 }
 
+static void bdrv_replace_child(BdrvChild *child, BlockDriverState *new_bs)
+{
+    BlockDriverState *old_bs = child->bs;
+
+    if (old_bs) {
+        if (old_bs->quiesce_counter && child->role->drained_end) {
+            child->role->drained_end(child);
+        }
+        QLIST_REMOVE(child, next_parent);
+    }
+
+    child->bs = new_bs;
+
+    if (new_bs) {
+        QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);
+        if (new_bs->quiesce_counter && child->role->drained_begin) {
+            child->role->drained_begin(child);
+        }
+    }
+}
+
 BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
                                   const char *child_name,
-                                  const BdrvChildRole *child_role)
+                                  const BdrvChildRole *child_role,
+                                  void *opaque)
 {
     BdrvChild *child = g_new(BdrvChild, 1);
     *child = (BdrvChild) {
-        .bs     = child_bs,
+        .bs     = NULL,
         .name   = g_strdup(child_name),
         .role   = child_role,
+        .opaque = opaque,
     };
 
-    QLIST_INSERT_HEAD(&child_bs->parents, child, next_parent);
+    bdrv_replace_child(child, child_bs);
 
     return child;
 }
@@ -1176,7 +1212,8 @@ BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
                              const char *child_name,
                              const BdrvChildRole *child_role)
 {
-    BdrvChild *child = bdrv_root_attach_child(child_bs, child_name, child_role);
+    BdrvChild *child = bdrv_root_attach_child(child_bs, child_name, child_role,
+                                              parent_bs);
     QLIST_INSERT_HEAD(&parent_bs->children, child, next);
     return child;
 }
@@ -1187,7 +1224,9 @@ static void bdrv_detach_child(BdrvChild *child)
         QLIST_REMOVE(child, next);
         child->next.le_prev = NULL;
     }
-    QLIST_REMOVE(child, next_parent);
+
+    bdrv_replace_child(child, NULL);
+
     g_free(child->name);
     g_free(child);
 }
@@ -1341,14 +1380,13 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,
         qdict_put(options, "driver", qstring_from_str(bs->backing_format));
     }
 
-    backing_hd = NULL;
-    ret = bdrv_open_inherit(&backing_hd,
-                            *backing_filename ? backing_filename : NULL,
-                            reference, options, 0, bs, &child_backing,
-                            errp);
-    if (ret < 0) {
+    backing_hd = bdrv_open_inherit(*backing_filename ? backing_filename : NULL,
+                                   reference, options, 0, bs, &child_backing,
+                                   errp);
+    if (!backing_hd) {
         bs->open_flags |= BDRV_O_NO_BACKING;
         error_prepend(errp, "Could not open backing file: ");
+        ret = -EINVAL;
         goto free_exit;
     }
 
@@ -1388,7 +1426,6 @@ BdrvChild *bdrv_open_child(const char *filename,
     BdrvChild *c = NULL;
     BlockDriverState *bs;
     QDict *image_options;
-    int ret;
     char *bdref_key_dot;
     const char *reference;
 
@@ -1408,10 +1445,9 @@ BdrvChild *bdrv_open_child(const char *filename,
         goto done;
     }
 
-    bs = NULL;
-    ret = bdrv_open_inherit(&bs, filename, reference, image_options, 0,
-                            parent, child_role, errp);
-    if (ret < 0) {
+    bs = bdrv_open_inherit(filename, reference, image_options, 0,
+                           parent, child_role, errp);
+    if (!bs) {
         goto done;
     }
 
@@ -1422,15 +1458,16 @@ done:
     return c;
 }
 
-static int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags,
-                                     QDict *snapshot_options, Error **errp)
+static BlockDriverState *bdrv_append_temp_snapshot(BlockDriverState *bs,
+                                                   int flags,
+                                                   QDict *snapshot_options,
+                                                   Error **errp)
 {
     /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
     char *tmp_filename = g_malloc0(PATH_MAX + 1);
     int64_t total_size;
     QemuOpts *opts = NULL;
     BlockDriverState *bs_snapshot;
-    Error *local_err = NULL;
     int ret;
 
     /* if snapshot, we create a temporary backing file and open it
@@ -1439,7 +1476,6 @@ static int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags,
     /* Get the required size from the image */
     total_size = bdrv_getlength(bs);
     if (total_size < 0) {
-        ret = total_size;
         error_setg_errno(errp, -total_size, "Could not get image size");
         goto out;
     }
@@ -1470,22 +1506,26 @@ static int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags,
     qdict_put(snapshot_options, "driver",
               qstring_from_str("qcow2"));
 
-    bs_snapshot = bdrv_new();
-
-    ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
-                    flags, &local_err);
+    bs_snapshot = bdrv_open(NULL, NULL, snapshot_options, flags, errp);
     snapshot_options = NULL;
-    if (ret < 0) {
-        error_propagate(errp, local_err);
+    if (!bs_snapshot) {
+        ret = -EINVAL;
         goto out;
     }
 
+    /* bdrv_append() consumes a strong reference to bs_snapshot (i.e. it will
+     * call bdrv_unref() on it), so in order to be able to return one, we have
+     * to increase bs_snapshot's refcount here */
+    bdrv_ref(bs_snapshot);
     bdrv_append(bs_snapshot, bs);
 
+    g_free(tmp_filename);
+    return bs_snapshot;
+
 out:
     QDECREF(snapshot_options);
     g_free(tmp_filename);
-    return ret;
+    return NULL;
 }
 
 /*
@@ -1503,10 +1543,12 @@ out:
  * should be opened. If specified, neither options nor a filename may be given,
  * nor can an existing BDS be reused (that is, *pbs has to be NULL).
  */
-static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
-                             const char *reference, QDict *options, int flags,
-                             BlockDriverState *parent,
-                             const BdrvChildRole *child_role, Error **errp)
+static BlockDriverState *bdrv_open_inherit(const char *filename,
+                                           const char *reference,
+                                           QDict *options, int flags,
+                                           BlockDriverState *parent,
+                                           const BdrvChildRole *child_role,
+                                           Error **errp)
 {
     int ret;
     BdrvChild *file = NULL;
@@ -1518,7 +1560,6 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
     QDict *snapshot_options = NULL;
     int snapshot_flags = 0;
 
-    assert(pbs);
     assert(!child_role || !flags);
     assert(!child_role == !parent);
 
@@ -1526,33 +1567,22 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
         bool options_non_empty = options ? qdict_size(options) : false;
         QDECREF(options);
 
-        if (*pbs) {
-            error_setg(errp, "Cannot reuse an existing BDS when referencing "
-                       "another block device");
-            return -EINVAL;
-        }
-
         if (filename || options_non_empty) {
             error_setg(errp, "Cannot reference an existing block device with "
                        "additional options or a new filename");
-            return -EINVAL;
+            return NULL;
         }
 
         bs = bdrv_lookup_bs(reference, reference, errp);
         if (!bs) {
-            return -ENODEV;
+            return NULL;
         }
 
         bdrv_ref(bs);
-        *pbs = bs;
-        return 0;
+        return bs;
     }
 
-    if (*pbs) {
-        bs = *pbs;
-    } else {
-        bs = bdrv_new();
-    }
+    bs = bdrv_new();
 
     /* NULL means an empty set of options */
     if (options == NULL) {
@@ -1562,7 +1592,6 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
     /* json: syntax counts as explicit options, as if in the QDict */
     parse_json_protocol(options, &filename, &local_err);
     if (local_err) {
-        ret = -EINVAL;
         goto fail;
     }
 
@@ -1589,7 +1618,6 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
         drv = bdrv_find_format(drvname);
         if (!drv) {
             error_setg(errp, "Unknown driver: '%s'", drvname);
-            ret = -EINVAL;
             goto fail;
         }
     }
@@ -1619,7 +1647,6 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
         file = bdrv_open_child(filename, options, "file", bs,
                                &child_file, true, &local_err);
         if (local_err) {
-            ret = -EINVAL;
             goto fail;
         }
     }
@@ -1646,7 +1673,6 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
         qdict_put(options, "driver", qstring_from_str(drv->format_name));
     } else if (!drv) {
         error_setg(errp, "Must specify either driver or file");
-        ret = -EINVAL;
         goto fail;
     }
 
@@ -1689,7 +1715,6 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
                        drv->format_name, entry->key);
         }
 
-        ret = -EINVAL;
         goto close_and_fail;
     }
 
@@ -1700,25 +1725,30 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
                && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
         error_setg(errp,
                    "Guest must be stopped for opening of encrypted image");
-        ret = -EBUSY;
         goto close_and_fail;
     }
 
     QDECREF(options);
-    *pbs = bs;
 
     /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
      * temporary snapshot afterwards. */
     if (snapshot_flags) {
-        ret = bdrv_append_temp_snapshot(bs, snapshot_flags, snapshot_options,
-                                        &local_err);
+        BlockDriverState *snapshot_bs;
+        snapshot_bs = bdrv_append_temp_snapshot(bs, snapshot_flags,
+                                                snapshot_options, &local_err);
         snapshot_options = NULL;
         if (local_err) {
             goto close_and_fail;
         }
+        /* We are not going to return bs but the overlay on top of it
+         * (snapshot_bs); thus, we have to drop the strong reference to bs
+         * (which we obtained by calling bdrv_new()). bs will not be deleted,
+         * though, because the overlay still has a reference to it. */
+        bdrv_unref(bs);
+        bs = snapshot_bs;
     }
 
-    return 0;
+    return bs;
 
 fail:
     if (file != NULL) {
@@ -1729,36 +1759,26 @@ fail:
     QDECREF(bs->options);
     QDECREF(options);
     bs->options = NULL;
-    if (!*pbs) {
-        /* If *pbs is NULL, a new BDS has been created in this function and
-           needs to be freed now. Otherwise, it does not need to be closed,
-           since it has not really been opened yet. */
-        bdrv_unref(bs);
-    }
+    bdrv_unref(bs);
     if (local_err) {
         error_propagate(errp, local_err);
     }
-    return ret;
+    return NULL;
 
 close_and_fail:
-    /* See fail path, but now the BDS has to be always closed */
-    if (*pbs) {
-        bdrv_close(bs);
-    } else {
-        bdrv_unref(bs);
-    }
+    bdrv_unref(bs);
     QDECREF(snapshot_options);
     QDECREF(options);
     if (local_err) {
         error_propagate(errp, local_err);
     }
-    return ret;
+    return NULL;
 }
 
-int bdrv_open(BlockDriverState **pbs, const char *filename,
-              const char *reference, QDict *options, int flags, Error **errp)
+BlockDriverState *bdrv_open(const char *filename, const char *reference,
+                            QDict *options, int flags, Error **errp)
 {
-    return bdrv_open_inherit(pbs, filename, reference, options, flags, NULL,
+    return bdrv_open_inherit(filename, reference, options, flags, NULL,
                              NULL, errp);
 }
 
@@ -2132,6 +2152,7 @@ static void bdrv_close(BlockDriverState *bs)
     BdrvAioNotifier *ban, *ban_next;
 
     assert(!bs->job);
+    assert(!bs->refcnt);
 
     bdrv_drained_begin(bs); /* complete I/O */
     bdrv_flush(bs);
@@ -2140,8 +2161,6 @@ static void bdrv_close(BlockDriverState *bs)
     bdrv_release_named_dirty_bitmaps(bs);
     assert(QLIST_EMPTY(&bs->dirty_bitmaps));
 
-    bdrv_parent_cb_change_media(bs, false);
-
     if (bs->drv) {
         BdrvChild *child, *next;
 
@@ -2190,8 +2209,7 @@ static void bdrv_close(BlockDriverState *bs)
 
 void bdrv_close_all(void)
 {
-    BlockDriverState *bs;
-    AioContext *aio_context;
+    block_job_cancel_sync_all();
 
     /* Drop references from requests still in flight, such as canceled block
      * jobs whose AIO context has not been polled yet */
@@ -2200,25 +2218,7 @@ void bdrv_close_all(void)
     blk_remove_all_bs();
     blockdev_close_all_bdrv_states();
 
-    /* Cancel all block jobs */
-    while (!QTAILQ_EMPTY(&all_bdrv_states)) {
-        QTAILQ_FOREACH(bs, &all_bdrv_states, bs_list) {
-            aio_context = bdrv_get_aio_context(bs);
-
-            aio_context_acquire(aio_context);
-            if (bs->job) {
-                block_job_cancel_sync(bs->job);
-                aio_context_release(aio_context);
-                break;
-            }
-            aio_context_release(aio_context);
-        }
-
-        /* All the remaining BlockDriverStates are referenced directly or
-         * indirectly from block jobs, so there needs to be at least one BDS
-         * directly used by a block job */
-        assert(bs);
-    }
+    assert(QTAILQ_EMPTY(&all_bdrv_states));
 }
 
 static void change_parent_backing_link(BlockDriverState *from,
@@ -2228,10 +2228,8 @@ static void change_parent_backing_link(BlockDriverState *from,
 
     QLIST_FOREACH_SAFE(c, &from->parents, next_parent, next) {
         assert(c->role != &child_backing);
-        c->bs = to;
-        QLIST_REMOVE(c, next_parent);
-        QLIST_INSERT_HEAD(&to->parents, c, next_parent);
         bdrv_ref(to);
+        bdrv_replace_child(c, to);
         bdrv_unref(from);
     }
 }
@@ -3195,9 +3193,9 @@ void bdrv_invalidate_cache_all(Error **errp)
 {
     BlockDriverState *bs;
     Error *local_err = NULL;
-    BdrvNextIterator *it = NULL;
+    BdrvNextIterator it;
 
-    while ((it = bdrv_next(it, &bs)) != NULL) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
         AioContext *aio_context = bdrv_get_aio_context(bs);
 
         aio_context_acquire(aio_context);
@@ -3239,11 +3237,11 @@ static int bdrv_inactivate_recurse(BlockDriverState *bs,
 int bdrv_inactivate_all(void)
 {
     BlockDriverState *bs = NULL;
-    BdrvNextIterator *it = NULL;
+    BdrvNextIterator it;
     int ret = 0;
     int pass;
 
-    while ((it = bdrv_next(it, &bs)) != NULL) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
         aio_context_acquire(bdrv_get_aio_context(bs));
     }
 
@@ -3252,8 +3250,7 @@ int bdrv_inactivate_all(void)
      * the second pass sets the BDRV_O_INACTIVE flag so that no further write
      * is allowed. */
     for (pass = 0; pass < 2; pass++) {
-        it = NULL;
-        while ((it = bdrv_next(it, &bs)) != NULL) {
+        for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
             ret = bdrv_inactivate_recurse(bs, pass);
             if (ret < 0) {
                 goto out;
@@ -3262,8 +3259,7 @@ int bdrv_inactivate_all(void)
     }
 
 out:
-    it = NULL;
-    while ((it = bdrv_next(it, &bs)) != NULL) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
         aio_context_release(bdrv_get_aio_context(bs));
     }
 
@@ -3547,11 +3543,10 @@ void bdrv_img_create(const char *filename, const char *fmt,
                           qstring_from_str(backing_fmt));
             }
 
-            bs = NULL;
-            ret = bdrv_open(&bs, full_backing, NULL, backing_options,
-                            back_flags, &local_err);
+            bs = bdrv_open(full_backing, NULL, backing_options, back_flags,
+                           &local_err);
             g_free(full_backing);
-            if (ret < 0) {
+            if (!bs) {
                 goto out;
             }
             size = bdrv_getlength(bs);
@@ -3753,10 +3748,10 @@ bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
 {
     BlockDriverState *bs;
-    BdrvNextIterator *it = NULL;
+    BdrvNextIterator it;
 
     /* walk down the bs forest recursively */
-    while ((it = bdrv_next(it, &bs)) != NULL) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
         bool perm;
 
         /* try to recurse in this top level bs */
diff --git a/block/backup.c b/block/backup.c
index fec45e8212..feeb9f8bf2 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -36,7 +36,7 @@ typedef struct CowRequest {
 
 typedef struct BackupBlockJob {
     BlockJob common;
-    BlockDriverState *target;
+    BlockBackend *target;
     /* bitmap for sync=incremental */
     BdrvDirtyBitmap *sync_bitmap;
     MirrorSyncMode sync_mode;
@@ -47,6 +47,7 @@ typedef struct BackupBlockJob {
     uint64_t sectors_read;
     unsigned long *done_bitmap;
     int64_t cluster_size;
+    NotifierWithReturn before_write;
     QLIST_HEAD(, CowRequest) inflight_reqs;
 } BackupBlockJob;
 
@@ -93,12 +94,12 @@ static void cow_request_end(CowRequest *req)
     qemu_co_queue_restart_all(&req->wait_queue);
 }
 
-static int coroutine_fn backup_do_cow(BlockDriverState *bs,
+static int coroutine_fn backup_do_cow(BackupBlockJob *job,
                                       int64_t sector_num, int nb_sectors,
                                       bool *error_is_read,
                                       bool is_write_notifier)
 {
-    BackupBlockJob *job = (BackupBlockJob *)bs->job;
+    BlockBackend *blk = job->common.blk;
     CowRequest cow_request;
     struct iovec iov;
     QEMUIOVector bounce_qiov;
@@ -131,20 +132,15 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,
                 start * sectors_per_cluster);
 
         if (!bounce_buffer) {
-            bounce_buffer = qemu_blockalign(bs, job->cluster_size);
+            bounce_buffer = blk_blockalign(blk, job->cluster_size);
         }
         iov.iov_base = bounce_buffer;
         iov.iov_len = n * BDRV_SECTOR_SIZE;
         qemu_iovec_init_external(&bounce_qiov, &iov, 1);
 
-        if (is_write_notifier) {
-            ret = bdrv_co_readv_no_serialising(bs,
-                                           start * sectors_per_cluster,
-                                           n, &bounce_qiov);
-        } else {
-            ret = bdrv_co_readv(bs, start * sectors_per_cluster, n,
-                                &bounce_qiov);
-        }
+        ret = blk_co_preadv(blk, start * job->cluster_size,
+                            bounce_qiov.size, &bounce_qiov,
+                            is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0);
         if (ret < 0) {
             trace_backup_do_cow_read_fail(job, start, ret);
             if (error_is_read) {
@@ -154,13 +150,11 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,
         }
 
         if (buffer_is_zero(iov.iov_base, iov.iov_len)) {
-            ret = bdrv_co_write_zeroes(job->target,
-                                       start * sectors_per_cluster,
-                                       n, BDRV_REQ_MAY_UNMAP);
+            ret = blk_co_pwrite_zeroes(job->target, start * job->cluster_size,
+                                       bounce_qiov.size, BDRV_REQ_MAY_UNMAP);
         } else {
-            ret = bdrv_co_writev(job->target,
-                                 start * sectors_per_cluster, n,
-                                 &bounce_qiov);
+            ret = blk_co_pwritev(job->target, start * job->cluster_size,
+                                 bounce_qiov.size, &bounce_qiov, 0);
         }
         if (ret < 0) {
             trace_backup_do_cow_write_fail(job, start, ret);
@@ -197,14 +191,16 @@ static int coroutine_fn backup_before_write_notify(
         NotifierWithReturn *notifier,
         void *opaque)
 {
+    BackupBlockJob *job = container_of(notifier, BackupBlockJob, before_write);
     BdrvTrackedRequest *req = opaque;
     int64_t sector_num = req->offset >> BDRV_SECTOR_BITS;
     int nb_sectors = req->bytes >> BDRV_SECTOR_BITS;
 
+    assert(req->bs == blk_bs(job->common.blk));
     assert((req->offset & (BDRV_SECTOR_SIZE - 1)) == 0);
     assert((req->bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
 
-    return backup_do_cow(req->bs, sector_num, nb_sectors, NULL, true);
+    return backup_do_cow(job, sector_num, nb_sectors, NULL, true);
 }
 
 static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
@@ -221,7 +217,7 @@ static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
 static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret)
 {
     BdrvDirtyBitmap *bm;
-    BlockDriverState *bs = job->common.bs;
+    BlockDriverState *bs = blk_bs(job->common.blk);
 
     if (ret < 0 || block_job_is_cancelled(&job->common)) {
         /* Merge the successor back into the parent, delete nothing. */
@@ -279,7 +275,7 @@ static void backup_complete(BlockJob *job, void *opaque)
     BackupBlockJob *s = container_of(job, BackupBlockJob, common);
     BackupCompleteData *data = opaque;
 
-    bdrv_unref(s->target);
+    blk_unref(s->target);
 
     block_job_completed(job, data->ret);
     g_free(data);
@@ -321,7 +317,6 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
     int64_t end;
     int64_t last_cluster = -1;
     int64_t sectors_per_cluster = cluster_size_sectors(job);
-    BlockDriverState *bs = job->common.bs;
     HBitmapIter hbi;
 
     granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap);
@@ -343,7 +338,7 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
                 if (yield_and_check(job)) {
                     return ret;
                 }
-                ret = backup_do_cow(bs, cluster * sectors_per_cluster,
+                ret = backup_do_cow(job, cluster * sectors_per_cluster,
                                     sectors_per_cluster, &error_is_read,
                                     false);
                 if ((ret < 0) &&
@@ -376,11 +371,8 @@ static void coroutine_fn backup_run(void *opaque)
 {
     BackupBlockJob *job = opaque;
     BackupCompleteData *data;
-    BlockDriverState *bs = job->common.bs;
-    BlockDriverState *target = job->target;
-    NotifierWithReturn before_write = {
-        .notify = backup_before_write_notify,
-    };
+    BlockDriverState *bs = blk_bs(job->common.blk);
+    BlockBackend *target = job->target;
     int64_t start, end;
     int64_t sectors_per_cluster = cluster_size_sectors(job);
     int ret = 0;
@@ -393,7 +385,8 @@ static void coroutine_fn backup_run(void *opaque)
 
     job->done_bitmap = bitmap_new(end);
 
-    bdrv_add_before_write_notifier(bs, &before_write);
+    job->before_write.notify = backup_before_write_notify;
+    bdrv_add_before_write_notifier(bs, &job->before_write);
 
     if (job->sync_mode == MIRROR_SYNC_MODE_NONE) {
         while (!block_job_is_cancelled(&job->common)) {
@@ -445,7 +438,7 @@ static void coroutine_fn backup_run(void *opaque)
                 }
             }
             /* FULL sync mode we copy the whole drive. */
-            ret = backup_do_cow(bs, start * sectors_per_cluster,
+            ret = backup_do_cow(job, start * sectors_per_cluster,
                                 sectors_per_cluster, &error_is_read, false);
             if (ret < 0) {
                 /* Depending on error action, fail now or retry cluster */
@@ -461,14 +454,14 @@ static void coroutine_fn backup_run(void *opaque)
         }
     }
 
-    notifier_with_return_remove(&before_write);
+    notifier_with_return_remove(&job->before_write);
 
     /* wait until pending backup_do_cow() calls have completed */
     qemu_co_rwlock_wrlock(&job->flush_rwlock);
     qemu_co_rwlock_unlock(&job->flush_rwlock);
     g_free(job->done_bitmap);
 
-    bdrv_op_unblock_all(target, job->common.blocker);
+    bdrv_op_unblock_all(blk_bs(target), job->common.blocker);
 
     data = g_malloc(sizeof(*data));
     data->ret = ret;
@@ -485,6 +478,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
 {
     int64_t len;
     BlockDriverInfo bdi;
+    BackupBlockJob *job = NULL;
     int ret;
 
     assert(bs);
@@ -542,15 +536,16 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
         goto error;
     }
 
-    BackupBlockJob *job = block_job_create(&backup_job_driver, bs, speed,
-                                           cb, opaque, errp);
+    job = block_job_create(&backup_job_driver, bs, speed, cb, opaque, errp);
     if (!job) {
         goto error;
     }
 
+    job->target = blk_new();
+    blk_insert_bs(job->target, target);
+
     job->on_source_error = on_source_error;
     job->on_target_error = on_target_error;
-    job->target = target;
     job->sync_mode = sync_mode;
     job->sync_bitmap = sync_mode == MIRROR_SYNC_MODE_INCREMENTAL ?
                        sync_bitmap : NULL;
@@ -558,7 +553,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
     /* If there is no backing file on the target, we cannot rely on COW if our
      * backup cluster size is smaller than the target cluster size. Even for
      * targets with a backing file, try to avoid COW if possible. */
-    ret = bdrv_get_info(job->target, &bdi);
+    ret = bdrv_get_info(target, &bdi);
     if (ret < 0 && !target->backing) {
         error_setg_errno(errp, -ret,
             "Couldn't determine the cluster size of the target image, "
@@ -584,4 +579,8 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
     if (sync_bitmap) {
         bdrv_reclaim_dirty_bitmap(bs, sync_bitmap, NULL);
     }
+    if (job) {
+        blk_unref(job->target);
+        block_job_unref(&job->common);
+    }
 }
diff --git a/block/block-backend.c b/block/block-backend.c
index 6928d61de4..34500e6080 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -19,6 +19,7 @@
 #include "sysemu/sysemu.h"
 #include "qapi-event.h"
 #include "qemu/id.h"
+#include "trace.h"
 
 /* Number of coroutines to reserve per attached device model */
 #define COROUTINE_POOL_RESERVATION 64
@@ -119,12 +120,14 @@ static const BdrvChildRole child_root = {
  * Store an error through @errp on failure, unless it's null.
  * Return the new BlockBackend on success, null on failure.
  */
-BlockBackend *blk_new(Error **errp)
+BlockBackend *blk_new(void)
 {
     BlockBackend *blk;
 
     blk = g_new0(BlockBackend, 1);
     blk->refcnt = 1;
+    blk_set_enable_write_cache(blk, true);
+
     qemu_co_queue_init(&blk->public.throttled_reqs[0]);
     qemu_co_queue_init(&blk->public.throttled_reqs[1]);
 
@@ -136,27 +139,7 @@ BlockBackend *blk_new(Error **errp)
 }
 
 /*
- * Create a new BlockBackend with a new BlockDriverState attached.
- * Otherwise just like blk_new(), which see.
- */
-BlockBackend *blk_new_with_bs(Error **errp)
-{
-    BlockBackend *blk;
-    BlockDriverState *bs;
-
-    blk = blk_new(errp);
-    if (!blk) {
-        return NULL;
-    }
-
-    bs = bdrv_new_root();
-    blk->root = bdrv_root_attach_child(bs, "root", &child_root);
-    blk->root->opaque = blk;
-    return blk;
-}
-
-/*
- * Calls blk_new_with_bs() and then calls bdrv_open() on the BlockDriverState.
+ * Creates a new BlockBackend, opens a new BlockDriverState, and connects both.
  *
  * Just as with bdrv_open(), after having called this function the reference to
  * @options belongs to the block layer (even on failure).
@@ -171,21 +154,16 @@ BlockBackend *blk_new_open(const char *filename, const char *reference,
                            QDict *options, int flags, Error **errp)
 {
     BlockBackend *blk;
-    int ret;
-
-    blk = blk_new_with_bs(errp);
-    if (!blk) {
-        QDECREF(options);
-        return NULL;
-    }
+    BlockDriverState *bs;
 
-    ret = bdrv_open(&blk->root->bs, filename, reference, options, flags, errp);
-    if (ret < 0) {
+    blk = blk_new();
+    bs = bdrv_open(filename, reference, options, flags, errp);
+    if (!bs) {
         blk_unref(blk);
         return NULL;
     }
 
-    blk_set_enable_write_cache(blk, true);
+    blk->root = bdrv_root_attach_child(bs, "root", &child_root, blk);
 
     return blk;
 }
@@ -286,25 +264,11 @@ BlockBackend *blk_next(BlockBackend *blk)
                : QTAILQ_FIRST(&monitor_block_backends);
 }
 
-struct BdrvNextIterator {
-    enum {
-        BDRV_NEXT_BACKEND_ROOTS,
-        BDRV_NEXT_MONITOR_OWNED,
-    } phase;
-    BlockBackend *blk;
-    BlockDriverState *bs;
-};
-
 /* Iterates over all top-level BlockDriverStates, i.e. BDSs that are owned by
  * the monitor or attached to a BlockBackend */
-BdrvNextIterator *bdrv_next(BdrvNextIterator *it, BlockDriverState **bs)
+BlockDriverState *bdrv_next(BdrvNextIterator *it)
 {
-    if (!it) {
-        it = g_new(BdrvNextIterator, 1);
-        *it = (BdrvNextIterator) {
-            .phase = BDRV_NEXT_BACKEND_ROOTS,
-        };
-    }
+    BlockDriverState *bs;
 
     /* First, return all root nodes of BlockBackends. In order to avoid
      * returning a BDS twice when multiple BBs refer to it, we only return it
@@ -312,11 +276,11 @@ BdrvNextIterator *bdrv_next(BdrvNextIterator *it, BlockDriverState **bs)
     if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
         do {
             it->blk = blk_all_next(it->blk);
-            *bs = it->blk ? blk_bs(it->blk) : NULL;
-        } while (it->blk && (*bs == NULL || bdrv_first_blk(*bs) != it->blk));
+            bs = it->blk ? blk_bs(it->blk) : NULL;
+        } while (it->blk && (bs == NULL || bdrv_first_blk(bs) != it->blk));
 
-        if (*bs) {
-            return it;
+        if (bs) {
+            return bs;
         }
         it->phase = BDRV_NEXT_MONITOR_OWNED;
     }
@@ -326,10 +290,19 @@ BdrvNextIterator *bdrv_next(BdrvNextIterator *it, BlockDriverState **bs)
      * by the above block already */
     do {
         it->bs = bdrv_next_monitor_owned(it->bs);
-        *bs = it->bs;
-    } while (*bs && bdrv_has_blk(*bs));
+        bs = it->bs;
+    } while (bs && bdrv_has_blk(bs));
 
-    return *bs ? it : NULL;
+    return bs;
+}
+
+BlockDriverState *bdrv_first(BdrvNextIterator *it)
+{
+    *it = (BdrvNextIterator) {
+        .phase = BDRV_NEXT_BACKEND_ROOTS,
+    };
+
+    return bdrv_next(it);
 }
 
 /*
@@ -509,8 +482,7 @@ void blk_remove_bs(BlockBackend *blk)
 void blk_insert_bs(BlockBackend *blk, BlockDriverState *bs)
 {
     bdrv_ref(bs);
-    blk->root = bdrv_root_attach_child(bs, "root", &child_root);
-    blk->root->opaque = blk;
+    blk->root = bdrv_root_attach_child(bs, "root", &child_root, blk);
 
     notifier_list_notify(&blk->insert_bs_notifiers, blk);
     if (blk->public.throttle_state) {
@@ -770,11 +742,15 @@ static int blk_check_request(BlockBackend *blk, int64_t sector_num,
                                   nb_sectors * BDRV_SECTOR_SIZE);
 }
 
-static int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
-                                      unsigned int bytes, QEMUIOVector *qiov,
-                                      BdrvRequestFlags flags)
+int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
+                               unsigned int bytes, QEMUIOVector *qiov,
+                               BdrvRequestFlags flags)
 {
-    int ret = blk_check_byte_request(blk, offset, bytes);
+    int ret;
+
+    trace_blk_co_preadv(blk, blk_bs(blk), offset, bytes, flags);
+
+    ret = blk_check_byte_request(blk, offset, bytes);
     if (ret < 0) {
         return ret;
     }
@@ -787,12 +763,14 @@ static int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
     return bdrv_co_preadv(blk_bs(blk), offset, bytes, qiov, flags);
 }
 
-static int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
-                                      unsigned int bytes, QEMUIOVector *qiov,
-                                      BdrvRequestFlags flags)
+int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
+                                unsigned int bytes, QEMUIOVector *qiov,
+                                BdrvRequestFlags flags)
 {
     int ret;
 
+    trace_blk_co_pwritev(blk, blk_bs(blk), offset, bytes, flags);
+
     ret = blk_check_byte_request(blk, offset, bytes);
     if (ret < 0) {
         return ret;
@@ -885,8 +863,8 @@ int blk_pread_unthrottled(BlockBackend *blk, int64_t offset, uint8_t *buf,
     return ret;
 }
 
-int blk_write_zeroes(BlockBackend *blk, int64_t offset,
-                     int count, BdrvRequestFlags flags)
+int blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
+                      int count, BdrvRequestFlags flags)
 {
     return blk_prw(blk, offset, NULL, count, blk_write_entry,
                    flags | BDRV_REQ_ZERO_WRITE);
@@ -1001,9 +979,9 @@ static void blk_aio_write_entry(void *opaque)
     blk_aio_complete(acb);
 }
 
-BlockAIOCB *blk_aio_write_zeroes(BlockBackend *blk, int64_t offset,
-                                 int count, BdrvRequestFlags flags,
-                                 BlockCompletionFunc *cb, void *opaque)
+BlockAIOCB *blk_aio_pwrite_zeroes(BlockBackend *blk, int64_t offset,
+                                  int count, BdrvRequestFlags flags,
+                                  BlockCompletionFunc *cb, void *opaque)
 {
     return blk_aio_prwv(blk, offset, count, NULL, blk_aio_write_entry,
                         flags | BDRV_REQ_ZERO_WRITE, cb, opaque);
@@ -1492,8 +1470,8 @@ void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk,
     return qemu_aio_get(aiocb_info, blk_bs(blk), cb, opaque);
 }
 
-int coroutine_fn blk_co_write_zeroes(BlockBackend *blk, int64_t offset,
-                                     int count, BdrvRequestFlags flags)
+int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
+                                      int count, BdrvRequestFlags flags)
 {
     return blk_co_pwritev(blk, offset, count, NULL,
                           flags | BDRV_REQ_ZERO_WRITE);
@@ -1704,6 +1682,9 @@ static void blk_root_drained_begin(BdrvChild *child)
 {
     BlockBackend *blk = child->opaque;
 
+    /* Note that blk->root may not be accessible here yet if we are just
+     * attaching to a BlockDriverState that is drained. Use child instead. */
+
     if (blk->public.io_limits_disabled++ == 0) {
         throttle_group_restart_blk(blk);
     }
diff --git a/block/commit.c b/block/commit.c
index f308c8c6f0..8a00e1146c 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -36,28 +36,36 @@ typedef struct CommitBlockJob {
     BlockJob common;
     RateLimit limit;
     BlockDriverState *active;
-    BlockDriverState *top;
-    BlockDriverState *base;
+    BlockBackend *top;
+    BlockBackend *base;
     BlockdevOnError on_error;
     int base_flags;
     int orig_overlay_flags;
     char *backing_file_str;
 } CommitBlockJob;
 
-static int coroutine_fn commit_populate(BlockDriverState *bs,
-                                        BlockDriverState *base,
+static int coroutine_fn commit_populate(BlockBackend *bs, BlockBackend *base,
                                         int64_t sector_num, int nb_sectors,
                                         void *buf)
 {
     int ret = 0;
+    QEMUIOVector qiov;
+    struct iovec iov = {
+        .iov_base = buf,
+        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
+    };
 
-    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
-    if (ret) {
+    qemu_iovec_init_external(&qiov, &iov, 1);
+
+    ret = blk_co_preadv(bs, sector_num * BDRV_SECTOR_SIZE,
+                        qiov.size, &qiov, 0);
+    if (ret < 0) {
         return ret;
     }
 
-    ret = bdrv_write(base, sector_num, buf, nb_sectors);
-    if (ret) {
+    ret = blk_co_pwritev(base, sector_num * BDRV_SECTOR_SIZE,
+                         qiov.size, &qiov, 0);
+    if (ret < 0) {
         return ret;
     }
 
@@ -73,8 +81,8 @@ static void commit_complete(BlockJob *job, void *opaque)
     CommitBlockJob *s = container_of(job, CommitBlockJob, common);
     CommitCompleteData *data = opaque;
     BlockDriverState *active = s->active;
-    BlockDriverState *top = s->top;
-    BlockDriverState *base = s->base;
+    BlockDriverState *top = blk_bs(s->top);
+    BlockDriverState *base = blk_bs(s->base);
     BlockDriverState *overlay_bs;
     int ret = data->ret;
 
@@ -94,6 +102,8 @@ static void commit_complete(BlockJob *job, void *opaque)
         bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL);
     }
     g_free(s->backing_file_str);
+    blk_unref(s->top);
+    blk_unref(s->base);
     block_job_completed(&s->common, ret);
     g_free(data);
 }
@@ -102,8 +112,6 @@ static void coroutine_fn commit_run(void *opaque)
 {
     CommitBlockJob *s = opaque;
     CommitCompleteData *data;
-    BlockDriverState *top = s->top;
-    BlockDriverState *base = s->base;
     int64_t sector_num, end;
     int ret = 0;
     int n = 0;
@@ -111,27 +119,27 @@ static void coroutine_fn commit_run(void *opaque)
     int bytes_written = 0;
     int64_t base_len;
 
-    ret = s->common.len = bdrv_getlength(top);
+    ret = s->common.len = blk_getlength(s->top);
 
 
     if (s->common.len < 0) {
         goto out;
     }
 
-    ret = base_len = bdrv_getlength(base);
+    ret = base_len = blk_getlength(s->base);
     if (base_len < 0) {
         goto out;
     }
 
     if (base_len < s->common.len) {
-        ret = bdrv_truncate(base, s->common.len);
+        ret = blk_truncate(s->base, s->common.len);
         if (ret) {
             goto out;
         }
     }
 
     end = s->common.len >> BDRV_SECTOR_BITS;
-    buf = qemu_blockalign(top, COMMIT_BUFFER_SIZE);
+    buf = blk_blockalign(s->top, COMMIT_BUFFER_SIZE);
 
     for (sector_num = 0; sector_num < end; sector_num += n) {
         uint64_t delay_ns = 0;
@@ -146,7 +154,8 @@ wait:
             break;
         }
         /* Copy if allocated above the base */
-        ret = bdrv_is_allocated_above(top, base, sector_num,
+        ret = bdrv_is_allocated_above(blk_bs(s->top), blk_bs(s->base),
+                                      sector_num,
                                       COMMIT_BUFFER_SIZE / BDRV_SECTOR_SIZE,
                                       &n);
         copy = (ret == 1);
@@ -158,7 +167,7 @@ wait:
                     goto wait;
                 }
             }
-            ret = commit_populate(top, base, sector_num, n, buf);
+            ret = commit_populate(s->top, s->base, sector_num, n, buf);
             bytes_written += n * BDRV_SECTOR_SIZE;
         }
         if (ret < 0) {
@@ -253,8 +262,12 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base,
         return;
     }
 
-    s->base   = base;
-    s->top    = top;
+    s->base = blk_new();
+    blk_insert_bs(s->base, base);
+
+    s->top = blk_new();
+    blk_insert_bs(s->top, top);
+
     s->active = bs;
 
     s->base_flags          = orig_base_flags;
diff --git a/block/io.c b/block/io.c
index 60a6bd8bdb..2d832aa532 100644
--- a/block/io.c
+++ b/block/io.c
@@ -225,6 +225,34 @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs)
     assert(data.done);
 }
 
+void bdrv_drained_begin(BlockDriverState *bs)
+{
+    if (!bs->quiesce_counter++) {
+        aio_disable_external(bdrv_get_aio_context(bs));
+        bdrv_parent_drained_begin(bs);
+    }
+
+    bdrv_io_unplugged_begin(bs);
+    bdrv_drain_recurse(bs);
+    if (qemu_in_coroutine()) {
+        bdrv_co_yield_to_drain(bs);
+    } else {
+        bdrv_drain_poll(bs);
+    }
+    bdrv_io_unplugged_end(bs);
+}
+
+void bdrv_drained_end(BlockDriverState *bs)
+{
+    assert(bs->quiesce_counter > 0);
+    if (--bs->quiesce_counter > 0) {
+        return;
+    }
+
+    bdrv_parent_drained_end(bs);
+    aio_enable_external(bdrv_get_aio_context(bs));
+}
+
 /*
  * Wait for pending requests to complete on a single BlockDriverState subtree,
  * and suspend block driver's internal I/O until next request arrives.
@@ -238,26 +266,15 @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs)
  */
 void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
 {
-    bdrv_parent_drained_begin(bs);
-    bdrv_io_unplugged_begin(bs);
-    bdrv_drain_recurse(bs);
-    bdrv_co_yield_to_drain(bs);
-    bdrv_io_unplugged_end(bs);
-    bdrv_parent_drained_end(bs);
+    assert(qemu_in_coroutine());
+    bdrv_drained_begin(bs);
+    bdrv_drained_end(bs);
 }
 
 void bdrv_drain(BlockDriverState *bs)
 {
-    bdrv_parent_drained_begin(bs);
-    bdrv_io_unplugged_begin(bs);
-    bdrv_drain_recurse(bs);
-    if (qemu_in_coroutine()) {
-        bdrv_co_yield_to_drain(bs);
-    } else {
-        bdrv_drain_poll(bs);
-    }
-    bdrv_io_unplugged_end(bs);
-    bdrv_parent_drained_end(bs);
+    bdrv_drained_begin(bs);
+    bdrv_drained_end(bs);
 }
 
 /*
@@ -271,10 +288,10 @@ void bdrv_drain_all(void)
     /* Always run first iteration so any pending completion BHs run */
     bool busy = true;
     BlockDriverState *bs;
-    BdrvNextIterator *it = NULL;
+    BdrvNextIterator it;
     GSList *aio_ctxs = NULL, *ctx;
 
-    while ((it = bdrv_next(it, &bs))) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
         AioContext *aio_context = bdrv_get_aio_context(bs);
 
         aio_context_acquire(aio_context);
@@ -302,10 +319,9 @@ void bdrv_drain_all(void)
 
         for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
             AioContext *aio_context = ctx->data;
-            it = NULL;
 
             aio_context_acquire(aio_context);
-            while ((it = bdrv_next(it, &bs))) {
+            for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
                 if (aio_context == bdrv_get_aio_context(bs)) {
                     if (bdrv_requests_pending(bs)) {
                         busy = true;
@@ -318,8 +334,7 @@ void bdrv_drain_all(void)
         }
     }
 
-    it = NULL;
-    while ((it = bdrv_next(it, &bs))) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
         AioContext *aio_context = bdrv_get_aio_context(bs);
 
         aio_context_acquire(aio_context);
@@ -1093,24 +1108,6 @@ int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
 }
 
-int coroutine_fn bdrv_co_readv_no_serialising(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
-{
-    trace_bdrv_co_readv_no_serialising(bs, sector_num, nb_sectors);
-
-    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
-                            BDRV_REQ_NO_SERIALISING);
-}
-
-int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
-{
-    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
-
-    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
-                            BDRV_REQ_COPY_ON_READ);
-}
-
 #define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768
 
 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
@@ -2543,23 +2540,3 @@ void bdrv_io_unplugged_end(BlockDriverState *bs)
         }
     }
 }
-
-void bdrv_drained_begin(BlockDriverState *bs)
-{
-    if (!bs->quiesce_counter++) {
-        aio_disable_external(bdrv_get_aio_context(bs));
-    }
-    bdrv_parent_drained_begin(bs);
-    bdrv_drain(bs);
-}
-
-void bdrv_drained_end(BlockDriverState *bs)
-{
-    bdrv_parent_drained_end(bs);
-
-    assert(bs->quiesce_counter > 0);
-    if (--bs->quiesce_counter > 0) {
-        return;
-    }
-    aio_enable_external(bdrv_get_aio_context(bs));
-}
diff --git a/block/iscsi.c b/block/iscsi.c
index 2ca8e72967..e7d5f7b0c3 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -833,6 +833,13 @@ static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
         return &acb->common;
     }
 
+    if (acb->ioh->cmd_len > SCSI_CDB_MAX_SIZE) {
+        error_report("iSCSI: ioctl error CDB exceeds max size (%d > %d)",
+                     acb->ioh->cmd_len, SCSI_CDB_MAX_SIZE);
+        qemu_aio_unref(acb);
+        return NULL;
+    }
+
     acb->task = malloc(sizeof(struct scsi_task));
     if (acb->task == NULL) {
         error_report("iSCSI: Failed to allocate task for scsi command. %s",
diff --git a/block/mirror.c b/block/mirror.c
index b9986d8218..80fd3c7469 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -20,7 +20,6 @@
 #include "qapi/qmp/qerror.h"
 #include "qemu/ratelimit.h"
 #include "qemu/bitmap.h"
-#include "qemu/error-report.h"
 
 #define SLICE_TIME    100000000ULL /* ns */
 #define MAX_IN_FLIGHT 16
@@ -36,7 +35,7 @@ typedef struct MirrorBuffer {
 typedef struct MirrorBlockJob {
     BlockJob common;
     RateLimit limit;
-    BlockDriverState *target;
+    BlockBackend *target;
     BlockDriverState *base;
     /* The name of the graph node to replace */
     char *replaces;
@@ -157,7 +156,8 @@ static void mirror_read_complete(void *opaque, int ret)
         mirror_iteration_done(op, ret);
         return;
     }
-    bdrv_aio_writev(s->target, op->sector_num, &op->qiov, op->nb_sectors,
+    blk_aio_pwritev(s->target, op->sector_num * BDRV_SECTOR_SIZE, &op->qiov,
+                    op->nb_sectors * BDRV_SECTOR_SIZE,
                     mirror_write_complete, op);
 }
 
@@ -186,7 +186,7 @@ static int mirror_cow_align(MirrorBlockJob *s,
     need_cow |= !test_bit((*sector_num + *nb_sectors - 1) / chunk_sectors,
                           s->cow_bitmap);
     if (need_cow) {
-        bdrv_round_to_clusters(s->target, *sector_num, *nb_sectors,
+        bdrv_round_to_clusters(blk_bs(s->target), *sector_num, *nb_sectors,
                                &align_sector_num, &align_nb_sectors);
     }
 
@@ -224,7 +224,7 @@ static inline void mirror_wait_for_io(MirrorBlockJob *s)
 static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num,
                           int nb_sectors)
 {
-    BlockDriverState *source = s->common.bs;
+    BlockBackend *source = s->common.blk;
     int sectors_per_chunk, nb_chunks;
     int ret = nb_sectors;
     MirrorOp *op;
@@ -274,7 +274,8 @@ static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num,
     s->sectors_in_flight += nb_sectors;
     trace_mirror_one_iteration(s, sector_num, nb_sectors);
 
-    bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
+    blk_aio_preadv(source, sector_num * BDRV_SECTOR_SIZE, &op->qiov,
+                   nb_sectors * BDRV_SECTOR_SIZE,
                    mirror_read_complete, op);
     return ret;
 }
@@ -296,10 +297,11 @@ static void mirror_do_zero_or_discard(MirrorBlockJob *s,
     s->in_flight++;
     s->sectors_in_flight += nb_sectors;
     if (is_discard) {
-        bdrv_aio_discard(s->target, sector_num, op->nb_sectors,
-                         mirror_write_complete, op);
+        blk_aio_discard(s->target, sector_num, op->nb_sectors,
+                        mirror_write_complete, op);
     } else {
-        bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors,
+        blk_aio_pwrite_zeroes(s->target, sector_num * BDRV_SECTOR_SIZE,
+                              op->nb_sectors * BDRV_SECTOR_SIZE,
                               s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
                               mirror_write_complete, op);
     }
@@ -307,7 +309,7 @@ static void mirror_do_zero_or_discard(MirrorBlockJob *s,
 
 static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
 {
-    BlockDriverState *source = s->common.bs;
+    BlockDriverState *source = blk_bs(s->common.blk);
     int64_t sector_num, first_chunk;
     uint64_t delay_ns = 0;
     /* At least the first dirty chunk is mirrored in one iteration. */
@@ -384,7 +386,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
         } else if (ret >= 0 && !(ret & BDRV_BLOCK_DATA)) {
             int64_t target_sector_num;
             int target_nb_sectors;
-            bdrv_round_to_clusters(s->target, sector_num, io_sectors,
+            bdrv_round_to_clusters(blk_bs(s->target), sector_num, io_sectors,
                                    &target_sector_num, &target_nb_sectors);
             if (target_sector_num == sector_num &&
                 target_nb_sectors == io_sectors) {
@@ -449,7 +451,8 @@ static void mirror_exit(BlockJob *job, void *opaque)
     MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
     MirrorExitData *data = opaque;
     AioContext *replace_aio_context = NULL;
-    BlockDriverState *src = s->common.bs;
+    BlockDriverState *src = blk_bs(s->common.blk);
+    BlockDriverState *target_bs = blk_bs(s->target);
 
     /* Make sure that the source BDS doesn't go away before we called
      * block_job_completed(). */
@@ -461,26 +464,25 @@ static void mirror_exit(BlockJob *job, void *opaque)
     }
 
     if (s->should_complete && data->ret == 0) {
-        BlockDriverState *to_replace = s->common.bs;
+        BlockDriverState *to_replace = src;
         if (s->to_replace) {
             to_replace = s->to_replace;
         }
 
-        /* This was checked in mirror_start_job(), but meanwhile one of the
-         * nodes could have been newly attached to a BlockBackend. */
-        if (bdrv_has_blk(to_replace) && bdrv_has_blk(s->target)) {
-            error_report("block job: Can't create node with two BlockBackends");
-            data->ret = -EINVAL;
-            goto out;
+        if (bdrv_get_flags(target_bs) != bdrv_get_flags(to_replace)) {
+            bdrv_reopen(target_bs, bdrv_get_flags(to_replace), NULL);
         }
 
-        if (bdrv_get_flags(s->target) != bdrv_get_flags(to_replace)) {
-            bdrv_reopen(s->target, bdrv_get_flags(to_replace), NULL);
-        }
-        bdrv_replace_in_backing_chain(to_replace, s->target);
-    }
+        /* The mirror job has no requests in flight any more, but we need to
+         * drain potential other users of the BDS before changing the graph. */
+        bdrv_drained_begin(target_bs);
+        bdrv_replace_in_backing_chain(to_replace, target_bs);
+        bdrv_drained_end(target_bs);
 
-out:
+        /* We just changed the BDS the job BB refers to */
+        blk_remove_bs(job->blk);
+        blk_insert_bs(job->blk, src);
+    }
     if (s->to_replace) {
         bdrv_op_unblock_all(s->to_replace, s->replace_blocker);
         error_free(s->replace_blocker);
@@ -490,8 +492,8 @@ out:
         aio_context_release(replace_aio_context);
     }
     g_free(s->replaces);
-    bdrv_op_unblock_all(s->target, s->common.blocker);
-    bdrv_unref(s->target);
+    bdrv_op_unblock_all(target_bs, s->common.blocker);
+    blk_unref(s->target);
     block_job_completed(&s->common, data->ret);
     g_free(data);
     bdrv_drained_end(src);
@@ -505,7 +507,8 @@ static void coroutine_fn mirror_run(void *opaque)
 {
     MirrorBlockJob *s = opaque;
     MirrorExitData *data;
-    BlockDriverState *bs = s->common.bs;
+    BlockDriverState *bs = blk_bs(s->common.blk);
+    BlockDriverState *target_bs = blk_bs(s->target);
     int64_t sector_num, end, length;
     uint64_t last_pause_ns;
     BlockDriverInfo bdi;
@@ -541,18 +544,18 @@ static void coroutine_fn mirror_run(void *opaque)
      * the destination do COW.  Instead, we copy sectors around the
      * dirty data if needed.  We need a bitmap to do that.
      */
-    bdrv_get_backing_filename(s->target, backing_filename,
+    bdrv_get_backing_filename(target_bs, backing_filename,
                               sizeof(backing_filename));
-    if (!bdrv_get_info(s->target, &bdi) && bdi.cluster_size) {
+    if (!bdrv_get_info(target_bs, &bdi) && bdi.cluster_size) {
         target_cluster_size = bdi.cluster_size;
     }
-    if (backing_filename[0] && !s->target->backing
+    if (backing_filename[0] && !target_bs->backing
         && s->granularity < target_cluster_size) {
         s->buf_size = MAX(s->buf_size, target_cluster_size);
         s->cow_bitmap = bitmap_new(length);
     }
     s->target_cluster_sectors = target_cluster_size >> BDRV_SECTOR_BITS;
-    s->max_iov = MIN(s->common.bs->bl.max_iov, s->target->bl.max_iov);
+    s->max_iov = MIN(bs->bl.max_iov, target_bs->bl.max_iov);
 
     end = s->bdev_length / BDRV_SECTOR_SIZE;
     s->buf = qemu_try_blockalign(bs, s->buf_size);
@@ -567,7 +570,7 @@ static void coroutine_fn mirror_run(void *opaque)
     if (!s->is_none_mode) {
         /* First part, loop on the sectors and initialize the dirty bitmap.  */
         BlockDriverState *base = s->base;
-        bool mark_all_dirty = s->base == NULL && !bdrv_has_zero_init(s->target);
+        bool mark_all_dirty = s->base == NULL && !bdrv_has_zero_init(target_bs);
 
         for (sector_num = 0; sector_num < end; ) {
             /* Just to make sure we are not exceeding int limit. */
@@ -637,7 +640,7 @@ static void coroutine_fn mirror_run(void *opaque)
         should_complete = false;
         if (s->in_flight == 0 && cnt == 0) {
             trace_mirror_before_flush(s);
-            ret = bdrv_flush(s->target);
+            ret = blk_flush(s->target);
             if (ret < 0) {
                 if (mirror_error_action(s, false, -ret) ==
                     BLOCK_ERROR_ACTION_REPORT) {
@@ -715,7 +718,7 @@ immediate_exit:
     data->ret = ret;
     /* Before we switch to target in mirror_exit, make sure data doesn't
      * change. */
-    bdrv_drained_begin(s->common.bs);
+    bdrv_drained_begin(bs);
     if (qemu_get_aio_context() == bdrv_get_aio_context(bs)) {
         /* FIXME: virtio host notifiers run on iohandler_ctx, therefore the
          * above bdrv_drained_end isn't enough to quiesce it. This is ugly, we
@@ -742,7 +745,8 @@ static void mirror_complete(BlockJob *job, Error **errp)
     Error *local_err = NULL;
     int ret;
 
-    ret = bdrv_open_backing_file(s->target, NULL, "backing", &local_err);
+    ret = bdrv_open_backing_file(blk_bs(s->target), NULL, "backing",
+                                 &local_err);
     if (ret < 0) {
         error_propagate(errp, local_err);
         return;
@@ -804,7 +808,6 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
                              bool is_none_mode, BlockDriverState *base)
 {
     MirrorBlockJob *s;
-    BlockDriverState *replaced_bs;
 
     if (granularity == 0) {
         granularity = bdrv_get_default_bitmap_granularity(target);
@@ -821,30 +824,17 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
         buf_size = DEFAULT_MIRROR_BUF_SIZE;
     }
 
-    /* We can't support this case as long as the block layer can't handle
-     * multiple BlockBackends per BlockDriverState. */
-    if (replaces) {
-        replaced_bs = bdrv_lookup_bs(replaces, replaces, errp);
-        if (replaced_bs == NULL) {
-            return;
-        }
-    } else {
-        replaced_bs = bs;
-    }
-    if (bdrv_has_blk(replaced_bs) && bdrv_has_blk(target)) {
-        error_setg(errp, "Can't create node with two BlockBackends");
-        return;
-    }
-
     s = block_job_create(driver, bs, speed, cb, opaque, errp);
     if (!s) {
         return;
     }
 
+    s->target = blk_new();
+    blk_insert_bs(s->target, target);
+
     s->replaces = g_strdup(replaces);
     s->on_source_error = on_source_error;
     s->on_target_error = on_target_error;
-    s->target = target;
     s->is_none_mode = is_none_mode;
     s->base = base;
     s->granularity = granularity;
@@ -854,11 +844,12 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
     s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
     if (!s->dirty_bitmap) {
         g_free(s->replaces);
+        blk_unref(s->target);
         block_job_unref(&s->common);
         return;
     }
 
-    bdrv_op_block_all(s->target, s->common.blocker);
+    bdrv_op_block_all(target, s->common.blocker);
 
     s->common.co = qemu_coroutine_create(mirror_run);
     trace_mirror_start(bs, s, s->common.co, opaque);
@@ -931,7 +922,6 @@ void commit_active_start(BlockDriverState *bs, BlockDriverState *base,
         }
     }
 
-    bdrv_ref(base);
     mirror_start_job(bs, base, NULL, speed, 0, 0,
                      on_error, on_error, false, cb, opaque, &local_err,
                      &commit_active_job_driver, false, base);
diff --git a/block/parallels.c b/block/parallels.c
index 88cfacebe3..99fc0f77ef 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -517,8 +517,8 @@ static int parallels_create(const char *filename, QemuOpts *opts, Error **errp)
     if (ret < 0) {
         goto exit;
     }
-    ret = blk_write_zeroes(file, BDRV_SECTOR_SIZE,
-                           (bat_sectors - 1) << BDRV_SECTOR_BITS, 0);
+    ret = blk_pwrite_zeroes(file, BDRV_SECTOR_SIZE,
+                            (bat_sectors - 1) << BDRV_SECTOR_BITS, 0);
     if (ret < 0) {
         goto exit;
     }
diff --git a/block/snapshot.c b/block/snapshot.c
index 3917ec5c91..6e6e34fcf4 100644
--- a/block/snapshot.c
+++ b/block/snapshot.c
@@ -374,9 +374,9 @@ bool bdrv_all_can_snapshot(BlockDriverState **first_bad_bs)
 {
     bool ok = true;
     BlockDriverState *bs;
-    BdrvNextIterator *it = NULL;
+    BdrvNextIterator it;
 
-    while (ok && (it = bdrv_next(it, &bs))) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
         AioContext *ctx = bdrv_get_aio_context(bs);
 
         aio_context_acquire(ctx);
@@ -384,8 +384,12 @@ bool bdrv_all_can_snapshot(BlockDriverState **first_bad_bs)
             ok = bdrv_can_snapshot(bs);
         }
         aio_context_release(ctx);
+        if (!ok) {
+            goto fail;
+        }
     }
 
+fail:
     *first_bad_bs = bs;
     return ok;
 }
@@ -395,20 +399,27 @@ int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs,
 {
     int ret = 0;
     BlockDriverState *bs;
-    BdrvNextIterator *it = NULL;
+    BdrvNextIterator it;
     QEMUSnapshotInfo sn1, *snapshot = &sn1;
 
-    while (ret == 0 && (it = bdrv_next(it, &bs))) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
         AioContext *ctx = bdrv_get_aio_context(bs);
 
         aio_context_acquire(ctx);
         if (bdrv_can_snapshot(bs) &&
                 bdrv_snapshot_find(bs, snapshot, name) >= 0) {
             ret = bdrv_snapshot_delete_by_id_or_name(bs, name, err);
+            /* Do not jump to fail from here: ctx is still acquired.  The
+             * ret < 0 check after aio_context_release() handles the error.
+             */
         }
         aio_context_release(ctx);
+        if (ret < 0) {
+            goto fail;
+        }
     }
 
+fail:
     *first_bad_bs = bs;
     return ret;
 }
@@ -418,9 +429,9 @@ int bdrv_all_goto_snapshot(const char *name, BlockDriverState **first_bad_bs)
 {
     int err = 0;
     BlockDriverState *bs;
-    BdrvNextIterator *it = NULL;
+    BdrvNextIterator it;
 
-    while (err == 0 && (it = bdrv_next(it, &bs))) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
         AioContext *ctx = bdrv_get_aio_context(bs);
 
         aio_context_acquire(ctx);
@@ -428,8 +439,12 @@ int bdrv_all_goto_snapshot(const char *name, BlockDriverState **first_bad_bs)
             err = bdrv_snapshot_goto(bs, name);
         }
         aio_context_release(ctx);
+        if (err < 0) {
+            goto fail;
+        }
     }
 
+fail:
     *first_bad_bs = bs;
     return err;
 }
@@ -439,9 +454,9 @@ int bdrv_all_find_snapshot(const char *name, BlockDriverState **first_bad_bs)
     QEMUSnapshotInfo sn;
     int err = 0;
     BlockDriverState *bs;
-    BdrvNextIterator *it = NULL;
+    BdrvNextIterator it;
 
-    while (err == 0 && (it = bdrv_next(it, &bs))) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
         AioContext *ctx = bdrv_get_aio_context(bs);
 
         aio_context_acquire(ctx);
@@ -449,8 +464,12 @@ int bdrv_all_find_snapshot(const char *name, BlockDriverState **first_bad_bs)
             err = bdrv_snapshot_find(bs, &sn, name);
         }
         aio_context_release(ctx);
+        if (err < 0) {
+            goto fail;
+        }
     }
 
+fail:
     *first_bad_bs = bs;
     return err;
 }
@@ -462,9 +481,9 @@ int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn,
 {
     int err = 0;
     BlockDriverState *bs;
-    BdrvNextIterator *it = NULL;
+    BdrvNextIterator it;
 
-    while (err == 0 && (it = bdrv_next(it, &bs))) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
         AioContext *ctx = bdrv_get_aio_context(bs);
 
         aio_context_acquire(ctx);
@@ -476,24 +495,32 @@ int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn,
             err = bdrv_snapshot_create(bs, sn);
         }
         aio_context_release(ctx);
+        if (err < 0) {
+            goto fail;
+        }
     }
 
+fail:
     *first_bad_bs = bs;
     return err;
 }
 
 BlockDriverState *bdrv_all_find_vmstate_bs(void)
 {
-    bool not_found = true;
     BlockDriverState *bs;
-    BdrvNextIterator *it = NULL;
+    BdrvNextIterator it;
 
-    while (not_found && (it = bdrv_next(it, &bs))) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
         AioContext *ctx = bdrv_get_aio_context(bs);
+        bool found;
 
         aio_context_acquire(ctx);
-        not_found = !bdrv_can_snapshot(bs);
+        found = bdrv_can_snapshot(bs);
         aio_context_release(ctx);
+
+        if (found) {
+            break;
+        }
     }
     return bs;
 }
diff --git a/block/stream.c b/block/stream.c
index 40aa32212e..c0efbda34e 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -39,7 +39,7 @@ typedef struct StreamBlockJob {
     char *backing_file_str;
 } StreamBlockJob;
 
-static int coroutine_fn stream_populate(BlockDriverState *bs,
+static int coroutine_fn stream_populate(BlockBackend *blk,
                                         int64_t sector_num, int nb_sectors,
                                         void *buf)
 {
@@ -52,7 +52,8 @@ static int coroutine_fn stream_populate(BlockDriverState *bs,
     qemu_iovec_init_external(&qiov, &iov, 1);
 
     /* Copy-on-read the unallocated clusters */
-    return bdrv_co_copy_on_readv(bs, sector_num, nb_sectors, &qiov);
+    return blk_co_preadv(blk, sector_num * BDRV_SECTOR_SIZE, qiov.size, &qiov,
+                         BDRV_REQ_COPY_ON_READ);
 }
 
 typedef struct {
@@ -64,6 +65,7 @@ static void stream_complete(BlockJob *job, void *opaque)
 {
     StreamBlockJob *s = container_of(job, StreamBlockJob, common);
     StreamCompleteData *data = opaque;
+    BlockDriverState *bs = blk_bs(job->blk);
     BlockDriverState *base = s->base;
 
     if (!block_job_is_cancelled(&s->common) && data->reached_end &&
@@ -75,8 +77,8 @@ static void stream_complete(BlockJob *job, void *opaque)
                 base_fmt = base->drv->format_name;
             }
         }
-        data->ret = bdrv_change_backing_file(job->bs, base_id, base_fmt);
-        bdrv_set_backing_hd(job->bs, base);
+        data->ret = bdrv_change_backing_file(bs, base_id, base_fmt);
+        bdrv_set_backing_hd(bs, base);
     }
 
     g_free(s->backing_file_str);
@@ -88,7 +90,8 @@ static void coroutine_fn stream_run(void *opaque)
 {
     StreamBlockJob *s = opaque;
     StreamCompleteData *data;
-    BlockDriverState *bs = s->common.bs;
+    BlockBackend *blk = s->common.blk;
+    BlockDriverState *bs = blk_bs(blk);
     BlockDriverState *base = s->base;
     int64_t sector_num = 0;
     int64_t end = -1;
@@ -159,7 +162,7 @@ wait:
                     goto wait;
                 }
             }
-            ret = stream_populate(bs, sector_num, n, buf);
+            ret = stream_populate(blk, sector_num, n, buf);
         }
         if (ret < 0) {
             BlockErrorAction action =
diff --git a/block/vvfat.c b/block/vvfat.c
index 3e484a1dcc..a39dbe67e2 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -2998,12 +2998,12 @@ static int enable_write_target(BDRVVVFATState *s, Error **errp)
         goto err;
     }
 
-    s->qcow = NULL;
     options = qdict_new();
     qdict_put(options, "driver", qstring_from_str("qcow"));
-    ret = bdrv_open(&s->qcow, s->qcow_filename, NULL, options,
-                    BDRV_O_RDWR | BDRV_O_NO_FLUSH, errp);
-    if (ret < 0) {
+    s->qcow = bdrv_open(s->qcow_filename, NULL, options,
+                        BDRV_O_RDWR | BDRV_O_NO_FLUSH, errp);
+    if (!s->qcow) {
+        ret = -EINVAL;
         goto err;
     }
 
diff --git a/blockdev.c b/blockdev.c
index 40e4e6fc6f..717785eb8d 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -567,11 +567,7 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts,
     if ((!file || !*file) && !qdict_size(bs_opts)) {
         BlockBackendRootState *blk_rs;
 
-        blk = blk_new(errp);
-        if (!blk) {
-            goto early_err;
-        }
-
+        blk = blk_new();
         blk_rs = blk_get_root_state(blk);
         blk_rs->open_flags    = bdrv_flags;
         blk_rs->read_only     = !(bdrv_flags & BDRV_O_RDWR);
@@ -657,7 +653,6 @@ static BlockDriverState *bds_tree_init(QDict *bs_opts, Error **errp)
     QemuOpts *opts;
     Error *local_error = NULL;
     BlockdevDetectZeroesOptions detect_zeroes;
-    int ret;
     int bdrv_flags = 0;
 
     opts = qemu_opts_create(&qemu_root_bds_opts, NULL, 1, errp);
@@ -688,9 +683,8 @@ static BlockDriverState *bds_tree_init(QDict *bs_opts, Error **errp)
         bdrv_flags |= BDRV_O_INACTIVE;
     }
 
-    bs = NULL;
-    ret = bdrv_open(&bs, NULL, NULL, bs_opts, bdrv_flags, errp);
-    if (ret < 0) {
+    bs = bdrv_open(NULL, NULL, bs_opts, bdrv_flags, errp);
+    if (!bs) {
         goto fail_no_bs_opts;
     }
 
@@ -1643,7 +1637,7 @@ typedef struct ExternalSnapshotState {
 static void external_snapshot_prepare(BlkActionState *common,
                                       Error **errp)
 {
-    int flags = 0, ret;
+    int flags = 0;
     QDict *options = NULL;
     Error *local_err = NULL;
     /* Device and node name of the image to generate the snapshot from */
@@ -1768,11 +1762,10 @@ static void external_snapshot_prepare(BlkActionState *common,
         flags |= BDRV_O_NO_BACKING;
     }
 
-    assert(state->new_bs == NULL);
-    ret = bdrv_open(&state->new_bs, new_image_file, snapshot_ref, options,
-                    flags, errp);
+    state->new_bs = bdrv_open(new_image_file, snapshot_ref, options, flags,
+                              errp);
     /* We will manually add the backing_hd field to the bs later */
-    if (ret != 0) {
+    if (!state->new_bs) {
         return;
     }
 
@@ -2540,7 +2533,7 @@ void qmp_blockdev_change_medium(const char *device, const char *filename,
 {
     BlockBackend *blk;
     BlockDriverState *medium_bs = NULL;
-    int bdrv_flags, ret;
+    int bdrv_flags;
     QDict *options = NULL;
     Error *err = NULL;
 
@@ -2584,9 +2577,8 @@ void qmp_blockdev_change_medium(const char *device, const char *filename,
         qdict_put(options, "driver", qstring_from_str(format));
     }
 
-    assert(!medium_bs);
-    ret = bdrv_open(&medium_bs, filename, NULL, options, bdrv_flags, errp);
-    if (ret < 0) {
+    medium_bs = bdrv_open(filename, NULL, options, bdrv_flags, errp);
+    if (!medium_bs) {
         goto fail;
     }
 
@@ -3199,7 +3191,6 @@ static void do_drive_backup(const char *device, const char *target,
     Error *local_err = NULL;
     int flags;
     int64_t size;
-    int ret;
 
     if (!has_speed) {
         speed = 0;
@@ -3283,10 +3274,8 @@ static void do_drive_backup(const char *device, const char *target,
         qdict_put(options, "driver", qstring_from_str(format));
     }
 
-    target_bs = NULL;
-    ret = bdrv_open(&target_bs, target, NULL, options, flags, &local_err);
-    if (ret < 0) {
-        error_propagate(errp, local_err);
+    target_bs = bdrv_open(target, NULL, options, flags, errp);
+    if (!target_bs) {
         goto out;
     }
 
@@ -3304,8 +3293,8 @@ static void do_drive_backup(const char *device, const char *target,
     backup_start(bs, target_bs, speed, sync, bmap,
                  on_source_error, on_target_error,
                  block_job_cb, bs, txn, &local_err);
+    bdrv_unref(target_bs);
     if (local_err != NULL) {
-        bdrv_unref(target_bs);
         error_propagate(errp, local_err);
         goto out;
     }
@@ -3389,12 +3378,10 @@ void do_blockdev_backup(const char *device, const char *target,
     }
     target_bs = blk_bs(target_blk);
 
-    bdrv_ref(target_bs);
     bdrv_set_aio_context(target_bs, aio_context);
     backup_start(bs, target_bs, speed, sync, NULL, on_source_error,
                  on_target_error, block_job_cb, bs, txn, &local_err);
     if (local_err != NULL) {
-        bdrv_unref(target_bs);
         error_propagate(errp, local_err);
     }
 out:
@@ -3470,10 +3457,6 @@ static void blockdev_mirror_common(BlockDriverState *bs,
     if (bdrv_op_is_blocked(target, BLOCK_OP_TYPE_MIRROR_TARGET, errp)) {
         return;
     }
-    if (bdrv_has_blk(target)) {
-        error_setg(errp, "Cannot mirror to an attached block device");
-        return;
-    }
 
     if (!bs->backing && sync == MIRROR_SYNC_MODE_TOP) {
         sync = MIRROR_SYNC_MODE_FULL;
@@ -3511,7 +3494,6 @@ void qmp_drive_mirror(const char *device, const char *target,
     QDict *options = NULL;
     int flags;
     int64_t size;
-    int ret;
 
     blk = blk_by_name(device);
     if (!blk) {
@@ -3620,11 +3602,9 @@ void qmp_drive_mirror(const char *device, const char *target,
     /* Mirroring takes care of copy-on-write using the source's backing
      * file.
      */
-    target_bs = NULL;
-    ret = bdrv_open(&target_bs, target, NULL, options,
-                    flags | BDRV_O_NO_BACKING, &local_err);
-    if (ret < 0) {
-        error_propagate(errp, local_err);
+    target_bs = bdrv_open(target, NULL, options, flags | BDRV_O_NO_BACKING,
+                          errp);
+    if (!target_bs) {
         goto out;
     }
 
@@ -3639,9 +3619,9 @@ void qmp_drive_mirror(const char *device, const char *target,
                            has_on_target_error, on_target_error,
                            has_unmap, unmap,
                            &local_err);
+    bdrv_unref(target_bs);
     if (local_err) {
         error_propagate(errp, local_err);
-        bdrv_unref(target_bs);
     }
 out:
     aio_context_release(aio_context);
@@ -3685,7 +3665,6 @@ void qmp_blockdev_mirror(const char *device, const char *target,
     aio_context = bdrv_get_aio_context(bs);
     aio_context_acquire(aio_context);
 
-    bdrv_ref(target_bs);
     bdrv_set_aio_context(target_bs, aio_context);
 
     blockdev_mirror_common(bs, target_bs,
@@ -3699,7 +3678,6 @@ void qmp_blockdev_mirror(const char *device, const char *target,
                            &local_err);
     if (local_err) {
         error_propagate(errp, local_err);
-        bdrv_unref(target_bs);
     }
 
     aio_context_release(aio_context);
@@ -4164,9 +4142,9 @@ BlockJobInfoList *qmp_query_block_jobs(Error **errp)
 {
     BlockJobInfoList *head = NULL, **p_next = &head;
     BlockDriverState *bs;
-    BdrvNextIterator *it = NULL;
+    BdrvNextIterator it;
 
-    while ((it = bdrv_next(it, &bs))) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
         AioContext *aio_context = bdrv_get_aio_context(bs);
 
         aio_context_acquire(aio_context);
diff --git a/blockjob.c b/blockjob.c
index 5b840a7df6..c095cc57cb 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -50,17 +50,31 @@ struct BlockJobTxn {
     int refcnt;
 };
 
+static QLIST_HEAD(, BlockJob) block_jobs = QLIST_HEAD_INITIALIZER(block_jobs);
+
+/* Step through block_jobs: NULL yields the head, else job's successor. */
+BlockJob *block_job_next(BlockJob *job)
+{
+    BlockJob *next = job ? QLIST_NEXT(job, job_list)
+                         : QLIST_FIRST(&block_jobs);
+    return next;
+}
+
 void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs,
                        int64_t speed, BlockCompletionFunc *cb,
                        void *opaque, Error **errp)
 {
+    BlockBackend *blk;
     BlockJob *job;
 
     if (bs->job) {
         error_setg(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs));
         return NULL;
     }
-    bdrv_ref(bs);
+
+    blk = blk_new();
+    blk_insert_bs(blk, bs);
+
     job = g_malloc0(driver->instance_size);
     error_setg(&job->blocker, "block device is in use by block job: %s",
                BlockJobType_lookup[driver->job_type]);
@@ -69,13 +83,15 @@ void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs,
 
     job->driver        = driver;
     job->id            = g_strdup(bdrv_get_device_name(bs));
-    job->bs            = bs;
+    job->blk           = blk;
     job->cb            = cb;
     job->opaque        = opaque;
     job->busy          = true;
     job->refcnt        = 1;
     bs->job = job;
 
+    QLIST_INSERT_HEAD(&block_jobs, job, job_list);
+
     /* Only set speed when necessary to avoid NotSupported error */
     if (speed != 0) {
         Error *local_err = NULL;
@@ -98,11 +114,13 @@ void block_job_ref(BlockJob *job)
 void block_job_unref(BlockJob *job)
 {
     if (--job->refcnt == 0) {
-        job->bs->job = NULL;
-        bdrv_op_unblock_all(job->bs, job->blocker);
-        bdrv_unref(job->bs);
+        BlockDriverState *bs = blk_bs(job->blk);
+        bs->job = NULL;
+        bdrv_op_unblock_all(bs, job->blocker);
+        blk_unref(job->blk);
         error_free(job->blocker);
         g_free(job->id);
+        QLIST_REMOVE(job, job_list);
         g_free(job);
     }
 }
@@ -140,7 +158,7 @@ static void block_job_completed_txn_abort(BlockJob *job)
     txn->aborting = true;
     /* We are the first failed job. Cancel other jobs. */
     QLIST_FOREACH(other_job, &txn->jobs, txn_list) {
-        ctx = bdrv_get_aio_context(other_job->bs);
+        ctx = blk_get_aio_context(other_job->blk);
         aio_context_acquire(ctx);
     }
     QLIST_FOREACH(other_job, &txn->jobs, txn_list) {
@@ -157,7 +175,7 @@ static void block_job_completed_txn_abort(BlockJob *job)
         assert(other_job->completed);
     }
     QLIST_FOREACH_SAFE(other_job, &txn->jobs, txn_list, next) {
-        ctx = bdrv_get_aio_context(other_job->bs);
+        ctx = blk_get_aio_context(other_job->blk);
         block_job_completed_single(other_job);
         aio_context_release(ctx);
     }
@@ -179,7 +197,7 @@ static void block_job_completed_txn_success(BlockJob *job)
     }
     /* We are the last completed job, commit the transaction. */
     QLIST_FOREACH_SAFE(other_job, &txn->jobs, txn_list, next) {
-        ctx = bdrv_get_aio_context(other_job->bs);
+        ctx = blk_get_aio_context(other_job->blk);
         aio_context_acquire(ctx);
         assert(other_job->ret == 0);
         block_job_completed_single(other_job);
@@ -189,9 +207,7 @@ static void block_job_completed_txn_success(BlockJob *job)
 
 void block_job_completed(BlockJob *job, int ret)
 {
-    BlockDriverState *bs = job->bs;
-
-    assert(bs->job == job);
+    assert(blk_bs(job->blk)->job == job);
     assert(!job->completed);
     job->completed = true;
     job->ret = ret;
@@ -282,11 +298,10 @@ static int block_job_finish_sync(BlockJob *job,
                                  void (*finish)(BlockJob *, Error **errp),
                                  Error **errp)
 {
-    BlockDriverState *bs = job->bs;
     Error *local_err = NULL;
     int ret;
 
-    assert(bs->job == job);
+    assert(blk_bs(job->blk)->job == job);
 
     block_job_ref(job);
     finish(job, &local_err);
@@ -297,7 +312,7 @@ static int block_job_finish_sync(BlockJob *job,
     }
     while (!job->completed) {
         aio_poll(job->deferred_to_main_loop ? qemu_get_aio_context() :
-                                              bdrv_get_aio_context(bs),
+                                              blk_get_aio_context(job->blk),
                  true);
     }
     ret = (job->cancelled && job->ret == 0) ? -ECANCELED : job->ret;
@@ -318,6 +333,19 @@ int block_job_cancel_sync(BlockJob *job)
     return block_job_finish_sync(job, &block_job_cancel_err, NULL);
 }
 
+/* Cancel jobs synchronously, always taking the current head of block_jobs. */
+void block_job_cancel_sync_all(void)
+{
+    BlockJob *head;
+
+    while ((head = QLIST_FIRST(&block_jobs)) != NULL) {
+        AioContext *ctx = blk_get_aio_context(head->blk);
+        aio_context_acquire(ctx);
+        block_job_cancel_sync(head);
+        aio_context_release(ctx);
+    }
+}
+
 int block_job_complete_sync(BlockJob *job, Error **errp)
 {
     return block_job_finish_sync(job, &block_job_complete, errp);
@@ -336,7 +364,7 @@ void block_job_sleep_ns(BlockJob *job, QEMUClockType type, int64_t ns)
     if (block_job_is_paused(job)) {
         qemu_coroutine_yield();
     } else {
-        co_aio_sleep_ns(bdrv_get_aio_context(job->bs), type, ns);
+        co_aio_sleep_ns(blk_get_aio_context(job->blk), type, ns);
     }
     job->busy = true;
 }
@@ -465,7 +493,7 @@ static void block_job_defer_to_main_loop_bh(void *opaque)
     aio_context_acquire(data->aio_context);
 
     /* Fetch BDS AioContext again, in case it has changed */
-    aio_context = bdrv_get_aio_context(data->job->bs);
+    aio_context = blk_get_aio_context(data->job->blk);
     aio_context_acquire(aio_context);
 
     data->job->deferred_to_main_loop = false;
@@ -485,7 +513,7 @@ void block_job_defer_to_main_loop(BlockJob *job,
     BlockJobDeferToMainLoopData *data = g_malloc(sizeof(*data));
     data->job = job;
     data->bh = qemu_bh_new(block_job_defer_to_main_loop_bh, data);
-    data->aio_context = bdrv_get_aio_context(job->bs);
+    data->aio_context = blk_get_aio_context(job->blk);
     data->fn = fn;
     data->opaque = opaque;
     job->deferred_to_main_loop = true;
diff --git a/cpu-exec.c b/cpu-exec.c
index 602d0c4d0c..f7c642f4a9 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -345,6 +345,15 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
         *last_tb = NULL;
         cpu->tb_flushed = false;
     }
+#ifndef CONFIG_USER_ONLY
+    /* We don't take care of direct jumps when address mapping changes in
+     * system emulation. So it's not safe to make a direct jump to a TB
+     * spanning two pages because the mapping for the second page can change.
+     */
+    if (tb->page_addr[1] != -1) {
+        *last_tb = NULL;
+    }
+#endif
     /* See if we can patch the calling TB. */
     if (*last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
         tb_add_jump(*last_tb, tb_exit, tb);
diff --git a/cpus.c b/cpus.c
index e75895a458..326742f445 100644
--- a/cpus.c
+++ b/cpus.c
@@ -972,6 +972,18 @@ void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
     qemu_cpu_kick(cpu);
 }
 
+static void qemu_kvm_destroy_vcpu(CPUState *cpu)
+{
+    if (kvm_destroy_vcpu(cpu) < 0) { /* negative return is treated as fatal */
+        error_report("kvm_destroy_vcpu failed");
+        exit(EXIT_FAILURE);
+    }
+}
+
+static void qemu_tcg_destroy_vcpu(CPUState *cpu)
+{ /* Empty body: no per-vCPU teardown is performed here. */
+}
+
 static void flush_queued_work(CPUState *cpu)
 {
     struct qemu_work_item *wi;
@@ -1061,7 +1073,7 @@ static void *qemu_kvm_cpu_thread_fn(void *arg)
     cpu->created = true;
     qemu_cond_signal(&qemu_cpu_cond);
 
-    while (1) {
+    do {
         if (cpu_can_run(cpu)) {
             r = kvm_cpu_exec(cpu);
             if (r == EXCP_DEBUG) {
@@ -1069,8 +1081,12 @@ static void *qemu_kvm_cpu_thread_fn(void *arg)
             }
         }
         qemu_kvm_wait_io_event(cpu);
-    }
+    } while (!cpu->unplug || cpu_can_run(cpu));
 
+    qemu_kvm_destroy_vcpu(cpu);
+    cpu->created = false;
+    qemu_cond_signal(&qemu_cpu_cond);
+    qemu_mutex_unlock_iothread();
     return NULL;
 }
 
@@ -1124,6 +1140,7 @@ static void tcg_exec_all(void);
 static void *qemu_tcg_cpu_thread_fn(void *arg)
 {
     CPUState *cpu = arg;
+    CPUState *remove_cpu = NULL;
 
     rcu_register_thread();
 
@@ -1161,6 +1178,18 @@ static void *qemu_tcg_cpu_thread_fn(void *arg)
             }
         }
         qemu_tcg_wait_io_event(QTAILQ_FIRST(&cpus));
+        CPU_FOREACH(cpu) {
+            if (cpu->unplug && !cpu_can_run(cpu)) {
+                remove_cpu = cpu;
+                break;
+            }
+        }
+        if (remove_cpu) {
+            qemu_tcg_destroy_vcpu(remove_cpu);
+            cpu->created = false;
+            qemu_cond_signal(&qemu_cpu_cond);
+            remove_cpu = NULL;
+        }
     }
 
     return NULL;
@@ -1317,6 +1346,21 @@ void resume_all_vcpus(void)
     }
 }
 
+void cpu_remove(CPUState *cpu)
+{
+    cpu->stop = true;
+    cpu->unplug = true; /* tested by the vCPU thread loops before teardown */
+    qemu_cpu_kick(cpu);
+}
+
+void cpu_remove_sync(CPUState *cpu)
+{
+    cpu_remove(cpu);
+    while (cpu->created) { /* the vCPU thread clears it, then signals */
+        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
+    }
+}
+
 /* For temporary buffers for forming a name */
 #define VCPU_THREAD_NAME_SIZE 16
 
@@ -1533,6 +1577,9 @@ static void tcg_exec_all(void)
                 break;
             }
         } else if (cpu->stop || cpu->stopped) {
+            if (cpu->unplug) {
+                next_cpu = CPU_NEXT(cpu);
+            }
             break;
         }
     }
diff --git a/cputlb.c b/cputlb.c
index 1ff6354b04..23c9b91200 100644
--- a/cputlb.c
+++ b/cputlb.c
@@ -246,7 +246,8 @@ static inline ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr)
 {
     ram_addr_t ram_addr;
 
-    if (qemu_ram_addr_from_host(ptr, &ram_addr) == NULL) {
+    ram_addr = qemu_ram_addr_from_host(ptr);
+    if (ram_addr == RAM_ADDR_INVALID) {
         fprintf(stderr, "Bad ram pointer %p\n", ptr);
         abort();
     }
diff --git a/dma-helpers.c b/dma-helpers.c
index a6cc15f534..b521d84ebd 100644
--- a/dma-helpers.c
+++ b/dma-helpers.c
@@ -70,7 +70,7 @@ void qemu_sglist_destroy(QEMUSGList *qsg)
 
 typedef struct {
     BlockAIOCB common;
-    BlockBackend *blk;
+    AioContext *ctx;
     BlockAIOCB *acb;
     QEMUSGList *sg;
     uint64_t offset;
@@ -80,6 +80,7 @@ typedef struct {
     QEMUIOVector iov;
     QEMUBH *bh;
     DMAIOFunc *io_func;
+    void *io_func_opaque;
 } DMAAIOCB;
 
 static void dma_blk_cb(void *opaque, int ret);
@@ -154,8 +155,7 @@ static void dma_blk_cb(void *opaque, int ret)
 
     if (dbs->iov.size == 0) {
         trace_dma_map_wait(dbs);
-        dbs->bh = aio_bh_new(blk_get_aio_context(dbs->blk),
-                             reschedule_dma, dbs);
+        dbs->bh = aio_bh_new(dbs->ctx, reschedule_dma, dbs);
         cpu_register_map_client(dbs->bh);
         return;
     }
@@ -164,8 +164,8 @@ static void dma_blk_cb(void *opaque, int ret)
         qemu_iovec_discard_back(&dbs->iov, dbs->iov.size & ~BDRV_SECTOR_MASK);
     }
 
-    dbs->acb = dbs->io_func(dbs->blk, dbs->offset, &dbs->iov, 0,
-                            dma_blk_cb, dbs);
+    dbs->acb = dbs->io_func(dbs->offset, &dbs->iov,
+                            dma_blk_cb, dbs, dbs->io_func_opaque);
     assert(dbs->acb);
 }
 
@@ -191,23 +191,25 @@ static const AIOCBInfo dma_aiocb_info = {
     .cancel_async       = dma_aio_cancel,
 };
 
-BlockAIOCB *dma_blk_io(
-    BlockBackend *blk, QEMUSGList *sg, uint64_t sector_num,
-    DMAIOFunc *io_func, BlockCompletionFunc *cb,
+BlockAIOCB *dma_blk_io(AioContext *ctx,
+    QEMUSGList *sg, uint64_t offset,
+    DMAIOFunc *io_func, void *io_func_opaque,
+    BlockCompletionFunc *cb,
     void *opaque, DMADirection dir)
 {
-    DMAAIOCB *dbs = blk_aio_get(&dma_aiocb_info, blk, cb, opaque);
+    DMAAIOCB *dbs = qemu_aio_get(&dma_aiocb_info, NULL, cb, opaque);
 
-    trace_dma_blk_io(dbs, blk, sector_num, (dir == DMA_DIRECTION_TO_DEVICE));
+    trace_dma_blk_io(dbs, io_func_opaque, offset, (dir == DMA_DIRECTION_TO_DEVICE));
 
     dbs->acb = NULL;
-    dbs->blk = blk;
     dbs->sg = sg;
-    dbs->offset = sector_num << BDRV_SECTOR_BITS;
+    dbs->ctx = ctx;
+    dbs->offset = offset;
     dbs->sg_cur_index = 0;
     dbs->sg_cur_byte = 0;
     dbs->dir = dir;
     dbs->io_func = io_func;
+    dbs->io_func_opaque = io_func_opaque;
     dbs->bh = NULL;
     qemu_iovec_init(&dbs->iov, sg->nsg);
     dma_blk_cb(dbs, 0);
@@ -215,19 +217,39 @@ BlockAIOCB *dma_blk_io(
 }
 
 
+/* DMAIOFunc adapter: opaque is the BlockBackend to read from. */
+static BlockAIOCB *dma_blk_read_io_func(int64_t offset, QEMUIOVector *iov,
+                                        BlockCompletionFunc *cb,
+                                        void *cb_opaque, void *opaque)
+{
+    /* The request is issued with flags 0. */
+    return blk_aio_preadv(opaque, offset, iov, 0, cb, cb_opaque);
+}
+
 BlockAIOCB *dma_blk_read(BlockBackend *blk,
-                         QEMUSGList *sg, uint64_t sector,
+                         QEMUSGList *sg, uint64_t offset,
                          void (*cb)(void *opaque, int ret), void *opaque)
 {
-    return dma_blk_io(blk, sg, sector, blk_aio_preadv, cb, opaque,
+    return dma_blk_io(blk_get_aio_context(blk),
+                      sg, offset, dma_blk_read_io_func, blk, cb, opaque,
                       DMA_DIRECTION_FROM_DEVICE);
 }
 
+/* DMAIOFunc adapter: opaque is the BlockBackend to write to. */
+static BlockAIOCB *dma_blk_write_io_func(int64_t offset, QEMUIOVector *iov,
+                                         BlockCompletionFunc *cb,
+                                         void *cb_opaque, void *opaque)
+{
+    /* The request is issued with flags 0. */
+    return blk_aio_pwritev(opaque, offset, iov, 0, cb, cb_opaque);
+}
+
 BlockAIOCB *dma_blk_write(BlockBackend *blk,
-                          QEMUSGList *sg, uint64_t sector,
+                          QEMUSGList *sg, uint64_t offset,
                           void (*cb)(void *opaque, int ret), void *opaque)
 {
-    return dma_blk_io(blk, sg, sector, blk_aio_pwritev, cb, opaque,
+    return dma_blk_io(blk_get_aio_context(blk),
+                      sg, offset, dma_blk_write_io_func, blk, cb, opaque,
                       DMA_DIRECTION_TO_DEVICE);
 }
 
diff --git a/docs/atomics.txt b/docs/atomics.txt
index bba771ecd6..c95950b6c5 100644
--- a/docs/atomics.txt
+++ b/docs/atomics.txt
@@ -326,21 +326,41 @@ and memory barriers, and the equivalents in QEMU:
   use a boxed atomic_t type; atomic operations in QEMU are polymorphic
   and use normal C types.
 
-- atomic_read and atomic_set in Linux give no guarantee at all;
-  atomic_read and atomic_set in QEMU include a compiler barrier
-  (similar to the READ_ONCE/WRITE_ONCE macros in Linux).
-
-- most atomic read-modify-write operations in Linux return void;
-  in QEMU, all of them return the old value of the variable.
+- Originally, atomic_read and atomic_set in Linux gave no guarantee
+  at all. Linux 4.1 updated them to implement volatile
+  semantics via ACCESS_ONCE (or the more recent READ/WRITE_ONCE).
+
+  QEMU's atomic_read/set implement, if the compiler supports it, C11
+  atomic relaxed semantics, and volatile semantics otherwise.
+  Both semantics prevent the compiler from doing certain transformations;
+  the difference is that atomic accesses are guaranteed to be atomic,
+  while volatile accesses aren't. Thus, in the volatile case we just cross
+  our fingers hoping that the compiler will generate atomic accesses,
+  since we assume the variables passed are machine-word sized and
+  properly aligned.
+  No barriers are implied by atomic_read/set in either Linux or QEMU.
+
+- atomic read-modify-write operations in Linux are of three kinds:
+
+         atomic_OP          returns void
+         atomic_OP_return   returns new value of the variable
+         atomic_fetch_OP    returns the old value of the variable
+         atomic_cmpxchg     returns the old value of the variable
+
+  In QEMU, the second kind does not exist.  Currently Linux has
+  atomic_fetch_or only.  QEMU provides and, or, inc, dec, add, sub.
 
 - different atomic read-modify-write operations in Linux imply
   a different set of memory barriers; in QEMU, all of them enforce
   sequential consistency, which means they imply full memory barriers
   before and after the operation.
 
-- Linux does not have an equivalent of atomic_mb_read() and
-  atomic_mb_set().  In particular, note that set_mb() is a little
-  weaker than atomic_mb_set().
+- Linux does not have an equivalent of atomic_mb_set().  In particular,
+  note that smp_store_mb() is a little weaker than atomic_mb_set().
+  atomic_mb_read() compiles to the same instructions as Linux's
+  smp_load_acquire(), but this should be treated as an implementation
+  detail.  If required, QEMU might later add atomic_load_acquire() and
+  atomic_store_release() macros.
 
 
 SOURCES
diff --git a/docs/build-system.txt b/docs/build-system.txt
index 5ddddeaafb..2af1e668c5 100644
--- a/docs/build-system.txt
+++ b/docs/build-system.txt
@@ -438,6 +438,11 @@ top level Makefile, so anything defined in this file will influence the
 entire build system. Care needs to be taken when writing rules for tests
 to ensure they only apply to the unit test execution / build.
 
+- tests/docker/Makefile.include
+
+Rules for Docker tests. Like tests/Makefile, this file is included
+directly by the top level Makefile, so anything defined in this file will
+influence the entire build system.
 
 - po/Makefile
 
diff --git a/docs/igd-assign.txt b/docs/igd-assign.txt
new file mode 100644
index 0000000000..e17bb50789
--- /dev/null
+++ b/docs/igd-assign.txt
@@ -0,0 +1,133 @@
+Intel Graphics Device (IGD) assignment with vfio-pci
+====================================================
+
+IGD has two different modes for assignment using vfio-pci:
+
+1) Universal Pass-Through (UPT) mode:
+
+   In this mode the IGD device is added as a *secondary* (i.e. non-primary)
+   graphics device in combination with an emulated primary graphics device.
+   This mode *requires* guest driver support to remove the external
+   dependencies generally associated with IGD (see below).  Those guest
+   drivers only support this mode for Broadwell and newer IGD, according to
+   Intel.  Additionally, this mode by default, and as officially supported
+   by Intel, does not support direct video output.  The intention is to use
+   this mode either to provide hardware acceleration to the emulated graphics
+   or to use this mode in combination with guest-based remote access software,
+   for example VNC (see below for optional output support).  This mode
+   theoretically has no device specific handling dependencies on vfio-pci or
+   the VM firmware.
+
+2) "Legacy" mode:
+
+   In this mode the IGD device is intended to be the primary and exclusive
+   graphics device in the VM[1], as such QEMU does not facilitate any sort
+   of remote graphics to the VM in this mode.  A connected physical monitor
+   is the intended output device for IGD.  This mode includes several
+   requirements and restrictions:
+
+    * IGD must be given address 02.0 on the PCI root bus in the VM
+    * The host kernel must support vfio extensions for IGD (v4.6)
+    * vfio VGA support very likely needs to be enabled in the host kernel
+    * The VM firmware must support specific fw_cfg enablers for IGD
+    * The VM machine type must support a PCI host bridge at 00.0 (standard)
+    * The VM machine type must provide or allow to be created a special
+      ISA/LPC bridge device (vfio-pci-igd-lpc-bridge) on the root bus at
+      PCI address 1f.0.
+    * The IGD device must have a VGA ROM, either provided via the romfile
+      option or loaded automatically through vfio (standard).  rombar=0
+      will disable legacy mode support.
+    * Hotplug of the IGD device is not supported.
+    * The IGD device must be a SandyBridge or newer model device.
+
+For either mode, depending on the host kernel, the i915 driver in the host
+may generate faults and errors upon re-binding to an IGD device after it
+has been assigned to a VM.  It's therefore generally recommended to prevent
+such driver binding unless the host driver is known to work well for this.
+There are numerous ways to do this: i915 can be blacklisted on the host,
+the driver_override option can be used to ensure that only vfio-pci can bind
+to the device on the host[2], virsh nodedev-detach can be used to bind the
+device to vfio drivers and then managed='no' set in the VM xml to prevent
+re-binding to i915, etc.  Also note that IGD is typically the primary
+graphics in the host and special options may be required beyond simply
+blacklisting i915 or using pci-stub/vfio-pci to take ownership of IGD as a
+PCI class device.  Lower level drivers exist that may still claim the device.
+It may therefore be necessary to use kernel boot options video=vesafb:off or
+video=efifb:off (depending on host BIOS/UEFI) or these can be combined to
+a catch-all, video=vesafb:off,efifb:off.  Error messages such as:
+
+    Failed to mmap 0000:00:02.0 BAR <>. Performance may be slow
+
+are a good indicator that such a problem exists.  The host files /proc/iomem
+and /proc/ioports are often useful for identifying drivers consuming ranges
+of the device to cause such conflicts.
+
+Additionally, IGD devices are known to generate small numbers of DMAR faults
+when initially assigned.  It is believed that this is simply the IGD attempting
+to access the reserved GTT space after reset, which it no longer has access to
+when accessed from userspace.  So long as the DMAR faults are small in number
+and most importantly, not ongoing, these are not an indication of an error.
+
+Finally, analog VGA output (as opposed to digital outputs like HDMI,
+DVI, or DisplayPort) may be unsupported in some use cases.  In the author's
+experience, even DP to VGA adapters can be troublesome while adapters between
+digital formats work well.
+
+Usage
+=====
+The intention is for IGD assignment to be transparent for users and thus for
+management tools like libvirt.  To make use of legacy mode, simply remove all
+other graphics options and use "-nographic" and either "-vga none" or
+"-nodefaults", along with adding the device using vfio-pci:
+
+    -device vfio-pci,host=00:02.0,id=hostdev0,bus=pci.0,addr=0x2
+
+For UPT mode, retain the default emulated graphics and simply add the vfio-pci
+device making use of any bus address other than 02.0.  libvirt will
+default to assigning the device a UPT compatible address while legacy mode
+users will need to manually edit the XML if using a tool like virt-manager
+where the VM device address is not expressly specified.
+
+An experimental vfio-pci option also exists to enable OpRegion, and thus
+external monitor support, for UPT mode.  This can be enabled by adding
+"x-igd-opregion=on" to the vfio-pci device options for the IGD device.  As
+with legacy mode, this requires the host to support features introduced in
+the v4.6 kernel.  If Intel chooses to embrace this support, the option may
+be made non-experimental in the future, opening it to libvirt support.
+
+Developer ABI
+=============
+Legacy mode IGD support imposes two fw_cfg requirements on the VM firmware:
+
+1) "etc/igd-opregion"
+
+   This fw_cfg file exposes the OpRegion for the IGD device.  A reserved
+   region should be created below 4GB (recommended 4KB alignment), sized
+   sufficient for the fw_cfg file size, and the content of this file copied
+   to it.  The dword based address of this reserved memory region must also
+   be written to the ASLS register at offset 0xFC on the IGD device.  It is
+   recommended that firmware should make use of this fw_cfg entry for any
+   PCI class VGA device with Intel vendor ID.  Behavior with multiple such
+   devices within a VM is undefined.
+
+2) "etc/igd-bdsm-size"
+
+   This fw_cfg file contains an 8-byte, little endian integer indicating
+   the size of the reserved memory region required for IGD stolen memory.
+   Firmware must allocate a reserved memory region of this size below 4GB,
+   with the required 1MB alignment.  Additionally the base address of this
+   reserved region must be written to the dword BDSM register in PCI config
+   space of the IGD device at offset 0x5C.  As this support is related to
+   running the IGD ROM, which has other dependencies on the device appearing
+   at guest address 00:02.0, it's expected that this fw_cfg file is only
+   relevant to a single PCI class VGA device with Intel vendor ID, appearing
+   at PCI bus address 00:02.0.
+
+Footnotes
+=========
+[1] Nothing precludes adding additional emulated or assigned graphics devices
+    as non-primary, other than the combination typically not working.  I only
+    intend to set user expectations, others are welcome to find working
+    combinations or fix whatever issues prevent this from working in the common
+    case.
+[2] # echo "vfio-pci" > /sys/bus/pci/devices/0000:00:02.0/driver_override
diff --git a/docs/migration.txt b/docs/migration.txt
index 90209ab294..6503c17685 100644
--- a/docs/migration.txt
+++ b/docs/migration.txt
@@ -403,8 +403,8 @@ listen thread:                     --- page -- page -- page -- page -- page --
 
 On receipt of CMD_PACKAGED (1)
    All the data associated with the package - the ( ... ) section in the
-diagram - is read into memory (into a QEMUSizedBuffer), and the main thread
-recurses into qemu_loadvm_state_main to process the contents of the package (2)
+diagram - is read into memory, and the main thread recurses into
+qemu_loadvm_state_main to process the contents of the package (2)
 which contains commands (3,6) and devices (4...)
 
 On receipt of 'postcopy listen' - 3 -(i.e. the 1st command in the package)
diff --git a/exec.c b/exec.c
index a3a93aeed3..f2c9e374f5 100644
--- a/exec.c
+++ b/exec.c
@@ -57,6 +57,8 @@
 #include "exec/ram_addr.h"
 #include "exec/log.h"
 
+#include "migration/vmstate.h"
+
 #include "qemu/range.h"
 #ifndef _WIN32
 #include "qemu/mmap-alloc.h"
@@ -612,15 +614,9 @@ static int cpu_get_free_index(Error **errp)
     return cpu;
 }
 
-void cpu_exec_exit(CPUState *cpu)
+static void cpu_release_index(CPUState *cpu)
 {
-    if (cpu->cpu_index == -1) {
-        /* cpu_index was never allocated by this @cpu or was already freed. */
-        return;
-    }
-
     bitmap_clear(cpu_index_map, cpu->cpu_index, 1);
-    cpu->cpu_index = -1;
 }
 #else
 
@@ -635,11 +631,42 @@ static int cpu_get_free_index(Error **errp)
     return cpu_index;
 }
 
-void cpu_exec_exit(CPUState *cpu)
+static void cpu_release_index(CPUState *cpu)
 {
+    return;
 }
 #endif
 
+void cpu_exec_exit(CPUState *cpu)
+{
+    CPUClass *cc = CPU_GET_CLASS(cpu);
+
+#if defined(CONFIG_USER_ONLY)
+    cpu_list_lock();
+#endif
+    if (cpu->cpu_index == -1) {
+        /* cpu_index was never allocated by this @cpu or was already freed. */
+#if defined(CONFIG_USER_ONLY)
+        cpu_list_unlock();
+#endif
+        return;
+    }
+
+    QTAILQ_REMOVE(&cpus, cpu, node);
+    cpu_release_index(cpu);
+    cpu->cpu_index = -1;
+#if defined(CONFIG_USER_ONLY)
+    cpu_list_unlock();
+#endif
+
+    if (cc->vmsd != NULL) {
+        vmstate_unregister(NULL, cc->vmsd, cpu);
+    }
+    if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
+        vmstate_unregister(NULL, &vmstate_cpu_common, cpu);
+    }
+}
+
 void cpu_exec_init(CPUState *cpu, Error **errp)
 {
     CPUClass *cc = CPU_GET_CLASS(cpu);
@@ -1815,40 +1842,6 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
 }
 #endif /* !_WIN32 */
 
-int qemu_get_ram_fd(ram_addr_t addr)
-{
-    RAMBlock *block;
-    int fd;
-
-    rcu_read_lock();
-    block = qemu_get_ram_block(addr);
-    fd = block->fd;
-    rcu_read_unlock();
-    return fd;
-}
-
-void qemu_set_ram_fd(ram_addr_t addr, int fd)
-{
-    RAMBlock *block;
-
-    rcu_read_lock();
-    block = qemu_get_ram_block(addr);
-    block->fd = fd;
-    rcu_read_unlock();
-}
-
-void *qemu_get_ram_block_host_ptr(ram_addr_t addr)
-{
-    RAMBlock *block;
-    void *ptr;
-
-    rcu_read_lock();
-    block = qemu_get_ram_block(addr);
-    ptr = ramblock_ptr(block, 0);
-    rcu_read_unlock();
-    return ptr;
-}
-
 /* Return a host pointer to ram allocated with qemu_ram_alloc.
  * This should not be used for general purpose DMA.  Use address_space_map
  * or address_space_rw instead. For local memory (e.g. video ram) that the
@@ -1856,12 +1849,13 @@ void *qemu_get_ram_block_host_ptr(ram_addr_t addr)
  *
  * Called within RCU critical section.
  */
-void *qemu_get_ram_ptr(RAMBlock *ram_block, ram_addr_t addr)
+void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr)
 {
     RAMBlock *block = ram_block;
 
     if (block == NULL) {
         block = qemu_get_ram_block(addr);
+        addr -= block->offset;
     }
 
     if (xen_enabled() && block->host == NULL) {
@@ -1875,10 +1869,10 @@ void *qemu_get_ram_ptr(RAMBlock *ram_block, ram_addr_t addr)
 
         block->host = xen_map_cache(block->offset, block->max_length, 1);
     }
-    return ramblock_ptr(block, addr - block->offset);
+    return ramblock_ptr(block, addr);
 }
 
-/* Return a host pointer to guest's ram. Similar to qemu_get_ram_ptr
+/* Return a host pointer to guest's ram. Similar to qemu_map_ram_ptr
  * but takes a size argument.
  *
  * Called within RCU critical section.
@@ -1887,16 +1881,15 @@ static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr,
                                  hwaddr *size)
 {
     RAMBlock *block = ram_block;
-    ram_addr_t offset_inside_block;
     if (*size == 0) {
         return NULL;
     }
 
     if (block == NULL) {
         block = qemu_get_ram_block(addr);
+        addr -= block->offset;
     }
-    offset_inside_block = addr - block->offset;
-    *size = MIN(*size, block->max_length - offset_inside_block);
+    *size = MIN(*size, block->max_length - addr);
 
     if (xen_enabled() && block->host == NULL) {
         /* We need to check if the requested address is in the RAM
@@ -1910,7 +1903,7 @@ static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr,
         block->host = xen_map_cache(block->offset, block->max_length, 1);
     }
 
-    return ramblock_ptr(block, offset_inside_block);
+    return ramblock_ptr(block, addr);
 }
 
 /*
@@ -1931,16 +1924,16 @@ static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr,
  * ram_addr_t.
  */
 RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
-                                   ram_addr_t *ram_addr,
                                    ram_addr_t *offset)
 {
     RAMBlock *block;
     uint8_t *host = ptr;
 
     if (xen_enabled()) {
+        ram_addr_t ram_addr;
         rcu_read_lock();
-        *ram_addr = xen_ram_addr_from_mapcache(ptr);
-        block = qemu_get_ram_block(*ram_addr);
+        ram_addr = xen_ram_addr_from_mapcache(ptr);
+        block = qemu_get_ram_block(ram_addr);
         if (block) {
             *offset = (host - block->host);
         }
@@ -1972,7 +1965,6 @@ found:
     if (round_offset) {
         *offset &= TARGET_PAGE_MASK;
     }
-    *ram_addr = block->offset + *offset;
     rcu_read_unlock();
     return block;
 }
@@ -1999,18 +1991,17 @@ RAMBlock *qemu_ram_block_by_name(const char *name)
 
 /* Some of the softmmu routines need to translate from a host pointer
    (typically a TLB entry) back to a ram offset.  */
-MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
+ram_addr_t qemu_ram_addr_from_host(void *ptr)
 {
     RAMBlock *block;
-    ram_addr_t offset; /* Not used */
-
-    block = qemu_ram_block_from_host(ptr, false, ram_addr, &offset);
+    ram_addr_t offset;
 
+    block = qemu_ram_block_from_host(ptr, false, &offset);
     if (!block) {
-        return NULL;
+        return RAM_ADDR_INVALID;
     }
 
-    return block->mr;
+    return block->offset + offset;
 }
 
 /* Called within RCU critical section.  */
@@ -2022,13 +2013,13 @@ static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
     }
     switch (size) {
     case 1:
-        stb_p(qemu_get_ram_ptr(NULL, ram_addr), val);
+        stb_p(qemu_map_ram_ptr(NULL, ram_addr), val);
         break;
     case 2:
-        stw_p(qemu_get_ram_ptr(NULL, ram_addr), val);
+        stw_p(qemu_map_ram_ptr(NULL, ram_addr), val);
         break;
     case 4:
-        stl_p(qemu_get_ram_ptr(NULL, ram_addr), val);
+        stl_p(qemu_map_ram_ptr(NULL, ram_addr), val);
         break;
     default:
         abort();
@@ -2490,6 +2481,8 @@ static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr,
                                      hwaddr length)
 {
     uint8_t dirty_log_mask = memory_region_get_dirty_log_mask(mr);
+    addr += memory_region_get_ram_addr(mr);
+
     /* No early return if dirty_log_mask is or becomes 0, because
      * cpu_physical_memory_set_dirty_range will still call
      * xen_modified_memory.
@@ -2602,9 +2595,8 @@ static MemTxResult address_space_write_continue(AddressSpace *as, hwaddr addr,
                 abort();
             }
         } else {
-            addr1 += memory_region_get_ram_addr(mr);
             /* RAM case */
-            ptr = qemu_get_ram_ptr(mr->ram_block, addr1);
+            ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
             memcpy(ptr, buf, l);
             invalidate_and_set_dirty(mr, addr1, l);
         }
@@ -2695,8 +2687,7 @@ MemTxResult address_space_read_continue(AddressSpace *as, hwaddr addr,
             }
         } else {
             /* RAM case */
-            ptr = qemu_get_ram_ptr(mr->ram_block,
-                                   memory_region_get_ram_addr(mr) + addr1);
+            ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
             memcpy(buf, ptr, l);
         }
 
@@ -2779,9 +2770,8 @@ static inline void cpu_physical_memory_write_rom_internal(AddressSpace *as,
               memory_region_is_romd(mr))) {
             l = memory_access_size(mr, l, addr1);
         } else {
-            addr1 += memory_region_get_ram_addr(mr);
             /* ROM/RAM case */
-            ptr = qemu_get_ram_ptr(mr->ram_block, addr1);
+            ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
             switch (type) {
             case WRITE_DATA:
                 memcpy(ptr, buf, l);
@@ -2939,7 +2929,6 @@ void *address_space_map(AddressSpace *as,
     hwaddr done = 0;
     hwaddr l, xlat, base;
     MemoryRegion *mr, *this_mr;
-    ram_addr_t raddr;
     void *ptr;
 
     if (len == 0) {
@@ -2974,7 +2963,6 @@ void *address_space_map(AddressSpace *as,
     }
 
     base = xlat;
-    raddr = memory_region_get_ram_addr(mr);
 
     for (;;) {
         len -= l;
@@ -2993,7 +2981,7 @@ void *address_space_map(AddressSpace *as,
 
     memory_region_ref(mr);
     *plen = done;
-    ptr = qemu_ram_ptr_length(mr->ram_block, raddr + base, plen);
+    ptr = qemu_ram_ptr_length(mr->ram_block, base, plen);
     rcu_read_unlock();
 
     return ptr;
@@ -3010,7 +2998,7 @@ void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
         MemoryRegion *mr;
         ram_addr_t addr1;
 
-        mr = qemu_ram_addr_from_host(buffer, &addr1);
+        mr = memory_region_from_host(buffer, &addr1);
         assert(mr != NULL);
         if (is_write) {
             invalidate_and_set_dirty(mr, addr1, access_len);
@@ -3077,8 +3065,7 @@ static inline uint32_t address_space_ldl_internal(AddressSpace *as, hwaddr addr,
 #endif
     } else {
         /* RAM case */
-        ptr = qemu_get_ram_ptr(mr->ram_block,
-                               memory_region_get_ram_addr(mr) + addr1);
+        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
         switch (endian) {
         case DEVICE_LITTLE_ENDIAN:
             val = ldl_le_p(ptr);
@@ -3171,8 +3158,7 @@ static inline uint64_t address_space_ldq_internal(AddressSpace *as, hwaddr addr,
 #endif
     } else {
         /* RAM case */
-        ptr = qemu_get_ram_ptr(mr->ram_block,
-                               memory_region_get_ram_addr(mr) + addr1);
+        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
         switch (endian) {
         case DEVICE_LITTLE_ENDIAN:
             val = ldq_le_p(ptr);
@@ -3285,8 +3271,7 @@ static inline uint32_t address_space_lduw_internal(AddressSpace *as,
 #endif
     } else {
         /* RAM case */
-        ptr = qemu_get_ram_ptr(mr->ram_block,
-                               memory_region_get_ram_addr(mr) + addr1);
+        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
         switch (endian) {
         case DEVICE_LITTLE_ENDIAN:
             val = lduw_le_p(ptr);
@@ -3368,13 +3353,13 @@ void address_space_stl_notdirty(AddressSpace *as, hwaddr addr, uint32_t val,
 
         r = memory_region_dispatch_write(mr, addr1, val, 4, attrs);
     } else {
-        addr1 += memory_region_get_ram_addr(mr);
-        ptr = qemu_get_ram_ptr(mr->ram_block, addr1);
+        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
         stl_p(ptr, val);
 
         dirty_log_mask = memory_region_get_dirty_log_mask(mr);
         dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
-        cpu_physical_memory_set_dirty_range(addr1, 4, dirty_log_mask);
+        cpu_physical_memory_set_dirty_range(memory_region_get_ram_addr(mr) + addr,
+                                            4, dirty_log_mask);
         r = MEMTX_OK;
     }
     if (result) {
@@ -3423,8 +3408,7 @@ static inline void address_space_stl_internal(AddressSpace *as,
         r = memory_region_dispatch_write(mr, addr1, val, 4, attrs);
     } else {
         /* RAM case */
-        addr1 += memory_region_get_ram_addr(mr);
-        ptr = qemu_get_ram_ptr(mr->ram_block, addr1);
+        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
         switch (endian) {
         case DEVICE_LITTLE_ENDIAN:
             stl_le_p(ptr, val);
@@ -3533,8 +3517,7 @@ static inline void address_space_stw_internal(AddressSpace *as,
         r = memory_region_dispatch_write(mr, addr1, val, 2, attrs);
     } else {
         /* RAM case */
-        addr1 += memory_region_get_ram_addr(mr);
-        ptr = qemu_get_ram_ptr(mr->ram_block, addr1);
+        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
         switch (endian) {
         case DEVICE_LITTLE_ENDIAN:
             stw_le_p(ptr, val);
diff --git a/hmp-commands.hx b/hmp-commands.hx
index 4f4f60a0df..98b4b1a82c 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1008,7 +1008,7 @@ ETEXI
 
     {
         .name       = "migrate_set_parameter",
-        .args_type  = "parameter:s,value:i",
+        .args_type  = "parameter:s,value:s",
         .params     = "parameter value",
         .help       = "Set the parameter for migration",
         .mhandler.cmd = hmp_migrate_set_parameter,
diff --git a/hmp.c b/hmp.c
index 9f9bcf9d83..a4b1d3d220 100644
--- a/hmp.c
+++ b/hmp.c
@@ -35,6 +35,7 @@
 #include "block/qapi.h"
 #include "qemu-io.h"
 #include "qemu/cutils.h"
+#include "qemu/error-report.h"
 
 #ifdef CONFIG_SPICE
 #include <spice/enums.h>
@@ -168,8 +169,15 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict)
     }
 
     if (info->has_status) {
-        monitor_printf(mon, "Migration status: %s\n",
+        monitor_printf(mon, "Migration status: %s",
                        MigrationStatus_lookup[info->status]);
+        if (info->status == MIGRATION_STATUS_FAILED &&
+            info->has_error_desc) {
+            monitor_printf(mon, " (%s)\n", info->error_desc);
+        } else {
+            monitor_printf(mon, "\n");
+        }
+
         monitor_printf(mon, "total time: %" PRIu64 " milliseconds\n",
                        info->total_time);
         if (info->has_expected_downtime) {
@@ -286,6 +294,12 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict)
         monitor_printf(mon, " %s: %" PRId64,
             MigrationParameter_lookup[MIGRATION_PARAMETER_CPU_THROTTLE_INCREMENT],
             params->cpu_throttle_increment);
+        monitor_printf(mon, " %s: '%s'",
+            MigrationParameter_lookup[MIGRATION_PARAMETER_TLS_CREDS],
+            params->tls_creds ? : "");
+        monitor_printf(mon, " %s: '%s'",
+            MigrationParameter_lookup[MIGRATION_PARAMETER_TLS_HOSTNAME],
+            params->tls_hostname ? : "");
         monitor_printf(mon, "\n");
     }
 
@@ -1235,13 +1249,17 @@ void hmp_migrate_set_capability(Monitor *mon, const QDict *qdict)
 void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
 {
     const char *param = qdict_get_str(qdict, "parameter");
-    int value = qdict_get_int(qdict, "value");
+    const char *valuestr = qdict_get_str(qdict, "value");
+    long valueint = 0;
     Error *err = NULL;
     bool has_compress_level = false;
     bool has_compress_threads = false;
     bool has_decompress_threads = false;
     bool has_cpu_throttle_initial = false;
     bool has_cpu_throttle_increment = false;
+    bool has_tls_creds = false;
+    bool has_tls_hostname = false;
+    bool use_int_value = false;
     int i;
 
     for (i = 0; i < MIGRATION_PARAMETER__MAX; i++) {
@@ -1249,25 +1267,46 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
             switch (i) {
             case MIGRATION_PARAMETER_COMPRESS_LEVEL:
                 has_compress_level = true;
+                use_int_value = true;
                 break;
             case MIGRATION_PARAMETER_COMPRESS_THREADS:
                 has_compress_threads = true;
+                use_int_value = true;
                 break;
             case MIGRATION_PARAMETER_DECOMPRESS_THREADS:
                 has_decompress_threads = true;
+                use_int_value = true;
                 break;
             case MIGRATION_PARAMETER_CPU_THROTTLE_INITIAL:
                 has_cpu_throttle_initial = true;
+                use_int_value = true;
                 break;
             case MIGRATION_PARAMETER_CPU_THROTTLE_INCREMENT:
                 has_cpu_throttle_increment = true;
                 break;
+            case MIGRATION_PARAMETER_TLS_CREDS:
+                has_tls_creds = true;
+                break;
+            case MIGRATION_PARAMETER_TLS_HOSTNAME:
+                has_tls_hostname = true;
+                break;
             }
-            qmp_migrate_set_parameters(has_compress_level, value,
-                                       has_compress_threads, value,
-                                       has_decompress_threads, value,
-                                       has_cpu_throttle_initial, value,
-                                       has_cpu_throttle_increment, value,
+
+            if (use_int_value) {
+                if (qemu_strtol(valuestr, NULL, 10, &valueint) < 0) {
+                    error_setg(&err, "Unable to parse '%s' as an int",
+                               valuestr);
+                    goto cleanup;
+                }
+            }
+
+            qmp_migrate_set_parameters(has_compress_level, valueint,
+                                       has_compress_threads, valueint,
+                                       has_decompress_threads, valueint,
+                                       has_cpu_throttle_initial, valueint,
+                                       has_cpu_throttle_increment, valueint,
+                                       has_tls_creds, valuestr,
+                                       has_tls_hostname, valuestr,
                                        &err);
             break;
         }
@@ -1277,6 +1316,7 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
         error_setg(&err, QERR_INVALID_PARAMETER, param);
     }
 
+ cleanup:
     if (err) {
         error_report_err(err);
     }
@@ -1533,6 +1573,9 @@ static void hmp_migrate_status_cb(void *opaque)
         if (status->is_block_migration) {
             monitor_printf(status->mon, "\n");
         }
+        if (info->has_error_desc) {
+            error_report("%s", info->error_desc);
+        }
         monitor_resume(status->mon);
         timer_del(status->timer);
         g_free(status);
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 173988ee84..9faad29fad 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -239,7 +239,7 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
     uint8_t lba_index  = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
     uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
     uint64_t data_size = (uint64_t)nlb << data_shift;
-    uint64_t aio_slba  = slba << (data_shift - BDRV_SECTOR_BITS);
+    uint64_t data_offset = slba << data_shift;
     int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0;
     enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
 
@@ -258,8 +258,8 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
     req->has_sg = true;
     dma_acct_start(n->conf.blk, &req->acct, &req->qsg, acct);
     req->aiocb = is_write ?
-        dma_blk_write(n->conf.blk, &req->qsg, aio_slba, nvme_rw_cb, req) :
-        dma_blk_read(n->conf.blk, &req->qsg, aio_slba, nvme_rw_cb, req);
+        dma_blk_write(n->conf.blk, &req->qsg, data_offset, nvme_rw_cb, req) :
+        dma_blk_read(n->conf.blk, &req->qsg, data_offset, nvme_rw_cb, req);
 
     return NVME_NO_COMPLETE;
 }
diff --git a/hw/bt/hci-csr.c b/hw/bt/hci-csr.c
index e6b8998253..d688372ca3 100644
--- a/hw/bt/hci-csr.c
+++ b/hw/bt/hci-csr.c
@@ -39,9 +39,14 @@ struct csrhci_s {
     int out_size;
     uint8_t outfifo[FIFO_LEN * 2];
     uint8_t inpkt[FIFO_LEN];
+    enum {
+        CSR_HDR_LEN,
+        CSR_DATA_LEN,
+        CSR_DATA
+    } in_state;
     int in_len;
     int in_hdr;
-    int in_data;
+    int in_needed;
     QEMUTimer *out_tm;
     int64_t baud_delay;
 
@@ -296,38 +301,60 @@ static int csrhci_data_len(const uint8_t *pkt)
     exit(-1);
 }
 
+static void csrhci_ready_for_next_inpkt(struct csrhci_s *s)
+{
+    s->in_state = CSR_HDR_LEN;
+    s->in_len = 0;
+    s->in_needed = 2;
+    s->in_hdr = INT_MAX;
+}
+
 static int csrhci_write(struct CharDriverState *chr,
                 const uint8_t *buf, int len)
 {
     struct csrhci_s *s = (struct csrhci_s *) chr->opaque;
-    int plen = s->in_len;
+    int total = 0;
 
     if (!s->enable)
         return 0;
 
-    s->in_len += len;
-    memcpy(s->inpkt + plen, buf, len);
+    for (;;) {
+        int cnt = MIN(len, s->in_needed - s->in_len);
+        if (cnt) {
+            memcpy(s->inpkt + s->in_len, buf, cnt);
+            s->in_len += cnt;
+            buf += cnt;
+            len -= cnt;
+            total += cnt;
+        }
+
+        if (s->in_len < s->in_needed) {
+            break;
+        }
 
-    while (1) {
-        if (s->in_len >= 2 && plen < 2)
+        if (s->in_state == CSR_HDR_LEN) {
             s->in_hdr = csrhci_header_len(s->inpkt) + 1;
+            assert(s->in_hdr >= s->in_needed);
+            s->in_needed = s->in_hdr;
+            s->in_state = CSR_DATA_LEN;
+            continue;
+        }
 
-        if (s->in_len >= s->in_hdr && plen < s->in_hdr)
-            s->in_data = csrhci_data_len(s->inpkt) + s->in_hdr;
+        if (s->in_state == CSR_DATA_LEN) {
+            s->in_needed += csrhci_data_len(s->inpkt);
+            /* hci_acl_hdr could specify more than 4096 bytes, so assert.  */
+            assert(s->in_needed <= sizeof(s->inpkt));
+            s->in_state = CSR_DATA;
+            continue;
+        }
 
-        if (s->in_len >= s->in_data) {
+        if (s->in_state == CSR_DATA) {
             csrhci_in_packet(s, s->inpkt);
-
-            memmove(s->inpkt, s->inpkt + s->in_len, s->in_len - s->in_data);
-            s->in_len -= s->in_data;
-            s->in_hdr = INT_MAX;
-            s->in_data = INT_MAX;
-            plen = 0;
-        } else
-            break;
+            csrhci_ready_for_next_inpkt(s);
+        }
     }
 
-    return len;
+    return total;
 }
 
 static void csrhci_out_hci_packet_event(void *opaque,
@@ -389,11 +416,9 @@ static void csrhci_reset(struct csrhci_s *s)
 {
     s->out_len = 0;
     s->out_size = FIFO_LEN;
-    s->in_len = 0;
+    csrhci_ready_for_next_inpkt(s);
     s->baud_delay = NANOSECONDS_PER_SECOND;
     s->enable = 0;
-    s->in_hdr = INT_MAX;
-    s->in_data = INT_MAX;
 
     s->modem_state = 0;
     /* After a while... (but sooner than 10ms) */
diff --git a/hw/char/escc.c b/hw/char/escc.c
index 7bf09a0077..8e6a7df465 100644
--- a/hw/char/escc.c
+++ b/hw/char/escc.c
@@ -983,9 +983,10 @@ void slavio_serial_ms_kbd_init(hwaddr base, qemu_irq irq,
     sysbus_mmio_map(s, 0, base);
 }
 
-static int escc_init1(SysBusDevice *dev)
+static void escc_init1(Object *obj)
 {
-    ESCCState *s = ESCC(dev);
+    ESCCState *s = ESCC(obj);
+    SysBusDevice *dev = SYS_BUS_DEVICE(obj);
     unsigned int i;
 
     s->chn[0].disabled = s->disabled;
@@ -994,17 +995,26 @@ static int escc_init1(SysBusDevice *dev)
         sysbus_init_irq(dev, &s->chn[i].irq);
         s->chn[i].chn = 1 - i;
         s->chn[i].clock = s->frequency / 2;
-        if (s->chn[i].chr) {
-            qemu_chr_add_handlers(s->chn[i].chr, serial_can_receive,
-                                  serial_receive1, serial_event, &s->chn[i]);
-        }
     }
     s->chn[0].otherchn = &s->chn[1];
     s->chn[1].otherchn = &s->chn[0];
 
-    memory_region_init_io(&s->mmio, OBJECT(s), &escc_mem_ops, s, "escc",
+    memory_region_init_io(&s->mmio, obj, &escc_mem_ops, s, "escc",
                           ESCC_SIZE << s->it_shift);
     sysbus_init_mmio(dev, &s->mmio);
+}
+
+static void escc_realize(DeviceState *dev, Error **errp)
+{
+    ESCCState *s = ESCC(dev);
+    unsigned int i;
+
+    for (i = 0; i < 2; i++) {
+        if (s->chn[i].chr) {
+            qemu_chr_add_handlers(s->chn[i].chr, serial_can_receive,
+                                  serial_receive1, serial_event, &s->chn[i]);
+        }
+    }
 
     if (s->chn[0].type == mouse) {
         qemu_add_mouse_event_handler(sunmouse_event, &s->chn[0], 0,
@@ -1014,8 +1024,6 @@ static int escc_init1(SysBusDevice *dev)
         s->chn[1].hs = qemu_input_handler_register((DeviceState *)(&s->chn[1]),
                                                    &sunkbd_handler);
     }
-
-    return 0;
 }
 
 static Property escc_properties[] = {
@@ -1032,10 +1040,9 @@ static Property escc_properties[] = {
 static void escc_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
-    SysBusDeviceClass *k = SYS_BUS_DEVICE_CLASS(klass);
 
-    k->init = escc_init1;
     dc->reset = escc_reset;
+    dc->realize = escc_realize;
     dc->vmsd = &vmstate_escc;
     dc->props = escc_properties;
     set_bit(DEVICE_CATEGORY_INPUT, dc->categories);
@@ -1045,6 +1052,7 @@ static const TypeInfo escc_info = {
     .name          = TYPE_ESCC,
     .parent        = TYPE_SYS_BUS_DEVICE,
     .instance_size = sizeof(ESCCState),
+    .instance_init = escc_init1,
     .class_init    = escc_class_init,
 };
 
diff --git a/hw/char/etraxfs_ser.c b/hw/char/etraxfs_ser.c
index 146b387e7e..04ca04fe2c 100644
--- a/hw/char/etraxfs_ser.c
+++ b/hw/char/etraxfs_ser.c
@@ -159,6 +159,11 @@ static const MemoryRegionOps ser_ops = {
     }
 };
 
+static Property etraxfs_ser_properties[] = {
+    DEFINE_PROP_CHR("chardev", ETRAXSerial, chr),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
 static void serial_receive(void *opaque, const uint8_t *buf, int size)
 {
     ETRAXSerial *s = opaque;
@@ -209,40 +214,42 @@ static void etraxfs_ser_reset(DeviceState *d)
 
 }
 
-static int etraxfs_ser_init(SysBusDevice *dev)
+static void etraxfs_ser_init(Object *obj)
 {
-    ETRAXSerial *s = ETRAX_SERIAL(dev);
+    ETRAXSerial *s = ETRAX_SERIAL(obj);
+    SysBusDevice *dev = SYS_BUS_DEVICE(obj);
 
     sysbus_init_irq(dev, &s->irq);
-    memory_region_init_io(&s->mmio, OBJECT(s), &ser_ops, s,
+    memory_region_init_io(&s->mmio, obj, &ser_ops, s,
                           "etraxfs-serial", R_MAX * 4);
     sysbus_init_mmio(dev, &s->mmio);
+}
+
+static void etraxfs_ser_realize(DeviceState *dev, Error **errp)
+{
+    ETRAXSerial *s = ETRAX_SERIAL(dev);
 
-    /* FIXME use a qdev chardev prop instead of qemu_char_get_next_serial() */
-    s->chr = qemu_char_get_next_serial();
     if (s->chr) {
         qemu_chr_add_handlers(s->chr,
                               serial_can_receive, serial_receive,
                               serial_event, s);
     }
-    return 0;
 }
 
 static void etraxfs_ser_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
-    SysBusDeviceClass *k = SYS_BUS_DEVICE_CLASS(klass);
 
-    k->init = etraxfs_ser_init;
     dc->reset = etraxfs_ser_reset;
-    /* Reason: init() method uses qemu_char_get_next_serial() */
-    dc->cannot_instantiate_with_device_add_yet = true;
+    dc->props = etraxfs_ser_properties;
+    dc->realize = etraxfs_ser_realize;
 }
 
 static const TypeInfo etraxfs_ser_info = {
     .name          = TYPE_ETRAX_FS_SERIAL,
     .parent        = TYPE_SYS_BUS_DEVICE,
     .instance_size = sizeof(ETRAXSerial),
+    .instance_init = etraxfs_ser_init,
     .class_init    = etraxfs_ser_class_init,
 };
 
diff --git a/hw/char/lm32_juart.c b/hw/char/lm32_juart.c
index 5bf8acfe8f..28c2cf702d 100644
--- a/hw/char/lm32_juart.c
+++ b/hw/char/lm32_juart.c
@@ -114,17 +114,13 @@ static void juart_reset(DeviceState *d)
     s->jrx = 0;
 }
 
-static int lm32_juart_init(SysBusDevice *dev)
+static void lm32_juart_realize(DeviceState *dev, Error **errp)
 {
     LM32JuartState *s = LM32_JUART(dev);
 
-    /* FIXME use a qdev chardev prop instead of qemu_char_get_next_serial() */
-    s->chr = qemu_char_get_next_serial();
     if (s->chr) {
         qemu_chr_add_handlers(s->chr, juart_can_rx, juart_rx, juart_event, s);
     }
-
-    return 0;
 }
 
 static const VMStateDescription vmstate_lm32_juart = {
@@ -138,16 +134,19 @@ static const VMStateDescription vmstate_lm32_juart = {
     }
 };
 
+static Property lm32_juart_properties[] = {
+    DEFINE_PROP_CHR("chardev", LM32JuartState, chr),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
 static void lm32_juart_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
-    SysBusDeviceClass *k = SYS_BUS_DEVICE_CLASS(klass);
 
-    k->init = lm32_juart_init;
     dc->reset = juart_reset;
     dc->vmsd = &vmstate_lm32_juart;
-    /* Reason: init() method uses qemu_char_get_next_serial() */
-    dc->cannot_instantiate_with_device_add_yet = true;
+    dc->props = lm32_juart_properties;
+    dc->realize = lm32_juart_realize;
 }
 
 static const TypeInfo lm32_juart_info = {
diff --git a/hw/char/lm32_uart.c b/hw/char/lm32_uart.c
index 036813d0f3..b5c760dda3 100644
--- a/hw/char/lm32_uart.c
+++ b/hw/char/lm32_uart.c
@@ -249,23 +249,25 @@ static void uart_reset(DeviceState *d)
     s->regs[R_LSR] = LSR_THRE | LSR_TEMT;
 }
 
-static int lm32_uart_init(SysBusDevice *dev)
+static void lm32_uart_init(Object *obj)
 {
-    LM32UartState *s = LM32_UART(dev);
+    LM32UartState *s = LM32_UART(obj);
+    SysBusDevice *dev = SYS_BUS_DEVICE(obj);
 
     sysbus_init_irq(dev, &s->irq);
 
-    memory_region_init_io(&s->iomem, OBJECT(s), &uart_ops, s,
+    memory_region_init_io(&s->iomem, obj, &uart_ops, s,
                           "uart", R_MAX * 4);
     sysbus_init_mmio(dev, &s->iomem);
+}
+
+static void lm32_uart_realize(DeviceState *dev, Error **errp)
+{
+    LM32UartState *s = LM32_UART(dev);
 
-    /* FIXME use a qdev chardev prop instead of qemu_char_get_next_serial() */
-    s->chr = qemu_char_get_next_serial();
     if (s->chr) {
         qemu_chr_add_handlers(s->chr, uart_can_rx, uart_rx, uart_event, s);
     }
-
-    return 0;
 }
 
 static const VMStateDescription vmstate_lm32_uart = {
@@ -278,22 +280,26 @@ static const VMStateDescription vmstate_lm32_uart = {
     }
 };
 
+static Property lm32_uart_properties[] = {
+    DEFINE_PROP_CHR("chardev", LM32UartState, chr),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
 static void lm32_uart_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
-    SysBusDeviceClass *k = SYS_BUS_DEVICE_CLASS(klass);
 
-    k->init = lm32_uart_init;
     dc->reset = uart_reset;
     dc->vmsd = &vmstate_lm32_uart;
-    /* Reason: init() method uses qemu_char_get_next_serial() */
-    dc->cannot_instantiate_with_device_add_yet = true;
+    dc->props = lm32_uart_properties;
+    dc->realize = lm32_uart_realize;
 }
 
 static const TypeInfo lm32_uart_info = {
     .name          = TYPE_LM32_UART,
     .parent        = TYPE_SYS_BUS_DEVICE,
     .instance_size = sizeof(LM32UartState),
+    .instance_init = lm32_uart_init,
     .class_init    = lm32_uart_class_init,
 };
 
diff --git a/hw/char/milkymist-uart.c b/hw/char/milkymist-uart.c
index 03b36b2236..72f8484668 100644
--- a/hw/char/milkymist-uart.c
+++ b/hw/char/milkymist-uart.c
@@ -200,8 +200,6 @@ static void milkymist_uart_realize(DeviceState *dev, Error **errp)
 {
     MilkymistUartState *s = MILKYMIST_UART(dev);
 
-    /* FIXME use a qdev chardev prop instead of qemu_char_get_next_serial() */
-    s->chr = qemu_char_get_next_serial();
     if (s->chr) {
         qemu_chr_add_handlers(s->chr, uart_can_rx, uart_rx, uart_event, s);
     }
@@ -229,6 +227,11 @@ static const VMStateDescription vmstate_milkymist_uart = {
     }
 };
 
+static Property milkymist_uart_properties[] = {
+    DEFINE_PROP_CHR("chardev", MilkymistUartState, chr),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
 static void milkymist_uart_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
@@ -236,8 +239,7 @@ static void milkymist_uart_class_init(ObjectClass *klass, void *data)
     dc->realize = milkymist_uart_realize;
     dc->reset = milkymist_uart_reset;
     dc->vmsd = &vmstate_milkymist_uart;
-    /* Reason: realize() method uses qemu_char_get_next_serial() */
-    dc->cannot_instantiate_with_device_add_yet = true;
+    dc->props = milkymist_uart_properties;
 }
 
 static const TypeInfo milkymist_uart_info = {
diff --git a/hw/core/Makefile.objs b/hw/core/Makefile.objs
index 70951d4137..82a9ef84f8 100644
--- a/hw/core/Makefile.objs
+++ b/hw/core/Makefile.objs
@@ -1,5 +1,6 @@
 # core qdev-related obj files, also used by *-user:
 common-obj-y += qdev.o qdev-properties.o
+common-obj-y += bus.o
 common-obj-y += fw-path-provider.o
 # irq.o needed for qdev GPIO handling:
 common-obj-y += irq.o
diff --git a/hw/core/bus.c b/hw/core/bus.c
new file mode 100644
index 0000000000..3e3f8ac740
--- /dev/null
+++ b/hw/core/bus.c
@@ -0,0 +1,251 @@
+/*
+ *  Dynamic device configuration and creation -- buses.
+ *
+ *  Copyright (c) 2009 CodeSourcery
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "hw/qdev.h"
+#include "qapi/error.h"
+
+static void qbus_set_hotplug_handler_internal(BusState *bus, Object *handler,
+                                              Error **errp)
+{
+
+    object_property_set_link(OBJECT(bus), OBJECT(handler),
+                             QDEV_HOTPLUG_HANDLER_PROPERTY, errp);
+}
+
+void qbus_set_hotplug_handler(BusState *bus, DeviceState *handler, Error **errp)
+{
+    qbus_set_hotplug_handler_internal(bus, OBJECT(handler), errp);
+}
+
+void qbus_set_bus_hotplug_handler(BusState *bus, Error **errp)
+{
+    qbus_set_hotplug_handler_internal(bus, OBJECT(bus), errp);
+}
+
+int qbus_walk_children(BusState *bus,
+                       qdev_walkerfn *pre_devfn, qbus_walkerfn *pre_busfn,
+                       qdev_walkerfn *post_devfn, qbus_walkerfn *post_busfn,
+                       void *opaque)
+{
+    BusChild *kid;
+    int err;
+
+    if (pre_busfn) {
+        err = pre_busfn(bus, opaque);
+        if (err) {
+            return err;
+        }
+    }
+
+    QTAILQ_FOREACH(kid, &bus->children, sibling) {
+        err = qdev_walk_children(kid->child,
+                                 pre_devfn, pre_busfn,
+                                 post_devfn, post_busfn, opaque);
+        if (err < 0) {
+            return err;
+        }
+    }
+
+    if (post_busfn) {
+        err = post_busfn(bus, opaque);
+        if (err) {
+            return err;
+        }
+    }
+
+    return 0;
+}
+
+static void qbus_realize(BusState *bus, DeviceState *parent, const char *name)
+{
+    const char *typename = object_get_typename(OBJECT(bus));
+    BusClass *bc;
+    char *buf;
+    int i, len, bus_id;
+
+    bus->parent = parent;
+
+    if (name) {
+        bus->name = g_strdup(name);
+    } else if (bus->parent && bus->parent->id) {
+        /* parent device has id -> use it plus parent-bus-id for bus name */
+        bus_id = bus->parent->num_child_bus;
+
+        len = strlen(bus->parent->id) + 16;
+        buf = g_malloc(len);
+        snprintf(buf, len, "%s.%d", bus->parent->id, bus_id);
+        bus->name = buf;
+    } else {
+        /* no id -> use lowercase bus type plus global bus-id for bus name */
+        bc = BUS_GET_CLASS(bus);
+        bus_id = bc->automatic_ids++;
+
+        len = strlen(typename) + 16;
+        buf = g_malloc(len);
+        len = snprintf(buf, len, "%s.%d", typename, bus_id);
+        for (i = 0; i < len; i++) {
+            buf[i] = qemu_tolower(buf[i]);
+        }
+        bus->name = buf;
+    }
+
+    if (bus->parent) {
+        QLIST_INSERT_HEAD(&bus->parent->child_bus, bus, sibling);
+        bus->parent->num_child_bus++;
+        object_property_add_child(OBJECT(bus->parent), bus->name, OBJECT(bus), NULL);
+        object_unref(OBJECT(bus));
+    } else if (bus != sysbus_get_default()) {
+        /* TODO: once all bus devices are qdevified,
+           only reset handler for main_system_bus should be registered here. */
+        qemu_register_reset(qbus_reset_all_fn, bus);
+    }
+}
+
+static void bus_unparent(Object *obj)
+{
+    BusState *bus = BUS(obj);
+    BusChild *kid;
+
+    while ((kid = QTAILQ_FIRST(&bus->children)) != NULL) {
+        DeviceState *dev = kid->child;
+        object_unparent(OBJECT(dev));
+    }
+    if (bus->parent) {
+        QLIST_REMOVE(bus, sibling);
+        bus->parent->num_child_bus--;
+        bus->parent = NULL;
+    } else {
+        assert(bus != sysbus_get_default()); /* main_system_bus is never freed */
+        qemu_unregister_reset(qbus_reset_all_fn, bus);
+    }
+}
+
+void qbus_create_inplace(void *bus, size_t size, const char *typename,
+                         DeviceState *parent, const char *name)
+{
+    object_initialize(bus, size, typename);
+    qbus_realize(bus, parent, name);
+}
+
+BusState *qbus_create(const char *typename, DeviceState *parent, const char *name)
+{
+    BusState *bus;
+
+    bus = BUS(object_new(typename));
+    qbus_realize(bus, parent, name);
+
+    return bus;
+}
+
+static bool bus_get_realized(Object *obj, Error **errp)
+{
+    BusState *bus = BUS(obj);
+
+    return bus->realized;
+}
+
+static void bus_set_realized(Object *obj, bool value, Error **errp)
+{
+    BusState *bus = BUS(obj);
+    BusClass *bc = BUS_GET_CLASS(bus);
+    BusChild *kid;
+    Error *local_err = NULL;
+
+    if (value && !bus->realized) {
+        if (bc->realize) {
+            bc->realize(bus, &local_err);
+        }
+
+        /* TODO: recursive realization */
+    } else if (!value && bus->realized) {
+        QTAILQ_FOREACH(kid, &bus->children, sibling) {
+            DeviceState *dev = kid->child;
+            object_property_set_bool(OBJECT(dev), false, "realized",
+                                     &local_err);
+            if (local_err != NULL) {
+                break;
+            }
+        }
+        if (bc->unrealize && local_err == NULL) {
+            bc->unrealize(bus, &local_err);
+        }
+    }
+
+    if (local_err != NULL) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    bus->realized = value;
+}
+
+static void qbus_initfn(Object *obj)
+{
+    BusState *bus = BUS(obj);
+
+    QTAILQ_INIT(&bus->children);
+    object_property_add_link(obj, QDEV_HOTPLUG_HANDLER_PROPERTY,
+                             TYPE_HOTPLUG_HANDLER,
+                             (Object **)&bus->hotplug_handler,
+                             object_property_allow_set_link,
+                             OBJ_PROP_LINK_UNREF_ON_RELEASE,
+                             NULL);
+    object_property_add_bool(obj, "realized",
+                             bus_get_realized, bus_set_realized, NULL);
+}
+
+static char *default_bus_get_fw_dev_path(DeviceState *dev)
+{
+    return g_strdup(object_get_typename(OBJECT(dev)));
+}
+
+static void bus_class_init(ObjectClass *class, void *data)
+{
+    BusClass *bc = BUS_CLASS(class);
+
+    class->unparent = bus_unparent;
+    bc->get_fw_dev_path = default_bus_get_fw_dev_path;
+}
+
+static void qbus_finalize(Object *obj)
+{
+    BusState *bus = BUS(obj);
+
+    g_free((char *)bus->name);
+}
+
+static const TypeInfo bus_info = {
+    .name = TYPE_BUS,
+    .parent = TYPE_OBJECT,
+    .instance_size = sizeof(BusState),
+    .abstract = true,
+    .class_size = sizeof(BusClass),
+    .instance_init = qbus_initfn,
+    .instance_finalize = qbus_finalize,
+    .class_init = bus_class_init,
+};
+
+static void bus_register_types(void)
+{
+    type_register_static(&bus_info);
+}
+
+type_init(bus_register_types)
diff --git a/hw/core/qdev.c b/hw/core/qdev.c
index db41aa1f26..853162b670 100644
--- a/hw/core/qdev.c
+++ b/hw/core/qdev.c
@@ -109,24 +109,6 @@ void qdev_set_parent_bus(DeviceState *dev, BusState *bus)
     bus_add_child(bus, dev);
 }
 
-static void qbus_set_hotplug_handler_internal(BusState *bus, Object *handler,
-                                              Error **errp)
-{
-
-    object_property_set_link(OBJECT(bus), OBJECT(handler),
-                             QDEV_HOTPLUG_HANDLER_PROPERTY, errp);
-}
-
-void qbus_set_hotplug_handler(BusState *bus, DeviceState *handler, Error **errp)
-{
-    qbus_set_hotplug_handler_internal(bus, OBJECT(handler), errp);
-}
-
-void qbus_set_bus_hotplug_handler(BusState *bus, Error **errp)
-{
-    qbus_set_hotplug_handler_internal(bus, OBJECT(bus), errp);
-}
-
 /* Create a new device.  This only initializes the device state
    structure and allows properties to be set.  The device still needs
    to be realized.  See qdev-core.h.  */
@@ -595,40 +577,6 @@ BusState *qdev_get_child_bus(DeviceState *dev, const char *name)
     return NULL;
 }
 
-int qbus_walk_children(BusState *bus,
-                       qdev_walkerfn *pre_devfn, qbus_walkerfn *pre_busfn,
-                       qdev_walkerfn *post_devfn, qbus_walkerfn *post_busfn,
-                       void *opaque)
-{
-    BusChild *kid;
-    int err;
-
-    if (pre_busfn) {
-        err = pre_busfn(bus, opaque);
-        if (err) {
-            return err;
-        }
-    }
-
-    QTAILQ_FOREACH(kid, &bus->children, sibling) {
-        err = qdev_walk_children(kid->child,
-                                 pre_devfn, pre_busfn,
-                                 post_devfn, post_busfn, opaque);
-        if (err < 0) {
-            return err;
-        }
-    }
-
-    if (post_busfn) {
-        err = post_busfn(bus, opaque);
-        if (err) {
-            return err;
-        }
-    }
-
-    return 0;
-}
-
 int qdev_walk_children(DeviceState *dev,
                        qdev_walkerfn *pre_devfn, qbus_walkerfn *pre_busfn,
                        qdev_walkerfn *post_devfn, qbus_walkerfn *post_busfn,
@@ -685,129 +633,6 @@ DeviceState *qdev_find_recursive(BusState *bus, const char *id)
     return NULL;
 }
 
-static void qbus_realize(BusState *bus, DeviceState *parent, const char *name)
-{
-    const char *typename = object_get_typename(OBJECT(bus));
-    BusClass *bc;
-    char *buf;
-    int i, len, bus_id;
-
-    bus->parent = parent;
-
-    if (name) {
-        bus->name = g_strdup(name);
-    } else if (bus->parent && bus->parent->id) {
-        /* parent device has id -> use it plus parent-bus-id for bus name */
-        bus_id = bus->parent->num_child_bus;
-
-        len = strlen(bus->parent->id) + 16;
-        buf = g_malloc(len);
-        snprintf(buf, len, "%s.%d", bus->parent->id, bus_id);
-        bus->name = buf;
-    } else {
-        /* no id -> use lowercase bus type plus global bus-id for bus name */
-        bc = BUS_GET_CLASS(bus);
-        bus_id = bc->automatic_ids++;
-
-        len = strlen(typename) + 16;
-        buf = g_malloc(len);
-        len = snprintf(buf, len, "%s.%d", typename, bus_id);
-        for (i = 0; i < len; i++) {
-            buf[i] = qemu_tolower(buf[i]);
-        }
-        bus->name = buf;
-    }
-
-    if (bus->parent) {
-        QLIST_INSERT_HEAD(&bus->parent->child_bus, bus, sibling);
-        bus->parent->num_child_bus++;
-        object_property_add_child(OBJECT(bus->parent), bus->name, OBJECT(bus), NULL);
-        object_unref(OBJECT(bus));
-    } else if (bus != sysbus_get_default()) {
-        /* TODO: once all bus devices are qdevified,
-           only reset handler for main_system_bus should be registered here. */
-        qemu_register_reset(qbus_reset_all_fn, bus);
-    }
-}
-
-static void bus_unparent(Object *obj)
-{
-    BusState *bus = BUS(obj);
-    BusChild *kid;
-
-    while ((kid = QTAILQ_FIRST(&bus->children)) != NULL) {
-        DeviceState *dev = kid->child;
-        object_unparent(OBJECT(dev));
-    }
-    if (bus->parent) {
-        QLIST_REMOVE(bus, sibling);
-        bus->parent->num_child_bus--;
-        bus->parent = NULL;
-    } else {
-        assert(bus != sysbus_get_default()); /* main_system_bus is never freed */
-        qemu_unregister_reset(qbus_reset_all_fn, bus);
-    }
-}
-
-static bool bus_get_realized(Object *obj, Error **errp)
-{
-    BusState *bus = BUS(obj);
-
-    return bus->realized;
-}
-
-static void bus_set_realized(Object *obj, bool value, Error **errp)
-{
-    BusState *bus = BUS(obj);
-    BusClass *bc = BUS_GET_CLASS(bus);
-    BusChild *kid;
-    Error *local_err = NULL;
-
-    if (value && !bus->realized) {
-        if (bc->realize) {
-            bc->realize(bus, &local_err);
-        }
-
-        /* TODO: recursive realization */
-    } else if (!value && bus->realized) {
-        QTAILQ_FOREACH(kid, &bus->children, sibling) {
-            DeviceState *dev = kid->child;
-            object_property_set_bool(OBJECT(dev), false, "realized",
-                                     &local_err);
-            if (local_err != NULL) {
-                break;
-            }
-        }
-        if (bc->unrealize && local_err == NULL) {
-            bc->unrealize(bus, &local_err);
-        }
-    }
-
-    if (local_err != NULL) {
-        error_propagate(errp, local_err);
-        return;
-    }
-
-    bus->realized = value;
-}
-
-void qbus_create_inplace(void *bus, size_t size, const char *typename,
-                         DeviceState *parent, const char *name)
-{
-    object_initialize(bus, size, typename);
-    qbus_realize(bus, parent, name);
-}
-
-BusState *qbus_create(const char *typename, DeviceState *parent, const char *name)
-{
-    BusState *bus;
-
-    bus = BUS(object_new(typename));
-    qbus_realize(bus, parent, name);
-
-    return bus;
-}
-
 static char *bus_get_fw_dev_path(BusState *bus, DeviceState *dev)
 {
     BusClass *bc = BUS_GET_CLASS(bus);
@@ -1315,55 +1140,8 @@ static const TypeInfo device_type_info = {
     .class_size = sizeof(DeviceClass),
 };
 
-static void qbus_initfn(Object *obj)
-{
-    BusState *bus = BUS(obj);
-
-    QTAILQ_INIT(&bus->children);
-    object_property_add_link(obj, QDEV_HOTPLUG_HANDLER_PROPERTY,
-                             TYPE_HOTPLUG_HANDLER,
-                             (Object **)&bus->hotplug_handler,
-                             object_property_allow_set_link,
-                             OBJ_PROP_LINK_UNREF_ON_RELEASE,
-                             NULL);
-    object_property_add_bool(obj, "realized",
-                             bus_get_realized, bus_set_realized, NULL);
-}
-
-static char *default_bus_get_fw_dev_path(DeviceState *dev)
-{
-    return g_strdup(object_get_typename(OBJECT(dev)));
-}
-
-static void bus_class_init(ObjectClass *class, void *data)
-{
-    BusClass *bc = BUS_CLASS(class);
-
-    class->unparent = bus_unparent;
-    bc->get_fw_dev_path = default_bus_get_fw_dev_path;
-}
-
-static void qbus_finalize(Object *obj)
-{
-    BusState *bus = BUS(obj);
-
-    g_free((char *)bus->name);
-}
-
-static const TypeInfo bus_info = {
-    .name = TYPE_BUS,
-    .parent = TYPE_OBJECT,
-    .instance_size = sizeof(BusState),
-    .abstract = true,
-    .class_size = sizeof(BusClass),
-    .instance_init = qbus_initfn,
-    .instance_finalize = qbus_finalize,
-    .class_init = bus_class_init,
-};
-
 static void qdev_register_types(void)
 {
-    type_register_static(&bus_info);
     type_register_static(&device_type_info);
 }
 
diff --git a/hw/cris/axis_dev88.c b/hw/cris/axis_dev88.c
index 9f58658741..60df8877c1 100644
--- a/hw/cris/axis_dev88.c
+++ b/hw/cris/axis_dev88.c
@@ -37,6 +37,7 @@
 #include "sysemu/block-backend.h"
 #include "exec/address-spaces.h"
 #include "sysemu/qtest.h"
+#include "sysemu/sysemu.h"
 
 #define D(x)
 #define DNAND(x)
@@ -341,8 +342,7 @@ void axisdev88_init(MachineState *machine)
     sysbus_create_varargs("etraxfs,timer", 0x3005e000, irq[0x1b], nmi[1], NULL);
 
     for (i = 0; i < 4; i++) {
-        sysbus_create_simple("etraxfs,serial", 0x30026000 + i * 0x2000,
-                             irq[0x14 + i]);
+        etraxfs_ser_create(0x30026000 + i * 0x2000, irq[0x14 + i], serial_hds[i]);
     }
 
     if (kernel_filename) {
diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c
index f244bc01c9..502d4f1c7b 100644
--- a/hw/ide/ahci.c
+++ b/hw/ide/ahci.c
@@ -1006,7 +1006,8 @@ static void execute_ncq_command(NCQTransferState *ncq_tfs)
         dma_acct_start(ide_state->blk, &ncq_tfs->acct,
                        &ncq_tfs->sglist, BLOCK_ACCT_READ);
         ncq_tfs->aiocb = dma_blk_read(ide_state->blk, &ncq_tfs->sglist,
-                                      ncq_tfs->lba, ncq_cb, ncq_tfs);
+                                      ncq_tfs->lba << BDRV_SECTOR_BITS,
+                                      ncq_cb, ncq_tfs);
         break;
     case WRITE_FPDMA_QUEUED:
         DPRINTF(port, "NCQ writing %d sectors to LBA %"PRId64", tag %d\n",
@@ -1018,7 +1019,8 @@ static void execute_ncq_command(NCQTransferState *ncq_tfs)
         dma_acct_start(ide_state->blk, &ncq_tfs->acct,
                        &ncq_tfs->sglist, BLOCK_ACCT_WRITE);
         ncq_tfs->aiocb = dma_blk_write(ide_state->blk, &ncq_tfs->sglist,
-                                       ncq_tfs->lba, ncq_cb, ncq_tfs);
+                                       ncq_tfs->lba << BDRV_SECTOR_BITS,
+                                       ncq_cb, ncq_tfs);
         break;
     default:
         DPRINTF(port, "error: unsupported NCQ command (0x%02x) received\n",
diff --git a/hw/ide/core.c b/hw/ide/core.c
index fe2bfba489..029f6b9b12 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -441,13 +441,14 @@ static void ide_issue_trim_cb(void *opaque, int ret)
     }
 }
 
-BlockAIOCB *ide_issue_trim(BlockBackend *blk,
-        int64_t offset, QEMUIOVector *qiov, BdrvRequestFlags flags,
-        BlockCompletionFunc *cb, void *opaque)
+BlockAIOCB *ide_issue_trim(
+        int64_t offset, QEMUIOVector *qiov,
+        BlockCompletionFunc *cb, void *cb_opaque, void *opaque)
 {
+    BlockBackend *blk = opaque;
     TrimAIOCB *iocb;
 
-    iocb = blk_aio_get(&trim_aiocb_info, blk, cb, opaque);
+    iocb = blk_aio_get(&trim_aiocb_info, blk, cb, cb_opaque);
     iocb->blk = blk;
     iocb->bh = qemu_bh_new(ide_trim_bh_cb, iocb);
     iocb->ret = 0;
@@ -799,6 +800,7 @@ static void ide_dma_cb(void *opaque, int ret)
     IDEState *s = opaque;
     int n;
     int64_t sector_num;
+    uint64_t offset;
     bool stay_active = false;
 
     if (ret == -ECANCELED) {
@@ -859,18 +861,20 @@ static void ide_dma_cb(void *opaque, int ret)
         return;
     }
 
+    offset = sector_num << BDRV_SECTOR_BITS;
     switch (s->dma_cmd) {
     case IDE_DMA_READ:
-        s->bus->dma->aiocb = dma_blk_read(s->blk, &s->sg, sector_num,
+        s->bus->dma->aiocb = dma_blk_read(s->blk, &s->sg, offset,
                                           ide_dma_cb, s);
         break;
     case IDE_DMA_WRITE:
-        s->bus->dma->aiocb = dma_blk_write(s->blk, &s->sg, sector_num,
+        s->bus->dma->aiocb = dma_blk_write(s->blk, &s->sg, offset,
                                            ide_dma_cb, s);
         break;
     case IDE_DMA_TRIM:
-        s->bus->dma->aiocb = dma_blk_io(s->blk, &s->sg, sector_num,
-                                        ide_issue_trim, ide_dma_cb, s,
+        s->bus->dma->aiocb = dma_blk_io(blk_get_aio_context(s->blk),
+                                        &s->sg, offset,
+                                        ide_issue_trim, s->blk, ide_dma_cb, s,
                                         DMA_DIRECTION_TO_DEVICE);
         break;
     default:
diff --git a/hw/ide/internal.h b/hw/ide/internal.h
index ceb9e5994a..773928af77 100644
--- a/hw/ide/internal.h
+++ b/hw/ide/internal.h
@@ -613,9 +613,9 @@ void ide_transfer_start(IDEState *s, uint8_t *buf, int size,
                         EndTransferFunc *end_transfer_func);
 void ide_transfer_stop(IDEState *s);
 void ide_set_inactive(IDEState *s, bool more);
-BlockAIOCB *ide_issue_trim(BlockBackend *blk,
-        int64_t offset, QEMUIOVector *qiov, BdrvRequestFlags flags,
-        BlockCompletionFunc *cb, void *opaque);
+BlockAIOCB *ide_issue_trim(
+        int64_t offset, QEMUIOVector *qiov,
+        BlockCompletionFunc *cb, void *cb_opaque, void *opaque);
 BlockAIOCB *ide_buffered_readv(IDEState *s, int64_t sector_num,
                                QEMUIOVector *iov, int nb_sectors,
                                BlockCompletionFunc *cb, void *opaque);
diff --git a/hw/ide/macio.c b/hw/ide/macio.c
index d7d9c0ff3a..42ad68a1c0 100644
--- a/hw/ide/macio.c
+++ b/hw/ide/macio.c
@@ -230,7 +230,7 @@ static void pmac_dma_trim(BlockBackend *blk,
     s->io_buffer_index += io->len;
     io->len = 0;
 
-    s->bus->dma->aiocb = ide_issue_trim(blk, offset, &io->iov, 0, cb, io);
+    s->bus->dma->aiocb = ide_issue_trim(offset, &io->iov, cb, io, blk);
 }
 
 static void pmac_ide_atapi_transfer_cb(void *opaque, int ret)
diff --git a/hw/lm32/lm32.h b/hw/lm32/lm32.h
index 18aa6fdc15..e338bfeae5 100644
--- a/hw/lm32/lm32.h
+++ b/hw/lm32/lm32.h
@@ -16,14 +16,31 @@ static inline DeviceState *lm32_pic_init(qemu_irq cpu_irq)
     return dev;
 }
 
-static inline DeviceState *lm32_juart_init(void)
+static inline DeviceState *lm32_juart_init(CharDriverState *chr)
 {
     DeviceState *dev;
 
     dev = qdev_create(NULL, TYPE_LM32_JUART);
+    qdev_prop_set_chr(dev, "chardev", chr);
     qdev_init_nofail(dev);
 
     return dev;
 }
 
+static inline DeviceState *lm32_uart_create(hwaddr addr,
+                                            qemu_irq irq,
+                                            CharDriverState *chr)
+{
+    DeviceState *dev;
+    SysBusDevice *s;
+
+    dev = qdev_create(NULL, "lm32-uart");
+    s = SYS_BUS_DEVICE(dev);
+    qdev_prop_set_chr(dev, "chardev", chr);
+    qdev_init_nofail(dev);
+    sysbus_mmio_map(s, 0, addr);
+    sysbus_connect_irq(s, 0, irq);
+    return dev;
+}
+
 #endif
diff --git a/hw/lm32/lm32_boards.c b/hw/lm32/lm32_boards.c
index c0290560fc..8f0c3079d6 100644
--- a/hw/lm32/lm32_boards.c
+++ b/hw/lm32/lm32_boards.c
@@ -31,6 +31,7 @@
 #include "lm32_hwsetup.h"
 #include "lm32.h"
 #include "exec/address-spaces.h"
+#include "sysemu/sysemu.h"
 
 typedef struct {
     LM32CPU *cpu;
@@ -131,12 +132,12 @@ static void lm32_evr_init(MachineState *machine)
         irq[i] = qdev_get_gpio_in(env->pic_state, i);
     }
 
-    sysbus_create_simple("lm32-uart", uart0_base, irq[uart0_irq]);
+    lm32_uart_create(uart0_base, irq[uart0_irq], serial_hds[0]);
     sysbus_create_simple("lm32-timer", timer0_base, irq[timer0_irq]);
     sysbus_create_simple("lm32-timer", timer1_base, irq[timer1_irq]);
 
     /* make sure juart isn't the first chardev */
-    env->juart_state = lm32_juart_init();
+    env->juart_state = lm32_juart_init(serial_hds[1]);
 
     reset_info->bootstrap_pc = flash_base;
 
@@ -232,13 +233,13 @@ static void lm32_uclinux_init(MachineState *machine)
         irq[i] = qdev_get_gpio_in(env->pic_state, i);
     }
 
-    sysbus_create_simple("lm32-uart", uart0_base, irq[uart0_irq]);
+    lm32_uart_create(uart0_base, irq[uart0_irq], serial_hds[0]);
     sysbus_create_simple("lm32-timer", timer0_base, irq[timer0_irq]);
     sysbus_create_simple("lm32-timer", timer1_base, irq[timer1_irq]);
     sysbus_create_simple("lm32-timer", timer2_base, irq[timer2_irq]);
 
     /* make sure juart isn't the first chardev */
-    env->juart_state = lm32_juart_init();
+    env->juart_state = lm32_juart_init(serial_hds[1]);
 
     reset_info->bootstrap_pc = flash_base;
 
diff --git a/hw/lm32/milkymist-hw.h b/hw/lm32/milkymist-hw.h
index f857d2846f..eb6a3a2559 100644
--- a/hw/lm32/milkymist-hw.h
+++ b/hw/lm32/milkymist-hw.h
@@ -5,11 +5,13 @@
 #include "net/net.h"
 
 static inline DeviceState *milkymist_uart_create(hwaddr base,
-        qemu_irq irq)
+                                                 qemu_irq irq,
+                                                 CharDriverState *chr)
 {
     DeviceState *dev;
 
     dev = qdev_create(NULL, "milkymist-uart");
+    qdev_prop_set_chr(dev, "chardev", chr);
     qdev_init_nofail(dev);
     sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, base);
     sysbus_connect_irq(SYS_BUS_DEVICE(dev), 0, irq);
diff --git a/hw/lm32/milkymist.c b/hw/lm32/milkymist.c
index 1abdf6e2e6..5cae0f19dd 100644
--- a/hw/lm32/milkymist.c
+++ b/hw/lm32/milkymist.c
@@ -159,7 +159,7 @@ milkymist_init(MachineState *machine)
     }
     g_free(bios_filename);
 
-    milkymist_uart_create(0x60000000, irq[0]);
+    milkymist_uart_create(0x60000000, irq[0], serial_hds[0]);
     milkymist_sysctl_create(0x60001000, irq[1], irq[2], irq[3],
             80000000, 0x10014d31, 0x0000041f, 0x00000001);
     milkymist_hpdmc_create(0x60002000);
@@ -175,7 +175,7 @@ milkymist_init(MachineState *machine)
             0x20000000, 0x1000, 0x20020000, 0x2000);
 
     /* make sure juart isn't the first chardev */
-    env->juart_state = lm32_juart_init();
+    env->juart_state = lm32_juart_init(serial_hds[1]);
 
     if (kernel_filename) {
         uint64_t entry;
diff --git a/hw/misc/ivshmem.c b/hw/misc/ivshmem.c
index e40f23bfc2..90be9f7617 100644
--- a/hw/misc/ivshmem.c
+++ b/hw/misc/ivshmem.c
@@ -33,7 +33,6 @@
 #include "sysemu/hostmem.h"
 #include "sysemu/qtest.h"
 #include "qapi/visitor.h"
-#include "exec/ram_addr.h"
 
 #include "hw/misc/ivshmem.h"
 
@@ -533,7 +532,7 @@ static void process_msg_shmem(IVShmemState *s, int fd, Error **errp)
     }
     memory_region_init_ram_ptr(&s->server_bar2, OBJECT(s),
                                "ivshmem.bar2", size, ptr);
-    qemu_set_ram_fd(memory_region_get_ram_addr(&s->server_bar2), fd);
+    memory_region_set_fd(&s->server_bar2, fd);
     s->ivshmem_bar2 = &s->server_bar2;
 }
 
@@ -940,7 +939,7 @@ static void ivshmem_exit(PCIDevice *dev)
                              strerror(errno));
             }
 
-            fd = qemu_get_ram_fd(memory_region_get_ram_addr(s->ivshmem_bar2));
+            fd = memory_region_get_fd(s->ivshmem_bar2);
             close(fd);
         }
 
diff --git a/hw/net/spapr_llan.c b/hw/net/spapr_llan.c
index a8266f8ec7..8b2eebd4e3 100644
--- a/hw/net/spapr_llan.c
+++ b/hw/net/spapr_llan.c
@@ -110,6 +110,7 @@ typedef struct VIOsPAPRVLANDevice {
     hwaddr buf_list;
     uint32_t add_buf_ptr, use_buf_ptr, rx_bufs;
     hwaddr rxq_ptr;
+    QEMUTimer *rxp_timer;
     uint32_t compat_flags;             /* Compatability flags for migration */
     RxBufPool *rx_pool[RX_MAX_POOLS];  /* Receive buffer descriptor pools */
 } VIOsPAPRVLANDevice;
@@ -122,6 +123,21 @@ static int spapr_vlan_can_receive(NetClientState *nc)
 }
 
 /**
+ * The last 8 bytes of the receive buffer list page (that has been
+ * supplied by the guest with the H_REGISTER_LOGICAL_LAN call) contain
+ * a counter for frames that have been dropped because there was no
+ * suitable receive buffer available. This function is used to increase
+ * this counter by one.
+ */
+static void spapr_vlan_record_dropped_rx_frame(VIOsPAPRVLANDevice *dev)
+{
+    /* The drop counter occupies the final 8 bytes of the 4 KiB list page. */
+    hwaddr counter_addr = dev->buf_list + 4096 - 8;
+    uint64_t dropped = vio_ldq(&dev->sdev, counter_addr);
+
+    vio_stq(&dev->sdev, counter_addr, dropped + 1);
+}
+
+/**
  * Get buffer descriptor from one of our receive buffer pools
  */
 static vlan_bd_t spapr_vlan_get_rx_bd_from_pool(VIOsPAPRVLANDevice *dev,
@@ -206,7 +222,8 @@ static ssize_t spapr_vlan_receive(NetClientState *nc, const uint8_t *buf,
     }
 
     if (!dev->rx_bufs) {
-        return -1;
+        spapr_vlan_record_dropped_rx_frame(dev);
+        return 0;
     }
 
     if (dev->compat_flags & SPAPRVLAN_FLAG_RX_BUF_POOLS) {
@@ -215,7 +232,8 @@ static ssize_t spapr_vlan_receive(NetClientState *nc, const uint8_t *buf,
         bd = spapr_vlan_get_rx_bd_from_page(dev, size);
     }
     if (!bd) {
-        return -1;
+        spapr_vlan_record_dropped_rx_frame(dev);
+        return 0;
     }
 
     dev->rx_bufs--;
@@ -266,6 +284,13 @@ static NetClientInfo net_spapr_vlan_info = {
     .receive = spapr_vlan_receive,
 };
 
+/* rxp_timer callback: flush the packets queued for the NIC of @opaque. */
+static void spapr_vlan_flush_rx_queue(void *opaque)
+{
+    VIOsPAPRVLANDevice *vlan = opaque;
+    qemu_flush_queued_packets(qemu_get_queue(vlan->nic));
+}
+
 static void spapr_vlan_reset_rx_pool(RxBufPool *rxp)
 {
     /*
@@ -302,6 +327,9 @@ static void spapr_vlan_realize(VIOsPAPRDevice *sdev, Error **errp)
     dev->nic = qemu_new_nic(&net_spapr_vlan_info, &dev->nicconf,
                             object_get_typename(OBJECT(sdev)), sdev->qdev.id, dev);
     qemu_format_nic_info_str(qemu_get_queue(dev->nic), dev->nicconf.macaddr.a);
+
+    dev->rxp_timer = timer_new_us(QEMU_CLOCK_VIRTUAL, spapr_vlan_flush_rx_queue,
+                                  dev);
 }
 
 static void spapr_vlan_instance_init(Object *obj)
@@ -332,6 +360,11 @@ static void spapr_vlan_instance_finalize(Object *obj)
             dev->rx_pool[i] = NULL;
         }
     }
+
+    if (dev->rxp_timer) {
+        timer_del(dev->rxp_timer);
+        timer_free(dev->rxp_timer);
+    }
 }
 
 void spapr_vlan_create(VIOsPAPRBus *bus, NICInfo *nd)
@@ -629,7 +662,13 @@ static target_ulong h_add_logical_lan_buffer(PowerPCCPU *cpu,
 
     dev->rx_bufs++;
 
-    qemu_flush_queued_packets(qemu_get_queue(dev->nic));
+    /*
+     * Give guest some more time to add additional RX buffers before we
+     * flush the receive queue, so that e.g. fragmented IP packets can
+     * be passed to the guest in one go later (instead of passing single
+     * fragments if there is only one receive buffer available).
+     */
+    timer_mod(dev->rxp_timer, qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + 500);
 
     return H_SUCCESS;
 }
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index add68acfef..44e401ae99 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1842,6 +1842,10 @@ static void ppc_spapr_init(MachineState *machine)
         exit(1);
     }
     spapr->rtas_size = get_image_size(filename);
+    if (spapr->rtas_size < 0) {
+        error_report("Could not get size of LPAR rtas '%s'", filename);
+        exit(1);
+    }
     spapr->rtas_blob = g_malloc(spapr->rtas_size);
     if (load_image_size(filename, spapr->rtas_blob, spapr->rtas_size) < 0) {
         error_report("Could not load LPAR rtas '%s'", filename);
@@ -2132,15 +2136,6 @@ static void spapr_add_lmbs(DeviceState *dev, uint64_t addr, uint64_t size,
     int i, fdt_offset, fdt_size;
     void *fdt;
 
-    /*
-     * Check for DRC connectors and send hotplug notification to the
-     * guest only in case of hotplugged memory. This allows cold plugged
-     * memory to be specified at boot time.
-     */
-    if (!dev->hotplugged) {
-        return;
-    }
-
     for (i = 0; i < nr_lmbs; i++) {
         drc = spapr_dr_connector_by_id(SPAPR_DR_CONNECTOR_TYPE_LMB,
                 addr/SPAPR_MEMORY_BLOCK_SIZE);
@@ -2154,7 +2149,12 @@ static void spapr_add_lmbs(DeviceState *dev, uint64_t addr, uint64_t size,
         drck->attach(drc, dev, fdt, fdt_offset, !dev->hotplugged, errp);
         addr += SPAPR_MEMORY_BLOCK_SIZE;
     }
-    spapr_hotplug_req_add_by_count(SPAPR_DR_CONNECTOR_TYPE_LMB, nr_lmbs);
+    /*
+     * Send the hotplug notification to the guest only for hotplugged memory.
+     */
+    if (dev->hotplugged) {
+        spapr_hotplug_req_add_by_count(SPAPR_DR_CONNECTOR_TYPE_LMB, nr_lmbs);
+    }
 }
 
 static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index feb3629664..9a3f4ecc1e 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -186,6 +186,7 @@ static RemoveResult remove_hpte(PowerPCCPU *cpu, target_ulong ptex,
 static target_ulong h_remove(PowerPCCPU *cpu, sPAPRMachineState *spapr,
                              target_ulong opcode, target_ulong *args)
 {
+    CPUPPCState *env = &cpu->env;
     target_ulong flags = args[0];
     target_ulong pte_index = args[1];
     target_ulong avpn = args[2];
@@ -196,6 +197,7 @@ static target_ulong h_remove(PowerPCCPU *cpu, sPAPRMachineState *spapr,
 
     switch (ret) {
     case REMOVE_SUCCESS:
+        check_tlb_flush(env);
         return H_SUCCESS;
 
     case REMOVE_NOT_FOUND:
@@ -232,7 +234,9 @@ static target_ulong h_remove(PowerPCCPU *cpu, sPAPRMachineState *spapr,
 static target_ulong h_bulk_remove(PowerPCCPU *cpu, sPAPRMachineState *spapr,
                                   target_ulong opcode, target_ulong *args)
 {
+    CPUPPCState *env = &cpu->env;
     int i;
+    target_ulong rc = H_SUCCESS;
 
     for (i = 0; i < H_BULK_REMOVE_MAX_BATCH; i++) {
         target_ulong *tsh = &args[i*2];
@@ -265,14 +269,18 @@ static target_ulong h_bulk_remove(PowerPCCPU *cpu, sPAPRMachineState *spapr,
             break;
 
         case REMOVE_PARM:
-            return H_PARAMETER;
+            rc = H_PARAMETER;
+            goto exit;
 
         case REMOVE_HW:
-            return H_HARDWARE;
+            rc = H_HARDWARE;
+            goto exit;
         }
     }
+ exit:
+    check_tlb_flush(env);
 
-    return H_SUCCESS;
+    return rc;
 }
 
 static target_ulong h_protect(PowerPCCPU *cpu, sPAPRMachineState *spapr,
diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
index 722db91ffa..96bb0181a7 100644
--- a/hw/ppc/spapr_iommu.c
+++ b/hw/ppc/spapr_iommu.c
@@ -76,6 +76,37 @@ static IOMMUAccessFlags spapr_tce_iommu_access_flags(uint64_t tce)
     }
 }
 
+/*
+ * Allocate the TCE table for @liobn.  kvmppc_create_spapr_tce() is tried when
+ * KVM is enabled and the window is below 4 GiB; if no table results, *@fd is
+ * set to -1 and a zeroed array is allocated with overflow-checked g_new0().
+ */
+static uint64_t *spapr_tce_alloc_table(uint32_t liobn, uint32_t page_shift,
+                                       uint32_t nb_table, int *fd,
+                                       bool need_vfio)
+{
+    uint64_t *table = NULL;
+    uint64_t window_size = (uint64_t)nb_table << page_shift;
+
+    if (kvm_enabled() && !(window_size >> 32)) {
+        table = kvmppc_create_spapr_tce(liobn, window_size, fd, need_vfio);
+    }
+    if (!table) {
+        *fd = -1;
+        table = g_new0(uint64_t, nb_table);
+    }
+    trace_spapr_iommu_new_table(liobn, table, *fd);
+    return table;
+}
+
+static void spapr_tce_free_table(uint64_t *table, int fd, uint32_t nb_table)
+{
+    if (kvm_enabled() && kvmppc_remove_spapr_tce(table, fd, nb_table) == 0) {
+        return;
+    }
+    g_free(table); /* KVM disabled, or the KVM removal returned non-zero */
+}
+
 /* Called from RCU critical section */
 static IOMMUTLBEntry spapr_tce_translate_iommu(MemoryRegion *iommu, hwaddr addr,
                                                bool is_write)
@@ -142,21 +173,13 @@ static MemoryRegionIOMMUOps spapr_iommu_ops = {
 static int spapr_tce_table_realize(DeviceState *dev)
 {
     sPAPRTCETable *tcet = SPAPR_TCE_TABLE(dev);
-    uint64_t window_size = (uint64_t)tcet->nb_table << tcet->page_shift;
-
-    if (kvm_enabled() && !(window_size >> 32)) {
-        tcet->table = kvmppc_create_spapr_tce(tcet->liobn,
-                                              window_size,
-                                              &tcet->fd,
-                                              tcet->need_vfio);
-    }
 
-    if (!tcet->table) {
-        size_t table_size = tcet->nb_table * sizeof(uint64_t);
-        tcet->table = g_malloc0(table_size);
-    }
-
-    trace_spapr_iommu_new_table(tcet->liobn, tcet, tcet->table, tcet->fd);
+    tcet->fd = -1;
+    tcet->table = spapr_tce_alloc_table(tcet->liobn,
+                                        tcet->page_shift,
+                                        tcet->nb_table,
+                                        &tcet->fd,
+                                        tcet->need_vfio);
 
     memory_region_init_iommu(&tcet->iommu, OBJECT(dev), &spapr_iommu_ops,
                              "iommu-spapr",
@@ -242,11 +265,8 @@ static void spapr_tce_table_unrealize(DeviceState *dev, Error **errp)
 
     QLIST_REMOVE(tcet, list);
 
-    if (!kvm_enabled() ||
-        (kvmppc_remove_spapr_tce(tcet->table, tcet->fd,
-                                 tcet->nb_table) != 0)) {
-        g_free(tcet->table);
-    }
+    spapr_tce_free_table(tcet->table, tcet->fd, tcet->nb_table);
+    tcet->fd = -1;
 }
 
 MemoryRegion *spapr_tce_get_iommu(sPAPRTCETable *tcet)
@@ -278,7 +298,7 @@ static target_ulong put_tce_emu(sPAPRTCETable *tcet, target_ulong ioba,
     tcet->table[index] = tce;
 
     entry.target_as = &address_space_memory,
-    entry.iova = ioba & page_mask;
+    entry.iova = (ioba - tcet->bus_offset) & page_mask;
     entry.translated_addr = tce & page_mask;
     entry.addr_mask = ~page_mask;
     entry.perm = spapr_tce_iommu_access_flags(tce);
diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index e55b505c96..856aec7f51 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -1093,13 +1093,11 @@ static void spapr_phb_add_pci_device(sPAPRDRConnector *drc,
         spapr_tce_set_need_vfio(tcet, true);
     }
 
-    if (dev->hotplugged) {
-        fdt = create_device_tree(&fdt_size);
-        fdt_start_offset = spapr_create_pci_child_dt(phb, pdev, fdt, 0);
-        if (!fdt_start_offset) {
-            error_setg(errp, "Failed to create pci child device tree node");
-            goto out;
-        }
+    fdt = create_device_tree(&fdt_size);
+    fdt_start_offset = spapr_create_pci_child_dt(phb, pdev, fdt, 0);
+    if (!fdt_start_offset) {
+        error_setg(errp, "Failed to create pci child device tree node");
+        goto out;
     }
 
     drck->attach(drc, DEVICE(pdev),
@@ -1816,7 +1814,7 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
     _FDT(fdt_setprop(fdt, bus_off, "interrupt-map", &interrupt_map,
                      sizeof(interrupt_map)));
 
-    tcet = spapr_tce_find_by_liobn(SPAPR_PCI_LIOBN(phb->index, 0));
+    tcet = spapr_tce_find_by_liobn(phb->dma_liobn);
     if (!tcet) {
         return -1;
     }
diff --git a/hw/s390x/s390-skeys.c b/hw/s390x/s390-skeys.c
index d772cfc7ea..e2d4e1af79 100644
--- a/hw/s390x/s390-skeys.c
+++ b/hw/s390x/s390-skeys.c
@@ -47,15 +47,11 @@ void s390_skeys_init(void)
     qdev_init_nofail(DEVICE(obj));
 }
 
-static void write_keys(QEMUFile *f, uint8_t *keys, uint64_t startgfn,
+static void write_keys(FILE *f, uint8_t *keys, uint64_t startgfn,
                        uint64_t count, Error **errp)
 {
     uint64_t curpage = startgfn;
     uint64_t maxpage = curpage + count - 1;
-    const char *fmt = "page=%03" PRIx64 ": key(%d) => ACC=%X, FP=%d, REF=%d,"
-                      " ch=%d, reserved=%d\n";
-    char buf[128];
-    int len;
 
     for (; curpage <= maxpage; curpage++) {
         uint8_t acc = (*keys & 0xF0) >> 4;
@@ -64,10 +60,9 @@ static void write_keys(QEMUFile *f, uint8_t *keys, uint64_t startgfn,
         int ch = (*keys & 0x02);
         int res = (*keys & 0x01);
 
-        len = snprintf(buf, sizeof(buf), fmt, curpage,
-                       *keys, acc, fp, ref, ch, res);
-        assert(len < sizeof(buf));
-        qemu_put_buffer(f, (uint8_t *)buf, len);
+        fprintf(f, "page=%03" PRIx64 ": key(%d) => ACC=%X, FP=%d, REF=%d,"
+                " ch=%d, reserved=%d\n",
+                curpage, *keys, acc, fp, ref, ch, res);
         keys++;
     }
 }
@@ -116,7 +111,8 @@ void qmp_dump_skeys(const char *filename, Error **errp)
     vaddr cur_gfn = 0;
     uint8_t *buf;
     int ret;
-    QEMUFile *f;
+    int fd;
+    FILE *f;
 
     /* Quick check to see if guest is using storage keys*/
     if (!skeyclass->skeys_enabled(ss)) {
@@ -125,8 +121,14 @@ void qmp_dump_skeys(const char *filename, Error **errp)
         return;
     }
 
-    f = qemu_fopen(filename, "wb");
+    fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0600);
+    if (fd < 0) {
+        error_setg_file_open(errp, errno, filename);
+        return;
+    }
+    f = fdopen(fd, "wb");
     if (!f) {
+        close(fd);
         error_setg_file_open(errp, errno, filename);
         return;
     }
@@ -162,7 +164,7 @@ out_free:
     error_propagate(errp, lerr);
     g_free(buf);
 out:
-    qemu_fclose(f);
+    fclose(f);
 }
 
 static void qemu_s390_skeys_init(Object *obj)
diff --git a/hw/scsi/megasas.c b/hw/scsi/megasas.c
index a63a581550..cc66d36186 100644
--- a/hw/scsi/megasas.c
+++ b/hw/scsi/megasas.c
@@ -650,7 +650,9 @@ static int megasas_init_firmware(MegasasState *s, MegasasCmd *cmd)
     pa_hi = le32_to_cpu(initq->pi_addr_hi);
     s->producer_pa = ((uint64_t) pa_hi << 32) | pa_lo;
     s->reply_queue_head = ldl_le_pci_dma(pcid, s->producer_pa);
+    s->reply_queue_head %= MEGASAS_MAX_FRAMES;
     s->reply_queue_tail = ldl_le_pci_dma(pcid, s->consumer_pa);
+    s->reply_queue_tail %= MEGASAS_MAX_FRAMES;
     flags = le32_to_cpu(initq->flags);
     if (flags & MFI_QUEUE_FLAG_CONTEXT64) {
         s->flags |= MEGASAS_MASK_USE_QUEUE64;
@@ -1293,7 +1295,7 @@ static int megasas_dcmd_ld_get_info(MegasasState *s, MegasasCmd *cmd)
 
 static int megasas_dcmd_cfg_read(MegasasState *s, MegasasCmd *cmd)
 {
-    uint8_t data[4096];
+    uint8_t data[4096] = { 0 };
     struct mfi_config_data *info;
     int num_pd_disks = 0, array_offset, ld_offset;
     BusChild *kid;
@@ -1446,7 +1448,7 @@ static int megasas_dcmd_set_properties(MegasasState *s, MegasasCmd *cmd)
                                             dcmd_size);
         return MFI_STAT_INVALID_PARAMETER;
     }
-    dma_buf_write((uint8_t *)&info, cmd->iov_size, &cmd->qsg);
+    dma_buf_write((uint8_t *)&info, dcmd_size, &cmd->qsg);
     trace_megasas_dcmd_unsupported(cmd->index, cmd->iov_size);
     return MFI_STAT_OK;
 }
diff --git a/hw/scsi/mptsas.c b/hw/scsi/mptsas.c
index 499c1465ae..be88e161a9 100644
--- a/hw/scsi/mptsas.c
+++ b/hw/scsi/mptsas.c
@@ -754,11 +754,6 @@ static void mptsas_fetch_request(MPTSASState *s)
     hwaddr addr;
     int size;
 
-    if (s->state != MPI_IOC_STATE_OPERATIONAL) {
-        mptsas_set_fault(s, MPI_IOCSTATUS_INVALID_STATE);
-        return;
-    }
-
     /* Read the message header from the guest first. */
     addr = s->host_mfa_high_addr | MPTSAS_FIFO_GET(s, request_post);
     pci_dma_read(pci, addr, req, sizeof(hdr));
@@ -789,6 +784,10 @@ static void mptsas_fetch_requests(void *opaque)
 {
     MPTSASState *s = opaque;
 
+    if (s->state != MPI_IOC_STATE_OPERATIONAL) {
+        mptsas_set_fault(s, MPI_IOCSTATUS_INVALID_STATE);
+        return;
+    }
     while (!MPTSAS_FIFO_EMPTY(s, request_post)) {
         mptsas_fetch_request(s);
     }
diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index ce89c98b4e..ace65e0720 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -53,7 +53,21 @@ do { printf("scsi-disk: " fmt , ## __VA_ARGS__); } while (0)
 #define DEFAULT_MAX_UNMAP_SIZE      (1 << 30)   /* 1 GB */
 #define DEFAULT_MAX_IO_SIZE         INT_MAX     /* 2 GB - 1 block */
 
-typedef struct SCSIDiskState SCSIDiskState;
+#define TYPE_SCSI_DISK_BASE         "scsi-disk-base"
+
+#define SCSI_DISK_BASE(obj) \
+     OBJECT_CHECK(SCSIDiskState, (obj), TYPE_SCSI_DISK_BASE)
+#define SCSI_DISK_BASE_CLASS(klass) \
+     OBJECT_CLASS_CHECK(SCSIDiskClass, (klass), TYPE_SCSI_DISK_BASE)
+#define SCSI_DISK_BASE_GET_CLASS(obj) \
+     OBJECT_GET_CLASS(SCSIDiskClass, (obj), TYPE_SCSI_DISK_BASE)
+
+typedef struct SCSIDiskClass {
+    SCSIDeviceClass parent_class;
+    DMAIOFunc       *dma_readv;
+    DMAIOFunc       *dma_writev;
+    bool            (*need_fua_emulation)(SCSICommand *cmd);
+} SCSIDiskClass;
 
 typedef struct SCSIDiskReq {
     SCSIRequest req;
@@ -62,16 +76,18 @@ typedef struct SCSIDiskReq {
     uint32_t sector_count;
     uint32_t buflen;
     bool started;
+    bool need_fua_emulation;
     struct iovec iov;
     QEMUIOVector qiov;
     BlockAcctCookie acct;
+    unsigned char *status;
 } SCSIDiskReq;
 
 #define SCSI_DISK_F_REMOVABLE             0
 #define SCSI_DISK_F_DPOFUA                1
 #define SCSI_DISK_F_NO_REMOVABLE_DEVOPS   2
 
-struct SCSIDiskState
+typedef struct SCSIDiskState
 {
     SCSIDevice qdev;
     uint32_t features;
@@ -88,7 +104,7 @@ struct SCSIDiskState
     char *product;
     bool tray_open;
     bool tray_locked;
-};
+} SCSIDiskState;
 
 static int scsi_handle_rw_error(SCSIDiskReq *r, int error, bool acct_failed);
 
@@ -161,6 +177,29 @@ static void scsi_disk_load_request(QEMUFile *f, SCSIRequest *req)
     qemu_iovec_init_external(&r->qiov, &r->iov, 1);
 }
 
+/*
+ * Returns true if the caller should not continue with the request: it was
+ * cancelled, scsi_handle_rw_error() returned true for a negative @ret, or it
+ * has just been completed with the non-zero status found in *r->status.
+ */
+static bool scsi_disk_req_check_error(SCSIDiskReq *r, int ret, bool acct_failed)
+{
+    if (r->req.io_canceled) {
+        scsi_req_cancel_complete(&r->req);
+        return true;
+    } else if (ret < 0) {
+        return scsi_handle_rw_error(r, -ret, acct_failed);
+    } else if (!r->status || !*r->status) {
+        return false;
+    }
+    if (acct_failed) {
+        SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
+        block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct);
+    }
+    scsi_req_complete(&r->req, *r->status);
+    return true;
+}
+
 static void scsi_aio_complete(void *opaque, int ret)
 {
     SCSIDiskReq *r = (SCSIDiskReq *)opaque;
@@ -168,17 +207,10 @@ static void scsi_aio_complete(void *opaque, int ret)
 
     assert(r->req.aiocb != NULL);
     r->req.aiocb = NULL;
-    if (r->req.io_canceled) {
-        scsi_req_cancel_complete(&r->req);
+    if (scsi_disk_req_check_error(r, ret, true)) {
         goto done;
     }
 
-    if (ret < 0) {
-        if (scsi_handle_rw_error(r, -ret, true)) {
-            goto done;
-        }
-    }
-
     block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
     scsi_req_complete(&r->req, GOOD);
 
@@ -217,13 +249,9 @@ static void scsi_write_do_fua(SCSIDiskReq *r)
     SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
 
     assert(r->req.aiocb == NULL);
+    assert(!r->req.io_canceled);
 
-    if (r->req.io_canceled) {
-        scsi_req_cancel_complete(&r->req);
-        goto done;
-    }
-
-    if (scsi_is_cmd_fua(&r->req.cmd)) {
+    if (r->need_fua_emulation) {
         block_acct_start(blk_get_stats(s->qdev.conf.blk), &r->acct, 0,
                          BLOCK_ACCT_FLUSH);
         r->req.aiocb = blk_aio_flush(s->qdev.conf.blk, scsi_aio_complete, r);
@@ -231,26 +259,16 @@ static void scsi_write_do_fua(SCSIDiskReq *r)
     }
 
     scsi_req_complete(&r->req, GOOD);
-
-done:
     scsi_req_unref(&r->req);
 }
 
 static void scsi_dma_complete_noio(SCSIDiskReq *r, int ret)
 {
     assert(r->req.aiocb == NULL);
-
-    if (r->req.io_canceled) {
-        scsi_req_cancel_complete(&r->req);
+    if (scsi_disk_req_check_error(r, ret, false)) {
         goto done;
     }
 
-    if (ret < 0) {
-        if (scsi_handle_rw_error(r, -ret, false)) {
-            goto done;
-        }
-    }
-
     r->sector += r->sector_count;
     r->sector_count = 0;
     if (r->req.cmd.mode == SCSI_XFER_TO_DEV) {
@@ -288,17 +306,10 @@ static void scsi_read_complete(void * opaque, int ret)
 
     assert(r->req.aiocb != NULL);
     r->req.aiocb = NULL;
-    if (r->req.io_canceled) {
-        scsi_req_cancel_complete(&r->req);
+    if (scsi_disk_req_check_error(r, ret, true)) {
         goto done;
     }
 
-    if (ret < 0) {
-        if (scsi_handle_rw_error(r, -ret, true)) {
-            goto done;
-        }
-    }
-
     block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
     DPRINTF("Data ready tag=0x%x len=%zd\n", r->req.tag, r->qiov.size);
 
@@ -315,35 +326,29 @@ done:
 static void scsi_do_read(SCSIDiskReq *r, int ret)
 {
     SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
+    SCSIDiskClass *sdc = (SCSIDiskClass *) object_get_class(OBJECT(s));
 
     assert (r->req.aiocb == NULL);
-
-    if (r->req.io_canceled) {
-        scsi_req_cancel_complete(&r->req);
+    if (scsi_disk_req_check_error(r, ret, false)) {
         goto done;
     }
 
-    if (ret < 0) {
-        if (scsi_handle_rw_error(r, -ret, false)) {
-            goto done;
-        }
-    }
-
     /* The request is used as the AIO opaque value, so add a ref.  */
     scsi_req_ref(&r->req);
 
     if (r->req.sg) {
         dma_acct_start(s->qdev.conf.blk, &r->acct, r->req.sg, BLOCK_ACCT_READ);
         r->req.resid -= r->req.sg->size;
-        r->req.aiocb = dma_blk_read(s->qdev.conf.blk, r->req.sg, r->sector,
-                                    scsi_dma_complete, r);
+        r->req.aiocb = dma_blk_io(blk_get_aio_context(s->qdev.conf.blk),
+                                  r->req.sg, r->sector << BDRV_SECTOR_BITS,
+                                  sdc->dma_readv, r, scsi_dma_complete, r,
+                                  DMA_DIRECTION_FROM_DEVICE);
     } else {
         scsi_init_iovec(r, SCSI_DMA_BUF_SIZE);
         block_acct_start(blk_get_stats(s->qdev.conf.blk), &r->acct,
                          r->qiov.size, BLOCK_ACCT_READ);
-        r->req.aiocb = blk_aio_preadv(s->qdev.conf.blk,
-                                      r->sector << BDRV_SECTOR_BITS, &r->qiov,
-                                      0, scsi_read_complete, r);
+        r->req.aiocb = sdc->dma_readv(r->sector << BDRV_SECTOR_BITS, &r->qiov,
+                                      scsi_read_complete, r, r);
     }
 
 done:
@@ -398,7 +403,7 @@ static void scsi_read_data(SCSIRequest *req)
 
     first = !r->started;
     r->started = true;
-    if (first && scsi_is_cmd_fua(&r->req.cmd)) {
+    if (first && r->need_fua_emulation) {
         block_acct_start(blk_get_stats(s->qdev.conf.blk), &r->acct, 0,
                          BLOCK_ACCT_FLUSH);
         r->req.aiocb = blk_aio_flush(s->qdev.conf.blk, scsi_do_read_cb, r);
@@ -455,18 +460,10 @@ static void scsi_write_complete_noio(SCSIDiskReq *r, int ret)
     uint32_t n;
 
     assert (r->req.aiocb == NULL);
-
-    if (r->req.io_canceled) {
-        scsi_req_cancel_complete(&r->req);
+    if (scsi_disk_req_check_error(r, ret, false)) {
         goto done;
     }
 
-    if (ret < 0) {
-        if (scsi_handle_rw_error(r, -ret, false)) {
-            goto done;
-        }
-    }
-
     n = r->qiov.size / 512;
     r->sector += n;
     r->sector_count -= n;
@@ -503,6 +500,7 @@ static void scsi_write_data(SCSIRequest *req)
 {
     SCSIDiskReq *r = DO_UPCAST(SCSIDiskReq, req, req);
     SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
+    SCSIDiskClass *sdc = (SCSIDiskClass *) object_get_class(OBJECT(s));
 
     /* No data transfer may already be in progress */
     assert(r->req.aiocb == NULL);
@@ -539,14 +537,15 @@ static void scsi_write_data(SCSIRequest *req)
     if (r->req.sg) {
         dma_acct_start(s->qdev.conf.blk, &r->acct, r->req.sg, BLOCK_ACCT_WRITE);
         r->req.resid -= r->req.sg->size;
-        r->req.aiocb = dma_blk_write(s->qdev.conf.blk, r->req.sg, r->sector,
-                                     scsi_dma_complete, r);
+        r->req.aiocb = dma_blk_io(blk_get_aio_context(s->qdev.conf.blk),
+                                  r->req.sg, r->sector << BDRV_SECTOR_BITS,
+                                  sdc->dma_writev, r, scsi_dma_complete, r,
+                                  DMA_DIRECTION_TO_DEVICE);
     } else {
         block_acct_start(blk_get_stats(s->qdev.conf.blk), &r->acct,
                          r->qiov.size, BLOCK_ACCT_WRITE);
-        r->req.aiocb = blk_aio_pwritev(s->qdev.conf.blk,
-                                       r->sector << BDRV_SECTOR_BITS, &r->qiov,
-                                       0, scsi_write_complete, r);
+        r->req.aiocb = sdc->dma_writev(r->sector << BDRV_SECTOR_BITS, &r->qiov,
+                                       scsi_write_complete, r, r);
     }
 }
 
@@ -1598,18 +1597,10 @@ static void scsi_unmap_complete_noio(UnmapCBData *data, int ret)
     uint32_t nb_sectors;
 
     assert(r->req.aiocb == NULL);
-
-    if (r->req.io_canceled) {
-        scsi_req_cancel_complete(&r->req);
+    if (scsi_disk_req_check_error(r, ret, false)) {
         goto done;
     }
 
-    if (ret < 0) {
-        if (scsi_handle_rw_error(r, -ret, false)) {
-            goto done;
-        }
-    }
-
     if (data->count > 0) {
         sector_num = ldq_be_p(&data->inbuf[0]);
         nb_sectors = ldl_be_p(&data->inbuf[8]) & 0xffffffffULL;
@@ -1709,17 +1700,10 @@ static void scsi_write_same_complete(void *opaque, int ret)
 
     assert(r->req.aiocb != NULL);
     r->req.aiocb = NULL;
-    if (r->req.io_canceled) {
-        scsi_req_cancel_complete(&r->req);
+    if (scsi_disk_req_check_error(r, ret, true)) {
         goto done;
     }
 
-    if (ret < 0) {
-        if (scsi_handle_rw_error(r, -ret, true)) {
-            goto done;
-        }
-    }
-
     block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
 
     data->nb_sectors -= data->iov.iov_len / 512;
@@ -1778,7 +1762,7 @@ static void scsi_disk_emulate_write_same(SCSIDiskReq *r, uint8_t *inbuf)
         block_acct_start(blk_get_stats(s->qdev.conf.blk), &r->acct,
                          nb_sectors * s->qdev.blocksize,
                         BLOCK_ACCT_WRITE);
-        r->req.aiocb = blk_aio_write_zeroes(s->qdev.conf.blk,
+        r->req.aiocb = blk_aio_pwrite_zeroes(s->qdev.conf.blk,
                                 r->req.cmd.lba * s->qdev.blocksize,
                                 nb_sectors * s->qdev.blocksize,
                                 flags, scsi_aio_complete, r);
@@ -2136,6 +2120,7 @@ static int32_t scsi_disk_dma_command(SCSIRequest *req, uint8_t *buf)
 {
     SCSIDiskReq *r = DO_UPCAST(SCSIDiskReq, req, req);
     SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, req->dev);
+    SCSIDiskClass *sdc = (SCSIDiskClass *) object_get_class(OBJECT(s));
     uint32_t len;
     uint8_t command;
 
@@ -2194,6 +2179,7 @@ static int32_t scsi_disk_dma_command(SCSIRequest *req, uint8_t *buf)
         scsi_check_condition(r, SENSE_CODE(LBA_OUT_OF_RANGE));
         return 0;
     }
+    r->need_fua_emulation = sdc->need_fua_emulation(&r->req.cmd);
     if (r->sector_count == 0) {
         scsi_req_complete(&r->req, GOOD);
     }
@@ -2576,16 +2562,145 @@ static void scsi_block_realize(SCSIDevice *dev, Error **errp)
     scsi_generic_read_device_identification(&s->qdev);
 }
 
+typedef struct SCSIBlockReq {
+    SCSIDiskReq req;        /* first member: SCSIRequest pointers are cast */
+    sg_io_hdr_t io_header;  /* filled in by scsi_block_do_sgio() */
+
+    /* Opcode, byte 1 and group number saved from the guest's CDB.  */
+    uint8_t cmd, cdb1, group_number;
+
+    /* CDB rebuilt by scsi_block_do_sgio() and passed to SG_IO.  */
+    uint8_t cdb[16];
+} SCSIBlockReq;
+
+static BlockAIOCB *scsi_block_do_sgio(SCSIBlockReq *req,
+                                      int64_t offset, QEMUIOVector *iov,
+                                      int direction,
+                                      BlockCompletionFunc *cb, void *opaque)
+{
+    sg_io_hdr_t *io_header = &req->io_header;
+    SCSIDiskReq *r = &req->req;
+    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
+    int nb_logical_blocks;
+    uint64_t lba;
+    BlockAIOCB *aiocb;
+
+    /* Unaligned transfers are not supported yet.  They can only happen
+     * if the guest does reads and writes that are not aligned to one
+     * logical sector _and_ cover multiple MemoryRegions.
+     */
+    assert(offset % s->qdev.blocksize == 0);
+    assert(iov->size % s->qdev.blocksize == 0);
+
+    io_header->interface_id = 'S';
+
+    /* The data transfer comes from the QEMUIOVector.  */
+    io_header->dxfer_direction = direction;
+    io_header->dxfer_len = iov->size;
+    io_header->dxferp = (void *)iov->iov;
+    io_header->iovec_count = iov->niov;
+    assert(io_header->iovec_count == iov->niov); /* no overflow! */
+
+    /* Build a new CDB with the LBA and length patched in, in case
+     * DMA helpers split the transfer in multiple segments.  Do not
+     * build a CDB smaller than what the guest wanted, and only build
+     * a larger one if strictly necessary.
+     */
+    io_header->cmdp = req->cdb;
+    lba = offset / s->qdev.blocksize;   /* byte offset -> logical block */
+    nb_logical_blocks = io_header->dxfer_len / s->qdev.blocksize;
+
+    if ((req->cmd >> 5) == 0 && lba <= 0x1ffff) {
+        /* 6-byte CDB: opcode and LBA are packed into bytes 0-3 */
+        stl_be_p(&req->cdb[0], lba | (req->cmd << 24));
+        req->cdb[4] = nb_logical_blocks;
+        req->cdb[5] = 0;
+        io_header->cmd_len = 6;
+    } else if ((req->cmd >> 5) <= 1 && lba <= 0xffffffffULL) {
+        /* 10-byte CDB: 32-bit LBA, 16-bit block count */
+        req->cdb[0] = (req->cmd & 0x1f) | 0x20;
+        req->cdb[1] = req->cdb1;
+        stl_be_p(&req->cdb[2], lba);
+        req->cdb[6] = req->group_number;
+        stw_be_p(&req->cdb[7], nb_logical_blocks);
+        req->cdb[9] = 0;
+        io_header->cmd_len = 10;
+    } else if ((req->cmd >> 5) != 4 && lba <= 0xffffffffULL) {
+        /* 12-byte CDB: 32-bit LBA, 32-bit block count */
+        req->cdb[0] = (req->cmd & 0x1f) | 0xA0;
+        req->cdb[1] = req->cdb1;
+        stl_be_p(&req->cdb[2], lba);
+        stl_be_p(&req->cdb[6], nb_logical_blocks);
+        req->cdb[10] = req->group_number;
+        req->cdb[11] = 0;
+        io_header->cmd_len = 12;
+    } else {
+        /* 16-byte CDB: 64-bit LBA, 32-bit block count */
+        req->cdb[0] = (req->cmd & 0x1f) | 0x80;
+        req->cdb[1] = req->cdb1;
+        stq_be_p(&req->cdb[2], lba);
+        stl_be_p(&req->cdb[10], nb_logical_blocks);
+        req->cdb[14] = req->group_number;
+        req->cdb[15] = 0;
+        io_header->cmd_len = 16;
+    }
+
+    /* The rest is as in scsi-generic.c.  */
+    io_header->mx_sb_len = sizeof(r->req.sense);
+    io_header->sbp = r->req.sense;
+    io_header->timeout = UINT_MAX;
+    io_header->usr_ptr = r;
+    io_header->flags |= SG_FLAG_DIRECT_IO;
+
+    aiocb = blk_aio_ioctl(s->qdev.conf.blk, SG_IO, io_header, cb, opaque);
+    assert(aiocb != NULL);
+    return aiocb;
+}
+
+static bool scsi_block_no_fua(SCSICommand *cmd)
+{
+    return false;   /* need_fua_emulation hook: never ask for emulation */
+}
+
+static BlockAIOCB *scsi_block_dma_readv(int64_t offset,
+                                        QEMUIOVector *iov,
+                                        BlockCompletionFunc *cb, void *cb_opaque,
+                                        void *opaque)
+{
+    SCSIBlockReq *breq = opaque;        /* opaque is our SCSIBlockReq */
+    const int dir = SG_DXFER_FROM_DEV;  /* device -> guest memory */
+    return scsi_block_do_sgio(breq, offset, iov, dir, cb, cb_opaque);
+}
+
+static BlockAIOCB *scsi_block_dma_writev(int64_t offset,
+                                         QEMUIOVector *iov,
+                                         BlockCompletionFunc *cb, void *cb_opaque,
+                                         void *opaque)
+{
+    SCSIBlockReq *breq = opaque;        /* opaque is our SCSIBlockReq */
+    const int dir = SG_DXFER_TO_DEV;    /* guest memory -> device */
+    return scsi_block_do_sgio(breq, offset, iov, dir, cb, cb_opaque);
+}
+
 static bool scsi_block_is_passthrough(SCSIDiskState *s, uint8_t *buf)
 {
     switch (buf[0]) {
+    case VERIFY_10:
+    case VERIFY_12:
+    case VERIFY_16:
+        /* Check if BYTCHK == 0x01 (data-out buffer contains data
+         * for the number of logical blocks specified in the length
+         * field).  For other modes, do not use scatter/gather operation.
+         */
+        if ((buf[1] & 6) != 2) {
+            return false;
+        }
+        break;
+
     case READ_6:
     case READ_10:
     case READ_12:
     case READ_16:
-    case VERIFY_10:
-    case VERIFY_12:
-    case VERIFY_16:
     case WRITE_6:
     case WRITE_10:
     case WRITE_12:
@@ -2593,21 +2708,8 @@ static bool scsi_block_is_passthrough(SCSIDiskState *s, uint8_t *buf)
     case WRITE_VERIFY_10:
     case WRITE_VERIFY_12:
     case WRITE_VERIFY_16:
-        /* If we are not using O_DIRECT, we might read stale data from the
-         * host cache if writes were made using other commands than these
-         * ones (such as WRITE SAME or EXTENDED COPY, etc.).  So, without
-         * O_DIRECT everything must go through SG_IO.
-         */
-        if (!(blk_get_flags(s->qdev.conf.blk) & BDRV_O_NOCACHE)) {
-            break;
-        }
-
-        /* MMC writing cannot be done via pread/pwrite, because it sometimes
+        /* MMC writing cannot be done via DMA helpers, because it sometimes
          * involves writing beyond the maximum LBA or to negative LBA (lead-in).
-         * And once you do these writes, reading from the block device is
-         * unreliable, too.  It is even possible that reads deliver random data
-         * from the host page cache (this is probably a Linux bug).
-         *
          * We might use scsi_disk_dma_reqops as long as no writing commands are
          * seen, but performance usually isn't paramount on optical media.  So,
          * just make scsi-block operate the same as scsi-generic for them.
@@ -2625,6 +2727,64 @@ static bool scsi_block_is_passthrough(SCSIDiskState *s, uint8_t *buf)
 }
 
 
+/*
+ * send_command hook of scsi_block_dma_reqops.
+ *
+ * Saves the opcode, CDB byte 1 and group number of the guest's CDB in the
+ * SCSIBlockReq so that scsi_block_do_sgio() can rebuild the CDB later, then
+ * delegates to scsi_disk_dma_command().  Returns 0 after raising a check
+ * condition if any of the top three bits of CDB byte 1 is set.
+ */
+static int32_t scsi_block_dma_command(SCSIRequest *req, uint8_t *buf)
+{
+    SCSIBlockReq *r = (SCSIBlockReq *)req;
+    r->cmd = req->cmd.buf[0];
+    /* The group code (top three opcode bits) selects the CDB layout.  */
+    switch (r->cmd >> 5) {
+    case 0:
+        /* 6-byte CDB.  */
+        r->cdb1 = r->group_number = 0;
+        break;
+    case 1:
+        /* 10-byte CDB.  */
+        r->cdb1 = req->cmd.buf[1];
+        r->group_number = req->cmd.buf[6];
+        break;
+    case 4:
+        /* 16-byte CDB (opcodes 0x80-0x9f).  */
+        r->cdb1 = req->cmd.buf[1];
+        r->group_number = req->cmd.buf[14];
+        break;
+    case 5:
+        /* 12-byte CDB (opcodes 0xa0-0xbf).  */
+        r->cdb1 = req->cmd.buf[1];
+        r->group_number = req->cmd.buf[10];
+        break;
+    default:
+        abort();
+    }
+
+    if (r->cdb1 & 0xe0) {
+        /* Protection information is not supported.  */
+        scsi_check_condition(&r->req, SENSE_CODE(INVALID_FIELD));
+        return 0;
+    }
+
+    r->req.status = &r->io_header.status;
+    return scsi_disk_dma_command(req, buf);
+}
+
+static const SCSIReqOps scsi_block_dma_reqops = {
+    .size         = sizeof(SCSIBlockReq),   /* room for the SG_IO state */
+    .free_req     = scsi_free_request,
+    .send_command = scsi_block_dma_command, /* records CDB fields first */
+    .read_data    = scsi_read_data,
+    .write_data   = scsi_write_data,
+    .get_buf      = scsi_get_buf,
+    .load_request = scsi_disk_load_request,
+    .save_request = scsi_disk_save_request,
+};
+
 static SCSIRequest *scsi_block_new_request(SCSIDevice *d, uint32_t tag,
                                            uint32_t lun, uint8_t *buf,
                                            void *hba_private)
@@ -2635,7 +2785,7 @@ static SCSIRequest *scsi_block_new_request(SCSIDevice *d, uint32_t tag,
         return scsi_req_alloc(&scsi_generic_req_ops, &s->qdev, tag, lun,
                               hba_private);
     } else {
-        return scsi_req_alloc(&scsi_disk_dma_reqops, &s->qdev, tag, lun,
+        return scsi_req_alloc(&scsi_block_dma_reqops, &s->qdev, tag, lun,
                               hba_private);
     }
 }
@@ -2654,6 +2804,46 @@ static int scsi_block_parse_cdb(SCSIDevice *d, SCSICommand *cmd,
 
 #endif
 
+static
+BlockAIOCB *scsi_dma_readv(int64_t offset, QEMUIOVector *iov,
+                           BlockCompletionFunc *cb, void *cb_opaque,
+                           void *opaque)
+{
+    SCSIDiskReq *dreq = opaque;     /* opaque is the SCSIDiskReq */
+    SCSIDiskState *disk = DO_UPCAST(SCSIDiskState, qdev, dreq->req.dev);
+    return blk_aio_preadv(disk->qdev.conf.blk, offset, iov, 0, cb, cb_opaque);
+}
+
+static
+BlockAIOCB *scsi_dma_writev(int64_t offset, QEMUIOVector *iov,
+                            BlockCompletionFunc *cb, void *cb_opaque,
+                            void *opaque)
+{
+    SCSIDiskReq *dreq = opaque;     /* opaque is the SCSIDiskReq */
+    SCSIDiskState *disk = DO_UPCAST(SCSIDiskState, qdev, dreq->req.dev);
+    return blk_aio_pwritev(disk->qdev.conf.blk, offset, iov, 0, cb, cb_opaque);
+}
+
+static void scsi_disk_base_class_initfn(ObjectClass *klass, void *data)
+{
+    SCSIDiskClass *disk_class = SCSI_DISK_BASE_CLASS(klass);
+    DeviceClass *dev_class = DEVICE_CLASS(klass);
+
+    disk_class->need_fua_emulation = scsi_is_cmd_fua;
+    disk_class->dma_writev = scsi_dma_writev;
+    disk_class->dma_readv = scsi_dma_readv;
+    dev_class->reset = scsi_disk_reset;
+    dev_class->fw_name = "disk";
+}
+
+static const TypeInfo scsi_disk_base_info = {
+    .name          = TYPE_SCSI_DISK_BASE,
+    .parent        = TYPE_SCSI_DEVICE,
+    .instance_size = sizeof(SCSIDiskState),
+    .class_size    = sizeof(SCSIDiskClass),
+    .class_init    = scsi_disk_base_class_initfn,
+};
+
 #define DEFINE_SCSI_DISK_PROPERTIES()                                \
     DEFINE_BLOCK_PROPERTIES(SCSIDiskState, qdev.conf),               \
     DEFINE_PROP_STRING("ver", SCSIDiskState, version),               \
@@ -2701,17 +2891,14 @@ static void scsi_hd_class_initfn(ObjectClass *klass, void *data)
     sc->realize      = scsi_hd_realize;
     sc->alloc_req    = scsi_new_request;
     sc->unit_attention_reported = scsi_disk_unit_attention_reported;
-    dc->fw_name = "disk";
     dc->desc = "virtual SCSI disk";
-    dc->reset = scsi_disk_reset;
     dc->props = scsi_hd_properties;
     dc->vmsd  = &vmstate_scsi_disk_state;
 }
 
 static const TypeInfo scsi_hd_info = {
     .name          = "scsi-hd",
-    .parent        = TYPE_SCSI_DEVICE,
-    .instance_size = sizeof(SCSIDiskState),
+    .parent        = TYPE_SCSI_DISK_BASE,
     .class_init    = scsi_hd_class_initfn,
 };
 
@@ -2733,17 +2920,14 @@ static void scsi_cd_class_initfn(ObjectClass *klass, void *data)
     sc->realize      = scsi_cd_realize;
     sc->alloc_req    = scsi_new_request;
     sc->unit_attention_reported = scsi_disk_unit_attention_reported;
-    dc->fw_name = "disk";
     dc->desc = "virtual SCSI CD-ROM";
-    dc->reset = scsi_disk_reset;
     dc->props = scsi_cd_properties;
     dc->vmsd  = &vmstate_scsi_disk_state;
 }
 
 static const TypeInfo scsi_cd_info = {
     .name          = "scsi-cd",
-    .parent        = TYPE_SCSI_DEVICE,
-    .instance_size = sizeof(SCSIDiskState),
+    .parent        = TYPE_SCSI_DISK_BASE,
     .class_init    = scsi_cd_class_initfn,
 };
 
@@ -2757,21 +2941,22 @@ static void scsi_block_class_initfn(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
     SCSIDeviceClass *sc = SCSI_DEVICE_CLASS(klass);
+    SCSIDiskClass *sdc = SCSI_DISK_BASE_CLASS(klass);
 
     sc->realize      = scsi_block_realize;
     sc->alloc_req    = scsi_block_new_request;
     sc->parse_cdb    = scsi_block_parse_cdb;
-    dc->fw_name = "disk";
+    sdc->dma_readv   = scsi_block_dma_readv;
+    sdc->dma_writev  = scsi_block_dma_writev;
+    sdc->need_fua_emulation = scsi_block_no_fua;
     dc->desc = "SCSI block device passthrough";
-    dc->reset = scsi_disk_reset;
     dc->props = scsi_block_properties;
     dc->vmsd  = &vmstate_scsi_disk_state;
 }
 
 static const TypeInfo scsi_block_info = {
     .name          = "scsi-block",
-    .parent        = TYPE_SCSI_DEVICE,
-    .instance_size = sizeof(SCSIDiskState),
+    .parent        = TYPE_SCSI_DISK_BASE,
     .class_init    = scsi_block_class_initfn,
 };
 #endif
@@ -2809,13 +2994,13 @@ static void scsi_disk_class_initfn(ObjectClass *klass, void *data)
 
 static const TypeInfo scsi_disk_info = {
     .name          = "scsi-disk",
-    .parent        = TYPE_SCSI_DEVICE,
-    .instance_size = sizeof(SCSIDiskState),
+    .parent        = TYPE_SCSI_DISK_BASE,
     .class_init    = scsi_disk_class_initfn,
 };
 
 static void scsi_disk_register_types(void)
 {
+    type_register_static(&scsi_disk_base_info);
     type_register_static(&scsi_hd_info);
     type_register_static(&scsi_cd_info);
 #ifdef __linux__
diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c
index 7459465f60..71372a8383 100644
--- a/hw/scsi/scsi-generic.c
+++ b/hw/scsi/scsi-generic.c
@@ -222,6 +222,18 @@ static void scsi_read_complete(void * opaque, int ret)
             r->buf[3] |= 0x80;
         }
     }
+    if (s->type == TYPE_DISK &&
+        r->req.cmd.buf[0] == INQUIRY &&
+        r->req.cmd.buf[2] == 0xb0) {
+        uint32_t max_xfer_len = blk_get_max_transfer_length(s->conf.blk);
+        if (max_xfer_len) {
+            stl_be_p(&r->buf[8], max_xfer_len);
+            /* Also take care of the opt xfer len. */
+            if (ldl_be_p(&r->buf[12]) > max_xfer_len) {
+                stl_be_p(&r->buf[12], max_xfer_len);
+            }
+        }
+    }
     scsi_req_data(&r->req, len);
     scsi_req_unref(&r->req);
 }
diff --git a/hw/scsi/vmw_pvscsi.c b/hw/scsi/vmw_pvscsi.c
index f67b5bf7d3..2d7528d1dd 100644
--- a/hw/scsi/vmw_pvscsi.c
+++ b/hw/scsi/vmw_pvscsi.c
@@ -153,7 +153,7 @@ pvscsi_log2(uint32_t input)
     return log;
 }
 
-static void
+static int
 pvscsi_ring_init_data(PVSCSIRingInfo *m, PVSCSICmdDescSetupRings *ri)
 {
     int i;
@@ -161,6 +161,10 @@ pvscsi_ring_init_data(PVSCSIRingInfo *m, PVSCSICmdDescSetupRings *ri)
     uint32_t req_ring_size, cmp_ring_size;
     m->rs_pa = ri->ringsStatePPN << VMW_PAGE_SHIFT;
 
+    if ((ri->reqRingNumPages > PVSCSI_SETUP_RINGS_MAX_NUM_PAGES)
+        || (ri->cmpRingNumPages > PVSCSI_SETUP_RINGS_MAX_NUM_PAGES)) {
+        return -1;
+    }
     req_ring_size = ri->reqRingNumPages * PVSCSI_MAX_NUM_REQ_ENTRIES_PER_PAGE;
     cmp_ring_size = ri->cmpRingNumPages * PVSCSI_MAX_NUM_CMP_ENTRIES_PER_PAGE;
     txr_len_log2 = pvscsi_log2(req_ring_size - 1);
@@ -192,15 +196,20 @@ pvscsi_ring_init_data(PVSCSIRingInfo *m, PVSCSICmdDescSetupRings *ri)
 
     /* Flush ring state page changes */
     smp_wmb();
+
+    return 0;
 }
 
-static void
+static int
 pvscsi_ring_init_msg(PVSCSIRingInfo *m, PVSCSICmdDescSetupMsgRing *ri)
 {
     int i;
     uint32_t len_log2;
     uint32_t ring_size;
 
+    if (ri->numPages > PVSCSI_SETUP_MSG_RING_MAX_NUM_PAGES) {
+        return -1;
+    }
     ring_size = ri->numPages * PVSCSI_MAX_NUM_MSG_ENTRIES_PER_PAGE;
     len_log2 = pvscsi_log2(ring_size - 1);
 
@@ -220,6 +229,8 @@ pvscsi_ring_init_msg(PVSCSIRingInfo *m, PVSCSICmdDescSetupMsgRing *ri)
 
     /* Flush ring state page changes */
     smp_wmb();
+
+    return 0;
 }
 
 static void
@@ -770,7 +781,10 @@ pvscsi_on_cmd_setup_rings(PVSCSIState *s)
     trace_pvscsi_on_cmd_arrived("PVSCSI_CMD_SETUP_RINGS");
 
     pvscsi_dbg_dump_tx_rings_config(rc);
-    pvscsi_ring_init_data(&s->rings, rc);
+    if (pvscsi_ring_init_data(&s->rings, rc) < 0) {
+        return PVSCSI_COMMAND_PROCESSING_FAILED;
+    }
+
     s->rings_info_valid = TRUE;
     return PVSCSI_COMMAND_PROCESSING_SUCCEEDED;
 }
@@ -850,7 +864,9 @@ pvscsi_on_cmd_setup_msg_ring(PVSCSIState *s)
     }
 
     if (s->rings_info_valid) {
-        pvscsi_ring_init_msg(&s->rings, rc);
+        if (pvscsi_ring_init_msg(&s->rings, rc) < 0) {
+            return PVSCSI_COMMAND_PROCESSING_FAILED;
+        }
         s->msg_ring_info_valid = TRUE;
     }
     return sizeof(PVSCSICmdDescSetupMsgRing) / sizeof(uint32_t);
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 88154a1f03..e51ed3a348 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -260,14 +260,20 @@ static void vfio_iommu_map_notify(Notifier *n, void *data)
     VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
     VFIOContainer *container = giommu->container;
     IOMMUTLBEntry *iotlb = data;
+    hwaddr iova = iotlb->iova + giommu->iommu_offset;
     MemoryRegion *mr;
     hwaddr xlat;
     hwaddr len = iotlb->addr_mask + 1;
     void *vaddr;
     int ret;
 
-    trace_vfio_iommu_map_notify(iotlb->iova,
-                                iotlb->iova + iotlb->addr_mask);
+    trace_vfio_iommu_map_notify(iova, iova + iotlb->addr_mask);
+
+    if (iotlb->target_as != &address_space_memory) {
+        error_report("Wrong target AS \"%s\", only system memory is allowed",
+                     iotlb->target_as->name ? iotlb->target_as->name : "none");
+        return;
+    }
 
     /*
      * The IOMMU TLB entry we have just covers translation through
@@ -294,21 +300,21 @@ static void vfio_iommu_map_notify(Notifier *n, void *data)
 
     if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
         vaddr = memory_region_get_ram_ptr(mr) + xlat;
-        ret = vfio_dma_map(container, iotlb->iova,
+        ret = vfio_dma_map(container, iova,
                            iotlb->addr_mask + 1, vaddr,
                            !(iotlb->perm & IOMMU_WO) || mr->readonly);
         if (ret) {
             error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
                          "0x%"HWADDR_PRIx", %p) = %d (%m)",
-                         container, iotlb->iova,
+                         container, iova,
                          iotlb->addr_mask + 1, vaddr, ret);
         }
     } else {
-        ret = vfio_dma_unmap(container, iotlb->iova, iotlb->addr_mask + 1);
+        ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1);
         if (ret) {
             error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                          "0x%"HWADDR_PRIx") = %d (%m)",
-                         container, iotlb->iova,
+                         container, iova,
                          iotlb->addr_mask + 1, ret);
         }
     }
@@ -380,6 +386,8 @@ static void vfio_listener_region_add(MemoryListener *listener,
          */
         giommu = g_malloc0(sizeof(*giommu));
         giommu->iommu = section->mr;
+        giommu->iommu_offset = section->offset_within_address_space -
+                               section->offset_within_region;
         giommu->container = container;
         giommu->n.notify = vfio_iommu_map_notify;
         QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
@@ -433,6 +441,7 @@ static void vfio_listener_region_del(MemoryListener *listener,
 {
     VFIOContainer *container = container_of(listener, VFIOContainer, listener);
     hwaddr iova, end;
+    Int128 llend, llsize;
     int ret;
 
     if (vfio_listener_skipped_section(section)) {
@@ -471,21 +480,25 @@ static void vfio_listener_region_del(MemoryListener *listener,
     }
 
     iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
-    end = (section->offset_within_address_space + int128_get64(section->size)) &
-          TARGET_PAGE_MASK;
+    llend = int128_make64(section->offset_within_address_space);
+    llend = int128_add(llend, section->size);
+    llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
 
-    if (iova >= end) {
+    if (int128_ge(int128_make64(iova), llend)) {
         return;
     }
+    end = int128_get64(int128_sub(llend, int128_one()));
+
+    llsize = int128_sub(llend, int128_make64(iova));
 
-    trace_vfio_listener_region_del(iova, end - 1);
+    trace_vfio_listener_region_del(iova, end);
 
-    ret = vfio_dma_unmap(container, iova, end - iova);
+    ret = vfio_dma_unmap(container, iova, int128_get64(llsize));
     memory_region_unref(section->mr);
     if (ret) {
         error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                      "0x%"HWADDR_PRIx") = %d (%m)",
-                     container, iova, end - iova, ret);
+                     container, iova, int128_get64(llsize), ret);
     }
 }
 
@@ -499,6 +512,54 @@ static void vfio_listener_release(VFIOContainer *container)
     memory_listener_unregister(&container->listener);
 }
 
+static struct vfio_info_cap_header *
+vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id)
+{
+    struct vfio_info_cap_header *cap;
+    void *base = info;
+
+    if ((info->flags & VFIO_REGION_INFO_FLAG_CAPS) == 0) {
+        return NULL;
+    }
+
+    /* Offsets are relative to info; an offset of 0 ends the chain.  */
+    for (cap = base + info->cap_offset; cap != base; cap = base + cap->next) {
+        if (cap->id == id) {
+            return cap;
+        }
+    }
+    return NULL;
+}
+
+static void vfio_setup_region_sparse_mmaps(VFIORegion *region,
+                                           struct vfio_region_info *info)
+{
+    struct vfio_region_info_cap_sparse_mmap *sparse;
+    struct vfio_info_cap_header *cap;
+    int idx;
+
+    /* Without a sparse-mmap capability, leave the region untouched.  */
+    cap = vfio_get_region_info_cap(info, VFIO_REGION_INFO_CAP_SPARSE_MMAP);
+    if (cap == NULL) {
+        return;
+    }
+    sparse = container_of(cap, struct vfio_region_info_cap_sparse_mmap, header);
+
+    trace_vfio_region_sparse_mmap_header(region->vbasedev->name,
+                                         region->nr, sparse->nr_areas);
+
+    region->nr_mmaps = sparse->nr_areas;
+    region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
+
+    for (idx = 0; idx < region->nr_mmaps; idx++) {
+        VFIOMmap *area = &region->mmaps[idx];
+        area->offset = sparse->areas[idx].offset;
+        area->size = sparse->areas[idx].size;
+        trace_vfio_region_sparse_mmap_entry(idx, area->offset,
+                                            area->offset + area->size);
+    }
+}
+
 int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
                       int index, const char *name)
 {
@@ -525,11 +586,14 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
             region->flags & VFIO_REGION_INFO_FLAG_MMAP &&
             !(region->size & ~qemu_real_host_page_mask)) {
 
-            region->nr_mmaps = 1;
-            region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
+            vfio_setup_region_sparse_mmaps(region, info);
 
-            region->mmaps[0].offset = 0;
-            region->mmaps[0].size = region->size;
+            if (!region->nr_mmaps) {
+                region->nr_mmaps = 1;
+                region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
+                region->mmaps[0].offset = 0;
+                region->mmaps[0].size = region->size;
+            }
         }
     }
 
@@ -1089,16 +1153,60 @@ int vfio_get_region_info(VFIODevice *vbasedev, int index,
     *info = g_malloc0(argsz);
 
     (*info)->index = index;
+retry:
     (*info)->argsz = argsz;
 
     if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) {
         g_free(*info);
+        *info = NULL;
         return -errno;
     }
 
+    if ((*info)->argsz > argsz) {
+        argsz = (*info)->argsz;
+        *info = g_realloc(*info, argsz);
+
+        goto retry;
+    }
+
     return 0;
 }
 
+int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type,
+                             uint32_t subtype, struct vfio_region_info **info)
+{
+    int idx;
+
+    for (idx = 0; idx < vbasedev->num_regions; idx++) {
+        struct vfio_region_info_cap_type *tcap;
+        struct vfio_info_cap_header *cap;
+
+        if (vfio_get_region_info(vbasedev, idx, info) != 0) {
+            continue;
+        }
+
+        cap = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE);
+        if (cap == NULL) {
+            g_free(*info);
+            continue;
+        }
+        tcap = container_of(cap, struct vfio_region_info_cap_type, header);
+
+        trace_vfio_get_dev_region(vbasedev->name, idx,
+                                  tcap->type, tcap->subtype);
+
+        if (tcap->type == type && tcap->subtype == subtype) {
+            return 0;
+        }
+
+        g_free(*info);
+    }
+
+    /* No region matched the requested type/subtype pair.  */
+    *info = NULL;
+    return -ENODEV;
+}
+
 /*
  * Interfaces for IBM EEH (Enhanced Error Handling)
  */
diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c
index 49ecf1172a..35d32b78f4 100644
--- a/hw/vfio/pci-quirks.c
+++ b/hw/vfio/pci-quirks.c
@@ -11,9 +11,12 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/error-report.h"
+#include "qemu/range.h"
+#include "qapi/error.h"
+#include "hw/nvram/fw_cfg.h"
 #include "pci.h"
 #include "trace.h"
-#include "qemu/range.h"
 
 /* Use uin32_t for vendor & device so PCI_ANY_ID expands and cannot match hw */
 static bool vfio_pci_is(VFIOPCIDevice *vdev, uint32_t vendor, uint32_t device)
@@ -962,6 +965,643 @@ static void vfio_probe_rtl8168_bar2_quirk(VFIOPCIDevice *vdev, int nr)
 }
 
 /*
+ * Intel IGD support
+ *
+ * Obviously IGD is not a discrete device, this is evidenced not only by it
+ * being integrated into the CPU, but by the various chipset and BIOS
+ * dependencies that it brings along with it.  Intel is trying to move away
+ * from this and Broadwell and newer devices can run in what Intel calls
+ * "Universal Pass-Through" mode, or UPT.  Theoretically in UPT mode, nothing
+ * more is required beyond assigning the IGD device to a VM.  There are
+ * however support limitations to this mode.  It only supports IGD as a
+ * secondary graphics device in the VM and it doesn't officially support any
+ * physical outputs.
+ *
+ * The code here attempts to enable what we'll call legacy mode assignment,
+ * IGD retains most of the capabilities we expect for it to have on bare
+ * metal.  To enable this mode, the IGD device must be assigned to the VM
+ * at PCI address 00:02.0, it must have a ROM, it very likely needs VGA
+ * support, we must have VM BIOS support for reserving and populating some
+ * of the required tables, and we need to tweak the chipset with revisions
+ * and IDs and an LPC/ISA bridge device.  The intention is to make all of
+ * this happen automatically by installing the device at the correct VM PCI
+ * bus address.  If any of the conditions are not met, we cross our fingers
+ * and hope the user knows better.
+ *
+ * NB - It is possible to enable physical outputs in UPT mode by supplying
+ * an OpRegion table.  We don't do this by default because the guest driver
+ * behaves differently if an OpRegion is provided and no monitor is attached
+ * vs no OpRegion and a monitor being attached or not.  Effectively, if a
+ * headless setup is desired, the OpRegion gets in the way of that.
+ */
+
+/*
+ * This presumes the device is already known to be an Intel VGA device, so we
+ * take liberties in which device ID bits match which generation.  This should
+ * not be taken as an indication that all the devices are supported, or even
+ * supportable, some of them don't even support VT-d.
+ * See linux:include/drm/i915_pciids.h for IDs.
+ */
+static int igd_gen(VFIOPCIDevice *vdev)
+{
+    int family = vdev->device_id & 0xff00;
+
+    if ((vdev->device_id & 0xfff) == 0xa84) {
+        return 8; /* Broxton */
+    }
+    switch (family) {
+    /* SandyBridge, IvyBridge, ValleyView, Haswell */
+    case 0x0100:
+    case 0x0400:
+    case 0x0a00:
+    case 0x0c00:
+    case 0x0d00:
+    case 0x0f00:
+        return 6;
+    /* Old, untested, unavailable, unknown */
+    case 0x0000:
+    case 0x2500:
+    case 0x2700:
+    case 0x2900:
+    case 0x2a00:
+    case 0x2e00:
+    case 0x3500:
+    case 0xa000:
+        return -1;
+    /* BroadWell, CherryView, SkyLake, KabyLake */
+    case 0x1600:
+    case 0x1900:
+    case 0x2200:
+    case 0x5900:
+    default:
+        return 8; /* listed above, or unlisted and assumed compatible */
+    }
+}
+
+typedef struct VFIOIGDQuirk {
+    struct VFIOPCIDevice *vdev; /* device the quirk state belongs to */
+    uint32_t index; /* last index register write, ~0 once consumed/reset */
+} VFIOIGDQuirk;
+
+#define IGD_GMCH 0x50 /* Graphics Control Register */
+#define IGD_BDSM 0x5c /* Base Data of Stolen Memory */
+#define IGD_ASLS 0xfc /* ASL Storage Register */
+
+/*
+ * The OpRegion includes the Video BIOS Table, which seems important for
+ * telling the driver what sort of outputs it has.  Without this, the device
+ * may work in the guest, but we may not get output.  This also requires BIOS
+ * support to reserve and populate a section of guest memory sufficient for
+ * the table and to write the base address of that memory to the ASLS register
+ * of the IGD device.
+ */
+int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev,
+                               struct vfio_region_info *info)
+{
+    int ret; /* byte count returned by pread() */
+
+    vdev->igd_opregion = g_malloc0(info->size);
+    ret = pread(vdev->vbasedev.fd, vdev->igd_opregion,
+                info->size, info->offset);
+    if (ret != info->size) { /* error or short read: drop the buffer */
+        error_report("vfio: Error reading IGD OpRegion");
+        g_free(vdev->igd_opregion);
+        vdev->igd_opregion = NULL;
+        return -EINVAL;
+    }
+
+    /*
+     * Provide fw_cfg with a copy of the OpRegion which the VM firmware is to
+     * allocate 32bit reserved memory for, copy these contents into, and write
+     * the reserved memory base address to the device ASLS register at 0xFC.
+     * Alignment of this reserved region seems flexible, but using a 4k page
+     * alignment seems to work well.  This interface assumes a single IGD
+     * device, which may be at VM address 00:02.0 in legacy mode or another
+     * address in UPT mode.
+     *
+     * NB, there may be future use cases discovered where the VM should have
+     * direct interaction with the host OpRegion, in which case the write to
+     * the ASLS register would trigger MemoryRegion setup to enable that.
+     */
+    fw_cfg_add_file(fw_cfg_find(), "etc/igd-opregion",
+                    vdev->igd_opregion, info->size);
+
+    trace_vfio_pci_igd_opregion_enabled(vdev->vbasedev.name);
+
+    pci_set_long(vdev->pdev.config + IGD_ASLS, 0); /* ASLS starts at zero */
+    pci_set_long(vdev->pdev.wmask + IGD_ASLS, ~0); /* read-write */
+    pci_set_long(vdev->emulated_config_bits + IGD_ASLS, ~0); /* emulated */
+
+    return 0;
+}
+
+/*
+ * The rather short list of registers that we copy from the host devices.
+ * The LPC/ISA bridge values are definitely needed to support the vBIOS, the
+ * host bridge values may or may not be needed depending on the guest OS.
+ * Since we're only munging revision and subsystem values on the host bridge,
+ * we don't require our own device.  The LPC/ISA bridge needs to be our very
+ * own though.
+ */
+typedef struct {
+    uint8_t offset; /* byte offset of the field in PCI config space */
+    uint8_t len;    /* number of bytes to copy from that offset */
+} IGDHostInfo;
+
+static const IGDHostInfo igd_host_bridge_infos[] = { /* for VM 00:00.0 */
+    {PCI_REVISION_ID,         2},
+    {PCI_SUBSYSTEM_VENDOR_ID, 2},
+    {PCI_SUBSYSTEM_ID,        2},
+};
+
+static const IGDHostInfo igd_lpc_bridge_infos[] = { /* for VM 00:1f.0 */
+    {PCI_VENDOR_ID,           2},
+    {PCI_DEVICE_ID,           2},
+    {PCI_REVISION_ID,         2},
+    {PCI_SUBSYSTEM_VENDOR_ID, 2},
+    {PCI_SUBSYSTEM_ID,        2},
+};
+
+/* Copy each listed config field from the vfio region into pdev's config. */
+static int vfio_pci_igd_copy(VFIOPCIDevice *vdev, PCIDevice *pdev,
+                             struct vfio_region_info *info,
+                             const IGDHostInfo *list, int len)
+{
+    int i, ret;
+
+    for (i = 0; i < len; i++) {
+        ret = pread(vdev->vbasedev.fd, pdev->config + list[i].offset,
+                    list[i].len, info->offset + list[i].offset);
+        if (ret != list[i].len) {
+            ret = ret < 0 ? -errno : -EINVAL; /* short read: errno is stale */
+            error_report("IGD copy failed: %s", strerror(-ret));
+            return ret;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Stuff a few values into the host bridge.
+ */
+static int vfio_pci_igd_host_init(VFIOPCIDevice *vdev,
+                                  struct vfio_region_info *info)
+{
+    PCIDevice *bridge = pci_find_device(pci_device_root_bus(&vdev->pdev),
+                                        0, PCI_DEVFN(0, 0));
+    int rc;
+
+    /* The host bridge is looked up at 00:00.0 on the root bus */
+    if (bridge == NULL) {
+        error_report("Can't find host bridge");
+        return -ENODEV;
+    }
+
+    rc = vfio_pci_igd_copy(vdev, bridge, info, igd_host_bridge_infos,
+                           ARRAY_SIZE(igd_host_bridge_infos));
+    if (rc != 0) {
+        return rc;
+    }
+
+    /* Only trace once every listed field was copied */
+    trace_vfio_pci_igd_host_bridge_enabled(vdev->vbasedev.name);
+    return 0;
+}
+
+/*
+ * IGD LPC/ISA bridge support code.  The vBIOS needs this, but we can't write
+ * arbitrary values into just any bridge, so we must create our own.  We try
+ * to handle if the user has created it for us, which they might want to do
+ * to enable multifunction so we don't occupy the whole PCI slot.
+ */
+static void vfio_pci_igd_lpc_bridge_realize(PCIDevice *pdev, Error **errp)
+{
+    if (pdev->devfn != PCI_DEVFN(0x1f, 0)) { /* anything but slot 1f, fn 0 */
+        error_setg(errp, "VFIO dummy ISA/LPC bridge must have address 1f.0");
+    }
+}
+
+static void vfio_pci_igd_lpc_bridge_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
+
+    dc->desc = "VFIO dummy ISA/LPC bridge for IGD assignment";
+    dc->hotpluggable = false; /* not offered for hotplug */
+    k->realize = vfio_pci_igd_lpc_bridge_realize; /* enforces address 1f.0 */
+    k->class_id = PCI_CLASS_BRIDGE_ISA; /* PCI class code of an ISA bridge */
+}
+
+static TypeInfo vfio_pci_igd_lpc_bridge_info = { /* type of the dummy bridge */
+    .name = "vfio-pci-igd-lpc-bridge", /* name used to create/identify it */
+    .parent = TYPE_PCI_DEVICE,
+    .class_init = vfio_pci_igd_lpc_bridge_class_init,
+};
+
+static void vfio_pci_igd_register_types(void)
+{
+    type_register_static(&vfio_pci_igd_lpc_bridge_info); /* dummy bridge */
+}
+
+type_init(vfio_pci_igd_register_types)
+
+static int vfio_pci_igd_lpc_init(VFIOPCIDevice *vdev,
+                                 struct vfio_region_info *info)
+{
+    PCIBus *root = pci_device_root_bus(&vdev->pdev);
+    PCIDevice *bridge = pci_find_device(root, 0, PCI_DEVFN(0x1f, 0));
+    int rc;
+
+    /* Reuse whatever is at 1f.0, otherwise create the dummy bridge there */
+    if (bridge == NULL) {
+        bridge = pci_create_simple(root, PCI_DEVFN(0x1f, 0),
+                                   "vfio-pci-igd-lpc-bridge");
+    }
+
+    rc = vfio_pci_igd_copy(vdev, bridge, info, igd_lpc_bridge_infos,
+                           ARRAY_SIZE(igd_lpc_bridge_infos));
+    if (rc == 0) {
+        trace_vfio_pci_igd_lpc_bridge_enabled(vdev->vbasedev.name);
+    }
+
+    return rc;
+}
+
+/*
+ * IGD Gen8 and newer support up to 8MB for the GTT and use a 64bit PTE
+ * entry, older IGDs use 2MB and 32bit.  Each PTE maps a 4k page.  Therefore
+ * we either have 2M/4k * 4 = 2k or 8M/4k * 8 = 16k as the maximum iobar index
+ * for programming the GTT.
+ *
+ * See linux:include/drm/i915_drm.h for shift and mask values.
+ */
+static int vfio_igd_gtt_max(VFIOPCIDevice *vdev)
+{
+    uint32_t gmch = vfio_pci_read_config(&vdev->pdev, IGD_GMCH, sizeof(gmch));
+    int ggms, gen = igd_gen(vdev);
+
+    /* 2-bit GGMS field: at bit 8 before Gen8, at bit 6 from Gen8 onwards */
+    ggms = (gmch >> (gen < 8 ? 8 : 6)) & 0x3;
+    if (gen > 6) {
+        ggms = 1 << ggms; /* field is then a power-of-two exponent */
+    }
+
+    ggms *= 1024 * 1024; /* MB -> bytes of GTT */
+
+    return (ggms / (4 * 1024)) * (gen < 8 ? 4 : 8); /* 4k pages * PTE bytes */
+}
+
+/*
+ * The IGD ROM will make use of stolen memory (GGMS) for support of VESA modes.
+ * Somehow the host stolen memory range is used for this, but how the ROM gets
+ * it is a mystery, perhaps it's hardcoded into the ROM.  Thankfully though, it
+ * reprograms the GTT through the IOBAR where we can trap it and transpose the
+ * programming to the VM allocated buffer.  That buffer gets reserved by the VM
+ * firmware via the fw_cfg entry added below.  Here we're just monitoring the
+ * IOBAR address and data registers to detect a write sequence targeting the
+ * GTTADR.  This code is developed by observed behavior and doesn't have a
+ * direct spec reference, unfortunately.
+ */
+static uint64_t vfio_igd_quirk_data_read(void *opaque,
+                                         hwaddr addr, unsigned size)
+{
+    VFIOIGDQuirk *quirk = opaque;
+
+    /* A data read ends any index/data write sequence being tracked */
+    quirk->index = ~0;
+
+    return vfio_region_read(&quirk->vdev->bars[4].region, addr + 4, size);
+}
+
+static void vfio_igd_quirk_data_write(void *opaque, hwaddr addr,
+                                      uint64_t data, unsigned size)
+{
+    VFIOIGDQuirk *igd = opaque;
+    VFIOPCIDevice *vdev = igd->vdev;
+    uint64_t val = data; /* forwarded as-is unless rewritten below */
+    int gen = igd_gen(vdev);
+
+    /*
+     * Programming the GGMS starts at index 0x1 and uses every 4th index (ie.
+     * 0x1, 0x5, 0x9, 0xd,...).  For pre-Gen8 each 4-byte write is a whole PTE
+     * entry, with 0th bit enable set.  For Gen8 and up, PTEs are 64bit, so
+     * entries 0x5 & 0xd are the high dword, in our case zero.  Each PTE points
+     * to a 4k page, which we translate to a page from the VM allocated region,
+     * pointed to by the BDSM register.  If this is not set, we fail.
+     *
+     * We trap writes to the full configured GTT size, but we typically only
+     * see the vBIOS writing up to (nearly) the 1MB barrier.  In fact it often
+     * seems to miss the last entry for an even 1MB GTT.  Doing a gratuitous
+     * write of that last entry does work, but is hopefully unnecessary since
+     * we clear the previous GTT on initialization.
+     */
+    if ((igd->index % 4 == 1) && igd->index < vfio_igd_gtt_max(vdev)) {
+        if (gen < 8 || (igd->index % 8 == 1)) {
+            uint32_t base;
+
+            base = pci_get_long(vdev->pdev.config + IGD_BDSM);
+            if (!base) {
+                hw_error("vfio-igd: Guest attempted to program IGD GTT before "
+                         "BIOS reserved stolen memory.  Unsupported BIOS?");
+            }
+
+            val = base | (data & ((1 << 20) - 1)); /* BDSM | low 20 bits */
+        } else {
+            val = 0; /* upper 32bits of pte, we only enable below 4G PTEs */
+        }
+
+        trace_vfio_pci_igd_bar4_write(vdev->vbasedev.name,
+                                      igd->index, data, val);
+    }
+
+    vfio_region_write(&vdev->bars[4].region, addr + 4, val, size);
+
+    igd->index = ~0; /* the latched index is used only once */
+}
+
+static const MemoryRegionOps vfio_igd_data_quirk = { /* BAR4 offset 4 */
+    .read = vfio_igd_quirk_data_read,
+    .write = vfio_igd_quirk_data_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+};
+
+static uint64_t vfio_igd_quirk_index_read(void *opaque,
+                                          hwaddr addr, unsigned size)
+{
+    VFIOIGDQuirk *quirk = opaque;
+
+    /* Reading the index register invalidates any latched index */
+    quirk->index = ~0;
+
+    return vfio_region_read(&quirk->vdev->bars[4].region, addr, size);
+}
+
+static void vfio_igd_quirk_index_write(void *opaque, hwaddr addr,
+                                       uint64_t data, unsigned size)
+{
+    VFIOIGDQuirk *quirk = opaque;
+
+    /* Latch the index so the following data write can be inspected */
+    quirk->index = data;
+
+    vfio_region_write(&quirk->vdev->bars[4].region, addr, data, size);
+}
+
+static const MemoryRegionOps vfio_igd_index_quirk = { /* BAR4 offset 0 */
+    .read = vfio_igd_quirk_index_read,
+    .write = vfio_igd_quirk_index_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+};
+
+/* On BAR4 of an Intel IGD at 00:02.0, set up legacy mode when possible. */
+static void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr)
+{
+    struct vfio_region_info *rom = NULL, *opregion = NULL,
+                            *host = NULL, *lpc = NULL;
+    VFIOQuirk *quirk;
+    VFIOIGDQuirk *igd;
+    PCIDevice *lpc_bridge;
+    int i, ret, ggms_mb, gms_mb = 0, gen;
+    uint64_t *bdsm_size;
+    uint32_t gmch;
+    uint16_t cmd_orig, cmd;
+
+    /*
+     * This must be an Intel VGA device at address 00:02.0 for us to even
+     * consider enabling legacy mode; the vBIOS depends on the PCI address.
+     */
+    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, PCI_ANY_ID) ||
+        !vfio_is_vga(vdev) || nr != 4 ||
+        &vdev->pdev != pci_find_device(pci_device_root_bus(&vdev->pdev),
+                                       0, PCI_DEVFN(0x2, 0))) {
+        return;
+    }
+
+    /*
+     * We need to create an LPC/ISA bridge at PCI bus address 00:1f.0 that we
+     * can stuff host values into, so if there's already one there and it's not
+     * one we can hack on, legacy mode is no-go.  Sorry Q35.
+     */
+    lpc_bridge = pci_find_device(pci_device_root_bus(&vdev->pdev),
+                                 0, PCI_DEVFN(0x1f, 0));
+    if (lpc_bridge && !object_dynamic_cast(OBJECT(lpc_bridge),
+                                           "vfio-pci-igd-lpc-bridge")) {
+        error_report("IGD device %s cannot support legacy mode due to existing "
+                     "devices at address 1f.0", vdev->vbasedev.name);
+        return;
+    }
+
+    /*
+     * IGD is not a standard, they like to change their specs often.  We
+     * only attempt to support back to SandyBridge and we hope that newer
+     * devices maintain compatibility with generation 8.
+     */
+    gen = igd_gen(vdev);
+    if (gen != 6 && gen != 8) {
+        error_report("IGD device %s is unsupported in legacy mode, "
+                     "try SandyBridge or newer", vdev->vbasedev.name);
+        return;
+    }
+
+    /*
+     * Most of what we're doing here is to enable the ROM to run, so if
+     * there's no ROM, there's no point in setting up this quirk.
+     * NB. We only seem to get BIOS ROMs, so a UEFI VM would need CSM support.
+     */
+    ret = vfio_get_region_info(&vdev->vbasedev,
+                               VFIO_PCI_ROM_REGION_INDEX, &rom);
+    if ((ret || !rom->size) && !vdev->pdev.romfile) {
+        error_report("IGD device %s has no ROM, legacy mode disabled",
+                     vdev->vbasedev.name);
+        goto out;
+    }
+
+    /*
+     * Ignore the hotplug corner case, mark the ROM failed, we can't
+     * create the devices we need for legacy mode in the hotplug scenario.
+     */
+    if (vdev->pdev.qdev.hotplugged) {
+        error_report("IGD device %s hotplugged, ROM disabled, "
+                     "legacy mode disabled", vdev->vbasedev.name);
+        vdev->rom_read_failed = true;
+        goto out;
+    }
+
+    /*
+     * Check whether we have all the vfio device specific regions to
+     * support legacy mode (added in Linux v4.6).  If not, bail.
+     */
+    ret = vfio_get_dev_region_info(&vdev->vbasedev,
+                        VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL,
+                        VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &opregion);
+    if (ret) {
+        error_report("IGD device %s does not support OpRegion access, "
+                     "legacy mode disabled", vdev->vbasedev.name);
+        goto out;
+    }
+
+    ret = vfio_get_dev_region_info(&vdev->vbasedev,
+                        VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL,
+                        VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG, &host);
+    if (ret) {
+        error_report("IGD device %s does not support host bridge access, "
+                     "legacy mode disabled", vdev->vbasedev.name);
+        goto out;
+    }
+
+    ret = vfio_get_dev_region_info(&vdev->vbasedev,
+                        VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL,
+                        VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG, &lpc);
+    if (ret) {
+        error_report("IGD device %s does not support LPC bridge access, "
+                     "legacy mode disabled", vdev->vbasedev.name);
+        goto out;
+    }
+
+    gmch = vfio_pci_read_config(&vdev->pdev, IGD_GMCH, 4);
+
+    /*
+     * If IGD VGA Disable is clear (expected) and VGA is not already enabled,
+     * try to enable it.  Probably shouldn't be using legacy mode without VGA,
+     * but also no point in us enabling VGA if disabled in hardware.
+     */
+    if (!(gmch & 0x2) && !vdev->vga && vfio_populate_vga(vdev)) {
+        error_report("IGD device %s failed to enable VGA access, "
+                     "legacy mode disabled", vdev->vbasedev.name);
+        goto out;
+    }
+
+    /* Create our LPC/ISA bridge */
+    ret = vfio_pci_igd_lpc_init(vdev, lpc);
+    if (ret) {
+        error_report("IGD device %s failed to create LPC bridge, "
+                     "legacy mode disabled", vdev->vbasedev.name);
+        goto out;
+    }
+
+    /* Stuff some host values into the VM PCI host bridge */
+    ret = vfio_pci_igd_host_init(vdev, host);
+    if (ret) {
+        error_report("IGD device %s failed to modify host bridge, "
+                     "legacy mode disabled", vdev->vbasedev.name);
+        goto out;
+    }
+
+    /* Setup OpRegion access */
+    ret = vfio_pci_igd_opregion_init(vdev, opregion);
+    if (ret) {
+        error_report("IGD device %s failed to setup OpRegion, "
+                     "legacy mode disabled", vdev->vbasedev.name);
+        goto out;
+    }
+
+    /* Setup our quirk to munge GTT addresses to the VM allocated buffer */
+    quirk = g_malloc0(sizeof(*quirk));
+    quirk->mem = g_new0(MemoryRegion, 2);
+    quirk->nr_mem = 2;
+    igd = quirk->data = g_malloc0(sizeof(*igd));
+    igd->vdev = vdev;
+    igd->index = ~0;
+
+    memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_igd_index_quirk,
+                          igd, "vfio-igd-index-quirk", 4);
+    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
+                                        0, &quirk->mem[0], 1);
+
+    memory_region_init_io(&quirk->mem[1], OBJECT(vdev), &vfio_igd_data_quirk,
+                          igd, "vfio-igd-data-quirk", 4);
+    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
+                                        4, &quirk->mem[1], 1);
+
+    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
+
+    /* Determine the size of stolen memory needed for GTT */
+    ggms_mb = (gmch >> (gen < 8 ? 8 : 6)) & 0x3;
+    if (gen > 6) {
+        ggms_mb = 1 << ggms_mb;
+    }
+
+    /*
+     * Assume we have no GMS memory, but allow it to be overridden by device
+     * option (experimental).  The spec doesn't actually allow zero GMS when
+     * IVD (IGD VGA Disable) is clear, but the claim is that it's unused,
+     * so let's not waste VM memory for it.
+     */
+    gmch &= ~((gen < 8 ? 0x1f : 0xff) << (gen < 8 ? 3 : 8));
+
+    if (vdev->igd_gms) {
+        if (vdev->igd_gms <= 0x10) {
+            gms_mb = vdev->igd_gms * 32;
+            gmch |= vdev->igd_gms << (gen < 8 ? 3 : 8);
+        } else {
+            error_report("Unsupported IGD GMS value 0x%x", vdev->igd_gms);
+            vdev->igd_gms = 0;
+        }
+    }
+
+    /*
+     * Request reserved memory for stolen memory via fw_cfg.  VM firmware
+     * must allocate a 1MB aligned reserved memory region below 4GB with
+     * the requested size (in bytes) for use by the Intel PCI class VGA
+     * device at VM address 00:02.0.  The base address of this reserved
+     * memory region must be written to the device BDSM register at PCI
+     * config offset 0x5C.
+     */
+    bdsm_size = g_malloc(sizeof(*bdsm_size));
+    *bdsm_size = cpu_to_le64((ggms_mb + gms_mb) * 1024 * 1024);
+    fw_cfg_add_file(fw_cfg_find(), "etc/igd-bdsm-size",
+                    bdsm_size, sizeof(*bdsm_size));
+
+    /* GMCH is read-only, emulated */
+    pci_set_long(vdev->pdev.config + IGD_GMCH, gmch);
+    pci_set_long(vdev->pdev.wmask + IGD_GMCH, 0);
+    pci_set_long(vdev->emulated_config_bits + IGD_GMCH, ~0);
+
+    /* BDSM is read-write, emulated.  The BIOS needs to be able to write it */
+    pci_set_long(vdev->pdev.config + IGD_BDSM, 0);
+    pci_set_long(vdev->pdev.wmask + IGD_BDSM, ~0);
+    pci_set_long(vdev->emulated_config_bits + IGD_BDSM, ~0);
+
+    /*
+     * This IOBAR gives us access to GTTADR, which allows us to write to
+     * the GTT itself.  So let's go ahead and write zero to all the GTT
+     * entries to avoid spurious DMA faults.  Be sure I/O access is enabled
+     * before talking to the device.
+     */
+    if (pread(vdev->vbasedev.fd, &cmd_orig, sizeof(cmd_orig),
+              vdev->config_offset + PCI_COMMAND) != sizeof(cmd_orig)) {
+        error_report("IGD device %s - failed to read PCI command register",
+                     vdev->vbasedev.name);
+    }
+
+    cmd = cmd_orig | PCI_COMMAND_IO;
+
+    if (pwrite(vdev->vbasedev.fd, &cmd, sizeof(cmd),
+               vdev->config_offset + PCI_COMMAND) != sizeof(cmd)) {
+        error_report("IGD device %s - failed to write PCI command register",
+                     vdev->vbasedev.name);
+    }
+
+    for (i = 1; i < vfio_igd_gtt_max(vdev); i += 4) {
+        vfio_region_write(&vdev->bars[4].region, 0, i, 4);
+        vfio_region_write(&vdev->bars[4].region, 4, 0, 4);
+    }
+
+    if (pwrite(vdev->vbasedev.fd, &cmd_orig, sizeof(cmd_orig),
+               vdev->config_offset + PCI_COMMAND) != sizeof(cmd_orig)) {
+        error_report("IGD device %s - failed to restore PCI command register",
+                     vdev->vbasedev.name);
+    }
+
+    trace_vfio_pci_igd_bdsm_enabled(vdev->vbasedev.name, ggms_mb + gms_mb);
+
+out:
+    g_free(rom);
+    g_free(opregion);
+    g_free(host);
+    g_free(lpc);
+}
+
+/*
  * Common quirk probe entry points.
  */
 void vfio_vga_quirk_setup(VFIOPCIDevice *vdev)
@@ -1010,6 +1650,7 @@ void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr)
     vfio_probe_nvidia_bar5_quirk(vdev, nr);
     vfio_probe_nvidia_bar0_quirk(vdev, nr);
     vfio_probe_rtl8168_bar2_quirk(vdev, nr);
+    vfio_probe_igd_bar4_quirk(vdev, nr);
 }
 
 void vfio_bar_quirk_exit(VFIOPCIDevice *vdev, int nr)
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index d091d8cf0e..deab0c601a 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1440,8 +1440,6 @@ static void vfio_bar_setup(VFIOPCIDevice *vdev, int nr)
                      vdev->vbasedev.name, nr);
     }
 
-    vfio_bar_quirk_setup(vdev, nr);
-
     pci_register_bar(&vdev->pdev, nr, type, bar->region.mem);
 }
 
@@ -1452,29 +1450,6 @@ static void vfio_bars_setup(VFIOPCIDevice *vdev)
     for (i = 0; i < PCI_ROM_SLOT; i++) {
         vfio_bar_setup(vdev, i);
     }
-
-    if (vdev->vga) {
-        memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_MEM].mem,
-                              OBJECT(vdev), &vfio_vga_ops,
-                              &vdev->vga->region[QEMU_PCI_VGA_MEM],
-                              "vfio-vga-mmio@0xa0000",
-                              QEMU_PCI_VGA_MEM_SIZE);
-        memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem,
-                              OBJECT(vdev), &vfio_vga_ops,
-                              &vdev->vga->region[QEMU_PCI_VGA_IO_LO],
-                              "vfio-vga-io@0x3b0",
-                              QEMU_PCI_VGA_IO_LO_SIZE);
-        memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
-                              OBJECT(vdev), &vfio_vga_ops,
-                              &vdev->vga->region[QEMU_PCI_VGA_IO_HI],
-                              "vfio-vga-io@0x3c0",
-                              QEMU_PCI_VGA_IO_HI_SIZE);
-
-        pci_register_vga(&vdev->pdev, &vdev->vga->region[QEMU_PCI_VGA_MEM].mem,
-                         &vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem,
-                         &vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem);
-        vfio_vga_quirk_setup(vdev);
-    }
 }
 
 static void vfio_bars_exit(VFIOPCIDevice *vdev)
@@ -2061,42 +2036,61 @@ int vfio_populate_vga(VFIOPCIDevice *vdev)
     struct vfio_region_info *reg_info;
     int ret;
 
-    if (vbasedev->num_regions > VFIO_PCI_VGA_REGION_INDEX) {
-        ret = vfio_get_region_info(vbasedev,
-                                   VFIO_PCI_VGA_REGION_INDEX, &reg_info);
-        if (ret) {
-            return ret;
-        }
+    ret = vfio_get_region_info(vbasedev, VFIO_PCI_VGA_REGION_INDEX, &reg_info);
+    if (ret) {
+        return ret;
+    }
 
-        if (!(reg_info->flags & VFIO_REGION_INFO_FLAG_READ) ||
-            !(reg_info->flags & VFIO_REGION_INFO_FLAG_WRITE) ||
-            reg_info->size < 0xbffff + 1) {
-            error_report("vfio: Unexpected VGA info, flags 0x%lx, size 0x%lx",
-                         (unsigned long)reg_info->flags,
-                         (unsigned long)reg_info->size);
-            g_free(reg_info);
-            return -EINVAL;
-        }
+    if (!(reg_info->flags & VFIO_REGION_INFO_FLAG_READ) ||
+        !(reg_info->flags & VFIO_REGION_INFO_FLAG_WRITE) ||
+        reg_info->size < 0xbffff + 1) {
+        error_report("vfio: Unexpected VGA info, flags 0x%lx, size 0x%lx",
+                     (unsigned long)reg_info->flags,
+                     (unsigned long)reg_info->size);
+        g_free(reg_info);
+        return -EINVAL;
+    }
 
-        vdev->vga = g_new0(VFIOVGA, 1);
+    vdev->vga = g_new0(VFIOVGA, 1);
 
-        vdev->vga->fd_offset = reg_info->offset;
-        vdev->vga->fd = vdev->vbasedev.fd;
+    vdev->vga->fd_offset = reg_info->offset;
+    vdev->vga->fd = vdev->vbasedev.fd;
 
-        g_free(reg_info);
+    g_free(reg_info);
 
-        vdev->vga->region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE;
-        vdev->vga->region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM;
-        QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_MEM].quirks);
+    vdev->vga->region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE;
+    vdev->vga->region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM;
+    QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_MEM].quirks);
 
-        vdev->vga->region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE;
-        vdev->vga->region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO;
-        QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].quirks);
+    memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_MEM].mem,
+                          OBJECT(vdev), &vfio_vga_ops,
+                          &vdev->vga->region[QEMU_PCI_VGA_MEM],
+                          "vfio-vga-mmio@0xa0000",
+                          QEMU_PCI_VGA_MEM_SIZE);
 
-        vdev->vga->region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE;
-        vdev->vga->region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI;
-        QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks);
-    }
+    vdev->vga->region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE;
+    vdev->vga->region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO;
+    QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].quirks);
+
+    memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem,
+                          OBJECT(vdev), &vfio_vga_ops,
+                          &vdev->vga->region[QEMU_PCI_VGA_IO_LO],
+                          "vfio-vga-io@0x3b0",
+                          QEMU_PCI_VGA_IO_LO_SIZE);
+
+    vdev->vga->region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE;
+    vdev->vga->region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI;
+    QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks);
+
+    memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
+                          OBJECT(vdev), &vfio_vga_ops,
+                          &vdev->vga->region[QEMU_PCI_VGA_IO_HI],
+                          "vfio-vga-io@0x3c0",
+                          QEMU_PCI_VGA_IO_HI_SIZE);
+
+    pci_register_vga(&vdev->pdev, &vdev->vga->region[QEMU_PCI_VGA_MEM].mem,
+                     &vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem,
+                     &vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem);
 
     return 0;
 }
@@ -2398,7 +2392,7 @@ static int vfio_initfn(PCIDevice *pdev)
     ssize_t len;
     struct stat st;
     int groupid;
-    int ret;
+    int i, ret;
 
     if (!vdev->vbasedev.sysfsdev) {
         vdev->vbasedev.sysfsdev =
@@ -2560,6 +2554,43 @@ static int vfio_initfn(PCIDevice *pdev)
         goto out_teardown;
     }
 
+    if (vdev->vga) {
+        vfio_vga_quirk_setup(vdev);
+    }
+
+    for (i = 0; i < PCI_ROM_SLOT; i++) {
+        vfio_bar_quirk_setup(vdev, i);
+    }
+
+    if (!vdev->igd_opregion &&
+        vdev->features & VFIO_FEATURE_ENABLE_IGD_OPREGION) {
+        struct vfio_region_info *opregion;
+
+        if (vdev->pdev.qdev.hotplugged) {
+            error_report("Cannot support IGD OpRegion feature on hotplugged "
+                         "device %s", vdev->vbasedev.name);
+            ret = -EINVAL;
+            goto out_teardown;
+        }
+
+        ret = vfio_get_dev_region_info(&vdev->vbasedev,
+                        VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL,
+                        VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &opregion);
+        if (ret) {
+            error_report("Device %s does not support requested IGD OpRegion "
+                         "feature", vdev->vbasedev.name);
+            goto out_teardown;
+        }
+
+        ret = vfio_pci_igd_opregion_init(vdev, opregion);
+        g_free(opregion);
+        if (ret) {
+            error_report("Device %s IGD OpRegion initialization failed",
+                         vdev->vbasedev.name);
+            goto out_teardown;
+        }
+    }
+
     /* QEMU emulates all of MSI & MSIX */
     if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
         memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
@@ -2603,6 +2634,13 @@ static void vfio_instance_finalize(Object *obj)
     vfio_bars_finalize(vdev);
     g_free(vdev->emulated_config_bits);
     g_free(vdev->rom);
+    /*
+     * XXX Leaking igd_opregion is not an oversight, we can't remove the
+     * fw_cfg entry therefore leaking this allocation seems like the safest
+     * option.
+     *
+     * g_free(vdev->igd_opregion);
+     */
     vfio_put_device(vdev);
     vfio_put_group(group);
 }
@@ -2677,6 +2715,8 @@ static Property vfio_pci_dev_properties[] = {
                     VFIO_FEATURE_ENABLE_VGA_BIT, false),
     DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features,
                     VFIO_FEATURE_ENABLE_REQ_BIT, true),
+    DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features,
+                    VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false),
     DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false),
     DEFINE_PROP_BOOL("x-no-kvm-intx", VFIOPCIDevice, no_kvm_intx, false),
     DEFINE_PROP_BOOL("x-no-kvm-msi", VFIOPCIDevice, no_kvm_msi, false),
@@ -2687,6 +2727,7 @@ static Property vfio_pci_dev_properties[] = {
                        sub_vendor_id, PCI_ANY_ID),
     DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice,
                        sub_device_id, PCI_ANY_ID),
+    DEFINE_PROP_UINT32("x-igd-gms", VFIOPCIDevice, igd_gms, 0),
     /*
      * TODO - support passed fds... is this necessary?
      * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name),
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 3976f68549..b3eb0d838e 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -115,6 +115,7 @@ typedef struct VFIOPCIDevice {
     int interrupt; /* Current interrupt type */
     VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */
     VFIOVGA *vga; /* 0xa0000, 0x3b0, 0x3c0 */
+    void *igd_opregion;
     PCIHostDeviceAddress host;
     EventNotifier err_notifier;
     EventNotifier req_notifier;
@@ -128,7 +129,11 @@ typedef struct VFIOPCIDevice {
 #define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT)
 #define VFIO_FEATURE_ENABLE_REQ_BIT 1
 #define VFIO_FEATURE_ENABLE_REQ (1 << VFIO_FEATURE_ENABLE_REQ_BIT)
+#define VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT 2
+#define VFIO_FEATURE_ENABLE_IGD_OPREGION \
+                                (1 << VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT)
     int32_t bootindex;
+    uint32_t igd_gms;
     uint8_t pm_cap;
     bool has_vga;
     bool pci_aer;
@@ -159,4 +164,7 @@ void vfio_setup_resetfn_quirk(VFIOPCIDevice *vdev);
 
 int vfio_populate_vga(VFIOPCIDevice *vdev);
 
+int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev,
+                               struct vfio_region_info *info);
+
 #endif /* HW_VFIO_VFIO_PCI_H */
diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index 5914e85107..495e09fd4e 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -17,7 +17,6 @@
 #include "sysemu/kvm.h"
 #include "qemu/error-report.h"
 #include "qemu/sockets.h"
-#include "exec/ram_addr.h"
 #include "migration/migration.h"
 
 #include <sys/ioctl.h>
@@ -247,18 +246,18 @@ static int vhost_user_set_mem_table(struct vhost_dev *dev,
 
     for (i = 0; i < dev->mem->nregions; ++i) {
         struct vhost_memory_region *reg = dev->mem->regions + i;
-        ram_addr_t ram_addr;
+        ram_addr_t offset;
+        MemoryRegion *mr;
 
         assert((uintptr_t)reg->userspace_addr == reg->userspace_addr);
-        qemu_ram_addr_from_host((void *)(uintptr_t)reg->userspace_addr,
-                                &ram_addr);
-        fd = qemu_get_ram_fd(ram_addr);
+        mr = memory_region_from_host((void *)(uintptr_t)reg->userspace_addr,
+                                     &offset);
+        fd = memory_region_get_fd(mr);
         if (fd > 0) {
             msg.payload.memory.regions[fd_num].userspace_addr = reg->userspace_addr;
             msg.payload.memory.regions[fd_num].memory_size  = reg->memory_size;
             msg.payload.memory.regions[fd_num].guest_phys_addr = reg->guest_phys_addr;
-            msg.payload.memory.regions[fd_num].mmap_offset = reg->userspace_addr -
-                (uintptr_t) qemu_get_ram_block_host_ptr(ram_addr);
+            msg.payload.memory.regions[fd_num].mmap_offset = offset;
             assert(fd_num < VHOST_MEMORY_MAX_NREGIONS);
             fds[fd_num++] = fd;
         }
@@ -616,17 +615,15 @@ static bool vhost_user_can_merge(struct vhost_dev *dev,
                                  uint64_t start1, uint64_t size1,
                                  uint64_t start2, uint64_t size2)
 {
-    ram_addr_t ram_addr;
+    ram_addr_t offset;
     int mfd, rfd;
     MemoryRegion *mr;
 
-    mr = qemu_ram_addr_from_host((void *)(uintptr_t)start1, &ram_addr);
-    assert(mr);
-    mfd = qemu_get_ram_fd(ram_addr);
+    mr = memory_region_from_host((void *)(uintptr_t)start1, &offset);
+    mfd = memory_region_get_fd(mr);
 
-    mr = qemu_ram_addr_from_host((void *)(uintptr_t)start2, &ram_addr);
-    assert(mr);
-    rfd = qemu_get_ram_fd(ram_addr);
+    mr = memory_region_from_host((void *)(uintptr_t)start2, &offset);
+    rfd = memory_region_get_fd(mr);
 
     return mfd == rfd;
 }
diff --git a/include/block/block.h b/include/block/block.h
index a8c15e36e7..70ea29947c 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -17,7 +17,6 @@ typedef struct BlockJob BlockJob;
 typedef struct BdrvChild BdrvChild;
 typedef struct BdrvChildRole BdrvChildRole;
 typedef struct BlockJobTxn BlockJobTxn;
-typedef struct BdrvNextIterator BdrvNextIterator;
 
 typedef struct BlockDriverInfo {
     /* in bytes, 0 if irrelevant */
@@ -198,7 +197,6 @@ BlockDriver *bdrv_find_format(const char *format_name);
 int bdrv_create(BlockDriver *drv, const char* filename,
                 QemuOpts *opts, Error **errp);
 int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp);
-BlockDriverState *bdrv_new_root(void);
 BlockDriverState *bdrv_new(void);
 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top);
 void bdrv_replace_in_backing_chain(BlockDriverState *old,
@@ -214,8 +212,8 @@ BdrvChild *bdrv_open_child(const char *filename,
 void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd);
 int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,
                            const char *bdref_key, Error **errp);
-int bdrv_open(BlockDriverState **pbs, const char *filename,
-              const char *reference, QDict *options, int flags, Error **errp);
+BlockDriverState *bdrv_open(const char *filename, const char *reference,
+                            QDict *options, int flags, Error **errp);
 BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                     BlockDriverState *bs,
                                     QDict *options, int flags);
@@ -244,10 +242,6 @@ int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
     const void *buf, int count);
 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
     int nb_sectors, QEMUIOVector *qiov);
-int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
-int coroutine_fn bdrv_co_readv_no_serialising(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
     int nb_sectors, QEMUIOVector *qiov);
 /*
@@ -402,7 +396,19 @@ BlockDriverState *bdrv_lookup_bs(const char *device,
                                  Error **errp);
 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base);
 BlockDriverState *bdrv_next_node(BlockDriverState *bs);
-BdrvNextIterator *bdrv_next(BdrvNextIterator *it, BlockDriverState **bs);
+
+typedef struct BdrvNextIterator { /* state for bdrv_first()/bdrv_next() */
+    enum {
+        BDRV_NEXT_BACKEND_ROOTS,
+        BDRV_NEXT_MONITOR_OWNED,
+    } phase; /* which of the two phases above the iteration is in */
+    BlockBackend *blk; /* presumably the BACKEND_ROOTS cursor; confirm */
+    BlockDriverState *bs; /* presumably the MONITOR_OWNED cursor; confirm */
+} BdrvNextIterator;
+
+BlockDriverState *bdrv_first(BdrvNextIterator *it);
+BlockDriverState *bdrv_next(BdrvNextIterator *it);
+
 BlockDriverState *bdrv_next_monitor_owned(BlockDriverState *bs);
 int bdrv_is_encrypted(BlockDriverState *bs);
 int bdrv_key_required(BlockDriverState *bs);
diff --git a/include/block/block_int.h b/include/block/block_int.h
index b6f4755725..30a97178c8 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -719,7 +719,8 @@ void hmp_drive_add_node(Monitor *mon, const char *optstr);
 
 BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
                                   const char *child_name,
-                                  const BdrvChildRole *child_role);
+                                  const BdrvChildRole *child_role,
+                                  void *opaque);
 void bdrv_root_unref_child(BdrvChild *child);
 
 const char *bdrv_get_parent_name(const BlockDriverState *bs);
diff --git a/include/block/blockjob.h b/include/block/blockjob.h
index 073a433cf8..86d28070b8 100644
--- a/include/block/blockjob.h
+++ b/include/block/blockjob.h
@@ -82,7 +82,7 @@ struct BlockJob {
     const BlockJobDriver *driver;
 
     /** The block device on which the job is operating.  */
-    BlockDriverState *bs;
+    BlockBackend *blk;
 
     /**
      * The ID of the block job. Currently the BlockBackend name of the BDS
@@ -135,6 +135,9 @@ struct BlockJob {
      */
     bool deferred_to_main_loop;
 
+    /** Element of the list of block jobs */
+    QLIST_ENTRY(BlockJob) job_list;
+
     /** Status that is published by the query-block-jobs QMP API */
     BlockDeviceIoStatus iostatus;
 
@@ -173,6 +176,17 @@ struct BlockJob {
 };
 
 /**
+ * block_job_next:
+ * @job: A block job, or %NULL.
+ *
+ * Get the next element from the list of block jobs after @job, or the
+ * first one if @job is %NULL.
+ *
+ * Returns the requested job, or %NULL if there are no more jobs left.
+ */
+BlockJob *block_job_next(BlockJob *job);
+
+/**
  * block_job_create:
  * @job_type: The class object for the newly-created job.
  * @bs: The block
@@ -357,6 +371,13 @@ bool block_job_is_paused(BlockJob *job);
 int block_job_cancel_sync(BlockJob *job);
 
 /**
+ * block_job_cancel_sync_all:
+ *
+ * Synchronously cancels all jobs using block_job_cancel_sync().
+ */
+void block_job_cancel_sync_all(void);
+
+/**
  * block_job_complete_sync:
  * @job: The job to be completed.
  * @errp: Error object which may be set by block_job_complete(); this is not
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index a2c3b92742..aaee995634 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -57,10 +57,10 @@ typedef uint32_t CPUReadMemoryFunc(void *opaque, hwaddr addr);
 
 void qemu_ram_remap(ram_addr_t addr, ram_addr_t length);
 /* This should not be used by devices.  */
-MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr);
+ram_addr_t qemu_ram_addr_from_host(void *ptr);
 RAMBlock *qemu_ram_block_by_name(const char *name);
 RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
-                                   ram_addr_t *ram_addr, ram_addr_t *offset);
+                                   ram_addr_t *offset);
 void qemu_ram_set_idstr(RAMBlock *block, const char *name, DeviceState *dev);
 void qemu_ram_unset_idstr(RAMBlock *block);
 const char *qemu_ram_get_idstr(RAMBlock *rb);
diff --git a/include/exec/memory.h b/include/exec/memory.h
index f649697ee9..4ab680052f 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -32,6 +32,8 @@
 #include "qom/object.h"
 #include "qemu/rcu.h"
 
+#define RAM_ADDR_INVALID (~(ram_addr_t)0)
+
 #define MAX_PHYS_ADDR_SPACE_BITS 62
 #define MAX_PHYS_ADDR            (((hwaddr)1 << MAX_PHYS_ADDR_SPACE_BITS) - 1)
 
@@ -667,6 +669,35 @@ static inline bool memory_region_is_rom(MemoryRegion *mr)
 int memory_region_get_fd(MemoryRegion *mr);
 
 /**
+ * memory_region_set_fd: Mark a RAM memory region as backed by a
+ * file descriptor.
+ *
+ * This function is typically used after memory_region_init_ram_ptr().
+ *
+ * @mr: the memory region to be updated.
+ * @fd: the file descriptor that backs @mr.
+ */
+void memory_region_set_fd(MemoryRegion *mr, int fd);
+
+/**
+ * memory_region_from_host: Convert a pointer into a RAM memory region
+ * and an offset within it.
+ *
+ * Given a host pointer inside a RAM memory region (created with
+ * memory_region_init_ram() or memory_region_init_ram_ptr()), return
+ * the MemoryRegion and the offset within it.
+ *
+ * Use with care; by the time this function returns, the returned pointer is
+ * not protected by RCU anymore.  If the caller is not within an RCU critical
+ * section and does not hold the iothread lock, it must have other means of
+ * protecting the pointer, such as a reference to the region that includes
+ * the incoming ram_addr_t.
+ * @ptr: the host pointer to be converted.
+ * @offset: filled in with the offset of @ptr within the returned region.
+ */
+MemoryRegion *memory_region_from_host(void *ptr, ram_addr_t *offset);
+
+/**
  * memory_region_get_ram_ptr: Get a pointer into a RAM memory region.
  *
  * Returns a host pointer to a RAM memory region (created with
@@ -1362,7 +1393,7 @@ MemTxResult address_space_read_continue(AddressSpace *as, hwaddr addr,
 					MemoryRegion *mr);
 MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr,
                                     MemTxAttrs attrs, uint8_t *buf, int len);
-void *qemu_get_ram_ptr(RAMBlock *ram_block, ram_addr_t addr);
+void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr);
 
 static inline bool memory_access_is_direct(MemoryRegion *mr, bool is_write)
 {
@@ -1400,8 +1431,7 @@ MemTxResult address_space_read(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
             l = len;
             mr = address_space_translate(as, addr, &addr1, &l, false);
             if (len == l && memory_access_is_direct(mr, false)) {
-                addr1 += memory_region_get_ram_addr(mr);
-                ptr = qemu_get_ram_ptr(mr->ram_block, addr1);
+                ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
                 memcpy(buf, ptr, len);
             } else {
                 result = address_space_read_continue(as, addr, attrs, buf, len,
diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
index 5b6e1b8b86..2a9465da11 100644
--- a/include/exec/ram_addr.h
+++ b/include/exec/ram_addr.h
@@ -105,9 +105,6 @@ RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t max_size,
                                                     uint64_t length,
                                                     void *host),
                                     MemoryRegion *mr, Error **errp);
-int qemu_get_ram_fd(ram_addr_t addr);
-void qemu_set_ram_fd(ram_addr_t addr, int fd);
-void *qemu_get_ram_block_host_ptr(ram_addr_t addr);
 void qemu_ram_free(RAMBlock *block);
 
 int qemu_ram_resize(RAMBlock *block, ram_addr_t newsize, Error **errp);
diff --git a/include/hw/cris/etraxfs.h b/include/hw/cris/etraxfs.h
index 73a6134c1e..eb664181e7 100644
--- a/include/hw/cris/etraxfs.h
+++ b/include/hw/cris/etraxfs.h
@@ -46,4 +46,20 @@ etraxfs_eth_init(NICInfo *nd, hwaddr base, int phyaddr,
     return dev;
 }
 
+/* Create and initialize an "etraxfs,serial" device using @chr as chardev. */
+static inline DeviceState *etraxfs_ser_create(hwaddr addr,
+                                              qemu_irq irq,
+                                              CharDriverState *chr)
+{
+    DeviceState *serial = qdev_create(NULL, "etraxfs,serial");
+    SysBusDevice *sbd = SYS_BUS_DEVICE(serial);
+
+    qdev_prop_set_chr(serial, "chardev", chr);
+    qdev_init_nofail(serial);
+    /* MMIO region 0 is mapped at @addr; IRQ output 0 is wired to @irq. */
+    sysbus_mmio_map(sbd, 0, addr);
+    sysbus_connect_irq(sbd, 0, irq);
+    return serial;
+}
+
 #endif
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index eb0e1b0342..0610377789 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -90,6 +90,7 @@ typedef struct VFIOContainer {
 typedef struct VFIOGuestIOMMU {
     VFIOContainer *container;
     MemoryRegion *iommu;
+    hwaddr iommu_offset;
     Notifier n;
     QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
 } VFIOGuestIOMMU;
@@ -154,5 +155,7 @@ extern QLIST_HEAD(vfio_as_head, VFIOAddressSpace) vfio_address_spaces;
 #ifdef CONFIG_LINUX
 int vfio_get_region_info(VFIODevice *vbasedev, int index,
                          struct vfio_region_info **info);
+int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type,
+                             uint32_t subtype, struct vfio_region_info **info);
 #endif
 #endif /* !HW_VFIO_VFIO_COMMON_H */
diff --git a/include/migration/migration.h b/include/migration/migration.h
index 9e36a97fc5..13b12b7e87 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -135,9 +135,12 @@ struct MigrationState
     QemuThread thread;
     QEMUBH *cleanup_bh;
     QEMUFile *to_dst_file;
-    int parameters[MIGRATION_PARAMETER__MAX];
+
+    /* New style params from 'migrate-set-parameters' */
+    MigrationParameters parameters;
 
     int state;
+    /* Old style params from 'migrate' command */
     MigrationParams params;
 
     /* State related to return path */
@@ -171,6 +174,9 @@ struct MigrationState
     QSIMPLEQ_HEAD(src_page_requests, MigrationSrcPageRequest) src_page_requests;
     /* The RAMBlock used in the last src_page_request */
     RAMBlock *last_req_rb;
+
+    /* The last error that occurred */
+    Error *error;
 };
 
 void migrate_set_state(int *state, int old_state, int new_state);
@@ -179,6 +185,22 @@ void process_incoming_migration(QEMUFile *f);
 
 void qemu_start_incoming_migration(const char *uri, Error **errp);
 
+void migration_set_incoming_channel(MigrationState *s,
+                                    QIOChannel *ioc);
+
+void migration_tls_set_incoming_channel(MigrationState *s,
+                                        QIOChannel *ioc,
+                                        Error **errp);
+
+void migration_set_outgoing_channel(MigrationState *s,
+                                    QIOChannel *ioc,
+                                    const char *hostname);
+
+void migration_tls_set_outgoing_channel(MigrationState *s,
+                                        QIOChannel *ioc,
+                                        const char *hostname,
+                                        Error **errp);
+
 uint64_t migrate_max_downtime(void);
 
 void exec_start_incoming_migration(const char *host_port, Error **errp);
@@ -201,7 +223,7 @@ void rdma_start_outgoing_migration(void *opaque, const char *host_port, Error **
 
 void rdma_start_incoming_migration(const char *host_port, Error **errp);
 
-void migrate_fd_error(MigrationState *s);
+void migrate_fd_error(MigrationState *s, const Error *error);
 
 void migrate_fd_connect(MigrationState *s);
 
diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h
index 3f6b4ed581..2409a98967 100644
--- a/include/migration/qemu-file.h
+++ b/include/migration/qemu-file.h
@@ -23,16 +23,11 @@
  */
 #ifndef QEMU_FILE_H
 #define QEMU_FILE_H 1
+#include "qemu-common.h"
 #include "exec/cpu-common.h"
+#include "io/channel.h"
 
 
-/* This function writes a chunk of data to a file at the given position.
- * The pos argument can be ignored if the file is only being used for
- * streaming.  The handler should try to write all of the data it can.
- */
-typedef ssize_t (QEMUFilePutBufferFunc)(void *opaque, const uint8_t *buf,
-                                        int64_t pos, size_t size);
-
 /* Read a chunk of data from a file at the given position.  The pos argument
  * can be ignored if the file is only be used for streaming.  The number of
  * bytes actually read should be returned.
@@ -53,8 +48,13 @@ typedef int (QEMUFileCloseFunc)(void *opaque);
  */
 typedef int (QEMUFileGetFD)(void *opaque);
 
+/* Called to change the blocking mode of the file
+ */
+typedef int (QEMUFileSetBlocking)(void *opaque, bool enabled);
+
 /*
- * This function writes an iovec to file.
+ * This function writes an iovec to file. The handler must write all
+ * of the data or return a negative errno value.
  */
 typedef ssize_t (QEMUFileWritevBufferFunc)(void *opaque, struct iovec *iov,
                                            int iovcnt, int64_t pos);
@@ -101,32 +101,25 @@ typedef QEMUFile *(QEMURetPathFunc)(void *opaque);
 typedef int (QEMUFileShutdownFunc)(void *opaque, bool rd, bool wr);
 
 typedef struct QEMUFileOps {
-    QEMUFilePutBufferFunc *put_buffer;
     QEMUFileGetBufferFunc *get_buffer;
     QEMUFileCloseFunc *close;
-    QEMUFileGetFD *get_fd;
+    QEMUFileSetBlocking *set_blocking;
     QEMUFileWritevBufferFunc *writev_buffer;
-    QEMURamHookFunc *before_ram_iterate;
-    QEMURamHookFunc *after_ram_iterate;
-    QEMURamHookFunc *hook_ram_load;
-    QEMURamSaveFunc *save_page;
     QEMURetPathFunc *get_return_path;
     QEMUFileShutdownFunc *shut_down;
 } QEMUFileOps;
 
-struct QEMUSizedBuffer {
-    struct iovec *iov;
-    size_t n_iov;
-    size_t size; /* total allocated size in all iov's */
-    size_t used; /* number of used bytes */
-};
+typedef struct QEMUFileHooks { /* attached with qemu_file_set_hooks() */
+    QEMURamHookFunc *before_ram_iterate;
+    QEMURamHookFunc *after_ram_iterate;
+    QEMURamHookFunc *hook_ram_load;
+    QEMURamSaveFunc *save_page;
+} QEMUFileHooks;
 
 QEMUFile *qemu_fopen_ops(void *opaque, const QEMUFileOps *ops);
-QEMUFile *qemu_fopen(const char *filename, const char *mode);
-QEMUFile *qemu_fdopen(int fd, const char *mode);
-QEMUFile *qemu_fopen_socket(int fd, const char *mode);
-QEMUFile *qemu_popen_cmd(const char *command, const char *mode);
-QEMUFile *qemu_bufopen(const char *mode, QEMUSizedBuffer *input);
+QEMUFile *qemu_fopen_channel_input(QIOChannel *ioc);
+QEMUFile *qemu_fopen_channel_output(QIOChannel *ioc);
+void qemu_file_set_hooks(QEMUFile *f, const QEMUFileHooks *hooks);
 int qemu_get_fd(QEMUFile *f);
 int qemu_fclose(QEMUFile *f);
 int64_t qemu_ftell(QEMUFile *f);
@@ -141,20 +134,6 @@ void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, size_t size);
 bool qemu_file_mode_is_not_valid(const char *mode);
 bool qemu_file_is_writable(QEMUFile *f);
 
-QEMUSizedBuffer *qsb_create(const uint8_t *buffer, size_t len);
-void qsb_free(QEMUSizedBuffer *);
-size_t qsb_set_length(QEMUSizedBuffer *qsb, size_t length);
-size_t qsb_get_length(const QEMUSizedBuffer *qsb);
-ssize_t qsb_get_buffer(const QEMUSizedBuffer *, off_t start, size_t count,
-                       uint8_t *buf);
-ssize_t qsb_write_at(QEMUSizedBuffer *qsb, const uint8_t *buf,
-                     off_t pos, size_t count);
-
-
-/*
- * For use on files opened with qemu_bufopen
- */
-const QEMUSizedBuffer *qemu_buf_get(QEMUFile *f);
 
 static inline void qemu_put_ubyte(QEMUFile *f, unsigned int v)
 {
diff --git a/include/qapi/error.h b/include/qapi/error.h
index 11be2327c0..0576659603 100644
--- a/include/qapi/error.h
+++ b/include/qapi/error.h
@@ -134,7 +134,7 @@ typedef enum ErrorClass {
 /*
  * Get @err's human-readable error message.
  */
-const char *error_get_pretty(Error *err);
+const char *error_get_pretty(const Error *err);
 
 /*
  * Get @err's error class.
diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h
index 5bc4d6cc47..7a590969b5 100644
--- a/include/qemu/atomic.h
+++ b/include/qemu/atomic.h
@@ -36,7 +36,18 @@
 #define smp_wmb()   ({ barrier(); __atomic_thread_fence(__ATOMIC_RELEASE); barrier(); })
 #define smp_rmb()   ({ barrier(); __atomic_thread_fence(__ATOMIC_ACQUIRE); barrier(); })
 
+/* Most compilers currently treat consume and acquire the same, but really
+ * no processors except Alpha need a barrier here.  Keep the fence for Thread
+ * Sanitizer (avoids warnings), use "mb" on Alpha, else a compiler barrier.
+ */
+#if defined(__SANITIZE_THREAD__)
 #define smp_read_barrier_depends() ({ barrier(); __atomic_thread_fence(__ATOMIC_CONSUME); barrier(); })
+#elif defined(__alpha__)
+#define smp_read_barrier_depends()   asm volatile("mb":::"memory")
+#else
+#define smp_read_barrier_depends()   barrier()
+#endif
+
 
 /* Weak atomic operations prevent the compiler moving other
  * loads/stores past the atomic operation load/store. However there is
@@ -56,13 +67,23 @@
     __atomic_store(ptr, &_val, __ATOMIC_RELAXED);     \
 } while(0)
 
-/* Atomic RCU operations imply weak memory barriers */
+/* See above: most compilers currently treat consume and acquire the
+ * same, but this slows down atomic_rcu_read unnecessarily.
+ */
+#ifdef __SANITIZE_THREAD__
+#define atomic_rcu_read__nocheck(ptr, valptr)           \
+    __atomic_load(ptr, valptr, __ATOMIC_CONSUME);
+#else
+#define atomic_rcu_read__nocheck(ptr, valptr)           \
+    __atomic_load(ptr, valptr, __ATOMIC_RELAXED);       \
+    smp_read_barrier_depends();
+#endif
 
 #define atomic_rcu_read(ptr)                          \
     ({                                                \
     QEMU_BUILD_BUG_ON(sizeof(*ptr) > sizeof(void *)); \
     typeof(*ptr) _val;                                \
-    __atomic_load(ptr, &_val, __ATOMIC_CONSUME);      \
+    atomic_rcu_read__nocheck(ptr, &_val);             \
     _val;                                             \
     })
 
diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h
index 1dcf6f5d53..b113fcf156 100644
--- a/include/qemu/typedefs.h
+++ b/include/qemu/typedefs.h
@@ -82,7 +82,6 @@ typedef struct QemuOpt QemuOpt;
 typedef struct QemuOpts QemuOpts;
 typedef struct QemuOptsList QemuOptsList;
 typedef struct QEMUSGList QEMUSGList;
-typedef struct QEMUSizedBuffer QEMUSizedBuffer;
 typedef struct QEMUTimer QEMUTimer;
 typedef struct QEMUTimerListGroup QEMUTimerListGroup;
 typedef struct QObject QObject;
diff --git a/include/qom/cpu.h b/include/qom/cpu.h
index c9ba16ca82..32f3af3e1c 100644
--- a/include/qom/cpu.h
+++ b/include/qom/cpu.h
@@ -244,6 +244,7 @@ struct qemu_work_item {
  * @halted: Nonzero if the CPU is in suspended state.
  * @stop: Indicates a pending stop request.
  * @stopped: Indicates the CPU has been artificially stopped.
+ * @unplug: Indicates a pending CPU unplug request.
  * @crash_occurred: Indicates the OS reported a crash (panic) for this CPU
  * @tcg_exit_req: Set to force TCG to stop executing linked TBs for this
  *           CPU and return to its top level loop.
@@ -296,6 +297,7 @@ struct CPUState {
     bool created;
     bool stop;
     bool stopped;
+    bool unplug;
     bool crash_occurred;
     bool exit_request;
     bool tb_flushed;
@@ -763,6 +765,22 @@ void cpu_exit(CPUState *cpu);
 void cpu_resume(CPUState *cpu);
 
 /**
+ * cpu_remove:
+ * @cpu: The CPU to remove.
+ *
+ * Requests the CPU to be removed.
+ */
+void cpu_remove(CPUState *cpu);
+
+/**
+ * cpu_remove_sync:
+ * @cpu: The CPU to remove.
+ *
+ * Requests the CPU to be removed and waits till it is removed.
+ */
+void cpu_remove_sync(CPUState *cpu);
+
+/**
  * qemu_init_vcpu:
  * @cpu: The vCPU to initialize.
  *
diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
index 68d92b556e..c04af8ea46 100644
--- a/include/sysemu/block-backend.h
+++ b/include/sysemu/block-backend.h
@@ -78,8 +78,7 @@ typedef struct BlockBackendPublic {
     QLIST_ENTRY(BlockBackendPublic) round_robin;
 } BlockBackendPublic;
 
-BlockBackend *blk_new(Error **errp);
-BlockBackend *blk_new_with_bs(Error **errp);
+BlockBackend *blk_new(void);
 BlockBackend *blk_new_open(const char *filename, const char *reference,
                            QDict *options, int flags, Error **errp);
 int blk_get_refcnt(BlockBackend *blk);
@@ -114,11 +113,17 @@ void *blk_get_attached_dev(BlockBackend *blk);
 void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops, void *opaque);
 int blk_pread_unthrottled(BlockBackend *blk, int64_t offset, uint8_t *buf,
                           int count);
-int blk_write_zeroes(BlockBackend *blk, int64_t offset,
-                     int count, BdrvRequestFlags flags);
-BlockAIOCB *blk_aio_write_zeroes(BlockBackend *blk, int64_t offset,
-                                 int count, BdrvRequestFlags flags,
-                                 BlockCompletionFunc *cb, void *opaque);
+int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
+                               unsigned int bytes, QEMUIOVector *qiov,
+                               BdrvRequestFlags flags);
+int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
+                               unsigned int bytes, QEMUIOVector *qiov,
+                               BdrvRequestFlags flags);
+int blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
+                      int count, BdrvRequestFlags flags);
+BlockAIOCB *blk_aio_pwrite_zeroes(BlockBackend *blk, int64_t offset,
+                                  int count, BdrvRequestFlags flags,
+                                  BlockCompletionFunc *cb, void *opaque);
 int blk_pread(BlockBackend *blk, int64_t offset, void *buf, int count);
 int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int count,
                BdrvRequestFlags flags);
@@ -196,8 +201,8 @@ int blk_get_open_flags_from_root_state(BlockBackend *blk);
 
 void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk,
                   BlockCompletionFunc *cb, void *opaque);
-int coroutine_fn blk_co_write_zeroes(BlockBackend *blk, int64_t offset,
-                                     int count, BdrvRequestFlags flags);
+int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
+                                      int count, BdrvRequestFlags flags);
 int blk_write_compressed(BlockBackend *blk, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors);
 int blk_truncate(BlockBackend *blk, int64_t offset);
diff --git a/include/sysemu/dma.h b/include/sysemu/dma.h
index d6e96a4298..34c8eaf64e 100644
--- a/include/sysemu/dma.h
+++ b/include/sysemu/dma.h
@@ -194,19 +194,19 @@ void qemu_sglist_add(QEMUSGList *qsg, dma_addr_t base, dma_addr_t len);
 void qemu_sglist_destroy(QEMUSGList *qsg);
 #endif
 
-typedef BlockAIOCB *DMAIOFunc(BlockBackend *blk, int64_t offset,
-                              QEMUIOVector *iov, BdrvRequestFlags flags,
-                              BlockCompletionFunc *cb, void *opaque);
-
-BlockAIOCB *dma_blk_io(BlockBackend *blk,
-                       QEMUSGList *sg, uint64_t sector_num,
-                       DMAIOFunc *io_func, BlockCompletionFunc *cb,
-                       void *opaque, DMADirection dir);
+typedef BlockAIOCB *DMAIOFunc(int64_t offset, QEMUIOVector *iov,
+                              BlockCompletionFunc *cb, void *cb_opaque,
+                              void *opaque);
+
+BlockAIOCB *dma_blk_io(AioContext *ctx,
+                       QEMUSGList *sg, uint64_t offset,
+                       DMAIOFunc *io_func, void *io_func_opaque,
+                       BlockCompletionFunc *cb, void *opaque, DMADirection dir);
 BlockAIOCB *dma_blk_read(BlockBackend *blk,
-                         QEMUSGList *sg, uint64_t sector,
+                         QEMUSGList *sg, uint64_t offset,
                          BlockCompletionFunc *cb, void *opaque);
 BlockAIOCB *dma_blk_write(BlockBackend *blk,
-                          QEMUSGList *sg, uint64_t sector,
+                          QEMUSGList *sg, uint64_t offset,
                           BlockCompletionFunc *cb, void *opaque);
 uint64_t dma_buf_read(uint8_t *ptr, int32_t len, QEMUSGList *sg);
 uint64_t dma_buf_write(uint8_t *ptr, int32_t len, QEMUSGList *sg);
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index f9f00e2e56..65569ed438 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -216,6 +216,7 @@ int kvm_has_intx_set_mask(void);
 
 int kvm_init_vcpu(CPUState *cpu);
 int kvm_cpu_exec(CPUState *cpu);
+int kvm_destroy_vcpu(CPUState *cpu);
 
 #ifdef NEED_CPU_H
 #include "cpu.h"
@@ -345,6 +346,8 @@ int kvm_arch_init(MachineState *ms, KVMState *s);
 
 int kvm_arch_init_vcpu(CPUState *cpu);
 
+bool kvm_vcpu_id_is_valid(int vcpu_id);
+
 /* Returns VCPU ID to be used on KVM_CREATE_VCPU ioctl() */
 unsigned long kvm_arch_vcpu_id(CPUState *cpu);
 
diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index 618169c4d5..94281413d0 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -119,7 +119,7 @@ void qemu_savevm_command_send(QEMUFile *f, enum qemu_vm_cmd command,
                               uint16_t len, uint8_t *data);
 void qemu_savevm_send_ping(QEMUFile *f, uint32_t value);
 void qemu_savevm_send_open_return_path(QEMUFile *f);
-int qemu_savevm_send_packaged(QEMUFile *f, const QEMUSizedBuffer *qsb);
+int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len);
 void qemu_savevm_send_postcopy_advise(QEMUFile *f);
 void qemu_savevm_send_postcopy_listen(QEMUFile *f);
 void qemu_savevm_send_postcopy_run(QEMUFile *f);
diff --git a/io/channel-buffer.c b/io/channel-buffer.c
index 3e5117bf28..43d795976d 100644
--- a/io/channel-buffer.c
+++ b/io/channel-buffer.c
@@ -140,6 +140,7 @@ static int qio_channel_buffer_close(QIOChannel *ioc,
     QIOChannelBuffer *bioc = QIO_CHANNEL_BUFFER(ioc);
 
     g_free(bioc->data);
+    bioc->data = NULL;
     bioc->capacity = bioc->usage = bioc->offset = 0;
 
     return 0;
diff --git a/kvm-all.c b/kvm-all.c
index f9ae8f9bf8..d317dcb33e 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -61,6 +61,12 @@
 
 #define KVM_MSI_HASHTAB_SIZE    256
 
+struct KVMParkedVcpu {
+    unsigned long vcpu_id;
+    int kvm_fd;
+    QLIST_ENTRY(KVMParkedVcpu) node;
+};
+
 struct KVMState
 {
     AccelState parent_obj;
@@ -94,6 +100,7 @@ struct KVMState
     QTAILQ_HEAD(msi_hashtab, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
 #endif
     KVMMemoryListener memory_listener;
+    QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus;
 };
 
 KVMState *kvm_state;
@@ -237,6 +244,53 @@ static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot)
     return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
 }
 
+int kvm_destroy_vcpu(CPUState *cpu)
+{
+    KVMState *s = kvm_state;
+    long mmap_size;
+    struct KVMParkedVcpu *vcpu = NULL;
+    int ret = 0;
+
+    DPRINTF("kvm_destroy_vcpu\n");
+
+    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
+    if (mmap_size < 0) {
+        ret = mmap_size;
+        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
+        goto err;
+    }
+
+    ret = munmap(cpu->kvm_run, mmap_size);
+    if (ret < 0) {
+        goto err;
+    }
+
+    vcpu = g_malloc0(sizeof(*vcpu));
+    vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
+    vcpu->kvm_fd = cpu->kvm_fd;
+    QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
+err:
+    return ret;
+}
+
+static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
+{
+    struct KVMParkedVcpu *cpu;
+
+    QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
+        if (cpu->vcpu_id == vcpu_id) {
+            int kvm_fd;
+
+            QLIST_REMOVE(cpu, node);
+            kvm_fd = cpu->kvm_fd;
+            g_free(cpu);
+            return kvm_fd;
+        }
+    }
+
+    return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
+}
+
 int kvm_init_vcpu(CPUState *cpu)
 {
     KVMState *s = kvm_state;
@@ -245,7 +299,7 @@ int kvm_init_vcpu(CPUState *cpu)
 
     DPRINTF("kvm_init_vcpu\n");
 
-    ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)kvm_arch_vcpu_id(cpu));
+    ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu));
     if (ret < 0) {
         DPRINTF("kvm_create_vcpu failed\n");
         goto err;
@@ -1459,6 +1513,12 @@ static int kvm_max_vcpus(KVMState *s)
     return (ret) ? ret : kvm_recommended_vcpus(s);
 }
 
+bool kvm_vcpu_id_is_valid(int vcpu_id)
+{
+    KVMState *s = KVM_STATE(current_machine->accelerator);
+    return vcpu_id >= 0 && vcpu_id < kvm_max_vcpus(s);
+}
+
 static int kvm_init(MachineState *ms)
 {
     MachineClass *mc = MACHINE_GET_CLASS(ms);
@@ -1495,6 +1555,7 @@ static int kvm_init(MachineState *ms)
 #ifdef KVM_CAP_SET_GUEST_DEBUG
     QTAILQ_INIT(&s->kvm_sw_breakpoints);
 #endif
+    QLIST_INIT(&s->kvm_parked_vcpus);
     s->vmfd = -1;
     s->fd = qemu_open("/dev/kvm", O_RDWR);
     if (s->fd == -1) {
diff --git a/kvm-stub.c b/kvm-stub.c
index 63735a872a..07c09d1141 100644
--- a/kvm-stub.c
+++ b/kvm-stub.c
@@ -32,6 +32,11 @@ bool kvm_allowed;
 bool kvm_readonly_mem_allowed;
 bool kvm_ioeventfd_any_length_allowed;
 
+int kvm_destroy_vcpu(CPUState *cpu)
+{
+    return -ENOSYS;
+}
+
 int kvm_init_vcpu(CPUState *cpu)
 {
     return -ENOSYS;
diff --git a/linux-user/Makefile.objs b/linux-user/Makefile.objs
index fd5021788f..8c93058100 100644
--- a/linux-user/Makefile.objs
+++ b/linux-user/Makefile.objs
@@ -1,5 +1,6 @@
 obj-y = main.o syscall.o strace.o mmap.o signal.o \
-	elfload.o linuxload.o uaccess.o uname.o
+	elfload.o linuxload.o uaccess.o uname.o \
+	safe-syscall.o
 
 obj-$(TARGET_HAS_BFLT) += flatload.o
 obj-$(TARGET_I386) += vm86.o
diff --git a/linux-user/alpha/target_signal.h b/linux-user/alpha/target_signal.h
index d3822da60e..4c78319145 100644
--- a/linux-user/alpha/target_signal.h
+++ b/linux-user/alpha/target_signal.h
@@ -27,6 +27,7 @@ static inline abi_ulong get_sp_from_cpustate(CPUAlphaState *state)
     return state->ir[IR_SP];
 }
 
+
 /* From <asm/gentrap.h>.  */
 #define TARGET_GEN_INTOVF      -1      /* integer overflow */
 #define TARGET_GEN_INTDIV      -2      /* integer division by zero */
diff --git a/linux-user/arm/target_signal.h b/linux-user/arm/target_signal.h
index 2b3281312b..fb31f4c5ec 100644
--- a/linux-user/arm/target_signal.h
+++ b/linux-user/arm/target_signal.h
@@ -26,4 +26,5 @@ static inline abi_ulong get_sp_from_cpustate(CPUARMState *state)
    return state->regs[13];
 }
 
+
 #endif /* TARGET_SIGNAL_H */
diff --git a/linux-user/arm/target_syscall.h b/linux-user/arm/target_syscall.h
index ea863db0b9..11077b761b 100644
--- a/linux-user/arm/target_syscall.h
+++ b/linux-user/arm/target_syscall.h
@@ -4,29 +4,11 @@
 /* this struct defines the way the registers are stored on the
    stack during a system call. */
 
+/* uregs[0..15] are r0 to r15; uregs[16] is CPSR; uregs[17] is ORIG_r0 */
 struct target_pt_regs {
     abi_long uregs[18];
 };
 
-#define ARM_cpsr	uregs[16]
-#define ARM_pc		uregs[15]
-#define ARM_lr		uregs[14]
-#define ARM_sp		uregs[13]
-#define ARM_ip		uregs[12]
-#define ARM_fp		uregs[11]
-#define ARM_r10		uregs[10]
-#define ARM_r9		uregs[9]
-#define ARM_r8		uregs[8]
-#define ARM_r7		uregs[7]
-#define ARM_r6		uregs[6]
-#define ARM_r5		uregs[5]
-#define ARM_r4		uregs[4]
-#define ARM_r3		uregs[3]
-#define ARM_r2		uregs[2]
-#define ARM_r1		uregs[1]
-#define ARM_r0		uregs[0]
-#define ARM_ORIG_r0	uregs[17]
-
 #define ARM_SYSCALL_BASE	0x900000
 #define ARM_THUMB_SYSCALL	0
 
diff --git a/linux-user/cris/target_signal.h b/linux-user/cris/target_signal.h
index 5611840f83..e0f1382815 100644
--- a/linux-user/cris/target_signal.h
+++ b/linux-user/cris/target_signal.h
@@ -26,4 +26,5 @@ static inline abi_ulong get_sp_from_cpustate(CPUCRISState *state)
     return state->regs[14];
 }
 
+
 #endif /* TARGET_SIGNAL_H */
diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index e47caff7ae..bb2558f284 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -274,19 +274,20 @@ static inline void init_thread(struct target_pt_regs *regs,
     abi_long stack = infop->start_stack;
     memset(regs, 0, sizeof(*regs));
 
-    regs->ARM_cpsr = 0x10;
-    if (infop->entry & 1)
-        regs->ARM_cpsr |= CPSR_T;
-    regs->ARM_pc = infop->entry & 0xfffffffe;
-    regs->ARM_sp = infop->start_stack;
+    regs->uregs[16] = ARM_CPU_MODE_USR;
+    if (infop->entry & 1) {
+        regs->uregs[16] |= CPSR_T;
+    }
+    regs->uregs[15] = infop->entry & 0xfffffffe;
+    regs->uregs[13] = infop->start_stack;
     /* FIXME - what to for failure of get_user()? */
-    get_user_ual(regs->ARM_r2, stack + 8); /* envp */
-    get_user_ual(regs->ARM_r1, stack + 4); /* envp */
+    get_user_ual(regs->uregs[2], stack + 8); /* envp */
+    get_user_ual(regs->uregs[1], stack + 4); /* argv */
     /* XXX: it seems that r0 is zeroed after ! */
-    regs->ARM_r0 = 0;
+    regs->uregs[0] = 0;
     /* For uClinux PIC binaries.  */
     /* XXX: Linux does this only on ARM with no MMU (do we care ?) */
-    regs->ARM_r10 = infop->start_data;
+    regs->uregs[10] = infop->start_data;
 }
 
 #define ELF_NREG    18
diff --git a/linux-user/errno_defs.h b/linux-user/errno_defs.h
index 8a1cf76cdb..65522c4516 100644
--- a/linux-user/errno_defs.h
+++ b/linux-user/errno_defs.h
@@ -139,3 +139,20 @@
 /* for robust mutexes */
 #define TARGET_EOWNERDEAD      130     /* Owner died */
 #define TARGET_ENOTRECOVERABLE 131     /* State not recoverable */
+
+/* QEMU internal, not visible to the guest. This is returned when a
+ * system call should be restarted, to tell the main loop that it
+ * should wind the guest PC backwards so it will re-execute the syscall
+ * after handling any pending signals. It matches the value the guest
+ * kernel uses for the same purpose.
+ */
+#define TARGET_ERESTARTSYS     512     /* Restart system call (if SA_RESTART) */
+
+/* QEMU internal, not visible to the guest. This is returned by the
+ * do_sigreturn() code after a successful sigreturn syscall, to indicate
+ * that it has correctly set the guest registers and so the main loop
+ * should not touch them. We use the value the guest would use for
+ * ERESTART_NOINTR (which is kernel internal) to guarantee that we won't
+ * clash with a valid guest errno now or in the future.
+ */
+#define TARGET_QEMU_ESIGRETURN 513     /* Return from signal */
diff --git a/linux-user/host/generic/hostdep.h b/linux-user/host/generic/hostdep.h
new file mode 100644
index 0000000000..cfabc3590b
--- /dev/null
+++ b/linux-user/host/generic/hostdep.h
@@ -0,0 +1,20 @@
+/*
+ * hostdep.h : fallback generic version of header for things
+ * which are dependent on the host architecture
+ *
+ * Written by Peter Maydell <peter.maydell@linaro.org>
+ *
+ * Copyright (C) 2016 Linaro Limited
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_HOSTDEP_H
+#define QEMU_HOSTDEP_H
+
+/* This is the fallback header which is only used if the host
+ * architecture doesn't provide one in linux-user/host/$ARCH.
+ */
+
+#endif
diff --git a/linux-user/host/x86_64/hostdep.h b/linux-user/host/x86_64/hostdep.h
new file mode 100644
index 0000000000..9dfbf3ae6a
--- /dev/null
+++ b/linux-user/host/x86_64/hostdep.h
@@ -0,0 +1,38 @@
+/*
+ * hostdep.h : things which are dependent on the host architecture
+ *
+ * Written by Peter Maydell <peter.maydell@linaro.org>
+ *
+ * Copyright (C) 2016 Linaro Limited
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_HOSTDEP_H
+#define QEMU_HOSTDEP_H
+
+/* We have a safe-syscall.inc.S */
+#define HAVE_SAFE_SYSCALL
+
+#ifndef __ASSEMBLER__
+
+/* These are defined by the safe-syscall.inc.S file */
+extern char safe_syscall_start[];
+extern char safe_syscall_end[];
+
+/* Adjust the signal context to rewind out of safe-syscall if we're in it */
+static inline void rewind_if_in_safe_syscall(void *puc)
+{
+    struct ucontext *uc = puc;
+    greg_t *pcreg = &uc->uc_mcontext.gregs[REG_RIP];
+
+    if (*pcreg > (uintptr_t)safe_syscall_start
+        && *pcreg < (uintptr_t)safe_syscall_end) {
+        *pcreg = (uintptr_t)safe_syscall_start;
+    }
+}
+
+#endif /* __ASSEMBLER__ */
+
+#endif
diff --git a/linux-user/host/x86_64/safe-syscall.inc.S b/linux-user/host/x86_64/safe-syscall.inc.S
new file mode 100644
index 0000000000..dde434c8d7
--- /dev/null
+++ b/linux-user/host/x86_64/safe-syscall.inc.S
@@ -0,0 +1,81 @@
+/*
+ * safe-syscall.inc.S : host-specific assembly fragment
+ * to handle signals occurring at the same time as system calls.
+ * This is intended to be included by linux-user/safe-syscall.S
+ *
+ * Copyright (C) 2015 Timothy Edward Baldwin <T.E.Baldwin99@members.leeds.ac.uk>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+        .global safe_syscall_base
+        .global safe_syscall_start
+        .global safe_syscall_end
+        .type   safe_syscall_base, @function
+
+        /* This is the entry point for making a system call. The calling
+         * convention here is that of a C varargs function with the
+         * first argument an 'int *' to the signal_pending flag, the
+         * second one the system call number (as a 'long'), and all further
+         * arguments being syscall arguments (also 'long').
+         * We return a long which is the syscall's return value, which
+         * may be negative-errno on failure. Conversion to the
+         * -1-and-errno-set convention is done by the calling wrapper.
+         */
+safe_syscall_base:
+        /* This saves a frame pointer and aligns the stack for the syscall.
+         * (It's unclear if the syscall ABI has the same stack alignment
+         * requirements as the userspace function call ABI, but better safe than
+         * sorry. Appendix A2 of http://www.x86-64.org/documentation/abi.pdf
+         * does not list any ABI differences regarding stack alignment.)
+         */
+        push    %rbp
+
+        /* The syscall calling convention isn't the same as the
+         * C one:
+         * we enter with rdi == &signal_pending
+         *               rsi == syscall number
+         *               rdx, rcx, r8, r9, (stack), (stack) == syscall arguments
+         *               and return the result in rax
+         * and the syscall instruction needs
+         *               rax == syscall number
+         *               rdi, rsi, rdx, r10, r8, r9 == syscall arguments
+         *               and returns the result in rax
+         * Shuffle everything around appropriately.
+         * Note that syscall will trash rcx and r11.
+         */
+        mov     %rsi, %rax /* syscall number */
+        mov     %rdi, %rbp /* signal_pending pointer */
+        /* and the syscall arguments */
+        mov     %rdx, %rdi
+        mov     %rcx, %rsi
+        mov     %r8,  %rdx
+        mov     %r9,  %r10
+        mov     16(%rsp), %r8
+        mov     24(%rsp), %r9
+
+        /* This next sequence of code works in conjunction with the
+         * rewind_if_in_safe_syscall() function. If a signal is taken
+         * and the interrupted PC is anywhere between 'safe_syscall_start'
+         * and 'safe_syscall_end' then we rewind it to 'safe_syscall_start'.
+         * The code sequence must therefore be able to cope with this, and
+         * the syscall instruction must be the final one in the sequence.
+         */
+safe_syscall_start:
+        /* if signal_pending is non-zero, don't do the call */
+        testl   $1, (%rbp)
+        jnz     return_ERESTARTSYS
+        syscall
+safe_syscall_end:
+        /* code path for having successfully executed the syscall */
+        pop     %rbp
+        ret
+
+return_ERESTARTSYS:
+        /* code path when we didn't execute the syscall */
+        mov     $-TARGET_ERESTARTSYS, %rax
+        pop     %rbp
+        ret
+
+        .size   safe_syscall_base, .-safe_syscall_base
diff --git a/linux-user/m68k/target_signal.h b/linux-user/m68k/target_signal.h
index 479758a421..9deaa89c80 100644
--- a/linux-user/m68k/target_signal.h
+++ b/linux-user/m68k/target_signal.h
@@ -26,4 +26,5 @@ static inline abi_ulong get_sp_from_cpustate(CPUM68KState *state)
     return state->aregs[7];
 }
 
+
 #endif /* TARGET_SIGNAL_H */
diff --git a/linux-user/main.c b/linux-user/main.c
index 95ed11d85c..b2bc6ab2f7 100644
--- a/linux-user/main.c
+++ b/linux-user/main.c
@@ -285,6 +285,7 @@ void cpu_loop(CPUX86State *env)
     CPUState *cs = CPU(x86_env_get_cpu(env));
     int trapnr;
     abi_ulong pc;
+    abi_ulong ret;
     target_siginfo_t info;
 
     for(;;) {
@@ -294,28 +295,38 @@ void cpu_loop(CPUX86State *env)
         switch(trapnr) {
         case 0x80:
             /* linux syscall from int $0x80 */
-            env->regs[R_EAX] = do_syscall(env,
-                                          env->regs[R_EAX],
-                                          env->regs[R_EBX],
-                                          env->regs[R_ECX],
-                                          env->regs[R_EDX],
-                                          env->regs[R_ESI],
-                                          env->regs[R_EDI],
-                                          env->regs[R_EBP],
-                                          0, 0);
+            ret = do_syscall(env,
+                             env->regs[R_EAX],
+                             env->regs[R_EBX],
+                             env->regs[R_ECX],
+                             env->regs[R_EDX],
+                             env->regs[R_ESI],
+                             env->regs[R_EDI],
+                             env->regs[R_EBP],
+                             0, 0);
+            if (ret == -TARGET_ERESTARTSYS) {
+                env->eip -= 2;
+            } else if (ret != -TARGET_QEMU_ESIGRETURN) {
+                env->regs[R_EAX] = ret;
+            }
             break;
 #ifndef TARGET_ABI32
         case EXCP_SYSCALL:
             /* linux syscall from syscall instruction */
-            env->regs[R_EAX] = do_syscall(env,
-                                          env->regs[R_EAX],
-                                          env->regs[R_EDI],
-                                          env->regs[R_ESI],
-                                          env->regs[R_EDX],
-                                          env->regs[10],
-                                          env->regs[8],
-                                          env->regs[9],
-                                          0, 0);
+            ret = do_syscall(env,
+                             env->regs[R_EAX],
+                             env->regs[R_EDI],
+                             env->regs[R_ESI],
+                             env->regs[R_EDX],
+                             env->regs[10],
+                             env->regs[8],
+                             env->regs[9],
+                             0, 0);
+            if (ret == -TARGET_ERESTARTSYS) {
+                env->eip -= 2;
+            } else if (ret != -TARGET_QEMU_ESIGRETURN) {
+                env->regs[R_EAX] = ret;
+            }
             break;
 #endif
         case EXCP0B_NOSEG:
@@ -716,6 +727,7 @@ void cpu_loop(CPUARMState *env)
     unsigned int n, insn;
     target_siginfo_t info;
     uint32_t addr;
+    abi_ulong ret;
 
     for(;;) {
         cpu_exec_start(cs);
@@ -854,15 +866,20 @@ void cpu_loop(CPUARMState *env)
                             break;
                         }
                     } else {
-                        env->regs[0] = do_syscall(env,
-                                                  n,
-                                                  env->regs[0],
-                                                  env->regs[1],
-                                                  env->regs[2],
-                                                  env->regs[3],
-                                                  env->regs[4],
-                                                  env->regs[5],
-                                                  0, 0);
+                        ret = do_syscall(env,
+                                         n,
+                                         env->regs[0],
+                                         env->regs[1],
+                                         env->regs[2],
+                                         env->regs[3],
+                                         env->regs[4],
+                                         env->regs[5],
+                                         0, 0);
+                        if (ret == -TARGET_ERESTARTSYS) {
+                            env->regs[15] -= env->thumb ? 2 : 4;
+                        } else if (ret != -TARGET_QEMU_ESIGRETURN) {
+                            env->regs[0] = ret;
+                        }
                     }
                 } else {
                     goto error;
@@ -1045,6 +1062,7 @@ void cpu_loop(CPUARMState *env)
 {
     CPUState *cs = CPU(arm_env_get_cpu(env));
     int trapnr, sig;
+    abi_long ret;
     target_siginfo_t info;
 
     for (;;) {
@@ -1054,15 +1072,20 @@ void cpu_loop(CPUARMState *env)
 
         switch (trapnr) {
         case EXCP_SWI:
-            env->xregs[0] = do_syscall(env,
-                                       env->xregs[8],
-                                       env->xregs[0],
-                                       env->xregs[1],
-                                       env->xregs[2],
-                                       env->xregs[3],
-                                       env->xregs[4],
-                                       env->xregs[5],
-                                       0, 0);
+            ret = do_syscall(env,
+                             env->xregs[8],
+                             env->xregs[0],
+                             env->xregs[1],
+                             env->xregs[2],
+                             env->xregs[3],
+                             env->xregs[4],
+                             env->xregs[5],
+                             0, 0);
+            if (ret == -TARGET_ERESTARTSYS) {
+                env->pc -= 4;
+            } else if (ret != -TARGET_QEMU_ESIGRETURN) {
+                env->xregs[0] = ret;
+            }
             break;
         case EXCP_INTERRUPT:
             /* just indicate that signals should be handled asap */
@@ -1148,7 +1171,7 @@ void cpu_loop(CPUUniCore32State *env)
                             cpu_set_tls(env, env->regs[0]);
                             env->regs[0] = 0;
                     } else {
-                        env->regs[0] = do_syscall(env,
+                        abi_long ret = do_syscall(env,
                                                   n,
                                                   env->regs[0],
                                                   env->regs[1],
@@ -1157,6 +1180,11 @@ void cpu_loop(CPUUniCore32State *env)
                                                   env->regs[4],
                                                   env->regs[5],
                                                   0, 0);
+                        if (ret == -TARGET_ERESTARTSYS) {
+                            env->regs[31] -= 4;
+                        } else if (ret != -TARGET_QEMU_ESIGRETURN) {
+                            env->regs[0] = ret;
+                        }
                     }
                 } else {
                     goto error;
@@ -1353,6 +1381,9 @@ void cpu_loop (CPUSPARCState *env)
                               env->regwptr[2], env->regwptr[3],
                               env->regwptr[4], env->regwptr[5],
                               0, 0);
+            if (ret == -TARGET_ERESTARTSYS || ret == -TARGET_QEMU_ESIGRETURN) {
+                break;
+            }
             if ((abi_ulong)ret >= (abi_ulong)(-515)) {
 #if defined(TARGET_SPARC64) && !defined(TARGET_ABI32)
                 env->xcc |= PSR_CARRY;
@@ -1964,6 +1995,10 @@ void cpu_loop(CPUPPCState *env)
             ret = do_syscall(env, env->gpr[0], env->gpr[3], env->gpr[4],
                              env->gpr[5], env->gpr[6], env->gpr[7],
                              env->gpr[8], 0, 0);
+            if (ret == -TARGET_ERESTARTSYS) {
+                env->nip -= 4;
+                break;
+            }
             if (ret == (target_ulong)(-TARGET_QEMU_ESIGRETURN)) {
                 /* Returning from a successful sigreturn syscall.
                    Avoid corrupting register state.  */
@@ -2505,6 +2540,10 @@ done_syscall:
                              env->active_tc.gpr[8], env->active_tc.gpr[9],
                              env->active_tc.gpr[10], env->active_tc.gpr[11]);
 # endif /* O32 */
+            if (ret == -TARGET_ERESTARTSYS) {
+                env->active_tc.PC -= 4;
+                break;
+            }
             if (ret == -TARGET_QEMU_ESIGRETURN) {
                 /* Returning from a successful sigreturn syscall.
                    Avoid clobbering register state.  */
@@ -2685,6 +2724,7 @@ void cpu_loop(CPUOpenRISCState *env)
 {
     CPUState *cs = CPU(openrisc_env_get_cpu(env));
     int trapnr, gdbsig;
+    abi_long ret;
 
     for (;;) {
         cpu_exec_start(cs);
@@ -2730,14 +2770,19 @@ void cpu_loop(CPUOpenRISCState *env)
             break;
         case EXCP_SYSCALL:
             env->pc += 4;   /* 0xc00; */
-            env->gpr[11] = do_syscall(env,
-                                      env->gpr[11], /* return value       */
-                                      env->gpr[3],  /* r3 - r7 are params */
-                                      env->gpr[4],
-                                      env->gpr[5],
-                                      env->gpr[6],
-                                      env->gpr[7],
-                                      env->gpr[8], 0, 0);
+            ret = do_syscall(env,
+                             env->gpr[11], /* return value       */
+                             env->gpr[3],  /* r3 - r7 are params */
+                             env->gpr[4],
+                             env->gpr[5],
+                             env->gpr[6],
+                             env->gpr[7],
+                             env->gpr[8], 0, 0);
+            if (ret == -TARGET_ERESTARTSYS) {
+                env->pc -= 4;
+            } else if (ret != -TARGET_QEMU_ESIGRETURN) {
+                env->gpr[11] = ret;
+            }
             break;
         case EXCP_FPE:
             qemu_log_mask(CPU_LOG_INT, "\nFloating point error\n");
@@ -2792,7 +2837,11 @@ void cpu_loop(CPUSH4State *env)
                              env->gregs[0],
                              env->gregs[1],
                              0, 0);
-            env->gregs[0] = ret;
+            if (ret == -TARGET_ERESTARTSYS) {
+                env->pc -= 2;
+            } else if (ret != -TARGET_QEMU_ESIGRETURN) {
+                env->gregs[0] = ret;
+            }
             break;
         case EXCP_INTERRUPT:
             /* just indicate that signals should be handled asap */
@@ -2865,7 +2914,11 @@ void cpu_loop(CPUCRISState *env)
                              env->pregs[7], 
                              env->pregs[11],
                              0, 0);
-            env->regs[10] = ret;
+            if (ret == -TARGET_ERESTARTSYS) {
+                env->pc -= 2;
+            } else if (ret != -TARGET_QEMU_ESIGRETURN) {
+                env->regs[10] = ret;
+            }
             break;
         case EXCP_DEBUG:
             {
@@ -2929,7 +2982,19 @@ void cpu_loop(CPUMBState *env)
                              env->regs[9], 
                              env->regs[10],
                              0, 0);
-            env->regs[3] = ret;
+            if (ret == -TARGET_ERESTARTSYS) {
+                /* Wind back to before the syscall. */
+                env->sregs[SR_PC] -= 4;
+            } else if (ret != -TARGET_QEMU_ESIGRETURN) {
+                env->regs[3] = ret;
+            }
+            /* All syscall exits result in guest r14 being equal to the
+             * PC we return to, because the kernel syscall exit "rtbd" does
+             * this. (This is true even for sigreturn(); note that r14 is
+             * not a userspace-usable register, as the kernel may clobber it
+             * at any point.)
+             */
+            env->regs[14] = env->sregs[SR_PC];
             break;
         case EXCP_HW_EXCP:
             env->regs[17] = env->sregs[SR_PC] + 4;
@@ -3037,18 +3102,24 @@ void cpu_loop(CPUM68KState *env)
             break;
         case EXCP_TRAP0:
             {
+                abi_long ret;
                 ts->sim_syscalls = 0;
                 n = env->dregs[0];
                 env->pc += 2;
-                env->dregs[0] = do_syscall(env,
-                                          n,
-                                          env->dregs[1],
-                                          env->dregs[2],
-                                          env->dregs[3],
-                                          env->dregs[4],
-                                          env->dregs[5],
-                                          env->aregs[0],
-                                          0, 0);
+                ret = do_syscall(env,
+                                 n,
+                                 env->dregs[1],
+                                 env->dregs[2],
+                                 env->dregs[3],
+                                 env->dregs[4],
+                                 env->dregs[5],
+                                 env->aregs[0],
+                                 0, 0);
+                if (ret == -TARGET_ERESTARTSYS) {
+                    env->pc -= 2;
+                } else if (ret != -TARGET_QEMU_ESIGRETURN) {
+                    env->dregs[0] = ret;
+                }
             }
             break;
         case EXCP_INTERRUPT:
@@ -3229,8 +3300,11 @@ void cpu_loop(CPUAlphaState *env)
                                     env->ir[IR_A2], env->ir[IR_A3],
                                     env->ir[IR_A4], env->ir[IR_A5],
                                     0, 0);
-                if (trapnr == TARGET_NR_sigreturn
-                    || trapnr == TARGET_NR_rt_sigreturn) {
+                if (sysret == -TARGET_ERESTARTSYS) {
+                    env->pc -= 4;
+                    break;
+                }
+                if (sysret == -TARGET_QEMU_ESIGRETURN) {
                     break;
                 }
                 /* Syscall writes 0 to V0 to bypass error check, similar
@@ -3327,6 +3401,7 @@ void cpu_loop(CPUS390XState *env)
     int trapnr, n, sig;
     target_siginfo_t info;
     target_ulong addr;
+    abi_long ret;
 
     while (1) {
         cpu_exec_start(cs);
@@ -3344,9 +3419,14 @@ void cpu_loop(CPUS390XState *env)
                 n = env->regs[1];
             }
             env->psw.addr += env->int_svc_ilen;
-            env->regs[2] = do_syscall(env, n, env->regs[2], env->regs[3],
-                                      env->regs[4], env->regs[5],
-                                      env->regs[6], env->regs[7], 0, 0);
+            ret = do_syscall(env, n, env->regs[2], env->regs[3],
+                             env->regs[4], env->regs[5],
+                             env->regs[6], env->regs[7], 0, 0);
+            if (ret == -TARGET_ERESTARTSYS) {
+                env->psw.addr -= env->int_svc_ilen;
+            } else if (ret != -TARGET_QEMU_ESIGRETURN) {
+                env->regs[2] = ret;
+            }
             break;
 
         case EXCP_DEBUG:
@@ -3638,15 +3718,20 @@ void cpu_loop(CPUTLGState *env)
         cpu_exec_end(cs);
         switch (trapnr) {
         case TILEGX_EXCP_SYSCALL:
-            env->regs[TILEGX_R_RE] = do_syscall(env, env->regs[TILEGX_R_NR],
-                                                env->regs[0], env->regs[1],
-                                                env->regs[2], env->regs[3],
-                                                env->regs[4], env->regs[5],
-                                                env->regs[6], env->regs[7]);
-            env->regs[TILEGX_R_ERR] = TILEGX_IS_ERRNO(env->regs[TILEGX_R_RE])
-                                                      ? - env->regs[TILEGX_R_RE]
-                                                      : 0;
+        {
+            abi_ulong ret = do_syscall(env, env->regs[TILEGX_R_NR],
+                                       env->regs[0], env->regs[1],
+                                       env->regs[2], env->regs[3],
+                                       env->regs[4], env->regs[5],
+                                       env->regs[6], env->regs[7]);
+            if (ret == -TARGET_ERESTARTSYS) {
+                env->pc -= 8;
+            } else if (ret != -TARGET_QEMU_ESIGRETURN) {
+                env->regs[TILEGX_R_RE] = ret;
+                env->regs[TILEGX_R_ERR] = TILEGX_IS_ERRNO(ret) ? -ret : 0;
+            }
             break;
+        }
         case TILEGX_EXCP_OPCODE_EXCH:
             do_exch(env, true, false);
             break;
diff --git a/linux-user/microblaze/target_signal.h b/linux-user/microblaze/target_signal.h
index 3d1f7a7238..acdf3b5acd 100644
--- a/linux-user/microblaze/target_signal.h
+++ b/linux-user/microblaze/target_signal.h
@@ -26,4 +26,5 @@ static inline abi_ulong get_sp_from_cpustate(CPUMBState *state)
     return state->regs[14];
 }
 
+
 #endif /* TARGET_SIGNAL_H */
diff --git a/linux-user/mips/target_signal.h b/linux-user/mips/target_signal.h
index 6e1dc8b6e6..460cc9ffef 100644
--- a/linux-user/mips/target_signal.h
+++ b/linux-user/mips/target_signal.h
@@ -26,4 +26,5 @@ static inline abi_ulong get_sp_from_cpustate(CPUMIPSState *state)
     return state->active_tc.gpr[29];
 }
 
+
 #endif /* TARGET_SIGNAL_H */
diff --git a/linux-user/mips/target_syscall.h b/linux-user/mips/target_syscall.h
index 68db160e53..e8e305cc9c 100644
--- a/linux-user/mips/target_syscall.h
+++ b/linux-user/mips/target_syscall.h
@@ -222,10 +222,6 @@ struct target_pt_regs {
 #define TARGET_ENOTRECOVERABLE 166     /* State not recoverable */
 
 
-
-/* Nasty hack: define a fake errno value for use by sigreturn.  */
-#define TARGET_QEMU_ESIGRETURN 255
-
 #define UNAME_MACHINE "mips"
 #define UNAME_MINIMUM_RELEASE "2.6.32"
 
diff --git a/linux-user/mips64/target_signal.h b/linux-user/mips64/target_signal.h
index 5fb6a2ccfc..a2dc514e3e 100644
--- a/linux-user/mips64/target_signal.h
+++ b/linux-user/mips64/target_signal.h
@@ -26,4 +26,5 @@ static inline abi_ulong get_sp_from_cpustate(CPUMIPSState *state)
     return state->active_tc.gpr[29];
 }
 
+
 #endif /* TARGET_SIGNAL_H */
diff --git a/linux-user/mips64/target_syscall.h b/linux-user/mips64/target_syscall.h
index 0e0c2d232f..5789e86150 100644
--- a/linux-user/mips64/target_syscall.h
+++ b/linux-user/mips64/target_syscall.h
@@ -219,10 +219,6 @@ struct target_pt_regs {
 #define TARGET_ENOTRECOVERABLE 166     /* State not recoverable */
 
 
-
-/* Nasty hack: define a fake errno value for use by sigreturn. */
-#define TARGET_QEMU_ESIGRETURN 255
-
 #define UNAME_MACHINE "mips64"
 #define UNAME_MINIMUM_RELEASE "2.6.32"
 
diff --git a/linux-user/openrisc/target_signal.h b/linux-user/openrisc/target_signal.h
index 964aed69f1..f600501f6f 100644
--- a/linux-user/openrisc/target_signal.h
+++ b/linux-user/openrisc/target_signal.h
@@ -23,4 +23,5 @@ static inline abi_ulong get_sp_from_cpustate(CPUOpenRISCState *state)
     return state->gpr[1];
 }
 
+
 #endif /* TARGET_SIGNAL_H */
diff --git a/linux-user/ppc/target_signal.h b/linux-user/ppc/target_signal.h
index a93b5cf1df..4f01dd4ea8 100644
--- a/linux-user/ppc/target_signal.h
+++ b/linux-user/ppc/target_signal.h
@@ -26,4 +26,5 @@ static inline abi_ulong get_sp_from_cpustate(CPUPPCState *state)
     return state->gpr[1];
 }
 
+
 #endif /* TARGET_SIGNAL_H */
diff --git a/linux-user/ppc/target_syscall.h b/linux-user/ppc/target_syscall.h
index 35cab59462..7ca83c2280 100644
--- a/linux-user/ppc/target_syscall.h
+++ b/linux-user/ppc/target_syscall.h
@@ -53,8 +53,6 @@ struct target_revectored_struct {
 	abi_ulong __map[8];			/* 256 bits */
 };
 
-/* Nasty hack: define a fake errno value for use by sigreturn.  */
-#define TARGET_QEMU_ESIGRETURN 255
 
 /*
  * flags masks
diff --git a/linux-user/qemu.h b/linux-user/qemu.h
index 208c63eb2a..f09b750bbf 100644
--- a/linux-user/qemu.h
+++ b/linux-user/qemu.h
@@ -1,7 +1,7 @@
 #ifndef QEMU_H
 #define QEMU_H
 
-
+#include "hostdep.h"
 #include "cpu.h"
 #include "exec/exec-all.h"
 #include "exec/cpu_ldst.h"
@@ -205,6 +205,131 @@ unsigned long init_guest_space(unsigned long host_start,
 
 #include "qemu/log.h"
 
+/* safe_syscall.S */
+
+/**
+ * safe_syscall:
+ * @int number: number of system call to make
+ * ...: arguments to the system call
+ *
+ * Make a system call if no guest signal is pending.
+ * This has the same API as the libc syscall() function, except that it
+ * may return -1 with errno == TARGET_ERESTARTSYS if a signal was pending.
+ *
+ * Returns: the system call result, or -1 with an error code in errno
+ * (Errnos are host errnos; we rely on TARGET_ERESTARTSYS not clashing
+ * with any of the host errno values.)
+ */
+
+/* A guide to using safe_syscall() to handle interactions between guest
+ * syscalls and guest signals:
+ *
+ * Guest syscalls come in two flavours:
+ *
+ * (1) Non-interruptible syscalls
+ *
+ * These are guest syscalls that never get interrupted by signals and
+ * so never return EINTR. They can be implemented straightforwardly in
+ * QEMU: just make sure that if the implementation code has to make any
+ * blocking calls that those calls are retried if they return EINTR.
+ * It's also OK to implement these with safe_syscall, though it will be
+ * a little less efficient if a signal is delivered at the 'wrong' moment.
+ *
+ * (2) Interruptible syscalls
+ *
+ * These are guest syscalls that can be interrupted by signals and
+ * for which we need to either return EINTR or arrange for the guest
+ * syscall to be restarted. This category includes both syscalls which
+ * always restart (and in the kernel return -ERESTARTNOINTR), ones
+ * which only restart if there is no handler (kernel returns -ERESTARTNOHAND
+ * or -ERESTART_RESTARTBLOCK), and the most common kind which restart
+ * if the handler was registered with SA_RESTART (kernel returns
+ * -ERESTARTSYS). System calls which are only interruptible in some
+ * situations (like 'open') also need to be handled this way.
+ *
+ * Here it is important that the host syscall is made
+ * via this safe_syscall() function, and *not* via the host libc.
+ * If the host libc is used then the implementation will appear to work
+ * most of the time, but there will be a race condition where a
+ * signal could arrive just before we make the host syscall inside libc,
+ * and then the guest syscall will not correctly be interrupted.
+ * Instead the implementation of the guest syscall can use the safe_syscall
+ * function but otherwise just return the result or errno in the usual
+ * way; the main loop code will take care of restarting the syscall
+ * if appropriate.
+ *
+ * (If the implementation needs to make multiple host syscalls this is
+ * OK; any which might really block must be via safe_syscall(); for those
+ * which are only technically blocking (ie which we know in practice won't
+ * stay in the host kernel indefinitely) it's OK to use libc if necessary.
+ * You must be able to cope with backing out correctly if some safe_syscall
+ * you make in the implementation returns either -TARGET_ERESTARTSYS or
+ * EINTR though.)
+ *
+ *
+ * How and why the safe_syscall implementation works:
+ *
+ * The basic setup is that we make the host syscall via a known
+ * section of host native assembly. If a signal occurs, our signal
+ * handler checks the interrupted host PC against the address of that
+ * known section. If the PC is before or at the address of the syscall
+ * instruction then we change the PC to point at a "return
+ * -TARGET_ERESTARTSYS" code path instead, and then exit the signal handler
+ * (causing the safe_syscall() call to immediately return that value).
+ * Then in the main.c loop if we see this magic return value we adjust
+ * the guest PC to wind it back to before the system call, and invoke
+ * the guest signal handler as usual.
+ *
+ * This winding-back will happen in two cases:
+ * (1) signal came in just before we took the host syscall (a race);
+ *   in this case we'll take the guest signal and have another go
+ *   at the syscall afterwards, and this is indistinguishable for the
+ *   guest from the timing having been different such that the guest
+ *   signal really did win the race
+ * (2) signal came in while the host syscall was blocking, and the
+ *   host kernel decided the syscall should be restarted;
+ *   in this case we want to restart the guest syscall also, and so
+ *   rewinding is the right thing. (Note that "restart" semantics mean
+ *   "first call the signal handler, then reattempt the syscall".)
+ * The other situation to consider is when a signal came in while the
+ * host syscall was blocking, and the host kernel decided that the syscall
+ * should not be restarted; in this case QEMU's host signal handler will
+ * be invoked with the PC pointing just after the syscall instruction,
+ * with registers indicating an EINTR return; the special code in the
+ * handler will not kick in, and we will return EINTR to the guest as
+ * we should.
+ *
+ * Notice that we can leave the host kernel to make the decision for
+ * us about whether to do a restart of the syscall or not; we do not
+ * need to check SA_RESTART flags in QEMU or distinguish the various
+ * kinds of restartability.
+ */
+#ifdef HAVE_SAFE_SYSCALL
+/* The core part of this function is implemented in assembly */
+extern long safe_syscall_base(int *pending, long number, ...);
+
+#define safe_syscall(...)                                               \
+    ({                                                                  \
+        long ret_;                                                      \
+        int *psp_ = &((TaskState *)thread_cpu->opaque)->signal_pending; \
+        ret_ = safe_syscall_base(psp_, __VA_ARGS__);                    \
+        if (is_error(ret_)) {                                           \
+            errno = -ret_;                                              \
+            ret_ = -1;                                                  \
+        }                                                               \
+        ret_;                                                           \
+    })
+
+#else
+
+/* Fallback for architectures which don't yet provide a safe-syscall assembly
+ * fragment; note that this is racy!
+ * This should go away when all host architectures have been updated.
+ */
+#define safe_syscall syscall
+
+#endif
+
 /* syscall.c */
 int host_to_target_waitstatus(int status);
 
diff --git a/linux-user/s390x/target_signal.h b/linux-user/s390x/target_signal.h
index b4816b040f..a6fb2873a3 100644
--- a/linux-user/s390x/target_signal.h
+++ b/linux-user/s390x/target_signal.h
@@ -23,4 +23,5 @@ static inline abi_ulong get_sp_from_cpustate(CPUS390XState *state)
    return state->regs[15];
 }
 
+
 #endif /* TARGET_SIGNAL_H */
diff --git a/linux-user/safe-syscall.S b/linux-user/safe-syscall.S
new file mode 100644
index 0000000000..b5df6254ae
--- /dev/null
+++ b/linux-user/safe-syscall.S
@@ -0,0 +1,30 @@
+/*
+ * safe-syscall.S : include the host-specific assembly fragment
+ * to handle signals occurring at the same time as system calls.
+ *
+ * Written by Peter Maydell <peter.maydell@linaro.org>
+ *
+ * Copyright (C) 2016 Linaro Limited
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "hostdep.h"
+#include "errno_defs.h"
+
+/* We have the correct host directory on our include path
+ * so that this will pull in the right fragment for the architecture.
+ */
+#ifdef HAVE_SAFE_SYSCALL
+#include "safe-syscall.inc.S"
+#endif
+
+/* We must specifically say that we're happy for the stack to not be
+ * executable, otherwise the toolchain will default to assuming our
+ * assembly needs an executable stack and the whole QEMU binary will
+ * needlessly end up with one. This should be the last thing in this file.
+ */
+#if defined(__linux__) && defined(__ELF__)
+.section        .note.GNU-stack, "", %progbits
+#endif
diff --git a/linux-user/sh4/target_signal.h b/linux-user/sh4/target_signal.h
index e148da0925..f9911aa7f2 100644
--- a/linux-user/sh4/target_signal.h
+++ b/linux-user/sh4/target_signal.h
@@ -26,4 +26,5 @@ static inline abi_ulong get_sp_from_cpustate(CPUSH4State *state)
     return state->gregs[15];
 }
 
+
 #endif /* TARGET_SIGNAL_H */
diff --git a/linux-user/signal.c b/linux-user/signal.c
index 96e86c0a29..8090b4de1f 100644
--- a/linux-user/signal.c
+++ b/linux-user/signal.c
@@ -157,7 +157,7 @@ static void target_to_host_sigset_internal(sigset_t *d,
         if (target_sigismember(s, i)) {
             sigaddset(d, target_to_host_signal(i));
         }
-     }
+    }
 }
 
 void target_to_host_sigset(sigset_t *d, const target_sigset_t *s)
@@ -250,18 +250,18 @@ static inline void host_to_target_siginfo_noswap(target_siginfo_t *tinfo,
     tinfo->si_code = info->si_code;
 
     if (sig == TARGET_SIGILL || sig == TARGET_SIGFPE || sig == TARGET_SIGSEGV
-        || sig == TARGET_SIGBUS || sig == TARGET_SIGTRAP) {
+            || sig == TARGET_SIGBUS || sig == TARGET_SIGTRAP) {
         /* Should never come here, but who knows. The information for
            the target is irrelevant.  */
         tinfo->_sifields._sigfault._addr = 0;
     } else if (sig == TARGET_SIGIO) {
         tinfo->_sifields._sigpoll._band = info->si_band;
-	tinfo->_sifields._sigpoll._fd = info->si_fd;
+        tinfo->_sifields._sigpoll._fd = info->si_fd;
     } else if (sig == TARGET_SIGCHLD) {
         tinfo->_sifields._sigchld._pid = info->si_pid;
         tinfo->_sifields._sigchld._uid = info->si_uid;
         tinfo->_sifields._sigchld._status
-            = host_to_target_waitstatus(info->si_status);
+                = host_to_target_waitstatus(info->si_status);
         tinfo->_sifields._sigchld._utime = info->si_utime;
         tinfo->_sifields._sigchld._stime = info->si_stime;
     } else if (sig >= TARGET_SIGRTMIN) {
@@ -269,7 +269,7 @@ static inline void host_to_target_siginfo_noswap(target_siginfo_t *tinfo,
         tinfo->_sifields._rt._uid = info->si_uid;
         /* XXX: potential problem if 64 bit */
         tinfo->_sifields._rt._sigval.sival_ptr
-            = (abi_ulong)(unsigned long)info->si_value.sival_ptr;
+                = (abi_ulong)(unsigned long)info->si_value.sival_ptr;
     }
 }
 
@@ -561,6 +561,13 @@ int queue_signal(CPUArchState *env, int sig, target_siginfo_t *info)
     }
 }
 
+#ifndef HAVE_SAFE_SYSCALL
+static inline void rewind_if_in_safe_syscall(void *puc)
+{
+    /* Default version: never rewind */
+}
+#endif
+
 static void host_signal_handler(int host_signum, siginfo_t *info,
                                 void *puc)
 {
@@ -581,6 +588,9 @@ static void host_signal_handler(int host_signum, siginfo_t *info,
     if (sig < 1 || sig > TARGET_NSIG)
         return;
     trace_user_host_signal(env, host_signum, sig);
+
+    rewind_if_in_safe_syscall(puc);
+
     host_to_target_siginfo_noswap(&tinfo, info);
     if (queue_signal(env, sig, &tinfo) == 1) {
         /* interrupt the virtual CPU as soon as possible */
@@ -723,75 +733,75 @@ int do_sigaction(int sig, const struct target_sigaction *act,
 /* from the Linux kernel */
 
 struct target_fpreg {
-	uint16_t significand[4];
-	uint16_t exponent;
+    uint16_t significand[4];
+    uint16_t exponent;
 };
 
 struct target_fpxreg {
-	uint16_t significand[4];
-	uint16_t exponent;
-	uint16_t padding[3];
+    uint16_t significand[4];
+    uint16_t exponent;
+    uint16_t padding[3];
 };
 
 struct target_xmmreg {
-	abi_ulong element[4];
+    abi_ulong element[4];
 };
 
 struct target_fpstate {
-	/* Regular FPU environment */
-        abi_ulong       cw;
-        abi_ulong       sw;
-        abi_ulong       tag;
-        abi_ulong       ipoff;
-        abi_ulong       cssel;
-        abi_ulong       dataoff;
-        abi_ulong       datasel;
-	struct target_fpreg	_st[8];
-	uint16_t	status;
-	uint16_t	magic;		/* 0xffff = regular FPU data only */
-
-	/* FXSR FPU environment */
-        abi_ulong       _fxsr_env[6];   /* FXSR FPU env is ignored */
-        abi_ulong       mxcsr;
-        abi_ulong       reserved;
-	struct target_fpxreg	_fxsr_st[8];	/* FXSR FPU reg data is ignored */
-	struct target_xmmreg	_xmm[8];
-        abi_ulong       padding[56];
+    /* Regular FPU environment */
+    abi_ulong cw;
+    abi_ulong sw;
+    abi_ulong tag;
+    abi_ulong ipoff;
+    abi_ulong cssel;
+    abi_ulong dataoff;
+    abi_ulong datasel;
+    struct target_fpreg _st[8];
+    uint16_t  status;
+    uint16_t  magic;          /* 0xffff = regular FPU data only */
+
+    /* FXSR FPU environment */
+    abi_ulong _fxsr_env[6];   /* FXSR FPU env is ignored */
+    abi_ulong mxcsr;
+    abi_ulong reserved;
+    struct target_fpxreg _fxsr_st[8]; /* FXSR FPU reg data is ignored */
+    struct target_xmmreg _xmm[8];
+    abi_ulong padding[56];
 };
 
 #define X86_FXSR_MAGIC		0x0000
 
 struct target_sigcontext {
-	uint16_t gs, __gsh;
-	uint16_t fs, __fsh;
-	uint16_t es, __esh;
-	uint16_t ds, __dsh;
-        abi_ulong edi;
-        abi_ulong esi;
-        abi_ulong ebp;
-        abi_ulong esp;
-        abi_ulong ebx;
-        abi_ulong edx;
-        abi_ulong ecx;
-        abi_ulong eax;
-        abi_ulong trapno;
-        abi_ulong err;
-        abi_ulong eip;
-	uint16_t cs, __csh;
-        abi_ulong eflags;
-        abi_ulong esp_at_signal;
-	uint16_t ss, __ssh;
-        abi_ulong fpstate; /* pointer */
-        abi_ulong oldmask;
-        abi_ulong cr2;
+    uint16_t gs, __gsh;
+    uint16_t fs, __fsh;
+    uint16_t es, __esh;
+    uint16_t ds, __dsh;
+    abi_ulong edi;
+    abi_ulong esi;
+    abi_ulong ebp;
+    abi_ulong esp;
+    abi_ulong ebx;
+    abi_ulong edx;
+    abi_ulong ecx;
+    abi_ulong eax;
+    abi_ulong trapno;
+    abi_ulong err;
+    abi_ulong eip;
+    uint16_t cs, __csh;
+    abi_ulong eflags;
+    abi_ulong esp_at_signal;
+    uint16_t ss, __ssh;
+    abi_ulong fpstate; /* pointer */
+    abi_ulong oldmask;
+    abi_ulong cr2;
 };
 
 struct target_ucontext {
-        abi_ulong         tuc_flags;
-        abi_ulong         tuc_link;
-	target_stack_t	  tuc_stack;
-	struct target_sigcontext tuc_mcontext;
-	target_sigset_t	  tuc_sigmask;	/* mask last for extensibility */
+    abi_ulong         tuc_flags;
+    abi_ulong         tuc_link;
+    target_stack_t    tuc_stack;
+    struct target_sigcontext tuc_mcontext;
+    target_sigset_t   tuc_sigmask;  /* mask last for extensibility */
 };
 
 struct sigframe
@@ -828,7 +838,7 @@ static void setup_sigcontext(struct target_sigcontext *sc,
     CPUState *cs = CPU(x86_env_get_cpu(env));
     uint16_t magic;
 
-	/* already locked in setup_frame() */
+    /* already locked in setup_frame() */
     __put_user(env->segs[R_GS].selector, (unsigned int *)&sc->gs);
     __put_user(env->segs[R_FS].selector, (unsigned int *)&sc->fs);
     __put_user(env->segs[R_ES].selector, (unsigned int *)&sc->es);
@@ -849,13 +859,13 @@ static void setup_sigcontext(struct target_sigcontext *sc,
     __put_user(env->regs[R_ESP], &sc->esp_at_signal);
     __put_user(env->segs[R_SS].selector, (unsigned int *)&sc->ss);
 
-        cpu_x86_fsave(env, fpstate_addr, 1);
-        fpstate->status = fpstate->sw;
-        magic = 0xffff;
+    cpu_x86_fsave(env, fpstate_addr, 1);
+    fpstate->status = fpstate->sw;
+    magic = 0xffff;
     __put_user(magic, &fpstate->magic);
     __put_user(fpstate_addr, &sc->fpstate);
 
-	/* non-iBCS2 extensions.. */
+    /* non-iBCS2 extensions.. */
     __put_user(mask, &sc->oldmask);
     __put_user(env->cr[2], &sc->cr2);
 }
@@ -867,110 +877,112 @@ static void setup_sigcontext(struct target_sigcontext *sc,
 static inline abi_ulong
 get_sigframe(struct target_sigaction *ka, CPUX86State *env, size_t frame_size)
 {
-	unsigned long esp;
+    unsigned long esp;
 
-	/* Default to using normal stack */
-	esp = env->regs[R_ESP];
-	/* This is the X/Open sanctioned signal stack switching.  */
-        if (ka->sa_flags & TARGET_SA_ONSTACK) {
-            if (sas_ss_flags(esp) == 0)
-                esp = target_sigaltstack_used.ss_sp + target_sigaltstack_used.ss_size;
+    /* Default to using normal stack */
+    esp = env->regs[R_ESP];
+    /* This is the X/Open sanctioned signal stack switching.  */
+    if (ka->sa_flags & TARGET_SA_ONSTACK) {
+        if (sas_ss_flags(esp) == 0) {
+            esp = target_sigaltstack_used.ss_sp + target_sigaltstack_used.ss_size;
         }
+    } else {
 
-	/* This is the legacy signal stack switching. */
-	else
+        /* This is the legacy signal stack switching. */
         if ((env->segs[R_SS].selector & 0xffff) != __USER_DS &&
-            !(ka->sa_flags & TARGET_SA_RESTORER) &&
-            ka->sa_restorer) {
+                !(ka->sa_flags & TARGET_SA_RESTORER) &&
+                ka->sa_restorer) {
             esp = (unsigned long) ka->sa_restorer;
-	}
-        return (esp - frame_size) & -8ul;
+        }
+    }
+    return (esp - frame_size) & -8ul;
 }
 
 /* compare linux/arch/i386/kernel/signal.c:setup_frame() */
 static void setup_frame(int sig, struct target_sigaction *ka,
-			target_sigset_t *set, CPUX86State *env)
+                        target_sigset_t *set, CPUX86State *env)
 {
-	abi_ulong frame_addr;
-	struct sigframe *frame;
-	int i;
+    abi_ulong frame_addr;
+    struct sigframe *frame;
+    int i;
 
-	frame_addr = get_sigframe(ka, env, sizeof(*frame));
-        trace_user_setup_frame(env, frame_addr);
+    frame_addr = get_sigframe(ka, env, sizeof(*frame));
+    trace_user_setup_frame(env, frame_addr);
 
-	if (!lock_user_struct(VERIFY_WRITE, frame, frame_addr, 0))
-		goto give_sigsegv;
+    if (!lock_user_struct(VERIFY_WRITE, frame, frame_addr, 0))
+        goto give_sigsegv;
 
     __put_user(sig, &frame->sig);
 
-	setup_sigcontext(&frame->sc, &frame->fpstate, env, set->sig[0],
-                         frame_addr + offsetof(struct sigframe, fpstate));
+    setup_sigcontext(&frame->sc, &frame->fpstate, env, set->sig[0],
+            frame_addr + offsetof(struct sigframe, fpstate));
 
     for(i = 1; i < TARGET_NSIG_WORDS; i++) {
         __put_user(set->sig[i], &frame->extramask[i - 1]);
     }
 
-	/* Set up to return from userspace.  If provided, use a stub
-	   already in userspace.  */
-	if (ka->sa_flags & TARGET_SA_RESTORER) {
+    /* Set up to return from userspace.  If provided, use a stub
+       already in userspace.  */
+    if (ka->sa_flags & TARGET_SA_RESTORER) {
         __put_user(ka->sa_restorer, &frame->pretcode);
-	} else {
-                uint16_t val16;
-                abi_ulong retcode_addr;
-                retcode_addr = frame_addr + offsetof(struct sigframe, retcode);
+    } else {
+        uint16_t val16;
+        abi_ulong retcode_addr;
+        retcode_addr = frame_addr + offsetof(struct sigframe, retcode);
         __put_user(retcode_addr, &frame->pretcode);
-		/* This is popl %eax ; movl $,%eax ; int $0x80 */
-                val16 = 0xb858;
+        /* This is popl %eax ; movl $,%eax ; int $0x80 */
+        val16 = 0xb858;
         __put_user(val16, (uint16_t *)(frame->retcode+0));
         __put_user(TARGET_NR_sigreturn, (int *)(frame->retcode+2));
-                val16 = 0x80cd;
+        val16 = 0x80cd;
         __put_user(val16, (uint16_t *)(frame->retcode+6));
-	}
+    }
 
 
-	/* Set up registers for signal handler */
-	env->regs[R_ESP] = frame_addr;
-	env->eip = ka->_sa_handler;
+    /* Set up registers for signal handler */
+    env->regs[R_ESP] = frame_addr;
+    env->eip = ka->_sa_handler;
 
-        cpu_x86_load_seg(env, R_DS, __USER_DS);
-        cpu_x86_load_seg(env, R_ES, __USER_DS);
-        cpu_x86_load_seg(env, R_SS, __USER_DS);
-        cpu_x86_load_seg(env, R_CS, __USER_CS);
-	env->eflags &= ~TF_MASK;
+    cpu_x86_load_seg(env, R_DS, __USER_DS);
+    cpu_x86_load_seg(env, R_ES, __USER_DS);
+    cpu_x86_load_seg(env, R_SS, __USER_DS);
+    cpu_x86_load_seg(env, R_CS, __USER_CS);
+    env->eflags &= ~TF_MASK;
 
-	unlock_user_struct(frame, frame_addr, 1);
+    unlock_user_struct(frame, frame_addr, 1);
 
-	return;
+    return;
 
 give_sigsegv:
-	if (sig == TARGET_SIGSEGV)
-		ka->_sa_handler = TARGET_SIG_DFL;
-	force_sig(TARGET_SIGSEGV /* , current */);
+    if (sig == TARGET_SIGSEGV) {
+        ka->_sa_handler = TARGET_SIG_DFL;
+    }
+    force_sig(TARGET_SIGSEGV /* , current */);
 }
 
 /* compare linux/arch/i386/kernel/signal.c:setup_rt_frame() */
 static void setup_rt_frame(int sig, struct target_sigaction *ka,
                            target_siginfo_t *info,
-			   target_sigset_t *set, CPUX86State *env)
+                           target_sigset_t *set, CPUX86State *env)
 {
-        abi_ulong frame_addr, addr;
-	struct rt_sigframe *frame;
-	int i;
+    abi_ulong frame_addr, addr;
+    struct rt_sigframe *frame;
+    int i;
 
-	frame_addr = get_sigframe(ka, env, sizeof(*frame));
-        trace_user_setup_rt_frame(env, frame_addr);
+    frame_addr = get_sigframe(ka, env, sizeof(*frame));
+    trace_user_setup_rt_frame(env, frame_addr);
 
-	if (!lock_user_struct(VERIFY_WRITE, frame, frame_addr, 0))
-		goto give_sigsegv;
+    if (!lock_user_struct(VERIFY_WRITE, frame, frame_addr, 0))
+        goto give_sigsegv;
 
     __put_user(sig, &frame->sig);
-        addr = frame_addr + offsetof(struct rt_sigframe, info);
+    addr = frame_addr + offsetof(struct rt_sigframe, info);
     __put_user(addr, &frame->pinfo);
-        addr = frame_addr + offsetof(struct rt_sigframe, uc);
+    addr = frame_addr + offsetof(struct rt_sigframe, uc);
     __put_user(addr, &frame->puc);
     tswap_siginfo(&frame->info, info);
 
-	/* Create the ucontext.  */
+    /* Create the ucontext.  */
     __put_user(0, &frame->uc.tuc_flags);
     __put_user(0, &frame->uc.tuc_link);
     __put_user(target_sigaltstack_used.ss_sp, &frame->uc.tuc_stack.ss_sp);
@@ -985,81 +997,82 @@ static void setup_rt_frame(int sig, struct target_sigaction *ka,
         __put_user(set->sig[i], &frame->uc.tuc_sigmask.sig[i]);
     }
 
-	/* Set up to return from userspace.  If provided, use a stub
-	   already in userspace.  */
-	if (ka->sa_flags & TARGET_SA_RESTORER) {
+    /* Set up to return from userspace.  If provided, use a stub
+       already in userspace.  */
+    if (ka->sa_flags & TARGET_SA_RESTORER) {
         __put_user(ka->sa_restorer, &frame->pretcode);
-	} else {
-                uint16_t val16;
-                addr = frame_addr + offsetof(struct rt_sigframe, retcode);
+    } else {
+        uint16_t val16;
+        addr = frame_addr + offsetof(struct rt_sigframe, retcode);
         __put_user(addr, &frame->pretcode);
-		/* This is movl $,%eax ; int $0x80 */
+        /* This is movl $,%eax ; int $0x80 */
         __put_user(0xb8, (char *)(frame->retcode+0));
         __put_user(TARGET_NR_rt_sigreturn, (int *)(frame->retcode+1));
-                val16 = 0x80cd;
+        val16 = 0x80cd;
         __put_user(val16, (uint16_t *)(frame->retcode+5));
-	}
+    }
 
-	/* Set up registers for signal handler */
-	env->regs[R_ESP] = frame_addr;
-	env->eip = ka->_sa_handler;
+    /* Set up registers for signal handler */
+    env->regs[R_ESP] = frame_addr;
+    env->eip = ka->_sa_handler;
 
-        cpu_x86_load_seg(env, R_DS, __USER_DS);
-        cpu_x86_load_seg(env, R_ES, __USER_DS);
-        cpu_x86_load_seg(env, R_SS, __USER_DS);
-        cpu_x86_load_seg(env, R_CS, __USER_CS);
-	env->eflags &= ~TF_MASK;
+    cpu_x86_load_seg(env, R_DS, __USER_DS);
+    cpu_x86_load_seg(env, R_ES, __USER_DS);
+    cpu_x86_load_seg(env, R_SS, __USER_DS);
+    cpu_x86_load_seg(env, R_CS, __USER_CS);
+    env->eflags &= ~TF_MASK;
 
-	unlock_user_struct(frame, frame_addr, 1);
+    unlock_user_struct(frame, frame_addr, 1);
 
-	return;
+    return;
 
 give_sigsegv:
-	if (sig == TARGET_SIGSEGV)
-		ka->_sa_handler = TARGET_SIG_DFL;
-	force_sig(TARGET_SIGSEGV /* , current */);
+    if (sig == TARGET_SIGSEGV) {
+        ka->_sa_handler = TARGET_SIG_DFL;
+    }
+    force_sig(TARGET_SIGSEGV /* , current */);
 }
 
 static int
-restore_sigcontext(CPUX86State *env, struct target_sigcontext *sc, int *peax)
-{
-	unsigned int err = 0;
-        abi_ulong fpstate_addr;
-        unsigned int tmpflags;
-
-        cpu_x86_load_seg(env, R_GS, tswap16(sc->gs));
-        cpu_x86_load_seg(env, R_FS, tswap16(sc->fs));
-        cpu_x86_load_seg(env, R_ES, tswap16(sc->es));
-        cpu_x86_load_seg(env, R_DS, tswap16(sc->ds));
-
-        env->regs[R_EDI] = tswapl(sc->edi);
-        env->regs[R_ESI] = tswapl(sc->esi);
-        env->regs[R_EBP] = tswapl(sc->ebp);
-        env->regs[R_ESP] = tswapl(sc->esp);
-        env->regs[R_EBX] = tswapl(sc->ebx);
-        env->regs[R_EDX] = tswapl(sc->edx);
-        env->regs[R_ECX] = tswapl(sc->ecx);
-        env->eip = tswapl(sc->eip);
-
-        cpu_x86_load_seg(env, R_CS, lduw_p(&sc->cs) | 3);
-        cpu_x86_load_seg(env, R_SS, lduw_p(&sc->ss) | 3);
-
-        tmpflags = tswapl(sc->eflags);
-        env->eflags = (env->eflags & ~0x40DD5) | (tmpflags & 0x40DD5);
-        //		regs->orig_eax = -1;		/* disable syscall checks */
-
-        fpstate_addr = tswapl(sc->fpstate);
-	if (fpstate_addr != 0) {
-                if (!access_ok(VERIFY_READ, fpstate_addr, 
-                               sizeof(struct target_fpstate)))
-                        goto badframe;
-                cpu_x86_frstor(env, fpstate_addr, 1);
-	}
+restore_sigcontext(CPUX86State *env, struct target_sigcontext *sc)
+{
+    unsigned int err = 0;
+    abi_ulong fpstate_addr;
+    unsigned int tmpflags;
+
+    cpu_x86_load_seg(env, R_GS, tswap16(sc->gs));
+    cpu_x86_load_seg(env, R_FS, tswap16(sc->fs));
+    cpu_x86_load_seg(env, R_ES, tswap16(sc->es));
+    cpu_x86_load_seg(env, R_DS, tswap16(sc->ds));
+
+    env->regs[R_EDI] = tswapl(sc->edi);
+    env->regs[R_ESI] = tswapl(sc->esi);
+    env->regs[R_EBP] = tswapl(sc->ebp);
+    env->regs[R_ESP] = tswapl(sc->esp);
+    env->regs[R_EBX] = tswapl(sc->ebx);
+    env->regs[R_EDX] = tswapl(sc->edx);
+    env->regs[R_ECX] = tswapl(sc->ecx);
+    env->regs[R_EAX] = tswapl(sc->eax);
+    env->eip = tswapl(sc->eip);
+
+    cpu_x86_load_seg(env, R_CS, lduw_p(&sc->cs) | 3);
+    cpu_x86_load_seg(env, R_SS, lduw_p(&sc->ss) | 3);
+
+    tmpflags = tswapl(sc->eflags);
+    env->eflags = (env->eflags & ~0x40DD5) | (tmpflags & 0x40DD5);
+    //		regs->orig_eax = -1;		/* disable syscall checks */
+
+    fpstate_addr = tswapl(sc->fpstate);
+    if (fpstate_addr != 0) {
+        if (!access_ok(VERIFY_READ, fpstate_addr,
+                       sizeof(struct target_fpstate)))
+            goto badframe;
+        cpu_x86_frstor(env, fpstate_addr, 1);
+    }
 
-        *peax = tswapl(sc->eax);
-	return err;
+    return err;
 badframe:
-	return 1;
+    return 1;
 }
 
 long do_sigreturn(CPUX86State *env)
@@ -1068,7 +1081,7 @@ long do_sigreturn(CPUX86State *env)
     abi_ulong frame_addr = env->regs[R_ESP] - 8;
     target_sigset_t target_set;
     sigset_t set;
-    int eax, i;
+    int i;
 
     trace_user_do_sigreturn(env, frame_addr);
     if (!lock_user_struct(VERIFY_READ, frame, frame_addr, 1))
@@ -1083,10 +1096,10 @@ long do_sigreturn(CPUX86State *env)
     do_sigprocmask(SIG_SETMASK, &set, NULL);
 
     /* restore registers */
-    if (restore_sigcontext(env, &frame->sc, &eax))
+    if (restore_sigcontext(env, &frame->sc))
         goto badframe;
     unlock_user_struct(frame, frame_addr, 0);
-    return eax;
+    return -TARGET_QEMU_ESIGRETURN;
 
 badframe:
     unlock_user_struct(frame, frame_addr, 0);
@@ -1096,32 +1109,33 @@ badframe:
 
 long do_rt_sigreturn(CPUX86State *env)
 {
-        abi_ulong frame_addr;
-	struct rt_sigframe *frame;
-        sigset_t set;
-	int eax;
+    abi_ulong frame_addr;
+    struct rt_sigframe *frame;
+    sigset_t set;
 
-        frame_addr = env->regs[R_ESP] - 4;
-        trace_user_do_rt_sigreturn(env, frame_addr);
-        if (!lock_user_struct(VERIFY_READ, frame, frame_addr, 1))
-                goto badframe;
-        target_to_host_sigset(&set, &frame->uc.tuc_sigmask);
-        do_sigprocmask(SIG_SETMASK, &set, NULL);
+    frame_addr = env->regs[R_ESP] - 4;
+    trace_user_do_rt_sigreturn(env, frame_addr);
+    if (!lock_user_struct(VERIFY_READ, frame, frame_addr, 1))
+        goto badframe;
+    target_to_host_sigset(&set, &frame->uc.tuc_sigmask);
+    do_sigprocmask(SIG_SETMASK, &set, NULL);
 
-	if (restore_sigcontext(env, &frame->uc.tuc_mcontext, &eax))
-		goto badframe;
+    if (restore_sigcontext(env, &frame->uc.tuc_mcontext)) {
+        goto badframe;
+    }
 
-	if (do_sigaltstack(frame_addr + offsetof(struct rt_sigframe, uc.tuc_stack), 0, 
-                           get_sp_from_cpustate(env)) == -EFAULT)
-		goto badframe;
+    if (do_sigaltstack(frame_addr + offsetof(struct rt_sigframe, uc.tuc_stack), 0,
+                       get_sp_from_cpustate(env)) == -EFAULT) {
+        goto badframe;
+    }
 
-        unlock_user_struct(frame, frame_addr, 0);
-	return eax;
+    unlock_user_struct(frame, frame_addr, 0);
+    return -TARGET_QEMU_ESIGRETURN;
 
 badframe:
-        unlock_user_struct(frame, frame_addr, 0);
-        force_sig(TARGET_SIGSEGV);
-	return 0;
+    unlock_user_struct(frame, frame_addr, 0);
+    force_sig(TARGET_SIGSEGV);
+    return 0;
 }
 
 #elif defined(TARGET_AARCH64)
@@ -1386,7 +1400,7 @@ long do_rt_sigreturn(CPUARMState *env)
     }
 
     unlock_user_struct(frame, frame_addr, 0);
-    return env->xregs[0];
+    return -TARGET_QEMU_ESIGRETURN;
 
  badframe:
     unlock_user_struct(frame, frame_addr, 0);
@@ -1402,27 +1416,27 @@ long do_sigreturn(CPUARMState *env)
 #elif defined(TARGET_ARM)
 
 struct target_sigcontext {
-	abi_ulong trap_no;
-	abi_ulong error_code;
-	abi_ulong oldmask;
-	abi_ulong arm_r0;
-	abi_ulong arm_r1;
-	abi_ulong arm_r2;
-	abi_ulong arm_r3;
-	abi_ulong arm_r4;
-	abi_ulong arm_r5;
-	abi_ulong arm_r6;
-	abi_ulong arm_r7;
-	abi_ulong arm_r8;
-	abi_ulong arm_r9;
-	abi_ulong arm_r10;
-	abi_ulong arm_fp;
-	abi_ulong arm_ip;
-	abi_ulong arm_sp;
-	abi_ulong arm_lr;
-	abi_ulong arm_pc;
-	abi_ulong arm_cpsr;
-	abi_ulong fault_address;
+    abi_ulong trap_no;
+    abi_ulong error_code;
+    abi_ulong oldmask;
+    abi_ulong arm_r0;
+    abi_ulong arm_r1;
+    abi_ulong arm_r2;
+    abi_ulong arm_r3;
+    abi_ulong arm_r4;
+    abi_ulong arm_r5;
+    abi_ulong arm_r6;
+    abi_ulong arm_r7;
+    abi_ulong arm_r8;
+    abi_ulong arm_r9;
+    abi_ulong arm_r10;
+    abi_ulong arm_fp;
+    abi_ulong arm_ip;
+    abi_ulong arm_sp;
+    abi_ulong arm_lr;
+    abi_ulong arm_pc;
+    abi_ulong arm_cpsr;
+    abi_ulong fault_address;
 };
 
 struct target_ucontext_v1 {
@@ -1581,7 +1595,7 @@ get_sigframe(struct target_sigaction *ka, CPUARMState *regs, int framesize)
 
 static void
 setup_return(CPUARMState *env, struct target_sigaction *ka,
-	     abi_ulong *rc, abi_ulong frame_addr, int usig, abi_ulong rc_addr)
+             abi_ulong *rc, abi_ulong frame_addr, int usig, abi_ulong rc_addr)
 {
     abi_ulong handler = ka->_sa_handler;
     abi_ulong retcode;
@@ -1691,42 +1705,44 @@ static void setup_sigframe_v2(struct target_ucontext_v2 *uc,
 static void setup_frame_v1(int usig, struct target_sigaction *ka,
                            target_sigset_t *set, CPUARMState *regs)
 {
-	struct sigframe_v1 *frame;
-	abi_ulong frame_addr = get_sigframe(ka, regs, sizeof(*frame));
-	int i;
+    struct sigframe_v1 *frame;
+    abi_ulong frame_addr = get_sigframe(ka, regs, sizeof(*frame));
+    int i;
 
-        trace_user_setup_frame(regs, frame_addr);
-	if (!lock_user_struct(VERIFY_WRITE, frame, frame_addr, 0))
-		return;
+    trace_user_setup_frame(regs, frame_addr);
+    if (!lock_user_struct(VERIFY_WRITE, frame, frame_addr, 0)) {
+        return;
+    }
 
-	setup_sigcontext(&frame->sc, regs, set->sig[0]);
+    setup_sigcontext(&frame->sc, regs, set->sig[0]);
 
     for(i = 1; i < TARGET_NSIG_WORDS; i++) {
         __put_user(set->sig[i], &frame->extramask[i - 1]);
     }
 
-        setup_return(regs, ka, &frame->retcode, frame_addr, usig,
-                     frame_addr + offsetof(struct sigframe_v1, retcode));
+    setup_return(regs, ka, &frame->retcode, frame_addr, usig,
+                 frame_addr + offsetof(struct sigframe_v1, retcode));
 
-	unlock_user_struct(frame, frame_addr, 1);
+    unlock_user_struct(frame, frame_addr, 1);
 }
 
 static void setup_frame_v2(int usig, struct target_sigaction *ka,
                            target_sigset_t *set, CPUARMState *regs)
 {
-	struct sigframe_v2 *frame;
-	abi_ulong frame_addr = get_sigframe(ka, regs, sizeof(*frame));
+    struct sigframe_v2 *frame;
+    abi_ulong frame_addr = get_sigframe(ka, regs, sizeof(*frame));
 
-        trace_user_setup_frame(regs, frame_addr);
-	if (!lock_user_struct(VERIFY_WRITE, frame, frame_addr, 0))
-		return;
+    trace_user_setup_frame(regs, frame_addr);
+    if (!lock_user_struct(VERIFY_WRITE, frame, frame_addr, 0)) {
+        return;
+    }
 
-        setup_sigframe_v2(&frame->uc, set, regs);
+    setup_sigframe_v2(&frame->uc, set, regs);
 
-        setup_return(regs, ka, &frame->retcode, frame_addr, usig,
-                     frame_addr + offsetof(struct sigframe_v2, retcode));
+    setup_return(regs, ka, &frame->retcode, frame_addr, usig,
+                 frame_addr + offsetof(struct sigframe_v2, retcode));
 
-	unlock_user_struct(frame, frame_addr, 1);
+    unlock_user_struct(frame, frame_addr, 1);
 }
 
 static void setup_frame(int usig, struct target_sigaction *ka,
@@ -1744,70 +1760,72 @@ static void setup_rt_frame_v1(int usig, struct target_sigaction *ka,
                               target_siginfo_t *info,
                               target_sigset_t *set, CPUARMState *env)
 {
-	struct rt_sigframe_v1 *frame;
-	abi_ulong frame_addr = get_sigframe(ka, env, sizeof(*frame));
-	struct target_sigaltstack stack;
-	int i;
-        abi_ulong info_addr, uc_addr;
+    struct rt_sigframe_v1 *frame;
+    abi_ulong frame_addr = get_sigframe(ka, env, sizeof(*frame));
+    struct target_sigaltstack stack;
+    int i;
+    abi_ulong info_addr, uc_addr;
 
-        trace_user_setup_rt_frame(env, frame_addr);
-	if (!lock_user_struct(VERIFY_WRITE, frame, frame_addr, 0))
-            return /* 1 */;
+    trace_user_setup_rt_frame(env, frame_addr);
+    if (!lock_user_struct(VERIFY_WRITE, frame, frame_addr, 0)) {
+        return /* 1 */;
+    }
 
-        info_addr = frame_addr + offsetof(struct rt_sigframe_v1, info);
-	__put_user(info_addr, &frame->pinfo);
-        uc_addr = frame_addr + offsetof(struct rt_sigframe_v1, uc);
-	__put_user(uc_addr, &frame->puc);
-        tswap_siginfo(&frame->info, info);
+    info_addr = frame_addr + offsetof(struct rt_sigframe_v1, info);
+    __put_user(info_addr, &frame->pinfo);
+    uc_addr = frame_addr + offsetof(struct rt_sigframe_v1, uc);
+    __put_user(uc_addr, &frame->puc);
+    tswap_siginfo(&frame->info, info);
 
-	/* Clear all the bits of the ucontext we don't use.  */
-	memset(&frame->uc, 0, offsetof(struct target_ucontext_v1, tuc_mcontext));
+    /* Clear all the bits of the ucontext we don't use.  */
+    memset(&frame->uc, 0, offsetof(struct target_ucontext_v1, tuc_mcontext));
 
-        memset(&stack, 0, sizeof(stack));
-        __put_user(target_sigaltstack_used.ss_sp, &stack.ss_sp);
-        __put_user(target_sigaltstack_used.ss_size, &stack.ss_size);
-        __put_user(sas_ss_flags(get_sp_from_cpustate(env)), &stack.ss_flags);
-        memcpy(&frame->uc.tuc_stack, &stack, sizeof(stack));
+    memset(&stack, 0, sizeof(stack));
+    __put_user(target_sigaltstack_used.ss_sp, &stack.ss_sp);
+    __put_user(target_sigaltstack_used.ss_size, &stack.ss_size);
+    __put_user(sas_ss_flags(get_sp_from_cpustate(env)), &stack.ss_flags);
+    memcpy(&frame->uc.tuc_stack, &stack, sizeof(stack));
 
-	setup_sigcontext(&frame->uc.tuc_mcontext, env, set->sig[0]);
-        for(i = 0; i < TARGET_NSIG_WORDS; i++) {
-            __put_user(set->sig[i], &frame->uc.tuc_sigmask.sig[i]);
-        }
+    setup_sigcontext(&frame->uc.tuc_mcontext, env, set->sig[0]);
+    for(i = 0; i < TARGET_NSIG_WORDS; i++) {
+        __put_user(set->sig[i], &frame->uc.tuc_sigmask.sig[i]);
+    }
 
-        setup_return(env, ka, &frame->retcode, frame_addr, usig,
-                     frame_addr + offsetof(struct rt_sigframe_v1, retcode));
+    setup_return(env, ka, &frame->retcode, frame_addr, usig,
+                 frame_addr + offsetof(struct rt_sigframe_v1, retcode));
 
-        env->regs[1] = info_addr;
-        env->regs[2] = uc_addr;
+    env->regs[1] = info_addr;
+    env->regs[2] = uc_addr;
 
-	unlock_user_struct(frame, frame_addr, 1);
+    unlock_user_struct(frame, frame_addr, 1);
 }
 
 static void setup_rt_frame_v2(int usig, struct target_sigaction *ka,
                               target_siginfo_t *info,
                               target_sigset_t *set, CPUARMState *env)
 {
-	struct rt_sigframe_v2 *frame;
-	abi_ulong frame_addr = get_sigframe(ka, env, sizeof(*frame));
-        abi_ulong info_addr, uc_addr;
+    struct rt_sigframe_v2 *frame;
+    abi_ulong frame_addr = get_sigframe(ka, env, sizeof(*frame));
+    abi_ulong info_addr, uc_addr;
 
-        trace_user_setup_rt_frame(env, frame_addr);
-	if (!lock_user_struct(VERIFY_WRITE, frame, frame_addr, 0))
-            return /* 1 */;
+    trace_user_setup_rt_frame(env, frame_addr);
+    if (!lock_user_struct(VERIFY_WRITE, frame, frame_addr, 0)) {
+        return /* 1 */;
+    }
 
-        info_addr = frame_addr + offsetof(struct rt_sigframe_v2, info);
-        uc_addr = frame_addr + offsetof(struct rt_sigframe_v2, uc);
-        tswap_siginfo(&frame->info, info);
+    info_addr = frame_addr + offsetof(struct rt_sigframe_v2, info);
+    uc_addr = frame_addr + offsetof(struct rt_sigframe_v2, uc);
+    tswap_siginfo(&frame->info, info);
 
-        setup_sigframe_v2(&frame->uc, set, env);
+    setup_sigframe_v2(&frame->uc, set, env);
 
-        setup_return(env, ka, &frame->retcode, frame_addr, usig,
-                     frame_addr + offsetof(struct rt_sigframe_v2, retcode));
+    setup_return(env, ka, &frame->retcode, frame_addr, usig,
+                 frame_addr + offsetof(struct rt_sigframe_v2, retcode));
 
-        env->regs[1] = info_addr;
-        env->regs[2] = uc_addr;
+    env->regs[1] = info_addr;
+    env->regs[2] = uc_addr;
 
-	unlock_user_struct(frame, frame_addr, 1);
+    unlock_user_struct(frame, frame_addr, 1);
 }
 
 static void setup_rt_frame(int usig, struct target_sigaction *ka,
@@ -1824,8 +1842,8 @@ static void setup_rt_frame(int usig, struct target_sigaction *ka,
 static int
 restore_sigcontext(CPUARMState *env, struct target_sigcontext *sc)
 {
-	int err = 0;
-        uint32_t cpsr;
+    int err = 0;
+    uint32_t cpsr;
 
     __get_user(env->regs[0], &sc->arm_r0);
     __get_user(env->regs[1], &sc->arm_r1);
@@ -1848,55 +1866,57 @@ restore_sigcontext(CPUARMState *env, struct target_sigcontext *sc)
     cpsr_write(env, cpsr, CPSR_USER | CPSR_EXEC, CPSRWriteByInstr);
 #endif
 
-	err |= !valid_user_regs(env);
+    err |= !valid_user_regs(env);
 
-	return err;
+    return err;
 }
 
 static long do_sigreturn_v1(CPUARMState *env)
 {
-        abi_ulong frame_addr;
-        struct sigframe_v1 *frame = NULL;
-	target_sigset_t set;
-        sigset_t host_set;
-        int i;
-
-	/*
-	 * Since we stacked the signal on a 64-bit boundary,
-	 * then 'sp' should be word aligned here.  If it's
-	 * not, then the user is trying to mess with us.
-	 */
-        frame_addr = env->regs[13];
-        trace_user_do_sigreturn(env, frame_addr);
-        if (frame_addr & 7) {
-            goto badframe;
-        }
+    abi_ulong frame_addr;
+    struct sigframe_v1 *frame = NULL;
+    target_sigset_t set;
+    sigset_t host_set;
+    int i;
 
-	if (!lock_user_struct(VERIFY_READ, frame, frame_addr, 1))
-                goto badframe;
+    /*
+     * Since we stacked the signal on a 64-bit boundary,
+     * then 'sp' should be word aligned here.  If it's
+     * not, then the user is trying to mess with us.
+     */
+    frame_addr = env->regs[13];
+    trace_user_do_sigreturn(env, frame_addr);
+    if (frame_addr & 7) {
+        goto badframe;
+    }
+
+    if (!lock_user_struct(VERIFY_READ, frame, frame_addr, 1)) {
+        goto badframe;
+    }
 
     __get_user(set.sig[0], &frame->sc.oldmask);
     for(i = 1; i < TARGET_NSIG_WORDS; i++) {
         __get_user(set.sig[i], &frame->extramask[i - 1]);
     }
 
-        target_to_host_sigset_internal(&host_set, &set);
-        do_sigprocmask(SIG_SETMASK, &host_set, NULL);
+    target_to_host_sigset_internal(&host_set, &set);
+    do_sigprocmask(SIG_SETMASK, &host_set, NULL);
 
-	if (restore_sigcontext(env, &frame->sc))
-		goto badframe;
+    if (restore_sigcontext(env, &frame->sc)) {
+        goto badframe;
+    }
 
 #if 0
-	/* Send SIGTRAP if we're single-stepping */
-	if (ptrace_cancel_bpt(current))
-		send_sig(SIGTRAP, current, 1);
+    /* Send SIGTRAP if we're single-stepping */
+    if (ptrace_cancel_bpt(current))
+        send_sig(SIGTRAP, current, 1);
 #endif
-	unlock_user_struct(frame, frame_addr, 0);
-        return env->regs[0];
+    unlock_user_struct(frame, frame_addr, 0);
+    return -TARGET_QEMU_ESIGRETURN;
 
 badframe:
-        force_sig(TARGET_SIGSEGV /* , current */);
-	return 0;
+    force_sig(TARGET_SIGSEGV /* , current */);
+    return 0;
 }
 
 static abi_ulong *restore_sigframe_v2_vfp(CPUARMState *env, abi_ulong *regspace)
@@ -1987,7 +2007,7 @@ static int do_sigframe_return_v2(CPUARMState *env, target_ulong frame_addr,
 #if 0
     /* Send SIGTRAP if we're single-stepping */
     if (ptrace_cancel_bpt(current))
-            send_sig(SIGTRAP, current, 1);
+        send_sig(SIGTRAP, current, 1);
 #endif
 
     return 0;
@@ -1995,33 +2015,35 @@ static int do_sigframe_return_v2(CPUARMState *env, target_ulong frame_addr,
 
 static long do_sigreturn_v2(CPUARMState *env)
 {
-        abi_ulong frame_addr;
-        struct sigframe_v2 *frame = NULL;
-
-	/*
-	 * Since we stacked the signal on a 64-bit boundary,
-	 * then 'sp' should be word aligned here.  If it's
-	 * not, then the user is trying to mess with us.
-	 */
-        frame_addr = env->regs[13];
-        trace_user_do_sigreturn(env, frame_addr);
-        if (frame_addr & 7) {
-            goto badframe;
-        }
+    abi_ulong frame_addr;
+    struct sigframe_v2 *frame = NULL;
 
-	if (!lock_user_struct(VERIFY_READ, frame, frame_addr, 1))
-                goto badframe;
+    /*
+     * Since we stacked the signal on a 64-bit boundary,
+     * then 'sp' should be word aligned here.  If it's
+     * not, then the user is trying to mess with us.
+     */
+    frame_addr = env->regs[13];
+    trace_user_do_sigreturn(env, frame_addr);
+    if (frame_addr & 7) {
+        goto badframe;
+    }
+
+    if (!lock_user_struct(VERIFY_READ, frame, frame_addr, 1)) {
+        goto badframe;
+    }
 
-        if (do_sigframe_return_v2(env, frame_addr, &frame->uc))
-                goto badframe;
+    if (do_sigframe_return_v2(env, frame_addr, &frame->uc)) {
+        goto badframe;
+    }
 
-	unlock_user_struct(frame, frame_addr, 0);
-	return env->regs[0];
+    unlock_user_struct(frame, frame_addr, 0);
+    return -TARGET_QEMU_ESIGRETURN;
 
 badframe:
-	unlock_user_struct(frame, frame_addr, 0);
-        force_sig(TARGET_SIGSEGV /* , current */);
-	return 0;
+    unlock_user_struct(frame, frame_addr, 0);
+    force_sig(TARGET_SIGSEGV /* , current */);
+    return 0;
 }
 
 long do_sigreturn(CPUARMState *env)
@@ -2035,76 +2057,80 @@ long do_sigreturn(CPUARMState *env)
 
 static long do_rt_sigreturn_v1(CPUARMState *env)
 {
-        abi_ulong frame_addr;
-        struct rt_sigframe_v1 *frame = NULL;
-        sigset_t host_set;
-
-	/*
-	 * Since we stacked the signal on a 64-bit boundary,
-	 * then 'sp' should be word aligned here.  If it's
-	 * not, then the user is trying to mess with us.
-	 */
-        frame_addr = env->regs[13];
-        trace_user_do_rt_sigreturn(env, frame_addr);
-        if (frame_addr & 7) {
-            goto badframe;
-        }
+    abi_ulong frame_addr;
+    struct rt_sigframe_v1 *frame = NULL;
+    sigset_t host_set;
+
+    /*
+     * Since we stacked the signal on a 64-bit boundary,
+     * then 'sp' should be word aligned here.  If it's
+     * not, then the user is trying to mess with us.
+     */
+    frame_addr = env->regs[13];
+    trace_user_do_rt_sigreturn(env, frame_addr);
+    if (frame_addr & 7) {
+        goto badframe;
+    }
 
-	if (!lock_user_struct(VERIFY_READ, frame, frame_addr, 1))
-                goto badframe;
+    if (!lock_user_struct(VERIFY_READ, frame, frame_addr, 1)) {
+        goto badframe;
+    }
 
-        target_to_host_sigset(&host_set, &frame->uc.tuc_sigmask);
-        do_sigprocmask(SIG_SETMASK, &host_set, NULL);
+    target_to_host_sigset(&host_set, &frame->uc.tuc_sigmask);
+    do_sigprocmask(SIG_SETMASK, &host_set, NULL);
 
-	if (restore_sigcontext(env, &frame->uc.tuc_mcontext))
-		goto badframe;
+    if (restore_sigcontext(env, &frame->uc.tuc_mcontext)) {
+        goto badframe;
+    }
 
-	if (do_sigaltstack(frame_addr + offsetof(struct rt_sigframe_v1, uc.tuc_stack), 0, get_sp_from_cpustate(env)) == -EFAULT)
-		goto badframe;
+    if (do_sigaltstack(frame_addr + offsetof(struct rt_sigframe_v1, uc.tuc_stack), 0, get_sp_from_cpustate(env)) == -EFAULT)
+        goto badframe;
 
 #if 0
-	/* Send SIGTRAP if we're single-stepping */
-	if (ptrace_cancel_bpt(current))
-		send_sig(SIGTRAP, current, 1);
+    /* Send SIGTRAP if we're single-stepping */
+    if (ptrace_cancel_bpt(current))
+        send_sig(SIGTRAP, current, 1);
 #endif
-	unlock_user_struct(frame, frame_addr, 0);
-	return env->regs[0];
+    unlock_user_struct(frame, frame_addr, 0);
+    return -TARGET_QEMU_ESIGRETURN;
 
 badframe:
-	unlock_user_struct(frame, frame_addr, 0);
-        force_sig(TARGET_SIGSEGV /* , current */);
-	return 0;
+    unlock_user_struct(frame, frame_addr, 0);
+    force_sig(TARGET_SIGSEGV /* , current */);
+    return 0;
 }
 
 static long do_rt_sigreturn_v2(CPUARMState *env)
 {
-        abi_ulong frame_addr;
-        struct rt_sigframe_v2 *frame = NULL;
-
-	/*
-	 * Since we stacked the signal on a 64-bit boundary,
-	 * then 'sp' should be word aligned here.  If it's
-	 * not, then the user is trying to mess with us.
-	 */
-        frame_addr = env->regs[13];
-        trace_user_do_rt_sigreturn(env, frame_addr);
-        if (frame_addr & 7) {
-            goto badframe;
-        }
+    abi_ulong frame_addr;
+    struct rt_sigframe_v2 *frame = NULL;
 
-	if (!lock_user_struct(VERIFY_READ, frame, frame_addr, 1))
-                goto badframe;
+    /*
+     * Since we stacked the signal on a 64-bit boundary,
+     * then 'sp' should be word aligned here.  If it's
+     * not, then the user is trying to mess with us.
+     */
+    frame_addr = env->regs[13];
+    trace_user_do_rt_sigreturn(env, frame_addr);
+    if (frame_addr & 7) {
+        goto badframe;
+    }
 
-        if (do_sigframe_return_v2(env, frame_addr, &frame->uc))
-                goto badframe;
+    if (!lock_user_struct(VERIFY_READ, frame, frame_addr, 1)) {
+        goto badframe;
+    }
 
-	unlock_user_struct(frame, frame_addr, 0);
-	return env->regs[0];
+    if (do_sigframe_return_v2(env, frame_addr, &frame->uc)) {
+        goto badframe;
+    }
+
+    unlock_user_struct(frame, frame_addr, 0);
+    return -TARGET_QEMU_ESIGRETURN;
 
 badframe:
-	unlock_user_struct(frame, frame_addr, 0);
-        force_sig(TARGET_SIGSEGV /* , current */);
-	return 0;
+    unlock_user_struct(frame, frame_addr, 0);
+    force_sig(TARGET_SIGSEGV /* , current */);
+    return 0;
 }
 
 long do_rt_sigreturn(CPUARMState *env)
@@ -2122,83 +2148,83 @@ long do_rt_sigreturn(CPUARMState *env)
 
 /* This is what SunOS does, so shall I. */
 struct target_sigcontext {
-        abi_ulong sigc_onstack;      /* state to restore */
+    abi_ulong sigc_onstack;      /* state to restore */
 
-        abi_ulong sigc_mask;         /* sigmask to restore */
-        abi_ulong sigc_sp;           /* stack pointer */
-        abi_ulong sigc_pc;           /* program counter */
-        abi_ulong sigc_npc;          /* next program counter */
-        abi_ulong sigc_psr;          /* for condition codes etc */
-        abi_ulong sigc_g1;           /* User uses these two registers */
-        abi_ulong sigc_o0;           /* within the trampoline code. */
+    abi_ulong sigc_mask;         /* sigmask to restore */
+    abi_ulong sigc_sp;           /* stack pointer */
+    abi_ulong sigc_pc;           /* program counter */
+    abi_ulong sigc_npc;          /* next program counter */
+    abi_ulong sigc_psr;          /* for condition codes etc */
+    abi_ulong sigc_g1;           /* User uses these two registers */
+    abi_ulong sigc_o0;           /* within the trampoline code. */
 
-        /* Now comes information regarding the users window set
+    /* Now comes information regarding the users window set
          * at the time of the signal.
          */
-        abi_ulong sigc_oswins;       /* outstanding windows */
+    abi_ulong sigc_oswins;       /* outstanding windows */
 
-        /* stack ptrs for each regwin buf */
-        char *sigc_spbuf[__SUNOS_MAXWIN];
+    /* stack ptrs for each regwin buf */
+    char *sigc_spbuf[__SUNOS_MAXWIN];
 
-        /* Windows to restore after signal */
-        struct {
-                abi_ulong locals[8];
-                abi_ulong ins[8];
-        } sigc_wbuf[__SUNOS_MAXWIN];
+    /* Windows to restore after signal */
+    struct {
+        abi_ulong locals[8];
+        abi_ulong ins[8];
+    } sigc_wbuf[__SUNOS_MAXWIN];
 };
 /* A Sparc stack frame */
 struct sparc_stackf {
-        abi_ulong locals[8];
-        abi_ulong ins[8];
-        /* It's simpler to treat fp and callers_pc as elements of ins[]
+    abi_ulong locals[8];
+    abi_ulong ins[8];
+    /* It's simpler to treat fp and callers_pc as elements of ins[]
          * since we never need to access them ourselves.
          */
-        char *structptr;
-        abi_ulong xargs[6];
-        abi_ulong xxargs[1];
+    char *structptr;
+    abi_ulong xargs[6];
+    abi_ulong xxargs[1];
 };
 
 typedef struct {
-        struct {
-                abi_ulong psr;
-                abi_ulong pc;
-                abi_ulong npc;
-                abi_ulong y;
-                abi_ulong u_regs[16]; /* globals and ins */
-        }               si_regs;
-        int             si_mask;
+    struct {
+        abi_ulong psr;
+        abi_ulong pc;
+        abi_ulong npc;
+        abi_ulong y;
+        abi_ulong u_regs[16]; /* globals and ins */
+    }               si_regs;
+    int             si_mask;
 } __siginfo_t;
 
 typedef struct {
-        abi_ulong       si_float_regs[32];
-        unsigned   long si_fsr;
-        unsigned   long si_fpqdepth;
-        struct {
-                unsigned long *insn_addr;
-                unsigned long insn;
-        } si_fpqueue [16];
+    abi_ulong  si_float_regs[32];
+    unsigned   long si_fsr;
+    unsigned   long si_fpqdepth;
+    struct {
+        unsigned long *insn_addr;
+        unsigned long insn;
+    } si_fpqueue [16];
 } qemu_siginfo_fpu_t;
 
 
 struct target_signal_frame {
-	struct sparc_stackf	ss;
-	__siginfo_t		info;
-	abi_ulong               fpu_save;
-	abi_ulong		insns[2] __attribute__ ((aligned (8)));
-	abi_ulong		extramask[TARGET_NSIG_WORDS - 1];
-	abi_ulong		extra_size; /* Should be 0 */
-	qemu_siginfo_fpu_t	fpu_state;
+    struct sparc_stackf ss;
+    __siginfo_t         info;
+    abi_ulong           fpu_save;
+    abi_ulong           insns[2] __attribute__ ((aligned (8)));
+    abi_ulong           extramask[TARGET_NSIG_WORDS - 1];
+    abi_ulong           extra_size; /* Should be 0 */
+    qemu_siginfo_fpu_t fpu_state;
 };
 struct target_rt_signal_frame {
-	struct sparc_stackf	ss;
-	siginfo_t		info;
-	abi_ulong		regs[20];
-	sigset_t		mask;
-	abi_ulong               fpu_save;
-	unsigned int		insns[2];
-	stack_t			stack;
-	unsigned int		extra_size; /* Should be 0 */
-	qemu_siginfo_fpu_t	fpu_state;
+    struct sparc_stackf ss;
+    siginfo_t           info;
+    abi_ulong           regs[20];
+    sigset_t            mask;
+    abi_ulong           fpu_save;
+    unsigned int        insns[2];
+    stack_t             stack;
+    unsigned int        extra_size; /* Should be 0 */
+    qemu_siginfo_fpu_t  fpu_state;
 };
 
 #define UREG_O0        16
@@ -2219,36 +2245,37 @@ static inline abi_ulong get_sigframe(struct target_sigaction *sa,
                                      CPUSPARCState *env,
                                      unsigned long framesize)
 {
-	abi_ulong sp;
+    abi_ulong sp;
 
-	sp = env->regwptr[UREG_FP];
+    sp = env->regwptr[UREG_FP];
 
-	/* This is the X/Open sanctioned signal stack switching.  */
-	if (sa->sa_flags & TARGET_SA_ONSTACK) {
-            if (!on_sig_stack(sp)
-                && !((target_sigaltstack_used.ss_sp + target_sigaltstack_used.ss_size) & 7))
-                sp = target_sigaltstack_used.ss_sp + target_sigaltstack_used.ss_size;
-	}
-	return sp - framesize;
+    /* This is the X/Open sanctioned signal stack switching.  */
+    if (sa->sa_flags & TARGET_SA_ONSTACK) {
+        if (!on_sig_stack(sp)
+                && !((target_sigaltstack_used.ss_sp + target_sigaltstack_used.ss_size) & 7)) {
+            sp = target_sigaltstack_used.ss_sp + target_sigaltstack_used.ss_size;
+        }
+    }
+    return sp - framesize;
 }
 
 static int
 setup___siginfo(__siginfo_t *si, CPUSPARCState *env, abi_ulong mask)
 {
-	int err = 0, i;
+    int err = 0, i;
 
     __put_user(env->psr, &si->si_regs.psr);
     __put_user(env->pc, &si->si_regs.pc);
     __put_user(env->npc, &si->si_regs.npc);
     __put_user(env->y, &si->si_regs.y);
-	for (i=0; i < 8; i++) {
+    for (i=0; i < 8; i++) {
         __put_user(env->gregs[i], &si->si_regs.u_regs[i]);
-	}
-	for (i=0; i < 8; i++) {
+    }
+    for (i=0; i < 8; i++) {
         __put_user(env->regwptr[UREG_I0 + i], &si->si_regs.u_regs[i+8]);
-	}
+    }
     __put_user(mask, &si->si_mask);
-	return err;
+    return err;
 }
 
 #if 0
@@ -2256,7 +2283,7 @@ static int
 setup_sigcontext(struct target_sigcontext *sc, /*struct _fpstate *fpstate,*/
                  CPUSPARCState *env, unsigned long mask)
 {
-	int err = 0;
+    int err = 0;
 
     __put_user(mask, &sc->sigc_mask);
     __put_user(env->regwptr[UREG_SP], &sc->sigc_sp);
@@ -2266,7 +2293,7 @@ setup_sigcontext(struct target_sigcontext *sc, /*struct _fpstate *fpstate,*/
     __put_user(env->gregs[1], &sc->sigc_g1);
     __put_user(env->regwptr[UREG_O0], &sc->sigc_o0);
 
-	return err;
+    return err;
 }
 #endif
 #define NF_ALIGNEDSZ  (((sizeof(struct target_signal_frame) + 7) & (~7)))
@@ -2274,90 +2301,90 @@ setup_sigcontext(struct target_sigcontext *sc, /*struct _fpstate *fpstate,*/
 static void setup_frame(int sig, struct target_sigaction *ka,
                         target_sigset_t *set, CPUSPARCState *env)
 {
-        abi_ulong sf_addr;
-	struct target_signal_frame *sf;
-	int sigframe_size, err, i;
+    abi_ulong sf_addr;
+    struct target_signal_frame *sf;
+    int sigframe_size, err, i;
 
-	/* 1. Make sure everything is clean */
-	//synchronize_user_stack();
+    /* 1. Make sure everything is clean */
+    //synchronize_user_stack();
 
-        sigframe_size = NF_ALIGNEDSZ;
-	sf_addr = get_sigframe(ka, env, sigframe_size);
-        trace_user_setup_frame(env, sf_addr);
+    sigframe_size = NF_ALIGNEDSZ;
+    sf_addr = get_sigframe(ka, env, sigframe_size);
+    trace_user_setup_frame(env, sf_addr);
 
-        sf = lock_user(VERIFY_WRITE, sf_addr, 
-                       sizeof(struct target_signal_frame), 0);
-        if (!sf)
-		goto sigsegv;
-                
+    sf = lock_user(VERIFY_WRITE, sf_addr,
+                   sizeof(struct target_signal_frame), 0);
+    if (!sf) {
+        goto sigsegv;
+    }
 #if 0
-	if (invalid_frame_pointer(sf, sigframe_size))
-		goto sigill_and_return;
+    if (invalid_frame_pointer(sf, sigframe_size))
+        goto sigill_and_return;
 #endif
-	/* 2. Save the current process state */
-	err = setup___siginfo(&sf->info, env, set->sig[0]);
+    /* 2. Save the current process state */
+    err = setup___siginfo(&sf->info, env, set->sig[0]);
     __put_user(0, &sf->extra_size);
 
-	//save_fpu_state(regs, &sf->fpu_state);
-	//__put_user(&sf->fpu_state, &sf->fpu_save);
+    //save_fpu_state(regs, &sf->fpu_state);
+    //__put_user(&sf->fpu_state, &sf->fpu_save);
 
     __put_user(set->sig[0], &sf->info.si_mask);
-	for (i = 0; i < TARGET_NSIG_WORDS - 1; i++) {
+    for (i = 0; i < TARGET_NSIG_WORDS - 1; i++) {
         __put_user(set->sig[i + 1], &sf->extramask[i]);
-	}
+    }
 
-	for (i = 0; i < 8; i++) {
+    for (i = 0; i < 8; i++) {
         __put_user(env->regwptr[i + UREG_L0], &sf->ss.locals[i]);
-	}
-	for (i = 0; i < 8; i++) {
+    }
+    for (i = 0; i < 8; i++) {
         __put_user(env->regwptr[i + UREG_I0], &sf->ss.ins[i]);
-	}
-	if (err)
-		goto sigsegv;
-
-	/* 3. signal handler back-trampoline and parameters */
-	env->regwptr[UREG_FP] = sf_addr;
-	env->regwptr[UREG_I0] = sig;
-	env->regwptr[UREG_I1] = sf_addr + 
-                offsetof(struct target_signal_frame, info);
-	env->regwptr[UREG_I2] = sf_addr + 
-                offsetof(struct target_signal_frame, info);
-
-	/* 4. signal handler */
-	env->pc = ka->_sa_handler;
-	env->npc = (env->pc + 4);
-	/* 5. return to kernel instructions */
-	if (ka->sa_restorer)
-		env->regwptr[UREG_I7] = ka->sa_restorer;
-	else {
-                uint32_t val32;
-
-		env->regwptr[UREG_I7] = sf_addr + 
-                        offsetof(struct target_signal_frame, insns) - 2 * 4;
-
-		/* mov __NR_sigreturn, %g1 */
-                val32 = 0x821020d8;
+    }
+    if (err)
+        goto sigsegv;
+
+    /* 3. signal handler back-trampoline and parameters */
+    env->regwptr[UREG_FP] = sf_addr;
+    env->regwptr[UREG_I0] = sig;
+    env->regwptr[UREG_I1] = sf_addr +
+            offsetof(struct target_signal_frame, info);
+    env->regwptr[UREG_I2] = sf_addr +
+            offsetof(struct target_signal_frame, info);
+
+    /* 4. signal handler */
+    env->pc = ka->_sa_handler;
+    env->npc = (env->pc + 4);
+    /* 5. return to kernel instructions */
+    if (ka->sa_restorer) {
+        env->regwptr[UREG_I7] = ka->sa_restorer;
+    } else {
+        uint32_t val32;
+
+        env->regwptr[UREG_I7] = sf_addr +
+                offsetof(struct target_signal_frame, insns) - 2 * 4;
+
+        /* mov __NR_sigreturn, %g1 */
+        val32 = 0x821020d8;
         __put_user(val32, &sf->insns[0]);
 
-		/* t 0x10 */
-                val32 = 0x91d02010;
+        /* t 0x10 */
+        val32 = 0x91d02010;
         __put_user(val32, &sf->insns[1]);
-		if (err)
-			goto sigsegv;
+        if (err)
+            goto sigsegv;
 
-		/* Flush instruction space. */
-		//flush_sig_insns(current->mm, (unsigned long) &(sf->insns[0]));
-                //		tb_flush(CPU(sparc_env_get_cpu(env)));
-	}
-        unlock_user(sf, sf_addr, sizeof(struct target_signal_frame));
-	return;
+        /* Flush instruction space. */
+        // flush_sig_insns(current->mm, (unsigned long) &(sf->insns[0]));
+        // tb_flush(env);
+    }
+    unlock_user(sf, sf_addr, sizeof(struct target_signal_frame));
+    return;
 #if 0
 sigill_and_return:
-	force_sig(TARGET_SIGILL);
+    force_sig(TARGET_SIGILL);
 #endif
 sigsegv:
-        unlock_user(sf, sf_addr, sizeof(struct target_signal_frame));
-	force_sig(TARGET_SIGSEGV);
+    unlock_user(sf, sf_addr, sizeof(struct target_signal_frame));
+    force_sig(TARGET_SIGSEGV);
 }
 
 static void setup_rt_frame(int sig, struct target_sigaction *ka,
@@ -2369,71 +2396,74 @@ static void setup_rt_frame(int sig, struct target_sigaction *ka,
 
 long do_sigreturn(CPUSPARCState *env)
 {
-        abi_ulong sf_addr;
-        struct target_signal_frame *sf;
-        uint32_t up_psr, pc, npc;
-        target_sigset_t set;
-        sigset_t host_set;
-        int err=0, i;
+    abi_ulong sf_addr;
+    struct target_signal_frame *sf;
+    uint32_t up_psr, pc, npc;
+    target_sigset_t set;
+    sigset_t host_set;
+    int err=0, i;
 
-        sf_addr = env->regwptr[UREG_FP];
-        trace_user_do_sigreturn(env, sf_addr);
-        if (!lock_user_struct(VERIFY_READ, sf, sf_addr, 1))
-                goto segv_and_exit;
+    sf_addr = env->regwptr[UREG_FP];
+    trace_user_do_sigreturn(env, sf_addr);
+    if (!lock_user_struct(VERIFY_READ, sf, sf_addr, 1)) {
+        goto segv_and_exit;
+    }
 
-        /* 1. Make sure we are not getting garbage from the user */
+    /* 1. Make sure we are not getting garbage from the user */
 
-        if (sf_addr & 3)
-                goto segv_and_exit;
+    if (sf_addr & 3)
+        goto segv_and_exit;
 
-        __get_user(pc,  &sf->info.si_regs.pc);
-        __get_user(npc, &sf->info.si_regs.npc);
+    __get_user(pc,  &sf->info.si_regs.pc);
+    __get_user(npc, &sf->info.si_regs.npc);
 
-        if ((pc | npc) & 3)
-                goto segv_and_exit;
+    if ((pc | npc) & 3) {
+        goto segv_and_exit;
+    }
 
-        /* 2. Restore the state */
-        __get_user(up_psr, &sf->info.si_regs.psr);
+    /* 2. Restore the state */
+    __get_user(up_psr, &sf->info.si_regs.psr);
 
-        /* User can only change condition codes and FPU enabling in %psr. */
-        env->psr = (up_psr & (PSR_ICC /* | PSR_EF */))
-                  | (env->psr & ~(PSR_ICC /* | PSR_EF */));
+    /* User can only change condition codes in %psr; PSR_EF is preserved. */
+    env->psr = (up_psr & (PSR_ICC /* | PSR_EF */))
+            | (env->psr & ~(PSR_ICC /* | PSR_EF */));
 
-	env->pc = pc;
-	env->npc = npc;
-        __get_user(env->y, &sf->info.si_regs.y);
-	for (i=0; i < 8; i++) {
-		__get_user(env->gregs[i], &sf->info.si_regs.u_regs[i]);
-	}
-	for (i=0; i < 8; i++) {
-		__get_user(env->regwptr[i + UREG_I0], &sf->info.si_regs.u_regs[i+8]);
-	}
+    env->pc = pc;
+    env->npc = npc;
+    __get_user(env->y, &sf->info.si_regs.y);
+    for (i=0; i < 8; i++) {
+        __get_user(env->gregs[i], &sf->info.si_regs.u_regs[i]);
+    }
+    for (i=0; i < 8; i++) {
+        __get_user(env->regwptr[i + UREG_I0], &sf->info.si_regs.u_regs[i+8]);
+    }
 
-        /* FIXME: implement FPU save/restore:
+    /* FIXME: implement FPU save/restore:
          * __get_user(fpu_save, &sf->fpu_save);
          * if (fpu_save)
          *        err |= restore_fpu_state(env, fpu_save);
          */
 
-        /* This is pretty much atomic, no amount locking would prevent
+    /* This is pretty much atomic, no amount of locking would prevent
          * the races which exist anyways.
          */
-        __get_user(set.sig[0], &sf->info.si_mask);
-        for(i = 1; i < TARGET_NSIG_WORDS; i++) {
-            __get_user(set.sig[i], &sf->extramask[i - 1]);
-        }
+    __get_user(set.sig[0], &sf->info.si_mask);
+    for(i = 1; i < TARGET_NSIG_WORDS; i++) {
+        __get_user(set.sig[i], &sf->extramask[i - 1]);
+    }
 
-        target_to_host_sigset_internal(&host_set, &set);
-        do_sigprocmask(SIG_SETMASK, &host_set, NULL);
+    target_to_host_sigset_internal(&host_set, &set);
+    do_sigprocmask(SIG_SETMASK, &host_set, NULL);
 
-        if (err)
-                goto segv_and_exit;
-        unlock_user_struct(sf, sf_addr, 0);
-        return env->regwptr[0];
+    if (err) {
+        goto segv_and_exit;
+    }
+    unlock_user_struct(sf, sf_addr, 0);
+    return -TARGET_QEMU_ESIGRETURN;
 
 segv_and_exit:
-        unlock_user_struct(sf, sf_addr, 0);
-	force_sig(TARGET_SIGSEGV);
+    unlock_user_struct(sf, sf_addr, 0);
+    force_sig(TARGET_SIGSEGV);
 }
 
 long do_rt_sigreturn(CPUSPARCState *env)
@@ -2522,13 +2552,15 @@ void sparc64_set_context(CPUSPARCState *env)
     unsigned int i;
 
     ucp_addr = env->regwptr[UREG_I0];
-    if (!lock_user_struct(VERIFY_READ, ucp, ucp_addr, 1))
+    if (!lock_user_struct(VERIFY_READ, ucp, ucp_addr, 1)) {
         goto do_sigsegv;
+    }
     grp  = &ucp->tuc_mcontext.mc_gregs;
     __get_user(pc, &((*grp)[MC_PC]));
     __get_user(npc, &((*grp)[MC_NPC]));
-    if ((pc | npc) & 3)
+    if ((pc | npc) & 3) {
         goto do_sigsegv;
+    }
     if (env->regwptr[UREG_I1]) {
         target_sigset_t target_set;
         sigset_t set;
@@ -2573,12 +2605,14 @@ void sparc64_set_context(CPUSPARCState *env)
     __get_user(i7, &(ucp->tuc_mcontext.mc_i7));
 
     w_addr = TARGET_STACK_BIAS+env->regwptr[UREG_I6];
-    if (put_user(fp, w_addr + offsetof(struct target_reg_window, ins[6]), 
-                 abi_ulong) != 0)
+    if (put_user(fp, w_addr + offsetof(struct target_reg_window, ins[6]),
+                 abi_ulong) != 0) {
         goto do_sigsegv;
-    if (put_user(i7, w_addr + offsetof(struct target_reg_window, ins[7]), 
-                 abi_ulong) != 0)
+    }
+    if (put_user(i7, w_addr + offsetof(struct target_reg_window, ins[7]),
+                 abi_ulong) != 0) {
         goto do_sigsegv;
+    }
     /* FIXME this does not match how the kernel handles the FPU in
      * its sparc64_set_context implementation. In particular the FPU
      * is only restored if fenab is non-zero in:
@@ -2601,7 +2635,7 @@ void sparc64_set_context(CPUSPARCState *env)
                &(ucp->tuc_mcontext.mc_fpregs.mcfpu_gsr));
     unlock_user_struct(ucp, ucp_addr, 0);
     return;
- do_sigsegv:
+do_sigsegv:
     unlock_user_struct(ucp, ucp_addr, 0);
     force_sig(TARGET_SIGSEGV);
 }
@@ -2619,8 +2653,9 @@ void sparc64_get_context(CPUSPARCState *env)
     sigset_t set;
 
     ucp_addr = env->regwptr[UREG_I0];
-    if (!lock_user_struct(VERIFY_WRITE, ucp, ucp_addr, 0))
+    if (!lock_user_struct(VERIFY_WRITE, ucp, ucp_addr, 0)) {
         goto do_sigsegv;
+    }
     
     mcp = &ucp->tuc_mcontext;
     grp = &mcp->mc_gregs;
@@ -2670,12 +2705,14 @@ void sparc64_get_context(CPUSPARCState *env)
 
     w_addr = TARGET_STACK_BIAS+env->regwptr[UREG_I6];
     fp = i7 = 0;
-    if (get_user(fp, w_addr + offsetof(struct target_reg_window, ins[6]), 
-                 abi_ulong) != 0)
+    if (get_user(fp, w_addr + offsetof(struct target_reg_window, ins[6]),
+                 abi_ulong) != 0) {
         goto do_sigsegv;
-    if (get_user(i7, w_addr + offsetof(struct target_reg_window, ins[7]), 
-                 abi_ulong) != 0)
+    }
+    if (get_user(i7, w_addr + offsetof(struct target_reg_window, ins[7]),
+                 abi_ulong) != 0) {
         goto do_sigsegv;
+    }
     __put_user(fp, &(mcp->mc_fp));
     __put_user(i7, &(mcp->mc_i7));
 
@@ -2697,7 +2734,7 @@ void sparc64_get_context(CPUSPARCState *env)
         goto do_sigsegv;
     unlock_user_struct(ucp, ucp_addr, 1);
     return;
- do_sigsegv:
+do_sigsegv:
     unlock_user_struct(ucp, ucp_addr, 1);
     force_sig(TARGET_SIGSEGV);
 }
@@ -2787,7 +2824,7 @@ static inline int install_sigtramp(unsigned int *tramp,   unsigned int syscall)
 }
 
 static inline void setup_sigcontext(CPUMIPSState *regs,
-        struct target_sigcontext *sc)
+                                    struct target_sigcontext *sc)
 {
     int i;
 
@@ -2899,8 +2936,9 @@ static void setup_frame(int sig, struct target_sigaction * ka,
 
     frame_addr = get_sigframe(ka, regs, sizeof(*frame));
     trace_user_setup_frame(regs, frame_addr);
-    if (!lock_user_struct(VERIFY_WRITE, frame, frame_addr, 0))
-	goto give_sigsegv;
+    if (!lock_user_struct(VERIFY_WRITE, frame, frame_addr, 0)) {
+        goto give_sigsegv;
+    }
 
     install_sigtramp(frame->sf_code, TARGET_NR_sigreturn);
 
@@ -2948,7 +2986,7 @@ long do_sigreturn(CPUMIPSState *regs)
     frame_addr = regs->active_tc.gpr[29];
     trace_user_do_sigreturn(regs, frame_addr);
     if (!lock_user_struct(VERIFY_READ, frame, frame_addr, 1))
-   	goto badframe;
+        goto badframe;
 
     for(i = 0; i < TARGET_NSIG_WORDS; i++) {
         __get_user(target_set.sig[i], &frame->sf_mask.sig[i]);
@@ -2994,8 +3032,9 @@ static void setup_rt_frame(int sig, struct target_sigaction *ka,
 
     frame_addr = get_sigframe(ka, env, sizeof(*frame));
     trace_user_setup_rt_frame(env, frame_addr);
-    if (!lock_user_struct(VERIFY_WRITE, frame, frame_addr, 0))
-	goto give_sigsegv;
+    if (!lock_user_struct(VERIFY_WRITE, frame, frame_addr, 0)) {
+        goto give_sigsegv;
+    }
 
     install_sigtramp(frame->rs_code, TARGET_NR_rt_sigreturn);
 
@@ -3053,8 +3092,9 @@ long do_rt_sigreturn(CPUMIPSState *env)
 
     frame_addr = env->active_tc.gpr[29];
     trace_user_do_rt_sigreturn(env, frame_addr);
-    if (!lock_user_struct(VERIFY_READ, frame, frame_addr, 1))
-   	goto badframe;
+    if (!lock_user_struct(VERIFY_READ, frame, frame_addr, 1)) {
+        goto badframe;
+    }
 
     target_to_host_sigset(&blocked, &frame->rs_uc.tuc_sigmask);
     do_sigprocmask(SIG_SETMASK, &blocked, NULL);
@@ -3062,8 +3102,8 @@ long do_rt_sigreturn(CPUMIPSState *env)
     restore_sigcontext(env, &frame->rs_uc.tuc_mcontext);
 
     if (do_sigaltstack(frame_addr +
-		       offsetof(struct target_rt_sigframe, rs_uc.tuc_stack),
-		       0, get_sp_from_cpustate(env)) == -EFAULT)
+                       offsetof(struct target_rt_sigframe, rs_uc.tuc_stack),
+                       0, get_sp_from_cpustate(env)) == -EFAULT)
         goto badframe;
 
     env->active_tc.PC = env->CP0_EPC;
@@ -3134,7 +3174,7 @@ struct target_rt_sigframe
 #define TRAP_NOARG 0xc310         /* Syscall w/no args (NR in R3) SH3/4 */
 
 static abi_ulong get_sigframe(struct target_sigaction *ka,
-                         unsigned long sp, size_t frame_size)
+                              unsigned long sp, size_t frame_size)
 {
     if ((ka->sa_flags & TARGET_SA_ONSTACK) && (sas_ss_flags(sp) == 0)) {
         sp = target_sigaltstack_used.ss_sp + target_sigaltstack_used.ss_size;
@@ -3144,7 +3184,7 @@ static abi_ulong get_sigframe(struct target_sigaction *ka,
 }
 
 static void setup_sigcontext(struct target_sigcontext *sc,
-                            CPUSH4State *regs, unsigned long mask)
+                             CPUSH4State *regs, unsigned long mask)
 {
     int i;
 
@@ -3172,13 +3212,12 @@ static void setup_sigcontext(struct target_sigcontext *sc,
     __put_user(mask, &sc->oldmask);
 }
 
-static void restore_sigcontext(CPUSH4State *regs, struct target_sigcontext *sc,
-                              target_ulong *r0_p)
+static void restore_sigcontext(CPUSH4State *regs, struct target_sigcontext *sc)
 {
     int i;
 
 #define COPY(x)         __get_user(regs->x, &sc->sc_##x)
-    COPY(gregs[1]);
+    COPY(gregs[0]); COPY(gregs[1]);
     COPY(gregs[2]); COPY(gregs[3]);
     COPY(gregs[4]); COPY(gregs[5]);
     COPY(gregs[6]); COPY(gregs[7]);
@@ -3198,7 +3237,6 @@ static void restore_sigcontext(CPUSH4State *regs, struct target_sigcontext *sc,
     __get_user(regs->fpul, &sc->sc_fpul);
 
     regs->tra = -1;         /* disable syscall checks */
-    __get_user(*r0_p, &sc->sc_gregs[0]);
 }
 
 static void setup_frame(int sig, struct target_sigaction *ka,
@@ -3210,8 +3248,9 @@ static void setup_frame(int sig, struct target_sigaction *ka,
 
     frame_addr = get_sigframe(ka, regs->gregs[15], sizeof(*frame));
     trace_user_setup_frame(regs, frame_addr);
-    if (!lock_user_struct(VERIFY_WRITE, frame, frame_addr, 0))
-	goto give_sigsegv;
+    if (!lock_user_struct(VERIFY_WRITE, frame, frame_addr, 0)) {
+        goto give_sigsegv;
+    }
 
     setup_sigcontext(&frame->sc, regs, set->sig[0]);
 
@@ -3258,8 +3297,9 @@ static void setup_rt_frame(int sig, struct target_sigaction *ka,
 
     frame_addr = get_sigframe(ka, regs->gregs[15], sizeof(*frame));
     trace_user_setup_rt_frame(regs, frame_addr);
-    if (!lock_user_struct(VERIFY_WRITE, frame, frame_addr, 0))
-	goto give_sigsegv;
+    if (!lock_user_struct(VERIFY_WRITE, frame, frame_addr, 0)) {
+        goto give_sigsegv;
+    }
 
     tswap_siginfo(&frame->info, info);
 
@@ -3273,7 +3313,7 @@ static void setup_rt_frame(int sig, struct target_sigaction *ka,
     __put_user(target_sigaltstack_used.ss_size,
                &frame->uc.tuc_stack.ss_size);
     setup_sigcontext(&frame->uc.tuc_mcontext,
-			    regs, set->sig[0]);
+                     regs, set->sig[0]);
     for(i = 0; i < TARGET_NSIG_WORDS; i++) {
         __put_user(set->sig[i], &frame->uc.tuc_sigmask.sig[i]);
     }
@@ -3313,14 +3353,14 @@ long do_sigreturn(CPUSH4State *regs)
     abi_ulong frame_addr;
     sigset_t blocked;
     target_sigset_t target_set;
-    target_ulong r0;
     int i;
     int err = 0;
 
     frame_addr = regs->gregs[15];
     trace_user_do_sigreturn(regs, frame_addr);
-    if (!lock_user_struct(VERIFY_READ, frame, frame_addr, 1))
-   	goto badframe;
+    if (!lock_user_struct(VERIFY_READ, frame, frame_addr, 1)) {
+        goto badframe;
+    }
 
     __get_user(target_set.sig[0], &frame->sc.oldmask);
     for(i = 1; i < TARGET_NSIG_WORDS; i++) {
@@ -3333,10 +3373,10 @@ long do_sigreturn(CPUSH4State *regs)
     target_to_host_sigset_internal(&blocked, &target_set);
     do_sigprocmask(SIG_SETMASK, &blocked, NULL);
 
-    restore_sigcontext(regs, &frame->sc, &r0);
+    restore_sigcontext(regs, &frame->sc);
 
     unlock_user_struct(frame, frame_addr, 0);
-    return r0;
+    return -TARGET_QEMU_ESIGRETURN;
 
 badframe:
     unlock_user_struct(frame, frame_addr, 0);
@@ -3349,25 +3389,26 @@ long do_rt_sigreturn(CPUSH4State *regs)
     struct target_rt_sigframe *frame;
     abi_ulong frame_addr;
     sigset_t blocked;
-    target_ulong r0;
 
     frame_addr = regs->gregs[15];
     trace_user_do_rt_sigreturn(regs, frame_addr);
-    if (!lock_user_struct(VERIFY_READ, frame, frame_addr, 1))
-   	goto badframe;
+    if (!lock_user_struct(VERIFY_READ, frame, frame_addr, 1)) {
+        goto badframe;
+    }
 
     target_to_host_sigset(&blocked, &frame->uc.tuc_sigmask);
     do_sigprocmask(SIG_SETMASK, &blocked, NULL);
 
-    restore_sigcontext(regs, &frame->uc.tuc_mcontext, &r0);
+    restore_sigcontext(regs, &frame->uc.tuc_mcontext);
 
     if (do_sigaltstack(frame_addr +
-		       offsetof(struct target_rt_sigframe, uc.tuc_stack),
-		       0, get_sp_from_cpustate(regs)) == -EFAULT)
+                       offsetof(struct target_rt_sigframe, uc.tuc_stack),
+                       0, get_sp_from_cpustate(regs)) == -EFAULT) {
         goto badframe;
+    }
 
     unlock_user_struct(frame, frame_addr, 0);
-    return r0;
+    return -TARGET_QEMU_ESIGRETURN;
 
 badframe:
     unlock_user_struct(frame, frame_addr, 0);
@@ -3532,7 +3573,8 @@ static void setup_frame(int sig, struct target_sigaction *ka,
 
         /* Return from sighandler will jump to the tramp.
            Negative 8 offset because return is rtsd r15, 8 */
-        env->regs[15] = ((unsigned long)frame->tramp) - 8;
+        env->regs[15] = frame_addr + offsetof(struct target_signal_frame, tramp)
+                                   - 8;
     }
 
     /* Set up registers for signal handler */
@@ -3548,7 +3590,7 @@ static void setup_frame(int sig, struct target_sigaction *ka,
 
     unlock_user_struct(frame, frame_addr, 1);
     return;
-  badframe:
+badframe:
     force_sig(TARGET_SIGSEGV);
 }
 
@@ -3576,7 +3618,7 @@ long do_sigreturn(CPUMBState *env)
     /* Restore blocked signals */
     __get_user(target_set.sig[0], &frame->uc.tuc_mcontext.oldmask);
     for(i = 1; i < TARGET_NSIG_WORDS; i++) {
-       __get_user(target_set.sig[i], &frame->extramask[i - 1]);
+        __get_user(target_set.sig[i], &frame->extramask[i - 1]);
     }
     target_to_host_sigset_internal(&set, &target_set);
     do_sigprocmask(SIG_SETMASK, &set, NULL);
@@ -3585,10 +3627,10 @@ long do_sigreturn(CPUMBState *env)
     /* We got here through a sigreturn syscall, our path back is via an
        rtb insn so setup r14 for that.  */
     env->regs[14] = env->sregs[SR_PC];
- 
+
     unlock_user_struct(frame, frame_addr, 0);
-    return env->regs[10];
-  badframe:
+    return -TARGET_QEMU_ESIGRETURN;
+badframe:
     force_sig(TARGET_SIGSEGV);
 }
 
@@ -3602,124 +3644,124 @@ long do_rt_sigreturn(CPUMBState *env)
 #elif defined(TARGET_CRIS)
 
 struct target_sigcontext {
-        struct target_pt_regs regs;  /* needs to be first */
-        uint32_t oldmask;
-        uint32_t usp;    /* usp before stacking this gunk on it */
+    struct target_pt_regs regs;  /* needs to be first */
+    uint32_t oldmask;
+    uint32_t usp;    /* usp before stacking this gunk on it */
 };
 
 /* Signal frames. */
 struct target_signal_frame {
-        struct target_sigcontext sc;
-        uint32_t extramask[TARGET_NSIG_WORDS - 1];
-        uint16_t retcode[4];      /* Trampoline code. */
+    struct target_sigcontext sc;
+    uint32_t extramask[TARGET_NSIG_WORDS - 1];
+    uint16_t retcode[4];      /* Trampoline code. */
 };
 
 struct rt_signal_frame {
-        siginfo_t *pinfo;
-        void *puc;
-        siginfo_t info;
-        struct ucontext uc;
-        uint16_t retcode[4];      /* Trampoline code. */
+    siginfo_t *pinfo;
+    void *puc;
+    siginfo_t info;
+    struct ucontext uc;
+    uint16_t retcode[4];      /* Trampoline code. */
 };
 
 static void setup_sigcontext(struct target_sigcontext *sc, CPUCRISState *env)
 {
-	__put_user(env->regs[0], &sc->regs.r0);
-	__put_user(env->regs[1], &sc->regs.r1);
-	__put_user(env->regs[2], &sc->regs.r2);
-	__put_user(env->regs[3], &sc->regs.r3);
-	__put_user(env->regs[4], &sc->regs.r4);
-	__put_user(env->regs[5], &sc->regs.r5);
-	__put_user(env->regs[6], &sc->regs.r6);
-	__put_user(env->regs[7], &sc->regs.r7);
-	__put_user(env->regs[8], &sc->regs.r8);
-	__put_user(env->regs[9], &sc->regs.r9);
-	__put_user(env->regs[10], &sc->regs.r10);
-	__put_user(env->regs[11], &sc->regs.r11);
-	__put_user(env->regs[12], &sc->regs.r12);
-	__put_user(env->regs[13], &sc->regs.r13);
-	__put_user(env->regs[14], &sc->usp);
-	__put_user(env->regs[15], &sc->regs.acr);
-	__put_user(env->pregs[PR_MOF], &sc->regs.mof);
-	__put_user(env->pregs[PR_SRP], &sc->regs.srp);
-	__put_user(env->pc, &sc->regs.erp);
+    __put_user(env->regs[0], &sc->regs.r0);
+    __put_user(env->regs[1], &sc->regs.r1);
+    __put_user(env->regs[2], &sc->regs.r2);
+    __put_user(env->regs[3], &sc->regs.r3);
+    __put_user(env->regs[4], &sc->regs.r4);
+    __put_user(env->regs[5], &sc->regs.r5);
+    __put_user(env->regs[6], &sc->regs.r6);
+    __put_user(env->regs[7], &sc->regs.r7);
+    __put_user(env->regs[8], &sc->regs.r8);
+    __put_user(env->regs[9], &sc->regs.r9);
+    __put_user(env->regs[10], &sc->regs.r10);
+    __put_user(env->regs[11], &sc->regs.r11);
+    __put_user(env->regs[12], &sc->regs.r12);
+    __put_user(env->regs[13], &sc->regs.r13);
+    __put_user(env->regs[14], &sc->usp);
+    __put_user(env->regs[15], &sc->regs.acr);
+    __put_user(env->pregs[PR_MOF], &sc->regs.mof);
+    __put_user(env->pregs[PR_SRP], &sc->regs.srp);
+    __put_user(env->pc, &sc->regs.erp);
 }
 
 static void restore_sigcontext(struct target_sigcontext *sc, CPUCRISState *env)
 {
-	__get_user(env->regs[0], &sc->regs.r0);
-	__get_user(env->regs[1], &sc->regs.r1);
-	__get_user(env->regs[2], &sc->regs.r2);
-	__get_user(env->regs[3], &sc->regs.r3);
-	__get_user(env->regs[4], &sc->regs.r4);
-	__get_user(env->regs[5], &sc->regs.r5);
-	__get_user(env->regs[6], &sc->regs.r6);
-	__get_user(env->regs[7], &sc->regs.r7);
-	__get_user(env->regs[8], &sc->regs.r8);
-	__get_user(env->regs[9], &sc->regs.r9);
-	__get_user(env->regs[10], &sc->regs.r10);
-	__get_user(env->regs[11], &sc->regs.r11);
-	__get_user(env->regs[12], &sc->regs.r12);
-	__get_user(env->regs[13], &sc->regs.r13);
-	__get_user(env->regs[14], &sc->usp);
-	__get_user(env->regs[15], &sc->regs.acr);
-	__get_user(env->pregs[PR_MOF], &sc->regs.mof);
-	__get_user(env->pregs[PR_SRP], &sc->regs.srp);
-	__get_user(env->pc, &sc->regs.erp);
+    __get_user(env->regs[0], &sc->regs.r0);
+    __get_user(env->regs[1], &sc->regs.r1);
+    __get_user(env->regs[2], &sc->regs.r2);
+    __get_user(env->regs[3], &sc->regs.r3);
+    __get_user(env->regs[4], &sc->regs.r4);
+    __get_user(env->regs[5], &sc->regs.r5);
+    __get_user(env->regs[6], &sc->regs.r6);
+    __get_user(env->regs[7], &sc->regs.r7);
+    __get_user(env->regs[8], &sc->regs.r8);
+    __get_user(env->regs[9], &sc->regs.r9);
+    __get_user(env->regs[10], &sc->regs.r10);
+    __get_user(env->regs[11], &sc->regs.r11);
+    __get_user(env->regs[12], &sc->regs.r12);
+    __get_user(env->regs[13], &sc->regs.r13);
+    __get_user(env->regs[14], &sc->usp);
+    __get_user(env->regs[15], &sc->regs.acr);
+    __get_user(env->pregs[PR_MOF], &sc->regs.mof);
+    __get_user(env->pregs[PR_SRP], &sc->regs.srp);
+    __get_user(env->pc, &sc->regs.erp);
 }
 
 static abi_ulong get_sigframe(CPUCRISState *env, int framesize)
 {
-	abi_ulong sp;
-	/* Align the stack downwards to 4.  */
-	sp = (env->regs[R_SP] & ~3);
-	return sp - framesize;
+    abi_ulong sp;
+    /* Align the stack downwards to 4.  */
+    sp = (env->regs[R_SP] & ~3);
+    return sp - framesize;
 }
 
 static void setup_frame(int sig, struct target_sigaction *ka,
                         target_sigset_t *set, CPUCRISState *env)
 {
-	struct target_signal_frame *frame;
-	abi_ulong frame_addr;
-	int i;
-
-	frame_addr = get_sigframe(env, sizeof *frame);
-        trace_user_setup_frame(env, frame_addr);
-	if (!lock_user_struct(VERIFY_WRITE, frame, frame_addr, 0))
-		goto badframe;
-
-	/*
-	 * The CRIS signal return trampoline. A real linux/CRIS kernel doesn't
-	 * use this trampoline anymore but it sets it up for GDB.
-	 * In QEMU, using the trampoline simplifies things a bit so we use it.
-	 *
-	 * This is movu.w __NR_sigreturn, r9; break 13;
-	 */
+    struct target_signal_frame *frame;
+    abi_ulong frame_addr;
+    int i;
+
+    frame_addr = get_sigframe(env, sizeof *frame);
+    trace_user_setup_frame(env, frame_addr);
+    if (!lock_user_struct(VERIFY_WRITE, frame, frame_addr, 0))
+        goto badframe;
+
+    /*
+     * The CRIS signal return trampoline. A real linux/CRIS kernel doesn't
+     * use this trampoline anymore but it sets it up for GDB.
+     * In QEMU, using the trampoline simplifies things a bit so we use it.
+     *
+     * This is movu.w __NR_sigreturn, r9; break 13;
+     */
     __put_user(0x9c5f, frame->retcode+0);
     __put_user(TARGET_NR_sigreturn,
                frame->retcode + 1);
     __put_user(0xe93d, frame->retcode + 2);
 
-	/* Save the mask.  */
+    /* Save the mask.  */
     __put_user(set->sig[0], &frame->sc.oldmask);
 
     for(i = 1; i < TARGET_NSIG_WORDS; i++) {
         __put_user(set->sig[i], &frame->extramask[i - 1]);
     }
 
-	setup_sigcontext(&frame->sc, env);
+    setup_sigcontext(&frame->sc, env);
 
-	/* Move the stack and setup the arguments for the handler.  */
-	env->regs[R_SP] = frame_addr;
-	env->regs[10] = sig;
-	env->pc = (unsigned long) ka->_sa_handler;
-	/* Link SRP so the guest returns through the trampoline.  */
-	env->pregs[PR_SRP] = frame_addr + offsetof(typeof(*frame), retcode);
+    /* Move the stack and setup the arguments for the handler.  */
+    env->regs[R_SP] = frame_addr;
+    env->regs[10] = sig;
+    env->pc = (unsigned long) ka->_sa_handler;
+    /* Link SRP so the guest returns through the trampoline.  */
+    env->pregs[PR_SRP] = frame_addr + offsetof(typeof(*frame), retcode);
 
-	unlock_user_struct(frame, frame_addr, 1);
-	return;
-  badframe:
-	force_sig(TARGET_SIGSEGV);
+    unlock_user_struct(frame, frame_addr, 1);
+    return;
+badframe:
+    force_sig(TARGET_SIGSEGV);
 }
 
 static void setup_rt_frame(int sig, struct target_sigaction *ka,
@@ -3731,31 +3773,32 @@ static void setup_rt_frame(int sig, struct target_sigaction *ka,
 
 long do_sigreturn(CPUCRISState *env)
 {
-	struct target_signal_frame *frame;
-	abi_ulong frame_addr;
-	target_sigset_t target_set;
-	sigset_t set;
-	int i;
+    struct target_signal_frame *frame;
+    abi_ulong frame_addr;
+    target_sigset_t target_set;
+    sigset_t set;
+    int i;
 
-	frame_addr = env->regs[R_SP];
-        trace_user_do_sigreturn(env, frame_addr);
-	/* Make sure the guest isn't playing games.  */
-	if (!lock_user_struct(VERIFY_WRITE, frame, frame_addr, 1))
-		goto badframe;
+    frame_addr = env->regs[R_SP];
+    trace_user_do_sigreturn(env, frame_addr);
+    /* Make sure the guest isn't playing games.  */
+    if (!lock_user_struct(VERIFY_WRITE, frame, frame_addr, 1)) {
+        goto badframe;
+    }
 
-	/* Restore blocked signals */
+    /* Restore blocked signals */
     __get_user(target_set.sig[0], &frame->sc.oldmask);
-	for(i = 1; i < TARGET_NSIG_WORDS; i++) {
+    for(i = 1; i < TARGET_NSIG_WORDS; i++) {
         __get_user(target_set.sig[i], &frame->extramask[i - 1]);
-	}
-	target_to_host_sigset_internal(&set, &target_set);
-        do_sigprocmask(SIG_SETMASK, &set, NULL);
+    }
+    target_to_host_sigset_internal(&set, &target_set);
+    do_sigprocmask(SIG_SETMASK, &set, NULL);
 
-	restore_sigcontext(&frame->sc, env);
-	unlock_user_struct(frame, frame_addr, 0);
-	return env->regs[10];
-  badframe:
-	force_sig(TARGET_SIGSEGV);
+    restore_sigcontext(&frame->sc, env);
+    unlock_user_struct(frame, frame_addr, 0);
+    return -TARGET_QEMU_ESIGRETURN;
+badframe:
+    force_sig(TARGET_SIGSEGV);
 }
 
 long do_rt_sigreturn(CPUCRISState *env)
@@ -3841,8 +3884,8 @@ badframe:
 /* Set up a signal frame.  */
 
 static void setup_sigcontext(struct target_sigcontext *sc,
-                            CPUOpenRISCState *regs,
-                            unsigned long mask)
+                             CPUOpenRISCState *regs,
+                             unsigned long mask)
 {
     unsigned long usp = regs->gpr[1];
 
@@ -4100,7 +4143,7 @@ static void setup_frame(int sig, struct target_sigaction *ka,
     frame_addr = get_sigframe(ka, env, sizeof(*frame));
     trace_user_setup_frame(env, frame_addr);
     if (!lock_user_struct(VERIFY_WRITE, frame, frame_addr, 0)) {
-            goto give_sigsegv;
+        goto give_sigsegv;
     }
 
     __put_user(set->sig[0], &frame->sc.oldmask[0]);
@@ -4113,13 +4156,13 @@ static void setup_frame(int sig, struct target_sigaction *ka,
     /* Set up to return from userspace.  If provided, use a stub
        already in userspace.  */
     if (ka->sa_flags & TARGET_SA_RESTORER) {
-            env->regs[14] = (unsigned long)
-                    ka->sa_restorer | PSW_ADDR_AMODE;
+        env->regs[14] = (unsigned long)
+                ka->sa_restorer | PSW_ADDR_AMODE;
     } else {
-            env->regs[14] = (unsigned long)
-                    frame->retcode | PSW_ADDR_AMODE;
-            __put_user(S390_SYSCALL_OPCODE | TARGET_NR_sigreturn,
-                       (uint16_t *)(frame->retcode));
+        env->regs[14] = (frame_addr + offsetof(sigframe, retcode))
+                        | PSW_ADDR_AMODE;
+        __put_user(S390_SYSCALL_OPCODE | TARGET_NR_sigreturn,
+                   (uint16_t *)(frame->retcode));
     }
 
     /* Set up backchain. */
@@ -4167,12 +4210,12 @@ static void setup_rt_frame(int sig, struct target_sigaction *ka,
     __put_user((abi_ulong)0, (abi_ulong *)&frame->uc.tuc_link);
     __put_user(target_sigaltstack_used.ss_sp, &frame->uc.tuc_stack.ss_sp);
     __put_user(sas_ss_flags(get_sp_from_cpustate(env)),
-                      &frame->uc.tuc_stack.ss_flags);
+               &frame->uc.tuc_stack.ss_flags);
     __put_user(target_sigaltstack_used.ss_size, &frame->uc.tuc_stack.ss_size);
     save_sigregs(env, &frame->uc.tuc_mcontext);
     for (i = 0; i < TARGET_NSIG_WORDS; i++) {
         __put_user((abi_ulong)set->sig[i],
-        (abi_ulong *)&frame->uc.tuc_sigmask.sig[i]);
+                   (abi_ulong *)&frame->uc.tuc_sigmask.sig[i]);
     }
 
     /* Set up to return from userspace.  If provided, use a stub
@@ -4248,7 +4291,7 @@ long do_sigreturn(CPUS390XState *env)
     }
 
     unlock_user_struct(frame, frame_addr, 0);
-    return env->regs[2];
+    return -TARGET_QEMU_ESIGRETURN;
 
 badframe:
     force_sig(TARGET_SIGSEGV);
@@ -4278,7 +4321,7 @@ long do_rt_sigreturn(CPUS390XState *env)
         goto badframe;
     }
     unlock_user_struct(frame, frame_addr, 0);
-    return env->regs[2];
+    return -TARGET_QEMU_ESIGRETURN;
 
 badframe:
     unlock_user_struct(frame, frame_addr, 0);
@@ -4423,15 +4466,15 @@ struct target_sigframe {
 #define TARGET_TRAMP_SIZE 6
 
 struct target_rt_sigframe {
-        /* sys_rt_sigreturn requires the ucontext be the first field */
-        struct target_ucontext uc;
-        target_ulong  _unused[2];
-        uint32_t trampoline[TARGET_TRAMP_SIZE];
-        target_ulong pinfo; /* struct siginfo __user * */
-        target_ulong puc; /* void __user * */
-        struct target_siginfo info;
-        /* 64 bit ABI allows for 288 bytes below sp before decrementing it. */
-        char abigap[288];
+    /* sys_rt_sigreturn requires the ucontext be the first field */
+    struct target_ucontext uc;
+    target_ulong  _unused[2];
+    uint32_t trampoline[TARGET_TRAMP_SIZE];
+    target_ulong pinfo; /* struct siginfo __user * */
+    target_ulong puc; /* void __user * */
+    struct target_siginfo info;
+    /* 64 bit ABI allows for 288 bytes below sp before decrementing it. */
+    char abigap[288];
 } __attribute__((aligned(16)));
 
 #else
@@ -4466,7 +4509,7 @@ static target_ulong get_sigframe(struct target_sigaction *ka,
     oldsp = env->gpr[1];
 
     if ((ka->sa_flags & TARGET_SA_ONSTACK) &&
-        (sas_ss_flags(oldsp) == 0)) {
+            (sas_ss_flags(oldsp) == 0)) {
         oldsp = (target_sigaltstack_used.ss_sp
                  + target_sigaltstack_used.ss_size);
     }
@@ -4488,7 +4531,7 @@ static void save_user_regs(CPUPPCState *env, struct target_mcontext *frame)
 
     /* Save general registers.  */
     for (i = 0; i < ARRAY_SIZE(env->gpr); i++) {
-       __put_user(env->gpr[i], &frame->mc_gregs[i]);
+        __put_user(env->gpr[i], &frame->mc_gregs[i]);
     }
     __put_user(env->nip, &frame->mc_gregs[TARGET_PT_NIP]);
     __put_user(env->ctr, &frame->mc_gregs[TARGET_PT_CTR]);
@@ -4589,7 +4632,7 @@ static void restore_user_regs(CPUPPCState *env,
 
     /* If doing signal return, restore the previous little-endian mode.  */
     if (sig)
-        env->msr = (env->msr & ~MSR_LE) | (msr & MSR_LE);
+        env->msr = (env->msr & ~(1ull << MSR_LE)) | (msr & (1ull << MSR_LE));
 
     /* Restore Altivec registers if necessary.  */
     if (env->insns_flags & PPC_ALTIVEC) {
@@ -4704,7 +4747,7 @@ static void setup_frame(int sig, struct target_sigaction *ka,
 #endif
 
     /* Signal handlers are entered in big-endian mode.  */
-    env->msr &= ~MSR_LE;
+    env->msr &= ~(1ull << MSR_LE);
 
     unlock_user_struct(frame, frame_addr, 1);
     return;
@@ -4799,7 +4842,7 @@ static void setup_rt_frame(int sig, struct target_sigaction *ka,
 #endif
 
     /* Signal handlers are entered in big-endian mode.  */
-    env->msr &= ~MSR_LE;
+    env->msr &= ~(1ull << MSR_LE);
 
     unlock_user_struct(rt_sf, rt_sf_addr, 1);
     return;
@@ -4925,7 +4968,7 @@ struct target_sigframe
     abi_ulong extramask[TARGET_NSIG_WORDS-1];
     struct target_sigcontext sc;
 };
- 
+
 typedef int target_greg_t;
 #define TARGET_NGREG 18
 typedef target_greg_t target_gregset_t[TARGET_NGREG];
@@ -4964,7 +5007,7 @@ struct target_rt_sigframe
 };
 
 static void setup_sigcontext(struct target_sigcontext *sc, CPUM68KState *env,
-        abi_ulong mask)
+                             abi_ulong mask)
 {
     __put_user(mask, &sc->sc_mask);
     __put_user(env->aregs[7], &sc->sc_usp);
@@ -4977,19 +5020,18 @@ static void setup_sigcontext(struct target_sigcontext *sc, CPUM68KState *env,
 }
 
 static void
-restore_sigcontext(CPUM68KState *env, struct target_sigcontext *sc, int *pd0)
+restore_sigcontext(CPUM68KState *env, struct target_sigcontext *sc)
 {
     int temp;
 
     __get_user(env->aregs[7], &sc->sc_usp);
+    __get_user(env->dregs[0], &sc->sc_d0);
     __get_user(env->dregs[1], &sc->sc_d1);
     __get_user(env->aregs[0], &sc->sc_a0);
     __get_user(env->aregs[1], &sc->sc_a1);
     __get_user(env->pc, &sc->sc_pc);
     __get_user(temp, &sc->sc_sr);
     env->sr = (env->sr & 0xff00) | (temp & 0xff);
-
-    *pd0 = tswapl(sc->sc_d0);
 }
 
 /*
@@ -5022,8 +5064,9 @@ static void setup_frame(int sig, struct target_sigaction *ka,
 
     frame_addr = get_sigframe(ka, env, sizeof *frame);
     trace_user_setup_frame(env, frame_addr);
-    if (!lock_user_struct(VERIFY_WRITE, frame, frame_addr, 0))
-	goto give_sigsegv;
+    if (!lock_user_struct(VERIFY_WRITE, frame, frame_addr, 0)) {
+        goto give_sigsegv;
+    }
 
     __put_user(sig, &frame->sig);
 
@@ -5044,7 +5087,7 @@ static void setup_frame(int sig, struct target_sigaction *ka,
     /* moveq #,d0; trap #0 */
 
     __put_user(0x70004e40 + (TARGET_NR_sigreturn << 16),
-                      (uint32_t *)(frame->retcode));
+               (uint32_t *)(frame->retcode));
 
     /* Set up to return from userspace */
 
@@ -5085,10 +5128,9 @@ static inline int target_rt_setup_ucontext(struct target_ucontext *uc,
 
     return 0;
 }
- 
+
 static inline int target_rt_restore_ucontext(CPUM68KState *env,
-                                             struct target_ucontext *uc,
-                                             int *pd0)
+                                             struct target_ucontext *uc)
 {
     int temp;
     target_greg_t *gregs = uc->tuc_mcontext.gregs;
@@ -5118,7 +5160,6 @@ static inline int target_rt_restore_ucontext(CPUM68KState *env,
     __get_user(temp, &gregs[17]);
     env->sr = (env->sr & 0xff00) | (temp & 0xff);
 
-    *pd0 = env->dregs[0];
     return 0;
 
 badframe:
@@ -5139,8 +5180,9 @@ static void setup_rt_frame(int sig, struct target_sigaction *ka,
 
     frame_addr = get_sigframe(ka, env, sizeof *frame);
     trace_user_setup_rt_frame(env, frame_addr);
-    if (!lock_user_struct(VERIFY_WRITE, frame, frame_addr, 0))
-	goto give_sigsegv;
+    if (!lock_user_struct(VERIFY_WRITE, frame, frame_addr, 0)) {
+        goto give_sigsegv;
+    }
 
     __put_user(sig, &frame->sig);
 
@@ -5159,13 +5201,13 @@ static void setup_rt_frame(int sig, struct target_sigaction *ka,
     __put_user(target_sigaltstack_used.ss_sp,
                &frame->uc.tuc_stack.ss_sp);
     __put_user(sas_ss_flags(env->aregs[7]),
-               &frame->uc.tuc_stack.ss_flags);
+            &frame->uc.tuc_stack.ss_flags);
     __put_user(target_sigaltstack_used.ss_size,
                &frame->uc.tuc_stack.ss_size);
     err |= target_rt_setup_ucontext(&frame->uc, env);
 
     if (err)
-            goto give_sigsegv;
+        goto give_sigsegv;
 
     for(i = 0; i < TARGET_NSIG_WORDS; i++) {
         __put_user(set->sig[i], &frame->uc.tuc_sigmask.sig[i]);
@@ -5204,7 +5246,7 @@ long do_sigreturn(CPUM68KState *env)
     abi_ulong frame_addr = env->aregs[7] - 4;
     target_sigset_t target_set;
     sigset_t set;
-    int d0, i;
+    int i;
 
     trace_user_do_sigreturn(env, frame_addr);
     if (!lock_user_struct(VERIFY_READ, frame, frame_addr, 1))
@@ -5223,10 +5265,10 @@ long do_sigreturn(CPUM68KState *env)
 
     /* restore registers */
 
-    restore_sigcontext(env, &frame->sc, &d0);
+    restore_sigcontext(env, &frame->sc);
 
     unlock_user_struct(frame, frame_addr, 0);
-    return d0;
+    return -TARGET_QEMU_ESIGRETURN;
 
 badframe:
     force_sig(TARGET_SIGSEGV);
@@ -5239,7 +5281,6 @@ long do_rt_sigreturn(CPUM68KState *env)
     abi_ulong frame_addr = env->aregs[7] - 4;
     target_sigset_t target_set;
     sigset_t set;
-    int d0;
 
     trace_user_do_rt_sigreturn(env, frame_addr);
     if (!lock_user_struct(VERIFY_READ, frame, frame_addr, 1))
@@ -5250,7 +5291,7 @@ long do_rt_sigreturn(CPUM68KState *env)
 
     /* restore registers */
 
-    if (target_rt_restore_ucontext(env, &frame->uc, &d0))
+    if (target_rt_restore_ucontext(env, &frame->uc))
         goto badframe;
 
     if (do_sigaltstack(frame_addr +
@@ -5259,7 +5300,7 @@ long do_rt_sigreturn(CPUM68KState *env)
         goto badframe;
 
     unlock_user_struct(frame, frame_addr, 0);
-    return d0;
+    return -TARGET_QEMU_ESIGRETURN;
 
 badframe:
     unlock_user_struct(frame, frame_addr, 0);
@@ -5316,7 +5357,7 @@ struct target_rt_sigframe {
 #define INSN_CALLSYS            0x00000083
 
 static void setup_sigcontext(struct target_sigcontext *sc, CPUAlphaState *env,
-                            abi_ulong frame_addr, target_sigset_t *set)
+                             abi_ulong frame_addr, target_sigset_t *set)
 {
     int i;
 
@@ -5342,7 +5383,7 @@ static void setup_sigcontext(struct target_sigcontext *sc, CPUAlphaState *env,
 }
 
 static void restore_sigcontext(CPUAlphaState *env,
-                              struct target_sigcontext *sc)
+                               struct target_sigcontext *sc)
 {
     uint64_t fpcr;
     int i;
@@ -5402,7 +5443,7 @@ static void setup_frame(int sig, struct target_sigaction *ka,
     unlock_user_struct(frame, frame_addr, 1);
 
     if (err) {
-    give_sigsegv:
+give_sigsegv:
         if (sig == TARGET_SIGSEGV) {
             ka->_sa_handler = TARGET_SIG_DFL;
         }
@@ -5459,8 +5500,8 @@ static void setup_rt_frame(int sig, struct target_sigaction *ka,
     }
 
     if (err) {
-    give_sigsegv:
-       if (sig == TARGET_SIGSEGV) {
+give_sigsegv:
+        if (sig == TARGET_SIGSEGV) {
             ka->_sa_handler = TARGET_SIG_DFL;
         }
         force_sig(TARGET_SIGSEGV);
@@ -5493,9 +5534,9 @@ long do_sigreturn(CPUAlphaState *env)
 
     restore_sigcontext(env, sc);
     unlock_user_struct(sc, sc_addr, 0);
-    return env->ir[IR_V0];
+    return -TARGET_QEMU_ESIGRETURN;
 
- badframe:
+badframe:
     force_sig(TARGET_SIGSEGV);
 }
 
@@ -5520,10 +5561,10 @@ long do_rt_sigreturn(CPUAlphaState *env)
     }
 
     unlock_user_struct(frame, frame_addr, 0);
-    return env->ir[IR_V0];
+    return -TARGET_QEMU_ESIGRETURN;
 
 
- badframe:
+badframe:
     unlock_user_struct(frame, frame_addr, 0);
     force_sig(TARGET_SIGSEGV);
 }
@@ -5559,8 +5600,13 @@ struct target_rt_sigframe {
     unsigned char save_area[16]; /* caller save area */
     struct target_siginfo info;
     struct target_ucontext uc;
+    abi_ulong retcode[2];
 };
 
+#define INSN_MOVELI_R10_139  0x00045fe551483000ULL /* { moveli r10, 139 } */
+#define INSN_SWINT1          0x286b180051485000ULL /* { swint1 } */
+
+
 static void setup_sigcontext(struct target_sigcontext *sc,
                              CPUArchState *env, int signo)
 {
@@ -5636,9 +5682,12 @@ static void setup_rt_frame(int sig, struct target_sigaction *ka,
     __put_user(target_sigaltstack_used.ss_size, &frame->uc.tuc_stack.ss_size);
     setup_sigcontext(&frame->uc.tuc_mcontext, env, info->si_signo);
 
-    restorer = (unsigned long) do_rt_sigreturn;
     if (ka->sa_flags & TARGET_SA_RESTORER) {
-            restorer = (unsigned long) ka->sa_restorer;
+        restorer = (unsigned long) ka->sa_restorer;
+    } else {
+        __put_user(INSN_MOVELI_R10_139, &frame->retcode[0]);
+        __put_user(INSN_SWINT1, &frame->retcode[1]);
+        restorer = frame_addr + offsetof(struct target_rt_sigframe, retcode);
     }
     env->pc = (unsigned long) ka->_sa_handler;
     env->regs[TILEGX_R_SP] = (unsigned long) frame;
@@ -5679,7 +5728,7 @@ long do_rt_sigreturn(CPUTLGState *env)
     }
 
     unlock_user_struct(frame, frame_addr, 0);
-    return env->regs[TILEGX_R_RE];
+    return -TARGET_QEMU_ESIGRETURN;
 
 
  badframe:
@@ -5690,14 +5739,14 @@ long do_rt_sigreturn(CPUTLGState *env)
 #else
 
 static void setup_frame(int sig, struct target_sigaction *ka,
-			target_sigset_t *set, CPUArchState *env)
+                        target_sigset_t *set, CPUArchState *env)
 {
     fprintf(stderr, "setup_frame: not implemented\n");
 }
 
 static void setup_rt_frame(int sig, struct target_sigaction *ka,
                            target_siginfo_t *info,
-			   target_sigset_t *set, CPUArchState *env)
+                           target_sigset_t *set, CPUArchState *env)
 {
     fprintf(stderr, "setup_rt_frame: not implemented\n");
 }
diff --git a/linux-user/sparc/target_signal.h b/linux-user/sparc/target_signal.h
index c7de300cd7..2df38c805f 100644
--- a/linux-user/sparc/target_signal.h
+++ b/linux-user/sparc/target_signal.h
@@ -33,4 +33,5 @@ static inline abi_ulong get_sp_from_cpustate(CPUSPARCState *state)
     return state->regwptr[UREG_FP];
 }
 
+
 #endif /* TARGET_SIGNAL_H */
diff --git a/linux-user/sparc64/target_signal.h b/linux-user/sparc64/target_signal.h
index c7de300cd7..2df38c805f 100644
--- a/linux-user/sparc64/target_signal.h
+++ b/linux-user/sparc64/target_signal.h
@@ -33,4 +33,5 @@ static inline abi_ulong get_sp_from_cpustate(CPUSPARCState *state)
     return state->regwptr[UREG_FP];
 }
 
+
 #endif /* TARGET_SIGNAL_H */
diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index 032d338869..df70255e5f 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -110,6 +110,10 @@ int __clone2(int (*fn)(void *), void *child_stack_base,
     CLONE_PARENT_SETTID | CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID)
 
 //#define DEBUG
+/* Define DEBUG_ERESTARTSYS to force every syscall to be restarted
+ * once. This exercises the codepaths for restart.
+ */
+//#define DEBUG_ERESTARTSYS
 
 //#include <linux/msdos_fs.h>
 #define	VFAT_IOCTL_READDIR_BOTH		_IOR('r', 1, struct linux_dirent [2])
@@ -355,18 +359,6 @@ static int sys_getcwd1(char *buf, size_t size)
   return strlen(buf)+1;
 }
 
-static int sys_openat(int dirfd, const char *pathname, int flags, mode_t mode)
-{
-  /*
-   * open(2) has extra parameter 'mode' when called with
-   * flag O_CREAT.
-   */
-  if ((flags & O_CREAT) != 0) {
-      return (openat(dirfd, pathname, flags, mode));
-  }
-  return (openat(dirfd, pathname, flags));
-}
-
 #ifdef TARGET_NR_utimensat
 #ifdef CONFIG_UTIMENSAT
 static int sys_utimensat(int dirfd, const char *pathname,
@@ -438,15 +430,6 @@ _syscall5(int, sys_ppoll, struct pollfd *, fds, nfds_t, nfds,
           size_t, sigsetsize)
 #endif
 
-#if defined(TARGET_NR_pselect6)
-#ifndef __NR_pselect6
-# define __NR_pselect6 -1
-#endif
-#define __NR_sys_pselect6 __NR_pselect6
-_syscall6(int, sys_pselect6, int, nfds, fd_set *, readfds, fd_set *, writefds,
-          fd_set *, exceptfds, struct timespec *, timeout, void *, sig);
-#endif
-
 #if defined(TARGET_NR_prlimit64)
 #ifndef __NR_prlimit64
 # define __NR_prlimit64 -1
@@ -619,15 +602,19 @@ static uint16_t host_to_target_errno_table[ERRNO_TABLE_SIZE] = {
 
 static inline int host_to_target_errno(int err)
 {
-    if(host_to_target_errno_table[err])
+    if (err >= 0 && err < ERRNO_TABLE_SIZE &&
+        host_to_target_errno_table[err]) {
         return host_to_target_errno_table[err];
+    }
     return err;
 }
 
 static inline int target_to_host_errno(int err)
 {
-    if (target_to_host_errno_table[err])
+    if (err >= 0 && err < ERRNO_TABLE_SIZE &&
+        target_to_host_errno_table[err]) {
         return target_to_host_errno_table[err];
+    }
     return err;
 }
 
@@ -652,6 +639,67 @@ char *target_strerror(int err)
     return strerror(target_to_host_errno(err));
 }
 
+/* safe_syscallN(type, name, t1, a1, ...) defines a static wrapper
+ * safe_<name>() taking N typed arguments and forwarding them, together
+ * with __NR_<name>, to safe_syscall().
+ */
+#define safe_syscall0(type, name) \
+static type safe_##name(void) \
+{ \
+    return safe_syscall(__NR_##name); \
+}
+
+#define safe_syscall1(type, name, t1, a1) \
+static type safe_##name(t1 a1) \
+{ \
+    return safe_syscall(__NR_##name, a1); \
+}
+
+#define safe_syscall2(type, name, t1, a1, t2, a2) \
+static type safe_##name(t1 a1, t2 a2) \
+{ \
+    return safe_syscall(__NR_##name, a1, a2); \
+}
+
+#define safe_syscall3(type, name, t1, a1, t2, a2, t3, a3) \
+static type safe_##name(t1 a1, t2 a2, t3 a3) \
+{ \
+    return safe_syscall(__NR_##name, a1, a2, a3); \
+}
+
+#define safe_syscall4(type, name, t1, a1, t2, a2, t3, a3, t4, a4) \
+static type safe_##name(t1 a1, t2 a2, t3 a3, t4 a4) \
+{ \
+    return safe_syscall(__NR_##name, a1, a2, a3, a4); \
+}
+
+#define safe_syscall5(type, name, t1, a1, t2, a2, t3, a3, t4, a4, t5, a5) \
+static type safe_##name(t1 a1, t2 a2, t3 a3, t4 a4, t5 a5) \
+{ \
+    return safe_syscall(__NR_##name, a1, a2, a3, a4, a5); \
+}
+
+#define safe_syscall6(type, name, t1, a1, t2, a2, t3, a3, t4, a4, t5, a5, \
+                      t6, a6) \
+static type safe_##name(t1 a1, t2 a2, t3 a3, t4 a4, t5 a5, t6 a6) \
+{ \
+    return safe_syscall(__NR_##name, a1, a2, a3, a4, a5, a6); \
+}
+
+safe_syscall3(ssize_t, read, int, fd, void *, buff, size_t, count)
+safe_syscall3(ssize_t, write, int, fd, const void *, buff, size_t, count)
+safe_syscall4(int, openat, int, dirfd, const char *, pathname, \
+              int, flags, mode_t, mode)
+safe_syscall4(pid_t, wait4, pid_t, pid, int *, status, int, options, \
+              struct rusage *, rusage)
+safe_syscall5(int, waitid, idtype_t, idtype, id_t, id, siginfo_t *, infop, \
+              int, options, struct rusage *, rusage)
+safe_syscall3(int, execve, const char *, filename, char **, argv, char **, envp)
+safe_syscall6(int, pselect6, int, nfds, fd_set *, readfds, fd_set *, writefds, \
+              fd_set *, exceptfds, struct timespec *, timeout, void *, sig)
+safe_syscall6(int, futex, int *, uaddr, int, op, int, val, \
+              const struct timespec *, timeout, int *, uaddr2, int, val3)
+
 static inline int host_to_target_sock_type(int host_type)
 {
     int target_type;
@@ -1062,7 +1110,8 @@ static abi_long do_select(int n,
 {
     fd_set rfds, wfds, efds;
     fd_set *rfds_ptr, *wfds_ptr, *efds_ptr;
-    struct timeval tv, *tv_ptr;
+    struct timeval tv;
+    struct timespec ts, *ts_ptr;
     abi_long ret;
 
     ret = copy_from_user_fdset_ptr(&rfds, &rfds_ptr, rfd_addr, n);
@@ -1081,12 +1130,15 @@ static abi_long do_select(int n,
     if (target_tv_addr) {
         if (copy_from_user_timeval(&tv, target_tv_addr))
             return -TARGET_EFAULT;
-        tv_ptr = &tv;
+        ts.tv_sec = tv.tv_sec;
+        ts.tv_nsec = tv.tv_usec * 1000;
+        ts_ptr = &ts;
     } else {
-        tv_ptr = NULL;
+        ts_ptr = NULL;
     }
 
-    ret = get_errno(select(n, rfds_ptr, wfds_ptr, efds_ptr, tv_ptr));
+    ret = get_errno(safe_pselect6(n, rfds_ptr, wfds_ptr, efds_ptr,
+                                  ts_ptr, NULL));
 
     if (!is_error(ret)) {
         if (rfd_addr && copy_to_user_fdset(rfd_addr, &rfds, n))
@@ -1096,8 +1148,13 @@ static abi_long do_select(int n,
         if (efd_addr && copy_to_user_fdset(efd_addr, &efds, n))
             return -TARGET_EFAULT;
 
-        if (target_tv_addr && copy_to_user_timeval(target_tv_addr, &tv))
-            return -TARGET_EFAULT;
+        if (target_tv_addr) {
+            tv.tv_sec = ts.tv_sec;
+            tv.tv_usec = ts.tv_nsec / 1000;
+            if (copy_to_user_timeval(target_tv_addr, &tv)) {
+                return -TARGET_EFAULT;
+            }
+        }
     }
 
     return ret;
@@ -3095,7 +3152,7 @@ static inline abi_long do_msgsnd(int msqid, abi_long msgp,
 }
 
 static inline abi_long do_msgrcv(int msqid, abi_long msgp,
-                                 unsigned int msgsz, abi_long msgtyp,
+                                 ssize_t msgsz, abi_long msgtyp,
                                  int msgflg)
 {
     struct target_msgbuf *target_mb;
@@ -3103,10 +3160,18 @@ static inline abi_long do_msgrcv(int msqid, abi_long msgp,
     struct msgbuf *host_mb;
     abi_long ret = 0;
 
+    if (msgsz < 0) {
+        return -TARGET_EINVAL;
+    }
+
     if (!lock_user_struct(VERIFY_WRITE, target_mb, msgp, 0))
         return -TARGET_EFAULT;
 
-    host_mb = g_malloc(msgsz+sizeof(long));
+    host_mb = g_try_malloc(msgsz + sizeof(long));
+    if (!host_mb) {
+        ret = -TARGET_ENOMEM;
+        goto end;
+    }
     ret = get_errno(msgrcv(msqid, host_mb, msgsz, msgtyp, msgflg));
 
     if (ret > 0) {
@@ -5034,6 +5099,40 @@ static inline int tswapid(int id)
 
 #endif /* USE_UID16 */
 
+/* We must do direct syscalls for setting UID/GID, because we want to
+ * implement the Linux system call semantics of "change only for this thread",
+ * not the libc/POSIX semantics of "change for all threads in process".
+ * (See http://ewontfix.com/17/ for more details.)
+ * We use the 32-bit versions of the syscalls if present; if they are not,
+ * then either the host architecture supports 32-bit UIDs natively with
+ * the standard syscall, or the 16-bit UID is the best we can do.
+ */
+#ifdef __NR_setuid32
+#define __NR_sys_setuid __NR_setuid32
+#else
+#define __NR_sys_setuid __NR_setuid
+#endif
+#ifdef __NR_setgid32
+#define __NR_sys_setgid __NR_setgid32
+#else
+#define __NR_sys_setgid __NR_setgid
+#endif
+#ifdef __NR_setresuid32
+#define __NR_sys_setresuid __NR_setresuid32
+#else
+#define __NR_sys_setresuid __NR_setresuid
+#endif
+#ifdef __NR_setresgid32
+#define __NR_sys_setresgid __NR_setresgid32
+#else
+#define __NR_sys_setresgid __NR_setresgid
+#endif
+
+_syscall1(int, sys_setuid, uid_t, uid)
+_syscall1(int, sys_setgid, gid_t, gid)
+_syscall3(int, sys_setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
+_syscall3(int, sys_setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
+
 void syscall_init(void)
 {
     IOCTLEntry *ie;
@@ -5137,8 +5236,8 @@ static inline abi_long target_to_host_timespec(struct timespec *host_ts,
 
     if (!lock_user_struct(VERIFY_READ, target_ts, target_addr, 1))
         return -TARGET_EFAULT;
-    host_ts->tv_sec = tswapal(target_ts->tv_sec);
-    host_ts->tv_nsec = tswapal(target_ts->tv_nsec);
+    __get_user(host_ts->tv_sec, &target_ts->tv_sec);
+    __get_user(host_ts->tv_nsec, &target_ts->tv_nsec);
     unlock_user_struct(target_ts, target_addr, 0);
     return 0;
 }
@@ -5150,8 +5249,8 @@ static inline abi_long host_to_target_timespec(abi_ulong target_addr,
 
     if (!lock_user_struct(VERIFY_WRITE, target_ts, target_addr, 0))
         return -TARGET_EFAULT;
-    target_ts->tv_sec = tswapal(host_ts->tv_sec);
-    target_ts->tv_nsec = tswapal(host_ts->tv_nsec);
+    __put_user(host_ts->tv_sec, &target_ts->tv_sec);
+    __put_user(host_ts->tv_nsec, &target_ts->tv_nsec);
     unlock_user_struct(target_ts, target_addr, 1);
     return 0;
 }
@@ -5326,12 +5425,12 @@ static int do_futex(target_ulong uaddr, int op, int val, target_ulong timeout,
         } else {
             pts = NULL;
         }
-        return get_errno(sys_futex(g2h(uaddr), op, tswap32(val),
+        return get_errno(safe_futex(g2h(uaddr), op, tswap32(val),
                          pts, NULL, val3));
     case FUTEX_WAKE:
-        return get_errno(sys_futex(g2h(uaddr), op, val, NULL, NULL, 0));
+        return get_errno(safe_futex(g2h(uaddr), op, val, NULL, NULL, 0));
     case FUTEX_FD:
-        return get_errno(sys_futex(g2h(uaddr), op, val, NULL, NULL, 0));
+        return get_errno(safe_futex(g2h(uaddr), op, val, NULL, NULL, 0));
     case FUTEX_REQUEUE:
     case FUTEX_CMP_REQUEUE:
     case FUTEX_WAKE_OP:
@@ -5341,11 +5440,11 @@ static int do_futex(target_ulong uaddr, int op, int val, target_ulong timeout,
            to satisfy the compiler.  We do not need to tswap TIMEOUT
            since it's not compared to guest memory.  */
         pts = (struct timespec *)(uintptr_t) timeout;
-        return get_errno(sys_futex(g2h(uaddr), op, val, pts,
-                                   g2h(uaddr2),
-                                   (base_op == FUTEX_CMP_REQUEUE
-                                    ? tswap32(val3)
-                                    : val3)));
+        return get_errno(safe_futex(g2h(uaddr), op, val, pts,
+                                    g2h(uaddr2),
+                                    (base_op == FUTEX_CMP_REQUEUE
+                                     ? tswap32(val3)
+                                     : val3)));
     default:
         return -TARGET_ENOSYS;
     }
@@ -5555,7 +5654,9 @@ static int open_self_cmdline(void *cpu_env, int fd)
 
         nb_read = read(fd_orig, buf, sizeof(buf));
         if (nb_read < 0) {
+            int e = errno;
             fd_orig = close(fd_orig);
+            errno = e;
             return -1;
         } else if (nb_read == 0) {
             break;
@@ -5575,7 +5676,9 @@ static int open_self_cmdline(void *cpu_env, int fd)
 
         if (word_skipped) {
             if (write(fd, cp_buf, nb_read) != nb_read) {
+                int e = errno;
                 close(fd_orig);
+                errno = e;
                 return -1;
             }
         }
@@ -5595,7 +5698,7 @@ static int open_self_maps(void *cpu_env, int fd)
 
     fp = fopen("/proc/self/maps", "r");
     if (fp == NULL) {
-        return -EACCES;
+        return -1;
     }
 
     while ((read = getline(&line, &len, fp)) != -1) {
@@ -5739,7 +5842,7 @@ static int open_net_route(void *cpu_env, int fd)
 
     fp = fopen("/proc/net/route", "r");
     if (fp == NULL) {
-        return -EACCES;
+        return -1;
     }
 
     /* read header */
@@ -5789,7 +5892,7 @@ static int do_openat(void *cpu_env, int dirfd, const char *pathname, int flags,
 
     if (is_proc_myself(pathname, "exe")) {
         int execfd = qemu_getauxval(AT_EXECFD);
-        return execfd ? execfd : get_errno(sys_openat(dirfd, exec_path, flags, mode));
+        return execfd ? execfd : safe_openat(dirfd, exec_path, flags, mode);
     }
 
     for (fake_open = fakes; fake_open->filename; fake_open++) {
@@ -5815,7 +5918,9 @@ static int do_openat(void *cpu_env, int dirfd, const char *pathname, int flags,
         unlink(filename);
 
         if ((r = fake_open->fill(cpu_env, fd))) {
+            int e = errno;
             close(fd);
+            errno = e;
             return r;
         }
         lseek(fd, 0, SEEK_SET);
@@ -5823,7 +5928,7 @@ static int do_openat(void *cpu_env, int dirfd, const char *pathname, int flags,
         return fd;
     }
 
-    return get_errno(sys_openat(dirfd, path(pathname), flags, mode));
+    return safe_openat(dirfd, path(pathname), flags, mode);
 }
 
 #define TIMER_MAGIC 0x0caf0000
@@ -5861,6 +5966,21 @@ abi_long do_syscall(void *cpu_env, int num, abi_long arg1,
     struct statfs stfs;
     void *p;
 
+#if defined(DEBUG_ERESTARTSYS)
+    /* Debug-only code for exercising the syscall-restart code paths
+     * in the per-architecture cpu main loops: restart every syscall
+     * the guest makes once before letting it through.
+     */
+    {
+        static int flag;
+
+        flag = !flag;
+        if (flag) {
+            return -TARGET_ERESTARTSYS;
+        }
+    }
+#endif
+
 #ifdef DEBUG
     gemu_log("syscall %d", num);
 #endif
@@ -5907,7 +6027,7 @@ abi_long do_syscall(void *cpu_env, int num, abi_long arg1,
         else {
             if (!(p = lock_user(VERIFY_WRITE, arg2, arg3, 0)))
                 goto efault;
-            ret = get_errno(read(arg1, p, arg3));
+            ret = get_errno(safe_read(arg1, p, arg3));
             if (ret >= 0 &&
                 fd_trans_host_to_target_data(arg1)) {
                 ret = fd_trans_host_to_target_data(arg1)(p, ret);
@@ -5918,7 +6038,7 @@ abi_long do_syscall(void *cpu_env, int num, abi_long arg1,
     case TARGET_NR_write:
         if (!(p = lock_user(VERIFY_READ, arg2, arg3, 1)))
             goto efault;
-        ret = get_errno(write(arg1, p, arg3));
+        ret = get_errno(safe_write(arg1, p, arg3));
         unlock_user(p, arg2, 0);
         break;
 #ifdef TARGET_NR_open
@@ -5968,7 +6088,7 @@ abi_long do_syscall(void *cpu_env, int num, abi_long arg1,
     case TARGET_NR_waitpid:
         {
             int status;
-            ret = get_errno(waitpid(arg1, &status, arg3));
+            ret = get_errno(safe_wait4(arg1, &status, arg3, NULL));
             if (!is_error(ret) && arg2 && ret
                 && put_user_s32(host_to_target_waitstatus(status), arg2))
                 goto efault;
@@ -5980,7 +6100,7 @@ abi_long do_syscall(void *cpu_env, int num, abi_long arg1,
         {
             siginfo_t info;
             info.si_pid = 0;
-            ret = get_errno(waitid(arg1, arg2, &info, arg4));
+            ret = get_errno(safe_waitid(arg1, arg2, &info, arg4, NULL));
             if (!is_error(ret) && arg3 && info.si_pid != 0) {
                 if (!(p = lock_user(VERIFY_WRITE, arg3, sizeof(target_siginfo_t), 0)))
                     goto efault;
@@ -6106,7 +6226,17 @@ abi_long do_syscall(void *cpu_env, int num, abi_long arg1,
 
             if (!(p = lock_user_string(arg1)))
                 goto execve_efault;
-            ret = get_errno(execve(p, argp, envp));
+            /* Although execve() is not an interruptible syscall it is
+             * a special case where we must use the safe_syscall wrapper:
+             * if we allow a signal to happen before we make the host
+             * syscall then we will 'lose' it, because at the point of
+             * execve the process leaves QEMU's control. So we use the
+             * safe syscall wrapper to ensure that we either take the
+             * signal as a guest signal, or else it does not happen
+             * before the execve completes and makes it the other
+             * program's problem.
+             */
+            ret = get_errno(safe_execve(p, argp, envp));
             unlock_user(p, arg1, 0);
 
             goto execve_end;
@@ -6930,12 +7060,10 @@ abi_long do_syscall(void *cpu_env, int num, abi_long arg1,
         break;
 #ifdef TARGET_NR_sigreturn
     case TARGET_NR_sigreturn:
-        /* NOTE: ret is eax, so not transcoding must be done */
         ret = do_sigreturn(cpu_env);
         break;
 #endif
     case TARGET_NR_rt_sigreturn:
-        /* NOTE: ret is eax, so not transcoding must be done */
         ret = do_rt_sigreturn(cpu_env);
         break;
     case TARGET_NR_sethostname:
@@ -7124,8 +7252,8 @@ abi_long do_syscall(void *cpu_env, int num, abi_long arg1,
                 sig_ptr = NULL;
             }
 
-            ret = get_errno(sys_pselect6(n, rfds_ptr, wfds_ptr, efds_ptr,
-                                         ts_ptr, sig_ptr));
+            ret = get_errno(safe_pselect6(n, rfds_ptr, wfds_ptr, efds_ptr,
+                                          ts_ptr, sig_ptr));
 
             if (!is_error(ret)) {
                 if (rfd_addr && copy_to_user_fdset(rfd_addr, &rfds, n))
@@ -7694,7 +7822,7 @@ abi_long do_syscall(void *cpu_env, int num, abi_long arg1,
                 rusage_ptr = &rusage;
             else
                 rusage_ptr = NULL;
-            ret = get_errno(wait4(arg1, &status, arg3, rusage_ptr));
+            ret = get_errno(safe_wait4(arg1, &status, arg3, rusage_ptr));
             if (!is_error(ret)) {
                 if (status_ptr && ret) {
                     status = host_to_target_waitstatus(status);
@@ -8740,9 +8868,9 @@ abi_long do_syscall(void *cpu_env, int num, abi_long arg1,
 #endif
 #ifdef TARGET_NR_setresuid
     case TARGET_NR_setresuid:
-        ret = get_errno(setresuid(low2highuid(arg1),
-                                  low2highuid(arg2),
-                                  low2highuid(arg3)));
+        ret = get_errno(sys_setresuid(low2highuid(arg1),
+                                      low2highuid(arg2),
+                                      low2highuid(arg3)));
         break;
 #endif
 #ifdef TARGET_NR_getresuid
@@ -8761,9 +8889,9 @@ abi_long do_syscall(void *cpu_env, int num, abi_long arg1,
 #endif
 #ifdef TARGET_NR_getresgid
     case TARGET_NR_setresgid:
-        ret = get_errno(setresgid(low2highgid(arg1),
-                                  low2highgid(arg2),
-                                  low2highgid(arg3)));
+        ret = get_errno(sys_setresgid(low2highgid(arg1),
+                                      low2highgid(arg2),
+                                      low2highgid(arg3)));
         break;
 #endif
 #ifdef TARGET_NR_getresgid
@@ -8789,10 +8917,10 @@ abi_long do_syscall(void *cpu_env, int num, abi_long arg1,
         break;
 #endif
     case TARGET_NR_setuid:
-        ret = get_errno(setuid(low2highuid(arg1)));
+        ret = get_errno(sys_setuid(low2highuid(arg1)));
         break;
     case TARGET_NR_setgid:
-        ret = get_errno(setgid(low2highgid(arg1)));
+        ret = get_errno(sys_setgid(low2highgid(arg1)));
         break;
     case TARGET_NR_setfsuid:
         ret = get_errno(setfsuid(arg1));
@@ -9074,7 +9202,7 @@ abi_long do_syscall(void *cpu_env, int num, abi_long arg1,
 #endif
 #ifdef TARGET_NR_setresuid32
     case TARGET_NR_setresuid32:
-        ret = get_errno(setresuid(arg1, arg2, arg3));
+        ret = get_errno(sys_setresuid(arg1, arg2, arg3));
         break;
 #endif
 #ifdef TARGET_NR_getresuid32
@@ -9093,7 +9221,7 @@ abi_long do_syscall(void *cpu_env, int num, abi_long arg1,
 #endif
 #ifdef TARGET_NR_setresgid32
     case TARGET_NR_setresgid32:
-        ret = get_errno(setresgid(arg1, arg2, arg3));
+        ret = get_errno(sys_setresgid(arg1, arg2, arg3));
         break;
 #endif
 #ifdef TARGET_NR_getresgid32
@@ -9120,12 +9248,12 @@ abi_long do_syscall(void *cpu_env, int num, abi_long arg1,
 #endif
 #ifdef TARGET_NR_setuid32
     case TARGET_NR_setuid32:
-        ret = get_errno(setuid(arg1));
+        ret = get_errno(sys_setuid(arg1));
         break;
 #endif
 #ifdef TARGET_NR_setgid32
     case TARGET_NR_setgid32:
-        ret = get_errno(setgid(arg1));
+        ret = get_errno(sys_setgid(arg1));
         break;
 #endif
 #ifdef TARGET_NR_setfsuid32
diff --git a/linux-user/syscall_defs.h b/linux-user/syscall_defs.h
index 9e2b3c200a..34af15a683 100644
--- a/linux-user/syscall_defs.h
+++ b/linux-user/syscall_defs.h
@@ -55,7 +55,8 @@
 #define TARGET_IOC_NRBITS	8
 #define TARGET_IOC_TYPEBITS	8
 
-#if defined(TARGET_I386) || (defined(TARGET_ARM) && defined(TARGET_ABI32)) \
+#if (defined(TARGET_I386) && defined(TARGET_ABI32)) \
+    || (defined(TARGET_ARM) && defined(TARGET_ABI32)) \
     || defined(TARGET_SPARC) \
     || defined(TARGET_M68K) || defined(TARGET_SH4) || defined(TARGET_CRIS)
     /* 16 bit uid wrappers emulation */
diff --git a/linux-user/tilegx/target_signal.h b/linux-user/tilegx/target_signal.h
index b595f985cf..fcf10405c4 100644
--- a/linux-user/tilegx/target_signal.h
+++ b/linux-user/tilegx/target_signal.h
@@ -25,4 +25,5 @@ static inline abi_ulong get_sp_from_cpustate(CPUTLGState *state)
     return state->regs[TILEGX_R_SP];
 }
 
+
 #endif /* TARGET_SIGNAL_H */
diff --git a/memory.c b/memory.c
index 4e3cda8a12..8ba496dc7b 100644
--- a/memory.c
+++ b/memory.c
@@ -33,8 +33,6 @@
 
 //#define DEBUG_UNASSIGNED
 
-#define RAM_ADDR_INVALID (~(ram_addr_t)0)
-
 static unsigned memory_region_transaction_depth;
 static bool memory_region_update_pending;
 static bool ioeventfd_update_pending;
@@ -227,6 +225,7 @@ struct FlatRange {
     hwaddr offset_in_region;
     AddrRange addr;
     uint8_t dirty_log_mask;
+    bool romd_mode;
     bool readonly;
 };
 
@@ -251,6 +250,7 @@ static bool flatrange_equal(FlatRange *a, FlatRange *b)
     return a->mr == b->mr
         && addrrange_equal(a->addr, b->addr)
         && a->offset_in_region == b->offset_in_region
+        && a->romd_mode == b->romd_mode
         && a->readonly == b->readonly;
 }
 
@@ -310,6 +310,7 @@ static bool can_merge(FlatRange *r1, FlatRange *r2)
                                 r1->addr.size),
                      int128_make64(r2->offset_in_region))
         && r1->dirty_log_mask == r2->dirty_log_mask
+        && r1->romd_mode == r2->romd_mode
         && r1->readonly == r2->readonly;
 }
 
@@ -663,6 +664,7 @@ static void render_memory_region(FlatView *view,
 
     fr.mr = mr;
     fr.dirty_log_mask = memory_region_get_dirty_log_mask(mr);
+    fr.romd_mode = mr->romd_mode;
     fr.readonly = readonly;
 
     /* Render the region itself into any gaps left by the current view. */
@@ -1622,13 +1624,26 @@ void memory_region_reset_dirty(MemoryRegion *mr, hwaddr addr,
 
 int memory_region_get_fd(MemoryRegion *mr)
 {
-    if (mr->alias) {
-        return memory_region_get_fd(mr->alias);
+    int fd;
+
+    rcu_read_lock();
+    while (mr->alias) {
+        mr = mr->alias;
     }
+    fd = mr->ram_block->fd;
+    rcu_read_unlock();
 
-    assert(mr->ram_block);
+    return fd;
+}
 
-    return qemu_get_ram_fd(memory_region_get_ram_addr(mr));
+void memory_region_set_fd(MemoryRegion *mr, int fd)
+{
+    rcu_read_lock();
+    while (mr->alias) {
+        mr = mr->alias;
+    }
+    mr->ram_block->fd = fd;
+    rcu_read_unlock();
 }
 
 void *memory_region_get_ram_ptr(MemoryRegion *mr)
@@ -1642,10 +1657,22 @@ void *memory_region_get_ram_ptr(MemoryRegion *mr)
         mr = mr->alias;
     }
     assert(mr->ram_block);
-    ptr = qemu_get_ram_ptr(mr->ram_block, memory_region_get_ram_addr(mr));
+    ptr = qemu_map_ram_ptr(mr->ram_block, offset);
     rcu_read_unlock();
 
-    return ptr + offset;
+    return ptr;
+}
+
+MemoryRegion *memory_region_from_host(void *ptr, ram_addr_t *offset)
+{
+    RAMBlock *block;
+
+    block = qemu_ram_block_from_host(ptr, false, offset);
+    if (!block) {
+        return NULL;
+    }
+
+    return block->mr;
 }
 
 ram_addr_t memory_region_get_ram_addr(MemoryRegion *mr)
diff --git a/migration/Makefile.objs b/migration/Makefile.objs
index d25ff483eb..30ad945918 100644
--- a/migration/Makefile.objs
+++ b/migration/Makefile.objs
@@ -1,11 +1,12 @@
-common-obj-y += migration.o tcp.o
+common-obj-y += migration.o socket.o fd.o exec.o
+common-obj-y += tls.o
 common-obj-y += vmstate.o
-common-obj-y += qemu-file.o qemu-file-buf.o qemu-file-unix.o qemu-file-stdio.o
+common-obj-y += qemu-file.o
+common-obj-y += qemu-file-channel.o
 common-obj-y += xbzrle.o postcopy-ram.o
 common-obj-y += qjson.o
 
 common-obj-$(CONFIG_RDMA) += rdma.o
-common-obj-$(CONFIG_POSIX) += exec.o unix.o fd.o
 
 common-obj-y += block.o
 
diff --git a/migration/block.c b/migration/block.c
index a7a76a0fb9..e0628d187f 100644
--- a/migration/block.c
+++ b/migration/block.c
@@ -383,7 +383,7 @@ static void init_blk_migration(QEMUFile *f)
     BlockDriverState *bs;
     BlkMigDevState *bmds;
     int64_t sectors;
-    BdrvNextIterator *it = NULL;
+    BdrvNextIterator it;
 
     block_mig_state.submitted = 0;
     block_mig_state.read_done = 0;
@@ -394,7 +394,7 @@ static void init_blk_migration(QEMUFile *f)
     block_mig_state.zero_blocks = migrate_zero_blocks();
 
 
-    while ((it = bdrv_next(it, &bs))) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
         if (bdrv_is_read_only(bs)) {
             continue;
         }
diff --git a/migration/exec.c b/migration/exec.c
index 559420969b..1515cc3319 100644
--- a/migration/exec.c
+++ b/migration/exec.c
@@ -3,10 +3,12 @@
  *
  * Copyright IBM, Corp. 2008
  * Copyright Dell MessageOne 2008
+ * Copyright Red Hat, Inc. 2015-2016
  *
  * Authors:
  *  Anthony Liguori   <aliguori@us.ibm.com>
  *  Charles Duffy     <charles_duffy@messageone.com>
+ *  Daniel P. Berrange <berrange@redhat.com>
  *
  * This work is licensed under the terms of the GNU GPL, version 2.  See
  * the COPYING file in the top-level directory.
@@ -18,53 +20,53 @@
 #include "qemu/osdep.h"
 #include "qapi/error.h"
 #include "qemu-common.h"
-#include "qemu/sockets.h"
-#include "qemu/main-loop.h"
 #include "migration/migration.h"
-#include "migration/qemu-file.h"
-#include "block/block.h"
-#include <sys/wait.h>
+#include "io/channel-command.h"
+#include "trace.h"
 
-//#define DEBUG_MIGRATION_EXEC
-
-#ifdef DEBUG_MIGRATION_EXEC
-#define DPRINTF(fmt, ...) \
-    do { printf("migration-exec: " fmt, ## __VA_ARGS__); } while (0)
-#else
-#define DPRINTF(fmt, ...) \
-    do { } while (0)
-#endif
 
 void exec_start_outgoing_migration(MigrationState *s, const char *command, Error **errp)
 {
-    s->to_dst_file = qemu_popen_cmd(command, "w");
-    if (s->to_dst_file == NULL) {
-        error_setg_errno(errp, errno, "failed to popen the migration target");
+    QIOChannel *ioc;
+    const char *argv[] = { "/bin/sh", "-c", command, NULL };
+
+    trace_migration_exec_outgoing(command);
+    ioc = QIO_CHANNEL(qio_channel_command_new_spawn(argv,
+                                                    O_WRONLY,
+                                                    errp));
+    if (!ioc) {
         return;
     }
 
-    migrate_fd_connect(s);
+    migration_set_outgoing_channel(s, ioc, NULL);
+    object_unref(OBJECT(ioc));
 }
 
-static void exec_accept_incoming_migration(void *opaque)
+static gboolean exec_accept_incoming_migration(QIOChannel *ioc,
+                                               GIOCondition condition,
+                                               gpointer opaque)
 {
-    QEMUFile *f = opaque;
-
-    qemu_set_fd_handler(qemu_get_fd(f), NULL, NULL, NULL);
-    process_incoming_migration(f);
+    migration_set_incoming_channel(migrate_get_current(), ioc);
+    object_unref(OBJECT(ioc));
+    return FALSE; /* unregister */
 }
 
 void exec_start_incoming_migration(const char *command, Error **errp)
 {
-    QEMUFile *f;
+    QIOChannel *ioc;
+    const char *argv[] = { "/bin/sh", "-c", command, NULL };
 
-    DPRINTF("Attempting to start an incoming migration\n");
-    f = qemu_popen_cmd(command, "r");
-    if(f == NULL) {
-        error_setg_errno(errp, errno, "failed to popen the migration source");
+    trace_migration_exec_incoming(command);
+    ioc = QIO_CHANNEL(qio_channel_command_new_spawn(argv,
+                                                    O_RDONLY,
+                                                    errp));
+    if (!ioc) {
         return;
     }
 
-    qemu_set_fd_handler(qemu_get_fd(f), exec_accept_incoming_migration, NULL,
-                        f);
+    qio_channel_add_watch(ioc,
+                          G_IO_IN,
+                          exec_accept_incoming_migration,
+                          NULL,
+                          NULL);
 }
diff --git a/migration/fd.c b/migration/fd.c
index 3d788bb297..fc5c9eee02 100644
--- a/migration/fd.c
+++ b/migration/fd.c
@@ -1,10 +1,11 @@
 /*
  * QEMU live migration via generic fd
  *
- * Copyright Red Hat, Inc. 2009
+ * Copyright Red Hat, Inc. 2009-2016
  *
  * Authors:
  *  Chris Lalancette <clalance@redhat.com>
+ *  Daniel P. Berrange <berrange@redhat.com>
  *
  * This work is licensed under the terms of the GNU GPL, version 2.  See
  * the COPYING file in the top-level directory.
@@ -16,75 +17,57 @@
 #include "qemu/osdep.h"
 #include "qapi/error.h"
 #include "qemu-common.h"
-#include "qemu/main-loop.h"
-#include "qemu/sockets.h"
 #include "migration/migration.h"
 #include "monitor/monitor.h"
-#include "migration/qemu-file.h"
-#include "block/block.h"
+#include "io/channel-util.h"
+#include "trace.h"
 
-//#define DEBUG_MIGRATION_FD
-
-#ifdef DEBUG_MIGRATION_FD
-#define DPRINTF(fmt, ...) \
-    do { printf("migration-fd: " fmt, ## __VA_ARGS__); } while (0)
-#else
-#define DPRINTF(fmt, ...) \
-    do { } while (0)
-#endif
-
-static bool fd_is_socket(int fd)
-{
-    struct stat stat;
-    int ret = fstat(fd, &stat);
-    if (ret == -1) {
-        /* When in doubt say no */
-        return false;
-    }
-    return S_ISSOCK(stat.st_mode);
-}
 
 void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error **errp)
 {
+    QIOChannel *ioc;
     int fd = monitor_get_fd(cur_mon, fdname, errp);
     if (fd == -1) {
         return;
     }
 
-    if (fd_is_socket(fd)) {
-        s->to_dst_file = qemu_fopen_socket(fd, "wb");
-    } else {
-        s->to_dst_file = qemu_fdopen(fd, "wb");
+    trace_migration_fd_outgoing(fd);
+    ioc = qio_channel_new_fd(fd, errp);
+    if (!ioc) {
+        close(fd);
+        return;
     }
 
-    migrate_fd_connect(s);
+    migration_set_outgoing_channel(s, ioc, NULL);
+    object_unref(OBJECT(ioc));
 }
 
-static void fd_accept_incoming_migration(void *opaque)
+static gboolean fd_accept_incoming_migration(QIOChannel *ioc,
+                                             GIOCondition condition,
+                                             gpointer opaque)
 {
-    QEMUFile *f = opaque;
-
-    qemu_set_fd_handler(qemu_get_fd(f), NULL, NULL, NULL);
-    process_incoming_migration(f);
+    migration_set_incoming_channel(migrate_get_current(), ioc);
+    object_unref(OBJECT(ioc));
+    return FALSE; /* unregister */
 }
 
 void fd_start_incoming_migration(const char *infd, Error **errp)
 {
+    QIOChannel *ioc;
     int fd;
-    QEMUFile *f;
-
-    DPRINTF("Attempting to start an incoming migration via fd\n");
 
     fd = strtol(infd, NULL, 0);
-    if (fd_is_socket(fd)) {
-        f = qemu_fopen_socket(fd, "rb");
-    } else {
-        f = qemu_fdopen(fd, "rb");
-    }
-    if(f == NULL) {
-        error_setg_errno(errp, errno, "failed to open the source descriptor");
+    trace_migration_fd_incoming(fd);
+
+    ioc = qio_channel_new_fd(fd, errp);
+    if (!ioc) {
+        close(fd);
         return;
     }
 
-    qemu_set_fd_handler(fd, fd_accept_incoming_migration, NULL, f);
+    qio_channel_add_watch(ioc,
+                          G_IO_IN,
+                          fd_accept_incoming_migration,
+                          NULL,
+                          NULL);
 }
diff --git a/migration/migration.c b/migration/migration.c
index f5327e8c0a..7ecbadee6f 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -34,6 +34,8 @@
 #include "qom/cpu.h"
 #include "exec/memory.h"
 #include "exec/address-spaces.h"
+#include "io/channel-buffer.h"
+#include "io/channel-tls.h"
 
 #define MAX_THROTTLE  (32 << 20)      /* Migration transfer speed throttling */
 
@@ -81,16 +83,13 @@ MigrationState *migrate_get_current(void)
         .bandwidth_limit = MAX_THROTTLE,
         .xbzrle_cache_size = DEFAULT_MIGRATE_CACHE_SIZE,
         .mbps = -1,
-        .parameters[MIGRATION_PARAMETER_COMPRESS_LEVEL] =
-                DEFAULT_MIGRATE_COMPRESS_LEVEL,
-        .parameters[MIGRATION_PARAMETER_COMPRESS_THREADS] =
-                DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT,
-        .parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS] =
-                DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT,
-        .parameters[MIGRATION_PARAMETER_CPU_THROTTLE_INITIAL] =
-                DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL,
-        .parameters[MIGRATION_PARAMETER_CPU_THROTTLE_INCREMENT] =
-                DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT,
+        .parameters = {
+            .compress_level = DEFAULT_MIGRATE_COMPRESS_LEVEL,
+            .compress_threads = DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT,
+            .decompress_threads = DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT,
+            .cpu_throttle_initial = DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL,
+            .cpu_throttle_increment = DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT,
+        },
     };
 
     if (!once) {
@@ -310,14 +309,12 @@ void qemu_start_incoming_migration(const char *uri, Error **errp)
     } else if (strstart(uri, "rdma:", &p)) {
         rdma_start_incoming_migration(p, errp);
 #endif
-#if !defined(WIN32)
     } else if (strstart(uri, "exec:", &p)) {
         exec_start_incoming_migration(p, errp);
     } else if (strstart(uri, "unix:", &p)) {
         unix_start_incoming_migration(p, errp);
     } else if (strstart(uri, "fd:", &p)) {
         fd_start_incoming_migration(p, errp);
-#endif
     } else {
         error_setg(errp, "unknown migration protocol: %s", uri);
     }
@@ -422,14 +419,60 @@ static void process_incoming_migration_co(void *opaque)
 void process_incoming_migration(QEMUFile *f)
 {
     Coroutine *co = qemu_coroutine_create(process_incoming_migration_co);
-    int fd = qemu_get_fd(f);
 
-    assert(fd != -1);
     migrate_decompress_threads_create();
-    qemu_set_nonblock(fd);
+    qemu_file_set_blocking(f, false);
     qemu_coroutine_enter(co, f);
 }
 
+
+void migration_set_incoming_channel(MigrationState *s,
+                                    QIOChannel *ioc)
+{
+    trace_migration_set_incoming_channel(
+        ioc, object_get_typename(OBJECT(ioc)));
+
+    if (s->parameters.tls_creds &&
+        !object_dynamic_cast(OBJECT(ioc),
+                             TYPE_QIO_CHANNEL_TLS)) {
+        Error *local_err = NULL;
+        migration_tls_set_incoming_channel(s, ioc, &local_err);
+        if (local_err) {
+            error_report_err(local_err);
+        }
+    } else {
+        QEMUFile *f = qemu_fopen_channel_input(ioc);
+        process_incoming_migration(f);
+    }
+}
+
+
+void migration_set_outgoing_channel(MigrationState *s,
+                                    QIOChannel *ioc,
+                                    const char *hostname)
+{
+    trace_migration_set_outgoing_channel(
+        ioc, object_get_typename(OBJECT(ioc)), hostname);
+
+    if (s->parameters.tls_creds &&
+        !object_dynamic_cast(OBJECT(ioc),
+                             TYPE_QIO_CHANNEL_TLS)) {
+        Error *local_err = NULL;
+        migration_tls_set_outgoing_channel(s, ioc, hostname, &local_err);
+        if (local_err) {
+            migrate_fd_error(s, local_err);
+            error_free(local_err);
+        }
+    } else {
+        QEMUFile *f = qemu_fopen_channel_output(ioc);
+
+        s->to_dst_file = f;
+
+        migrate_fd_connect(s);
+    }
+}
+
+
 /*
  * Send a message on the return channel back to the source
  * of the migration.
@@ -516,15 +559,13 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp)
     MigrationState *s = migrate_get_current();
 
     params = g_malloc0(sizeof(*params));
-    params->compress_level = s->parameters[MIGRATION_PARAMETER_COMPRESS_LEVEL];
-    params->compress_threads =
-            s->parameters[MIGRATION_PARAMETER_COMPRESS_THREADS];
-    params->decompress_threads =
-            s->parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS];
-    params->cpu_throttle_initial =
-            s->parameters[MIGRATION_PARAMETER_CPU_THROTTLE_INITIAL];
-    params->cpu_throttle_increment =
-            s->parameters[MIGRATION_PARAMETER_CPU_THROTTLE_INCREMENT];
+    params->compress_level = s->parameters.compress_level;
+    params->compress_threads = s->parameters.compress_threads;
+    params->decompress_threads = s->parameters.decompress_threads;
+    params->cpu_throttle_initial = s->parameters.cpu_throttle_initial;
+    params->cpu_throttle_increment = s->parameters.cpu_throttle_increment;
+    params->tls_creds = g_strdup(s->parameters.tls_creds);
+    params->tls_hostname = g_strdup(s->parameters.tls_hostname);
 
     return params;
 }
@@ -672,6 +713,10 @@ MigrationInfo *qmp_query_migrate(Error **errp)
         break;
     case MIGRATION_STATUS_FAILED:
         info->has_status = true;
+        if (s->error) {
+            info->has_error_desc = true;
+            info->error_desc = g_strdup(error_get_pretty(s->error));
+        }
         break;
     case MIGRATION_STATUS_CANCELLED:
         info->has_status = true;
@@ -721,7 +766,12 @@ void qmp_migrate_set_parameters(bool has_compress_level,
                                 bool has_cpu_throttle_initial,
                                 int64_t cpu_throttle_initial,
                                 bool has_cpu_throttle_increment,
-                                int64_t cpu_throttle_increment, Error **errp)
+                                int64_t cpu_throttle_increment,
+                                bool has_tls_creds,
+                                const char *tls_creds,
+                                bool has_tls_hostname,
+                                const char *tls_hostname,
+                                Error **errp)
 {
     MigrationState *s = migrate_get_current();
 
@@ -758,26 +808,31 @@ void qmp_migrate_set_parameters(bool has_compress_level,
     }
 
     if (has_compress_level) {
-        s->parameters[MIGRATION_PARAMETER_COMPRESS_LEVEL] = compress_level;
+        s->parameters.compress_level = compress_level;
     }
     if (has_compress_threads) {
-        s->parameters[MIGRATION_PARAMETER_COMPRESS_THREADS] = compress_threads;
+        s->parameters.compress_threads = compress_threads;
     }
     if (has_decompress_threads) {
-        s->parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS] =
-                                                    decompress_threads;
+        s->parameters.decompress_threads = decompress_threads;
     }
     if (has_cpu_throttle_initial) {
-        s->parameters[MIGRATION_PARAMETER_CPU_THROTTLE_INITIAL] =
-                                                    cpu_throttle_initial;
+        s->parameters.cpu_throttle_initial = cpu_throttle_initial;
     }
-
     if (has_cpu_throttle_increment) {
-        s->parameters[MIGRATION_PARAMETER_CPU_THROTTLE_INCREMENT] =
-                                                    cpu_throttle_increment;
+        s->parameters.cpu_throttle_increment = cpu_throttle_increment;
+    }
+    if (has_tls_creds) {
+        g_free(s->parameters.tls_creds);
+        s->parameters.tls_creds = g_strdup(tls_creds);
+    }
+    if (has_tls_hostname) {
+        g_free(s->parameters.tls_hostname);
+        s->parameters.tls_hostname = g_strdup(tls_hostname);
     }
 }
 
+
 void qmp_migrate_start_postcopy(Error **errp)
 {
     MigrationState *s = migrate_get_current();
@@ -844,12 +899,15 @@ static void migrate_fd_cleanup(void *opaque)
     notifier_list_notify(&migration_state_notifiers, s);
 }
 
-void migrate_fd_error(MigrationState *s)
+void migrate_fd_error(MigrationState *s, const Error *error)
 {
-    trace_migrate_fd_error();
+    trace_migrate_fd_error(error ? error_get_pretty(error) : "");
     assert(s->to_dst_file == NULL);
     migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
                       MIGRATION_STATUS_FAILED);
+    if (!s->error) {
+        s->error = error_copy(error);
+    }
     notifier_list_notify(&migration_state_notifiers, s);
 }
 
@@ -948,6 +1006,8 @@ MigrationState *migrate_init(const MigrationParams *params)
     s->postcopy_after_devices = false;
     s->migration_thread_running = false;
     s->last_req_rb = NULL;
+    error_free(s->error);
+    s->error = NULL;
 
     migrate_set_state(&s->state, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP);
 
@@ -1040,14 +1100,12 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
     } else if (strstart(uri, "rdma:", &p)) {
         rdma_start_outgoing_migration(s, p, &local_err);
 #endif
-#if !defined(WIN32)
     } else if (strstart(uri, "exec:", &p)) {
         exec_start_outgoing_migration(s, p, &local_err);
     } else if (strstart(uri, "unix:", &p)) {
         unix_start_outgoing_migration(s, p, &local_err);
     } else if (strstart(uri, "fd:", &p)) {
         fd_start_outgoing_migration(s, p, &local_err);
-#endif
     } else {
         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "uri",
                    "a valid migration protocol");
@@ -1057,7 +1115,7 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
     }
 
     if (local_err) {
-        migrate_fd_error(s);
+        migrate_fd_error(s, local_err);
         error_propagate(errp, local_err);
         return;
     }
@@ -1170,7 +1228,7 @@ int migrate_compress_level(void)
 
     s = migrate_get_current();
 
-    return s->parameters[MIGRATION_PARAMETER_COMPRESS_LEVEL];
+    return s->parameters.compress_level;
 }
 
 int migrate_compress_threads(void)
@@ -1179,7 +1237,7 @@ int migrate_compress_threads(void)
 
     s = migrate_get_current();
 
-    return s->parameters[MIGRATION_PARAMETER_COMPRESS_THREADS];
+    return s->parameters.compress_threads;
 }
 
 int migrate_decompress_threads(void)
@@ -1188,7 +1246,7 @@ int migrate_decompress_threads(void)
 
     s = migrate_get_current();
 
-    return s->parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS];
+    return s->parameters.decompress_threads;
 }
 
 bool migrate_use_events(void)
@@ -1429,7 +1487,8 @@ static int await_return_path_close_on_source(MigrationState *ms)
 static int postcopy_start(MigrationState *ms, bool *old_vm_running)
 {
     int ret;
-    const QEMUSizedBuffer *qsb;
+    QIOChannelBuffer *bioc;
+    QEMUFile *fb;
     int64_t time_at_stop = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
     migrate_set_state(&ms->state, MIGRATION_STATUS_ACTIVE,
                       MIGRATION_STATUS_POSTCOPY_ACTIVE);
@@ -1488,11 +1547,9 @@ static int postcopy_start(MigrationState *ms, bool *old_vm_running)
      * So we wrap the device state up in a package with a length at the start;
      * to do this we use a qemu_buf to hold the whole of the device state.
      */
-    QEMUFile *fb = qemu_bufopen("w", NULL);
-    if (!fb) {
-        error_report("Failed to create buffered file");
-        goto fail;
-    }
+    bioc = qio_channel_buffer_new(4096);
+    fb = qemu_fopen_channel_output(QIO_CHANNEL(bioc));
+    object_unref(OBJECT(bioc));
 
     /*
      * Make sure the receiver can get incoming pages before we send the rest
@@ -1506,10 +1563,9 @@ static int postcopy_start(MigrationState *ms, bool *old_vm_running)
     qemu_savevm_send_postcopy_run(fb);
 
     /* <><> end of stuff going into the package */
-    qsb = qemu_buf_get(fb);
 
     /* Now send that blob */
-    if (qemu_savevm_send_packaged(ms->to_dst_file, qsb)) {
+    if (qemu_savevm_send_packaged(ms->to_dst_file, bioc->data, bioc->usage)) {
         goto fail_closefb;
     }
     qemu_fclose(fb);
@@ -1793,6 +1849,7 @@ void migrate_fd_connect(MigrationState *s)
     s->expected_downtime = max_downtime/1000000;
     s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup, s);
 
+    qemu_file_set_blocking(s->to_dst_file, true);
     qemu_file_set_rate_limit(s->to_dst_file,
                              s->bandwidth_limit / XFER_LIMIT_RATIO);
 
diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
index fbd0064fce..cf7dcd25d4 100644
--- a/migration/postcopy-ram.c
+++ b/migration/postcopy-ram.c
@@ -407,7 +407,6 @@ static void *postcopy_ram_fault_thread(void *opaque)
 
     while (true) {
         ram_addr_t rb_offset;
-        ram_addr_t in_raspace;
         struct pollfd pfd[2];
 
         /*
@@ -459,7 +458,7 @@ static void *postcopy_ram_fault_thread(void *opaque)
 
         rb = qemu_ram_block_from_host(
                  (void *)(uintptr_t)msg.arg.pagefault.address,
-                 true, &in_raspace, &rb_offset);
+                 true, &rb_offset);
         if (!rb) {
             error_report("postcopy_ram_fault_thread: Fault outside guest: %"
                          PRIx64, (uint64_t)msg.arg.pagefault.address);
diff --git a/migration/qemu-file-buf.c b/migration/qemu-file-buf.c
deleted file mode 100644
index 7b8e78e99c..0000000000
--- a/migration/qemu-file-buf.c
+++ /dev/null
@@ -1,464 +0,0 @@
-/*
- * QEMU System Emulator
- *
- * Copyright (c) 2003-2008 Fabrice Bellard
- * Copyright (c) 2014 IBM Corp.
- *
- * Authors:
- *  Stefan Berger <stefanb@linux.vnet.ibm.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#include "qemu/osdep.h"
-#include "qemu-common.h"
-#include "qemu/error-report.h"
-#include "qemu/iov.h"
-#include "qemu/sockets.h"
-#include "qemu/coroutine.h"
-#include "migration/migration.h"
-#include "migration/qemu-file.h"
-#include "migration/qemu-file-internal.h"
-#include "trace.h"
-
-#define QSB_CHUNK_SIZE      (1 << 10)
-#define QSB_MAX_CHUNK_SIZE  (16 * QSB_CHUNK_SIZE)
-
-/**
- * Create a QEMUSizedBuffer
- * This type of buffer uses scatter-gather lists internally and
- * can grow to any size. Any data array in the scatter-gather list
- * can hold different amount of bytes.
- *
- * @buffer: Optional buffer to copy into the QSB
- * @len: size of initial buffer; if @buffer is given, buffer must
- *       hold at least len bytes
- *
- * Returns a pointer to a QEMUSizedBuffer or NULL on allocation failure
- */
-QEMUSizedBuffer *qsb_create(const uint8_t *buffer, size_t len)
-{
-    QEMUSizedBuffer *qsb;
-    size_t alloc_len, num_chunks, i, to_copy;
-    size_t chunk_size = (len > QSB_MAX_CHUNK_SIZE)
-                        ? QSB_MAX_CHUNK_SIZE
-                        : QSB_CHUNK_SIZE;
-
-    num_chunks = DIV_ROUND_UP(len ? len : QSB_CHUNK_SIZE, chunk_size);
-    alloc_len = num_chunks * chunk_size;
-
-    qsb = g_try_new0(QEMUSizedBuffer, 1);
-    if (!qsb) {
-        return NULL;
-    }
-
-    qsb->iov = g_try_new0(struct iovec, num_chunks);
-    if (!qsb->iov) {
-        g_free(qsb);
-        return NULL;
-    }
-
-    qsb->n_iov = num_chunks;
-
-    for (i = 0; i < num_chunks; i++) {
-        qsb->iov[i].iov_base = g_try_malloc0(chunk_size);
-        if (!qsb->iov[i].iov_base) {
-            /* qsb_free is safe since g_free can cope with NULL */
-            qsb_free(qsb);
-            return NULL;
-        }
-
-        qsb->iov[i].iov_len = chunk_size;
-        if (buffer) {
-            to_copy = (len - qsb->used) > chunk_size
-                      ? chunk_size : (len - qsb->used);
-            memcpy(qsb->iov[i].iov_base, &buffer[qsb->used], to_copy);
-            qsb->used += to_copy;
-        }
-    }
-
-    qsb->size = alloc_len;
-
-    return qsb;
-}
-
-/**
- * Free the QEMUSizedBuffer
- *
- * @qsb: The QEMUSizedBuffer to free
- */
-void qsb_free(QEMUSizedBuffer *qsb)
-{
-    size_t i;
-
-    if (!qsb) {
-        return;
-    }
-
-    for (i = 0; i < qsb->n_iov; i++) {
-        g_free(qsb->iov[i].iov_base);
-    }
-    g_free(qsb->iov);
-    g_free(qsb);
-}
-
-/**
- * Get the number of used bytes in the QEMUSizedBuffer
- *
- * @qsb: A QEMUSizedBuffer
- *
- * Returns the number of bytes currently used in this buffer
- */
-size_t qsb_get_length(const QEMUSizedBuffer *qsb)
-{
-    return qsb->used;
-}
-
-/**
- * Set the length of the buffer; the primary usage of this
- * function is to truncate the number of used bytes in the buffer.
- * The size will not be extended beyond the current number of
- * allocated bytes in the QEMUSizedBuffer.
- *
- * @qsb: A QEMUSizedBuffer
- * @new_len: The new length of bytes in the buffer
- *
- * Returns the number of bytes the buffer was truncated or extended
- * to.
- */
-size_t qsb_set_length(QEMUSizedBuffer *qsb, size_t new_len)
-{
-    if (new_len <= qsb->size) {
-        qsb->used = new_len;
-    } else {
-        qsb->used = qsb->size;
-    }
-    return qsb->used;
-}
-
-/**
- * Get the iovec that holds the data for a given position @pos.
- *
- * @qsb: A QEMUSizedBuffer
- * @pos: The index of a byte in the buffer
- * @d_off: Pointer to an offset that this function will indicate
- *         at what position within the returned iovec the byte
- *         is to be found
- *
- * Returns the index of the iovec that holds the byte at the given
- * index @pos in the byte stream; a negative number if the iovec
- * for the given position @pos does not exist.
- */
-static ssize_t qsb_get_iovec(const QEMUSizedBuffer *qsb,
-                             off_t pos, off_t *d_off)
-{
-    ssize_t i;
-    off_t curr = 0;
-
-    if (pos > qsb->used) {
-        return -1;
-    }
-
-    for (i = 0; i < qsb->n_iov; i++) {
-        if (curr + qsb->iov[i].iov_len > pos) {
-            *d_off = pos - curr;
-            return i;
-        }
-        curr += qsb->iov[i].iov_len;
-    }
-    return -1;
-}
-
-/*
- * Convert the QEMUSizedBuffer into a flat buffer.
- *
- * Note: If at all possible, try to avoid this function since it
- *       may unnecessarily copy memory around.
- *
- * @qsb: pointer to QEMUSizedBuffer
- * @start: offset to start at
- * @count: number of bytes to copy
- * @buf: a pointer to a buffer to write into (at least @count bytes)
- *
- * Returns the number of bytes copied into the output buffer
- */
-ssize_t qsb_get_buffer(const QEMUSizedBuffer *qsb, off_t start,
-                       size_t count, uint8_t *buffer)
-{
-    const struct iovec *iov;
-    size_t to_copy, all_copy;
-    ssize_t index;
-    off_t s_off;
-    off_t d_off = 0;
-    char *s;
-
-    if (start > qsb->used) {
-        return 0;
-    }
-
-    all_copy = qsb->used - start;
-    if (all_copy > count) {
-        all_copy = count;
-    } else {
-        count = all_copy;
-    }
-
-    index = qsb_get_iovec(qsb, start, &s_off);
-    if (index < 0) {
-        return 0;
-    }
-
-    while (all_copy > 0) {
-        iov = &qsb->iov[index];
-
-        s = iov->iov_base;
-
-        to_copy = iov->iov_len - s_off;
-        if (to_copy > all_copy) {
-            to_copy = all_copy;
-        }
-        memcpy(&buffer[d_off], &s[s_off], to_copy);
-
-        d_off += to_copy;
-        all_copy -= to_copy;
-
-        s_off = 0;
-        index++;
-    }
-
-    return count;
-}
-
-/**
- * Grow the QEMUSizedBuffer to the given size and allocate
- * memory for it.
- *
- * @qsb: A QEMUSizedBuffer
- * @new_size: The new size of the buffer
- *
- * Return:
- *    a negative error code in case of memory allocation failure
- * or
- *    the new size of the buffer. The returned size may be greater or equal
- *    to @new_size.
- */
-static ssize_t qsb_grow(QEMUSizedBuffer *qsb, size_t new_size)
-{
-    size_t needed_chunks, i;
-
-    if (qsb->size < new_size) {
-        struct iovec *new_iov;
-        size_t size_diff = new_size - qsb->size;
-        size_t chunk_size = (size_diff > QSB_MAX_CHUNK_SIZE)
-                             ? QSB_MAX_CHUNK_SIZE : QSB_CHUNK_SIZE;
-
-        needed_chunks = DIV_ROUND_UP(size_diff, chunk_size);
-
-        new_iov = g_try_new(struct iovec, qsb->n_iov + needed_chunks);
-        if (new_iov == NULL) {
-            return -ENOMEM;
-        }
-
-        /* Allocate new chunks as needed into new_iov */
-        for (i = qsb->n_iov; i < qsb->n_iov + needed_chunks; i++) {
-            new_iov[i].iov_base = g_try_malloc0(chunk_size);
-            new_iov[i].iov_len = chunk_size;
-            if (!new_iov[i].iov_base) {
-                size_t j;
-
-                /* Free previously allocated new chunks */
-                for (j = qsb->n_iov; j < i; j++) {
-                    g_free(new_iov[j].iov_base);
-                }
-                g_free(new_iov);
-
-                return -ENOMEM;
-            }
-        }
-
-        /*
-         * Now we can't get any allocation errors, copy over to new iov
-         * and switch.
-         */
-        for (i = 0; i < qsb->n_iov; i++) {
-            new_iov[i] = qsb->iov[i];
-        }
-
-        qsb->n_iov += needed_chunks;
-        g_free(qsb->iov);
-        qsb->iov = new_iov;
-        qsb->size += (needed_chunks * chunk_size);
-    }
-
-    return qsb->size;
-}
-
-/**
- * Write into the QEMUSizedBuffer at a given position and a given
- * number of bytes. This function will automatically grow the
- * QEMUSizedBuffer.
- *
- * @qsb: A QEMUSizedBuffer
- * @source: A byte array to copy data from
- * @pos: The position within the @qsb to write data to
- * @size: The number of bytes to copy into the @qsb
- *
- * Returns @size or a negative error code in case of memory allocation failure,
- *           or with an invalid 'pos'
- */
-ssize_t qsb_write_at(QEMUSizedBuffer *qsb, const uint8_t *source,
-                     off_t pos, size_t count)
-{
-    ssize_t rc = qsb_grow(qsb, pos + count);
-    size_t to_copy;
-    size_t all_copy = count;
-    const struct iovec *iov;
-    ssize_t index;
-    char *dest;
-    off_t d_off, s_off = 0;
-
-    if (rc < 0) {
-        return rc;
-    }
-
-    if (pos + count > qsb->used) {
-        qsb->used = pos + count;
-    }
-
-    index = qsb_get_iovec(qsb, pos, &d_off);
-    if (index < 0) {
-        return -EINVAL;
-    }
-
-    while (all_copy > 0) {
-        iov = &qsb->iov[index];
-
-        dest = iov->iov_base;
-
-        to_copy = iov->iov_len - d_off;
-        if (to_copy > all_copy) {
-            to_copy = all_copy;
-        }
-
-        memcpy(&dest[d_off], &source[s_off], to_copy);
-
-        s_off += to_copy;
-        all_copy -= to_copy;
-
-        d_off = 0;
-        index++;
-    }
-
-    return count;
-}
-
-typedef struct QEMUBuffer {
-    QEMUSizedBuffer *qsb;
-    QEMUFile *file;
-    bool qsb_allocated;
-} QEMUBuffer;
-
-static ssize_t buf_get_buffer(void *opaque, uint8_t *buf, int64_t pos,
-                              size_t size)
-{
-    QEMUBuffer *s = opaque;
-    ssize_t len = qsb_get_length(s->qsb) - pos;
-
-    if (len <= 0) {
-        return 0;
-    }
-
-    if (len > size) {
-        len = size;
-    }
-    return qsb_get_buffer(s->qsb, pos, len, buf);
-}
-
-static ssize_t buf_put_buffer(void *opaque, const uint8_t *buf,
-                              int64_t pos, size_t size)
-{
-    QEMUBuffer *s = opaque;
-
-    return qsb_write_at(s->qsb, buf, pos, size);
-}
-
-static int buf_close(void *opaque)
-{
-    QEMUBuffer *s = opaque;
-
-    if (s->qsb_allocated) {
-        qsb_free(s->qsb);
-    }
-
-    g_free(s);
-
-    return 0;
-}
-
-const QEMUSizedBuffer *qemu_buf_get(QEMUFile *f)
-{
-    QEMUBuffer *p;
-
-    qemu_fflush(f);
-
-    p = f->opaque;
-
-    return p->qsb;
-}
-
-static const QEMUFileOps buf_read_ops = {
-    .get_buffer = buf_get_buffer,
-    .close =      buf_close,
-};
-
-static const QEMUFileOps buf_write_ops = {
-    .put_buffer = buf_put_buffer,
-    .close =      buf_close,
-};
-
-QEMUFile *qemu_bufopen(const char *mode, QEMUSizedBuffer *input)
-{
-    QEMUBuffer *s;
-
-    if (mode == NULL || (mode[0] != 'r' && mode[0] != 'w') ||
-        mode[1] != '\0') {
-        error_report("qemu_bufopen: Argument validity check failed");
-        return NULL;
-    }
-
-    s = g_new0(QEMUBuffer, 1);
-    s->qsb = input;
-
-    if (s->qsb == NULL) {
-        s->qsb = qsb_create(NULL, 0);
-        s->qsb_allocated = true;
-    }
-    if (!s->qsb) {
-        g_free(s);
-        error_report("qemu_bufopen: qsb_create failed");
-        return NULL;
-    }
-
-
-    if (mode[0] == 'r') {
-        s->file = qemu_fopen_ops(s, &buf_read_ops);
-    } else {
-        s->file = qemu_fopen_ops(s, &buf_write_ops);
-    }
-    return s->file;
-}
diff --git a/migration/qemu-file-channel.c b/migration/qemu-file-channel.c
new file mode 100644
index 0000000000..45c13f1028
--- /dev/null
+++ b/migration/qemu-file-channel.c
@@ -0,0 +1,180 @@
+/*
+ * QEMUFile backend for QIOChannel objects
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "migration/qemu-file.h"
+#include "io/channel-socket.h"
+#include "qemu/iov.h"
+
+
+/*
+ * QEMUFileOps.writev_buffer: push every byte described by @iov out over the
+ * channel. A private copy of the vector is advanced as data goes out, so
+ * partial writes are resumed where they stopped. @pos is unused. Returns
+ * the number of bytes written, or -EIO if the channel reports an error.
+ */
+static ssize_t channel_writev_buffer(void *opaque, struct iovec *iov,
+                                     int iovcnt, int64_t pos)
+{
+    QIOChannel *ioc = QIO_CHANNEL(opaque);
+    struct iovec *scratch = g_new(struct iovec, iovcnt);
+    struct iovec *cursor = scratch;
+    unsigned int remaining;
+    ssize_t total = 0;
+
+    remaining = iov_copy(cursor, iovcnt, iov, iovcnt,
+                         0, iov_size(iov, iovcnt));
+
+    while (remaining > 0) {
+        ssize_t sent = qio_channel_writev(ioc, cursor, remaining, NULL);
+        if (sent == QIO_CHANNEL_ERR_BLOCK) {
+            /* Would block: wait on G_IO_OUT, then retry the write */
+            qio_channel_wait(ioc, G_IO_OUT);
+        } else if (sent < 0) {
+            /* XXX handle Error objects */
+            total = -EIO;
+            break;
+        } else {
+            iov_discard_front(&cursor, &remaining, sent);
+            total += sent;
+        }
+    }
+
+    g_free(scratch);
+    return total;
+}
+
+
+/*
+ * QEMUFileOps.get_buffer: read up to @size bytes into @buf, yielding while
+ * the channel would block. @pos is unused. Returns -EIO on a read error.
+ */
+static ssize_t channel_get_buffer(void *opaque, uint8_t *buf,
+                                  int64_t pos, size_t size)
+{
+    QIOChannel *ioc = QIO_CHANNEL(opaque);
+
+    for (;;) {
+        ssize_t nread = qio_channel_read(ioc, (char *)buf, size, NULL);
+
+        if (nread >= 0) {
+            return nread;
+        }
+        if (nread != QIO_CHANNEL_ERR_BLOCK) {
+            /* XXX handle Error * object */
+            return -EIO;
+        }
+        qio_channel_yield(ioc, G_IO_IN);
+    }
+}
+
+/* QEMUFileOps.close: close the channel, drop our ref; -EIO if close fails */
+static int channel_close(void *opaque)
+{
+    QIOChannel *ioc = QIO_CHANNEL(opaque);
+    int ret = qio_channel_close(ioc, NULL) < 0 ? -EIO : 0;
+    object_unref(OBJECT(ioc));
+    return ret;
+}
+
+
+/*
+ * QEMUFileOps.shut_down: shut down the read and/or write direction of the
+ * channel. Channels lacking QIO_CHANNEL_FEATURE_SHUTDOWN are left alone and
+ * 0 is returned. Returns -EIO if the shutdown itself fails.
+ */
+static int channel_shutdown(void *opaque, bool rd, bool wr)
+{
+    QIOChannel *ioc = QIO_CHANNEL(opaque);
+    QIOChannelShutdown how;
+
+    if (!qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_SHUTDOWN)) {
+        return 0;
+    }
+
+    /* !rd always selects WRITE, even when wr is false as well */
+    how = !rd ? QIO_CHANNEL_SHUTDOWN_WRITE :
+          wr ? QIO_CHANNEL_SHUTDOWN_BOTH : QIO_CHANNEL_SHUTDOWN_READ;
+    if (qio_channel_shutdown(ioc, how, NULL) < 0) {
+        /* XXX handle Error * object */
+        return -EIO;
+    }
+    return 0;
+}
+
+
+/* QEMUFileOps.set_blocking: switch blocking mode; 0 on success, else -EIO */
+static int channel_set_blocking(void *opaque, bool enabled)
+{
+    QIOChannel *ioc = QIO_CHANNEL(opaque);
+
+    if (qio_channel_set_blocking(ioc, enabled, NULL) < 0) {
+        return -EIO; /* negative errno, matching the sibling callbacks */
+    }
+    return 0;
+}
+
+/* Return path of a reading QEMUFile: a writer on the same channel */
+static QEMUFile *channel_get_input_return_path(void *opaque)
+{
+    /* qemu_fopen_channel_output() takes its own reference on the channel */
+    return qemu_fopen_channel_output(QIO_CHANNEL(opaque));
+}
+
+/* Return path of a writing QEMUFile: a reader on the same channel */
+static QEMUFile *channel_get_output_return_path(void *opaque)
+{
+    /* qemu_fopen_channel_input() takes its own reference on the channel */
+    return qemu_fopen_channel_input(QIO_CHANNEL(opaque));
+}
+
+static const QEMUFileOps channel_input_ops = {
+    .get_buffer = channel_get_buffer, /* no writev_buffer: not writable */
+    .close = channel_close,
+    .shut_down = channel_shutdown,
+    .set_blocking = channel_set_blocking,
+    .get_return_path = channel_get_input_return_path, /* yields a writer */
+};
+
+
+static const QEMUFileOps channel_output_ops = {
+    .writev_buffer = channel_writev_buffer, /* makes the file writable */
+    .close = channel_close,
+    .shut_down = channel_shutdown,
+    .set_blocking = channel_set_blocking,
+    .get_return_path = channel_get_output_return_path, /* yields a reader */
+};
+
+
+QEMUFile *qemu_fopen_channel_input(QIOChannel *ioc)
+{
+    object_ref(OBJECT(ioc)); /* dropped again by channel_close() */
+    return qemu_fopen_ops(ioc, &channel_input_ops);
+}
+
+QEMUFile *qemu_fopen_channel_output(QIOChannel *ioc)
+{
+    object_ref(OBJECT(ioc)); /* dropped again by channel_close() */
+    return qemu_fopen_ops(ioc, &channel_output_ops);
+}
diff --git a/migration/qemu-file-internal.h b/migration/qemu-file-internal.h
deleted file mode 100644
index d95e8538e7..0000000000
--- a/migration/qemu-file-internal.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * QEMU System Emulator
- *
- * Copyright (c) 2003-2008 Fabrice Bellard
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-#ifndef QEMU_FILE_INTERNAL_H
-#define QEMU_FILE_INTERNAL_H 1
-
-#include "qemu-common.h"
-#include "qemu/iov.h"
-
-#define IO_BUF_SIZE 32768
-#define MAX_IOV_SIZE MIN(IOV_MAX, 64)
-
-struct QEMUFile {
-    const QEMUFileOps *ops;
-    void *opaque;
-
-    int64_t bytes_xfer;
-    int64_t xfer_limit;
-
-    int64_t pos; /* start of buffer when writing, end of buffer
-                    when reading */
-    int buf_index;
-    int buf_size; /* 0 when writing */
-    uint8_t buf[IO_BUF_SIZE];
-
-    struct iovec iov[MAX_IOV_SIZE];
-    unsigned int iovcnt;
-
-    int last_error;
-};
-
-#endif
diff --git a/migration/qemu-file-stdio.c b/migration/qemu-file-stdio.c
deleted file mode 100644
index f402e8f708..0000000000
--- a/migration/qemu-file-stdio.c
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
- * QEMU System Emulator
- *
- * Copyright (c) 2003-2008 Fabrice Bellard
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#include "qemu/osdep.h"
-#include "qemu-common.h"
-#include "qemu/coroutine.h"
-#include "migration/qemu-file.h"
-
-typedef struct QEMUFileStdio {
-    FILE *stdio_file;
-    QEMUFile *file;
-} QEMUFileStdio;
-
-static int stdio_get_fd(void *opaque)
-{
-    QEMUFileStdio *s = opaque;
-
-    return fileno(s->stdio_file);
-}
-
-static ssize_t stdio_put_buffer(void *opaque, const uint8_t *buf, int64_t pos,
-                                size_t size)
-{
-    QEMUFileStdio *s = opaque;
-    size_t res;
-
-    res = fwrite(buf, 1, size, s->stdio_file);
-
-    if (res != size) {
-        return -errno;
-    }
-    return res;
-}
-
-static ssize_t stdio_get_buffer(void *opaque, uint8_t *buf, int64_t pos,
-                                size_t size)
-{
-    QEMUFileStdio *s = opaque;
-    FILE *fp = s->stdio_file;
-    ssize_t bytes;
-
-    for (;;) {
-        clearerr(fp);
-        bytes = fread(buf, 1, size, fp);
-        if (bytes != 0 || !ferror(fp)) {
-            break;
-        }
-        if (errno == EAGAIN) {
-            yield_until_fd_readable(fileno(fp));
-        } else if (errno != EINTR) {
-            break;
-        }
-    }
-    return bytes;
-}
-
-static int stdio_pclose(void *opaque)
-{
-    QEMUFileStdio *s = opaque;
-    int ret;
-    ret = pclose(s->stdio_file);
-    if (ret == -1) {
-        ret = -errno;
-    } else if (!WIFEXITED(ret) || WEXITSTATUS(ret) != 0) {
-        /* close succeeded, but non-zero exit code: */
-        ret = -EIO; /* fake errno value */
-    }
-    g_free(s);
-    return ret;
-}
-
-static int stdio_fclose(void *opaque)
-{
-    QEMUFileStdio *s = opaque;
-    int ret = 0;
-
-    if (qemu_file_is_writable(s->file)) {
-        int fd = fileno(s->stdio_file);
-        struct stat st;
-
-        ret = fstat(fd, &st);
-        if (ret == 0 && S_ISREG(st.st_mode)) {
-            /*
-             * If the file handle is a regular file make sure the
-             * data is flushed to disk before signaling success.
-             */
-            ret = fsync(fd);
-            if (ret != 0) {
-                ret = -errno;
-                return ret;
-            }
-        }
-    }
-    if (fclose(s->stdio_file) == EOF) {
-        ret = -errno;
-    }
-    g_free(s);
-    return ret;
-}
-
-static const QEMUFileOps stdio_pipe_read_ops = {
-    .get_fd =     stdio_get_fd,
-    .get_buffer = stdio_get_buffer,
-    .close =      stdio_pclose
-};
-
-static const QEMUFileOps stdio_pipe_write_ops = {
-    .get_fd =     stdio_get_fd,
-    .put_buffer = stdio_put_buffer,
-    .close =      stdio_pclose
-};
-
-QEMUFile *qemu_popen_cmd(const char *command, const char *mode)
-{
-    FILE *stdio_file;
-    QEMUFileStdio *s;
-
-    if (mode == NULL || (mode[0] != 'r' && mode[0] != 'w') || mode[1] != 0) {
-        fprintf(stderr, "qemu_popen: Argument validity check failed\n");
-        return NULL;
-    }
-
-    stdio_file = popen(command, mode);
-    if (stdio_file == NULL) {
-        return NULL;
-    }
-
-    s = g_new0(QEMUFileStdio, 1);
-
-    s->stdio_file = stdio_file;
-
-    if (mode[0] == 'r') {
-        s->file = qemu_fopen_ops(s, &stdio_pipe_read_ops);
-    } else {
-        s->file = qemu_fopen_ops(s, &stdio_pipe_write_ops);
-    }
-    return s->file;
-}
-
-static const QEMUFileOps stdio_file_read_ops = {
-    .get_fd =     stdio_get_fd,
-    .get_buffer = stdio_get_buffer,
-    .close =      stdio_fclose
-};
-
-static const QEMUFileOps stdio_file_write_ops = {
-    .get_fd =     stdio_get_fd,
-    .put_buffer = stdio_put_buffer,
-    .close =      stdio_fclose
-};
-
-QEMUFile *qemu_fopen(const char *filename, const char *mode)
-{
-    QEMUFileStdio *s;
-
-    if (qemu_file_mode_is_not_valid(mode)) {
-        return NULL;
-    }
-
-    s = g_new0(QEMUFileStdio, 1);
-
-    s->stdio_file = fopen(filename, mode);
-    if (!s->stdio_file) {
-        goto fail;
-    }
-
-    if (mode[0] == 'w') {
-        s->file = qemu_fopen_ops(s, &stdio_file_write_ops);
-    } else {
-        s->file = qemu_fopen_ops(s, &stdio_file_read_ops);
-    }
-    return s->file;
-fail:
-    g_free(s);
-    return NULL;
-}
diff --git a/migration/qemu-file-unix.c b/migration/qemu-file-unix.c
deleted file mode 100644
index 4474e18ff8..0000000000
--- a/migration/qemu-file-unix.c
+++ /dev/null
@@ -1,323 +0,0 @@
-/*
- * QEMU System Emulator
- *
- * Copyright (c) 2003-2008 Fabrice Bellard
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#include "qemu/osdep.h"
-#include "qemu-common.h"
-#include "qemu/error-report.h"
-#include "qemu/iov.h"
-#include "qemu/sockets.h"
-#include "qemu/coroutine.h"
-#include "migration/qemu-file.h"
-#include "migration/qemu-file-internal.h"
-
-typedef struct QEMUFileSocket {
-    int fd;
-    QEMUFile *file;
-} QEMUFileSocket;
-
-static ssize_t socket_writev_buffer(void *opaque, struct iovec *iov, int iovcnt,
-                                    int64_t pos)
-{
-    QEMUFileSocket *s = opaque;
-    ssize_t len;
-    ssize_t size = iov_size(iov, iovcnt);
-    ssize_t offset = 0;
-    int     err;
-
-    while (size > 0) {
-        len = iov_send(s->fd, iov, iovcnt, offset, size);
-
-        if (len > 0) {
-            size -= len;
-            offset += len;
-        }
-
-        if (size > 0) {
-            if (errno != EAGAIN && errno != EWOULDBLOCK) {
-                error_report("socket_writev_buffer: Got err=%d for (%zu/%zu)",
-                             errno, (size_t)size, (size_t)len);
-                /*
-                 * If I've already sent some but only just got the error, I
-                 * could return the amount validly sent so far and wait for the
-                 * next call to report the error, but I'd rather flag the error
-                 * immediately.
-                 */
-                return -errno;
-            }
-
-            /* Emulate blocking */
-            GPollFD pfd;
-
-            pfd.fd = s->fd;
-            pfd.events = G_IO_OUT | G_IO_ERR;
-            pfd.revents = 0;
-            TFR(err = g_poll(&pfd, 1, -1 /* no timeout */));
-            /* Errors other than EINTR intentionally ignored */
-        }
-     }
-
-    return offset;
-}
-
-static int socket_get_fd(void *opaque)
-{
-    QEMUFileSocket *s = opaque;
-
-    return s->fd;
-}
-
-static ssize_t socket_get_buffer(void *opaque, uint8_t *buf, int64_t pos,
-                                 size_t size)
-{
-    QEMUFileSocket *s = opaque;
-    ssize_t len;
-
-    for (;;) {
-        len = qemu_recv(s->fd, buf, size, 0);
-        if (len != -1) {
-            break;
-        }
-        if (errno == EAGAIN) {
-            yield_until_fd_readable(s->fd);
-        } else if (errno != EINTR) {
-            break;
-        }
-    }
-
-    if (len == -1) {
-        len = -errno;
-    }
-    return len;
-}
-
-static int socket_close(void *opaque)
-{
-    QEMUFileSocket *s = opaque;
-    closesocket(s->fd);
-    g_free(s);
-    return 0;
-}
-
-static int socket_shutdown(void *opaque, bool rd, bool wr)
-{
-    QEMUFileSocket *s = opaque;
-
-    if (shutdown(s->fd, rd ? (wr ? SHUT_RDWR : SHUT_RD) : SHUT_WR)) {
-        return -errno;
-    } else {
-        return 0;
-    }
-}
-
-static int socket_return_close(void *opaque)
-{
-    QEMUFileSocket *s = opaque;
-    /*
-     * Note: We don't close the socket, that should be done by the forward
-     * path.
-     */
-    g_free(s);
-    return 0;
-}
-
-static const QEMUFileOps socket_return_read_ops = {
-    .get_fd          = socket_get_fd,
-    .get_buffer      = socket_get_buffer,
-    .close           = socket_return_close,
-    .shut_down       = socket_shutdown,
-};
-
-static const QEMUFileOps socket_return_write_ops = {
-    .get_fd          = socket_get_fd,
-    .writev_buffer   = socket_writev_buffer,
-    .close           = socket_return_close,
-    .shut_down       = socket_shutdown,
-};
-
-/*
- * Give a QEMUFile* off the same socket but data in the opposite
- * direction.
- */
-static QEMUFile *socket_get_return_path(void *opaque)
-{
-    QEMUFileSocket *forward = opaque;
-    QEMUFileSocket *reverse;
-
-    if (qemu_file_get_error(forward->file)) {
-        /* If the forward file is in error, don't try and open a return */
-        return NULL;
-    }
-
-    reverse = g_malloc0(sizeof(QEMUFileSocket));
-    reverse->fd = forward->fd;
-    /* I don't think there's a better way to tell which direction 'this' is */
-    if (forward->file->ops->get_buffer != NULL) {
-        /* being called from the read side, so we need to be able to write */
-        return qemu_fopen_ops(reverse, &socket_return_write_ops);
-    } else {
-        return qemu_fopen_ops(reverse, &socket_return_read_ops);
-    }
-}
-
-static ssize_t unix_writev_buffer(void *opaque, struct iovec *iov, int iovcnt,
-                                  int64_t pos)
-{
-    QEMUFileSocket *s = opaque;
-    ssize_t len, offset;
-    ssize_t size = iov_size(iov, iovcnt);
-    ssize_t total = 0;
-
-    assert(iovcnt > 0);
-    offset = 0;
-    while (size > 0) {
-        /* Find the next start position; skip all full-sized vector elements  */
-        while (offset >= iov[0].iov_len) {
-            offset -= iov[0].iov_len;
-            iov++, iovcnt--;
-        }
-
-        /* skip `offset' bytes from the (now) first element, undo it on exit */
-        assert(iovcnt > 0);
-        iov[0].iov_base += offset;
-        iov[0].iov_len -= offset;
-
-        do {
-            len = writev(s->fd, iov, iovcnt);
-        } while (len == -1 && errno == EINTR);
-        if (len == -1) {
-            return -errno;
-        }
-
-        /* Undo the changes above */
-        iov[0].iov_base -= offset;
-        iov[0].iov_len += offset;
-
-        /* Prepare for the next iteration */
-        offset += len;
-        total += len;
-        size -= len;
-    }
-
-    return total;
-}
-
-static ssize_t unix_get_buffer(void *opaque, uint8_t *buf, int64_t pos,
-                              size_t size)
-{
-    QEMUFileSocket *s = opaque;
-    ssize_t len;
-
-    for (;;) {
-        len = read(s->fd, buf, size);
-        if (len != -1) {
-            break;
-        }
-        if (errno == EAGAIN) {
-            yield_until_fd_readable(s->fd);
-        } else if (errno != EINTR) {
-            break;
-        }
-    }
-
-    if (len == -1) {
-        len = -errno;
-    }
-    return len;
-}
-
-static int unix_close(void *opaque)
-{
-    QEMUFileSocket *s = opaque;
-    close(s->fd);
-    g_free(s);
-    return 0;
-}
-
-static const QEMUFileOps unix_read_ops = {
-    .get_fd =     socket_get_fd,
-    .get_buffer = unix_get_buffer,
-    .close =      unix_close
-};
-
-static const QEMUFileOps unix_write_ops = {
-    .get_fd =     socket_get_fd,
-    .writev_buffer = unix_writev_buffer,
-    .close =      unix_close
-};
-
-QEMUFile *qemu_fdopen(int fd, const char *mode)
-{
-    QEMUFileSocket *s;
-
-    if (mode == NULL ||
-        (mode[0] != 'r' && mode[0] != 'w') ||
-        mode[1] != 'b' || mode[2] != 0) {
-        fprintf(stderr, "qemu_fdopen: Argument validity check failed\n");
-        return NULL;
-    }
-
-    s = g_new0(QEMUFileSocket, 1);
-    s->fd = fd;
-
-    if (mode[0] == 'r') {
-        s->file = qemu_fopen_ops(s, &unix_read_ops);
-    } else {
-        s->file = qemu_fopen_ops(s, &unix_write_ops);
-    }
-    return s->file;
-}
-
-static const QEMUFileOps socket_read_ops = {
-    .get_fd          = socket_get_fd,
-    .get_buffer      = socket_get_buffer,
-    .close           = socket_close,
-    .shut_down       = socket_shutdown,
-    .get_return_path = socket_get_return_path
-};
-
-static const QEMUFileOps socket_write_ops = {
-    .get_fd          = socket_get_fd,
-    .writev_buffer   = socket_writev_buffer,
-    .close           = socket_close,
-    .shut_down       = socket_shutdown,
-    .get_return_path = socket_get_return_path
-};
-
-QEMUFile *qemu_fopen_socket(int fd, const char *mode)
-{
-    QEMUFileSocket *s;
-
-    if (qemu_file_mode_is_not_valid(mode)) {
-        return NULL;
-    }
-
-    s = g_new0(QEMUFileSocket, 1);
-    s->fd = fd;
-    if (mode[0] == 'w') {
-        qemu_set_block(s->fd);
-        s->file = qemu_fopen_ops(s, &socket_write_ops);
-    } else {
-        s->file = qemu_fopen_ops(s, &socket_read_ops);
-    }
-    return s->file;
-}
diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index 6f4a1299b3..8aea1c7094 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -30,9 +30,31 @@
 #include "qemu/coroutine.h"
 #include "migration/migration.h"
 #include "migration/qemu-file.h"
-#include "migration/qemu-file-internal.h"
 #include "trace.h"
 
+#define IO_BUF_SIZE 32768
+#define MAX_IOV_SIZE MIN(IOV_MAX, 64)
+
+struct QEMUFile {
+    const QEMUFileOps *ops; /* backend callbacks (writev_buffer, ...) */
+    const QEMUFileHooks *hooks; /* optional; see qemu_file_set_hooks() */
+    void *opaque; /* handed back to the ops and hooks callbacks */
+
+    int64_t bytes_xfer; /* bumped by qemu_put_buffer()/qemu_put_byte() */
+    int64_t xfer_limit;
+
+    int64_t pos; /* start of buffer when writing, end of buffer
+                    when reading */
+    int buf_index;
+    int buf_size; /* 0 when writing */
+    uint8_t buf[IO_BUF_SIZE];
+
+    struct iovec iov[MAX_IOV_SIZE]; /* writes queued for qemu_fflush() */
+    unsigned int iovcnt; /* entries used in iov[]; reset by qemu_fflush() */
+
+    int last_error; /* non-zero makes qemu_put_buffer_async() a no-op */
+};
+
 /*
  * Stop a file from being read/written - not all backing files can do this
  * typically only sockets can.
@@ -80,6 +102,12 @@ QEMUFile *qemu_fopen_ops(void *opaque, const QEMUFileOps *ops)
     return f;
 }
 
+
+void qemu_file_set_hooks(QEMUFile *f, const QEMUFileHooks *hooks)
+{
+    f->hooks = hooks; /* consulted by the ram_control_*() helpers */
+}
+
 /*
  * Get last error for stream f
  *
@@ -101,48 +129,49 @@ void qemu_file_set_error(QEMUFile *f, int ret)
 
 bool qemu_file_is_writable(QEMUFile *f)
 {
-    return f->ops->writev_buffer || f->ops->put_buffer;
+    return f->ops->writev_buffer;
 }
 
 /**
  * Flushes QEMUFile buffer
  *
  * If there is writev_buffer QEMUFileOps it uses it otherwise uses
- * put_buffer ops.
+ * nothing: put_buffer ops are no longer consulted. This flushes all
+ * pending data; a partial flush puts the file into an error state.
  */
 void qemu_fflush(QEMUFile *f)
 {
     ssize_t ret = 0;
+    ssize_t expect = 0;
 
     if (!qemu_file_is_writable(f)) {
         return;
     }
 
-    if (f->ops->writev_buffer) {
-        if (f->iovcnt > 0) {
-            ret = f->ops->writev_buffer(f->opaque, f->iov, f->iovcnt, f->pos);
-        }
-    } else {
-        if (f->buf_index > 0) {
-            ret = f->ops->put_buffer(f->opaque, f->buf, f->pos, f->buf_index);
-        }
+    if (f->iovcnt > 0) {
+        expect = iov_size(f->iov, f->iovcnt);
+        ret = f->ops->writev_buffer(f->opaque, f->iov, f->iovcnt, f->pos);
     }
+
     if (ret >= 0) {
         f->pos += ret;
     }
+    /* We expect the QEMUFile write impl to send the full
+     * data set we requested, so sanity check that.
+     */
+    if (ret != expect) {
+        qemu_file_set_error(f, ret < 0 ? ret : -EIO);
+    }
     f->buf_index = 0;
     f->iovcnt = 0;
-    if (ret < 0) {
-        qemu_file_set_error(f, ret);
-    }
 }
 
 void ram_control_before_iterate(QEMUFile *f, uint64_t flags)
 {
     int ret = 0;
 
-    if (f->ops->before_ram_iterate) {
-        ret = f->ops->before_ram_iterate(f, f->opaque, flags, NULL);
+    if (f->hooks && f->hooks->before_ram_iterate) {
+        ret = f->hooks->before_ram_iterate(f, f->opaque, flags, NULL);
         if (ret < 0) {
             qemu_file_set_error(f, ret);
         }
@@ -153,8 +182,8 @@ void ram_control_after_iterate(QEMUFile *f, uint64_t flags)
 {
     int ret = 0;
 
-    if (f->ops->after_ram_iterate) {
-        ret = f->ops->after_ram_iterate(f, f->opaque, flags, NULL);
+    if (f->hooks && f->hooks->after_ram_iterate) {
+        ret = f->hooks->after_ram_iterate(f, f->opaque, flags, NULL);
         if (ret < 0) {
             qemu_file_set_error(f, ret);
         }
@@ -165,8 +194,8 @@ void ram_control_load_hook(QEMUFile *f, uint64_t flags, void *data)
 {
     int ret = -EINVAL;
 
-    if (f->ops->hook_ram_load) {
-        ret = f->ops->hook_ram_load(f, f->opaque, flags, data);
+    if (f->hooks && f->hooks->hook_ram_load) {
+        ret = f->hooks->hook_ram_load(f, f->opaque, flags, data);
         if (ret < 0) {
             qemu_file_set_error(f, ret);
         }
@@ -185,9 +214,9 @@ size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset,
                              ram_addr_t offset, size_t size,
                              uint64_t *bytes_sent)
 {
-    if (f->ops->save_page) {
-        int ret = f->ops->save_page(f, f->opaque, block_offset,
-                                    offset, size, bytes_sent);
+    if (f->hooks && f->hooks->save_page) {
+        int ret = f->hooks->save_page(f, f->opaque, block_offset,
+                                      offset, size, bytes_sent);
 
         if (ret != RAM_SAVE_CONTROL_DELAYED) {
             if (bytes_sent && *bytes_sent > 0) {
@@ -239,14 +268,6 @@ static ssize_t qemu_fill_buffer(QEMUFile *f)
     return len;
 }
 
-int qemu_get_fd(QEMUFile *f)
-{
-    if (f->ops->get_fd) {
-        return f->ops->get_fd(f->opaque);
-    }
-    return -1;
-}
-
 void qemu_update_position(QEMUFile *f, size_t size)
 {
     f->pos += size;
@@ -301,11 +322,6 @@ static void add_to_iovec(QEMUFile *f, const uint8_t *buf, size_t size)
 
 void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, size_t size)
 {
-    if (!f->ops->writev_buffer) {
-        qemu_put_buffer(f, buf, size);
-        return;
-    }
-
     if (f->last_error) {
         return;
     }
@@ -329,9 +345,7 @@ void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, size_t size)
         }
         memcpy(f->buf + f->buf_index, buf, l);
         f->bytes_xfer += l;
-        if (f->ops->writev_buffer) {
-            add_to_iovec(f, f->buf + f->buf_index, l);
-        }
+        add_to_iovec(f, f->buf + f->buf_index, l);
         f->buf_index += l;
         if (f->buf_index == IO_BUF_SIZE) {
             qemu_fflush(f);
@@ -352,9 +366,7 @@ void qemu_put_byte(QEMUFile *f, int v)
 
     f->buf[f->buf_index] = v;
     f->bytes_xfer++;
-    if (f->ops->writev_buffer) {
-        add_to_iovec(f, f->buf + f->buf_index, 1);
-    }
+    add_to_iovec(f, f->buf + f->buf_index, 1);
     f->buf_index++;
     if (f->buf_index == IO_BUF_SIZE) {
         qemu_fflush(f);
@@ -518,12 +530,8 @@ int64_t qemu_ftell_fast(QEMUFile *f)
     int64_t ret = f->pos;
     int i;
 
-    if (f->ops->writev_buffer) {
-        for (i = 0; i < f->iovcnt; i++) {
-            ret += f->iov[i].iov_len;
-        }
-    } else {
-        ret += f->buf_index;
+    for (i = 0; i < f->iovcnt; i++) {
+        ret += f->iov[i].iov_len;
     }
 
     return ret;
@@ -670,9 +678,7 @@ size_t qemu_get_counted_string(QEMUFile *f, char buf[256])
  */
 void qemu_file_set_blocking(QEMUFile *f, bool block)
 {
-    if (block) {
-        qemu_set_block(qemu_get_fd(f));
-    } else {
-        qemu_set_nonblock(qemu_get_fd(f));
+    if (f->ops->set_blocking) {
+        f->ops->set_blocking(f->opaque, block);
     }
 }
diff --git a/migration/ram.c b/migration/ram.c
index 54e215128c..844ea4694f 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -429,10 +429,8 @@ static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
 static void mig_throttle_guest_down(void)
 {
     MigrationState *s = migrate_get_current();
-    uint64_t pct_initial =
-            s->parameters[MIGRATION_PARAMETER_CPU_THROTTLE_INITIAL];
-    uint64_t pct_icrement =
-            s->parameters[MIGRATION_PARAMETER_CPU_THROTTLE_INCREMENT];
+    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
+    uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
 
     /* We have not started throttling yet. Let's start it. */
     if (!cpu_throttle_active()) {
diff --git a/migration/rdma.c b/migration/rdma.c
index f6a9992b3e..51bafc702b 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -2,10 +2,12 @@
  * RDMA protocol and interfaces
  *
  * Copyright IBM, Corp. 2010-2013
+ * Copyright Red Hat, Inc. 2015-2016
  *
  * Authors:
  *  Michael R. Hines <mrhines@us.ibm.com>
  *  Jiuxing Liu <jl@us.ibm.com>
+ *  Daniel P. Berrange <berrange@redhat.com>
  *
  * This work is licensed under the terms of the GNU GPL, version 2 or
  * later.  See the COPYING file in the top-level directory.
@@ -374,14 +376,20 @@ typedef struct RDMAContext {
     GHashTable *blockmap;
 } RDMAContext;
 
-/*
- * Interface to the rest of the migration call stack.
- */
-typedef struct QEMUFileRDMA {
+#define TYPE_QIO_CHANNEL_RDMA "qio-channel-rdma"
+#define QIO_CHANNEL_RDMA(obj)                                     \
+    OBJECT_CHECK(QIOChannelRDMA, (obj), TYPE_QIO_CHANNEL_RDMA)
+
+typedef struct QIOChannelRDMA QIOChannelRDMA;
+
+
+struct QIOChannelRDMA {
+    QIOChannel parent;
     RDMAContext *rdma;
+    QEMUFile *file;
     size_t len;
-    void *file;
-} QEMUFileRDMA;
+    bool blocking; /* XXX we don't actually honour this yet */
+};
 
 /*
  * Main structure for IB Send/Recv control messages.
@@ -2518,15 +2526,19 @@ static void *qemu_rdma_data_init(const char *host_port, Error **errp)
  * SEND messages for control only.
  * VM's ram is handled with regular RDMA messages.
  */
-static ssize_t qemu_rdma_put_buffer(void *opaque, const uint8_t *buf,
-                                    int64_t pos, size_t size)
-{
-    QEMUFileRDMA *r = opaque;
-    QEMUFile *f = r->file;
-    RDMAContext *rdma = r->rdma;
-    size_t remaining = size;
-    uint8_t * data = (void *) buf;
+static ssize_t qio_channel_rdma_writev(QIOChannel *ioc,
+                                       const struct iovec *iov,
+                                       size_t niov,
+                                       int *fds,
+                                       size_t nfds,
+                                       Error **errp)
+{
+    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
+    QEMUFile *f = rioc->file;
+    RDMAContext *rdma = rioc->rdma;
     int ret;
+    ssize_t done = 0;
+    size_t i;
 
     CHECK_ERROR_STATE();
 
@@ -2540,27 +2552,31 @@ static ssize_t qemu_rdma_put_buffer(void *opaque, const uint8_t *buf,
         return ret;
     }
 
-    while (remaining) {
-        RDMAControlHeader head;
+    for (i = 0; i < niov; i++) {
+        size_t remaining = iov[i].iov_len;
+        uint8_t * data = (void *)iov[i].iov_base;
+        while (remaining) {
+            RDMAControlHeader head;
 
-        r->len = MIN(remaining, RDMA_SEND_INCREMENT);
-        remaining -= r->len;
+            rioc->len = MIN(remaining, RDMA_SEND_INCREMENT);
+            remaining -= rioc->len;
 
-        /* Guaranteed to fit due to RDMA_SEND_INCREMENT MIN above */
-        head.len = (uint32_t)r->len;
-        head.type = RDMA_CONTROL_QEMU_FILE;
+            head.len = rioc->len;
+            head.type = RDMA_CONTROL_QEMU_FILE;
 
-        ret = qemu_rdma_exchange_send(rdma, &head, data, NULL, NULL, NULL);
+            ret = qemu_rdma_exchange_send(rdma, &head, data, NULL, NULL, NULL);
 
-        if (ret < 0) {
-            rdma->error_state = ret;
-            return ret;
-        }
+            if (ret < 0) {
+                rdma->error_state = ret;
+                return ret;
+            }
 
-        data += r->len;
+            data += rioc->len;
+            done += rioc->len;
+        }
     }
 
-    return size;
+    return done;
 }
 
 static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf,
@@ -2585,41 +2601,74 @@ static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf,
  * RDMA links don't use bytestreams, so we have to
  * return bytes to QEMUFile opportunistically.
  */
-static ssize_t qemu_rdma_get_buffer(void *opaque, uint8_t *buf,
-                                    int64_t pos, size_t size)
-{
-    QEMUFileRDMA *r = opaque;
-    RDMAContext *rdma = r->rdma;
+static ssize_t qio_channel_rdma_readv(QIOChannel *ioc,
+                                      const struct iovec *iov,
+                                      size_t niov,
+                                      int **fds,
+                                      size_t *nfds,
+                                      Error **errp)
+{
+    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
+    RDMAContext *rdma = rioc->rdma;
     RDMAControlHeader head;
     int ret = 0;
+    ssize_t i;
+    size_t done = 0;
 
     CHECK_ERROR_STATE();
 
-    /*
-     * First, we hold on to the last SEND message we
-     * were given and dish out the bytes until we run
-     * out of bytes.
-     */
-    r->len = qemu_rdma_fill(r->rdma, buf, size, 0);
-    if (r->len) {
-        return r->len;
-    }
+    for (i = 0; i < niov; i++) {
+        size_t want = iov[i].iov_len;
+        uint8_t *data = (void *)iov[i].iov_base;
 
-    /*
-     * Once we run out, we block and wait for another
-     * SEND message to arrive.
-     */
-    ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_QEMU_FILE);
+        /*
+         * First, we hold on to the last SEND message we
+         * were given and dish out the bytes until we run
+         * out of bytes.
+         */
+        ret = qemu_rdma_fill(rioc->rdma, data, want, 0);
+        done += ret;
+        want -= ret;
+        /* Got what we needed, so go to next iovec */
+        if (want == 0) {
+            continue;
+        }
 
-    if (ret < 0) {
-        rdma->error_state = ret;
-        return ret;
-    }
+        /* If we got any data so far, then don't wait
+         * for more, just return what we have */
+        if (done > 0) {
+            break;
+        }
 
-    /*
-     * SEND was received with new bytes, now try again.
-     */
-    return qemu_rdma_fill(r->rdma, buf, size, 0);
+
+        /* We've got nothing at all, so let's wait for
+         * more to arrive
+         */
+        ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_QEMU_FILE);
+
+        if (ret < 0) {
+            rdma->error_state = ret;
+            return ret;
+        }
+
+        /*
+         * SEND was received with new bytes, now try again.
+         */
+        ret = qemu_rdma_fill(rioc->rdma, data, want, 0);
+        done += ret;
+        want -= ret;
+
+        /* Still didn't get enough, so let's just return */
+        if (want) {
+            if (done == 0) {
+                return QIO_CHANNEL_ERR_BLOCK;
+            } else {
+                break;
+            }
+        }
+    }
+    rioc->len = done;
+    return rioc->len;
 }
 
 /*
@@ -2646,15 +2695,122 @@ static int qemu_rdma_drain_cq(QEMUFile *f, RDMAContext *rdma)
     return 0;
 }
 
-static int qemu_rdma_close(void *opaque)
+
+static int qio_channel_rdma_set_blocking(QIOChannel *ioc,
+                                         bool blocking,
+                                         Error **errp)
+{
+    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
+    /* XXX we should make readv/writev actually honour this :-) */
+    rioc->blocking = blocking;
+    return 0;
+}
+
+
+typedef struct QIOChannelRDMASource QIOChannelRDMASource;
+struct QIOChannelRDMASource {
+    GSource parent;
+    QIOChannelRDMA *rioc;
+    GIOCondition condition;
+};
+
+static gboolean
+qio_channel_rdma_source_prepare(GSource *source,
+                                gint *timeout)
+{
+    QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
+    RDMAContext *rdma = rsource->rioc->rdma;
+    GIOCondition cond = 0;
+    *timeout = -1;
+
+    if (rdma->wr_data[0].control_len) {
+        cond |= G_IO_IN;
+    }
+    cond |= G_IO_OUT;
+
+    return cond & rsource->condition;
+}
+
+static gboolean
+qio_channel_rdma_source_check(GSource *source)
+{
+    QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
+    RDMAContext *rdma = rsource->rioc->rdma;
+    GIOCondition cond = 0;
+
+    if (rdma->wr_data[0].control_len) {
+        cond |= G_IO_IN;
+    }
+    cond |= G_IO_OUT;
+
+    return cond & rsource->condition;
+}
+
+static gboolean
+qio_channel_rdma_source_dispatch(GSource *source,
+                                 GSourceFunc callback,
+                                 gpointer user_data)
+{
+    QIOChannelFunc func = (QIOChannelFunc)callback;
+    QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
+    RDMAContext *rdma = rsource->rioc->rdma;
+    GIOCondition cond = 0;
+
+    if (rdma->wr_data[0].control_len) {
+        cond |= G_IO_IN;
+    }
+    cond |= G_IO_OUT;
+
+    return (*func)(QIO_CHANNEL(rsource->rioc),
+                   (cond & rsource->condition),
+                   user_data);
+}
+
+static void
+qio_channel_rdma_source_finalize(GSource *source)
+{
+    QIOChannelRDMASource *ssource = (QIOChannelRDMASource *)source;
+
+    object_unref(OBJECT(ssource->rioc));
+}
+
+static GSourceFuncs qio_channel_rdma_source_funcs = { /* file-local table */
+    .prepare  = qio_channel_rdma_source_prepare,
+    .check    = qio_channel_rdma_source_check,
+    .dispatch = qio_channel_rdma_source_dispatch,
+    .finalize = qio_channel_rdma_source_finalize,
+};
+
+static GSource *qio_channel_rdma_create_watch(QIOChannel *ioc,
+                                              GIOCondition condition)
+{
+    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
+    QIOChannelRDMASource *ssource;
+    GSource *source;
+
+    source = g_source_new(&qio_channel_rdma_source_funcs,
+                          sizeof(QIOChannelRDMASource));
+    ssource = (QIOChannelRDMASource *)source;
+
+    ssource->rioc = rioc;
+    object_ref(OBJECT(rioc));
+
+    ssource->condition = condition;
+
+    return source;
+}
+
+
+static int qio_channel_rdma_close(QIOChannel *ioc,
+                                  Error **errp)
 {
+    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
     trace_qemu_rdma_close();
-    QEMUFileRDMA *r = opaque;
-    if (r->rdma) {
-        qemu_rdma_cleanup(r->rdma);
-        g_free(r->rdma);
+    if (rioc->rdma) {
+        qemu_rdma_cleanup(rioc->rdma);
+        g_free(rioc->rdma);
+        rioc->rdma = NULL;
     }
-    g_free(r);
     return 0;
 }
 
@@ -2696,8 +2852,8 @@ static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
                                   ram_addr_t block_offset, ram_addr_t offset,
                                   size_t size, uint64_t *bytes_sent)
 {
-    QEMUFileRDMA *rfile = opaque;
-    RDMAContext *rdma = rfile->rdma;
+    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
+    RDMAContext *rdma = rioc->rdma;
     int ret;
 
     CHECK_ERROR_STATE();
@@ -2951,8 +3107,8 @@ static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque)
                              };
     RDMAControlHeader blocks = { .type = RDMA_CONTROL_RAM_BLOCKS_RESULT,
                                  .repeat = 1 };
-    QEMUFileRDMA *rfile = opaque;
-    RDMAContext *rdma = rfile->rdma;
+    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
+    RDMAContext *rdma = rioc->rdma;
     RDMALocalBlocks *local = &rdma->local_ram_blocks;
     RDMAControlHeader head;
     RDMARegister *reg, *registers;
@@ -3207,9 +3363,10 @@ out:
  * We've already built our local RAMBlock list, but not yet sent the list to
  * the source.
  */
-static int rdma_block_notification_handle(QEMUFileRDMA *rfile, const char *name)
+static int
+rdma_block_notification_handle(QIOChannelRDMA *rioc, const char *name)
 {
-    RDMAContext *rdma = rfile->rdma;
+    RDMAContext *rdma = rioc->rdma;
     int curr;
     int found = -1;
 
@@ -3251,8 +3408,8 @@ static int rdma_load_hook(QEMUFile *f, void *opaque, uint64_t flags, void *data)
 static int qemu_rdma_registration_start(QEMUFile *f, void *opaque,
                                         uint64_t flags, void *data)
 {
-    QEMUFileRDMA *rfile = opaque;
-    RDMAContext *rdma = rfile->rdma;
+    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
+    RDMAContext *rdma = rioc->rdma;
 
     CHECK_ERROR_STATE();
 
@@ -3271,8 +3428,8 @@ static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque,
                                        uint64_t flags, void *data)
 {
     Error *local_err = NULL, **errp = &local_err;
-    QEMUFileRDMA *rfile = opaque;
-    RDMAContext *rdma = rfile->rdma;
+    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
+    RDMAContext *rdma = rioc->rdma;
     RDMAControlHeader head = { .len = 0, .repeat = 1 };
     int ret = 0;
 
@@ -3368,47 +3525,74 @@ err:
     return ret;
 }
 
-static int qemu_rdma_get_fd(void *opaque)
-{
-    QEMUFileRDMA *rfile = opaque;
-    RDMAContext *rdma = rfile->rdma;
-
-    return rdma->comp_channel->fd;
-}
-
-static const QEMUFileOps rdma_read_ops = {
-    .get_buffer    = qemu_rdma_get_buffer,
-    .get_fd        = qemu_rdma_get_fd,
-    .close         = qemu_rdma_close,
+static const QEMUFileHooks rdma_read_hooks = {
     .hook_ram_load = rdma_load_hook,
 };
 
-static const QEMUFileOps rdma_write_ops = {
-    .put_buffer         = qemu_rdma_put_buffer,
-    .close              = qemu_rdma_close,
+static const QEMUFileHooks rdma_write_hooks = {
     .before_ram_iterate = qemu_rdma_registration_start,
     .after_ram_iterate  = qemu_rdma_registration_stop,
     .save_page          = qemu_rdma_save_page,
 };
 
-static void *qemu_fopen_rdma(RDMAContext *rdma, const char *mode)
+
+static void qio_channel_rdma_finalize(Object *obj)
 {
-    QEMUFileRDMA *r;
+    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(obj);
+    if (rioc->rdma) {
+        qemu_rdma_cleanup(rioc->rdma);
+        g_free(rioc->rdma);
+        rioc->rdma = NULL;
+    }
+}
+
+static void qio_channel_rdma_class_init(ObjectClass *klass,
+                                        void *class_data G_GNUC_UNUSED)
+{
+    QIOChannelClass *ioc_klass = QIO_CHANNEL_CLASS(klass);
+
+    ioc_klass->io_writev = qio_channel_rdma_writev;
+    ioc_klass->io_readv = qio_channel_rdma_readv;
+    ioc_klass->io_set_blocking = qio_channel_rdma_set_blocking;
+    ioc_klass->io_close = qio_channel_rdma_close;
+    ioc_klass->io_create_watch = qio_channel_rdma_create_watch;
+}
+
+static const TypeInfo qio_channel_rdma_info = {
+    .parent = TYPE_QIO_CHANNEL,
+    .name = TYPE_QIO_CHANNEL_RDMA,
+    .instance_size = sizeof(QIOChannelRDMA),
+    .instance_finalize = qio_channel_rdma_finalize,
+    .class_init = qio_channel_rdma_class_init,
+};
+
+static void qio_channel_rdma_register_types(void)
+{
+    type_register_static(&qio_channel_rdma_info);
+}
+
+type_init(qio_channel_rdma_register_types);
+
+static QEMUFile *qemu_fopen_rdma(RDMAContext *rdma, const char *mode)
+{
+    QIOChannelRDMA *rioc;
 
     if (qemu_file_mode_is_not_valid(mode)) {
         return NULL;
     }
 
-    r = g_new0(QEMUFileRDMA, 1);
-    r->rdma = rdma;
+    rioc = QIO_CHANNEL_RDMA(object_new(TYPE_QIO_CHANNEL_RDMA));
+    rioc->rdma = rdma;
 
     if (mode[0] == 'w') {
-        r->file = qemu_fopen_ops(r, &rdma_write_ops);
+        rioc->file = qemu_fopen_channel_output(QIO_CHANNEL(rioc));
+        qemu_file_set_hooks(rioc->file, &rdma_write_hooks);
     } else {
-        r->file = qemu_fopen_ops(r, &rdma_read_ops);
+        rioc->file = qemu_fopen_channel_input(QIO_CHANNEL(rioc));
+        qemu_file_set_hooks(rioc->file, &rdma_read_hooks);
     }
 
-    return r->file;
+    return rioc->file;
 }
 
 static void rdma_accept_incoming_migration(void *opaque)
@@ -3481,16 +3665,14 @@ void rdma_start_outgoing_migration(void *opaque,
                             const char *host_port, Error **errp)
 {
     MigrationState *s = opaque;
-    Error *local_err = NULL, **temp = &local_err;
-    RDMAContext *rdma = qemu_rdma_data_init(host_port, &local_err);
+    RDMAContext *rdma = qemu_rdma_data_init(host_port, errp);
     int ret = 0;
 
     if (rdma == NULL) {
-        ERROR(temp, "Failed to initialize RDMA data structures! %d", ret);
         goto err;
     }
 
-    ret = qemu_rdma_source_init(rdma, &local_err,
+    ret = qemu_rdma_source_init(rdma, errp,
         s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL]);
 
     if (ret) {
@@ -3498,7 +3680,7 @@ void rdma_start_outgoing_migration(void *opaque,
     }
 
     trace_rdma_start_outgoing_migration_after_rdma_source_init();
-    ret = qemu_rdma_connect(rdma, &local_err);
+    ret = qemu_rdma_connect(rdma, errp);
 
     if (ret) {
         goto err;
@@ -3510,7 +3692,5 @@ void rdma_start_outgoing_migration(void *opaque,
     migrate_fd_connect(s);
     return;
 err:
-    error_propagate(errp, local_err);
     g_free(rdma);
-    migrate_fd_error(s);
 }
diff --git a/migration/savevm.c b/migration/savevm.c
index 65ce0c61a3..6c21231131 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -51,6 +51,8 @@
 #include "block/snapshot.h"
 #include "block/qapi.h"
 #include "qemu/cutils.h"
+#include "io/channel-buffer.h"
+#include "io/channel-file.h"
 
 #ifndef ETH_P_RARP
 #define ETH_P_RARP 0x8035
@@ -158,13 +160,6 @@ static ssize_t block_writev_buffer(void *opaque, struct iovec *iov, int iovcnt,
     return qiov.size;
 }
 
-static ssize_t block_put_buffer(void *opaque, const uint8_t *buf,
-                                int64_t pos, size_t size)
-{
-    bdrv_save_vmstate(opaque, buf, pos, size);
-    return size;
-}
-
 static ssize_t block_get_buffer(void *opaque, uint8_t *buf, int64_t pos,
                                 size_t size)
 {
@@ -182,7 +177,6 @@ static const QEMUFileOps bdrv_read_ops = {
 };
 
 static const QEMUFileOps bdrv_write_ops = {
-    .put_buffer     = block_put_buffer,
     .writev_buffer  = block_writev_buffer,
     .close          = bdrv_fclose
 };
@@ -760,10 +754,8 @@ void qemu_savevm_send_open_return_path(QEMUFile *f)
  *    0 on success
  *    -ve on error
  */
-int qemu_savevm_send_packaged(QEMUFile *f, const QEMUSizedBuffer *qsb)
+int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len)
 {
-    size_t cur_iov;
-    size_t len = qsb_get_length(qsb);
     uint32_t tmp;
 
     if (len > MAX_VM_CMD_PACKAGED_SIZE) {
@@ -777,18 +769,7 @@ int qemu_savevm_send_packaged(QEMUFile *f, const QEMUSizedBuffer *qsb)
     trace_qemu_savevm_send_packaged();
     qemu_savevm_command_send(f, MIG_CMD_PACKAGED, 4, (uint8_t *)&tmp);
 
-    /* all the data follows (concatinating the iov's) */
-    for (cur_iov = 0; cur_iov < qsb->n_iov; cur_iov++) {
-        /* The iov entries are partially filled */
-        size_t towrite = MIN(qsb->iov[cur_iov].iov_len, len);
-        len -= towrite;
-
-        if (!towrite) {
-            break;
-        }
-
-        qemu_put_buffer(f, qsb->iov[cur_iov].iov_base, towrite);
-    }
+    qemu_put_buffer(f, buf, len);
 
     return 0;
 }
@@ -1578,39 +1559,36 @@ static int loadvm_postcopy_handle_run(MigrationIncomingState *mis)
 static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis)
 {
     int ret;
-    uint8_t *buffer;
-    uint32_t length;
-    QEMUSizedBuffer *qsb;
+    size_t length;
+    QIOChannelBuffer *bioc;
 
     length = qemu_get_be32(mis->from_src_file);
     trace_loadvm_handle_cmd_packaged(length);
 
     if (length > MAX_VM_CMD_PACKAGED_SIZE) {
-        error_report("Unreasonably large packaged state: %u", length);
+        error_report("Unreasonably large packaged state: %zu", length);
         return -1;
     }
-    buffer = g_malloc0(length);
-    ret = qemu_get_buffer(mis->from_src_file, buffer, (int)length);
+
+    bioc = qio_channel_buffer_new(length);
+    ret = qemu_get_buffer(mis->from_src_file,
+                          bioc->data,
+                          length);
     if (ret != length) {
-        g_free(buffer);
-        error_report("CMD_PACKAGED: Buffer receive fail ret=%d length=%d",
+        object_unref(OBJECT(bioc));
+        error_report("CMD_PACKAGED: Buffer receive fail ret=%d length=%zu",
                      ret, length);
         return (ret < 0) ? ret : -EAGAIN;
     }
+    bioc->usage += length;
     trace_loadvm_handle_cmd_packaged_received(ret);
 
-    /* Setup a dummy QEMUFile that actually reads from the buffer */
-    qsb = qsb_create(buffer, length);
-    g_free(buffer); /* Because qsb_create copies */
-    if (!qsb) {
-        error_report("Unable to create qsb");
-    }
-    QEMUFile *packf = qemu_bufopen("r", qsb);
+    QEMUFile *packf = qemu_fopen_channel_input(QIO_CHANNEL(bioc));
 
     ret = qemu_loadvm_state_main(packf, mis);
     trace_loadvm_handle_cmd_packaged_main(ret);
     qemu_fclose(packf);
-    qsb_free(qsb);
+    object_unref(OBJECT(bioc));
 
     return ret;
 }
@@ -2061,6 +2039,7 @@ void hmp_savevm(Monitor *mon, const QDict *qdict)
 void qmp_xen_save_devices_state(const char *filename, Error **errp)
 {
     QEMUFile *f;
+    QIOChannelFile *ioc;
     int saved_vm_running;
     int ret;
 
@@ -2068,11 +2047,11 @@ void qmp_xen_save_devices_state(const char *filename, Error **errp)
     vm_stop(RUN_STATE_SAVE_VM);
     global_state_store_running();
 
-    f = qemu_fopen(filename, "wb");
-    if (!f) {
-        error_setg_file_open(errp, errno, filename);
+    ioc = qio_channel_file_new_path(filename, O_WRONLY | O_CREAT, 0660, errp);
+    if (!ioc) {
         goto the_end;
     }
+    f = qemu_fopen_channel_output(QIO_CHANNEL(ioc));
     ret = qemu_save_device_state(f);
     qemu_fclose(f);
     if (ret < 0) {
diff --git a/migration/socket.c b/migration/socket.c
new file mode 100644
index 0000000000..977a8d3c1d
--- /dev/null
+++ b/migration/socket.c
@@ -0,0 +1,183 @@
+/*
+ * QEMU live migration via Unix Domain Sockets
+ *
+ * Copyright Red Hat, Inc. 2009-2016
+ *
+ * Authors:
+ *  Chris Lalancette <clalance@redhat.com>
+ *  Daniel P. Berrange <berrange@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+
+#include "qemu-common.h"
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+#include "migration/migration.h"
+#include "migration/qemu-file.h"
+#include "io/channel-socket.h"
+#include "trace.h"
+
+
+static SocketAddress *tcp_build_address(const char *host_port, Error **errp)
+{
+    InetSocketAddress *iaddr = inet_parse(host_port, errp);
+    SocketAddress *saddr;
+
+    if (!iaddr) {
+        return NULL;
+    }
+
+    saddr = g_new0(SocketAddress, 1);
+    saddr->type = SOCKET_ADDRESS_KIND_INET;
+    saddr->u.inet.data = iaddr;
+
+    return saddr;
+}
+
+
+static SocketAddress *unix_build_address(const char *path)
+{
+    SocketAddress *saddr;
+
+    saddr = g_new0(SocketAddress, 1);
+    saddr->type = SOCKET_ADDRESS_KIND_UNIX;
+    saddr->u.q_unix.data = g_new0(UnixSocketAddress, 1);
+    saddr->u.q_unix.data->path = g_strdup(path);
+
+    return saddr;
+}
+
+
+struct SocketConnectData {
+    MigrationState *s;
+    char *hostname;
+};
+
+static void socket_connect_data_free(void *opaque)
+{
+    struct SocketConnectData *data = opaque;
+    if (!data) {
+        return;
+    }
+    g_free(data->hostname);
+    g_free(data);
+}
+
+static void socket_outgoing_migration(Object *src,
+                                      Error *err,
+                                      gpointer opaque)
+{
+    struct SocketConnectData *data = opaque;
+    QIOChannel *sioc = QIO_CHANNEL(src);
+
+    if (err) {
+        trace_migration_socket_outgoing_error(error_get_pretty(err));
+        data->s->to_dst_file = NULL;
+        migrate_fd_error(data->s, err);
+    } else {
+        trace_migration_socket_outgoing_connected(data->hostname);
+        migration_set_outgoing_channel(data->s, sioc, data->hostname);
+    }
+    object_unref(src);
+}
+
+/* Async connect to @saddr, which is freed here and may be NULL. */
+static void socket_start_outgoing_migration(MigrationState *s,
+                                            SocketAddress *saddr, Error **errp)
+{
+    struct SocketConnectData *data;
+    if (!saddr) {
+        return; /* address could not be built; avoid a NULL dereference */
+    }
+    data = g_new0(struct SocketConnectData, 1);
+    data->s = s;
+    data->hostname = saddr->type == SOCKET_ADDRESS_KIND_INET ?
+        g_strdup(saddr->u.inet.data->host) : NULL;
+    qio_channel_socket_connect_async(qio_channel_socket_new(), saddr,
+                                     socket_outgoing_migration, data,
+                                     socket_connect_data_free);
+    qapi_free_SocketAddress(saddr);
+}
+
+void tcp_start_outgoing_migration(MigrationState *s,
+                                  const char *host_port,
+                                  Error **errp)
+{
+    SocketAddress *saddr = tcp_build_address(host_port, errp);
+    socket_start_outgoing_migration(s, saddr, errp);
+}
+
+void unix_start_outgoing_migration(MigrationState *s,
+                                   const char *path,
+                                   Error **errp)
+{
+    SocketAddress *saddr = unix_build_address(path);
+    socket_start_outgoing_migration(s, saddr, errp);
+}
+
+
+/* Watch callback: accept one migration connection, then stop listening. */
+static gboolean socket_accept_incoming_migration(QIOChannel *ioc,
+                                                 GIOCondition condition,
+                                                 gpointer opaque)
+{
+    QIOChannelSocket *sioc;
+    Error *err = NULL;
+
+    sioc = qio_channel_socket_accept(QIO_CHANNEL_SOCKET(ioc), &err);
+    if (!sioc) {
+        error_report("could not accept migration connection (%s)",
+                     error_get_pretty(err));
+        error_free(err); /* error_report() does not release err */
+        goto out;
+    }
+
+    trace_migration_socket_incoming_accepted();
+    migration_set_incoming_channel(migrate_get_current(),
+                                   QIO_CHANNEL(sioc));
+    object_unref(OBJECT(sioc));
+
+out:
+    /* Close listening socket as it's no longer needed */
+    qio_channel_close(ioc, NULL);
+    return FALSE; /* unregister */
+}
+
+
+/* Listen on @saddr, which is freed here and may be NULL. */
+static void socket_start_incoming_migration(SocketAddress *saddr, Error **errp)
+{
+    QIOChannelSocket *listen_ioc;
+
+    if (!saddr) {
+        return; /* address could not be built; avoid a NULL dereference */
+    }
+    listen_ioc = qio_channel_socket_new();
+    if (qio_channel_socket_listen_sync(listen_ioc, saddr, errp) < 0) {
+        object_unref(OBJECT(listen_ioc));
+    } else {
+        qio_channel_add_watch(QIO_CHANNEL(listen_ioc), G_IO_IN,
+                              socket_accept_incoming_migration,
+                              listen_ioc, (GDestroyNotify)object_unref);
+    }
+    qapi_free_SocketAddress(saddr);
+}
+
+void tcp_start_incoming_migration(const char *host_port, Error **errp)
+{
+    SocketAddress *saddr = tcp_build_address(host_port, errp);
+    socket_start_incoming_migration(saddr, errp);
+}
+
+void unix_start_incoming_migration(const char *path, Error **errp)
+{
+    SocketAddress *saddr = unix_build_address(path);
+    socket_start_incoming_migration(saddr, errp);
+}
diff --git a/migration/tcp.c b/migration/tcp.c
deleted file mode 100644
index e1fa7f8f18..0000000000
--- a/migration/tcp.c
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * QEMU live migration
- *
- * Copyright IBM, Corp. 2008
- *
- * Authors:
- *  Anthony Liguori   <aliguori@us.ibm.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- *
- * Contributions after 2012-01-13 are licensed under the terms of the
- * GNU GPL, version 2 or (at your option) any later version.
- */
-
-#include "qemu/osdep.h"
-
-#include "qemu-common.h"
-#include "qemu/error-report.h"
-#include "qemu/sockets.h"
-#include "migration/migration.h"
-#include "migration/qemu-file.h"
-#include "block/block.h"
-#include "qemu/main-loop.h"
-
-//#define DEBUG_MIGRATION_TCP
-
-#ifdef DEBUG_MIGRATION_TCP
-#define DPRINTF(fmt, ...) \
-    do { printf("migration-tcp: " fmt, ## __VA_ARGS__); } while (0)
-#else
-#define DPRINTF(fmt, ...) \
-    do { } while (0)
-#endif
-
-static void tcp_wait_for_connect(int fd, Error *err, void *opaque)
-{
-    MigrationState *s = opaque;
-
-    if (fd < 0) {
-        DPRINTF("migrate connect error: %s\n", error_get_pretty(err));
-        s->to_dst_file = NULL;
-        migrate_fd_error(s);
-    } else {
-        DPRINTF("migrate connect success\n");
-        s->to_dst_file = qemu_fopen_socket(fd, "wb");
-        migrate_fd_connect(s);
-    }
-}
-
-void tcp_start_outgoing_migration(MigrationState *s, const char *host_port, Error **errp)
-{
-    inet_nonblocking_connect(host_port, tcp_wait_for_connect, s, errp);
-}
-
-static void tcp_accept_incoming_migration(void *opaque)
-{
-    struct sockaddr_in addr;
-    socklen_t addrlen = sizeof(addr);
-    int s = (intptr_t)opaque;
-    QEMUFile *f;
-    int c;
-
-    do {
-        c = qemu_accept(s, (struct sockaddr *)&addr, &addrlen);
-    } while (c < 0 && errno == EINTR);
-    qemu_set_fd_handler(s, NULL, NULL, NULL);
-    closesocket(s);
-
-    DPRINTF("accepted migration\n");
-
-    if (c < 0) {
-        error_report("could not accept migration connection (%s)",
-                     strerror(errno));
-        return;
-    }
-
-    f = qemu_fopen_socket(c, "rb");
-    if (f == NULL) {
-        error_report("could not qemu_fopen socket");
-        goto out;
-    }
-
-    process_incoming_migration(f);
-    return;
-
-out:
-    closesocket(c);
-}
-
-void tcp_start_incoming_migration(const char *host_port, Error **errp)
-{
-    int s;
-
-    s = inet_listen(host_port, NULL, 256, SOCK_STREAM, 0, errp);
-    if (s < 0) {
-        return;
-    }
-
-    qemu_set_fd_handler(s, tcp_accept_incoming_migration, NULL,
-                        (void *)(intptr_t)s);
-}
diff --git a/migration/tls.c b/migration/tls.c
new file mode 100644
index 0000000000..75f959ff9c
--- /dev/null
+++ b/migration/tls.c
@@ -0,0 +1,161 @@
+/*
+ * QEMU migration TLS support
+ *
+ * Copyright (c) 2015 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "migration/migration.h"
+#include "io/channel-tls.h"
+#include "crypto/tlscreds.h"
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+#include "trace.h"
+
+static QCryptoTLSCreds *
+migration_tls_get_creds(MigrationState *s,
+                        QCryptoTLSCredsEndpoint endpoint,
+                        Error **errp)
+{
+    Object *creds;
+    QCryptoTLSCreds *ret;
+
+    creds = object_resolve_path_component(
+        object_get_objects_root(), s->parameters.tls_creds);
+    if (!creds) {
+        error_setg(errp, "No TLS credentials with id '%s'",
+                   s->parameters.tls_creds);
+        return NULL;
+    }
+    ret = (QCryptoTLSCreds *)object_dynamic_cast(
+        creds, TYPE_QCRYPTO_TLS_CREDS);
+    if (!ret) {
+        error_setg(errp, "Object with id '%s' is not TLS credentials",
+                   s->parameters.tls_creds);
+        return NULL;
+    }
+    if (ret->endpoint != endpoint) {
+        error_setg(errp,
+                   "Expected TLS credentials for a %s endpoint",
+                   endpoint == QCRYPTO_TLS_CREDS_ENDPOINT_CLIENT ?
+                   "client" : "server");
+        return NULL;
+    }
+
+    object_ref(OBJECT(ret));
+    return ret;
+}
+
+
+static void migration_tls_incoming_handshake(Object *src,
+                                             Error *err,
+                                             gpointer opaque)
+{
+    QIOChannel *ioc = QIO_CHANNEL(src);
+
+    if (err) {
+        trace_migration_tls_incoming_handshake_error(error_get_pretty(err));
+        error_report("%s", error_get_pretty(err));
+    } else {
+        trace_migration_tls_incoming_handshake_complete();
+        migration_set_incoming_channel(migrate_get_current(), ioc);
+    }
+    object_unref(OBJECT(ioc));
+}
+
+void migration_tls_set_incoming_channel(MigrationState *s,
+                                        QIOChannel *ioc,
+                                        Error **errp)
+{
+    QCryptoTLSCreds *creds;
+    QIOChannelTLS *tioc;
+
+    creds = migration_tls_get_creds(
+        s, QCRYPTO_TLS_CREDS_ENDPOINT_SERVER, errp);
+    if (!creds) {
+        return;
+    }
+
+    tioc = qio_channel_tls_new_server(
+        ioc, creds,
+        NULL, /* XXX pass ACL name */
+        errp);
+    if (!tioc) {
+        return;
+    }
+
+    trace_migration_tls_incoming_handshake_start();
+    qio_channel_tls_handshake(tioc,
+                              migration_tls_incoming_handshake,
+                              NULL,
+                              NULL);
+}
+
+
+static void migration_tls_outgoing_handshake(Object *src,
+                                             Error *err,
+                                             gpointer opaque)
+{
+    MigrationState *s = opaque;
+    QIOChannel *ioc = QIO_CHANNEL(src);
+
+    if (err) {
+        trace_migration_tls_outgoing_handshake_error(error_get_pretty(err));
+        s->to_dst_file = NULL;
+        migrate_fd_error(s, err);
+    } else {
+        trace_migration_tls_outgoing_handshake_complete();
+        migration_set_outgoing_channel(s, ioc, NULL);
+    }
+    object_unref(OBJECT(ioc));
+}
+
+
+void migration_tls_set_outgoing_channel(MigrationState *s,
+                                        QIOChannel *ioc,
+                                        const char *hostname,
+                                        Error **errp)
+{
+    QCryptoTLSCreds *creds;
+    QIOChannelTLS *tioc;
+
+    creds = migration_tls_get_creds(
+        s, QCRYPTO_TLS_CREDS_ENDPOINT_CLIENT, errp);
+    if (!creds) {
+        return;
+    }
+
+    if (s->parameters.tls_hostname) {
+        hostname = s->parameters.tls_hostname;
+    }
+    if (!hostname) {
+        error_setg(errp, "No hostname available for TLS");
+        return;
+    }
+
+    tioc = qio_channel_tls_new_client(
+        ioc, creds, hostname, errp);
+    if (!tioc) {
+        return;
+    }
+
+    trace_migration_tls_outgoing_handshake_start(hostname);
+    qio_channel_tls_handshake(tioc,
+                              migration_tls_outgoing_handshake,
+                              s,
+                              NULL);
+}
diff --git a/migration/unix.c b/migration/unix.c
deleted file mode 100644
index d9aac36b9a..0000000000
--- a/migration/unix.c
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * QEMU live migration via Unix Domain Sockets
- *
- * Copyright Red Hat, Inc. 2009
- *
- * Authors:
- *  Chris Lalancette <clalance@redhat.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- *
- * Contributions after 2012-01-13 are licensed under the terms of the
- * GNU GPL, version 2 or (at your option) any later version.
- */
-
-#include "qemu/osdep.h"
-
-#include "qemu-common.h"
-#include "qemu/error-report.h"
-#include "qemu/sockets.h"
-#include "qemu/main-loop.h"
-#include "migration/migration.h"
-#include "migration/qemu-file.h"
-#include "block/block.h"
-
-//#define DEBUG_MIGRATION_UNIX
-
-#ifdef DEBUG_MIGRATION_UNIX
-#define DPRINTF(fmt, ...) \
-    do { printf("migration-unix: " fmt, ## __VA_ARGS__); } while (0)
-#else
-#define DPRINTF(fmt, ...) \
-    do { } while (0)
-#endif
-
-static void unix_wait_for_connect(int fd, Error *err, void *opaque)
-{
-    MigrationState *s = opaque;
-
-    if (fd < 0) {
-        DPRINTF("migrate connect error: %s\n", error_get_pretty(err));
-        s->to_dst_file = NULL;
-        migrate_fd_error(s);
-    } else {
-        DPRINTF("migrate connect success\n");
-        s->to_dst_file = qemu_fopen_socket(fd, "wb");
-        migrate_fd_connect(s);
-    }
-}
-
-void unix_start_outgoing_migration(MigrationState *s, const char *path, Error **errp)
-{
-    unix_nonblocking_connect(path, unix_wait_for_connect, s, errp);
-}
-
-static void unix_accept_incoming_migration(void *opaque)
-{
-    struct sockaddr_un addr;
-    socklen_t addrlen = sizeof(addr);
-    int s = (intptr_t)opaque;
-    QEMUFile *f;
-    int c, err;
-
-    do {
-        c = qemu_accept(s, (struct sockaddr *)&addr, &addrlen);
-        err = errno;
-    } while (c < 0 && err == EINTR);
-    qemu_set_fd_handler(s, NULL, NULL, NULL);
-    close(s);
-
-    DPRINTF("accepted migration\n");
-
-    if (c < 0) {
-        error_report("could not accept migration connection (%s)",
-                     strerror(err));
-        return;
-    }
-
-    f = qemu_fopen_socket(c, "rb");
-    if (f == NULL) {
-        error_report("could not qemu_fopen socket");
-        goto out;
-    }
-
-    process_incoming_migration(f);
-    return;
-
-out:
-    close(c);
-}
-
-void unix_start_incoming_migration(const char *path, Error **errp)
-{
-    int s;
-
-    s = unix_listen(path, NULL, 0, errp);
-    if (s < 0) {
-        return;
-    }
-
-    qemu_set_fd_handler(s, unix_accept_incoming_migration, NULL,
-                        (void *)(intptr_t)s);
-}
diff --git a/monitor.c b/monitor.c
index 6a32b9bf59..404d594bb3 100644
--- a/monitor.c
+++ b/monitor.c
@@ -3432,12 +3432,12 @@ static void vm_completion(ReadLineState *rs, const char *str)
 {
     size_t len;
     BlockDriverState *bs;
-    BdrvNextIterator *it = NULL;
+    BdrvNextIterator it;
 
     len = strlen(str);
     readline_set_completion_index(rs, len);
 
-    while ((it = bdrv_next(it, &bs))) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
         SnapshotInfoList *snapshots, *snapshot;
         AioContext *ctx = bdrv_get_aio_context(bs);
         bool ok = false;
diff --git a/nbd/server.c b/nbd/server.c
index fa862cd622..b2cfeb9843 100644
--- a/nbd/server.c
+++ b/nbd/server.c
@@ -1153,12 +1153,20 @@ static void nbd_trip(void *opaque)
         break;
     case NBD_CMD_TRIM:
         TRACE("Request type is TRIM");
-        ret = blk_co_discard(exp->blk, (request.from + exp->dev_offset)
-                                       / BDRV_SECTOR_SIZE,
-                             request.len / BDRV_SECTOR_SIZE);
-        if (ret < 0) {
-            LOG("discard failed");
-            reply.error = -ret;
+        /* Ignore unaligned head or tail, until block layer adds byte
+         * interface */
+        if (request.len >= BDRV_SECTOR_SIZE) {
+            request.len -= (request.from + request.len) % BDRV_SECTOR_SIZE;
+            ret = blk_co_discard(exp->blk,
+                                 DIV_ROUND_UP(request.from + exp->dev_offset,
+                                              BDRV_SECTOR_SIZE),
+                                 request.len / BDRV_SECTOR_SIZE);
+            if (ret < 0) {
+                LOG("discard failed");
+                reply.error = -ret;
+            }
+        } else {
+            TRACE("trim request too small, ignoring");
         }
         if (nbd_co_send_reply(req, &reply, 0) < 0) {
             goto out;
diff --git a/qapi-schema.json b/qapi-schema.json
index 9a322d1836..8483bdfcce 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -484,6 +484,10 @@
 #        throttled during auto-converge. This is only present when auto-converge
 #        has started throttling guest cpus. (Since 2.7)
 #
+# @error-desc: #optional the human readable error description string, when
+#              @status is 'failed'. Clients should not attempt to parse the
+#              error strings. (Since 2.7)
+#
 # Since: 0.14.0
 ##
 { 'struct': 'MigrationInfo',
@@ -494,7 +498,8 @@
            '*expected-downtime': 'int',
            '*downtime': 'int',
            '*setup-time': 'int',
-           '*cpu-throttle-percentage': 'int'} }
+           '*cpu-throttle-percentage': 'int',
+           '*error-desc': 'str'} }
 
 ##
 # @query-migrate
@@ -612,11 +617,28 @@
 # @cpu-throttle-increment: throttle percentage increase each time
 #                          auto-converge detects that migration is not making
 #                          progress. The default value is 10. (Since 2.7)
+#
+# @tls-creds: ID of the 'tls-creds' object that provides credentials for
+#             establishing a TLS connection over the migration data channel.
+#             On the outgoing side of the migration, the credentials must
+#             be for a 'client' endpoint, while for the incoming side the
+#             credentials must be for a 'server' endpoint. Setting this
+#             will enable TLS for all migrations. The default is unset,
+#             resulting in unsecured migration at the QEMU level. (Since 2.7)
+#
+# @tls-hostname: hostname of the target host for the migration. This is
+#                required when using x509 based TLS credentials and the
+#                migration URI does not already include a hostname. For
+#                example if using fd: or exec: based migration, the
+#                hostname must be provided so that the server's x509
+#                certificate identity can be validated. (Since 2.7)
+#
 # Since: 2.4
 ##
 { 'enum': 'MigrationParameter',
   'data': ['compress-level', 'compress-threads', 'decompress-threads',
-           'cpu-throttle-initial', 'cpu-throttle-increment'] }
+           'cpu-throttle-initial', 'cpu-throttle-increment',
+           'tls-creds', 'tls-hostname'] }
 
 #
 # @migrate-set-parameters
@@ -636,6 +658,22 @@
 # @cpu-throttle-increment: throttle percentage increase each time
 #                          auto-converge detects that migration is not making
 #                          progress. The default value is 10. (Since 2.7)
+#
+# @tls-creds: ID of the 'tls-creds' object that provides credentials for
+#             establishing a TLS connection over the migration data channel.
+#             On the outgoing side of the migration, the credentials must
+#             be for a 'client' endpoint, while for the incoming side the
+#             credentials must be for a 'server' endpoint. Setting this
+#             will enable TLS for all migrations. The default is unset,
+#             resulting in unsecured migration at the QEMU level. (Since 2.7)
+#
+# @tls-hostname: hostname of the target host for the migration. This is
+#                required when using x509 based TLS credentials and the
+#                migration URI does not already include a hostname. For
+#                example if using fd: or exec: based migration, the
+#                hostname must be provided so that the server's x509
+#                certificate identity can be validated. (Since 2.7)
+#
 # Since: 2.4
 ##
 { 'command': 'migrate-set-parameters',
@@ -643,7 +681,9 @@
             '*compress-threads': 'int',
             '*decompress-threads': 'int',
             '*cpu-throttle-initial': 'int',
-            '*cpu-throttle-increment': 'int'} }
+            '*cpu-throttle-increment': 'int',
+            '*tls-creds': 'str',
+            '*tls-hostname': 'str'} }
 
 #
 # @MigrationParameters
@@ -662,6 +702,21 @@
 #                          auto-converge detects that migration is not making
 #                          progress. The default value is 10. (Since 2.7)
 #
+# @tls-creds: ID of the 'tls-creds' object that provides credentials for
+#             establishing a TLS connection over the migration data channel.
+#             On the outgoing side of the migration, the credentials must
+#             be for a 'client' endpoint, while for the incoming side the
+#             credentials must be for a 'server' endpoint. Setting this
+#             will enable TLS for all migrations. The default is unset,
+#             resulting in unsecured migration at the QEMU level. (Since 2.7)
+#
+# @tls-hostname: hostname of the target host for the migration. This is
+#                required when using x509 based TLS credentials and the
+#                migration URI does not already include a hostname. For
+#                example if using fd: or exec: based migration, the
+#                hostname must be provided so that the server's x509
+#                certificate identity can be validated. (Since 2.7)
+#
 # Since: 2.4
 ##
 { 'struct': 'MigrationParameters',
@@ -669,7 +724,9 @@
             'compress-threads': 'int',
             'decompress-threads': 'int',
             'cpu-throttle-initial': 'int',
-            'cpu-throttle-increment': 'int'} }
+            'cpu-throttle-increment': 'int',
+            'tls-creds': 'str',
+            'tls-hostname': 'str'} }
 ##
 # @query-migrate-parameters
 #
diff --git a/qemu-img.c b/qemu-img.c
index 7ed8ef21cb..4b56ad36aa 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -775,7 +775,7 @@ static void common_block_job_cb(void *opaque, int ret)
 
 static void run_block_job(BlockJob *job, Error **errp)
 {
-    AioContext *aio_context = bdrv_get_aio_context(job->bs);
+    AioContext *aio_context = blk_get_aio_context(job->blk);
 
     do {
         aio_poll(aio_context, true);
@@ -1606,8 +1606,8 @@ static int convert_write(ImgConvertState *s, int64_t sector_num, int nb_sectors,
             if (s->has_zero_init) {
                 break;
             }
-            ret = blk_write_zeroes(s->target, sector_num << BDRV_SECTOR_BITS,
-                                   n << BDRV_SECTOR_BITS, 0);
+            ret = blk_pwrite_zeroes(s->target, sector_num << BDRV_SECTOR_BITS,
+                                    n << BDRV_SECTOR_BITS, 0);
             if (ret < 0) {
                 return ret;
             }
diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index e766791ffc..09e879f872 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -451,12 +451,12 @@ typedef struct {
     bool done;
 } CoWriteZeroes;
 
-static void coroutine_fn co_write_zeroes_entry(void *opaque)
+static void coroutine_fn co_pwrite_zeroes_entry(void *opaque)
 {
     CoWriteZeroes *data = opaque;
 
-    data->ret = blk_co_write_zeroes(data->blk, data->offset, data->count,
-                                    data->flags);
+    data->ret = blk_co_pwrite_zeroes(data->blk, data->offset, data->count,
+                                     data->flags);
     data->done = true;
     if (data->ret < 0) {
         *data->total = data->ret;
@@ -466,8 +466,8 @@ static void coroutine_fn co_write_zeroes_entry(void *opaque)
     *data->total = data->count;
 }
 
-static int do_co_write_zeroes(BlockBackend *blk, int64_t offset, int64_t count,
-                              int flags, int64_t *total)
+static int do_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
+                               int64_t count, int flags, int64_t *total)
 {
     Coroutine *co;
     CoWriteZeroes data = {
@@ -483,7 +483,7 @@ static int do_co_write_zeroes(BlockBackend *blk, int64_t offset, int64_t count,
         return -ERANGE;
     }
 
-    co = qemu_coroutine_create(co_write_zeroes_entry);
+    co = qemu_coroutine_create(co_pwrite_zeroes_entry);
     qemu_coroutine_enter(co, &data);
     while (!data.done) {
         aio_poll(blk_get_aio_context(blk), true);
@@ -901,7 +901,7 @@ static void write_help(void)
 " -C, -- report statistics in a machine parsable format\n"
 " -q, -- quiet mode, do not show I/O statistics\n"
 " -u, -- with -z, allow unmapping\n"
-" -z, -- write zeroes using blk_co_write_zeroes\n"
+" -z, -- write zeroes using blk_co_pwrite_zeroes\n"
 "\n");
 }
 
@@ -1033,7 +1033,7 @@ static int write_f(BlockBackend *blk, int argc, char **argv)
     if (bflag) {
         cnt = do_save_vmstate(blk, buf, offset, count, &total);
     } else if (zflag) {
-        cnt = do_co_write_zeroes(blk, offset, count, flags, &total);
+        cnt = do_co_pwrite_zeroes(blk, offset, count, flags, &total);
     } else if (cflag) {
         cnt = do_write_compressed(blk, buf, offset, count, &total);
     } else {
@@ -1376,7 +1376,7 @@ static void aio_write_help(void)
 " -i, -- treat request as invalid, for exercising stats\n"
 " -q, -- quiet mode, do not show I/O statistics\n"
 " -u, -- with -z, allow unmapping\n"
-" -z, -- write zeroes using blk_aio_write_zeroes\n"
+" -z, -- write zeroes using blk_aio_pwrite_zeroes\n"
 "\n");
 }
 
@@ -1475,8 +1475,8 @@ static int aio_write_f(BlockBackend *blk, int argc, char **argv)
         }
 
         ctx->qiov.size = count;
-        blk_aio_write_zeroes(blk, ctx->offset, count, flags, aio_write_done,
-                             ctx);
+        blk_aio_pwrite_zeroes(blk, ctx->offset, count, flags, aio_write_done,
+                              ctx);
     } else {
         nr_iov = argc - optind;
         ctx->buf = create_iovec(blk, &ctx->qiov, &argv[optind], nr_iov,
diff --git a/qmp.c b/qmp.c
index 8f8ae3a79d..3165f8726b 100644
--- a/qmp.c
+++ b/qmp.c
@@ -181,7 +181,7 @@ void qmp_cont(Error **errp)
     Error *local_err = NULL;
     BlockBackend *blk;
     BlockDriverState *bs;
-    BdrvNextIterator *it;
+    BdrvNextIterator it;
 
     /* if there is a dump in background, we should wait until the dump
      * finished */
@@ -201,8 +201,7 @@ void qmp_cont(Error **errp)
         blk_iostatus_reset(blk);
     }
 
-    it = NULL;
-    while ((it = bdrv_next(it, &bs))) {
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
         bdrv_add_key(bs, NULL, &local_err);
         if (local_err) {
             error_propagate(errp, local_err);
diff --git a/rules.mak b/rules.mak
index d1ff311254..4a8f464940 100644
--- a/rules.mak
+++ b/rules.mak
@@ -1,4 +1,6 @@
 
+COMMA := ,
+
 # Don't use implicit rules or variables
 # we have explicit rules for everything
 MAKEFLAGS += -rR
diff --git a/scripts/dump-guest-memory.py b/scripts/dump-guest-memory.py
index eb24f7874b..9956fc036c 100644
--- a/scripts/dump-guest-memory.py
+++ b/scripts/dump-guest-memory.py
@@ -328,23 +328,10 @@ def qlist_foreach(head, field_str):
         yield var
 
 
-def qemu_get_ram_block(ram_addr):
-    """Returns the RAMBlock struct to which the given address belongs."""
-
-    ram_blocks = gdb.parse_and_eval("ram_list.blocks")
-
-    for block in qlist_foreach(ram_blocks, "next"):
-        if (ram_addr - block["offset"]) < block["used_length"]:
-            return block
-
-    raise gdb.GdbError("Bad ram offset %x" % ram_addr)
-
-
-def qemu_get_ram_ptr(ram_addr):
+def qemu_map_ram_ptr(block, offset):
     """Returns qemu vaddr for given guest physical address."""
 
-    block = qemu_get_ram_block(ram_addr)
-    return block["host"] + (ram_addr - block["offset"])
+    return block["host"] + offset
 
 
 def memory_region_get_ram_ptr(memory_region):
@@ -352,7 +339,7 @@ def memory_region_get_ram_ptr(memory_region):
         return (memory_region_get_ram_ptr(memory_region["alias"].dereference())
                 + memory_region["alias_offset"])
 
-    return qemu_get_ram_ptr(memory_region["ram_block"]["offset"])
+    return qemu_map_ram_ptr(memory_region["ram_block"], 0)
 
 
 def get_guest_phys_blocks():
diff --git a/scripts/kvm/kvm_stat b/scripts/kvm/kvm_stat
deleted file mode 100755
index 769d884b6d..0000000000
--- a/scripts/kvm/kvm_stat
+++ /dev/null
@@ -1,825 +0,0 @@
-#!/usr/bin/python
-#
-# top-like utility for displaying kvm statistics
-#
-# Copyright 2006-2008 Qumranet Technologies
-# Copyright 2008-2011 Red Hat, Inc.
-#
-# Authors:
-#  Avi Kivity <avi@redhat.com>
-#
-# This work is licensed under the terms of the GNU GPL, version 2.  See
-# the COPYING file in the top-level directory.
-
-import curses
-import sys
-import os
-import time
-import optparse
-import ctypes
-import fcntl
-import resource
-import struct
-import re
-from collections import defaultdict
-from time import sleep
-
-VMX_EXIT_REASONS = {
-    'EXCEPTION_NMI':        0,
-    'EXTERNAL_INTERRUPT':   1,
-    'TRIPLE_FAULT':         2,
-    'PENDING_INTERRUPT':    7,
-    'NMI_WINDOW':           8,
-    'TASK_SWITCH':          9,
-    'CPUID':                10,
-    'HLT':                  12,
-    'INVLPG':               14,
-    'RDPMC':                15,
-    'RDTSC':                16,
-    'VMCALL':               18,
-    'VMCLEAR':              19,
-    'VMLAUNCH':             20,
-    'VMPTRLD':              21,
-    'VMPTRST':              22,
-    'VMREAD':               23,
-    'VMRESUME':             24,
-    'VMWRITE':              25,
-    'VMOFF':                26,
-    'VMON':                 27,
-    'CR_ACCESS':            28,
-    'DR_ACCESS':            29,
-    'IO_INSTRUCTION':       30,
-    'MSR_READ':             31,
-    'MSR_WRITE':            32,
-    'INVALID_STATE':        33,
-    'MWAIT_INSTRUCTION':    36,
-    'MONITOR_INSTRUCTION':  39,
-    'PAUSE_INSTRUCTION':    40,
-    'MCE_DURING_VMENTRY':   41,
-    'TPR_BELOW_THRESHOLD':  43,
-    'APIC_ACCESS':          44,
-    'EPT_VIOLATION':        48,
-    'EPT_MISCONFIG':        49,
-    'WBINVD':               54,
-    'XSETBV':               55,
-    'APIC_WRITE':           56,
-    'INVPCID':              58,
-}
-
-SVM_EXIT_REASONS = {
-    'READ_CR0':       0x000,
-    'READ_CR3':       0x003,
-    'READ_CR4':       0x004,
-    'READ_CR8':       0x008,
-    'WRITE_CR0':      0x010,
-    'WRITE_CR3':      0x013,
-    'WRITE_CR4':      0x014,
-    'WRITE_CR8':      0x018,
-    'READ_DR0':       0x020,
-    'READ_DR1':       0x021,
-    'READ_DR2':       0x022,
-    'READ_DR3':       0x023,
-    'READ_DR4':       0x024,
-    'READ_DR5':       0x025,
-    'READ_DR6':       0x026,
-    'READ_DR7':       0x027,
-    'WRITE_DR0':      0x030,
-    'WRITE_DR1':      0x031,
-    'WRITE_DR2':      0x032,
-    'WRITE_DR3':      0x033,
-    'WRITE_DR4':      0x034,
-    'WRITE_DR5':      0x035,
-    'WRITE_DR6':      0x036,
-    'WRITE_DR7':      0x037,
-    'EXCP_BASE':      0x040,
-    'INTR':           0x060,
-    'NMI':            0x061,
-    'SMI':            0x062,
-    'INIT':           0x063,
-    'VINTR':          0x064,
-    'CR0_SEL_WRITE':  0x065,
-    'IDTR_READ':      0x066,
-    'GDTR_READ':      0x067,
-    'LDTR_READ':      0x068,
-    'TR_READ':        0x069,
-    'IDTR_WRITE':     0x06a,
-    'GDTR_WRITE':     0x06b,
-    'LDTR_WRITE':     0x06c,
-    'TR_WRITE':       0x06d,
-    'RDTSC':          0x06e,
-    'RDPMC':          0x06f,
-    'PUSHF':          0x070,
-    'POPF':           0x071,
-    'CPUID':          0x072,
-    'RSM':            0x073,
-    'IRET':           0x074,
-    'SWINT':          0x075,
-    'INVD':           0x076,
-    'PAUSE':          0x077,
-    'HLT':            0x078,
-    'INVLPG':         0x079,
-    'INVLPGA':        0x07a,
-    'IOIO':           0x07b,
-    'MSR':            0x07c,
-    'TASK_SWITCH':    0x07d,
-    'FERR_FREEZE':    0x07e,
-    'SHUTDOWN':       0x07f,
-    'VMRUN':          0x080,
-    'VMMCALL':        0x081,
-    'VMLOAD':         0x082,
-    'VMSAVE':         0x083,
-    'STGI':           0x084,
-    'CLGI':           0x085,
-    'SKINIT':         0x086,
-    'RDTSCP':         0x087,
-    'ICEBP':          0x088,
-    'WBINVD':         0x089,
-    'MONITOR':        0x08a,
-    'MWAIT':          0x08b,
-    'MWAIT_COND':     0x08c,
-    'XSETBV':         0x08d,
-    'NPF':            0x400,
-}
-
-# EC definition of HSR (from arch/arm64/include/asm/kvm_arm.h)
-AARCH64_EXIT_REASONS = {
-    'UNKNOWN':      0x00,
-    'WFI':          0x01,
-    'CP15_32':      0x03,
-    'CP15_64':      0x04,
-    'CP14_MR':      0x05,
-    'CP14_LS':      0x06,
-    'FP_ASIMD':     0x07,
-    'CP10_ID':      0x08,
-    'CP14_64':      0x0C,
-    'ILL_ISS':      0x0E,
-    'SVC32':        0x11,
-    'HVC32':        0x12,
-    'SMC32':        0x13,
-    'SVC64':        0x15,
-    'HVC64':        0x16,
-    'SMC64':        0x17,
-    'SYS64':        0x18,
-    'IABT':         0x20,
-    'IABT_HYP':     0x21,
-    'PC_ALIGN':     0x22,
-    'DABT':         0x24,
-    'DABT_HYP':     0x25,
-    'SP_ALIGN':     0x26,
-    'FP_EXC32':     0x28,
-    'FP_EXC64':     0x2C,
-    'SERROR':       0x2F,
-    'BREAKPT':      0x30,
-    'BREAKPT_HYP':  0x31,
-    'SOFTSTP':      0x32,
-    'SOFTSTP_HYP':  0x33,
-    'WATCHPT':      0x34,
-    'WATCHPT_HYP':  0x35,
-    'BKPT32':       0x38,
-    'VECTOR32':     0x3A,
-    'BRK64':        0x3C,
-}
-
-# From include/uapi/linux/kvm.h, KVM_EXIT_xxx
-USERSPACE_EXIT_REASONS = {
-    'UNKNOWN':          0,
-    'EXCEPTION':        1,
-    'IO':               2,
-    'HYPERCALL':        3,
-    'DEBUG':            4,
-    'HLT':              5,
-    'MMIO':             6,
-    'IRQ_WINDOW_OPEN':  7,
-    'SHUTDOWN':         8,
-    'FAIL_ENTRY':       9,
-    'INTR':             10,
-    'SET_TPR':          11,
-    'TPR_ACCESS':       12,
-    'S390_SIEIC':       13,
-    'S390_RESET':       14,
-    'DCR':              15,
-    'NMI':              16,
-    'INTERNAL_ERROR':   17,
-    'OSI':              18,
-    'PAPR_HCALL':       19,
-    'S390_UCONTROL':    20,
-    'WATCHDOG':         21,
-    'S390_TSCH':        22,
-    'EPR':              23,
-    'SYSTEM_EVENT':     24,
-}
-
-IOCTL_NUMBERS = {
-    'SET_FILTER':  0x40082406,
-    'ENABLE':      0x00002400,
-    'DISABLE':     0x00002401,
-    'RESET':       0x00002403,
-}
-
-class Arch(object):
-    """Class that encapsulates global architecture specific data like
-    syscall and ioctl numbers.
-
-    """
-    @staticmethod
-    def get_arch():
-        machine = os.uname()[4]
-
-        if machine.startswith('ppc'):
-            return ArchPPC()
-        elif machine.startswith('aarch64'):
-            return ArchA64()
-        elif machine.startswith('s390'):
-            return ArchS390()
-        else:
-            # X86_64
-            for line in open('/proc/cpuinfo'):
-                if not line.startswith('flags'):
-                    continue
-
-                flags = line.split()
-                if 'vmx' in flags:
-                    return ArchX86(VMX_EXIT_REASONS)
-                if 'svm' in flags:
-                    return ArchX86(SVM_EXIT_REASONS)
-                return
-
-class ArchX86(Arch):
-    def __init__(self, exit_reasons):
-        self.sc_perf_evt_open = 298
-        self.ioctl_numbers = IOCTL_NUMBERS
-        self.exit_reasons = exit_reasons
-
-class ArchPPC(Arch):
-    def __init__(self):
-        self.sc_perf_evt_open = 319
-        self.ioctl_numbers = IOCTL_NUMBERS
-        self.ioctl_numbers['ENABLE'] = 0x20002400
-        self.ioctl_numbers['DISABLE'] = 0x20002401
-
-        # PPC comes in 32 and 64 bit and some generated ioctl
-        # numbers depend on the wordsize.
-        char_ptr_size = ctypes.sizeof(ctypes.c_char_p)
-        self.ioctl_numbers['SET_FILTER'] = 0x80002406 | char_ptr_size << 16
-
-class ArchA64(Arch):
-    def __init__(self):
-        self.sc_perf_evt_open = 241
-        self.ioctl_numbers = IOCTL_NUMBERS
-        self.exit_reasons = AARCH64_EXIT_REASONS
-
-class ArchS390(Arch):
-    def __init__(self):
-        self.sc_perf_evt_open = 331
-        self.ioctl_numbers = IOCTL_NUMBERS
-        self.exit_reasons = None
-
-ARCH = Arch.get_arch()
-
-
-def walkdir(path):
-    """Returns os.walk() data for specified directory.
-
-    As it is only a wrapper it returns the same 3-tuple of (dirpath,
-    dirnames, filenames).
-    """
-    return next(os.walk(path))
-
-
-def parse_int_list(list_string):
-    """Returns an int list from a string of comma separated integers and
-    integer ranges."""
-    integers = []
-    members = list_string.split(',')
-
-    for member in members:
-        if '-' not in member:
-            integers.append(int(member))
-        else:
-            int_range = member.split('-')
-            integers.extend(range(int(int_range[0]),
-                                  int(int_range[1]) + 1))
-
-    return integers
-
-
-def get_online_cpus():
-    with open('/sys/devices/system/cpu/online') as cpu_list:
-        cpu_string = cpu_list.readline()
-        return parse_int_list(cpu_string)
-
-
-def get_filters():
-    filters = {}
-    filters['kvm_userspace_exit'] = ('reason', USERSPACE_EXIT_REASONS)
-    if ARCH.exit_reasons:
-        filters['kvm_exit'] = ('exit_reason', ARCH.exit_reasons)
-    return filters
-
-libc = ctypes.CDLL('libc.so.6', use_errno=True)
-syscall = libc.syscall
-
-class perf_event_attr(ctypes.Structure):
-    _fields_ = [('type', ctypes.c_uint32),
-                ('size', ctypes.c_uint32),
-                ('config', ctypes.c_uint64),
-                ('sample_freq', ctypes.c_uint64),
-                ('sample_type', ctypes.c_uint64),
-                ('read_format', ctypes.c_uint64),
-                ('flags', ctypes.c_uint64),
-                ('wakeup_events', ctypes.c_uint32),
-                ('bp_type', ctypes.c_uint32),
-                ('bp_addr', ctypes.c_uint64),
-                ('bp_len', ctypes.c_uint64),
-                ]
-
-    def __init__(self):
-        super(self.__class__, self).__init__()
-        self.type = PERF_TYPE_TRACEPOINT
-        self.size = ctypes.sizeof(self)
-        self.read_format = PERF_FORMAT_GROUP
-
-def perf_event_open(attr, pid, cpu, group_fd, flags):
-    return syscall(ARCH.sc_perf_evt_open, ctypes.pointer(attr),
-                   ctypes.c_int(pid), ctypes.c_int(cpu),
-                   ctypes.c_int(group_fd), ctypes.c_long(flags))
-
-PERF_TYPE_TRACEPOINT = 2
-PERF_FORMAT_GROUP = 1 << 3
-
-PATH_DEBUGFS_TRACING = '/sys/kernel/debug/tracing'
-PATH_DEBUGFS_KVM = '/sys/kernel/debug/kvm'
-
-class Group(object):
-    def __init__(self):
-        self.events = []
-
-    def add_event(self, event):
-        self.events.append(event)
-
-    def read(self):
-        length = 8 * (1 + len(self.events))
-        read_format = 'xxxxxxxx' + 'Q' * len(self.events)
-        return dict(zip([event.name for event in self.events],
-                        struct.unpack(read_format,
-                                      os.read(self.events[0].fd, length))))
-
-class Event(object):
-    def __init__(self, name, group, trace_cpu, trace_point, trace_filter,
-                 trace_set='kvm'):
-        self.name = name
-        self.fd = None
-        self.setup_event(group, trace_cpu, trace_point, trace_filter,
-                         trace_set)
-
-    def setup_event_attribute(self, trace_set, trace_point):
-        id_path = os.path.join(PATH_DEBUGFS_TRACING, 'events', trace_set,
-                               trace_point, 'id')
-
-        event_attr = perf_event_attr()
-        event_attr.config = int(open(id_path).read())
-        return event_attr
-
-    def setup_event(self, group, trace_cpu, trace_point, trace_filter,
-                    trace_set):
-        event_attr = self.setup_event_attribute(trace_set, trace_point)
-
-        group_leader = -1
-        if group.events:
-            group_leader = group.events[0].fd
-
-        fd = perf_event_open(event_attr, -1, trace_cpu,
-                             group_leader, 0)
-        if fd == -1:
-            err = ctypes.get_errno()
-            raise OSError(err, os.strerror(err),
-                          'while calling sys_perf_event_open().')
-
-        if trace_filter:
-            fcntl.ioctl(fd, ARCH.ioctl_numbers['SET_FILTER'],
-                        trace_filter)
-
-        self.fd = fd
-
-    def enable(self):
-        fcntl.ioctl(self.fd, ARCH.ioctl_numbers['ENABLE'], 0)
-
-    def disable(self):
-        fcntl.ioctl(self.fd, ARCH.ioctl_numbers['DISABLE'], 0)
-
-    def reset(self):
-        fcntl.ioctl(self.fd, ARCH.ioctl_numbers['RESET'], 0)
-
-class TracepointProvider(object):
-    def __init__(self):
-        self.group_leaders = []
-        self.filters = get_filters()
-        self._fields = self.get_available_fields()
-        self.setup_traces()
-        self.fields = self._fields
-
-    def get_available_fields(self):
-        path = os.path.join(PATH_DEBUGFS_TRACING, 'events', 'kvm')
-        fields = walkdir(path)[1]
-        extra = []
-        for field in fields:
-            if field in self.filters:
-                filter_name_, filter_dicts = self.filters[field]
-                for name in filter_dicts:
-                    extra.append(field + '(' + name + ')')
-        fields += extra
-        return fields
-
-    def setup_traces(self):
-        cpus = get_online_cpus()
-
-        # The constant is needed as a buffer for python libs, std
-        # streams and other files that the script opens.
-        newlim = len(cpus) * len(self._fields) + 50
-        try:
-            softlim_, hardlim = resource.getrlimit(resource.RLIMIT_NOFILE)
-
-            if hardlim < newlim:
-                # Now we need CAP_SYS_RESOURCE, to increase the hard limit.
-                resource.setrlimit(resource.RLIMIT_NOFILE, (newlim, newlim))
-            else:
-                # Raising the soft limit is sufficient.
-                resource.setrlimit(resource.RLIMIT_NOFILE, (newlim, hardlim))
-
-        except ValueError:
-            sys.exit("NOFILE rlimit could not be raised to {0}".format(newlim))
-
-        for cpu in cpus:
-            group = Group()
-            for name in self._fields:
-                tracepoint = name
-                tracefilter = None
-                match = re.match(r'(.*)\((.*)\)', name)
-                if match:
-                    tracepoint, sub = match.groups()
-                    tracefilter = ('%s==%d\0' %
-                                   (self.filters[tracepoint][0],
-                                    self.filters[tracepoint][1][sub]))
-
-                group.add_event(Event(name=name,
-                                      group=group,
-                                      trace_cpu=cpu,
-                                      trace_point=tracepoint,
-                                      trace_filter=tracefilter))
-            self.group_leaders.append(group)
-
-    def available_fields(self):
-        return self.get_available_fields()
-
-    @property
-    def fields(self):
-        return self._fields
-
-    @fields.setter
-    def fields(self, fields):
-        self._fields = fields
-        for group in self.group_leaders:
-            for index, event in enumerate(group.events):
-                if event.name in fields:
-                    event.reset()
-                    event.enable()
-                else:
-                    # Do not disable the group leader.
-                    # It would disable all of its events.
-                    if index != 0:
-                        event.disable()
-
-    def read(self):
-        ret = defaultdict(int)
-        for group in self.group_leaders:
-            for name, val in group.read().iteritems():
-                if name in self._fields:
-                    ret[name] += val
-        return ret
-
-class DebugfsProvider(object):
-    def __init__(self):
-        self._fields = self.get_available_fields()
-
-    def get_available_fields(self):
-        return walkdir(PATH_DEBUGFS_KVM)[2]
-
-    @property
-    def fields(self):
-        return self._fields
-
-    @fields.setter
-    def fields(self, fields):
-        self._fields = fields
-
-    def read(self):
-        def val(key):
-            return int(file(PATH_DEBUGFS_KVM + '/' + key).read())
-        return dict([(key, val(key)) for key in self._fields])
-
-class Stats(object):
-    def __init__(self, providers, fields=None):
-        self.providers = providers
-        self._fields_filter = fields
-        self.values = {}
-        self.update_provider_filters()
-
-    def update_provider_filters(self):
-        def wanted(key):
-            if not self._fields_filter:
-                return True
-            return re.match(self._fields_filter, key) is not None
-
-        # As we reset the counters when updating the fields we can
-        # also clear the cache of old values.
-        self.values = {}
-        for provider in self.providers:
-            provider_fields = [key for key in provider.get_available_fields()
-                               if wanted(key)]
-            provider.fields = provider_fields
-
-    @property
-    def fields_filter(self):
-        return self._fields_filter
-
-    @fields_filter.setter
-    def fields_filter(self, fields_filter):
-        self._fields_filter = fields_filter
-        self.update_provider_filters()
-
-    def get(self):
-        for provider in self.providers:
-            new = provider.read()
-            for key in provider.fields:
-                oldval = self.values.get(key, (0, 0))
-                newval = new.get(key, 0)
-                newdelta = None
-                if oldval is not None:
-                    newdelta = newval - oldval[0]
-                self.values[key] = (newval, newdelta)
-        return self.values
-
-LABEL_WIDTH = 40
-NUMBER_WIDTH = 10
-
-class Tui(object):
-    def __init__(self, stats):
-        self.stats = stats
-        self.screen = None
-        self.drilldown = False
-        self.update_drilldown()
-
-    def __enter__(self):
-        """Initialises curses for later use.  Based on curses.wrapper
-           implementation from the Python standard library."""
-        self.screen = curses.initscr()
-        curses.noecho()
-        curses.cbreak()
-
-        # The try/catch works around a minor bit of
-        # over-conscientiousness in the curses module, the error
-        # return from C start_color() is ignorable.
-        try:
-            curses.start_color()
-        except:
-            pass
-
-        curses.use_default_colors()
-        return self
-
-    def __exit__(self, *exception):
-        """Resets the terminal to its normal state.  Based on curses.wrappre
-           implementation from the Python standard library."""
-        if self.screen:
-            self.screen.keypad(0)
-            curses.echo()
-            curses.nocbreak()
-            curses.endwin()
-
-    def update_drilldown(self):
-        if not self.stats.fields_filter:
-            self.stats.fields_filter = r'^[^\(]*$'
-
-        elif self.stats.fields_filter == r'^[^\(]*$':
-            self.stats.fields_filter = None
-
-    def refresh(self, sleeptime):
-        self.screen.erase()
-        self.screen.addstr(0, 0, 'kvm statistics - summary', curses.A_BOLD)
-        self.screen.addstr(2, 1, 'Event')
-        self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH -
-                           len('Total'), 'Total')
-        self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH + 8 -
-                           len('Current'), 'Current')
-        row = 3
-        stats = self.stats.get()
-        def sortkey(x):
-            if stats[x][1]:
-                return (-stats[x][1], -stats[x][0])
-            else:
-                return (0, -stats[x][0])
-        for key in sorted(stats.keys(), key=sortkey):
-
-            if row >= self.screen.getmaxyx()[0]:
-                break
-            values = stats[key]
-            if not values[0] and not values[1]:
-                break
-            col = 1
-            self.screen.addstr(row, col, key)
-            col += LABEL_WIDTH
-            self.screen.addstr(row, col, '%10d' % (values[0],))
-            col += NUMBER_WIDTH
-            if values[1] is not None:
-                self.screen.addstr(row, col, '%8d' % (values[1] / sleeptime,))
-            row += 1
-        self.screen.refresh()
-
-    def show_filter_selection(self):
-        while True:
-            self.screen.erase()
-            self.screen.addstr(0, 0,
-                               "Show statistics for events matching a regex.",
-                               curses.A_BOLD)
-            self.screen.addstr(2, 0,
-                               "Current regex: {0}"
-                               .format(self.stats.fields_filter))
-            self.screen.addstr(3, 0, "New regex: ")
-            curses.echo()
-            regex = self.screen.getstr()
-            curses.noecho()
-            if len(regex) == 0:
-                return
-            try:
-                re.compile(regex)
-                self.stats.fields_filter = regex
-                return
-            except re.error:
-                continue
-
-    def show_stats(self):
-        sleeptime = 0.25
-        while True:
-            self.refresh(sleeptime)
-            curses.halfdelay(int(sleeptime * 10))
-            sleeptime = 3
-            try:
-                char = self.screen.getkey()
-                if char == 'x':
-                    self.drilldown = not self.drilldown
-                    self.update_drilldown()
-                if char == 'q':
-                    break
-                if char == 'f':
-                    self.show_filter_selection()
-            except KeyboardInterrupt:
-                break
-            except curses.error:
-                continue
-
-def batch(stats):
-    s = stats.get()
-    time.sleep(1)
-    s = stats.get()
-    for key in sorted(s.keys()):
-        values = s[key]
-        print '%-42s%10d%10d' % (key, values[0], values[1])
-
-def log(stats):
-    keys = sorted(stats.get().iterkeys())
-    def banner():
-        for k in keys:
-            print '%s' % k,
-        print
-    def statline():
-        s = stats.get()
-        for k in keys:
-            print ' %9d' % s[k][1],
-        print
-    line = 0
-    banner_repeat = 20
-    while True:
-        time.sleep(1)
-        if line % banner_repeat == 0:
-            banner()
-        statline()
-        line += 1
-
-def get_options():
-    description_text = """
-This script displays various statistics about VMs running under KVM.
-The statistics are gathered from the KVM debugfs entries and / or the
-currently available perf traces.
-
-The monitoring takes additional cpu cycles and might affect the VM's
-performance.
-
-Requirements:
-- Access to:
-    /sys/kernel/debug/kvm
-    /sys/kernel/debug/trace/events/*
-    /proc/pid/task
-- /proc/sys/kernel/perf_event_paranoid < 1 if user has no
-  CAP_SYS_ADMIN and perf events are used.
-- CAP_SYS_RESOURCE if the hard limit is not high enough to allow
-  the large number of files that are possibly opened.
-"""
-
-    class PlainHelpFormatter(optparse.IndentedHelpFormatter):
-        def format_description(self, description):
-            if description:
-                return description + "\n"
-            else:
-                return ""
-
-    optparser = optparse.OptionParser(description=description_text,
-                                      formatter=PlainHelpFormatter())
-    optparser.add_option('-1', '--once', '--batch',
-                         action='store_true',
-                         default=False,
-                         dest='once',
-                         help='run in batch mode for one second',
-                         )
-    optparser.add_option('-l', '--log',
-                         action='store_true',
-                         default=False,
-                         dest='log',
-                         help='run in logging mode (like vmstat)',
-                         )
-    optparser.add_option('-t', '--tracepoints',
-                         action='store_true',
-                         default=False,
-                         dest='tracepoints',
-                         help='retrieve statistics from tracepoints',
-                         )
-    optparser.add_option('-d', '--debugfs',
-                         action='store_true',
-                         default=False,
-                         dest='debugfs',
-                         help='retrieve statistics from debugfs',
-                         )
-    optparser.add_option('-f', '--fields',
-                         action='store',
-                         default=None,
-                         dest='fields',
-                         help='fields to display (regex)',
-                         )
-    (options, _) = optparser.parse_args(sys.argv)
-    return options
-
-def get_providers(options):
-    providers = []
-
-    if options.tracepoints:
-        providers.append(TracepointProvider())
-    if options.debugfs:
-        providers.append(DebugfsProvider())
-    if len(providers) == 0:
-        providers.append(TracepointProvider())
-
-    return providers
-
-def check_access(options):
-    if not os.path.exists('/sys/kernel/debug'):
-        sys.stderr.write('Please enable CONFIG_DEBUG_FS in your kernel.')
-        sys.exit(1)
-
-    if not os.path.exists(PATH_DEBUGFS_KVM):
-        sys.stderr.write("Please make sure, that debugfs is mounted and "
-                         "readable by the current user:\n"
-                         "('mount -t debugfs debugfs /sys/kernel/debug')\n"
-                         "Also ensure, that the kvm modules are loaded.\n")
-        sys.exit(1)
-
-    if not os.path.exists(PATH_DEBUGFS_TRACING) and (options.tracepoints
-                                                     or not options.debugfs):
-        sys.stderr.write("Please enable CONFIG_TRACING in your kernel "
-                         "when using the option -t (default).\n"
-                         "If it is enabled, make {0} readable by the "
-                         "current user.\n"
-                         .format(PATH_DEBUGFS_TRACING))
-        if options.tracepoints:
-            sys.exit(1)
-
-        sys.stderr.write("Falling back to debugfs statistics!\n")
-        options.debugfs = True
-        sleep(5)
-
-    return options
-
-def main():
-    options = get_options()
-    options = check_access(options)
-    providers = get_providers(options)
-    stats = Stats(providers, fields=options.fields)
-
-    if options.log:
-        log(stats)
-    elif not options.once:
-        with Tui(stats) as tui:
-            tui.show_stats()
-    else:
-        batch(stats)
-
-if __name__ == "__main__":
-    main()
diff --git a/scripts/kvm/kvm_stat.texi b/scripts/kvm/kvm_stat.texi
deleted file mode 100644
index 6ce00d80e7..0000000000
--- a/scripts/kvm/kvm_stat.texi
+++ /dev/null
@@ -1,55 +0,0 @@
-@example
-@c man begin SYNOPSIS
-usage: kvm_stat [OPTION]...
-@c man end
-@end example
-
-@c man begin DESCRIPTION
-
-kvm_stat prints counts of KVM kernel module trace events.  These events signify
-state transitions such as guest mode entry and exit.
-
-This tool is useful for observing guest behavior from the host perspective.
-Often conclusions about performance or buggy behavior can be drawn from the
-output.
-
-The set of KVM kernel module trace events may be specific to the kernel version
-or architecture.  It is best to check the KVM kernel module source code for the
-meaning of events.
-
-Note that trace events are counted globally across all running guests.
-
-@c man end
-
-@c man begin OPTIONS
-@table @option
-@item -1, --once, --batch
-  run in batch mode for one second
-@item -l, --log
-  run in logging mode (like vmstat)
-@item -t, --tracepoints
-  retrieve statistics from tracepoints
-@item -d, --debugfs
-  retrieve statistics from debugfs
-@item -f, --fields=@var{fields}
-  fields to display (regex)
-@item -h, --help
-  show help message
-@end table
-
-@c man end
-
-@ignore
-
-@setfilename kvm_stat
-@settitle Report KVM kernel module event counters.
-
-@c man begin AUTHOR
-Stefan Hajnoczi <stefanha@redhat.com>
-@c man end
-
-@c man begin SEEALSO
-perf(1), trace-cmd(1)
-@c man end
-
-@end ignore
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 7b3667a089..abf50e6632 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -411,7 +411,8 @@ int kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
 
     if ((env->mcg_cap & MCG_SER_P) && addr
         && (code == BUS_MCEERR_AR || code == BUS_MCEERR_AO)) {
-        if (qemu_ram_addr_from_host(addr, &ram_addr) == NULL ||
+        ram_addr = qemu_ram_addr_from_host(addr);
+        if (ram_addr == RAM_ADDR_INVALID ||
             !kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
             fprintf(stderr, "Hardware memory error for memory used by "
                     "QEMU itself instead of guest system!\n");
@@ -445,7 +446,8 @@ int kvm_arch_on_sigbus(int code, void *addr)
         hwaddr paddr;
 
         /* Hope we are lucky for AO MCE */
-        if (qemu_ram_addr_from_host(addr, &ram_addr) == NULL ||
+        ram_addr = qemu_ram_addr_from_host(addr);
+        if (ram_addr == RAM_ADDR_INVALID ||
             !kvm_physical_memory_addr_from_host(first_cpu->kvm_state,
                                                 addr, &paddr)) {
             fprintf(stderr, "Hardware memory error for memory used by "
diff --git a/target-ppc/cpu.h b/target-ppc/cpu.h
index cd33539d1c..98a24a50f3 100644
--- a/target-ppc/cpu.h
+++ b/target-ppc/cpu.h
@@ -359,6 +359,8 @@ struct ppc_slb_t {
 #define MSR_EP   6  /* Exception prefix on 601                               */
 #define MSR_IR   5  /* Instruction relocate                                  */
 #define MSR_DR   4  /* Data relocate                                         */
+#define MSR_IS   5  /* Instruction address space (BookE)                     */
+#define MSR_DS   4  /* Data address space (BookE)                            */
 #define MSR_PE   3  /* Protection enable on 403                              */
 #define MSR_PX   2  /* Protection exclusive on 403                  x        */
 #define MSR_PMM  2  /* Performance monitor mark on POWER            x        */
@@ -410,6 +412,8 @@ struct ppc_slb_t {
 #define msr_ep   ((env->msr >> MSR_EP)   & 1)
 #define msr_ir   ((env->msr >> MSR_IR)   & 1)
 #define msr_dr   ((env->msr >> MSR_DR)   & 1)
+#define msr_is   ((env->msr >> MSR_IS)   & 1)
+#define msr_ds   ((env->msr >> MSR_DS)   & 1)
 #define msr_pe   ((env->msr >> MSR_PE)   & 1)
 #define msr_px   ((env->msr >> MSR_PX)   & 1)
 #define msr_pmm  ((env->msr >> MSR_PMM)  & 1)
@@ -889,7 +893,7 @@ struct ppc_segment_page_sizes {
 
 /*****************************************************************************/
 /* The whole PowerPC CPU context */
-#define NB_MMU_MODES 3
+#define NB_MMU_MODES    8
 
 #define PPC_CPU_OPCODES_LEN          0x40
 #define PPC_CPU_INDIRECT_OPCODES_LEN 0x20
@@ -954,6 +958,8 @@ struct CPUPPCState {
     /* PowerPC 64 SLB area */
     ppc_slb_t slb[MAX_SLB_ENTRIES];
     int32_t slb_nr;
+    /* TCG TLB needs a flush (typically deferred from an slb invalidation) */
+    uint32_t tlb_need_flush;
 #endif
     /* segment registers */
     hwaddr htab_base;
@@ -1053,7 +1059,8 @@ struct CPUPPCState {
     /* Those resources are used only in QEMU core */
     target_ulong hflags;      /* hflags is a MSR & HFLAGS_MASK         */
     target_ulong hflags_nmsr; /* specific hflags, not coming from MSR */
-    int mmu_idx;         /* precomputed MMU index to speed up mem accesses */
+    int immu_idx;         /* precomputed MMU index to speed up insn access */
+    int dmmu_idx;         /* precomputed MMU index to speed up data accesses */
 
     /* Power management */
     int (*check_pow)(CPUPPCState *env);
@@ -1242,13 +1249,10 @@ int ppc_dcr_write (ppc_dcr_t *dcr_env, int dcrn, uint32_t val);
 #define cpu_list ppc_cpu_list
 
 /* MMU modes definitions */
-#define MMU_MODE0_SUFFIX _user
-#define MMU_MODE1_SUFFIX _kernel
-#define MMU_MODE2_SUFFIX _hypv
 #define MMU_USER_IDX 0
 static inline int cpu_mmu_index (CPUPPCState *env, bool ifetch)
 {
-    return env->mmu_idx;
+    return ifetch ? env->immu_idx : env->dmmu_idx;
 }
 
 #include "exec/cpu-all.h"
diff --git a/target-ppc/excp_helper.c b/target-ppc/excp_helper.c
index 288903ee1d..a37009eb25 100644
--- a/target-ppc/excp_helper.c
+++ b/target-ppc/excp_helper.c
@@ -646,9 +646,6 @@ static inline void powerpc_excp(PowerPCCPU *cpu, int excp_model, int excp)
 
     if (env->spr[SPR_LPCR] & LPCR_AIL) {
         new_msr |= (1 << MSR_IR) | (1 << MSR_DR);
-    } else if (msr & ((1 << MSR_IR) | (1 << MSR_DR))) {
-        /* If we disactivated any translation, flush TLBs */
-        tlb_flush(cs, 1);
     }
 
 #ifdef TARGET_PPC64
@@ -722,13 +719,10 @@ static inline void powerpc_excp(PowerPCCPU *cpu, int excp_model, int excp)
     cs->exception_index = POWERPC_EXCP_NONE;
     env->error_code = 0;
 
-    if ((env->mmu_model == POWERPC_MMU_BOOKE) ||
-        (env->mmu_model == POWERPC_MMU_BOOKE206)) {
-        /* XXX: The BookE changes address space when switching modes,
-                we should probably implement that as different MMU indexes,
-                but for the moment we do it the slow way and flush all.  */
-        tlb_flush(cs, 1);
-    }
+    /* Any interrupt is context synchronizing, check if TCG TLB
+     * needs a delayed flush on ppc64
+     */
+    check_tlb_flush(env);
 }
 
 void ppc_cpu_do_interrupt(CPUState *cs)
@@ -954,6 +948,9 @@ static inline void do_rfi(CPUPPCState *env, target_ulong nip, target_ulong msr,
      * as rfi is always the last insn of a TB
      */
     cs->interrupt_request |= CPU_INTERRUPT_EXITTB;
+
+    /* Context synchronizing: check if TCG TLB needs flush */
+    check_tlb_flush(env);
 }
 
 void helper_rfi(CPUPPCState *env)
diff --git a/target-ppc/helper.h b/target-ppc/helper.h
index e5a8f7b9b5..0526322f4d 100644
--- a/target-ppc/helper.h
+++ b/target-ppc/helper.h
@@ -16,6 +16,7 @@ DEF_HELPER_1(rfmci, void, env)
 DEF_HELPER_1(rfid, void, env)
 DEF_HELPER_1(hrfid, void, env)
 #endif
+DEF_HELPER_1(check_tlb_flush, void, env)
 #endif
 
 DEF_HELPER_3(lmw, void, env, tl, i32)
diff --git a/target-ppc/helper_regs.h b/target-ppc/helper_regs.h
index 271fddf17f..57da931e3c 100644
--- a/target-ppc/helper_regs.h
+++ b/target-ppc/helper_regs.h
@@ -41,11 +41,50 @@ static inline void hreg_swap_gpr_tgpr(CPUPPCState *env)
 
 static inline void hreg_compute_mem_idx(CPUPPCState *env)
 {
-    /* Precompute MMU index */
-    if (msr_pr == 0 && msr_hv != 0) {
-        env->mmu_idx = 2;
+    /* This is our encoding for server processors
+     *
+     *   0 = Guest User space virtual mode
+     *   1 = Guest Kernel space virtual mode
+     *   2 = Guest Kernel space real mode
+     *   3 = HV User space virtual mode
+     *   4 = HV Kernel space virtual mode
+     *   5 = HV Kernel space real mode
+     *
+     * The combination PR=1 IR&DR=0 is invalid, we will treat
+     * it as IR=DR=1
+     *
+     * For BookE, we need 8 MMU modes as follows:
+     *
+     *  0 = AS 0 HV User space
+     *  1 = AS 0 HV Kernel space
+     *  2 = AS 1 HV User space
+     *  3 = AS 1 HV Kernel space
+     *  4 = AS 0 Guest User space
+     *  5 = AS 0 Guest Kernel space
+     *  6 = AS 1 Guest User space
+     *  7 = AS 1 Guest Kernel space
+     */
+    if (env->mmu_model & POWERPC_MMU_BOOKE) {
+        env->immu_idx = env->dmmu_idx = msr_pr ? 0 : 1;
+        env->immu_idx += msr_is ? 2 : 0;
+        env->dmmu_idx += msr_ds ? 2 : 0;
+        env->immu_idx += msr_gs ? 4 : 0;
+        env->dmmu_idx += msr_gs ? 4 : 0;
     } else {
-        env->mmu_idx = 1 - msr_pr;
+        /* First calculate a base value independent of HV */
+        if (msr_pr != 0) {
+            /* User space, ignore IR and DR */
+            env->immu_idx = env->dmmu_idx = 0;
+        } else {
+            /* Kernel, setup a base I/D value */
+            env->immu_idx = msr_ir ? 1 : 2;
+            env->dmmu_idx = msr_dr ? 1 : 2;
+        }
+        /* Then offset it for HV */
+        if (msr_hv) {
+            env->immu_idx += 3;
+            env->dmmu_idx += 3;
+        }
     }
 }
 
@@ -82,9 +121,10 @@ static inline int hreg_store_msr(CPUPPCState *env, target_ulong value,
     }
     if (((value >> MSR_IR) & 1) != msr_ir ||
         ((value >> MSR_DR) & 1) != msr_dr) {
-        /* Flush all tlb when changing translation mode */
-        tlb_flush(cs, 1);
-        excp = POWERPC_EXCP_NONE;
+        cs->interrupt_request |= CPU_INTERRUPT_EXITTB;
+    }
+    if ((env->mmu_model & POWERPC_MMU_BOOKE) &&
+        ((value >> MSR_GS) & 1) != msr_gs) {
         cs->interrupt_request |= CPU_INTERRUPT_EXITTB;
     }
     if (unlikely((env->flags & POWERPC_FLAG_TGPR) &&
@@ -111,4 +151,17 @@ static inline int hreg_store_msr(CPUPPCState *env, target_ulong value,
     return excp;
 }
 
+#if !defined(CONFIG_USER_ONLY) && defined(TARGET_PPC64)
+static inline void check_tlb_flush(CPUPPCState *env)
+{
+    CPUState *cs = CPU(ppc_env_get_cpu(env));
+    if (env->tlb_need_flush) {
+        env->tlb_need_flush = 0;
+        tlb_flush(cs, 1);
+    }
+}
+#else
+static inline void check_tlb_flush(CPUPPCState *env) { }
+#endif
+
 #endif /* !defined(__HELPER_REGS_H__) */
diff --git a/target-ppc/kvm_ppc.h b/target-ppc/kvm_ppc.h
index fc7931227d..3b2090e42e 100644
--- a/target-ppc/kvm_ppc.h
+++ b/target-ppc/kvm_ppc.h
@@ -163,7 +163,7 @@ static inline bool kvmppc_spapr_use_multitce(void)
 
 static inline void *kvmppc_create_spapr_tce(uint32_t liobn,
                                             uint32_t window_size, int *fd,
-                                            bool vfio_accel)
+                                            bool need_vfio)
 {
     return NULL;
 }
diff --git a/target-ppc/machine.c b/target-ppc/machine.c
index f6c7256974..4820f22377 100644
--- a/target-ppc/machine.c
+++ b/target-ppc/machine.c
@@ -97,9 +97,12 @@ static int cpu_load_old(QEMUFile *f, void *opaque, int version_id)
     qemu_get_betls(f, &env->nip);
     qemu_get_betls(f, &env->hflags);
     qemu_get_betls(f, &env->hflags_nmsr);
-    qemu_get_sbe32s(f, &env->mmu_idx);
+    qemu_get_sbe32(f); /* Discard unused mmu_idx */
     qemu_get_sbe32(f); /* Discard unused power_mode */
 
+    /* Recompute mmu indices */
+    hreg_compute_mem_idx(env);
+
     return 0;
 }
 
diff --git a/target-ppc/mmu-hash64.c b/target-ppc/mmu-hash64.c
index 04e6932fa0..ea6e99acd1 100644
--- a/target-ppc/mmu-hash64.c
+++ b/target-ppc/mmu-hash64.c
@@ -99,10 +99,8 @@ void dump_slb(FILE *f, fprintf_function cpu_fprintf, PowerPCCPU *cpu)
 
 void helper_slbia(CPUPPCState *env)
 {
-    PowerPCCPU *cpu = ppc_env_get_cpu(env);
-    int n, do_invalidate;
+    int n;
 
-    do_invalidate = 0;
     /* XXX: Warning: slbia never invalidates the first segment */
     for (n = 1; n < env->slb_nr; n++) {
         ppc_slb_t *slb = &env->slb[n];
@@ -113,12 +111,9 @@ void helper_slbia(CPUPPCState *env)
              *      and we still don't have a tlb_flush_mask(env, n, mask)
              *      in QEMU, we just invalidate all TLBs
              */
-            do_invalidate = 1;
+            env->tlb_need_flush = 1;
         }
     }
-    if (do_invalidate) {
-        tlb_flush(CPU(cpu), 1);
-    }
 }
 
 void helper_slbie(CPUPPCState *env, target_ulong addr)
@@ -138,7 +133,7 @@ void helper_slbie(CPUPPCState *env, target_ulong addr)
          *      and we still don't have a tlb_flush_mask(env, n, mask)
          *      in QEMU, we just invalidate all TLBs
          */
-        tlb_flush(CPU(cpu), 1);
+        env->tlb_need_flush = 1;
     }
 }
 
@@ -284,8 +279,6 @@ void ppc_hash64_set_external_hpt(PowerPCCPU *cpu, void *hpt, int shift,
     CPUPPCState *env = &cpu->env;
     Error *local_err = NULL;
 
-    cpu_synchronize_state(CPU(cpu));
-
     if (hpt) {
         env->external_htab = hpt;
     } else {
diff --git a/target-ppc/mmu_helper.c b/target-ppc/mmu_helper.c
index 2e0e3ca92c..1499af72a0 100644
--- a/target-ppc/mmu_helper.c
+++ b/target-ppc/mmu_helper.c
@@ -27,6 +27,7 @@
 #include "exec/exec-all.h"
 #include "exec/cpu_ldst.h"
 #include "exec/log.h"
+#include "helper_regs.h"
 
 //#define DEBUG_MMU
 //#define DEBUG_BATS
@@ -1924,6 +1925,7 @@ void ppc_tlb_invalidate_all(CPUPPCState *env)
     case POWERPC_MMU_2_06a:
     case POWERPC_MMU_2_07:
     case POWERPC_MMU_2_07a:
+        env->tlb_need_flush = 0;
 #endif /* defined(TARGET_PPC64) */
         tlb_flush(CPU(cpu), 1);
         break;
@@ -1986,7 +1988,7 @@ void ppc_tlb_invalidate_one(CPUPPCState *env, target_ulong addr)
          *      and we still don't have a tlb_flush_mask(env, n, mask) in QEMU,
          *      we just invalidate all TLBs
          */
-        tlb_flush(CPU(cpu), 1);
+        env->tlb_need_flush = 1;
         break;
 #endif /* defined(TARGET_PPC64) */
     default:
@@ -2875,6 +2877,11 @@ void helper_booke206_tlbflush(CPUPPCState *env, target_ulong type)
 }
 
 
+void helper_check_tlb_flush(CPUPPCState *env)
+{
+    check_tlb_flush(env);
+}
+
 /*****************************************************************************/
 
 /* try to fill the TLB and return an exception if error. If retaddr is
diff --git a/target-ppc/translate.c b/target-ppc/translate.c
index 745f4de98f..fe10bf8774 100644
--- a/target-ppc/translate.c
+++ b/target-ppc/translate.c
@@ -756,27 +756,20 @@ static void gen_cmpli(DisasContext *ctx)
 /* isel (PowerPC 2.03 specification) */
 static void gen_isel(DisasContext *ctx)
 {
-    TCGLabel *l1, *l2;
     uint32_t bi = rC(ctx->opcode);
-    uint32_t mask;
-    TCGv_i32 t0;
+    uint32_t mask = 0x08 >> (bi & 0x03);
+    TCGv t0 = tcg_temp_new();
+    TCGv zr;
 
-    l1 = gen_new_label();
-    l2 = gen_new_label();
+    tcg_gen_extu_i32_tl(t0, cpu_crf[bi >> 2]);
+    tcg_gen_andi_tl(t0, t0, mask);
 
-    mask = 0x08 >> (bi & 0x03);
-    t0 = tcg_temp_new_i32();
-    tcg_gen_andi_i32(t0, cpu_crf[bi >> 2], mask);
-    tcg_gen_brcondi_i32(TCG_COND_EQ, t0, 0, l1);
-    if (rA(ctx->opcode) == 0)
-        tcg_gen_movi_tl(cpu_gpr[rD(ctx->opcode)], 0);
-    else
-        tcg_gen_mov_tl(cpu_gpr[rD(ctx->opcode)], cpu_gpr[rA(ctx->opcode)]);
-    tcg_gen_br(l2);
-    gen_set_label(l1);
-    tcg_gen_mov_tl(cpu_gpr[rD(ctx->opcode)], cpu_gpr[rB(ctx->opcode)]);
-    gen_set_label(l2);
-    tcg_temp_free_i32(t0);
+    zr = tcg_const_tl(0);
+    tcg_gen_movcond_tl(TCG_COND_NE, cpu_gpr[rD(ctx->opcode)], t0, zr,
+                       rA(ctx->opcode) ? cpu_gpr[rA(ctx->opcode)] : zr,
+                       cpu_gpr[rB(ctx->opcode)]);
+    tcg_temp_free(zr);
+    tcg_temp_free(t0);
 }
 
 /* cmpb: PowerPC 2.05 specification */
@@ -1399,6 +1392,19 @@ GEN_LOGICAL2(nand, tcg_gen_nand_tl, 0x0E, PPC_INTEGER);
 /* nor & nor. */
 GEN_LOGICAL2(nor, tcg_gen_nor_tl, 0x03, PPC_INTEGER);
 
+#if defined(TARGET_PPC64)
+static void gen_pause(DisasContext *ctx)
+{
+    TCGv_i32 t0 = tcg_const_i32(0);
+    tcg_gen_st_i32(t0, cpu_env,
+                   -offsetof(PowerPCCPU, env) + offsetof(CPUState, halted));
+    tcg_temp_free_i32(t0);
+
+    /* Stop translation, this gives other CPUs a chance to run */
+    gen_exception_err(ctx, EXCP_HLT, 1);
+}
+#endif /* defined(TARGET_PPC64) */
+
 /* or & or. */
 static void gen_or(DisasContext *ctx)
 {
@@ -1454,7 +1460,7 @@ static void gen_or(DisasContext *ctx)
             }
             break;
         case 7:
-            if (ctx->hv) {
+            if (ctx->hv && !ctx->pr) {
                 /* Set process priority to very high */
                 prio = 7;
             }
@@ -1471,6 +1477,10 @@ static void gen_or(DisasContext *ctx)
             tcg_gen_ori_tl(t0, t0, ((uint64_t)prio) << 50);
             gen_store_spr(SPR_PPR, t0);
             tcg_temp_free(t0);
+            /* Pause out of TCG here; otherwise spin loops using smt_low
+             * eat too much CPU and the kernel hangs.
+             */
+            gen_pause(ctx);
         }
 #endif
     }
@@ -1496,8 +1506,6 @@ static void gen_ori(DisasContext *ctx)
     target_ulong uimm = UIMM(ctx->opcode);
 
     if (rS(ctx->opcode) == rA(ctx->opcode) && uimm == 0) {
-        /* NOP */
-        /* XXX: should handle special NOPs for POWER series */
         return;
     }
     tcg_gen_ori_tl(cpu_gpr[rA(ctx->opcode)], cpu_gpr[rS(ctx->opcode)], uimm);
@@ -1617,141 +1625,109 @@ static void gen_cntlzd(DisasContext *ctx)
 /* rlwimi & rlwimi. */
 static void gen_rlwimi(DisasContext *ctx)
 {
-    uint32_t mb, me, sh;
+    TCGv t_ra = cpu_gpr[rA(ctx->opcode)];
+    TCGv t_rs = cpu_gpr[rS(ctx->opcode)];
+    uint32_t sh = SH(ctx->opcode);
+    uint32_t mb = MB(ctx->opcode);
+    uint32_t me = ME(ctx->opcode);
 
-    mb = MB(ctx->opcode);
-    me = ME(ctx->opcode);
-    sh = SH(ctx->opcode);
-    if (likely(sh == (31-me) && mb <= me)) {
-        tcg_gen_deposit_tl(cpu_gpr[rA(ctx->opcode)], cpu_gpr[rA(ctx->opcode)],
-                           cpu_gpr[rS(ctx->opcode)], sh, me - mb + 1);
+    if (sh == (31-me) && mb <= me) {
+        tcg_gen_deposit_tl(t_ra, t_ra, t_rs, sh, me - mb + 1);
     } else {
         target_ulong mask;
+        TCGv_i32 t0;
         TCGv t1;
-        TCGv t0 = tcg_temp_new();
-#if defined(TARGET_PPC64)
-        tcg_gen_deposit_i64(t0, cpu_gpr[rS(ctx->opcode)],
-            cpu_gpr[rS(ctx->opcode)], 32, 32);
-        tcg_gen_rotli_i64(t0, t0, sh);
-#else
-        tcg_gen_rotli_i32(t0, cpu_gpr[rS(ctx->opcode)], sh);
-#endif
+
 #if defined(TARGET_PPC64)
         mb += 32;
         me += 32;
 #endif
         mask = MASK(mb, me);
+
+        t0 = tcg_temp_new_i32();
         t1 = tcg_temp_new();
-        tcg_gen_andi_tl(t0, t0, mask);
-        tcg_gen_andi_tl(t1, cpu_gpr[rA(ctx->opcode)], ~mask);
-        tcg_gen_or_tl(cpu_gpr[rA(ctx->opcode)], t0, t1);
-        tcg_temp_free(t0);
+        tcg_gen_trunc_tl_i32(t0, t_rs);
+        tcg_gen_rotli_i32(t0, t0, sh);
+        tcg_gen_extu_i32_tl(t1, t0);
+        tcg_temp_free_i32(t0);
+
+        tcg_gen_andi_tl(t1, t1, mask);
+        tcg_gen_andi_tl(t_ra, t_ra, ~mask);
+        tcg_gen_or_tl(t_ra, t_ra, t1);
         tcg_temp_free(t1);
     }
-    if (unlikely(Rc(ctx->opcode) != 0))
-        gen_set_Rc0(ctx, cpu_gpr[rA(ctx->opcode)]);
+    if (unlikely(Rc(ctx->opcode) != 0)) {
+        gen_set_Rc0(ctx, t_ra);
+    }
 }
 
 /* rlwinm & rlwinm. */
 static void gen_rlwinm(DisasContext *ctx)
 {
-    uint32_t mb, me, sh;
-
-    sh = SH(ctx->opcode);
-    mb = MB(ctx->opcode);
-    me = ME(ctx->opcode);
+    TCGv t_ra = cpu_gpr[rA(ctx->opcode)];
+    TCGv t_rs = cpu_gpr[rS(ctx->opcode)];
+    uint32_t sh = SH(ctx->opcode);
+    uint32_t mb = MB(ctx->opcode);
+    uint32_t me = ME(ctx->opcode);
 
-    if (likely(mb == 0 && me == (31 - sh))) {
-        if (likely(sh == 0)) {
-            tcg_gen_ext32u_tl(cpu_gpr[rA(ctx->opcode)], cpu_gpr[rS(ctx->opcode)]);
-        } else {
-            TCGv t0 = tcg_temp_new();
-            tcg_gen_ext32u_tl(t0, cpu_gpr[rS(ctx->opcode)]);
-            tcg_gen_shli_tl(t0, t0, sh);
-            tcg_gen_ext32u_tl(cpu_gpr[rA(ctx->opcode)], t0);
-            tcg_temp_free(t0);
-        }
-    } else if (likely(sh != 0 && me == 31 && sh == (32 - mb))) {
-        TCGv t0 = tcg_temp_new();
-        tcg_gen_ext32u_tl(t0, cpu_gpr[rS(ctx->opcode)]);
-        tcg_gen_shri_tl(t0, t0, mb);
-        tcg_gen_ext32u_tl(cpu_gpr[rA(ctx->opcode)], t0);
-        tcg_temp_free(t0);
-    } else if (likely(mb == 0 && me == 31)) {
-        TCGv_i32 t0 = tcg_temp_new_i32();
-        tcg_gen_trunc_tl_i32(t0, cpu_gpr[rS(ctx->opcode)]);
-        tcg_gen_rotli_i32(t0, t0, sh);
-        tcg_gen_extu_i32_tl(cpu_gpr[rA(ctx->opcode)], t0);
-        tcg_temp_free_i32(t0);
+    if (mb == 0 && me == (31 - sh)) {
+        tcg_gen_shli_tl(t_ra, t_rs, sh);
+        tcg_gen_ext32u_tl(t_ra, t_ra);
+    } else if (sh != 0 && me == 31 && sh == (32 - mb)) {
+        tcg_gen_ext32u_tl(t_ra, t_rs);
+        tcg_gen_shri_tl(t_ra, t_ra, mb);
     } else {
-        TCGv t0 = tcg_temp_new();
-#if defined(TARGET_PPC64)
-        tcg_gen_deposit_i64(t0, cpu_gpr[rS(ctx->opcode)],
-            cpu_gpr[rS(ctx->opcode)], 32, 32);
-        tcg_gen_rotli_i64(t0, t0, sh);
-#else
-        tcg_gen_rotli_i32(t0, cpu_gpr[rS(ctx->opcode)], sh);
-#endif
 #if defined(TARGET_PPC64)
         mb += 32;
         me += 32;
 #endif
-        tcg_gen_andi_tl(cpu_gpr[rA(ctx->opcode)], t0, MASK(mb, me));
-        tcg_temp_free(t0);
+        if (sh == 0) {
+            tcg_gen_andi_tl(t_ra, t_rs, MASK(mb, me));
+        } else {
+            TCGv_i32 t0 = tcg_temp_new_i32();
+
+            tcg_gen_trunc_tl_i32(t0, t_rs);
+            tcg_gen_rotli_i32(t0, t0, sh);
+            tcg_gen_andi_i32(t0, t0, MASK(mb, me));
+            tcg_gen_extu_i32_tl(t_ra, t0);
+            tcg_temp_free_i32(t0);
+        }
+    }
+    if (unlikely(Rc(ctx->opcode) != 0)) {
+        gen_set_Rc0(ctx, t_ra);
     }
-    if (unlikely(Rc(ctx->opcode) != 0))
-        gen_set_Rc0(ctx, cpu_gpr[rA(ctx->opcode)]);
 }
 
 /* rlwnm & rlwnm. */
 static void gen_rlwnm(DisasContext *ctx)
 {
-    uint32_t mb, me;
-    mb = MB(ctx->opcode);
-    me = ME(ctx->opcode);
+    TCGv t_ra = cpu_gpr[rA(ctx->opcode)];
+    TCGv t_rs = cpu_gpr[rS(ctx->opcode)];
+    TCGv t_rb = cpu_gpr[rB(ctx->opcode)];
+    uint32_t mb = MB(ctx->opcode);
+    uint32_t me = ME(ctx->opcode);
+    TCGv_i32 t0, t1;
 
-    if (likely(mb == 0 && me == 31)) {
-        TCGv_i32 t0, t1;
-        t0 = tcg_temp_new_i32();
-        t1 = tcg_temp_new_i32();
-        tcg_gen_trunc_tl_i32(t0, cpu_gpr[rB(ctx->opcode)]);
-        tcg_gen_trunc_tl_i32(t1, cpu_gpr[rS(ctx->opcode)]);
-        tcg_gen_andi_i32(t0, t0, 0x1f);
-        tcg_gen_rotl_i32(t1, t1, t0);
-        tcg_gen_extu_i32_tl(cpu_gpr[rA(ctx->opcode)], t1);
-        tcg_temp_free_i32(t0);
-        tcg_temp_free_i32(t1);
-    } else {
-        TCGv t0;
 #if defined(TARGET_PPC64)
-        TCGv t1;
+    mb += 32;
+    me += 32;
 #endif
 
-        t0 = tcg_temp_new();
-        tcg_gen_andi_tl(t0, cpu_gpr[rB(ctx->opcode)], 0x1f);
-#if defined(TARGET_PPC64)
-        t1 = tcg_temp_new_i64();
-        tcg_gen_deposit_i64(t1, cpu_gpr[rS(ctx->opcode)],
-                            cpu_gpr[rS(ctx->opcode)], 32, 32);
-        tcg_gen_rotl_i64(t0, t1, t0);
-        tcg_temp_free_i64(t1);
-#else
-        tcg_gen_rotl_i32(t0, cpu_gpr[rS(ctx->opcode)], t0);
-#endif
-        if (unlikely(mb != 0 || me != 31)) {
-#if defined(TARGET_PPC64)
-            mb += 32;
-            me += 32;
-#endif
-            tcg_gen_andi_tl(cpu_gpr[rA(ctx->opcode)], t0, MASK(mb, me));
-        } else {
-            tcg_gen_andi_tl(t0, t0, MASK(32, 63));
-            tcg_gen_mov_tl(cpu_gpr[rA(ctx->opcode)], t0);
-        }
-        tcg_temp_free(t0);
+    t0 = tcg_temp_new_i32();
+    t1 = tcg_temp_new_i32();
+    tcg_gen_trunc_tl_i32(t0, t_rb);
+    tcg_gen_trunc_tl_i32(t1, t_rs);
+    tcg_gen_andi_i32(t0, t0, 0x1f);
+    tcg_gen_rotl_i32(t1, t1, t0);
+    tcg_temp_free_i32(t0);
+
+    tcg_gen_andi_i32(t1, t1, MASK(mb, me));
+    tcg_gen_extu_i32_tl(t_ra, t1);
+    tcg_temp_free_i32(t1);
+
+    if (unlikely(Rc(ctx->opcode) != 0)) {
+        gen_set_Rc0(ctx, t_ra);
     }
-    if (unlikely(Rc(ctx->opcode) != 0))
-        gen_set_Rc0(ctx, cpu_gpr[rA(ctx->opcode)]);
 }
 
 #if defined(TARGET_PPC64)
@@ -1786,26 +1762,24 @@ static void glue(gen_, name##3)(DisasContext *ctx)                            \
     gen_##name(ctx, 1, 1);                                                    \
 }
 
-static inline void gen_rldinm(DisasContext *ctx, uint32_t mb, uint32_t me,
-                              uint32_t sh)
+static void gen_rldinm(DisasContext *ctx, int mb, int me, int sh)
 {
-    if (likely(sh != 0 && mb == 0 && me == (63 - sh))) {
-        tcg_gen_shli_tl(cpu_gpr[rA(ctx->opcode)], cpu_gpr[rS(ctx->opcode)], sh);
-    } else if (likely(sh != 0 && me == 63 && sh == (64 - mb))) {
-        tcg_gen_shri_tl(cpu_gpr[rA(ctx->opcode)], cpu_gpr[rS(ctx->opcode)], mb);
+    TCGv t_ra = cpu_gpr[rA(ctx->opcode)];
+    TCGv t_rs = cpu_gpr[rS(ctx->opcode)];
+
+    if (sh != 0 && mb == 0 && me == (63 - sh)) {
+        tcg_gen_shli_tl(t_ra, t_rs, sh);
+    } else if (sh != 0 && me == 63 && sh == (64 - mb)) {
+        tcg_gen_shri_tl(t_ra, t_rs, mb);
     } else {
-        TCGv t0 = tcg_temp_new();
-        tcg_gen_rotli_tl(t0, cpu_gpr[rS(ctx->opcode)], sh);
-        if (likely(mb == 0 && me == 63)) {
-            tcg_gen_mov_tl(cpu_gpr[rA(ctx->opcode)], t0);
-        } else {
-            tcg_gen_andi_tl(cpu_gpr[rA(ctx->opcode)], t0, MASK(mb, me));
-        }
-        tcg_temp_free(t0);
+        tcg_gen_rotli_tl(t_ra, t_rs, sh);
+        tcg_gen_andi_tl(t_ra, t_ra, MASK(mb, me));
+    }
+    if (unlikely(Rc(ctx->opcode) != 0)) {
+        gen_set_Rc0(ctx, t_ra);
     }
-    if (unlikely(Rc(ctx->opcode) != 0))
-        gen_set_Rc0(ctx, cpu_gpr[rA(ctx->opcode)]);
 }
+
 /* rldicl - rldicl. */
 static inline void gen_rldicl(DisasContext *ctx, int mbn, int shn)
 {
@@ -1816,6 +1790,7 @@ static inline void gen_rldicl(DisasContext *ctx, int mbn, int shn)
     gen_rldinm(ctx, mb, 63, sh);
 }
 GEN_PPC64_R4(rldicl, 0x1E, 0x00);
+
 /* rldicr - rldicr. */
 static inline void gen_rldicr(DisasContext *ctx, int men, int shn)
 {
@@ -1826,6 +1801,7 @@ static inline void gen_rldicr(DisasContext *ctx, int men, int shn)
     gen_rldinm(ctx, 0, me, sh);
 }
 GEN_PPC64_R4(rldicr, 0x1E, 0x02);
+
 /* rldic - rldic. */
 static inline void gen_rldic(DisasContext *ctx, int mbn, int shn)
 {
@@ -1837,21 +1813,22 @@ static inline void gen_rldic(DisasContext *ctx, int mbn, int shn)
 }
 GEN_PPC64_R4(rldic, 0x1E, 0x04);
 
-static inline void gen_rldnm(DisasContext *ctx, uint32_t mb, uint32_t me)
+static void gen_rldnm(DisasContext *ctx, int mb, int me)
 {
+    TCGv t_ra = cpu_gpr[rA(ctx->opcode)];
+    TCGv t_rs = cpu_gpr[rS(ctx->opcode)];
+    TCGv t_rb = cpu_gpr[rB(ctx->opcode)];
     TCGv t0;
 
     t0 = tcg_temp_new();
-    tcg_gen_andi_tl(t0, cpu_gpr[rB(ctx->opcode)], 0x3f);
-    tcg_gen_rotl_tl(t0, cpu_gpr[rS(ctx->opcode)], t0);
-    if (unlikely(mb != 0 || me != 63)) {
-        tcg_gen_andi_tl(cpu_gpr[rA(ctx->opcode)], t0, MASK(mb, me));
-    } else {
-        tcg_gen_mov_tl(cpu_gpr[rA(ctx->opcode)], t0);
-    }
+    tcg_gen_andi_tl(t0, t_rb, 0x3f);
+    tcg_gen_rotl_tl(t_ra, t_rs, t0);
     tcg_temp_free(t0);
-    if (unlikely(Rc(ctx->opcode) != 0))
-        gen_set_Rc0(ctx, cpu_gpr[rA(ctx->opcode)]);
+
+    tcg_gen_andi_tl(t_ra, t_ra, MASK(mb, me));
+    if (unlikely(Rc(ctx->opcode) != 0)) {
+        gen_set_Rc0(ctx, t_ra);
+    }
 }
 
 /* rldcl - rldcl. */
@@ -1863,6 +1840,7 @@ static inline void gen_rldcl(DisasContext *ctx, int mbn)
     gen_rldnm(ctx, mb, 63);
 }
 GEN_PPC64_R2(rldcl, 0x1E, 0x08);
+
 /* rldcr - rldcr. */
 static inline void gen_rldcr(DisasContext *ctx, int men)
 {
@@ -1872,32 +1850,31 @@ static inline void gen_rldcr(DisasContext *ctx, int men)
     gen_rldnm(ctx, 0, me);
 }
 GEN_PPC64_R2(rldcr, 0x1E, 0x09);
+
 /* rldimi - rldimi. */
-static inline void gen_rldimi(DisasContext *ctx, int mbn, int shn)
+static void gen_rldimi(DisasContext *ctx, int mbn, int shn)
 {
-    uint32_t sh, mb, me;
+    TCGv t_ra = cpu_gpr[rA(ctx->opcode)];
+    TCGv t_rs = cpu_gpr[rS(ctx->opcode)];
+    uint32_t sh = SH(ctx->opcode) | (shn << 5);
+    uint32_t mb = MB(ctx->opcode) | (mbn << 5);
+    uint32_t me = 63 - sh;
 
-    sh = SH(ctx->opcode) | (shn << 5);
-    mb = MB(ctx->opcode) | (mbn << 5);
-    me = 63 - sh;
-    if (unlikely(sh == 0 && mb == 0)) {
-        tcg_gen_mov_tl(cpu_gpr[rA(ctx->opcode)], cpu_gpr[rS(ctx->opcode)]);
+    if (mb <= me) {
+        tcg_gen_deposit_tl(t_ra, t_ra, t_rs, sh, me - mb + 1);
     } else {
-        TCGv t0, t1;
-        target_ulong mask;
+        target_ulong mask = MASK(mb, me);
+        TCGv t1 = tcg_temp_new();
 
-        t0 = tcg_temp_new();
-        tcg_gen_rotli_tl(t0, cpu_gpr[rS(ctx->opcode)], sh);
-        t1 = tcg_temp_new();
-        mask = MASK(mb, me);
-        tcg_gen_andi_tl(t0, t0, mask);
-        tcg_gen_andi_tl(t1, cpu_gpr[rA(ctx->opcode)], ~mask);
-        tcg_gen_or_tl(cpu_gpr[rA(ctx->opcode)], t0, t1);
-        tcg_temp_free(t0);
+        tcg_gen_rotli_tl(t1, t_rs, sh);
+        tcg_gen_andi_tl(t1, t1, mask);
+        tcg_gen_andi_tl(t_ra, t_ra, ~mask);
+        tcg_gen_or_tl(t_ra, t_ra, t1);
         tcg_temp_free(t1);
     }
-    if (unlikely(Rc(ctx->opcode) != 0))
-        gen_set_Rc0(ctx, cpu_gpr[rA(ctx->opcode)]);
+    if (unlikely(Rc(ctx->opcode) != 0)) {
+        gen_set_Rc0(ctx, t_ra);
+    }
 }
 GEN_PPC64_R4(rldimi, 0x1E, 0x06);
 #endif
@@ -3313,9 +3290,32 @@ static void gen_eieio(DisasContext *ctx)
 {
 }
 
+#if !defined(CONFIG_USER_ONLY) && defined(TARGET_PPC64)
+static inline void gen_check_tlb_flush(DisasContext *ctx)
+{
+    TCGv_i32 t = tcg_temp_new_i32();
+    TCGLabel *l = gen_new_label();
+
+    tcg_gen_ld_i32(t, cpu_env, offsetof(CPUPPCState, tlb_need_flush));
+    tcg_gen_brcondi_i32(TCG_COND_EQ, t, 0, l);
+    gen_helper_check_tlb_flush(cpu_env);
+    gen_set_label(l);
+    tcg_temp_free_i32(t);
+}
+#else
+static inline void gen_check_tlb_flush(DisasContext *ctx) { }
+#endif
+
 /* isync */
 static void gen_isync(DisasContext *ctx)
 {
+    /*
+     * We need to check for a pending TLB flush. This can only happen in
+     * kernel mode, however, so check MSR_PR.
+     */
+    if (!ctx->pr) {
+        gen_check_tlb_flush(ctx);
+    }
     gen_stop_exception(ctx);
 }
 
@@ -3472,6 +3472,15 @@ STCX(stqcx_, 16);
 /* sync */
 static void gen_sync(DisasContext *ctx)
 {
+    uint32_t l = (ctx->opcode >> 21) & 3;
+
+    /*
+     * For l == 2, it's a ptesync: we need to check for a pending TLB flush.
+     * This can only happen in kernel mode, however, so check MSR_PR as well.
+     */
+    if (l == 2 && !ctx->pr) {
+        gen_check_tlb_flush(ctx);
+    }
 }
 
 /* wait */
@@ -4387,7 +4396,7 @@ static void gen_mtmsrd(DisasContext *ctx)
         /* Special form that does not need any synchronisation */
         TCGv t0 = tcg_temp_new();
         tcg_gen_andi_tl(t0, cpu_gpr[rS(ctx->opcode)], (1 << MSR_RI) | (1 << MSR_EE));
-        tcg_gen_andi_tl(cpu_msr, cpu_msr, ~((1 << MSR_RI) | (1 << MSR_EE)));
+        tcg_gen_andi_tl(cpu_msr, cpu_msr, ~(target_ulong)((1 << MSR_RI) | (1 << MSR_EE)));
         tcg_gen_or_tl(cpu_msr, cpu_msr, t0);
         tcg_temp_free(t0);
     } else {
@@ -4418,7 +4427,7 @@ static void gen_mtmsr(DisasContext *ctx)
         /* Special form that does not need any synchronisation */
         TCGv t0 = tcg_temp_new();
         tcg_gen_andi_tl(t0, cpu_gpr[rS(ctx->opcode)], (1 << MSR_RI) | (1 << MSR_EE));
-        tcg_gen_andi_tl(cpu_msr, cpu_msr, ~((1 << MSR_RI) | (1 << MSR_EE)));
+        tcg_gen_andi_tl(cpu_msr, cpu_msr, ~(target_ulong)((1 << MSR_RI) | (1 << MSR_EE)));
         tcg_gen_or_tl(cpu_msr, cpu_msr, t0);
         tcg_temp_free(t0);
     } else {
@@ -4864,7 +4873,7 @@ static void gen_tlbie(DisasContext *ctx)
 #if defined(CONFIG_USER_ONLY)
     gen_inval_exception(ctx, POWERPC_EXCP_PRIV_OPC);
 #else
-    if (unlikely(ctx->pr)) {
+    if (unlikely(ctx->pr || !ctx->hv)) {
         gen_inval_exception(ctx, POWERPC_EXCP_PRIV_OPC);
         return;
     }
@@ -4885,14 +4894,15 @@ static void gen_tlbsync(DisasContext *ctx)
 #if defined(CONFIG_USER_ONLY)
     gen_inval_exception(ctx, POWERPC_EXCP_PRIV_OPC);
 #else
-    if (unlikely(ctx->pr)) {
+    if (unlikely(ctx->pr || !ctx->hv)) {
         gen_inval_exception(ctx, POWERPC_EXCP_PRIV_OPC);
         return;
     }
-    /* This has no effect: it should ensure that all previous
-     * tlbie have completed
+    /* tlbsync is a nop for server, where ptesync handles the delayed TLB
+     * flush; embedded, however, needs to deal with tlbsync. We don't try
+     * to be fancy and simply swallow the overhead of checking for both.
      */
-    gen_stop_exception(ctx);
+    gen_check_tlb_flush(ctx);
 #endif
 }
 
@@ -4903,7 +4913,7 @@ static void gen_slbia(DisasContext *ctx)
 #if defined(CONFIG_USER_ONLY)
     gen_inval_exception(ctx, POWERPC_EXCP_PRIV_OPC);
 #else
-    if (unlikely(ctx->pr)) {
+    if (unlikely(ctx->pr || !ctx->hv)) {
         gen_inval_exception(ctx, POWERPC_EXCP_PRIV_OPC);
         return;
     }
@@ -9951,8 +9961,10 @@ GEN_HANDLER2(slbmfee, "slbmfee", 0x1F, 0x13, 0x1C, 0x001F0001, PPC_SEGMENT_64B),
 GEN_HANDLER2(slbmfev, "slbmfev", 0x1F, 0x13, 0x1A, 0x001F0001, PPC_SEGMENT_64B),
 #endif
 GEN_HANDLER(tlbia, 0x1F, 0x12, 0x0B, 0x03FFFC01, PPC_MEM_TLBIA),
-GEN_HANDLER(tlbiel, 0x1F, 0x12, 0x08, 0x03FF0001, PPC_MEM_TLBIE),
-GEN_HANDLER(tlbie, 0x1F, 0x12, 0x09, 0x03FF0001, PPC_MEM_TLBIE),
+/* XXX: These instructions will need to be handled differently for
+ * different ISA versions */
+GEN_HANDLER(tlbiel, 0x1F, 0x12, 0x08, 0x001F0001, PPC_MEM_TLBIE),
+GEN_HANDLER(tlbie, 0x1F, 0x12, 0x09, 0x001F0001, PPC_MEM_TLBIE),
 GEN_HANDLER(tlbsync, 0x1F, 0x16, 0x11, 0x03FFF801, PPC_MEM_TLBSYNC),
 #if defined(TARGET_PPC64)
 GEN_HANDLER(slbia, 0x1F, 0x12, 0x0F, 0x03FFFC01, PPC_SLBI),
@@ -11258,8 +11270,9 @@ void ppc_cpu_dump_state(CPUState *cs, FILE *f, fprintf_function cpu_fprintf,
                 env->nip, env->lr, env->ctr, cpu_read_xer(env),
                 cs->cpu_index);
     cpu_fprintf(f, "MSR " TARGET_FMT_lx " HID0 " TARGET_FMT_lx "  HF "
-                TARGET_FMT_lx " idx %d\n", env->msr, env->spr[SPR_HID0],
-                env->hflags, env->mmu_idx);
+                TARGET_FMT_lx " iidx %d didx %d\n",
+                env->msr, env->spr[SPR_HID0],
+                env->hflags, env->immu_idx, env->dmmu_idx);
 #if !defined(NO_TIMER_DUMP)
     cpu_fprintf(f, "TB %08" PRIu32 " %08" PRIu64
 #if !defined(CONFIG_USER_ONLY)
@@ -11466,7 +11479,7 @@ void gen_intermediate_code(CPUPPCState *env, struct TranslationBlock *tb)
     ctx.spr_cb = env->spr_cb;
     ctx.pr = msr_pr;
     ctx.hv = !msr_pr && msr_hv;
-    ctx.mem_idx = env->mmu_idx;
+    ctx.mem_idx = env->dmmu_idx;
     ctx.insns_flags = env->insns_flags;
     ctx.insns_flags2 = env->insns_flags2;
     ctx.access_type = -1;
diff --git a/target-ppc/translate_init.c b/target-ppc/translate_init.c
index 954195f5e4..83010768ea 100644
--- a/target-ppc/translate_init.c
+++ b/target-ppc/translate_init.c
@@ -8359,7 +8359,7 @@ POWERPC_FAMILY(POWER7)(ObjectClass *oc, void *data)
                        PPC_CACHE | PPC_CACHE_ICBI | PPC_CACHE_DCBZ |
                        PPC_MEM_SYNC | PPC_MEM_EIEIO |
                        PPC_MEM_TLBIE | PPC_MEM_TLBSYNC |
-                       PPC_64B | PPC_ALTIVEC |
+                       PPC_64B | PPC_64H | PPC_ALTIVEC |
                        PPC_SEGMENT_64B | PPC_SLBI |
                        PPC_POPCNTB | PPC_POPCNTWD;
     pcc->insns_flags2 = PPC2_VSX | PPC2_DFP | PPC2_DBRX | PPC2_ISA205 |
@@ -8439,7 +8439,7 @@ POWERPC_FAMILY(POWER8)(ObjectClass *oc, void *data)
                        PPC_CACHE | PPC_CACHE_ICBI | PPC_CACHE_DCBZ |
                        PPC_MEM_SYNC | PPC_MEM_EIEIO |
                        PPC_MEM_TLBIE | PPC_MEM_TLBSYNC |
-                       PPC_64B | PPC_64BX | PPC_ALTIVEC |
+                       PPC_64B | PPC_64H | PPC_64BX | PPC_ALTIVEC |
                        PPC_SEGMENT_64B | PPC_SLBI |
                        PPC_POPCNTB | PPC_POPCNTWD;
     pcc->insns_flags2 = PPC2_VSX | PPC2_VSX207 | PPC2_DFP | PPC2_DBRX |
@@ -9231,6 +9231,14 @@ static void ppc_cpu_realizefn(DeviceState *dev, Error **errp)
 #if !defined(CONFIG_USER_ONLY)
     cpu->cpu_dt_id = (cs->cpu_index / smp_threads) * max_smt
         + (cs->cpu_index % smp_threads);
+
+    if (kvm_enabled() && !kvm_vcpu_id_is_valid(cpu->cpu_dt_id)) {
+        error_setg(errp, "Can't create CPU with id %d in KVM", cpu->cpu_dt_id);
+        error_append_hint(errp, "Adjust the number of cpus to %d "
+                          "or try to raise the number of threads per core\n",
+                          cpu->cpu_dt_id * smp_threads / max_smt);
+        return;
+    }
 #endif
 
     if (tcg_enabled()) {
diff --git a/tests/Makefile b/tests/Makefile
index b196489139..a3e20e39ec 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -436,13 +436,14 @@ tests/test-rcu-list$(EXESUF): tests/test-rcu-list.o $(test-util-obj-y)
 
 tests/test-qdev-global-props$(EXESUF): tests/test-qdev-global-props.o \
 	hw/core/qdev.o hw/core/qdev-properties.o hw/core/hotplug.o\
+	hw/core/bus.o \
 	hw/core/irq.o \
 	hw/core/fw-path-provider.o \
 	$(test-qapi-obj-y)
 tests/test-vmstate$(EXESUF): tests/test-vmstate.o \
-	migration/vmstate.o migration/qemu-file.o migration/qemu-file-buf.o \
-        migration/qemu-file-unix.o migration/qjson.o \
-	$(test-qom-obj-y)
+	migration/vmstate.o migration/qemu-file.o \
+        migration/qemu-file-channel.o migration/qjson.o \
+	$(test-io-obj-y)
 tests/test-timed-average$(EXESUF): tests/test-timed-average.o qemu-timer.o \
 	$(test-util-obj-y)
 tests/test-base64$(EXESUF): tests/test-base64.o \
diff --git a/tests/docker/Makefile.include b/tests/docker/Makefile.include
new file mode 100644
index 0000000000..2fd2ca3057
--- /dev/null
+++ b/tests/docker/Makefile.include
@@ -0,0 +1,126 @@
+# Makefile for Docker tests
+
+.PHONY: docker docker-test docker-clean docker-image docker-qemu-src
+
+DOCKER_SUFFIX := .docker
+DOCKER_FILES_DIR := $(SRC_PATH)/tests/docker/dockerfiles
+DOCKER_IMAGES := $(notdir $(basename $(wildcard $(DOCKER_FILES_DIR)/*.docker)))
+DOCKER_TARGETS := $(patsubst %,docker-image-%,$(DOCKER_IMAGES))
+# Use a global constant ccache directory to speed up repetitive builds
+DOCKER_CCACHE_DIR := $$HOME/.cache/qemu-docker-ccache
+
+DOCKER_TESTS := $(notdir $(shell \
+	find $(SRC_PATH)/tests/docker/ -name 'test-*' -type f))
+
+DOCKER_TOOLS := travis
+
+TESTS ?= %
+IMAGES ?= %
+
+# Make archive from git repo $1 to tar.gz $2
+make-archive-maybe = $(if $(wildcard $1/*), \
+	$(call quiet-command, \
+		(cd $1; if git diff-index --quiet HEAD -- &>/dev/null; then \
+			git archive -1 HEAD --format=tar.gz -o $2; \
+		else \
+			git archive -1 $$(git stash create) --format=tar.gz -o $2; \
+		fi), \
+		"  ARCHIVE $(notdir $2)"))
+
+CUR_TIME := $(shell date +%Y-%m-%d-%H.%M.%S.$$$$)
+# Makes the definition constant after the first expansion
+DOCKER_SRC_COPY = $(eval DOCKER_SRC_COPY := docker-src.$(CUR_TIME))$(DOCKER_SRC_COPY)
+
+$(DOCKER_SRC_COPY):
+	@mkdir $@
+	$(call make-archive-maybe, $(SRC_PATH), $@/qemu.tgz)
+	$(call make-archive-maybe, $(SRC_PATH)/dtc, $@/dtc.tgz)
+	$(call make-archive-maybe, $(SRC_PATH)/pixman, $@/pixman.tgz)
+	$(call quiet-command, cp $(SRC_PATH)/tests/docker/run $@/run, \
+		"  COPY RUNNER")
+
+docker-qemu-src: $(DOCKER_SRC_COPY)
+
+docker-image: ${DOCKER_TARGETS}
+
+# General rule for building docker images
+docker-image-%: $(DOCKER_FILES_DIR)/%.docker
+	$(call quiet-command,\
+		$(SRC_PATH)/tests/docker/docker.py build qemu:$* $< \
+		$(if $V,,--quiet) $(if $(NOCACHE),--no-cache),\
+		"  BUILD $*")
+
+# Expand all the prerequisites for each docker image and test combination
+$(foreach i,$(DOCKER_IMAGES), \
+	$(foreach t,$(DOCKER_TESTS) $(DOCKER_TOOLS), \
+		$(eval .PHONY: docker-$t@$i) \
+		$(eval docker-$t@$i: docker-image-$i docker-run-$t@$i) \
+	) \
+	$(foreach t,$(DOCKER_TESTS), \
+		$(eval docker-test: docker-$t@$i) \
+	) \
+)
+
+docker:
+	@echo 'Build QEMU and run tests inside Docker containers'
+	@echo
+	@echo 'Available targets:'
+	@echo
+	@echo '    docker:              Print this help.'
+	@echo '    docker-test:         Run all image/test combinations.'
+	@echo '    docker-clean:        Kill and remove residual docker testing containers.'
+	@echo '    docker-TEST@IMAGE:   Run "TEST" in container "IMAGE".'
+	@echo '                         Note: "TEST" is one of the listed test name,'
+	@echo '                         or a script name under $$QEMU_SRC/tests/docker/;'
+	@echo '                         "IMAGE" is one of the listed container name."'
+	@echo '    docker-image:        Build all images.'
+	@echo '    docker-image-IMAGE:  Build image "IMAGE".'
+	@echo
+	@echo 'Available container images:'
+	@echo '    $(DOCKER_IMAGES)'
+	@echo
+	@echo 'Available tests:'
+	@echo '    $(DOCKER_TESTS)'
+	@echo
+	@echo 'Available tools:'
+	@echo '    $(DOCKER_TOOLS)'
+	@echo
+	@echo 'Special variables:'
+	@echo '    TARGET_LIST=a,b,c    Override target list in builds.'
+	@echo '    EXTRA_CONFIGURE_OPTS="..."'
+	@echo '                         Extra configure options.'
+	@echo '    IMAGES="a b c ..":   Filters which images to build or run.'
+	@echo '    TESTS="x y z .."     Filters which tests to run (for docker-test).'
+	@echo '    J=[0..9]*            Overrides the -jN parameter for make commands'
+	@echo '                         (default is 1)'
+	@echo '    DEBUG=1              Stop and drop to shell in the created container'
+	@echo '                         before running the command.'
+	@echo '    NOCACHE=1            Ignore cache when build images.'
+
+docker-run-%: CMD = $(shell echo '$@' | sed -e 's/docker-run-\([^@]*\)@\(.*\)/\1/')
+docker-run-%: IMAGE = $(shell echo '$@' | sed -e 's/docker-run-\([^@]*\)@\(.*\)/\2/')
+docker-run-%: docker-qemu-src
+	@mkdir -p "$(DOCKER_CCACHE_DIR)"
+	@if test -z "$(IMAGE)" || test -z "$(CMD)"; \
+		then echo "Invalid target"; exit 1; \
+	fi
+	$(if $(filter $(TESTS),$(CMD)),$(if $(filter $(IMAGES),$(IMAGE)), \
+		$(call quiet-command,\
+			$(SRC_PATH)/tests/docker/docker.py run $(if $V,,--rm) \
+				-t \
+				$(if $(DEBUG),-i,--net=none) \
+				-e TARGET_LIST=$(TARGET_LIST) \
+				-e EXTRA_CONFIGURE_OPTS=$(EXTRA_CONFIGURE_OPTS) \
+				-e V=$V -e J=$J -e DEBUG=$(DEBUG)\
+				-e CCACHE_DIR=/var/tmp/ccache \
+				-v $$(realpath $(DOCKER_SRC_COPY)):/var/tmp/qemu:z$(COMMA)ro \
+				-v $(DOCKER_CCACHE_DIR):/var/tmp/ccache:z \
+				-w /var/tmp/qemu \
+				qemu:$(IMAGE) \
+				$(if $V,/bin/bash -x ,) \
+				./run \
+				$(CMD); \
+			, "  RUN $(CMD) in $(IMAGE)")))
+
+docker-clean:
+	$(call quiet-command, $(SRC_PATH)/tests/docker/docker.py clean)
diff --git a/tests/docker/common.rc b/tests/docker/common.rc
new file mode 100755
index 0000000000..c493eebd45
--- /dev/null
+++ b/tests/docker/common.rc
@@ -0,0 +1,32 @@
+#!/bin/sh
+#
+# Common routines for docker test scripts.
+#
+# Copyright (c) 2016 Red Hat Inc.
+#
+# Authors:
+#  Fam Zheng <famz@redhat.com>
+#
+# This work is licensed under the terms of the GNU GPL, version 2
+# or (at your option) any later version. See the COPYING file in
+# the top-level directory.
+
+requires()
+{
+    for c in $@; do
+        if ! echo "$FEATURES" | grep -wq -e "$c"; then
+            echo "Prerequisite '$c' not present, skip"
+            exit 0
+        fi
+    done
+}
+
+build_qemu()
+{
+    $QEMU_SRC/configure \
+        --target-list="${TARGET_LIST}" \
+        --prefix="$PWD/install" \
+        $EXTRA_CONFIGURE_OPTS \
+        "$@"
+    make $MAKEFLAGS
+}
diff --git a/tests/docker/docker.py b/tests/docker/docker.py
new file mode 100755
index 0000000000..0151362d17
--- /dev/null
+++ b/tests/docker/docker.py
@@ -0,0 +1,194 @@
+#!/usr/bin/env python2
+#
+# Docker controlling module
+#
+# Copyright (c) 2016 Red Hat Inc.
+#
+# Authors:
+#  Fam Zheng <famz@redhat.com>
+#
+# This work is licensed under the terms of the GNU GPL, version 2
+# or (at your option) any later version. See the COPYING file in
+# the top-level directory.
+
+import os
+import sys
+import subprocess
+import json
+import hashlib
+import atexit
+import uuid
+import argparse
+import tempfile
+from shutil import copy
+
+def _text_checksum(text):
+    """Calculate a digest string unique to the text content"""
+    return hashlib.sha1(text).hexdigest()
+
+def _guess_docker_command():
+    """ Guess a working docker command or raise exception if not found"""
+    commands = [["docker"], ["sudo", "-n", "docker"]]
+    for cmd in commands:
+        if subprocess.call(cmd + ["images"],
+                           stdout=subprocess.PIPE,
+                           stderr=subprocess.PIPE) == 0:
+            return cmd
+    commands_txt = "\n".join(["  " + " ".join(x) for x in commands])
+    raise Exception("Cannot find working docker command. Tried:\n%s" % \
+                    commands_txt)
+
+class Docker(object):
+    """ Running Docker commands """
+    def __init__(self):
+        self._command = _guess_docker_command()
+        self._instances = []
+        atexit.register(self._kill_instances)
+
+    def _do(self, cmd, quiet=True, **kwargs):
+        if quiet:
+            kwargs["stdout"] = subprocess.PIPE
+        return subprocess.call(self._command + cmd, **kwargs)
+
+    def _do_kill_instances(self, only_known, only_active=True):
+        cmd = ["ps", "-q"]
+        if not only_active:
+            cmd.append("-a")
+        for i in self._output(cmd).split():
+            resp = self._output(["inspect", i])
+            labels = json.loads(resp)[0]["Config"]["Labels"]
+            active = json.loads(resp)[0]["State"]["Running"]
+            if not labels:
+                continue
+            instance_uuid = labels.get("com.qemu.instance.uuid", None)
+            if not instance_uuid:
+                continue
+            if only_known and instance_uuid not in self._instances:
+                continue
+            print "Terminating", i
+            if active:
+                self._do(["kill", i])
+            self._do(["rm", i])
+
+    def clean(self):
+        self._do_kill_instances(False, False)
+        return 0
+
+    def _kill_instances(self):
+        return self._do_kill_instances(True)
+
+    def _output(self, cmd, **kwargs):
+        return subprocess.check_output(self._command + cmd,
+                                       stderr=subprocess.STDOUT,
+                                       **kwargs)
+
+    def get_image_dockerfile_checksum(self, tag):
+        resp = self._output(["inspect", tag])
+        labels = json.loads(resp)[0]["Config"].get("Labels", {})
+        return labels.get("com.qemu.dockerfile-checksum", "")
+
+    def build_image(self, tag, dockerfile, df_path, quiet=True, argv=None):
+        if argv == None:
+            argv = []
+        tmp_dir = tempfile.mkdtemp(prefix="docker_build")
+
+        tmp_df = tempfile.NamedTemporaryFile(dir=tmp_dir, suffix=".docker")
+        tmp_df.write(dockerfile)
+
+        tmp_df.write("\n")
+        tmp_df.write("LABEL com.qemu.dockerfile-checksum=%s" %
+                     _text_checksum(dockerfile))
+        tmp_df.flush()
+        self._do(["build", "-t", tag, "-f", tmp_df.name] + argv + \
+                 [tmp_dir],
+                 quiet=quiet)
+
+    def image_matches_dockerfile(self, tag, dockerfile):
+        try:
+            checksum = self.get_image_dockerfile_checksum(tag)
+        except Exception:
+            return False
+        return checksum == _text_checksum(dockerfile)
+
+    def run(self, cmd, keep, quiet):
+        label = uuid.uuid1().hex
+        if not keep:
+            self._instances.append(label)
+        ret = self._do(["run", "--label",
+                        "com.qemu.instance.uuid=" + label] + cmd,
+                       quiet=quiet)
+        if not keep:
+            self._instances.remove(label)
+        return ret
+
+class SubCommand(object):
+    """A SubCommand template base class"""
+    name = None # Subcommand name
+    def shared_args(self, parser):
+        parser.add_argument("--quiet", action="store_true",
+                            help="Run quietly unless an error occured")
+
+    def args(self, parser):
+        """Setup argument parser"""
+        pass
+    def run(self, args, argv):
+        """Run command.
+        args: parsed argument by argument parser.
+        argv: remaining arguments from sys.argv.
+        """
+        pass
+
+class RunCommand(SubCommand):
+    """Invoke docker run and take care of cleaning up"""
+    name = "run"
+    def args(self, parser):
+        parser.add_argument("--keep", action="store_true",
+                            help="Don't remove image when command completes")
+    def run(self, args, argv):
+        return Docker().run(argv, args.keep, quiet=args.quiet)
+
+class BuildCommand(SubCommand):
+    """ Build docker image out of a dockerfile. Arguments: <tag> <dockerfile>"""
+    name = "build"
+    def args(self, parser):
+        parser.add_argument("tag",
+                            help="Image Tag")
+        parser.add_argument("dockerfile",
+                            help="Dockerfile name")
+
+    def run(self, args, argv):
+        dockerfile = open(args.dockerfile, "rb").read()
+        tag = args.tag
+
+        dkr = Docker()
+        if dkr.image_matches_dockerfile(tag, dockerfile):
+            if not args.quiet:
+                print "Image is up to date."
+            return 0
+
+        dkr.build_image(tag, dockerfile, args.dockerfile,
+                        quiet=args.quiet, argv=argv)
+        return 0
+
+class CleanCommand(SubCommand):
+    """Clean up docker instances"""
+    name = "clean"
+    def run(self, args, argv):
+        Docker().clean()
+        return 0
+
+def main():
+    parser = argparse.ArgumentParser(description="A Docker helper",
+            usage="%s <subcommand> ..." % os.path.basename(sys.argv[0]))
+    subparsers = parser.add_subparsers(title="subcommands", help=None)
+    for cls in SubCommand.__subclasses__():
+        cmd = cls()
+        subp = subparsers.add_parser(cmd.name, help=cmd.__doc__)
+        cmd.shared_args(subp)
+        cmd.args(subp)
+        subp.set_defaults(cmdobj=cmd)
+    args, argv = parser.parse_known_args()
+    return args.cmdobj.run(args, argv)
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/docker/dockerfiles/centos6.docker b/tests/docker/dockerfiles/centos6.docker
new file mode 100644
index 0000000000..8f4fe46379
--- /dev/null
+++ b/tests/docker/dockerfiles/centos6.docker
@@ -0,0 +1,6 @@
+FROM centos:6
+RUN yum install -y \
+    tar git make gcc gcc-c++ \
+    zlib-devel glib2-devel SDL-devel pixman-devel \
+    epel-release
+RUN yum install -y libfdt-devel ccache
diff --git a/tests/docker/dockerfiles/fedora.docker b/tests/docker/dockerfiles/fedora.docker
new file mode 100644
index 0000000000..6251e45137
--- /dev/null
+++ b/tests/docker/dockerfiles/fedora.docker
@@ -0,0 +1,7 @@
+FROM fedora:23
+RUN dnf install -y \
+    ccache git tar \
+    glib2-devel pixman-devel zlib-devel SDL-devel libfdt-devel \
+    gcc gcc-c++ clang make perl which bc findutils \
+    mingw{32,64}-{pixman,glib2,gmp,SDL,pkg-config,gtk2,gtk3,gnutls,nettle,libtasn1,libjpeg-turbo,libpng,curl,libssh2,bzip2}
+ENV FEATURES mingw clang
diff --git a/tests/docker/dockerfiles/ubuntu.docker b/tests/docker/dockerfiles/ubuntu.docker
new file mode 100644
index 0000000000..725a7ca5d0
--- /dev/null
+++ b/tests/docker/dockerfiles/ubuntu.docker
@@ -0,0 +1,11 @@
+FROM ubuntu:14.04
+RUN echo "deb http://archive.ubuntu.com/ubuntu/ trusty universe multiverse" >> \
+    /etc/apt/sources.list
+RUN apt-get update && \
+    apt-get -y install \
+    libusb-1.0-0-dev libiscsi-dev librados-dev libncurses5-dev \
+    libseccomp-dev libgnutls-dev libssh2-1-dev  libspice-server-dev \
+    libspice-protocol-dev libnss3-dev libfdt-dev \
+    libgtk-3-dev libvte-2.90-dev libsdl1.2-dev libpng12-dev libpixman-1-dev \
+    git make ccache python-yaml gcc clang sparse
+ENV FEATURES clang ccache pyyaml
diff --git a/tests/docker/run b/tests/docker/run
new file mode 100755
index 0000000000..ec3d11934b
--- /dev/null
+++ b/tests/docker/run
@@ -0,0 +1,58 @@
+#!/bin/bash -e
+#
+# Docker test runner
+#
+# Copyright (c) 2016 Red Hat Inc.
+#
+# Authors:
+#  Fam Zheng <famz@redhat.com>
+#
+# This work is licensed under the terms of the GNU GPL, version 2
+# or (at your option) any later version. See the COPYING file in
+# the top-level directory.
+
+# Prepare the environment
+. /etc/profile || true
+export PATH=/usr/lib/ccache:$PATH
+
+if test -n "$J"; then
+    export MAKEFLAGS="$MAKEFLAGS -j$J"
+fi
+
+# We are in the container so the whole file system belong to us
+export TEST_DIR=/tmp/qemu-test
+mkdir -p $TEST_DIR/{src,build,install}
+
+# Extract the source tarballs
+tar -C $TEST_DIR/src -xzf qemu.tgz
+for p in dtc pixman; do
+    if test -f $p.tgz; then
+        tar -C $TEST_DIR/src/$p -xzf $p.tgz
+        export FEATURES="$FEATURES $p"
+    fi
+done
+
+export QEMU_SRC="$TEST_DIR/src"
+
+cd "$QEMU_SRC/tests/docker"
+
+CMD="$QEMU_SRC/tests/docker/$@"
+
+if test -n "$DEBUG"; then
+    echo "* Prepared to run command:"
+    echo "  $CMD"
+    echo "* Hit Ctrl-D to continue, or type 'exit 1' to abort"
+    echo
+    $SHELL
+fi
+
+if "$CMD"; then
+    exit 0
+elif test -n "$DEBUG"; then
+    echo "* Command failed:"
+    echo "  $CMD"
+    echo "* Hit Ctrl-D to exit"
+    echo
+    $SHELL || true    # debug shell's own status must not decide ours (or trip -e)
+fi
+exit 1  # reached only when the command failed; a bare "fi" would have yielded 0
diff --git a/tests/docker/test-clang b/tests/docker/test-clang
new file mode 100755
index 0000000000..6745dbeb83
--- /dev/null
+++ b/tests/docker/test-clang
@@ -0,0 +1,26 @@
+#!/bin/bash -e
+#
+# Compile and check with clang.
+#
+# Copyright (c) 2016 Red Hat Inc.
+#
+# Authors:
+#  Fam Zheng <famz@redhat.com>
+#
+# This work is licensed under the terms of the GNU GPL, version 2
+# or (at your option) any later version. See the COPYING file in
+# the top-level directory.
+
+. common.rc
+
+requires clang
+
+OPTS="--enable-debug --cxx=clang++ --cc=clang --host-cc=clang"
+# -fsanitize=undefined is broken on Fedora 23, skip it for now
+# See also: https://bugzilla.redhat.com/show_bug.cgi?id=1263834
+#OPTS="$OPTS --extra-cflags=-fsanitize=undefined \
+    #--extra-cflags=-fno-sanitize=float-divide-by-zero"
+DEF_TARGET_LIST="$(echo {x86_64,aarch64}-softmmu)"
+TARGET_LIST=${TARGET_LIST:-$DEF_TARGET_LIST} \
+build_qemu $OPTS
+make $MAKEFLAGS check
diff --git a/tests/docker/test-full b/tests/docker/test-full
new file mode 100755
index 0000000000..fd9b798947
--- /dev/null
+++ b/tests/docker/test-full
@@ -0,0 +1,17 @@
+#!/bin/bash -e
+#
+# Compile all the targets.
+#
+# Copyright (c) 2016 Red Hat Inc.
+#
+# Authors:
+#  Fam Zheng <famz@redhat.com>
+#
+# This work is licensed under the terms of the GNU GPL, version 2
+# or (at your option) any later version. See the COPYING file in
+# the top-level directory.
+
+. common.rc
+
+build_qemu
+make check $MAKEFLAGS
diff --git a/tests/docker/test-mingw b/tests/docker/test-mingw
new file mode 100755
index 0000000000..c03757add8
--- /dev/null
+++ b/tests/docker/test-mingw
@@ -0,0 +1,34 @@
+#!/bin/bash -e
+#
+# Cross compile QEMU with mingw toolchain on Linux.
+#
+# Copyright (c) 2016 Red Hat Inc.
+#
+# Authors:
+#  Fam Zheng <famz@redhat.com>
+#
+# This work is licensed under the terms of the GNU GPL, version 2
+# or (at your option) any later version. See the COPYING file in
+# the top-level directory.
+
+. common.rc
+
+requires mingw dtc
+
+for prefix in x86_64-w64-mingw32- i686-w64-mingw32-; do
+    TARGET_LIST=x86_64-softmmu,aarch64-softmmu \
+        build_qemu --cross-prefix=$prefix \
+        --enable-trace-backends=simple \
+        --enable-debug \
+        --enable-gnutls \
+        --enable-nettle \
+        --enable-curl \
+        --enable-vnc \
+        --enable-bzip2 \
+        --enable-guest-agent \
+        --with-sdlabi=1.2 \
+        --with-gtkabi=2.0
+    make clean
+
+done
+
diff --git a/tests/docker/test-quick b/tests/docker/test-quick
new file mode 100755
index 0000000000..07cdc59a10
--- /dev/null
+++ b/tests/docker/test-quick
@@ -0,0 +1,19 @@
+#!/bin/bash -e
+#
+# Quick compiling test that everyone already does. But why not automate it?
+#
+# Copyright (c) 2016 Red Hat Inc.
+#
+# Authors:
+#  Fam Zheng <famz@redhat.com>
+#
+# This work is licensed under the terms of the GNU GPL, version 2
+# or (at your option) any later version. See the COPYING file in
+# the top-level directory.
+
+. common.rc
+
+DEF_TARGET_LIST="$(echo {x86_64,aarch64}-softmmu)"
+TARGET_LIST=${TARGET_LIST:-$DEF_TARGET_LIST} \
+build_qemu
+make check $MAKEFLAGS
diff --git a/tests/docker/travis b/tests/docker/travis
new file mode 100755
index 0000000000..d345393ced
--- /dev/null
+++ b/tests/docker/travis
@@ -0,0 +1,21 @@
+#!/bin/bash -e
+#
+# Mimic a travis testing matrix
+#
+# Copyright (c) 2016 Red Hat Inc.
+#
+# Authors:
+#  Fam Zheng <famz@redhat.com>
+#
+# This work is licensed under the terms of the GNU GPL, version 2
+# or (at your option) any later version. See the COPYING file in
+# the top-level directory.
+
+. common.rc
+
+requires pyyaml
+cmdfile=/tmp/travis_cmd_list.sh
+$QEMU_SRC/tests/docker/travis.py $QEMU_SRC/.travis.yml > $cmdfile
+chmod +x $cmdfile
+cd "$QEMU_SRC"
+$cmdfile
diff --git a/tests/docker/travis.py b/tests/docker/travis.py
new file mode 100755
index 0000000000..8dcc964da4
--- /dev/null
+++ b/tests/docker/travis.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+#
+# Travis YAML config parser
+#
+# Copyright (c) 2016 Red Hat Inc.
+#
+# Authors:
+#  Fam Zheng <famz@redhat.com>
+#
+# This work is licensed under the terms of the GNU GPL, version 2
+# or (at your option) any later version. See the COPYING file in
+# the top-level directory.
+
+import sys
+import yaml
+import itertools
+
+def load_yaml(fname):
+    return yaml.safe_load(open(fname, "r").read())  # plain data only; no Loader arg
+
+def conf_iter(conf):
+    def env_to_list(env):
+        return env if isinstance(env, list) else [env]
+    global_env = conf["env"]["global"]
+    for entry in conf["matrix"]["include"]:
+        yield {"env": global_env + env_to_list(entry["env"]),
+               "compiler": entry["compiler"]}
+    for entry in itertools.product(conf["compiler"],
+                                   conf["env"]["matrix"]):
+        yield {"env": global_env + env_to_list(entry[1]),
+               "compiler": entry[0]}
+
+def main():
+    if len(sys.argv) < 2:
+        sys.stderr.write("Usage: %s <travis-file>\n" % sys.argv[0])
+        return 1
+    conf = load_yaml(sys.argv[1])
+    for config in conf_iter(conf):
+        print "("
+        print "\n".join(config["env"])
+        print "alias cc=" + config["compiler"]
+        print "\n".join(conf["before_script"])
+        print "\n".join(conf["script"])
+        print ")"
+    return 0
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/qemu-iotests/041 b/tests/qemu-iotests/041
index b1c542f99b..ed1d9d464c 100755
--- a/tests/qemu-iotests/041
+++ b/tests/qemu-iotests/041
@@ -207,33 +207,6 @@ class TestSingleBlockdev(TestSingleDrive):
     test_image_not_found = None
     test_small_buffer2 = None
 
-class TestBlockdevAttached(iotests.QMPTestCase):
-    image_len = 1 * 1024 * 1024 # MB
-
-    def setUp(self):
-        iotests.create_image(backing_img, self.image_len)
-        qemu_img('create', '-f', iotests.imgfmt, '-o', 'backing_file=%s' % backing_img, test_img)
-        qemu_img('create', '-f', iotests.imgfmt, '-o', 'backing_file=%s' % backing_img, target_img)
-        self.vm = iotests.VM().add_drive(test_img)
-        self.vm.launch()
-
-    def tearDown(self):
-        self.vm.shutdown()
-        os.remove(test_img)
-        os.remove(target_img)
-
-    def test_blockdev_attached(self):
-        self.assert_no_active_block_jobs()
-        args = {'options':
-                    {'driver': iotests.imgfmt,
-                     'id': 'drive1',
-                     'file': { 'filename': target_img, 'driver': 'file' } } }
-        result = self.vm.qmp("blockdev-add", **args)
-        self.assert_qmp(result, 'return', {})
-        result = self.vm.qmp('blockdev-mirror', device='drive0', sync='full',
-                             target='drive1')
-        self.assert_qmp(result, 'error/class', 'GenericError')
-
 class TestSingleDriveZeroLength(TestSingleDrive):
     image_len = 0
     test_small_buffer2 = None
diff --git a/tests/qemu-iotests/041.out b/tests/qemu-iotests/041.out
index b67d0504a6..b0cadc8245 100644
--- a/tests/qemu-iotests/041.out
+++ b/tests/qemu-iotests/041.out
@@ -1,5 +1,5 @@
-............................................................................
+...........................................................................
 ----------------------------------------------------------------------
-Ran 76 tests
+Ran 75 tests
 
 OK
diff --git a/tests/test-blockjob-txn.c b/tests/test-blockjob-txn.c
index 55fad9507a..828389bb45 100644
--- a/tests/test-blockjob-txn.c
+++ b/tests/test-blockjob-txn.c
@@ -15,6 +15,7 @@
 #include "qapi/error.h"
 #include "qemu/main-loop.h"
 #include "block/blockjob.h"
+#include "sysemu/block-backend.h"
 
 typedef struct {
     BlockJob common;
@@ -30,7 +31,7 @@ static const BlockJobDriver test_block_job_driver = {
 
 static void test_block_job_complete(BlockJob *job, void *opaque)
 {
-    BlockDriverState *bs = job->bs;
+    BlockDriverState *bs = blk_bs(job->blk);
     int rc = (intptr_t)opaque;
 
     if (block_job_is_cancelled(job)) {
diff --git a/tests/test-throttle.c b/tests/test-throttle.c
index 5ec966c8a4..c02be805f7 100644
--- a/tests/test-throttle.c
+++ b/tests/test-throttle.c
@@ -578,9 +578,9 @@ static void test_groups(void)
     BlockBackend *blk1, *blk2, *blk3;
     BlockBackendPublic *blkp1, *blkp2, *blkp3;
 
-    blk1 = blk_new_with_bs(&error_abort);
-    blk2 = blk_new_with_bs(&error_abort);
-    blk3 = blk_new_with_bs(&error_abort);
+    blk1 = blk_new();
+    blk2 = blk_new();
+    blk3 = blk_new();
 
     blkp1 = blk_get_public(blk1);
     blkp2 = blk_get_public(blk2);
diff --git a/tests/test-vmstate.c b/tests/test-vmstate.c
index 713d4443b2..d19b16a60e 100644
--- a/tests/test-vmstate.c
+++ b/tests/test-vmstate.c
@@ -29,6 +29,7 @@
 #include "migration/migration.h"
 #include "migration/vmstate.h"
 #include "qemu/coroutine.h"
+#include "io/channel-file.h"
 
 static char temp_file[] = "/tmp/vmst.test.XXXXXX";
 static int temp_fd;
@@ -44,35 +45,22 @@ void yield_until_fd_readable(int fd)
     select(fd + 1, &fds, NULL, NULL, NULL);
 }
 
-/*
- * Some tests use 'open_test_file' to work on a real fd, some use
- * an in memory file (QEMUSizedBuffer+qemu_bufopen); we could pick one
- * but this way we test both.
- */
 
 /* Duplicate temp_fd and seek to the beginning of the file */
 static QEMUFile *open_test_file(bool write)
 {
     int fd = dup(temp_fd);
+    QIOChannel *ioc;
     lseek(fd, 0, SEEK_SET);
     if (write) {
         g_assert_cmpint(ftruncate(fd, 0), ==, 0);
     }
-    return qemu_fdopen(fd, write ? "wb" : "rb");
-}
-
-/*
- * Check that the contents of the memory-buffered file f match
- * the given size/data.
- */
-static void check_mem_file(QEMUFile *f, void *data, size_t size)
-{
-    uint8_t *result = g_malloc(size);
-    const QEMUSizedBuffer *qsb = qemu_buf_get(f);
-    g_assert_cmpint(qsb_get_length(qsb), ==, size);
-    g_assert_cmpint(qsb_get_buffer(qsb, 0, size, result), ==, size);
-    g_assert_cmpint(memcmp(result, data, size), ==, 0);
-    g_free(result);
+    ioc = QIO_CHANNEL(qio_channel_file_new_fd(fd));
+    if (write) {
+        return qemu_fopen_channel_output(ioc);
+    } else {
+        return qemu_fopen_channel_input(ioc);
+    }
 }
 
 #define SUCCESS(val) \
@@ -392,7 +380,7 @@ static const VMStateDescription vmstate_skipping = {
 
 static void test_save_noskip(void)
 {
-    QEMUFile *fsave = qemu_bufopen("w", NULL);
+    QEMUFile *fsave = open_test_file(true);
     TestStruct obj = { .a = 1, .b = 2, .c = 3, .d = 4, .e = 5, .f = 6,
                        .skip_c_e = false };
     vmstate_save_state(fsave, &vmstate_skipping, &obj, NULL);
@@ -406,13 +394,14 @@ static void test_save_noskip(void)
         0, 0, 0, 5,             /* e */
         0, 0, 0, 0, 0, 0, 0, 6, /* f */
     };
-    check_mem_file(fsave, expected, sizeof(expected));
+
     qemu_fclose(fsave);
+    compare_vmstate(expected, sizeof(expected));
 }
 
 static void test_save_skip(void)
 {
-    QEMUFile *fsave = qemu_bufopen("w", NULL);
+    QEMUFile *fsave = open_test_file(true);
     TestStruct obj = { .a = 1, .b = 2, .c = 3, .d = 4, .e = 5, .f = 6,
                        .skip_c_e = true };
     vmstate_save_state(fsave, &vmstate_skipping, &obj, NULL);
@@ -424,13 +413,14 @@ static void test_save_skip(void)
         0, 0, 0, 0, 0, 0, 0, 4, /* d */
         0, 0, 0, 0, 0, 0, 0, 6, /* f */
     };
-    check_mem_file(fsave, expected, sizeof(expected));
 
     qemu_fclose(fsave);
+    compare_vmstate(expected, sizeof(expected));
 }
 
 static void test_load_noskip(void)
 {
+    QEMUFile *fsave = open_test_file(true);
     uint8_t buf[] = {
         0, 0, 0, 10,             /* a */
         0, 0, 0, 20,             /* b */
@@ -440,10 +430,10 @@ static void test_load_noskip(void)
         0, 0, 0, 0, 0, 0, 0, 60, /* f */
         QEMU_VM_EOF, /* just to ensure we won't get EOF reported prematurely */
     };
+    qemu_put_buffer(fsave, buf, sizeof(buf));
+    qemu_fclose(fsave);
 
-    QEMUSizedBuffer *qsb = qsb_create(buf, sizeof(buf));
-    g_assert(qsb);
-    QEMUFile *loading = qemu_bufopen("r", qsb);
+    QEMUFile *loading = open_test_file(false);
     TestStruct obj = { .skip_c_e = false };
     vmstate_load_state(loading, &vmstate_skipping, &obj, 2);
     g_assert(!qemu_file_get_error(loading));
@@ -454,11 +444,11 @@ static void test_load_noskip(void)
     g_assert_cmpint(obj.e, ==, 50);
     g_assert_cmpint(obj.f, ==, 60);
     qemu_fclose(loading);
-    qsb_free(qsb);
 }
 
 static void test_load_skip(void)
 {
+    QEMUFile *fsave = open_test_file(true);
     uint8_t buf[] = {
         0, 0, 0, 10,             /* a */
         0, 0, 0, 20,             /* b */
@@ -466,10 +456,10 @@ static void test_load_skip(void)
         0, 0, 0, 0, 0, 0, 0, 60, /* f */
         QEMU_VM_EOF, /* just to ensure we won't get EOF reported prematurely */
     };
+    qemu_put_buffer(fsave, buf, sizeof(buf));
+    qemu_fclose(fsave);
 
-    QEMUSizedBuffer *qsb = qsb_create(buf, sizeof(buf));
-    g_assert(qsb);
-    QEMUFile *loading = qemu_bufopen("r", qsb);
+    QEMUFile *loading = open_test_file(false);
     TestStruct obj = { .skip_c_e = true, .c = 300, .e = 500 };
     vmstate_load_state(loading, &vmstate_skipping, &obj, 2);
     g_assert(!qemu_file_get_error(loading));
@@ -480,13 +470,14 @@ static void test_load_skip(void)
     g_assert_cmpint(obj.e, ==, 500);
     g_assert_cmpint(obj.f, ==, 60);
     qemu_fclose(loading);
-    qsb_free(qsb);
 }
 
 int main(int argc, char **argv)
 {
     temp_fd = mkstemp(temp_file);
 
+    module_call_init(MODULE_INIT_QOM);
+
     g_test_init(&argc, &argv, NULL);
     g_test_add_func("/vmstate/simple/primitive", test_simple_primitive);
     g_test_add_func("/vmstate/versioned/load/v1", test_load_v1);
diff --git a/trace-events b/trace-events
index 74596db0c6..68ebac9d84 100644
--- a/trace-events
+++ b/trace-events
@@ -61,6 +61,10 @@ virtio_console_chr_event(unsigned int port, int event) "port %u, event %d"
 bdrv_open_common(void *bs, const char *filename, int flags, const char *format_name) "bs %p filename \"%s\" flags %#x format_name \"%s\""
 bdrv_lock_medium(void *bs, bool locked) "bs %p locked %d"
 
+# block/block-backend.c
+blk_co_preadv(void *blk, void *bs, int64_t offset, unsigned int bytes, int flags) "blk %p bs %p offset %"PRId64" bytes %u flags %x"
+blk_co_pwritev(void *blk, void *bs, int64_t offset, unsigned int bytes, int flags) "blk %p bs %p offset %"PRId64" bytes %u flags %x"
+
 # block/io.c
 bdrv_aio_discard(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
 bdrv_aio_flush(void *bs, void *opaque) "bs %p opaque %p"
@@ -68,8 +72,6 @@ bdrv_aio_readv(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %
 bdrv_aio_writev(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
 bdrv_aio_write_zeroes(void *bs, int64_t sector_num, int nb_sectors, int flags, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d flags %#x opaque %p"
 bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
-bdrv_co_copy_on_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
-bdrv_co_readv_no_serialising(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
 bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
 bdrv_co_write_zeroes(void *bs, int64_t sector_num, int nb_sector, int flags) "bs %p sector_num %"PRId64" nb_sectors %d flags %#x"
 bdrv_co_do_copy_on_readv(void *bs, int64_t sector_num, int nb_sectors, int64_t cluster_sector_num, int cluster_nb_sectors) "bs %p sector_num %"PRId64" nb_sectors %d cluster_sector_num %"PRId64" cluster_nb_sectors %d"
@@ -1143,7 +1145,7 @@ win_helper_done(uint32_t tl) "tl=%d"
 win_helper_retry(uint32_t tl) "tl=%d"
 
 # dma-helpers.c
-dma_blk_io(void *dbs, void *bs, int64_t sector_num, bool to_dev) "dbs=%p bs=%p sector_num=%" PRId64 " to_dev=%d"
+dma_blk_io(void *dbs, void *bs, int64_t offset, bool to_dev) "dbs=%p bs=%p offset=%" PRId64 " to_dev=%d"
 dma_aio_cancel(void *dbs) "dbs=%p"
 dma_complete(void *dbs, int ret, void *cb) "dbs=%p ret=%d cb=%p"
 dma_blk_cb(void *dbs, int ret) "dbs=%p ret=%d"
@@ -1428,7 +1430,7 @@ spapr_iommu_pci_get(uint64_t liobn, uint64_t ioba, uint64_t ret, uint64_t tce) "
 spapr_iommu_pci_indirect(uint64_t liobn, uint64_t ioba, uint64_t tce, uint64_t iobaN, uint64_t tceN, uint64_t ret) "liobn=%"PRIx64" ioba=0x%"PRIx64" tcelist=0x%"PRIx64" iobaN=0x%"PRIx64" tceN=0x%"PRIx64" ret=%"PRId64
 spapr_iommu_pci_stuff(uint64_t liobn, uint64_t ioba, uint64_t tce_value, uint64_t npages, uint64_t ret) "liobn=%"PRIx64" ioba=0x%"PRIx64" tcevalue=0x%"PRIx64" npages=%"PRId64" ret=%"PRId64
 spapr_iommu_xlate(uint64_t liobn, uint64_t ioba, uint64_t tce, unsigned perm, unsigned pgsize) "liobn=%"PRIx64" 0x%"PRIx64" -> 0x%"PRIx64" perm=%u mask=%x"
-spapr_iommu_new_table(uint64_t liobn, void *tcet, void *table, int fd) "liobn=%"PRIx64" tcet=%p table=%p fd=%d"
+spapr_iommu_new_table(uint64_t liobn, void *table, int fd) "liobn=%"PRIx64" table=%p fd=%d"
 
 # hw/ppc/ppc.c
 ppc_tb_adjust(uint64_t offs1, uint64_t offs2, int64_t diff, int64_t seconds) "adjusted from 0x%"PRIx64" to 0x%"PRIx64", diff %"PRId64" (%"PRId64"s)"
@@ -1481,7 +1483,7 @@ await_return_path_close_on_source_close(void) ""
 await_return_path_close_on_source_joining(void) ""
 migrate_set_state(int new_state) "new state %d"
 migrate_fd_cleanup(void) ""
-migrate_fd_error(void) ""
+migrate_fd_error(const char *error_desc) "error=%s"
 migrate_fd_cancel(void) ""
 migrate_handle_rp_req_pages(const char *rbname, size_t start, size_t len) "in %s at %zx len %zx"
 migrate_pending(uint64_t size, uint64_t max, uint64_t post, uint64_t nonpost) "pending size %" PRIu64 " max %" PRIu64 " (post=%" PRIu64 " nonpost=%" PRIu64 ")"
@@ -1511,6 +1513,8 @@ migrate_state_too_big(void) ""
 migrate_transferred(uint64_t tranferred, uint64_t time_spent, double bandwidth, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %g max_size %" PRId64
 process_incoming_migration_co_end(int ret, int ps) "ret=%d postcopy-state=%d"
 process_incoming_migration_co_postcopy_end_main(void) ""
+migration_set_incoming_channel(void *ioc, const char *ioctype) "ioc=%p ioctype=%s"
+migration_set_outgoing_channel(void *ioc, const char *ioctype, const char *hostname)  "ioc=%p ioctype=%s hostname=%s"
 
 # migration/rdma.c
 qemu_rdma_accept_incoming_migration(void) ""
@@ -1595,6 +1599,27 @@ postcopy_ram_incoming_cleanup_entry(void) ""
 postcopy_ram_incoming_cleanup_exit(void) ""
 postcopy_ram_incoming_cleanup_join(void) ""
 
+# migration/exec.c
+migration_exec_outgoing(const char *cmd) "cmd=%s"
+migration_exec_incoming(const char *cmd) "cmd=%s"
+
+# migration/fd.c
+migration_fd_outgoing(int fd) "fd=%d"
+migration_fd_incoming(int fd) "fd=%d"
+
+# migration/socket.c
+migration_socket_incoming_accepted(void) ""
+migration_socket_outgoing_connected(const char *hostname) "hostname=%s"
+migration_socket_outgoing_error(const char *err) "error=%s"
+
+# migration/tls.c
+migration_tls_outgoing_handshake_start(const char *hostname) "hostname=%s"
+migration_tls_outgoing_handshake_error(const char *err) "err=%s"
+migration_tls_outgoing_handshake_complete(void) ""
+migration_tls_incoming_handshake_start(void) ""
+migration_tls_incoming_handshake_error(const char *err) "err=%s"
+migration_tls_incoming_handshake_complete(void) ""
+
 # kvm-all.c
 kvm_ioctl(int type, void *arg) "type 0x%x, arg %p"
 kvm_vm_ioctl(int type, void *arg) "type 0x%x, arg %p"
@@ -1711,9 +1736,13 @@ vfio_quirk_ati_bonaire_reset_no_smc(const char *name) "%s"
 vfio_quirk_ati_bonaire_reset_timeout(const char *name) "%s"
 vfio_quirk_ati_bonaire_reset_done(const char *name) "%s"
 vfio_quirk_ati_bonaire_reset(const char *name) "%s"
+vfio_pci_igd_bar4_write(const char *name, uint32_t index, uint32_t data, uint32_t base) "%s [%03x] %08x -> %08x"
+vfio_pci_igd_bdsm_enabled(const char *name, int size) "%s %dMB"
+vfio_pci_igd_opregion_enabled(const char *name) "%s"
+vfio_pci_igd_host_bridge_enabled(const char *name) "%s"
+vfio_pci_igd_lpc_bridge_enabled(const char *name) "%s"
 
-
-# hw/vfio/vfio-common.c
+# hw/vfio/common.c
 vfio_region_write(const char *name, int index, uint64_t addr, uint64_t data, unsigned size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)"
 vfio_region_read(char *name, int index, uint64_t addr, unsigned size, uint64_t data) " (%s:region%d+0x%"PRIx64", %d) = 0x%"PRIx64
 vfio_iommu_map_notify(uint64_t iova_start, uint64_t iova_end) "iommu map @ %"PRIx64" - %"PRIx64
@@ -1732,6 +1761,9 @@ vfio_region_mmap(const char *name, unsigned long offset, unsigned long end) "Reg
 vfio_region_exit(const char *name, int index) "Device %s, region %d"
 vfio_region_finalize(const char *name, int index) "Device %s, region %d"
 vfio_region_mmaps_set_enabled(const char *name, bool enabled) "Region %s mmaps enabled: %d"
+vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Device %s region %d: %d sparse mmap entries"
+vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]"
+vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%08x"
 
 # hw/vfio/platform.c
 vfio_platform_base_device_init(char *name, int groupid) "%s belongs to group #%d"
diff --git a/util/error.c b/util/error.c
index cae2511732..9c40b1f458 100644
--- a/util/error.c
+++ b/util/error.c
@@ -217,7 +217,7 @@ ErrorClass error_get_class(const Error *err)
     return err->err_class;
 }
 
-const char *error_get_pretty(Error *err)
+const char *error_get_pretty(const Error *err)
 {
     return err->msg;
 }
diff --git a/xen-hvm.c b/xen-hvm.c
index c14e778a8e..01ee25de21 100644
--- a/xen-hvm.c
+++ b/xen-hvm.c
@@ -511,8 +511,13 @@ static void xen_io_add(MemoryListener *listener,
                        MemoryRegionSection *section)
 {
     XenIOState *state = container_of(listener, XenIOState, io_listener);
+    MemoryRegion *mr = section->mr;
 
-    memory_region_ref(section->mr);
+    if (mr->ops == &unassigned_io_ops) {
+        return;
+    }
+
+    memory_region_ref(mr);
 
     xen_map_io_section(xen_xc, xen_domid, state->ioservid, section);
 }
@@ -521,10 +526,15 @@ static void xen_io_del(MemoryListener *listener,
                        MemoryRegionSection *section)
 {
     XenIOState *state = container_of(listener, XenIOState, io_listener);
+    MemoryRegion *mr = section->mr;
+
+    if (mr->ops == &unassigned_io_ops) {
+        return;
+    }
 
     xen_unmap_io_section(xen_xc, xen_domid, state->ioservid, section);
 
-    memory_region_unref(section->mr);
+    memory_region_unref(mr);
 }
 
 static void xen_device_realize(DeviceListener *listener,