128 files changed, 3624 insertions(+), 1384 deletions(-)
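Most of the block-layer hunks below share one pattern: counters that were previously modified only under the big QEMU lock (bs->copy_on_read, bs->quiesce_counter, bs->io_plugged, bs->write_gen, bs->serialising_in_flight, blk->public.io_limits_disabled) become atomic, and list/queue manipulation moves under the new bs->reqs_lock, stats->lock and dirty_bitmap_mutex. Below is a minimal sketch of the recurring begin/end counter idiom, with C11 stdatomic standing in for QEMU's qemu/atomic.h wrappers; the State type and the side-effect hooks are placeholders for illustration, not QEMU APIs:

/* Only the 0->1 and 1->0 transitions perform the expensive side effect;
 * the counter itself no longer needs a lock. */
#include <stdatomic.h>

typedef struct {
    atomic_int quiesce_counter;
} State;

static void quiesce(State *s) { (void)s; /* e.g. aio_disable_external() */ }
static void resume(State *s)  { (void)s; /* e.g. aio_enable_external() */ }

void drained_begin(State *s)
{
    /* atomic_fetch_add returns the old value: 0 means we are the first user */
    if (atomic_fetch_add(&s->quiesce_counter, 1) == 0) {
        quiesce(s);
    }
}

void drained_end(State *s)
{
    /* old value 1 means we were the last user */
    if (atomic_fetch_sub(&s->quiesce_counter, 1) == 1) {
        resume(s);
    }
}

The same fetch-and-op shape appears in bdrv_drained_begin/end, bdrv_io_plug/unplug and blk_root_drained_begin/end in the hunks below.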
diff --git a/.gitignore b/.gitignore index 55a001e3b8..09c2363acf 100644 --- a/.gitignore +++ b/.gitignore @@ -50,6 +50,7 @@ /qemu-version.h.tmp /module_block.h /vscclient +/vhost-user-scsi /fsdev/virtfs-proxy-helper *.[1-9] *.a @@ -99,14 +100,14 @@ /pc-bios/optionrom/kvmvapic.img /pc-bios/s390-ccw/s390-ccw.elf /pc-bios/s390-ccw/s390-ccw.img -/docs/qemu-ga-qapi.texi -/docs/qemu-ga-ref.html -/docs/qemu-ga-ref.info* -/docs/qemu-ga-ref.txt -/docs/qemu-qmp-qapi.texi -/docs/qemu-qmp-ref.html -/docs/qemu-qmp-ref.info* -/docs/qemu-qmp-ref.txt +/docs/interop/qemu-ga-qapi.texi +/docs/interop/qemu-ga-ref.html +/docs/interop/qemu-ga-ref.info* +/docs/interop/qemu-ga-ref.txt +/docs/interop/qemu-qmp-qapi.texi +/docs/interop/qemu-qmp-ref.html +/docs/interop/qemu-qmp-ref.info* +/docs/interop/qemu-qmp-ref.txt /docs/version.texi *.tps .stgit-* @@ -207,8 +207,8 @@ HELPERS-$(CONFIG_LINUX) = qemu-bridge-helper$(EXESUF) ifdef BUILD_DOCS DOCS=qemu-doc.html qemu-doc.txt qemu.1 qemu-img.1 qemu-nbd.8 qemu-ga.8 -DOCS+=docs/qemu-qmp-ref.html docs/qemu-qmp-ref.txt docs/qemu-qmp-ref.7 -DOCS+=docs/qemu-ga-ref.html docs/qemu-ga-ref.txt docs/qemu-ga-ref.7 +DOCS+=docs/interop/qemu-qmp-ref.html docs/interop/qemu-qmp-ref.txt docs/interop/qemu-qmp-ref.7 +DOCS+=docs/interop/qemu-ga-ref.html docs/interop/qemu-ga-ref.txt docs/interop/qemu-ga-ref.7 ifdef CONFIG_VIRTFS DOCS+=fsdev/virtfs-proxy-helper.1 endif @@ -269,6 +269,7 @@ dummy := $(call unnest-vars,, \ ivshmem-client-obj-y \ ivshmem-server-obj-y \ libvhost-user-obj-y \ + vhost-user-scsi-obj-y \ qga-vss-dll-obj-y \ block-obj-y \ block-obj-m \ @@ -473,6 +474,8 @@ ivshmem-client$(EXESUF): $(ivshmem-client-obj-y) $(COMMON_LDADDS) $(call LINK, $^) ivshmem-server$(EXESUF): $(ivshmem-server-obj-y) $(COMMON_LDADDS) $(call LINK, $^) +vhost-user-scsi$(EXESUF): $(vhost-user-scsi-obj-y) + $(call LINK, $^) module_block.h: $(SRC_PATH)/scripts/modules/module_block.py config-host.mak $(call quiet-command,$(PYTHON) $< $@ \ @@ -519,11 +522,12 @@ distclean: clean rm -f qemu-doc.vr qemu-doc.txt rm -f config.log rm -f linux-headers/asm - rm -f docs/qemu-ga-qapi.texi docs/qemu-qmp-qapi.texi docs/version.texi - rm -f docs/qemu-qmp-ref.7 docs/qemu-ga-ref.7 - rm -f docs/qemu-qmp-ref.txt docs/qemu-ga-ref.txt - rm -f docs/qemu-qmp-ref.pdf docs/qemu-ga-ref.pdf - rm -f docs/qemu-qmp-ref.html docs/qemu-ga-ref.html + rm -f docs/version.texi + rm -f docs/interop/qemu-ga-qapi.texi docs/interop/qemu-qmp-qapi.texi + rm -f docs/interop/qemu-qmp-ref.7 docs/interop/qemu-ga-ref.7 + rm -f docs/interop/qemu-qmp-ref.txt docs/interop/qemu-ga-ref.txt + rm -f docs/interop/qemu-qmp-ref.pdf docs/interop/qemu-ga-ref.pdf + rm -f docs/interop/qemu-qmp-ref.html docs/interop/qemu-ga-ref.html for d in $(TARGET_DIRS); do \ rm -rf $$d || exit 1 ; \ done @@ -562,13 +566,13 @@ install-doc: $(DOCS) $(INSTALL_DIR) "$(DESTDIR)$(qemu_docdir)" $(INSTALL_DATA) qemu-doc.html "$(DESTDIR)$(qemu_docdir)" $(INSTALL_DATA) qemu-doc.txt "$(DESTDIR)$(qemu_docdir)" - $(INSTALL_DATA) docs/qemu-qmp-ref.html "$(DESTDIR)$(qemu_docdir)" - $(INSTALL_DATA) docs/qemu-qmp-ref.txt "$(DESTDIR)$(qemu_docdir)" + $(INSTALL_DATA) docs/interop/qemu-qmp-ref.html "$(DESTDIR)$(qemu_docdir)" + $(INSTALL_DATA) docs/interop/qemu-qmp-ref.txt "$(DESTDIR)$(qemu_docdir)" ifdef CONFIG_POSIX $(INSTALL_DIR) "$(DESTDIR)$(mandir)/man1" $(INSTALL_DATA) qemu.1 "$(DESTDIR)$(mandir)/man1" $(INSTALL_DIR) "$(DESTDIR)$(mandir)/man7" - $(INSTALL_DATA) docs/qemu-qmp-ref.7 "$(DESTDIR)$(mandir)/man7" + $(INSTALL_DATA) docs/interop/qemu-qmp-ref.7 "$(DESTDIR)$(mandir)/man7" ifneq 
($(TOOLS),) $(INSTALL_DATA) qemu-img.1 "$(DESTDIR)$(mandir)/man1" $(INSTALL_DIR) "$(DESTDIR)$(mandir)/man8" @@ -576,9 +580,9 @@ ifneq ($(TOOLS),) endif ifneq (,$(findstring qemu-ga,$(TOOLS))) $(INSTALL_DATA) qemu-ga.8 "$(DESTDIR)$(mandir)/man8" - $(INSTALL_DATA) docs/qemu-ga-ref.html "$(DESTDIR)$(qemu_docdir)" - $(INSTALL_DATA) docs/qemu-ga-ref.txt "$(DESTDIR)$(qemu_docdir)" - $(INSTALL_DATA) docs/qemu-ga-ref.7 "$(DESTDIR)$(mandir)/man7" + $(INSTALL_DATA) docs/interop/qemu-ga-ref.html "$(DESTDIR)$(qemu_docdir)" + $(INSTALL_DATA) docs/interop/qemu-ga-ref.txt "$(DESTDIR)$(qemu_docdir)" + $(INSTALL_DATA) docs/interop/qemu-ga-ref.7 "$(DESTDIR)$(mandir)/man7" endif endif ifdef CONFIG_VIRTFS @@ -666,28 +670,27 @@ ui/console-gl.o: $(SRC_PATH)/ui/console-gl.c \ # documentation MAKEINFO=makeinfo -MAKEINFOFLAGS=--no-split --number-sections -I docs -TEXIFLAG=$(if $(V),,--quiet) +MAKEINFOINCLUDES= -I docs -I $(<D) -I $(@D) +MAKEINFOFLAGS=--no-split --number-sections $(MAKEINFOINCLUDES) +TEXI2PODFLAGS=$(MAKEINFOINCLUDES) "-DVERSION=$(VERSION)" +TEXI2PDFFLAGS=$(if $(V),,--quiet) -I $(SRC_PATH) $(MAKEINFOINCLUDES) docs/version.texi: $(SRC_PATH)/VERSION $(call quiet-command,echo "@set VERSION $(VERSION)" > $@,"GEN","$@") -%.html: %.texi +%.html: %.texi docs/version.texi $(call quiet-command,LC_ALL=C $(MAKEINFO) $(MAKEINFOFLAGS) --no-headers \ --html $< -o $@,"GEN","$@") -%.info: %.texi +%.info: %.texi docs/version.texi $(call quiet-command,$(MAKEINFO) $(MAKEINFOFLAGS) $< -o $@,"GEN","$@") -%.txt: %.texi +%.txt: %.texi docs/version.texi $(call quiet-command,LC_ALL=C $(MAKEINFO) $(MAKEINFOFLAGS) --no-headers \ --plaintext $< -o $@,"GEN","$@") -%.pdf: %.texi - $(call quiet-command,texi2pdf $(TEXIFLAG) -I $(SRC_PATH) -I docs $< -o $@,"GEN","$@") - -docs/qemu-ga-ref.html docs/qemu-ga-ref.info docs/qemu-ga-ref.txt docs/qemu-ga-ref.pdf docs/qemu-ga-ref.7.pod: docs/version.texi -docs/qemu-qmp-ref.html docs/qemu-qmp-ref.info docs/qemu-qmp-ref.txt docs/qemu-qmp-ref.pdf docs/qemu-qmp-ref.pod: docs/version.texi +%.pdf: %.texi docs/version.texi + $(call quiet-command,texi2pdf $(TEXI2PDFFLAGS) $< -o $@,"GEN","$@") qemu-options.texi: $(SRC_PATH)/qemu-options.hx $(SRC_PATH)/scripts/hxtool $(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -t < $< > $@,"GEN","$@") @@ -701,12 +704,12 @@ qemu-monitor-info.texi: $(SRC_PATH)/hmp-commands-info.hx $(SRC_PATH)/scripts/hxt qemu-img-cmds.texi: $(SRC_PATH)/qemu-img-cmds.hx $(SRC_PATH)/scripts/hxtool $(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -t < $< > $@,"GEN","$@") -docs/qemu-qmp-qapi.texi docs/qemu-ga-qapi.texi: $(SRC_PATH)/scripts/qapi2texi.py $(qapi-py) +docs/interop/qemu-qmp-qapi.texi docs/interop/qemu-ga-qapi.texi: $(SRC_PATH)/scripts/qapi2texi.py $(qapi-py) -docs/qemu-qmp-qapi.texi: $(qapi-modules) +docs/interop/qemu-qmp-qapi.texi: $(qapi-modules) $(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi2texi.py $< > $@,"GEN","$@") -docs/qemu-ga-qapi.texi: $(SRC_PATH)/qga/qapi-schema.json +docs/interop/qemu-ga-qapi.texi: $(SRC_PATH)/qga/qapi-schema.json $(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi2texi.py $< > $@,"GEN","$@") qemu.1: qemu-doc.texi qemu-options.texi qemu-monitor.texi qemu-monitor-info.texi @@ -716,21 +719,25 @@ fsdev/virtfs-proxy-helper.1: fsdev/virtfs-proxy-helper.texi qemu-nbd.8: qemu-nbd.texi qemu-option-trace.texi qemu-ga.8: qemu-ga.texi -html: qemu-doc.html docs/qemu-qmp-ref.html docs/qemu-ga-ref.html -info: qemu-doc.info docs/qemu-qmp-ref.info docs/qemu-ga-ref.info -pdf: qemu-doc.pdf docs/qemu-qmp-ref.pdf docs/qemu-ga-ref.pdf -txt: 
qemu-doc.txt docs/qemu-qmp-ref.txt docs/qemu-ga-ref.txt +html: qemu-doc.html docs/interop/qemu-qmp-ref.html docs/interop/qemu-ga-ref.html +info: qemu-doc.info docs/interop/qemu-qmp-ref.info docs/interop/qemu-ga-ref.info +pdf: qemu-doc.pdf docs/interop/qemu-qmp-ref.pdf docs/interop/qemu-ga-ref.pdf +txt: qemu-doc.txt docs/interop/qemu-qmp-ref.txt docs/interop/qemu-ga-ref.txt qemu-doc.html qemu-doc.info qemu-doc.pdf qemu-doc.txt: \ qemu-img.texi qemu-nbd.texi qemu-options.texi qemu-option-trace.texi \ qemu-monitor.texi qemu-img-cmds.texi qemu-ga.texi \ qemu-monitor-info.texi -docs/qemu-ga-ref.dvi docs/qemu-ga-ref.html docs/qemu-ga-ref.info docs/qemu-ga-ref.pdf docs/qemu-ga-ref.txt docs/qemu-ga-ref.7: \ -docs/qemu-ga-ref.texi docs/qemu-ga-qapi.texi +docs/interop/qemu-ga-ref.dvi docs/interop/qemu-ga-ref.html \ + docs/interop/qemu-ga-ref.info docs/interop/qemu-ga-ref.pdf \ + docs/interop/qemu-ga-ref.txt docs/interop/qemu-ga-ref.7: \ + docs/interop/qemu-ga-ref.texi docs/interop/qemu-ga-qapi.texi -docs/qemu-qmp-ref.dvi docs/qemu-qmp-ref.html docs/qemu-qmp-ref.info docs/qemu-qmp-ref.pdf docs/qemu-qmp-ref.txt docs/qemu-qmp-ref.7: \ -docs/qemu-qmp-ref.texi docs/qemu-qmp-qapi.texi +docs/interop/qemu-qmp-ref.dvi docs/interop/qemu-qmp-ref.html \ + docs/interop/qemu-qmp-ref.info docs/interop/qemu-qmp-ref.pdf \ + docs/interop/qemu-qmp-ref.txt docs/interop/qemu-qmp-ref.7: \ + docs/interop/qemu-qmp-ref.texi docs/interop/qemu-qmp-qapi.texi ifdef CONFIG_WIN32 @@ -791,9 +798,11 @@ endif # CONFIG_WIN # Add a dependency on the generated files, so that they are always # rebuilt before other object files +ifneq ($(wildcard config-host.mak),) ifneq ($(filter-out $(UNCHECKED_GOALS),$(MAKECMDGOALS)),$(if $(MAKECMDGOALS),,fail)) Makefile: $(GENERATED_FILES) endif +endif .SECONDARY: $(TRACE_HEADERS) $(TRACE_HEADERS:%=%-timestamp) \ $(TRACE_SOURCES) $(TRACE_SOURCES:%=%-timestamp) \ diff --git a/Makefile.objs b/Makefile.objs index 0575802440..b2e6322ef0 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -52,7 +52,6 @@ common-obj-y += migration/ common-obj-y += audio/ common-obj-y += hw/ -common-obj-y += accel.o common-obj-y += replay/ @@ -111,6 +110,10 @@ qga-vss-dll-obj-y = qga/ ivshmem-client-obj-y = contrib/ivshmem-client/ ivshmem-server-obj-y = contrib/ivshmem-server/ libvhost-user-obj-y = contrib/libvhost-user/ +vhost-user-scsi.o-cflags := $(LIBISCSI_CFLAGS) +vhost-user-scsi.o-libs := $(LIBISCSI_LIBS) +vhost-user-scsi-obj-y = contrib/vhost-user-scsi/ +vhost-user-scsi-obj-y += contrib/libvhost-user/libvhost-user.o ###################################################################### trace-events-subdirs = @@ -163,6 +166,8 @@ trace-events-subdirs += target/ppc trace-events-subdirs += qom trace-events-subdirs += linux-user trace-events-subdirs += qapi +trace-events-subdirs += accel/tcg +trace-events-subdirs += accel/kvm trace-events-files = $(SRC_PATH)/trace-events $(trace-events-subdirs:%=$(SRC_PATH)/%/trace-events) diff --git a/Makefile.target b/Makefile.target index ce8dfe44a8..0066579090 100644 --- a/Makefile.target +++ b/Makefile.target @@ -88,20 +88,17 @@ all: $(PROGS) stap ######################################################### # cpu emulator library -obj-y = exec.o translate-all.o cpu-exec.o -obj-y += translate-common.o -obj-y += cpu-exec-common.o +obj-y += exec.o +obj-y += accel/ obj-y += tcg/tcg.o tcg/tcg-op.o tcg/optimize.o -obj-$(CONFIG_TCG_INTERPRETER) += tci.o -obj-y += tcg/tcg-common.o +obj-y += tcg/tcg-common.o tcg/tcg-runtime.o +obj-$(CONFIG_TCG_INTERPRETER) += tcg/tci.o obj-$(CONFIG_TCG_INTERPRETER) 
+= disas/tci.o obj-y += fpu/softfloat.o obj-y += target/$(TARGET_BASE_ARCH)/ obj-y += disas.o -obj-y += tcg-runtime.o obj-$(call notempty,$(TARGET_XML_FILES)) += gdbstub-xml.o obj-$(call lnot,$(CONFIG_HAX)) += hax-stub.o -obj-$(call lnot,$(CONFIG_KVM)) += kvm-stub.o obj-$(CONFIG_LIBDECNUMBER) += libdecnumber/decContext.o obj-$(CONFIG_LIBDECNUMBER) += libdecnumber/decNumber.o @@ -142,8 +139,7 @@ ifdef CONFIG_SOFTMMU obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o numa.o obj-y += qtest.o bootdevice.o obj-y += hw/ -obj-$(CONFIG_KVM) += kvm-all.o -obj-y += memory.o cputlb.o +obj-y += memory.o obj-y += memory_mapping.o obj-y += dump.o obj-y += migration/ram.o diff --git a/accel/Makefile.objs b/accel/Makefile.objs new file mode 100644 index 0000000000..cd5702f347 --- /dev/null +++ b/accel/Makefile.objs @@ -0,0 +1,4 @@ +obj-$(CONFIG_SOFTMMU) += accel.o +obj-y += kvm/ +obj-y += tcg/ +obj-y += stubs/ diff --git a/accel.c b/accel/accel.c index 664bb88422..7c079a5611 100644 --- a/accel.c +++ b/accel/accel.c @@ -34,15 +34,6 @@ #include "hw/xen/xen.h" #include "qom/object.h" -int tcg_tb_size; -static bool tcg_allowed = true; - -static int tcg_init(MachineState *ms) -{ - tcg_exec_init(tcg_tb_size * 1024 * 1024); - return 0; -} - static const TypeInfo accel_type = { .name = TYPE_ACCEL, .parent = TYPE_OBJECT, @@ -129,27 +120,9 @@ void configure_accelerator(MachineState *ms) } } - -static void tcg_accel_class_init(ObjectClass *oc, void *data) -{ - AccelClass *ac = ACCEL_CLASS(oc); - ac->name = "tcg"; - ac->init_machine = tcg_init; - ac->allowed = &tcg_allowed; -} - -#define TYPE_TCG_ACCEL ACCEL_CLASS_NAME("tcg") - -static const TypeInfo tcg_accel_type = { - .name = TYPE_TCG_ACCEL, - .parent = TYPE_ACCEL, - .class_init = tcg_accel_class_init, -}; - static void register_accel_types(void) { type_register_static(&accel_type); - type_register_static(&tcg_accel_type); } type_init(register_accel_types); diff --git a/accel/kvm/Makefile.objs b/accel/kvm/Makefile.objs new file mode 100644 index 0000000000..85351e7de7 --- /dev/null +++ b/accel/kvm/Makefile.objs @@ -0,0 +1 @@ +obj-$(CONFIG_KVM) += kvm-all.o diff --git a/kvm-all.c b/accel/kvm/kvm-all.c index ab8262f672..75feffa504 100644 --- a/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -36,7 +36,7 @@ #include "exec/ram_addr.h" #include "exec/address-spaces.h" #include "qemu/event_notifier.h" -#include "trace-root.h" +#include "trace.h" #include "hw/irq.h" #include "hw/boards.h" @@ -1977,6 +1977,7 @@ int kvm_cpu_exec(CPUState *cpu) } qemu_mutex_unlock_iothread(); + cpu_exec_start(cpu); do { MemTxAttrs attrs; @@ -2106,6 +2107,7 @@ int kvm_cpu_exec(CPUState *cpu) } } while (ret == 0); + cpu_exec_end(cpu); qemu_mutex_lock_iothread(); if (ret < 0) { diff --git a/accel/kvm/trace-events b/accel/kvm/trace-events new file mode 100644 index 0000000000..f89ba5578d --- /dev/null +++ b/accel/kvm/trace-events @@ -0,0 +1,15 @@ +# Trace events for debugging and performance instrumentation + +# kvm-all.c +kvm_ioctl(int type, void *arg) "type 0x%x, arg %p" +kvm_vm_ioctl(int type, void *arg) "type 0x%x, arg %p" +kvm_vcpu_ioctl(int cpu_index, int type, void *arg) "cpu_index %d, type 0x%x, arg %p" +kvm_run_exit(int cpu_index, uint32_t reason) "cpu_index %d, reason %d" +kvm_device_ioctl(int fd, int type, void *arg) "dev fd %d, type 0x%x, arg %p" +kvm_failed_reg_get(uint64_t id, const char *msg) "Warning: Unable to retrieve ONEREG %" PRIu64 " from KVM: %s" +kvm_failed_reg_set(uint64_t id, const char *msg) "Warning: Unable to set ONEREG %" PRIu64 " to KVM: %s" 
+kvm_irqchip_commit_routes(void) "" +kvm_irqchip_add_msi_route(char *name, int vector, int virq) "dev %s vector %d virq %d" +kvm_irqchip_update_msi_route(int virq) "Updating MSI route virq=%d" +kvm_irqchip_release_virq(int virq) "virq %d" + diff --git a/accel/stubs/Makefile.objs b/accel/stubs/Makefile.objs new file mode 100644 index 0000000000..bd5794f222 --- /dev/null +++ b/accel/stubs/Makefile.objs @@ -0,0 +1 @@ +obj-$(call lnot,$(CONFIG_KVM)) += kvm-stub.o diff --git a/kvm-stub.c b/accel/stubs/kvm-stub.c index ef0c7346af..ef0c7346af 100644 --- a/kvm-stub.c +++ b/accel/stubs/kvm-stub.c diff --git a/accel/tcg/Makefile.objs b/accel/tcg/Makefile.objs new file mode 100644 index 0000000000..f173cd5397 --- /dev/null +++ b/accel/tcg/Makefile.objs @@ -0,0 +1,3 @@ +obj-$(CONFIG_SOFTMMU) += tcg-all.o +obj-$(CONFIG_SOFTMMU) += cputlb.o +obj-y += cpu-exec.o cpu-exec-common.o translate-all.o translate-common.o diff --git a/cpu-exec-common.c b/accel/tcg/cpu-exec-common.c index e81da276bb..e81da276bb 100644 --- a/cpu-exec-common.c +++ b/accel/tcg/cpu-exec-common.c diff --git a/cpu-exec.c b/accel/tcg/cpu-exec.c index 5b181c18ed..3581618bc0 100644 --- a/cpu-exec.c +++ b/accel/tcg/cpu-exec.c @@ -18,7 +18,7 @@ */ #include "qemu/osdep.h" #include "cpu.h" -#include "trace-root.h" +#include "trace.h" #include "disas/disas.h" #include "exec/exec-all.h" #include "tcg.h" diff --git a/cputlb.c b/accel/tcg/cputlb.c index 743776ae19..743776ae19 100644 --- a/cputlb.c +++ b/accel/tcg/cputlb.c diff --git a/accel/tcg/tcg-all.c b/accel/tcg/tcg-all.c new file mode 100644 index 0000000000..dba99315e3 --- /dev/null +++ b/accel/tcg/tcg-all.c @@ -0,0 +1,61 @@ +/* + * QEMU System Emulator, accelerator interfaces + * + * Copyright (c) 2003-2008 Fabrice Bellard + * Copyright (c) 2014 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "qemu/osdep.h" +#include "sysemu/accel.h" +#include "sysemu/sysemu.h" +#include "qom/object.h" + +int tcg_tb_size; +static bool tcg_allowed = true; + +static int tcg_init(MachineState *ms) +{ + tcg_exec_init(tcg_tb_size * 1024 * 1024); + return 0; +} + +static void tcg_accel_class_init(ObjectClass *oc, void *data) +{ + AccelClass *ac = ACCEL_CLASS(oc); + ac->name = "tcg"; + ac->init_machine = tcg_init; + ac->allowed = &tcg_allowed; +} + +#define TYPE_TCG_ACCEL ACCEL_CLASS_NAME("tcg") + +static const TypeInfo tcg_accel_type = { + .name = TYPE_TCG_ACCEL, + .parent = TYPE_ACCEL, + .class_init = tcg_accel_class_init, +}; + +static void register_accel_types(void) +{ + type_register_static(&tcg_accel_type); +} + +type_init(register_accel_types); diff --git a/accel/tcg/trace-events b/accel/tcg/trace-events new file mode 100644 index 0000000000..2de8359670 --- /dev/null +++ b/accel/tcg/trace-events @@ -0,0 +1,10 @@ +# Trace events for debugging and performance instrumentation + +# TCG related tracing (mostly disabled by default) +# cpu-exec.c +disable exec_tb(void *tb, uintptr_t pc) "tb:%p pc=0x%"PRIxPTR +disable exec_tb_nocache(void *tb, uintptr_t pc) "tb:%p pc=0x%"PRIxPTR +disable exec_tb_exit(void *last_tb, unsigned int flags) "tb:%p flags=%x" + +# translate-all.c +translate_block(void *tb, uintptr_t pc, uint8_t *tb_code) "tb:%p, pc:0x%"PRIxPTR", tb_code:%p" diff --git a/translate-all.c b/accel/tcg/translate-all.c index b3ee876526..f6ad46b613 100644 --- a/translate-all.c +++ b/accel/tcg/translate-all.c @@ -25,7 +25,7 @@ #include "qemu-common.h" #define NO_CPU_IO_DEFS #include "cpu.h" -#include "trace-root.h" +#include "trace.h" #include "disas/disas.h" #include "exec/exec-all.h" #include "tcg.h" @@ -523,8 +523,6 @@ static inline PageDesc *page_find(tb_page_addr_t index) # define MAX_CODE_GEN_BUFFER_SIZE (32u * 1024 * 1024) #elif defined(__aarch64__) # define MAX_CODE_GEN_BUFFER_SIZE (128ul * 1024 * 1024) -#elif defined(__arm__) -# define MAX_CODE_GEN_BUFFER_SIZE (16u * 1024 * 1024) #elif defined(__s390x__) /* We have a +- 4GB range on the branches; leave some slop. */ # define MAX_CODE_GEN_BUFFER_SIZE (3ul * 1024 * 1024 * 1024) @@ -781,12 +779,13 @@ static inline void code_gen_alloc(size_t tb_size) exit(1); } - /* Estimate a good size for the number of TBs we can support. We - still haven't deducted the prologue from the buffer size here, - but that's minimal and won't affect the estimate much. 
*/ - tcg_ctx.code_gen_max_blocks - = tcg_ctx.code_gen_buffer_size / CODE_GEN_AVG_BLOCK_SIZE; - tcg_ctx.tb_ctx.tbs = g_new(TranslationBlock, tcg_ctx.code_gen_max_blocks); + /* size this conservatively -- realloc later if needed */ + tcg_ctx.tb_ctx.tbs_size = + tcg_ctx.code_gen_buffer_size / CODE_GEN_AVG_BLOCK_SIZE / 8; + if (unlikely(!tcg_ctx.tb_ctx.tbs_size)) { + tcg_ctx.tb_ctx.tbs_size = 64 * 1024; + } + tcg_ctx.tb_ctx.tbs = g_new(TranslationBlock *, tcg_ctx.tb_ctx.tbs_size); qemu_mutex_init(&tcg_ctx.tb_ctx.tb_lock); } @@ -828,16 +827,20 @@ bool tcg_enabled(void) static TranslationBlock *tb_alloc(target_ulong pc) { TranslationBlock *tb; + TBContext *ctx; assert_tb_locked(); - if (tcg_ctx.tb_ctx.nb_tbs >= tcg_ctx.code_gen_max_blocks) { + tb = tcg_tb_alloc(&tcg_ctx); + if (unlikely(tb == NULL)) { return NULL; } - tb = &tcg_ctx.tb_ctx.tbs[tcg_ctx.tb_ctx.nb_tbs++]; - tb->pc = pc; - tb->cflags = 0; - tb->invalid = false; + ctx = &tcg_ctx.tb_ctx; + if (unlikely(ctx->nb_tbs == ctx->tbs_size)) { + ctx->tbs_size *= 2; + ctx->tbs = g_renew(TranslationBlock *, ctx->tbs, ctx->tbs_size); + } + ctx->tbs[ctx->nb_tbs++] = tb; return tb; } @@ -850,8 +853,10 @@ void tb_free(TranslationBlock *tb) Ignore the hard cases and just back up if this TB happens to be the last one generated. */ if (tcg_ctx.tb_ctx.nb_tbs > 0 && - tb == &tcg_ctx.tb_ctx.tbs[tcg_ctx.tb_ctx.nb_tbs - 1]) { - tcg_ctx.code_gen_ptr = tb->tc_ptr; + tb == tcg_ctx.tb_ctx.tbs[tcg_ctx.tb_ctx.nb_tbs - 1]) { + size_t struct_size = ROUND_UP(sizeof(*tb), qemu_icache_linesize); + + tcg_ctx.code_gen_ptr = tb->tc_ptr - struct_size; tcg_ctx.tb_ctx.nb_tbs--; } } @@ -1279,9 +1284,11 @@ TranslationBlock *tb_gen_code(CPUState *cpu, gen_code_buf = tcg_ctx.code_gen_ptr; tb->tc_ptr = gen_code_buf; + tb->pc = pc; tb->cs_base = cs_base; tb->flags = flags; tb->cflags = cflags; + tb->invalid = false; #ifdef CONFIG_PROFILER tcg_ctx.tb_count1++; /* includes aborted translations because of @@ -1666,7 +1673,7 @@ static TranslationBlock *tb_find_pc(uintptr_t tc_ptr) m_max = tcg_ctx.tb_ctx.nb_tbs - 1; while (m_min <= m_max) { m = (m_min + m_max) >> 1; - tb = &tcg_ctx.tb_ctx.tbs[m]; + tb = tcg_ctx.tb_ctx.tbs[m]; v = (uintptr_t)tb->tc_ptr; if (v == tc_ptr) { return tb; @@ -1676,7 +1683,7 @@ static TranslationBlock *tb_find_pc(uintptr_t tc_ptr) m_min = m + 1; } } - return &tcg_ctx.tb_ctx.tbs[m_max]; + return tcg_ctx.tb_ctx.tbs[m_max]; } #if !defined(CONFIG_USER_ONLY) @@ -1874,7 +1881,7 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf) direct_jmp_count = 0; direct_jmp2_count = 0; for (i = 0; i < tcg_ctx.tb_ctx.nb_tbs; i++) { - tb = &tcg_ctx.tb_ctx.tbs[i]; + tb = tcg_ctx.tb_ctx.tbs[i]; target_code_size += tb->size; if (tb->size > max_target_code_size) { max_target_code_size = tb->size; @@ -1894,8 +1901,7 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf) cpu_fprintf(f, "gen code size %td/%zd\n", tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer, tcg_ctx.code_gen_highwater - tcg_ctx.code_gen_buffer); - cpu_fprintf(f, "TB count %d/%d\n", - tcg_ctx.tb_ctx.nb_tbs, tcg_ctx.code_gen_max_blocks); + cpu_fprintf(f, "TB count %d\n", tcg_ctx.tb_ctx.nb_tbs); cpu_fprintf(f, "TB avg target size %d max=%d bytes\n", tcg_ctx.tb_ctx.nb_tbs ? 
target_code_size / tcg_ctx.tb_ctx.nb_tbs : 0, diff --git a/translate-all.h b/accel/tcg/translate-all.h index ba8e4d63c4..ba8e4d63c4 100644 --- a/translate-all.h +++ b/accel/tcg/translate-all.h diff --git a/translate-common.c b/accel/tcg/translate-common.c index 40fe5a19bb..40fe5a19bb 100644 --- a/translate-common.c +++ b/accel/tcg/translate-common.c @@ -320,6 +320,8 @@ BlockDriverState *bdrv_new(void) QLIST_INIT(&bs->op_blockers[i]); } notifier_with_return_list_init(&bs->before_write_notifiers); + qemu_co_mutex_init(&bs->reqs_lock); + qemu_mutex_init(&bs->dirty_bitmap_mutex); bs->refcnt = 1; bs->aio_context = qemu_get_aio_context(); @@ -1300,7 +1302,9 @@ static int bdrv_open_common(BlockDriverState *bs, BlockBackend *file, goto fail_opts; } - assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */ + /* bdrv_new() and bdrv_close() make it so */ + assert(atomic_read(&bs->copy_on_read) == 0); + if (bs->open_flags & BDRV_O_COPY_ON_READ) { if (!bs->read_only) { bdrv_enable_copy_on_read(bs); @@ -3063,7 +3067,7 @@ static void bdrv_close(BlockDriverState *bs) g_free(bs->opaque); bs->opaque = NULL; - bs->copy_on_read = 0; + atomic_set(&bs->copy_on_read, 0); bs->backing_file[0] = '\0'; bs->backing_format[0] = '\0'; bs->total_sectors = 0; @@ -3422,7 +3426,7 @@ int bdrv_truncate(BdrvChild *child, int64_t offset, Error **errp) ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS); bdrv_dirty_bitmap_truncate(bs); bdrv_parent_cb_resize(bs); - ++bs->write_gen; + atomic_inc(&bs->write_gen); } return ret; } diff --git a/block/accounting.c b/block/accounting.c index 3f457c4e73..87ef5bbfaa 100644 --- a/block/accounting.c +++ b/block/accounting.c @@ -32,23 +32,28 @@ static QEMUClockType clock_type = QEMU_CLOCK_REALTIME; static const int qtest_latency_ns = NANOSECONDS_PER_SECOND / 1000; -void block_acct_init(BlockAcctStats *stats, bool account_invalid, - bool account_failed) +void block_acct_init(BlockAcctStats *stats) { - stats->account_invalid = account_invalid; - stats->account_failed = account_failed; - + qemu_mutex_init(&stats->lock); if (qtest_enabled()) { clock_type = QEMU_CLOCK_VIRTUAL; } } +void block_acct_setup(BlockAcctStats *stats, bool account_invalid, + bool account_failed) +{ + stats->account_invalid = account_invalid; + stats->account_failed = account_failed; +} + void block_acct_cleanup(BlockAcctStats *stats) { BlockAcctTimedStats *s, *next; QSLIST_FOREACH_SAFE(s, &stats->intervals, entries, next) { g_free(s); } + qemu_mutex_destroy(&stats->lock); } void block_acct_add_interval(BlockAcctStats *stats, unsigned interval_length) @@ -58,12 +63,15 @@ void block_acct_add_interval(BlockAcctStats *stats, unsigned interval_length) s = g_new0(BlockAcctTimedStats, 1); s->interval_length = interval_length; + s->stats = stats; + qemu_mutex_lock(&stats->lock); QSLIST_INSERT_HEAD(&stats->intervals, s, entries); for (i = 0; i < BLOCK_MAX_IOTYPE; i++) { timed_average_init(&s->latency[i], clock_type, (uint64_t) interval_length * NANOSECONDS_PER_SECOND); } + qemu_mutex_unlock(&stats->lock); } BlockAcctTimedStats *block_acct_interval_next(BlockAcctStats *stats, @@ -86,7 +94,8 @@ void block_acct_start(BlockAcctStats *stats, BlockAcctCookie *cookie, cookie->type = type; } -void block_acct_done(BlockAcctStats *stats, BlockAcctCookie *cookie) +static void block_account_one_io(BlockAcctStats *stats, BlockAcctCookie *cookie, + bool failed) { BlockAcctTimedStats *s; int64_t time_ns = qemu_clock_get_ns(clock_type); @@ -98,31 +107,16 @@ void block_acct_done(BlockAcctStats *stats, BlockAcctCookie 
*cookie) assert(cookie->type < BLOCK_MAX_IOTYPE); - stats->nr_bytes[cookie->type] += cookie->bytes; - stats->nr_ops[cookie->type]++; - stats->total_time_ns[cookie->type] += latency_ns; - stats->last_access_time_ns = time_ns; + qemu_mutex_lock(&stats->lock); - QSLIST_FOREACH(s, &stats->intervals, entries) { - timed_average_account(&s->latency[cookie->type], latency_ns); + if (failed) { + stats->failed_ops[cookie->type]++; + } else { + stats->nr_bytes[cookie->type] += cookie->bytes; + stats->nr_ops[cookie->type]++; } -} - -void block_acct_failed(BlockAcctStats *stats, BlockAcctCookie *cookie) -{ - assert(cookie->type < BLOCK_MAX_IOTYPE); - - stats->failed_ops[cookie->type]++; - - if (stats->account_failed) { - BlockAcctTimedStats *s; - int64_t time_ns = qemu_clock_get_ns(clock_type); - int64_t latency_ns = time_ns - cookie->start_time_ns; - - if (qtest_enabled()) { - latency_ns = qtest_latency_ns; - } + if (!failed || stats->account_failed) { stats->total_time_ns[cookie->type] += latency_ns; stats->last_access_time_ns = time_ns; @@ -130,29 +124,45 @@ void block_acct_failed(BlockAcctStats *stats, BlockAcctCookie *cookie) timed_average_account(&s->latency[cookie->type], latency_ns); } } + + qemu_mutex_unlock(&stats->lock); +} + +void block_acct_done(BlockAcctStats *stats, BlockAcctCookie *cookie) +{ + block_account_one_io(stats, cookie, false); +} + +void block_acct_failed(BlockAcctStats *stats, BlockAcctCookie *cookie) +{ + block_account_one_io(stats, cookie, true); } void block_acct_invalid(BlockAcctStats *stats, enum BlockAcctType type) { assert(type < BLOCK_MAX_IOTYPE); - /* block_acct_done() and block_acct_failed() update - * total_time_ns[], but this one does not. The reason is that - * invalid requests are accounted during their submission, - * therefore there's no actual I/O involved. */ - + /* block_account_one_io() updates total_time_ns[], but this one does + * not. The reason is that invalid requests are accounted during their + * submission, therefore there's no actual I/O involved. 
+ */ + qemu_mutex_lock(&stats->lock); stats->invalid_ops[type]++; if (stats->account_invalid) { stats->last_access_time_ns = qemu_clock_get_ns(clock_type); } + qemu_mutex_unlock(&stats->lock); } void block_acct_merge_done(BlockAcctStats *stats, enum BlockAcctType type, int num_requests) { assert(type < BLOCK_MAX_IOTYPE); + + qemu_mutex_lock(&stats->lock); stats->merged[type] += num_requests; + qemu_mutex_unlock(&stats->lock); } int64_t block_acct_idle_time_ns(BlockAcctStats *stats) @@ -167,7 +177,9 @@ double block_acct_queue_depth(BlockAcctTimedStats *stats, assert(type < BLOCK_MAX_IOTYPE); + qemu_mutex_lock(&stats->stats->lock); sum = timed_average_sum(&stats->latency[type], &elapsed); + qemu_mutex_unlock(&stats->stats->lock); return (double) sum / elapsed; } diff --git a/block/block-backend.c b/block/block-backend.c index 7d7f3697d1..a2bbae90b1 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -216,8 +216,10 @@ BlockBackend *blk_new(uint64_t perm, uint64_t shared_perm) blk->shared_perm = shared_perm; blk_set_enable_write_cache(blk, true); + qemu_co_mutex_init(&blk->public.throttled_reqs_lock); qemu_co_queue_init(&blk->public.throttled_reqs[0]); qemu_co_queue_init(&blk->public.throttled_reqs[1]); + block_acct_init(&blk->stats); notifier_list_init(&blk->remove_bs_notifiers); notifier_list_init(&blk->insert_bs_notifiers); @@ -1953,7 +1955,7 @@ static void blk_root_drained_begin(BdrvChild *child) /* Note that blk->root may not be accessible here yet if we are just * attaching to a BlockDriverState that is drained. Use child instead. */ - if (blk->public.io_limits_disabled++ == 0) { + if (atomic_fetch_inc(&blk->public.io_limits_disabled) == 0) { throttle_group_restart_blk(blk); } } @@ -1964,7 +1966,7 @@ static void blk_root_drained_end(BdrvChild *child) assert(blk->quiesce_counter); assert(blk->public.io_limits_disabled); - --blk->public.io_limits_disabled; + atomic_dec(&blk->public.io_limits_disabled); if (--blk->quiesce_counter == 0) { if (blk->dev_ops && blk->dev_ops->drained_end) { diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c index 519737c8d3..a04c6e4154 100644 --- a/block/dirty-bitmap.c +++ b/block/dirty-bitmap.c @@ -37,6 +37,7 @@ * or enabled. A frozen bitmap can only abdicate() or reclaim(). */ struct BdrvDirtyBitmap { + QemuMutex *mutex; HBitmap *bitmap; /* Dirty sector bitmap implementation */ HBitmap *meta; /* Meta dirty bitmap */ BdrvDirtyBitmap *successor; /* Anonymous child; implies frozen status */ @@ -52,6 +53,27 @@ struct BdrvDirtyBitmapIter { BdrvDirtyBitmap *bitmap; }; +static inline void bdrv_dirty_bitmaps_lock(BlockDriverState *bs) +{ + qemu_mutex_lock(&bs->dirty_bitmap_mutex); +} + +static inline void bdrv_dirty_bitmaps_unlock(BlockDriverState *bs) +{ + qemu_mutex_unlock(&bs->dirty_bitmap_mutex); +} + +void bdrv_dirty_bitmap_lock(BdrvDirtyBitmap *bitmap) +{ + qemu_mutex_lock(bitmap->mutex); +} + +void bdrv_dirty_bitmap_unlock(BdrvDirtyBitmap *bitmap) +{ + qemu_mutex_unlock(bitmap->mutex); +} + +/* Called with BQL or dirty_bitmap lock taken. */ BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name) { BdrvDirtyBitmap *bm; @@ -65,6 +87,7 @@ BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name) return NULL; } +/* Called with BQL taken. */ void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap) { assert(!bdrv_dirty_bitmap_frozen(bitmap)); @@ -72,6 +95,7 @@ void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap) bitmap->name = NULL; } +/* Called with BQL taken. 
*/ BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, uint32_t granularity, const char *name, @@ -96,11 +120,14 @@ BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, return NULL; } bitmap = g_new0(BdrvDirtyBitmap, 1); + bitmap->mutex = &bs->dirty_bitmap_mutex; bitmap->bitmap = hbitmap_alloc(bitmap_size, ctz32(sector_granularity)); bitmap->size = bitmap_size; bitmap->name = g_strdup(name); bitmap->disabled = false; + bdrv_dirty_bitmaps_lock(bs); QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list); + bdrv_dirty_bitmaps_unlock(bs); return bitmap; } @@ -119,20 +146,24 @@ void bdrv_create_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap, int chunk_size) { assert(!bitmap->meta); + qemu_mutex_lock(bitmap->mutex); bitmap->meta = hbitmap_create_meta(bitmap->bitmap, chunk_size * BITS_PER_BYTE); + qemu_mutex_unlock(bitmap->mutex); } void bdrv_release_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap) { assert(bitmap->meta); + qemu_mutex_lock(bitmap->mutex); hbitmap_free_meta(bitmap->bitmap); bitmap->meta = NULL; + qemu_mutex_unlock(bitmap->mutex); } -int bdrv_dirty_bitmap_get_meta(BlockDriverState *bs, - BdrvDirtyBitmap *bitmap, int64_t sector, - int nb_sectors) +int bdrv_dirty_bitmap_get_meta_locked(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, int64_t sector, + int nb_sectors) { uint64_t i; int sectors_per_bit = 1 << hbitmap_granularity(bitmap->meta); @@ -147,11 +178,26 @@ int bdrv_dirty_bitmap_get_meta(BlockDriverState *bs, return false; } +int bdrv_dirty_bitmap_get_meta(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, int64_t sector, + int nb_sectors) +{ + bool dirty; + + qemu_mutex_lock(bitmap->mutex); + dirty = bdrv_dirty_bitmap_get_meta_locked(bs, bitmap, sector, nb_sectors); + qemu_mutex_unlock(bitmap->mutex); + + return dirty; +} + void bdrv_dirty_bitmap_reset_meta(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector, int nb_sectors) { + qemu_mutex_lock(bitmap->mutex); hbitmap_reset(bitmap->meta, sector, nb_sectors); + qemu_mutex_unlock(bitmap->mutex); } int64_t bdrv_dirty_bitmap_size(const BdrvDirtyBitmap *bitmap) @@ -164,16 +210,19 @@ const char *bdrv_dirty_bitmap_name(const BdrvDirtyBitmap *bitmap) return bitmap->name; } +/* Called with BQL taken. */ bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap) { return bitmap->successor; } +/* Called with BQL taken. */ bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap) { return !(bitmap->disabled || bitmap->successor); } +/* Called with BQL taken. */ DirtyBitmapStatus bdrv_dirty_bitmap_status(BdrvDirtyBitmap *bitmap) { if (bdrv_dirty_bitmap_frozen(bitmap)) { @@ -188,6 +237,7 @@ DirtyBitmapStatus bdrv_dirty_bitmap_status(BdrvDirtyBitmap *bitmap) /** * Create a successor bitmap destined to replace this bitmap after an operation. * Requires that the bitmap is not frozen and has no successor. + * Called with BQL taken. */ int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, Error **errp) @@ -220,6 +270,7 @@ int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs, /** * For a bitmap with a successor, yield our name to the successor, * delete the old bitmap, and return a handle to the new bitmap. + * Called with BQL taken. */ BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, @@ -247,6 +298,7 @@ BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs, * In cases of failure where we can no longer safely delete the parent, * we may wish to re-join the parent and child/successor. 
* The merged parent will be un-frozen, but not explicitly re-enabled. + * Called with BQL taken. */ BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *parent, @@ -271,25 +323,30 @@ BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs, /** * Truncates _all_ bitmaps attached to a BDS. + * Called with BQL taken. */ void bdrv_dirty_bitmap_truncate(BlockDriverState *bs) { BdrvDirtyBitmap *bitmap; uint64_t size = bdrv_nb_sectors(bs); + bdrv_dirty_bitmaps_lock(bs); QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { assert(!bdrv_dirty_bitmap_frozen(bitmap)); assert(!bitmap->active_iterators); hbitmap_truncate(bitmap->bitmap, size); bitmap->size = size; } + bdrv_dirty_bitmaps_unlock(bs); } +/* Called with BQL taken. */ static void bdrv_do_release_matching_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, bool only_named) { BdrvDirtyBitmap *bm, *next; + bdrv_dirty_bitmaps_lock(bs); QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) { if ((!bitmap || bm == bitmap) && (!only_named || bm->name)) { assert(!bm->active_iterators); @@ -301,15 +358,19 @@ static void bdrv_do_release_matching_dirty_bitmap(BlockDriverState *bs, g_free(bm); if (bitmap) { - return; + goto out; } } } if (bitmap) { abort(); } + +out: + bdrv_dirty_bitmaps_unlock(bs); } +/* Called with BQL taken. */ void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) { bdrv_do_release_matching_dirty_bitmap(bs, bitmap, false); @@ -318,18 +379,21 @@ void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) /** * Release all named dirty bitmaps attached to a BDS (for use in bdrv_close()). * There must not be any frozen bitmaps attached. + * Called with BQL taken. */ void bdrv_release_named_dirty_bitmaps(BlockDriverState *bs) { bdrv_do_release_matching_dirty_bitmap(bs, NULL, true); } +/* Called with BQL taken. */ void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap) { assert(!bdrv_dirty_bitmap_frozen(bitmap)); bitmap->disabled = true; } +/* Called with BQL taken. 
*/ void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap) { assert(!bdrv_dirty_bitmap_frozen(bitmap)); @@ -342,6 +406,7 @@ BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs) BlockDirtyInfoList *list = NULL; BlockDirtyInfoList **plist = &list; + bdrv_dirty_bitmaps_lock(bs); QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) { BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1); BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1); @@ -354,12 +419,14 @@ BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs) *plist = entry; plist = &entry->next; } + bdrv_dirty_bitmaps_unlock(bs); return list; } -int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, - int64_t sector) +/* Called within bdrv_dirty_bitmap_lock..unlock */ +int bdrv_get_dirty_locked(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, + int64_t sector) { if (bitmap) { return hbitmap_get(bitmap->bitmap, sector); @@ -432,23 +499,42 @@ int64_t bdrv_dirty_iter_next(BdrvDirtyBitmapIter *iter) return hbitmap_iter_next(&iter->hbi); } +/* Called within bdrv_dirty_bitmap_lock..unlock */ +void bdrv_set_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap, + int64_t cur_sector, int64_t nr_sectors) +{ + assert(bdrv_dirty_bitmap_enabled(bitmap)); + hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); +} + void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap, int64_t cur_sector, int64_t nr_sectors) { + bdrv_dirty_bitmap_lock(bitmap); + bdrv_set_dirty_bitmap_locked(bitmap, cur_sector, nr_sectors); + bdrv_dirty_bitmap_unlock(bitmap); +} + +/* Called within bdrv_dirty_bitmap_lock..unlock */ +void bdrv_reset_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap, + int64_t cur_sector, int64_t nr_sectors) +{ assert(bdrv_dirty_bitmap_enabled(bitmap)); - hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); + hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors); } void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap, int64_t cur_sector, int64_t nr_sectors) { - assert(bdrv_dirty_bitmap_enabled(bitmap)); - hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors); + bdrv_dirty_bitmap_lock(bitmap); + bdrv_reset_dirty_bitmap_locked(bitmap, cur_sector, nr_sectors); + bdrv_dirty_bitmap_unlock(bitmap); } void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out) { assert(bdrv_dirty_bitmap_enabled(bitmap)); + bdrv_dirty_bitmap_lock(bitmap); if (!out) { hbitmap_reset_all(bitmap->bitmap); } else { @@ -457,6 +543,7 @@ void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out) hbitmap_granularity(backup)); *out = backup; } + bdrv_dirty_bitmap_unlock(bitmap); } void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in) @@ -508,12 +595,19 @@ void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, int64_t nr_sectors) { BdrvDirtyBitmap *bitmap; + + if (QLIST_EMPTY(&bs->dirty_bitmaps)) { + return; + } + + bdrv_dirty_bitmaps_lock(bs); QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { if (!bdrv_dirty_bitmap_enabled(bitmap)) { continue; } hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); } + bdrv_dirty_bitmaps_unlock(bs); } /** diff --git a/block/io.c b/block/io.c index ed31810c0a..91611ffb2a 100644 --- a/block/io.c +++ b/block/io.c @@ -130,13 +130,13 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) */ void bdrv_enable_copy_on_read(BlockDriverState *bs) { - bs->copy_on_read++; + atomic_inc(&bs->copy_on_read); } void bdrv_disable_copy_on_read(BlockDriverState *bs) { - assert(bs->copy_on_read > 0); - bs->copy_on_read--; + int old = atomic_fetch_dec(&bs->copy_on_read); + assert(old >= 1); } /* Check if any requests 
are in-flight (including throttled requests) */ @@ -241,7 +241,7 @@ void bdrv_drained_begin(BlockDriverState *bs) return; } - if (!bs->quiesce_counter++) { + if (atomic_fetch_inc(&bs->quiesce_counter) == 0) { aio_disable_external(bdrv_get_aio_context(bs)); bdrv_parent_drained_begin(bs); } @@ -252,7 +252,7 @@ void bdrv_drained_begin(BlockDriverState *bs) void bdrv_drained_end(BlockDriverState *bs) { assert(bs->quiesce_counter > 0); - if (--bs->quiesce_counter > 0) { + if (atomic_fetch_dec(&bs->quiesce_counter) > 1) { return; } @@ -375,11 +375,13 @@ void bdrv_drain_all(void) static void tracked_request_end(BdrvTrackedRequest *req) { if (req->serialising) { - req->bs->serialising_in_flight--; + atomic_dec(&req->bs->serialising_in_flight); } + qemu_co_mutex_lock(&req->bs->reqs_lock); QLIST_REMOVE(req, list); qemu_co_queue_restart_all(&req->wait_queue); + qemu_co_mutex_unlock(&req->bs->reqs_lock); } /** @@ -404,7 +406,9 @@ static void tracked_request_begin(BdrvTrackedRequest *req, qemu_co_queue_init(&req->wait_queue); + qemu_co_mutex_lock(&bs->reqs_lock); QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); + qemu_co_mutex_unlock(&bs->reqs_lock); } static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) @@ -414,7 +418,7 @@ static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) - overlap_offset; if (!req->serialising) { - req->bs->serialising_in_flight++; + atomic_inc(&req->bs->serialising_in_flight); req->serialising = true; } @@ -501,7 +505,8 @@ static void dummy_bh_cb(void *opaque) void bdrv_wakeup(BlockDriverState *bs) { - if (bs->wakeup) { + /* The barrier (or an atomic op) is in the caller. */ + if (atomic_read(&bs->wakeup)) { aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL); } } @@ -519,12 +524,13 @@ static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) bool retry; bool waited = false; - if (!bs->serialising_in_flight) { + if (!atomic_read(&bs->serialising_in_flight)) { return false; } do { retry = false; + qemu_co_mutex_lock(&bs->reqs_lock); QLIST_FOREACH(req, &bs->tracked_requests, list) { if (req == self || (!req->serialising && !self->serialising)) { continue; @@ -543,7 +549,7 @@ static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) * (instead of producing a deadlock in the former case). 
*/ if (!req->waiting_for) { self->waiting_for = req; - qemu_co_queue_wait(&req->wait_queue, NULL); + qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock); self->waiting_for = NULL; retry = true; waited = true; @@ -551,6 +557,7 @@ static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) } } } + qemu_co_mutex_unlock(&bs->reqs_lock); } while (retry); return waited; @@ -1144,7 +1151,7 @@ int coroutine_fn bdrv_co_preadv(BdrvChild *child, bdrv_inc_in_flight(bs); /* Don't do copy-on-read if we read data before write operation */ - if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) { + if (atomic_read(&bs->copy_on_read) && !(flags & BDRV_REQ_NO_SERIALISING)) { flags |= BDRV_REQ_COPY_ON_READ; } @@ -1401,12 +1408,10 @@ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, } bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); - ++bs->write_gen; + atomic_inc(&bs->write_gen); bdrv_set_dirty(bs, start_sector, end_sector - start_sector); - if (bs->wr_highest_offset < offset + bytes) { - bs->wr_highest_offset = offset + bytes; - } + stat64_max(&bs->wr_highest_offset, offset + bytes); if (ret >= 0) { bs->total_sectors = MAX(bs->total_sectors, end_sector); @@ -2292,14 +2297,17 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs) goto early_exit; } - current_gen = bs->write_gen; + qemu_co_mutex_lock(&bs->reqs_lock); + current_gen = atomic_read(&bs->write_gen); /* Wait until any previous flushes are completed */ while (bs->active_flush_req) { - qemu_co_queue_wait(&bs->flush_queue, NULL); + qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock); } + /* Flushes reach this point in nondecreasing current_gen order. */ bs->active_flush_req = true; + qemu_co_mutex_unlock(&bs->reqs_lock); /* Write back all layers by calling one driver function */ if (bs->drv->bdrv_co_flush) { @@ -2371,9 +2379,12 @@ out: if (ret == 0) { bs->flushed_gen = current_gen; } + + qemu_co_mutex_lock(&bs->reqs_lock); bs->active_flush_req = false; /* Return value is ignored - it's ok if wait queue is empty */ qemu_co_queue_next(&bs->flush_queue); + qemu_co_mutex_unlock(&bs->reqs_lock); early_exit: bdrv_dec_in_flight(bs); @@ -2517,7 +2528,7 @@ int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, } ret = 0; out: - ++bs->write_gen; + atomic_inc(&bs->write_gen); bdrv_set_dirty(bs, req.offset >> BDRV_SECTOR_BITS, req.bytes >> BDRV_SECTOR_BITS); tracked_request_end(&req); @@ -2644,7 +2655,7 @@ void bdrv_io_plug(BlockDriverState *bs) bdrv_io_plug(child->bs); } - if (bs->io_plugged++ == 0) { + if (atomic_fetch_inc(&bs->io_plugged) == 0) { BlockDriver *drv = bs->drv; if (drv && drv->bdrv_io_plug) { drv->bdrv_io_plug(bs); @@ -2657,7 +2668,7 @@ void bdrv_io_unplug(BlockDriverState *bs) BdrvChild *child; assert(bs->io_plugged); - if (--bs->io_plugged == 0) { + if (atomic_fetch_dec(&bs->io_plugged) == 1) { BlockDriver *drv = bs->drv; if (drv && drv->bdrv_io_unplug) { drv->bdrv_io_unplug(bs); diff --git a/block/iscsi.c b/block/iscsi.c index 5daa201181..b5f7a228b9 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -1732,6 +1732,10 @@ static QemuOptsList runtime_opts = { .name = "timeout", .type = QEMU_OPT_NUMBER, }, + { + .name = "filename", + .type = QEMU_OPT_STRING, + }, { /* end of list */ } }, }; @@ -1747,12 +1751,27 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, char *initiator_name = NULL; QemuOpts *opts; Error *local_err = NULL; - const char *transport_name, *portal, *target; + const char *transport_name, *portal, *target, *filename; #if LIBISCSI_API_VERSION >= (20160603) enum 
iscsi_transport_type transport; #endif int i, ret = 0, timeout = 0, lun; + /* If we are given a filename, parse the filename, with precedence given to + * filename encoded options */ + filename = qdict_get_try_str(options, "filename"); + if (filename) { + error_report("Warning: 'filename' option specified. " + "This is an unsupported option, and may be deprecated " + "in the future"); + iscsi_parse_filename(filename, options, &local_err); + if (local_err) { + ret = -EINVAL; + error_propagate(errp, local_err); + goto exit; + } + } + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); if (local_err) { @@ -1967,6 +1986,7 @@ out: } memset(iscsilun, 0, sizeof(IscsiLun)); } +exit: return ret; } diff --git a/block/mirror.c b/block/mirror.c index a2a970301c..19afcc6f1a 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -342,6 +342,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) int max_io_sectors = MAX((s->buf_size >> BDRV_SECTOR_BITS) / MAX_IN_FLIGHT, MAX_IO_SECTORS); + bdrv_dirty_bitmap_lock(s->dirty_bitmap); sector_num = bdrv_dirty_iter_next(s->dbi); if (sector_num < 0) { bdrv_set_dirty_iter(s->dbi, 0); @@ -349,6 +350,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap)); assert(sector_num >= 0); } + bdrv_dirty_bitmap_unlock(s->dirty_bitmap); first_chunk = sector_num / sectors_per_chunk; while (test_bit(first_chunk, s->in_flight_bitmap)) { @@ -360,12 +362,13 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) /* Find the number of consective dirty chunks following the first dirty * one, and wait for in flight requests in them. */ + bdrv_dirty_bitmap_lock(s->dirty_bitmap); while (nb_chunks * sectors_per_chunk < (s->buf_size >> BDRV_SECTOR_BITS)) { int64_t next_dirty; int64_t next_sector = sector_num + nb_chunks * sectors_per_chunk; int64_t next_chunk = next_sector / sectors_per_chunk; if (next_sector >= end || - !bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) { + !bdrv_get_dirty_locked(source, s->dirty_bitmap, next_sector)) { break; } if (test_bit(next_chunk, s->in_flight_bitmap)) { @@ -386,8 +389,10 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) * calling bdrv_get_block_status_above could yield - if some blocks are * marked dirty in this window, we need to know. */ - bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num, - nb_chunks * sectors_per_chunk); + bdrv_reset_dirty_bitmap_locked(s->dirty_bitmap, sector_num, + nb_chunks * sectors_per_chunk); + bdrv_dirty_bitmap_unlock(s->dirty_bitmap); + bitmap_set(s->in_flight_bitmap, sector_num / sectors_per_chunk, nb_chunks); while (nb_chunks > 0 && sector_num < end) { int64_t ret; @@ -506,6 +511,8 @@ static void mirror_exit(BlockJob *job, void *opaque) BlockDriverState *mirror_top_bs = s->mirror_top_bs; Error *local_err = NULL; + bdrv_release_dirty_bitmap(src, s->dirty_bitmap); + /* Make sure that the source BDS doesn't go away before we called * block_job_completed(). 
*/ bdrv_ref(src); @@ -904,7 +911,6 @@ immediate_exit: g_free(s->cow_bitmap); g_free(s->in_flight_bitmap); bdrv_dirty_iter_free(s->dbi); - bdrv_release_dirty_bitmap(bs, s->dirty_bitmap); data = g_malloc(sizeof(*data)); data->ret = ret; diff --git a/block/nbd-client.c b/block/nbd-client.c index 87d19c7253..d64e775385 100644 --- a/block/nbd-client.c +++ b/block/nbd-client.c @@ -144,8 +144,8 @@ static int nbd_co_send_request(BlockDriverState *bs, qio_channel_set_cork(s->ioc, true); rc = nbd_send_request(s->ioc, request); if (rc >= 0) { - ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov, request->len, - false, NULL); + ret = nbd_rwv(s->ioc, qiov->iov, qiov->niov, request->len, false, + NULL); if (ret != request->len) { rc = -EIO; } @@ -173,8 +173,8 @@ static void nbd_co_receive_reply(NBDClientSession *s, reply->error = EIO; } else { if (qiov && reply->error == 0) { - ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov, request->len, - true, NULL); + ret = nbd_rwv(s->ioc, qiov->iov, qiov->niov, request->len, true, + NULL); if (ret != request->len) { reply->error = EIO; } diff --git a/block/nfs.c b/block/nfs.c index decefd15f1..6b8b5b653d 100644 --- a/block/nfs.c +++ b/block/nfs.c @@ -729,7 +729,9 @@ nfs_get_allocated_file_size_cb(int ret, struct nfs_context *nfs, void *data, if (task->ret < 0) { error_report("NFS Error: %s", nfs_get_error(nfs)); } - task->complete = 1; + + /* Set task->complete before reading bs->wakeup. */ + atomic_mb_set(&task->complete, 1); bdrv_wakeup(task->bs); } diff --git a/block/qapi.c b/block/qapi.c index 2050df29e4..0a41d59bf3 100644 --- a/block/qapi.c +++ b/block/qapi.c @@ -441,7 +441,7 @@ static BlockStats *bdrv_query_bds_stats(const BlockDriverState *bs, s->node_name = g_strdup(bdrv_get_node_name(bs)); } - s->stats->wr_highest_offset = bs->wr_highest_offset; + s->stats->wr_highest_offset = stat64_get(&bs->wr_highest_offset); if (bs->file) { s->has_parent = true; diff --git a/block/rbd.c b/block/rbd.c index e551639e47..ff44e5f437 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -340,6 +340,10 @@ static QemuOptsList runtime_opts = { .type = QEMU_OPT_STRING, .help = "Legacy rados key/value option parameters", }, + { + .name = "filename", + .type = QEMU_OPT_STRING, + }, { /* end of list */ } }, }; @@ -541,12 +545,27 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, { BDRVRBDState *s = bs->opaque; const char *pool, *snap, *conf, *user, *image_name, *keypairs; - const char *secretid; + const char *secretid, *filename; QemuOpts *opts; Error *local_err = NULL; char *mon_host = NULL; int r; + /* If we are given a filename, parse the filename, with precedence given to + * filename encoded options */ + filename = qdict_get_try_str(options, "filename"); + if (filename) { + error_report("Warning: 'filename' option specified. " + "This is an unsupported option, and may be deprecated " + "in the future"); + qemu_rbd_parse_filename(filename, options, &local_err); + if (local_err) { + r = -EINVAL; + error_propagate(errp, local_err); + goto exit; + } + } + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); if (local_err) { @@ -665,6 +684,7 @@ failed_shutdown: failed_opts: qemu_opts_del(opts); g_free(mon_host); +exit: return r; } diff --git a/block/sheepdog.c b/block/sheepdog.c index dea9000bdd..c9236679c6 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -697,7 +697,8 @@ out: srco->co = NULL; srco->ret = ret; - srco->finished = true; + /* Set srco->finished before reading bs->wakeup. 
*/ + atomic_mb_set(&srco->finished, true); if (srco->bs) { bdrv_wakeup(srco->bs); } diff --git a/block/throttle-groups.c b/block/throttle-groups.c index b73e7a800b..a181cb1dee 100644 --- a/block/throttle-groups.c +++ b/block/throttle-groups.c @@ -240,7 +240,7 @@ static bool throttle_group_schedule_timer(BlockBackend *blk, bool is_write) ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); bool must_wait; - if (blkp->io_limits_disabled) { + if (atomic_read(&blkp->io_limits_disabled)) { return false; } @@ -260,6 +260,25 @@ static bool throttle_group_schedule_timer(BlockBackend *blk, bool is_write) return must_wait; } +/* Start the next pending I/O request for a BlockBackend. Return whether + * any request was actually pending. + * + * @blk: the current BlockBackend + * @is_write: the type of operation (read/write) + */ +static bool coroutine_fn throttle_group_co_restart_queue(BlockBackend *blk, + bool is_write) +{ + BlockBackendPublic *blkp = blk_get_public(blk); + bool ret; + + qemu_co_mutex_lock(&blkp->throttled_reqs_lock); + ret = qemu_co_queue_next(&blkp->throttled_reqs[is_write]); + qemu_co_mutex_unlock(&blkp->throttled_reqs_lock); + + return ret; +} + /* Look for the next pending I/O request and schedule it. * * This assumes that tg->lock is held. @@ -287,12 +306,12 @@ static void schedule_next_request(BlockBackend *blk, bool is_write) if (!must_wait) { /* Give preference to requests from the current blk */ if (qemu_in_coroutine() && - qemu_co_queue_next(&blkp->throttled_reqs[is_write])) { + throttle_group_co_restart_queue(blk, is_write)) { token = blk; } else { ThrottleTimers *tt = &blk_get_public(token)->throttle_timers; int64_t now = qemu_clock_get_ns(tt->clock_type); - timer_mod(tt->timers[is_write], now + 1); + timer_mod(tt->timers[is_write], now); tg->any_timer_armed[is_write] = true; } tg->tokens[is_write] = token; @@ -326,7 +345,10 @@ void coroutine_fn throttle_group_co_io_limits_intercept(BlockBackend *blk, if (must_wait || blkp->pending_reqs[is_write]) { blkp->pending_reqs[is_write]++; qemu_mutex_unlock(&tg->lock); - qemu_co_queue_wait(&blkp->throttled_reqs[is_write], NULL); + qemu_co_mutex_lock(&blkp->throttled_reqs_lock); + qemu_co_queue_wait(&blkp->throttled_reqs[is_write], + &blkp->throttled_reqs_lock); + qemu_co_mutex_unlock(&blkp->throttled_reqs_lock); qemu_mutex_lock(&tg->lock); blkp->pending_reqs[is_write]--; } @@ -340,15 +362,50 @@ void coroutine_fn throttle_group_co_io_limits_intercept(BlockBackend *blk, qemu_mutex_unlock(&tg->lock); } +typedef struct { + BlockBackend *blk; + bool is_write; +} RestartData; + +static void coroutine_fn throttle_group_restart_queue_entry(void *opaque) +{ + RestartData *data = opaque; + BlockBackend *blk = data->blk; + bool is_write = data->is_write; + BlockBackendPublic *blkp = blk_get_public(blk); + ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, ts); + bool empty_queue; + + empty_queue = !throttle_group_co_restart_queue(blk, is_write); + + /* If the request queue was empty then we have to take care of + * scheduling the next one */ + if (empty_queue) { + qemu_mutex_lock(&tg->lock); + schedule_next_request(blk, is_write); + qemu_mutex_unlock(&tg->lock); + } +} + +static void throttle_group_restart_queue(BlockBackend *blk, bool is_write) +{ + Coroutine *co; + RestartData rd = { + .blk = blk, + .is_write = is_write + }; + + co = qemu_coroutine_create(throttle_group_restart_queue_entry, &rd); + aio_co_enter(blk_get_aio_context(blk), co); +} + void throttle_group_restart_blk(BlockBackend *blk) { 
BlockBackendPublic *blkp = blk_get_public(blk); - int i; - for (i = 0; i < 2; i++) { - while (qemu_co_enter_next(&blkp->throttled_reqs[i])) { - ; - } + if (blkp->throttle_state) { + throttle_group_restart_queue(blk, 0); + throttle_group_restart_queue(blk, 1); } } @@ -376,8 +433,7 @@ void throttle_group_config(BlockBackend *blk, ThrottleConfig *cfg) throttle_config(ts, tt, cfg); qemu_mutex_unlock(&tg->lock); - qemu_co_enter_next(&blkp->throttled_reqs[0]); - qemu_co_enter_next(&blkp->throttled_reqs[1]); + throttle_group_restart_blk(blk); } /* Get the throttle configuration from a particular group. Similar to @@ -408,7 +464,6 @@ static void timer_cb(BlockBackend *blk, bool is_write) BlockBackendPublic *blkp = blk_get_public(blk); ThrottleState *ts = blkp->throttle_state; ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); - bool empty_queue; /* The timer has just been fired, so we can update the flag */ qemu_mutex_lock(&tg->lock); @@ -416,17 +471,7 @@ static void timer_cb(BlockBackend *blk, bool is_write) qemu_mutex_unlock(&tg->lock); /* Run the request that was waiting for this timer */ - aio_context_acquire(blk_get_aio_context(blk)); - empty_queue = !qemu_co_enter_next(&blkp->throttled_reqs[is_write]); - aio_context_release(blk_get_aio_context(blk)); - - /* If the request queue was empty then we have to take care of - * scheduling the next one */ - if (empty_queue) { - qemu_mutex_lock(&tg->lock); - schedule_next_request(blk, is_write); - qemu_mutex_unlock(&tg->lock); - } + throttle_group_restart_queue(blk, is_write); } static void read_timer_cb(void *opaque) diff --git a/blockdev-nbd.c b/blockdev-nbd.c index dd0860f4a6..28f551a7b0 100644 --- a/blockdev-nbd.c +++ b/blockdev-nbd.c @@ -27,6 +27,10 @@ typedef struct NBDServerData { static NBDServerData *nbd_server; +static void nbd_blockdev_client_closed(NBDClient *client, bool ignored) +{ + nbd_client_put(client); +} static gboolean nbd_accept(QIOChannel *ioc, GIOCondition condition, gpointer opaque) @@ -46,7 +50,7 @@ static gboolean nbd_accept(QIOChannel *ioc, GIOCondition condition, qio_channel_set_name(QIO_CHANNEL(cioc), "nbd-server"); nbd_client_new(NULL, cioc, nbd_server->tlscreds, NULL, - nbd_client_put); + nbd_blockdev_client_closed); object_unref(OBJECT(cioc)); return TRUE; } diff --git a/blockdev.c b/blockdev.c index 18acc5bf01..f92dcf24bf 100644 --- a/blockdev.c +++ b/blockdev.c @@ -596,7 +596,7 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts, autostart = 0; } - block_acct_init(blk_get_stats(blk), account_invalid, account_failed); + block_acct_setup(blk_get_stats(blk), account_invalid, account_failed); if (!parse_stats_intervals(blk_get_stats(blk), interval_list, errp)) { blk_unref(blk); @@ -1363,12 +1363,10 @@ out_aio_context: static BdrvDirtyBitmap *block_dirty_bitmap_lookup(const char *node, const char *name, BlockDriverState **pbs, - AioContext **paio, Error **errp) { BlockDriverState *bs; BdrvDirtyBitmap *bitmap; - AioContext *aio_context; if (!node) { error_setg(errp, "Node cannot be NULL"); @@ -1384,29 +1382,17 @@ static BdrvDirtyBitmap *block_dirty_bitmap_lookup(const char *node, return NULL; } - aio_context = bdrv_get_aio_context(bs); - aio_context_acquire(aio_context); - bitmap = bdrv_find_dirty_bitmap(bs, name); if (!bitmap) { error_setg(errp, "Dirty bitmap '%s' not found", name); - goto fail; + return NULL; } if (pbs) { *pbs = bs; } - if (paio) { - *paio = aio_context; - } else { - aio_context_release(aio_context); - } return bitmap; - - fail: - aio_context_release(aio_context); - return NULL; 
} /* New and old BlockDriverState structs for atomic group operations */ @@ -1792,7 +1778,7 @@ static void external_snapshot_commit(BlkActionState *common) /* We don't need (or want) to use the transactional * bdrv_reopen_multiple() across all the entries at once, because we * don't want to abort all of them if one of them fails the reopen */ - if (!state->old_bs->copy_on_read) { + if (!atomic_read(&state->old_bs->copy_on_read)) { bdrv_reopen(state->old_bs, state->old_bs->open_flags & ~BDRV_O_RDWR, NULL); } @@ -2026,7 +2012,6 @@ static void block_dirty_bitmap_clear_prepare(BlkActionState *common, state->bitmap = block_dirty_bitmap_lookup(action->node, action->name, &state->bs, - &state->aio_context, errp); if (!state->bitmap) { return; @@ -2734,7 +2719,6 @@ void qmp_block_dirty_bitmap_add(const char *node, const char *name, bool has_granularity, uint32_t granularity, Error **errp) { - AioContext *aio_context; BlockDriverState *bs; if (!name || name[0] == '\0') { @@ -2747,14 +2731,11 @@ void qmp_block_dirty_bitmap_add(const char *node, const char *name, return; } - aio_context = bdrv_get_aio_context(bs); - aio_context_acquire(aio_context); - if (has_granularity) { if (granularity < 512 || !is_power_of_2(granularity)) { error_setg(errp, "Granularity must be power of 2 " "and at least 512"); - goto out; + return; } } else { /* Default to cluster size, if available: */ @@ -2762,19 +2743,15 @@ void qmp_block_dirty_bitmap_add(const char *node, const char *name, } bdrv_create_dirty_bitmap(bs, granularity, name, errp); - - out: - aio_context_release(aio_context); } void qmp_block_dirty_bitmap_remove(const char *node, const char *name, Error **errp) { - AioContext *aio_context; BlockDriverState *bs; BdrvDirtyBitmap *bitmap; - bitmap = block_dirty_bitmap_lookup(node, name, &bs, &aio_context, errp); + bitmap = block_dirty_bitmap_lookup(node, name, &bs, errp); if (!bitmap || !bs) { return; } @@ -2783,13 +2760,10 @@ void qmp_block_dirty_bitmap_remove(const char *node, const char *name, error_setg(errp, "Bitmap '%s' is currently frozen and cannot be removed", name); - goto out; + return; } bdrv_dirty_bitmap_make_anon(bitmap); bdrv_release_dirty_bitmap(bs, bitmap); - - out: - aio_context_release(aio_context); } /** @@ -2799,11 +2773,10 @@ void qmp_block_dirty_bitmap_remove(const char *node, const char *name, void qmp_block_dirty_bitmap_clear(const char *node, const char *name, Error **errp) { - AioContext *aio_context; BdrvDirtyBitmap *bitmap; BlockDriverState *bs; - bitmap = block_dirty_bitmap_lookup(node, name, &bs, &aio_context, errp); + bitmap = block_dirty_bitmap_lookup(node, name, &bs, errp); if (!bitmap || !bs) { return; } @@ -2812,18 +2785,15 @@ void qmp_block_dirty_bitmap_clear(const char *node, const char *name, error_setg(errp, "Bitmap '%s' is currently frozen and cannot be modified", name); - goto out; + return; } else if (!bdrv_dirty_bitmap_enabled(bitmap)) { error_setg(errp, "Bitmap '%s' is currently disabled and cannot be cleared", name); - goto out; + return; } bdrv_clear_dirty_bitmap(bitmap, NULL); - - out: - aio_context_release(aio_context); } void hmp_drive_del(Monitor *mon, const QDict *qdict) @@ -407,7 +407,7 @@ QEMU_CFLAGS="-fno-strict-aliasing -fno-common -fwrapv $QEMU_CFLAGS" QEMU_CFLAGS="-Wall -Wundef -Wwrite-strings -Wmissing-prototypes $QEMU_CFLAGS" QEMU_CFLAGS="-Wstrict-prototypes -Wredundant-decls $QEMU_CFLAGS" QEMU_CFLAGS="-D_GNU_SOURCE -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE $QEMU_CFLAGS" -QEMU_INCLUDES="-I. -I\$(SRC_PATH) -I\$(SRC_PATH)/include" +QEMU_INCLUDES="-I. 
-I\$(SRC_PATH) -I\$(SRC_PATH)/accel/tcg -I\$(SRC_PATH)/include" if test "$debug_info" = "yes"; then CFLAGS="-g $CFLAGS" LDFLAGS="-g $LDFLAGS" @@ -2301,14 +2301,14 @@ fi # GTK probe if test "$gtkabi" = ""; then - # The GTK ABI was not specified explicitly, so try whether 2.0 is available. - # Use 3.0 as a fallback if that is available. - if $pkg_config --exists "gtk+-2.0 >= 2.18.0"; then - gtkabi=2.0 - elif $pkg_config --exists "gtk+-3.0 >= 3.0.0"; then + # The GTK ABI was not specified explicitly, so try whether 3.0 is available. + # Use 2.0 as a fallback if that is available. + if $pkg_config --exists "gtk+-3.0 >= 3.0.0"; then gtkabi=3.0 - else + elif $pkg_config --exists "gtk+-2.0 >= 2.18.0"; then gtkabi=2.0 + else + gtkabi=3.0 fi fi @@ -2331,7 +2331,7 @@ if test "$gtk" != "no"; then libs_softmmu="$gtk_libs $libs_softmmu" gtk="yes" elif test "$gtk" = "yes"; then - feature_not_found "gtk" "Install gtk2 or gtk3 devel" + feature_not_found "gtk" "Install gtk3-devel" else gtk="no" fi @@ -2598,12 +2598,12 @@ fi # sdl-config even without cross prefix, and favour pkg-config over sdl-config. if test "$sdlabi" = ""; then - if $pkg_config --exists "sdl"; then - sdlabi=1.2 - elif $pkg_config --exists "sdl2"; then + if $pkg_config --exists "sdl2"; then sdlabi=2.0 - else + elif $pkg_config --exists "sdl"; then sdlabi=1.2 + else + sdlabi=2.0 fi fi @@ -2630,7 +2630,7 @@ elif has ${sdl_config}; then sdlversion=$($sdlconfig --version) else if test "$sdl" = "yes" ; then - feature_not_found "sdl" "Install SDL devel" + feature_not_found "sdl" "Install SDL2-devel" fi sdl=no fi @@ -6374,7 +6374,7 @@ fi # build tree in object directory in case the source is not in the current directory DIRS="tests tests/tcg tests/tcg/cris tests/tcg/lm32 tests/libqos tests/qapi-schema tests/tcg/xtensa tests/qemu-iotests" -DIRS="$DIRS docs fsdev" +DIRS="$DIRS docs docs/interop fsdev" DIRS="$DIRS pc-bios/optionrom pc-bios/spapr-rtas pc-bios/s390-ccw" DIRS="$DIRS roms/seabios roms/vgabios" DIRS="$DIRS qapi-generated" diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h index af02a31ebe..53ef222c0b 100644 --- a/contrib/libvhost-user/libvhost-user.h +++ b/contrib/libvhost-user/libvhost-user.h @@ -17,6 +17,7 @@ #include <stdint.h> #include <stdbool.h> #include <stddef.h> +#include <sys/poll.h> #include <linux/vhost.h> #include "standard-headers/linux/virtio_ring.h" @@ -192,11 +193,11 @@ typedef struct VuVirtq { } VuVirtq; enum VuWatchCondtion { - VU_WATCH_IN = 1 << 0, - VU_WATCH_OUT = 1 << 1, - VU_WATCH_PRI = 1 << 2, - VU_WATCH_ERR = 1 << 3, - VU_WATCH_HUP = 1 << 4, + VU_WATCH_IN = POLLIN, + VU_WATCH_OUT = POLLOUT, + VU_WATCH_PRI = POLLPRI, + VU_WATCH_ERR = POLLERR, + VU_WATCH_HUP = POLLHUP, }; typedef void (*vu_panic_cb) (VuDev *dev, const char *err); diff --git a/contrib/vhost-user-scsi/Makefile.objs b/contrib/vhost-user-scsi/Makefile.objs new file mode 100644 index 0000000000..e83a38a85b --- /dev/null +++ b/contrib/vhost-user-scsi/Makefile.objs @@ -0,0 +1 @@ +vhost-user-scsi-obj-y = vhost-user-scsi.o diff --git a/contrib/vhost-user-scsi/vhost-user-scsi.c b/contrib/vhost-user-scsi/vhost-user-scsi.c new file mode 100644 index 0000000000..b5ae02c96f --- /dev/null +++ b/contrib/vhost-user-scsi/vhost-user-scsi.c @@ -0,0 +1,886 @@ +/* + * vhost-user-scsi sample application + * + * Copyright (c) 2016 Nutanix Inc. All rights reserved. + * + * Author: + * Felipe Franciosi <felipe@nutanix.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 only. 
+ * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "contrib/libvhost-user/libvhost-user.h" +#include "hw/virtio/virtio-scsi.h" +#include "iscsi/iscsi.h" + +#include <glib.h> + +/* Small compat shim from glib 2.32 */ +#ifndef G_SOURCE_CONTINUE +#define G_SOURCE_CONTINUE TRUE +#endif +#ifndef G_SOURCE_REMOVE +#define G_SOURCE_REMOVE FALSE +#endif + +/* #define VUS_DEBUG 1 */ + +/** Log helpers **/ + +#define PPRE \ + struct timespec ts; \ + char timebuf[64]; \ + struct tm tm; \ + (void)clock_gettime(CLOCK_REALTIME, &ts); \ + (void)strftime(timebuf, 64, "%Y%m%d %T", gmtime_r(&ts.tv_sec, &tm)) + +#define PEXT(lvl, msg, ...) do { \ + PPRE; \ + fprintf(stderr, "%s.%06ld " lvl ": %s:%s():%d: " msg "\n", \ + timebuf, ts.tv_nsec / 1000, \ + __FILE__, __func__, __LINE__, ## __VA_ARGS__); \ +} while (0) + +#define PNOR(lvl, msg, ...) do { \ + PPRE; \ + fprintf(stderr, "%s.%06ld " lvl ": " msg "\n", \ + timebuf, ts.tv_nsec / 1000, ## __VA_ARGS__); \ +} while (0) + +#ifdef VUS_DEBUG +#define PDBG(msg, ...) PEXT("DBG", msg, ## __VA_ARGS__) +#define PERR(msg, ...) PEXT("ERR", msg, ## __VA_ARGS__) +#define PLOG(msg, ...) PEXT("LOG", msg, ## __VA_ARGS__) +#else +#define PDBG(msg, ...) { } +#define PERR(msg, ...) PNOR("ERR", msg, ## __VA_ARGS__) +#define PLOG(msg, ...) PNOR("LOG", msg, ## __VA_ARGS__) +#endif + +/** vhost-user-scsi specific definitions **/ + + /* Only 1 LUN and device supported today */ +#define VUS_MAX_LUNS 1 +#define VUS_MAX_DEVS 1 + +#define VUS_ISCSI_INITIATOR "iqn.2016-11.com.nutanix:vhost-user-scsi" + +typedef struct iscsi_lun { + struct iscsi_context *iscsi_ctx; + int iscsi_lun; +} iscsi_lun_t; + +typedef struct vhost_scsi_dev { + VuDev vu_dev; + int server_sock; + GMainLoop *loop; + GTree *fdmap; /* fd -> gsource context id */ + iscsi_lun_t luns[VUS_MAX_LUNS]; +} vhost_scsi_dev_t; + +static vhost_scsi_dev_t *vhost_scsi_devs[VUS_MAX_DEVS]; + +/** glib event loop integration for libvhost-user and misc callbacks **/ + +QEMU_BUILD_BUG_ON((int)G_IO_IN != (int)VU_WATCH_IN); +QEMU_BUILD_BUG_ON((int)G_IO_OUT != (int)VU_WATCH_OUT); +QEMU_BUILD_BUG_ON((int)G_IO_PRI != (int)VU_WATCH_PRI); +QEMU_BUILD_BUG_ON((int)G_IO_ERR != (int)VU_WATCH_ERR); +QEMU_BUILD_BUG_ON((int)G_IO_HUP != (int)VU_WATCH_HUP); + +typedef struct vus_gsrc { + GSource parent; + vhost_scsi_dev_t *vdev_scsi; + GPollFD gfd; + vu_watch_cb vu_cb; +} vus_gsrc_t; + +static gint vus_fdmap_compare(gconstpointer a, gconstpointer b) +{ + return (b > a) - (b < a); +} + +static gboolean vus_gsrc_prepare(GSource *src, gint *timeout) +{ + assert(timeout); + + *timeout = -1; + return FALSE; +} + +static gboolean vus_gsrc_check(GSource *src) +{ + vus_gsrc_t *vus_src = (vus_gsrc_t *)src; + + assert(vus_src); + + return vus_src->gfd.revents & vus_src->gfd.events; +} + +static gboolean vus_gsrc_dispatch(GSource *src, GSourceFunc cb, gpointer data) +{ + vhost_scsi_dev_t *vdev_scsi; + vus_gsrc_t *vus_src = (vus_gsrc_t *)src; + + assert(vus_src); + assert(!(vus_src->vu_cb && cb)); + + vdev_scsi = vus_src->vdev_scsi; + + assert(vdev_scsi); + + if (cb) { + return cb(data); + } + if (vus_src->vu_cb) { + vus_src->vu_cb(&vdev_scsi->vu_dev, vus_src->gfd.revents, data); + } + return G_SOURCE_CONTINUE; +} + +static GSourceFuncs vus_gsrc_funcs = { + vus_gsrc_prepare, + vus_gsrc_check, + vus_gsrc_dispatch, + NULL +}; + +static int vus_gsrc_new(vhost_scsi_dev_t *vdev_scsi, int fd, GIOCondition cond, + vu_watch_cb vu_cb, GSourceFunc gsrc_cb, gpointer data) +{ + GSource *vus_gsrc; + vus_gsrc_t *vus_src; + guint id; + 
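+ /* Glue between libvhost-user and the glib main loop: wrap @fd in a custom GSource that polls for @cond, dispatch either the vu_watch_cb or the plain GSourceFunc (exactly one is set, per the asserts below), and remember the source id in vdev_scsi->fdmap keyed by fd so the watch can be found and destroyed later. */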
+ assert(vdev_scsi); + assert(fd >= 0); + assert(vu_cb || gsrc_cb); + assert(!(vu_cb && gsrc_cb)); + + vus_gsrc = g_source_new(&vus_gsrc_funcs, sizeof(vus_gsrc_t)); + if (!vus_gsrc) { + PERR("Error creating GSource for new watch"); + return -1; + } + vus_src = (vus_gsrc_t *)vus_gsrc; + + vus_src->vdev_scsi = vdev_scsi; + vus_src->gfd.fd = fd; + vus_src->gfd.events = cond; + vus_src->vu_cb = vu_cb; + + g_source_add_poll(vus_gsrc, &vus_src->gfd); + g_source_set_callback(vus_gsrc, gsrc_cb, data, NULL); + id = g_source_attach(vus_gsrc, NULL); + assert(id); + g_source_unref(vus_gsrc); + + g_tree_insert(vdev_scsi->fdmap, (gpointer)(uintptr_t)fd, + (gpointer)(uintptr_t)id); + + return 0; +} + +/* from libiscsi's scsi-lowlevel.h ** + * + * nb. We can't directly include scsi-lowlevel.h due to a namespace conflict: + * QEMU's scsi.h also defines "SCSI_XFER_NONE". + */ + +#define SCSI_CDB_MAX_SIZE 16 + +struct scsi_iovector { + struct scsi_iovec *iov; + int niov; + int nalloc; + size_t offset; + int consumed; +}; + +struct scsi_allocated_memory { + struct scsi_allocated_memory *next; + char buf[0]; +}; + +struct scsi_data { + int size; + unsigned char *data; +}; + +enum scsi_sense_key { + SCSI_SENSE_NO_SENSE = 0x00, + SCSI_SENSE_RECOVERED_ERROR = 0x01, + SCSI_SENSE_NOT_READY = 0x02, + SCSI_SENSE_MEDIUM_ERROR = 0x03, + SCSI_SENSE_HARDWARE_ERROR = 0x04, + SCSI_SENSE_ILLEGAL_REQUEST = 0x05, + SCSI_SENSE_UNIT_ATTENTION = 0x06, + SCSI_SENSE_DATA_PROTECTION = 0x07, + SCSI_SENSE_BLANK_CHECK = 0x08, + SCSI_SENSE_VENDOR_SPECIFIC = 0x09, + SCSI_SENSE_COPY_ABORTED = 0x0a, + SCSI_SENSE_COMMAND_ABORTED = 0x0b, + SCSI_SENSE_OBSOLETE_ERROR_CODE = 0x0c, + SCSI_SENSE_OVERFLOW_COMMAND = 0x0d, + SCSI_SENSE_MISCOMPARE = 0x0e +}; + +struct scsi_sense { + unsigned char error_type; + enum scsi_sense_key key; + int ascq; + unsigned sense_specific:1; + unsigned ill_param_in_cdb:1; + unsigned bit_pointer_valid:1; + unsigned char bit_pointer; + uint16_t field_pointer; +}; + +enum scsi_residual { + SCSI_RESIDUAL_NO_RESIDUAL = 0, + SCSI_RESIDUAL_UNDERFLOW, + SCSI_RESIDUAL_OVERFLOW +}; + +struct scsi_task { + int status; + int cdb_size; + int xfer_dir; + int expxferlen; + unsigned char cdb[SCSI_CDB_MAX_SIZE]; + enum scsi_residual residual_status; + size_t residual; + struct scsi_sense sense; + struct scsi_data datain; + struct scsi_allocated_memory *mem; + void *ptr; + + uint32_t itt; + uint32_t cmdsn; + uint32_t lun; + + struct scsi_iovector iovector_in; + struct scsi_iovector iovector_out; +}; + +/** libiscsi integration **/ + +static int iscsi_add_lun(iscsi_lun_t *lun, char *iscsi_uri) +{ + struct iscsi_url *iscsi_url; + struct iscsi_context *iscsi_ctx; + int ret = 0; + + assert(lun); + assert(iscsi_uri); + + iscsi_ctx = iscsi_create_context(VUS_ISCSI_INITIATOR); + if (!iscsi_ctx) { + PERR("Unable to create iSCSI context"); + return -1; + } + + iscsi_url = iscsi_parse_full_url(iscsi_ctx, iscsi_uri); + if (!iscsi_url) { + PERR("Unable to parse iSCSI URL: %s", iscsi_get_error(iscsi_ctx)); + goto fail; + } + + iscsi_set_session_type(iscsi_ctx, ISCSI_SESSION_NORMAL); + iscsi_set_header_digest(iscsi_ctx, ISCSI_HEADER_DIGEST_NONE_CRC32C); + if (iscsi_full_connect_sync(iscsi_ctx, iscsi_url->portal, iscsi_url->lun)) { + PERR("Unable to login to iSCSI portal: %s", iscsi_get_error(iscsi_ctx)); + goto fail; + } + + lun->iscsi_ctx = iscsi_ctx; + lun->iscsi_lun = iscsi_url->lun; + + PDBG("Context %p created for lun 0: %s", iscsi_ctx, iscsi_uri); + +out: + if (iscsi_url) { + iscsi_destroy_url(iscsi_url); + } + return ret; + +fail: + 
(void)iscsi_destroy_context(iscsi_ctx); + ret = -1; + goto out; +} + +static struct scsi_task *scsi_task_new(int cdb_len, uint8_t *cdb, int dir, + int xfer_len) { + struct scsi_task *task; + + assert(cdb_len > 0); + assert(cdb); + + task = calloc(1, sizeof(struct scsi_task)); + if (!task) { + PERR("Error allocating task: %s", strerror(errno)); + return NULL; + } + + memcpy(task->cdb, cdb, cdb_len); + task->cdb_size = cdb_len; + task->xfer_dir = dir; + task->expxferlen = xfer_len; + + return task; +} + +static int get_cdb_len(uint8_t *cdb) +{ + assert(cdb); + + switch (cdb[0] >> 5) { + case 0: return 6; + case 1: /* fall through */ + case 2: return 10; + case 4: return 16; + case 5: return 12; + } + PERR("Unable to determine cdb len (0x%02hhX)", cdb[0] >> 5); + return -1; +} + +static int handle_cmd_sync(struct iscsi_context *ctx, + VirtIOSCSICmdReq *req, + struct iovec *out, unsigned int out_len, + VirtIOSCSICmdResp *rsp, + struct iovec *in, unsigned int in_len) { + struct scsi_task *task; + uint32_t dir; + uint32_t len; + int cdb_len; + int i; + + assert(ctx); + assert(req); + assert(rsp); + + if (!(!req->lun[1] && req->lun[2] == 0x40 && !req->lun[3])) { + /* Ignore anything different than target=0, lun=0 */ + PDBG("Ignoring unconnected lun (0x%hhX, 0x%hhX)", + req->lun[1], req->lun[3]); + rsp->status = SCSI_STATUS_CHECK_CONDITION; + memset(rsp->sense, 0, sizeof(rsp->sense)); + rsp->sense_len = 18; + rsp->sense[0] = 0x70; + rsp->sense[2] = SCSI_SENSE_ILLEGAL_REQUEST; + rsp->sense[7] = 10; + rsp->sense[12] = 0x24; + + return 0; + } + + cdb_len = get_cdb_len(req->cdb); + if (cdb_len == -1) { + return -1; + } + + len = 0; + if (!out_len && !in_len) { + dir = SCSI_XFER_NONE; + } else if (out_len) { + dir = SCSI_XFER_TO_DEV; + for (i = 0; i < out_len; i++) { + len += out[i].iov_len; + } + } else { + dir = SCSI_XFER_FROM_DEV; + for (i = 0; i < in_len; i++) { + len += in[i].iov_len; + } + } + + task = scsi_task_new(cdb_len, req->cdb, dir, len); + if (!task) { + PERR("Unable to create iscsi task"); + return -1; + } + + if (dir == SCSI_XFER_TO_DEV) { + task->iovector_out.iov = (struct scsi_iovec *)out; + task->iovector_out.niov = out_len; + } else if (dir == SCSI_XFER_FROM_DEV) { + task->iovector_in.iov = (struct scsi_iovec *)in; + task->iovector_in.niov = in_len; + } + + PDBG("Sending iscsi cmd (cdb_len=%d, dir=%d, task=%p)", + cdb_len, dir, task); + if (!iscsi_scsi_command_sync(ctx, 0, task, NULL)) { + PERR("Error serving SCSI command"); + free(task); + return -1; + } + + memset(rsp, 0, sizeof(*rsp)); + + rsp->status = task->status; + rsp->resid = task->residual; + + if (task->status == SCSI_STATUS_CHECK_CONDITION) { + rsp->response = VIRTIO_SCSI_S_FAILURE; + rsp->sense_len = task->datain.size - 2; + memcpy(rsp->sense, &task->datain.data[2], rsp->sense_len); + } + + free(task); + + PDBG("Filled in rsp: status=%hhX, resid=%u, response=%hhX, sense_len=%u", + rsp->status, rsp->resid, rsp->response, rsp->sense_len); + + return 0; +} + +/** libvhost-user callbacks **/ + +static vhost_scsi_dev_t *vdev_scsi_find_by_vu(VuDev *vu_dev); + +static void vus_panic_cb(VuDev *vu_dev, const char *buf) +{ + vhost_scsi_dev_t *vdev_scsi; + + assert(vu_dev); + + vdev_scsi = vdev_scsi_find_by_vu(vu_dev); + + if (buf) { + PERR("vu_panic: %s", buf); + } + + if (vdev_scsi) { + assert(vdev_scsi->loop); + g_main_loop_quit(vdev_scsi->loop); + } +} + +static void vus_add_watch_cb(VuDev *vu_dev, int fd, int vu_evt, vu_watch_cb cb, + void *pvt) { + vhost_scsi_dev_t *vdev_scsi; + guint id; + + assert(vu_dev); + assert(fd >= 
0); + assert(cb); + + vdev_scsi = vdev_scsi_find_by_vu(vu_dev); + if (!vdev_scsi) { + vus_panic_cb(vu_dev, NULL); + return; + } + + id = (guint)(uintptr_t)g_tree_lookup(vdev_scsi->fdmap, + (gpointer)(uintptr_t)fd); + if (id) { + GSource *vus_src = g_main_context_find_source_by_id(NULL, id); + assert(vus_src); + g_source_destroy(vus_src); + (void)g_tree_remove(vdev_scsi->fdmap, (gpointer)(uintptr_t)fd); + } + + if (vus_gsrc_new(vdev_scsi, fd, vu_evt, cb, NULL, pvt)) { + vus_panic_cb(vu_dev, NULL); + } +} + +static void vus_del_watch_cb(VuDev *vu_dev, int fd) +{ + vhost_scsi_dev_t *vdev_scsi; + guint id; + + assert(vu_dev); + assert(fd >= 0); + + vdev_scsi = vdev_scsi_find_by_vu(vu_dev); + if (!vdev_scsi) { + vus_panic_cb(vu_dev, NULL); + return; + } + + id = (guint)(uintptr_t)g_tree_lookup(vdev_scsi->fdmap, + (gpointer)(uintptr_t)fd); + if (id) { + GSource *vus_src = g_main_context_find_source_by_id(NULL, id); + assert(vus_src); + g_source_destroy(vus_src); + (void)g_tree_remove(vdev_scsi->fdmap, (gpointer)(uintptr_t)fd); + } +} + +static void vus_proc_ctl(VuDev *vu_dev, int idx) +{ + /* Control VQ not implemented */ +} + +static void vus_proc_evt(VuDev *vu_dev, int idx) +{ + /* Event VQ not implemented */ +} + +static void vus_proc_req(VuDev *vu_dev, int idx) +{ + vhost_scsi_dev_t *vdev_scsi; + VuVirtq *vq; + + assert(vu_dev); + + vdev_scsi = vdev_scsi_find_by_vu(vu_dev); + if (!vdev_scsi) { + vus_panic_cb(vu_dev, NULL); + return; + } + + if ((idx < 0) || (idx >= VHOST_MAX_NR_VIRTQUEUE)) { + PERR("VQ Index out of range: %d", idx); + vus_panic_cb(vu_dev, NULL); + return; + } + + vq = vu_get_queue(vu_dev, idx); + if (!vq) { + PERR("Error fetching VQ (dev=%p, idx=%d)", vu_dev, idx); + vus_panic_cb(vu_dev, NULL); + return; + } + + PDBG("Got kicked on vq[%d]@%p", idx, vq); + + while (1) { + VuVirtqElement *elem; + VirtIOSCSICmdReq *req; + VirtIOSCSICmdResp *rsp; + + elem = vu_queue_pop(vu_dev, vq, sizeof(VuVirtqElement)); + if (!elem) { + PDBG("No more elements pending on vq[%d]@%p", idx, vq); + break; + } + PDBG("Popped elem@%p", elem); + + assert(!((elem->out_num > 1) && (elem->in_num > 1))); + assert((elem->out_num > 0) && (elem->in_num > 0)); + + if (elem->out_sg[0].iov_len < sizeof(VirtIOSCSICmdReq)) { + PERR("Invalid virtio-scsi req header"); + vus_panic_cb(vu_dev, NULL); + break; + } + req = (VirtIOSCSICmdReq *)elem->out_sg[0].iov_base; + + if (elem->in_sg[0].iov_len < sizeof(VirtIOSCSICmdResp)) { + PERR("Invalid virtio-scsi rsp header"); + vus_panic_cb(vu_dev, NULL); + break; + } + rsp = (VirtIOSCSICmdResp *)elem->in_sg[0].iov_base; + + if (handle_cmd_sync(vdev_scsi->luns[0].iscsi_ctx, + req, &elem->out_sg[1], elem->out_num - 1, + rsp, &elem->in_sg[1], elem->in_num - 1) != 0) { + vus_panic_cb(vu_dev, NULL); + break; + } + + vu_queue_push(vu_dev, vq, elem, 0); + vu_queue_notify(vu_dev, vq); + + free(elem); + } +} + +static void vus_queue_set_started(VuDev *vu_dev, int idx, bool started) +{ + VuVirtq *vq; + + assert(vu_dev); + + if ((idx < 0) || (idx >= VHOST_MAX_NR_VIRTQUEUE)) { + PERR("VQ Index out of range: %d", idx); + vus_panic_cb(vu_dev, NULL); + return; + } + + vq = vu_get_queue(vu_dev, idx); + + switch (idx) { + case 0: + vu_set_queue_handler(vu_dev, vq, started ? vus_proc_ctl : NULL); + break; + case 1: + vu_set_queue_handler(vu_dev, vq, started ? vus_proc_evt : NULL); + break; + default: + vu_set_queue_handler(vu_dev, vq, started ? 
vus_proc_req : NULL); + } +} + +static const VuDevIface vus_iface = { + .queue_set_started = vus_queue_set_started, +}; + +static gboolean vus_vhost_cb(gpointer data) +{ + VuDev *vu_dev = (VuDev *)data; + + assert(vu_dev); + + if (!vu_dispatch(vu_dev) != 0) { + PERR("Error processing vhost message"); + vus_panic_cb(vu_dev, NULL); + return G_SOURCE_REMOVE; + } + + return G_SOURCE_CONTINUE; +} + +/** misc helpers **/ + +static int unix_sock_new(char *unix_fn) +{ + int sock; + struct sockaddr_un un; + size_t len; + + assert(unix_fn); + + sock = socket(AF_UNIX, SOCK_STREAM, 0); + if (sock <= 0) { + perror("socket"); + return -1; + } + + un.sun_family = AF_UNIX; + (void)snprintf(un.sun_path, sizeof(un.sun_path), "%s", unix_fn); + len = sizeof(un.sun_family) + strlen(un.sun_path); + + (void)unlink(unix_fn); + if (bind(sock, (struct sockaddr *)&un, len) < 0) { + perror("bind"); + goto fail; + } + + if (listen(sock, 1) < 0) { + perror("listen"); + goto fail; + } + + return sock; + +fail: + (void)close(sock); + + return -1; +} + +/** vhost-user-scsi **/ + +static vhost_scsi_dev_t *vdev_scsi_find_by_vu(VuDev *vu_dev) +{ + int i; + + assert(vu_dev); + + for (i = 0; i < VUS_MAX_DEVS; i++) { + if (&vhost_scsi_devs[i]->vu_dev == vu_dev) { + return vhost_scsi_devs[i]; + } + } + + PERR("Unknown VuDev %p", vu_dev); + return NULL; +} + +static void vdev_scsi_deinit(vhost_scsi_dev_t *vdev_scsi) +{ + if (!vdev_scsi) { + return; + } + + if (vdev_scsi->server_sock >= 0) { + struct sockaddr_storage ss; + socklen_t sslen = sizeof(ss); + + if (getsockname(vdev_scsi->server_sock, (struct sockaddr *)&ss, + &sslen) == 0) { + struct sockaddr_un *su = (struct sockaddr_un *)&ss; + (void)unlink(su->sun_path); + } + + (void)close(vdev_scsi->server_sock); + vdev_scsi->server_sock = -1; + } + + if (vdev_scsi->loop) { + g_main_loop_unref(vdev_scsi->loop); + vdev_scsi->loop = NULL; + } +} + +static vhost_scsi_dev_t *vdev_scsi_new(char *unix_fn) +{ + vhost_scsi_dev_t *vdev_scsi = NULL; + + assert(unix_fn); + + vdev_scsi = calloc(1, sizeof(vhost_scsi_dev_t)); + if (!vdev_scsi) { + PERR("calloc: %s", strerror(errno)); + return NULL; + } + + vdev_scsi->server_sock = unix_sock_new(unix_fn); + if (vdev_scsi->server_sock < 0) { + goto err; + } + + vdev_scsi->loop = g_main_loop_new(NULL, FALSE); + if (!vdev_scsi->loop) { + PERR("Error creating glib event loop"); + goto err; + } + + vdev_scsi->fdmap = g_tree_new(vus_fdmap_compare); + if (!vdev_scsi->fdmap) { + PERR("Error creating glib tree for fdmap"); + goto err; + } + + return vdev_scsi; + +err: + vdev_scsi_deinit(vdev_scsi); + free(vdev_scsi); + + return NULL; +} + +static int vdev_scsi_add_iscsi_lun(vhost_scsi_dev_t *vdev_scsi, + char *iscsi_uri, uint32_t lun) { + assert(vdev_scsi); + assert(iscsi_uri); + assert(lun < VUS_MAX_LUNS); + + if (vdev_scsi->luns[lun].iscsi_ctx) { + PERR("Lun %d already configured", lun); + return -1; + } + + if (iscsi_add_lun(&vdev_scsi->luns[lun], iscsi_uri) != 0) { + return -1; + } + + return 0; +} + +static int vdev_scsi_run(vhost_scsi_dev_t *vdev_scsi) +{ + int cli_sock; + int ret = 0; + + assert(vdev_scsi); + assert(vdev_scsi->server_sock >= 0); + assert(vdev_scsi->loop); + + cli_sock = accept(vdev_scsi->server_sock, (void *)0, (void *)0); + if (cli_sock < 0) { + perror("accept"); + return -1; + } + + vu_init(&vdev_scsi->vu_dev, + cli_sock, + vus_panic_cb, + vus_add_watch_cb, + vus_del_watch_cb, + &vus_iface); + + if (vus_gsrc_new(vdev_scsi, cli_sock, G_IO_IN, NULL, vus_vhost_cb, + &vdev_scsi->vu_dev)) { + goto fail; + } + + 
g_main_loop_run(vdev_scsi->loop); + +out: + vu_deinit(&vdev_scsi->vu_dev); + + return ret; + +fail: + ret = -1; + goto out; +} + +int main(int argc, char **argv) +{ + vhost_scsi_dev_t *vdev_scsi = NULL; + char *unix_fn = NULL; + char *iscsi_uri = NULL; + int opt, err = EXIT_SUCCESS; + + while ((opt = getopt(argc, argv, "u:i:")) != -1) { + switch (opt) { + case 'h': + goto help; + case 'u': + unix_fn = strdup(optarg); + break; + case 'i': + iscsi_uri = strdup(optarg); + break; + default: + goto help; + } + } + if (!unix_fn || !iscsi_uri) { + goto help; + } + + vdev_scsi = vdev_scsi_new(unix_fn); + if (!vdev_scsi) { + goto err; + } + vhost_scsi_devs[0] = vdev_scsi; + + if (vdev_scsi_add_iscsi_lun(vdev_scsi, iscsi_uri, 0) != 0) { + goto err; + } + + if (vdev_scsi_run(vdev_scsi) != 0) { + goto err; + } + +out: + if (vdev_scsi) { + vdev_scsi_deinit(vdev_scsi); + free(vdev_scsi); + } + if (unix_fn) { + free(unix_fn); + } + if (iscsi_uri) { + free(iscsi_uri); + } + + return err; + +err: + err = EXIT_FAILURE; + goto out; + +help: + fprintf(stderr, "Usage: %s [ -u unix_sock_path -i iscsi_uri ] | [ -h ]\n", + argv[0]); + fprintf(stderr, " -u path to unix socket\n"); + fprintf(stderr, " -i iscsi uri for lun 0\n"); + fprintf(stderr, " -h print help and quit\n"); + + goto err; +} diff --git a/default-configs/pci.mak b/default-configs/pci.mak index 3bbeb62d9a..53ff10975c 100644 --- a/default-configs/pci.mak +++ b/default-configs/pci.mak @@ -43,3 +43,4 @@ CONFIG_VGA=y CONFIG_VGA_PCI=y CONFIG_IVSHMEM=$(CONFIG_EVENTFD) CONFIG_ROCKER=y +CONFIG_VHOST_USER_SCSI=$(CONFIG_LINUX) diff --git a/default-configs/s390x-softmmu.mak b/default-configs/s390x-softmmu.mak index 18aed56fc0..b227a36179 100644 --- a/default-configs/s390x-softmmu.mak +++ b/default-configs/s390x-softmmu.mak @@ -1,5 +1,6 @@ CONFIG_PCI=y CONFIG_VIRTIO_PCI=y +CONFIG_VHOST_USER_SCSI=$(CONFIG_LINUX) CONFIG_VIRTIO=y CONFIG_SCLPCONSOLE=y CONFIG_TERMINAL3270=y diff --git a/docs/specs/parallels.txt b/docs/interop/parallels.txt index e9271eba5d..e9271eba5d 100644 --- a/docs/specs/parallels.txt +++ b/docs/interop/parallels.txt diff --git a/docs/specs/qcow2.txt b/docs/interop/qcow2.txt index 80cdfd0e91..80cdfd0e91 100644 --- a/docs/specs/qcow2.txt +++ b/docs/interop/qcow2.txt diff --git a/docs/specs/qed_spec.txt b/docs/interop/qed_spec.txt index 7982e058b2..7982e058b2 100644 --- a/docs/specs/qed_spec.txt +++ b/docs/interop/qed_spec.txt diff --git a/docs/qemu-ga-ref.texi b/docs/interop/qemu-ga-ref.texi index ddb76ce1c2..ddb76ce1c2 100644 --- a/docs/qemu-ga-ref.texi +++ b/docs/interop/qemu-ga-ref.texi diff --git a/docs/qemu-qmp-ref.texi b/docs/interop/qemu-qmp-ref.texi index bb25758bd0..bb25758bd0 100644 --- a/docs/qemu-qmp-ref.texi +++ b/docs/interop/qemu-qmp-ref.texi diff --git a/docs/qmp-intro.txt b/docs/interop/qmp-intro.txt index 60deafbae6..60deafbae6 100644 --- a/docs/qmp-intro.txt +++ b/docs/interop/qmp-intro.txt diff --git a/docs/qmp-spec.txt b/docs/interop/qmp-spec.txt index f8b5356015..f8b5356015 100644 --- a/docs/qmp-spec.txt +++ b/docs/interop/qmp-spec.txt diff --git a/docs/specs/vhost-user.txt b/docs/interop/vhost-user.txt index 481ab56e35..481ab56e35 100644 --- a/docs/specs/vhost-user.txt +++ b/docs/interop/vhost-user.txt diff --git a/docs/vnc-ledstate-Pseudo-encoding.txt b/docs/interop/vnc-ledstate-Pseudo-encoding.txt index 0f124f68b1..0f124f68b1 100644 --- a/docs/vnc-ledstate-Pseudo-encoding.txt +++ b/docs/interop/vnc-ledstate-Pseudo-encoding.txt @@ -1482,25 +1482,17 @@ static int64_t get_file_size(int fd) return size; } -static void 
*file_ram_alloc(RAMBlock *block, - ram_addr_t memory, - const char *path, - Error **errp) +static int file_ram_open(const char *path, + const char *region_name, + bool *created, + Error **errp) { - bool unlink_on_error = false; char *filename; char *sanitized_name; char *c; - void *area = MAP_FAILED; int fd = -1; - int64_t file_size; - - if (kvm_enabled() && !kvm_has_sync_mmu()) { - error_setg(errp, - "host lacks kvm mmu notifiers, -mem-path unsupported"); - return NULL; - } + *created = false; for (;;) { fd = open(path, O_RDWR); if (fd >= 0) { @@ -1511,13 +1503,13 @@ static void *file_ram_alloc(RAMBlock *block, /* @path names a file that doesn't exist, create it */ fd = open(path, O_RDWR | O_CREAT | O_EXCL, 0644); if (fd >= 0) { - unlink_on_error = true; + *created = true; break; } } else if (errno == EISDIR) { /* @path names a directory, create a file there */ /* Make name safe to use with mkstemp by replacing '/' with '_'. */ - sanitized_name = g_strdup(memory_region_name(block->mr)); + sanitized_name = g_strdup(region_name); for (c = sanitized_name; *c != '\0'; c++) { if (*c == '/') { *c = '_'; @@ -1540,7 +1532,7 @@ static void *file_ram_alloc(RAMBlock *block, error_setg_errno(errp, errno, "can't open backing store %s for guest RAM", path); - goto error; + return -1; } /* * Try again on EINTR and EEXIST. The latter happens when @@ -1548,6 +1540,17 @@ static void *file_ram_alloc(RAMBlock *block, */ } + return fd; +} + +static void *file_ram_alloc(RAMBlock *block, + ram_addr_t memory, + int fd, + bool truncate, + Error **errp) +{ + void *area; + block->page_size = qemu_fd_getpagesize(fd); block->mr->align = block->page_size; #if defined(__s390x__) @@ -1556,20 +1559,11 @@ static void *file_ram_alloc(RAMBlock *block, } #endif - file_size = get_file_size(fd); - if (memory < block->page_size) { error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to " "or larger than page size 0x%zx", memory, block->page_size); - goto error; - } - - if (file_size > 0 && file_size < memory) { - error_setg(errp, "backing store %s size 0x%" PRIx64 - " does not match 'size' option 0x" RAM_ADDR_FMT, - path, file_size, memory); - goto error; + return NULL; } memory = ROUND_UP(memory, block->page_size); @@ -1588,7 +1582,7 @@ static void *file_ram_alloc(RAMBlock *block, * those labels. Therefore, extending the non-empty backend file * is disabled as well. 
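+ * Concretely (assumed semantics of the new @truncate parameter): a newly created or empty backing file is grown to @memory with ftruncate(), while a non-empty file is mapped at its existing size, which the caller has already checked against the requested RAM size.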
*/ - if (!file_size && ftruncate(fd, memory)) { + if (truncate && ftruncate(fd, memory)) { perror("ftruncate"); } @@ -1597,30 +1591,19 @@ static void *file_ram_alloc(RAMBlock *block, if (area == MAP_FAILED) { error_setg_errno(errp, errno, "unable to map backing store for guest RAM"); - goto error; + return NULL; } if (mem_prealloc) { os_mem_prealloc(fd, area, memory, smp_cpus, errp); if (errp && *errp) { - goto error; + qemu_ram_munmap(area, memory); + return NULL; } } block->fd = fd; return area; - -error: - if (area != MAP_FAILED) { - qemu_ram_munmap(area, memory); - } - if (unlink_on_error) { - unlink(path); - } - if (fd != -1) { - close(fd); - } - return NULL; } #endif @@ -1931,18 +1914,25 @@ static void ram_block_add(RAMBlock *new_block, Error **errp) } #ifdef __linux__ -RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr, - bool share, const char *mem_path, - Error **errp) +RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr, + bool share, int fd, + Error **errp) { RAMBlock *new_block; Error *local_err = NULL; + int64_t file_size; if (xen_enabled()) { error_setg(errp, "-mem-path not supported with Xen"); return NULL; } + if (kvm_enabled() && !kvm_has_sync_mmu()) { + error_setg(errp, + "host lacks kvm mmu notifiers, -mem-path unsupported"); + return NULL; + } + if (phys_mem_alloc != qemu_anon_ram_alloc) { /* * file_ram_alloc() needs to allocate just like @@ -1955,13 +1945,20 @@ RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr, } size = HOST_PAGE_ALIGN(size); + file_size = get_file_size(fd); + if (file_size > 0 && file_size < size) { + error_setg(errp, "backing store %s size 0x%" PRIx64 + " does not match 'size' option 0x" RAM_ADDR_FMT, + mem_path, file_size, size); + return NULL; + } + new_block = g_malloc0(sizeof(*new_block)); new_block->mr = mr; new_block->used_length = size; new_block->max_length = size; new_block->flags = share ? 
RAM_SHARED : 0; - new_block->host = file_ram_alloc(new_block, size, - mem_path, errp); + new_block->host = file_ram_alloc(new_block, size, fd, !file_size, errp); if (!new_block->host) { g_free(new_block); return NULL; @@ -1974,6 +1971,33 @@ RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr, return NULL; } return new_block; + +} + + +RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr, + bool share, const char *mem_path, + Error **errp) +{ + int fd; + bool created; + RAMBlock *block; + + fd = file_ram_open(mem_path, memory_region_name(mr), &created, errp); + if (fd < 0) { + return NULL; + } + + block = qemu_ram_alloc_from_fd(size, mr, share, fd, errp); + if (!block) { + if (created) { + unlink(mem_path); + } + close(fd); + return NULL; + } + + return block; } #endif diff --git a/fpu/softfloat-specialize.h b/fpu/softfloat-specialize.h index 100c8a98bf..de2c5d5702 100644 --- a/fpu/softfloat-specialize.h +++ b/fpu/softfloat-specialize.h @@ -111,7 +111,7 @@ float16 float16_default_nan(float_status *status) *----------------------------------------------------------------------------*/ float32 float32_default_nan(float_status *status) { -#if defined(TARGET_SPARC) +#if defined(TARGET_SPARC) || defined(TARGET_M68K) return const_float32(0x7FFFFFFF); #elif defined(TARGET_PPC) || defined(TARGET_ARM) || defined(TARGET_ALPHA) || \ defined(TARGET_XTENSA) || defined(TARGET_S390X) || defined(TARGET_TRICORE) @@ -136,7 +136,7 @@ float32 float32_default_nan(float_status *status) *----------------------------------------------------------------------------*/ float64 float64_default_nan(float_status *status) { -#if defined(TARGET_SPARC) +#if defined(TARGET_SPARC) || defined(TARGET_M68K) return const_float64(LIT64(0x7FFFFFFFFFFFFFFF)); #elif defined(TARGET_PPC) || defined(TARGET_ARM) || defined(TARGET_ALPHA) || \ defined(TARGET_S390X) @@ -162,7 +162,10 @@ float64 float64_default_nan(float_status *status) floatx80 floatx80_default_nan(float_status *status) { floatx80 r; - +#if defined(TARGET_M68K) + r.low = LIT64(0xFFFFFFFFFFFFFFFF); + r.high = 0x7FFF; +#else if (status->snan_bit_is_one) { r.low = LIT64(0xBFFFFFFFFFFFFFFF); r.high = 0x7FFF; @@ -170,6 +173,7 @@ floatx80 floatx80_default_nan(float_status *status) r.low = LIT64(0xC000000000000000); r.high = 0xFFFF; } +#endif return r; } @@ -502,6 +506,30 @@ static int pickNaN(flag aIsQNaN, flag aIsSNaN, flag bIsQNaN, flag bIsSNaN, return 1; } } +#elif defined(TARGET_M68K) +static int pickNaN(flag aIsQNaN, flag aIsSNaN, flag bIsQNaN, flag bIsSNaN, + flag aIsLargerSignificand) +{ + /* M68000 FAMILY PROGRAMMER'S REFERENCE MANUAL + * 3.4 FLOATING-POINT INSTRUCTION DETAILS + * If either operand, but not both operands, of an operation is a + * nonsignaling NaN, then that NaN is returned as the result. If both + * operands are nonsignaling NaNs, then the destination operand + * nonsignaling NaN is returned as the result. + * If either operand to an operation is a signaling NaN (SNaN), then the + * SNaN bit is set in the FPSR EXC byte. If the SNaN exception enable bit + * is set in the FPCR ENABLE byte, then the exception is taken and the + * destination is not modified. If the SNaN exception enable bit is not + * set, setting the SNaN bit in the operand to a one converts the SNaN to + * a nonsignaling NaN. The operation then continues as described in the + * preceding paragraph for nonsignaling NaNs. 
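+ * Worked example of the rule above: for an FADD whose destination operand is a QNaN and whose source operand is an SNaN, the SNaN is quieted and the destination NaN becomes the result; pickNaN() therefore returns 0 ('a', the destination) whenever 'a' is any NaN, and returns 1 ('b') only when the destination is not a NaN at all.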
+ */ + if (aIsQNaN || aIsSNaN) { /* a is the destination operand */ + return 0; /* return the destination operand */ + } else { + return 1; /* return b */ + } +} #else static int pickNaN(flag aIsQNaN, flag aIsSNaN, flag bIsQNaN, flag bIsSNaN, flag aIsLargerSignificand) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 15610b9de8..a9b59bdce5 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -37,24 +37,6 @@ #include "kvm_i386.h" #include "trace.h" -/*#define DEBUG_INTEL_IOMMU*/ -#ifdef DEBUG_INTEL_IOMMU -enum { - DEBUG_GENERAL, DEBUG_CSR, DEBUG_INV, DEBUG_MMU, DEBUG_FLOG, - DEBUG_CACHE, DEBUG_IR, -}; -#define VTD_DBGBIT(x) (1 << DEBUG_##x) -static int vtd_dbgflags = VTD_DBGBIT(GENERAL) | VTD_DBGBIT(CSR); - -#define VTD_DPRINTF(what, fmt, ...) do { \ - if (vtd_dbgflags & VTD_DBGBIT(what)) { \ - fprintf(stderr, "(vtd)%s: " fmt "\n", __func__, \ - ## __VA_ARGS__); } \ - } while (0) -#else -#define VTD_DPRINTF(what, fmt, ...) do {} while (0) -#endif - static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val, uint64_t wmask, uint64_t w1cmask) { @@ -199,9 +181,10 @@ static void vtd_reset_context_cache(IntelIOMMUState *s) GHashTableIter bus_it; uint32_t devfn_it; + trace_vtd_context_cache_reset(); + g_hash_table_iter_init(&bus_it, s->vtd_as_by_busptr); - VTD_DPRINTF(CACHE, "global context_cache_gen=1"); while (g_hash_table_iter_next (&bus_it, NULL, (void**)&vtd_bus)) { for (devfn_it = 0; devfn_it < X86_IOMMU_PCI_DEVFN_MAX; ++devfn_it) { vtd_as = vtd_bus->dev_as[devfn_it]; @@ -291,8 +274,8 @@ static void vtd_generate_interrupt(IntelIOMMUState *s, hwaddr mesg_addr_reg, msi.address = vtd_get_long_raw(s, mesg_addr_reg); msi.data = vtd_get_long_raw(s, mesg_data_reg); - VTD_DPRINTF(FLOG, "msi: addr 0x%"PRIx64 " data 0x%"PRIx32, - msi.address, msi.data); + trace_vtd_irq_generate(msi.address, msi.data); + apic_get_class()->send_msi(&msi); } @@ -304,14 +287,14 @@ static void vtd_generate_fault_event(IntelIOMMUState *s, uint32_t pre_fsts) { if (pre_fsts & VTD_FSTS_PPF || pre_fsts & VTD_FSTS_PFO || pre_fsts & VTD_FSTS_IQE) { - VTD_DPRINTF(FLOG, "there are previous interrupt conditions " - "to be serviced by software, fault event is not generated " - "(FSTS_REG 0x%"PRIx32 ")", pre_fsts); + trace_vtd_err("There are previous interrupt conditions " + "to be serviced by software, fault event " + "is not generated."); return; } vtd_set_clear_mask_long(s, DMAR_FECTL_REG, 0, VTD_FECTL_IP); if (vtd_get_long_raw(s, DMAR_FECTL_REG) & VTD_FECTL_IM) { - VTD_DPRINTF(FLOG, "Interrupt Mask set, fault event is not generated"); + trace_vtd_err("Interrupt Mask set, irq is not generated."); } else { vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG); vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0); @@ -348,7 +331,7 @@ static void vtd_update_fsts_ppf(IntelIOMMUState *s) } } vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_PPF, ppf_mask); - VTD_DPRINTF(FLOG, "set PPF of FSTS_REG to %d", ppf_mask ? 
1 : 0); + trace_vtd_fsts_ppf(!!ppf_mask); } static void vtd_set_frcd_and_update_ppf(IntelIOMMUState *s, uint16_t index) @@ -380,8 +363,8 @@ static void vtd_record_frcd(IntelIOMMUState *s, uint16_t index, } vtd_set_quad_raw(s, frcd_reg_addr, lo); vtd_set_quad_raw(s, frcd_reg_addr + 8, hi); - VTD_DPRINTF(FLOG, "record to FRCD_REG #%"PRIu16 ": hi 0x%"PRIx64 - ", lo 0x%"PRIx64, index, hi, lo); + + trace_vtd_frr_new(index, hi, lo); } /* Try to collapse multiple pending faults from the same requester */ @@ -393,7 +376,6 @@ static bool vtd_try_collapse_fault(IntelIOMMUState *s, uint16_t source_id) for (i = 0; i < DMAR_FRCD_REG_NR; i++) { frcd_reg = vtd_get_quad_raw(s, addr); - VTD_DPRINTF(FLOG, "frcd_reg #%d 0x%"PRIx64, i, frcd_reg); if ((frcd_reg & VTD_FRCD_F) && ((frcd_reg & VTD_FRCD_SID_MASK) == source_id)) { return true; @@ -416,21 +398,24 @@ static void vtd_report_dmar_fault(IntelIOMMUState *s, uint16_t source_id, /* This is not a normal fault reason case. Drop it. */ return; } - VTD_DPRINTF(FLOG, "sid 0x%"PRIx16 ", fault %d, addr 0x%"PRIx64 - ", is_write %d", source_id, fault, addr, is_write); + + trace_vtd_dmar_fault(source_id, fault, addr, is_write); + if (fsts_reg & VTD_FSTS_PFO) { - VTD_DPRINTF(FLOG, "new fault is not recorded due to " - "Primary Fault Overflow"); + trace_vtd_err("New fault is not recorded due to " + "Primary Fault Overflow."); return; } + if (vtd_try_collapse_fault(s, source_id)) { - VTD_DPRINTF(FLOG, "new fault is not recorded due to " - "compression of faults"); + trace_vtd_err("New fault is not recorded due to " + "compression of faults."); return; } + if (vtd_is_frcd_set(s, s->next_frcd_reg)) { - VTD_DPRINTF(FLOG, "Primary Fault Overflow and " - "new fault is not recorded, set PFO field"); + trace_vtd_err("Next Fault Recording Reg is used, " + "new fault is not recorded, set PFO field."); vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_PFO); return; } @@ -438,8 +423,8 @@ static void vtd_report_dmar_fault(IntelIOMMUState *s, uint16_t source_id, vtd_record_frcd(s, s->next_frcd_reg, source_id, addr, fault, is_write); if (fsts_reg & VTD_FSTS_PPF) { - VTD_DPRINTF(FLOG, "there are pending faults already, " - "fault event is not generated"); + trace_vtd_err("There are pending faults already, " + "fault event is not generated."); vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg); s->next_frcd_reg++; if (s->next_frcd_reg == DMAR_FRCD_REG_NR) { @@ -702,7 +687,7 @@ static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write, uint64_t access_right_check; if (!vtd_iova_range_check(iova, ce)) { - VTD_DPRINTF(GENERAL, "error: iova 0x%"PRIx64 " exceeds limits", iova); + trace_vtd_err_dmar_iova_overflow(iova); return -VTD_FR_ADDR_BEYOND_MGAW; } @@ -714,9 +699,7 @@ static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write, slpte = vtd_get_slpte(addr, offset); if (slpte == (uint64_t)-1) { - VTD_DPRINTF(GENERAL, "error: fail to access second-level paging " - "entry at level %"PRIu32 " for iova 0x%"PRIx64, - level, iova); + trace_vtd_err_dmar_slpte_read_error(iova, level); if (level == vtd_ce_get_level(ce)) { /* Invalid programming of context-entry */ return -VTD_FR_CONTEXT_ENTRY_INV; @@ -727,15 +710,11 @@ static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write, *reads = (*reads) && (slpte & VTD_SL_R); *writes = (*writes) && (slpte & VTD_SL_W); if (!(slpte & access_right_check)) { - VTD_DPRINTF(GENERAL, "error: lack of %s permission for " - "iova 0x%"PRIx64 " slpte 0x%"PRIx64, - (is_write ? 
"write" : "read"), iova, slpte); + trace_vtd_err_dmar_slpte_perm_error(iova, level, slpte, is_write); return is_write ? -VTD_FR_WRITE : -VTD_FR_READ; } if (vtd_slpte_nonzero_rsvd(slpte, level)) { - VTD_DPRINTF(GENERAL, "error: non-zero reserved field in second " - "level paging entry level %"PRIu32 " slpte 0x%"PRIx64, - level, slpte); + trace_vtd_err_dmar_slpte_resv_error(iova, level, slpte); return -VTD_FR_PAGING_ENTRY_RSVD; } @@ -1090,8 +1069,10 @@ out: * @devfn: The devfn, which is the combined of device and function number * @is_write: The access is a write operation * @entry: IOMMUTLBEntry that contain the addr to be translated and result + * + * Returns true if translation is successful, otherwise false. */ -static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, +static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, uint8_t devfn, hwaddr addr, bool is_write, IOMMUTLBEntry *entry) { @@ -1125,6 +1106,7 @@ static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, page_mask = iotlb_entry->mask; goto out; } + /* Try to fetch context-entry from cache first */ if (cc_entry->context_cache_gen == s->context_cache_gen) { trace_vtd_iotlb_cc_hit(bus_num, devfn, cc_entry->context_entry.hi, @@ -1142,7 +1124,7 @@ static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, } else { vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write); } - return; + goto error; } /* Update context-cache */ trace_vtd_iotlb_cc_update(bus_num, devfn, ce.hi, ce.lo, @@ -1157,8 +1139,9 @@ static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, * Also, let's ignore IOTLB caching as well for PT devices. */ if (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH) { + entry->iova = addr & VTD_PAGE_MASK; entry->translated_addr = entry->iova; - entry->addr_mask = VTD_PAGE_SIZE - 1; + entry->addr_mask = VTD_PAGE_MASK; entry->perm = IOMMU_RW; trace_vtd_translate_pt(source_id, entry->iova); @@ -1173,7 +1156,7 @@ static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, */ vtd_pt_enable_fast_path(s, source_id); - return; + return true; } ret_fr = vtd_iova_to_slpte(&ce, addr, is_write, &slpte, &level, @@ -1185,7 +1168,7 @@ static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, } else { vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write); } - return; + goto error; } page_mask = vtd_slpt_level_page_mask(level); @@ -1196,6 +1179,14 @@ out: entry->translated_addr = vtd_get_slpte_addr(slpte) & page_mask; entry->addr_mask = ~page_mask; entry->perm = IOMMU_ACCESS_FLAG(reads, writes); + return true; + +error: + entry->iova = 0; + entry->translated_addr = 0; + entry->addr_mask = 0; + entry->perm = IOMMU_NONE; + return false; } static void vtd_root_table_setup(IntelIOMMUState *s) @@ -1204,8 +1195,7 @@ static void vtd_root_table_setup(IntelIOMMUState *s) s->root_extended = s->root & VTD_RTADDR_RTT; s->root &= VTD_RTADDR_ADDR_MASK; - VTD_DPRINTF(CSR, "root_table addr 0x%"PRIx64 " %s", s->root, - (s->root_extended ? 
"(extended)" : "")); + trace_vtd_reg_dmar_root(s->root, s->root_extended); } static void vtd_iec_notify_all(IntelIOMMUState *s, bool global, @@ -1225,8 +1215,7 @@ static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s) /* Notify global invalidation */ vtd_iec_notify_all(s, true, 0, 0); - VTD_DPRINTF(CSR, "int remap table addr 0x%"PRIx64 " size %"PRIu32, - s->intr_root, s->intr_size); + trace_vtd_reg_ir_root(s->intr_root, s->intr_size); } static void vtd_iommu_replay_all(IntelIOMMUState *s) @@ -1328,11 +1317,8 @@ static uint64_t vtd_context_cache_invalidate(IntelIOMMUState *s, uint64_t val) switch (type) { case VTD_CCMD_DOMAIN_INVL: - VTD_DPRINTF(INV, "domain-selective invalidation domain 0x%"PRIx16, - (uint16_t)VTD_CCMD_DID(val)); /* Fall through */ case VTD_CCMD_GLOBAL_INVL: - VTD_DPRINTF(INV, "global invalidation"); caig = VTD_CCMD_GLOBAL_INVL_A; vtd_context_global_invalidate(s); break; @@ -1343,7 +1329,7 @@ static uint64_t vtd_context_cache_invalidate(IntelIOMMUState *s, uint64_t val) break; default: - VTD_DPRINTF(GENERAL, "error: invalid granularity"); + trace_vtd_err("Context cache invalidate type error."); caig = 0; } return caig; @@ -1351,7 +1337,7 @@ static uint64_t vtd_context_cache_invalidate(IntelIOMMUState *s, uint64_t val) static void vtd_iotlb_global_invalidate(IntelIOMMUState *s) { - trace_vtd_iotlb_reset("global invalidation recved"); + trace_vtd_inv_desc_iotlb_global(); vtd_reset_iotlb(s); vtd_iommu_replay_all(s); } @@ -1362,6 +1348,8 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) VTDContextEntry ce; VTDAddressSpace *vtd_as; + trace_vtd_inv_desc_iotlb_domain(domain_id); + g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_domain, &domain_id); @@ -1407,6 +1395,8 @@ static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id, { VTDIOTLBPageInvInfo info; + trace_vtd_inv_desc_iotlb_pages(domain_id, addr, am); + assert(am <= VTD_MAMV); info.domain_id = domain_id; info.addr = addr; @@ -1429,15 +1419,12 @@ static uint64_t vtd_iotlb_flush(IntelIOMMUState *s, uint64_t val) switch (type) { case VTD_TLB_GLOBAL_FLUSH: - VTD_DPRINTF(INV, "global invalidation"); iaig = VTD_TLB_GLOBAL_FLUSH_A; vtd_iotlb_global_invalidate(s); break; case VTD_TLB_DSI_FLUSH: domain_id = VTD_TLB_DID(val); - VTD_DPRINTF(INV, "domain-selective invalidation domain 0x%"PRIx16, - domain_id); iaig = VTD_TLB_DSI_FLUSH_A; vtd_iotlb_domain_invalidate(s, domain_id); break; @@ -1447,11 +1434,8 @@ static uint64_t vtd_iotlb_flush(IntelIOMMUState *s, uint64_t val) addr = vtd_get_quad_raw(s, DMAR_IVA_REG); am = VTD_IVA_AM(addr); addr = VTD_IVA_ADDR(addr); - VTD_DPRINTF(INV, "page-selective invalidation domain 0x%"PRIx16 - " addr 0x%"PRIx64 " mask %"PRIu8, domain_id, addr, am); if (am > VTD_MAMV) { - VTD_DPRINTF(GENERAL, "error: supported max address mask value is " - "%"PRIu8, (uint8_t)VTD_MAMV); + trace_vtd_err("IOTLB PSI flush: address mask overflow."); iaig = 0; break; } @@ -1460,7 +1444,7 @@ static uint64_t vtd_iotlb_flush(IntelIOMMUState *s, uint64_t val) break; default: - VTD_DPRINTF(GENERAL, "error: invalid granularity"); + trace_vtd_err("IOTLB flush: invalid granularity."); iaig = 0; } return iaig; @@ -1481,21 +1465,19 @@ static void vtd_handle_gcmd_qie(IntelIOMMUState *s, bool en) { uint64_t iqa_val = vtd_get_quad_raw(s, DMAR_IQA_REG); - VTD_DPRINTF(INV, "Queued Invalidation Enable %s", (en ? 
"on" : "off")); + trace_vtd_inv_qi_enable(en); + if (en) { if (vtd_queued_inv_enable_check(s)) { s->iq = iqa_val & VTD_IQA_IQA_MASK; /* 2^(x+8) entries */ s->iq_size = 1UL << ((iqa_val & VTD_IQA_QS) + 8); s->qi_enabled = true; - VTD_DPRINTF(INV, "DMAR_IQA_REG 0x%"PRIx64, iqa_val); - VTD_DPRINTF(INV, "Invalidation Queue addr 0x%"PRIx64 " size %d", - s->iq, s->iq_size); + trace_vtd_inv_qi_setup(s->iq, s->iq_size); /* Ok - report back to driver */ vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_QIES); } else { - VTD_DPRINTF(GENERAL, "error: can't enable Queued Invalidation: " - "tail %"PRIu16, s->iq_tail); + trace_vtd_err_qi_enable(s->iq_tail); } } else { if (vtd_queued_inv_disable_check(s)) { @@ -1506,10 +1488,7 @@ static void vtd_handle_gcmd_qie(IntelIOMMUState *s, bool en) /* Ok - report back to driver */ vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_QIES, 0); } else { - VTD_DPRINTF(GENERAL, "error: can't disable Queued Invalidation: " - "head %"PRIu16 ", tail %"PRIu16 - ", last_descriptor %"PRIu8, - s->iq_head, s->iq_tail, s->iq_last_desc_type); + trace_vtd_err_qi_disable(s->iq_head, s->iq_tail, s->iq_last_desc_type); } } } @@ -1517,8 +1496,6 @@ static void vtd_handle_gcmd_qie(IntelIOMMUState *s, bool en) /* Set Root Table Pointer */ static void vtd_handle_gcmd_srtp(IntelIOMMUState *s) { - VTD_DPRINTF(CSR, "set Root Table Pointer"); - vtd_root_table_setup(s); /* Ok - report back to driver */ vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_RTPS); @@ -1527,8 +1504,6 @@ static void vtd_handle_gcmd_srtp(IntelIOMMUState *s) /* Set Interrupt Remap Table Pointer */ static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s) { - VTD_DPRINTF(CSR, "set Interrupt Remap Table Pointer"); - vtd_interrupt_remap_table_setup(s); /* Ok - report back to driver */ vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS); @@ -1541,7 +1516,7 @@ static void vtd_handle_gcmd_te(IntelIOMMUState *s, bool en) return; } - VTD_DPRINTF(CSR, "Translation Enable %s", (en ? "on" : "off")); + trace_vtd_dmar_enable(en); if (en) { s->dmar_enabled = true; @@ -1562,7 +1537,7 @@ static void vtd_handle_gcmd_te(IntelIOMMUState *s, bool en) /* Handle Interrupt Remap Enable/Disable */ static void vtd_handle_gcmd_ire(IntelIOMMUState *s, bool en) { - VTD_DPRINTF(CSR, "Interrupt Remap Enable %s", (en ? 
"on" : "off")); + trace_vtd_ir_enable(en); if (en) { s->intr_enabled = true; @@ -1582,7 +1557,7 @@ static void vtd_handle_gcmd_write(IntelIOMMUState *s) uint32_t val = vtd_get_long_raw(s, DMAR_GCMD_REG); uint32_t changed = status ^ val; - VTD_DPRINTF(CSR, "value 0x%"PRIx32 " status 0x%"PRIx32, val, status); + trace_vtd_reg_write_gcmd(status, val); if (changed & VTD_GCMD_TE) { /* Translation enable/disable */ vtd_handle_gcmd_te(s, val & VTD_GCMD_TE); @@ -1614,8 +1589,8 @@ static void vtd_handle_ccmd_write(IntelIOMMUState *s) /* Context-cache invalidation request */ if (val & VTD_CCMD_ICC) { if (s->qi_enabled) { - VTD_DPRINTF(GENERAL, "error: Queued Invalidation enabled, " - "should not use register-based invalidation"); + trace_vtd_err("Queued Invalidation enabled, " + "should not use register-based invalidation"); return; } ret = vtd_context_cache_invalidate(s, val); @@ -1623,7 +1598,6 @@ static void vtd_handle_ccmd_write(IntelIOMMUState *s) vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_ICC, 0ULL); ret = vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_CAIG_MASK, ret); - VTD_DPRINTF(INV, "CCMD_REG write-back val: 0x%"PRIx64, ret); } } @@ -1636,8 +1610,8 @@ static void vtd_handle_iotlb_write(IntelIOMMUState *s) /* IOTLB invalidation request */ if (val & VTD_TLB_IVT) { if (s->qi_enabled) { - VTD_DPRINTF(GENERAL, "error: Queued Invalidation enabled, " - "should not use register-based invalidation"); + trace_vtd_err("Queued Invalidation enabled, " + "should not use register-based invalidation."); return; } ret = vtd_iotlb_flush(s, val); @@ -1645,7 +1619,6 @@ static void vtd_handle_iotlb_write(IntelIOMMUState *s) vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG, VTD_TLB_IVT, 0ULL); ret = vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG, VTD_TLB_FLUSH_GRANU_MASK_A, ret); - VTD_DPRINTF(INV, "IOTLB_REG write-back val: 0x%"PRIx64, ret); } } @@ -1656,11 +1629,9 @@ static bool vtd_get_inv_desc(dma_addr_t base_addr, uint32_t offset, dma_addr_t addr = base_addr + offset * sizeof(*inv_desc); if (dma_memory_read(&address_space_memory, addr, inv_desc, sizeof(*inv_desc))) { - VTD_DPRINTF(GENERAL, "error: fail to fetch Invalidation Descriptor " - "base_addr 0x%"PRIx64 " offset %"PRIu32, base_addr, offset); + trace_vtd_err("Read INV DESC failed."); inv_desc->lo = 0; inv_desc->hi = 0; - return false; } inv_desc->lo = le64_to_cpu(inv_desc->lo); @@ -1746,13 +1717,11 @@ static bool vtd_process_iotlb_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) switch (inv_desc->lo & VTD_INV_DESC_IOTLB_G) { case VTD_INV_DESC_IOTLB_GLOBAL: - trace_vtd_inv_desc_iotlb_global(); vtd_iotlb_global_invalidate(s); break; case VTD_INV_DESC_IOTLB_DOMAIN: domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo); - trace_vtd_inv_desc_iotlb_domain(domain_id); vtd_iotlb_domain_invalidate(s, domain_id); break; @@ -1760,7 +1729,6 @@ static bool vtd_process_iotlb_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo); addr = VTD_INV_DESC_IOTLB_ADDR(inv_desc->hi); am = VTD_INV_DESC_IOTLB_AM(inv_desc->hi); - trace_vtd_inv_desc_iotlb_pages(domain_id, addr, am); if (am > VTD_MAMV) { trace_vtd_inv_desc_iotlb_invalid(inv_desc->hi, inv_desc->lo); return false; @@ -1778,10 +1746,9 @@ static bool vtd_process_iotlb_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) static bool vtd_process_inv_iec_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) { - VTD_DPRINTF(INV, "inv ir glob %d index %d mask %d", - inv_desc->iec.granularity, - inv_desc->iec.index, - inv_desc->iec.index_mask); + trace_vtd_inv_desc_iec(inv_desc->iec.granularity, + 
inv_desc->iec.index, + inv_desc->iec.index_mask); vtd_iec_notify_all(s, !inv_desc->iec.granularity, inv_desc->iec.index, @@ -1810,9 +1777,7 @@ static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s, if ((inv_desc->lo & VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO) || (inv_desc->hi & VTD_INV_DESC_DEVICE_IOTLB_RSVD_HI)) { - VTD_DPRINTF(GENERAL, "error: non-zero reserved field in Device " - "IOTLB Invalidate Descriptor hi 0x%"PRIx64 " lo 0x%"PRIx64, - inv_desc->hi, inv_desc->lo); + trace_vtd_inv_desc_iotlb_invalid(inv_desc->hi, inv_desc->lo); return false; } @@ -1857,7 +1822,7 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s) VTDInvDesc inv_desc; uint8_t desc_type; - VTD_DPRINTF(INV, "iq head %"PRIu16, s->iq_head); + trace_vtd_inv_qi_head(s->iq_head); if (!vtd_get_inv_desc(s->iq, s->iq_head, &inv_desc)) { s->iq_last_desc_type = VTD_INV_DESC_NONE; return false; @@ -1896,8 +1861,7 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s) break; case VTD_INV_DESC_DEVICE: - VTD_DPRINTF(INV, "Device IOTLB Invalidation Descriptor hi 0x%"PRIx64 - " lo 0x%"PRIx64, inv_desc.hi, inv_desc.lo); + trace_vtd_inv_desc("device", inv_desc.hi, inv_desc.lo); if (!vtd_process_device_iotlb_desc(s, &inv_desc)) { return false; } @@ -1917,11 +1881,11 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s) /* Try to fetch and process more Invalidation Descriptors */ static void vtd_fetch_inv_desc(IntelIOMMUState *s) { - VTD_DPRINTF(INV, "fetch Invalidation Descriptors"); + trace_vtd_inv_qi_fetch(); + if (s->iq_tail >= s->iq_size) { /* Detects an invalid Tail pointer */ - VTD_DPRINTF(GENERAL, "error: iq_tail is %"PRIu16 - " while iq_size is %"PRIu16, s->iq_tail, s->iq_size); + trace_vtd_err_qi_tail(s->iq_tail, s->iq_size); vtd_handle_inv_queue_error(s); return; } @@ -1944,7 +1908,8 @@ static void vtd_handle_iqt_write(IntelIOMMUState *s) uint64_t val = vtd_get_quad_raw(s, DMAR_IQT_REG); s->iq_tail = VTD_IQT_QT(val); - VTD_DPRINTF(INV, "set iq tail %"PRIu16, s->iq_tail); + trace_vtd_inv_qi_tail(s->iq_tail); + if (s->qi_enabled && !(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) { /* Process Invalidation Queue here */ vtd_fetch_inv_desc(s); @@ -1959,8 +1924,7 @@ static void vtd_handle_fsts_write(IntelIOMMUState *s) if ((fectl_reg & VTD_FECTL_IP) && !(fsts_reg & status_fields)) { vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0); - VTD_DPRINTF(FLOG, "all pending interrupt conditions serviced, clear " - "IP field of FECTL_REG"); + trace_vtd_fsts_clear_ip(); } /* FIXME: when IQE is Clear, should we try to fetch some Invalidation * Descriptors if there are any when Queued Invalidation is enabled? @@ -1975,11 +1939,12 @@ static void vtd_handle_fectl_write(IntelIOMMUState *s) * software clears the IM field? Or just check if the IM field is zero? 
*/ fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG); + + trace_vtd_reg_write_fectl(fectl_reg); + if ((fectl_reg & VTD_FECTL_IP) && !(fectl_reg & VTD_FECTL_IM)) { vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG); vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0); - VTD_DPRINTF(FLOG, "IM field is cleared, generate " - "fault event interrupt"); } } @@ -1989,9 +1954,8 @@ static void vtd_handle_ics_write(IntelIOMMUState *s) uint32_t iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG); if ((iectl_reg & VTD_IECTL_IP) && !(ics_reg & VTD_ICS_IWC)) { + trace_vtd_reg_ics_clear_ip(); vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0); - VTD_DPRINTF(INV, "pending completion interrupt condition serviced, " - "clear IP field of IECTL_REG"); } } @@ -2003,11 +1967,12 @@ static void vtd_handle_iectl_write(IntelIOMMUState *s) * software clears the IM field? Or just check if the IM field is zero? */ iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG); + + trace_vtd_reg_write_iectl(iectl_reg); + if ((iectl_reg & VTD_IECTL_IP) && !(iectl_reg & VTD_IECTL_IM)) { vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG); vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0); - VTD_DPRINTF(INV, "IM field is cleared, generate " - "invalidation event interrupt"); } } @@ -2016,10 +1981,10 @@ static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size) IntelIOMMUState *s = opaque; uint64_t val; + trace_vtd_reg_read(addr, size); + if (addr + size > DMAR_REG_SIZE) { - VTD_DPRINTF(GENERAL, "error: addr outside region: max 0x%"PRIx64 - ", got 0x%"PRIx64 " %d", - (uint64_t)DMAR_REG_SIZE, addr, size); + trace_vtd_err("Read MMIO over range."); return (uint64_t)-1; } @@ -2058,8 +2023,7 @@ static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size) val = vtd_get_quad(s, addr); } } - VTD_DPRINTF(CSR, "addr 0x%"PRIx64 " size %d val 0x%"PRIx64, - addr, size, val); + return val; } @@ -2068,26 +2032,22 @@ static void vtd_mem_write(void *opaque, hwaddr addr, { IntelIOMMUState *s = opaque; + trace_vtd_reg_write(addr, size, val); + if (addr + size > DMAR_REG_SIZE) { - VTD_DPRINTF(GENERAL, "error: addr outside region: max 0x%"PRIx64 - ", got 0x%"PRIx64 " %d", - (uint64_t)DMAR_REG_SIZE, addr, size); + trace_vtd_err("Write MMIO over range."); return; } switch (addr) { /* Global Command Register, 32-bit */ case DMAR_GCMD_REG: - VTD_DPRINTF(CSR, "DMAR_GCMD_REG write addr 0x%"PRIx64 - ", size %d, val 0x%"PRIx64, addr, size, val); vtd_set_long(s, addr, val); vtd_handle_gcmd_write(s); break; /* Context Command Register, 64-bit */ case DMAR_CCMD_REG: - VTD_DPRINTF(CSR, "DMAR_CCMD_REG write addr 0x%"PRIx64 - ", size %d, val 0x%"PRIx64, addr, size, val); if (size == 4) { vtd_set_long(s, addr, val); } else { @@ -2097,8 +2057,6 @@ static void vtd_mem_write(void *opaque, hwaddr addr, break; case DMAR_CCMD_REG_HI: - VTD_DPRINTF(CSR, "DMAR_CCMD_REG_HI write addr 0x%"PRIx64 - ", size %d, val 0x%"PRIx64, addr, size, val); assert(size == 4); vtd_set_long(s, addr, val); vtd_handle_ccmd_write(s); @@ -2106,8 +2064,6 @@ static void vtd_mem_write(void *opaque, hwaddr addr, /* IOTLB Invalidation Register, 64-bit */ case DMAR_IOTLB_REG: - VTD_DPRINTF(INV, "DMAR_IOTLB_REG write addr 0x%"PRIx64 - ", size %d, val 0x%"PRIx64, addr, size, val); if (size == 4) { vtd_set_long(s, addr, val); } else { @@ -2117,8 +2073,6 @@ static void vtd_mem_write(void *opaque, hwaddr addr, break; case DMAR_IOTLB_REG_HI: - VTD_DPRINTF(INV, "DMAR_IOTLB_REG_HI write addr 0x%"PRIx64 - ", size %d, val 0x%"PRIx64, addr, size, val); 
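/*
 * Note on the conversion pattern in vtd_mem_read()/vtd_mem_write(): the
 * per-register VTD_DPRINTF calls removed above and below are deliberately
 * not replaced one for one. A single trace_vtd_reg_read()/trace_vtd_reg_write()
 * at function entry now records every MMIO access, and only the error path
 * keeps a dedicated event. A rough sketch of the resulting shape (trimmed,
 * not the literal code):
 *
 *   static void vtd_mem_write(void *opaque, hwaddr addr,
 *                             uint64_t val, unsigned size)
 *   {
 *       IntelIOMMUState *s = opaque;
 *
 *       trace_vtd_reg_write(addr, size, val);     // one event for all regs
 *       if (addr + size > DMAR_REG_SIZE) {
 *           trace_vtd_err("Write MMIO over range.");
 *           return;
 *       }
 *       switch (addr) {                           // cases keep only the logic
 *       ...
 *       }
 *   }
 */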
assert(size == 4); vtd_set_long(s, addr, val); vtd_handle_iotlb_write(s); @@ -2126,8 +2080,6 @@ static void vtd_mem_write(void *opaque, hwaddr addr, /* Invalidate Address Register, 64-bit */ case DMAR_IVA_REG: - VTD_DPRINTF(INV, "DMAR_IVA_REG write addr 0x%"PRIx64 - ", size %d, val 0x%"PRIx64, addr, size, val); if (size == 4) { vtd_set_long(s, addr, val); } else { @@ -2136,16 +2088,12 @@ static void vtd_mem_write(void *opaque, hwaddr addr, break; case DMAR_IVA_REG_HI: - VTD_DPRINTF(INV, "DMAR_IVA_REG_HI write addr 0x%"PRIx64 - ", size %d, val 0x%"PRIx64, addr, size, val); assert(size == 4); vtd_set_long(s, addr, val); break; /* Fault Status Register, 32-bit */ case DMAR_FSTS_REG: - VTD_DPRINTF(FLOG, "DMAR_FSTS_REG write addr 0x%"PRIx64 - ", size %d, val 0x%"PRIx64, addr, size, val); assert(size == 4); vtd_set_long(s, addr, val); vtd_handle_fsts_write(s); @@ -2153,8 +2101,6 @@ static void vtd_mem_write(void *opaque, hwaddr addr, /* Fault Event Control Register, 32-bit */ case DMAR_FECTL_REG: - VTD_DPRINTF(FLOG, "DMAR_FECTL_REG write addr 0x%"PRIx64 - ", size %d, val 0x%"PRIx64, addr, size, val); assert(size == 4); vtd_set_long(s, addr, val); vtd_handle_fectl_write(s); @@ -2162,40 +2108,30 @@ static void vtd_mem_write(void *opaque, hwaddr addr, /* Fault Event Data Register, 32-bit */ case DMAR_FEDATA_REG: - VTD_DPRINTF(FLOG, "DMAR_FEDATA_REG write addr 0x%"PRIx64 - ", size %d, val 0x%"PRIx64, addr, size, val); assert(size == 4); vtd_set_long(s, addr, val); break; /* Fault Event Address Register, 32-bit */ case DMAR_FEADDR_REG: - VTD_DPRINTF(FLOG, "DMAR_FEADDR_REG write addr 0x%"PRIx64 - ", size %d, val 0x%"PRIx64, addr, size, val); assert(size == 4); vtd_set_long(s, addr, val); break; /* Fault Event Upper Address Register, 32-bit */ case DMAR_FEUADDR_REG: - VTD_DPRINTF(FLOG, "DMAR_FEUADDR_REG write addr 0x%"PRIx64 - ", size %d, val 0x%"PRIx64, addr, size, val); assert(size == 4); vtd_set_long(s, addr, val); break; /* Protected Memory Enable Register, 32-bit */ case DMAR_PMEN_REG: - VTD_DPRINTF(CSR, "DMAR_PMEN_REG write addr 0x%"PRIx64 - ", size %d, val 0x%"PRIx64, addr, size, val); assert(size == 4); vtd_set_long(s, addr, val); break; /* Root Table Address Register, 64-bit */ case DMAR_RTADDR_REG: - VTD_DPRINTF(CSR, "DMAR_RTADDR_REG write addr 0x%"PRIx64 - ", size %d, val 0x%"PRIx64, addr, size, val); if (size == 4) { vtd_set_long(s, addr, val); } else { @@ -2204,16 +2140,12 @@ static void vtd_mem_write(void *opaque, hwaddr addr, break; case DMAR_RTADDR_REG_HI: - VTD_DPRINTF(CSR, "DMAR_RTADDR_REG_HI write addr 0x%"PRIx64 - ", size %d, val 0x%"PRIx64, addr, size, val); assert(size == 4); vtd_set_long(s, addr, val); break; /* Invalidation Queue Tail Register, 64-bit */ case DMAR_IQT_REG: - VTD_DPRINTF(INV, "DMAR_IQT_REG write addr 0x%"PRIx64 - ", size %d, val 0x%"PRIx64, addr, size, val); if (size == 4) { vtd_set_long(s, addr, val); } else { @@ -2223,8 +2155,6 @@ static void vtd_mem_write(void *opaque, hwaddr addr, break; case DMAR_IQT_REG_HI: - VTD_DPRINTF(INV, "DMAR_IQT_REG_HI write addr 0x%"PRIx64 - ", size %d, val 0x%"PRIx64, addr, size, val); assert(size == 4); vtd_set_long(s, addr, val); /* 19:63 of IQT_REG is RsvdZ, do nothing here */ @@ -2232,8 +2162,6 @@ static void vtd_mem_write(void *opaque, hwaddr addr, /* Invalidation Queue Address Register, 64-bit */ case DMAR_IQA_REG: - VTD_DPRINTF(INV, "DMAR_IQA_REG write addr 0x%"PRIx64 - ", size %d, val 0x%"PRIx64, addr, size, val); if (size == 4) { vtd_set_long(s, addr, val); } else { @@ -2242,16 +2170,12 @@ static void vtd_mem_write(void 
*opaque, hwaddr addr, break; case DMAR_IQA_REG_HI: - VTD_DPRINTF(INV, "DMAR_IQA_REG_HI write addr 0x%"PRIx64 - ", size %d, val 0x%"PRIx64, addr, size, val); assert(size == 4); vtd_set_long(s, addr, val); break; /* Invalidation Completion Status Register, 32-bit */ case DMAR_ICS_REG: - VTD_DPRINTF(INV, "DMAR_ICS_REG write addr 0x%"PRIx64 - ", size %d, val 0x%"PRIx64, addr, size, val); assert(size == 4); vtd_set_long(s, addr, val); vtd_handle_ics_write(s); @@ -2259,8 +2183,6 @@ static void vtd_mem_write(void *opaque, hwaddr addr, /* Invalidation Event Control Register, 32-bit */ case DMAR_IECTL_REG: - VTD_DPRINTF(INV, "DMAR_IECTL_REG write addr 0x%"PRIx64 - ", size %d, val 0x%"PRIx64, addr, size, val); assert(size == 4); vtd_set_long(s, addr, val); vtd_handle_iectl_write(s); @@ -2268,32 +2190,24 @@ static void vtd_mem_write(void *opaque, hwaddr addr, /* Invalidation Event Data Register, 32-bit */ case DMAR_IEDATA_REG: - VTD_DPRINTF(INV, "DMAR_IEDATA_REG write addr 0x%"PRIx64 - ", size %d, val 0x%"PRIx64, addr, size, val); assert(size == 4); vtd_set_long(s, addr, val); break; /* Invalidation Event Address Register, 32-bit */ case DMAR_IEADDR_REG: - VTD_DPRINTF(INV, "DMAR_IEADDR_REG write addr 0x%"PRIx64 - ", size %d, val 0x%"PRIx64, addr, size, val); assert(size == 4); vtd_set_long(s, addr, val); break; /* Invalidation Event Upper Address Register, 32-bit */ case DMAR_IEUADDR_REG: - VTD_DPRINTF(INV, "DMAR_IEUADDR_REG write addr 0x%"PRIx64 - ", size %d, val 0x%"PRIx64, addr, size, val); assert(size == 4); vtd_set_long(s, addr, val); break; /* Fault Recording Registers, 128-bit */ case DMAR_FRCD_REG_0_0: - VTD_DPRINTF(FLOG, "DMAR_FRCD_REG_0_0 write addr 0x%"PRIx64 - ", size %d, val 0x%"PRIx64, addr, size, val); if (size == 4) { vtd_set_long(s, addr, val); } else { @@ -2302,15 +2216,11 @@ static void vtd_mem_write(void *opaque, hwaddr addr, break; case DMAR_FRCD_REG_0_1: - VTD_DPRINTF(FLOG, "DMAR_FRCD_REG_0_1 write addr 0x%"PRIx64 - ", size %d, val 0x%"PRIx64, addr, size, val); assert(size == 4); vtd_set_long(s, addr, val); break; case DMAR_FRCD_REG_0_2: - VTD_DPRINTF(FLOG, "DMAR_FRCD_REG_0_2 write addr 0x%"PRIx64 - ", size %d, val 0x%"PRIx64, addr, size, val); if (size == 4) { vtd_set_long(s, addr, val); } else { @@ -2321,8 +2231,6 @@ static void vtd_mem_write(void *opaque, hwaddr addr, break; case DMAR_FRCD_REG_0_3: - VTD_DPRINTF(FLOG, "DMAR_FRCD_REG_0_3 write addr 0x%"PRIx64 - ", size %d, val 0x%"PRIx64, addr, size, val); assert(size == 4); vtd_set_long(s, addr, val); /* May clear bit 127 (Fault), update PPF */ @@ -2330,8 +2238,6 @@ static void vtd_mem_write(void *opaque, hwaddr addr, break; case DMAR_IRTA_REG: - VTD_DPRINTF(IR, "DMAR_IRTA_REG write addr 0x%"PRIx64 - ", size %d, val 0x%"PRIx64, addr, size, val); if (size == 4) { vtd_set_long(s, addr, val); } else { @@ -2340,15 +2246,11 @@ static void vtd_mem_write(void *opaque, hwaddr addr, break; case DMAR_IRTA_REG_HI: - VTD_DPRINTF(IR, "DMAR_IRTA_REG_HI write addr 0x%"PRIx64 - ", size %d, val 0x%"PRIx64, addr, size, val); assert(size == 4); vtd_set_long(s, addr, val); break; default: - VTD_DPRINTF(GENERAL, "error: unhandled reg write addr 0x%"PRIx64 - ", size %d, val 0x%"PRIx64, addr, size, val); if (size == 4) { vtd_set_long(s, addr, val); } else { @@ -2362,31 +2264,38 @@ static IOMMUTLBEntry vtd_iommu_translate(MemoryRegion *iommu, hwaddr addr, { VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); IntelIOMMUState *s = vtd_as->iommu_state; - IOMMUTLBEntry ret = { + IOMMUTLBEntry iotlb = { + /* We'll fill in the rest 
later. */ .target_as = &address_space_memory, - .iova = addr, - .translated_addr = 0, - .addr_mask = ~(hwaddr)0, - .perm = IOMMU_NONE, }; + bool success; - if (!s->dmar_enabled) { + if (likely(s->dmar_enabled)) { + success = vtd_do_iommu_translate(vtd_as, vtd_as->bus, vtd_as->devfn, + addr, flag & IOMMU_WO, &iotlb); + } else { /* DMAR disabled, passthrough, use 4k-page*/ - ret.iova = addr & VTD_PAGE_MASK_4K; - ret.translated_addr = addr & VTD_PAGE_MASK_4K; - ret.addr_mask = ~VTD_PAGE_MASK_4K; - ret.perm = IOMMU_RW; - return ret; + iotlb.iova = addr & VTD_PAGE_MASK_4K; + iotlb.translated_addr = addr & VTD_PAGE_MASK_4K; + iotlb.addr_mask = ~VTD_PAGE_MASK_4K; + iotlb.perm = IOMMU_RW; + success = true; } - vtd_do_iommu_translate(vtd_as, vtd_as->bus, vtd_as->devfn, addr, - flag & IOMMU_WO, &ret); - VTD_DPRINTF(MMU, - "bus %"PRIu8 " slot %"PRIu8 " func %"PRIu8 " devfn %"PRIu8 - " iova 0x%"PRIx64 " hpa 0x%"PRIx64, pci_bus_num(vtd_as->bus), - VTD_PCI_SLOT(vtd_as->devfn), VTD_PCI_FUNC(vtd_as->devfn), - vtd_as->devfn, addr, ret.translated_addr); - return ret; + if (likely(success)) { + trace_vtd_dmar_translate(pci_bus_num(vtd_as->bus), + VTD_PCI_SLOT(vtd_as->devfn), + VTD_PCI_FUNC(vtd_as->devfn), + iotlb.iova, iotlb.translated_addr, + iotlb.addr_mask); + } else { + trace_vtd_err_dmar_translate(pci_bus_num(vtd_as->bus), + VTD_PCI_SLOT(vtd_as->devfn), + VTD_PCI_FUNC(vtd_as->devfn), + iotlb.iova); + } + + return iotlb; } static void vtd_iommu_notify_flag_changed(MemoryRegion *iommu, @@ -2484,25 +2393,23 @@ static int vtd_irte_get(IntelIOMMUState *iommu, uint16_t index, addr = iommu->intr_root + index * sizeof(*entry); if (dma_memory_read(&address_space_memory, addr, entry, sizeof(*entry))) { - VTD_DPRINTF(GENERAL, "error: fail to access IR root at 0x%"PRIx64 - " + %"PRIu16, iommu->intr_root, index); + trace_vtd_err("Memory read failed for IRTE."); return -VTD_FR_IR_ROOT_INVAL; } + trace_vtd_ir_irte_get(index, le64_to_cpu(entry->data[1]), + le64_to_cpu(entry->data[0])); + if (!entry->irte.present) { - VTD_DPRINTF(GENERAL, "error: present flag not set in IRTE" - " entry index %u value 0x%"PRIx64 " 0x%"PRIx64, - index, le64_to_cpu(entry->data[1]), - le64_to_cpu(entry->data[0])); + trace_vtd_err_irte(index, le64_to_cpu(entry->data[1]), + le64_to_cpu(entry->data[0])); return -VTD_FR_IR_ENTRY_P; } if (entry->irte.__reserved_0 || entry->irte.__reserved_1 || entry->irte.__reserved_2) { - VTD_DPRINTF(GENERAL, "error: IRTE entry index %"PRIu16 - " reserved fields non-zero: 0x%"PRIx64 " 0x%"PRIx64, - index, le64_to_cpu(entry->data[1]), - le64_to_cpu(entry->data[0])); + trace_vtd_err_irte(index, le64_to_cpu(entry->data[1]), + le64_to_cpu(entry->data[0])); return -VTD_FR_IR_IRTE_RSVD; } @@ -2511,15 +2418,12 @@ static int vtd_irte_get(IntelIOMMUState *iommu, uint16_t index, source_id = le32_to_cpu(entry->irte.source_id); switch (entry->irte.sid_vtype) { case VTD_SVT_NONE: - VTD_DPRINTF(IR, "No SID validation for IRTE index %d", index); break; case VTD_SVT_ALL: mask = vtd_svt_mask[entry->irte.sid_q]; if ((source_id & mask) != (sid & mask)) { - VTD_DPRINTF(GENERAL, "SID validation for IRTE index " - "%d failed (reqid 0x%04x sid 0x%04x)", index, - sid, source_id); + trace_vtd_err_irte_sid(index, sid, source_id); return -VTD_FR_IR_SID_ERR; } break; @@ -2529,16 +2433,13 @@ static int vtd_irte_get(IntelIOMMUState *iommu, uint16_t index, bus_min = source_id & 0xff; bus = sid >> 8; if (bus > bus_max || bus < bus_min) { - VTD_DPRINTF(GENERAL, "SID validation for IRTE index %d " - "failed (bus %d outside %d-%d)", index, bus, 
- bus_min, bus_max); + trace_vtd_err_irte_sid_bus(index, bus, bus_min, bus_max); return -VTD_FR_IR_SID_ERR; } break; default: - VTD_DPRINTF(GENERAL, "Invalid SVT bits (0x%x) in IRTE index " - "%d", entry->irte.sid_vtype, index); + trace_vtd_err_irte_svt(index, entry->irte.sid_vtype); /* Take this as verification failure. */ return -VTD_FR_IR_SID_ERR; break; @@ -2573,10 +2474,8 @@ static int vtd_remap_irq_get(IntelIOMMUState *iommu, uint16_t index, irq->dest_mode = irte.irte.dest_mode; irq->redir_hint = irte.irte.redir_hint; - VTD_DPRINTF(IR, "remapping interrupt index %d: trig:%u,vec:%u," - "deliver:%u,dest:%u,dest_mode:%u", index, - irq->trigger_mode, irq->vector, irq->delivery_mode, - irq->dest, irq->dest_mode); + trace_vtd_ir_remap(index, irq->trigger_mode, irq->vector, + irq->delivery_mode, irq->dest, irq->dest_mode); return 0; } @@ -2618,28 +2517,29 @@ static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu, assert(origin && translated); + trace_vtd_ir_remap_msi_req(origin->address, origin->data); + if (!iommu || !iommu->intr_enabled) { - goto do_not_translate; + memcpy(translated, origin, sizeof(*origin)); + goto out; } if (origin->address & VTD_MSI_ADDR_HI_MASK) { - VTD_DPRINTF(GENERAL, "error: MSI addr high 32 bits nonzero" - " during interrupt remapping: 0x%"PRIx32, - (uint32_t)((origin->address & VTD_MSI_ADDR_HI_MASK) >> \ - VTD_MSI_ADDR_HI_SHIFT)); + trace_vtd_err("MSI address high 32 bits non-zero when " + "Interrupt Remapping enabled."); return -VTD_FR_IR_REQ_RSVD; } addr.data = origin->address & VTD_MSI_ADDR_LO_MASK; if (addr.addr.__head != 0xfee) { - VTD_DPRINTF(GENERAL, "error: MSI addr low 32 bits invalid: " - "0x%"PRIx32, addr.data); + trace_vtd_err("MSI addr low 32 bit invalid."); return -VTD_FR_IR_REQ_RSVD; } /* This is compatible mode. */ if (addr.addr.int_mode != VTD_IR_INT_FORMAT_REMAP) { - goto do_not_translate; + memcpy(translated, origin, sizeof(*origin)); + goto out; } index = addr.addr.index_h << 15 | le16_to_cpu(addr.addr.index_l); @@ -2658,34 +2558,28 @@ static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu, } if (addr.addr.sub_valid) { - VTD_DPRINTF(IR, "received MSI interrupt"); + trace_vtd_ir_remap_type("MSI"); if (origin->data & VTD_IR_MSI_DATA_RESERVED) { - VTD_DPRINTF(GENERAL, "error: MSI data bits non-zero for " - "interrupt remappable entry: 0x%"PRIx32, - origin->data); + trace_vtd_err_ir_msi_invalid(sid, origin->address, origin->data); return -VTD_FR_IR_REQ_RSVD; } } else { uint8_t vector = origin->data & 0xff; uint8_t trigger_mode = (origin->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1; - VTD_DPRINTF(IR, "received IOAPIC interrupt"); + trace_vtd_ir_remap_type("IOAPIC"); /* IOAPIC entry vector should be aligned with IRTE vector * (see vt-d spec 5.1.5.1). */ if (vector != irq.vector) { - VTD_DPRINTF(GENERAL, "IOAPIC vector inconsistent: " - "entry: %d, IRTE: %d, index: %d", - vector, irq.vector, index); + trace_vtd_warn_ir_vector(sid, index, vector, irq.vector); } /* The Trigger Mode field must match the Trigger Mode in the IRTE. * (see vt-d spec 5.1.5.1). 
*/ if (trigger_mode != irq.trigger_mode) { - VTD_DPRINTF(GENERAL, "IOAPIC trigger mode inconsistent: " - "entry: %u, IRTE: %u, index: %d", - trigger_mode, irq.trigger_mode, index); + trace_vtd_warn_ir_trigger(sid, index, trigger_mode, + irq.trigger_mode); } - } /* @@ -2697,13 +2591,9 @@ static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu, /* Translate VTDIrq to MSI message */ vtd_generate_msi_message(&irq, translated); - VTD_DPRINTF(IR, "mapping MSI 0x%"PRIx64":0x%"PRIx32 " -> " - "0x%"PRIx64":0x%"PRIx32, origin->address, origin->data, - translated->address, translated->data); - return 0; - -do_not_translate: - memcpy(translated, origin, sizeof(*origin)); +out: + trace_vtd_ir_remap_msi(origin->address, origin->data, + translated->address, translated->data); return 0; } @@ -2740,16 +2630,10 @@ static MemTxResult vtd_mem_ir_write(void *opaque, hwaddr addr, ret = vtd_interrupt_remap_msi(opaque, &from, &to, sid); if (ret) { /* TODO: report error */ - VTD_DPRINTF(GENERAL, "int remap fail for addr 0x%"PRIx64 - " data 0x%"PRIx32, from.address, from.data); /* Drop this interrupt */ return MEMTX_ERROR; } - VTD_DPRINTF(IR, "delivering MSI 0x%"PRIx64":0x%"PRIx32 - " for device sid 0x%04x", - to.address, to.data, sid); - apic_get_class()->send_msi(&to); return MEMTX_OK; @@ -3052,7 +2936,6 @@ static void vtd_reset(DeviceState *dev) { IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev); - VTD_DPRINTF(GENERAL, ""); vtd_init(s); /* @@ -3125,7 +3008,6 @@ static void vtd_realize(DeviceState *dev, Error **errp) } bus = pcms->bus; - VTD_DPRINTF(GENERAL, ""); x86_iommu->type = TYPE_INTEL; if (!vtd_decide_config(s, errp)) { @@ -3173,7 +3055,6 @@ static const TypeInfo vtd_info = { static void vtd_register_types(void) { - VTD_DPRINTF(GENERAL, ""); type_register_static(&vtd_info); } diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h index 0e73a65bf2..f50ecd8b73 100644 --- a/hw/i386/intel_iommu_internal.h +++ b/hw/i386/intel_iommu_internal.h @@ -384,6 +384,7 @@ typedef struct VTDIOTLBPageInvInfo VTDIOTLBPageInvInfo; /* Pagesize of VTD paging structures, including root and context tables */ #define VTD_PAGE_SHIFT 12 #define VTD_PAGE_SIZE (1ULL << VTD_PAGE_SHIFT) +#define VTD_PAGE_MASK (VTD_PAGE_SIZE - 1) #define VTD_PAGE_SHIFT_4K 12 #define VTD_PAGE_MASK_4K (~((1ULL << VTD_PAGE_SHIFT_4K) - 1)) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 02f9a8fe91..224fe58fe7 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1692,6 +1692,7 @@ static void pc_dimm_plug(HotplugHandler *hotplug_dev, PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm); MemoryRegion *mr = ddc->get_memory_region(dimm); uint64_t align = TARGET_PAGE_SIZE; + bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM); if (memory_region_get_alignment(mr) && pcmc->enforce_aligned_dimm) { align = memory_region_get_alignment(mr); @@ -1703,17 +1704,18 @@ static void pc_dimm_plug(HotplugHandler *hotplug_dev, goto out; } + if (is_nvdimm && !pcms->acpi_nvdimm_state.is_enabled) { + error_setg(&local_err, + "nvdimm is not enabled: missing 'nvdimm' in '-M'"); + goto out; + } + pc_dimm_memory_plug(dev, &pcms->hotplug_memory, mr, align, &local_err); if (local_err) { goto out; } - if (object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM)) { - if (!pcms->acpi_nvdimm_state.is_enabled) { - error_setg(&local_err, - "nvdimm is not enabled: missing 'nvdimm' in '-M'"); - goto out; - } + if (is_nvdimm) { nvdimm_plug(&pcms->acpi_nvdimm_state); } diff --git a/hw/i386/trace-events b/hw/i386/trace-events index 72556dad48..5f111d6dde 100644 --- a/hw/i386/trace-events 
+++ b/hw/i386/trace-events @@ -19,6 +19,13 @@ vtd_inv_desc_wait_sw(uint64_t addr, uint32_t data) "wait invalidate status write vtd_inv_desc_wait_irq(const char *msg) "%s" vtd_inv_desc_wait_invalid(uint64_t hi, uint64_t lo) "invalid wait desc hi 0x%"PRIx64" lo 0x%"PRIx64 vtd_inv_desc_wait_write_fail(uint64_t hi, uint64_t lo) "write fail for wait desc hi 0x%"PRIx64" lo 0x%"PRIx64 +vtd_inv_desc_iec(uint32_t granularity, uint32_t index, uint32_t mask) "granularity 0x%"PRIx32" index 0x%"PRIx32" mask 0x%"PRIx32 +vtd_inv_qi_enable(bool enable) "enabled %d" +vtd_inv_qi_setup(uint64_t addr, int size) "addr 0x%"PRIx64" size %d" +vtd_inv_qi_head(uint16_t head) "read head %d" +vtd_inv_qi_tail(uint16_t tail) "write tail %d" +vtd_inv_qi_fetch(void) "" +vtd_context_cache_reset(void) "" vtd_re_not_present(uint8_t bus) "Root entry bus %"PRIu8" not present" vtd_re_invalid(uint64_t hi, uint64_t lo) "invalid root entry hi 0x%"PRIx64" lo 0x%"PRIx64 vtd_ce_not_present(uint8_t bus, uint8_t devfn) "Context entry bus %"PRIu8" devfn %"PRIu8" not present" @@ -40,6 +47,43 @@ vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device vtd_as_unmap_whole(uint8_t bus, uint8_t slot, uint8_t fn, uint64_t iova, uint64_t size) "Device %02x:%02x.%x start 0x%"PRIx64" size 0x%"PRIx64 vtd_translate_pt(uint16_t sid, uint64_t addr) "source id 0x%"PRIu16", iova 0x%"PRIx64 vtd_pt_enable_fast_path(uint16_t sid, bool success) "sid 0x%"PRIu16" %d" +vtd_irq_generate(uint64_t addr, uint64_t data) "addr 0x%"PRIx64" data 0x%"PRIx64 +vtd_reg_read(uint64_t addr, uint64_t size) "addr 0x%"PRIx64" size 0x%"PRIx64 +vtd_reg_write(uint64_t addr, uint64_t size, uint64_t val) "addr 0x%"PRIx64" size 0x%"PRIx64" value 0x%"PRIx64 +vtd_reg_dmar_root(uint64_t addr, bool extended) "addr 0x%"PRIx64" extended %d" +vtd_reg_ir_root(uint64_t addr, uint32_t size) "addr 0x%"PRIx64" size 0x%"PRIx32 +vtd_reg_write_gcmd(uint32_t status, uint32_t val) "status 0x%"PRIx32" value 0x%"PRIx32 +vtd_reg_write_fectl(uint32_t value) "value 0x%"PRIx32 +vtd_reg_write_iectl(uint32_t value) "value 0x%"PRIx32 +vtd_reg_ics_clear_ip(void) "" +vtd_dmar_translate(uint8_t bus, uint8_t slot, uint8_t func, uint64_t iova, uint64_t gpa, uint64_t mask) "dev %02x:%02x.%02x iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64 +vtd_dmar_enable(bool en) "enable %d" +vtd_dmar_fault(uint16_t sid, int fault, uint64_t addr, bool is_write) "sid 0x%"PRIx16" fault %d addr 0x%"PRIx64" write %d" +vtd_ir_enable(bool en) "enable %d" +vtd_ir_irte_get(int index, uint64_t lo, uint64_t hi) "index %d low 0x%"PRIx64" high 0x%"PRIx64 +vtd_ir_remap(int index, int tri, int vec, int deliver, uint32_t dest, int dest_mode) "index %d trigger %d vector %d deliver %d dest 0x%"PRIx32" mode %d" +vtd_ir_remap_type(const char *type) "%s" +vtd_ir_remap_msi(uint64_t addr, uint64_t data, uint64_t addr2, uint64_t data2) "(addr 0x%"PRIx64", data 0x%"PRIx64") -> (addr 0x%"PRIx64", data 0x%"PRIx64")" +vtd_ir_remap_msi_req(uint64_t addr, uint64_t data) "addr 0x%"PRIx64" data 0x%"PRIx64 +vtd_fsts_ppf(bool set) "FSTS PPF bit set to %d" +vtd_fsts_clear_ip(void) "" +vtd_frr_new(int index, uint64_t hi, uint64_t lo) "index %d high 0x%"PRIx64" low 0x%"PRIx64 +vtd_err(const char *str) "%s" +vtd_err_dmar_iova_overflow(uint64_t iova) "iova 0x%"PRIx64 +vtd_err_dmar_slpte_read_error(uint64_t iova, int level) "iova 0x%"PRIx64" level %d" +vtd_err_dmar_slpte_perm_error(uint64_t iova, int level, uint64_t slpte, bool is_write) "iova 0x%"PRIx64" level %d slpte 0x%"PRIx64" write %d" +vtd_err_dmar_slpte_resv_error(uint64_t iova,
int level, uint64_t slpte) "iova 0x%"PRIx64" level %d slpte 0x%"PRIx64 +vtd_err_dmar_translate(uint8_t bus, uint8_t slot, uint8_t func, uint64_t iova) "dev %02x:%02x.%02x iova 0x%"PRIx64 +vtd_err_qi_enable(uint16_t tail) "tail 0x%"PRIx16 +vtd_err_qi_disable(uint16_t head, uint16_t tail, int type) "head 0x%"PRIx16" tail 0x%"PRIx16" last_desc_type %d" +vtd_err_qi_tail(uint16_t tail, uint16_t size) "tail 0x%"PRIx16" size 0x%"PRIx16 +vtd_err_irte(int index, uint64_t lo, uint64_t hi) "index %d low 0x%"PRIx64" high 0x%"PRIx64 +vtd_err_irte_sid(int index, uint16_t req, uint16_t target) "index %d SVT_ALL sid 0x%"PRIx16" (should be: 0x%"PRIx16")" +vtd_err_irte_sid_bus(int index, uint8_t bus, uint8_t min, uint8_t max) "index %d SVT_BUS bus 0x%"PRIx8" (should be: 0x%"PRIx8"-0x%"PRIx8")" +vtd_err_irte_svt(int index, int type) "index %d SVT type %d" +vtd_err_ir_msi_invalid(uint16_t sid, uint64_t addr, uint64_t data) "sid 0x%"PRIx16" addr 0x%"PRIx64" data 0x%"PRIx64 +vtd_warn_ir_vector(uint16_t sid, int index, int vec, int target) "sid 0x%"PRIx16" index %d vec %d (should be: %d)" +vtd_warn_ir_trigger(uint16_t sid, int index, int trig, int target) "sid 0x%"PRIx16" index %d trigger %d (should be: %d)" # hw/i386/amd_iommu.c amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" + offset 0x%"PRIx32 diff --git a/hw/misc/ivshmem.c b/hw/misc/ivshmem.c index 6367d041f0..2f0819d977 100644 --- a/hw/misc/ivshmem.c +++ b/hw/misc/ivshmem.c @@ -491,9 +491,9 @@ static void setup_interrupt(IVShmemState *s, int vector, Error **errp) static void process_msg_shmem(IVShmemState *s, int fd, Error **errp) { + Error *local_err = NULL; struct stat buf; size_t size; - void *ptr; if (s->ivshmem_bar2) { error_setg(errp, "server sent unexpected shared memory message"); @@ -522,15 +522,13 @@ static void process_msg_shmem(IVShmemState *s, int fd, Error **errp) } /* mmap the region and map into the BAR2 */ - ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - if (ptr == MAP_FAILED) { - error_setg_errno(errp, errno, "Failed to mmap shared memory"); - close(fd); + memory_region_init_ram_from_fd(&s->server_bar2, OBJECT(s), + "ivshmem.bar2", size, true, fd, &local_err); + if (local_err) { + error_propagate(errp, local_err); return; } - memory_region_init_ram_ptr(&s->server_bar2, OBJECT(s), - "ivshmem.bar2", size, ptr); - memory_region_set_fd(&s->server_bar2, fd); + s->ivshmem_bar2 = &s->server_bar2; } diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c index 564f6cbb14..0e472f2ed4 100644 --- a/hw/pci-host/q35.c +++ b/hw/pci-host/q35.c @@ -133,7 +133,7 @@ static void q35_host_get_mmcfg_size(Object *obj, Visitor *v, const char *name, visit_type_uint64(v, name, &e->size, errp); } -static Property mch_props[] = { +static Property q35_host_props[] = { DEFINE_PROP_UINT64(PCIE_HOST_MCFG_BASE, Q35PCIHost, parent_obj.base_addr, MCH_HOST_BRIDGE_PCIEXBAR_DEFAULT), DEFINE_PROP_SIZE(PCI_HOST_PROP_PCI_HOLE64_SIZE, Q35PCIHost, @@ -153,7 +153,7 @@ static void q35_host_class_init(ObjectClass *klass, void *data) hc->root_bus_path = q35_host_root_bus_path; dc->realize = q35_host_realize; - dc->props = mch_props; + dc->props = q35_host_props; /* Reason: needs to be wired up by pc_q35_init */ dc->user_creatable = false; set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories); @@ -368,7 +368,7 @@ static void mch_update_smram(MCHPCIState *mch) tseg_size = 1024 * 1024 * 8; break; default: - tseg_size = 0; + tseg_size = 1024 * 1024 * (uint32_t)mch->ext_tseg_mbytes; break; } } else { @@ -391,6 +391,17 @@ static void 
mch_update_smram(MCHPCIState *mch) memory_region_transaction_commit(); } +static void mch_update_ext_tseg_mbytes(MCHPCIState *mch) +{ + PCIDevice *pd = PCI_DEVICE(mch); + uint8_t *reg = pd->config + MCH_HOST_BRIDGE_EXT_TSEG_MBYTES; + + if (mch->ext_tseg_mbytes > 0 && + pci_get_word(reg) == MCH_HOST_BRIDGE_EXT_TSEG_MBYTES_QUERY) { + pci_set_word(reg, mch->ext_tseg_mbytes); + } +} + static void mch_write_config(PCIDevice *d, uint32_t address, uint32_t val, int len) { @@ -412,6 +423,11 @@ static void mch_write_config(PCIDevice *d, MCH_HOST_BRIDGE_SMRAM_SIZE)) { mch_update_smram(mch); } + + if (ranges_overlap(address, len, MCH_HOST_BRIDGE_EXT_TSEG_MBYTES, + MCH_HOST_BRIDGE_EXT_TSEG_MBYTES_SIZE)) { + mch_update_ext_tseg_mbytes(mch); + } } static void mch_update(MCHPCIState *mch) @@ -419,6 +435,7 @@ static void mch_update(MCHPCIState *mch) mch_update_pciexbar(mch); mch_update_pam(mch); mch_update_smram(mch); + mch_update_ext_tseg_mbytes(mch); } static int mch_post_load(void *opaque, int version_id) @@ -456,6 +473,11 @@ static void mch_reset(DeviceState *qdev) d->wmask[MCH_HOST_BRIDGE_SMRAM] = MCH_HOST_BRIDGE_SMRAM_WMASK; d->wmask[MCH_HOST_BRIDGE_ESMRAMC] = MCH_HOST_BRIDGE_ESMRAMC_WMASK; + if (mch->ext_tseg_mbytes > 0) { + pci_set_word(d->config + MCH_HOST_BRIDGE_EXT_TSEG_MBYTES, + MCH_HOST_BRIDGE_EXT_TSEG_MBYTES_QUERY); + } + mch_update(mch); } @@ -464,6 +486,12 @@ static void mch_realize(PCIDevice *d, Error **errp) int i; MCHPCIState *mch = MCH_PCI_DEVICE(d); + if (mch->ext_tseg_mbytes > MCH_HOST_BRIDGE_EXT_TSEG_MBYTES_MAX) { + error_setg(errp, "invalid extended-tseg-mbytes value: %" PRIu16, + mch->ext_tseg_mbytes); + return; + } + /* setup pci memory mapping */ pc_pci_as_mapping_init(OBJECT(mch), mch->system_memory, mch->pci_address_space); @@ -529,6 +557,12 @@ uint64_t mch_mcfg_base(void) return MCH_HOST_BRIDGE_PCIEXBAR_DEFAULT; } +static Property mch_props[] = { + DEFINE_PROP_UINT16("extended-tseg-mbytes", MCHPCIState, ext_tseg_mbytes, + 16), + DEFINE_PROP_END_OF_LIST(), +}; + static void mch_class_init(ObjectClass *klass, void *data) { PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); @@ -537,6 +571,7 @@ static void mch_class_init(ObjectClass *klass, void *data) k->realize = mch_realize; k->config_write = mch_write_config; dc->reset = mch_reset; + dc->props = mch_props; set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories); dc->desc = "Host bridge"; dc->vmsd = &vmstate_mch; diff --git a/hw/scsi/Makefile.objs b/hw/scsi/Makefile.objs index 54d8754e9a..b188f7242b 100644 --- a/hw/scsi/Makefile.objs +++ b/hw/scsi/Makefile.objs @@ -11,4 +11,5 @@ obj-$(CONFIG_PSERIES) += spapr_vscsi.o ifeq ($(CONFIG_VIRTIO),y) obj-y += virtio-scsi.o virtio-scsi-dataplane.o obj-$(CONFIG_VHOST_SCSI) += vhost-scsi-common.o vhost-scsi.o +obj-$(CONFIG_VHOST_USER_SCSI) += vhost-scsi-common.o vhost-user-scsi.o endif diff --git a/hw/scsi/megasas.c b/hw/scsi/megasas.c index 804122ab05..734fdaef90 100644 --- a/hw/scsi/megasas.c +++ b/hw/scsi/megasas.c @@ -63,6 +63,7 @@ typedef struct MegasasCmd { hwaddr pa; hwaddr pa_size; + uint32_t dcmd_opcode; union mfi_frame *frame; SCSIRequest *req; QEMUSGList qsg; @@ -309,9 +310,11 @@ static int megasas_build_sense(MegasasCmd *cmd, uint8_t *sense_ptr, PCIDevice *pcid = PCI_DEVICE(cmd->state); uint32_t pa_hi = 0, pa_lo; hwaddr pa; + int frame_sense_len; - if (sense_len > cmd->frame->header.sense_len) { - sense_len = cmd->frame->header.sense_len; + frame_sense_len = cmd->frame->header.sense_len; + if (sense_len > frame_sense_len) { + sense_len = frame_sense_len; } if (sense_len) { pa_lo = 
le32_to_cpu(cmd->frame->pass.sense_addr_lo); @@ -511,6 +514,7 @@ static MegasasCmd *megasas_enqueue_frame(MegasasState *s, cmd->context &= (uint64_t)0xFFFFFFFF; } cmd->count = count; + cmd->dcmd_opcode = -1; s->busy++; if (s->consumer_pa) { @@ -605,6 +609,9 @@ static void megasas_reset_frames(MegasasState *s) static void megasas_abort_command(MegasasCmd *cmd) { /* Never abort internal commands. */ + if (cmd->dcmd_opcode != -1) { + return; + } if (cmd->req != NULL) { scsi_req_cancel(cmd->req); } @@ -673,15 +680,16 @@ out: static int megasas_map_dcmd(MegasasState *s, MegasasCmd *cmd) { dma_addr_t iov_pa, iov_size; + int iov_count; cmd->flags = le16_to_cpu(cmd->frame->header.flags); - if (!cmd->frame->header.sge_count) { + iov_count = cmd->frame->header.sge_count; + if (!iov_count) { trace_megasas_dcmd_zero_sge(cmd->index); cmd->iov_size = 0; return 0; - } else if (cmd->frame->header.sge_count > 1) { - trace_megasas_dcmd_invalid_sge(cmd->index, - cmd->frame->header.sge_count); + } else if (iov_count > 1) { + trace_megasas_dcmd_invalid_sge(cmd->index, iov_count); cmd->iov_size = 0; return -EINVAL; } @@ -1012,7 +1020,6 @@ static int megasas_pd_get_info_submit(SCSIDevice *sdev, int lun, uint64_t pd_size; uint16_t pd_id = ((sdev->id & 0xFF) << 8) | (lun & 0xFF); uint8_t cmdbuf[6]; - SCSIRequest *req; size_t len, resid; if (!cmd->iov_buf) { @@ -1021,8 +1028,8 @@ static int megasas_pd_get_info_submit(SCSIDevice *sdev, int lun, info->inquiry_data[0] = 0x7f; /* Force PQual 0x3, PType 0x1f */ info->vpd_page83[0] = 0x7f; megasas_setup_inquiry(cmdbuf, 0, sizeof(info->inquiry_data)); - req = scsi_req_new(sdev, cmd->index, lun, cmdbuf, cmd); - if (!req) { + cmd->req = scsi_req_new(sdev, cmd->index, lun, cmdbuf, cmd); + if (!cmd->req) { trace_megasas_dcmd_req_alloc_failed(cmd->index, "PD get info std inquiry"); g_free(cmd->iov_buf); @@ -1031,26 +1038,26 @@ static int megasas_pd_get_info_submit(SCSIDevice *sdev, int lun, } trace_megasas_dcmd_internal_submit(cmd->index, "PD get info std inquiry", lun); - len = scsi_req_enqueue(req); + len = scsi_req_enqueue(cmd->req); if (len > 0) { cmd->iov_size = len; - scsi_req_continue(req); + scsi_req_continue(cmd->req); } return MFI_STAT_INVALID_STATUS; } else if (info->inquiry_data[0] != 0x7f && info->vpd_page83[0] == 0x7f) { megasas_setup_inquiry(cmdbuf, 0x83, sizeof(info->vpd_page83)); - req = scsi_req_new(sdev, cmd->index, lun, cmdbuf, cmd); - if (!req) { + cmd->req = scsi_req_new(sdev, cmd->index, lun, cmdbuf, cmd); + if (!cmd->req) { trace_megasas_dcmd_req_alloc_failed(cmd->index, "PD get info vpd inquiry"); return MFI_STAT_FLASH_ALLOC_FAIL; } trace_megasas_dcmd_internal_submit(cmd->index, "PD get info vpd inquiry", lun); - len = scsi_req_enqueue(req); + len = scsi_req_enqueue(cmd->req); if (len > 0) { cmd->iov_size = len; - scsi_req_continue(req); + scsi_req_continue(cmd->req); } return MFI_STAT_INVALID_STATUS; } @@ -1212,7 +1219,6 @@ static int megasas_ld_get_info_submit(SCSIDevice *sdev, int lun, struct mfi_ld_info *info = cmd->iov_buf; size_t dcmd_size = sizeof(struct mfi_ld_info); uint8_t cdb[6]; - SCSIRequest *req; ssize_t len, resid; uint16_t sdev_id = ((sdev->id & 0xFF) << 8) | (lun & 0xFF); uint64_t ld_size; @@ -1221,8 +1227,8 @@ static int megasas_ld_get_info_submit(SCSIDevice *sdev, int lun, cmd->iov_buf = g_malloc0(dcmd_size); info = cmd->iov_buf; megasas_setup_inquiry(cdb, 0x83, sizeof(info->vpd_page83)); - req = scsi_req_new(sdev, cmd->index, lun, cdb, cmd); - if (!req) { + cmd->req = scsi_req_new(sdev, cmd->index, lun, cdb, cmd); + if (!cmd->req) { 
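/*
 * Note: storing the internal inquiry request in cmd->req here (instead of
 * a function-local SCSIRequest, as before) works together with the new
 * MegasasCmd.dcmd_opcode field cached in megasas_handle_dcmd(): the
 * completion paths (megasas_get_sg_list(), megasas_xfer_complete(),
 * megasas_command_complete()) can test cmd->dcmd_opcode != -1 rather than
 * re-reading frame_cmd or dcmd.opcode from the guest-mapped frame,
 * presumably so nothing trusts a frame the guest can rewrite while the
 * request is in flight. megasas_abort_command() likewise refuses to cancel
 * commands that carry a cached DCMD opcode.
 */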
trace_megasas_dcmd_req_alloc_failed(cmd->index, "LD get info vpd inquiry"); g_free(cmd->iov_buf); @@ -1231,10 +1237,10 @@ static int megasas_ld_get_info_submit(SCSIDevice *sdev, int lun, } trace_megasas_dcmd_internal_submit(cmd->index, "LD get info vpd inquiry", lun); - len = scsi_req_enqueue(req); + len = scsi_req_enqueue(cmd->req); if (len > 0) { cmd->iov_size = len; - scsi_req_continue(req); + scsi_req_continue(cmd->req); } return MFI_STAT_INVALID_STATUS; } @@ -1559,22 +1565,21 @@ static const struct dcmd_cmd_tbl_t { static int megasas_handle_dcmd(MegasasState *s, MegasasCmd *cmd) { - int opcode; int retval = 0; size_t len; const struct dcmd_cmd_tbl_t *cmdptr = dcmd_cmd_tbl; - opcode = le32_to_cpu(cmd->frame->dcmd.opcode); - trace_megasas_handle_dcmd(cmd->index, opcode); + cmd->dcmd_opcode = le32_to_cpu(cmd->frame->dcmd.opcode); + trace_megasas_handle_dcmd(cmd->index, cmd->dcmd_opcode); if (megasas_map_dcmd(s, cmd) < 0) { return MFI_STAT_MEMORY_NOT_AVAILABLE; } - while (cmdptr->opcode != -1 && cmdptr->opcode != opcode) { + while (cmdptr->opcode != -1 && cmdptr->opcode != cmd->dcmd_opcode) { cmdptr++; } len = cmd->iov_size; if (cmdptr->opcode == -1) { - trace_megasas_dcmd_unhandled(cmd->index, opcode, len); + trace_megasas_dcmd_unhandled(cmd->index, cmd->dcmd_opcode, len); retval = megasas_dcmd_dummy(s, cmd); } else { trace_megasas_dcmd_enter(cmd->index, cmdptr->desc, len); @@ -1587,15 +1592,14 @@ static int megasas_handle_dcmd(MegasasState *s, MegasasCmd *cmd) } static int megasas_finish_internal_dcmd(MegasasCmd *cmd, - SCSIRequest *req) + SCSIRequest *req, size_t resid) { - int opcode; int retval = MFI_STAT_OK; int lun = req->lun; - opcode = le32_to_cpu(cmd->frame->dcmd.opcode); - trace_megasas_dcmd_internal_finish(cmd->index, opcode, lun); - switch (opcode) { + trace_megasas_dcmd_internal_finish(cmd->index, cmd->dcmd_opcode, lun); + cmd->iov_size -= resid; + switch (cmd->dcmd_opcode) { case MFI_DCMD_PD_GET_INFO: retval = megasas_pd_get_info_submit(req->dev, lun, cmd); break; @@ -1603,7 +1607,7 @@ static int megasas_finish_internal_dcmd(MegasasCmd *cmd, retval = megasas_ld_get_info_submit(req->dev, lun, cmd); break; default: - trace_megasas_dcmd_internal_invalid(cmd->index, opcode); + trace_megasas_dcmd_internal_invalid(cmd->index, cmd->dcmd_opcode); retval = MFI_STAT_INVALID_DCMD; break; } @@ -1647,43 +1651,42 @@ static int megasas_enqueue_req(MegasasCmd *cmd, bool is_write) } static int megasas_handle_scsi(MegasasState *s, MegasasCmd *cmd, - bool is_logical) + int frame_cmd) { uint8_t *cdb; + int target_id, lun_id, cdb_len; bool is_write; struct SCSIDevice *sdev = NULL; + bool is_logical = (frame_cmd == MFI_CMD_LD_SCSI_IO); cdb = cmd->frame->pass.cdb; + target_id = cmd->frame->header.target_id; + lun_id = cmd->frame->header.lun_id; + cdb_len = cmd->frame->header.cdb_len; if (is_logical) { - if (cmd->frame->header.target_id >= MFI_MAX_LD || - cmd->frame->header.lun_id != 0) { + if (target_id >= MFI_MAX_LD || lun_id != 0) { trace_megasas_scsi_target_not_present( - mfi_frame_desc[cmd->frame->header.frame_cmd], is_logical, - cmd->frame->header.target_id, cmd->frame->header.lun_id); + mfi_frame_desc[frame_cmd], is_logical, target_id, lun_id); return MFI_STAT_DEVICE_NOT_FOUND; } } - sdev = scsi_device_find(&s->bus, 0, cmd->frame->header.target_id, - cmd->frame->header.lun_id); + sdev = scsi_device_find(&s->bus, 0, target_id, lun_id); cmd->iov_size = le32_to_cpu(cmd->frame->header.data_len); - trace_megasas_handle_scsi(mfi_frame_desc[cmd->frame->header.frame_cmd], - is_logical, 
cmd->frame->header.target_id, - cmd->frame->header.lun_id, sdev, cmd->iov_size); + trace_megasas_handle_scsi(mfi_frame_desc[frame_cmd], is_logical, + target_id, lun_id, sdev, cmd->iov_size); if (!sdev || (megasas_is_jbod(s) && is_logical)) { trace_megasas_scsi_target_not_present( - mfi_frame_desc[cmd->frame->header.frame_cmd], is_logical, - cmd->frame->header.target_id, cmd->frame->header.lun_id); + mfi_frame_desc[frame_cmd], is_logical, target_id, lun_id); return MFI_STAT_DEVICE_NOT_FOUND; } - if (cmd->frame->header.cdb_len > 16) { + if (cdb_len > 16) { trace_megasas_scsi_invalid_cdb_len( - mfi_frame_desc[cmd->frame->header.frame_cmd], is_logical, - cmd->frame->header.target_id, cmd->frame->header.lun_id, - cmd->frame->header.cdb_len); + mfi_frame_desc[frame_cmd], is_logical, + target_id, lun_id, cdb_len); megasas_write_sense(cmd, SENSE_CODE(INVALID_OPCODE)); cmd->frame->header.scsi_status = CHECK_CONDITION; s->event_count++; @@ -1697,12 +1700,10 @@ static int megasas_handle_scsi(MegasasState *s, MegasasCmd *cmd, return MFI_STAT_SCSI_DONE_WITH_ERROR; } - cmd->req = scsi_req_new(sdev, cmd->index, - cmd->frame->header.lun_id, cdb, cmd); + cmd->req = scsi_req_new(sdev, cmd->index, lun_id, cdb, cmd); if (!cmd->req) { trace_megasas_scsi_req_alloc_failed( - mfi_frame_desc[cmd->frame->header.frame_cmd], - cmd->frame->header.target_id, cmd->frame->header.lun_id); + mfi_frame_desc[frame_cmd], target_id, lun_id); megasas_write_sense(cmd, SENSE_CODE(NO_SENSE)); cmd->frame->header.scsi_status = BUSY; s->event_count++; @@ -1723,43 +1724,41 @@ static int megasas_handle_scsi(MegasasState *s, MegasasCmd *cmd, return MFI_STAT_INVALID_STATUS; } -static int megasas_handle_io(MegasasState *s, MegasasCmd *cmd) +static int megasas_handle_io(MegasasState *s, MegasasCmd *cmd, int frame_cmd) { uint32_t lba_count, lba_start_hi, lba_start_lo; uint64_t lba_start; - bool is_write = (cmd->frame->header.frame_cmd == MFI_CMD_LD_WRITE); + bool is_write = (frame_cmd == MFI_CMD_LD_WRITE); uint8_t cdb[16]; int len; struct SCSIDevice *sdev = NULL; + int target_id, lun_id, cdb_len; lba_count = le32_to_cpu(cmd->frame->io.header.data_len); lba_start_lo = le32_to_cpu(cmd->frame->io.lba_lo); lba_start_hi = le32_to_cpu(cmd->frame->io.lba_hi); lba_start = ((uint64_t)lba_start_hi << 32) | lba_start_lo; - if (cmd->frame->header.target_id < MFI_MAX_LD && - cmd->frame->header.lun_id == 0) { - sdev = scsi_device_find(&s->bus, 0, cmd->frame->header.target_id, - cmd->frame->header.lun_id); + target_id = cmd->frame->header.target_id; + lun_id = cmd->frame->header.lun_id; + cdb_len = cmd->frame->header.cdb_len; + + if (target_id < MFI_MAX_LD && lun_id == 0) { + sdev = scsi_device_find(&s->bus, 0, target_id, lun_id); } trace_megasas_handle_io(cmd->index, - mfi_frame_desc[cmd->frame->header.frame_cmd], - cmd->frame->header.target_id, - cmd->frame->header.lun_id, + mfi_frame_desc[frame_cmd], target_id, lun_id, (unsigned long)lba_start, (unsigned long)lba_count); if (!sdev) { trace_megasas_io_target_not_present(cmd->index, - mfi_frame_desc[cmd->frame->header.frame_cmd], - cmd->frame->header.target_id, cmd->frame->header.lun_id); + mfi_frame_desc[frame_cmd], target_id, lun_id); return MFI_STAT_DEVICE_NOT_FOUND; } - if (cmd->frame->header.cdb_len > 16) { + if (cdb_len > 16) { trace_megasas_scsi_invalid_cdb_len( - mfi_frame_desc[cmd->frame->header.frame_cmd], 1, - cmd->frame->header.target_id, cmd->frame->header.lun_id, - cmd->frame->header.cdb_len); + mfi_frame_desc[frame_cmd], 1, target_id, lun_id, cdb_len); megasas_write_sense(cmd, 
SENSE_CODE(INVALID_OPCODE)); cmd->frame->header.scsi_status = CHECK_CONDITION; s->event_count++; @@ -1776,11 +1775,10 @@ static int megasas_handle_io(MegasasState *s, MegasasCmd *cmd) megasas_encode_lba(cdb, lba_start, lba_count, is_write); cmd->req = scsi_req_new(sdev, cmd->index, - cmd->frame->header.lun_id, cdb, cmd); + lun_id, cdb, cmd); if (!cmd->req) { trace_megasas_scsi_req_alloc_failed( - mfi_frame_desc[cmd->frame->header.frame_cmd], - cmd->frame->header.target_id, cmd->frame->header.lun_id); + mfi_frame_desc[frame_cmd], target_id, lun_id); megasas_write_sense(cmd, SENSE_CODE(NO_SENSE)); cmd->frame->header.scsi_status = BUSY; s->event_count++; @@ -1797,23 +1795,11 @@ static int megasas_handle_io(MegasasState *s, MegasasCmd *cmd) return MFI_STAT_INVALID_STATUS; } -static int megasas_finish_internal_command(MegasasCmd *cmd, - SCSIRequest *req, size_t resid) -{ - int retval = MFI_STAT_INVALID_CMD; - - if (cmd->frame->header.frame_cmd == MFI_CMD_DCMD) { - cmd->iov_size -= resid; - retval = megasas_finish_internal_dcmd(cmd, req); - } - return retval; -} - static QEMUSGList *megasas_get_sg_list(SCSIRequest *req) { MegasasCmd *cmd = req->hba_private; - if (cmd->frame->header.frame_cmd == MFI_CMD_DCMD) { + if (cmd->dcmd_opcode != -1) { return NULL; } else { return &cmd->qsg; @@ -1824,18 +1810,16 @@ static void megasas_xfer_complete(SCSIRequest *req, uint32_t len) { MegasasCmd *cmd = req->hba_private; uint8_t *buf; - uint32_t opcode; trace_megasas_io_complete(cmd->index, len); - if (cmd->frame->header.frame_cmd != MFI_CMD_DCMD) { + if (cmd->dcmd_opcode != -1) { scsi_req_continue(req); return; } buf = scsi_req_get_buf(req); - opcode = le32_to_cpu(cmd->frame->dcmd.opcode); - if (opcode == MFI_DCMD_PD_GET_INFO && cmd->iov_buf) { + if (cmd->dcmd_opcode == MFI_DCMD_PD_GET_INFO && cmd->iov_buf) { struct mfi_pd_info *info = cmd->iov_buf; if (info->inquiry_data[0] == 0x7f) { @@ -1846,7 +1830,7 @@ static void megasas_xfer_complete(SCSIRequest *req, uint32_t len) memcpy(info->vpd_page83, buf, len); } scsi_req_continue(req); - } else if (opcode == MFI_DCMD_LD_GET_INFO) { + } else if (cmd->dcmd_opcode == MFI_DCMD_LD_GET_INFO) { struct mfi_ld_info *info = cmd->iov_buf; if (cmd->iov_buf) { @@ -1868,11 +1852,11 @@ static void megasas_command_complete(SCSIRequest *req, uint32_t status, return; } - if (cmd->req == NULL) { + if (cmd->dcmd_opcode != -1) { /* * Internal command complete */ - cmd_status = megasas_finish_internal_command(cmd, req, resid); + cmd_status = megasas_finish_internal_dcmd(cmd, req, resid); if (cmd_status == MFI_STAT_INVALID_STATUS) { return; } @@ -1943,6 +1927,7 @@ static void megasas_handle_frame(MegasasState *s, uint64_t frame_addr, { uint8_t frame_status = MFI_STAT_INVALID_CMD; uint64_t frame_context; + int frame_cmd; MegasasCmd *cmd; /* @@ -1961,7 +1946,8 @@ static void megasas_handle_frame(MegasasState *s, uint64_t frame_addr, s->event_count++; return; } - switch (cmd->frame->header.frame_cmd) { + frame_cmd = cmd->frame->header.frame_cmd; + switch (frame_cmd) { case MFI_CMD_INIT: frame_status = megasas_init_firmware(s, cmd); break; @@ -1972,18 +1958,15 @@ static void megasas_handle_frame(MegasasState *s, uint64_t frame_addr, frame_status = megasas_handle_abort(s, cmd); break; case MFI_CMD_PD_SCSI_IO: - frame_status = megasas_handle_scsi(s, cmd, 0); - break; case MFI_CMD_LD_SCSI_IO: - frame_status = megasas_handle_scsi(s, cmd, 1); + frame_status = megasas_handle_scsi(s, cmd, frame_cmd); break; case MFI_CMD_LD_READ: case MFI_CMD_LD_WRITE: - frame_status = megasas_handle_io(s, cmd); 
+ frame_status = megasas_handle_io(s, cmd, frame_cmd); break; default: - trace_megasas_unhandled_frame_cmd(cmd->index, - cmd->frame->header.frame_cmd); + trace_megasas_unhandled_frame_cmd(cmd->index, frame_cmd); s->event_count++; break; } diff --git a/hw/scsi/vhost-scsi-common.c b/hw/scsi/vhost-scsi-common.c index e41c0314db..d434b3e99a 100644 --- a/hw/scsi/vhost-scsi-common.c +++ b/hw/scsi/vhost-scsi-common.c @@ -16,7 +16,6 @@ */ #include "qemu/osdep.h" -#include <linux/vhost.h> #include "qapi/error.h" #include "qemu/error-report.h" #include "migration/migration.h" diff --git a/hw/scsi/vhost-user-scsi.c b/hw/scsi/vhost-user-scsi.c new file mode 100644 index 0000000000..500fa6a067 --- /dev/null +++ b/hw/scsi/vhost-user-scsi.c @@ -0,0 +1,205 @@ +/* + * vhost-user-scsi host device + * + * Copyright (c) 2016 Nutanix Inc. All rights reserved. + * + * Author: + * Felipe Franciosi <felipe@nutanix.com> + * + * This work is largely based on the "vhost-scsi" implementation by: + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * Nicholas Bellinger <nab@risingtidesystems.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "qemu/typedefs.h" +#include "qom/object.h" +#include "hw/fw-path-provider.h" +#include "hw/qdev-core.h" +#include "hw/virtio/vhost.h" +#include "hw/virtio/vhost-backend.h" +#include "hw/virtio/vhost-user-scsi.h" +#include "hw/virtio/virtio.h" +#include "hw/virtio/virtio-access.h" +#include "chardev/char-fe.h" + +/* Features supported by the host application */ +static const int user_feature_bits[] = { + VIRTIO_F_NOTIFY_ON_EMPTY, + VIRTIO_RING_F_INDIRECT_DESC, + VIRTIO_RING_F_EVENT_IDX, + VIRTIO_SCSI_F_HOTPLUG, + VHOST_INVALID_FEATURE_BIT +}; + +static void vhost_user_scsi_set_status(VirtIODevice *vdev, uint8_t status) +{ + VHostUserSCSI *s = (VHostUserSCSI *)vdev; + VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s); + bool start = (status & VIRTIO_CONFIG_S_DRIVER_OK) && vdev->vm_running; + + if (vsc->dev.started == start) { + return; + } + + if (start) { + int ret; + + ret = vhost_scsi_common_start(vsc); + if (ret < 0) { + error_report("unable to start vhost-user-scsi: %s", strerror(-ret)); + exit(1); + } + } else { + vhost_scsi_common_stop(vsc); + } +} + +static void vhost_dummy_handle_output(VirtIODevice *vdev, VirtQueue *vq) +{ +} + +static void vhost_user_scsi_realize(DeviceState *dev, Error **errp) +{ + VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev); + VHostUserSCSI *s = VHOST_USER_SCSI(dev); + VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s); + Error *err = NULL; + int ret; + + if (!vs->conf.chardev.chr) { + error_setg(errp, "vhost-user-scsi: missing chardev"); + return; + } + + virtio_scsi_common_realize(dev, vhost_dummy_handle_output, + vhost_dummy_handle_output, + vhost_dummy_handle_output, &err); + if (err != NULL) { + error_propagate(errp, err); + return; + } + + vsc->dev.nvqs = 2 + vs->conf.num_queues; + vsc->dev.vqs = g_new(struct vhost_virtqueue, vsc->dev.nvqs); + vsc->dev.vq_index = 0; + vsc->dev.backend_features = 0; + + ret = vhost_dev_init(&vsc->dev, (void *)&vs->conf.chardev, + VHOST_BACKEND_TYPE_USER, 0); + if (ret < 0) { + error_setg(errp, "vhost-user-scsi: vhost initialization failed: %s", + strerror(-ret)); + return; + } + + /* Channel and lun both are 0 for bootable vhost-user-scsi disk */ + vsc->channel = 0; + vsc->lun = 0; + vsc->target = vs->conf.boot_tpgt; +} + +static 
void vhost_user_scsi_unrealize(DeviceState *dev, Error **errp) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VHostUserSCSI *s = VHOST_USER_SCSI(dev); + VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s); + + /* This will stop the vhost backend. */ + vhost_user_scsi_set_status(vdev, 0); + + vhost_dev_cleanup(&vsc->dev); + g_free(vsc->dev.vqs); + + virtio_scsi_common_unrealize(dev, errp); +} + +static uint64_t vhost_user_scsi_get_features(VirtIODevice *vdev, + uint64_t features, Error **errp) +{ + VHostUserSCSI *s = VHOST_USER_SCSI(vdev); + + /* Turn on predefined features supported by this device */ + features |= s->host_features; + + return vhost_scsi_common_get_features(vdev, features, errp); +} + +static Property vhost_user_scsi_properties[] = { + DEFINE_PROP_CHR("chardev", VirtIOSCSICommon, conf.chardev), + DEFINE_PROP_UINT32("boot_tpgt", VirtIOSCSICommon, conf.boot_tpgt, 0), + DEFINE_PROP_UINT32("num_queues", VirtIOSCSICommon, conf.num_queues, 1), + DEFINE_PROP_UINT32("max_sectors", VirtIOSCSICommon, conf.max_sectors, + 0xFFFF), + DEFINE_PROP_UINT32("cmd_per_lun", VirtIOSCSICommon, conf.cmd_per_lun, 128), + DEFINE_PROP_BIT64("hotplug", VHostUserSCSI, host_features, + VIRTIO_SCSI_F_HOTPLUG, + true), + DEFINE_PROP_BIT64("param_change", VHostUserSCSI, host_features, + VIRTIO_SCSI_F_CHANGE, + true), + DEFINE_PROP_END_OF_LIST(), +}; + +static const VMStateDescription vmstate_vhost_scsi = { + .name = "virtio-scsi", + .minimum_version_id = 1, + .version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_VIRTIO_DEVICE, + VMSTATE_END_OF_LIST() + }, +}; + +static void vhost_user_scsi_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); + FWPathProviderClass *fwc = FW_PATH_PROVIDER_CLASS(klass); + + dc->props = vhost_user_scsi_properties; + dc->vmsd = &vmstate_vhost_scsi; + set_bit(DEVICE_CATEGORY_STORAGE, dc->categories); + vdc->realize = vhost_user_scsi_realize; + vdc->unrealize = vhost_user_scsi_unrealize; + vdc->get_features = vhost_user_scsi_get_features; + vdc->set_config = vhost_scsi_common_set_config; + vdc->set_status = vhost_user_scsi_set_status; + fwc->get_dev_path = vhost_scsi_common_get_fw_dev_path; +} + +static void vhost_user_scsi_instance_init(Object *obj) +{ + VHostSCSICommon *vsc = VHOST_SCSI_COMMON(obj); + + vsc->feature_bits = user_feature_bits; + + /* Add the bootindex property for this object */ + device_add_bootindex_property(obj, &vsc->bootindex, "bootindex", NULL, + DEVICE(vsc), NULL); +} + +static const TypeInfo vhost_user_scsi_info = { + .name = TYPE_VHOST_USER_SCSI, + .parent = TYPE_VHOST_SCSI_COMMON, + .instance_size = sizeof(VHostUserSCSI), + .class_init = vhost_user_scsi_class_init, + .instance_init = vhost_user_scsi_instance_init, + .interfaces = (InterfaceInfo[]) { + { TYPE_FW_PATH_PROVIDER }, + { } + }, +}; + +static void virtio_register_types(void) +{ + type_register_static(&vhost_user_scsi_info); +} + +type_init(virtio_register_types) diff --git a/hw/usb/hcd-ehci.c b/hw/usb/hcd-ehci.c index 17c572c55f..73090e01ad 100644 --- a/hw/usb/hcd-ehci.c +++ b/hw/usb/hcd-ehci.c @@ -2241,6 +2241,11 @@ static void ehci_work_bh(void *opaque) uint64_t uframes, skipped_uframes; int i; + if (ehci->working) { + return; + } + ehci->working = true; + t_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); ns_elapsed = t_now - ehci->last_run_ns; uframes = ns_elapsed / UFRAME_TIMER_NS; @@ -2322,6 +2327,8 @@ static void ehci_work_bh(void *opaque) } timer_mod(ehci->frame_timer, expire_time); } + + ehci->working = 
false; } static void ehci_work_timer(void *opaque) diff --git a/hw/usb/hcd-ehci.h b/hw/usb/hcd-ehci.h index 938d8aa284..821f1ded43 100644 --- a/hw/usb/hcd-ehci.h +++ b/hw/usb/hcd-ehci.h @@ -297,6 +297,7 @@ struct EHCIState { */ QEMUTimer *frame_timer; QEMUBH *async_bh; + bool working; uint32_t astate; /* Current state in asynchronous schedule */ uint32_t pstate; /* Current state in periodic schedule */ USBPort ports[NB_PORTS]; diff --git a/hw/usb/hcd-xhci.c b/hw/usb/hcd-xhci.c index a0c7960a7b..760135c0d2 100644 --- a/hw/usb/hcd-xhci.c +++ b/hw/usb/hcd-xhci.c @@ -1912,6 +1912,8 @@ static void xhci_kick_epctx(XHCIEPContext *epctx, unsigned int streamid) } assert(!xfer->running_retry); if (xfer->complete) { + /* update ring dequeue ptr */ + xhci_set_ep_state(xhci, epctx, stctx, epctx->state); xhci_ep_free_xfer(epctx->retry); } epctx->retry = NULL; @@ -1962,6 +1964,8 @@ static void xhci_kick_epctx(XHCIEPContext *epctx, unsigned int streamid) xhci_fire_transfer(xhci, xfer, epctx); } if (xfer->complete) { + /* update ring dequeue ptr */ + xhci_set_ep_state(xhci, epctx, stctx, epctx->state); xhci_ep_free_xfer(xfer); xfer = NULL; } @@ -1979,8 +1983,6 @@ static void xhci_kick_epctx(XHCIEPContext *epctx, unsigned int streamid) break; } } - /* update ring dequeue ptr */ - xhci_set_ep_state(xhci, epctx, stctx, epctx->state); epctx->kick_active--; ep = xhci_epid_to_usbep(epctx); diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index f9b7244808..20d6a08616 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -2135,6 +2135,61 @@ static const TypeInfo vhost_scsi_pci_info = { }; #endif +#ifdef CONFIG_LINUX +/* vhost-user-scsi-pci */ +static Property vhost_user_scsi_pci_properties[] = { + DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, + DEV_NVECTORS_UNSPECIFIED), + DEFINE_PROP_END_OF_LIST(), +}; + +static void vhost_user_scsi_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) +{ + VHostUserSCSIPCI *dev = VHOST_USER_SCSI_PCI(vpci_dev); + DeviceState *vdev = DEVICE(&dev->vdev); + VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(vdev); + + if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) { + vpci_dev->nvectors = vs->conf.num_queues + 3; + } + + qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus)); + object_property_set_bool(OBJECT(vdev), true, "realized", errp); +} + +static void vhost_user_scsi_pci_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass); + PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass); + k->realize = vhost_user_scsi_pci_realize; + set_bit(DEVICE_CATEGORY_STORAGE, dc->categories); + dc->props = vhost_user_scsi_pci_properties; + pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET; + pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_SCSI; + pcidev_k->revision = 0x00; + pcidev_k->class_id = PCI_CLASS_STORAGE_SCSI; +} + +static void vhost_user_scsi_pci_instance_init(Object *obj) +{ + VHostUserSCSIPCI *dev = VHOST_USER_SCSI_PCI(obj); + + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + TYPE_VHOST_USER_SCSI); + object_property_add_alias(obj, "bootindex", OBJECT(&dev->vdev), + "bootindex", &error_abort); +} + +static const TypeInfo vhost_user_scsi_pci_info = { + .name = TYPE_VHOST_USER_SCSI_PCI, + .parent = TYPE_VIRTIO_PCI, + .instance_size = sizeof(VHostUserSCSIPCI), + .instance_init = vhost_user_scsi_pci_instance_init, + .class_init = vhost_user_scsi_pci_class_init, +}; +#endif + /* vhost-vsock-pci */ #ifdef CONFIG_VHOST_VSOCK @@ -2612,6 +2667,9 @@ static void 
virtio_pci_register_types(void) #ifdef CONFIG_VHOST_SCSI type_register_static(&vhost_scsi_pci_info); #endif +#ifdef CONFIG_LINUX + type_register_static(&vhost_user_scsi_pci_info); +#endif #ifdef CONFIG_VHOST_VSOCK type_register_static(&vhost_vsock_pci_info); #endif diff --git a/hw/virtio/virtio-pci.h b/hw/virtio/virtio-pci.h index b095dfc6d9..69f5959623 100644 --- a/hw/virtio/virtio-pci.h +++ b/hw/virtio/virtio-pci.h @@ -26,6 +26,7 @@ #include "hw/virtio/virtio-input.h" #include "hw/virtio/virtio-gpu.h" #include "hw/virtio/virtio-crypto.h" +#include "hw/virtio/vhost-user-scsi.h" #ifdef CONFIG_VIRTFS #include "hw/9pfs/virtio-9p.h" @@ -44,6 +45,7 @@ typedef struct VirtIOBalloonPCI VirtIOBalloonPCI; typedef struct VirtIOSerialPCI VirtIOSerialPCI; typedef struct VirtIONetPCI VirtIONetPCI; typedef struct VHostSCSIPCI VHostSCSIPCI; +typedef struct VHostUserSCSIPCI VHostUserSCSIPCI; typedef struct VirtIORngPCI VirtIORngPCI; typedef struct VirtIOInputPCI VirtIOInputPCI; typedef struct VirtIOInputHIDPCI VirtIOInputHIDPCI; @@ -230,6 +232,15 @@ struct VHostSCSIPCI { }; #endif +#define TYPE_VHOST_USER_SCSI_PCI "vhost-user-scsi-pci" +#define VHOST_USER_SCSI_PCI(obj) \ + OBJECT_CHECK(VHostUserSCSIPCI, (obj), TYPE_VHOST_USER_SCSI_PCI) + +struct VHostUserSCSIPCI { + VirtIOPCIProxy parent_obj; + VHostUserSCSI vdev; +}; + /* * virtio-blk-pci: This extends VirtioPCIProxy. */ diff --git a/include/block/accounting.h b/include/block/accounting.h index 20891639d5..b833d26d6c 100644 --- a/include/block/accounting.h +++ b/include/block/accounting.h @@ -26,8 +26,10 @@ #define BLOCK_ACCOUNTING_H #include "qemu/timed-average.h" +#include "qemu/thread.h" typedef struct BlockAcctTimedStats BlockAcctTimedStats; +typedef struct BlockAcctStats BlockAcctStats; enum BlockAcctType { BLOCK_ACCT_READ, @@ -37,12 +39,14 @@ enum BlockAcctType { }; struct BlockAcctTimedStats { + BlockAcctStats *stats; TimedAverage latency[BLOCK_MAX_IOTYPE]; unsigned interval_length; /* in seconds */ QSLIST_ENTRY(BlockAcctTimedStats) entries; }; -typedef struct BlockAcctStats { +struct BlockAcctStats { + QemuMutex lock; uint64_t nr_bytes[BLOCK_MAX_IOTYPE]; uint64_t nr_ops[BLOCK_MAX_IOTYPE]; uint64_t invalid_ops[BLOCK_MAX_IOTYPE]; @@ -53,7 +57,7 @@ typedef struct BlockAcctStats { QSLIST_HEAD(, BlockAcctTimedStats) intervals; bool account_invalid; bool account_failed; -} BlockAcctStats; +}; typedef struct BlockAcctCookie { int64_t bytes; @@ -61,7 +65,8 @@ typedef struct BlockAcctCookie { enum BlockAcctType type; } BlockAcctCookie; -void block_acct_init(BlockAcctStats *stats, bool account_invalid, +void block_acct_init(BlockAcctStats *stats); +void block_acct_setup(BlockAcctStats *stats, bool account_invalid, bool account_failed); void block_acct_cleanup(BlockAcctStats *stats); void block_acct_add_interval(BlockAcctStats *stats, unsigned interval_length); diff --git a/include/block/block.h b/include/block/block.h index 9b355e92d8..a4f09df95a 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -402,7 +402,8 @@ void bdrv_drain_all(void); * block_job_defer_to_main_loop for how to do it). \ */ \ assert(!bs_->wakeup); \ - bs_->wakeup = true; \ + /* Set bs->wakeup before evaluating cond. 
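The accounting change above splits initialization in two: block_acct_init() runs when the stats structure is created and only sets up internal state such as the new lock, while block_acct_setup() applies the account_invalid/account_failed options once they are known. A minimal sketch of the assumed call order (the wrapper functions are illustrative):

static void example_backend_create(BlockAcctStats *stats)
{
    /* creation time: no options required */
    block_acct_init(stats);
}

static void example_backend_configure(BlockAcctStats *stats,
                                      bool account_invalid, bool account_failed)
{
    /* later, once the relevant options have been parsed */
    block_acct_setup(stats, account_invalid, account_failed);
}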
*/ \ + atomic_mb_set(&bs_->wakeup, true); \ while (busy_) { \ if ((cond)) { \ waited_ = busy_ = true; \ @@ -414,7 +415,7 @@ void bdrv_drain_all(void); waited_ |= busy_; \ } \ } \ - bs_->wakeup = false; \ + atomic_set(&bs_->wakeup, false); \ } \ waited_; }) diff --git a/include/block/block_int.h b/include/block/block_int.h index cb78c4fa82..748970055e 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -29,6 +29,7 @@ #include "qemu/option.h" #include "qemu/queue.h" #include "qemu/coroutine.h" +#include "qemu/stats64.h" #include "qemu/timer.h" #include "qapi-types.h" #include "qemu/hbitmap.h" @@ -595,11 +596,6 @@ struct BlockDriverState { /* Protected by AioContext lock */ - /* If true, copy read backing sectors into image. Can be >1 if more - * than one client has requested copy-on-read. - */ - int copy_on_read; - /* If we are reading a disk image, give its size in sectors. * Generally read-only; it is written to by load_snapshot and * save_snapshot, but the block layer is quiescent during those. */ int64_t total_sectors; /* Callback before write request is processed */ NotifierWithReturnList before_write_notifiers; - /* number of in-flight requests; overall and serialising */ - unsigned int in_flight; - unsigned int serialising_in_flight; + /* threshold limit for writes, in bytes. "High water mark". */ + uint64_t write_threshold_offset; + NotifierWithReturn write_threshold_notifier; - bool wakeup; + /* Writing to the list requires the BQL _and_ the dirty_bitmap_mutex. + * Reading from the list can be done with either the BQL or the + * dirty_bitmap_mutex. Modifying a bitmap only requires + * dirty_bitmap_mutex. */ + QemuMutex dirty_bitmap_mutex; + QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps; /* Offset after the highest byte written to */ - uint64_t wr_highest_offset; + Stat64 wr_highest_offset; - /* threshold limit for writes, in bytes. "High water mark". */ - uint64_t write_threshold_offset; - NotifierWithReturn write_threshold_notifier; + /* If true, copy read backing sectors into image. Can be >1 if more + * than one client has requested copy-on-read. Accessed with atomic + * ops. + */ + int copy_on_read; - /* counter for nested bdrv_io_plug */ - unsigned io_plugged; + /* number of in-flight requests; overall and serialising. + * Accessed with atomic ops. + */ + unsigned int in_flight; + unsigned int serialising_in_flight; - QLIST_HEAD(, BdrvTrackedRequest) tracked_requests; - CoQueue flush_queue; /* Serializing flush queue */ - bool active_flush_req; /* Flush request in flight? */ - unsigned int write_gen; /* Current data generation */ - unsigned int flushed_gen; /* Flushed write generation */ + /* Internal to BDRV_POLL_WHILE and bdrv_wakeup. Accessed with atomic + * ops. + */ + bool wakeup; - QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps; + /* counter for nested bdrv_io_plug. + * Accessed with atomic ops. + */ + unsigned io_plugged; /* do we need to tell the guest if we have a volatile write cache? */ int enable_write_cache; + /* Accessed with atomic ops. */ int quiesce_counter; + unsigned int write_gen; /* Current data generation */ + + /* Protected by reqs_lock. */ + CoMutex reqs_lock; + QLIST_HEAD(, BdrvTrackedRequest) tracked_requests; + CoQueue flush_queue; /* Serializing flush queue */ + bool active_flush_req; /* Flush request in flight? */ + + /* Only read/written by whoever has set active_flush_req to true. 
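The atomic_mb_set() above is half of a Dekker-style handshake: the polling thread publishes wakeup with a full memory barrier before it evaluates cond, and whoever makes cond true must check wakeup only afterwards. A sketch of the waker side, assuming it looks roughly like this (kick_event_loop() is a hypothetical stand-in for the actual notification primitive):

/* Illustrative waker; pairs with atomic_mb_set(&bs_->wakeup, true) above. */
static void example_bdrv_wakeup(BlockDriverState *bs)
{
    /* the caller has already made the polled condition true */
    if (atomic_read(&bs->wakeup)) {
        kick_event_loop();   /* hypothetical */
    }
}

Either the poller sees the new condition, or the waker sees wakeup set; without the full barrier both sides could read stale values and the poll could sleep forever.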
*/ + unsigned int flushed_gen; /* Flushed write generation */ }; struct BlockBackendRootState { diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h index 9dea14ba03..ad6558af56 100644 --- a/include/block/dirty-bitmap.h +++ b/include/block/dirty-bitmap.h @@ -36,8 +36,6 @@ bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap); const char *bdrv_dirty_bitmap_name(const BdrvDirtyBitmap *bitmap); int64_t bdrv_dirty_bitmap_size(const BdrvDirtyBitmap *bitmap); DirtyBitmapStatus bdrv_dirty_bitmap_status(BdrvDirtyBitmap *bitmap); -int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, - int64_t sector); void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap, int64_t cur_sector, int64_t nr_sectors); void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap, @@ -45,6 +43,9 @@ void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap, int bdrv_dirty_bitmap_get_meta(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector, int nb_sectors); +int bdrv_dirty_bitmap_get_meta_locked(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, int64_t sector, + int nb_sectors); void bdrv_dirty_bitmap_reset_meta(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector, int nb_sectors); @@ -52,11 +53,6 @@ BdrvDirtyBitmapIter *bdrv_dirty_meta_iter_new(BdrvDirtyBitmap *bitmap); BdrvDirtyBitmapIter *bdrv_dirty_iter_new(BdrvDirtyBitmap *bitmap, uint64_t first_sector); void bdrv_dirty_iter_free(BdrvDirtyBitmapIter *iter); -int64_t bdrv_dirty_iter_next(BdrvDirtyBitmapIter *iter); -void bdrv_set_dirty_iter(BdrvDirtyBitmapIter *hbi, int64_t sector_num); -int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap); -int64_t bdrv_get_meta_dirty_count(BdrvDirtyBitmap *bitmap); -void bdrv_dirty_bitmap_truncate(BlockDriverState *bs); uint64_t bdrv_dirty_bitmap_serialization_size(const BdrvDirtyBitmap *bitmap, uint64_t start, uint64_t count); @@ -72,4 +68,19 @@ void bdrv_dirty_bitmap_deserialize_zeroes(BdrvDirtyBitmap *bitmap, bool finish); void bdrv_dirty_bitmap_deserialize_finish(BdrvDirtyBitmap *bitmap); +/* Functions that require manual locking. */ +void bdrv_dirty_bitmap_lock(BdrvDirtyBitmap *bitmap); +void bdrv_dirty_bitmap_unlock(BdrvDirtyBitmap *bitmap); +int bdrv_get_dirty_locked(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, + int64_t sector); +void bdrv_set_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap, + int64_t cur_sector, int64_t nr_sectors); +void bdrv_reset_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap, + int64_t cur_sector, int64_t nr_sectors); +int64_t bdrv_dirty_iter_next(BdrvDirtyBitmapIter *iter); +void bdrv_set_dirty_iter(BdrvDirtyBitmapIter *hbi, int64_t sector_num); +int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap); +int64_t bdrv_get_meta_dirty_count(BdrvDirtyBitmap *bitmap); +void bdrv_dirty_bitmap_truncate(BlockDriverState *bs); + #endif diff --git a/include/block/nbd.h b/include/block/nbd.h index 416257abca..6d75d5a670 100644 --- a/include/block/nbd.h +++ b/include/block/nbd.h @@ -123,12 +123,8 @@ enum { * aren't overflowing some other buffer. 
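The *_locked variants exported above let a caller hold dirty_bitmap_mutex across a whole test-and-clear sequence instead of locking once per call. A sketch of the intended pattern, mirroring the block-migration conversion later in this series:

static void example_consume_dirty(BlockDriverState *bs,
                                  BdrvDirtyBitmap *bitmap,
                                  int64_t sector, int64_t nr_sectors)
{
    bdrv_dirty_bitmap_lock(bitmap);
    if (bdrv_get_dirty_locked(bs, bitmap, sector)) {
        /* clear the range in the same critical section that tested it */
        bdrv_reset_dirty_bitmap_locked(bitmap, sector, nr_sectors);
    }
    bdrv_dirty_bitmap_unlock(bitmap);
}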
*/ #define NBD_MAX_NAME_SIZE 256 -ssize_t nbd_wr_syncv(QIOChannel *ioc, - struct iovec *iov, - size_t niov, - size_t length, - bool do_read, - Error **errp); +ssize_t nbd_rwv(QIOChannel *ioc, struct iovec *iov, size_t niov, size_t length, + bool do_read, Error **errp); int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, QCryptoTLSCreds *tlscreds, const char *hostname, QIOChannel **outioc, @@ -162,7 +158,7 @@ void nbd_client_new(NBDExport *exp, QIOChannelSocket *sioc, QCryptoTLSCreds *tlscreds, const char *tlsaclname, - void (*close)(NBDClient *)); + void (*close_fn)(NBDClient *, bool)); void nbd_client_get(NBDClient *client); void nbd_client_put(NBDClient *client); diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h index 87ae10bcc9..724ec73dce 100644 --- a/include/exec/exec-all.h +++ b/include/exec/exec-all.h @@ -301,7 +301,7 @@ static inline void tlb_flush_by_mmuidx_all_cpus_synced(CPUState *cpu, #define CODE_GEN_AVG_BLOCK_SIZE 150 #endif -#if defined(__arm__) || defined(_ARCH_PPC) \ +#if defined(_ARCH_PPC) \ || defined(__x86_64__) || defined(__i386__) \ || defined(__sparc__) || defined(__aarch64__) \ || defined(__s390x__) || defined(__mips__) \ @@ -401,9 +401,6 @@ static inline void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr) #elif defined(__aarch64__) void aarch64_tb_set_jmp_target(uintptr_t jmp_addr, uintptr_t addr); #define tb_set_jmp_target1 aarch64_tb_set_jmp_target -#elif defined(__arm__) -void arm_tb_set_jmp_target(uintptr_t jmp_addr, uintptr_t addr); -#define tb_set_jmp_target1 arm_tb_set_jmp_target #elif defined(__sparc__) || defined(__mips__) void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr); #else diff --git a/include/exec/memory.h b/include/exec/memory.h index 80e605a96a..37f8e78e71 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -456,6 +456,26 @@ void memory_region_init_ram_from_file(MemoryRegion *mr, bool share, const char *path, Error **errp); + +/** + * memory_region_init_ram_from_fd: Initialize RAM memory region with a + * mmap-ed backend. + * + * @mr: the #MemoryRegion to be initialized. + * @owner: the object that tracks the region's reference count + * @name: the name of the region. + * @size: size of the region. + * @share: %true if memory must be mmaped with the MAP_SHARED flag + * @fd: the fd to mmap. + * @errp: pointer to Error*, to store an error if it happens. + */ +void memory_region_init_ram_from_fd(MemoryRegion *mr, + struct Object *owner, + const char *name, + uint64_t size, + bool share, + int fd, + Error **errp); #endif /** @@ -805,17 +825,6 @@ static inline bool memory_region_is_rom(MemoryRegion *mr) int memory_region_get_fd(MemoryRegion *mr); /** - * memory_region_set_fd: Mark a RAM memory region as backed by a - * file descriptor. - * - * This function is typically used after memory_region_init_ram_ptr(). - * - * @mr: the memory region being queried. - * @fd: the file descriptor that backs @mr. - */ -void memory_region_set_fd(MemoryRegion *mr, int fd); - -/** * memory_region_from_host: Convert a pointer into a RAM memory region * and an offset within it. 
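A hedged sketch of how the new fd-based initializer declared above might be used, with a Linux memfd standing in for the backing object; memfd_create(2), the region name, and the error handling are illustrative, not part of this patch:

#define _GNU_SOURCE
#include <sys/mman.h>   /* memfd_create */
#include <unistd.h>     /* ftruncate */

static void example_init_shared_ram(MemoryRegion *mr, Object *owner,
                                    uint64_t size, Error **errp)
{
    int fd = memfd_create("example-ram", 0);

    if (fd < 0 || ftruncate(fd, size) < 0) {
        /* real code would fill *errp and close fd as appropriate */
        return;
    }
    /* share=true asks for a MAP_SHARED mapping, per the comment above */
    memory_region_init_ram_from_fd(mr, owner, "example-ram", size, true,
                                   fd, errp);
}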
* diff --git a/include/exec/poison.h b/include/exec/poison.h index 3ca7929cce..5ffed4d56e 100644 --- a/include/exec/poison.h +++ b/include/exec/poison.h @@ -12,17 +12,28 @@ #pragma GCC poison TARGET_CRIS #pragma GCC poison TARGET_LM32 #pragma GCC poison TARGET_M68K +#pragma GCC poison TARGET_MICROBLAZE #pragma GCC poison TARGET_MIPS +#pragma GCC poison TARGET_ABI_MIPSO32 #pragma GCC poison TARGET_MIPS64 +#pragma GCC poison TARGET_ABI_MIPSN64 +#pragma GCC poison TARGET_MOXIE +#pragma GCC poison TARGET_NIOS2 #pragma GCC poison TARGET_OPENRISC #pragma GCC poison TARGET_PPC #pragma GCC poison TARGET_PPCEMB #pragma GCC poison TARGET_PPC64 #pragma GCC poison TARGET_ABI32 +#pragma GCC poison TARGET_S390X #pragma GCC poison TARGET_SH4 #pragma GCC poison TARGET_SPARC #pragma GCC poison TARGET_SPARC64 +#pragma GCC poison TARGET_TRICORE +#pragma GCC poison TARGET_UNICORE32 +#pragma GCC poison TARGET_XTENSA +#pragma GCC poison TARGET_NAME +#pragma GCC poison TARGET_SUPPORTS_MTTCG #pragma GCC poison TARGET_WORDS_BIGENDIAN #pragma GCC poison BSWAP_NEEDED @@ -50,5 +61,25 @@ #pragma GCC poison CPU_INTERRUPT_TGT_INT_1 #pragma GCC poison CPU_INTERRUPT_TGT_INT_2 +#pragma GCC poison CONFIG_ALPHA_DIS +#pragma GCC poison CONFIG_ARM_A64_DIS +#pragma GCC poison CONFIG_ARM_DIS +#pragma GCC poison CONFIG_CRIS_DIS +#pragma GCC poison CONFIG_I386_DIS +#pragma GCC poison CONFIG_LM32_DIS +#pragma GCC poison CONFIG_M68K_DIS +#pragma GCC poison CONFIG_MICROBLAZE_DIS +#pragma GCC poison CONFIG_MIPS_DIS +#pragma GCC poison CONFIG_MOXIE_DIS +#pragma GCC poison CONFIG_NIOS2_DIS +#pragma GCC poison CONFIG_PPC_DIS +#pragma GCC poison CONFIG_S390_DIS +#pragma GCC poison CONFIG_SH4_DIS +#pragma GCC poison CONFIG_SPARC_DIS +#pragma GCC poison CONFIG_XTENSA_DIS + +#pragma GCC poison CONFIG_LINUX_USER +#pragma GCC poison CONFIG_VHOST_NET + #endif #endif diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h index 140efa840c..73d1bea8b6 100644 --- a/include/exec/ram_addr.h +++ b/include/exec/ram_addr.h @@ -65,6 +65,9 @@ unsigned long last_ram_page(void); RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr, bool share, const char *mem_path, Error **errp); +RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr, + bool share, int fd, + Error **errp); RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host, MemoryRegion *mr, Error **errp); RAMBlock *qemu_ram_alloc(ram_addr_t size, MemoryRegion *mr, Error **errp); diff --git a/include/exec/tb-context.h b/include/exec/tb-context.h index c7f17f26e0..25c2afe753 100644 --- a/include/exec/tb-context.h +++ b/include/exec/tb-context.h @@ -31,8 +31,9 @@ typedef struct TBContext TBContext; struct TBContext { - TranslationBlock *tbs; + TranslationBlock **tbs; struct qht htable; + size_t tbs_size; int nb_tbs; /* any access to the tbs or the page table must use this lock */ QemuMutex tb_lock; diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index d071c9c0e9..233216abdc 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -384,6 +384,11 @@ bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *); #define PC_COMPAT_2_9 \ HW_COMPAT_2_9 \ + {\ + .driver = "mch",\ + .property = "extended-tseg-mbytes",\ + .value = stringify(0),\ + },\ #define PC_COMPAT_2_8 \ HW_COMPAT_2_8 \ diff --git a/include/hw/pci-host/q35.h b/include/hw/pci-host/q35.h index 53b6760c16..58983c00b3 100644 --- a/include/hw/pci-host/q35.h +++ b/include/hw/pci-host/q35.h @@ -60,6 +60,7 @@ typedef struct MCHPCIState { uint64_t above_4g_mem_size; uint64_t pci_hole64_size; 
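Turning TBContext's tbs into an array of TranslationBlock pointers with a separate tbs_size capacity lets the translation-block table grow on demand instead of being sized up front. A plausible growth helper under the new layout; the doubling policy and initial capacity here are assumptions, and the real allocator lives in translate-all.c, outside this excerpt:

static void example_tb_append(TBContext *ctx, TranslationBlock *tb)
{
    if ((size_t)ctx->nb_tbs == ctx->tbs_size) {
        size_t new_size = ctx->tbs_size ? ctx->tbs_size * 2 : 1024;

        ctx->tbs = g_renew(TranslationBlock *, ctx->tbs, new_size);
        ctx->tbs_size = new_size;
    }
    ctx->tbs[ctx->nb_tbs++] = tb;
}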
uint32_t short_root_bus; + uint16_t ext_tseg_mbytes; } MCHPCIState; typedef struct Q35PCIHost { @@ -91,6 +92,11 @@ typedef struct Q35PCIHost { /* D0:F0 configuration space */ #define MCH_HOST_BRIDGE_REVISION_DEFAULT 0x0 +#define MCH_HOST_BRIDGE_EXT_TSEG_MBYTES 0x50 +#define MCH_HOST_BRIDGE_EXT_TSEG_MBYTES_SIZE 2 +#define MCH_HOST_BRIDGE_EXT_TSEG_MBYTES_QUERY 0xffff +#define MCH_HOST_BRIDGE_EXT_TSEG_MBYTES_MAX 0xfff + #define MCH_HOST_BRIDGE_PCIEXBAR 0x60 /* 64bit register */ #define MCH_HOST_BRIDGE_PCIEXBAR_SIZE 8 /* 64bit register */ #define MCH_HOST_BRIDGE_PCIEXBAR_DEFAULT 0xb0000000 diff --git a/include/hw/virtio/vhost-user-scsi.h b/include/hw/virtio/vhost-user-scsi.h new file mode 100644 index 0000000000..01861f78d0 --- /dev/null +++ b/include/hw/virtio/vhost-user-scsi.h @@ -0,0 +1,35 @@ +/* + * vhost-user-scsi host device + * + * Copyright (c) 2016 Nutanix Inc. All rights reserved. + * + * Author: + * Felipe Franciosi <felipe@nutanix.com> + * + * This file is largely based on "vhost-scsi.h" by: + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#ifndef VHOST_USER_SCSI_H +#define VHOST_USER_SCSI_H + +#include "qemu-common.h" +#include "hw/qdev.h" +#include "hw/virtio/virtio-scsi.h" +#include "hw/virtio/vhost.h" +#include "hw/virtio/vhost-scsi-common.h" + +#define TYPE_VHOST_USER_SCSI "vhost-user-scsi" +#define VHOST_USER_SCSI(obj) \ + OBJECT_CHECK(VHostUserSCSI, (obj), TYPE_VHOST_USER_SCSI) + +typedef struct VHostUserSCSI { + VHostSCSICommon parent_obj; + uint64_t host_features; +} VHostUserSCSI; + +#endif /* VHOST_USER_SCSI_H */ diff --git a/include/hw/virtio/virtio-scsi.h b/include/hw/virtio/virtio-scsi.h index eac2013ddd..de6ae5a9f6 100644 --- a/include/hw/virtio/virtio-scsi.h +++ b/include/hw/virtio/virtio-scsi.h @@ -21,6 +21,7 @@ #include "hw/virtio/virtio.h" #include "hw/pci/pci.h" #include "hw/scsi/scsi.h" +#include "chardev/char-fe.h" #include "sysemu/iothread.h" #define TYPE_VIRTIO_SCSI_COMMON "virtio-scsi-common" @@ -53,6 +54,7 @@ struct VirtIOSCSIConf { char *vhostfd; char *wwpn; #endif + CharBackend chardev; uint32_t boot_tpgt; IOThread *iothread; }; diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h index fb008a2e65..85596341fa 100644 --- a/include/qemu/osdep.h +++ b/include/qemu/osdep.h @@ -483,4 +483,7 @@ char *qemu_get_pid_name(pid_t pid); */ pid_t qemu_fork(Error **errp); +extern int qemu_icache_linesize; +extern int qemu_dcache_linesize; + #endif diff --git a/include/qemu/stats64.h b/include/qemu/stats64.h new file mode 100644 index 0000000000..4a357b3e9d --- /dev/null +++ b/include/qemu/stats64.h @@ -0,0 +1,193 @@ +/* + * Atomic operations on 64-bit quantities. + * + * Copyright (C) 2017 Red Hat, Inc. + * + * Author: Paolo Bonzini <pbonzini@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#ifndef QEMU_STATS64_H +#define QEMU_STATS64_H 1 + +#include "qemu/atomic.h" + +/* This provides atomic operations on 64-bit type, using a reader-writer + * spinlock on architectures that do not have 64-bit accesses. Even on + * those architectures, it tries hard not to take the lock. 
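The EXT_TSEG constants above imply a small query protocol on the 16-bit register at D0:F0 config offset 0x50: firmware writes the reserved pattern 0xffff, reads the register back, and any other value is the extended TSEG size in MiB. A hedged firmware-side sketch; pci_cfg_writew()/pci_cfg_readw() are hypothetical config-space accessors:

static unsigned example_query_ext_tseg_mbytes(void)
{
    uint16_t val;

    pci_cfg_writew(0 /* D0:F0 */, MCH_HOST_BRIDGE_EXT_TSEG_MBYTES,
                   MCH_HOST_BRIDGE_EXT_TSEG_MBYTES_QUERY);
    val = pci_cfg_readw(0, MCH_HOST_BRIDGE_EXT_TSEG_MBYTES);

    /* reading the query pattern back means the feature is absent */
    return val == MCH_HOST_BRIDGE_EXT_TSEG_MBYTES_QUERY ? 0 : val;
}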
+ */ + +typedef struct Stat64 { +#ifdef CONFIG_ATOMIC64 + uint64_t value; +#else + uint32_t low, high; + uint32_t lock; +#endif +} Stat64; + +#ifdef CONFIG_ATOMIC64 +static inline void stat64_init(Stat64 *s, uint64_t value) +{ + /* This is not guaranteed to be atomic! */ + *s = (Stat64) { value }; +} + +static inline uint64_t stat64_get(const Stat64 *s) +{ + return atomic_read__nocheck(&s->value); +} + +static inline void stat64_add(Stat64 *s, uint64_t value) +{ + atomic_add(&s->value, value); +} + +static inline void stat64_min(Stat64 *s, uint64_t value) +{ + uint64_t orig = atomic_read__nocheck(&s->value); + while (orig > value) { + orig = atomic_cmpxchg__nocheck(&s->value, orig, value); + } +} + +static inline void stat64_max(Stat64 *s, uint64_t value) +{ + uint64_t orig = atomic_read__nocheck(&s->value); + while (orig < value) { + orig = atomic_cmpxchg__nocheck(&s->value, orig, value); + } +} +#else +uint64_t stat64_get(const Stat64 *s); +bool stat64_min_slow(Stat64 *s, uint64_t value); +bool stat64_max_slow(Stat64 *s, uint64_t value); +bool stat64_add32_carry(Stat64 *s, uint32_t low, uint32_t high); + +static inline void stat64_init(Stat64 *s, uint64_t value) +{ + /* This is not guaranteed to be atomic! */ + *s = (Stat64) { .low = value, .high = value >> 32, .lock = 0 }; +} + +static inline void stat64_add(Stat64 *s, uint64_t value) +{ + uint32_t low, high; + high = value >> 32; + low = (uint32_t) value; + if (!low) { + if (high) { + atomic_add(&s->high, high); + } + return; + } + + for (;;) { + uint32_t orig = s->low; + uint32_t result = orig + low; + uint32_t old; + + if (result < low || high) { + /* If the high part is affected, take the lock. */ + if (stat64_add32_carry(s, low, high)) { + return; + } + continue; + } + + /* No carry, try with a 32-bit cmpxchg. The result is independent of + * the high 32 bits, so it can race just fine with stat64_add32_carry + * and even stat64_get! + */ + old = atomic_cmpxchg(&s->low, orig, result); + if (orig == old) { + return; + } + } +} + +static inline void stat64_min(Stat64 *s, uint64_t value) +{ + uint32_t low, high; + uint32_t orig_low, orig_high; + + high = value >> 32; + low = (uint32_t) value; + do { + orig_high = atomic_read(&s->high); + if (orig_high < high) { + return; + } + + if (orig_high == high) { + /* High 32 bits are equal. Read low after high, otherwise we + * can get a false positive (e.g. 0x1235,0x0000 changes to + * 0x1234,0x8000 and we read it as 0x1234,0x0000). Pairs with + * the write barrier in stat64_min_slow. + */ + smp_rmb(); + orig_low = atomic_read(&s->low); + if (orig_low <= low) { + return; + } + + /* See if we were lucky and a writer raced against us. The + * barrier is theoretically unnecessary, but if we remove it + * we may miss being lucky. + */ + smp_rmb(); + orig_high = atomic_read(&s->high); + if (orig_high < high) { + return; + } + } + + /* If the value changes in any way, we have to take the lock. */ + } while (!stat64_min_slow(s, value)); +} + +static inline void stat64_max(Stat64 *s, uint64_t value) +{ + uint32_t low, high; + uint32_t orig_low, orig_high; + + high = value >> 32; + low = (uint32_t) value; + do { + orig_high = atomic_read(&s->high); + if (orig_high > high) { + return; + } + + if (orig_high == high) { + /* High 32 bits are equal. Read low after high, otherwise we + * can get a false positive (e.g. 0x1234,0x8000 changes to + * 0x1235,0x0000 and we read it as 0x1235,0x8000). Pairs with + * the write barrier in stat64_max_slow. 
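Callers never see the fallback machinery: a converted statistic such as block_int.h's wr_highest_offset is updated and read with the same two calls on every host. A minimal usage sketch:

static void example_account_write(Stat64 *wr_highest_offset,
                                  uint64_t offset, uint64_t bytes)
{
    /* lock-free monotonic maximum, safe from any thread */
    stat64_max(wr_highest_offset, offset + bytes);
}

static uint64_t example_read_highest(Stat64 *wr_highest_offset)
{
    return stat64_get(wr_highest_offset);
}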
+ */ + smp_rmb(); + orig_low = atomic_read(&s->low); + if (orig_low >= low) { + return; + } + + /* See if we were lucky and a writer raced against us. The + * barrier is theoretically unnecessary, but if we remove it + * we may miss being lucky. + */ + smp_rmb(); + orig_high = atomic_read(&s->high); + if (orig_high > high) { + return; + } + } + + /* If the value changes in any way, we have to take the lock. */ + } while (!stat64_max_slow(s, value)); +} + +#endif + +#endif diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h index 840ad6134c..999eb2333a 100644 --- a/include/sysemu/block-backend.h +++ b/include/sysemu/block-backend.h @@ -72,15 +72,13 @@ typedef struct BlockDevOps { * fields that must be public. This is in particular for QLIST_ENTRY() and * friends so that BlockBackends can be kept in lists outside block-backend.c */ typedef struct BlockBackendPublic { - /* I/O throttling has its own locking, but also some fields are - * protected by the AioContext lock. - */ - - /* Protected by AioContext lock. */ + /* throttled_reqs_lock protects the CoQueues for throttled requests. */ + CoMutex throttled_reqs_lock; CoQueue throttled_reqs[2]; /* Nonzero if the I/O limits are currently being ignored; generally - * it is zero. */ + * it is zero. Accessed with atomic operations. + */ unsigned int io_limits_disabled; /* The following fields are protected by the ThrottleGroup lock. diff --git a/include/ui/spice-display.h b/include/ui/spice-display.h index 184d4c373a..4ba9444dba 100644 --- a/include/ui/spice-display.h +++ b/include/ui/spice-display.h @@ -140,6 +140,8 @@ struct SimpleSpiceCursor { QXLCursor cursor; }; +extern bool spice_opengl; + int qemu_spice_rect_is_empty(const QXLRect* r); void qemu_spice_rect_union(QXLRect *dest, const QXLRect *r); @@ -1397,6 +1397,22 @@ void memory_region_init_ram_from_file(MemoryRegion *mr, mr->ram_block = qemu_ram_alloc_from_file(size, mr, share, path, errp); mr->dirty_log_mask = tcg_enabled() ? (1 << DIRTY_MEMORY_CODE) : 0; } + +void memory_region_init_ram_from_fd(MemoryRegion *mr, + struct Object *owner, + const char *name, + uint64_t size, + bool share, + int fd, + Error **errp) +{ + memory_region_init(mr, owner, name, size); + mr->ram = true; + mr->terminates = true; + mr->destructor = memory_region_destructor_ram; + mr->ram_block = qemu_ram_alloc_from_fd(size, mr, share, fd, errp); + mr->dirty_log_mask = tcg_enabled() ? 
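block-backend.h above moves the throttled-request CoQueues from AioContext-lock protection to a dedicated CoMutex. A coroutine that exceeds its I/O limit would then park itself roughly as follows; this is a sketch assuming the qemu_co_queue_wait() variant that releases the given mutex while the coroutine sleeps:

static void coroutine_fn example_throttle_wait(BlockBackendPublic *blkp,
                                               bool is_write)
{
    qemu_co_mutex_lock(&blkp->throttled_reqs_lock);
    /* drops the lock while queued, reacquires it on wakeup */
    qemu_co_queue_wait(&blkp->throttled_reqs[is_write],
                       &blkp->throttled_reqs_lock);
    qemu_co_mutex_unlock(&blkp->throttled_reqs_lock);
}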
(1 << DIRTY_MEMORY_CODE) : 0; +} #endif void memory_region_init_ram_ptr(MemoryRegion *mr, @@ -1835,16 +1851,6 @@ int memory_region_get_fd(MemoryRegion *mr) return fd; } -void memory_region_set_fd(MemoryRegion *mr, int fd) -{ - rcu_read_lock(); - while (mr->alias) { - mr = mr->alias; - } - mr->ram_block->fd = fd; - rcu_read_unlock(); -} - void *memory_region_get_ram_ptr(MemoryRegion *mr) { void *ptr; diff --git a/migration/block.c b/migration/block.c index 3aae5a375e..7674ae1078 100644 --- a/migration/block.c +++ b/migration/block.c @@ -341,10 +341,8 @@ static int set_dirty_tracking(void) int ret; QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { - aio_context_acquire(blk_get_aio_context(bmds->blk)); bmds->dirty_bitmap = bdrv_create_dirty_bitmap(blk_bs(bmds->blk), BLOCK_SIZE, NULL, NULL); - aio_context_release(blk_get_aio_context(bmds->blk)); if (!bmds->dirty_bitmap) { ret = -errno; goto fail; @@ -355,9 +353,7 @@ static int set_dirty_tracking(void) fail: QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { if (bmds->dirty_bitmap) { - aio_context_acquire(blk_get_aio_context(bmds->blk)); bdrv_release_dirty_bitmap(blk_bs(bmds->blk), bmds->dirty_bitmap); - aio_context_release(blk_get_aio_context(bmds->blk)); } } return ret; @@ -370,9 +366,7 @@ static void unset_dirty_tracking(void) BlkMigDevState *bmds; QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { - aio_context_acquire(blk_get_aio_context(bmds->blk)); bdrv_release_dirty_bitmap(blk_bs(bmds->blk), bmds->dirty_bitmap); - aio_context_release(blk_get_aio_context(bmds->blk)); } } @@ -531,13 +525,16 @@ static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds, } else { blk_mig_unlock(); } - if (bdrv_get_dirty(bs, bmds->dirty_bitmap, sector)) { - + bdrv_dirty_bitmap_lock(bmds->dirty_bitmap); + if (bdrv_get_dirty_locked(bs, bmds->dirty_bitmap, sector)) { if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) { nr_sectors = total_sectors - sector; } else { nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK; } + bdrv_reset_dirty_bitmap_locked(bmds->dirty_bitmap, sector, nr_sectors); + bdrv_dirty_bitmap_unlock(bmds->dirty_bitmap); + blk = g_new(BlkMigBlock, 1); blk->buf = g_malloc(BLOCK_SIZE); blk->bmds = bmds; @@ -570,12 +567,12 @@ static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds, g_free(blk); } - bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, sector, nr_sectors); sector += nr_sectors; bmds->cur_dirty = sector; - break; } + + bdrv_dirty_bitmap_unlock(bmds->dirty_bitmap); sector += BDRV_SECTORS_PER_DIRTY_CHUNK; bmds->cur_dirty = sector; } diff --git a/migration/colo.c b/migration/colo.c index c436d63a19..c4ba4c328b 100644 --- a/migration/colo.c +++ b/migration/colo.c @@ -352,7 +352,7 @@ static int colo_do_checkpoint_transaction(MigrationState *s, qemu_savevm_state_header(fb); qemu_savevm_state_begin(fb); qemu_mutex_lock_iothread(); - qemu_savevm_state_complete_precopy(fb, false); + qemu_savevm_state_complete_precopy(fb, false, false); qemu_mutex_unlock_iothread(); qemu_fflush(fb); diff --git a/migration/migration.c b/migration/migration.c index b9d8798ed1..f588329f4c 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1553,7 +1553,7 @@ static int postcopy_start(MigrationState *ms, bool *old_vm_running) * Cause any non-postcopiable, but iterative devices to * send out their final data. 
*/ - qemu_savevm_state_complete_precopy(ms->to_dst_file, true); + qemu_savevm_state_complete_precopy(ms->to_dst_file, true, false); /* * in Finish migrate and with the io-lock held everything should @@ -1597,7 +1597,7 @@ static int postcopy_start(MigrationState *ms, bool *old_vm_running) */ qemu_savevm_send_postcopy_listen(fb); - qemu_savevm_state_complete_precopy(fb, false); + qemu_savevm_state_complete_precopy(fb, false, false); qemu_savevm_send_ping(fb, 3); qemu_savevm_send_postcopy_run(fb); @@ -1695,20 +1695,15 @@ static void migration_completion(MigrationState *s, int current_active_state, ret = global_state_store(); if (!ret) { + bool inactivate = !migrate_colo_enabled(); ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); if (ret >= 0) { qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX); - qemu_savevm_state_complete_precopy(s->to_dst_file, false); + ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false, + inactivate); } - /* - * Don't mark the image with BDRV_O_INACTIVE flag if - * we will go into COLO stage later. - */ - if (ret >= 0 && !migrate_colo_enabled()) { - ret = bdrv_inactivate_all(); - if (ret >= 0) { - s->block_inactive = true; - } + if (inactivate && ret >= 0) { + s->block_inactive = true; } } qemu_mutex_unlock_iothread(); diff --git a/migration/savevm.c b/migration/savevm.c index f32a82de05..6bfd4893e0 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -1104,7 +1104,8 @@ void qemu_savevm_state_complete_postcopy(QEMUFile *f) qemu_fflush(f); } -void qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only) +int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only, + bool inactivate_disks) { QJSON *vmdesc; int vmdesc_len; @@ -1138,12 +1139,12 @@ void qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only) save_section_footer(f, se); if (ret < 0) { qemu_file_set_error(f, ret); - return; + return -1; } } if (iterable_only) { - return; + return 0; } vmdesc = qjson_new(); @@ -1173,6 +1174,15 @@ void qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only) json_end_object(vmdesc); } + if (inactivate_disks) { + /* Inactivate before sending QEMU_VM_EOF so that the + * bdrv_invalidate_cache_all() on the other end won't fail. 
*/ + ret = bdrv_inactivate_all(); + if (ret) { + qemu_file_set_error(f, ret); + return ret; + } + } if (!in_postcopy) { /* Postcopy stream will still be going */ qemu_put_byte(f, QEMU_VM_EOF); @@ -1190,6 +1200,7 @@ void qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only) qjson_destroy(vmdesc); qemu_fflush(f); + return 0; } /* Give an estimate of the amount left to be transferred, @@ -1263,7 +1274,7 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp) ret = qemu_file_get_error(f); if (ret == 0) { - qemu_savevm_state_complete_precopy(f, false); + qemu_savevm_state_complete_precopy(f, false, false); ret = qemu_file_get_error(f); } qemu_savevm_state_cleanup(); diff --git a/migration/savevm.h b/migration/savevm.h index 45b59c19bc..5a2ed1161d 100644 --- a/migration/savevm.h +++ b/migration/savevm.h @@ -35,7 +35,8 @@ void qemu_savevm_state_header(QEMUFile *f); int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy); void qemu_savevm_state_cleanup(void); void qemu_savevm_state_complete_postcopy(QEMUFile *f); -void qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only); +int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only, + bool inactivate_disks); void qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size, uint64_t *res_non_postcopiable, uint64_t *res_postcopiable); diff --git a/nbd/client.c b/nbd/client.c index 595d99ed30..b97143fa60 100644 --- a/nbd/client.c +++ b/nbd/client.c @@ -86,32 +86,6 @@ static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports); */ -/* Discard length bytes from channel. Return -errno on failure and 0 on - * success*/ -static int drop_sync(QIOChannel *ioc, size_t size, Error **errp) -{ - ssize_t ret = 0; - char small[1024]; - char *buffer; - - buffer = sizeof(small) >= size ? small : g_malloc(MIN(65536, size)); - while (size > 0) { - ssize_t count = MIN(65536, size); - ret = read_sync(ioc, buffer, MIN(65536, size), errp); - - if (ret < 0) { - goto cleanup; - } - size -= count; - } - - cleanup: - if (buffer != small) { - g_free(buffer); - } - return ret; -} - /* Send an option request. 
* * The request is for option @opt, with @data containing @len bytes of @@ -135,12 +109,12 @@ static int nbd_send_option_request(QIOChannel *ioc, uint32_t opt, stl_be_p(&req.option, opt); stl_be_p(&req.length, len); - if (write_sync(ioc, &req, sizeof(req), errp) < 0) { + if (nbd_write(ioc, &req, sizeof(req), errp) < 0) { error_prepend(errp, "Failed to send option request header"); return -1; } - if (len && write_sync(ioc, (char *) data, len, errp) < 0) { + if (len && nbd_write(ioc, (char *) data, len, errp) < 0) { error_prepend(errp, "Failed to send option request data"); return -1; } @@ -169,7 +143,7 @@ static int nbd_receive_option_reply(QIOChannel *ioc, uint32_t opt, nbd_opt_reply *reply, Error **errp) { QEMU_BUILD_BUG_ON(sizeof(*reply) != 20); - if (read_sync(ioc, reply, sizeof(*reply), errp) < 0) { + if (nbd_read(ioc, reply, sizeof(*reply), errp) < 0) { error_prepend(errp, "failed to read option reply"); nbd_send_opt_abort(ioc); return -1; @@ -218,7 +192,7 @@ static int nbd_handle_reply_err(QIOChannel *ioc, nbd_opt_reply *reply, goto cleanup; } msg = g_malloc(reply->length + 1); - if (read_sync(ioc, msg, reply->length, errp) < 0) { + if (nbd_read(ioc, msg, reply->length, errp) < 0) { error_prepend(errp, "failed to read option error message"); goto cleanup; } @@ -320,7 +294,7 @@ static int nbd_receive_list(QIOChannel *ioc, const char *want, bool *match, nbd_send_opt_abort(ioc); return -1; } - if (read_sync(ioc, &namelen, sizeof(namelen), errp) < 0) { + if (nbd_read(ioc, &namelen, sizeof(namelen), errp) < 0) { error_prepend(errp, "failed to read option name length"); nbd_send_opt_abort(ioc); return -1; @@ -333,7 +307,7 @@ static int nbd_receive_list(QIOChannel *ioc, const char *want, bool *match, return -1; } if (namelen != strlen(want)) { - if (drop_sync(ioc, len, errp) < 0) { + if (nbd_drop(ioc, len, errp) < 0) { error_prepend(errp, "failed to skip export name with wrong length"); nbd_send_opt_abort(ioc); return -1; @@ -342,14 +316,14 @@ static int nbd_receive_list(QIOChannel *ioc, const char *want, bool *match, } assert(namelen < sizeof(name)); - if (read_sync(ioc, name, namelen, errp) < 0) { + if (nbd_read(ioc, name, namelen, errp) < 0) { error_prepend(errp, "failed to read export name"); nbd_send_opt_abort(ioc); return -1; } name[namelen] = '\0'; len -= namelen; - if (drop_sync(ioc, len, errp) < 0) { + if (nbd_drop(ioc, len, errp) < 0) { error_prepend(errp, "failed to read export description"); nbd_send_opt_abort(ioc); return -1; @@ -476,7 +450,7 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, goto fail; } - if (read_sync(ioc, buf, 8, errp) < 0) { + if (nbd_read(ioc, buf, 8, errp) < 0) { error_prepend(errp, "Failed to read data"); goto fail; } @@ -502,7 +476,7 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, goto fail; } - if (read_sync(ioc, &magic, sizeof(magic), errp) < 0) { + if (nbd_read(ioc, &magic, sizeof(magic), errp) < 0) { error_prepend(errp, "Failed to read magic"); goto fail; } @@ -514,7 +488,7 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, uint16_t globalflags; bool fixedNewStyle = false; - if (read_sync(ioc, &globalflags, sizeof(globalflags), errp) < 0) { + if (nbd_read(ioc, &globalflags, sizeof(globalflags), errp) < 0) { error_prepend(errp, "Failed to read server flags"); goto fail; } @@ -532,7 +506,7 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, } /* client requested flags */ clientflags = cpu_to_be32(clientflags); - if 
(write_sync(ioc, &clientflags, sizeof(clientflags), errp) < 0) { + if (nbd_write(ioc, &clientflags, sizeof(clientflags), errp) < 0) { error_prepend(errp, "Failed to send clientflags field"); goto fail; } @@ -570,13 +544,13 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, } /* Read the response */ - if (read_sync(ioc, &s, sizeof(s), errp) < 0) { + if (nbd_read(ioc, &s, sizeof(s), errp) < 0) { error_prepend(errp, "Failed to read export length"); goto fail; } *size = be64_to_cpu(s); - if (read_sync(ioc, flags, sizeof(*flags), errp) < 0) { + if (nbd_read(ioc, flags, sizeof(*flags), errp) < 0) { error_prepend(errp, "Failed to read export flags"); goto fail; } @@ -593,14 +567,14 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, goto fail; } - if (read_sync(ioc, &s, sizeof(s), errp) < 0) { + if (nbd_read(ioc, &s, sizeof(s), errp) < 0) { error_prepend(errp, "Failed to read export length"); goto fail; } *size = be64_to_cpu(s); TRACE("Size is %" PRIu64, *size); - if (read_sync(ioc, &oldflags, sizeof(oldflags), errp) < 0) { + if (nbd_read(ioc, &oldflags, sizeof(oldflags), errp) < 0) { error_prepend(errp, "Failed to read export flags"); goto fail; } @@ -616,7 +590,7 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, } TRACE("Size is %" PRIu64 ", export flags %" PRIx16, *size, *flags); - if (zeroes && drop_sync(ioc, 124, errp) < 0) { + if (zeroes && nbd_drop(ioc, 124, errp) < 0) { error_prepend(errp, "Failed to read reserved block"); goto fail; } @@ -759,7 +733,7 @@ ssize_t nbd_send_request(QIOChannel *ioc, NBDRequest *request) stq_be_p(buf + 16, request->from); stl_be_p(buf + 24, request->len); - return write_sync(ioc, buf, sizeof(buf), NULL); + return nbd_write(ioc, buf, sizeof(buf), NULL); } ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply, Error **errp) @@ -768,7 +742,7 @@ ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply, Error **errp) uint32_t magic; ssize_t ret; - ret = read_sync_eof(ioc, buf, sizeof(buf), errp); + ret = nbd_read_eof(ioc, buf, sizeof(buf), errp); if (ret <= 0) { return ret; } diff --git a/nbd/common.c b/nbd/common.c index bd81637ab9..6b5c1b7b02 100644 --- a/nbd/common.c +++ b/nbd/common.c @@ -24,12 +24,8 @@ * The function may be called from coroutine or from non-coroutine context. * When called from non-coroutine context @ioc must be in blocking mode. */ -ssize_t nbd_wr_syncv(QIOChannel *ioc, - struct iovec *iov, - size_t niov, - size_t length, - bool do_read, - Error **errp) +ssize_t nbd_rwv(QIOChannel *ioc, struct iovec *iov, size_t niov, size_t length, + bool do_read, Error **errp) { ssize_t done = 0; struct iovec *local_iov = g_new(struct iovec, niov); @@ -69,6 +65,32 @@ ssize_t nbd_wr_syncv(QIOChannel *ioc, return done; } +/* Discard length bytes from channel. Return -errno on failure and 0 on + * success */ +int nbd_drop(QIOChannel *ioc, size_t size, Error **errp) +{ + ssize_t ret = 0; + char small[1024]; + char *buffer; + + buffer = sizeof(small) >= size ? 
small : g_malloc(MIN(65536, size)); + while (size > 0) { + ssize_t count = MIN(65536, size); + ret = nbd_read(ioc, buffer, MIN(65536, size), errp); + + if (ret < 0) { + goto cleanup; + } + size -= count; + } + + cleanup: + if (buffer != small) { + g_free(buffer); + } + return ret; +} + void nbd_tls_handshake(QIOTask *task, void *opaque) diff --git a/nbd/nbd-internal.h b/nbd/nbd-internal.h index d6071640a0..39bfed177c 100644 --- a/nbd/nbd-internal.h +++ b/nbd/nbd-internal.h @@ -94,14 +94,14 @@ #define NBD_ENOSPC 28 #define NBD_ESHUTDOWN 108 -/* read_sync_eof +/* nbd_read_eof * Tries to read @size bytes from @ioc. Returns number of bytes actually read. * May return a value >= 0 and < size only on EOF, i.e. when iteratively called - * qio_channel_readv() returns 0. So, there are no needs to call read_sync_eof + * qio_channel_readv() returns 0. So, there is no need to call nbd_read_eof * iteratively. */ -static inline ssize_t read_sync_eof(QIOChannel *ioc, void *buffer, size_t size, - Error **errp) +static inline ssize_t nbd_read_eof(QIOChannel *ioc, void *buffer, size_t size, + Error **errp) { struct iovec iov = { .iov_base = buffer, .iov_len = size }; /* Sockets are kept in blocking mode in the negotiation phase. After @@ -109,16 +109,16 @@ static inline ssize_t read_sync_eof(QIOChannel *ioc, void *buffer, size_t size, * our request/reply. Synchronization is done with recv_coroutine, so * that this is coroutine-safe. */ - return nbd_wr_syncv(ioc, &iov, 1, size, true, errp); + return nbd_rwv(ioc, &iov, 1, size, true, errp); } -/* read_sync +/* nbd_read * Reads @size bytes from @ioc. Returns 0 on success. */ -static inline int read_sync(QIOChannel *ioc, void *buffer, size_t size, - Error **errp) +static inline int nbd_read(QIOChannel *ioc, void *buffer, size_t size, + Error **errp) { - ssize_t ret = read_sync_eof(ioc, buffer, size, errp); + ssize_t ret = nbd_read_eof(ioc, buffer, size, errp); if (ret >= 0 && ret != size) { ret = -EINVAL; @@ -128,15 +128,15 @@ static inline int read_sync(QIOChannel *ioc, void *buffer, size_t size, return ret < 0 ? ret : 0; } -/* write_sync +/* nbd_write * Writes @size bytes to @ioc. Returns 0 on success. 
*/ -static inline int write_sync(QIOChannel *ioc, const void *buffer, size_t size, - Error **errp) +static inline int nbd_write(QIOChannel *ioc, const void *buffer, size_t size, + Error **errp) { struct iovec iov = { .iov_base = (void *) buffer, .iov_len = size }; - ssize_t ret = nbd_wr_syncv(ioc, &iov, 1, size, false, errp); + ssize_t ret = nbd_rwv(ioc, &iov, 1, size, false, errp); assert(ret < 0 || ret == size); @@ -153,4 +153,6 @@ struct NBDTLSHandshakeData { void nbd_tls_handshake(QIOTask *task, void *opaque); +int nbd_drop(QIOChannel *ioc, size_t size, Error **errp); + #endif diff --git a/nbd/server.c b/nbd/server.c index 49b55f6ede..8a70c054a6 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -81,7 +81,7 @@ static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports); struct NBDClient { int refcount; - void (*close)(NBDClient *client); + void (*close_fn)(NBDClient *client, bool negotiated); bool no_zeroes; NBDExport *exp; @@ -104,69 +104,6 @@ struct NBDClient { static void nbd_client_receive_next_request(NBDClient *client); -static gboolean nbd_negotiate_continue(QIOChannel *ioc, - GIOCondition condition, - void *opaque) -{ - qemu_coroutine_enter(opaque); - return TRUE; -} - -static int nbd_negotiate_read(QIOChannel *ioc, void *buffer, size_t size) -{ - ssize_t ret; - guint watch; - - assert(qemu_in_coroutine()); - /* Negotiation are always in main loop. */ - watch = qio_channel_add_watch(ioc, - G_IO_IN, - nbd_negotiate_continue, - qemu_coroutine_self(), - NULL); - ret = read_sync(ioc, buffer, size, NULL); - g_source_remove(watch); - return ret; - -} - -static int nbd_negotiate_write(QIOChannel *ioc, const void *buffer, size_t size) -{ - ssize_t ret; - guint watch; - - assert(qemu_in_coroutine()); - /* Negotiation are always in main loop. 
*/ - watch = qio_channel_add_watch(ioc, - G_IO_OUT, - nbd_negotiate_continue, - qemu_coroutine_self(), - NULL); - ret = write_sync(ioc, buffer, size, NULL); - g_source_remove(watch); - return ret; -} - -static int nbd_negotiate_drop_sync(QIOChannel *ioc, size_t size) -{ - ssize_t ret; - uint8_t *buffer = g_malloc(MIN(65536, size)); - - while (size > 0) { - size_t count = MIN(65536, size); - ret = nbd_negotiate_read(ioc, buffer, count); - if (ret < 0) { - g_free(buffer); - return ret; - } - - size -= count; - } - - g_free(buffer); - return 0; -} - /* Basic flow for negotiation Server Client @@ -205,22 +142,22 @@ static int nbd_negotiate_send_rep_len(QIOChannel *ioc, uint32_t type, type, opt, len); magic = cpu_to_be64(NBD_REP_MAGIC); - if (nbd_negotiate_write(ioc, &magic, sizeof(magic)) < 0) { + if (nbd_write(ioc, &magic, sizeof(magic), NULL) < 0) { LOG("write failed (rep magic)"); return -EINVAL; } opt = cpu_to_be32(opt); - if (nbd_negotiate_write(ioc, &opt, sizeof(opt)) < 0) { + if (nbd_write(ioc, &opt, sizeof(opt), NULL) < 0) { LOG("write failed (rep opt)"); return -EINVAL; } type = cpu_to_be32(type); - if (nbd_negotiate_write(ioc, &type, sizeof(type)) < 0) { + if (nbd_write(ioc, &type, sizeof(type), NULL) < 0) { LOG("write failed (rep type)"); return -EINVAL; } len = cpu_to_be32(len); - if (nbd_negotiate_write(ioc, &len, sizeof(len)) < 0) { + if (nbd_write(ioc, &len, sizeof(len), NULL) < 0) { LOG("write failed (rep data length)"); return -EINVAL; } @@ -255,7 +192,7 @@ nbd_negotiate_send_rep_err(QIOChannel *ioc, uint32_t type, if (ret < 0) { goto out; } - if (nbd_negotiate_write(ioc, msg, len) < 0) { + if (nbd_write(ioc, msg, len, NULL) < 0) { LOG("write failed (error message)"); ret = -EIO; } else { @@ -274,27 +211,27 @@ static int nbd_negotiate_send_rep_list(QIOChannel *ioc, NBDExport *exp) uint32_t len; const char *name = exp->name ? exp->name : ""; const char *desc = exp->description ? 
exp->description : ""; - int rc; + int ret; TRACE("Advertising export name '%s' description '%s'", name, desc); name_len = strlen(name); desc_len = strlen(desc); len = name_len + desc_len + sizeof(len); - rc = nbd_negotiate_send_rep_len(ioc, NBD_REP_SERVER, NBD_OPT_LIST, len); - if (rc < 0) { - return rc; + ret = nbd_negotiate_send_rep_len(ioc, NBD_REP_SERVER, NBD_OPT_LIST, len); + if (ret < 0) { + return ret; } len = cpu_to_be32(name_len); - if (nbd_negotiate_write(ioc, &len, sizeof(len)) < 0) { + if (nbd_write(ioc, &len, sizeof(len), NULL) < 0) { LOG("write failed (name length)"); return -EINVAL; } - if (nbd_negotiate_write(ioc, name, name_len) < 0) { + if (nbd_write(ioc, name, name_len, NULL) < 0) { LOG("write failed (name buffer)"); return -EINVAL; } - if (nbd_negotiate_write(ioc, desc, desc_len) < 0) { + if (nbd_write(ioc, desc, desc_len, NULL) < 0) { LOG("write failed (description buffer)"); return -EINVAL; } @@ -308,7 +245,7 @@ static int nbd_negotiate_handle_list(NBDClient *client, uint32_t length) NBDExport *exp; if (length) { - if (nbd_negotiate_drop_sync(client->ioc, length) < 0) { + if (nbd_drop(client->ioc, length, NULL) < 0) { return -EIO; } return nbd_negotiate_send_rep_err(client->ioc, @@ -328,7 +265,6 @@ static int nbd_negotiate_handle_list(NBDClient *client, uint32_t length) static int nbd_negotiate_handle_export_name(NBDClient *client, uint32_t length) { - int rc = -EINVAL; char name[NBD_MAX_NAME_SIZE + 1]; /* Client sends: @@ -337,11 +273,11 @@ static int nbd_negotiate_handle_export_name(NBDClient *client, uint32_t length) TRACE("Checking length"); if (length >= sizeof(name)) { LOG("Bad length received"); - goto fail; + return -EINVAL; } - if (nbd_negotiate_read(client->ioc, name, length) < 0) { + if (nbd_read(client->ioc, name, length, NULL) < 0) { LOG("read failed"); - goto fail; + return -EINVAL; } name[length] = '\0'; @@ -350,14 +286,13 @@ static int nbd_negotiate_handle_export_name(NBDClient *client, uint32_t length) client->exp = nbd_export_find(name); if (!client->exp) { LOG("export not found"); - goto fail; + return -EINVAL; } QTAILQ_INSERT_TAIL(&client->exp->clients, client, next); nbd_export_get(client->exp); - rc = 0; -fail: - return rc; + + return 0; } /* Handle NBD_OPT_STARTTLS. Return NULL to drop connection, or else the @@ -372,7 +307,7 @@ static QIOChannel *nbd_negotiate_handle_starttls(NBDClient *client, TRACE("Setting up TLS"); ioc = client->ioc; if (length) { - if (nbd_negotiate_drop_sync(ioc, length) < 0) { + if (nbd_drop(ioc, length, NULL) < 0) { return NULL; } nbd_negotiate_send_rep_err(ioc, NBD_REP_ERR_INVALID, NBD_OPT_STARTTLS, @@ -436,7 +371,7 @@ static int nbd_negotiate_options(NBDClient *client) ... 
Rest of request */ - if (nbd_negotiate_read(client->ioc, &flags, sizeof(flags)) < 0) { + if (nbd_read(client->ioc, &flags, sizeof(flags), NULL) < 0) { LOG("read failed"); return -EIO; } @@ -462,7 +397,7 @@ static int nbd_negotiate_options(NBDClient *client) uint32_t clientflags, length; uint64_t magic; - if (nbd_negotiate_read(client->ioc, &magic, sizeof(magic)) < 0) { + if (nbd_read(client->ioc, &magic, sizeof(magic), NULL) < 0) { LOG("read failed"); return -EINVAL; } @@ -472,15 +407,15 @@ static int nbd_negotiate_options(NBDClient *client) return -EINVAL; } - if (nbd_negotiate_read(client->ioc, &clientflags, - sizeof(clientflags)) < 0) + if (nbd_read(client->ioc, &clientflags, + sizeof(clientflags), NULL) < 0) { LOG("read failed"); return -EINVAL; } clientflags = be32_to_cpu(clientflags); - if (nbd_negotiate_read(client->ioc, &length, sizeof(length)) < 0) { + if (nbd_read(client->ioc, &length, sizeof(length), NULL) < 0) { LOG("read failed"); return -EINVAL; } @@ -510,7 +445,7 @@ static int nbd_negotiate_options(NBDClient *client) return -EINVAL; default: - if (nbd_negotiate_drop_sync(client->ioc, length) < 0) { + if (nbd_drop(client->ioc, length, NULL) < 0) { return -EIO; } ret = nbd_negotiate_send_rep_err(client->ioc, @@ -548,7 +483,7 @@ static int nbd_negotiate_options(NBDClient *client) return nbd_negotiate_handle_export_name(client, length); case NBD_OPT_STARTTLS: - if (nbd_negotiate_drop_sync(client->ioc, length) < 0) { + if (nbd_drop(client->ioc, length, NULL) < 0) { return -EIO; } if (client->tlscreds) { @@ -567,7 +502,7 @@ static int nbd_negotiate_options(NBDClient *client) } break; default: - if (nbd_negotiate_drop_sync(client->ioc, length) < 0) { + if (nbd_drop(client->ioc, length, NULL) < 0) { return -EIO; } ret = nbd_negotiate_send_rep_err(client->ioc, @@ -598,16 +533,10 @@ static int nbd_negotiate_options(NBDClient *client) } } -typedef struct { - NBDClient *client; - Coroutine *co; -} NBDClientNewData; - -static coroutine_fn int nbd_negotiate(NBDClientNewData *data) +static coroutine_fn int nbd_negotiate(NBDClient *client) { - NBDClient *client = data->client; char buf[8 + 8 + 8 + 128]; - int rc; + int ret; const uint16_t myflags = (NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_TRIM | NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_FUA | NBD_FLAG_SEND_WRITE_ZEROES); @@ -633,7 +562,6 @@ static coroutine_fn int nbd_negotiate(NBDClientNewData *data) */ qio_channel_set_blocking(client->ioc, false, NULL); - rc = -EINVAL; TRACE("Beginning negotiation."); memset(buf, 0, sizeof(buf)); @@ -654,21 +582,21 @@ static coroutine_fn int nbd_negotiate(NBDClientNewData *data) if (oldStyle) { if (client->tlscreds) { TRACE("TLS cannot be enabled with oldstyle protocol"); - goto fail; + return -EINVAL; } - if (nbd_negotiate_write(client->ioc, buf, sizeof(buf)) < 0) { + if (nbd_write(client->ioc, buf, sizeof(buf), NULL) < 0) { LOG("write failed"); - goto fail; + return -EINVAL; } } else { - if (nbd_negotiate_write(client->ioc, buf, 18) < 0) { + if (nbd_write(client->ioc, buf, 18, NULL) < 0) { LOG("write failed"); - goto fail; + return -EINVAL; } - rc = nbd_negotiate_options(client); - if (rc != 0) { + ret = nbd_negotiate_options(client); + if (ret != 0) { LOG("option negotiation failed"); - goto fail; + return ret; } TRACE("advertising size %" PRIu64 " and flags %x", @@ -676,25 +604,25 @@ static coroutine_fn int nbd_negotiate(NBDClientNewData *data) stq_be_p(buf + 18, client->exp->size); stw_be_p(buf + 26, client->exp->nbdflags | myflags); len = client->no_zeroes ? 
10 : sizeof(buf) - 18; - if (nbd_negotiate_write(client->ioc, buf + 18, len) < 0) { + ret = nbd_write(client->ioc, buf + 18, len, NULL); + if (ret < 0) { LOG("write failed"); - goto fail; + return ret; } } TRACE("Negotiation succeeded."); - rc = 0; -fail: - return rc; + + return 0; } -static ssize_t nbd_receive_request(QIOChannel *ioc, NBDRequest *request) +static int nbd_receive_request(QIOChannel *ioc, NBDRequest *request) { uint8_t buf[NBD_REQUEST_SIZE]; uint32_t magic; - ssize_t ret; + int ret; - ret = read_sync(ioc, buf, sizeof(buf), NULL); + ret = nbd_read(ioc, buf, sizeof(buf), NULL); if (ret < 0) { return ret; } @@ -726,7 +654,7 @@ static ssize_t nbd_receive_request(QIOChannel *ioc, NBDRequest *request) return 0; } -static ssize_t nbd_send_reply(QIOChannel *ioc, NBDReply *reply) +static int nbd_send_reply(QIOChannel *ioc, NBDReply *reply) { uint8_t buf[NBD_REPLY_SIZE]; @@ -745,7 +673,7 @@ static ssize_t nbd_send_reply(QIOChannel *ioc, NBDReply *reply) stl_be_p(buf + 4, reply->error); stq_be_p(buf + 8, reply->handle); - return write_sync(ioc, buf, sizeof(buf), NULL); + return nbd_write(ioc, buf, sizeof(buf), NULL); } #define MAX_NBD_REQUESTS 16 @@ -778,7 +706,7 @@ void nbd_client_put(NBDClient *client) } } -static void client_close(NBDClient *client) +static void client_close(NBDClient *client, bool negotiated) { if (client->closing) { return; @@ -793,8 +721,8 @@ static void client_close(NBDClient *client) NULL); /* Also tell the client, so that they release their reference. */ - if (client->close) { - client->close(client); + if (client->close_fn) { + client->close_fn(client, negotiated); } } @@ -975,7 +903,7 @@ void nbd_export_close(NBDExport *exp) nbd_export_get(exp); QTAILQ_FOREACH_SAFE(client, &exp->clients, next, next) { - client_close(client); + client_close(client, true); } nbd_export_set_name(exp, NULL); nbd_export_set_description(exp, NULL); @@ -1032,25 +960,24 @@ void nbd_export_close_all(void) } } -static ssize_t nbd_co_send_reply(NBDRequestData *req, NBDReply *reply, - int len) +static int nbd_co_send_reply(NBDRequestData *req, NBDReply *reply, int len) { NBDClient *client = req->client; - ssize_t rc, ret; + int ret; g_assert(qemu_in_coroutine()); qemu_co_mutex_lock(&client->send_lock); client->send_coroutine = qemu_coroutine_self(); if (!len) { - rc = nbd_send_reply(client->ioc, reply); + ret = nbd_send_reply(client->ioc, reply); } else { qio_channel_set_cork(client->ioc, true); - rc = nbd_send_reply(client->ioc, reply); - if (rc >= 0) { - ret = write_sync(client->ioc, req->data, len, NULL); + ret = nbd_send_reply(client->ioc, reply); + if (ret == 0) { + ret = nbd_write(client->ioc, req->data, len, NULL); if (ret < 0) { - rc = -EIO; + ret = -EIO; } } qio_channel_set_cork(client->ioc, false); @@ -1058,28 +985,23 @@ static ssize_t nbd_co_send_reply(NBDRequestData *req, NBDReply *reply, client->send_coroutine = NULL; qemu_co_mutex_unlock(&client->send_lock); - return rc; + return ret; } -/* Collect a client request. Return 0 if request looks valid, -EAGAIN - * to keep trying the collection, -EIO to drop connection right away, - * and any other negative value to report an error to the client - * (although the caller may still need to disconnect after reporting - * the error). */ -static ssize_t nbd_co_receive_request(NBDRequestData *req, - NBDRequest *request) +/* nbd_co_receive_request + * Collect a client request. 
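Since close_fn now reports whether the client completed negotiation, an embedder can ignore connections that die during the handshake. A sketch of such a callback; the counter and the shutdown hook are illustrative, and qemu-nbd's real version appears near the end of this series:

static int example_nb_clients;   /* illustrative bookkeeping */

static void example_client_closed(NBDClient *client, bool negotiated)
{
    example_nb_clients--;
    /* only fully negotiated clients should trigger "serve one
     * connection, then exit" style policies */
    if (negotiated && example_nb_clients == 0) {
        /* request server shutdown here */
    }
    nbd_client_put(client);
}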
Return 0 if request looks valid, -EIO to drop + * connection right away, and any other negative value to report an error to + * the client (although the caller may still need to disconnect after reporting + * the error). + */ +static int nbd_co_receive_request(NBDRequestData *req, NBDRequest *request) { NBDClient *client = req->client; - ssize_t rc; g_assert(qemu_in_coroutine()); assert(client->recv_coroutine == qemu_coroutine_self()); - rc = nbd_receive_request(client->ioc, request); - if (rc < 0) { - if (rc != -EAGAIN) { - rc = -EIO; - } - goto out; + if (nbd_receive_request(client->ioc, request) < 0) { + return -EIO; } TRACE("Decoding type"); @@ -1093,8 +1015,7 @@ static ssize_t nbd_co_receive_request(NBDRequestData *req, /* Special case: we're going to disconnect without a reply, * whether or not flags, from, or len are bogus */ TRACE("Request type is DISCONNECT"); - rc = -EIO; - goto out; + return -EIO; } /* Check for sanity in the parameters, part 1. Defer as many @@ -1102,31 +1023,27 @@ static ssize_t nbd_co_receive_request(NBDRequestData *req, * payload, so we can try and keep the connection alive. */ if ((request->from + request->len) < request->from) { LOG("integer overflow detected, you're probably being attacked"); - rc = -EINVAL; - goto out; + return -EINVAL; } if (request->type == NBD_CMD_READ || request->type == NBD_CMD_WRITE) { if (request->len > NBD_MAX_BUFFER_SIZE) { LOG("len (%" PRIu32" ) is larger than max len (%u)", request->len, NBD_MAX_BUFFER_SIZE); - rc = -EINVAL; - goto out; + return -EINVAL; } req->data = blk_try_blockalign(client->exp->blk, request->len); if (req->data == NULL) { - rc = -ENOMEM; - goto out; + return -ENOMEM; } } if (request->type == NBD_CMD_WRITE) { TRACE("Reading %" PRIu32 " byte(s)", request->len); - if (read_sync(client->ioc, req->data, request->len, NULL) < 0) { + if (nbd_read(client->ioc, req->data, request->len, NULL) < 0) { LOG("reading from socket failed"); - rc = -EIO; - goto out; + return -EIO; } req->complete = true; } @@ -1136,28 +1053,19 @@ static ssize_t nbd_co_receive_request(NBDRequestData *req, LOG("operation past EOF; From: %" PRIu64 ", Len: %" PRIu32 ", Size: %" PRIu64, request->from, request->len, (uint64_t)client->exp->size); - rc = request->type == NBD_CMD_WRITE ? -ENOSPC : -EINVAL; - goto out; + return request->type == NBD_CMD_WRITE ? -ENOSPC : -EINVAL; } if (request->flags & ~(NBD_CMD_FLAG_FUA | NBD_CMD_FLAG_NO_HOLE)) { LOG("unsupported flags (got 0x%x)", request->flags); - rc = -EINVAL; - goto out; + return -EINVAL; } if (request->type != NBD_CMD_WRITE_ZEROES && (request->flags & NBD_CMD_FLAG_NO_HOLE)) { LOG("unexpected flags (got 0x%x)", request->flags); - rc = -EINVAL; - goto out; + return -EINVAL; } - rc = 0; - -out: - client->recv_coroutine = NULL; - nbd_client_receive_next_request(client); - - return rc; + return 0; } /* Owns a reference to the NBDClient passed as opaque. 
*/ @@ -1168,8 +1076,9 @@ static coroutine_fn void nbd_trip(void *opaque) NBDRequestData *req; NBDRequest request = { 0 }; /* GCC thinks it can be used uninitialized */ NBDReply reply; - ssize_t ret; + int ret; int flags; + int reply_data_len = 0; TRACE("Reading request."); if (client->closing) { @@ -1179,11 +1088,10 @@ static coroutine_fn void nbd_trip(void *opaque) req = nbd_request_get(client); ret = nbd_co_receive_request(req, &request); - if (ret == -EAGAIN) { - goto done; - } + client->recv_coroutine = NULL; + nbd_client_receive_next_request(client); if (ret == -EIO) { - goto out; + goto disconnect; } reply.handle = request.handle; @@ -1191,7 +1099,7 @@ static coroutine_fn void nbd_trip(void *opaque) if (ret < 0) { reply.error = -ret; - goto error_reply; + goto reply; } if (client->closing) { @@ -1212,7 +1120,7 @@ static coroutine_fn void nbd_trip(void *opaque) if (ret < 0) { LOG("flush failed"); reply.error = -ret; - goto error_reply; + break; } } @@ -1221,12 +1129,12 @@ static coroutine_fn void nbd_trip(void *opaque) if (ret < 0) { LOG("reading from file failed"); reply.error = -ret; - goto error_reply; + break; } + reply_data_len = request.len; TRACE("Read %" PRIu32" byte(s)", request.len); - if (nbd_co_send_reply(req, &reply, request.len) < 0) - goto out; + break; case NBD_CMD_WRITE: TRACE("Request type is WRITE"); @@ -1234,7 +1142,7 @@ static coroutine_fn void nbd_trip(void *opaque) if (exp->nbdflags & NBD_FLAG_READ_ONLY) { TRACE("Server is read-only, return error"); reply.error = EROFS; - goto error_reply; + break; } TRACE("Writing to device"); @@ -1248,21 +1156,16 @@ static coroutine_fn void nbd_trip(void *opaque) if (ret < 0) { LOG("writing to file failed"); reply.error = -ret; - goto error_reply; } - if (nbd_co_send_reply(req, &reply, 0) < 0) { - goto out; - } break; - case NBD_CMD_WRITE_ZEROES: TRACE("Request type is WRITE_ZEROES"); if (exp->nbdflags & NBD_FLAG_READ_ONLY) { TRACE("Server is read-only, return error"); reply.error = EROFS; - goto error_reply; + break; } TRACE("Writing to device"); @@ -1279,14 +1182,9 @@ static coroutine_fn void nbd_trip(void *opaque) if (ret < 0) { LOG("writing to file failed"); reply.error = -ret; - goto error_reply; } - if (nbd_co_send_reply(req, &reply, 0) < 0) { - goto out; - } break; - case NBD_CMD_DISC: /* unreachable, thanks to special case in nbd_co_receive_request() */ abort(); @@ -1299,9 +1197,7 @@ static coroutine_fn void nbd_trip(void *opaque) LOG("flush failed"); reply.error = -ret; } - if (nbd_co_send_reply(req, &reply, 0) < 0) { - goto out; - } + break; case NBD_CMD_TRIM: TRACE("Request type is TRIM"); @@ -1311,21 +1207,19 @@ static coroutine_fn void nbd_trip(void *opaque) LOG("discard failed"); reply.error = -ret; } - if (nbd_co_send_reply(req, &reply, 0) < 0) { - goto out; - } + break; default: LOG("invalid request type (%" PRIu32 ") received", request.type); reply.error = EINVAL; - error_reply: - /* We must disconnect after NBD_CMD_WRITE if we did not - * read the payload. - */ - if (nbd_co_send_reply(req, &reply, 0) < 0 || !req->complete) { - goto out; - } - break; + } + +reply: + /* We must disconnect after NBD_CMD_WRITE if we did not + * read the payload. 
+ */ + if (nbd_co_send_reply(req, &reply, reply_data_len) < 0 || !req->complete) { + goto disconnect; } TRACE("Request/Reply complete"); @@ -1335,9 +1229,9 @@ done: nbd_client_put(client); return; -out: +disconnect: nbd_request_put(req); - client_close(client); + client_close(client, true); nbd_client_put(client); } @@ -1352,8 +1246,7 @@ static void nbd_client_receive_next_request(NBDClient *client) static coroutine_fn void nbd_co_client_start(void *opaque) { - NBDClientNewData *data = opaque; - NBDClient *client = data->client; + NBDClient *client = opaque; NBDExport *exp = client->exp; if (exp) { @@ -1362,25 +1255,28 @@ static coroutine_fn void nbd_co_client_start(void *opaque) } qemu_co_mutex_init(&client->send_lock); - if (nbd_negotiate(data)) { - client_close(client); - goto out; + if (nbd_negotiate(client)) { + client_close(client, false); + return; } nbd_client_receive_next_request(client); - -out: - g_free(data); } +/* + * Create a new client listener on the given export @exp, using the + * given channel @sioc. Begin servicing it in a coroutine. When the + * connection closes, call @close_fn with an indication of whether the + * client completed negotiation. + */ void nbd_client_new(NBDExport *exp, QIOChannelSocket *sioc, QCryptoTLSCreds *tlscreds, const char *tlsaclname, - void (*close_fn)(NBDClient *)) + void (*close_fn)(NBDClient *, bool)) { NBDClient *client; - NBDClientNewData *data = g_new(NBDClientNewData, 1); + Coroutine *co; client = g_malloc0(sizeof(NBDClient)); client->refcount = 1; @@ -1394,9 +1290,8 @@ void nbd_client_new(NBDExport *exp, object_ref(OBJECT(client->sioc)); client->ioc = QIO_CHANNEL(sioc); object_ref(OBJECT(client->ioc)); - client->close = close_fn; + client->close_fn = close_fn; - data->client = client; - data->co = qemu_coroutine_create(nbd_co_client_start, data); - qemu_coroutine_enter(data->co); + co = qemu_coroutine_create(nbd_co_client_start, client); + qemu_coroutine_enter(co); } diff --git a/qemu-doc.texi b/qemu-doc.texi index 965ba5929e..21079fd675 100644 --- a/qemu-doc.texi +++ b/qemu-doc.texi @@ -1,11 +1,12 @@ \input texinfo @c -*- texinfo -*- @c %**start of header @setfilename qemu-doc.info +@include version.texi @documentlanguage en @documentencoding UTF-8 -@settitle QEMU Emulator User Documentation +@settitle QEMU version @value{VERSION} User Documentation @exampleindent 0 @paragraphindent 0 @c %**end of header @@ -19,7 +20,7 @@ @iftex @titlepage @sp 7 -@center @titlefont{QEMU Emulator} +@center @titlefont{QEMU version @value{VERSION}} @sp 1 @center @titlefont{User Documentation} @sp 3 diff --git a/qemu-nbd.c b/qemu-nbd.c index 651f85ecc1..4dd3fd4732 100644 --- a/qemu-nbd.c +++ b/qemu-nbd.c @@ -336,10 +336,10 @@ static void nbd_export_closed(NBDExport *exp) static void nbd_update_server_watch(void); -static void nbd_client_closed(NBDClient *client) +static void nbd_client_closed(NBDClient *client, bool negotiated) { nb_fds--; - if (nb_fds == 0 && !persistent && state == RUNNING) { + if (negotiated && nb_fds == 0 && !persistent && state == RUNNING) { state = TERMINATE; } nbd_update_server_watch(); @@ -581,6 +581,10 @@ int main(int argc, char **argv) sa_sigterm.sa_handler = termsig_handler; sigaction(SIGTERM, &sa_sigterm, NULL); +#ifdef CONFIG_POSIX + signal(SIGPIPE, SIG_IGN); +#endif + module_call_init(MODULE_INIT_TRACE); qcrypto_init(&error_fatal); @@ -377,7 +377,7 @@ define unnest-vars endef TEXI2MAN = $(call quiet-command, \ - perl -Ww -- $(SRC_PATH)/scripts/texi2pod.pl -I docs $< $@.pod && \ + perl -Ww -- $(SRC_PATH)/scripts/texi2pod.pl 
$(TEXI2PODFLAGS) $< $@.pod && \ $(POD2MAN) --section=$(subst .,,$(suffix $@)) --center=" " --release=" " $@.pod > $@, \ "GEN","$@") diff --git a/target/alpha/translate.c b/target/alpha/translate.c index 7c45ae360c..232af9e177 100644 --- a/target/alpha/translate.c +++ b/target/alpha/translate.c @@ -84,6 +84,7 @@ typedef enum { the PC (for whatever reason), so there's no need to do it again on exiting the TB. */ EXIT_PC_UPDATED, + EXIT_PC_UPDATED_NOCHAIN, /* We are exiting the TB, but have neither emitted a goto_tb, nor updated the PC for the next instruction to be executed. */ @@ -458,11 +459,17 @@ static bool in_superpage(DisasContext *ctx, int64_t addr) #endif } +static bool use_exit_tb(DisasContext *ctx) +{ + return ((ctx->tb->cflags & CF_LAST_IO) + || ctx->singlestep_enabled + || singlestep); +} + static bool use_goto_tb(DisasContext *ctx, uint64_t dest) { /* Suppress goto_tb in the case of single-steping and IO. */ - if ((ctx->tb->cflags & CF_LAST_IO) - || ctx->singlestep_enabled || singlestep) { + if (unlikely(use_exit_tb(ctx))) { return false; } #ifndef CONFIG_USER_ONLY @@ -1198,7 +1205,10 @@ static ExitStatus gen_call_pal(DisasContext *ctx, int palcode) tcg_gen_andi_i64(tmp, ctx->ir[IR_A0], PS_INT_MASK); tcg_gen_st8_i64(tmp, cpu_env, offsetof(CPUAlphaState, ps)); tcg_temp_free(tmp); - break; + + /* Allow interrupts to be recognized right away. */ + tcg_gen_movi_i64(cpu_pc, ctx->pc); + return EXIT_PC_UPDATED_NOCHAIN; case 0x36: /* RDPS */ @@ -1266,7 +1276,7 @@ static ExitStatus gen_call_pal(DisasContext *ctx, int palcode) need the page permissions check. We'll see the existence of the page when we create the TB, and we'll flush all TBs if we change the PAL base register. */ - if (!ctx->singlestep_enabled && !(ctx->tb->cflags & CF_LAST_IO)) { + if (!use_exit_tb(ctx)) { tcg_gen_goto_tb(0); tcg_gen_movi_i64(cpu_pc, entry); tcg_gen_exit_tb((uintptr_t)ctx->tb); @@ -2686,7 +2696,8 @@ static ExitStatus translate_one(DisasContext *ctx, uint32_t insn) tcg_gen_andi_i64(tmp, vb, 1); tcg_gen_st8_i64(tmp, cpu_env, offsetof(CPUAlphaState, pal_mode)); tcg_gen_andi_i64(cpu_pc, vb, ~3); - ret = EXIT_PC_UPDATED; + /* Allow interrupts to be recognized right away. */ + ret = EXIT_PC_UPDATED_NOCHAIN; break; #else goto invalid_opc; @@ -3010,6 +3021,12 @@ void gen_intermediate_code(CPUAlphaState *env, struct TranslationBlock *tb) tcg_gen_movi_i64(cpu_pc, ctx.pc); /* FALLTHRU */ case EXIT_PC_UPDATED: + if (!use_exit_tb(&ctx)) { + tcg_gen_lookup_and_goto_ptr(cpu_pc); + break; + } + /* FALLTHRU */ + case EXIT_PC_UPDATED_NOCHAIN: if (ctx.singlestep_enabled) { gen_excp_1(EXCP_DEBUG, 0); } else { diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c index 860e279658..e55547d95d 100644 --- a/target/arm/translate-a64.c +++ b/target/arm/translate-a64.c @@ -1422,7 +1422,9 @@ static void handle_msr_i(DisasContext *s, uint32_t insn, gen_helper_msr_i_pstate(cpu_env, tcg_op, tcg_imm); tcg_temp_free_i32(tcg_imm); tcg_temp_free_i32(tcg_op); - s->is_jmp = DISAS_UPDATE; + /* For DAIFClear, exit the cpu loop to re-evaluate pending IRQs. */ + gen_a64_set_pc_im(s->pc); + s->is_jmp = (op == 0x1f ? 
DISAS_EXIT : DISAS_JUMP); break; } default: @@ -11369,6 +11371,9 @@ void gen_intermediate_code_a64(ARMCPU *cpu, TranslationBlock *tb) case DISAS_JUMP: tcg_gen_lookup_and_goto_ptr(cpu_pc); break; + case DISAS_EXIT: + tcg_gen_exit_tb(0); + break; case DISAS_TB_JUMP: case DISAS_EXC: case DISAS_SWI: diff --git a/target/i386/hax-all.c b/target/i386/hax-all.c index 097db5cae1..ba6117d7de 100644 --- a/target/i386/hax-all.c +++ b/target/i386/hax-all.c @@ -514,9 +514,10 @@ static int hax_vcpu_hax_exec(CPUArchState *env) hax_vcpu_interrupt(env); qemu_mutex_unlock_iothread(); + cpu_exec_start(cpu); hax_ret = hax_vcpu_run(vcpu); + cpu_exec_end(cpu); qemu_mutex_lock_iothread(); - current_cpu = cpu; /* Simply continue the vcpu_run if system call interrupted */ if (hax_ret == -EINTR || hax_ret == -EAGAIN) { diff --git a/target/m68k/Makefile.objs b/target/m68k/Makefile.objs index 02cf616a78..39141ab93d 100644 --- a/target/m68k/Makefile.objs +++ b/target/m68k/Makefile.objs @@ -1,3 +1,3 @@ obj-y += m68k-semi.o -obj-y += translate.o op_helper.o helper.o cpu.o +obj-y += translate.o op_helper.o helper.o cpu.o fpu_helper.o obj-y += gdbstub.o diff --git a/target/m68k/fpu_helper.c b/target/m68k/fpu_helper.c new file mode 100644 index 0000000000..5bf2576c2b --- /dev/null +++ b/target/m68k/fpu_helper.c @@ -0,0 +1,112 @@ +/* + * m68k FPU helpers + * + * Copyright (c) 2006-2007 CodeSourcery + * Written by Paul Brook + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include "exec/helper-proto.h" + +uint32_t HELPER(f64_to_i32)(CPUM68KState *env, float64 val) +{ + return float64_to_int32(val, &env->fp_status); +} + +float32 HELPER(f64_to_f32)(CPUM68KState *env, float64 val) +{ + return float64_to_float32(val, &env->fp_status); +} + +float64 HELPER(i32_to_f64)(CPUM68KState *env, uint32_t val) +{ + return int32_to_float64(val, &env->fp_status); +} + +float64 HELPER(f32_to_f64)(CPUM68KState *env, float32 val) +{ + return float32_to_float64(val, &env->fp_status); +} + +float64 HELPER(iround_f64)(CPUM68KState *env, float64 val) +{ + return float64_round_to_int(val, &env->fp_status); +} + +float64 HELPER(itrunc_f64)(CPUM68KState *env, float64 val) +{ + return float64_trunc_to_int(val, &env->fp_status); +} + +float64 HELPER(sqrt_f64)(CPUM68KState *env, float64 val) +{ + return float64_sqrt(val, &env->fp_status); +} + +float64 HELPER(abs_f64)(float64 val) +{ + return float64_abs(val); +} + +float64 HELPER(chs_f64)(float64 val) +{ + return float64_chs(val); +} + +float64 HELPER(add_f64)(CPUM68KState *env, float64 a, float64 b) +{ + return float64_add(a, b, &env->fp_status); +} + +float64 HELPER(sub_f64)(CPUM68KState *env, float64 a, float64 b) +{ + return float64_sub(a, b, &env->fp_status); +} + +float64 HELPER(mul_f64)(CPUM68KState *env, float64 a, float64 b) +{ + return float64_mul(a, b, &env->fp_status); +} + +float64 HELPER(div_f64)(CPUM68KState *env, float64 a, float64 b) +{ + return float64_div(a, b, &env->fp_status); +} + +float64 HELPER(sub_cmp_f64)(CPUM68KState *env, float64 a, float64 b) +{ + /* ??? This may incorrectly raise exceptions. */ + /* ??? Should flush denormals to zero. */ + float64 res; + res = float64_sub(a, b, &env->fp_status); + if (float64_is_quiet_nan(res, &env->fp_status)) { + /* +/-inf compares equal against itself, but sub returns nan. */ + if (!float64_is_quiet_nan(a, &env->fp_status) + && !float64_is_quiet_nan(b, &env->fp_status)) { + res = float64_zero; + if (float64_lt_quiet(a, res, &env->fp_status)) { + res = float64_chs(res); + } + } + } + return res; +} + +uint32_t HELPER(compare_f64)(CPUM68KState *env, float64 val) +{ + return float64_compare_quiet(val, float64_zero, &env->fp_status); +} diff --git a/target/m68k/helper.c b/target/m68k/helper.c index f750d3dbaa..5ca9911657 100644 --- a/target/m68k/helper.c +++ b/target/m68k/helper.c @@ -284,94 +284,6 @@ void HELPER(set_sr)(CPUM68KState *env, uint32_t val) m68k_switch_sp(env); } -/* FPU helpers. 
*/ -uint32_t HELPER(f64_to_i32)(CPUM68KState *env, float64 val) -{ - return float64_to_int32(val, &env->fp_status); -} - -float32 HELPER(f64_to_f32)(CPUM68KState *env, float64 val) -{ - return float64_to_float32(val, &env->fp_status); -} - -float64 HELPER(i32_to_f64)(CPUM68KState *env, uint32_t val) -{ - return int32_to_float64(val, &env->fp_status); -} - -float64 HELPER(f32_to_f64)(CPUM68KState *env, float32 val) -{ - return float32_to_float64(val, &env->fp_status); -} - -float64 HELPER(iround_f64)(CPUM68KState *env, float64 val) -{ - return float64_round_to_int(val, &env->fp_status); -} - -float64 HELPER(itrunc_f64)(CPUM68KState *env, float64 val) -{ - return float64_trunc_to_int(val, &env->fp_status); -} - -float64 HELPER(sqrt_f64)(CPUM68KState *env, float64 val) -{ - return float64_sqrt(val, &env->fp_status); -} - -float64 HELPER(abs_f64)(float64 val) -{ - return float64_abs(val); -} - -float64 HELPER(chs_f64)(float64 val) -{ - return float64_chs(val); -} - -float64 HELPER(add_f64)(CPUM68KState *env, float64 a, float64 b) -{ - return float64_add(a, b, &env->fp_status); -} - -float64 HELPER(sub_f64)(CPUM68KState *env, float64 a, float64 b) -{ - return float64_sub(a, b, &env->fp_status); -} - -float64 HELPER(mul_f64)(CPUM68KState *env, float64 a, float64 b) -{ - return float64_mul(a, b, &env->fp_status); -} - -float64 HELPER(div_f64)(CPUM68KState *env, float64 a, float64 b) -{ - return float64_div(a, b, &env->fp_status); -} - -float64 HELPER(sub_cmp_f64)(CPUM68KState *env, float64 a, float64 b) -{ - /* ??? This may incorrectly raise exceptions. */ - /* ??? Should flush denormals to zero. */ - float64 res; - res = float64_sub(a, b, &env->fp_status); - if (float64_is_quiet_nan(res, &env->fp_status)) { - /* +/-inf compares equal against itself, but sub returns nan. */ - if (!float64_is_quiet_nan(a, &env->fp_status) - && !float64_is_quiet_nan(b, &env->fp_status)) { - res = float64_zero; - if (float64_lt_quiet(a, res, &env->fp_status)) - res = float64_chs(res); - } - } - return res; -} - -uint32_t HELPER(compare_f64)(CPUM68KState *env, float64 val) -{ - return float64_compare_quiet(val, float64_zero, &env->fp_status); -} /* MAC unit. */ /* FIXME: The MAC unit implementation is a bit of a mess. Some helpers diff --git a/target/m68k/translate.c b/target/m68k/translate.c index ad4d4efb8d..dfecfb6e5f 100644 --- a/target/m68k/translate.c +++ b/target/m68k/translate.c @@ -565,7 +565,7 @@ static void gen_flush_flags(DisasContext *s) t1 = tcg_temp_new(); tcg_gen_add_i32(t0, QREG_CC_N, QREG_CC_V); gen_ext(t0, t0, s->cc_op - CC_OP_SUBB, 1); - tcg_gen_xor_i32(t1, QREG_CC_N, QREG_CC_V); + tcg_gen_xor_i32(t1, QREG_CC_N, t0); tcg_gen_xor_i32(QREG_CC_V, QREG_CC_V, t0); tcg_temp_free(t0); tcg_gen_and_i32(QREG_CC_V, QREG_CC_V, t1); @@ -669,6 +669,21 @@ static inline int insn_opsize(int insn) } } +static inline int ext_opsize(int ext, int pos) +{ + switch ((ext >> pos) & 7) { + case 0: return OS_LONG; + case 1: return OS_SINGLE; + case 2: return OS_EXTENDED; + case 3: return OS_PACKED; + case 4: return OS_WORD; + case 5: return OS_DOUBLE; + case 6: return OS_BYTE; + default: + g_assert_not_reached(); + } +} + /* Assign value to a register. If the width is less than the register width only the low part of the register is set. */ static void gen_partset_reg(int opsize, TCGv reg, TCGv val) @@ -4111,20 +4126,19 @@ DISAS_INSN(fpu) tmp32 = tcg_temp_new_i32(); /* fmove */ /* ??? TODO: Proper behavior on overflow. 
*/ - switch ((ext >> 10) & 7) { - case 0: - opsize = OS_LONG; + + opsize = ext_opsize(ext, 10); + switch (opsize) { + case OS_LONG: gen_helper_f64_to_i32(tmp32, cpu_env, src); break; - case 1: - opsize = OS_SINGLE; + case OS_SINGLE: gen_helper_f64_to_f32(tmp32, cpu_env, src); break; - case 4: - opsize = OS_WORD; + case OS_WORD: gen_helper_f64_to_i32(tmp32, cpu_env, src); break; - case 5: /* OS_DOUBLE */ + case OS_DOUBLE: tcg_gen_mov_i32(tmp32, AREG(insn, 0)); switch ((insn >> 3) & 7) { case 2: @@ -4153,8 +4167,7 @@ DISAS_INSN(fpu) } tcg_temp_free_i32(tmp32); return; - case 6: - opsize = OS_BYTE; + case OS_BYTE: gen_helper_f64_to_i32(tmp32, cpu_env, src); break; default: @@ -4227,15 +4240,7 @@ DISAS_INSN(fpu) } if (ext & (1 << 14)) { /* Source effective address. */ - switch ((ext >> 10) & 7) { - case 0: opsize = OS_LONG; break; - case 1: opsize = OS_SINGLE; break; - case 4: opsize = OS_WORD; break; - case 5: opsize = OS_DOUBLE; break; - case 6: opsize = OS_BYTE; break; - default: - goto undef; - } + opsize = ext_opsize(ext, 10); if (opsize == OS_DOUBLE) { tmp32 = tcg_temp_new_i32(); tcg_gen_mov_i32(tmp32, AREG(insn, 0)); diff --git a/target/s390x/translate.c b/target/s390x/translate.c index 8c055b7bb7..640354271c 100644 --- a/target/s390x/translate.c +++ b/target/s390x/translate.c @@ -1173,6 +1173,8 @@ typedef enum { /* We are exiting the TB, but have neither emitted a goto_tb, nor updated the PC for the next instruction to be executed. */ EXIT_PC_STALE, + /* We are exiting the TB to the main loop. */ + EXIT_PC_STALE_NOCHAIN, /* We are ending the TB with a noreturn function call, e.g. longjmp. No following code will be executed. */ EXIT_NORETURN, @@ -3795,7 +3797,8 @@ static ExitStatus op_ssm(DisasContext *s, DisasOps *o) { check_privileged(s); tcg_gen_deposit_i64(psw_mask, psw_mask, o->in2, 56, 8); - return NO_EXIT; + /* Exit to main loop to reevaluate s390_cpu_exec_interrupt. */ + return EXIT_PC_STALE_NOCHAIN; } static ExitStatus op_stap(DisasContext *s, DisasOps *o) @@ -4038,7 +4041,9 @@ static ExitStatus op_stnosm(DisasContext *s, DisasOps *o) } else { tcg_gen_ori_i64(psw_mask, psw_mask, i2 << 56); } - return NO_EXIT; + + /* Exit to main loop to reevaluate s390_cpu_exec_interrupt. */ + return EXIT_PC_STALE_NOCHAIN; } static ExitStatus op_stura(DisasContext *s, DisasOps *o) @@ -5788,6 +5793,7 @@ void gen_intermediate_code(CPUS390XState *env, struct TranslationBlock *tb) case EXIT_NORETURN: break; case EXIT_PC_STALE: + case EXIT_PC_STALE_NOCHAIN: update_psw_addr(&dc); /* FALLTHRU */ case EXIT_PC_UPDATED: @@ -5799,14 +5805,14 @@ void gen_intermediate_code(CPUS390XState *env, struct TranslationBlock *tb) /* Exit the TB, either by raising a debug exception or by return. */ if (do_debug) { gen_exception(EXCP_DEBUG); - } else if (use_exit_tb(&dc)) { + } else if (use_exit_tb(&dc) || status == EXIT_PC_STALE_NOCHAIN) { tcg_gen_exit_tb(0); } else { tcg_gen_lookup_and_goto_ptr(psw_addr); } break; default: - abort(); + g_assert_not_reached(); } gen_tb_end(tb, num_insns); diff --git a/tcg/aarch64/tcg-target.inc.c b/tcg/aarch64/tcg-target.inc.c index 5f185458f1..1fa3bccc89 100644 --- a/tcg/aarch64/tcg-target.inc.c +++ b/tcg/aarch64/tcg-target.inc.c @@ -616,7 +616,12 @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd, /* Look for host pointer values within 4G of the PC. This happens often when loading pointers to QEMU's own data structures. 
*/ if (type == TCG_TYPE_I64) { - tcg_target_long disp = (value >> 12) - ((intptr_t)s->code_ptr >> 12); + tcg_target_long disp = value - (intptr_t)s->code_ptr; + if (disp == sextract64(disp, 0, 21)) { + tcg_out_insn(s, 3406, ADR, rd, disp); + return; + } + disp = (value >> 12) - ((intptr_t)s->code_ptr >> 12); if (disp == sextract64(disp, 0, 21)) { tcg_out_insn(s, 3406, ADRP, rd, disp); if (value & 0xfff) { diff --git a/tcg/arm/tcg-target.inc.c b/tcg/arm/tcg-target.inc.c index 9f5cb66718..d1793ec77d 100644 --- a/tcg/arm/tcg-target.inc.c +++ b/tcg/arm/tcg-target.inc.c @@ -418,23 +418,37 @@ static inline void tcg_out_dat_imm(TCGContext *s, static void tcg_out_movi32(TCGContext *s, int cond, int rd, uint32_t arg) { - int rot, opc, rn; - - /* For armv7, make sure not to use movw+movt when mov/mvn would do. - Speed things up by only checking when movt would be required. - Prior to armv7, have one go at fully rotated immediates before - doing the decomposition thing below. */ - if (!use_armv7_instructions || (arg & 0xffff0000)) { - rot = encode_imm(arg); + int rot, opc, rn, diff; + + /* Check a single MOV/MVN before anything else. */ + rot = encode_imm(arg); + if (rot >= 0) { + tcg_out_dat_imm(s, cond, ARITH_MOV, rd, 0, + rotl(arg, rot) | (rot << 7)); + return; + } + rot = encode_imm(~arg); + if (rot >= 0) { + tcg_out_dat_imm(s, cond, ARITH_MVN, rd, 0, + rotl(~arg, rot) | (rot << 7)); + return; + } + + /* Check for a pc-relative address. This will usually be the TB, + or within the TB, which is immediately before the code block. */ + diff = arg - ((intptr_t)s->code_ptr + 8); + if (diff >= 0) { + rot = encode_imm(diff); if (rot >= 0) { - tcg_out_dat_imm(s, cond, ARITH_MOV, rd, 0, - rotl(arg, rot) | (rot << 7)); + tcg_out_dat_imm(s, cond, ARITH_ADD, rd, TCG_REG_PC, + rotl(diff, rot) | (rot << 7)); return; } - rot = encode_imm(~arg); + } else { + rot = encode_imm(-diff); if (rot >= 0) { - tcg_out_dat_imm(s, cond, ARITH_MVN, rd, 0, - rotl(~arg, rot) | (rot << 7)); + tcg_out_dat_imm(s, cond, ARITH_SUB, rd, TCG_REG_PC, + rotl(-diff, rot) | (rot << 7)); return; } } @@ -1026,16 +1040,6 @@ static void tcg_out_call(TCGContext *s, tcg_insn_unit *addr) } } -void arm_tb_set_jmp_target(uintptr_t jmp_addr, uintptr_t addr) -{ - tcg_insn_unit *code_ptr = (tcg_insn_unit *)jmp_addr; - tcg_insn_unit *target = (tcg_insn_unit *)addr; - - /* we could use a ldr pc, [pc, #-4] kind of branch and avoid the flush */ - reloc_pc24_atomic(code_ptr, target); - flush_icache_range(jmp_addr, jmp_addr + 4); -} - static inline void tcg_out_goto_label(TCGContext *s, int cond, TCGLabel *l) { if (l->has_value) { @@ -1665,17 +1669,27 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, } break; case INDEX_op_goto_tb: - if (s->tb_jmp_insn_offset) { - /* Direct jump method */ - s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s); - tcg_out_b_noaddr(s, COND_AL); - } else { + { /* Indirect jump method */ - intptr_t ptr = (intptr_t)(s->tb_jmp_target_addr + args[0]); - tcg_out_movi32(s, COND_AL, TCG_REG_R0, ptr & ~0xfff); - tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_R0, ptr & 0xfff); + intptr_t ptr, dif, dil; + TCGReg base = TCG_REG_PC; + + tcg_debug_assert(s->tb_jmp_insn_offset == 0); + ptr = (intptr_t)(s->tb_jmp_target_addr + args[0]); + dif = ptr - ((intptr_t)s->code_ptr + 8); + dil = sextract32(dif, 0, 12); + if (dif != dil) { + /* The TB is close, but outside the 12 bits addressable by + the load. We can extend this to 20 bits with a sub of a + shifted immediate from pc. 
In the vastly unlikely event + the code requires more than 1MB, we'll use 2 insns and + be no worse off. */ + base = TCG_REG_R0; + tcg_out_movi32(s, COND_AL, base, ptr - dil); + } + tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, base, dil); + s->tb_jmp_reset_offset[args[0]] = tcg_current_code_size(s); } - s->tb_jmp_reset_offset[args[0]] = tcg_current_code_size(s); break; case INDEX_op_goto_ptr: tcg_out_bx(s, COND_AL, args[0]); diff --git a/tcg/ppc/tcg-target.inc.c b/tcg/ppc/tcg-target.inc.c index 8d50f18328..1f690df20d 100644 --- a/tcg/ppc/tcg-target.inc.c +++ b/tcg/ppc/tcg-target.inc.c @@ -2820,14 +2820,11 @@ void tcg_register_jit(void *buf, size_t buf_size) } #endif /* __ELF__ */ -static size_t dcache_bsize = 16; -static size_t icache_bsize = 16; - void flush_icache_range(uintptr_t start, uintptr_t stop) { uintptr_t p, start1, stop1; - size_t dsize = dcache_bsize; - size_t isize = icache_bsize; + size_t dsize = qemu_dcache_linesize; + size_t isize = qemu_icache_linesize; start1 = start & ~(dsize - 1); stop1 = (stop + dsize - 1) & ~(dsize - 1); @@ -2844,67 +2841,3 @@ void flush_icache_range(uintptr_t start, uintptr_t stop) asm volatile ("sync" : : : "memory"); asm volatile ("isync" : : : "memory"); } - -#if defined _AIX -#include <sys/systemcfg.h> - -static void __attribute__((constructor)) tcg_cache_init(void) -{ - icache_bsize = _system_configuration.icache_line; - dcache_bsize = _system_configuration.dcache_line; -} - -#elif defined __linux__ -static void __attribute__((constructor)) tcg_cache_init(void) -{ - unsigned long dsize = qemu_getauxval(AT_DCACHEBSIZE); - unsigned long isize = qemu_getauxval(AT_ICACHEBSIZE); - - if (dsize == 0 || isize == 0) { - if (dsize == 0) { - fprintf(stderr, "getauxval AT_DCACHEBSIZE failed\n"); - } - if (isize == 0) { - fprintf(stderr, "getauxval AT_ICACHEBSIZE failed\n"); - } - exit(1); - } - dcache_bsize = dsize; - icache_bsize = isize; -} - -#elif defined __APPLE__ -#include <sys/sysctl.h> - -static void __attribute__((constructor)) tcg_cache_init(void) -{ - size_t len; - unsigned cacheline; - int name[2] = { CTL_HW, HW_CACHELINE }; - - len = sizeof(cacheline); - if (sysctl(name, 2, &cacheline, &len, NULL, 0)) { - perror("sysctl CTL_HW HW_CACHELINE failed"); - exit(1); - } - dcache_bsize = cacheline; - icache_bsize = cacheline; -} - -#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__) -#include <sys/sysctl.h> - -static void __attribute__((constructor)) tcg_cache_init(void) -{ - size_t len = 4; - unsigned cacheline; - - if (sysctlbyname ("machdep.cacheline_size", &cacheline, &len, NULL, 0)) { - fprintf(stderr, "sysctlbyname machdep.cacheline_size failed: %s\n", - strerror(errno)); - exit(1); - } - dcache_bsize = cacheline; - icache_bsize = cacheline; -} -#endif diff --git a/tcg-runtime.c b/tcg/tcg-runtime.c index 7fa90ce508..ec3a34e461 100644 --- a/tcg-runtime.c +++ b/tcg/tcg-runtime.c @@ -149,23 +149,23 @@ void *HELPER(lookup_tb_ptr)(CPUArchState *env, target_ulong addr) CPUState *cpu = ENV_GET_CPU(env); TranslationBlock *tb; target_ulong cs_base, pc; - uint32_t flags; - - tb = atomic_rcu_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(addr)]); - if (likely(tb)) { - cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags); - if (likely(tb->pc == addr && tb->cs_base == cs_base && - tb->flags == flags)) { - goto found; - } + uint32_t flags, addr_hash; + + addr_hash = tb_jmp_cache_hash_func(addr); + tb = atomic_rcu_read(&cpu->tb_jmp_cache[addr_hash]); + cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags); + + if (unlikely(!(tb + && tb->pc == addr + && tb->cs_base == 
cs_base + && tb->flags == flags))) { tb = tb_htable_lookup(cpu, addr, cs_base, flags); - if (likely(tb)) { - atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(addr)], tb); - goto found; + if (!tb) { + return tcg_ctx.code_gen_epilogue; } + atomic_set(&cpu->tb_jmp_cache[addr_hash], tb); } - return tcg_ctx.code_gen_epilogue; - found: + qemu_log_mask_and_addr(CPU_LOG_EXEC, addr, "Chain %p [%d: " TARGET_FMT_lx "] %s\n", tb->tc_ptr, cpu->cpu_index, addr, @@ -383,6 +383,26 @@ void tcg_context_init(TCGContext *s) } } +/* + * Allocate TBs right before their corresponding translated code, making + * sure that TBs and code are on different cache lines. + */ +TranslationBlock *tcg_tb_alloc(TCGContext *s) +{ + uintptr_t align = qemu_icache_linesize; + TranslationBlock *tb; + void *next; + + tb = (void *)ROUND_UP((uintptr_t)s->code_gen_ptr, align); + next = (void *)ROUND_UP((uintptr_t)(tb + 1), align); + + if (unlikely(next > s->code_gen_highwater)) { + return NULL; + } + s->code_gen_ptr = next; + return tb; +} + void tcg_prologue_init(TCGContext *s) { size_t prologue_size, total_size; @@ -697,7 +697,6 @@ struct TCGContext { here, because there's too much arithmetic throughout that relies on addition and subtraction working on bytes. Rely on the GCC extension that allows arithmetic on void*. */ - int code_gen_max_blocks; void *code_gen_prologue; void *code_gen_epilogue; void *code_gen_buffer; @@ -756,6 +755,7 @@ static inline bool tcg_op_buf_full(void) /* tb_lock must be held for tcg_malloc_internal. */ void *tcg_malloc_internal(TCGContext *s, int size); void tcg_pool_reset(TCGContext *s); +TranslationBlock *tcg_tb_alloc(TCGContext *s); void tb_lock(void); void tb_unlock(void); diff --git a/tests/Makefile.include b/tests/Makefile.include index fec5af765a..ae889cae02 100644 --- a/tests/Makefile.include +++ b/tests/Makefile.include @@ -203,6 +203,8 @@ check-qtest-pci-y += tests/intel-hda-test$(EXESUF) gcov-files-pci-y += hw/audio/intel-hda.c hw/audio/hda-codec.c check-qtest-pci-$(CONFIG_EVENTFD) += tests/ivshmem-test$(EXESUF) gcov-files-pci-y += hw/misc/ivshmem.c +check-qtest-pci-y += tests/megasas-test$(EXESUF) +gcov-files-pci-y += hw/scsi/megasas.c check-qtest-i386-y = tests/endianness-test$(EXESUF) check-qtest-i386-y += tests/fdc-test$(EXESUF) @@ -752,6 +754,7 @@ tests/test-filter-mirror$(EXESUF): tests/test-filter-mirror.o $(qtest-obj-y) tests/test-filter-redirector$(EXESUF): tests/test-filter-redirector.o $(qtest-obj-y) tests/test-x86-cpuid-compat$(EXESUF): tests/test-x86-cpuid-compat.o $(qtest-obj-y) tests/ivshmem-test$(EXESUF): tests/ivshmem-test.o contrib/ivshmem-server/ivshmem-server.o $(libqos-pc-obj-y) $(libqos-spapr-obj-y) +tests/megasas-test$(EXESUF): tests/megasas-test.o $(libqos-spapr-obj-y) $(libqos-pc-obj-y) tests/vhost-user-bridge$(EXESUF): tests/vhost-user-bridge.o contrib/libvhost-user/libvhost-user.o $(test-util-obj-y) tests/test-uuid$(EXESUF): tests/test-uuid.o $(test-util-obj-y) tests/test-arm-mptimer$(EXESUF): tests/test-arm-mptimer.o diff --git a/tests/docker/Makefile.include b/tests/docker/Makefile.include index 03eda37bf4..0ed8c3d323 100644 --- a/tests/docker/Makefile.include +++ b/tests/docker/Makefile.include @@ -126,7 +126,7 @@ docker-run: docker-qemu-src " COPYING $(EXECUTABLE) to $(IMAGE)")) $(call quiet-command, \ $(SRC_PATH)/tests/docker/docker.py run \ - -t \ + $(if $(NOUSER),,-u $(shell id -u)) -t \ $(if $V,,--rm) \ $(if $(DEBUG),-i,--net=none) \ -e TARGET_LIST=$(TARGET_LIST) \ diff --git a/tests/docker/dockerfiles/centos6.docker 
b/tests/docker/dockerfiles/centos6.docker index 34e0d3b91e..17a4d24d54 100644 --- a/tests/docker/dockerfiles/centos6.docker +++ b/tests/docker/dockerfiles/centos6.docker @@ -1,7 +1,7 @@ FROM centos:6 RUN yum install -y epel-release ENV PACKAGES libfdt-devel ccache \ - tar git make gcc g++ \ + tar git make gcc g++ flex bison \ zlib-devel glib2-devel SDL-devel pixman-devel \ epel-release RUN yum install -y $PACKAGES diff --git a/tests/docker/dockerfiles/fedora.docker b/tests/docker/dockerfiles/fedora.docker index c4f80ad3d8..4eaa8ed2a5 100644 --- a/tests/docker/dockerfiles/fedora.docker +++ b/tests/docker/dockerfiles/fedora.docker @@ -1,8 +1,8 @@ FROM fedora:latest ENV PACKAGES \ - ccache git tar PyYAML sparse flex bison python2 \ + ccache git tar PyYAML sparse flex bison python2 bzip2 hostname \ glib2-devel pixman-devel zlib-devel SDL-devel libfdt-devel \ - gcc gcc-c++ clang make perl which bc findutils \ + gcc gcc-c++ clang make perl which bc findutils libaio-devel \ mingw32-pixman mingw32-glib2 mingw32-gmp mingw32-SDL mingw32-pkg-config \ mingw32-gtk2 mingw32-gtk3 mingw32-gnutls mingw32-nettle mingw32-libtasn1 \ mingw32-libjpeg-turbo mingw32-libpng mingw32-curl mingw32-libssh2 \ diff --git a/tests/megasas-test.c b/tests/megasas-test.c new file mode 100644 index 0000000000..ce960e7f81 --- /dev/null +++ b/tests/megasas-test.c @@ -0,0 +1,86 @@ +/* + * QTest testcase for LSI MegaRAID + * + * Copyright (c) 2017 Red Hat Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "libqtest.h" +#include "qemu/bswap.h" +#include "libqos/libqos-pc.h" +#include "libqos/libqos-spapr.h" + +static QOSState *qmegasas_start(const char *extra_opts) +{ + const char *arch = qtest_get_arch(); + const char *cmd = "-drive id=hd0,if=none,file=null-co://,format=raw " + "-device megasas,id=scsi0,addr=04.0 " + "-device scsi-hd,bus=scsi0.0,drive=hd0 %s"; + + if (strcmp(arch, "i386") == 0 || strcmp(arch, "x86_64") == 0) { + return qtest_pc_boot(cmd, extra_opts ? : ""); + } + + g_printerr("megasas tests are only available on x86\n"); + exit(EXIT_FAILURE); +} + +static void qmegasas_stop(QOSState *qs) +{ + qtest_shutdown(qs); +} + +/* Tests only initialization so far. TODO: Replace with functional tests */ +static void pci_nop(void) +{ + QOSState *qs; + + qs = qmegasas_start(NULL); + qmegasas_stop(qs); +} + +/* This used to cause a NULL pointer dereference.
*/ +static void megasas_pd_get_info_fuzz(void) +{ + QPCIDevice *dev; + QOSState *qs; + QPCIBar bar; + uint32_t context[256]; + uint64_t context_pa; + int i; + + qs = qmegasas_start(NULL); + dev = qpci_device_find(qs->pcibus, QPCI_DEVFN(4,0)); + g_assert(dev != NULL); + + qpci_device_enable(dev); + bar = qpci_iomap(dev, 0, NULL); + + memset(context, 0, sizeof(context)); + context[0] = cpu_to_le32(0x05050505); + context[1] = cpu_to_le32(0x01010101); + for (i = 2; i < ARRAY_SIZE(context); i++) { + context[i] = cpu_to_le32(0x41414141); + } + context[6] = cpu_to_le32(0x02020000); + context[7] = cpu_to_le32(0); + + context_pa = qmalloc(qs, sizeof(context)); + memwrite(context_pa, context, sizeof(context)); + qpci_io_writel(dev, bar, 0x40, context_pa); + + g_free(dev); + qmegasas_stop(qs); +} + +int main(int argc, char **argv) +{ + g_test_init(&argc, &argv, NULL); + qtest_add_func("/megasas/pci/nop", pci_nop); + qtest_add_func("/megasas/dcmd/pd-get-info/fuzz", megasas_pd_get_info_fuzz); + + return g_test_run(); +} diff --git a/tests/q35-test.c b/tests/q35-test.c index cc58f3ecf4..f98bed7a2d 100644 --- a/tests/q35-test.c +++ b/tests/q35-test.c @@ -15,6 +15,48 @@ #include "libqos/pci-pc.h" #include "hw/pci-host/q35.h" +#define TSEG_SIZE_TEST_GUEST_RAM_MBYTES 128 + +/* @esmramc_tseg_sz: ESMRAMC.TSEG_SZ bitmask for selecting the requested TSEG + * size. Must be a subset of + * MCH_HOST_BRIDGE_ESMRAMC_TSEG_SZ_MASK. + * + * @extended_tseg_mbytes: Size of the extended TSEG. Only consulted if + * @esmramc_tseg_sz equals + * MCH_HOST_BRIDGE_ESMRAMC_TSEG_SZ_MASK precisely. + * + * @expected_tseg_mbytes: Expected guest-visible TSEG size in megabytes, + * matching @esmramc_tseg_sz and @extended_tseg_mbytes + * above. + */ +struct TsegSizeArgs { + uint8_t esmramc_tseg_sz; + uint16_t extended_tseg_mbytes; + uint16_t expected_tseg_mbytes; +}; +typedef struct TsegSizeArgs TsegSizeArgs; + +static const TsegSizeArgs tseg_1mb = { + .esmramc_tseg_sz = MCH_HOST_BRIDGE_ESMRAMC_TSEG_SZ_1MB, + .extended_tseg_mbytes = 0, + .expected_tseg_mbytes = 1, +}; +static const TsegSizeArgs tseg_2mb = { + .esmramc_tseg_sz = MCH_HOST_BRIDGE_ESMRAMC_TSEG_SZ_2MB, + .extended_tseg_mbytes = 0, + .expected_tseg_mbytes = 2, +}; +static const TsegSizeArgs tseg_8mb = { + .esmramc_tseg_sz = MCH_HOST_BRIDGE_ESMRAMC_TSEG_SZ_8MB, + .extended_tseg_mbytes = 0, + .expected_tseg_mbytes = 8, +}; +static const TsegSizeArgs tseg_ext_16mb = { + .esmramc_tseg_sz = MCH_HOST_BRIDGE_ESMRAMC_TSEG_SZ_MASK, + .extended_tseg_mbytes = 16, + .expected_tseg_mbytes = 16, +}; + static void smram_set_bit(QPCIDevice *pcidev, uint8_t mask, bool enabled) { uint8_t smram; @@ -42,6 +84,8 @@ static void test_smram_lock(void) QPCIDevice *pcidev; QDict *response; + qtest_start("-M q35"); + pcibus = qpci_init_pc(NULL); g_assert(pcibus != NULL); @@ -74,19 +118,86 @@ static void test_smram_lock(void) g_free(pcidev); qpci_free_pc(pcibus); + + qtest_end(); } -int main(int argc, char **argv) +static void test_tseg_size(const void *data) { - int ret; + const TsegSizeArgs *args = data; + char *cmdline; + QPCIBus *pcibus; + QPCIDevice *pcidev; + uint8_t smram_val; + uint8_t esmramc_val; + uint32_t ram_offs; + + if (args->esmramc_tseg_sz == MCH_HOST_BRIDGE_ESMRAMC_TSEG_SZ_MASK) { + cmdline = g_strdup_printf("-M q35 -m %uM " + "-global mch.extended-tseg-mbytes=%u", + TSEG_SIZE_TEST_GUEST_RAM_MBYTES, + args->extended_tseg_mbytes); + } else { + cmdline = g_strdup_printf("-M q35 -m %uM", + TSEG_SIZE_TEST_GUEST_RAM_MBYTES); + } + qtest_start(cmdline); + g_free(cmdline); - 
g_test_init(&argc, &argv, NULL); + /* locate the DRAM controller */ + pcibus = qpci_init_pc(NULL); + g_assert(pcibus != NULL); + pcidev = qpci_device_find(pcibus, 0); + g_assert(pcidev != NULL); - qtest_add_func("/q35/smram/lock", test_smram_lock); + /* Set TSEG size. Restrict TSEG visibility to SMM by setting T_EN. */ + esmramc_val = qpci_config_readb(pcidev, MCH_HOST_BRIDGE_ESMRAMC); + esmramc_val &= ~MCH_HOST_BRIDGE_ESMRAMC_TSEG_SZ_MASK; + esmramc_val |= args->esmramc_tseg_sz; + esmramc_val |= MCH_HOST_BRIDGE_ESMRAMC_T_EN; + qpci_config_writeb(pcidev, MCH_HOST_BRIDGE_ESMRAMC, esmramc_val); + + /* Enable TSEG by setting G_SMRAME. Close TSEG by setting D_CLS. */ + smram_val = qpci_config_readb(pcidev, MCH_HOST_BRIDGE_SMRAM); + smram_val &= ~(MCH_HOST_BRIDGE_SMRAM_D_OPEN | + MCH_HOST_BRIDGE_SMRAM_D_LCK); + smram_val |= (MCH_HOST_BRIDGE_SMRAM_D_CLS | + MCH_HOST_BRIDGE_SMRAM_G_SMRAME); + qpci_config_writeb(pcidev, MCH_HOST_BRIDGE_SMRAM, smram_val); + + /* lock TSEG */ + smram_val |= MCH_HOST_BRIDGE_SMRAM_D_LCK; + qpci_config_writeb(pcidev, MCH_HOST_BRIDGE_SMRAM, smram_val); + + /* Now check that the byte right before the TSEG is r/w, and that the first + * byte in the TSEG always reads as 0xff. + */ + ram_offs = (TSEG_SIZE_TEST_GUEST_RAM_MBYTES - args->expected_tseg_mbytes) * + 1024 * 1024 - 1; + g_assert_cmpint(readb(ram_offs), ==, 0); + writeb(ram_offs, 1); + g_assert_cmpint(readb(ram_offs), ==, 1); + + ram_offs++; + g_assert_cmpint(readb(ram_offs), ==, 0xff); + writeb(ram_offs, 1); + g_assert_cmpint(readb(ram_offs), ==, 0xff); - qtest_start("-M q35"); - ret = g_test_run(); + g_free(pcidev); + qpci_free_pc(pcibus); qtest_end(); +} + +int main(int argc, char **argv) +{ + g_test_init(&argc, &argv, NULL); + + qtest_add_func("/q35/smram/lock", test_smram_lock); - return ret; + qtest_add_data_func("/q35/tseg-size/1mb", &tseg_1mb, test_tseg_size); + qtest_add_data_func("/q35/tseg-size/2mb", &tseg_2mb, test_tseg_size); + qtest_add_data_func("/q35/tseg-size/8mb", &tseg_8mb, test_tseg_size); + qtest_add_data_func("/q35/tseg-size/ext/16mb", &tseg_ext_16mb, + test_tseg_size); + return g_test_run(); } diff --git a/trace-events b/trace-events index fd83087e39..bae63fdb1d 100644 --- a/trace-events +++ b/trace-events @@ -55,28 +55,6 @@ dma_complete(void *dbs, int ret, void *cb) "dbs=%p ret=%d cb=%p" dma_blk_cb(void *dbs, int ret) "dbs=%p ret=%d" dma_map_wait(void *dbs) "dbs=%p" -# kvm-all.c -kvm_ioctl(int type, void *arg) "type 0x%x, arg %p" -kvm_vm_ioctl(int type, void *arg) "type 0x%x, arg %p" -kvm_vcpu_ioctl(int cpu_index, int type, void *arg) "cpu_index %d, type 0x%x, arg %p" -kvm_run_exit(int cpu_index, uint32_t reason) "cpu_index %d, reason %d" -kvm_device_ioctl(int fd, int type, void *arg) "dev fd %d, type 0x%x, arg %p" -kvm_failed_reg_get(uint64_t id, const char *msg) "Warning: Unable to retrieve ONEREG %" PRIu64 " from KVM: %s" -kvm_failed_reg_set(uint64_t id, const char *msg) "Warning: Unable to set ONEREG %" PRIu64 " to KVM: %s" -kvm_irqchip_commit_routes(void) "" -kvm_irqchip_add_msi_route(char *name, int vector, int virq) "dev %s vector %d virq %d" -kvm_irqchip_update_msi_route(int virq) "Updating MSI route virq=%d" -kvm_irqchip_release_virq(int virq) "virq %d" - -# TCG related tracing (mostly disabled by default) -# cpu-exec.c -disable exec_tb(void *tb, uintptr_t pc) "tb:%p pc=0x%"PRIxPTR -disable exec_tb_nocache(void *tb, uintptr_t pc) "tb:%p pc=0x%"PRIxPTR -disable exec_tb_exit(void *last_tb, unsigned int flags) "tb:%p flags=%x" - -# translate-all.c -translate_block(void *tb, uintptr_t 
pc, uint8_t *tb_code) "tb:%p, pc:0x%"PRIxPTR", tb_code:%p" - # memory.c memory_region_ops_read(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size) "cpu %d mr %p addr %#"PRIx64" value %#"PRIx64" size %u" memory_region_ops_write(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size) "cpu %d mr %p addr %#"PRIx64" value %#"PRIx64" size %u" diff --git a/ui/cocoa.m b/ui/cocoa.m index 004ec2711c..1f010d3ae7 100644 --- a/ui/cocoa.m +++ b/ui/cocoa.m @@ -52,6 +52,8 @@ /* macOS 10.12 deprecated many constants, #define the new names for older SDKs */ #if MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_10_12 #define NSEventMaskAny NSAnyEventMask +#define NSEventModifierFlagCapsLock NSAlphaShiftKeyMask +#define NSEventModifierFlagShift NSShiftKeyMask #define NSEventModifierFlagCommand NSCommandKeyMask #define NSEventModifierFlagControl NSControlKeyMask #define NSEventModifierFlagOption NSAlternateKeyMask @@ -268,7 +270,7 @@ static void handleAnyDeviceErrors(Error * err) NSWindow *fullScreenWindow; float cx,cy,cw,ch,cdx,cdy; CGDataProviderRef dataProviderRef; - int modifiers_state[256]; + BOOL modifiers_state[256]; BOOL isMouseGrabbed; BOOL isFullscreen; BOOL isAbsoluteEnabled; @@ -536,18 +538,59 @@ QemuCocoaView *cocoaView; } } +- (void) toggleModifier: (int)keycode { + // Toggle the stored state. + modifiers_state[keycode] = !modifiers_state[keycode]; + // Send a keyup or keydown depending on the state. + qemu_input_event_send_key_qcode(dcl->con, keycode, modifiers_state[keycode]); +} + +- (void) toggleStatefulModifier: (int)keycode { + // Toggle the stored state. + modifiers_state[keycode] = !modifiers_state[keycode]; + // Generate keydown and keyup. + qemu_input_event_send_key_qcode(dcl->con, keycode, true); + qemu_input_event_send_key_qcode(dcl->con, keycode, false); +} + - (void) handleEvent:(NSEvent *)event { COCOA_DEBUG("QemuCocoaView: handleEvent\n"); int buttons = 0; - int keycode; + int keycode = 0; bool mouse_event = false; NSPoint p = [event locationInWindow]; switch ([event type]) { case NSEventTypeFlagsChanged: - keycode = cocoa_keycode_to_qemu([event keyCode]); + if ([event keyCode] == 0) { + // When the Cocoa keyCode is zero that means keys should be + // synthesized based on the values in the eventModifiers + // bitmask.
+ + if (qemu_console_is_graphic(NULL)) { + NSEventModifierFlags modifiers = [event modifierFlags]; + + if (!!(modifiers & NSEventModifierFlagCapsLock) != !!modifiers_state[Q_KEY_CODE_CAPS_LOCK]) { + [self toggleStatefulModifier:Q_KEY_CODE_CAPS_LOCK]; + } + if (!!(modifiers & NSEventModifierFlagShift) != !!modifiers_state[Q_KEY_CODE_SHIFT]) { + [self toggleModifier:Q_KEY_CODE_SHIFT]; + } + if (!!(modifiers & NSEventModifierFlagControl) != !!modifiers_state[Q_KEY_CODE_CTRL]) { + [self toggleModifier:Q_KEY_CODE_CTRL]; + } + if (!!(modifiers & NSEventModifierFlagOption) != !!modifiers_state[Q_KEY_CODE_ALT]) { + [self toggleModifier:Q_KEY_CODE_ALT]; + } + if (!!(modifiers & NSEventModifierFlagCommand) != !!modifiers_state[Q_KEY_CODE_META_L]) { + [self toggleModifier:Q_KEY_CODE_META_L]; + } + } + } else { + keycode = cocoa_keycode_to_qemu([event keyCode]); + } if ((keycode == Q_KEY_CODE_META_L || keycode == Q_KEY_CODE_META_R) && !isMouseGrabbed) { @@ -559,16 +602,9 @@ QemuCocoaView *cocoaView; // emulate caps lock and num lock keydown and keyup if (keycode == Q_KEY_CODE_CAPS_LOCK || keycode == Q_KEY_CODE_NUM_LOCK) { - qemu_input_event_send_key_qcode(dcl->con, keycode, true); - qemu_input_event_send_key_qcode(dcl->con, keycode, false); + [self toggleStatefulModifier:keycode]; } else if (qemu_console_is_graphic(NULL)) { - if (modifiers_state[keycode] == 0) { // keydown - qemu_input_event_send_key_qcode(dcl->con, keycode, true); - modifiers_state[keycode] = 1; - } else { // keyup - qemu_input_event_send_key_qcode(dcl->con, keycode, false); - modifiers_state[keycode] = 0; - } + [self toggleModifier:keycode]; } } diff --git a/ui/spice-core.c b/ui/spice-core.c index 7519965f75..ea04dc69b5 100644 --- a/ui/spice-core.c +++ b/ui/spice-core.c @@ -846,6 +846,7 @@ void qemu_spice_init(void) exit(1); } display_opengl = 1; + spice_opengl = 1; } #endif } diff --git a/ui/spice-display.c b/ui/spice-display.c index b353445f58..042292cc90 100644 --- a/ui/spice-display.c +++ b/ui/spice-display.c @@ -27,6 +27,7 @@ #include "ui/spice-display.h" static int debug = 0; +bool spice_opengl; static void GCC_FMT_ATTR(2, 3) dprint(int level, const char *fmt, ...) 
{ @@ -1013,7 +1014,7 @@ static void qemu_spice_display_init_one(QemuConsole *con) ssd->dcl.ops = &display_listener_ops; #ifdef HAVE_SPICE_GL - if (display_opengl) { + if (spice_opengl) { ssd->dcl.ops = &display_listener_gl_ops; ssd->gl_unblock_bh = qemu_bh_new(qemu_spice_gl_unblock_bh, ssd); ssd->gl_unblock_timer = timer_new_ms(QEMU_CLOCK_REALTIME, diff --git a/ui/spice-input.c b/ui/spice-input.c index 86293dd2ce..918580239d 100644 --- a/ui/spice-input.c +++ b/ui/spice-input.c @@ -87,7 +87,7 @@ static void kbd_leds(void *opaque, int ledstate) if (ledstate & QEMU_CAPS_LOCK_LED) { kbd->ledstate |= SPICE_KEYBOARD_MODIFIER_FLAGS_CAPS_LOCK; } - spice_server_kbd_leds(&kbd->sin, ledstate); + spice_server_kbd_leds(&kbd->sin, kbd->ledstate); } /* mouse bits */ diff --git a/util/Makefile.objs b/util/Makefile.objs index c6205ebf86..50a55ecc75 100644 --- a/util/Makefile.objs +++ b/util/Makefile.objs @@ -20,6 +20,7 @@ util-obj-y += host-utils.o util-obj-y += bitmap.o bitops.o hbitmap.o util-obj-y += fifo8.o util-obj-y += acl.o +util-obj-y += cacheinfo.o util-obj-y += error.o qemu-error.o util-obj-y += id.o util-obj-y += iov.o qemu-config.o qemu-sockets.o uri.o notify.o @@ -42,4 +43,5 @@ util-obj-y += log.o util-obj-y += qdist.o util-obj-y += qht.o util-obj-y += range.o +util-obj-y += stats64.o util-obj-y += systemd.o diff --git a/util/cacheinfo.c b/util/cacheinfo.c new file mode 100644 index 0000000000..f987522df4 --- /dev/null +++ b/util/cacheinfo.c @@ -0,0 +1,185 @@ +/* + * cacheinfo.c - helpers to query the host about its caches + * + * Copyright (C) 2017, Emilio G. Cota <cota@braap.org> + * License: GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" + +int qemu_icache_linesize = 0; +int qemu_dcache_linesize = 0; + +/* + * Operating system specific detection mechanisms. + */ + +#if defined(_AIX) +# include <sys/systemcfg.h> + +static void sys_cache_info(int *isize, int *dsize) +{ + *isize = _system_configuration.icache_line; + *dsize = _system_configuration.dcache_line; +} + +#elif defined(_WIN32) + +static void sys_cache_info(int *isize, int *dsize) +{ + SYSTEM_LOGICAL_PROCESSOR_INFORMATION *buf; + DWORD size = 0; + BOOL success; + size_t i, n; + + /* Check for the required buffer size first. Note that if the zero + size we use for the probe results in success, then there is no + data available; fail in that case. 
*/ + success = GetLogicalProcessorInformation(0, &size); + if (success || GetLastError() != ERROR_INSUFFICIENT_BUFFER) { + return; + } + + n = size / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); + size = n * sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); + buf = g_new0(SYSTEM_LOGICAL_PROCESSOR_INFORMATION, n); + if (!GetLogicalProcessorInformation(buf, &size)) { + goto fail; + } + + for (i = 0; i < n; i++) { + if (buf[i].Relationship == RelationCache + && buf[i].Cache.Level == 1) { + switch (buf[i].Cache.Type) { + case CacheUnified: + *isize = *dsize = buf[i].Cache.LineSize; + break; + case CacheInstruction: + *isize = buf[i].Cache.LineSize; + break; + case CacheData: + *dsize = buf[i].Cache.LineSize; + break; + default: + break; + } + } + } + fail: + g_free(buf); +} + +#elif defined(__APPLE__) \ + || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) +# include <sys/sysctl.h> +# if defined(__APPLE__) +# define SYSCTL_CACHELINE_NAME "hw.cachelinesize" +# else +# define SYSCTL_CACHELINE_NAME "machdep.cacheline_size" +# endif + +static void sys_cache_info(int *isize, int *dsize) +{ + /* There's only a single sysctl for both I/D cache line sizes. */ + long size; + size_t len = sizeof(size); + if (!sysctlbyname(SYSCTL_CACHELINE_NAME, &size, &len, NULL, 0)) { + *isize = *dsize = size; + } +} + +#else +/* POSIX */ + +static void sys_cache_info(int *isize, int *dsize) +{ +# ifdef _SC_LEVEL1_ICACHE_LINESIZE + *isize = sysconf(_SC_LEVEL1_ICACHE_LINESIZE); +# endif +# ifdef _SC_LEVEL1_DCACHE_LINESIZE + *dsize = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); +# endif +} +#endif /* sys_cache_info */ + +/* + * Architecture (+ OS) specific detection mechanisms. + */ + +#if defined(__aarch64__) + +static void arch_cache_info(int *isize, int *dsize) +{ + if (*isize == 0 || *dsize == 0) { + unsigned ctr; + + /* The real cache geometry is in CCSIDR_EL1/CLIDR_EL1/CSSELR_EL1, + but (at least under Linux) these are marked protected by the + kernel. However, CTR_EL0 contains the minimum linesize in the + entire hierarchy, and is used by userspace cache flushing. */ + asm volatile("mrs\t%0, ctr_el0" : "=r"(ctr)); + if (*isize == 0) { + *isize = 4 << (ctr & 0xf); + } + if (*dsize == 0) { + *dsize = 4 << ((ctr >> 16) & 0xf); + } + } +} + +#elif defined(_ARCH_PPC) && defined(__linux__) + +static void arch_cache_info(int *isize, int *dsize) +{ + if (*isize == 0) { + *isize = qemu_getauxval(AT_ICACHEBSIZE); + } + if (*dsize == 0) { + *dsize = qemu_getauxval(AT_DCACHEBSIZE); + } +} + +#else +static void arch_cache_info(int *isize, int *dsize) { } +#endif /* arch_cache_info */ + +/* + * ... and if all else fails ... + */ + +static void fallback_cache_info(int *isize, int *dsize) +{ + /* If we can only find one of the two, assume they're the same. */ + if (*isize) { + if (*dsize) { + /* Success! */ + } else { + *dsize = *isize; + } + } else if (*dsize) { + *isize = *dsize; + } else { +#if defined(_ARCH_PPC) + /* For PPC, we're going to use the icache size computed for + flush_icache_range. Which means that we must use the + architecture minimum. */ + *isize = *dsize = 16; +#else + /* Otherwise, 64 bytes is not uncommon. 
*/ + *isize = *dsize = 64; +#endif + } +} + +static void __attribute__((constructor)) init_cache_info(void) +{ + int isize = 0, dsize = 0; + + sys_cache_info(&isize, &dsize); + arch_cache_info(&isize, &dsize); + fallback_cache_info(&isize, &dsize); + + qemu_icache_linesize = isize; + qemu_dcache_linesize = dsize; +} diff --git a/util/stats64.c b/util/stats64.c new file mode 100644 index 0000000000..9968fcceac --- /dev/null +++ b/util/stats64.c @@ -0,0 +1,137 @@ +/* + * Atomic operations on 64-bit quantities. + * + * Copyright (C) 2017 Red Hat, Inc. + * + * Author: Paolo Bonzini <pbonzini@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/atomic.h" +#include "qemu/stats64.h" +#include "qemu/processor.h" + +#ifndef CONFIG_ATOMIC64 +static inline void stat64_rdlock(Stat64 *s) +{ + /* Keep out incoming writers to avoid them starving us. */ + atomic_add(&s->lock, 2); + + /* If there is a concurrent writer, wait for it. */ + while (atomic_read(&s->lock) & 1) { + cpu_relax(); + } +} + +static inline void stat64_rdunlock(Stat64 *s) +{ + atomic_sub(&s->lock, 2); +} + +static inline bool stat64_wrtrylock(Stat64 *s) +{ + return atomic_cmpxchg(&s->lock, 0, 1) == 0; +} + +static inline void stat64_wrunlock(Stat64 *s) +{ + atomic_dec(&s->lock); +} + +uint64_t stat64_get(const Stat64 *s) +{ + uint32_t high, low; + + stat64_rdlock((Stat64 *)s); + + /* 64-bit writes always take the lock, so we can read in + * any order. + */ + high = atomic_read(&s->high); + low = atomic_read(&s->low); + stat64_rdunlock((Stat64 *)s); + + return ((uint64_t)high << 32) | low; +} + +bool stat64_add32_carry(Stat64 *s, uint32_t low, uint32_t high) +{ + uint32_t old; + + if (!stat64_wrtrylock(s)) { + cpu_relax(); + return false; + } + + /* 64-bit reads always take the lock, so they don't care about the + * order of our update. By updating s->low first, we can check + * whether we have to carry into s->high. + */ + old = atomic_fetch_add(&s->low, low); + high += (old + low) < old; + atomic_add(&s->high, high); + stat64_wrunlock(s); + return true; +} + +bool stat64_min_slow(Stat64 *s, uint64_t value) +{ + uint32_t high, low; + uint64_t orig; + + if (!stat64_wrtrylock(s)) { + cpu_relax(); + return false; + } + + high = atomic_read(&s->high); + low = atomic_read(&s->low); + + orig = ((uint64_t)high << 32) | low; + if (orig < value) { + /* We have to set low before high, just like stat64_min reads + * high before low. The value may become higher temporarily, but + * stat64_get does not notice (it takes the lock) and the only ill + * effect on stat64_min is that the slow path may be triggered + * unnecessarily. + */ + atomic_set(&s->low, (uint32_t)value); + smp_wmb(); + atomic_set(&s->high, value >> 32); + } + stat64_wrunlock(s); + return true; +} + +bool stat64_max_slow(Stat64 *s, uint64_t value) +{ + uint32_t high, low; + uint64_t orig; + + if (!stat64_wrtrylock(s)) { + cpu_relax(); + return false; + } + + high = atomic_read(&s->high); + low = atomic_read(&s->low); + + orig = ((uint64_t)high << 32) | low; + if (orig > value) { + /* We have to set low before high, just like stat64_max reads + * high before low. The value may become lower temporarily, but + * stat64_get does not notice (it takes the lock) and the only ill + * effect on stat64_max is that the slow path may be triggered + * unnecessarily. 
+ */ + atomic_set(&s->low, (uint32_t)value); + smp_wmb(); + atomic_set(&s->high, value >> 32); + } + stat64_wrunlock(s); + return true; +} +#endif @@ -3757,21 +3757,18 @@ int main(int argc, char **argv, char **envp) qdev_prop_register_global(&kvm_pit_lost_tick_policy); break; } - case QEMU_OPTION_accel: { - QemuOpts *accel_opts; - + case QEMU_OPTION_accel: accel_opts = qemu_opts_parse_noisily(qemu_find_opts("accel"), optarg, true); optarg = qemu_opt_get(accel_opts, "accel"); if (!optarg || is_help_option(optarg)) { error_printf("Possible accelerators: kvm, xen, hax, tcg\n"); - exit(1); + exit(0); } - accel_opts = qemu_opts_create(qemu_find_opts("machine"), NULL, - false, &error_abort); - qemu_opt_set(accel_opts, "accel", optarg, &error_abort); + opts = qemu_opts_create(qemu_find_opts("machine"), NULL, + false, &error_abort); + qemu_opt_set(opts, "accel", optarg, &error_abort); break; - } case QEMU_OPTION_usb: olist = qemu_find_opts("machine"); qemu_opts_parse_noisily(olist, "usb=on", false); |
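A note on the Stat64 type introduced in util/stats64.c above: when the host lacks 64-bit atomics (the !CONFIG_ATOMIC64 path), the counter is kept as two 32-bit halves guarded by a small lock word in which bit 0 marks an active writer and every reader adds 2. Below is a minimal, self-contained C11 sketch of that protocol, for illustration only: the Stat64Demo type and the stat64_demo_* names are invented here, all operations use sequentially consistent atomics for simplicity (QEMU uses relaxed accesses plus an explicit smp_wmb()), and the writer spins instead of returning false to a retrying fast path the way stat64_add32_carry() does.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
    atomic_uint low;
    atomic_uint high;
    atomic_uint lock;   /* bit 0: writer active; higher bits: 2 * reader count */
} Stat64Demo;

/* Readers announce themselves by adding 2, then wait out any writer
 * that grabbed bit 0 before they arrived. */
static uint64_t stat64_demo_get(Stat64Demo *s)
{
    uint32_t hi, lo;

    atomic_fetch_add(&s->lock, 2);
    while (atomic_load(&s->lock) & 1) {
        /* spin until the concurrent writer releases bit 0 */
    }
    hi = atomic_load(&s->high);
    lo = atomic_load(&s->low);
    atomic_fetch_sub(&s->lock, 2);
    return ((uint64_t)hi << 32) | lo;
}

/* A writer may take bit 0 only when nobody else holds the lock word,
 * i.e. when it is exactly 0. */
static void stat64_demo_add(Stat64Demo *s, uint64_t value)
{
    uint32_t lo = (uint32_t)value, hi = value >> 32, old;
    unsigned expected = 0;

    while (!atomic_compare_exchange_weak(&s->lock, &expected, 1)) {
        expected = 0;   /* contended: retry until the word drops to 0 */
    }
    old = atomic_fetch_add(&s->low, lo);
    hi += (old + lo) < old;   /* propagate the carry into the high word */
    atomic_fetch_add(&s->high, hi);
    atomic_fetch_sub(&s->lock, 1);
}

int main(void)
{
    Stat64Demo s = { 0, 0, 0 };

    stat64_demo_add(&s, 0xffffffffu);   /* saturate the low word */
    stat64_demo_add(&s, 1);             /* force a carry */
    printf("value = 0x%llx\n", (unsigned long long)stat64_demo_get(&s));
    return 0;
}

Built with cc -std=c11, this prints value = 0x100000000, showing the carry from the low into the high word that stat64_add32_carry() handles in the patch.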