94 files changed, 2056 insertions, 817 deletions
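The cpus.c and exec.c hunks below let device MMIO run outside the Big QEMU Lock: qemu_mutex_lock_iothread()/qemu_mutex_unlock_iothread() now maintain a thread-local flag, and qemu_mutex_iothread_locked() reads it so the new prepare_mmio_access() can decide whether the lock still needs to be taken. A minimal, self-contained sketch of that pattern using plain pthreads (the global_mutex_* names are illustrative, not QEMU API):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t global_mutex = PTHREAD_MUTEX_INITIALIZER;
static __thread bool global_mutex_locked; /* per-thread; no atomics needed */

/* Cheap "do I hold the lock?" query: only the owning thread ever
 * reads or writes its own copy of the flag. */
bool global_mutex_is_locked(void)
{
    return global_mutex_locked;
}

void global_mutex_lock(void)
{
    pthread_mutex_lock(&global_mutex);
    global_mutex_locked = true;   /* set only while the lock is held */
}

void global_mutex_unlock(void)
{
    global_mutex_locked = false;  /* clear before dropping the lock */
    pthread_mutex_unlock(&global_mutex);
}

Because the flag lives in thread-local storage it can be read without synchronization; a caller that may or may not already hold the lock (as in prepare_mmio_access() below) acquires it only when its own flag says it does not.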
diff --git a/cpu-exec.c b/cpu-exec.c index 2ffeb6e40d..b2724c18c1 100644 --- a/cpu-exec.c +++ b/cpu-exec.c @@ -27,6 +27,7 @@ #include "exec/address-spaces.h" #include "exec/memory-internal.h" #include "qemu/rcu.h" +#include "exec/tb-hash.h" /* -icount align implementation. */
diff --git a/cpus.c b/cpus.c --- a/cpus.c +++ b/cpus.c @@ -954,7 +954,7 @@ static void *qemu_kvm_cpu_thread_fn(void *arg) CPUState *cpu = arg; int r; - qemu_mutex_lock(&qemu_global_mutex); + qemu_mutex_lock_iothread(); qemu_thread_get_self(cpu->thread); cpu->thread_id = qemu_get_thread_id(); cpu->can_do_io = 1; @@ -1034,10 +1034,10 @@ static void *qemu_tcg_cpu_thread_fn(void *arg) { CPUState *cpu = arg; + qemu_mutex_lock_iothread(); qemu_tcg_init_cpu_signals(); qemu_thread_get_self(cpu->thread); - qemu_mutex_lock(&qemu_global_mutex); CPU_FOREACH(cpu) { cpu->thread_id = qemu_get_thread_id(); cpu->created = true; @@ -1146,10 +1146,21 @@ bool qemu_in_vcpu_thread(void) return current_cpu && qemu_cpu_is_self(current_cpu); } +static __thread bool iothread_locked = false; + +bool qemu_mutex_iothread_locked(void) +{ + return iothread_locked; +} + void qemu_mutex_lock_iothread(void) { atomic_inc(&iothread_requesting_mutex); - if (!tcg_enabled() || !first_cpu || !first_cpu->thread) { + /* In the simple case there is no need to bump the VCPU thread out of + * TCG code execution. + */ + if (!tcg_enabled() || qemu_in_vcpu_thread() || + !first_cpu || !first_cpu->thread) { qemu_mutex_lock(&qemu_global_mutex); atomic_dec(&iothread_requesting_mutex); } else { @@ -1160,10 +1171,12 @@ void qemu_mutex_lock_iothread(void) atomic_dec(&iothread_requesting_mutex); qemu_cond_broadcast(&qemu_io_proceeded_cond); } + iothread_locked = true; } void qemu_mutex_unlock_iothread(void) { + iothread_locked = false; qemu_mutex_unlock(&qemu_global_mutex); }
diff --git a/docs/multiseat.txt b/docs/multiseat.txt index 814496e94c..ebf2446933 100644 --- a/docs/multiseat.txt +++ b/docs/multiseat.txt @@ -2,8 +2,8 @@ multiseat howto (with some multihead coverage) ============================================== -host side ---------- +host devices +------------ First you must compile qemu with a user interface supporting multihead/multiseat and input event routing. Right now this @@ -41,6 +41,19 @@ The "display=video2" sets up the input routing. Any input coming from the window which belongs to the video.2 display adapter will be routed to these input devices. +Starting with qemu 2.4 and linux kernel 4.1 you can also use virtio +for the input devices, using this ... + + -device pci-bridge,addr=12.0,chassis_nr=2,id=head.2 \ + -device secondary-vga,bus=head.2,addr=02.0,id=video.2 \ + -device virtio-keyboard-pci,bus=head.2,addr=03.0,display=video.2 \ + -device virtio-tablet-pci,bus=head.2,addr=03.1,display=video.2 + +... instead of xhci and usb hid devices. + +host ui +------- + The sdl2 ui will start up with two windows, one for each display device. The gtk ui will start with a single window and each display in a separate tab. You can either simply switch tabs to switch heads, @@ -110,7 +123,7 @@ Background info is here: guest side with pci-bridge-seat ------------------------------- -Qemu version FIXME and newer has a new pci-bridge-seat device which +Qemu version 2.4 and newer has a new pci-bridge-seat device which can be used instead of pci-bridge. Just swap the device name in the qemu command line above. The only difference between the two devices is the pci id.
We can match the pci id instead of the device path @@ -121,9 +134,10 @@ configuration: SUBSYSTEM=="pci", ATTR{vendor}=="0x1b36", ATTR{device}=="0x000a", \ TAG+="seat", ENV{ID_AUTOSEAT}="1" -Patch with this rule will be submitted to upstream udev/systemd, so -long-term, when systemd with this lands in distros, things will work -just fine without any manual guest configuration. +Patch with this rule has been submitted to upstream udev/systemd, was +accepted and should be included in the next systemd release (222). +So, if your guest has this or a newer version, multiseat will work just +fine without any manual guest configuration. Enjoy!
diff --git a/exec.c b/exec.c --- a/exec.c +++ b/exec.c @@ -48,6 +48,7 @@ #endif #include "exec/cpu-all.h" #include "qemu/rcu_queue.h" +#include "qemu/main-loop.h" #include "exec/cputlb.h" #include "translate-all.h" @@ -352,6 +353,18 @@ address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *x *xlat = addr + section->offset_within_region; mr = section->mr; + + /* MMIO registers can be expected to perform full-width accesses based only + * on their address, without considering adjacent registers that could + * decode to completely different MemoryRegions. When such registers + * exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO + * regions overlap wildly. For this reason we cannot clamp the accesses + * here. + * + * If the length is small (as is the case for address_space_ldl/stl), + * everything works fine. If the incoming length is large, however, + * the caller really has to do the clamping through memory_access_size. + */ if (memory_region_is_ram(mr)) { diff = int128_sub(section->size, int128_make64(addr)); *plen = int128_get64(int128_min(diff, int128_make64(*plen))); @@ -2316,6 +2329,29 @@ static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr) return l; } +static bool prepare_mmio_access(MemoryRegion *mr) +{ + bool unlocked = !qemu_mutex_iothread_locked(); + bool release_lock = false; + + if (unlocked && mr->global_locking) { + qemu_mutex_lock_iothread(); + unlocked = false; + release_lock = true; + } + if (mr->flush_coalesced_mmio) { + if (unlocked) { + qemu_mutex_lock_iothread(); + } + qemu_flush_coalesced_mmio_buffer(); + if (unlocked) { + qemu_mutex_unlock_iothread(); + } + } + + return release_lock; +} + MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs, uint8_t *buf, int len, bool is_write) { @@ -2325,6 +2361,7 @@ MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs, hwaddr addr1; MemoryRegion *mr; MemTxResult result = MEMTX_OK; + bool release_lock = false; rcu_read_lock(); while (len > 0) { @@ -2333,6 +2370,7 @@ MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs, if (is_write) { if (!memory_access_is_direct(mr, is_write)) { + release_lock |= prepare_mmio_access(mr); l = memory_access_size(mr, l, addr1); /* XXX: could force current_cpu to NULL to avoid potential bugs */ @@ -2374,6 +2412,7 @@ MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs, } else { if (!memory_access_is_direct(mr, is_write)) { /* I/O case */ + release_lock |= prepare_mmio_access(mr); l = memory_access_size(mr, l, addr1); switch (l) { case 8: @@ -2409,6 +2448,12 @@ MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs, memcpy(buf, ptr, l); } } + + if (release_lock) { + qemu_mutex_unlock_iothread(); + release_lock = false; + } + len -= l; buf += l; addr += l; @@ -2458,7 +2503,7 @@ static inline void cpu_physical_memory_write_rom_internal(AddressSpace
*as, if (!(memory_region_is_ram(mr) || memory_region_is_romd(mr))) { - /* do nothing */ + l = memory_access_size(mr, l, addr1); } else { addr1 += memory_region_get_ram_addr(mr); /* ROM/RAM case */ @@ -2735,10 +2780,13 @@ static inline uint32_t address_space_ldl_internal(AddressSpace *as, hwaddr addr, hwaddr l = 4; hwaddr addr1; MemTxResult r; + bool release_lock = false; rcu_read_lock(); mr = address_space_translate(as, addr, &addr1, &l, false); if (l < 4 || !memory_access_is_direct(mr, false)) { + release_lock |= prepare_mmio_access(mr); + /* I/O case */ r = memory_region_dispatch_read(mr, addr1, &val, 4, attrs); #if defined(TARGET_WORDS_BIGENDIAN) @@ -2771,6 +2819,9 @@ static inline uint32_t address_space_ldl_internal(AddressSpace *as, hwaddr addr, if (result) { *result = r; } + if (release_lock) { + qemu_mutex_unlock_iothread(); + } rcu_read_unlock(); return val; } @@ -2823,11 +2874,14 @@ static inline uint64_t address_space_ldq_internal(AddressSpace *as, hwaddr addr, hwaddr l = 8; hwaddr addr1; MemTxResult r; + bool release_lock = false; rcu_read_lock(); mr = address_space_translate(as, addr, &addr1, &l, false); if (l < 8 || !memory_access_is_direct(mr, false)) { + release_lock |= prepare_mmio_access(mr); + /* I/O case */ r = memory_region_dispatch_read(mr, addr1, &val, 8, attrs); #if defined(TARGET_WORDS_BIGENDIAN) @@ -2860,6 +2914,9 @@ static inline uint64_t address_space_ldq_internal(AddressSpace *as, hwaddr addr, if (result) { *result = r; } + if (release_lock) { + qemu_mutex_unlock_iothread(); + } rcu_read_unlock(); return val; } @@ -2932,11 +2989,14 @@ static inline uint32_t address_space_lduw_internal(AddressSpace *as, hwaddr l = 2; hwaddr addr1; MemTxResult r; + bool release_lock = false; rcu_read_lock(); mr = address_space_translate(as, addr, &addr1, &l, false); if (l < 2 || !memory_access_is_direct(mr, false)) { + release_lock |= prepare_mmio_access(mr); + /* I/O case */ r = memory_region_dispatch_read(mr, addr1, &val, 2, attrs); #if defined(TARGET_WORDS_BIGENDIAN) @@ -2969,6 +3029,9 @@ static inline uint32_t address_space_lduw_internal(AddressSpace *as, if (result) { *result = r; } + if (release_lock) { + qemu_mutex_unlock_iothread(); + } rcu_read_unlock(); return val; } @@ -3021,11 +3084,14 @@ void address_space_stl_notdirty(AddressSpace *as, hwaddr addr, uint32_t val, hwaddr addr1; MemTxResult r; uint8_t dirty_log_mask; + bool release_lock = false; rcu_read_lock(); mr = address_space_translate(as, addr, &addr1, &l, true); if (l < 4 || !memory_access_is_direct(mr, true)) { + release_lock |= prepare_mmio_access(mr); + r = memory_region_dispatch_write(mr, addr1, val, 4, attrs); } else { addr1 += memory_region_get_ram_addr(mr) & TARGET_PAGE_MASK; @@ -3040,6 +3106,9 @@ void address_space_stl_notdirty(AddressSpace *as, hwaddr addr, uint32_t val, if (result) { *result = r; } + if (release_lock) { + qemu_mutex_unlock_iothread(); + } rcu_read_unlock(); } @@ -3060,11 +3129,14 @@ static inline void address_space_stl_internal(AddressSpace *as, hwaddr l = 4; hwaddr addr1; MemTxResult r; + bool release_lock = false; rcu_read_lock(); mr = address_space_translate(as, addr, &addr1, &l, true); if (l < 4 || !memory_access_is_direct(mr, true)) { + release_lock |= prepare_mmio_access(mr); + #if defined(TARGET_WORDS_BIGENDIAN) if (endian == DEVICE_LITTLE_ENDIAN) { val = bswap32(val); @@ -3096,6 +3168,9 @@ static inline void address_space_stl_internal(AddressSpace *as, if (result) { *result = r; } + if (release_lock) { + qemu_mutex_unlock_iothread(); + } rcu_read_unlock(); } @@ -3165,10 +3240,13 
@@ static inline void address_space_stw_internal(AddressSpace *as, hwaddr l = 2; hwaddr addr1; MemTxResult r; + bool release_lock = false; rcu_read_lock(); mr = address_space_translate(as, addr, &addr1, &l, true); if (l < 2 || !memory_access_is_direct(mr, true)) { + release_lock |= prepare_mmio_access(mr); + #if defined(TARGET_WORDS_BIGENDIAN) if (endian == DEVICE_LITTLE_ENDIAN) { val = bswap16(val); @@ -3200,6 +3278,9 @@ static inline void address_space_stw_internal(AddressSpace *as, if (result) { *result = r; } + if (release_lock) { + qemu_mutex_unlock_iothread(); + } rcu_read_unlock(); } diff --git a/hw/acpi/core.c b/hw/acpi/core.c index 0f201d8c6d..fe6215af4a 100644 --- a/hw/acpi/core.c +++ b/hw/acpi/core.c @@ -528,6 +528,7 @@ void acpi_pm_tmr_init(ACPIREGS *ar, acpi_update_sci_fn update_sci, ar->tmr.timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, acpi_pm_tmr_timer, ar); memory_region_init_io(&ar->tmr.io, memory_region_owner(parent), &acpi_pm_tmr_ops, ar, "acpi-tmr", 4); + memory_region_clear_global_locking(&ar->tmr.io); memory_region_add_subregion(parent, 8, &ar->tmr.io); } diff --git a/hw/acpi/ich9.c b/hw/acpi/ich9.c index 8a64ffb38f..f4dc7a84be 100644 --- a/hw/acpi/ich9.c +++ b/hw/acpi/ich9.c @@ -192,7 +192,7 @@ static void pm_reset(void *opaque) acpi_pm_tmr_reset(&pm->acpi_regs); acpi_gpe_reset(&pm->acpi_regs); - if (kvm_enabled()) { + if (!pm->smm_enabled) { /* Mark SMM as already inited to prevent SMM from running. KVM does not * support SMM mode. */ pm->smi_en |= ICH9_PMIO_SMI_EN_APMC_EN; @@ -209,7 +209,7 @@ static void pm_powerdown_req(Notifier *n, void *opaque) acpi_pm1_evt_power_down(&pm->acpi_regs); } -void ich9_pm_init(PCIDevice *lpc_pci, ICH9LPCPMRegs *pm, +void ich9_pm_init(PCIDevice *lpc_pci, ICH9LPCPMRegs *pm, bool smm_enabled, qemu_irq sci_irq) { memory_region_init(&pm->io, OBJECT(lpc_pci), "ich9-pm", ICH9_PMIO_SIZE); @@ -231,6 +231,7 @@ void ich9_pm_init(PCIDevice *lpc_pci, ICH9LPCPMRegs *pm, "acpi-smi", 8); memory_region_add_subregion(&pm->io, ICH9_PMIO_SMI_EN, &pm->io_smi); + pm->smm_enabled = smm_enabled; pm->irq = sci_irq; qemu_register_reset(pm_reset, pm); pm->powerdown_notifier.notify = pm_powerdown_req; diff --git a/hw/acpi/piix4.c b/hw/acpi/piix4.c index 3bd1d5a865..2cd2fee897 100644 --- a/hw/acpi/piix4.c +++ b/hw/acpi/piix4.c @@ -72,7 +72,7 @@ typedef struct PIIX4PMState { qemu_irq irq; qemu_irq smi_irq; - int kvm_enabled; + int smm_enabled; Notifier machine_ready; Notifier powerdown_notifier; @@ -112,6 +112,9 @@ static void apm_ctrl_changed(uint32_t val, void *arg) /* ACPI specs 3.0, 4.7.2.5 */ acpi_pm1_cnt_update(&s->ar, val == ACPI_ENABLE, val == ACPI_DISABLE); + if (val == ACPI_ENABLE || val == ACPI_DISABLE) { + return; + } if (d->config[0x5b] & (1 << 1)) { if (s->smi_irq) { @@ -319,7 +322,7 @@ static void piix4_reset(void *opaque) pci_conf[0x40] = 0x01; /* PM io base read only bit */ pci_conf[0x80] = 0; - if (s->kvm_enabled) { + if (!s->smm_enabled) { /* Mark SMM as already inited (until KVM supports SMM). */ pci_conf[0x5B] = 0x02; } @@ -450,7 +453,7 @@ static void piix4_pm_realize(PCIDevice *dev, Error **errp) /* APM */ apm_init(dev, &s->apm, apm_ctrl_changed, s); - if (s->kvm_enabled) { + if (!s->smm_enabled) { /* Mark SMM as already inited to prevent SMM from running. KVM does not * support SMM mode. 
*/ pci_conf[0x5B] = 0x02; @@ -501,7 +504,7 @@ Object *piix4_pm_find(void) I2CBus *piix4_pm_init(PCIBus *bus, int devfn, uint32_t smb_io_base, qemu_irq sci_irq, qemu_irq smi_irq, - int kvm_enabled, DeviceState **piix4_pm) + int smm_enabled, DeviceState **piix4_pm) { DeviceState *dev; PIIX4PMState *s; @@ -515,7 +518,7 @@ I2CBus *piix4_pm_init(PCIBus *bus, int devfn, uint32_t smb_io_base, s = PIIX4_PM(dev); s->irq = sci_irq; s->smi_irq = smi_irq; - s->kvm_enabled = kvm_enabled; + s->smm_enabled = smm_enabled; if (xen_enabled()) { s->use_acpi_pci_hotplug = false; } diff --git a/hw/core/sysbus.c b/hw/core/sysbus.c index 278a2d1bdd..3c58629894 100644 --- a/hw/core/sysbus.c +++ b/hw/core/sysbus.c @@ -109,7 +109,13 @@ qemu_irq sysbus_get_connected_irq(SysBusDevice *dev, int n) void sysbus_connect_irq(SysBusDevice *dev, int n, qemu_irq irq) { + SysBusDeviceClass *sbd = SYS_BUS_DEVICE_GET_CLASS(dev); + qdev_connect_gpio_out_named(DEVICE(dev), SYSBUS_DEVICE_GPIO_IRQ, n, irq); + + if (sbd->connect_irq_notifier) { + sbd->connect_irq_notifier(dev, irq); + } } /* Check whether an MMIO region exists */ diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index 00818b925b..aed811a166 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -1509,7 +1509,7 @@ build_srat(GArray *table_data, GArray *linker, PcGuestInfo *guest_info) */ if (hotplugabble_address_space_size) { numamem = acpi_data_push(table_data, sizeof *numamem); - acpi_build_srat_memory(numamem, pcms->hotplug_memory_base, + acpi_build_srat_memory(numamem, pcms->hotplug_memory.base, hotplugabble_address_space_size, 0, MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED); diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 7072930cfc..7959b44b6b 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -64,7 +64,6 @@ #include "hw/pci/pci_host.h" #include "acpi-build.h" #include "hw/mem/pc-dimm.h" -#include "trace.h" #include "qapi/visitor.h" #include "qapi-visit.h" @@ -1297,7 +1296,7 @@ FWCfgState *pc_memory_init(MachineState *machine, exit(EXIT_FAILURE); } - pcms->hotplug_memory_base = + pcms->hotplug_memory.base = ROUND_UP(0x100000000ULL + above_4g_mem_size, 1ULL << 30); if (pcms->enforce_aligned_dimm) { @@ -1305,17 +1304,17 @@ FWCfgState *pc_memory_init(MachineState *machine, hotplug_mem_size += (1ULL << 30) * machine->ram_slots; } - if ((pcms->hotplug_memory_base + hotplug_mem_size) < + if ((pcms->hotplug_memory.base + hotplug_mem_size) < hotplug_mem_size) { error_report("unsupported amount of maximum memory: " RAM_ADDR_FMT, machine->maxram_size); exit(EXIT_FAILURE); } - memory_region_init(&pcms->hotplug_memory, OBJECT(pcms), + memory_region_init(&pcms->hotplug_memory.mr, OBJECT(pcms), "hotplug-memory", hotplug_mem_size); - memory_region_add_subregion(system_memory, pcms->hotplug_memory_base, - &pcms->hotplug_memory); + memory_region_add_subregion(system_memory, pcms->hotplug_memory.base, + &pcms->hotplug_memory.mr); } /* Initialize PC system firmware */ @@ -1333,9 +1332,9 @@ FWCfgState *pc_memory_init(MachineState *machine, fw_cfg = bochs_bios_init(); rom_set_fw(fw_cfg); - if (guest_info->has_reserved_memory && pcms->hotplug_memory_base) { + if (guest_info->has_reserved_memory && pcms->hotplug_memory.base) { uint64_t *val = g_malloc(sizeof(*val)); - *val = cpu_to_le64(ROUND_UP(pcms->hotplug_memory_base, 0x1ULL << 30)); + *val = cpu_to_le64(ROUND_UP(pcms->hotplug_memory.base, 0x1ULL << 30)); fw_cfg_add_file(fw_cfg, "etc/reserved-memory-end", val, sizeof(*val)); } @@ -1554,88 +1553,31 @@ void ioapic_init_gsi(GSIState *gsi_state, const char 
*parent_name) static void pc_dimm_plug(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { - int slot; HotplugHandlerClass *hhc; Error *local_err = NULL; PCMachineState *pcms = PC_MACHINE(hotplug_dev); - MachineState *machine = MACHINE(hotplug_dev); PCDIMMDevice *dimm = PC_DIMM(dev); PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm); MemoryRegion *mr = ddc->get_memory_region(dimm); - uint64_t existing_dimms_capacity = 0; uint64_t align = TARGET_PAGE_SIZE; - uint64_t addr; - - addr = object_property_get_int(OBJECT(dimm), PC_DIMM_ADDR_PROP, &local_err); - if (local_err) { - goto out; - } if (memory_region_get_alignment(mr) && pcms->enforce_aligned_dimm) { align = memory_region_get_alignment(mr); } - addr = pc_dimm_get_free_addr(pcms->hotplug_memory_base, - memory_region_size(&pcms->hotplug_memory), - !addr ? NULL : &addr, align, - memory_region_size(mr), &local_err); - if (local_err) { - goto out; - } - - existing_dimms_capacity = pc_existing_dimms_capacity(&local_err); - if (local_err) { - goto out; - } - - if (existing_dimms_capacity + memory_region_size(mr) > - machine->maxram_size - machine->ram_size) { - error_setg(&local_err, "not enough space, currently 0x%" PRIx64 - " in use of total hot pluggable 0x" RAM_ADDR_FMT, - existing_dimms_capacity, - machine->maxram_size - machine->ram_size); - goto out; - } - - object_property_set_int(OBJECT(dev), addr, PC_DIMM_ADDR_PROP, &local_err); - if (local_err) { - goto out; - } - trace_mhp_pc_dimm_assigned_address(addr); - - slot = object_property_get_int(OBJECT(dev), PC_DIMM_SLOT_PROP, &local_err); - if (local_err) { - goto out; - } - - slot = pc_dimm_get_free_slot(slot == PC_DIMM_UNASSIGNED_SLOT ? NULL : &slot, - machine->ram_slots, &local_err); - if (local_err) { - goto out; - } - object_property_set_int(OBJECT(dev), slot, PC_DIMM_SLOT_PROP, &local_err); - if (local_err) { - goto out; - } - trace_mhp_pc_dimm_assigned_slot(slot); - if (!pcms->acpi_dev) { error_setg(&local_err, "memory hotplug is not enabled: missing acpi device"); goto out; } - if (kvm_enabled() && !kvm_has_free_slot(machine)) { - error_setg(&local_err, "hypervisor has no free memory slots left"); + pc_dimm_memory_plug(dev, &pcms->hotplug_memory, mr, align, &local_err); + if (local_err) { goto out; } - memory_region_add_subregion(&pcms->hotplug_memory, - addr - pcms->hotplug_memory_base, mr); - vmstate_register_ram(mr, dev); - hhc = HOTPLUG_HANDLER_GET_CLASS(pcms->acpi_dev); - hhc->plug(HOTPLUG_HANDLER(pcms->acpi_dev), dev, &local_err); + hhc->plug(HOTPLUG_HANDLER(pcms->acpi_dev), dev, &error_abort); out: error_propagate(errp, local_err); } @@ -1677,9 +1619,7 @@ static void pc_dimm_unplug(HotplugHandler *hotplug_dev, goto out; } - memory_region_del_subregion(&pcms->hotplug_memory, mr); - vmstate_unregister_ram(mr, dev); - + pc_dimm_memory_unplug(dev, &pcms->hotplug_memory, mr); object_unparent(OBJECT(dev)); out: @@ -1766,7 +1706,7 @@ pc_machine_get_hotplug_memory_region_size(Object *obj, Visitor *v, void *opaque, const char *name, Error **errp) { PCMachineState *pcms = PC_MACHINE(obj); - int64_t value = memory_region_size(&pcms->hotplug_memory); + int64_t value = memory_region_size(&pcms->hotplug_memory.mr); visit_type_int(v, &value, name, errp); } @@ -1828,6 +1768,48 @@ static void pc_machine_set_vmport(Object *obj, Visitor *v, void *opaque, visit_type_OnOffAuto(v, &pcms->vmport, name, errp); } +bool pc_machine_is_smm_enabled(PCMachineState *pcms) +{ + bool smm_available = false; + + if (pcms->smm == ON_OFF_AUTO_OFF) { + return false; + } + + if (tcg_enabled() || 
qtest_enabled()) { + smm_available = true; + } else if (kvm_enabled()) { + smm_available = kvm_has_smm(); + } + + if (smm_available) { + return true; + } + + if (pcms->smm == ON_OFF_AUTO_ON) { + error_report("System Management Mode not supported by this hypervisor."); + exit(1); + } + return false; +} + +static void pc_machine_get_smm(Object *obj, Visitor *v, void *opaque, + const char *name, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + OnOffAuto smm = pcms->smm; + + visit_type_OnOffAuto(v, &smm, name, errp); +} + +static void pc_machine_set_smm(Object *obj, Visitor *v, void *opaque, + const char *name, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + visit_type_OnOffAuto(v, &pcms->smm, name, errp); +} + static bool pc_machine_get_aligned_dimm(Object *obj, Error **errp) { PCMachineState *pcms = PC_MACHINE(obj); @@ -1852,6 +1834,15 @@ static void pc_machine_initfn(Object *obj) "Maximum ram below the 4G boundary (32bit boundary)", NULL); + pcms->smm = ON_OFF_AUTO_AUTO; + object_property_add(obj, PC_MACHINE_SMM, "OnOffAuto", + pc_machine_get_smm, + pc_machine_set_smm, + NULL, NULL, NULL); + object_property_set_description(obj, PC_MACHINE_SMM, + "Enable SMM (pc & q35)", + NULL); + pcms->vmport = ON_OFF_AUTO_AUTO; object_property_add(obj, PC_MACHINE_VMPORT, "OnOffAuto", pc_machine_get_vmport, diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index e142f75649..56cdcb9661 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -287,7 +287,8 @@ static void pc_init1(MachineState *machine) /* TODO: Populate SPD eeprom data. */ smbus = piix4_pm_init(pci_bus, piix3_devfn + 3, 0xb100, gsi[9], smi_irq, - kvm_enabled(), &piix4_pm); + pc_machine_is_smm_enabled(pc_machine), + &piix4_pm); smbus_eeprom_init(smbus, 8, NULL, 0); object_property_add_link(OBJECT(machine), PC_MACHINE_ACPI_DEVICE_PROP, @@ -306,7 +307,11 @@ static void pc_init1(MachineState *machine) static void pc_compat_2_3(MachineState *machine) { + PCMachineState *pcms = PC_MACHINE(machine); savevm_skip_section_footers(); + if (kvm_enabled()) { + pcms->smm = ON_OFF_AUTO_OFF; + } } static void pc_compat_2_2(MachineState *machine) diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 082cd93bb2..8aa3a67fdf 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -253,7 +253,7 @@ static void pc_q35_init(MachineState *machine) (pc_machine->vmport != ON_OFF_AUTO_ON), 0xff0104); /* connect pm stuff to lpc */ - ich9_lpc_pm_init(lpc); + ich9_lpc_pm_init(lpc, pc_machine_is_smm_enabled(pc_machine)); /* ahci and SATA device, for q35 1 ahci controller is built-in */ ahci = pci_create_simple_multifunction(host_bus, @@ -290,7 +290,11 @@ static void pc_q35_init(MachineState *machine) static void pc_compat_2_3(MachineState *machine) { + PCMachineState *pcms = PC_MACHINE(machine); savevm_skip_section_footers(); + if (kvm_enabled()) { + pcms->smm = ON_OFF_AUTO_OFF; + } } static void pc_compat_2_2(MachineState *machine) diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c index b4b65c100a..bb6a92f7f4 100644 --- a/hw/ide/ahci.c +++ b/hw/ide/ahci.c @@ -45,11 +45,11 @@ do { \ } while (0) static void check_cmd(AHCIState *s, int port); -static int handle_cmd(AHCIState *s,int port,int slot); +static int handle_cmd(AHCIState *s, int port, uint8_t slot); static void ahci_reset_port(AHCIState *s, int port); static void ahci_write_fis_d2h(AHCIDevice *ad, uint8_t *cmd_fis); static void ahci_init_d2h(AHCIDevice *ad); -static int ahci_dma_prepare_buf(IDEDMA *dma, int is_write); +static int ahci_dma_prepare_buf(IDEDMA *dma, int32_t limit); static void 
ahci_commit_buf(IDEDMA *dma, uint32_t tx_bytes); static bool ahci_map_clb_address(AHCIDevice *ad); static bool ahci_map_fis_address(AHCIDevice *ad); @@ -106,8 +106,6 @@ static uint32_t ahci_port_read(AHCIState *s, int port, int offset) val = pr->scr_err; break; case PORT_SCR_ACT: - pr->scr_act &= ~s->dev[port].finished; - s->dev[port].finished = 0; val = pr->scr_act; break; case PORT_CMD_ISSUE: @@ -331,8 +329,7 @@ static void ahci_port_write(AHCIState *s, int port, int offset, uint32_t val) } } -static uint64_t ahci_mem_read(void *opaque, hwaddr addr, - unsigned size) +static uint64_t ahci_mem_read_32(void *opaque, hwaddr addr) { AHCIState *s = opaque; uint32_t val = 0; @@ -368,6 +365,30 @@ static uint64_t ahci_mem_read(void *opaque, hwaddr addr, } +/** + * AHCI 1.3 section 3 ("HBA Memory Registers") + * Support unaligned 8/16/32 bit reads, and 64 bit aligned reads. + * Caller is responsible for masking unwanted higher order bytes. + */ +static uint64_t ahci_mem_read(void *opaque, hwaddr addr, unsigned size) +{ + hwaddr aligned = addr & ~0x3; + int ofst = addr - aligned; + uint64_t lo = ahci_mem_read_32(opaque, aligned); + uint64_t hi; + + /* if < 8 byte read does not cross 4 byte boundary */ + if (ofst + size <= 4) { + return lo >> (ofst * 8); + } + g_assert_cmpint(size, >, 1); + + /* If the 64bit read is unaligned, we will produce undefined + * results. AHCI does not support unaligned 64bit reads. */ + hi = ahci_mem_read_32(opaque, aligned + 4); + return (hi << 32 | lo) >> (ofst * 8); +} + static void ahci_mem_write(void *opaque, hwaddr addr, uint64_t val, unsigned size) @@ -483,7 +504,7 @@ static void ahci_reg_init(AHCIState *s) static void check_cmd(AHCIState *s, int port) { AHCIPortRegs *pr = &s->dev[port].port_regs; - int slot; + uint8_t slot; if ((pr->cmd & PORT_CMD_START) && pr->cmd_issue) { for (slot = 0; (slot < 32) && pr->cmd_issue; slot++) { @@ -558,6 +579,7 @@ static void ahci_reset_port(AHCIState *s, int port) /* reset ncq queue */ for (i = 0; i < AHCI_MAX_CMDS; i++) { NCQTransferState *ncq_tfs = &s->dev[port].ncq_tfs[i]; + ncq_tfs->halt = false; if (!ncq_tfs->used) { continue; } @@ -642,14 +664,14 @@ static void ahci_unmap_clb_address(AHCIDevice *ad) ad->lst = NULL; } -static void ahci_write_fis_sdb(AHCIState *s, int port, uint32_t finished) +static void ahci_write_fis_sdb(AHCIState *s, NCQTransferState *ncq_tfs) { - AHCIDevice *ad = &s->dev[port]; + AHCIDevice *ad = ncq_tfs->drive; AHCIPortRegs *pr = &ad->port_regs; IDEState *ide_state; SDBFIS *sdb_fis; - if (!s->dev[port].res_fis || + if (!ad->res_fis || !(pr->cmd & PORT_CMD_FIS_RX)) { return; } @@ -659,53 +681,35 @@ static void ahci_write_fis_sdb(AHCIState *s, int port, uint32_t finished) sdb_fis->type = SATA_FIS_TYPE_SDB; /* Interrupt pending & Notification bit */ - sdb_fis->flags = (ad->hba->control_regs.irqstatus ? 
(1 << 6) : 0); + sdb_fis->flags = 0x40; /* Interrupt bit, always 1 for NCQ */ sdb_fis->status = ide_state->status & 0x77; sdb_fis->error = ide_state->error; /* update SAct field in SDB_FIS */ - s->dev[port].finished |= finished; sdb_fis->payload = cpu_to_le32(ad->finished); /* Update shadow registers (except BSY 0x80 and DRQ 0x08) */ pr->tfdata = (ad->port.ifs[0].error << 8) | (ad->port.ifs[0].status & 0x77) | (pr->tfdata & 0x88); + pr->scr_act &= ~ad->finished; + ad->finished = 0; - ahci_trigger_irq(s, ad, PORT_IRQ_SDB_FIS); + /* Trigger IRQ if interrupt bit is set (which currently, it always is) */ + if (sdb_fis->flags & 0x40) { + ahci_trigger_irq(s, ad, PORT_IRQ_SDB_FIS); + } } static void ahci_write_fis_pio(AHCIDevice *ad, uint16_t len) { AHCIPortRegs *pr = &ad->port_regs; - uint8_t *pio_fis, *cmd_fis; - uint64_t tbl_addr; - dma_addr_t cmd_len = 0x80; + uint8_t *pio_fis; IDEState *s = &ad->port.ifs[0]; if (!ad->res_fis || !(pr->cmd & PORT_CMD_FIS_RX)) { return; } - /* map cmd_fis */ - tbl_addr = le64_to_cpu(ad->cur_cmd->tbl_addr); - cmd_fis = dma_memory_map(ad->hba->as, tbl_addr, &cmd_len, - DMA_DIRECTION_TO_DEVICE); - - if (cmd_fis == NULL) { - DPRINTF(ad->port_no, "dma_memory_map failed in ahci_write_fis_pio"); - ahci_trigger_irq(ad->hba, ad, PORT_IRQ_HBUS_ERR); - return; - } - - if (cmd_len != 0x80) { - DPRINTF(ad->port_no, - "dma_memory_map mapped too few bytes in ahci_write_fis_pio"); - dma_memory_unmap(ad->hba->as, cmd_fis, cmd_len, - DMA_DIRECTION_TO_DEVICE, cmd_len); - ahci_trigger_irq(ad->hba, ad, PORT_IRQ_HBUS_ERR); - return; - } - pio_fis = &ad->res_fis[RES_FIS_PSFIS]; pio_fis[0] = SATA_FIS_TYPE_PIO_SETUP; @@ -721,8 +725,8 @@ static void ahci_write_fis_pio(AHCIDevice *ad, uint16_t len) pio_fis[9] = s->hob_lcyl; pio_fis[10] = s->hob_hcyl; pio_fis[11] = 0; - pio_fis[12] = cmd_fis[12]; - pio_fis[13] = cmd_fis[13]; + pio_fis[12] = s->nsector & 0xFF; + pio_fis[13] = (s->nsector >> 8) & 0xFF; pio_fis[14] = 0; pio_fis[15] = s->status; pio_fis[16] = len & 255; @@ -739,9 +743,6 @@ static void ahci_write_fis_pio(AHCIDevice *ad, uint16_t len) } ahci_trigger_irq(ad->hba, ad, PORT_IRQ_PIOS_FIS); - - dma_memory_unmap(ad->hba->as, cmd_fis, cmd_len, - DMA_DIRECTION_TO_DEVICE, cmd_len); } static void ahci_write_fis_d2h(AHCIDevice *ad, uint8_t *cmd_fis) @@ -749,22 +750,12 @@ static void ahci_write_fis_d2h(AHCIDevice *ad, uint8_t *cmd_fis) AHCIPortRegs *pr = &ad->port_regs; uint8_t *d2h_fis; int i; - dma_addr_t cmd_len = 0x80; - int cmd_mapped = 0; IDEState *s = &ad->port.ifs[0]; if (!ad->res_fis || !(pr->cmd & PORT_CMD_FIS_RX)) { return; } - if (!cmd_fis) { - /* map cmd_fis */ - uint64_t tbl_addr = le64_to_cpu(ad->cur_cmd->tbl_addr); - cmd_fis = dma_memory_map(ad->hba->as, tbl_addr, &cmd_len, - DMA_DIRECTION_TO_DEVICE); - cmd_mapped = 1; - } - d2h_fis = &ad->res_fis[RES_FIS_RFIS]; d2h_fis[0] = SATA_FIS_TYPE_REGISTER_D2H; @@ -780,8 +771,8 @@ static void ahci_write_fis_d2h(AHCIDevice *ad, uint8_t *cmd_fis) d2h_fis[9] = s->hob_lcyl; d2h_fis[10] = s->hob_hcyl; d2h_fis[11] = 0; - d2h_fis[12] = cmd_fis[12]; - d2h_fis[13] = cmd_fis[13]; + d2h_fis[12] = s->nsector & 0xFF; + d2h_fis[13] = (s->nsector >> 8) & 0xFF; for (i = 14; i < 20; i++) { d2h_fis[i] = 0; } @@ -795,26 +786,22 @@ static void ahci_write_fis_d2h(AHCIDevice *ad, uint8_t *cmd_fis) } ahci_trigger_irq(ad->hba, ad, PORT_IRQ_D2H_REG_FIS); - - if (cmd_mapped) { - dma_memory_unmap(ad->hba->as, cmd_fis, cmd_len, - DMA_DIRECTION_TO_DEVICE, cmd_len); - } } static int prdt_tbl_entry_size(const AHCI_SG *tbl) { + /* flags_size is zero-based */ 
return (le32_to_cpu(tbl->flags_size) & AHCI_PRDT_SIZE_MASK) + 1; } static int ahci_populate_sglist(AHCIDevice *ad, QEMUSGList *sglist, - int32_t offset) + AHCICmdHdr *cmd, int64_t limit, int32_t offset) { - AHCICmdHdr *cmd = ad->cur_cmd; - uint32_t opts = le32_to_cpu(cmd->opts); - uint64_t prdt_addr = le64_to_cpu(cmd->tbl_addr) + 0x80; - int sglist_alloc_hint = opts >> AHCI_CMD_HDR_PRDT_LEN; - dma_addr_t prdt_len = (sglist_alloc_hint * sizeof(AHCI_SG)); + uint16_t opts = le16_to_cpu(cmd->opts); + uint16_t prdtl = le16_to_cpu(cmd->prdtl); + uint64_t cfis_addr = le64_to_cpu(cmd->tbl_addr); + uint64_t prdt_addr = cfis_addr + 0x80; + dma_addr_t prdt_len = (prdtl * sizeof(AHCI_SG)); dma_addr_t real_prdt_len = prdt_len; uint8_t *prdt; int i; @@ -834,7 +821,7 @@ static int ahci_populate_sglist(AHCIDevice *ad, QEMUSGList *sglist, * request for sector sizes up to 32K. */ - if (!sglist_alloc_hint) { + if (!prdtl) { DPRINTF(ad->port_no, "no sg list given by guest: 0x%08x\n", opts); return -1; } @@ -853,13 +840,12 @@ static int ahci_populate_sglist(AHCIDevice *ad, QEMUSGList *sglist, } /* Get entries in the PRDT, init a qemu sglist accordingly */ - if (sglist_alloc_hint > 0) { + if (prdtl > 0) { AHCI_SG *tbl = (AHCI_SG *)prdt; sum = 0; - for (i = 0; i < sglist_alloc_hint; i++) { - /* flags_size is zero-based */ + for (i = 0; i < prdtl; i++) { tbl_entry_size = prdt_tbl_entry_size(&tbl[i]); - if (offset <= (sum + tbl_entry_size)) { + if (offset < (sum + tbl_entry_size)) { off_idx = i; off_pos = offset - sum; break; @@ -874,15 +860,16 @@ static int ahci_populate_sglist(AHCIDevice *ad, QEMUSGList *sglist, goto out; } - qemu_sglist_init(sglist, qbus->parent, (sglist_alloc_hint - off_idx), + qemu_sglist_init(sglist, qbus->parent, (prdtl - off_idx), ad->hba->as); qemu_sglist_add(sglist, le64_to_cpu(tbl[off_idx].addr) + off_pos, - prdt_tbl_entry_size(&tbl[off_idx]) - off_pos); + MIN(prdt_tbl_entry_size(&tbl[off_idx]) - off_pos, + limit)); - for (i = off_idx + 1; i < sglist_alloc_hint; i++) { - /* flags_size is zero-based */ + for (i = off_idx + 1; i < prdtl && sglist->size < limit; i++) { qemu_sglist_add(sglist, le64_to_cpu(tbl[i].addr), - prdt_tbl_entry_size(&tbl[i])); + MIN(prdt_tbl_entry_size(&tbl[i]), + limit - sglist->size)); if (sglist->size > INT32_MAX) { error_report("AHCI Physical Region Descriptor Table describes " "more than 2 GiB.\n"); @@ -899,28 +886,25 @@ out: return r; } -static void ncq_cb(void *opaque, int ret) +static void ncq_err(NCQTransferState *ncq_tfs) { - NCQTransferState *ncq_tfs = (NCQTransferState *)opaque; IDEState *ide_state = &ncq_tfs->drive->port.ifs[0]; - if (ret == -ECANCELED) { - return; - } - /* Clear bit for this tag in SActive */ - ncq_tfs->drive->port_regs.scr_act &= ~(1 << ncq_tfs->tag); + ide_state->error = ABRT_ERR; + ide_state->status = READY_STAT | ERR_STAT; + ncq_tfs->drive->port_regs.scr_err |= (1 << ncq_tfs->tag); +} - if (ret < 0) { - /* error */ - ide_state->error = ABRT_ERR; - ide_state->status = READY_STAT | ERR_STAT; - ncq_tfs->drive->port_regs.scr_err |= (1 << ncq_tfs->tag); - } else { - ide_state->status = READY_STAT | SEEK_STAT; +static void ncq_finish(NCQTransferState *ncq_tfs) +{ + /* If we didn't error out, set our finished bit. Errored commands + * do not get a bit set for the SDB FIS ACT register, nor do they + * clear the outstanding bit in scr_act (PxSACT). 
*/ + if (!(ncq_tfs->drive->port_regs.scr_err & (1 << ncq_tfs->tag))) { + ncq_tfs->drive->finished |= (1 << ncq_tfs->tag); } - ahci_write_fis_sdb(ncq_tfs->drive->hba, ncq_tfs->drive->port_no, - (1 << ncq_tfs->tag)); + ahci_write_fis_sdb(ncq_tfs->drive->hba, ncq_tfs); DPRINTF(ncq_tfs->drive->port_no, "NCQ transfer tag %d finished\n", ncq_tfs->tag); @@ -931,6 +915,35 @@ static void ncq_cb(void *opaque, int ret) ncq_tfs->used = 0; } +static void ncq_cb(void *opaque, int ret) +{ + NCQTransferState *ncq_tfs = (NCQTransferState *)opaque; + IDEState *ide_state = &ncq_tfs->drive->port.ifs[0]; + + if (ret == -ECANCELED) { + return; + } + + if (ret < 0) { + bool is_read = ncq_tfs->cmd == READ_FPDMA_QUEUED; + BlockErrorAction action = blk_get_error_action(ide_state->blk, + is_read, -ret); + if (action == BLOCK_ERROR_ACTION_STOP) { + ncq_tfs->halt = true; + ide_state->bus->error_status = IDE_RETRY_HBA; + } else if (action == BLOCK_ERROR_ACTION_REPORT) { + ncq_err(ncq_tfs); + } + blk_error_action(ide_state->blk, action, is_read, -ret); + } else { + ide_state->status = READY_STAT | SEEK_STAT; + } + + if (!ncq_tfs->halt) { + ncq_finish(ncq_tfs); + } +} + static int is_ncq(uint8_t ata_cmd) { /* Based on SATA 3.2 section 13.6.3.2 */ @@ -946,13 +959,60 @@ static int is_ncq(uint8_t ata_cmd) } } +static void execute_ncq_command(NCQTransferState *ncq_tfs) +{ + AHCIDevice *ad = ncq_tfs->drive; + IDEState *ide_state = &ad->port.ifs[0]; + int port = ad->port_no; + + g_assert(is_ncq(ncq_tfs->cmd)); + ncq_tfs->halt = false; + + switch (ncq_tfs->cmd) { + case READ_FPDMA_QUEUED: + DPRINTF(port, "NCQ reading %d sectors from LBA %"PRId64", tag %d\n", + ncq_tfs->sector_count, ncq_tfs->lba, ncq_tfs->tag); + + DPRINTF(port, "tag %d aio read %"PRId64"\n", + ncq_tfs->tag, ncq_tfs->lba); + + dma_acct_start(ide_state->blk, &ncq_tfs->acct, + &ncq_tfs->sglist, BLOCK_ACCT_READ); + ncq_tfs->aiocb = dma_blk_read(ide_state->blk, &ncq_tfs->sglist, + ncq_tfs->lba, ncq_cb, ncq_tfs); + break; + case WRITE_FPDMA_QUEUED: + DPRINTF(port, "NCQ writing %d sectors to LBA %"PRId64", tag %d\n", + ncq_tfs->sector_count, ncq_tfs->lba, ncq_tfs->tag); + + DPRINTF(port, "tag %d aio write %"PRId64"\n", + ncq_tfs->tag, ncq_tfs->lba); + + dma_acct_start(ide_state->blk, &ncq_tfs->acct, + &ncq_tfs->sglist, BLOCK_ACCT_WRITE); + ncq_tfs->aiocb = dma_blk_write(ide_state->blk, &ncq_tfs->sglist, + ncq_tfs->lba, ncq_cb, ncq_tfs); + break; + default: + DPRINTF(port, "error: unsupported NCQ command (0x%02x) received\n", + ncq_tfs->cmd); + qemu_sglist_destroy(&ncq_tfs->sglist); + ncq_err(ncq_tfs); + } +} + + static void process_ncq_command(AHCIState *s, int port, uint8_t *cmd_fis, - int slot) + uint8_t slot) { + AHCIDevice *ad = &s->dev[port]; + IDEState *ide_state = &ad->port.ifs[0]; NCQFrame *ncq_fis = (NCQFrame*)cmd_fis; uint8_t tag = ncq_fis->tag >> 3; - NCQTransferState *ncq_tfs = &s->dev[port].ncq_tfs[tag]; + NCQTransferState *ncq_tfs = &ad->ncq_tfs[tag]; + size_t size; + g_assert(is_ncq(ncq_fis->command)); if (ncq_tfs->used) { /* error - already in use */ fprintf(stderr, "%s: tag %d already used\n", __FUNCTION__, tag); @@ -960,75 +1020,82 @@ static void process_ncq_command(AHCIState *s, int port, uint8_t *cmd_fis, } ncq_tfs->used = 1; - ncq_tfs->drive = &s->dev[port]; + ncq_tfs->drive = ad; ncq_tfs->slot = slot; + ncq_tfs->cmdh = &((AHCICmdHdr *)ad->lst)[slot]; + ncq_tfs->cmd = ncq_fis->command; ncq_tfs->lba = ((uint64_t)ncq_fis->lba5 << 40) | ((uint64_t)ncq_fis->lba4 << 32) | ((uint64_t)ncq_fis->lba3 << 24) | ((uint64_t)ncq_fis->lba2 << 16) | 
((uint64_t)ncq_fis->lba1 << 8) | (uint64_t)ncq_fis->lba0; + ncq_tfs->tag = tag; - /* Note: We calculate the sector count, but don't currently rely on it. - * The total size of the DMA buffer tells us the transfer size instead. */ - ncq_tfs->sector_count = ((uint16_t)ncq_fis->sector_count_high << 8) | - ncq_fis->sector_count_low; + /* Sanity-check the NCQ packet */ + if (tag != slot) { + DPRINTF(port, "Warn: NCQ slot (%d) did not match the given tag (%d)\n", + slot, tag); + } - DPRINTF(port, "NCQ transfer LBA from %"PRId64" to %"PRId64", " - "drive max %"PRId64"\n", - ncq_tfs->lba, ncq_tfs->lba + ncq_tfs->sector_count - 2, - s->dev[port].port.ifs[0].nb_sectors - 1); + if (ncq_fis->aux0 || ncq_fis->aux1 || ncq_fis->aux2 || ncq_fis->aux3) { + DPRINTF(port, "Warn: Attempt to use NCQ auxiliary fields.\n"); + } + if (ncq_fis->prio || ncq_fis->icc) { + DPRINTF(port, "Warn: Unsupported attempt to use PRIO/ICC fields\n"); + } + if (ncq_fis->fua & NCQ_FIS_FUA_MASK) { + DPRINTF(port, "Warn: Unsupported attempt to use Force Unit Access\n"); + } + if (ncq_fis->tag & NCQ_FIS_RARC_MASK) { + DPRINTF(port, "Warn: Unsupported attempt to use Rebuild Assist\n"); + } - ahci_populate_sglist(&s->dev[port], &ncq_tfs->sglist, 0); - ncq_tfs->tag = tag; + ncq_tfs->sector_count = ((ncq_fis->sector_count_high << 8) | + ncq_fis->sector_count_low); + if (!ncq_tfs->sector_count) { + ncq_tfs->sector_count = 0x10000; + } + size = ncq_tfs->sector_count * 512; + ahci_populate_sglist(ad, &ncq_tfs->sglist, ncq_tfs->cmdh, size, 0); - switch(ncq_fis->command) { - case READ_FPDMA_QUEUED: - DPRINTF(port, "NCQ reading %d sectors from LBA %"PRId64", " - "tag %d\n", - ncq_tfs->sector_count-1, ncq_tfs->lba, ncq_tfs->tag); + if (ncq_tfs->sglist.size < size) { + error_report("ahci: PRDT length for NCQ command (0x%zx) " + "is smaller than the requested size (0x%zx)", + ncq_tfs->sglist.size, size); + qemu_sglist_destroy(&ncq_tfs->sglist); + ncq_err(ncq_tfs); + ahci_trigger_irq(ad->hba, ad, PORT_IRQ_OVERFLOW); + return; + } else if (ncq_tfs->sglist.size != size) { + DPRINTF(port, "Warn: PRDTL (0x%zx)" + " does not match requested size (0x%zx)", + ncq_tfs->sglist.size, size); + } - DPRINTF(port, "tag %d aio read %"PRId64"\n", - ncq_tfs->tag, ncq_tfs->lba); + DPRINTF(port, "NCQ transfer LBA from %"PRId64" to %"PRId64", " + "drive max %"PRId64"\n", + ncq_tfs->lba, ncq_tfs->lba + ncq_tfs->sector_count - 1, + ide_state->nb_sectors - 1); - dma_acct_start(ncq_tfs->drive->port.ifs[0].blk, &ncq_tfs->acct, - &ncq_tfs->sglist, BLOCK_ACCT_READ); - ncq_tfs->aiocb = dma_blk_read(ncq_tfs->drive->port.ifs[0].blk, - &ncq_tfs->sglist, ncq_tfs->lba, - ncq_cb, ncq_tfs); - break; - case WRITE_FPDMA_QUEUED: - DPRINTF(port, "NCQ writing %d sectors to LBA %"PRId64", tag %d\n", - ncq_tfs->sector_count-1, ncq_tfs->lba, ncq_tfs->tag); - - DPRINTF(port, "tag %d aio write %"PRId64"\n", - ncq_tfs->tag, ncq_tfs->lba); - - dma_acct_start(ncq_tfs->drive->port.ifs[0].blk, &ncq_tfs->acct, - &ncq_tfs->sglist, BLOCK_ACCT_WRITE); - ncq_tfs->aiocb = dma_blk_write(ncq_tfs->drive->port.ifs[0].blk, - &ncq_tfs->sglist, ncq_tfs->lba, - ncq_cb, ncq_tfs); - break; - default: - if (is_ncq(cmd_fis[2])) { - DPRINTF(port, - "error: unsupported NCQ command (0x%02x) received\n", - cmd_fis[2]); - } else { - DPRINTF(port, - "error: tried to process non-NCQ command as NCQ\n"); - } - qemu_sglist_destroy(&ncq_tfs->sglist); + execute_ncq_command(ncq_tfs); +} + +static AHCICmdHdr *get_cmd_header(AHCIState *s, uint8_t port, uint8_t slot) +{ + if (port >= s->ports || slot >= AHCI_MAX_CMDS) { + 
return NULL; } + + return s->dev[port].lst ? &((AHCICmdHdr *)s->dev[port].lst)[slot] : NULL; } static void handle_reg_h2d_fis(AHCIState *s, int port, - int slot, uint8_t *cmd_fis) + uint8_t slot, uint8_t *cmd_fis) { IDEState *ide_state = &s->dev[port].port.ifs[0]; - AHCICmdHdr *cmd = s->dev[port].cur_cmd; - uint32_t opts = le32_to_cpu(cmd->opts); + AHCICmdHdr *cmd = get_cmd_header(s, port, slot); + uint16_t opts = le16_to_cpu(cmd->opts); if (cmd_fis[1] & 0x0F) { DPRINTF(port, "Port Multiplier not supported." @@ -1108,7 +1175,7 @@ static void handle_reg_h2d_fis(AHCIState *s, int port, ide_exec_cmd(&s->dev[port].port, cmd_fis[2]); } -static int handle_cmd(AHCIState *s, int port, int slot) +static int handle_cmd(AHCIState *s, int port, uint8_t slot) { IDEState *ide_state; uint64_t tbl_addr; @@ -1126,7 +1193,7 @@ static int handle_cmd(AHCIState *s, int port, int slot) DPRINTF(port, "error: lst not given but cmd handled"); return -1; } - cmd = &((AHCICmdHdr *)s->dev[port].lst)[slot]; + cmd = get_cmd_header(s, port, slot); /* remember current slot handle for later */ s->dev[port].cur_cmd = cmd; @@ -1185,7 +1252,7 @@ static void ahci_start_transfer(IDEDMA *dma) IDEState *s = &ad->port.ifs[0]; uint32_t size = (uint32_t)(s->data_end - s->data_ptr); /* write == ram -> device */ - uint32_t opts = le32_to_cpu(ad->cur_cmd->opts); + uint16_t opts = le16_to_cpu(ad->cur_cmd->opts); int is_write = opts & AHCI_CMD_WRITE; int is_atapi = opts & AHCI_CMD_ATAPI; int has_sglist = 0; @@ -1197,7 +1264,7 @@ static void ahci_start_transfer(IDEDMA *dma) goto out; } - if (ahci_dma_prepare_buf(dma, is_write)) { + if (ahci_dma_prepare_buf(dma, size)) { has_sglist = 1; } @@ -1243,16 +1310,34 @@ static void ahci_restart_dma(IDEDMA *dma) } /** + * IDE/PIO restarts are handled by the core layer, but NCQ commands + * need an extra kick from the AHCI HBA. + */ +static void ahci_restart(IDEDMA *dma) +{ + AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma); + int i; + + for (i = 0; i < AHCI_MAX_CMDS; i++) { + NCQTransferState *ncq_tfs = &ad->ncq_tfs[i]; + if (ncq_tfs->halt) { + execute_ncq_command(ncq_tfs); + } + } +} + +/** * Called in DMA R/W chains to read the PRDT, utilizing ahci_populate_sglist. * Not currently invoked by PIO R/W chains, * which invoke ahci_populate_sglist via ahci_start_transfer. 
*/ -static int32_t ahci_dma_prepare_buf(IDEDMA *dma, int is_write) +static int32_t ahci_dma_prepare_buf(IDEDMA *dma, int32_t limit) { AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma); IDEState *s = &ad->port.ifs[0]; - if (ahci_populate_sglist(ad, &s->sg, s->io_buffer_offset) == -1) { + if (ahci_populate_sglist(ad, &s->sg, ad->cur_cmd, + limit, s->io_buffer_offset) == -1) { DPRINTF(ad->port_no, "ahci_dma_prepare_buf failed.\n"); return -1; } @@ -1287,7 +1372,7 @@ static int ahci_dma_rw_buf(IDEDMA *dma, int is_write) uint8_t *p = s->io_buffer + s->io_buffer_index; int l = s->io_buffer_size - s->io_buffer_index; - if (ahci_populate_sglist(ad, &s->sg, s->io_buffer_offset)) { + if (ahci_populate_sglist(ad, &s->sg, ad->cur_cmd, l, s->io_buffer_offset)) { return 0; } @@ -1330,6 +1415,7 @@ static void ahci_irq_set(void *opaque, int n, int level) static const IDEDMAOps ahci_dma_ops = { .start_dma = ahci_start_dma, + .restart = ahci_restart, .restart_dma = ahci_restart_dma, .start_transfer = ahci_start_transfer, .prepare_buf = ahci_dma_prepare_buf, @@ -1400,6 +1486,21 @@ void ahci_reset(AHCIState *s) } } +static const VMStateDescription vmstate_ncq_tfs = { + .name = "ncq state", + .version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT32(sector_count, NCQTransferState), + VMSTATE_UINT64(lba, NCQTransferState), + VMSTATE_UINT8(tag, NCQTransferState), + VMSTATE_UINT8(cmd, NCQTransferState), + VMSTATE_UINT8(slot, NCQTransferState), + VMSTATE_BOOL(used, NCQTransferState), + VMSTATE_BOOL(halt, NCQTransferState), + VMSTATE_END_OF_LIST() + }, +}; + static const VMStateDescription vmstate_ahci_device = { .name = "ahci port", .version_id = 1, @@ -1425,14 +1526,17 @@ static const VMStateDescription vmstate_ahci_device = { VMSTATE_BOOL(done_atapi_packet, AHCIDevice), VMSTATE_INT32(busy_slot, AHCIDevice), VMSTATE_BOOL(init_d2h_sent, AHCIDevice), + VMSTATE_STRUCT_ARRAY(ncq_tfs, AHCIDevice, AHCI_MAX_CMDS, + 1, vmstate_ncq_tfs, NCQTransferState), VMSTATE_END_OF_LIST() }, }; static int ahci_state_post_load(void *opaque, int version_id) { - int i; + int i, j; struct AHCIDevice *ad; + NCQTransferState *ncq_tfs; AHCIState *s = opaque; for (i = 0; i < s->ports; i++) { @@ -1444,6 +1548,37 @@ static int ahci_state_post_load(void *opaque, int version_id) return -1; } + for (j = 0; j < AHCI_MAX_CMDS; j++) { + ncq_tfs = &ad->ncq_tfs[j]; + ncq_tfs->drive = ad; + + if (ncq_tfs->used != ncq_tfs->halt) { + return -1; + } + if (!ncq_tfs->halt) { + continue; + } + if (!is_ncq(ncq_tfs->cmd)) { + return -1; + } + if (ncq_tfs->slot != ncq_tfs->tag) { + return -1; + } + /* If ncq_tfs->halt is justly set, the engine should be engaged, + * and the command list buffer should be mapped. */ + ncq_tfs->cmdh = get_cmd_header(s, i, ncq_tfs->slot); + if (!ncq_tfs->cmdh) { + return -1; + } + ahci_populate_sglist(ncq_tfs->drive, &ncq_tfs->sglist, + ncq_tfs->cmdh, ncq_tfs->sector_count * 512, + 0); + if (ncq_tfs->sector_count != ncq_tfs->sglist.size >> 9) { + return -1; + } + } + + /* * If an error is present, ad->busy_slot will be valid and not -1. 
* In this case, an operation is waiting to resume and will re-check @@ -1460,7 +1595,7 @@ static int ahci_state_post_load(void *opaque, int version_id) if (ad->busy_slot < 0 || ad->busy_slot >= AHCI_MAX_CMDS) { return -1; } - ad->cur_cmd = &((AHCICmdHdr *)ad->lst)[ad->busy_slot]; + ad->cur_cmd = get_cmd_header(s, i, ad->busy_slot); } } diff --git a/hw/ide/ahci.h b/hw/ide/ahci.h index 501c002c31..9f5b4d20b5 100644 --- a/hw/ide/ahci.h +++ b/hw/ide/ahci.h @@ -195,6 +195,9 @@ #define RECEIVE_FPDMA_QUEUED 0x65 #define SEND_FPDMA_QUEUED 0x64 +#define NCQ_FIS_FUA_MASK 0x80 +#define NCQ_FIS_RARC_MASK 0x01 + #define RES_FIS_DSFIS 0x00 #define RES_FIS_PSFIS 0x20 #define RES_FIS_RFIS 0x40 @@ -233,7 +236,8 @@ typedef struct AHCIPortRegs { } AHCIPortRegs; typedef struct AHCICmdHdr { - uint32_t opts; + uint16_t opts; + uint16_t prdtl; uint32_t status; uint64_t tbl_addr; uint32_t reserved[4]; @@ -250,13 +254,16 @@ typedef struct AHCIDevice AHCIDevice; typedef struct NCQTransferState { AHCIDevice *drive; BlockAIOCB *aiocb; + AHCICmdHdr *cmdh; QEMUSGList sglist; BlockAcctCookie acct; - uint16_t sector_count; + uint32_t sector_count; uint64_t lba; uint8_t tag; - int slot; - int used; + uint8_t cmd; + uint8_t slot; + bool used; + bool halt; } NCQTransferState; struct AHCIDevice { @@ -312,27 +319,39 @@ extern const VMStateDescription vmstate_ahci; .offset = vmstate_offset_value(_state, _field, AHCIState), \ } +/** + * NCQFrame is the same as a Register H2D FIS (described in SATA 3.2), + * but some fields have been re-mapped and re-purposed, as seen in + * SATA 3.2 section 13.6.4.1 ("READ FPDMA QUEUED") + * + * cmd_fis[3], feature 7:0, becomes sector count 7:0. + * cmd_fis[7], device 7:0, uses bit 7 as the Force Unit Access bit. + * cmd_fis[11], feature 15:8, becomes sector count 15:8. + * cmd_fis[12], count 7:0, becomes the NCQ TAG (7:3) and RARC bit (0) + * cmd_fis[13], count 15:8, becomes the priority value (7:6) + * bytes 16-19 become an le32 "auxiliary" field. + */ typedef struct NCQFrame { uint8_t fis_type; uint8_t c; uint8_t command; - uint8_t sector_count_low; + uint8_t sector_count_low; /* (feature 7:0) */ uint8_t lba0; uint8_t lba1; uint8_t lba2; - uint8_t fua; + uint8_t fua; /* (device 7:0) */ uint8_t lba3; uint8_t lba4; uint8_t lba5; - uint8_t sector_count_high; - uint8_t tag; - uint8_t reserved5; - uint8_t reserved6; + uint8_t sector_count_high; /* (feature 15:8) */ + uint8_t tag; /* (count 0:7) */ + uint8_t prio; /* (count 15:8) */ + uint8_t icc; uint8_t control; - uint8_t reserved7; - uint8_t reserved8; - uint8_t reserved9; - uint8_t reserved10; + uint8_t aux0; + uint8_t aux1; + uint8_t aux2; + uint8_t aux3; } QEMU_PACKED NCQFrame; typedef struct SDBFIS { diff --git a/hw/ide/core.c b/hw/ide/core.c index 1efd98af63..122e955084 100644 --- a/hw/ide/core.c +++ b/hw/ide/core.c @@ -716,8 +716,8 @@ static void ide_dma_cb(void *opaque, int ret) sector_num = ide_get_sector(s); if (n > 0) { - assert(s->io_buffer_size == s->sg.size); - dma_buf_commit(s, s->io_buffer_size); + assert(n * 512 == s->sg.size); + dma_buf_commit(s, s->sg.size); sector_num += n; ide_set_sector(s, sector_num); s->nsector -= n; @@ -734,7 +734,7 @@ static void ide_dma_cb(void *opaque, int ret) n = s->nsector; s->io_buffer_index = 0; s->io_buffer_size = n * 512; - if (s->bus->dma->ops->prepare_buf(s->bus->dma, ide_cmd_is_read(s)) < 512) { + if (s->bus->dma->ops->prepare_buf(s->bus->dma, s->io_buffer_size) < 512) { /* The PRDs were too short. Reset the Active bit, but don't raise an * interrupt. 
*/ s->status = READY_STAT | SEEK_STAT; @@ -2326,7 +2326,7 @@ static void ide_nop(IDEDMA *dma) { } -static int32_t ide_nop_int32(IDEDMA *dma, int x) +static int32_t ide_nop_int32(IDEDMA *dma, int32_t l) { return 0; } @@ -2371,6 +2371,13 @@ static void ide_restart_bh(void *opaque) * called function can set a new error status. */ bus->error_status = 0; + /* The HBA has generically asked to be kicked on retry */ + if (error_status & IDE_RETRY_HBA) { + if (s->bus->dma->ops->restart) { + s->bus->dma->ops->restart(s->bus->dma); + } + } + if (error_status & IDE_RETRY_DMA) { if (error_status & IDE_RETRY_TRIM) { ide_restart_dma(s, IDE_DMA_TRIM); diff --git a/hw/ide/internal.h b/hw/ide/internal.h index 965cc55cb8..30fdcbc5fa 100644 --- a/hw/ide/internal.h +++ b/hw/ide/internal.h @@ -324,7 +324,7 @@ typedef void EndTransferFunc(IDEState *); typedef void DMAStartFunc(IDEDMA *, IDEState *, BlockCompletionFunc *); typedef void DMAVoidFunc(IDEDMA *); typedef int DMAIntFunc(IDEDMA *, int); -typedef int32_t DMAInt32Func(IDEDMA *, int); +typedef int32_t DMAInt32Func(IDEDMA *, int32_t len); typedef void DMAu32Func(IDEDMA *, uint32_t); typedef void DMAStopFunc(IDEDMA *, bool); typedef void DMARestartFunc(void *, int, RunState); @@ -436,6 +436,7 @@ struct IDEDMAOps { DMAInt32Func *prepare_buf; DMAu32Func *commit_buf; DMAIntFunc *rw_buf; + DMAVoidFunc *restart; DMAVoidFunc *restart_dma; DMAStopFunc *set_inactive; DMAVoidFunc *cmd_done; @@ -499,6 +500,7 @@ struct IDEDevice { #define IDE_RETRY_READ 0x20 #define IDE_RETRY_FLUSH 0x40 #define IDE_RETRY_TRIM 0x80 +#define IDE_RETRY_HBA 0x100 static inline IDEState *idebus_active_if(IDEBus *bus) { diff --git a/hw/ide/macio.c b/hw/ide/macio.c index dd52d50732..a55a479da6 100644 --- a/hw/ide/macio.c +++ b/hw/ide/macio.c @@ -499,7 +499,7 @@ static int ide_nop_int(IDEDMA *dma, int x) return 0; } -static int32_t ide_nop_int32(IDEDMA *dma, int x) +static int32_t ide_nop_int32(IDEDMA *dma, int32_t l) { return 0; } diff --git a/hw/ide/pci.c b/hw/ide/pci.c index 4afd0cfe8c..d31ff885b7 100644 --- a/hw/ide/pci.c +++ b/hw/ide/pci.c @@ -53,10 +53,14 @@ static void bmdma_start_dma(IDEDMA *dma, IDEState *s, } /** - * Return the number of bytes successfully prepared. - * -1 on error. + * Prepare an sglist based on available PRDs. + * @limit: How many bytes to prepare total. + * + * Returns the number of bytes prepared, -1 on error. + * IDEState.io_buffer_size will contain the number of bytes described + * by the PRDs, whether or not we added them to the sglist. */ -static int32_t bmdma_prepare_buf(IDEDMA *dma, int is_write) +static int32_t bmdma_prepare_buf(IDEDMA *dma, int32_t limit) { BMDMAState *bm = DO_UPCAST(BMDMAState, dma, dma); IDEState *s = bmdma_active_if(bm); @@ -75,7 +79,7 @@ static int32_t bmdma_prepare_buf(IDEDMA *dma, int is_write) /* end of table (with a fail safe of one page) */ if (bm->cur_prd_last || (bm->cur_addr - bm->addr) >= BMDMA_PAGE_SIZE) { - return s->io_buffer_size; + return s->sg.size; } pci_dma_read(pci_dev, bm->cur_addr, &prd, 8); bm->cur_addr += 8; @@ -90,7 +94,14 @@ static int32_t bmdma_prepare_buf(IDEDMA *dma, int is_write) } l = bm->cur_prd_len; if (l > 0) { - qemu_sglist_add(&s->sg, bm->cur_prd_addr, l); + uint64_t sg_len; + + /* Don't add extra bytes to the SGList; consume any remaining + * PRDs from the guest, but ignore them. */ + sg_len = MIN(limit - s->sg.size, bm->cur_prd_len); + if (sg_len) { + qemu_sglist_add(&s->sg, bm->cur_prd_addr, sg_len); + } /* Note: We limit the max transfer to be 2GiB. 
* This should accommodate the largest ATA transaction
diff --git a/hw/input/virtio-input-hid.c b/hw/input/virtio-input-hid.c index 32cc94af0b..616a815ed4 100644 --- a/hw/input/virtio-input-hid.c +++ b/hw/input/virtio-input-hid.c @@ -252,7 +252,11 @@ static void virtio_input_handle_sync(DeviceState *dev) static void virtio_input_hid_realize(DeviceState *dev, Error **errp) { VirtIOInputHID *vhid = VIRTIO_INPUT_HID(dev); + vhid->hs = qemu_input_handler_register(dev, vhid->handler); + if (vhid->display && vhid->hs) { + qemu_input_handler_bind(vhid->hs, vhid->display, vhid->head, NULL); + } } static void virtio_input_hid_unrealize(DeviceState *dev, Error **errp) @@ -301,10 +305,18 @@ static void virtio_input_hid_handle_status(VirtIOInput *vinput, } } +static Property virtio_input_hid_properties[] = { + DEFINE_PROP_STRING("display", VirtIOInputHID, display), + DEFINE_PROP_UINT32("head", VirtIOInputHID, head, 0), + DEFINE_PROP_END_OF_LIST(), +}; + static void virtio_input_hid_class_init(ObjectClass *klass, void *data) { + DeviceClass *dc = DEVICE_CLASS(klass); VirtIOInputClass *vic = VIRTIO_INPUT_CLASS(klass); + dc->props = virtio_input_hid_properties; vic->realize = virtio_input_hid_realize; vic->unrealize = virtio_input_hid_unrealize; vic->change_active = virtio_input_hid_change_active;
diff --git a/hw/intc/arm_gic_common.c b/hw/intc/arm_gic_common.c index 044ad66730..a64d0714ea 100644 --- a/hw/intc/arm_gic_common.c +++ b/hw/intc/arm_gic_common.c @@ -123,7 +123,7 @@ static void arm_gic_common_realize(DeviceState *dev, Error **errp) static void arm_gic_common_reset(DeviceState *dev) { GICState *s = ARM_GIC_COMMON(dev); - int i; + int i, j; memset(s->irq_state, 0, GIC_MAXIRQ * sizeof(gic_irq_state)); for (i = 0 ; i < s->num_cpu; i++) { if (s->revision == REV_11MPCORE) { @@ -135,15 +135,30 @@ static void arm_gic_common_reset(DeviceState *dev) s->running_irq[i] = 1023; s->running_priority[i] = 0x100; s->cpu_ctlr[i] = 0; + s->bpr[i] = GIC_MIN_BPR; + s->abpr[i] = GIC_MIN_ABPR; + for (j = 0; j < GIC_INTERNAL; j++) { + s->priority1[j][i] = 0; + } + for (j = 0; j < GIC_NR_SGIS; j++) { + s->sgi_pending[j][i] = 0; + } } for (i = 0; i < GIC_NR_SGIS; i++) { GIC_SET_ENABLED(i, ALL_CPU_MASK); GIC_SET_EDGE_TRIGGER(i); } - if (s->num_cpu == 1) { + + for (i = 0; i < ARRAY_SIZE(s->priority2); i++) { + s->priority2[i] = 0; + } + + for (i = 0; i < GIC_MAXIRQ; i++) { /* For uniprocessor GICs all interrupts always target the sole CPU */ - for (i = 0; i < GIC_MAXIRQ; i++) { + if (s->num_cpu == 1) { s->irq_target[i] = 1; + } else { + s->irq_target[i] = 0; } } s->ctlr = 0;
diff --git a/hw/intc/arm_gic_kvm.c b/hw/intc/arm_gic_kvm.c index 2cb7d255d2..f56bff1afb 100644 --- a/hw/intc/arm_gic_kvm.c +++ b/hw/intc/arm_gic_kvm.c @@ -570,6 +570,12 @@ static void kvm_arm_gic_realize(DeviceState *dev, Error **errp) */ i += (GIC_INTERNAL * s->num_cpu); qdev_init_gpio_in(dev, kvm_arm_gic_set_irq, i); + + for (i = 0; i < s->num_irq - GIC_INTERNAL; i++) { + qemu_irq irq = qdev_get_gpio_in(dev, i); + kvm_irqchip_set_qemuirq_gsi(kvm_state, irq, i); + } + /* We never use our outbound IRQ/FIQ lines but provide them so that * we maintain the same interface as the non-KVM GIC.
*/ diff --git a/hw/isa/lpc_ich9.c b/hw/isa/lpc_ich9.c index b3e0b1fd52..bd655b8405 100644 --- a/hw/isa/lpc_ich9.c +++ b/hw/isa/lpc_ich9.c @@ -357,11 +357,13 @@ static void ich9_set_sci(void *opaque, int irq_num, int level) } } -void ich9_lpc_pm_init(PCIDevice *lpc_pci) +void ich9_lpc_pm_init(PCIDevice *lpc_pci, bool smm_enabled) { ICH9LPCState *lpc = ICH9_LPC_DEVICE(lpc_pci); + qemu_irq sci_irq; - ich9_pm_init(lpc_pci, &lpc->pm, qemu_allocate_irq(ich9_set_sci, lpc, 0)); + sci_irq = qemu_allocate_irq(ich9_set_sci, lpc, 0); + ich9_pm_init(lpc_pci, &lpc->pm, smm_enabled, sci_irq); ich9_lpc_reset(&lpc->d.qdev); } @@ -375,6 +377,9 @@ static void ich9_apm_ctrl_changed(uint32_t val, void *arg) acpi_pm1_cnt_update(&lpc->pm.acpi_regs, val == ICH9_APM_ACPI_ENABLE, val == ICH9_APM_ACPI_DISABLE); + if (val == ICH9_APM_ACPI_ENABLE || val == ICH9_APM_ACPI_DISABLE) { + return; + } /* SMI_EN = PMBASE + 30. SMI control and enable register */ if (lpc->pm.smi_en & ICH9_PMIO_SMI_EN_APMC_EN) { diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c index e70633d29f..bb04862de8 100644 --- a/hw/mem/pc-dimm.c +++ b/hw/mem/pc-dimm.c @@ -23,12 +23,96 @@ #include "qapi/visitor.h" #include "qemu/range.h" #include "sysemu/numa.h" +#include "sysemu/kvm.h" +#include "trace.h" typedef struct pc_dimms_capacity { uint64_t size; Error **errp; } pc_dimms_capacity; +void pc_dimm_memory_plug(DeviceState *dev, MemoryHotplugState *hpms, + MemoryRegion *mr, uint64_t align, Error **errp) +{ + int slot; + MachineState *machine = MACHINE(qdev_get_machine()); + PCDIMMDevice *dimm = PC_DIMM(dev); + Error *local_err = NULL; + uint64_t existing_dimms_capacity = 0; + uint64_t addr; + + addr = object_property_get_int(OBJECT(dimm), PC_DIMM_ADDR_PROP, &local_err); + if (local_err) { + goto out; + } + + addr = pc_dimm_get_free_addr(hpms->base, + memory_region_size(&hpms->mr), + !addr ? NULL : &addr, align, + memory_region_size(mr), &local_err); + if (local_err) { + goto out; + } + + existing_dimms_capacity = pc_existing_dimms_capacity(&local_err); + if (local_err) { + goto out; + } + + if (existing_dimms_capacity + memory_region_size(mr) > + machine->maxram_size - machine->ram_size) { + error_setg(&local_err, "not enough space, currently 0x%" PRIx64 + " in use of total hot pluggable 0x" RAM_ADDR_FMT, + existing_dimms_capacity, + machine->maxram_size - machine->ram_size); + goto out; + } + + object_property_set_int(OBJECT(dev), addr, PC_DIMM_ADDR_PROP, &local_err); + if (local_err) { + goto out; + } + trace_mhp_pc_dimm_assigned_address(addr); + + slot = object_property_get_int(OBJECT(dev), PC_DIMM_SLOT_PROP, &local_err); + if (local_err) { + goto out; + } + + slot = pc_dimm_get_free_slot(slot == PC_DIMM_UNASSIGNED_SLOT ? 
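With the ich9_lpc_pm_init() signature change above, the board decides SMM availability once and threads it into the LPC power-management init. A hedged sketch of the call site (variable names are assumptions; pc_machine_is_smm_enabled() is the PC-machine helper declared further down in this series):

    ich9_lpc_pm_init(lpc_pci_dev, pc_machine_is_smm_enabled(pcms));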
NULL : &slot, + machine->ram_slots, &local_err); + if (local_err) { + goto out; + } + object_property_set_int(OBJECT(dev), slot, PC_DIMM_SLOT_PROP, &local_err); + if (local_err) { + goto out; + } + trace_mhp_pc_dimm_assigned_slot(slot); + + if (kvm_enabled() && !kvm_has_free_slot(machine)) { + error_setg(&local_err, "hypervisor has no free memory slots left"); + goto out; + } + + memory_region_add_subregion(&hpms->mr, addr - hpms->base, mr); + vmstate_register_ram(mr, dev); + numa_set_mem_node_id(addr, memory_region_size(mr), dimm->node); + +out: + error_propagate(errp, local_err); +} + +void pc_dimm_memory_unplug(DeviceState *dev, MemoryHotplugState *hpms, + MemoryRegion *mr) +{ + PCDIMMDevice *dimm = PC_DIMM(dev); + + numa_unset_mem_node_id(dimm->addr, memory_region_size(mr), dimm->node); + memory_region_del_subregion(&hpms->mr, mr); + vmstate_unregister_ram(mr, dev); +} + static int pc_existing_dimms_capacity_internal(Object *obj, void *opaque) { pc_dimms_capacity *cap = opaque; diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c index d631337e11..e345a6e9b8 100644 --- a/hw/s390x/virtio-ccw.c +++ b/hw/s390x/virtio-ccw.c @@ -1316,8 +1316,8 @@ static int virtio_ccw_add_irqfd(VirtioCcwDevice *dev, int n) VirtQueue *vq = virtio_get_queue(vdev, n); EventNotifier *notifier = virtio_queue_get_guest_notifier(vq); - return kvm_irqchip_add_irqfd_notifier(kvm_state, notifier, NULL, - dev->routes.gsi[n]); + return kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, notifier, NULL, + dev->routes.gsi[n]); } static void virtio_ccw_remove_irqfd(VirtioCcwDevice *dev, int n) @@ -1327,8 +1327,8 @@ static void virtio_ccw_remove_irqfd(VirtioCcwDevice *dev, int n) EventNotifier *notifier = virtio_queue_get_guest_notifier(vq); int ret; - ret = kvm_irqchip_remove_irqfd_notifier(kvm_state, notifier, - dev->routes.gsi[n]); + ret = kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, notifier, + dev->routes.gsi[n]); assert(ret == 0); } diff --git a/hw/timer/arm_mptimer.c b/hw/timer/arm_mptimer.c index 8b93b3c1ae..3e59c2a288 100644 --- a/hw/timer/arm_mptimer.c +++ b/hw/timer/arm_mptimer.c @@ -38,7 +38,7 @@ static inline int get_current_cpu(ARMMPTimerState *s) static inline void timerblock_update_irq(TimerBlock *tb) { - qemu_set_irq(tb->irq, tb->status); + qemu_set_irq(tb->irq, tb->status && (tb->control & 4)); } /* Return conversion factor from mpcore timer ticks to qemu timer ticks. */ @@ -122,11 +122,18 @@ static void timerblock_write(void *opaque, hwaddr addr, case 8: /* Control. */ old = tb->control; tb->control = value; - if (((old & 1) == 0) && (value & 1)) { - if (tb->count == 0 && (tb->control & 2)) { + if (value & 1) { + if ((old & 1) && (tb->count != 0)) { + /* Do nothing if timer is ticking right now. */ + break; + } + if (tb->control & 2) { tb->count = tb->load; } timerblock_reload(tb, 1); + } else if (old & 1) { + /* Shutdown the timer. */ + timer_del(tb->timer); } break; case 12: /* Interrupt status. */ diff --git a/hw/timer/cadence_ttc.c b/hw/timer/cadence_ttc.c index d46db3c0e2..35bc88033e 100644 --- a/hw/timer/cadence_ttc.c +++ b/hw/timer/cadence_ttc.c @@ -208,15 +208,14 @@ static void cadence_timer_sync(CadenceTimerState *s) s->reg_intr |= (2 << i); } } + if ((x < 0) || (x >= interval)) { + s->reg_intr |= (s->reg_count & COUNTER_CTRL_INT) ? + COUNTER_INTR_IV : COUNTER_INTR_OV; + } while (x < 0) { x += interval; } s->reg_value = (uint32_t)(x % interval); - - if (s->reg_value != x) { - s->reg_intr |= (s->reg_count & COUNTER_CTRL_INT) ? 
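pc_dimm_memory_plug() above centralizes address and slot assignment, capacity checking, mapping into the hotplug container, and NUMA bookkeeping. A hedged sketch of a machine hotplug handler built on it (alignment choice and error handling simplified; the function and variable names are assumptions):

    static void sketch_memory_plug(HotplugHandler *hotplug_dev,
                                   DeviceState *dev, Error **errp)
    {
        PCMachineState *pcms = PC_MACHINE(hotplug_dev);
        PCDIMMDevice *dimm = PC_DIMM(dev);
        PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm);
        MemoryRegion *mr = ddc->get_memory_region(dimm);

        /* picks address + slot, checks capacity, maps into hpms->mr */
        pc_dimm_memory_plug(dev, &pcms->hotplug_memory, mr,
                            TARGET_PAGE_SIZE, errp);
    }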
- COUNTER_INTR_IV : COUNTER_INTR_OV; - } cadence_timer_update(s); } diff --git a/hw/vfio/common.c b/hw/vfio/common.c index b1045da857..85ee9b005e 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -772,11 +772,19 @@ static void vfio_disconnect_container(VFIOGroup *group) if (QLIST_EMPTY(&container->group_list)) { VFIOAddressSpace *space = container->space; + VFIOGuestIOMMU *giommu, *tmp; if (container->iommu_data.release) { container->iommu_data.release(container); } QLIST_REMOVE(container, next); + + QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) { + memory_region_unregister_iommu_notifier(&giommu->n); + QLIST_REMOVE(giommu, giommu_next); + g_free(giommu); + } + trace_vfio_disconnect_container(container->fd); close(container->fd); g_free(container); diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index e0e339a534..2ed877fe9f 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -597,7 +597,7 @@ static void vfio_add_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage *msg, return; } - if (kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->kvm_interrupt, + if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt, NULL, virq) < 0) { kvm_irqchip_release_virq(kvm_state, virq); event_notifier_cleanup(&vector->kvm_interrupt); @@ -609,8 +609,8 @@ static void vfio_add_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage *msg, static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector) { - kvm_irqchip_remove_irqfd_notifier(kvm_state, &vector->kvm_interrupt, - vector->virq); + kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt, + vector->virq); kvm_irqchip_release_virq(kvm_state, vector->virq); vector->virq = -1; event_notifier_cleanup(&vector->kvm_interrupt); @@ -939,7 +939,7 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev) }; uint64_t size; off_t off = 0; - size_t bytes; + ssize_t bytes; if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_REGION_INFO, ®_info)) { error_report("vfio: Error getting ROM info: %m"); @@ -2252,6 +2252,33 @@ static int vfio_early_setup_msix(VFIOPCIDevice *vdev) vdev->msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK; vdev->msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1; + /* + * Test the size of the pba_offset variable and catch if it extends outside + * of the specified BAR. If it is the case, we need to apply a hardware + * specific quirk if the device is known or we have a broken configuration. + */ + if (vdev->msix->pba_offset >= + vdev->bars[vdev->msix->pba_bar].region.size) { + + PCIDevice *pdev = &vdev->pdev; + uint16_t vendor = pci_get_word(pdev->config + PCI_VENDOR_ID); + uint16_t device = pci_get_word(pdev->config + PCI_DEVICE_ID); + + /* + * Chelsio T5 Virtual Function devices are encoded as 0x58xx for T5 + * adapters. The T5 hardware returns an incorrect value of 0x8000 for + * the VF PBA offset while the BAR itself is only 8k. The correct value + * is 0x1000, so we hard code that here. + */ + if (vendor == PCI_VENDOR_ID_CHELSIO && (device & 0xff00) == 0x5800) { + vdev->msix->pba_offset = 0x1000; + } else { + error_report("vfio: Hardware reports invalid configuration, " + "MSIX PBA outside of specified BAR"); + return -EINVAL; + } + } + trace_vfio_early_setup_msix(vdev->vbasedev.name, pos, vdev->msix->table_bar, vdev->msix->table_offset, @@ -2388,7 +2415,7 @@ static void vfio_map_bar(VFIOPCIDevice *vdev, int nr) * potentially insert a direct-mapped subregion before and after it. 
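The size_t to ssize_t change in vfio_pci_load_rom() above fixes a real class of bug: pread() returns -1 on failure, and storing that in an unsigned variable makes any "bytes < 0" check unreachable (the -1 becomes SIZE_MAX). Illustrative fragment (fd, buf, off and the sizes are assumptions):

    ssize_t bytes = pread(fd, buf + off, size - off, rom_offset + off);
    if (bytes < 0) {
        /* reachable again: report the error instead of spinning */
    }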
*/ if (vdev->msix && vdev->msix->table_bar == nr) { - size = vdev->msix->table_offset & qemu_host_page_mask; + size = vdev->msix->table_offset & qemu_real_host_page_mask; } strncat(name, " mmap", sizeof(name) - strlen(name) - 1); @@ -2401,8 +2428,9 @@ static void vfio_map_bar(VFIOPCIDevice *vdev, int nr) if (vdev->msix && vdev->msix->table_bar == nr) { uint64_t start; - start = HOST_PAGE_ALIGN((uint64_t)vdev->msix->table_offset + - (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE)); + start = REAL_HOST_PAGE_ALIGN((uint64_t)vdev->msix->table_offset + + (vdev->msix->entries * + PCI_MSIX_ENTRY_SIZE)); size = start < bar->region.size ? bar->region.size - start : 0; strncat(name, " msix-hi", sizeof(name) - strlen(name) - 1); diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c index 5c678b914e..60365d1279 100644 --- a/hw/vfio/platform.c +++ b/hw/vfio/platform.c @@ -26,6 +26,7 @@ #include "hw/sysbus.h" #include "trace.h" #include "hw/platform-bus.h" +#include "sysemu/kvm.h" /* * Functions used whatever the injection method @@ -51,6 +52,7 @@ static VFIOINTp *vfio_init_intp(VFIODevice *vbasedev, intp->pin = info.index; intp->flags = info.flags; intp->state = VFIO_IRQ_INACTIVE; + intp->kvm_accel = false; sysbus_init_irq(sbdev, &intp->qemuirq); @@ -61,6 +63,13 @@ static VFIOINTp *vfio_init_intp(VFIODevice *vbasedev, error_report("vfio: Error: trigger event_notifier_init failed "); return NULL; } + /* Get an eventfd for resample/unmask */ + ret = event_notifier_init(&intp->unmask, 0); + if (ret) { + g_free(intp); + error_report("vfio: Error: resamplefd event_notifier_init failed"); + return NULL; + } QLIST_INSERT_HEAD(&vdev->intp_list, intp, next); return intp; @@ -315,6 +324,94 @@ static int vfio_start_eventfd_injection(VFIOINTp *intp) return ret; } +/* + * Functions used for irqfd + */ + +/** + * vfio_set_resample_eventfd - sets the resamplefd for an IRQ + * @intp: the IRQ struct handle + * programs the VFIO driver to unmask this IRQ when the + * intp->unmask eventfd is triggered + */ +static int vfio_set_resample_eventfd(VFIOINTp *intp) +{ + VFIODevice *vbasedev = &intp->vdev->vbasedev; + struct vfio_irq_set *irq_set; + int argsz, ret; + int32_t *pfd; + + argsz = sizeof(*irq_set) + sizeof(*pfd); + irq_set = g_malloc0(argsz); + irq_set->argsz = argsz; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK; + irq_set->index = intp->pin; + irq_set->start = 0; + irq_set->count = 1; + pfd = (int32_t *)&irq_set->data; + *pfd = event_notifier_get_fd(&intp->unmask); + qemu_set_fd_handler(*pfd, NULL, NULL, NULL); + ret = ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set); + g_free(irq_set); + if (ret < 0) { + error_report("vfio: Failed to set resample eventfd: %m"); + } + return ret; +} + +static void vfio_start_irqfd_injection(SysBusDevice *sbdev, qemu_irq irq) +{ + VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(sbdev); + VFIOINTp *intp; + + if (!kvm_irqfds_enabled() || !kvm_resamplefds_enabled() || + !vdev->irqfd_allowed) { + return; + } + + QLIST_FOREACH(intp, &vdev->intp_list, next) { + if (intp->qemuirq == irq) { + break; + } + } + assert(intp); + + /* Get to a known interrupt state */ + qemu_set_fd_handler(event_notifier_get_fd(&intp->interrupt), + NULL, NULL, vdev); + + vfio_mask_single_irqindex(&vdev->vbasedev, intp->pin); + qemu_set_irq(intp->qemuirq, 0); + + if (kvm_irqchip_add_irqfd_notifier(kvm_state, &intp->interrupt, + &intp->unmask, irq) < 0) { + goto fail_irqfd; + } + + if (vfio_set_trigger_eventfd(intp, NULL) < 0) { + goto fail_vfio; + } + if (vfio_set_resample_eventfd(intp) 
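vfio_set_trigger_eventfd(), called from the irqfd setup below, is the trigger-side twin of vfio_set_resample_eventfd() above. A hedged fragment of what it plausibly programs, reusing the variables of the function above (the exact implementation is not part of this excerpt): same ioctl plumbing, but with ACTION_TRIGGER and the interrupt eventfd, so the kernel signals interrupts straight into the irqfd.

    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
    irq_set->index = intp->pin;
    pfd = (int32_t *)&irq_set->data;
    *pfd = event_notifier_get_fd(&intp->interrupt);   /* not ->unmask */
    ret = ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set);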
< 0) { + goto fail_vfio; + } + + /* Let's resume injection with irqfd setup */ + vfio_unmask_single_irqindex(&vdev->vbasedev, intp->pin); + + intp->kvm_accel = true; + + trace_vfio_platform_start_irqfd_injection(intp->pin, + event_notifier_get_fd(&intp->interrupt), + event_notifier_get_fd(&intp->unmask)); + return; +fail_vfio: + kvm_irqchip_remove_irqfd_notifier(kvm_state, &intp->interrupt, irq); +fail_irqfd: + vfio_start_eventfd_injection(intp); + vfio_unmask_single_irqindex(&vdev->vbasedev, intp->pin); + return; +} + /* VFIO skeleton */ static void vfio_platform_compute_needs_reset(VFIODevice *vbasedev) @@ -584,17 +681,20 @@ static Property vfio_platform_dev_properties[] = { DEFINE_PROP_BOOL("x-mmap", VFIOPlatformDevice, vbasedev.allow_mmap, true), DEFINE_PROP_UINT32("mmap-timeout-ms", VFIOPlatformDevice, mmap_timeout, 1100), + DEFINE_PROP_BOOL("x-irqfd", VFIOPlatformDevice, irqfd_allowed, true), DEFINE_PROP_END_OF_LIST(), }; static void vfio_platform_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); + SysBusDeviceClass *sbc = SYS_BUS_DEVICE_CLASS(klass); dc->realize = vfio_platform_realize; dc->props = vfio_platform_dev_properties; dc->vmsd = &vfio_platform_vmstate; dc->desc = "VFIO-based platform device assignment"; + sbc->connect_irq_notifier = vfio_start_irqfd_injection; set_bit(DEVICE_CATEGORY_MISC, dc->categories); } diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index 6a0174e9cc..7a89081e4f 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -495,7 +495,7 @@ static int kvm_virtio_pci_irqfd_use(VirtIOPCIProxy *proxy, VirtQueue *vq = virtio_get_queue(vdev, queue_no); EventNotifier *n = virtio_queue_get_guest_notifier(vq); int ret; - ret = kvm_irqchip_add_irqfd_notifier(kvm_state, n, NULL, irqfd->virq); + ret = kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, n, NULL, irqfd->virq); return ret; } @@ -509,7 +509,7 @@ static void kvm_virtio_pci_irqfd_release(VirtIOPCIProxy *proxy, VirtIOIRQFD *irqfd = &proxy->vector_irqfd[vector]; int ret; - ret = kvm_irqchip_remove_irqfd_notifier(kvm_state, n, irqfd->virq); + ret = kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, n, irqfd->virq); assert(ret == 0); } diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h index ac06c6721c..ea6a9a667c 100644 --- a/include/exec/cpu-all.h +++ b/include/exec/cpu-all.h @@ -26,6 +26,12 @@ #include "qom/cpu.h" #include "qemu/rcu.h" +#define EXCP_INTERRUPT 0x10000 /* async interruption */ +#define EXCP_HLT 0x10001 /* hlt instruction reached */ +#define EXCP_DEBUG 0x10002 /* cpu stopped after a breakpoint or singlestep */ +#define EXCP_HALTED 0x10003 /* cpu is halted (waiting for external event) */ +#define EXCP_YIELD 0x10004 /* cpu wants to yield timeslice to another */ + /* some important defines: * * WORDS_ALIGNED : if defined, the host cpu can only make word aligned @@ -177,10 +183,13 @@ extern unsigned long reserved_va; /* ??? These should be the larger of uintptr_t and target_ulong. 
*/ extern uintptr_t qemu_real_host_page_size; +extern uintptr_t qemu_real_host_page_mask; extern uintptr_t qemu_host_page_size; extern uintptr_t qemu_host_page_mask; #define HOST_PAGE_ALIGN(addr) (((addr) + qemu_host_page_size - 1) & qemu_host_page_mask) +#define REAL_HOST_PAGE_ALIGN(addr) (((addr) + qemu_real_host_page_size - 1) & \ + qemu_real_host_page_mask) /* same as PROT_xxx */ #define PAGE_READ 0x0001 diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h index de8a7200a9..9fb1d541d4 100644 --- a/include/exec/cpu-common.h +++ b/include/exec/cpu-common.h @@ -13,6 +13,8 @@ #include "qemu/bswap.h" #include "qemu/queue.h" +#include "qemu/fprintf-fn.h" +#include "qemu/typedefs.h" /** * CPUListState: diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h index d5aecaf49e..98b9cff310 100644 --- a/include/exec/cpu-defs.h +++ b/include/exec/cpu-defs.h @@ -56,20 +56,6 @@ typedef uint64_t target_ulong; #error TARGET_LONG_SIZE undefined #endif -#define EXCP_INTERRUPT 0x10000 /* async interruption */ -#define EXCP_HLT 0x10001 /* hlt instruction reached */ -#define EXCP_DEBUG 0x10002 /* cpu stopped after a breakpoint or singlestep */ -#define EXCP_HALTED 0x10003 /* cpu is halted (waiting for external event) */ -#define EXCP_YIELD 0x10004 /* cpu wants to yield timeslice to another */ - -/* Only the bottom TB_JMP_PAGE_BITS of the jump cache hash bits vary for - addresses on the same page. The top bits are the same. This allows - TLB invalidation to quickly clear a subset of the hash table. */ -#define TB_JMP_PAGE_BITS (TB_JMP_CACHE_BITS / 2) -#define TB_JMP_PAGE_SIZE (1 << TB_JMP_PAGE_BITS) -#define TB_JMP_ADDR_MASK (TB_JMP_PAGE_SIZE - 1) -#define TB_JMP_PAGE_MASK (TB_JMP_CACHE_SIZE - TB_JMP_PAGE_SIZE) - #if !defined(CONFIG_USER_ONLY) /* use a fully associative victim tlb of 8 entries */ #define CPU_VTLB_SIZE 8 @@ -161,7 +147,6 @@ typedef struct CPUIOTLBEntry { #endif -#define CPU_TEMP_BUF_NLONGS 128 #define CPU_COMMON \ /* soft mmu support */ \ CPU_COMMON_TLB \ diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h index 2573e8c36e..d678114cb2 100644 --- a/include/exec/exec-all.h +++ b/include/exec/exec-all.h @@ -195,26 +195,6 @@ struct TBContext { int tb_invalidated_flag; }; -static inline unsigned int tb_jmp_cache_hash_page(target_ulong pc) -{ - target_ulong tmp; - tmp = pc ^ (pc >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS)); - return (tmp >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS)) & TB_JMP_PAGE_MASK; -} - -static inline unsigned int tb_jmp_cache_hash_func(target_ulong pc) -{ - target_ulong tmp; - tmp = pc ^ (pc >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS)); - return (((tmp >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS)) & TB_JMP_PAGE_MASK) - | (tmp & TB_JMP_ADDR_MASK)); -} - -static inline unsigned int tb_phys_hash_func(tb_page_addr_t pc) -{ - return (pc >> 2) & (CODE_GEN_PHYS_HASH_SIZE - 1); -} - void tb_free(TranslationBlock *tb); void tb_flush(CPUArchState *env); void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr); diff --git a/include/exec/memory.h b/include/exec/memory.h index 8ae004eb06..139471500f 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -23,7 +23,6 @@ #include <stdint.h> #include <stdbool.h> -#include "qemu-common.h" #include "exec/cpu-common.h" #ifndef CONFIG_USER_ONLY #include "exec/hwaddr.h" @@ -180,6 +179,7 @@ struct MemoryRegion { bool rom_device; bool warning_printed; /* For reservations */ bool flush_coalesced_mmio; + bool global_locking; MemoryRegion *alias; hwaddr alias_offset; int32_t priority; @@ -825,6 +825,31 @@ void 
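REAL_HOST_PAGE_ALIGN rounds up using the kernel's actual page size, which matters when qemu_host_page_size has been inflated above it; the MSI-X mmap sizing in vfio above is the first user. Worked example, assuming a 4 KiB real host page:

    assert(REAL_HOST_PAGE_ALIGN(0x1001) == 0x2000);  /* rounds up */
    assert(REAL_HOST_PAGE_ALIGN(0x2000) == 0x2000);  /* already aligned */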
memory_region_set_flush_coalesced(MemoryRegion *mr); void memory_region_clear_flush_coalesced(MemoryRegion *mr); /** + * memory_region_set_global_locking: Declares the access processing requires + * QEMU's global lock. + * + * When this is invoked, accesses to the memory region will be processed while + * holding the global lock of QEMU. This is the default behavior of memory + * regions. + * + * @mr: the memory region to be updated. + */ +void memory_region_set_global_locking(MemoryRegion *mr); + +/** + * memory_region_clear_global_locking: Declares that access processing does + * not depend on the QEMU global lock. + * + * By clearing this property, accesses to the memory region will be processed + * outside of QEMU's global lock (unless the lock is held on when issuing the + * access request). In this case, the device model implementing the access + * handlers is responsible for synchronization of concurrency. + * + * @mr: the memory region to be updated. + */ +void memory_region_clear_global_locking(MemoryRegion *mr); + +/** * memory_region_add_eventfd: Request an eventfd to be triggered when a word * is written to a location. * diff --git a/include/exec/tb-hash.h b/include/exec/tb-hash.h new file mode 100644 index 0000000000..0f4e8a08af --- /dev/null +++ b/include/exec/tb-hash.h @@ -0,0 +1,51 @@ +/* + * internal execution defines for qemu + * + * Copyright (c) 2003 Fabrice Bellard + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef EXEC_TB_HASH +#define EXEC_TB_HASH + +/* Only the bottom TB_JMP_PAGE_BITS of the jump cache hash bits vary for + addresses on the same page. The top bits are the same. This allows + TLB invalidation to quickly clear a subset of the hash table. 
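A hedged sketch of how a device model opts its MMIO out of the global lock with the new API (the region name, ops table and size are assumptions; the read/write callbacks must then synchronize on their own):

    memory_region_init_io(&s->mmio, OBJECT(s), &my_mmio_ops, s,
                          "my-dev-mmio", 0x1000);
    memory_region_clear_global_locking(&s->mmio);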
*/ +#define TB_JMP_PAGE_BITS (TB_JMP_CACHE_BITS / 2) +#define TB_JMP_PAGE_SIZE (1 << TB_JMP_PAGE_BITS) +#define TB_JMP_ADDR_MASK (TB_JMP_PAGE_SIZE - 1) +#define TB_JMP_PAGE_MASK (TB_JMP_CACHE_SIZE - TB_JMP_PAGE_SIZE) + +static inline unsigned int tb_jmp_cache_hash_page(target_ulong pc) +{ + target_ulong tmp; + tmp = pc ^ (pc >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS)); + return (tmp >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS)) & TB_JMP_PAGE_MASK; +} + +static inline unsigned int tb_jmp_cache_hash_func(target_ulong pc) +{ + target_ulong tmp; + tmp = pc ^ (pc >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS)); + return (((tmp >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS)) & TB_JMP_PAGE_MASK) + | (tmp & TB_JMP_ADDR_MASK)); +} + +static inline unsigned int tb_phys_hash_func(tb_page_addr_t pc) +{ + return (pc >> 2) & (CODE_GEN_PHYS_HASH_SIZE - 1); +} + +#endif diff --git a/include/hw/acpi/ich9.h b/include/hw/acpi/ich9.h index 77cc65cbc2..ac24bbe9a3 100644 --- a/include/hw/acpi/ich9.h +++ b/include/hw/acpi/ich9.h @@ -54,10 +54,11 @@ typedef struct ICH9LPCPMRegs { uint8_t disable_s3; uint8_t disable_s4; uint8_t s4_val; + uint8_t smm_enabled; } ICH9LPCPMRegs; void ich9_pm_init(PCIDevice *lpc_pci, ICH9LPCPMRegs *pm, - qemu_irq sci_irq); + bool smm_enabled, qemu_irq sci_irq); void ich9_pm_iospace_update(ICH9LPCPMRegs *pm, uint32_t pm_io_base); extern const VMStateDescription vmstate_ich9_pm; diff --git a/include/hw/arm/arm.h b/include/hw/arm/arm.h index 760804cc46..4dcd4f9b63 100644 --- a/include/hw/arm/arm.h +++ b/include/hw/arm/arm.h @@ -14,6 +14,7 @@ #include "exec/memory.h" #include "hw/irq.h" #include "qemu/notify.h" +#include "cpu.h" /* armv7m.c */ qemu_irq *armv7m_init(MemoryRegion *system_memory, int mem_size, int num_irq, diff --git a/include/hw/i386/ich9.h b/include/hw/i386/ich9.h index a2cc15c915..b317a481c8 100644 --- a/include/hw/i386/ich9.h +++ b/include/hw/i386/ich9.h @@ -17,7 +17,7 @@ void ich9_lpc_set_irq(void *opaque, int irq_num, int level); int ich9_lpc_map_irq(PCIDevice *pci_dev, int intx); PCIINTxRoute ich9_route_intx_pin_to_irq(void *opaque, int pirq_pin); -void ich9_lpc_pm_init(PCIDevice *pci_lpc); +void ich9_lpc_pm_init(PCIDevice *pci_lpc, bool smm_enabled); I2CBus *ich9_smb_init(PCIBus *bus, int devfn, uint32_t smb_io_base); #define ICH9_CC_SIZE (16 * 1024) /* 16KB */ diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index 86c565147c..786a1d511c 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -15,14 +15,12 @@ #include "hw/pci/pci.h" #include "hw/boards.h" #include "hw/compat.h" +#include "hw/mem/pc-dimm.h" #define HPET_INTCAP "hpet-intcap" /** * PCMachineState: - * @hotplug_memory_base: address in guest RAM address space where hotplug memory - * address space begins. 
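Worked numbers for the hashes above, assuming TB_JMP_CACHE_BITS == 12 and TARGET_PAGE_BITS == 12: TB_JMP_PAGE_BITS is 6, so the 4096-entry jump cache is 64 stripes of 64 entries, and every PC on a given guest page hashes into the same stripe, which is what lets per-page invalidation clear just that stripe:

    target_ulong pc = 0x40001000;                  /* arbitrary example */
    unsigned h1 = tb_jmp_cache_hash_func(pc);
    unsigned h2 = tb_jmp_cache_hash_func(pc + 4);  /* same guest page */
    /* stripe (page) bits agree; only the low TB_JMP_PAGE_BITS differ */
    assert((h1 & ~TB_JMP_ADDR_MASK) == (h2 & ~TB_JMP_ADDR_MASK));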
- * @hotplug_memory: hotplug memory addess space container * @acpi_dev: link to ACPI PM device that performs ACPI hotplug handling * @enforce_aligned_dimm: check that DIMM's address/size is aligned by * backend's alignment value if provided @@ -32,14 +30,14 @@ struct PCMachineState { MachineState parent_obj; /* <public> */ - ram_addr_t hotplug_memory_base; - MemoryRegion hotplug_memory; + MemoryHotplugState hotplug_memory; HotplugHandler *acpi_dev; ISADevice *rtc; uint64_t max_ram_below_4g; OnOffAuto vmport; + OnOffAuto smm; bool enforce_aligned_dimm; }; @@ -47,6 +45,7 @@ struct PCMachineState { #define PC_MACHINE_MEMHP_REGION_SIZE "hotplug-memory-region-size" #define PC_MACHINE_MAX_RAM_BELOW_4G "max-ram-below-4g" #define PC_MACHINE_VMPORT "vmport" +#define PC_MACHINE_SMM "smm" #define PC_MACHINE_ENFORCE_ALIGNED_DIMM "enforce-aligned-dimm" /** @@ -158,6 +157,7 @@ void i8042_setup_a20_line(ISADevice *dev, qemu_irq *a20_out); /* pc.c */ extern int fd_bootchk; +bool pc_machine_is_smm_enabled(PCMachineState *pcms); void pc_register_ferr_irq(qemu_irq irq); void pc_acpi_smi_interrupt(void *opaque, int irq, int level); @@ -217,7 +217,7 @@ void ioapic_init_gsi(GSIState *gsi_state, const char *parent_name); I2CBus *piix4_pm_init(PCIBus *bus, int devfn, uint32_t smb_io_base, qemu_irq sci_irq, qemu_irq smi_irq, - int kvm_enabled, DeviceState **piix4_pm); + int smm_enabled, DeviceState **piix4_pm); void piix4_smbus_register_device(SMBusDevice *dev, uint8_t addr); /* hpet.c */ diff --git a/include/hw/mem/pc-dimm.h b/include/hw/mem/pc-dimm.h index f7b80b44b7..d83bf30ea9 100644 --- a/include/hw/mem/pc-dimm.h +++ b/include/hw/mem/pc-dimm.h @@ -70,6 +70,17 @@ typedef struct PCDIMMDeviceClass { MemoryRegion *(*get_memory_region)(PCDIMMDevice *dimm); } PCDIMMDeviceClass; +/** + * MemoryHotplugState: + * @base: address in guest RAM address space where hotplug memory + * address space begins. + * @mr: hotplug memory address space container + */ +typedef struct MemoryHotplugState { + ram_addr_t base; + MemoryRegion mr; +} MemoryHotplugState; + uint64_t pc_dimm_get_free_addr(uint64_t address_space_start, uint64_t address_space_size, uint64_t *hint, uint64_t align, uint64_t size, @@ -79,4 +90,8 @@ int pc_dimm_get_free_slot(const int *hint, int max_slots, Error **errp); int qmp_pc_dimm_device_list(Object *obj, void *opaque); uint64_t pc_existing_dimms_capacity(Error **errp); +void pc_dimm_memory_plug(DeviceState *dev, MemoryHotplugState *hpms, + MemoryRegion *mr, uint64_t align, Error **errp); +void pc_dimm_memory_unplug(DeviceState *dev, MemoryHotplugState *hpms, + MemoryRegion *mr); #endif diff --git a/include/hw/pci/pci_ids.h b/include/hw/pci/pci_ids.h index 49c062b8ce..d98e6c915d 100644 --- a/include/hw/pci/pci_ids.h +++ b/include/hw/pci/pci_ids.h @@ -114,6 +114,8 @@ #define PCI_VENDOR_ID_ENSONIQ 0x1274 #define PCI_DEVICE_ID_ENSONIQ_ES1370 0x5000 +#define PCI_VENDOR_ID_CHELSIO 0x1425 + #define PCI_VENDOR_ID_FREESCALE 0x1957 #define PCI_DEVICE_ID_MPC8533E 0x0030 diff --git a/include/hw/sysbus.h b/include/hw/sysbus.h index 34f93c39bf..cc1dba49bf 100644 --- a/include/hw/sysbus.h +++ b/include/hw/sysbus.h @@ -58,6 +58,7 @@ typedef struct SysBusDeviceClass { * omitted then. (This is not considered a fatal error.) 
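Assuming the new "smm" machine property is settable from the command line like other machine QOM properties, SMM emulation can be forced off for guests that trip over it:

    qemu-system-x86_64 -machine q35,smm=off ...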
*/ char *(*explicit_ofw_unit_address)(const SysBusDevice *dev); + void (*connect_irq_notifier)(SysBusDevice *dev, qemu_irq irq); } SysBusDeviceClass; struct SysBusDevice { diff --git a/include/hw/vfio/vfio-platform.h b/include/hw/vfio/vfio-platform.h index 26b2ad6f4e..c5cf1d79f3 100644 --- a/include/hw/vfio/vfio-platform.h +++ b/include/hw/vfio/vfio-platform.h @@ -41,6 +41,7 @@ typedef struct VFIOINTp { int state; /* inactive, pending, active */ uint8_t pin; /* index */ uint32_t flags; /* IRQ info flags */ + bool kvm_accel; /* set when QEMU bypass through KVM enabled */ } VFIOINTp; /* function type for user side eventfd handler */ @@ -57,6 +58,7 @@ typedef struct VFIOPlatformDevice { uint32_t mmap_timeout; /* delay to re-enable mmaps after interrupt */ QEMUTimer *mmap_timer; /* allows fast-path resume after IRQ hit */ QemuMutex intp_mutex; /* protect the intp_list IRQ state */ + bool irqfd_allowed; /* debug option to force irqfd on/off */ } VFIOPlatformDevice; typedef struct VFIOPlatformDeviceClass { diff --git a/include/hw/virtio/virtio-input.h b/include/hw/virtio/virtio-input.h index fd5417d1a3..af1c207ab1 100644 --- a/include/hw/virtio/virtio-input.h +++ b/include/hw/virtio/virtio-input.h @@ -95,6 +95,8 @@ struct VirtIOInputClass { struct VirtIOInputHID { VirtIOInput parent_obj; + char *display; + uint32_t head; QemuInputHandler *handler; QemuInputHandlerState *hs; int ledstate; diff --git a/include/qemu-common.h b/include/qemu-common.h index d52d09cfb8..237d6547b3 100644 --- a/include/qemu-common.h +++ b/include/qemu-common.h @@ -15,6 +15,7 @@ #include "qemu/compiler.h" #include "config-host.h" #include "qemu/typedefs.h" +#include "qemu/fprintf-fn.h" #if defined(__arm__) || defined(__sparc__) || defined(__mips__) || defined(__hppa__) || defined(__ia64__) #define WORDS_ALIGNED @@ -85,9 +86,6 @@ # error Unknown pointer size #endif -typedef int (*fprintf_function)(FILE *f, const char *fmt, ...) - GCC_FMT_ATTR(2, 3); - #ifdef _WIN32 #define fsync _commit #if !defined(lseek) @@ -455,6 +453,7 @@ void qemu_hexdump(const char *buf, FILE *fp, const char *prefix, size_t size); #define VECTYPE __vector unsigned char #define SPLAT(p) vec_splat(vec_ld(0, p), 0) #define ALL_EQ(v1, v2) vec_all_eq(v1, v2) +#define VEC_OR(v1, v2) ((v1) | (v2)) /* altivec.h may redefine the bool macro as vector type. * Reset it to POSIX semantics. */ #define bool _Bool @@ -463,10 +462,12 @@ void qemu_hexdump(const char *buf, FILE *fp, const char *prefix, size_t size); #define VECTYPE __m128i #define SPLAT(p) _mm_set1_epi8(*(p)) #define ALL_EQ(v1, v2) (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) == 0xFFFF) +#define VEC_OR(v1, v2) (_mm_or_si128(v1, v2)) #else #define VECTYPE unsigned long #define SPLAT(p) (*(p) * (~0UL / 255)) #define ALL_EQ(v1, v2) ((v1) == (v2)) +#define VEC_OR(v1, v2) ((v1) | (v2)) #endif #define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8 diff --git a/include/qemu/fprintf-fn.h b/include/qemu/fprintf-fn.h new file mode 100644 index 0000000000..9ddc90f1c5 --- /dev/null +++ b/include/qemu/fprintf-fn.h @@ -0,0 +1,17 @@ +/* + * Typedef for fprintf-alike function pointers. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#ifndef QEMU_FPRINTF_FN_H +#define QEMU_FPRINTF_FN_H 1 + +#include "qemu/compiler.h" +#include <stdio.h> + +typedef int (*fprintf_function)(FILE *f, const char *fmt, ...) 
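connect_irq_notifier fires when the board actually wires a sysbus IRQ, which is the earliest point a KVM irqfd can be attached to it; the vfio-platform hunk above is the in-tree user. Hedged sketch of another device class opting in (all names hypothetical):

    static void my_connect_irq_notifier(SysBusDevice *sbdev, qemu_irq irq)
    {
        /* e.g. resolve 'irq' to a GSI and arm an irqfd for it */
    }

    static void my_class_init(ObjectClass *klass, void *data)
    {
        SysBusDeviceClass *sbc = SYS_BUS_DEVICE_CLASS(klass);

        sbc->connect_irq_notifier = my_connect_irq_notifier;
    }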
+ GCC_FMT_ATTR(2, 3); + +#endif diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h index 0f4a0fd4b2..bc18ca30e4 100644 --- a/include/qemu/main-loop.h +++ b/include/qemu/main-loop.h @@ -223,6 +223,16 @@ int qemu_add_child_watch(pid_t pid); #endif /** + * qemu_mutex_iothread_locked: Return lock status of the main loop mutex. + * + * The main loop mutex is the coarsest lock in QEMU, and as such it + * must always be taken outside other locks. This function helps + * functions take different paths depending on whether the current + * thread is running within the main loop mutex. + */ +bool qemu_mutex_iothread_locked(void); + +/** * qemu_mutex_lock_iothread: Lock the main loop mutex. * * This function locks the main loop mutex. The mutex is taken by diff --git a/include/standard-headers/linux/input.h b/include/standard-headers/linux/input.h index b94d365f28..a459dd25da 100644 --- a/include/standard-headers/linux/input.h +++ b/include/standard-headers/linux/input.h @@ -367,7 +367,8 @@ struct input_keymap_entry { #define KEY_MSDOS 151 #define KEY_COFFEE 152 /* AL Terminal Lock/Screensaver */ #define KEY_SCREENLOCK KEY_COFFEE -#define KEY_DIRECTION 153 +#define KEY_ROTATE_DISPLAY 153 /* Display orientation for e.g. tablets */ +#define KEY_DIRECTION KEY_ROTATE_DISPLAY #define KEY_CYCLEWINDOWS 154 #define KEY_MAIL 155 #define KEY_BOOKMARKS 156 /* AC Bookmarks */ @@ -700,6 +701,10 @@ struct input_keymap_entry { #define KEY_NUMERIC_9 0x209 #define KEY_NUMERIC_STAR 0x20a #define KEY_NUMERIC_POUND 0x20b +#define KEY_NUMERIC_A 0x20c /* Phone key A - HUT Telephony 0xb9 */ +#define KEY_NUMERIC_B 0x20d +#define KEY_NUMERIC_C 0x20e +#define KEY_NUMERIC_D 0x20f #define KEY_CAMERA_FOCUS 0x210 #define KEY_WPS_BUTTON 0x211 /* WiFi Protected Setup key */ @@ -971,7 +976,8 @@ struct input_keymap_entry { */ #define MT_TOOL_FINGER 0 #define MT_TOOL_PEN 1 -#define MT_TOOL_MAX 1 +#define MT_TOOL_PALM 2 +#define MT_TOOL_MAX 2 /* * Values describing the status of a force-feedback effect diff --git a/include/standard-headers/linux/virtio_balloon.h b/include/standard-headers/linux/virtio_balloon.h index 88ada1d048..2e2a6dcf3a 100644 --- a/include/standard-headers/linux/virtio_balloon.h +++ b/include/standard-headers/linux/virtio_balloon.h @@ -26,6 +26,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
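A sketch of the intended pattern for the new predicate: conditional lock acquisition, so one code path works whether or not the caller already holds the big lock:

    bool locked = qemu_mutex_iothread_locked();

    if (!locked) {
        qemu_mutex_lock_iothread();
    }
    /* ... touch BQL-protected state ... */
    if (!locked) {
        qemu_mutex_unlock_iothread();
    }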
*/ #include "standard-headers/linux/types.h" +#include "standard-headers/linux/virtio_types.h" #include "standard-headers/linux/virtio_ids.h" #include "standard-headers/linux/virtio_config.h" diff --git a/include/standard-headers/linux/virtio_gpu.h b/include/standard-headers/linux/virtio_gpu.h index cfcfb463fc..72ef815f51 100644 --- a/include/standard-headers/linux/virtio_gpu.h +++ b/include/standard-headers/linux/virtio_gpu.h @@ -38,6 +38,8 @@ #ifndef VIRTIO_GPU_HW_H #define VIRTIO_GPU_HW_H +#include "standard-headers/linux/types.h" + enum virtio_gpu_ctrl_type { VIRTIO_GPU_UNDEFINED = 0, diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index f459fbdbd4..983e99e1e7 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -19,6 +19,7 @@ #include "qemu/queue.h" #include "qom/cpu.h" #include "exec/memattrs.h" +#include "hw/irq.h" #ifdef CONFIG_KVM #include <linux/kvm.h> @@ -151,6 +152,7 @@ extern bool kvm_readonly_mem_allowed; #define kvm_halt_in_kernel() (false) #define kvm_eventfds_enabled() (false) #define kvm_irqfds_enabled() (false) +#define kvm_resamplefds_enabled() (false) #define kvm_msi_via_irqfd_enabled() (false) #define kvm_gsi_routing_allowed() (false) #define kvm_gsi_direct_mapping() (false) @@ -416,9 +418,15 @@ void kvm_irqchip_release_virq(KVMState *s, int virq); int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter); +int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, + EventNotifier *rn, int virq); +int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, + int virq); int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n, - EventNotifier *rn, int virq); -int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, int virq); + EventNotifier *rn, qemu_irq irq); +int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, + qemu_irq irq); +void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi); void kvm_pc_gsi_handler(void *opaque, int n, int level); void kvm_pc_setup_irq_routing(bool pci_enabled); void kvm_init_irq_routing(KVMState *s); diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h new file mode 100644 index 0000000000..888557a1ca --- /dev/null +++ b/include/sysemu/kvm_int.h @@ -0,0 +1,39 @@ +/* + * Internal definitions for a target's KVM support + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ * + */ + +#ifndef QEMU_KVM_INT_H +#define QEMU_KVM_INT_H + +#include "sysemu/sysemu.h" +#include "sysemu/accel.h" +#include "sysemu/kvm.h" + +typedef struct KVMSlot +{ + hwaddr start_addr; + ram_addr_t memory_size; + void *ram; + int slot; + int flags; +} KVMSlot; + +typedef struct KVMMemoryListener { + MemoryListener listener; + KVMSlot *slots; + int as_id; +} KVMMemoryListener; + +#define TYPE_KVM_ACCEL ACCEL_CLASS_NAME("kvm") + +#define KVM_STATE(obj) \ + OBJECT_CHECK(KVMState, (obj), TYPE_KVM_ACCEL) + +void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml, + AddressSpace *as, int as_id); + +#endif diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h index 6523b4d7f9..a6392bc50f 100644 --- a/include/sysemu/numa.h +++ b/include/sysemu/numa.h @@ -10,16 +10,27 @@ extern int nb_numa_nodes; /* Number of NUMA nodes */ +struct numa_addr_range { + ram_addr_t mem_start; + ram_addr_t mem_end; + QLIST_ENTRY(numa_addr_range) entry; +}; + typedef struct node_info { uint64_t node_mem; DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS); struct HostMemoryBackend *node_memdev; bool present; + QLIST_HEAD(, numa_addr_range) addr; /* List to store address ranges */ } NodeInfo; + extern NodeInfo numa_info[MAX_NODES]; void parse_numa_opts(MachineClass *mc); void numa_post_machine_init(void); void query_numa_node_mem(uint64_t node_mem[]); extern QemuOptsList qemu_numa_opts; +void numa_set_mem_node_id(ram_addr_t addr, uint64_t size, uint32_t node); +void numa_unset_mem_node_id(ram_addr_t addr, uint64_t size, uint32_t node); +uint32_t numa_get_node(ram_addr_t addr, Error **errp); #endif @@ -24,19 +24,18 @@ #include "qemu/atomic.h" #include "qemu/option.h" #include "qemu/config-file.h" -#include "sysemu/sysemu.h" -#include "sysemu/accel.h" #include "hw/hw.h" #include "hw/pci/msi.h" #include "hw/s390x/adapter.h" #include "exec/gdbstub.h" -#include "sysemu/kvm.h" +#include "sysemu/kvm_int.h" #include "qemu/bswap.h" #include "exec/memory.h" #include "exec/ram_addr.h" #include "exec/address-spaces.h" #include "qemu/event_notifier.h" #include "trace.h" +#include "hw/irq.h" #include "hw/boards.h" @@ -60,22 +59,10 @@ #define KVM_MSI_HASHTAB_SIZE 256 -typedef struct KVMSlot -{ - hwaddr start_addr; - ram_addr_t memory_size; - void *ram; - int slot; - int flags; -} KVMSlot; - -typedef struct kvm_dirty_log KVMDirtyLog; - struct KVMState { AccelState parent_obj; - KVMSlot *slots; int nr_slots; int fd; int vmfd; @@ -98,6 +85,7 @@ struct KVMState * unsigned, and treating them as signed here can break things */ unsigned irq_set_ioctl; unsigned int sigmask_len; + GHashTable *gsimap; #ifdef KVM_CAP_IRQ_ROUTING struct kvm_irq_routing *irq_routes; int nr_allocated_irq_routes; @@ -106,13 +94,9 @@ struct KVMState QTAILQ_HEAD(msi_hashtab, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE]; bool direct_msi; #endif + KVMMemoryListener memory_listener; }; -#define TYPE_KVM_ACCEL ACCEL_CLASS_NAME("kvm") - -#define KVM_STATE(obj) \ - OBJECT_CHECK(KVMState, (obj), TYPE_KVM_ACCEL) - KVMState *kvm_state; bool kvm_kernel_irqchip; bool kvm_async_interrupts_allowed; @@ -133,13 +117,14 @@ static const KVMCapabilityInfo kvm_required_capabilites[] = { KVM_CAP_LAST_INFO }; -static KVMSlot *kvm_get_free_slot(KVMState *s) +static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml) { + KVMState *s = kvm_state; int i; for (i = 0; i < s->nr_slots; i++) { - if (s->slots[i].memory_size == 0) { - return &s->slots[i]; + if (kml->slots[i].memory_size == 0) { + return &kml->slots[i]; } } @@ -148,12 +133,14 @@ static KVMSlot 
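KVMMemoryListener exists so each address space gets its own slot array tagged with an as_id. A hedged sketch of registering a listener for a second address space (the as_id of 1 and the SMM address space are illustrative assumptions):

    KVMMemoryListener *kml = g_new0(KVMMemoryListener, 1);

    kvm_memory_listener_register(kvm_state, kml, &smm_address_space, 1);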
*kvm_get_free_slot(KVMState *s) bool kvm_has_free_slot(MachineState *ms) { - return kvm_get_free_slot(KVM_STATE(ms->accelerator)); + KVMState *s = KVM_STATE(ms->accelerator); + + return kvm_get_free_slot(&s->memory_listener); } -static KVMSlot *kvm_alloc_slot(KVMState *s) +static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml) { - KVMSlot *slot = kvm_get_free_slot(s); + KVMSlot *slot = kvm_get_free_slot(kml); if (slot) { return slot; @@ -163,14 +150,15 @@ static KVMSlot *kvm_alloc_slot(KVMState *s) abort(); } -static KVMSlot *kvm_lookup_matching_slot(KVMState *s, +static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml, hwaddr start_addr, hwaddr end_addr) { + KVMState *s = kvm_state; int i; for (i = 0; i < s->nr_slots; i++) { - KVMSlot *mem = &s->slots[i]; + KVMSlot *mem = &kml->slots[i]; if (start_addr == mem->start_addr && end_addr == mem->start_addr + mem->memory_size) { @@ -184,15 +172,16 @@ static KVMSlot *kvm_lookup_matching_slot(KVMState *s, /* * Find overlapping slot with lowest start address */ -static KVMSlot *kvm_lookup_overlapping_slot(KVMState *s, +static KVMSlot *kvm_lookup_overlapping_slot(KVMMemoryListener *kml, hwaddr start_addr, hwaddr end_addr) { + KVMState *s = kvm_state; KVMSlot *found = NULL; int i; for (i = 0; i < s->nr_slots; i++) { - KVMSlot *mem = &s->slots[i]; + KVMSlot *mem = &kml->slots[i]; if (mem->memory_size == 0 || (found && found->start_addr < mem->start_addr)) { @@ -211,10 +200,11 @@ static KVMSlot *kvm_lookup_overlapping_slot(KVMState *s, int kvm_physical_memory_addr_from_host(KVMState *s, void *ram, hwaddr *phys_addr) { + KVMMemoryListener *kml = &s->memory_listener; int i; for (i = 0; i < s->nr_slots; i++) { - KVMSlot *mem = &s->slots[i]; + KVMSlot *mem = &kml->slots[i]; if (ram >= mem->ram && ram < mem->ram + mem->memory_size) { *phys_addr = mem->start_addr + (ram - mem->ram); @@ -225,11 +215,12 @@ int kvm_physical_memory_addr_from_host(KVMState *s, void *ram, return 0; } -static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot) +static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot) { + KVMState *s = kvm_state; struct kvm_userspace_memory_region mem; - mem.slot = slot->slot; + mem.slot = slot->slot | (kml->as_id << 16); mem.guest_phys_addr = slot->start_addr; mem.userspace_addr = (unsigned long)slot->ram; mem.flags = slot->flags; @@ -291,45 +282,47 @@ err: * dirty pages logging control */ -static int kvm_mem_flags(KVMState *s, bool log_dirty, bool readonly) +static int kvm_mem_flags(MemoryRegion *mr) { + bool readonly = mr->readonly || memory_region_is_romd(mr); int flags = 0; - flags = log_dirty ? 
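Worked example of the slot-id encoding introduced above: the kernel memslot id now carries the address-space index in its high 16 bits, so slot 5 of address space 1 reaches the kernel as:

    uint32_t kernel_slot = 5 | (1 << 16);   /* == 0x00010005 */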
KVM_MEM_LOG_DIRTY_PAGES : 0; + + if (memory_region_get_dirty_log_mask(mr) != 0) { + flags |= KVM_MEM_LOG_DIRTY_PAGES; + } if (readonly && kvm_readonly_mem_allowed) { flags |= KVM_MEM_READONLY; } return flags; } -static int kvm_slot_dirty_pages_log_change(KVMSlot *mem, bool log_dirty) +static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem, + MemoryRegion *mr) { - KVMState *s = kvm_state; - int flags, mask = KVM_MEM_LOG_DIRTY_PAGES; int old_flags; old_flags = mem->flags; - - flags = (mem->flags & ~mask) | kvm_mem_flags(s, log_dirty, false); - mem->flags = flags; + mem->flags = kvm_mem_flags(mr); /* If nothing changed effectively, no need to issue ioctl */ - if (flags == old_flags) { + if (mem->flags == old_flags) { return 0; } - return kvm_set_user_memory_region(s, mem); + return kvm_set_user_memory_region(kml, mem); } -static int kvm_dirty_pages_log_change(hwaddr phys_addr, - ram_addr_t size, bool log_dirty) +static int kvm_section_update_flags(KVMMemoryListener *kml, + MemoryRegionSection *section) { - KVMState *s = kvm_state; - KVMSlot *mem = kvm_lookup_matching_slot(s, phys_addr, phys_addr + size); + hwaddr phys_addr = section->offset_within_address_space; + ram_addr_t size = int128_get64(section->size); + KVMSlot *mem = kvm_lookup_matching_slot(kml, phys_addr, phys_addr + size); if (mem == NULL) { return 0; } else { - return kvm_slot_dirty_pages_log_change(mem, log_dirty); + return kvm_slot_update_flags(kml, mem, section->mr); } } @@ -337,14 +330,14 @@ static void kvm_log_start(MemoryListener *listener, MemoryRegionSection *section, int old, int new) { + KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); int r; if (old != 0) { return; } - r = kvm_dirty_pages_log_change(section->offset_within_address_space, - int128_get64(section->size), true); + r = kvm_section_update_flags(kml, section); if (r < 0) { abort(); } @@ -354,14 +347,14 @@ static void kvm_log_stop(MemoryListener *listener, MemoryRegionSection *section, int old, int new) { + KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); int r; if (new != 0) { return; } - r = kvm_dirty_pages_log_change(section->offset_within_address_space, - int128_get64(section->size), false); + r = kvm_section_update_flags(kml, section); if (r < 0) { abort(); } @@ -389,11 +382,12 @@ static int kvm_get_dirty_pages_log_range(MemoryRegionSection *section, * @start_add: start of logged region. * @end_addr: end of logged region. 
*/ -static int kvm_physical_sync_dirty_bitmap(MemoryRegionSection *section) +static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml, + MemoryRegionSection *section) { KVMState *s = kvm_state; unsigned long size, allocated_size = 0; - KVMDirtyLog d = {}; + struct kvm_dirty_log d = {}; KVMSlot *mem; int ret = 0; hwaddr start_addr = section->offset_within_address_space; @@ -401,7 +395,7 @@ static int kvm_physical_sync_dirty_bitmap(MemoryRegionSection *section) d.dirty_bitmap = NULL; while (start_addr < end_addr) { - mem = kvm_lookup_overlapping_slot(s, start_addr, end_addr); + mem = kvm_lookup_overlapping_slot(kml, start_addr, end_addr); if (mem == NULL) { break; } @@ -428,8 +422,7 @@ static int kvm_physical_sync_dirty_bitmap(MemoryRegionSection *section) allocated_size = size; memset(d.dirty_bitmap, 0, allocated_size); - d.slot = mem->slot; - + d.slot = mem->slot | (kml->as_id << 16); if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) { DPRINTF("ioctl failed %d\n", errno); ret = -1; @@ -632,15 +625,14 @@ kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list) return NULL; } -static void kvm_set_phys_mem(MemoryRegionSection *section, bool add) +static void kvm_set_phys_mem(KVMMemoryListener *kml, + MemoryRegionSection *section, bool add) { KVMState *s = kvm_state; KVMSlot *mem, old; int err; MemoryRegion *mr = section->mr; - bool log_dirty = memory_region_get_dirty_log_mask(mr) != 0; bool writeable = !mr->readonly && !mr->rom_device; - bool readonly_flag = mr->readonly || memory_region_is_romd(mr); hwaddr start_addr = section->offset_within_address_space; ram_addr_t size = int128_get64(section->size); void *ram = NULL; @@ -674,7 +666,7 @@ static void kvm_set_phys_mem(MemoryRegionSection *section, bool add) ram = memory_region_get_ram_ptr(mr) + section->offset_within_region + delta; while (1) { - mem = kvm_lookup_overlapping_slot(s, start_addr, start_addr + size); + mem = kvm_lookup_overlapping_slot(kml, start_addr, start_addr + size); if (!mem) { break; } @@ -684,19 +676,19 @@ static void kvm_set_phys_mem(MemoryRegionSection *section, bool add) (ram - start_addr == mem->ram - mem->start_addr)) { /* The new slot fits into the existing one and comes with * identical parameters - update flags and done. */ - kvm_slot_dirty_pages_log_change(mem, log_dirty); + kvm_slot_update_flags(kml, mem, mr); return; } old = *mem; if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) { - kvm_physical_sync_dirty_bitmap(section); + kvm_physical_sync_dirty_bitmap(kml, section); } /* unregister the overlapping slot */ mem->memory_size = 0; - err = kvm_set_user_memory_region(s, mem); + err = kvm_set_user_memory_region(kml, mem); if (err) { fprintf(stderr, "%s: error unregistering overlapping slot: %s\n", __func__, strerror(-err)); @@ -713,13 +705,13 @@ static void kvm_set_phys_mem(MemoryRegionSection *section, bool add) * - and actually require a recent KVM version. 
*/ if (s->broken_set_mem_region && old.start_addr == start_addr && old.memory_size < size && add) { - mem = kvm_alloc_slot(s); + mem = kvm_alloc_slot(kml); mem->memory_size = old.memory_size; mem->start_addr = old.start_addr; mem->ram = old.ram; - mem->flags = kvm_mem_flags(s, log_dirty, readonly_flag); + mem->flags = kvm_mem_flags(mr); - err = kvm_set_user_memory_region(s, mem); + err = kvm_set_user_memory_region(kml, mem); if (err) { fprintf(stderr, "%s: error updating slot: %s\n", __func__, strerror(-err)); @@ -734,13 +726,13 @@ static void kvm_set_phys_mem(MemoryRegionSection *section, bool add) /* register prefix slot */ if (old.start_addr < start_addr) { - mem = kvm_alloc_slot(s); + mem = kvm_alloc_slot(kml); mem->memory_size = start_addr - old.start_addr; mem->start_addr = old.start_addr; mem->ram = old.ram; - mem->flags = kvm_mem_flags(s, log_dirty, readonly_flag); + mem->flags = kvm_mem_flags(mr); - err = kvm_set_user_memory_region(s, mem); + err = kvm_set_user_memory_region(kml, mem); if (err) { fprintf(stderr, "%s: error registering prefix slot: %s\n", __func__, strerror(-err)); @@ -757,14 +749,14 @@ static void kvm_set_phys_mem(MemoryRegionSection *section, bool add) if (old.start_addr + old.memory_size > start_addr + size) { ram_addr_t size_delta; - mem = kvm_alloc_slot(s); + mem = kvm_alloc_slot(kml); mem->start_addr = start_addr + size; size_delta = mem->start_addr - old.start_addr; mem->memory_size = old.memory_size - size_delta; mem->ram = old.ram + size_delta; - mem->flags = kvm_mem_flags(s, log_dirty, readonly_flag); + mem->flags = kvm_mem_flags(mr); - err = kvm_set_user_memory_region(s, mem); + err = kvm_set_user_memory_region(kml, mem); if (err) { fprintf(stderr, "%s: error registering suffix slot: %s\n", __func__, strerror(-err)); @@ -780,13 +772,13 @@ static void kvm_set_phys_mem(MemoryRegionSection *section, bool add) if (!add) { return; } - mem = kvm_alloc_slot(s); + mem = kvm_alloc_slot(kml); mem->memory_size = size; mem->start_addr = start_addr; mem->ram = ram; - mem->flags = kvm_mem_flags(s, log_dirty, readonly_flag); + mem->flags = kvm_mem_flags(mr); - err = kvm_set_user_memory_region(s, mem); + err = kvm_set_user_memory_region(kml, mem); if (err) { fprintf(stderr, "%s: error registering slot: %s\n", __func__, strerror(-err)); @@ -797,23 +789,28 @@ static void kvm_set_phys_mem(MemoryRegionSection *section, bool add) static void kvm_region_add(MemoryListener *listener, MemoryRegionSection *section) { + KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); + memory_region_ref(section->mr); - kvm_set_phys_mem(section, true); + kvm_set_phys_mem(kml, section, true); } static void kvm_region_del(MemoryListener *listener, MemoryRegionSection *section) { - kvm_set_phys_mem(section, false); + KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); + + kvm_set_phys_mem(kml, section, false); memory_region_unref(section->mr); } static void kvm_log_sync(MemoryListener *listener, MemoryRegionSection *section) { + KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); int r; - r = kvm_physical_sync_dirty_bitmap(section); + r = kvm_physical_sync_dirty_bitmap(kml, section); if (r < 0) { abort(); } @@ -888,18 +885,27 @@ static void kvm_io_ioeventfd_del(MemoryListener *listener, } } -static MemoryListener kvm_memory_listener = { - .region_add = kvm_region_add, - .region_del = kvm_region_del, - .log_start = kvm_log_start, - .log_stop = kvm_log_stop, - .log_sync = kvm_log_sync, - .eventfd_add = 
kvm_mem_ioeventfd_add, - .eventfd_del = kvm_mem_ioeventfd_del, - .coalesced_mmio_add = kvm_coalesce_mmio_region, - .coalesced_mmio_del = kvm_uncoalesce_mmio_region, - .priority = 10, -}; +void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml, + AddressSpace *as, int as_id) +{ + int i; + + kml->slots = g_malloc0(s->nr_slots * sizeof(KVMSlot)); + kml->as_id = as_id; + + for (i = 0; i < s->nr_slots; i++) { + kml->slots[i].slot = i; + } + + kml->listener.region_add = kvm_region_add; + kml->listener.region_del = kvm_region_del; + kml->listener.log_start = kvm_log_start; + kml->listener.log_stop = kvm_log_stop; + kml->listener.log_sync = kvm_log_sync; + kml->listener.priority = 10; + + memory_listener_register(&kml->listener, as); +} static MemoryListener kvm_io_listener = { .eventfd_add = kvm_io_ioeventfd_add, @@ -1099,9 +1105,17 @@ static int kvm_irqchip_get_virq(KVMState *s) uint32_t *word = s->used_gsi_bitmap; int max_words = ALIGN(s->gsi_count, 32) / 32; int i, zeroes; - bool retry = true; -again: + /* + * PIC and IOAPIC share the first 16 GSI numbers, thus the available + * GSI numbers are more than the number of IRQ route. Allocating a GSI + * number can succeed even though a new route entry cannot be added. + * When this happens, flush dynamic MSI entries to free IRQ route entries. + */ + if (!s->direct_msi && s->irq_routes->nr == s->gsi_count) { + kvm_flush_dynamic_msi_routes(s); + } + /* Return the lowest unused GSI in the bitmap */ for (i = 0; i < max_words; i++) { zeroes = ctz32(~word[i]); @@ -1111,11 +1125,6 @@ again: return zeroes + i * 32; } - if (!s->direct_msi && retry) { - retry = false; - kvm_flush_dynamic_msi_routes(s); - goto again; - } return -ENOSPC; } @@ -1325,40 +1334,74 @@ int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg) } #endif /* !KVM_CAP_IRQ_ROUTING */ -int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n, - EventNotifier *rn, int virq) +int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, + EventNotifier *rn, int virq) { return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), rn ? 
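Worked example of the bitmap scan in kvm_irqchip_get_virq() above: with the first word's low 16 bits set (the GSIs shared by the PIC and IOAPIC), the lowest free GSI falls out of a count-trailing-zeros on the inverted word:

    uint32_t word = 0x0000ffff;      /* GSIs 0..15 in use */
    int zeroes = ctz32(~word);       /* == 16, the first free GSI */
    /* a fully used word gives ctz32(0) == 32, moving on to the next */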
event_notifier_get_fd(rn) : -1, virq, true); } -int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, int virq) +int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, + int virq) { return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), -1, virq, false); } -static int kvm_irqchip_create(MachineState *machine, KVMState *s) +int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n, + EventNotifier *rn, qemu_irq irq) +{ + gpointer key, gsi; + gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi); + + if (!found) { + return -ENXIO; + } + return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi)); +} + +int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, + qemu_irq irq) +{ + gpointer key, gsi; + gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi); + + if (!found) { + return -ENXIO; + } + return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi)); +} + +void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi) +{ + g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi)); +} + +static void kvm_irqchip_create(MachineState *machine, KVMState *s) { int ret; - if (!machine_kernel_irqchip_allowed(machine) || - (!kvm_check_extension(s, KVM_CAP_IRQCHIP) && - (kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0) < 0))) { - return 0; + if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) { + ; + } else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) { + ret = kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0); + if (ret < 0) { + fprintf(stderr, "Enable kernel irqchip failed: %s\n", strerror(-ret)); + exit(1); + } + } else { + return; } /* First probe and see if there's a arch-specific hook to create the * in-kernel irqchip for us */ ret = kvm_arch_irqchip_create(s); - if (ret < 0) { - return ret; - } else if (ret == 0) { + if (ret == 0) { ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP); - if (ret < 0) { - fprintf(stderr, "Create kernel irqchip failed\n"); - return ret; - } + } + if (ret < 0) { + fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret)); + exit(1); } kvm_kernel_irqchip = true; @@ -1370,7 +1413,7 @@ static int kvm_irqchip_create(MachineState *machine, KVMState *s) kvm_init_irq_routing(s); - return 0; + s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal); } /* Find number of supported CPUs using the recommended @@ -1407,7 +1450,7 @@ static int kvm_init(MachineState *ms) KVMState *s; const KVMCapabilityInfo *missing_cap; int ret; - int i, type = 0; + int type = 0; const char *kvm_type; s = KVM_STATE(ms->accelerator); @@ -1456,12 +1499,6 @@ static int kvm_init(MachineState *ms) s->nr_slots = 32; } - s->slots = g_malloc0(s->nr_slots * sizeof(KVMSlot)); - - for (i = 0; i < s->nr_slots; i++) { - s->slots[i].slot = i; - } - /* check the vcpu limits */ soft_vcpus_limit = kvm_recommended_vcpus(s); hard_vcpus_limit = kvm_max_vcpus(s); @@ -1593,14 +1630,21 @@ static int kvm_init(MachineState *ms) goto err; } - ret = kvm_irqchip_create(ms, s); - if (ret < 0) { - goto err; + if (machine_kernel_irqchip_allowed(ms)) { + kvm_irqchip_create(ms, s); } kvm_state = s; - memory_listener_register(&kvm_memory_listener, &address_space_memory); - memory_listener_register(&kvm_io_listener, &address_space_io); + + s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add; + s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del; + s->memory_listener.listener.coalesced_mmio_add = kvm_coalesce_mmio_region; + s->memory_listener.listener.coalesced_mmio_del = 
kvm_uncoalesce_mmio_region; + + kvm_memory_listener_register(s, &s->memory_listener, + &address_space_memory, 0); + memory_listener_register(&kvm_io_listener, + &address_space_io); s->many_ioeventfds = kvm_check_many_ioeventfds(); @@ -1616,7 +1660,7 @@ err: if (s->fd != -1) { close(s->fd); } - g_free(s->slots); + g_free(s->memory_listener.slots); return ret; } @@ -1752,6 +1796,8 @@ int kvm_cpu_exec(CPUState *cpu) return EXCP_HLT; } + qemu_mutex_unlock_iothread(); + do { MemTxAttrs attrs; @@ -1770,11 +1816,9 @@ int kvm_cpu_exec(CPUState *cpu) */ qemu_cpu_kick_self(); } - qemu_mutex_unlock_iothread(); run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0); - qemu_mutex_lock_iothread(); attrs = kvm_arch_post_run(cpu, run); if (run_ret < 0) { @@ -1801,6 +1845,7 @@ int kvm_cpu_exec(CPUState *cpu) switch (run->exit_reason) { case KVM_EXIT_IO: DPRINTF("handle_io\n"); + /* Called outside BQL */ kvm_handle_io(run->io.port, attrs, (uint8_t *)run + run->io.data_offset, run->io.direction, @@ -1810,6 +1855,7 @@ int kvm_cpu_exec(CPUState *cpu) break; case KVM_EXIT_MMIO: DPRINTF("handle_mmio\n"); + /* Called outside BQL */ address_space_rw(&address_space_memory, run->mmio.phys_addr, attrs, run->mmio.data, @@ -1857,6 +1903,8 @@ int kvm_cpu_exec(CPUState *cpu) } } while (ret == 0); + qemu_mutex_lock_iothread(); + if (ret < 0) { cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_CODE); vm_stop(RUN_STATE_INTERNAL_ERROR); diff --git a/kvm-stub.c b/kvm-stub.c index 7ba90c546f..d9ad624eee 100644 --- a/kvm-stub.c +++ b/kvm-stub.c @@ -24,6 +24,7 @@ bool kvm_kernel_irqchip; bool kvm_async_interrupts_allowed; bool kvm_eventfds_allowed; bool kvm_irqfds_allowed; +bool kvm_resamplefds_allowed; bool kvm_msi_via_irqfd_allowed; bool kvm_gsi_routing_allowed; bool kvm_gsi_direct_mapping; @@ -137,13 +138,14 @@ int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter) return -ENOSYS; } -int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n, - EventNotifier *rn, int virq) +int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, + EventNotifier *rn, int virq) { return -ENOSYS; } -int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, int virq) +int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, + int virq) { return -ENOSYS; } diff --git a/linux-headers/asm-x86/hyperv.h b/linux-headers/asm-x86/hyperv.h index ce6068dbcf..8fba544e9c 100644 --- a/linux-headers/asm-x86/hyperv.h +++ b/linux-headers/asm-x86/hyperv.h @@ -199,6 +199,17 @@ #define HV_X64_MSR_STIMER3_CONFIG 0x400000B6 #define HV_X64_MSR_STIMER3_COUNT 0x400000B7 +/* Hyper-V guest crash notification MSR's */ +#define HV_X64_MSR_CRASH_P0 0x40000100 +#define HV_X64_MSR_CRASH_P1 0x40000101 +#define HV_X64_MSR_CRASH_P2 0x40000102 +#define HV_X64_MSR_CRASH_P3 0x40000103 +#define HV_X64_MSR_CRASH_P4 0x40000104 +#define HV_X64_MSR_CRASH_CTL 0x40000105 +#define HV_X64_MSR_CRASH_CTL_NOTIFY (1ULL << 63) +#define HV_X64_MSR_CRASH_PARAMS \ + (1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0)) + #define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001 #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12 #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \ diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index fad9e5c561..3bac8736d8 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -897,7 +897,7 @@ struct kvm_xen_hvm_config { * * KVM_IRQFD_FLAG_RESAMPLE indicates resamplefd is valid and specifies * the irqfd to operate in resampling mode for level triggered interrupt - * emlation. 
See Documentation/virtual/kvm/api.txt. + * emulation. See Documentation/virtual/kvm/api.txt. */ #define KVM_IRQFD_FLAG_RESAMPLE (1 << 1) diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h index 0508d0b5d2..aa276bce39 100644 --- a/linux-headers/linux/vfio.h +++ b/linux-headers/linux/vfio.h @@ -36,6 +36,8 @@ /* Two-stage IOMMU */ #define VFIO_TYPE1_NESTING_IOMMU 6 /* Implies v2 */ +#define VFIO_SPAPR_TCE_v2_IOMMU 7 + /* * The IOCTL interface is designed for extensibility by embedding the * structure length (argsz) and flags into structures passed between @@ -443,6 +445,23 @@ struct vfio_iommu_type1_dma_unmap { /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */ /* + * The SPAPR TCE DDW info struct provides the information about + * the details of Dynamic DMA window capability. + * + * @pgsizes contains a page size bitmask, 4K/64K/16M are supported. + * @max_dynamic_windows_supported tells the maximum number of windows + * which the platform can create. + * @levels tells the maximum number of levels in multi-level IOMMU tables; + * this allows splitting a table into smaller chunks which reduces + * the amount of physically contiguous memory required for the table. + */ +struct vfio_iommu_spapr_tce_ddw_info { + __u64 pgsizes; /* Bitmap of supported page sizes */ + __u32 max_dynamic_windows_supported; + __u32 levels; +}; + +/* * The SPAPR TCE info struct provides the information about the PCI bus * address ranges available for DMA, these values are programmed into * the hardware so the guest has to know that information. @@ -452,14 +471,17 @@ struct vfio_iommu_type1_dma_unmap { * addresses too so the window works as a filter rather than an offset * for IOVA addresses. * - * A flag will need to be added if other page sizes are supported, - * so as defined here, it is always 4k. + * Flags supported: + * - VFIO_IOMMU_SPAPR_INFO_DDW: informs the userspace that dynamic DMA windows + * (DDW) support is present. @ddw is only supported when DDW is present. */ struct vfio_iommu_spapr_tce_info { __u32 argsz; - __u32 flags; /* reserved for future use */ + __u32 flags; +#define VFIO_IOMMU_SPAPR_INFO_DDW (1 << 0) /* DDW supported */ __u32 dma32_window_start; /* 32 bit window start (bytes) */ __u32 dma32_window_size; /* 32 bit window size (bytes) */ + struct vfio_iommu_spapr_tce_ddw_info ddw; }; #define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) @@ -470,12 +492,23 @@ struct vfio_iommu_spapr_tce_info { * - unfreeze IO/DMA for frozen PE; * - read PE state; * - reset PE; - * - configure PE. + * - configure PE; + * - inject EEH error. */ +struct vfio_eeh_pe_err { + __u32 type; + __u32 func; + __u64 addr; + __u64 mask; +}; + struct vfio_eeh_pe_op { __u32 argsz; __u32 flags; __u32 op; + union { + struct vfio_eeh_pe_err err; + }; }; #define VFIO_EEH_PE_DISABLE 0 /* Disable EEH functionality */ @@ -492,9 +525,70 @@ struct vfio_eeh_pe_op { #define VFIO_EEH_PE_RESET_HOT 6 /* Assert hot reset */ #define VFIO_EEH_PE_RESET_FUNDAMENTAL 7 /* Assert fundamental reset */ #define VFIO_EEH_PE_CONFIGURE 8 /* PE configuration */ +#define VFIO_EEH_PE_INJECT_ERR 9 /* Inject EEH error */ #define VFIO_EEH_PE_OP _IO(VFIO_TYPE, VFIO_BASE + 21) +/** + * VFIO_IOMMU_SPAPR_REGISTER_MEMORY - _IOW(VFIO_TYPE, VFIO_BASE + 17, struct vfio_iommu_spapr_register_memory) + * + * Registers user space memory where DMA is allowed. It pins + * user pages and does the locked memory accounting so + * subsequent VFIO_IOMMU_MAP_DMA/VFIO_IOMMU_UNMAP_DMA calls + * get faster. 
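The doc comment above spells out the pre-registration contract: pinning and locked-memory accounting happen once, up front, so the later map/unmap ioctls stay cheap. Below is a minimal usage sketch of the two ioctls, assuming a kernel whose linux/vfio.h already exports these definitions; the container file descriptor and buffer arguments are hypothetical and must come from an already-configured VFIO container.

    #include <stddef.h>
    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    /* Hypothetical helper: pre-register a DMA buffer with the SPAPR IOMMU
     * so that later VFIO_IOMMU_MAP_DMA/UNMAP_DMA calls skip the pinning
     * and accounting work. */
    static int spapr_preregister(int container_fd, void *buf, size_t len)
    {
        struct vfio_iommu_spapr_register_memory reg = {
            .argsz = sizeof(reg),
            .flags = 0,
            .vaddr = (__u64)(uintptr_t)buf,
            .size  = len,
        };

        return ioctl(container_fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
    }

    static int spapr_unregister(int container_fd, void *buf, size_t len)
    {
        struct vfio_iommu_spapr_register_memory reg = {
            .argsz = sizeof(reg),
            .vaddr = (__u64)(uintptr_t)buf,
            .size  = len,
        };

        /* The UNREGISTER ioctl reuses the same parameter structure. */
        return ioctl(container_fd, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
    }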
+ */ +struct vfio_iommu_spapr_register_memory { + __u32 argsz; + __u32 flags; + __u64 vaddr; /* Process virtual address */ + __u64 size; /* Size of mapping (bytes) */ +}; +#define VFIO_IOMMU_SPAPR_REGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 17) + +/** + * VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY - _IOW(VFIO_TYPE, VFIO_BASE + 18, struct vfio_iommu_spapr_register_memory) + * + * Unregisters user space memory registered with + * VFIO_IOMMU_SPAPR_REGISTER_MEMORY. + * Uses vfio_iommu_spapr_register_memory for parameters. + */ +#define VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 18) + +/** + * VFIO_IOMMU_SPAPR_TCE_CREATE - _IOWR(VFIO_TYPE, VFIO_BASE + 19, struct vfio_iommu_spapr_tce_create) + * + * Creates an additional TCE table and programs it (sets a new DMA window) + * to every IOMMU group in the container. It receives page shift, window + * size and number of levels in the TCE table being created. + * + * It allocates and returns an offset on a PCI bus of the new DMA window. + */ +struct vfio_iommu_spapr_tce_create { + __u32 argsz; + __u32 flags; + /* in */ + __u32 page_shift; + __u64 window_size; + __u32 levels; + /* out */ + __u64 start_addr; +}; +#define VFIO_IOMMU_SPAPR_TCE_CREATE _IO(VFIO_TYPE, VFIO_BASE + 19) + +/** + * VFIO_IOMMU_SPAPR_TCE_REMOVE - _IOW(VFIO_TYPE, VFIO_BASE + 20, struct vfio_iommu_spapr_tce_remove) + * + * Unprograms a TCE table from all groups in the container and destroys it. + * It receives a PCI bus offset as a window id. + */ +struct vfio_iommu_spapr_tce_remove { + __u32 argsz; + __u32 flags; + /* in */ + __u64 start_addr; +}; +#define VFIO_IOMMU_SPAPR_TCE_REMOVE _IO(VFIO_TYPE, VFIO_BASE + 20) + /* ***************************************************************** */ #endif /* VFIO_H */ diff --git a/linux-headers/linux/virtio_pci.h b/linux-headers/linux/virtio_pci.h deleted file mode 100644 index 92624e5310..0000000000 --- a/linux-headers/linux/virtio_pci.h +++ /dev/null @@ -1,192 +0,0 @@ -/* - * Virtio PCI driver - * - * This module allows virtio devices to be used over a virtual PCI device. - * This can be used with QEMU based VMMs like KVM or Xen. - * - * Copyright IBM Corp. 2007 - * - * Authors: - * Anthony Liguori <aliguori@us.ibm.com> - * - * This header is BSD licensed so anyone can use the definitions to implement - * compatible drivers/servers. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of IBM nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#ifndef _LINUX_VIRTIO_PCI_H -#define _LINUX_VIRTIO_PCI_H - -#include <linux/types.h> - -#ifndef VIRTIO_PCI_NO_LEGACY - -/* A 32-bit r/o bitmask of the features supported by the host */ -#define VIRTIO_PCI_HOST_FEATURES 0 - -/* A 32-bit r/w bitmask of features activated by the guest */ -#define VIRTIO_PCI_GUEST_FEATURES 4 - -/* A 32-bit r/w PFN for the currently selected queue */ -#define VIRTIO_PCI_QUEUE_PFN 8 - -/* A 16-bit r/o queue size for the currently selected queue */ -#define VIRTIO_PCI_QUEUE_NUM 12 - -/* A 16-bit r/w queue selector */ -#define VIRTIO_PCI_QUEUE_SEL 14 - -/* A 16-bit r/w queue notifier */ -#define VIRTIO_PCI_QUEUE_NOTIFY 16 - -/* An 8-bit device status register. */ -#define VIRTIO_PCI_STATUS 18 - -/* An 8-bit r/o interrupt status register. Reading the value will return the - * current contents of the ISR and will also clear it. This is effectively - * a read-and-acknowledge. */ -#define VIRTIO_PCI_ISR 19 - -/* MSI-X registers: only enabled if MSI-X is enabled. */ -/* A 16-bit vector for configuration changes. */ -#define VIRTIO_MSI_CONFIG_VECTOR 20 -/* A 16-bit vector for selected queue notifications. */ -#define VIRTIO_MSI_QUEUE_VECTOR 22 - -/* The remaining space is defined by each driver as the per-driver - * configuration space */ -#define VIRTIO_PCI_CONFIG_OFF(msix_enabled) ((msix_enabled) ? 24 : 20) -/* Deprecated: please use VIRTIO_PCI_CONFIG_OFF instead */ -#define VIRTIO_PCI_CONFIG(dev) VIRTIO_PCI_CONFIG_OFF((dev)->msix_enabled) - -/* Virtio ABI version, this must match exactly */ -#define VIRTIO_PCI_ABI_VERSION 0 - -/* How many bits to shift physical queue address written to QUEUE_PFN. - * 12 is historical, and due to x86 page size. */ -#define VIRTIO_PCI_QUEUE_ADDR_SHIFT 12 - -/* The alignment to use between consumer and producer parts of vring. - * x86 pagesize again. */ -#define VIRTIO_PCI_VRING_ALIGN 4096 - -#endif /* VIRTIO_PCI_NO_LEGACY */ - -/* The bit of the ISR which indicates a device configuration change. */ -#define VIRTIO_PCI_ISR_CONFIG 0x2 -/* Vector value used to disable MSI for queue */ -#define VIRTIO_MSI_NO_VECTOR 0xffff - -#ifndef VIRTIO_PCI_NO_MODERN - -/* IDs for different capabilities. Must all exist. */ - -/* Common configuration */ -#define VIRTIO_PCI_CAP_COMMON_CFG 1 -/* Notifications */ -#define VIRTIO_PCI_CAP_NOTIFY_CFG 2 -/* ISR access */ -#define VIRTIO_PCI_CAP_ISR_CFG 3 -/* Device specific confiuration */ -#define VIRTIO_PCI_CAP_DEVICE_CFG 4 - -/* This is the PCI capability header: */ -struct virtio_pci_cap { - __u8 cap_vndr; /* Generic PCI field: PCI_CAP_ID_VNDR */ - __u8 cap_next; /* Generic PCI field: next ptr. */ - __u8 cap_len; /* Generic PCI field: capability length */ - __u8 cfg_type; /* Identifies the structure. */ - __u8 bar; /* Where to find it. */ - __u8 padding[3]; /* Pad to full dword. */ - __le32 offset; /* Offset within bar. */ - __le32 length; /* Length of the structure, in bytes. 
*/ -}; - -struct virtio_pci_notify_cap { - struct virtio_pci_cap cap; - __le32 notify_off_multiplier; /* Multiplier for queue_notify_off. */ -}; - -/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */ -struct virtio_pci_common_cfg { - /* About the whole device. */ - __le32 device_feature_select; /* read-write */ - __le32 device_feature; /* read-only */ - __le32 guest_feature_select; /* read-write */ - __le32 guest_feature; /* read-write */ - __le16 msix_config; /* read-write */ - __le16 num_queues; /* read-only */ - __u8 device_status; /* read-write */ - __u8 config_generation; /* read-only */ - - /* About a specific virtqueue. */ - __le16 queue_select; /* read-write */ - __le16 queue_size; /* read-write, power of 2. */ - __le16 queue_msix_vector; /* read-write */ - __le16 queue_enable; /* read-write */ - __le16 queue_notify_off; /* read-only */ - __le32 queue_desc_lo; /* read-write */ - __le32 queue_desc_hi; /* read-write */ - __le32 queue_avail_lo; /* read-write */ - __le32 queue_avail_hi; /* read-write */ - __le32 queue_used_lo; /* read-write */ - __le32 queue_used_hi; /* read-write */ -}; - -/* Macro versions of offsets for the Old Timers! */ -#define VIRTIO_PCI_CAP_VNDR 0 -#define VIRTIO_PCI_CAP_NEXT 1 -#define VIRTIO_PCI_CAP_LEN 2 -#define VIRTIO_PCI_CAP_CFG_TYPE 3 -#define VIRTIO_PCI_CAP_BAR 4 -#define VIRTIO_PCI_CAP_OFFSET 8 -#define VIRTIO_PCI_CAP_LENGTH 12 - -#define VIRTIO_PCI_NOTIFY_CAP_MULT 16 - - -#define VIRTIO_PCI_COMMON_DFSELECT 0 -#define VIRTIO_PCI_COMMON_DF 4 -#define VIRTIO_PCI_COMMON_GFSELECT 8 -#define VIRTIO_PCI_COMMON_GF 12 -#define VIRTIO_PCI_COMMON_MSIX 16 -#define VIRTIO_PCI_COMMON_NUMQ 18 -#define VIRTIO_PCI_COMMON_STATUS 20 -#define VIRTIO_PCI_COMMON_CFGGENERATION 21 -#define VIRTIO_PCI_COMMON_Q_SELECT 22 -#define VIRTIO_PCI_COMMON_Q_SIZE 24 -#define VIRTIO_PCI_COMMON_Q_MSIX 26 -#define VIRTIO_PCI_COMMON_Q_ENABLE 28 -#define VIRTIO_PCI_COMMON_Q_NOFF 30 -#define VIRTIO_PCI_COMMON_Q_DESCLO 32 -#define VIRTIO_PCI_COMMON_Q_DESCHI 36 -#define VIRTIO_PCI_COMMON_Q_AVAILLO 40 -#define VIRTIO_PCI_COMMON_Q_AVAILHI 44 -#define VIRTIO_PCI_COMMON_Q_USEDLO 48 -#define VIRTIO_PCI_COMMON_Q_USEDHI 52 - -#endif /* VIRTIO_PCI_NO_MODERN */ - -#endif @@ -396,9 +396,6 @@ static MemTxResult memory_region_read_accessor(MemoryRegion *mr, { uint64_t tmp; - if (mr->flush_coalesced_mmio) { - qemu_flush_coalesced_mmio_buffer(); - } tmp = mr->ops->read(mr->opaque, addr, size); trace_memory_region_ops_read(mr, addr, tmp, size); *value |= (tmp & mask) << shift; @@ -416,9 +413,6 @@ static MemTxResult memory_region_read_with_attrs_accessor(MemoryRegion *mr, uint64_t tmp = 0; MemTxResult r; - if (mr->flush_coalesced_mmio) { - qemu_flush_coalesced_mmio_buffer(); - } r = mr->ops->read_with_attrs(mr->opaque, addr, &tmp, size, attrs); trace_memory_region_ops_read(mr, addr, tmp, size); *value |= (tmp & mask) << shift; @@ -451,9 +445,6 @@ static MemTxResult memory_region_write_accessor(MemoryRegion *mr, { uint64_t tmp; - if (mr->flush_coalesced_mmio) { - qemu_flush_coalesced_mmio_buffer(); - } tmp = (*value >> shift) & mask; trace_memory_region_ops_write(mr, addr, tmp, size); mr->ops->write(mr->opaque, addr, tmp, size); @@ -470,9 +461,6 @@ static MemTxResult memory_region_write_with_attrs_accessor(MemoryRegion *mr, { uint64_t tmp; - if (mr->flush_coalesced_mmio) { - qemu_flush_coalesced_mmio_buffer(); - } tmp = (*value >> shift) & mask; trace_memory_region_ops_write(mr, addr, tmp, size); return mr->ops->write_with_attrs(mr->opaque, addr, tmp, size, attrs); @@ -1012,6 +1000,7 @@ static void 
memory_region_initfn(Object *obj)
     mr->ram_addr = RAM_ADDR_INVALID;
     mr->enabled = true;
     mr->romd_mode = true;
+    mr->global_locking = true;
     mr->destructor = memory_region_destructor_none;
     QTAILQ_INIT(&mr->subregions);
     QTAILQ_INIT(&mr->coalesced);
@@ -1646,6 +1635,16 @@ void memory_region_clear_flush_coalesced(MemoryRegion *mr)
     }
 }

+void memory_region_set_global_locking(MemoryRegion *mr)
+{
+    mr->global_locking = true;
+}
+
+void memory_region_clear_global_locking(MemoryRegion *mr)
+{
+    mr->global_locking = false;
+}
+
 void memory_region_add_eventfd(MemoryRegion *mr,
                                hwaddr addr,
                                unsigned size,
diff --git a/memory_mapping.c b/memory_mapping.c
index 7b69801cb8..36d6b26046 100644
--- a/memory_mapping.c
+++ b/memory_mapping.c
@@ -13,8 +13,8 @@
 #include <glib.h>

+#include "qemu-common.h"
 #include "cpu.h"
-#include "exec/cpu-all.h"
 #include "sysemu/memory_mapping.h"
 #include "exec/memory.h"
 #include "exec/address-spaces.h"
diff --git a/numa.c b/numa.c
--- a/numa.c
+++ b/numa.c
@@ -52,6 +52,92 @@ static int max_numa_nodeid; /* Highest specified NUMA node ID, plus one. */
 int nb_numa_nodes;
 NodeInfo numa_info[MAX_NODES];

+void numa_set_mem_node_id(ram_addr_t addr, uint64_t size, uint32_t node)
+{
+    struct numa_addr_range *range;
+
+    /*
+     * Memory-less nodes can come here with 0 size, in which case
+     * there is nothing to do.
+     */
+    if (!size) {
+        return;
+    }
+
+    range = g_malloc0(sizeof(*range));
+    range->mem_start = addr;
+    range->mem_end = addr + size - 1;
+    QLIST_INSERT_HEAD(&numa_info[node].addr, range, entry);
+}
+
+void numa_unset_mem_node_id(ram_addr_t addr, uint64_t size, uint32_t node)
+{
+    struct numa_addr_range *range, *next;
+
+    QLIST_FOREACH_SAFE(range, &numa_info[node].addr, entry, next) {
+        if (addr == range->mem_start && (addr + size - 1) == range->mem_end) {
+            QLIST_REMOVE(range, entry);
+            g_free(range);
+            return;
+        }
+    }
+}
+
+static void numa_set_mem_ranges(void)
+{
+    int i;
+    ram_addr_t mem_start = 0;
+
+    /*
+     * Deduce the start address of each node and use it to store
+     * the address range info in each node's address range list.
+     */
+    for (i = 0; i < nb_numa_nodes; i++) {
+        numa_set_mem_node_id(mem_start, numa_info[i].node_mem, i);
+        mem_start += numa_info[i].node_mem;
+    }
+}
+
+/*
+ * Check if @addr falls under NUMA @node.
+ */
+static bool numa_addr_belongs_to_node(ram_addr_t addr, uint32_t node)
+{
+    struct numa_addr_range *range;
+
+    QLIST_FOREACH(range, &numa_info[node].addr, entry) {
+        if (addr >= range->mem_start && addr <= range->mem_end) {
+            return true;
+        }
+    }
+    return false;
+}
+
+/*
+ * Given an address, return the index of the NUMA node to which the
+ * address belongs.
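The bookkeeping above is a per-node list of inclusive [start, end] ranges plus a linear lookup. A self-contained toy version of the same idea, using standalone types rather than the QEMU ones, to show how registration and lookup fit together:

    #include <stdint.h>
    #include <stdio.h>

    #define MAX_NODES 4

    /* Simplified stand-ins for the QEMU structures: each node keeps a
     * singly-linked list of [start, end] address ranges. */
    struct range { uint64_t start, end; struct range *next; };
    static struct range *node_ranges[MAX_NODES];

    static void set_mem_node_id(uint64_t addr, uint64_t size, int node)
    {
        static struct range pool[16];
        static int used;
        struct range *r;

        if (!size) {          /* memory-less node: nothing to record */
            return;
        }
        r = &pool[used++];
        r->start = addr;
        r->end = addr + size - 1;
        r->next = node_ranges[node];
        node_ranges[node] = r;
    }

    static int get_node(uint64_t addr)
    {
        for (int node = 0; node < MAX_NODES; node++) {
            for (struct range *r = node_ranges[node]; r; r = r->next) {
                if (addr >= r->start && addr <= r->end) {
                    return node;
                }
            }
        }
        return -1;            /* address not covered by any node */
    }

    int main(void)
    {
        set_mem_node_id(0, 1ull << 30, 0);          /* node 0: first GiB  */
        set_mem_node_id(1ull << 30, 1ull << 30, 1); /* node 1: second GiB */
        printf("0x40000000 -> node %d\n", get_node(0x40000000));
        return 0;
    }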
+ */ +uint32_t numa_get_node(ram_addr_t addr, Error **errp) +{ + uint32_t i; + + /* For non NUMA configurations, check if the addr falls under node 0 */ + if (!nb_numa_nodes) { + if (numa_addr_belongs_to_node(addr, 0)) { + return 0; + } + } + + for (i = 0; i < nb_numa_nodes; i++) { + if (numa_addr_belongs_to_node(addr, i)) { + return i; + } + } + + error_setg(errp, "Address 0x" RAM_ADDR_FMT " doesn't belong to any " + "NUMA node", addr); + return -1; +} + static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp) { uint16_t nodenr; @@ -274,6 +360,12 @@ void parse_numa_opts(MachineClass *mc) } for (i = 0; i < nb_numa_nodes; i++) { + QLIST_INIT(&numa_info[i].addr); + } + + numa_set_mem_ranges(); + + for (i = 0; i < nb_numa_nodes; i++) { if (!bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) { break; } @@ -297,6 +389,8 @@ void parse_numa_opts(MachineClass *mc) } validate_numa_cpus(); + } else { + numa_set_mem_node_id(0, ram_size, 0); } } diff --git a/stubs/iothread-lock.c b/stubs/iothread-lock.c index 5d8aca1b37..dda6f6b58d 100644 --- a/stubs/iothread-lock.c +++ b/stubs/iothread-lock.c @@ -1,6 +1,11 @@ #include "qemu-common.h" #include "qemu/main-loop.h" +bool qemu_mutex_iothread_locked(void) +{ + return true; +} + void qemu_mutex_lock_iothread(void) { } diff --git a/target-arm/helper.c b/target-arm/helper.c index aa341599cf..b87afe7cde 100644 --- a/target-arm/helper.c +++ b/target-arm/helper.c @@ -2441,7 +2441,7 @@ static const ARMCPRegInfo v8_cp_reginfo[] = { { .name = "TLBI_ALLE1IS", .state = ARM_CP_STATE_AA64, .opc0 = 1, .opc1 = 4, .crn = 8, .crm = 3, .opc2 = 4, .access = PL2_W, .type = ARM_CP_NO_RAW, - .writefn = tlbiall_write }, + .writefn = tlbiall_is_write }, { .name = "TLBI_VMALLE1IS", .state = ARM_CP_STATE_AA64, .opc0 = 1, .opc1 = 0, .crn = 8, .crm = 3, .opc2 = 0, .access = PL1_W, .type = ARM_CP_NO_RAW, diff --git a/target-arm/helper.h b/target-arm/helper.h index fc885dea43..827b33dfec 100644 --- a/target-arm/helper.h +++ b/target-arm/helper.h @@ -50,6 +50,7 @@ DEF_HELPER_2(exception_internal, void, env, i32) DEF_HELPER_4(exception_with_syndrome, void, env, i32, i32, i32) DEF_HELPER_1(wfi, void, env) DEF_HELPER_1(wfe, void, env) +DEF_HELPER_1(yield, void, env) DEF_HELPER_1(pre_hvc, void, env) DEF_HELPER_2(pre_smc, void, env, i32) diff --git a/target-arm/op_helper.c b/target-arm/op_helper.c index 7fa32c4707..663c05d1d2 100644 --- a/target-arm/op_helper.c +++ b/target-arm/op_helper.c @@ -323,13 +323,25 @@ void HELPER(wfi)(CPUARMState *env) void HELPER(wfe)(CPUARMState *env) { - CPUState *cs = CPU(arm_env_get_cpu(env)); - - /* Don't actually halt the CPU, just yield back to top + /* This is a hint instruction that is semantically different + * from YIELD even though we currently implement it identically. + * Don't actually halt the CPU, just yield back to top * level loop. This is not going into a "low power state" * (ie halting until some event occurs), so we never take * a configurable trap to a different exception level. */ + HELPER(yield)(env); +} + +void HELPER(yield)(CPUARMState *env) +{ + ARMCPU *cpu = arm_env_get_cpu(env); + CPUState *cs = CPU(cpu); + + /* This is a non-trappable hint instruction that generally indicates + * that the guest is currently busy-looping. Yield control back to the + * top level loop so that a more deserving VCPU has a chance to run. 
+ */ cs->exception_index = EXCP_YIELD; cpu_loop_exit(cs); } diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c index e077f2dc30..689f2be896 100644 --- a/target-arm/translate-a64.c +++ b/target-arm/translate-a64.c @@ -1199,6 +1199,8 @@ static void handle_hint(DisasContext *s, uint32_t insn, s->is_jmp = DISAS_WFI; return; case 1: /* YIELD */ + s->is_jmp = DISAS_YIELD; + return; case 2: /* WFE */ s->is_jmp = DISAS_WFE; return; @@ -11107,6 +11109,10 @@ void gen_intermediate_code_internal_a64(ARMCPU *cpu, gen_a64_set_pc_im(dc->pc); gen_helper_wfe(cpu_env); break; + case DISAS_YIELD: + gen_a64_set_pc_im(dc->pc); + gen_helper_yield(cpu_env); + break; case DISAS_WFI: /* This is a special case because we don't want to just halt the CPU * if trying to debug across a WFI. diff --git a/target-arm/translate.c b/target-arm/translate.c index 971b6db061..69ac18c108 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -4080,6 +4080,10 @@ static void gen_rfe(DisasContext *s, TCGv_i32 pc, TCGv_i32 cpsr) static void gen_nop_hint(DisasContext *s, int val) { switch (val) { + case 1: /* yield */ + gen_set_pc_im(s, s->pc); + s->is_jmp = DISAS_YIELD; + break; case 3: /* wfi */ gen_set_pc_im(s, s->pc); s->is_jmp = DISAS_WFI; @@ -11459,6 +11463,9 @@ static inline void gen_intermediate_code_internal(ARMCPU *cpu, case DISAS_WFE: gen_helper_wfe(cpu_env); break; + case DISAS_YIELD: + gen_helper_yield(cpu_env); + break; case DISAS_SWI: gen_exception(EXCP_SWI, syn_aa32_svc(dc->svc_imm, dc->thumb), default_exception_el(dc)); diff --git a/target-arm/translate.h b/target-arm/translate.h index bcdcf11718..9ab978fb75 100644 --- a/target-arm/translate.h +++ b/target-arm/translate.h @@ -103,6 +103,7 @@ static inline int default_exception_el(DisasContext *s) #define DISAS_WFE 7 #define DISAS_HVC 8 #define DISAS_SMC 9 +#define DISAS_YIELD 10 #ifdef TARGET_AARCH64 void a64_translate_init(void); diff --git a/target-i386/cpu.h b/target-i386/cpu.h index 603aaf0924..ac39291b48 100644 --- a/target-i386/cpu.h +++ b/target-i386/cpu.h @@ -314,6 +314,7 @@ #define MSR_P6_PERFCTR0 0xc1 +#define MSR_IA32_SMBASE 0x9e #define MSR_MTRRcap 0xfe #define MSR_MTRRcap_VCNT 8 #define MSR_MTRRcap_FIXRANGE_SUPPORT (1 << 8) diff --git a/target-i386/kvm-stub.c b/target-i386/kvm-stub.c index 2b9e8011fb..6fefd65c23 100644 --- a/target-i386/kvm-stub.c +++ b/target-i386/kvm-stub.c @@ -18,6 +18,11 @@ bool kvm_allows_irq0_override(void) } #ifndef __OPTIMIZE__ +bool kvm_has_smm(void) +{ + return 1; +} + /* This function is only called inside conditionals which we * rely on the compiler to optimize out when CONFIG_KVM is not * defined. 
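A few hunks up, the ARM patches route the YIELD hint through a new DISAS_YIELD disposition into helper_yield(), which records EXCP_YIELD and unwinds back to the outer execution loop via cpu_loop_exit(). A toy model of that control transfer, using setjmp/longjmp the way cpu_loop_exit() does; every name here is a stand-in, not the real QEMU API:

    #include <setjmp.h>
    #include <stdio.h>

    /* Toy stand-ins for CPUState/EXCP_YIELD; real QEMU keeps the jmp_buf
     * inside CPUState and EXCP_YIELD simply returns to the main loop. */
    enum { EXCP_NONE, EXCP_YIELD };

    static jmp_buf exec_loop;
    static int exception_index = EXCP_NONE;

    static void cpu_loop_exit_model(int excp)
    {
        exception_index = excp;
        longjmp(exec_loop, 1);       /* unwind out of "generated code" */
    }

    static void helper_yield_model(void)
    {
        /* The guest executed a YIELD hint: stop running this vCPU's
         * translation blocks without halting, so another vCPU can run. */
        cpu_loop_exit_model(EXCP_YIELD);
    }

    int main(void)
    {
        if (setjmp(exec_loop) == 0) {
            helper_yield_model();    /* the "generated code" path */
        }
        if (exception_index == EXCP_YIELD) {
            printf("vcpu yielded; scheduler can pick another vcpu\n");
        }
        return 0;
    }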
diff --git a/target-i386/kvm.c b/target-i386/kvm.c index daced5cb94..9038bf7077 100644 --- a/target-i386/kvm.c +++ b/target-i386/kvm.c @@ -22,7 +22,7 @@ #include "qemu-common.h" #include "sysemu/sysemu.h" -#include "sysemu/kvm.h" +#include "sysemu/kvm_int.h" #include "kvm_i386.h" #include "cpu.h" #include "exec/gdbstub.h" @@ -73,6 +73,7 @@ static bool has_msr_feature_control; static bool has_msr_async_pf_en; static bool has_msr_pv_eoi_en; static bool has_msr_misc_enable; +static bool has_msr_smbase; static bool has_msr_bndcfgs; static bool has_msr_kvm_steal_time; static int lm_capable_kernel; @@ -85,6 +86,11 @@ static bool has_msr_xss; static bool has_msr_architectural_pmu; static uint32_t num_architectural_pmu_counters; +bool kvm_has_smm(void) +{ + return kvm_check_extension(kvm_state, KVM_CAP_X86_SMM); +} + bool kvm_allows_irq0_override(void) { return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing(); @@ -819,6 +825,10 @@ static int kvm_get_supported_msrs(KVMState *s) has_msr_tsc_deadline = true; continue; } + if (kvm_msr_list->indices[i] == MSR_IA32_SMBASE) { + has_msr_smbase = true; + continue; + } if (kvm_msr_list->indices[i] == MSR_IA32_MISC_ENABLE) { has_msr_misc_enable = true; continue; @@ -840,6 +850,40 @@ static int kvm_get_supported_msrs(KVMState *s) return ret; } +static Notifier smram_machine_done; +static KVMMemoryListener smram_listener; +static AddressSpace smram_address_space; +static MemoryRegion smram_as_root; +static MemoryRegion smram_as_mem; + +static void register_smram_listener(Notifier *n, void *unused) +{ + MemoryRegion *smram = + (MemoryRegion *) object_resolve_path("/machine/smram", NULL); + + /* Outer container... */ + memory_region_init(&smram_as_root, OBJECT(kvm_state), "mem-container-smram", ~0ull); + memory_region_set_enabled(&smram_as_root, true); + + /* ... with two regions inside: normal system memory with low + * priority, and... + */ + memory_region_init_alias(&smram_as_mem, OBJECT(kvm_state), "mem-smram", + get_system_memory(), 0, ~0ull); + memory_region_add_subregion_overlap(&smram_as_root, 0, &smram_as_mem, 0); + memory_region_set_enabled(&smram_as_mem, true); + + if (smram) { + /* ... 
SMRAM with higher priority */ + memory_region_add_subregion_overlap(&smram_as_root, 0, smram, 10); + memory_region_set_enabled(smram, true); + } + + address_space_init(&smram_address_space, &smram_as_root, "KVM-SMRAM"); + kvm_memory_listener_register(kvm_state, &smram_listener, + &smram_address_space, 1); +} + int kvm_arch_init(MachineState *ms, KVMState *s) { uint64_t identity_base = 0xfffbc000; @@ -898,6 +942,11 @@ int kvm_arch_init(MachineState *ms, KVMState *s) return ret; } } + + if (kvm_check_extension(s, KVM_CAP_X86_SMM)) { + smram_machine_done.notify = register_smram_listener; + qemu_add_machine_init_done_notifier(&smram_machine_done); + } return 0; } @@ -1245,6 +1294,9 @@ static int kvm_put_msrs(X86CPU *cpu, int level) kvm_msr_entry_set(&msrs[n++], MSR_IA32_MISC_ENABLE, env->msr_ia32_misc_enable); } + if (has_msr_smbase) { + kvm_msr_entry_set(&msrs[n++], MSR_IA32_SMBASE, env->smbase); + } if (has_msr_bndcfgs) { kvm_msr_entry_set(&msrs[n++], MSR_IA32_BNDCFGS, env->msr_bndcfgs); } @@ -1606,6 +1658,9 @@ static int kvm_get_msrs(X86CPU *cpu) if (has_msr_misc_enable) { msrs[n++].index = MSR_IA32_MISC_ENABLE; } + if (has_msr_smbase) { + msrs[n++].index = MSR_IA32_SMBASE; + } if (has_msr_feature_control) { msrs[n++].index = MSR_IA32_FEATURE_CONTROL; } @@ -1760,6 +1815,9 @@ static int kvm_get_msrs(X86CPU *cpu) case MSR_IA32_MISC_ENABLE: env->msr_ia32_misc_enable = msrs[i].data; break; + case MSR_IA32_SMBASE: + env->smbase = msrs[i].data; + break; case MSR_IA32_FEATURE_CONTROL: env->msr_ia32_feature_control = msrs[i].data; break; @@ -1923,6 +1981,7 @@ static int kvm_put_apic(X86CPU *cpu) static int kvm_put_vcpu_events(X86CPU *cpu, int level) { + CPUState *cs = CPU(cpu); CPUX86State *env = &cpu->env; struct kvm_vcpu_events events = {}; @@ -1947,6 +2006,24 @@ static int kvm_put_vcpu_events(X86CPU *cpu, int level) events.sipi_vector = env->sipi_vector; + if (has_msr_smbase) { + events.smi.smm = !!(env->hflags & HF_SMM_MASK); + events.smi.smm_inside_nmi = !!(env->hflags2 & HF2_SMM_INSIDE_NMI_MASK); + if (kvm_irqchip_in_kernel()) { + /* As soon as these are moved to the kernel, remove them + * from cs->interrupt_request. + */ + events.smi.pending = cs->interrupt_request & CPU_INTERRUPT_SMI; + events.smi.latched_init = cs->interrupt_request & CPU_INTERRUPT_INIT; + cs->interrupt_request &= ~(CPU_INTERRUPT_INIT | CPU_INTERRUPT_SMI); + } else { + /* Keep these in cs->interrupt_request. 
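register_smram_listener() above layers two overlapping subregions in one container: a low-priority alias of system memory and, when present, SMRAM at priority 10, so SMRAM shadows normal RAM only where it exists. A toy model of that highest-priority-wins resolution, not the real flatview algorithm:

    #include <stdint.h>
    #include <stdio.h>

    /* Toy model of a container with overlapping subregions; the highest
     * priority covering region wins, like
     * memory_region_add_subregion_overlap(). */
    struct subregion {
        const char *name;
        uint64_t start, size;
        int priority;
    };

    static const struct subregion *resolve(const struct subregion *rs, int n,
                                           uint64_t addr)
    {
        const struct subregion *best = NULL;

        for (int i = 0; i < n; i++) {
            if (addr >= rs[i].start && addr - rs[i].start < rs[i].size &&
                (!best || rs[i].priority > best->priority)) {
                best = &rs[i];
            }
        }
        return best;
    }

    int main(void)
    {
        const struct subregion smram_as[] = {
            { "mem-smram(alias)", 0, UINT64_MAX, 0 },  /* all of memory */
            { "smram", 0xa0000, 0x20000, 10 },         /* legacy SMRAM  */
        };

        /* 0xa8000 is covered by both; the priority-10 SMRAM wins. */
        printf("%s\n", resolve(smram_as, 2, 0xa8000)->name);
        return 0;
    }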
 */
+            events.smi.pending = 0;
+            events.smi.latched_init = 0;
+        }
+        events.flags |= KVM_VCPUEVENT_VALID_SMM;
+    }
+
-    events.flags = 0;
     if (level >= KVM_PUT_RESET_STATE) {
         events.flags |=
@@ -1966,6 +2043,7 @@ static int kvm_get_vcpu_events(X86CPU *cpu)
         return 0;
     }

+    memset(&events, 0, sizeof(events));
     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_VCPU_EVENTS, &events);
     if (ret < 0) {
         return ret;
@@ -1987,6 +2065,29 @@
         env->hflags2 &= ~HF2_NMI_MASK;
     }

+    if (events.flags & KVM_VCPUEVENT_VALID_SMM) {
+        if (events.smi.smm) {
+            env->hflags |= HF_SMM_MASK;
+        } else {
+            env->hflags &= ~HF_SMM_MASK;
+        }
+        if (events.smi.pending) {
+            cpu_interrupt(CPU(cpu), CPU_INTERRUPT_SMI);
+        } else {
+            cpu_reset_interrupt(CPU(cpu), CPU_INTERRUPT_SMI);
+        }
+        if (events.smi.smm_inside_nmi) {
+            env->hflags2 |= HF2_SMM_INSIDE_NMI_MASK;
+        } else {
+            env->hflags2 &= ~HF2_SMM_INSIDE_NMI_MASK;
+        }
+        if (events.smi.latched_init) {
+            cpu_interrupt(CPU(cpu), CPU_INTERRUPT_INIT);
+        } else {
+            cpu_reset_interrupt(CPU(cpu), CPU_INTERRUPT_INIT);
+        }
+    }
+
     env->sipi_vector = events.sipi_vector;

     return 0;
@@ -2190,22 +2291,47 @@ void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run)
     int ret;

     /* Inject NMI */
-    if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
-        cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
-        DPRINTF("injected NMI\n");
-        ret = kvm_vcpu_ioctl(cpu, KVM_NMI);
-        if (ret < 0) {
-            fprintf(stderr, "KVM: injection failed, NMI lost (%s)\n",
-                    strerror(-ret));
+    if (cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
+        if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
+            qemu_mutex_lock_iothread();
+            cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
+            qemu_mutex_unlock_iothread();
+            DPRINTF("injected NMI\n");
+            ret = kvm_vcpu_ioctl(cpu, KVM_NMI);
+            if (ret < 0) {
+                fprintf(stderr, "KVM: injection failed, NMI lost (%s)\n",
+                        strerror(-ret));
+            }
+        }
+        if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
+            qemu_mutex_lock_iothread();
+            cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
+            qemu_mutex_unlock_iothread();
+            DPRINTF("injected SMI\n");
+            ret = kvm_vcpu_ioctl(cpu, KVM_SMI);
+            if (ret < 0) {
+                fprintf(stderr, "KVM: injection failed, SMI lost (%s)\n",
+                        strerror(-ret));
+            }
         }
     }

+    if (!kvm_irqchip_in_kernel()) {
+        qemu_mutex_lock_iothread();
+    }
+
     /* Force the VCPU out of its inner loop to process any INIT requests
      * or (for userspace APIC, but it is cheap to combine the checks here)
      * pending TPR access reports.
      */
     if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
-        cpu->exit_request = 1;
+        if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
+            !(env->hflags & HF_SMM_MASK)) {
+            cpu->exit_request = 1;
+        }
+        if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
+            cpu->exit_request = 1;
+        }
     }

     if (!kvm_irqchip_in_kernel()) {
@@ -2243,6 +2369,8 @@ void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run)
         DPRINTF("setting tpr\n");
         run->cr8 = cpu_get_apic_tpr(x86_cpu->apic_state);
+
+        qemu_mutex_unlock_iothread();
     }
 }

@@ -2251,13 +2379,27 @@ MemTxAttrs kvm_arch_post_run(CPUState *cpu, struct kvm_run *run)
     X86CPU *x86_cpu = X86_CPU(cpu);
     CPUX86State *env = &x86_cpu->env;

+    if (run->flags & KVM_RUN_X86_SMM) {
+        env->hflags |= HF_SMM_MASK;
+    } else {
+        env->hflags &= ~HF_SMM_MASK;
+    }
     if (run->if_flag) {
         env->eflags |= IF_MASK;
     } else {
         env->eflags &= ~IF_MASK;
     }
+
+    /* We need to protect the apic state against concurrent accesses from
+     * different threads in case the userspace irqchip is used.
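Because kvm_arch_post_run() now runs without the BQL, the HF_SMM_MASK update above has to be a plain read-modify-write of vCPU-private state, and the clear path must mask with the complement of the flag. A minimal sketch of the intended update; the flag values are hypothetical stand-ins for QEMU's hflags bits:

    #include <assert.h>
    #include <stdint.h>

    /* Hypothetical flag bits standing in for QEMU's hflags definitions. */
    #define HF_SMM_MASK   (1u << 19)
    #define HF_OTHER_MASK (1u << 3)

    static uint32_t sync_smm_flag(uint32_t hflags, int in_smm)
    {
        if (in_smm) {
            hflags |= HF_SMM_MASK;     /* entered SMM */
        } else {
            hflags &= ~HF_SMM_MASK;    /* left SMM: clear only this bit */
        }
        return hflags;
    }

    int main(void)
    {
        uint32_t hflags = HF_OTHER_MASK | HF_SMM_MASK;

        hflags = sync_smm_flag(hflags, 0);
        /* Other state must survive; `hflags &= HF_SMM_MASK` would have
         * wiped every bit except the SMM one. */
        assert(hflags == HF_OTHER_MASK);
        return 0;
    }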
*/ + if (!kvm_irqchip_in_kernel()) { + qemu_mutex_lock_iothread(); + } cpu_set_apic_tpr(x86_cpu->apic_state, run->cr8); cpu_set_apic_base(x86_cpu->apic_state, run->apic_base); + if (!kvm_irqchip_in_kernel()) { + qemu_mutex_unlock_iothread(); + } return cpu_get_mem_attrs(env); } @@ -2289,7 +2431,8 @@ int kvm_arch_process_async_events(CPUState *cs) } } - if (cs->interrupt_request & CPU_INTERRUPT_INIT) { + if ((cs->interrupt_request & CPU_INTERRUPT_INIT) && + !(env->hflags & HF_SMM_MASK)) { kvm_cpu_synchronize_state(cs); do_cpu_init(cpu); } @@ -2550,13 +2693,17 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) switch (run->exit_reason) { case KVM_EXIT_HLT: DPRINTF("handle_hlt\n"); + qemu_mutex_lock_iothread(); ret = kvm_handle_halt(cpu); + qemu_mutex_unlock_iothread(); break; case KVM_EXIT_SET_TPR: ret = 0; break; case KVM_EXIT_TPR_ACCESS: + qemu_mutex_lock_iothread(); ret = kvm_handle_tpr_access(cpu); + qemu_mutex_unlock_iothread(); break; case KVM_EXIT_FAIL_ENTRY: code = run->fail_entry.hardware_entry_failure_reason; @@ -2582,7 +2729,9 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) break; case KVM_EXIT_DEBUG: DPRINTF("kvm_exit_debug\n"); + qemu_mutex_lock_iothread(); ret = kvm_handle_debug(cpu, &run->debug.arch); + qemu_mutex_unlock_iothread(); break; default: fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason); diff --git a/target-i386/kvm_i386.h b/target-i386/kvm_i386.h index cac30fd381..e557e94f44 100644 --- a/target-i386/kvm_i386.h +++ b/target-i386/kvm_i386.h @@ -14,6 +14,7 @@ #include "sysemu/kvm.h" bool kvm_allows_irq0_override(void); +bool kvm_has_smm(void); void kvm_arch_reset_vcpu(X86CPU *cs); void kvm_arch_do_init_vcpu(X86CPU *cs); diff --git a/target-mips/kvm.c b/target-mips/kvm.c index 948619fbab..7d2293d934 100644 --- a/target-mips/kvm.c +++ b/target-mips/kvm.c @@ -99,6 +99,8 @@ void kvm_arch_pre_run(CPUState *cs, struct kvm_run *run) int r; struct kvm_mips_interrupt intr; + qemu_mutex_lock_iothread(); + if ((cs->interrupt_request & CPU_INTERRUPT_HARD) && cpu_mips_io_interrupts_pending(cpu)) { intr.cpu = -1; @@ -109,6 +111,8 @@ void kvm_arch_pre_run(CPUState *cs, struct kvm_run *run) __func__, cs->cpu_index, intr.irq); } } + + qemu_mutex_unlock_iothread(); } MemTxAttrs kvm_arch_post_run(CPUState *cs, struct kvm_run *run) diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c index afb4696b8a..ddf469fe09 100644 --- a/target-ppc/kvm.c +++ b/target-ppc/kvm.c @@ -1242,6 +1242,8 @@ void kvm_arch_pre_run(CPUState *cs, struct kvm_run *run) int r; unsigned irq; + qemu_mutex_lock_iothread(); + /* PowerPC QEMU tracks the various core input pins (interrupt, critical * interrupt, reset, etc) in PPC-specific env->irq_input_state. */ if (!cap_interrupt_level && @@ -1269,6 +1271,8 @@ void kvm_arch_pre_run(CPUState *cs, struct kvm_run *run) /* We don't know if there are more interrupts pending after this. However, * the guest will return to userspace in the course of handling this one * anyways, so we will get a chance to deliver the rest. 
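The MIPS and PPC hooks follow the same recipe as the x86 ones above: kvm_arch_pre_run() and kvm_arch_handle_exit() are now entered without the BQL, so shared device state is touched only inside an explicit lock/unlock bracket. A generic sketch of the pattern, with hypothetical helper names standing in for QEMU's BQL wrappers:

    #include <pthread.h>

    /* Hypothetical stand-ins for qemu_mutex_lock/unlock_iothread(). */
    static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;

    static void lock_iothread(void)   { pthread_mutex_lock(&big_lock); }
    static void unlock_iothread(void) { pthread_mutex_unlock(&big_lock); }

    static int pending_irq;           /* shared with the I/O thread */

    /* Model of an arch pre-run hook: called lock-free on the vCPU
     * thread, so it brackets only the shared-state access. */
    static void arch_pre_run_model(void)
    {
        lock_iothread();
        if (pending_irq) {
            /* ...inject the interrupt into the vCPU... */
            pending_irq = 0;
        }
        unlock_iothread();
    }

    int main(void)
    {
        pending_irq = 1;
        arch_pre_run_model();
        return pending_irq;           /* 0: the injection path ran */
    }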
*/ + + qemu_mutex_unlock_iothread(); } MemTxAttrs kvm_arch_post_run(CPUState *cs, struct kvm_run *run) @@ -1570,6 +1574,8 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) CPUPPCState *env = &cpu->env; int ret; + qemu_mutex_lock_iothread(); + switch (run->exit_reason) { case KVM_EXIT_DCR: if (run->dcr.is_write) { @@ -1620,6 +1626,7 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) break; } + qemu_mutex_unlock_iothread(); return ret; } diff --git a/target-s390x/kvm.c b/target-s390x/kvm.c index 135111a2c4..ae3a0affec 100644 --- a/target-s390x/kvm.c +++ b/target-s390x/kvm.c @@ -2007,6 +2007,8 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) S390CPU *cpu = S390_CPU(cs); int ret = 0; + qemu_mutex_lock_iothread(); + switch (run->exit_reason) { case KVM_EXIT_S390_SIEIC: ret = handle_intercept(cpu); @@ -2027,6 +2029,7 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) fprintf(stderr, "Unknown KVM exit: %d\n", run->exit_reason); break; } + qemu_mutex_unlock_iothread(); if (ret == 0) { ret = EXCP_INTERRUPT; diff --git a/target-s390x/mmu_helper.c b/target-s390x/mmu_helper.c index 815ff42dde..1ea6d812c2 100644 --- a/target-s390x/mmu_helper.c +++ b/target-s390x/mmu_helper.c @@ -17,8 +17,8 @@ #include "qemu/error-report.h" #include "exec/address-spaces.h" -#include "sysemu/kvm.h" #include "cpu.h" +#include "sysemu/kvm.h" /* #define DEBUG_S390 */ /* #define DEBUG_S390_PTE */ diff --git a/target-xtensa/core-dc232b.c b/target-xtensa/core-dc232b.c index a3b914bad4..06826c042f 100644 --- a/target-xtensa/core-dc232b.c +++ b/target-xtensa/core-dc232b.c @@ -33,7 +33,7 @@ #include "core-dc232b/core-isa.h" #include "overlay_tool.h" -static const XtensaConfig dc232b __attribute__((unused)) = { +static XtensaConfig dc232b __attribute__((unused)) = { .name = "dc232b", .gdb_regmap = { .num_regs = 120, diff --git a/target-xtensa/core-dc233c.c b/target-xtensa/core-dc233c.c index ac745d106f..8daf7d9f84 100644 --- a/target-xtensa/core-dc233c.c +++ b/target-xtensa/core-dc233c.c @@ -34,7 +34,7 @@ #include "core-dc233c/core-isa.h" #include "overlay_tool.h" -static const XtensaConfig dc233c __attribute__((unused)) = { +static XtensaConfig dc233c __attribute__((unused)) = { .name = "dc233c", .gdb_regmap = { .num_regs = 121, diff --git a/target-xtensa/core-fsf.c b/target-xtensa/core-fsf.c index cfcc840255..f6ea6b944a 100644 --- a/target-xtensa/core-fsf.c +++ b/target-xtensa/core-fsf.c @@ -33,9 +33,14 @@ #include "core-fsf/core-isa.h" #include "overlay_tool.h" -static const XtensaConfig fsf __attribute__((unused)) = { +static XtensaConfig fsf __attribute__((unused)) = { .name = "fsf", + .gdb_regmap = { /* GDB for this core is not supported currently */ + .reg = { + XTREG_END + }, + }, .clock_freq_khz = 10000, DEFAULT_SECTIONS }; diff --git a/target-xtensa/cpu.h b/target-xtensa/cpu.h index dfd0d1ceda..b89c60245d 100644 --- a/target-xtensa/cpu.h +++ b/target-xtensa/cpu.h @@ -287,6 +287,7 @@ typedef struct XtensaGdbReg { int targno; int type; int group; + unsigned size; } XtensaGdbReg; typedef struct XtensaGdbRegmap { @@ -336,6 +337,18 @@ typedef struct XtensaConfigList { struct XtensaConfigList *next; } XtensaConfigList; +#ifdef HOST_WORDS_BIGENDIAN +enum { + FP_F32_HIGH, + FP_F32_LOW, +}; +#else +enum { + FP_F32_LOW, + FP_F32_HIGH, +}; +#endif + typedef struct CPUXtensaState { const XtensaConfig *config; uint32_t regs[16]; @@ -343,7 +356,10 @@ typedef struct CPUXtensaState { uint32_t sregs[256]; uint32_t uregs[256]; uint32_t phys_regs[MAX_NAREG]; - float32 fregs[16]; + union { 
+ float32 f32[2]; + float64 f64; + } fregs[16]; float_status fp_status; xtensa_tlb_entry itlb[7][MAX_TLB_WAY_SIZE]; @@ -384,6 +400,7 @@ XtensaCPU *cpu_xtensa_init(const char *cpu_model); void xtensa_translate_init(void); void xtensa_breakpoint_handler(CPUState *cs); int cpu_xtensa_exec(CPUXtensaState *s); +void xtensa_finalize_config(XtensaConfig *config); void xtensa_register_core(XtensaConfigList *node); void check_interrupts(CPUXtensaState *s); void xtensa_irq_init(CPUXtensaState *env); diff --git a/target-xtensa/gdbstub.c b/target-xtensa/gdbstub.c index 9e13b20c46..bc2e1b55f6 100644 --- a/target-xtensa/gdbstub.c +++ b/target-xtensa/gdbstub.c @@ -26,6 +26,7 @@ int xtensa_cpu_gdb_read_register(CPUState *cs, uint8_t *mem_buf, int n) XtensaCPU *cpu = XTENSA_CPU(cs); CPUXtensaState *env = &cpu->env; const XtensaGdbReg *reg = env->config->gdb_regmap.reg + n; + unsigned i; if (n < 0 || n >= env->config->gdb_regmap.num_regs) { return 0; @@ -47,8 +48,16 @@ int xtensa_cpu_gdb_read_register(CPUState *cs, uint8_t *mem_buf, int n) return gdb_get_reg32(mem_buf, env->uregs[reg->targno & 0xff]); case 4: /*f*/ - return gdb_get_reg32(mem_buf, float32_val(env->fregs[reg->targno - & 0x0f])); + i = reg->targno & 0x0f; + switch (reg->size) { + case 4: + return gdb_get_reg32(mem_buf, + float32_val(env->fregs[i].f32[FP_F32_LOW])); + case 8: + return gdb_get_reg64(mem_buf, float64_val(env->fregs[i].f64)); + default: + return 0; + } case 8: /*a*/ return gdb_get_reg32(mem_buf, env->regs[reg->targno & 0x0f]); @@ -92,8 +101,16 @@ int xtensa_cpu_gdb_write_register(CPUState *cs, uint8_t *mem_buf, int n) break; case 4: /*f*/ - env->fregs[reg->targno & 0x0f] = make_float32(tmp); - break; + switch (reg->size) { + case 4: + env->fregs[reg->targno & 0x0f].f32[FP_F32_LOW] = make_float32(tmp); + return 4; + case 8: + env->fregs[reg->targno & 0x0f].f64 = make_float64(tmp); + return 8; + default: + return 0; + } case 8: /*a*/ env->regs[reg->targno & 0x0f] = tmp; diff --git a/target-xtensa/helper.c b/target-xtensa/helper.c index d84d259cf8..76be50d09c 100644 --- a/target-xtensa/helper.c +++ b/target-xtensa/helper.c @@ -51,6 +51,20 @@ static void xtensa_core_class_init(ObjectClass *oc, void *data) cc->gdb_num_core_regs = config->gdb_regmap.num_regs; } +void xtensa_finalize_config(XtensaConfig *config) +{ + unsigned i, n = 0; + + if (config->gdb_regmap.num_regs) { + return; + } + + for (i = 0; config->gdb_regmap.reg[i].targno >= 0; ++i) { + n += (config->gdb_regmap.reg[i].type != 6); + } + config->gdb_regmap.num_regs = n; +} + void xtensa_register_core(XtensaConfigList *node) { TypeInfo type = { diff --git a/target-xtensa/import_core.sh b/target-xtensa/import_core.sh index 73791ec545..351bee41c2 100755 --- a/target-xtensa/import_core.sh +++ b/target-xtensa/import_core.sh @@ -22,8 +22,7 @@ mkdir -p "$TARGET" tar -xf "$OVERLAY" -C "$TARGET" --strip-components=1 \ --xform='s/core/core-isa/' config/core.h tar -xf "$OVERLAY" -O gdb/xtensa-config.c | \ - sed -n '1,/*\//p;/pc/,/a15/p' > "$TARGET"/gdb-config.c -NUM_REGS=$(grep XTREG "$TARGET"/gdb-config.c | wc -l) + sed -n '1,/*\//p;/XTREG/,/XTREG_END/p' > "$TARGET"/gdb-config.c cat <<EOF > "${TARGET}.c" #include "cpu.h" @@ -34,10 +33,9 @@ cat <<EOF > "${TARGET}.c" #include "core-$NAME/core-isa.h" #include "overlay_tool.h" -static const XtensaConfig $NAME __attribute__((unused)) = { +static XtensaConfig $NAME __attribute__((unused)) = { .name = "$NAME", .gdb_regmap = { - .num_regs = $NUM_REGS, .reg = { #include "core-$NAME/gdb-config.c" } diff --git a/target-xtensa/overlay_tool.h 
b/target-xtensa/overlay_tool.h index 6105d4c8ff..eda03aaca9 100644 --- a/target-xtensa/overlay_tool.h +++ b/target-xtensa/overlay_tool.h @@ -27,7 +27,8 @@ #define XTREG(idx, ofs, bi, sz, al, no, flags, cp, typ, grp, name, \ a1, a2, a3, a4, a5, a6) \ - { .targno = (no), .type = (typ), .group = (grp) }, + { .targno = (no), .type = (typ), .group = (grp), .size = (sz) }, +#define XTREG_END { .targno = -1 }, #ifndef XCHAL_HAVE_DIV32 #define XCHAL_HAVE_DIV32 0 @@ -316,6 +317,7 @@ static XtensaConfigList node = { \ .config = &core, \ }; \ + xtensa_finalize_config(&core); \ xtensa_register_core(&node); \ } #else diff --git a/target-xtensa/translate.c b/target-xtensa/translate.c index 86e4849fb6..f2118c24c0 100644 --- a/target-xtensa/translate.c +++ b/target-xtensa/translate.c @@ -228,7 +228,7 @@ void xtensa_translate_init(void) for (i = 0; i < 16; i++) { cpu_FR[i] = tcg_global_mem_new_i32(TCG_AREG0, - offsetof(CPUXtensaState, fregs[i]), + offsetof(CPUXtensaState, fregs[i].f32[FP_F32_LOW]), fregnames[i]); } @@ -3206,8 +3206,9 @@ void xtensa_cpu_dump_state(CPUState *cs, FILE *f, for (i = 0; i < 16; ++i) { cpu_fprintf(f, "F%02d=%08x (%+10.8e)%c", i, - float32_val(env->fregs[i]), - *(float *)&env->fregs[i], (i % 2) == 1 ? '\n' : ' '); + float32_val(env->fregs[i].f32[FP_F32_LOW]), + *(float *)(env->fregs[i].f32 + FP_F32_LOW), + (i % 2) == 1 ? '\n' : ' '); } } } @@ -29,6 +29,8 @@ #include "qemu/bitops.h" #include "tcg-target.h" +#define CPU_TEMP_BUF_NLONGS 128 + /* Default target word size to pointer size. */ #ifndef TCG_TARGET_REG_BITS # if UINTPTR_MAX == UINT32_MAX diff --git a/tests/ahci-test.c b/tests/ahci-test.c index ae9415d74c..87d7691861 100644 --- a/tests/ahci-test.c +++ b/tests/ahci-test.c @@ -228,6 +228,8 @@ static AHCIQState *ahci_boot_and_enable(const char *cli, ...) { AHCIQState *ahci; va_list ap; + uint16_t buff[256]; + uint8_t port; if (cli) { va_start(ap, cli); @@ -239,6 +241,10 @@ static AHCIQState *ahci_boot_and_enable(const char *cli, ...) 
ahci_pci_enable(ahci); ahci_hba_enable(ahci); + /* Initialize test device */ + port = ahci_port_select(ahci); + ahci_port_clear(ahci, port); + ahci_io(ahci, port, CMD_IDENTIFY, &buff, sizeof(buff), 0); return ahci; } @@ -890,21 +896,23 @@ static void ahci_test_io_rw_simple(AHCIQState *ahci, unsigned bufsize, g_free(rx); } -static void ahci_test_nondata(AHCIQState *ahci, uint8_t ide_cmd) +static uint8_t ahci_test_nondata(AHCIQState *ahci, uint8_t ide_cmd) { - uint8_t px; + uint8_t port; AHCICommand *cmd; /* Sanitize */ - px = ahci_port_select(ahci); - ahci_port_clear(ahci, px); + port = ahci_port_select(ahci); + ahci_port_clear(ahci, port); /* Issue Command */ cmd = ahci_command_create(ide_cmd); - ahci_command_commit(ahci, cmd, px); + ahci_command_commit(ahci, cmd, port); ahci_command_issue(ahci, cmd); ahci_command_verify(ahci, cmd); ahci_command_free(cmd); + + return port; } static void ahci_test_flush(AHCIQState *ahci) @@ -912,6 +920,33 @@ static void ahci_test_flush(AHCIQState *ahci) ahci_test_nondata(ahci, CMD_FLUSH_CACHE); } +static void ahci_test_max(AHCIQState *ahci) +{ + RegD2HFIS *d2h = g_malloc0(0x20); + uint64_t nsect; + uint8_t port; + uint8_t cmd; + uint64_t config_sect = TEST_IMAGE_SECTORS - 1; + + if (config_sect > 0xFFFFFF) { + cmd = CMD_READ_MAX_EXT; + } else { + cmd = CMD_READ_MAX; + } + + port = ahci_test_nondata(ahci, cmd); + memread(ahci->port[port].fb + 0x40, d2h, 0x20); + nsect = (uint64_t)d2h->lba_hi[2] << 40 | + (uint64_t)d2h->lba_hi[1] << 32 | + (uint64_t)d2h->lba_hi[0] << 24 | + (uint64_t)d2h->lba_lo[2] << 16 | + (uint64_t)d2h->lba_lo[1] << 8 | + (uint64_t)d2h->lba_lo[0]; + + g_assert_cmphex(nsect, ==, config_sect); + g_free(d2h); +} + /******************************************************************************/ /* Test Interfaces */ @@ -1111,9 +1146,9 @@ static void test_migrate_sanity(void) } /** - * DMA Migration test: Write a pattern, migrate, then read. + * Simple migration test: Write a pattern, migrate, then read. */ -static void test_migrate_dma(void) +static void ahci_migrate_simple(uint8_t cmd_read, uint8_t cmd_write) { AHCIQState *src, *dst; uint8_t px; @@ -1141,9 +1176,9 @@ static void test_migrate_dma(void) } /* Write, migrate, then read. */ - ahci_io(src, px, CMD_WRITE_DMA, tx, bufsize, 0); + ahci_io(src, px, cmd_write, tx, bufsize, 0); ahci_migrate(src, dst, uri); - ahci_io(dst, px, CMD_READ_DMA, rx, bufsize, 0); + ahci_io(dst, px, cmd_read, rx, bufsize, 0); /* Verify pattern */ g_assert_cmphex(memcmp(tx, rx, bufsize), ==, 0); @@ -1154,14 +1189,24 @@ static void test_migrate_dma(void) g_free(tx); } +static void test_migrate_dma(void) +{ + ahci_migrate_simple(CMD_READ_DMA, CMD_WRITE_DMA); +} + +static void test_migrate_ncq(void) +{ + ahci_migrate_simple(READ_FPDMA_QUEUED, WRITE_FPDMA_QUEUED); +} + /** - * DMA Error Test + * Halted IO Error Test * * Simulate an error on first write, Try to write a pattern, * Confirm the VM has stopped, resume the VM, verify command * has completed, then read back the data and verify. 
*/ -static void test_halted_dma(void) +static void ahci_halted_io_test(uint8_t cmd_read, uint8_t cmd_write) { AHCIQState *ahci; uint8_t port; @@ -1196,7 +1241,7 @@ static void test_halted_dma(void) memwrite(ptr, tx, bufsize); /* Attempt to write (and fail) */ - cmd = ahci_guest_io_halt(ahci, port, CMD_WRITE_DMA, + cmd = ahci_guest_io_halt(ahci, port, cmd_write, ptr, bufsize, 0); /* Attempt to resume the command */ @@ -1204,7 +1249,7 @@ static void test_halted_dma(void) ahci_free(ahci, ptr); /* Read back and verify */ - ahci_io(ahci, port, CMD_READ_DMA, rx, bufsize, 0); + ahci_io(ahci, port, cmd_read, rx, bufsize, 0); g_assert_cmphex(memcmp(tx, rx, bufsize), ==, 0); /* Cleanup and go home */ @@ -1213,14 +1258,24 @@ static void test_halted_dma(void) g_free(tx); } +static void test_halted_dma(void) +{ + ahci_halted_io_test(CMD_READ_DMA, CMD_WRITE_DMA); +} + +static void test_halted_ncq(void) +{ + ahci_halted_io_test(READ_FPDMA_QUEUED, WRITE_FPDMA_QUEUED); +} + /** - * DMA Error Migration Test + * IO Error Migration Test * * Simulate an error on first write, Try to write a pattern, * Confirm the VM has stopped, migrate, resume the VM, * verify command has completed, then read back the data and verify. */ -static void test_migrate_halted_dma(void) +static void ahci_migrate_halted_io(uint8_t cmd_read, uint8_t cmd_write) { AHCIQState *src, *dst; uint8_t port; @@ -1266,14 +1321,14 @@ static void test_migrate_halted_dma(void) memwrite(ptr, tx, bufsize); /* Write, trigger the VM to stop, migrate, then resume. */ - cmd = ahci_guest_io_halt(src, port, CMD_WRITE_DMA, + cmd = ahci_guest_io_halt(src, port, cmd_write, ptr, bufsize, 0); ahci_migrate(src, dst, uri); ahci_guest_io_resume(dst, cmd); ahci_free(dst, ptr); /* Read back */ - ahci_io(dst, port, CMD_READ_DMA, rx, bufsize, 0); + ahci_io(dst, port, cmd_read, rx, bufsize, 0); /* Verify TX and RX are identical */ g_assert_cmphex(memcmp(tx, rx, bufsize), ==, 0); @@ -1285,6 +1340,16 @@ static void test_migrate_halted_dma(void) g_free(tx); } +static void test_migrate_halted_dma(void) +{ + ahci_migrate_halted_io(CMD_READ_DMA, CMD_WRITE_DMA); +} + +static void test_migrate_halted_ncq(void) +{ + ahci_migrate_halted_io(READ_FPDMA_QUEUED, WRITE_FPDMA_QUEUED); +} + /** * Migration test: Try to flush, migrate, then resume. 
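The test refactoring above replaces each DMA-specific test body with a helper parameterized on a (read, write) opcode pair, so the NCQ variants reduce to one-line wrappers. A sketch of the pattern with hypothetical opcode constants and a stubbed-out body:

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical ATA opcodes mirroring the pairs used by the tests. */
    enum { RD_DMA = 0xC8, WR_DMA = 0xCA, RD_NCQ = 0x60, WR_NCQ = 0x61 };

    /* One parameterized body: write a pattern, optionally migrate, read
     * it back and compare. */
    static void io_roundtrip(uint8_t cmd_read, uint8_t cmd_write)
    {
        printf("write with 0x%02X, read back with 0x%02X\n",
               cmd_write, cmd_read);
        /* ...ahci_io() write, migrate, ahci_io() read, memcmp()... */
    }

    static void test_dma(void) { io_roundtrip(RD_DMA, WR_DMA); }
    static void test_ncq(void) { io_roundtrip(RD_NCQ, WR_NCQ); }

    int main(void)
    {
        test_dma();
        test_ncq();
        return 0;
    }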
*/ @@ -1334,6 +1399,49 @@ static void test_flush_migrate(void) ahci_shutdown(dst); } +static void test_max(void) +{ + AHCIQState *ahci; + + ahci = ahci_boot_and_enable(NULL); + ahci_test_max(ahci); + ahci_shutdown(ahci); +} + +static void test_reset(void) +{ + AHCIQState *ahci; + int i; + + ahci = ahci_boot(NULL); + ahci_test_pci_spec(ahci); + ahci_pci_enable(ahci); + + for (i = 0; i < 2; i++) { + ahci_test_hba_spec(ahci); + ahci_hba_enable(ahci); + ahci_test_identify(ahci); + ahci_test_io_rw_simple(ahci, 4096, 0, + CMD_READ_DMA_EXT, + CMD_WRITE_DMA_EXT); + ahci_set(ahci, AHCI_GHC, AHCI_GHC_HR); + ahci_clean_mem(ahci); + } + + ahci_shutdown(ahci); +} + +static void test_ncq_simple(void) +{ + AHCIQState *ahci; + + ahci = ahci_boot_and_enable(NULL); + ahci_test_io_rw_simple(ahci, 4096, 0, + READ_FPDMA_QUEUED, + WRITE_FPDMA_QUEUED); + ahci_shutdown(ahci); +} + /******************************************************************************/ /* AHCI I/O Test Matrix Definitions */ @@ -1584,6 +1692,14 @@ int main(int argc, char **argv) qtest_add_func("/ahci/io/dma/lba28/retry", test_halted_dma); qtest_add_func("/ahci/migrate/dma/halted", test_migrate_halted_dma); + qtest_add_func("/ahci/max", test_max); + qtest_add_func("/ahci/reset", test_reset); + + qtest_add_func("/ahci/io/ncq/simple", test_ncq_simple); + qtest_add_func("/ahci/migrate/ncq/simple", test_migrate_ncq); + qtest_add_func("/ahci/io/ncq/retry", test_halted_ncq); + qtest_add_func("/ahci/migrate/ncq/halted", test_migrate_halted_ncq); + ret = g_test_run(); /* Cleanup */ diff --git a/tests/libqos/ahci.c b/tests/libqos/ahci.c index 7e17bb691e..33ecd2abfb 100644 --- a/tests/libqos/ahci.c +++ b/tests/libqos/ahci.c @@ -50,27 +50,47 @@ typedef struct AHCICommandProp { } AHCICommandProp; AHCICommandProp ahci_command_properties[] = { - { .cmd = CMD_READ_PIO, .data = true, .pio = true, - .lba28 = true, .read = true }, - { .cmd = CMD_WRITE_PIO, .data = true, .pio = true, - .lba28 = true, .write = true }, - { .cmd = CMD_READ_PIO_EXT, .data = true, .pio = true, - .lba48 = true, .read = true }, - { .cmd = CMD_WRITE_PIO_EXT, .data = true, .pio = true, - .lba48 = true, .write = true }, - { .cmd = CMD_READ_DMA, .data = true, .dma = true, - .lba28 = true, .read = true }, - { .cmd = CMD_WRITE_DMA, .data = true, .dma = true, - .lba28 = true, .write = true }, - { .cmd = CMD_READ_DMA_EXT, .data = true, .dma = true, - .lba48 = true, .read = true }, - { .cmd = CMD_WRITE_DMA_EXT, .data = true, .dma = true, - .lba48 = true, .write = true }, - { .cmd = CMD_IDENTIFY, .data = true, .pio = true, - .size = 512, .read = true }, - { .cmd = CMD_READ_MAX, .lba28 = true }, - { .cmd = CMD_READ_MAX_EXT, .lba48 = true }, - { .cmd = CMD_FLUSH_CACHE, .data = false } + { .cmd = CMD_READ_PIO, .data = true, .pio = true, + .lba28 = true, .read = true }, + { .cmd = CMD_WRITE_PIO, .data = true, .pio = true, + .lba28 = true, .write = true }, + { .cmd = CMD_READ_PIO_EXT, .data = true, .pio = true, + .lba48 = true, .read = true }, + { .cmd = CMD_WRITE_PIO_EXT, .data = true, .pio = true, + .lba48 = true, .write = true }, + { .cmd = CMD_READ_DMA, .data = true, .dma = true, + .lba28 = true, .read = true }, + { .cmd = CMD_WRITE_DMA, .data = true, .dma = true, + .lba28 = true, .write = true }, + { .cmd = CMD_READ_DMA_EXT, .data = true, .dma = true, + .lba48 = true, .read = true }, + { .cmd = CMD_WRITE_DMA_EXT, .data = true, .dma = true, + .lba48 = true, .write = true }, + { .cmd = CMD_IDENTIFY, .data = true, .pio = true, + .size = 512, .read = true }, + { .cmd = READ_FPDMA_QUEUED, .data = 
true, .dma = true, + .lba48 = true, .read = true, .ncq = true }, + { .cmd = WRITE_FPDMA_QUEUED, .data = true, .dma = true, + .lba48 = true, .write = true, .ncq = true }, + { .cmd = CMD_READ_MAX, .lba28 = true }, + { .cmd = CMD_READ_MAX_EXT, .lba48 = true }, + { .cmd = CMD_FLUSH_CACHE, .data = false } +}; + +struct AHCICommand { + /* Test Management Data */ + uint8_t name; + uint8_t port; + uint8_t slot; + uint32_t interrupts; + uint64_t xbytes; + uint32_t prd_size; + uint64_t buffer; + AHCICommandProp *props; + /* Data to be transferred to the guest */ + AHCICommandHeader header; + RegH2DFIS fis; + void *atapi_cmd; }; /** @@ -138,12 +158,14 @@ void ahci_clean_mem(AHCIQState *ahci) for (port = 0; port < 32; ++port) { if (ahci->port[port].fb) { ahci_free(ahci, ahci->port[port].fb); + ahci->port[port].fb = 0; } if (ahci->port[port].clb) { for (slot = 0; slot < 32; slot++) { ahci_destroy_command(ahci, port, slot); } ahci_free(ahci, ahci->port[port].clb); + ahci->port[port].clb = 0; } } } @@ -252,7 +274,7 @@ void ahci_hba_enable(AHCIQState *ahci) /* Allocate Memory for the Command List Buffer & FIS Buffer */ /* PxCLB space ... 0x20 per command, as in 4.2.2 p 36 */ ahci->port[i].clb = ahci_alloc(ahci, num_cmd_slots * 0x20); - qmemset(ahci->port[i].clb, 0x00, 0x100); + qmemset(ahci->port[i].clb, 0x00, num_cmd_slots * 0x20); g_test_message("CLB: 0x%08" PRIx64, ahci->port[i].clb); ahci_px_wreg(ahci, i, AHCI_PX_CLB, ahci->port[i].clb); g_assert_cmphex(ahci->port[i].clb, ==, @@ -460,13 +482,15 @@ void ahci_port_check_pio_sanity(AHCIQState *ahci, uint8_t port, g_free(pio); } -void ahci_port_check_cmd_sanity(AHCIQState *ahci, uint8_t port, - uint8_t slot, size_t buffsize) +void ahci_port_check_cmd_sanity(AHCIQState *ahci, AHCICommand *cmd) { - AHCICommandHeader cmd; + AHCICommandHeader cmdh; - ahci_get_command_header(ahci, port, slot, &cmd); - g_assert_cmphex(buffsize, ==, cmd.prdbc); + ahci_get_command_header(ahci, cmd->port, cmd->slot, &cmdh); + /* Physical Region Descriptor Byte Count is not required to work for NCQ. */ + if (!cmd->props->ncq) { + g_assert_cmphex(cmd->xbytes, ==, cmdh.prdbc); + } } /* Get the command in #slot of port #port. */ @@ -549,7 +573,7 @@ unsigned ahci_pick_cmd(AHCIQState *ahci, uint8_t port) if (reg & (1 << j)) { continue; } - ahci_destroy_command(ahci, port, i); + ahci_destroy_command(ahci, port, j); ahci->port[port].next = (j + 1) % 32; return j; } @@ -610,22 +634,6 @@ void ahci_guest_io(AHCIQState *ahci, uint8_t port, uint8_t ide_cmd, ahci_command_free(cmd); } -struct AHCICommand { - /* Test Management Data */ - uint8_t name; - uint8_t port; - uint8_t slot; - uint32_t interrupts; - uint64_t xbytes; - uint32_t prd_size; - uint64_t buffer; - AHCICommandProp *props; - /* Data to be transferred to the guest */ - AHCICommandHeader header; - RegH2DFIS fis; - void *atapi_cmd; -}; - static AHCICommandProp *ahci_command_find(uint8_t command_name) { int i; @@ -691,19 +699,34 @@ static void command_header_init(AHCICommand *cmd) static void command_table_init(AHCICommand *cmd) { RegH2DFIS *fis = &(cmd->fis); + uint16_t sect_count = (cmd->xbytes / AHCI_SECTOR_SIZE); fis->fis_type = REG_H2D_FIS; fis->flags = REG_H2D_FIS_CMD; /* "Command" bit */ fis->command = cmd->name; - cmd->fis.feature_low = 0x00; - cmd->fis.feature_high = 0x00; - if (cmd->props->lba28 || cmd->props->lba48) { - cmd->fis.device = ATA_DEVICE_LBA; + + if (cmd->props->ncq) { + NCQFIS *ncqfis = (NCQFIS *)fis; + /* NCQ is weird and re-uses FIS frames for unrelated data. 
 
 AHCICommand *ahci_command_create(uint8_t command_name)
@@ -717,6 +740,7 @@ AHCICommand *ahci_command_create(uint8_t command_name)
     g_assert(!(props->lba28 && props->lba48));
     g_assert(!(props->read && props->write));
     g_assert(!props->size || props->data);
+    g_assert(!props->ncq || (props->ncq && props->lba48));
 
     /* Defaults and book-keeping */
     cmd->props = props;
@@ -725,12 +749,15 @@ AHCICommand *ahci_command_create(uint8_t command_name)
     cmd->prd_size = 4096;
     cmd->buffer = 0xabad1dea;
 
-    cmd->interrupts = AHCI_PX_IS_DHRS;
+    if (!cmd->props->ncq) {
+        cmd->interrupts = AHCI_PX_IS_DHRS;
+    }
     /* BUG: We expect the DPS interrupt for data commands */
     /* cmd->interrupts |= props->data ? AHCI_PX_IS_DPS : 0; */
     /* BUG: We expect the DMA Setup interrupt for DMA commands */
     /* cmd->interrupts |= props->dma ? AHCI_PX_IS_DSS : 0; */
     cmd->interrupts |= props->pio ? AHCI_PX_IS_PSS : 0;
+    cmd->interrupts |= props->ncq ? AHCI_PX_IS_SDBS : 0;
 
     command_header_init(cmd);
     command_table_init(cmd);
@@ -758,7 +785,7 @@ void ahci_command_set_offset(AHCICommand *cmd, uint64_t lba_sect)
     RegH2DFIS *fis = &(cmd->fis);
     if (cmd->props->lba28) {
         g_assert_cmphex(lba_sect, <=, 0xFFFFFFF);
-    } else if (cmd->props->lba48) {
+    } else if (cmd->props->lba48 || cmd->props->ncq) {
         g_assert_cmphex(lba_sect, <=, 0xFFFFFFFFFFFF);
     } else {
         /* Can't set offset if we don't know the format. */
@@ -785,6 +812,8 @@ void ahci_command_set_buffer(AHCICommand *cmd, uint64_t buffer)
 void ahci_command_set_sizes(AHCICommand *cmd, uint64_t xbytes,
                             unsigned prd_size)
 {
+    uint16_t sect_count;
+
     /* Each PRD can describe up to 4MiB, and must not be odd. */
     g_assert_cmphex(prd_size, <=, 4096 * 1024);
     g_assert_cmphex(prd_size & 0x01, ==, 0x00);
@@ -792,7 +821,15 @@ void ahci_command_set_sizes(AHCICommand *cmd, uint64_t xbytes,
         cmd->prd_size = prd_size;
     }
     cmd->xbytes = xbytes;
-    cmd->fis.count = (cmd->xbytes / AHCI_SECTOR_SIZE);
+    sect_count = (cmd->xbytes / AHCI_SECTOR_SIZE);
+
+    if (cmd->props->ncq) {
+        NCQFIS *nfis = (NCQFIS *)&(cmd->fis);
+        nfis->sector_low = sect_count & 0xFF;
+        nfis->sector_hi = (sect_count >> 8) & 0xFF;
+    } else {
+        cmd->fis.count = sect_count;
+    }
 
     cmd->header.prdtl = size_to_prdtl(cmd->xbytes, cmd->prd_size);
 }
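
ahci_command_set_sizes() delegates the PRD table length to size_to_prdtl(),
which is defined elsewhere in ahci.c and not shown in this hunk. Assuming it
is the obvious ceiling division, it would look roughly like:

    #include <stdint.h>

    /* Hypothetical stand-in for size_to_prdtl(): the number of PRD
     * entries of prd_size bytes needed to cover an xbytes transfer.
     * The real helper may differ in detail. */
    static uint32_t size_to_prdtl_sketch(uint64_t xbytes, uint32_t prd_size)
    {
        return (uint32_t)((xbytes + prd_size - 1) / prd_size);
    }

With the default prd_size of 4096 set in ahci_command_create(), a 4096-byte
transfer therefore commits a command header with PRDTL = 1.
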
@@ -824,6 +861,11 @@ void ahci_command_commit(AHCIQState *ahci, AHCICommand *cmd, uint8_t port)
     cmd->port = port;
     cmd->slot = ahci_pick_cmd(ahci, port);
 
+    if (cmd->props->ncq) {
+        NCQFIS *nfis = (NCQFIS *)&cmd->fis;
+        nfis->tag = (cmd->slot << 3) & 0xFC;
+    }
+
     /* Create a buffer for the command table */
     prdtl = size_to_prdtl(cmd->xbytes, cmd->prd_size);
     table_size = CMD_TBL_SIZ(prdtl);
@@ -878,11 +920,15 @@ void ahci_command_wait(AHCIQState *ahci, AHCICommand *cmd)
     /* We can't rely on STS_BSY until the command has started processing.
      * Therefore, we also use the Command Issue bit as indication of
      * a command in-flight. */
-    while (BITSET(ahci_px_rreg(ahci, cmd->port, AHCI_PX_TFD),
-                  AHCI_PX_TFD_STS_BSY) ||
-           BITSET(ahci_px_rreg(ahci, cmd->port, AHCI_PX_CI), (1 << cmd->slot))) {
+
+#define RSET(REG, MASK) (BITSET(ahci_px_rreg(ahci, cmd->port, (REG)), (MASK)))
+
+    while (RSET(AHCI_PX_TFD, AHCI_PX_TFD_STS_BSY) ||
+           RSET(AHCI_PX_CI, 1 << cmd->slot) ||
+           (cmd->props->ncq && RSET(AHCI_PX_SACT, 1 << cmd->slot))) {
         usleep(50);
     }
+
 }
 
 void ahci_command_issue(AHCIQState *ahci, AHCICommand *cmd)
@@ -899,8 +945,10 @@ void ahci_command_verify(AHCIQState *ahci, AHCICommand *cmd)
     ahci_port_check_error(ahci, port);
     ahci_port_check_interrupts(ahci, port, cmd->interrupts);
     ahci_port_check_nonbusy(ahci, port, slot);
-    ahci_port_check_cmd_sanity(ahci, port, slot, cmd->xbytes);
-    ahci_port_check_d2h_sanity(ahci, port, slot);
+    ahci_port_check_cmd_sanity(ahci, cmd);
+    if (cmd->interrupts & AHCI_PX_IS_DHRS) {
+        ahci_port_check_d2h_sanity(ahci, port, slot);
+    }
     if (cmd->props->pio) {
         ahci_port_check_pio_sanity(ahci, port, slot, cmd->xbytes);
     }
diff --git a/tests/libqos/ahci.h b/tests/libqos/ahci.h
index 779e812400..a08a9ddac1 100644
--- a/tests/libqos/ahci.h
+++ b/tests/libqos/ahci.h
@@ -263,20 +263,23 @@ enum {
 /* ATA Commands */
 enum {
     /* DMA */
-    CMD_READ_DMA      = 0xC8,
-    CMD_READ_DMA_EXT  = 0x25,
-    CMD_WRITE_DMA     = 0xCA,
-    CMD_WRITE_DMA_EXT = 0x35,
+    CMD_READ_DMA       = 0xC8,
+    CMD_READ_DMA_EXT   = 0x25,
+    CMD_WRITE_DMA      = 0xCA,
+    CMD_WRITE_DMA_EXT  = 0x35,
     /* PIO */
-    CMD_READ_PIO      = 0x20,
-    CMD_READ_PIO_EXT  = 0x24,
-    CMD_WRITE_PIO     = 0x30,
-    CMD_WRITE_PIO_EXT = 0x34,
+    CMD_READ_PIO       = 0x20,
+    CMD_READ_PIO_EXT   = 0x24,
+    CMD_WRITE_PIO      = 0x30,
+    CMD_WRITE_PIO_EXT  = 0x34,
     /* Misc */
-    CMD_READ_MAX      = 0xF8,
-    CMD_READ_MAX_EXT  = 0x27,
-    CMD_FLUSH_CACHE   = 0xE7,
-    CMD_IDENTIFY      = 0xEC
+    CMD_READ_MAX       = 0xF8,
+    CMD_READ_MAX_EXT   = 0x27,
+    CMD_FLUSH_CACHE    = 0xE7,
+    CMD_IDENTIFY       = 0xEC,
+    /* NCQ */
+    READ_FPDMA_QUEUED  = 0x60,
+    WRITE_FPDMA_QUEUED = 0x61,
 };
 
 /* AHCI Command Header Flags & Masks */
@@ -291,8 +294,9 @@ enum {
 #define CMDH_PMP   (0xF000)
 
 /* ATA device register masks */
-#define ATA_DEVICE_MAGIC 0xA0
+#define ATA_DEVICE_MAGIC 0xA0 /* used in ata1-3 */
 #define ATA_DEVICE_LBA   0x40
+#define NCQ_DEVICE_MAGIC 0x40 /* for ncq device registers */
 #define ATA_DEVICE_DRIVE 0x10
 #define ATA_DEVICE_HEAD  0x0F
 
@@ -397,6 +401,32 @@ typedef struct RegH2DFIS {
 } __attribute__((__packed__)) RegH2DFIS;
 
 /**
+ * Register host-to-device FIS structure, for NCQ commands.
+ * Actually just a RegH2DFIS, but with fields repurposed.
+ * Repurposed fields are annotated below.
+ */
+typedef struct NCQFIS {
+    /* DW0 */
+    uint8_t fis_type;
+    uint8_t flags;
+    uint8_t command;
+    uint8_t sector_low; /* H2D: Feature 7:0 */
+    /* DW1 */
+    uint8_t lba_lo[3];
+    uint8_t device;
+    /* DW2 */
+    uint8_t lba_hi[3];
+    uint8_t sector_hi;  /* H2D: Feature 15:8 */
+    /* DW3 */
+    uint8_t tag;        /* H2D: Count 7:0 */
+    uint8_t prio;       /* H2D: Count 15:8 */
+    uint8_t icc;
+    uint8_t control;
+    /* DW4 */
+    uint8_t aux[4];
+} __attribute__((__packed__)) NCQFIS;
+
+/**
  * Command List entry structure.
  * The command list contains between 1-32 of these structures.
  */
@@ -512,8 +542,7 @@ void ahci_port_check_nonbusy(AHCIQState *ahci, uint8_t port, uint8_t slot);
 void ahci_port_check_d2h_sanity(AHCIQState *ahci, uint8_t port, uint8_t slot);
 void ahci_port_check_pio_sanity(AHCIQState *ahci, uint8_t port,
                                 uint8_t slot, size_t buffsize);
-void ahci_port_check_cmd_sanity(AHCIQState *ahci, uint8_t port,
-                                uint8_t slot, size_t buffsize);
+void ahci_port_check_cmd_sanity(AHCIQState *ahci, AHCICommand *cmd);
 void ahci_get_command_header(AHCIQState *ahci, uint8_t port,
                              uint8_t slot, AHCICommandHeader *cmd);
 void ahci_set_command_header(AHCIQState *ahci, uint8_t port,
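
Casts like (NCQFIS *)&cmd->fis in ahci.c rely on NCQFIS and RegH2DFIS being
the same size, since NCQFIS merely relabels fields (both are packed, 5 DWs,
20 bytes). A compile-time check along these lines, hypothetical and not part
of the patch, would catch accidental divergence; in-tree code could use
QEMU_BUILD_BUG_ON for the same effect:

    /* If a field is ever added to one struct and not the other,
     * this fails to compile (C11). */
    _Static_assert(sizeof(NCQFIS) == sizeof(RegH2DFIS),
                   "NCQ FIS must overlay the H2D register FIS exactly");
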
diff --git a/trace-events b/trace-events
index 52b7efa9a4..d24d80aed2 100644
--- a/trace-events
+++ b/trace-events
@@ -1594,6 +1594,7 @@ vfio_platform_intp_interrupt(int pin, int fd) "Inject IRQ #%d (fd = %d)"
 vfio_platform_intp_inject_pending_lockheld(int pin, int fd) "Inject pending IRQ #%d (fd = %d)"
 vfio_platform_populate_interrupts(int pin, int count, int flags) "- IRQ index %d: count %d, flags=0x%x"
 vfio_intp_interrupt_set_pending(int index) "irq %d is set PENDING"
+vfio_platform_start_irqfd_injection(int index, int fd, int resamplefd) "IRQ index=%d, fd = %d, resamplefd = %d"
 
 #hw/acpi/memory_hotplug.c
 mhp_acpi_invalid_slot_selected(uint32_t slot) "0x%"PRIx32
diff --git a/translate-all.c b/translate-all.c
index b6b0e1c098..50d53fdac0 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -58,6 +58,7 @@
 #endif
 
 #include "exec/cputlb.h"
+#include "exec/tb-hash.h"
 #include "translate-all.h"
 #include "qemu/bitmap.h"
 #include "qemu/timer.h"
@@ -117,6 +118,7 @@ typedef struct PageDesc {
 #define V_L1_SHIFT (L1_MAP_ADDR_SPACE_BITS - TARGET_PAGE_BITS - V_L1_BITS)
 
 uintptr_t qemu_real_host_page_size;
+uintptr_t qemu_real_host_page_mask;
 uintptr_t qemu_host_page_size;
 uintptr_t qemu_host_page_mask;
 
@@ -306,6 +308,7 @@ void page_size_init(void)
     /* NOTE: we can always suppose that qemu_host_page_size >=
       TARGET_PAGE_SIZE */
     qemu_real_host_page_size = getpagesize();
+    qemu_real_host_page_mask = ~(qemu_real_host_page_size - 1);
     if (qemu_host_page_size == 0) {
         qemu_host_page_size = qemu_real_host_page_size;
     }
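
The new qemu_real_host_page_mask mirrors the existing qemu_host_page_mask:
for a power-of-two page size, ~(size - 1) keeps the page-number bits and
clears the offset bits. A small worked example, standalone and illustrative
only:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uintptr_t page_size = 4096;             /* typical getpagesize() value */
        uintptr_t page_mask = ~(page_size - 1); /* clears the low 12 bits */

        assert((0x12345678u & page_mask) == 0x12345000u);
        return 0;
    }
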
diff --git a/util/cutils.c b/util/cutils.c
index 144b25c05a..5d1c9ebe05 100644
--- a/util/cutils.c
+++ b/util/cutils.c
@@ -207,13 +207,13 @@ size_t buffer_find_nonzero_offset(const void *buf, size_t len)
     for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
          i < len / sizeof(VECTYPE);
          i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
-        VECTYPE tmp0 = p[i + 0] | p[i + 1];
-        VECTYPE tmp1 = p[i + 2] | p[i + 3];
-        VECTYPE tmp2 = p[i + 4] | p[i + 5];
-        VECTYPE tmp3 = p[i + 6] | p[i + 7];
-        VECTYPE tmp01 = tmp0 | tmp1;
-        VECTYPE tmp23 = tmp2 | tmp3;
-        if (!ALL_EQ(tmp01 | tmp23, zero)) {
+        VECTYPE tmp0 = VEC_OR(p[i + 0], p[i + 1]);
+        VECTYPE tmp1 = VEC_OR(p[i + 2], p[i + 3]);
+        VECTYPE tmp2 = VEC_OR(p[i + 4], p[i + 5]);
+        VECTYPE tmp3 = VEC_OR(p[i + 6], p[i + 7]);
+        VECTYPE tmp01 = VEC_OR(tmp0, tmp1);
+        VECTYPE tmp23 = VEC_OR(tmp2, tmp3);
+        if (!ALL_EQ(VEC_OR(tmp01, tmp23), zero)) {
             break;
         }
     }
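
Replacing the infix | operator with a VEC_OR() macro suggests that not every
supported compiler accepts | on the vector type (intrinsics-only toolchains
need _mm_or_si128() for __m128i, for example). Plausible per-target
definitions, assumed here rather than taken from this patch, might be:

    /* Hypothetical definitions; the VECTYPE/VEC_OR names match cutils.c,
     * but these bodies are illustrative only. */
    #ifdef __ALTIVEC__
    #include <altivec.h>
    #define VECTYPE        vector unsigned char
    #define VEC_OR(v1, v2) ((v1) | (v2))
    #elif defined(__SSE2__)
    #include <emmintrin.h>
    #define VECTYPE        __m128i
    #define VEC_OR(v1, v2) (_mm_or_si128(v1, v2))
    #else
    #define VECTYPE        unsigned long
    #define VEC_OR(v1, v2) ((v1) | (v2))
    #endif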