From 158c87e5de4f34840bf8115789f09806e7e14b94 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Mon, 4 Jul 2016 09:20:12 +1000 Subject: ppc: Fix xsrdpi, xvrdpi and xvrspi rounding xsrdpi, xvrdpi and xvrspi use the round ties away method, not round nearest even. Signed-off-by: Anton Blanchard Signed-off-by: David Gibson --- target-ppc/fpu_helper.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/target-ppc/fpu_helper.c b/target-ppc/fpu_helper.c index 4ef893be2c..d9795d04d0 100644 --- a/target-ppc/fpu_helper.c +++ b/target-ppc/fpu_helper.c @@ -2689,19 +2689,19 @@ void helper_##op(CPUPPCState *env, uint32_t opcode) \ helper_float_check_status(env); \ } -VSX_ROUND(xsrdpi, 1, float64, VsrD(0), float_round_nearest_even, 1) +VSX_ROUND(xsrdpi, 1, float64, VsrD(0), float_round_ties_away, 1) VSX_ROUND(xsrdpic, 1, float64, VsrD(0), FLOAT_ROUND_CURRENT, 1) VSX_ROUND(xsrdpim, 1, float64, VsrD(0), float_round_down, 1) VSX_ROUND(xsrdpip, 1, float64, VsrD(0), float_round_up, 1) VSX_ROUND(xsrdpiz, 1, float64, VsrD(0), float_round_to_zero, 1) -VSX_ROUND(xvrdpi, 2, float64, VsrD(i), float_round_nearest_even, 0) +VSX_ROUND(xvrdpi, 2, float64, VsrD(i), float_round_ties_away, 0) VSX_ROUND(xvrdpic, 2, float64, VsrD(i), FLOAT_ROUND_CURRENT, 0) VSX_ROUND(xvrdpim, 2, float64, VsrD(i), float_round_down, 0) VSX_ROUND(xvrdpip, 2, float64, VsrD(i), float_round_up, 0) VSX_ROUND(xvrdpiz, 2, float64, VsrD(i), float_round_to_zero, 0) -VSX_ROUND(xvrspi, 4, float32, VsrW(i), float_round_nearest_even, 0) +VSX_ROUND(xvrspi, 4, float32, VsrW(i), float_round_ties_away, 0) VSX_ROUND(xvrspic, 4, float32, VsrW(i), FLOAT_ROUND_CURRENT, 0) VSX_ROUND(xvrspim, 4, float32, VsrW(i), float_round_down, 0) VSX_ROUND(xvrspip, 4, float32, VsrW(i), float_round_up, 0) -- cgit v1.2.3 From 7093645a843e5da1a750bc451dd8c9107d595c61 Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Fri, 1 Jul 2016 10:44:39 +0530 Subject: spapr: Ensure thread0 of CPU core is always realized first During CPU core realization, we create all the thread objects and parent them to the core object in a loop. However, the realization of thread objects is done separately by walking the threads of a core using object_child_foreach(). With this, there is no guarantee on the order in which the child thread objects get realized. Since CPU device tree properties are currently derived from the CPU thread object, we assume thread0 of the core to be the representative thread of the core when creating device tree properties for the core. If thread0 is not the first thread that gets realized, then we would end up having an incorrect dt_id for the core and this causes hotplug failures from the guest. Fix this by realizing each thread object by walking the core's thread object list thereby ensuring that thread0 and other threads are always realized in the correct order. Future TODO: CPU DT nodes are per-core properties and we should ideally base the creation of CPU DT nodes on core objects rather than the thread objects. 
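A note on the underlying hazard: object_child_foreach() makes no promise about visiting order, so nothing guaranteed that thread[0] was realized first. A minimal standalone GLib sketch of the general problem (illustrative keys, not QEMU code) - hash-table iteration order need not match insertion order:

#include <glib.h>
#include <stdio.h>

static void visit(gpointer key, gpointer value, gpointer opaque)
{
    printf("visited %s\n", (const char *)key);
}

int main(void)
{
    GHashTable *children = g_hash_table_new(g_str_hash, g_str_equal);

    /* Inserted in order thread[0]..thread[3]... */
    g_hash_table_insert(children, "thread[0]", NULL);
    g_hash_table_insert(children, "thread[1]", NULL);
    g_hash_table_insert(children, "thread[2]", NULL);
    g_hash_table_insert(children, "thread[3]", NULL);

    /* ...but g_hash_table_foreach() walks in hash order, so there is
     * no guarantee that "thread[0]" is printed first. */
    g_hash_table_foreach(children, visit, NULL);

    g_hash_table_destroy(children);
    return 0;
}

The indexed loop in the patch below sidesteps the question entirely by never delegating the iteration.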
Signed-off-by: Bharata B Rao Reviewed-by: Greg Kurz Signed-off-by: David Gibson --- hw/ppc/spapr_cpu_core.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/hw/ppc/spapr_cpu_core.c b/hw/ppc/spapr_cpu_core.c index a384db5204..70b6b0b5ee 100644 --- a/hw/ppc/spapr_cpu_core.c +++ b/hw/ppc/spapr_cpu_core.c @@ -259,9 +259,9 @@ out: error_propagate(errp, local_err); } -static int spapr_cpu_core_realize_child(Object *child, void *opaque) +static void spapr_cpu_core_realize_child(Object *child, Error **errp) { - Error **errp = opaque, *local_err = NULL; + Error *local_err = NULL; sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); CPUState *cs = CPU(child); PowerPCCPU *cpu = POWERPC_CPU(cs); @@ -269,15 +269,14 @@ static int spapr_cpu_core_realize_child(Object *child, void *opaque) object_property_set_bool(child, true, "realized", &local_err); if (local_err) { error_propagate(errp, local_err); - return 1; + return; } spapr_cpu_init(spapr, cpu, &local_err); if (local_err) { error_propagate(errp, local_err); - return 1; + return; } - return 0; } static void spapr_cpu_core_realize(DeviceState *dev, Error **errp) @@ -287,13 +286,13 @@ static void spapr_cpu_core_realize(DeviceState *dev, Error **errp) const char *typename = object_class_get_name(sc->cpu_class); size_t size = object_type_get_instance_size(typename); Error *local_err = NULL; - Object *obj; - int i; + void *obj; + int i, j; sc->threads = g_malloc0(size * cc->nr_threads); for (i = 0; i < cc->nr_threads; i++) { char id[32]; - void *obj = sc->threads + i * size; + obj = sc->threads + i * size; object_initialize(obj, size, typename); snprintf(id, sizeof(id), "thread[%d]", i); @@ -303,12 +302,16 @@ static void spapr_cpu_core_realize(DeviceState *dev, Error **errp) } object_unref(obj); } - object_child_foreach(OBJECT(dev), spapr_cpu_core_realize_child, &local_err); - if (local_err) { - goto err; - } else { - return; + + for (j = 0; j < cc->nr_threads; j++) { + obj = sc->threads + j * size; + + spapr_cpu_core_realize_child(obj, &local_err); + if (local_err) { + goto err; + } } + return; err: while (--i >= 0) { -- cgit v1.2.3 From c4e6c42353fe735add45b790f8d3a323590f7cab Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Sat, 2 Jul 2016 00:41:32 +0200 Subject: ppc: simplify max_smt initialization in ppc_cpu_realizefn() kvmppc_smt_threads() returns 1 if KVM is not enabled. Signed-off-by: Greg Kurz Signed-off-by: David Gibson --- target-ppc/translate_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target-ppc/translate_init.c b/target-ppc/translate_init.c index 843f19b748..a06bf50b65 100644 --- a/target-ppc/translate_init.c +++ b/target-ppc/translate_init.c @@ -9516,7 +9516,7 @@ static void ppc_cpu_realizefn(DeviceState *dev, Error **errp) PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu); Error *local_err = NULL; #if !defined(CONFIG_USER_ONLY) - int max_smt = kvm_enabled() ? kvmppc_smt_threads() : 1; + int max_smt = kvmppc_smt_threads(); #endif #if !defined(CONFIG_USER_ONLY) -- cgit v1.2.3 From 606b54986df4e3964eee2d74460bd06ed2f384e5 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 4 Jul 2016 13:33:03 +1000 Subject: spapr_iommu: Realloc guest visible TCE table when starting/stopping listening The sPAPR TCE tables manage two copies when VFIO is using an IOMMU - a guest view of the table and a hardware TCE table. If there is no VFIO presence in the address space, then just the guest view is used; in this case, it is allocated in KVM.
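For orientation: a TCE table is conceptually a flat array of 64-bit entries indexed by IO bus address (IOBA). A hedged sketch of what the guest-view lookup amounts to; the struct fields and permission mask below are illustrative, not QEMU's:

#include <stdint.h>

#define TCE_RW_MASK 0x3ULL                /* low bits: read/write permission */

typedef struct {
    uint64_t *table;                      /* guest view: one entry per IO page */
    uint32_t nb_table;                    /* number of entries */
    uint32_t page_shift;                  /* IO page size, e.g. 12 for 4K */
} TceTable;

/* Translate an IO bus address to a guest physical address, or -1. */
static uint64_t tce_translate(const TceTable *t, uint64_t ioba)
{
    uint64_t page_mask = (1ULL << t->page_shift) - 1;
    uint64_t index = ioba >> t->page_shift;
    uint64_t tce;

    if (index >= t->nb_table) {
        return (uint64_t)-1;
    }
    tce = t->table[index];
    if (!(tce & TCE_RW_MASK)) {
        return (uint64_t)-1;              /* entry not mapped */
    }
    /* page frame from the entry, byte offset from the original address */
    return (tce & ~page_mask) | (ioba & page_mask);
}

The real lookup is spapr_tce_translate_iommu(), visible as the .translate callback in the diff below; this patch is only about where that backing array lives (KVM or userspace).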
However, since there is no support yet for VFIO in KVM TCE hypercalls, when we start using VFIO, we need to move the guest view from KVM to userspace; and we need to do this for every IOMMU on a bus with VFIO devices. This implements the callbacks for the sPAPR IOMMU: notify_started() reallocates the guest view to userspace, notify_stopped() does the opposite. This removes the explicit spapr_tce_set_need_vfio() call from the PCI hotplug path as the new callbacks do this better - they notify the IOMMU at the exact moment when the configuration is changed, and this also includes the case of PCI hot unplug. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Acked-by: Alex Williamson Signed-off-by: David Gibson --- hw/ppc/spapr_iommu.c | 12 ++++++++++++ hw/ppc/spapr_pci.c | 6 ------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c index e230bacae1..d57b05d5c0 100644 --- a/hw/ppc/spapr_iommu.c +++ b/hw/ppc/spapr_iommu.c @@ -156,6 +156,16 @@ static uint64_t spapr_tce_get_min_page_size(MemoryRegion *iommu) return 1ULL << tcet->page_shift; } +static void spapr_tce_notify_started(MemoryRegion *iommu) +{ + spapr_tce_set_need_vfio(container_of(iommu, sPAPRTCETable, iommu), true); +} + +static void spapr_tce_notify_stopped(MemoryRegion *iommu) +{ + spapr_tce_set_need_vfio(container_of(iommu, sPAPRTCETable, iommu), false); +} + static int spapr_tce_table_post_load(void *opaque, int version_id) { sPAPRTCETable *tcet = SPAPR_TCE_TABLE(opaque); @@ -236,6 +246,8 @@ static const VMStateDescription vmstate_spapr_tce_table = { static MemoryRegionIOMMUOps spapr_iommu_ops = { .translate = spapr_tce_translate_iommu, .get_min_page_size = spapr_tce_get_min_page_size, + .notify_started = spapr_tce_notify_started, + .notify_stopped = spapr_tce_notify_stopped, }; static int spapr_tce_table_realize(DeviceState *dev) diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c index 8c1e6b17c3..cbb7cdd774 100644 --- a/hw/ppc/spapr_pci.c +++ b/hw/ppc/spapr_pci.c @@ -1087,12 +1087,6 @@ static void spapr_phb_add_pci_device(sPAPRDRConnector *drc, void *fdt = NULL; int fdt_start_offset = 0, fdt_size; - if (object_dynamic_cast(OBJECT(pdev), "vfio-pci")) { - sPAPRTCETable *tcet = spapr_tce_find_by_liobn(phb->dma_liobn); - - spapr_tce_set_need_vfio(tcet, true); - } - fdt = create_device_tree(&fdt_size); fdt_start_offset = spapr_create_pci_child_dt(phb, pdev, fdt, 0); if (!fdt_start_offset) { -- cgit v1.2.3 From 318f67ce13710a09c6dcf34da7b6b0ebc845c5c9 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 4 Jul 2016 13:33:04 +1000 Subject: vfio: spapr: Add DMA memory preregistering (SPAPR IOMMU v2) This makes use of the new "memory registering" feature. The idea is to give userspace the ability to notify the host kernel about pages which are going to be used for DMA. Having this information, the host kernel can pin them all once per user process, do locked pages accounting (once) and not spend time on doing that in real time with possible failures which cannot be handled nicely in some cases. This adds a prereg memory listener which listens on address_space_memory and notifies a VFIO container about memory which needs to be pinned/unpinned. VFIO MMIO regions (i.e. "skip dump" regions) are skipped. The feature is only enabled for SPAPR IOMMU v2. Host kernel changes are required. Since v2 does not need/support VFIO_IOMMU_ENABLE, this does not call it when v2 is detected and enabled.
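The kernel interface behind this is a pair of ioctls on the container fd. A minimal sketch of the register side, assuming an already-open v2-capable container (error handling elided; the full listener is in the diff below):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Preregister a userspace address range for DMA: the host kernel pins
 * the pages and does the locked-memory accounting once, up front. */
static int prereg_range(int container_fd, void *vaddr, uint64_t size)
{
    struct vfio_iommu_spapr_register_memory reg = {
        .argsz = sizeof(reg),
        .flags = 0,
        .vaddr = (uintptr_t)vaddr,
        .size = size,
    };

    /* the matching teardown uses VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY */
    return ioctl(container_fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
}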
This enforces guest RAM blocks to be host page size aligned; however this is not new as KVM already requires memory slots to be host page size aligned. Signed-off-by: Alexey Kardashevskiy [dwg: Fix compile error on 32-bit host] Signed-off-by: David Gibson --- hw/vfio/Makefile.objs | 1 + hw/vfio/common.c | 40 +++++++++--- hw/vfio/spapr.c | 139 ++++++++++++++++++++++++++++++++++++++++++ hw/vfio/trace-events | 6 ++ include/hw/vfio/vfio-common.h | 4 ++ 5 files changed, 181 insertions(+), 9 deletions(-) create mode 100644 hw/vfio/spapr.c diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs index ceddbb8f99..c25e32b029 100644 --- a/hw/vfio/Makefile.objs +++ b/hw/vfio/Makefile.objs @@ -4,4 +4,5 @@ obj-$(CONFIG_PCI) += pci.o pci-quirks.o obj-$(CONFIG_SOFTMMU) += platform.o obj-$(CONFIG_SOFTMMU) += calxeda-xgmac.o obj-$(CONFIG_SOFTMMU) += amd-xgbe.o +obj-$(CONFIG_SOFTMMU) += spapr.o endif diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 7be638e0e3..46381e6242 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -503,6 +503,9 @@ static const MemoryListener vfio_memory_listener = { static void vfio_listener_release(VFIOContainer *container) { memory_listener_unregister(&container->listener); + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { + memory_listener_unregister(&container->prereg_listener); + } } static struct vfio_info_cap_header * @@ -861,8 +864,8 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) goto free_container_exit; } - ret = ioctl(fd, VFIO_SET_IOMMU, - v2 ? VFIO_TYPE1v2_IOMMU : VFIO_TYPE1_IOMMU); + container->iommu_type = v2 ? VFIO_TYPE1v2_IOMMU : VFIO_TYPE1_IOMMU; + ret = ioctl(fd, VFIO_SET_IOMMU, container->iommu_type); if (ret) { error_report("vfio: failed to set iommu for container: %m"); ret = -errno; @@ -887,8 +890,10 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) if ((ret == 0) && (info.flags & VFIO_IOMMU_INFO_PGSIZES)) { container->iova_pgsizes = info.iova_pgsizes; } - } else if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU)) { + } else if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU) || + ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_v2_IOMMU)) { struct vfio_iommu_spapr_tce_info info; + bool v2 = !!ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_v2_IOMMU); ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd); if (ret) { @@ -896,7 +901,9 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) ret = -errno; goto free_container_exit; } - ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU); + container->iommu_type = + v2 ? VFIO_SPAPR_TCE_v2_IOMMU : VFIO_SPAPR_TCE_IOMMU; + ret = ioctl(fd, VFIO_SET_IOMMU, container->iommu_type); if (ret) { error_report("vfio: failed to set iommu for container: %m"); ret = -errno; @@ -908,11 +915,23 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) * when container fd is closed so we do not call it explicitly * in this file. 
*/ - ret = ioctl(fd, VFIO_IOMMU_ENABLE); - if (ret) { - error_report("vfio: failed to enable container: %m"); - ret = -errno; - goto free_container_exit; + if (!v2) { + ret = ioctl(fd, VFIO_IOMMU_ENABLE); + if (ret) { + error_report("vfio: failed to enable container: %m"); + ret = -errno; + goto free_container_exit; + } + } else { + container->prereg_listener = vfio_prereg_listener; + + memory_listener_register(&container->prereg_listener, + &address_space_memory); + if (container->error) { + memory_listener_unregister(&container->prereg_listener); + error_report("vfio: RAM memory listener initialization failed for container"); + goto free_container_exit; + } } /* @@ -925,6 +944,9 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) if (ret) { error_report("vfio: VFIO_IOMMU_SPAPR_TCE_GET_INFO failed: %m"); ret = -errno; + if (v2) { + memory_listener_unregister(&container->prereg_listener); + } goto free_container_exit; } container->min_iova = info.dma32_window_start; diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c new file mode 100644 index 0000000000..7d64443b7f --- /dev/null +++ b/hw/vfio/spapr.c @@ -0,0 +1,139 @@ +/* + * DMA memory preregistration + * + * Authors: + * Alexey Kardashevskiy + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include +#include + +#include "hw/vfio/vfio-common.h" +#include "hw/hw.h" +#include "qemu/error-report.h" +#include "trace.h" + +static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section) +{ + if (memory_region_is_iommu(section->mr)) { + hw_error("Cannot possibly preregister IOMMU memory"); + } + + return !memory_region_is_ram(section->mr) || + memory_region_is_skip_dump(section->mr); +} + +static void *vfio_prereg_gpa_to_vaddr(MemoryRegionSection *section, hwaddr gpa) +{ + return memory_region_get_ram_ptr(section->mr) + + section->offset_within_region + + (gpa - section->offset_within_address_space); +} + +static void vfio_prereg_listener_region_add(MemoryListener *listener, + MemoryRegionSection *section) +{ + VFIOContainer *container = container_of(listener, VFIOContainer, + prereg_listener); + const hwaddr gpa = section->offset_within_address_space; + hwaddr end; + int ret; + hwaddr page_mask = qemu_real_host_page_mask; + struct vfio_iommu_spapr_register_memory reg = { + .argsz = sizeof(reg), + .flags = 0, + }; + + if (vfio_prereg_listener_skipped_section(section)) { + trace_vfio_prereg_listener_region_add_skip( + section->offset_within_address_space, + section->offset_within_address_space + + int128_get64(int128_sub(section->size, int128_one()))); + return; + } + + if (unlikely((section->offset_within_address_space & ~page_mask) || + (section->offset_within_region & ~page_mask) || + (int128_get64(section->size) & ~page_mask))) { + error_report("%s received unaligned region", __func__); + return; + } + + end = section->offset_within_address_space + int128_get64(section->size); + if (gpa >= end) { + return; + } + + memory_region_ref(section->mr); + + reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa); + reg.size = end - gpa; + + ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, ®); + trace_vfio_prereg_register(reg.vaddr, reg.size, ret ? -errno : 0); + if (ret) { + /* + * On the initfn path, store the first error in the container so we + * can gracefully fail. Runtime, there's not much we can do other + * than throw a hardware error. 
+ */ + if (!container->initialized) { + if (!container->error) { + container->error = ret; + } + } else { + hw_error("vfio: Memory registering failed, unable to continue"); + } + } +} + +static void vfio_prereg_listener_region_del(MemoryListener *listener, + MemoryRegionSection *section) +{ + VFIOContainer *container = container_of(listener, VFIOContainer, + prereg_listener); + const hwaddr gpa = section->offset_within_address_space; + hwaddr end; + int ret; + hwaddr page_mask = qemu_real_host_page_mask; + struct vfio_iommu_spapr_register_memory reg = { + .argsz = sizeof(reg), + .flags = 0, + }; + + if (vfio_prereg_listener_skipped_section(section)) { + trace_vfio_prereg_listener_region_del_skip( + section->offset_within_address_space, + section->offset_within_address_space + + int128_get64(int128_sub(section->size, int128_one()))); + return; + } + + if (unlikely((section->offset_within_address_space & ~page_mask) || + (section->offset_within_region & ~page_mask) || + (int128_get64(section->size) & ~page_mask))) { + error_report("%s received unaligned region", __func__); + return; + } + + end = section->offset_within_address_space + int128_get64(section->size); + if (gpa >= end) { + return; + } + + reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa); + reg.size = end - gpa; + + ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, ®); + trace_vfio_prereg_unregister(reg.vaddr, reg.size, ret ? -errno : 0); +} + +const MemoryListener vfio_prereg_listener = { + .region_add = vfio_prereg_listener_region_add, + .region_del = vfio_prereg_listener_region_del, +}; diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index a768fb54ec..0b02a3bd54 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -115,3 +115,9 @@ vfio_platform_populate_interrupts(int pin, int count, int flags) "- IRQ index %d vfio_intp_interrupt_set_pending(int index) "irq %d is set PENDING" vfio_platform_start_level_irqfd_injection(int index, int fd, int resamplefd) "IRQ index=%d, fd = %d, resamplefd = %d" vfio_platform_start_edge_irqfd_injection(int index, int fd) "IRQ index=%d, fd = %d" + +# hw/vfio/spapr.c +vfio_prereg_listener_region_add_skip(uint64_t start, uint64_t end) "%"PRIx64" - %"PRIx64 +vfio_prereg_listener_region_del_skip(uint64_t start, uint64_t end) "%"PRIx64" - %"PRIx64 +vfio_prereg_register(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d" +vfio_prereg_unregister(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d" diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 0610377789..405c3b29d6 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -73,6 +73,8 @@ typedef struct VFIOContainer { VFIOAddressSpace *space; int fd; /* /dev/vfio/vfio, empowered by the attached groups */ MemoryListener listener; + MemoryListener prereg_listener; + unsigned iommu_type; int error; bool initialized; /* @@ -158,4 +160,6 @@ int vfio_get_region_info(VFIODevice *vbasedev, int index, int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type, uint32_t subtype, struct vfio_region_info **info); #endif +extern const MemoryListener vfio_prereg_listener; + #endif /* !HW_VFIO_VFIO_COMMON_H */ -- cgit v1.2.3 From f4ec5e26edbd4c7509623ec882c344dc334bc1b2 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 4 Jul 2016 13:33:05 +1000 Subject: vfio: Add host side DMA window capabilities There are going to be multiple IOMMUs per a container. 
This moves the single host IOMMU parameter set to a list of VFIOHostDMAWindow. This should cause no behavioral change and will be used later by the SPAPR TCE IOMMU v2 which will also add a vfio_host_win_del() helper. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: David Gibson --- hw/vfio/common.c | 60 +++++++++++++++++++++++++++++++------------ include/hw/vfio/vfio-common.h | 10 ++++++-- 2 files changed, 52 insertions(+), 18 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 46381e6242..7c8a5310ae 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -28,6 +28,7 @@ #include "exec/memory.h" #include "hw/hw.h" #include "qemu/error-report.h" +#include "qemu/range.h" #include "sysemu/kvm.h" #ifdef CONFIG_KVM #include "linux/kvm.h" @@ -241,6 +242,29 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova, return -errno; } +static void vfio_host_win_add(VFIOContainer *container, + hwaddr min_iova, hwaddr max_iova, + uint64_t iova_pgsizes) +{ + VFIOHostDMAWindow *hostwin; + + QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + if (ranges_overlap(hostwin->min_iova, + hostwin->max_iova - hostwin->min_iova + 1, + min_iova, + max_iova - min_iova + 1)) { + hw_error("%s: Overlapped IOMMU are not enabled", __func__); + } + } + + hostwin = g_malloc0(sizeof(*hostwin)); + + hostwin->min_iova = min_iova; + hostwin->max_iova = max_iova; + hostwin->iova_pgsizes = iova_pgsizes; + QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next); +} + static bool vfio_listener_skipped_section(MemoryRegionSection *section) { return (!memory_region_is_ram(section->mr) && @@ -329,6 +353,8 @@ static void vfio_listener_region_add(MemoryListener *listener, Int128 llend, llsize; void *vaddr; int ret; + VFIOHostDMAWindow *hostwin; + bool hostwin_found; if (vfio_listener_skipped_section(section)) { trace_vfio_listener_region_add_skip( @@ -354,7 +380,15 @@ static void vfio_listener_region_add(MemoryListener *listener, } end = int128_get64(int128_sub(llend, int128_one())); - if ((iova < container->min_iova) || (end > container->max_iova)) { + hostwin_found = false; + QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + if (hostwin->min_iova <= iova && end <= hostwin->max_iova) { + hostwin_found = true; + break; + } + } + + if (!hostwin_found) { error_report("vfio: IOMMU container %p can't map guest IOVA region" " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, end); @@ -369,10 +403,6 @@ static void vfio_listener_region_add(MemoryListener *listener, trace_vfio_listener_region_add_iommu(iova, end); /* - * FIXME: We should do some checking to see if the - * capabilities of the host VFIO IOMMU are adequate to model - * the guest IOMMU - * * FIXME: For VFIO iommu types which have KVM acceleration to * avoid bouncing all map/unmaps through qemu this way, this * would be the right place to wire that up (tell the KVM @@ -879,17 +909,14 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) * existing Type1 IOMMUs generally support any IOVA we're * going to actually try in practice. 
*/ - container->min_iova = 0; - container->max_iova = (hwaddr)-1; - - /* Assume just 4K IOVA page size */ - container->iova_pgsizes = 0x1000; info.argsz = sizeof(info); ret = ioctl(fd, VFIO_IOMMU_GET_INFO, &info); /* Ignore errors */ - if ((ret == 0) && (info.flags & VFIO_IOMMU_INFO_PGSIZES)) { - container->iova_pgsizes = info.iova_pgsizes; + if (ret || !(info.flags & VFIO_IOMMU_INFO_PGSIZES)) { + /* Assume 4k IOVA page size */ + info.iova_pgsizes = 4096; } + vfio_host_win_add(container, 0, (hwaddr)-1, info.iova_pgsizes); } else if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU) || ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_v2_IOMMU)) { struct vfio_iommu_spapr_tce_info info; @@ -949,11 +976,12 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) } goto free_container_exit; } - container->min_iova = info.dma32_window_start; - container->max_iova = container->min_iova + info.dma32_window_size - 1; - /* Assume just 4K IOVA pages for now */ - container->iova_pgsizes = 0x1000; + /* The default table uses 4K pages */ + vfio_host_win_add(container, info.dma32_window_start, + info.dma32_window_start + + info.dma32_window_size - 1, + 0x1000); } else { error_report("vfio: No available IOMMU models"); ret = -EINVAL; diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 405c3b29d6..b1f3e92405 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -82,9 +82,8 @@ typedef struct VFIOContainer { * contiguous IOVA window. We may need to generalize that in * future */ - hwaddr min_iova, max_iova; - uint64_t iova_pgsizes; QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; + QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; QLIST_HEAD(, VFIOGroup) group_list; QLIST_ENTRY(VFIOContainer) next; } VFIOContainer; @@ -97,6 +96,13 @@ typedef struct VFIOGuestIOMMU { QLIST_ENTRY(VFIOGuestIOMMU) giommu_next; } VFIOGuestIOMMU; +typedef struct VFIOHostDMAWindow { + hwaddr min_iova; + hwaddr max_iova; + uint64_t iova_pgsizes; + QLIST_ENTRY(VFIOHostDMAWindow) hostwin_next; +} VFIOHostDMAWindow; + typedef struct VFIODeviceOps VFIODeviceOps; typedef struct VFIODevice { -- cgit v1.2.3 From 2e4109de8e589beecd69996ee14f24021b991c0d Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 4 Jul 2016 13:33:06 +1000 Subject: vfio/spapr: Create DMA window dynamically (SPAPR IOMMU v2) The new VFIO_SPAPR_TCE_v2_IOMMU type supports dynamic DMA window management. This adds the ability for VFIO common code to dynamically allocate/remove DMA windows in the host kernel when a new VFIO container is added/removed. This adds a helper to vfio_listener_region_add which issues the VFIO_IOMMU_SPAPR_TCE_CREATE ioctl and adds the just-created window to the host window list; the opposite action is taken in vfio_listener_region_del. When creating a new window, this uses a heuristic to decide on the number of TCE table levels. This should cause no guest visible change in behavior.
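Stripped of the listener plumbing, window creation boils down to a single ioctl on the container fd. A hedged sketch (assumes a v2 container; error reporting elided):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Ask the host kernel for a new DMA window; on success the kernel
 * reports the bus address at which it placed the window. */
static int64_t create_window(int container_fd, uint64_t window_size,
                             unsigned page_shift, unsigned levels)
{
    struct vfio_iommu_spapr_tce_create create = {
        .argsz = sizeof(create),
        .window_size = window_size,   /* bytes covered by the window */
        .page_shift = page_shift,     /* IOMMU page size, log2 */
        .levels = levels,             /* TCE table indirection depth */
    };

    if (ioctl(container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create) != 0) {
        return -1;
    }
    return create.start_addr;         /* bus offset chosen by the host */
}

The real vfio_spapr_create_window() below additionally checks that the host placed the window at the bus address the guest expects, and backs the window out again if it did not.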
Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson [dwg: Added some casts to prevent printf() warnings on certain targets where the kernel headers' __u64 doesn't match uint64_t or PRIx64] Signed-off-by: David Gibson --- hw/vfio/common.c | 80 +++++++++++++++++++++++++++++++++++++------ hw/vfio/spapr.c | 71 ++++++++++++++++++++++++++++++++++++++ hw/vfio/trace-events | 2 ++ include/hw/vfio/vfio-common.h | 6 ++++ 4 files changed, 149 insertions(+), 10 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 7c8a5310ae..f3c0522e7e 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -265,6 +265,21 @@ static void vfio_host_win_add(VFIOContainer *container, QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next); } +static int vfio_host_win_del(VFIOContainer *container, hwaddr min_iova, + hwaddr max_iova) +{ + VFIOHostDMAWindow *hostwin; + + QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) { + QLIST_REMOVE(hostwin, hostwin_next); + return 0; + } + } + + return -1; +} + static bool vfio_listener_skipped_section(MemoryRegionSection *section) { return (!memory_region_is_ram(section->mr) && @@ -380,6 +395,31 @@ static void vfio_listener_region_add(MemoryListener *listener, } end = int128_get64(int128_sub(llend, int128_one())); + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { + VFIOHostDMAWindow *hostwin; + hwaddr pgsize = 0; + + /* For now intersections are not allowed, we may relax this later */ + QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + if (ranges_overlap(hostwin->min_iova, + hostwin->max_iova - hostwin->min_iova + 1, + section->offset_within_address_space, + int128_get64(section->size))) { + ret = -1; + goto fail; + } + } + + ret = vfio_spapr_create_window(container, section, &pgsize); + if (ret) { + goto fail; + } + + vfio_host_win_add(container, section->offset_within_address_space, + section->offset_within_address_space + + int128_get64(section->size) - 1, pgsize); + } + hostwin_found = false; QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { if (hostwin->min_iova <= iova && end <= hostwin->max_iova) { @@ -523,6 +563,18 @@ static void vfio_listener_region_del(MemoryListener *listener, "0x%"HWADDR_PRIx") = %d (%m)", container, iova, int128_get64(llsize), ret); } + + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { + vfio_spapr_remove_window(container, + section->offset_within_address_space); + if (vfio_host_win_del(container, + section->offset_within_address_space, + section->offset_within_address_space + + int128_get64(section->size) - 1) < 0) { + hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx, + __func__, section->offset_within_address_space); + } + } } static const MemoryListener vfio_memory_listener = { @@ -961,11 +1013,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) } } - /* - * This only considers the host IOMMU's 32-bit window. At - * some point we need to add support for the optional 64-bit - * window and dynamic windows - */ info.argsz = sizeof(info); ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info); if (ret) { @@ -977,11 +1024,24 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) goto free_container_exit; } - /* The default table uses 4K pages */ - vfio_host_win_add(container, info.dma32_window_start, - info.dma32_window_start + - info.dma32_window_size - 1, - 0x1000); + if (v2) { + /* + * There is a default window in just created container. 
+ * To make region_add/del simpler, we better remove this + * window now and let those iommu_listener callbacks + * create/remove them when needed. + */ + ret = vfio_spapr_remove_window(container, info.dma32_window_start); + if (ret) { + goto free_container_exit; + } + } else { + /* The default table uses 4K pages */ + vfio_host_win_add(container, info.dma32_window_start, + info.dma32_window_start + + info.dma32_window_size - 1, + 0x1000); + } } else { error_report("vfio: No available IOMMU models"); ret = -EINVAL; diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c index 7d64443b7f..0af342332c 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -137,3 +137,74 @@ const MemoryListener vfio_prereg_listener = { .region_add = vfio_prereg_listener_region_add, .region_del = vfio_prereg_listener_region_del, }; + +int vfio_spapr_create_window(VFIOContainer *container, + MemoryRegionSection *section, + hwaddr *pgsize) +{ + int ret; + unsigned pagesize = memory_region_iommu_get_min_page_size(section->mr); + unsigned entries, pages; + struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) }; + + /* + * FIXME: For VFIO iommu types which have KVM acceleration to + * avoid bouncing all map/unmaps through qemu this way, this + * would be the right place to wire that up (tell the KVM + * device emulation the VFIO iommu handles to use). + */ + create.window_size = int128_get64(section->size); + create.page_shift = ctz64(pagesize); + /* + * SPAPR host supports multilevel TCE tables, there is some + * heuristic to decide how many levels we want for our table: + * 0..64 = 1; 65..4096 = 2; 4097..262144 = 3; 262145.. = 4 + */ + entries = create.window_size >> create.page_shift; + pages = MAX((entries * sizeof(uint64_t)) / getpagesize(), 1); + pages = MAX(pow2ceil(pages) - 1, 1); /* Round up */ + create.levels = ctz64(pages) / 6 + 1; + + ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create); + if (ret) { + error_report("Failed to create a window, ret = %d (%m)", ret); + return -errno; + } + + if (create.start_addr != section->offset_within_address_space) { + vfio_spapr_remove_window(container, create.start_addr); + + error_report("Host doesn't support DMA window at %"HWADDR_PRIx", must be %"PRIx64, + section->offset_within_address_space, + (uint64_t)create.start_addr); + return -EINVAL; + } + trace_vfio_spapr_create_window(create.page_shift, + create.window_size, + create.start_addr); + *pgsize = pagesize; + + return 0; +} + +int vfio_spapr_remove_window(VFIOContainer *container, + hwaddr offset_within_address_space) +{ + struct vfio_iommu_spapr_tce_remove remove = { + .argsz = sizeof(remove), + .start_addr = offset_within_address_space, + }; + int ret; + + ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove); + if (ret) { + error_report("Failed to remove window at %"PRIx64, + (uint64_t)remove.start_addr); + return -errno; + } + + trace_vfio_spapr_remove_window(offset_within_address_space); + + return 0; +} diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 0b02a3bd54..4bb7690c46 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -121,3 +121,5 @@ vfio_prereg_listener_region_add_skip(uint64_t start, uint64_t end) "%"PRIx64" - vfio_prereg_listener_region_del_skip(uint64_t start, uint64_t end) "%"PRIx64" - %"PRIx64 vfio_prereg_register(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d" vfio_prereg_unregister(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d"
ret=%d" +vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x winsize=0x%"PRIx64" offset=0x%"PRIx64 +vfio_spapr_remove_window(uint64_t off) "offset=%"PRIx64 diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index b1f3e92405..07f7188df4 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -168,4 +168,10 @@ int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type, #endif extern const MemoryListener vfio_prereg_listener; +int vfio_spapr_create_window(VFIOContainer *container, + MemoryRegionSection *section, + hwaddr *pgsize); +int vfio_spapr_remove_window(VFIOContainer *container, + hwaddr offset_within_address_space); + #endif /* !HW_VFIO_VFIO_COMMON_H */ -- cgit v1.2.3 From ae4de14cd36b6a899d83df9595be3971ac0802d4 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 4 Jul 2016 13:33:07 +1000 Subject: spapr_pci/spapr_pci_vfio: Support Dynamic DMA Windows (DDW) This adds support for Dynamic DMA Windows (DDW) option defined by the SPAPR specification which allows to have additional DMA window(s) The "ddw" property is enabled by default on a PHB but for compatibility the pseries-2.6 machine and older disable it. This also creates a single DMA window for the older machines to maintain backward migration. This implements DDW for PHB with emulated and VFIO devices. The host kernel support is required. The advertised IOMMU page sizes are 4K and 64K; 16M pages are supported but not advertised by default, in order to enable them, the user has to specify "pgsz" property for PHB and enable huge pages for RAM. The existing linux guests try creating one additional huge DMA window with 64K or 16MB pages and map the entire guest RAM to. If succeeded, the guest switches to dma_direct_ops and never calls TCE hypercalls (H_PUT_TCE,...) again. This enables VFIO devices to use the entire RAM and not waste time on map/unmap later. This adds a "dma64_win_addr" property which is a bus address for the 64bit window and by default set to 0x800.0000.0000.0000 as this is what the modern POWER8 hardware uses and this allows having emulated and VFIO devices on the same bus. This adds 4 RTAS handlers: * ibm,query-pe-dma-window * ibm,create-pe-dma-window * ibm,remove-pe-dma-window * ibm,reset-pe-dma-window These are registered from type_init() callback. These RTAS handlers are implemented in a separate file to avoid polluting spapr_iommu.c with PCI. This changes sPAPRPHBState::dma_liobn to an array to allow 2 LIOBNs and updates all references to dma_liobn. However this does not add 64bit LIOBN to the migration stream as in fact even 32bit LIOBN is rather pointless there (as it is a PHB property and the management software can/should pass LIOBNs via CLI) but we keep it for the backward migration support. 
Signed-off-by: Alexey Kardashevskiy Signed-off-by: David Gibson --- hw/ppc/Makefile.objs | 1 + hw/ppc/spapr.c | 7 +- hw/ppc/spapr_pci.c | 75 ++++++++--- hw/ppc/spapr_rtas_ddw.c | 295 ++++++++++++++++++++++++++++++++++++++++++++ hw/ppc/trace-events | 4 + include/hw/pci-host/spapr.h | 8 +- include/hw/ppc/spapr.h | 16 ++- 7 files changed, 385 insertions(+), 21 deletions(-) create mode 100644 hw/ppc/spapr_rtas_ddw.c diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs index 5cc6608e50..91a3420f47 100644 --- a/hw/ppc/Makefile.objs +++ b/hw/ppc/Makefile.objs @@ -8,6 +8,7 @@ obj-$(CONFIG_PSERIES) += spapr_cpu_core.o ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy) obj-y += spapr_pci_vfio.o endif +obj-$(CONFIG_PSERIES) += spapr_rtas_ddw.o # PowerPC 4xx boards obj-y += ppc405_boards.o ppc4xx_devs.o ppc405_uc.o ppc440_bamboo.o obj-y += ppc4xx_pci.o diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 78ebd9ee38..9c1c2c1858 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -2489,7 +2489,12 @@ DEFINE_SPAPR_MACHINE(2_7, "2.7", true); * pseries-2.6 */ #define SPAPR_COMPAT_2_6 \ - HW_COMPAT_2_6 + HW_COMPAT_2_6 \ + { \ + .driver = TYPE_SPAPR_PCI_HOST_BRIDGE,\ + .property = "ddw",\ + .value = stringify(off),\ + }, static void spapr_machine_2_6_instance_options(MachineState *machine) { diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c index cbb7cdd774..949c44fec8 100644 --- a/hw/ppc/spapr_pci.c +++ b/hw/ppc/spapr_pci.c @@ -35,6 +35,7 @@ #include "hw/ppc/spapr.h" #include "hw/pci-host/spapr.h" #include "exec/address-spaces.h" +#include "exec/ram_addr.h" #include #include "trace.h" #include "qemu/error-report.h" @@ -45,6 +46,7 @@ #include "hw/ppc/spapr_drc.h" #include "sysemu/device_tree.h" #include "sysemu/kvm.h" +#include "sysemu/hostmem.h" #include "hw/vfio/vfio.h" @@ -1304,11 +1306,14 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp) PCIBus *bus; uint64_t msi_window_size = 4096; sPAPRTCETable *tcet; + const unsigned windows_supported = + sphb->ddw_enabled ? 
SPAPR_PCI_DMA_MAX_WINDOWS : 1; if (sphb->index != (uint32_t)-1) { hwaddr windows_base; - if ((sphb->buid != (uint64_t)-1) || (sphb->dma_liobn != (uint32_t)-1) + if ((sphb->buid != (uint64_t)-1) || (sphb->dma_liobn[0] != (uint32_t)-1) + || (sphb->dma_liobn[1] != (uint32_t)-1 && windows_supported == 2) || (sphb->mem_win_addr != (hwaddr)-1) || (sphb->io_win_addr != (hwaddr)-1)) { error_setg(errp, "Either \"index\" or other parameters must" @@ -1323,7 +1328,9 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp) } sphb->buid = SPAPR_PCI_BASE_BUID + sphb->index; - sphb->dma_liobn = SPAPR_PCI_LIOBN(sphb->index, 0); + for (i = 0; i < windows_supported; ++i) { + sphb->dma_liobn[i] = SPAPR_PCI_LIOBN(sphb->index, i); + } windows_base = SPAPR_PCI_WINDOW_BASE + sphb->index * SPAPR_PCI_WINDOW_SPACING; @@ -1336,8 +1343,9 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp) return; } - if (sphb->dma_liobn == (uint32_t)-1) { - error_setg(errp, "LIOBN not specified for PHB"); + if ((sphb->dma_liobn[0] == (uint32_t)-1) || + ((sphb->dma_liobn[1] == (uint32_t)-1) && (windows_supported > 1))) { + error_setg(errp, "LIOBN(s) not specified for PHB"); return; } @@ -1456,16 +1464,18 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp) } } - tcet = spapr_tce_new_table(DEVICE(sphb), sphb->dma_liobn); - if (!tcet) { - error_setg(errp, "Unable to create TCE table for %s", - sphb->dtbusname); - return; + /* DMA setup */ + for (i = 0; i < windows_supported; ++i) { + tcet = spapr_tce_new_table(DEVICE(sphb), sphb->dma_liobn[i]); + if (!tcet) { + error_setg(errp, "Creating window#%d failed for %s", + i, sphb->dtbusname); + return; + } + memory_region_add_subregion_overlap(&sphb->iommu_root, 0, + spapr_tce_get_iommu(tcet), 0); } - memory_region_add_subregion_overlap(&sphb->iommu_root, 0, - spapr_tce_get_iommu(tcet), 0); - sphb->msi = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, g_free); } @@ -1482,13 +1492,19 @@ static int spapr_phb_children_reset(Object *child, void *opaque) void spapr_phb_dma_reset(sPAPRPHBState *sphb) { - sPAPRTCETable *tcet = spapr_tce_find_by_liobn(sphb->dma_liobn); + int i; + sPAPRTCETable *tcet; + + for (i = 0; i < SPAPR_PCI_DMA_MAX_WINDOWS; ++i) { + tcet = spapr_tce_find_by_liobn(sphb->dma_liobn[i]); - if (tcet && tcet->nb_table) { - spapr_tce_table_disable(tcet); + if (tcet && tcet->nb_table) { + spapr_tce_table_disable(tcet); + } } /* Register default 32bit DMA window */ + tcet = spapr_tce_find_by_liobn(sphb->dma_liobn[0]); spapr_tce_table_enable(tcet, SPAPR_TCE_PAGE_SHIFT, sphb->dma_win_addr, sphb->dma_win_size >> SPAPR_TCE_PAGE_SHIFT); } @@ -1510,7 +1526,8 @@ static void spapr_phb_reset(DeviceState *qdev) static Property spapr_phb_properties[] = { DEFINE_PROP_UINT32("index", sPAPRPHBState, index, -1), DEFINE_PROP_UINT64("buid", sPAPRPHBState, buid, -1), - DEFINE_PROP_UINT32("liobn", sPAPRPHBState, dma_liobn, -1), + DEFINE_PROP_UINT32("liobn", sPAPRPHBState, dma_liobn[0], -1), + DEFINE_PROP_UINT32("liobn64", sPAPRPHBState, dma_liobn[1], -1), DEFINE_PROP_UINT64("mem_win_addr", sPAPRPHBState, mem_win_addr, -1), DEFINE_PROP_UINT64("mem_win_size", sPAPRPHBState, mem_win_size, SPAPR_PCI_MMIO_WIN_SIZE), @@ -1522,6 +1539,11 @@ static Property spapr_phb_properties[] = { /* Default DMA window is 0..1GB */ DEFINE_PROP_UINT64("dma_win_addr", sPAPRPHBState, dma_win_addr, 0), DEFINE_PROP_UINT64("dma_win_size", sPAPRPHBState, dma_win_size, 0x40000000), + DEFINE_PROP_UINT64("dma64_win_addr", sPAPRPHBState, dma64_win_addr, + 0x800000000000000ULL), + 
DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true), + DEFINE_PROP_UINT64("pgsz", sPAPRPHBState, page_size_mask, + (1ULL << 12) | (1ULL << 16)), DEFINE_PROP_END_OF_LIST(), }; @@ -1598,7 +1620,7 @@ static const VMStateDescription vmstate_spapr_pci = { .post_load = spapr_pci_post_load, .fields = (VMStateField[]) { VMSTATE_UINT64_EQUAL(buid, sPAPRPHBState), - VMSTATE_UINT32_EQUAL(dma_liobn, sPAPRPHBState), + VMSTATE_UINT32_EQUAL(dma_liobn[0], sPAPRPHBState), VMSTATE_UINT64_EQUAL(mem_win_addr, sPAPRPHBState), VMSTATE_UINT64_EQUAL(mem_win_size, sPAPRPHBState), VMSTATE_UINT64_EQUAL(io_win_addr, sPAPRPHBState), @@ -1774,6 +1796,15 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, uint32_t interrupt_map_mask[] = { cpu_to_be32(b_ddddd(-1)|b_fff(0)), 0x0, 0x0, cpu_to_be32(-1)}; uint32_t interrupt_map[PCI_SLOT_MAX * PCI_NUM_PINS][7]; + uint32_t ddw_applicable[] = { + cpu_to_be32(RTAS_IBM_QUERY_PE_DMA_WINDOW), + cpu_to_be32(RTAS_IBM_CREATE_PE_DMA_WINDOW), + cpu_to_be32(RTAS_IBM_REMOVE_PE_DMA_WINDOW) + }; + uint32_t ddw_extensions[] = { + cpu_to_be32(1), + cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW) + }; sPAPRTCETable *tcet; PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus; sPAPRFDT s_fdt; @@ -1798,6 +1829,14 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pci-config-space-type", 0x1)); _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pe-total-#msi", XICS_IRQS_SPAPR)); + /* Dynamic DMA window */ + if (phb->ddw_enabled) { + _FDT(fdt_setprop(fdt, bus_off, "ibm,ddw-applicable", &ddw_applicable, + sizeof(ddw_applicable))); + _FDT(fdt_setprop(fdt, bus_off, "ibm,ddw-extensions", + &ddw_extensions, sizeof(ddw_extensions))); + } + /* Build the interrupt-map, this must matches what is done * in pci_spapr_map_irq */ @@ -1821,7 +1860,7 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, _FDT(fdt_setprop(fdt, bus_off, "interrupt-map", &interrupt_map, sizeof(interrupt_map))); - tcet = spapr_tce_find_by_liobn(phb->dma_liobn); + tcet = spapr_tce_find_by_liobn(phb->dma_liobn[0]); if (!tcet) { return -1; } diff --git a/hw/ppc/spapr_rtas_ddw.c b/hw/ppc/spapr_rtas_ddw.c new file mode 100644 index 0000000000..177dcffc9b --- /dev/null +++ b/hw/ppc/spapr_rtas_ddw.c @@ -0,0 +1,295 @@ +/* + * QEMU sPAPR Dynamic DMA windows support + * + * Copyright (c) 2015 Alexey Kardashevskiy, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include "qemu/error-report.h" +#include "hw/ppc/spapr.h" +#include "hw/pci-host/spapr.h" +#include "trace.h" + +static int spapr_phb_get_active_win_num_cb(Object *child, void *opaque) +{ + sPAPRTCETable *tcet; + + tcet = (sPAPRTCETable *) object_dynamic_cast(child, TYPE_SPAPR_TCE_TABLE); + if (tcet && tcet->nb_table) { + ++*(unsigned *)opaque; + } + return 0; +} + +static unsigned spapr_phb_get_active_win_num(sPAPRPHBState *sphb) +{ + unsigned ret = 0; + + object_child_foreach(OBJECT(sphb), spapr_phb_get_active_win_num_cb, &ret); + + return ret; +} + +static int spapr_phb_get_free_liobn_cb(Object *child, void *opaque) +{ + sPAPRTCETable *tcet; + + tcet = (sPAPRTCETable *) object_dynamic_cast(child, TYPE_SPAPR_TCE_TABLE); + if (tcet && !tcet->nb_table) { + *(uint32_t *)opaque = tcet->liobn; + return 1; + } + return 0; +} + +static unsigned spapr_phb_get_free_liobn(sPAPRPHBState *sphb) +{ + uint32_t liobn = 0; + + object_child_foreach(OBJECT(sphb), spapr_phb_get_free_liobn_cb, &liobn); + + return liobn; +} + +static uint32_t spapr_page_mask_to_query_mask(uint64_t page_mask) +{ + int i; + uint32_t mask = 0; + const struct { int shift; uint32_t mask; } masks[] = { + { 12, RTAS_DDW_PGSIZE_4K }, + { 16, RTAS_DDW_PGSIZE_64K }, + { 24, RTAS_DDW_PGSIZE_16M }, + { 25, RTAS_DDW_PGSIZE_32M }, + { 26, RTAS_DDW_PGSIZE_64M }, + { 27, RTAS_DDW_PGSIZE_128M }, + { 28, RTAS_DDW_PGSIZE_256M }, + { 34, RTAS_DDW_PGSIZE_16G }, + }; + + for (i = 0; i < ARRAY_SIZE(masks); ++i) { + if (page_mask & (1ULL << masks[i].shift)) { + mask |= masks[i].mask; + } + } + + return mask; +} + +static void rtas_ibm_query_pe_dma_window(PowerPCCPU *cpu, + sPAPRMachineState *spapr, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ + sPAPRPHBState *sphb; + uint64_t buid, max_window_size; + uint32_t avail, addr, pgmask = 0; + MachineState *machine = MACHINE(spapr); + + if ((nargs != 3) || (nret != 5)) { + goto param_error_exit; + } + + buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2); + addr = rtas_ld(args, 0); + sphb = spapr_pci_find_phb(spapr, buid); + if (!sphb || !sphb->ddw_enabled) { + goto param_error_exit; + } + + /* Translate page mask to LoPAPR format */ + pgmask = spapr_page_mask_to_query_mask(sphb->page_size_mask); + + /* + * This is "Largest contiguous block of TCEs allocated specifically + * for (that is, are reserved for) this PE". + * Return the maximum number as maximum supported RAM size was in 4K pages. 
+ */ + if (machine->ram_size == machine->maxram_size) { + max_window_size = machine->ram_size; + } else { + MemoryHotplugState *hpms = &spapr->hotplug_memory; + + max_window_size = hpms->base + memory_region_size(&hpms->mr); + } + + avail = SPAPR_PCI_DMA_MAX_WINDOWS - spapr_phb_get_active_win_num(sphb); + + rtas_st(rets, 0, RTAS_OUT_SUCCESS); + rtas_st(rets, 1, avail); + rtas_st(rets, 2, max_window_size >> SPAPR_TCE_PAGE_SHIFT); + rtas_st(rets, 3, pgmask); + rtas_st(rets, 4, 0); /* DMA migration mask, not supported */ + + trace_spapr_iommu_ddw_query(buid, addr, avail, max_window_size, pgmask); + return; + +param_error_exit: + rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR); +} + +static void rtas_ibm_create_pe_dma_window(PowerPCCPU *cpu, + sPAPRMachineState *spapr, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ + sPAPRPHBState *sphb; + sPAPRTCETable *tcet = NULL; + uint32_t addr, page_shift, window_shift, liobn; + uint64_t buid, win_addr; + int windows; + + if ((nargs != 5) || (nret != 4)) { + goto param_error_exit; + } + + buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2); + addr = rtas_ld(args, 0); + sphb = spapr_pci_find_phb(spapr, buid); + if (!sphb || !sphb->ddw_enabled) { + goto param_error_exit; + } + + page_shift = rtas_ld(args, 3); + window_shift = rtas_ld(args, 4); + liobn = spapr_phb_get_free_liobn(sphb); + windows = spapr_phb_get_active_win_num(sphb); + + if (!(sphb->page_size_mask & (1ULL << page_shift)) || + (window_shift < page_shift)) { + goto param_error_exit; + } + + if (!liobn || !sphb->ddw_enabled || windows == SPAPR_PCI_DMA_MAX_WINDOWS) { + goto hw_error_exit; + } + + tcet = spapr_tce_find_by_liobn(liobn); + if (!tcet) { + goto hw_error_exit; + } + + win_addr = (windows == 0) ? sphb->dma_win_addr : sphb->dma64_win_addr; + spapr_tce_table_enable(tcet, page_shift, win_addr, + 1ULL << (window_shift - page_shift)); + if (!tcet->nb_table) { + goto hw_error_exit; + } + + trace_spapr_iommu_ddw_create(buid, addr, 1ULL << page_shift, + 1ULL << window_shift, tcet->bus_offset, liobn); + + rtas_st(rets, 0, RTAS_OUT_SUCCESS); + rtas_st(rets, 1, liobn); + rtas_st(rets, 2, tcet->bus_offset >> 32); + rtas_st(rets, 3, tcet->bus_offset & ((uint32_t) -1)); + + return; + +hw_error_exit: + rtas_st(rets, 0, RTAS_OUT_HW_ERROR); + return; + +param_error_exit: + rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR); +} + +static void rtas_ibm_remove_pe_dma_window(PowerPCCPU *cpu, + sPAPRMachineState *spapr, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ + sPAPRPHBState *sphb; + sPAPRTCETable *tcet; + uint32_t liobn; + + if ((nargs != 1) || (nret != 1)) { + goto param_error_exit; + } + + liobn = rtas_ld(args, 0); + tcet = spapr_tce_find_by_liobn(liobn); + if (!tcet) { + goto param_error_exit; + } + + sphb = SPAPR_PCI_HOST_BRIDGE(OBJECT(tcet)->parent); + if (!sphb || !sphb->ddw_enabled || !tcet->nb_table) { + goto param_error_exit; + } + + spapr_tce_table_disable(tcet); + trace_spapr_iommu_ddw_remove(liobn); + + rtas_st(rets, 0, RTAS_OUT_SUCCESS); + return; + +param_error_exit: + rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR); +} + +static void rtas_ibm_reset_pe_dma_window(PowerPCCPU *cpu, + sPAPRMachineState *spapr, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ + sPAPRPHBState *sphb; + uint64_t buid; + uint32_t addr; + + if ((nargs != 3) || (nret != 1)) { + goto param_error_exit; + } + + buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2); + addr = rtas_ld(args, 0); 
+ sphb = spapr_pci_find_phb(spapr, buid); + if (!sphb || !sphb->ddw_enabled) { + goto param_error_exit; + } + + spapr_phb_dma_reset(sphb); + trace_spapr_iommu_ddw_reset(buid, addr); + + rtas_st(rets, 0, RTAS_OUT_SUCCESS); + + return; + +param_error_exit: + rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR); +} + +static void spapr_rtas_ddw_init(void) +{ + spapr_rtas_register(RTAS_IBM_QUERY_PE_DMA_WINDOW, + "ibm,query-pe-dma-window", + rtas_ibm_query_pe_dma_window); + spapr_rtas_register(RTAS_IBM_CREATE_PE_DMA_WINDOW, + "ibm,create-pe-dma-window", + rtas_ibm_create_pe_dma_window); + spapr_rtas_register(RTAS_IBM_REMOVE_PE_DMA_WINDOW, + "ibm,remove-pe-dma-window", + rtas_ibm_remove_pe_dma_window); + spapr_rtas_register(RTAS_IBM_RESET_PE_DMA_WINDOW, + "ibm,reset-pe-dma-window", + rtas_ibm_reset_pe_dma_window); +} + +type_init(spapr_rtas_ddw_init) diff --git a/hw/ppc/trace-events b/hw/ppc/trace-events index 6da713547f..900679bc9d 100644 --- a/hw/ppc/trace-events +++ b/hw/ppc/trace-events @@ -30,6 +30,10 @@ spapr_iommu_xlate(uint64_t liobn, uint64_t ioba, uint64_t tce, unsigned perm, un spapr_iommu_new_table(uint64_t liobn, void *table, int fd) "liobn=%"PRIx64" table=%p fd=%d" spapr_iommu_pre_save(uint64_t liobn, uint32_t nb, uint64_t offs, uint32_t ps) "liobn=%"PRIx64" %"PRIx32" bus_offset=%"PRIx64" ps=%"PRIu32 spapr_iommu_post_load(uint64_t liobn, uint32_t pre_nb, uint32_t post_nb, uint64_t offs, uint32_t ps) "liobn=%"PRIx64" %"PRIx32" => %"PRIx32" bus_offset=%"PRIx64" ps=%"PRIu32 +spapr_iommu_ddw_query(uint64_t buid, uint32_t cfgaddr, unsigned wa, uint64_t win_size, uint32_t pgmask) "buid=%"PRIx64" addr=%"PRIx32", %u windows available, max window size=%"PRIx64", mask=%"PRIx32 +spapr_iommu_ddw_create(uint64_t buid, uint32_t cfgaddr, uint64_t pg_size, uint64_t req_size, uint64_t start, uint32_t liobn) "buid=%"PRIx64" addr=%"PRIx32", page size=0x%"PRIx64", requested=0x%"PRIx64", start addr=%"PRIx64", liobn=%"PRIx32 +spapr_iommu_ddw_remove(uint32_t liobn) "liobn=%"PRIx32 +spapr_iommu_ddw_reset(uint64_t buid, uint32_t cfgaddr) "buid=%"PRIx64" addr=%"PRIx32 # hw/ppc/ppc.c ppc_tb_adjust(uint64_t offs1, uint64_t offs2, int64_t diff, int64_t seconds) "adjusted from 0x%"PRIx64" to 0x%"PRIx64", diff %"PRId64" (%"PRId64"s)" diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h index 288b89c04a..193631d2dc 100644 --- a/include/hw/pci-host/spapr.h +++ b/include/hw/pci-host/spapr.h @@ -32,6 +32,8 @@ #define SPAPR_PCI_HOST_BRIDGE(obj) \ OBJECT_CHECK(sPAPRPHBState, (obj), TYPE_SPAPR_PCI_HOST_BRIDGE) +#define SPAPR_PCI_DMA_MAX_WINDOWS 2 + typedef struct sPAPRPHBState sPAPRPHBState; typedef struct spapr_pci_msi { @@ -56,7 +58,7 @@ struct sPAPRPHBState { hwaddr mem_win_addr, mem_win_size, io_win_addr, io_win_size; MemoryRegion memwindow, iowindow, msiwindow; - uint32_t dma_liobn; + uint32_t dma_liobn[SPAPR_PCI_DMA_MAX_WINDOWS]; hwaddr dma_win_addr, dma_win_size; AddressSpace iommu_as; MemoryRegion iommu_root; @@ -71,6 +73,10 @@ struct sPAPRPHBState { spapr_pci_msi_mig *msi_devs; QLIST_ENTRY(sPAPRPHBState) list; + + bool ddw_enabled; + uint64_t page_size_mask; + uint64_t dma64_win_addr; }; #define SPAPR_PCI_MAX_INDEX 255 diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h index 49325555d3..2e2dd14c30 100644 --- a/include/hw/ppc/spapr.h +++ b/include/hw/ppc/spapr.h @@ -416,6 +416,16 @@ int spapr_allocate_irq_block(int num, bool lsi, bool msi); #define RTAS_OUT_NOT_AUTHORIZED -9002 #define RTAS_OUT_SYSPARM_PARAM_ERROR -9999 +/* DDW pagesize mask values from ibm,query-pe-dma-window */ +#define 
RTAS_DDW_PGSIZE_4K 0x01 +#define RTAS_DDW_PGSIZE_64K 0x02 +#define RTAS_DDW_PGSIZE_16M 0x04 +#define RTAS_DDW_PGSIZE_32M 0x08 +#define RTAS_DDW_PGSIZE_64M 0x10 +#define RTAS_DDW_PGSIZE_128M 0x20 +#define RTAS_DDW_PGSIZE_256M 0x40 +#define RTAS_DDW_PGSIZE_16G 0x80 + /* RTAS tokens */ #define RTAS_TOKEN_BASE 0x2000 @@ -457,8 +467,12 @@ int spapr_allocate_irq_block(int num, bool lsi, bool msi); #define RTAS_IBM_SET_SLOT_RESET (RTAS_TOKEN_BASE + 0x23) #define RTAS_IBM_CONFIGURE_PE (RTAS_TOKEN_BASE + 0x24) #define RTAS_IBM_SLOT_ERROR_DETAIL (RTAS_TOKEN_BASE + 0x25) +#define RTAS_IBM_QUERY_PE_DMA_WINDOW (RTAS_TOKEN_BASE + 0x26) +#define RTAS_IBM_CREATE_PE_DMA_WINDOW (RTAS_TOKEN_BASE + 0x27) +#define RTAS_IBM_REMOVE_PE_DMA_WINDOW (RTAS_TOKEN_BASE + 0x28) +#define RTAS_IBM_RESET_PE_DMA_WINDOW (RTAS_TOKEN_BASE + 0x29) -#define RTAS_TOKEN_MAX (RTAS_TOKEN_BASE + 0x26) +#define RTAS_TOKEN_MAX (RTAS_TOKEN_BASE + 0x2A) /* RTAS ibm,get-system-parameter token values */ #define RTAS_SYSPARM_SPLPAR_CHARACTERISTICS 20 -- cgit v1.2.3 From 1f0252e66e76f0b5967419e2a1e53a1f1398bf7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Fri, 1 Jul 2016 09:10:10 +0200 Subject: ppc: simplify ppc_hash64_hpte_page_shift_noslb() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The segment page shift parameter is never used. Let's remove it. Signed-off-by: Cédric Le Goater Signed-off-by: David Gibson --- hw/ppc/spapr_hcall.c | 4 ++-- target-ppc/mmu-hash64.c | 6 +----- target-ppc/mmu-hash64.h | 3 +-- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c index e011ed4b66..73af112e1d 100644 --- a/hw/ppc/spapr_hcall.c +++ b/hw/ppc/spapr_hcall.c @@ -83,12 +83,12 @@ static target_ulong h_enter(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong pte_index = args[1]; target_ulong pteh = args[2]; target_ulong ptel = args[3]; - unsigned apshift, spshift; + unsigned apshift; target_ulong raddr; target_ulong index; uint64_t token; - apshift = ppc_hash64_hpte_page_shift_noslb(cpu, pteh, ptel, &spshift); + apshift = ppc_hash64_hpte_page_shift_noslb(cpu, pteh, ptel); if (!apshift) { /* Bad page size encoding */ return H_PARAMETER; diff --git a/target-ppc/mmu-hash64.c b/target-ppc/mmu-hash64.c index 3b1357a648..a15ef83f7d 100644 --- a/target-ppc/mmu-hash64.c +++ b/target-ppc/mmu-hash64.c @@ -610,14 +610,12 @@ static unsigned hpte_page_shift(const struct ppc_one_seg_page_size *sps, } unsigned ppc_hash64_hpte_page_shift_noslb(PowerPCCPU *cpu, - uint64_t pte0, uint64_t pte1, - unsigned *seg_page_shift) + uint64_t pte0, uint64_t pte1) { CPUPPCState *env = &cpu->env; int i; if (!(pte0 & HPTE64_V_LARGE)) { - *seg_page_shift = 12; return 12; } @@ -635,12 +633,10 @@ unsigned ppc_hash64_hpte_page_shift_noslb(PowerPCCPU *cpu, shift = hpte_page_shift(sps, pte0, pte1); if (shift) { - *seg_page_shift = sps->page_shift; return shift; } } - *seg_page_shift = 0; return 0; } diff --git a/target-ppc/mmu-hash64.h b/target-ppc/mmu-hash64.h index 6423b9f791..7fcb555b47 100644 --- a/target-ppc/mmu-hash64.h +++ b/target-ppc/mmu-hash64.h @@ -17,8 +17,7 @@ void ppc_hash64_tlb_flush_hpte(PowerPCCPU *cpu, target_ulong pte_index, target_ulong pte0, target_ulong pte1); unsigned ppc_hash64_hpte_page_shift_noslb(PowerPCCPU *cpu, - uint64_t pte0, uint64_t pte1, - unsigned *seg_page_shift); + uint64_t pte0, uint64_t pte1); #endif /* -- cgit v1.2.3 From 651060aba79dc9d0cc77ac3921948ea78dba7409 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 5 Jul 
2016 12:17:56 +1000 Subject: target-ppc: Correct page size decoding in ppc_hash64_pteg_search() The architecture specifies that when searching a PTEG for PTEs, entries with a page size encoding that's not valid for the current segment should be ignored, continuing the search. The current implementation does this with ppc_hash64_pte_size_decode() which is a very incomplete implementation of this check. We already have code to do a full and correct page size decode in hpte_page_shift(). This patch moves hpte_page_shift() so it can be used in ppc_hash64_pteg_search() and adjusts the latter's parameters to include a full SLBE instead of just a segment page shift. Signed-off-by: David Gibson Reviewed-by: Benjamin Herrenschmidt --- target-ppc/mmu-hash64.c | 99 ++++++++++++++++++++----------------------------- 1 file changed, 41 insertions(+), 58 deletions(-) diff --git a/target-ppc/mmu-hash64.c b/target-ppc/mmu-hash64.c index a15ef83f7d..6d3428ef7d 100644 --- a/target-ppc/mmu-hash64.c +++ b/target-ppc/mmu-hash64.c @@ -450,30 +450,45 @@ void ppc_hash64_stop_access(PowerPCCPU *cpu, uint64_t token) } } -/* Returns the effective page shift or 0. MPSS isn't supported yet so - * this will always be the slb_pshift or 0 - */ -static uint32_t ppc_hash64_pte_size_decode(uint64_t pte1, uint32_t slb_pshift) +static unsigned hpte_page_shift(const struct ppc_one_seg_page_size *sps, + uint64_t pte0, uint64_t pte1) { - switch (slb_pshift) { - case 12: + int i; + + if (!(pte0 & HPTE64_V_LARGE)) { + if (sps->page_shift != 12) { + /* 4kiB page in a non 4kiB segment */ + return 0; + } + /* Normal 4kiB page */ return 12; - case 16: - if ((pte1 & 0xf000) == 0x1000) { - return 16; + } + + for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) { + const struct ppc_one_page_size *ps = &sps->enc[i]; + uint64_t mask; + + if (!ps->page_shift) { + break; } - return 0; - case 24: - if ((pte1 & 0xff000) == 0) { - return 24; + + if (ps->page_shift == 12) { + /* L bit is set so this can't be a 4kiB page */ + continue; + } + + mask = ((1ULL << ps->page_shift) - 1) & HPTE64_R_RPN; + + if ((pte1 & mask) == (ps->pte_enc << HPTE64_R_RPN_SHIFT)) { + return ps->page_shift; } - return 0; } - return 0; + + return 0; /* Bad page size encoding */ } static hwaddr ppc_hash64_pteg_search(PowerPCCPU *cpu, hwaddr hash, - uint32_t slb_pshift, bool secondary, + ppc_slb_t *slb, bool secondary, target_ulong ptem, ppc_hash_pte64_t *pte) { CPUPPCState *env = &cpu->env; @@ -494,7 +509,14 @@ static hwaddr ppc_hash64_pteg_search(PowerPCCPU *cpu, hwaddr hash, if ((pte0 & HPTE64_V_VALID) && (secondary == !!(pte0 & HPTE64_V_SECONDARY)) && HPTE64_V_COMPARE(pte0, ptem)) { - uint32_t pshift = ppc_hash64_pte_size_decode(pte1, slb_pshift); + unsigned pshift = hpte_page_shift(slb->sps, pte0, pte1); + /* + * If there is no match, ignore the PTE, it could simply + * be for a different segment size encoding and the + * architecture specifies we should not match. Linux will + * potentially leave behind PTEs for the wrong base page + * size when demoting segments. 
+ */ if (pshift == 0) { continue; } @@ -554,8 +576,7 @@ static hwaddr ppc_hash64_htab_lookup(PowerPCCPU *cpu, " vsid=" TARGET_FMT_lx " ptem=" TARGET_FMT_lx " hash=" TARGET_FMT_plx "\n", env->htab_base, env->htab_mask, vsid, ptem, hash); - pte_offset = ppc_hash64_pteg_search(cpu, hash, slb->sps->page_shift, - 0, ptem, pte); + pte_offset = ppc_hash64_pteg_search(cpu, hash, slb, 0, ptem, pte); if (pte_offset == -1) { /* Secondary PTEG lookup */ @@ -565,50 +586,12 @@ static hwaddr ppc_hash64_htab_lookup(PowerPCCPU *cpu, " hash=" TARGET_FMT_plx "\n", env->htab_base, env->htab_mask, vsid, ptem, ~hash); - pte_offset = ppc_hash64_pteg_search(cpu, ~hash, slb->sps->page_shift, 1, - ptem, pte); + pte_offset = ppc_hash64_pteg_search(cpu, ~hash, slb, 1, ptem, pte); } return pte_offset; } -static unsigned hpte_page_shift(const struct ppc_one_seg_page_size *sps, - uint64_t pte0, uint64_t pte1) -{ - int i; - - if (!(pte0 & HPTE64_V_LARGE)) { - if (sps->page_shift != 12) { - /* 4kiB page in a non 4kiB segment */ - return 0; - } - /* Normal 4kiB page */ - return 12; - } - - for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) { - const struct ppc_one_page_size *ps = &sps->enc[i]; - uint64_t mask; - - if (!ps->page_shift) { - break; - } - - if (ps->page_shift == 12) { - /* L bit is set so this can't be a 4kiB page */ - continue; - } - - mask = ((1ULL << ps->page_shift) - 1) & HPTE64_R_RPN; - - if ((pte1 & mask) == (ps->pte_enc << HPTE64_R_RPN_SHIFT)) { - return ps->page_shift; - } - } - - return 0; /* Bad page size encoding */ -} - unsigned ppc_hash64_hpte_page_shift_noslb(PowerPCCPU *cpu, uint64_t pte0, uint64_t pte1) { -- cgit v1.2.3 From 073de86aa934d46d596a2367e7501da5500e5b86 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 5 Jul 2016 12:31:48 +1000 Subject: target-ppc: Simplify HPTE matching ppc_hash64_pteg_search() explicitly checks each HPTE's VALID and SECONDARY bits, then uses the HPTE64_V_COMPARE() macro to check the B field and AVPN. However, a small tweak to HPTE64_V_COMPARE() means we can check all of these bits at once with a suitable ptem value. So, consolidate all the comparisons for simplicity. 
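For illustration, a minimal standalone sketch of the consolidated compare; the macro and mask below are the ones this patch introduces in mmu-hash64.h, while hpte_match() and the sample values are hypothetical:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define HPTE64_V_SECONDARY 0x0000000000000002ULL
#define HPTE64_V_VALID     0x0000000000000001ULL
/* The widened mask covers the AVPN and B field plus the H and V bits */
#define HPTE64_V_COMPARE(x, y) (!(((x) ^ (y)) & 0xffffffffffffff83ULL))

/* Build the search key once; each candidate PTE then needs one compare */
static bool hpte_match(uint64_t pte0, uint64_t ptem, bool secondary)
{
    ptem |= HPTE64_V_VALID;
    if (secondary) {
        ptem |= HPTE64_V_SECONDARY;
    }
    return HPTE64_V_COMPARE(pte0, ptem);
}

int main(void)
{
    uint64_t pte0 = 0x0123456789abcd81ULL; /* valid, primary hash */
    uint64_t ptem = 0x0123456789abcd80ULL; /* B | AVPN of the lookup */

    printf("primary match: %d\n", hpte_match(pte0, ptem, false));  /* 1 */
    printf("secondary match: %d\n", hpte_match(pte0, ptem, true)); /* 0 */
    return 0;
}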
Signed-off-by: David Gibson Reviewed-by: Benjamin Herrenschmidt --- target-ppc/mmu-hash64.c | 15 ++++++++------- target-ppc/mmu-hash64.h | 2 +- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/target-ppc/mmu-hash64.c b/target-ppc/mmu-hash64.c index 6d3428ef7d..07d324930c 100644 --- a/target-ppc/mmu-hash64.c +++ b/target-ppc/mmu-hash64.c @@ -488,8 +488,8 @@ static unsigned hpte_page_shift(const struct ppc_one_seg_page_size *sps, } static hwaddr ppc_hash64_pteg_search(PowerPCCPU *cpu, hwaddr hash, - ppc_slb_t *slb, bool secondary, - target_ulong ptem, ppc_hash_pte64_t *pte) + ppc_slb_t *slb, target_ulong ptem, + ppc_hash_pte64_t *pte) { CPUPPCState *env = &cpu->env; int i; @@ -506,9 +506,8 @@ static hwaddr ppc_hash64_pteg_search(PowerPCCPU *cpu, hwaddr hash, pte0 = ppc_hash64_load_hpte0(cpu, token, i); pte1 = ppc_hash64_load_hpte1(cpu, token, i); - if ((pte0 & HPTE64_V_VALID) - && (secondary == !!(pte0 & HPTE64_V_SECONDARY)) - && HPTE64_V_COMPARE(pte0, ptem)) { + /* This compares V, B, H (secondary) and the AVPN */ + if (HPTE64_V_COMPARE(pte0, ptem)) { unsigned pshift = hpte_page_shift(slb->sps, pte0, pte1); /* * If there is no match, ignore the PTE, it could simply * be for a different segment size encoding and the @@ -563,6 +562,7 @@ static hwaddr ppc_hash64_htab_lookup(PowerPCCPU *cpu, hash = vsid ^ (epn >> slb->sps->page_shift); } ptem = (slb->vsid & SLB_VSID_PTEM) | ((epn >> 16) & HPTE64_V_AVPN); + ptem |= HPTE64_V_VALID; /* Page address translation */ qemu_log_mask(CPU_LOG_MMU, @@ -576,17 +576,18 @@ static hwaddr ppc_hash64_htab_lookup(PowerPCCPU *cpu, " vsid=" TARGET_FMT_lx " ptem=" TARGET_FMT_lx " hash=" TARGET_FMT_plx "\n", env->htab_base, env->htab_mask, vsid, ptem, hash); - pte_offset = ppc_hash64_pteg_search(cpu, hash, slb, 0, ptem, pte); + pte_offset = ppc_hash64_pteg_search(cpu, hash, slb, ptem, pte); if (pte_offset == -1) { /* Secondary PTEG lookup */ + ptem |= HPTE64_V_SECONDARY; qemu_log_mask(CPU_LOG_MMU, "1 htab=" TARGET_FMT_plx "/" TARGET_FMT_plx " vsid=" TARGET_FMT_lx " api=" TARGET_FMT_lx " hash=" TARGET_FMT_plx "\n", env->htab_base, env->htab_mask, vsid, ptem, ~hash); - pte_offset = ppc_hash64_pteg_search(cpu, ~hash, slb, 1, ptem, pte); + pte_offset = ppc_hash64_pteg_search(cpu, ~hash, slb, ptem, pte); } return pte_offset; diff --git a/target-ppc/mmu-hash64.h b/target-ppc/mmu-hash64.h index 7fcb555b47..154a306997 100644 --- a/target-ppc/mmu-hash64.h +++ b/target-ppc/mmu-hash64.h @@ -62,7 +62,7 @@ unsigned ppc_hash64_hpte_page_shift_noslb(PowerPCCPU *cpu, #define HPTE64_V_AVPN_SHIFT 7 #define HPTE64_V_AVPN 0x3fffffffffffff80ULL #define HPTE64_V_AVPN_VAL(x) (((x) & HPTE64_V_AVPN) >> HPTE64_V_AVPN_SHIFT) -#define HPTE64_V_COMPARE(x, y) (!(((x) ^ (y)) & 0xffffffffffffff80ULL)) +#define HPTE64_V_COMPARE(x, y) (!(((x) ^ (y)) & 0xffffffffffffff83ULL)) #define HPTE64_V_LARGE 0x0000000000000004ULL #define HPTE64_V_SECONDARY 0x0000000000000002ULL #define HPTE64_V_VALID 0x0000000000000001ULL -- cgit v1.2.3 From 949868633f0454715af1781c0f377413b6ab000e Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 5 Jul 2016 12:31:57 +1000 Subject: target-ppc: Return page shift from PTEG search ppc_hash64_pteg_search() now decodes a PTE's page size encoding, which it didn't previously do. This means we're now double decoding the page size, because we check it in the fault path after ppc_hash64_htab_lookup() returns. To avoid this duplication, have ppc_hash64_pteg_search() and ppc_hash64_htab_lookup() return the page size from the PTE and use that in the callers instead of decoding again.
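As a sketch of the out-parameter pattern this introduces (names and values below are hypothetical; only the shape matches the patch):

#include <stdint.h>
#include <stdio.h>

/* Stand-in for ppc_hash64_htab_lookup(): the page shift the search
 * already decoded travels back through *pshift, so the fault path
 * no longer re-decodes pte1. Returns -1 when nothing matches. */
static int64_t htab_lookup(uint64_t eaddr, unsigned *pshift)
{
    (void)eaddr;
    *pshift = 16;  /* e.g. the search matched a 64K page */
    return 0x40;   /* offset of the matching PTE */
}

int main(void)
{
    unsigned apshift;
    int64_t pte_offset = htab_lookup(0x1000ULL, &apshift);

    if (pte_offset != -1) {
        /* apshift is already valid; no second hpte_page_shift() call */
        printf("PTE at 0x%llx, page shift %u\n",
               (unsigned long long)pte_offset, apshift);
    }
    return 0;
}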
Signed-off-by: David Gibson Reviewed-by: Benjamin Herrenschmidt --- target-ppc/mmu-hash64.c | 33 ++++++++------------------------- 1 file changed, 8 insertions(+), 25 deletions(-) diff --git a/target-ppc/mmu-hash64.c b/target-ppc/mmu-hash64.c index 07d324930c..7c1b169676 100644 --- a/target-ppc/mmu-hash64.c +++ b/target-ppc/mmu-hash64.c @@ -489,7 +489,7 @@ static unsigned hpte_page_shift(const struct ppc_one_seg_page_size *sps, static hwaddr ppc_hash64_pteg_search(PowerPCCPU *cpu, hwaddr hash, ppc_slb_t *slb, target_ulong ptem, - ppc_hash_pte64_t *pte) + ppc_hash_pte64_t *pte, unsigned *pshift) { CPUPPCState *env = &cpu->env; int i; @@ -508,7 +508,7 @@ static hwaddr ppc_hash64_pteg_search(PowerPCCPU *cpu, hwaddr hash, /* This compares V, B, H (secondary) and the AVPN */ if (HPTE64_V_COMPARE(pte0, ptem)) { - unsigned pshift = hpte_page_shift(slb->sps, pte0, pte1); + *pshift = hpte_page_shift(slb->sps, pte0, pte1); /* * If there is no match, ignore the PTE, it could simply * be for a different segment size encoding and the @@ -516,7 +516,7 @@ static hwaddr ppc_hash64_pteg_search(PowerPCCPU *cpu, hwaddr hash, * potentially leave behind PTEs for the wrong base page * size when demoting segments. */ - if (pshift == 0) { + if (*pshift == 0) { continue; } /* We don't do anything with pshift yet as qemu TLB only deals @@ -537,7 +537,7 @@ static hwaddr ppc_hash64_pteg_search(PowerPCCPU *cpu, hwaddr hash, static hwaddr ppc_hash64_htab_lookup(PowerPCCPU *cpu, ppc_slb_t *slb, target_ulong eaddr, - ppc_hash_pte64_t *pte) + ppc_hash_pte64_t *pte, unsigned *pshift) { CPUPPCState *env = &cpu->env; hwaddr pte_offset; @@ -576,7 +576,7 @@ static hwaddr ppc_hash64_htab_lookup(PowerPCCPU *cpu, " vsid=" TARGET_FMT_lx " ptem=" TARGET_FMT_lx " hash=" TARGET_FMT_plx "\n", env->htab_base, env->htab_mask, vsid, ptem, hash); - pte_offset = ppc_hash64_pteg_search(cpu, hash, slb, ptem, pte); + pte_offset = ppc_hash64_pteg_search(cpu, hash, slb, ptem, pte, pshift); if (pte_offset == -1) { /* Secondary PTEG lookup */ @@ -587,7 +587,7 @@ static hwaddr ppc_hash64_htab_lookup(PowerPCCPU *cpu, " hash=" TARGET_FMT_plx "\n", env->htab_base, env->htab_mask, vsid, ptem, ~hash); - pte_offset = ppc_hash64_pteg_search(cpu, ~hash, slb, ptem, pte); + pte_offset = ppc_hash64_pteg_search(cpu, ~hash, slb, ptem, pte, pshift); } return pte_offset; @@ -714,7 +714,7 @@ int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, } /* 4. Locate the PTE in the hash table */ - pte_offset = ppc_hash64_htab_lookup(cpu, slb, eaddr, &pte); + pte_offset = ppc_hash64_htab_lookup(cpu, slb, eaddr, &pte, &apshift); if (pte_offset == -1) { dsisr = 0x40000000; if (rwx == 2) { @@ -730,18 +730,6 @@ int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, qemu_log_mask(CPU_LOG_MMU, "found PTE at offset %08" HWADDR_PRIx "\n", pte_offset); - /* Validate page size encoding */ - apshift = hpte_page_shift(slb->sps, pte.pte0, pte.pte1); - if (!apshift) { - error_report("Bad page size encoding in HPTE 0x%"PRIx64" - 0x%"PRIx64 - " @ 0x%"HWADDR_PRIx, pte.pte0, pte.pte1, pte_offset); - /* Not entirely sure what the right action here, but machine - * check seems reasonable */ - cs->exception_index = POWERPC_EXCP_MCHECK; - env->error_code = 0; - return 1; - } - /* 5. 
Check access permissions */ pp_prot = ppc_hash64_pte_prot(cpu, slb, pte); @@ -815,16 +803,11 @@ hwaddr ppc_hash64_get_phys_page_debug(PowerPCCPU *cpu, target_ulong addr) return -1; } - pte_offset = ppc_hash64_htab_lookup(cpu, slb, addr, &pte); + pte_offset = ppc_hash64_htab_lookup(cpu, slb, addr, &pte, &apshift); if (pte_offset == -1) { return -1; } - apshift = hpte_page_shift(slb->sps, pte.pte0, pte.pte1); - if (!apshift) { - return -1; - } - return deposit64(pte.pte1 & HPTE64_R_RPN, 0, apshift, addr) & TARGET_PAGE_MASK; } -- cgit v1.2.3 From 912acdf487a3c8c0083b904fdb917fe6d79f87a7 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 5 Jul 2016 07:37:08 +1000 Subject: ppc/hash64: Add proper real mode translation support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds proper support for translating real mode addresses based on the combination of HV and LPCR bits. This handles HRMOR offset for hypervisor real mode, and both RMA and VRMA modes for guest real mode. PAPR mode adjusts the offsets appropriately to match the RMA used in TCG, but we need to limit to the max supported by the implementation (16G). This includes some fixes by Cédric Le Goater Signed-off-by: Benjamin Herrenschmidt [dwg: Adjusted for differences in my version of the prereq patches] Signed-off-by: David Gibson --- hw/ppc/spapr.c | 7 ++ target-ppc/cpu.h | 2 + target-ppc/mmu-hash64.c | 165 +++++++++++++++++++++++++++++++++++++++++--- target-ppc/mmu-hash64.h | 3 + target-ppc/translate_init.c | 14 +++- 5 files changed, 181 insertions(+), 10 deletions(-) diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 9c1c2c1858..7f33a1b2b5 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -1771,6 +1771,13 @@ static void ppc_spapr_init(MachineState *machine) spapr->vrma_adjust = 1; spapr->rma_size = MIN(spapr->rma_size, 0x10000000); } + + /* Actually we don't support unbounded RMA anymore since we + * added proper emulation of HV mode. The max we can get is + * 16G which also happens to be what we configure for PAPR + * mode so make sure we don't do anything bigger than that + */ + spapr->rma_size = MIN(spapr->rma_size, 0x400000000ull); } if (spapr->rma_size > node0_size) { diff --git a/target-ppc/cpu.h b/target-ppc/cpu.h index af73bced9f..2666a3f80d 100644 --- a/target-ppc/cpu.h +++ b/target-ppc/cpu.h @@ -1047,6 +1047,8 @@ struct CPUPPCState { uint64_t insns_flags2; #if defined(TARGET_PPC64) struct ppc_segment_page_sizes sps; + ppc_slb_t vrma_slb; + target_ulong rmls; bool ci_large_pages; #endif diff --git a/target-ppc/mmu-hash64.c b/target-ppc/mmu-hash64.c index 7c1b169676..7f314442ca 100644 --- a/target-ppc/mmu-hash64.c +++ b/target-ppc/mmu-hash64.c @@ -681,11 +681,52 @@ int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, assert((rwx == 0) || (rwx == 1) || (rwx == 2)); + /* Note on LPCR usage: 970 uses HID4, but our special variant + * of store_spr copies relevant fields into env->spr[SPR_LPCR]. + * Similarly we filter unimplemented bits when storing into + * LPCR depending on the MMU version. This code can thus just + * use the LPCR "as-is". + */ + /* 1. 
Handle real mode accesses */ if (((rwx == 2) && (msr_ir == 0)) || ((rwx != 2) && (msr_dr == 0))) { - /* Translation is off */ - /* In real mode the top 4 effective address bits are ignored */ + /* Translation is supposedly "off" */ + /* In real mode the top 4 effective address bits are (mostly) ignored */ raddr = eaddr & 0x0FFFFFFFFFFFFFFFULL; + + /* In HV mode, add HRMOR if top EA bit is clear */ + if (msr_hv || !env->has_hv_mode) { + if (!(eaddr >> 63)) { + raddr |= env->spr[SPR_HRMOR]; + } + } else { + /* Otherwise, check VPM for RMA vs VRMA */ + if (env->spr[SPR_LPCR] & LPCR_VPM0) { + slb = &env->vrma_slb; + if (slb->sps) { + goto skip_slb_search; + } + /* Not much else to do here */ + cs->exception_index = POWERPC_EXCP_MCHECK; + env->error_code = 0; + return 1; + } else if (raddr < env->rmls) { + /* RMA. Check bounds in RMLS */ + raddr |= env->spr[SPR_RMOR]; + } else { + /* The access failed, generate the appropriate interrupt */ + if (rwx == 2) { + ppc_hash64_set_isi(cs, env, 0x08000000); + } else { + dsisr = 0x08000000; + if (rwx == 1) { + dsisr |= 0x02000000; + } + ppc_hash64_set_dsi(cs, env, eaddr, dsisr); + } + return 1; + } + } tlb_set_page(cs, eaddr & TARGET_PAGE_MASK, raddr & TARGET_PAGE_MASK, PAGE_READ | PAGE_WRITE | PAGE_EXEC, mmu_idx, TARGET_PAGE_SIZE); @@ -694,7 +735,6 @@ int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, /* 2. Translation is on, so look up the SLB */ slb = slb_lookup(cpu, eaddr); - if (!slb) { if (rwx == 2) { cs->exception_index = POWERPC_EXCP_ISEG; @@ -707,6 +747,8 @@ int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, return 1; } +skip_slb_search: + /* 3. Check for segment level no-execute violation */ if ((rwx == 2) && (slb->vsid & SLB_VSID_N)) { ppc_hash64_set_isi(cs, env, 0x10000000); @@ -789,18 +831,37 @@ hwaddr ppc_hash64_get_phys_page_debug(PowerPCCPU *cpu, target_ulong addr) { CPUPPCState *env = &cpu->env; ppc_slb_t *slb; - hwaddr pte_offset; + hwaddr pte_offset, raddr; ppc_hash_pte64_t pte; unsigned apshift; + /* Handle real mode */ if (msr_dr == 0) { /* In real mode the top 4 effective address bits are ignored */ - return addr & 0x0FFFFFFFFFFFFFFFULL; - } + raddr = addr & 0x0FFFFFFFFFFFFFFFULL; - slb = slb_lookup(cpu, addr); - if (!slb) { - return -1; + /* In HV mode, add HRMOR if top EA bit is clear */ + if ((msr_hv || !env->has_hv_mode) && !(addr >> 63)) { + return raddr | env->spr[SPR_HRMOR]; + } + + /* Otherwise, check VPM for RMA vs VRMA */ + if (env->spr[SPR_LPCR] & LPCR_VPM0) { + slb = &env->vrma_slb; + if (!slb->sps) { + return -1; + } + } else if (raddr < env->rmls) { + /* RMA. Check bounds in RMLS */ + return raddr | env->spr[SPR_RMOR]; + } else { + return -1; + } + } else { + slb = slb_lookup(cpu, addr); + if (!slb) { + return -1; + } } pte_offset = ppc_hash64_htab_lookup(cpu, slb, addr, &pte, &apshift); @@ -846,6 +907,90 @@ void ppc_hash64_tlb_flush_hpte(PowerPCCPU *cpu, tlb_flush(CPU(cpu), 1); } +void ppc_hash64_update_rmls(CPUPPCState *env) +{ + uint64_t lpcr = env->spr[SPR_LPCR]; + + /* + * This is the full 4 bits encoding of POWER8. 
Previous + * CPUs only support a subset of these but the filtering + * is done when writing LPCR + */ + switch ((lpcr & LPCR_RMLS) >> LPCR_RMLS_SHIFT) { + case 0x8: /* 32MB */ + env->rmls = 0x2000000ull; + break; + case 0x3: /* 64MB */ + env->rmls = 0x4000000ull; + break; + case 0x7: /* 128MB */ + env->rmls = 0x8000000ull; + break; + case 0x4: /* 256MB */ + env->rmls = 0x10000000ull; + break; + case 0x2: /* 1GB */ + env->rmls = 0x40000000ull; + break; + case 0x1: /* 16GB */ + env->rmls = 0x400000000ull; + break; + default: + /* What to do here ??? */ + env->rmls = 0; + } +} + +void ppc_hash64_update_vrma(CPUPPCState *env) +{ + const struct ppc_one_seg_page_size *sps = NULL; + target_ulong esid, vsid, lpcr; + ppc_slb_t *slb = &env->vrma_slb; + uint32_t vrmasd; + int i; + + /* First clear it */ + slb->esid = slb->vsid = 0; + slb->sps = NULL; + + /* Is VRMA enabled ? */ + lpcr = env->spr[SPR_LPCR]; + if (!(lpcr & LPCR_VPM0)) { + return; + } + + /* Make one up. Mostly ignore the ESID which will not be + * needed for translation + */ + vsid = SLB_VSID_VRMA; + vrmasd = (lpcr & LPCR_VRMASD) >> LPCR_VRMASD_SHIFT; + vsid |= (vrmasd << 4) & (SLB_VSID_L | SLB_VSID_LP); + esid = SLB_ESID_V; + + for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) { + const struct ppc_one_seg_page_size *sps1 = &env->sps.sps[i]; + + if (!sps1->page_shift) { + break; + } + + if ((vsid & SLB_VSID_LLP_MASK) == sps1->slb_enc) { + sps = sps1; + break; + } + } + + if (!sps) { + error_report("Bad page size encoding esid 0x"TARGET_FMT_lx + " vsid 0x"TARGET_FMT_lx, esid, vsid); + return; + } + + slb->vsid = vsid; + slb->esid = esid; + slb->sps = sps; +} + void helper_store_lpcr(CPUPPCState *env, target_ulong val) { uint64_t lpcr = 0; @@ -901,4 +1046,6 @@ void helper_store_lpcr(CPUPPCState *env, target_ulong val) ; } env->spr[SPR_LPCR] = lpcr; + ppc_hash64_update_rmls(env); + ppc_hash64_update_vrma(env); } diff --git a/target-ppc/mmu-hash64.h b/target-ppc/mmu-hash64.h index 154a306997..3a7476b30a 100644 --- a/target-ppc/mmu-hash64.h +++ b/target-ppc/mmu-hash64.h @@ -18,6 +18,8 @@ void ppc_hash64_tlb_flush_hpte(PowerPCCPU *cpu, target_ulong pte0, target_ulong pte1); unsigned ppc_hash64_hpte_page_shift_noslb(PowerPCCPU *cpu, uint64_t pte0, uint64_t pte1); +void ppc_hash64_update_vrma(CPUPPCState *env); +void ppc_hash64_update_rmls(CPUPPCState *env); #endif /* @@ -36,6 +38,7 @@ unsigned ppc_hash64_hpte_page_shift_noslb(PowerPCCPU *cpu, #define SLB_VSID_B_256M 0x0000000000000000ULL #define SLB_VSID_B_1T 0x4000000000000000ULL #define SLB_VSID_VSID 0x3FFFFFFFFFFFF000ULL +#define SLB_VSID_VRMA (0x0001FFFFFF000000ULL | SLB_VSID_B_1T) #define SLB_VSID_PTEM (SLB_VSID_B | SLB_VSID_VSID) #define SLB_VSID_KS 0x0000000000000800ULL #define SLB_VSID_KP 0x0000000000000400ULL diff --git a/target-ppc/translate_init.c b/target-ppc/translate_init.c index a06bf50b65..8f257fb74a 100644 --- a/target-ppc/translate_init.c +++ b/target-ppc/translate_init.c @@ -8791,11 +8791,19 @@ void cpu_ppc_set_papr(PowerPCCPU *cpu) /* Set emulated LPCR to not send interrupts to hypervisor. Note that * under KVM, the actual HW LPCR will be set differently by KVM itself, * the settings below ensure proper operations with TCG in absence of - * a real hypervisor + * a real hypervisor. + * + * Clearing VPM0 will also cause us to use RMOR in mmu-hash64.c for + * real mode accesses, which thankfully defaults to 0 and isn't + * accessible in guest mode. 
*/ lpcr->default_value &= ~(LPCR_VPM0 | LPCR_VPM1 | LPCR_ISL | LPCR_KBV); lpcr->default_value |= LPCR_LPES0 | LPCR_LPES1; + /* Set RMLS to the max (ie, 16G) */ + lpcr->default_value &= ~LPCR_RMLS; + lpcr->default_value |= 1ull << LPCR_RMLS_SHIFT; + /* P7 and P8 has slightly different PECE bits, mostly because P8 adds * bit 47 and 48 which are reserved on P7. Here we set them all, which * will work as expected for both implementations @@ -8811,6 +8819,10 @@ void cpu_ppc_set_papr(PowerPCCPU *cpu) /* Set a full AMOR so guest can use the AMR as it sees fit */ env->spr[SPR_AMOR] = amor->default_value = 0xffffffffffffffffull; + /* Update some env bits based on new LPCR value */ + ppc_hash64_update_rmls(env); + ppc_hash64_update_vrma(env); + /* Tell KVM that we're in PAPR mode */ if (kvm_enabled()) { kvmppc_set_papr(cpu); -- cgit v1.2.3 From 2c7ad80443e9747eb85b508be01cded958191bad Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 4 Jul 2016 17:44:11 +1000 Subject: ppc/hash64: Fix support for LPCR:ISL We need to ignore the segment page size and essentially treat all pages as coming from a 4K segment. Signed-off-by: Benjamin Herrenschmidt [dwg: Adjusted for differences in my version of the prereq patches] Signed-off-by: David Gibson --- target-ppc/mmu-hash64.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/target-ppc/mmu-hash64.c b/target-ppc/mmu-hash64.c index 7f314442ca..82c2186bcf 100644 --- a/target-ppc/mmu-hash64.c +++ b/target-ppc/mmu-hash64.c @@ -488,7 +488,8 @@ static unsigned hpte_page_shift(const struct ppc_one_seg_page_size *sps, } static hwaddr ppc_hash64_pteg_search(PowerPCCPU *cpu, hwaddr hash, - ppc_slb_t *slb, target_ulong ptem, + const struct ppc_one_seg_page_size *sps, + target_ulong ptem, ppc_hash_pte64_t *pte, unsigned *pshift) { CPUPPCState *env = &cpu->env; @@ -508,7 +509,7 @@ static hwaddr ppc_hash64_pteg_search(PowerPCCPU *cpu, hwaddr hash, /* This compares V, B, H (secondary) and the AVPN */ if (HPTE64_V_COMPARE(pte0, ptem)) { - *pshift = hpte_page_shift(slb->sps, pte0, pte1); + *pshift = hpte_page_shift(sps, pte0, pte1); /* * If there is no match, ignore the PTE, it could simply * be for a different segment size encoding and the @@ -543,23 +544,31 @@ static hwaddr ppc_hash64_htab_lookup(PowerPCCPU *cpu, hwaddr pte_offset; hwaddr hash; uint64_t vsid, epnmask, epn, ptem; + const struct ppc_one_seg_page_size *sps = slb->sps; /* The SLB store path should prevent any bad page size encodings * getting in there, so: */ - assert(slb->sps); + assert(sps); - epnmask = ~((1ULL << slb->sps->page_shift) - 1); + /* If ISL is set in LPCR we need to clamp the page size to 4K */ + if (env->spr[SPR_LPCR] & LPCR_ISL) { + /* We assume that when using TCG, 4k is first entry of SPS */ + sps = &env->sps.sps[0]; + assert(sps->page_shift == 12); + } + + epnmask = ~((1ULL << sps->page_shift) - 1); if (slb->vsid & SLB_VSID_B) { /* 1TB segment */ vsid = (slb->vsid & SLB_VSID_VSID) >> SLB_VSID_SHIFT_1T; epn = (eaddr & ~SEGMENT_MASK_1T) & epnmask; - hash = vsid ^ (vsid << 25) ^ (epn >> slb->sps->page_shift); + hash = vsid ^ (vsid << 25) ^ (epn >> sps->page_shift); } else { /* 256M segment */ vsid = (slb->vsid & SLB_VSID_VSID) >> SLB_VSID_SHIFT; epn = (eaddr & ~SEGMENT_MASK_256M) & epnmask; - hash = vsid ^ (epn >> slb->sps->page_shift); + hash = vsid ^ (epn >> sps->page_shift); } ptem = (slb->vsid & SLB_VSID_PTEM) | ((epn >> 16) & HPTE64_V_AVPN); ptem |= HPTE64_V_VALID; @@ -576,7 +585,7 @@ static hwaddr ppc_hash64_htab_lookup(PowerPCCPU *cpu, 
" vsid=" TARGET_FMT_lx " ptem=" TARGET_FMT_lx " hash=" TARGET_FMT_plx "\n", env->htab_base, env->htab_mask, vsid, ptem, hash); - pte_offset = ppc_hash64_pteg_search(cpu, hash, slb, ptem, pte, pshift); + pte_offset = ppc_hash64_pteg_search(cpu, hash, sps, ptem, pte, pshift); if (pte_offset == -1) { /* Secondary PTEG lookup */ @@ -587,7 +596,7 @@ static hwaddr ppc_hash64_htab_lookup(PowerPCCPU *cpu, " hash=" TARGET_FMT_plx "\n", env->htab_base, env->htab_mask, vsid, ptem, ~hash); - pte_offset = ppc_hash64_pteg_search(cpu, ~hash, slb, ptem, pte, pshift); + pte_offset = ppc_hash64_pteg_search(cpu, ~hash, sps, ptem, pte, pshift); } return pte_offset; -- cgit v1.2.3