From 158c87e5de4f34840bf8115789f09806e7e14b94 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Mon, 4 Jul 2016 09:20:12 +1000 Subject: ppc: Fix xsrdpi, xvrdpi and xvrspi rounding xsrdpi, xvrdpi and xvrspi use the round ties away method, not round nearest even. Signed-off-by: Anton Blanchard Signed-off-by: David Gibson --- target-ppc/fpu_helper.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/target-ppc/fpu_helper.c b/target-ppc/fpu_helper.c index 4ef893be2c..d9795d04d0 100644 --- a/target-ppc/fpu_helper.c +++ b/target-ppc/fpu_helper.c @@ -2689,19 +2689,19 @@ void helper_##op(CPUPPCState *env, uint32_t opcode) \ helper_float_check_status(env); \ } -VSX_ROUND(xsrdpi, 1, float64, VsrD(0), float_round_nearest_even, 1) +VSX_ROUND(xsrdpi, 1, float64, VsrD(0), float_round_ties_away, 1) VSX_ROUND(xsrdpic, 1, float64, VsrD(0), FLOAT_ROUND_CURRENT, 1) VSX_ROUND(xsrdpim, 1, float64, VsrD(0), float_round_down, 1) VSX_ROUND(xsrdpip, 1, float64, VsrD(0), float_round_up, 1) VSX_ROUND(xsrdpiz, 1, float64, VsrD(0), float_round_to_zero, 1) -VSX_ROUND(xvrdpi, 2, float64, VsrD(i), float_round_nearest_even, 0) +VSX_ROUND(xvrdpi, 2, float64, VsrD(i), float_round_ties_away, 0) VSX_ROUND(xvrdpic, 2, float64, VsrD(i), FLOAT_ROUND_CURRENT, 0) VSX_ROUND(xvrdpim, 2, float64, VsrD(i), float_round_down, 0) VSX_ROUND(xvrdpip, 2, float64, VsrD(i), float_round_up, 0) VSX_ROUND(xvrdpiz, 2, float64, VsrD(i), float_round_to_zero, 0) -VSX_ROUND(xvrspi, 4, float32, VsrW(i), float_round_nearest_even, 0) +VSX_ROUND(xvrspi, 4, float32, VsrW(i), float_round_ties_away, 0) VSX_ROUND(xvrspic, 4, float32, VsrW(i), FLOAT_ROUND_CURRENT, 0) VSX_ROUND(xvrspim, 4, float32, VsrW(i), float_round_down, 0) VSX_ROUND(xvrspip, 4, float32, VsrW(i), float_round_up, 0) -- cgit v1.2.3 From 7093645a843e5da1a750bc451dd8c9107d595c61 Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Fri, 1 Jul 2016 10:44:39 +0530 Subject: spapr: Ensure thread0 of CPU core is always realized first During CPU core realization, we create all the thread objects and parent them to the core object in a loop. However, the realization of thread objects is done separately by walking the threads of a core using object_child_foreach(). With this, there is no guarantee on the order in which the child thread objects get realized. Since CPU device tree properties are currently derived from the CPU thread object, we assume thread0 of the core to be the representative thread of the core when creating device tree properties for the core. If thread0 is not the first thread that gets realized, then we would end up having an incorrect dt_id for the core and this causes hotplug failures from the guest. Fix this by realizing each thread object by walking the core's thread object list thereby ensuring that thread0 and other threads are always realized in the correct order. Future TODO: CPU DT nodes are per-core properties and we should ideally base the creation of CPU DT nodes on core objects rather than the thread objects. 
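A note on the underlying hazard: object_child_foreach() makes no promise about visiting order, so nothing guaranteed that thread[0] was realized first. A minimal standalone GLib sketch of the general problem (illustrative keys, not QEMU code) - hash-table iteration order need not match insertion order:

#include <glib.h>
#include <stdio.h>

static void visit(gpointer key, gpointer value, gpointer opaque)
{
    printf("visited %s\n", (const char *)key);
}

int main(void)
{
    GHashTable *children = g_hash_table_new(g_str_hash, g_str_equal);

    /* Inserted in order thread[0]..thread[3]... */
    g_hash_table_insert(children, "thread[0]", NULL);
    g_hash_table_insert(children, "thread[1]", NULL);
    g_hash_table_insert(children, "thread[2]", NULL);
    g_hash_table_insert(children, "thread[3]", NULL);

    /* ...but g_hash_table_foreach() walks in hash order, so there is
     * no guarantee that "thread[0]" is printed first. */
    g_hash_table_foreach(children, visit, NULL);

    g_hash_table_destroy(children);
    return 0;
}

The indexed loop in the patch below sidesteps the question entirely by never delegating the iteration.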
Signed-off-by: Bharata B Rao Reviewed-by: Greg Kurz Signed-off-by: David Gibson --- hw/ppc/spapr_cpu_core.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/hw/ppc/spapr_cpu_core.c b/hw/ppc/spapr_cpu_core.c index a384db5204..70b6b0b5ee 100644 --- a/hw/ppc/spapr_cpu_core.c +++ b/hw/ppc/spapr_cpu_core.c @@ -259,9 +259,9 @@ out: error_propagate(errp, local_err); } -static int spapr_cpu_core_realize_child(Object *child, void *opaque) +static void spapr_cpu_core_realize_child(Object *child, Error **errp) { - Error **errp = opaque, *local_err = NULL; + Error *local_err = NULL; sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); CPUState *cs = CPU(child); PowerPCCPU *cpu = POWERPC_CPU(cs); @@ -269,15 +269,14 @@ static int spapr_cpu_core_realize_child(Object *child, void *opaque) object_property_set_bool(child, true, "realized", &local_err); if (local_err) { error_propagate(errp, local_err); - return 1; + return; } spapr_cpu_init(spapr, cpu, &local_err); if (local_err) { error_propagate(errp, local_err); - return 1; + return; } - return 0; } static void spapr_cpu_core_realize(DeviceState *dev, Error **errp) @@ -287,13 +286,13 @@ static void spapr_cpu_core_realize(DeviceState *dev, Error **errp) const char *typename = object_class_get_name(sc->cpu_class); size_t size = object_type_get_instance_size(typename); Error *local_err = NULL; - Object *obj; - int i; + void *obj; + int i, j; sc->threads = g_malloc0(size * cc->nr_threads); for (i = 0; i < cc->nr_threads; i++) { char id[32]; - void *obj = sc->threads + i * size; + obj = sc->threads + i * size; object_initialize(obj, size, typename); snprintf(id, sizeof(id), "thread[%d]", i); @@ -303,12 +302,16 @@ static void spapr_cpu_core_realize(DeviceState *dev, Error **errp) } object_unref(obj); } - object_child_foreach(OBJECT(dev), spapr_cpu_core_realize_child, &local_err); - if (local_err) { - goto err; - } else { - return; + + for (j = 0; j < cc->nr_threads; j++) { + obj = sc->threads + j * size; + + spapr_cpu_core_realize_child(obj, &local_err); + if (local_err) { + goto err; + } } + return; err: while (--i >= 0) { -- cgit v1.2.3 From c4e6c42353fe735add45b790f8d3a323590f7cab Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Sat, 2 Jul 2016 00:41:32 +0200 Subject: ppc: simplify max_smt initialization in ppc_cpu_realizefn() kvmppc_smt_threads() returns 1 if KVM is not enabled. Signed-off-by: Greg Kurz Signed-off-by: David Gibson --- target-ppc/translate_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target-ppc/translate_init.c b/target-ppc/translate_init.c index 843f19b748..a06bf50b65 100644 --- a/target-ppc/translate_init.c +++ b/target-ppc/translate_init.c @@ -9516,7 +9516,7 @@ static void ppc_cpu_realizefn(DeviceState *dev, Error **errp) PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu); Error *local_err = NULL; #if !defined(CONFIG_USER_ONLY) - int max_smt = kvm_enabled() ? kvmppc_smt_threads() : 1; + int max_smt = kvmppc_smt_threads(); #endif #if !defined(CONFIG_USER_ONLY) -- cgit v1.2.3 From 606b54986df4e3964eee2d74460bd06ed2f384e5 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 4 Jul 2016 13:33:03 +1000 Subject: spapr_iommu: Realloc guest visible TCE table when starting/stopping listening The sPAPR TCE tables manage two copies when VFIO is using an IOMMU - a guest view of the table and a hardware TCE table. If there is no VFIO presence in the address space, then just the guest view is used; in this case, it is allocated in KVM.
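For orientation: a TCE table is conceptually a flat array of 64-bit entries indexed by IO bus address (IOBA). A hedged sketch of what the guest-view lookup amounts to; the struct fields and permission mask below are illustrative, not QEMU's:

#include <stdint.h>

#define TCE_RW_MASK 0x3ULL                /* low bits: read/write permission */

typedef struct {
    uint64_t *table;                      /* guest view: one entry per IO page */
    uint32_t nb_table;                    /* number of entries */
    uint32_t page_shift;                  /* IO page size, e.g. 12 for 4K */
} TceTable;

/* Translate an IO bus address to a guest physical address, or -1. */
static uint64_t tce_translate(const TceTable *t, uint64_t ioba)
{
    uint64_t page_mask = (1ULL << t->page_shift) - 1;
    uint64_t index = ioba >> t->page_shift;
    uint64_t tce;

    if (index >= t->nb_table) {
        return (uint64_t)-1;
    }
    tce = t->table[index];
    if (!(tce & TCE_RW_MASK)) {
        return (uint64_t)-1;              /* entry not mapped */
    }
    /* page frame from the entry, byte offset from the original address */
    return (tce & ~page_mask) | (ioba & page_mask);
}

The real lookup is spapr_tce_translate_iommu(), visible as the .translate callback in the diff below; this patch is only about where that backing array lives (KVM or userspace).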
However, since there is no support yet for VFIO in KVM TCE hypercalls, when we start using VFIO, we need to move the guest view from KVM to userspace; and we need to do this for every IOMMU on a bus with VFIO devices. This implements the callbacks for the sPAPR IOMMU: notify_started() reallocates the guest view to userspace, notify_stopped() does the opposite. This removes the explicit spapr_tce_set_need_vfio() call from the PCI hotplug path as the new callbacks do this better - they notify the IOMMU at the exact moment when the configuration is changed, and this also includes the case of PCI hot unplug. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Acked-by: Alex Williamson Signed-off-by: David Gibson --- hw/ppc/spapr_iommu.c | 12 ++++++++++++ hw/ppc/spapr_pci.c | 6 ------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c index e230bacae1..d57b05d5c0 100644 --- a/hw/ppc/spapr_iommu.c +++ b/hw/ppc/spapr_iommu.c @@ -156,6 +156,16 @@ static uint64_t spapr_tce_get_min_page_size(MemoryRegion *iommu) return 1ULL << tcet->page_shift; } +static void spapr_tce_notify_started(MemoryRegion *iommu) +{ + spapr_tce_set_need_vfio(container_of(iommu, sPAPRTCETable, iommu), true); +} + +static void spapr_tce_notify_stopped(MemoryRegion *iommu) +{ + spapr_tce_set_need_vfio(container_of(iommu, sPAPRTCETable, iommu), false); +} + static int spapr_tce_table_post_load(void *opaque, int version_id) { sPAPRTCETable *tcet = SPAPR_TCE_TABLE(opaque); @@ -236,6 +246,8 @@ static const VMStateDescription vmstate_spapr_tce_table = { static MemoryRegionIOMMUOps spapr_iommu_ops = { .translate = spapr_tce_translate_iommu, .get_min_page_size = spapr_tce_get_min_page_size, + .notify_started = spapr_tce_notify_started, + .notify_stopped = spapr_tce_notify_stopped, }; static int spapr_tce_table_realize(DeviceState *dev) diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c index 8c1e6b17c3..cbb7cdd774 100644 --- a/hw/ppc/spapr_pci.c +++ b/hw/ppc/spapr_pci.c @@ -1087,12 +1087,6 @@ static void spapr_phb_add_pci_device(sPAPRDRConnector *drc, void *fdt = NULL; int fdt_start_offset = 0, fdt_size; - if (object_dynamic_cast(OBJECT(pdev), "vfio-pci")) { - sPAPRTCETable *tcet = spapr_tce_find_by_liobn(phb->dma_liobn); - - spapr_tce_set_need_vfio(tcet, true); - } - fdt = create_device_tree(&fdt_size); fdt_start_offset = spapr_create_pci_child_dt(phb, pdev, fdt, 0); if (!fdt_start_offset) { -- cgit v1.2.3 From 318f67ce13710a09c6dcf34da7b6b0ebc845c5c9 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 4 Jul 2016 13:33:04 +1000 Subject: vfio: spapr: Add DMA memory preregistering (SPAPR IOMMU v2) This makes use of the new "memory registering" feature. The idea is to give userspace the ability to notify the host kernel about pages which are going to be used for DMA. Having this information, the host kernel can pin them all once per user process, do locked pages accounting (once) and not spend time on doing that in real time with possible failures which cannot be handled nicely in some cases. This adds a prereg memory listener which listens on address_space_memory and notifies a VFIO container about memory which needs to be pinned/unpinned. VFIO MMIO regions (i.e. "skip dump" regions) are skipped. The feature is only enabled for SPAPR IOMMU v2. Host kernel changes are required. Since v2 does not need/support VFIO_IOMMU_ENABLE, this does not call it when v2 is detected and enabled.
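The kernel interface behind this is a pair of ioctls on the container fd. A minimal sketch of the register side, assuming an already-open v2-capable container (error handling elided; the full listener is in the diff below):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Preregister a userspace address range for DMA: the host kernel pins
 * the pages and does the locked-memory accounting once, up front. */
static int prereg_range(int container_fd, void *vaddr, uint64_t size)
{
    struct vfio_iommu_spapr_register_memory reg = {
        .argsz = sizeof(reg),
        .flags = 0,
        .vaddr = (uintptr_t)vaddr,
        .size = size,
    };

    /* the matching teardown uses VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY */
    return ioctl(container_fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
}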
This enforces guest RAM blocks to be host page size aligned; however this is not new as KVM already requires memory slots to be host page size aligned. Signed-off-by: Alexey Kardashevskiy [dwg: Fix compile error on 32-bit host] Signed-off-by: David Gibson --- hw/vfio/Makefile.objs | 1 + hw/vfio/common.c | 40 +++++++++--- hw/vfio/spapr.c | 139 ++++++++++++++++++++++++++++++++++++++++++ hw/vfio/trace-events | 6 ++ include/hw/vfio/vfio-common.h | 4 ++ 5 files changed, 181 insertions(+), 9 deletions(-) create mode 100644 hw/vfio/spapr.c diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs index ceddbb8f99..c25e32b029 100644 --- a/hw/vfio/Makefile.objs +++ b/hw/vfio/Makefile.objs @@ -4,4 +4,5 @@ obj-$(CONFIG_PCI) += pci.o pci-quirks.o obj-$(CONFIG_SOFTMMU) += platform.o obj-$(CONFIG_SOFTMMU) += calxeda-xgmac.o obj-$(CONFIG_SOFTMMU) += amd-xgbe.o +obj-$(CONFIG_SOFTMMU) += spapr.o endif diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 7be638e0e3..46381e6242 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -503,6 +503,9 @@ static const MemoryListener vfio_memory_listener = { static void vfio_listener_release(VFIOContainer *container) { memory_listener_unregister(&container->listener); + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { + memory_listener_unregister(&container->prereg_listener); + } } static struct vfio_info_cap_header * @@ -861,8 +864,8 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) goto free_container_exit; } - ret = ioctl(fd, VFIO_SET_IOMMU, - v2 ? VFIO_TYPE1v2_IOMMU : VFIO_TYPE1_IOMMU); + container->iommu_type = v2 ? VFIO_TYPE1v2_IOMMU : VFIO_TYPE1_IOMMU; + ret = ioctl(fd, VFIO_SET_IOMMU, container->iommu_type); if (ret) { error_report("vfio: failed to set iommu for container: %m"); ret = -errno; @@ -887,8 +890,10 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) if ((ret == 0) && (info.flags & VFIO_IOMMU_INFO_PGSIZES)) { container->iova_pgsizes = info.iova_pgsizes; } - } else if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU)) { + } else if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU) || + ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_v2_IOMMU)) { struct vfio_iommu_spapr_tce_info info; + bool v2 = !!ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_v2_IOMMU); ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd); if (ret) { @@ -896,7 +901,9 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) ret = -errno; goto free_container_exit; } - ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU); + container->iommu_type = + v2 ? VFIO_SPAPR_TCE_v2_IOMMU : VFIO_SPAPR_TCE_IOMMU; + ret = ioctl(fd, VFIO_SET_IOMMU, container->iommu_type); if (ret) { error_report("vfio: failed to set iommu for container: %m"); ret = -errno; @@ -908,11 +915,23 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) * when container fd is closed so we do not call it explicitly * in this file. 
*/ - ret = ioctl(fd, VFIO_IOMMU_ENABLE); - if (ret) { - error_report("vfio: failed to enable container: %m"); - ret = -errno; - goto free_container_exit; + if (!v2) { + ret = ioctl(fd, VFIO_IOMMU_ENABLE); + if (ret) { + error_report("vfio: failed to enable container: %m"); + ret = -errno; + goto free_container_exit; + } + } else { + container->prereg_listener = vfio_prereg_listener; + + memory_listener_register(&container->prereg_listener, + &address_space_memory); + if (container->error) { + memory_listener_unregister(&container->prereg_listener); + error_report("vfio: RAM memory listener initialization failed for container"); + goto free_container_exit; + } } /* @@ -925,6 +944,9 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) if (ret) { error_report("vfio: VFIO_IOMMU_SPAPR_TCE_GET_INFO failed: %m"); ret = -errno; + if (v2) { + memory_listener_unregister(&container->prereg_listener); + } goto free_container_exit; } container->min_iova = info.dma32_window_start; diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c new file mode 100644 index 0000000000..7d64443b7f --- /dev/null +++ b/hw/vfio/spapr.c @@ -0,0 +1,139 @@ +/* + * DMA memory preregistration + * + * Authors: + * Alexey Kardashevskiy + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include +#include + +#include "hw/vfio/vfio-common.h" +#include "hw/hw.h" +#include "qemu/error-report.h" +#include "trace.h" + +static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section) +{ + if (memory_region_is_iommu(section->mr)) { + hw_error("Cannot possibly preregister IOMMU memory"); + } + + return !memory_region_is_ram(section->mr) || + memory_region_is_skip_dump(section->mr); +} + +static void *vfio_prereg_gpa_to_vaddr(MemoryRegionSection *section, hwaddr gpa) +{ + return memory_region_get_ram_ptr(section->mr) + + section->offset_within_region + + (gpa - section->offset_within_address_space); +} + +static void vfio_prereg_listener_region_add(MemoryListener *listener, + MemoryRegionSection *section) +{ + VFIOContainer *container = container_of(listener, VFIOContainer, + prereg_listener); + const hwaddr gpa = section->offset_within_address_space; + hwaddr end; + int ret; + hwaddr page_mask = qemu_real_host_page_mask; + struct vfio_iommu_spapr_register_memory reg = { + .argsz = sizeof(reg), + .flags = 0, + }; + + if (vfio_prereg_listener_skipped_section(section)) { + trace_vfio_prereg_listener_region_add_skip( + section->offset_within_address_space, + section->offset_within_address_space + + int128_get64(int128_sub(section->size, int128_one()))); + return; + } + + if (unlikely((section->offset_within_address_space & ~page_mask) || + (section->offset_within_region & ~page_mask) || + (int128_get64(section->size) & ~page_mask))) { + error_report("%s received unaligned region", __func__); + return; + } + + end = section->offset_within_address_space + int128_get64(section->size); + if (gpa >= end) { + return; + } + + memory_region_ref(section->mr); + + reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa); + reg.size = end - gpa; + + ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, ®); + trace_vfio_prereg_register(reg.vaddr, reg.size, ret ? -errno : 0); + if (ret) { + /* + * On the initfn path, store the first error in the container so we + * can gracefully fail. Runtime, there's not much we can do other + * than throw a hardware error. 
+ */ + if (!container->initialized) { + if (!container->error) { + container->error = ret; + } + } else { + hw_error("vfio: Memory registering failed, unable to continue"); + } + } +} + +static void vfio_prereg_listener_region_del(MemoryListener *listener, + MemoryRegionSection *section) +{ + VFIOContainer *container = container_of(listener, VFIOContainer, + prereg_listener); + const hwaddr gpa = section->offset_within_address_space; + hwaddr end; + int ret; + hwaddr page_mask = qemu_real_host_page_mask; + struct vfio_iommu_spapr_register_memory reg = { + .argsz = sizeof(reg), + .flags = 0, + }; + + if (vfio_prereg_listener_skipped_section(section)) { + trace_vfio_prereg_listener_region_del_skip( + section->offset_within_address_space, + section->offset_within_address_space + + int128_get64(int128_sub(section->size, int128_one()))); + return; + } + + if (unlikely((section->offset_within_address_space & ~page_mask) || + (section->offset_within_region & ~page_mask) || + (int128_get64(section->size) & ~page_mask))) { + error_report("%s received unaligned region", __func__); + return; + } + + end = section->offset_within_address_space + int128_get64(section->size); + if (gpa >= end) { + return; + } + + reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa); + reg.size = end - gpa; + + ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, ®); + trace_vfio_prereg_unregister(reg.vaddr, reg.size, ret ? -errno : 0); +} + +const MemoryListener vfio_prereg_listener = { + .region_add = vfio_prereg_listener_region_add, + .region_del = vfio_prereg_listener_region_del, +}; diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index a768fb54ec..0b02a3bd54 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -115,3 +115,9 @@ vfio_platform_populate_interrupts(int pin, int count, int flags) "- IRQ index %d vfio_intp_interrupt_set_pending(int index) "irq %d is set PENDING" vfio_platform_start_level_irqfd_injection(int index, int fd, int resamplefd) "IRQ index=%d, fd = %d, resamplefd = %d" vfio_platform_start_edge_irqfd_injection(int index, int fd) "IRQ index=%d, fd = %d" + +# hw/vfio/spapr.c +vfio_prereg_listener_region_add_skip(uint64_t start, uint64_t end) "%"PRIx64" - %"PRIx64 +vfio_prereg_listener_region_del_skip(uint64_t start, uint64_t end) "%"PRIx64" - %"PRIx64 +vfio_prereg_register(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d" +vfio_prereg_unregister(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d" diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 0610377789..405c3b29d6 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -73,6 +73,8 @@ typedef struct VFIOContainer { VFIOAddressSpace *space; int fd; /* /dev/vfio/vfio, empowered by the attached groups */ MemoryListener listener; + MemoryListener prereg_listener; + unsigned iommu_type; int error; bool initialized; /* @@ -158,4 +160,6 @@ int vfio_get_region_info(VFIODevice *vbasedev, int index, int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type, uint32_t subtype, struct vfio_region_info **info); #endif +extern const MemoryListener vfio_prereg_listener; + #endif /* !HW_VFIO_VFIO_COMMON_H */ -- cgit v1.2.3 From f4ec5e26edbd4c7509623ec882c344dc334bc1b2 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 4 Jul 2016 13:33:05 +1000 Subject: vfio: Add host side DMA window capabilities There are going to be multiple IOMMUs per a container. 
This moves the single host IOMMU parameter set to a list of VFIOHostDMAWindow. This should cause no behavioral change and will be used later by the SPAPR TCE IOMMU v2 which will also add a vfio_host_win_del() helper. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: David Gibson --- hw/vfio/common.c | 60 +++++++++++++++++++++++++++++++------------ include/hw/vfio/vfio-common.h | 10 ++++++-- 2 files changed, 52 insertions(+), 18 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 46381e6242..7c8a5310ae 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -28,6 +28,7 @@ #include "exec/memory.h" #include "hw/hw.h" #include "qemu/error-report.h" +#include "qemu/range.h" #include "sysemu/kvm.h" #ifdef CONFIG_KVM #include "linux/kvm.h" @@ -241,6 +242,29 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova, return -errno; } +static void vfio_host_win_add(VFIOContainer *container, + hwaddr min_iova, hwaddr max_iova, + uint64_t iova_pgsizes) +{ + VFIOHostDMAWindow *hostwin; + + QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + if (ranges_overlap(hostwin->min_iova, + hostwin->max_iova - hostwin->min_iova + 1, + min_iova, + max_iova - min_iova + 1)) { + hw_error("%s: Overlapped IOMMU are not enabled", __func__); + } + } + + hostwin = g_malloc0(sizeof(*hostwin)); + + hostwin->min_iova = min_iova; + hostwin->max_iova = max_iova; + hostwin->iova_pgsizes = iova_pgsizes; + QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next); +} + static bool vfio_listener_skipped_section(MemoryRegionSection *section) { return (!memory_region_is_ram(section->mr) && @@ -329,6 +353,8 @@ static void vfio_listener_region_add(MemoryListener *listener, Int128 llend, llsize; void *vaddr; int ret; + VFIOHostDMAWindow *hostwin; + bool hostwin_found; if (vfio_listener_skipped_section(section)) { trace_vfio_listener_region_add_skip( @@ -354,7 +380,15 @@ static void vfio_listener_region_add(MemoryListener *listener, } end = int128_get64(int128_sub(llend, int128_one())); - if ((iova < container->min_iova) || (end > container->max_iova)) { + hostwin_found = false; + QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + if (hostwin->min_iova <= iova && end <= hostwin->max_iova) { + hostwin_found = true; + break; + } + } + + if (!hostwin_found) { error_report("vfio: IOMMU container %p can't map guest IOVA region" " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, end); @@ -369,10 +403,6 @@ static void vfio_listener_region_add(MemoryListener *listener, trace_vfio_listener_region_add_iommu(iova, end); /* - * FIXME: We should do some checking to see if the - * capabilities of the host VFIO IOMMU are adequate to model - * the guest IOMMU - * * FIXME: For VFIO iommu types which have KVM acceleration to * avoid bouncing all map/unmaps through qemu this way, this * would be the right place to wire that up (tell the KVM @@ -879,17 +909,14 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) * existing Type1 IOMMUs generally support any IOVA we're * going to actually try in practice. 
*/ - container->min_iova = 0; - container->max_iova = (hwaddr)-1; - - /* Assume just 4K IOVA page size */ - container->iova_pgsizes = 0x1000; info.argsz = sizeof(info); ret = ioctl(fd, VFIO_IOMMU_GET_INFO, &info); /* Ignore errors */ - if ((ret == 0) && (info.flags & VFIO_IOMMU_INFO_PGSIZES)) { - container->iova_pgsizes = info.iova_pgsizes; + if (ret || !(info.flags & VFIO_IOMMU_INFO_PGSIZES)) { + /* Assume 4k IOVA page size */ + info.iova_pgsizes = 4096; } + vfio_host_win_add(container, 0, (hwaddr)-1, info.iova_pgsizes); } else if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU) || ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_v2_IOMMU)) { struct vfio_iommu_spapr_tce_info info; @@ -949,11 +976,12 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) } goto free_container_exit; } - container->min_iova = info.dma32_window_start; - container->max_iova = container->min_iova + info.dma32_window_size - 1; - /* Assume just 4K IOVA pages for now */ - container->iova_pgsizes = 0x1000; + /* The default table uses 4K pages */ + vfio_host_win_add(container, info.dma32_window_start, + info.dma32_window_start + + info.dma32_window_size - 1, + 0x1000); } else { error_report("vfio: No available IOMMU models"); ret = -EINVAL; diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 405c3b29d6..b1f3e92405 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -82,9 +82,8 @@ typedef struct VFIOContainer { * contiguous IOVA window. We may need to generalize that in * future */ - hwaddr min_iova, max_iova; - uint64_t iova_pgsizes; QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; + QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; QLIST_HEAD(, VFIOGroup) group_list; QLIST_ENTRY(VFIOContainer) next; } VFIOContainer; @@ -97,6 +96,13 @@ typedef struct VFIOGuestIOMMU { QLIST_ENTRY(VFIOGuestIOMMU) giommu_next; } VFIOGuestIOMMU; +typedef struct VFIOHostDMAWindow { + hwaddr min_iova; + hwaddr max_iova; + uint64_t iova_pgsizes; + QLIST_ENTRY(VFIOHostDMAWindow) hostwin_next; +} VFIOHostDMAWindow; + typedef struct VFIODeviceOps VFIODeviceOps; typedef struct VFIODevice { -- cgit v1.2.3 From 2e4109de8e589beecd69996ee14f24021b991c0d Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 4 Jul 2016 13:33:06 +1000 Subject: vfio/spapr: Create DMA window dynamically (SPAPR IOMMU v2) The new VFIO_SPAPR_TCE_v2_IOMMU type supports dynamic DMA window management. This adds the ability for VFIO common code to dynamically allocate/remove DMA windows in the host kernel when a new VFIO container is added/removed. This adds a helper to vfio_listener_region_add which issues the VFIO_IOMMU_SPAPR_TCE_CREATE ioctl and adds the just-created window to the host window list; the opposite action is taken in vfio_listener_region_del. When creating a new window, this uses a heuristic to decide on the number of TCE table levels. This should cause no guest visible change in behavior.
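Stripped of the listener plumbing, window creation boils down to a single ioctl on the container fd. A hedged sketch (assumes a v2 container; error reporting elided):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Ask the host kernel for a new DMA window; on success the kernel
 * reports the bus address at which it placed the window. */
static int64_t create_window(int container_fd, uint64_t window_size,
                             unsigned page_shift, unsigned levels)
{
    struct vfio_iommu_spapr_tce_create create = {
        .argsz = sizeof(create),
        .window_size = window_size,   /* bytes covered by the window */
        .page_shift = page_shift,     /* IOMMU page size, log2 */
        .levels = levels,             /* TCE table indirection depth */
    };

    if (ioctl(container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create) != 0) {
        return -1;
    }
    return create.start_addr;         /* bus offset chosen by the host */
}

The real vfio_spapr_create_window() below additionally checks that the host placed the window at the bus address the guest expects, and backs the window out again if it did not.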
Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson [dwg: Added some casts to prevent printf() warnings on certain targets where the kernel headers' __u64 doesn't match uint64_t or PRIx64] Signed-off-by: David Gibson --- hw/vfio/common.c | 80 +++++++++++++++++++++++++++++++++++++------ hw/vfio/spapr.c | 71 ++++++++++++++++++++++++++++++++++++++ hw/vfio/trace-events | 2 ++ include/hw/vfio/vfio-common.h | 6 ++++ 4 files changed, 149 insertions(+), 10 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 7c8a5310ae..f3c0522e7e 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -265,6 +265,21 @@ static void vfio_host_win_add(VFIOContainer *container, QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next); } +static int vfio_host_win_del(VFIOContainer *container, hwaddr min_iova, + hwaddr max_iova) +{ + VFIOHostDMAWindow *hostwin; + + QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) { + QLIST_REMOVE(hostwin, hostwin_next); + return 0; + } + } + + return -1; +} + static bool vfio_listener_skipped_section(MemoryRegionSection *section) { return (!memory_region_is_ram(section->mr) && @@ -380,6 +395,31 @@ static void vfio_listener_region_add(MemoryListener *listener, } end = int128_get64(int128_sub(llend, int128_one())); + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { + VFIOHostDMAWindow *hostwin; + hwaddr pgsize = 0; + + /* For now intersections are not allowed, we may relax this later */ + QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + if (ranges_overlap(hostwin->min_iova, + hostwin->max_iova - hostwin->min_iova + 1, + section->offset_within_address_space, + int128_get64(section->size))) { + ret = -1; + goto fail; + } + } + + ret = vfio_spapr_create_window(container, section, &pgsize); + if (ret) { + goto fail; + } + + vfio_host_win_add(container, section->offset_within_address_space, + section->offset_within_address_space + + int128_get64(section->size) - 1, pgsize); + } + hostwin_found = false; QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { if (hostwin->min_iova <= iova && end <= hostwin->max_iova) { @@ -523,6 +563,18 @@ static void vfio_listener_region_del(MemoryListener *listener, "0x%"HWADDR_PRIx") = %d (%m)", container, iova, int128_get64(llsize), ret); } + + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { + vfio_spapr_remove_window(container, + section->offset_within_address_space); + if (vfio_host_win_del(container, + section->offset_within_address_space, + section->offset_within_address_space + + int128_get64(section->size) - 1) < 0) { + hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx, + __func__, section->offset_within_address_space); + } + } } static const MemoryListener vfio_memory_listener = { @@ -961,11 +1013,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) } } - /* - * This only considers the host IOMMU's 32-bit window. At - * some point we need to add support for the optional 64-bit - * window and dynamic windows - */ info.argsz = sizeof(info); ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info); if (ret) { @@ -977,11 +1024,24 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) goto free_container_exit; } - /* The default table uses 4K pages */ - vfio_host_win_add(container, info.dma32_window_start, - info.dma32_window_start + - info.dma32_window_size - 1, - 0x1000); + if (v2) { + /* + * There is a default window in just created container. 
+ * To make region_add/del simpler, we better remove this + * window now and let those iommu_listener callbacks + * create/remove them when needed. + */ + ret = vfio_spapr_remove_window(container, info.dma32_window_start); + if (ret) { + goto free_container_exit; + } + } else { + /* The default table uses 4K pages */ + vfio_host_win_add(container, info.dma32_window_start, + info.dma32_window_start + + info.dma32_window_size - 1, + 0x1000); + } } else { error_report("vfio: No available IOMMU models"); ret = -EINVAL; diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c index 7d64443b7f..0af342332c 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -137,3 +137,74 @@ const MemoryListener vfio_prereg_listener = { .region_add = vfio_prereg_listener_region_add, .region_del = vfio_prereg_listener_region_del, }; + +int vfio_spapr_create_window(VFIOContainer *container, + MemoryRegionSection *section, + hwaddr *pgsize) +{ + int ret; + unsigned pagesize = memory_region_iommu_get_min_page_size(section->mr); + unsigned entries, pages; + struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) }; + + /* + * FIXME: For VFIO iommu types which have KVM acceleration to + * avoid bouncing all map/unmaps through qemu this way, this + * would be the right place to wire that up (tell the KVM + * device emulation the VFIO iommu handles to use). + */ + create.window_size = int128_get64(section->size); + create.page_shift = ctz64(pagesize); + /* + * SPAPR host supports multilevel TCE tables, there is some + * heuristic to decide how many levels we want for our table: + * 0..64 = 1; 65..4096 = 2; 4097..262144 = 3; 262145.. = 4 + */ + entries = create.window_size >> create.page_shift; + pages = MAX((entries * sizeof(uint64_t)) / getpagesize(), 1); + pages = MAX(pow2ceil(pages) - 1, 1); /* Round up */ + create.levels = ctz64(pages) / 6 + 1; + + ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create); + if (ret) { + error_report("Failed to create a window, ret = %d (%m)", ret); + return -errno; + } + + if (create.start_addr != section->offset_within_address_space) { + vfio_spapr_remove_window(container, create.start_addr); + + error_report("Host doesn't support DMA window at %"HWADDR_PRIx", must be %"PRIx64, + section->offset_within_address_space, + (uint64_t)create.start_addr); + return -EINVAL; + } + trace_vfio_spapr_create_window(create.page_shift, + create.window_size, + create.start_addr); + *pgsize = pagesize; + + return 0; +} + +int vfio_spapr_remove_window(VFIOContainer *container, + hwaddr offset_within_address_space) +{ + struct vfio_iommu_spapr_tce_remove remove = { + .argsz = sizeof(remove), + .start_addr = offset_within_address_space, + }; + int ret; + + ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove); + if (ret) { + error_report("Failed to remove window at %"PRIx64, + (uint64_t)remove.start_addr); + return -errno; + } + + trace_vfio_spapr_remove_window(offset_within_address_space); + + return 0; +} diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 0b02a3bd54..4bb7690c46 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -121,3 +121,5 @@ vfio_prereg_listener_region_add_skip(uint64_t start, uint64_t end) "%"PRIx64" - vfio_prereg_listener_region_del_skip(uint64_t start, uint64_t end) "%"PRIx64" - %"PRIx64 vfio_prereg_register(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d" vfio_prereg_unregister(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d"
ret=%d" +vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x winsize=0x%"PRIx64" offset=0x%"PRIx64 +vfio_spapr_remove_window(uint64_t off) "offset=%"PRIx64 diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index b1f3e92405..07f7188df4 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -168,4 +168,10 @@ int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type, #endif extern const MemoryListener vfio_prereg_listener; +int vfio_spapr_create_window(VFIOContainer *container, + MemoryRegionSection *section, + hwaddr *pgsize); +int vfio_spapr_remove_window(VFIOContainer *container, + hwaddr offset_within_address_space); + #endif /* !HW_VFIO_VFIO_COMMON_H */ -- cgit v1.2.3 From ae4de14cd36b6a899d83df9595be3971ac0802d4 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 4 Jul 2016 13:33:07 +1000 Subject: spapr_pci/spapr_pci_vfio: Support Dynamic DMA Windows (DDW) This adds support for Dynamic DMA Windows (DDW) option defined by the SPAPR specification which allows to have additional DMA window(s) The "ddw" property is enabled by default on a PHB but for compatibility the pseries-2.6 machine and older disable it. This also creates a single DMA window for the older machines to maintain backward migration. This implements DDW for PHB with emulated and VFIO devices. The host kernel support is required. The advertised IOMMU page sizes are 4K and 64K; 16M pages are supported but not advertised by default, in order to enable them, the user has to specify "pgsz" property for PHB and enable huge pages for RAM. The existing linux guests try creating one additional huge DMA window with 64K or 16MB pages and map the entire guest RAM to. If succeeded, the guest switches to dma_direct_ops and never calls TCE hypercalls (H_PUT_TCE,...) again. This enables VFIO devices to use the entire RAM and not waste time on map/unmap later. This adds a "dma64_win_addr" property which is a bus address for the 64bit window and by default set to 0x800.0000.0000.0000 as this is what the modern POWER8 hardware uses and this allows having emulated and VFIO devices on the same bus. This adds 4 RTAS handlers: * ibm,query-pe-dma-window * ibm,create-pe-dma-window * ibm,remove-pe-dma-window * ibm,reset-pe-dma-window These are registered from type_init() callback. These RTAS handlers are implemented in a separate file to avoid polluting spapr_iommu.c with PCI. This changes sPAPRPHBState::dma_liobn to an array to allow 2 LIOBNs and updates all references to dma_liobn. However this does not add 64bit LIOBN to the migration stream as in fact even 32bit LIOBN is rather pointless there (as it is a PHB property and the management software can/should pass LIOBNs via CLI) but we keep it for the backward migration support. 
Signed-off-by: Alexey Kardashevskiy Signed-off-by: David Gibson --- hw/ppc/Makefile.objs | 1 + hw/ppc/spapr.c | 7 +- hw/ppc/spapr_pci.c | 75 ++++++++--- hw/ppc/spapr_rtas_ddw.c | 295 ++++++++++++++++++++++++++++++++++++++++++++ hw/ppc/trace-events | 4 + include/hw/pci-host/spapr.h | 8 +- include/hw/ppc/spapr.h | 16 ++- 7 files changed, 385 insertions(+), 21 deletions(-) create mode 100644 hw/ppc/spapr_rtas_ddw.c diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs index 5cc6608e50..91a3420f47 100644 --- a/hw/ppc/Makefile.objs +++ b/hw/ppc/Makefile.objs @@ -8,6 +8,7 @@ obj-$(CONFIG_PSERIES) += spapr_cpu_core.o ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy) obj-y += spapr_pci_vfio.o endif +obj-$(CONFIG_PSERIES) += spapr_rtas_ddw.o # PowerPC 4xx boards obj-y += ppc405_boards.o ppc4xx_devs.o ppc405_uc.o ppc440_bamboo.o obj-y += ppc4xx_pci.o diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 78ebd9ee38..9c1c2c1858 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -2489,7 +2489,12 @@ DEFINE_SPAPR_MACHINE(2_7, "2.7", true); * pseries-2.6 */ #define SPAPR_COMPAT_2_6 \ - HW_COMPAT_2_6 + HW_COMPAT_2_6 \ + { \ + .driver = TYPE_SPAPR_PCI_HOST_BRIDGE,\ + .property = "ddw",\ + .value = stringify(off),\ + }, static void spapr_machine_2_6_instance_options(MachineState *machine) { diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c index cbb7cdd774..949c44fec8 100644 --- a/hw/ppc/spapr_pci.c +++ b/hw/ppc/spapr_pci.c @@ -35,6 +35,7 @@ #include "hw/ppc/spapr.h" #include "hw/pci-host/spapr.h" #include "exec/address-spaces.h" +#include "exec/ram_addr.h" #include #include "trace.h" #include "qemu/error-report.h" @@ -45,6 +46,7 @@ #include "hw/ppc/spapr_drc.h" #include "sysemu/device_tree.h" #include "sysemu/kvm.h" +#include "sysemu/hostmem.h" #include "hw/vfio/vfio.h" @@ -1304,11 +1306,14 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp) PCIBus *bus; uint64_t msi_window_size = 4096; sPAPRTCETable *tcet; + const unsigned windows_supported = + sphb->ddw_enabled ? 
SPAPR_PCI_DMA_MAX_WINDOWS : 1; if (sphb->index != (uint32_t)-1) { hwaddr windows_base; - if ((sphb->buid != (uint64_t)-1) || (sphb->dma_liobn != (uint32_t)-1) + if ((sphb->buid != (uint64_t)-1) || (sphb->dma_liobn[0] != (uint32_t)-1) + || (sphb->dma_liobn[1] != (uint32_t)-1 && windows_supported == 2) || (sphb->mem_win_addr != (hwaddr)-1) || (sphb->io_win_addr != (hwaddr)-1)) { error_setg(errp, "Either \"index\" or other parameters must" @@ -1323,7 +1328,9 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp) } sphb->buid = SPAPR_PCI_BASE_BUID + sphb->index; - sphb->dma_liobn = SPAPR_PCI_LIOBN(sphb->index, 0); + for (i = 0; i < windows_supported; ++i) { + sphb->dma_liobn[i] = SPAPR_PCI_LIOBN(sphb->index, i); + } windows_base = SPAPR_PCI_WINDOW_BASE + sphb->index * SPAPR_PCI_WINDOW_SPACING; @@ -1336,8 +1343,9 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp) return; } - if (sphb->dma_liobn == (uint32_t)-1) { - error_setg(errp, "LIOBN not specified for PHB"); + if ((sphb->dma_liobn[0] == (uint32_t)-1) || + ((sphb->dma_liobn[1] == (uint32_t)-1) && (windows_supported > 1))) { + error_setg(errp, "LIOBN(s) not specified for PHB"); return; } @@ -1456,16 +1464,18 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp) } } - tcet = spapr_tce_new_table(DEVICE(sphb), sphb->dma_liobn); - if (!tcet) { - error_setg(errp, "Unable to create TCE table for %s", - sphb->dtbusname); - return; + /* DMA setup */ + for (i = 0; i < windows_supported; ++i) { + tcet = spapr_tce_new_table(DEVICE(sphb), sphb->dma_liobn[i]); + if (!tcet) { + error_setg(errp, "Creating window#%d failed for %s", + i, sphb->dtbusname); + return; + } + memory_region_add_subregion_overlap(&sphb->iommu_root, 0, + spapr_tce_get_iommu(tcet), 0); } - memory_region_add_subregion_overlap(&sphb->iommu_root, 0, - spapr_tce_get_iommu(tcet), 0); - sphb->msi = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, g_free); } @@ -1482,13 +1492,19 @@ static int spapr_phb_children_reset(Object *child, void *opaque) void spapr_phb_dma_reset(sPAPRPHBState *sphb) { - sPAPRTCETable *tcet = spapr_tce_find_by_liobn(sphb->dma_liobn); + int i; + sPAPRTCETable *tcet; + + for (i = 0; i < SPAPR_PCI_DMA_MAX_WINDOWS; ++i) { + tcet = spapr_tce_find_by_liobn(sphb->dma_liobn[i]); - if (tcet && tcet->nb_table) { - spapr_tce_table_disable(tcet); + if (tcet && tcet->nb_table) { + spapr_tce_table_disable(tcet); + } } /* Register default 32bit DMA window */ + tcet = spapr_tce_find_by_liobn(sphb->dma_liobn[0]); spapr_tce_table_enable(tcet, SPAPR_TCE_PAGE_SHIFT, sphb->dma_win_addr, sphb->dma_win_size >> SPAPR_TCE_PAGE_SHIFT); } @@ -1510,7 +1526,8 @@ static void spapr_phb_reset(DeviceState *qdev) static Property spapr_phb_properties[] = { DEFINE_PROP_UINT32("index", sPAPRPHBState, index, -1), DEFINE_PROP_UINT64("buid", sPAPRPHBState, buid, -1), - DEFINE_PROP_UINT32("liobn", sPAPRPHBState, dma_liobn, -1), + DEFINE_PROP_UINT32("liobn", sPAPRPHBState, dma_liobn[0], -1), + DEFINE_PROP_UINT32("liobn64", sPAPRPHBState, dma_liobn[1], -1), DEFINE_PROP_UINT64("mem_win_addr", sPAPRPHBState, mem_win_addr, -1), DEFINE_PROP_UINT64("mem_win_size", sPAPRPHBState, mem_win_size, SPAPR_PCI_MMIO_WIN_SIZE), @@ -1522,6 +1539,11 @@ static Property spapr_phb_properties[] = { /* Default DMA window is 0..1GB */ DEFINE_PROP_UINT64("dma_win_addr", sPAPRPHBState, dma_win_addr, 0), DEFINE_PROP_UINT64("dma_win_size", sPAPRPHBState, dma_win_size, 0x40000000), + DEFINE_PROP_UINT64("dma64_win_addr", sPAPRPHBState, dma64_win_addr, + 0x800000000000000ULL), + 
DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true), + DEFINE_PROP_UINT64("pgsz", sPAPRPHBState, page_size_mask, + (1ULL << 12) | (1ULL << 16)), DEFINE_PROP_END_OF_LIST(), }; @@ -1598,7 +1620,7 @@ static const VMStateDescription vmstate_spapr_pci = { .post_load = spapr_pci_post_load, .fields = (VMStateField[]) { VMSTATE_UINT64_EQUAL(buid, sPAPRPHBState), - VMSTATE_UINT32_EQUAL(dma_liobn, sPAPRPHBState), + VMSTATE_UINT32_EQUAL(dma_liobn[0], sPAPRPHBState), VMSTATE_UINT64_EQUAL(mem_win_addr, sPAPRPHBState), VMSTATE_UINT64_EQUAL(mem_win_size, sPAPRPHBState), VMSTATE_UINT64_EQUAL(io_win_addr, sPAPRPHBState), @@ -1774,6 +1796,15 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, uint32_t interrupt_map_mask[] = { cpu_to_be32(b_ddddd(-1)|b_fff(0)), 0x0, 0x0, cpu_to_be32(-1)}; uint32_t interrupt_map[PCI_SLOT_MAX * PCI_NUM_PINS][7]; + uint32_t ddw_applicable[] = { + cpu_to_be32(RTAS_IBM_QUERY_PE_DMA_WINDOW), + cpu_to_be32(RTAS_IBM_CREATE_PE_DMA_WINDOW), + cpu_to_be32(RTAS_IBM_REMOVE_PE_DMA_WINDOW) + }; + uint32_t ddw_extensions[] = { + cpu_to_be32(1), + cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW) + }; sPAPRTCETable *tcet; PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus; sPAPRFDT s_fdt; @@ -1798,6 +1829,14 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pci-config-space-type", 0x1)); _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pe-total-#msi", XICS_IRQS_SPAPR)); + /* Dynamic DMA window */ + if (phb->ddw_enabled) { + _FDT(fdt_setprop(fdt, bus_off, "ibm,ddw-applicable", &ddw_applicable, + sizeof(ddw_applicable))); + _FDT(fdt_setprop(fdt, bus_off, "ibm,ddw-extensions", + &ddw_extensions, sizeof(ddw_extensions))); + } + /* Build the interrupt-map, this must matches what is done * in pci_spapr_map_irq */ @@ -1821,7 +1860,7 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, _FDT(fdt_setprop(fdt, bus_off, "interrupt-map", &interrupt_map, sizeof(interrupt_map))); - tcet = spapr_tce_find_by_liobn(phb->dma_liobn); + tcet = spapr_tce_find_by_liobn(phb->dma_liobn[0]); if (!tcet) { return -1; } diff --git a/hw/ppc/spapr_rtas_ddw.c b/hw/ppc/spapr_rtas_ddw.c new file mode 100644 index 0000000000..177dcffc9b --- /dev/null +++ b/hw/ppc/spapr_rtas_ddw.c @@ -0,0 +1,295 @@ +/* + * QEMU sPAPR Dynamic DMA windows support + * + * Copyright (c) 2015 Alexey Kardashevskiy, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include "qemu/error-report.h" +#include "hw/ppc/spapr.h" +#include "hw/pci-host/spapr.h" +#include "trace.h" + +static int spapr_phb_get_active_win_num_cb(Object *child, void *opaque) +{ + sPAPRTCETable *tcet; + + tcet = (sPAPRTCETable *) object_dynamic_cast(child, TYPE_SPAPR_TCE_TABLE); + if (tcet && tcet->nb_table) { + ++*(unsigned *)opaque; + } + return 0; +} + +static unsigned spapr_phb_get_active_win_num(sPAPRPHBState *sphb) +{ + unsigned ret = 0; + + object_child_foreach(OBJECT(sphb), spapr_phb_get_active_win_num_cb, &ret); + + return ret; +} + +static int spapr_phb_get_free_liobn_cb(Object *child, void *opaque) +{ + sPAPRTCETable *tcet; + + tcet = (sPAPRTCETable *) object_dynamic_cast(child, TYPE_SPAPR_TCE_TABLE); + if (tcet && !tcet->nb_table) { + *(uint32_t *)opaque = tcet->liobn; + return 1; + } + return 0; +} + +static unsigned spapr_phb_get_free_liobn(sPAPRPHBState *sphb) +{ + uint32_t liobn = 0; + + object_child_foreach(OBJECT(sphb), spapr_phb_get_free_liobn_cb, &liobn); + + return liobn; +} + +static uint32_t spapr_page_mask_to_query_mask(uint64_t page_mask) +{ + int i; + uint32_t mask = 0; + const struct { int shift; uint32_t mask; } masks[] = { + { 12, RTAS_DDW_PGSIZE_4K }, + { 16, RTAS_DDW_PGSIZE_64K }, + { 24, RTAS_DDW_PGSIZE_16M }, + { 25, RTAS_DDW_PGSIZE_32M }, + { 26, RTAS_DDW_PGSIZE_64M }, + { 27, RTAS_DDW_PGSIZE_128M }, + { 28, RTAS_DDW_PGSIZE_256M }, + { 34, RTAS_DDW_PGSIZE_16G }, + }; + + for (i = 0; i < ARRAY_SIZE(masks); ++i) { + if (page_mask & (1ULL << masks[i].shift)) { + mask |= masks[i].mask; + } + } + + return mask; +} + +static void rtas_ibm_query_pe_dma_window(PowerPCCPU *cpu, + sPAPRMachineState *spapr, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ + sPAPRPHBState *sphb; + uint64_t buid, max_window_size; + uint32_t avail, addr, pgmask = 0; + MachineState *machine = MACHINE(spapr); + + if ((nargs != 3) || (nret != 5)) { + goto param_error_exit; + } + + buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2); + addr = rtas_ld(args, 0); + sphb = spapr_pci_find_phb(spapr, buid); + if (!sphb || !sphb->ddw_enabled) { + goto param_error_exit; + } + + /* Translate page mask to LoPAPR format */ + pgmask = spapr_page_mask_to_query_mask(sphb->page_size_mask); + + /* + * This is "Largest contiguous block of TCEs allocated specifically + * for (that is, are reserved for) this PE". + * Return the maximum number as maximum supported RAM size was in 4K pages. 
+ */ + if (machine->ram_size == machine->maxram_size) { + max_window_size = machine->ram_size; + } else { + MemoryHotplugState *hpms = &spapr->hotplug_memory; + + max_window_size = hpms->base + memory_region_size(&hpms->mr); + } + + avail = SPAPR_PCI_DMA_MAX_WINDOWS - spapr_phb_get_active_win_num(sphb); + + rtas_st(rets, 0, RTAS_OUT_SUCCESS); + rtas_st(rets, 1, avail); + rtas_st(rets, 2, max_window_size >> SPAPR_TCE_PAGE_SHIFT); + rtas_st(rets, 3, pgmask); + rtas_st(rets, 4, 0); /* DMA migration mask, not supported */ + + trace_spapr_iommu_ddw_query(buid, addr, avail, max_window_size, pgmask); + return; + +param_error_exit: + rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR); +} + +static void rtas_ibm_create_pe_dma_window(PowerPCCPU *cpu, + sPAPRMachineState *spapr, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ + sPAPRPHBState *sphb; + sPAPRTCETable *tcet = NULL; + uint32_t addr, page_shift, window_shift, liobn; + uint64_t buid, win_addr; + int windows; + + if ((nargs != 5) || (nret != 4)) { + goto param_error_exit; + } + + buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2); + addr = rtas_ld(args, 0); + sphb = spapr_pci_find_phb(spapr, buid); + if (!sphb || !sphb->ddw_enabled) { + goto param_error_exit; + } + + page_shift = rtas_ld(args, 3); + window_shift = rtas_ld(args, 4); + liobn = spapr_phb_get_free_liobn(sphb); + windows = spapr_phb_get_active_win_num(sphb); + + if (!(sphb->page_size_mask & (1ULL << page_shift)) || + (window_shift < page_shift)) { + goto param_error_exit; + } + + if (!liobn || !sphb->ddw_enabled || windows == SPAPR_PCI_DMA_MAX_WINDOWS) { + goto hw_error_exit; + } + + tcet = spapr_tce_find_by_liobn(liobn); + if (!tcet) { + goto hw_error_exit; + } + + win_addr = (windows == 0) ? sphb->dma_win_addr : sphb->dma64_win_addr; + spapr_tce_table_enable(tcet, page_shift, win_addr, + 1ULL << (window_shift - page_shift)); + if (!tcet->nb_table) { + goto hw_error_exit; + } + + trace_spapr_iommu_ddw_create(buid, addr, 1ULL << page_shift, + 1ULL << window_shift, tcet->bus_offset, liobn); + + rtas_st(rets, 0, RTAS_OUT_SUCCESS); + rtas_st(rets, 1, liobn); + rtas_st(rets, 2, tcet->bus_offset >> 32); + rtas_st(rets, 3, tcet->bus_offset & ((uint32_t) -1)); + + return; + +hw_error_exit: + rtas_st(rets, 0, RTAS_OUT_HW_ERROR); + return; + +param_error_exit: + rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR); +} + +static void rtas_ibm_remove_pe_dma_window(PowerPCCPU *cpu, + sPAPRMachineState *spapr, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ + sPAPRPHBState *sphb; + sPAPRTCETable *tcet; + uint32_t liobn; + + if ((nargs != 1) || (nret != 1)) { + goto param_error_exit; + } + + liobn = rtas_ld(args, 0); + tcet = spapr_tce_find_by_liobn(liobn); + if (!tcet) { + goto param_error_exit; + } + + sphb = SPAPR_PCI_HOST_BRIDGE(OBJECT(tcet)->parent); + if (!sphb || !sphb->ddw_enabled || !tcet->nb_table) { + goto param_error_exit; + } + + spapr_tce_table_disable(tcet); + trace_spapr_iommu_ddw_remove(liobn); + + rtas_st(rets, 0, RTAS_OUT_SUCCESS); + return; + +param_error_exit: + rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR); +} + +static void rtas_ibm_reset_pe_dma_window(PowerPCCPU *cpu, + sPAPRMachineState *spapr, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ + sPAPRPHBState *sphb; + uint64_t buid; + uint32_t addr; + + if ((nargs != 3) || (nret != 1)) { + goto param_error_exit; + } + + buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2); + addr = rtas_ld(args, 0); 
+ sphb = spapr_pci_find_phb(spapr, buid); + if (!sphb || !sphb->ddw_enabled) { + goto param_error_exit; + } + + spapr_phb_dma_reset(sphb); + trace_spapr_iommu_ddw_reset(buid, addr); + + rtas_st(rets, 0, RTAS_OUT_SUCCESS); + + return; + +param_error_exit: + rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR); +} + +static void spapr_rtas_ddw_init(void) +{ + spapr_rtas_register(RTAS_IBM_QUERY_PE_DMA_WINDOW, + "ibm,query-pe-dma-window", + rtas_ibm_query_pe_dma_window); + spapr_rtas_register(RTAS_IBM_CREATE_PE_DMA_WINDOW, + "ibm,create-pe-dma-window", + rtas_ibm_create_pe_dma_window); + spapr_rtas_register(RTAS_IBM_REMOVE_PE_DMA_WINDOW, + "ibm,remove-pe-dma-window", + rtas_ibm_remove_pe_dma_window); + spapr_rtas_register(RTAS_IBM_RESET_PE_DMA_WINDOW, + "ibm,reset-pe-dma-window", + rtas_ibm_reset_pe_dma_window); +} + +type_init(spapr_rtas_ddw_init) diff --git a/hw/ppc/trace-events b/hw/ppc/trace-events index 6da713547f..900679bc9d 100644 --- a/hw/ppc/trace-events +++ b/hw/ppc/trace-events @@ -30,6 +30,10 @@ spapr_iommu_xlate(uint64_t liobn, uint64_t ioba, uint64_t tce, unsigned perm, un spapr_iommu_new_table(uint64_t liobn, void *table, int fd) "liobn=%"PRIx64" table=%p fd=%d" spapr_iommu_pre_save(uint64_t liobn, uint32_t nb, uint64_t offs, uint32_t ps) "liobn=%"PRIx64" %"PRIx32" bus_offset=%"PRIx64" ps=%"PRIu32 spapr_iommu_post_load(uint64_t liobn, uint32_t pre_nb, uint32_t post_nb, uint64_t offs, uint32_t ps) "liobn=%"PRIx64" %"PRIx32" => %"PRIx32" bus_offset=%"PRIx64" ps=%"PRIu32 +spapr_iommu_ddw_query(uint64_t buid, uint32_t cfgaddr, unsigned wa, uint64_t win_size, uint32_t pgmask) "buid=%"PRIx64" addr=%"PRIx32", %u windows available, max window size=%"PRIx64", mask=%"PRIx32 +spapr_iommu_ddw_create(uint64_t buid, uint32_t cfgaddr, uint64_t pg_size, uint64_t req_size, uint64_t start, uint32_t liobn) "buid=%"PRIx64" addr=%"PRIx32", page size=0x%"PRIx64", requested=0x%"PRIx64", start addr=%"PRIx64", liobn=%"PRIx32 +spapr_iommu_ddw_remove(uint32_t liobn) "liobn=%"PRIx32 +spapr_iommu_ddw_reset(uint64_t buid, uint32_t cfgaddr) "buid=%"PRIx64" addr=%"PRIx32 # hw/ppc/ppc.c ppc_tb_adjust(uint64_t offs1, uint64_t offs2, int64_t diff, int64_t seconds) "adjusted from 0x%"PRIx64" to 0x%"PRIx64", diff %"PRId64" (%"PRId64"s)" diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h index 288b89c04a..193631d2dc 100644 --- a/include/hw/pci-host/spapr.h +++ b/include/hw/pci-host/spapr.h @@ -32,6 +32,8 @@ #define SPAPR_PCI_HOST_BRIDGE(obj) \ OBJECT_CHECK(sPAPRPHBState, (obj), TYPE_SPAPR_PCI_HOST_BRIDGE) +#define SPAPR_PCI_DMA_MAX_WINDOWS 2 + typedef struct sPAPRPHBState sPAPRPHBState; typedef struct spapr_pci_msi { @@ -56,7 +58,7 @@ struct sPAPRPHBState { hwaddr mem_win_addr, mem_win_size, io_win_addr, io_win_size; MemoryRegion memwindow, iowindow, msiwindow; - uint32_t dma_liobn; + uint32_t dma_liobn[SPAPR_PCI_DMA_MAX_WINDOWS]; hwaddr dma_win_addr, dma_win_size; AddressSpace iommu_as; MemoryRegion iommu_root; @@ -71,6 +73,10 @@ struct sPAPRPHBState { spapr_pci_msi_mig *msi_devs; QLIST_ENTRY(sPAPRPHBState) list; + + bool ddw_enabled; + uint64_t page_size_mask; + uint64_t dma64_win_addr; }; #define SPAPR_PCI_MAX_INDEX 255 diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h index 49325555d3..2e2dd14c30 100644 --- a/include/hw/ppc/spapr.h +++ b/include/hw/ppc/spapr.h @@ -416,6 +416,16 @@ int spapr_allocate_irq_block(int num, bool lsi, bool msi); #define RTAS_OUT_NOT_AUTHORIZED -9002 #define RTAS_OUT_SYSPARM_PARAM_ERROR -9999 +/* DDW pagesize mask values from ibm,query-pe-dma-window */ +#define 
RTAS_DDW_PGSIZE_4K 0x01 +#define RTAS_DDW_PGSIZE_64K 0x02 +#define RTAS_DDW_PGSIZE_16M 0x04 +#define RTAS_DDW_PGSIZE_32M 0x08 +#define RTAS_DDW_PGSIZE_64M 0x10 +#define RTAS_DDW_PGSIZE_128M 0x20 +#define RTAS_DDW_PGSIZE_256M 0x40 +#define RTAS_DDW_PGSIZE_16G 0x80 + /* RTAS tokens */ #define RTAS_TOKEN_BASE 0x2000 @@ -457,8 +467,12 @@ int spapr_allocate_irq_block(int num, bool lsi, bool msi); #define RTAS_IBM_SET_SLOT_RESET (RTAS_TOKEN_BASE + 0x23) #define RTAS_IBM_CONFIGURE_PE (RTAS_TOKEN_BASE + 0x24) #define RTAS_IBM_SLOT_ERROR_DETAIL (RTAS_TOKEN_BASE + 0x25) +#define RTAS_IBM_QUERY_PE_DMA_WINDOW (RTAS_TOKEN_BASE + 0x26) +#define RTAS_IBM_CREATE_PE_DMA_WINDOW (RTAS_TOKEN_BASE + 0x27) +#define RTAS_IBM_REMOVE_PE_DMA_WINDOW (RTAS_TOKEN_BASE + 0x28) +#define RTAS_IBM_RESET_PE_DMA_WINDOW (RTAS_TOKEN_BASE + 0x29) -#define RTAS_TOKEN_MAX (RTAS_TOKEN_BASE + 0x26) +#define RTAS_TOKEN_MAX (RTAS_TOKEN_BASE + 0x2A) /* RTAS ibm,get-system-parameter token values */ #define RTAS_SYSPARM_SPLPAR_CHARACTERISTICS 20 -- cgit v1.2.3 From 1f0252e66e76f0b5967419e2a1e53a1f1398bf7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Fri, 1 Jul 2016 09:10:10 +0200 Subject: ppc: simplify ppc_hash64_hpte_page_shift_noslb() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The segment page shift parameter is never used. Let's remove it. Signed-off-by: Cédric Le Goater Signed-off-by: David Gibson --- hw/ppc/spapr_hcall.c | 4 ++-- target-ppc/mmu-hash64.c | 6 +----- target-ppc/mmu-hash64.h | 3 +-- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c index e011ed4b66..73af112e1d 100644 --- a/hw/ppc/spapr_hcall.c +++ b/hw/ppc/spapr_hcall.c @@ -83,12 +83,12 @@ static target_ulong h_enter(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong pte_index = args[1]; target_ulong pteh = args[2]; target_ulong ptel = args[3]; - unsigned apshift, spshift; + unsigned apshift; target_ulong raddr; target_ulong index; uint64_t token; - apshift = ppc_hash64_hpte_page_shift_noslb(cpu, pteh, ptel, &spshift); + apshift = ppc_hash64_hpte_page_shift_noslb(cpu, pteh, ptel); if (!apshift) { /* Bad page size encoding */ return H_PARAMETER; diff --git a/target-ppc/mmu-hash64.c b/target-ppc/mmu-hash64.c index 3b1357a648..a15ef83f7d 100644 --- a/target-ppc/mmu-hash64.c +++ b/target-ppc/mmu-hash64.c @@ -610,14 +610,12 @@ static unsigned hpte_page_shift(const struct ppc_one_seg_page_size *sps, } unsigned ppc_hash64_hpte_page_shift_noslb(PowerPCCPU *cpu, - uint64_t pte0, uint64_t pte1, - unsigned *seg_page_shift) + uint64_t pte0, uint64_t pte1) { CPUPPCState *env = &cpu->env; int i; if (!(pte0 & HPTE64_V_LARGE)) { - *seg_page_shift = 12; return 12; } @@ -635,12 +633,10 @@ unsigned ppc_hash64_hpte_page_shift_noslb(PowerPCCPU *cpu, shift = hpte_page_shift(sps, pte0, pte1); if (shift) { - *seg_page_shift = sps->page_shift; return shift; } } - *seg_page_shift = 0; return 0; } diff --git a/target-ppc/mmu-hash64.h b/target-ppc/mmu-hash64.h index 6423b9f791..7fcb555b47 100644 --- a/target-ppc/mmu-hash64.h +++ b/target-ppc/mmu-hash64.h @@ -17,8 +17,7 @@ void ppc_hash64_tlb_flush_hpte(PowerPCCPU *cpu, target_ulong pte_index, target_ulong pte0, target_ulong pte1); unsigned ppc_hash64_hpte_page_shift_noslb(PowerPCCPU *cpu, - uint64_t pte0, uint64_t pte1, - unsigned *seg_page_shift); + uint64_t pte0, uint64_t pte1); #endif /* -- cgit v1.2.3 From 651060aba79dc9d0cc77ac3921948ea78dba7409 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 5 Jul 
2016 12:17:56 +1000 Subject: target-ppc: Correct page size decoding in ppc_hash64_pteg_search() The architecture specifies that when searching a PTEG for PTEs, entries with a page size encoding that's not valid for the current segment should be ignored, continuing the search. The current implementation does this with ppc_hash64_pte_size_decode() which is a very incomplete implementation of this check. We already have code to do a full and correct page size decode in hpte_page_shift(). This patch moves hpte_page_shift() so it can be used in ppc_hash64_pteg_search() and adjusts the latter's parameters to include a full SLBE instead of just a segment page shift. Signed-off-by: David Gibson Reviewed-by: Benjamin Herrenschmidt --- target-ppc/mmu-hash64.c | 99 ++++++++++++++++++++----------------------------- 1 file changed, 41 insertions(+), 58 deletions(-) diff --git a/target-ppc/mmu-hash64.c b/target-ppc/mmu-hash64.c index a15ef83f7d..6d3428ef7d 100644 --- a/target-ppc/mmu-hash64.c +++ b/target-ppc/mmu-hash64.c @@ -450,30 +450,45 @@ void ppc_hash64_stop_access(PowerPCCPU *cpu, uint64_t token) } } -/* Returns the effective page shift or 0. MPSS isn't supported yet so - * this will always be the slb_pshift or 0 - */ -static uint32_t ppc_hash64_pte_size_decode(uint64_t pte1, uint32_t slb_pshift) +static unsigned hpte_page_shift(const struct ppc_one_seg_page_size *sps, + uint64_t pte0, uint64_t pte1) { - switch (slb_pshift) { - case 12: + int i; + + if (!(pte0 & HPTE64_V_LARGE)) { + if (sps->page_shift != 12) { + /* 4kiB page in a non 4kiB segment */ + return 0; + } + /* Normal 4kiB page */ return 12; - case 16: - if ((pte1 & 0xf000) == 0x1000) { - return 16; + } + + for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) { + const struct ppc_one_page_size *ps = &sps->enc[i]; + uint64_t mask; + + if (!ps->page_shift) { + break; } - return 0; - case 24: - if ((pte1 & 0xff000) == 0) { - return 24; + + if (ps->page_shift == 12) { + /* L bit is set so this can't be a 4kiB page */ + continue; + } + + mask = ((1ULL << ps->page_shift) - 1) & HPTE64_R_RPN; + + if ((pte1 & mask) == (ps->pte_enc << HPTE64_R_RPN_SHIFT)) { + return ps->page_shift; } - return 0; } - return 0; + + return 0; /* Bad page size encoding */ } static hwaddr ppc_hash64_pteg_search(PowerPCCPU *cpu, hwaddr hash, - uint32_t slb_pshift, bool secondary, + ppc_slb_t *slb, bool secondary, target_ulong ptem, ppc_hash_pte64_t *pte) { CPUPPCState *env = &cpu->env; @@ -494,7 +509,14 @@ static hwaddr ppc_hash64_pteg_search(PowerPCCPU *cpu, hwaddr hash, if ((pte0 & HPTE64_V_VALID) && (secondary == !!(pte0 & HPTE64_V_SECONDARY)) && HPTE64_V_COMPARE(pte0, ptem)) { - uint32_t pshift = ppc_hash64_pte_size_decode(pte1, slb_pshift); + unsigned pshift = hpte_page_shift(slb->sps, pte0, pte1); + /* + * If there is no match, ignore the PTE, it could simply + * be for a different segment size encoding and the + * architecture specifies we should not match. Linux will + * potentially leave behind PTEs for the wrong base page + * size when demoting segments. 
+ */ if (pshift == 0) { continue; } @@ -554,8 +576,7 @@ static hwaddr ppc_hash64_htab_lookup(PowerPCCPU *cpu, " vsid=" TARGET_FMT_lx " ptem=" TARGET_FMT_lx " hash=" TARGET_FMT_plx "\n", env->htab_base, env->htab_mask, vsid, ptem, hash); - pte_offset = ppc_hash64_pteg_search(cpu, hash, slb->sps->page_shift, - 0, ptem, pte); + pte_offset = ppc_hash64_pteg_search(cpu, hash, slb, 0, ptem, pte); if (pte_offset == -1) { /* Secondary PTEG lookup */ @@ -565,50 +586,12 @@ static hwaddr ppc_hash64_htab_lookup(PowerPCCPU *cpu, " hash=" TARGET_FMT_plx "\n", env->htab_base, env->htab_mask, vsid, ptem, ~hash); - pte_offset = ppc_hash64_pteg_search(cpu, ~hash, slb->sps->page_shift, 1, - ptem, pte); + pte_offset = ppc_hash64_pteg_search(cpu, ~hash, slb, 1, ptem, pte); } return pte_offset; } -static unsigned hpte_page_shift(const struct ppc_one_seg_page_size *sps, - uint64_t pte0, uint64_t pte1) -{ - int i; - - if (!(pte0 & HPTE64_V_LARGE)) { - if (sps->page_shift != 12) { - /* 4kiB page in a non 4kiB segment */ - return 0; - } - /* Normal 4kiB page */ - return 12; - } - - for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) { - const struct ppc_one_page_size *ps = &sps->enc[i]; - uint64_t mask; - - if (!ps->page_shift) { - break; - } - - if (ps->page_shift == 12) { - /* L bit is set so this can't be a 4kiB page */ - continue; - } - - mask = ((1ULL << ps->page_shift) - 1) & HPTE64_R_RPN; - - if ((pte1 & mask) == (ps->pte_enc << HPTE64_R_RPN_SHIFT)) { - return ps->page_shift; - } - } - - return 0; /* Bad page size encoding */ -} - unsigned ppc_hash64_hpte_page_shift_noslb(PowerPCCPU *cpu, uint64_t pte0, uint64_t pte1) { -- cgit v1.2.3 From 073de86aa934d46d596a2367e7501da5500e5b86 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 5 Jul 2016 12:31:48 +1000 Subject: target-ppc: Simplify HPTE matching ppc_hash64_pteg_search() explicitly checks each HPTE's VALID and SECONDARY bits, then uses the HPTE64_V_COMPARE() macro to check the B field and AVPN. However, a small tweak to HPTE64_V_COMPARE() means we can check all of these bits at once with a suitable ptem value. So, consolidate all the comparisons for simplicity. 
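For illustration, a minimal standalone sketch of the consolidated compare; the macro and mask below are the ones this patch introduces in mmu-hash64.h, while hpte_match() and the sample values are hypothetical:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define HPTE64_V_SECONDARY 0x0000000000000002ULL
#define HPTE64_V_VALID     0x0000000000000001ULL
/* The widened mask covers the AVPN and B field plus the H and V bits */
#define HPTE64_V_COMPARE(x, y) (!(((x) ^ (y)) & 0xffffffffffffff83ULL))

/* Build the search key once; each candidate PTE then needs one compare */
static bool hpte_match(uint64_t pte0, uint64_t ptem, bool secondary)
{
    ptem |= HPTE64_V_VALID;
    if (secondary) {
        ptem |= HPTE64_V_SECONDARY;
    }
    return HPTE64_V_COMPARE(pte0, ptem);
}

int main(void)
{
    uint64_t pte0 = 0x0123456789abcd81ULL; /* valid, primary hash */
    uint64_t ptem = 0x0123456789abcd80ULL; /* B | AVPN of the lookup */

    printf("primary match: %d\n", hpte_match(pte0, ptem, false));  /* 1 */
    printf("secondary match: %d\n", hpte_match(pte0, ptem, true)); /* 0 */
    return 0;
}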
Signed-off-by: David Gibson Reviewed-by: Benjamin Herrenschmidt --- target-ppc/mmu-hash64.c | 15 ++++++++------- target-ppc/mmu-hash64.h | 2 +- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/target-ppc/mmu-hash64.c b/target-ppc/mmu-hash64.c index 6d3428ef7d..07d324930c 100644 --- a/target-ppc/mmu-hash64.c +++ b/target-ppc/mmu-hash64.c @@ -488,8 +488,8 @@ static unsigned hpte_page_shift(const struct ppc_one_seg_page_size *sps, } static hwaddr ppc_hash64_pteg_search(PowerPCCPU *cpu, hwaddr hash, - ppc_slb_t *slb, bool secondary, - target_ulong ptem, ppc_hash_pte64_t *pte) + ppc_slb_t *slb, target_ulong ptem, + ppc_hash_pte64_t *pte) { CPUPPCState *env = &cpu->env; int i; @@ -506,9 +506,8 @@ static hwaddr ppc_hash64_pteg_search(PowerPCCPU *cpu, hwaddr hash, pte0 = ppc_hash64_load_hpte0(cpu, token, i); pte1 = ppc_hash64_load_hpte1(cpu, token, i); - if ((pte0 & HPTE64_V_VALID) - && (secondary == !!(pte0 & HPTE64_V_SECONDARY)) - && HPTE64_V_COMPARE(pte0, ptem)) { + /* This compares V, B, H (secondary) and the AVPN */ + if (HPTE64_V_COMPARE(pte0, ptem)) { unsigned pshift = hpte_page_shift(slb->sps, pte0, pte1); /* * If there is no match, ignore the PTE, it could simply * be for a different segment size encoding and the @@ -563,6 +562,7 @@ static hwaddr ppc_hash64_htab_lookup(PowerPCCPU *cpu, hash = vsid ^ (epn >> slb->sps->page_shift); } ptem = (slb->vsid & SLB_VSID_PTEM) | ((epn >> 16) & HPTE64_V_AVPN); + ptem |= HPTE64_V_VALID; /* Page address translation */ qemu_log_mask(CPU_LOG_MMU, @@ -576,17 +576,18 @@ static hwaddr ppc_hash64_htab_lookup(PowerPCCPU *cpu, " vsid=" TARGET_FMT_lx " ptem=" TARGET_FMT_lx " hash=" TARGET_FMT_plx "\n", env->htab_base, env->htab_mask, vsid, ptem, hash); - pte_offset = ppc_hash64_pteg_search(cpu, hash, slb, 0, ptem, pte); + pte_offset = ppc_hash64_pteg_search(cpu, hash, slb, ptem, pte); if (pte_offset == -1) { /* Secondary PTEG lookup */ + ptem |= HPTE64_V_SECONDARY; qemu_log_mask(CPU_LOG_MMU, "1 htab=" TARGET_FMT_plx "/" TARGET_FMT_plx " vsid=" TARGET_FMT_lx " api=" TARGET_FMT_lx " hash=" TARGET_FMT_plx "\n", env->htab_base, env->htab_mask, vsid, ptem, ~hash); - pte_offset = ppc_hash64_pteg_search(cpu, ~hash, slb, 1, ptem, pte); + pte_offset = ppc_hash64_pteg_search(cpu, ~hash, slb, ptem, pte); } return pte_offset; diff --git a/target-ppc/mmu-hash64.h b/target-ppc/mmu-hash64.h index 7fcb555b47..154a306997 100644 --- a/target-ppc/mmu-hash64.h +++ b/target-ppc/mmu-hash64.h @@ -62,7 +62,7 @@ unsigned ppc_hash64_hpte_page_shift_noslb(PowerPCCPU *cpu, #define HPTE64_V_AVPN_SHIFT 7 #define HPTE64_V_AVPN 0x3fffffffffffff80ULL #define HPTE64_V_AVPN_VAL(x) (((x) & HPTE64_V_AVPN) >> HPTE64_V_AVPN_SHIFT) -#define HPTE64_V_COMPARE(x, y) (!(((x) ^ (y)) & 0xffffffffffffff80ULL)) +#define HPTE64_V_COMPARE(x, y) (!(((x) ^ (y)) & 0xffffffffffffff83ULL)) #define HPTE64_V_LARGE 0x0000000000000004ULL #define HPTE64_V_SECONDARY 0x0000000000000002ULL #define HPTE64_V_VALID 0x0000000000000001ULL -- cgit v1.2.3 From 949868633f0454715af1781c0f377413b6ab000e Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 5 Jul 2016 12:31:57 +1000 Subject: target-ppc: Return page shift from PTEG search ppc_hash64_pteg_search() now decodes a PTE's page size encoding, which it didn't previously do. This means we're now double decoding the page size, because we check it in the fault path after ppc_hash64_htab_lookup() returns. To avoid this duplication, have ppc_hash64_pteg_search() and ppc_hash64_htab_lookup() return the page size from the PTE and use that in the callers instead of decoding again.
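As a sketch of the out-parameter pattern this introduces (names and values below are hypothetical; only the shape matches the patch):

#include <stdint.h>
#include <stdio.h>

/* Stand-in for ppc_hash64_htab_lookup(): the page shift the search
 * already decoded travels back through *pshift, so the fault path
 * no longer re-decodes pte1. Returns -1 when nothing matches. */
static int64_t htab_lookup(uint64_t eaddr, unsigned *pshift)
{
    (void)eaddr;
    *pshift = 16;  /* e.g. the search matched a 64K page */
    return 0x40;   /* offset of the matching PTE */
}

int main(void)
{
    unsigned apshift;
    int64_t pte_offset = htab_lookup(0x1000ULL, &apshift);

    if (pte_offset != -1) {
        /* apshift is already valid; no second hpte_page_shift() call */
        printf("PTE at 0x%llx, page shift %u\n",
               (unsigned long long)pte_offset, apshift);
    }
    return 0;
}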
Signed-off-by: David Gibson Reviewed-by: Benjamin Herrenschmidt --- target-ppc/mmu-hash64.c | 33 ++++++++------------------------- 1 file changed, 8 insertions(+), 25 deletions(-) diff --git a/target-ppc/mmu-hash64.c b/target-ppc/mmu-hash64.c index 07d324930c..7c1b169676 100644 --- a/target-ppc/mmu-hash64.c +++ b/target-ppc/mmu-hash64.c @@ -489,7 +489,7 @@ static unsigned hpte_page_shift(const struct ppc_one_seg_page_size *sps, static hwaddr ppc_hash64_pteg_search(PowerPCCPU *cpu, hwaddr hash, ppc_slb_t *slb, target_ulong ptem, - ppc_hash_pte64_t *pte) + ppc_hash_pte64_t *pte, unsigned *pshift) { CPUPPCState *env = &cpu->env; int i; @@ -508,7 +508,7 @@ static hwaddr ppc_hash64_pteg_search(PowerPCCPU *cpu, hwaddr hash, /* This compares V, B, H (secondary) and the AVPN */ if (HPTE64_V_COMPARE(pte0, ptem)) { - unsigned pshift = hpte_page_shift(slb->sps, pte0, pte1); + *pshift = hpte_page_shift(slb->sps, pte0, pte1); /* * If there is no match, ignore the PTE, it could simply * be for a different segment size encoding and the @@ -516,7 +516,7 @@ static hwaddr ppc_hash64_pteg_search(PowerPCCPU *cpu, hwaddr hash, * potentially leave behind PTEs for the wrong base page * size when demoting segments. */ - if (pshift == 0) { + if (*pshift == 0) { continue; } /* We don't do anything with pshift yet as qemu TLB only deals @@ -537,7 +537,7 @@ static hwaddr ppc_hash64_pteg_search(PowerPCCPU *cpu, hwaddr hash, static hwaddr ppc_hash64_htab_lookup(PowerPCCPU *cpu, ppc_slb_t *slb, target_ulong eaddr, - ppc_hash_pte64_t *pte) + ppc_hash_pte64_t *pte, unsigned *pshift) { CPUPPCState *env = &cpu->env; hwaddr pte_offset; @@ -576,7 +576,7 @@ static hwaddr ppc_hash64_htab_lookup(PowerPCCPU *cpu, " vsid=" TARGET_FMT_lx " ptem=" TARGET_FMT_lx " hash=" TARGET_FMT_plx "\n", env->htab_base, env->htab_mask, vsid, ptem, hash); - pte_offset = ppc_hash64_pteg_search(cpu, hash, slb, ptem, pte); + pte_offset = ppc_hash64_pteg_search(cpu, hash, slb, ptem, pte, pshift); if (pte_offset == -1) { /* Secondary PTEG lookup */ @@ -587,7 +587,7 @@ static hwaddr ppc_hash64_htab_lookup(PowerPCCPU *cpu, " hash=" TARGET_FMT_plx "\n", env->htab_base, env->htab_mask, vsid, ptem, ~hash); - pte_offset = ppc_hash64_pteg_search(cpu, ~hash, slb, ptem, pte); + pte_offset = ppc_hash64_pteg_search(cpu, ~hash, slb, ptem, pte, pshift); } return pte_offset; @@ -714,7 +714,7 @@ int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, } /* 4. Locate the PTE in the hash table */ - pte_offset = ppc_hash64_htab_lookup(cpu, slb, eaddr, &pte); + pte_offset = ppc_hash64_htab_lookup(cpu, slb, eaddr, &pte, &apshift); if (pte_offset == -1) { dsisr = 0x40000000; if (rwx == 2) { @@ -730,18 +730,6 @@ int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, qemu_log_mask(CPU_LOG_MMU, "found PTE at offset %08" HWADDR_PRIx "\n", pte_offset); - /* Validate page size encoding */ - apshift = hpte_page_shift(slb->sps, pte.pte0, pte.pte1); - if (!apshift) { - error_report("Bad page size encoding in HPTE 0x%"PRIx64" - 0x%"PRIx64 - " @ 0x%"HWADDR_PRIx, pte.pte0, pte.pte1, pte_offset); - /* Not entirely sure what the right action here, but machine - * check seems reasonable */ - cs->exception_index = POWERPC_EXCP_MCHECK; - env->error_code = 0; - return 1; - } - /* 5. 
Check access permissions */ pp_prot = ppc_hash64_pte_prot(cpu, slb, pte); @@ -815,16 +803,11 @@ hwaddr ppc_hash64_get_phys_page_debug(PowerPCCPU *cpu, target_ulong addr) return -1; } - pte_offset = ppc_hash64_htab_lookup(cpu, slb, addr, &pte); + pte_offset = ppc_hash64_htab_lookup(cpu, slb, addr, &pte, &apshift); if (pte_offset == -1) { return -1; } - apshift = hpte_page_shift(slb->sps, pte.pte0, pte.pte1); - if (!apshift) { - return -1; - } - return deposit64(pte.pte1 & HPTE64_R_RPN, 0, apshift, addr) & TARGET_PAGE_MASK; } -- cgit v1.2.3 From 912acdf487a3c8c0083b904fdb917fe6d79f87a7 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 5 Jul 2016 07:37:08 +1000 Subject: ppc/hash64: Add proper real mode translation support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds proper support for translating real mode addresses based on the combination of HV and LPCR bits. This handles HRMOR offset for hypervisor real mode, and both RMA and VRMA modes for guest real mode. PAPR mode adjusts the offsets appropriately to match the RMA used in TCG, but we need to limit to the max supported by the implementation (16G). This includes some fixes by Cédric Le Goater Signed-off-by: Benjamin Herrenschmidt [dwg: Adjusted for differences in my version of the prereq patches] Signed-off-by: David Gibson --- hw/ppc/spapr.c | 7 ++ target-ppc/cpu.h | 2 + target-ppc/mmu-hash64.c | 165 +++++++++++++++++++++++++++++++++++++++++--- target-ppc/mmu-hash64.h | 3 + target-ppc/translate_init.c | 14 +++- 5 files changed, 181 insertions(+), 10 deletions(-) diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 9c1c2c1858..7f33a1b2b5 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -1771,6 +1771,13 @@ static void ppc_spapr_init(MachineState *machine) spapr->vrma_adjust = 1; spapr->rma_size = MIN(spapr->rma_size, 0x10000000); } + + /* Actually we don't support unbounded RMA anymore since we + * added proper emulation of HV mode. The max we can get is + * 16G which also happens to be what we configure for PAPR + * mode so make sure we don't do anything bigger than that + */ + spapr->rma_size = MIN(spapr->rma_size, 0x400000000ull); } if (spapr->rma_size > node0_size) { diff --git a/target-ppc/cpu.h b/target-ppc/cpu.h index af73bced9f..2666a3f80d 100644 --- a/target-ppc/cpu.h +++ b/target-ppc/cpu.h @@ -1047,6 +1047,8 @@ struct CPUPPCState { uint64_t insns_flags2; #if defined(TARGET_PPC64) struct ppc_segment_page_sizes sps; + ppc_slb_t vrma_slb; + target_ulong rmls; bool ci_large_pages; #endif diff --git a/target-ppc/mmu-hash64.c b/target-ppc/mmu-hash64.c index 7c1b169676..7f314442ca 100644 --- a/target-ppc/mmu-hash64.c +++ b/target-ppc/mmu-hash64.c @@ -681,11 +681,52 @@ int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, assert((rwx == 0) || (rwx == 1) || (rwx == 2)); + /* Note on LPCR usage: 970 uses HID4, but our special variant + * of store_spr copies relevant fields into env->spr[SPR_LPCR]. + * Similarly we filter unimplemented bits when storing into + * LPCR depending on the MMU version. This code can thus just + * use the LPCR "as-is". + */ + /* 1. 
Handle real mode accesses */ if (((rwx == 2) && (msr_ir == 0)) || ((rwx != 2) && (msr_dr == 0))) { - /* Translation is off */ - /* In real mode the top 4 effective address bits are ignored */ + /* Translation is supposedly "off" */ + /* In real mode the top 4 effective address bits are (mostly) ignored */ raddr = eaddr & 0x0FFFFFFFFFFFFFFFULL; + + /* In HV mode, add HRMOR if top EA bit is clear */ + if (msr_hv || !env->has_hv_mode) { + if (!(eaddr >> 63)) { + raddr |= env->spr[SPR_HRMOR]; + } + } else { + /* Otherwise, check VPM for RMA vs VRMA */ + if (env->spr[SPR_LPCR] & LPCR_VPM0) { + slb = &env->vrma_slb; + if (slb->sps) { + goto skip_slb_search; + } + /* Not much else to do here */ + cs->exception_index = POWERPC_EXCP_MCHECK; + env->error_code = 0; + return 1; + } else if (raddr < env->rmls) { + /* RMA. Check bounds in RMLS */ + raddr |= env->spr[SPR_RMOR]; + } else { + /* The access failed, generate the appropriate interrupt */ + if (rwx == 2) { + ppc_hash64_set_isi(cs, env, 0x08000000); + } else { + dsisr = 0x08000000; + if (rwx == 1) { + dsisr |= 0x02000000; + } + ppc_hash64_set_dsi(cs, env, eaddr, dsisr); + } + return 1; + } + } tlb_set_page(cs, eaddr & TARGET_PAGE_MASK, raddr & TARGET_PAGE_MASK, PAGE_READ | PAGE_WRITE | PAGE_EXEC, mmu_idx, TARGET_PAGE_SIZE); @@ -694,7 +735,6 @@ int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, /* 2. Translation is on, so look up the SLB */ slb = slb_lookup(cpu, eaddr); - if (!slb) { if (rwx == 2) { cs->exception_index = POWERPC_EXCP_ISEG; @@ -707,6 +747,8 @@ int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, return 1; } +skip_slb_search: + /* 3. Check for segment level no-execute violation */ if ((rwx == 2) && (slb->vsid & SLB_VSID_N)) { ppc_hash64_set_isi(cs, env, 0x10000000); @@ -789,18 +831,37 @@ hwaddr ppc_hash64_get_phys_page_debug(PowerPCCPU *cpu, target_ulong addr) { CPUPPCState *env = &cpu->env; ppc_slb_t *slb; - hwaddr pte_offset; + hwaddr pte_offset, raddr; ppc_hash_pte64_t pte; unsigned apshift; + /* Handle real mode */ if (msr_dr == 0) { /* In real mode the top 4 effective address bits are ignored */ - return addr & 0x0FFFFFFFFFFFFFFFULL; - } + raddr = addr & 0x0FFFFFFFFFFFFFFFULL; - slb = slb_lookup(cpu, addr); - if (!slb) { - return -1; + /* In HV mode, add HRMOR if top EA bit is clear */ + if ((msr_hv || !env->has_hv_mode) && !(addr >> 63)) { + return raddr | env->spr[SPR_HRMOR]; + } + + /* Otherwise, check VPM for RMA vs VRMA */ + if (env->spr[SPR_LPCR] & LPCR_VPM0) { + slb = &env->vrma_slb; + if (!slb->sps) { + return -1; + } + } else if (raddr < env->rmls) { + /* RMA. Check bounds in RMLS */ + return raddr | env->spr[SPR_RMOR]; + } else { + return -1; + } + } else { + slb = slb_lookup(cpu, addr); + if (!slb) { + return -1; + } } pte_offset = ppc_hash64_htab_lookup(cpu, slb, addr, &pte, &apshift); @@ -846,6 +907,90 @@ void ppc_hash64_tlb_flush_hpte(PowerPCCPU *cpu, tlb_flush(CPU(cpu), 1); } +void ppc_hash64_update_rmls(CPUPPCState *env) +{ + uint64_t lpcr = env->spr[SPR_LPCR]; + + /* + * This is the full 4 bits encoding of POWER8. 
Previous + * CPUs only support a subset of these but the filtering + * is done when writing LPCR + */ + switch ((lpcr & LPCR_RMLS) >> LPCR_RMLS_SHIFT) { + case 0x8: /* 32MB */ + env->rmls = 0x2000000ull; + break; + case 0x3: /* 64MB */ + env->rmls = 0x4000000ull; + break; + case 0x7: /* 128MB */ + env->rmls = 0x8000000ull; + break; + case 0x4: /* 256MB */ + env->rmls = 0x10000000ull; + break; + case 0x2: /* 1GB */ + env->rmls = 0x40000000ull; + break; + case 0x1: /* 16GB */ + env->rmls = 0x400000000ull; + break; + default: + /* What to do here ??? */ + env->rmls = 0; + } +} + +void ppc_hash64_update_vrma(CPUPPCState *env) +{ + const struct ppc_one_seg_page_size *sps = NULL; + target_ulong esid, vsid, lpcr; + ppc_slb_t *slb = &env->vrma_slb; + uint32_t vrmasd; + int i; + + /* First clear it */ + slb->esid = slb->vsid = 0; + slb->sps = NULL; + + /* Is VRMA enabled ? */ + lpcr = env->spr[SPR_LPCR]; + if (!(lpcr & LPCR_VPM0)) { + return; + } + + /* Make one up. Mostly ignore the ESID which will not be + * needed for translation + */ + vsid = SLB_VSID_VRMA; + vrmasd = (lpcr & LPCR_VRMASD) >> LPCR_VRMASD_SHIFT; + vsid |= (vrmasd << 4) & (SLB_VSID_L | SLB_VSID_LP); + esid = SLB_ESID_V; + + for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) { + const struct ppc_one_seg_page_size *sps1 = &env->sps.sps[i]; + + if (!sps1->page_shift) { + break; + } + + if ((vsid & SLB_VSID_LLP_MASK) == sps1->slb_enc) { + sps = sps1; + break; + } + } + + if (!sps) { + error_report("Bad page size encoding esid 0x"TARGET_FMT_lx + " vsid 0x"TARGET_FMT_lx, esid, vsid); + return; + } + + slb->vsid = vsid; + slb->esid = esid; + slb->sps = sps; +} + void helper_store_lpcr(CPUPPCState *env, target_ulong val) { uint64_t lpcr = 0; @@ -901,4 +1046,6 @@ void helper_store_lpcr(CPUPPCState *env, target_ulong val) ; } env->spr[SPR_LPCR] = lpcr; + ppc_hash64_update_rmls(env); + ppc_hash64_update_vrma(env); } diff --git a/target-ppc/mmu-hash64.h b/target-ppc/mmu-hash64.h index 154a306997..3a7476b30a 100644 --- a/target-ppc/mmu-hash64.h +++ b/target-ppc/mmu-hash64.h @@ -18,6 +18,8 @@ void ppc_hash64_tlb_flush_hpte(PowerPCCPU *cpu, target_ulong pte0, target_ulong pte1); unsigned ppc_hash64_hpte_page_shift_noslb(PowerPCCPU *cpu, uint64_t pte0, uint64_t pte1); +void ppc_hash64_update_vrma(CPUPPCState *env); +void ppc_hash64_update_rmls(CPUPPCState *env); #endif /* @@ -36,6 +38,7 @@ unsigned ppc_hash64_hpte_page_shift_noslb(PowerPCCPU *cpu, #define SLB_VSID_B_256M 0x0000000000000000ULL #define SLB_VSID_B_1T 0x4000000000000000ULL #define SLB_VSID_VSID 0x3FFFFFFFFFFFF000ULL +#define SLB_VSID_VRMA (0x0001FFFFFF000000ULL | SLB_VSID_B_1T) #define SLB_VSID_PTEM (SLB_VSID_B | SLB_VSID_VSID) #define SLB_VSID_KS 0x0000000000000800ULL #define SLB_VSID_KP 0x0000000000000400ULL diff --git a/target-ppc/translate_init.c b/target-ppc/translate_init.c index a06bf50b65..8f257fb74a 100644 --- a/target-ppc/translate_init.c +++ b/target-ppc/translate_init.c @@ -8791,11 +8791,19 @@ void cpu_ppc_set_papr(PowerPCCPU *cpu) /* Set emulated LPCR to not send interrupts to hypervisor. Note that * under KVM, the actual HW LPCR will be set differently by KVM itself, * the settings below ensure proper operations with TCG in absence of - * a real hypervisor + * a real hypervisor. + * + * Clearing VPM0 will also cause us to use RMOR in mmu-hash64.c for + * real mode accesses, which thankfully defaults to 0 and isn't + * accessible in guest mode. 
*/ lpcr->default_value &= ~(LPCR_VPM0 | LPCR_VPM1 | LPCR_ISL | LPCR_KBV); lpcr->default_value |= LPCR_LPES0 | LPCR_LPES1; + /* Set RMLS to the max (ie, 16G) */ + lpcr->default_value &= ~LPCR_RMLS; + lpcr->default_value |= 1ull << LPCR_RMLS_SHIFT; + /* P7 and P8 has slightly different PECE bits, mostly because P8 adds * bit 47 and 48 which are reserved on P7. Here we set them all, which * will work as expected for both implementations @@ -8811,6 +8819,10 @@ void cpu_ppc_set_papr(PowerPCCPU *cpu) /* Set a full AMOR so guest can use the AMR as it sees fit */ env->spr[SPR_AMOR] = amor->default_value = 0xffffffffffffffffull; + /* Update some env bits based on new LPCR value */ + ppc_hash64_update_rmls(env); + ppc_hash64_update_vrma(env); + /* Tell KVM that we're in PAPR mode */ if (kvm_enabled()) { kvmppc_set_papr(cpu); -- cgit v1.2.3 From 2c7ad80443e9747eb85b508be01cded958191bad Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 4 Jul 2016 17:44:11 +1000 Subject: ppc/hash64: Fix support for LPCR:ISL We need to ignore the segment page size and essentially treat all pages as coming from a 4K segment. Signed-off-by: Benjamin Herrenschmidt [dwg: Adjusted for differences in my version of the prereq patches] Signed-off-by: David Gibson --- target-ppc/mmu-hash64.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/target-ppc/mmu-hash64.c b/target-ppc/mmu-hash64.c index 7f314442ca..82c2186bcf 100644 --- a/target-ppc/mmu-hash64.c +++ b/target-ppc/mmu-hash64.c @@ -488,7 +488,8 @@ static unsigned hpte_page_shift(const struct ppc_one_seg_page_size *sps, } static hwaddr ppc_hash64_pteg_search(PowerPCCPU *cpu, hwaddr hash, - ppc_slb_t *slb, target_ulong ptem, + const struct ppc_one_seg_page_size *sps, + target_ulong ptem, ppc_hash_pte64_t *pte, unsigned *pshift) { CPUPPCState *env = &cpu->env; @@ -508,7 +509,7 @@ static hwaddr ppc_hash64_pteg_search(PowerPCCPU *cpu, hwaddr hash, /* This compares V, B, H (secondary) and the AVPN */ if (HPTE64_V_COMPARE(pte0, ptem)) { - *pshift = hpte_page_shift(slb->sps, pte0, pte1); + *pshift = hpte_page_shift(sps, pte0, pte1); /* * If there is no match, ignore the PTE, it could simply * be for a different segment size encoding and the @@ -543,23 +544,31 @@ static hwaddr ppc_hash64_htab_lookup(PowerPCCPU *cpu, hwaddr pte_offset; hwaddr hash; uint64_t vsid, epnmask, epn, ptem; + const struct ppc_one_seg_page_size *sps = slb->sps; /* The SLB store path should prevent any bad page size encodings * getting in there, so: */ - assert(slb->sps); + assert(sps); - epnmask = ~((1ULL << slb->sps->page_shift) - 1); + /* If ISL is set in LPCR we need to clamp the page size to 4K */ + if (env->spr[SPR_LPCR] & LPCR_ISL) { + /* We assume that when using TCG, 4k is first entry of SPS */ + sps = &env->sps.sps[0]; + assert(sps->page_shift == 12); + } + + epnmask = ~((1ULL << sps->page_shift) - 1); if (slb->vsid & SLB_VSID_B) { /* 1TB segment */ vsid = (slb->vsid & SLB_VSID_VSID) >> SLB_VSID_SHIFT_1T; epn = (eaddr & ~SEGMENT_MASK_1T) & epnmask; - hash = vsid ^ (vsid << 25) ^ (epn >> slb->sps->page_shift); + hash = vsid ^ (vsid << 25) ^ (epn >> sps->page_shift); } else { /* 256M segment */ vsid = (slb->vsid & SLB_VSID_VSID) >> SLB_VSID_SHIFT; epn = (eaddr & ~SEGMENT_MASK_256M) & epnmask; - hash = vsid ^ (epn >> slb->sps->page_shift); + hash = vsid ^ (epn >> sps->page_shift); } ptem = (slb->vsid & SLB_VSID_PTEM) | ((epn >> 16) & HPTE64_V_AVPN); ptem |= HPTE64_V_VALID; @@ -576,7 +585,7 @@ static hwaddr ppc_hash64_htab_lookup(PowerPCCPU *cpu, 
" vsid=" TARGET_FMT_lx " ptem=" TARGET_FMT_lx " hash=" TARGET_FMT_plx "\n", env->htab_base, env->htab_mask, vsid, ptem, hash); - pte_offset = ppc_hash64_pteg_search(cpu, hash, slb, ptem, pte, pshift); + pte_offset = ppc_hash64_pteg_search(cpu, hash, sps, ptem, pte, pshift); if (pte_offset == -1) { /* Secondary PTEG lookup */ @@ -587,7 +596,7 @@ static hwaddr ppc_hash64_htab_lookup(PowerPCCPU *cpu, " hash=" TARGET_FMT_plx "\n", env->htab_base, env->htab_mask, vsid, ptem, ~hash); - pte_offset = ppc_hash64_pteg_search(cpu, ~hash, slb, ptem, pte, pshift); + pte_offset = ppc_hash64_pteg_search(cpu, ~hash, sps, ptem, pte, pshift); } return pte_offset; -- cgit v1.2.3