diff options
author | Peter Maydell <peter.maydell@linaro.org> | 2016-07-05 11:14:27 +0100 |
---|---|---|
committer | Peter Maydell <peter.maydell@linaro.org> | 2016-07-05 11:14:27 +0100 |
commit | 8662d7db392f906c7808014051b278ad1542db93 (patch) | |
tree | ff0bced1fc218f98a01813144f43ce6de905d486 | |
parent | 11659423113d2fbcf055085b5e8285d590addfaa (diff) | |
parent | 2c7ad80443e9747eb85b508be01cded958191bad (diff) |
Merge remote-tracking branch 'remotes/dgibson/tags/ppc-for-2.7-20160705' into staging
ppc patch queue for 2016-07-05
Here's the current ppc, sPAPR and related drivers patch queue.
* The big addition is dynamic DMA window support (this includes some
core VFIO changes)
* There are also several fixes to the MMU emulation for bugs
introduced with the HV mode patches
* Several other bugfixes and cleanups
Changes in v2:
I messed up and forgot to make a fix in the last patch which BenH
pointed out (introduced by my rebasing). That's fixed in this
version, and I'm replacing the tag in place with the revised
version.
# gpg: Signature made Tue 05 Jul 2016 06:28:58 BST
# gpg: using RSA key 0x6C38CACA20D9B392
# gpg: Good signature from "David Gibson <david@gibson.dropbear.id.au>"
# gpg: aka "David Gibson (Red Hat) <dgibson@redhat.com>"
# gpg: aka "David Gibson (ozlabs.org) <dgibson@ozlabs.org>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg: It is not certain that the signature belongs to the owner.
# Primary key fingerprint: 75F4 6586 AE61 A66C C44E 87DC 6C38 CACA 20D9 B392
* remotes/dgibson/tags/ppc-for-2.7-20160705:
ppc/hash64: Fix support for LPCR:ISL
ppc/hash64: Add proper real mode translation support
target-ppc: Return page shift from PTEG search
target-ppc: Simplify HPTE matching
target-ppc: Correct page size decoding in ppc_hash64_pteg_search()
ppc: simplify ppc_hash64_hpte_page_shift_noslb()
spapr_pci/spapr_pci_vfio: Support Dynamic DMA Windows (DDW)
vfio/spapr: Create DMA window dynamically (SPAPR IOMMU v2)
vfio: Add host side DMA window capabilities
vfio: spapr: Add DMA memory preregistering (SPAPR IOMMU v2)
spapr_iommu: Realloc guest visible TCE table when starting/stopping listening
ppc: simplify max_smt initialization in ppc_cpu_realizefn()
spapr: Ensure thread0 of CPU core is always realized first
ppc: Fix xsrdpi, xvrdpi and xvrspi rounding
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
-rw-r--r-- | hw/ppc/Makefile.objs | 1 | ||||
-rw-r--r-- | hw/ppc/spapr.c | 14 | ||||
-rw-r--r-- | hw/ppc/spapr_cpu_core.c | 29 | ||||
-rw-r--r-- | hw/ppc/spapr_hcall.c | 4 | ||||
-rw-r--r-- | hw/ppc/spapr_iommu.c | 12 | ||||
-rw-r--r-- | hw/ppc/spapr_pci.c | 81 | ||||
-rw-r--r-- | hw/ppc/spapr_rtas_ddw.c | 295 | ||||
-rw-r--r-- | hw/ppc/trace-events | 4 | ||||
-rw-r--r-- | hw/vfio/Makefile.objs | 1 | ||||
-rw-r--r-- | hw/vfio/common.c | 170 | ||||
-rw-r--r-- | hw/vfio/spapr.c | 210 | ||||
-rw-r--r-- | hw/vfio/trace-events | 8 | ||||
-rw-r--r-- | include/hw/pci-host/spapr.h | 8 | ||||
-rw-r--r-- | include/hw/ppc/spapr.h | 16 | ||||
-rw-r--r-- | include/hw/vfio/vfio-common.h | 20 | ||||
-rw-r--r-- | target-ppc/cpu.h | 2 | ||||
-rw-r--r-- | target-ppc/fpu_helper.c | 6 | ||||
-rw-r--r-- | target-ppc/mmu-hash64.c | 321 | ||||
-rw-r--r-- | target-ppc/mmu-hash64.h | 8 | ||||
-rw-r--r-- | target-ppc/translate_init.c | 16 |
20 files changed, 1043 insertions, 183 deletions
diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs index 5cc6608e50..91a3420f47 100644 --- a/hw/ppc/Makefile.objs +++ b/hw/ppc/Makefile.objs @@ -8,6 +8,7 @@ obj-$(CONFIG_PSERIES) += spapr_cpu_core.o ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy) obj-y += spapr_pci_vfio.o endif +obj-$(CONFIG_PSERIES) += spapr_rtas_ddw.o # PowerPC 4xx boards obj-y += ppc405_boards.o ppc4xx_devs.o ppc405_uc.o ppc440_bamboo.o obj-y += ppc4xx_pci.o diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 78ebd9ee38..7f33a1b2b5 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -1771,6 +1771,13 @@ static void ppc_spapr_init(MachineState *machine) spapr->vrma_adjust = 1; spapr->rma_size = MIN(spapr->rma_size, 0x10000000); } + + /* Actually we don't support unbounded RMA anymore since we + * added proper emulation of HV mode. The max we can get is + * 16G which also happens to be what we configure for PAPR + * mode so make sure we don't do anything bigger than that + */ + spapr->rma_size = MIN(spapr->rma_size, 0x400000000ull); } if (spapr->rma_size > node0_size) { @@ -2489,7 +2496,12 @@ DEFINE_SPAPR_MACHINE(2_7, "2.7", true); * pseries-2.6 */ #define SPAPR_COMPAT_2_6 \ - HW_COMPAT_2_6 + HW_COMPAT_2_6 \ + { \ + .driver = TYPE_SPAPR_PCI_HOST_BRIDGE,\ + .property = "ddw",\ + .value = stringify(off),\ + }, static void spapr_machine_2_6_instance_options(MachineState *machine) { diff --git a/hw/ppc/spapr_cpu_core.c b/hw/ppc/spapr_cpu_core.c index a384db5204..70b6b0b5ee 100644 --- a/hw/ppc/spapr_cpu_core.c +++ b/hw/ppc/spapr_cpu_core.c @@ -259,9 +259,9 @@ out: error_propagate(errp, local_err); } -static int spapr_cpu_core_realize_child(Object *child, void *opaque) +static void spapr_cpu_core_realize_child(Object *child, Error **errp) { - Error **errp = opaque, *local_err = NULL; + Error *local_err = NULL; sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); CPUState *cs = CPU(child); PowerPCCPU *cpu = POWERPC_CPU(cs); @@ -269,15 +269,14 @@ static int spapr_cpu_core_realize_child(Object *child, void *opaque) object_property_set_bool(child, true, "realized", &local_err); if (local_err) { error_propagate(errp, local_err); - return 1; + return; } spapr_cpu_init(spapr, cpu, &local_err); if (local_err) { error_propagate(errp, local_err); - return 1; + return; } - return 0; } static void spapr_cpu_core_realize(DeviceState *dev, Error **errp) @@ -287,13 +286,13 @@ static void spapr_cpu_core_realize(DeviceState *dev, Error **errp) const char *typename = object_class_get_name(sc->cpu_class); size_t size = object_type_get_instance_size(typename); Error *local_err = NULL; - Object *obj; - int i; + void *obj; + int i, j; sc->threads = g_malloc0(size * cc->nr_threads); for (i = 0; i < cc->nr_threads; i++) { char id[32]; - void *obj = sc->threads + i * size; + obj = sc->threads + i * size; object_initialize(obj, size, typename); snprintf(id, sizeof(id), "thread[%d]", i); @@ -303,12 +302,16 @@ static void spapr_cpu_core_realize(DeviceState *dev, Error **errp) } object_unref(obj); } - object_child_foreach(OBJECT(dev), spapr_cpu_core_realize_child, &local_err); - if (local_err) { - goto err; - } else { - return; + + for (j = 0; j < cc->nr_threads; j++) { + obj = sc->threads + j * size; + + spapr_cpu_core_realize_child(obj, &local_err); + if (local_err) { + goto err; + } } + return; err: while (--i >= 0) { diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c index e011ed4b66..73af112e1d 100644 --- a/hw/ppc/spapr_hcall.c +++ b/hw/ppc/spapr_hcall.c @@ -83,12 +83,12 @@ static target_ulong h_enter(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong pte_index = args[1]; target_ulong pteh = args[2]; target_ulong ptel = args[3]; - unsigned apshift, spshift; + unsigned apshift; target_ulong raddr; target_ulong index; uint64_t token; - apshift = ppc_hash64_hpte_page_shift_noslb(cpu, pteh, ptel, &spshift); + apshift = ppc_hash64_hpte_page_shift_noslb(cpu, pteh, ptel); if (!apshift) { /* Bad page size encoding */ return H_PARAMETER; diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c index e230bacae1..d57b05d5c0 100644 --- a/hw/ppc/spapr_iommu.c +++ b/hw/ppc/spapr_iommu.c @@ -156,6 +156,16 @@ static uint64_t spapr_tce_get_min_page_size(MemoryRegion *iommu) return 1ULL << tcet->page_shift; } +static void spapr_tce_notify_started(MemoryRegion *iommu) +{ + spapr_tce_set_need_vfio(container_of(iommu, sPAPRTCETable, iommu), true); +} + +static void spapr_tce_notify_stopped(MemoryRegion *iommu) +{ + spapr_tce_set_need_vfio(container_of(iommu, sPAPRTCETable, iommu), false); +} + static int spapr_tce_table_post_load(void *opaque, int version_id) { sPAPRTCETable *tcet = SPAPR_TCE_TABLE(opaque); @@ -236,6 +246,8 @@ static const VMStateDescription vmstate_spapr_tce_table = { static MemoryRegionIOMMUOps spapr_iommu_ops = { .translate = spapr_tce_translate_iommu, .get_min_page_size = spapr_tce_get_min_page_size, + .notify_started = spapr_tce_notify_started, + .notify_stopped = spapr_tce_notify_stopped, }; static int spapr_tce_table_realize(DeviceState *dev) diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c index 8c1e6b17c3..949c44fec8 100644 --- a/hw/ppc/spapr_pci.c +++ b/hw/ppc/spapr_pci.c @@ -35,6 +35,7 @@ #include "hw/ppc/spapr.h" #include "hw/pci-host/spapr.h" #include "exec/address-spaces.h" +#include "exec/ram_addr.h" #include <libfdt.h> #include "trace.h" #include "qemu/error-report.h" @@ -45,6 +46,7 @@ #include "hw/ppc/spapr_drc.h" #include "sysemu/device_tree.h" #include "sysemu/kvm.h" +#include "sysemu/hostmem.h" #include "hw/vfio/vfio.h" @@ -1087,12 +1089,6 @@ static void spapr_phb_add_pci_device(sPAPRDRConnector *drc, void *fdt = NULL; int fdt_start_offset = 0, fdt_size; - if (object_dynamic_cast(OBJECT(pdev), "vfio-pci")) { - sPAPRTCETable *tcet = spapr_tce_find_by_liobn(phb->dma_liobn); - - spapr_tce_set_need_vfio(tcet, true); - } - fdt = create_device_tree(&fdt_size); fdt_start_offset = spapr_create_pci_child_dt(phb, pdev, fdt, 0); if (!fdt_start_offset) { @@ -1310,11 +1306,14 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp) PCIBus *bus; uint64_t msi_window_size = 4096; sPAPRTCETable *tcet; + const unsigned windows_supported = + sphb->ddw_enabled ? SPAPR_PCI_DMA_MAX_WINDOWS : 1; if (sphb->index != (uint32_t)-1) { hwaddr windows_base; - if ((sphb->buid != (uint64_t)-1) || (sphb->dma_liobn != (uint32_t)-1) + if ((sphb->buid != (uint64_t)-1) || (sphb->dma_liobn[0] != (uint32_t)-1) + || (sphb->dma_liobn[1] != (uint32_t)-1 && windows_supported == 2) || (sphb->mem_win_addr != (hwaddr)-1) || (sphb->io_win_addr != (hwaddr)-1)) { error_setg(errp, "Either \"index\" or other parameters must" @@ -1329,7 +1328,9 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp) } sphb->buid = SPAPR_PCI_BASE_BUID + sphb->index; - sphb->dma_liobn = SPAPR_PCI_LIOBN(sphb->index, 0); + for (i = 0; i < windows_supported; ++i) { + sphb->dma_liobn[i] = SPAPR_PCI_LIOBN(sphb->index, i); + } windows_base = SPAPR_PCI_WINDOW_BASE + sphb->index * SPAPR_PCI_WINDOW_SPACING; @@ -1342,8 +1343,9 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp) return; } - if (sphb->dma_liobn == (uint32_t)-1) { - error_setg(errp, "LIOBN not specified for PHB"); + if ((sphb->dma_liobn[0] == (uint32_t)-1) || + ((sphb->dma_liobn[1] == (uint32_t)-1) && (windows_supported > 1))) { + error_setg(errp, "LIOBN(s) not specified for PHB"); return; } @@ -1462,16 +1464,18 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp) } } - tcet = spapr_tce_new_table(DEVICE(sphb), sphb->dma_liobn); - if (!tcet) { - error_setg(errp, "Unable to create TCE table for %s", - sphb->dtbusname); - return; + /* DMA setup */ + for (i = 0; i < windows_supported; ++i) { + tcet = spapr_tce_new_table(DEVICE(sphb), sphb->dma_liobn[i]); + if (!tcet) { + error_setg(errp, "Creating window#%d failed for %s", + i, sphb->dtbusname); + return; + } + memory_region_add_subregion_overlap(&sphb->iommu_root, 0, + spapr_tce_get_iommu(tcet), 0); } - memory_region_add_subregion_overlap(&sphb->iommu_root, 0, - spapr_tce_get_iommu(tcet), 0); - sphb->msi = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, g_free); } @@ -1488,13 +1492,19 @@ static int spapr_phb_children_reset(Object *child, void *opaque) void spapr_phb_dma_reset(sPAPRPHBState *sphb) { - sPAPRTCETable *tcet = spapr_tce_find_by_liobn(sphb->dma_liobn); + int i; + sPAPRTCETable *tcet; - if (tcet && tcet->nb_table) { - spapr_tce_table_disable(tcet); + for (i = 0; i < SPAPR_PCI_DMA_MAX_WINDOWS; ++i) { + tcet = spapr_tce_find_by_liobn(sphb->dma_liobn[i]); + + if (tcet && tcet->nb_table) { + spapr_tce_table_disable(tcet); + } } /* Register default 32bit DMA window */ + tcet = spapr_tce_find_by_liobn(sphb->dma_liobn[0]); spapr_tce_table_enable(tcet, SPAPR_TCE_PAGE_SHIFT, sphb->dma_win_addr, sphb->dma_win_size >> SPAPR_TCE_PAGE_SHIFT); } @@ -1516,7 +1526,8 @@ static void spapr_phb_reset(DeviceState *qdev) static Property spapr_phb_properties[] = { DEFINE_PROP_UINT32("index", sPAPRPHBState, index, -1), DEFINE_PROP_UINT64("buid", sPAPRPHBState, buid, -1), - DEFINE_PROP_UINT32("liobn", sPAPRPHBState, dma_liobn, -1), + DEFINE_PROP_UINT32("liobn", sPAPRPHBState, dma_liobn[0], -1), + DEFINE_PROP_UINT32("liobn64", sPAPRPHBState, dma_liobn[1], -1), DEFINE_PROP_UINT64("mem_win_addr", sPAPRPHBState, mem_win_addr, -1), DEFINE_PROP_UINT64("mem_win_size", sPAPRPHBState, mem_win_size, SPAPR_PCI_MMIO_WIN_SIZE), @@ -1528,6 +1539,11 @@ static Property spapr_phb_properties[] = { /* Default DMA window is 0..1GB */ DEFINE_PROP_UINT64("dma_win_addr", sPAPRPHBState, dma_win_addr, 0), DEFINE_PROP_UINT64("dma_win_size", sPAPRPHBState, dma_win_size, 0x40000000), + DEFINE_PROP_UINT64("dma64_win_addr", sPAPRPHBState, dma64_win_addr, + 0x800000000000000ULL), + DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true), + DEFINE_PROP_UINT64("pgsz", sPAPRPHBState, page_size_mask, + (1ULL << 12) | (1ULL << 16)), DEFINE_PROP_END_OF_LIST(), }; @@ -1604,7 +1620,7 @@ static const VMStateDescription vmstate_spapr_pci = { .post_load = spapr_pci_post_load, .fields = (VMStateField[]) { VMSTATE_UINT64_EQUAL(buid, sPAPRPHBState), - VMSTATE_UINT32_EQUAL(dma_liobn, sPAPRPHBState), + VMSTATE_UINT32_EQUAL(dma_liobn[0], sPAPRPHBState), VMSTATE_UINT64_EQUAL(mem_win_addr, sPAPRPHBState), VMSTATE_UINT64_EQUAL(mem_win_size, sPAPRPHBState), VMSTATE_UINT64_EQUAL(io_win_addr, sPAPRPHBState), @@ -1780,6 +1796,15 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, uint32_t interrupt_map_mask[] = { cpu_to_be32(b_ddddd(-1)|b_fff(0)), 0x0, 0x0, cpu_to_be32(-1)}; uint32_t interrupt_map[PCI_SLOT_MAX * PCI_NUM_PINS][7]; + uint32_t ddw_applicable[] = { + cpu_to_be32(RTAS_IBM_QUERY_PE_DMA_WINDOW), + cpu_to_be32(RTAS_IBM_CREATE_PE_DMA_WINDOW), + cpu_to_be32(RTAS_IBM_REMOVE_PE_DMA_WINDOW) + }; + uint32_t ddw_extensions[] = { + cpu_to_be32(1), + cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW) + }; sPAPRTCETable *tcet; PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus; sPAPRFDT s_fdt; @@ -1804,6 +1829,14 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pci-config-space-type", 0x1)); _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pe-total-#msi", XICS_IRQS_SPAPR)); + /* Dynamic DMA window */ + if (phb->ddw_enabled) { + _FDT(fdt_setprop(fdt, bus_off, "ibm,ddw-applicable", &ddw_applicable, + sizeof(ddw_applicable))); + _FDT(fdt_setprop(fdt, bus_off, "ibm,ddw-extensions", + &ddw_extensions, sizeof(ddw_extensions))); + } + /* Build the interrupt-map, this must matches what is done * in pci_spapr_map_irq */ @@ -1827,7 +1860,7 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, _FDT(fdt_setprop(fdt, bus_off, "interrupt-map", &interrupt_map, sizeof(interrupt_map))); - tcet = spapr_tce_find_by_liobn(phb->dma_liobn); + tcet = spapr_tce_find_by_liobn(phb->dma_liobn[0]); if (!tcet) { return -1; } diff --git a/hw/ppc/spapr_rtas_ddw.c b/hw/ppc/spapr_rtas_ddw.c new file mode 100644 index 0000000000..177dcffc9b --- /dev/null +++ b/hw/ppc/spapr_rtas_ddw.c @@ -0,0 +1,295 @@ +/* + * QEMU sPAPR Dynamic DMA windows support + * + * Copyright (c) 2015 Alexey Kardashevskiy, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include "qemu/error-report.h" +#include "hw/ppc/spapr.h" +#include "hw/pci-host/spapr.h" +#include "trace.h" + +static int spapr_phb_get_active_win_num_cb(Object *child, void *opaque) +{ + sPAPRTCETable *tcet; + + tcet = (sPAPRTCETable *) object_dynamic_cast(child, TYPE_SPAPR_TCE_TABLE); + if (tcet && tcet->nb_table) { + ++*(unsigned *)opaque; + } + return 0; +} + +static unsigned spapr_phb_get_active_win_num(sPAPRPHBState *sphb) +{ + unsigned ret = 0; + + object_child_foreach(OBJECT(sphb), spapr_phb_get_active_win_num_cb, &ret); + + return ret; +} + +static int spapr_phb_get_free_liobn_cb(Object *child, void *opaque) +{ + sPAPRTCETable *tcet; + + tcet = (sPAPRTCETable *) object_dynamic_cast(child, TYPE_SPAPR_TCE_TABLE); + if (tcet && !tcet->nb_table) { + *(uint32_t *)opaque = tcet->liobn; + return 1; + } + return 0; +} + +static unsigned spapr_phb_get_free_liobn(sPAPRPHBState *sphb) +{ + uint32_t liobn = 0; + + object_child_foreach(OBJECT(sphb), spapr_phb_get_free_liobn_cb, &liobn); + + return liobn; +} + +static uint32_t spapr_page_mask_to_query_mask(uint64_t page_mask) +{ + int i; + uint32_t mask = 0; + const struct { int shift; uint32_t mask; } masks[] = { + { 12, RTAS_DDW_PGSIZE_4K }, + { 16, RTAS_DDW_PGSIZE_64K }, + { 24, RTAS_DDW_PGSIZE_16M }, + { 25, RTAS_DDW_PGSIZE_32M }, + { 26, RTAS_DDW_PGSIZE_64M }, + { 27, RTAS_DDW_PGSIZE_128M }, + { 28, RTAS_DDW_PGSIZE_256M }, + { 34, RTAS_DDW_PGSIZE_16G }, + }; + + for (i = 0; i < ARRAY_SIZE(masks); ++i) { + if (page_mask & (1ULL << masks[i].shift)) { + mask |= masks[i].mask; + } + } + + return mask; +} + +static void rtas_ibm_query_pe_dma_window(PowerPCCPU *cpu, + sPAPRMachineState *spapr, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ + sPAPRPHBState *sphb; + uint64_t buid, max_window_size; + uint32_t avail, addr, pgmask = 0; + MachineState *machine = MACHINE(spapr); + + if ((nargs != 3) || (nret != 5)) { + goto param_error_exit; + } + + buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2); + addr = rtas_ld(args, 0); + sphb = spapr_pci_find_phb(spapr, buid); + if (!sphb || !sphb->ddw_enabled) { + goto param_error_exit; + } + + /* Translate page mask to LoPAPR format */ + pgmask = spapr_page_mask_to_query_mask(sphb->page_size_mask); + + /* + * This is "Largest contiguous block of TCEs allocated specifically + * for (that is, are reserved for) this PE". + * Return the maximum number as maximum supported RAM size was in 4K pages. + */ + if (machine->ram_size == machine->maxram_size) { + max_window_size = machine->ram_size; + } else { + MemoryHotplugState *hpms = &spapr->hotplug_memory; + + max_window_size = hpms->base + memory_region_size(&hpms->mr); + } + + avail = SPAPR_PCI_DMA_MAX_WINDOWS - spapr_phb_get_active_win_num(sphb); + + rtas_st(rets, 0, RTAS_OUT_SUCCESS); + rtas_st(rets, 1, avail); + rtas_st(rets, 2, max_window_size >> SPAPR_TCE_PAGE_SHIFT); + rtas_st(rets, 3, pgmask); + rtas_st(rets, 4, 0); /* DMA migration mask, not supported */ + + trace_spapr_iommu_ddw_query(buid, addr, avail, max_window_size, pgmask); + return; + +param_error_exit: + rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR); +} + +static void rtas_ibm_create_pe_dma_window(PowerPCCPU *cpu, + sPAPRMachineState *spapr, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ + sPAPRPHBState *sphb; + sPAPRTCETable *tcet = NULL; + uint32_t addr, page_shift, window_shift, liobn; + uint64_t buid, win_addr; + int windows; + + if ((nargs != 5) || (nret != 4)) { + goto param_error_exit; + } + + buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2); + addr = rtas_ld(args, 0); + sphb = spapr_pci_find_phb(spapr, buid); + if (!sphb || !sphb->ddw_enabled) { + goto param_error_exit; + } + + page_shift = rtas_ld(args, 3); + window_shift = rtas_ld(args, 4); + liobn = spapr_phb_get_free_liobn(sphb); + windows = spapr_phb_get_active_win_num(sphb); + + if (!(sphb->page_size_mask & (1ULL << page_shift)) || + (window_shift < page_shift)) { + goto param_error_exit; + } + + if (!liobn || !sphb->ddw_enabled || windows == SPAPR_PCI_DMA_MAX_WINDOWS) { + goto hw_error_exit; + } + + tcet = spapr_tce_find_by_liobn(liobn); + if (!tcet) { + goto hw_error_exit; + } + + win_addr = (windows == 0) ? sphb->dma_win_addr : sphb->dma64_win_addr; + spapr_tce_table_enable(tcet, page_shift, win_addr, + 1ULL << (window_shift - page_shift)); + if (!tcet->nb_table) { + goto hw_error_exit; + } + + trace_spapr_iommu_ddw_create(buid, addr, 1ULL << page_shift, + 1ULL << window_shift, tcet->bus_offset, liobn); + + rtas_st(rets, 0, RTAS_OUT_SUCCESS); + rtas_st(rets, 1, liobn); + rtas_st(rets, 2, tcet->bus_offset >> 32); + rtas_st(rets, 3, tcet->bus_offset & ((uint32_t) -1)); + + return; + +hw_error_exit: + rtas_st(rets, 0, RTAS_OUT_HW_ERROR); + return; + +param_error_exit: + rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR); +} + +static void rtas_ibm_remove_pe_dma_window(PowerPCCPU *cpu, + sPAPRMachineState *spapr, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ + sPAPRPHBState *sphb; + sPAPRTCETable *tcet; + uint32_t liobn; + + if ((nargs != 1) || (nret != 1)) { + goto param_error_exit; + } + + liobn = rtas_ld(args, 0); + tcet = spapr_tce_find_by_liobn(liobn); + if (!tcet) { + goto param_error_exit; + } + + sphb = SPAPR_PCI_HOST_BRIDGE(OBJECT(tcet)->parent); + if (!sphb || !sphb->ddw_enabled || !tcet->nb_table) { + goto param_error_exit; + } + + spapr_tce_table_disable(tcet); + trace_spapr_iommu_ddw_remove(liobn); + + rtas_st(rets, 0, RTAS_OUT_SUCCESS); + return; + +param_error_exit: + rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR); +} + +static void rtas_ibm_reset_pe_dma_window(PowerPCCPU *cpu, + sPAPRMachineState *spapr, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ + sPAPRPHBState *sphb; + uint64_t buid; + uint32_t addr; + + if ((nargs != 3) || (nret != 1)) { + goto param_error_exit; + } + + buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2); + addr = rtas_ld(args, 0); + sphb = spapr_pci_find_phb(spapr, buid); + if (!sphb || !sphb->ddw_enabled) { + goto param_error_exit; + } + + spapr_phb_dma_reset(sphb); + trace_spapr_iommu_ddw_reset(buid, addr); + + rtas_st(rets, 0, RTAS_OUT_SUCCESS); + + return; + +param_error_exit: + rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR); +} + +static void spapr_rtas_ddw_init(void) +{ + spapr_rtas_register(RTAS_IBM_QUERY_PE_DMA_WINDOW, + "ibm,query-pe-dma-window", + rtas_ibm_query_pe_dma_window); + spapr_rtas_register(RTAS_IBM_CREATE_PE_DMA_WINDOW, + "ibm,create-pe-dma-window", + rtas_ibm_create_pe_dma_window); + spapr_rtas_register(RTAS_IBM_REMOVE_PE_DMA_WINDOW, + "ibm,remove-pe-dma-window", + rtas_ibm_remove_pe_dma_window); + spapr_rtas_register(RTAS_IBM_RESET_PE_DMA_WINDOW, + "ibm,reset-pe-dma-window", + rtas_ibm_reset_pe_dma_window); +} + +type_init(spapr_rtas_ddw_init) diff --git a/hw/ppc/trace-events b/hw/ppc/trace-events index 6da713547f..900679bc9d 100644 --- a/hw/ppc/trace-events +++ b/hw/ppc/trace-events @@ -30,6 +30,10 @@ spapr_iommu_xlate(uint64_t liobn, uint64_t ioba, uint64_t tce, unsigned perm, un spapr_iommu_new_table(uint64_t liobn, void *table, int fd) "liobn=%"PRIx64" table=%p fd=%d" spapr_iommu_pre_save(uint64_t liobn, uint32_t nb, uint64_t offs, uint32_t ps) "liobn=%"PRIx64" %"PRIx32" bus_offset=%"PRIx64" ps=%"PRIu32 spapr_iommu_post_load(uint64_t liobn, uint32_t pre_nb, uint32_t post_nb, uint64_t offs, uint32_t ps) "liobn=%"PRIx64" %"PRIx32" => %"PRIx32" bus_offset=%"PRIx64" ps=%"PRIu32 +spapr_iommu_ddw_query(uint64_t buid, uint32_t cfgaddr, unsigned wa, uint64_t win_size, uint32_t pgmask) "buid=%"PRIx64" addr=%"PRIx32", %u windows available, max window size=%"PRIx64", mask=%"PRIx32 +spapr_iommu_ddw_create(uint64_t buid, uint32_t cfgaddr, uint64_t pg_size, uint64_t req_size, uint64_t start, uint32_t liobn) "buid=%"PRIx64" addr=%"PRIx32", page size=0x%"PRIx64", requested=0x%"PRIx64", start addr=%"PRIx64", liobn=%"PRIx32 +spapr_iommu_ddw_remove(uint32_t liobn) "liobn=%"PRIx32 +spapr_iommu_ddw_reset(uint64_t buid, uint32_t cfgaddr) "buid=%"PRIx64" addr=%"PRIx32 # hw/ppc/ppc.c ppc_tb_adjust(uint64_t offs1, uint64_t offs2, int64_t diff, int64_t seconds) "adjusted from 0x%"PRIx64" to 0x%"PRIx64", diff %"PRId64" (%"PRId64"s)" diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs index ceddbb8f99..c25e32b029 100644 --- a/hw/vfio/Makefile.objs +++ b/hw/vfio/Makefile.objs @@ -4,4 +4,5 @@ obj-$(CONFIG_PCI) += pci.o pci-quirks.o obj-$(CONFIG_SOFTMMU) += platform.o obj-$(CONFIG_SOFTMMU) += calxeda-xgmac.o obj-$(CONFIG_SOFTMMU) += amd-xgbe.o +obj-$(CONFIG_SOFTMMU) += spapr.o endif diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 7be638e0e3..f3c0522e7e 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -28,6 +28,7 @@ #include "exec/memory.h" #include "hw/hw.h" #include "qemu/error-report.h" +#include "qemu/range.h" #include "sysemu/kvm.h" #ifdef CONFIG_KVM #include "linux/kvm.h" @@ -241,6 +242,44 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova, return -errno; } +static void vfio_host_win_add(VFIOContainer *container, + hwaddr min_iova, hwaddr max_iova, + uint64_t iova_pgsizes) +{ + VFIOHostDMAWindow *hostwin; + + QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + if (ranges_overlap(hostwin->min_iova, + hostwin->max_iova - hostwin->min_iova + 1, + min_iova, + max_iova - min_iova + 1)) { + hw_error("%s: Overlapped IOMMU are not enabled", __func__); + } + } + + hostwin = g_malloc0(sizeof(*hostwin)); + + hostwin->min_iova = min_iova; + hostwin->max_iova = max_iova; + hostwin->iova_pgsizes = iova_pgsizes; + QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next); +} + +static int vfio_host_win_del(VFIOContainer *container, hwaddr min_iova, + hwaddr max_iova) +{ + VFIOHostDMAWindow *hostwin; + + QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) { + QLIST_REMOVE(hostwin, hostwin_next); + return 0; + } + } + + return -1; +} + static bool vfio_listener_skipped_section(MemoryRegionSection *section) { return (!memory_region_is_ram(section->mr) && @@ -329,6 +368,8 @@ static void vfio_listener_region_add(MemoryListener *listener, Int128 llend, llsize; void *vaddr; int ret; + VFIOHostDMAWindow *hostwin; + bool hostwin_found; if (vfio_listener_skipped_section(section)) { trace_vfio_listener_region_add_skip( @@ -354,7 +395,40 @@ static void vfio_listener_region_add(MemoryListener *listener, } end = int128_get64(int128_sub(llend, int128_one())); - if ((iova < container->min_iova) || (end > container->max_iova)) { + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { + VFIOHostDMAWindow *hostwin; + hwaddr pgsize = 0; + + /* For now intersections are not allowed, we may relax this later */ + QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + if (ranges_overlap(hostwin->min_iova, + hostwin->max_iova - hostwin->min_iova + 1, + section->offset_within_address_space, + int128_get64(section->size))) { + ret = -1; + goto fail; + } + } + + ret = vfio_spapr_create_window(container, section, &pgsize); + if (ret) { + goto fail; + } + + vfio_host_win_add(container, section->offset_within_address_space, + section->offset_within_address_space + + int128_get64(section->size) - 1, pgsize); + } + + hostwin_found = false; + QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + if (hostwin->min_iova <= iova && end <= hostwin->max_iova) { + hostwin_found = true; + break; + } + } + + if (!hostwin_found) { error_report("vfio: IOMMU container %p can't map guest IOVA region" " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, end); @@ -369,10 +443,6 @@ static void vfio_listener_region_add(MemoryListener *listener, trace_vfio_listener_region_add_iommu(iova, end); /* - * FIXME: We should do some checking to see if the - * capabilities of the host VFIO IOMMU are adequate to model - * the guest IOMMU - * * FIXME: For VFIO iommu types which have KVM acceleration to * avoid bouncing all map/unmaps through qemu this way, this * would be the right place to wire that up (tell the KVM @@ -493,6 +563,18 @@ static void vfio_listener_region_del(MemoryListener *listener, "0x%"HWADDR_PRIx") = %d (%m)", container, iova, int128_get64(llsize), ret); } + + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { + vfio_spapr_remove_window(container, + section->offset_within_address_space); + if (vfio_host_win_del(container, + section->offset_within_address_space, + section->offset_within_address_space + + int128_get64(section->size) - 1) < 0) { + hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx, + __func__, section->offset_within_address_space); + } + } } static const MemoryListener vfio_memory_listener = { @@ -503,6 +585,9 @@ static const MemoryListener vfio_memory_listener = { static void vfio_listener_release(VFIOContainer *container) { memory_listener_unregister(&container->listener); + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { + memory_listener_unregister(&container->prereg_listener); + } } static struct vfio_info_cap_header * @@ -861,8 +946,8 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) goto free_container_exit; } - ret = ioctl(fd, VFIO_SET_IOMMU, - v2 ? VFIO_TYPE1v2_IOMMU : VFIO_TYPE1_IOMMU); + container->iommu_type = v2 ? VFIO_TYPE1v2_IOMMU : VFIO_TYPE1_IOMMU; + ret = ioctl(fd, VFIO_SET_IOMMU, container->iommu_type); if (ret) { error_report("vfio: failed to set iommu for container: %m"); ret = -errno; @@ -876,19 +961,18 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) * existing Type1 IOMMUs generally support any IOVA we're * going to actually try in practice. */ - container->min_iova = 0; - container->max_iova = (hwaddr)-1; - - /* Assume just 4K IOVA page size */ - container->iova_pgsizes = 0x1000; info.argsz = sizeof(info); ret = ioctl(fd, VFIO_IOMMU_GET_INFO, &info); /* Ignore errors */ - if ((ret == 0) && (info.flags & VFIO_IOMMU_INFO_PGSIZES)) { - container->iova_pgsizes = info.iova_pgsizes; + if (ret || !(info.flags & VFIO_IOMMU_INFO_PGSIZES)) { + /* Assume 4k IOVA page size */ + info.iova_pgsizes = 4096; } - } else if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU)) { + vfio_host_win_add(container, 0, (hwaddr)-1, info.iova_pgsizes); + } else if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU) || + ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_v2_IOMMU)) { struct vfio_iommu_spapr_tce_info info; + bool v2 = !!ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_v2_IOMMU); ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd); if (ret) { @@ -896,7 +980,9 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) ret = -errno; goto free_container_exit; } - ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU); + container->iommu_type = + v2 ? VFIO_SPAPR_TCE_v2_IOMMU : VFIO_SPAPR_TCE_IOMMU; + ret = ioctl(fd, VFIO_SET_IOMMU, container->iommu_type); if (ret) { error_report("vfio: failed to set iommu for container: %m"); ret = -errno; @@ -908,30 +994,54 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) * when container fd is closed so we do not call it explicitly * in this file. */ - ret = ioctl(fd, VFIO_IOMMU_ENABLE); - if (ret) { - error_report("vfio: failed to enable container: %m"); - ret = -errno; - goto free_container_exit; + if (!v2) { + ret = ioctl(fd, VFIO_IOMMU_ENABLE); + if (ret) { + error_report("vfio: failed to enable container: %m"); + ret = -errno; + goto free_container_exit; + } + } else { + container->prereg_listener = vfio_prereg_listener; + + memory_listener_register(&container->prereg_listener, + &address_space_memory); + if (container->error) { + memory_listener_unregister(&container->prereg_listener); + error_report("vfio: RAM memory listener initialization failed for container"); + goto free_container_exit; + } } - /* - * This only considers the host IOMMU's 32-bit window. At - * some point we need to add support for the optional 64-bit - * window and dynamic windows - */ info.argsz = sizeof(info); ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info); if (ret) { error_report("vfio: VFIO_IOMMU_SPAPR_TCE_GET_INFO failed: %m"); ret = -errno; + if (v2) { + memory_listener_unregister(&container->prereg_listener); + } goto free_container_exit; } - container->min_iova = info.dma32_window_start; - container->max_iova = container->min_iova + info.dma32_window_size - 1; - /* Assume just 4K IOVA pages for now */ - container->iova_pgsizes = 0x1000; + if (v2) { + /* + * There is a default window in just created container. + * To make region_add/del simpler, we better remove this + * window now and let those iommu_listener callbacks + * create/remove them when needed. + */ + ret = vfio_spapr_remove_window(container, info.dma32_window_start); + if (ret) { + goto free_container_exit; + } + } else { + /* The default table uses 4K pages */ + vfio_host_win_add(container, info.dma32_window_start, + info.dma32_window_start + + info.dma32_window_size - 1, + 0x1000); + } } else { error_report("vfio: No available IOMMU models"); ret = -EINVAL; diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c new file mode 100644 index 0000000000..0af342332c --- /dev/null +++ b/hw/vfio/spapr.c @@ -0,0 +1,210 @@ +/* + * DMA memory preregistration + * + * Authors: + * Alexey Kardashevskiy <aik@ozlabs.ru> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include <sys/ioctl.h> +#include <linux/vfio.h> + +#include "hw/vfio/vfio-common.h" +#include "hw/hw.h" +#include "qemu/error-report.h" +#include "trace.h" + +static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section) +{ + if (memory_region_is_iommu(section->mr)) { + hw_error("Cannot possibly preregister IOMMU memory"); + } + + return !memory_region_is_ram(section->mr) || + memory_region_is_skip_dump(section->mr); +} + +static void *vfio_prereg_gpa_to_vaddr(MemoryRegionSection *section, hwaddr gpa) +{ + return memory_region_get_ram_ptr(section->mr) + + section->offset_within_region + + (gpa - section->offset_within_address_space); +} + +static void vfio_prereg_listener_region_add(MemoryListener *listener, + MemoryRegionSection *section) +{ + VFIOContainer *container = container_of(listener, VFIOContainer, + prereg_listener); + const hwaddr gpa = section->offset_within_address_space; + hwaddr end; + int ret; + hwaddr page_mask = qemu_real_host_page_mask; + struct vfio_iommu_spapr_register_memory reg = { + .argsz = sizeof(reg), + .flags = 0, + }; + + if (vfio_prereg_listener_skipped_section(section)) { + trace_vfio_prereg_listener_region_add_skip( + section->offset_within_address_space, + section->offset_within_address_space + + int128_get64(int128_sub(section->size, int128_one()))); + return; + } + + if (unlikely((section->offset_within_address_space & ~page_mask) || + (section->offset_within_region & ~page_mask) || + (int128_get64(section->size) & ~page_mask))) { + error_report("%s received unaligned region", __func__); + return; + } + + end = section->offset_within_address_space + int128_get64(section->size); + if (gpa >= end) { + return; + } + + memory_region_ref(section->mr); + + reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa); + reg.size = end - gpa; + + ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, ®); + trace_vfio_prereg_register(reg.vaddr, reg.size, ret ? -errno : 0); + if (ret) { + /* + * On the initfn path, store the first error in the container so we + * can gracefully fail. Runtime, there's not much we can do other + * than throw a hardware error. + */ + if (!container->initialized) { + if (!container->error) { + container->error = ret; + } + } else { + hw_error("vfio: Memory registering failed, unable to continue"); + } + } +} + +static void vfio_prereg_listener_region_del(MemoryListener *listener, + MemoryRegionSection *section) +{ + VFIOContainer *container = container_of(listener, VFIOContainer, + prereg_listener); + const hwaddr gpa = section->offset_within_address_space; + hwaddr end; + int ret; + hwaddr page_mask = qemu_real_host_page_mask; + struct vfio_iommu_spapr_register_memory reg = { + .argsz = sizeof(reg), + .flags = 0, + }; + + if (vfio_prereg_listener_skipped_section(section)) { + trace_vfio_prereg_listener_region_del_skip( + section->offset_within_address_space, + section->offset_within_address_space + + int128_get64(int128_sub(section->size, int128_one()))); + return; + } + + if (unlikely((section->offset_within_address_space & ~page_mask) || + (section->offset_within_region & ~page_mask) || + (int128_get64(section->size) & ~page_mask))) { + error_report("%s received unaligned region", __func__); + return; + } + + end = section->offset_within_address_space + int128_get64(section->size); + if (gpa >= end) { + return; + } + + reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa); + reg.size = end - gpa; + + ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, ®); + trace_vfio_prereg_unregister(reg.vaddr, reg.size, ret ? -errno : 0); +} + +const MemoryListener vfio_prereg_listener = { + .region_add = vfio_prereg_listener_region_add, + .region_del = vfio_prereg_listener_region_del, +}; + +int vfio_spapr_create_window(VFIOContainer *container, + MemoryRegionSection *section, + hwaddr *pgsize) +{ + int ret; + unsigned pagesize = memory_region_iommu_get_min_page_size(section->mr); + unsigned entries, pages; + struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) }; + + /* + * FIXME: For VFIO iommu types which have KVM acceleration to + * avoid bouncing all map/unmaps through qemu this way, this + * would be the right place to wire that up (tell the KVM + * device emulation the VFIO iommu handles to use). + */ + create.window_size = int128_get64(section->size); + create.page_shift = ctz64(pagesize); + /* + * SPAPR host supports multilevel TCE tables, there is some + * heuristic to decide how many levels we want for our table: + * 0..64 = 1; 65..4096 = 2; 4097..262144 = 3; 262145.. = 4 + */ + entries = create.window_size >> create.page_shift; + pages = MAX((entries * sizeof(uint64_t)) / getpagesize(), 1); + pages = MAX(pow2ceil(pages) - 1, 1); /* Round up */ + create.levels = ctz64(pages) / 6 + 1; + + ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create); + if (ret) { + error_report("Failed to create a window, ret = %d (%m)", ret); + return -errno; + } + + if (create.start_addr != section->offset_within_address_space) { + vfio_spapr_remove_window(container, create.start_addr); + + error_report("Host doesn't support DMA window at %"HWADDR_PRIx", must be %"PRIx64, + section->offset_within_address_space, + (uint64_t)create.start_addr); + ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove); + return -EINVAL; + } + trace_vfio_spapr_create_window(create.page_shift, + create.window_size, + create.start_addr); + *pgsize = pagesize; + + return 0; +} + +int vfio_spapr_remove_window(VFIOContainer *container, + hwaddr offset_within_address_space) +{ + struct vfio_iommu_spapr_tce_remove remove = { + .argsz = sizeof(remove), + .start_addr = offset_within_address_space, + }; + int ret; + + ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove); + if (ret) { + error_report("Failed to remove window at %"PRIx64, + (uint64_t)remove.start_addr); + return -errno; + } + + trace_vfio_spapr_remove_window(offset_within_address_space); + + return 0; +} diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index a768fb54ec..4bb7690c46 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -115,3 +115,11 @@ vfio_platform_populate_interrupts(int pin, int count, int flags) "- IRQ index %d vfio_intp_interrupt_set_pending(int index) "irq %d is set PENDING" vfio_platform_start_level_irqfd_injection(int index, int fd, int resamplefd) "IRQ index=%d, fd = %d, resamplefd = %d" vfio_platform_start_edge_irqfd_injection(int index, int fd) "IRQ index=%d, fd = %d" + +# hw/vfio/spapr.c +vfio_prereg_listener_region_add_skip(uint64_t start, uint64_t end) "%"PRIx64" - %"PRIx64 +vfio_prereg_listener_region_del_skip(uint64_t start, uint64_t end) "%"PRIx64" - %"PRIx64 +vfio_prereg_register(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d" +vfio_prereg_unregister(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d" +vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x winsize=0x%"PRIx64" offset=0x%"PRIx64 +vfio_spapr_remove_window(uint64_t off) "offset=%"PRIx64 diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h index 288b89c04a..193631d2dc 100644 --- a/include/hw/pci-host/spapr.h +++ b/include/hw/pci-host/spapr.h @@ -32,6 +32,8 @@ #define SPAPR_PCI_HOST_BRIDGE(obj) \ OBJECT_CHECK(sPAPRPHBState, (obj), TYPE_SPAPR_PCI_HOST_BRIDGE) +#define SPAPR_PCI_DMA_MAX_WINDOWS 2 + typedef struct sPAPRPHBState sPAPRPHBState; typedef struct spapr_pci_msi { @@ -56,7 +58,7 @@ struct sPAPRPHBState { hwaddr mem_win_addr, mem_win_size, io_win_addr, io_win_size; MemoryRegion memwindow, iowindow, msiwindow; - uint32_t dma_liobn; + uint32_t dma_liobn[SPAPR_PCI_DMA_MAX_WINDOWS]; hwaddr dma_win_addr, dma_win_size; AddressSpace iommu_as; MemoryRegion iommu_root; @@ -71,6 +73,10 @@ struct sPAPRPHBState { spapr_pci_msi_mig *msi_devs; QLIST_ENTRY(sPAPRPHBState) list; + + bool ddw_enabled; + uint64_t page_size_mask; + uint64_t dma64_win_addr; }; #define SPAPR_PCI_MAX_INDEX 255 diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h index 49325555d3..2e2dd14c30 100644 --- a/include/hw/ppc/spapr.h +++ b/include/hw/ppc/spapr.h @@ -416,6 +416,16 @@ int spapr_allocate_irq_block(int num, bool lsi, bool msi); #define RTAS_OUT_NOT_AUTHORIZED -9002 #define RTAS_OUT_SYSPARM_PARAM_ERROR -9999 +/* DDW pagesize mask values from ibm,query-pe-dma-window */ +#define RTAS_DDW_PGSIZE_4K 0x01 +#define RTAS_DDW_PGSIZE_64K 0x02 +#define RTAS_DDW_PGSIZE_16M 0x04 +#define RTAS_DDW_PGSIZE_32M 0x08 +#define RTAS_DDW_PGSIZE_64M 0x10 +#define RTAS_DDW_PGSIZE_128M 0x20 +#define RTAS_DDW_PGSIZE_256M 0x40 +#define RTAS_DDW_PGSIZE_16G 0x80 + /* RTAS tokens */ #define RTAS_TOKEN_BASE 0x2000 @@ -457,8 +467,12 @@ int spapr_allocate_irq_block(int num, bool lsi, bool msi); #define RTAS_IBM_SET_SLOT_RESET (RTAS_TOKEN_BASE + 0x23) #define RTAS_IBM_CONFIGURE_PE (RTAS_TOKEN_BASE + 0x24) #define RTAS_IBM_SLOT_ERROR_DETAIL (RTAS_TOKEN_BASE + 0x25) +#define RTAS_IBM_QUERY_PE_DMA_WINDOW (RTAS_TOKEN_BASE + 0x26) +#define RTAS_IBM_CREATE_PE_DMA_WINDOW (RTAS_TOKEN_BASE + 0x27) +#define RTAS_IBM_REMOVE_PE_DMA_WINDOW (RTAS_TOKEN_BASE + 0x28) +#define RTAS_IBM_RESET_PE_DMA_WINDOW (RTAS_TOKEN_BASE + 0x29) -#define RTAS_TOKEN_MAX (RTAS_TOKEN_BASE + 0x26) +#define RTAS_TOKEN_MAX (RTAS_TOKEN_BASE + 0x2A) /* RTAS ibm,get-system-parameter token values */ #define RTAS_SYSPARM_SPLPAR_CHARACTERISTICS 20 diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 0610377789..07f7188df4 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -73,6 +73,8 @@ typedef struct VFIOContainer { VFIOAddressSpace *space; int fd; /* /dev/vfio/vfio, empowered by the attached groups */ MemoryListener listener; + MemoryListener prereg_listener; + unsigned iommu_type; int error; bool initialized; /* @@ -80,9 +82,8 @@ typedef struct VFIOContainer { * contiguous IOVA window. We may need to generalize that in * future */ - hwaddr min_iova, max_iova; - uint64_t iova_pgsizes; QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; + QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; QLIST_HEAD(, VFIOGroup) group_list; QLIST_ENTRY(VFIOContainer) next; } VFIOContainer; @@ -95,6 +96,13 @@ typedef struct VFIOGuestIOMMU { QLIST_ENTRY(VFIOGuestIOMMU) giommu_next; } VFIOGuestIOMMU; +typedef struct VFIOHostDMAWindow { + hwaddr min_iova; + hwaddr max_iova; + uint64_t iova_pgsizes; + QLIST_ENTRY(VFIOHostDMAWindow) hostwin_next; +} VFIOHostDMAWindow; + typedef struct VFIODeviceOps VFIODeviceOps; typedef struct VFIODevice { @@ -158,4 +166,12 @@ int vfio_get_region_info(VFIODevice *vbasedev, int index, int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type, uint32_t subtype, struct vfio_region_info **info); #endif +extern const MemoryListener vfio_prereg_listener; + +int vfio_spapr_create_window(VFIOContainer *container, + MemoryRegionSection *section, + hwaddr *pgsize); +int vfio_spapr_remove_window(VFIOContainer *container, + hwaddr offset_within_address_space); + #endif /* !HW_VFIO_VFIO_COMMON_H */ diff --git a/target-ppc/cpu.h b/target-ppc/cpu.h index af73bced9f..2666a3f80d 100644 --- a/target-ppc/cpu.h +++ b/target-ppc/cpu.h @@ -1047,6 +1047,8 @@ struct CPUPPCState { uint64_t insns_flags2; #if defined(TARGET_PPC64) struct ppc_segment_page_sizes sps; + ppc_slb_t vrma_slb; + target_ulong rmls; bool ci_large_pages; #endif diff --git a/target-ppc/fpu_helper.c b/target-ppc/fpu_helper.c index 4ef893be2c..d9795d04d0 100644 --- a/target-ppc/fpu_helper.c +++ b/target-ppc/fpu_helper.c @@ -2689,19 +2689,19 @@ void helper_##op(CPUPPCState *env, uint32_t opcode) \ helper_float_check_status(env); \ } -VSX_ROUND(xsrdpi, 1, float64, VsrD(0), float_round_nearest_even, 1) +VSX_ROUND(xsrdpi, 1, float64, VsrD(0), float_round_ties_away, 1) VSX_ROUND(xsrdpic, 1, float64, VsrD(0), FLOAT_ROUND_CURRENT, 1) VSX_ROUND(xsrdpim, 1, float64, VsrD(0), float_round_down, 1) VSX_ROUND(xsrdpip, 1, float64, VsrD(0), float_round_up, 1) VSX_ROUND(xsrdpiz, 1, float64, VsrD(0), float_round_to_zero, 1) -VSX_ROUND(xvrdpi, 2, float64, VsrD(i), float_round_nearest_even, 0) +VSX_ROUND(xvrdpi, 2, float64, VsrD(i), float_round_ties_away, 0) VSX_ROUND(xvrdpic, 2, float64, VsrD(i), FLOAT_ROUND_CURRENT, 0) VSX_ROUND(xvrdpim, 2, float64, VsrD(i), float_round_down, 0) VSX_ROUND(xvrdpip, 2, float64, VsrD(i), float_round_up, 0) VSX_ROUND(xvrdpiz, 2, float64, VsrD(i), float_round_to_zero, 0) -VSX_ROUND(xvrspi, 4, float32, VsrW(i), float_round_nearest_even, 0) +VSX_ROUND(xvrspi, 4, float32, VsrW(i), float_round_ties_away, 0) VSX_ROUND(xvrspic, 4, float32, VsrW(i), FLOAT_ROUND_CURRENT, 0) VSX_ROUND(xvrspim, 4, float32, VsrW(i), float_round_down, 0) VSX_ROUND(xvrspip, 4, float32, VsrW(i), float_round_up, 0) diff --git a/target-ppc/mmu-hash64.c b/target-ppc/mmu-hash64.c index 3b1357a648..82c2186bcf 100644 --- a/target-ppc/mmu-hash64.c +++ b/target-ppc/mmu-hash64.c @@ -450,31 +450,47 @@ void ppc_hash64_stop_access(PowerPCCPU *cpu, uint64_t token) } } -/* Returns the effective page shift or 0. MPSS isn't supported yet so - * this will always be the slb_pshift or 0 - */ -static uint32_t ppc_hash64_pte_size_decode(uint64_t pte1, uint32_t slb_pshift) +static unsigned hpte_page_shift(const struct ppc_one_seg_page_size *sps, + uint64_t pte0, uint64_t pte1) { - switch (slb_pshift) { - case 12: + int i; + + if (!(pte0 & HPTE64_V_LARGE)) { + if (sps->page_shift != 12) { + /* 4kiB page in a non 4kiB segment */ + return 0; + } + /* Normal 4kiB page */ return 12; - case 16: - if ((pte1 & 0xf000) == 0x1000) { - return 16; + } + + for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) { + const struct ppc_one_page_size *ps = &sps->enc[i]; + uint64_t mask; + + if (!ps->page_shift) { + break; } - return 0; - case 24: - if ((pte1 & 0xff000) == 0) { - return 24; + + if (ps->page_shift == 12) { + /* L bit is set so this can't be a 4kiB page */ + continue; + } + + mask = ((1ULL << ps->page_shift) - 1) & HPTE64_R_RPN; + + if ((pte1 & mask) == (ps->pte_enc << HPTE64_R_RPN_SHIFT)) { + return ps->page_shift; } - return 0; } - return 0; + + return 0; /* Bad page size encoding */ } static hwaddr ppc_hash64_pteg_search(PowerPCCPU *cpu, hwaddr hash, - uint32_t slb_pshift, bool secondary, - target_ulong ptem, ppc_hash_pte64_t *pte) + const struct ppc_one_seg_page_size *sps, + target_ulong ptem, + ppc_hash_pte64_t *pte, unsigned *pshift) { CPUPPCState *env = &cpu->env; int i; @@ -491,11 +507,17 @@ static hwaddr ppc_hash64_pteg_search(PowerPCCPU *cpu, hwaddr hash, pte0 = ppc_hash64_load_hpte0(cpu, token, i); pte1 = ppc_hash64_load_hpte1(cpu, token, i); - if ((pte0 & HPTE64_V_VALID) - && (secondary == !!(pte0 & HPTE64_V_SECONDARY)) - && HPTE64_V_COMPARE(pte0, ptem)) { - uint32_t pshift = ppc_hash64_pte_size_decode(pte1, slb_pshift); - if (pshift == 0) { + /* This compares V, B, H (secondary) and the AVPN */ + if (HPTE64_V_COMPARE(pte0, ptem)) { + *pshift = hpte_page_shift(sps, pte0, pte1); + /* + * If there is no match, ignore the PTE, it could simply + * be for a different segment size encoding and the + * architecture specifies we should not match. Linux will + * potentially leave behind PTEs for the wrong base page + * size when demoting segments. + */ + if (*pshift == 0) { continue; } /* We don't do anything with pshift yet as qemu TLB only deals @@ -516,31 +538,40 @@ static hwaddr ppc_hash64_pteg_search(PowerPCCPU *cpu, hwaddr hash, static hwaddr ppc_hash64_htab_lookup(PowerPCCPU *cpu, ppc_slb_t *slb, target_ulong eaddr, - ppc_hash_pte64_t *pte) + ppc_hash_pte64_t *pte, unsigned *pshift) { CPUPPCState *env = &cpu->env; hwaddr pte_offset; hwaddr hash; uint64_t vsid, epnmask, epn, ptem; + const struct ppc_one_seg_page_size *sps = slb->sps; /* The SLB store path should prevent any bad page size encodings * getting in there, so: */ - assert(slb->sps); + assert(sps); + + /* If ISL is set in LPCR we need to clamp the page size to 4K */ + if (env->spr[SPR_LPCR] & LPCR_ISL) { + /* We assume that when using TCG, 4k is first entry of SPS */ + sps = &env->sps.sps[0]; + assert(sps->page_shift == 12); + } - epnmask = ~((1ULL << slb->sps->page_shift) - 1); + epnmask = ~((1ULL << sps->page_shift) - 1); if (slb->vsid & SLB_VSID_B) { /* 1TB segment */ vsid = (slb->vsid & SLB_VSID_VSID) >> SLB_VSID_SHIFT_1T; epn = (eaddr & ~SEGMENT_MASK_1T) & epnmask; - hash = vsid ^ (vsid << 25) ^ (epn >> slb->sps->page_shift); + hash = vsid ^ (vsid << 25) ^ (epn >> sps->page_shift); } else { /* 256M segment */ vsid = (slb->vsid & SLB_VSID_VSID) >> SLB_VSID_SHIFT; epn = (eaddr & ~SEGMENT_MASK_256M) & epnmask; - hash = vsid ^ (epn >> slb->sps->page_shift); + hash = vsid ^ (epn >> sps->page_shift); } ptem = (slb->vsid & SLB_VSID_PTEM) | ((epn >> 16) & HPTE64_V_AVPN); + ptem |= HPTE64_V_VALID; /* Page address translation */ qemu_log_mask(CPU_LOG_MMU, @@ -554,70 +585,30 @@ static hwaddr ppc_hash64_htab_lookup(PowerPCCPU *cpu, " vsid=" TARGET_FMT_lx " ptem=" TARGET_FMT_lx " hash=" TARGET_FMT_plx "\n", env->htab_base, env->htab_mask, vsid, ptem, hash); - pte_offset = ppc_hash64_pteg_search(cpu, hash, slb->sps->page_shift, - 0, ptem, pte); + pte_offset = ppc_hash64_pteg_search(cpu, hash, sps, ptem, pte, pshift); if (pte_offset == -1) { /* Secondary PTEG lookup */ + ptem |= HPTE64_V_SECONDARY; qemu_log_mask(CPU_LOG_MMU, "1 htab=" TARGET_FMT_plx "/" TARGET_FMT_plx " vsid=" TARGET_FMT_lx " api=" TARGET_FMT_lx " hash=" TARGET_FMT_plx "\n", env->htab_base, env->htab_mask, vsid, ptem, ~hash); - pte_offset = ppc_hash64_pteg_search(cpu, ~hash, slb->sps->page_shift, 1, - ptem, pte); + pte_offset = ppc_hash64_pteg_search(cpu, ~hash, sps, ptem, pte, pshift); } return pte_offset; } -static unsigned hpte_page_shift(const struct ppc_one_seg_page_size *sps, - uint64_t pte0, uint64_t pte1) -{ - int i; - - if (!(pte0 & HPTE64_V_LARGE)) { - if (sps->page_shift != 12) { - /* 4kiB page in a non 4kiB segment */ - return 0; - } - /* Normal 4kiB page */ - return 12; - } - - for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) { - const struct ppc_one_page_size *ps = &sps->enc[i]; - uint64_t mask; - - if (!ps->page_shift) { - break; - } - - if (ps->page_shift == 12) { - /* L bit is set so this can't be a 4kiB page */ - continue; - } - - mask = ((1ULL << ps->page_shift) - 1) & HPTE64_R_RPN; - - if ((pte1 & mask) == (ps->pte_enc << HPTE64_R_RPN_SHIFT)) { - return ps->page_shift; - } - } - - return 0; /* Bad page size encoding */ -} - unsigned ppc_hash64_hpte_page_shift_noslb(PowerPCCPU *cpu, - uint64_t pte0, uint64_t pte1, - unsigned *seg_page_shift) + uint64_t pte0, uint64_t pte1) { CPUPPCState *env = &cpu->env; int i; if (!(pte0 & HPTE64_V_LARGE)) { - *seg_page_shift = 12; return 12; } @@ -635,12 +626,10 @@ unsigned ppc_hash64_hpte_page_shift_noslb(PowerPCCPU *cpu, shift = hpte_page_shift(sps, pte0, pte1); if (shift) { - *seg_page_shift = sps->page_shift; return shift; } } - *seg_page_shift = 0; return 0; } @@ -701,11 +690,52 @@ int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, assert((rwx == 0) || (rwx == 1) || (rwx == 2)); + /* Note on LPCR usage: 970 uses HID4, but our special variant + * of store_spr copies relevant fields into env->spr[SPR_LPCR]. + * Similarily we filter unimplemented bits when storing into + * LPCR depending on the MMU version. This code can thus just + * use the LPCR "as-is". + */ + /* 1. Handle real mode accesses */ if (((rwx == 2) && (msr_ir == 0)) || ((rwx != 2) && (msr_dr == 0))) { - /* Translation is off */ - /* In real mode the top 4 effective address bits are ignored */ + /* Translation is supposedly "off" */ + /* In real mode the top 4 effective address bits are (mostly) ignored */ raddr = eaddr & 0x0FFFFFFFFFFFFFFFULL; + + /* In HV mode, add HRMOR if top EA bit is clear */ + if (msr_hv || !env->has_hv_mode) { + if (!(eaddr >> 63)) { + raddr |= env->spr[SPR_HRMOR]; + } + } else { + /* Otherwise, check VPM for RMA vs VRMA */ + if (env->spr[SPR_LPCR] & LPCR_VPM0) { + slb = &env->vrma_slb; + if (slb->sps) { + goto skip_slb_search; + } + /* Not much else to do here */ + cs->exception_index = POWERPC_EXCP_MCHECK; + env->error_code = 0; + return 1; + } else if (raddr < env->rmls) { + /* RMA. Check bounds in RMLS */ + raddr |= env->spr[SPR_RMOR]; + } else { + /* The access failed, generate the approriate interrupt */ + if (rwx == 2) { + ppc_hash64_set_isi(cs, env, 0x08000000); + } else { + dsisr = 0x08000000; + if (rwx == 1) { + dsisr |= 0x02000000; + } + ppc_hash64_set_dsi(cs, env, eaddr, dsisr); + } + return 1; + } + } tlb_set_page(cs, eaddr & TARGET_PAGE_MASK, raddr & TARGET_PAGE_MASK, PAGE_READ | PAGE_WRITE | PAGE_EXEC, mmu_idx, TARGET_PAGE_SIZE); @@ -714,7 +744,6 @@ int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, /* 2. Translation is on, so look up the SLB */ slb = slb_lookup(cpu, eaddr); - if (!slb) { if (rwx == 2) { cs->exception_index = POWERPC_EXCP_ISEG; @@ -727,6 +756,8 @@ int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, return 1; } +skip_slb_search: + /* 3. Check for segment level no-execute violation */ if ((rwx == 2) && (slb->vsid & SLB_VSID_N)) { ppc_hash64_set_isi(cs, env, 0x10000000); @@ -734,7 +765,7 @@ int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, } /* 4. Locate the PTE in the hash table */ - pte_offset = ppc_hash64_htab_lookup(cpu, slb, eaddr, &pte); + pte_offset = ppc_hash64_htab_lookup(cpu, slb, eaddr, &pte, &apshift); if (pte_offset == -1) { dsisr = 0x40000000; if (rwx == 2) { @@ -750,18 +781,6 @@ int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, qemu_log_mask(CPU_LOG_MMU, "found PTE at offset %08" HWADDR_PRIx "\n", pte_offset); - /* Validate page size encoding */ - apshift = hpte_page_shift(slb->sps, pte.pte0, pte.pte1); - if (!apshift) { - error_report("Bad page size encoding in HPTE 0x%"PRIx64" - 0x%"PRIx64 - " @ 0x%"HWADDR_PRIx, pte.pte0, pte.pte1, pte_offset); - /* Not entirely sure what the right action here, but machine - * check seems reasonable */ - cs->exception_index = POWERPC_EXCP_MCHECK; - env->error_code = 0; - return 1; - } - /* 5. Check access permissions */ pp_prot = ppc_hash64_pte_prot(cpu, slb, pte); @@ -821,27 +840,41 @@ hwaddr ppc_hash64_get_phys_page_debug(PowerPCCPU *cpu, target_ulong addr) { CPUPPCState *env = &cpu->env; ppc_slb_t *slb; - hwaddr pte_offset; + hwaddr pte_offset, raddr; ppc_hash_pte64_t pte; unsigned apshift; + /* Handle real mode */ if (msr_dr == 0) { /* In real mode the top 4 effective address bits are ignored */ - return addr & 0x0FFFFFFFFFFFFFFFULL; - } + raddr = addr & 0x0FFFFFFFFFFFFFFFULL; - slb = slb_lookup(cpu, addr); - if (!slb) { - return -1; - } + /* In HV mode, add HRMOR if top EA bit is clear */ + if ((msr_hv || !env->has_hv_mode) && !(addr >> 63)) { + return raddr | env->spr[SPR_HRMOR]; + } - pte_offset = ppc_hash64_htab_lookup(cpu, slb, addr, &pte); - if (pte_offset == -1) { - return -1; + /* Otherwise, check VPM for RMA vs VRMA */ + if (env->spr[SPR_LPCR] & LPCR_VPM0) { + slb = &env->vrma_slb; + if (!slb->sps) { + return -1; + } + } else if (raddr < env->rmls) { + /* RMA. Check bounds in RMLS */ + return raddr | env->spr[SPR_RMOR]; + } else { + return -1; + } + } else { + slb = slb_lookup(cpu, addr); + if (!slb) { + return -1; + } } - apshift = hpte_page_shift(slb->sps, pte.pte0, pte.pte1); - if (!apshift) { + pte_offset = ppc_hash64_htab_lookup(cpu, slb, addr, &pte, &apshift); + if (pte_offset == -1) { return -1; } @@ -883,6 +916,90 @@ void ppc_hash64_tlb_flush_hpte(PowerPCCPU *cpu, tlb_flush(CPU(cpu), 1); } +void ppc_hash64_update_rmls(CPUPPCState *env) +{ + uint64_t lpcr = env->spr[SPR_LPCR]; + + /* + * This is the full 4 bits encoding of POWER8. Previous + * CPUs only support a subset of these but the filtering + * is done when writing LPCR + */ + switch ((lpcr & LPCR_RMLS) >> LPCR_RMLS_SHIFT) { + case 0x8: /* 32MB */ + env->rmls = 0x2000000ull; + break; + case 0x3: /* 64MB */ + env->rmls = 0x4000000ull; + break; + case 0x7: /* 128MB */ + env->rmls = 0x8000000ull; + break; + case 0x4: /* 256MB */ + env->rmls = 0x10000000ull; + break; + case 0x2: /* 1GB */ + env->rmls = 0x40000000ull; + break; + case 0x1: /* 16GB */ + env->rmls = 0x400000000ull; + break; + default: + /* What to do here ??? */ + env->rmls = 0; + } +} + +void ppc_hash64_update_vrma(CPUPPCState *env) +{ + const struct ppc_one_seg_page_size *sps = NULL; + target_ulong esid, vsid, lpcr; + ppc_slb_t *slb = &env->vrma_slb; + uint32_t vrmasd; + int i; + + /* First clear it */ + slb->esid = slb->vsid = 0; + slb->sps = NULL; + + /* Is VRMA enabled ? */ + lpcr = env->spr[SPR_LPCR]; + if (!(lpcr & LPCR_VPM0)) { + return; + } + + /* Make one up. Mostly ignore the ESID which will not be + * needed for translation + */ + vsid = SLB_VSID_VRMA; + vrmasd = (lpcr & LPCR_VRMASD) >> LPCR_VRMASD_SHIFT; + vsid |= (vrmasd << 4) & (SLB_VSID_L | SLB_VSID_LP); + esid = SLB_ESID_V; + + for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) { + const struct ppc_one_seg_page_size *sps1 = &env->sps.sps[i]; + + if (!sps1->page_shift) { + break; + } + + if ((vsid & SLB_VSID_LLP_MASK) == sps1->slb_enc) { + sps = sps1; + break; + } + } + + if (!sps) { + error_report("Bad page size encoding esid 0x"TARGET_FMT_lx + " vsid 0x"TARGET_FMT_lx, esid, vsid); + return; + } + + slb->vsid = vsid; + slb->esid = esid; + slb->sps = sps; +} + void helper_store_lpcr(CPUPPCState *env, target_ulong val) { uint64_t lpcr = 0; @@ -938,4 +1055,6 @@ void helper_store_lpcr(CPUPPCState *env, target_ulong val) ; } env->spr[SPR_LPCR] = lpcr; + ppc_hash64_update_rmls(env); + ppc_hash64_update_vrma(env); } diff --git a/target-ppc/mmu-hash64.h b/target-ppc/mmu-hash64.h index 6423b9f791..3a7476b30a 100644 --- a/target-ppc/mmu-hash64.h +++ b/target-ppc/mmu-hash64.h @@ -17,8 +17,9 @@ void ppc_hash64_tlb_flush_hpte(PowerPCCPU *cpu, target_ulong pte_index, target_ulong pte0, target_ulong pte1); unsigned ppc_hash64_hpte_page_shift_noslb(PowerPCCPU *cpu, - uint64_t pte0, uint64_t pte1, - unsigned *seg_page_shift); + uint64_t pte0, uint64_t pte1); +void ppc_hash64_update_vrma(CPUPPCState *env); +void ppc_hash64_update_rmls(CPUPPCState *env); #endif /* @@ -37,6 +38,7 @@ unsigned ppc_hash64_hpte_page_shift_noslb(PowerPCCPU *cpu, #define SLB_VSID_B_256M 0x0000000000000000ULL #define SLB_VSID_B_1T 0x4000000000000000ULL #define SLB_VSID_VSID 0x3FFFFFFFFFFFF000ULL +#define SLB_VSID_VRMA (0x0001FFFFFF000000ULL | SLB_VSID_B_1T) #define SLB_VSID_PTEM (SLB_VSID_B | SLB_VSID_VSID) #define SLB_VSID_KS 0x0000000000000800ULL #define SLB_VSID_KP 0x0000000000000400ULL @@ -63,7 +65,7 @@ unsigned ppc_hash64_hpte_page_shift_noslb(PowerPCCPU *cpu, #define HPTE64_V_AVPN_SHIFT 7 #define HPTE64_V_AVPN 0x3fffffffffffff80ULL #define HPTE64_V_AVPN_VAL(x) (((x) & HPTE64_V_AVPN) >> HPTE64_V_AVPN_SHIFT) -#define HPTE64_V_COMPARE(x, y) (!(((x) ^ (y)) & 0xffffffffffffff80ULL)) +#define HPTE64_V_COMPARE(x, y) (!(((x) ^ (y)) & 0xffffffffffffff83ULL)) #define HPTE64_V_LARGE 0x0000000000000004ULL #define HPTE64_V_SECONDARY 0x0000000000000002ULL #define HPTE64_V_VALID 0x0000000000000001ULL diff --git a/target-ppc/translate_init.c b/target-ppc/translate_init.c index 843f19b748..8f257fb74a 100644 --- a/target-ppc/translate_init.c +++ b/target-ppc/translate_init.c @@ -8791,11 +8791,19 @@ void cpu_ppc_set_papr(PowerPCCPU *cpu) /* Set emulated LPCR to not send interrupts to hypervisor. Note that * under KVM, the actual HW LPCR will be set differently by KVM itself, * the settings below ensure proper operations with TCG in absence of - * a real hypervisor + * a real hypervisor. + * + * Clearing VPM0 will also cause us to use RMOR in mmu-hash64.c for + * real mode accesses, which thankfully defaults to 0 and isn't + * accessible in guest mode. */ lpcr->default_value &= ~(LPCR_VPM0 | LPCR_VPM1 | LPCR_ISL | LPCR_KBV); lpcr->default_value |= LPCR_LPES0 | LPCR_LPES1; + /* Set RMLS to the max (ie, 16G) */ + lpcr->default_value &= ~LPCR_RMLS; + lpcr->default_value |= 1ull << LPCR_RMLS_SHIFT; + /* P7 and P8 has slightly different PECE bits, mostly because P8 adds * bit 47 and 48 which are reserved on P7. Here we set them all, which * will work as expected for both implementations @@ -8811,6 +8819,10 @@ void cpu_ppc_set_papr(PowerPCCPU *cpu) /* Set a full AMOR so guest can use the AMR as it sees fit */ env->spr[SPR_AMOR] = amor->default_value = 0xffffffffffffffffull; + /* Update some env bits based on new LPCR value */ + ppc_hash64_update_rmls(env); + ppc_hash64_update_vrma(env); + /* Tell KVM that we're in PAPR mode */ if (kvm_enabled()) { kvmppc_set_papr(cpu); @@ -9516,7 +9528,7 @@ static void ppc_cpu_realizefn(DeviceState *dev, Error **errp) PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu); Error *local_err = NULL; #if !defined(CONFIG_USER_ONLY) - int max_smt = kvm_enabled() ? kvmppc_smt_threads() : 1; + int max_smt = kvmppc_smt_threads(); #endif #if !defined(CONFIG_USER_ONLY) |