aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--hw/ppc/Makefile.objs2
-rw-r--r--hw/ppc/spapr.c48
-rw-r--r--hw/ppc/spapr_pci.c19
-rw-r--r--hw/ppc/spapr_pci_nvlink2.c450
-rw-r--r--hw/vfio/pci-quirks.c131
-rw-r--r--hw/vfio/pci.c14
-rw-r--r--hw/vfio/pci.h2
-rw-r--r--hw/vfio/trace-events4
-rw-r--r--include/hw/pci-host/spapr.h45
-rw-r--r--include/hw/ppc/spapr.h5
10 files changed, 711 insertions, 9 deletions
diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
index 1111b218a0..636e717f20 100644
--- a/hw/ppc/Makefile.objs
+++ b/hw/ppc/Makefile.objs
@@ -9,7 +9,7 @@ obj-$(CONFIG_SPAPR_RNG) += spapr_rng.o
# IBM PowerNV
obj-$(CONFIG_POWERNV) += pnv.o pnv_xscom.o pnv_core.o pnv_lpc.o pnv_psi.o pnv_occ.o pnv_bmc.o
ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)
-obj-y += spapr_pci_vfio.o
+obj-y += spapr_pci_vfio.o spapr_pci_nvlink2.o
endif
obj-$(CONFIG_PSERIES) += spapr_rtas_ddw.o
# PowerPC 4xx boards
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index b52b82d298..b81e237635 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1034,12 +1034,13 @@ static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt)
0, cpu_to_be32(SPAPR_MEMORY_BLOCK_SIZE),
cpu_to_be32(max_cpus / smp_threads),
};
+ uint32_t maxdomain = cpu_to_be32(spapr->gpu_numa_id > 1 ? 1 : 0);
uint32_t maxdomains[] = {
cpu_to_be32(4),
- cpu_to_be32(0),
- cpu_to_be32(0),
- cpu_to_be32(0),
- cpu_to_be32(nb_numa_nodes ? nb_numa_nodes : 1),
+ maxdomain,
+ maxdomain,
+ maxdomain,
+ cpu_to_be32(spapr->gpu_numa_id),
};
_FDT(rtas = fdt_add_subnode(fdt, 0, "rtas"));
@@ -1698,6 +1699,16 @@ static void spapr_machine_reset(void)
spapr_irq_msi_reset(spapr);
}
+ /*
+ * NVLink2-connected GPU RAM needs to be placed on a separate NUMA node.
+ * We assign a new numa ID per GPU in spapr_pci_collect_nvgpu() which is
+ * called from vPHB reset handler so we initialize the counter here.
+ * If no NUMA is configured from the QEMU side, we start from 1 as GPU RAM
+ * must be equally distant from any other node.
+ * The final value of spapr->gpu_numa_id is going to be written to
+ * max-associativity-domains in spapr_build_fdt().
+ */
+ spapr->gpu_numa_id = MAX(1, nb_numa_nodes);
qemu_devices_reset();
/*
@@ -3907,7 +3918,9 @@ static void spapr_phb_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
smc->phb_placement(spapr, sphb->index,
&sphb->buid, &sphb->io_win_addr,
&sphb->mem_win_addr, &sphb->mem64_win_addr,
- windows_supported, sphb->dma_liobn, errp);
+ windows_supported, sphb->dma_liobn,
+ &sphb->nv2_gpa_win_addr, &sphb->nv2_atsd_win_addr,
+ errp);
}
static void spapr_phb_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
@@ -4108,7 +4121,8 @@ static const CPUArchIdList *spapr_possible_cpu_arch_ids(MachineState *machine)
static void spapr_phb_placement(SpaprMachineState *spapr, uint32_t index,
uint64_t *buid, hwaddr *pio,
hwaddr *mmio32, hwaddr *mmio64,
- unsigned n_dma, uint32_t *liobns, Error **errp)
+ unsigned n_dma, uint32_t *liobns,
+ hwaddr *nv2gpa, hwaddr *nv2atsd, Error **errp)
{
/*
* New-style PHB window placement.
@@ -4153,6 +4167,9 @@ static void spapr_phb_placement(SpaprMachineState *spapr, uint32_t index,
*pio = SPAPR_PCI_BASE + index * SPAPR_PCI_IO_WIN_SIZE;
*mmio32 = SPAPR_PCI_BASE + (index + 1) * SPAPR_PCI_MEM32_WIN_SIZE;
*mmio64 = SPAPR_PCI_BASE + (index + 1) * SPAPR_PCI_MEM64_WIN_SIZE;
+
+ *nv2gpa = SPAPR_PCI_NV2RAM64_WIN_BASE + index * SPAPR_PCI_NV2RAM64_WIN_SIZE;
+ *nv2atsd = SPAPR_PCI_NV2ATSD_WIN_BASE + index * SPAPR_PCI_NV2ATSD_WIN_SIZE;
}
static ICSState *spapr_ics_get(XICSFabric *dev, int irq)
@@ -4357,6 +4374,18 @@ DEFINE_SPAPR_MACHINE(4_0, "4.0", true);
/*
* pseries-3.1
*/
+/*
+ * PHB window placement for pseries-3.1: delegates to spapr_phb_placement()
+ * and then reports both NVLink2 windows (GPU RAM and ATSD) as 0.
+ */
+static void phb_placement_3_1(SpaprMachineState *spapr, uint32_t index,
+                              uint64_t *buid, hwaddr *pio,
+                              hwaddr *mmio32, hwaddr *mmio64,
+                              unsigned n_dma, uint32_t *liobns,
+                              hwaddr *nv2gpa, hwaddr *nv2atsd, Error **errp)
+{
+    spapr_phb_placement(spapr, index, buid, pio, mmio32, mmio64, n_dma, liobns,
+                        nv2gpa, nv2atsd, errp);
+    /* Overwrite whatever spapr_phb_placement() stored for NVLink2 */
+    *nv2gpa = 0;
+    *nv2atsd = 0;
+}
+
static void spapr_machine_3_1_class_options(MachineClass *mc)
{
SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
@@ -4372,6 +4401,7 @@ static void spapr_machine_3_1_class_options(MachineClass *mc)
smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_BROKEN;
smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_BROKEN;
smc->default_caps.caps[SPAPR_CAP_LARGE_DECREMENTER] = SPAPR_CAP_OFF;
+ smc->phb_placement = phb_placement_3_1;
}
DEFINE_SPAPR_MACHINE(3_1, "3.1", false);
@@ -4503,7 +4533,8 @@ DEFINE_SPAPR_MACHINE(2_8, "2.8", false);
static void phb_placement_2_7(SpaprMachineState *spapr, uint32_t index,
uint64_t *buid, hwaddr *pio,
hwaddr *mmio32, hwaddr *mmio64,
- unsigned n_dma, uint32_t *liobns, Error **errp)
+ unsigned n_dma, uint32_t *liobns,
+ hwaddr *nv2gpa, hwaddr *nv2atsd, Error **errp)
{
/* Legacy PHB placement for pseries-2.7 and earlier machine types */
const uint64_t base_buid = 0x800000020000000ULL;
@@ -4547,6 +4578,9 @@ static void phb_placement_2_7(SpaprMachineState *spapr, uint32_t index,
* fallback behaviour of automatically splitting a large "32-bit"
* window into contiguous 32-bit and 64-bit windows
*/
+
+ *nv2gpa = 0;
+ *nv2atsd = 0;
}
static void spapr_machine_2_7_class_options(MachineClass *mc)
diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index f62e6833b8..d6d0a7115c 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -1355,6 +1355,8 @@ static void spapr_populate_pci_child_dt(PCIDevice *dev, void *fdt, int offset,
if (sphb->pcie_ecs && pci_is_express(dev)) {
_FDT(fdt_setprop_cell(fdt, offset, "ibm,pci-config-space-type", 0x1));
}
+
+ spapr_phb_nvgpu_populate_pcidev_dt(dev, fdt, offset, sphb);
}
/* create OF node for pci device and required OF DT properties */
@@ -1587,6 +1589,8 @@ static void spapr_phb_unrealize(DeviceState *dev, Error **errp)
int i;
const unsigned windows_supported = spapr_phb_windows_supported(sphb);
+ spapr_phb_nvgpu_free(sphb);
+
if (sphb->msi) {
g_hash_table_unref(sphb->msi);
sphb->msi = NULL;
@@ -1898,8 +1902,14 @@ void spapr_phb_dma_reset(SpaprPhbState *sphb)
static void spapr_phb_reset(DeviceState *qdev)
{
SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(qdev);
+ Error *errp = NULL;
spapr_phb_dma_reset(sphb);
+ spapr_phb_nvgpu_free(sphb);
+ spapr_phb_nvgpu_setup(sphb, &errp);
+ if (errp) {
+ error_report_err(errp);
+ }
/* Reset the IOMMU state */
object_child_foreach(OBJECT(qdev), spapr_phb_children_reset, NULL);
@@ -1932,6 +1942,8 @@ static Property spapr_phb_properties[] = {
pre_2_8_migration, false),
DEFINE_PROP_BOOL("pcie-extended-configuration-space", SpaprPhbState,
pcie_ecs, true),
+ DEFINE_PROP_UINT64("gpa", SpaprPhbState, nv2_gpa_win_addr, 0),
+ DEFINE_PROP_UINT64("atsd", SpaprPhbState, nv2_atsd_win_addr, 0),
DEFINE_PROP_END_OF_LIST(),
};
@@ -2212,6 +2224,7 @@ int spapr_populate_pci_dt(SpaprPhbState *phb, uint32_t intc_phandle, void *fdt,
PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus;
SpaprFdt s_fdt;
SpaprDrc *drc;
+ Error *errp = NULL;
/* Start populating the FDT */
nodename = g_strdup_printf("pci@%" PRIx64, phb->buid);
@@ -2304,6 +2317,12 @@ int spapr_populate_pci_dt(SpaprPhbState *phb, uint32_t intc_phandle, void *fdt,
return ret;
}
+ spapr_phb_nvgpu_populate_dt(phb, fdt, bus_off, &errp);
+ if (errp) {
+ error_report_err(errp);
+ }
+ spapr_phb_nvgpu_ram_populate_dt(phb, fdt);
+
return 0;
}
diff --git a/hw/ppc/spapr_pci_nvlink2.c b/hw/ppc/spapr_pci_nvlink2.c
new file mode 100644
index 0000000000..eda8c752aa
--- /dev/null
+++ b/hw/ppc/spapr_pci_nvlink2.c
@@ -0,0 +1,450 @@
+/*
+ * QEMU sPAPR PCI for NVLink2 pass through
+ *
+ * Copyright (c) 2019 Alexey Kardashevskiy, IBM Corporation.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu-common.h"
+#include "hw/pci/pci.h"
+#include "hw/pci-host/spapr.h"
+#include "qemu/error-report.h"
+#include "hw/ppc/fdt.h"
+#include "hw/pci/pci_bridge.h"
+
+/*
+ * Device tree phandle encodings. Each macro ORs a fixed tag with the PHB
+ * index and a per-object number so the resulting values differ per object.
+ */
+/* phandle of a PCI device node: PHB index in bits 16.., devfn in the low bits */
+#define PHANDLE_PCIDEV(phb, pdev)    (0x12000000 | \
+                                     (((phb)->index) << 16) | ((pdev)->devfn))
+/* phandle of the memory node of GPU number @n (slot index) on @phb */
+#define PHANDLE_GPURAM(phb, n)       (0x110000FF | ((n) << 8) | \
+                                     (((phb)->index) << 16))
+/* phandle of link @nn of GPU @gn; @gn and @nn get 4 bits each */
+#define PHANDLE_NVLINK(phb, gn, nn)  (0x00130000 | (((phb)->index) << 8) | \
+                                     ((gn) << 4) | (nn))
+
+/* Big-endian 1, stored in the upper "ibm,associativity" cells of GPU RAM */
+#define SPAPR_GPU_NUMA_ID           (cpu_to_be32(1))
+
+/*
+ * Per-vPHB NVLink2 state: the GPUs and NVLink bridges (NPUs) found on the
+ * PHB's buses, grouped into "slots" by their "nvlink2-tgt" property value.
+ */
+struct spapr_phb_pci_nvgpu_config {
+    uint64_t nv2_ram_current;  /* next unassigned address in GPU RAM window */
+    uint64_t nv2_atsd_current; /* next unassigned address in ATSD window */
+    int num; /* number of non empty (i.e. tgt!=0) entries in slots[] */
+    struct spapr_phb_pci_nvgpu_slot {
+        uint64_t tgt;          /* "nvlink2-tgt" value shared by GPU and NPUs */
+        uint64_t gpa;          /* guest physical address given to GPU RAM */
+        unsigned numa_id;      /* NUMA node id assigned to this GPU's RAM */
+        PCIDevice *gpdev;      /* the GPU; NULL if only NPUs were seen */
+        int linknum;           /* number of used entries in links[] */
+        struct {
+            uint64_t atsd_gpa;     /* guest physical address of ATSD region */
+            PCIDevice *npdev;      /* the NVLink bridge device */
+            uint32_t link_speed;   /* "nvlink2-link-speed" property value */
+        } links[NVGPU_MAX_LINKS];
+    } slots[NVGPU_MAX_NUM];
+    Error *errp;               /* error collected while walking the buses */
+};
+
+/*
+ * Return the slot whose target address equals @tgt, claiming the next unused
+ * entry of nvgpus->slots[] when no slot matches yet.
+ *
+ * Returns NULL when there is no match and the table is already full.
+ */
+static struct spapr_phb_pci_nvgpu_slot *
+spapr_nvgpu_get_slot(struct spapr_phb_pci_nvgpu_config *nvgpus, uint64_t tgt)
+{
+    struct spapr_phb_pci_nvgpu_slot *cur = nvgpus->slots;
+    struct spapr_phb_pci_nvgpu_slot *first_unused = cur + nvgpus->num;
+
+    /* A GPU and its NPUs share one tgt, so the slot may exist already */
+    while (cur != first_unused) {
+        if (cur->tgt == tgt) {
+            return cur;
+        }
+        ++cur;
+    }
+
+    if (nvgpus->num == ARRAY_SIZE(nvgpus->slots)) {
+        return NULL;
+    }
+
+    /* Claim the first unused entry for this tgt */
+    first_unused->tgt = tgt;
+    ++nvgpus->num;
+
+    return first_unused;
+}
+
+/*
+ * Record GPU @pdev in the slot for @tgt: reserve a chunk of the GPU RAM
+ * window sized after @mr and hand out the next machine-wide GPU NUMA id.
+ *
+ * Sets @errp when every slot of the vPHB is already taken.
+ */
+static void spapr_pci_collect_nvgpu(struct spapr_phb_pci_nvgpu_config *nvgpus,
+                                    PCIDevice *pdev, uint64_t tgt,
+                                    MemoryRegion *mr, Error **errp)
+{
+    SpaprMachineState *spapr = SPAPR_MACHINE(MACHINE(qdev_get_machine()));
+    struct spapr_phb_pci_nvgpu_slot *nvslot;
+
+    nvslot = spapr_nvgpu_get_slot(nvgpus, tgt);
+    if (nvslot == NULL) {
+        error_setg(errp, "Found too many GPUs per vPHB");
+        return;
+    }
+
+    /* Only one GPU may own a given target address */
+    g_assert(!nvslot->gpdev);
+    nvslot->gpdev = pdev;
+
+    /* Carve the GPU RAM out of the window, right after the previous GPU */
+    nvslot->gpa = nvgpus->nv2_ram_current;
+    nvgpus->nv2_ram_current = nvslot->gpa + memory_region_size(mr);
+
+    /* Take the current NUMA id and advance the machine-wide counter */
+    nvslot->numa_id = spapr->gpu_numa_id++;
+}
+
+/*
+ * Record NVLink bridge @pdev as the next link of the slot for @tgt: reserve
+ * a chunk of the ATSD window sized after @mr and cache the link speed.
+ *
+ * Sets @errp when the vPHB has no free slot or the slot has no free link.
+ */
+static void spapr_pci_collect_nvnpu(struct spapr_phb_pci_nvgpu_config *nvgpus,
+                                    PCIDevice *pdev, uint64_t tgt,
+                                    MemoryRegion *mr, Error **errp)
+{
+    struct spapr_phb_pci_nvgpu_slot *nvslot;
+    int j;
+
+    nvslot = spapr_nvgpu_get_slot(nvgpus, tgt);
+    if (nvslot == NULL) {
+        error_setg(errp, "Found too many NVLink bridges per vPHB");
+        return;
+    }
+
+    if (nvslot->linknum == ARRAY_SIZE(nvslot->links)) {
+        error_setg(errp, "Found too many NVLink bridges per GPU");
+        return;
+    }
+    /* Claim the next free link entry */
+    j = nvslot->linknum++;
+
+    g_assert(!nvslot->links[j].npdev);
+    nvslot->links[j].npdev = pdev;
+
+    /* ATSD regions are packed back to back in the ATSD window */
+    nvslot->links[j].atsd_gpa = nvgpus->nv2_atsd_current;
+    nvgpus->nv2_atsd_current += memory_region_size(mr);
+
+    nvslot->links[j].link_speed = object_property_get_uint(OBJECT(pdev),
+                                                           "nvlink2-link-speed",
+                                                           NULL);
+}
+
+/*
+ * pci_for_each_device() callback: classify @pdev as an NVLink2 GPU or NPU
+ * and record it in the config passed via @opaque, then descend into the
+ * secondary bus if @pdev is a PCI bridge.
+ *
+ * Errors are accumulated in the config's errp field rather than returned.
+ */
+static void spapr_phb_pci_collect_nvgpu(PCIBus *bus, PCIDevice *pdev,
+                                        void *opaque)
+{
+    PCIBus *sec_bus;
+    Object *po = OBJECT(pdev);
+    /* Errors are ignored here, so a missing property reads as tgt == 0 */
+    uint64_t tgt = object_property_get_uint(po, "nvlink2-tgt", NULL);
+
+    if (tgt) {
+        Error *local_err = NULL;
+        struct spapr_phb_pci_nvgpu_config *nvgpus = opaque;
+        Object *mr_gpu = object_property_get_link(po, "nvlink2-mr[0]", NULL);
+        Object *mr_npu = object_property_get_link(po, "nvlink2-atsd-mr[0]",
+                                                  NULL);
+
+        /*
+         * NOTE(review): this aborts for a device which has "nvlink2-tgt"
+         * but neither memory region - confirm that a bridge without an
+         * ATSD region can never reach this point.
+         */
+        g_assert(mr_gpu || mr_npu);
+        /* A GPU RAM region marks the device as a GPU, otherwise it is an NPU */
+        if (mr_gpu) {
+            spapr_pci_collect_nvgpu(nvgpus, pdev, tgt, MEMORY_REGION(mr_gpu),
+                                    &local_err);
+        } else {
+            spapr_pci_collect_nvnpu(nvgpus, pdev, tgt, MEMORY_REGION(mr_npu),
+                                    &local_err);
+        }
+        error_propagate(&nvgpus->errp, local_err);
+    }
+    /* Only bridges have a secondary bus to walk */
+    if ((pci_default_read_config(pdev, PCI_HEADER_TYPE, 1) !=
+         PCI_HEADER_TYPE_BRIDGE)) {
+        return;
+    }
+
+    sec_bus = pci_bridge_get_sec_bus(PCI_BRIDGE(pdev));
+    if (!sec_bus) {
+        return;
+    }
+
+    /* Recurse so devices behind bridges are collected too */
+    pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
+                        spapr_phb_pci_collect_nvgpu, opaque);
+}
+
+/*
+ * Discover the NVLink2 GPUs and bridges behind @sphb and map their GPU RAM
+ * and ATSD memory regions into system memory.
+ *
+ * On return sphb->nvgpus is non-NULL only if at least one GPU with a RAM
+ * region was mapped; a collection failure is reported through @errp.
+ */
+void spapr_phb_nvgpu_setup(SpaprPhbState *sphb, Error **errp)
+{
+    int i, j, valid_gpu_num;
+    PCIBus *bus;
+
+    /* Search for GPUs and NPUs */
+    /* Nothing to do unless both windows have a non-zero address */
+    if (!sphb->nv2_gpa_win_addr || !sphb->nv2_atsd_win_addr) {
+        return;
+    }
+
+    sphb->nvgpus = g_new0(struct spapr_phb_pci_nvgpu_config, 1);
+    /* Allocation within each window starts at the window base */
+    sphb->nvgpus->nv2_ram_current = sphb->nv2_gpa_win_addr;
+    sphb->nvgpus->nv2_atsd_current = sphb->nv2_atsd_win_addr;
+
+    bus = PCI_HOST_BRIDGE(sphb)->bus;
+    pci_for_each_device(bus, pci_bus_num(bus),
+                        spapr_phb_pci_collect_nvgpu, sphb->nvgpus);
+
+    if (sphb->nvgpus->errp) {
+        /* Hand the collected error to the caller and drop our reference */
+        error_propagate(errp, sphb->nvgpus->errp);
+        sphb->nvgpus->errp = NULL;
+        goto cleanup_exit;
+    }
+
+    /* Add found GPU RAM and ATSD MRs if found */
+    for (i = 0, valid_gpu_num = 0; i < sphb->nvgpus->num; ++i) {
+        Object *nvmrobj;
+        struct spapr_phb_pci_nvgpu_slot *nvslot = &sphb->nvgpus->slots[i];
+
+        /* Slot holds only NPUs: nothing is mapped for it */
+        if (!nvslot->gpdev) {
+            continue;
+        }
+        nvmrobj = object_property_get_link(OBJECT(nvslot->gpdev),
+                                           "nvlink2-mr[0]", NULL);
+        /* ATSD is pointless without GPU RAM MR so skip those */
+        if (!nvmrobj) {
+            continue;
+        }
+
+        ++valid_gpu_num;
+        memory_region_add_subregion(get_system_memory(), nvslot->gpa,
+                                    MEMORY_REGION(nvmrobj));
+
+        for (j = 0; j < nvslot->linknum; ++j) {
+            Object *atsdmrobj;
+
+            atsdmrobj = object_property_get_link(OBJECT(nvslot->links[j].npdev),
+                                                 "nvlink2-atsd-mr[0]", NULL);
+            if (!atsdmrobj) {
+                continue;
+            }
+            memory_region_add_subregion(get_system_memory(),
+                                        nvslot->links[j].atsd_gpa,
+                                        MEMORY_REGION(atsdmrobj));
+        }
+    }
+
+    if (valid_gpu_num) {
+        return;
+    }
+    /* We did not find any interesting GPU */
+cleanup_exit:
+    /* Nothing was mapped on either path to here, so freeing is enough */
+    g_free(sphb->nvgpus);
+    sphb->nvgpus = NULL;
+}
+
+/*
+ * Undo spapr_phb_nvgpu_setup(): unmap the GPU RAM and ATSD memory regions
+ * from system memory and release the per-PHB NVLink2 state.
+ *
+ * Does nothing when no NVLink2 state exists for @sphb.
+ */
+void spapr_phb_nvgpu_free(SpaprPhbState *sphb)
+{
+    int i, j;
+
+    if (!sphb->nvgpus) {
+        return;
+    }
+
+    for (i = 0; i < sphb->nvgpus->num; ++i) {
+        struct spapr_phb_pci_nvgpu_slot *nvslot = &sphb->nvgpus->slots[i];
+        Object *nv_mrobj;
+
+        /*
+         * Skip exactly the slots spapr_phb_nvgpu_setup() skipped: a slot
+         * holding only NPUs has no GPU object to query (gpdev is NULL), and
+         * without a GPU RAM MR neither the RAM nor the ATSD regions of the
+         * slot were ever mapped, so there is nothing to unmap.
+         */
+        if (!nvslot->gpdev) {
+            continue;
+        }
+        nv_mrobj = object_property_get_link(OBJECT(nvslot->gpdev),
+                                            "nvlink2-mr[0]", NULL);
+        if (!nv_mrobj) {
+            continue;
+        }
+        memory_region_del_subregion(get_system_memory(),
+                                    MEMORY_REGION(nv_mrobj));
+
+        for (j = 0; j < nvslot->linknum; ++j) {
+            PCIDevice *npdev = nvslot->links[j].npdev;
+            Object *atsd_mrobj;
+
+            atsd_mrobj = object_property_get_link(OBJECT(npdev),
+                                                  "nvlink2-atsd-mr[0]", NULL);
+            if (atsd_mrobj) {
+                memory_region_del_subregion(get_system_memory(),
+                                            MEMORY_REGION(atsd_mrobj));
+            }
+        }
+    }
+    g_free(sphb->nvgpus);
+    sphb->nvgpus = NULL;
+}
+
+/*
+ * Add the "ibm,mmio-atsd" property, listing the guest addresses of the ATSD
+ * regions collected for @sphb, to the PHB node at @bus_off in @fdt.
+ *
+ * Sets @errp and adds nothing when no ATSD address was collected or when
+ * spapr_phb_eeh_available() returns false for @sphb.
+ */
+void spapr_phb_nvgpu_populate_dt(SpaprPhbState *sphb, void *fdt, int bus_off,
+                                 Error **errp)
+{
+    int i, j, atsdnum = 0;
+    uint64_t atsd[8]; /* The existing limitation of known guests */
+
+    if (!sphb->nvgpus) {
+        return;
+    }
+
+    /* The outer loop also stops, silently, once atsd[] is full */
+    for (i = 0; (i < sphb->nvgpus->num) && (atsdnum < ARRAY_SIZE(atsd)); ++i) {
+        struct spapr_phb_pci_nvgpu_slot *nvslot = &sphb->nvgpus->slots[i];
+
+        if (!nvslot->gpdev) {
+            continue;
+        }
+        for (j = 0; j < nvslot->linknum; ++j) {
+            /* Links without an ATSD address are not advertised */
+            if (!nvslot->links[j].atsd_gpa) {
+                continue;
+            }
+
+            /* Reached only if this slot has links left after atsd[] filled */
+            if (atsdnum == ARRAY_SIZE(atsd)) {
+                error_report("Only %"PRIuPTR" ATSD registers supported",
+                             ARRAY_SIZE(atsd));
+                break;
+            }
+            /* Device tree cells are big-endian */
+            atsd[atsdnum] = cpu_to_be64(nvslot->links[j].atsd_gpa);
+            ++atsdnum;
+        }
+    }
+
+    if (!atsdnum) {
+        error_setg(errp, "No ATSD registers found");
+        return;
+    }
+
+    if (!spapr_phb_eeh_available(sphb)) {
+        /*
+         * ibm,mmio-atsd contains ATSD registers; these belong to an NPU PHB
+         * which we do not emulate as a separate device. Instead we put
+         * ibm,mmio-atsd to the vPHB with GPU and make sure that we do not
+         * put GPUs from different IOMMU groups to the same vPHB to ensure
+         * that the guest will use ATSDs from the corresponding NPU.
+         */
+        error_setg(errp, "ATSD requires separate vPHB per GPU IOMMU group");
+        return;
+    }
+
+    _FDT((fdt_setprop(fdt, bus_off, "ibm,mmio-atsd", atsd,
+                      atsdnum * sizeof(atsd[0]))));
+}
+
+/*
+ * Add the NVLink2 nodes that live under the root of @fdt for @sphb:
+ *  - one "npuphb<index>" node with a "link@N" child per collected link;
+ *  - one "memory@<gpa>" node per GPU describing its RAM, flagged with a
+ *    zero-sized "linux,usable-memory" property.
+ *
+ * Does nothing when no NVLink2 state exists for @sphb.
+ */
+void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb, void *fdt)
+{
+    int i, j, linkidx, npuoff;
+    char *npuname;
+
+    if (!sphb->nvgpus) {
+        return;
+    }
+
+    npuname = g_strdup_printf("npuphb%d", sphb->index);
+    npuoff = fdt_add_subnode(fdt, 0, npuname);
+    _FDT(npuoff);
+    _FDT(fdt_setprop_cell(fdt, npuoff, "#address-cells", 1));
+    _FDT(fdt_setprop_cell(fdt, npuoff, "#size-cells", 0));
+    /* Advertise NPU as POWER9 so the guest can enable NPU2 contexts */
+    _FDT((fdt_setprop_string(fdt, npuoff, "compatible", "ibm,power9-npu")));
+    g_free(npuname);
+
+    /* linkidx numbers the links consecutively across all slots */
+    for (i = 0, linkidx = 0; i < sphb->nvgpus->num; ++i) {
+        for (j = 0; j < sphb->nvgpus->slots[i].linknum; ++j) {
+            char *linkname = g_strdup_printf("link@%d", linkidx);
+            int off = fdt_add_subnode(fdt, npuoff, linkname);
+
+            _FDT(off);
+            /* _FDT((fdt_setprop_cell(fdt, off, "reg", linkidx))); */
+            _FDT((fdt_setprop_string(fdt, off, "compatible",
+                                     "ibm,npu-link")));
+            _FDT((fdt_setprop_cell(fdt, off, "phandle",
+                                   PHANDLE_NVLINK(sphb, i, j))));
+            _FDT((fdt_setprop_cell(fdt, off, "ibm,npu-link-index", linkidx)));
+            g_free(linkname);
+            ++linkidx;
+        }
+    }
+
+    /* Add memory nodes for GPU RAM and mark them unusable */
+    for (i = 0; i < sphb->nvgpus->num; ++i) {
+        struct spapr_phb_pci_nvgpu_slot *nvslot = &sphb->nvgpus->slots[i];
+        uint32_t associativity[] = {
+            cpu_to_be32(0x4),
+            SPAPR_GPU_NUMA_ID,
+            SPAPR_GPU_NUMA_ID,
+            SPAPR_GPU_NUMA_ID,
+            cpu_to_be32(nvslot->numa_id)
+        };
+        Object *nv_mrobj;
+        uint64_t size;
+        uint64_t mem_reg[2];
+        char *mem_name;
+        int off;
+
+        /*
+         * A slot which only collected NPUs has no GPU (gpdev is NULL) and
+         * hence no GPU RAM to describe; querying a NULL object would crash.
+         */
+        if (!nvslot->gpdev) {
+            continue;
+        }
+        nv_mrobj = object_property_get_link(OBJECT(nvslot->gpdev),
+                                            "nvlink2-mr[0]", NULL);
+        if (!nv_mrobj) {
+            continue;
+        }
+
+        size = object_property_get_uint(nv_mrobj, "size", NULL);
+        mem_reg[0] = cpu_to_be64(nvslot->gpa);
+        mem_reg[1] = cpu_to_be64(size);
+        mem_name = g_strdup_printf("memory@%"PRIx64, nvslot->gpa);
+        off = fdt_add_subnode(fdt, 0, mem_name);
+
+        _FDT(off);
+        _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
+        _FDT((fdt_setprop(fdt, off, "reg", mem_reg, sizeof(mem_reg))));
+        _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity,
+                          sizeof(associativity))));
+
+        _FDT((fdt_setprop_string(fdt, off, "compatible",
+                                 "ibm,coherent-device-memory")));
+
+        /* Same base address with a zero size for "linux,usable-memory" */
+        mem_reg[1] = cpu_to_be64(0);
+        _FDT((fdt_setprop(fdt, off, "linux,usable-memory", mem_reg,
+                          sizeof(mem_reg))));
+        _FDT((fdt_setprop_cell(fdt, off, "phandle",
+                               PHANDLE_GPURAM(sphb, i))));
+        g_free(mem_name);
+    }
+}
+
+/*
+ * Add the NVLink2 cross-reference properties to the device tree node at
+ * @offset describing PCI device @dev on @sphb:
+ *  - for a GPU: "ibm,npu" (phandles of its NVLink bridges) and "phandle";
+ *  - for an NVLink bridge: "phandle", "ibm,gpu", "ibm,nvlink",
+ *    "memory-region", "ibm,device-tgt-addr" and "ibm,nvlink-speed".
+ *
+ * Devices which are neither are left untouched, as are bridges whose slot
+ * has no GPU. Does nothing when no NVLink2 state exists for @sphb.
+ */
+void spapr_phb_nvgpu_populate_pcidev_dt(PCIDevice *dev, void *fdt, int offset,
+                                        SpaprPhbState *sphb)
+{
+    int i, j;
+
+    if (!sphb->nvgpus) {
+        return;
+    }
+
+    for (i = 0; i < sphb->nvgpus->num; ++i) {
+        struct spapr_phb_pci_nvgpu_slot *nvslot = &sphb->nvgpus->slots[i];
+
+        /* Skip "slot" without attached GPU */
+        if (!nvslot->gpdev) {
+            continue;
+        }
+        if (dev == nvslot->gpdev) {
+            /*
+             * Fixed-size buffer instead of a VLA sized by linknum: a GPU
+             * without any collected link would otherwise declare a
+             * zero-length VLA, which is undefined behaviour. linknum never
+             * exceeds NVGPU_MAX_LINKS, the size of nvslot->links[].
+             */
+            uint32_t npus[NVGPU_MAX_LINKS];
+
+            for (j = 0; j < nvslot->linknum; ++j) {
+                PCIDevice *npdev = nvslot->links[j].npdev;
+
+                npus[j] = cpu_to_be32(PHANDLE_PCIDEV(sphb, npdev));
+            }
+            _FDT(fdt_setprop(fdt, offset, "ibm,npu", npus,
+                             nvslot->linknum * sizeof(npus[0])));
+            _FDT((fdt_setprop_cell(fdt, offset, "phandle",
+                                   PHANDLE_PCIDEV(sphb, dev))));
+            continue;
+        }
+
+        /* Not this slot's GPU: check whether it is one of its bridges */
+        for (j = 0; j < nvslot->linknum; ++j) {
+            if (dev != nvslot->links[j].npdev) {
+                continue;
+            }
+
+            _FDT((fdt_setprop_cell(fdt, offset, "phandle",
+                                   PHANDLE_PCIDEV(sphb, dev))));
+            _FDT(fdt_setprop_cell(fdt, offset, "ibm,gpu",
+                                  PHANDLE_PCIDEV(sphb, nvslot->gpdev)));
+            _FDT((fdt_setprop_cell(fdt, offset, "ibm,nvlink",
+                                   PHANDLE_NVLINK(sphb, i, j))));
+            /*
+             * If we ever want to emulate GPU RAM at the same location as on
+             * the host - here is the encoding GPA->TGT:
+             *
+             * gta  = ((sphb->nv2_gpa >> 42) & 0x1) << 42;
+             * gta |= ((sphb->nv2_gpa >> 45) & 0x3) << 43;
+             * gta |= ((sphb->nv2_gpa >> 49) & 0x3) << 45;
+             * gta |= sphb->nv2_gpa & ((1UL << 43) - 1);
+             */
+            _FDT(fdt_setprop_cell(fdt, offset, "memory-region",
+                                  PHANDLE_GPURAM(sphb, i)));
+            _FDT(fdt_setprop_u64(fdt, offset, "ibm,device-tgt-addr",
+                                 nvslot->tgt));
+            _FDT(fdt_setprop_cell(fdt, offset, "ibm,nvlink-speed",
+                                  nvslot->links[j].link_speed));
+        }
+    }
+}
diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c
index 40a12001f5..29b2697fe1 100644
--- a/hw/vfio/pci-quirks.c
+++ b/hw/vfio/pci-quirks.c
@@ -2180,3 +2180,134 @@ int vfio_add_virt_caps(VFIOPCIDevice *vdev, Error **errp)
return 0;
}
+
+/*
+ * Property getter for "nvlink2-tgt": the value is not stored anywhere but
+ * carried in the @opaque pointer itself.
+ */
+static void vfio_pci_nvlink2_get_tgt(Object *obj, Visitor *v,
+                                     const char *name,
+                                     void *opaque, Error **errp)
+{
+    uintptr_t encoded = (uintptr_t) opaque;
+    uint64_t value = encoded;
+
+    visit_type_uint64(v, name, &value, errp);
+}
+
+/*
+ * Property getter for "nvlink2-link-speed": the value is carried in the
+ * @opaque pointer itself and narrowed to 32 bits.
+ */
+static void vfio_pci_nvlink2_get_link_speed(Object *obj, Visitor *v,
+                                            const char *name,
+                                            void *opaque, Error **errp)
+{
+    uintptr_t encoded = (uintptr_t) opaque;
+    uint32_t value = (uint32_t) encoded;
+
+    visit_type_uint32(v, name, &value, errp);
+}
+
+/*
+ * Expose the NVLink2 GPU RAM region of @vdev: mmap the NVIDIA vendor region
+ * of subtype VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM, wrap it in the
+ * "nvlink2-mr" memory region and publish the region's target address as the
+ * "nvlink2-tgt" property.
+ *
+ * Returns 0 on success, the error of vfio_get_dev_region_info() if the
+ * region lookup fails, -ENODEV if the SSATGT capability is missing, or
+ * -errno if mmap() fails. Nothing in this function stores into @errp.
+ */
+int vfio_pci_nvidia_v100_ram_init(VFIOPCIDevice *vdev, Error **errp)
+{
+    int ret;
+    void *p;
+    struct vfio_region_info *nv2reg = NULL;
+    struct vfio_info_cap_header *hdr;
+    struct vfio_region_info_cap_nvlink2_ssatgt *cap;
+    VFIOQuirk *quirk;
+
+    ret = vfio_get_dev_region_info(&vdev->vbasedev,
+                                   VFIO_REGION_TYPE_PCI_VENDOR_TYPE |
+                                   PCI_VENDOR_ID_NVIDIA,
+                                   VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM,
+                                   &nv2reg);
+    if (ret) {
+        return ret;
+    }
+
+    hdr = vfio_get_region_info_cap(nv2reg, VFIO_REGION_INFO_CAP_NVLINK2_SSATGT);
+    if (!hdr) {
+        ret = -ENODEV;
+        goto free_exit;
+    }
+    cap = (void *) hdr;
+
+    /* NOTE(review): the mapping requests PROT_EXEC - confirm it is needed */
+    p = mmap(NULL, nv2reg->size, PROT_READ | PROT_WRITE | PROT_EXEC,
+             MAP_SHARED, vdev->vbasedev.fd, nv2reg->offset);
+    if (p == MAP_FAILED) {
+        ret = -errno;
+        goto free_exit;
+    }
+
+    /* NOTE(review): no munmap() of p is visible here - confirm who unmaps */
+    quirk = vfio_quirk_alloc(1);
+    memory_region_init_ram_ptr(&quirk->mem[0], OBJECT(vdev), "nvlink2-mr",
+                               nv2reg->size, p);
+    QLIST_INSERT_HEAD(&vdev->bars[0].quirks, quirk, next);
+
+    /* The tgt value travels inside the opaque pointer of the getter */
+    object_property_add(OBJECT(vdev), "nvlink2-tgt", "uint64",
+                        vfio_pci_nvlink2_get_tgt, NULL, NULL,
+                        (void *) (uintptr_t) cap->tgt, NULL);
+    trace_vfio_pci_nvidia_gpu_setup_quirk(vdev->vbasedev.name, cap->tgt,
+                                          nv2reg->size);
+    /* Success falls through as well: ret is still 0 from the lookup above */
+free_exit:
+    g_free(nv2reg);
+
+    return ret;
+}
+
+/*
+ * Expose the NVLink2 bridge data of @vdev: look up the IBM vendor region of
+ * subtype VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD, map it as the
+ * "nvlink2-atsd-mr" memory region when its size is non-zero, and publish
+ * the "nvlink2-tgt" and "nvlink2-link-speed" properties.
+ *
+ * Returns 0 on success, the error of vfio_get_dev_region_info() if the
+ * region lookup fails, -ENODEV if either capability is missing, or -errno
+ * if mmap() fails. Nothing in this function stores into @errp.
+ */
+int vfio_pci_nvlink2_init(VFIOPCIDevice *vdev, Error **errp)
+{
+    int ret;
+    void *p;
+    struct vfio_region_info *atsdreg = NULL;
+    struct vfio_info_cap_header *hdr;
+    struct vfio_region_info_cap_nvlink2_ssatgt *captgt;
+    struct vfio_region_info_cap_nvlink2_lnkspd *capspeed;
+    VFIOQuirk *quirk;
+
+    ret = vfio_get_dev_region_info(&vdev->vbasedev,
+                                   VFIO_REGION_TYPE_PCI_VENDOR_TYPE |
+                                   PCI_VENDOR_ID_IBM,
+                                   VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD,
+                                   &atsdreg);
+    if (ret) {
+        return ret;
+    }
+
+    /* Both capabilities are mandatory; check them before mapping anything */
+    hdr = vfio_get_region_info_cap(atsdreg,
+                                   VFIO_REGION_INFO_CAP_NVLINK2_SSATGT);
+    if (!hdr) {
+        ret = -ENODEV;
+        goto free_exit;
+    }
+    captgt = (void *) hdr;
+
+    hdr = vfio_get_region_info_cap(atsdreg,
+                                   VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD);
+    if (!hdr) {
+        ret = -ENODEV;
+        goto free_exit;
+    }
+    capspeed = (void *) hdr;
+
+    /* Some NVLink bridges may not have assigned ATSD */
+    if (atsdreg->size) {
+        /* NOTE(review): the mapping requests PROT_EXEC - confirm it is needed */
+        p = mmap(NULL, atsdreg->size, PROT_READ | PROT_WRITE | PROT_EXEC,
+                 MAP_SHARED, vdev->vbasedev.fd, atsdreg->offset);
+        if (p == MAP_FAILED) {
+            ret = -errno;
+            goto free_exit;
+        }
+
+        quirk = vfio_quirk_alloc(1);
+        memory_region_init_ram_device_ptr(&quirk->mem[0], OBJECT(vdev),
+                                          "nvlink2-atsd-mr", atsdreg->size, p);
+        QLIST_INSERT_HEAD(&vdev->bars[0].quirks, quirk, next);
+    }
+
+    /* Added even when no ATSD region was mapped above */
+    object_property_add(OBJECT(vdev), "nvlink2-tgt", "uint64",
+                        vfio_pci_nvlink2_get_tgt, NULL, NULL,
+                        (void *) (uintptr_t) captgt->tgt, NULL);
+    trace_vfio_pci_nvlink2_setup_quirk_ssatgt(vdev->vbasedev.name, captgt->tgt,
+                                              atsdreg->size);
+
+    object_property_add(OBJECT(vdev), "nvlink2-link-speed", "uint32",
+                        vfio_pci_nvlink2_get_link_speed, NULL, NULL,
+                        (void *) (uintptr_t) capspeed->link_speed, NULL);
+    trace_vfio_pci_nvlink2_setup_quirk_lnkspd(vdev->vbasedev.name,
+                                              capspeed->link_speed);
+    /* Success falls through as well: ret is still 0 from the lookup above */
+free_exit:
+    g_free(atsdreg);
+
+    return ret;
+}
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 0142819ea6..8cecb53d5c 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3086,6 +3086,20 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
}
}
+ if (vdev->vendor_id == PCI_VENDOR_ID_NVIDIA) {
+ ret = vfio_pci_nvidia_v100_ram_init(vdev, errp);
+ if (ret && ret != -ENODEV) {
+ error_report("Failed to setup NVIDIA V100 GPU RAM");
+ }
+ }
+
+ if (vdev->vendor_id == PCI_VENDOR_ID_IBM) {
+ ret = vfio_pci_nvlink2_init(vdev, errp);
+ if (ret && ret != -ENODEV) {
+ error_report("Failed to setup NVlink2 bridge");
+ }
+ }
+
vfio_register_err_notifier(vdev);
vfio_register_req_notifier(vdev);
vfio_setup_resetfn_quirk(vdev);
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index c11c3f1670..cfcd1a81b8 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -196,6 +196,8 @@ int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp);
int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev,
struct vfio_region_info *info,
Error **errp);
+int vfio_pci_nvidia_v100_ram_init(VFIOPCIDevice *vdev, Error **errp);
+int vfio_pci_nvlink2_init(VFIOPCIDevice *vdev, Error **errp);
void vfio_display_reset(VFIOPCIDevice *vdev);
int vfio_display_probe(VFIOPCIDevice *vdev, Error **errp);
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index eb589930a5..b1ef55a33f 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -86,6 +86,10 @@ vfio_pci_igd_opregion_enabled(const char *name) "%s"
vfio_pci_igd_host_bridge_enabled(const char *name) "%s"
vfio_pci_igd_lpc_bridge_enabled(const char *name) "%s"
+vfio_pci_nvidia_gpu_setup_quirk(const char *name, uint64_t tgt, uint64_t size) "%s tgt=0x%"PRIx64" size=0x%"PRIx64
+vfio_pci_nvlink2_setup_quirk_ssatgt(const char *name, uint64_t tgt, uint64_t size) "%s tgt=0x%"PRIx64" size=0x%"PRIx64
+vfio_pci_nvlink2_setup_quirk_lnkspd(const char *name, uint32_t link_speed) "%s link_speed=0x%x"
+
# common.c
vfio_region_write(const char *name, int index, uint64_t addr, uint64_t data, unsigned size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)"
vfio_region_read(char *name, int index, uint64_t addr, unsigned size, uint64_t data) " (%s:region%d+0x%"PRIx64", %d) = 0x%"PRIx64
diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
index b4aad26798..53519c835e 100644
--- a/include/hw/pci-host/spapr.h
+++ b/include/hw/pci-host/spapr.h
@@ -87,6 +87,9 @@ struct SpaprPhbState {
uint32_t mig_liobn;
hwaddr mig_mem_win_addr, mig_mem_win_size;
hwaddr mig_io_win_addr, mig_io_win_size;
+ hwaddr nv2_gpa_win_addr;
+ hwaddr nv2_atsd_win_addr;
+ struct spapr_phb_pci_nvgpu_config *nvgpus;
};
#define SPAPR_PCI_MEM_WIN_BUS_OFFSET 0x80000000ULL
@@ -105,6 +108,22 @@ struct SpaprPhbState {
#define SPAPR_PCI_MSI_WINDOW 0x40000000000ULL
+#define SPAPR_PCI_NV2RAM64_WIN_BASE SPAPR_PCI_LIMIT
+#define SPAPR_PCI_NV2RAM64_WIN_SIZE (2 * TiB) /* For up to 6 GPUs 256GB each */
+
+/* Max number of these GPUs per physical box */
+#define NVGPU_MAX_NUM 6
+/* Max number of NVLinks per GPU in any physical box */
+#define NVGPU_MAX_LINKS 3
+
+/*
+ * GPU RAM starts at 64TiB so huge DMA window to cover it all ends at 128TiB
+ * which is enough. We do not need DMA for ATSD so we put them at 128TiB.
+ */
+#define SPAPR_PCI_NV2ATSD_WIN_BASE (128 * TiB)
+#define SPAPR_PCI_NV2ATSD_WIN_SIZE (NVGPU_MAX_NUM * NVGPU_MAX_LINKS * \
+ 64 * KiB)
+
static inline qemu_irq spapr_phb_lsi_qirq(struct SpaprPhbState *phb, int pin)
{
SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
@@ -135,6 +154,13 @@ int spapr_phb_vfio_eeh_get_state(SpaprPhbState *sphb, int *state);
int spapr_phb_vfio_eeh_reset(SpaprPhbState *sphb, int option);
int spapr_phb_vfio_eeh_configure(SpaprPhbState *sphb);
void spapr_phb_vfio_reset(DeviceState *qdev);
+void spapr_phb_nvgpu_setup(SpaprPhbState *sphb, Error **errp);
+void spapr_phb_nvgpu_free(SpaprPhbState *sphb);
+void spapr_phb_nvgpu_populate_dt(SpaprPhbState *sphb, void *fdt, int bus_off,
+ Error **errp);
+void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb, void *fdt);
+void spapr_phb_nvgpu_populate_pcidev_dt(PCIDevice *dev, void *fdt, int offset,
+ SpaprPhbState *sphb);
#else
static inline bool spapr_phb_eeh_available(SpaprPhbState *sphb)
{
@@ -161,6 +187,25 @@ static inline int spapr_phb_vfio_eeh_configure(SpaprPhbState *sphb)
static inline void spapr_phb_vfio_reset(DeviceState *qdev)
{
}
+/*
+ * No-op versions of the NVLink2 helpers for the #else branch of the
+ * surrounding conditional, so callers need no conditionals of their own.
+ */
+static inline void spapr_phb_nvgpu_setup(SpaprPhbState *sphb, Error **errp)
+{
+}
+static inline void spapr_phb_nvgpu_free(SpaprPhbState *sphb)
+{
+}
+static inline void spapr_phb_nvgpu_populate_dt(SpaprPhbState *sphb, void *fdt,
+                                               int bus_off, Error **errp)
+{
+}
+static inline void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb,
+                                                   void *fdt)
+{
+}
+static inline void spapr_phb_nvgpu_populate_pcidev_dt(PCIDevice *dev, void *fdt,
+                                                      int offset,
+                                                      SpaprPhbState *sphb)
+{
+}
#endif
void spapr_phb_dma_reset(SpaprPhbState *sphb);
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 5ea8081041..02b5ce7e40 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -123,7 +123,8 @@ struct SpaprMachineClass {
void (*phb_placement)(SpaprMachineState *spapr, uint32_t index,
uint64_t *buid, hwaddr *pio,
hwaddr *mmio32, hwaddr *mmio64,
- unsigned n_dma, uint32_t *liobns, Error **errp);
+ unsigned n_dma, uint32_t *liobns, hwaddr *nv2gpa,
+ hwaddr *nv2atsd, Error **errp);
SpaprResizeHpt resize_hpt_default;
SpaprCapabilities default_caps;
SpaprIrq *irq;
@@ -199,6 +200,8 @@ struct SpaprMachineState {
bool cmd_line_caps[SPAPR_CAP_NUM];
SpaprCapabilities def, eff, mig;
+
+ unsigned gpu_numa_id;
};
#define H_SUCCESS 0