From ec132efaa81f09861a3bd6afad94827e74543b3f Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Tue, 12 Mar 2019 19:21:03 +1100 Subject: spapr: Support NVIDIA V100 GPU with NVLink2 NVIDIA V100 GPUs have on-board RAM which is mapped into the host memory space and accessible as normal RAM via an NVLink bus. The VFIO-PCI driver implements special regions for such GPUs and emulates an NVLink bridge. NVLink2-enabled POWER9 CPUs also provide address translation services which includes an ATS shootdown (ATSD) register exported via the NVLink bridge device. This adds a quirk to VFIO to map the GPU memory and create an MR; the new MR is stored in a PCI device as a QOM link. The sPAPR PCI uses this to get the MR and map it to the system address space. Another quirk does the same for ATSD. This adds additional steps to sPAPR PHB setup: 1. Search for specific GPUs and NPUs, collect findings in sPAPRPHBState::nvgpus, manage system address space mappings; 2. Add device-specific properties such as "ibm,npu", "ibm,gpu", "memory-block", "link-speed" to advertise the NVLink2 function to the guest; 3. Add "mmio-atsd" to vPHB to advertise the ATSD capability; 4. Add new memory blocks (with extra "linux,memory-usable" to prevent the guest OS from accessing the new memory until it is onlined) and npuphb# nodes representing an NPU unit for every vPHB as the GPU driver uses it for link discovery. This allocates space for GPU RAM and ATSD like we do for MMIOs by adding 2 new parameters to the phb_placement() hook. Older machine types set these to zero. This puts new memory nodes in a separate NUMA node to as the GPU RAM needs to be configured equally distant from any other node in the system. Unlike the host setup which assigns numa ids from 255 downwards, this adds new NUMA nodes after the user configures nodes or from 1 if none were configured. This adds requirement similar to EEH - one IOMMU group per vPHB. The reason for this is that ATSD registers belong to a physical NPU so they cannot invalidate translations on GPUs attached to another NPU. It is guaranteed by the host platform as it does not mix NVLink bridges or GPUs from different NPU in the same IOMMU group. If more than one IOMMU group is detected on a vPHB, this disables ATSD support for that vPHB and prints a warning. Signed-off-by: Alexey Kardashevskiy [aw: for vfio portions] Acked-by: Alex Williamson Message-Id: <20190312082103.130561-1-aik@ozlabs.ru> Signed-off-by: David Gibson --- include/hw/pci-host/spapr.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'include/hw/pci-host') diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h index b4aad26798..53519c835e 100644 --- a/include/hw/pci-host/spapr.h +++ b/include/hw/pci-host/spapr.h @@ -87,6 +87,9 @@ struct SpaprPhbState { uint32_t mig_liobn; hwaddr mig_mem_win_addr, mig_mem_win_size; hwaddr mig_io_win_addr, mig_io_win_size; + hwaddr nv2_gpa_win_addr; + hwaddr nv2_atsd_win_addr; + struct spapr_phb_pci_nvgpu_config *nvgpus; }; #define SPAPR_PCI_MEM_WIN_BUS_OFFSET 0x80000000ULL @@ -105,6 +108,22 @@ struct SpaprPhbState { #define SPAPR_PCI_MSI_WINDOW 0x40000000000ULL +#define SPAPR_PCI_NV2RAM64_WIN_BASE SPAPR_PCI_LIMIT +#define SPAPR_PCI_NV2RAM64_WIN_SIZE (2 * TiB) /* For up to 6 GPUs 256GB each */ + +/* Max number of these GPUsper a physical box */ +#define NVGPU_MAX_NUM 6 +/* Max number of NVLinks per GPU in any physical box */ +#define NVGPU_MAX_LINKS 3 + +/* + * GPU RAM starts at 64TiB so huge DMA window to cover it all ends at 128TiB + * which is enough. We do not need DMA for ATSD so we put them at 128TiB. + */ +#define SPAPR_PCI_NV2ATSD_WIN_BASE (128 * TiB) +#define SPAPR_PCI_NV2ATSD_WIN_SIZE (NVGPU_MAX_NUM * NVGPU_MAX_LINKS * \ + 64 * KiB) + static inline qemu_irq spapr_phb_lsi_qirq(struct SpaprPhbState *phb, int pin) { SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); @@ -135,6 +154,13 @@ int spapr_phb_vfio_eeh_get_state(SpaprPhbState *sphb, int *state); int spapr_phb_vfio_eeh_reset(SpaprPhbState *sphb, int option); int spapr_phb_vfio_eeh_configure(SpaprPhbState *sphb); void spapr_phb_vfio_reset(DeviceState *qdev); +void spapr_phb_nvgpu_setup(SpaprPhbState *sphb, Error **errp); +void spapr_phb_nvgpu_free(SpaprPhbState *sphb); +void spapr_phb_nvgpu_populate_dt(SpaprPhbState *sphb, void *fdt, int bus_off, + Error **errp); +void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb, void *fdt); +void spapr_phb_nvgpu_populate_pcidev_dt(PCIDevice *dev, void *fdt, int offset, + SpaprPhbState *sphb); #else static inline bool spapr_phb_eeh_available(SpaprPhbState *sphb) { @@ -161,6 +187,25 @@ static inline int spapr_phb_vfio_eeh_configure(SpaprPhbState *sphb) static inline void spapr_phb_vfio_reset(DeviceState *qdev) { } +static inline void spapr_phb_nvgpu_setup(SpaprPhbState *sphb, Error **errp) +{ +} +static inline void spapr_phb_nvgpu_free(SpaprPhbState *sphb) +{ +} +static inline void spapr_phb_nvgpu_populate_dt(SpaprPhbState *sphb, void *fdt, + int bus_off, Error **errp) +{ +} +static inline void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb, + void *fdt) +{ +} +static inline void spapr_phb_nvgpu_populate_pcidev_dt(PCIDevice *dev, void *fdt, + int offset, + SpaprPhbState *sphb) +{ +} #endif void spapr_phb_dma_reset(SpaprPhbState *sphb); -- cgit v1.2.3