From b53b0f696b10828f6393155f44a352c019e673fd Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 26 May 2016 09:43:20 -0600 Subject: vfio: Enable sparse mmap capability The sparse mmap capability in a vfio region info allows vfio to tell us which sub-areas of a region may be mmap'd. Thus rather than assuming a single mmap covers the entire region and later frobbing it ourselves for things like the PCI MSI-X vector table, we can read that directly from vfio. Signed-off-by: Alex Williamson Reviewed-by: Gerd Hoffmann Tested-by: Gerd Hoffmann --- hw/vfio/common.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++---- trace-events | 2 ++ 2 files changed, 65 insertions(+), 4 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 88154a1f03..49121798ff 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -499,6 +499,54 @@ static void vfio_listener_release(VFIOContainer *container) memory_listener_unregister(&container->listener); } +static struct vfio_info_cap_header * +vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id) +{ + struct vfio_info_cap_header *hdr; + void *ptr = info; + + if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS)) { + return NULL; + } + + for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) { + if (hdr->id == id) { + return hdr; + } + } + + return NULL; +} + +static void vfio_setup_region_sparse_mmaps(VFIORegion *region, + struct vfio_region_info *info) +{ + struct vfio_info_cap_header *hdr; + struct vfio_region_info_cap_sparse_mmap *sparse; + int i; + + hdr = vfio_get_region_info_cap(info, VFIO_REGION_INFO_CAP_SPARSE_MMAP); + if (!hdr) { + return; + } + + sparse = container_of(hdr, struct vfio_region_info_cap_sparse_mmap, header); + + trace_vfio_region_sparse_mmap_header(region->vbasedev->name, + region->nr, sparse->nr_areas); + + region->nr_mmaps = sparse->nr_areas; + region->mmaps = g_new0(VFIOMmap, region->nr_mmaps); + + for (i = 0; i < region->nr_mmaps; i++) { + region->mmaps[i].offset = sparse->areas[i].offset; + region->mmaps[i].size = sparse->areas[i].size; + trace_vfio_region_sparse_mmap_entry(i, region->mmaps[i].offset, + region->mmaps[i].offset + + region->mmaps[i].size); + } +} + int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region, int index, const char *name) { @@ -525,11 +573,14 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region, region->flags & VFIO_REGION_INFO_FLAG_MMAP && !(region->size & ~qemu_real_host_page_mask)) { - region->nr_mmaps = 1; - region->mmaps = g_new0(VFIOMmap, region->nr_mmaps); + vfio_setup_region_sparse_mmaps(region, info); - region->mmaps[0].offset = 0; - region->mmaps[0].size = region->size; + if (!region->nr_mmaps) { + region->nr_mmaps = 1; + region->mmaps = g_new0(VFIOMmap, region->nr_mmaps); + region->mmaps[0].offset = 0; + region->mmaps[0].size = region->size; + } } } @@ -1089,6 +1140,7 @@ int vfio_get_region_info(VFIODevice *vbasedev, int index, *info = g_malloc0(argsz); (*info)->index = index; +retry: (*info)->argsz = argsz; if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) { @@ -1096,6 +1148,13 @@ int vfio_get_region_info(VFIODevice *vbasedev, int index, return -errno; } + if ((*info)->argsz > argsz) { + argsz = (*info)->argsz; + *info = g_realloc(*info, argsz); + + goto retry; + } + return 0; } diff --git a/trace-events b/trace-events index 4450d8f377..145d8c38b4 100644 --- a/trace-events +++ b/trace-events @@ -1734,6 +1734,8 @@ vfio_region_mmap(const char *name, unsigned long offset, unsigned long end) "Reg vfio_region_exit(const char *name, int index) "Device %s, region %d" vfio_region_finalize(const char *name, int index) "Device %s, region %d" vfio_region_mmaps_set_enabled(const char *name, bool enabled) "Region %s mmaps enabled: %d" +vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Device %s region %d: %d sparse mmap entries" +vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]" # hw/vfio/platform.c vfio_platform_base_device_init(char *name, int groupid) "%s belongs to group #%d" -- cgit v1.2.3 From e61a424f0573634a1bc180de965b2cb794c1038e Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 26 May 2016 09:43:20 -0600 Subject: vfio: Create device specific region info helper Given a device specific region type and sub-type, find it. Also cleanup return point on error in vfio_get_region_info() so that we always return 0 with a valid pointer or -errno and NULL. Signed-off-by: Alex Williamson Reviewed-by: Gerd Hoffmann Tested-by: Gerd Hoffmann --- hw/vfio/common.c | 36 ++++++++++++++++++++++++++++++++++++ include/hw/vfio/vfio-common.h | 2 ++ trace-events | 4 ++-- 3 files changed, 40 insertions(+), 2 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 49121798ff..902a047947 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1145,6 +1145,7 @@ retry: if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) { g_free(*info); + *info = NULL; return -errno; } @@ -1158,6 +1159,41 @@ retry: return 0; } +int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type, + uint32_t subtype, struct vfio_region_info **info) +{ + int i; + + for (i = 0; i < vbasedev->num_regions; i++) { + struct vfio_info_cap_header *hdr; + struct vfio_region_info_cap_type *cap_type; + + if (vfio_get_region_info(vbasedev, i, info)) { + continue; + } + + hdr = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE); + if (!hdr) { + g_free(*info); + continue; + } + + cap_type = container_of(hdr, struct vfio_region_info_cap_type, header); + + trace_vfio_get_dev_region(vbasedev->name, i, + cap_type->type, cap_type->subtype); + + if (cap_type->type == type && cap_type->subtype == subtype) { + return 0; + } + + g_free(*info); + } + + *info = NULL; + return -ENODEV; +} + /* * Interfaces for IBM EEH (Enhanced Error Handling) */ diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index eb0e1b0342..a947f9ccda 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -154,5 +154,7 @@ extern QLIST_HEAD(vfio_as_head, VFIOAddressSpace) vfio_address_spaces; #ifdef CONFIG_LINUX int vfio_get_region_info(VFIODevice *vbasedev, int index, struct vfio_region_info **info); +int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type, + uint32_t subtype, struct vfio_region_info **info); #endif #endif /* !HW_VFIO_VFIO_COMMON_H */ diff --git a/trace-events b/trace-events index 145d8c38b4..3d2c79ea02 100644 --- a/trace-events +++ b/trace-events @@ -1714,8 +1714,7 @@ vfio_quirk_ati_bonaire_reset_timeout(const char *name) "%s" vfio_quirk_ati_bonaire_reset_done(const char *name) "%s" vfio_quirk_ati_bonaire_reset(const char *name) "%s" - -# hw/vfio/vfio-common.c +# hw/vfio/common.c vfio_region_write(const char *name, int index, uint64_t addr, uint64_t data, unsigned size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)" vfio_region_read(char *name, int index, uint64_t addr, unsigned size, uint64_t data) " (%s:region%d+0x%"PRIx64", %d) = 0x%"PRIx64 vfio_iommu_map_notify(uint64_t iova_start, uint64_t iova_end) "iommu map @ %"PRIx64" - %"PRIx64 @@ -1736,6 +1735,7 @@ vfio_region_finalize(const char *name, int index) "Device %s, region %d" vfio_region_mmaps_set_enabled(const char *name, bool enabled) "Region %s mmaps enabled: %d" vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Device %s region %d: %d sparse mmap entries" vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]" +vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%0x8" # hw/vfio/platform.c vfio_platform_base_device_init(char *name, int groupid) "%s belongs to group #%d" -- cgit v1.2.3 From 4225f2b67089ab2dc6c0af24a9648a421f2cdb51 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 26 May 2016 09:43:20 -0600 Subject: vfio/pci: Fix return of vfio_populate_vga() This function returns success if either we setup the VGA region or the host vfio doesn't return enough regions to support the VGA index. This latter case doesn't make any sense. If we're asked to populate VGA, fail if it doesn't exist and let the caller decide if that's important. Signed-off-by: Alex Williamson Reviewed-by: Gerd Hoffmann Tested-by: Gerd Hoffmann --- hw/vfio/pci.c | 55 ++++++++++++++++++++++++++----------------------------- 1 file changed, 26 insertions(+), 29 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index d091d8cf0e..dfce31378a 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2061,42 +2061,39 @@ int vfio_populate_vga(VFIOPCIDevice *vdev) struct vfio_region_info *reg_info; int ret; - if (vbasedev->num_regions > VFIO_PCI_VGA_REGION_INDEX) { - ret = vfio_get_region_info(vbasedev, - VFIO_PCI_VGA_REGION_INDEX, ®_info); - if (ret) { - return ret; - } + ret = vfio_get_region_info(vbasedev, VFIO_PCI_VGA_REGION_INDEX, ®_info); + if (ret) { + return ret; + } - if (!(reg_info->flags & VFIO_REGION_INFO_FLAG_READ) || - !(reg_info->flags & VFIO_REGION_INFO_FLAG_WRITE) || - reg_info->size < 0xbffff + 1) { - error_report("vfio: Unexpected VGA info, flags 0x%lx, size 0x%lx", - (unsigned long)reg_info->flags, - (unsigned long)reg_info->size); - g_free(reg_info); - return -EINVAL; - } + if (!(reg_info->flags & VFIO_REGION_INFO_FLAG_READ) || + !(reg_info->flags & VFIO_REGION_INFO_FLAG_WRITE) || + reg_info->size < 0xbffff + 1) { + error_report("vfio: Unexpected VGA info, flags 0x%lx, size 0x%lx", + (unsigned long)reg_info->flags, + (unsigned long)reg_info->size); + g_free(reg_info); + return -EINVAL; + } - vdev->vga = g_new0(VFIOVGA, 1); + vdev->vga = g_new0(VFIOVGA, 1); - vdev->vga->fd_offset = reg_info->offset; - vdev->vga->fd = vdev->vbasedev.fd; + vdev->vga->fd_offset = reg_info->offset; + vdev->vga->fd = vdev->vbasedev.fd; - g_free(reg_info); + g_free(reg_info); - vdev->vga->region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE; - vdev->vga->region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM; - QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_MEM].quirks); + vdev->vga->region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE; + vdev->vga->region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM; + QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_MEM].quirks); - vdev->vga->region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE; - vdev->vga->region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO; - QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].quirks); + vdev->vga->region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE; + vdev->vga->region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO; + QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].quirks); - vdev->vga->region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE; - vdev->vga->region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI; - QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks); - } + vdev->vga->region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE; + vdev->vga->region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI; + QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks); return 0; } -- cgit v1.2.3 From 182bca4592a8a6a57cdbd6889683cced78e85a03 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 26 May 2016 09:43:21 -0600 Subject: vfio/pci: Consolidate VGA setup Combine VGA discovery and registration. Quirks can have dependencies on BARs, so the quirks push out until after we've scanned the BARs. Signed-off-by: Alex Williamson Reviewed-by: Gerd Hoffmann Tested-by: Gerd Hoffmann --- hw/vfio/pci.c | 49 ++++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index dfce31378a..daf10b8dd3 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -1452,29 +1452,6 @@ static void vfio_bars_setup(VFIOPCIDevice *vdev) for (i = 0; i < PCI_ROM_SLOT; i++) { vfio_bar_setup(vdev, i); } - - if (vdev->vga) { - memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_MEM].mem, - OBJECT(vdev), &vfio_vga_ops, - &vdev->vga->region[QEMU_PCI_VGA_MEM], - "vfio-vga-mmio@0xa0000", - QEMU_PCI_VGA_MEM_SIZE); - memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem, - OBJECT(vdev), &vfio_vga_ops, - &vdev->vga->region[QEMU_PCI_VGA_IO_LO], - "vfio-vga-io@0x3b0", - QEMU_PCI_VGA_IO_LO_SIZE); - memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem, - OBJECT(vdev), &vfio_vga_ops, - &vdev->vga->region[QEMU_PCI_VGA_IO_HI], - "vfio-vga-io@0x3c0", - QEMU_PCI_VGA_IO_HI_SIZE); - - pci_register_vga(&vdev->pdev, &vdev->vga->region[QEMU_PCI_VGA_MEM].mem, - &vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem, - &vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem); - vfio_vga_quirk_setup(vdev); - } } static void vfio_bars_exit(VFIOPCIDevice *vdev) @@ -2087,14 +2064,36 @@ int vfio_populate_vga(VFIOPCIDevice *vdev) vdev->vga->region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM; QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_MEM].quirks); + memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_MEM].mem, + OBJECT(vdev), &vfio_vga_ops, + &vdev->vga->region[QEMU_PCI_VGA_MEM], + "vfio-vga-mmio@0xa0000", + QEMU_PCI_VGA_MEM_SIZE); + vdev->vga->region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE; vdev->vga->region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO; QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].quirks); + memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem, + OBJECT(vdev), &vfio_vga_ops, + &vdev->vga->region[QEMU_PCI_VGA_IO_LO], + "vfio-vga-io@0x3b0", + QEMU_PCI_VGA_IO_LO_SIZE); + vdev->vga->region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE; vdev->vga->region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI; QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks); + memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem, + OBJECT(vdev), &vfio_vga_ops, + &vdev->vga->region[QEMU_PCI_VGA_IO_HI], + "vfio-vga-io@0x3c0", + QEMU_PCI_VGA_IO_HI_SIZE); + + pci_register_vga(&vdev->pdev, &vdev->vga->region[QEMU_PCI_VGA_MEM].mem, + &vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem, + &vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem); + return 0; } @@ -2557,6 +2556,10 @@ static int vfio_initfn(PCIDevice *pdev) goto out_teardown; } + if (vdev->vga) { + vfio_vga_quirk_setup(vdev); + } + /* QEMU emulates all of MSI & MSIX */ if (pdev->cap_present & QEMU_PCI_CAP_MSIX) { memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff, -- cgit v1.2.3 From 581406e0e349be99b6c08a7a1b492af5951f9a4e Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 26 May 2016 09:43:21 -0600 Subject: vfio/pci: Setup BAR quirks after capabilities probing Capability probing modifies wmask, which quirks may be interested in changing themselves. Apply our BAR quirks after the capability scan to make this possible. Signed-off-by: Alex Williamson Reviewed-by: Gerd Hoffmann Tested-by: Gerd Hoffmann --- hw/vfio/pci.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index daf10b8dd3..aa6fb7bd6f 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -1440,8 +1440,6 @@ static void vfio_bar_setup(VFIOPCIDevice *vdev, int nr) vdev->vbasedev.name, nr); } - vfio_bar_quirk_setup(vdev, nr); - pci_register_bar(&vdev->pdev, nr, type, bar->region.mem); } @@ -2394,7 +2392,7 @@ static int vfio_initfn(PCIDevice *pdev) ssize_t len; struct stat st; int groupid; - int ret; + int i, ret; if (!vdev->vbasedev.sysfsdev) { vdev->vbasedev.sysfsdev = @@ -2560,6 +2558,10 @@ static int vfio_initfn(PCIDevice *pdev) vfio_vga_quirk_setup(vdev); } + for (i = 0; i < PCI_ROM_SLOT; i++) { + vfio_bar_quirk_setup(vdev, i); + } + /* QEMU emulates all of MSI & MSIX */ if (pdev->cap_present & QEMU_PCI_CAP_MSIX) { memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff, -- cgit v1.2.3 From c4c45e943e519f5ac220f7af1afb2a0025d03c54 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 26 May 2016 09:43:21 -0600 Subject: vfio/pci: Intel graphics legacy mode assignment Enable quirks to support SandyBridge and newer IGD devices as primary VM graphics. This requires new vfio-pci device specific regions added in kernel v4.6 to expose the IGD OpRegion, the shadow ROM, and config space access to the PCI host bridge and LPC/ISA bridge. VM firmware support, SeaBIOS only so far, is also required for reserving memory regions for IGD specific use. In order to enable this mode, IGD must be assigned to the VM at PCI bus address 00:02.0, it must have a ROM, it must be able to enable VGA, it must have or be able to create on its own an LPC/ISA bridge of the proper type at PCI bus address 00:1f.0 (sorry, not compatible with Q35 yet), and it must have the above noted vfio-pci kernel features and BIOS. The intention is that to enable this mode, a user simply needs to assign 00:02.0 from the host to 00:02.0 in the VM: -device vfio-pci,host=0000:00:02.0,bus=pci.0,addr=02.0 and everything either happens automatically or it doesn't. In the case that it doesn't, we leave error reports, but assume the device will operate in universal passthrough mode (UPT), which doesn't require any of this, but has a much more narrow window of supported devices, supported use cases, and supported guest drivers. When using IGD in this mode, the VM firmware is required to reserve some VM RAM for the OpRegion (on the order or several 4k pages) and stolen memory for the GTT (up to 8MB for the latest GPUs). An additional option, x-igd-gms allows the user to specify some amount of additional memory (value is number of 32MB chunks up to 512MB) that is pre-allocated for graphics use. TBH, I don't know of anything that requires this or makes use of this memory, which is why we don't allocate any by default, but the specification suggests this is not actually a valid combination, so the option exists as a workaround. Please report if it's actually necessary in some environment. See code comments for further discussion about the actual operation of the quirks necessary to assign these devices. Signed-off-by: Alex Williamson Reviewed-by: Gerd Hoffmann Tested-by: Gerd Hoffmann --- hw/vfio/pci-quirks.c | 643 ++++++++++++++++++++++++++++++++++++++++++++++++++- hw/vfio/pci.c | 8 + hw/vfio/pci.h | 2 + trace-events | 5 + 4 files changed, 657 insertions(+), 1 deletion(-) diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c index 49ecf1172a..f2b8ed54ab 100644 --- a/hw/vfio/pci-quirks.c +++ b/hw/vfio/pci-quirks.c @@ -11,9 +11,12 @@ */ #include "qemu/osdep.h" +#include "qemu/error-report.h" +#include "qemu/range.h" +#include "qapi/error.h" +#include "hw/nvram/fw_cfg.h" #include "pci.h" #include "trace.h" -#include "qemu/range.h" /* Use uin32_t for vendor & device so PCI_ANY_ID expands and cannot match hw */ static bool vfio_pci_is(VFIOPCIDevice *vdev, uint32_t vendor, uint32_t device) @@ -961,6 +964,643 @@ static void vfio_probe_rtl8168_bar2_quirk(VFIOPCIDevice *vdev, int nr) trace_vfio_quirk_rtl8168_probe(vdev->vbasedev.name); } +/* + * Intel IGD support + * + * Obviously IGD is not a discrete device, this is evidenced not only by it + * being integrated into the CPU, but by the various chipset and BIOS + * dependencies that it brings along with it. Intel is trying to move away + * from this and Broadwell and newer devices can run in what Intel calls + * "Universal Pass-Through" mode, or UPT. Theoretically in UPT mode, nothing + * more is required beyond assigning the IGD device to a VM. There are + * however support limitations to this mode. It only supports IGD as a + * secondary graphics device in the VM and it doesn't officially support any + * physical outputs. + * + * The code here attempts to enable what we'll call legacy mode assignment, + * IGD retains most of the capabilities we expect for it to have on bare + * metal. To enable this mode, the IGD device must be assigned to the VM + * at PCI address 00:02.0, it must have a ROM, it very likely needs VGA + * support, we must have VM BIOS support for reserving and populating some + * of the required tables, and we need to tweak the chipset with revisions + * and IDs and an LPC/ISA bridge device. The intention is to make all of + * this happen automatically by installing the device at the correct VM PCI + * bus address. If any of the conditions are not met, we cross our fingers + * and hope the user knows better. + * + * NB - It is possible to enable physical outputs in UPT mode by supplying + * an OpRegion table. We don't do this by default because the guest driver + * behaves differently if an OpRegion is provided and no monitor is attached + * vs no OpRegion and a monitor being attached or not. Effectively, if a + * headless setup is desired, the OpRegion gets in the way of that. + */ + +/* + * This presumes the device is already known to be an Intel VGA device, so we + * take liberties in which device ID bits match which generation. This should + * not be taken as an indication that all the devices are supported, or even + * supportable, some of them don't even support VT-d. + * See linux:include/drm/i915_pciids.h for IDs. + */ +static int igd_gen(VFIOPCIDevice *vdev) +{ + if ((vdev->device_id & 0xfff) == 0xa84) { + return 8; /* Broxton */ + } + + switch (vdev->device_id & 0xff00) { + /* Old, untested, unavailable, unknown */ + case 0x0000: + case 0x2500: + case 0x2700: + case 0x2900: + case 0x2a00: + case 0x2e00: + case 0x3500: + case 0xa000: + return -1; + /* SandyBridge, IvyBridge, ValleyView, Haswell */ + case 0x0100: + case 0x0400: + case 0x0a00: + case 0x0c00: + case 0x0d00: + case 0x0f00: + return 6; + /* BroadWell, CherryView, SkyLake, KabyLake */ + case 0x1600: + case 0x1900: + case 0x2200: + case 0x5900: + return 8; + } + + return 8; /* Assume newer is compatible */ +} + +typedef struct VFIOIGDQuirk { + struct VFIOPCIDevice *vdev; + uint32_t index; +} VFIOIGDQuirk; + +#define IGD_GMCH 0x50 /* Graphics Control Register */ +#define IGD_BDSM 0x5c /* Base Data of Stolen Memory */ +#define IGD_ASLS 0xfc /* ASL Storage Register */ + +/* + * The OpRegion includes the Video BIOS Table, which seems important for + * telling the driver what sort of outputs it has. Without this, the device + * may work in the guest, but we may not get output. This also requires BIOS + * support to reserve and populate a section of guest memory sufficient for + * the table and to write the base address of that memory to the ASLS register + * of the IGD device. + */ +static int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, + struct vfio_region_info *info) +{ + int ret; + + vdev->igd_opregion = g_malloc0(info->size); + ret = pread(vdev->vbasedev.fd, vdev->igd_opregion, + info->size, info->offset); + if (ret != info->size) { + error_report("vfio: Error reading IGD OpRegion"); + g_free(vdev->igd_opregion); + vdev->igd_opregion = NULL; + return -EINVAL; + } + + /* + * Provide fw_cfg with a copy of the OpRegion which the VM firmware is to + * allocate 32bit reserved memory for, copy these contents into, and write + * the reserved memory base address to the device ASLS register at 0xFC. + * Alignment of this reserved region seems flexible, but using a 4k page + * alignment seems to work well. This interface assumes a single IGD + * device, which may be at VM address 00:02.0 in legacy mode or another + * address in UPT mode. + * + * NB, there may be future use cases discovered where the VM should have + * direct interaction with the host OpRegion, in which case the write to + * the ASLS register would trigger MemoryRegion setup to enable that. + */ + fw_cfg_add_file(fw_cfg_find(), "etc/igd-opregion", + vdev->igd_opregion, info->size); + + trace_vfio_pci_igd_opregion_enabled(vdev->vbasedev.name); + + pci_set_long(vdev->pdev.config + IGD_ASLS, 0); + pci_set_long(vdev->pdev.wmask + IGD_ASLS, ~0); + pci_set_long(vdev->emulated_config_bits + IGD_ASLS, ~0); + + return 0; +} + +/* + * The rather short list of registers that we copy from the host devices. + * The LPC/ISA bridge values are definitely needed to support the vBIOS, the + * host bridge values may or may not be needed depending on the guest OS. + * Since we're only munging revision and subsystem values on the host bridge, + * we don't require our own device. The LPC/ISA bridge needs to be our very + * own though. + */ +typedef struct { + uint8_t offset; + uint8_t len; +} IGDHostInfo; + +static const IGDHostInfo igd_host_bridge_infos[] = { + {PCI_REVISION_ID, 2}, + {PCI_SUBSYSTEM_VENDOR_ID, 2}, + {PCI_SUBSYSTEM_ID, 2}, +}; + +static const IGDHostInfo igd_lpc_bridge_infos[] = { + {PCI_VENDOR_ID, 2}, + {PCI_DEVICE_ID, 2}, + {PCI_REVISION_ID, 2}, + {PCI_SUBSYSTEM_VENDOR_ID, 2}, + {PCI_SUBSYSTEM_ID, 2}, +}; + +static int vfio_pci_igd_copy(VFIOPCIDevice *vdev, PCIDevice *pdev, + struct vfio_region_info *info, + const IGDHostInfo *list, int len) +{ + int i, ret; + + for (i = 0; i < len; i++) { + ret = pread(vdev->vbasedev.fd, pdev->config + list[i].offset, + list[i].len, info->offset + list[i].offset); + if (ret != list[i].len) { + error_report("IGD copy failed: %m"); + return -errno; + } + } + + return 0; +} + +/* + * Stuff a few values into the host bridge. + */ +static int vfio_pci_igd_host_init(VFIOPCIDevice *vdev, + struct vfio_region_info *info) +{ + PCIBus *bus; + PCIDevice *host_bridge; + int ret; + + bus = pci_device_root_bus(&vdev->pdev); + host_bridge = pci_find_device(bus, 0, PCI_DEVFN(0, 0)); + + if (!host_bridge) { + error_report("Can't find host bridge"); + return -ENODEV; + } + + ret = vfio_pci_igd_copy(vdev, host_bridge, info, igd_host_bridge_infos, + ARRAY_SIZE(igd_host_bridge_infos)); + if (!ret) { + trace_vfio_pci_igd_host_bridge_enabled(vdev->vbasedev.name); + } + + return ret; +} + +/* + * IGD LPC/ISA bridge support code. The vBIOS needs this, but we can't write + * arbitrary values into just any bridge, so we must create our own. We try + * to handle if the user has created it for us, which they might want to do + * to enable multifuction so we don't occupy the whole PCI slot. + */ +static void vfio_pci_igd_lpc_bridge_realize(PCIDevice *pdev, Error **errp) +{ + if (pdev->devfn != PCI_DEVFN(0x1f, 0)) { + error_setg(errp, "VFIO dummy ISA/LPC bridge must have address 1f.0"); + } +} + +static void vfio_pci_igd_lpc_bridge_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); + + dc->desc = "VFIO dummy ISA/LPC bridge for IGD assignment"; + dc->hotpluggable = false; + k->realize = vfio_pci_igd_lpc_bridge_realize; + k->class_id = PCI_CLASS_BRIDGE_ISA; +} + +static TypeInfo vfio_pci_igd_lpc_bridge_info = { + .name = "vfio-pci-igd-lpc-bridge", + .parent = TYPE_PCI_DEVICE, + .class_init = vfio_pci_igd_lpc_bridge_class_init, +}; + +static void vfio_pci_igd_register_types(void) +{ + type_register_static(&vfio_pci_igd_lpc_bridge_info); +} + +type_init(vfio_pci_igd_register_types) + +static int vfio_pci_igd_lpc_init(VFIOPCIDevice *vdev, + struct vfio_region_info *info) +{ + PCIDevice *lpc_bridge; + int ret; + + lpc_bridge = pci_find_device(pci_device_root_bus(&vdev->pdev), + 0, PCI_DEVFN(0x1f, 0)); + if (!lpc_bridge) { + lpc_bridge = pci_create_simple(pci_device_root_bus(&vdev->pdev), + PCI_DEVFN(0x1f, 0), "vfio-pci-igd-lpc-bridge"); + } + + ret = vfio_pci_igd_copy(vdev, lpc_bridge, info, igd_lpc_bridge_infos, + ARRAY_SIZE(igd_lpc_bridge_infos)); + if (!ret) { + trace_vfio_pci_igd_lpc_bridge_enabled(vdev->vbasedev.name); + } + + return ret; +} + +/* + * IGD Gen8 and newer support up to 8MB for the GTT and use a 64bit PTE + * entry, older IGDs use 2MB and 32bit. Each PTE maps a 4k page. Therefore + * we either have 2M/4k * 4 = 2k or 8M/4k * 8 = 16k as the maximum iobar index + * for programming the GTT. + * + * See linux:include/drm/i915_drm.h for shift and mask values. + */ +static int vfio_igd_gtt_max(VFIOPCIDevice *vdev) +{ + uint32_t gmch = vfio_pci_read_config(&vdev->pdev, IGD_GMCH, sizeof(gmch)); + int ggms, gen = igd_gen(vdev); + + gmch = vfio_pci_read_config(&vdev->pdev, IGD_GMCH, sizeof(gmch)); + ggms = (gmch >> (gen < 8 ? 8 : 6)) & 0x3; + if (gen > 6) { + ggms = 1 << ggms; + } + + ggms *= 1024 * 1024; + + return (ggms / (4 * 1024)) * (gen < 8 ? 4 : 8); +} + +/* + * The IGD ROM will make use of stolen memory (GGMS) for support of VESA modes. + * Somehow the host stolen memory range is used for this, but how the ROM gets + * it is a mystery, perhaps it's hardcoded into the ROM. Thankfully though, it + * reprograms the GTT through the IOBAR where we can trap it and transpose the + * programming to the VM allocated buffer. That buffer gets reserved by the VM + * firmware via the fw_cfg entry added below. Here we're just monitoring the + * IOBAR address and data registers to detect a write sequence targeting the + * GTTADR. This code is developed by observed behavior and doesn't have a + * direct spec reference, unfortunately. + */ +static uint64_t vfio_igd_quirk_data_read(void *opaque, + hwaddr addr, unsigned size) +{ + VFIOIGDQuirk *igd = opaque; + VFIOPCIDevice *vdev = igd->vdev; + + igd->index = ~0; + + return vfio_region_read(&vdev->bars[4].region, addr + 4, size); +} + +static void vfio_igd_quirk_data_write(void *opaque, hwaddr addr, + uint64_t data, unsigned size) +{ + VFIOIGDQuirk *igd = opaque; + VFIOPCIDevice *vdev = igd->vdev; + uint64_t val = data; + int gen = igd_gen(vdev); + + /* + * Programming the GGMS starts at index 0x1 and uses every 4th index (ie. + * 0x1, 0x5, 0x9, 0xd,...). For pre-Gen8 each 4-byte write is a whole PTE + * entry, with 0th bit enable set. For Gen8 and up, PTEs are 64bit, so + * entries 0x5 & 0xd are the high dword, in our case zero. Each PTE points + * to a 4k page, which we translate to a page from the VM allocated region, + * pointed to by the BDSM register. If this is not set, we fail. + * + * We trap writes to the full configured GTT size, but we typically only + * see the vBIOS writing up to (nearly) the 1MB barrier. In fact it often + * seems to miss the last entry for an even 1MB GTT. Doing a gratuitous + * write of that last entry does work, but is hopefully unnecessary since + * we clear the previous GTT on initialization. + */ + if ((igd->index % 4 == 1) && igd->index < vfio_igd_gtt_max(vdev)) { + if (gen < 8 || (igd->index % 8 == 1)) { + uint32_t base; + + base = pci_get_long(vdev->pdev.config + IGD_BDSM); + if (!base) { + hw_error("vfio-igd: Guest attempted to program IGD GTT before " + "BIOS reserved stolen memory. Unsupported BIOS?"); + } + + val = base | (data & ((1 << 20) - 1)); + } else { + val = 0; /* upper 32bits of pte, we only enable below 4G PTEs */ + } + + trace_vfio_pci_igd_bar4_write(vdev->vbasedev.name, + igd->index, data, val); + } + + vfio_region_write(&vdev->bars[4].region, addr + 4, val, size); + + igd->index = ~0; +} + +static const MemoryRegionOps vfio_igd_data_quirk = { + .read = vfio_igd_quirk_data_read, + .write = vfio_igd_quirk_data_write, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +static uint64_t vfio_igd_quirk_index_read(void *opaque, + hwaddr addr, unsigned size) +{ + VFIOIGDQuirk *igd = opaque; + VFIOPCIDevice *vdev = igd->vdev; + + igd->index = ~0; + + return vfio_region_read(&vdev->bars[4].region, addr, size); +} + +static void vfio_igd_quirk_index_write(void *opaque, hwaddr addr, + uint64_t data, unsigned size) +{ + VFIOIGDQuirk *igd = opaque; + VFIOPCIDevice *vdev = igd->vdev; + + igd->index = data; + + vfio_region_write(&vdev->bars[4].region, addr, data, size); +} + +static const MemoryRegionOps vfio_igd_index_quirk = { + .read = vfio_igd_quirk_index_read, + .write = vfio_igd_quirk_index_write, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +static void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr) +{ + struct vfio_region_info *rom = NULL, *opregion = NULL, + *host = NULL, *lpc = NULL; + VFIOQuirk *quirk; + VFIOIGDQuirk *igd; + PCIDevice *lpc_bridge; + int i, ret, ggms_mb, gms_mb = 0, gen; + uint64_t *bdsm_size; + uint32_t gmch; + uint16_t cmd_orig, cmd; + + /* + * This must be an Intel VGA device at address 00:02.0 for us to even + * consider enabling legacy mode. The vBIOS has dependencies on the + * PCI bus address. + */ + if (!vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, PCI_ANY_ID) || + !vfio_is_vga(vdev) || nr != 4 || + &vdev->pdev != pci_find_device(pci_device_root_bus(&vdev->pdev), + 0, PCI_DEVFN(0x2, 0))) { + return; + } + + /* + * We need to create an LPC/ISA bridge at PCI bus address 00:1f.0 that we + * can stuff host values into, so if there's already one there and it's not + * one we can hack on, legacy mode is no-go. Sorry Q35. + */ + lpc_bridge = pci_find_device(pci_device_root_bus(&vdev->pdev), + 0, PCI_DEVFN(0x1f, 0)); + if (lpc_bridge && !object_dynamic_cast(OBJECT(lpc_bridge), + "vfio-pci-igd-lpc-bridge")) { + error_report("IGD device %s cannot support legacy mode due to existing " + "devices at address 1f.0", vdev->vbasedev.name); + return; + } + + /* + * IGD is not a standard, they like to change their specs often. We + * only attempt to support back to SandBridge and we hope that newer + * devices maintain compatibility with generation 8. + */ + gen = igd_gen(vdev); + if (gen != 6 && gen != 8) { + error_report("IGD device %s is unsupported in legacy mode, " + "try SandyBridge or newer", vdev->vbasedev.name); + return; + } + + /* + * Most of what we're doing here is to enable the ROM to run, so if + * there's no ROM, there's no point in setting up this quirk. + * NB. We only seem to get BIOS ROMs, so a UEFI VM would need CSM support. + */ + ret = vfio_get_region_info(&vdev->vbasedev, + VFIO_PCI_ROM_REGION_INDEX, &rom); + if ((ret || !rom->size) && !vdev->pdev.romfile) { + error_report("IGD device %s has no ROM, legacy mode disabled", + vdev->vbasedev.name); + goto out; + } + + /* + * Ignore the hotplug corner case, mark the ROM failed, we can't + * create the devices we need for legacy mode in the hotplug scenario. + */ + if (vdev->pdev.qdev.hotplugged) { + error_report("IGD device %s hotplugged, ROM disabled, " + "legacy mode disabled", vdev->vbasedev.name); + vdev->rom_read_failed = true; + goto out; + } + + /* + * Check whether we have all the vfio device specific regions to + * support legacy mode (added in Linux v4.6). If not, bail. + */ + ret = vfio_get_dev_region_info(&vdev->vbasedev, + VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, + VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &opregion); + if (ret) { + error_report("IGD device %s does not support OpRegion access," + "legacy mode disabled", vdev->vbasedev.name); + goto out; + } + + ret = vfio_get_dev_region_info(&vdev->vbasedev, + VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, + VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG, &host); + if (ret) { + error_report("IGD device %s does not support host bridge access," + "legacy mode disabled", vdev->vbasedev.name); + goto out; + } + + ret = vfio_get_dev_region_info(&vdev->vbasedev, + VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, + VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG, &lpc); + if (ret) { + error_report("IGD device %s does not support LPC bridge access," + "legacy mode disabled", vdev->vbasedev.name); + goto out; + } + + gmch = vfio_pci_read_config(&vdev->pdev, IGD_GMCH, 4); + + /* + * If IGD VGA Disable is clear (expected) and VGA is not already enabled, + * try to enable it. Probably shouldn't be using legacy mode without VGA, + * but also no point in us enabling VGA if disabled in hardware. + */ + if (!(gmch & 0x2) && !vdev->vga && vfio_populate_vga(vdev)) { + error_report("IGD device %s failed to enable VGA access, " + "legacy mode disabled", vdev->vbasedev.name); + goto out; + } + + /* Create our LPC/ISA bridge */ + ret = vfio_pci_igd_lpc_init(vdev, lpc); + if (ret) { + error_report("IGD device %s failed to create LPC bridge, " + "legacy mode disabled", vdev->vbasedev.name); + goto out; + } + + /* Stuff some host values into the VM PCI host bridge */ + ret = vfio_pci_igd_host_init(vdev, host); + if (ret) { + error_report("IGD device %s failed to modify host bridge, " + "legacy mode disabled", vdev->vbasedev.name); + goto out; + } + + /* Setup OpRegion access */ + ret = vfio_pci_igd_opregion_init(vdev, opregion); + if (ret) { + error_report("IGD device %s failed to setup OpRegion, " + "legacy mode disabled", vdev->vbasedev.name); + goto out; + } + + /* Setup our quirk to munge GTT addresses to the VM allocated buffer */ + quirk = g_malloc0(sizeof(*quirk)); + quirk->mem = g_new0(MemoryRegion, 2); + quirk->nr_mem = 2; + igd = quirk->data = g_malloc0(sizeof(*igd)); + igd->vdev = vdev; + igd->index = ~0; + + memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_igd_index_quirk, + igd, "vfio-igd-index-quirk", 4); + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, + 0, &quirk->mem[0], 1); + + memory_region_init_io(&quirk->mem[1], OBJECT(vdev), &vfio_igd_data_quirk, + igd, "vfio-igd-data-quirk", 4); + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, + 4, &quirk->mem[1], 1); + + QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); + + /* Determine the size of stolen memory needed for GTT */ + ggms_mb = (gmch >> (gen < 8 ? 8 : 6)) & 0x3; + if (gen > 6) { + ggms_mb = 1 << ggms_mb; + } + + /* + * Assume we have no GMS memory, but allow it to be overrided by device + * option (experimental). The spec doesn't actually allow zero GMS when + * when IVD (IGD VGA Disable) is clear, but the claim is that it's unused, + * so let's not waste VM memory for it. + */ + gmch &= ~((gen < 8 ? 0x1f : 0xff) << (gen < 8 ? 3 : 8)); + + if (vdev->igd_gms) { + if (vdev->igd_gms <= 0x10) { + gms_mb = vdev->igd_gms * 32; + gmch |= vdev->igd_gms << (gen < 8 ? 3 : 8); + } else { + error_report("Unsupported IGD GMS value 0x%x", vdev->igd_gms); + vdev->igd_gms = 0; + } + } + + /* + * Request reserved memory for stolen memory via fw_cfg. VM firmware + * must allocate a 1MB aligned reserved memory region below 4GB with + * the requested size (in bytes) for use by the Intel PCI class VGA + * device at VM address 00:02.0. The base address of this reserved + * memory region must be written to the device BDSM regsiter at PCI + * config offset 0x5C. + */ + bdsm_size = g_malloc(sizeof(*bdsm_size)); + *bdsm_size = cpu_to_le64((ggms_mb + gms_mb) * 1024 * 1024); + fw_cfg_add_file(fw_cfg_find(), "etc/igd-bdsm-size", + bdsm_size, sizeof(*bdsm_size)); + + /* GMCH is read-only, emulated */ + pci_set_long(vdev->pdev.config + IGD_GMCH, gmch); + pci_set_long(vdev->pdev.wmask + IGD_GMCH, 0); + pci_set_long(vdev->emulated_config_bits + IGD_GMCH, ~0); + + /* BDSM is read-write, emulated. The BIOS needs to be able to write it */ + pci_set_long(vdev->pdev.config + IGD_BDSM, 0); + pci_set_long(vdev->pdev.wmask + IGD_BDSM, ~0); + pci_set_long(vdev->emulated_config_bits + IGD_BDSM, ~0); + + /* + * This IOBAR gives us access to GTTADR, which allows us to write to + * the GTT itself. So let's go ahead and write zero to all the GTT + * entries to avoid spurious DMA faults. Be sure I/O access is enabled + * before talking to the device. + */ + if (pread(vdev->vbasedev.fd, &cmd_orig, sizeof(cmd_orig), + vdev->config_offset + PCI_COMMAND) != sizeof(cmd_orig)) { + error_report("IGD device %s - failed to read PCI command register", + vdev->vbasedev.name); + } + + cmd = cmd_orig | PCI_COMMAND_IO; + + if (pwrite(vdev->vbasedev.fd, &cmd, sizeof(cmd), + vdev->config_offset + PCI_COMMAND) != sizeof(cmd)) { + error_report("IGD device %s - failed to write PCI command register", + vdev->vbasedev.name); + } + + for (i = 1; i < vfio_igd_gtt_max(vdev); i += 4) { + vfio_region_write(&vdev->bars[4].region, 0, i, 4); + vfio_region_write(&vdev->bars[4].region, 4, 0, 4); + } + + if (pwrite(vdev->vbasedev.fd, &cmd_orig, sizeof(cmd_orig), + vdev->config_offset + PCI_COMMAND) != sizeof(cmd_orig)) { + error_report("IGD device %s - failed to restore PCI command register", + vdev->vbasedev.name); + } + + trace_vfio_pci_igd_bdsm_enabled(vdev->vbasedev.name, ggms_mb + gms_mb); + +out: + g_free(rom); + g_free(opregion); + g_free(host); + g_free(lpc); +} + /* * Common quirk probe entry points. */ @@ -1010,6 +1650,7 @@ void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr) vfio_probe_nvidia_bar5_quirk(vdev, nr); vfio_probe_nvidia_bar0_quirk(vdev, nr); vfio_probe_rtl8168_bar2_quirk(vdev, nr); + vfio_probe_igd_bar4_quirk(vdev, nr); } void vfio_bar_quirk_exit(VFIOPCIDevice *vdev, int nr) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index aa6fb7bd6f..06d91cc892 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2605,6 +2605,13 @@ static void vfio_instance_finalize(Object *obj) vfio_bars_finalize(vdev); g_free(vdev->emulated_config_bits); g_free(vdev->rom); + /* + * XXX Leaking igd_opregion is not an oversight, we can't remove the + * fw_cfg entry therefore leaking this allocation seems like the safest + * option. + * + * g_free(vdev->igd_opregion); + */ vfio_put_device(vdev); vfio_put_group(group); } @@ -2689,6 +2696,7 @@ static Property vfio_pci_dev_properties[] = { sub_vendor_id, PCI_ANY_ID), DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice, sub_device_id, PCI_ANY_ID), + DEFINE_PROP_UINT32("x-igd-gms", VFIOPCIDevice, igd_gms, 0), /* * TODO - support passed fds... is this necessary? * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name), diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index 3976f68549..31ee8da4de 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -115,6 +115,7 @@ typedef struct VFIOPCIDevice { int interrupt; /* Current interrupt type */ VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */ VFIOVGA *vga; /* 0xa0000, 0x3b0, 0x3c0 */ + void *igd_opregion; PCIHostDeviceAddress host; EventNotifier err_notifier; EventNotifier req_notifier; @@ -129,6 +130,7 @@ typedef struct VFIOPCIDevice { #define VFIO_FEATURE_ENABLE_REQ_BIT 1 #define VFIO_FEATURE_ENABLE_REQ (1 << VFIO_FEATURE_ENABLE_REQ_BIT) int32_t bootindex; + uint32_t igd_gms; uint8_t pm_cap; bool has_vga; bool pci_aer; diff --git a/trace-events b/trace-events index 3d2c79ea02..018d75f0a8 100644 --- a/trace-events +++ b/trace-events @@ -1713,6 +1713,11 @@ vfio_quirk_ati_bonaire_reset_no_smc(const char *name) "%s" vfio_quirk_ati_bonaire_reset_timeout(const char *name) "%s" vfio_quirk_ati_bonaire_reset_done(const char *name) "%s" vfio_quirk_ati_bonaire_reset(const char *name) "%s" +vfio_pci_igd_bar4_write(const char *name, uint32_t index, uint32_t data, uint32_t base) "%s [%03x] %08x -> %08x" +vfio_pci_igd_bdsm_enabled(const char *name, int size) "%s %dMB" +vfio_pci_igd_opregion_enabled(const char *name) "%s" +vfio_pci_igd_host_bridge_enabled(const char *name) "%s" +vfio_pci_igd_lpc_bridge_enabled(const char *name) "%s" # hw/vfio/common.c vfio_region_write(const char *name, int index, uint64_t addr, uint64_t data, unsigned size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)" -- cgit v1.2.3 From 6ced0bba70ff557792b781ad35366de03bcd105b Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 26 May 2016 09:43:22 -0600 Subject: vfio/pci: Add a separate option for IGD OpRegion support The IGD OpRegion is enabled automatically when running in legacy mode, but it can sometimes be useful in universal passthrough mode as well. Without an OpRegion, output spigots don't work, and even though Intel doesn't officially support physical outputs in UPT mode, it's a useful feature. Note that if an OpRegion is enabled but a monitor is not connected, some graphics features will be disabled in the guest versus a headless system without an OpRegion, where they would work. Signed-off-by: Alex Williamson Reviewed-by: Gerd Hoffmann Tested-by: Gerd Hoffmann --- hw/vfio/pci-quirks.c | 4 ++-- hw/vfio/pci.c | 31 +++++++++++++++++++++++++++++++ hw/vfio/pci.h | 6 ++++++ 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c index f2b8ed54ab..35d32b78f4 100644 --- a/hw/vfio/pci-quirks.c +++ b/hw/vfio/pci-quirks.c @@ -1055,8 +1055,8 @@ typedef struct VFIOIGDQuirk { * the table and to write the base address of that memory to the ASLS register * of the IGD device. */ -static int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, - struct vfio_region_info *info) +int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, + struct vfio_region_info *info) { int ret; diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 06d91cc892..deab0c601a 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2562,6 +2562,35 @@ static int vfio_initfn(PCIDevice *pdev) vfio_bar_quirk_setup(vdev, i); } + if (!vdev->igd_opregion && + vdev->features & VFIO_FEATURE_ENABLE_IGD_OPREGION) { + struct vfio_region_info *opregion; + + if (vdev->pdev.qdev.hotplugged) { + error_report("Cannot support IGD OpRegion feature on hotplugged " + "device %s", vdev->vbasedev.name); + ret = -EINVAL; + goto out_teardown; + } + + ret = vfio_get_dev_region_info(&vdev->vbasedev, + VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, + VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &opregion); + if (ret) { + error_report("Device %s does not support requested IGD OpRegion " + "feature", vdev->vbasedev.name); + goto out_teardown; + } + + ret = vfio_pci_igd_opregion_init(vdev, opregion); + g_free(opregion); + if (ret) { + error_report("Device %s IGD OpRegion initialization failed", + vdev->vbasedev.name); + goto out_teardown; + } + } + /* QEMU emulates all of MSI & MSIX */ if (pdev->cap_present & QEMU_PCI_CAP_MSIX) { memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff, @@ -2686,6 +2715,8 @@ static Property vfio_pci_dev_properties[] = { VFIO_FEATURE_ENABLE_VGA_BIT, false), DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features, VFIO_FEATURE_ENABLE_REQ_BIT, true), + DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features, + VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false), DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false), DEFINE_PROP_BOOL("x-no-kvm-intx", VFIOPCIDevice, no_kvm_intx, false), DEFINE_PROP_BOOL("x-no-kvm-msi", VFIOPCIDevice, no_kvm_msi, false), diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index 31ee8da4de..b3eb0d838e 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -129,6 +129,9 @@ typedef struct VFIOPCIDevice { #define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT) #define VFIO_FEATURE_ENABLE_REQ_BIT 1 #define VFIO_FEATURE_ENABLE_REQ (1 << VFIO_FEATURE_ENABLE_REQ_BIT) +#define VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT 2 +#define VFIO_FEATURE_ENABLE_IGD_OPREGION \ + (1 << VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT) int32_t bootindex; uint32_t igd_gms; uint8_t pm_cap; @@ -161,4 +164,7 @@ void vfio_setup_resetfn_quirk(VFIOPCIDevice *vdev); int vfio_populate_vga(VFIOPCIDevice *vdev); +int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, + struct vfio_region_info *info); + #endif /* HW_VFIO_VFIO_PCI_H */ -- cgit v1.2.3 From 0eb734241762bc63bf8308bed3573431f195ddcf Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 26 May 2016 09:43:22 -0600 Subject: vfio/pci: Add IGD documentation Document the usage modes, host primary graphics considerations, usage, and fw_cfg ABI required for IGD assignment with vfio. Signed-off-by: Alex Williamson Reviewed-by: Gerd Hoffmann Tested-by: Gerd Hoffmann --- docs/igd-assign.txt | 133 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 docs/igd-assign.txt diff --git a/docs/igd-assign.txt b/docs/igd-assign.txt new file mode 100644 index 0000000000..e17bb50789 --- /dev/null +++ b/docs/igd-assign.txt @@ -0,0 +1,133 @@ +Intel Graphics Device (IGD) assignment with vfio-pci +==================================================== + +IGD has two different modes for assignment using vfio-pci: + +1) Universal Pass-Through (UPT) mode: + + In this mode the IGD device is added as a *secondary* (ie. non-primary) + graphics device in combination with an emulated primary graphics device. + This mode *requires* guest driver support to remove the external + dependencies generally associated with IGD (see below). Those guest + drivers only support this mode for Broadwell and newer IGD, according to + Intel. Additionally, this mode by default, and as officially supported + by Intel, does not support direct video output. The intention is to use + this mode either to provide hardware acceleration to the emulated graphics + or to use this mode in combination with guest-based remote access software, + for example VNC (see below for optional output support). This mode + theoretically has no device specific handling dependencies on vfio-pci or + the VM firmware. + +2) "Legacy" mode: + + In this mode the IGD device is intended to be the primary and exclusive + graphics device in the VM[1], as such QEMU does not facilitate any sort + of remote graphics to the VM in this mode. A connected physical monitor + is the intended output device for IGD. This mode includes several + requirements and restrictions: + + * IGD must be given address 02.0 on the PCI root bus in the VM + * The host kernel must support vfio extensions for IGD (v4.6) + * vfio VGA support very likely needs to be enabled in the host kernel + * The VM firmware must support specific fw_cfg enablers for IGD + * The VM machine type must support a PCI host bridge at 00.0 (standard) + * The VM machine type must provide or allow to be created a special + ISA/LPC bridge device (vfio-pci-igd-lpc-bridge) on the root bus at + PCI address 1f.0. + * The IGD device must have a VGA ROM, either provided via the romfile + option or loaded automatically through vfio (standard). rombar=0 + will disable legacy mode support. + * Hotplug of the IGD device is not supported. + * The IGD device must be a SandyBridge or newer model device. + +For either mode, depending on the host kernel, the i915 driver in the host +may generate faults and errors upon re-binding to an IGD device after it +has been assigned to a VM. It's therefore generally recommended to prevent +such driver binding unless the host driver is known to work well for this. +There are numerous ways to do this, i915 can be blacklisted on the host, +the driver_override option can be used to ensure that only vfio-pci can bind +to the device on the host[2], virsh nodedev-detach can be used to bind the +device to vfio drivers and then managed='no' set in the VM xml to prevent +re-binding to i915, etc. Also note that IGD is also typically the primary +graphics in the host and special options may be required beyond simply +blacklisting i915 or using pci-stub/vfio-pci to take ownership of IGD as a +PCI class device. Lower level drivers exist that may still claim the device. +It may therefore be necessary to use kernel boot options video=vesafb:off or +video=efifb:off (depending on host BIOS/UEFI) or these can be combined to +a catch-all, video=vesafb:off,efifb:off. Error messages such as: + + Failed to mmap 0000:00:02.0 BAR <>. Performance may be slow + +are a good indicator that such a problem exists. The host files /proc/iomem +and /proc/ioports are often useful for identifying drivers consuming ranges +of the device to cause such conflicts. + +Additionally, IGD device are known to generate small numbers of DMAR faults +when initially assigned. It is believed that this is simply the IGD attempting +to access the reserved GTT space after reset, which it no longer has access to +when accessed from userspace. So long as the DMAR faults are small in number +and most importantly, not ongoing, these are not an indication of an error. + +Additionally++, analog VGA output (as opposed to digital outputs like HDMI, +DVI, or DisplayPort) may be unsupported in some use cases. In the author's +experience, even DP to VGA adapters can be troublesome while adapters between +digital formats work well. + +Usage +===== +The intention is for IGD assignment to be transparent for users and thus for +management tools like libvirt. To make use of legacy mode, simply remove all +other graphics options and use "-nographic" and either "-vga none" or +"-nodefaults", along with adding the device using vfio-pci: + + -device vfio-pci,host=00:02.0,id=hostdev0,bus=pci.0,addr=0x2 + +For UPT mode, retain the default emulated graphics and simply add the vfio-pci +device making use of any other bus address other than 02.0. libvirt will +default to assigning the device a UPT compatible address while legacy mode +users will need to manually edit the XML if using a tool like virt-manager +where the VM device address is not expressly specified. + +An experimental vfio-pci option also exists to enable OpRegion, and thus +external monitor support, for UPT mode. This can be enabled by adding +"x-igd-opregion=on" to the vfio-pci device options for the IGD device. As +with legacy mode, this requires the host to support features introduced in +the v4.6 kernel. If Intel chooses to embrace this support, the option may +be made non-experimental in the future, opening it to libvirt support. + +Developer ABI +============= +Legacy mode IGD support imposes two fw_cfg requirements on the VM firmware: + +1) "etc/igd-opregion" + + This fw_cfg file exposes the OpRegion for the IGD device. A reserved + region should be created below 4GB (recommended 4KB alignment), sized + sufficient for the fw_cfg file size, and the content of this file copied + to it. The dword based address of this reserved memory region must also + be written to the ASLS register at offset 0xFC on the IGD device. It is + recommended that firmware should make use of this fw_cfg entry for any + PCI class VGA device with Intel vendor ID. Multiple of such devices + within a VM is undefined. + +2) "etc/igd-bdsm-size" + + This fw_cfg file contains an 8-byte, little endian integer indicating + the size of the reserved memory region required for IGD stolen memory. + Firmware must allocate a reserved memory below 4GB with required 1MB + alignment equal to this size. Additionally the base address of this + reserved region must be written to the dword BDSM register in PCI config + space of the IGD device at offset 0x5C. As this support is related to + running the IGD ROM, which has other dependencies on the device appearing + at guest address 00:02.0, it's expected that this fw_cfg file is only + relevant to a single PCI class VGA device with Intel vendor ID, appearing + at PCI bus address 00:02.0. + +Footnotes +========= +[1] Nothing precludes adding additional emulated or assigned graphics devices + as non-primary, other than the combination typically not working. I only + intend to set user expectations, others are welcome to find working + combinations or fix whatever issues prevent this from working in the common + case. +[2] # echo "vfio-pci" > /sys/bus/pci/devices/0000:00:02.0/driver_override -- cgit v1.2.3 From 7a057b4fb9dec2baf975d1737a7258602e793c40 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 26 May 2016 09:43:22 -0600 Subject: vfio: Fix 128 bit handling when deleting region 7532d3cbf "vfio: Fix 128 bit handling" added support for 64bit IOMMU memory regions when those are added to VFIO address space; however removing code cannot cope with these as int128_get64() will fail on 1<<64. This copies 128bit handling from region_add() to region_del(). Since the only machine type which is actually going to use 64bit IOMMU is pseries and it never really removes them (instead it will dynamically add/remove subregions), this should cause no behavioral change. Signed-off-by: Alexey Kardashevskiy Signed-off-by: Alex Williamson --- hw/vfio/common.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 902a047947..29f8f84cc8 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -433,6 +433,7 @@ static void vfio_listener_region_del(MemoryListener *listener, { VFIOContainer *container = container_of(listener, VFIOContainer, listener); hwaddr iova, end; + Int128 llend, llsize; int ret; if (vfio_listener_skipped_section(section)) { @@ -471,21 +472,25 @@ static void vfio_listener_region_del(MemoryListener *listener, } iova = TARGET_PAGE_ALIGN(section->offset_within_address_space); - end = (section->offset_within_address_space + int128_get64(section->size)) & - TARGET_PAGE_MASK; + llend = int128_make64(section->offset_within_address_space); + llend = int128_add(llend, section->size); + llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK)); - if (iova >= end) { + if (int128_ge(int128_make64(iova), llend)) { return; } + end = int128_get64(int128_sub(llend, int128_one())); + + llsize = int128_sub(llend, int128_make64(iova)); - trace_vfio_listener_region_del(iova, end - 1); + trace_vfio_listener_region_del(iova, end); - ret = vfio_dma_unmap(container, iova, end - iova); + ret = vfio_dma_unmap(container, iova, int128_get64(llsize)); memory_region_unref(section->mr); if (ret) { error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%m)", - container, iova, end - iova, ret); + container, iova, int128_get64(llsize), ret); } } -- cgit v1.2.3 From d78c19b5cf4821d0c198f4132a085bdbf19dda4c Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 26 May 2016 09:43:23 -0600 Subject: memory: Fix IOMMU replay base address Since a788f227 "memory: Allow replay of IOMMU mapping notifications" when new VFIO listener is added, all existing IOMMU mappings are replayed. However there is a problem that the base address of an IOMMU memory region (IOMMU MR) is ignored which is not a problem for the existing user (which is pseries) with its default 32bit DMA window starting at 0 but it is if there is another DMA window. This stores the IOMMU's offset_within_address_space and adjusts the IOVA before calling vfio_dma_map/vfio_dma_unmap. As the IOMMU notifier expects IOVA offset rather than the absolute address, this also adjusts IOVA in sPAPR H_PUT_TCE handler before calling notifier(s). Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Alex Williamson --- hw/ppc/spapr_iommu.c | 2 +- hw/vfio/common.c | 14 ++++++++------ include/hw/vfio/vfio-common.h | 1 + 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c index 722db91ffa..b6fe1dd986 100644 --- a/hw/ppc/spapr_iommu.c +++ b/hw/ppc/spapr_iommu.c @@ -278,7 +278,7 @@ static target_ulong put_tce_emu(sPAPRTCETable *tcet, target_ulong ioba, tcet->table[index] = tce; entry.target_as = &address_space_memory, - entry.iova = ioba & page_mask; + entry.iova = (ioba - tcet->bus_offset) & page_mask; entry.translated_addr = tce & page_mask; entry.addr_mask = ~page_mask; entry.perm = spapr_tce_iommu_access_flags(tce); diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 29f8f84cc8..e2d5a8d9d5 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -260,14 +260,14 @@ static void vfio_iommu_map_notify(Notifier *n, void *data) VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); VFIOContainer *container = giommu->container; IOMMUTLBEntry *iotlb = data; + hwaddr iova = iotlb->iova + giommu->iommu_offset; MemoryRegion *mr; hwaddr xlat; hwaddr len = iotlb->addr_mask + 1; void *vaddr; int ret; - trace_vfio_iommu_map_notify(iotlb->iova, - iotlb->iova + iotlb->addr_mask); + trace_vfio_iommu_map_notify(iova, iova + iotlb->addr_mask); /* * The IOMMU TLB entry we have just covers translation through @@ -294,21 +294,21 @@ static void vfio_iommu_map_notify(Notifier *n, void *data) if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) { vaddr = memory_region_get_ram_ptr(mr) + xlat; - ret = vfio_dma_map(container, iotlb->iova, + ret = vfio_dma_map(container, iova, iotlb->addr_mask + 1, vaddr, !(iotlb->perm & IOMMU_WO) || mr->readonly); if (ret) { error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx", %p) = %d (%m)", - container, iotlb->iova, + container, iova, iotlb->addr_mask + 1, vaddr, ret); } } else { - ret = vfio_dma_unmap(container, iotlb->iova, iotlb->addr_mask + 1); + ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1); if (ret) { error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%m)", - container, iotlb->iova, + container, iova, iotlb->addr_mask + 1, ret); } } @@ -380,6 +380,8 @@ static void vfio_listener_region_add(MemoryListener *listener, */ giommu = g_malloc0(sizeof(*giommu)); giommu->iommu = section->mr; + giommu->iommu_offset = section->offset_within_address_space - + section->offset_within_region; giommu->container = container; giommu->n.notify = vfio_iommu_map_notify; QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next); diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index a947f9ccda..0610377789 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -90,6 +90,7 @@ typedef struct VFIOContainer { typedef struct VFIOGuestIOMMU { VFIOContainer *container; MemoryRegion *iommu; + hwaddr iommu_offset; Notifier n; QLIST_ENTRY(VFIOGuestIOMMU) giommu_next; } VFIOGuestIOMMU; -- cgit v1.2.3 From f1f9365019bb257af087b454972c396bb0d53b26 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 26 May 2016 09:43:23 -0600 Subject: vfio: Check that IOMMU MR translates to system address space At the moment IOMMU MR only translate to the system memory. However if some new code changes this, we will need clear indication why it is not working so here is the check. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Alex Williamson --- hw/vfio/common.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index e2d5a8d9d5..e51ed3a348 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -269,6 +269,12 @@ static void vfio_iommu_map_notify(Notifier *n, void *data) trace_vfio_iommu_map_notify(iova, iova + iotlb->addr_mask); + if (iotlb->target_as != &address_space_memory) { + error_report("Wrong target AS \"%s\", only system memory is allowed", + iotlb->target_as->name ? iotlb->target_as->name : "none"); + return; + } + /* * The IOMMU TLB entry we have just covers translation through * this IOMMU to its immediate target. We need to translate -- cgit v1.2.3