aboutsummaryrefslogtreecommitdiff
path: root/hw
diff options
context:
space:
mode:
Diffstat (limited to 'hw')
-rw-r--r--hw/misc/vfio.c428
1 files changed, 380 insertions, 48 deletions
diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
index 9cf5b84045..7437c2e3c3 100644
--- a/hw/misc/vfio.c
+++ b/hw/misc/vfio.c
@@ -133,6 +133,15 @@ enum {
VFIO_INT_MSIX = 3,
};
+typedef struct VFIOAddressSpace {
+ AddressSpace *as;
+ QLIST_HEAD(, VFIOContainer) containers;
+ QLIST_ENTRY(VFIOAddressSpace) list;
+} VFIOAddressSpace;
+
+static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces =
+ QLIST_HEAD_INITIALIZER(vfio_address_spaces);
+
struct VFIOGroup;
typedef struct VFIOType1 {
@@ -142,6 +151,7 @@ typedef struct VFIOType1 {
} VFIOType1;
typedef struct VFIOContainer {
+ VFIOAddressSpace *space;
int fd; /* /dev/vfio/vfio, empowered by the attached groups */
struct {
/* enable abstraction to support various iommu backends */
@@ -150,10 +160,18 @@ typedef struct VFIOContainer {
};
void (*release)(struct VFIOContainer *);
} iommu_data;
+ QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
QLIST_HEAD(, VFIOGroup) group_list;
QLIST_ENTRY(VFIOContainer) next;
} VFIOContainer;
+typedef struct VFIOGuestIOMMU {
+ VFIOContainer *container;
+ MemoryRegion *iommu;
+ Notifier n;
+ QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
+} VFIOGuestIOMMU;
+
/* Cache of MSI-X setup plus extra mmap and memory region for split BAR map */
typedef struct VFIOMSIXInfo {
uint8_t table_bar;
@@ -234,9 +252,6 @@ static const VFIORomBlacklistEntry romblacklist[] = {
#define MSIX_CAP_LENGTH 12
-static QLIST_HEAD(, VFIOContainer)
- container_list = QLIST_HEAD_INITIALIZER(container_list);
-
static QLIST_HEAD(, VFIOGroup)
group_list = QLIST_HEAD_INITIALIZER(group_list);
@@ -1668,6 +1683,149 @@ static void vfio_probe_ati_bar4_window_quirk(VFIODevice *vdev, int nr)
vdev->host.function);
}
+#define PCI_VENDOR_ID_REALTEK 0x10ec
+
+/*
+ * RTL8168 devices have a backdoor that can access the MSI-X table. At BAR2
+ * offset 0x70 there is a dword data register, offset 0x74 is a dword address
+ * register. According to the Linux r8169 driver, the MSI-X table is addressed
+ * when the "type" portion of the address register is set to 0x1. This appears
+ * to be bits 16:30. Bit 31 is both a write indicator and some sort of
+ * "address latched" indicator. Bits 12:15 are a mask field, which we can
+ * ignore because the MSI-X table should always be accessed as a dword (full
+ * mask). Bits 0:11 is offset within the type.
+ *
+ * Example trace:
+ *
+ * Read from MSI-X table offset 0
+ * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x1f000, 4) // store read addr
+ * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x8001f000 // latch
+ * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x70, 4) = 0xfee00398 // read data
+ *
+ * Write 0xfee00000 to MSI-X table offset 0
+ * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x70, 0xfee00000, 4) // write data
+ * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x8001f000, 4) // do write
+ * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x1f000 // complete
+ */
+
+static uint64_t vfio_rtl8168_window_quirk_read(void *opaque,
+ hwaddr addr, unsigned size)
+{
+ VFIOQuirk *quirk = opaque;
+ VFIODevice *vdev = quirk->vdev;
+
+ switch (addr) {
+ case 4: /* address */
+ if (quirk->data.flags) {
+ DPRINTF("%s fake read(%04x:%02x:%02x.%d)\n",
+ memory_region_name(&quirk->mem), vdev->host.domain,
+ vdev->host.bus, vdev->host.slot, vdev->host.function);
+
+ return quirk->data.address_match ^ 0x10000000U;
+ }
+ break;
+ case 0: /* data */
+ if (quirk->data.flags) {
+ uint64_t val;
+
+ DPRINTF("%s MSI-X table read(%04x:%02x:%02x.%d)\n",
+ memory_region_name(&quirk->mem), vdev->host.domain,
+ vdev->host.bus, vdev->host.slot, vdev->host.function);
+
+ if (!(vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX)) {
+ return 0;
+ }
+
+ io_mem_read(&vdev->pdev.msix_table_mmio,
+ (hwaddr)(quirk->data.address_match & 0xfff),
+ &val, size);
+ return val;
+ }
+ }
+
+ DPRINTF("%s direct read(%04x:%02x:%02x.%d)\n",
+ memory_region_name(&quirk->mem), vdev->host.domain,
+ vdev->host.bus, vdev->host.slot, vdev->host.function);
+
+ return vfio_bar_read(&vdev->bars[quirk->data.bar], addr + 0x70, size);
+}
+
+static void vfio_rtl8168_window_quirk_write(void *opaque, hwaddr addr,
+ uint64_t data, unsigned size)
+{
+ VFIOQuirk *quirk = opaque;
+ VFIODevice *vdev = quirk->vdev;
+
+ switch (addr) {
+ case 4: /* address */
+ if ((data & 0x7fff0000) == 0x10000) {
+ if (data & 0x10000000U &&
+ vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX) {
+
+ DPRINTF("%s MSI-X table write(%04x:%02x:%02x.%d)\n",
+ memory_region_name(&quirk->mem), vdev->host.domain,
+ vdev->host.bus, vdev->host.slot, vdev->host.function);
+
+ io_mem_write(&vdev->pdev.msix_table_mmio,
+ (hwaddr)(quirk->data.address_match & 0xfff),
+ data, size);
+ }
+
+ quirk->data.flags = 1;
+ quirk->data.address_match = data;
+
+ return;
+ }
+ quirk->data.flags = 0;
+ break;
+ case 0: /* data */
+ quirk->data.address_mask = data;
+ break;
+ }
+
+ DPRINTF("%s direct write(%04x:%02x:%02x.%d)\n",
+ memory_region_name(&quirk->mem), vdev->host.domain,
+ vdev->host.bus, vdev->host.slot, vdev->host.function);
+
+ vfio_bar_write(&vdev->bars[quirk->data.bar], addr + 0x70, data, size);
+}
+
+static const MemoryRegionOps vfio_rtl8168_window_quirk = {
+ .read = vfio_rtl8168_window_quirk_read,
+ .write = vfio_rtl8168_window_quirk_write,
+ .valid = {
+ .min_access_size = 4,
+ .max_access_size = 4,
+ .unaligned = false,
+ },
+ .endianness = DEVICE_LITTLE_ENDIAN,
+};
+
+static void vfio_probe_rtl8168_bar2_window_quirk(VFIODevice *vdev, int nr)
+{
+ PCIDevice *pdev = &vdev->pdev;
+ VFIOQuirk *quirk;
+
+ if (pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_REALTEK ||
+ pci_get_word(pdev->config + PCI_DEVICE_ID) != 0x8168 || nr != 2) {
+ return;
+ }
+
+ quirk = g_malloc0(sizeof(*quirk));
+ quirk->vdev = vdev;
+ quirk->data.bar = nr;
+
+ memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_rtl8168_window_quirk,
+ quirk, "vfio-rtl8168-window-quirk", 8);
+ memory_region_add_subregion_overlap(&vdev->bars[nr].mem,
+ 0x70, &quirk->mem, 1);
+
+ QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
+
+ DPRINTF("Enabled RTL8168 BAR2 window quirk for device %04x:%02x:%02x.%x\n",
+ vdev->host.domain, vdev->host.bus, vdev->host.slot,
+ vdev->host.function);
+}
/*
* Trap the BAR2 MMIO window to config space as well.
*/
@@ -2071,6 +2229,7 @@ static void vfio_bar_quirk_setup(VFIODevice *vdev, int nr)
vfio_probe_nvidia_bar5_window_quirk(vdev, nr);
vfio_probe_nvidia_bar0_88000_quirk(vdev, nr);
vfio_probe_nvidia_bar0_1800_quirk(vdev, nr);
+ vfio_probe_rtl8168_bar2_window_quirk(vdev, nr);
}
static void vfio_bar_quirk_teardown(VFIODevice *vdev, int nr)
@@ -2232,7 +2391,8 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
static bool vfio_listener_skipped_section(MemoryRegionSection *section)
{
- return !memory_region_is_ram(section->mr) ||
+ return (!memory_region_is_ram(section->mr) &&
+ !memory_region_is_iommu(section->mr)) ||
/*
* Sizing an enabled 64-bit BAR can cause spurious mappings to
* addresses in the upper part of the 64-bit address space. These
@@ -2242,17 +2402,75 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section)
section->offset_within_address_space & (1ULL << 63);
}
+static void vfio_iommu_map_notify(Notifier *n, void *data)
+{
+ VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
+ VFIOContainer *container = giommu->container;
+ IOMMUTLBEntry *iotlb = data;
+ MemoryRegion *mr;
+ hwaddr xlat;
+ hwaddr len = iotlb->addr_mask + 1;
+ void *vaddr;
+ int ret;
+
+ DPRINTF("iommu map @ %"HWADDR_PRIx" - %"HWADDR_PRIx"\n",
+ iotlb->iova, iotlb->iova + iotlb->addr_mask);
+
+ /*
+ * The IOMMU TLB entry we have just covers translation through
+ * this IOMMU to its immediate target. We need to translate
+ * it the rest of the way through to memory.
+ */
+ mr = address_space_translate(&address_space_memory,
+ iotlb->translated_addr,
+ &xlat, &len, iotlb->perm & IOMMU_WO);
+ if (!memory_region_is_ram(mr)) {
+ DPRINTF("iommu map to non memory area %"HWADDR_PRIx"\n",
+ xlat);
+ return;
+ }
+ /*
+ * Translation truncates length to the IOMMU page size,
+ * check that it did not truncate too much.
+ */
+ if (len & iotlb->addr_mask) {
+ DPRINTF("iommu has granularity incompatible with target AS\n");
+ return;
+ }
+
+ if (iotlb->perm != IOMMU_NONE) {
+ vaddr = memory_region_get_ram_ptr(mr) + xlat;
+
+ ret = vfio_dma_map(container, iotlb->iova,
+ iotlb->addr_mask + 1, vaddr,
+ !(iotlb->perm & IOMMU_WO) || mr->readonly);
+ if (ret) {
+ error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
+ "0x%"HWADDR_PRIx", %p) = %d (%m)",
+ container, iotlb->iova,
+ iotlb->addr_mask + 1, vaddr, ret);
+ }
+ } else {
+ ret = vfio_dma_unmap(container, iotlb->iova, iotlb->addr_mask + 1);
+ if (ret) {
+ error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
+ "0x%"HWADDR_PRIx") = %d (%m)",
+ container, iotlb->iova,
+ iotlb->addr_mask + 1, ret);
+ }
+ }
+}
+
static void vfio_listener_region_add(MemoryListener *listener,
MemoryRegionSection *section)
{
VFIOContainer *container = container_of(listener, VFIOContainer,
iommu_data.type1.listener);
hwaddr iova, end;
+ Int128 llend;
void *vaddr;
int ret;
- assert(!memory_region_is_iommu(section->mr));
-
if (vfio_listener_skipped_section(section)) {
DPRINTF("SKIPPING region_add %"HWADDR_PRIx" - %"PRIx64"\n",
section->offset_within_address_space,
@@ -2268,21 +2486,65 @@ static void vfio_listener_region_add(MemoryListener *listener,
}
iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
- end = (section->offset_within_address_space + int128_get64(section->size)) &
- TARGET_PAGE_MASK;
+ llend = int128_make64(section->offset_within_address_space);
+ llend = int128_add(llend, section->size);
+ llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
- if (iova >= end) {
+ if (int128_ge(int128_make64(iova), llend)) {
return;
}
+ memory_region_ref(section->mr);
+
+ if (memory_region_is_iommu(section->mr)) {
+ VFIOGuestIOMMU *giommu;
+
+ DPRINTF("region_add [iommu] %"HWADDR_PRIx" - %"HWADDR_PRIx"\n",
+ iova, int128_get64(int128_sub(llend, int128_one())));
+ /*
+ * FIXME: We should do some checking to see if the
+ * capabilities of the host VFIO IOMMU are adequate to model
+ * the guest IOMMU
+ *
+ * FIXME: For VFIO iommu types which have KVM acceleration to
+ * avoid bouncing all map/unmaps through qemu this way, this
+ * would be the right place to wire that up (tell the KVM
+ * device emulation the VFIO iommu handles to use).
+ */
+ /*
+ * This assumes that the guest IOMMU is empty of
+ * mappings at this point.
+ *
+ * One way of doing this is:
+ * 1. Avoid sharing IOMMUs between emulated devices or different
+ * IOMMU groups.
+ * 2. Implement VFIO_IOMMU_ENABLE in the host kernel to fail if
+ * there are some mappings in IOMMU.
+ *
+ * VFIO on SPAPR does that. Other IOMMU models may do that different,
+ * they must make sure there are no existing mappings or
+ * loop through existing mappings to map them into VFIO.
+ */
+ giommu = g_malloc0(sizeof(*giommu));
+ giommu->iommu = section->mr;
+ giommu->container = container;
+ giommu->n.notify = vfio_iommu_map_notify;
+ QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
+ memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);
+
+ return;
+ }
+
+ /* Here we assume that memory_region_is_ram(section->mr)==true */
+
+ end = int128_get64(llend);
vaddr = memory_region_get_ram_ptr(section->mr) +
section->offset_within_region +
(iova - section->offset_within_address_space);
- DPRINTF("region_add %"HWADDR_PRIx" - %"HWADDR_PRIx" [%p]\n",
+ DPRINTF("region_add [ram] %"HWADDR_PRIx" - %"HWADDR_PRIx" [%p]\n",
iova, end - 1, vaddr);
- memory_region_ref(section->mr);
ret = vfio_dma_map(container, iova, end - iova, vaddr, section->readonly);
if (ret) {
error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
@@ -2326,6 +2588,27 @@ static void vfio_listener_region_del(MemoryListener *listener,
return;
}
+ if (memory_region_is_iommu(section->mr)) {
+ VFIOGuestIOMMU *giommu;
+
+ QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
+ if (giommu->iommu == section->mr) {
+ memory_region_unregister_iommu_notifier(&giommu->n);
+ QLIST_REMOVE(giommu, giommu_next);
+ g_free(giommu);
+ break;
+ }
+ }
+
+ /*
+ * FIXME: We assume the one big unmap below is adequate to
+ * remove any individual page mappings in the IOMMU which
+ * might have been copied into VFIO. This works for a page table
+ * based IOMMU where a big unmap flattens a large range of IO-PTEs.
+ * That may not be true for all IOMMU types.
+ */
+ }
+
iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
end = (section->offset_within_address_space + int128_get64(section->size)) &
TARGET_PAGE_MASK;
@@ -3274,16 +3557,43 @@ static void vfio_kvm_device_del_group(VFIOGroup *group)
#endif
}
-static int vfio_connect_container(VFIOGroup *group)
+static VFIOAddressSpace *vfio_get_address_space(AddressSpace *as)
+{
+ VFIOAddressSpace *space;
+
+ QLIST_FOREACH(space, &vfio_address_spaces, list) {
+ if (space->as == as) {
+ return space;
+ }
+ }
+
+ /* No suitable VFIOAddressSpace, create a new one */
+ space = g_malloc0(sizeof(*space));
+ space->as = as;
+ QLIST_INIT(&space->containers);
+
+ QLIST_INSERT_HEAD(&vfio_address_spaces, space, list);
+
+ return space;
+}
+
+static void vfio_put_address_space(VFIOAddressSpace *space)
+{
+ if (QLIST_EMPTY(&space->containers)) {
+ QLIST_REMOVE(space, list);
+ g_free(space);
+ }
+}
+
+static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
{
VFIOContainer *container;
int ret, fd;
+ VFIOAddressSpace *space;
- if (group->container) {
- return 0;
- }
+ space = vfio_get_address_space(as);
- QLIST_FOREACH(container, &container_list, next) {
+ QLIST_FOREACH(container, &space->containers, next) {
if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
group->container = container;
QLIST_INSERT_HEAD(&container->group_list, group, container_next);
@@ -3294,35 +3604,35 @@ static int vfio_connect_container(VFIOGroup *group)
fd = qemu_open("/dev/vfio/vfio", O_RDWR);
if (fd < 0) {
error_report("vfio: failed to open /dev/vfio/vfio: %m");
- return -errno;
+ ret = -errno;
+ goto put_space_exit;
}
ret = ioctl(fd, VFIO_GET_API_VERSION);
if (ret != VFIO_API_VERSION) {
error_report("vfio: supported vfio version: %d, "
"reported version: %d", VFIO_API_VERSION, ret);
- close(fd);
- return -EINVAL;
+ ret = -EINVAL;
+ goto close_fd_exit;
}
container = g_malloc0(sizeof(*container));
+ container->space = space;
container->fd = fd;
if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
if (ret) {
error_report("vfio: failed to set group container: %m");
- g_free(container);
- close(fd);
- return -errno;
+ ret = -errno;
+ goto free_container_exit;
}
ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
if (ret) {
error_report("vfio: failed to set iommu for container: %m");
- g_free(container);
- close(fd);
- return -errno;
+ ret = -errno;
+ goto free_container_exit;
}
container->iommu_data.type1.listener = vfio_memory_listener;
@@ -3333,29 +3643,39 @@ static int vfio_connect_container(VFIOGroup *group)
if (container->iommu_data.type1.error) {
ret = container->iommu_data.type1.error;
- vfio_listener_release(container);
- g_free(container);
- close(fd);
error_report("vfio: memory listener initialization failed for container");
- return ret;
+ goto listener_release_exit;
}
container->iommu_data.type1.initialized = true;
} else {
error_report("vfio: No available IOMMU models");
- g_free(container);
- close(fd);
- return -EINVAL;
+ ret = -EINVAL;
+ goto free_container_exit;
}
QLIST_INIT(&container->group_list);
- QLIST_INSERT_HEAD(&container_list, container, next);
+ QLIST_INSERT_HEAD(&space->containers, container, next);
group->container = container;
QLIST_INSERT_HEAD(&container->group_list, group, container_next);
return 0;
+
+listener_release_exit:
+ vfio_listener_release(container);
+
+free_container_exit:
+ g_free(container);
+
+close_fd_exit:
+ close(fd);
+
+put_space_exit:
+ vfio_put_address_space(space);
+
+ return ret;
}
static void vfio_disconnect_container(VFIOGroup *group)
@@ -3371,6 +3691,8 @@ static void vfio_disconnect_container(VFIOGroup *group)
group->container = NULL;
if (QLIST_EMPTY(&container->group_list)) {
+ VFIOAddressSpace *space = container->space;
+
if (container->iommu_data.release) {
container->iommu_data.release(container);
}
@@ -3378,10 +3700,12 @@ static void vfio_disconnect_container(VFIOGroup *group)
DPRINTF("vfio_disconnect_container: close container->fd\n");
close(container->fd);
g_free(container);
+
+ vfio_put_address_space(space);
}
}
-static VFIOGroup *vfio_get_group(int groupid)
+static VFIOGroup *vfio_get_group(int groupid, AddressSpace *as)
{
VFIOGroup *group;
char path[32];
@@ -3389,7 +3713,14 @@ static VFIOGroup *vfio_get_group(int groupid)
QLIST_FOREACH(group, &group_list, next) {
if (group->groupid == groupid) {
- return group;
+ /* Found it. Now is it already in the right context? */
+ if (group->container->space->as == as) {
+ return group;
+ } else {
+ error_report("vfio: group %d used in multiple address spaces",
+ group->groupid);
+ return NULL;
+ }
}
}
@@ -3399,34 +3730,27 @@ static VFIOGroup *vfio_get_group(int groupid)
group->fd = qemu_open(path, O_RDWR);
if (group->fd < 0) {
error_report("vfio: error opening %s: %m", path);
- g_free(group);
- return NULL;
+ goto free_group_exit;
}
if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
error_report("vfio: error getting group status: %m");
- close(group->fd);
- g_free(group);
- return NULL;
+ goto close_fd_exit;
}
if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
error_report("vfio: error, group %d is not viable, please ensure "
"all devices within the iommu_group are bound to their "
"vfio bus driver.", groupid);
- close(group->fd);
- g_free(group);
- return NULL;
+ goto close_fd_exit;
}
group->groupid = groupid;
QLIST_INIT(&group->device_list);
- if (vfio_connect_container(group)) {
+ if (vfio_connect_container(group, as)) {
error_report("vfio: failed to setup container for group %d", groupid);
- close(group->fd);
- g_free(group);
- return NULL;
+ goto close_fd_exit;
}
if (QLIST_EMPTY(&group_list)) {
@@ -3438,6 +3762,14 @@ static VFIOGroup *vfio_get_group(int groupid)
vfio_kvm_device_add_group(group);
return group;
+
+close_fd_exit:
+ close(group->fd);
+
+free_group_exit:
+ g_free(group);
+
+ return NULL;
}
static void vfio_put_group(VFIOGroup *group)
@@ -3768,7 +4100,7 @@ static int vfio_initfn(PCIDevice *pdev)
DPRINTF("%s(%04x:%02x:%02x.%x) group %d\n", __func__, vdev->host.domain,
vdev->host.bus, vdev->host.slot, vdev->host.function, groupid);
- group = vfio_get_group(groupid);
+ group = vfio_get_group(groupid, pci_device_iommu_address_space(pdev));
if (!group) {
error_report("vfio: failed to get group %d", groupid);
return -ENOENT;