diff options
Diffstat (limited to 'hw')
-rw-r--r-- | hw/Makefile.objs | 3 | ||||
-rw-r--r-- | hw/hw.h | 2 | ||||
-rw-r--r-- | hw/intel-hda.c | 8 | ||||
-rw-r--r-- | hw/kvm/pci-assign.c | 12 | ||||
-rw-r--r-- | hw/rtl8139.c | 6 | ||||
-rw-r--r-- | hw/usb/hcd-ehci.c | 40 | ||||
-rw-r--r-- | hw/usb/hcd-xhci.c | 179 | ||||
-rw-r--r-- | hw/usb/libhw.c | 24 | ||||
-rw-r--r-- | hw/vfio_pci.c | 1864 | ||||
-rw-r--r-- | hw/vfio_pci_int.h | 114 | ||||
-rw-r--r-- | hw/virtio-net.c | 4 | ||||
-rw-r--r-- | hw/virtio-serial-bus.c | 11 | ||||
-rw-r--r-- | hw/virtio.c | 32 | ||||
-rw-r--r-- | hw/virtio.h | 5 |
14 files changed, 2155 insertions, 149 deletions
diff --git a/hw/Makefile.objs b/hw/Makefile.objs index 6dfebd2978..ecdbe44ddd 100644 --- a/hw/Makefile.objs +++ b/hw/Makefile.objs @@ -198,7 +198,8 @@ obj-$(CONFIG_VGA) += vga.o obj-$(CONFIG_SOFTMMU) += device-hotplug.o obj-$(CONFIG_XEN) += xen_domainbuild.o xen_machine_pv.o -# Inter-VM PCI shared memory +# Inter-VM PCI shared memory & VFIO PCI device assignment ifeq ($(CONFIG_PCI), y) obj-$(CONFIG_KVM) += ivshmem.o +obj-$(CONFIG_LINUX) += vfio_pci.o endif @@ -4,7 +4,7 @@ #include "qemu-common.h" -#if defined(TARGET_PHYS_ADDR_BITS) && !defined(NEED_CPU_H) +#if !defined(CONFIG_USER_ONLY) && !defined(NEED_CPU_H) #include "cpu-common.h" #endif diff --git a/hw/intel-hda.c b/hw/intel-hda.c index 127e81888b..d8e1b23a60 100644 --- a/hw/intel-hda.c +++ b/hw/intel-hda.c @@ -210,13 +210,7 @@ static target_phys_addr_t intel_hda_addr(uint32_t lbase, uint32_t ubase) { target_phys_addr_t addr; -#if TARGET_PHYS_ADDR_BITS == 32 - addr = lbase; -#else - addr = ubase; - addr <<= 32; - addr |= lbase; -#endif + addr = ((uint64_t)ubase << 32) | lbase; return addr; } diff --git a/hw/kvm/pci-assign.c b/hw/kvm/pci-assign.c index 05b93d9a51..7a0998c518 100644 --- a/hw/kvm/pci-assign.c +++ b/hw/kvm/pci-assign.c @@ -579,15 +579,9 @@ static int get_real_device(AssignedDevice *pci_dev, uint16_t r_seg, snprintf(name, sizeof(name), "%sconfig", dir); if (pci_dev->configfd_name && *pci_dev->configfd_name) { - if (qemu_isdigit(pci_dev->configfd_name[0])) { - dev->config_fd = strtol(pci_dev->configfd_name, NULL, 0); - } else { - dev->config_fd = monitor_get_fd(cur_mon, pci_dev->configfd_name); - if (dev->config_fd < 0) { - error_report("%s: (%s) unkown", __func__, - pci_dev->configfd_name); - return 1; - } + dev->config_fd = monitor_handle_fd_param(cur_mon, pci_dev->configfd_name); + if (dev->config_fd < 0) { + return 1; } } else { dev->config_fd = open(name, O_RDWR); diff --git a/hw/rtl8139.c b/hw/rtl8139.c index 844f1b8c3f..b7c82ee027 100644 --- a/hw/rtl8139.c +++ b/hw/rtl8139.c @@ -774,11 +774,7 @@ static void rtl8139_write_buffer(RTL8139State *s, const void *buf, int size) #define MIN_BUF_SIZE 60 static inline dma_addr_t rtl8139_addr64(uint32_t low, uint32_t high) { -#if TARGET_PHYS_ADDR_BITS > 32 - return low | ((target_phys_addr_t)high << 32); -#else - return low; -#endif + return low | ((uint64_t)high << 32); } /* Workaround for buggy guest driver such as linux who allocates rx diff --git a/hw/usb/hcd-ehci.c b/hw/usb/hcd-ehci.c index 6a5da8413f..8bdb806b9b 100644 --- a/hw/usb/hcd-ehci.c +++ b/hw/usb/hcd-ehci.c @@ -373,6 +373,7 @@ struct EHCIQueue { uint32_t seen; uint64_t ts; int async; + int transact_ctr; /* cached data from guest - needs to be flushed * when guest removes an entry (doorbell, handshake sequence) @@ -1837,6 +1838,11 @@ static EHCIQueue *ehci_state_fetchqh(EHCIState *ehci, int async) } q->qh = qh; + q->transact_ctr = get_field(q->qh.epcap, QH_EPCAP_MULT); + if (q->transact_ctr == 0) { /* Guest bug in some versions of windows */ + q->transact_ctr = 4; + } + if (q->dev == NULL) { q->dev = ehci_find_device(q->ehci, devaddr); } @@ -2014,11 +2020,8 @@ static int ehci_state_fetchqtd(EHCIQueue *q) } else if (p != NULL) { switch (p->async) { case EHCI_ASYNC_NONE: - /* Should never happen packet should at least be initialized */ - assert(0); - break; case EHCI_ASYNC_INITIALIZED: - /* Previously nacked packet (likely interrupt ep) */ + /* Not yet executed (MULT), or previously nacked (int) packet */ ehci_set_state(q->ehci, q->async, EST_EXECUTE); break; case EHCI_ASYNC_INFLIGHT: @@ -2107,15 +2110,12 @@ static int ehci_state_execute(EHCIQueue *q) // TODO verify enough time remains in the uframe as in 4.4.1.1 // TODO write back ptr to async list when done or out of time - // TODO Windows does not seem to ever set the MULT field - if (!q->async) { - int transactCtr = get_field(q->qh.epcap, QH_EPCAP_MULT); - if (!transactCtr) { - ehci_set_state(q->ehci, q->async, EST_HORIZONTALQH); - again = 1; - goto out; - } + /* 4.10.3, bottom of page 82, go horizontal on transaction counter == 0 */ + if (!q->async && q->transact_ctr == 0) { + ehci_set_state(q->ehci, q->async, EST_HORIZONTALQH); + again = 1; + goto out; } if (q->async) { @@ -2132,7 +2132,11 @@ static int ehci_state_execute(EHCIQueue *q) trace_usb_ehci_packet_action(p->queue, p, "async"); p->async = EHCI_ASYNC_INFLIGHT; ehci_set_state(q->ehci, q->async, EST_HORIZONTALQH); - again = (ehci_fill_queue(p) == USB_RET_PROCERR) ? -1 : 1; + if (q->async) { + again = (ehci_fill_queue(p) == USB_RET_PROCERR) ? -1 : 1; + } else { + again = 1; + } goto out; } @@ -2152,13 +2156,9 @@ static int ehci_state_executing(EHCIQueue *q) ehci_execute_complete(q); - // 4.10.3 - if (!q->async) { - int transactCtr = get_field(q->qh.epcap, QH_EPCAP_MULT); - transactCtr--; - set_field(&q->qh.epcap, transactCtr, QH_EPCAP_MULT); - // 4.10.3, bottom of page 82, should exit this state when transaction - // counter decrements to 0 + /* 4.10.3 */ + if (!q->async && q->transact_ctr > 0) { + q->transact_ctr--; } /* 4.10.5 */ diff --git a/hw/usb/hcd-xhci.c b/hw/usb/hcd-xhci.c index e0ca69044a..e79a8724c5 100644 --- a/hw/usb/hcd-xhci.c +++ b/hw/usb/hcd-xhci.c @@ -37,12 +37,12 @@ #define FIXME() do { fprintf(stderr, "FIXME %s:%d\n", \ __func__, __LINE__); abort(); } while (0) -#define MAXPORTS_2 8 -#define MAXPORTS_3 8 +#define MAXPORTS_2 15 +#define MAXPORTS_3 15 #define MAXPORTS (MAXPORTS_2+MAXPORTS_3) -#define MAXSLOTS MAXPORTS -#define MAXINTRS MAXPORTS +#define MAXSLOTS 64 +#define MAXINTRS 16 #define TD_QUEUE 24 @@ -285,6 +285,8 @@ typedef enum TRBCCode { #define SLOT_CONTEXT_ENTRIES_MASK 0x1f #define SLOT_CONTEXT_ENTRIES_SHIFT 27 +typedef struct XHCIState XHCIState; + typedef enum EPType { ET_INVALID = 0, ET_ISO_OUT, @@ -303,15 +305,15 @@ typedef struct XHCIRing { } XHCIRing; typedef struct XHCIPort { + XHCIState *xhci; uint32_t portsc; uint32_t portnr; USBPort *uport; uint32_t speedmask; + char name[16]; + MemoryRegion mem; } XHCIPort; -struct XHCIState; -typedef struct XHCIState XHCIState; - typedef struct XHCITransfer { XHCIState *xhci; USBPacket packet; @@ -363,7 +365,7 @@ typedef struct XHCIEPContext { typedef struct XHCISlot { bool enabled; dma_addr_t ctx; - unsigned int port; + USBPort *uport; unsigned int devaddr; XHCIEPContext * eps[31]; } XHCISlot; @@ -1230,7 +1232,7 @@ static TRBCCode xhci_reset_ep(XHCIState *xhci, unsigned int slotid, ep |= 0x80; } - dev = xhci->ports[xhci->slots[slotid-1].port-1].uport->dev; + dev = xhci->slots[slotid-1].uport->dev; if (!dev) { return CC_USB_TRANSACTION_ERROR; } @@ -1412,18 +1414,9 @@ static void xhci_stall_ep(XHCITransfer *xfer) static int xhci_submit(XHCIState *xhci, XHCITransfer *xfer, XHCIEPContext *epctx); -static USBDevice *xhci_find_device(XHCIPort *port, uint8_t addr) -{ - if (!(port->portsc & PORTSC_PED)) { - return NULL; - } - return usb_find_device(port->uport, addr); -} - static int xhci_setup_packet(XHCITransfer *xfer) { XHCIState *xhci = xfer->xhci; - XHCIPort *port; USBDevice *dev; USBEndpoint *ep; int dir; @@ -1434,13 +1427,12 @@ static int xhci_setup_packet(XHCITransfer *xfer) ep = xfer->packet.ep; dev = ep->dev; } else { - port = &xhci->ports[xhci->slots[xfer->slotid-1].port-1]; - dev = xhci_find_device(port, xhci->slots[xfer->slotid-1].devaddr); - if (!dev) { - fprintf(stderr, "xhci: slot %d port %d has no device\n", - xfer->slotid, xhci->slots[xfer->slotid-1].port); + if (!xhci->slots[xfer->slotid-1].uport) { + fprintf(stderr, "xhci: slot %d has no device\n", + xfer->slotid); return -1; } + dev = xhci->slots[xfer->slotid-1].uport->dev; ep = usb_ep_get(dev, dir, xfer->epid >> 1); } @@ -1772,7 +1764,7 @@ static TRBCCode xhci_enable_slot(XHCIState *xhci, unsigned int slotid) trace_usb_xhci_slot_enable(slotid); assert(slotid >= 1 && slotid <= MAXSLOTS); xhci->slots[slotid-1].enabled = 1; - xhci->slots[slotid-1].port = 0; + xhci->slots[slotid-1].uport = NULL; memset(xhci->slots[slotid-1].eps, 0, sizeof(XHCIEPContext*)*31); return CC_SUCCESS; @@ -1795,17 +1787,42 @@ static TRBCCode xhci_disable_slot(XHCIState *xhci, unsigned int slotid) return CC_SUCCESS; } +static USBPort *xhci_lookup_uport(XHCIState *xhci, uint32_t *slot_ctx) +{ + USBPort *uport; + char path[32]; + int i, pos, port; + + port = (slot_ctx[1]>>16) & 0xFF; + port = xhci->ports[port-1].uport->index+1; + pos = snprintf(path, sizeof(path), "%d", port); + for (i = 0; i < 5; i++) { + port = (slot_ctx[0] >> 4*i) & 0x0f; + if (!port) { + break; + } + pos += snprintf(path + pos, sizeof(path) - pos, ".%d", port); + } + + QTAILQ_FOREACH(uport, &xhci->bus.used, next) { + if (strcmp(uport->path, path) == 0) { + return uport; + } + } + return NULL; +} + static TRBCCode xhci_address_slot(XHCIState *xhci, unsigned int slotid, uint64_t pictx, bool bsr) { XHCISlot *slot; + USBPort *uport; USBDevice *dev; dma_addr_t ictx, octx, dcbaap; uint64_t poctx; uint32_t ictl_ctx[2]; uint32_t slot_ctx[4]; uint32_t ep0_ctx[5]; - unsigned int port; int i; TRBCCode res; @@ -1837,27 +1854,28 @@ static TRBCCode xhci_address_slot(XHCIState *xhci, unsigned int slotid, DPRINTF("xhci: input ep0 context: %08x %08x %08x %08x %08x\n", ep0_ctx[0], ep0_ctx[1], ep0_ctx[2], ep0_ctx[3], ep0_ctx[4]); - port = (slot_ctx[1]>>16) & 0xFF; - dev = xhci->ports[port-1].uport->dev; - - if (port < 1 || port > xhci->numports) { - fprintf(stderr, "xhci: bad port %d\n", port); + uport = xhci_lookup_uport(xhci, slot_ctx); + if (uport == NULL) { + fprintf(stderr, "xhci: port not found\n"); return CC_TRB_ERROR; - } else if (!dev) { - fprintf(stderr, "xhci: port %d not connected\n", port); + } + + dev = uport->dev; + if (!dev) { + fprintf(stderr, "xhci: port %s not connected\n", uport->path); return CC_USB_TRANSACTION_ERROR; } for (i = 0; i < MAXSLOTS; i++) { - if (xhci->slots[i].port == port) { - fprintf(stderr, "xhci: port %d already assigned to slot %d\n", - port, i+1); + if (xhci->slots[i].uport == uport) { + fprintf(stderr, "xhci: port %s already assigned to slot %d\n", + uport->path, i+1); return CC_TRB_ERROR; } } slot = &xhci->slots[slotid-1]; - slot->port = port; + slot->uport = uport; slot->ctx = octx; if (bsr) { @@ -2414,20 +2432,14 @@ static uint64_t xhci_cap_read(void *ptr, target_phys_addr_t reg, unsigned size) return ret; } -static uint32_t xhci_port_read(XHCIState *xhci, uint32_t reg) +static uint64_t xhci_port_read(void *ptr, target_phys_addr_t reg, unsigned size) { - uint32_t port = reg >> 4; + XHCIPort *port = ptr; uint32_t ret; - if (port >= xhci->numports) { - fprintf(stderr, "xhci_port_read: port %d out of bounds\n", port); - ret = 0; - goto out; - } - - switch (reg & 0xf) { + switch (reg) { case 0x00: /* PORTSC */ - ret = xhci->ports[port].portsc; + ret = port->portsc; break; case 0x04: /* PORTPMSC */ case 0x08: /* PORTLI */ @@ -2436,30 +2448,25 @@ static uint32_t xhci_port_read(XHCIState *xhci, uint32_t reg) case 0x0c: /* reserved */ default: fprintf(stderr, "xhci_port_read (port %d): reg 0x%x unimplemented\n", - port, reg); + port->portnr, (uint32_t)reg); ret = 0; } -out: - trace_usb_xhci_port_read(port, reg & 0x0f, ret); + trace_usb_xhci_port_read(port->portnr, reg, ret); return ret; } -static void xhci_port_write(XHCIState *xhci, uint32_t reg, uint32_t val) +static void xhci_port_write(void *ptr, target_phys_addr_t reg, + uint64_t val, unsigned size) { - uint32_t port = reg >> 4; + XHCIPort *port = ptr; uint32_t portsc; - trace_usb_xhci_port_write(port, reg & 0x0f, val); - - if (port >= xhci->numports) { - fprintf(stderr, "xhci_port_read: port %d out of bounds\n", port); - return; - } + trace_usb_xhci_port_write(port->portnr, reg, val); - switch (reg & 0xf) { + switch (reg) { case 0x00: /* PORTSC */ - portsc = xhci->ports[port].portsc; + portsc = port->portsc; /* write-1-to-clear bits*/ portsc &= ~(val & (PORTSC_CSC|PORTSC_PEC|PORTSC_WRC|PORTSC_OCC| PORTSC_PRC|PORTSC_PLC|PORTSC_CEC)); @@ -2474,16 +2481,16 @@ static void xhci_port_write(XHCIState *xhci, uint32_t reg, uint32_t val) /* write-1-to-start bits */ if (val & PORTSC_PR) { DPRINTF("xhci: port %d reset\n", port); - usb_device_reset(xhci->ports[port].uport->dev); + usb_device_reset(port->uport->dev); portsc |= PORTSC_PRC | PORTSC_PED; } - xhci->ports[port].portsc = portsc; + port->portsc = portsc; break; case 0x04: /* PORTPMSC */ case 0x08: /* PORTLI */ default: fprintf(stderr, "xhci_port_write (port %d): reg 0x%x unimplemented\n", - port, reg); + port->portnr, (uint32_t)reg); } } @@ -2492,10 +2499,6 @@ static uint64_t xhci_oper_read(void *ptr, target_phys_addr_t reg, unsigned size) XHCIState *xhci = ptr; uint32_t ret; - if (reg >= 0x400) { - return xhci_port_read(xhci, reg - 0x400); - } - switch (reg) { case 0x00: /* USBCMD */ ret = xhci->usbcmd; @@ -2538,11 +2541,6 @@ static void xhci_oper_write(void *ptr, target_phys_addr_t reg, { XHCIState *xhci = ptr; - if (reg >= 0x400) { - xhci_port_write(xhci, reg - 0x400, val); - return; - } - trace_usb_xhci_oper_write(reg, val); switch (reg) { @@ -2761,6 +2759,14 @@ static const MemoryRegionOps xhci_oper_ops = { .endianness = DEVICE_LITTLE_ENDIAN, }; +static const MemoryRegionOps xhci_port_ops = { + .read = xhci_port_read, + .write = xhci_port_write, + .valid.min_access_size = 4, + .valid.max_access_size = 4, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + static const MemoryRegionOps xhci_runtime_ops = { .read = xhci_runtime_read, .write = xhci_runtime_write, @@ -2821,12 +2827,20 @@ static void xhci_complete(USBPort *port, USBPacket *packet) xhci_kick_ep(xfer->xhci, xfer->slotid, xfer->epid); } -static void xhci_child_detach(USBPort *port, USBDevice *child) +static void xhci_child_detach(USBPort *uport, USBDevice *child) { - FIXME(); + USBBus *bus = usb_bus_from_device(child); + XHCIState *xhci = container_of(bus, XHCIState, bus); + int i; + + for (i = 0; i < MAXSLOTS; i++) { + if (xhci->slots[i].uport == uport) { + xhci->slots[i].uport = NULL; + } + } } -static USBPortOps xhci_port_ops = { +static USBPortOps xhci_uport_ops = { .attach = xhci_attach, .detach = xhci_detach, .wakeup = xhci_wakeup, @@ -2906,6 +2920,7 @@ static void usb_xhci_init(XHCIState *xhci, DeviceState *dev) USB_SPEED_MASK_LOW | USB_SPEED_MASK_FULL | USB_SPEED_MASK_HIGH; + snprintf(port->name, sizeof(port->name), "usb2 port #%d", i+1); speedmask |= port->speedmask; } if (i < xhci->numports_3) { @@ -2913,16 +2928,17 @@ static void usb_xhci_init(XHCIState *xhci, DeviceState *dev) port->portnr = i + 1 + xhci->numports_2; port->uport = &xhci->uports[i]; port->speedmask = USB_SPEED_MASK_SUPER; + snprintf(port->name, sizeof(port->name), "usb3 port #%d", i+1); speedmask |= port->speedmask; } usb_register_port(&xhci->bus, &xhci->uports[i], xhci, i, - &xhci_port_ops, speedmask); + &xhci_uport_ops, speedmask); } } static int usb_xhci_initfn(struct PCIDevice *dev) { - int ret; + int i, ret; XHCIState *xhci = DO_UPCAST(XHCIState, pci_dev, dev); @@ -2941,7 +2957,7 @@ static int usb_xhci_initfn(struct PCIDevice *dev) memory_region_init_io(&xhci->mem_cap, &xhci_cap_ops, xhci, "capabilities", LEN_CAP); memory_region_init_io(&xhci->mem_oper, &xhci_oper_ops, xhci, - "operational", 0x400 + 0x10 * xhci->numports); + "operational", 0x400); memory_region_init_io(&xhci->mem_runtime, &xhci_runtime_ops, xhci, "runtime", LEN_RUNTIME); memory_region_init_io(&xhci->mem_doorbell, &xhci_doorbell_ops, xhci, @@ -2952,6 +2968,15 @@ static int usb_xhci_initfn(struct PCIDevice *dev) memory_region_add_subregion(&xhci->mem, OFF_RUNTIME, &xhci->mem_runtime); memory_region_add_subregion(&xhci->mem, OFF_DOORBELL, &xhci->mem_doorbell); + for (i = 0; i < xhci->numports; i++) { + XHCIPort *port = &xhci->ports[i]; + uint32_t offset = OFF_OPER + 0x400 + 0x10 * i; + port->xhci = xhci; + memory_region_init_io(&port->mem, &xhci_port_ops, port, + port->name, 0x10); + memory_region_add_subregion(&xhci->mem, offset, &port->mem); + } + pci_register_bar(&xhci->pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY|PCI_BASE_ADDRESS_MEM_TYPE_64, &xhci->mem); diff --git a/hw/usb/libhw.c b/hw/usb/libhw.c index c0de30ea88..703e2d213b 100644 --- a/hw/usb/libhw.c +++ b/hw/usb/libhw.c @@ -28,19 +28,25 @@ int usb_packet_map(USBPacket *p, QEMUSGList *sgl) { DMADirection dir = (p->pid == USB_TOKEN_IN) ? DMA_DIRECTION_FROM_DEVICE : DMA_DIRECTION_TO_DEVICE; - dma_addr_t len; void *mem; int i; for (i = 0; i < sgl->nsg; i++) { - len = sgl->sg[i].len; - mem = dma_memory_map(sgl->dma, sgl->sg[i].base, &len, dir); - if (!mem) { - goto err; - } - qemu_iovec_add(&p->iov, mem, len); - if (len != sgl->sg[i].len) { - goto err; + dma_addr_t base = sgl->sg[i].base; + dma_addr_t len = sgl->sg[i].len; + + while (len) { + dma_addr_t xlen = len; + mem = dma_memory_map(sgl->dma, sgl->sg[i].base, &xlen, dir); + if (!mem) { + goto err; + } + if (xlen > len) { + xlen = len; + } + qemu_iovec_add(&p->iov, mem, xlen); + len -= xlen; + base += xlen; } } return 0; diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c new file mode 100644 index 0000000000..a1eeced8fd --- /dev/null +++ b/hw/vfio_pci.c @@ -0,0 +1,1864 @@ +/* + * vfio based device assignment support + * + * Copyright Red Hat, Inc. 2012 + * + * Authors: + * Alex Williamson <alex.williamson@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Based on qemu-kvm device-assignment: + * Adapted for KVM by Qumranet. + * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com) + * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com) + * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com) + * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com) + * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com) + */ + +#include <dirent.h> +#include <unistd.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <linux/vfio.h> + +#include "config.h" +#include "event_notifier.h" +#include "exec-memory.h" +#include "kvm.h" +#include "memory.h" +#include "msi.h" +#include "msix.h" +#include "qemu-error.h" +#include "range.h" +#include "vfio_pci_int.h" + +/* #define DEBUG_VFIO */ +#ifdef DEBUG_VFIO +#define DPRINTF(fmt, ...) \ + do { fprintf(stderr, "vfio: " fmt, ## __VA_ARGS__); } while (0) +#else +#define DPRINTF(fmt, ...) \ + do { } while (0) +#endif + +#define MSIX_CAP_LENGTH 12 + +static QLIST_HEAD(, VFIOContainer) + container_list = QLIST_HEAD_INITIALIZER(container_list); + +static QLIST_HEAD(, VFIOGroup) + group_list = QLIST_HEAD_INITIALIZER(group_list); + +static void vfio_disable_interrupts(VFIODevice *vdev); +static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len); +static void vfio_mmap_set_enabled(VFIODevice *vdev, bool enabled); + +/* + * Common VFIO interrupt disable + */ +static void vfio_disable_irqindex(VFIODevice *vdev, int index) +{ + struct vfio_irq_set irq_set = { + .argsz = sizeof(irq_set), + .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER, + .index = index, + .start = 0, + .count = 0, + }; + + ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); + + vdev->interrupt = VFIO_INT_NONE; +} + +/* + * INTx + */ +static void vfio_unmask_intx(VFIODevice *vdev) +{ + struct vfio_irq_set irq_set = { + .argsz = sizeof(irq_set), + .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK, + .index = VFIO_PCI_INTX_IRQ_INDEX, + .start = 0, + .count = 1, + }; + + ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); +} + +static void vfio_intx_interrupt(void *opaque) +{ + VFIODevice *vdev = opaque; + + if (!event_notifier_test_and_clear(&vdev->intx.interrupt)) { + return; + } + + DPRINTF("%s(%04x:%02x:%02x.%x) Pin %c\n", __func__, vdev->host.domain, + vdev->host.bus, vdev->host.slot, vdev->host.function, + 'A' + vdev->intx.pin); + + vdev->intx.pending = true; + qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 1); +} + +static void vfio_eoi(VFIODevice *vdev) +{ + if (!vdev->intx.pending) { + return; + } + + DPRINTF("%s(%04x:%02x:%02x.%x) EOI\n", __func__, vdev->host.domain, + vdev->host.bus, vdev->host.slot, vdev->host.function); + + vdev->intx.pending = false; + qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0); + vfio_unmask_intx(vdev); +} + +typedef struct QEMU_PACKED VFIOIRQSetFD { + struct vfio_irq_set irq_set; + int32_t fd; +} VFIOIRQSetFD; + +static int vfio_enable_intx(VFIODevice *vdev) +{ + VFIOIRQSetFD irq_set_fd = { + .irq_set = { + .argsz = sizeof(irq_set_fd), + .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER, + .index = VFIO_PCI_INTX_IRQ_INDEX, + .start = 0, + .count = 1, + }, + }; + uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1); + int ret; + + if (vdev->intx.disabled || !pin) { + return 0; + } + + vfio_disable_interrupts(vdev); + + vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */ + ret = event_notifier_init(&vdev->intx.interrupt, 0); + if (ret) { + error_report("vfio: Error: event_notifier_init failed\n"); + return ret; + } + + irq_set_fd.fd = event_notifier_get_fd(&vdev->intx.interrupt); + qemu_set_fd_handler(irq_set_fd.fd, vfio_intx_interrupt, NULL, vdev); + + if (ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set_fd)) { + error_report("vfio: Error: Failed to setup INTx fd: %m\n"); + return -errno; + } + + /* + * Disable mmaps so we can trap on BAR accesses. We interpret any + * access as a response to an interrupt and unmask the physical + * device. The device will re-assert if the interrupt is still + * pending. We'll likely retrigger on the host multiple times per + * guest interrupt, but without EOI notification it's better than + * nothing. Acceleration paths through KVM will avoid this. + */ + vfio_mmap_set_enabled(vdev, false); + + vdev->interrupt = VFIO_INT_INTx; + + DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain, + vdev->host.bus, vdev->host.slot, vdev->host.function); + + return 0; +} + +static void vfio_disable_intx(VFIODevice *vdev) +{ + int fd; + + vfio_disable_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX); + vdev->intx.pending = false; + qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0); + vfio_mmap_set_enabled(vdev, true); + + fd = event_notifier_get_fd(&vdev->intx.interrupt); + qemu_set_fd_handler(fd, NULL, NULL, vdev); + event_notifier_cleanup(&vdev->intx.interrupt); + + vdev->interrupt = VFIO_INT_NONE; + + DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain, + vdev->host.bus, vdev->host.slot, vdev->host.function); +} + +/* + * MSI/X + */ +static void vfio_msi_interrupt(void *opaque) +{ + VFIOMSIVector *vector = opaque; + VFIODevice *vdev = vector->vdev; + int nr = vector - vdev->msi_vectors; + + if (!event_notifier_test_and_clear(&vector->interrupt)) { + return; + } + + DPRINTF("%s(%04x:%02x:%02x.%x) vector %d\n", __func__, + vdev->host.domain, vdev->host.bus, vdev->host.slot, + vdev->host.function, nr); + + if (vdev->interrupt == VFIO_INT_MSIX) { + msix_notify(&vdev->pdev, nr); + } else if (vdev->interrupt == VFIO_INT_MSI) { + msi_notify(&vdev->pdev, nr); + } else { + error_report("vfio: MSI interrupt receieved, but not enabled?\n"); + } +} + +static int vfio_enable_vectors(VFIODevice *vdev, bool msix) +{ + struct vfio_irq_set *irq_set; + int ret = 0, i, argsz; + int32_t *fds; + + argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds)); + + irq_set = g_malloc0(argsz); + irq_set->argsz = argsz; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX; + irq_set->start = 0; + irq_set->count = vdev->nr_vectors; + fds = (int32_t *)&irq_set->data; + + for (i = 0; i < vdev->nr_vectors; i++) { + if (!vdev->msi_vectors[i].use) { + fds[i] = -1; + continue; + } + + fds[i] = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt); + } + + ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set); + + g_free(irq_set); + + if (!ret) { + vdev->interrupt = msix ? VFIO_INT_MSIX : VFIO_INT_MSI; + } + + return ret; +} + +static int vfio_msix_vector_use(PCIDevice *pdev, + unsigned int nr, MSIMessage msg) +{ + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); + VFIOMSIVector *vector; + int ret; + + DPRINTF("%s(%04x:%02x:%02x.%x) vector %d used\n", __func__, + vdev->host.domain, vdev->host.bus, vdev->host.slot, + vdev->host.function, nr); + + if (vdev->interrupt != VFIO_INT_MSIX) { + vfio_disable_interrupts(vdev); + } + + if (!vdev->msi_vectors) { + vdev->msi_vectors = g_malloc0(vdev->msix->entries * + sizeof(VFIOMSIVector)); + } + + vector = &vdev->msi_vectors[nr]; + vector->vdev = vdev; + vector->use = true; + + msix_vector_use(pdev, nr); + + if (event_notifier_init(&vector->interrupt, 0)) { + error_report("vfio: Error: event_notifier_init failed\n"); + } + + /* + * Attempt to enable route through KVM irqchip, + * default to userspace handling if unavailable. + */ + vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg); + if (vector->virq < 0 || + kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt, + vector->virq) < 0) { + if (vector->virq >= 0) { + kvm_irqchip_release_virq(kvm_state, vector->virq); + vector->virq = -1; + } + qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), + vfio_msi_interrupt, NULL, vector); + } + + /* + * We don't want to have the host allocate all possible MSI vectors + * for a device if they're not in use, so we shutdown and incrementally + * increase them as needed. + */ + if (vdev->nr_vectors < nr + 1) { + int i; + + vfio_disable_irqindex(vdev, VFIO_PCI_MSIX_IRQ_INDEX); + vdev->nr_vectors = nr + 1; + ret = vfio_enable_vectors(vdev, true); + if (ret) { + error_report("vfio: failed to enable vectors, %d\n", ret); + } + + /* We don't know if we've missed interrupts in the interim... */ + for (i = 0; i < vdev->msix->entries; i++) { + if (vdev->msi_vectors[i].use) { + msix_notify(&vdev->pdev, i); + } + } + } else { + VFIOIRQSetFD irq_set_fd = { + .irq_set = { + .argsz = sizeof(irq_set_fd), + .flags = VFIO_IRQ_SET_DATA_EVENTFD | + VFIO_IRQ_SET_ACTION_TRIGGER, + .index = VFIO_PCI_MSIX_IRQ_INDEX, + .start = nr, + .count = 1, + }, + .fd = event_notifier_get_fd(&vector->interrupt), + }; + ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set_fd); + if (ret) { + error_report("vfio: failed to modify vector, %d\n", ret); + } + + /* + * If we were connected to the hardware PBA we could skip this, + * until then, a spurious interrupt is better than starvation. + */ + msix_notify(&vdev->pdev, nr); + } + + return 0; +} + +static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr) +{ + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); + VFIOMSIVector *vector = &vdev->msi_vectors[nr]; + VFIOIRQSetFD irq_set_fd = { + .irq_set = { + .argsz = sizeof(irq_set_fd), + .flags = VFIO_IRQ_SET_DATA_EVENTFD | + VFIO_IRQ_SET_ACTION_TRIGGER, + .index = VFIO_PCI_MSIX_IRQ_INDEX, + .start = nr, + .count = 1, + }, + .fd = -1, + }; + + DPRINTF("%s(%04x:%02x:%02x.%x) vector %d released\n", __func__, + vdev->host.domain, vdev->host.bus, vdev->host.slot, + vdev->host.function, nr); + + /* + * XXX What's the right thing to do here? This turns off the interrupt + * completely, but do we really just want to switch the interrupt to + * bouncing through userspace and let msix.c drop it? Not sure. + */ + msix_vector_unuse(pdev, nr); + ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set_fd); + + if (vector->virq < 0) { + qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), + NULL, NULL, NULL); + } else { + kvm_irqchip_remove_irqfd_notifier(kvm_state, &vector->interrupt, + vector->virq); + kvm_irqchip_release_virq(kvm_state, vector->virq); + vector->virq = -1; + } + + event_notifier_cleanup(&vector->interrupt); + vector->use = false; +} + +/* TODO This should move to msi.c */ +static MSIMessage msi_get_msg(PCIDevice *pdev, unsigned int vector) +{ + uint16_t flags = pci_get_word(pdev->config + pdev->msi_cap + PCI_MSI_FLAGS); + bool msi64bit = flags & PCI_MSI_FLAGS_64BIT; + MSIMessage msg; + + if (msi64bit) { + msg.address = pci_get_quad(pdev->config + + pdev->msi_cap + PCI_MSI_ADDRESS_LO); + } else { + msg.address = pci_get_long(pdev->config + + pdev->msi_cap + PCI_MSI_ADDRESS_LO); + } + + msg.data = pci_get_word(pdev->config + pdev->msi_cap + + (msi64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32)); + msg.data += vector; + + return msg; +} + +/* So should this */ +static void msi_set_qsize(PCIDevice *pdev, uint8_t size) +{ + uint8_t *config = pdev->config + pdev->msi_cap; + uint16_t flags; + + flags = pci_get_word(config + PCI_MSI_FLAGS); + flags = le16_to_cpu(flags); + flags &= ~PCI_MSI_FLAGS_QSIZE; + flags |= (size & 0x7) << 4; + flags = cpu_to_le16(flags); + pci_set_word(config + PCI_MSI_FLAGS, flags); +} + +static void vfio_enable_msi(VFIODevice *vdev) +{ + int ret, i; + + vfio_disable_interrupts(vdev); + + vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev); +retry: + vdev->msi_vectors = g_malloc0(vdev->nr_vectors * sizeof(VFIOMSIVector)); + + for (i = 0; i < vdev->nr_vectors; i++) { + MSIMessage msg; + VFIOMSIVector *vector = &vdev->msi_vectors[i]; + + vector->vdev = vdev; + vector->use = true; + + if (event_notifier_init(&vector->interrupt, 0)) { + error_report("vfio: Error: event_notifier_init failed\n"); + } + + msg = msi_get_msg(&vdev->pdev, i); + + /* + * Attempt to enable route through KVM irqchip, + * default to userspace handling if unavailable. + */ + vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg); + if (vector->virq < 0 || + kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt, + vector->virq) < 0) { + qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), + vfio_msi_interrupt, NULL, vector); + } + } + + ret = vfio_enable_vectors(vdev, false); + if (ret) { + if (ret < 0) { + error_report("vfio: Error: Failed to setup MSI fds: %m\n"); + } else if (ret != vdev->nr_vectors) { + error_report("vfio: Error: Failed to enable %d " + "MSI vectors, retry with %d\n", vdev->nr_vectors, ret); + } + + for (i = 0; i < vdev->nr_vectors; i++) { + VFIOMSIVector *vector = &vdev->msi_vectors[i]; + if (vector->virq >= 0) { + kvm_irqchip_remove_irqfd_notifier(kvm_state, &vector->interrupt, + vector->virq); + kvm_irqchip_release_virq(kvm_state, vector->virq); + vector->virq = -1; + } else { + qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), + NULL, NULL, NULL); + } + event_notifier_cleanup(&vector->interrupt); + } + + g_free(vdev->msi_vectors); + + if (ret > 0 && ret != vdev->nr_vectors) { + vdev->nr_vectors = ret; + goto retry; + } + vdev->nr_vectors = 0; + + return; + } + + msi_set_qsize(&vdev->pdev, vdev->nr_vectors); + + DPRINTF("%s(%04x:%02x:%02x.%x) Enabled %d MSI vectors\n", __func__, + vdev->host.domain, vdev->host.bus, vdev->host.slot, + vdev->host.function, vdev->nr_vectors); +} + +static void vfio_disable_msi_x(VFIODevice *vdev, bool msix) +{ + int i; + + vfio_disable_irqindex(vdev, msix ? VFIO_PCI_MSIX_IRQ_INDEX : + VFIO_PCI_MSI_IRQ_INDEX); + + for (i = 0; i < vdev->nr_vectors; i++) { + VFIOMSIVector *vector = &vdev->msi_vectors[i]; + + if (!vector->use) { + continue; + } + + if (vector->virq >= 0) { + kvm_irqchip_remove_irqfd_notifier(kvm_state, + &vector->interrupt, vector->virq); + kvm_irqchip_release_virq(kvm_state, vector->virq); + vector->virq = -1; + } else { + qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), + NULL, NULL, NULL); + } + + if (msix) { + msix_vector_unuse(&vdev->pdev, i); + } + + event_notifier_cleanup(&vector->interrupt); + } + + g_free(vdev->msi_vectors); + vdev->msi_vectors = NULL; + vdev->nr_vectors = 0; + + if (!msix) { + msi_set_qsize(&vdev->pdev, 0); /* Actually still means 1 vector */ + } + + DPRINTF("%s(%04x:%02x:%02x.%x, msi%s)\n", __func__, + vdev->host.domain, vdev->host.bus, vdev->host.slot, + vdev->host.function, msix ? "x" : ""); + + vfio_enable_intx(vdev); +} + +/* + * IO Port/MMIO - Beware of the endians, VFIO is always little endian + */ +static void vfio_bar_write(void *opaque, target_phys_addr_t addr, + uint64_t data, unsigned size) +{ + VFIOBAR *bar = opaque; + union { + uint8_t byte; + uint16_t word; + uint32_t dword; + uint64_t qword; + } buf; + + switch (size) { + case 1: + buf.byte = data; + break; + case 2: + buf.word = cpu_to_le16(data); + break; + case 4: + buf.dword = cpu_to_le32(data); + break; + default: + hw_error("vfio: unsupported write size, %d bytes\n", size); + break; + } + + if (pwrite(bar->fd, &buf, size, bar->fd_offset + addr) != size) { + error_report("%s(,0x%"TARGET_PRIxPHYS", 0x%"PRIx64", %d) failed: %m\n", + __func__, addr, data, size); + } + + DPRINTF("%s(BAR%d+0x%"TARGET_PRIxPHYS", 0x%"PRIx64", %d)\n", + __func__, bar->nr, addr, data, size); + + /* + * A read or write to a BAR always signals an INTx EOI. This will + * do nothing if not pending (including not in INTx mode). We assume + * that a BAR access is in response to an interrupt and that BAR + * accesses will service the interrupt. Unfortunately, we don't know + * which access will service the interrupt, so we're potentially + * getting quite a few host interrupts per guest interrupt. + */ + vfio_eoi(DO_UPCAST(VFIODevice, bars[bar->nr], bar)); +} + +static uint64_t vfio_bar_read(void *opaque, + target_phys_addr_t addr, unsigned size) +{ + VFIOBAR *bar = opaque; + union { + uint8_t byte; + uint16_t word; + uint32_t dword; + uint64_t qword; + } buf; + uint64_t data = 0; + + if (pread(bar->fd, &buf, size, bar->fd_offset + addr) != size) { + error_report("%s(,0x%"TARGET_PRIxPHYS", %d) failed: %m\n", + __func__, addr, size); + return (uint64_t)-1; + } + + switch (size) { + case 1: + data = buf.byte; + break; + case 2: + data = le16_to_cpu(buf.word); + break; + case 4: + data = le32_to_cpu(buf.dword); + break; + default: + hw_error("vfio: unsupported read size, %d bytes\n", size); + break; + } + + DPRINTF("%s(BAR%d+0x%"TARGET_PRIxPHYS", %d) = 0x%"PRIx64"\n", + __func__, bar->nr, addr, size, data); + + /* Same as write above */ + vfio_eoi(DO_UPCAST(VFIODevice, bars[bar->nr], bar)); + + return data; +} + +static const MemoryRegionOps vfio_bar_ops = { + .read = vfio_bar_read, + .write = vfio_bar_write, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +/* + * PCI config space + */ +static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len) +{ + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); + uint32_t val = 0; + + /* + * We only need QEMU PCI config support for the ROM BAR, the MSI and MSIX + * capabilities, and the multifunction bit below. We let VFIO handle + * virtualizing everything else. Performance is not a concern here. + */ + if (ranges_overlap(addr, len, PCI_ROM_ADDRESS, 4) || + (pdev->cap_present & QEMU_PCI_CAP_MSIX && + ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) || + (pdev->cap_present & QEMU_PCI_CAP_MSI && + ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size))) { + + val = pci_default_read_config(pdev, addr, len); + } else { + if (pread(vdev->fd, &val, len, vdev->config_offset + addr) != len) { + error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed: %m\n", + __func__, vdev->host.domain, vdev->host.bus, + vdev->host.slot, vdev->host.function, addr, len); + return -errno; + } + val = le32_to_cpu(val); + } + + /* Multifunction bit is virualized in QEMU */ + if (unlikely(ranges_overlap(addr, len, PCI_HEADER_TYPE, 1))) { + uint32_t mask = PCI_HEADER_TYPE_MULTI_FUNCTION; + + if (len == 4) { + mask <<= 16; + } + + if (pdev->cap_present & QEMU_PCI_CAP_MULTIFUNCTION) { + val |= mask; + } else { + val &= ~mask; + } + } + + DPRINTF("%s(%04x:%02x:%02x.%x, @0x%x, len=0x%x) %x\n", __func__, + vdev->host.domain, vdev->host.bus, vdev->host.slot, + vdev->host.function, addr, len, val); + + return val; +} + +static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr, + uint32_t val, int len) +{ + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); + uint32_t val_le = cpu_to_le32(val); + + DPRINTF("%s(%04x:%02x:%02x.%x, @0x%x, 0x%x, len=0x%x)\n", __func__, + vdev->host.domain, vdev->host.bus, vdev->host.slot, + vdev->host.function, addr, val, len); + + /* Write everything to VFIO, let it filter out what we can't write */ + if (pwrite(vdev->fd, &val_le, len, vdev->config_offset + addr) != len) { + error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x) failed: %m\n", + __func__, vdev->host.domain, vdev->host.bus, + vdev->host.slot, vdev->host.function, addr, val, len); + } + + /* Write standard header bits to emulation */ + if (addr < PCI_CONFIG_HEADER_SIZE) { + pci_default_write_config(pdev, addr, val, len); + return; + } + + /* MSI/MSI-X Enabling/Disabling */ + if (pdev->cap_present & QEMU_PCI_CAP_MSI && + ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) { + int is_enabled, was_enabled = msi_enabled(pdev); + + pci_default_write_config(pdev, addr, val, len); + + is_enabled = msi_enabled(pdev); + + if (!was_enabled && is_enabled) { + vfio_enable_msi(vdev); + } else if (was_enabled && !is_enabled) { + vfio_disable_msi_x(vdev, false); + } + } + + if (pdev->cap_present & QEMU_PCI_CAP_MSIX && + ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) { + int is_enabled, was_enabled = msix_enabled(pdev); + + pci_default_write_config(pdev, addr, val, len); + + is_enabled = msix_enabled(pdev); + + if (!was_enabled && is_enabled) { + /* vfio_msix_vector_use handles this automatically */ + } else if (was_enabled && !is_enabled) { + vfio_disable_msi_x(vdev, true); + } + } +} + +/* + * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86 + */ +static int vfio_dma_map(VFIOContainer *container, target_phys_addr_t iova, + ram_addr_t size, void *vaddr, bool readonly) +{ + struct vfio_iommu_type1_dma_map map = { + .argsz = sizeof(map), + .flags = VFIO_DMA_MAP_FLAG_READ, + .vaddr = (__u64)(intptr_t)vaddr, + .iova = iova, + .size = size, + }; + + if (!readonly) { + map.flags |= VFIO_DMA_MAP_FLAG_WRITE; + } + + if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map)) { + DPRINTF("VFIO_MAP_DMA: %d\n", -errno); + return -errno; + } + + return 0; +} + +static int vfio_dma_unmap(VFIOContainer *container, + target_phys_addr_t iova, ram_addr_t size) +{ + struct vfio_iommu_type1_dma_unmap unmap = { + .argsz = sizeof(unmap), + .flags = 0, + .iova = iova, + .size = size, + }; + + if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) { + DPRINTF("VFIO_UNMAP_DMA: %d\n", -errno); + return -errno; + } + + return 0; +} + +static void vfio_listener_dummy1(MemoryListener *listener) +{ + /* We don't do batching (begin/commit) or care about logging */ +} + +static void vfio_listener_dummy2(MemoryListener *listener, + MemoryRegionSection *section) +{ + /* We don't do logging or care about nops */ +} + +static void vfio_listener_dummy3(MemoryListener *listener, + MemoryRegionSection *section, + bool match_data, uint64_t data, + EventNotifier *e) +{ + /* We don't care about eventfds */ +} + +static bool vfio_listener_skipped_section(MemoryRegionSection *section) +{ + return !memory_region_is_ram(section->mr); +} + +static void vfio_listener_region_add(MemoryListener *listener, + MemoryRegionSection *section) +{ + VFIOContainer *container = container_of(listener, VFIOContainer, + iommu_data.listener); + target_phys_addr_t iova, end; + void *vaddr; + int ret; + + if (vfio_listener_skipped_section(section)) { + DPRINTF("vfio: SKIPPING region_add %"TARGET_PRIxPHYS" - %"PRIx64"\n", + section->offset_within_address_space, + section->offset_within_address_space + section->size - 1); + return; + } + + if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) != + (section->offset_within_region & ~TARGET_PAGE_MASK))) { + error_report("%s received unaligned region\n", __func__); + return; + } + + iova = TARGET_PAGE_ALIGN(section->offset_within_address_space); + end = (section->offset_within_address_space + section->size) & + TARGET_PAGE_MASK; + + if (iova >= end) { + return; + } + + vaddr = memory_region_get_ram_ptr(section->mr) + + section->offset_within_region + + (iova - section->offset_within_address_space); + + DPRINTF("vfio: region_add %"TARGET_PRIxPHYS" - %"TARGET_PRIxPHYS" [%p]\n", + iova, end - 1, vaddr); + + ret = vfio_dma_map(container, iova, end - iova, vaddr, section->readonly); + if (ret) { + error_report("vfio_dma_map(%p, 0x%"TARGET_PRIxPHYS", " + "0x%"TARGET_PRIxPHYS", %p) = %d (%m)\n", + container, iova, end - iova, vaddr, ret); + } +} + +static void vfio_listener_region_del(MemoryListener *listener, + MemoryRegionSection *section) +{ + VFIOContainer *container = container_of(listener, VFIOContainer, + iommu_data.listener); + target_phys_addr_t iova, end; + int ret; + + if (vfio_listener_skipped_section(section)) { + DPRINTF("vfio: SKIPPING region_del %"TARGET_PRIxPHYS" - %"PRIx64"\n", + section->offset_within_address_space, + section->offset_within_address_space + section->size - 1); + return; + } + + if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) != + (section->offset_within_region & ~TARGET_PAGE_MASK))) { + error_report("%s received unaligned region\n", __func__); + return; + } + + iova = TARGET_PAGE_ALIGN(section->offset_within_address_space); + end = (section->offset_within_address_space + section->size) & + TARGET_PAGE_MASK; + + if (iova >= end) { + return; + } + + DPRINTF("vfio: region_del %"TARGET_PRIxPHYS" - %"TARGET_PRIxPHYS"\n", + iova, end - 1); + + ret = vfio_dma_unmap(container, iova, end - iova); + if (ret) { + error_report("vfio_dma_unmap(%p, 0x%"TARGET_PRIxPHYS", " + "0x%"TARGET_PRIxPHYS") = %d (%m)\n", + container, iova, end - iova, ret); + } +} + +static MemoryListener vfio_memory_listener = { + .begin = vfio_listener_dummy1, + .commit = vfio_listener_dummy1, + .region_add = vfio_listener_region_add, + .region_del = vfio_listener_region_del, + .region_nop = vfio_listener_dummy2, + .log_start = vfio_listener_dummy2, + .log_stop = vfio_listener_dummy2, + .log_sync = vfio_listener_dummy2, + .log_global_start = vfio_listener_dummy1, + .log_global_stop = vfio_listener_dummy1, + .eventfd_add = vfio_listener_dummy3, + .eventfd_del = vfio_listener_dummy3, +}; + +static void vfio_listener_release(VFIOContainer *container) +{ + memory_listener_unregister(&container->iommu_data.listener); +} + +/* + * Interrupt setup + */ +static void vfio_disable_interrupts(VFIODevice *vdev) +{ + switch (vdev->interrupt) { + case VFIO_INT_INTx: + vfio_disable_intx(vdev); + break; + case VFIO_INT_MSI: + vfio_disable_msi_x(vdev, false); + break; + case VFIO_INT_MSIX: + vfio_disable_msi_x(vdev, true); + break; + } +} + +static int vfio_setup_msi(VFIODevice *vdev, int pos) +{ + uint16_t ctrl; + bool msi_64bit, msi_maskbit; + int ret, entries; + + /* + * TODO: don't peek into msi_supported, let msi_init fail and + * check for ENOTSUP + */ + if (!msi_supported) { + return 0; + } + + if (pread(vdev->fd, &ctrl, sizeof(ctrl), + vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) { + return -errno; + } + ctrl = le16_to_cpu(ctrl); + + msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT); + msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT); + entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1); + + DPRINTF("%04x:%02x:%02x.%x PCI MSI CAP @0x%x\n", vdev->host.domain, + vdev->host.bus, vdev->host.slot, vdev->host.function, pos); + + ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit); + if (ret < 0) { + error_report("vfio: msi_init failed\n"); + return ret; + } + vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0); + + return 0; +} + +/* + * We don't have any control over how pci_add_capability() inserts + * capabilities into the chain. In order to setup MSI-X we need a + * MemoryRegion for the BAR. In order to setup the BAR and not + * attempt to mmap the MSI-X table area, which VFIO won't allow, we + * need to first look for where the MSI-X table lives. So we + * unfortunately split MSI-X setup across two functions. + */ +static int vfio_early_setup_msix(VFIODevice *vdev) +{ + uint8_t pos; + uint16_t ctrl; + uint32_t table, pba; + + pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX); + if (!pos) { + return 0; + } + + if (pread(vdev->fd, &ctrl, sizeof(ctrl), + vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) { + return -errno; + } + + if (pread(vdev->fd, &table, sizeof(table), + vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) { + return -errno; + } + + if (pread(vdev->fd, &pba, sizeof(pba), + vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) { + return -errno; + } + + ctrl = le16_to_cpu(ctrl); + table = le32_to_cpu(table); + pba = le32_to_cpu(pba); + + vdev->msix = g_malloc0(sizeof(*(vdev->msix))); + vdev->msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK; + vdev->msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK; + vdev->msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK; + vdev->msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK; + vdev->msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1; + + DPRINTF("%04x:%02x:%02x.%x " + "PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, entries %d\n", + vdev->host.domain, vdev->host.bus, vdev->host.slot, + vdev->host.function, pos, vdev->msix->table_bar, + vdev->msix->table_offset, vdev->msix->entries); + + return 0; +} + +static int vfio_setup_msix(VFIODevice *vdev, int pos) +{ + int ret; + + /* + * TODO: don't peek into msi_supported, let msix_init fail and + * check for ENOTSUP + */ + if (!msi_supported) { + return 0; + } + + ret = msix_init(&vdev->pdev, vdev->msix->entries, + &vdev->bars[vdev->msix->table_bar].mem, + vdev->msix->table_bar, vdev->msix->table_offset, + &vdev->bars[vdev->msix->pba_bar].mem, + vdev->msix->pba_bar, vdev->msix->pba_offset, pos); + if (ret < 0) { + error_report("vfio: msix_init failed\n"); + return ret; + } + + ret = msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use, + vfio_msix_vector_release); + if (ret) { + error_report("vfio: msix_set_vector_notifiers failed %d\n", ret); + msix_uninit(&vdev->pdev, &vdev->bars[vdev->msix->table_bar].mem, + &vdev->bars[vdev->msix->pba_bar].mem); + return ret; + } + + return 0; +} + +static void vfio_teardown_msi(VFIODevice *vdev) +{ + msi_uninit(&vdev->pdev); + + if (vdev->msix) { + /* FIXME: Why can't unset just silently do nothing?? */ + if (vdev->pdev.msix_vector_use_notifier && + vdev->pdev.msix_vector_release_notifier) { + msix_unset_vector_notifiers(&vdev->pdev); + } + + msix_uninit(&vdev->pdev, &vdev->bars[vdev->msix->table_bar].mem, + &vdev->bars[vdev->msix->pba_bar].mem); + } +} + +/* + * Resource setup + */ +static void vfio_mmap_set_enabled(VFIODevice *vdev, bool enabled) +{ + int i; + + for (i = 0; i < PCI_ROM_SLOT; i++) { + VFIOBAR *bar = &vdev->bars[i]; + + if (!bar->size) { + continue; + } + + memory_region_set_enabled(&bar->mmap_mem, enabled); + if (vdev->msix && vdev->msix->table_bar == i) { + memory_region_set_enabled(&vdev->msix->mmap_mem, enabled); + } + } +} + +static void vfio_unmap_bar(VFIODevice *vdev, int nr) +{ + VFIOBAR *bar = &vdev->bars[nr]; + + if (!bar->size) { + return; + } + + memory_region_del_subregion(&bar->mem, &bar->mmap_mem); + munmap(bar->mmap, memory_region_size(&bar->mmap_mem)); + + if (vdev->msix && vdev->msix->table_bar == nr) { + memory_region_del_subregion(&bar->mem, &vdev->msix->mmap_mem); + munmap(vdev->msix->mmap, memory_region_size(&vdev->msix->mmap_mem)); + } + + memory_region_destroy(&bar->mem); +} + +static int vfio_mmap_bar(VFIOBAR *bar, MemoryRegion *mem, MemoryRegion *submem, + void **map, size_t size, off_t offset, + const char *name) +{ + int ret = 0; + + if (size && bar->flags & VFIO_REGION_INFO_FLAG_MMAP) { + int prot = 0; + + if (bar->flags & VFIO_REGION_INFO_FLAG_READ) { + prot |= PROT_READ; + } + + if (bar->flags & VFIO_REGION_INFO_FLAG_WRITE) { + prot |= PROT_WRITE; + } + + *map = mmap(NULL, size, prot, MAP_SHARED, + bar->fd, bar->fd_offset + offset); + if (*map == MAP_FAILED) { + *map = NULL; + ret = -errno; + goto empty_region; + } + + memory_region_init_ram_ptr(submem, name, size, *map); + } else { +empty_region: + /* Create a zero sized sub-region to make cleanup easy. */ + memory_region_init(submem, name, 0); + } + + memory_region_add_subregion(mem, offset, submem); + + return ret; +} + +static void vfio_map_bar(VFIODevice *vdev, int nr) +{ + VFIOBAR *bar = &vdev->bars[nr]; + unsigned size = bar->size; + char name[64]; + uint32_t pci_bar; + uint8_t type; + int ret; + + /* Skip both unimplemented BARs and the upper half of 64bit BARS. */ + if (!size) { + return; + } + + snprintf(name, sizeof(name), "VFIO %04x:%02x:%02x.%x BAR %d", + vdev->host.domain, vdev->host.bus, vdev->host.slot, + vdev->host.function, nr); + + /* Determine what type of BAR this is for registration */ + ret = pread(vdev->fd, &pci_bar, sizeof(pci_bar), + vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr)); + if (ret != sizeof(pci_bar)) { + error_report("vfio: Failed to read BAR %d (%m)\n", nr); + return; + } + + pci_bar = le32_to_cpu(pci_bar); + type = pci_bar & (pci_bar & PCI_BASE_ADDRESS_SPACE_IO ? + ~PCI_BASE_ADDRESS_IO_MASK : ~PCI_BASE_ADDRESS_MEM_MASK); + + /* A "slow" read/write mapping underlies all BARs */ + memory_region_init_io(&bar->mem, &vfio_bar_ops, bar, name, size); + pci_register_bar(&vdev->pdev, nr, type, &bar->mem); + + /* + * We can't mmap areas overlapping the MSIX vector table, so we + * potentially insert a direct-mapped subregion before and after it. + */ + if (vdev->msix && vdev->msix->table_bar == nr) { + size = vdev->msix->table_offset & TARGET_PAGE_MASK; + } + + strncat(name, " mmap", sizeof(name) - strlen(name) - 1); + if (vfio_mmap_bar(bar, &bar->mem, + &bar->mmap_mem, &bar->mmap, size, 0, name)) { + error_report("%s unsupported. Performance may be slow\n", name); + } + + if (vdev->msix && vdev->msix->table_bar == nr) { + unsigned start; + + start = TARGET_PAGE_ALIGN(vdev->msix->table_offset + + (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE)); + + size = start < bar->size ? bar->size - start : 0; + strncat(name, " msix-hi", sizeof(name) - strlen(name) - 1); + /* VFIOMSIXInfo contains another MemoryRegion for this mapping */ + if (vfio_mmap_bar(bar, &bar->mem, &vdev->msix->mmap_mem, + &vdev->msix->mmap, size, start, name)) { + error_report("%s unsupported. Performance may be slow\n", name); + } + } +} + +static void vfio_map_bars(VFIODevice *vdev) +{ + int i; + + for (i = 0; i < PCI_ROM_SLOT; i++) { + vfio_map_bar(vdev, i); + } +} + +static void vfio_unmap_bars(VFIODevice *vdev) +{ + int i; + + for (i = 0; i < PCI_ROM_SLOT; i++) { + vfio_unmap_bar(vdev, i); + } +} + +/* + * General setup + */ +static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos) +{ + uint8_t tmp, next = 0xff; + + for (tmp = pdev->config[PCI_CAPABILITY_LIST]; tmp; + tmp = pdev->config[tmp + 1]) { + if (tmp > pos && tmp < next) { + next = tmp; + } + } + + return next - pos; +} + +static int vfio_add_std_cap(VFIODevice *vdev, uint8_t pos) +{ + PCIDevice *pdev = &vdev->pdev; + uint8_t cap_id, next, size; + int ret; + + cap_id = pdev->config[pos]; + next = pdev->config[pos + 1]; + + /* + * If it becomes important to configure capabilities to their actual + * size, use this as the default when it's something we don't recognize. + * Since QEMU doesn't actually handle many of the config accesses, + * exact size doesn't seem worthwhile. + */ + size = vfio_std_cap_max_size(pdev, pos); + + /* + * pci_add_capability always inserts the new capability at the head + * of the chain. Therefore to end up with a chain that matches the + * physical device, we insert from the end by making this recursive. + * This is also why we pre-caclulate size above as cached config space + * will be changed as we unwind the stack. + */ + if (next) { + ret = vfio_add_std_cap(vdev, next); + if (ret) { + return ret; + } + } else { + pdev->config[PCI_CAPABILITY_LIST] = 0; /* Begin the rebuild */ + } + + switch (cap_id) { + case PCI_CAP_ID_MSI: + ret = vfio_setup_msi(vdev, pos); + break; + case PCI_CAP_ID_MSIX: + ret = vfio_setup_msix(vdev, pos); + break; + default: + ret = pci_add_capability(pdev, cap_id, pos, size); + break; + } + + if (ret < 0) { + error_report("vfio: %04x:%02x:%02x.%x Error adding PCI capability " + "0x%x[0x%x]@0x%x: %d\n", vdev->host.domain, + vdev->host.bus, vdev->host.slot, vdev->host.function, + cap_id, size, pos, ret); + return ret; + } + + return 0; +} + +static int vfio_add_capabilities(VFIODevice *vdev) +{ + PCIDevice *pdev = &vdev->pdev; + + if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) || + !pdev->config[PCI_CAPABILITY_LIST]) { + return 0; /* Nothing to add */ + } + + return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]); +} + +static int vfio_load_rom(VFIODevice *vdev) +{ + uint64_t size = vdev->rom_size; + char name[32]; + off_t off = 0, voff = vdev->rom_offset; + ssize_t bytes; + void *ptr; + + /* If loading ROM from file, pci handles it */ + if (vdev->pdev.romfile || !vdev->pdev.rom_bar || !size) { + return 0; + } + + DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain, + vdev->host.bus, vdev->host.slot, vdev->host.function); + + snprintf(name, sizeof(name), "vfio[%04x:%02x:%02x.%x].rom", + vdev->host.domain, vdev->host.bus, vdev->host.slot, + vdev->host.function); + memory_region_init_ram(&vdev->pdev.rom, name, size); + ptr = memory_region_get_ram_ptr(&vdev->pdev.rom); + memset(ptr, 0xff, size); + + while (size) { + bytes = pread(vdev->fd, ptr + off, size, voff + off); + if (bytes == 0) { + break; /* expect that we could get back less than the ROM BAR */ + } else if (bytes > 0) { + off += bytes; + size -= bytes; + } else { + if (errno == EINTR || errno == EAGAIN) { + continue; + } + error_report("vfio: Error reading device ROM: %m\n"); + memory_region_destroy(&vdev->pdev.rom); + return -errno; + } + } + + pci_register_bar(&vdev->pdev, PCI_ROM_SLOT, 0, &vdev->pdev.rom); + vdev->pdev.has_rom = true; + return 0; +} + +static int vfio_connect_container(VFIOGroup *group) +{ + VFIOContainer *container; + int ret, fd; + + if (group->container) { + return 0; + } + + QLIST_FOREACH(container, &container_list, next) { + if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { + group->container = container; + QLIST_INSERT_HEAD(&container->group_list, group, container_next); + return 0; + } + } + + fd = qemu_open("/dev/vfio/vfio", O_RDWR); + if (fd < 0) { + error_report("vfio: failed to open /dev/vfio/vfio: %m\n"); + return -errno; + } + + ret = ioctl(fd, VFIO_GET_API_VERSION); + if (ret != VFIO_API_VERSION) { + error_report("vfio: supported vfio version: %d, " + "reported version: %d\n", VFIO_API_VERSION, ret); + close(fd); + return -EINVAL; + } + + container = g_malloc0(sizeof(*container)); + container->fd = fd; + + if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) { + ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd); + if (ret) { + error_report("vfio: failed to set group container: %m\n"); + g_free(container); + close(fd); + return -errno; + } + + ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU); + if (ret) { + error_report("vfio: failed to set iommu for container: %m\n"); + g_free(container); + close(fd); + return -errno; + } + + container->iommu_data.listener = vfio_memory_listener; + container->iommu_data.release = vfio_listener_release; + + memory_listener_register(&container->iommu_data.listener, + get_system_memory()); + } else { + error_report("vfio: No available IOMMU models\n"); + g_free(container); + close(fd); + return -EINVAL; + } + + QLIST_INIT(&container->group_list); + QLIST_INSERT_HEAD(&container_list, container, next); + + group->container = container; + QLIST_INSERT_HEAD(&container->group_list, group, container_next); + + return 0; +} + +static void vfio_disconnect_container(VFIOGroup *group) +{ + VFIOContainer *container = group->container; + + if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) { + error_report("vfio: error disconnecting group %d from container\n", + group->groupid); + } + + QLIST_REMOVE(group, container_next); + group->container = NULL; + + if (QLIST_EMPTY(&container->group_list)) { + if (container->iommu_data.release) { + container->iommu_data.release(container); + } + QLIST_REMOVE(container, next); + DPRINTF("vfio_disconnect_container: close container->fd\n"); + close(container->fd); + g_free(container); + } +} + +static VFIOGroup *vfio_get_group(int groupid) +{ + VFIOGroup *group; + char path[32]; + struct vfio_group_status status = { .argsz = sizeof(status) }; + + QLIST_FOREACH(group, &group_list, next) { + if (group->groupid == groupid) { + return group; + } + } + + group = g_malloc0(sizeof(*group)); + + snprintf(path, sizeof(path), "/dev/vfio/%d", groupid); + group->fd = qemu_open(path, O_RDWR); + if (group->fd < 0) { + error_report("vfio: error opening %s: %m\n", path); + g_free(group); + return NULL; + } + + if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) { + error_report("vfio: error getting group status: %m\n"); + close(group->fd); + g_free(group); + return NULL; + } + + if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) { + error_report("vfio: error, group %d is not viable, please ensure " + "all devices within the iommu_group are bound to their " + "vfio bus driver.\n", groupid); + close(group->fd); + g_free(group); + return NULL; + } + + group->groupid = groupid; + QLIST_INIT(&group->device_list); + + if (vfio_connect_container(group)) { + error_report("vfio: failed to setup container for group %d\n", groupid); + close(group->fd); + g_free(group); + return NULL; + } + + QLIST_INSERT_HEAD(&group_list, group, next); + + return group; +} + +static void vfio_put_group(VFIOGroup *group) +{ + if (!QLIST_EMPTY(&group->device_list)) { + return; + } + + vfio_disconnect_container(group); + QLIST_REMOVE(group, next); + DPRINTF("vfio_put_group: close group->fd\n"); + close(group->fd); + g_free(group); +} + +static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vdev) +{ + struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) }; + struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) }; + int ret, i; + + ret = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name); + if (ret < 0) { + error_report("vfio: error getting device %s from group %d: %m\n", + name, group->groupid); + error_report("Verify all devices in group %d are bound to vfio-pci " + "or pci-stub and not already in use\n", group->groupid); + return ret; + } + + vdev->fd = ret; + vdev->group = group; + QLIST_INSERT_HEAD(&group->device_list, vdev, next); + + /* Sanity check device */ + ret = ioctl(vdev->fd, VFIO_DEVICE_GET_INFO, &dev_info); + if (ret) { + error_report("vfio: error getting device info: %m\n"); + goto error; + } + + DPRINTF("Device %s flags: %u, regions: %u, irgs: %u\n", name, + dev_info.flags, dev_info.num_regions, dev_info.num_irqs); + + if (!(dev_info.flags & VFIO_DEVICE_FLAGS_PCI)) { + error_report("vfio: Um, this isn't a PCI device\n"); + goto error; + } + + vdev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET); + if (!vdev->reset_works) { + error_report("Warning, device %s does not support reset\n", name); + } + + if (dev_info.num_regions != VFIO_PCI_NUM_REGIONS) { + error_report("vfio: unexpected number of io regions %u\n", + dev_info.num_regions); + goto error; + } + + if (dev_info.num_irqs != VFIO_PCI_NUM_IRQS) { + error_report("vfio: unexpected number of irqs %u\n", dev_info.num_irqs); + goto error; + } + + for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) { + reg_info.index = i; + + ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, ®_info); + if (ret) { + error_report("vfio: Error getting region %d info: %m\n", i); + goto error; + } + + DPRINTF("Device %s region %d:\n", name, i); + DPRINTF(" size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n", + (unsigned long)reg_info.size, (unsigned long)reg_info.offset, + (unsigned long)reg_info.flags); + + vdev->bars[i].flags = reg_info.flags; + vdev->bars[i].size = reg_info.size; + vdev->bars[i].fd_offset = reg_info.offset; + vdev->bars[i].fd = vdev->fd; + vdev->bars[i].nr = i; + } + + reg_info.index = VFIO_PCI_ROM_REGION_INDEX; + + ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, ®_info); + if (ret) { + error_report("vfio: Error getting ROM info: %m\n"); + goto error; + } + + DPRINTF("Device %s ROM:\n", name); + DPRINTF(" size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n", + (unsigned long)reg_info.size, (unsigned long)reg_info.offset, + (unsigned long)reg_info.flags); + + vdev->rom_size = reg_info.size; + vdev->rom_offset = reg_info.offset; + + reg_info.index = VFIO_PCI_CONFIG_REGION_INDEX; + + ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, ®_info); + if (ret) { + error_report("vfio: Error getting config info: %m\n"); + goto error; + } + + DPRINTF("Device %s config:\n", name); + DPRINTF(" size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n", + (unsigned long)reg_info.size, (unsigned long)reg_info.offset, + (unsigned long)reg_info.flags); + + vdev->config_size = reg_info.size; + vdev->config_offset = reg_info.offset; + +error: + if (ret) { + QLIST_REMOVE(vdev, next); + vdev->group = NULL; + close(vdev->fd); + } + return ret; +} + +static void vfio_put_device(VFIODevice *vdev) +{ + QLIST_REMOVE(vdev, next); + vdev->group = NULL; + DPRINTF("vfio_put_device: close vdev->fd\n"); + close(vdev->fd); + if (vdev->msix) { + g_free(vdev->msix); + vdev->msix = NULL; + } +} + +static int vfio_initfn(PCIDevice *pdev) +{ + VFIODevice *pvdev, *vdev = DO_UPCAST(VFIODevice, pdev, pdev); + VFIOGroup *group; + char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name; + ssize_t len; + struct stat st; + int groupid; + int ret; + + /* Check that the host device exists */ + snprintf(path, sizeof(path), + "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/", + vdev->host.domain, vdev->host.bus, vdev->host.slot, + vdev->host.function); + if (stat(path, &st) < 0) { + error_report("vfio: error: no such host device: %s\n", path); + return -errno; + } + + strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1); + + len = readlink(path, iommu_group_path, PATH_MAX); + if (len <= 0) { + error_report("vfio: error no iommu_group for device\n"); + return -errno; + } + + iommu_group_path[len] = 0; + group_name = basename(iommu_group_path); + + if (sscanf(group_name, "%d", &groupid) != 1) { + error_report("vfio: error reading %s: %m\n", path); + return -errno; + } + + DPRINTF("%s(%04x:%02x:%02x.%x) group %d\n", __func__, vdev->host.domain, + vdev->host.bus, vdev->host.slot, vdev->host.function, groupid); + + group = vfio_get_group(groupid); + if (!group) { + error_report("vfio: failed to get group %d\n", groupid); + return -ENOENT; + } + + snprintf(path, sizeof(path), "%04x:%02x:%02x.%01x", + vdev->host.domain, vdev->host.bus, vdev->host.slot, + vdev->host.function); + + QLIST_FOREACH(pvdev, &group->device_list, next) { + if (pvdev->host.domain == vdev->host.domain && + pvdev->host.bus == vdev->host.bus && + pvdev->host.slot == vdev->host.slot && + pvdev->host.function == vdev->host.function) { + + error_report("vfio: error: device %s is already attached\n", path); + vfio_put_group(group); + return -EBUSY; + } + } + + ret = vfio_get_device(group, path, vdev); + if (ret) { + error_report("vfio: failed to get device %s\n", path); + vfio_put_group(group); + return ret; + } + + /* Get a copy of config space */ + ret = pread(vdev->fd, vdev->pdev.config, + MIN(pci_config_size(&vdev->pdev), vdev->config_size), + vdev->config_offset); + if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) { + ret = ret < 0 ? -errno : -EFAULT; + error_report("vfio: Failed to read device config space\n"); + goto out_put; + } + + /* + * Clear host resource mapping info. If we choose not to register a + * BAR, such as might be the case with the option ROM, we can get + * confusing, unwritable, residual addresses from the host here. + */ + memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24); + memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4); + + vfio_load_rom(vdev); + + ret = vfio_early_setup_msix(vdev); + if (ret) { + goto out_put; + } + + vfio_map_bars(vdev); + + ret = vfio_add_capabilities(vdev); + if (ret) { + goto out_teardown; + } + + if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) { + if (vdev->intx.intx && strcmp(vdev->intx.intx, "off")) { + error_report("vfio: Unknown option x-intx=%s, " + "valid options: \"off\".\n", vdev->intx.intx); + ret = -EINVAL; + goto out_teardown; + } + + if (vdev->intx.intx && !strcmp(vdev->intx.intx, "off")) { + vdev->intx.disabled = true; + } + + ret = vfio_enable_intx(vdev); + if (ret) { + goto out_teardown; + } + } + + return 0; + +out_teardown: + pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); + vfio_teardown_msi(vdev); + vfio_unmap_bars(vdev); +out_put: + vfio_put_device(vdev); + vfio_put_group(group); + return ret; +} + +static void vfio_exitfn(PCIDevice *pdev) +{ + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); + VFIOGroup *group = vdev->group; + + pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); + vfio_disable_interrupts(vdev); + vfio_teardown_msi(vdev); + vfio_unmap_bars(vdev); + vfio_put_device(vdev); + vfio_put_group(group); +} + +static void vfio_pci_reset(DeviceState *dev) +{ + PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev); + VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); + + if (!vdev->reset_works) { + return; + } + + if (ioctl(vdev->fd, VFIO_DEVICE_RESET)) { + error_report("vfio: Error unable to reset physical device " + "(%04x:%02x:%02x.%x): %m\n", vdev->host.domain, + vdev->host.bus, vdev->host.slot, vdev->host.function); + } +} + +static Property vfio_pci_dev_properties[] = { + DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIODevice, host), + DEFINE_PROP_STRING("x-intx", VFIODevice, intx.intx), + /* + * TODO - support passed fds... is this necessary? + * DEFINE_PROP_STRING("vfiofd", VFIODevice, vfiofd_name), + * DEFINE_PROP_STRING("vfiogroupfd, VFIODevice, vfiogroupfd_name), + */ + DEFINE_PROP_END_OF_LIST(), +}; + + +static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass); + + dc->reset = vfio_pci_reset; + dc->props = vfio_pci_dev_properties; + pdc->init = vfio_initfn; + pdc->exit = vfio_exitfn; + pdc->config_read = vfio_pci_read_config; + pdc->config_write = vfio_pci_write_config; +} + +static const TypeInfo vfio_pci_dev_info = { + .name = "vfio-pci", + .parent = TYPE_PCI_DEVICE, + .instance_size = sizeof(VFIODevice), + .class_init = vfio_pci_dev_class_init, +}; + +static void register_vfio_pci_dev_type(void) +{ + type_register_static(&vfio_pci_dev_info); +} + +type_init(register_vfio_pci_dev_type) diff --git a/hw/vfio_pci_int.h b/hw/vfio_pci_int.h new file mode 100644 index 0000000000..3812d8d7f1 --- /dev/null +++ b/hw/vfio_pci_int.h @@ -0,0 +1,114 @@ +/* + * vfio based device assignment support + * + * Copyright Red Hat, Inc. 2012 + * + * Authors: + * Alex Williamson <alex.williamson@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#ifndef HW_VFIO_PCI_INT_H +#define HW_VFIO_PCI_INT_H + +#include "qemu-common.h" +#include "qemu-queue.h" +#include "pci.h" +#include "event_notifier.h" + +typedef struct VFIOBAR { + off_t fd_offset; /* offset of BAR within device fd */ + int fd; /* device fd, allows us to pass VFIOBAR as opaque data */ + MemoryRegion mem; /* slow, read/write access */ + MemoryRegion mmap_mem; /* direct mapped access */ + void *mmap; + size_t size; + uint32_t flags; /* VFIO region flags (rd/wr/mmap) */ + uint8_t nr; /* cache the BAR number for debug */ +} VFIOBAR; + +typedef struct VFIOINTx { + bool pending; /* interrupt pending */ + bool kvm_accel; /* set when QEMU bypass through KVM enabled */ + uint8_t pin; /* which pin to pull for qemu_set_irq */ + EventNotifier interrupt; /* eventfd triggered on interrupt */ + EventNotifier unmask; /* eventfd for unmask on QEMU bypass */ + PCIINTxRoute route; /* routing info for QEMU bypass */ + bool disabled; + char *intx; +} VFIOINTx; + +struct VFIODevice; + +typedef struct VFIOMSIVector { + EventNotifier interrupt; /* eventfd triggered on interrupt */ + struct VFIODevice *vdev; /* back pointer to device */ + int virq; /* KVM irqchip route for QEMU bypass */ + bool use; +} VFIOMSIVector; + +enum { + VFIO_INT_NONE = 0, + VFIO_INT_INTx = 1, + VFIO_INT_MSI = 2, + VFIO_INT_MSIX = 3, +}; + +struct VFIOGroup; + +typedef struct VFIOContainer { + int fd; /* /dev/vfio/vfio, empowered by the attached groups */ + struct { + /* enable abstraction to support various iommu backends */ + union { + MemoryListener listener; /* Used by type1 iommu */ + }; + void (*release)(struct VFIOContainer *); + } iommu_data; + QLIST_HEAD(, VFIOGroup) group_list; + QLIST_ENTRY(VFIOContainer) next; +} VFIOContainer; + +/* Cache of MSI-X setup plus extra mmap and memory region for split BAR map */ +typedef struct VFIOMSIXInfo { + uint8_t table_bar; + uint8_t pba_bar; + uint16_t entries; + uint32_t table_offset; + uint32_t pba_offset; + MemoryRegion mmap_mem; + void *mmap; +} VFIOMSIXInfo; + +typedef struct VFIODevice { + PCIDevice pdev; + int fd; + VFIOINTx intx; + unsigned int config_size; + off_t config_offset; /* Offset of config space region within device fd */ + unsigned int rom_size; + off_t rom_offset; /* Offset of ROM region within device fd */ + int msi_cap_size; + VFIOMSIVector *msi_vectors; + VFIOMSIXInfo *msix; + int nr_vectors; /* Number of MSI/MSIX vectors currently in use */ + int interrupt; /* Current interrupt type */ + VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */ + PCIHostDeviceAddress host; + QLIST_ENTRY(VFIODevice) next; + struct VFIOGroup *group; + bool reset_works; +} VFIODevice; + +typedef struct VFIOGroup { + int fd; + int groupid; + VFIOContainer *container; + QLIST_HEAD(, VFIODevice) device_list; + QLIST_ENTRY(VFIOGroup) next; + QLIST_ENTRY(VFIOGroup) container_next; +} VFIOGroup; + +#endif /* HW_VFIO_PCI_INT_H */ diff --git a/hw/virtio-net.c b/hw/virtio-net.c index 6490743290..247d7bef56 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -690,7 +690,7 @@ static void virtio_net_tx_complete(NetClientState *nc, ssize_t len) { VirtIONet *n = DO_UPCAST(NICState, nc, nc)->opaque; - virtqueue_push(n->tx_vq, &n->async_tx.elem, n->async_tx.len); + virtqueue_push(n->tx_vq, &n->async_tx.elem, 0); virtio_notify(&n->vdev, n->tx_vq); n->async_tx.elem.out_num = n->async_tx.len = 0; @@ -754,7 +754,7 @@ static int32_t virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq) len += ret; - virtqueue_push(vq, &elem, len); + virtqueue_push(vq, &elem, 0); virtio_notify(&n->vdev, vq); if (++num_packets >= n->tx_burst) { diff --git a/hw/virtio-serial-bus.c b/hw/virtio-serial-bus.c index 82073f5dc2..d20bd8bf75 100644 --- a/hw/virtio-serial-bus.c +++ b/hw/virtio-serial-bus.c @@ -287,6 +287,7 @@ ssize_t virtio_serial_write(VirtIOSerialPort *port, const uint8_t *buf, size_t virtio_serial_guest_ready(VirtIOSerialPort *port) { VirtQueue *vq = port->ivq; + unsigned int bytes; if (!virtio_queue_ready(vq) || !(port->vser->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK) || @@ -296,14 +297,8 @@ size_t virtio_serial_guest_ready(VirtIOSerialPort *port) if (use_multiport(port->vser) && !port->guest_connected) { return 0; } - - if (virtqueue_avail_bytes(vq, 4096, 0)) { - return 4096; - } - if (virtqueue_avail_bytes(vq, 1, 0)) { - return 1; - } - return 0; + virtqueue_get_avail_bytes(vq, &bytes, NULL); + return bytes; } static void flush_queued_data_bh(void *opaque) diff --git a/hw/virtio.c b/hw/virtio.c index 209c763751..6821092df2 100644 --- a/hw/virtio.c +++ b/hw/virtio.c @@ -241,7 +241,7 @@ void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem, elem->in_sg[i].iov_len, 1, size); - offset += elem->in_sg[i].iov_len; + offset += size; } for (i = 0; i < elem->out_num; i++) @@ -335,10 +335,11 @@ static unsigned virtqueue_next_desc(target_phys_addr_t desc_pa, return next; } -int virtqueue_avail_bytes(VirtQueue *vq, int in_bytes, int out_bytes) +void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, + unsigned int *out_bytes) { unsigned int idx; - int total_bufs, in_total, out_total; + unsigned int total_bufs, in_total, out_total; idx = vq->last_avail_idx; @@ -380,13 +381,9 @@ int virtqueue_avail_bytes(VirtQueue *vq, int in_bytes, int out_bytes) } if (vring_desc_flags(desc_pa, i) & VRING_DESC_F_WRITE) { - if (in_bytes > 0 && - (in_total += vring_desc_len(desc_pa, i)) >= in_bytes) - return 1; + in_total += vring_desc_len(desc_pa, i); } else { - if (out_bytes > 0 && - (out_total += vring_desc_len(desc_pa, i)) >= out_bytes) - return 1; + out_total += vring_desc_len(desc_pa, i); } } while ((i = virtqueue_next_desc(desc_pa, i, max)) != max); @@ -395,7 +392,24 @@ int virtqueue_avail_bytes(VirtQueue *vq, int in_bytes, int out_bytes) else total_bufs++; } + if (in_bytes) { + *in_bytes = in_total; + } + if (out_bytes) { + *out_bytes = out_total; + } +} +int virtqueue_avail_bytes(VirtQueue *vq, unsigned int in_bytes, + unsigned int out_bytes) +{ + unsigned int in_total, out_total; + + virtqueue_get_avail_bytes(vq, &in_total, &out_total); + if ((in_bytes && in_bytes < in_total) + || (out_bytes && out_bytes < out_total)) { + return 1; + } return 0; } diff --git a/hw/virtio.h b/hw/virtio.h index 7a4f564529..80de3757e3 100644 --- a/hw/virtio.h +++ b/hw/virtio.h @@ -147,7 +147,10 @@ void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem, void virtqueue_map_sg(struct iovec *sg, target_phys_addr_t *addr, size_t num_sg, int is_write); int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem); -int virtqueue_avail_bytes(VirtQueue *vq, int in_bytes, int out_bytes); +int virtqueue_avail_bytes(VirtQueue *vq, unsigned int in_bytes, + unsigned int out_bytes); +void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, + unsigned int *out_bytes); void virtio_notify(VirtIODevice *vdev, VirtQueue *vq); |