diff options
author | David Woodhouse <dwmw@amazon.co.uk> | 2023-01-13 23:35:46 +0000 |
---|---|---|
committer | David Woodhouse <dwmw@amazon.co.uk> | 2023-03-01 09:09:22 +0000 |
commit | 6096cf7877bc6ee84e6b3b44dfe144bc8b549724 (patch) | |
tree | 15f6b36d90979aa91f68e5df93f159507e7666a0 /hw/i386/kvm | |
parent | 4f81baa33ed645fc17a9908236630b8154502ae5 (diff) |
hw/xen: Support MSI mapping to PIRQ
The way that Xen handles MSI PIRQs is kind of awful.
There is a special MSI message which targets a PIRQ. The vector in the
low bits of data must be zero. The low 8 bits of the PIRQ# are in the
destination ID field, the extended destination ID field is unused, and
instead the high bits of the PIRQ# are in the high 32 bits of the address.
Using the high bits of the address means that we can't intercept and
translate these messages in kvm_send_msi(), because they won't be caught
by the APIC — addresses like 0x1000fee46000 aren't in the APIC's range.
So we catch them in pci_msi_trigger() instead, and deliver the event
channel directly.
That isn't even the worst part. The worst part is that Xen snoops on
writes to devices' MSI vectors while they are *masked*. When a MSI
message is written which looks like it targets a PIRQ, it remembers
the device and vector for later.
When the guest makes a hypercall to bind that PIRQ# (snooped from a
marked MSI vector) to an event channel port, Xen *unmasks* that MSI
vector on the device. Xen guests using PIRQ delivery of MSI don't
ever actually unmask the MSI for themselves.
Now that this is working we can finally enable XENFEAT_hvm_pirqs and
let the guest use it all.
Tested with passthrough igb and emulated e1000e + AHCI.
CPU0 CPU1
0: 65 0 IO-APIC 2-edge timer
1: 0 14 xen-pirq 1-ioapic-edge i8042
4: 0 846 xen-pirq 4-ioapic-edge ttyS0
8: 1 0 xen-pirq 8-ioapic-edge rtc0
9: 0 0 xen-pirq 9-ioapic-level acpi
12: 257 0 xen-pirq 12-ioapic-edge i8042
24: 9600 0 xen-percpu -virq timer0
25: 2758 0 xen-percpu -ipi resched0
26: 0 0 xen-percpu -ipi callfunc0
27: 0 0 xen-percpu -virq debug0
28: 1526 0 xen-percpu -ipi callfuncsingle0
29: 0 0 xen-percpu -ipi spinlock0
30: 0 8608 xen-percpu -virq timer1
31: 0 874 xen-percpu -ipi resched1
32: 0 0 xen-percpu -ipi callfunc1
33: 0 0 xen-percpu -virq debug1
34: 0 1617 xen-percpu -ipi callfuncsingle1
35: 0 0 xen-percpu -ipi spinlock1
36: 8 0 xen-dyn -event xenbus
37: 0 6046 xen-pirq -msi ahci[0000:00:03.0]
38: 1 0 xen-pirq -msi-x ens4
39: 0 73 xen-pirq -msi-x ens4-rx-0
40: 14 0 xen-pirq -msi-x ens4-rx-1
41: 0 32 xen-pirq -msi-x ens4-tx-0
42: 47 0 xen-pirq -msi-x ens4-tx-1
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
Diffstat (limited to 'hw/i386/kvm')
-rw-r--r-- | hw/i386/kvm/trace-events | 1 | ||||
-rw-r--r-- | hw/i386/kvm/xen-stubs.c | 16 | ||||
-rw-r--r-- | hw/i386/kvm/xen_evtchn.c | 262 | ||||
-rw-r--r-- | hw/i386/kvm/xen_evtchn.h | 8 |
4 files changed, 280 insertions, 7 deletions
diff --git a/hw/i386/kvm/trace-events b/hw/i386/kvm/trace-events index 04e60c5bb8..b83c3eb965 100644 --- a/hw/i386/kvm/trace-events +++ b/hw/i386/kvm/trace-events @@ -2,3 +2,4 @@ kvm_xen_map_pirq(int pirq, int gsi) "pirq %d gsi %d" kvm_xen_unmap_pirq(int pirq, int gsi) "pirq %d gsi %d" kvm_xen_get_free_pirq(int pirq, int type) "pirq %d type %d" kvm_xen_bind_pirq(int pirq, int port) "pirq %d port %d" +kvm_xen_unmask_pirq(int pirq, char *dev, int vector) "pirq %d dev %s vector %d" diff --git a/hw/i386/kvm/xen-stubs.c b/hw/i386/kvm/xen-stubs.c index 720590aedd..ae406e0b02 100644 --- a/hw/i386/kvm/xen-stubs.c +++ b/hw/i386/kvm/xen-stubs.c @@ -14,6 +14,22 @@ #include "qapi/error.h" #include "qapi/qapi-commands-misc-target.h" +#include "xen_evtchn.h" + +void xen_evtchn_snoop_msi(PCIDevice *dev, bool is_msix, unsigned int vector, + uint64_t addr, uint32_t data, bool is_masked) +{ +} + +void xen_evtchn_remove_pci_device(PCIDevice *dev) +{ +} + +bool xen_evtchn_deliver_pirq_msi(uint64_t address, uint32_t data) +{ + return false; +} + #ifdef TARGET_I386 EvtchnInfoList *qmp_xen_event_list(Error **errp) { diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c index f2c4b43871..69c0204d4f 100644 --- a/hw/i386/kvm/xen_evtchn.c +++ b/hw/i386/kvm/xen_evtchn.c @@ -31,6 +31,8 @@ #include "hw/i386/x86.h" #include "hw/i386/pc.h" #include "hw/pci/pci.h" +#include "hw/pci/msi.h" +#include "hw/pci/msix.h" #include "hw/irq.h" #include "xen_evtchn.h" @@ -45,6 +47,9 @@ #include "hw/xen/interface/memory.h" #include "hw/xen/interface/hvm/params.h" +/* XX: For kvm_update_msi_routes_all() */ +#include "target/i386/kvm/kvm_i386.h" + #define TYPE_XEN_EVTCHN "xen-evtchn" OBJECT_DECLARE_SIMPLE_TYPE(XenEvtchnState, XEN_EVTCHN) @@ -119,6 +124,11 @@ struct xenevtchn_handle { struct pirq_info { int gsi; uint16_t port; + PCIDevice *dev; + int vector; + bool is_msix; + bool is_masked; + bool is_translated; }; struct XenEvtchnState { @@ -151,7 +161,7 @@ struct XenEvtchnState { /* Per-GSI assertion state (serialized) */ uint32_t pirq_gsi_set; - /* Per-PIRQ information (rebuilt on migration) */ + /* Per-PIRQ information (rebuilt on migration, protected by BQL) */ struct pirq_info *pirq; }; @@ -1007,16 +1017,23 @@ static bool virq_is_global(uint32_t virq) } } -static int close_port(XenEvtchnState *s, evtchn_port_t port) +static int close_port(XenEvtchnState *s, evtchn_port_t port, + bool *flush_kvm_routes) { XenEvtchnPort *p = &s->port_table[port]; + /* Because it *might* be a PIRQ port */ + assert(qemu_mutex_iothread_locked()); + switch (p->type) { case EVTCHNSTAT_closed: return -ENOENT; case EVTCHNSTAT_pirq: s->pirq[p->type_val].port = 0; + if (s->pirq[p->type_val].is_translated) { + *flush_kvm_routes = true; + } break; case EVTCHNSTAT_virq: @@ -1065,6 +1082,7 @@ static int close_port(XenEvtchnState *s, evtchn_port_t port) int xen_evtchn_soft_reset(void) { XenEvtchnState *s = xen_evtchn_singleton; + bool flush_kvm_routes; int i; if (!s) { @@ -1073,10 +1091,16 @@ int xen_evtchn_soft_reset(void) assert(qemu_mutex_iothread_locked()); - QEMU_LOCK_GUARD(&s->port_lock); + qemu_mutex_lock(&s->port_lock); for (i = 0; i < s->nr_ports; i++) { - close_port(s, i); + close_port(s, i, &flush_kvm_routes); + } + + qemu_mutex_unlock(&s->port_lock); + + if (flush_kvm_routes) { + kvm_update_msi_routes_all(NULL, true, 0, 0); } return 0; @@ -1094,6 +1118,7 @@ int xen_evtchn_reset_op(struct evtchn_reset *reset) int xen_evtchn_close_op(struct evtchn_close *close) { XenEvtchnState *s = xen_evtchn_singleton; + bool flush_kvm_routes = false; int ret; if (!s) { @@ -1104,12 +1129,17 @@ int xen_evtchn_close_op(struct evtchn_close *close) return -EINVAL; } + QEMU_IOTHREAD_LOCK_GUARD(); qemu_mutex_lock(&s->port_lock); - ret = close_port(s, close->port); + ret = close_port(s, close->port, &flush_kvm_routes); qemu_mutex_unlock(&s->port_lock); + if (flush_kvm_routes) { + kvm_update_msi_routes_all(NULL, true, 0, 0); + } + return ret; } @@ -1226,21 +1256,54 @@ int xen_evtchn_bind_pirq_op(struct evtchn_bind_pirq *pirq) return -EINVAL; } - QEMU_LOCK_GUARD(&s->port_lock); + QEMU_IOTHREAD_LOCK_GUARD(); if (s->pirq[pirq->pirq].port) { return -EBUSY; } + qemu_mutex_lock(&s->port_lock); + ret = allocate_port(s, 0, EVTCHNSTAT_pirq, pirq->pirq, &pirq->port); if (ret) { + qemu_mutex_unlock(&s->port_lock); return ret; } s->pirq[pirq->pirq].port = pirq->port; trace_kvm_xen_bind_pirq(pirq->pirq, pirq->port); + qemu_mutex_unlock(&s->port_lock); + + /* + * Need to do the unmask outside port_lock because it may call + * back into the MSI translate function. + */ + if (s->pirq[pirq->pirq].gsi == IRQ_MSI_EMU) { + if (s->pirq[pirq->pirq].is_masked) { + PCIDevice *dev = s->pirq[pirq->pirq].dev; + int vector = s->pirq[pirq->pirq].vector; + char *dev_path = qdev_get_dev_path(DEVICE(dev)); + + trace_kvm_xen_unmask_pirq(pirq->pirq, dev_path, vector); + g_free(dev_path); + + if (s->pirq[pirq->pirq].is_msix) { + msix_set_mask(dev, vector, false); + } else { + msi_set_mask(dev, vector, false, NULL); + } + } else if (s->pirq[pirq->pirq].is_translated) { + /* + * If KVM had attempted to translate this one before, make it try + * again. If we unmasked, then the notifier on the MSI(-X) vector + * will already have had the same effect. + */ + kvm_update_msi_routes_all(NULL, true, 0, 0); + } + } + return ret; } @@ -1559,6 +1622,179 @@ bool xen_evtchn_set_gsi(int gsi, int level) return true; } +static uint32_t msi_pirq_target(uint64_t addr, uint32_t data) +{ + /* The vector (in low 8 bits of data) must be zero */ + if (data & 0xff) { + return 0; + } + + uint32_t pirq = (addr & 0xff000) >> 12; + pirq |= (addr >> 32) & 0xffffff00; + + return pirq; +} + +static void do_remove_pci_vector(XenEvtchnState *s, PCIDevice *dev, int vector, + int except_pirq) +{ + uint32_t pirq; + + for (pirq = 0; pirq < s->nr_pirqs; pirq++) { + /* + * We could be cleverer here, but it isn't really a fast path, and + * this trivial optimisation is enough to let us skip the big gap + * in the middle a bit quicker (in terms of both loop iterations, + * and cache lines). + */ + if (!(pirq & 63) && !(pirq_inuse_word(s, pirq))) { + pirq += 64; + continue; + } + if (except_pirq && pirq == except_pirq) { + continue; + } + if (s->pirq[pirq].dev != dev) { + continue; + } + if (vector != -1 && s->pirq[pirq].vector != vector) { + continue; + } + + /* It could theoretically be bound to a port already, but that is OK. */ + s->pirq[pirq].dev = dev; + s->pirq[pirq].gsi = IRQ_UNBOUND; + s->pirq[pirq].is_msix = false; + s->pirq[pirq].vector = 0; + s->pirq[pirq].is_masked = false; + s->pirq[pirq].is_translated = false; + } +} + +void xen_evtchn_remove_pci_device(PCIDevice *dev) +{ + XenEvtchnState *s = xen_evtchn_singleton; + + if (!s) { + return; + } + + QEMU_LOCK_GUARD(&s->port_lock); + do_remove_pci_vector(s, dev, -1, 0); +} + +void xen_evtchn_snoop_msi(PCIDevice *dev, bool is_msix, unsigned int vector, + uint64_t addr, uint32_t data, bool is_masked) +{ + XenEvtchnState *s = xen_evtchn_singleton; + uint32_t pirq; + + if (!s) { + return; + } + + assert(qemu_mutex_iothread_locked()); + + pirq = msi_pirq_target(addr, data); + + /* + * The PIRQ# must be sane, and there must be an allocated PIRQ in + * IRQ_UNBOUND or IRQ_MSI_EMU state to match it. + */ + if (!pirq || pirq >= s->nr_pirqs || !pirq_inuse(s, pirq) || + (s->pirq[pirq].gsi != IRQ_UNBOUND && + s->pirq[pirq].gsi != IRQ_MSI_EMU)) { + pirq = 0; + } + + if (pirq) { + s->pirq[pirq].dev = dev; + s->pirq[pirq].gsi = IRQ_MSI_EMU; + s->pirq[pirq].is_msix = is_msix; + s->pirq[pirq].vector = vector; + s->pirq[pirq].is_masked = is_masked; + } + + /* Remove any (other) entries for this {device, vector} */ + do_remove_pci_vector(s, dev, vector, pirq); +} + +int xen_evtchn_translate_pirq_msi(struct kvm_irq_routing_entry *route, + uint64_t address, uint32_t data) +{ + XenEvtchnState *s = xen_evtchn_singleton; + uint32_t pirq, port; + CPUState *cpu; + + if (!s) { + return 1; /* Not a PIRQ */ + } + + assert(qemu_mutex_iothread_locked()); + + pirq = msi_pirq_target(address, data); + if (!pirq || pirq >= s->nr_pirqs) { + return 1; /* Not a PIRQ */ + } + + if (!kvm_xen_has_cap(EVTCHN_2LEVEL)) { + return -ENOTSUP; + } + + if (s->pirq[pirq].gsi != IRQ_MSI_EMU) { + return -EINVAL; + } + + /* Remember that KVM tried to translate this. It might need to try again. */ + s->pirq[pirq].is_translated = true; + + QEMU_LOCK_GUARD(&s->port_lock); + + port = s->pirq[pirq].port; + if (!valid_port(port)) { + return -EINVAL; + } + + cpu = qemu_get_cpu(s->port_table[port].vcpu); + if (!cpu) { + return -EINVAL; + } + + route->type = KVM_IRQ_ROUTING_XEN_EVTCHN; + route->u.xen_evtchn.port = port; + route->u.xen_evtchn.vcpu = kvm_arch_vcpu_id(cpu); + route->u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; + + return 0; /* Handled */ +} + +bool xen_evtchn_deliver_pirq_msi(uint64_t address, uint32_t data) +{ + XenEvtchnState *s = xen_evtchn_singleton; + uint32_t pirq, port; + + if (!s) { + return false; + } + + assert(qemu_mutex_iothread_locked()); + + pirq = msi_pirq_target(address, data); + if (!pirq || pirq >= s->nr_pirqs) { + return false; + } + + QEMU_LOCK_GUARD(&s->port_lock); + + port = s->pirq[pirq].port; + if (!valid_port(port)) { + return false; + } + + set_port_pending(s, port); + return true; +} + int xen_physdev_map_pirq(struct physdev_map_pirq *map) { XenEvtchnState *s = xen_evtchn_singleton; @@ -1569,6 +1805,7 @@ int xen_physdev_map_pirq(struct physdev_map_pirq *map) return -ENOTSUP; } + QEMU_IOTHREAD_LOCK_GUARD(); QEMU_LOCK_GUARD(&s->port_lock); if (map->domid != DOMID_SELF && map->domid != xen_domid) { @@ -1628,9 +1865,11 @@ int xen_physdev_unmap_pirq(struct physdev_unmap_pirq *unmap) return -EINVAL; } - QEMU_LOCK_GUARD(&s->port_lock); + QEMU_IOTHREAD_LOCK_GUARD(); + qemu_mutex_lock(&s->port_lock); if (!pirq_inuse(s, pirq)) { + qemu_mutex_unlock(&s->port_lock); return -ENOENT; } @@ -1638,6 +1877,7 @@ int xen_physdev_unmap_pirq(struct physdev_unmap_pirq *unmap) /* We can only unmap GSI PIRQs */ if (gsi < 0) { + qemu_mutex_unlock(&s->port_lock); return -EINVAL; } @@ -1646,6 +1886,12 @@ int xen_physdev_unmap_pirq(struct physdev_unmap_pirq *unmap) pirq_inuse_word(s, pirq) &= ~pirq_inuse_bit(pirq); trace_kvm_xen_unmap_pirq(pirq, gsi); + qemu_mutex_unlock(&s->port_lock); + + if (gsi == IRQ_MSI_EMU) { + kvm_update_msi_routes_all(NULL, true, 0, 0); + } + return 0; } @@ -1659,6 +1905,7 @@ int xen_physdev_eoi_pirq(struct physdev_eoi *eoi) return -ENOTSUP; } + QEMU_IOTHREAD_LOCK_GUARD(); QEMU_LOCK_GUARD(&s->port_lock); if (!pirq_inuse(s, pirq)) { @@ -1690,6 +1937,7 @@ int xen_physdev_query_pirq(struct physdev_irq_status_query *query) return -ENOTSUP; } + QEMU_IOTHREAD_LOCK_GUARD(); QEMU_LOCK_GUARD(&s->port_lock); if (!pirq_inuse(s, pirq)) { diff --git a/hw/i386/kvm/xen_evtchn.h b/hw/i386/kvm/xen_evtchn.h index 95400b7fbf..bfb67ac2bc 100644 --- a/hw/i386/kvm/xen_evtchn.h +++ b/hw/i386/kvm/xen_evtchn.h @@ -25,6 +25,14 @@ void xen_evtchn_set_callback_level(int level); int xen_evtchn_set_port(uint16_t port); bool xen_evtchn_set_gsi(int gsi, int level); +void xen_evtchn_snoop_msi(PCIDevice *dev, bool is_msix, unsigned int vector, + uint64_t addr, uint32_t data, bool is_masked); +void xen_evtchn_remove_pci_device(PCIDevice *dev); +struct kvm_irq_routing_entry; +int xen_evtchn_translate_pirq_msi(struct kvm_irq_routing_entry *route, + uint64_t address, uint32_t data); +bool xen_evtchn_deliver_pirq_msi(uint64_t address, uint32_t data); + /* * These functions mirror the libxenevtchn library API, providing the QEMU |