diff options
Diffstat (limited to 'system/xen/xsa')
23 files changed, 3217 insertions, 0 deletions
diff --git a/system/xen/xsa/xsa333.patch b/system/xen/xsa/xsa333.patch new file mode 100644 index 0000000000000..6b86c942faa07 --- /dev/null +++ b/system/xen/xsa/xsa333.patch @@ -0,0 +1,39 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86/pv: Handle the Intel-specific MSR_MISC_ENABLE correctly + +This MSR doesn't exist on AMD hardware, and switching away from the safe +functions in the common MSR path was an erroneous change. + +Partially revert the change. + +This is XSA-333. + +Fixes: 4fdc932b3cc ("x86/Intel: drop another 32-bit leftover") +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Wei Liu <wl@xen.org> + +diff --git a/xen/arch/x86/pv/emul-priv-op.c b/xen/arch/x86/pv/emul-priv-op.c +index efeb2a727e..6332c74b80 100644 +--- a/xen/arch/x86/pv/emul-priv-op.c ++++ b/xen/arch/x86/pv/emul-priv-op.c +@@ -924,7 +924,8 @@ static int read_msr(unsigned int reg, uint64_t *val, + return X86EMUL_OKAY; + + case MSR_IA32_MISC_ENABLE: +- rdmsrl(reg, *val); ++ if ( rdmsr_safe(reg, *val) ) ++ break; + *val = guest_misc_enable(*val); + return X86EMUL_OKAY; + +@@ -1059,7 +1060,8 @@ static int write_msr(unsigned int reg, uint64_t val, + break; + + case MSR_IA32_MISC_ENABLE: +- rdmsrl(reg, temp); ++ if ( rdmsr_safe(reg, temp) ) ++ break; + if ( val != guest_misc_enable(temp) ) + goto invalid; + return X86EMUL_OKAY; diff --git a/system/xen/xsa/xsa334.patch b/system/xen/xsa/xsa334.patch new file mode 100644 index 0000000000000..4260cdb2b2db8 --- /dev/null +++ b/system/xen/xsa/xsa334.patch @@ -0,0 +1,51 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: xen/memory: Don't skip the RCU unlock path in acquire_resource() + +In the case that an HVM Stubdomain makes an XENMEM_acquire_resource hypercall, +the FIXME path will bypass rcu_unlock_domain() on the way out of the function. + +Move the check to the start of the function. This does change the behaviour +of the get-size path for HVM Stubdomains, but that functionality is currently +broken and unused anyway, as well as being quite useless to entities which +can't actually map the resource anyway. + +This is XSA-334. + +Fixes: 83fa6552ce ("common: add a new mappable resource type: XENMEM_resource_grant_table") +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> + +diff --git a/xen/common/memory.c b/xen/common/memory.c +index 1a3c9ffb30..29741d8904 100644 +--- a/xen/common/memory.c ++++ b/xen/common/memory.c +@@ -1058,6 +1058,14 @@ static int acquire_resource( + xen_pfn_t mfn_list[32]; + int rc; + ++ /* ++ * FIXME: Until foreign pages inserted into the P2M are properly ++ * reference counted, it is unsafe to allow mapping of ++ * resource pages unless the caller is the hardware domain. ++ */ ++ if ( paging_mode_translate(currd) && !is_hardware_domain(currd) ) ++ return -EACCES; ++ + if ( copy_from_guest(&xmar, arg, 1) ) + return -EFAULT; + +@@ -1114,14 +1122,6 @@ static int acquire_resource( + xen_pfn_t gfn_list[ARRAY_SIZE(mfn_list)]; + unsigned int i; + +- /* +- * FIXME: Until foreign pages inserted into the P2M are properly +- * reference counted, it is unsafe to allow mapping of +- * resource pages unless the caller is the hardware domain. +- */ +- if ( !is_hardware_domain(currd) ) +- return -EACCES; +- + if ( copy_from_guest(gfn_list, xmar.frame_list, xmar.nr_frames) ) + rc = -EFAULT; + diff --git a/system/xen/xsa/xsa335-qemu.patch b/system/xen/xsa/xsa335-qemu.patch new file mode 100644 index 0000000000000..5617502359adf --- /dev/null +++ b/system/xen/xsa/xsa335-qemu.patch @@ -0,0 +1,84 @@ +From c5bd2924c6d6a5bcbffb8b5e7798a88970131c07 Mon Sep 17 00:00:00 2001 +From: Gerd Hoffmann <kraxel@redhat.com> +Date: Mon, 17 Aug 2020 08:34:22 +0200 +Subject: [PATCH] usb: fix setup_len init (CVE-2020-14364) + +Store calculated setup_len in a local variable, verify it, and only +write it to the struct (USBDevice->setup_len) in case it passed the +sanity checks. + +This prevents other code (do_token_{in,out} functions specifically) +from working with invalid USBDevice->setup_len values and overrunning +the USBDevice->setup_buf[] buffer. + +Fixes: CVE-2020-14364 +Signed-off-by: Gerd Hoffmann <kraxel@redhat.com> +--- + hw/usb/core.c | 16 ++++++++++------ + 1 file changed, 10 insertions(+), 6 deletions(-) + +diff --git a/hw/usb/core.c b/hw/usb/core.c +index 5abd128b6bc5..5234dcc73fea 100644 +--- a/hw/usb/core.c ++++ b/hw/usb/core.c +@@ -129,6 +129,7 @@ void usb_wakeup(USBEndpoint *ep, unsigned int stream) + static void do_token_setup(USBDevice *s, USBPacket *p) + { + int request, value, index; ++ unsigned int setup_len; + + if (p->iov.size != 8) { + p->status = USB_RET_STALL; +@@ -138,14 +139,15 @@ static void do_token_setup(USBDevice *s, USBPacket *p) + usb_packet_copy(p, s->setup_buf, p->iov.size); + s->setup_index = 0; + p->actual_length = 0; +- s->setup_len = (s->setup_buf[7] << 8) | s->setup_buf[6]; +- if (s->setup_len > sizeof(s->data_buf)) { ++ setup_len = (s->setup_buf[7] << 8) | s->setup_buf[6]; ++ if (setup_len > sizeof(s->data_buf)) { + fprintf(stderr, + "usb_generic_handle_packet: ctrl buffer too small (%d > %zu)\n", +- s->setup_len, sizeof(s->data_buf)); ++ setup_len, sizeof(s->data_buf)); + p->status = USB_RET_STALL; + return; + } ++ s->setup_len = setup_len; + + request = (s->setup_buf[0] << 8) | s->setup_buf[1]; + value = (s->setup_buf[3] << 8) | s->setup_buf[2]; +@@ -259,26 +261,28 @@ static void do_token_out(USBDevice *s, USBPacket *p) + static void do_parameter(USBDevice *s, USBPacket *p) + { + int i, request, value, index; ++ unsigned int setup_len; + + for (i = 0; i < 8; i++) { + s->setup_buf[i] = p->parameter >> (i*8); + } + + s->setup_state = SETUP_STATE_PARAM; +- s->setup_len = (s->setup_buf[7] << 8) | s->setup_buf[6]; + s->setup_index = 0; + + request = (s->setup_buf[0] << 8) | s->setup_buf[1]; + value = (s->setup_buf[3] << 8) | s->setup_buf[2]; + index = (s->setup_buf[5] << 8) | s->setup_buf[4]; + +- if (s->setup_len > sizeof(s->data_buf)) { ++ setup_len = (s->setup_buf[7] << 8) | s->setup_buf[6]; ++ if (setup_len > sizeof(s->data_buf)) { + fprintf(stderr, + "usb_generic_handle_packet: ctrl buffer too small (%d > %zu)\n", +- s->setup_len, sizeof(s->data_buf)); ++ setup_len, sizeof(s->data_buf)); + p->status = USB_RET_STALL; + return; + } ++ s->setup_len = setup_len; + + if (p->pid == USB_TOKEN_OUT) { + usb_packet_copy(p, s->data_buf, s->setup_len); +-- +2.18.4 diff --git a/system/xen/xsa/xsa336.patch b/system/xen/xsa/xsa336.patch new file mode 100644 index 0000000000000..b44c298b70b70 --- /dev/null +++ b/system/xen/xsa/xsa336.patch @@ -0,0 +1,283 @@ +From: Roger Pau Monné <roger.pau@citrix.com> +Subject: x86/vpt: fix race when migrating timers between vCPUs + +The current vPT code will migrate the emulated timers between vCPUs +(change the pt->vcpu field) while just holding the destination lock, +either from create_periodic_time or pt_adjust_global_vcpu_target if +the global target is adjusted. Changing the periodic_timer vCPU field +in this way creates a race where a third party could grab the lock in +the unlocked region of pt_adjust_global_vcpu_target (or before +create_periodic_time performs the vcpu change) and then release the +lock from a different vCPU, creating a locking imbalance. + +Introduce a per-domain rwlock in order to protect periodic_time +migration between vCPU lists. Taking the lock in read mode prevents +any timer from being migrated to a different vCPU, while taking it in +write mode allows performing migration of timers across vCPUs. The +per-vcpu locks are still used to protect all the other fields from the +periodic_timer struct. + +Note that such migration shouldn't happen frequently, and hence +there's no performance drop as a result of such locking. + +This is XSA-336. + +Reported-by: Igor Druzhinin <igor.druzhinin@citrix.com> +Tested-by: Igor Druzhinin <igor.druzhinin@citrix.com> +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +--- +Changes since v2: + - Re-order pt_adjust_vcpu to remove one if. + - Fix pt_lock to not call pt_vcpu_lock, as we might end up using a + stale value of pt->vcpu when taking the per-vcpu lock. + +Changes since v1: + - Use a per-domain rwlock to protect timer vCPU migration. + +--- a/xen/arch/x86/hvm/hvm.c ++++ b/xen/arch/x86/hvm/hvm.c +@@ -658,6 +658,8 @@ int hvm_domain_initialise(struct domain + /* need link to containing domain */ + d->arch.hvm.pl_time->domain = d; + ++ rwlock_init(&d->arch.hvm.pl_time->pt_migrate); ++ + /* Set the default IO Bitmap. */ + if ( is_hardware_domain(d) ) + { +--- a/xen/arch/x86/hvm/vpt.c ++++ b/xen/arch/x86/hvm/vpt.c +@@ -153,23 +153,32 @@ static int pt_irq_masked(struct periodic + return 1; + } + +-static void pt_lock(struct periodic_time *pt) ++static void pt_vcpu_lock(struct vcpu *v) + { +- struct vcpu *v; ++ read_lock(&v->domain->arch.hvm.pl_time->pt_migrate); ++ spin_lock(&v->arch.hvm.tm_lock); ++} + +- for ( ; ; ) +- { +- v = pt->vcpu; +- spin_lock(&v->arch.hvm.tm_lock); +- if ( likely(pt->vcpu == v) ) +- break; +- spin_unlock(&v->arch.hvm.tm_lock); +- } ++static void pt_vcpu_unlock(struct vcpu *v) ++{ ++ spin_unlock(&v->arch.hvm.tm_lock); ++ read_unlock(&v->domain->arch.hvm.pl_time->pt_migrate); ++} ++ ++static void pt_lock(struct periodic_time *pt) ++{ ++ /* ++ * We cannot use pt_vcpu_lock here, because we need to acquire the ++ * per-domain lock first and then (re-)fetch the value of pt->vcpu, or ++ * else we might be using a stale value of pt->vcpu. ++ */ ++ read_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate); ++ spin_lock(&pt->vcpu->arch.hvm.tm_lock); + } + + static void pt_unlock(struct periodic_time *pt) + { +- spin_unlock(&pt->vcpu->arch.hvm.tm_lock); ++ pt_vcpu_unlock(pt->vcpu); + } + + static void pt_process_missed_ticks(struct periodic_time *pt) +@@ -219,7 +228,7 @@ void pt_save_timer(struct vcpu *v) + if ( v->pause_flags & VPF_blocked ) + return; + +- spin_lock(&v->arch.hvm.tm_lock); ++ pt_vcpu_lock(v); + + list_for_each_entry ( pt, head, list ) + if ( !pt->do_not_freeze ) +@@ -227,7 +236,7 @@ void pt_save_timer(struct vcpu *v) + + pt_freeze_time(v); + +- spin_unlock(&v->arch.hvm.tm_lock); ++ pt_vcpu_unlock(v); + } + + void pt_restore_timer(struct vcpu *v) +@@ -235,7 +244,7 @@ void pt_restore_timer(struct vcpu *v) + struct list_head *head = &v->arch.hvm.tm_list; + struct periodic_time *pt; + +- spin_lock(&v->arch.hvm.tm_lock); ++ pt_vcpu_lock(v); + + list_for_each_entry ( pt, head, list ) + { +@@ -248,7 +257,7 @@ void pt_restore_timer(struct vcpu *v) + + pt_thaw_time(v); + +- spin_unlock(&v->arch.hvm.tm_lock); ++ pt_vcpu_unlock(v); + } + + static void pt_timer_fn(void *data) +@@ -309,7 +318,7 @@ int pt_update_irq(struct vcpu *v) + int irq, pt_vector = -1; + bool level; + +- spin_lock(&v->arch.hvm.tm_lock); ++ pt_vcpu_lock(v); + + earliest_pt = NULL; + max_lag = -1ULL; +@@ -339,7 +348,7 @@ int pt_update_irq(struct vcpu *v) + + if ( earliest_pt == NULL ) + { +- spin_unlock(&v->arch.hvm.tm_lock); ++ pt_vcpu_unlock(v); + return -1; + } + +@@ -347,7 +356,7 @@ int pt_update_irq(struct vcpu *v) + irq = earliest_pt->irq; + level = earliest_pt->level; + +- spin_unlock(&v->arch.hvm.tm_lock); ++ pt_vcpu_unlock(v); + + switch ( earliest_pt->source ) + { +@@ -394,7 +403,7 @@ int pt_update_irq(struct vcpu *v) + time_cb *cb = NULL; + void *cb_priv; + +- spin_lock(&v->arch.hvm.tm_lock); ++ pt_vcpu_lock(v); + /* Make sure the timer is still on the list. */ + list_for_each_entry ( pt, &v->arch.hvm.tm_list, list ) + if ( pt == earliest_pt ) +@@ -404,7 +413,7 @@ int pt_update_irq(struct vcpu *v) + cb_priv = pt->priv; + break; + } +- spin_unlock(&v->arch.hvm.tm_lock); ++ pt_vcpu_unlock(v); + + if ( cb != NULL ) + cb(v, cb_priv); +@@ -441,12 +450,12 @@ void pt_intr_post(struct vcpu *v, struct + if ( intack.source == hvm_intsrc_vector ) + return; + +- spin_lock(&v->arch.hvm.tm_lock); ++ pt_vcpu_lock(v); + + pt = is_pt_irq(v, intack); + if ( pt == NULL ) + { +- spin_unlock(&v->arch.hvm.tm_lock); ++ pt_vcpu_unlock(v); + return; + } + +@@ -455,7 +464,7 @@ void pt_intr_post(struct vcpu *v, struct + cb = pt->cb; + cb_priv = pt->priv; + +- spin_unlock(&v->arch.hvm.tm_lock); ++ pt_vcpu_unlock(v); + + if ( cb != NULL ) + cb(v, cb_priv); +@@ -466,12 +475,12 @@ void pt_migrate(struct vcpu *v) + struct list_head *head = &v->arch.hvm.tm_list; + struct periodic_time *pt; + +- spin_lock(&v->arch.hvm.tm_lock); ++ pt_vcpu_lock(v); + + list_for_each_entry ( pt, head, list ) + migrate_timer(&pt->timer, v->processor); + +- spin_unlock(&v->arch.hvm.tm_lock); ++ pt_vcpu_unlock(v); + } + + void create_periodic_time( +@@ -490,7 +499,7 @@ void create_periodic_time( + + destroy_periodic_time(pt); + +- spin_lock(&v->arch.hvm.tm_lock); ++ write_lock(&v->domain->arch.hvm.pl_time->pt_migrate); + + pt->pending_intr_nr = 0; + pt->do_not_freeze = 0; +@@ -540,7 +549,7 @@ void create_periodic_time( + init_timer(&pt->timer, pt_timer_fn, pt, v->processor); + set_timer(&pt->timer, pt->scheduled); + +- spin_unlock(&v->arch.hvm.tm_lock); ++ write_unlock(&v->domain->arch.hvm.pl_time->pt_migrate); + } + + void destroy_periodic_time(struct periodic_time *pt) +@@ -565,30 +574,20 @@ void destroy_periodic_time(struct period + + static void pt_adjust_vcpu(struct periodic_time *pt, struct vcpu *v) + { +- int on_list; +- + ASSERT(pt->source == PTSRC_isa || pt->source == PTSRC_ioapic); + + if ( pt->vcpu == NULL ) + return; + +- pt_lock(pt); +- on_list = pt->on_list; +- if ( pt->on_list ) +- list_del(&pt->list); +- pt->on_list = 0; +- pt_unlock(pt); +- +- spin_lock(&v->arch.hvm.tm_lock); ++ write_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate); + pt->vcpu = v; +- if ( on_list ) ++ if ( pt->on_list ) + { +- pt->on_list = 1; ++ list_del(&pt->list); + list_add(&pt->list, &v->arch.hvm.tm_list); +- + migrate_timer(&pt->timer, v->processor); + } +- spin_unlock(&v->arch.hvm.tm_lock); ++ write_unlock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate); + } + + void pt_adjust_global_vcpu_target(struct vcpu *v) +--- a/xen/include/asm-x86/hvm/vpt.h ++++ b/xen/include/asm-x86/hvm/vpt.h +@@ -128,6 +128,13 @@ struct pl_time { /* platform time */ + struct RTCState vrtc; + struct HPETState vhpet; + struct PMTState vpmt; ++ /* ++ * rwlock to prevent periodic_time vCPU migration. Take the lock in read ++ * mode in order to prevent the vcpu field of periodic_time from changing. ++ * Lock must be taken in write mode when changes to the vcpu field are ++ * performed, as it allows exclusive access to all the timers of a domain. ++ */ ++ rwlock_t pt_migrate; + /* guest_time = Xen sys time + stime_offset */ + int64_t stime_offset; + /* Ensures monotonicity in appropriate timer modes. */ diff --git a/system/xen/xsa/xsa337-4.13-1.patch b/system/xen/xsa/xsa337-4.13-1.patch new file mode 100644 index 0000000000000..2091626f4f536 --- /dev/null +++ b/system/xen/xsa/xsa337-4.13-1.patch @@ -0,0 +1,87 @@ +From: Roger Pau Monné <roger.pau@citrix.com> +Subject: x86/msi: get rid of read_msi_msg + +It's safer and faster to just use the cached last written +(untranslated) MSI message stored in msi_desc for the single user that +calls read_msi_msg. + +This also prevents relying on the data read from the device MSI +registers in order to figure out the index into the IOMMU interrupt +remapping table, which is not safe. + +This is part of XSA-337. + +Reported-by: Andrew Cooper <andrew.cooper3@citrix.com> +Requested-by: Andrew Cooper <andrew.cooper3@citrix.com> +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> + +--- a/xen/arch/x86/msi.c ++++ b/xen/arch/x86/msi.c +@@ -183,54 +183,6 @@ void msi_compose_msg(unsigned vector, co + MSI_DATA_VECTOR(vector); + } + +-static bool read_msi_msg(struct msi_desc *entry, struct msi_msg *msg) +-{ +- switch ( entry->msi_attrib.type ) +- { +- case PCI_CAP_ID_MSI: +- { +- struct pci_dev *dev = entry->dev; +- int pos = entry->msi_attrib.pos; +- uint16_t data; +- +- msg->address_lo = pci_conf_read32(dev->sbdf, +- msi_lower_address_reg(pos)); +- if ( entry->msi_attrib.is_64 ) +- { +- msg->address_hi = pci_conf_read32(dev->sbdf, +- msi_upper_address_reg(pos)); +- data = pci_conf_read16(dev->sbdf, msi_data_reg(pos, 1)); +- } +- else +- { +- msg->address_hi = 0; +- data = pci_conf_read16(dev->sbdf, msi_data_reg(pos, 0)); +- } +- msg->data = data; +- break; +- } +- case PCI_CAP_ID_MSIX: +- { +- void __iomem *base = entry->mask_base; +- +- if ( unlikely(!msix_memory_decoded(entry->dev, +- entry->msi_attrib.pos)) ) +- return false; +- msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); +- msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); +- msg->data = readl(base + PCI_MSIX_ENTRY_DATA_OFFSET); +- break; +- } +- default: +- BUG(); +- } +- +- if ( iommu_intremap ) +- iommu_read_msi_from_ire(entry, msg); +- +- return true; +-} +- + static int write_msi_msg(struct msi_desc *entry, struct msi_msg *msg) + { + entry->msg = *msg; +@@ -302,10 +254,7 @@ void set_msi_affinity(struct irq_desc *d + + ASSERT(spin_is_locked(&desc->lock)); + +- memset(&msg, 0, sizeof(msg)); +- if ( !read_msi_msg(msi_desc, &msg) ) +- return; +- ++ msg = msi_desc->msg; + msg.data &= ~MSI_DATA_VECTOR_MASK; + msg.data |= MSI_DATA_VECTOR(desc->arch.vector); + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; diff --git a/system/xen/xsa/xsa337-4.13-2.patch b/system/xen/xsa/xsa337-4.13-2.patch new file mode 100644 index 0000000000000..bdefd37cdc660 --- /dev/null +++ b/system/xen/xsa/xsa337-4.13-2.patch @@ -0,0 +1,181 @@ +From: Jan Beulich <jbeulich@suse.com> +Subject: x86/MSI-X: restrict reading of table/PBA bases from BARs + +When assigned to less trusted or un-trusted guests, devices may change +state behind our backs (they may e.g. get reset by means we may not know +about). Therefore we should avoid reading BARs from hardware once a +device is no longer owned by Dom0. Furthermore when we can't read a BAR, +or when we read zero, we shouldn't instead use the caller provided +address unless that caller can be trusted. + +Re-arrange the logic in msix_capability_init() such that only Dom0 (and +only if the device isn't DomU-owned yet) or calls through +PHYSDEVOP_prepare_msix will actually result in the reading of the +respective BAR register(s). Additionally do so only as long as in-use +table entries are known (note that invocation of PHYSDEVOP_prepare_msix +counts as a "pseudo" entry). In all other uses the value already +recorded will get used instead. + +Clear the recorded values in _pci_cleanup_msix() as well as on the one +affected error path. (Adjust this error path to also avoid blindly +disabling MSI-X when it was enabled on entry to the function.) + +While moving around variable declarations (in many cases to reduce their +scopes), also adjust some of their types. + +This is part of XSA-337. + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> + +--- a/xen/arch/x86/msi.c ++++ b/xen/arch/x86/msi.c +@@ -769,16 +769,14 @@ static int msix_capability_init(struct p + { + struct arch_msix *msix = dev->msix; + struct msi_desc *entry = NULL; +- int vf; + u16 control; + u64 table_paddr; + u32 table_offset; +- u8 bir, pbus, pslot, pfunc; + u16 seg = dev->seg; + u8 bus = dev->bus; + u8 slot = PCI_SLOT(dev->devfn); + u8 func = PCI_FUNC(dev->devfn); +- bool maskall = msix->host_maskall; ++ bool maskall = msix->host_maskall, zap_on_error = false; + unsigned int pos = pci_find_cap_offset(seg, bus, slot, func, + PCI_CAP_ID_MSIX); + +@@ -820,43 +818,45 @@ static int msix_capability_init(struct p + + /* Locate MSI-X table region */ + table_offset = pci_conf_read32(dev->sbdf, msix_table_offset_reg(pos)); +- bir = (u8)(table_offset & PCI_MSIX_BIRMASK); +- table_offset &= ~PCI_MSIX_BIRMASK; ++ if ( !msix->used_entries && ++ (!msi || ++ (is_hardware_domain(current->domain) && ++ (dev->domain == current->domain || dev->domain == dom_io))) ) ++ { ++ unsigned int bir = table_offset & PCI_MSIX_BIRMASK, pbus, pslot, pfunc; ++ int vf; ++ paddr_t pba_paddr; ++ unsigned int pba_offset; + +- if ( !dev->info.is_virtfn ) +- { +- pbus = bus; +- pslot = slot; +- pfunc = func; +- vf = -1; +- } +- else +- { +- pbus = dev->info.physfn.bus; +- pslot = PCI_SLOT(dev->info.physfn.devfn); +- pfunc = PCI_FUNC(dev->info.physfn.devfn); +- vf = PCI_BDF2(dev->bus, dev->devfn); +- } +- +- table_paddr = read_pci_mem_bar(seg, pbus, pslot, pfunc, bir, vf); +- WARN_ON(msi && msi->table_base != table_paddr); +- if ( !table_paddr ) +- { +- if ( !msi || !msi->table_base ) ++ if ( !dev->info.is_virtfn ) + { +- pci_conf_write16(dev->sbdf, msix_control_reg(pos), +- control & ~PCI_MSIX_FLAGS_ENABLE); +- xfree(entry); +- return -ENXIO; ++ pbus = bus; ++ pslot = slot; ++ pfunc = func; ++ vf = -1; ++ } ++ else ++ { ++ pbus = dev->info.physfn.bus; ++ pslot = PCI_SLOT(dev->info.physfn.devfn); ++ pfunc = PCI_FUNC(dev->info.physfn.devfn); ++ vf = PCI_BDF2(dev->bus, dev->devfn); + } +- table_paddr = msi->table_base; +- } +- table_paddr += table_offset; + +- if ( !msix->used_entries ) +- { +- u64 pba_paddr; +- u32 pba_offset; ++ table_paddr = read_pci_mem_bar(seg, pbus, pslot, pfunc, bir, vf); ++ WARN_ON(msi && msi->table_base != table_paddr); ++ if ( !table_paddr ) ++ { ++ if ( !msi || !msi->table_base ) ++ { ++ pci_conf_write16(dev->sbdf, msix_control_reg(pos), ++ control & ~PCI_MSIX_FLAGS_ENABLE); ++ xfree(entry); ++ return -ENXIO; ++ } ++ table_paddr = msi->table_base; ++ } ++ table_paddr += table_offset & ~PCI_MSIX_BIRMASK; + + msix->table.first = PFN_DOWN(table_paddr); + msix->table.last = PFN_DOWN(table_paddr + +@@ -875,7 +875,18 @@ static int msix_capability_init(struct p + BITS_TO_LONGS(msix->nr_entries) - 1); + WARN_ON(rangeset_overlaps_range(mmio_ro_ranges, msix->pba.first, + msix->pba.last)); ++ ++ zap_on_error = true; ++ } ++ else if ( !msix->table.first ) ++ { ++ pci_conf_write16(dev->sbdf, msix_control_reg(pos), control); ++ xfree(entry); ++ return -ENODATA; + } ++ else ++ table_paddr = (msix->table.first << PAGE_SHIFT) + ++ (table_offset & ~PCI_MSIX_BIRMASK & ~PAGE_MASK); + + if ( entry ) + { +@@ -886,8 +897,15 @@ static int msix_capability_init(struct p + + if ( idx < 0 ) + { +- pci_conf_write16(dev->sbdf, msix_control_reg(pos), +- control & ~PCI_MSIX_FLAGS_ENABLE); ++ if ( zap_on_error ) ++ { ++ msix->table.first = 0; ++ msix->pba.first = 0; ++ ++ control &= ~PCI_MSIX_FLAGS_ENABLE; ++ } ++ ++ pci_conf_write16(dev->sbdf, msix_control_reg(pos), control); + xfree(entry); + return idx; + } +@@ -1076,9 +1094,14 @@ static void _pci_cleanup_msix(struct arc + if ( rangeset_remove_range(mmio_ro_ranges, msix->table.first, + msix->table.last) ) + WARN(); ++ msix->table.first = 0; ++ msix->table.last = 0; ++ + if ( rangeset_remove_range(mmio_ro_ranges, msix->pba.first, + msix->pba.last) ) + WARN(); ++ msix->pba.first = 0; ++ msix->pba.last = 0; + } + } + diff --git a/system/xen/xsa/xsa338.patch b/system/xen/xsa/xsa338.patch new file mode 100644 index 0000000000000..776521990e7a3 --- /dev/null +++ b/system/xen/xsa/xsa338.patch @@ -0,0 +1,42 @@ +From: Jan Beulich <jbeulich@suse.com> +Subject: evtchn: relax port_is_valid() + +To avoid ports potentially becoming invalid behind the back of certain +other functions (due to ->max_evtchn shrinking) because of +- a guest invoking evtchn_reset() and from a 2nd vCPU opening new + channels in parallel (see also XSA-343), +- alloc_unbound_xen_event_channel() produced channels living above the + 2-level range (see also XSA-342), +drop the max_evtchns check from port_is_valid(). For a port for which +the function once returned "true", the returned value may not turn into +"false" later on. The function's result may only depend on bounds which +can only ever grow (which is the case for d->valid_evtchns). + +This also eliminates a false sense of safety, utilized by some of the +users (see again XSA-343): Without a suitable lock held, d->max_evtchns +may change at any time, and hence deducing that certain other operations +are safe when port_is_valid() returned true is not legitimate. The +opportunities to abuse this may get widened by the change here +(depending on guest and host configuration), but will be taken care of +by the other XSA. + +This is XSA-338. + +Fixes: 48974e6ce52e ("evtchn: use a per-domain variable for the max number of event channels") +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> +Reviewed-by: Julien Grall <jgrall@amazon.com> +--- +v5: New, split from larger patch. + +--- a/xen/include/xen/event.h ++++ b/xen/include/xen/event.h +@@ -107,8 +107,6 @@ void notify_via_xen_event_channel(struct + + static inline bool_t port_is_valid(struct domain *d, unsigned int p) + { +- if ( p >= d->max_evtchns ) +- return 0; + return p < read_atomic(&d->valid_evtchns); + } + diff --git a/system/xen/xsa/xsa339.patch b/system/xen/xsa/xsa339.patch new file mode 100644 index 0000000000000..3311ae093fd67 --- /dev/null +++ b/system/xen/xsa/xsa339.patch @@ -0,0 +1,76 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86/pv: Avoid double exception injection + +There is at least one path (SYSENTER with NT set, Xen converts to #GP) which +ends up injecting the #GP fault twice, first in compat_sysenter(), and then a +second time in compat_test_all_events(), due to the stale TBF_EXCEPTION left +in TRAPBOUNCE_flags. + +The guest kernel sees the second fault first, which is a kernel level #GP +pointing at the head of the #GP handler, and is therefore a userspace +trigger-able DoS. + +This particular bug has bitten us several times before, so rearrange +{compat_,}create_bounce_frame() to clobber TRAPBOUNCE on success, rather than +leaving this task to one area of code which isn't used uniformly. + +Other scenarios which might result in a double injection (e.g. two calls +directly to compat_create_bounce_frame) will now crash the guest, which is far +more obvious than letting the kernel run with corrupt state. + +This is XSA-339 + +Fixes: fdac9515607b ("x86: clear EFLAGS.NT in SYSENTER entry path") +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> + +diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S +index c3e62f8734..73619f57ca 100644 +--- a/xen/arch/x86/x86_64/compat/entry.S ++++ b/xen/arch/x86/x86_64/compat/entry.S +@@ -78,7 +78,6 @@ compat_process_softirqs: + sti + .Lcompat_bounce_exception: + call compat_create_bounce_frame +- movb $0, TRAPBOUNCE_flags(%rdx) + jmp compat_test_all_events + + ALIGN +@@ -352,7 +351,13 @@ __UNLIKELY_END(compat_bounce_null_selector) + movl %eax,UREGS_cs+8(%rsp) + movl TRAPBOUNCE_eip(%rdx),%eax + movl %eax,UREGS_rip+8(%rsp) ++ ++ /* Trapbounce complete. Clobber state to avoid an erroneous second injection. */ ++ xor %eax, %eax ++ mov %ax, TRAPBOUNCE_cs(%rdx) ++ mov %al, TRAPBOUNCE_flags(%rdx) + ret ++ + .section .fixup,"ax" + .Lfx13: + xorl %edi,%edi +diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S +index 1e880eb9f6..71a00e846b 100644 +--- a/xen/arch/x86/x86_64/entry.S ++++ b/xen/arch/x86/x86_64/entry.S +@@ -90,7 +90,6 @@ process_softirqs: + sti + .Lbounce_exception: + call create_bounce_frame +- movb $0, TRAPBOUNCE_flags(%rdx) + jmp test_all_events + + ALIGN +@@ -512,6 +511,11 @@ UNLIKELY_START(z, create_bounce_frame_bad_bounce_ip) + jmp asm_domain_crash_synchronous /* Does not return */ + __UNLIKELY_END(create_bounce_frame_bad_bounce_ip) + movq %rax,UREGS_rip+8(%rsp) ++ ++ /* Trapbounce complete. Clobber state to avoid an erroneous second injection. */ ++ xor %eax, %eax ++ mov %rax, TRAPBOUNCE_eip(%rdx) ++ mov %al, TRAPBOUNCE_flags(%rdx) + ret + + .pushsection .fixup, "ax", @progbits diff --git a/system/xen/xsa/xsa340.patch b/system/xen/xsa/xsa340.patch new file mode 100644 index 0000000000000..38d04da4650a3 --- /dev/null +++ b/system/xen/xsa/xsa340.patch @@ -0,0 +1,65 @@ +From: Julien Grall <jgrall@amazon.com> +Subject: xen/evtchn: Add missing barriers when accessing/allocating an event channel + +While the allocation of a bucket is always performed with the per-domain +lock, the bucket may be accessed without the lock taken (for instance, see +evtchn_send()). + +Instead such sites relies on port_is_valid() to return a non-zero value +when the port has a struct evtchn associated to it. The function will +mostly check whether the port is less than d->valid_evtchns as all the +buckets/event channels should be allocated up to that point. + +Unfortunately a compiler is free to re-order the assignment in +evtchn_allocate_port() so it would be possible to have d->valid_evtchns +updated before the new bucket has finish to allocate. + +Additionally on Arm, even if this was compiled "correctly", the +processor can still re-order the memory access. + +Add a write memory barrier in the allocation side and a read memory +barrier when the port is valid to prevent any re-ordering issue. + +This is XSA-340. + +Reported-by: Julien Grall <jgrall@amazon.com> +Signed-off-by: Julien Grall <jgrall@amazon.com> +Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> + +--- a/xen/common/event_channel.c ++++ b/xen/common/event_channel.c +@@ -178,6 +178,13 @@ int evtchn_allocate_port(struct domain * + return -ENOMEM; + bucket_from_port(d, port) = chn; + ++ /* ++ * d->valid_evtchns is used to check whether the bucket can be ++ * accessed without the per-domain lock. Therefore, ++ * d->valid_evtchns should be seen *after* the new bucket has ++ * been setup. ++ */ ++ smp_wmb(); + write_atomic(&d->valid_evtchns, d->valid_evtchns + EVTCHNS_PER_BUCKET); + } + +--- a/xen/include/xen/event.h ++++ b/xen/include/xen/event.h +@@ -107,7 +107,17 @@ void notify_via_xen_event_channel(struct + + static inline bool_t port_is_valid(struct domain *d, unsigned int p) + { +- return p < read_atomic(&d->valid_evtchns); ++ if ( p >= read_atomic(&d->valid_evtchns) ) ++ return false; ++ ++ /* ++ * The caller will usually access the event channel afterwards and ++ * may be done without taking the per-domain lock. The barrier is ++ * going in pair the smp_wmb() barrier in evtchn_allocate_port(). ++ */ ++ smp_rmb(); ++ ++ return true; + } + + static inline struct evtchn *evtchn_from_port(struct domain *d, unsigned int p) diff --git a/system/xen/xsa/xsa342-4.13.patch b/system/xen/xsa/xsa342-4.13.patch new file mode 100644 index 0000000000000..334baf1b69c0d --- /dev/null +++ b/system/xen/xsa/xsa342-4.13.patch @@ -0,0 +1,145 @@ +From: Jan Beulich <jbeulich@suse.com> +Subject: evtchn/x86: enforce correct upper limit for 32-bit guests + +The recording of d->max_evtchns in evtchn_2l_init(), in particular with +the limited set of callers of the function, is insufficient. Neither for +PV nor for HVM guests the bitness is known at domain_create() time, yet +the upper bound in 2-level mode depends upon guest bitness. Recording +too high a limit "allows" x86 32-bit domains to open not properly usable +event channels, management of which (inside Xen) would then result in +corruption of the shared info and vCPU info structures. + +Keep the upper limit dynamic for the 2-level case, introducing a helper +function to retrieve the effective limit. This helper is now supposed to +be private to the event channel code. The used in do_poll() and +domain_dump_evtchn_info() weren't consistent with port uses elsewhere +and hence get switched to port_is_valid(). + +Furthermore FIFO mode's setup_ports() gets adjusted to loop only up to +the prior ABI limit, rather than all the way up to the new one. + +Finally a word on the change to do_poll(): Accessing ->max_evtchns +without holding a suitable lock was never safe, as it as well as +->evtchn_port_ops may change behind do_poll()'s back. Using +port_is_valid() instead widens some the window for potential abuse, +until we've dealt with the race altogether (see XSA-343). + +This is XSA-342. + +Reported-by: Julien Grall <jgrall@amazon.com> +Fixes: 48974e6ce52e ("evtchn: use a per-domain variable for the max number of event channels") +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> +Reviewed-by: Julien Grall <jgrall@amazon.com> + +--- a/xen/common/event_2l.c ++++ b/xen/common/event_2l.c +@@ -103,7 +103,6 @@ static const struct evtchn_port_ops evtc + void evtchn_2l_init(struct domain *d) + { + d->evtchn_port_ops = &evtchn_port_ops_2l; +- d->max_evtchns = BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d); + } + + /* +--- a/xen/common/event_channel.c ++++ b/xen/common/event_channel.c +@@ -151,7 +151,7 @@ static void free_evtchn_bucket(struct do + + int evtchn_allocate_port(struct domain *d, evtchn_port_t port) + { +- if ( port > d->max_evtchn_port || port >= d->max_evtchns ) ++ if ( port > d->max_evtchn_port || port >= max_evtchns(d) ) + return -ENOSPC; + + if ( port_is_valid(d, port) ) +@@ -1396,13 +1396,11 @@ static void domain_dump_evtchn_info(stru + + spin_lock(&d->event_lock); + +- for ( port = 1; port < d->max_evtchns; ++port ) ++ for ( port = 1; port_is_valid(d, port); ++port ) + { + const struct evtchn *chn; + char *ssid; + +- if ( !port_is_valid(d, port) ) +- continue; + chn = evtchn_from_port(d, port); + if ( chn->state == ECS_FREE ) + continue; +--- a/xen/common/event_fifo.c ++++ b/xen/common/event_fifo.c +@@ -478,7 +478,7 @@ static void cleanup_event_array(struct d + d->evtchn_fifo = NULL; + } + +-static void setup_ports(struct domain *d) ++static void setup_ports(struct domain *d, unsigned int prev_evtchns) + { + unsigned int port; + +@@ -488,7 +488,7 @@ static void setup_ports(struct domain *d + * - save its pending state. + * - set default priority. + */ +- for ( port = 1; port < d->max_evtchns; port++ ) ++ for ( port = 1; port < prev_evtchns; port++ ) + { + struct evtchn *evtchn; + +@@ -546,6 +546,8 @@ int evtchn_fifo_init_control(struct evtc + if ( !d->evtchn_fifo ) + { + struct vcpu *vcb; ++ /* Latch the value before it changes during setup_event_array(). */ ++ unsigned int prev_evtchns = max_evtchns(d); + + for_each_vcpu ( d, vcb ) { + rc = setup_control_block(vcb); +@@ -562,8 +564,7 @@ int evtchn_fifo_init_control(struct evtc + goto error; + + d->evtchn_port_ops = &evtchn_port_ops_fifo; +- d->max_evtchns = EVTCHN_FIFO_NR_CHANNELS; +- setup_ports(d); ++ setup_ports(d, prev_evtchns); + } + else + rc = map_control_block(v, gfn, offset); +--- a/xen/common/schedule.c ++++ b/xen/common/schedule.c +@@ -1434,7 +1434,7 @@ static long do_poll(struct sched_poll *s + goto out; + + rc = -EINVAL; +- if ( port >= d->max_evtchns ) ++ if ( !port_is_valid(d, port) ) + goto out; + + rc = 0; +--- a/xen/include/xen/event.h ++++ b/xen/include/xen/event.h +@@ -105,6 +105,12 @@ void notify_via_xen_event_channel(struct + #define bucket_from_port(d, p) \ + ((group_from_port(d, p))[((p) % EVTCHNS_PER_GROUP) / EVTCHNS_PER_BUCKET]) + ++static inline unsigned int max_evtchns(const struct domain *d) ++{ ++ return d->evtchn_fifo ? EVTCHN_FIFO_NR_CHANNELS ++ : BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d); ++} ++ + static inline bool_t port_is_valid(struct domain *d, unsigned int p) + { + if ( p >= read_atomic(&d->valid_evtchns) ) +--- a/xen/include/xen/sched.h ++++ b/xen/include/xen/sched.h +@@ -382,7 +382,6 @@ struct domain + /* Event channel information. */ + struct evtchn *evtchn; /* first bucket only */ + struct evtchn **evtchn_group[NR_EVTCHN_GROUPS]; /* all other buckets */ +- unsigned int max_evtchns; /* number supported by ABI */ + unsigned int max_evtchn_port; /* max permitted port number */ + unsigned int valid_evtchns; /* number of allocated event channels */ + spinlock_t event_lock; diff --git a/system/xen/xsa/xsa343-1.patch b/system/xen/xsa/xsa343-1.patch new file mode 100644 index 0000000000000..0abbc03e8d6ac --- /dev/null +++ b/system/xen/xsa/xsa343-1.patch @@ -0,0 +1,199 @@ +From: Jan Beulich <jbeulich@suse.com> +Subject: evtchn: evtchn_reset() shouldn't succeed with still-open ports + +While the function closes all ports, it does so without holding any +lock, and hence racing requests may be issued causing new ports to get +opened. This would have been problematic in particular if such a newly +opened port had a port number above the new implementation limit (i.e. +when switching from FIFO to 2-level) after the reset, as prior to +"evtchn: relax port_is_valid()" this could have led to e.g. +evtchn_close()'s "BUG_ON(!port_is_valid(d2, port2))" to trigger. + +Introduce a counter of active ports and check that it's (still) no +larger then the number of Xen internally used ones after obtaining the +necessary lock in evtchn_reset(). + +As to the access model of the new {active,xen}_evtchns fields - while +all writes get done using write_atomic(), reads ought to use +read_atomic() only when outside of a suitably locked region. + +Note that as of now evtchn_bind_virq() and evtchn_bind_ipi() don't have +a need to call check_free_port(). + +This is part of XSA-343. + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> +Reviewed-by: Julien Grall <jgrall@amazon.com> +--- +v7: Drop optimization from evtchn_reset(). +v6: Fix loop exit condition in evtchn_reset(). Use {read,write}_atomic() + also for xen_evtchns. +v5: Move increment in alloc_unbound_xen_event_channel() out of the inner + locked region. +v4: Account for Xen internal ports. +v3: Document intended access next to new struct field. +v2: Add comment to check_free_port(). Drop commented out calls. + +--- a/xen/common/event_channel.c ++++ b/xen/common/event_channel.c +@@ -188,6 +188,8 @@ int evtchn_allocate_port(struct domain * + write_atomic(&d->valid_evtchns, d->valid_evtchns + EVTCHNS_PER_BUCKET); + } + ++ write_atomic(&d->active_evtchns, d->active_evtchns + 1); ++ + return 0; + } + +@@ -211,11 +213,26 @@ static int get_free_port(struct domain * + return -ENOSPC; + } + ++/* ++ * Check whether a port is still marked free, and if so update the domain ++ * counter accordingly. To be used on function exit paths. ++ */ ++static void check_free_port(struct domain *d, evtchn_port_t port) ++{ ++ if ( port_is_valid(d, port) && ++ evtchn_from_port(d, port)->state == ECS_FREE ) ++ write_atomic(&d->active_evtchns, d->active_evtchns - 1); ++} ++ + void evtchn_free(struct domain *d, struct evtchn *chn) + { + /* Clear pending event to avoid unexpected behavior on re-bind. */ + evtchn_port_clear_pending(d, chn); + ++ if ( consumer_is_xen(chn) ) ++ write_atomic(&d->xen_evtchns, d->xen_evtchns - 1); ++ write_atomic(&d->active_evtchns, d->active_evtchns - 1); ++ + /* Reset binding to vcpu0 when the channel is freed. */ + chn->state = ECS_FREE; + chn->notify_vcpu_id = 0; +@@ -258,6 +275,7 @@ static long evtchn_alloc_unbound(evtchn_ + alloc->port = port; + + out: ++ check_free_port(d, port); + spin_unlock(&d->event_lock); + rcu_unlock_domain(d); + +@@ -351,6 +369,7 @@ static long evtchn_bind_interdomain(evtc + bind->local_port = lport; + + out: ++ check_free_port(ld, lport); + spin_unlock(&ld->event_lock); + if ( ld != rd ) + spin_unlock(&rd->event_lock); +@@ -488,7 +507,7 @@ static long evtchn_bind_pirq(evtchn_bind + struct domain *d = current->domain; + struct vcpu *v = d->vcpu[0]; + struct pirq *info; +- int port, pirq = bind->pirq; ++ int port = 0, pirq = bind->pirq; + long rc; + + if ( (pirq < 0) || (pirq >= d->nr_pirqs) ) +@@ -536,6 +555,7 @@ static long evtchn_bind_pirq(evtchn_bind + arch_evtchn_bind_pirq(d, pirq); + + out: ++ check_free_port(d, port); + spin_unlock(&d->event_lock); + + return rc; +@@ -1011,10 +1031,10 @@ int evtchn_unmask(unsigned int port) + return 0; + } + +- + int evtchn_reset(struct domain *d) + { + unsigned int i; ++ int rc = 0; + + if ( d != current->domain && !d->controller_pause_count ) + return -EINVAL; +@@ -1024,7 +1044,9 @@ int evtchn_reset(struct domain *d) + + spin_lock(&d->event_lock); + +- if ( d->evtchn_fifo ) ++ if ( d->active_evtchns > d->xen_evtchns ) ++ rc = -EAGAIN; ++ else if ( d->evtchn_fifo ) + { + /* Switching back to 2-level ABI. */ + evtchn_fifo_destroy(d); +@@ -1033,7 +1055,7 @@ int evtchn_reset(struct domain *d) + + spin_unlock(&d->event_lock); + +- return 0; ++ return rc; + } + + static long evtchn_set_priority(const struct evtchn_set_priority *set_priority) +@@ -1219,10 +1241,9 @@ int alloc_unbound_xen_event_channel( + + spin_lock(&ld->event_lock); + +- rc = get_free_port(ld); ++ port = rc = get_free_port(ld); + if ( rc < 0 ) + goto out; +- port = rc; + chn = evtchn_from_port(ld, port); + + rc = xsm_evtchn_unbound(XSM_TARGET, ld, chn, remote_domid); +@@ -1238,7 +1259,10 @@ int alloc_unbound_xen_event_channel( + + spin_unlock(&chn->lock); + ++ write_atomic(&ld->xen_evtchns, ld->xen_evtchns + 1); ++ + out: ++ check_free_port(ld, port); + spin_unlock(&ld->event_lock); + + return rc < 0 ? rc : port; +@@ -1314,6 +1338,7 @@ int evtchn_init(struct domain *d, unsign + return -EINVAL; + } + evtchn_from_port(d, 0)->state = ECS_RESERVED; ++ write_atomic(&d->active_evtchns, 0); + + #if MAX_VIRT_CPUS > BITS_PER_LONG + d->poll_mask = xzalloc_array(unsigned long, BITS_TO_LONGS(d->max_vcpus)); +@@ -1340,6 +1365,8 @@ void evtchn_destroy(struct domain *d) + for ( i = 0; port_is_valid(d, i); i++ ) + evtchn_close(d, i, 0); + ++ ASSERT(!d->active_evtchns); ++ + clear_global_virq_handlers(d); + + evtchn_fifo_destroy(d); +--- a/xen/include/xen/sched.h ++++ b/xen/include/xen/sched.h +@@ -361,6 +361,16 @@ struct domain + struct evtchn **evtchn_group[NR_EVTCHN_GROUPS]; /* all other buckets */ + unsigned int max_evtchn_port; /* max permitted port number */ + unsigned int valid_evtchns; /* number of allocated event channels */ ++ /* ++ * Number of in-use event channels. Writers should use write_atomic(). ++ * Readers need to use read_atomic() only when not holding event_lock. ++ */ ++ unsigned int active_evtchns; ++ /* ++ * Number of event channels used internally by Xen (not subject to ++ * EVTCHNOP_reset). Read/write access like for active_evtchns. ++ */ ++ unsigned int xen_evtchns; + spinlock_t event_lock; + const struct evtchn_port_ops *evtchn_port_ops; + struct evtchn_fifo_domain *evtchn_fifo; diff --git a/system/xen/xsa/xsa343-2.patch b/system/xen/xsa/xsa343-2.patch new file mode 100644 index 0000000000000..b8eb4998f11dd --- /dev/null +++ b/system/xen/xsa/xsa343-2.patch @@ -0,0 +1,295 @@ +From: Jan Beulich <jbeulich@suse.com> +Subject: evtchn: convert per-channel lock to be IRQ-safe + +... in order for send_guest_{global,vcpu}_virq() to be able to make use +of it. + +This is part of XSA-343. + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Julien Grall <jgrall@amazon.com> +--- +v6: New. +--- +TBD: This is the "dumb" conversion variant. In a couple of cases the + slightly simpler spin_{,un}lock_irq() could apparently be used. + +--- a/xen/common/event_channel.c ++++ b/xen/common/event_channel.c +@@ -248,6 +248,7 @@ static long evtchn_alloc_unbound(evtchn_ + int port; + domid_t dom = alloc->dom; + long rc; ++ unsigned long flags; + + d = rcu_lock_domain_by_any_id(dom); + if ( d == NULL ) +@@ -263,14 +264,14 @@ static long evtchn_alloc_unbound(evtchn_ + if ( rc ) + goto out; + +- spin_lock(&chn->lock); ++ spin_lock_irqsave(&chn->lock, flags); + + chn->state = ECS_UNBOUND; + if ( (chn->u.unbound.remote_domid = alloc->remote_dom) == DOMID_SELF ) + chn->u.unbound.remote_domid = current->domain->domain_id; + evtchn_port_init(d, chn); + +- spin_unlock(&chn->lock); ++ spin_unlock_irqrestore(&chn->lock, flags); + + alloc->port = port; + +@@ -283,26 +284,32 @@ static long evtchn_alloc_unbound(evtchn_ + } + + +-static void double_evtchn_lock(struct evtchn *lchn, struct evtchn *rchn) ++static unsigned long double_evtchn_lock(struct evtchn *lchn, ++ struct evtchn *rchn) + { +- if ( lchn < rchn ) ++ unsigned long flags; ++ ++ if ( lchn <= rchn ) + { +- spin_lock(&lchn->lock); +- spin_lock(&rchn->lock); ++ spin_lock_irqsave(&lchn->lock, flags); ++ if ( lchn != rchn ) ++ spin_lock(&rchn->lock); + } + else + { +- if ( lchn != rchn ) +- spin_lock(&rchn->lock); ++ spin_lock_irqsave(&rchn->lock, flags); + spin_lock(&lchn->lock); + } ++ ++ return flags; + } + +-static void double_evtchn_unlock(struct evtchn *lchn, struct evtchn *rchn) ++static void double_evtchn_unlock(struct evtchn *lchn, struct evtchn *rchn, ++ unsigned long flags) + { +- spin_unlock(&lchn->lock); + if ( lchn != rchn ) +- spin_unlock(&rchn->lock); ++ spin_unlock(&lchn->lock); ++ spin_unlock_irqrestore(&rchn->lock, flags); + } + + static long evtchn_bind_interdomain(evtchn_bind_interdomain_t *bind) +@@ -312,6 +319,7 @@ static long evtchn_bind_interdomain(evtc + int lport, rport = bind->remote_port; + domid_t rdom = bind->remote_dom; + long rc; ++ unsigned long flags; + + if ( rdom == DOMID_SELF ) + rdom = current->domain->domain_id; +@@ -347,7 +355,7 @@ static long evtchn_bind_interdomain(evtc + if ( rc ) + goto out; + +- double_evtchn_lock(lchn, rchn); ++ flags = double_evtchn_lock(lchn, rchn); + + lchn->u.interdomain.remote_dom = rd; + lchn->u.interdomain.remote_port = rport; +@@ -364,7 +372,7 @@ static long evtchn_bind_interdomain(evtc + */ + evtchn_port_set_pending(ld, lchn->notify_vcpu_id, lchn); + +- double_evtchn_unlock(lchn, rchn); ++ double_evtchn_unlock(lchn, rchn, flags); + + bind->local_port = lport; + +@@ -387,6 +395,7 @@ int evtchn_bind_virq(evtchn_bind_virq_t + struct domain *d = current->domain; + int virq = bind->virq, vcpu = bind->vcpu; + int rc = 0; ++ unsigned long flags; + + if ( (virq < 0) || (virq >= ARRAY_SIZE(v->virq_to_evtchn)) ) + return -EINVAL; +@@ -424,14 +433,14 @@ int evtchn_bind_virq(evtchn_bind_virq_t + + chn = evtchn_from_port(d, port); + +- spin_lock(&chn->lock); ++ spin_lock_irqsave(&chn->lock, flags); + + chn->state = ECS_VIRQ; + chn->notify_vcpu_id = vcpu; + chn->u.virq = virq; + evtchn_port_init(d, chn); + +- spin_unlock(&chn->lock); ++ spin_unlock_irqrestore(&chn->lock, flags); + + v->virq_to_evtchn[virq] = bind->port = port; + +@@ -448,6 +457,7 @@ static long evtchn_bind_ipi(evtchn_bind_ + struct domain *d = current->domain; + int port, vcpu = bind->vcpu; + long rc = 0; ++ unsigned long flags; + + if ( domain_vcpu(d, vcpu) == NULL ) + return -ENOENT; +@@ -459,13 +469,13 @@ static long evtchn_bind_ipi(evtchn_bind_ + + chn = evtchn_from_port(d, port); + +- spin_lock(&chn->lock); ++ spin_lock_irqsave(&chn->lock, flags); + + chn->state = ECS_IPI; + chn->notify_vcpu_id = vcpu; + evtchn_port_init(d, chn); + +- spin_unlock(&chn->lock); ++ spin_unlock_irqrestore(&chn->lock, flags); + + bind->port = port; + +@@ -509,6 +519,7 @@ static long evtchn_bind_pirq(evtchn_bind + struct pirq *info; + int port = 0, pirq = bind->pirq; + long rc; ++ unsigned long flags; + + if ( (pirq < 0) || (pirq >= d->nr_pirqs) ) + return -EINVAL; +@@ -541,14 +552,14 @@ static long evtchn_bind_pirq(evtchn_bind + goto out; + } + +- spin_lock(&chn->lock); ++ spin_lock_irqsave(&chn->lock, flags); + + chn->state = ECS_PIRQ; + chn->u.pirq.irq = pirq; + link_pirq_port(port, chn, v); + evtchn_port_init(d, chn); + +- spin_unlock(&chn->lock); ++ spin_unlock_irqrestore(&chn->lock, flags); + + bind->port = port; + +@@ -569,6 +580,7 @@ int evtchn_close(struct domain *d1, int + struct evtchn *chn1, *chn2; + int port2; + long rc = 0; ++ unsigned long flags; + + again: + spin_lock(&d1->event_lock); +@@ -668,14 +680,14 @@ int evtchn_close(struct domain *d1, int + BUG_ON(chn2->state != ECS_INTERDOMAIN); + BUG_ON(chn2->u.interdomain.remote_dom != d1); + +- double_evtchn_lock(chn1, chn2); ++ flags = double_evtchn_lock(chn1, chn2); + + evtchn_free(d1, chn1); + + chn2->state = ECS_UNBOUND; + chn2->u.unbound.remote_domid = d1->domain_id; + +- double_evtchn_unlock(chn1, chn2); ++ double_evtchn_unlock(chn1, chn2, flags); + + goto out; + +@@ -683,9 +695,9 @@ int evtchn_close(struct domain *d1, int + BUG(); + } + +- spin_lock(&chn1->lock); ++ spin_lock_irqsave(&chn1->lock, flags); + evtchn_free(d1, chn1); +- spin_unlock(&chn1->lock); ++ spin_unlock_irqrestore(&chn1->lock, flags); + + out: + if ( d2 != NULL ) +@@ -705,13 +717,14 @@ int evtchn_send(struct domain *ld, unsig + struct evtchn *lchn, *rchn; + struct domain *rd; + int rport, ret = 0; ++ unsigned long flags; + + if ( !port_is_valid(ld, lport) ) + return -EINVAL; + + lchn = evtchn_from_port(ld, lport); + +- spin_lock(&lchn->lock); ++ spin_lock_irqsave(&lchn->lock, flags); + + /* Guest cannot send via a Xen-attached event channel. */ + if ( unlikely(consumer_is_xen(lchn)) ) +@@ -746,7 +759,7 @@ int evtchn_send(struct domain *ld, unsig + } + + out: +- spin_unlock(&lchn->lock); ++ spin_unlock_irqrestore(&lchn->lock, flags); + + return ret; + } +@@ -1238,6 +1251,7 @@ int alloc_unbound_xen_event_channel( + { + struct evtchn *chn; + int port, rc; ++ unsigned long flags; + + spin_lock(&ld->event_lock); + +@@ -1250,14 +1264,14 @@ int alloc_unbound_xen_event_channel( + if ( rc ) + goto out; + +- spin_lock(&chn->lock); ++ spin_lock_irqsave(&chn->lock, flags); + + chn->state = ECS_UNBOUND; + chn->xen_consumer = get_xen_consumer(notification_fn); + chn->notify_vcpu_id = lvcpu; + chn->u.unbound.remote_domid = remote_domid; + +- spin_unlock(&chn->lock); ++ spin_unlock_irqrestore(&chn->lock, flags); + + write_atomic(&ld->xen_evtchns, ld->xen_evtchns + 1); + +@@ -1280,11 +1294,12 @@ void notify_via_xen_event_channel(struct + { + struct evtchn *lchn, *rchn; + struct domain *rd; ++ unsigned long flags; + + ASSERT(port_is_valid(ld, lport)); + lchn = evtchn_from_port(ld, lport); + +- spin_lock(&lchn->lock); ++ spin_lock_irqsave(&lchn->lock, flags); + + if ( likely(lchn->state == ECS_INTERDOMAIN) ) + { +@@ -1294,7 +1309,7 @@ void notify_via_xen_event_channel(struct + evtchn_port_set_pending(rd, rchn->notify_vcpu_id, rchn); + } + +- spin_unlock(&lchn->lock); ++ spin_unlock_irqrestore(&lchn->lock, flags); + } + + void evtchn_check_pollers(struct domain *d, unsigned int port) diff --git a/system/xen/xsa/xsa343-3.patch b/system/xen/xsa/xsa343-3.patch new file mode 100644 index 0000000000000..e513e308eb3ee --- /dev/null +++ b/system/xen/xsa/xsa343-3.patch @@ -0,0 +1,392 @@ +From: Jan Beulich <jbeulich@suse.com> +Subject: evtchn: address races with evtchn_reset() + +Neither d->evtchn_port_ops nor max_evtchns(d) may be used in an entirely +lock-less manner, as both may change by a racing evtchn_reset(). In the +common case, at least one of the domain's event lock or the per-channel +lock needs to be held. In the specific case of the inter-domain sending +by evtchn_send() and notify_via_xen_event_channel() holding the other +side's per-channel lock is sufficient, as the channel can't change state +without both per-channel locks held. Without such a channel changing +state, evtchn_reset() can't complete successfully. + +Lock-free accesses continue to be permitted for the shim (calling some +otherwise internal event channel functions), as this happens while the +domain is in effectively single-threaded mode. Special care also needs +taking for the shim's marking of in-use ports as ECS_RESERVED (allowing +use of such ports in the shim case is okay because switching into and +hence also out of FIFO mode is impossihble there). + +As a side effect, certain operations on Xen bound event channels which +were mistakenly permitted so far (e.g. unmask or poll) will be refused +now. + +This is part of XSA-343. + +Reported-by: Julien Grall <jgrall@amazon.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Julien Grall <jgrall@amazon.com> +--- +v9: Add arch_evtchn_is_special() to fix PV shim. +v8: Add BUILD_BUG_ON() in evtchn_usable(). +v7: Add locking related comment ahead of struct evtchn_port_ops. +v6: New. +--- +TBD: I've been considering to move some of the wrappers from xen/event.h + into event_channel.c (or even drop them altogether), when they + require external locking (e.g. evtchn_port_init() or + evtchn_port_set_priority()). Does anyone have a strong opinion + either way? + +--- a/xen/arch/x86/irq.c ++++ b/xen/arch/x86/irq.c +@@ -2488,14 +2488,24 @@ static void dump_irqs(unsigned char key) + + for ( i = 0; i < action->nr_guests; ) + { ++ struct evtchn *evtchn; ++ unsigned int pending = 2, masked = 2; ++ + d = action->guest[i++]; + pirq = domain_irq_to_pirq(d, irq); + info = pirq_info(d, pirq); ++ evtchn = evtchn_from_port(d, info->evtchn); ++ local_irq_disable(); ++ if ( spin_trylock(&evtchn->lock) ) ++ { ++ pending = evtchn_is_pending(d, evtchn); ++ masked = evtchn_is_masked(d, evtchn); ++ spin_unlock(&evtchn->lock); ++ } ++ local_irq_enable(); + printk("d%d:%3d(%c%c%c)%c", +- d->domain_id, pirq, +- evtchn_port_is_pending(d, info->evtchn) ? 'P' : '-', +- evtchn_port_is_masked(d, info->evtchn) ? 'M' : '-', +- info->masked ? 'M' : '-', ++ d->domain_id, pirq, "-P?"[pending], ++ "-M?"[masked], info->masked ? 'M' : '-', + i < action->nr_guests ? ',' : '\n'); + } + } +--- a/xen/arch/x86/pv/shim.c ++++ b/xen/arch/x86/pv/shim.c +@@ -660,8 +660,11 @@ void pv_shim_inject_evtchn(unsigned int + if ( port_is_valid(guest, port) ) + { + struct evtchn *chn = evtchn_from_port(guest, port); ++ unsigned long flags; + ++ spin_lock_irqsave(&chn->lock, flags); + evtchn_port_set_pending(guest, chn->notify_vcpu_id, chn); ++ spin_unlock_irqrestore(&chn->lock, flags); + } + } + +--- a/xen/common/event_2l.c ++++ b/xen/common/event_2l.c +@@ -63,8 +63,10 @@ static void evtchn_2l_unmask(struct doma + } + } + +-static bool evtchn_2l_is_pending(const struct domain *d, evtchn_port_t port) ++static bool evtchn_2l_is_pending(const struct domain *d, ++ const struct evtchn *evtchn) + { ++ evtchn_port_t port = evtchn->port; + unsigned int max_ports = BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d); + + ASSERT(port < max_ports); +@@ -72,8 +74,10 @@ static bool evtchn_2l_is_pending(const s + guest_test_bit(d, port, &shared_info(d, evtchn_pending))); + } + +-static bool evtchn_2l_is_masked(const struct domain *d, evtchn_port_t port) ++static bool evtchn_2l_is_masked(const struct domain *d, ++ const struct evtchn *evtchn) + { ++ evtchn_port_t port = evtchn->port; + unsigned int max_ports = BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d); + + ASSERT(port < max_ports); +--- a/xen/common/event_channel.c ++++ b/xen/common/event_channel.c +@@ -156,8 +156,9 @@ int evtchn_allocate_port(struct domain * + + if ( port_is_valid(d, port) ) + { +- if ( evtchn_from_port(d, port)->state != ECS_FREE || +- evtchn_port_is_busy(d, port) ) ++ const struct evtchn *chn = evtchn_from_port(d, port); ++ ++ if ( chn->state != ECS_FREE || evtchn_is_busy(d, chn) ) + return -EBUSY; + } + else +@@ -774,6 +775,7 @@ void send_guest_vcpu_virq(struct vcpu *v + unsigned long flags; + int port; + struct domain *d; ++ struct evtchn *chn; + + ASSERT(!virq_is_global(virq)); + +@@ -784,7 +786,10 @@ void send_guest_vcpu_virq(struct vcpu *v + goto out; + + d = v->domain; +- evtchn_port_set_pending(d, v->vcpu_id, evtchn_from_port(d, port)); ++ chn = evtchn_from_port(d, port); ++ spin_lock(&chn->lock); ++ evtchn_port_set_pending(d, v->vcpu_id, chn); ++ spin_unlock(&chn->lock); + + out: + spin_unlock_irqrestore(&v->virq_lock, flags); +@@ -813,7 +818,9 @@ void send_guest_global_virq(struct domai + goto out; + + chn = evtchn_from_port(d, port); ++ spin_lock(&chn->lock); + evtchn_port_set_pending(d, chn->notify_vcpu_id, chn); ++ spin_unlock(&chn->lock); + + out: + spin_unlock_irqrestore(&v->virq_lock, flags); +@@ -823,6 +830,7 @@ void send_guest_pirq(struct domain *d, c + { + int port; + struct evtchn *chn; ++ unsigned long flags; + + /* + * PV guests: It should not be possible to race with __evtchn_close(). The +@@ -837,7 +845,9 @@ void send_guest_pirq(struct domain *d, c + } + + chn = evtchn_from_port(d, port); ++ spin_lock_irqsave(&chn->lock, flags); + evtchn_port_set_pending(d, chn->notify_vcpu_id, chn); ++ spin_unlock_irqrestore(&chn->lock, flags); + } + + static struct domain *global_virq_handlers[NR_VIRQS] __read_mostly; +@@ -1034,12 +1044,15 @@ int evtchn_unmask(unsigned int port) + { + struct domain *d = current->domain; + struct evtchn *evtchn; ++ unsigned long flags; + + if ( unlikely(!port_is_valid(d, port)) ) + return -EINVAL; + + evtchn = evtchn_from_port(d, port); ++ spin_lock_irqsave(&evtchn->lock, flags); + evtchn_port_unmask(d, evtchn); ++ spin_unlock_irqrestore(&evtchn->lock, flags); + + return 0; + } +@@ -1449,8 +1462,8 @@ static void domain_dump_evtchn_info(stru + + printk(" %4u [%d/%d/", + port, +- evtchn_port_is_pending(d, port), +- evtchn_port_is_masked(d, port)); ++ evtchn_is_pending(d, chn), ++ evtchn_is_masked(d, chn)); + evtchn_port_print_state(d, chn); + printk("]: s=%d n=%d x=%d", + chn->state, chn->notify_vcpu_id, chn->xen_consumer); +--- a/xen/common/event_fifo.c ++++ b/xen/common/event_fifo.c +@@ -296,23 +296,26 @@ static void evtchn_fifo_unmask(struct do + evtchn_fifo_set_pending(v, evtchn); + } + +-static bool evtchn_fifo_is_pending(const struct domain *d, evtchn_port_t port) ++static bool evtchn_fifo_is_pending(const struct domain *d, ++ const struct evtchn *evtchn) + { +- const event_word_t *word = evtchn_fifo_word_from_port(d, port); ++ const event_word_t *word = evtchn_fifo_word_from_port(d, evtchn->port); + + return word && guest_test_bit(d, EVTCHN_FIFO_PENDING, word); + } + +-static bool_t evtchn_fifo_is_masked(const struct domain *d, evtchn_port_t port) ++static bool_t evtchn_fifo_is_masked(const struct domain *d, ++ const struct evtchn *evtchn) + { +- const event_word_t *word = evtchn_fifo_word_from_port(d, port); ++ const event_word_t *word = evtchn_fifo_word_from_port(d, evtchn->port); + + return !word || guest_test_bit(d, EVTCHN_FIFO_MASKED, word); + } + +-static bool_t evtchn_fifo_is_busy(const struct domain *d, evtchn_port_t port) ++static bool_t evtchn_fifo_is_busy(const struct domain *d, ++ const struct evtchn *evtchn) + { +- const event_word_t *word = evtchn_fifo_word_from_port(d, port); ++ const event_word_t *word = evtchn_fifo_word_from_port(d, evtchn->port); + + return word && guest_test_bit(d, EVTCHN_FIFO_LINKED, word); + } +--- a/xen/include/asm-x86/event.h ++++ b/xen/include/asm-x86/event.h +@@ -47,4 +47,10 @@ static inline bool arch_virq_is_global(u + return true; + } + ++#ifdef CONFIG_PV_SHIM ++# include <asm/pv/shim.h> ++# define arch_evtchn_is_special(chn) \ ++ (pv_shim && (chn)->port && (chn)->state == ECS_RESERVED) ++#endif ++ + #endif +--- a/xen/include/xen/event.h ++++ b/xen/include/xen/event.h +@@ -133,6 +133,24 @@ static inline struct evtchn *evtchn_from + return bucket_from_port(d, p) + (p % EVTCHNS_PER_BUCKET); + } + ++/* ++ * "usable" as in "by a guest", i.e. Xen consumed channels are assumed to be ++ * taken care of separately where used for Xen's internal purposes. ++ */ ++static bool evtchn_usable(const struct evtchn *evtchn) ++{ ++ if ( evtchn->xen_consumer ) ++ return false; ++ ++#ifdef arch_evtchn_is_special ++ if ( arch_evtchn_is_special(evtchn) ) ++ return true; ++#endif ++ ++ BUILD_BUG_ON(ECS_FREE > ECS_RESERVED); ++ return evtchn->state > ECS_RESERVED; ++} ++ + /* Wait on a Xen-attached event channel. */ + #define wait_on_xen_event_channel(port, condition) \ + do { \ +@@ -165,19 +183,24 @@ int evtchn_reset(struct domain *d); + + /* + * Low-level event channel port ops. ++ * ++ * All hooks have to be called with a lock held which prevents the channel ++ * from changing state. This may be the domain event lock, the per-channel ++ * lock, or in the case of sending interdomain events also the other side's ++ * per-channel lock. Exceptions apply in certain cases for the PV shim. + */ + struct evtchn_port_ops { + void (*init)(struct domain *d, struct evtchn *evtchn); + void (*set_pending)(struct vcpu *v, struct evtchn *evtchn); + void (*clear_pending)(struct domain *d, struct evtchn *evtchn); + void (*unmask)(struct domain *d, struct evtchn *evtchn); +- bool (*is_pending)(const struct domain *d, evtchn_port_t port); +- bool (*is_masked)(const struct domain *d, evtchn_port_t port); ++ bool (*is_pending)(const struct domain *d, const struct evtchn *evtchn); ++ bool (*is_masked)(const struct domain *d, const struct evtchn *evtchn); + /* + * Is the port unavailable because it's still being cleaned up + * after being closed? + */ +- bool (*is_busy)(const struct domain *d, evtchn_port_t port); ++ bool (*is_busy)(const struct domain *d, const struct evtchn *evtchn); + int (*set_priority)(struct domain *d, struct evtchn *evtchn, + unsigned int priority); + void (*print_state)(struct domain *d, const struct evtchn *evtchn); +@@ -193,38 +216,67 @@ static inline void evtchn_port_set_pendi + unsigned int vcpu_id, + struct evtchn *evtchn) + { +- d->evtchn_port_ops->set_pending(d->vcpu[vcpu_id], evtchn); ++ if ( evtchn_usable(evtchn) ) ++ d->evtchn_port_ops->set_pending(d->vcpu[vcpu_id], evtchn); + } + + static inline void evtchn_port_clear_pending(struct domain *d, + struct evtchn *evtchn) + { +- d->evtchn_port_ops->clear_pending(d, evtchn); ++ if ( evtchn_usable(evtchn) ) ++ d->evtchn_port_ops->clear_pending(d, evtchn); + } + + static inline void evtchn_port_unmask(struct domain *d, + struct evtchn *evtchn) + { +- d->evtchn_port_ops->unmask(d, evtchn); ++ if ( evtchn_usable(evtchn) ) ++ d->evtchn_port_ops->unmask(d, evtchn); + } + +-static inline bool evtchn_port_is_pending(const struct domain *d, +- evtchn_port_t port) ++static inline bool evtchn_is_pending(const struct domain *d, ++ const struct evtchn *evtchn) + { +- return d->evtchn_port_ops->is_pending(d, port); ++ return evtchn_usable(evtchn) && d->evtchn_port_ops->is_pending(d, evtchn); + } + +-static inline bool evtchn_port_is_masked(const struct domain *d, +- evtchn_port_t port) ++static inline bool evtchn_port_is_pending(struct domain *d, evtchn_port_t port) + { +- return d->evtchn_port_ops->is_masked(d, port); ++ struct evtchn *evtchn = evtchn_from_port(d, port); ++ bool rc; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&evtchn->lock, flags); ++ rc = evtchn_is_pending(d, evtchn); ++ spin_unlock_irqrestore(&evtchn->lock, flags); ++ ++ return rc; ++} ++ ++static inline bool evtchn_is_masked(const struct domain *d, ++ const struct evtchn *evtchn) ++{ ++ return !evtchn_usable(evtchn) || d->evtchn_port_ops->is_masked(d, evtchn); ++} ++ ++static inline bool evtchn_port_is_masked(struct domain *d, evtchn_port_t port) ++{ ++ struct evtchn *evtchn = evtchn_from_port(d, port); ++ bool rc; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&evtchn->lock, flags); ++ rc = evtchn_is_masked(d, evtchn); ++ spin_unlock_irqrestore(&evtchn->lock, flags); ++ ++ return rc; + } + +-static inline bool evtchn_port_is_busy(const struct domain *d, +- evtchn_port_t port) ++static inline bool evtchn_is_busy(const struct domain *d, ++ const struct evtchn *evtchn) + { + return d->evtchn_port_ops->is_busy && +- d->evtchn_port_ops->is_busy(d, port); ++ d->evtchn_port_ops->is_busy(d, evtchn); + } + + static inline int evtchn_port_set_priority(struct domain *d, +@@ -233,6 +285,8 @@ static inline int evtchn_port_set_priori + { + if ( !d->evtchn_port_ops->set_priority ) + return -ENOSYS; ++ if ( !evtchn_usable(evtchn) ) ++ return -EACCES; + return d->evtchn_port_ops->set_priority(d, evtchn, priority); + } + diff --git a/system/xen/xsa/xsa344-4.13-1.patch b/system/xen/xsa/xsa344-4.13-1.patch new file mode 100644 index 0000000000000..d8e9b3f43ffaa --- /dev/null +++ b/system/xen/xsa/xsa344-4.13-1.patch @@ -0,0 +1,130 @@ +From: Jan Beulich <jbeulich@suse.com> +Subject: evtchn: arrange for preemption in evtchn_destroy() + +Especially closing of fully established interdomain channels can take +quite some time, due to the locking involved. Therefore we shouldn't +assume we can clean up still active ports all in one go. Besides adding +the necessary preemption check, also avoid pointlessly starting from +(or now really ending at) 0; 1 is the lowest numbered port which may +need closing. + +Since we're now reducing ->valid_evtchns, free_xen_event_channel(), +and (at least to be on the safe side) notify_via_xen_event_channel() +need to cope with attempts to close / unbind from / send through already +closed (and no longer valid, as per port_is_valid()) ports. + +This is part of XSA-344. + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Julien Grall <jgrall@amazon.com> +Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> + +--- a/xen/common/domain.c ++++ b/xen/common/domain.c +@@ -770,12 +770,14 @@ int domain_kill(struct domain *d) + return domain_kill(d); + d->is_dying = DOMDYING_dying; + argo_destroy(d); +- evtchn_destroy(d); + gnttab_release_mappings(d); + vnuma_destroy(d->vnuma); + domain_set_outstanding_pages(d, 0); + /* fallthrough */ + case DOMDYING_dying: ++ rc = evtchn_destroy(d); ++ if ( rc ) ++ break; + rc = domain_relinquish_resources(d); + if ( rc != 0 ) + break; +--- a/xen/common/event_channel.c ++++ b/xen/common/event_channel.c +@@ -1297,7 +1297,16 @@ int alloc_unbound_xen_event_channel( + + void free_xen_event_channel(struct domain *d, int port) + { +- BUG_ON(!port_is_valid(d, port)); ++ if ( !port_is_valid(d, port) ) ++ { ++ /* ++ * Make sure ->is_dying is read /after/ ->valid_evtchns, pairing ++ * with the spin_barrier() and BUG_ON() in evtchn_destroy(). ++ */ ++ smp_rmb(); ++ BUG_ON(!d->is_dying); ++ return; ++ } + + evtchn_close(d, port, 0); + } +@@ -1309,7 +1318,17 @@ void notify_via_xen_event_channel(struct + struct domain *rd; + unsigned long flags; + +- ASSERT(port_is_valid(ld, lport)); ++ if ( !port_is_valid(ld, lport) ) ++ { ++ /* ++ * Make sure ->is_dying is read /after/ ->valid_evtchns, pairing ++ * with the spin_barrier() and BUG_ON() in evtchn_destroy(). ++ */ ++ smp_rmb(); ++ ASSERT(ld->is_dying); ++ return; ++ } ++ + lchn = evtchn_from_port(ld, lport); + + spin_lock_irqsave(&lchn->lock, flags); +@@ -1380,8 +1399,7 @@ int evtchn_init(struct domain *d, unsign + return 0; + } + +- +-void evtchn_destroy(struct domain *d) ++int evtchn_destroy(struct domain *d) + { + unsigned int i; + +@@ -1390,14 +1408,29 @@ void evtchn_destroy(struct domain *d) + spin_barrier(&d->event_lock); + + /* Close all existing event channels. */ +- for ( i = 0; port_is_valid(d, i); i++ ) ++ for ( i = d->valid_evtchns; --i; ) ++ { + evtchn_close(d, i, 0); + ++ /* ++ * Avoid preempting when called from domain_create()'s error path, ++ * and don't check too often (choice of frequency is arbitrary). ++ */ ++ if ( i && !(i & 0x3f) && d->is_dying != DOMDYING_dead && ++ hypercall_preempt_check() ) ++ { ++ write_atomic(&d->valid_evtchns, i); ++ return -ERESTART; ++ } ++ } ++ + ASSERT(!d->active_evtchns); + + clear_global_virq_handlers(d); + + evtchn_fifo_destroy(d); ++ ++ return 0; + } + + +--- a/xen/include/xen/sched.h ++++ b/xen/include/xen/sched.h +@@ -136,7 +136,7 @@ struct evtchn + } __attribute__((aligned(64))); + + int evtchn_init(struct domain *d, unsigned int max_port); +-void evtchn_destroy(struct domain *d); /* from domain_kill */ ++int evtchn_destroy(struct domain *d); /* from domain_kill */ + void evtchn_destroy_final(struct domain *d); /* from complete_domain_destroy */ + + struct waitqueue_vcpu; diff --git a/system/xen/xsa/xsa344-4.13-2.patch b/system/xen/xsa/xsa344-4.13-2.patch new file mode 100644 index 0000000000000..3f0339498f078 --- /dev/null +++ b/system/xen/xsa/xsa344-4.13-2.patch @@ -0,0 +1,203 @@ +From: Jan Beulich <jbeulich@suse.com> +Subject: evtchn: arrange for preemption in evtchn_reset() + +Like for evtchn_destroy() looping over all possible event channels to +close them can take a significant amount of time. Unlike done there, we +can't alter domain properties (i.e. d->valid_evtchns) here. Borrow, in a +lightweight form, the paging domctl continuation concept, redirecting +the continuations to different sub-ops. Just like there this is to be +able to allow for predictable overall results of the involved sub-ops: +Racing requests should either complete or be refused. + +Note that a domain can't interfere with an already started (by a remote +domain) reset, due to being paused. It can prevent a remote reset from +happening by leaving a reset unfinished, but that's only going to affect +itself. + +This is part of XSA-344. + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Julien Grall <jgrall@amazon.com> +Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> + +--- a/xen/common/domain.c ++++ b/xen/common/domain.c +@@ -1214,7 +1214,7 @@ void domain_unpause_except_self(struct d + domain_unpause(d); + } + +-int domain_soft_reset(struct domain *d) ++int domain_soft_reset(struct domain *d, bool resuming) + { + struct vcpu *v; + int rc; +@@ -1228,7 +1228,7 @@ int domain_soft_reset(struct domain *d) + } + spin_unlock(&d->shutdown_lock); + +- rc = evtchn_reset(d); ++ rc = evtchn_reset(d, resuming); + if ( rc ) + return rc; + +--- a/xen/common/domctl.c ++++ b/xen/common/domctl.c +@@ -572,12 +572,22 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe + } + + case XEN_DOMCTL_soft_reset: ++ case XEN_DOMCTL_soft_reset_cont: + if ( d == current->domain ) /* no domain_pause() */ + { + ret = -EINVAL; + break; + } +- ret = domain_soft_reset(d); ++ ret = domain_soft_reset(d, op->cmd == XEN_DOMCTL_soft_reset_cont); ++ if ( ret == -ERESTART ) ++ { ++ op->cmd = XEN_DOMCTL_soft_reset_cont; ++ if ( !__copy_field_to_guest(u_domctl, op, cmd) ) ++ ret = hypercall_create_continuation(__HYPERVISOR_domctl, ++ "h", u_domctl); ++ else ++ ret = -EFAULT; ++ } + break; + + case XEN_DOMCTL_destroydomain: +--- a/xen/common/event_channel.c ++++ b/xen/common/event_channel.c +@@ -1057,7 +1057,7 @@ int evtchn_unmask(unsigned int port) + return 0; + } + +-int evtchn_reset(struct domain *d) ++int evtchn_reset(struct domain *d, bool resuming) + { + unsigned int i; + int rc = 0; +@@ -1065,11 +1065,40 @@ int evtchn_reset(struct domain *d) + if ( d != current->domain && !d->controller_pause_count ) + return -EINVAL; + +- for ( i = 0; port_is_valid(d, i); i++ ) ++ spin_lock(&d->event_lock); ++ ++ /* ++ * If we are resuming, then start where we stopped. Otherwise, check ++ * that a reset operation is not already in progress, and if none is, ++ * record that this is now the case. ++ */ ++ i = resuming ? d->next_evtchn : !d->next_evtchn; ++ if ( i > d->next_evtchn ) ++ d->next_evtchn = i; ++ ++ spin_unlock(&d->event_lock); ++ ++ if ( !i ) ++ return -EBUSY; ++ ++ for ( ; port_is_valid(d, i); i++ ) ++ { + evtchn_close(d, i, 1); + ++ /* NB: Choice of frequency is arbitrary. */ ++ if ( !(i & 0x3f) && hypercall_preempt_check() ) ++ { ++ spin_lock(&d->event_lock); ++ d->next_evtchn = i; ++ spin_unlock(&d->event_lock); ++ return -ERESTART; ++ } ++ } ++ + spin_lock(&d->event_lock); + ++ d->next_evtchn = 0; ++ + if ( d->active_evtchns > d->xen_evtchns ) + rc = -EAGAIN; + else if ( d->evtchn_fifo ) +@@ -1204,7 +1233,8 @@ long do_event_channel_op(int cmd, XEN_GU + break; + } + +- case EVTCHNOP_reset: { ++ case EVTCHNOP_reset: ++ case EVTCHNOP_reset_cont: { + struct evtchn_reset reset; + struct domain *d; + +@@ -1217,9 +1247,13 @@ long do_event_channel_op(int cmd, XEN_GU + + rc = xsm_evtchn_reset(XSM_TARGET, current->domain, d); + if ( !rc ) +- rc = evtchn_reset(d); ++ rc = evtchn_reset(d, cmd == EVTCHNOP_reset_cont); + + rcu_unlock_domain(d); ++ ++ if ( rc == -ERESTART ) ++ rc = hypercall_create_continuation(__HYPERVISOR_event_channel_op, ++ "ih", EVTCHNOP_reset_cont, arg); + break; + } + +--- a/xen/include/public/domctl.h ++++ b/xen/include/public/domctl.h +@@ -1152,7 +1152,10 @@ struct xen_domctl { + #define XEN_DOMCTL_iomem_permission 20 + #define XEN_DOMCTL_ioport_permission 21 + #define XEN_DOMCTL_hypercall_init 22 +-#define XEN_DOMCTL_arch_setup 23 /* Obsolete IA64 only */ ++#ifdef __XEN__ ++/* #define XEN_DOMCTL_arch_setup 23 Obsolete IA64 only */ ++#define XEN_DOMCTL_soft_reset_cont 23 ++#endif + #define XEN_DOMCTL_settimeoffset 24 + #define XEN_DOMCTL_getvcpuaffinity 25 + #define XEN_DOMCTL_real_mode_area 26 /* Obsolete PPC only */ +--- a/xen/include/public/event_channel.h ++++ b/xen/include/public/event_channel.h +@@ -74,6 +74,9 @@ + #define EVTCHNOP_init_control 11 + #define EVTCHNOP_expand_array 12 + #define EVTCHNOP_set_priority 13 ++#ifdef __XEN__ ++#define EVTCHNOP_reset_cont 14 ++#endif + /* ` } */ + + typedef uint32_t evtchn_port_t; +--- a/xen/include/xen/event.h ++++ b/xen/include/xen/event.h +@@ -171,7 +171,7 @@ void evtchn_check_pollers(struct domain + void evtchn_2l_init(struct domain *d); + + /* Close all event channels and reset to 2-level ABI. */ +-int evtchn_reset(struct domain *d); ++int evtchn_reset(struct domain *d, bool resuming); + + /* + * Low-level event channel port ops. +--- a/xen/include/xen/sched.h ++++ b/xen/include/xen/sched.h +@@ -394,6 +394,8 @@ struct domain + * EVTCHNOP_reset). Read/write access like for active_evtchns. + */ + unsigned int xen_evtchns; ++ /* Port to resume from in evtchn_reset(), when in a continuation. */ ++ unsigned int next_evtchn; + spinlock_t event_lock; + const struct evtchn_port_ops *evtchn_port_ops; + struct evtchn_fifo_domain *evtchn_fifo; +@@ -663,7 +665,7 @@ int domain_shutdown(struct domain *d, u8 + void domain_resume(struct domain *d); + void domain_pause_for_debugger(void); + +-int domain_soft_reset(struct domain *d); ++int domain_soft_reset(struct domain *d, bool resuming); + + int vcpu_start_shutdown_deferral(struct vcpu *v); + void vcpu_end_shutdown_deferral(struct vcpu *v); diff --git a/system/xen/xsa/xsa345-0001-x86-mm-Refactor-map_pages_to_xen-to-have-only-a-sing.patch b/system/xen/xsa/xsa345-0001-x86-mm-Refactor-map_pages_to_xen-to-have-only-a-sing.patch new file mode 100644 index 0000000000000..d325385a56ef9 --- /dev/null +++ b/system/xen/xsa/xsa345-0001-x86-mm-Refactor-map_pages_to_xen-to-have-only-a-sing.patch @@ -0,0 +1,94 @@ +From b3e0d4e37b7902533a463812374947d4d6d2e463 Mon Sep 17 00:00:00 2001 +From: Wei Liu <wei.liu2@citrix.com> +Date: Sat, 11 Jan 2020 21:57:41 +0000 +Subject: [PATCH 1/3] x86/mm: Refactor map_pages_to_xen to have only a single + exit path + +We will soon need to perform clean-ups before returning. + +No functional change. + +This is part of XSA-345. + +Reported-by: Hongyan Xia <hongyxia@amazon.com> +Signed-off-by: Wei Liu <wei.liu2@citrix.com> +Signed-off-by: Hongyan Xia <hongyxia@amazon.com> +Signed-off-by: George Dunlap <george.dunlap@citrix.com> +Acked-by: Jan Beulich <jbeulich@suse.com> +--- + xen/arch/x86/mm.c | 17 +++++++++++------ + 1 file changed, 11 insertions(+), 6 deletions(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 30dffb68e8..133a393875 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -5187,6 +5187,7 @@ int map_pages_to_xen( + l2_pgentry_t *pl2e, ol2e; + l1_pgentry_t *pl1e, ol1e; + unsigned int i; ++ int rc = -ENOMEM; + + #define flush_flags(oldf) do { \ + unsigned int o_ = (oldf); \ +@@ -5207,7 +5208,8 @@ int map_pages_to_xen( + l3_pgentry_t ol3e, *pl3e = virt_to_xen_l3e(virt); + + if ( !pl3e ) +- return -ENOMEM; ++ goto out; ++ + ol3e = *pl3e; + + if ( cpu_has_page1gb && +@@ -5295,7 +5297,7 @@ int map_pages_to_xen( + + pl2e = alloc_xen_pagetable(); + if ( pl2e == NULL ) +- return -ENOMEM; ++ goto out; + + for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) + l2e_write(pl2e + i, +@@ -5324,7 +5326,7 @@ int map_pages_to_xen( + + pl2e = virt_to_xen_l2e(virt); + if ( !pl2e ) +- return -ENOMEM; ++ goto out; + + if ( ((((virt >> PAGE_SHIFT) | mfn_x(mfn)) & + ((1u << PAGETABLE_ORDER) - 1)) == 0) && +@@ -5367,7 +5369,7 @@ int map_pages_to_xen( + { + pl1e = virt_to_xen_l1e(virt); + if ( pl1e == NULL ) +- return -ENOMEM; ++ goto out; + } + else if ( l2e_get_flags(*pl2e) & _PAGE_PSE ) + { +@@ -5394,7 +5396,7 @@ int map_pages_to_xen( + + pl1e = alloc_xen_pagetable(); + if ( pl1e == NULL ) +- return -ENOMEM; ++ goto out; + + for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) + l1e_write(&pl1e[i], +@@ -5538,7 +5540,10 @@ int map_pages_to_xen( + + #undef flush_flags + +- return 0; ++ rc = 0; ++ ++ out: ++ return rc; + } + + int populate_pt_range(unsigned long virt, unsigned long nr_mfns) +-- +2.25.1 + diff --git a/system/xen/xsa/xsa345-0002-x86-mm-Refactor-modify_xen_mappings-to-have-one-exit.patch b/system/xen/xsa/xsa345-0002-x86-mm-Refactor-modify_xen_mappings-to-have-one-exit.patch new file mode 100644 index 0000000000000..836bed681aada --- /dev/null +++ b/system/xen/xsa/xsa345-0002-x86-mm-Refactor-modify_xen_mappings-to-have-one-exit.patch @@ -0,0 +1,68 @@ +From 9f6f35b833d295acaaa2d8ff8cf309bf688cfd50 Mon Sep 17 00:00:00 2001 +From: Wei Liu <wei.liu2@citrix.com> +Date: Sat, 11 Jan 2020 21:57:42 +0000 +Subject: [PATCH 2/3] x86/mm: Refactor modify_xen_mappings to have one exit + path + +We will soon need to perform clean-ups before returning. + +No functional change. + +This is part of XSA-345. + +Reported-by: Hongyan Xia <hongyxia@amazon.com> +Signed-off-by: Wei Liu <wei.liu2@citrix.com> +Signed-off-by: Hongyan Xia <hongyxia@amazon.com> +Signed-off-by: George Dunlap <george.dunlap@citrix.com> +Acked-by: Jan Beulich <jbeulich@suse.com> +--- + xen/arch/x86/mm.c | 12 +++++++++--- + 1 file changed, 9 insertions(+), 3 deletions(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 133a393875..af726d3274 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -5570,6 +5570,7 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf) + l1_pgentry_t *pl1e; + unsigned int i; + unsigned long v = s; ++ int rc = -ENOMEM; + + /* Set of valid PTE bits which may be altered. */ + #define FLAGS_MASK (_PAGE_NX|_PAGE_RW|_PAGE_PRESENT) +@@ -5611,7 +5612,8 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf) + /* PAGE1GB: shatter the superpage and fall through. */ + pl2e = alloc_xen_pagetable(); + if ( !pl2e ) +- return -ENOMEM; ++ goto out; ++ + for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) + l2e_write(pl2e + i, + l2e_from_pfn(l3e_get_pfn(*pl3e) + +@@ -5666,7 +5668,8 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf) + /* PSE: shatter the superpage and try again. */ + pl1e = alloc_xen_pagetable(); + if ( !pl1e ) +- return -ENOMEM; ++ goto out; ++ + for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) + l1e_write(&pl1e[i], + l1e_from_pfn(l2e_get_pfn(*pl2e) + i, +@@ -5795,7 +5798,10 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf) + flush_area(NULL, FLUSH_TLB_GLOBAL); + + #undef FLAGS_MASK +- return 0; ++ rc = 0; ++ ++ out: ++ return rc; + } + + #undef flush_area +-- +2.25.1 + diff --git a/system/xen/xsa/xsa345-0003-x86-mm-Prevent-some-races-in-hypervisor-mapping-upda.patch b/system/xen/xsa/xsa345-0003-x86-mm-Prevent-some-races-in-hypervisor-mapping-upda.patch new file mode 100644 index 0000000000000..db407416b9ade --- /dev/null +++ b/system/xen/xsa/xsa345-0003-x86-mm-Prevent-some-races-in-hypervisor-mapping-upda.patch @@ -0,0 +1,249 @@ +From 0ff9a8453dc47cd47eee9659d5916afb5094e871 Mon Sep 17 00:00:00 2001 +From: Hongyan Xia <hongyxia@amazon.com> +Date: Sat, 11 Jan 2020 21:57:43 +0000 +Subject: [PATCH 3/3] x86/mm: Prevent some races in hypervisor mapping updates + +map_pages_to_xen will attempt to coalesce mappings into 2MiB and 1GiB +superpages if possible, to maximize TLB efficiency. This means both +replacing superpage entries with smaller entries, and replacing +smaller entries with superpages. + +Unfortunately, while some potential races are handled correctly, +others are not. These include: + +1. When one processor modifies a sub-superpage mapping while another +processor replaces the entire range with a superpage. + +Take the following example: + +Suppose L3[N] points to L2. And suppose we have two processors, A and +B. + +* A walks the pagetables, get a pointer to L2. +* B replaces L3[N] with a 1GiB mapping. +* B Frees L2 +* A writes L2[M] # + +This is race exacerbated by the fact that virt_to_xen_l[21]e doesn't +handle higher-level superpages properly: If you call virt_xen_to_l2e +on a virtual address within an L3 superpage, you'll either hit a BUG() +(most likely), or get a pointer into the middle of a data page; same +with virt_xen_to_l1 on a virtual address within either an L3 or L2 +superpage. + +So take the following example: + +* A reads pl3e and discovers it to point to an L2. +* B replaces L3[N] with a 1GiB mapping +* A calls virt_to_xen_l2e() and hits the BUG_ON() # + +2. When two processors simultaneously try to replace a sub-superpage +mapping with a superpage mapping. + +Take the following example: + +Suppose L3[N] points to L2. And suppose we have two processors, A and B, +both trying to replace L3[N] with a superpage. + +* A walks the pagetables, get a pointer to pl3e, and takes a copy ol3e pointing to L2. +* B walks the pagetables, gets a pointre to pl3e, and takes a copy ol3e pointing to L2. +* A writes the new value into L3[N] +* B writes the new value into L3[N] +* A recursively frees all the L1's under L2, then frees L2 +* B recursively double-frees all the L1's under L2, then double-frees L2 # + +Fix this by grabbing a lock for the entirety of the mapping update +operation. + +Rather than grabbing map_pgdir_lock for the entire operation, however, +repurpose the PGT_locked bit from L3's page->type_info as a lock. +This means that rather than locking the entire address space, we +"only" lock a single 512GiB chunk of hypervisor address space at a +time. + +There was a proposal for a lock-and-reverify approach, where we walk +the pagetables to the point where we decide what to do; then grab the +map_pgdir_lock, re-verify the information we collected without the +lock, and finally make the change (starting over again if anything had +changed). Without being able to guarantee that the L2 table wasn't +freed, however, that means every read would need to be considered +potentially unsafe. Thinking carefully about that is probably +something that wants to be done on public, not under time pressure. + +This is part of XSA-345. + +Reported-by: Hongyan Xia <hongyxia@amazon.com> +Signed-off-by: Hongyan Xia <hongyxia@amazon.com> +Signed-off-by: George Dunlap <george.dunlap@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +--- + xen/arch/x86/mm.c | 92 +++++++++++++++++++++++++++++++++++++++++++++-- + 1 file changed, 89 insertions(+), 3 deletions(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index af726d3274..d6a0761f43 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -2167,6 +2167,50 @@ void page_unlock(struct page_info *page) + current_locked_page_set(NULL); + } + ++/* ++ * L3 table locks: ++ * ++ * Used for serialization in map_pages_to_xen() and modify_xen_mappings(). ++ * ++ * For Xen PT pages, the page->u.inuse.type_info is unused and it is safe to ++ * reuse the PGT_locked flag. This lock is taken only when we move down to L3 ++ * tables and below, since L4 (and above, for 5-level paging) is still globally ++ * protected by map_pgdir_lock. ++ * ++ * PV MMU update hypercalls call map_pages_to_xen while holding a page's page_lock(). ++ * This has two implications: ++ * - We cannot reuse reuse current_locked_page_* for debugging ++ * - To avoid the chance of deadlock, even for different pages, we ++ * must never grab page_lock() after grabbing l3t_lock(). This ++ * includes any page_lock()-based locks, such as ++ * mem_sharing_page_lock(). ++ * ++ * Also note that we grab the map_pgdir_lock while holding the ++ * l3t_lock(), so to avoid deadlock we must avoid grabbing them in ++ * reverse order. ++ */ ++static void l3t_lock(struct page_info *page) ++{ ++ unsigned long x, nx; ++ ++ do { ++ while ( (x = page->u.inuse.type_info) & PGT_locked ) ++ cpu_relax(); ++ nx = x | PGT_locked; ++ } while ( cmpxchg(&page->u.inuse.type_info, x, nx) != x ); ++} ++ ++static void l3t_unlock(struct page_info *page) ++{ ++ unsigned long x, nx, y = page->u.inuse.type_info; ++ ++ do { ++ x = y; ++ BUG_ON(!(x & PGT_locked)); ++ nx = x & ~PGT_locked; ++ } while ( (y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x ); ++} ++ + #ifdef CONFIG_PV + /* + * PTE flags that a guest may change without re-validating the PTE. +@@ -5177,6 +5221,23 @@ l1_pgentry_t *virt_to_xen_l1e(unsigned long v) + flush_area_local((const void *)v, f) : \ + flush_area_all((const void *)v, f)) + ++#define L3T_INIT(page) (page) = ZERO_BLOCK_PTR ++ ++#define L3T_LOCK(page) \ ++ do { \ ++ if ( locking ) \ ++ l3t_lock(page); \ ++ } while ( false ) ++ ++#define L3T_UNLOCK(page) \ ++ do { \ ++ if ( locking && (page) != ZERO_BLOCK_PTR ) \ ++ { \ ++ l3t_unlock(page); \ ++ (page) = ZERO_BLOCK_PTR; \ ++ } \ ++ } while ( false ) ++ + int map_pages_to_xen( + unsigned long virt, + mfn_t mfn, +@@ -5188,6 +5249,7 @@ int map_pages_to_xen( + l1_pgentry_t *pl1e, ol1e; + unsigned int i; + int rc = -ENOMEM; ++ struct page_info *current_l3page; + + #define flush_flags(oldf) do { \ + unsigned int o_ = (oldf); \ +@@ -5203,13 +5265,20 @@ int map_pages_to_xen( + } \ + } while (0) + ++ L3T_INIT(current_l3page); ++ + while ( nr_mfns != 0 ) + { +- l3_pgentry_t ol3e, *pl3e = virt_to_xen_l3e(virt); ++ l3_pgentry_t *pl3e, ol3e; + ++ L3T_UNLOCK(current_l3page); ++ ++ pl3e = virt_to_xen_l3e(virt); + if ( !pl3e ) + goto out; + ++ current_l3page = virt_to_page(pl3e); ++ L3T_LOCK(current_l3page); + ol3e = *pl3e; + + if ( cpu_has_page1gb && +@@ -5543,6 +5612,7 @@ int map_pages_to_xen( + rc = 0; + + out: ++ L3T_UNLOCK(current_l3page); + return rc; + } + +@@ -5571,6 +5641,7 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf) + unsigned int i; + unsigned long v = s; + int rc = -ENOMEM; ++ struct page_info *current_l3page; + + /* Set of valid PTE bits which may be altered. */ + #define FLAGS_MASK (_PAGE_NX|_PAGE_RW|_PAGE_PRESENT) +@@ -5579,11 +5650,22 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf) + ASSERT(IS_ALIGNED(s, PAGE_SIZE)); + ASSERT(IS_ALIGNED(e, PAGE_SIZE)); + ++ L3T_INIT(current_l3page); ++ + while ( v < e ) + { +- l3_pgentry_t *pl3e = virt_to_xen_l3e(v); ++ l3_pgentry_t *pl3e; ++ ++ L3T_UNLOCK(current_l3page); + +- if ( !pl3e || !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ) ++ pl3e = virt_to_xen_l3e(v); ++ if ( !pl3e ) ++ goto out; ++ ++ current_l3page = virt_to_page(pl3e); ++ L3T_LOCK(current_l3page); ++ ++ if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ) + { + /* Confirm the caller isn't trying to create new mappings. */ + ASSERT(!(nf & _PAGE_PRESENT)); +@@ -5801,9 +5883,13 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf) + rc = 0; + + out: ++ L3T_UNLOCK(current_l3page); + return rc; + } + ++#undef L3T_LOCK ++#undef L3T_UNLOCK ++ + #undef flush_area + + int destroy_xen_mappings(unsigned long s, unsigned long e) +-- +2.25.1 + diff --git a/system/xen/xsa/xsa346-4.13-1.patch b/system/xen/xsa/xsa346-4.13-1.patch new file mode 100644 index 0000000000000..a32e658e8085d --- /dev/null +++ b/system/xen/xsa/xsa346-4.13-1.patch @@ -0,0 +1,50 @@ +From: Jan Beulich <jbeulich@suse.com> +Subject: IOMMU: suppress "iommu_dont_flush_iotlb" when about to free a page + +Deferring flushes to a single, wide range one - as is done when +handling XENMAPSPACE_gmfn_range - is okay only as long as +pages don't get freed ahead of the eventual flush. While the only +function setting the flag (xenmem_add_to_physmap()) suggests by its name +that it's only mapping new entries, in reality the way +xenmem_add_to_physmap_one() works means an unmap would happen not only +for the page being moved (but not freed) but, if the destination GFN is +populated, also for the page being displaced from that GFN. Collapsing +the two flushes for this GFN into just one (end even more so deferring +it to a batched invocation) is not correct. + +This is part of XSA-346. + +Fixes: cf95b2a9fd5a ("iommu: Introduce per cpu flag (iommu_dont_flush_iotlb) to avoid unnecessary iotlb... ") +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Paul Durrant <paul@xen.org> +Acked-by: Julien Grall <jgrall@amazon.com> + +--- a/xen/common/memory.c ++++ b/xen/common/memory.c +@@ -292,6 +292,7 @@ int guest_remove_page(struct domain *d, + p2m_type_t p2mt; + #endif + mfn_t mfn; ++ bool *dont_flush_p, dont_flush; + int rc; + + #ifdef CONFIG_X86 +@@ -378,8 +379,18 @@ int guest_remove_page(struct domain *d, + return -ENXIO; + } + ++ /* ++ * Since we're likely to free the page below, we need to suspend ++ * xenmem_add_to_physmap()'s suppressing of IOMMU TLB flushes. ++ */ ++ dont_flush_p = &this_cpu(iommu_dont_flush_iotlb); ++ dont_flush = *dont_flush_p; ++ *dont_flush_p = false; ++ + rc = guest_physmap_remove_page(d, _gfn(gmfn), mfn, 0); + ++ *dont_flush_p = dont_flush; ++ + /* + * With the lack of an IOMMU on some platforms, domains with DMA-capable + * device must retrieve the same pfn when the hypercall populate_physmap diff --git a/system/xen/xsa/xsa346-4.13-2.patch b/system/xen/xsa/xsa346-4.13-2.patch new file mode 100644 index 0000000000000..6371b5c3db7d2 --- /dev/null +++ b/system/xen/xsa/xsa346-4.13-2.patch @@ -0,0 +1,204 @@ +From: Jan Beulich <jbeulich@suse.com> +Subject: IOMMU: hold page ref until after deferred TLB flush + +When moving around a page via XENMAPSPACE_gmfn_range, deferring the TLB +flush for the "from" GFN range requires that the page remains allocated +to the guest until the TLB flush has actually occurred. Otherwise a +parallel hypercall to remove the page would only flush the TLB for the +GFN it has been moved to, but not the one is was mapped at originally. + +This is part of XSA-346. + +Fixes: cf95b2a9fd5a ("iommu: Introduce per cpu flag (iommu_dont_flush_iotlb) to avoid unnecessary iotlb... ") +Reported-by: Julien Grall <jgrall@amazon.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Julien Grall <jgrall@amazon.com> + +--- a/xen/arch/arm/mm.c ++++ b/xen/arch/arm/mm.c +@@ -1407,7 +1407,7 @@ void share_xen_page_with_guest(struct pa + int xenmem_add_to_physmap_one( + struct domain *d, + unsigned int space, +- union xen_add_to_physmap_batch_extra extra, ++ union add_to_physmap_extra extra, + unsigned long idx, + gfn_t gfn) + { +@@ -1480,10 +1480,6 @@ int xenmem_add_to_physmap_one( + break; + } + case XENMAPSPACE_dev_mmio: +- /* extra should be 0. Reserved for future use. */ +- if ( extra.res0 ) +- return -EOPNOTSUPP; +- + rc = map_dev_mmio_region(d, gfn, 1, _mfn(idx)); + return rc; + +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -4617,7 +4617,7 @@ static int handle_iomem_range(unsigned l + int xenmem_add_to_physmap_one( + struct domain *d, + unsigned int space, +- union xen_add_to_physmap_batch_extra extra, ++ union add_to_physmap_extra extra, + unsigned long idx, + gfn_t gpfn) + { +@@ -4701,9 +4701,20 @@ int xenmem_add_to_physmap_one( + rc = guest_physmap_add_page(d, gpfn, mfn, PAGE_ORDER_4K); + + put_both: +- /* In the XENMAPSPACE_gmfn case, we took a ref of the gfn at the top. */ ++ /* ++ * In the XENMAPSPACE_gmfn case, we took a ref of the gfn at the top. ++ * We also may need to transfer ownership of the page reference to our ++ * caller. ++ */ + if ( space == XENMAPSPACE_gmfn ) ++ { + put_gfn(d, gfn); ++ if ( !rc && extra.ppage ) ++ { ++ *extra.ppage = page; ++ page = NULL; ++ } ++ } + + if ( page ) + put_page(page); +--- a/xen/common/memory.c ++++ b/xen/common/memory.c +@@ -814,13 +814,12 @@ int xenmem_add_to_physmap(struct domain + { + unsigned int done = 0; + long rc = 0; +- union xen_add_to_physmap_batch_extra extra; ++ union add_to_physmap_extra extra = {}; ++ struct page_info *pages[16]; + + ASSERT(paging_mode_translate(d)); + +- if ( xatp->space != XENMAPSPACE_gmfn_foreign ) +- extra.res0 = 0; +- else ++ if ( xatp->space == XENMAPSPACE_gmfn_foreign ) + extra.foreign_domid = DOMID_INVALID; + + if ( xatp->space != XENMAPSPACE_gmfn_range ) +@@ -835,7 +834,10 @@ int xenmem_add_to_physmap(struct domain + xatp->size -= start; + + if ( is_iommu_enabled(d) ) ++ { + this_cpu(iommu_dont_flush_iotlb) = 1; ++ extra.ppage = &pages[0]; ++ } + + while ( xatp->size > done ) + { +@@ -847,8 +849,12 @@ int xenmem_add_to_physmap(struct domain + xatp->idx++; + xatp->gpfn++; + ++ if ( extra.ppage ) ++ ++extra.ppage; ++ + /* Check for continuation if it's not the last iteration. */ +- if ( xatp->size > ++done && hypercall_preempt_check() ) ++ if ( (++done > ARRAY_SIZE(pages) && extra.ppage) || ++ (xatp->size > done && hypercall_preempt_check()) ) + { + rc = start + done; + break; +@@ -858,6 +864,7 @@ int xenmem_add_to_physmap(struct domain + if ( is_iommu_enabled(d) ) + { + int ret; ++ unsigned int i; + + this_cpu(iommu_dont_flush_iotlb) = 0; + +@@ -866,6 +873,15 @@ int xenmem_add_to_physmap(struct domain + if ( unlikely(ret) && rc >= 0 ) + rc = ret; + ++ /* ++ * Now that the IOMMU TLB flush was done for the original GFN, drop ++ * the page references. The 2nd flush below is fine to make later, as ++ * whoever removes the page again from its new GFN will have to do ++ * another flush anyway. ++ */ ++ for ( i = 0; i < done; ++i ) ++ put_page(pages[i]); ++ + ret = iommu_iotlb_flush(d, _dfn(xatp->gpfn - done), done, + IOMMU_FLUSHF_added | IOMMU_FLUSHF_modified); + if ( unlikely(ret) && rc >= 0 ) +@@ -879,6 +895,8 @@ static int xenmem_add_to_physmap_batch(s + struct xen_add_to_physmap_batch *xatpb, + unsigned int extent) + { ++ union add_to_physmap_extra extra = {}; ++ + if ( unlikely(xatpb->size < extent) ) + return -EILSEQ; + +@@ -890,6 +908,19 @@ static int xenmem_add_to_physmap_batch(s + !guest_handle_subrange_okay(xatpb->errs, extent, xatpb->size - 1) ) + return -EFAULT; + ++ switch ( xatpb->space ) ++ { ++ case XENMAPSPACE_dev_mmio: ++ /* res0 is reserved for future use. */ ++ if ( xatpb->u.res0 ) ++ return -EOPNOTSUPP; ++ break; ++ ++ case XENMAPSPACE_gmfn_foreign: ++ extra.foreign_domid = xatpb->u.foreign_domid; ++ break; ++ } ++ + while ( xatpb->size > extent ) + { + xen_ulong_t idx; +@@ -902,8 +933,7 @@ static int xenmem_add_to_physmap_batch(s + extent, 1)) ) + return -EFAULT; + +- rc = xenmem_add_to_physmap_one(d, xatpb->space, +- xatpb->u, ++ rc = xenmem_add_to_physmap_one(d, xatpb->space, extra, + idx, _gfn(gpfn)); + + if ( unlikely(__copy_to_guest_offset(xatpb->errs, extent, &rc, 1)) ) +--- a/xen/include/xen/mm.h ++++ b/xen/include/xen/mm.h +@@ -588,8 +588,22 @@ void scrub_one_page(struct page_info *); + &(d)->xenpage_list : &(d)->page_list) + #endif + ++union add_to_physmap_extra { ++ /* ++ * XENMAPSPACE_gmfn: When deferring TLB flushes, a page reference needs ++ * to be kept until after the flush, so the page can't get removed from ++ * the domain (and re-used for another purpose) beforehand. By passing ++ * non-NULL, the caller of xenmem_add_to_physmap_one() indicates it wants ++ * to have ownership of such a reference transferred in the success case. ++ */ ++ struct page_info **ppage; ++ ++ /* XENMAPSPACE_gmfn_foreign */ ++ domid_t foreign_domid; ++}; ++ + int xenmem_add_to_physmap_one(struct domain *d, unsigned int space, +- union xen_add_to_physmap_batch_extra extra, ++ union add_to_physmap_extra extra, + unsigned long idx, gfn_t gfn); + + int xenmem_add_to_physmap(struct domain *d, struct xen_add_to_physmap *xatp, diff --git a/system/xen/xsa/xsa347-4.13-1.patch b/system/xen/xsa/xsa347-4.13-1.patch new file mode 100644 index 0000000000000..e9f31a151f016 --- /dev/null +++ b/system/xen/xsa/xsa347-4.13-1.patch @@ -0,0 +1,149 @@ +From: Jan Beulich <jbeulich@suse.com> +Subject: AMD/IOMMU: convert amd_iommu_pte from struct to union + +This is to add a "raw" counterpart to the bitfield equivalent. Take the +opportunity and + - convert fields to bool / unsigned int, + - drop the naming of the reserved field, + - shorten the names of the ignored ones. + +This is part of XSA-347. + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Paul Durrant <paul@xen.org> + +--- a/xen/drivers/passthrough/amd/iommu_map.c ++++ b/xen/drivers/passthrough/amd/iommu_map.c +@@ -38,7 +38,7 @@ static unsigned int pfn_to_pde_idx(unsig + static unsigned int clear_iommu_pte_present(unsigned long l1_mfn, + unsigned long dfn) + { +- struct amd_iommu_pte *table, *pte; ++ union amd_iommu_pte *table, *pte; + unsigned int flush_flags; + + table = map_domain_page(_mfn(l1_mfn)); +@@ -52,7 +52,7 @@ static unsigned int clear_iommu_pte_pres + return flush_flags; + } + +-static unsigned int set_iommu_pde_present(struct amd_iommu_pte *pte, ++static unsigned int set_iommu_pde_present(union amd_iommu_pte *pte, + unsigned long next_mfn, + unsigned int next_level, bool iw, + bool ir) +@@ -87,7 +87,7 @@ static unsigned int set_iommu_pte_presen + int pde_level, + bool iw, bool ir) + { +- struct amd_iommu_pte *table, *pde; ++ union amd_iommu_pte *table, *pde; + unsigned int flush_flags; + + table = map_domain_page(_mfn(pt_mfn)); +@@ -178,7 +178,7 @@ void iommu_dte_set_guest_cr3(struct amd_ + static int iommu_pde_from_dfn(struct domain *d, unsigned long dfn, + unsigned long pt_mfn[], bool map) + { +- struct amd_iommu_pte *pde, *next_table_vaddr; ++ union amd_iommu_pte *pde, *next_table_vaddr; + unsigned long next_table_mfn; + unsigned int level; + struct page_info *table; +@@ -458,7 +458,7 @@ int __init amd_iommu_quarantine_init(str + unsigned long end_gfn = + 1ul << (DEFAULT_DOMAIN_ADDRESS_WIDTH - PAGE_SHIFT); + unsigned int level = amd_iommu_get_paging_mode(end_gfn); +- struct amd_iommu_pte *table; ++ union amd_iommu_pte *table; + + if ( hd->arch.root_table ) + { +@@ -489,7 +489,7 @@ int __init amd_iommu_quarantine_init(str + + for ( i = 0; i < PTE_PER_TABLE_SIZE; i++ ) + { +- struct amd_iommu_pte *pde = &table[i]; ++ union amd_iommu_pte *pde = &table[i]; + + /* + * PDEs are essentially a subset of PTEs, so this function +--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c ++++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c +@@ -390,7 +390,7 @@ static void deallocate_next_page_table(s + + static void deallocate_page_table(struct page_info *pg) + { +- struct amd_iommu_pte *table_vaddr; ++ union amd_iommu_pte *table_vaddr; + unsigned int index, level = PFN_ORDER(pg); + + PFN_ORDER(pg) = 0; +@@ -405,7 +405,7 @@ static void deallocate_page_table(struct + + for ( index = 0; index < PTE_PER_TABLE_SIZE; index++ ) + { +- struct amd_iommu_pte *pde = &table_vaddr[index]; ++ union amd_iommu_pte *pde = &table_vaddr[index]; + + if ( pde->mfn && pde->next_level && pde->pr ) + { +@@ -557,7 +557,7 @@ static void amd_dump_p2m_table_level(str + paddr_t gpa, int indent) + { + paddr_t address; +- struct amd_iommu_pte *table_vaddr; ++ const union amd_iommu_pte *table_vaddr; + int index; + + if ( level < 1 ) +@@ -573,7 +573,7 @@ static void amd_dump_p2m_table_level(str + + for ( index = 0; index < PTE_PER_TABLE_SIZE; index++ ) + { +- struct amd_iommu_pte *pde = &table_vaddr[index]; ++ const union amd_iommu_pte *pde = &table_vaddr[index]; + + if ( !(index % 2) ) + process_pending_softirqs(); +--- a/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h ++++ b/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h +@@ -465,20 +465,23 @@ union amd_iommu_x2apic_control { + #define IOMMU_PAGE_TABLE_U32_PER_ENTRY (IOMMU_PAGE_TABLE_ENTRY_SIZE / 4) + #define IOMMU_PAGE_TABLE_ALIGNMENT 4096 + +-struct amd_iommu_pte { +- uint64_t pr:1; +- uint64_t ignored0:4; +- uint64_t a:1; +- uint64_t d:1; +- uint64_t ignored1:2; +- uint64_t next_level:3; +- uint64_t mfn:40; +- uint64_t reserved:7; +- uint64_t u:1; +- uint64_t fc:1; +- uint64_t ir:1; +- uint64_t iw:1; +- uint64_t ignored2:1; ++union amd_iommu_pte { ++ uint64_t raw; ++ struct { ++ bool pr:1; ++ unsigned int ign0:4; ++ bool a:1; ++ bool d:1; ++ unsigned int ign1:2; ++ unsigned int next_level:3; ++ uint64_t mfn:40; ++ unsigned int :7; ++ bool u:1; ++ bool fc:1; ++ bool ir:1; ++ bool iw:1; ++ unsigned int ign2:1; ++ }; + }; + + /* Paging modes */ diff --git a/system/xen/xsa/xsa347-4.13-2.patch b/system/xen/xsa/xsa347-4.13-2.patch new file mode 100644 index 0000000000000..fbe7461636071 --- /dev/null +++ b/system/xen/xsa/xsa347-4.13-2.patch @@ -0,0 +1,72 @@ +From: Jan Beulich <jbeulich@suse.com> +Subject: AMD/IOMMU: update live PTEs atomically + +Updating a live PTE bitfield by bitfield risks the compiler re-ordering +the individual updates as well as splitting individual updates into +multiple memory writes. Construct the new entry fully in a local +variable, do the check to determine the flushing needs on the thus +established new entry, and then write the new entry by a single insn. + +Similarly using memset() to clear a PTE is unsafe, as the order of +writes the function does is, at least in principle, undefined. + +This is part of XSA-347. + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Paul Durrant <paul@xen.org> + +--- a/xen/drivers/passthrough/amd/iommu_map.c ++++ b/xen/drivers/passthrough/amd/iommu_map.c +@@ -45,7 +45,7 @@ static unsigned int clear_iommu_pte_pres + pte = &table[pfn_to_pde_idx(dfn, 1)]; + + flush_flags = pte->pr ? IOMMU_FLUSHF_modified : 0; +- memset(pte, 0, sizeof(*pte)); ++ write_atomic(&pte->raw, 0); + + unmap_domain_page(table); + +@@ -57,26 +57,30 @@ static unsigned int set_iommu_pde_presen + unsigned int next_level, bool iw, + bool ir) + { ++ union amd_iommu_pte new = {}, old; + unsigned int flush_flags = IOMMU_FLUSHF_added; + +- if ( pte->pr && +- (pte->mfn != next_mfn || +- pte->iw != iw || +- pte->ir != ir || +- pte->next_level != next_level) ) +- flush_flags |= IOMMU_FLUSHF_modified; +- + /* + * FC bit should be enabled in PTE, this helps to solve potential + * issues with ATS devices + */ +- pte->fc = !next_level; ++ new.fc = !next_level; ++ ++ new.mfn = next_mfn; ++ new.iw = iw; ++ new.ir = ir; ++ new.next_level = next_level; ++ new.pr = true; ++ ++ old.raw = read_atomic(&pte->raw); ++ old.ign0 = 0; ++ old.ign1 = 0; ++ old.ign2 = 0; ++ ++ if ( old.pr && old.raw != new.raw ) ++ flush_flags |= IOMMU_FLUSHF_modified; + +- pte->mfn = next_mfn; +- pte->iw = iw; +- pte->ir = ir; +- pte->next_level = next_level; +- pte->pr = 1; ++ write_atomic(&pte->raw, new.raw); + + return flush_flags; + } diff --git a/system/xen/xsa/xsa347-4.13-3.patch b/system/xen/xsa/xsa347-4.13-3.patch new file mode 100644 index 0000000000000..90c8e66020c71 --- /dev/null +++ b/system/xen/xsa/xsa347-4.13-3.patch @@ -0,0 +1,59 @@ +From: Jan Beulich <jbeulich@suse.com> +Subject: AMD/IOMMU: ensure suitable ordering of DTE modifications + +DMA and interrupt translation should be enabled only after other +applicable DTE fields have been written. Similarly when disabling +translation or when moving a device between domains, translation should +first be disabled, before other entry fields get modified. Note however +that the "moving" aspect doesn't apply to the interrupt remapping side, +as domain specifics are maintained in the IRTEs here, not the DTE. We +also never disable interrupt remapping once it got enabled for a device +(the respective argument passed is always the immutable iommu_intremap). + +This is part of XSA-347. + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Paul Durrant <paul@xen.org> + +--- a/xen/drivers/passthrough/amd/iommu_map.c ++++ b/xen/drivers/passthrough/amd/iommu_map.c +@@ -107,11 +107,18 @@ void amd_iommu_set_root_page_table(struc + uint64_t root_ptr, uint16_t domain_id, + uint8_t paging_mode, bool valid) + { ++ if ( valid || dte->v ) ++ { ++ dte->tv = false; ++ dte->v = true; ++ smp_wmb(); ++ } + dte->domain_id = domain_id; + dte->pt_root = paddr_to_pfn(root_ptr); + dte->iw = true; + dte->ir = true; + dte->paging_mode = paging_mode; ++ smp_wmb(); + dte->tv = true; + dte->v = valid; + } +@@ -134,6 +141,7 @@ void amd_iommu_set_intremap_table( + } + + dte->ig = false; /* unmapped interrupts result in i/o page faults */ ++ smp_wmb(); + dte->iv = valid; + } + +--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c ++++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c +@@ -120,7 +120,10 @@ static void amd_iommu_setup_domain_devic + /* Undo what amd_iommu_disable_domain_device() may have done. */ + ivrs_dev = &get_ivrs_mappings(iommu->seg)[req_id]; + if ( dte->it_root ) ++ { + dte->int_ctl = IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED; ++ smp_wmb(); ++ } + dte->iv = iommu_intremap; + dte->ex = ivrs_dev->dte_allow_exclusion; + dte->sys_mgt = MASK_EXTR(ivrs_dev->device_flags, ACPI_IVHD_SYSTEM_MGMT); |