diff options
author | Mario Preksavec <mario@slackware.hr> | 2019-11-19 13:17:56 +0100 |
---|---|---|
committer | Willy Sudiarto Raharjo <willysr@slackbuilds.org> | 2019-11-23 16:02:01 +0700 |
commit | 903c02712d4cf39ae8218eb47149258dfa8c7d8a (patch) | |
tree | 61c869ed4904270e41898b9bcf74090822e28418 /system/xen | |
parent | 604be6a3da8dc95e2d89a426877c7f4021eb91df (diff) |
system/xen: Updated for version 4.12.1.
Signed-off-by: Mario Preksavec <mario@slackware.hr>
Diffstat (limited to 'system/xen')
34 files changed, 4956 insertions, 18 deletions
diff --git a/system/xen/dom0/README.dom0 b/system/xen/dom0/README.dom0 index a624587c3d717..728e3bae79bdf 100644 --- a/system/xen/dom0/README.dom0 +++ b/system/xen/dom0/README.dom0 @@ -46,7 +46,7 @@ Xen EFI binary. To make things a bit easier, a copy of Xen EFI binary can be found here: - http://slackware.hr/~mario/xen/xen-4.12.0.efi.gz + http://slackware.hr/~mario/xen/xen-4.12.1.efi.gz If an automatic boot to Xen kernel is desired, the binary should be renamed and copied to the following location: /boot/efi/EFI/BOOT/bootx64.efi diff --git a/system/xen/dom0/config-4.4.172-xen.i686 b/system/xen/dom0/config-4.4.202-xen.i686 index 0cf848c1c95dd..b31c558e377e3 100644 --- a/system/xen/dom0/config-4.4.172-xen.i686 +++ b/system/xen/dom0/config-4.4.202-xen.i686 @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. -# Linux/x86 4.4.172 Kernel Configuration +# Linux/x86 4.4.202 Kernel Configuration # # CONFIG_64BIT is not set CONFIG_X86_32=y @@ -545,6 +545,9 @@ CONFIG_ARCH_USES_PG_UNCACHED=y CONFIG_ARCH_RANDOM=y CONFIG_X86_SMAP=y # CONFIG_X86_INTEL_MPX is not set +CONFIG_X86_INTEL_TSX_MODE_OFF=y +# CONFIG_X86_INTEL_TSX_MODE_ON is not set +# CONFIG_X86_INTEL_TSX_MODE_AUTO is not set CONFIG_EFI=y # CONFIG_EFI_STUB is not set CONFIG_SECCOMP=y @@ -3473,6 +3476,7 @@ CONFIG_N_HDLC=m CONFIG_N_GSM=m CONFIG_TRACE_ROUTER=m CONFIG_TRACE_SINK=m +CONFIG_LDISC_AUTOLOAD=y CONFIG_DEVMEM=y CONFIG_DEVKMEM=y @@ -3553,7 +3557,6 @@ CONFIG_HW_RANDOM_VIRTIO=m CONFIG_HW_RANDOM_TPM=m CONFIG_NVRAM=m CONFIG_DTLK=m -CONFIG_R3964=m CONFIG_APPLICOM=m CONFIG_SONYPI=m @@ -5666,7 +5669,6 @@ CONFIG_USB_EMI62=m CONFIG_USB_EMI26=m CONFIG_USB_ADUTUX=m CONFIG_USB_SEVSEG=m -CONFIG_USB_RIO500=m CONFIG_USB_LEGOTOWER=m CONFIG_USB_LCD=m CONFIG_USB_LED=m @@ -6669,7 +6671,7 @@ CONFIG_FSNOTIFY=y CONFIG_DNOTIFY=y CONFIG_INOTIFY_USER=y CONFIG_FANOTIFY=y -# CONFIG_FANOTIFY_ACCESS_PERMISSIONS is not set +CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y CONFIG_QUOTA=y CONFIG_QUOTA_NETLINK_INTERFACE=y # CONFIG_PRINT_QUOTA_WARNING is not set @@ -7081,6 +7083,7 @@ CONFIG_TEST_HEXDUMP=m # CONFIG_TEST_KSTRTOX is not set # CONFIG_TEST_PRINTF is not set # CONFIG_TEST_RHASHTABLE is not set +# CONFIG_TEST_HASH is not set # CONFIG_PROVIDE_OHCI1394_DMA_INIT is not set # CONFIG_DMA_API_DEBUG is not set CONFIG_TEST_LKM=m @@ -7226,7 +7229,7 @@ CONFIG_CRYPTO_VMAC=m # Digest # CONFIG_CRYPTO_CRC32C=y -CONFIG_CRYPTO_CRC32C_INTEL=m +CONFIG_CRYPTO_CRC32C_INTEL=y CONFIG_CRYPTO_CRC32=m CONFIG_CRYPTO_CRC32_PCLMUL=m CONFIG_CRYPTO_CRCT10DIF=y diff --git a/system/xen/dom0/config-4.4.172-xen.x86_64 b/system/xen/dom0/config-4.4.202-xen.x86_64 index 9dcff9e02418d..f81a31b9ab97d 100644 --- a/system/xen/dom0/config-4.4.172-xen.x86_64 +++ b/system/xen/dom0/config-4.4.202-xen.x86_64 @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. -# Linux/x86 4.4.172 Kernel Configuration +# Linux/x86 4.4.202 Kernel Configuration # CONFIG_64BIT=y CONFIG_X86_64=y @@ -525,6 +525,9 @@ CONFIG_ARCH_USES_PG_UNCACHED=y CONFIG_ARCH_RANDOM=y CONFIG_X86_SMAP=y # CONFIG_X86_INTEL_MPX is not set +CONFIG_X86_INTEL_TSX_MODE_OFF=y +# CONFIG_X86_INTEL_TSX_MODE_ON is not set +# CONFIG_X86_INTEL_TSX_MODE_AUTO is not set CONFIG_EFI=y CONFIG_EFI_STUB=y CONFIG_EFI_MIXED=y @@ -3378,6 +3381,7 @@ CONFIG_N_HDLC=m CONFIG_N_GSM=m CONFIG_TRACE_ROUTER=m CONFIG_TRACE_SINK=m +CONFIG_LDISC_AUTOLOAD=y CONFIG_DEVMEM=y CONFIG_DEVKMEM=y @@ -3449,7 +3453,6 @@ CONFIG_HW_RANDOM_VIA=m CONFIG_HW_RANDOM_VIRTIO=m CONFIG_HW_RANDOM_TPM=m CONFIG_NVRAM=m -CONFIG_R3964=m CONFIG_APPLICOM=m # @@ -5468,7 +5471,6 @@ CONFIG_USB_EMI62=m CONFIG_USB_EMI26=m CONFIG_USB_ADUTUX=m CONFIG_USB_SEVSEG=m -CONFIG_USB_RIO500=m CONFIG_USB_LEGOTOWER=m CONFIG_USB_LCD=m CONFIG_USB_LED=m @@ -6477,7 +6479,7 @@ CONFIG_FSNOTIFY=y CONFIG_DNOTIFY=y CONFIG_INOTIFY_USER=y CONFIG_FANOTIFY=y -# CONFIG_FANOTIFY_ACCESS_PERMISSIONS is not set +CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y CONFIG_QUOTA=y CONFIG_QUOTA_NETLINK_INTERFACE=y # CONFIG_PRINT_QUOTA_WARNING is not set @@ -6892,6 +6894,7 @@ CONFIG_TEST_HEXDUMP=m # CONFIG_TEST_KSTRTOX is not set # CONFIG_TEST_PRINTF is not set # CONFIG_TEST_RHASHTABLE is not set +# CONFIG_TEST_HASH is not set # CONFIG_PROVIDE_OHCI1394_DMA_INIT is not set # CONFIG_DMA_API_DEBUG is not set CONFIG_TEST_LKM=m @@ -7040,7 +7043,7 @@ CONFIG_CRYPTO_VMAC=m # Digest # CONFIG_CRYPTO_CRC32C=y -CONFIG_CRYPTO_CRC32C_INTEL=m +CONFIG_CRYPTO_CRC32C_INTEL=y CONFIG_CRYPTO_CRC32=m CONFIG_CRYPTO_CRC32_PCLMUL=m CONFIG_CRYPTO_CRCT10DIF=y diff --git a/system/xen/dom0/kernel-xen.sh b/system/xen/dom0/kernel-xen.sh index 7067c04b79949..7228dcc541470 100644 --- a/system/xen/dom0/kernel-xen.sh +++ b/system/xen/dom0/kernel-xen.sh @@ -5,8 +5,8 @@ # Written by Chris Abela <chris.abela@maltats.com>, 20100515 # Modified by Mario Preksavec <mario@slackware.hr> -KERNEL=${KERNEL:-4.4.172} -XEN=${XEN:-4.12.0} +KERNEL=${KERNEL:-4.4.202} +XEN=${XEN:-4.12.1} BOOTLOADER=${BOOTLOADER:-lilo} ROOTMOD=${ROOTMOD:-ext4} diff --git a/system/xen/domU/domU.sh b/system/xen/domU/domU.sh index c471242392e6c..f017c1bd23737 100644 --- a/system/xen/domU/domU.sh +++ b/system/xen/domU/domU.sh @@ -7,7 +7,7 @@ set -e -KERNEL=${KERNEL:-4.4.172} +KERNEL=${KERNEL:-4.4.202} # Build an image for the root file system and another for the swap # Default values : 8GB and 500MB resepectively. diff --git a/system/xen/xen.SlackBuild b/system/xen/xen.SlackBuild index 506d100a4c1fb..52bec8230e342 100644 --- a/system/xen/xen.SlackBuild +++ b/system/xen/xen.SlackBuild @@ -23,7 +23,7 @@ # ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PRGNAM=xen -VERSION=${VERSION:-4.12.0} +VERSION=${VERSION:-4.12.1} BUILD=${BUILD:-1} TAG=${TAG:-_SBo} diff --git a/system/xen/xen.info b/system/xen/xen.info index ae94f715cdf14..e0ee8e6f20056 100644 --- a/system/xen/xen.info +++ b/system/xen/xen.info @@ -1,7 +1,7 @@ PRGNAM="xen" -VERSION="4.12.0" +VERSION="4.12.1" HOMEPAGE="http://www.xenproject.org/" -DOWNLOAD="http://mirror.slackware.hr/sources/xen/xen-4.12.0.tar.gz \ +DOWNLOAD="http://mirror.slackware.hr/sources/xen/xen-4.12.1.tar.gz \ http://mirror.slackware.hr/sources/xen-extfiles/ipxe-git-d2063b7693e0e35db97b2264aa987eb6341ae779.tar.gz \ http://mirror.slackware.hr/sources/xen-extfiles/lwip-1.3.0.tar.gz \ http://mirror.slackware.hr/sources/xen-extfiles/zlib-1.2.3.tar.gz \ @@ -13,7 +13,7 @@ DOWNLOAD="http://mirror.slackware.hr/sources/xen/xen-4.12.0.tar.gz \ http://mirror.slackware.hr/sources/xen-extfiles/tpm_emulator-0.7.4.tar.gz \ http://mirror.slackware.hr/sources/xen-seabios/seabios-1.12.0.tar.gz \ http://mirror.slackware.hr/sources/xen-ovmf/xen-ovmf-20180725_ef529e6ab7.tar.bz2" -MD5SUM="7d24d4541e3025421e02384cabc3528b \ +MD5SUM="3f96ae93a5d6a3dd89bdf1398e30895e \ 0de05da7aec358881bb1dff815ecca14 \ 36cc57650cffda9a0269493be2a169bb \ debc62758716a169df9f62e6ab2bc634 \ diff --git a/system/xen/xsa/xsa296.patch b/system/xen/xsa/xsa296.patch new file mode 100644 index 0000000000000..e71ea7f790f27 --- /dev/null +++ b/system/xen/xsa/xsa296.patch @@ -0,0 +1,195 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: xen/hypercall: Don't use BUG() for parameter checking in hypercall_create_continuation() + +Since c/s 1d429034 "hypercall: update vcpu_op to take an unsigned vcpuid", +which incorrectly swapped 'i' for 'u' in the parameter type list, guests have +been able to hit the BUG() in next_args()'s default case. + +Correct these back to 'i'. + +In addition, make adjustments to prevent this class of issue from occurring in +the future - crashing Xen is not an appropriate form of parameter checking. + +Capitalise NEXT_ARG() to catch all uses, to highlight that it is a macro doing +non-function-like things behind the scenes, and undef it when appropriate. +Implement a bad_fmt: block which prints an error, asserts unreachable, and +crashes the guest. + +On the ARM side, drop all parameter checking of p. It is asymmetric with the +x86 side, and akin to expecting memcpy() or sprintf() to check their src/fmt +parameter before use. A caller passing "" or something other than a string +literal will be obvious during code review. + +This is XSA-296. + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Acked-by: Julien Grall <julien.grall@arm.com> + +diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c +index 941bbff4fe..a3da8e9c08 100644 +--- a/xen/arch/arm/domain.c ++++ b/xen/arch/arm/domain.c +@@ -383,14 +383,15 @@ void sync_vcpu_execstate(struct vcpu *v) + /* Nothing to do -- no lazy switching */ + } + +-#define next_arg(fmt, args) ({ \ ++#define NEXT_ARG(fmt, args) \ ++({ \ + unsigned long __arg; \ + switch ( *(fmt)++ ) \ + { \ + case 'i': __arg = (unsigned long)va_arg(args, unsigned int); break; \ + case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break; \ + case 'h': __arg = (unsigned long)va_arg(args, void *); break; \ +- default: __arg = 0; BUG(); \ ++ default: goto bad_fmt; \ + } \ + __arg; \ + }) +@@ -405,9 +406,6 @@ unsigned long hypercall_create_continuation( + unsigned int i; + va_list args; + +- /* All hypercalls take at least one argument */ +- BUG_ON( !p || *p == '\0' ); +- + current->hcall_preempted = true; + + va_start(args, format); +@@ -415,7 +413,7 @@ unsigned long hypercall_create_continuation( + if ( mcs->flags & MCSF_in_multicall ) + { + for ( i = 0; *p != '\0'; i++ ) +- mcs->call.args[i] = next_arg(p, args); ++ mcs->call.args[i] = NEXT_ARG(p, args); + + /* Return value gets written back to mcs->call.result */ + rc = mcs->call.result; +@@ -431,7 +429,7 @@ unsigned long hypercall_create_continuation( + + for ( i = 0; *p != '\0'; i++ ) + { +- arg = next_arg(p, args); ++ arg = NEXT_ARG(p, args); + + switch ( i ) + { +@@ -454,7 +452,7 @@ unsigned long hypercall_create_continuation( + + for ( i = 0; *p != '\0'; i++ ) + { +- arg = next_arg(p, args); ++ arg = NEXT_ARG(p, args); + + switch ( i ) + { +@@ -475,8 +473,16 @@ unsigned long hypercall_create_continuation( + va_end(args); + + return rc; ++ ++ bad_fmt: ++ gprintk(XENLOG_ERR, "Bad hypercall continuation format '%c'\n", *p); ++ ASSERT_UNREACHABLE(); ++ domain_crash(current->domain); ++ return 0; + } + ++#undef NEXT_ARG ++ + void startup_cpu_idle_loop(void) + { + struct vcpu *v = current; +diff --git a/xen/arch/x86/hypercall.c b/xen/arch/x86/hypercall.c +index d483dbaa6b..4643e5eb43 100644 +--- a/xen/arch/x86/hypercall.c ++++ b/xen/arch/x86/hypercall.c +@@ -80,14 +80,15 @@ const hypercall_args_t hypercall_args_table[NR_hypercalls] = + #undef COMP + #undef ARGS + +-#define next_arg(fmt, args) ({ \ ++#define NEXT_ARG(fmt, args) \ ++({ \ + unsigned long __arg; \ + switch ( *(fmt)++ ) \ + { \ + case 'i': __arg = (unsigned long)va_arg(args, unsigned int); break; \ + case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break; \ + case 'h': __arg = (unsigned long)va_arg(args, void *); break; \ +- default: __arg = 0; BUG(); \ ++ default: goto bad_fmt; \ + } \ + __arg; \ + }) +@@ -109,7 +110,7 @@ unsigned long hypercall_create_continuation( + if ( mcs->flags & MCSF_in_multicall ) + { + for ( i = 0; *p != '\0'; i++ ) +- mcs->call.args[i] = next_arg(p, args); ++ mcs->call.args[i] = NEXT_ARG(p, args); + } + else + { +@@ -121,7 +122,7 @@ unsigned long hypercall_create_continuation( + { + for ( i = 0; *p != '\0'; i++ ) + { +- arg = next_arg(p, args); ++ arg = NEXT_ARG(p, args); + switch ( i ) + { + case 0: regs->rdi = arg; break; +@@ -137,7 +138,7 @@ unsigned long hypercall_create_continuation( + { + for ( i = 0; *p != '\0'; i++ ) + { +- arg = next_arg(p, args); ++ arg = NEXT_ARG(p, args); + switch ( i ) + { + case 0: regs->rbx = arg; break; +@@ -154,8 +155,16 @@ unsigned long hypercall_create_continuation( + va_end(args); + + return op; ++ ++ bad_fmt: ++ gprintk(XENLOG_ERR, "Bad hypercall continuation format '%c'\n", *p); ++ ASSERT_UNREACHABLE(); ++ domain_crash(curr->domain); ++ return 0; + } + ++#undef NEXT_ARG ++ + int hypercall_xlat_continuation(unsigned int *id, unsigned int nr, + unsigned int mask, ...) + { +diff --git a/xen/common/compat/domain.c b/xen/common/compat/domain.c +index 39877b3ab2..2531fa7421 100644 +--- a/xen/common/compat/domain.c ++++ b/xen/common/compat/domain.c +@@ -81,7 +81,7 @@ int compat_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) ar + } + + if ( rc == -ERESTART ) +- rc = hypercall_create_continuation(__HYPERVISOR_vcpu_op, "iuh", ++ rc = hypercall_create_continuation(__HYPERVISOR_vcpu_op, "iih", + cmd, vcpuid, arg); + + break; +diff --git a/xen/common/domain.c b/xen/common/domain.c +index 2308588052..65bcd85e34 100644 +--- a/xen/common/domain.c ++++ b/xen/common/domain.c +@@ -1411,7 +1411,7 @@ long do_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg) + + rc = arch_initialise_vcpu(v, arg); + if ( rc == -ERESTART ) +- rc = hypercall_create_continuation(__HYPERVISOR_vcpu_op, "iuh", ++ rc = hypercall_create_continuation(__HYPERVISOR_vcpu_op, "iih", + cmd, vcpuid, arg); + + break; diff --git a/system/xen/xsa/xsa298.patch b/system/xen/xsa/xsa298.patch new file mode 100644 index 0000000000000..aa39042be56bd --- /dev/null +++ b/system/xen/xsa/xsa298.patch @@ -0,0 +1,89 @@ +From: Jan Beulich <jbeulich@suse.com> +Subject: x86/PV: check GDT/LDT limits during emulation + +Accesses beyond the LDT limit originating from emulation would trigger +the ASSERT() in pv_map_ldt_shadow_page(). On production builds such +accesses would cause an attempt to promote the touched page (offset from +the present LDT base address) to a segment descriptor one. If this +happens to succeed, guest user mode would be able to elevate its +privileges to that of the guest kernel. This is particularly easy when +there's no LDT at all, in which case the LDT base stored internally to +Xen is simply zero. + +Also adjust the ASSERT() that was triggering: It was off by one to +begin with, and for production builds we also better use +ASSERT_UNREACHABLE() instead with suitable recovery code afterwards. + +This is XSA-298. + +Reported-by: Andrew Cooper <andrew.cooper3@citrix.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> +--- +v2: Correct 64-bit-only limit check (by folding into the common one). + +--- a/xen/arch/x86/pv/emul-gate-op.c ++++ b/xen/arch/x86/pv/emul-gate-op.c +@@ -51,7 +51,13 @@ static int read_gate_descriptor(unsigned + const seg_desc_t *pdesc = gdt_ldt_desc_ptr(gate_sel); + + if ( (gate_sel < 4) || +- ((gate_sel >= FIRST_RESERVED_GDT_BYTE) && !(gate_sel & 4)) || ++ /* ++ * We're interested in call gates only, which occupy a single ++ * seg_desc_t for 32-bit and a consecutive pair of them for 64-bit. ++ */ ++ ((gate_sel >> 3) + !is_pv_32bit_vcpu(v) >= ++ (gate_sel & 4 ? v->arch.pv.ldt_ents ++ : v->arch.pv.gdt_ents)) || + __get_user(desc, pdesc) ) + return 0; + +@@ -70,7 +76,7 @@ static int read_gate_descriptor(unsigned + if ( !is_pv_32bit_vcpu(v) ) + { + if ( (*ar & 0x1f00) != 0x0c00 || +- (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) || ++ /* Limit check done above already. */ + __get_user(desc, pdesc + 1) || + (desc.b & 0x1f00) ) + return 0; +--- a/xen/arch/x86/pv/emulate.c ++++ b/xen/arch/x86/pv/emulate.c +@@ -31,7 +31,14 @@ int pv_emul_read_descriptor(unsigned int + { + seg_desc_t desc; + +- if ( sel < 4) ++ if ( sel < 4 || ++ /* ++ * Don't apply the GDT limit here, as the selector may be a Xen ++ * provided one. __get_user() will fail (without taking further ++ * action) for ones falling in the gap between guest populated ++ * and Xen ones. ++ */ ++ ((sel & 4) && (sel >> 3) >= v->arch.pv.ldt_ents) ) + desc.b = desc.a = 0; + else if ( __get_user(desc, gdt_ldt_desc_ptr(sel)) ) + return 0; +--- a/xen/arch/x86/pv/mm.c ++++ b/xen/arch/x86/pv/mm.c +@@ -92,12 +92,16 @@ bool pv_map_ldt_shadow_page(unsigned int + BUG_ON(unlikely(in_irq())); + + /* +- * Hardware limit checking should guarantee this property. NB. This is ++ * Prior limit checking should guarantee this property. NB. This is + * safe as updates to the LDT can only be made by MMUEXT_SET_LDT to the + * current vcpu, and vcpu_reset() will block until this vcpu has been + * descheduled before continuing. + */ +- ASSERT((offset >> 3) <= curr->arch.pv.ldt_ents); ++ if ( unlikely((offset >> 3) >= curr->arch.pv.ldt_ents) ) ++ { ++ ASSERT_UNREACHABLE(); ++ return false; ++ } + + if ( is_pv_32bit_domain(currd) ) + linear = (uint32_t)linear; diff --git a/system/xen/xsa/xsa299-4.12-0001-x86-mm-L1TF-checks-don-t-leave-a-partial-entry.patch b/system/xen/xsa/xsa299-4.12-0001-x86-mm-L1TF-checks-don-t-leave-a-partial-entry.patch new file mode 100644 index 0000000000000..fbb9d8086b6f8 --- /dev/null +++ b/system/xen/xsa/xsa299-4.12-0001-x86-mm-L1TF-checks-don-t-leave-a-partial-entry.patch @@ -0,0 +1,94 @@ +From 33d051917d5ef38f678b507a3c832afde48b9b49 Mon Sep 17 00:00:00 2001 +From: George Dunlap <george.dunlap@citrix.com> +Date: Thu, 10 Oct 2019 17:57:49 +0100 +Subject: [PATCH 01/11] x86/mm: L1TF checks don't leave a partial entry + +On detection of a potential L1TF issue, most validation code returns +-ERESTART to allow the switch to shadow mode to happen and cause the +original operation to be restarted. + +However, in the validation code, the return value -ERESTART has been +repurposed to indicate 1) the function has partially completed +something which needs to be undone, and 2) calling put_page_type() +should cleanly undo it. This causes problems in several places. + +For L1 tables, on receiving an -ERESTART return from alloc_l1_table(), +alloc_page_type() will set PGT_partial on the page. If for some +reason the original operation never restarts, then on domain +destruction, relinquish_memory() will call free_page_type() on the +page. + +Unfortunately, alloc_ and free_l1_table() aren't set up to deal with +PGT_partial. When returning a failure, alloc_l1_table() always +de-validates whatever it's validated so far, and free_l1_table() +always devalidates the whole page. This means that if +relinquish_memory() calls free_page_type() on an L1 that didn't +complete due to an L1TF, it will call put_page_from_l1e() on "page +entries" that have never been validated. + +For L2+ tables, setting rc to ERESTART causes the rest of the +alloc_lN_table() function to *think* that the entry in question will +have PGT_partial set. This will cause it to set partial_pte = 1. If +relinqush_memory() then calls free_page_type() on one of those pages, +then free_lN_table() will call put_page_from_lNe() on the entry when +it shouldn't. + +Rather than indicating -ERESTART, indicate -EINTR. This is the code +to indicate that nothing has changed from when you started the call +(which is effectively how alloc_l1_table() handles errors). + +mod_lN_entry() shouldn't have any of these types of problems, so leave +potential changes there for a clean-up patch later. + +This is part of XSA-299. + +Reported-by: George Dunlap <george.dunlap@citrix.com> +Signed-off-by: George Dunlap <george.dunlap@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +--- + xen/arch/x86/mm.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 3557cd1178..a1b55c10ff 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -1409,7 +1409,7 @@ static int alloc_l1_table(struct page_info *page) + { + if ( !(l1e_get_flags(pl1e[i]) & _PAGE_PRESENT) ) + { +- ret = pv_l1tf_check_l1e(d, pl1e[i]) ? -ERESTART : 0; ++ ret = pv_l1tf_check_l1e(d, pl1e[i]) ? -EINTR : 0; + if ( ret ) + goto out; + } +@@ -1517,7 +1517,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) + { + if ( !pv_l1tf_check_l2e(d, l2e) ) + continue; +- rc = -ERESTART; ++ rc = -EINTR; + } + else + rc = get_page_from_l2e(l2e, pfn, d, partial); +@@ -1603,7 +1603,7 @@ static int alloc_l3_table(struct page_info *page) + { + if ( !pv_l1tf_check_l3e(d, l3e) ) + continue; +- rc = -ERESTART; ++ rc = -EINTR; + } + else + rc = get_page_from_l3e(l3e, pfn, d, partial); +@@ -1783,7 +1783,7 @@ static int alloc_l4_table(struct page_info *page) + { + if ( !pv_l1tf_check_l4e(d, l4e) ) + continue; +- rc = -ERESTART; ++ rc = -EINTR; + } + else + rc = get_page_from_l4e(l4e, pfn, d, partial); +-- +2.23.0 + diff --git a/system/xen/xsa/xsa299-4.12-0002-x86-mm-Don-t-re-set-PGT_pinned-on-a-partially-de-val.patch b/system/xen/xsa/xsa299-4.12-0002-x86-mm-Don-t-re-set-PGT_pinned-on-a-partially-de-val.patch new file mode 100644 index 0000000000000..a74598e597fad --- /dev/null +++ b/system/xen/xsa/xsa299-4.12-0002-x86-mm-Don-t-re-set-PGT_pinned-on-a-partially-de-val.patch @@ -0,0 +1,99 @@ +From b490792c18f74b76ec8161721c1e07f810e36309 Mon Sep 17 00:00:00 2001 +From: George Dunlap <george.dunlap@citrix.com> +Date: Thu, 10 Oct 2019 17:57:49 +0100 +Subject: [PATCH 02/11] x86/mm: Don't re-set PGT_pinned on a partially + de-validated page + +When unpinning pagetables, if an operation is interrupted, +relinquish_memory() re-sets PGT_pinned so that the un-pin will +pickedup again when the hypercall restarts. + +This is appropriate when put_page_and_type_preemptible() returns +-EINTR, which indicates that the page is back in its initial state +(i.e., completely validated). However, for -ERESTART, this leads to a +state where a page has both PGT_pinned and PGT_partial set. + +This happens to work at the moment, although it's not really a +"canonical" state; but in subsequent patches, where we need to make a +distinction in handling between PGT_validated and PGT_partial pages, +this causes issues. + +Move to a "canonical" state by: +- Only re-setting PGT_pinned on -EINTR +- Re-dropping the refcount held by PGT_pinned on -ERESTART + +In the latter case, the PGT_partial bit will be cleared further down +with the rest of the other PGT_partial pages. + +While here, clean up some trainling whitespace. + +This is part of XSA-299. + +Reported-by: George Dunlap <george.dunlap@citrix.com> +Signed-off-by: George Dunlap <george.dunlap@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +--- + xen/arch/x86/domain.c | 31 ++++++++++++++++++++++++++++--- + 1 file changed, 28 insertions(+), 3 deletions(-) + +diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c +index 2585327834..59df8a6d8d 100644 +--- a/xen/arch/x86/domain.c ++++ b/xen/arch/x86/domain.c +@@ -114,7 +114,7 @@ static void play_dead(void) + * this case, heap corruption or #PF can occur (when heap debugging is + * enabled). For example, even printk() can involve tasklet scheduling, + * which touches per-cpu vars. +- * ++ * + * Consider very carefully when adding code to *dead_idle. Most hypervisor + * subsystems are unsafe to call. + */ +@@ -1909,9 +1909,34 @@ static int relinquish_memory( + break; + case -ERESTART: + case -EINTR: ++ /* ++ * -EINTR means PGT_validated has been re-set; re-set ++ * PGT_pinned again so that it gets picked up next time ++ * around. ++ * ++ * -ERESTART, OTOH, means PGT_partial is set instead. Put ++ * it back on the list, but don't set PGT_pinned; the ++ * section below will finish off de-validation. But we do ++ * need to drop the general ref associated with ++ * PGT_pinned, since put_page_and_type_preemptible() ++ * didn't do it. ++ * ++ * NB we can do an ASSERT for PGT_validated, since we ++ * "own" the type ref; but theoretically, the PGT_partial ++ * could be cleared by someone else. ++ */ ++ if ( ret == -EINTR ) ++ { ++ ASSERT(page->u.inuse.type_info & PGT_validated); ++ set_bit(_PGT_pinned, &page->u.inuse.type_info); ++ } ++ else ++ put_page(page); ++ + ret = -ERESTART; ++ ++ /* Put the page back on the list and drop the ref we grabbed above */ + page_list_add(page, list); +- set_bit(_PGT_pinned, &page->u.inuse.type_info); + put_page(page); + goto out; + default: +@@ -2161,7 +2186,7 @@ void vcpu_kick(struct vcpu *v) + * pending flag. These values may fluctuate (after all, we hold no + * locks) but the key insight is that each change will cause + * evtchn_upcall_pending to be polled. +- * ++ * + * NB2. We save the running flag across the unblock to avoid a needless + * IPI for domains that we IPI'd to unblock. + */ +-- +2.23.0 + diff --git a/system/xen/xsa/xsa299-4.12-0003-x86-mm-Separate-out-partial_pte-tristate-into-indivi.patch b/system/xen/xsa/xsa299-4.12-0003-x86-mm-Separate-out-partial_pte-tristate-into-indivi.patch new file mode 100644 index 0000000000000..226e5487b1579 --- /dev/null +++ b/system/xen/xsa/xsa299-4.12-0003-x86-mm-Separate-out-partial_pte-tristate-into-indivi.patch @@ -0,0 +1,618 @@ +From 0f9f61e5737fdd346550ec6e30161fa99e4653fa Mon Sep 17 00:00:00 2001 +From: George Dunlap <george.dunlap@citrix.com> +Date: Thu, 10 Oct 2019 17:57:49 +0100 +Subject: [PATCH 03/11] x86/mm: Separate out partial_pte tristate into + individual flags + +At the moment, partial_pte is a tri-state that contains two distinct bits +of information: + +1. If zero, the pte at index [nr_validated_ptes] is un-validated. If + non-zero, the pte was last seen with PGT_partial set. + +2. If positive, the pte at index [nr_validated_ptes] does not hold a + general reference count. If negative, it does. + +To make future patches more clear, separate out this functionality +into two distinct, named bits: PTF_partial_set (for #1) and +PTF_partial_general_ref (for #2). + +Additionally, a number of functions which need this information also +take other flags to control behavior (such as `preemptible` and +`defer`). These are hard to read in the caller (since you only see +'true' or 'false'), and ugly when many are added together. In +preparation for adding yet another flag in a future patch, collapse +all of these into a single `flag` variable. + +NB that this does mean checking for what was previously the '-1' +condition a bit more ugly in the put_page_from_lNe functions (since +you have to check for both partial_set and general ref); but this +clause will go away in a future patch. + +Also note that the original comment had an off-by-one error: +partial_flags (like partial_pte before it) concerns +plNe[nr_validated_ptes], not plNe[nr_validated_ptes+1]. + +No functional change intended. + +This is part of XSA-299. + +Reported-by: George Dunlap <george.dunlap@citrix.com> +Signed-off-by: George Dunlap <george.dunlap@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +--- + xen/arch/x86/mm.c | 165 ++++++++++++++++++++++++--------------- + xen/include/asm-x86/mm.h | 41 +++++++--- + 2 files changed, 128 insertions(+), 78 deletions(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index a1b55c10ff..3f6f8cc9b8 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -1094,20 +1094,35 @@ get_page_from_l1e( + } + + #ifdef CONFIG_PV ++ ++/* ++ * The following flags are used to specify behavior of various get and ++ * put commands. The first two are also stored in page->partial_flags ++ * to indicate the state of the page pointed to by ++ * page->pte[page->nr_validated_entries]. See the comment in mm.h for ++ * more information. ++ */ ++#define PTF_partial_set (1 << 0) ++#define PTF_partial_general_ref (1 << 1) ++#define PTF_preemptible (1 << 2) ++#define PTF_defer (1 << 3) ++ + static int get_page_and_type_from_mfn( + mfn_t mfn, unsigned long type, struct domain *d, +- int partial, int preemptible) ++ unsigned int flags) + { + struct page_info *page = mfn_to_page(mfn); + int rc; ++ bool preemptible = flags & PTF_preemptible, ++ partial_ref = flags & PTF_partial_general_ref; + +- if ( likely(partial >= 0) && ++ if ( likely(!partial_ref) && + unlikely(!get_page_from_mfn(mfn, d)) ) + return -EINVAL; + + rc = _get_page_type(page, type, preemptible); + +- if ( unlikely(rc) && partial >= 0 && ++ if ( unlikely(rc) && !partial_ref && + (!preemptible || page != current->arch.old_guest_table) ) + put_page(page); + +@@ -1117,7 +1132,7 @@ static int get_page_and_type_from_mfn( + define_get_linear_pagetable(l2); + static int + get_page_from_l2e( +- l2_pgentry_t l2e, unsigned long pfn, struct domain *d, int partial) ++ l2_pgentry_t l2e, unsigned long pfn, struct domain *d, unsigned int flags) + { + unsigned long mfn = l2e_get_pfn(l2e); + int rc; +@@ -1129,8 +1144,9 @@ get_page_from_l2e( + return -EINVAL; + } + +- rc = get_page_and_type_from_mfn(_mfn(mfn), PGT_l1_page_table, d, +- partial, false); ++ ASSERT(!(flags & PTF_preemptible)); ++ ++ rc = get_page_and_type_from_mfn(_mfn(mfn), PGT_l1_page_table, d, flags); + if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) ) + rc = 0; + +@@ -1140,7 +1156,7 @@ get_page_from_l2e( + define_get_linear_pagetable(l3); + static int + get_page_from_l3e( +- l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial) ++ l3_pgentry_t l3e, unsigned long pfn, struct domain *d, unsigned int flags) + { + int rc; + +@@ -1152,7 +1168,7 @@ get_page_from_l3e( + } + + rc = get_page_and_type_from_mfn( +- l3e_get_mfn(l3e), PGT_l2_page_table, d, partial, 1); ++ l3e_get_mfn(l3e), PGT_l2_page_table, d, flags | PTF_preemptible); + if ( unlikely(rc == -EINVAL) && + !is_pv_32bit_domain(d) && + get_l3_linear_pagetable(l3e, pfn, d) ) +@@ -1164,7 +1180,7 @@ get_page_from_l3e( + define_get_linear_pagetable(l4); + static int + get_page_from_l4e( +- l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial) ++ l4_pgentry_t l4e, unsigned long pfn, struct domain *d, unsigned int flags) + { + int rc; + +@@ -1176,7 +1192,7 @@ get_page_from_l4e( + } + + rc = get_page_and_type_from_mfn( +- l4e_get_mfn(l4e), PGT_l3_page_table, d, partial, 1); ++ l4e_get_mfn(l4e), PGT_l3_page_table, d, flags | PTF_preemptible); + if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) ) + rc = 0; + +@@ -1277,7 +1293,7 @@ static void put_data_page(struct page_info *page, bool writeable) + * Note also that this automatically deals correctly with linear p.t.'s. + */ + static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, +- int partial, bool defer) ++ unsigned int flags) + { + int rc = 0; + +@@ -1300,12 +1316,13 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, + struct page_info *pg = l2e_get_page(l2e); + struct page_info *ptpg = mfn_to_page(_mfn(pfn)); + +- if ( unlikely(partial > 0) ) ++ if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == ++ PTF_partial_set ) + { +- ASSERT(!defer); ++ ASSERT(!(flags & PTF_defer)); + rc = _put_page_type(pg, true, ptpg); + } +- else if ( defer ) ++ else if ( flags & PTF_defer ) + { + current->arch.old_guest_ptpg = ptpg; + current->arch.old_guest_table = pg; +@@ -1322,7 +1339,7 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, + } + + static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, +- int partial, bool defer) ++ unsigned int flags) + { + struct page_info *pg; + int rc; +@@ -1345,13 +1362,14 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, + + pg = l3e_get_page(l3e); + +- if ( unlikely(partial > 0) ) ++ if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == ++ PTF_partial_set ) + { +- ASSERT(!defer); ++ ASSERT(!(flags & PTF_defer)); + return _put_page_type(pg, true, mfn_to_page(_mfn(pfn))); + } + +- if ( defer ) ++ if ( flags & PTF_defer ) + { + current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn)); + current->arch.old_guest_table = pg; +@@ -1366,7 +1384,7 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, + } + + static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, +- int partial, bool defer) ++ unsigned int flags) + { + int rc = 1; + +@@ -1375,13 +1393,14 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, + { + struct page_info *pg = l4e_get_page(l4e); + +- if ( unlikely(partial > 0) ) ++ if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == ++ PTF_partial_set ) + { +- ASSERT(!defer); ++ ASSERT(!(flags & PTF_defer)); + return _put_page_type(pg, true, mfn_to_page(_mfn(pfn))); + } + +- if ( defer ) ++ if ( flags & PTF_defer ) + { + current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn)); + current->arch.old_guest_table = pg; +@@ -1492,12 +1511,13 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) + unsigned long pfn = mfn_x(page_to_mfn(page)); + l2_pgentry_t *pl2e; + unsigned int i; +- int rc = 0, partial = page->partial_pte; ++ int rc = 0; ++ unsigned int partial_flags = page->partial_flags; + + pl2e = map_domain_page(_mfn(pfn)); + + for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; +- i++, partial = 0 ) ++ i++, partial_flags = 0 ) + { + l2_pgentry_t l2e; + +@@ -1520,17 +1540,18 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) + rc = -EINTR; + } + else +- rc = get_page_from_l2e(l2e, pfn, d, partial); ++ rc = get_page_from_l2e(l2e, pfn, d, partial_flags); + + if ( rc == -ERESTART ) + { + page->nr_validated_ptes = i; +- page->partial_pte = partial ?: 1; ++ /* Set 'set', retain 'general ref' */ ++ page->partial_flags = partial_flags | PTF_partial_set; + } + else if ( rc == -EINTR && i ) + { + page->nr_validated_ptes = i; +- page->partial_pte = 0; ++ page->partial_flags = 0; + rc = -ERESTART; + } + else if ( rc < 0 && rc != -EINTR ) +@@ -1539,7 +1560,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) + if ( i ) + { + page->nr_validated_ptes = i; +- page->partial_pte = 0; ++ page->partial_flags = 0; + current->arch.old_guest_ptpg = NULL; + current->arch.old_guest_table = page; + } +@@ -1563,7 +1584,8 @@ static int alloc_l3_table(struct page_info *page) + unsigned long pfn = mfn_x(page_to_mfn(page)); + l3_pgentry_t *pl3e; + unsigned int i; +- int rc = 0, partial = page->partial_pte; ++ int rc = 0; ++ unsigned int partial_flags = page->partial_flags; + + pl3e = map_domain_page(_mfn(pfn)); + +@@ -1578,7 +1600,7 @@ static int alloc_l3_table(struct page_info *page) + memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e)); + + for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES; +- i++, partial = 0 ) ++ i++, partial_flags = 0 ) + { + l3_pgentry_t l3e = pl3e[i]; + +@@ -1597,7 +1619,8 @@ static int alloc_l3_table(struct page_info *page) + else + rc = get_page_and_type_from_mfn( + l3e_get_mfn(l3e), +- PGT_l2_page_table | PGT_pae_xen_l2, d, partial, 1); ++ PGT_l2_page_table | PGT_pae_xen_l2, d, ++ partial_flags | PTF_preemptible); + } + else if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ) + { +@@ -1606,17 +1629,18 @@ static int alloc_l3_table(struct page_info *page) + rc = -EINTR; + } + else +- rc = get_page_from_l3e(l3e, pfn, d, partial); ++ rc = get_page_from_l3e(l3e, pfn, d, partial_flags); + + if ( rc == -ERESTART ) + { + page->nr_validated_ptes = i; +- page->partial_pte = partial ?: 1; ++ /* Set 'set', leave 'general ref' set if this entry was set */ ++ page->partial_flags = partial_flags | PTF_partial_set; + } + else if ( rc == -EINTR && i ) + { + page->nr_validated_ptes = i; +- page->partial_pte = 0; ++ page->partial_flags = 0; + rc = -ERESTART; + } + if ( rc < 0 ) +@@ -1633,7 +1657,7 @@ static int alloc_l3_table(struct page_info *page) + if ( i ) + { + page->nr_validated_ptes = i; +- page->partial_pte = 0; ++ page->partial_flags = 0; + current->arch.old_guest_ptpg = NULL; + current->arch.old_guest_table = page; + } +@@ -1767,10 +1791,11 @@ static int alloc_l4_table(struct page_info *page) + unsigned long pfn = mfn_x(page_to_mfn(page)); + l4_pgentry_t *pl4e = map_domain_page(_mfn(pfn)); + unsigned int i; +- int rc = 0, partial = page->partial_pte; ++ int rc = 0; ++ unsigned int partial_flags = page->partial_flags; + + for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES; +- i++, partial = 0 ) ++ i++, partial_flags = 0 ) + { + l4_pgentry_t l4e; + +@@ -1786,12 +1811,13 @@ static int alloc_l4_table(struct page_info *page) + rc = -EINTR; + } + else +- rc = get_page_from_l4e(l4e, pfn, d, partial); ++ rc = get_page_from_l4e(l4e, pfn, d, partial_flags); + + if ( rc == -ERESTART ) + { + page->nr_validated_ptes = i; +- page->partial_pte = partial ?: 1; ++ /* Set 'set', leave 'general ref' set if this entry was set */ ++ page->partial_flags = partial_flags | PTF_partial_set; + } + else if ( rc < 0 ) + { +@@ -1801,7 +1827,7 @@ static int alloc_l4_table(struct page_info *page) + if ( i ) + { + page->nr_validated_ptes = i; +- page->partial_pte = 0; ++ page->partial_flags = 0; + if ( rc == -EINTR ) + rc = -ERESTART; + else +@@ -1853,19 +1879,20 @@ static int free_l2_table(struct page_info *page) + struct domain *d = page_get_owner(page); + unsigned long pfn = mfn_x(page_to_mfn(page)); + l2_pgentry_t *pl2e; +- int rc = 0, partial = page->partial_pte; +- unsigned int i = page->nr_validated_ptes - !partial; ++ int rc = 0; ++ unsigned int partial_flags = page->partial_flags, ++ i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set); + + pl2e = map_domain_page(_mfn(pfn)); + + for ( ; ; ) + { + if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) ) +- rc = put_page_from_l2e(pl2e[i], pfn, partial, false); ++ rc = put_page_from_l2e(pl2e[i], pfn, partial_flags); + if ( rc < 0 ) + break; + +- partial = 0; ++ partial_flags = 0; + + if ( !i-- ) + break; +@@ -1887,12 +1914,14 @@ static int free_l2_table(struct page_info *page) + else if ( rc == -ERESTART ) + { + page->nr_validated_ptes = i; +- page->partial_pte = partial ?: -1; ++ page->partial_flags = (partial_flags & PTF_partial_set) ? ++ partial_flags : ++ (PTF_partial_set | PTF_partial_general_ref); + } + else if ( rc == -EINTR && i < L2_PAGETABLE_ENTRIES - 1 ) + { + page->nr_validated_ptes = i + 1; +- page->partial_pte = 0; ++ page->partial_flags = 0; + rc = -ERESTART; + } + +@@ -1904,18 +1933,19 @@ static int free_l3_table(struct page_info *page) + struct domain *d = page_get_owner(page); + unsigned long pfn = mfn_x(page_to_mfn(page)); + l3_pgentry_t *pl3e; +- int rc = 0, partial = page->partial_pte; +- unsigned int i = page->nr_validated_ptes - !partial; ++ int rc = 0; ++ unsigned int partial_flags = page->partial_flags, ++ i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set); + + pl3e = map_domain_page(_mfn(pfn)); + + for ( ; ; ) + { +- rc = put_page_from_l3e(pl3e[i], pfn, partial, 0); ++ rc = put_page_from_l3e(pl3e[i], pfn, partial_flags); + if ( rc < 0 ) + break; + +- partial = 0; ++ partial_flags = 0; + if ( rc == 0 ) + pl3e[i] = unadjust_guest_l3e(pl3e[i], d); + +@@ -1934,12 +1964,14 @@ static int free_l3_table(struct page_info *page) + if ( rc == -ERESTART ) + { + page->nr_validated_ptes = i; +- page->partial_pte = partial ?: -1; ++ page->partial_flags = (partial_flags & PTF_partial_set) ? ++ partial_flags : ++ (PTF_partial_set | PTF_partial_general_ref); + } + else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 ) + { + page->nr_validated_ptes = i + 1; +- page->partial_pte = 0; ++ page->partial_flags = 0; + rc = -ERESTART; + } + return rc > 0 ? 0 : rc; +@@ -1950,26 +1982,29 @@ static int free_l4_table(struct page_info *page) + struct domain *d = page_get_owner(page); + unsigned long pfn = mfn_x(page_to_mfn(page)); + l4_pgentry_t *pl4e = map_domain_page(_mfn(pfn)); +- int rc = 0, partial = page->partial_pte; +- unsigned int i = page->nr_validated_ptes - !partial; ++ int rc = 0; ++ unsigned partial_flags = page->partial_flags, ++ i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set); + + do { + if ( is_guest_l4_slot(d, i) ) +- rc = put_page_from_l4e(pl4e[i], pfn, partial, 0); ++ rc = put_page_from_l4e(pl4e[i], pfn, partial_flags); + if ( rc < 0 ) + break; +- partial = 0; ++ partial_flags = 0; + } while ( i-- ); + + if ( rc == -ERESTART ) + { + page->nr_validated_ptes = i; +- page->partial_pte = partial ?: -1; ++ page->partial_flags = (partial_flags & PTF_partial_set) ? ++ partial_flags : ++ (PTF_partial_set | PTF_partial_general_ref); + } + else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 ) + { + page->nr_validated_ptes = i + 1; +- page->partial_pte = 0; ++ page->partial_flags = 0; + rc = -ERESTART; + } + +@@ -2247,7 +2282,7 @@ static int mod_l2_entry(l2_pgentry_t *pl2e, + return -EBUSY; + } + +- put_page_from_l2e(ol2e, pfn, 0, true); ++ put_page_from_l2e(ol2e, pfn, PTF_defer); + + return rc; + } +@@ -2315,7 +2350,7 @@ static int mod_l3_entry(l3_pgentry_t *pl3e, + if ( !create_pae_xen_mappings(d, pl3e) ) + BUG(); + +- put_page_from_l3e(ol3e, pfn, 0, 1); ++ put_page_from_l3e(ol3e, pfn, PTF_defer); + return rc; + } + +@@ -2378,7 +2413,7 @@ static int mod_l4_entry(l4_pgentry_t *pl4e, + return -EFAULT; + } + +- put_page_from_l4e(ol4e, pfn, 0, 1); ++ put_page_from_l4e(ol4e, pfn, PTF_defer); + return rc; + } + #endif /* CONFIG_PV */ +@@ -2649,7 +2684,7 @@ int free_page_type(struct page_info *page, unsigned long type, + if ( !(type & PGT_partial) ) + { + page->nr_validated_ptes = 1U << PAGETABLE_ORDER; +- page->partial_pte = 0; ++ page->partial_flags = 0; + } + + switch ( type & PGT_type_mask ) +@@ -2946,7 +2981,7 @@ static int _get_page_type(struct page_info *page, unsigned long type, + if ( !(x & PGT_partial) ) + { + page->nr_validated_ptes = 0; +- page->partial_pte = 0; ++ page->partial_flags = 0; + } + page->linear_pt_count = 0; + rc = alloc_page_type(page, type, preemptible); +@@ -3122,7 +3157,7 @@ int new_guest_cr3(mfn_t mfn) + return 0; + } + +- rc = get_page_and_type_from_mfn(mfn, PGT_root_page_table, d, 0, 1); ++ rc = get_page_and_type_from_mfn(mfn, PGT_root_page_table, d, PTF_preemptible); + switch ( rc ) + { + case 0: +@@ -3473,7 +3508,7 @@ long do_mmuext_op( + if ( op.arg1.mfn != 0 ) + { + rc = get_page_and_type_from_mfn( +- _mfn(op.arg1.mfn), PGT_root_page_table, currd, 0, 1); ++ _mfn(op.arg1.mfn), PGT_root_page_table, currd, PTF_preemptible); + + if ( unlikely(rc) ) + { +diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h +index 6faa563167..8406ac3c37 100644 +--- a/xen/include/asm-x86/mm.h ++++ b/xen/include/asm-x86/mm.h +@@ -228,19 +228,34 @@ struct page_info + * setting the flag must not drop that reference, whereas the instance + * clearing it will have to. + * +- * If @partial_pte is positive then PTE at @nr_validated_ptes+1 has +- * been partially validated. This implies that the general reference +- * to the page (acquired from get_page_from_lNe()) would be dropped +- * (again due to the apparent failure) and hence must be re-acquired +- * when resuming the validation, but must not be dropped when picking +- * up the page for invalidation. ++ * If partial_flags & PTF_partial_set is set, then the page at ++ * at @nr_validated_ptes had PGT_partial set as a result of an ++ * operation on the current page. (That page may or may not ++ * still have PGT_partial set.) + * +- * If @partial_pte is negative then PTE at @nr_validated_ptes+1 has +- * been partially invalidated. This is basically the opposite case of +- * above, i.e. the general reference to the page was not dropped in +- * put_page_from_lNe() (due to the apparent failure), and hence it +- * must be dropped when the put operation is resumed (and completes), +- * but it must not be acquired if picking up the page for validation. ++ * If PTF_partial_general_ref is set, then the PTE at ++ * @nr_validated_ptef holds a general reference count for the ++ * page. ++ * ++ * This happens: ++ * - During de-validation, if de-validation of the page was ++ * interrupted ++ * - During validation, if an invalid entry is encountered and ++ * validation is preemptible ++ * - During validation, if PTF_partial_general_ref was set on ++ * this entry to begin with (perhaps because we're picking ++ * up from a partial de-validation). ++ * ++ * When resuming validation, if PTF_partial_general_ref is clear, ++ * then a general reference must be re-acquired; if it is set, no ++ * reference should be acquired. ++ * ++ * When resuming de-validation, if PTF_partial_general_ref is ++ * clear, no reference should be dropped; if it is set, a ++ * reference should be dropped. ++ * ++ * NB that PTF_partial_set and PTF_partial_general_ref are ++ * defined in mm.c, the only place where they are used. + * + * The 3rd field, @linear_pt_count, indicates + * - by a positive value, how many same-level page table entries a page +@@ -251,7 +266,7 @@ struct page_info + struct { + u16 nr_validated_ptes:PAGETABLE_ORDER + 1; + u16 :16 - PAGETABLE_ORDER - 1 - 2; +- s16 partial_pte:2; ++ u16 partial_flags:2; + s16 linear_pt_count; + }; + +-- +2.23.0 + diff --git a/system/xen/xsa/xsa299-4.12-0004-x86-mm-Use-flags-for-_put_page_type-rather-than-a-bo.patch b/system/xen/xsa/xsa299-4.12-0004-x86-mm-Use-flags-for-_put_page_type-rather-than-a-bo.patch new file mode 100644 index 0000000000000..d07c233225664 --- /dev/null +++ b/system/xen/xsa/xsa299-4.12-0004-x86-mm-Use-flags-for-_put_page_type-rather-than-a-bo.patch @@ -0,0 +1,140 @@ +From db1d801aa8dcb918a27486a6e8d9cf5d7307dec3 Mon Sep 17 00:00:00 2001 +From: George Dunlap <george.dunlap@citrix.com> +Date: Thu, 10 Oct 2019 17:57:49 +0100 +Subject: [PATCH 04/11] x86/mm: Use flags for _put_page_type rather than a + boolean + +This is in mainly in preparation for _put_page_type taking the +partial_flags value in the future. It also makes it easier to read in +the caller (since you see a flag name rather than `true` or `false`). + +No functional change intended. + +This is part of XSA-299. + +Reported-by: George Dunlap <george.dunlap@citrix.com> +Signed-off-by: George Dunlap <george.dunlap@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +--- + xen/arch/x86/mm.c | 25 +++++++++++++------------ + 1 file changed, 13 insertions(+), 12 deletions(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 3f6f8cc9b8..0740b61af8 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -1200,7 +1200,7 @@ get_page_from_l4e( + } + #endif /* CONFIG_PV */ + +-static int _put_page_type(struct page_info *page, bool preemptible, ++static int _put_page_type(struct page_info *page, unsigned int flags, + struct page_info *ptpg); + + void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner) +@@ -1320,7 +1320,7 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, + PTF_partial_set ) + { + ASSERT(!(flags & PTF_defer)); +- rc = _put_page_type(pg, true, ptpg); ++ rc = _put_page_type(pg, PTF_preemptible, ptpg); + } + else if ( flags & PTF_defer ) + { +@@ -1329,7 +1329,7 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, + } + else + { +- rc = _put_page_type(pg, true, ptpg); ++ rc = _put_page_type(pg, PTF_preemptible, ptpg); + if ( likely(!rc) ) + put_page(pg); + } +@@ -1366,7 +1366,7 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, + PTF_partial_set ) + { + ASSERT(!(flags & PTF_defer)); +- return _put_page_type(pg, true, mfn_to_page(_mfn(pfn))); ++ return _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn))); + } + + if ( flags & PTF_defer ) +@@ -1376,7 +1376,7 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, + return 0; + } + +- rc = _put_page_type(pg, true, mfn_to_page(_mfn(pfn))); ++ rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn))); + if ( likely(!rc) ) + put_page(pg); + +@@ -1397,7 +1397,7 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, + PTF_partial_set ) + { + ASSERT(!(flags & PTF_defer)); +- return _put_page_type(pg, true, mfn_to_page(_mfn(pfn))); ++ return _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn))); + } + + if ( flags & PTF_defer ) +@@ -1407,7 +1407,7 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, + return 0; + } + +- rc = _put_page_type(pg, true, mfn_to_page(_mfn(pfn))); ++ rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn))); + if ( likely(!rc) ) + put_page(pg); + } +@@ -2757,10 +2757,11 @@ static int _put_final_page_type(struct page_info *page, unsigned long type, + } + + +-static int _put_page_type(struct page_info *page, bool preemptible, ++static int _put_page_type(struct page_info *page, unsigned int flags, + struct page_info *ptpg) + { + unsigned long nx, x, y = page->u.inuse.type_info; ++ bool preemptible = flags & PTF_preemptible; + + ASSERT(current_locked_page_ne_check(page)); + +@@ -2969,7 +2970,7 @@ static int _get_page_type(struct page_info *page, unsigned long type, + + if ( unlikely(iommu_ret) ) + { +- _put_page_type(page, false, NULL); ++ _put_page_type(page, 0, NULL); + rc = iommu_ret; + goto out; + } +@@ -2996,7 +2997,7 @@ static int _get_page_type(struct page_info *page, unsigned long type, + + void put_page_type(struct page_info *page) + { +- int rc = _put_page_type(page, false, NULL); ++ int rc = _put_page_type(page, 0, NULL); + ASSERT(rc == 0); + (void)rc; + } +@@ -3013,7 +3014,7 @@ int get_page_type(struct page_info *page, unsigned long type) + + int put_page_type_preemptible(struct page_info *page) + { +- return _put_page_type(page, true, NULL); ++ return _put_page_type(page, PTF_preemptible, NULL); + } + + int get_page_type_preemptible(struct page_info *page, unsigned long type) +@@ -3030,7 +3031,7 @@ int put_old_guest_table(struct vcpu *v) + if ( !v->arch.old_guest_table ) + return 0; + +- switch ( rc = _put_page_type(v->arch.old_guest_table, true, ++ switch ( rc = _put_page_type(v->arch.old_guest_table, PTF_preemptible, + v->arch.old_guest_ptpg) ) + { + case -EINTR: +-- +2.23.0 + diff --git a/system/xen/xsa/xsa299-4.12-0005-x86-mm-Rework-get_page_and_type_from_mfn-conditional.patch b/system/xen/xsa/xsa299-4.12-0005-x86-mm-Rework-get_page_and_type_from_mfn-conditional.patch new file mode 100644 index 0000000000000..9cfbb739079b5 --- /dev/null +++ b/system/xen/xsa/xsa299-4.12-0005-x86-mm-Rework-get_page_and_type_from_mfn-conditional.patch @@ -0,0 +1,79 @@ +From 6f257854c8778774210281c5c21028c4b7739b44 Mon Sep 17 00:00:00 2001 +From: George Dunlap <george.dunlap@citrix.com> +Date: Thu, 10 Oct 2019 17:57:49 +0100 +Subject: [PATCH 05/11] x86/mm: Rework get_page_and_type_from_mfn conditional + +Make it easier to read by declaring the conditions in which we will +retain the ref, rather than the conditions under which we release it. + +The only way (page == current->arch.old_guest_table) can be true is if +preemptible is true; so remove this from the query itself, and add an +ASSERT() to that effect on the opposite path. + +No functional change intended. + +NB that alloc_lN_table() mishandle the "linear pt failure" situation +described in the comment; this will be addressed in a future patch. + +This is part of XSA-299. + +Reported-by: George Dunlap <george.dunlap@citrix.com> +Signed-off-by: George Dunlap <george.dunlap@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +--- + xen/arch/x86/mm.c | 39 +++++++++++++++++++++++++++++++++++++-- + 1 file changed, 37 insertions(+), 2 deletions(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 0740b61af8..0a4d39a2c3 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -1122,8 +1122,43 @@ static int get_page_and_type_from_mfn( + + rc = _get_page_type(page, type, preemptible); + +- if ( unlikely(rc) && !partial_ref && +- (!preemptible || page != current->arch.old_guest_table) ) ++ /* ++ * Retain the refcount if: ++ * - page is fully validated (rc == 0) ++ * - page is not validated (rc < 0) but: ++ * - We came in with a reference (partial_ref) ++ * - page is partially validated but there's been an error ++ * (page == current->arch.old_guest_table) ++ * ++ * The partial_ref-on-error clause is worth an explanation. There ++ * are two scenarios where partial_ref might be true coming in: ++ * - mfn has been partially demoted as type `type`; i.e. has ++ * PGT_partial set ++ * - mfn has been partially demoted as L(type+1) (i.e., a linear ++ * page; e.g. we're being called from get_page_from_l2e with ++ * type == PGT_l1_table, but the mfn is PGT_l2_table) ++ * ++ * If there's an error, in the first case, _get_page_type will ++ * either return -ERESTART, in which case we want to retain the ++ * ref (as the caller will consider it retained), or -EINVAL, in ++ * which case old_guest_table will be set; in both cases, we need ++ * to retain the ref. ++ * ++ * In the second case, if there's an error, _get_page_type() can ++ * *only* return -EINVAL, and *never* set old_guest_table. In ++ * that case we also want to retain the reference, to allow the ++ * page to continue to be torn down (i.e., PGT_partial cleared) ++ * safely. ++ * ++ * Also note that we shouldn't be able to leave with the reference ++ * count retained unless we succeeded, or the operation was ++ * preemptible. ++ */ ++ if ( likely(!rc) || partial_ref ) ++ /* nothing */; ++ else if ( page == current->arch.old_guest_table ) ++ ASSERT(preemptible); ++ else + put_page(page); + + return rc; +-- +2.23.0 + diff --git a/system/xen/xsa/xsa299-4.12-0006-x86-mm-Have-alloc_l-23-_table-clear-partial_flags-wh.patch b/system/xen/xsa/xsa299-4.12-0006-x86-mm-Have-alloc_l-23-_table-clear-partial_flags-wh.patch new file mode 100644 index 0000000000000..72ee3eac9e4e5 --- /dev/null +++ b/system/xen/xsa/xsa299-4.12-0006-x86-mm-Have-alloc_l-23-_table-clear-partial_flags-wh.patch @@ -0,0 +1,111 @@ +From 4ad70553611a7a4e4494d5a3b51b5cc295a488e0 Mon Sep 17 00:00:00 2001 +From: George Dunlap <george.dunlap@citrix.com> +Date: Thu, 10 Oct 2019 17:57:49 +0100 +Subject: [PATCH 06/11] x86/mm: Have alloc_l[23]_table clear partial_flags when + preempting + +In order to allow recursive pagetable promotions and demotions to be +interrupted, Xen must keep track of the state of the sub-pages +promoted or demoted. This is stored in two elements in the page +struct: nr_entries_validated and partial_flags. + +The rule is that entries [0, nr_entries_validated) should always be +validated and hold a general reference count. If partial_flags is +zero, then [nr_entries_validated] is not validated and no reference +count is held. If PTF_partial_set is set, then [nr_entries_validated] +is partially validated. + +At the moment, a distinction is made between promotion and demotion +with regard to whether the entry itself "holds" a general reference +count: when entry promotion is interrupted (i.e., returns -ERESTART), +the entry is not considered to hold a reference; when entry demotion +is interrupted, the entry is still considered to hold a general +reference. + +PTF_partial_general_ref is used to distinguish between these cases. +If clear, it's a partial promotion => no general reference count held +by the entry; if set, it's partial demotion, so a general reference +count held. Because promotions and demotions can be interleaved, this +value is passed to get_page_and_type_from_mfn and put_page_from_l*e, +to be able to properly handle reference counts. + +Unfortunately, when alloc_l[23]_table check hypercall_preempt_check() +and return -ERESTART, they set nr_entries_validated, but don't clear +partial_flags. + +If we were picking up from a previously-interrupted promotion, that +means that PTF_partial_set would be set even though +[nr_entries_validated] was not partially validated. This means that +if the page in this state were de-validated, put_page_type() would +erroneously be called on that entry. + +Perhaps worse, if we were racing with a de-validation, then we might +leave both PTF_partial_set and PTF_partial_general_ref; and when +de-validation picked up again, both the type and the general ref would +be erroneously dropped from [nr_entries_validated]. + +In a sense, the real issue here is code duplication. Rather than +duplicate the interruption code, set rc to -EINTR and fall through to +the code which already handles that case correctly. + +Given the logic at this point, it should be impossible for +partial_flags to be non-zero; add an ASSERT() to catch any changes. + +This is part of XSA-299. + +Reported-by: George Dunlap <george.dunlap@citrix.com> +Signed-off-by: George Dunlap <george.dunlap@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +--- + xen/arch/x86/mm.c | 25 ++++++------------------- + 1 file changed, 6 insertions(+), 19 deletions(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 0a4d39a2c3..bbd29a68f4 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -1554,21 +1554,13 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) + for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; + i++, partial_flags = 0 ) + { +- l2_pgentry_t l2e; ++ l2_pgentry_t l2e = pl2e[i]; + + if ( i > page->nr_validated_ptes && hypercall_preempt_check() ) +- { +- page->nr_validated_ptes = i; +- rc = -ERESTART; +- break; +- } +- +- if ( !is_guest_l2_slot(d, type, i) ) ++ rc = -EINTR; ++ else if ( !is_guest_l2_slot(d, type, i) ) + continue; +- +- l2e = pl2e[i]; +- +- if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) ++ else if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) + { + if ( !pv_l1tf_check_l2e(d, l2e) ) + continue; +@@ -1640,13 +1632,8 @@ static int alloc_l3_table(struct page_info *page) + l3_pgentry_t l3e = pl3e[i]; + + if ( i > page->nr_validated_ptes && hypercall_preempt_check() ) +- { +- page->nr_validated_ptes = i; +- rc = -ERESTART; +- break; +- } +- +- if ( is_pv_32bit_domain(d) && (i == 3) ) ++ rc = -EINTR; ++ else if ( is_pv_32bit_domain(d) && (i == 3) ) + { + if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || + (l3e_get_flags(l3e) & l3_disallow_mask(d)) ) +-- +2.23.0 + diff --git a/system/xen/xsa/xsa299-4.12-0007-x86-mm-Always-retain-a-general-ref-on-partial.patch b/system/xen/xsa/xsa299-4.12-0007-x86-mm-Always-retain-a-general-ref-on-partial.patch new file mode 100644 index 0000000000000..ef390e2b137db --- /dev/null +++ b/system/xen/xsa/xsa299-4.12-0007-x86-mm-Always-retain-a-general-ref-on-partial.patch @@ -0,0 +1,378 @@ +From 51fe4e67d954649fcf103116be6206a769f0db1e Mon Sep 17 00:00:00 2001 +From: George Dunlap <george.dunlap@citrix.com> +Date: Thu, 10 Oct 2019 17:57:49 +0100 +Subject: [PATCH 07/11] x86/mm: Always retain a general ref on partial + +In order to allow recursive pagetable promotions and demotions to be +interrupted, Xen must keep track of the state of the sub-pages +promoted or demoted. This is stored in two elements in the page struct: +nr_entries_validated and partial_flags. + +The rule is that entries [0, nr_entries_validated) should always be +validated and hold a general reference count. If partial_flags is +zero, then [nr_entries_validated] is not validated and no reference +count is held. If PTF_partial_set is set, then [nr_entries_validated] +is partially validated. + +At the moment, a distinction is made between promotion and demotion +with regard to whether the entry itself "holds" a general reference +count: when entry promotion is interrupted (i.e., returns -ERESTART), +the entry is not considered to hold a reference; when entry demotion +is interrupted, the entry is still considered to hold a general +reference. + +PTF_partial_general_ref is used to distinguish between these cases. +If clear, it's a partial promotion => no general reference count held +by the entry; if set, it's partial demotion, so a general reference +count held. Because promotions and demotions can be interleaved, this +value is passed to get_page_and_type_from_mfn and put_page_from_l*e, +to be able to properly handle reference counts. + +Unfortunately, because a refcount is not held, it is possible to +engineer a situation where PFT_partial_set is set but the page in +question has been assigned to another domain. A sketch is provided in +the appendix. + +Fix this by having the parent page table entry hold a general +reference count whenever PFT_partial_set is set. (For clarity of +change, keep two separate flags. These will be collapsed in a +subsequent changeset.) + +This has two basic implications. On the put_page_from_lNe() side, +this mean that the (partial_set && !partial_ref) case can never happen, +and no longer needs to be special-cased. + +Secondly, because both flags are set together, there's no need to carry over +existing bits from partial_pte. + +(NB there is still another issue with calling _put_page_type() on a +page which had PGT_partial set; that will be handled in a subsequent +patch.) + +On the get_page_and_type_from_mfn() side, we need to distinguish +between callers which hold a reference on partial (i.e., +alloc_lN_table()), and those which do not (new_cr3, PIN_LN_TABLE, and +so on): pass a flag if the type should be retained on interruption. + +NB that since l1 promotion can't be preempted, that get_page_from_l2e +can't return -ERESTART. + +This is part of XSA-299. + +Reported-by: George Dunlap <george.dunlap@citrix.com> +Signed-off-by: George Dunlap <george.dunlap@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +----- +* Appendix: Engineering PTF_partial_set while a page belongs to a + foreign domain + +Suppose A is a page which can be promoted to an l3, and B is a page +which can be promoted to an l2, and A[x] points to B. B has +PGC_allocated set but no other general references. + +V1: PIN_L3 A. + A is validated, B is validated. + A.type_count = 1 | PGT_validated | PGT_pinned + B.type_count = 1 | PGT_validated + B.count = 2 | PGC_allocated (A[x] holds a general ref) + +V1: UNPIN A. + A begins de-validation. + Arrange to be interrupted when i < x + V1->old_guest_table = A + V1->old_guest_table_ref_held = false + A.type_count = 1 | PGT_partial + A.nr_validated_entries = i < x + B.type_count = 0 + B.count = 1 | PGC_allocated + +V2: MOD_L4_ENTRY to point some l4e to A. + Picks up re-validation of A. + Arrange to be interrupted halfway through B's validation + B.type_count = 1 | PGT_partial + B.count = 2 | PGC_allocated (PGT_partial holds a general ref) + A.type_count = 1 | PGT_partial + A.nr_validated_entries = x + A.partial_pte = PTF_partial_set + +V3: MOD_L3_ENTRY to point some other l3e (not in A) to B. + Validates B. + B.type_count = 1 | PGT_validated + B.count = 2 | PGC_allocated ("other l3e" holds a general ref) + +V3: MOD_L3_ENTRY to clear l3e pointing to B. + Devalidates B. + B.type_count = 0 + B.count = 1 | PGC_allocated + +V3: decrease_reservation(B) + Clears PGC_allocated + B.count = 0 => B is freed + +B gets assigned to a different domain + +V1: Restarts UNPIN of A + put_old_guest_table(A) + ... + free_l3_table(A) + +Now since A.partial_flags has PTF_partial_set, free_l3_table() will +call put_page_from_l3e() on A[x], which points to B, while B is owned +by another domain. + +If A[x] held a general refcount for B on partial validation, as it does +for partial de-validation, then B would still have a reference count of +1 after PGC_allocated was freed; so B wouldn't be freed until after +put_page_from_l3e() had happend on A[x]. +--- + xen/arch/x86/mm.c | 84 +++++++++++++++++++++++----------------- + xen/include/asm-x86/mm.h | 15 ++++--- + 2 files changed, 58 insertions(+), 41 deletions(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index bbd29a68f4..4d3ebf341d 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -1102,10 +1102,11 @@ get_page_from_l1e( + * page->pte[page->nr_validated_entries]. See the comment in mm.h for + * more information. + */ +-#define PTF_partial_set (1 << 0) +-#define PTF_partial_general_ref (1 << 1) +-#define PTF_preemptible (1 << 2) +-#define PTF_defer (1 << 3) ++#define PTF_partial_set (1 << 0) ++#define PTF_partial_general_ref (1 << 1) ++#define PTF_preemptible (1 << 2) ++#define PTF_defer (1 << 3) ++#define PTF_retain_ref_on_restart (1 << 4) + + static int get_page_and_type_from_mfn( + mfn_t mfn, unsigned long type, struct domain *d, +@@ -1114,7 +1115,11 @@ static int get_page_and_type_from_mfn( + struct page_info *page = mfn_to_page(mfn); + int rc; + bool preemptible = flags & PTF_preemptible, +- partial_ref = flags & PTF_partial_general_ref; ++ partial_ref = flags & PTF_partial_general_ref, ++ partial_set = flags & PTF_partial_set, ++ retain_ref = flags & PTF_retain_ref_on_restart; ++ ++ ASSERT(partial_ref == partial_set); + + if ( likely(!partial_ref) && + unlikely(!get_page_from_mfn(mfn, d)) ) +@@ -1127,13 +1132,15 @@ static int get_page_and_type_from_mfn( + * - page is fully validated (rc == 0) + * - page is not validated (rc < 0) but: + * - We came in with a reference (partial_ref) ++ * - page is partially validated (rc == -ERESTART), and the ++ * caller has asked the ref to be retained in that case + * - page is partially validated but there's been an error + * (page == current->arch.old_guest_table) + * + * The partial_ref-on-error clause is worth an explanation. There + * are two scenarios where partial_ref might be true coming in: +- * - mfn has been partially demoted as type `type`; i.e. has +- * PGT_partial set ++ * - mfn has been partially promoted / demoted as type `type`; ++ * i.e. has PGT_partial set + * - mfn has been partially demoted as L(type+1) (i.e., a linear + * page; e.g. we're being called from get_page_from_l2e with + * type == PGT_l1_table, but the mfn is PGT_l2_table) +@@ -1156,7 +1163,8 @@ static int get_page_and_type_from_mfn( + */ + if ( likely(!rc) || partial_ref ) + /* nothing */; +- else if ( page == current->arch.old_guest_table ) ++ else if ( page == current->arch.old_guest_table || ++ (retain_ref && rc == -ERESTART) ) + ASSERT(preemptible); + else + put_page(page); +@@ -1354,8 +1362,8 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, + if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == + PTF_partial_set ) + { +- ASSERT(!(flags & PTF_defer)); +- rc = _put_page_type(pg, PTF_preemptible, ptpg); ++ /* partial_set should always imply partial_ref */ ++ BUG(); + } + else if ( flags & PTF_defer ) + { +@@ -1400,8 +1408,8 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, + if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == + PTF_partial_set ) + { +- ASSERT(!(flags & PTF_defer)); +- return _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn))); ++ /* partial_set should always imply partial_ref */ ++ BUG(); + } + + if ( flags & PTF_defer ) +@@ -1431,8 +1439,8 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, + if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == + PTF_partial_set ) + { +- ASSERT(!(flags & PTF_defer)); +- return _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn))); ++ /* partial_set should always imply partial_ref */ ++ BUG(); + } + + if ( flags & PTF_defer ) +@@ -1569,13 +1577,22 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) + else + rc = get_page_from_l2e(l2e, pfn, d, partial_flags); + +- if ( rc == -ERESTART ) +- { +- page->nr_validated_ptes = i; +- /* Set 'set', retain 'general ref' */ +- page->partial_flags = partial_flags | PTF_partial_set; +- } +- else if ( rc == -EINTR && i ) ++ /* ++ * It shouldn't be possible for get_page_from_l2e to return ++ * -ERESTART, since we never call this with PTF_preemptible. ++ * (alloc_l1_table may return -EINTR on an L1TF-vulnerable ++ * entry.) ++ * ++ * NB that while on a "clean" promotion, we can never get ++ * PGT_partial. It is possible to arrange for an l2e to ++ * contain a partially-devalidated l2; but in that case, both ++ * of the following functions will fail anyway (the first ++ * because the page in question is not an l1; the second ++ * because the page is not fully validated). ++ */ ++ ASSERT(rc != -ERESTART); ++ ++ if ( rc == -EINTR && i ) + { + page->nr_validated_ptes = i; + page->partial_flags = 0; +@@ -1584,6 +1601,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) + else if ( rc < 0 && rc != -EINTR ) + { + gdprintk(XENLOG_WARNING, "Failure in alloc_l2_table: slot %#x\n", i); ++ ASSERT(current->arch.old_guest_table == NULL); + if ( i ) + { + page->nr_validated_ptes = i; +@@ -1642,7 +1660,7 @@ static int alloc_l3_table(struct page_info *page) + rc = get_page_and_type_from_mfn( + l3e_get_mfn(l3e), + PGT_l2_page_table | PGT_pae_xen_l2, d, +- partial_flags | PTF_preemptible); ++ partial_flags | PTF_preemptible | PTF_retain_ref_on_restart); + } + else if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ) + { +@@ -1651,13 +1669,14 @@ static int alloc_l3_table(struct page_info *page) + rc = -EINTR; + } + else +- rc = get_page_from_l3e(l3e, pfn, d, partial_flags); ++ rc = get_page_from_l3e(l3e, pfn, d, ++ partial_flags | PTF_retain_ref_on_restart); + + if ( rc == -ERESTART ) + { + page->nr_validated_ptes = i; + /* Set 'set', leave 'general ref' set if this entry was set */ +- page->partial_flags = partial_flags | PTF_partial_set; ++ page->partial_flags = PTF_partial_set | PTF_partial_general_ref; + } + else if ( rc == -EINTR && i ) + { +@@ -1833,13 +1852,14 @@ static int alloc_l4_table(struct page_info *page) + rc = -EINTR; + } + else +- rc = get_page_from_l4e(l4e, pfn, d, partial_flags); ++ rc = get_page_from_l4e(l4e, pfn, d, ++ partial_flags | PTF_retain_ref_on_restart); + + if ( rc == -ERESTART ) + { + page->nr_validated_ptes = i; + /* Set 'set', leave 'general ref' set if this entry was set */ +- page->partial_flags = partial_flags | PTF_partial_set; ++ page->partial_flags = PTF_partial_set | PTF_partial_general_ref; + } + else if ( rc < 0 ) + { +@@ -1936,9 +1956,7 @@ static int free_l2_table(struct page_info *page) + else if ( rc == -ERESTART ) + { + page->nr_validated_ptes = i; +- page->partial_flags = (partial_flags & PTF_partial_set) ? +- partial_flags : +- (PTF_partial_set | PTF_partial_general_ref); ++ page->partial_flags = PTF_partial_set | PTF_partial_general_ref; + } + else if ( rc == -EINTR && i < L2_PAGETABLE_ENTRIES - 1 ) + { +@@ -1986,9 +2004,7 @@ static int free_l3_table(struct page_info *page) + if ( rc == -ERESTART ) + { + page->nr_validated_ptes = i; +- page->partial_flags = (partial_flags & PTF_partial_set) ? +- partial_flags : +- (PTF_partial_set | PTF_partial_general_ref); ++ page->partial_flags = PTF_partial_set | PTF_partial_general_ref; + } + else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 ) + { +@@ -2019,9 +2035,7 @@ static int free_l4_table(struct page_info *page) + if ( rc == -ERESTART ) + { + page->nr_validated_ptes = i; +- page->partial_flags = (partial_flags & PTF_partial_set) ? +- partial_flags : +- (PTF_partial_set | PTF_partial_general_ref); ++ page->partial_flags = PTF_partial_set | PTF_partial_general_ref; + } + else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 ) + { +diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h +index 8406ac3c37..02079e1324 100644 +--- a/xen/include/asm-x86/mm.h ++++ b/xen/include/asm-x86/mm.h +@@ -238,22 +238,25 @@ struct page_info + * page. + * + * This happens: +- * - During de-validation, if de-validation of the page was ++ * - During validation or de-validation, if the operation was + * interrupted + * - During validation, if an invalid entry is encountered and + * validation is preemptible + * - During validation, if PTF_partial_general_ref was set on +- * this entry to begin with (perhaps because we're picking +- * up from a partial de-validation). ++ * this entry to begin with (perhaps because it picked up a ++ * previous operation) + * +- * When resuming validation, if PTF_partial_general_ref is clear, +- * then a general reference must be re-acquired; if it is set, no +- * reference should be acquired. ++ * When resuming validation, if PTF_partial_general_ref is ++ * clear, then a general reference must be re-acquired; if it ++ * is set, no reference should be acquired. + * + * When resuming de-validation, if PTF_partial_general_ref is + * clear, no reference should be dropped; if it is set, a + * reference should be dropped. + * ++ * NB at the moment, PTF_partial_set should be set if and only if ++ * PTF_partial_general_ref is set. ++ * + * NB that PTF_partial_set and PTF_partial_general_ref are + * defined in mm.c, the only place where they are used. + * +-- +2.23.0 + diff --git a/system/xen/xsa/xsa299-4.12-0008-x86-mm-Collapse-PTF_partial_set-and-PTF_partial_gene.patch b/system/xen/xsa/xsa299-4.12-0008-x86-mm-Collapse-PTF_partial_set-and-PTF_partial_gene.patch new file mode 100644 index 0000000000000..6cf41d1cd6941 --- /dev/null +++ b/system/xen/xsa/xsa299-4.12-0008-x86-mm-Collapse-PTF_partial_set-and-PTF_partial_gene.patch @@ -0,0 +1,227 @@ +From 8a8d836f7f7418e659d37817a66cd7a6b115042b Mon Sep 17 00:00:00 2001 +From: George Dunlap <george.dunlap@citrix.com> +Date: Thu, 10 Oct 2019 17:57:49 +0100 +Subject: [PATCH 08/11] x86/mm: Collapse PTF_partial_set and + PTF_partial_general_ref into one + +...now that they are equivalent. No functional change intended. + +Reported-by: George Dunlap <george.dunlap@citrix.com> +Signed-off-by: George Dunlap <george.dunlap@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +--- + xen/arch/x86/mm.c | 50 +++++++++++----------------------------- + xen/include/asm-x86/mm.h | 29 +++++++++++------------ + 2 files changed, 26 insertions(+), 53 deletions(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 4d3ebf341d..886e93b8aa 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -1097,13 +1097,12 @@ get_page_from_l1e( + + /* + * The following flags are used to specify behavior of various get and +- * put commands. The first two are also stored in page->partial_flags +- * to indicate the state of the page pointed to by ++ * put commands. The first is also stored in page->partial_flags to ++ * indicate the state of the page pointed to by + * page->pte[page->nr_validated_entries]. See the comment in mm.h for + * more information. + */ + #define PTF_partial_set (1 << 0) +-#define PTF_partial_general_ref (1 << 1) + #define PTF_preemptible (1 << 2) + #define PTF_defer (1 << 3) + #define PTF_retain_ref_on_restart (1 << 4) +@@ -1115,13 +1114,10 @@ static int get_page_and_type_from_mfn( + struct page_info *page = mfn_to_page(mfn); + int rc; + bool preemptible = flags & PTF_preemptible, +- partial_ref = flags & PTF_partial_general_ref, + partial_set = flags & PTF_partial_set, + retain_ref = flags & PTF_retain_ref_on_restart; + +- ASSERT(partial_ref == partial_set); +- +- if ( likely(!partial_ref) && ++ if ( likely(!partial_set) && + unlikely(!get_page_from_mfn(mfn, d)) ) + return -EINVAL; + +@@ -1131,14 +1127,14 @@ static int get_page_and_type_from_mfn( + * Retain the refcount if: + * - page is fully validated (rc == 0) + * - page is not validated (rc < 0) but: +- * - We came in with a reference (partial_ref) ++ * - We came in with a reference (partial_set) + * - page is partially validated (rc == -ERESTART), and the + * caller has asked the ref to be retained in that case + * - page is partially validated but there's been an error + * (page == current->arch.old_guest_table) + * +- * The partial_ref-on-error clause is worth an explanation. There +- * are two scenarios where partial_ref might be true coming in: ++ * The partial_set-on-error clause is worth an explanation. There ++ * are two scenarios where partial_set might be true coming in: + * - mfn has been partially promoted / demoted as type `type`; + * i.e. has PGT_partial set + * - mfn has been partially demoted as L(type+1) (i.e., a linear +@@ -1161,7 +1157,7 @@ static int get_page_and_type_from_mfn( + * count retained unless we succeeded, or the operation was + * preemptible. + */ +- if ( likely(!rc) || partial_ref ) ++ if ( likely(!rc) || partial_set ) + /* nothing */; + else if ( page == current->arch.old_guest_table || + (retain_ref && rc == -ERESTART) ) +@@ -1359,13 +1355,7 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, + struct page_info *pg = l2e_get_page(l2e); + struct page_info *ptpg = mfn_to_page(_mfn(pfn)); + +- if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == +- PTF_partial_set ) +- { +- /* partial_set should always imply partial_ref */ +- BUG(); +- } +- else if ( flags & PTF_defer ) ++ if ( flags & PTF_defer ) + { + current->arch.old_guest_ptpg = ptpg; + current->arch.old_guest_table = pg; +@@ -1405,13 +1395,6 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, + + pg = l3e_get_page(l3e); + +- if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == +- PTF_partial_set ) +- { +- /* partial_set should always imply partial_ref */ +- BUG(); +- } +- + if ( flags & PTF_defer ) + { + current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn)); +@@ -1436,13 +1419,6 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, + { + struct page_info *pg = l4e_get_page(l4e); + +- if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == +- PTF_partial_set ) +- { +- /* partial_set should always imply partial_ref */ +- BUG(); +- } +- + if ( flags & PTF_defer ) + { + current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn)); +@@ -1676,7 +1652,7 @@ static int alloc_l3_table(struct page_info *page) + { + page->nr_validated_ptes = i; + /* Set 'set', leave 'general ref' set if this entry was set */ +- page->partial_flags = PTF_partial_set | PTF_partial_general_ref; ++ page->partial_flags = PTF_partial_set; + } + else if ( rc == -EINTR && i ) + { +@@ -1859,7 +1835,7 @@ static int alloc_l4_table(struct page_info *page) + { + page->nr_validated_ptes = i; + /* Set 'set', leave 'general ref' set if this entry was set */ +- page->partial_flags = PTF_partial_set | PTF_partial_general_ref; ++ page->partial_flags = PTF_partial_set; + } + else if ( rc < 0 ) + { +@@ -1956,7 +1932,7 @@ static int free_l2_table(struct page_info *page) + else if ( rc == -ERESTART ) + { + page->nr_validated_ptes = i; +- page->partial_flags = PTF_partial_set | PTF_partial_general_ref; ++ page->partial_flags = PTF_partial_set; + } + else if ( rc == -EINTR && i < L2_PAGETABLE_ENTRIES - 1 ) + { +@@ -2004,7 +1980,7 @@ static int free_l3_table(struct page_info *page) + if ( rc == -ERESTART ) + { + page->nr_validated_ptes = i; +- page->partial_flags = PTF_partial_set | PTF_partial_general_ref; ++ page->partial_flags = PTF_partial_set; + } + else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 ) + { +@@ -2035,7 +2011,7 @@ static int free_l4_table(struct page_info *page) + if ( rc == -ERESTART ) + { + page->nr_validated_ptes = i; +- page->partial_flags = PTF_partial_set | PTF_partial_general_ref; ++ page->partial_flags = PTF_partial_set; + } + else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 ) + { +diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h +index 02079e1324..f0fd35bf6b 100644 +--- a/xen/include/asm-x86/mm.h ++++ b/xen/include/asm-x86/mm.h +@@ -233,7 +233,7 @@ struct page_info + * operation on the current page. (That page may or may not + * still have PGT_partial set.) + * +- * If PTF_partial_general_ref is set, then the PTE at ++ * Additionally, if PTF_partial_set is set, then the PTE at + * @nr_validated_ptef holds a general reference count for the + * page. + * +@@ -242,23 +242,20 @@ struct page_info + * interrupted + * - During validation, if an invalid entry is encountered and + * validation is preemptible +- * - During validation, if PTF_partial_general_ref was set on +- * this entry to begin with (perhaps because it picked up a ++ * - During validation, if PTF_partial_set was set on this ++ * entry to begin with (perhaps because it picked up a + * previous operation) + * +- * When resuming validation, if PTF_partial_general_ref is +- * clear, then a general reference must be re-acquired; if it +- * is set, no reference should be acquired. ++ * When resuming validation, if PTF_partial_set is clear, then ++ * a general reference must be re-acquired; if it is set, no ++ * reference should be acquired. + * +- * When resuming de-validation, if PTF_partial_general_ref is +- * clear, no reference should be dropped; if it is set, a +- * reference should be dropped. ++ * When resuming de-validation, if PTF_partial_set is clear, ++ * no reference should be dropped; if it is set, a reference ++ * should be dropped. + * +- * NB at the moment, PTF_partial_set should be set if and only if +- * PTF_partial_general_ref is set. +- * +- * NB that PTF_partial_set and PTF_partial_general_ref are +- * defined in mm.c, the only place where they are used. ++ * NB that PTF_partial_set is defined in mm.c, the only place ++ * where it is used. + * + * The 3rd field, @linear_pt_count, indicates + * - by a positive value, how many same-level page table entries a page +@@ -268,8 +265,8 @@ struct page_info + */ + struct { + u16 nr_validated_ptes:PAGETABLE_ORDER + 1; +- u16 :16 - PAGETABLE_ORDER - 1 - 2; +- u16 partial_flags:2; ++ u16 :16 - PAGETABLE_ORDER - 1 - 1; ++ u16 partial_flags:1; + s16 linear_pt_count; + }; + +-- +2.23.0 + diff --git a/system/xen/xsa/xsa299-4.12-0009-x86-mm-Properly-handle-linear-pagetable-promotion-fa.patch b/system/xen/xsa/xsa299-4.12-0009-x86-mm-Properly-handle-linear-pagetable-promotion-fa.patch new file mode 100644 index 0000000000000..bbaba794fc7bc --- /dev/null +++ b/system/xen/xsa/xsa299-4.12-0009-x86-mm-Properly-handle-linear-pagetable-promotion-fa.patch @@ -0,0 +1,106 @@ +From da3d1d258e54fe600f7f75287183b74d957ec63b Mon Sep 17 00:00:00 2001 +From: George Dunlap <george.dunlap@citrix.com> +Date: Thu, 10 Oct 2019 17:57:49 +0100 +Subject: [PATCH 09/11] x86/mm: Properly handle linear pagetable promotion + failures + +In order to allow recursive pagetable promotions and demotions to be +interrupted, Xen must keep track of the state of the sub-pages +promoted or demoted. This is stored in two elements in the page +struct: nr_entries_validated and partial_flags. + +The rule is that entries [0, nr_entries_validated) should always be +validated and hold a general reference count. If partial_flags is +zero, then [nr_entries_validated] is not validated and no reference +count is held. If PTF_partial_set is set, then [nr_entries_validated] +is partially validated, and a general reference count is held. + +Unfortunately, in cases where an entry began with PTF_partial_set set, +and get_page_from_lNe() returns -EINVAL, the PTF_partial_set bit is +erroneously dropped. (This scenario can be engineered mainly by the +use of interleaving of promoting and demoting a page which has "linear +pagetable" entries; see the appendix for a sketch.) This means that +we will "leak" a general reference count on the page in question, +preventing the page from being freed. + +Fix this by setting page->partial_flags to the partial_flags local +variable. + +This is part of XSA-299. + +Reported-by: George Dunlap <george.dunlap@citrix.com> +Signed-off-by: George Dunlap <george.dunlap@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +----- +Appendix + +Suppose A and B can both be promoted to L2 pages, and A[x] points to B. + +V1: PIN_L2 B. + B.type_count = 1 | PGT_validated + B.count = 2 | PGC_allocated + +V1: MOD_L3_ENTRY pointing something to A. + In the process of validating A[x], grab an extra type / ref on B: + B.type_count = 2 | PGT_validated + B.count = 3 | PGC_allocated + A.type_count = 1 | PGT_validated + A.count = 2 | PGC_allocated + +V1: UNPIN B. + B.type_count = 1 | PGT_validate + B.count = 2 | PGC_allocated + +V1: MOD_L3_ENTRY removing the reference to A. + De-validate A, down to A[x], which points to B. + Drop the final type on B. Arrange to be interrupted. + B.type_count = 1 | PGT_partial + B.count = 2 | PGC_allocated + A.type_count = 1 | PGT_partial + A.nr_validated_entries = x + A.partial_pte = -1 + +V2: MOD_L3_ENTRY adds a reference to A. + +At this point, get_page_from_l2e(A[x]) tries +get_page_and_type_from_mfn(), which fails because it's the wrong type; +and get_l2_linear_pagetable() also fails, because B isn't validated as +an l2 anymore. +--- + xen/arch/x86/mm.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 886e93b8aa..0a094291da 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -1581,7 +1581,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) + if ( i ) + { + page->nr_validated_ptes = i; +- page->partial_flags = 0; ++ page->partial_flags = partial_flags; + current->arch.old_guest_ptpg = NULL; + current->arch.old_guest_table = page; + } +@@ -1674,7 +1674,7 @@ static int alloc_l3_table(struct page_info *page) + if ( i ) + { + page->nr_validated_ptes = i; +- page->partial_flags = 0; ++ page->partial_flags = partial_flags; + current->arch.old_guest_ptpg = NULL; + current->arch.old_guest_table = page; + } +@@ -1845,7 +1845,7 @@ static int alloc_l4_table(struct page_info *page) + if ( i ) + { + page->nr_validated_ptes = i; +- page->partial_flags = 0; ++ page->partial_flags = partial_flags; + if ( rc == -EINTR ) + rc = -ERESTART; + else +-- +2.23.0 + diff --git a/system/xen/xsa/xsa299-4.12-0010-x86-mm-Fix-nested-de-validation-on-error.patch b/system/xen/xsa/xsa299-4.12-0010-x86-mm-Fix-nested-de-validation-on-error.patch new file mode 100644 index 0000000000000..7d5f022e892c4 --- /dev/null +++ b/system/xen/xsa/xsa299-4.12-0010-x86-mm-Fix-nested-de-validation-on-error.patch @@ -0,0 +1,166 @@ +From b3e169dc8daeae85b0b51c25fdb142e2e552ec7f Mon Sep 17 00:00:00 2001 +From: George Dunlap <george.dunlap@citrix.com> +Date: Thu, 10 Oct 2019 17:57:49 +0100 +Subject: [PATCH 10/11] x86/mm: Fix nested de-validation on error + +If an invalid entry is discovered when validating a page-table tree, +the entire tree which has so far been validated must be de-validated. +Since this may take a long time, alloc_l[2-4]_table() set current +vcpu's old_guest_table immediately; put_old_guest_table() will make +sure that put_page_type() will be called to finish off the +de-validation before any other MMU operations can happen on the vcpu. + +The invariant for partial pages should be: + +* Entries [0, nr_validated_ptes) should be completely validated; + put_page_type() will de-validate these. + +* If [nr_validated_ptes] is partially validated, partial_flags should + set PTF_partiaL_set. put_page_type() will be called on this page to + finish off devalidation, and the appropriate refcount adjustments + will be done. + +alloc_l[2-3]_table() indicates partial validation to its callers by +setting current->old_guest_table. + +Unfortunately, this is mishandled. + +Take the case where validating lNe[x] returns an error. + +First, alloc_l3_table() doesn't check old_guest_table at all; as a +result, partial_flags is not set when it should be. nr_validated_ptes +is set to x; and since PFT_partial_set clear, de-validation resumes at +nr_validated_ptes-1. This means that the l2 page at pl3e[x] will not +have put_page_type() called on it when de-validating the rest of the +l3: it will be stuck in the PGT_partial state until the domain is +destroyed, or until it is re-used as an l2. (Any other page type will +fail.) + +Worse, alloc_l4_table(), rather than setting PTF_partial_set as it +should, sets nr_validated_ptes to x+1. When de-validating, since +partial is 0, this will correctly resume calling put_page_type at [x]; +but, if the put_page_type() is never called, but instead +get_page_type() is called, validation will pick up at [x+1], +neglecting to validate [x]. If the rest of the validation succeeds, +the l4 will be validated even though [x] is invalid. + +Fix this in both cases by setting PTF_partial_set if old_guest_table +is set. + +While here, add some safety catches: +- old_guest_table must point to the page contained in + [nr_validated_ptes]. +- alloc_l1_page shouldn't set old_guest_table + +If we experience one of these situations in production builds, it's +safer to avoid calling put_page_type for the pages in question. If +they have PGT_partial set, they will be cleaned up on domain +destruction; if not, we have no idea whether a type count is safe to +drop. Retaining an extra type ref that should have been dropped may +trigger a BUG() on the free_domain_page() path, but dropping a type +count that shouldn't be dropped may cause a privilege escalation. + +This is part of XSA-299. + +Reported-by: George Dunlap <george.dunlap@citrix.com> +Signed-off-by: George Dunlap <george.dunlap@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +--- + xen/arch/x86/mm.c | 53 +++++++++++++++++++++++++++++++++++++++++++++-- + 1 file changed, 51 insertions(+), 2 deletions(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 0a094291da..a432e69c74 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -1580,6 +1580,20 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) + ASSERT(current->arch.old_guest_table == NULL); + if ( i ) + { ++ /* ++ * alloc_l1_table() doesn't set old_guest_table; it does ++ * its own tear-down immediately on failure. If it ++ * did we'd need to check it and set partial_flags as we ++ * do in alloc_l[34]_table(). ++ * ++ * Note on the use of ASSERT: if it's non-null and ++ * hasn't been cleaned up yet, it should have ++ * PGT_partial set; and so the type will be cleaned up ++ * on domain destruction. Unfortunately, we would ++ * leak the general ref held by old_guest_table; but ++ * leaking a page is less bad than a host crash. ++ */ ++ ASSERT(current->arch.old_guest_table == NULL); + page->nr_validated_ptes = i; + page->partial_flags = partial_flags; + current->arch.old_guest_ptpg = NULL; +@@ -1607,6 +1621,7 @@ static int alloc_l3_table(struct page_info *page) + unsigned int i; + int rc = 0; + unsigned int partial_flags = page->partial_flags; ++ l3_pgentry_t l3e = l3e_empty(); + + pl3e = map_domain_page(_mfn(pfn)); + +@@ -1623,7 +1638,7 @@ static int alloc_l3_table(struct page_info *page) + for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES; + i++, partial_flags = 0 ) + { +- l3_pgentry_t l3e = pl3e[i]; ++ l3e = pl3e[i]; + + if ( i > page->nr_validated_ptes && hypercall_preempt_check() ) + rc = -EINTR; +@@ -1675,6 +1690,24 @@ static int alloc_l3_table(struct page_info *page) + { + page->nr_validated_ptes = i; + page->partial_flags = partial_flags; ++ if ( current->arch.old_guest_table ) ++ { ++ /* ++ * We've experienced a validation failure. If ++ * old_guest_table is set, "transfer" the general ++ * reference count to pl3e[nr_validated_ptes] by ++ * setting PTF_partial_set. ++ * ++ * As a precaution, check that old_guest_table is the ++ * page pointed to by pl3e[nr_validated_ptes]. If ++ * not, it's safer to leak a type ref on production ++ * builds. ++ */ ++ if ( current->arch.old_guest_table == l3e_get_page(l3e) ) ++ page->partial_flags = PTF_partial_set; ++ else ++ ASSERT_UNREACHABLE(); ++ } + current->arch.old_guest_ptpg = NULL; + current->arch.old_guest_table = page; + } +@@ -1851,7 +1884,23 @@ static int alloc_l4_table(struct page_info *page) + else + { + if ( current->arch.old_guest_table ) +- page->nr_validated_ptes++; ++ { ++ /* ++ * We've experienced a validation failure. If ++ * old_guest_table is set, "transfer" the general ++ * reference count to pl3e[nr_validated_ptes] by ++ * setting PTF_partial_set. ++ * ++ * As a precaution, check that old_guest_table is the ++ * page pointed to by pl4e[nr_validated_ptes]. If ++ * not, it's safer to leak a type ref on production ++ * builds. ++ */ ++ if ( current->arch.old_guest_table == l4e_get_page(l4e) ) ++ page->partial_flags = PTF_partial_set; ++ else ++ ASSERT_UNREACHABLE(); ++ } + current->arch.old_guest_ptpg = NULL; + current->arch.old_guest_table = page; + } +-- +2.23.0 + diff --git a/system/xen/xsa/xsa299-4.12-0011-x86-mm-Don-t-drop-a-type-ref-unless-you-held-a-ref-t.patch b/system/xen/xsa/xsa299-4.12-0011-x86-mm-Don-t-drop-a-type-ref-unless-you-held-a-ref-t.patch new file mode 100644 index 0000000000000..ad7e6fee1b9d9 --- /dev/null +++ b/system/xen/xsa/xsa299-4.12-0011-x86-mm-Don-t-drop-a-type-ref-unless-you-held-a-ref-t.patch @@ -0,0 +1,413 @@ +From ea3dc624c5e6325a9c2f079e52a85965d4ab6ce8 Mon Sep 17 00:00:00 2001 +From: George Dunlap <george.dunlap@citrix.com> +Date: Thu, 10 Oct 2019 17:57:50 +0100 +Subject: [PATCH 11/11] x86/mm: Don't drop a type ref unless you held a ref to + begin with + +Validation and de-validation of pagetable trees may take arbitrarily +large amounts of time, and so must be preemptible. This is indicated +by setting the PGT_partial bit in the type_info, and setting +nr_validated_entries and partial_flags appropriately. Specifically, +if the entry at [nr_validated_entries] is partially validated, +partial_flags should have the PGT_partial_set bit set, and the entry +should hold a general reference count. During de-validation, +put_page_type() is called on partially validated entries. + +Unfortunately, there are a number of issues with the current algorithm. + +First, doing a "normal" put_page_type() is not safe when no type ref +is held: there is nothing to stop another vcpu from coming along and +picking up validation again: at which point the put_page_type may drop +the only page ref on an in-use page. Some examples are listed in the +appendix. + +The core issue is that put_page_type() is being called both to clean +up PGT_partial, and to drop a type count; and has no way of knowing +which is which; and so if in between, PGT_partial is cleared, +put_page_type() will drop the type ref erroneously. + +What is needed is to distinguish between two states: +- Dropping a type ref which is held +- Cleaning up a page which has been partially de/validated + +Fix this by telling put_page_type() which of the two activities you +intend. + +When cleaning up a partial de/validation, take no action unless you +find a page partially validated. + +If put_page_type() is called without PTF_partial_set, and finds the +page in a PGT_partial state anyway, then there's certainly been a +misaccounting somewhere, and carrying on would almost certainly cause +a security issue, so crash the host instead. + +In put_page_from_lNe, pass partial_flags on to _put_page_type(). + +old_guest_table may be set either with a fully validated page (when +using the "deferred put" pattern), or with a partially validated page +(when a normal "de-validation" is interrupted, or when a validation +fails part-way through due to invalid entries). Add a flag, +old_guest_table_partial, to indicate which of these it is, and use +that to pass the appropriate flag to _put_page_type(). + +While here, delete stray trailing whitespace. + +This is part of XSA-299. + +Reported-by: George Dunlap <george.dunlap@citrix.com> +Signed-off-by: George Dunlap <george.dunlap@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +----- +Appendix: + +Suppose page A, when interpreted as an l3 pagetable, contains all +valid entries; and suppose A[x] points to page B, which when +interpreted as an l2 pagetable, contains all valid entries. + +P1: PIN_L3_TABLE + A -> PGT_l3_table | 1 | valid + B -> PGT_l2_table | 1 | valid + +P1: UNPIN_TABLE + > Arrange to interrupt after B has been de-validated + B: + type_info -> PGT_l2_table | 0 + A: + type_info -> PGT_l3_table | 1 | partial + nr_validated_enties -> (less than x) + +P2: mod_l4_entry to point to A + > Arrange for this to be interrupted while B is being validated + B: + type_info -> PGT_l2_table | 1 | partial + (nr_validated_entires &c set as appropriate) + A: + type_info -> PGT_l3_table | 1 | partial + nr_validated_entries -> x + partial_pte = 1 + +P3: mod_l3_entry some other unrelated l3 to point to B: + B: + type_info -> PGT_l2_table | 1 + +P1: Restart UNPIN_TABLE + +At this point, since A.nr_validate_entries == x and A.partial_pte != +0, free_l3_table() will call put_page_from_l3e() on pl3e[x], dropping +its type count to 0 while it's still being pointed to by some other l3 + +A similar issue arises with old_guest_table. Consider the following +scenario: + +Suppose A is a page which, when interpreted as an l2, has valid entries +until entry x, which is invalid. + +V1: PIN_L2_TABLE(A) + <Validate until we try to validate [x], get -EINVAL> + A -> PGT_l2_table | 1 | PGT_partial + V1 -> old_guest_table = A + <delayed> + +V2: PIN_L2_TABLE(A) + <Pick up where V1 left off, try to re-validate [x], get -EINVAL> + A -> PGT_l2_table | 1 | PGT_partial + V2 -> old_guest_table = A + <restart> + put_old_guest_table() + _put_page_type(A) + A -> PGT_l2_table | 0 + +V1: <restart> + put_old_guest_table() + _put_page_type(A) # UNDERFLOW + +Indeed, it is possible to engineer for old_guest_table for every vcpu +a guest has to point to the same page. +--- + xen/arch/x86/domain.c | 6 +++ + xen/arch/x86/mm.c | 99 +++++++++++++++++++++++++++++++----- + xen/include/asm-x86/domain.h | 4 +- + 3 files changed, 95 insertions(+), 14 deletions(-) + +diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c +index 59df8a6d8d..f1ae5f89f5 100644 +--- a/xen/arch/x86/domain.c ++++ b/xen/arch/x86/domain.c +@@ -1104,9 +1104,15 @@ int arch_set_info_guest( + rc = -ERESTART; + /* Fallthrough */ + case -ERESTART: ++ /* ++ * NB that we're putting the kernel-mode table ++ * here, which we've already successfully ++ * validated above; hence partial = false; ++ */ + v->arch.old_guest_ptpg = NULL; + v->arch.old_guest_table = + pagetable_get_page(v->arch.guest_table); ++ v->arch.old_guest_table_partial = false; + v->arch.guest_table = pagetable_null(); + break; + default: +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index a432e69c74..81774368a0 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -1359,10 +1359,11 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, + { + current->arch.old_guest_ptpg = ptpg; + current->arch.old_guest_table = pg; ++ current->arch.old_guest_table_partial = false; + } + else + { +- rc = _put_page_type(pg, PTF_preemptible, ptpg); ++ rc = _put_page_type(pg, flags | PTF_preemptible, ptpg); + if ( likely(!rc) ) + put_page(pg); + } +@@ -1385,6 +1386,7 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, + unsigned long mfn = l3e_get_pfn(l3e); + bool writeable = l3e_get_flags(l3e) & _PAGE_RW; + ++ ASSERT(!(flags & PTF_partial_set)); + ASSERT(!(mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1))); + do { + put_data_page(mfn_to_page(_mfn(mfn)), writeable); +@@ -1397,12 +1399,14 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, + + if ( flags & PTF_defer ) + { ++ ASSERT(!(flags & PTF_partial_set)); + current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn)); + current->arch.old_guest_table = pg; ++ current->arch.old_guest_table_partial = false; + return 0; + } + +- rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn))); ++ rc = _put_page_type(pg, flags | PTF_preemptible, mfn_to_page(_mfn(pfn))); + if ( likely(!rc) ) + put_page(pg); + +@@ -1421,12 +1425,15 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, + + if ( flags & PTF_defer ) + { ++ ASSERT(!(flags & PTF_partial_set)); + current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn)); + current->arch.old_guest_table = pg; ++ current->arch.old_guest_table_partial = false; + return 0; + } + +- rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn))); ++ rc = _put_page_type(pg, flags | PTF_preemptible, ++ mfn_to_page(_mfn(pfn))); + if ( likely(!rc) ) + put_page(pg); + } +@@ -1535,6 +1542,14 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) + + pl2e = map_domain_page(_mfn(pfn)); + ++ /* ++ * NB that alloc_l2_table will never set partial_pte on an l2; but ++ * free_l2_table might if a linear_pagetable entry is interrupted ++ * partway through de-validation. In that circumstance, ++ * get_page_from_l2e() will always return -EINVAL; and we must ++ * retain the type ref by doing the normal partial_flags tracking. ++ */ ++ + for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; + i++, partial_flags = 0 ) + { +@@ -1598,6 +1613,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) + page->partial_flags = partial_flags; + current->arch.old_guest_ptpg = NULL; + current->arch.old_guest_table = page; ++ current->arch.old_guest_table_partial = true; + } + } + if ( rc < 0 ) +@@ -1704,12 +1720,16 @@ static int alloc_l3_table(struct page_info *page) + * builds. + */ + if ( current->arch.old_guest_table == l3e_get_page(l3e) ) ++ { ++ ASSERT(current->arch.old_guest_table_partial); + page->partial_flags = PTF_partial_set; ++ } + else + ASSERT_UNREACHABLE(); + } + current->arch.old_guest_ptpg = NULL; + current->arch.old_guest_table = page; ++ current->arch.old_guest_table_partial = true; + } + while ( i-- > 0 ) + pl3e[i] = unadjust_guest_l3e(pl3e[i], d); +@@ -1897,12 +1917,16 @@ static int alloc_l4_table(struct page_info *page) + * builds. + */ + if ( current->arch.old_guest_table == l4e_get_page(l4e) ) ++ { ++ ASSERT(current->arch.old_guest_table_partial); + page->partial_flags = PTF_partial_set; ++ } + else + ASSERT_UNREACHABLE(); + } + current->arch.old_guest_ptpg = NULL; + current->arch.old_guest_table = page; ++ current->arch.old_guest_table_partial = true; + } + } + } +@@ -2831,6 +2855,28 @@ static int _put_page_type(struct page_info *page, unsigned int flags, + x = y; + nx = x - 1; + ++ /* ++ * Is this expected to do a full reference drop, or only ++ * cleanup partial validation / devalidation? ++ * ++ * If the former, the caller must hold a "full" type ref; ++ * which means the page must be validated. If the page is ++ * *not* fully validated, continuing would almost certainly ++ * open up a security hole. An exception to this is during ++ * domain destruction, where PGT_validated can be dropped ++ * without dropping a type ref. ++ * ++ * If the latter, do nothing unless type PGT_partial is set. ++ * If it is set, the type count must be 1. ++ */ ++ if ( !(flags & PTF_partial_set) ) ++ BUG_ON((x & PGT_partial) || ++ !((x & PGT_validated) || page_get_owner(page)->is_dying)); ++ else if ( !(x & PGT_partial) ) ++ return 0; ++ else ++ BUG_ON((x & PGT_count_mask) != 1); ++ + ASSERT((x & PGT_count_mask) != 0); + + switch ( nx & (PGT_locked | PGT_count_mask) ) +@@ -3092,17 +3138,34 @@ int put_old_guest_table(struct vcpu *v) + if ( !v->arch.old_guest_table ) + return 0; + +- switch ( rc = _put_page_type(v->arch.old_guest_table, PTF_preemptible, +- v->arch.old_guest_ptpg) ) ++ rc = _put_page_type(v->arch.old_guest_table, ++ PTF_preemptible | ++ ( v->arch.old_guest_table_partial ? ++ PTF_partial_set : 0 ), ++ v->arch.old_guest_ptpg); ++ ++ if ( rc == -ERESTART || rc == -EINTR ) + { +- case -EINTR: +- case -ERESTART: ++ v->arch.old_guest_table_partial = (rc == -ERESTART); + return -ERESTART; +- case 0: +- put_page(v->arch.old_guest_table); + } + ++ /* ++ * It shouldn't be possible for _put_page_type() to return ++ * anything else at the moment; but if it does happen in ++ * production, leaking the type ref is probably the best thing to ++ * do. Either way, drop the general ref held by old_guest_table. ++ */ ++ ASSERT(rc == 0); ++ ++ put_page(v->arch.old_guest_table); + v->arch.old_guest_table = NULL; ++ v->arch.old_guest_ptpg = NULL; ++ /* ++ * Safest default if someone sets old_guest_table without ++ * explicitly setting old_guest_table_partial. ++ */ ++ v->arch.old_guest_table_partial = true; + + return rc; + } +@@ -3253,11 +3316,11 @@ int new_guest_cr3(mfn_t mfn) + switch ( rc = put_page_and_type_preemptible(page) ) + { + case -EINTR: +- rc = -ERESTART; +- /* fallthrough */ + case -ERESTART: + curr->arch.old_guest_ptpg = NULL; + curr->arch.old_guest_table = page; ++ curr->arch.old_guest_table_partial = (rc == -ERESTART); ++ rc = -ERESTART; + break; + default: + BUG_ON(rc); +@@ -3494,6 +3557,7 @@ long do_mmuext_op( + { + curr->arch.old_guest_ptpg = NULL; + curr->arch.old_guest_table = page; ++ curr->arch.old_guest_table_partial = false; + } + } + } +@@ -3528,6 +3592,11 @@ long do_mmuext_op( + case -ERESTART: + curr->arch.old_guest_ptpg = NULL; + curr->arch.old_guest_table = page; ++ /* ++ * EINTR means we still hold the type ref; ERESTART ++ * means PGT_partial holds the type ref ++ */ ++ curr->arch.old_guest_table_partial = (rc == -ERESTART); + rc = 0; + break; + default: +@@ -3596,11 +3665,15 @@ long do_mmuext_op( + switch ( rc = put_page_and_type_preemptible(page) ) + { + case -EINTR: +- rc = -ERESTART; +- /* fallthrough */ + case -ERESTART: + curr->arch.old_guest_ptpg = NULL; + curr->arch.old_guest_table = page; ++ /* ++ * EINTR means we still hold the type ref; ++ * ERESTART means PGT_partial holds the ref ++ */ ++ curr->arch.old_guest_table_partial = (rc == -ERESTART); ++ rc = -ERESTART; + break; + default: + BUG_ON(rc); +diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h +index 214e44ce1c..2cfce7b36b 100644 +--- a/xen/include/asm-x86/domain.h ++++ b/xen/include/asm-x86/domain.h +@@ -307,7 +307,7 @@ struct arch_domain + + struct paging_domain paging; + struct p2m_domain *p2m; +- /* To enforce lock ordering in the pod code wrt the ++ /* To enforce lock ordering in the pod code wrt the + * page_alloc lock */ + int page_alloc_unlock_level; + +@@ -581,6 +581,8 @@ struct arch_vcpu + struct page_info *old_guest_table; /* partially destructed pagetable */ + struct page_info *old_guest_ptpg; /* containing page table of the */ + /* former, if any */ ++ bool old_guest_table_partial; /* Are we dropping a type ref, or just ++ * finishing up a partial de-validation? */ + /* guest_table holds a ref to the page, and also a type-count unless + * shadow refcounts are in use */ + pagetable_t shadow_table[4]; /* (MFN) shadow(s) of guest */ +-- +2.23.0 + diff --git a/system/xen/xsa/xsa301-master-1.patch b/system/xen/xsa/xsa301-master-1.patch new file mode 100644 index 0000000000000..54cce2ce28456 --- /dev/null +++ b/system/xen/xsa/xsa301-master-1.patch @@ -0,0 +1,80 @@ +From 19d6330f142cb941b6340a88592e8a294de0ff8c Mon Sep 17 00:00:00 2001 +From: Julien Grall <julien.grall@arm.com> +Date: Tue, 15 Oct 2019 17:10:40 +0100 +Subject: [PATCH 1/3] xen/arm: p2m: Avoid aliasing guest physical frame + +The P2M helpers implementation is quite lax and will end up to ignore +the unused top bits of a guest physical frame. + +This effectively means that p2m_set_entry() will create a mapping for a +different frame (it is always equal to gfn & (mask unused bits)). Yet +p2m->max_mapped_gfn will be updated using the original frame. + +At the moment, p2m_get_entry() and p2m_resolve_translation_fault() +assume that p2m_get_root_pointer() will always return a non-NULL pointer +when the GFN is smaller than p2m->max_mapped_gfn. + +Unfortunately, because of the aliasing described above, it would be +possible to set p2m->max_mapped_gfn high enough so it covers frame that +would lead p2m_get_root_pointer() to return NULL. + +As we don't sanity check the guest physical frame provided by a guest, a +malicious guest could craft a series of hypercalls that will hit the +BUG_ON() and therefore DoS Xen. + +To prevent aliasing, the function p2m_get_root_pointer() is now reworked +to return NULL If any of the unused top bits are not zero. The caller +can then decide what's the appropriate action to do. Since the two paths +(i.e. P2M_ROOT_PAGES == 1 and P2M_ROOT_PAGES != 1) are now very +similarly, take the opportunity to consolidate them making the code a +bit simpler. + +With this change, p2m_get_entry() will not try to insert a mapping as +the root pointer is invalid. + +Note that root_table is now switch to unsigned long as unsigned int is +not enough to hold part of a GFN. + +This is part of XSA-301. + +Reported-by: Julien Grall <Julien.Grall@arm.com> +Signed-off-by: Julien Grall <julien.grall@arm.com> +Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> +--- + xen/arch/arm/p2m.c | 17 +++++------------ + 1 file changed, 5 insertions(+), 12 deletions(-) + +diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c +index a2749d9b6f..d0045a8b28 100644 +--- a/xen/arch/arm/p2m.c ++++ b/xen/arch/arm/p2m.c +@@ -229,21 +229,14 @@ void p2m_tlb_flush_sync(struct p2m_domain *p2m) + static lpae_t *p2m_get_root_pointer(struct p2m_domain *p2m, + gfn_t gfn) + { +- unsigned int root_table; +- +- if ( P2M_ROOT_PAGES == 1 ) +- return __map_domain_page(p2m->root); ++ unsigned long root_table; + + /* +- * Concatenated root-level tables. The table number will be the +- * offset at the previous level. It is not possible to +- * concatenate a level-0 root. ++ * While the root table index is the offset from the previous level, ++ * we can't use (P2M_ROOT_LEVEL - 1) because the root level might be ++ * 0. Yet we still want to check if all the unused bits are zeroed. + */ +- ASSERT(P2M_ROOT_LEVEL > 0); +- +- root_table = gfn_x(gfn) >> (level_orders[P2M_ROOT_LEVEL - 1]); +- root_table &= LPAE_ENTRY_MASK; +- ++ root_table = gfn_x(gfn) >> (level_orders[P2M_ROOT_LEVEL] + LPAE_SHIFT); + if ( root_table >= P2M_ROOT_PAGES ) + return NULL; + +-- +2.23.0 + diff --git a/system/xen/xsa/xsa301-master-2.patch b/system/xen/xsa/xsa301-master-2.patch new file mode 100644 index 0000000000000..baedc9c29751f --- /dev/null +++ b/system/xen/xsa/xsa301-master-2.patch @@ -0,0 +1,92 @@ +From 3b896936f7505e929dd869d14afcb185d0ee75f8 Mon Sep 17 00:00:00 2001 +From: Julien Grall <julien.grall@arm.com> +Date: Tue, 15 Oct 2019 17:10:41 +0100 +Subject: [PATCH 2/3] xen/arm: p2m: Avoid off-by-one check on + p2m->max_mapped_gfn + +The code base is using inconsistently the field p2m->max_mapped_gfn. +Some of the useres expect that p2m->max_guest_gfn contain the highest +mapped GFN while others expect highest + 1. + +p2m->max_guest_gfn is set as highest + 1, because of that the sanity +check on the GFN in p2m_resolved_translation_fault() and +p2m_get_entry() can be bypassed when GFN == p2m->max_guest_gfn. + +p2m_get_root_pointer(p2m->max_guest_gfn) may return NULL if it is +outside of address range supported and therefore the BUG_ON() could be +hit. + +The current value hold in p2m->max_mapped_gfn is inconsistent with the +expectation of the common code (see domain_get_maximum_gpfn()) and also +the documentation of the field. + +Rather than changing the check in p2m_translation_fault() and +p2m_get_entry(), p2m->max_mapped_gfn is now containing the highest +mapped GFN and the callers assuming "highest + 1" are now adjusted. + +Take the opportunity to use 1UL rather than 1 as page_order could +theoritically big enough to overflow a 32-bit integer. + +Lastly, the documentation of the field max_guest_gfn to reflect how it +is computed. + +This is part of XSA-301. + +Reported-by: Julien Grall <Julien.Grall@arm.com> +Signed-off-by: Julien Grall <julien.grall@arm.com> +Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> +--- + xen/arch/arm/p2m.c | 6 +++--- + xen/include/asm-arm/p2m.h | 5 +---- + 2 files changed, 4 insertions(+), 7 deletions(-) + +diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c +index d0045a8b28..8d20d27961 100644 +--- a/xen/arch/arm/p2m.c ++++ b/xen/arch/arm/p2m.c +@@ -1041,7 +1041,7 @@ static int __p2m_set_entry(struct p2m_domain *p2m, + p2m_write_pte(entry, pte, p2m->clean_pte); + + p2m->max_mapped_gfn = gfn_max(p2m->max_mapped_gfn, +- gfn_add(sgfn, 1 << page_order)); ++ gfn_add(sgfn, (1UL << page_order) - 1)); + p2m->lowest_mapped_gfn = gfn_min(p2m->lowest_mapped_gfn, sgfn); + } + +@@ -1572,7 +1572,7 @@ int relinquish_p2m_mapping(struct domain *d) + p2m_write_lock(p2m); + + start = p2m->lowest_mapped_gfn; +- end = p2m->max_mapped_gfn; ++ end = gfn_add(p2m->max_mapped_gfn, 1); + + for ( ; gfn_x(start) < gfn_x(end); + start = gfn_next_boundary(start, order) ) +@@ -1641,7 +1641,7 @@ int p2m_cache_flush_range(struct domain *d, gfn_t *pstart, gfn_t end) + p2m_read_lock(p2m); + + start = gfn_max(start, p2m->lowest_mapped_gfn); +- end = gfn_min(end, p2m->max_mapped_gfn); ++ end = gfn_min(end, gfn_add(p2m->max_mapped_gfn, 1)); + + next_block_gfn = start; + +diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h +index 89f82df380..5fdb6e8183 100644 +--- a/xen/include/asm-arm/p2m.h ++++ b/xen/include/asm-arm/p2m.h +@@ -36,10 +36,7 @@ struct p2m_domain { + /* Current Translation Table Base Register for the p2m */ + uint64_t vttbr; + +- /* +- * Highest guest frame that's ever been mapped in the p2m +- * Only takes into account ram and foreign mapping +- */ ++ /* Highest guest frame that's ever been mapped in the p2m */ + gfn_t max_mapped_gfn; + + /* +-- +2.23.0 + diff --git a/system/xen/xsa/xsa301-master-3.patch b/system/xen/xsa/xsa301-master-3.patch new file mode 100644 index 0000000000000..9f137b89f6ef2 --- /dev/null +++ b/system/xen/xsa/xsa301-master-3.patch @@ -0,0 +1,67 @@ +From 060c2dd3b7c2674a019d94afb2b4ebf3663f6c6e Mon Sep 17 00:00:00 2001 +From: Julien Grall <julien.grall@arm.com> +Date: Tue, 15 Oct 2019 17:10:42 +0100 +Subject: [PATCH 3/3] xen/arm: p2m: Don't check the return of + p2m_get_root_pointer() with BUG_ON() + +It turns out that the BUG_ON() was actually reachable with well-crafted +hypercalls. The BUG_ON() is here to prevent catch logical error, so +crashing Xen is a bit over the top. + +While all the holes should now be fixed, it would be better to downgrade +the BUG_ON() to something less fatal to prevent any more DoS. + +The BUG_ON() in p2m_get_entry() is now replaced by ASSERT_UNREACHABLE() +to catch mistake in debug build and return INVALID_MFN for production +build. The interface also requires to set page_order to give an idea of +the size of "hole". So 'level' is now set so we report a hole of size of +the an entry of the root page-table. This stays inline with what happen +when the GFN is higher than p2m->max_mapped_gfn. + +The BUG_ON() in p2m_resolve_translation_fault() is now replaced by +ASSERT_UNREACHABLE() to catch mistake in debug build and just report a +fault for producion build. + +This is part of XSA-301. + +Reported-by: Julien Grall <Julien.Grall@arm.com> +Signed-off-by: Julien Grall <julien.grall@arm.com> +Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> +--- + xen/arch/arm/p2m.c | 13 +++++++++++-- + 1 file changed, 11 insertions(+), 2 deletions(-) + +diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c +index 8d20d27961..ce59f2b503 100644 +--- a/xen/arch/arm/p2m.c ++++ b/xen/arch/arm/p2m.c +@@ -395,7 +395,12 @@ mfn_t p2m_get_entry(struct p2m_domain *p2m, gfn_t gfn, + * the table should always be non-NULL because the gfn is below + * p2m->max_mapped_gfn and the root table pages are always present. + */ +- BUG_ON(table == NULL); ++ if ( !table ) ++ { ++ ASSERT_UNREACHABLE(); ++ level = P2M_ROOT_LEVEL; ++ goto out; ++ } + + for ( level = P2M_ROOT_LEVEL; level < 3; level++ ) + { +@@ -1196,7 +1201,11 @@ bool p2m_resolve_translation_fault(struct domain *d, gfn_t gfn) + * The table should always be non-NULL because the gfn is below + * p2m->max_mapped_gfn and the root table pages are always present. + */ +- BUG_ON(table == NULL); ++ if ( !table ) ++ { ++ ASSERT_UNREACHABLE(); ++ goto out; ++ } + + /* + * Go down the page-tables until an entry has the valid bit unset or +-- +2.23.0 + diff --git a/system/xen/xsa/xsa302-4.12-0001-IOMMU-add-missing-HVM-check.patch b/system/xen/xsa/xsa302-4.12-0001-IOMMU-add-missing-HVM-check.patch new file mode 100644 index 0000000000000..5d52163406f0c --- /dev/null +++ b/system/xen/xsa/xsa302-4.12-0001-IOMMU-add-missing-HVM-check.patch @@ -0,0 +1,37 @@ +From 0c9c0fbb356e3210cb77b3d738be50981b26058a Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Wed, 2 Oct 2019 13:36:59 +0200 +Subject: [PATCH 1/2] IOMMU: add missing HVM check +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Fix an unguarded d->arch.hvm access in assign_device(). + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> + +(cherry picked from commit 41fd1009cd7416b73d745a77c24b4e8d1a296fe6) +Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com> +--- + xen/drivers/passthrough/pci.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c +index 8108ed5f9a..d7420bd8bf 100644 +--- a/xen/drivers/passthrough/pci.c ++++ b/xen/drivers/passthrough/pci.c +@@ -1452,7 +1452,8 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) + + /* Prevent device assign if mem paging or mem sharing have been + * enabled for this domain */ +- if ( unlikely(d->arch.hvm.mem_sharing_enabled || ++ if ( unlikely((is_hvm_domain(d) && ++ d->arch.hvm.mem_sharing_enabled) || + vm_event_check_ring(d->vm_event_paging) || + p2m_get_hostp2m(d)->global_logdirty) ) + return -EXDEV; +-- +2.11.0 + diff --git a/system/xen/xsa/xsa302-4.12-0002-passthrough-quarantine-PCI-devices.patch b/system/xen/xsa/xsa302-4.12-0002-passthrough-quarantine-PCI-devices.patch new file mode 100644 index 0000000000000..181ece3bb72a2 --- /dev/null +++ b/system/xen/xsa/xsa302-4.12-0002-passthrough-quarantine-PCI-devices.patch @@ -0,0 +1,499 @@ +From 278d8e585a9f110a1af0bd92a9fc43733c9c7227 Mon Sep 17 00:00:00 2001 +From: Paul Durrant <paul.durrant@citrix.com> +Date: Mon, 14 Oct 2019 17:52:59 +0100 +Subject: [PATCH 2/2] passthrough: quarantine PCI devices + +When a PCI device is assigned to an untrusted domain, it is possible for +that domain to program the device to DMA to an arbitrary address. The +IOMMU is used to protect the host from malicious DMA by making sure that +the device addresses can only target memory assigned to the guest. However, +when the guest domain is torn down the device is assigned back to dom0, +thus allowing any in-flight DMA to potentially target critical host data. + +This patch introduces a 'quarantine' for PCI devices using dom_io. When +the toolstack makes a device assignable (by binding it to pciback), it +will now also assign it to DOMID_IO and the device will only be assigned +back to dom0 when the device is made unassignable again. Whilst device is +assignable it will only ever transfer between dom_io and guest domains. +dom_io is actually only used as a sentinel domain for quarantining purposes; +it is not configured with any IOMMU mappings. Assignment to dom_io simply +means that the device's initiator (requestor) identifier is not present in +the IOMMU's device table and thus any DMA transactions issued will be +terminated with a fault condition. + +In addition, a fix to assignment handling is made for VT-d. Failure +during the assignment step should not lead to a device still being +associated with its prior owner. Hand the device to DomIO temporarily, +until the assignment step has completed successfully. Remove the PI +hooks from the source domain then earlier as well. + +Failure of the recovery reassign_device_ownership() may not go silent: +There e.g. may still be left over RMRR mappings in the domain assignment +to which has failed, and hence we can't allow that domain to continue +executing. + +NOTE: This patch also includes one printk() cleanup; the + "XEN_DOMCTL_assign_device: " tag is dropped in iommu_do_pci_domctl(), + since similar printk()-s elsewhere also don't log such a tag. + +This is XSA-302. + +Signed-off-by: Paul Durrant <paul.durrant@citrix.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com> +(cherry picked from commit ec99857f59f7f06236f11ca8b0b2303e5e745cc4) +--- + tools/libxl/libxl_pci.c | 25 +++++++++++- + xen/arch/x86/mm.c | 2 + + xen/common/domctl.c | 14 ++++++- + xen/drivers/passthrough/amd/pci_amd_iommu.c | 10 ++++- + xen/drivers/passthrough/iommu.c | 9 +++++ + xen/drivers/passthrough/pci.c | 59 ++++++++++++++++++++++------- + xen/drivers/passthrough/vtd/iommu.c | 40 ++++++++++++++++--- + xen/include/xen/pci.h | 3 ++ + 8 files changed, 138 insertions(+), 24 deletions(-) + +diff --git a/tools/libxl/libxl_pci.c b/tools/libxl/libxl_pci.c +index 88c324ea23..d6a23fb5f8 100644 +--- a/tools/libxl/libxl_pci.c ++++ b/tools/libxl/libxl_pci.c +@@ -754,6 +754,7 @@ static int libxl__device_pci_assignable_add(libxl__gc *gc, + libxl_device_pci *pcidev, + int rebind) + { ++ libxl_ctx *ctx = libxl__gc_owner(gc); + unsigned dom, bus, dev, func; + char *spath, *driver_path = NULL; + int rc; +@@ -779,7 +780,7 @@ static int libxl__device_pci_assignable_add(libxl__gc *gc, + } + if ( rc ) { + LOG(WARN, PCI_BDF" already assigned to pciback", dom, bus, dev, func); +- return 0; ++ goto quarantine; + } + + /* Check to see if there's already a driver that we need to unbind from */ +@@ -810,6 +811,19 @@ static int libxl__device_pci_assignable_add(libxl__gc *gc, + return ERROR_FAIL; + } + ++quarantine: ++ /* ++ * DOMID_IO is just a sentinel domain, without any actual mappings, ++ * so always pass XEN_DOMCTL_DEV_RDM_RELAXED to avoid assignment being ++ * unnecessarily denied. ++ */ ++ rc = xc_assign_device(ctx->xch, DOMID_IO, pcidev_encode_bdf(pcidev), ++ XEN_DOMCTL_DEV_RDM_RELAXED); ++ if ( rc < 0 ) { ++ LOG(ERROR, "failed to quarantine "PCI_BDF, dom, bus, dev, func); ++ return ERROR_FAIL; ++ } ++ + return 0; + } + +@@ -817,9 +831,18 @@ static int libxl__device_pci_assignable_remove(libxl__gc *gc, + libxl_device_pci *pcidev, + int rebind) + { ++ libxl_ctx *ctx = libxl__gc_owner(gc); + int rc; + char *driver_path; + ++ /* De-quarantine */ ++ rc = xc_deassign_device(ctx->xch, DOMID_IO, pcidev_encode_bdf(pcidev)); ++ if ( rc < 0 ) { ++ LOG(ERROR, "failed to de-quarantine "PCI_BDF, pcidev->domain, pcidev->bus, ++ pcidev->dev, pcidev->func); ++ return ERROR_FAIL; ++ } ++ + /* Unbind from pciback */ + if ( (rc=pciback_dev_is_assigned(gc, pcidev)) < 0 ) { + return ERROR_FAIL; +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 3557cd1178..11d753d8d2 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -295,9 +295,11 @@ void __init arch_init_memory(void) + * Initialise our DOMID_IO domain. + * This domain owns I/O pages that are within the range of the page_info + * array. Mappings occur at the priv of the caller. ++ * Quarantined PCI devices will be associated with this domain. + */ + dom_io = domain_create(DOMID_IO, NULL, false); + BUG_ON(IS_ERR(dom_io)); ++ INIT_LIST_HEAD(&dom_io->arch.pdev_list); + + /* + * Initialise our COW domain. +diff --git a/xen/common/domctl.c b/xen/common/domctl.c +index d08b6274e2..e3c4be2b48 100644 +--- a/xen/common/domctl.c ++++ b/xen/common/domctl.c +@@ -391,6 +391,16 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) + + switch ( op->cmd ) + { ++ case XEN_DOMCTL_assign_device: ++ case XEN_DOMCTL_deassign_device: ++ if ( op->domain == DOMID_IO ) ++ { ++ d = dom_io; ++ break; ++ } ++ else if ( op->domain == DOMID_INVALID ) ++ return -ESRCH; ++ /* fall through */ + case XEN_DOMCTL_test_assign_device: + if ( op->domain == DOMID_INVALID ) + { +@@ -412,7 +422,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) + + if ( !domctl_lock_acquire() ) + { +- if ( d ) ++ if ( d && d != dom_io ) + rcu_unlock_domain(d); + return hypercall_create_continuation( + __HYPERVISOR_domctl, "h", u_domctl); +@@ -1074,7 +1084,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) + domctl_lock_release(); + + domctl_out_unlock_domonly: +- if ( d ) ++ if ( d && d != dom_io ) + rcu_unlock_domain(d); + + if ( copyback && __copy_to_guest(u_domctl, op, 1) ) +diff --git a/xen/drivers/passthrough/amd/pci_amd_iommu.c b/xen/drivers/passthrough/amd/pci_amd_iommu.c +index 33a3798f36..15c13e1163 100644 +--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c ++++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c +@@ -120,6 +120,10 @@ static void amd_iommu_setup_domain_device( + u8 bus = pdev->bus; + const struct domain_iommu *hd = dom_iommu(domain); + ++ /* dom_io is used as a sentinel for quarantined devices */ ++ if ( domain == dom_io ) ++ return; ++ + BUG_ON( !hd->arch.root_table || !hd->arch.paging_mode || + !iommu->dev_table.buffer ); + +@@ -277,6 +281,10 @@ void amd_iommu_disable_domain_device(struct domain *domain, + int req_id; + u8 bus = pdev->bus; + ++ /* dom_io is used as a sentinel for quarantined devices */ ++ if ( domain == dom_io ) ++ return; ++ + BUG_ON ( iommu->dev_table.buffer == NULL ); + req_id = get_dma_requestor_id(iommu->seg, PCI_BDF2(bus, devfn)); + dte = iommu->dev_table.buffer + (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE); +@@ -363,7 +371,7 @@ static int amd_iommu_assign_device(struct domain *d, u8 devfn, + ivrs_mappings[req_id].read_permission); + } + +- return reassign_device(hardware_domain, d, devfn, pdev); ++ return reassign_device(pdev->domain, d, devfn, pdev); + } + + static void deallocate_next_page_table(struct page_info *pg, int level) +diff --git a/xen/drivers/passthrough/iommu.c b/xen/drivers/passthrough/iommu.c +index a6697d58fb..2762e1342f 100644 +--- a/xen/drivers/passthrough/iommu.c ++++ b/xen/drivers/passthrough/iommu.c +@@ -232,6 +232,9 @@ void iommu_teardown(struct domain *d) + { + struct domain_iommu *hd = dom_iommu(d); + ++ if ( d == dom_io ) ++ return; ++ + hd->status = IOMMU_STATUS_disabled; + hd->platform_ops->teardown(d); + tasklet_schedule(&iommu_pt_cleanup_tasklet); +@@ -241,6 +244,9 @@ int iommu_construct(struct domain *d) + { + struct domain_iommu *hd = dom_iommu(d); + ++ if ( d == dom_io ) ++ return 0; ++ + if ( hd->status == IOMMU_STATUS_initialized ) + return 0; + +@@ -521,6 +527,9 @@ int __init iommu_setup(void) + printk("I/O virtualisation %sabled\n", iommu_enabled ? "en" : "dis"); + if ( iommu_enabled ) + { ++ if ( iommu_domain_init(dom_io) ) ++ panic("Could not set up quarantine\n"); ++ + printk(" - Dom0 mode: %s\n", + iommu_hwdom_passthrough ? "Passthrough" : + iommu_hwdom_strict ? "Strict" : "Relaxed"); +diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c +index d7420bd8bf..d66a8a1daf 100644 +--- a/xen/drivers/passthrough/pci.c ++++ b/xen/drivers/passthrough/pci.c +@@ -1426,19 +1426,29 @@ static int iommu_remove_device(struct pci_dev *pdev) + return hd->platform_ops->remove_device(pdev->devfn, pci_to_dev(pdev)); + } + +-/* +- * If the device isn't owned by the hardware domain, it means it already +- * has been assigned to other domain, or it doesn't exist. +- */ + static int device_assigned(u16 seg, u8 bus, u8 devfn) + { + struct pci_dev *pdev; ++ int rc = 0; + + pcidevs_lock(); +- pdev = pci_get_pdev_by_domain(hardware_domain, seg, bus, devfn); ++ ++ pdev = pci_get_pdev(seg, bus, devfn); ++ ++ if ( !pdev ) ++ rc = -ENODEV; ++ /* ++ * If the device exists and it is not owned by either the hardware ++ * domain or dom_io then it must be assigned to a guest, or be ++ * hidden (owned by dom_xen). ++ */ ++ else if ( pdev->domain != hardware_domain && ++ pdev->domain != dom_io ) ++ rc = -EBUSY; ++ + pcidevs_unlock(); + +- return pdev ? 0 : -EBUSY; ++ return rc; + } + + static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) +@@ -1452,7 +1462,8 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) + + /* Prevent device assign if mem paging or mem sharing have been + * enabled for this domain */ +- if ( unlikely((is_hvm_domain(d) && ++ if ( d != dom_io && ++ unlikely((is_hvm_domain(d) && + d->arch.hvm.mem_sharing_enabled) || + vm_event_check_ring(d->vm_event_paging) || + p2m_get_hostp2m(d)->global_logdirty) ) +@@ -1468,12 +1479,20 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) + return rc; + } + +- pdev = pci_get_pdev_by_domain(hardware_domain, seg, bus, devfn); ++ pdev = pci_get_pdev(seg, bus, devfn); ++ ++ rc = -ENODEV; + if ( !pdev ) +- { +- rc = pci_get_pdev(seg, bus, devfn) ? -EBUSY : -ENODEV; + goto done; +- } ++ ++ rc = 0; ++ if ( d == pdev->domain ) ++ goto done; ++ ++ rc = -EBUSY; ++ if ( pdev->domain != hardware_domain && ++ pdev->domain != dom_io ) ++ goto done; + + if ( pdev->msix ) + msixtbl_init(d); +@@ -1496,6 +1515,10 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) + } + + done: ++ /* The device is assigned to dom_io so mark it as quarantined */ ++ if ( !rc && d == dom_io ) ++ pdev->quarantine = true; ++ + if ( !has_arch_pdevs(d) && has_iommu_pt(d) ) + iommu_teardown(d); + pcidevs_unlock(); +@@ -1508,6 +1531,7 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn) + { + const struct domain_iommu *hd = dom_iommu(d); + struct pci_dev *pdev = NULL; ++ struct domain *target; + int ret = 0; + + if ( !iommu_enabled || !hd->platform_ops ) +@@ -1518,12 +1542,16 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn) + if ( !pdev ) + return -ENODEV; + ++ /* De-assignment from dom_io should de-quarantine the device */ ++ target = (pdev->quarantine && pdev->domain != dom_io) ? ++ dom_io : hardware_domain; ++ + while ( pdev->phantom_stride ) + { + devfn += pdev->phantom_stride; + if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) ) + break; +- ret = hd->platform_ops->reassign_device(d, hardware_domain, devfn, ++ ret = hd->platform_ops->reassign_device(d, target, devfn, + pci_to_dev(pdev)); + if ( !ret ) + continue; +@@ -1534,7 +1562,7 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn) + } + + devfn = pdev->devfn; +- ret = hd->platform_ops->reassign_device(d, hardware_domain, devfn, ++ ret = hd->platform_ops->reassign_device(d, target, devfn, + pci_to_dev(pdev)); + if ( ret ) + { +@@ -1544,6 +1572,9 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn) + return ret; + } + ++ if ( pdev->domain == hardware_domain ) ++ pdev->quarantine = false; ++ + pdev->fault.count = 0; + + if ( !has_arch_pdevs(d) && has_iommu_pt(d) ) +@@ -1722,7 +1753,7 @@ int iommu_do_pci_domctl( + ret = hypercall_create_continuation(__HYPERVISOR_domctl, + "h", u_domctl); + else if ( ret ) +- printk(XENLOG_G_ERR "XEN_DOMCTL_assign_device: " ++ printk(XENLOG_G_ERR + "assign %04x:%02x:%02x.%u to dom%d failed (%d)\n", + seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), + d->domain_id, ret); +diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c +index 1db1cd9f2d..a8d1baa064 100644 +--- a/xen/drivers/passthrough/vtd/iommu.c ++++ b/xen/drivers/passthrough/vtd/iommu.c +@@ -1338,6 +1338,10 @@ int domain_context_mapping_one( + int agaw, rc, ret; + bool_t flush_dev_iotlb; + ++ /* dom_io is used as a sentinel for quarantined devices */ ++ if ( domain == dom_io ) ++ return 0; ++ + ASSERT(pcidevs_locked()); + spin_lock(&iommu->lock); + maddr = bus_to_context_maddr(iommu, bus); +@@ -1573,6 +1577,10 @@ int domain_context_unmap_one( + int iommu_domid, rc, ret; + bool_t flush_dev_iotlb; + ++ /* dom_io is used as a sentinel for quarantined devices */ ++ if ( domain == dom_io ) ++ return 0; ++ + ASSERT(pcidevs_locked()); + spin_lock(&iommu->lock); + +@@ -1705,6 +1713,10 @@ static int domain_context_unmap(struct domain *domain, u8 devfn, + goto out; + } + ++ /* dom_io is used as a sentinel for quarantined devices */ ++ if ( domain == dom_io ) ++ goto out; ++ + /* + * if no other devices under the same iommu owned by this domain, + * clear iommu in iommu_bitmap and clear domain_id in domid_bitmp +@@ -2441,6 +2453,15 @@ static int reassign_device_ownership( + if ( ret ) + return ret; + ++ if ( devfn == pdev->devfn ) ++ { ++ list_move(&pdev->domain_list, &dom_io->arch.pdev_list); ++ pdev->domain = dom_io; ++ } ++ ++ if ( !has_arch_pdevs(source) ) ++ vmx_pi_hooks_deassign(source); ++ + if ( !has_arch_pdevs(target) ) + vmx_pi_hooks_assign(target); + +@@ -2459,15 +2480,13 @@ static int reassign_device_ownership( + pdev->domain = target; + } + +- if ( !has_arch_pdevs(source) ) +- vmx_pi_hooks_deassign(source); +- + return ret; + } + + static int intel_iommu_assign_device( + struct domain *d, u8 devfn, struct pci_dev *pdev, u32 flag) + { ++ struct domain *s = pdev->domain; + struct acpi_rmrr_unit *rmrr; + int ret = 0, i; + u16 bdf, seg; +@@ -2510,8 +2529,8 @@ static int intel_iommu_assign_device( + } + } + +- ret = reassign_device_ownership(hardware_domain, d, devfn, pdev); +- if ( ret ) ++ ret = reassign_device_ownership(s, d, devfn, pdev); ++ if ( ret || d == dom_io ) + return ret; + + /* Setup rmrr identity mapping */ +@@ -2524,11 +2543,20 @@ static int intel_iommu_assign_device( + ret = rmrr_identity_mapping(d, 1, rmrr, flag); + if ( ret ) + { +- reassign_device_ownership(d, hardware_domain, devfn, pdev); ++ int rc; ++ ++ rc = reassign_device_ownership(d, s, devfn, pdev); + printk(XENLOG_G_ERR VTDPREFIX + " cannot map reserved region (%"PRIx64",%"PRIx64"] for Dom%d (%d)\n", + rmrr->base_address, rmrr->end_address, + d->domain_id, ret); ++ if ( rc ) ++ { ++ printk(XENLOG_ERR VTDPREFIX ++ " failed to reclaim %04x:%02x:%02x.%u from %pd (%d)\n", ++ seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), d, rc); ++ domain_crash(d); ++ } + break; + } + } +diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h +index 8b21e8dc84..a031fd6020 100644 +--- a/xen/include/xen/pci.h ++++ b/xen/include/xen/pci.h +@@ -88,6 +88,9 @@ struct pci_dev { + + nodeid_t node; /* NUMA node */ + ++ /* Device to be quarantined, don't automatically re-assign to dom0 */ ++ bool quarantine; ++ + /* Device with errata, ignore the BARs. */ + bool ignore_bars; + +-- +2.11.0 + diff --git a/system/xen/xsa/xsa303-0001-xen-arm32-entry-Split-__DEFINE_ENTRY_TRAP-in-two.patch b/system/xen/xsa/xsa303-0001-xen-arm32-entry-Split-__DEFINE_ENTRY_TRAP-in-two.patch new file mode 100644 index 0000000000000..afb1096c1d30b --- /dev/null +++ b/system/xen/xsa/xsa303-0001-xen-arm32-entry-Split-__DEFINE_ENTRY_TRAP-in-two.patch @@ -0,0 +1,74 @@ +From c8cb33fa64c9ccbfa2a494a9dad2e0a763c09176 Mon Sep 17 00:00:00 2001 +From: Julien Grall <julien.grall@arm.com> +Date: Tue, 1 Oct 2019 13:07:53 +0100 +Subject: [PATCH 1/4] xen/arm32: entry: Split __DEFINE_ENTRY_TRAP in two + +The preprocessing macro __DEFINE_ENTRY_TRAP is used to generate trap +entry function. While the macro is fairly small today, follow-up patches +will increase the size signicantly. + +In general, assembly macros are more readable as they allow you to name +parameters and avoid '\'. So the actual implementation of the trap is +now switched to an assembly macro. + +This is part of XSA-303. + +Reported-by: Julien Grall <Julien.Grall@arm.com> +Signed-off-by: Julien Grall <julien.grall@arm.com> +Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> +Reviewed-by: Andre Przywara <andre.przywara@arm.com> +--- + xen/arch/arm/arm32/entry.S | 34 +++++++++++++++++++--------------- + 1 file changed, 19 insertions(+), 15 deletions(-) + +diff --git a/xen/arch/arm/arm32/entry.S b/xen/arch/arm/arm32/entry.S +index 0b4cd19abd..4a762e04f1 100644 +--- a/xen/arch/arm/arm32/entry.S ++++ b/xen/arch/arm/arm32/entry.S +@@ -126,24 +126,28 @@ abort_guest_exit_end: + skip_check: + mov pc, lr + +-/* +- * Macro to define trap entry. The iflags corresponds to the list of +- * interrupts (Asynchronous Abort, IRQ, FIQ) to unmask. +- */ ++ /* ++ * Macro to define trap entry. The iflags corresponds to the list of ++ * interrupts (Asynchronous Abort, IRQ, FIQ) to unmask. ++ */ ++ .macro vector trap, iflags ++ SAVE_ALL ++ cpsie \iflags ++ adr lr, return_from_trap ++ mov r0, sp ++ /* ++ * Save the stack pointer in r11. It will be restored after the ++ * trap has been handled (see return_from_trap). ++ */ ++ mov r11, sp ++ bic sp, #7 /* Align the stack pointer (noop on guest trap) */ ++ b do_trap_\trap ++ .endm ++ + #define __DEFINE_TRAP_ENTRY(trap, iflags) \ + ALIGN; \ + trap_##trap: \ +- SAVE_ALL; \ +- cpsie iflags; \ +- adr lr, return_from_trap; \ +- mov r0, sp; \ +- /* \ +- * Save the stack pointer in r11. It will be restored after the \ +- * trap has been handled (see return_from_trap). \ +- */ \ +- mov r11, sp; \ +- bic sp, #7; /* Align the stack pointer (noop on guest trap) */ \ +- b do_trap_##trap ++ vector trap, iflags + + /* Trap handler which unmask IRQ/Abort, keep FIQ masked */ + #define DEFINE_TRAP_ENTRY(trap) __DEFINE_TRAP_ENTRY(trap, ai) +-- +2.11.0 + diff --git a/system/xen/xsa/xsa303-0002-xen-arm32-entry-Fold-the-macro-SAVE_ALL-in-the-macro.patch b/system/xen/xsa/xsa303-0002-xen-arm32-entry-Fold-the-macro-SAVE_ALL-in-the-macro.patch new file mode 100644 index 0000000000000..35f9c0475e62c --- /dev/null +++ b/system/xen/xsa/xsa303-0002-xen-arm32-entry-Fold-the-macro-SAVE_ALL-in-the-macro.patch @@ -0,0 +1,97 @@ +From be7379207c83fa74f8a6c22a8ea213f02714776f Mon Sep 17 00:00:00 2001 +From: Julien Grall <julien.grall@arm.com> +Date: Tue, 1 Oct 2019 13:15:48 +0100 +Subject: [PATCH 2/4] xen/arm32: entry: Fold the macro SAVE_ALL in the macro + vector + +Follow-up rework will require the macro vector to distinguish between +a trap from a guest vs while in the hypervisor. + +The macro SAVE_ALL already has code to distinguish between the two and +it is only called by the vector macro. So fold the former into the +latter. This will help to avoid duplicating the check. + +This is part of XSA-303. + +Reported-by: Julien Grall <Julien.Grall@arm.com> +Signed-off-by: Julien Grall <julien.grall@arm.com> +Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> +Reviewed-by: Andre Przywara <andre.przywara@arm.com> +--- + xen/arch/arm/arm32/entry.S | 46 +++++++++++++++++++++++----------------------- + 1 file changed, 23 insertions(+), 23 deletions(-) + +diff --git a/xen/arch/arm/arm32/entry.S b/xen/arch/arm/arm32/entry.S +index 4a762e04f1..150cbc0b4b 100644 +--- a/xen/arch/arm/arm32/entry.S ++++ b/xen/arch/arm/arm32/entry.S +@@ -13,27 +13,6 @@ + #define RESTORE_BANKED(mode) \ + RESTORE_ONE_BANKED(SP_##mode) ; RESTORE_ONE_BANKED(LR_##mode) ; RESTORE_ONE_BANKED(SPSR_##mode) + +-#define SAVE_ALL \ +- sub sp, #(UREGS_SP_usr - UREGS_sp); /* SP, LR, SPSR, PC */ \ +- push {r0-r12}; /* Save R0-R12 */ \ +- \ +- mrs r11, ELR_hyp; /* ELR_hyp is return address. */\ +- str r11, [sp, #UREGS_pc]; \ +- \ +- str lr, [sp, #UREGS_lr]; \ +- \ +- add r11, sp, #UREGS_kernel_sizeof+4; \ +- str r11, [sp, #UREGS_sp]; \ +- \ +- mrc CP32(r11, HSR); /* Save exception syndrome */ \ +- str r11, [sp, #UREGS_hsr]; \ +- \ +- mrs r11, SPSR_hyp; \ +- str r11, [sp, #UREGS_cpsr]; \ +- and r11, #PSR_MODE_MASK; \ +- cmp r11, #PSR_MODE_HYP; \ +- blne save_guest_regs +- + save_guest_regs: + #ifdef CONFIG_ARM32_HARDEN_BRANCH_PREDICTOR + /* +@@ -52,7 +31,7 @@ save_guest_regs: + ldr r11, =0xffffffff /* Clobber SP which is only valid for hypervisor frames. */ + str r11, [sp, #UREGS_sp] + SAVE_ONE_BANKED(SP_usr) +- /* LR_usr is the same physical register as lr and is saved in SAVE_ALL */ ++ /* LR_usr is the same physical register as lr and is saved by the caller */ + SAVE_BANKED(svc) + SAVE_BANKED(abt) + SAVE_BANKED(und) +@@ -131,7 +110,28 @@ skip_check: + * interrupts (Asynchronous Abort, IRQ, FIQ) to unmask. + */ + .macro vector trap, iflags +- SAVE_ALL ++ /* Save registers in the stack */ ++ sub sp, #(UREGS_SP_usr - UREGS_sp) /* SP, LR, SPSR, PC */ ++ push {r0-r12} /* Save R0-R12 */ ++ mrs r11, ELR_hyp /* ELR_hyp is return address */ ++ str r11, [sp, #UREGS_pc] ++ ++ str lr, [sp, #UREGS_lr] ++ ++ add r11, sp, #(UREGS_kernel_sizeof + 4) ++ ++ str r11, [sp, #UREGS_sp] ++ ++ mrc CP32(r11, HSR) /* Save exception syndrome */ ++ str r11, [sp, #UREGS_hsr] ++ ++ mrs r11, SPSR_hyp ++ str r11, [sp, #UREGS_cpsr] ++ and r11, #PSR_MODE_MASK ++ cmp r11, #PSR_MODE_HYP ++ blne save_guest_regs ++ ++ /* We are ready to handle the trap, setup the registers and jump. */ + cpsie \iflags + adr lr, return_from_trap + mov r0, sp +-- +2.11.0 + diff --git a/system/xen/xsa/xsa303-0003-xen-arm32-Don-t-blindly-unmask-interrupts-on-trap-wi.patch b/system/xen/xsa/xsa303-0003-xen-arm32-Don-t-blindly-unmask-interrupts-on-trap-wi.patch new file mode 100644 index 0000000000000..516845214880c --- /dev/null +++ b/system/xen/xsa/xsa303-0003-xen-arm32-Don-t-blindly-unmask-interrupts-on-trap-wi.patch @@ -0,0 +1,226 @@ +From 098fe877967870ffda2dfd9629a5fd272f6aacdc Mon Sep 17 00:00:00 2001 +From: Julien Grall <julien.grall@arm.com> +Date: Fri, 11 Oct 2019 17:49:28 +0100 +Subject: [PATCH 3/4] xen/arm32: Don't blindly unmask interrupts on trap + without a change of level + +Exception vectors will unmask interrupts regardless the state of them in +the interrupted context. + +One of the consequences is IRQ will be unmasked when receiving an +undefined instruction exception (used by WARN*) from the hypervisor. +This could result to unexpected behavior such as deadlock (if a lock was +shared with interrupts). + +In a nutshell, interrupts should only be unmasked when it is safe to do. +Xen only unmask IRQ and Abort interrupts, so the logic can stay simple. + +As vectors exceptions may be shared between guest and hypervisor, we now +need to have a different policy for the interrupts. + +On exception from hypervisor, each vector will select the list of +interrupts to inherit from the interrupted context. Any interrupts not +listed will be kept masked. + +On exception from the guest, the Abort and IRQ will be unmasked +depending on the exact vector. + +The interrupts will be kept unmasked when the vector cannot used by +either guest or hypervisor. + +Note that each vector is not anymore preceded by ALIGN. This is fine +because the alignment is already bigger than what we need. + +This is part of XSA-303. + +Reported-by: Julien Grall <Julien.Grall@arm.com> +Signed-off-by: Julien Grall <julien.grall@arm.com> +Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> +Reviewed-by: Andre Przywara <andre.przywara@arm.com> +--- + xen/arch/arm/arm32/entry.S | 138 +++++++++++++++++++++++++++++++++++---------- + 1 file changed, 109 insertions(+), 29 deletions(-) + +diff --git a/xen/arch/arm/arm32/entry.S b/xen/arch/arm/arm32/entry.S +index 150cbc0b4b..ec90cca093 100644 +--- a/xen/arch/arm/arm32/entry.S ++++ b/xen/arch/arm/arm32/entry.S +@@ -4,6 +4,17 @@ + #include <asm/alternative.h> + #include <public/xen.h> + ++/* ++ * Short-hands to defined the interrupts (A, I, F) ++ * ++ * _ means the interrupt state will not change ++ * X means the state of interrupt X will change ++ * ++ * To be used with msr cpsr_* only ++ */ ++#define IFLAGS_AIF PSR_ABT_MASK | PSR_IRQ_MASK | PSR_FIQ_MASK ++#define IFLAGS_A_F PSR_ABT_MASK | PSR_FIQ_MASK ++ + #define SAVE_ONE_BANKED(reg) mrs r11, reg; str r11, [sp, #UREGS_##reg] + #define RESTORE_ONE_BANKED(reg) ldr r11, [sp, #UREGS_##reg]; msr reg, r11 + +@@ -106,10 +117,18 @@ skip_check: + mov pc, lr + + /* +- * Macro to define trap entry. The iflags corresponds to the list of +- * interrupts (Asynchronous Abort, IRQ, FIQ) to unmask. ++ * Macro to define a trap entry. ++ * ++ * @guest_iflags: Optional list of interrupts to unmask when ++ * entering from guest context. As this is used with cpsie, ++ * the letter (a, i, f) should be used. ++ * ++ * @hyp_iflags: Optional list of interrupts to inherit when ++ * entering from hypervisor context. Any interrupts not ++ * listed will be kept unchanged. As this is used with cpsr_*, ++ * IFLAGS_* short-hands should be used. + */ +- .macro vector trap, iflags ++ .macro vector trap, guest_iflags=n, hyp_iflags=0 + /* Save registers in the stack */ + sub sp, #(UREGS_SP_usr - UREGS_sp) /* SP, LR, SPSR, PC */ + push {r0-r12} /* Save R0-R12 */ +@@ -127,12 +146,39 @@ skip_check: + + mrs r11, SPSR_hyp + str r11, [sp, #UREGS_cpsr] +- and r11, #PSR_MODE_MASK +- cmp r11, #PSR_MODE_HYP +- blne save_guest_regs + ++ /* ++ * We need to distinguish whether we came from guest or ++ * hypervisor context. ++ */ ++ and r0, r11, #PSR_MODE_MASK ++ cmp r0, #PSR_MODE_HYP ++ ++ bne 1f ++ /* ++ * Trap from the hypervisor ++ * ++ * Inherit the state of the interrupts from the hypervisor ++ * context. For that we need to use SPSR (stored in r11) and ++ * modify CPSR accordingly. ++ * ++ * CPSR = (CPSR & ~hyp_iflags) | (SPSR & hyp_iflags) ++ */ ++ mrs r10, cpsr ++ bic r10, r10, #\hyp_iflags ++ and r11, r11, #\hyp_iflags ++ orr r10, r10, r11 ++ msr cpsr_cx, r10 ++ b 2f ++ ++1: ++ /* Trap from the guest */ ++ bl save_guest_regs ++ .if \guest_iflags != n ++ cpsie \guest_iflags ++ .endif ++2: + /* We are ready to handle the trap, setup the registers and jump. */ +- cpsie \iflags + adr lr, return_from_trap + mov r0, sp + /* +@@ -144,20 +190,6 @@ skip_check: + b do_trap_\trap + .endm + +-#define __DEFINE_TRAP_ENTRY(trap, iflags) \ +- ALIGN; \ +-trap_##trap: \ +- vector trap, iflags +- +-/* Trap handler which unmask IRQ/Abort, keep FIQ masked */ +-#define DEFINE_TRAP_ENTRY(trap) __DEFINE_TRAP_ENTRY(trap, ai) +- +-/* Trap handler which unmask Abort, keep IRQ/FIQ masked */ +-#define DEFINE_TRAP_ENTRY_NOIRQ(trap) __DEFINE_TRAP_ENTRY(trap, a) +- +-/* Trap handler which unmask IRQ, keep Abort/FIQ masked */ +-#define DEFINE_TRAP_ENTRY_NOABORT(trap) __DEFINE_TRAP_ENTRY(trap, i) +- + .align 5 + GLOBAL(hyp_traps_vector) + b trap_reset /* 0x00 - Reset */ +@@ -228,14 +260,62 @@ decode_vectors: + + #endif /* CONFIG_HARDEN_BRANCH_PREDICTOR */ + +-DEFINE_TRAP_ENTRY(reset) +-DEFINE_TRAP_ENTRY(undefined_instruction) +-DEFINE_TRAP_ENTRY(hypervisor_call) +-DEFINE_TRAP_ENTRY(prefetch_abort) +-DEFINE_TRAP_ENTRY(guest_sync) +-DEFINE_TRAP_ENTRY_NOIRQ(irq) +-DEFINE_TRAP_ENTRY_NOIRQ(fiq) +-DEFINE_TRAP_ENTRY_NOABORT(data_abort) ++/* Vector not used by the Hypervisor. */ ++trap_reset: ++ vector reset ++ ++/* ++ * Vector only used by the Hypervisor. ++ * ++ * While the exception can be executed with all the interrupts (e.g. ++ * IRQ) unmasked, the interrupted context may have purposefully masked ++ * some of them. So we want to inherit the state from the interrupted ++ * context. ++ */ ++trap_undefined_instruction: ++ vector undefined_instruction, hyp_iflags=IFLAGS_AIF ++ ++/* We should never reach this trap */ ++trap_hypervisor_call: ++ vector hypervisor_call ++ ++/* ++ * Vector only used by the hypervisor. ++ * ++ * While the exception can be executed with all the interrupts (e.g. ++ * IRQ) unmasked, the interrupted context may have purposefully masked ++ * some of them. So we want to inherit the state from the interrupted ++ * context. ++ */ ++trap_prefetch_abort: ++ vector prefetch_abort, hyp_iflags=IFLAGS_AIF ++ ++/* ++ * Vector only used by the hypervisor. ++ * ++ * Data Abort should be rare and most likely fatal. It is best to not ++ * unmask any interrupts to limit the amount of code that can run before ++ * the Data Abort is treated. ++ */ ++trap_data_abort: ++ vector data_abort ++ ++/* Vector only used by the guest. We can unmask Abort/IRQ. */ ++trap_guest_sync: ++ vector guest_sync, guest_iflags=ai ++ ++ ++/* Vector used by the hypervisor and the guest. */ ++trap_irq: ++ vector irq, guest_iflags=a, hyp_iflags=IFLAGS_A_F ++ ++/* ++ * Vector used by the hypervisor and the guest. ++ * ++ * FIQ are not meant to happen, so we don't unmask any interrupts. ++ */ ++trap_fiq: ++ vector fiq + + return_from_trap: + /* +-- +2.11.0 + diff --git a/system/xen/xsa/xsa303-0004-xen-arm64-Don-t-blindly-unmask-interrupts-on-trap-wi.patch b/system/xen/xsa/xsa303-0004-xen-arm64-Don-t-blindly-unmask-interrupts-on-trap-wi.patch new file mode 100644 index 0000000000000..106cbf98f1771 --- /dev/null +++ b/system/xen/xsa/xsa303-0004-xen-arm64-Don-t-blindly-unmask-interrupts-on-trap-wi.patch @@ -0,0 +1,114 @@ +From c6d290ce157a044dec417fdda8db71e41a37d744 Mon Sep 17 00:00:00 2001 +From: Julien Grall <julien.grall@arm.com> +Date: Mon, 7 Oct 2019 18:10:56 +0100 +Subject: [PATCH 4/4] xen/arm64: Don't blindly unmask interrupts on trap + without a change of level + +Some of the traps without a change of the level (i.e. hypervisor -> +hypervisor) will unmask interrupts regardless the state of them in the +interrupted context. + +One of the consequences is IRQ will be unmasked when receiving a +synchronous exception (used by WARN*()). This could result to unexpected +behavior such as deadlock (if a lock was shared with interrupts). + +In a nutshell, interrupts should only be unmasked when it is safe to +do. Xen only unmask IRQ and Abort interrupts, so the logic can stay +simple: + - hyp_error: All the interrupts are now kept masked. SError should + be pretty rare and if ever happen then we most likely want to + avoid any other interrupts to be generated. The potential main + "caller" is during virtual SError synchronization on the exit + path from the guest (see check_pending_vserror). + + - hyp_sync: The interrupts state is inherited from the interrupted + context. + + - hyp_irq: All the interrupts but IRQ state are inherited from the + interrupted context. IRQ is kept masked. + +This is part of XSA-303. + +Reported-by: Julien Grall <Julien.Grall@arm.com> +Signed-off-by: Julien Grall <julien.grall@arm.com> +Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> +Reviewed-by: Andre Przywara <andre.przywara@arm.com> +--- + xen/arch/arm/arm64/entry.S | 47 ++++++++++++++++++++++++++++++++++++++++++---- + 1 file changed, 43 insertions(+), 4 deletions(-) + +diff --git a/xen/arch/arm/arm64/entry.S b/xen/arch/arm/arm64/entry.S +index 2d9a2713a1..3e41ba65b6 100644 +--- a/xen/arch/arm/arm64/entry.S ++++ b/xen/arch/arm/arm64/entry.S +@@ -188,24 +188,63 @@ hyp_error_invalid: + entry hyp=1 + invalid BAD_ERROR + ++/* ++ * SError received while running in the hypervisor mode. ++ * ++ * Technically, we could unmask the IRQ if it were unmasked in the ++ * interrupted context. However, this require to check the PSTATE. For ++ * simplicity, as SError should be rare and potentially fatal, ++ * all interrupts are kept masked. ++ */ + hyp_error: + entry hyp=1 +- msr daifclr, #2 + mov x0, sp + bl do_trap_hyp_serror + exit hyp=1 + +-/* Traps taken in Current EL with SP_ELx */ ++/* ++ * Synchronous exception received while running in the hypervisor mode. ++ * ++ * While the exception could be executed with all the interrupts (e.g. ++ * IRQ) unmasked, the interrupted context may have purposefully masked ++ * some of them. So we want to inherit the state from the interrupted ++ * context. ++ */ + hyp_sync: + entry hyp=1 +- msr daifclr, #6 ++ ++ /* Inherit interrupts */ ++ mrs x0, SPSR_el2 ++ and x0, x0, #(PSR_DBG_MASK | PSR_ABT_MASK | PSR_IRQ_MASK | PSR_FIQ_MASK) ++ msr daif, x0 ++ + mov x0, sp + bl do_trap_hyp_sync + exit hyp=1 + ++/* ++ * IRQ received while running in the hypervisor mode. ++ * ++ * While the exception could be executed with all the interrupts but IRQ ++ * unmasked, the interrupted context may have purposefully masked some ++ * of them. So we want to inherit the state from the interrupt context ++ * and keep IRQ masked. ++ * ++ * XXX: We may want to consider an ordering between interrupts (e.g. if ++ * SError are masked, then IRQ should be masked too). However, this ++ * would require some rework in some paths (e.g. panic, livepatch) to ++ * ensure the ordering is enforced everywhere. ++ */ + hyp_irq: + entry hyp=1 +- msr daifclr, #4 ++ ++ /* Inherit D, A, F interrupts and keep I masked */ ++ mrs x0, SPSR_el2 ++ mov x1, #(PSR_DBG_MASK | PSR_ABT_MASK | PSR_FIQ_MASK) ++ and x0, x0, x1 ++ orr x0, x0, #PSR_IRQ_MASK ++ msr daif, x0 ++ + mov x0, sp + bl do_trap_irq + exit hyp=1 +-- +2.11.0 + diff --git a/system/xen/xsa/xsa304-4.12-1.patch b/system/xen/xsa/xsa304-4.12-1.patch new file mode 100644 index 0000000000000..c2ed2c2ced89f --- /dev/null +++ b/system/xen/xsa/xsa304-4.12-1.patch @@ -0,0 +1,71 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86/vtd: Hide superpage support for SandyBridge IOMMUs + +Something causes SandyBridge IOMMUs to choke when sharing EPT pagetables, and +an EPT superpage gets shattered. The root cause is still under investigation, +but the end result is unusable in combination with CVE-2018-12207 protections. + +This is part of XSA-304 / CVE-2018-12207 + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> + +diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h +index 16eada9fa2..a71c8b0f84 100644 +--- a/xen/drivers/passthrough/vtd/extern.h ++++ b/xen/drivers/passthrough/vtd/extern.h +@@ -97,6 +97,8 @@ void vtd_ops_postamble_quirk(struct iommu* iommu); + int __must_check me_wifi_quirk(struct domain *domain, + u8 bus, u8 devfn, int map); + void pci_vtd_quirk(const struct pci_dev *); ++void quirk_iommu_caps(struct iommu *iommu); ++ + bool_t platform_supports_intremap(void); + bool_t platform_supports_x2apic(void); + +diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c +index b3664ecbe0..5d34f75306 100644 +--- a/xen/drivers/passthrough/vtd/iommu.c ++++ b/xen/drivers/passthrough/vtd/iommu.c +@@ -1215,6 +1215,8 @@ int __init iommu_alloc(struct acpi_drhd_unit *drhd) + if ( !(iommu->cap + 1) || !(iommu->ecap + 1) ) + return -ENODEV; + ++ quirk_iommu_caps(iommu); ++ + if ( cap_fault_reg_offset(iommu->cap) + + cap_num_fault_regs(iommu->cap) * PRIMARY_FAULT_REG_LEN >= PAGE_SIZE || + ecap_iotlb_offset(iommu->ecap) >= PAGE_SIZE ) +diff --git a/xen/drivers/passthrough/vtd/quirks.c b/xen/drivers/passthrough/vtd/quirks.c +index d6db862678..b02688e316 100644 +--- a/xen/drivers/passthrough/vtd/quirks.c ++++ b/xen/drivers/passthrough/vtd/quirks.c +@@ -540,3 +540,28 @@ void pci_vtd_quirk(const struct pci_dev *pdev) + break; + } + } ++ ++void __init quirk_iommu_caps(struct iommu *iommu) ++{ ++ /* ++ * IOMMU Quirks: ++ * ++ * SandyBridge IOMMUs claim support for 2M and 1G superpages, but don't ++ * implement superpages internally. ++ * ++ * There are issues changing the walk length under in-flight DMA, which ++ * has manifested as incompatibility between EPT/IOMMU sharing and the ++ * workaround for CVE-2018-12207 / XSA-304. Hide the superpages ++ * capabilities in the IOMMU, which will prevent Xen from sharing the EPT ++ * and IOMMU pagetables. ++ * ++ * Detection of SandyBridge unfortunately has to be done by processor ++ * model because the client parts don't expose their IOMMUs as PCI devices ++ * we could match with a Device ID. ++ */ ++ if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && ++ boot_cpu_data.x86 == 6 && ++ (boot_cpu_data.x86_model == 0x2a || ++ boot_cpu_data.x86_model == 0x2d) ) ++ iommu->cap &= ~(0xful << 34); ++} diff --git a/system/xen/xsa/xsa304-4.12-2.patch b/system/xen/xsa/xsa304-4.12-2.patch new file mode 100644 index 0000000000000..66d4301838e6e --- /dev/null +++ b/system/xen/xsa/xsa304-4.12-2.patch @@ -0,0 +1,272 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86/vtx: Disable executable EPT superpages to work around + CVE-2018-12207 + +CVE-2018-12207 covers a set of errata on various Intel processors, whereby a +machine check exception can be generated in a corner case when an executable +mapping changes size or cacheability without TLB invalidation. HVM guest +kernels can trigger this to DoS the host. + +To mitigate, in affected hardware, all EPT superpages are marked NX. When an +instruction fetch violation is observed against the superpage, the superpage +is shattered to 4k and has execute permissions restored. This prevents the +guest kernel from being able to create the necessary preconditions in the iTLB +to exploit the vulnerability. + +This does come with a workload-dependent performance overhead, caused by +increased TLB pressure. Performance can be restored, if guest kernels are +trusted not to mount an attack, by specifying ept=exec-sp on the command line. + +This is part of XSA-304 / CVE-2018-12207 + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Acked-by: George Dunlap <george.dunlap@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> + +diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc +index 85081fdc94..e283017015 100644 +--- a/docs/misc/xen-command-line.pandoc ++++ b/docs/misc/xen-command-line.pandoc +@@ -895,7 +895,7 @@ Controls for interacting with the system Extended Firmware Interface. + uncacheable. + + ### ept +-> `= List of [ ad=<bool>, pml=<bool> ]` ++> `= List of [ ad=<bool>, pml=<bool>, exec-sp=<bool> ]` + + > Applicability: Intel + +@@ -926,6 +926,16 @@ introduced with the Nehalem architecture. + disable PML. `pml=0` can be used to prevent the use of PML on otherwise + capable hardware. + ++* The `exec-sp` boolean controls whether EPT superpages with execute ++ permissions are permitted. In general this is good for performance. ++ ++ However, on processors vulnerable CVE-2018-12207, HVM guest kernels can ++ use executable superpages to crash the host. By default, executable ++ superpages are disabled on affected hardware. ++ ++ If HVM guest kernels are trusted not to mount a DoS against the system, ++ this option can enabled to regain performance. ++ + ### extra_guest_irqs + > `= [<domU number>][,<dom0 number>]` + +diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c +index 2089a77270..84191d4e4b 100644 +--- a/xen/arch/x86/hvm/hvm.c ++++ b/xen/arch/x86/hvm/hvm.c +@@ -1814,6 +1814,24 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla, + break; + } + ++ /* ++ * Workaround for XSA-304 / CVE-2018-12207. If we take an execution ++ * fault against a non-executable superpage, shatter it to regain ++ * execute permissions. ++ */ ++ if ( page_order > 0 && npfec.insn_fetch && npfec.present && !violation ) ++ { ++ int res = p2m_set_entry(p2m, _gfn(gfn), mfn, PAGE_ORDER_4K, ++ p2mt, p2ma); ++ ++ if ( res ) ++ printk(XENLOG_ERR "Failed to shatter gfn %"PRI_gfn": %d\n", ++ gfn, res); ++ ++ rc = !res; ++ goto out_put_gfn; ++ } ++ + if ( violation ) + { + /* Should #VE be emulated for this fault? */ +diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c +index 56519fee84..ec5ab860ad 100644 +--- a/xen/arch/x86/hvm/vmx/vmcs.c ++++ b/xen/arch/x86/hvm/vmx/vmcs.c +@@ -67,6 +67,7 @@ integer_param("ple_window", ple_window); + + static bool __read_mostly opt_ept_pml = true; + static s8 __read_mostly opt_ept_ad = -1; ++int8_t __read_mostly opt_ept_exec_sp = -1; + + static int __init parse_ept_param(const char *s) + { +@@ -82,6 +83,8 @@ static int __init parse_ept_param(const char *s) + opt_ept_ad = val; + else if ( (val = parse_boolean("pml", s, ss)) >= 0 ) + opt_ept_pml = val; ++ else if ( (val = parse_boolean("exec-sp", s, ss)) >= 0 ) ++ opt_ept_exec_sp = val; + else + rc = -EINVAL; + +diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c +index 26b7ddb5fe..28cba8ec28 100644 +--- a/xen/arch/x86/hvm/vmx/vmx.c ++++ b/xen/arch/x86/hvm/vmx/vmx.c +@@ -2445,6 +2445,102 @@ static void pi_notification_interrupt(struct cpu_user_regs *regs) + static void __init lbr_tsx_fixup_check(void); + static void __init bdw_erratum_bdf14_fixup_check(void); + ++/* ++ * Calculate whether the CPU is vulnerable to Instruction Fetch page ++ * size-change MCEs. ++ */ ++static bool __init has_if_pschange_mc(void) ++{ ++ uint64_t caps = 0; ++ ++ /* ++ * If we are virtualised, there is nothing we can do. Our EPT tables are ++ * shadowed by our hypervisor, and not walked by hardware. ++ */ ++ if ( cpu_has_hypervisor ) ++ return false; ++ ++ if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) ) ++ rdmsrl(MSR_ARCH_CAPABILITIES, caps); ++ ++ if ( caps & ARCH_CAPS_IF_PSCHANGE_MC_NO ) ++ return false; ++ ++ /* ++ * IF_PSCHANGE_MC is only known to affect Intel Family 6 processors at ++ * this time. ++ */ ++ if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || ++ boot_cpu_data.x86 != 6 ) ++ return false; ++ ++ switch ( boot_cpu_data.x86_model ) ++ { ++ /* ++ * Core processors since at least Nehalem are vulnerable. ++ */ ++ case 0x1f: /* Auburndale / Havendale */ ++ case 0x1e: /* Nehalem */ ++ case 0x1a: /* Nehalem EP */ ++ case 0x2e: /* Nehalem EX */ ++ case 0x25: /* Westmere */ ++ case 0x2c: /* Westmere EP */ ++ case 0x2f: /* Westmere EX */ ++ case 0x2a: /* SandyBridge */ ++ case 0x2d: /* SandyBridge EP/EX */ ++ case 0x3a: /* IvyBridge */ ++ case 0x3e: /* IvyBridge EP/EX */ ++ case 0x3c: /* Haswell */ ++ case 0x3f: /* Haswell EX/EP */ ++ case 0x45: /* Haswell D */ ++ case 0x46: /* Haswell H */ ++ case 0x3d: /* Broadwell */ ++ case 0x47: /* Broadwell H */ ++ case 0x4f: /* Broadwell EP/EX */ ++ case 0x56: /* Broadwell D */ ++ case 0x4e: /* Skylake M */ ++ case 0x5e: /* Skylake D */ ++ case 0x55: /* Skylake-X / Cascade Lake */ ++ case 0x8e: /* Kaby / Coffee / Whiskey Lake M */ ++ case 0x9e: /* Kaby / Coffee / Whiskey Lake D */ ++ return true; ++ ++ /* ++ * Atom processors are not vulnerable. ++ */ ++ case 0x1c: /* Pineview */ ++ case 0x26: /* Lincroft */ ++ case 0x27: /* Penwell */ ++ case 0x35: /* Cloverview */ ++ case 0x36: /* Cedarview */ ++ case 0x37: /* Baytrail / Valleyview (Silvermont) */ ++ case 0x4d: /* Avaton / Rangely (Silvermont) */ ++ case 0x4c: /* Cherrytrail / Brasswell */ ++ case 0x4a: /* Merrifield */ ++ case 0x5a: /* Moorefield */ ++ case 0x5c: /* Goldmont */ ++ case 0x5d: /* SoFIA 3G Granite/ES2.1 */ ++ case 0x65: /* SoFIA LTE AOSP */ ++ case 0x5f: /* Denverton */ ++ case 0x6e: /* Cougar Mountain */ ++ case 0x75: /* Lightning Mountain */ ++ case 0x7a: /* Gemini Lake */ ++ case 0x86: /* Jacobsville */ ++ ++ /* ++ * Knights processors are not vulnerable. ++ */ ++ case 0x57: /* Knights Landing */ ++ case 0x85: /* Knights Mill */ ++ return false; ++ ++ default: ++ printk("Unrecognised CPU model %#x - assuming vulnerable to IF_PSCHANGE_MC\n", ++ boot_cpu_data.x86_model); ++ return true; ++ } ++} ++ + const struct hvm_function_table * __init start_vmx(void) + { + set_in_cr4(X86_CR4_VMXE); +@@ -2465,6 +2561,17 @@ const struct hvm_function_table * __init start_vmx(void) + */ + if ( cpu_has_vmx_ept && (cpu_has_vmx_pat || opt_force_ept) ) + { ++ bool cpu_has_bug_pschange_mc = has_if_pschange_mc(); ++ ++ if ( opt_ept_exec_sp == -1 ) ++ { ++ /* Default to non-executable superpages on vulnerable hardware. */ ++ opt_ept_exec_sp = !cpu_has_bug_pschange_mc; ++ ++ if ( cpu_has_bug_pschange_mc ) ++ printk("VMX: Disabling executable EPT superpages due to CVE-2018-12207\n"); ++ } ++ + vmx_function_table.hap_supported = 1; + vmx_function_table.altp2m_supported = 1; + +diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c +index 952ebad82f..834d4798c8 100644 +--- a/xen/arch/x86/mm/p2m-ept.c ++++ b/xen/arch/x86/mm/p2m-ept.c +@@ -174,6 +174,12 @@ static void ept_p2m_type_to_flags(struct p2m_domain *p2m, ept_entry_t *entry, + break; + } + ++ /* ++ * Don't create executable superpages if we need to shatter them to ++ * protect against CVE-2018-12207. ++ */ ++ if ( !opt_ept_exec_sp && is_epte_superpage(entry) ) ++ entry->x = 0; + } + + #define GUEST_TABLE_MAP_FAILED 0 +diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h b/xen/include/asm-x86/hvm/vmx/vmx.h +index ebaa74449b..371b912887 100644 +--- a/xen/include/asm-x86/hvm/vmx/vmx.h ++++ b/xen/include/asm-x86/hvm/vmx/vmx.h +@@ -28,6 +28,8 @@ + #include <asm/hvm/trace.h> + #include <asm/hvm/vmx/vmcs.h> + ++extern int8_t opt_ept_exec_sp; ++ + typedef union { + struct { + u64 r : 1, /* bit 0 - Read permission */ +diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h +index 637259bd1f..32746aa8ae 100644 +--- a/xen/include/asm-x86/msr-index.h ++++ b/xen/include/asm-x86/msr-index.h +@@ -52,6 +52,7 @@ + #define ARCH_CAPS_SKIP_L1DFL (_AC(1, ULL) << 3) + #define ARCH_CAPS_SSB_NO (_AC(1, ULL) << 4) + #define ARCH_CAPS_MDS_NO (_AC(1, ULL) << 5) ++#define ARCH_CAPS_IF_PSCHANGE_MC_NO (_AC(1, ULL) << 6) + + #define MSR_FLUSH_CMD 0x0000010b + #define FLUSH_CMD_L1D (_AC(1, ULL) << 0) diff --git a/system/xen/xsa/xsa304-4.12-3.patch b/system/xen/xsa/xsa304-4.12-3.patch new file mode 100644 index 0000000000000..04b4c454f24fa --- /dev/null +++ b/system/xen/xsa/xsa304-4.12-3.patch @@ -0,0 +1,108 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86/vtx: Allow runtime modification of the exec-sp setting + +See patch for details. + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: George Dunlap <george.dunlap@citrix.com> + +diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc +index e283017015..84221fe60a 100644 +--- a/docs/misc/xen-command-line.pandoc ++++ b/docs/misc/xen-command-line.pandoc +@@ -936,6 +936,21 @@ introduced with the Nehalem architecture. + If HVM guest kernels are trusted not to mount a DoS against the system, + this option can enabled to regain performance. + ++ This boolean may be modified at runtime using `xl set-parameters ++ ept=[no-]exec-sp` to switch between fast and secure. ++ ++ * When switching from secure to fast, preexisting HVM domains will run ++ at their current performance until they are rebooted; new domains will ++ run without any overhead. ++ ++ * When switching from fast to secure, all HVM domains will immediately ++ suffer a performance penalty. ++ ++ **Warning: No guarantee is made that this runtime option will be retained ++ indefinitely, or that it will retain this exact behaviour. It is ++ intended as an emergency option for people who first chose fast, then ++ change their minds to secure, and wish not to reboot.** ++ + ### extra_guest_irqs + > `= [<domU number>][,<dom0 number>]` + +diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c +index ec5ab860ad..c4d8a5ba78 100644 +--- a/xen/arch/x86/hvm/vmx/vmcs.c ++++ b/xen/arch/x86/hvm/vmx/vmcs.c +@@ -95,6 +95,41 @@ static int __init parse_ept_param(const char *s) + } + custom_param("ept", parse_ept_param); + ++static int parse_ept_param_runtime(const char *s) ++{ ++ int val; ++ ++ if ( !cpu_has_vmx_ept || !hvm_funcs.hap_supported || ++ !(hvm_funcs.hap_capabilities & ++ (HVM_HAP_SUPERPAGE_2MB | HVM_HAP_SUPERPAGE_1GB)) ) ++ { ++ printk("VMX: EPT not available, or not in use - ignoring\n"); ++ return 0; ++ } ++ ++ if ( (val = parse_boolean("exec-sp", s, NULL)) < 0 ) ++ return -EINVAL; ++ ++ if ( val != opt_ept_exec_sp ) ++ { ++ struct domain *d; ++ ++ opt_ept_exec_sp = val; ++ ++ rcu_read_lock(&domlist_read_lock); ++ for_each_domain ( d ) ++ if ( paging_mode_hap(d) ) ++ p2m_change_entry_type_global(d, p2m_ram_rw, p2m_ram_rw); ++ rcu_read_unlock(&domlist_read_lock); ++ } ++ ++ printk("VMX: EPT executable superpages %sabled\n", ++ val ? "en" : "dis"); ++ ++ return 0; ++} ++custom_runtime_only_param("ept", parse_ept_param_runtime); ++ + /* Dynamic (run-time adjusted) execution control flags. */ + u32 vmx_pin_based_exec_control __read_mostly; + u32 vmx_cpu_based_exec_control __read_mostly; +diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c +index f518f86493..16608098b1 100644 +--- a/xen/arch/x86/mm/p2m.c ++++ b/xen/arch/x86/mm/p2m.c +@@ -289,15 +289,20 @@ static void change_entry_type_global(struct p2m_domain *p2m, + p2m_type_t ot, p2m_type_t nt) + { + p2m->change_entry_type_global(p2m, ot, nt); +- p2m->global_logdirty = (nt == p2m_ram_logdirty); ++ /* Don't allow 'recalculate' operations to change the logdirty state. */ ++ if ( ot != nt ) ++ p2m->global_logdirty = (nt == p2m_ram_logdirty); + } + ++/* ++ * May be called with ot = nt = p2m_ram_rw for its side effect of ++ * recalculating all PTEs in the p2m. ++ */ + void p2m_change_entry_type_global(struct domain *d, + p2m_type_t ot, p2m_type_t nt) + { + struct p2m_domain *hostp2m = p2m_get_hostp2m(d); + +- ASSERT(ot != nt); + ASSERT(p2m_is_changeable(ot) && p2m_is_changeable(nt)); + + p2m_lock(hostp2m); diff --git a/system/xen/xsa/xsa305-4.12-1.patch b/system/xen/xsa/xsa305-4.12-1.patch new file mode 100644 index 0000000000000..e1a91a52640b6 --- /dev/null +++ b/system/xen/xsa/xsa305-4.12-1.patch @@ -0,0 +1,288 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86/tsx: Introduce tsx= to use MSR_TSX_CTRL when available + +To protect against the TSX Async Abort speculative vulnerability, Intel have +released new microcode for affected parts which introduce the MSR_TSX_CTRL +control, which allows TSX to be turned off. This will be architectural on +future parts. + +Introduce tsx= to provide a global on/off for TSX, including its enumeration +via CPUID. Provide stub virtualisation of this MSR, as it is not exposed to +guests at the moment. + +VMs may have booted before microcode is loaded, or before hosts have rebooted, +and they still want to migrate freely. A VM which booted seeing TSX can +migrate safely to hosts with TSX disabled - TSX will start unconditionally +aborting, but still behave in a manner compatible with the ABI. + +The guest-visible behaviour is equivalent to late loading the microcode and +setting the RTM_DISABLE bit in the course of live patching. + +This is part of XSA-305 / CVE-2019-11135 + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> + +diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc +index e283017015..b7e1bf8e8b 100644 +--- a/docs/misc/xen-command-line.pandoc ++++ b/docs/misc/xen-command-line.pandoc +@@ -2033,6 +2033,20 @@ Xen version. + ### tsc (x86) + > `= unstable | skewed | stable:socket` + ++### tsx ++ = <bool> ++ ++ Applicability: x86 ++ Default: true ++ ++Controls for the use of Transactional Synchronization eXtensions. ++ ++On Intel parts released in Q3 2019 (with updated microcode), and future parts, ++a control has been introduced which allows TSX to be turned off. ++ ++On systems with the ability to turn TSX off, this boolean offers system wide ++control of whether TSX is enabled or disabled. ++ + ### ucode (x86) + > `= [<integer> | scan]` + +diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile +index 8a8d8f060f..9b9a4435fb 100644 +--- a/xen/arch/x86/Makefile ++++ b/xen/arch/x86/Makefile +@@ -66,6 +66,7 @@ obj-y += sysctl.o + obj-y += time.o + obj-y += trace.o + obj-y += traps.o ++obj-y += tsx.o + obj-y += usercopy.o + obj-y += x86_emulate.o + obj-$(CONFIG_TBOOT) += tboot.o +diff --git a/xen/arch/x86/cpuid.c b/xen/arch/x86/cpuid.c +index 57e80694f2..1727497459 100644 +--- a/xen/arch/x86/cpuid.c ++++ b/xen/arch/x86/cpuid.c +@@ -524,6 +524,20 @@ void recalculate_cpuid_policy(struct domain *d) + if ( cpu_has_itsc && (d->disable_migrate || d->arch.vtsc) ) + __set_bit(X86_FEATURE_ITSC, max_fs); + ++ /* ++ * On hardware with MSR_TSX_CTRL, the admin may have elected to disable ++ * TSX and hide the feature bits. Migrating-in VMs may have been booted ++ * pre-mitigation when the TSX features were visbile. ++ * ++ * This situation is compatible (albeit with a perf hit to any TSX code in ++ * the guest), so allow the feature bits to remain set. ++ */ ++ if ( cpu_has_tsx_ctrl ) ++ { ++ __set_bit(X86_FEATURE_HLE, max_fs); ++ __set_bit(X86_FEATURE_RTM, max_fs); ++ } ++ + /* Clamp the toolstacks choices to reality. */ + for ( i = 0; i < ARRAY_SIZE(fs); i++ ) + fs[i] &= max_fs[i]; +diff --git a/xen/arch/x86/msr.c b/xen/arch/x86/msr.c +index 56de0fe9e1..c2722d7c73 100644 +--- a/xen/arch/x86/msr.c ++++ b/xen/arch/x86/msr.c +@@ -132,6 +132,7 @@ int guest_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val) + case MSR_FLUSH_CMD: + /* Write-only */ + case MSR_TSX_FORCE_ABORT: ++ case MSR_TSX_CTRL: + /* Not offered to guests. */ + goto gp_fault; + +@@ -260,6 +261,7 @@ int guest_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val) + case MSR_ARCH_CAPABILITIES: + /* Read-only */ + case MSR_TSX_FORCE_ABORT: ++ case MSR_TSX_CTRL: + /* Not offered to guests. */ + goto gp_fault; + +diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c +index cf790f36ef..c1c7c44000 100644 +--- a/xen/arch/x86/setup.c ++++ b/xen/arch/x86/setup.c +@@ -1594,6 +1594,8 @@ void __init noreturn __start_xen(unsigned long mbi_p) + + early_microcode_init(); + ++ tsx_init(); /* Needs microcode. May change HLE/RTM feature bits. */ ++ + identify_cpu(&boot_cpu_data); + + set_in_cr4(X86_CR4_OSFXSR | X86_CR4_OSXMMEXCPT); +diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c +index 737a44f055..e21cf0a310 100644 +--- a/xen/arch/x86/smpboot.c ++++ b/xen/arch/x86/smpboot.c +@@ -376,6 +376,8 @@ void start_secondary(void *unused) + if ( boot_cpu_has(X86_FEATURE_IBRSB) ) + wrmsrl(MSR_SPEC_CTRL, default_xen_spec_ctrl); + ++ tsx_init(); /* Needs microcode. May change HLE/RTM feature bits. */ ++ + if ( xen_guest ) + hypervisor_ap_setup(); + +diff --git a/xen/arch/x86/tsx.c b/xen/arch/x86/tsx.c +new file mode 100644 +index 0000000000..a8ec2ccc69 +--- /dev/null ++++ b/xen/arch/x86/tsx.c +@@ -0,0 +1,74 @@ ++#include <xen/init.h> ++#include <asm/msr.h> ++ ++/* ++ * Valid values: ++ * 1 => Explicit tsx=1 ++ * 0 => Explicit tsx=0 ++ * -1 => Default, implicit tsx=1 ++ * ++ * This is arranged such that the bottom bit encodes whether TSX is actually ++ * disabled, while identifying various explicit (>=0) and implicit (<0) ++ * conditions. ++ */ ++int8_t __read_mostly opt_tsx = -1; ++int8_t __read_mostly cpu_has_tsx_ctrl = -1; ++ ++static int __init parse_tsx(const char *s) ++{ ++ int rc = 0, val = parse_bool(s, NULL); ++ ++ if ( val >= 0 ) ++ opt_tsx = val; ++ else ++ rc = -EINVAL; ++ ++ return rc; ++} ++custom_param("tsx", parse_tsx); ++ ++void tsx_init(void) ++{ ++ /* ++ * This function is first called between microcode being loaded, and CPUID ++ * being scanned generally. Calculate from raw data whether MSR_TSX_CTRL ++ * is available. ++ */ ++ if ( unlikely(cpu_has_tsx_ctrl < 0) ) ++ { ++ uint64_t caps = 0; ++ ++ if ( boot_cpu_data.cpuid_level >= 7 && ++ (cpuid_count_edx(7, 0) & cpufeat_mask(X86_FEATURE_ARCH_CAPS)) ) ++ rdmsrl(MSR_ARCH_CAPABILITIES, caps); ++ ++ cpu_has_tsx_ctrl = !!(caps & ARCH_CAPS_TSX_CTRL); ++ } ++ ++ if ( cpu_has_tsx_ctrl ) ++ { ++ uint64_t val; ++ ++ rdmsrl(MSR_TSX_CTRL, val); ++ ++ val &= ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR); ++ /* Check bottom bit only. Higher bits are various sentinals. */ ++ if ( !(opt_tsx & 1) ) ++ val |= TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR; ++ ++ wrmsrl(MSR_TSX_CTRL, val); ++ } ++ else if ( opt_tsx >= 0 ) ++ printk_once(XENLOG_WARNING ++ "MSR_TSX_CTRL not available - Ignoring tsx= setting\n"); ++} ++ ++/* ++ * Local variables: ++ * mode: C ++ * c-file-style: "BSD" ++ * c-basic-offset: 4 ++ * tab-width: 4 ++ * indent-tabs-mode: nil ++ * End: ++ */ +diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h +index 32746aa8ae..d5f3899f73 100644 +--- a/xen/include/asm-x86/msr-index.h ++++ b/xen/include/asm-x86/msr-index.h +@@ -53,6 +53,7 @@ + #define ARCH_CAPS_SSB_NO (_AC(1, ULL) << 4) + #define ARCH_CAPS_MDS_NO (_AC(1, ULL) << 5) + #define ARCH_CAPS_IF_PSCHANGE_MC_NO (_AC(1, ULL) << 6) ++#define ARCH_CAPS_TSX_CTRL (_AC(1, ULL) << 7) + + #define MSR_FLUSH_CMD 0x0000010b + #define FLUSH_CMD_L1D (_AC(1, ULL) << 0) +@@ -60,6 +61,10 @@ + #define MSR_TSX_FORCE_ABORT 0x0000010f + #define TSX_FORCE_ABORT_RTM (_AC(1, ULL) << 0) + ++#define MSR_TSX_CTRL 0x00000122 ++#define TSX_CTRL_RTM_DISABLE (_AC(1, ULL) << 0) ++#define TSX_CTRL_CPUID_CLEAR (_AC(1, ULL) << 1) ++ + /* Intel MSRs. Some also available on other CPUs */ + #define MSR_IA32_PERFCTR0 0x000000c1 + #define MSR_IA32_A_PERFCTR0 0x000004c1 +diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h +index d33ac34d29..1b52712180 100644 +--- a/xen/include/asm-x86/processor.h ++++ b/xen/include/asm-x86/processor.h +@@ -263,6 +263,16 @@ static always_inline unsigned int cpuid_count_ebx( + return ebx; + } + ++static always_inline unsigned int cpuid_count_edx( ++ unsigned int leaf, unsigned int subleaf) ++{ ++ unsigned int edx, tmp; ++ ++ cpuid_count(leaf, subleaf, &tmp, &tmp, &tmp, &edx); ++ ++ return edx; ++} ++ + static inline unsigned long read_cr0(void) + { + unsigned long cr0; +@@ -609,6 +619,9 @@ static inline uint8_t get_cpu_family(uint32_t raw, uint8_t *model, + return fam; + } + ++extern int8_t opt_tsx, cpu_has_tsx_ctrl; ++void tsx_init(void); ++ + #endif /* !__ASSEMBLY__ */ + + #endif /* __ASM_X86_PROCESSOR_H */ +diff --git a/xen/include/xen/lib.h b/xen/include/xen/lib.h +index 89939f43c8..6529f12dae 100644 +--- a/xen/include/xen/lib.h ++++ b/xen/include/xen/lib.h +@@ -114,6 +114,16 @@ extern int printk_ratelimit(void); + #define gprintk(lvl, fmt, args...) \ + printk(XENLOG_GUEST lvl "%pv " fmt, current, ## args) + ++#define printk_once(fmt, args...) \ ++({ \ ++ static bool __read_mostly once_; \ ++ if ( unlikely(!once_) ) \ ++ { \ ++ once_ = true; \ ++ printk(fmt, ## args); \ ++ } \ ++}) ++ + #ifdef NDEBUG + + static inline void diff --git a/system/xen/xsa/xsa305-4.12-2.patch b/system/xen/xsa/xsa305-4.12-2.patch new file mode 100644 index 0000000000000..07fba86287f8a --- /dev/null +++ b/system/xen/xsa/xsa305-4.12-2.patch @@ -0,0 +1,192 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86/spec-ctrl: Mitigate the TSX Asynchronous Abort sidechannel + +See patch documentation and comments. + +This is part of XSA-305 / CVE-2019-11135 + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> + +diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc +index b7e1bf8e8b..74e1e35b88 100644 +--- a/docs/misc/xen-command-line.pandoc ++++ b/docs/misc/xen-command-line.pandoc +@@ -1920,7 +1920,7 @@ extreme care.** + An overall boolean value, `spec-ctrl=no`, can be specified to turn off all + mitigations, including pieces of infrastructure used to virtualise certain + mitigation features for guests. This also includes settings which `xpti`, +-`smt`, `pv-l1tf` control, unless the respective option(s) have been ++`smt`, `pv-l1tf`, `tsx` control, unless the respective option(s) have been + specified earlier on the command line. + + Alternatively, a slightly more restricted `spec-ctrl=no-xen` can be used to +@@ -2037,7 +2037,7 @@ Xen version. + = <bool> + + Applicability: x86 +- Default: true ++ Default: false on parts vulnerable to TAA, true otherwise + + Controls for the use of Transactional Synchronization eXtensions. + +@@ -2047,6 +2047,19 @@ a control has been introduced which allows TSX to be turned off. + On systems with the ability to turn TSX off, this boolean offers system wide + control of whether TSX is enabled or disabled. + ++On parts vulnerable to CVE-2019-11135 / TSX Asynchronous Abort, the following ++logic applies: ++ ++ * An explicit `tsx=` choice is honoured, even if it is `true` and would ++ result in a vulnerable system. ++ ++ * When no explicit `tsx=` choice is given, parts vulnerable to TAA will be ++ mitigated by disabling TSX, as this is the lowest overhead option. ++ ++ * If the use of TSX is important, the more expensive TAA mitigations can be ++ opted in to with `smt=0 spec-ctrl=md-clear`, at which point TSX will remain ++ active by default. ++ + ### ucode (x86) + > `= [<integer> | scan]` + +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index b37d40e643..800139d79c 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -96,6 +96,9 @@ static int __init parse_spec_ctrl(const char *s) + if ( opt_pv_l1tf_domu < 0 ) + opt_pv_l1tf_domu = 0; + ++ if ( opt_tsx == -1 ) ++ opt_tsx = -3; ++ + disable_common: + opt_rsb_pv = false; + opt_rsb_hvm = false; +@@ -306,7 +309,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) + printk("Speculative mitigation facilities:\n"); + + /* Hardware features which pertain to speculative mitigations. */ +- printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s\n", ++ printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", + (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBRS/IBPB" : "", + (_7d0 & cpufeat_mask(X86_FEATURE_STIBP)) ? " STIBP" : "", + (_7d0 & cpufeat_mask(X86_FEATURE_L1D_FLUSH)) ? " L1D_FLUSH" : "", +@@ -318,7 +321,9 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) + (caps & ARCH_CAPS_RSBA) ? " RSBA" : "", + (caps & ARCH_CAPS_SKIP_L1DFL) ? " SKIP_L1DFL": "", + (caps & ARCH_CAPS_SSB_NO) ? " SSB_NO" : "", +- (caps & ARCH_CAPS_MDS_NO) ? " MDS_NO" : ""); ++ (caps & ARCH_CAPS_MDS_NO) ? " MDS_NO" : "", ++ (caps & ARCH_CAPS_TSX_CTRL) ? " TSX_CTRL" : "", ++ (caps & ARCH_CAPS_TAA_NO) ? " TAA_NO" : ""); + + /* Compiled-in support which pertains to mitigations. */ + if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) ) +@@ -332,7 +337,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) + "\n"); + + /* Settings for Xen's protection, irrespective of guests. */ +- printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s, Other:%s%s%s\n", ++ printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s, Other:%s%s%s\n", + thunk == THUNK_NONE ? "N/A" : + thunk == THUNK_RETPOLINE ? "RETPOLINE" : + thunk == THUNK_LFENCE ? "LFENCE" : +@@ -341,6 +346,8 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) + (default_xen_spec_ctrl & SPEC_CTRL_IBRS) ? "IBRS+" : "IBRS-", + !boot_cpu_has(X86_FEATURE_SSBD) ? "" : + (default_xen_spec_ctrl & SPEC_CTRL_SSBD) ? " SSBD+" : " SSBD-", ++ !(caps & ARCH_CAPS_TSX_CTRL) ? "" : ++ (opt_tsx & 1) ? " TSX+" : " TSX-", + opt_ibpb ? " IBPB" : "", + opt_l1d_flush ? " L1D_FLUSH" : "", + opt_md_clear_pv || opt_md_clear_hvm ? " VERW" : ""); +@@ -862,6 +869,7 @@ void __init init_speculation_mitigations(void) + { + enum ind_thunk thunk = THUNK_DEFAULT; + bool use_spec_ctrl = false, ibrs = false, hw_smt_enabled; ++ bool cpu_has_bug_taa; + uint64_t caps = 0; + + if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) ) +@@ -1086,6 +1094,53 @@ void __init init_speculation_mitigations(void) + "enabled. Mitigations will not be fully effective. Please\n" + "choose an explicit smt=<bool> setting. See XSA-297.\n"); + ++ /* ++ * Vulnerability to TAA is a little complicated to quantify. ++ * ++ * In the pipeline, it is just another way to get speculative access to ++ * stale load port, store buffer or fill buffer data, and therefore can be ++ * considered a superset of MDS (on TSX-capable parts). On parts which ++ * predate MDS_NO, the existing VERW flushing will mitigate this ++ * sidechannel as well. ++ * ++ * On parts which contain MDS_NO, the lack of VERW flushing means that an ++ * attacker can still use TSX to target microarchitectural buffers to leak ++ * secrets. Therefore, we consider TAA to be the set of TSX-capable parts ++ * which have MDS_NO but lack TAA_NO. ++ * ++ * Note: cpu_has_rtm (== hle) could already be hidden by `tsx=0` on the ++ * cmdline. MSR_TSX_CTRL will only appear on TSX-capable parts, so ++ * we check both to spot TSX in a microcode/cmdline independent way. ++ */ ++ cpu_has_bug_taa = ++ (cpu_has_rtm || (caps & ARCH_CAPS_TSX_CTRL)) && ++ (caps & (ARCH_CAPS_MDS_NO | ARCH_CAPS_TAA_NO)) == ARCH_CAPS_MDS_NO; ++ ++ /* ++ * On TAA-affected hardware, disabling TSX is the preferred mitigation, vs ++ * the MDS mitigation of disabling HT and using VERW flushing. ++ * ++ * On CPUs which advertise MDS_NO, VERW has no flushing side effect until ++ * the TSX_CTRL microcode is loaded, despite the MD_CLEAR CPUID bit being ++ * advertised, and there isn't a MD_CLEAR_2 flag to use... ++ * ++ * If we're on affected hardware, able to do something about it (which ++ * implies that VERW now works), no explicit TSX choice and traditional ++ * MDS mitigations (no-SMT, VERW) not obviosuly in use (someone might ++ * plausibly value TSX higher than Hyperthreading...), disable TSX to ++ * mitigate TAA. ++ */ ++ if ( opt_tsx == -1 && cpu_has_bug_taa && (caps & ARCH_CAPS_TSX_CTRL) && ++ ((hw_smt_enabled && opt_smt) || ++ !boot_cpu_has(X86_FEATURE_SC_VERW_IDLE)) ) ++ { ++ setup_clear_cpu_cap(X86_FEATURE_HLE); ++ setup_clear_cpu_cap(X86_FEATURE_RTM); ++ ++ opt_tsx = 0; ++ tsx_init(); ++ } ++ + print_details(thunk, caps); + + /* +diff --git a/xen/arch/x86/tsx.c b/xen/arch/x86/tsx.c +index a8ec2ccc69..2d202a0d4e 100644 +--- a/xen/arch/x86/tsx.c ++++ b/xen/arch/x86/tsx.c +@@ -5,7 +5,8 @@ + * Valid values: + * 1 => Explicit tsx=1 + * 0 => Explicit tsx=0 +- * -1 => Default, implicit tsx=1 ++ * -1 => Default, implicit tsx=1, may change to 0 to mitigate TAA ++ * -3 => Implicit tsx=1 (feed-through from spec-ctrl=0) + * + * This is arranged such that the bottom bit encodes whether TSX is actually + * disabled, while identifying various explicit (>=0) and implicit (<0) +diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h +index d5f3899f73..3971b992d3 100644 +--- a/xen/include/asm-x86/msr-index.h ++++ b/xen/include/asm-x86/msr-index.h +@@ -54,6 +54,7 @@ + #define ARCH_CAPS_MDS_NO (_AC(1, ULL) << 5) + #define ARCH_CAPS_IF_PSCHANGE_MC_NO (_AC(1, ULL) << 6) + #define ARCH_CAPS_TSX_CTRL (_AC(1, ULL) << 7) ++#define ARCH_CAPS_TAA_NO (_AC(1, ULL) << 8) + + #define MSR_FLUSH_CMD 0x0000010b + #define FLUSH_CMD_L1D (_AC(1, ULL) << 0) |