diff options
author | Mario Preksavec <mario@slackware.hr> | 2020-04-15 14:52:28 +0200 |
---|---|---|
committer | Willy Sudiarto Raharjo <willysr@slackbuilds.org> | 2020-04-19 10:52:33 +0700 |
commit | 08a409483ce6acd12bfb884dac0d0603cc85ce28 (patch) | |
tree | e822a4e5f46a8ab9768224bdf2f97ed28b625e4e /system/xen | |
parent | bddbdfdccbc87f272fd604ef992b4df2f6e71f45 (diff) |
system/xen: Updated for version 4.13.0.
Signed-off-by: Mario Preksavec <mario@slackware.hr>
Signed-off-by: Willy Sudiarto Raharjo <willysr@slackbuilds.org>
Diffstat (limited to 'system/xen')
41 files changed, 478 insertions, 4958 deletions
diff --git a/system/xen/README b/system/xen/README index f5f8c96b6d473..2999a8e54d26b 100644 --- a/system/xen/README +++ b/system/xen/README @@ -7,7 +7,8 @@ Solaris, and various versions of the BSD operating systems. This script has a few optional dependencies: mbootpack - creates LILO compatible kernel images - libssh2 - mostly used by libvirt, enable with USE_LIBSSH2=yes + libssh - mostly used by libvirt, enable with USE_LIBSSH=yes + (previously known as USE_LIBSSH2) ocaml-findlib - autodetected, builds oxenstored binary spice - enable with USE_SPICE=yes @@ -15,6 +16,8 @@ Linking with the stock libraries: bluez - enable with USE_BLUEZ=yes gtk - enable with USE_GTK=yes + audio - enable with USE_AUDIO=yes + (or a comma-delimited list: oss alsa sdl pa) Reading material: diff --git a/system/xen/dom0/README.dom0 b/system/xen/dom0/README.dom0 index 728e3bae79bdf..f798e2ddc7def 100644 --- a/system/xen/dom0/README.dom0 +++ b/system/xen/dom0/README.dom0 @@ -46,7 +46,7 @@ Xen EFI binary. To make things a bit easier, a copy of Xen EFI binary can be found here: - http://slackware.hr/~mario/xen/xen-4.12.1.efi.gz + http://slackware.hr/~mario/xen/xen-4.13.0.efi.gz If an automatic boot to Xen kernel is desired, the binary should be renamed and copied to the following location: /boot/efi/EFI/BOOT/bootx64.efi diff --git a/system/xen/dom0/config-4.4.202-xen.i686 b/system/xen/dom0/config-4.4.217-xen.i686 index b31c558e377e3..d60d6d74255a1 100644 --- a/system/xen/dom0/config-4.4.202-xen.i686 +++ b/system/xen/dom0/config-4.4.217-xen.i686 @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. -# Linux/x86 4.4.202 Kernel Configuration +# Linux/x86 4.4.217 Kernel Configuration # # CONFIG_64BIT is not set CONFIG_X86_32=y @@ -7113,6 +7113,7 @@ CONFIG_DOUBLEFAULT=y # CONFIG_DEBUG_TLBFLUSH is not set # CONFIG_IOMMU_STRESS is not set CONFIG_HAVE_MMIOTRACE_SUPPORT=y +# CONFIG_X86_DECODER_SELFTEST is not set CONFIG_IO_DELAY_TYPE_0X80=0 CONFIG_IO_DELAY_TYPE_0XED=1 CONFIG_IO_DELAY_TYPE_UDELAY=2 diff --git a/system/xen/dom0/config-4.4.202-xen.x86_64 b/system/xen/dom0/config-4.4.217-xen.x86_64 index f81a31b9ab97d..df9403f2dfe9a 100644 --- a/system/xen/dom0/config-4.4.202-xen.x86_64 +++ b/system/xen/dom0/config-4.4.217-xen.x86_64 @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. -# Linux/x86 4.4.202 Kernel Configuration +# Linux/x86 4.4.217 Kernel Configuration # CONFIG_64BIT=y CONFIG_X86_64=y @@ -6925,6 +6925,7 @@ CONFIG_DOUBLEFAULT=y # CONFIG_IOMMU_DEBUG is not set # CONFIG_IOMMU_STRESS is not set CONFIG_HAVE_MMIOTRACE_SUPPORT=y +# CONFIG_X86_DECODER_SELFTEST is not set CONFIG_IO_DELAY_TYPE_0X80=0 CONFIG_IO_DELAY_TYPE_0XED=1 CONFIG_IO_DELAY_TYPE_UDELAY=2 diff --git a/system/xen/dom0/kernel-xen.sh b/system/xen/dom0/kernel-xen.sh index 7228dcc541470..483d6460c28cd 100644 --- a/system/xen/dom0/kernel-xen.sh +++ b/system/xen/dom0/kernel-xen.sh @@ -5,8 +5,8 @@ # Written by Chris Abela <chris.abela@maltats.com>, 20100515 # Modified by Mario Preksavec <mario@slackware.hr> -KERNEL=${KERNEL:-4.4.202} -XEN=${XEN:-4.12.1} +KERNEL=${KERNEL:-4.4.217} +XEN=${XEN:-4.13.0} BOOTLOADER=${BOOTLOADER:-lilo} ROOTMOD=${ROOTMOD:-ext4} diff --git a/system/xen/domU/domU.sh b/system/xen/domU/domU.sh index f017c1bd23737..4056208f5ab26 100644 --- a/system/xen/domU/domU.sh +++ b/system/xen/domU/domU.sh @@ -7,7 +7,7 @@ set -e -KERNEL=${KERNEL:-4.4.202} +KERNEL=${KERNEL:-4.4.217} # Build an image for the root file system and another for the swap # Default values : 8GB and 500MB resepectively. diff --git a/system/xen/xen.SlackBuild b/system/xen/xen.SlackBuild index 52bec8230e342..262ff9996a4cf 100644 --- a/system/xen/xen.SlackBuild +++ b/system/xen/xen.SlackBuild @@ -2,7 +2,7 @@ # Slackware build script for xen -# Copyright 2010, 2011, 2013, 2014, 2015, 2016, 2017, 2018, 2019 Mario Preksavec, Zagreb, Croatia +# Copyright 2010, 2011, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020 Mario Preksavec, Zagreb, Croatia # All rights reserved. # # Redistribution and use of this script, with or without modification, is @@ -23,13 +23,13 @@ # ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PRGNAM=xen -VERSION=${VERSION:-4.12.1} +VERSION=${VERSION:-4.13.0} BUILD=${BUILD:-1} TAG=${TAG:-_SBo} -SEABIOS=${SEABIOS:-1.12.0} -OVMF=${OVMF:-20180725_ef529e6ab7} -IPXE=${IPXE:-d2063b7693e0e35db97b2264aa987eb6341ae779} +SEABIOS=${SEABIOS:-1.12.1} +OVMF=${OVMF:-20190606_20d2e5a125} +IPXE=${IPXE:-1dd56dbd11082fb622c2ed21cfaced4f47d798a6} if [ -z "$ARCH" ]; then case "$( uname -m )" in @@ -90,9 +90,9 @@ EOF esac esac -case "${USE_LIBSSH2:-no}" in - yes) CONF_QEMUU+=" --enable-libssh2" ;; - *) CONF_QEMUU+=" --disable-libssh2" ;; +case "${USE_LIBSSH:-no}" in + yes) CONF_QEMUU+=" --enable-libssh" ;; + *) CONF_QEMUU+=" --disable-libssh" ;; esac case "${USE_BLUEZ:-no}" in @@ -110,6 +110,12 @@ case "${USE_SPICE:-no}" in *) CONF_QEMUU+=" --disable-spice" ;; esac +case "${USE_AUDIO:-no}" in + yes) CONF_QEMUU+="" ;; + no) CONF_QEMUU+=" --audio-drv-list=" ;; + *) CONF_QEMUU+=" --audio-drv-list=$USE_AUDIO" ;; +esac + set -e rm -rf $PKG @@ -142,8 +148,8 @@ cp $CWD/ipxe-git-$IPXE.tar.gz tools/firmware/etherboot/_ipxe.tar.gz ( # Seabios cd tools/firmware - tar -xf $CWD/seabios-$SEABIOS.tar.?z - mv seabios-$SEABIOS seabios-dir-remote + tar -xf $CWD/seabios-$SEABIOS.tar.?z || tar -xf $CWD/seabios-rel-$SEABIOS.tar.?z + mv seabios-$SEABIOS seabios-dir-remote || mv seabios-rel-$SEABIOS seabios-dir-remote ln -s seabios-dir-remote seabios-dir make -C seabios-dir defconfig # OVMF diff --git a/system/xen/xen.info b/system/xen/xen.info index e0ee8e6f20056..fa9672aace6b5 100644 --- a/system/xen/xen.info +++ b/system/xen/xen.info @@ -1,8 +1,8 @@ PRGNAM="xen" -VERSION="4.12.1" +VERSION="4.13.0" HOMEPAGE="http://www.xenproject.org/" -DOWNLOAD="http://mirror.slackware.hr/sources/xen/xen-4.12.1.tar.gz \ - http://mirror.slackware.hr/sources/xen-extfiles/ipxe-git-d2063b7693e0e35db97b2264aa987eb6341ae779.tar.gz \ +DOWNLOAD="http://mirror.slackware.hr/sources/xen/xen-4.13.0.tar.gz \ + http://mirror.slackware.hr/sources/xen-extfiles/ipxe-git-1dd56dbd11082fb622c2ed21cfaced4f47d798a6.tar.gz \ http://mirror.slackware.hr/sources/xen-extfiles/lwip-1.3.0.tar.gz \ http://mirror.slackware.hr/sources/xen-extfiles/zlib-1.2.3.tar.gz \ http://mirror.slackware.hr/sources/xen-extfiles/newlib-1.16.0.tar.gz \ @@ -11,10 +11,10 @@ DOWNLOAD="http://mirror.slackware.hr/sources/xen/xen-4.12.1.tar.gz \ http://mirror.slackware.hr/sources/xen-extfiles/polarssl-1.1.4-gpl.tgz \ http://mirror.slackware.hr/sources/xen-extfiles/gmp-4.3.2.tar.bz2 \ http://mirror.slackware.hr/sources/xen-extfiles/tpm_emulator-0.7.4.tar.gz \ - http://mirror.slackware.hr/sources/xen-seabios/seabios-1.12.0.tar.gz \ - http://mirror.slackware.hr/sources/xen-ovmf/xen-ovmf-20180725_ef529e6ab7.tar.bz2" -MD5SUM="3f96ae93a5d6a3dd89bdf1398e30895e \ - 0de05da7aec358881bb1dff815ecca14 \ + http://mirror.slackware.hr/sources/xen-seabios/seabios-1.12.1.tar.gz \ + http://mirror.slackware.hr/sources/xen-ovmf/xen-ovmf-20190606_20d2e5a125.tar.bz2" +MD5SUM="d3b13c4c785601be2f104eaddd7c6a00 \ + b3ab0488a989a089207302111d12e1a0 \ 36cc57650cffda9a0269493be2a169bb \ debc62758716a169df9f62e6ab2bc634 \ bf8f1f9e3ca83d732c00a79a6ef29bc4 \ @@ -23,8 +23,8 @@ MD5SUM="3f96ae93a5d6a3dd89bdf1398e30895e \ 7b72caf22b01464ee7d6165f2fd85f44 \ dd60683d7057917e34630b4a787932e8 \ e26becb8a6a2b6695f6b3e8097593db8 \ - 2fd637b323d247a0948556104a2121c9 \ - 75df1ed5ad9e08b1fb0be78c8b5234b9" + 6cb6cba431fd725126ddb5ec529ab85c \ + a6063a0d3d45e6f77deea8c80569653e" DOWNLOAD_x86_64="" MD5SUM_x86_64="" REQUIRES="acpica yajl" diff --git a/system/xen/xsa/xsa296.patch b/system/xen/xsa/xsa296.patch deleted file mode 100644 index e71ea7f790f27..0000000000000 --- a/system/xen/xsa/xsa296.patch +++ /dev/null @@ -1,195 +0,0 @@ -From: Andrew Cooper <andrew.cooper3@citrix.com> -Subject: xen/hypercall: Don't use BUG() for parameter checking in hypercall_create_continuation() - -Since c/s 1d429034 "hypercall: update vcpu_op to take an unsigned vcpuid", -which incorrectly swapped 'i' for 'u' in the parameter type list, guests have -been able to hit the BUG() in next_args()'s default case. - -Correct these back to 'i'. - -In addition, make adjustments to prevent this class of issue from occurring in -the future - crashing Xen is not an appropriate form of parameter checking. - -Capitalise NEXT_ARG() to catch all uses, to highlight that it is a macro doing -non-function-like things behind the scenes, and undef it when appropriate. -Implement a bad_fmt: block which prints an error, asserts unreachable, and -crashes the guest. - -On the ARM side, drop all parameter checking of p. It is asymmetric with the -x86 side, and akin to expecting memcpy() or sprintf() to check their src/fmt -parameter before use. A caller passing "" or something other than a string -literal will be obvious during code review. - -This is XSA-296. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Acked-by: Julien Grall <julien.grall@arm.com> - -diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c -index 941bbff4fe..a3da8e9c08 100644 ---- a/xen/arch/arm/domain.c -+++ b/xen/arch/arm/domain.c -@@ -383,14 +383,15 @@ void sync_vcpu_execstate(struct vcpu *v) - /* Nothing to do -- no lazy switching */ - } - --#define next_arg(fmt, args) ({ \ -+#define NEXT_ARG(fmt, args) \ -+({ \ - unsigned long __arg; \ - switch ( *(fmt)++ ) \ - { \ - case 'i': __arg = (unsigned long)va_arg(args, unsigned int); break; \ - case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break; \ - case 'h': __arg = (unsigned long)va_arg(args, void *); break; \ -- default: __arg = 0; BUG(); \ -+ default: goto bad_fmt; \ - } \ - __arg; \ - }) -@@ -405,9 +406,6 @@ unsigned long hypercall_create_continuation( - unsigned int i; - va_list args; - -- /* All hypercalls take at least one argument */ -- BUG_ON( !p || *p == '\0' ); -- - current->hcall_preempted = true; - - va_start(args, format); -@@ -415,7 +413,7 @@ unsigned long hypercall_create_continuation( - if ( mcs->flags & MCSF_in_multicall ) - { - for ( i = 0; *p != '\0'; i++ ) -- mcs->call.args[i] = next_arg(p, args); -+ mcs->call.args[i] = NEXT_ARG(p, args); - - /* Return value gets written back to mcs->call.result */ - rc = mcs->call.result; -@@ -431,7 +429,7 @@ unsigned long hypercall_create_continuation( - - for ( i = 0; *p != '\0'; i++ ) - { -- arg = next_arg(p, args); -+ arg = NEXT_ARG(p, args); - - switch ( i ) - { -@@ -454,7 +452,7 @@ unsigned long hypercall_create_continuation( - - for ( i = 0; *p != '\0'; i++ ) - { -- arg = next_arg(p, args); -+ arg = NEXT_ARG(p, args); - - switch ( i ) - { -@@ -475,8 +473,16 @@ unsigned long hypercall_create_continuation( - va_end(args); - - return rc; -+ -+ bad_fmt: -+ gprintk(XENLOG_ERR, "Bad hypercall continuation format '%c'\n", *p); -+ ASSERT_UNREACHABLE(); -+ domain_crash(current->domain); -+ return 0; - } - -+#undef NEXT_ARG -+ - void startup_cpu_idle_loop(void) - { - struct vcpu *v = current; -diff --git a/xen/arch/x86/hypercall.c b/xen/arch/x86/hypercall.c -index d483dbaa6b..4643e5eb43 100644 ---- a/xen/arch/x86/hypercall.c -+++ b/xen/arch/x86/hypercall.c -@@ -80,14 +80,15 @@ const hypercall_args_t hypercall_args_table[NR_hypercalls] = - #undef COMP - #undef ARGS - --#define next_arg(fmt, args) ({ \ -+#define NEXT_ARG(fmt, args) \ -+({ \ - unsigned long __arg; \ - switch ( *(fmt)++ ) \ - { \ - case 'i': __arg = (unsigned long)va_arg(args, unsigned int); break; \ - case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break; \ - case 'h': __arg = (unsigned long)va_arg(args, void *); break; \ -- default: __arg = 0; BUG(); \ -+ default: goto bad_fmt; \ - } \ - __arg; \ - }) -@@ -109,7 +110,7 @@ unsigned long hypercall_create_continuation( - if ( mcs->flags & MCSF_in_multicall ) - { - for ( i = 0; *p != '\0'; i++ ) -- mcs->call.args[i] = next_arg(p, args); -+ mcs->call.args[i] = NEXT_ARG(p, args); - } - else - { -@@ -121,7 +122,7 @@ unsigned long hypercall_create_continuation( - { - for ( i = 0; *p != '\0'; i++ ) - { -- arg = next_arg(p, args); -+ arg = NEXT_ARG(p, args); - switch ( i ) - { - case 0: regs->rdi = arg; break; -@@ -137,7 +138,7 @@ unsigned long hypercall_create_continuation( - { - for ( i = 0; *p != '\0'; i++ ) - { -- arg = next_arg(p, args); -+ arg = NEXT_ARG(p, args); - switch ( i ) - { - case 0: regs->rbx = arg; break; -@@ -154,8 +155,16 @@ unsigned long hypercall_create_continuation( - va_end(args); - - return op; -+ -+ bad_fmt: -+ gprintk(XENLOG_ERR, "Bad hypercall continuation format '%c'\n", *p); -+ ASSERT_UNREACHABLE(); -+ domain_crash(curr->domain); -+ return 0; - } - -+#undef NEXT_ARG -+ - int hypercall_xlat_continuation(unsigned int *id, unsigned int nr, - unsigned int mask, ...) - { -diff --git a/xen/common/compat/domain.c b/xen/common/compat/domain.c -index 39877b3ab2..2531fa7421 100644 ---- a/xen/common/compat/domain.c -+++ b/xen/common/compat/domain.c -@@ -81,7 +81,7 @@ int compat_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) ar - } - - if ( rc == -ERESTART ) -- rc = hypercall_create_continuation(__HYPERVISOR_vcpu_op, "iuh", -+ rc = hypercall_create_continuation(__HYPERVISOR_vcpu_op, "iih", - cmd, vcpuid, arg); - - break; -diff --git a/xen/common/domain.c b/xen/common/domain.c -index 2308588052..65bcd85e34 100644 ---- a/xen/common/domain.c -+++ b/xen/common/domain.c -@@ -1411,7 +1411,7 @@ long do_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg) - - rc = arch_initialise_vcpu(v, arg); - if ( rc == -ERESTART ) -- rc = hypercall_create_continuation(__HYPERVISOR_vcpu_op, "iuh", -+ rc = hypercall_create_continuation(__HYPERVISOR_vcpu_op, "iih", - cmd, vcpuid, arg); - - break; diff --git a/system/xen/xsa/xsa298.patch b/system/xen/xsa/xsa298.patch deleted file mode 100644 index aa39042be56bd..0000000000000 --- a/system/xen/xsa/xsa298.patch +++ /dev/null @@ -1,89 +0,0 @@ -From: Jan Beulich <jbeulich@suse.com> -Subject: x86/PV: check GDT/LDT limits during emulation - -Accesses beyond the LDT limit originating from emulation would trigger -the ASSERT() in pv_map_ldt_shadow_page(). On production builds such -accesses would cause an attempt to promote the touched page (offset from -the present LDT base address) to a segment descriptor one. If this -happens to succeed, guest user mode would be able to elevate its -privileges to that of the guest kernel. This is particularly easy when -there's no LDT at all, in which case the LDT base stored internally to -Xen is simply zero. - -Also adjust the ASSERT() that was triggering: It was off by one to -begin with, and for production builds we also better use -ASSERT_UNREACHABLE() instead with suitable recovery code afterwards. - -This is XSA-298. - -Reported-by: Andrew Cooper <andrew.cooper3@citrix.com> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> ---- -v2: Correct 64-bit-only limit check (by folding into the common one). - ---- a/xen/arch/x86/pv/emul-gate-op.c -+++ b/xen/arch/x86/pv/emul-gate-op.c -@@ -51,7 +51,13 @@ static int read_gate_descriptor(unsigned - const seg_desc_t *pdesc = gdt_ldt_desc_ptr(gate_sel); - - if ( (gate_sel < 4) || -- ((gate_sel >= FIRST_RESERVED_GDT_BYTE) && !(gate_sel & 4)) || -+ /* -+ * We're interested in call gates only, which occupy a single -+ * seg_desc_t for 32-bit and a consecutive pair of them for 64-bit. -+ */ -+ ((gate_sel >> 3) + !is_pv_32bit_vcpu(v) >= -+ (gate_sel & 4 ? v->arch.pv.ldt_ents -+ : v->arch.pv.gdt_ents)) || - __get_user(desc, pdesc) ) - return 0; - -@@ -70,7 +76,7 @@ static int read_gate_descriptor(unsigned - if ( !is_pv_32bit_vcpu(v) ) - { - if ( (*ar & 0x1f00) != 0x0c00 || -- (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) || -+ /* Limit check done above already. */ - __get_user(desc, pdesc + 1) || - (desc.b & 0x1f00) ) - return 0; ---- a/xen/arch/x86/pv/emulate.c -+++ b/xen/arch/x86/pv/emulate.c -@@ -31,7 +31,14 @@ int pv_emul_read_descriptor(unsigned int - { - seg_desc_t desc; - -- if ( sel < 4) -+ if ( sel < 4 || -+ /* -+ * Don't apply the GDT limit here, as the selector may be a Xen -+ * provided one. __get_user() will fail (without taking further -+ * action) for ones falling in the gap between guest populated -+ * and Xen ones. -+ */ -+ ((sel & 4) && (sel >> 3) >= v->arch.pv.ldt_ents) ) - desc.b = desc.a = 0; - else if ( __get_user(desc, gdt_ldt_desc_ptr(sel)) ) - return 0; ---- a/xen/arch/x86/pv/mm.c -+++ b/xen/arch/x86/pv/mm.c -@@ -92,12 +92,16 @@ bool pv_map_ldt_shadow_page(unsigned int - BUG_ON(unlikely(in_irq())); - - /* -- * Hardware limit checking should guarantee this property. NB. This is -+ * Prior limit checking should guarantee this property. NB. This is - * safe as updates to the LDT can only be made by MMUEXT_SET_LDT to the - * current vcpu, and vcpu_reset() will block until this vcpu has been - * descheduled before continuing. - */ -- ASSERT((offset >> 3) <= curr->arch.pv.ldt_ents); -+ if ( unlikely((offset >> 3) >= curr->arch.pv.ldt_ents) ) -+ { -+ ASSERT_UNREACHABLE(); -+ return false; -+ } - - if ( is_pv_32bit_domain(currd) ) - linear = (uint32_t)linear; diff --git a/system/xen/xsa/xsa299-4.12-0001-x86-mm-L1TF-checks-don-t-leave-a-partial-entry.patch b/system/xen/xsa/xsa299-4.12-0001-x86-mm-L1TF-checks-don-t-leave-a-partial-entry.patch deleted file mode 100644 index fbb9d8086b6f8..0000000000000 --- a/system/xen/xsa/xsa299-4.12-0001-x86-mm-L1TF-checks-don-t-leave-a-partial-entry.patch +++ /dev/null @@ -1,94 +0,0 @@ -From 33d051917d5ef38f678b507a3c832afde48b9b49 Mon Sep 17 00:00:00 2001 -From: George Dunlap <george.dunlap@citrix.com> -Date: Thu, 10 Oct 2019 17:57:49 +0100 -Subject: [PATCH 01/11] x86/mm: L1TF checks don't leave a partial entry - -On detection of a potential L1TF issue, most validation code returns --ERESTART to allow the switch to shadow mode to happen and cause the -original operation to be restarted. - -However, in the validation code, the return value -ERESTART has been -repurposed to indicate 1) the function has partially completed -something which needs to be undone, and 2) calling put_page_type() -should cleanly undo it. This causes problems in several places. - -For L1 tables, on receiving an -ERESTART return from alloc_l1_table(), -alloc_page_type() will set PGT_partial on the page. If for some -reason the original operation never restarts, then on domain -destruction, relinquish_memory() will call free_page_type() on the -page. - -Unfortunately, alloc_ and free_l1_table() aren't set up to deal with -PGT_partial. When returning a failure, alloc_l1_table() always -de-validates whatever it's validated so far, and free_l1_table() -always devalidates the whole page. This means that if -relinquish_memory() calls free_page_type() on an L1 that didn't -complete due to an L1TF, it will call put_page_from_l1e() on "page -entries" that have never been validated. - -For L2+ tables, setting rc to ERESTART causes the rest of the -alloc_lN_table() function to *think* that the entry in question will -have PGT_partial set. This will cause it to set partial_pte = 1. If -relinqush_memory() then calls free_page_type() on one of those pages, -then free_lN_table() will call put_page_from_lNe() on the entry when -it shouldn't. - -Rather than indicating -ERESTART, indicate -EINTR. This is the code -to indicate that nothing has changed from when you started the call -(which is effectively how alloc_l1_table() handles errors). - -mod_lN_entry() shouldn't have any of these types of problems, so leave -potential changes there for a clean-up patch later. - -This is part of XSA-299. - -Reported-by: George Dunlap <george.dunlap@citrix.com> -Signed-off-by: George Dunlap <george.dunlap@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> ---- - xen/arch/x86/mm.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c -index 3557cd1178..a1b55c10ff 100644 ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -1409,7 +1409,7 @@ static int alloc_l1_table(struct page_info *page) - { - if ( !(l1e_get_flags(pl1e[i]) & _PAGE_PRESENT) ) - { -- ret = pv_l1tf_check_l1e(d, pl1e[i]) ? -ERESTART : 0; -+ ret = pv_l1tf_check_l1e(d, pl1e[i]) ? -EINTR : 0; - if ( ret ) - goto out; - } -@@ -1517,7 +1517,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) - { - if ( !pv_l1tf_check_l2e(d, l2e) ) - continue; -- rc = -ERESTART; -+ rc = -EINTR; - } - else - rc = get_page_from_l2e(l2e, pfn, d, partial); -@@ -1603,7 +1603,7 @@ static int alloc_l3_table(struct page_info *page) - { - if ( !pv_l1tf_check_l3e(d, l3e) ) - continue; -- rc = -ERESTART; -+ rc = -EINTR; - } - else - rc = get_page_from_l3e(l3e, pfn, d, partial); -@@ -1783,7 +1783,7 @@ static int alloc_l4_table(struct page_info *page) - { - if ( !pv_l1tf_check_l4e(d, l4e) ) - continue; -- rc = -ERESTART; -+ rc = -EINTR; - } - else - rc = get_page_from_l4e(l4e, pfn, d, partial); --- -2.23.0 - diff --git a/system/xen/xsa/xsa299-4.12-0002-x86-mm-Don-t-re-set-PGT_pinned-on-a-partially-de-val.patch b/system/xen/xsa/xsa299-4.12-0002-x86-mm-Don-t-re-set-PGT_pinned-on-a-partially-de-val.patch deleted file mode 100644 index a74598e597fad..0000000000000 --- a/system/xen/xsa/xsa299-4.12-0002-x86-mm-Don-t-re-set-PGT_pinned-on-a-partially-de-val.patch +++ /dev/null @@ -1,99 +0,0 @@ -From b490792c18f74b76ec8161721c1e07f810e36309 Mon Sep 17 00:00:00 2001 -From: George Dunlap <george.dunlap@citrix.com> -Date: Thu, 10 Oct 2019 17:57:49 +0100 -Subject: [PATCH 02/11] x86/mm: Don't re-set PGT_pinned on a partially - de-validated page - -When unpinning pagetables, if an operation is interrupted, -relinquish_memory() re-sets PGT_pinned so that the un-pin will -pickedup again when the hypercall restarts. - -This is appropriate when put_page_and_type_preemptible() returns --EINTR, which indicates that the page is back in its initial state -(i.e., completely validated). However, for -ERESTART, this leads to a -state where a page has both PGT_pinned and PGT_partial set. - -This happens to work at the moment, although it's not really a -"canonical" state; but in subsequent patches, where we need to make a -distinction in handling between PGT_validated and PGT_partial pages, -this causes issues. - -Move to a "canonical" state by: -- Only re-setting PGT_pinned on -EINTR -- Re-dropping the refcount held by PGT_pinned on -ERESTART - -In the latter case, the PGT_partial bit will be cleared further down -with the rest of the other PGT_partial pages. - -While here, clean up some trainling whitespace. - -This is part of XSA-299. - -Reported-by: George Dunlap <george.dunlap@citrix.com> -Signed-off-by: George Dunlap <george.dunlap@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> ---- - xen/arch/x86/domain.c | 31 ++++++++++++++++++++++++++++--- - 1 file changed, 28 insertions(+), 3 deletions(-) - -diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c -index 2585327834..59df8a6d8d 100644 ---- a/xen/arch/x86/domain.c -+++ b/xen/arch/x86/domain.c -@@ -114,7 +114,7 @@ static void play_dead(void) - * this case, heap corruption or #PF can occur (when heap debugging is - * enabled). For example, even printk() can involve tasklet scheduling, - * which touches per-cpu vars. -- * -+ * - * Consider very carefully when adding code to *dead_idle. Most hypervisor - * subsystems are unsafe to call. - */ -@@ -1909,9 +1909,34 @@ static int relinquish_memory( - break; - case -ERESTART: - case -EINTR: -+ /* -+ * -EINTR means PGT_validated has been re-set; re-set -+ * PGT_pinned again so that it gets picked up next time -+ * around. -+ * -+ * -ERESTART, OTOH, means PGT_partial is set instead. Put -+ * it back on the list, but don't set PGT_pinned; the -+ * section below will finish off de-validation. But we do -+ * need to drop the general ref associated with -+ * PGT_pinned, since put_page_and_type_preemptible() -+ * didn't do it. -+ * -+ * NB we can do an ASSERT for PGT_validated, since we -+ * "own" the type ref; but theoretically, the PGT_partial -+ * could be cleared by someone else. -+ */ -+ if ( ret == -EINTR ) -+ { -+ ASSERT(page->u.inuse.type_info & PGT_validated); -+ set_bit(_PGT_pinned, &page->u.inuse.type_info); -+ } -+ else -+ put_page(page); -+ - ret = -ERESTART; -+ -+ /* Put the page back on the list and drop the ref we grabbed above */ - page_list_add(page, list); -- set_bit(_PGT_pinned, &page->u.inuse.type_info); - put_page(page); - goto out; - default: -@@ -2161,7 +2186,7 @@ void vcpu_kick(struct vcpu *v) - * pending flag. These values may fluctuate (after all, we hold no - * locks) but the key insight is that each change will cause - * evtchn_upcall_pending to be polled. -- * -+ * - * NB2. We save the running flag across the unblock to avoid a needless - * IPI for domains that we IPI'd to unblock. - */ --- -2.23.0 - diff --git a/system/xen/xsa/xsa299-4.12-0003-x86-mm-Separate-out-partial_pte-tristate-into-indivi.patch b/system/xen/xsa/xsa299-4.12-0003-x86-mm-Separate-out-partial_pte-tristate-into-indivi.patch deleted file mode 100644 index 226e5487b1579..0000000000000 --- a/system/xen/xsa/xsa299-4.12-0003-x86-mm-Separate-out-partial_pte-tristate-into-indivi.patch +++ /dev/null @@ -1,618 +0,0 @@ -From 0f9f61e5737fdd346550ec6e30161fa99e4653fa Mon Sep 17 00:00:00 2001 -From: George Dunlap <george.dunlap@citrix.com> -Date: Thu, 10 Oct 2019 17:57:49 +0100 -Subject: [PATCH 03/11] x86/mm: Separate out partial_pte tristate into - individual flags - -At the moment, partial_pte is a tri-state that contains two distinct bits -of information: - -1. If zero, the pte at index [nr_validated_ptes] is un-validated. If - non-zero, the pte was last seen with PGT_partial set. - -2. If positive, the pte at index [nr_validated_ptes] does not hold a - general reference count. If negative, it does. - -To make future patches more clear, separate out this functionality -into two distinct, named bits: PTF_partial_set (for #1) and -PTF_partial_general_ref (for #2). - -Additionally, a number of functions which need this information also -take other flags to control behavior (such as `preemptible` and -`defer`). These are hard to read in the caller (since you only see -'true' or 'false'), and ugly when many are added together. In -preparation for adding yet another flag in a future patch, collapse -all of these into a single `flag` variable. - -NB that this does mean checking for what was previously the '-1' -condition a bit more ugly in the put_page_from_lNe functions (since -you have to check for both partial_set and general ref); but this -clause will go away in a future patch. - -Also note that the original comment had an off-by-one error: -partial_flags (like partial_pte before it) concerns -plNe[nr_validated_ptes], not plNe[nr_validated_ptes+1]. - -No functional change intended. - -This is part of XSA-299. - -Reported-by: George Dunlap <george.dunlap@citrix.com> -Signed-off-by: George Dunlap <george.dunlap@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> ---- - xen/arch/x86/mm.c | 165 ++++++++++++++++++++++++--------------- - xen/include/asm-x86/mm.h | 41 +++++++--- - 2 files changed, 128 insertions(+), 78 deletions(-) - -diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c -index a1b55c10ff..3f6f8cc9b8 100644 ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -1094,20 +1094,35 @@ get_page_from_l1e( - } - - #ifdef CONFIG_PV -+ -+/* -+ * The following flags are used to specify behavior of various get and -+ * put commands. The first two are also stored in page->partial_flags -+ * to indicate the state of the page pointed to by -+ * page->pte[page->nr_validated_entries]. See the comment in mm.h for -+ * more information. -+ */ -+#define PTF_partial_set (1 << 0) -+#define PTF_partial_general_ref (1 << 1) -+#define PTF_preemptible (1 << 2) -+#define PTF_defer (1 << 3) -+ - static int get_page_and_type_from_mfn( - mfn_t mfn, unsigned long type, struct domain *d, -- int partial, int preemptible) -+ unsigned int flags) - { - struct page_info *page = mfn_to_page(mfn); - int rc; -+ bool preemptible = flags & PTF_preemptible, -+ partial_ref = flags & PTF_partial_general_ref; - -- if ( likely(partial >= 0) && -+ if ( likely(!partial_ref) && - unlikely(!get_page_from_mfn(mfn, d)) ) - return -EINVAL; - - rc = _get_page_type(page, type, preemptible); - -- if ( unlikely(rc) && partial >= 0 && -+ if ( unlikely(rc) && !partial_ref && - (!preemptible || page != current->arch.old_guest_table) ) - put_page(page); - -@@ -1117,7 +1132,7 @@ static int get_page_and_type_from_mfn( - define_get_linear_pagetable(l2); - static int - get_page_from_l2e( -- l2_pgentry_t l2e, unsigned long pfn, struct domain *d, int partial) -+ l2_pgentry_t l2e, unsigned long pfn, struct domain *d, unsigned int flags) - { - unsigned long mfn = l2e_get_pfn(l2e); - int rc; -@@ -1129,8 +1144,9 @@ get_page_from_l2e( - return -EINVAL; - } - -- rc = get_page_and_type_from_mfn(_mfn(mfn), PGT_l1_page_table, d, -- partial, false); -+ ASSERT(!(flags & PTF_preemptible)); -+ -+ rc = get_page_and_type_from_mfn(_mfn(mfn), PGT_l1_page_table, d, flags); - if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) ) - rc = 0; - -@@ -1140,7 +1156,7 @@ get_page_from_l2e( - define_get_linear_pagetable(l3); - static int - get_page_from_l3e( -- l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial) -+ l3_pgentry_t l3e, unsigned long pfn, struct domain *d, unsigned int flags) - { - int rc; - -@@ -1152,7 +1168,7 @@ get_page_from_l3e( - } - - rc = get_page_and_type_from_mfn( -- l3e_get_mfn(l3e), PGT_l2_page_table, d, partial, 1); -+ l3e_get_mfn(l3e), PGT_l2_page_table, d, flags | PTF_preemptible); - if ( unlikely(rc == -EINVAL) && - !is_pv_32bit_domain(d) && - get_l3_linear_pagetable(l3e, pfn, d) ) -@@ -1164,7 +1180,7 @@ get_page_from_l3e( - define_get_linear_pagetable(l4); - static int - get_page_from_l4e( -- l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial) -+ l4_pgentry_t l4e, unsigned long pfn, struct domain *d, unsigned int flags) - { - int rc; - -@@ -1176,7 +1192,7 @@ get_page_from_l4e( - } - - rc = get_page_and_type_from_mfn( -- l4e_get_mfn(l4e), PGT_l3_page_table, d, partial, 1); -+ l4e_get_mfn(l4e), PGT_l3_page_table, d, flags | PTF_preemptible); - if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) ) - rc = 0; - -@@ -1277,7 +1293,7 @@ static void put_data_page(struct page_info *page, bool writeable) - * Note also that this automatically deals correctly with linear p.t.'s. - */ - static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, -- int partial, bool defer) -+ unsigned int flags) - { - int rc = 0; - -@@ -1300,12 +1316,13 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, - struct page_info *pg = l2e_get_page(l2e); - struct page_info *ptpg = mfn_to_page(_mfn(pfn)); - -- if ( unlikely(partial > 0) ) -+ if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == -+ PTF_partial_set ) - { -- ASSERT(!defer); -+ ASSERT(!(flags & PTF_defer)); - rc = _put_page_type(pg, true, ptpg); - } -- else if ( defer ) -+ else if ( flags & PTF_defer ) - { - current->arch.old_guest_ptpg = ptpg; - current->arch.old_guest_table = pg; -@@ -1322,7 +1339,7 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, - } - - static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, -- int partial, bool defer) -+ unsigned int flags) - { - struct page_info *pg; - int rc; -@@ -1345,13 +1362,14 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, - - pg = l3e_get_page(l3e); - -- if ( unlikely(partial > 0) ) -+ if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == -+ PTF_partial_set ) - { -- ASSERT(!defer); -+ ASSERT(!(flags & PTF_defer)); - return _put_page_type(pg, true, mfn_to_page(_mfn(pfn))); - } - -- if ( defer ) -+ if ( flags & PTF_defer ) - { - current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn)); - current->arch.old_guest_table = pg; -@@ -1366,7 +1384,7 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, - } - - static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, -- int partial, bool defer) -+ unsigned int flags) - { - int rc = 1; - -@@ -1375,13 +1393,14 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, - { - struct page_info *pg = l4e_get_page(l4e); - -- if ( unlikely(partial > 0) ) -+ if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == -+ PTF_partial_set ) - { -- ASSERT(!defer); -+ ASSERT(!(flags & PTF_defer)); - return _put_page_type(pg, true, mfn_to_page(_mfn(pfn))); - } - -- if ( defer ) -+ if ( flags & PTF_defer ) - { - current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn)); - current->arch.old_guest_table = pg; -@@ -1492,12 +1511,13 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) - unsigned long pfn = mfn_x(page_to_mfn(page)); - l2_pgentry_t *pl2e; - unsigned int i; -- int rc = 0, partial = page->partial_pte; -+ int rc = 0; -+ unsigned int partial_flags = page->partial_flags; - - pl2e = map_domain_page(_mfn(pfn)); - - for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; -- i++, partial = 0 ) -+ i++, partial_flags = 0 ) - { - l2_pgentry_t l2e; - -@@ -1520,17 +1540,18 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) - rc = -EINTR; - } - else -- rc = get_page_from_l2e(l2e, pfn, d, partial); -+ rc = get_page_from_l2e(l2e, pfn, d, partial_flags); - - if ( rc == -ERESTART ) - { - page->nr_validated_ptes = i; -- page->partial_pte = partial ?: 1; -+ /* Set 'set', retain 'general ref' */ -+ page->partial_flags = partial_flags | PTF_partial_set; - } - else if ( rc == -EINTR && i ) - { - page->nr_validated_ptes = i; -- page->partial_pte = 0; -+ page->partial_flags = 0; - rc = -ERESTART; - } - else if ( rc < 0 && rc != -EINTR ) -@@ -1539,7 +1560,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) - if ( i ) - { - page->nr_validated_ptes = i; -- page->partial_pte = 0; -+ page->partial_flags = 0; - current->arch.old_guest_ptpg = NULL; - current->arch.old_guest_table = page; - } -@@ -1563,7 +1584,8 @@ static int alloc_l3_table(struct page_info *page) - unsigned long pfn = mfn_x(page_to_mfn(page)); - l3_pgentry_t *pl3e; - unsigned int i; -- int rc = 0, partial = page->partial_pte; -+ int rc = 0; -+ unsigned int partial_flags = page->partial_flags; - - pl3e = map_domain_page(_mfn(pfn)); - -@@ -1578,7 +1600,7 @@ static int alloc_l3_table(struct page_info *page) - memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e)); - - for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES; -- i++, partial = 0 ) -+ i++, partial_flags = 0 ) - { - l3_pgentry_t l3e = pl3e[i]; - -@@ -1597,7 +1619,8 @@ static int alloc_l3_table(struct page_info *page) - else - rc = get_page_and_type_from_mfn( - l3e_get_mfn(l3e), -- PGT_l2_page_table | PGT_pae_xen_l2, d, partial, 1); -+ PGT_l2_page_table | PGT_pae_xen_l2, d, -+ partial_flags | PTF_preemptible); - } - else if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ) - { -@@ -1606,17 +1629,18 @@ static int alloc_l3_table(struct page_info *page) - rc = -EINTR; - } - else -- rc = get_page_from_l3e(l3e, pfn, d, partial); -+ rc = get_page_from_l3e(l3e, pfn, d, partial_flags); - - if ( rc == -ERESTART ) - { - page->nr_validated_ptes = i; -- page->partial_pte = partial ?: 1; -+ /* Set 'set', leave 'general ref' set if this entry was set */ -+ page->partial_flags = partial_flags | PTF_partial_set; - } - else if ( rc == -EINTR && i ) - { - page->nr_validated_ptes = i; -- page->partial_pte = 0; -+ page->partial_flags = 0; - rc = -ERESTART; - } - if ( rc < 0 ) -@@ -1633,7 +1657,7 @@ static int alloc_l3_table(struct page_info *page) - if ( i ) - { - page->nr_validated_ptes = i; -- page->partial_pte = 0; -+ page->partial_flags = 0; - current->arch.old_guest_ptpg = NULL; - current->arch.old_guest_table = page; - } -@@ -1767,10 +1791,11 @@ static int alloc_l4_table(struct page_info *page) - unsigned long pfn = mfn_x(page_to_mfn(page)); - l4_pgentry_t *pl4e = map_domain_page(_mfn(pfn)); - unsigned int i; -- int rc = 0, partial = page->partial_pte; -+ int rc = 0; -+ unsigned int partial_flags = page->partial_flags; - - for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES; -- i++, partial = 0 ) -+ i++, partial_flags = 0 ) - { - l4_pgentry_t l4e; - -@@ -1786,12 +1811,13 @@ static int alloc_l4_table(struct page_info *page) - rc = -EINTR; - } - else -- rc = get_page_from_l4e(l4e, pfn, d, partial); -+ rc = get_page_from_l4e(l4e, pfn, d, partial_flags); - - if ( rc == -ERESTART ) - { - page->nr_validated_ptes = i; -- page->partial_pte = partial ?: 1; -+ /* Set 'set', leave 'general ref' set if this entry was set */ -+ page->partial_flags = partial_flags | PTF_partial_set; - } - else if ( rc < 0 ) - { -@@ -1801,7 +1827,7 @@ static int alloc_l4_table(struct page_info *page) - if ( i ) - { - page->nr_validated_ptes = i; -- page->partial_pte = 0; -+ page->partial_flags = 0; - if ( rc == -EINTR ) - rc = -ERESTART; - else -@@ -1853,19 +1879,20 @@ static int free_l2_table(struct page_info *page) - struct domain *d = page_get_owner(page); - unsigned long pfn = mfn_x(page_to_mfn(page)); - l2_pgentry_t *pl2e; -- int rc = 0, partial = page->partial_pte; -- unsigned int i = page->nr_validated_ptes - !partial; -+ int rc = 0; -+ unsigned int partial_flags = page->partial_flags, -+ i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set); - - pl2e = map_domain_page(_mfn(pfn)); - - for ( ; ; ) - { - if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) ) -- rc = put_page_from_l2e(pl2e[i], pfn, partial, false); -+ rc = put_page_from_l2e(pl2e[i], pfn, partial_flags); - if ( rc < 0 ) - break; - -- partial = 0; -+ partial_flags = 0; - - if ( !i-- ) - break; -@@ -1887,12 +1914,14 @@ static int free_l2_table(struct page_info *page) - else if ( rc == -ERESTART ) - { - page->nr_validated_ptes = i; -- page->partial_pte = partial ?: -1; -+ page->partial_flags = (partial_flags & PTF_partial_set) ? -+ partial_flags : -+ (PTF_partial_set | PTF_partial_general_ref); - } - else if ( rc == -EINTR && i < L2_PAGETABLE_ENTRIES - 1 ) - { - page->nr_validated_ptes = i + 1; -- page->partial_pte = 0; -+ page->partial_flags = 0; - rc = -ERESTART; - } - -@@ -1904,18 +1933,19 @@ static int free_l3_table(struct page_info *page) - struct domain *d = page_get_owner(page); - unsigned long pfn = mfn_x(page_to_mfn(page)); - l3_pgentry_t *pl3e; -- int rc = 0, partial = page->partial_pte; -- unsigned int i = page->nr_validated_ptes - !partial; -+ int rc = 0; -+ unsigned int partial_flags = page->partial_flags, -+ i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set); - - pl3e = map_domain_page(_mfn(pfn)); - - for ( ; ; ) - { -- rc = put_page_from_l3e(pl3e[i], pfn, partial, 0); -+ rc = put_page_from_l3e(pl3e[i], pfn, partial_flags); - if ( rc < 0 ) - break; - -- partial = 0; -+ partial_flags = 0; - if ( rc == 0 ) - pl3e[i] = unadjust_guest_l3e(pl3e[i], d); - -@@ -1934,12 +1964,14 @@ static int free_l3_table(struct page_info *page) - if ( rc == -ERESTART ) - { - page->nr_validated_ptes = i; -- page->partial_pte = partial ?: -1; -+ page->partial_flags = (partial_flags & PTF_partial_set) ? -+ partial_flags : -+ (PTF_partial_set | PTF_partial_general_ref); - } - else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 ) - { - page->nr_validated_ptes = i + 1; -- page->partial_pte = 0; -+ page->partial_flags = 0; - rc = -ERESTART; - } - return rc > 0 ? 0 : rc; -@@ -1950,26 +1982,29 @@ static int free_l4_table(struct page_info *page) - struct domain *d = page_get_owner(page); - unsigned long pfn = mfn_x(page_to_mfn(page)); - l4_pgentry_t *pl4e = map_domain_page(_mfn(pfn)); -- int rc = 0, partial = page->partial_pte; -- unsigned int i = page->nr_validated_ptes - !partial; -+ int rc = 0; -+ unsigned partial_flags = page->partial_flags, -+ i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set); - - do { - if ( is_guest_l4_slot(d, i) ) -- rc = put_page_from_l4e(pl4e[i], pfn, partial, 0); -+ rc = put_page_from_l4e(pl4e[i], pfn, partial_flags); - if ( rc < 0 ) - break; -- partial = 0; -+ partial_flags = 0; - } while ( i-- ); - - if ( rc == -ERESTART ) - { - page->nr_validated_ptes = i; -- page->partial_pte = partial ?: -1; -+ page->partial_flags = (partial_flags & PTF_partial_set) ? -+ partial_flags : -+ (PTF_partial_set | PTF_partial_general_ref); - } - else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 ) - { - page->nr_validated_ptes = i + 1; -- page->partial_pte = 0; -+ page->partial_flags = 0; - rc = -ERESTART; - } - -@@ -2247,7 +2282,7 @@ static int mod_l2_entry(l2_pgentry_t *pl2e, - return -EBUSY; - } - -- put_page_from_l2e(ol2e, pfn, 0, true); -+ put_page_from_l2e(ol2e, pfn, PTF_defer); - - return rc; - } -@@ -2315,7 +2350,7 @@ static int mod_l3_entry(l3_pgentry_t *pl3e, - if ( !create_pae_xen_mappings(d, pl3e) ) - BUG(); - -- put_page_from_l3e(ol3e, pfn, 0, 1); -+ put_page_from_l3e(ol3e, pfn, PTF_defer); - return rc; - } - -@@ -2378,7 +2413,7 @@ static int mod_l4_entry(l4_pgentry_t *pl4e, - return -EFAULT; - } - -- put_page_from_l4e(ol4e, pfn, 0, 1); -+ put_page_from_l4e(ol4e, pfn, PTF_defer); - return rc; - } - #endif /* CONFIG_PV */ -@@ -2649,7 +2684,7 @@ int free_page_type(struct page_info *page, unsigned long type, - if ( !(type & PGT_partial) ) - { - page->nr_validated_ptes = 1U << PAGETABLE_ORDER; -- page->partial_pte = 0; -+ page->partial_flags = 0; - } - - switch ( type & PGT_type_mask ) -@@ -2946,7 +2981,7 @@ static int _get_page_type(struct page_info *page, unsigned long type, - if ( !(x & PGT_partial) ) - { - page->nr_validated_ptes = 0; -- page->partial_pte = 0; -+ page->partial_flags = 0; - } - page->linear_pt_count = 0; - rc = alloc_page_type(page, type, preemptible); -@@ -3122,7 +3157,7 @@ int new_guest_cr3(mfn_t mfn) - return 0; - } - -- rc = get_page_and_type_from_mfn(mfn, PGT_root_page_table, d, 0, 1); -+ rc = get_page_and_type_from_mfn(mfn, PGT_root_page_table, d, PTF_preemptible); - switch ( rc ) - { - case 0: -@@ -3473,7 +3508,7 @@ long do_mmuext_op( - if ( op.arg1.mfn != 0 ) - { - rc = get_page_and_type_from_mfn( -- _mfn(op.arg1.mfn), PGT_root_page_table, currd, 0, 1); -+ _mfn(op.arg1.mfn), PGT_root_page_table, currd, PTF_preemptible); - - if ( unlikely(rc) ) - { -diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h -index 6faa563167..8406ac3c37 100644 ---- a/xen/include/asm-x86/mm.h -+++ b/xen/include/asm-x86/mm.h -@@ -228,19 +228,34 @@ struct page_info - * setting the flag must not drop that reference, whereas the instance - * clearing it will have to. - * -- * If @partial_pte is positive then PTE at @nr_validated_ptes+1 has -- * been partially validated. This implies that the general reference -- * to the page (acquired from get_page_from_lNe()) would be dropped -- * (again due to the apparent failure) and hence must be re-acquired -- * when resuming the validation, but must not be dropped when picking -- * up the page for invalidation. -+ * If partial_flags & PTF_partial_set is set, then the page at -+ * at @nr_validated_ptes had PGT_partial set as a result of an -+ * operation on the current page. (That page may or may not -+ * still have PGT_partial set.) - * -- * If @partial_pte is negative then PTE at @nr_validated_ptes+1 has -- * been partially invalidated. This is basically the opposite case of -- * above, i.e. the general reference to the page was not dropped in -- * put_page_from_lNe() (due to the apparent failure), and hence it -- * must be dropped when the put operation is resumed (and completes), -- * but it must not be acquired if picking up the page for validation. -+ * If PTF_partial_general_ref is set, then the PTE at -+ * @nr_validated_ptef holds a general reference count for the -+ * page. -+ * -+ * This happens: -+ * - During de-validation, if de-validation of the page was -+ * interrupted -+ * - During validation, if an invalid entry is encountered and -+ * validation is preemptible -+ * - During validation, if PTF_partial_general_ref was set on -+ * this entry to begin with (perhaps because we're picking -+ * up from a partial de-validation). -+ * -+ * When resuming validation, if PTF_partial_general_ref is clear, -+ * then a general reference must be re-acquired; if it is set, no -+ * reference should be acquired. -+ * -+ * When resuming de-validation, if PTF_partial_general_ref is -+ * clear, no reference should be dropped; if it is set, a -+ * reference should be dropped. -+ * -+ * NB that PTF_partial_set and PTF_partial_general_ref are -+ * defined in mm.c, the only place where they are used. - * - * The 3rd field, @linear_pt_count, indicates - * - by a positive value, how many same-level page table entries a page -@@ -251,7 +266,7 @@ struct page_info - struct { - u16 nr_validated_ptes:PAGETABLE_ORDER + 1; - u16 :16 - PAGETABLE_ORDER - 1 - 2; -- s16 partial_pte:2; -+ u16 partial_flags:2; - s16 linear_pt_count; - }; - --- -2.23.0 - diff --git a/system/xen/xsa/xsa299-4.12-0004-x86-mm-Use-flags-for-_put_page_type-rather-than-a-bo.patch b/system/xen/xsa/xsa299-4.12-0004-x86-mm-Use-flags-for-_put_page_type-rather-than-a-bo.patch deleted file mode 100644 index d07c233225664..0000000000000 --- a/system/xen/xsa/xsa299-4.12-0004-x86-mm-Use-flags-for-_put_page_type-rather-than-a-bo.patch +++ /dev/null @@ -1,140 +0,0 @@ -From db1d801aa8dcb918a27486a6e8d9cf5d7307dec3 Mon Sep 17 00:00:00 2001 -From: George Dunlap <george.dunlap@citrix.com> -Date: Thu, 10 Oct 2019 17:57:49 +0100 -Subject: [PATCH 04/11] x86/mm: Use flags for _put_page_type rather than a - boolean - -This is in mainly in preparation for _put_page_type taking the -partial_flags value in the future. It also makes it easier to read in -the caller (since you see a flag name rather than `true` or `false`). - -No functional change intended. - -This is part of XSA-299. - -Reported-by: George Dunlap <george.dunlap@citrix.com> -Signed-off-by: George Dunlap <george.dunlap@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> ---- - xen/arch/x86/mm.c | 25 +++++++++++++------------ - 1 file changed, 13 insertions(+), 12 deletions(-) - -diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c -index 3f6f8cc9b8..0740b61af8 100644 ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -1200,7 +1200,7 @@ get_page_from_l4e( - } - #endif /* CONFIG_PV */ - --static int _put_page_type(struct page_info *page, bool preemptible, -+static int _put_page_type(struct page_info *page, unsigned int flags, - struct page_info *ptpg); - - void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner) -@@ -1320,7 +1320,7 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, - PTF_partial_set ) - { - ASSERT(!(flags & PTF_defer)); -- rc = _put_page_type(pg, true, ptpg); -+ rc = _put_page_type(pg, PTF_preemptible, ptpg); - } - else if ( flags & PTF_defer ) - { -@@ -1329,7 +1329,7 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, - } - else - { -- rc = _put_page_type(pg, true, ptpg); -+ rc = _put_page_type(pg, PTF_preemptible, ptpg); - if ( likely(!rc) ) - put_page(pg); - } -@@ -1366,7 +1366,7 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, - PTF_partial_set ) - { - ASSERT(!(flags & PTF_defer)); -- return _put_page_type(pg, true, mfn_to_page(_mfn(pfn))); -+ return _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn))); - } - - if ( flags & PTF_defer ) -@@ -1376,7 +1376,7 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, - return 0; - } - -- rc = _put_page_type(pg, true, mfn_to_page(_mfn(pfn))); -+ rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn))); - if ( likely(!rc) ) - put_page(pg); - -@@ -1397,7 +1397,7 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, - PTF_partial_set ) - { - ASSERT(!(flags & PTF_defer)); -- return _put_page_type(pg, true, mfn_to_page(_mfn(pfn))); -+ return _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn))); - } - - if ( flags & PTF_defer ) -@@ -1407,7 +1407,7 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, - return 0; - } - -- rc = _put_page_type(pg, true, mfn_to_page(_mfn(pfn))); -+ rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn))); - if ( likely(!rc) ) - put_page(pg); - } -@@ -2757,10 +2757,11 @@ static int _put_final_page_type(struct page_info *page, unsigned long type, - } - - --static int _put_page_type(struct page_info *page, bool preemptible, -+static int _put_page_type(struct page_info *page, unsigned int flags, - struct page_info *ptpg) - { - unsigned long nx, x, y = page->u.inuse.type_info; -+ bool preemptible = flags & PTF_preemptible; - - ASSERT(current_locked_page_ne_check(page)); - -@@ -2969,7 +2970,7 @@ static int _get_page_type(struct page_info *page, unsigned long type, - - if ( unlikely(iommu_ret) ) - { -- _put_page_type(page, false, NULL); -+ _put_page_type(page, 0, NULL); - rc = iommu_ret; - goto out; - } -@@ -2996,7 +2997,7 @@ static int _get_page_type(struct page_info *page, unsigned long type, - - void put_page_type(struct page_info *page) - { -- int rc = _put_page_type(page, false, NULL); -+ int rc = _put_page_type(page, 0, NULL); - ASSERT(rc == 0); - (void)rc; - } -@@ -3013,7 +3014,7 @@ int get_page_type(struct page_info *page, unsigned long type) - - int put_page_type_preemptible(struct page_info *page) - { -- return _put_page_type(page, true, NULL); -+ return _put_page_type(page, PTF_preemptible, NULL); - } - - int get_page_type_preemptible(struct page_info *page, unsigned long type) -@@ -3030,7 +3031,7 @@ int put_old_guest_table(struct vcpu *v) - if ( !v->arch.old_guest_table ) - return 0; - -- switch ( rc = _put_page_type(v->arch.old_guest_table, true, -+ switch ( rc = _put_page_type(v->arch.old_guest_table, PTF_preemptible, - v->arch.old_guest_ptpg) ) - { - case -EINTR: --- -2.23.0 - diff --git a/system/xen/xsa/xsa299-4.12-0005-x86-mm-Rework-get_page_and_type_from_mfn-conditional.patch b/system/xen/xsa/xsa299-4.12-0005-x86-mm-Rework-get_page_and_type_from_mfn-conditional.patch deleted file mode 100644 index 9cfbb739079b5..0000000000000 --- a/system/xen/xsa/xsa299-4.12-0005-x86-mm-Rework-get_page_and_type_from_mfn-conditional.patch +++ /dev/null @@ -1,79 +0,0 @@ -From 6f257854c8778774210281c5c21028c4b7739b44 Mon Sep 17 00:00:00 2001 -From: George Dunlap <george.dunlap@citrix.com> -Date: Thu, 10 Oct 2019 17:57:49 +0100 -Subject: [PATCH 05/11] x86/mm: Rework get_page_and_type_from_mfn conditional - -Make it easier to read by declaring the conditions in which we will -retain the ref, rather than the conditions under which we release it. - -The only way (page == current->arch.old_guest_table) can be true is if -preemptible is true; so remove this from the query itself, and add an -ASSERT() to that effect on the opposite path. - -No functional change intended. - -NB that alloc_lN_table() mishandle the "linear pt failure" situation -described in the comment; this will be addressed in a future patch. - -This is part of XSA-299. - -Reported-by: George Dunlap <george.dunlap@citrix.com> -Signed-off-by: George Dunlap <george.dunlap@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> ---- - xen/arch/x86/mm.c | 39 +++++++++++++++++++++++++++++++++++++-- - 1 file changed, 37 insertions(+), 2 deletions(-) - -diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c -index 0740b61af8..0a4d39a2c3 100644 ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -1122,8 +1122,43 @@ static int get_page_and_type_from_mfn( - - rc = _get_page_type(page, type, preemptible); - -- if ( unlikely(rc) && !partial_ref && -- (!preemptible || page != current->arch.old_guest_table) ) -+ /* -+ * Retain the refcount if: -+ * - page is fully validated (rc == 0) -+ * - page is not validated (rc < 0) but: -+ * - We came in with a reference (partial_ref) -+ * - page is partially validated but there's been an error -+ * (page == current->arch.old_guest_table) -+ * -+ * The partial_ref-on-error clause is worth an explanation. There -+ * are two scenarios where partial_ref might be true coming in: -+ * - mfn has been partially demoted as type `type`; i.e. has -+ * PGT_partial set -+ * - mfn has been partially demoted as L(type+1) (i.e., a linear -+ * page; e.g. we're being called from get_page_from_l2e with -+ * type == PGT_l1_table, but the mfn is PGT_l2_table) -+ * -+ * If there's an error, in the first case, _get_page_type will -+ * either return -ERESTART, in which case we want to retain the -+ * ref (as the caller will consider it retained), or -EINVAL, in -+ * which case old_guest_table will be set; in both cases, we need -+ * to retain the ref. -+ * -+ * In the second case, if there's an error, _get_page_type() can -+ * *only* return -EINVAL, and *never* set old_guest_table. In -+ * that case we also want to retain the reference, to allow the -+ * page to continue to be torn down (i.e., PGT_partial cleared) -+ * safely. -+ * -+ * Also note that we shouldn't be able to leave with the reference -+ * count retained unless we succeeded, or the operation was -+ * preemptible. -+ */ -+ if ( likely(!rc) || partial_ref ) -+ /* nothing */; -+ else if ( page == current->arch.old_guest_table ) -+ ASSERT(preemptible); -+ else - put_page(page); - - return rc; --- -2.23.0 - diff --git a/system/xen/xsa/xsa299-4.12-0006-x86-mm-Have-alloc_l-23-_table-clear-partial_flags-wh.patch b/system/xen/xsa/xsa299-4.12-0006-x86-mm-Have-alloc_l-23-_table-clear-partial_flags-wh.patch deleted file mode 100644 index 72ee3eac9e4e5..0000000000000 --- a/system/xen/xsa/xsa299-4.12-0006-x86-mm-Have-alloc_l-23-_table-clear-partial_flags-wh.patch +++ /dev/null @@ -1,111 +0,0 @@ -From 4ad70553611a7a4e4494d5a3b51b5cc295a488e0 Mon Sep 17 00:00:00 2001 -From: George Dunlap <george.dunlap@citrix.com> -Date: Thu, 10 Oct 2019 17:57:49 +0100 -Subject: [PATCH 06/11] x86/mm: Have alloc_l[23]_table clear partial_flags when - preempting - -In order to allow recursive pagetable promotions and demotions to be -interrupted, Xen must keep track of the state of the sub-pages -promoted or demoted. This is stored in two elements in the page -struct: nr_entries_validated and partial_flags. - -The rule is that entries [0, nr_entries_validated) should always be -validated and hold a general reference count. If partial_flags is -zero, then [nr_entries_validated] is not validated and no reference -count is held. If PTF_partial_set is set, then [nr_entries_validated] -is partially validated. - -At the moment, a distinction is made between promotion and demotion -with regard to whether the entry itself "holds" a general reference -count: when entry promotion is interrupted (i.e., returns -ERESTART), -the entry is not considered to hold a reference; when entry demotion -is interrupted, the entry is still considered to hold a general -reference. - -PTF_partial_general_ref is used to distinguish between these cases. -If clear, it's a partial promotion => no general reference count held -by the entry; if set, it's partial demotion, so a general reference -count held. Because promotions and demotions can be interleaved, this -value is passed to get_page_and_type_from_mfn and put_page_from_l*e, -to be able to properly handle reference counts. - -Unfortunately, when alloc_l[23]_table check hypercall_preempt_check() -and return -ERESTART, they set nr_entries_validated, but don't clear -partial_flags. - -If we were picking up from a previously-interrupted promotion, that -means that PTF_partial_set would be set even though -[nr_entries_validated] was not partially validated. This means that -if the page in this state were de-validated, put_page_type() would -erroneously be called on that entry. - -Perhaps worse, if we were racing with a de-validation, then we might -leave both PTF_partial_set and PTF_partial_general_ref; and when -de-validation picked up again, both the type and the general ref would -be erroneously dropped from [nr_entries_validated]. - -In a sense, the real issue here is code duplication. Rather than -duplicate the interruption code, set rc to -EINTR and fall through to -the code which already handles that case correctly. - -Given the logic at this point, it should be impossible for -partial_flags to be non-zero; add an ASSERT() to catch any changes. - -This is part of XSA-299. - -Reported-by: George Dunlap <george.dunlap@citrix.com> -Signed-off-by: George Dunlap <george.dunlap@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> ---- - xen/arch/x86/mm.c | 25 ++++++------------------- - 1 file changed, 6 insertions(+), 19 deletions(-) - -diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c -index 0a4d39a2c3..bbd29a68f4 100644 ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -1554,21 +1554,13 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) - for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; - i++, partial_flags = 0 ) - { -- l2_pgentry_t l2e; -+ l2_pgentry_t l2e = pl2e[i]; - - if ( i > page->nr_validated_ptes && hypercall_preempt_check() ) -- { -- page->nr_validated_ptes = i; -- rc = -ERESTART; -- break; -- } -- -- if ( !is_guest_l2_slot(d, type, i) ) -+ rc = -EINTR; -+ else if ( !is_guest_l2_slot(d, type, i) ) - continue; -- -- l2e = pl2e[i]; -- -- if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) -+ else if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) - { - if ( !pv_l1tf_check_l2e(d, l2e) ) - continue; -@@ -1640,13 +1632,8 @@ static int alloc_l3_table(struct page_info *page) - l3_pgentry_t l3e = pl3e[i]; - - if ( i > page->nr_validated_ptes && hypercall_preempt_check() ) -- { -- page->nr_validated_ptes = i; -- rc = -ERESTART; -- break; -- } -- -- if ( is_pv_32bit_domain(d) && (i == 3) ) -+ rc = -EINTR; -+ else if ( is_pv_32bit_domain(d) && (i == 3) ) - { - if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || - (l3e_get_flags(l3e) & l3_disallow_mask(d)) ) --- -2.23.0 - diff --git a/system/xen/xsa/xsa299-4.12-0007-x86-mm-Always-retain-a-general-ref-on-partial.patch b/system/xen/xsa/xsa299-4.12-0007-x86-mm-Always-retain-a-general-ref-on-partial.patch deleted file mode 100644 index ef390e2b137db..0000000000000 --- a/system/xen/xsa/xsa299-4.12-0007-x86-mm-Always-retain-a-general-ref-on-partial.patch +++ /dev/null @@ -1,378 +0,0 @@ -From 51fe4e67d954649fcf103116be6206a769f0db1e Mon Sep 17 00:00:00 2001 -From: George Dunlap <george.dunlap@citrix.com> -Date: Thu, 10 Oct 2019 17:57:49 +0100 -Subject: [PATCH 07/11] x86/mm: Always retain a general ref on partial - -In order to allow recursive pagetable promotions and demotions to be -interrupted, Xen must keep track of the state of the sub-pages -promoted or demoted. This is stored in two elements in the page struct: -nr_entries_validated and partial_flags. - -The rule is that entries [0, nr_entries_validated) should always be -validated and hold a general reference count. If partial_flags is -zero, then [nr_entries_validated] is not validated and no reference -count is held. If PTF_partial_set is set, then [nr_entries_validated] -is partially validated. - -At the moment, a distinction is made between promotion and demotion -with regard to whether the entry itself "holds" a general reference -count: when entry promotion is interrupted (i.e., returns -ERESTART), -the entry is not considered to hold a reference; when entry demotion -is interrupted, the entry is still considered to hold a general -reference. - -PTF_partial_general_ref is used to distinguish between these cases. -If clear, it's a partial promotion => no general reference count held -by the entry; if set, it's partial demotion, so a general reference -count held. Because promotions and demotions can be interleaved, this -value is passed to get_page_and_type_from_mfn and put_page_from_l*e, -to be able to properly handle reference counts. - -Unfortunately, because a refcount is not held, it is possible to -engineer a situation where PFT_partial_set is set but the page in -question has been assigned to another domain. A sketch is provided in -the appendix. - -Fix this by having the parent page table entry hold a general -reference count whenever PFT_partial_set is set. (For clarity of -change, keep two separate flags. These will be collapsed in a -subsequent changeset.) - -This has two basic implications. On the put_page_from_lNe() side, -this mean that the (partial_set && !partial_ref) case can never happen, -and no longer needs to be special-cased. - -Secondly, because both flags are set together, there's no need to carry over -existing bits from partial_pte. - -(NB there is still another issue with calling _put_page_type() on a -page which had PGT_partial set; that will be handled in a subsequent -patch.) - -On the get_page_and_type_from_mfn() side, we need to distinguish -between callers which hold a reference on partial (i.e., -alloc_lN_table()), and those which do not (new_cr3, PIN_LN_TABLE, and -so on): pass a flag if the type should be retained on interruption. - -NB that since l1 promotion can't be preempted, that get_page_from_l2e -can't return -ERESTART. - -This is part of XSA-299. - -Reported-by: George Dunlap <george.dunlap@citrix.com> -Signed-off-by: George Dunlap <george.dunlap@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> ------ -* Appendix: Engineering PTF_partial_set while a page belongs to a - foreign domain - -Suppose A is a page which can be promoted to an l3, and B is a page -which can be promoted to an l2, and A[x] points to B. B has -PGC_allocated set but no other general references. - -V1: PIN_L3 A. - A is validated, B is validated. - A.type_count = 1 | PGT_validated | PGT_pinned - B.type_count = 1 | PGT_validated - B.count = 2 | PGC_allocated (A[x] holds a general ref) - -V1: UNPIN A. - A begins de-validation. - Arrange to be interrupted when i < x - V1->old_guest_table = A - V1->old_guest_table_ref_held = false - A.type_count = 1 | PGT_partial - A.nr_validated_entries = i < x - B.type_count = 0 - B.count = 1 | PGC_allocated - -V2: MOD_L4_ENTRY to point some l4e to A. - Picks up re-validation of A. - Arrange to be interrupted halfway through B's validation - B.type_count = 1 | PGT_partial - B.count = 2 | PGC_allocated (PGT_partial holds a general ref) - A.type_count = 1 | PGT_partial - A.nr_validated_entries = x - A.partial_pte = PTF_partial_set - -V3: MOD_L3_ENTRY to point some other l3e (not in A) to B. - Validates B. - B.type_count = 1 | PGT_validated - B.count = 2 | PGC_allocated ("other l3e" holds a general ref) - -V3: MOD_L3_ENTRY to clear l3e pointing to B. - Devalidates B. - B.type_count = 0 - B.count = 1 | PGC_allocated - -V3: decrease_reservation(B) - Clears PGC_allocated - B.count = 0 => B is freed - -B gets assigned to a different domain - -V1: Restarts UNPIN of A - put_old_guest_table(A) - ... - free_l3_table(A) - -Now since A.partial_flags has PTF_partial_set, free_l3_table() will -call put_page_from_l3e() on A[x], which points to B, while B is owned -by another domain. - -If A[x] held a general refcount for B on partial validation, as it does -for partial de-validation, then B would still have a reference count of -1 after PGC_allocated was freed; so B wouldn't be freed until after -put_page_from_l3e() had happend on A[x]. ---- - xen/arch/x86/mm.c | 84 +++++++++++++++++++++++----------------- - xen/include/asm-x86/mm.h | 15 ++++--- - 2 files changed, 58 insertions(+), 41 deletions(-) - -diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c -index bbd29a68f4..4d3ebf341d 100644 ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -1102,10 +1102,11 @@ get_page_from_l1e( - * page->pte[page->nr_validated_entries]. See the comment in mm.h for - * more information. - */ --#define PTF_partial_set (1 << 0) --#define PTF_partial_general_ref (1 << 1) --#define PTF_preemptible (1 << 2) --#define PTF_defer (1 << 3) -+#define PTF_partial_set (1 << 0) -+#define PTF_partial_general_ref (1 << 1) -+#define PTF_preemptible (1 << 2) -+#define PTF_defer (1 << 3) -+#define PTF_retain_ref_on_restart (1 << 4) - - static int get_page_and_type_from_mfn( - mfn_t mfn, unsigned long type, struct domain *d, -@@ -1114,7 +1115,11 @@ static int get_page_and_type_from_mfn( - struct page_info *page = mfn_to_page(mfn); - int rc; - bool preemptible = flags & PTF_preemptible, -- partial_ref = flags & PTF_partial_general_ref; -+ partial_ref = flags & PTF_partial_general_ref, -+ partial_set = flags & PTF_partial_set, -+ retain_ref = flags & PTF_retain_ref_on_restart; -+ -+ ASSERT(partial_ref == partial_set); - - if ( likely(!partial_ref) && - unlikely(!get_page_from_mfn(mfn, d)) ) -@@ -1127,13 +1132,15 @@ static int get_page_and_type_from_mfn( - * - page is fully validated (rc == 0) - * - page is not validated (rc < 0) but: - * - We came in with a reference (partial_ref) -+ * - page is partially validated (rc == -ERESTART), and the -+ * caller has asked the ref to be retained in that case - * - page is partially validated but there's been an error - * (page == current->arch.old_guest_table) - * - * The partial_ref-on-error clause is worth an explanation. There - * are two scenarios where partial_ref might be true coming in: -- * - mfn has been partially demoted as type `type`; i.e. has -- * PGT_partial set -+ * - mfn has been partially promoted / demoted as type `type`; -+ * i.e. has PGT_partial set - * - mfn has been partially demoted as L(type+1) (i.e., a linear - * page; e.g. we're being called from get_page_from_l2e with - * type == PGT_l1_table, but the mfn is PGT_l2_table) -@@ -1156,7 +1163,8 @@ static int get_page_and_type_from_mfn( - */ - if ( likely(!rc) || partial_ref ) - /* nothing */; -- else if ( page == current->arch.old_guest_table ) -+ else if ( page == current->arch.old_guest_table || -+ (retain_ref && rc == -ERESTART) ) - ASSERT(preemptible); - else - put_page(page); -@@ -1354,8 +1362,8 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, - if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == - PTF_partial_set ) - { -- ASSERT(!(flags & PTF_defer)); -- rc = _put_page_type(pg, PTF_preemptible, ptpg); -+ /* partial_set should always imply partial_ref */ -+ BUG(); - } - else if ( flags & PTF_defer ) - { -@@ -1400,8 +1408,8 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, - if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == - PTF_partial_set ) - { -- ASSERT(!(flags & PTF_defer)); -- return _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn))); -+ /* partial_set should always imply partial_ref */ -+ BUG(); - } - - if ( flags & PTF_defer ) -@@ -1431,8 +1439,8 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, - if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == - PTF_partial_set ) - { -- ASSERT(!(flags & PTF_defer)); -- return _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn))); -+ /* partial_set should always imply partial_ref */ -+ BUG(); - } - - if ( flags & PTF_defer ) -@@ -1569,13 +1577,22 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) - else - rc = get_page_from_l2e(l2e, pfn, d, partial_flags); - -- if ( rc == -ERESTART ) -- { -- page->nr_validated_ptes = i; -- /* Set 'set', retain 'general ref' */ -- page->partial_flags = partial_flags | PTF_partial_set; -- } -- else if ( rc == -EINTR && i ) -+ /* -+ * It shouldn't be possible for get_page_from_l2e to return -+ * -ERESTART, since we never call this with PTF_preemptible. -+ * (alloc_l1_table may return -EINTR on an L1TF-vulnerable -+ * entry.) -+ * -+ * NB that while on a "clean" promotion, we can never get -+ * PGT_partial. It is possible to arrange for an l2e to -+ * contain a partially-devalidated l2; but in that case, both -+ * of the following functions will fail anyway (the first -+ * because the page in question is not an l1; the second -+ * because the page is not fully validated). -+ */ -+ ASSERT(rc != -ERESTART); -+ -+ if ( rc == -EINTR && i ) - { - page->nr_validated_ptes = i; - page->partial_flags = 0; -@@ -1584,6 +1601,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) - else if ( rc < 0 && rc != -EINTR ) - { - gdprintk(XENLOG_WARNING, "Failure in alloc_l2_table: slot %#x\n", i); -+ ASSERT(current->arch.old_guest_table == NULL); - if ( i ) - { - page->nr_validated_ptes = i; -@@ -1642,7 +1660,7 @@ static int alloc_l3_table(struct page_info *page) - rc = get_page_and_type_from_mfn( - l3e_get_mfn(l3e), - PGT_l2_page_table | PGT_pae_xen_l2, d, -- partial_flags | PTF_preemptible); -+ partial_flags | PTF_preemptible | PTF_retain_ref_on_restart); - } - else if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ) - { -@@ -1651,13 +1669,14 @@ static int alloc_l3_table(struct page_info *page) - rc = -EINTR; - } - else -- rc = get_page_from_l3e(l3e, pfn, d, partial_flags); -+ rc = get_page_from_l3e(l3e, pfn, d, -+ partial_flags | PTF_retain_ref_on_restart); - - if ( rc == -ERESTART ) - { - page->nr_validated_ptes = i; - /* Set 'set', leave 'general ref' set if this entry was set */ -- page->partial_flags = partial_flags | PTF_partial_set; -+ page->partial_flags = PTF_partial_set | PTF_partial_general_ref; - } - else if ( rc == -EINTR && i ) - { -@@ -1833,13 +1852,14 @@ static int alloc_l4_table(struct page_info *page) - rc = -EINTR; - } - else -- rc = get_page_from_l4e(l4e, pfn, d, partial_flags); -+ rc = get_page_from_l4e(l4e, pfn, d, -+ partial_flags | PTF_retain_ref_on_restart); - - if ( rc == -ERESTART ) - { - page->nr_validated_ptes = i; - /* Set 'set', leave 'general ref' set if this entry was set */ -- page->partial_flags = partial_flags | PTF_partial_set; -+ page->partial_flags = PTF_partial_set | PTF_partial_general_ref; - } - else if ( rc < 0 ) - { -@@ -1936,9 +1956,7 @@ static int free_l2_table(struct page_info *page) - else if ( rc == -ERESTART ) - { - page->nr_validated_ptes = i; -- page->partial_flags = (partial_flags & PTF_partial_set) ? -- partial_flags : -- (PTF_partial_set | PTF_partial_general_ref); -+ page->partial_flags = PTF_partial_set | PTF_partial_general_ref; - } - else if ( rc == -EINTR && i < L2_PAGETABLE_ENTRIES - 1 ) - { -@@ -1986,9 +2004,7 @@ static int free_l3_table(struct page_info *page) - if ( rc == -ERESTART ) - { - page->nr_validated_ptes = i; -- page->partial_flags = (partial_flags & PTF_partial_set) ? -- partial_flags : -- (PTF_partial_set | PTF_partial_general_ref); -+ page->partial_flags = PTF_partial_set | PTF_partial_general_ref; - } - else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 ) - { -@@ -2019,9 +2035,7 @@ static int free_l4_table(struct page_info *page) - if ( rc == -ERESTART ) - { - page->nr_validated_ptes = i; -- page->partial_flags = (partial_flags & PTF_partial_set) ? -- partial_flags : -- (PTF_partial_set | PTF_partial_general_ref); -+ page->partial_flags = PTF_partial_set | PTF_partial_general_ref; - } - else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 ) - { -diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h -index 8406ac3c37..02079e1324 100644 ---- a/xen/include/asm-x86/mm.h -+++ b/xen/include/asm-x86/mm.h -@@ -238,22 +238,25 @@ struct page_info - * page. - * - * This happens: -- * - During de-validation, if de-validation of the page was -+ * - During validation or de-validation, if the operation was - * interrupted - * - During validation, if an invalid entry is encountered and - * validation is preemptible - * - During validation, if PTF_partial_general_ref was set on -- * this entry to begin with (perhaps because we're picking -- * up from a partial de-validation). -+ * this entry to begin with (perhaps because it picked up a -+ * previous operation) - * -- * When resuming validation, if PTF_partial_general_ref is clear, -- * then a general reference must be re-acquired; if it is set, no -- * reference should be acquired. -+ * When resuming validation, if PTF_partial_general_ref is -+ * clear, then a general reference must be re-acquired; if it -+ * is set, no reference should be acquired. - * - * When resuming de-validation, if PTF_partial_general_ref is - * clear, no reference should be dropped; if it is set, a - * reference should be dropped. - * -+ * NB at the moment, PTF_partial_set should be set if and only if -+ * PTF_partial_general_ref is set. -+ * - * NB that PTF_partial_set and PTF_partial_general_ref are - * defined in mm.c, the only place where they are used. - * --- -2.23.0 - diff --git a/system/xen/xsa/xsa299-4.12-0008-x86-mm-Collapse-PTF_partial_set-and-PTF_partial_gene.patch b/system/xen/xsa/xsa299-4.12-0008-x86-mm-Collapse-PTF_partial_set-and-PTF_partial_gene.patch deleted file mode 100644 index 6cf41d1cd6941..0000000000000 --- a/system/xen/xsa/xsa299-4.12-0008-x86-mm-Collapse-PTF_partial_set-and-PTF_partial_gene.patch +++ /dev/null @@ -1,227 +0,0 @@ -From 8a8d836f7f7418e659d37817a66cd7a6b115042b Mon Sep 17 00:00:00 2001 -From: George Dunlap <george.dunlap@citrix.com> -Date: Thu, 10 Oct 2019 17:57:49 +0100 -Subject: [PATCH 08/11] x86/mm: Collapse PTF_partial_set and - PTF_partial_general_ref into one - -...now that they are equivalent. No functional change intended. - -Reported-by: George Dunlap <george.dunlap@citrix.com> -Signed-off-by: George Dunlap <george.dunlap@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> ---- - xen/arch/x86/mm.c | 50 +++++++++++----------------------------- - xen/include/asm-x86/mm.h | 29 +++++++++++------------ - 2 files changed, 26 insertions(+), 53 deletions(-) - -diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c -index 4d3ebf341d..886e93b8aa 100644 ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -1097,13 +1097,12 @@ get_page_from_l1e( - - /* - * The following flags are used to specify behavior of various get and -- * put commands. The first two are also stored in page->partial_flags -- * to indicate the state of the page pointed to by -+ * put commands. The first is also stored in page->partial_flags to -+ * indicate the state of the page pointed to by - * page->pte[page->nr_validated_entries]. See the comment in mm.h for - * more information. - */ - #define PTF_partial_set (1 << 0) --#define PTF_partial_general_ref (1 << 1) - #define PTF_preemptible (1 << 2) - #define PTF_defer (1 << 3) - #define PTF_retain_ref_on_restart (1 << 4) -@@ -1115,13 +1114,10 @@ static int get_page_and_type_from_mfn( - struct page_info *page = mfn_to_page(mfn); - int rc; - bool preemptible = flags & PTF_preemptible, -- partial_ref = flags & PTF_partial_general_ref, - partial_set = flags & PTF_partial_set, - retain_ref = flags & PTF_retain_ref_on_restart; - -- ASSERT(partial_ref == partial_set); -- -- if ( likely(!partial_ref) && -+ if ( likely(!partial_set) && - unlikely(!get_page_from_mfn(mfn, d)) ) - return -EINVAL; - -@@ -1131,14 +1127,14 @@ static int get_page_and_type_from_mfn( - * Retain the refcount if: - * - page is fully validated (rc == 0) - * - page is not validated (rc < 0) but: -- * - We came in with a reference (partial_ref) -+ * - We came in with a reference (partial_set) - * - page is partially validated (rc == -ERESTART), and the - * caller has asked the ref to be retained in that case - * - page is partially validated but there's been an error - * (page == current->arch.old_guest_table) - * -- * The partial_ref-on-error clause is worth an explanation. There -- * are two scenarios where partial_ref might be true coming in: -+ * The partial_set-on-error clause is worth an explanation. There -+ * are two scenarios where partial_set might be true coming in: - * - mfn has been partially promoted / demoted as type `type`; - * i.e. has PGT_partial set - * - mfn has been partially demoted as L(type+1) (i.e., a linear -@@ -1161,7 +1157,7 @@ static int get_page_and_type_from_mfn( - * count retained unless we succeeded, or the operation was - * preemptible. - */ -- if ( likely(!rc) || partial_ref ) -+ if ( likely(!rc) || partial_set ) - /* nothing */; - else if ( page == current->arch.old_guest_table || - (retain_ref && rc == -ERESTART) ) -@@ -1359,13 +1355,7 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, - struct page_info *pg = l2e_get_page(l2e); - struct page_info *ptpg = mfn_to_page(_mfn(pfn)); - -- if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == -- PTF_partial_set ) -- { -- /* partial_set should always imply partial_ref */ -- BUG(); -- } -- else if ( flags & PTF_defer ) -+ if ( flags & PTF_defer ) - { - current->arch.old_guest_ptpg = ptpg; - current->arch.old_guest_table = pg; -@@ -1405,13 +1395,6 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, - - pg = l3e_get_page(l3e); - -- if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == -- PTF_partial_set ) -- { -- /* partial_set should always imply partial_ref */ -- BUG(); -- } -- - if ( flags & PTF_defer ) - { - current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn)); -@@ -1436,13 +1419,6 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, - { - struct page_info *pg = l4e_get_page(l4e); - -- if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) == -- PTF_partial_set ) -- { -- /* partial_set should always imply partial_ref */ -- BUG(); -- } -- - if ( flags & PTF_defer ) - { - current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn)); -@@ -1676,7 +1652,7 @@ static int alloc_l3_table(struct page_info *page) - { - page->nr_validated_ptes = i; - /* Set 'set', leave 'general ref' set if this entry was set */ -- page->partial_flags = PTF_partial_set | PTF_partial_general_ref; -+ page->partial_flags = PTF_partial_set; - } - else if ( rc == -EINTR && i ) - { -@@ -1859,7 +1835,7 @@ static int alloc_l4_table(struct page_info *page) - { - page->nr_validated_ptes = i; - /* Set 'set', leave 'general ref' set if this entry was set */ -- page->partial_flags = PTF_partial_set | PTF_partial_general_ref; -+ page->partial_flags = PTF_partial_set; - } - else if ( rc < 0 ) - { -@@ -1956,7 +1932,7 @@ static int free_l2_table(struct page_info *page) - else if ( rc == -ERESTART ) - { - page->nr_validated_ptes = i; -- page->partial_flags = PTF_partial_set | PTF_partial_general_ref; -+ page->partial_flags = PTF_partial_set; - } - else if ( rc == -EINTR && i < L2_PAGETABLE_ENTRIES - 1 ) - { -@@ -2004,7 +1980,7 @@ static int free_l3_table(struct page_info *page) - if ( rc == -ERESTART ) - { - page->nr_validated_ptes = i; -- page->partial_flags = PTF_partial_set | PTF_partial_general_ref; -+ page->partial_flags = PTF_partial_set; - } - else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 ) - { -@@ -2035,7 +2011,7 @@ static int free_l4_table(struct page_info *page) - if ( rc == -ERESTART ) - { - page->nr_validated_ptes = i; -- page->partial_flags = PTF_partial_set | PTF_partial_general_ref; -+ page->partial_flags = PTF_partial_set; - } - else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 ) - { -diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h -index 02079e1324..f0fd35bf6b 100644 ---- a/xen/include/asm-x86/mm.h -+++ b/xen/include/asm-x86/mm.h -@@ -233,7 +233,7 @@ struct page_info - * operation on the current page. (That page may or may not - * still have PGT_partial set.) - * -- * If PTF_partial_general_ref is set, then the PTE at -+ * Additionally, if PTF_partial_set is set, then the PTE at - * @nr_validated_ptef holds a general reference count for the - * page. - * -@@ -242,23 +242,20 @@ struct page_info - * interrupted - * - During validation, if an invalid entry is encountered and - * validation is preemptible -- * - During validation, if PTF_partial_general_ref was set on -- * this entry to begin with (perhaps because it picked up a -+ * - During validation, if PTF_partial_set was set on this -+ * entry to begin with (perhaps because it picked up a - * previous operation) - * -- * When resuming validation, if PTF_partial_general_ref is -- * clear, then a general reference must be re-acquired; if it -- * is set, no reference should be acquired. -+ * When resuming validation, if PTF_partial_set is clear, then -+ * a general reference must be re-acquired; if it is set, no -+ * reference should be acquired. - * -- * When resuming de-validation, if PTF_partial_general_ref is -- * clear, no reference should be dropped; if it is set, a -- * reference should be dropped. -+ * When resuming de-validation, if PTF_partial_set is clear, -+ * no reference should be dropped; if it is set, a reference -+ * should be dropped. - * -- * NB at the moment, PTF_partial_set should be set if and only if -- * PTF_partial_general_ref is set. -- * -- * NB that PTF_partial_set and PTF_partial_general_ref are -- * defined in mm.c, the only place where they are used. -+ * NB that PTF_partial_set is defined in mm.c, the only place -+ * where it is used. - * - * The 3rd field, @linear_pt_count, indicates - * - by a positive value, how many same-level page table entries a page -@@ -268,8 +265,8 @@ struct page_info - */ - struct { - u16 nr_validated_ptes:PAGETABLE_ORDER + 1; -- u16 :16 - PAGETABLE_ORDER - 1 - 2; -- u16 partial_flags:2; -+ u16 :16 - PAGETABLE_ORDER - 1 - 1; -+ u16 partial_flags:1; - s16 linear_pt_count; - }; - --- -2.23.0 - diff --git a/system/xen/xsa/xsa299-4.12-0009-x86-mm-Properly-handle-linear-pagetable-promotion-fa.patch b/system/xen/xsa/xsa299-4.12-0009-x86-mm-Properly-handle-linear-pagetable-promotion-fa.patch deleted file mode 100644 index bbaba794fc7bc..0000000000000 --- a/system/xen/xsa/xsa299-4.12-0009-x86-mm-Properly-handle-linear-pagetable-promotion-fa.patch +++ /dev/null @@ -1,106 +0,0 @@ -From da3d1d258e54fe600f7f75287183b74d957ec63b Mon Sep 17 00:00:00 2001 -From: George Dunlap <george.dunlap@citrix.com> -Date: Thu, 10 Oct 2019 17:57:49 +0100 -Subject: [PATCH 09/11] x86/mm: Properly handle linear pagetable promotion - failures - -In order to allow recursive pagetable promotions and demotions to be -interrupted, Xen must keep track of the state of the sub-pages -promoted or demoted. This is stored in two elements in the page -struct: nr_entries_validated and partial_flags. - -The rule is that entries [0, nr_entries_validated) should always be -validated and hold a general reference count. If partial_flags is -zero, then [nr_entries_validated] is not validated and no reference -count is held. If PTF_partial_set is set, then [nr_entries_validated] -is partially validated, and a general reference count is held. - -Unfortunately, in cases where an entry began with PTF_partial_set set, -and get_page_from_lNe() returns -EINVAL, the PTF_partial_set bit is -erroneously dropped. (This scenario can be engineered mainly by the -use of interleaving of promoting and demoting a page which has "linear -pagetable" entries; see the appendix for a sketch.) This means that -we will "leak" a general reference count on the page in question, -preventing the page from being freed. - -Fix this by setting page->partial_flags to the partial_flags local -variable. - -This is part of XSA-299. - -Reported-by: George Dunlap <george.dunlap@citrix.com> -Signed-off-by: George Dunlap <george.dunlap@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> ------ -Appendix - -Suppose A and B can both be promoted to L2 pages, and A[x] points to B. - -V1: PIN_L2 B. - B.type_count = 1 | PGT_validated - B.count = 2 | PGC_allocated - -V1: MOD_L3_ENTRY pointing something to A. - In the process of validating A[x], grab an extra type / ref on B: - B.type_count = 2 | PGT_validated - B.count = 3 | PGC_allocated - A.type_count = 1 | PGT_validated - A.count = 2 | PGC_allocated - -V1: UNPIN B. - B.type_count = 1 | PGT_validate - B.count = 2 | PGC_allocated - -V1: MOD_L3_ENTRY removing the reference to A. - De-validate A, down to A[x], which points to B. - Drop the final type on B. Arrange to be interrupted. - B.type_count = 1 | PGT_partial - B.count = 2 | PGC_allocated - A.type_count = 1 | PGT_partial - A.nr_validated_entries = x - A.partial_pte = -1 - -V2: MOD_L3_ENTRY adds a reference to A. - -At this point, get_page_from_l2e(A[x]) tries -get_page_and_type_from_mfn(), which fails because it's the wrong type; -and get_l2_linear_pagetable() also fails, because B isn't validated as -an l2 anymore. ---- - xen/arch/x86/mm.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c -index 886e93b8aa..0a094291da 100644 ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -1581,7 +1581,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) - if ( i ) - { - page->nr_validated_ptes = i; -- page->partial_flags = 0; -+ page->partial_flags = partial_flags; - current->arch.old_guest_ptpg = NULL; - current->arch.old_guest_table = page; - } -@@ -1674,7 +1674,7 @@ static int alloc_l3_table(struct page_info *page) - if ( i ) - { - page->nr_validated_ptes = i; -- page->partial_flags = 0; -+ page->partial_flags = partial_flags; - current->arch.old_guest_ptpg = NULL; - current->arch.old_guest_table = page; - } -@@ -1845,7 +1845,7 @@ static int alloc_l4_table(struct page_info *page) - if ( i ) - { - page->nr_validated_ptes = i; -- page->partial_flags = 0; -+ page->partial_flags = partial_flags; - if ( rc == -EINTR ) - rc = -ERESTART; - else --- -2.23.0 - diff --git a/system/xen/xsa/xsa299-4.12-0010-x86-mm-Fix-nested-de-validation-on-error.patch b/system/xen/xsa/xsa299-4.12-0010-x86-mm-Fix-nested-de-validation-on-error.patch deleted file mode 100644 index 7d5f022e892c4..0000000000000 --- a/system/xen/xsa/xsa299-4.12-0010-x86-mm-Fix-nested-de-validation-on-error.patch +++ /dev/null @@ -1,166 +0,0 @@ -From b3e169dc8daeae85b0b51c25fdb142e2e552ec7f Mon Sep 17 00:00:00 2001 -From: George Dunlap <george.dunlap@citrix.com> -Date: Thu, 10 Oct 2019 17:57:49 +0100 -Subject: [PATCH 10/11] x86/mm: Fix nested de-validation on error - -If an invalid entry is discovered when validating a page-table tree, -the entire tree which has so far been validated must be de-validated. -Since this may take a long time, alloc_l[2-4]_table() set current -vcpu's old_guest_table immediately; put_old_guest_table() will make -sure that put_page_type() will be called to finish off the -de-validation before any other MMU operations can happen on the vcpu. - -The invariant for partial pages should be: - -* Entries [0, nr_validated_ptes) should be completely validated; - put_page_type() will de-validate these. - -* If [nr_validated_ptes] is partially validated, partial_flags should - set PTF_partiaL_set. put_page_type() will be called on this page to - finish off devalidation, and the appropriate refcount adjustments - will be done. - -alloc_l[2-3]_table() indicates partial validation to its callers by -setting current->old_guest_table. - -Unfortunately, this is mishandled. - -Take the case where validating lNe[x] returns an error. - -First, alloc_l3_table() doesn't check old_guest_table at all; as a -result, partial_flags is not set when it should be. nr_validated_ptes -is set to x; and since PFT_partial_set clear, de-validation resumes at -nr_validated_ptes-1. This means that the l2 page at pl3e[x] will not -have put_page_type() called on it when de-validating the rest of the -l3: it will be stuck in the PGT_partial state until the domain is -destroyed, or until it is re-used as an l2. (Any other page type will -fail.) - -Worse, alloc_l4_table(), rather than setting PTF_partial_set as it -should, sets nr_validated_ptes to x+1. When de-validating, since -partial is 0, this will correctly resume calling put_page_type at [x]; -but, if the put_page_type() is never called, but instead -get_page_type() is called, validation will pick up at [x+1], -neglecting to validate [x]. If the rest of the validation succeeds, -the l4 will be validated even though [x] is invalid. - -Fix this in both cases by setting PTF_partial_set if old_guest_table -is set. - -While here, add some safety catches: -- old_guest_table must point to the page contained in - [nr_validated_ptes]. -- alloc_l1_page shouldn't set old_guest_table - -If we experience one of these situations in production builds, it's -safer to avoid calling put_page_type for the pages in question. If -they have PGT_partial set, they will be cleaned up on domain -destruction; if not, we have no idea whether a type count is safe to -drop. Retaining an extra type ref that should have been dropped may -trigger a BUG() on the free_domain_page() path, but dropping a type -count that shouldn't be dropped may cause a privilege escalation. - -This is part of XSA-299. - -Reported-by: George Dunlap <george.dunlap@citrix.com> -Signed-off-by: George Dunlap <george.dunlap@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> ---- - xen/arch/x86/mm.c | 53 +++++++++++++++++++++++++++++++++++++++++++++-- - 1 file changed, 51 insertions(+), 2 deletions(-) - -diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c -index 0a094291da..a432e69c74 100644 ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -1580,6 +1580,20 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) - ASSERT(current->arch.old_guest_table == NULL); - if ( i ) - { -+ /* -+ * alloc_l1_table() doesn't set old_guest_table; it does -+ * its own tear-down immediately on failure. If it -+ * did we'd need to check it and set partial_flags as we -+ * do in alloc_l[34]_table(). -+ * -+ * Note on the use of ASSERT: if it's non-null and -+ * hasn't been cleaned up yet, it should have -+ * PGT_partial set; and so the type will be cleaned up -+ * on domain destruction. Unfortunately, we would -+ * leak the general ref held by old_guest_table; but -+ * leaking a page is less bad than a host crash. -+ */ -+ ASSERT(current->arch.old_guest_table == NULL); - page->nr_validated_ptes = i; - page->partial_flags = partial_flags; - current->arch.old_guest_ptpg = NULL; -@@ -1607,6 +1621,7 @@ static int alloc_l3_table(struct page_info *page) - unsigned int i; - int rc = 0; - unsigned int partial_flags = page->partial_flags; -+ l3_pgentry_t l3e = l3e_empty(); - - pl3e = map_domain_page(_mfn(pfn)); - -@@ -1623,7 +1638,7 @@ static int alloc_l3_table(struct page_info *page) - for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES; - i++, partial_flags = 0 ) - { -- l3_pgentry_t l3e = pl3e[i]; -+ l3e = pl3e[i]; - - if ( i > page->nr_validated_ptes && hypercall_preempt_check() ) - rc = -EINTR; -@@ -1675,6 +1690,24 @@ static int alloc_l3_table(struct page_info *page) - { - page->nr_validated_ptes = i; - page->partial_flags = partial_flags; -+ if ( current->arch.old_guest_table ) -+ { -+ /* -+ * We've experienced a validation failure. If -+ * old_guest_table is set, "transfer" the general -+ * reference count to pl3e[nr_validated_ptes] by -+ * setting PTF_partial_set. -+ * -+ * As a precaution, check that old_guest_table is the -+ * page pointed to by pl3e[nr_validated_ptes]. If -+ * not, it's safer to leak a type ref on production -+ * builds. -+ */ -+ if ( current->arch.old_guest_table == l3e_get_page(l3e) ) -+ page->partial_flags = PTF_partial_set; -+ else -+ ASSERT_UNREACHABLE(); -+ } - current->arch.old_guest_ptpg = NULL; - current->arch.old_guest_table = page; - } -@@ -1851,7 +1884,23 @@ static int alloc_l4_table(struct page_info *page) - else - { - if ( current->arch.old_guest_table ) -- page->nr_validated_ptes++; -+ { -+ /* -+ * We've experienced a validation failure. If -+ * old_guest_table is set, "transfer" the general -+ * reference count to pl3e[nr_validated_ptes] by -+ * setting PTF_partial_set. -+ * -+ * As a precaution, check that old_guest_table is the -+ * page pointed to by pl4e[nr_validated_ptes]. If -+ * not, it's safer to leak a type ref on production -+ * builds. -+ */ -+ if ( current->arch.old_guest_table == l4e_get_page(l4e) ) -+ page->partial_flags = PTF_partial_set; -+ else -+ ASSERT_UNREACHABLE(); -+ } - current->arch.old_guest_ptpg = NULL; - current->arch.old_guest_table = page; - } --- -2.23.0 - diff --git a/system/xen/xsa/xsa299-4.12-0011-x86-mm-Don-t-drop-a-type-ref-unless-you-held-a-ref-t.patch b/system/xen/xsa/xsa299-4.12-0011-x86-mm-Don-t-drop-a-type-ref-unless-you-held-a-ref-t.patch deleted file mode 100644 index ad7e6fee1b9d9..0000000000000 --- a/system/xen/xsa/xsa299-4.12-0011-x86-mm-Don-t-drop-a-type-ref-unless-you-held-a-ref-t.patch +++ /dev/null @@ -1,413 +0,0 @@ -From ea3dc624c5e6325a9c2f079e52a85965d4ab6ce8 Mon Sep 17 00:00:00 2001 -From: George Dunlap <george.dunlap@citrix.com> -Date: Thu, 10 Oct 2019 17:57:50 +0100 -Subject: [PATCH 11/11] x86/mm: Don't drop a type ref unless you held a ref to - begin with - -Validation and de-validation of pagetable trees may take arbitrarily -large amounts of time, and so must be preemptible. This is indicated -by setting the PGT_partial bit in the type_info, and setting -nr_validated_entries and partial_flags appropriately. Specifically, -if the entry at [nr_validated_entries] is partially validated, -partial_flags should have the PGT_partial_set bit set, and the entry -should hold a general reference count. During de-validation, -put_page_type() is called on partially validated entries. - -Unfortunately, there are a number of issues with the current algorithm. - -First, doing a "normal" put_page_type() is not safe when no type ref -is held: there is nothing to stop another vcpu from coming along and -picking up validation again: at which point the put_page_type may drop -the only page ref on an in-use page. Some examples are listed in the -appendix. - -The core issue is that put_page_type() is being called both to clean -up PGT_partial, and to drop a type count; and has no way of knowing -which is which; and so if in between, PGT_partial is cleared, -put_page_type() will drop the type ref erroneously. - -What is needed is to distinguish between two states: -- Dropping a type ref which is held -- Cleaning up a page which has been partially de/validated - -Fix this by telling put_page_type() which of the two activities you -intend. - -When cleaning up a partial de/validation, take no action unless you -find a page partially validated. - -If put_page_type() is called without PTF_partial_set, and finds the -page in a PGT_partial state anyway, then there's certainly been a -misaccounting somewhere, and carrying on would almost certainly cause -a security issue, so crash the host instead. - -In put_page_from_lNe, pass partial_flags on to _put_page_type(). - -old_guest_table may be set either with a fully validated page (when -using the "deferred put" pattern), or with a partially validated page -(when a normal "de-validation" is interrupted, or when a validation -fails part-way through due to invalid entries). Add a flag, -old_guest_table_partial, to indicate which of these it is, and use -that to pass the appropriate flag to _put_page_type(). - -While here, delete stray trailing whitespace. - -This is part of XSA-299. - -Reported-by: George Dunlap <george.dunlap@citrix.com> -Signed-off-by: George Dunlap <george.dunlap@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> ------ -Appendix: - -Suppose page A, when interpreted as an l3 pagetable, contains all -valid entries; and suppose A[x] points to page B, which when -interpreted as an l2 pagetable, contains all valid entries. - -P1: PIN_L3_TABLE - A -> PGT_l3_table | 1 | valid - B -> PGT_l2_table | 1 | valid - -P1: UNPIN_TABLE - > Arrange to interrupt after B has been de-validated - B: - type_info -> PGT_l2_table | 0 - A: - type_info -> PGT_l3_table | 1 | partial - nr_validated_enties -> (less than x) - -P2: mod_l4_entry to point to A - > Arrange for this to be interrupted while B is being validated - B: - type_info -> PGT_l2_table | 1 | partial - (nr_validated_entires &c set as appropriate) - A: - type_info -> PGT_l3_table | 1 | partial - nr_validated_entries -> x - partial_pte = 1 - -P3: mod_l3_entry some other unrelated l3 to point to B: - B: - type_info -> PGT_l2_table | 1 - -P1: Restart UNPIN_TABLE - -At this point, since A.nr_validate_entries == x and A.partial_pte != -0, free_l3_table() will call put_page_from_l3e() on pl3e[x], dropping -its type count to 0 while it's still being pointed to by some other l3 - -A similar issue arises with old_guest_table. Consider the following -scenario: - -Suppose A is a page which, when interpreted as an l2, has valid entries -until entry x, which is invalid. - -V1: PIN_L2_TABLE(A) - <Validate until we try to validate [x], get -EINVAL> - A -> PGT_l2_table | 1 | PGT_partial - V1 -> old_guest_table = A - <delayed> - -V2: PIN_L2_TABLE(A) - <Pick up where V1 left off, try to re-validate [x], get -EINVAL> - A -> PGT_l2_table | 1 | PGT_partial - V2 -> old_guest_table = A - <restart> - put_old_guest_table() - _put_page_type(A) - A -> PGT_l2_table | 0 - -V1: <restart> - put_old_guest_table() - _put_page_type(A) # UNDERFLOW - -Indeed, it is possible to engineer for old_guest_table for every vcpu -a guest has to point to the same page. ---- - xen/arch/x86/domain.c | 6 +++ - xen/arch/x86/mm.c | 99 +++++++++++++++++++++++++++++++----- - xen/include/asm-x86/domain.h | 4 +- - 3 files changed, 95 insertions(+), 14 deletions(-) - -diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c -index 59df8a6d8d..f1ae5f89f5 100644 ---- a/xen/arch/x86/domain.c -+++ b/xen/arch/x86/domain.c -@@ -1104,9 +1104,15 @@ int arch_set_info_guest( - rc = -ERESTART; - /* Fallthrough */ - case -ERESTART: -+ /* -+ * NB that we're putting the kernel-mode table -+ * here, which we've already successfully -+ * validated above; hence partial = false; -+ */ - v->arch.old_guest_ptpg = NULL; - v->arch.old_guest_table = - pagetable_get_page(v->arch.guest_table); -+ v->arch.old_guest_table_partial = false; - v->arch.guest_table = pagetable_null(); - break; - default: -diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c -index a432e69c74..81774368a0 100644 ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -1359,10 +1359,11 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, - { - current->arch.old_guest_ptpg = ptpg; - current->arch.old_guest_table = pg; -+ current->arch.old_guest_table_partial = false; - } - else - { -- rc = _put_page_type(pg, PTF_preemptible, ptpg); -+ rc = _put_page_type(pg, flags | PTF_preemptible, ptpg); - if ( likely(!rc) ) - put_page(pg); - } -@@ -1385,6 +1386,7 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, - unsigned long mfn = l3e_get_pfn(l3e); - bool writeable = l3e_get_flags(l3e) & _PAGE_RW; - -+ ASSERT(!(flags & PTF_partial_set)); - ASSERT(!(mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1))); - do { - put_data_page(mfn_to_page(_mfn(mfn)), writeable); -@@ -1397,12 +1399,14 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, - - if ( flags & PTF_defer ) - { -+ ASSERT(!(flags & PTF_partial_set)); - current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn)); - current->arch.old_guest_table = pg; -+ current->arch.old_guest_table_partial = false; - return 0; - } - -- rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn))); -+ rc = _put_page_type(pg, flags | PTF_preemptible, mfn_to_page(_mfn(pfn))); - if ( likely(!rc) ) - put_page(pg); - -@@ -1421,12 +1425,15 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, - - if ( flags & PTF_defer ) - { -+ ASSERT(!(flags & PTF_partial_set)); - current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn)); - current->arch.old_guest_table = pg; -+ current->arch.old_guest_table_partial = false; - return 0; - } - -- rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn))); -+ rc = _put_page_type(pg, flags | PTF_preemptible, -+ mfn_to_page(_mfn(pfn))); - if ( likely(!rc) ) - put_page(pg); - } -@@ -1535,6 +1542,14 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) - - pl2e = map_domain_page(_mfn(pfn)); - -+ /* -+ * NB that alloc_l2_table will never set partial_pte on an l2; but -+ * free_l2_table might if a linear_pagetable entry is interrupted -+ * partway through de-validation. In that circumstance, -+ * get_page_from_l2e() will always return -EINVAL; and we must -+ * retain the type ref by doing the normal partial_flags tracking. -+ */ -+ - for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; - i++, partial_flags = 0 ) - { -@@ -1598,6 +1613,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) - page->partial_flags = partial_flags; - current->arch.old_guest_ptpg = NULL; - current->arch.old_guest_table = page; -+ current->arch.old_guest_table_partial = true; - } - } - if ( rc < 0 ) -@@ -1704,12 +1720,16 @@ static int alloc_l3_table(struct page_info *page) - * builds. - */ - if ( current->arch.old_guest_table == l3e_get_page(l3e) ) -+ { -+ ASSERT(current->arch.old_guest_table_partial); - page->partial_flags = PTF_partial_set; -+ } - else - ASSERT_UNREACHABLE(); - } - current->arch.old_guest_ptpg = NULL; - current->arch.old_guest_table = page; -+ current->arch.old_guest_table_partial = true; - } - while ( i-- > 0 ) - pl3e[i] = unadjust_guest_l3e(pl3e[i], d); -@@ -1897,12 +1917,16 @@ static int alloc_l4_table(struct page_info *page) - * builds. - */ - if ( current->arch.old_guest_table == l4e_get_page(l4e) ) -+ { -+ ASSERT(current->arch.old_guest_table_partial); - page->partial_flags = PTF_partial_set; -+ } - else - ASSERT_UNREACHABLE(); - } - current->arch.old_guest_ptpg = NULL; - current->arch.old_guest_table = page; -+ current->arch.old_guest_table_partial = true; - } - } - } -@@ -2831,6 +2855,28 @@ static int _put_page_type(struct page_info *page, unsigned int flags, - x = y; - nx = x - 1; - -+ /* -+ * Is this expected to do a full reference drop, or only -+ * cleanup partial validation / devalidation? -+ * -+ * If the former, the caller must hold a "full" type ref; -+ * which means the page must be validated. If the page is -+ * *not* fully validated, continuing would almost certainly -+ * open up a security hole. An exception to this is during -+ * domain destruction, where PGT_validated can be dropped -+ * without dropping a type ref. -+ * -+ * If the latter, do nothing unless type PGT_partial is set. -+ * If it is set, the type count must be 1. -+ */ -+ if ( !(flags & PTF_partial_set) ) -+ BUG_ON((x & PGT_partial) || -+ !((x & PGT_validated) || page_get_owner(page)->is_dying)); -+ else if ( !(x & PGT_partial) ) -+ return 0; -+ else -+ BUG_ON((x & PGT_count_mask) != 1); -+ - ASSERT((x & PGT_count_mask) != 0); - - switch ( nx & (PGT_locked | PGT_count_mask) ) -@@ -3092,17 +3138,34 @@ int put_old_guest_table(struct vcpu *v) - if ( !v->arch.old_guest_table ) - return 0; - -- switch ( rc = _put_page_type(v->arch.old_guest_table, PTF_preemptible, -- v->arch.old_guest_ptpg) ) -+ rc = _put_page_type(v->arch.old_guest_table, -+ PTF_preemptible | -+ ( v->arch.old_guest_table_partial ? -+ PTF_partial_set : 0 ), -+ v->arch.old_guest_ptpg); -+ -+ if ( rc == -ERESTART || rc == -EINTR ) - { -- case -EINTR: -- case -ERESTART: -+ v->arch.old_guest_table_partial = (rc == -ERESTART); - return -ERESTART; -- case 0: -- put_page(v->arch.old_guest_table); - } - -+ /* -+ * It shouldn't be possible for _put_page_type() to return -+ * anything else at the moment; but if it does happen in -+ * production, leaking the type ref is probably the best thing to -+ * do. Either way, drop the general ref held by old_guest_table. -+ */ -+ ASSERT(rc == 0); -+ -+ put_page(v->arch.old_guest_table); - v->arch.old_guest_table = NULL; -+ v->arch.old_guest_ptpg = NULL; -+ /* -+ * Safest default if someone sets old_guest_table without -+ * explicitly setting old_guest_table_partial. -+ */ -+ v->arch.old_guest_table_partial = true; - - return rc; - } -@@ -3253,11 +3316,11 @@ int new_guest_cr3(mfn_t mfn) - switch ( rc = put_page_and_type_preemptible(page) ) - { - case -EINTR: -- rc = -ERESTART; -- /* fallthrough */ - case -ERESTART: - curr->arch.old_guest_ptpg = NULL; - curr->arch.old_guest_table = page; -+ curr->arch.old_guest_table_partial = (rc == -ERESTART); -+ rc = -ERESTART; - break; - default: - BUG_ON(rc); -@@ -3494,6 +3557,7 @@ long do_mmuext_op( - { - curr->arch.old_guest_ptpg = NULL; - curr->arch.old_guest_table = page; -+ curr->arch.old_guest_table_partial = false; - } - } - } -@@ -3528,6 +3592,11 @@ long do_mmuext_op( - case -ERESTART: - curr->arch.old_guest_ptpg = NULL; - curr->arch.old_guest_table = page; -+ /* -+ * EINTR means we still hold the type ref; ERESTART -+ * means PGT_partial holds the type ref -+ */ -+ curr->arch.old_guest_table_partial = (rc == -ERESTART); - rc = 0; - break; - default: -@@ -3596,11 +3665,15 @@ long do_mmuext_op( - switch ( rc = put_page_and_type_preemptible(page) ) - { - case -EINTR: -- rc = -ERESTART; -- /* fallthrough */ - case -ERESTART: - curr->arch.old_guest_ptpg = NULL; - curr->arch.old_guest_table = page; -+ /* -+ * EINTR means we still hold the type ref; -+ * ERESTART means PGT_partial holds the ref -+ */ -+ curr->arch.old_guest_table_partial = (rc == -ERESTART); -+ rc = -ERESTART; - break; - default: - BUG_ON(rc); -diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h -index 214e44ce1c..2cfce7b36b 100644 ---- a/xen/include/asm-x86/domain.h -+++ b/xen/include/asm-x86/domain.h -@@ -307,7 +307,7 @@ struct arch_domain - - struct paging_domain paging; - struct p2m_domain *p2m; -- /* To enforce lock ordering in the pod code wrt the -+ /* To enforce lock ordering in the pod code wrt the - * page_alloc lock */ - int page_alloc_unlock_level; - -@@ -581,6 +581,8 @@ struct arch_vcpu - struct page_info *old_guest_table; /* partially destructed pagetable */ - struct page_info *old_guest_ptpg; /* containing page table of the */ - /* former, if any */ -+ bool old_guest_table_partial; /* Are we dropping a type ref, or just -+ * finishing up a partial de-validation? */ - /* guest_table holds a ref to the page, and also a type-count unless - * shadow refcounts are in use */ - pagetable_t shadow_table[4]; /* (MFN) shadow(s) of guest */ --- -2.23.0 - diff --git a/system/xen/xsa/xsa301-master-1.patch b/system/xen/xsa/xsa301-master-1.patch deleted file mode 100644 index 54cce2ce28456..0000000000000 --- a/system/xen/xsa/xsa301-master-1.patch +++ /dev/null @@ -1,80 +0,0 @@ -From 19d6330f142cb941b6340a88592e8a294de0ff8c Mon Sep 17 00:00:00 2001 -From: Julien Grall <julien.grall@arm.com> -Date: Tue, 15 Oct 2019 17:10:40 +0100 -Subject: [PATCH 1/3] xen/arm: p2m: Avoid aliasing guest physical frame - -The P2M helpers implementation is quite lax and will end up to ignore -the unused top bits of a guest physical frame. - -This effectively means that p2m_set_entry() will create a mapping for a -different frame (it is always equal to gfn & (mask unused bits)). Yet -p2m->max_mapped_gfn will be updated using the original frame. - -At the moment, p2m_get_entry() and p2m_resolve_translation_fault() -assume that p2m_get_root_pointer() will always return a non-NULL pointer -when the GFN is smaller than p2m->max_mapped_gfn. - -Unfortunately, because of the aliasing described above, it would be -possible to set p2m->max_mapped_gfn high enough so it covers frame that -would lead p2m_get_root_pointer() to return NULL. - -As we don't sanity check the guest physical frame provided by a guest, a -malicious guest could craft a series of hypercalls that will hit the -BUG_ON() and therefore DoS Xen. - -To prevent aliasing, the function p2m_get_root_pointer() is now reworked -to return NULL If any of the unused top bits are not zero. The caller -can then decide what's the appropriate action to do. Since the two paths -(i.e. P2M_ROOT_PAGES == 1 and P2M_ROOT_PAGES != 1) are now very -similarly, take the opportunity to consolidate them making the code a -bit simpler. - -With this change, p2m_get_entry() will not try to insert a mapping as -the root pointer is invalid. - -Note that root_table is now switch to unsigned long as unsigned int is -not enough to hold part of a GFN. - -This is part of XSA-301. - -Reported-by: Julien Grall <Julien.Grall@arm.com> -Signed-off-by: Julien Grall <julien.grall@arm.com> -Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> ---- - xen/arch/arm/p2m.c | 17 +++++------------ - 1 file changed, 5 insertions(+), 12 deletions(-) - -diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c -index a2749d9b6f..d0045a8b28 100644 ---- a/xen/arch/arm/p2m.c -+++ b/xen/arch/arm/p2m.c -@@ -229,21 +229,14 @@ void p2m_tlb_flush_sync(struct p2m_domain *p2m) - static lpae_t *p2m_get_root_pointer(struct p2m_domain *p2m, - gfn_t gfn) - { -- unsigned int root_table; -- -- if ( P2M_ROOT_PAGES == 1 ) -- return __map_domain_page(p2m->root); -+ unsigned long root_table; - - /* -- * Concatenated root-level tables. The table number will be the -- * offset at the previous level. It is not possible to -- * concatenate a level-0 root. -+ * While the root table index is the offset from the previous level, -+ * we can't use (P2M_ROOT_LEVEL - 1) because the root level might be -+ * 0. Yet we still want to check if all the unused bits are zeroed. - */ -- ASSERT(P2M_ROOT_LEVEL > 0); -- -- root_table = gfn_x(gfn) >> (level_orders[P2M_ROOT_LEVEL - 1]); -- root_table &= LPAE_ENTRY_MASK; -- -+ root_table = gfn_x(gfn) >> (level_orders[P2M_ROOT_LEVEL] + LPAE_SHIFT); - if ( root_table >= P2M_ROOT_PAGES ) - return NULL; - --- -2.23.0 - diff --git a/system/xen/xsa/xsa301-master-2.patch b/system/xen/xsa/xsa301-master-2.patch deleted file mode 100644 index baedc9c29751f..0000000000000 --- a/system/xen/xsa/xsa301-master-2.patch +++ /dev/null @@ -1,92 +0,0 @@ -From 3b896936f7505e929dd869d14afcb185d0ee75f8 Mon Sep 17 00:00:00 2001 -From: Julien Grall <julien.grall@arm.com> -Date: Tue, 15 Oct 2019 17:10:41 +0100 -Subject: [PATCH 2/3] xen/arm: p2m: Avoid off-by-one check on - p2m->max_mapped_gfn - -The code base is using inconsistently the field p2m->max_mapped_gfn. -Some of the useres expect that p2m->max_guest_gfn contain the highest -mapped GFN while others expect highest + 1. - -p2m->max_guest_gfn is set as highest + 1, because of that the sanity -check on the GFN in p2m_resolved_translation_fault() and -p2m_get_entry() can be bypassed when GFN == p2m->max_guest_gfn. - -p2m_get_root_pointer(p2m->max_guest_gfn) may return NULL if it is -outside of address range supported and therefore the BUG_ON() could be -hit. - -The current value hold in p2m->max_mapped_gfn is inconsistent with the -expectation of the common code (see domain_get_maximum_gpfn()) and also -the documentation of the field. - -Rather than changing the check in p2m_translation_fault() and -p2m_get_entry(), p2m->max_mapped_gfn is now containing the highest -mapped GFN and the callers assuming "highest + 1" are now adjusted. - -Take the opportunity to use 1UL rather than 1 as page_order could -theoritically big enough to overflow a 32-bit integer. - -Lastly, the documentation of the field max_guest_gfn to reflect how it -is computed. - -This is part of XSA-301. - -Reported-by: Julien Grall <Julien.Grall@arm.com> -Signed-off-by: Julien Grall <julien.grall@arm.com> -Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> ---- - xen/arch/arm/p2m.c | 6 +++--- - xen/include/asm-arm/p2m.h | 5 +---- - 2 files changed, 4 insertions(+), 7 deletions(-) - -diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c -index d0045a8b28..8d20d27961 100644 ---- a/xen/arch/arm/p2m.c -+++ b/xen/arch/arm/p2m.c -@@ -1041,7 +1041,7 @@ static int __p2m_set_entry(struct p2m_domain *p2m, - p2m_write_pte(entry, pte, p2m->clean_pte); - - p2m->max_mapped_gfn = gfn_max(p2m->max_mapped_gfn, -- gfn_add(sgfn, 1 << page_order)); -+ gfn_add(sgfn, (1UL << page_order) - 1)); - p2m->lowest_mapped_gfn = gfn_min(p2m->lowest_mapped_gfn, sgfn); - } - -@@ -1572,7 +1572,7 @@ int relinquish_p2m_mapping(struct domain *d) - p2m_write_lock(p2m); - - start = p2m->lowest_mapped_gfn; -- end = p2m->max_mapped_gfn; -+ end = gfn_add(p2m->max_mapped_gfn, 1); - - for ( ; gfn_x(start) < gfn_x(end); - start = gfn_next_boundary(start, order) ) -@@ -1641,7 +1641,7 @@ int p2m_cache_flush_range(struct domain *d, gfn_t *pstart, gfn_t end) - p2m_read_lock(p2m); - - start = gfn_max(start, p2m->lowest_mapped_gfn); -- end = gfn_min(end, p2m->max_mapped_gfn); -+ end = gfn_min(end, gfn_add(p2m->max_mapped_gfn, 1)); - - next_block_gfn = start; - -diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h -index 89f82df380..5fdb6e8183 100644 ---- a/xen/include/asm-arm/p2m.h -+++ b/xen/include/asm-arm/p2m.h -@@ -36,10 +36,7 @@ struct p2m_domain { - /* Current Translation Table Base Register for the p2m */ - uint64_t vttbr; - -- /* -- * Highest guest frame that's ever been mapped in the p2m -- * Only takes into account ram and foreign mapping -- */ -+ /* Highest guest frame that's ever been mapped in the p2m */ - gfn_t max_mapped_gfn; - - /* --- -2.23.0 - diff --git a/system/xen/xsa/xsa301-master-3.patch b/system/xen/xsa/xsa301-master-3.patch deleted file mode 100644 index 9f137b89f6ef2..0000000000000 --- a/system/xen/xsa/xsa301-master-3.patch +++ /dev/null @@ -1,67 +0,0 @@ -From 060c2dd3b7c2674a019d94afb2b4ebf3663f6c6e Mon Sep 17 00:00:00 2001 -From: Julien Grall <julien.grall@arm.com> -Date: Tue, 15 Oct 2019 17:10:42 +0100 -Subject: [PATCH 3/3] xen/arm: p2m: Don't check the return of - p2m_get_root_pointer() with BUG_ON() - -It turns out that the BUG_ON() was actually reachable with well-crafted -hypercalls. The BUG_ON() is here to prevent catch logical error, so -crashing Xen is a bit over the top. - -While all the holes should now be fixed, it would be better to downgrade -the BUG_ON() to something less fatal to prevent any more DoS. - -The BUG_ON() in p2m_get_entry() is now replaced by ASSERT_UNREACHABLE() -to catch mistake in debug build and return INVALID_MFN for production -build. The interface also requires to set page_order to give an idea of -the size of "hole". So 'level' is now set so we report a hole of size of -the an entry of the root page-table. This stays inline with what happen -when the GFN is higher than p2m->max_mapped_gfn. - -The BUG_ON() in p2m_resolve_translation_fault() is now replaced by -ASSERT_UNREACHABLE() to catch mistake in debug build and just report a -fault for producion build. - -This is part of XSA-301. - -Reported-by: Julien Grall <Julien.Grall@arm.com> -Signed-off-by: Julien Grall <julien.grall@arm.com> -Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> ---- - xen/arch/arm/p2m.c | 13 +++++++++++-- - 1 file changed, 11 insertions(+), 2 deletions(-) - -diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c -index 8d20d27961..ce59f2b503 100644 ---- a/xen/arch/arm/p2m.c -+++ b/xen/arch/arm/p2m.c -@@ -395,7 +395,12 @@ mfn_t p2m_get_entry(struct p2m_domain *p2m, gfn_t gfn, - * the table should always be non-NULL because the gfn is below - * p2m->max_mapped_gfn and the root table pages are always present. - */ -- BUG_ON(table == NULL); -+ if ( !table ) -+ { -+ ASSERT_UNREACHABLE(); -+ level = P2M_ROOT_LEVEL; -+ goto out; -+ } - - for ( level = P2M_ROOT_LEVEL; level < 3; level++ ) - { -@@ -1196,7 +1201,11 @@ bool p2m_resolve_translation_fault(struct domain *d, gfn_t gfn) - * The table should always be non-NULL because the gfn is below - * p2m->max_mapped_gfn and the root table pages are always present. - */ -- BUG_ON(table == NULL); -+ if ( !table ) -+ { -+ ASSERT_UNREACHABLE(); -+ goto out; -+ } - - /* - * Go down the page-tables until an entry has the valid bit unset or --- -2.23.0 - diff --git a/system/xen/xsa/xsa302-4.12-0001-IOMMU-add-missing-HVM-check.patch b/system/xen/xsa/xsa302-4.12-0001-IOMMU-add-missing-HVM-check.patch deleted file mode 100644 index 5d52163406f0c..0000000000000 --- a/system/xen/xsa/xsa302-4.12-0001-IOMMU-add-missing-HVM-check.patch +++ /dev/null @@ -1,37 +0,0 @@ -From 0c9c0fbb356e3210cb77b3d738be50981b26058a Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Wed, 2 Oct 2019 13:36:59 +0200 -Subject: [PATCH 1/2] IOMMU: add missing HVM check -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Fix an unguarded d->arch.hvm access in assign_device(). - -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> - -(cherry picked from commit 41fd1009cd7416b73d745a77c24b4e8d1a296fe6) -Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com> ---- - xen/drivers/passthrough/pci.c | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c -index 8108ed5f9a..d7420bd8bf 100644 ---- a/xen/drivers/passthrough/pci.c -+++ b/xen/drivers/passthrough/pci.c -@@ -1452,7 +1452,8 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) - - /* Prevent device assign if mem paging or mem sharing have been - * enabled for this domain */ -- if ( unlikely(d->arch.hvm.mem_sharing_enabled || -+ if ( unlikely((is_hvm_domain(d) && -+ d->arch.hvm.mem_sharing_enabled) || - vm_event_check_ring(d->vm_event_paging) || - p2m_get_hostp2m(d)->global_logdirty) ) - return -EXDEV; --- -2.11.0 - diff --git a/system/xen/xsa/xsa302-4.12-0002-passthrough-quarantine-PCI-devices.patch b/system/xen/xsa/xsa302-4.12-0002-passthrough-quarantine-PCI-devices.patch deleted file mode 100644 index 181ece3bb72a2..0000000000000 --- a/system/xen/xsa/xsa302-4.12-0002-passthrough-quarantine-PCI-devices.patch +++ /dev/null @@ -1,499 +0,0 @@ -From 278d8e585a9f110a1af0bd92a9fc43733c9c7227 Mon Sep 17 00:00:00 2001 -From: Paul Durrant <paul.durrant@citrix.com> -Date: Mon, 14 Oct 2019 17:52:59 +0100 -Subject: [PATCH 2/2] passthrough: quarantine PCI devices - -When a PCI device is assigned to an untrusted domain, it is possible for -that domain to program the device to DMA to an arbitrary address. The -IOMMU is used to protect the host from malicious DMA by making sure that -the device addresses can only target memory assigned to the guest. However, -when the guest domain is torn down the device is assigned back to dom0, -thus allowing any in-flight DMA to potentially target critical host data. - -This patch introduces a 'quarantine' for PCI devices using dom_io. When -the toolstack makes a device assignable (by binding it to pciback), it -will now also assign it to DOMID_IO and the device will only be assigned -back to dom0 when the device is made unassignable again. Whilst device is -assignable it will only ever transfer between dom_io and guest domains. -dom_io is actually only used as a sentinel domain for quarantining purposes; -it is not configured with any IOMMU mappings. Assignment to dom_io simply -means that the device's initiator (requestor) identifier is not present in -the IOMMU's device table and thus any DMA transactions issued will be -terminated with a fault condition. - -In addition, a fix to assignment handling is made for VT-d. Failure -during the assignment step should not lead to a device still being -associated with its prior owner. Hand the device to DomIO temporarily, -until the assignment step has completed successfully. Remove the PI -hooks from the source domain then earlier as well. - -Failure of the recovery reassign_device_ownership() may not go silent: -There e.g. may still be left over RMRR mappings in the domain assignment -to which has failed, and hence we can't allow that domain to continue -executing. - -NOTE: This patch also includes one printk() cleanup; the - "XEN_DOMCTL_assign_device: " tag is dropped in iommu_do_pci_domctl(), - since similar printk()-s elsewhere also don't log such a tag. - -This is XSA-302. - -Signed-off-by: Paul Durrant <paul.durrant@citrix.com> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com> -(cherry picked from commit ec99857f59f7f06236f11ca8b0b2303e5e745cc4) ---- - tools/libxl/libxl_pci.c | 25 +++++++++++- - xen/arch/x86/mm.c | 2 + - xen/common/domctl.c | 14 ++++++- - xen/drivers/passthrough/amd/pci_amd_iommu.c | 10 ++++- - xen/drivers/passthrough/iommu.c | 9 +++++ - xen/drivers/passthrough/pci.c | 59 ++++++++++++++++++++++------- - xen/drivers/passthrough/vtd/iommu.c | 40 ++++++++++++++++--- - xen/include/xen/pci.h | 3 ++ - 8 files changed, 138 insertions(+), 24 deletions(-) - -diff --git a/tools/libxl/libxl_pci.c b/tools/libxl/libxl_pci.c -index 88c324ea23..d6a23fb5f8 100644 ---- a/tools/libxl/libxl_pci.c -+++ b/tools/libxl/libxl_pci.c -@@ -754,6 +754,7 @@ static int libxl__device_pci_assignable_add(libxl__gc *gc, - libxl_device_pci *pcidev, - int rebind) - { -+ libxl_ctx *ctx = libxl__gc_owner(gc); - unsigned dom, bus, dev, func; - char *spath, *driver_path = NULL; - int rc; -@@ -779,7 +780,7 @@ static int libxl__device_pci_assignable_add(libxl__gc *gc, - } - if ( rc ) { - LOG(WARN, PCI_BDF" already assigned to pciback", dom, bus, dev, func); -- return 0; -+ goto quarantine; - } - - /* Check to see if there's already a driver that we need to unbind from */ -@@ -810,6 +811,19 @@ static int libxl__device_pci_assignable_add(libxl__gc *gc, - return ERROR_FAIL; - } - -+quarantine: -+ /* -+ * DOMID_IO is just a sentinel domain, without any actual mappings, -+ * so always pass XEN_DOMCTL_DEV_RDM_RELAXED to avoid assignment being -+ * unnecessarily denied. -+ */ -+ rc = xc_assign_device(ctx->xch, DOMID_IO, pcidev_encode_bdf(pcidev), -+ XEN_DOMCTL_DEV_RDM_RELAXED); -+ if ( rc < 0 ) { -+ LOG(ERROR, "failed to quarantine "PCI_BDF, dom, bus, dev, func); -+ return ERROR_FAIL; -+ } -+ - return 0; - } - -@@ -817,9 +831,18 @@ static int libxl__device_pci_assignable_remove(libxl__gc *gc, - libxl_device_pci *pcidev, - int rebind) - { -+ libxl_ctx *ctx = libxl__gc_owner(gc); - int rc; - char *driver_path; - -+ /* De-quarantine */ -+ rc = xc_deassign_device(ctx->xch, DOMID_IO, pcidev_encode_bdf(pcidev)); -+ if ( rc < 0 ) { -+ LOG(ERROR, "failed to de-quarantine "PCI_BDF, pcidev->domain, pcidev->bus, -+ pcidev->dev, pcidev->func); -+ return ERROR_FAIL; -+ } -+ - /* Unbind from pciback */ - if ( (rc=pciback_dev_is_assigned(gc, pcidev)) < 0 ) { - return ERROR_FAIL; -diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c -index 3557cd1178..11d753d8d2 100644 ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -295,9 +295,11 @@ void __init arch_init_memory(void) - * Initialise our DOMID_IO domain. - * This domain owns I/O pages that are within the range of the page_info - * array. Mappings occur at the priv of the caller. -+ * Quarantined PCI devices will be associated with this domain. - */ - dom_io = domain_create(DOMID_IO, NULL, false); - BUG_ON(IS_ERR(dom_io)); -+ INIT_LIST_HEAD(&dom_io->arch.pdev_list); - - /* - * Initialise our COW domain. -diff --git a/xen/common/domctl.c b/xen/common/domctl.c -index d08b6274e2..e3c4be2b48 100644 ---- a/xen/common/domctl.c -+++ b/xen/common/domctl.c -@@ -391,6 +391,16 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) - - switch ( op->cmd ) - { -+ case XEN_DOMCTL_assign_device: -+ case XEN_DOMCTL_deassign_device: -+ if ( op->domain == DOMID_IO ) -+ { -+ d = dom_io; -+ break; -+ } -+ else if ( op->domain == DOMID_INVALID ) -+ return -ESRCH; -+ /* fall through */ - case XEN_DOMCTL_test_assign_device: - if ( op->domain == DOMID_INVALID ) - { -@@ -412,7 +422,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) - - if ( !domctl_lock_acquire() ) - { -- if ( d ) -+ if ( d && d != dom_io ) - rcu_unlock_domain(d); - return hypercall_create_continuation( - __HYPERVISOR_domctl, "h", u_domctl); -@@ -1074,7 +1084,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) - domctl_lock_release(); - - domctl_out_unlock_domonly: -- if ( d ) -+ if ( d && d != dom_io ) - rcu_unlock_domain(d); - - if ( copyback && __copy_to_guest(u_domctl, op, 1) ) -diff --git a/xen/drivers/passthrough/amd/pci_amd_iommu.c b/xen/drivers/passthrough/amd/pci_amd_iommu.c -index 33a3798f36..15c13e1163 100644 ---- a/xen/drivers/passthrough/amd/pci_amd_iommu.c -+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c -@@ -120,6 +120,10 @@ static void amd_iommu_setup_domain_device( - u8 bus = pdev->bus; - const struct domain_iommu *hd = dom_iommu(domain); - -+ /* dom_io is used as a sentinel for quarantined devices */ -+ if ( domain == dom_io ) -+ return; -+ - BUG_ON( !hd->arch.root_table || !hd->arch.paging_mode || - !iommu->dev_table.buffer ); - -@@ -277,6 +281,10 @@ void amd_iommu_disable_domain_device(struct domain *domain, - int req_id; - u8 bus = pdev->bus; - -+ /* dom_io is used as a sentinel for quarantined devices */ -+ if ( domain == dom_io ) -+ return; -+ - BUG_ON ( iommu->dev_table.buffer == NULL ); - req_id = get_dma_requestor_id(iommu->seg, PCI_BDF2(bus, devfn)); - dte = iommu->dev_table.buffer + (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE); -@@ -363,7 +371,7 @@ static int amd_iommu_assign_device(struct domain *d, u8 devfn, - ivrs_mappings[req_id].read_permission); - } - -- return reassign_device(hardware_domain, d, devfn, pdev); -+ return reassign_device(pdev->domain, d, devfn, pdev); - } - - static void deallocate_next_page_table(struct page_info *pg, int level) -diff --git a/xen/drivers/passthrough/iommu.c b/xen/drivers/passthrough/iommu.c -index a6697d58fb..2762e1342f 100644 ---- a/xen/drivers/passthrough/iommu.c -+++ b/xen/drivers/passthrough/iommu.c -@@ -232,6 +232,9 @@ void iommu_teardown(struct domain *d) - { - struct domain_iommu *hd = dom_iommu(d); - -+ if ( d == dom_io ) -+ return; -+ - hd->status = IOMMU_STATUS_disabled; - hd->platform_ops->teardown(d); - tasklet_schedule(&iommu_pt_cleanup_tasklet); -@@ -241,6 +244,9 @@ int iommu_construct(struct domain *d) - { - struct domain_iommu *hd = dom_iommu(d); - -+ if ( d == dom_io ) -+ return 0; -+ - if ( hd->status == IOMMU_STATUS_initialized ) - return 0; - -@@ -521,6 +527,9 @@ int __init iommu_setup(void) - printk("I/O virtualisation %sabled\n", iommu_enabled ? "en" : "dis"); - if ( iommu_enabled ) - { -+ if ( iommu_domain_init(dom_io) ) -+ panic("Could not set up quarantine\n"); -+ - printk(" - Dom0 mode: %s\n", - iommu_hwdom_passthrough ? "Passthrough" : - iommu_hwdom_strict ? "Strict" : "Relaxed"); -diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c -index d7420bd8bf..d66a8a1daf 100644 ---- a/xen/drivers/passthrough/pci.c -+++ b/xen/drivers/passthrough/pci.c -@@ -1426,19 +1426,29 @@ static int iommu_remove_device(struct pci_dev *pdev) - return hd->platform_ops->remove_device(pdev->devfn, pci_to_dev(pdev)); - } - --/* -- * If the device isn't owned by the hardware domain, it means it already -- * has been assigned to other domain, or it doesn't exist. -- */ - static int device_assigned(u16 seg, u8 bus, u8 devfn) - { - struct pci_dev *pdev; -+ int rc = 0; - - pcidevs_lock(); -- pdev = pci_get_pdev_by_domain(hardware_domain, seg, bus, devfn); -+ -+ pdev = pci_get_pdev(seg, bus, devfn); -+ -+ if ( !pdev ) -+ rc = -ENODEV; -+ /* -+ * If the device exists and it is not owned by either the hardware -+ * domain or dom_io then it must be assigned to a guest, or be -+ * hidden (owned by dom_xen). -+ */ -+ else if ( pdev->domain != hardware_domain && -+ pdev->domain != dom_io ) -+ rc = -EBUSY; -+ - pcidevs_unlock(); - -- return pdev ? 0 : -EBUSY; -+ return rc; - } - - static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) -@@ -1452,7 +1462,8 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) - - /* Prevent device assign if mem paging or mem sharing have been - * enabled for this domain */ -- if ( unlikely((is_hvm_domain(d) && -+ if ( d != dom_io && -+ unlikely((is_hvm_domain(d) && - d->arch.hvm.mem_sharing_enabled) || - vm_event_check_ring(d->vm_event_paging) || - p2m_get_hostp2m(d)->global_logdirty) ) -@@ -1468,12 +1479,20 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) - return rc; - } - -- pdev = pci_get_pdev_by_domain(hardware_domain, seg, bus, devfn); -+ pdev = pci_get_pdev(seg, bus, devfn); -+ -+ rc = -ENODEV; - if ( !pdev ) -- { -- rc = pci_get_pdev(seg, bus, devfn) ? -EBUSY : -ENODEV; - goto done; -- } -+ -+ rc = 0; -+ if ( d == pdev->domain ) -+ goto done; -+ -+ rc = -EBUSY; -+ if ( pdev->domain != hardware_domain && -+ pdev->domain != dom_io ) -+ goto done; - - if ( pdev->msix ) - msixtbl_init(d); -@@ -1496,6 +1515,10 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) - } - - done: -+ /* The device is assigned to dom_io so mark it as quarantined */ -+ if ( !rc && d == dom_io ) -+ pdev->quarantine = true; -+ - if ( !has_arch_pdevs(d) && has_iommu_pt(d) ) - iommu_teardown(d); - pcidevs_unlock(); -@@ -1508,6 +1531,7 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn) - { - const struct domain_iommu *hd = dom_iommu(d); - struct pci_dev *pdev = NULL; -+ struct domain *target; - int ret = 0; - - if ( !iommu_enabled || !hd->platform_ops ) -@@ -1518,12 +1542,16 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn) - if ( !pdev ) - return -ENODEV; - -+ /* De-assignment from dom_io should de-quarantine the device */ -+ target = (pdev->quarantine && pdev->domain != dom_io) ? -+ dom_io : hardware_domain; -+ - while ( pdev->phantom_stride ) - { - devfn += pdev->phantom_stride; - if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) ) - break; -- ret = hd->platform_ops->reassign_device(d, hardware_domain, devfn, -+ ret = hd->platform_ops->reassign_device(d, target, devfn, - pci_to_dev(pdev)); - if ( !ret ) - continue; -@@ -1534,7 +1562,7 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn) - } - - devfn = pdev->devfn; -- ret = hd->platform_ops->reassign_device(d, hardware_domain, devfn, -+ ret = hd->platform_ops->reassign_device(d, target, devfn, - pci_to_dev(pdev)); - if ( ret ) - { -@@ -1544,6 +1572,9 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn) - return ret; - } - -+ if ( pdev->domain == hardware_domain ) -+ pdev->quarantine = false; -+ - pdev->fault.count = 0; - - if ( !has_arch_pdevs(d) && has_iommu_pt(d) ) -@@ -1722,7 +1753,7 @@ int iommu_do_pci_domctl( - ret = hypercall_create_continuation(__HYPERVISOR_domctl, - "h", u_domctl); - else if ( ret ) -- printk(XENLOG_G_ERR "XEN_DOMCTL_assign_device: " -+ printk(XENLOG_G_ERR - "assign %04x:%02x:%02x.%u to dom%d failed (%d)\n", - seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), - d->domain_id, ret); -diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c -index 1db1cd9f2d..a8d1baa064 100644 ---- a/xen/drivers/passthrough/vtd/iommu.c -+++ b/xen/drivers/passthrough/vtd/iommu.c -@@ -1338,6 +1338,10 @@ int domain_context_mapping_one( - int agaw, rc, ret; - bool_t flush_dev_iotlb; - -+ /* dom_io is used as a sentinel for quarantined devices */ -+ if ( domain == dom_io ) -+ return 0; -+ - ASSERT(pcidevs_locked()); - spin_lock(&iommu->lock); - maddr = bus_to_context_maddr(iommu, bus); -@@ -1573,6 +1577,10 @@ int domain_context_unmap_one( - int iommu_domid, rc, ret; - bool_t flush_dev_iotlb; - -+ /* dom_io is used as a sentinel for quarantined devices */ -+ if ( domain == dom_io ) -+ return 0; -+ - ASSERT(pcidevs_locked()); - spin_lock(&iommu->lock); - -@@ -1705,6 +1713,10 @@ static int domain_context_unmap(struct domain *domain, u8 devfn, - goto out; - } - -+ /* dom_io is used as a sentinel for quarantined devices */ -+ if ( domain == dom_io ) -+ goto out; -+ - /* - * if no other devices under the same iommu owned by this domain, - * clear iommu in iommu_bitmap and clear domain_id in domid_bitmp -@@ -2441,6 +2453,15 @@ static int reassign_device_ownership( - if ( ret ) - return ret; - -+ if ( devfn == pdev->devfn ) -+ { -+ list_move(&pdev->domain_list, &dom_io->arch.pdev_list); -+ pdev->domain = dom_io; -+ } -+ -+ if ( !has_arch_pdevs(source) ) -+ vmx_pi_hooks_deassign(source); -+ - if ( !has_arch_pdevs(target) ) - vmx_pi_hooks_assign(target); - -@@ -2459,15 +2480,13 @@ static int reassign_device_ownership( - pdev->domain = target; - } - -- if ( !has_arch_pdevs(source) ) -- vmx_pi_hooks_deassign(source); -- - return ret; - } - - static int intel_iommu_assign_device( - struct domain *d, u8 devfn, struct pci_dev *pdev, u32 flag) - { -+ struct domain *s = pdev->domain; - struct acpi_rmrr_unit *rmrr; - int ret = 0, i; - u16 bdf, seg; -@@ -2510,8 +2529,8 @@ static int intel_iommu_assign_device( - } - } - -- ret = reassign_device_ownership(hardware_domain, d, devfn, pdev); -- if ( ret ) -+ ret = reassign_device_ownership(s, d, devfn, pdev); -+ if ( ret || d == dom_io ) - return ret; - - /* Setup rmrr identity mapping */ -@@ -2524,11 +2543,20 @@ static int intel_iommu_assign_device( - ret = rmrr_identity_mapping(d, 1, rmrr, flag); - if ( ret ) - { -- reassign_device_ownership(d, hardware_domain, devfn, pdev); -+ int rc; -+ -+ rc = reassign_device_ownership(d, s, devfn, pdev); - printk(XENLOG_G_ERR VTDPREFIX - " cannot map reserved region (%"PRIx64",%"PRIx64"] for Dom%d (%d)\n", - rmrr->base_address, rmrr->end_address, - d->domain_id, ret); -+ if ( rc ) -+ { -+ printk(XENLOG_ERR VTDPREFIX -+ " failed to reclaim %04x:%02x:%02x.%u from %pd (%d)\n", -+ seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), d, rc); -+ domain_crash(d); -+ } - break; - } - } -diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h -index 8b21e8dc84..a031fd6020 100644 ---- a/xen/include/xen/pci.h -+++ b/xen/include/xen/pci.h -@@ -88,6 +88,9 @@ struct pci_dev { - - nodeid_t node; /* NUMA node */ - -+ /* Device to be quarantined, don't automatically re-assign to dom0 */ -+ bool quarantine; -+ - /* Device with errata, ignore the BARs. */ - bool ignore_bars; - --- -2.11.0 - diff --git a/system/xen/xsa/xsa303-0001-xen-arm32-entry-Split-__DEFINE_ENTRY_TRAP-in-two.patch b/system/xen/xsa/xsa303-0001-xen-arm32-entry-Split-__DEFINE_ENTRY_TRAP-in-two.patch deleted file mode 100644 index afb1096c1d30b..0000000000000 --- a/system/xen/xsa/xsa303-0001-xen-arm32-entry-Split-__DEFINE_ENTRY_TRAP-in-two.patch +++ /dev/null @@ -1,74 +0,0 @@ -From c8cb33fa64c9ccbfa2a494a9dad2e0a763c09176 Mon Sep 17 00:00:00 2001 -From: Julien Grall <julien.grall@arm.com> -Date: Tue, 1 Oct 2019 13:07:53 +0100 -Subject: [PATCH 1/4] xen/arm32: entry: Split __DEFINE_ENTRY_TRAP in two - -The preprocessing macro __DEFINE_ENTRY_TRAP is used to generate trap -entry function. While the macro is fairly small today, follow-up patches -will increase the size signicantly. - -In general, assembly macros are more readable as they allow you to name -parameters and avoid '\'. So the actual implementation of the trap is -now switched to an assembly macro. - -This is part of XSA-303. - -Reported-by: Julien Grall <Julien.Grall@arm.com> -Signed-off-by: Julien Grall <julien.grall@arm.com> -Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> -Reviewed-by: Andre Przywara <andre.przywara@arm.com> ---- - xen/arch/arm/arm32/entry.S | 34 +++++++++++++++++++--------------- - 1 file changed, 19 insertions(+), 15 deletions(-) - -diff --git a/xen/arch/arm/arm32/entry.S b/xen/arch/arm/arm32/entry.S -index 0b4cd19abd..4a762e04f1 100644 ---- a/xen/arch/arm/arm32/entry.S -+++ b/xen/arch/arm/arm32/entry.S -@@ -126,24 +126,28 @@ abort_guest_exit_end: - skip_check: - mov pc, lr - --/* -- * Macro to define trap entry. The iflags corresponds to the list of -- * interrupts (Asynchronous Abort, IRQ, FIQ) to unmask. -- */ -+ /* -+ * Macro to define trap entry. The iflags corresponds to the list of -+ * interrupts (Asynchronous Abort, IRQ, FIQ) to unmask. -+ */ -+ .macro vector trap, iflags -+ SAVE_ALL -+ cpsie \iflags -+ adr lr, return_from_trap -+ mov r0, sp -+ /* -+ * Save the stack pointer in r11. It will be restored after the -+ * trap has been handled (see return_from_trap). -+ */ -+ mov r11, sp -+ bic sp, #7 /* Align the stack pointer (noop on guest trap) */ -+ b do_trap_\trap -+ .endm -+ - #define __DEFINE_TRAP_ENTRY(trap, iflags) \ - ALIGN; \ - trap_##trap: \ -- SAVE_ALL; \ -- cpsie iflags; \ -- adr lr, return_from_trap; \ -- mov r0, sp; \ -- /* \ -- * Save the stack pointer in r11. It will be restored after the \ -- * trap has been handled (see return_from_trap). \ -- */ \ -- mov r11, sp; \ -- bic sp, #7; /* Align the stack pointer (noop on guest trap) */ \ -- b do_trap_##trap -+ vector trap, iflags - - /* Trap handler which unmask IRQ/Abort, keep FIQ masked */ - #define DEFINE_TRAP_ENTRY(trap) __DEFINE_TRAP_ENTRY(trap, ai) --- -2.11.0 - diff --git a/system/xen/xsa/xsa303-0002-xen-arm32-entry-Fold-the-macro-SAVE_ALL-in-the-macro.patch b/system/xen/xsa/xsa303-0002-xen-arm32-entry-Fold-the-macro-SAVE_ALL-in-the-macro.patch deleted file mode 100644 index 35f9c0475e62c..0000000000000 --- a/system/xen/xsa/xsa303-0002-xen-arm32-entry-Fold-the-macro-SAVE_ALL-in-the-macro.patch +++ /dev/null @@ -1,97 +0,0 @@ -From be7379207c83fa74f8a6c22a8ea213f02714776f Mon Sep 17 00:00:00 2001 -From: Julien Grall <julien.grall@arm.com> -Date: Tue, 1 Oct 2019 13:15:48 +0100 -Subject: [PATCH 2/4] xen/arm32: entry: Fold the macro SAVE_ALL in the macro - vector - -Follow-up rework will require the macro vector to distinguish between -a trap from a guest vs while in the hypervisor. - -The macro SAVE_ALL already has code to distinguish between the two and -it is only called by the vector macro. So fold the former into the -latter. This will help to avoid duplicating the check. - -This is part of XSA-303. - -Reported-by: Julien Grall <Julien.Grall@arm.com> -Signed-off-by: Julien Grall <julien.grall@arm.com> -Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> -Reviewed-by: Andre Przywara <andre.przywara@arm.com> ---- - xen/arch/arm/arm32/entry.S | 46 +++++++++++++++++++++++----------------------- - 1 file changed, 23 insertions(+), 23 deletions(-) - -diff --git a/xen/arch/arm/arm32/entry.S b/xen/arch/arm/arm32/entry.S -index 4a762e04f1..150cbc0b4b 100644 ---- a/xen/arch/arm/arm32/entry.S -+++ b/xen/arch/arm/arm32/entry.S -@@ -13,27 +13,6 @@ - #define RESTORE_BANKED(mode) \ - RESTORE_ONE_BANKED(SP_##mode) ; RESTORE_ONE_BANKED(LR_##mode) ; RESTORE_ONE_BANKED(SPSR_##mode) - --#define SAVE_ALL \ -- sub sp, #(UREGS_SP_usr - UREGS_sp); /* SP, LR, SPSR, PC */ \ -- push {r0-r12}; /* Save R0-R12 */ \ -- \ -- mrs r11, ELR_hyp; /* ELR_hyp is return address. */\ -- str r11, [sp, #UREGS_pc]; \ -- \ -- str lr, [sp, #UREGS_lr]; \ -- \ -- add r11, sp, #UREGS_kernel_sizeof+4; \ -- str r11, [sp, #UREGS_sp]; \ -- \ -- mrc CP32(r11, HSR); /* Save exception syndrome */ \ -- str r11, [sp, #UREGS_hsr]; \ -- \ -- mrs r11, SPSR_hyp; \ -- str r11, [sp, #UREGS_cpsr]; \ -- and r11, #PSR_MODE_MASK; \ -- cmp r11, #PSR_MODE_HYP; \ -- blne save_guest_regs -- - save_guest_regs: - #ifdef CONFIG_ARM32_HARDEN_BRANCH_PREDICTOR - /* -@@ -52,7 +31,7 @@ save_guest_regs: - ldr r11, =0xffffffff /* Clobber SP which is only valid for hypervisor frames. */ - str r11, [sp, #UREGS_sp] - SAVE_ONE_BANKED(SP_usr) -- /* LR_usr is the same physical register as lr and is saved in SAVE_ALL */ -+ /* LR_usr is the same physical register as lr and is saved by the caller */ - SAVE_BANKED(svc) - SAVE_BANKED(abt) - SAVE_BANKED(und) -@@ -131,7 +110,28 @@ skip_check: - * interrupts (Asynchronous Abort, IRQ, FIQ) to unmask. - */ - .macro vector trap, iflags -- SAVE_ALL -+ /* Save registers in the stack */ -+ sub sp, #(UREGS_SP_usr - UREGS_sp) /* SP, LR, SPSR, PC */ -+ push {r0-r12} /* Save R0-R12 */ -+ mrs r11, ELR_hyp /* ELR_hyp is return address */ -+ str r11, [sp, #UREGS_pc] -+ -+ str lr, [sp, #UREGS_lr] -+ -+ add r11, sp, #(UREGS_kernel_sizeof + 4) -+ -+ str r11, [sp, #UREGS_sp] -+ -+ mrc CP32(r11, HSR) /* Save exception syndrome */ -+ str r11, [sp, #UREGS_hsr] -+ -+ mrs r11, SPSR_hyp -+ str r11, [sp, #UREGS_cpsr] -+ and r11, #PSR_MODE_MASK -+ cmp r11, #PSR_MODE_HYP -+ blne save_guest_regs -+ -+ /* We are ready to handle the trap, setup the registers and jump. */ - cpsie \iflags - adr lr, return_from_trap - mov r0, sp --- -2.11.0 - diff --git a/system/xen/xsa/xsa303-0003-xen-arm32-Don-t-blindly-unmask-interrupts-on-trap-wi.patch b/system/xen/xsa/xsa303-0003-xen-arm32-Don-t-blindly-unmask-interrupts-on-trap-wi.patch deleted file mode 100644 index 516845214880c..0000000000000 --- a/system/xen/xsa/xsa303-0003-xen-arm32-Don-t-blindly-unmask-interrupts-on-trap-wi.patch +++ /dev/null @@ -1,226 +0,0 @@ -From 098fe877967870ffda2dfd9629a5fd272f6aacdc Mon Sep 17 00:00:00 2001 -From: Julien Grall <julien.grall@arm.com> -Date: Fri, 11 Oct 2019 17:49:28 +0100 -Subject: [PATCH 3/4] xen/arm32: Don't blindly unmask interrupts on trap - without a change of level - -Exception vectors will unmask interrupts regardless the state of them in -the interrupted context. - -One of the consequences is IRQ will be unmasked when receiving an -undefined instruction exception (used by WARN*) from the hypervisor. -This could result to unexpected behavior such as deadlock (if a lock was -shared with interrupts). - -In a nutshell, interrupts should only be unmasked when it is safe to do. -Xen only unmask IRQ and Abort interrupts, so the logic can stay simple. - -As vectors exceptions may be shared between guest and hypervisor, we now -need to have a different policy for the interrupts. - -On exception from hypervisor, each vector will select the list of -interrupts to inherit from the interrupted context. Any interrupts not -listed will be kept masked. - -On exception from the guest, the Abort and IRQ will be unmasked -depending on the exact vector. - -The interrupts will be kept unmasked when the vector cannot used by -either guest or hypervisor. - -Note that each vector is not anymore preceded by ALIGN. This is fine -because the alignment is already bigger than what we need. - -This is part of XSA-303. - -Reported-by: Julien Grall <Julien.Grall@arm.com> -Signed-off-by: Julien Grall <julien.grall@arm.com> -Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> -Reviewed-by: Andre Przywara <andre.przywara@arm.com> ---- - xen/arch/arm/arm32/entry.S | 138 +++++++++++++++++++++++++++++++++++---------- - 1 file changed, 109 insertions(+), 29 deletions(-) - -diff --git a/xen/arch/arm/arm32/entry.S b/xen/arch/arm/arm32/entry.S -index 150cbc0b4b..ec90cca093 100644 ---- a/xen/arch/arm/arm32/entry.S -+++ b/xen/arch/arm/arm32/entry.S -@@ -4,6 +4,17 @@ - #include <asm/alternative.h> - #include <public/xen.h> - -+/* -+ * Short-hands to defined the interrupts (A, I, F) -+ * -+ * _ means the interrupt state will not change -+ * X means the state of interrupt X will change -+ * -+ * To be used with msr cpsr_* only -+ */ -+#define IFLAGS_AIF PSR_ABT_MASK | PSR_IRQ_MASK | PSR_FIQ_MASK -+#define IFLAGS_A_F PSR_ABT_MASK | PSR_FIQ_MASK -+ - #define SAVE_ONE_BANKED(reg) mrs r11, reg; str r11, [sp, #UREGS_##reg] - #define RESTORE_ONE_BANKED(reg) ldr r11, [sp, #UREGS_##reg]; msr reg, r11 - -@@ -106,10 +117,18 @@ skip_check: - mov pc, lr - - /* -- * Macro to define trap entry. The iflags corresponds to the list of -- * interrupts (Asynchronous Abort, IRQ, FIQ) to unmask. -+ * Macro to define a trap entry. -+ * -+ * @guest_iflags: Optional list of interrupts to unmask when -+ * entering from guest context. As this is used with cpsie, -+ * the letter (a, i, f) should be used. -+ * -+ * @hyp_iflags: Optional list of interrupts to inherit when -+ * entering from hypervisor context. Any interrupts not -+ * listed will be kept unchanged. As this is used with cpsr_*, -+ * IFLAGS_* short-hands should be used. - */ -- .macro vector trap, iflags -+ .macro vector trap, guest_iflags=n, hyp_iflags=0 - /* Save registers in the stack */ - sub sp, #(UREGS_SP_usr - UREGS_sp) /* SP, LR, SPSR, PC */ - push {r0-r12} /* Save R0-R12 */ -@@ -127,12 +146,39 @@ skip_check: - - mrs r11, SPSR_hyp - str r11, [sp, #UREGS_cpsr] -- and r11, #PSR_MODE_MASK -- cmp r11, #PSR_MODE_HYP -- blne save_guest_regs - -+ /* -+ * We need to distinguish whether we came from guest or -+ * hypervisor context. -+ */ -+ and r0, r11, #PSR_MODE_MASK -+ cmp r0, #PSR_MODE_HYP -+ -+ bne 1f -+ /* -+ * Trap from the hypervisor -+ * -+ * Inherit the state of the interrupts from the hypervisor -+ * context. For that we need to use SPSR (stored in r11) and -+ * modify CPSR accordingly. -+ * -+ * CPSR = (CPSR & ~hyp_iflags) | (SPSR & hyp_iflags) -+ */ -+ mrs r10, cpsr -+ bic r10, r10, #\hyp_iflags -+ and r11, r11, #\hyp_iflags -+ orr r10, r10, r11 -+ msr cpsr_cx, r10 -+ b 2f -+ -+1: -+ /* Trap from the guest */ -+ bl save_guest_regs -+ .if \guest_iflags != n -+ cpsie \guest_iflags -+ .endif -+2: - /* We are ready to handle the trap, setup the registers and jump. */ -- cpsie \iflags - adr lr, return_from_trap - mov r0, sp - /* -@@ -144,20 +190,6 @@ skip_check: - b do_trap_\trap - .endm - --#define __DEFINE_TRAP_ENTRY(trap, iflags) \ -- ALIGN; \ --trap_##trap: \ -- vector trap, iflags -- --/* Trap handler which unmask IRQ/Abort, keep FIQ masked */ --#define DEFINE_TRAP_ENTRY(trap) __DEFINE_TRAP_ENTRY(trap, ai) -- --/* Trap handler which unmask Abort, keep IRQ/FIQ masked */ --#define DEFINE_TRAP_ENTRY_NOIRQ(trap) __DEFINE_TRAP_ENTRY(trap, a) -- --/* Trap handler which unmask IRQ, keep Abort/FIQ masked */ --#define DEFINE_TRAP_ENTRY_NOABORT(trap) __DEFINE_TRAP_ENTRY(trap, i) -- - .align 5 - GLOBAL(hyp_traps_vector) - b trap_reset /* 0x00 - Reset */ -@@ -228,14 +260,62 @@ decode_vectors: - - #endif /* CONFIG_HARDEN_BRANCH_PREDICTOR */ - --DEFINE_TRAP_ENTRY(reset) --DEFINE_TRAP_ENTRY(undefined_instruction) --DEFINE_TRAP_ENTRY(hypervisor_call) --DEFINE_TRAP_ENTRY(prefetch_abort) --DEFINE_TRAP_ENTRY(guest_sync) --DEFINE_TRAP_ENTRY_NOIRQ(irq) --DEFINE_TRAP_ENTRY_NOIRQ(fiq) --DEFINE_TRAP_ENTRY_NOABORT(data_abort) -+/* Vector not used by the Hypervisor. */ -+trap_reset: -+ vector reset -+ -+/* -+ * Vector only used by the Hypervisor. -+ * -+ * While the exception can be executed with all the interrupts (e.g. -+ * IRQ) unmasked, the interrupted context may have purposefully masked -+ * some of them. So we want to inherit the state from the interrupted -+ * context. -+ */ -+trap_undefined_instruction: -+ vector undefined_instruction, hyp_iflags=IFLAGS_AIF -+ -+/* We should never reach this trap */ -+trap_hypervisor_call: -+ vector hypervisor_call -+ -+/* -+ * Vector only used by the hypervisor. -+ * -+ * While the exception can be executed with all the interrupts (e.g. -+ * IRQ) unmasked, the interrupted context may have purposefully masked -+ * some of them. So we want to inherit the state from the interrupted -+ * context. -+ */ -+trap_prefetch_abort: -+ vector prefetch_abort, hyp_iflags=IFLAGS_AIF -+ -+/* -+ * Vector only used by the hypervisor. -+ * -+ * Data Abort should be rare and most likely fatal. It is best to not -+ * unmask any interrupts to limit the amount of code that can run before -+ * the Data Abort is treated. -+ */ -+trap_data_abort: -+ vector data_abort -+ -+/* Vector only used by the guest. We can unmask Abort/IRQ. */ -+trap_guest_sync: -+ vector guest_sync, guest_iflags=ai -+ -+ -+/* Vector used by the hypervisor and the guest. */ -+trap_irq: -+ vector irq, guest_iflags=a, hyp_iflags=IFLAGS_A_F -+ -+/* -+ * Vector used by the hypervisor and the guest. -+ * -+ * FIQ are not meant to happen, so we don't unmask any interrupts. -+ */ -+trap_fiq: -+ vector fiq - - return_from_trap: - /* --- -2.11.0 - diff --git a/system/xen/xsa/xsa303-0004-xen-arm64-Don-t-blindly-unmask-interrupts-on-trap-wi.patch b/system/xen/xsa/xsa303-0004-xen-arm64-Don-t-blindly-unmask-interrupts-on-trap-wi.patch deleted file mode 100644 index 106cbf98f1771..0000000000000 --- a/system/xen/xsa/xsa303-0004-xen-arm64-Don-t-blindly-unmask-interrupts-on-trap-wi.patch +++ /dev/null @@ -1,114 +0,0 @@ -From c6d290ce157a044dec417fdda8db71e41a37d744 Mon Sep 17 00:00:00 2001 -From: Julien Grall <julien.grall@arm.com> -Date: Mon, 7 Oct 2019 18:10:56 +0100 -Subject: [PATCH 4/4] xen/arm64: Don't blindly unmask interrupts on trap - without a change of level - -Some of the traps without a change of the level (i.e. hypervisor -> -hypervisor) will unmask interrupts regardless the state of them in the -interrupted context. - -One of the consequences is IRQ will be unmasked when receiving a -synchronous exception (used by WARN*()). This could result to unexpected -behavior such as deadlock (if a lock was shared with interrupts). - -In a nutshell, interrupts should only be unmasked when it is safe to -do. Xen only unmask IRQ and Abort interrupts, so the logic can stay -simple: - - hyp_error: All the interrupts are now kept masked. SError should - be pretty rare and if ever happen then we most likely want to - avoid any other interrupts to be generated. The potential main - "caller" is during virtual SError synchronization on the exit - path from the guest (see check_pending_vserror). - - - hyp_sync: The interrupts state is inherited from the interrupted - context. - - - hyp_irq: All the interrupts but IRQ state are inherited from the - interrupted context. IRQ is kept masked. - -This is part of XSA-303. - -Reported-by: Julien Grall <Julien.Grall@arm.com> -Signed-off-by: Julien Grall <julien.grall@arm.com> -Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> -Reviewed-by: Andre Przywara <andre.przywara@arm.com> ---- - xen/arch/arm/arm64/entry.S | 47 ++++++++++++++++++++++++++++++++++++++++++---- - 1 file changed, 43 insertions(+), 4 deletions(-) - -diff --git a/xen/arch/arm/arm64/entry.S b/xen/arch/arm/arm64/entry.S -index 2d9a2713a1..3e41ba65b6 100644 ---- a/xen/arch/arm/arm64/entry.S -+++ b/xen/arch/arm/arm64/entry.S -@@ -188,24 +188,63 @@ hyp_error_invalid: - entry hyp=1 - invalid BAD_ERROR - -+/* -+ * SError received while running in the hypervisor mode. -+ * -+ * Technically, we could unmask the IRQ if it were unmasked in the -+ * interrupted context. However, this require to check the PSTATE. For -+ * simplicity, as SError should be rare and potentially fatal, -+ * all interrupts are kept masked. -+ */ - hyp_error: - entry hyp=1 -- msr daifclr, #2 - mov x0, sp - bl do_trap_hyp_serror - exit hyp=1 - --/* Traps taken in Current EL with SP_ELx */ -+/* -+ * Synchronous exception received while running in the hypervisor mode. -+ * -+ * While the exception could be executed with all the interrupts (e.g. -+ * IRQ) unmasked, the interrupted context may have purposefully masked -+ * some of them. So we want to inherit the state from the interrupted -+ * context. -+ */ - hyp_sync: - entry hyp=1 -- msr daifclr, #6 -+ -+ /* Inherit interrupts */ -+ mrs x0, SPSR_el2 -+ and x0, x0, #(PSR_DBG_MASK | PSR_ABT_MASK | PSR_IRQ_MASK | PSR_FIQ_MASK) -+ msr daif, x0 -+ - mov x0, sp - bl do_trap_hyp_sync - exit hyp=1 - -+/* -+ * IRQ received while running in the hypervisor mode. -+ * -+ * While the exception could be executed with all the interrupts but IRQ -+ * unmasked, the interrupted context may have purposefully masked some -+ * of them. So we want to inherit the state from the interrupt context -+ * and keep IRQ masked. -+ * -+ * XXX: We may want to consider an ordering between interrupts (e.g. if -+ * SError are masked, then IRQ should be masked too). However, this -+ * would require some rework in some paths (e.g. panic, livepatch) to -+ * ensure the ordering is enforced everywhere. -+ */ - hyp_irq: - entry hyp=1 -- msr daifclr, #4 -+ -+ /* Inherit D, A, F interrupts and keep I masked */ -+ mrs x0, SPSR_el2 -+ mov x1, #(PSR_DBG_MASK | PSR_ABT_MASK | PSR_FIQ_MASK) -+ and x0, x0, x1 -+ orr x0, x0, #PSR_IRQ_MASK -+ msr daif, x0 -+ - mov x0, sp - bl do_trap_irq - exit hyp=1 --- -2.11.0 - diff --git a/system/xen/xsa/xsa304-4.12-1.patch b/system/xen/xsa/xsa304-4.12-1.patch deleted file mode 100644 index c2ed2c2ced89f..0000000000000 --- a/system/xen/xsa/xsa304-4.12-1.patch +++ /dev/null @@ -1,71 +0,0 @@ -From: Andrew Cooper <andrew.cooper3@citrix.com> -Subject: x86/vtd: Hide superpage support for SandyBridge IOMMUs - -Something causes SandyBridge IOMMUs to choke when sharing EPT pagetables, and -an EPT superpage gets shattered. The root cause is still under investigation, -but the end result is unusable in combination with CVE-2018-12207 protections. - -This is part of XSA-304 / CVE-2018-12207 - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> - -diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h -index 16eada9fa2..a71c8b0f84 100644 ---- a/xen/drivers/passthrough/vtd/extern.h -+++ b/xen/drivers/passthrough/vtd/extern.h -@@ -97,6 +97,8 @@ void vtd_ops_postamble_quirk(struct iommu* iommu); - int __must_check me_wifi_quirk(struct domain *domain, - u8 bus, u8 devfn, int map); - void pci_vtd_quirk(const struct pci_dev *); -+void quirk_iommu_caps(struct iommu *iommu); -+ - bool_t platform_supports_intremap(void); - bool_t platform_supports_x2apic(void); - -diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c -index b3664ecbe0..5d34f75306 100644 ---- a/xen/drivers/passthrough/vtd/iommu.c -+++ b/xen/drivers/passthrough/vtd/iommu.c -@@ -1215,6 +1215,8 @@ int __init iommu_alloc(struct acpi_drhd_unit *drhd) - if ( !(iommu->cap + 1) || !(iommu->ecap + 1) ) - return -ENODEV; - -+ quirk_iommu_caps(iommu); -+ - if ( cap_fault_reg_offset(iommu->cap) + - cap_num_fault_regs(iommu->cap) * PRIMARY_FAULT_REG_LEN >= PAGE_SIZE || - ecap_iotlb_offset(iommu->ecap) >= PAGE_SIZE ) -diff --git a/xen/drivers/passthrough/vtd/quirks.c b/xen/drivers/passthrough/vtd/quirks.c -index d6db862678..b02688e316 100644 ---- a/xen/drivers/passthrough/vtd/quirks.c -+++ b/xen/drivers/passthrough/vtd/quirks.c -@@ -540,3 +540,28 @@ void pci_vtd_quirk(const struct pci_dev *pdev) - break; - } - } -+ -+void __init quirk_iommu_caps(struct iommu *iommu) -+{ -+ /* -+ * IOMMU Quirks: -+ * -+ * SandyBridge IOMMUs claim support for 2M and 1G superpages, but don't -+ * implement superpages internally. -+ * -+ * There are issues changing the walk length under in-flight DMA, which -+ * has manifested as incompatibility between EPT/IOMMU sharing and the -+ * workaround for CVE-2018-12207 / XSA-304. Hide the superpages -+ * capabilities in the IOMMU, which will prevent Xen from sharing the EPT -+ * and IOMMU pagetables. -+ * -+ * Detection of SandyBridge unfortunately has to be done by processor -+ * model because the client parts don't expose their IOMMUs as PCI devices -+ * we could match with a Device ID. -+ */ -+ if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && -+ boot_cpu_data.x86 == 6 && -+ (boot_cpu_data.x86_model == 0x2a || -+ boot_cpu_data.x86_model == 0x2d) ) -+ iommu->cap &= ~(0xful << 34); -+} diff --git a/system/xen/xsa/xsa304-4.12-2.patch b/system/xen/xsa/xsa304-4.12-2.patch deleted file mode 100644 index 66d4301838e6e..0000000000000 --- a/system/xen/xsa/xsa304-4.12-2.patch +++ /dev/null @@ -1,272 +0,0 @@ -From: Andrew Cooper <andrew.cooper3@citrix.com> -Subject: x86/vtx: Disable executable EPT superpages to work around - CVE-2018-12207 - -CVE-2018-12207 covers a set of errata on various Intel processors, whereby a -machine check exception can be generated in a corner case when an executable -mapping changes size or cacheability without TLB invalidation. HVM guest -kernels can trigger this to DoS the host. - -To mitigate, in affected hardware, all EPT superpages are marked NX. When an -instruction fetch violation is observed against the superpage, the superpage -is shattered to 4k and has execute permissions restored. This prevents the -guest kernel from being able to create the necessary preconditions in the iTLB -to exploit the vulnerability. - -This does come with a workload-dependent performance overhead, caused by -increased TLB pressure. Performance can be restored, if guest kernels are -trusted not to mount an attack, by specifying ept=exec-sp on the command line. - -This is part of XSA-304 / CVE-2018-12207 - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Acked-by: George Dunlap <george.dunlap@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> - -diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc -index 85081fdc94..e283017015 100644 ---- a/docs/misc/xen-command-line.pandoc -+++ b/docs/misc/xen-command-line.pandoc -@@ -895,7 +895,7 @@ Controls for interacting with the system Extended Firmware Interface. - uncacheable. - - ### ept --> `= List of [ ad=<bool>, pml=<bool> ]` -+> `= List of [ ad=<bool>, pml=<bool>, exec-sp=<bool> ]` - - > Applicability: Intel - -@@ -926,6 +926,16 @@ introduced with the Nehalem architecture. - disable PML. `pml=0` can be used to prevent the use of PML on otherwise - capable hardware. - -+* The `exec-sp` boolean controls whether EPT superpages with execute -+ permissions are permitted. In general this is good for performance. -+ -+ However, on processors vulnerable CVE-2018-12207, HVM guest kernels can -+ use executable superpages to crash the host. By default, executable -+ superpages are disabled on affected hardware. -+ -+ If HVM guest kernels are trusted not to mount a DoS against the system, -+ this option can enabled to regain performance. -+ - ### extra_guest_irqs - > `= [<domU number>][,<dom0 number>]` - -diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c -index 2089a77270..84191d4e4b 100644 ---- a/xen/arch/x86/hvm/hvm.c -+++ b/xen/arch/x86/hvm/hvm.c -@@ -1814,6 +1814,24 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla, - break; - } - -+ /* -+ * Workaround for XSA-304 / CVE-2018-12207. If we take an execution -+ * fault against a non-executable superpage, shatter it to regain -+ * execute permissions. -+ */ -+ if ( page_order > 0 && npfec.insn_fetch && npfec.present && !violation ) -+ { -+ int res = p2m_set_entry(p2m, _gfn(gfn), mfn, PAGE_ORDER_4K, -+ p2mt, p2ma); -+ -+ if ( res ) -+ printk(XENLOG_ERR "Failed to shatter gfn %"PRI_gfn": %d\n", -+ gfn, res); -+ -+ rc = !res; -+ goto out_put_gfn; -+ } -+ - if ( violation ) - { - /* Should #VE be emulated for this fault? */ -diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c -index 56519fee84..ec5ab860ad 100644 ---- a/xen/arch/x86/hvm/vmx/vmcs.c -+++ b/xen/arch/x86/hvm/vmx/vmcs.c -@@ -67,6 +67,7 @@ integer_param("ple_window", ple_window); - - static bool __read_mostly opt_ept_pml = true; - static s8 __read_mostly opt_ept_ad = -1; -+int8_t __read_mostly opt_ept_exec_sp = -1; - - static int __init parse_ept_param(const char *s) - { -@@ -82,6 +83,8 @@ static int __init parse_ept_param(const char *s) - opt_ept_ad = val; - else if ( (val = parse_boolean("pml", s, ss)) >= 0 ) - opt_ept_pml = val; -+ else if ( (val = parse_boolean("exec-sp", s, ss)) >= 0 ) -+ opt_ept_exec_sp = val; - else - rc = -EINVAL; - -diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c -index 26b7ddb5fe..28cba8ec28 100644 ---- a/xen/arch/x86/hvm/vmx/vmx.c -+++ b/xen/arch/x86/hvm/vmx/vmx.c -@@ -2445,6 +2445,102 @@ static void pi_notification_interrupt(struct cpu_user_regs *regs) - static void __init lbr_tsx_fixup_check(void); - static void __init bdw_erratum_bdf14_fixup_check(void); - -+/* -+ * Calculate whether the CPU is vulnerable to Instruction Fetch page -+ * size-change MCEs. -+ */ -+static bool __init has_if_pschange_mc(void) -+{ -+ uint64_t caps = 0; -+ -+ /* -+ * If we are virtualised, there is nothing we can do. Our EPT tables are -+ * shadowed by our hypervisor, and not walked by hardware. -+ */ -+ if ( cpu_has_hypervisor ) -+ return false; -+ -+ if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) ) -+ rdmsrl(MSR_ARCH_CAPABILITIES, caps); -+ -+ if ( caps & ARCH_CAPS_IF_PSCHANGE_MC_NO ) -+ return false; -+ -+ /* -+ * IF_PSCHANGE_MC is only known to affect Intel Family 6 processors at -+ * this time. -+ */ -+ if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || -+ boot_cpu_data.x86 != 6 ) -+ return false; -+ -+ switch ( boot_cpu_data.x86_model ) -+ { -+ /* -+ * Core processors since at least Nehalem are vulnerable. -+ */ -+ case 0x1f: /* Auburndale / Havendale */ -+ case 0x1e: /* Nehalem */ -+ case 0x1a: /* Nehalem EP */ -+ case 0x2e: /* Nehalem EX */ -+ case 0x25: /* Westmere */ -+ case 0x2c: /* Westmere EP */ -+ case 0x2f: /* Westmere EX */ -+ case 0x2a: /* SandyBridge */ -+ case 0x2d: /* SandyBridge EP/EX */ -+ case 0x3a: /* IvyBridge */ -+ case 0x3e: /* IvyBridge EP/EX */ -+ case 0x3c: /* Haswell */ -+ case 0x3f: /* Haswell EX/EP */ -+ case 0x45: /* Haswell D */ -+ case 0x46: /* Haswell H */ -+ case 0x3d: /* Broadwell */ -+ case 0x47: /* Broadwell H */ -+ case 0x4f: /* Broadwell EP/EX */ -+ case 0x56: /* Broadwell D */ -+ case 0x4e: /* Skylake M */ -+ case 0x5e: /* Skylake D */ -+ case 0x55: /* Skylake-X / Cascade Lake */ -+ case 0x8e: /* Kaby / Coffee / Whiskey Lake M */ -+ case 0x9e: /* Kaby / Coffee / Whiskey Lake D */ -+ return true; -+ -+ /* -+ * Atom processors are not vulnerable. -+ */ -+ case 0x1c: /* Pineview */ -+ case 0x26: /* Lincroft */ -+ case 0x27: /* Penwell */ -+ case 0x35: /* Cloverview */ -+ case 0x36: /* Cedarview */ -+ case 0x37: /* Baytrail / Valleyview (Silvermont) */ -+ case 0x4d: /* Avaton / Rangely (Silvermont) */ -+ case 0x4c: /* Cherrytrail / Brasswell */ -+ case 0x4a: /* Merrifield */ -+ case 0x5a: /* Moorefield */ -+ case 0x5c: /* Goldmont */ -+ case 0x5d: /* SoFIA 3G Granite/ES2.1 */ -+ case 0x65: /* SoFIA LTE AOSP */ -+ case 0x5f: /* Denverton */ -+ case 0x6e: /* Cougar Mountain */ -+ case 0x75: /* Lightning Mountain */ -+ case 0x7a: /* Gemini Lake */ -+ case 0x86: /* Jacobsville */ -+ -+ /* -+ * Knights processors are not vulnerable. -+ */ -+ case 0x57: /* Knights Landing */ -+ case 0x85: /* Knights Mill */ -+ return false; -+ -+ default: -+ printk("Unrecognised CPU model %#x - assuming vulnerable to IF_PSCHANGE_MC\n", -+ boot_cpu_data.x86_model); -+ return true; -+ } -+} -+ - const struct hvm_function_table * __init start_vmx(void) - { - set_in_cr4(X86_CR4_VMXE); -@@ -2465,6 +2561,17 @@ const struct hvm_function_table * __init start_vmx(void) - */ - if ( cpu_has_vmx_ept && (cpu_has_vmx_pat || opt_force_ept) ) - { -+ bool cpu_has_bug_pschange_mc = has_if_pschange_mc(); -+ -+ if ( opt_ept_exec_sp == -1 ) -+ { -+ /* Default to non-executable superpages on vulnerable hardware. */ -+ opt_ept_exec_sp = !cpu_has_bug_pschange_mc; -+ -+ if ( cpu_has_bug_pschange_mc ) -+ printk("VMX: Disabling executable EPT superpages due to CVE-2018-12207\n"); -+ } -+ - vmx_function_table.hap_supported = 1; - vmx_function_table.altp2m_supported = 1; - -diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c -index 952ebad82f..834d4798c8 100644 ---- a/xen/arch/x86/mm/p2m-ept.c -+++ b/xen/arch/x86/mm/p2m-ept.c -@@ -174,6 +174,12 @@ static void ept_p2m_type_to_flags(struct p2m_domain *p2m, ept_entry_t *entry, - break; - } - -+ /* -+ * Don't create executable superpages if we need to shatter them to -+ * protect against CVE-2018-12207. -+ */ -+ if ( !opt_ept_exec_sp && is_epte_superpage(entry) ) -+ entry->x = 0; - } - - #define GUEST_TABLE_MAP_FAILED 0 -diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h b/xen/include/asm-x86/hvm/vmx/vmx.h -index ebaa74449b..371b912887 100644 ---- a/xen/include/asm-x86/hvm/vmx/vmx.h -+++ b/xen/include/asm-x86/hvm/vmx/vmx.h -@@ -28,6 +28,8 @@ - #include <asm/hvm/trace.h> - #include <asm/hvm/vmx/vmcs.h> - -+extern int8_t opt_ept_exec_sp; -+ - typedef union { - struct { - u64 r : 1, /* bit 0 - Read permission */ -diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h -index 637259bd1f..32746aa8ae 100644 ---- a/xen/include/asm-x86/msr-index.h -+++ b/xen/include/asm-x86/msr-index.h -@@ -52,6 +52,7 @@ - #define ARCH_CAPS_SKIP_L1DFL (_AC(1, ULL) << 3) - #define ARCH_CAPS_SSB_NO (_AC(1, ULL) << 4) - #define ARCH_CAPS_MDS_NO (_AC(1, ULL) << 5) -+#define ARCH_CAPS_IF_PSCHANGE_MC_NO (_AC(1, ULL) << 6) - - #define MSR_FLUSH_CMD 0x0000010b - #define FLUSH_CMD_L1D (_AC(1, ULL) << 0) diff --git a/system/xen/xsa/xsa304-4.12-3.patch b/system/xen/xsa/xsa304-4.12-3.patch deleted file mode 100644 index 04b4c454f24fa..0000000000000 --- a/system/xen/xsa/xsa304-4.12-3.patch +++ /dev/null @@ -1,108 +0,0 @@ -From: Andrew Cooper <andrew.cooper3@citrix.com> -Subject: x86/vtx: Allow runtime modification of the exec-sp setting - -See patch for details. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: George Dunlap <george.dunlap@citrix.com> - -diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc -index e283017015..84221fe60a 100644 ---- a/docs/misc/xen-command-line.pandoc -+++ b/docs/misc/xen-command-line.pandoc -@@ -936,6 +936,21 @@ introduced with the Nehalem architecture. - If HVM guest kernels are trusted not to mount a DoS against the system, - this option can enabled to regain performance. - -+ This boolean may be modified at runtime using `xl set-parameters -+ ept=[no-]exec-sp` to switch between fast and secure. -+ -+ * When switching from secure to fast, preexisting HVM domains will run -+ at their current performance until they are rebooted; new domains will -+ run without any overhead. -+ -+ * When switching from fast to secure, all HVM domains will immediately -+ suffer a performance penalty. -+ -+ **Warning: No guarantee is made that this runtime option will be retained -+ indefinitely, or that it will retain this exact behaviour. It is -+ intended as an emergency option for people who first chose fast, then -+ change their minds to secure, and wish not to reboot.** -+ - ### extra_guest_irqs - > `= [<domU number>][,<dom0 number>]` - -diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c -index ec5ab860ad..c4d8a5ba78 100644 ---- a/xen/arch/x86/hvm/vmx/vmcs.c -+++ b/xen/arch/x86/hvm/vmx/vmcs.c -@@ -95,6 +95,41 @@ static int __init parse_ept_param(const char *s) - } - custom_param("ept", parse_ept_param); - -+static int parse_ept_param_runtime(const char *s) -+{ -+ int val; -+ -+ if ( !cpu_has_vmx_ept || !hvm_funcs.hap_supported || -+ !(hvm_funcs.hap_capabilities & -+ (HVM_HAP_SUPERPAGE_2MB | HVM_HAP_SUPERPAGE_1GB)) ) -+ { -+ printk("VMX: EPT not available, or not in use - ignoring\n"); -+ return 0; -+ } -+ -+ if ( (val = parse_boolean("exec-sp", s, NULL)) < 0 ) -+ return -EINVAL; -+ -+ if ( val != opt_ept_exec_sp ) -+ { -+ struct domain *d; -+ -+ opt_ept_exec_sp = val; -+ -+ rcu_read_lock(&domlist_read_lock); -+ for_each_domain ( d ) -+ if ( paging_mode_hap(d) ) -+ p2m_change_entry_type_global(d, p2m_ram_rw, p2m_ram_rw); -+ rcu_read_unlock(&domlist_read_lock); -+ } -+ -+ printk("VMX: EPT executable superpages %sabled\n", -+ val ? "en" : "dis"); -+ -+ return 0; -+} -+custom_runtime_only_param("ept", parse_ept_param_runtime); -+ - /* Dynamic (run-time adjusted) execution control flags. */ - u32 vmx_pin_based_exec_control __read_mostly; - u32 vmx_cpu_based_exec_control __read_mostly; -diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c -index f518f86493..16608098b1 100644 ---- a/xen/arch/x86/mm/p2m.c -+++ b/xen/arch/x86/mm/p2m.c -@@ -289,15 +289,20 @@ static void change_entry_type_global(struct p2m_domain *p2m, - p2m_type_t ot, p2m_type_t nt) - { - p2m->change_entry_type_global(p2m, ot, nt); -- p2m->global_logdirty = (nt == p2m_ram_logdirty); -+ /* Don't allow 'recalculate' operations to change the logdirty state. */ -+ if ( ot != nt ) -+ p2m->global_logdirty = (nt == p2m_ram_logdirty); - } - -+/* -+ * May be called with ot = nt = p2m_ram_rw for its side effect of -+ * recalculating all PTEs in the p2m. -+ */ - void p2m_change_entry_type_global(struct domain *d, - p2m_type_t ot, p2m_type_t nt) - { - struct p2m_domain *hostp2m = p2m_get_hostp2m(d); - -- ASSERT(ot != nt); - ASSERT(p2m_is_changeable(ot) && p2m_is_changeable(nt)); - - p2m_lock(hostp2m); diff --git a/system/xen/xsa/xsa305-4.12-1.patch b/system/xen/xsa/xsa305-4.12-1.patch deleted file mode 100644 index e1a91a52640b6..0000000000000 --- a/system/xen/xsa/xsa305-4.12-1.patch +++ /dev/null @@ -1,288 +0,0 @@ -From: Andrew Cooper <andrew.cooper3@citrix.com> -Subject: x86/tsx: Introduce tsx= to use MSR_TSX_CTRL when available - -To protect against the TSX Async Abort speculative vulnerability, Intel have -released new microcode for affected parts which introduce the MSR_TSX_CTRL -control, which allows TSX to be turned off. This will be architectural on -future parts. - -Introduce tsx= to provide a global on/off for TSX, including its enumeration -via CPUID. Provide stub virtualisation of this MSR, as it is not exposed to -guests at the moment. - -VMs may have booted before microcode is loaded, or before hosts have rebooted, -and they still want to migrate freely. A VM which booted seeing TSX can -migrate safely to hosts with TSX disabled - TSX will start unconditionally -aborting, but still behave in a manner compatible with the ABI. - -The guest-visible behaviour is equivalent to late loading the microcode and -setting the RTM_DISABLE bit in the course of live patching. - -This is part of XSA-305 / CVE-2019-11135 - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> - -diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc -index e283017015..b7e1bf8e8b 100644 ---- a/docs/misc/xen-command-line.pandoc -+++ b/docs/misc/xen-command-line.pandoc -@@ -2033,6 +2033,20 @@ Xen version. - ### tsc (x86) - > `= unstable | skewed | stable:socket` - -+### tsx -+ = <bool> -+ -+ Applicability: x86 -+ Default: true -+ -+Controls for the use of Transactional Synchronization eXtensions. -+ -+On Intel parts released in Q3 2019 (with updated microcode), and future parts, -+a control has been introduced which allows TSX to be turned off. -+ -+On systems with the ability to turn TSX off, this boolean offers system wide -+control of whether TSX is enabled or disabled. -+ - ### ucode (x86) - > `= [<integer> | scan]` - -diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile -index 8a8d8f060f..9b9a4435fb 100644 ---- a/xen/arch/x86/Makefile -+++ b/xen/arch/x86/Makefile -@@ -66,6 +66,7 @@ obj-y += sysctl.o - obj-y += time.o - obj-y += trace.o - obj-y += traps.o -+obj-y += tsx.o - obj-y += usercopy.o - obj-y += x86_emulate.o - obj-$(CONFIG_TBOOT) += tboot.o -diff --git a/xen/arch/x86/cpuid.c b/xen/arch/x86/cpuid.c -index 57e80694f2..1727497459 100644 ---- a/xen/arch/x86/cpuid.c -+++ b/xen/arch/x86/cpuid.c -@@ -524,6 +524,20 @@ void recalculate_cpuid_policy(struct domain *d) - if ( cpu_has_itsc && (d->disable_migrate || d->arch.vtsc) ) - __set_bit(X86_FEATURE_ITSC, max_fs); - -+ /* -+ * On hardware with MSR_TSX_CTRL, the admin may have elected to disable -+ * TSX and hide the feature bits. Migrating-in VMs may have been booted -+ * pre-mitigation when the TSX features were visbile. -+ * -+ * This situation is compatible (albeit with a perf hit to any TSX code in -+ * the guest), so allow the feature bits to remain set. -+ */ -+ if ( cpu_has_tsx_ctrl ) -+ { -+ __set_bit(X86_FEATURE_HLE, max_fs); -+ __set_bit(X86_FEATURE_RTM, max_fs); -+ } -+ - /* Clamp the toolstacks choices to reality. */ - for ( i = 0; i < ARRAY_SIZE(fs); i++ ) - fs[i] &= max_fs[i]; -diff --git a/xen/arch/x86/msr.c b/xen/arch/x86/msr.c -index 56de0fe9e1..c2722d7c73 100644 ---- a/xen/arch/x86/msr.c -+++ b/xen/arch/x86/msr.c -@@ -132,6 +132,7 @@ int guest_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val) - case MSR_FLUSH_CMD: - /* Write-only */ - case MSR_TSX_FORCE_ABORT: -+ case MSR_TSX_CTRL: - /* Not offered to guests. */ - goto gp_fault; - -@@ -260,6 +261,7 @@ int guest_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val) - case MSR_ARCH_CAPABILITIES: - /* Read-only */ - case MSR_TSX_FORCE_ABORT: -+ case MSR_TSX_CTRL: - /* Not offered to guests. */ - goto gp_fault; - -diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c -index cf790f36ef..c1c7c44000 100644 ---- a/xen/arch/x86/setup.c -+++ b/xen/arch/x86/setup.c -@@ -1594,6 +1594,8 @@ void __init noreturn __start_xen(unsigned long mbi_p) - - early_microcode_init(); - -+ tsx_init(); /* Needs microcode. May change HLE/RTM feature bits. */ -+ - identify_cpu(&boot_cpu_data); - - set_in_cr4(X86_CR4_OSFXSR | X86_CR4_OSXMMEXCPT); -diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c -index 737a44f055..e21cf0a310 100644 ---- a/xen/arch/x86/smpboot.c -+++ b/xen/arch/x86/smpboot.c -@@ -376,6 +376,8 @@ void start_secondary(void *unused) - if ( boot_cpu_has(X86_FEATURE_IBRSB) ) - wrmsrl(MSR_SPEC_CTRL, default_xen_spec_ctrl); - -+ tsx_init(); /* Needs microcode. May change HLE/RTM feature bits. */ -+ - if ( xen_guest ) - hypervisor_ap_setup(); - -diff --git a/xen/arch/x86/tsx.c b/xen/arch/x86/tsx.c -new file mode 100644 -index 0000000000..a8ec2ccc69 ---- /dev/null -+++ b/xen/arch/x86/tsx.c -@@ -0,0 +1,74 @@ -+#include <xen/init.h> -+#include <asm/msr.h> -+ -+/* -+ * Valid values: -+ * 1 => Explicit tsx=1 -+ * 0 => Explicit tsx=0 -+ * -1 => Default, implicit tsx=1 -+ * -+ * This is arranged such that the bottom bit encodes whether TSX is actually -+ * disabled, while identifying various explicit (>=0) and implicit (<0) -+ * conditions. -+ */ -+int8_t __read_mostly opt_tsx = -1; -+int8_t __read_mostly cpu_has_tsx_ctrl = -1; -+ -+static int __init parse_tsx(const char *s) -+{ -+ int rc = 0, val = parse_bool(s, NULL); -+ -+ if ( val >= 0 ) -+ opt_tsx = val; -+ else -+ rc = -EINVAL; -+ -+ return rc; -+} -+custom_param("tsx", parse_tsx); -+ -+void tsx_init(void) -+{ -+ /* -+ * This function is first called between microcode being loaded, and CPUID -+ * being scanned generally. Calculate from raw data whether MSR_TSX_CTRL -+ * is available. -+ */ -+ if ( unlikely(cpu_has_tsx_ctrl < 0) ) -+ { -+ uint64_t caps = 0; -+ -+ if ( boot_cpu_data.cpuid_level >= 7 && -+ (cpuid_count_edx(7, 0) & cpufeat_mask(X86_FEATURE_ARCH_CAPS)) ) -+ rdmsrl(MSR_ARCH_CAPABILITIES, caps); -+ -+ cpu_has_tsx_ctrl = !!(caps & ARCH_CAPS_TSX_CTRL); -+ } -+ -+ if ( cpu_has_tsx_ctrl ) -+ { -+ uint64_t val; -+ -+ rdmsrl(MSR_TSX_CTRL, val); -+ -+ val &= ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR); -+ /* Check bottom bit only. Higher bits are various sentinals. */ -+ if ( !(opt_tsx & 1) ) -+ val |= TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR; -+ -+ wrmsrl(MSR_TSX_CTRL, val); -+ } -+ else if ( opt_tsx >= 0 ) -+ printk_once(XENLOG_WARNING -+ "MSR_TSX_CTRL not available - Ignoring tsx= setting\n"); -+} -+ -+/* -+ * Local variables: -+ * mode: C -+ * c-file-style: "BSD" -+ * c-basic-offset: 4 -+ * tab-width: 4 -+ * indent-tabs-mode: nil -+ * End: -+ */ -diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h -index 32746aa8ae..d5f3899f73 100644 ---- a/xen/include/asm-x86/msr-index.h -+++ b/xen/include/asm-x86/msr-index.h -@@ -53,6 +53,7 @@ - #define ARCH_CAPS_SSB_NO (_AC(1, ULL) << 4) - #define ARCH_CAPS_MDS_NO (_AC(1, ULL) << 5) - #define ARCH_CAPS_IF_PSCHANGE_MC_NO (_AC(1, ULL) << 6) -+#define ARCH_CAPS_TSX_CTRL (_AC(1, ULL) << 7) - - #define MSR_FLUSH_CMD 0x0000010b - #define FLUSH_CMD_L1D (_AC(1, ULL) << 0) -@@ -60,6 +61,10 @@ - #define MSR_TSX_FORCE_ABORT 0x0000010f - #define TSX_FORCE_ABORT_RTM (_AC(1, ULL) << 0) - -+#define MSR_TSX_CTRL 0x00000122 -+#define TSX_CTRL_RTM_DISABLE (_AC(1, ULL) << 0) -+#define TSX_CTRL_CPUID_CLEAR (_AC(1, ULL) << 1) -+ - /* Intel MSRs. Some also available on other CPUs */ - #define MSR_IA32_PERFCTR0 0x000000c1 - #define MSR_IA32_A_PERFCTR0 0x000004c1 -diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h -index d33ac34d29..1b52712180 100644 ---- a/xen/include/asm-x86/processor.h -+++ b/xen/include/asm-x86/processor.h -@@ -263,6 +263,16 @@ static always_inline unsigned int cpuid_count_ebx( - return ebx; - } - -+static always_inline unsigned int cpuid_count_edx( -+ unsigned int leaf, unsigned int subleaf) -+{ -+ unsigned int edx, tmp; -+ -+ cpuid_count(leaf, subleaf, &tmp, &tmp, &tmp, &edx); -+ -+ return edx; -+} -+ - static inline unsigned long read_cr0(void) - { - unsigned long cr0; -@@ -609,6 +619,9 @@ static inline uint8_t get_cpu_family(uint32_t raw, uint8_t *model, - return fam; - } - -+extern int8_t opt_tsx, cpu_has_tsx_ctrl; -+void tsx_init(void); -+ - #endif /* !__ASSEMBLY__ */ - - #endif /* __ASM_X86_PROCESSOR_H */ -diff --git a/xen/include/xen/lib.h b/xen/include/xen/lib.h -index 89939f43c8..6529f12dae 100644 ---- a/xen/include/xen/lib.h -+++ b/xen/include/xen/lib.h -@@ -114,6 +114,16 @@ extern int printk_ratelimit(void); - #define gprintk(lvl, fmt, args...) \ - printk(XENLOG_GUEST lvl "%pv " fmt, current, ## args) - -+#define printk_once(fmt, args...) \ -+({ \ -+ static bool __read_mostly once_; \ -+ if ( unlikely(!once_) ) \ -+ { \ -+ once_ = true; \ -+ printk(fmt, ## args); \ -+ } \ -+}) -+ - #ifdef NDEBUG - - static inline void diff --git a/system/xen/xsa/xsa305-4.12-2.patch b/system/xen/xsa/xsa305-4.12-2.patch deleted file mode 100644 index 07fba86287f8a..0000000000000 --- a/system/xen/xsa/xsa305-4.12-2.patch +++ /dev/null @@ -1,192 +0,0 @@ -From: Andrew Cooper <andrew.cooper3@citrix.com> -Subject: x86/spec-ctrl: Mitigate the TSX Asynchronous Abort sidechannel - -See patch documentation and comments. - -This is part of XSA-305 / CVE-2019-11135 - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> - -diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc -index b7e1bf8e8b..74e1e35b88 100644 ---- a/docs/misc/xen-command-line.pandoc -+++ b/docs/misc/xen-command-line.pandoc -@@ -1920,7 +1920,7 @@ extreme care.** - An overall boolean value, `spec-ctrl=no`, can be specified to turn off all - mitigations, including pieces of infrastructure used to virtualise certain - mitigation features for guests. This also includes settings which `xpti`, --`smt`, `pv-l1tf` control, unless the respective option(s) have been -+`smt`, `pv-l1tf`, `tsx` control, unless the respective option(s) have been - specified earlier on the command line. - - Alternatively, a slightly more restricted `spec-ctrl=no-xen` can be used to -@@ -2037,7 +2037,7 @@ Xen version. - = <bool> - - Applicability: x86 -- Default: true -+ Default: false on parts vulnerable to TAA, true otherwise - - Controls for the use of Transactional Synchronization eXtensions. - -@@ -2047,6 +2047,19 @@ a control has been introduced which allows TSX to be turned off. - On systems with the ability to turn TSX off, this boolean offers system wide - control of whether TSX is enabled or disabled. - -+On parts vulnerable to CVE-2019-11135 / TSX Asynchronous Abort, the following -+logic applies: -+ -+ * An explicit `tsx=` choice is honoured, even if it is `true` and would -+ result in a vulnerable system. -+ -+ * When no explicit `tsx=` choice is given, parts vulnerable to TAA will be -+ mitigated by disabling TSX, as this is the lowest overhead option. -+ -+ * If the use of TSX is important, the more expensive TAA mitigations can be -+ opted in to with `smt=0 spec-ctrl=md-clear`, at which point TSX will remain -+ active by default. -+ - ### ucode (x86) - > `= [<integer> | scan]` - -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index b37d40e643..800139d79c 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -96,6 +96,9 @@ static int __init parse_spec_ctrl(const char *s) - if ( opt_pv_l1tf_domu < 0 ) - opt_pv_l1tf_domu = 0; - -+ if ( opt_tsx == -1 ) -+ opt_tsx = -3; -+ - disable_common: - opt_rsb_pv = false; - opt_rsb_hvm = false; -@@ -306,7 +309,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) - printk("Speculative mitigation facilities:\n"); - - /* Hardware features which pertain to speculative mitigations. */ -- printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s\n", -+ printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", - (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBRS/IBPB" : "", - (_7d0 & cpufeat_mask(X86_FEATURE_STIBP)) ? " STIBP" : "", - (_7d0 & cpufeat_mask(X86_FEATURE_L1D_FLUSH)) ? " L1D_FLUSH" : "", -@@ -318,7 +321,9 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) - (caps & ARCH_CAPS_RSBA) ? " RSBA" : "", - (caps & ARCH_CAPS_SKIP_L1DFL) ? " SKIP_L1DFL": "", - (caps & ARCH_CAPS_SSB_NO) ? " SSB_NO" : "", -- (caps & ARCH_CAPS_MDS_NO) ? " MDS_NO" : ""); -+ (caps & ARCH_CAPS_MDS_NO) ? " MDS_NO" : "", -+ (caps & ARCH_CAPS_TSX_CTRL) ? " TSX_CTRL" : "", -+ (caps & ARCH_CAPS_TAA_NO) ? " TAA_NO" : ""); - - /* Compiled-in support which pertains to mitigations. */ - if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) ) -@@ -332,7 +337,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) - "\n"); - - /* Settings for Xen's protection, irrespective of guests. */ -- printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s, Other:%s%s%s\n", -+ printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s, Other:%s%s%s\n", - thunk == THUNK_NONE ? "N/A" : - thunk == THUNK_RETPOLINE ? "RETPOLINE" : - thunk == THUNK_LFENCE ? "LFENCE" : -@@ -341,6 +346,8 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) - (default_xen_spec_ctrl & SPEC_CTRL_IBRS) ? "IBRS+" : "IBRS-", - !boot_cpu_has(X86_FEATURE_SSBD) ? "" : - (default_xen_spec_ctrl & SPEC_CTRL_SSBD) ? " SSBD+" : " SSBD-", -+ !(caps & ARCH_CAPS_TSX_CTRL) ? "" : -+ (opt_tsx & 1) ? " TSX+" : " TSX-", - opt_ibpb ? " IBPB" : "", - opt_l1d_flush ? " L1D_FLUSH" : "", - opt_md_clear_pv || opt_md_clear_hvm ? " VERW" : ""); -@@ -862,6 +869,7 @@ void __init init_speculation_mitigations(void) - { - enum ind_thunk thunk = THUNK_DEFAULT; - bool use_spec_ctrl = false, ibrs = false, hw_smt_enabled; -+ bool cpu_has_bug_taa; - uint64_t caps = 0; - - if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) ) -@@ -1086,6 +1094,53 @@ void __init init_speculation_mitigations(void) - "enabled. Mitigations will not be fully effective. Please\n" - "choose an explicit smt=<bool> setting. See XSA-297.\n"); - -+ /* -+ * Vulnerability to TAA is a little complicated to quantify. -+ * -+ * In the pipeline, it is just another way to get speculative access to -+ * stale load port, store buffer or fill buffer data, and therefore can be -+ * considered a superset of MDS (on TSX-capable parts). On parts which -+ * predate MDS_NO, the existing VERW flushing will mitigate this -+ * sidechannel as well. -+ * -+ * On parts which contain MDS_NO, the lack of VERW flushing means that an -+ * attacker can still use TSX to target microarchitectural buffers to leak -+ * secrets. Therefore, we consider TAA to be the set of TSX-capable parts -+ * which have MDS_NO but lack TAA_NO. -+ * -+ * Note: cpu_has_rtm (== hle) could already be hidden by `tsx=0` on the -+ * cmdline. MSR_TSX_CTRL will only appear on TSX-capable parts, so -+ * we check both to spot TSX in a microcode/cmdline independent way. -+ */ -+ cpu_has_bug_taa = -+ (cpu_has_rtm || (caps & ARCH_CAPS_TSX_CTRL)) && -+ (caps & (ARCH_CAPS_MDS_NO | ARCH_CAPS_TAA_NO)) == ARCH_CAPS_MDS_NO; -+ -+ /* -+ * On TAA-affected hardware, disabling TSX is the preferred mitigation, vs -+ * the MDS mitigation of disabling HT and using VERW flushing. -+ * -+ * On CPUs which advertise MDS_NO, VERW has no flushing side effect until -+ * the TSX_CTRL microcode is loaded, despite the MD_CLEAR CPUID bit being -+ * advertised, and there isn't a MD_CLEAR_2 flag to use... -+ * -+ * If we're on affected hardware, able to do something about it (which -+ * implies that VERW now works), no explicit TSX choice and traditional -+ * MDS mitigations (no-SMT, VERW) not obviosuly in use (someone might -+ * plausibly value TSX higher than Hyperthreading...), disable TSX to -+ * mitigate TAA. -+ */ -+ if ( opt_tsx == -1 && cpu_has_bug_taa && (caps & ARCH_CAPS_TSX_CTRL) && -+ ((hw_smt_enabled && opt_smt) || -+ !boot_cpu_has(X86_FEATURE_SC_VERW_IDLE)) ) -+ { -+ setup_clear_cpu_cap(X86_FEATURE_HLE); -+ setup_clear_cpu_cap(X86_FEATURE_RTM); -+ -+ opt_tsx = 0; -+ tsx_init(); -+ } -+ - print_details(thunk, caps); - - /* -diff --git a/xen/arch/x86/tsx.c b/xen/arch/x86/tsx.c -index a8ec2ccc69..2d202a0d4e 100644 ---- a/xen/arch/x86/tsx.c -+++ b/xen/arch/x86/tsx.c -@@ -5,7 +5,8 @@ - * Valid values: - * 1 => Explicit tsx=1 - * 0 => Explicit tsx=0 -- * -1 => Default, implicit tsx=1 -+ * -1 => Default, implicit tsx=1, may change to 0 to mitigate TAA -+ * -3 => Implicit tsx=1 (feed-through from spec-ctrl=0) - * - * This is arranged such that the bottom bit encodes whether TSX is actually - * disabled, while identifying various explicit (>=0) and implicit (<0) -diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h -index d5f3899f73..3971b992d3 100644 ---- a/xen/include/asm-x86/msr-index.h -+++ b/xen/include/asm-x86/msr-index.h -@@ -54,6 +54,7 @@ - #define ARCH_CAPS_MDS_NO (_AC(1, ULL) << 5) - #define ARCH_CAPS_IF_PSCHANGE_MC_NO (_AC(1, ULL) << 6) - #define ARCH_CAPS_TSX_CTRL (_AC(1, ULL) << 7) -+#define ARCH_CAPS_TAA_NO (_AC(1, ULL) << 8) - - #define MSR_FLUSH_CMD 0x0000010b - #define FLUSH_CMD_L1D (_AC(1, ULL) << 0) diff --git a/system/xen/xsa/xsa312.patch b/system/xen/xsa/xsa312.patch new file mode 100644 index 0000000000000..ae3fa4041ba06 --- /dev/null +++ b/system/xen/xsa/xsa312.patch @@ -0,0 +1,93 @@ +From 9f807cf84a9a7a011cf1df7895c54d6031a7596d Mon Sep 17 00:00:00 2001 +From: Julien Grall <julien@xen.org> +Date: Thu, 19 Dec 2019 08:12:21 +0000 +Subject: [PATCH] xen/arm: Place a speculation barrier sequence following an + eret instruction + +Some CPUs can speculate past an ERET instruction and potentially perform +speculative accesses to memory before processing the exception return. +Since the register state is often controlled by lower privilege level +at the point of an ERET, this could potentially be used as part of a +side-channel attack. + +Newer CPUs may implement a new SB barrier instruction which acts +as an architected speculation barrier. For current CPUs, the sequence +DSB; ISB is known to prevent speculation. + +The latter sequence is heavier than SB but it would never be executed +(this is speculation after all!). + +Introduce a new macro 'sb' that could be used when a speculation barrier +is required. For now it is using dsb; isb but this could easily be +updated to cater SB in the future. + +This is XSA-312. + +Signed-off-by: Julien Grall <julien@xen.org> +--- + xen/arch/arm/arm32/entry.S | 1 + + xen/arch/arm/arm64/entry.S | 3 +++ + xen/include/asm-arm/macros.h | 9 +++++++++ + 3 files changed, 13 insertions(+) + +diff --git a/xen/arch/arm/arm32/entry.S b/xen/arch/arm/arm32/entry.S +index 31ccfb2631..b228d44b19 100644 +--- a/xen/arch/arm/arm32/entry.S ++++ b/xen/arch/arm/arm32/entry.S +@@ -426,6 +426,7 @@ return_to_hypervisor: + add sp, #(UREGS_SP_usr - UREGS_sp); /* SP, LR, SPSR, PC */ + clrex + eret ++ sb + + /* + * struct vcpu *__context_switch(struct vcpu *prev, struct vcpu *next) +diff --git a/xen/arch/arm/arm64/entry.S b/xen/arch/arm/arm64/entry.S +index d35855af96..175ea2981e 100644 +--- a/xen/arch/arm/arm64/entry.S ++++ b/xen/arch/arm/arm64/entry.S +@@ -354,6 +354,7 @@ guest_sync: + */ + mov x1, xzr + eret ++ sb + + check_wa2: + /* ARM_SMCCC_ARCH_WORKAROUND_2 handling */ +@@ -393,6 +394,7 @@ wa2_end: + #endif /* !CONFIG_ARM_SSBD */ + mov x0, xzr + eret ++ sb + guest_sync_slowpath: + /* + * x0/x1 may have been scratch by the fast path above, so avoid +@@ -457,6 +459,7 @@ return_from_trap: + ldr lr, [sp], #(UREGS_SPSR_el1 - UREGS_LR) /* CPSR, PC, SP, LR */ + + eret ++ sb + + /* + * Consume pending SError generated by the guest if any. +diff --git a/xen/include/asm-arm/macros.h b/xen/include/asm-arm/macros.h +index 91ea3505e4..4833671f4c 100644 +--- a/xen/include/asm-arm/macros.h ++++ b/xen/include/asm-arm/macros.h +@@ -20,4 +20,13 @@ + .endr + .endm + ++ /* ++ * Speculative barrier ++ * XXX: Add support for the 'sb' instruction ++ */ ++ .macro sb ++ dsb nsh ++ isb ++ .endm ++ + #endif /* __ASM_ARM_MACROS_H */ +-- +2.17.1 + diff --git a/system/xen/xsa/xsa313-1.patch b/system/xen/xsa/xsa313-1.patch new file mode 100644 index 0000000000000..95fde7ead4db3 --- /dev/null +++ b/system/xen/xsa/xsa313-1.patch @@ -0,0 +1,26 @@ +From: Jan Beulich <jbeulich@suse.com> +Subject: xenoprof: clear buffer intended to be shared with guests + +alloc_xenheap_pages() making use of MEMF_no_scrub is fine for Xen +internally used allocations, but buffers allocated to be shared with +(unpriviliged) guests need to be zapped of their prior content. + +This is part of XSA-313. + +Reported-by: Ilja Van Sprundel <ivansprundel@ioactive.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Wei Liu <wl@xen.org> + +--- a/xen/common/xenoprof.c ++++ b/xen/common/xenoprof.c +@@ -253,6 +253,9 @@ static int alloc_xenoprof_struct( + return -ENOMEM; + } + ++ for ( i = 0; i < npages; ++i ) ++ clear_page(d->xenoprof->rawbuf + i * PAGE_SIZE); ++ + d->xenoprof->npages = npages; + d->xenoprof->nbuf = nvcpu; + d->xenoprof->bufsize = bufsize; diff --git a/system/xen/xsa/xsa313-2.patch b/system/xen/xsa/xsa313-2.patch new file mode 100644 index 0000000000000..d81b8232d2df0 --- /dev/null +++ b/system/xen/xsa/xsa313-2.patch @@ -0,0 +1,132 @@ +From: Jan Beulich <jbeulich@suse.com> +Subject: xenoprof: limit consumption of shared buffer data + +Since a shared buffer can be written to by the guest, we may only read +the head and tail pointers from there (all other fields should only ever +be written to). Furthermore, for any particular operation the two values +must be read exactly once, with both checks and consumption happening +with the thus read values. (The backtrace related xenoprof_buf_space() +use in xenoprof_log_event() is an exception: The values used there get +re-checked by every subsequent xenoprof_add_sample().) + +Since that code needed touching, also fix the double increment of the +lost samples count in case the backtrace related xenoprof_add_sample() +invocation in xenoprof_log_event() fails. + +Where code is being touched anyway, add const as appropriate, but take +the opportunity to entirely drop the now unused domain parameter of +xenoprof_buf_space(). + +This is part of XSA-313. + +Reported-by: Ilja Van Sprundel <ivansprundel@ioactive.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: George Dunlap <george.dunlap@citrix.com> +Reviewed-by: Wei Liu <wl@xen.org> + +--- a/xen/common/xenoprof.c ++++ b/xen/common/xenoprof.c +@@ -479,25 +479,22 @@ static int add_passive_list(XEN_GUEST_HA + + + /* Get space in the buffer */ +-static int xenoprof_buf_space(struct domain *d, xenoprof_buf_t * buf, int size) ++static int xenoprof_buf_space(int head, int tail, int size) + { +- int head, tail; +- +- head = xenoprof_buf(d, buf, event_head); +- tail = xenoprof_buf(d, buf, event_tail); +- + return ((tail > head) ? 0 : size) + tail - head - 1; + } + + /* Check for space and add a sample. Return 1 if successful, 0 otherwise. */ +-static int xenoprof_add_sample(struct domain *d, xenoprof_buf_t *buf, ++static int xenoprof_add_sample(const struct domain *d, ++ const struct xenoprof_vcpu *v, + uint64_t eip, int mode, int event) + { ++ xenoprof_buf_t *buf = v->buffer; + int head, tail, size; + + head = xenoprof_buf(d, buf, event_head); + tail = xenoprof_buf(d, buf, event_tail); +- size = xenoprof_buf(d, buf, event_size); ++ size = v->event_size; + + /* make sure indexes in shared buffer are sane */ + if ( (head < 0) || (head >= size) || (tail < 0) || (tail >= size) ) +@@ -506,7 +503,7 @@ static int xenoprof_add_sample(struct do + return 0; + } + +- if ( xenoprof_buf_space(d, buf, size) > 0 ) ++ if ( xenoprof_buf_space(head, tail, size) > 0 ) + { + xenoprof_buf(d, buf, event_log[head].eip) = eip; + xenoprof_buf(d, buf, event_log[head].mode) = mode; +@@ -530,7 +527,6 @@ static int xenoprof_add_sample(struct do + int xenoprof_add_trace(struct vcpu *vcpu, uint64_t pc, int mode) + { + struct domain *d = vcpu->domain; +- xenoprof_buf_t *buf = d->xenoprof->vcpu[vcpu->vcpu_id].buffer; + + /* Do not accidentally write an escape code due to a broken frame. */ + if ( pc == XENOPROF_ESCAPE_CODE ) +@@ -539,7 +535,8 @@ int xenoprof_add_trace(struct vcpu *vcpu + return 0; + } + +- return xenoprof_add_sample(d, buf, pc, mode, 0); ++ return xenoprof_add_sample(d, &d->xenoprof->vcpu[vcpu->vcpu_id], ++ pc, mode, 0); + } + + void xenoprof_log_event(struct vcpu *vcpu, const struct cpu_user_regs *regs, +@@ -570,17 +567,22 @@ void xenoprof_log_event(struct vcpu *vcp + /* Provide backtrace if requested. */ + if ( backtrace_depth > 0 ) + { +- if ( (xenoprof_buf_space(d, buf, v->event_size) < 2) || +- !xenoprof_add_sample(d, buf, XENOPROF_ESCAPE_CODE, mode, +- XENOPROF_TRACE_BEGIN) ) ++ if ( xenoprof_buf_space(xenoprof_buf(d, buf, event_head), ++ xenoprof_buf(d, buf, event_tail), ++ v->event_size) < 2 ) + { + xenoprof_buf(d, buf, lost_samples)++; + lost_samples++; + return; + } ++ ++ /* xenoprof_add_sample() will increment lost_samples on failure */ ++ if ( !xenoprof_add_sample(d, v, XENOPROF_ESCAPE_CODE, mode, ++ XENOPROF_TRACE_BEGIN) ) ++ return; + } + +- if ( xenoprof_add_sample(d, buf, pc, mode, event) ) ++ if ( xenoprof_add_sample(d, v, pc, mode, event) ) + { + if ( is_active(vcpu->domain) ) + active_samples++; +--- a/xen/include/xen/xenoprof.h ++++ b/xen/include/xen/xenoprof.h +@@ -61,12 +61,12 @@ struct xenoprof { + + #ifndef CONFIG_COMPAT + #define XENOPROF_COMPAT(x) 0 +-#define xenoprof_buf(d, b, field) ((b)->field) ++#define xenoprof_buf(d, b, field) ACCESS_ONCE((b)->field) + #else + #define XENOPROF_COMPAT(x) ((x)->is_compat) +-#define xenoprof_buf(d, b, field) (*(!(d)->xenoprof->is_compat ? \ +- &(b)->native.field : \ +- &(b)->compat.field)) ++#define xenoprof_buf(d, b, field) ACCESS_ONCE(*(!(d)->xenoprof->is_compat \ ++ ? &(b)->native.field \ ++ : &(b)->compat.field)) + #endif + + struct domain; diff --git a/system/xen/xsa/xsa314-4.13.patch b/system/xen/xsa/xsa314-4.13.patch new file mode 100644 index 0000000000000..67e006681e0ce --- /dev/null +++ b/system/xen/xsa/xsa314-4.13.patch @@ -0,0 +1,121 @@ +From ab49f005f7d01d4004d76f2e295d31aca7d4f93a Mon Sep 17 00:00:00 2001 +From: Julien Grall <jgrall@amazon.com> +Date: Thu, 20 Feb 2020 20:54:40 +0000 +Subject: [PATCH] xen/rwlock: Add missing memory barrier in the unlock path of + rwlock + +The rwlock unlock paths are using atomic_sub() to release the lock. +However the implementation of atomic_sub() rightfully doesn't contain a +memory barrier. On Arm, this means a processor is allowed to re-order +the memory access with the preceeding access. + +In other words, the unlock may be seen by another processor before all +the memory accesses within the "critical" section. + +The rwlock paths already contains barrier indirectly, but they are not +very useful without the counterpart in the unlock paths. + +The memory barriers are not necessary on x86 because loads/stores are +not re-ordered with lock instructions. + +So add arch_lock_release_barrier() in the unlock paths that will only +add memory barrier on Arm. + +Take the opportunity to document each lock paths explaining why a +barrier is not necessary. + +This is XSA-314. + +Signed-off-by: Julien Grall <jgrall@amazon.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> + +--- + xen/include/xen/rwlock.h | 29 ++++++++++++++++++++++++++++- + 1 file changed, 28 insertions(+), 1 deletion(-) + +diff --git a/xen/include/xen/rwlock.h b/xen/include/xen/rwlock.h +index 3dfea1ac2a..516486306f 100644 +--- a/xen/include/xen/rwlock.h ++++ b/xen/include/xen/rwlock.h +@@ -48,6 +48,10 @@ static inline int _read_trylock(rwlock_t *lock) + if ( likely(!(cnts & _QW_WMASK)) ) + { + cnts = (u32)atomic_add_return(_QR_BIAS, &lock->cnts); ++ /* ++ * atomic_add_return() is a full barrier so no need for an ++ * arch_lock_acquire_barrier(). ++ */ + if ( likely(!(cnts & _QW_WMASK)) ) + return 1; + atomic_sub(_QR_BIAS, &lock->cnts); +@@ -64,11 +68,19 @@ static inline void _read_lock(rwlock_t *lock) + u32 cnts; + + cnts = atomic_add_return(_QR_BIAS, &lock->cnts); ++ /* ++ * atomic_add_return() is a full barrier so no need for an ++ * arch_lock_acquire_barrier(). ++ */ + if ( likely(!(cnts & _QW_WMASK)) ) + return; + + /* The slowpath will decrement the reader count, if necessary. */ + queue_read_lock_slowpath(lock); ++ /* ++ * queue_read_lock_slowpath() is using spinlock and therefore is a ++ * full barrier. So no need for an arch_lock_acquire_barrier(). ++ */ + } + + static inline void _read_lock_irq(rwlock_t *lock) +@@ -92,6 +104,7 @@ static inline unsigned long _read_lock_irqsave(rwlock_t *lock) + */ + static inline void _read_unlock(rwlock_t *lock) + { ++ arch_lock_release_barrier(); + /* + * Atomically decrement the reader count + */ +@@ -121,11 +134,20 @@ static inline int _rw_is_locked(rwlock_t *lock) + */ + static inline void _write_lock(rwlock_t *lock) + { +- /* Optimize for the unfair lock case where the fair flag is 0. */ ++ /* ++ * Optimize for the unfair lock case where the fair flag is 0. ++ * ++ * atomic_cmpxchg() is a full barrier so no need for an ++ * arch_lock_acquire_barrier(). ++ */ + if ( atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0 ) + return; + + queue_write_lock_slowpath(lock); ++ /* ++ * queue_write_lock_slowpath() is using spinlock and therefore is a ++ * full barrier. So no need for an arch_lock_acquire_barrier(). ++ */ + } + + static inline void _write_lock_irq(rwlock_t *lock) +@@ -157,11 +179,16 @@ static inline int _write_trylock(rwlock_t *lock) + if ( unlikely(cnts) ) + return 0; + ++ /* ++ * atomic_cmpxchg() is a full barrier so no need for an ++ * arch_lock_acquire_barrier(). ++ */ + return likely(atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0); + } + + static inline void _write_unlock(rwlock_t *lock) + { ++ arch_lock_release_barrier(); + /* + * If the writer field is atomic, it can be cleared directly. + * Otherwise, an atomic subtraction will be used to clear it. +-- +2.17.1 + diff --git a/system/xen/xsa/xsa316-xen.patch b/system/xen/xsa/xsa316-xen.patch new file mode 100644 index 0000000000000..4962b4e7161cb --- /dev/null +++ b/system/xen/xsa/xsa316-xen.patch @@ -0,0 +1,30 @@ +From: Ross Lagerwall <ross.lagerwall@citrix.com> +Subject: xen/gnttab: Fix error path in map_grant_ref() + +Part of XSA-295 (c/s 863e74eb2cffb) inadvertently re-positioned the brackets, +changing the logic. If the _set_status() call fails, the grant_map hypercall +would fail with a status of 1 (rc != GNTST_okay) instead of the expected +negative GNTST_* error. + +This error path can be taken due to bad guest state, and causes net/blk-back +in Linux to crash. + +This is XSA-316. + +Signed-off-by: Ross Lagerwall <ross.lagerwall@citrix.com> +Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Julien Grall <jgrall@amazon.com> + +diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c +index 9fd6e60416..4b5344dc21 100644 +--- a/xen/common/grant_table.c ++++ b/xen/common/grant_table.c +@@ -1031,7 +1031,7 @@ map_grant_ref( + { + if ( (rc = _set_status(shah, status, rd, rgt->gt_version, act, + op->flags & GNTMAP_readonly, 1, +- ld->domain_id) != GNTST_okay) ) ++ ld->domain_id)) != GNTST_okay ) + goto act_release_out; + + if ( !act->pin ) diff --git a/system/xen/xsa/xsa318.patch b/system/xen/xsa/xsa318.patch new file mode 100644 index 0000000000000..f4becdf81e7a7 --- /dev/null +++ b/system/xen/xsa/xsa318.patch @@ -0,0 +1,39 @@ +From: Jan Beulich <jbeulich@suse.com> +Subject: gnttab: fix GNTTABOP_copy continuation handling + +The XSA-226 fix was flawed - the backwards transformation on rc was done +too early, causing a continuation to not get invoked when the need for +preemption was determined at the very first iteration of the request. +This in particular means that all of the status fields of the individual +operations would be left untouched, i.e. set to whatever the caller may +or may not have initialized them to. + +This is part of XSA-318. + +Reported-by: Pawel Wieczorkiewicz <wipawel@amazon.de> +Tested-by: Pawel Wieczorkiewicz <wipawel@amazon.de> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Juergen Gross <jgross@suse.com> + +--- a/xen/common/grant_table.c ++++ b/xen/common/grant_table.c +@@ -3576,8 +3576,7 @@ do_grant_table_op( + rc = gnttab_copy(copy, count); + if ( rc > 0 ) + { +- rc = count - rc; +- guest_handle_add_offset(copy, rc); ++ guest_handle_add_offset(copy, count - rc); + uop = guest_handle_cast(copy, void); + } + break; +@@ -3644,6 +3643,9 @@ do_grant_table_op( + out: + if ( rc > 0 || opaque_out != 0 ) + { ++ /* Adjust rc, see gnttab_copy() for why this is needed. */ ++ if ( cmd == GNTTABOP_copy ) ++ rc = count - rc; + ASSERT(rc < count); + ASSERT((opaque_out & GNTTABOP_CMD_MASK) == 0); + rc = hypercall_create_continuation(__HYPERVISOR_grant_table_op, "ihi", |