author    | Mario Preksavec <mario@slackware.hr>             | 2022-10-13 11:36:16 +0200
committer | Willy Sudiarto Raharjo <willysr@slackbuilds.org> | 2022-10-15 10:47:42 +0700
commit    | b75c769415e3718b1b934f26737ad17f9ed019ce (patch)
tree      | 7a6e3fed3a7a8f14befad896144c810f2e66a74b
parent    | aae988ef834b611712b5b7b4df0256aae61ff3ff (diff)
system/xen: Updated for version 4.16.2.
Signed-off-by: Mario Preksavec <mario@slackware.hr>
Signed-off-by: Willy Sudiarto Raharjo <willysr@slackbuilds.org>
-rw-r--r-- | system/xen/xen.SlackBuild | 4
-rw-r--r-- | system/xen/xen.info | 6
-rw-r--r-- | system/xen/xsa/xsa401-4.16-1.patch | 170
-rw-r--r-- | system/xen/xsa/xsa401-4.16-2.patch | 191
-rw-r--r-- | system/xen/xsa/xsa402-4.16-1.patch | 43
-rw-r--r-- | system/xen/xsa/xsa402-4.16-2.patch | 213
-rw-r--r-- | system/xen/xsa/xsa402-4.16-3.patch | 284
-rw-r--r-- | system/xen/xsa/xsa402-4.16-4.patch | 83
-rw-r--r-- | system/xen/xsa/xsa402-4.16-5.patch | 148
-rw-r--r-- | system/xen/xsa/xsa410-4.16-01.patch | 59
-rw-r--r-- | system/xen/xsa/xsa410-4.16-02.patch | 165
-rw-r--r-- | system/xen/xsa/xsa410-4.16-03.patch | 113
-rw-r--r-- | system/xen/xsa/xsa410-4.16-04.patch | 62
-rw-r--r-- | system/xen/xsa/xsa410-4.16-05.patch | 60
-rw-r--r-- | system/xen/xsa/xsa410-4.16-06.patch | 255
-rw-r--r-- | system/xen/xsa/xsa410-4.16-07.patch | 82
-rw-r--r-- | system/xen/xsa/xsa410-4.16-08.patch | 96
-rw-r--r-- | system/xen/xsa/xsa410-4.16-09.patch | 159
-rw-r--r-- | system/xen/xsa/xsa410-4.16-10.patch | 171
-rw-r--r-- | system/xen/xsa/xsa411.patch | 55
20 files changed, 1282 insertions, 1137 deletions
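For context on the first hunk below, a minimal illustrative sketch (not part of the commit) of how the updated defaults in xen.SlackBuild behave: VERSION and BUILD fall back to the new 4.16.2 / 1 values unless overridden from the environment, which is the usual way to respin the package. The invocation shown is assumed conventional SlackBuilds usage, not something introduced by this change.

# Illustrative only -- mirrors the defaults set in the hunk below.
VERSION=${VERSION:-4.16.2}    # new upstream release picked up by default
BUILD=${BUILD:-1}             # build number reset to 1 for 4.16.2
TAG=${TAG:-_SBo}
PKGTYPE=${PKGTYPE:-tgz}
echo "package name would be: xen-$VERSION-<arch>-$BUILD$TAG.$PKGTYPE"

# Overriding from the environment, e.g. when respinning the package:
#   BUILD=2 ./xen.SlackBuild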
diff --git a/system/xen/xen.SlackBuild b/system/xen/xen.SlackBuild index e504dc97bf12..2d326cc732f1 100644 --- a/system/xen/xen.SlackBuild +++ b/system/xen/xen.SlackBuild @@ -25,8 +25,8 @@ cd $(dirname $0) ; CWD=$(pwd) PRGNAM=xen -VERSION=${VERSION:-4.16.1} -BUILD=${BUILD:-2} +VERSION=${VERSION:-4.16.2} +BUILD=${BUILD:-1} TAG=${TAG:-_SBo} PKGTYPE=${PKGTYPE:-tgz} diff --git a/system/xen/xen.info b/system/xen/xen.info index e6c8cb4da0d4..150bd770b565 100644 --- a/system/xen/xen.info +++ b/system/xen/xen.info @@ -1,9 +1,9 @@ PRGNAM="xen" -VERSION="4.16.1" +VERSION="4.16.2" HOMEPAGE="http://www.xenproject.org/" DOWNLOAD="UNSUPPORTED" MD5SUM="" -DOWNLOAD_x86_64="http://mirror.slackware.hr/sources/xen/xen-4.16.1.tar.gz \ +DOWNLOAD_x86_64="http://mirror.slackware.hr/sources/xen/xen-4.16.2.tar.gz \ http://mirror.slackware.hr/sources/xen-extfiles/ipxe-git-3c040ad387099483102708bb1839110bc788cefb.tar.gz \ http://mirror.slackware.hr/sources/xen-extfiles/lwip-1.3.0.tar.gz \ http://mirror.slackware.hr/sources/xen-extfiles/zlib-1.2.3.tar.gz \ @@ -15,7 +15,7 @@ DOWNLOAD_x86_64="http://mirror.slackware.hr/sources/xen/xen-4.16.1.tar.gz \ http://mirror.slackware.hr/sources/xen-extfiles/tpm_emulator-0.7.4.tar.gz \ http://mirror.slackware.hr/sources/xen-seabios/seabios-1.14.0.tar.gz \ http://mirror.slackware.hr/sources/xen-ovmf/xen-ovmf-20210824_7b4a99be8a.tar.bz2" -MD5SUM_x86_64="1c2cd4f7f966c1d455aab630953e5fad \ +MD5SUM_x86_64="6bd720f53e3c34a35cb8a8897a561e18 \ 23ba00d5e2c5b4343d12665af73e1cb5 \ 36cc57650cffda9a0269493be2a169bb \ debc62758716a169df9f62e6ab2bc634 \ diff --git a/system/xen/xsa/xsa401-4.16-1.patch b/system/xen/xsa/xsa401-4.16-1.patch deleted file mode 100644 index 5c8c50617a21..000000000000 --- a/system/xen/xsa/xsa401-4.16-1.patch +++ /dev/null @@ -1,170 +0,0 @@ -From: Andrew Cooper <andrew.cooper3@citrix.com> -Subject: x86/pv: Clean up _get_page_type() - -Various fixes for clarity, ahead of making complicated changes. - - * Split the overflow check out of the if/else chain for type handling, as - it's somewhat unrelated. - * Comment the main if/else chain to explain what is going on. Adjust one - ASSERT() and state the bit layout for validate-locked and partial states. - * Correct the comment about TLB flushing, as it's backwards. The problem - case is when writeable mappings are retained to a page becoming read-only, - as it allows the guest to bypass Xen's safety checks for updates. - * Reduce the scope of 'y'. It is an artefact of the cmpxchg loop and not - valid for use by subsequent logic. Switch to using ACCESS_ONCE() to treat - all reads as explicitly volatile. The only thing preventing the validated - wait-loop being infinite is the compiler barrier hidden in cpu_relax(). - * Replace one page_get_owner(page) with the already-calculated 'd' already in - scope. - -No functional change. - -This is part of XSA-401 / CVE-2022-26362. 
- -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Signed-off-by: George Dunlap <george.dunlap@eu.citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: George Dunlap <george.dunlap@citrix.com> - -diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c -index 796faca64103..ddd32f88c798 100644 ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -2935,16 +2935,17 @@ static int _put_page_type(struct page_info *page, unsigned int flags, - static int _get_page_type(struct page_info *page, unsigned long type, - bool preemptible) - { -- unsigned long nx, x, y = page->u.inuse.type_info; -+ unsigned long nx, x; - int rc = 0; - - ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2))); - ASSERT(!in_irq()); - -- for ( ; ; ) -+ for ( unsigned long y = ACCESS_ONCE(page->u.inuse.type_info); ; ) - { - x = y; - nx = x + 1; -+ - if ( unlikely((nx & PGT_count_mask) == 0) ) - { - gdprintk(XENLOG_WARNING, -@@ -2952,8 +2953,15 @@ static int _get_page_type(struct page_info *page, unsigned long type, - mfn_x(page_to_mfn(page))); - return -EINVAL; - } -- else if ( unlikely((x & PGT_count_mask) == 0) ) -+ -+ if ( unlikely((x & PGT_count_mask) == 0) ) - { -+ /* -+ * Typeref 0 -> 1. -+ * -+ * Type changes are permitted when the typeref is 0. If the type -+ * actually changes, the page needs re-validating. -+ */ - struct domain *d = page_get_owner(page); - - if ( d && shadow_mode_enabled(d) ) -@@ -2964,8 +2972,8 @@ static int _get_page_type(struct page_info *page, unsigned long type, - { - /* - * On type change we check to flush stale TLB entries. It is -- * vital that no other CPUs are left with mappings of a frame -- * which is about to become writeable to the guest. -+ * vital that no other CPUs are left with writeable mappings -+ * to a frame which is intending to become pgtable/segdesc. - */ - cpumask_t *mask = this_cpu(scratch_cpumask); - -@@ -2977,7 +2985,7 @@ static int _get_page_type(struct page_info *page, unsigned long type, - - if ( unlikely(!cpumask_empty(mask)) && - /* Shadow mode: track only writable pages. */ -- (!shadow_mode_enabled(page_get_owner(page)) || -+ (!shadow_mode_enabled(d) || - ((nx & PGT_type_mask) == PGT_writable_page)) ) - { - perfc_incr(need_flush_tlb_flush); -@@ -3008,7 +3016,14 @@ static int _get_page_type(struct page_info *page, unsigned long type, - } - else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) ) - { -- /* Don't log failure if it could be a recursive-mapping attempt. */ -+ /* -+ * else, we're trying to take a new reference, of the wrong type. -+ * -+ * This (being able to prohibit use of the wrong type) is what the -+ * typeref system exists for, but skip printing the failure if it -+ * looks like a recursive mapping, as subsequent logic might -+ * ultimately permit the attempt. -+ */ - if ( ((x & PGT_type_mask) == PGT_l2_page_table) && - (type == PGT_l1_page_table) ) - return -EINVAL; -@@ -3027,18 +3042,46 @@ static int _get_page_type(struct page_info *page, unsigned long type, - } - else if ( unlikely(!(x & PGT_validated)) ) - { -+ /* -+ * else, the count is non-zero, and we're grabbing the right type; -+ * but the page hasn't been validated yet. -+ * -+ * The page is in one of two states (depending on PGT_partial), -+ * and should have exactly one reference. -+ */ -+ ASSERT((x & (PGT_type_mask | PGT_count_mask)) == (type | 1)); -+ - if ( !(x & PGT_partial) ) - { -- /* Someone else is updating validation of this page. Wait... */ -+ /* -+ * The page has been left in the "validate locked" state -+ * (i.e. 
PGT_[type] | 1) which means that a concurrent caller -+ * of _get_page_type() is in the middle of validation. -+ * -+ * Spin waiting for the concurrent user to complete (partial -+ * or fully validated), then restart our attempt to acquire a -+ * type reference. -+ */ - do { - if ( preemptible && hypercall_preempt_check() ) - return -EINTR; - cpu_relax(); -- } while ( (y = page->u.inuse.type_info) == x ); -+ } while ( (y = ACCESS_ONCE(page->u.inuse.type_info)) == x ); - continue; - } -- /* Type ref count was left at 1 when PGT_partial got set. */ -- ASSERT((x & PGT_count_mask) == 1); -+ -+ /* -+ * The page has been left in the "partial" state -+ * (i.e., PGT_[type] | PGT_partial | 1). -+ * -+ * Rather than bumping the type count, we need to try to grab the -+ * validation lock; if we succeed, we need to validate the page, -+ * then drop the general ref associated with the PGT_partial bit. -+ * -+ * We grab the validation lock by setting nx to (PGT_[type] | 1) -+ * (i.e., non-zero type count, neither PGT_validated nor -+ * PGT_partial set). -+ */ - nx = x & ~PGT_partial; - } - -@@ -3087,6 +3130,13 @@ static int _get_page_type(struct page_info *page, unsigned long type, - } - - out: -+ /* -+ * Did we drop the PGT_partial bit when acquiring the typeref? If so, -+ * drop the general reference that went along with it. -+ * -+ * N.B. validate_page() may have have re-set PGT_partial, not reflected in -+ * nx, but will have taken an extra ref when doing so. -+ */ - if ( (x & PGT_partial) && !(nx & PGT_partial) ) - put_page(page); - diff --git a/system/xen/xsa/xsa401-4.16-2.patch b/system/xen/xsa/xsa401-4.16-2.patch deleted file mode 100644 index be58db59a513..000000000000 --- a/system/xen/xsa/xsa401-4.16-2.patch +++ /dev/null @@ -1,191 +0,0 @@ -From: Andrew Cooper <andrew.cooper3@citrix.com> -Subject: x86/pv: Fix ABAC cmpxchg() race in _get_page_type() - -_get_page_type() suffers from a race condition where it incorrectly assumes -that because 'x' was read and a subsequent a cmpxchg() succeeds, the type -cannot have changed in-between. Consider: - -CPU A: - 1. Creates an L2e referencing pg - `-> _get_page_type(pg, PGT_l1_page_table), sees count 0, type PGT_writable_page - 2. Issues flush_tlb_mask() -CPU B: - 3. Creates a writeable mapping of pg - `-> _get_page_type(pg, PGT_writable_page), count increases to 1 - 4. Writes into new mapping, creating a TLB entry for pg - 5. Removes the writeable mapping of pg - `-> _put_page_type(pg), count goes back down to 0 -CPU A: - 7. Issues cmpxchg(), setting count 1, type PGT_l1_page_table - -CPU B now has a writeable mapping to pg, which Xen believes is a pagetable and -suitably protected (i.e. read-only). The TLB flush in step 2 must be deferred -until after the guest is prohibited from creating new writeable mappings, -which is after step 7. - -Defer all safety actions until after the cmpxchg() has successfully taken the -intended typeref, because that is what prevents concurrent users from using -the old type. - -Also remove the early validation for writeable and shared pages. This removes -race conditions where one half of a parallel mapping attempt can return -successfully before: - * The IOMMU pagetables are in sync with the new page type - * Writeable mappings to shared pages have been torn down - -This is part of XSA-401 / CVE-2022-26362. 
- -Reported-by: Jann Horn <jannh@google.com> -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: George Dunlap <george.dunlap@citrix.com> - -diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c -index ddd32f88c798..1693b580b152 100644 ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -2962,56 +2962,12 @@ static int _get_page_type(struct page_info *page, unsigned long type, - * Type changes are permitted when the typeref is 0. If the type - * actually changes, the page needs re-validating. - */ -- struct domain *d = page_get_owner(page); -- -- if ( d && shadow_mode_enabled(d) ) -- shadow_prepare_page_type_change(d, page, type); - - ASSERT(!(x & PGT_pae_xen_l2)); - if ( (x & PGT_type_mask) != type ) - { -- /* -- * On type change we check to flush stale TLB entries. It is -- * vital that no other CPUs are left with writeable mappings -- * to a frame which is intending to become pgtable/segdesc. -- */ -- cpumask_t *mask = this_cpu(scratch_cpumask); -- -- BUG_ON(in_irq()); -- cpumask_copy(mask, d->dirty_cpumask); -- -- /* Don't flush if the timestamp is old enough */ -- tlbflush_filter(mask, page->tlbflush_timestamp); -- -- if ( unlikely(!cpumask_empty(mask)) && -- /* Shadow mode: track only writable pages. */ -- (!shadow_mode_enabled(d) || -- ((nx & PGT_type_mask) == PGT_writable_page)) ) -- { -- perfc_incr(need_flush_tlb_flush); -- /* -- * If page was a page table make sure the flush is -- * performed using an IPI in order to avoid changing the -- * type of a page table page under the feet of -- * spurious_page_fault(). -- */ -- flush_mask(mask, -- (x & PGT_type_mask) && -- (x & PGT_type_mask) <= PGT_root_page_table -- ? FLUSH_TLB | FLUSH_FORCE_IPI -- : FLUSH_TLB); -- } -- -- /* We lose existing type and validity. */ - nx &= ~(PGT_type_mask | PGT_validated); - nx |= type; -- -- /* -- * No special validation needed for writable pages. -- * Page tables and GDT/LDT need to be scanned for validity. -- */ -- if ( type == PGT_writable_page || type == PGT_shared_page ) -- nx |= PGT_validated; - } - } - else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) ) -@@ -3092,6 +3048,56 @@ static int _get_page_type(struct page_info *page, unsigned long type, - return -EINTR; - } - -+ /* -+ * One typeref has been taken and is now globally visible. -+ * -+ * The page is either in the "validate locked" state (PGT_[type] | 1) or -+ * fully validated (PGT_[type] | PGT_validated | >0). -+ */ -+ -+ if ( unlikely((x & PGT_count_mask) == 0) ) -+ { -+ struct domain *d = page_get_owner(page); -+ -+ if ( d && shadow_mode_enabled(d) ) -+ shadow_prepare_page_type_change(d, page, type); -+ -+ if ( (x & PGT_type_mask) != type ) -+ { -+ /* -+ * On type change we check to flush stale TLB entries. It is -+ * vital that no other CPUs are left with writeable mappings -+ * to a frame which is intending to become pgtable/segdesc. -+ */ -+ cpumask_t *mask = this_cpu(scratch_cpumask); -+ -+ BUG_ON(in_irq()); -+ cpumask_copy(mask, d->dirty_cpumask); -+ -+ /* Don't flush if the timestamp is old enough */ -+ tlbflush_filter(mask, page->tlbflush_timestamp); -+ -+ if ( unlikely(!cpumask_empty(mask)) && -+ /* Shadow mode: track only writable pages. 
*/ -+ (!shadow_mode_enabled(d) || -+ ((nx & PGT_type_mask) == PGT_writable_page)) ) -+ { -+ perfc_incr(need_flush_tlb_flush); -+ /* -+ * If page was a page table make sure the flush is -+ * performed using an IPI in order to avoid changing the -+ * type of a page table page under the feet of -+ * spurious_page_fault(). -+ */ -+ flush_mask(mask, -+ (x & PGT_type_mask) && -+ (x & PGT_type_mask) <= PGT_root_page_table -+ ? FLUSH_TLB | FLUSH_FORCE_IPI -+ : FLUSH_TLB); -+ } -+ } -+ } -+ - if ( unlikely(((x & PGT_type_mask) == PGT_writable_page) != - (type == PGT_writable_page)) ) - { -@@ -3120,13 +3126,25 @@ static int _get_page_type(struct page_info *page, unsigned long type, - - if ( unlikely(!(nx & PGT_validated)) ) - { -- if ( !(x & PGT_partial) ) -+ /* -+ * No special validation needed for writable or shared pages. Page -+ * tables and GDT/LDT need to have their contents audited. -+ * -+ * per validate_page(), non-atomic updates are fine here. -+ */ -+ if ( type == PGT_writable_page || type == PGT_shared_page ) -+ page->u.inuse.type_info |= PGT_validated; -+ else - { -- page->nr_validated_ptes = 0; -- page->partial_flags = 0; -- page->linear_pt_count = 0; -+ if ( !(x & PGT_partial) ) -+ { -+ page->nr_validated_ptes = 0; -+ page->partial_flags = 0; -+ page->linear_pt_count = 0; -+ } -+ -+ rc = validate_page(page, type, preemptible); - } -- rc = validate_page(page, type, preemptible); - } - - out: diff --git a/system/xen/xsa/xsa402-4.16-1.patch b/system/xen/xsa/xsa402-4.16-1.patch deleted file mode 100644 index b783383fc89f..000000000000 --- a/system/xen/xsa/xsa402-4.16-1.patch +++ /dev/null @@ -1,43 +0,0 @@ -From: Andrew Cooper <andrew.cooper3@citrix.com> -Subject: x86/page: Introduce _PAGE_* constants for memory types - -... rather than opencoding the PAT/PCD/PWT attributes in __PAGE_HYPERVISOR_* -constants. These are going to be needed by forthcoming logic. - -No functional change. - -This is part of XSA-402. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> - -diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h -index 1d080cffbe84..2e542050f65a 100644 ---- a/xen/include/asm-x86/page.h -+++ b/xen/include/asm-x86/page.h -@@ -331,6 +331,14 @@ void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t); - - #define PAGE_CACHE_ATTRS (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT) - -+/* Memory types, encoded under Xen's choice of MSR_PAT. */ -+#define _PAGE_WB ( 0) -+#define _PAGE_WT ( _PAGE_PWT) -+#define _PAGE_UCM ( _PAGE_PCD ) -+#define _PAGE_UC ( _PAGE_PCD | _PAGE_PWT) -+#define _PAGE_WC (_PAGE_PAT ) -+#define _PAGE_WP (_PAGE_PAT | _PAGE_PWT) -+ - /* - * Debug option: Ensure that granted mappings are not implicitly unmapped. 
- * WARNING: This will need to be disabled to run OSes that use the spare PTE -@@ -349,8 +357,8 @@ void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t); - #define __PAGE_HYPERVISOR_RX (_PAGE_PRESENT | _PAGE_ACCESSED) - #define __PAGE_HYPERVISOR (__PAGE_HYPERVISOR_RX | \ - _PAGE_DIRTY | _PAGE_RW) --#define __PAGE_HYPERVISOR_UCMINUS (__PAGE_HYPERVISOR | _PAGE_PCD) --#define __PAGE_HYPERVISOR_UC (__PAGE_HYPERVISOR | _PAGE_PCD | _PAGE_PWT) -+#define __PAGE_HYPERVISOR_UCMINUS (__PAGE_HYPERVISOR | _PAGE_UCM) -+#define __PAGE_HYPERVISOR_UC (__PAGE_HYPERVISOR | _PAGE_UC) - #define __PAGE_HYPERVISOR_SHSTK (__PAGE_HYPERVISOR_RO | _PAGE_DIRTY) - - #define MAP_SMALL_PAGES _PAGE_AVAIL0 /* don't use superpages mappings */ diff --git a/system/xen/xsa/xsa402-4.16-2.patch b/system/xen/xsa/xsa402-4.16-2.patch deleted file mode 100644 index ebb2f5e221e8..000000000000 --- a/system/xen/xsa/xsa402-4.16-2.patch +++ /dev/null @@ -1,213 +0,0 @@ -From: Andrew Cooper <andrew.cooper3@citrix.com> -Subject: x86: Don't change the cacheability of the directmap - -Changeset 55f97f49b7ce ("x86: Change cache attributes of Xen 1:1 page mappings -in response to guest mapping requests") attempted to keep the cacheability -consistent between different mappings of the same page. - -The reason wasn't described in the changelog, but it is understood to be in -regards to a concern over machine check exceptions, owing to errata when using -mixed cacheabilities. It did this primarily by updating Xen's mapping of the -page in the direct map when the guest mapped a page with reduced cacheability. - -Unfortunately, the logic didn't actually prevent mixed cacheability from -occurring: - * A guest could map a page normally, and then map the same page with - different cacheability; nothing prevented this. - * The cacheability of the directmap was always latest-takes-precedence in - terms of guest requests. - * Grant-mapped frames with lesser cacheability didn't adjust the page's - cacheattr settings. - * The map_domain_page() function still unconditionally created WB mappings, - irrespective of the page's cacheattr settings. - -Additionally, update_xen_mappings() had a bug where the alias calculation was -wrong for mfn's which were .init content, which should have been treated as -fully guest pages, not Xen pages. - -Worse yet, the logic introduced a vulnerability whereby necessary -pagetable/segdesc adjustments made by Xen in the validation logic could become -non-coherent between the cache and main memory. The CPU could subsequently -operate on the stale value in the cache, rather than the safe value in main -memory. - -The directmap contains primarily mappings of RAM. PAT/MTRR conflict -resolution is asymmetric, and generally for MTRR=WB ranges, PAT of lesser -cacheability resolves to being coherent. The special case is WC mappings, -which are non-coherent against MTRR=WB regions (except for fully-coherent -CPUs). - -Xen must not have any WC cacheability in the directmap, to prevent Xen's -actions from creating non-coherency. (Guest actions creating non-coherency is -dealt with in subsequent patches.) As all memory types for MTRR=WB ranges -inter-operate coherently, so leave Xen's directmap mappings as WB. - -Only PV guests with access to devices can use reduced-cacheability mappings to -begin with, and they're trusted not to mount DoSs against the system anyway. - -Drop PGC_cacheattr_{base,mask} entirely, and the logic to manipulate them. -Shift the later PGC_* constants up, to gain 3 extra bits in the main reference -count. 
Retain the check in get_page_from_l1e() for special_pages() because a -guest has no business using reduced cacheability on these. - -This reverts changeset 55f97f49b7ce6c3520c555d19caac6cf3f9a5df0 - -This is CVE-2022-26363, part of XSA-402. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: George Dunlap <george.dunlap@citrix.com> - -diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c -index c6429b0f749a..ab32d13a1a0d 100644 ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -783,28 +783,6 @@ bool is_iomem_page(mfn_t mfn) - return (page_get_owner(page) == dom_io); - } - --static int update_xen_mappings(unsigned long mfn, unsigned int cacheattr) --{ -- int err = 0; -- bool alias = mfn >= PFN_DOWN(xen_phys_start) && -- mfn < PFN_UP(xen_phys_start + xen_virt_end - XEN_VIRT_START); -- unsigned long xen_va = -- XEN_VIRT_START + ((mfn - PFN_DOWN(xen_phys_start)) << PAGE_SHIFT); -- -- if ( boot_cpu_has(X86_FEATURE_XEN_SELFSNOOP) ) -- return 0; -- -- if ( unlikely(alias) && cacheattr ) -- err = map_pages_to_xen(xen_va, _mfn(mfn), 1, 0); -- if ( !err ) -- err = map_pages_to_xen((unsigned long)mfn_to_virt(mfn), _mfn(mfn), 1, -- PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr)); -- if ( unlikely(alias) && !cacheattr && !err ) -- err = map_pages_to_xen(xen_va, _mfn(mfn), 1, PAGE_HYPERVISOR); -- -- return err; --} -- - #ifndef NDEBUG - struct mmio_emul_range_ctxt { - const struct domain *d; -@@ -1009,47 +987,14 @@ get_page_from_l1e( - goto could_not_pin; - } - -- if ( pte_flags_to_cacheattr(l1f) != -- ((page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base) ) -+ if ( (l1f & PAGE_CACHE_ATTRS) != _PAGE_WB && is_special_page(page) ) - { -- unsigned long x, nx, y = page->count_info; -- unsigned long cacheattr = pte_flags_to_cacheattr(l1f); -- int err; -- -- if ( is_special_page(page) ) -- { -- if ( write ) -- put_page_type(page); -- put_page(page); -- gdprintk(XENLOG_WARNING, -- "Attempt to change cache attributes of Xen heap page\n"); -- return -EACCES; -- } -- -- do { -- x = y; -- nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base); -- } while ( (y = cmpxchg(&page->count_info, x, nx)) != x ); -- -- err = update_xen_mappings(mfn, cacheattr); -- if ( unlikely(err) ) -- { -- cacheattr = y & PGC_cacheattr_mask; -- do { -- x = y; -- nx = (x & ~PGC_cacheattr_mask) | cacheattr; -- } while ( (y = cmpxchg(&page->count_info, x, nx)) != x ); -- -- if ( write ) -- put_page_type(page); -- put_page(page); -- -- gdprintk(XENLOG_WARNING, "Error updating mappings for mfn %" PRI_mfn -- " (pfn %" PRI_pfn ", from L1 entry %" PRIpte ") for d%d\n", -- mfn, get_gpfn_from_mfn(mfn), -- l1e_get_intpte(l1e), l1e_owner->domain_id); -- return err; -- } -+ if ( write ) -+ put_page_type(page); -+ put_page(page); -+ gdprintk(XENLOG_WARNING, -+ "Attempt to change cache attributes of Xen heap page\n"); -+ return -EACCES; - } - - return 0; -@@ -2467,25 +2412,10 @@ static int mod_l4_entry(l4_pgentry_t *pl4e, - */ - static int cleanup_page_mappings(struct page_info *page) - { -- unsigned int cacheattr = -- (page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base; - int rc = 0; - unsigned long mfn = mfn_x(page_to_mfn(page)); - - /* -- * If we've modified xen mappings as a result of guest cache -- * attributes, restore them to the "normal" state. -- */ -- if ( unlikely(cacheattr) ) -- { -- page->count_info &= ~PGC_cacheattr_mask; -- -- BUG_ON(is_special_page(page)); -- -- rc = update_xen_mappings(mfn, 0); -- } -- -- /* - * If this may be in a PV domain's IOMMU, remove it. 
- * - * NB that writable xenheap pages have their type set and cleared by -diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h -index cb9052749963..8a9a43bb0a9d 100644 ---- a/xen/include/asm-x86/mm.h -+++ b/xen/include/asm-x86/mm.h -@@ -69,25 +69,22 @@ - /* Set when is using a page as a page table */ - #define _PGC_page_table PG_shift(3) - #define PGC_page_table PG_mask(1, 3) -- /* 3-bit PAT/PCD/PWT cache-attribute hint. */ --#define PGC_cacheattr_base PG_shift(6) --#define PGC_cacheattr_mask PG_mask(7, 6) - /* Page is broken? */ --#define _PGC_broken PG_shift(7) --#define PGC_broken PG_mask(1, 7) -+#define _PGC_broken PG_shift(4) -+#define PGC_broken PG_mask(1, 4) - /* Mutually-exclusive page states: { inuse, offlining, offlined, free }. */ --#define PGC_state PG_mask(3, 9) --#define PGC_state_inuse PG_mask(0, 9) --#define PGC_state_offlining PG_mask(1, 9) --#define PGC_state_offlined PG_mask(2, 9) --#define PGC_state_free PG_mask(3, 9) -+#define PGC_state PG_mask(3, 6) -+#define PGC_state_inuse PG_mask(0, 6) -+#define PGC_state_offlining PG_mask(1, 6) -+#define PGC_state_offlined PG_mask(2, 6) -+#define PGC_state_free PG_mask(3, 6) - #define page_state_is(pg, st) (((pg)->count_info&PGC_state) == PGC_state_##st) - /* Page is not reference counted (see below for caveats) */ --#define _PGC_extra PG_shift(10) --#define PGC_extra PG_mask(1, 10) -+#define _PGC_extra PG_shift(7) -+#define PGC_extra PG_mask(1, 7) - - /* Count of references to this frame. */ --#define PGC_count_width PG_shift(10) -+#define PGC_count_width PG_shift(7) - #define PGC_count_mask ((1UL<<PGC_count_width)-1) - - /* diff --git a/system/xen/xsa/xsa402-4.16-3.patch b/system/xen/xsa/xsa402-4.16-3.patch deleted file mode 100644 index b4d2a4c835ae..000000000000 --- a/system/xen/xsa/xsa402-4.16-3.patch +++ /dev/null @@ -1,284 +0,0 @@ -From: Andrew Cooper <andrew.cooper3@citrix.com> -Subject: x86: Split cache_flush() out of cache_writeback() - -Subsequent changes will want a fully flushing version. - -Use the new helper rather than opencoding it in flush_area_local(). This -resolves an outstanding issue where the conditional sfence is on the wrong -side of the clflushopt loop. clflushopt is ordered with respect to older -stores, not to younger stores. - -Rename gnttab_cache_flush()'s helper to avoid colliding in name. -grant_table.c can see the prototype from cache.h so the build fails -otherwise. - -This is part of XSA-402. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> - -Xen 4.16 and earlier: - * Also backport half of c/s 3330013e67396 "VT-d / x86: re-arrange cache - syncing" to split cache_writeback() out of the IOMMU logic, but without the - associated hooks changes. 
- -diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c -index 25798df50f54..0c912b8669f8 100644 ---- a/xen/arch/x86/flushtlb.c -+++ b/xen/arch/x86/flushtlb.c -@@ -234,7 +234,7 @@ unsigned int flush_area_local(const void *va, unsigned int flags) - if ( flags & FLUSH_CACHE ) - { - const struct cpuinfo_x86 *c = ¤t_cpu_data; -- unsigned long i, sz = 0; -+ unsigned long sz = 0; - - if ( order < (BITS_PER_LONG - PAGE_SHIFT) ) - sz = 1UL << (order + PAGE_SHIFT); -@@ -244,13 +244,7 @@ unsigned int flush_area_local(const void *va, unsigned int flags) - c->x86_clflush_size && c->x86_cache_size && sz && - ((sz >> 10) < c->x86_cache_size) ) - { -- alternative("", "sfence", X86_FEATURE_CLFLUSHOPT); -- for ( i = 0; i < sz; i += c->x86_clflush_size ) -- alternative_input(".byte " __stringify(NOP_DS_PREFIX) ";" -- " clflush %0", -- "data16 clflush %0", /* clflushopt */ -- X86_FEATURE_CLFLUSHOPT, -- "m" (((const char *)va)[i])); -+ cache_flush(va, sz); - flags &= ~FLUSH_CACHE; - } - else -@@ -265,6 +259,80 @@ unsigned int flush_area_local(const void *va, unsigned int flags) - return flags; - } - -+void cache_flush(const void *addr, unsigned int size) -+{ -+ /* -+ * This function may be called before current_cpu_data is established. -+ * Hence a fallback is needed to prevent the loop below becoming infinite. -+ */ -+ unsigned int clflush_size = current_cpu_data.x86_clflush_size ?: 16; -+ const void *end = addr + size; -+ -+ addr -= (unsigned long)addr & (clflush_size - 1); -+ for ( ; addr < end; addr += clflush_size ) -+ { -+ /* -+ * Note regarding the "ds" prefix use: it's faster to do a clflush -+ * + prefix than a clflush + nop, and hence the prefix is added instead -+ * of letting the alternative framework fill the gap by appending nops. -+ */ -+ alternative_io("ds; clflush %[p]", -+ "data16 clflush %[p]", /* clflushopt */ -+ X86_FEATURE_CLFLUSHOPT, -+ /* no outputs */, -+ [p] "m" (*(const char *)(addr))); -+ } -+ -+ alternative("", "sfence", X86_FEATURE_CLFLUSHOPT); -+} -+ -+void cache_writeback(const void *addr, unsigned int size) -+{ -+ unsigned int clflush_size; -+ const void *end = addr + size; -+ -+ /* Fall back to CLFLUSH{,OPT} when CLWB isn't available. */ -+ if ( !boot_cpu_has(X86_FEATURE_CLWB) ) -+ return cache_flush(addr, size); -+ -+ /* -+ * This function may be called before current_cpu_data is established. -+ * Hence a fallback is needed to prevent the loop below becoming infinite. -+ */ -+ clflush_size = current_cpu_data.x86_clflush_size ?: 16; -+ addr -= (unsigned long)addr & (clflush_size - 1); -+ for ( ; addr < end; addr += clflush_size ) -+ { -+/* -+ * The arguments to a macro must not include preprocessor directives. Doing so -+ * results in undefined behavior, so we have to create some defines here in -+ * order to avoid it. 
-+ */ -+#if defined(HAVE_AS_CLWB) -+# define CLWB_ENCODING "clwb %[p]" -+#elif defined(HAVE_AS_XSAVEOPT) -+# define CLWB_ENCODING "data16 xsaveopt %[p]" /* clwb */ -+#else -+# define CLWB_ENCODING ".byte 0x66, 0x0f, 0xae, 0x30" /* clwb (%%rax) */ -+#endif -+ -+#define BASE_INPUT(addr) [p] "m" (*(const char *)(addr)) -+#if defined(HAVE_AS_CLWB) || defined(HAVE_AS_XSAVEOPT) -+# define INPUT BASE_INPUT -+#else -+# define INPUT(addr) "a" (addr), BASE_INPUT(addr) -+#endif -+ -+ asm volatile (CLWB_ENCODING :: INPUT(addr)); -+ -+#undef INPUT -+#undef BASE_INPUT -+#undef CLWB_ENCODING -+ } -+ -+ asm volatile ("sfence" ::: "memory"); -+} -+ - unsigned int guest_flush_tlb_flags(const struct domain *d) - { - bool shadow = paging_mode_shadow(d); -diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c -index 66f8ce71741c..4c742cd8fe81 100644 ---- a/xen/common/grant_table.c -+++ b/xen/common/grant_table.c -@@ -3431,7 +3431,7 @@ gnttab_swap_grant_ref(XEN_GUEST_HANDLE_PARAM(gnttab_swap_grant_ref_t) uop, - return 0; - } - --static int cache_flush(const gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref) -+static int _cache_flush(const gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref) - { - struct domain *d, *owner; - struct page_info *page; -@@ -3525,7 +3525,7 @@ gnttab_cache_flush(XEN_GUEST_HANDLE_PARAM(gnttab_cache_flush_t) uop, - return -EFAULT; - for ( ; ; ) - { -- int ret = cache_flush(&op, cur_ref); -+ int ret = _cache_flush(&op, cur_ref); - - if ( ret < 0 ) - return ret; -diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h -index 01e010a10d61..401079299725 100644 ---- a/xen/drivers/passthrough/vtd/extern.h -+++ b/xen/drivers/passthrough/vtd/extern.h -@@ -76,7 +76,6 @@ int __must_check qinval_device_iotlb_sync(struct vtd_iommu *iommu, - struct pci_dev *pdev, - u16 did, u16 size, u64 addr); - --unsigned int get_cache_line_size(void); - void flush_all_cache(void); - - uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node); -diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c -index 8975c1de61bc..bc377c9bcfa4 100644 ---- a/xen/drivers/passthrough/vtd/iommu.c -+++ b/xen/drivers/passthrough/vtd/iommu.c -@@ -31,6 +31,7 @@ - #include <xen/pci.h> - #include <xen/pci_regs.h> - #include <xen/keyhandler.h> -+#include <asm/cache.h> - #include <asm/msi.h> - #include <asm/nops.h> - #include <asm/irq.h> -@@ -206,54 +207,6 @@ static void check_cleanup_domid_map(const struct domain *d, - } - } - --static void sync_cache(const void *addr, unsigned int size) --{ -- static unsigned long clflush_size = 0; -- const void *end = addr + size; -- -- if ( clflush_size == 0 ) -- clflush_size = get_cache_line_size(); -- -- addr -= (unsigned long)addr & (clflush_size - 1); -- for ( ; addr < end; addr += clflush_size ) --/* -- * The arguments to a macro must not include preprocessor directives. Doing so -- * results in undefined behavior, so we have to create some defines here in -- * order to avoid it. 
-- */ --#if defined(HAVE_AS_CLWB) --# define CLWB_ENCODING "clwb %[p]" --#elif defined(HAVE_AS_XSAVEOPT) --# define CLWB_ENCODING "data16 xsaveopt %[p]" /* clwb */ --#else --# define CLWB_ENCODING ".byte 0x66, 0x0f, 0xae, 0x30" /* clwb (%%rax) */ --#endif -- --#define BASE_INPUT(addr) [p] "m" (*(const char *)(addr)) --#if defined(HAVE_AS_CLWB) || defined(HAVE_AS_XSAVEOPT) --# define INPUT BASE_INPUT --#else --# define INPUT(addr) "a" (addr), BASE_INPUT(addr) --#endif -- /* -- * Note regarding the use of NOP_DS_PREFIX: it's faster to do a clflush -- * + prefix than a clflush + nop, and hence the prefix is added instead -- * of letting the alternative framework fill the gap by appending nops. -- */ -- alternative_io_2(".byte " __stringify(NOP_DS_PREFIX) "; clflush %[p]", -- "data16 clflush %[p]", /* clflushopt */ -- X86_FEATURE_CLFLUSHOPT, -- CLWB_ENCODING, -- X86_FEATURE_CLWB, /* no outputs */, -- INPUT(addr)); --#undef INPUT --#undef BASE_INPUT --#undef CLWB_ENCODING -- -- alternative_2("", "sfence", X86_FEATURE_CLFLUSHOPT, -- "sfence", X86_FEATURE_CLWB); --} -- - /* Allocate page table, return its machine address */ - uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node) - { -@@ -273,7 +226,7 @@ uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node) - clear_page(vaddr); - - if ( (iommu_ops.init ? &iommu_ops : &vtd_ops)->sync_cache ) -- sync_cache(vaddr, PAGE_SIZE); -+ cache_writeback(vaddr, PAGE_SIZE); - unmap_domain_page(vaddr); - cur_pg++; - } -@@ -1305,7 +1258,7 @@ int __init iommu_alloc(struct acpi_drhd_unit *drhd) - iommu->nr_pt_levels = agaw_to_level(agaw); - - if ( !ecap_coherent(iommu->ecap) ) -- vtd_ops.sync_cache = sync_cache; -+ vtd_ops.sync_cache = cache_writeback; - - /* allocate domain id bitmap */ - iommu->domid_bitmap = xzalloc_array(unsigned long, BITS_TO_LONGS(nr_dom)); -diff --git a/xen/drivers/passthrough/vtd/x86/vtd.c b/xen/drivers/passthrough/vtd/x86/vtd.c -index 6681dccd6970..55f0faa521cb 100644 ---- a/xen/drivers/passthrough/vtd/x86/vtd.c -+++ b/xen/drivers/passthrough/vtd/x86/vtd.c -@@ -47,11 +47,6 @@ void unmap_vtd_domain_page(const void *va) - unmap_domain_page(va); - } - --unsigned int get_cache_line_size(void) --{ -- return ((cpuid_ebx(1) >> 8) & 0xff) * 8; --} -- - void flush_all_cache() - { - wbinvd(); -diff --git a/xen/include/asm-x86/cache.h b/xen/include/asm-x86/cache.h -index 1f7173d8c72c..e4770efb22b9 100644 ---- a/xen/include/asm-x86/cache.h -+++ b/xen/include/asm-x86/cache.h -@@ -11,4 +11,11 @@ - - #define __read_mostly __section(".data.read_mostly") - -+#ifndef __ASSEMBLY__ -+ -+void cache_flush(const void *addr, unsigned int size); -+void cache_writeback(const void *addr, unsigned int size); -+ -+#endif -+ - #endif diff --git a/system/xen/xsa/xsa402-4.16-4.patch b/system/xen/xsa/xsa402-4.16-4.patch deleted file mode 100644 index 21109225d7d7..000000000000 --- a/system/xen/xsa/xsa402-4.16-4.patch +++ /dev/null @@ -1,83 +0,0 @@ -From: Andrew Cooper <andrew.cooper3@citrix.com> -Subject: x86/amd: Work around CLFLUSH ordering on older parts - -On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakely ordered with everything, -including reads and writes to the address, and LFENCE/SFENCE instructions. - -This creates a multitude of problematic corner cases, laid out in the manual. -Arrange to use MFENCE on both sides of the CLFLUSH to force proper ordering. - -This is part of XSA-402. 
- -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> - -diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c -index a8e37dbb1f5c..b3b9a0df5fed 100644 ---- a/xen/arch/x86/cpu/amd.c -+++ b/xen/arch/x86/cpu/amd.c -@@ -812,6 +812,14 @@ static void init_amd(struct cpuinfo_x86 *c) - if (!cpu_has_lfence_dispatch) - __set_bit(X86_FEATURE_MFENCE_RDTSC, c->x86_capability); - -+ /* -+ * On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with -+ * everything, including reads and writes to address, and -+ * LFENCE/SFENCE instructions. -+ */ -+ if (!cpu_has_clflushopt) -+ setup_force_cpu_cap(X86_BUG_CLFLUSH_MFENCE); -+ - switch(c->x86) - { - case 0xf ... 0x11: -diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c -index 0c912b8669f8..dcbb4064012e 100644 ---- a/xen/arch/x86/flushtlb.c -+++ b/xen/arch/x86/flushtlb.c -@@ -259,6 +259,13 @@ unsigned int flush_area_local(const void *va, unsigned int flags) - return flags; - } - -+/* -+ * On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with everything, -+ * including reads and writes to address, and LFENCE/SFENCE instructions. -+ * -+ * This function only works safely after alternatives have run. Luckily, at -+ * the time of writing, we don't flush the caches that early. -+ */ - void cache_flush(const void *addr, unsigned int size) - { - /* -@@ -268,6 +275,8 @@ void cache_flush(const void *addr, unsigned int size) - unsigned int clflush_size = current_cpu_data.x86_clflush_size ?: 16; - const void *end = addr + size; - -+ alternative("", "mfence", X86_BUG_CLFLUSH_MFENCE); -+ - addr -= (unsigned long)addr & (clflush_size - 1); - for ( ; addr < end; addr += clflush_size ) - { -@@ -283,7 +292,9 @@ void cache_flush(const void *addr, unsigned int size) - [p] "m" (*(const char *)(addr))); - } - -- alternative("", "sfence", X86_FEATURE_CLFLUSHOPT); -+ alternative_2("", -+ "sfence", X86_FEATURE_CLFLUSHOPT, -+ "mfence", X86_BUG_CLFLUSH_MFENCE); - } - - void cache_writeback(const void *addr, unsigned int size) -diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h -index 7413febd7ad8..ff3157d52d13 100644 ---- a/xen/include/asm-x86/cpufeatures.h -+++ b/xen/include/asm-x86/cpufeatures.h -@@ -47,6 +47,7 @@ XEN_CPUFEATURE(XEN_IBT, X86_SYNTH(27)) /* Xen uses CET Indirect Branch - - #define X86_BUG_FPU_PTRS X86_BUG( 0) /* (F)X{SAVE,RSTOR} doesn't save/restore FOP/FIP/FDP. */ - #define X86_BUG_NULL_SEG X86_BUG( 1) /* NULL-ing a selector preserves the base and limit. */ -+#define X86_BUG_CLFLUSH_MFENCE X86_BUG( 2) /* MFENCE needed to serialise CLFLUSH */ - - /* Total number of capability words, inc synth and bug words. */ - #define NCAPINTS (FSCAPINTS + X86_NR_SYNTH + X86_NR_BUG) /* N 32-bit words worth of info */ diff --git a/system/xen/xsa/xsa402-4.16-5.patch b/system/xen/xsa/xsa402-4.16-5.patch deleted file mode 100644 index 4806d25c6fa5..000000000000 --- a/system/xen/xsa/xsa402-4.16-5.patch +++ /dev/null @@ -1,148 +0,0 @@ -From: Andrew Cooper <andrew.cooper3@citrix.com> -Subject: x86/pv: Track and flush non-coherent mappings of RAM - -There are legitimate uses of WC mappings of RAM, e.g. for DMA buffers with -devices that make non-coherent writes. The Linux sound subsystem makes -extensive use of this technique. - -For such usecases, the guest's DMA buffer is mapped and consistently used as -WC, and Xen doesn't interact with the buffer. 
- -However, a mischevious guest can use WC mappings to deliberately create -non-coherency between the cache and RAM, and use this to trick Xen into -validating a pagetable which isn't actually safe. - -Allocate a new PGT_non_coherent to track the non-coherency of mappings. Set -it whenever a non-coherent writeable mapping is created. If the page is used -as anything other than PGT_writable_page, force a cache flush before -validation. Also force a cache flush before the page is returned to the heap. - -This is CVE-2022-26364, part of XSA-402. - -Reported-by: Jann Horn <jannh@google.com> -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: George Dunlap <george.dunlap@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> - -diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c -index ab32d13a1a0d..bab9624fabb7 100644 ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -997,6 +997,15 @@ get_page_from_l1e( - return -EACCES; - } - -+ /* -+ * Track writeable non-coherent mappings to RAM pages, to trigger a cache -+ * flush later if the target is used as anything but a PGT_writeable page. -+ * We care about all writeable mappings, including foreign mappings. -+ */ -+ if ( !boot_cpu_has(X86_FEATURE_XEN_SELFSNOOP) && -+ (l1f & (PAGE_CACHE_ATTRS | _PAGE_RW)) == (_PAGE_WC | _PAGE_RW) ) -+ set_bit(_PGT_non_coherent, &page->u.inuse.type_info); -+ - return 0; - - could_not_pin: -@@ -2454,6 +2463,19 @@ static int cleanup_page_mappings(struct page_info *page) - } - } - -+ /* -+ * Flush the cache if there were previously non-coherent writeable -+ * mappings of this page. This forces the page to be coherent before it -+ * is freed back to the heap. -+ */ -+ if ( __test_and_clear_bit(_PGT_non_coherent, &page->u.inuse.type_info) ) -+ { -+ void *addr = __map_domain_page(page); -+ -+ cache_flush(addr, PAGE_SIZE); -+ unmap_domain_page(addr); -+ } -+ - return rc; - } - -@@ -3028,6 +3050,22 @@ static int _get_page_type(struct page_info *page, unsigned long type, - if ( unlikely(!(nx & PGT_validated)) ) - { - /* -+ * Flush the cache if there were previously non-coherent mappings of -+ * this page, and we're trying to use it as anything other than a -+ * writeable page. This forces the page to be coherent before we -+ * validate its contents for safety. -+ */ -+ if ( (nx & PGT_non_coherent) && type != PGT_writable_page ) -+ { -+ void *addr = __map_domain_page(page); -+ -+ cache_flush(addr, PAGE_SIZE); -+ unmap_domain_page(addr); -+ -+ page->u.inuse.type_info &= ~PGT_non_coherent; -+ } -+ -+ /* - * No special validation needed for writable or shared pages. Page - * tables and GDT/LDT need to have their contents audited. - * -diff --git a/xen/arch/x86/pv/grant_table.c b/xen/arch/x86/pv/grant_table.c -index 0325618c9883..81c72e61ed55 100644 ---- a/xen/arch/x86/pv/grant_table.c -+++ b/xen/arch/x86/pv/grant_table.c -@@ -109,7 +109,17 @@ int create_grant_pv_mapping(uint64_t addr, mfn_t frame, - - ol1e = *pl1e; - if ( UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, 0) ) -+ { -+ /* -+ * We always create mappings in this path. However, our caller, -+ * map_grant_ref(), only passes potentially non-zero cache_flags for -+ * MMIO frames, so this path doesn't create non-coherent mappings of -+ * RAM frames and there's no need to calculate PGT_non_coherent. 
-+ */ -+ ASSERT(!cache_flags || is_iomem_page(frame)); -+ - rc = GNTST_okay; -+ } - - out_unlock: - page_unlock(page); -@@ -294,7 +304,18 @@ int replace_grant_pv_mapping(uint64_t addr, mfn_t frame, - l1e_get_flags(ol1e), addr, grant_pte_flags); - - if ( UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, 0) ) -+ { -+ /* -+ * Generally, replace_grant_pv_mapping() is used to destroy mappings -+ * (n1le = l1e_empty()), but it can be a present mapping on the -+ * GNTABOP_unmap_and_replace path. -+ * -+ * In such cases, the PTE is fully transplanted from its old location -+ * via steal_linear_addr(), so we need not perform PGT_non_coherent -+ * checking here. -+ */ - rc = GNTST_okay; -+ } - - out_unlock: - page_unlock(page); -diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h -index 8a9a43bb0a9d..7464167ae192 100644 ---- a/xen/include/asm-x86/mm.h -+++ b/xen/include/asm-x86/mm.h -@@ -53,8 +53,12 @@ - #define _PGT_partial PG_shift(8) - #define PGT_partial PG_mask(1, 8) - -+/* Has this page been mapped writeable with a non-coherent memory type? */ -+#define _PGT_non_coherent PG_shift(9) -+#define PGT_non_coherent PG_mask(1, 9) -+ - /* Count of uses of this frame as its current type. */ --#define PGT_count_width PG_shift(8) -+#define PGT_count_width PG_shift(9) - #define PGT_count_mask ((1UL<<PGT_count_width)-1) - - /* Are the 'type mask' bits identical? */ diff --git a/system/xen/xsa/xsa410-4.16-01.patch b/system/xen/xsa/xsa410-4.16-01.patch new file mode 100644 index 000000000000..8f352ee86d06 --- /dev/null +++ b/system/xen/xsa/xsa410-4.16-01.patch @@ -0,0 +1,59 @@ +From 4b4359122a414cc15156e13e3805988b71ff9da0 Mon Sep 17 00:00:00 2001 +From: Julien Grall <jgrall@amazon.com> +Date: Mon, 6 Jun 2022 06:17:25 +0000 +Subject: [PATCH 1/2] xen/arm: p2m: Prevent adding mapping when domain is dying + +During the domain destroy process, the domain will still be accessible +until it is fully destroyed. So does the P2M because we don't bail +out early if is_dying is non-zero. If a domain has permission to +modify the other domain's P2M (i.e. dom0, or a stubdomain), then +foreign mapping can be added past relinquish_p2m_mapping(). + +Therefore, we need to prevent mapping to be added when the domain +is dying. This commit prevents such adding of mapping by adding the +d->is_dying check to p2m_set_entry(). Also this commit enhances the +check in relinquish_p2m_mapping() to make sure that no mappings can +be added in the P2M after the P2M lock is released. + +This is part of CVE-2022-33746 / XSA-410. + +Signed-off-by: Julien Grall <jgrall@amazon.com> +Signed-off-by: Henry Wang <Henry.Wang@arm.com> +Tested-by: Henry Wang <Henry.Wang@arm.com> +Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> +--- + xen/arch/arm/p2m.c | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c +index fb71fa4c1c90..cbeff90f4371 100644 +--- a/xen/arch/arm/p2m.c ++++ b/xen/arch/arm/p2m.c +@@ -1093,6 +1093,15 @@ int p2m_set_entry(struct p2m_domain *p2m, + { + int rc = 0; + ++ /* ++ * Any reference taken by the P2M mappings (e.g. foreign mapping) will ++ * be dropped in relinquish_p2m_mapping(). As the P2M will still ++ * be accessible after, we need to prevent mapping to be added when the ++ * domain is dying. 
++ */ ++ if ( unlikely(p2m->domain->is_dying) ) ++ return -ENOMEM; ++ + while ( nr ) + { + unsigned long mask; +@@ -1610,6 +1619,8 @@ int relinquish_p2m_mapping(struct domain *d) + unsigned int order; + gfn_t start, end; + ++ BUG_ON(!d->is_dying); ++ /* No mappings can be added in the P2M after the P2M lock is released. */ + p2m_write_lock(p2m); + + start = p2m->lowest_mapped_gfn; +-- +2.37.1 + diff --git a/system/xen/xsa/xsa410-4.16-02.patch b/system/xen/xsa/xsa410-4.16-02.patch new file mode 100644 index 000000000000..7599a475f9a6 --- /dev/null +++ b/system/xen/xsa/xsa410-4.16-02.patch @@ -0,0 +1,165 @@ +From 0d5846490348fa09a0d0915d7c795685a016ce10 Mon Sep 17 00:00:00 2001 +From: Julien Grall <jgrall@amazon.com> +Date: Mon, 6 Jun 2022 06:17:26 +0000 +Subject: [PATCH 2/2] xen/arm: p2m: Handle preemption when freeing intermediate + page tables + +At the moment the P2M page tables will be freed when the domain structure +is freed without any preemption. As the P2M is quite large, iterating +through this may take more time than it is reasonable without intermediate +preemption (to run softirqs and perhaps scheduler). + +Split p2m_teardown() in two parts: one preemptible and called when +relinquishing the resources, the other one non-preemptible and called +when freeing the domain structure. + +As we are now freeing the P2M pages early, we also need to prevent +further allocation if someone call p2m_set_entry() past p2m_teardown() +(I wasn't able to prove this will never happen). This is done by +the checking domain->is_dying from previous patch in p2m_set_entry(). + +Similarly, we want to make sure that no-one can accessed the free +pages. Therefore the root is cleared before freeing pages. + +This is part of CVE-2022-33746 / XSA-410. + +Signed-off-by: Julien Grall <jgrall@amazon.com> +Signed-off-by: Henry Wang <Henry.Wang@arm.com> +Tested-by: Henry Wang <Henry.Wang@arm.com> +Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> +--- + xen/arch/arm/domain.c | 10 +++++++-- + xen/arch/arm/p2m.c | 47 ++++++++++++++++++++++++++++++++++++--- + xen/include/asm-arm/p2m.h | 13 +++++++++-- + 3 files changed, 63 insertions(+), 7 deletions(-) + +diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c +index 96e1b235501d..2694c39127c5 100644 +--- a/xen/arch/arm/domain.c ++++ b/xen/arch/arm/domain.c +@@ -789,10 +789,10 @@ fail: + void arch_domain_destroy(struct domain *d) + { + /* IOMMU page table is shared with P2M, always call +- * iommu_domain_destroy() before p2m_teardown(). ++ * iommu_domain_destroy() before p2m_final_teardown(). 
+ */ + iommu_domain_destroy(d); +- p2m_teardown(d); ++ p2m_final_teardown(d); + domain_vgic_free(d); + domain_vuart_free(d); + free_xenheap_page(d->shared_info); +@@ -996,6 +996,7 @@ enum { + PROG_xen, + PROG_page, + PROG_mapping, ++ PROG_p2m, + PROG_done, + }; + +@@ -1056,6 +1057,11 @@ int domain_relinquish_resources(struct domain *d) + if ( ret ) + return ret; + ++ PROGRESS(p2m): ++ ret = p2m_teardown(d); ++ if ( ret ) ++ return ret; ++ + PROGRESS(done): + break; + +diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c +index cbeff90f4371..3bcd1e897e88 100644 +--- a/xen/arch/arm/p2m.c ++++ b/xen/arch/arm/p2m.c +@@ -1527,17 +1527,58 @@ static void p2m_free_vmid(struct domain *d) + spin_unlock(&vmid_alloc_lock); + } + +-void p2m_teardown(struct domain *d) ++int p2m_teardown(struct domain *d) + { + struct p2m_domain *p2m = p2m_get_hostp2m(d); ++ unsigned long count = 0; + struct page_info *pg; ++ unsigned int i; ++ int rc = 0; ++ ++ p2m_write_lock(p2m); ++ ++ /* ++ * We are about to free the intermediate page-tables, so clear the ++ * root to prevent any walk to use them. ++ */ ++ for ( i = 0; i < P2M_ROOT_PAGES; i++ ) ++ clear_and_clean_page(p2m->root + i); ++ ++ /* ++ * The domain will not be scheduled anymore, so in theory we should ++ * not need to flush the TLBs. Do it for safety purpose. ++ * ++ * Note that all the devices have already been de-assigned. So we don't ++ * need to flush the IOMMU TLB here. ++ */ ++ p2m_force_tlb_flush_sync(p2m); ++ ++ while ( (pg = page_list_remove_head(&p2m->pages)) ) ++ { ++ free_domheap_page(pg); ++ count++; ++ /* Arbitrarily preempt every 512 iterations */ ++ if ( !(count % 512) && hypercall_preempt_check() ) ++ { ++ rc = -ERESTART; ++ break; ++ } ++ } ++ ++ p2m_write_unlock(p2m); ++ ++ return rc; ++} ++ ++void p2m_final_teardown(struct domain *d) ++{ ++ struct p2m_domain *p2m = p2m_get_hostp2m(d); + + /* p2m not actually initialized */ + if ( !p2m->domain ) + return; + +- while ( (pg = page_list_remove_head(&p2m->pages)) ) +- free_domheap_page(pg); ++ ASSERT(page_list_empty(&p2m->pages)); + + if ( p2m->root ) + free_domheap_pages(p2m->root, P2M_ROOT_ORDER); +diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h +index 8f11d9c97b5d..b3ba83283e11 100644 +--- a/xen/include/asm-arm/p2m.h ++++ b/xen/include/asm-arm/p2m.h +@@ -192,8 +192,17 @@ void setup_virt_paging(void); + /* Init the datastructures for later use by the p2m code */ + int p2m_init(struct domain *d); + +-/* Return all the p2m resources to Xen. */ +-void p2m_teardown(struct domain *d); ++/* ++ * The P2M resources are freed in two parts: ++ * - p2m_teardown() will be called when relinquish the resources. It ++ * will free large resources (e.g. intermediate page-tables) that ++ * requires preemption. ++ * - p2m_final_teardown() will be called when domain struct is been ++ * freed. This *cannot* be preempted and therefore one small ++ * resources should be freed here. ++ */ ++int p2m_teardown(struct domain *d); ++void p2m_final_teardown(struct domain *d); + + /* + * Remove mapping refcount on each mapping page in the p2m +-- +2.37.1 + diff --git a/system/xen/xsa/xsa410-4.16-03.patch b/system/xen/xsa/xsa410-4.16-03.patch new file mode 100644 index 000000000000..7411d900ea9e --- /dev/null +++ b/system/xen/xsa/xsa410-4.16-03.patch @@ -0,0 +1,113 @@ +From: Roger Pau Monné <roger.pau@citrix.com> +Subject: x86/p2m: add option to skip root pagetable removal in p2m_teardown() + +Add a new parameter to p2m_teardown() in order to select whether the +root page table should also be freed. 
Note that all users are +adjusted to pass the parameter to remove the root page tables, so +behavior is not modified. + +No functional change intended. + +This is part of CVE-2022-33746 / XSA-410. + +Suggested-by: Julien Grall <julien@xen.org> +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Tim Deegan <tim@xen.org> + +--- a/xen/include/asm-x86/p2m.h ++++ b/xen/include/asm-x86/p2m.h +@@ -574,7 +574,7 @@ int p2m_init(struct domain *d); + int p2m_alloc_table(struct p2m_domain *p2m); + + /* Return all the p2m resources to Xen. */ +-void p2m_teardown(struct p2m_domain *p2m); ++void p2m_teardown(struct p2m_domain *p2m, bool remove_root); + void p2m_final_teardown(struct domain *d); + + /* Add a page to a domain's p2m table */ +--- a/xen/arch/x86/mm/hap/hap.c ++++ b/xen/arch/x86/mm/hap/hap.c +@@ -541,18 +541,18 @@ void hap_final_teardown(struct domain *d + } + + for ( i = 0; i < MAX_ALTP2M; i++ ) +- p2m_teardown(d->arch.altp2m_p2m[i]); ++ p2m_teardown(d->arch.altp2m_p2m[i], true); + } + + /* Destroy nestedp2m's first */ + for (i = 0; i < MAX_NESTEDP2M; i++) { +- p2m_teardown(d->arch.nested_p2m[i]); ++ p2m_teardown(d->arch.nested_p2m[i], true); + } + + if ( d->arch.paging.hap.total_pages != 0 ) + hap_teardown(d, NULL); + +- p2m_teardown(p2m_get_hostp2m(d)); ++ p2m_teardown(p2m_get_hostp2m(d), true); + /* Free any memory that the p2m teardown released */ + paging_lock(d); + hap_set_allocation(d, 0, NULL); +--- a/xen/arch/x86/mm/p2m.c ++++ b/xen/arch/x86/mm/p2m.c +@@ -749,11 +749,11 @@ int p2m_alloc_table(struct p2m_domain *p + * hvm fixme: when adding support for pvh non-hardware domains, this path must + * cleanup any foreign p2m types (release refcnts on them). + */ +-void p2m_teardown(struct p2m_domain *p2m) ++void p2m_teardown(struct p2m_domain *p2m, bool remove_root) + /* Return all the p2m pages to Xen. + * We know we don't have any extra mappings to these pages */ + { +- struct page_info *pg; ++ struct page_info *pg, *root_pg = NULL; + struct domain *d; + + if (p2m == NULL) +@@ -763,10 +763,22 @@ void p2m_teardown(struct p2m_domain *p2m + + p2m_lock(p2m); + ASSERT(atomic_read(&d->shr_pages) == 0); +- p2m->phys_table = pagetable_null(); ++ ++ if ( remove_root ) ++ p2m->phys_table = pagetable_null(); ++ else if ( !pagetable_is_null(p2m->phys_table) ) ++ { ++ root_pg = pagetable_get_page(p2m->phys_table); ++ clear_domain_page(pagetable_get_mfn(p2m->phys_table)); ++ } + + while ( (pg = page_list_remove_head(&p2m->pages)) ) +- d->arch.paging.free_page(d, pg); ++ if ( pg != root_pg ) ++ d->arch.paging.free_page(d, pg); ++ ++ if ( root_pg ) ++ page_list_add(root_pg, &p2m->pages); ++ + p2m_unlock(p2m); + } + +--- a/xen/arch/x86/mm/shadow/common.c ++++ b/xen/arch/x86/mm/shadow/common.c +@@ -2701,7 +2701,7 @@ int shadow_enable(struct domain *d, u32 + paging_unlock(d); + out_unlocked: + if ( rv != 0 && !pagetable_is_null(p2m_get_pagetable(p2m)) ) +- p2m_teardown(p2m); ++ p2m_teardown(p2m, true); + if ( rv != 0 && pg != NULL ) + { + pg->count_info &= ~PGC_count_mask; +@@ -2866,7 +2866,7 @@ void shadow_final_teardown(struct domain + shadow_teardown(d, NULL); + + /* It is now safe to pull down the p2m map. 
*/ +- p2m_teardown(p2m_get_hostp2m(d)); ++ p2m_teardown(p2m_get_hostp2m(d), true); + /* Free any shadow memory that the p2m teardown released */ + paging_lock(d); + shadow_set_allocation(d, 0, NULL); diff --git a/system/xen/xsa/xsa410-4.16-04.patch b/system/xen/xsa/xsa410-4.16-04.patch new file mode 100644 index 000000000000..2170b8e1a327 --- /dev/null +++ b/system/xen/xsa/xsa410-4.16-04.patch @@ -0,0 +1,62 @@ +From: Jan Beulich <jbeulich@suse.com> +Subject: x86/HAP: adjust monitor table related error handling + +hap_make_monitor_table() will return INVALID_MFN if it encounters an +error condition, but hap_update_paging_modes() wasn’t handling this +value, resulting in an inappropriate value being stored in +monitor_table. This would subsequently misguide at least +hap_vcpu_teardown(). Avoid this by bailing early. + +Further, when a domain has/was already crashed or (perhaps less +important as there's no such path known to lead here) is already dying, +avoid calling domain_crash() on it again - that's at best confusing. + +This is part of CVE-2022-33746 / XSA-410. + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> + +--- a/xen/arch/x86/mm/hap/hap.c ++++ b/xen/arch/x86/mm/hap/hap.c +@@ -39,6 +39,7 @@ + #include <asm/domain.h> + #include <xen/numa.h> + #include <asm/hvm/nestedhvm.h> ++#include <public/sched.h> + + #include "private.h" + +@@ -405,8 +406,13 @@ static mfn_t hap_make_monitor_table(stru + return m4mfn; + + oom: +- printk(XENLOG_G_ERR "out of memory building monitor pagetable\n"); +- domain_crash(d); ++ if ( !d->is_dying && ++ (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) ) ++ { ++ printk(XENLOG_G_ERR "%pd: out of memory building monitor pagetable\n", ++ d); ++ domain_crash(d); ++ } + return INVALID_MFN; + } + +@@ -766,6 +772,9 @@ static void hap_update_paging_modes(stru + if ( pagetable_is_null(v->arch.hvm.monitor_table) ) + { + mfn_t mmfn = hap_make_monitor_table(v); ++ ++ if ( mfn_eq(mmfn, INVALID_MFN) ) ++ goto unlock; + v->arch.hvm.monitor_table = pagetable_from_mfn(mmfn); + make_cr3(v, mmfn); + hvm_update_host_cr3(v); +@@ -774,6 +783,7 @@ static void hap_update_paging_modes(stru + /* CR3 is effectively updated by a mode change. Flush ASIDs, etc. */ + hap_update_cr3(v, 0, false); + ++ unlock: + paging_unlock(d); + put_gfn(d, cr3_gfn); + } diff --git a/system/xen/xsa/xsa410-4.16-05.patch b/system/xen/xsa/xsa410-4.16-05.patch new file mode 100644 index 000000000000..dc626c7f542f --- /dev/null +++ b/system/xen/xsa/xsa410-4.16-05.patch @@ -0,0 +1,60 @@ +From: Jan Beulich <jbeulich@suse.com> +Subject: x86/shadow: tolerate failure of sh_set_toplevel_shadow() + +Subsequently sh_set_toplevel_shadow() will be adjusted to install a +blank entry in case prealloc fails. There are, in fact, pre-existing +error paths which would put in place a blank entry. The 4- and 2-level +code in sh_update_cr3(), however, assume the top level entry to be +valid. + +Hence bail from the function in the unlikely event that it's not. Note +that 3-level logic works differently: In particular a guest is free to +supply a PDPTR pointing at 4 non-present (or otherwise deemed invalid) +entries. The guest will crash, but we already cope with that. + +Really mfn_valid() is likely wrong to use in sh_set_toplevel_shadow(), +and it should instead be !mfn_eq(gmfn, INVALID_MFN). Avoid such a change +in security context, but add a respective assertion. + +This is part of CVE-2022-33746 / XSA-410. 
+ +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Tim Deegan <tim@xen.org> +Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> + +--- a/xen/arch/x86/mm/shadow/common.c ++++ b/xen/arch/x86/mm/shadow/common.c +@@ -2516,6 +2516,7 @@ void sh_set_toplevel_shadow(struct vcpu + /* Now figure out the new contents: is this a valid guest MFN? */ + if ( !mfn_valid(gmfn) ) + { ++ ASSERT(mfn_eq(gmfn, INVALID_MFN)); + new_entry = pagetable_null(); + goto install_new_entry; + } +--- a/xen/arch/x86/mm/shadow/multi.c ++++ b/xen/arch/x86/mm/shadow/multi.c +@@ -3312,6 +3312,11 @@ sh_update_cr3(struct vcpu *v, int do_loc + if ( sh_remove_write_access(d, gmfn, 4, 0) != 0 ) + guest_flush_tlb_mask(d, d->dirty_cpumask); + sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow, sh_make_shadow); ++ if ( unlikely(pagetable_is_null(v->arch.paging.shadow.shadow_table[0])) ) ++ { ++ ASSERT(d->is_dying || d->is_shutting_down); ++ return; ++ } + if ( !shadow_mode_external(d) && !is_pv_32bit_domain(d) ) + { + mfn_t smfn = pagetable_get_mfn(v->arch.paging.shadow.shadow_table[0]); +@@ -3370,6 +3375,11 @@ sh_update_cr3(struct vcpu *v, int do_loc + if ( sh_remove_write_access(d, gmfn, 2, 0) != 0 ) + guest_flush_tlb_mask(d, d->dirty_cpumask); + sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow, sh_make_shadow); ++ if ( unlikely(pagetable_is_null(v->arch.paging.shadow.shadow_table[0])) ) ++ { ++ ASSERT(d->is_dying || d->is_shutting_down); ++ return; ++ } + #else + #error This should never happen + #endif diff --git a/system/xen/xsa/xsa410-4.16-06.patch b/system/xen/xsa/xsa410-4.16-06.patch new file mode 100644 index 000000000000..8c89eb32c7a4 --- /dev/null +++ b/system/xen/xsa/xsa410-4.16-06.patch @@ -0,0 +1,255 @@ +From: Roger Pau Monné <roger.pau@citrix.com> +Subject: x86/shadow: tolerate failure in shadow_prealloc() + +Prevent _shadow_prealloc() from calling BUG() when unable to fulfill +the pre-allocation and instead return true/false. Modify +shadow_prealloc() to crash the domain on allocation failure (if the +domain is not already dying), as shadow cannot operate normally after +that. Modify callers to also gracefully handle {_,}shadow_prealloc() +failing to fulfill the request. + +Note this in turn requires adjusting the callers of +sh_make_monitor_table() also to handle it returning INVALID_MFN. +sh_update_paging_modes() is also modified to add additional error +paths in case of allocation failure, some of those will return with +null monitor page tables (and the domain likely crashed). This is no +different that current error paths, but the newly introduced ones are +more likely to trigger. + +The now added failure points in sh_update_paging_modes() also require +that on some error return paths the previous structures are cleared, +and thus monitor table is null. + +While there adjust the 'type' parameter type of shadow_prealloc() to +unsigned int rather than u32. + +This is part of CVE-2022-33746 / XSA-410. + +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Tim Deegan <tim@xen.org> + +--- a/xen/arch/x86/mm/shadow/common.c ++++ b/xen/arch/x86/mm/shadow/common.c +@@ -36,6 +36,7 @@ + #include <asm/flushtlb.h> + #include <asm/shadow.h> + #include <xen/numa.h> ++#include <public/sched.h> + #include "private.h" + + DEFINE_PER_CPU(uint32_t,trace_shadow_path_flags); +@@ -928,14 +929,15 @@ static inline void trace_shadow_prealloc + + /* Make sure there are at least count order-sized pages + * available in the shadow page pool. 
*/ +-static void _shadow_prealloc(struct domain *d, unsigned int pages) ++static bool __must_check _shadow_prealloc(struct domain *d, unsigned int pages) + { + struct vcpu *v; + struct page_info *sp, *t; + mfn_t smfn; + int i; + +- if ( d->arch.paging.shadow.free_pages >= pages ) return; ++ if ( d->arch.paging.shadow.free_pages >= pages ) ++ return true; + + /* Shouldn't have enabled shadows if we've no vcpus. */ + ASSERT(d->vcpu && d->vcpu[0]); +@@ -951,7 +953,8 @@ static void _shadow_prealloc(struct doma + sh_unpin(d, smfn); + + /* See if that freed up enough space */ +- if ( d->arch.paging.shadow.free_pages >= pages ) return; ++ if ( d->arch.paging.shadow.free_pages >= pages ) ++ return true; + } + + /* Stage two: all shadow pages are in use in hierarchies that are +@@ -974,7 +977,7 @@ static void _shadow_prealloc(struct doma + if ( d->arch.paging.shadow.free_pages >= pages ) + { + guest_flush_tlb_mask(d, d->dirty_cpumask); +- return; ++ return true; + } + } + } +@@ -987,7 +990,12 @@ static void _shadow_prealloc(struct doma + d->arch.paging.shadow.total_pages, + d->arch.paging.shadow.free_pages, + d->arch.paging.shadow.p2m_pages); +- BUG(); ++ ++ ASSERT(d->is_dying); ++ ++ guest_flush_tlb_mask(d, d->dirty_cpumask); ++ ++ return false; + } + + /* Make sure there are at least count pages of the order according to +@@ -995,9 +1003,19 @@ static void _shadow_prealloc(struct doma + * This must be called before any calls to shadow_alloc(). Since this + * will free existing shadows to make room, it must be called early enough + * to avoid freeing shadows that the caller is currently working on. */ +-void shadow_prealloc(struct domain *d, u32 type, unsigned int count) ++bool shadow_prealloc(struct domain *d, unsigned int type, unsigned int count) + { +- return _shadow_prealloc(d, shadow_size(type) * count); ++ bool ret = _shadow_prealloc(d, shadow_size(type) * count); ++ ++ if ( !ret && !d->is_dying && ++ (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) ) ++ /* ++ * Failing to allocate memory required for shadow usage can only result in ++ * a domain crash, do it here rather that relying on every caller to do it. ++ */ ++ domain_crash(d); ++ ++ return ret; + } + + /* Deliberately free all the memory we can: this will tear down all of +@@ -1218,7 +1236,7 @@ void shadow_free(struct domain *d, mfn_t + static struct page_info * + shadow_alloc_p2m_page(struct domain *d) + { +- struct page_info *pg; ++ struct page_info *pg = NULL; + + /* This is called both from the p2m code (which never holds the + * paging lock) and the log-dirty code (which always does). 
*/ +@@ -1236,16 +1254,18 @@ shadow_alloc_p2m_page(struct domain *d) + d->arch.paging.shadow.p2m_pages, + shadow_min_acceptable_pages(d)); + } +- paging_unlock(d); +- return NULL; ++ goto out; + } + +- shadow_prealloc(d, SH_type_p2m_table, 1); ++ if ( !shadow_prealloc(d, SH_type_p2m_table, 1) ) ++ goto out; ++ + pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0)); + d->arch.paging.shadow.p2m_pages++; + d->arch.paging.shadow.total_pages--; + ASSERT(!page_get_owner(pg) && !(pg->count_info & PGC_count_mask)); + ++ out: + paging_unlock(d); + + return pg; +@@ -1336,7 +1356,9 @@ int shadow_set_allocation(struct domain + else if ( d->arch.paging.shadow.total_pages > pages ) + { + /* Need to return memory to domheap */ +- _shadow_prealloc(d, 1); ++ if ( !_shadow_prealloc(d, 1) ) ++ return -ENOMEM; ++ + sp = page_list_remove_head(&d->arch.paging.shadow.freelist); + ASSERT(sp); + /* +@@ -2334,12 +2356,13 @@ static void sh_update_paging_modes(struc + if ( mfn_eq(v->arch.paging.shadow.oos_snapshot[0], INVALID_MFN) ) + { + int i; ++ ++ if ( !shadow_prealloc(d, SH_type_oos_snapshot, SHADOW_OOS_PAGES) ) ++ return; ++ + for(i = 0; i < SHADOW_OOS_PAGES; i++) +- { +- shadow_prealloc(d, SH_type_oos_snapshot, 1); + v->arch.paging.shadow.oos_snapshot[i] = + shadow_alloc(d, SH_type_oos_snapshot, 0); +- } + } + #endif /* OOS */ + +@@ -2403,6 +2426,9 @@ static void sh_update_paging_modes(struc + mfn_t mmfn = sh_make_monitor_table( + v, v->arch.paging.mode->shadow.shadow_levels); + ++ if ( mfn_eq(mmfn, INVALID_MFN) ) ++ return; ++ + v->arch.hvm.monitor_table = pagetable_from_mfn(mmfn); + make_cr3(v, mmfn); + hvm_update_host_cr3(v); +@@ -2441,6 +2467,12 @@ static void sh_update_paging_modes(struc + v->arch.hvm.monitor_table = pagetable_null(); + new_mfn = sh_make_monitor_table( + v, v->arch.paging.mode->shadow.shadow_levels); ++ if ( mfn_eq(new_mfn, INVALID_MFN) ) ++ { ++ sh_destroy_monitor_table(v, old_mfn, ++ old_mode->shadow.shadow_levels); ++ return; ++ } + v->arch.hvm.monitor_table = pagetable_from_mfn(new_mfn); + SHADOW_PRINTK("new monitor table %"PRI_mfn "\n", + mfn_x(new_mfn)); +@@ -2526,7 +2558,12 @@ void sh_set_toplevel_shadow(struct vcpu + if ( !mfn_valid(smfn) ) + { + /* Make sure there's enough free shadow memory. */ +- shadow_prealloc(d, root_type, 1); ++ if ( !shadow_prealloc(d, root_type, 1) ) ++ { ++ new_entry = pagetable_null(); ++ goto install_new_entry; ++ } ++ + /* Shadow the page. */ + smfn = make_shadow(v, gmfn, root_type); + } +--- a/xen/arch/x86/mm/shadow/hvm.c ++++ b/xen/arch/x86/mm/shadow/hvm.c +@@ -700,7 +700,9 @@ mfn_t sh_make_monitor_table(const struct + ASSERT(!pagetable_get_pfn(v->arch.hvm.monitor_table)); + + /* Guarantee we can get the memory we need */ +- shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS); ++ if ( !shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS) ) ++ return INVALID_MFN; ++ + m4mfn = shadow_alloc(d, SH_type_monitor_table, 0); + mfn_to_page(m4mfn)->shadow_flags = 4; + +--- a/xen/arch/x86/mm/shadow/multi.c ++++ b/xen/arch/x86/mm/shadow/multi.c +@@ -2440,9 +2440,14 @@ static int sh_page_fault(struct vcpu *v, + * Preallocate shadow pages *before* removing writable accesses + * otherwhise an OOS L1 might be demoted and promoted again with + * writable mappings. */ +- shadow_prealloc(d, +- SH_type_l1_shadow, +- GUEST_PAGING_LEVELS < 4 ? 1 : GUEST_PAGING_LEVELS - 1); ++ if ( !shadow_prealloc(d, SH_type_l1_shadow, ++ GUEST_PAGING_LEVELS < 4 ++ ? 
1 : GUEST_PAGING_LEVELS - 1) ) ++ { ++ paging_unlock(d); ++ put_gfn(d, gfn_x(gfn)); ++ return 0; ++ } + + rc = gw_remove_write_accesses(v, va, &gw); + +--- a/xen/arch/x86/mm/shadow/private.h ++++ b/xen/arch/x86/mm/shadow/private.h +@@ -383,7 +383,8 @@ void shadow_promote(struct domain *d, mf + void shadow_demote(struct domain *d, mfn_t gmfn, u32 type); + + /* Shadow page allocation functions */ +-void shadow_prealloc(struct domain *d, u32 shadow_type, unsigned int count); ++bool __must_check shadow_prealloc(struct domain *d, unsigned int shadow_type, ++ unsigned int count); + mfn_t shadow_alloc(struct domain *d, + u32 shadow_type, + unsigned long backpointer); diff --git a/system/xen/xsa/xsa410-4.16-07.patch b/system/xen/xsa/xsa410-4.16-07.patch new file mode 100644 index 000000000000..9eea91c42df7 --- /dev/null +++ b/system/xen/xsa/xsa410-4.16-07.patch @@ -0,0 +1,82 @@ +From: Roger Pau Monné <roger.pau@citrix.com> +Subject: x86/p2m: refuse new allocations for dying domains + +This will in particular prevent any attempts to add entries to the p2m, +once - in a subsequent change - non-root entries have been removed. + +This is part of CVE-2022-33746 / XSA-410. + +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Tim Deegan <tim@xen.org> + +--- a/xen/arch/x86/mm/hap/hap.c ++++ b/xen/arch/x86/mm/hap/hap.c +@@ -245,6 +245,9 @@ static struct page_info *hap_alloc(struc + + ASSERT(paging_locked_by_me(d)); + ++ if ( unlikely(d->is_dying) ) ++ return NULL; ++ + pg = page_list_remove_head(&d->arch.paging.hap.freelist); + if ( unlikely(!pg) ) + return NULL; +@@ -281,7 +284,7 @@ static struct page_info *hap_alloc_p2m_p + d->arch.paging.hap.p2m_pages++; + ASSERT(!page_get_owner(pg) && !(pg->count_info & PGC_count_mask)); + } +- else if ( !d->arch.paging.p2m_alloc_failed ) ++ else if ( !d->arch.paging.p2m_alloc_failed && !d->is_dying ) + { + d->arch.paging.p2m_alloc_failed = 1; + dprintk(XENLOG_ERR, "d%i failed to allocate from HAP pool\n", +--- a/xen/arch/x86/mm/shadow/common.c ++++ b/xen/arch/x86/mm/shadow/common.c +@@ -939,6 +939,10 @@ static bool __must_check _shadow_preallo + if ( d->arch.paging.shadow.free_pages >= pages ) + return true; + ++ if ( unlikely(d->is_dying) ) ++ /* No reclaim when the domain is dying, teardown will take care of it. */ ++ return false; ++ + /* Shouldn't have enabled shadows if we've no vcpus. */ + ASSERT(d->vcpu && d->vcpu[0]); + +@@ -991,7 +995,7 @@ static bool __must_check _shadow_preallo + d->arch.paging.shadow.free_pages, + d->arch.paging.shadow.p2m_pages); + +- ASSERT(d->is_dying); ++ ASSERT_UNREACHABLE(); + + guest_flush_tlb_mask(d, d->dirty_cpumask); + +@@ -1005,10 +1009,13 @@ static bool __must_check _shadow_preallo + * to avoid freeing shadows that the caller is currently working on. */ + bool shadow_prealloc(struct domain *d, unsigned int type, unsigned int count) + { +- bool ret = _shadow_prealloc(d, shadow_size(type) * count); ++ bool ret; ++ ++ if ( unlikely(d->is_dying) ) ++ return false; + +- if ( !ret && !d->is_dying && +- (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) ) ++ ret = _shadow_prealloc(d, shadow_size(type) * count); ++ if ( !ret && (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) ) + /* + * Failing to allocate memory required for shadow usage can only result in + * a domain crash, do it here rather that relying on every caller to do it. 
+@@ -1238,6 +1245,9 @@ shadow_alloc_p2m_page(struct domain *d) + { + struct page_info *pg = NULL; + ++ if ( unlikely(d->is_dying) ) ++ return NULL; ++ + /* This is called both from the p2m code (which never holds the + * paging lock) and the log-dirty code (which always does). */ + paging_lock_recursive(d); diff --git a/system/xen/xsa/xsa410-4.16-08.patch b/system/xen/xsa/xsa410-4.16-08.patch new file mode 100644 index 000000000000..92cdb4942b6a --- /dev/null +++ b/system/xen/xsa/xsa410-4.16-08.patch @@ -0,0 +1,96 @@ +From: Roger Pau Monné <roger.pau@citrix.com> +Subject: x86/p2m: truly free paging pool memory for dying domains + +Modify {hap,shadow}_free to free the page immediately if the domain is +dying, so that pages don't accumulate in the pool when +{shadow,hap}_final_teardown() get called. This is to limit the amount of +work which needs to be done there (in a non-preemptable manner). + +Note the call to shadow_free() in shadow_free_p2m_page() is moved after +increasing total_pages, so that the decrease done in shadow_free() in +case the domain is dying doesn't underflow the counter, even if just for +a short interval. + +This is part of CVE-2022-33746 / XSA-410. + +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Tim Deegan <tim@xen.org> + +--- a/xen/arch/x86/mm/hap/hap.c ++++ b/xen/arch/x86/mm/hap/hap.c +@@ -265,6 +265,18 @@ static void hap_free(struct domain *d, m + + ASSERT(paging_locked_by_me(d)); + ++ /* ++ * For dying domains, actually free the memory here. This way less work is ++ * left to hap_final_teardown(), which cannot easily have preemption checks ++ * added. ++ */ ++ if ( unlikely(d->is_dying) ) ++ { ++ free_domheap_page(pg); ++ d->arch.paging.hap.total_pages--; ++ return; ++ } ++ + d->arch.paging.hap.free_pages++; + page_list_add_tail(pg, &d->arch.paging.hap.freelist); + } +--- a/xen/arch/x86/mm/shadow/common.c ++++ b/xen/arch/x86/mm/shadow/common.c +@@ -1187,6 +1187,7 @@ mfn_t shadow_alloc(struct domain *d, + void shadow_free(struct domain *d, mfn_t smfn) + { + struct page_info *next = NULL, *sp = mfn_to_page(smfn); ++ bool dying = ACCESS_ONCE(d->is_dying); + struct page_list_head *pin_list; + unsigned int pages; + u32 shadow_type; +@@ -1229,11 +1230,32 @@ void shadow_free(struct domain *d, mfn_t + * just before the allocator hands the page out again. */ + page_set_tlbflush_timestamp(sp); + perfc_decr(shadow_alloc_count); +- page_list_add_tail(sp, &d->arch.paging.shadow.freelist); ++ ++ /* ++ * For dying domains, actually free the memory here. This way less ++ * work is left to shadow_final_teardown(), which cannot easily have ++ * preemption checks added. ++ */ ++ if ( unlikely(dying) ) ++ { ++ /* ++ * The backpointer field (sh.back) used by shadow code aliases the ++ * domain owner field, unconditionally clear it here to avoid ++ * free_domheap_page() attempting to parse it. ++ */ ++ page_set_owner(sp, NULL); ++ free_domheap_page(sp); ++ } ++ else ++ page_list_add_tail(sp, &d->arch.paging.shadow.freelist); ++ + sp = next; + } + +- d->arch.paging.shadow.free_pages += pages; ++ if ( unlikely(dying) ) ++ d->arch.paging.shadow.total_pages -= pages; ++ else ++ d->arch.paging.shadow.free_pages += pages; + } + + /* Divert a page from the pool to be used by the p2m mapping. +@@ -1303,9 +1325,9 @@ shadow_free_p2m_page(struct domain *d, s + * paging lock) and the log-dirty code (which always does). 
*/ + paging_lock_recursive(d); + +- shadow_free(d, page_to_mfn(pg)); + d->arch.paging.shadow.p2m_pages--; + d->arch.paging.shadow.total_pages++; ++ shadow_free(d, page_to_mfn(pg)); + + paging_unlock(d); + } diff --git a/system/xen/xsa/xsa410-4.16-09.patch b/system/xen/xsa/xsa410-4.16-09.patch new file mode 100644 index 000000000000..60259e1c0991 --- /dev/null +++ b/system/xen/xsa/xsa410-4.16-09.patch @@ -0,0 +1,159 @@ +From: Roger Pau Monné <roger.pau@citrix.com> +Subject: x86/p2m: free the paging memory pool preemptively + +The paging memory pool is currently freed in two different places: +from {shadow,hap}_teardown() via domain_relinquish_resources() and +from {shadow,hap}_final_teardown() via complete_domain_destroy(). +While the former does handle preemption, the later doesn't. + +Attempt to move as much p2m related freeing as possible to happen +before the call to {shadow,hap}_teardown(), so that most memory can be +freed in a preemptive way. In order to avoid causing issues to +existing callers leave the root p2m page tables set and free them in +{hap,shadow}_final_teardown(). Also modify {hap,shadow}_free to free +the page immediately if the domain is dying, so that pages don't +accumulate in the pool when {shadow,hap}_final_teardown() get called. + +Move altp2m_vcpu_disable_ve() to be done in hap_teardown(), as that's +the place where altp2m_active gets disabled now. + +This is part of CVE-2022-33746 / XSA-410. + +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Tim Deegan <tim@xen.org> + +--- a/xen/arch/x86/domain.c ++++ b/xen/arch/x86/domain.c +@@ -38,7 +38,6 @@ + #include <xen/livepatch.h> + #include <public/sysctl.h> + #include <public/hvm/hvm_vcpu.h> +-#include <asm/altp2m.h> + #include <asm/regs.h> + #include <asm/mc146818rtc.h> + #include <asm/system.h> +@@ -2381,12 +2380,6 @@ int domain_relinquish_resources(struct d + vpmu_destroy(v); + } + +- if ( altp2m_active(d) ) +- { +- for_each_vcpu ( d, v ) +- altp2m_vcpu_disable_ve(v); +- } +- + if ( is_pv_domain(d) ) + { + for_each_vcpu ( d, v ) +--- a/xen/arch/x86/mm/hap/hap.c ++++ b/xen/arch/x86/mm/hap/hap.c +@@ -28,6 +28,7 @@ + #include <xen/domain_page.h> + #include <xen/guest_access.h> + #include <xen/keyhandler.h> ++#include <asm/altp2m.h> + #include <asm/event.h> + #include <asm/page.h> + #include <asm/current.h> +@@ -546,24 +547,8 @@ void hap_final_teardown(struct domain *d + unsigned int i; + + if ( hvm_altp2m_supported() ) +- { +- d->arch.altp2m_active = 0; +- +- if ( d->arch.altp2m_eptp ) +- { +- free_xenheap_page(d->arch.altp2m_eptp); +- d->arch.altp2m_eptp = NULL; +- } +- +- if ( d->arch.altp2m_visible_eptp ) +- { +- free_xenheap_page(d->arch.altp2m_visible_eptp); +- d->arch.altp2m_visible_eptp = NULL; +- } +- + for ( i = 0; i < MAX_ALTP2M; i++ ) + p2m_teardown(d->arch.altp2m_p2m[i], true); +- } + + /* Destroy nestedp2m's first */ + for (i = 0; i < MAX_NESTEDP2M; i++) { +@@ -578,6 +563,8 @@ void hap_final_teardown(struct domain *d + paging_lock(d); + hap_set_allocation(d, 0, NULL); + ASSERT(d->arch.paging.hap.p2m_pages == 0); ++ ASSERT(d->arch.paging.hap.free_pages == 0); ++ ASSERT(d->arch.paging.hap.total_pages == 0); + paging_unlock(d); + } + +@@ -603,6 +590,7 @@ void hap_vcpu_teardown(struct vcpu *v) + void hap_teardown(struct domain *d, bool *preempted) + { + struct vcpu *v; ++ unsigned int i; + + ASSERT(d->is_dying); + ASSERT(d != current->domain); +@@ -611,6 +599,28 @@ void hap_teardown(struct domain *d, bool + for_each_vcpu ( d, v ) + 
hap_vcpu_teardown(v); + ++ /* Leave the root pt in case we get further attempts to modify the p2m. */ ++ if ( hvm_altp2m_supported() ) ++ { ++ if ( altp2m_active(d) ) ++ for_each_vcpu ( d, v ) ++ altp2m_vcpu_disable_ve(v); ++ ++ d->arch.altp2m_active = 0; ++ ++ FREE_XENHEAP_PAGE(d->arch.altp2m_eptp); ++ FREE_XENHEAP_PAGE(d->arch.altp2m_visible_eptp); ++ ++ for ( i = 0; i < MAX_ALTP2M; i++ ) ++ p2m_teardown(d->arch.altp2m_p2m[i], false); ++ } ++ ++ /* Destroy nestedp2m's after altp2m. */ ++ for ( i = 0; i < MAX_NESTEDP2M; i++ ) ++ p2m_teardown(d->arch.nested_p2m[i], false); ++ ++ p2m_teardown(p2m_get_hostp2m(d), false); ++ + paging_lock(d); /* Keep various asserts happy */ + + if ( d->arch.paging.hap.total_pages != 0 ) +--- a/xen/arch/x86/mm/shadow/common.c ++++ b/xen/arch/x86/mm/shadow/common.c +@@ -2824,8 +2824,17 @@ void shadow_teardown(struct domain *d, b + for_each_vcpu ( d, v ) + shadow_vcpu_teardown(v); + ++ p2m_teardown(p2m_get_hostp2m(d), false); ++ + paging_lock(d); + ++ /* ++ * Reclaim all shadow memory so that shadow_set_allocation() doesn't find ++ * in-use pages, as _shadow_prealloc() will no longer try to reclaim pages ++ * because the domain is dying. ++ */ ++ shadow_blow_tables(d); ++ + #if (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC)) + /* Free the virtual-TLB array attached to each vcpu */ + for_each_vcpu(d, v) +@@ -2946,6 +2955,9 @@ void shadow_final_teardown(struct domain + d->arch.paging.shadow.total_pages, + d->arch.paging.shadow.free_pages, + d->arch.paging.shadow.p2m_pages); ++ ASSERT(!d->arch.paging.shadow.total_pages); ++ ASSERT(!d->arch.paging.shadow.free_pages); ++ ASSERT(!d->arch.paging.shadow.p2m_pages); + paging_unlock(d); + } + diff --git a/system/xen/xsa/xsa410-4.16-10.patch b/system/xen/xsa/xsa410-4.16-10.patch new file mode 100644 index 000000000000..258b7fff3194 --- /dev/null +++ b/system/xen/xsa/xsa410-4.16-10.patch @@ -0,0 +1,171 @@ +From: Julien Grall <jgrall@amazon.com> +Subject: xen/x86: p2m: Add preemption in p2m_teardown() + +The list p2m->pages contain all the pages used by the P2M. On large +instance this can be quite large and the time spent to call +d->arch.paging.free_page() will take more than 1ms for a 80GB guest +on a Xen running in nested environment on a c5.metal. + +By extrapolation, it would take > 100ms for a 8TB guest (what we +current security support). So add some preemption in p2m_teardown() +and propagate to the callers. Note there are 3 places where +the preemption is not enabled: + - hap_final_teardown()/shadow_final_teardown(): We are + preventing update the P2M once the domain is dying (so + no more pages could be allocated) and most of the P2M pages + will be freed in preemptive manneer when relinquishing the + resources. So this is fine to disable preemption. + - shadow_enable(): This is fine because it will undo the allocation + that may have been made by p2m_alloc_table() (so only the root + page table). + +The preemption is arbitrarily checked every 1024 iterations. + +Note that with the current approach, Xen doesn't keep track on whether +the alt/nested P2Ms have been cleared. So there are some redundant work. +However, this is not expected to incurr too much overhead (the P2M lock +shouldn't be contended during teardown). So this is optimization is +left outside of the security event. + +This is part of CVE-2022-33746 / XSA-410. 
+ +Signed-off-by: Julien Grall <jgrall@amazon.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> + +--- a/xen/include/asm-x86/p2m.h ++++ b/xen/include/asm-x86/p2m.h +@@ -574,7 +574,7 @@ int p2m_init(struct domain *d); + int p2m_alloc_table(struct p2m_domain *p2m); + + /* Return all the p2m resources to Xen. */ +-void p2m_teardown(struct p2m_domain *p2m, bool remove_root); ++void p2m_teardown(struct p2m_domain *p2m, bool remove_root, bool *preempted); + void p2m_final_teardown(struct domain *d); + + /* Add a page to a domain's p2m table */ +--- a/xen/arch/x86/mm/hap/hap.c ++++ b/xen/arch/x86/mm/hap/hap.c +@@ -548,17 +548,17 @@ void hap_final_teardown(struct domain *d + + if ( hvm_altp2m_supported() ) + for ( i = 0; i < MAX_ALTP2M; i++ ) +- p2m_teardown(d->arch.altp2m_p2m[i], true); ++ p2m_teardown(d->arch.altp2m_p2m[i], true, NULL); + + /* Destroy nestedp2m's first */ + for (i = 0; i < MAX_NESTEDP2M; i++) { +- p2m_teardown(d->arch.nested_p2m[i], true); ++ p2m_teardown(d->arch.nested_p2m[i], true, NULL); + } + + if ( d->arch.paging.hap.total_pages != 0 ) + hap_teardown(d, NULL); + +- p2m_teardown(p2m_get_hostp2m(d), true); ++ p2m_teardown(p2m_get_hostp2m(d), true, NULL); + /* Free any memory that the p2m teardown released */ + paging_lock(d); + hap_set_allocation(d, 0, NULL); +@@ -612,14 +612,24 @@ void hap_teardown(struct domain *d, bool + FREE_XENHEAP_PAGE(d->arch.altp2m_visible_eptp); + + for ( i = 0; i < MAX_ALTP2M; i++ ) +- p2m_teardown(d->arch.altp2m_p2m[i], false); ++ { ++ p2m_teardown(d->arch.altp2m_p2m[i], false, preempted); ++ if ( preempted && *preempted ) ++ return; ++ } + } + + /* Destroy nestedp2m's after altp2m. */ + for ( i = 0; i < MAX_NESTEDP2M; i++ ) +- p2m_teardown(d->arch.nested_p2m[i], false); ++ { ++ p2m_teardown(d->arch.nested_p2m[i], false, preempted); ++ if ( preempted && *preempted ) ++ return; ++ } + +- p2m_teardown(p2m_get_hostp2m(d), false); ++ p2m_teardown(p2m_get_hostp2m(d), false, preempted); ++ if ( preempted && *preempted ) ++ return; + + paging_lock(d); /* Keep various asserts happy */ + +--- a/xen/arch/x86/mm/p2m.c ++++ b/xen/arch/x86/mm/p2m.c +@@ -749,12 +749,13 @@ int p2m_alloc_table(struct p2m_domain *p + * hvm fixme: when adding support for pvh non-hardware domains, this path must + * cleanup any foreign p2m types (release refcnts on them). + */ +-void p2m_teardown(struct p2m_domain *p2m, bool remove_root) ++void p2m_teardown(struct p2m_domain *p2m, bool remove_root, bool *preempted) + /* Return all the p2m pages to Xen. + * We know we don't have any extra mappings to these pages */ + { + struct page_info *pg, *root_pg = NULL; + struct domain *d; ++ unsigned int i = 0; + + if (p2m == NULL) + return; +@@ -773,8 +774,19 @@ void p2m_teardown(struct p2m_domain *p2m + } + + while ( (pg = page_list_remove_head(&p2m->pages)) ) +- if ( pg != root_pg ) +- d->arch.paging.free_page(d, pg); ++ { ++ if ( pg == root_pg ) ++ continue; ++ ++ d->arch.paging.free_page(d, pg); ++ ++ /* Arbitrarily check preemption every 1024 iterations */ ++ if ( preempted && !(++i % 1024) && general_preempt_check() ) ++ { ++ *preempted = true; ++ break; ++ } ++ } + + if ( root_pg ) + page_list_add(root_pg, &p2m->pages); +--- a/xen/arch/x86/mm/shadow/common.c ++++ b/xen/arch/x86/mm/shadow/common.c +@@ -2770,8 +2770,12 @@ int shadow_enable(struct domain *d, u32 + out_locked: + paging_unlock(d); + out_unlocked: ++ /* ++ * This is fine to ignore the preemption here because only the root ++ * will be allocated by p2m_alloc_table(). 
++ */ + if ( rv != 0 && !pagetable_is_null(p2m_get_pagetable(p2m)) ) +- p2m_teardown(p2m, true); ++ p2m_teardown(p2m, true, NULL); + if ( rv != 0 && pg != NULL ) + { + pg->count_info &= ~PGC_count_mask; +@@ -2824,7 +2828,9 @@ void shadow_teardown(struct domain *d, b + for_each_vcpu ( d, v ) + shadow_vcpu_teardown(v); + +- p2m_teardown(p2m_get_hostp2m(d), false); ++ p2m_teardown(p2m_get_hostp2m(d), false, preempted); ++ if ( preempted && *preempted ) ++ return; + + paging_lock(d); + +@@ -2945,7 +2951,7 @@ void shadow_final_teardown(struct domain + shadow_teardown(d, NULL); + + /* It is now safe to pull down the p2m map. */ +- p2m_teardown(p2m_get_hostp2m(d), true); ++ p2m_teardown(p2m_get_hostp2m(d), true, NULL); + /* Free any shadow memory that the p2m teardown released */ + paging_lock(d); + shadow_set_allocation(d, 0, NULL); diff --git a/system/xen/xsa/xsa411.patch b/system/xen/xsa/xsa411.patch new file mode 100644 index 000000000000..50dcae4a8184 --- /dev/null +++ b/system/xen/xsa/xsa411.patch @@ -0,0 +1,55 @@ +From: Jan Beulich <jbeulich@suse.com> +Subject: gnttab: correct locking on transitive grant copy error path + +While the comment next to the lock dropping in preparation of +recursively calling acquire_grant_for_copy() mistakenly talks about the +rd == td case (excluded a few lines further up), the same concerns apply +to the calling of release_grant_for_copy() on a subsequent error path. + +This is CVE-2022-33748 / XSA-411. + +Fixes: ad48fb963dbf ("gnttab: fix transitive grant handling") +Signed-off-by: Jan Beulich <jbeulich@suse.com> +--- +v2: Extend code comment. + +--- a/xen/common/grant_table.c ++++ b/xen/common/grant_table.c +@@ -2622,9 +2622,8 @@ acquire_grant_for_copy( + trans_domid); + + /* +- * acquire_grant_for_copy() could take the lock on the +- * remote table (if rd == td), so we have to drop the lock +- * here and reacquire. ++ * acquire_grant_for_copy() will take the lock on the remote table, ++ * so we have to drop the lock here and reacquire. + */ + active_entry_release(act); + grant_read_unlock(rgt); +@@ -2661,11 +2660,25 @@ acquire_grant_for_copy( + act->trans_gref != trans_gref || + !act->is_sub_page)) ) + { ++ /* ++ * Like above for acquire_grant_for_copy() we need to drop and then ++ * re-acquire the locks here to prevent lock order inversion issues. ++ * Unlike for acquire_grant_for_copy() we don't need to re-check ++ * anything, as release_grant_for_copy() doesn't depend on the grant ++ * table entry: It only updates internal state and the status flags. ++ */ ++ active_entry_release(act); ++ grant_read_unlock(rgt); ++ + release_grant_for_copy(td, trans_gref, readonly); + rcu_unlock_domain(td); ++ ++ grant_read_lock(rgt); ++ act = active_entry_acquire(rgt, gref); + reduce_status_for_pin(rd, act, status, readonly); + active_entry_release(act); + grant_read_unlock(rgt); ++ + put_page(*page); + *page = NULL; + return ERESTART; |