Diffstat (limited to 'system')
-rw-r--r-- | system/xen/xen.SlackBuild          |   2
-rw-r--r-- | system/xen/xsa/xsa401-4.16-1.patch | 170
-rw-r--r-- | system/xen/xsa/xsa401-4.16-2.patch | 191
-rw-r--r-- | system/xen/xsa/xsa402-4.16-1.patch |  43
-rw-r--r-- | system/xen/xsa/xsa402-4.16-2.patch | 213
-rw-r--r-- | system/xen/xsa/xsa402-4.16-3.patch | 284
-rw-r--r-- | system/xen/xsa/xsa402-4.16-4.patch |  83
-rw-r--r-- | system/xen/xsa/xsa402-4.16-5.patch | 148
8 files changed, 1133 insertions, 1 deletion
diff --git a/system/xen/xen.SlackBuild b/system/xen/xen.SlackBuild index 1d1d31d344dd..e504dc97bf12 100644 --- a/system/xen/xen.SlackBuild +++ b/system/xen/xen.SlackBuild @@ -26,7 +26,7 @@ cd $(dirname $0) ; CWD=$(pwd) PRGNAM=xen VERSION=${VERSION:-4.16.1} -BUILD=${BUILD:-1} +BUILD=${BUILD:-2} TAG=${TAG:-_SBo} PKGTYPE=${PKGTYPE:-tgz} diff --git a/system/xen/xsa/xsa401-4.16-1.patch b/system/xen/xsa/xsa401-4.16-1.patch new file mode 100644 index 000000000000..5c8c50617a21 --- /dev/null +++ b/system/xen/xsa/xsa401-4.16-1.patch @@ -0,0 +1,170 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86/pv: Clean up _get_page_type() + +Various fixes for clarity, ahead of making complicated changes. + + * Split the overflow check out of the if/else chain for type handling, as + it's somewhat unrelated. + * Comment the main if/else chain to explain what is going on. Adjust one + ASSERT() and state the bit layout for validate-locked and partial states. + * Correct the comment about TLB flushing, as it's backwards. The problem + case is when writeable mappings are retained to a page becoming read-only, + as it allows the guest to bypass Xen's safety checks for updates. + * Reduce the scope of 'y'. It is an artefact of the cmpxchg loop and not + valid for use by subsequent logic. Switch to using ACCESS_ONCE() to treat + all reads as explicitly volatile. The only thing preventing the validated + wait-loop being infinite is the compiler barrier hidden in cpu_relax(). + * Replace one page_get_owner(page) with the already-calculated 'd' already in + scope. + +No functional change. + +This is part of XSA-401 / CVE-2022-26362. + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Signed-off-by: George Dunlap <george.dunlap@eu.citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: George Dunlap <george.dunlap@citrix.com> + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 796faca64103..ddd32f88c798 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -2935,16 +2935,17 @@ static int _put_page_type(struct page_info *page, unsigned int flags, + static int _get_page_type(struct page_info *page, unsigned long type, + bool preemptible) + { +- unsigned long nx, x, y = page->u.inuse.type_info; ++ unsigned long nx, x; + int rc = 0; + + ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2))); + ASSERT(!in_irq()); + +- for ( ; ; ) ++ for ( unsigned long y = ACCESS_ONCE(page->u.inuse.type_info); ; ) + { + x = y; + nx = x + 1; ++ + if ( unlikely((nx & PGT_count_mask) == 0) ) + { + gdprintk(XENLOG_WARNING, +@@ -2952,8 +2953,15 @@ static int _get_page_type(struct page_info *page, unsigned long type, + mfn_x(page_to_mfn(page))); + return -EINVAL; + } +- else if ( unlikely((x & PGT_count_mask) == 0) ) ++ ++ if ( unlikely((x & PGT_count_mask) == 0) ) + { ++ /* ++ * Typeref 0 -> 1. ++ * ++ * Type changes are permitted when the typeref is 0. If the type ++ * actually changes, the page needs re-validating. ++ */ + struct domain *d = page_get_owner(page); + + if ( d && shadow_mode_enabled(d) ) +@@ -2964,8 +2972,8 @@ static int _get_page_type(struct page_info *page, unsigned long type, + { + /* + * On type change we check to flush stale TLB entries. It is +- * vital that no other CPUs are left with mappings of a frame +- * which is about to become writeable to the guest. ++ * vital that no other CPUs are left with writeable mappings ++ * to a frame which is intending to become pgtable/segdesc. 
+ */ + cpumask_t *mask = this_cpu(scratch_cpumask); + +@@ -2977,7 +2985,7 @@ static int _get_page_type(struct page_info *page, unsigned long type, + + if ( unlikely(!cpumask_empty(mask)) && + /* Shadow mode: track only writable pages. */ +- (!shadow_mode_enabled(page_get_owner(page)) || ++ (!shadow_mode_enabled(d) || + ((nx & PGT_type_mask) == PGT_writable_page)) ) + { + perfc_incr(need_flush_tlb_flush); +@@ -3008,7 +3016,14 @@ static int _get_page_type(struct page_info *page, unsigned long type, + } + else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) ) + { +- /* Don't log failure if it could be a recursive-mapping attempt. */ ++ /* ++ * else, we're trying to take a new reference, of the wrong type. ++ * ++ * This (being able to prohibit use of the wrong type) is what the ++ * typeref system exists for, but skip printing the failure if it ++ * looks like a recursive mapping, as subsequent logic might ++ * ultimately permit the attempt. ++ */ + if ( ((x & PGT_type_mask) == PGT_l2_page_table) && + (type == PGT_l1_page_table) ) + return -EINVAL; +@@ -3027,18 +3042,46 @@ static int _get_page_type(struct page_info *page, unsigned long type, + } + else if ( unlikely(!(x & PGT_validated)) ) + { ++ /* ++ * else, the count is non-zero, and we're grabbing the right type; ++ * but the page hasn't been validated yet. ++ * ++ * The page is in one of two states (depending on PGT_partial), ++ * and should have exactly one reference. ++ */ ++ ASSERT((x & (PGT_type_mask | PGT_count_mask)) == (type | 1)); ++ + if ( !(x & PGT_partial) ) + { +- /* Someone else is updating validation of this page. Wait... */ ++ /* ++ * The page has been left in the "validate locked" state ++ * (i.e. PGT_[type] | 1) which means that a concurrent caller ++ * of _get_page_type() is in the middle of validation. ++ * ++ * Spin waiting for the concurrent user to complete (partial ++ * or fully validated), then restart our attempt to acquire a ++ * type reference. ++ */ + do { + if ( preemptible && hypercall_preempt_check() ) + return -EINTR; + cpu_relax(); +- } while ( (y = page->u.inuse.type_info) == x ); ++ } while ( (y = ACCESS_ONCE(page->u.inuse.type_info)) == x ); + continue; + } +- /* Type ref count was left at 1 when PGT_partial got set. */ +- ASSERT((x & PGT_count_mask) == 1); ++ ++ /* ++ * The page has been left in the "partial" state ++ * (i.e., PGT_[type] | PGT_partial | 1). ++ * ++ * Rather than bumping the type count, we need to try to grab the ++ * validation lock; if we succeed, we need to validate the page, ++ * then drop the general ref associated with the PGT_partial bit. ++ * ++ * We grab the validation lock by setting nx to (PGT_[type] | 1) ++ * (i.e., non-zero type count, neither PGT_validated nor ++ * PGT_partial set). ++ */ + nx = x & ~PGT_partial; + } + +@@ -3087,6 +3130,13 @@ static int _get_page_type(struct page_info *page, unsigned long type, + } + + out: ++ /* ++ * Did we drop the PGT_partial bit when acquiring the typeref? If so, ++ * drop the general reference that went along with it. ++ * ++ * N.B. validate_page() may have have re-set PGT_partial, not reflected in ++ * nx, but will have taken an extra ref when doing so. 
++ */ + if ( (x & PGT_partial) && !(nx & PGT_partial) ) + put_page(page); + diff --git a/system/xen/xsa/xsa401-4.16-2.patch b/system/xen/xsa/xsa401-4.16-2.patch new file mode 100644 index 000000000000..be58db59a513 --- /dev/null +++ b/system/xen/xsa/xsa401-4.16-2.patch @@ -0,0 +1,191 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86/pv: Fix ABAC cmpxchg() race in _get_page_type() + +_get_page_type() suffers from a race condition where it incorrectly assumes +that because 'x' was read and a subsequent a cmpxchg() succeeds, the type +cannot have changed in-between. Consider: + +CPU A: + 1. Creates an L2e referencing pg + `-> _get_page_type(pg, PGT_l1_page_table), sees count 0, type PGT_writable_page + 2. Issues flush_tlb_mask() +CPU B: + 3. Creates a writeable mapping of pg + `-> _get_page_type(pg, PGT_writable_page), count increases to 1 + 4. Writes into new mapping, creating a TLB entry for pg + 5. Removes the writeable mapping of pg + `-> _put_page_type(pg), count goes back down to 0 +CPU A: + 7. Issues cmpxchg(), setting count 1, type PGT_l1_page_table + +CPU B now has a writeable mapping to pg, which Xen believes is a pagetable and +suitably protected (i.e. read-only). The TLB flush in step 2 must be deferred +until after the guest is prohibited from creating new writeable mappings, +which is after step 7. + +Defer all safety actions until after the cmpxchg() has successfully taken the +intended typeref, because that is what prevents concurrent users from using +the old type. + +Also remove the early validation for writeable and shared pages. This removes +race conditions where one half of a parallel mapping attempt can return +successfully before: + * The IOMMU pagetables are in sync with the new page type + * Writeable mappings to shared pages have been torn down + +This is part of XSA-401 / CVE-2022-26362. + +Reported-by: Jann Horn <jannh@google.com> +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: George Dunlap <george.dunlap@citrix.com> + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index ddd32f88c798..1693b580b152 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -2962,56 +2962,12 @@ static int _get_page_type(struct page_info *page, unsigned long type, + * Type changes are permitted when the typeref is 0. If the type + * actually changes, the page needs re-validating. + */ +- struct domain *d = page_get_owner(page); +- +- if ( d && shadow_mode_enabled(d) ) +- shadow_prepare_page_type_change(d, page, type); + + ASSERT(!(x & PGT_pae_xen_l2)); + if ( (x & PGT_type_mask) != type ) + { +- /* +- * On type change we check to flush stale TLB entries. It is +- * vital that no other CPUs are left with writeable mappings +- * to a frame which is intending to become pgtable/segdesc. +- */ +- cpumask_t *mask = this_cpu(scratch_cpumask); +- +- BUG_ON(in_irq()); +- cpumask_copy(mask, d->dirty_cpumask); +- +- /* Don't flush if the timestamp is old enough */ +- tlbflush_filter(mask, page->tlbflush_timestamp); +- +- if ( unlikely(!cpumask_empty(mask)) && +- /* Shadow mode: track only writable pages. */ +- (!shadow_mode_enabled(d) || +- ((nx & PGT_type_mask) == PGT_writable_page)) ) +- { +- perfc_incr(need_flush_tlb_flush); +- /* +- * If page was a page table make sure the flush is +- * performed using an IPI in order to avoid changing the +- * type of a page table page under the feet of +- * spurious_page_fault(). 
+- */ +- flush_mask(mask, +- (x & PGT_type_mask) && +- (x & PGT_type_mask) <= PGT_root_page_table +- ? FLUSH_TLB | FLUSH_FORCE_IPI +- : FLUSH_TLB); +- } +- +- /* We lose existing type and validity. */ + nx &= ~(PGT_type_mask | PGT_validated); + nx |= type; +- +- /* +- * No special validation needed for writable pages. +- * Page tables and GDT/LDT need to be scanned for validity. +- */ +- if ( type == PGT_writable_page || type == PGT_shared_page ) +- nx |= PGT_validated; + } + } + else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) ) +@@ -3092,6 +3048,56 @@ static int _get_page_type(struct page_info *page, unsigned long type, + return -EINTR; + } + ++ /* ++ * One typeref has been taken and is now globally visible. ++ * ++ * The page is either in the "validate locked" state (PGT_[type] | 1) or ++ * fully validated (PGT_[type] | PGT_validated | >0). ++ */ ++ ++ if ( unlikely((x & PGT_count_mask) == 0) ) ++ { ++ struct domain *d = page_get_owner(page); ++ ++ if ( d && shadow_mode_enabled(d) ) ++ shadow_prepare_page_type_change(d, page, type); ++ ++ if ( (x & PGT_type_mask) != type ) ++ { ++ /* ++ * On type change we check to flush stale TLB entries. It is ++ * vital that no other CPUs are left with writeable mappings ++ * to a frame which is intending to become pgtable/segdesc. ++ */ ++ cpumask_t *mask = this_cpu(scratch_cpumask); ++ ++ BUG_ON(in_irq()); ++ cpumask_copy(mask, d->dirty_cpumask); ++ ++ /* Don't flush if the timestamp is old enough */ ++ tlbflush_filter(mask, page->tlbflush_timestamp); ++ ++ if ( unlikely(!cpumask_empty(mask)) && ++ /* Shadow mode: track only writable pages. */ ++ (!shadow_mode_enabled(d) || ++ ((nx & PGT_type_mask) == PGT_writable_page)) ) ++ { ++ perfc_incr(need_flush_tlb_flush); ++ /* ++ * If page was a page table make sure the flush is ++ * performed using an IPI in order to avoid changing the ++ * type of a page table page under the feet of ++ * spurious_page_fault(). ++ */ ++ flush_mask(mask, ++ (x & PGT_type_mask) && ++ (x & PGT_type_mask) <= PGT_root_page_table ++ ? FLUSH_TLB | FLUSH_FORCE_IPI ++ : FLUSH_TLB); ++ } ++ } ++ } ++ + if ( unlikely(((x & PGT_type_mask) == PGT_writable_page) != + (type == PGT_writable_page)) ) + { +@@ -3120,13 +3126,25 @@ static int _get_page_type(struct page_info *page, unsigned long type, + + if ( unlikely(!(nx & PGT_validated)) ) + { +- if ( !(x & PGT_partial) ) ++ /* ++ * No special validation needed for writable or shared pages. Page ++ * tables and GDT/LDT need to have their contents audited. ++ * ++ * per validate_page(), non-atomic updates are fine here. ++ */ ++ if ( type == PGT_writable_page || type == PGT_shared_page ) ++ page->u.inuse.type_info |= PGT_validated; ++ else + { +- page->nr_validated_ptes = 0; +- page->partial_flags = 0; +- page->linear_pt_count = 0; ++ if ( !(x & PGT_partial) ) ++ { ++ page->nr_validated_ptes = 0; ++ page->partial_flags = 0; ++ page->linear_pt_count = 0; ++ } ++ ++ rc = validate_page(page, type, preemptible); + } +- rc = validate_page(page, type, preemptible); + } + + out: diff --git a/system/xen/xsa/xsa402-4.16-1.patch b/system/xen/xsa/xsa402-4.16-1.patch new file mode 100644 index 000000000000..b783383fc89f --- /dev/null +++ b/system/xen/xsa/xsa402-4.16-1.patch @@ -0,0 +1,43 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86/page: Introduce _PAGE_* constants for memory types + +... rather than opencoding the PAT/PCD/PWT attributes in __PAGE_HYPERVISOR_* +constants. These are going to be needed by forthcoming logic. + +No functional change. 
+ +This is part of XSA-402. + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> + +diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h +index 1d080cffbe84..2e542050f65a 100644 +--- a/xen/include/asm-x86/page.h ++++ b/xen/include/asm-x86/page.h +@@ -331,6 +331,14 @@ void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t); + + #define PAGE_CACHE_ATTRS (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT) + ++/* Memory types, encoded under Xen's choice of MSR_PAT. */ ++#define _PAGE_WB ( 0) ++#define _PAGE_WT ( _PAGE_PWT) ++#define _PAGE_UCM ( _PAGE_PCD ) ++#define _PAGE_UC ( _PAGE_PCD | _PAGE_PWT) ++#define _PAGE_WC (_PAGE_PAT ) ++#define _PAGE_WP (_PAGE_PAT | _PAGE_PWT) ++ + /* + * Debug option: Ensure that granted mappings are not implicitly unmapped. + * WARNING: This will need to be disabled to run OSes that use the spare PTE +@@ -349,8 +357,8 @@ void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t); + #define __PAGE_HYPERVISOR_RX (_PAGE_PRESENT | _PAGE_ACCESSED) + #define __PAGE_HYPERVISOR (__PAGE_HYPERVISOR_RX | \ + _PAGE_DIRTY | _PAGE_RW) +-#define __PAGE_HYPERVISOR_UCMINUS (__PAGE_HYPERVISOR | _PAGE_PCD) +-#define __PAGE_HYPERVISOR_UC (__PAGE_HYPERVISOR | _PAGE_PCD | _PAGE_PWT) ++#define __PAGE_HYPERVISOR_UCMINUS (__PAGE_HYPERVISOR | _PAGE_UCM) ++#define __PAGE_HYPERVISOR_UC (__PAGE_HYPERVISOR | _PAGE_UC) + #define __PAGE_HYPERVISOR_SHSTK (__PAGE_HYPERVISOR_RO | _PAGE_DIRTY) + + #define MAP_SMALL_PAGES _PAGE_AVAIL0 /* don't use superpages mappings */ diff --git a/system/xen/xsa/xsa402-4.16-2.patch b/system/xen/xsa/xsa402-4.16-2.patch new file mode 100644 index 000000000000..ebb2f5e221e8 --- /dev/null +++ b/system/xen/xsa/xsa402-4.16-2.patch @@ -0,0 +1,213 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86: Don't change the cacheability of the directmap + +Changeset 55f97f49b7ce ("x86: Change cache attributes of Xen 1:1 page mappings +in response to guest mapping requests") attempted to keep the cacheability +consistent between different mappings of the same page. + +The reason wasn't described in the changelog, but it is understood to be in +regards to a concern over machine check exceptions, owing to errata when using +mixed cacheabilities. It did this primarily by updating Xen's mapping of the +page in the direct map when the guest mapped a page with reduced cacheability. + +Unfortunately, the logic didn't actually prevent mixed cacheability from +occurring: + * A guest could map a page normally, and then map the same page with + different cacheability; nothing prevented this. + * The cacheability of the directmap was always latest-takes-precedence in + terms of guest requests. + * Grant-mapped frames with lesser cacheability didn't adjust the page's + cacheattr settings. + * The map_domain_page() function still unconditionally created WB mappings, + irrespective of the page's cacheattr settings. + +Additionally, update_xen_mappings() had a bug where the alias calculation was +wrong for mfn's which were .init content, which should have been treated as +fully guest pages, not Xen pages. + +Worse yet, the logic introduced a vulnerability whereby necessary +pagetable/segdesc adjustments made by Xen in the validation logic could become +non-coherent between the cache and main memory. The CPU could subsequently +operate on the stale value in the cache, rather than the safe value in main +memory. + +The directmap contains primarily mappings of RAM. 
PAT/MTRR conflict +resolution is asymmetric, and generally for MTRR=WB ranges, PAT of lesser +cacheability resolves to being coherent. The special case is WC mappings, +which are non-coherent against MTRR=WB regions (except for fully-coherent +CPUs). + +Xen must not have any WC cacheability in the directmap, to prevent Xen's +actions from creating non-coherency. (Guest actions creating non-coherency is +dealt with in subsequent patches.) As all memory types for MTRR=WB ranges +inter-operate coherently, so leave Xen's directmap mappings as WB. + +Only PV guests with access to devices can use reduced-cacheability mappings to +begin with, and they're trusted not to mount DoSs against the system anyway. + +Drop PGC_cacheattr_{base,mask} entirely, and the logic to manipulate them. +Shift the later PGC_* constants up, to gain 3 extra bits in the main reference +count. Retain the check in get_page_from_l1e() for special_pages() because a +guest has no business using reduced cacheability on these. + +This reverts changeset 55f97f49b7ce6c3520c555d19caac6cf3f9a5df0 + +This is CVE-2022-26363, part of XSA-402. + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: George Dunlap <george.dunlap@citrix.com> + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index c6429b0f749a..ab32d13a1a0d 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -783,28 +783,6 @@ bool is_iomem_page(mfn_t mfn) + return (page_get_owner(page) == dom_io); + } + +-static int update_xen_mappings(unsigned long mfn, unsigned int cacheattr) +-{ +- int err = 0; +- bool alias = mfn >= PFN_DOWN(xen_phys_start) && +- mfn < PFN_UP(xen_phys_start + xen_virt_end - XEN_VIRT_START); +- unsigned long xen_va = +- XEN_VIRT_START + ((mfn - PFN_DOWN(xen_phys_start)) << PAGE_SHIFT); +- +- if ( boot_cpu_has(X86_FEATURE_XEN_SELFSNOOP) ) +- return 0; +- +- if ( unlikely(alias) && cacheattr ) +- err = map_pages_to_xen(xen_va, _mfn(mfn), 1, 0); +- if ( !err ) +- err = map_pages_to_xen((unsigned long)mfn_to_virt(mfn), _mfn(mfn), 1, +- PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr)); +- if ( unlikely(alias) && !cacheattr && !err ) +- err = map_pages_to_xen(xen_va, _mfn(mfn), 1, PAGE_HYPERVISOR); +- +- return err; +-} +- + #ifndef NDEBUG + struct mmio_emul_range_ctxt { + const struct domain *d; +@@ -1009,47 +987,14 @@ get_page_from_l1e( + goto could_not_pin; + } + +- if ( pte_flags_to_cacheattr(l1f) != +- ((page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base) ) ++ if ( (l1f & PAGE_CACHE_ATTRS) != _PAGE_WB && is_special_page(page) ) + { +- unsigned long x, nx, y = page->count_info; +- unsigned long cacheattr = pte_flags_to_cacheattr(l1f); +- int err; +- +- if ( is_special_page(page) ) +- { +- if ( write ) +- put_page_type(page); +- put_page(page); +- gdprintk(XENLOG_WARNING, +- "Attempt to change cache attributes of Xen heap page\n"); +- return -EACCES; +- } +- +- do { +- x = y; +- nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base); +- } while ( (y = cmpxchg(&page->count_info, x, nx)) != x ); +- +- err = update_xen_mappings(mfn, cacheattr); +- if ( unlikely(err) ) +- { +- cacheattr = y & PGC_cacheattr_mask; +- do { +- x = y; +- nx = (x & ~PGC_cacheattr_mask) | cacheattr; +- } while ( (y = cmpxchg(&page->count_info, x, nx)) != x ); +- +- if ( write ) +- put_page_type(page); +- put_page(page); +- +- gdprintk(XENLOG_WARNING, "Error updating mappings for mfn %" PRI_mfn +- " (pfn %" PRI_pfn ", from L1 entry %" PRIpte ") for d%d\n", +- mfn, get_gpfn_from_mfn(mfn), +- l1e_get_intpte(l1e), 
l1e_owner->domain_id); +- return err; +- } ++ if ( write ) ++ put_page_type(page); ++ put_page(page); ++ gdprintk(XENLOG_WARNING, ++ "Attempt to change cache attributes of Xen heap page\n"); ++ return -EACCES; + } + + return 0; +@@ -2467,25 +2412,10 @@ static int mod_l4_entry(l4_pgentry_t *pl4e, + */ + static int cleanup_page_mappings(struct page_info *page) + { +- unsigned int cacheattr = +- (page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base; + int rc = 0; + unsigned long mfn = mfn_x(page_to_mfn(page)); + + /* +- * If we've modified xen mappings as a result of guest cache +- * attributes, restore them to the "normal" state. +- */ +- if ( unlikely(cacheattr) ) +- { +- page->count_info &= ~PGC_cacheattr_mask; +- +- BUG_ON(is_special_page(page)); +- +- rc = update_xen_mappings(mfn, 0); +- } +- +- /* + * If this may be in a PV domain's IOMMU, remove it. + * + * NB that writable xenheap pages have their type set and cleared by +diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h +index cb9052749963..8a9a43bb0a9d 100644 +--- a/xen/include/asm-x86/mm.h ++++ b/xen/include/asm-x86/mm.h +@@ -69,25 +69,22 @@ + /* Set when is using a page as a page table */ + #define _PGC_page_table PG_shift(3) + #define PGC_page_table PG_mask(1, 3) +- /* 3-bit PAT/PCD/PWT cache-attribute hint. */ +-#define PGC_cacheattr_base PG_shift(6) +-#define PGC_cacheattr_mask PG_mask(7, 6) + /* Page is broken? */ +-#define _PGC_broken PG_shift(7) +-#define PGC_broken PG_mask(1, 7) ++#define _PGC_broken PG_shift(4) ++#define PGC_broken PG_mask(1, 4) + /* Mutually-exclusive page states: { inuse, offlining, offlined, free }. */ +-#define PGC_state PG_mask(3, 9) +-#define PGC_state_inuse PG_mask(0, 9) +-#define PGC_state_offlining PG_mask(1, 9) +-#define PGC_state_offlined PG_mask(2, 9) +-#define PGC_state_free PG_mask(3, 9) ++#define PGC_state PG_mask(3, 6) ++#define PGC_state_inuse PG_mask(0, 6) ++#define PGC_state_offlining PG_mask(1, 6) ++#define PGC_state_offlined PG_mask(2, 6) ++#define PGC_state_free PG_mask(3, 6) + #define page_state_is(pg, st) (((pg)->count_info&PGC_state) == PGC_state_##st) + /* Page is not reference counted (see below for caveats) */ +-#define _PGC_extra PG_shift(10) +-#define PGC_extra PG_mask(1, 10) ++#define _PGC_extra PG_shift(7) ++#define PGC_extra PG_mask(1, 7) + + /* Count of references to this frame. */ +-#define PGC_count_width PG_shift(10) ++#define PGC_count_width PG_shift(7) + #define PGC_count_mask ((1UL<<PGC_count_width)-1) + + /* diff --git a/system/xen/xsa/xsa402-4.16-3.patch b/system/xen/xsa/xsa402-4.16-3.patch new file mode 100644 index 000000000000..b4d2a4c835ae --- /dev/null +++ b/system/xen/xsa/xsa402-4.16-3.patch @@ -0,0 +1,284 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86: Split cache_flush() out of cache_writeback() + +Subsequent changes will want a fully flushing version. + +Use the new helper rather than opencoding it in flush_area_local(). This +resolves an outstanding issue where the conditional sfence is on the wrong +side of the clflushopt loop. clflushopt is ordered with respect to older +stores, not to younger stores. + +Rename gnttab_cache_flush()'s helper to avoid colliding in name. +grant_table.c can see the prototype from cache.h so the build fails +otherwise. + +This is part of XSA-402. 
+ +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> + +Xen 4.16 and earlier: + * Also backport half of c/s 3330013e67396 "VT-d / x86: re-arrange cache + syncing" to split cache_writeback() out of the IOMMU logic, but without the + associated hooks changes. + +diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c +index 25798df50f54..0c912b8669f8 100644 +--- a/xen/arch/x86/flushtlb.c ++++ b/xen/arch/x86/flushtlb.c +@@ -234,7 +234,7 @@ unsigned int flush_area_local(const void *va, unsigned int flags) + if ( flags & FLUSH_CACHE ) + { + const struct cpuinfo_x86 *c = ¤t_cpu_data; +- unsigned long i, sz = 0; ++ unsigned long sz = 0; + + if ( order < (BITS_PER_LONG - PAGE_SHIFT) ) + sz = 1UL << (order + PAGE_SHIFT); +@@ -244,13 +244,7 @@ unsigned int flush_area_local(const void *va, unsigned int flags) + c->x86_clflush_size && c->x86_cache_size && sz && + ((sz >> 10) < c->x86_cache_size) ) + { +- alternative("", "sfence", X86_FEATURE_CLFLUSHOPT); +- for ( i = 0; i < sz; i += c->x86_clflush_size ) +- alternative_input(".byte " __stringify(NOP_DS_PREFIX) ";" +- " clflush %0", +- "data16 clflush %0", /* clflushopt */ +- X86_FEATURE_CLFLUSHOPT, +- "m" (((const char *)va)[i])); ++ cache_flush(va, sz); + flags &= ~FLUSH_CACHE; + } + else +@@ -265,6 +259,80 @@ unsigned int flush_area_local(const void *va, unsigned int flags) + return flags; + } + ++void cache_flush(const void *addr, unsigned int size) ++{ ++ /* ++ * This function may be called before current_cpu_data is established. ++ * Hence a fallback is needed to prevent the loop below becoming infinite. ++ */ ++ unsigned int clflush_size = current_cpu_data.x86_clflush_size ?: 16; ++ const void *end = addr + size; ++ ++ addr -= (unsigned long)addr & (clflush_size - 1); ++ for ( ; addr < end; addr += clflush_size ) ++ { ++ /* ++ * Note regarding the "ds" prefix use: it's faster to do a clflush ++ * + prefix than a clflush + nop, and hence the prefix is added instead ++ * of letting the alternative framework fill the gap by appending nops. ++ */ ++ alternative_io("ds; clflush %[p]", ++ "data16 clflush %[p]", /* clflushopt */ ++ X86_FEATURE_CLFLUSHOPT, ++ /* no outputs */, ++ [p] "m" (*(const char *)(addr))); ++ } ++ ++ alternative("", "sfence", X86_FEATURE_CLFLUSHOPT); ++} ++ ++void cache_writeback(const void *addr, unsigned int size) ++{ ++ unsigned int clflush_size; ++ const void *end = addr + size; ++ ++ /* Fall back to CLFLUSH{,OPT} when CLWB isn't available. */ ++ if ( !boot_cpu_has(X86_FEATURE_CLWB) ) ++ return cache_flush(addr, size); ++ ++ /* ++ * This function may be called before current_cpu_data is established. ++ * Hence a fallback is needed to prevent the loop below becoming infinite. ++ */ ++ clflush_size = current_cpu_data.x86_clflush_size ?: 16; ++ addr -= (unsigned long)addr & (clflush_size - 1); ++ for ( ; addr < end; addr += clflush_size ) ++ { ++/* ++ * The arguments to a macro must not include preprocessor directives. Doing so ++ * results in undefined behavior, so we have to create some defines here in ++ * order to avoid it. 
++ */ ++#if defined(HAVE_AS_CLWB) ++# define CLWB_ENCODING "clwb %[p]" ++#elif defined(HAVE_AS_XSAVEOPT) ++# define CLWB_ENCODING "data16 xsaveopt %[p]" /* clwb */ ++#else ++# define CLWB_ENCODING ".byte 0x66, 0x0f, 0xae, 0x30" /* clwb (%%rax) */ ++#endif ++ ++#define BASE_INPUT(addr) [p] "m" (*(const char *)(addr)) ++#if defined(HAVE_AS_CLWB) || defined(HAVE_AS_XSAVEOPT) ++# define INPUT BASE_INPUT ++#else ++# define INPUT(addr) "a" (addr), BASE_INPUT(addr) ++#endif ++ ++ asm volatile (CLWB_ENCODING :: INPUT(addr)); ++ ++#undef INPUT ++#undef BASE_INPUT ++#undef CLWB_ENCODING ++ } ++ ++ asm volatile ("sfence" ::: "memory"); ++} ++ + unsigned int guest_flush_tlb_flags(const struct domain *d) + { + bool shadow = paging_mode_shadow(d); +diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c +index 66f8ce71741c..4c742cd8fe81 100644 +--- a/xen/common/grant_table.c ++++ b/xen/common/grant_table.c +@@ -3431,7 +3431,7 @@ gnttab_swap_grant_ref(XEN_GUEST_HANDLE_PARAM(gnttab_swap_grant_ref_t) uop, + return 0; + } + +-static int cache_flush(const gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref) ++static int _cache_flush(const gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref) + { + struct domain *d, *owner; + struct page_info *page; +@@ -3525,7 +3525,7 @@ gnttab_cache_flush(XEN_GUEST_HANDLE_PARAM(gnttab_cache_flush_t) uop, + return -EFAULT; + for ( ; ; ) + { +- int ret = cache_flush(&op, cur_ref); ++ int ret = _cache_flush(&op, cur_ref); + + if ( ret < 0 ) + return ret; +diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h +index 01e010a10d61..401079299725 100644 +--- a/xen/drivers/passthrough/vtd/extern.h ++++ b/xen/drivers/passthrough/vtd/extern.h +@@ -76,7 +76,6 @@ int __must_check qinval_device_iotlb_sync(struct vtd_iommu *iommu, + struct pci_dev *pdev, + u16 did, u16 size, u64 addr); + +-unsigned int get_cache_line_size(void); + void flush_all_cache(void); + + uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node); +diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c +index 8975c1de61bc..bc377c9bcfa4 100644 +--- a/xen/drivers/passthrough/vtd/iommu.c ++++ b/xen/drivers/passthrough/vtd/iommu.c +@@ -31,6 +31,7 @@ + #include <xen/pci.h> + #include <xen/pci_regs.h> + #include <xen/keyhandler.h> ++#include <asm/cache.h> + #include <asm/msi.h> + #include <asm/nops.h> + #include <asm/irq.h> +@@ -206,54 +207,6 @@ static void check_cleanup_domid_map(const struct domain *d, + } + } + +-static void sync_cache(const void *addr, unsigned int size) +-{ +- static unsigned long clflush_size = 0; +- const void *end = addr + size; +- +- if ( clflush_size == 0 ) +- clflush_size = get_cache_line_size(); +- +- addr -= (unsigned long)addr & (clflush_size - 1); +- for ( ; addr < end; addr += clflush_size ) +-/* +- * The arguments to a macro must not include preprocessor directives. Doing so +- * results in undefined behavior, so we have to create some defines here in +- * order to avoid it. 
+- */ +-#if defined(HAVE_AS_CLWB) +-# define CLWB_ENCODING "clwb %[p]" +-#elif defined(HAVE_AS_XSAVEOPT) +-# define CLWB_ENCODING "data16 xsaveopt %[p]" /* clwb */ +-#else +-# define CLWB_ENCODING ".byte 0x66, 0x0f, 0xae, 0x30" /* clwb (%%rax) */ +-#endif +- +-#define BASE_INPUT(addr) [p] "m" (*(const char *)(addr)) +-#if defined(HAVE_AS_CLWB) || defined(HAVE_AS_XSAVEOPT) +-# define INPUT BASE_INPUT +-#else +-# define INPUT(addr) "a" (addr), BASE_INPUT(addr) +-#endif +- /* +- * Note regarding the use of NOP_DS_PREFIX: it's faster to do a clflush +- * + prefix than a clflush + nop, and hence the prefix is added instead +- * of letting the alternative framework fill the gap by appending nops. +- */ +- alternative_io_2(".byte " __stringify(NOP_DS_PREFIX) "; clflush %[p]", +- "data16 clflush %[p]", /* clflushopt */ +- X86_FEATURE_CLFLUSHOPT, +- CLWB_ENCODING, +- X86_FEATURE_CLWB, /* no outputs */, +- INPUT(addr)); +-#undef INPUT +-#undef BASE_INPUT +-#undef CLWB_ENCODING +- +- alternative_2("", "sfence", X86_FEATURE_CLFLUSHOPT, +- "sfence", X86_FEATURE_CLWB); +-} +- + /* Allocate page table, return its machine address */ + uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node) + { +@@ -273,7 +226,7 @@ uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node) + clear_page(vaddr); + + if ( (iommu_ops.init ? &iommu_ops : &vtd_ops)->sync_cache ) +- sync_cache(vaddr, PAGE_SIZE); ++ cache_writeback(vaddr, PAGE_SIZE); + unmap_domain_page(vaddr); + cur_pg++; + } +@@ -1305,7 +1258,7 @@ int __init iommu_alloc(struct acpi_drhd_unit *drhd) + iommu->nr_pt_levels = agaw_to_level(agaw); + + if ( !ecap_coherent(iommu->ecap) ) +- vtd_ops.sync_cache = sync_cache; ++ vtd_ops.sync_cache = cache_writeback; + + /* allocate domain id bitmap */ + iommu->domid_bitmap = xzalloc_array(unsigned long, BITS_TO_LONGS(nr_dom)); +diff --git a/xen/drivers/passthrough/vtd/x86/vtd.c b/xen/drivers/passthrough/vtd/x86/vtd.c +index 6681dccd6970..55f0faa521cb 100644 +--- a/xen/drivers/passthrough/vtd/x86/vtd.c ++++ b/xen/drivers/passthrough/vtd/x86/vtd.c +@@ -47,11 +47,6 @@ void unmap_vtd_domain_page(const void *va) + unmap_domain_page(va); + } + +-unsigned int get_cache_line_size(void) +-{ +- return ((cpuid_ebx(1) >> 8) & 0xff) * 8; +-} +- + void flush_all_cache() + { + wbinvd(); +diff --git a/xen/include/asm-x86/cache.h b/xen/include/asm-x86/cache.h +index 1f7173d8c72c..e4770efb22b9 100644 +--- a/xen/include/asm-x86/cache.h ++++ b/xen/include/asm-x86/cache.h +@@ -11,4 +11,11 @@ + + #define __read_mostly __section(".data.read_mostly") + ++#ifndef __ASSEMBLY__ ++ ++void cache_flush(const void *addr, unsigned int size); ++void cache_writeback(const void *addr, unsigned int size); ++ ++#endif ++ + #endif diff --git a/system/xen/xsa/xsa402-4.16-4.patch b/system/xen/xsa/xsa402-4.16-4.patch new file mode 100644 index 000000000000..21109225d7d7 --- /dev/null +++ b/system/xen/xsa/xsa402-4.16-4.patch @@ -0,0 +1,83 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86/amd: Work around CLFLUSH ordering on older parts + +On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakely ordered with everything, +including reads and writes to the address, and LFENCE/SFENCE instructions. + +This creates a multitude of problematic corner cases, laid out in the manual. +Arrange to use MFENCE on both sides of the CLFLUSH to force proper ordering. + +This is part of XSA-402. 
+ +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> + +diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c +index a8e37dbb1f5c..b3b9a0df5fed 100644 +--- a/xen/arch/x86/cpu/amd.c ++++ b/xen/arch/x86/cpu/amd.c +@@ -812,6 +812,14 @@ static void init_amd(struct cpuinfo_x86 *c) + if (!cpu_has_lfence_dispatch) + __set_bit(X86_FEATURE_MFENCE_RDTSC, c->x86_capability); + ++ /* ++ * On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with ++ * everything, including reads and writes to address, and ++ * LFENCE/SFENCE instructions. ++ */ ++ if (!cpu_has_clflushopt) ++ setup_force_cpu_cap(X86_BUG_CLFLUSH_MFENCE); ++ + switch(c->x86) + { + case 0xf ... 0x11: +diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c +index 0c912b8669f8..dcbb4064012e 100644 +--- a/xen/arch/x86/flushtlb.c ++++ b/xen/arch/x86/flushtlb.c +@@ -259,6 +259,13 @@ unsigned int flush_area_local(const void *va, unsigned int flags) + return flags; + } + ++/* ++ * On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with everything, ++ * including reads and writes to address, and LFENCE/SFENCE instructions. ++ * ++ * This function only works safely after alternatives have run. Luckily, at ++ * the time of writing, we don't flush the caches that early. ++ */ + void cache_flush(const void *addr, unsigned int size) + { + /* +@@ -268,6 +275,8 @@ void cache_flush(const void *addr, unsigned int size) + unsigned int clflush_size = current_cpu_data.x86_clflush_size ?: 16; + const void *end = addr + size; + ++ alternative("", "mfence", X86_BUG_CLFLUSH_MFENCE); ++ + addr -= (unsigned long)addr & (clflush_size - 1); + for ( ; addr < end; addr += clflush_size ) + { +@@ -283,7 +292,9 @@ void cache_flush(const void *addr, unsigned int size) + [p] "m" (*(const char *)(addr))); + } + +- alternative("", "sfence", X86_FEATURE_CLFLUSHOPT); ++ alternative_2("", ++ "sfence", X86_FEATURE_CLFLUSHOPT, ++ "mfence", X86_BUG_CLFLUSH_MFENCE); + } + + void cache_writeback(const void *addr, unsigned int size) +diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h +index 7413febd7ad8..ff3157d52d13 100644 +--- a/xen/include/asm-x86/cpufeatures.h ++++ b/xen/include/asm-x86/cpufeatures.h +@@ -47,6 +47,7 @@ XEN_CPUFEATURE(XEN_IBT, X86_SYNTH(27)) /* Xen uses CET Indirect Branch + + #define X86_BUG_FPU_PTRS X86_BUG( 0) /* (F)X{SAVE,RSTOR} doesn't save/restore FOP/FIP/FDP. */ + #define X86_BUG_NULL_SEG X86_BUG( 1) /* NULL-ing a selector preserves the base and limit. */ ++#define X86_BUG_CLFLUSH_MFENCE X86_BUG( 2) /* MFENCE needed to serialise CLFLUSH */ + + /* Total number of capability words, inc synth and bug words. */ + #define NCAPINTS (FSCAPINTS + X86_NR_SYNTH + X86_NR_BUG) /* N 32-bit words worth of info */ diff --git a/system/xen/xsa/xsa402-4.16-5.patch b/system/xen/xsa/xsa402-4.16-5.patch new file mode 100644 index 000000000000..4806d25c6fa5 --- /dev/null +++ b/system/xen/xsa/xsa402-4.16-5.patch @@ -0,0 +1,148 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86/pv: Track and flush non-coherent mappings of RAM + +There are legitimate uses of WC mappings of RAM, e.g. for DMA buffers with +devices that make non-coherent writes. The Linux sound subsystem makes +extensive use of this technique. + +For such usecases, the guest's DMA buffer is mapped and consistently used as +WC, and Xen doesn't interact with the buffer. 
+ +However, a mischevious guest can use WC mappings to deliberately create +non-coherency between the cache and RAM, and use this to trick Xen into +validating a pagetable which isn't actually safe. + +Allocate a new PGT_non_coherent to track the non-coherency of mappings. Set +it whenever a non-coherent writeable mapping is created. If the page is used +as anything other than PGT_writable_page, force a cache flush before +validation. Also force a cache flush before the page is returned to the heap. + +This is CVE-2022-26364, part of XSA-402. + +Reported-by: Jann Horn <jannh@google.com> +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: George Dunlap <george.dunlap@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index ab32d13a1a0d..bab9624fabb7 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -997,6 +997,15 @@ get_page_from_l1e( + return -EACCES; + } + ++ /* ++ * Track writeable non-coherent mappings to RAM pages, to trigger a cache ++ * flush later if the target is used as anything but a PGT_writeable page. ++ * We care about all writeable mappings, including foreign mappings. ++ */ ++ if ( !boot_cpu_has(X86_FEATURE_XEN_SELFSNOOP) && ++ (l1f & (PAGE_CACHE_ATTRS | _PAGE_RW)) == (_PAGE_WC | _PAGE_RW) ) ++ set_bit(_PGT_non_coherent, &page->u.inuse.type_info); ++ + return 0; + + could_not_pin: +@@ -2454,6 +2463,19 @@ static int cleanup_page_mappings(struct page_info *page) + } + } + ++ /* ++ * Flush the cache if there were previously non-coherent writeable ++ * mappings of this page. This forces the page to be coherent before it ++ * is freed back to the heap. ++ */ ++ if ( __test_and_clear_bit(_PGT_non_coherent, &page->u.inuse.type_info) ) ++ { ++ void *addr = __map_domain_page(page); ++ ++ cache_flush(addr, PAGE_SIZE); ++ unmap_domain_page(addr); ++ } ++ + return rc; + } + +@@ -3028,6 +3050,22 @@ static int _get_page_type(struct page_info *page, unsigned long type, + if ( unlikely(!(nx & PGT_validated)) ) + { + /* ++ * Flush the cache if there were previously non-coherent mappings of ++ * this page, and we're trying to use it as anything other than a ++ * writeable page. This forces the page to be coherent before we ++ * validate its contents for safety. ++ */ ++ if ( (nx & PGT_non_coherent) && type != PGT_writable_page ) ++ { ++ void *addr = __map_domain_page(page); ++ ++ cache_flush(addr, PAGE_SIZE); ++ unmap_domain_page(addr); ++ ++ page->u.inuse.type_info &= ~PGT_non_coherent; ++ } ++ ++ /* + * No special validation needed for writable or shared pages. Page + * tables and GDT/LDT need to have their contents audited. + * +diff --git a/xen/arch/x86/pv/grant_table.c b/xen/arch/x86/pv/grant_table.c +index 0325618c9883..81c72e61ed55 100644 +--- a/xen/arch/x86/pv/grant_table.c ++++ b/xen/arch/x86/pv/grant_table.c +@@ -109,7 +109,17 @@ int create_grant_pv_mapping(uint64_t addr, mfn_t frame, + + ol1e = *pl1e; + if ( UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, 0) ) ++ { ++ /* ++ * We always create mappings in this path. However, our caller, ++ * map_grant_ref(), only passes potentially non-zero cache_flags for ++ * MMIO frames, so this path doesn't create non-coherent mappings of ++ * RAM frames and there's no need to calculate PGT_non_coherent. 
++ */ ++ ASSERT(!cache_flags || is_iomem_page(frame)); ++ + rc = GNTST_okay; ++ } + + out_unlock: + page_unlock(page); +@@ -294,7 +304,18 @@ int replace_grant_pv_mapping(uint64_t addr, mfn_t frame, + l1e_get_flags(ol1e), addr, grant_pte_flags); + + if ( UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, 0) ) ++ { ++ /* ++ * Generally, replace_grant_pv_mapping() is used to destroy mappings ++ * (n1le = l1e_empty()), but it can be a present mapping on the ++ * GNTABOP_unmap_and_replace path. ++ * ++ * In such cases, the PTE is fully transplanted from its old location ++ * via steal_linear_addr(), so we need not perform PGT_non_coherent ++ * checking here. ++ */ + rc = GNTST_okay; ++ } + + out_unlock: + page_unlock(page); +diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h +index 8a9a43bb0a9d..7464167ae192 100644 +--- a/xen/include/asm-x86/mm.h ++++ b/xen/include/asm-x86/mm.h +@@ -53,8 +53,12 @@ + #define _PGT_partial PG_shift(8) + #define PGT_partial PG_mask(1, 8) + ++/* Has this page been mapped writeable with a non-coherent memory type? */ ++#define _PGT_non_coherent PG_shift(9) ++#define PGT_non_coherent PG_mask(1, 9) ++ + /* Count of uses of this frame as its current type. */ +-#define PGT_count_width PG_shift(8) ++#define PGT_count_width PG_shift(9) + #define PGT_count_mask ((1UL<<PGT_count_width)-1) + + /* Are the 'type mask' bits identical? */ |
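
Editor's note: the ordering rule stated in xsa401-4.16-2.patch — take the typeref with cmpxchg() first, and only then perform TLB flushing and the other safety actions — can be illustrated outside of Xen. Below is a minimal userspace C11 sketch of that pattern, not Xen code and not part of these patches: the TYPE_MASK/COUNT_MASK layout, get_type_ref() and the flush_stale_mappings() stand-in are invented for illustration and do not correspond to Xen's PGT_* encoding or to _get_page_type() itself.

/*
 * Minimal sketch (assumed layout, NOT Xen code) of "claim the reference
 * first, do safety actions second", as described in xsa401-4.16-2.patch.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define TYPE_MASK   0xff00u   /* hypothetical "type" bits     */
#define COUNT_MASK  0x00ffu   /* hypothetical reference count */

static _Atomic uint32_t type_info = 0;   /* count 0, no type yet */

static void flush_stale_mappings(void)
{
    /* Placeholder for the expensive safety action (the TLB flush in Xen). */
    puts("flushing stale mappings");
}

static int get_type_ref(uint32_t type)
{
    uint32_t x = atomic_load(&type_info);

    for ( ;; )
    {
        uint32_t nx = x + 1;                 /* take one reference */

        if ( (nx & COUNT_MASK) == 0 )
            return -1;                       /* count overflow */

        if ( (x & COUNT_MASK) == 0 )         /* count 0 -> 1: type may change */
            nx = (nx & ~TYPE_MASK) | type;
        else if ( (x & TYPE_MASK) != type )
            return -1;                       /* wrong type, refuse */

        /* Publish the new type/count first ... */
        if ( atomic_compare_exchange_weak(&type_info, &x, nx) )
        {
            /*
             * ... and only now, when no concurrent caller can any longer
             * acquire a reference to the old type, perform the safety
             * action.  Doing this before the exchange succeeds is the race
             * window the patch closes.
             */
            if ( ((x & COUNT_MASK) == 0) && ((x & TYPE_MASK) != type) )
                flush_stale_mappings();
            return 0;
        }
        /* x was reloaded by the failed compare-exchange; retry. */
    }
}

int main(void)
{
    if ( get_type_ref(0x0100u) == 0 )
        printf("type_info now %#x\n", (unsigned)atomic_load(&type_info));
    return 0;
}

The sketch mirrors the patch's reasoning only: once the compare-exchange has made the new type and non-zero count globally visible, a concurrent caller can no longer take a reference under the old type, so the flush performed afterwards cannot be undermined. The real fix additionally defers IOMMU synchronisation and the PGT_validated handling, which this toy model does not attempt to show.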