Diffstat (limited to 'system')
-rw-r--r-- | system/xen/xen.SlackBuild          |   2
-rw-r--r-- | system/xen/xsa/xsa401-4.16-1.patch | 170
-rw-r--r-- | system/xen/xsa/xsa401-4.16-2.patch | 191
-rw-r--r-- | system/xen/xsa/xsa402-4.16-1.patch |  43
-rw-r--r-- | system/xen/xsa/xsa402-4.16-2.patch | 213
-rw-r--r-- | system/xen/xsa/xsa402-4.16-3.patch | 284
-rw-r--r-- | system/xen/xsa/xsa402-4.16-4.patch |  83
-rw-r--r-- | system/xen/xsa/xsa402-4.16-5.patch | 148
8 files changed, 1133 insertions, 1 deletion
diff --git a/system/xen/xen.SlackBuild b/system/xen/xen.SlackBuild index 1d1d31d344dd..e504dc97bf12 100644 --- a/system/xen/xen.SlackBuild +++ b/system/xen/xen.SlackBuild @@ -26,7 +26,7 @@ cd $(dirname $0) ; CWD=$(pwd) PRGNAM=xen VERSION=${VERSION:-4.16.1} -BUILD=${BUILD:-1} +BUILD=${BUILD:-2} TAG=${TAG:-_SBo} PKGTYPE=${PKGTYPE:-tgz} diff --git a/system/xen/xsa/xsa401-4.16-1.patch b/system/xen/xsa/xsa401-4.16-1.patch new file mode 100644 index 000000000000..5c8c50617a21 --- /dev/null +++ b/system/xen/xsa/xsa401-4.16-1.patch @@ -0,0 +1,170 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86/pv: Clean up _get_page_type() + +Various fixes for clarity, ahead of making complicated changes. + + * Split the overflow check out of the if/else chain for type handling, as + it's somewhat unrelated. + * Comment the main if/else chain to explain what is going on. Adjust one + ASSERT() and state the bit layout for validate-locked and partial states. + * Correct the comment about TLB flushing, as it's backwards. The problem + case is when writeable mappings are retained to a page becoming read-only, + as it allows the guest to bypass Xen's safety checks for updates. + * Reduce the scope of 'y'. It is an artefact of the cmpxchg loop and not + valid for use by subsequent logic. Switch to using ACCESS_ONCE() to treat + all reads as explicitly volatile. The only thing preventing the validated + wait-loop being infinite is the compiler barrier hidden in cpu_relax(). + * Replace one page_get_owner(page) with the already-calculated 'd' already in + scope. + +No functional change. + +This is part of XSA-401 / CVE-2022-26362. + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Signed-off-by: George Dunlap <george.dunlap@eu.citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: George Dunlap <george.dunlap@citrix.com> + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 796faca64103..ddd32f88c798 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -2935,16 +2935,17 @@ static int _put_page_type(struct page_info *page, unsigned int flags, + static int _get_page_type(struct page_info *page, unsigned long type, + bool preemptible) + { +- unsigned long nx, x, y = page->u.inuse.type_info; ++ unsigned long nx, x; + int rc = 0; + + ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2))); + ASSERT(!in_irq()); + +- for ( ; ; ) ++ for ( unsigned long y = ACCESS_ONCE(page->u.inuse.type_info); ; ) + { + x = y; + nx = x + 1; ++ + if ( unlikely((nx & PGT_count_mask) == 0) ) + { + gdprintk(XENLOG_WARNING, +@@ -2952,8 +2953,15 @@ static int _get_page_type(struct page_info *page, unsigned long type, + mfn_x(page_to_mfn(page))); + return -EINVAL; + } +- else if ( unlikely((x & PGT_count_mask) == 0) ) ++ ++ if ( unlikely((x & PGT_count_mask) == 0) ) + { ++ /* ++ * Typeref 0 -> 1. ++ * ++ * Type changes are permitted when the typeref is 0. If the type ++ * actually changes, the page needs re-validating. ++ */ + struct domain *d = page_get_owner(page); + + if ( d && shadow_mode_enabled(d) ) +@@ -2964,8 +2972,8 @@ static int _get_page_type(struct page_info *page, unsigned long type, + { + /* + * On type change we check to flush stale TLB entries. It is +- * vital that no other CPUs are left with mappings of a frame +- * which is about to become writeable to the guest. ++ * vital that no other CPUs are left with writeable mappings ++ * to a frame which is intending to become pgtable/segdesc. 
+ */ + cpumask_t *mask = this_cpu(scratch_cpumask); + +@@ -2977,7 +2985,7 @@ static int _get_page_type(struct page_info *page, unsigned long type, + + if ( unlikely(!cpumask_empty(mask)) && + /* Shadow mode: track only writable pages. */ +- (!shadow_mode_enabled(page_get_owner(page)) || ++ (!shadow_mode_enabled(d) || + ((nx & PGT_type_mask) == PGT_writable_page)) ) + { + perfc_incr(need_flush_tlb_flush); +@@ -3008,7 +3016,14 @@ static int _get_page_type(struct page_info *page, unsigned long type, + } + else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) ) + { +- /* Don't log failure if it could be a recursive-mapping attempt. */ ++ /* ++ * else, we're trying to take a new reference, of the wrong type. ++ * ++ * This (being able to prohibit use of the wrong type) is what the ++ * typeref system exists for, but skip printing the failure if it ++ * looks like a recursive mapping, as subsequent logic might ++ * ultimately permit the attempt. ++ */ + if ( ((x & PGT_type_mask) == PGT_l2_page_table) && + (type == PGT_l1_page_table) ) + return -EINVAL; +@@ -3027,18 +3042,46 @@ static int _get_page_type(struct page_info *page, unsigned long type, + } + else if ( unlikely(!(x & PGT_validated)) ) + { ++ /* ++ * else, the count is non-zero, and we're grabbing the right type; ++ * but the page hasn't been validated yet. ++ * ++ * The page is in one of two states (depending on PGT_partial), ++ * and should have exactly one reference. ++ */ ++ ASSERT((x & (PGT_type_mask | PGT_count_mask)) == (type | 1)); ++ + if ( !(x & PGT_partial) ) + { +- /* Someone else is updating validation of this page. Wait... */ ++ /* ++ * The page has been left in the "validate locked" state ++ * (i.e. PGT_[type] | 1) which means that a concurrent caller ++ * of _get_page_type() is in the middle of validation. ++ * ++ * Spin waiting for the concurrent user to complete (partial ++ * or fully validated), then restart our attempt to acquire a ++ * type reference. ++ */ + do { + if ( preemptible && hypercall_preempt_check() ) + return -EINTR; + cpu_relax(); +- } while ( (y = page->u.inuse.type_info) == x ); ++ } while ( (y = ACCESS_ONCE(page->u.inuse.type_info)) == x ); + continue; + } +- /* Type ref count was left at 1 when PGT_partial got set. */ +- ASSERT((x & PGT_count_mask) == 1); ++ ++ /* ++ * The page has been left in the "partial" state ++ * (i.e., PGT_[type] | PGT_partial | 1). ++ * ++ * Rather than bumping the type count, we need to try to grab the ++ * validation lock; if we succeed, we need to validate the page, ++ * then drop the general ref associated with the PGT_partial bit. ++ * ++ * We grab the validation lock by setting nx to (PGT_[type] | 1) ++ * (i.e., non-zero type count, neither PGT_validated nor ++ * PGT_partial set). ++ */ + nx = x & ~PGT_partial; + } + +@@ -3087,6 +3130,13 @@ static int _get_page_type(struct page_info *page, unsigned long type, + } + + out: ++ /* ++ * Did we drop the PGT_partial bit when acquiring the typeref? If so, ++ * drop the general reference that went along with it. ++ * ++ * N.B. validate_page() may have have re-set PGT_partial, not reflected in ++ * nx, but will have taken an extra ref when doing so. 
++ */ + if ( (x & PGT_partial) && !(nx & PGT_partial) ) + put_page(page); + diff --git a/system/xen/xsa/xsa401-4.16-2.patch b/system/xen/xsa/xsa401-4.16-2.patch new file mode 100644 index 000000000000..be58db59a513 --- /dev/null +++ b/system/xen/xsa/xsa401-4.16-2.patch @@ -0,0 +1,191 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86/pv: Fix ABAC cmpxchg() race in _get_page_type() + +_get_page_type() suffers from a race condition where it incorrectly assumes +that because 'x' was read and a subsequent a cmpxchg() succeeds, the type +cannot have changed in-between. Consider: + +CPU A: + 1. Creates an L2e referencing pg + `-> _get_page_type(pg, PGT_l1_page_table), sees count 0, type PGT_writable_page + 2. Issues flush_tlb_mask() +CPU B: + 3. Creates a writeable mapping of pg + `-> _get_page_type(pg, PGT_writable_page), count increases to 1 + 4. Writes into new mapping, creating a TLB entry for pg + 5. Removes the writeable mapping of pg + `-> _put_page_type(pg), count goes back down to 0 +CPU A: + 7. Issues cmpxchg(), setting count 1, type PGT_l1_page_table + +CPU B now has a writeable mapping to pg, which Xen believes is a pagetable and +suitably protected (i.e. read-only). The TLB flush in step 2 must be deferred +until after the guest is prohibited from creating new writeable mappings, +which is after step 7. + +Defer all safety actions until after the cmpxchg() has successfully taken the +intended typeref, because that is what prevents concurrent users from using +the old type. + +Also remove the early validation for writeable and shared pages. This removes +race conditions where one half of a parallel mapping attempt can return +successfully before: + * The IOMMU pagetables are in sync with the new page type + * Writeable mappings to shared pages have been torn down + +This is part of XSA-401 / CVE-2022-26362. + +Reported-by: Jann Horn <jannh@google.com> +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: George Dunlap <george.dunlap@citrix.com> + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index ddd32f88c798..1693b580b152 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -2962,56 +2962,12 @@ static int _get_page_type(struct page_info *page, unsigned long type, + * Type changes are permitted when the typeref is 0. If the type + * actually changes, the page needs re-validating. + */ +- struct domain *d = page_get_owner(page); +- +- if ( d && shadow_mode_enabled(d) ) +- shadow_prepare_page_type_change(d, page, type); + + ASSERT(!(x & PGT_pae_xen_l2)); + if ( (x & PGT_type_mask) != type ) + { +- /* +- * On type change we check to flush stale TLB entries. It is +- * vital that no other CPUs are left with writeable mappings +- * to a frame which is intending to become pgtable/segdesc. +- */ +- cpumask_t *mask = this_cpu(scratch_cpumask); +- +- BUG_ON(in_irq()); +- cpumask_copy(mask, d->dirty_cpumask); +- +- /* Don't flush if the timestamp is old enough */ +- tlbflush_filter(mask, page->tlbflush_timestamp); +- +- if ( unlikely(!cpumask_empty(mask)) && +- /* Shadow mode: track only writable pages. */ +- (!shadow_mode_enabled(d) || +- ((nx & PGT_type_mask) == PGT_writable_page)) ) +- { +- perfc_incr(need_flush_tlb_flush); +- /* +- * If page was a page table make sure the flush is +- * performed using an IPI in order to avoid changing the +- * type of a page table page under the feet of +- * spurious_page_fault(). 
+- */ +- flush_mask(mask, +- (x & PGT_type_mask) && +- (x & PGT_type_mask) <= PGT_root_page_table +- ? FLUSH_TLB | FLUSH_FORCE_IPI +- : FLUSH_TLB); +- } +- +- /* We lose existing type and validity. */ + nx &= ~(PGT_type_mask | PGT_validated); + nx |= type; +- +- /* +- * No special validation needed for writable pages. +- * Page tables and GDT/LDT need to be scanned for validity. +- */ +- if ( type == PGT_writable_page || type == PGT_shared_page ) +- nx |= PGT_validated; + } + } + else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) ) +@@ -3092,6 +3048,56 @@ static int _get_page_type(struct page_info *page, unsigned long type, + return -EINTR; + } + ++ /* ++ * One typeref has been taken and is now globally visible. ++ * ++ * The page is either in the "validate locked" state (PGT_[type] | 1) or ++ * fully validated (PGT_[type] | PGT_validated | >0). ++ */ ++ ++ if ( unlikely((x & PGT_count_mask) == 0) ) ++ { ++ struct domain *d = page_get_owner(page); ++ ++ if ( d && shadow_mode_enabled(d) ) ++ shadow_prepare_page_type_change(d, page, type); ++ ++ if ( (x & PGT_type_mask) != type ) ++ { ++ /* ++ * On type change we check to flush stale TLB entries. It is ++ * vital that no other CPUs are left with writeable mappings ++ * to a frame which is intending to become pgtable/segdesc. ++ */ ++ cpumask_t *mask = this_cpu(scratch_cpumask); ++ ++ BUG_ON(in_irq()); ++ cpumask_copy(mask, d->dirty_cpumask); ++ ++ /* Don't flush if the timestamp is old enough */ ++ tlbflush_filter(mask, page->tlbflush_timestamp); ++ ++ if ( unlikely(!cpumask_empty(mask)) && ++ /* Shadow mode: track only writable pages. */ ++ (!shadow_mode_enabled(d) || ++ ((nx & PGT_type_mask) == PGT_writable_page)) ) ++ { ++ perfc_incr(need_flush_tlb_flush); ++ /* ++ * If page was a page table make sure the flush is ++ * performed using an IPI in order to avoid changing the ++ * type of a page table page under the feet of ++ * spurious_page_fault(). ++ */ ++ flush_mask(mask, ++ (x & PGT_type_mask) && ++ (x & PGT_type_mask) <= PGT_root_page_table ++ ? FLUSH_TLB | FLUSH_FORCE_IPI ++ : FLUSH_TLB); ++ } ++ } ++ } ++ + if ( unlikely(((x & PGT_type_mask) == PGT_writable_page) != + (type == PGT_writable_page)) ) + { +@@ -3120,13 +3126,25 @@ static int _get_page_type(struct page_info *page, unsigned long type, + + if ( unlikely(!(nx & PGT_validated)) ) + { +- if ( !(x & PGT_partial) ) ++ /* ++ * No special validation needed for writable or shared pages. Page ++ * tables and GDT/LDT need to have their contents audited. ++ * ++ * per validate_page(), non-atomic updates are fine here. ++ */ ++ if ( type == PGT_writable_page || type == PGT_shared_page ) ++ page->u.inuse.type_info |= PGT_validated; ++ else + { +- page->nr_validated_ptes = 0; +- page->partial_flags = 0; +- page->linear_pt_count = 0; ++ if ( !(x & PGT_partial) ) ++ { ++ page->nr_validated_ptes = 0; ++ page->partial_flags = 0; ++ page->linear_pt_count = 0; ++ } ++ ++ rc = validate_page(page, type, preemptible); + } +- rc = validate_page(page, type, preemptible); + } + + out: diff --git a/system/xen/xsa/xsa402-4.16-1.patch b/system/xen/xsa/xsa402-4.16-1.patch new file mode 100644 index 000000000000..b783383fc89f --- /dev/null +++ b/system/xen/xsa/xsa402-4.16-1.patch @@ -0,0 +1,43 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86/page: Introduce _PAGE_* constants for memory types + +... rather than opencoding the PAT/PCD/PWT attributes in __PAGE_HYPERVISOR_* +constants. These are going to be needed by forthcoming logic. + +No functional change. 
+ +This is part of XSA-402. + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> + +diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h +index 1d080cffbe84..2e542050f65a 100644 +--- a/xen/include/asm-x86/page.h ++++ b/xen/include/asm-x86/page.h +@@ -331,6 +331,14 @@ void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t); + + #define PAGE_CACHE_ATTRS (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT) + ++/* Memory types, encoded under Xen's choice of MSR_PAT. */ ++#define _PAGE_WB ( 0) ++#define _PAGE_WT ( _PAGE_PWT) ++#define _PAGE_UCM ( _PAGE_PCD ) ++#define _PAGE_UC ( _PAGE_PCD | _PAGE_PWT) ++#define _PAGE_WC (_PAGE_PAT ) ++#define _PAGE_WP (_PAGE_PAT | _PAGE_PWT) ++ + /* + * Debug option: Ensure that granted mappings are not implicitly unmapped. + * WARNING: This will need to be disabled to run OSes that use the spare PTE +@@ -349,8 +357,8 @@ void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t); + #define __PAGE_HYPERVISOR_RX (_PAGE_PRESENT | _PAGE_ACCESSED) + #define __PAGE_HYPERVISOR (__PAGE_HYPERVISOR_RX | \ + _PAGE_DIRTY | _PAGE_RW) +-#define __PAGE_HYPERVISOR_UCMINUS (__PAGE_HYPERVISOR | _PAGE_PCD) +-#define __PAGE_HYPERVISOR_UC (__PAGE_HYPERVISOR | _PAGE_PCD | _PAGE_PWT) ++#define __PAGE_HYPERVISOR_UCMINUS (__PAGE_HYPERVISOR | _PAGE_UCM) ++#define __PAGE_HYPERVISOR_UC (__PAGE_HYPERVISOR | _PAGE_UC) + #define __PAGE_HYPERVISOR_SHSTK (__PAGE_HYPERVISOR_RO | _PAGE_DIRTY) + + #define MAP_SMALL_PAGES _PAGE_AVAIL0 /* don't use superpages mappings */ diff --git a/system/xen/xsa/xsa402-4.16-2.patch b/system/xen/xsa/xsa402-4.16-2.patch new file mode 100644 index 000000000000..ebb2f5e221e8 --- /dev/null +++ b/system/xen/xsa/xsa402-4.16-2.patch @@ -0,0 +1,213 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86: Don't change the cacheability of the directmap + +Changeset 55f97f49b7ce ("x86: Change cache attributes of Xen 1:1 page mappings +in response to guest mapping requests") attempted to keep the cacheability +consistent between different mappings of the same page. + +The reason wasn't described in the changelog, but it is understood to be in +regards to a concern over machine check exceptions, owing to errata when using +mixed cacheabilities. It did this primarily by updating Xen's mapping of the +page in the direct map when the guest mapped a page with reduced cacheability. + +Unfortunately, the logic didn't actually prevent mixed cacheability from +occurring: + * A guest could map a page normally, and then map the same page with + different cacheability; nothing prevented this. + * The cacheability of the directmap was always latest-takes-precedence in + terms of guest requests. + * Grant-mapped frames with lesser cacheability didn't adjust the page's + cacheattr settings. + * The map_domain_page() function still unconditionally created WB mappings, + irrespective of the page's cacheattr settings. + +Additionally, update_xen_mappings() had a bug where the alias calculation was +wrong for mfn's which were .init content, which should have been treated as +fully guest pages, not Xen pages. + +Worse yet, the logic introduced a vulnerability whereby necessary +pagetable/segdesc adjustments made by Xen in the validation logic could become +non-coherent between the cache and main memory. The CPU could subsequently +operate on the stale value in the cache, rather than the safe value in main +memory. + +The directmap contains primarily mappings of RAM. 
PAT/MTRR conflict +resolution is asymmetric, and generally for MTRR=WB ranges, PAT of lesser +cacheability resolves to being coherent. The special case is WC mappings, +which are non-coherent against MTRR=WB regions (except for fully-coherent +CPUs). + +Xen must not have any WC cacheability in the directmap, to prevent Xen's +actions from creating non-coherency. (Guest actions creating non-coherency is +dealt with in subsequent patches.) As all memory types for MTRR=WB ranges +inter-operate coherently, so leave Xen's directmap mappings as WB. + +Only PV guests with access to devices can use reduced-cacheability mappings to +begin with, and they're trusted not to mount DoSs against the system anyway. + +Drop PGC_cacheattr_{base,mask} entirely, and the logic to manipulate them. +Shift the later PGC_* constants up, to gain 3 extra bits in the main reference +count. Retain the check in get_page_from_l1e() for special_pages() because a +guest has no business using reduced cacheability on these. + +This reverts changeset 55f97f49b7ce6c3520c555d19caac6cf3f9a5df0 + +This is CVE-2022-26363, part of XSA-402. + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: George Dunlap <george.dunlap@citrix.com> + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index c6429b0f749a..ab32d13a1a0d 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -783,28 +783,6 @@ bool is_iomem_page(mfn_t mfn) + return (page_get_owner(page) == dom_io); + } + +-static int update_xen_mappings(unsigned long mfn, unsigned int cacheattr) +-{ +- int err = 0; +- bool alias = mfn >= PFN_DOWN(xen_phys_start) && +- mfn < PFN_UP(xen_phys_start + xen_virt_end - XEN_VIRT_START); +- unsigned long xen_va = +- XEN_VIRT_START + ((mfn - PFN_DOWN(xen_phys_start)) << PAGE_SHIFT); +- +- if ( boot_cpu_has(X86_FEATURE_XEN_SELFSNOOP) ) +- return 0; +- +- if ( unlikely(alias) && cacheattr ) +- err = map_pages_to_xen(xen_va, _mfn(mfn), 1, 0); +- if ( !err ) +- err = map_pages_to_xen((unsigned long)mfn_to_virt(mfn), _mfn(mfn), 1, +- PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr)); +- if ( unlikely(alias) && !cacheattr && !err ) +- err = map_pages_to_xen(xen_va, _mfn(mfn), 1, PAGE_HYPERVISOR); +- +- return err; +-} +- + #ifndef NDEBUG + struct mmio_emul_range_ctxt { + const struct domain *d; +@@ -1009,47 +987,14 @@ get_page_from_l1e( + goto could_not_pin; + } + +- if ( pte_flags_to_cacheattr(l1f) != +- ((page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base) ) ++ if ( (l1f & PAGE_CACHE_ATTRS) != _PAGE_WB && is_special_page(page) ) + { +- unsigned long x, nx, y = page->count_info; +- unsigned long cacheattr = pte_flags_to_cacheattr(l1f); +- int err; +- +- if ( is_special_page(page) ) +- { +- if ( write ) +- put_page_type(page); +- put_page(page); +- gdprintk(XENLOG_WARNING, +- "Attempt to change cache attributes of Xen heap page\n"); +- return -EACCES; +- } +- +- do { +- x = y; +- nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base); +- } while ( (y = cmpxchg(&page->count_info, x, nx)) != x ); +- +- err = update_xen_mappings(mfn, cacheattr); +- if ( unlikely(err) ) +- { +- cacheattr = y & PGC_cacheattr_mask; +- do { +- x = y; +- nx = (x & ~PGC_cacheattr_mask) | cacheattr; +- } while ( (y = cmpxchg(&page->count_info, x, nx)) != x ); +- +- if ( write ) +- put_page_type(page); +- put_page(page); +- +- gdprintk(XENLOG_WARNING, "Error updating mappings for mfn %" PRI_mfn +- " (pfn %" PRI_pfn ", from L1 entry %" PRIpte ") for d%d\n", +- mfn, get_gpfn_from_mfn(mfn), +- l1e_get_intpte(l1e), 
l1e_owner->domain_id); +- return err; +- } ++ if ( write ) ++ put_page_type(page); ++ put_page(page); ++ gdprintk(XENLOG_WARNING, ++ "Attempt to change cache attributes of Xen heap page\n"); ++ return -EACCES; + } + + return 0; +@@ -2467,25 +2412,10 @@ static int mod_l4_entry(l4_pgentry_t *pl4e, + */ + static int cleanup_page_mappings(struct page_info *page) + { +- unsigned int cacheattr = +- (page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base; + int rc = 0; + unsigned long mfn = mfn_x(page_to_mfn(page)); + + /* +- * If we've modified xen mappings as a result of guest cache +- * attributes, restore them to the "normal" state. +- */ +- if ( unlikely(cacheattr) ) +- { +- page->count_info &= ~PGC_cacheattr_mask; +- +- BUG_ON(is_special_page(page)); +- +- rc = update_xen_mappings(mfn, 0); +- } +- +- /* + * If this may be in a PV domain's IOMMU, remove it. + * + * NB that writable xenheap pages have their type set and cleared by +diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h +index cb9052749963..8a9a43bb0a9d 100644 +--- a/xen/include/asm-x86/mm.h ++++ b/xen/include/asm-x86/mm.h +@@ -69,25 +69,22 @@ + /* Set when is using a page as a page table */ + #define _PGC_page_table PG_shift(3) + #define PGC_page_table PG_mask(1, 3) +- /* 3-bit PAT/PCD/PWT cache-attribute hint. */ +-#define PGC_cacheattr_base PG_shift(6) +-#define PGC_cacheattr_mask PG_mask(7, 6) + /* Page is broken? */ +-#define _PGC_broken PG_shift(7) +-#define PGC_broken PG_mask(1, 7) ++#define _PGC_broken PG_shift(4) ++#define PGC_broken PG_mask(1, 4) + /* Mutually-exclusive page states: { inuse, offlining, offlined, free }. */ +-#define PGC_state PG_mask(3, 9) +-#define PGC_state_inuse PG_mask(0, 9) +-#define PGC_state_offlining PG_mask(1, 9) +-#define PGC_state_offlined PG_mask(2, 9) +-#define PGC_state_free PG_mask(3, 9) ++#define PGC_state PG_mask(3, 6) ++#define PGC_state_inuse PG_mask(0, 6) ++#define PGC_state_offlining PG_mask(1, 6) ++#define PGC_state_offlined PG_mask(2, 6) ++#define PGC_state_free PG_mask(3, 6) + #define page_state_is(pg, st) (((pg)->count_info&PGC_state) == PGC_state_##st) + /* Page is not reference counted (see below for caveats) */ +-#define _PGC_extra PG_shift(10) +-#define PGC_extra PG_mask(1, 10) ++#define _PGC_extra PG_shift(7) ++#define PGC_extra PG_mask(1, 7) + + /* Count of references to this frame. */ +-#define PGC_count_width PG_shift(10) ++#define PGC_count_width PG_shift(7) + #define PGC_count_mask ((1UL<<PGC_count_width)-1) + + /* diff --git a/system/xen/xsa/xsa402-4.16-3.patch b/system/xen/xsa/xsa402-4.16-3.patch new file mode 100644 index 000000000000..b4d2a4c835ae --- /dev/null +++ b/system/xen/xsa/xsa402-4.16-3.patch @@ -0,0 +1,284 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86: Split cache_flush() out of cache_writeback() + +Subsequent changes will want a fully flushing version. + +Use the new helper rather than opencoding it in flush_area_local(). This +resolves an outstanding issue where the conditional sfence is on the wrong +side of the clflushopt loop. clflushopt is ordered with respect to older +stores, not to younger stores. + +Rename gnttab_cache_flush()'s helper to avoid colliding in name. +grant_table.c can see the prototype from cache.h so the build fails +otherwise. + +This is part of XSA-402. 
+ +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> + +Xen 4.16 and earlier: + * Also backport half of c/s 3330013e67396 "VT-d / x86: re-arrange cache + syncing" to split cache_writeback() out of the IOMMU logic, but without the + associated hooks changes. + +diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c +index 25798df50f54..0c912b8669f8 100644 +--- a/xen/arch/x86/flushtlb.c ++++ b/xen/arch/x86/flushtlb.c +@@ -234,7 +234,7 @@ unsigned int flush_area_local(const void *va, unsigned int flags) + if ( flags & FLUSH_CACHE ) + { + const struct cpuinfo_x86 *c = ¤t_cpu_data; +- unsigned long i, sz = 0; ++ unsigned long sz = 0; + + if ( order < (BITS_PER_LONG - PAGE_SHIFT) ) + sz = 1UL << (order + PAGE_SHIFT); +@@ -244,13 +244,7 @@ unsigned int flush_area_local(const void *va, unsigned int flags) + c->x86_clflush_size && c->x86_cache_size && sz && + ((sz >> 10) < c->x86_cache_size) ) + { +- alternative("", "sfence", X86_FEATURE_CLFLUSHOPT); +- for ( i = 0; i < sz; i += c->x86_clflush_size ) +- alternative_input(".byte " __stringify(NOP_DS_PREFIX) ";" +- " clflush %0", +- "data16 clflush %0", /* clflushopt */ +- X86_FEATURE_CLFLUSHOPT, +- "m" (((const char *)va)[i])); ++ cache_flush(va, sz); + flags &= ~FLUSH_CACHE; + } + else +@@ -265,6 +259,80 @@ unsigned int flush_area_local(const void *va, unsigned int flags) + return flags; + } + ++void cache_flush(const void *addr, unsigned int size) ++{ ++ /* ++ * This function may be called before current_cpu_data is established. ++ * Hence a fallback is needed to prevent the loop below becoming infinite. ++ */ ++ unsigned int clflush_size = current_cpu_data.x86_clflush_size ?: 16; ++ const void *end = addr + size; ++ ++ addr -= (unsigned long)addr & (clflush_size - 1); ++ for ( ; addr < end; addr += clflush_size ) ++ { ++ /* ++ * Note regarding the "ds" prefix use: it's faster to do a clflush ++ * + prefix than a clflush + nop, and hence the prefix is added instead ++ * of letting the alternative framework fill the gap by appending nops. ++ */ ++ alternative_io("ds; clflush %[p]", ++ "data16 clflush %[p]", /* clflushopt */ ++ X86_FEATURE_CLFLUSHOPT, ++ /* no outputs */, ++ [p] "m" (*(const char *)(addr))); ++ } ++ ++ alternative("", "sfence", X86_FEATURE_CLFLUSHOPT); ++} ++ ++void cache_writeback(const void *addr, unsigned int size) ++{ ++ unsigned int clflush_size; ++ const void *end = addr + size; ++ ++ /* Fall back to CLFLUSH{,OPT} when CLWB isn't available. */ ++ if ( !boot_cpu_has(X86_FEATURE_CLWB) ) ++ return cache_flush(addr, size); ++ ++ /* ++ * This function may be called before current_cpu_data is established. ++ * Hence a fallback is needed to prevent the loop below becoming infinite. ++ */ ++ clflush_size = current_cpu_data.x86_clflush_size ?: 16; ++ addr -= (unsigned long)addr & (clflush_size - 1); ++ for ( ; addr < end; addr += clflush_size ) ++ { ++/* ++ * The arguments to a macro must not include preprocessor directives. Doing so ++ * results in undefined behavior, so we have to create some defines here in ++ * order to avoid it. 
++ */ ++#if defined(HAVE_AS_CLWB) ++# define CLWB_ENCODING "clwb %[p]" ++#elif defined(HAVE_AS_XSAVEOPT) ++# define CLWB_ENCODING "data16 xsaveopt %[p]" /* clwb */ ++#else ++# define CLWB_ENCODING ".byte 0x66, 0x0f, 0xae, 0x30" /* clwb (%%rax) */ ++#endif ++ ++#define BASE_INPUT(addr) [p] "m" (*(const char *)(addr)) ++#if defined(HAVE_AS_CLWB) || defined(HAVE_AS_XSAVEOPT) ++# define INPUT BASE_INPUT ++#else ++# define INPUT(addr) "a" (addr), BASE_INPUT(addr) ++#endif ++ ++ asm volatile (CLWB_ENCODING :: INPUT(addr)); ++ ++#undef INPUT ++#undef BASE_INPUT ++#undef CLWB_ENCODING ++ } ++ ++ asm volatile ("sfence" ::: "memory"); ++} ++ + unsigned int guest_flush_tlb_flags(const struct domain *d) + { + bool shadow = paging_mode_shadow(d); +diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c +index 66f8ce71741c..4c742cd8fe81 100644 +--- a/xen/common/grant_table.c ++++ b/xen/common/grant_table.c +@@ -3431,7 +3431,7 @@ gnttab_swap_grant_ref(XEN_GUEST_HANDLE_PARAM(gnttab_swap_grant_ref_t) uop, + return 0; + } + +-static int cache_flush(const gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref) ++static int _cache_flush(const gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref) + { + struct domain *d, *owner; + struct page_info *page; +@@ -3525,7 +3525,7 @@ gnttab_cache_flush(XEN_GUEST_HANDLE_PARAM(gnttab_cache_flush_t) uop, + return -EFAULT; + for ( ; ; ) + { +- int ret = cache_flush(&op, cur_ref); ++ int ret = _cache_flush(&op, cur_ref); + + if ( ret < 0 ) + return ret; +diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h +index 01e010a10d61..401079299725 100644 +--- a/xen/drivers/passthrough/vtd/extern.h ++++ b/xen/drivers/passthrough/vtd/extern.h +@@ -76,7 +76,6 @@ int __must_check qinval_device_iotlb_sync(struct vtd_iommu *iommu, + struct pci_dev *pdev, + u16 did, u16 size, u64 addr); + +-unsigned int get_cache_line_size(void); + void flush_all_cache(void); + + uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node); +diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c +index 8975c1de61bc..bc377c9bcfa4 100644 +--- a/xen/drivers/passthrough/vtd/iommu.c ++++ b/xen/drivers/passthrough/vtd/iommu.c +@@ -31,6 +31,7 @@ + #include <xen/pci.h> + #include <xen/pci_regs.h> + #include <xen/keyhandler.h> ++#include <asm/cache.h> + #include <asm/msi.h> + #include <asm/nops.h> + #include <asm/irq.h> +@@ -206,54 +207,6 @@ static void check_cleanup_domid_map(const struct domain *d, + } + } + +-static void sync_cache(const void *addr, unsigned int size) +-{ +- static unsigned long clflush_size = 0; +- const void *end = addr + size; +- +- if ( clflush_size == 0 ) +- clflush_size = get_cache_line_size(); +- +- addr -= (unsigned long)addr & (clflush_size - 1); +- for ( ; addr < end; addr += clflush_size ) +-/* +- * The arguments to a macro must not include preprocessor directives. Doing so +- * results in undefined behavior, so we have to create some defines here in +- * order to avoid it. 
+- */ +-#if defined(HAVE_AS_CLWB) +-# define CLWB_ENCODING "clwb %[p]" +-#elif defined(HAVE_AS_XSAVEOPT) +-# define CLWB_ENCODING "data16 xsaveopt %[p]" /* clwb */ +-#else +-# define CLWB_ENCODING ".byte 0x66, 0x0f, 0xae, 0x30" /* clwb (%%rax) */ +-#endif +- +-#define BASE_INPUT(addr) [p] "m" (*(const char *)(addr)) +-#if defined(HAVE_AS_CLWB) || defined(HAVE_AS_XSAVEOPT) +-# define INPUT BASE_INPUT +-#else +-# define INPUT(addr) "a" (addr), BASE_INPUT(addr) +-#endif +- /* +- * Note regarding the use of NOP_DS_PREFIX: it's faster to do a clflush +- * + prefix than a clflush + nop, and hence the prefix is added instead +- * of letting the alternative framework fill the gap by appending nops. +- */ +- alternative_io_2(".byte " __stringify(NOP_DS_PREFIX) "; clflush %[p]", +- "data16 clflush %[p]", /* clflushopt */ +- X86_FEATURE_CLFLUSHOPT, +- CLWB_ENCODING, +- X86_FEATURE_CLWB, /* no outputs */, +- INPUT(addr)); +-#undef INPUT +-#undef BASE_INPUT +-#undef CLWB_ENCODING +- +- alternative_2("", "sfence", X86_FEATURE_CLFLUSHOPT, +- "sfence", X86_FEATURE_CLWB); +-} +- + /* Allocate page table, return its machine address */ + uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node) + { +@@ -273,7 +226,7 @@ uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node) + clear_page(vaddr); + + if ( (iommu_ops.init ? &iommu_ops : &vtd_ops)->sync_cache ) +- sync_cache(vaddr, PAGE_SIZE); ++ cache_writeback(vaddr, PAGE_SIZE); + unmap_domain_page(vaddr); + cur_pg++; + } +@@ -1305,7 +1258,7 @@ int __init iommu_alloc(struct acpi_drhd_unit *drhd) + iommu->nr_pt_levels = agaw_to_level(agaw); + + if ( !ecap_coherent(iommu->ecap) ) +- vtd_ops.sync_cache = sync_cache; ++ vtd_ops.sync_cache = cache_writeback; + + /* allocate domain id bitmap */ + iommu->domid_bitmap = xzalloc_array(unsigned long, BITS_TO_LONGS(nr_dom)); +diff --git a/xen/drivers/passthrough/vtd/x86/vtd.c b/xen/drivers/passthrough/vtd/x86/vtd.c +index 6681dccd6970..55f0faa521cb 100644 +--- a/xen/drivers/passthrough/vtd/x86/vtd.c ++++ b/xen/drivers/passthrough/vtd/x86/vtd.c +@@ -47,11 +47,6 @@ void unmap_vtd_domain_page(const void *va) + unmap_domain_page(va); + } + +-unsigned int get_cache_line_size(void) +-{ +- return ((cpuid_ebx(1) >> 8) & 0xff) * 8; +-} +- + void flush_all_cache() + { + wbinvd(); +diff --git a/xen/include/asm-x86/cache.h b/xen/include/asm-x86/cache.h +index 1f7173d8c72c..e4770efb22b9 100644 +--- a/xen/include/asm-x86/cache.h ++++ b/xen/include/asm-x86/cache.h +@@ -11,4 +11,11 @@ + + #define __read_mostly __section(".data.read_mostly") + ++#ifndef __ASSEMBLY__ ++ ++void cache_flush(const void *addr, unsigned int size); ++void cache_writeback(const void *addr, unsigned int size); ++ ++#endif ++ + #endif diff --git a/system/xen/xsa/xsa402-4.16-4.patch b/system/xen/xsa/xsa402-4.16-4.patch new file mode 100644 index 000000000000..21109225d7d7 --- /dev/null +++ b/system/xen/xsa/xsa402-4.16-4.patch @@ -0,0 +1,83 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86/amd: Work around CLFLUSH ordering on older parts + +On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakely ordered with everything, +including reads and writes to the address, and LFENCE/SFENCE instructions. + +This creates a multitude of problematic corner cases, laid out in the manual. +Arrange to use MFENCE on both sides of the CLFLUSH to force proper ordering. + +This is part of XSA-402. 
+ +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> + +diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c +index a8e37dbb1f5c..b3b9a0df5fed 100644 +--- a/xen/arch/x86/cpu/amd.c ++++ b/xen/arch/x86/cpu/amd.c +@@ -812,6 +812,14 @@ static void init_amd(struct cpuinfo_x86 *c) + if (!cpu_has_lfence_dispatch) + __set_bit(X86_FEATURE_MFENCE_RDTSC, c->x86_capability); + ++ /* ++ * On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with ++ * everything, including reads and writes to address, and ++ * LFENCE/SFENCE instructions. ++ */ ++ if (!cpu_has_clflushopt) ++ setup_force_cpu_cap(X86_BUG_CLFLUSH_MFENCE); ++ + switch(c->x86) + { + case 0xf ... 0x11: +diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c +index 0c912b8669f8..dcbb4064012e 100644 +--- a/xen/arch/x86/flushtlb.c ++++ b/xen/arch/x86/flushtlb.c +@@ -259,6 +259,13 @@ unsigned int flush_area_local(const void *va, unsigned int flags) + return flags; + } + ++/* ++ * On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with everything, ++ * including reads and writes to address, and LFENCE/SFENCE instructions. ++ * ++ * This function only works safely after alternatives have run. Luckily, at ++ * the time of writing, we don't flush the caches that early. ++ */ + void cache_flush(const void *addr, unsigned int size) + { + /* +@@ -268,6 +275,8 @@ void cache_flush(const void *addr, unsigned int size) + unsigned int clflush_size = current_cpu_data.x86_clflush_size ?: 16; + const void *end = addr + size; + ++ alternative("", "mfence", X86_BUG_CLFLUSH_MFENCE); ++ + addr -= (unsigned long)addr & (clflush_size - 1); + for ( ; addr < end; addr += clflush_size ) + { +@@ -283,7 +292,9 @@ void cache_flush(const void *addr, unsigned int size) + [p] "m" (*(const char *)(addr))); + } + +- alternative("", "sfence", X86_FEATURE_CLFLUSHOPT); ++ alternative_2("", ++ "sfence", X86_FEATURE_CLFLUSHOPT, ++ "mfence", X86_BUG_CLFLUSH_MFENCE); + } + + void cache_writeback(const void *addr, unsigned int size) +diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h +index 7413febd7ad8..ff3157d52d13 100644 +--- a/xen/include/asm-x86/cpufeatures.h ++++ b/xen/include/asm-x86/cpufeatures.h +@@ -47,6 +47,7 @@ XEN_CPUFEATURE(XEN_IBT, X86_SYNTH(27)) /* Xen uses CET Indirect Branch + + #define X86_BUG_FPU_PTRS X86_BUG( 0) /* (F)X{SAVE,RSTOR} doesn't save/restore FOP/FIP/FDP. */ + #define X86_BUG_NULL_SEG X86_BUG( 1) /* NULL-ing a selector preserves the base and limit. */ ++#define X86_BUG_CLFLUSH_MFENCE X86_BUG( 2) /* MFENCE needed to serialise CLFLUSH */ + + /* Total number of capability words, inc synth and bug words. */ + #define NCAPINTS (FSCAPINTS + X86_NR_SYNTH + X86_NR_BUG) /* N 32-bit words worth of info */ diff --git a/system/xen/xsa/xsa402-4.16-5.patch b/system/xen/xsa/xsa402-4.16-5.patch new file mode 100644 index 000000000000..4806d25c6fa5 --- /dev/null +++ b/system/xen/xsa/xsa402-4.16-5.patch @@ -0,0 +1,148 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86/pv: Track and flush non-coherent mappings of RAM + +There are legitimate uses of WC mappings of RAM, e.g. for DMA buffers with +devices that make non-coherent writes. The Linux sound subsystem makes +extensive use of this technique. + +For such usecases, the guest's DMA buffer is mapped and consistently used as +WC, and Xen doesn't interact with the buffer. 
+ +However, a mischevious guest can use WC mappings to deliberately create +non-coherency between the cache and RAM, and use this to trick Xen into +validating a pagetable which isn't actually safe. + +Allocate a new PGT_non_coherent to track the non-coherency of mappings. Set +it whenever a non-coherent writeable mapping is created. If the page is used +as anything other than PGT_writable_page, force a cache flush before +validation. Also force a cache flush before the page is returned to the heap. + +This is CVE-2022-26364, part of XSA-402. + +Reported-by: Jann Horn <jannh@google.com> +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: George Dunlap <george.dunlap@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index ab32d13a1a0d..bab9624fabb7 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -997,6 +997,15 @@ get_page_from_l1e( + return -EACCES; + } + ++ /* ++ * Track writeable non-coherent mappings to RAM pages, to trigger a cache ++ * flush later if the target is used as anything but a PGT_writeable page. ++ * We care about all writeable mappings, including foreign mappings. ++ */ ++ if ( !boot_cpu_has(X86_FEATURE_XEN_SELFSNOOP) && ++ (l1f & (PAGE_CACHE_ATTRS | _PAGE_RW)) == (_PAGE_WC | _PAGE_RW) ) ++ set_bit(_PGT_non_coherent, &page->u.inuse.type_info); ++ + return 0; + + could_not_pin: +@@ -2454,6 +2463,19 @@ static int cleanup_page_mappings(struct page_info *page) + } + } + ++ /* ++ * Flush the cache if there were previously non-coherent writeable ++ * mappings of this page. This forces the page to be coherent before it ++ * is freed back to the heap. ++ */ ++ if ( __test_and_clear_bit(_PGT_non_coherent, &page->u.inuse.type_info) ) ++ { ++ void *addr = __map_domain_page(page); ++ ++ cache_flush(addr, PAGE_SIZE); ++ unmap_domain_page(addr); ++ } ++ + return rc; + } + +@@ -3028,6 +3050,22 @@ static int _get_page_type(struct page_info *page, unsigned long type, + if ( unlikely(!(nx & PGT_validated)) ) + { + /* ++ * Flush the cache if there were previously non-coherent mappings of ++ * this page, and we're trying to use it as anything other than a ++ * writeable page. This forces the page to be coherent before we ++ * validate its contents for safety. ++ */ ++ if ( (nx & PGT_non_coherent) && type != PGT_writable_page ) ++ { ++ void *addr = __map_domain_page(page); ++ ++ cache_flush(addr, PAGE_SIZE); ++ unmap_domain_page(addr); ++ ++ page->u.inuse.type_info &= ~PGT_non_coherent; ++ } ++ ++ /* + * No special validation needed for writable or shared pages. Page + * tables and GDT/LDT need to have their contents audited. + * +diff --git a/xen/arch/x86/pv/grant_table.c b/xen/arch/x86/pv/grant_table.c +index 0325618c9883..81c72e61ed55 100644 +--- a/xen/arch/x86/pv/grant_table.c ++++ b/xen/arch/x86/pv/grant_table.c +@@ -109,7 +109,17 @@ int create_grant_pv_mapping(uint64_t addr, mfn_t frame, + + ol1e = *pl1e; + if ( UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, 0) ) ++ { ++ /* ++ * We always create mappings in this path. However, our caller, ++ * map_grant_ref(), only passes potentially non-zero cache_flags for ++ * MMIO frames, so this path doesn't create non-coherent mappings of ++ * RAM frames and there's no need to calculate PGT_non_coherent. 
++ */ ++ ASSERT(!cache_flags || is_iomem_page(frame)); ++ + rc = GNTST_okay; ++ } + + out_unlock: + page_unlock(page); +@@ -294,7 +304,18 @@ int replace_grant_pv_mapping(uint64_t addr, mfn_t frame, + l1e_get_flags(ol1e), addr, grant_pte_flags); + + if ( UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, 0) ) ++ { ++ /* ++ * Generally, replace_grant_pv_mapping() is used to destroy mappings ++ * (n1le = l1e_empty()), but it can be a present mapping on the ++ * GNTABOP_unmap_and_replace path. ++ * ++ * In such cases, the PTE is fully transplanted from its old location ++ * via steal_linear_addr(), so we need not perform PGT_non_coherent ++ * checking here. ++ */ + rc = GNTST_okay; ++ } + + out_unlock: + page_unlock(page); +diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h +index 8a9a43bb0a9d..7464167ae192 100644 +--- a/xen/include/asm-x86/mm.h ++++ b/xen/include/asm-x86/mm.h +@@ -53,8 +53,12 @@ + #define _PGT_partial PG_shift(8) + #define PGT_partial PG_mask(1, 8) + ++/* Has this page been mapped writeable with a non-coherent memory type? */ ++#define _PGT_non_coherent PG_shift(9) ++#define PGT_non_coherent PG_mask(1, 9) ++ + /* Count of uses of this frame as its current type. */ +-#define PGT_count_width PG_shift(8) ++#define PGT_count_width PG_shift(9) + #define PGT_count_mask ((1UL<<PGT_count_width)-1) + + /* Are the 'type mask' bits identical? */ |
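
Editor's note: the ordering rule stated in xsa401-4.16-2.patch — take the typeref with cmpxchg() first, and only then perform TLB flushing and the other safety actions — can be illustrated outside of Xen. Below is a minimal userspace C11 sketch of that pattern, not Xen code and not part of these patches: the TYPE_MASK/COUNT_MASK layout, get_type_ref() and the flush_stale_mappings() stand-in are invented for illustration and do not correspond to Xen's PGT_* encoding or to _get_page_type() itself.

/*
 * Minimal sketch (assumed layout, NOT Xen code) of "claim the reference
 * first, do safety actions second", as described in xsa401-4.16-2.patch.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define TYPE_MASK   0xff00u   /* hypothetical "type" bits     */
#define COUNT_MASK  0x00ffu   /* hypothetical reference count */

static _Atomic uint32_t type_info = 0;   /* count 0, no type yet */

static void flush_stale_mappings(void)
{
    /* Placeholder for the expensive safety action (the TLB flush in Xen). */
    puts("flushing stale mappings");
}

static int get_type_ref(uint32_t type)
{
    uint32_t x = atomic_load(&type_info);

    for ( ;; )
    {
        uint32_t nx = x + 1;                 /* take one reference */

        if ( (nx & COUNT_MASK) == 0 )
            return -1;                       /* count overflow */

        if ( (x & COUNT_MASK) == 0 )         /* count 0 -> 1: type may change */
            nx = (nx & ~TYPE_MASK) | type;
        else if ( (x & TYPE_MASK) != type )
            return -1;                       /* wrong type, refuse */

        /* Publish the new type/count first ... */
        if ( atomic_compare_exchange_weak(&type_info, &x, nx) )
        {
            /*
             * ... and only now, when no concurrent caller can any longer
             * acquire a reference to the old type, perform the safety
             * action.  Doing this before the exchange succeeds is the race
             * window the patch closes.
             */
            if ( ((x & COUNT_MASK) == 0) && ((x & TYPE_MASK) != type) )
                flush_stale_mappings();
            return 0;
        }
        /* x was reloaded by the failed compare-exchange; retry. */
    }
}

int main(void)
{
    if ( get_type_ref(0x0100u) == 0 )
        printf("type_info now %#x\n", (unsigned)atomic_load(&type_info));
    return 0;
}

The sketch mirrors the patch's reasoning only: once the compare-exchange has made the new type and non-zero count globally visible, a concurrent caller can no longer take a reference under the old type, so the flush performed afterwards cannot be undermined. The real fix additionally defers IOMMU synchronisation and the PGT_validated handling, which this toy model does not attempt to show.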