Diffstat (limited to 'system/xen/xsa')
-rw-r--r-- | system/xen/xsa/xsa385-4.15.patch   |  96
-rw-r--r-- | system/xen/xsa/xsa388-4.15-1.patch | 174
-rw-r--r-- | system/xen/xsa/xsa388-4.15-2.patch |  36
-rw-r--r-- | system/xen/xsa/xsa389-4.15.patch   | 182
-rw-r--r-- | system/xen/xsa/xsa390.patch        |  46
5 files changed, 534 insertions, 0 deletions
diff --git a/system/xen/xsa/xsa385-4.15.patch b/system/xen/xsa/xsa385-4.15.patch
new file mode 100644
index 0000000000000..69b97049b5f45
--- /dev/null
+++ b/system/xen/xsa/xsa385-4.15.patch
@@ -0,0 +1,96 @@
+From: Julien Grall <jgrall@amazon.com>
+Subject: xen/page_alloc: Harden assign_pages()
+
+domain_tot_pages() and d->max_pages are 32-bit values. While the order
+should always be quite small, it would still be possible to overflow
+if domain_tot_pages() is near to (2^32 - 1).
+
+As this code may be called by a guest via XENMEM_increase_reservation
+and XENMEM_populate_physmap, we want to make sure the guest is not going
+to be able to allocate more than it is allowed.
+
+Rework the allocation check to avoid any possible overflow. While the
+check domain_tot_pages() < d->max_pages should technically not be
+necessary, it is probably best to have it to catch any possible
+inconsistencies in the future.
+
+This is CVE-2021-28706 / XSA-385.
+
+Signed-off-by: Julien Grall <jgrall@amazon.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+
+--- a/xen/common/grant_table.c
++++ b/xen/common/grant_table.c
+@@ -2336,7 +2336,8 @@ gnttab_transfer(
+          * pages when it is dying.
+          */
+         if ( unlikely(e->is_dying) ||
+-             unlikely(domain_tot_pages(e) >= e->max_pages) )
++             unlikely(domain_tot_pages(e) >= e->max_pages) ||
++             unlikely(!(e->tot_pages + 1)) )
+         {
+             spin_unlock(&e->page_alloc_lock);
+ 
+@@ -2345,8 +2346,8 @@ gnttab_transfer(
+                          e->domain_id);
+             else
+                 gdprintk(XENLOG_INFO,
+-                         "Transferee d%d has no headroom (tot %u, max %u)\n",
+-                         e->domain_id, domain_tot_pages(e), e->max_pages);
++                         "Transferee %pd has no headroom (tot %u, max %u, ex %u)\n",
++                         e, domain_tot_pages(e), e->max_pages, e->extra_pages);
+ 
+             gop.status = GNTST_general_error;
+             goto unlock_and_copyback;
+--- a/xen/common/page_alloc.c
++++ b/xen/common/page_alloc.c
+@@ -2298,20 +2298,43 @@ int assign_pages(
+     }
+     else if ( !(memflags & MEMF_no_refcount) )
+     {
+-        unsigned int tot_pages = domain_tot_pages(d) + (1 << order);
++        unsigned int tot_pages = domain_tot_pages(d), nr = 1u << order;
+ 
+         if ( unlikely(tot_pages > d->max_pages) )
+         {
+-            gprintk(XENLOG_INFO, "Over-allocation for domain %u: "
+-                    "%u > %u\n", d->domain_id, tot_pages, d->max_pages);
++            gprintk(XENLOG_INFO, "Inconsistent allocation for %pd: %u > %u\n",
++                    d, tot_pages, d->max_pages);
++            rc = -EPERM;
++            goto out;
++        }
++
++        if ( unlikely(nr > d->max_pages - tot_pages) )
++        {
++            gprintk(XENLOG_INFO, "Over-allocation for %pd: %Lu > %u\n",
++                    d, tot_pages + 0ull + nr, d->max_pages);
+             rc = -E2BIG;
+             goto out;
+         }
+     }
+ 
+-    if ( !(memflags & MEMF_no_refcount) &&
+-         unlikely(domain_adjust_tot_pages(d, 1 << order) == (1 << order)) )
+-        get_knownalive_domain(d);
++    if ( !(memflags & MEMF_no_refcount) )
++    {
++        unsigned int nr = 1u << order;
++
++        if ( unlikely(d->tot_pages + nr < nr) )
++        {
++            gprintk(XENLOG_INFO,
++                    "Excess allocation for %pd: %Lu (%u extra)\n",
++                    d, d->tot_pages + 0ull + nr, d->extra_pages);
++            if ( pg[0].count_info & PGC_extra )
++                d->extra_pages -= nr;
++            rc = -E2BIG;
++            goto out;
++        }
++
++        if ( unlikely(domain_adjust_tot_pages(d, nr) == nr) )
++            get_knownalive_domain(d);
++    }
+ 
+     for ( i = 0; i < (1 << order); i++ )
+     {
diff --git a/system/xen/xsa/xsa388-4.15-1.patch b/system/xen/xsa/xsa388-4.15-1.patch
new file mode 100644
index 0000000000000..b4d900336b485
--- /dev/null
+++ b/system/xen/xsa/xsa388-4.15-1.patch
@@ -0,0 +1,174 @@
+From: Jan Beulich <jbeulich@suse.com>
+Subject: x86/PoD: deal with misaligned GFNs
+
+Users of XENMEM_decrease_reservation and XENMEM_populate_physmap aren't
+required to pass in order-aligned GFN values. (While I consider this
+bogus, I don't think we can fix this there, as that might break existing
+code, e.g Linux'es swiotlb, which - while affecting PV only - until
+recently had been enforcing only page alignment on the original
+allocation.) Only non-PoD code paths (guest_physmap_{add,remove}_page(),
+p2m_set_entry()) look to be dealing with this properly (in part by being
+implemented inefficiently, handling every 4k page separately).
+
+Introduce wrappers taking care of splitting the incoming request into
+aligned chunks, without putting much effort in trying to determine the
+largest possible chunk at every iteration.
+
+Also "handle" p2m_set_entry() failure for non-order-0 requests by
+crashing the domain in one more place. Alongside putting a log message
+there, also add one to the other similar path.
+
+Note regarding locking: This is left in the actual worker functions on
+the assumption that callers aren't guaranteed atomicity wrt acting on
+multiple pages at a time. For mis-aligned GFNs gfn_lock() wouldn't have
+locked the correct GFN range anyway, if it didn't simply resolve to
+p2m_lock(), and for well-behaved callers there continues to be only a
+single iteration, i.e. behavior is unchanged for them. (FTAOD pulling
+out just pod_lock() into p2m_pod_decrease_reservation() would result in
+a lock order violation.)
+
+This is CVE-2021-28704 and CVE-2021-28707 / part of XSA-388.
+
+Fixes: 3c352011c0d3 ("x86/PoD: shorten certain operations on higher order ranges")
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+
+--- a/xen/arch/x86/mm/p2m-pod.c
++++ b/xen/arch/x86/mm/p2m-pod.c
+@@ -496,7 +496,7 @@ p2m_pod_zero_check_superpage(struct p2m_
+ 
+ 
+ /*
+- * This function is needed for two reasons:
++ * This pair of functions is needed for two reasons:
+  * + To properly handle clearing of PoD entries
+  * + To "steal back" memory being freed for the PoD cache, rather than
+  *   releasing it.
+@@ -504,8 +504,8 @@ p2m_pod_zero_check_superpage(struct p2m_
+  * Once both of these functions have been completed, we can return and
+  * allow decrease_reservation() to handle everything else.
+  */
+-unsigned long
+-p2m_pod_decrease_reservation(struct domain *d, gfn_t gfn, unsigned int order)
++static unsigned long
++decrease_reservation(struct domain *d, gfn_t gfn, unsigned int order)
+ {
+     unsigned long ret = 0, i, n;
+     struct p2m_domain *p2m = p2m_get_hostp2m(d);
+@@ -552,8 +552,10 @@ p2m_pod_decrease_reservation(struct doma
+          * All PoD: Mark the whole region invalid and tell caller
+          * we're done.
+          */
+-        if ( p2m_set_entry(p2m, gfn, INVALID_MFN, order, p2m_invalid,
+-                           p2m->default_access) )
++        int rc = p2m_set_entry(p2m, gfn, INVALID_MFN, order, p2m_invalid,
++                               p2m->default_access);
++
++        if ( rc )
+         {
+             /*
+              * If this fails, we can't tell how much of the range was changed.
+@@ -561,7 +563,12 @@ p2m_pod_decrease_reservation(struct doma
+              * impossible.
+              */
+             if ( order != 0 )
++            {
++                printk(XENLOG_G_ERR
++                       "%pd: marking GFN %#lx (order %u) as non-PoD failed: %d\n",
++                       d, gfn_x(gfn), order, rc);
+                 domain_crash(d);
++            }
+             goto out_unlock;
+         }
+         ret = 1UL << order;
+@@ -670,6 +677,22 @@ out_unlock:
+     return ret;
+ }
+ 
++unsigned long
++p2m_pod_decrease_reservation(struct domain *d, gfn_t gfn, unsigned int order)
++{
++    unsigned long left = 1UL << order, ret = 0;
++    unsigned int chunk_order = find_first_set_bit(gfn_x(gfn) | left);
++
++    do {
++        ret += decrease_reservation(d, gfn, chunk_order);
++
++        left -= 1UL << chunk_order;
++        gfn = gfn_add(gfn, 1UL << chunk_order);
++    } while ( left );
++
++    return ret;
++}
++
+ void p2m_pod_dump_data(struct domain *d)
+ {
+     struct p2m_domain *p2m = p2m_get_hostp2m(d);
+@@ -1273,19 +1296,15 @@ remap_and_retry:
+     return true;
+ }
+ 
+-
+-int
+-guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn_l,
+-                                      unsigned int order)
++static int
++mark_populate_on_demand(struct domain *d, unsigned long gfn_l,
++                        unsigned int order)
+ {
+     struct p2m_domain *p2m = p2m_get_hostp2m(d);
+     gfn_t gfn = _gfn(gfn_l);
+     unsigned long i, n, pod_count = 0;
+     int rc = 0;
+ 
+-    if ( !paging_mode_translate(d) )
+-        return -EINVAL;
+-
+     gfn_lock(p2m, gfn, order);
+ 
+     P2M_DEBUG("mark pod gfn=%#lx\n", gfn_l);
+@@ -1325,12 +1344,44 @@ guest_physmap_mark_populate_on_demand(st
+ 
+         ioreq_request_mapcache_invalidate(d);
+     }
++    else if ( order )
++    {
++        /*
++         * If this failed, we can't tell how much of the range was changed.
++         * Best to crash the domain.
++         */
++        printk(XENLOG_G_ERR
++               "%pd: marking GFN %#lx (order %u) as PoD failed: %d\n",
++               d, gfn_l, order, rc);
++        domain_crash(d);
++    }
+ 
+ out:
+     gfn_unlock(p2m, gfn, order);
+ 
+     return rc;
+ }
++
++int
++guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn,
++                                      unsigned int order)
++{
++    unsigned long left = 1UL << order;
++    unsigned int chunk_order = find_first_set_bit(gfn | left);
++    int rc;
++
++    if ( !paging_mode_translate(d) )
++        return -EINVAL;
++
++    do {
++        rc = mark_populate_on_demand(d, gfn, chunk_order);
++
++        left -= 1UL << chunk_order;
++        gfn += 1UL << chunk_order;
++    } while ( !rc && left );
++
++    return rc;
++}
+ 
+ void p2m_pod_init(struct p2m_domain *p2m)
+ {
diff --git a/system/xen/xsa/xsa388-4.15-2.patch b/system/xen/xsa/xsa388-4.15-2.patch
new file mode 100644
index 0000000000000..ccccb20263b22
--- /dev/null
+++ b/system/xen/xsa/xsa388-4.15-2.patch
@@ -0,0 +1,36 @@
+From: Jan Beulich <jbeulich@suse.com>
+Subject: x86/PoD: handle intermediate page orders in p2m_pod_cache_add()
+
+p2m_pod_decrease_reservation() may pass pages to the function which
+aren't 4k, 2M, or 1G. Handle all intermediate orders as well, to avoid
+hitting the BUG() at the switch() statement's "default" case.
+
+This is CVE-2021-28708 / part of XSA-388.
+
+Fixes: 3c352011c0d3 ("x86/PoD: shorten certain operations on higher order ranges")
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+
+--- a/xen/arch/x86/mm/p2m-pod.c
++++ b/xen/arch/x86/mm/p2m-pod.c
+@@ -112,15 +112,13 @@ p2m_pod_cache_add(struct p2m_domain *p2m
+     /* Then add to the appropriate populate-on-demand list. */
+     switch ( order )
+     {
+-    case PAGE_ORDER_1G:
+-        for ( i = 0; i < (1UL << PAGE_ORDER_1G); i += 1UL << PAGE_ORDER_2M )
++    case PAGE_ORDER_2M ... PAGE_ORDER_1G:
++        for ( i = 0; i < (1UL << order); i += 1UL << PAGE_ORDER_2M )
+             page_list_add_tail(page + i, &p2m->pod.super);
+         break;
+-    case PAGE_ORDER_2M:
+-        page_list_add_tail(page, &p2m->pod.super);
+-        break;
+-    case PAGE_ORDER_4K:
+-        page_list_add_tail(page, &p2m->pod.single);
++    case PAGE_ORDER_4K ... PAGE_ORDER_2M - 1:
++        for ( i = 0; i < (1UL << order); i += 1UL << PAGE_ORDER_4K )
++            page_list_add_tail(page + i, &p2m->pod.single);
+         break;
+     default:
+         BUG();
diff --git a/system/xen/xsa/xsa389-4.15.patch b/system/xen/xsa/xsa389-4.15.patch
new file mode 100644
index 0000000000000..402a38e2d4ebb
--- /dev/null
+++ b/system/xen/xsa/xsa389-4.15.patch
@@ -0,0 +1,182 @@
+From: Jan Beulich <jbeulich@suse.com>
+Subject: x86/P2M: deal with partial success of p2m_set_entry()
+
+M2P and PoD stats need to remain in sync with P2M; if an update succeeds
+only partially, respective adjustments need to be made. If updates get
+made before the call, they may also need undoing upon complete failure
+(i.e. including the single-page case).
+
+Log-dirty state would better also be kept in sync.
+
+Note that the change to set_typed_p2m_entry() may not be strictly
+necessary (due to the order restriction enforced near the top of the
+function), but is being kept here to be on the safe side.
+
+This is CVE-2021-28705 and CVE-2021-28709 / XSA-389.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+
+--- a/xen/arch/x86/mm/p2m.c
++++ b/xen/arch/x86/mm/p2m.c
+@@ -784,6 +784,7 @@ p2m_remove_page(struct p2m_domain *p2m,
+     unsigned long i;
+     p2m_type_t t;
+     p2m_access_t a;
++    int rc;
+ 
+     /* IOMMU for PV guests is handled in get_page_type() and put_page(). */
+     if ( !paging_mode_translate(p2m->domain) )
+@@ -819,8 +820,27 @@ p2m_remove_page(struct p2m_domain *p2m,
+ 
+     ioreq_request_mapcache_invalidate(p2m->domain);
+ 
+-    return p2m_set_entry(p2m, gfn, INVALID_MFN, page_order, p2m_invalid,
+-                         p2m->default_access);
++    rc = p2m_set_entry(p2m, gfn, INVALID_MFN, page_order, p2m_invalid,
++                       p2m->default_access);
++    if ( likely(!rc) || !mfn_valid(mfn) )
++        return rc;
++
++    /*
++     * The operation may have partially succeeded. For the failed part we need
++     * to undo the M2P update and, out of precaution, mark the pages dirty
++     * again.
++     */
++    for ( i = 0; i < (1UL << page_order); ++i )
++    {
++        p2m->get_entry(p2m, gfn_add(gfn, i), &t, &a, 0, NULL, NULL);
++        if ( !p2m_is_hole(t) && !p2m_is_special(t) && !p2m_is_shared(t) )
++        {
++            set_gpfn_from_mfn(mfn_x(mfn) + i, gfn_x(gfn) + i);
++            paging_mark_pfn_dirty(p2m->domain, _pfn(gfn_x(gfn) + i));
++        }
++    }
++
++    return rc;
+ }
+ 
+ int
+@@ -1009,13 +1029,8 @@ guest_physmap_add_entry(struct domain *d
+ 
+     /* Now, actually do the two-way mapping */
+     rc = p2m_set_entry(p2m, gfn, mfn, page_order, t, p2m->default_access);
+-    if ( rc == 0 )
++    if ( likely(!rc) )
+     {
+-        pod_lock(p2m);
+-        p2m->pod.entry_count -= pod_count;
+-        BUG_ON(p2m->pod.entry_count < 0);
+-        pod_unlock(p2m);
+-
+         if ( !p2m_is_grant(t) )
+         {
+             for ( i = 0; i < (1UL << page_order); i++ )
+@@ -1023,6 +1038,42 @@ guest_physmap_add_entry(struct domain *d
+                                       gfn_x(gfn_add(gfn, i)));
+         }
+     }
++    else
++    {
++        /*
++         * The operation may have partially succeeded. For the successful part
++         * we need to update M2P and dirty state, while for the failed part we
++         * may need to adjust PoD stats as well as undo the earlier M2P update.
++         */
++        for ( i = 0; i < (1UL << page_order); ++i )
++        {
++            omfn = p2m->get_entry(p2m, gfn_add(gfn, i), &ot, &a, 0, NULL, NULL);
++            if ( p2m_is_pod(ot) )
++            {
++                BUG_ON(!pod_count);
++                --pod_count;
++            }
++            else if ( mfn_eq(omfn, mfn_add(mfn, i)) && ot == t &&
++                      a == p2m->default_access && !p2m_is_grant(t) )
++            {
++                set_gpfn_from_mfn(mfn_x(omfn), gfn_x(gfn) + i);
++                paging_mark_pfn_dirty(d, _pfn(gfn_x(gfn) + i));
++            }
++            else if ( p2m_is_ram(ot) && !p2m_is_paged(ot) )
++            {
++                ASSERT(mfn_valid(omfn));
++                set_gpfn_from_mfn(mfn_x(omfn), gfn_x(gfn) + i);
++            }
++        }
++    }
++
++    if ( pod_count )
++    {
++        pod_lock(p2m);
++        p2m->pod.entry_count -= pod_count;
++        BUG_ON(p2m->pod.entry_count < 0);
++        pod_unlock(p2m);
++    }
+ 
+  out:
+     p2m_unlock(p2m);
+@@ -1314,6 +1365,51 @@ static int set_typed_p2m_entry(struct do
+             return 0;
+         }
+     }
++
++    P2M_DEBUG("set %d %lx %lx\n", gfn_p2mt, gfn_l, mfn_x(mfn));
++    rc = p2m_set_entry(p2m, gfn, mfn, order, gfn_p2mt, access);
++    if ( unlikely(rc) )
++    {
++        gdprintk(XENLOG_ERR, "p2m_set_entry: %#lx:%u -> %d (0x%"PRI_mfn")\n",
++                 gfn_l, order, rc, mfn_x(mfn));
++
++        /*
++         * The operation may have partially succeeded. For the successful part
++         * we need to update PoD stats, M2P, and dirty state.
++         */
++        if ( order != PAGE_ORDER_4K )
++        {
++            unsigned long i;
++
++            for ( i = 0; i < (1UL << order); ++i )
++            {
++                p2m_type_t t;
++                mfn_t cmfn = p2m->get_entry(p2m, gfn_add(gfn, i), &t, &a, 0,
++                                            NULL, NULL);
++
++                if ( !mfn_eq(cmfn, mfn_add(mfn, i)) || t != gfn_p2mt ||
++                     a != access )
++                    continue;
++
++                if ( p2m_is_ram(ot) )
++                {
++                    ASSERT(mfn_valid(mfn_add(omfn, i)));
++                    set_gpfn_from_mfn(mfn_x(omfn) + i, INVALID_M2P_ENTRY);
++
++                    ioreq_request_mapcache_invalidate(d);
++                }
++#ifdef CONFIG_HVM
++                else if ( p2m_is_pod(ot) )
++                {
++                    pod_lock(p2m);
++                    BUG_ON(!p2m->pod.entry_count);
++                    --p2m->pod.entry_count;
++                    pod_unlock(p2m);
++                }
++#endif
++            }
++        }
++    }
+     else if ( p2m_is_ram(ot) )
+     {
+         unsigned long i;
+@@ -1326,12 +1422,6 @@ static int set_typed_p2m_entry(struct do
+ 
+         ioreq_request_mapcache_invalidate(d);
+     }
+-
+-    P2M_DEBUG("set %d %lx %lx\n", gfn_p2mt, gfn_l, mfn_x(mfn));
+-    rc = p2m_set_entry(p2m, gfn, mfn, order, gfn_p2mt, access);
+-    if ( rc )
+-        gdprintk(XENLOG_ERR, "p2m_set_entry: %#lx:%u -> %d (0x%"PRI_mfn")\n",
+-                 gfn_l, order, rc, mfn_x(mfn));
+ #ifdef CONFIG_HVM
+     else if ( p2m_is_pod(ot) )
+     {
diff --git a/system/xen/xsa/xsa390.patch b/system/xen/xsa/xsa390.patch
new file mode 100644
index 0000000000000..3c008a9bc2589
--- /dev/null
+++ b/system/xen/xsa/xsa390.patch
@@ -0,0 +1,46 @@
+From: Jan Beulich <jbeulich@suse.com>
+Subject: VT-d: fix reduced page table levels support when sharing tables
+
+domain_pgd_maddr() contains logic to adjust the root address to be put
+in the context entry in case 4-level page tables aren't supported by an
+IOMMU. This logic may not be bypassed when sharing page tables.
+
+This is CVE-2021-28710 / XSA-390.
+
+Fixes: 25ccd093425c ("iommu: remove the share_p2m operation")
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+
+--- a/xen/drivers/passthrough/vtd/iommu.c
++++ b/xen/drivers/passthrough/vtd/iommu.c
+@@ -340,19 +340,21 @@ static uint64_t domain_pgd_maddr(struct
+     {
+         pagetable_t pgt = p2m_get_pagetable(p2m_get_hostp2m(d));
+ 
+-        return pagetable_get_paddr(pgt);
++        pgd_maddr = pagetable_get_paddr(pgt);
+     }
+-
+-    if ( !hd->arch.vtd.pgd_maddr )
++    else
+     {
+-        /* Ensure we have pagetables allocated down to leaf PTE. */
+-        addr_to_dma_page_maddr(d, 0, 1);
+-
+         if ( !hd->arch.vtd.pgd_maddr )
+-            return 0;
+-    }
++        {
++            /* Ensure we have pagetables allocated down to leaf PTE. */
++            addr_to_dma_page_maddr(d, 0, 1);
+ 
+-    pgd_maddr = hd->arch.vtd.pgd_maddr;
++            if ( !hd->arch.vtd.pgd_maddr )
++                return 0;
++        }
++
++        pgd_maddr = hd->arch.vtd.pgd_maddr;
++    }
+ 
+     /* Skip top levels of page tables for 2- and 3-level DRHDs. */
+     for ( agaw = level_to_agaw(4);
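
Reference note (not part of the repository or of the patches above): a minimal, standalone C sketch of the chunk-splitting idea used by the XSA-388 wrappers. A possibly mis-aligned (gfn, order) request is walked in chunks whose order is the lowest set bit of gfn | (1UL << order), so every chunk stays naturally aligned. walk_chunks() and lowest_set_bit() are made-up names standing in for the patched Xen functions and for Xen's find_first_set_bit().

/*
 * Illustrative sketch only -- not Xen code.  Compile with a C compiler
 * providing the GCC/Clang builtin __builtin_ctzl().
 */
#include <stdio.h>

/* Index of the least significant set bit; x must be non-zero. */
static unsigned int lowest_set_bit(unsigned long x)
{
    return __builtin_ctzl(x);
}

static void walk_chunks(unsigned long gfn, unsigned int order)
{
    unsigned long left = 1UL << order;
    /* Chunk size limited by both the GFN's alignment and the total size. */
    unsigned int chunk_order = lowest_set_bit(gfn | left);

    do {
        printf("chunk at gfn %#lx, order %u\n", gfn, chunk_order);
        left -= 1UL << chunk_order;
        gfn += 1UL << chunk_order;
    } while ( left );
}

int main(void)
{
    walk_chunks(0x200, 9);   /* aligned request: one order-9 chunk */
    walk_chunks(0x1fe, 3);   /* mis-aligned GFN: four order-1 chunks */
    return 0;
}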