CVS commit: pkgsrc/sysutils/xenkernel411

To: pkgsrc-changes%NetBSD.org@localhost
Subject: CVS commit: pkgsrc/sysutils/xenkernel411
From: "Manuel Bouyer" <bouyer%netbsd.org@localhost>
Date: Wed, 13 Nov 2019 13:36:11 +0000

Module Name:    pkgsrc
Committed By:   bouyer
Date:           Wed Nov 13 13:36:11 UTC 2019

Modified Files:
        pkgsrc/sysutils/xenkernel411: Makefile distinfo
Added Files:
        pkgsrc/sysutils/xenkernel411/patches: patch-XSA298 patch-XSA302
            patch-XSA304 patch-XSA305

Log Message:
Add patches for relevant Xen security advisories up to XSA305 (everything
up to XSA297 is already fixed upstream).
Bump PKGREVISION


To generate a diff of this commit:
cvs rdiff -u -r1.8 -r1.9 pkgsrc/sysutils/xenkernel411/Makefile
cvs rdiff -u -r1.5 -r1.6 pkgsrc/sysutils/xenkernel411/distinfo
cvs rdiff -u -r0 -r1.1 pkgsrc/sysutils/xenkernel411/patches/patch-XSA298 \
    pkgsrc/sysutils/xenkernel411/patches/patch-XSA302 \
    pkgsrc/sysutils/xenkernel411/patches/patch-XSA304 \
    pkgsrc/sysutils/xenkernel411/patches/patch-XSA305

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: pkgsrc/sysutils/xenkernel411/Makefile
diff -u pkgsrc/sysutils/xenkernel411/Makefile:1.8 pkgsrc/sysutils/xenkernel411/Makefile:1.9
--- pkgsrc/sysutils/xenkernel411/Makefile:1.8   Fri Aug 30 13:16:27 2019
+++ pkgsrc/sysutils/xenkernel411/Makefile       Wed Nov 13 13:36:11 2019
@@ -1,7 +1,7 @@
-# $NetBSD: Makefile,v 1.8 2019/08/30 13:16:27 bouyer Exp $
+# $NetBSD: Makefile,v 1.9 2019/11/13 13:36:11 bouyer Exp $
 
 VERSION=       4.11.2
-#PKGREVISION=  0
+PKGREVISION=   1
 DISTNAME=      xen-${VERSION}
 PKGNAME=       xenkernel411-${VERSION}
 CATEGORIES=    sysutils

Index: pkgsrc/sysutils/xenkernel411/distinfo
diff -u pkgsrc/sysutils/xenkernel411/distinfo:1.5 pkgsrc/sysutils/xenkernel411/distinfo:1.6
--- pkgsrc/sysutils/xenkernel411/distinfo:1.5   Fri Aug 30 13:16:27 2019
+++ pkgsrc/sysutils/xenkernel411/distinfo       Wed Nov 13 13:36:11 2019
@@ -1,10 +1,14 @@
-$NetBSD: distinfo,v 1.5 2019/08/30 13:16:27 bouyer Exp $
+$NetBSD: distinfo,v 1.6 2019/11/13 13:36:11 bouyer Exp $
 
 SHA1 (xen411/xen-4.11.2.tar.gz) = 82766db0eca7ce65962732af8a31bb5cce1eb7ce
 RMD160 (xen411/xen-4.11.2.tar.gz) = 6dcb1ac3e72381474912607b30b59fa55d87d38b
 SHA512 (xen411/xen-4.11.2.tar.gz) = 48d3d926d35eb56c79c06d0abc6e6be2564fadb43367cc7f46881c669a75016707672179c2cca1c4cfb14af2cefd46e2e7f99470cddf7df2886d8435a2de814e
 Size (xen411/xen-4.11.2.tar.gz) = 25164925 bytes
 SHA1 (patch-Config.mk) = 9372a09efd05c9fbdbc06f8121e411fcb7c7ba65
+SHA1 (patch-XSA298) = 63e0f96ce3b945b16b98b51b423bafec14cf2be6
+SHA1 (patch-XSA302) = 12fbb7dfea27f53c70c8115487a2e30595549c2b
+SHA1 (patch-XSA304) = f2c22732227e11a3e77c630f0264a689eed53399
+SHA1 (patch-XSA305) = eb5e0096cbf501fcbd7a5c5f9d1f932b557636b6
 SHA1 (patch-xen_Makefile) = 465388d80de414ca3bb84faefa0f52d817e423a6
 SHA1 (patch-xen_Rules.mk) = c743dc63f51fc280d529a7d9e08650292c171dac
 SHA1 (patch-xen_arch_x86_Rules.mk) = 0bedfc53a128a87b6a249ae04fbdf6a053bfb70b

Added files:

Index: pkgsrc/sysutils/xenkernel411/patches/patch-XSA298
diff -u /dev/null pkgsrc/sysutils/xenkernel411/patches/patch-XSA298:1.1
--- /dev/null   Wed Nov 13 13:36:11 2019
+++ pkgsrc/sysutils/xenkernel411/patches/patch-XSA298   Wed Nov 13 13:36:11 2019
@@ -0,0 +1,89 @@
+$NetBSD: patch-XSA298,v 1.1 2019/11/13 13:36:11 bouyer Exp $
+
+From: Jan Beulich <jbeulich%suse.com@localhost>
+Subject: x86/PV: check GDT/LDT limits during emulation
+
+Accesses beyond the LDT limit originating from emulation would trigger
+the ASSERT() in pv_map_ldt_shadow_page(). On production builds such
+accesses would cause an attempt to promote the touched page (offset from
+the present LDT base address) to a segment descriptor one. If this
+happens to succeed, guest user mode would be able to elevate its
+privileges to that of the guest kernel. This is particularly easy when
+there's no LDT at all, in which case the LDT base stored internally to
+Xen is simply zero.
+
+Also adjust the ASSERT() that was triggering: It was off by one to
+begin with, and for production builds we also better use
+ASSERT_UNREACHABLE() instead with suitable recovery code afterwards.
+
+This is XSA-298.
+
+Reported-by: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Signed-off-by: Jan Beulich <jbeulich%suse.com@localhost>
+Reviewed-by: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+
+--- xen/arch/x86/pv/emul-gate-op.c.orig
++++ xen/arch/x86/pv/emul-gate-op.c
+@@ -51,7 +51,13 @@ static int read_gate_descriptor(unsigned
+     const struct desc_struct *pdesc = gdt_ldt_desc_ptr(gate_sel);
+ 
+     if ( (gate_sel < 4) ||
+-         ((gate_sel >= FIRST_RESERVED_GDT_BYTE) && !(gate_sel & 4)) ||
++         /*
++          * We're interested in call gates only, which occupy a single
++          * seg_desc_t for 32-bit and a consecutive pair of them for 64-bit.
++          */
++         ((gate_sel >> 3) + !is_pv_32bit_vcpu(v) >=
++          (gate_sel & 4 ? v->arch.pv_vcpu.ldt_ents
++                        : v->arch.pv_vcpu.gdt_ents)) ||
+          __get_user(desc, pdesc) )
+         return 0;
+ 
+@@ -70,7 +76,7 @@ static int read_gate_descriptor(unsigned
+     if ( !is_pv_32bit_vcpu(v) )
+     {
+         if ( (*ar & 0x1f00) != 0x0c00 ||
+-             (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) ||
++             /* Limit check done above already. */
+              __get_user(desc, pdesc + 1) ||
+              (desc.b & 0x1f00) )
+             return 0;
+--- xen/arch/x86/pv/emulate.c.orig
++++ xen/arch/x86/pv/emulate.c
+@@ -31,7 +31,14 @@ int pv_emul_read_descriptor(unsigned int
+ {
+     struct desc_struct desc;
+ 
+-    if ( sel < 4)
++    if ( sel < 4 ||
++         /*
++          * Don't apply the GDT limit here, as the selector may be a Xen
++          * provided one. __get_user() will fail (without taking further
++          * action) for ones falling in the gap between guest populated
++          * and Xen ones.
++          */
++         ((sel & 4) && (sel >> 3) >= v->arch.pv_vcpu.ldt_ents) )
+         desc.b = desc.a = 0;
+     else if ( __get_user(desc, gdt_ldt_desc_ptr(sel)) )
+         return 0;
+--- xen/arch/x86/pv/mm.c.orig
++++ xen/arch/x86/pv/mm.c
+@@ -92,12 +92,16 @@ bool pv_map_ldt_shadow_page(unsigned int
+     BUG_ON(unlikely(in_irq()));
+ 
+     /*
+-     * Hardware limit checking should guarantee this property.  NB. This is
++     * Prior limit checking should guarantee this property.  NB. This is
+      * safe as updates to the LDT can only be made by MMUEXT_SET_LDT to the
+      * current vcpu, and vcpu_reset() will block until this vcpu has been
+      * descheduled before continuing.
+      */
+-    ASSERT((offset >> 3) <= curr->arch.pv_vcpu.ldt_ents);
++    if ( unlikely((offset >> 3) >= curr->arch.pv_vcpu.ldt_ents) )
++    {
++        ASSERT_UNREACHABLE();
++        return false;
++    }
+ 
+     if ( is_pv_32bit_domain(currd) )
+         linear = (uint32_t)linear;
Index: pkgsrc/sysutils/xenkernel411/patches/patch-XSA302
diff -u /dev/null pkgsrc/sysutils/xenkernel411/patches/patch-XSA302:1.1
--- /dev/null   Wed Nov 13 13:36:11 2019
+++ pkgsrc/sysutils/xenkernel411/patches/patch-XSA302   Wed Nov 13 13:36:11 2019
@@ -0,0 +1,537 @@
+$NetBSD: patch-XSA302,v 1.1 2019/11/13 13:36:11 bouyer Exp $
+
+From bbca29f88d9ad9c7e91125a3b5d5f13a23e5801f Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich%suse.com@localhost>
+Date: Wed, 2 Oct 2019 13:36:59 +0200
+Subject: [PATCH 1/2] IOMMU: add missing HVM check
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Fix an unguarded d->arch.hvm access in assign_device().
+
+Signed-off-by: Jan Beulich <jbeulich%suse.com@localhost>
+Reviewed-by: Roger Pau Monné <roger.pau%citrix.com@localhost>
+Acked-by: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+
+(cherry picked from commit 41fd1009cd7416b73d745a77c24b4e8d1a296fe6)
+Signed-off-by: Ian Jackson <ian.jackson%eu.citrix.com@localhost>
+---
+ xen/drivers/passthrough/pci.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
+index f51cae7f4e..037aba7c94 100644
+--- xen/drivers/passthrough/pci.c.orig
++++ xen/drivers/passthrough/pci.c
+@@ -1416,7 +1416,8 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
+     /* Prevent device assign if mem paging or mem sharing have been 
+      * enabled for this domain */
+     if ( unlikely(!need_iommu(d) &&
+-            (d->arch.hvm_domain.mem_sharing_enabled ||
++            ((is_hvm_domain(d) &&
++              d->arch.hvm_domain.mem_sharing_enabled) ||
+              vm_event_check_ring(d->vm_event_paging) ||
+              p2m_get_hostp2m(d)->global_logdirty)) )
+         return -EXDEV;
+-- 
+2.11.0
+
+From ec99857f59f7f06236f11ca8b0b2303e5e745cc4 Mon Sep 17 00:00:00 2001
+From: Paul Durrant <paul.durrant%citrix.com@localhost>
+Date: Mon, 14 Oct 2019 17:52:59 +0100
+Subject: [PATCH 2/2] passthrough: quarantine PCI devices
+
+When a PCI device is assigned to an untrusted domain, it is possible for
+that domain to program the device to DMA to an arbitrary address. The
+IOMMU is used to protect the host from malicious DMA by making sure that
+the device addresses can only target memory assigned to the guest. However,
+when the guest domain is torn down the device is assigned back to dom0,
+thus allowing any in-flight DMA to potentially target critical host data.
+
+This patch introduces a 'quarantine' for PCI devices using dom_io. When
+the toolstack makes a device assignable (by binding it to pciback), it
+will now also assign it to DOMID_IO and the device will only be assigned
+back to dom0 when the device is made unassignable again. Whilst a device is
+assignable it will only ever transfer between dom_io and guest domains.
+dom_io is actually only used as a sentinel domain for quarantining purposes;
+it is not configured with any IOMMU mappings. Assignment to dom_io simply
+means that the device's initiator (requestor) identifier is not present in
+the IOMMU's device table and thus any DMA transactions issued will be
+terminated with a fault condition.
+
+In addition, a fix to assignment handling is made for VT-d.  Failure
+during the assignment step should not lead to a device still being
+associated with its prior owner. Hand the device to DomIO temporarily,
+until the assignment step has completed successfully.  Remove the PI
+hooks from the source domain then earlier as well.
+
+Failure of the recovery reassign_device_ownership() must not go unnoticed:
+there may, e.g., still be left-over RMRR mappings in the domain to which
+assignment has failed, and hence we can't allow that domain to continue
+executing.
+
+NOTE: This patch also includes one printk() cleanup; the
+      "XEN_DOMCTL_assign_device: " tag is dropped in iommu_do_pci_domctl(),
+      since similar printk()-s elsewhere also don't log such a tag.
+
+This is XSA-302.
+
+Signed-off-by: Paul Durrant <paul.durrant%citrix.com@localhost>
+Signed-off-by: Jan Beulich <jbeulich%suse.com@localhost>
+Signed-off-by: Ian Jackson <ian.jackson%eu.citrix.com@localhost>
+---
+ tools/libxl/libxl_pci.c                     | 25 +++++++++++-
+ xen/arch/x86/mm.c                           |  2 +
+ xen/common/domctl.c                         | 14 ++++++-
+ xen/drivers/passthrough/amd/pci_amd_iommu.c | 10 ++++-
+ xen/drivers/passthrough/iommu.c             |  9 +++++
+ xen/drivers/passthrough/pci.c               | 59 ++++++++++++++++++++++-------
+ xen/drivers/passthrough/vtd/iommu.c         | 40 ++++++++++++++++---
+ xen/include/xen/pci.h                       |  3 ++
+ 8 files changed, 138 insertions(+), 24 deletions(-)
+
+diff --git a/tools/libxl/libxl_pci.c b/tools/libxl/libxl_pci.c
+index 4755a0c93c..81890a91ac 100644
+--- tools/libxl/libxl_pci.c.orig
++++ tools/libxl/libxl_pci.c
+@@ -754,6 +754,7 @@ static int libxl__device_pci_assignable_add(libxl__gc *gc,
+                                             libxl_device_pci *pcidev,
+                                             int rebind)
+ {
++    libxl_ctx *ctx = libxl__gc_owner(gc);
+     unsigned dom, bus, dev, func;
+     char *spath, *driver_path = NULL;
+     int rc;
+@@ -779,7 +780,7 @@ static int libxl__device_pci_assignable_add(libxl__gc *gc,
+     }
+     if ( rc ) {
+         LOG(WARN, PCI_BDF" already assigned to pciback", dom, bus, dev, func);
+-        return 0;
++        goto quarantine;
+     }
+ 
+     /* Check to see if there's already a driver that we need to unbind from */
+@@ -810,6 +811,19 @@ static int libxl__device_pci_assignable_add(libxl__gc *gc,
+         return ERROR_FAIL;
+     }
+ 
++quarantine:
++    /*
++     * DOMID_IO is just a sentinel domain, without any actual mappings,
++     * so always pass XEN_DOMCTL_DEV_RDM_RELAXED to avoid assignment being
++     * unnecessarily denied.
++     */
++    rc = xc_assign_device(ctx->xch, DOMID_IO, pcidev_encode_bdf(pcidev),
++                          XEN_DOMCTL_DEV_RDM_RELAXED);
++    if ( rc < 0 ) {
++        LOG(ERROR, "failed to quarantine "PCI_BDF, dom, bus, dev, func);
++        return ERROR_FAIL;
++    }
++
+     return 0;
+ }
+ 
+@@ -817,9 +831,18 @@ static int libxl__device_pci_assignable_remove(libxl__gc *gc,
+                                                libxl_device_pci *pcidev,
+                                                int rebind)
+ {
++    libxl_ctx *ctx = libxl__gc_owner(gc);
+     int rc;
+     char *driver_path;
+ 
++    /* De-quarantine */
++    rc = xc_deassign_device(ctx->xch, DOMID_IO, pcidev_encode_bdf(pcidev));
++    if ( rc < 0 ) {
++        LOG(ERROR, "failed to de-quarantine "PCI_BDF, pcidev->domain, pcidev->bus,
++            pcidev->dev, pcidev->func);
++        return ERROR_FAIL;
++    }
++
+     /* Unbind from pciback */
+     if ( (rc=pciback_dev_is_assigned(gc, pcidev)) < 0 ) {
+         return ERROR_FAIL;
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index e6a4cb28f8..c1ab57f9a5 100644
+--- xen/arch/x86/mm.c.orig
++++ xen/arch/x86/mm.c
+@@ -295,9 +295,11 @@ void __init arch_init_memory(void)
+      * Initialise our DOMID_IO domain.
+      * This domain owns I/O pages that are within the range of the page_info
+      * array. Mappings occur at the priv of the caller.
++     * Quarantined PCI devices will be associated with this domain.
+      */
+     dom_io = domain_create(DOMID_IO, NULL);
+     BUG_ON(IS_ERR(dom_io));
++    INIT_LIST_HEAD(&dom_io->arch.pdev_list);
+ 
+     /*
+      * Initialise our COW domain.
+diff --git a/xen/common/domctl.c b/xen/common/domctl.c
+index 9b7bc083ee..741d774cd1 100644
+--- xen/common/domctl.c.orig
++++ xen/common/domctl.c
+@@ -392,6 +392,16 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
+ 
+     switch ( op->cmd )
+     {
++    case XEN_DOMCTL_assign_device:
++    case XEN_DOMCTL_deassign_device:
++        if ( op->domain == DOMID_IO )
++        {
++            d = dom_io;
++            break;
++        }
++        else if ( op->domain == DOMID_INVALID )
++            return -ESRCH;
++        /* fall through */
+     case XEN_DOMCTL_test_assign_device:
+         if ( op->domain == DOMID_INVALID )
+         {
+@@ -413,7 +423,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
+ 
+     if ( !domctl_lock_acquire() )
+     {
+-        if ( d )
++        if ( d && d != dom_io )
+             rcu_unlock_domain(d);
+         return hypercall_create_continuation(
+             __HYPERVISOR_domctl, "h", u_domctl);
+@@ -1148,7 +1158,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
+     domctl_lock_release();
+ 
+  domctl_out_unlock_domonly:
+-    if ( d )
++    if ( d && d != dom_io )
+         rcu_unlock_domain(d);
+ 
+     if ( copyback && __copy_to_guest(u_domctl, op, 1) )
+diff --git a/xen/drivers/passthrough/amd/pci_amd_iommu.c b/xen/drivers/passthrough/amd/pci_amd_iommu.c
+index 12d2695b89..ec8baae717 100644
+--- xen/drivers/passthrough/amd/pci_amd_iommu.c.orig
++++ xen/drivers/passthrough/amd/pci_amd_iommu.c
+@@ -118,6 +118,10 @@ static void amd_iommu_setup_domain_device(
+     u8 bus = pdev->bus;
+     const struct domain_iommu *hd = dom_iommu(domain);
+ 
++    /* dom_io is used as a sentinel for quarantined devices */
++    if ( domain == dom_io )
++        return;
++
+     BUG_ON( !hd->arch.root_table || !hd->arch.paging_mode ||
+             !iommu->dev_table.buffer );
+ 
+@@ -305,6 +309,10 @@ void amd_iommu_disable_domain_device(struct domain *domain,
+     int req_id;
+     u8 bus = pdev->bus;
+ 
++    /* dom_io is used as a sentinel for quarantined devices */
++    if ( domain == dom_io )
++        return;
++
+     BUG_ON ( iommu->dev_table.buffer == NULL );
+     req_id = get_dma_requestor_id(iommu->seg, PCI_BDF2(bus, devfn));
+     dte = iommu->dev_table.buffer + (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE);
+@@ -391,7 +399,7 @@ static int amd_iommu_assign_device(struct domain *d, u8 devfn,
+             ivrs_mappings[req_id].read_permission);
+     }
+ 
+-    return reassign_device(hardware_domain, d, devfn, pdev);
++    return reassign_device(pdev->domain, d, devfn, pdev);
+ }
+ 
+ static void deallocate_next_page_table(struct page_info *pg, int level)
+diff --git a/xen/drivers/passthrough/iommu.c b/xen/drivers/passthrough/iommu.c
+index 04b0be37d3..8027d96f1c 100644
+--- xen/drivers/passthrough/iommu.c.orig
++++ xen/drivers/passthrough/iommu.c
+@@ -219,6 +219,9 @@ void iommu_teardown(struct domain *d)
+ {
+     const struct domain_iommu *hd = dom_iommu(d);
+ 
++    if ( d == dom_io )
++        return;
++
+     d->need_iommu = 0;
+     hd->platform_ops->teardown(d);
+     tasklet_schedule(&iommu_pt_cleanup_tasklet);
+@@ -229,6 +232,9 @@ int iommu_construct(struct domain *d)
+     if ( need_iommu(d) > 0 )
+         return 0;
+ 
++    if ( d == dom_io )
++        return 0;
++
+     if ( !iommu_use_hap_pt(d) )
+     {
+         int rc;
+@@ -404,6 +410,9 @@ int __init iommu_setup(void)
+     printk("I/O virtualisation %sabled\n", iommu_enabled ? "en" : "dis");
+     if ( iommu_enabled )
+     {
++        if ( iommu_domain_init(dom_io) )
++            panic("Could not set up quarantine\n");
++
+         printk(" - Dom0 mode: %s\n",
+                iommu_passthrough ? "Passthrough" :
+                iommu_dom0_strict ? "Strict" : "Relaxed");
+diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
+index 037aba7c94..fb010a547b 100644
+--- xen/drivers/passthrough/pci.c.orig
++++ xen/drivers/passthrough/pci.c
+@@ -1389,19 +1389,29 @@ static int iommu_remove_device(struct pci_dev *pdev)
+     return hd->platform_ops->remove_device(pdev->devfn, pci_to_dev(pdev));
+ }
+ 
+-/*
+- * If the device isn't owned by the hardware domain, it means it already
+- * has been assigned to other domain, or it doesn't exist.
+- */
+ static int device_assigned(u16 seg, u8 bus, u8 devfn)
+ {
+     struct pci_dev *pdev;
++    int rc = 0;
+ 
+     pcidevs_lock();
+-    pdev = pci_get_pdev_by_domain(hardware_domain, seg, bus, devfn);
++
++    pdev = pci_get_pdev(seg, bus, devfn);
++
++    if ( !pdev )
++        rc = -ENODEV;
++    /*
++     * If the device exists and it is not owned by either the hardware
++     * domain or dom_io then it must be assigned to a guest, or be
++     * hidden (owned by dom_xen).
++     */
++    else if ( pdev->domain != hardware_domain &&
++              pdev->domain != dom_io )
++        rc = -EBUSY;
++
+     pcidevs_unlock();
+ 
+-    return pdev ? 0 : -EBUSY;
++    return rc;
+ }
+ 
+ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
+@@ -1415,7 +1425,8 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
+ 
+     /* Prevent device assign if mem paging or mem sharing have been 
+      * enabled for this domain */
+-    if ( unlikely(!need_iommu(d) &&
++    if ( d != dom_io &&
++         unlikely(!need_iommu(d) &&
+             ((is_hvm_domain(d) &&
+               d->arch.hvm_domain.mem_sharing_enabled) ||
+              vm_event_check_ring(d->vm_event_paging) ||
+@@ -1432,12 +1443,20 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
+         return rc;
+     }
+ 
+-    pdev = pci_get_pdev_by_domain(hardware_domain, seg, bus, devfn);
++    pdev = pci_get_pdev(seg, bus, devfn);
++
++    rc = -ENODEV;
+     if ( !pdev )
+-    {
+-        rc = pci_get_pdev(seg, bus, devfn) ? -EBUSY : -ENODEV;
+         goto done;
+-    }
++
++    rc = 0;
++    if ( d == pdev->domain )
++        goto done;
++
++    rc = -EBUSY;
++    if ( pdev->domain != hardware_domain &&
++         pdev->domain != dom_io )
++        goto done;
+ 
+     if ( pdev->msix )
+         msixtbl_init(d);
+@@ -1460,6 +1479,10 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
+     }
+ 
+  done:
++    /* The device is assigned to dom_io so mark it as quarantined */
++    if ( !rc && d == dom_io )
++        pdev->quarantine = true;
++
+     if ( !has_arch_pdevs(d) && need_iommu(d) )
+         iommu_teardown(d);
+     pcidevs_unlock();
+@@ -1472,6 +1495,7 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
+ {
+     const struct domain_iommu *hd = dom_iommu(d);
+     struct pci_dev *pdev = NULL;
++    struct domain *target;
+     int ret = 0;
+ 
+     if ( !iommu_enabled || !hd->platform_ops )
+@@ -1482,12 +1506,16 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
+     if ( !pdev )
+         return -ENODEV;
+ 
++    /* De-assignment from dom_io should de-quarantine the device */
++    target = (pdev->quarantine && pdev->domain != dom_io) ?
++        dom_io : hardware_domain;
++
+     while ( pdev->phantom_stride )
+     {
+         devfn += pdev->phantom_stride;
+         if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) )
+             break;
+-        ret = hd->platform_ops->reassign_device(d, hardware_domain, devfn,
++        ret = hd->platform_ops->reassign_device(d, target, devfn,
+                                                 pci_to_dev(pdev));
+         if ( !ret )
+             continue;
+@@ -1498,7 +1526,7 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
+     }
+ 
+     devfn = pdev->devfn;
+-    ret = hd->platform_ops->reassign_device(d, hardware_domain, devfn,
++    ret = hd->platform_ops->reassign_device(d, target, devfn,
+                                             pci_to_dev(pdev));
+     if ( ret )
+     {
+@@ -1508,6 +1536,9 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
+         return ret;
+     }
+ 
++    if ( pdev->domain == hardware_domain  )
++        pdev->quarantine = false;
++
+     pdev->fault.count = 0;
+ 
+     if ( !has_arch_pdevs(d) && need_iommu(d) )
+@@ -1686,7 +1717,7 @@ int iommu_do_pci_domctl(
+             ret = hypercall_create_continuation(__HYPERVISOR_domctl,
+                                                 "h", u_domctl);
+         else if ( ret )
+-            printk(XENLOG_G_ERR "XEN_DOMCTL_assign_device: "
++            printk(XENLOG_G_ERR
+                    "assign %04x:%02x:%02x.%u to dom%d failed (%d)\n",
+                    seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+                    d->domain_id, ret);
+diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
+index 4c719d4ee7..19f7d13013 100644
+--- xen/drivers/passthrough/vtd/iommu.c.orig
++++ xen/drivers/passthrough/vtd/iommu.c
+@@ -1338,6 +1338,10 @@ int domain_context_mapping_one(
+     int agaw, rc, ret;
+     bool_t flush_dev_iotlb;
+ 
++    /* dom_io is used as a sentinel for quarantined devices */
++    if ( domain == dom_io )
++        return 0;
++
+     ASSERT(pcidevs_locked());
+     spin_lock(&iommu->lock);
+     maddr = bus_to_context_maddr(iommu, bus);
+@@ -1573,6 +1577,10 @@ int domain_context_unmap_one(
+     int iommu_domid, rc, ret;
+     bool_t flush_dev_iotlb;
+ 
++    /* dom_io is used as a sentinel for quarantined devices */
++    if ( domain == dom_io )
++        return 0;
++
+     ASSERT(pcidevs_locked());
+     spin_lock(&iommu->lock);
+ 
+@@ -1705,6 +1713,10 @@ static int domain_context_unmap(struct domain *domain, u8 devfn,
+         goto out;
+     }
+ 
++    /* dom_io is used as a sentinel for quarantined devices */
++    if ( domain == dom_io )
++        goto out;
++
+     /*
+      * if no other devices under the same iommu owned by this domain,
+      * clear iommu in iommu_bitmap and clear domain_id in domid_bitmp
+@@ -2389,6 +2401,15 @@ static int reassign_device_ownership(
+     if ( ret )
+         return ret;
+ 
++    if ( devfn == pdev->devfn )
++    {
++        list_move(&pdev->domain_list, &dom_io->arch.pdev_list);
++        pdev->domain = dom_io;
++    }
++
++    if ( !has_arch_pdevs(source) )
++        vmx_pi_hooks_deassign(source);
++
+     if ( !has_arch_pdevs(target) )
+         vmx_pi_hooks_assign(target);
+ 
+@@ -2407,15 +2428,13 @@ static int reassign_device_ownership(
+         pdev->domain = target;
+     }
+ 
+-    if ( !has_arch_pdevs(source) )
+-        vmx_pi_hooks_deassign(source);
+-
+     return ret;
+ }
+ 
+ static int intel_iommu_assign_device(
+     struct domain *d, u8 devfn, struct pci_dev *pdev, u32 flag)
+ {
++    struct domain *s = pdev->domain;
+     struct acpi_rmrr_unit *rmrr;
+     int ret = 0, i;
+     u16 bdf, seg;
+@@ -2458,8 +2477,8 @@ static int intel_iommu_assign_device(
+         }
+     }
+ 
+-    ret = reassign_device_ownership(hardware_domain, d, devfn, pdev);
+-    if ( ret )
++    ret = reassign_device_ownership(s, d, devfn, pdev);
++    if ( ret || d == dom_io )
+         return ret;
+ 
+     /* Setup rmrr identity mapping */
+@@ -2472,11 +2491,20 @@ static int intel_iommu_assign_device(
+             ret = rmrr_identity_mapping(d, 1, rmrr, flag);
+             if ( ret )
+             {
+-                reassign_device_ownership(d, hardware_domain, devfn, pdev);
++                int rc;
++
++                rc = reassign_device_ownership(d, s, devfn, pdev);
+                 printk(XENLOG_G_ERR VTDPREFIX
+                        " cannot map reserved region (%"PRIx64",%"PRIx64"] for Dom%d (%d)\n",
+                        rmrr->base_address, rmrr->end_address,
+                        d->domain_id, ret);
++                if ( rc )
++                {
++                    printk(XENLOG_ERR VTDPREFIX
++                           " failed to reclaim %04x:%02x:%02x.%u from %pd (%d)\n",
++                           seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), d, rc);
++                    domain_crash(d);
++                }
+                 break;
+             }
+         }
+diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
+index 4cfa774615..066364bdef 100644
+--- xen/include/xen/pci.h.orig
++++ xen/include/xen/pci.h
+@@ -88,6 +88,9 @@ struct pci_dev {
+ 
+     nodeid_t node; /* NUMA node */
+ 
++    /* Device to be quarantined, don't automatically re-assign to dom0 */
++    bool quarantine;
++
+     enum pdev_type {
+         DEV_TYPE_PCI_UNKNOWN,
+         DEV_TYPE_PCIe_ENDPOINT,
+-- 
+2.11.0
+
Index: pkgsrc/sysutils/xenkernel411/patches/patch-XSA304
diff -u /dev/null pkgsrc/sysutils/xenkernel411/patches/patch-XSA304:1.1
--- /dev/null   Wed Nov 13 13:36:11 2019
+++ pkgsrc/sysutils/xenkernel411/patches/patch-XSA304   Wed Nov 13 13:36:11 2019
@@ -0,0 +1,481 @@
+$NetBSD: patch-XSA304,v 1.1 2019/11/13 13:36:11 bouyer Exp $
+
+From: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Subject: x86/vtd: Hide superpage support for SandyBridge IOMMUs
+
+Something causes SandyBridge IOMMUs to choke when sharing EPT pagetables, and
+an EPT superpage gets shattered.  The root cause is still under investigation,
+but the end result is unusable in combination with CVE-2018-12207 protections.
+
+This is part of XSA-304 / CVE-2018-12207
+
+Signed-off-by: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Reviewed-by: Jan Beulich <jbeulich%suse.com@localhost>
+
+diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h
+index fb7edfaef9..d698b1d50a 100644
+--- xen/drivers/passthrough/vtd/extern.h.orig
++++ xen/drivers/passthrough/vtd/extern.h
+@@ -96,6 +96,8 @@ void vtd_ops_postamble_quirk(struct iommu* iommu);
+ int __must_check me_wifi_quirk(struct domain *domain,
+                                u8 bus, u8 devfn, int map);
+ void pci_vtd_quirk(const struct pci_dev *);
++void quirk_iommu_caps(struct iommu *iommu);
++
+ bool_t platform_supports_intremap(void);
+ bool_t platform_supports_x2apic(void);
+ 
+diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
+index f242e30caf..8712d3b4dc 100644
+--- xen/drivers/passthrough/vtd/iommu.c.orig
++++ xen/drivers/passthrough/vtd/iommu.c
+@@ -1211,6 +1211,8 @@ int __init iommu_alloc(struct acpi_drhd_unit *drhd)
+     if ( !(iommu->cap + 1) || !(iommu->ecap + 1) )
+         return -ENODEV;
+ 
++    quirk_iommu_caps(iommu);
++
+     if ( cap_fault_reg_offset(iommu->cap) +
+          cap_num_fault_regs(iommu->cap) * PRIMARY_FAULT_REG_LEN >= PAGE_SIZE ||
+          ecap_iotlb_offset(iommu->ecap) >= PAGE_SIZE )
+diff --git a/xen/drivers/passthrough/vtd/quirks.c b/xen/drivers/passthrough/vtd/quirks.c
+index d6db862678..b02688e316 100644
+--- xen/drivers/passthrough/vtd/quirks.c.orig
++++ xen/drivers/passthrough/vtd/quirks.c
+@@ -540,3 +540,28 @@ void pci_vtd_quirk(const struct pci_dev *pdev)
+         break;
+     }
+ }
++
++void __init quirk_iommu_caps(struct iommu *iommu)
++{
++    /*
++     * IOMMU Quirks:
++     *
++     * SandyBridge IOMMUs claim support for 2M and 1G superpages, but don't
++     * implement superpages internally.
++     *
++     * There are issues changing the walk length under in-flight DMA, which
++     * has manifested as incompatibility between EPT/IOMMU sharing and the
++     * workaround for CVE-2018-12207 / XSA-304.  Hide the superpages
++     * capabilities in the IOMMU, which will prevent Xen from sharing the EPT
++     * and IOMMU pagetables.
++     *
++     * Detection of SandyBridge unfortunately has to be done by processor
++     * model because the client parts don't expose their IOMMUs as PCI devices
++     * we could match with a Device ID.
++     */
++    if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
++         boot_cpu_data.x86 == 6 &&
++         (boot_cpu_data.x86_model == 0x2a ||
++          boot_cpu_data.x86_model == 0x2d) )
++        iommu->cap &= ~(0xful << 34);
++}
+From: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Subject: x86/vtx: Disable executable EPT superpages to work around
+ CVE-2018-12207
+
+CVE-2018-12207 covers a set of errata on various Intel processors, whereby a
+machine check exception can be generated in a corner case when an executable
+mapping changes size or cacheability without TLB invalidation.  HVM guest
+kernels can trigger this to DoS the host.
+
+To mitigate, in affected hardware, all EPT superpages are marked NX.  When an
+instruction fetch violation is observed against the superpage, the superpage
+is shattered to 4k and has execute permissions restored.  This prevents the
+guest kernel from being able to create the necessary preconditions in the iTLB
+to exploit the vulnerability.
+
+This does come with a workload-dependent performance overhead, caused by
+increased TLB pressure.  Performance can be restored, if guest kernels are
+trusted not to mount an attack, by specifying ept=exec-sp on the command line.
+
+This is part of XSA-304 / CVE-2018-12207
+
+Signed-off-by: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Acked-by: George Dunlap <george.dunlap%citrix.com@localhost>
+Reviewed-by: Jan Beulich <jbeulich%suse.com@localhost>
+
+diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
+index c63a07d29b..684671cb7b 100644
+--- docs/misc/xen-command-line.markdown.orig
++++ docs/misc/xen-command-line.markdown
+@@ -828,7 +828,7 @@ effect the inverse meaning.
+ >> set as UC.
+ 
+ ### ept (Intel)
+-> `= List of ( {no-}pml | {no-}ad )`
++> `= List of [ {no-}pml,  {no-}ad, {no-}exec-sp ]`
+ 
+ Controls EPT related features.
+ 
+@@ -851,6 +851,16 @@ Controls EPT related features.
+ 
+ >> Have hardware keep accessed/dirty (A/D) bits updated.
+ 
++*   The `exec-sp` boolean controls whether EPT superpages with execute
++    permissions are permitted.  In general this is good for performance.
++
++    However, on processors vulnerable to CVE-2018-12207, HVM guest kernels can
++    use executable superpages to crash the host.  By default, executable
++    superpages are disabled on affected hardware.
++
++    If HVM guest kernels are trusted not to mount a DoS against the system,
++    this option can enabled to regain performance.
++
+ ### extra\_guest\_irqs
+ > `= [<domU number>][,<dom0 number>]`
+ 
+diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
+index f4a6a37149..1924434960 100644
+--- xen/arch/x86/hvm/hvm.c.orig
++++ xen/arch/x86/hvm/hvm.c
+@@ -1706,6 +1706,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
+     struct p2m_domain *p2m, *hostp2m;
+     int rc, fall_through = 0, paged = 0;
+     int sharing_enomem = 0;
++    unsigned int page_order = 0;
+     vm_event_request_t *req_ptr = NULL;
+     bool_t ap2m_active, sync = 0;
+ 
+@@ -1774,7 +1775,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
+     hostp2m = p2m_get_hostp2m(currd);
+     mfn = get_gfn_type_access(hostp2m, gfn, &p2mt, &p2ma,
+                               P2M_ALLOC | (npfec.write_access ? P2M_UNSHARE : 0),
+-                              NULL);
++                              &page_order);
+ 
+     if ( ap2m_active )
+     {
+@@ -1786,7 +1787,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
+             goto out;
+         }
+ 
+-        mfn = get_gfn_type_access(p2m, gfn, &p2mt, &p2ma, 0, NULL);
++        mfn = get_gfn_type_access(p2m, gfn, &p2mt, &p2ma, 0, &page_order);
+     }
+     else
+         p2m = hostp2m;
+@@ -1828,6 +1829,24 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
+             break;
+         }
+ 
++        /*
++         * Workaround for XSA-304 / CVE-2018-12207.  If we take an execution
++         * fault against a non-executable superpage, shatter it to regain
++         * execute permissions.
++         */
++        if ( page_order > 0 && npfec.insn_fetch && npfec.present && !violation )
++        {
++            int res = p2m_set_entry(p2m, _gfn(gfn), mfn, PAGE_ORDER_4K,
++                                    p2mt, p2ma);
++
++            if ( res )
++                printk(XENLOG_ERR "Failed to shatter gfn %"PRI_gfn": %d\n",
++                       gfn, res);
++
++            rc = !res;
++            goto out_put_gfn;
++        }
++
+         if ( violation )
+         {
+             /* Should #VE be emulated for this fault? */
+diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
+index 493986e84a..8821a3b536 100644
+--- xen/arch/x86/hvm/vmx/vmcs.c.orig
++++ xen/arch/x86/hvm/vmx/vmcs.c
+@@ -67,6 +67,7 @@ integer_param("ple_window", ple_window);
+ 
+ static bool_t __read_mostly opt_pml_enabled = 1;
+ static s8 __read_mostly opt_ept_ad = -1;
++int8_t __read_mostly opt_ept_exec_sp = -1;
+ 
+ /*
+  * The 'ept' parameter controls functionalities that depend on, or impact the
+@@ -94,6 +95,8 @@ static int __init parse_ept_param(const char *s)
+             opt_pml_enabled = val;
+         else if ( !cmdline_strcmp(s, "ad") )
+             opt_ept_ad = val;
++        else if ( !cmdline_strcmp(s, "exec-sp") )
++            opt_ept_exec_sp = val;
+         else
+             rc = -EINVAL;
+ 
+diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
+index 840dc2b44d..a568d62643 100644
+--- xen/arch/x86/hvm/vmx/vmx.c.orig
++++ xen/arch/x86/hvm/vmx/vmx.c
+@@ -2415,6 +2415,102 @@ static void pi_notification_interrupt(struct cpu_user_regs *regs)
+ static void __init lbr_tsx_fixup_check(void);
+ static void __init bdw_erratum_bdf14_fixup_check(void);
+ 
++/*
++ * Calculate whether the CPU is vulnerable to Instruction Fetch page
++ * size-change MCEs.
++ */
++static bool __init has_if_pschange_mc(void)
++{
++    uint64_t caps = 0;
++
++    /*
++     * If we are virtualised, there is nothing we can do.  Our EPT tables are
++     * shadowed by our hypervisor, and not walked by hardware.
++     */
++    if ( cpu_has_hypervisor )
++        return false;
++
++    if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) )
++        rdmsrl(MSR_ARCH_CAPABILITIES, caps);
++
++    if ( caps & ARCH_CAPS_IF_PSCHANGE_MC_NO )
++        return false;
++
++    /*
++     * IF_PSCHANGE_MC is only known to affect Intel Family 6 processors at
++     * this time.
++     */
++    if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
++         boot_cpu_data.x86 != 6 )
++        return false;
++
++    switch ( boot_cpu_data.x86_model )
++    {
++        /*
++         * Core processors since at least Nehalem are vulnerable.
++         */
++    case 0x1f: /* Auburndale / Havendale */
++    case 0x1e: /* Nehalem */
++    case 0x1a: /* Nehalem EP */
++    case 0x2e: /* Nehalem EX */
++    case 0x25: /* Westmere */
++    case 0x2c: /* Westmere EP */
++    case 0x2f: /* Westmere EX */
++    case 0x2a: /* SandyBridge */
++    case 0x2d: /* SandyBridge EP/EX */
++    case 0x3a: /* IvyBridge */
++    case 0x3e: /* IvyBridge EP/EX */
++    case 0x3c: /* Haswell */
++    case 0x3f: /* Haswell EX/EP */
++    case 0x45: /* Haswell D */
++    case 0x46: /* Haswell H */
++    case 0x3d: /* Broadwell */
++    case 0x47: /* Broadwell H */
++    case 0x4f: /* Broadwell EP/EX */
++    case 0x56: /* Broadwell D */
++    case 0x4e: /* Skylake M */
++    case 0x5e: /* Skylake D */
++    case 0x55: /* Skylake-X / Cascade Lake */
++    case 0x8e: /* Kaby / Coffee / Whiskey Lake M */
++    case 0x9e: /* Kaby / Coffee / Whiskey Lake D */
++        return true;
++
++        /*
++         * Atom processors are not vulnerable.
++         */
++    case 0x1c: /* Pineview */
++    case 0x26: /* Lincroft */
++    case 0x27: /* Penwell */
++    case 0x35: /* Cloverview */
++    case 0x36: /* Cedarview */
++    case 0x37: /* Baytrail / Valleyview (Silvermont) */
++    case 0x4d: /* Avoton / Rangely (Silvermont) */
++    case 0x4c: /* Cherrytrail / Braswell */
++    case 0x4a: /* Merrifield */
++    case 0x5a: /* Moorefield */
++    case 0x5c: /* Goldmont */
++    case 0x5d: /* SoFIA 3G Granite/ES2.1 */
++    case 0x65: /* SoFIA LTE AOSP */
++    case 0x5f: /* Denverton */
++    case 0x6e: /* Cougar Mountain */
++    case 0x75: /* Lightning Mountain */
++    case 0x7a: /* Gemini Lake */
++    case 0x86: /* Jacobsville */
++
++        /*
++         * Knights processors are not vulnerable.
++         */
++    case 0x57: /* Knights Landing */
++    case 0x85: /* Knights Mill */
++        return false;
++
++    default:
++        printk("Unrecognised CPU model %#x - assuming vulnerable to IF_PSCHANGE_MC\n",
++               boot_cpu_data.x86_model);
++        return true;
++    }
++}
++
+ const struct hvm_function_table * __init start_vmx(void)
+ {
+     set_in_cr4(X86_CR4_VMXE);
+@@ -2435,6 +2531,17 @@ const struct hvm_function_table * __init start_vmx(void)
+      */
+     if ( cpu_has_vmx_ept && (cpu_has_vmx_pat || opt_force_ept) )
+     {
++        bool cpu_has_bug_pschange_mc = has_if_pschange_mc();
++
++        if ( opt_ept_exec_sp == -1 )
++        {
++            /* Default to non-executable superpages on vulnerable hardware. */
++            opt_ept_exec_sp = !cpu_has_bug_pschange_mc;
++
++            if ( cpu_has_bug_pschange_mc )
++                printk("VMX: Disabling executable EPT superpages due to CVE-2018-12207\n");
++        }
++
+         vmx_function_table.hap_supported = 1;
+         vmx_function_table.altp2m_supported = 1;
+ 
+diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
+index ce46201d45..93e08f89a2 100644
+--- xen/arch/x86/mm/p2m-ept.c.orig
++++ xen/arch/x86/mm/p2m-ept.c
+@@ -215,6 +215,12 @@ static void ept_p2m_type_to_flags(struct p2m_domain *p2m, ept_entry_t *entry,
+             break;
+     }
+     
++    /*
++     * Don't create executable superpages if we need to shatter them to
++     * protect against CVE-2018-12207.
++     */
++    if ( !opt_ept_exec_sp && is_epte_superpage(entry) )
++        entry->x = 0;
+ }
+ 
+ #define GUEST_TABLE_MAP_FAILED  0
+diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h b/xen/include/asm-x86/hvm/vmx/vmx.h
+index 89619e4afd..20eb7f6082 100644
+--- xen/include/asm-x86/hvm/vmx/vmx.h.orig
++++ xen/include/asm-x86/hvm/vmx/vmx.h
+@@ -28,6 +28,8 @@
+ #include <asm/hvm/trace.h>
+ #include <asm/hvm/vmx/vmcs.h>
+ 
++extern int8_t opt_ept_exec_sp;
++
+ typedef union {
+     struct {
+         u64 r       :   1,  /* bit 0 - Read permission */
+diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
+index b8151d2d9f..89ae3e03f1 100644
+--- xen/include/asm-x86/msr-index.h.orig
++++ xen/include/asm-x86/msr-index.h
+@@ -54,6 +54,7 @@
+ #define ARCH_CAPS_SKIP_L1DFL          (_AC(1, ULL) << 3)
+ #define ARCH_CAPS_SSB_NO              (_AC(1, ULL) << 4)
+ #define ARCH_CAPS_MDS_NO              (_AC(1, ULL) << 5)
++#define ARCH_CAPS_IF_PSCHANGE_MC_NO   (_AC(1, ULL) << 6)
+ 
+ #define MSR_FLUSH_CMD                 0x0000010b
+ #define FLUSH_CMD_L1D                 (_AC(1, ULL) << 0)
+From: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Subject: x86/vtx: Allow runtime modification of the exec-sp setting
+
+See patch for details.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Reviewed-by: Jan Beulich <jbeulich%suse.com@localhost>
+Reviewed-by: George Dunlap <george.dunlap%citrix.com@localhost>
+
+diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
+index 684671cb7b..33ed1ffc40 100644
+--- docs/misc/xen-command-line.markdown.orig
++++ docs/misc/xen-command-line.markdown
+@@ -861,6 +861,21 @@ Controls EPT related features.
+     If HVM guest kernels are trusted not to mount a DoS against the system,
+     this option can enabled to regain performance.
+ 
++    This boolean may be modified at runtime using `xl set-parameters
++    ept=[no-]exec-sp` to switch between fast and secure.
++
++    *   When switching from secure to fast, preexisting HVM domains will run
++        at their current performance until they are rebooted; new domains will
++        run without any overhead.
++
++    *   When switching from fast to secure, all HVM domains will immediately
++        suffer a performance penalty.
++
++    **Warning: No guarantee is made that this runtime option will be retained
++      indefinitely, or that it will retain this exact behaviour.  It is
++      intended as an emergency option for people who first chose fast, then
++      change their minds to secure, and wish not to reboot.**
++
+ ### extra\_guest\_irqs
+ > `= [<domU number>][,<dom0 number>]`
+ 
+diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
+index 8821a3b536..15376e25ba 100644
+--- xen/arch/x86/hvm/vmx/vmcs.c.orig
++++ xen/arch/x86/hvm/vmx/vmcs.c
+@@ -107,6 +107,41 @@ static int __init parse_ept_param(const char *s)
+ }
+ custom_param("ept", parse_ept_param);
+ 
++static int parse_ept_param_runtime(const char *s)
++{
++    int val;
++
++    if ( !cpu_has_vmx_ept || !hvm_funcs.hap_supported ||
++         !(hvm_funcs.hap_capabilities &
++           (HVM_HAP_SUPERPAGE_2MB | HVM_HAP_SUPERPAGE_1GB)) )
++    {
++        printk("VMX: EPT not available, or not in use - ignoring\n");
++        return 0;
++    }
++
++    if ( (val = parse_boolean("exec-sp", s, NULL)) < 0 )
++        return -EINVAL;
++
++    if ( val != opt_ept_exec_sp )
++    {
++        struct domain *d;
++
++        opt_ept_exec_sp = val;
++
++        rcu_read_lock(&domlist_read_lock);
++        for_each_domain ( d )
++            if ( paging_mode_hap(d) )
++                p2m_change_entry_type_global(d, p2m_ram_rw, p2m_ram_rw);
++        rcu_read_unlock(&domlist_read_lock);
++    }
++
++    printk("VMX: EPT executable superpages %sabled\n",
++           val ? "en" : "dis");
++
++    return 0;
++}
++custom_runtime_only_param("ept", parse_ept_param_runtime);
++
+ /* Dynamic (run-time adjusted) execution control flags. */
+ u32 vmx_pin_based_exec_control __read_mostly;
+ u32 vmx_cpu_based_exec_control __read_mostly;
+diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
+index 2b62bc61dd..97c417fc3e 100644
+--- xen/arch/x86/mm/p2m.c.orig
++++ xen/arch/x86/mm/p2m.c
+@@ -257,17 +257,22 @@ int p2m_is_logdirty_range(struct p2m_domain *p2m, unsigned long start,
+     return 0;
+ }
+ 
++/*
++ * May be called with ot = nt = p2m_ram_rw for its side effect of
++ * recalculating all PTEs in the p2m.
++ */
+ void p2m_change_entry_type_global(struct domain *d,
+                                   p2m_type_t ot, p2m_type_t nt)
+ {
+     struct p2m_domain *p2m = p2m_get_hostp2m(d);
+ 
+-    ASSERT(ot != nt);
+     ASSERT(p2m_is_changeable(ot) && p2m_is_changeable(nt));
+ 
+     p2m_lock(p2m);
+     p2m->change_entry_type_global(p2m, ot, nt);
+-    p2m->global_logdirty = (nt == p2m_ram_logdirty);
++    /* Don't allow 'recalculate' operations to change the logdirty state. */
++    if ( ot != nt )
++        p2m->global_logdirty = (nt == p2m_ram_logdirty);
+     p2m_unlock(p2m);
+ }
+ 
Index: pkgsrc/sysutils/xenkernel411/patches/patch-XSA305
diff -u /dev/null pkgsrc/sysutils/xenkernel411/patches/patch-XSA305:1.1
--- /dev/null   Wed Nov 13 13:36:11 2019
+++ pkgsrc/sysutils/xenkernel411/patches/patch-XSA305   Wed Nov 13 13:36:11 2019
@@ -0,0 +1,482 @@
+$NetBSD: patch-XSA305,v 1.1 2019/11/13 13:36:11 bouyer Exp $
+
+From: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Subject: x86/tsx: Introduce tsx= to use MSR_TSX_CTRL when available
+
+To protect against the TSX Async Abort speculative vulnerability, Intel have
+released new microcode for affected parts which introduces the MSR_TSX_CTRL
+control, which allows TSX to be turned off.  This will be architectural on
+future parts.
+
+Introduce tsx= to provide a global on/off for TSX, including its enumeration
+via CPUID.  Provide stub virtualisation of this MSR, as it is not exposed to
+guests at the moment.
+
+VMs may have booted before microcode is loaded, or before hosts have rebooted,
+and they still want to migrate freely.  A VM which booted seeing TSX can
+migrate safely to hosts with TSX disabled - TSX will start unconditionally
+aborting, but still behave in a manner compatible with the ABI.
+
+The guest-visible behaviour is equivalent to late loading the microcode and
+setting the RTM_DISABLE bit in the course of live patching.
+
+This is part of XSA-305 / CVE-2019-11135
+
+Signed-off-by: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Reviewed-by: Jan Beulich <jbeulich%suse.com@localhost>
+
+diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
+index 684671cb7b..b86d26399a 100644
+--- docs/misc/xen-command-line.markdown.orig
++++ docs/misc/xen-command-line.markdown
+@@ -1948,6 +1948,20 @@ pages) must also be specified via the tbuf\_size parameter.
+ ### tsc (x86)
+ > `= unstable | skewed | stable:socket`
+ 
++### tsx
++    = <bool>
++
++    Applicability: x86
++    Default: true
++
++Controls for the use of Transactional Synchronization eXtensions.
++
++On Intel parts released in Q3 2019 (with updated microcode), and future parts,
++a control has been introduced which allows TSX to be turned off.
++
++On systems with the ability to turn TSX off, this boolean offers system wide
++control of whether TSX is enabled or disabled.
++
+ ### ucode (x86)
+ > `= [<integer> | scan]`
+ 
+diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile
+index da1e4827f4..4c82d9f710 100644
+--- xen/arch/x86/Makefile.orig
++++ xen/arch/x86/Makefile
+@@ -65,6 +65,7 @@ obj-y += sysctl.o
+ obj-y += time.o
+ obj-y += trace.o
+ obj-y += traps.o
++obj-y += tsx.o
+ obj-y += usercopy.o
+ obj-y += x86_emulate.o
+ obj-$(CONFIG_TBOOT) += tboot.o
+diff --git a/xen/arch/x86/cpuid.c b/xen/arch/x86/cpuid.c
+index 5e11970701..04aefa555d 100644
+--- xen/arch/x86/cpuid.c.orig
++++ xen/arch/x86/cpuid.c
+@@ -622,6 +622,20 @@ void recalculate_cpuid_policy(struct domain *d)
+     if ( cpu_has_itsc && (d->disable_migrate || d->arch.vtsc) )
+         __set_bit(X86_FEATURE_ITSC, max_fs);
+ 
++    /*
++     * On hardware with MSR_TSX_CTRL, the admin may have elected to disable
++     * TSX and hide the feature bits.  Migrating-in VMs may have been booted
++     * pre-mitigation when the TSX features were visible.
++     *
++     * This situation is compatible (albeit with a perf hit to any TSX code in
++     * the guest), so allow the feature bits to remain set.
++     */
++    if ( cpu_has_tsx_ctrl )
++    {
++        __set_bit(X86_FEATURE_HLE, max_fs);
++        __set_bit(X86_FEATURE_RTM, max_fs);
++    }
++
+     /* Clamp the toolstacks choices to reality. */
+     for ( i = 0; i < ARRAY_SIZE(fs); i++ )
+         fs[i] &= max_fs[i];
+diff --git a/xen/arch/x86/msr.c b/xen/arch/x86/msr.c
+index ebc0665615..35d99a98a1 100644
+--- xen/arch/x86/msr.c.orig
++++ xen/arch/x86/msr.c
+@@ -153,6 +153,7 @@ int guest_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
+     case MSR_FLUSH_CMD:
+         /* Write-only */
+     case MSR_TSX_FORCE_ABORT:
++    case MSR_TSX_CTRL:
+         /* Not offered to guests. */
+         goto gp_fault;
+ 
+@@ -233,6 +234,7 @@ int guest_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
+     case MSR_ARCH_CAPABILITIES:
+         /* Read-only */
+     case MSR_TSX_FORCE_ABORT:
++    case MSR_TSX_CTRL:
+         /* Not offered to guests. */
+         goto gp_fault;
+ 
+diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
+index 657160549f..dc13ad6c36 100644
+--- xen/arch/x86/setup.c.orig
++++ xen/arch/x86/setup.c
+@@ -1551,6 +1551,8 @@ void __init noreturn __start_xen(unsigned long mbi_p)
+ 
+     early_microcode_init();
+ 
++    tsx_init(); /* Needs microcode.  May change HLE/RTM feature bits. */
++
+     identify_cpu(&boot_cpu_data);
+ 
+     set_in_cr4(X86_CR4_OSFXSR | X86_CR4_OSXMMEXCPT);
+diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
+index fd52a10cf9..bdc118d88b 100644
+--- xen/arch/x86/smpboot.c.orig
++++ xen/arch/x86/smpboot.c
+@@ -376,6 +376,8 @@ void start_secondary(void *unused)
+     if ( boot_cpu_has(X86_FEATURE_IBRSB) )
+         wrmsrl(MSR_SPEC_CTRL, default_xen_spec_ctrl);
+ 
++    tsx_init(); /* Needs microcode.  May change HLE/RTM feature bits. */
++
+     if ( xen_guest )
+         hypervisor_ap_setup();
+ 
+diff --git a/xen/arch/x86/tsx.c b/xen/arch/x86/tsx.c
+new file mode 100644
+index 0000000000..a8ec2ccc69
+--- /dev/null
++++ xen/arch/x86/tsx.c
+@@ -0,0 +1,74 @@
++#include <xen/init.h>
++#include <asm/msr.h>
++
++/*
++ * Valid values:
++ *   1 => Explicit tsx=1
++ *   0 => Explicit tsx=0
++ *  -1 => Default, implicit tsx=1
++ *
++ * This is arranged such that the bottom bit encodes whether TSX is actually
++ * disabled, while identifying various explicit (>=0) and implicit (<0)
++ * conditions.
++ */
++int8_t __read_mostly opt_tsx = -1;
++int8_t __read_mostly cpu_has_tsx_ctrl = -1;
++
++static int __init parse_tsx(const char *s)
++{
++    int rc = 0, val = parse_bool(s, NULL);
++
++    if ( val >= 0 )
++        opt_tsx = val;
++    else
++        rc = -EINVAL;
++
++    return rc;
++}
++custom_param("tsx", parse_tsx);
++
++void tsx_init(void)
++{
++    /*
++     * This function is first called between microcode being loaded, and CPUID
++     * being scanned generally.  Calculate from raw data whether MSR_TSX_CTRL
++     * is available.
++     */
++    if ( unlikely(cpu_has_tsx_ctrl < 0) )
++    {
++        uint64_t caps = 0;
++
++        if ( boot_cpu_data.cpuid_level >= 7 &&
++             (cpuid_count_edx(7, 0) & cpufeat_mask(X86_FEATURE_ARCH_CAPS)) )
++            rdmsrl(MSR_ARCH_CAPABILITIES, caps);
++
++        cpu_has_tsx_ctrl = !!(caps & ARCH_CAPS_TSX_CTRL);
++    }
++
++    if ( cpu_has_tsx_ctrl )
++    {
++        uint64_t val;
++
++        rdmsrl(MSR_TSX_CTRL, val);
++
++        val &= ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR);
++        /* Check bottom bit only.  Higher bits are various sentinels. */
++        if ( !(opt_tsx & 1) )
++            val |= TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR;
++
++        wrmsrl(MSR_TSX_CTRL, val);
++    }
++    else if ( opt_tsx >= 0 )
++        printk_once(XENLOG_WARNING
++                    "MSR_TSX_CTRL not available - Ignoring tsx= setting\n");
++}
++
++/*
++ * Local variables:
++ * mode: C
++ * c-file-style: "BSD"
++ * c-basic-offset: 4
++ * tab-width: 4
++ * indent-tabs-mode: nil
++ * End:
++ */
+diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
+index 89ae3e03f1..5ee7a37c12 100644
+--- xen/include/asm-x86/msr-index.h.orig
++++ xen/include/asm-x86/msr-index.h
+@@ -55,6 +55,7 @@
+ #define ARCH_CAPS_SSB_NO              (_AC(1, ULL) << 4)
+ #define ARCH_CAPS_MDS_NO              (_AC(1, ULL) << 5)
+ #define ARCH_CAPS_IF_PSCHANGE_MC_NO   (_AC(1, ULL) << 6)
++#define ARCH_CAPS_TSX_CTRL            (_AC(1, ULL) << 7)
+ 
+ #define MSR_FLUSH_CMD                 0x0000010b
+ #define FLUSH_CMD_L1D                 (_AC(1, ULL) << 0)
+@@ -62,6 +63,10 @@
+ #define MSR_TSX_FORCE_ABORT             0x0000010f
+ #define TSX_FORCE_ABORT_RTM             (_AC(1, ULL) <<  0)
+ 
++#define MSR_TSX_CTRL                    0x00000122
++#define TSX_CTRL_RTM_DISABLE            (_AC(1, ULL) <<  0)
++#define TSX_CTRL_CPUID_CLEAR            (_AC(1, ULL) <<  1)
++
+ /* Intel MSRs. Some also available on other CPUs */
+ #define MSR_IA32_PERFCTR0             0x000000c1
+ #define MSR_IA32_A_PERFCTR0           0x000004c1
+diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
+index 20d1ecb332..66224f23b9 100644
+--- xen/include/asm-x86/processor.h.orig
++++ xen/include/asm-x86/processor.h
+@@ -258,6 +258,16 @@ static always_inline unsigned int cpuid_count_ebx(
+     return ebx;
+ }
+ 
++static always_inline unsigned int cpuid_count_edx(
++    unsigned int leaf, unsigned int subleaf)
++{
++    unsigned int edx, tmp;
++
++    cpuid_count(leaf, subleaf, &tmp, &tmp, &tmp, &edx);
++
++    return edx;
++}
++
+ static always_inline void cpuid_count_leaf(uint32_t leaf, uint32_t subleaf,
+                                            struct cpuid_leaf *data)
+ {
+@@ -610,6 +620,9 @@ static inline uint8_t get_cpu_family(uint32_t raw, uint8_t *model,
+     return fam;
+ }
+ 
++extern int8_t opt_tsx, cpu_has_tsx_ctrl;
++void tsx_init(void);
++
+ #endif /* !__ASSEMBLY__ */
+ 
+ #endif /* __ASM_X86_PROCESSOR_H */
+diff --git a/xen/include/xen/lib.h b/xen/include/xen/lib.h
+index 750f809968..be223a6950 100644
+--- xen/include/xen/lib.h.orig
++++ xen/include/xen/lib.h
+@@ -116,6 +116,16 @@ extern int printk_ratelimit(void);
+ #define gprintk(lvl, fmt, args...) \
+     printk(XENLOG_GUEST lvl "%pv " fmt, current, ## args)
+ 
++#define printk_once(fmt, args...)               \
++({                                              \
++    static bool __read_mostly once_;            \
++    if ( unlikely(!once_) )                     \
++    {                                           \
++        once_ = true;                           \
++        printk(fmt, ## args);                   \
++    }                                           \
++})
++
+ #ifdef NDEBUG
+ 
+ static inline void
+From: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Subject: x86/spec-ctrl: Mitigate the TSX Asynchronous Abort sidechannel
+
+See patch documentation and comments.
+
+This is part of XSA-305 / CVE-2019-11135
+
+Signed-off-by: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Reviewed-by: Jan Beulich <jbeulich%suse.com@localhost>
+
+diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
+index b86d26399a..31635a473a 100644
+--- docs/misc/xen-command-line.markdown.orig
++++ docs/misc/xen-command-line.markdown
+@@ -1841,7 +1841,7 @@ extreme care.**
+ An overall boolean value, `spec-ctrl=no`, can be specified to turn off all
+ mitigations, including pieces of infrastructure used to virtualise certain
+ mitigation features for guests.  This also includes settings which `xpti`,
+-`smt`, `pv-l1tf` control, unless the respective option(s) have been
++`smt`, `pv-l1tf`, `tsx` control, unless the respective option(s) have been
+ specified earlier on the command line.
+ 
+ Alternatively, a slightly more restricted `spec-ctrl=no-xen` can be used to
+@@ -1952,7 +1952,7 @@ pages) must also be specified via the tbuf\_size parameter.
+     = <bool>
+ 
+     Applicability: x86
+-    Default: true
++    Default: false on parts vulnerable to TAA, true otherwise
+ 
+ Controls for the use of Transactional Synchronization eXtensions.
+ 
+@@ -1962,6 +1962,19 @@ a control has been introduced which allows TSX to be turned off.
+ On systems with the ability to turn TSX off, this boolean offers system wide
+ control of whether TSX is enabled or disabled.
+ 
++On parts vulnerable to CVE-2019-11135 / TSX Asynchronous Abort, the following
++logic applies:
++
++ * An explicit `tsx=` choice is honoured, even if it is `true` and would
++   result in a vulnerable system.
++
++ * When no explicit `tsx=` choice is given, parts vulnerable to TAA will be
++   mitigated by disabling TSX, as this is the lowest overhead option.
++
++ * If the use of TSX is important, the more expensive TAA mitigations can be
++   opted in to with `smt=0 spec-ctrl=md-clear`, at which point TSX will remain
++   active by default.
++
+ ### ucode (x86)
+ > `= [<integer> | scan]`
+ 
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index 2fe16b423d..ab196b156d 100644
+--- xen/arch/x86/spec_ctrl.c.orig
++++ xen/arch/x86/spec_ctrl.c
+@@ -152,6 +152,9 @@ static int __init parse_spec_ctrl(const char *s)
+             if ( opt_pv_l1tf_domu < 0 )
+                 opt_pv_l1tf_domu = 0;
+ 
++            if ( opt_tsx == -1 )
++                opt_tsx = -3;
++
+         disable_common:
+             opt_rsb_pv = false;
+             opt_rsb_hvm = false;
+@@ -362,7 +365,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
+     printk("Speculative mitigation facilities:\n");
+ 
+     /* Hardware features which pertain to speculative mitigations. */
+-    printk("  Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s\n",
++    printk("  Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
+            (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBRS/IBPB" : "",
+            (_7d0 & cpufeat_mask(X86_FEATURE_STIBP)) ? " STIBP"     : "",
+            (_7d0 & cpufeat_mask(X86_FEATURE_L1D_FLUSH)) ? " L1D_FLUSH" : "",
+@@ -374,7 +377,9 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
+            (caps & ARCH_CAPS_RSBA)                  ? " RSBA"      : "",
+            (caps & ARCH_CAPS_SKIP_L1DFL)            ? " SKIP_L1DFL": "",
+            (caps & ARCH_CAPS_SSB_NO)                ? " SSB_NO"    : "",
+-           (caps & ARCH_CAPS_MDS_NO)                ? " MDS_NO"    : "");
++           (caps & ARCH_CAPS_MDS_NO)                ? " MDS_NO"    : "",
++           (caps & ARCH_CAPS_TSX_CTRL)              ? " TSX_CTRL"  : "",
++           (caps & ARCH_CAPS_TAA_NO)                ? " TAA_NO"    : "");
+ 
+     /* Compiled-in support which pertains to mitigations. */
+     if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) )
+@@ -388,7 +393,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
+                "\n");
+ 
+     /* Settings for Xen's protection, irrespective of guests. */
+-    printk("  Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s, Other:%s%s%s\n",
++    printk("  Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s, Other:%s%s%s\n",
+            thunk == THUNK_NONE      ? "N/A" :
+            thunk == THUNK_RETPOLINE ? "RETPOLINE" :
+            thunk == THUNK_LFENCE    ? "LFENCE" :
+@@ -397,6 +402,8 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
+            (default_xen_spec_ctrl & SPEC_CTRL_IBRS)  ? "IBRS+" :  "IBRS-",
+            !boot_cpu_has(X86_FEATURE_SSBD)           ? "" :
+            (default_xen_spec_ctrl & SPEC_CTRL_SSBD)  ? " SSBD+" : " SSBD-",
++           !(caps & ARCH_CAPS_TSX_CTRL)              ? "" :
++           (opt_tsx & 1)                             ? " TSX+" : " TSX-",
+            opt_ibpb                                  ? " IBPB"  : "",
+            opt_l1d_flush                             ? " L1D_FLUSH" : "",
+            opt_md_clear_pv || opt_md_clear_hvm       ? " VERW"  : "");
+@@ -911,6 +918,7 @@ void __init init_speculation_mitigations(void)
+ {
+     enum ind_thunk thunk = THUNK_DEFAULT;
+     bool use_spec_ctrl = false, ibrs = false, hw_smt_enabled;
++    bool cpu_has_bug_taa;
+     uint64_t caps = 0;
+ 
+     if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) )
+@@ -1140,6 +1148,53 @@ void __init init_speculation_mitigations(void)
+             "enabled.  Mitigations will not be fully effective.  Please\n"
+             "choose an explicit smt=<bool> setting.  See XSA-297.\n");
+ 
++    /*
++     * Vulnerability to TAA is a little complicated to quantify.
++     *
++     * In the pipeline, it is just another way to get speculative access to
++     * stale load port, store buffer or fill buffer data, and therefore can be
++     * considered a superset of MDS (on TSX-capable parts).  On parts which
++     * predate MDS_NO, the existing VERW flushing will mitigate this
++     * sidechannel as well.
++     *
++     * On parts which contain MDS_NO, the lack of VERW flushing means that an
++     * attacker can still use TSX to target microarchitectural buffers to leak
++     * secrets.  Therefore, we consider TAA to be the set of TSX-capable parts
++     * which have MDS_NO but lack TAA_NO.
++     *
++     * Note: cpu_has_rtm (== hle) could already be hidden by `tsx=0` on the
++     *       cmdline.  MSR_TSX_CTRL will only appear on TSX-capable parts, so
++     *       we check both to spot TSX in a microcode/cmdline independent way.
++     */
++    cpu_has_bug_taa =
++        (cpu_has_rtm || (caps & ARCH_CAPS_TSX_CTRL)) &&
++        (caps & (ARCH_CAPS_MDS_NO | ARCH_CAPS_TAA_NO)) == ARCH_CAPS_MDS_NO;
++
++    /*
++     * On TAA-affected hardware, disabling TSX is the preferred mitigation, vs
++     * the MDS mitigation of disabling HT and using VERW flushing.
++     *
++     * On CPUs which advertise MDS_NO, VERW has no flushing side effect until
++     * the TSX_CTRL microcode is loaded, despite the MD_CLEAR CPUID bit being
++     * advertised, and there isn't a MD_CLEAR_2 flag to use...
++     *
++     * If we're on affected hardware, able to do something about it (which
++     * implies that VERW now works), no explicit TSX choice and traditional
++     * MDS mitigations (no-SMT, VERW) not obviously in use (someone might
++     * plausibly value TSX higher than Hyperthreading...), disable TSX to
++     * mitigate TAA.
++     */
++    if ( opt_tsx == -1 && cpu_has_bug_taa && (caps & ARCH_CAPS_TSX_CTRL) &&
++         ((hw_smt_enabled && opt_smt) ||
++          !boot_cpu_has(X86_FEATURE_SC_VERW_IDLE)) )
++    {
++        setup_clear_cpu_cap(X86_FEATURE_HLE);
++        setup_clear_cpu_cap(X86_FEATURE_RTM);
++
++        opt_tsx = 0;
++        tsx_init();
++    }
++
+     print_details(thunk, caps);
+ 
+     /*
+diff --git a/xen/arch/x86/tsx.c b/xen/arch/x86/tsx.c
+index a8ec2ccc69..2d202a0d4e 100644
+--- xen/arch/x86/tsx.c.orig
++++ xen/arch/x86/tsx.c
+@@ -5,7 +5,8 @@
+  * Valid values:
+  *   1 => Explicit tsx=1
+  *   0 => Explicit tsx=0
+- *  -1 => Default, implicit tsx=1
++ *  -1 => Default, implicit tsx=1, may change to 0 to mitigate TAA
++ *  -3 => Implicit tsx=1 (feed-through from spec-ctrl=0)
+  *
+  * This is arranged such that the bottom bit encodes whether TSX is actually
+  * disabled, while identifying various explicit (>=0) and implicit (<0)
+diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
+index 5ee7a37c12..1761a01f1f 100644
+--- xen/include/asm-x86/msr-index.h.orig
++++ xen/include/asm-x86/msr-index.h
+@@ -56,6 +56,7 @@
+ #define ARCH_CAPS_MDS_NO              (_AC(1, ULL) << 5)
+ #define ARCH_CAPS_IF_PSCHANGE_MC_NO   (_AC(1, ULL) << 6)
+ #define ARCH_CAPS_TSX_CTRL            (_AC(1, ULL) << 7)
++#define ARCH_CAPS_TAA_NO              (_AC(1, ULL) << 8)
+ 
+ #define MSR_FLUSH_CMD                 0x0000010b
+ #define FLUSH_CMD_L1D                 (_AC(1, ULL) << 0)

Prev by Date: CVS commit: pkgsrc/doc
Next by Date: CVS commit: pkgsrc/doc
Previous by Thread: CVS commit: pkgsrc/doc
Next by Thread: CVS commit: pkgsrc/doc
Indexes:

Home | Main Index | Thread Index | Old Index