pkgsrc-Changes archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
CVS commit: pkgsrc/sysutils/xenkernel411
Module Name: pkgsrc
Committed By: bouyer
Date: Fri Oct 2 13:00:48 UTC 2020
Modified Files:
pkgsrc/sysutils/xenkernel411: Makefile distinfo
Added Files:
pkgsrc/sysutils/xenkernel411/patches: patch-XSA333 patch-XSA336
patch-XSA337 patch-XSA338 patch-XSA339 patch-XSA340 patch-XSA342
patch-XSA343 patch-XSA344
Log Message:
dd uptream fixes for
XSA333, XSA336, XSA337, XSA338, XSA339, XSA340, XSA342, XSA343, XSA344
bump PKGREVISION
To generate a diff of this commit:
cvs rdiff -u -r1.15 -r1.16 pkgsrc/sysutils/xenkernel411/Makefile
cvs rdiff -u -r1.13 -r1.14 pkgsrc/sysutils/xenkernel411/distinfo
cvs rdiff -u -r0 -r1.1 pkgsrc/sysutils/xenkernel411/patches/patch-XSA333 \
pkgsrc/sysutils/xenkernel411/patches/patch-XSA336 \
pkgsrc/sysutils/xenkernel411/patches/patch-XSA337 \
pkgsrc/sysutils/xenkernel411/patches/patch-XSA338 \
pkgsrc/sysutils/xenkernel411/patches/patch-XSA339 \
pkgsrc/sysutils/xenkernel411/patches/patch-XSA340 \
pkgsrc/sysutils/xenkernel411/patches/patch-XSA342 \
pkgsrc/sysutils/xenkernel411/patches/patch-XSA343 \
pkgsrc/sysutils/xenkernel411/patches/patch-XSA344
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:
Index: pkgsrc/sysutils/xenkernel411/Makefile
diff -u pkgsrc/sysutils/xenkernel411/Makefile:1.15 pkgsrc/sysutils/xenkernel411/Makefile:1.16
--- pkgsrc/sysutils/xenkernel411/Makefile:1.15 Mon Aug 24 10:35:35 2020
+++ pkgsrc/sysutils/xenkernel411/Makefile Fri Oct 2 13:00:48 2020
@@ -1,8 +1,8 @@
-# $NetBSD: Makefile,v 1.15 2020/08/24 10:35:35 bouyer Exp $
+# $NetBSD: Makefile,v 1.16 2020/10/02 13:00:48 bouyer Exp $
VERSION= 4.11.4
#keep >= 1 if we have security patches
-PKGREVISION= 1
+PKGREVISION= 2
DISTNAME= xen-${VERSION}
PKGNAME= xenkernel411-${VERSION}
CATEGORIES= sysutils
Index: pkgsrc/sysutils/xenkernel411/distinfo
diff -u pkgsrc/sysutils/xenkernel411/distinfo:1.13 pkgsrc/sysutils/xenkernel411/distinfo:1.14
--- pkgsrc/sysutils/xenkernel411/distinfo:1.13 Mon Aug 24 10:35:35 2020
+++ pkgsrc/sysutils/xenkernel411/distinfo Fri Oct 2 13:00:48 2020
@@ -1,4 +1,4 @@
-$NetBSD: distinfo,v 1.13 2020/08/24 10:35:35 bouyer Exp $
+$NetBSD: distinfo,v 1.14 2020/10/02 13:00:48 bouyer Exp $
SHA1 (xen411/xen-4.11.4.tar.gz) = 6c8cdf441621c14dc5345196b48df6982c060c4f
RMD160 (xen411/xen-4.11.4.tar.gz) = 49819fcd1de3985d4dea370be962548c862f2933
@@ -10,6 +10,15 @@ SHA1 (patch-XSA319) = 4954bdc849666e1c73
SHA1 (patch-XSA320) = 38d84a2ded4ccacee455ba64eb3b369e5661fbfd
SHA1 (patch-XSA321) = 1f15b2e3c0f7f2d7335879d3a83c1557ac9de806
SHA1 (patch-XSA328) = a9b02c183a5dbfb6c0fe50824f18896fcab4a9e9
+SHA1 (patch-XSA333) = 47660b70b2c998436587600bb9a25c2f494afa49
+SHA1 (patch-XSA336) = da0a8bb05877917c75a28155cf2dd2f66d11ef9c
+SHA1 (patch-XSA337) = f323b4c596f8a7b2b3d57dd799f70cf62743369f
+SHA1 (patch-XSA338) = 0adcebec2c25a389155a10de84bf999ff2e5425d
+SHA1 (patch-XSA339) = 4f97076bda8150d1b1c68f6000d563f3c3314c02
+SHA1 (patch-XSA340) = 23888acfe25fc82ff085fa9acfbb36c156a15bc3
+SHA1 (patch-XSA342) = a61c4e28a8c8219b88e3bab534a109b2b29e2cc3
+SHA1 (patch-XSA343) = 239822636b474ebb62aa455cfdbd9853c4fb342f
+SHA1 (patch-XSA344) = cf7184ac9263b418305c6a7fbae7b163b233b4bc
SHA1 (patch-xen_Makefile) = 465388d80de414ca3bb84faefa0f52d817e423a6
SHA1 (patch-xen_Rules.mk) = c743dc63f51fc280d529a7d9e08650292c171dac
SHA1 (patch-xen_arch_x86_Rules.mk) = 0bedfc53a128a87b6a249ae04fbdf6a053bfb70b
Added files:
Index: pkgsrc/sysutils/xenkernel411/patches/patch-XSA333
diff -u /dev/null pkgsrc/sysutils/xenkernel411/patches/patch-XSA333:1.1
--- /dev/null Fri Oct 2 13:00:48 2020
+++ pkgsrc/sysutils/xenkernel411/patches/patch-XSA333 Fri Oct 2 13:00:48 2020
@@ -0,0 +1,41 @@
+$NetBSD: patch-XSA333,v 1.1 2020/10/02 13:00:48 bouyer Exp $
+
+From: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Subject: x86/pv: Handle the Intel-specific MSR_MISC_ENABLE correctly
+
+This MSR doesn't exist on AMD hardware, and switching away from the safe
+functions in the common MSR path was an erroneous change.
+
+Partially revert the change.
+
+This is XSA-333.
+
+Fixes: 4fdc932b3cc ("x86/Intel: drop another 32-bit leftover")
+Signed-off-by: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Reviewed-by: Jan Beulich <jbeulich%suse.com@localhost>
+Reviewed-by: Wei Liu <wl%xen.org@localhost>
+
+diff --git a/xen/arch/x86/pv/emul-priv-op.c b/xen/arch/x86/pv/emul-priv-op.c
+index efeb2a727e..6332c74b80 100644
+--- xen/arch/x86/pv/emul-priv-op.c.orig
++++ xen/arch/x86/pv/emul-priv-op.c
+@@ -924,7 +924,8 @@ static int read_msr(unsigned int reg, uint64_t *val,
+ return X86EMUL_OKAY;
+
+ case MSR_IA32_MISC_ENABLE:
+- rdmsrl(reg, *val);
++ if ( rdmsr_safe(reg, *val) )
++ break;
+ *val = guest_misc_enable(*val);
+ return X86EMUL_OKAY;
+
+@@ -1059,7 +1060,8 @@ static int write_msr(unsigned int reg, uint64_t val,
+ break;
+
+ case MSR_IA32_MISC_ENABLE:
+- rdmsrl(reg, temp);
++ if ( rdmsr_safe(reg, temp) )
++ break;
+ if ( val != guest_misc_enable(temp) )
+ goto invalid;
+ return X86EMUL_OKAY;
Index: pkgsrc/sysutils/xenkernel411/patches/patch-XSA336
diff -u /dev/null pkgsrc/sysutils/xenkernel411/patches/patch-XSA336:1.1
--- /dev/null Fri Oct 2 13:00:48 2020
+++ pkgsrc/sysutils/xenkernel411/patches/patch-XSA336 Fri Oct 2 13:00:48 2020
@@ -0,0 +1,258 @@
+$NetBSD: patch-XSA336,v 1.1 2020/10/02 13:00:48 bouyer Exp $
+
+From: Roger Pau Monné <roger.pau%citrix.com@localhost>
+Subject: x86/vpt: fix race when migrating timers between vCPUs
+
+The current vPT code will migrate the emulated timers between vCPUs
+(change the pt->vcpu field) while just holding the destination lock,
+either from create_periodic_time or pt_adjust_global_vcpu_target if
+the global target is adjusted. Changing the periodic_timer vCPU field
+in this way creates a race where a third party could grab the lock in
+the unlocked region of pt_adjust_global_vcpu_target (or before
+create_periodic_time performs the vcpu change) and then release the
+lock from a different vCPU, creating a locking imbalance.
+
+Introduce a per-domain rwlock in order to protect periodic_time
+migration between vCPU lists. Taking the lock in read mode prevents
+any timer from being migrated to a different vCPU, while taking it in
+write mode allows performing migration of timers across vCPUs. The
+per-vcpu locks are still used to protect all the other fields from the
+periodic_timer struct.
+
+Note that such migration shouldn't happen frequently, and hence
+there's no performance drop as a result of such locking.
+
+This is XSA-336.
+
+Reported-by: Igor Druzhinin <igor.druzhinin%citrix.com@localhost>
+Tested-by: Igor Druzhinin <igor.druzhinin%citrix.com@localhost>
+Signed-off-by: Roger Pau Monné <roger.pau%citrix.com@localhost>
+Reviewed-by: Jan Beulich <jbeulich%suse.com@localhost>
+
+--- xen/arch/x86/hvm/hvm.c.orig
++++ xen/arch/x86/hvm/hvm.c
+@@ -627,6 +627,8 @@ int hvm_domain_initialise(struct domain
+ /* need link to containing domain */
+ d->arch.hvm_domain.pl_time->domain = d;
+
++ rwlock_init(&d->arch.hvm_domain.pl_time->pt_migrate);
++
+ /* Set the default IO Bitmap. */
+ if ( is_hardware_domain(d) )
+ {
+--- xen/arch/x86/hvm/vpt.c.orig
++++ xen/arch/x86/hvm/vpt.c
+@@ -152,23 +152,32 @@ static int pt_irq_masked(struct periodic
+ return 1;
+ }
+
+-static void pt_lock(struct periodic_time *pt)
++static void pt_vcpu_lock(struct vcpu *v)
+ {
+- struct vcpu *v;
++ read_lock(&v->domain->arch.hvm_domain.pl_time->pt_migrate);
++ spin_lock(&v->arch.hvm_vcpu.tm_lock);
++}
+
+- for ( ; ; )
+- {
+- v = pt->vcpu;
+- spin_lock(&v->arch.hvm_vcpu.tm_lock);
+- if ( likely(pt->vcpu == v) )
+- break;
+- spin_unlock(&v->arch.hvm_vcpu.tm_lock);
+- }
++static void pt_vcpu_unlock(struct vcpu *v)
++{
++ spin_unlock(&v->arch.hvm_vcpu.tm_lock);
++ read_unlock(&v->domain->arch.hvm_domain.pl_time->pt_migrate);
++}
++
++static void pt_lock(struct periodic_time *pt)
++{
++ /*
++ * We cannot use pt_vcpu_lock here, because we need to acquire the
++ * per-domain lock first and then (re-)fetch the value of pt->vcpu, or
++ * else we might be using a stale value of pt->vcpu.
++ */
++ read_lock(&pt->vcpu->domain->arch.hvm_domain.pl_time->pt_migrate);
++ spin_lock(&pt->vcpu->arch.hvm_vcpu.tm_lock);
+ }
+
+ static void pt_unlock(struct periodic_time *pt)
+ {
+- spin_unlock(&pt->vcpu->arch.hvm_vcpu.tm_lock);
++ pt_vcpu_unlock(pt->vcpu);
+ }
+
+ static void pt_process_missed_ticks(struct periodic_time *pt)
+@@ -218,7 +227,7 @@ void pt_save_timer(struct vcpu *v)
+ if ( v->pause_flags & VPF_blocked )
+ return;
+
+- spin_lock(&v->arch.hvm_vcpu.tm_lock);
++ pt_vcpu_lock(v);
+
+ list_for_each_entry ( pt, head, list )
+ if ( !pt->do_not_freeze )
+@@ -226,7 +235,7 @@ void pt_save_timer(struct vcpu *v)
+
+ pt_freeze_time(v);
+
+- spin_unlock(&v->arch.hvm_vcpu.tm_lock);
++ pt_vcpu_unlock(v);
+ }
+
+ void pt_restore_timer(struct vcpu *v)
+@@ -234,7 +243,7 @@ void pt_restore_timer(struct vcpu *v)
+ struct list_head *head = &v->arch.hvm_vcpu.tm_list;
+ struct periodic_time *pt;
+
+- spin_lock(&v->arch.hvm_vcpu.tm_lock);
++ pt_vcpu_lock(v);
+
+ list_for_each_entry ( pt, head, list )
+ {
+@@ -247,7 +256,7 @@ void pt_restore_timer(struct vcpu *v)
+
+ pt_thaw_time(v);
+
+- spin_unlock(&v->arch.hvm_vcpu.tm_lock);
++ pt_vcpu_unlock(v);
+ }
+
+ static void pt_timer_fn(void *data)
+@@ -272,7 +281,7 @@ int pt_update_irq(struct vcpu *v)
+ uint64_t max_lag;
+ int irq, pt_vector = -1;
+
+- spin_lock(&v->arch.hvm_vcpu.tm_lock);
++ pt_vcpu_lock(v);
+
+ earliest_pt = NULL;
+ max_lag = -1ULL;
+@@ -300,14 +309,14 @@ int pt_update_irq(struct vcpu *v)
+
+ if ( earliest_pt == NULL )
+ {
+- spin_unlock(&v->arch.hvm_vcpu.tm_lock);
++ pt_vcpu_unlock(v);
+ return -1;
+ }
+
+ earliest_pt->irq_issued = 1;
+ irq = earliest_pt->irq;
+
+- spin_unlock(&v->arch.hvm_vcpu.tm_lock);
++ pt_vcpu_unlock(v);
+
+ switch ( earliest_pt->source )
+ {
+@@ -377,12 +386,12 @@ void pt_intr_post(struct vcpu *v, struct
+ if ( intack.source == hvm_intsrc_vector )
+ return;
+
+- spin_lock(&v->arch.hvm_vcpu.tm_lock);
++ pt_vcpu_lock(v);
+
+ pt = is_pt_irq(v, intack);
+ if ( pt == NULL )
+ {
+- spin_unlock(&v->arch.hvm_vcpu.tm_lock);
++ pt_vcpu_unlock(v);
+ return;
+ }
+
+@@ -421,7 +430,7 @@ void pt_intr_post(struct vcpu *v, struct
+ cb = pt->cb;
+ cb_priv = pt->priv;
+
+- spin_unlock(&v->arch.hvm_vcpu.tm_lock);
++ pt_vcpu_unlock(v);
+
+ if ( cb != NULL )
+ cb(v, cb_priv);
+@@ -432,12 +441,12 @@ void pt_migrate(struct vcpu *v)
+ struct list_head *head = &v->arch.hvm_vcpu.tm_list;
+ struct periodic_time *pt;
+
+- spin_lock(&v->arch.hvm_vcpu.tm_lock);
++ pt_vcpu_lock(v);
+
+ list_for_each_entry ( pt, head, list )
+ migrate_timer(&pt->timer, v->processor);
+
+- spin_unlock(&v->arch.hvm_vcpu.tm_lock);
++ pt_vcpu_unlock(v);
+ }
+
+ void create_periodic_time(
+@@ -455,7 +464,7 @@ void create_periodic_time(
+
+ destroy_periodic_time(pt);
+
+- spin_lock(&v->arch.hvm_vcpu.tm_lock);
++ write_lock(&v->domain->arch.hvm_domain.pl_time->pt_migrate);
+
+ pt->pending_intr_nr = 0;
+ pt->do_not_freeze = 0;
+@@ -504,7 +513,7 @@ void create_periodic_time(
+ init_timer(&pt->timer, pt_timer_fn, pt, v->processor);
+ set_timer(&pt->timer, pt->scheduled);
+
+- spin_unlock(&v->arch.hvm_vcpu.tm_lock);
++ write_unlock(&v->domain->arch.hvm_domain.pl_time->pt_migrate);
+ }
+
+ void destroy_periodic_time(struct periodic_time *pt)
+@@ -529,30 +538,20 @@ void destroy_periodic_time(struct period
+
+ static void pt_adjust_vcpu(struct periodic_time *pt, struct vcpu *v)
+ {
+- int on_list;
+-
+ ASSERT(pt->source == PTSRC_isa || pt->source == PTSRC_ioapic);
+
+ if ( pt->vcpu == NULL )
+ return;
+
+- pt_lock(pt);
+- on_list = pt->on_list;
+- if ( pt->on_list )
+- list_del(&pt->list);
+- pt->on_list = 0;
+- pt_unlock(pt);
+-
+- spin_lock(&v->arch.hvm_vcpu.tm_lock);
++ write_lock(&pt->vcpu->domain->arch.hvm_domain.pl_time->pt_migrate);
+ pt->vcpu = v;
+- if ( on_list )
++ if ( pt->on_list )
+ {
+- pt->on_list = 1;
++ list_del(&pt->list);
+ list_add(&pt->list, &v->arch.hvm_vcpu.tm_list);
+-
+ migrate_timer(&pt->timer, v->processor);
+ }
+- spin_unlock(&v->arch.hvm_vcpu.tm_lock);
++ write_unlock(&pt->vcpu->domain->arch.hvm_domain.pl_time->pt_migrate);
+ }
+
+ void pt_adjust_global_vcpu_target(struct vcpu *v)
+--- xen/include/asm-x86/hvm/vpt.h.orig
++++ xen/include/asm-x86/hvm/vpt.h
+@@ -133,6 +133,13 @@ struct pl_time { /* platform time */
+ struct RTCState vrtc;
+ struct HPETState vhpet;
+ struct PMTState vpmt;
++ /*
++ * rwlock to prevent periodic_time vCPU migration. Take the lock in read
++ * mode in order to prevent the vcpu field of periodic_time from changing.
++ * Lock must be taken in write mode when changes to the vcpu field are
++ * performed, as it allows exclusive access to all the timers of a domain.
++ */
++ rwlock_t pt_migrate;
+ /* guest_time = Xen sys time + stime_offset */
+ int64_t stime_offset;
+ /* Ensures monotonicity in appropriate timer modes. */
Index: pkgsrc/sysutils/xenkernel411/patches/patch-XSA337
diff -u /dev/null pkgsrc/sysutils/xenkernel411/patches/patch-XSA337:1.1
--- /dev/null Fri Oct 2 13:00:48 2020
+++ pkgsrc/sysutils/xenkernel411/patches/patch-XSA337 Fri Oct 2 13:00:48 2020
@@ -0,0 +1,276 @@
+$NetBSD: patch-XSA337,v 1.1 2020/10/02 13:00:48 bouyer Exp $
+
+From: Roger Pau Monné <roger.pau%citrix.com@localhost>
+Subject: x86/msi: get rid of read_msi_msg
+
+It's safer and faster to just use the cached last written
+(untranslated) MSI message stored in msi_desc for the single user that
+calls read_msi_msg.
+
+This also prevents relying on the data read from the device MSI
+registers in order to figure out the index into the IOMMU interrupt
+remapping table, which is not safe.
+
+This is part of XSA-337.
+
+Reported-by: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Requested-by: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Signed-off-by: Roger Pau Monné <roger.pau%citrix.com@localhost>
+Reviewed-by: Jan Beulich <jbeulich%suse.com@localhost>
+
+--- xen/arch/x86/msi.c.orig
++++ xen/arch/x86/msi.c
+@@ -192,59 +192,6 @@ void msi_compose_msg(unsigned vector, co
+ MSI_DATA_VECTOR(vector);
+ }
+
+-static bool read_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
+-{
+- switch ( entry->msi_attrib.type )
+- {
+- case PCI_CAP_ID_MSI:
+- {
+- struct pci_dev *dev = entry->dev;
+- int pos = entry->msi_attrib.pos;
+- u16 data, seg = dev->seg;
+- u8 bus = dev->bus;
+- u8 slot = PCI_SLOT(dev->devfn);
+- u8 func = PCI_FUNC(dev->devfn);
+-
+- msg->address_lo = pci_conf_read32(seg, bus, slot, func,
+- msi_lower_address_reg(pos));
+- if ( entry->msi_attrib.is_64 )
+- {
+- msg->address_hi = pci_conf_read32(seg, bus, slot, func,
+- msi_upper_address_reg(pos));
+- data = pci_conf_read16(seg, bus, slot, func,
+- msi_data_reg(pos, 1));
+- }
+- else
+- {
+- msg->address_hi = 0;
+- data = pci_conf_read16(seg, bus, slot, func,
+- msi_data_reg(pos, 0));
+- }
+- msg->data = data;
+- break;
+- }
+- case PCI_CAP_ID_MSIX:
+- {
+- void __iomem *base = entry->mask_base;
+-
+- if ( unlikely(!msix_memory_decoded(entry->dev,
+- entry->msi_attrib.pos)) )
+- return false;
+- msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
+- msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
+- msg->data = readl(base + PCI_MSIX_ENTRY_DATA_OFFSET);
+- break;
+- }
+- default:
+- BUG();
+- }
+-
+- if ( iommu_intremap )
+- iommu_read_msi_from_ire(entry, msg);
+-
+- return true;
+-}
+-
+ static int write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
+ {
+ entry->msg = *msg;
+@@ -322,10 +269,7 @@ void set_msi_affinity(struct irq_desc *d
+
+ ASSERT(spin_is_locked(&desc->lock));
+
+- memset(&msg, 0, sizeof(msg));
+- if ( !read_msi_msg(msi_desc, &msg) )
+- return;
+-
++ msg = msi_desc->msg;
+ msg.data &= ~MSI_DATA_VECTOR_MASK;
+ msg.data |= MSI_DATA_VECTOR(desc->arch.vector);
+ msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
+From: Jan Beulich <jbeulich%suse.com@localhost>
+Subject: x86/MSI-X: restrict reading of table/PBA bases from BARs
+
+When assigned to less trusted or un-trusted guests, devices may change
+state behind our backs (they may e.g. get reset by means we may not know
+about). Therefore we should avoid reading BARs from hardware once a
+device is no longer owned by Dom0. Furthermore when we can't read a BAR,
+or when we read zero, we shouldn't instead use the caller provided
+address unless that caller can be trusted.
+
+Re-arrange the logic in msix_capability_init() such that only Dom0 (and
+only if the device isn't DomU-owned yet) or calls through
+PHYSDEVOP_prepare_msix will actually result in the reading of the
+respective BAR register(s). Additionally do so only as long as in-use
+table entries are known (note that invocation of PHYSDEVOP_prepare_msix
+counts as a "pseudo" entry). In all other uses the value already
+recorded will get used instead.
+
+Clear the recorded values in _pci_cleanup_msix() as well as on the one
+affected error path. (Adjust this error path to also avoid blindly
+disabling MSI-X when it was enabled on entry to the function.)
+
+While moving around variable declarations (in many cases to reduce their
+scopes), also adjust some of their types.
+
+This is part of XSA-337.
+
+Signed-off-by: Jan Beulich <jbeulich%suse.com@localhost>
+Reviewed-by: Roger Pau Monné <roger.pau%citrix.com@localhost>
+
+--- xen/arch/x86/msi.c.orig
++++ xen/arch/x86/msi.c
+@@ -790,16 +790,14 @@ static int msix_capability_init(struct p
+ {
+ struct arch_msix *msix = dev->msix;
+ struct msi_desc *entry = NULL;
+- int vf;
+ u16 control;
+ u64 table_paddr;
+ u32 table_offset;
+- u8 bir, pbus, pslot, pfunc;
+ u16 seg = dev->seg;
+ u8 bus = dev->bus;
+ u8 slot = PCI_SLOT(dev->devfn);
+ u8 func = PCI_FUNC(dev->devfn);
+- bool maskall = msix->host_maskall;
++ bool maskall = msix->host_maskall, zap_on_error = false;
+
+ ASSERT(pcidevs_locked());
+
+@@ -837,43 +835,45 @@ static int msix_capability_init(struct p
+ /* Locate MSI-X table region */
+ table_offset = pci_conf_read32(seg, bus, slot, func,
+ msix_table_offset_reg(pos));
+- bir = (u8)(table_offset & PCI_MSIX_BIRMASK);
+- table_offset &= ~PCI_MSIX_BIRMASK;
++ if ( !msix->used_entries &&
++ (!msi ||
++ (is_hardware_domain(current->domain) &&
++ (dev->domain == current->domain || dev->domain == dom_io))) )
++ {
++ unsigned int bir = table_offset & PCI_MSIX_BIRMASK, pbus, pslot, pfunc;
++ int vf;
++ paddr_t pba_paddr;
++ unsigned int pba_offset;
+
+- if ( !dev->info.is_virtfn )
+- {
+- pbus = bus;
+- pslot = slot;
+- pfunc = func;
+- vf = -1;
+- }
+- else
+- {
+- pbus = dev->info.physfn.bus;
+- pslot = PCI_SLOT(dev->info.physfn.devfn);
+- pfunc = PCI_FUNC(dev->info.physfn.devfn);
+- vf = PCI_BDF2(dev->bus, dev->devfn);
+- }
+-
+- table_paddr = read_pci_mem_bar(seg, pbus, pslot, pfunc, bir, vf);
+- WARN_ON(msi && msi->table_base != table_paddr);
+- if ( !table_paddr )
+- {
+- if ( !msi || !msi->table_base )
++ if ( !dev->info.is_virtfn )
+ {
+- pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
+- control & ~PCI_MSIX_FLAGS_ENABLE);
+- xfree(entry);
+- return -ENXIO;
++ pbus = bus;
++ pslot = slot;
++ pfunc = func;
++ vf = -1;
++ }
++ else
++ {
++ pbus = dev->info.physfn.bus;
++ pslot = PCI_SLOT(dev->info.physfn.devfn);
++ pfunc = PCI_FUNC(dev->info.physfn.devfn);
++ vf = PCI_BDF2(dev->bus, dev->devfn);
+ }
+- table_paddr = msi->table_base;
+- }
+- table_paddr += table_offset;
+
+- if ( !msix->used_entries )
+- {
+- u64 pba_paddr;
+- u32 pba_offset;
++ table_paddr = read_pci_mem_bar(seg, pbus, pslot, pfunc, bir, vf);
++ WARN_ON(msi && msi->table_base != table_paddr);
++ if ( !table_paddr )
++ {
++ if ( !msi || !msi->table_base )
++ {
++ pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
++ control & ~PCI_MSIX_FLAGS_ENABLE);
++ xfree(entry);
++ return -ENXIO;
++ }
++ table_paddr = msi->table_base;
++ }
++ table_paddr += table_offset & ~PCI_MSIX_BIRMASK;
+
+ msix->nr_entries = nr_entries;
+ msix->table.first = PFN_DOWN(table_paddr);
+@@ -894,7 +894,19 @@ static int msix_capability_init(struct p
+ BITS_TO_LONGS(nr_entries) - 1);
+ WARN_ON(rangeset_overlaps_range(mmio_ro_ranges, msix->pba.first,
+ msix->pba.last));
++
++ zap_on_error = true;
+ }
++ else if ( !msix->table.first )
++ {
++ pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
++ control);
++ xfree(entry);
++ return -ENODATA;
++ }
++ else
++ table_paddr = (msix->table.first << PAGE_SHIFT) +
++ (table_offset & ~PCI_MSIX_BIRMASK & ~PAGE_MASK);
+
+ if ( entry )
+ {
+@@ -905,8 +917,16 @@ static int msix_capability_init(struct p
+
+ if ( idx < 0 )
+ {
++ if ( zap_on_error )
++ {
++ msix->table.first = 0;
++ msix->pba.first = 0;
++
++ control &= ~PCI_MSIX_FLAGS_ENABLE;
++ }
++
+ pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
+- control & ~PCI_MSIX_FLAGS_ENABLE);
++ control);
+ xfree(entry);
+ return idx;
+ }
+@@ -1102,9 +1122,14 @@ static void _pci_cleanup_msix(struct arc
+ if ( rangeset_remove_range(mmio_ro_ranges, msix->table.first,
+ msix->table.last) )
+ WARN();
++ msix->table.first = 0;
++ msix->table.last = 0;
++
+ if ( rangeset_remove_range(mmio_ro_ranges, msix->pba.first,
+ msix->pba.last) )
+ WARN();
++ msix->pba.first = 0;
++ msix->pba.last = 0;
+ }
+ }
+
Index: pkgsrc/sysutils/xenkernel411/patches/patch-XSA338
diff -u /dev/null pkgsrc/sysutils/xenkernel411/patches/patch-XSA338:1.1
--- /dev/null Fri Oct 2 13:00:48 2020
+++ pkgsrc/sysutils/xenkernel411/patches/patch-XSA338 Fri Oct 2 13:00:48 2020
@@ -0,0 +1,44 @@
+$NetBSD: patch-XSA338,v 1.1 2020/10/02 13:00:48 bouyer Exp $
+
+From: Jan Beulich <jbeulich%suse.com@localhost>
+Subject: evtchn: relax port_is_valid()
+
+To avoid ports potentially becoming invalid behind the back of certain
+other functions (due to ->max_evtchn shrinking) because of
+- a guest invoking evtchn_reset() and from a 2nd vCPU opening new
+ channels in parallel (see also XSA-343),
+- alloc_unbound_xen_event_channel() produced channels living above the
+ 2-level range (see also XSA-342),
+drop the max_evtchns check from port_is_valid(). For a port for which
+the function once returned "true", the returned value may not turn into
+"false" later on. The function's result may only depend on bounds which
+can only ever grow (which is the case for d->valid_evtchns).
+
+This also eliminates a false sense of safety, utilized by some of the
+users (see again XSA-343): Without a suitable lock held, d->max_evtchns
+may change at any time, and hence deducing that certain other operations
+are safe when port_is_valid() returned true is not legitimate. The
+opportunities to abuse this may get widened by the change here
+(depending on guest and host configuration), but will be taken care of
+by the other XSA.
+
+This is XSA-338.
+
+Fixes: 48974e6ce52e ("evtchn: use a per-domain variable for the max number of event channels")
+Signed-off-by: Jan Beulich <jbeulich%suse.com@localhost>
+Reviewed-by: Stefano Stabellini <sstabellini%kernel.org@localhost>
+Reviewed-by: Julien Grall <jgrall%amazon.com@localhost>
+---
+v5: New, split from larger patch.
+
+--- xen/include/xen/event.h.orig
++++ xen/include/xen/event.h
+@@ -107,8 +107,6 @@ void notify_via_xen_event_channel(struct
+
+ static inline bool_t port_is_valid(struct domain *d, unsigned int p)
+ {
+- if ( p >= d->max_evtchns )
+- return 0;
+ return p < read_atomic(&d->valid_evtchns);
+ }
+
Index: pkgsrc/sysutils/xenkernel411/patches/patch-XSA339
diff -u /dev/null pkgsrc/sysutils/xenkernel411/patches/patch-XSA339:1.1
--- /dev/null Fri Oct 2 13:00:48 2020
+++ pkgsrc/sysutils/xenkernel411/patches/patch-XSA339 Fri Oct 2 13:00:48 2020
@@ -0,0 +1,78 @@
+$NetBSD: patch-XSA339,v 1.1 2020/10/02 13:00:48 bouyer Exp $
+
+From: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Subject: x86/pv: Avoid double exception injection
+
+There is at least one path (SYSENTER with NT set, Xen converts to #GP) which
+ends up injecting the #GP fault twice, first in compat_sysenter(), and then a
+second time in compat_test_all_events(), due to the stale TBF_EXCEPTION left
+in TRAPBOUNCE_flags.
+
+The guest kernel sees the second fault first, which is a kernel level #GP
+pointing at the head of the #GP handler, and is therefore a userspace
+trigger-able DoS.
+
+This particular bug has bitten us several times before, so rearrange
+{compat_,}create_bounce_frame() to clobber TRAPBOUNCE on success, rather than
+leaving this task to one area of code which isn't used uniformly.
+
+Other scenarios which might result in a double injection (e.g. two calls
+directly to compat_create_bounce_frame) will now crash the guest, which is far
+more obvious than letting the kernel run with corrupt state.
+
+This is XSA-339
+
+Fixes: fdac9515607b ("x86: clear EFLAGS.NT in SYSENTER entry path")
+Signed-off-by: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Reviewed-by: Jan Beulich <jbeulich%suse.com@localhost>
+
+diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S
+index c3e62f8734..73619f57ca 100644
+--- xen/arch/x86/x86_64/compat/entry.S.orig
++++ xen/arch/x86/x86_64/compat/entry.S
+@@ -78,7 +78,6 @@ compat_process_softirqs:
+ sti
+ .Lcompat_bounce_exception:
+ call compat_create_bounce_frame
+- movb $0, TRAPBOUNCE_flags(%rdx)
+ jmp compat_test_all_events
+
+ ALIGN
+@@ -352,7 +351,13 @@ __UNLIKELY_END(compat_bounce_null_selector)
+ movl %eax,UREGS_cs+8(%rsp)
+ movl TRAPBOUNCE_eip(%rdx),%eax
+ movl %eax,UREGS_rip+8(%rsp)
++
++ /* Trapbounce complete. Clobber state to avoid an erroneous second injection. */
++ xor %eax, %eax
++ mov %ax, TRAPBOUNCE_cs(%rdx)
++ mov %al, TRAPBOUNCE_flags(%rdx)
+ ret
++
+ .section .fixup,"ax"
+ .Lfx13:
+ xorl %edi,%edi
+diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
+index 1e880eb9f6..71a00e846b 100644
+--- xen/arch/x86/x86_64/entry.S.orig
++++ xen/arch/x86/x86_64/entry.S
+@@ -90,7 +90,6 @@ process_softirqs:
+ sti
+ .Lbounce_exception:
+ call create_bounce_frame
+- movb $0, TRAPBOUNCE_flags(%rdx)
+ jmp test_all_events
+
+ ALIGN
+@@ -512,6 +511,11 @@ UNLIKELY_START(z, create_bounce_frame_bad_bounce_ip)
+ jmp asm_domain_crash_synchronous /* Does not return */
+ __UNLIKELY_END(create_bounce_frame_bad_bounce_ip)
+ movq %rax,UREGS_rip+8(%rsp)
++
++ /* Trapbounce complete. Clobber state to avoid an erroneous second injection. */
++ xor %eax, %eax
++ mov %rax, TRAPBOUNCE_eip(%rdx)
++ mov %al, TRAPBOUNCE_flags(%rdx)
+ ret
+
+ .pushsection .fixup, "ax", @progbits
Index: pkgsrc/sysutils/xenkernel411/patches/patch-XSA340
diff -u /dev/null pkgsrc/sysutils/xenkernel411/patches/patch-XSA340:1.1
--- /dev/null Fri Oct 2 13:00:48 2020
+++ pkgsrc/sysutils/xenkernel411/patches/patch-XSA340 Fri Oct 2 13:00:48 2020
@@ -0,0 +1,67 @@
+$NetBSD: patch-XSA340,v 1.1 2020/10/02 13:00:48 bouyer Exp $
+
+From: Julien Grall <jgrall%amazon.com@localhost>
+Subject: xen/evtchn: Add missing barriers when accessing/allocating an event channel
+
+While the allocation of a bucket is always performed with the per-domain
+lock, the bucket may be accessed without the lock taken (for instance, see
+evtchn_send()).
+
+Instead such sites relies on port_is_valid() to return a non-zero value
+when the port has a struct evtchn associated to it. The function will
+mostly check whether the port is less than d->valid_evtchns as all the
+buckets/event channels should be allocated up to that point.
+
+Unfortunately a compiler is free to re-order the assignment in
+evtchn_allocate_port() so it would be possible to have d->valid_evtchns
+updated before the new bucket has finish to allocate.
+
+Additionally on Arm, even if this was compiled "correctly", the
+processor can still re-order the memory access.
+
+Add a write memory barrier in the allocation side and a read memory
+barrier when the port is valid to prevent any re-ordering issue.
+
+This is XSA-340.
+
+Reported-by: Julien Grall <jgrall%amazon.com@localhost>
+Signed-off-by: Julien Grall <jgrall%amazon.com@localhost>
+Reviewed-by: Stefano Stabellini <sstabellini%kernel.org@localhost>
+
+--- xen/common/event_channel.c.orig
++++ xen/common/event_channel.c
+@@ -178,6 +178,13 @@ int evtchn_allocate_port(struct domain *
+ return -ENOMEM;
+ bucket_from_port(d, port) = chn;
+
++ /*
++ * d->valid_evtchns is used to check whether the bucket can be
++ * accessed without the per-domain lock. Therefore,
++ * d->valid_evtchns should be seen *after* the new bucket has
++ * been setup.
++ */
++ smp_wmb();
+ write_atomic(&d->valid_evtchns, d->valid_evtchns + EVTCHNS_PER_BUCKET);
+ }
+
+--- xen/include/xen/event.h.orig
++++ xen/include/xen/event.h
+@@ -107,7 +107,17 @@ void notify_via_xen_event_channel(struct
+
+ static inline bool_t port_is_valid(struct domain *d, unsigned int p)
+ {
+- return p < read_atomic(&d->valid_evtchns);
++ if ( p >= read_atomic(&d->valid_evtchns) )
++ return false;
++
++ /*
++ * The caller will usually access the event channel afterwards and
++ * may be done without taking the per-domain lock. The barrier is
++ * going in pair the smp_wmb() barrier in evtchn_allocate_port().
++ */
++ smp_rmb();
++
++ return true;
+ }
+
+ static inline struct evtchn *evtchn_from_port(struct domain *d, unsigned int p)
Index: pkgsrc/sysutils/xenkernel411/patches/patch-XSA342
diff -u /dev/null pkgsrc/sysutils/xenkernel411/patches/patch-XSA342:1.1
--- /dev/null Fri Oct 2 13:00:48 2020
+++ pkgsrc/sysutils/xenkernel411/patches/patch-XSA342 Fri Oct 2 13:00:48 2020
@@ -0,0 +1,147 @@
+$NetBSD: patch-XSA342,v 1.1 2020/10/02 13:00:48 bouyer Exp $
+
+From: Jan Beulich <jbeulich%suse.com@localhost>
+Subject: evtchn/x86: enforce correct upper limit for 32-bit guests
+
+The recording of d->max_evtchns in evtchn_2l_init(), in particular with
+the limited set of callers of the function, is insufficient. Neither for
+PV nor for HVM guests the bitness is known at domain_create() time, yet
+the upper bound in 2-level mode depends upon guest bitness. Recording
+too high a limit "allows" x86 32-bit domains to open not properly usable
+event channels, management of which (inside Xen) would then result in
+corruption of the shared info and vCPU info structures.
+
+Keep the upper limit dynamic for the 2-level case, introducing a helper
+function to retrieve the effective limit. This helper is now supposed to
+be private to the event channel code. The used in do_poll() and
+domain_dump_evtchn_info() weren't consistent with port uses elsewhere
+and hence get switched to port_is_valid().
+
+Furthermore FIFO mode's setup_ports() gets adjusted to loop only up to
+the prior ABI limit, rather than all the way up to the new one.
+
+Finally a word on the change to do_poll(): Accessing ->max_evtchns
+without holding a suitable lock was never safe, as it as well as
+->evtchn_port_ops may change behind do_poll()'s back. Using
+port_is_valid() instead widens some the window for potential abuse,
+until we've dealt with the race altogether (see XSA-343).
+
+This is XSA-342.
+
+Reported-by: Julien Grall <jgrall%amazon.com@localhost>
+Fixes: 48974e6ce52e ("evtchn: use a per-domain variable for the max number of event channels")
+Signed-off-by: Jan Beulich <jbeulich%suse.com@localhost>
+Reviewed-by: Stefano Stabellini <sstabellini%kernel.org@localhost>
+Reviewed-by: Julien Grall <jgrall%amazon.com@localhost>
+
+--- xen/common/event_2l.c.orig
++++ xen/common/event_2l.c
+@@ -103,7 +103,6 @@ static const struct evtchn_port_ops evtc
+ void evtchn_2l_init(struct domain *d)
+ {
+ d->evtchn_port_ops = &evtchn_port_ops_2l;
+- d->max_evtchns = BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d);
+ }
+
+ /*
+--- xen/common/event_channel.c.orig
++++ xen/common/event_channel.c
+@@ -151,7 +151,7 @@ static void free_evtchn_bucket(struct do
+
+ int evtchn_allocate_port(struct domain *d, evtchn_port_t port)
+ {
+- if ( port > d->max_evtchn_port || port >= d->max_evtchns )
++ if ( port > d->max_evtchn_port || port >= max_evtchns(d) )
+ return -ENOSPC;
+
+ if ( port_is_valid(d, port) )
+@@ -1396,13 +1396,11 @@ static void domain_dump_evtchn_info(stru
+
+ spin_lock(&d->event_lock);
+
+- for ( port = 1; port < d->max_evtchns; ++port )
++ for ( port = 1; port_is_valid(d, port); ++port )
+ {
+ const struct evtchn *chn;
+ char *ssid;
+
+- if ( !port_is_valid(d, port) )
+- continue;
+ chn = evtchn_from_port(d, port);
+ if ( chn->state == ECS_FREE )
+ continue;
+--- xen/common/event_fifo.c.orig
++++ xen/common/event_fifo.c
+@@ -478,7 +478,7 @@ static void cleanup_event_array(struct d
+ d->evtchn_fifo = NULL;
+ }
+
+-static void setup_ports(struct domain *d)
++static void setup_ports(struct domain *d, unsigned int prev_evtchns)
+ {
+ unsigned int port;
+
+@@ -488,7 +488,7 @@ static void setup_ports(struct domain *d
+ * - save its pending state.
+ * - set default priority.
+ */
+- for ( port = 1; port < d->max_evtchns; port++ )
++ for ( port = 1; port < prev_evtchns; port++ )
+ {
+ struct evtchn *evtchn;
+
+@@ -546,6 +546,8 @@ int evtchn_fifo_init_control(struct evtc
+ if ( !d->evtchn_fifo )
+ {
+ struct vcpu *vcb;
++ /* Latch the value before it changes during setup_event_array(). */
++ unsigned int prev_evtchns = max_evtchns(d);
+
+ for_each_vcpu ( d, vcb ) {
+ rc = setup_control_block(vcb);
+@@ -562,8 +564,7 @@ int evtchn_fifo_init_control(struct evtc
+ goto error;
+
+ d->evtchn_port_ops = &evtchn_port_ops_fifo;
+- d->max_evtchns = EVTCHN_FIFO_NR_CHANNELS;
+- setup_ports(d);
++ setup_ports(d, prev_evtchns);
+ }
+ else
+ rc = map_control_block(v, gfn, offset);
+--- xen/common/schedule.c.orig
++++ xen/common/schedule.c
+@@ -1434,7 +1434,7 @@ static long do_poll(struct sched_poll *s
+ goto out;
+
+ rc = -EINVAL;
+- if ( port >= d->max_evtchns )
++ if ( !port_is_valid(d, port) )
+ goto out;
+
+ rc = 0;
+--- xen/include/xen/event.h.orig
++++ xen/include/xen/event.h
+@@ -105,6 +105,12 @@ void notify_via_xen_event_channel(struct
+ #define bucket_from_port(d, p) \
+ ((group_from_port(d, p))[((p) % EVTCHNS_PER_GROUP) / EVTCHNS_PER_BUCKET])
+
++static inline unsigned int max_evtchns(const struct domain *d)
++{
++ return d->evtchn_fifo ? EVTCHN_FIFO_NR_CHANNELS
++ : BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d);
++}
++
+ static inline bool_t port_is_valid(struct domain *d, unsigned int p)
+ {
+ if ( p >= read_atomic(&d->valid_evtchns) )
+--- xen/include/xen/sched.h.orig
++++ xen/include/xen/sched.h
+@@ -382,7 +382,6 @@ struct domain
+ /* Event channel information. */
+ struct evtchn *evtchn; /* first bucket only */
+ struct evtchn **evtchn_group[NR_EVTCHN_GROUPS]; /* all other buckets */
+- unsigned int max_evtchns; /* number supported by ABI */
+ unsigned int max_evtchn_port; /* max permitted port number */
+ unsigned int valid_evtchns; /* number of allocated event channels */
+ spinlock_t event_lock;
Index: pkgsrc/sysutils/xenkernel411/patches/patch-XSA343
diff -u /dev/null pkgsrc/sysutils/xenkernel411/patches/patch-XSA343:1.1
--- /dev/null Fri Oct 2 13:00:48 2020
+++ pkgsrc/sysutils/xenkernel411/patches/patch-XSA343 Fri Oct 2 13:00:48 2020
@@ -0,0 +1,863 @@
+$NetBSD: patch-XSA343,v 1.1 2020/10/02 13:00:48 bouyer Exp $
+
+From: Jan Beulich <jbeulich%suse.com@localhost>
+Subject: evtchn: evtchn_reset() shouldn't succeed with still-open ports
+
+While the function closes all ports, it does so without holding any
+lock, and hence racing requests may be issued causing new ports to get
+opened. This would have been problematic in particular if such a newly
+opened port had a port number above the new implementation limit (i.e.
+when switching from FIFO to 2-level) after the reset, as prior to
+"evtchn: relax port_is_valid()" this could have led to e.g.
+evtchn_close()'s "BUG_ON(!port_is_valid(d2, port2))" to trigger.
+
+Introduce a counter of active ports and check that it's (still) no
+larger then the number of Xen internally used ones after obtaining the
+necessary lock in evtchn_reset().
+
+As to the access model of the new {active,xen}_evtchns fields - while
+all writes get done using write_atomic(), reads ought to use
+read_atomic() only when outside of a suitably locked region.
+
+Note that as of now evtchn_bind_virq() and evtchn_bind_ipi() don't have
+a need to call check_free_port().
+
+This is part of XSA-343.
+
+Signed-off-by: Jan Beulich <jbeulich%suse.com@localhost>
+Reviewed-by: Stefano Stabellini <sstabellini%kernel.org@localhost>
+Reviewed-by: Julien Grall <jgrall%amazon.com@localhost>
+
+--- xen/common/event_channel.c.orig
++++ xen/common/event_channel.c
+@@ -188,6 +188,8 @@ int evtchn_allocate_port(struct domain *
+ write_atomic(&d->valid_evtchns, d->valid_evtchns + EVTCHNS_PER_BUCKET);
+ }
+
++ write_atomic(&d->active_evtchns, d->active_evtchns + 1);
++
+ return 0;
+ }
+
+@@ -211,11 +213,26 @@ static int get_free_port(struct domain *
+ return -ENOSPC;
+ }
+
++/*
++ * Check whether a port is still marked free, and if so update the domain
++ * counter accordingly. To be used on function exit paths.
++ */
++static void check_free_port(struct domain *d, evtchn_port_t port)
++{
++ if ( port_is_valid(d, port) &&
++ evtchn_from_port(d, port)->state == ECS_FREE )
++ write_atomic(&d->active_evtchns, d->active_evtchns - 1);
++}
++
+ void evtchn_free(struct domain *d, struct evtchn *chn)
+ {
+ /* Clear pending event to avoid unexpected behavior on re-bind. */
+ evtchn_port_clear_pending(d, chn);
+
++ if ( consumer_is_xen(chn) )
++ write_atomic(&d->xen_evtchns, d->xen_evtchns - 1);
++ write_atomic(&d->active_evtchns, d->active_evtchns - 1);
++
+ /* Reset binding to vcpu0 when the channel is freed. */
+ chn->state = ECS_FREE;
+ chn->notify_vcpu_id = 0;
+@@ -258,6 +275,7 @@ static long evtchn_alloc_unbound(evtchn_
+ alloc->port = port;
+
+ out:
++ check_free_port(d, port);
+ spin_unlock(&d->event_lock);
+ rcu_unlock_domain(d);
+
+@@ -351,6 +369,7 @@ static long evtchn_bind_interdomain(evtc
+ bind->local_port = lport;
+
+ out:
++ check_free_port(ld, lport);
+ spin_unlock(&ld->event_lock);
+ if ( ld != rd )
+ spin_unlock(&rd->event_lock);
+@@ -484,7 +503,7 @@ static long evtchn_bind_pirq(evtchn_bind
+ struct domain *d = current->domain;
+ struct vcpu *v = d->vcpu[0];
+ struct pirq *info;
+- int port, pirq = bind->pirq;
++ int port = 0, pirq = bind->pirq;
+ long rc;
+
+ if ( (pirq < 0) || (pirq >= d->nr_pirqs) )
+@@ -532,6 +551,7 @@ static long evtchn_bind_pirq(evtchn_bind
+ arch_evtchn_bind_pirq(d, pirq);
+
+ out:
++ check_free_port(d, port);
+ spin_unlock(&d->event_lock);
+
+ return rc;
+@@ -1005,10 +1025,10 @@ int evtchn_unmask(unsigned int port)
+ return 0;
+ }
+
+-
+ int evtchn_reset(struct domain *d)
+ {
+ unsigned int i;
++ int rc = 0;
+
+ if ( d != current->domain && !d->controller_pause_count )
+ return -EINVAL;
+@@ -1018,7 +1038,9 @@ int evtchn_reset(struct domain *d)
+
+ spin_lock(&d->event_lock);
+
+- if ( d->evtchn_fifo )
++ if ( d->active_evtchns > d->xen_evtchns )
++ rc = -EAGAIN;
++ else if ( d->evtchn_fifo )
+ {
+ /* Switching back to 2-level ABI. */
+ evtchn_fifo_destroy(d);
+@@ -1027,7 +1049,7 @@ int evtchn_reset(struct domain *d)
+
+ spin_unlock(&d->event_lock);
+
+- return 0;
++ return rc;
+ }
+
+ static long evtchn_set_priority(const struct evtchn_set_priority *set_priority)
+@@ -1213,10 +1235,9 @@ int alloc_unbound_xen_event_channel(
+
+ spin_lock(&ld->event_lock);
+
+- rc = get_free_port(ld);
++ port = rc = get_free_port(ld);
+ if ( rc < 0 )
+ goto out;
+- port = rc;
+ chn = evtchn_from_port(ld, port);
+
+ rc = xsm_evtchn_unbound(XSM_TARGET, ld, chn, remote_domid);
+@@ -1232,7 +1253,10 @@ int alloc_unbound_xen_event_channel(
+
+ spin_unlock(&chn->lock);
+
++ write_atomic(&ld->xen_evtchns, ld->xen_evtchns + 1);
++
+ out:
++ check_free_port(ld, port);
+ spin_unlock(&ld->event_lock);
+
+ return rc < 0 ? rc : port;
+@@ -1308,6 +1332,7 @@ int evtchn_init(struct domain *d)
+ return -EINVAL;
+ }
+ evtchn_from_port(d, 0)->state = ECS_RESERVED;
++ write_atomic(&d->active_evtchns, 0);
+
+ #if MAX_VIRT_CPUS > BITS_PER_LONG
+ d->poll_mask = xzalloc_array(unsigned long,
+@@ -1335,6 +1360,8 @@ void evtchn_destroy(struct domain *d)
+ for ( i = 0; port_is_valid(d, i); i++ )
+ evtchn_close(d, i, 0);
+
++ ASSERT(!d->active_evtchns);
++
+ clear_global_virq_handlers(d);
+
+ evtchn_fifo_destroy(d);
+--- xen/include/xen/sched.h.orig
++++ xen/include/xen/sched.h
+@@ -345,6 +345,16 @@ struct domain
+ struct evtchn **evtchn_group[NR_EVTCHN_GROUPS]; /* all other buckets */
+ unsigned int max_evtchn_port; /* max permitted port number */
+ unsigned int valid_evtchns; /* number of allocated event channels */
++ /*
++ * Number of in-use event channels. Writers should use write_atomic().
++ * Readers need to use read_atomic() only when not holding event_lock.
++ */
++ unsigned int active_evtchns;
++ /*
++ * Number of event channels used internally by Xen (not subject to
++ * EVTCHNOP_reset). Read/write access like for active_evtchns.
++ */
++ unsigned int xen_evtchns;
+ spinlock_t event_lock;
+ const struct evtchn_port_ops *evtchn_port_ops;
+ struct evtchn_fifo_domain *evtchn_fifo;
+From: Jan Beulich <jbeulich%suse.com@localhost>
+Subject: evtchn: convert per-channel lock to be IRQ-safe
+
+... in order for send_guest_{global,vcpu}_virq() to be able to make use
+of it.
+
+This is part of XSA-343.
+
+Signed-off-by: Jan Beulich <jbeulich%suse.com@localhost>
+Acked-by: Julien Grall <jgrall%amazon.com@localhost>
+
+--- xen/common/event_channel.c.orig
++++ xen/common/event_channel.c
+@@ -248,6 +248,7 @@ static long evtchn_alloc_unbound(evtchn_
+ int port;
+ domid_t dom = alloc->dom;
+ long rc;
++ unsigned long flags;
+
+ d = rcu_lock_domain_by_any_id(dom);
+ if ( d == NULL )
+@@ -263,14 +264,14 @@ static long evtchn_alloc_unbound(evtchn_
+ if ( rc )
+ goto out;
+
+- spin_lock(&chn->lock);
++ spin_lock_irqsave(&chn->lock, flags);
+
+ chn->state = ECS_UNBOUND;
+ if ( (chn->u.unbound.remote_domid = alloc->remote_dom) == DOMID_SELF )
+ chn->u.unbound.remote_domid = current->domain->domain_id;
+ evtchn_port_init(d, chn);
+
+- spin_unlock(&chn->lock);
++ spin_unlock_irqrestore(&chn->lock, flags);
+
+ alloc->port = port;
+
+@@ -283,26 +284,32 @@ static long evtchn_alloc_unbound(evtchn_
+ }
+
+
+-static void double_evtchn_lock(struct evtchn *lchn, struct evtchn *rchn)
++static unsigned long double_evtchn_lock(struct evtchn *lchn,
++ struct evtchn *rchn)
+ {
+- if ( lchn < rchn )
++ unsigned long flags;
++
++ if ( lchn <= rchn )
+ {
+- spin_lock(&lchn->lock);
+- spin_lock(&rchn->lock);
++ spin_lock_irqsave(&lchn->lock, flags);
++ if ( lchn != rchn )
++ spin_lock(&rchn->lock);
+ }
+ else
+ {
+- if ( lchn != rchn )
+- spin_lock(&rchn->lock);
++ spin_lock_irqsave(&rchn->lock, flags);
+ spin_lock(&lchn->lock);
+ }
++
++ return flags;
+ }
+
+-static void double_evtchn_unlock(struct evtchn *lchn, struct evtchn *rchn)
++static void double_evtchn_unlock(struct evtchn *lchn, struct evtchn *rchn,
++ unsigned long flags)
+ {
+- spin_unlock(&lchn->lock);
+ if ( lchn != rchn )
+- spin_unlock(&rchn->lock);
++ spin_unlock(&lchn->lock);
++ spin_unlock_irqrestore(&rchn->lock, flags);
+ }
+
+ static long evtchn_bind_interdomain(evtchn_bind_interdomain_t *bind)
+@@ -312,6 +319,7 @@ static long evtchn_bind_interdomain(evtc
+ int lport, rport = bind->remote_port;
+ domid_t rdom = bind->remote_dom;
+ long rc;
++ unsigned long flags;
+
+ if ( rdom == DOMID_SELF )
+ rdom = current->domain->domain_id;
+@@ -347,7 +355,7 @@ static long evtchn_bind_interdomain(evtc
+ if ( rc )
+ goto out;
+
+- double_evtchn_lock(lchn, rchn);
++ flags = double_evtchn_lock(lchn, rchn);
+
+ lchn->u.interdomain.remote_dom = rd;
+ lchn->u.interdomain.remote_port = rport;
+@@ -364,7 +372,7 @@ static long evtchn_bind_interdomain(evtc
+ */
+ evtchn_port_set_pending(ld, lchn->notify_vcpu_id, lchn);
+
+- double_evtchn_unlock(lchn, rchn);
++ double_evtchn_unlock(lchn, rchn, flags);
+
+ bind->local_port = lport;
+
+@@ -387,6 +395,7 @@ int evtchn_bind_virq(evtchn_bind_virq_t
+ struct domain *d = current->domain;
+ int virq = bind->virq, vcpu = bind->vcpu;
+ int rc = 0;
++ unsigned long flags;
+
+ if ( (virq < 0) || (virq >= ARRAY_SIZE(v->virq_to_evtchn)) )
+ return -EINVAL;
+@@ -419,14 +428,14 @@ int evtchn_bind_virq(evtchn_bind_virq_t
+
+ chn = evtchn_from_port(d, port);
+
+- spin_lock(&chn->lock);
++ spin_lock_irqsave(&chn->lock, flags);
+
+ chn->state = ECS_VIRQ;
+ chn->notify_vcpu_id = vcpu;
+ chn->u.virq = virq;
+ evtchn_port_init(d, chn);
+
+- spin_unlock(&chn->lock);
++ spin_unlock_irqrestore(&chn->lock, flags);
+
+ v->virq_to_evtchn[virq] = bind->port = port;
+
+@@ -443,6 +452,7 @@ static long evtchn_bind_ipi(evtchn_bind_
+ struct domain *d = current->domain;
+ int port, vcpu = bind->vcpu;
+ long rc = 0;
++ unsigned long flags;
+
+ if ( (vcpu < 0) || (vcpu >= d->max_vcpus) ||
+ (d->vcpu[vcpu] == NULL) )
+@@ -455,13 +465,13 @@ static long evtchn_bind_ipi(evtchn_bind_
+
+ chn = evtchn_from_port(d, port);
+
+- spin_lock(&chn->lock);
++ spin_lock_irqsave(&chn->lock, flags);
+
+ chn->state = ECS_IPI;
+ chn->notify_vcpu_id = vcpu;
+ evtchn_port_init(d, chn);
+
+- spin_unlock(&chn->lock);
++ spin_unlock_irqrestore(&chn->lock, flags);
+
+ bind->port = port;
+
+@@ -505,6 +515,7 @@ static long evtchn_bind_pirq(evtchn_bind
+ struct pirq *info;
+ int port = 0, pirq = bind->pirq;
+ long rc;
++ unsigned long flags;
+
+ if ( (pirq < 0) || (pirq >= d->nr_pirqs) )
+ return -EINVAL;
+@@ -537,14 +548,14 @@ static long evtchn_bind_pirq(evtchn_bind
+ goto out;
+ }
+
+- spin_lock(&chn->lock);
++ spin_lock_irqsave(&chn->lock, flags);
+
+ chn->state = ECS_PIRQ;
+ chn->u.pirq.irq = pirq;
+ link_pirq_port(port, chn, v);
+ evtchn_port_init(d, chn);
+
+- spin_unlock(&chn->lock);
++ spin_unlock_irqrestore(&chn->lock, flags);
+
+ bind->port = port;
+
+@@ -565,6 +576,7 @@ int evtchn_close(struct domain *d1, int
+ struct evtchn *chn1, *chn2;
+ int port2;
+ long rc = 0;
++ unsigned long flags;
+
+ again:
+ spin_lock(&d1->event_lock);
+@@ -664,14 +676,14 @@ int evtchn_close(struct domain *d1, int
+ BUG_ON(chn2->state != ECS_INTERDOMAIN);
+ BUG_ON(chn2->u.interdomain.remote_dom != d1);
+
+- double_evtchn_lock(chn1, chn2);
++ flags = double_evtchn_lock(chn1, chn2);
+
+ evtchn_free(d1, chn1);
+
+ chn2->state = ECS_UNBOUND;
+ chn2->u.unbound.remote_domid = d1->domain_id;
+
+- double_evtchn_unlock(chn1, chn2);
++ double_evtchn_unlock(chn1, chn2, flags);
+
+ goto out;
+
+@@ -679,9 +691,9 @@ int evtchn_close(struct domain *d1, int
+ BUG();
+ }
+
+- spin_lock(&chn1->lock);
++ spin_lock_irqsave(&chn1->lock, flags);
+ evtchn_free(d1, chn1);
+- spin_unlock(&chn1->lock);
++ spin_unlock_irqrestore(&chn1->lock, flags);
+
+ out:
+ if ( d2 != NULL )
+@@ -701,13 +713,14 @@ int evtchn_send(struct domain *ld, unsig
+ struct evtchn *lchn, *rchn;
+ struct domain *rd;
+ int rport, ret = 0;
++ unsigned long flags;
+
+ if ( !port_is_valid(ld, lport) )
+ return -EINVAL;
+
+ lchn = evtchn_from_port(ld, lport);
+
+- spin_lock(&lchn->lock);
++ spin_lock_irqsave(&lchn->lock, flags);
+
+ /* Guest cannot send via a Xen-attached event channel. */
+ if ( unlikely(consumer_is_xen(lchn)) )
+@@ -742,7 +755,7 @@ int evtchn_send(struct domain *ld, unsig
+ }
+
+ out:
+- spin_unlock(&lchn->lock);
++ spin_unlock_irqrestore(&lchn->lock, flags);
+
+ return ret;
+ }
+@@ -1232,6 +1245,7 @@ int alloc_unbound_xen_event_channel(
+ {
+ struct evtchn *chn;
+ int port, rc;
++ unsigned long flags;
+
+ spin_lock(&ld->event_lock);
+
+@@ -1244,14 +1258,14 @@ int alloc_unbound_xen_event_channel(
+ if ( rc )
+ goto out;
+
+- spin_lock(&chn->lock);
++ spin_lock_irqsave(&chn->lock, flags);
+
+ chn->state = ECS_UNBOUND;
+ chn->xen_consumer = get_xen_consumer(notification_fn);
+ chn->notify_vcpu_id = lvcpu;
+ chn->u.unbound.remote_domid = remote_domid;
+
+- spin_unlock(&chn->lock);
++ spin_unlock_irqrestore(&chn->lock, flags);
+
+ write_atomic(&ld->xen_evtchns, ld->xen_evtchns + 1);
+
+@@ -1274,11 +1288,12 @@ void notify_via_xen_event_channel(struct
+ {
+ struct evtchn *lchn, *rchn;
+ struct domain *rd;
++ unsigned long flags;
+
+ ASSERT(port_is_valid(ld, lport));
+ lchn = evtchn_from_port(ld, lport);
+
+- spin_lock(&lchn->lock);
++ spin_lock_irqsave(&lchn->lock, flags);
+
+ if ( likely(lchn->state == ECS_INTERDOMAIN) )
+ {
+@@ -1288,7 +1303,7 @@ void notify_via_xen_event_channel(struct
+ evtchn_port_set_pending(rd, rchn->notify_vcpu_id, rchn);
+ }
+
+- spin_unlock(&lchn->lock);
++ spin_unlock_irqrestore(&lchn->lock, flags);
+ }
+
+ void evtchn_check_pollers(struct domain *d, unsigned int port)
+From: Jan Beulich <jbeulich%suse.com@localhost>
+Subject: evtchn: address races with evtchn_reset()
+
+Neither d->evtchn_port_ops nor max_evtchns(d) may be used in an entirely
+lock-less manner, as both may change by a racing evtchn_reset(). In the
+common case, at least one of the domain's event lock or the per-channel
+lock needs to be held. In the specific case of the inter-domain sending
+by evtchn_send() and notify_via_xen_event_channel() holding the other
+side's per-channel lock is sufficient, as the channel can't change state
+without both per-channel locks held. Without such a channel changing
+state, evtchn_reset() can't complete successfully.
+
+Lock-free accesses continue to be permitted for the shim (calling some
+otherwise internal event channel functions), as this happens while the
+domain is in effectively single-threaded mode. Special care also needs
+taking for the shim's marking of in-use ports as ECS_RESERVED (allowing
+use of such ports in the shim case is okay because switching into and
+hence also out of FIFO mode is impossible there).
+
+As a side effect, certain operations on Xen bound event channels which
+were mistakenly permitted so far (e.g. unmask or poll) will be refused
+now.
+
+This is part of XSA-343.
+
+Reported-by: Julien Grall <jgrall%amazon.com@localhost>
+Signed-off-by: Jan Beulich <jbeulich%suse.com@localhost>
+Acked-by: Julien Grall <jgrall%amazon.com@localhost>
+
+--- xen/arch/x86/irq.c.orig
++++ xen/arch/x86/irq.c
+@@ -2367,14 +2367,24 @@ static void dump_irqs(unsigned char key)
+
+ for ( i = 0; i < action->nr_guests; i++ )
+ {
++ struct evtchn *evtchn;
++ unsigned int pending = 2, masked = 2;
++
+ d = action->guest[i];
+ pirq = domain_irq_to_pirq(d, irq);
+ info = pirq_info(d, pirq);
++ evtchn = evtchn_from_port(d, info->evtchn);
++ local_irq_disable();
++ if ( spin_trylock(&evtchn->lock) )
++ {
++ pending = evtchn_is_pending(d, evtchn);
++ masked = evtchn_is_masked(d, evtchn);
++ spin_unlock(&evtchn->lock);
++ }
++ local_irq_enable();
+ printk("%u:%3d(%c%c%c)",
+- d->domain_id, pirq,
+- evtchn_port_is_pending(d, info->evtchn) ? 'P' : '-',
+- evtchn_port_is_masked(d, info->evtchn) ? 'M' : '-',
+- (info->masked ? 'M' : '-'));
++ d->domain_id, pirq, "-P?"[pending],
++ "-M?"[masked], info->masked ? 'M' : '-');
+ if ( i != action->nr_guests )
+ printk(",");
+ }
+--- xen/arch/x86/pv/shim.c.orig
++++ xen/arch/x86/pv/shim.c
+@@ -616,8 +616,11 @@ void pv_shim_inject_evtchn(unsigned int
+ if ( port_is_valid(guest, port) )
+ {
+ struct evtchn *chn = evtchn_from_port(guest, port);
++ unsigned long flags;
+
++ spin_lock_irqsave(&chn->lock, flags);
+ evtchn_port_set_pending(guest, chn->notify_vcpu_id, chn);
++ spin_unlock_irqrestore(&chn->lock, flags);
+ }
+ }
+
+--- xen/common/event_2l.c.orig
++++ xen/common/event_2l.c
+@@ -63,8 +63,10 @@ static void evtchn_2l_unmask(struct doma
+ }
+ }
+
+-static bool evtchn_2l_is_pending(const struct domain *d, evtchn_port_t port)
++static bool evtchn_2l_is_pending(const struct domain *d,
++ const struct evtchn *evtchn)
+ {
++ evtchn_port_t port = evtchn->port;
+ unsigned int max_ports = BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d);
+
+ ASSERT(port < max_ports);
+@@ -72,8 +74,10 @@ static bool evtchn_2l_is_pending(const s
+ guest_test_bit(d, port, &shared_info(d, evtchn_pending)));
+ }
+
+-static bool evtchn_2l_is_masked(const struct domain *d, evtchn_port_t port)
++static bool evtchn_2l_is_masked(const struct domain *d,
++ const struct evtchn *evtchn)
+ {
++ evtchn_port_t port = evtchn->port;
+ unsigned int max_ports = BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d);
+
+ ASSERT(port < max_ports);
+--- xen/common/event_channel.c.orig
++++ xen/common/event_channel.c
+@@ -156,8 +156,9 @@ int evtchn_allocate_port(struct domain *
+
+ if ( port_is_valid(d, port) )
+ {
+- if ( evtchn_from_port(d, port)->state != ECS_FREE ||
+- evtchn_port_is_busy(d, port) )
++ const struct evtchn *chn = evtchn_from_port(d, port);
++
++ if ( chn->state != ECS_FREE || evtchn_is_busy(d, chn) )
+ return -EBUSY;
+ }
+ else
+@@ -770,6 +771,7 @@ void send_guest_vcpu_virq(struct vcpu *v
+ unsigned long flags;
+ int port;
+ struct domain *d;
++ struct evtchn *chn;
+
+ ASSERT(!virq_is_global(virq));
+
+@@ -780,7 +782,10 @@ void send_guest_vcpu_virq(struct vcpu *v
+ goto out;
+
+ d = v->domain;
+- evtchn_port_set_pending(d, v->vcpu_id, evtchn_from_port(d, port));
++ chn = evtchn_from_port(d, port);
++ spin_lock(&chn->lock);
++ evtchn_port_set_pending(d, v->vcpu_id, chn);
++ spin_unlock(&chn->lock);
+
+ out:
+ spin_unlock_irqrestore(&v->virq_lock, flags);
+@@ -809,7 +814,9 @@ static void send_guest_global_virq(struc
+ goto out;
+
+ chn = evtchn_from_port(d, port);
++ spin_lock(&chn->lock);
+ evtchn_port_set_pending(d, chn->notify_vcpu_id, chn);
++ spin_unlock(&chn->lock);
+
+ out:
+ spin_unlock_irqrestore(&v->virq_lock, flags);
+@@ -819,6 +826,7 @@ void send_guest_pirq(struct domain *d, c
+ {
+ int port;
+ struct evtchn *chn;
++ unsigned long flags;
+
+ /*
+ * PV guests: It should not be possible to race with __evtchn_close(). The
+@@ -833,7 +841,9 @@ void send_guest_pirq(struct domain *d, c
+ }
+
+ chn = evtchn_from_port(d, port);
++ spin_lock_irqsave(&chn->lock, flags);
+ evtchn_port_set_pending(d, chn->notify_vcpu_id, chn);
++ spin_unlock_irqrestore(&chn->lock, flags);
+ }
+
+ static struct domain *global_virq_handlers[NR_VIRQS] __read_mostly;
+@@ -1028,12 +1038,15 @@ int evtchn_unmask(unsigned int port)
+ {
+ struct domain *d = current->domain;
+ struct evtchn *evtchn;
++ unsigned long flags;
+
+ if ( unlikely(!port_is_valid(d, port)) )
+ return -EINVAL;
+
+ evtchn = evtchn_from_port(d, port);
++ spin_lock_irqsave(&evtchn->lock, flags);
+ evtchn_port_unmask(d, evtchn);
++ spin_unlock_irqrestore(&evtchn->lock, flags);
+
+ return 0;
+ }
+@@ -1446,8 +1459,8 @@ static void domain_dump_evtchn_info(stru
+
+ printk(" %4u [%d/%d/",
+ port,
+- evtchn_port_is_pending(d, port),
+- evtchn_port_is_masked(d, port));
++ evtchn_is_pending(d, chn),
++ evtchn_is_masked(d, chn));
+ evtchn_port_print_state(d, chn);
+ printk("]: s=%d n=%d x=%d",
+ chn->state, chn->notify_vcpu_id, chn->xen_consumer);
+--- xen/common/event_fifo.c.orig
++++ xen/common/event_fifo.c
+@@ -295,23 +295,26 @@ static void evtchn_fifo_unmask(struct do
+ evtchn_fifo_set_pending(v, evtchn);
+ }
+
+-static bool evtchn_fifo_is_pending(const struct domain *d, evtchn_port_t port)
++static bool evtchn_fifo_is_pending(const struct domain *d,
++ const struct evtchn *evtchn)
+ {
+- const event_word_t *word = evtchn_fifo_word_from_port(d, port);
++ const event_word_t *word = evtchn_fifo_word_from_port(d, evtchn->port);
+
+ return word && guest_test_bit(d, EVTCHN_FIFO_PENDING, word);
+ }
+
+-static bool_t evtchn_fifo_is_masked(const struct domain *d, evtchn_port_t port)
++static bool_t evtchn_fifo_is_masked(const struct domain *d,
++ const struct evtchn *evtchn)
+ {
+- const event_word_t *word = evtchn_fifo_word_from_port(d, port);
++ const event_word_t *word = evtchn_fifo_word_from_port(d, evtchn->port);
+
+ return !word || guest_test_bit(d, EVTCHN_FIFO_MASKED, word);
+ }
+
+-static bool_t evtchn_fifo_is_busy(const struct domain *d, evtchn_port_t port)
++static bool_t evtchn_fifo_is_busy(const struct domain *d,
++ const struct evtchn *evtchn)
+ {
+- const event_word_t *word = evtchn_fifo_word_from_port(d, port);
++ const event_word_t *word = evtchn_fifo_word_from_port(d, evtchn->port);
+
+ return word && guest_test_bit(d, EVTCHN_FIFO_LINKED, word);
+ }
+--- xen/include/asm-x86/event.h.orig
++++ xen/include/asm-x86/event.h
+@@ -47,4 +47,10 @@ static inline bool arch_virq_is_global(u
+ return true;
+ }
+
++#ifdef CONFIG_PV_SHIM
++# include <asm/pv/shim.h>
++# define arch_evtchn_is_special(chn) \
++ (pv_shim && (chn)->port && (chn)->state == ECS_RESERVED)
++#endif
++
+ #endif
+--- xen/include/xen/event.h.orig
++++ xen/include/xen/event.h
+@@ -125,6 +125,24 @@ static inline struct evtchn *evtchn_from
+ return bucket_from_port(d, p) + (p % EVTCHNS_PER_BUCKET);
+ }
+
++/*
++ * "usable" as in "by a guest", i.e. Xen consumed channels are assumed to be
++ * taken care of separately where used for Xen's internal purposes.
++ */
++static bool evtchn_usable(const struct evtchn *evtchn)
++{
++ if ( evtchn->xen_consumer )
++ return false;
++
++#ifdef arch_evtchn_is_special
++ if ( arch_evtchn_is_special(evtchn) )
++ return true;
++#endif
++
++ BUILD_BUG_ON(ECS_FREE > ECS_RESERVED);
++ return evtchn->state > ECS_RESERVED;
++}
++
+ /* Wait on a Xen-attached event channel. */
+ #define wait_on_xen_event_channel(port, condition) \
+ do { \
+@@ -157,19 +175,24 @@ int evtchn_reset(struct domain *d);
+
+ /*
+ * Low-level event channel port ops.
++ *
++ * All hooks have to be called with a lock held which prevents the channel
++ * from changing state. This may be the domain event lock, the per-channel
++ * lock, or in the case of sending interdomain events also the other side's
++ * per-channel lock. Exceptions apply in certain cases for the PV shim.
+ */
+ struct evtchn_port_ops {
+ void (*init)(struct domain *d, struct evtchn *evtchn);
+ void (*set_pending)(struct vcpu *v, struct evtchn *evtchn);
+ void (*clear_pending)(struct domain *d, struct evtchn *evtchn);
+ void (*unmask)(struct domain *d, struct evtchn *evtchn);
+- bool (*is_pending)(const struct domain *d, evtchn_port_t port);
+- bool (*is_masked)(const struct domain *d, evtchn_port_t port);
++ bool (*is_pending)(const struct domain *d, const struct evtchn *evtchn);
++ bool (*is_masked)(const struct domain *d, const struct evtchn *evtchn);
+ /*
+ * Is the port unavailable because it's still being cleaned up
+ * after being closed?
+ */
+- bool (*is_busy)(const struct domain *d, evtchn_port_t port);
++ bool (*is_busy)(const struct domain *d, const struct evtchn *evtchn);
+ int (*set_priority)(struct domain *d, struct evtchn *evtchn,
+ unsigned int priority);
+ void (*print_state)(struct domain *d, const struct evtchn *evtchn);
+@@ -185,38 +208,67 @@ static inline void evtchn_port_set_pendi
+ unsigned int vcpu_id,
+ struct evtchn *evtchn)
+ {
+- d->evtchn_port_ops->set_pending(d->vcpu[vcpu_id], evtchn);
++ if ( evtchn_usable(evtchn) )
++ d->evtchn_port_ops->set_pending(d->vcpu[vcpu_id], evtchn);
+ }
+
+ static inline void evtchn_port_clear_pending(struct domain *d,
+ struct evtchn *evtchn)
+ {
+- d->evtchn_port_ops->clear_pending(d, evtchn);
++ if ( evtchn_usable(evtchn) )
++ d->evtchn_port_ops->clear_pending(d, evtchn);
+ }
+
+ static inline void evtchn_port_unmask(struct domain *d,
+ struct evtchn *evtchn)
+ {
+- d->evtchn_port_ops->unmask(d, evtchn);
++ if ( evtchn_usable(evtchn) )
++ d->evtchn_port_ops->unmask(d, evtchn);
+ }
+
+-static inline bool evtchn_port_is_pending(const struct domain *d,
+- evtchn_port_t port)
++static inline bool evtchn_is_pending(const struct domain *d,
++ const struct evtchn *evtchn)
+ {
+- return d->evtchn_port_ops->is_pending(d, port);
++ return evtchn_usable(evtchn) && d->evtchn_port_ops->is_pending(d, evtchn);
+ }
+
+-static inline bool evtchn_port_is_masked(const struct domain *d,
+- evtchn_port_t port)
++static inline bool evtchn_port_is_pending(struct domain *d, evtchn_port_t port)
+ {
+- return d->evtchn_port_ops->is_masked(d, port);
++ struct evtchn *evtchn = evtchn_from_port(d, port);
++ bool rc;
++ unsigned long flags;
++
++ spin_lock_irqsave(&evtchn->lock, flags);
++ rc = evtchn_is_pending(d, evtchn);
++ spin_unlock_irqrestore(&evtchn->lock, flags);
++
++ return rc;
++}
++
++static inline bool evtchn_is_masked(const struct domain *d,
++ const struct evtchn *evtchn)
++{
++ return !evtchn_usable(evtchn) || d->evtchn_port_ops->is_masked(d, evtchn);
++}
++
++static inline bool evtchn_port_is_masked(struct domain *d, evtchn_port_t port)
++{
++ struct evtchn *evtchn = evtchn_from_port(d, port);
++ bool rc;
++ unsigned long flags;
++
++ spin_lock_irqsave(&evtchn->lock, flags);
++ rc = evtchn_is_masked(d, evtchn);
++ spin_unlock_irqrestore(&evtchn->lock, flags);
++
++ return rc;
+ }
+
+-static inline bool evtchn_port_is_busy(const struct domain *d,
+- evtchn_port_t port)
++static inline bool evtchn_is_busy(const struct domain *d,
++ const struct evtchn *evtchn)
+ {
+ return d->evtchn_port_ops->is_busy &&
+- d->evtchn_port_ops->is_busy(d, port);
++ d->evtchn_port_ops->is_busy(d, evtchn);
+ }
+
+ static inline int evtchn_port_set_priority(struct domain *d,
+@@ -225,6 +277,8 @@ static inline int evtchn_port_set_priori
+ {
+ if ( !d->evtchn_port_ops->set_priority )
+ return -ENOSYS;
++ if ( !evtchn_usable(evtchn) )
++ return -EACCES;
+ return d->evtchn_port_ops->set_priority(d, evtchn, priority);
+ }
+
Index: pkgsrc/sysutils/xenkernel411/patches/patch-XSA344
diff -u /dev/null pkgsrc/sysutils/xenkernel411/patches/patch-XSA344:1.1
--- /dev/null Fri Oct 2 13:00:48 2020
+++ pkgsrc/sysutils/xenkernel411/patches/patch-XSA344 Fri Oct 2 13:00:48 2020
@@ -0,0 +1,337 @@
+$NetBSD: patch-XSA344,v 1.1 2020/10/02 13:00:48 bouyer Exp $
+
+From: Jan Beulich <jbeulich%suse.com@localhost>
+Subject: evtchn: arrange for preemption in evtchn_destroy()
+
+Especially closing of fully established interdomain channels can take
+quite some time, due to the locking involved. Therefore we shouldn't
+assume we can clean up still active ports all in one go. Besides adding
+the necessary preemption check, also avoid pointlessly starting from
+(or now really ending at) 0; 1 is the lowest numbered port which may
+need closing.
+
+Since we're now reducing ->valid_evtchns, free_xen_event_channel(),
+and (at least to be on the safe side) notify_via_xen_event_channel()
+need to cope with attempts to close / unbind from / send through already
+closed (and no longer valid, as per port_is_valid()) ports.
+
+This is part of XSA-344.
+
+Signed-off-by: Jan Beulich <jbeulich%suse.com@localhost>
+Acked-by: Julien Grall <jgrall%amazon.com@localhost>
+Reviewed-by: Stefano Stabellini <sstabellini%kernel.org@localhost>
+
+--- xen/common/domain.c.orig
++++ xen/common/domain.c
+@@ -646,7 +646,6 @@ int domain_kill(struct domain *d)
+ if ( d->is_dying != DOMDYING_alive )
+ return domain_kill(d);
+ d->is_dying = DOMDYING_dying;
+- evtchn_destroy(d);
+ gnttab_release_mappings(d);
+ tmem_destroy(d->tmem_client);
+ vnuma_destroy(d->vnuma);
+@@ -654,6 +653,9 @@ int domain_kill(struct domain *d)
+ d->tmem_client = NULL;
+ /* fallthrough */
+ case DOMDYING_dying:
++ rc = evtchn_destroy(d);
++ if ( rc )
++ break;
+ rc = domain_relinquish_resources(d);
+ if ( rc != 0 )
+ break;
+--- xen/common/event_channel.c.orig
++++ xen/common/event_channel.c
+@@ -1291,7 +1291,16 @@ int alloc_unbound_xen_event_channel(
+
+ void free_xen_event_channel(struct domain *d, int port)
+ {
+- BUG_ON(!port_is_valid(d, port));
++ if ( !port_is_valid(d, port) )
++ {
++ /*
++ * Make sure ->is_dying is read /after/ ->valid_evtchns, pairing
++ * with the spin_barrier() and BUG_ON() in evtchn_destroy().
++ */
++ smp_rmb();
++ BUG_ON(!d->is_dying);
++ return;
++ }
+
+ evtchn_close(d, port, 0);
+ }
+@@ -1303,7 +1312,17 @@ void notify_via_xen_event_channel(struct
+ struct domain *rd;
+ unsigned long flags;
+
+- ASSERT(port_is_valid(ld, lport));
++ if ( !port_is_valid(ld, lport) )
++ {
++ /*
++ * Make sure ->is_dying is read /after/ ->valid_evtchns, pairing
++ * with the spin_barrier() and BUG_ON() in evtchn_destroy().
++ */
++ smp_rmb();
++ ASSERT(ld->is_dying);
++ return;
++ }
++
+ lchn = evtchn_from_port(ld, lport);
+
+ spin_lock_irqsave(&lchn->lock, flags);
+@@ -1375,8 +1394,7 @@ int evtchn_init(struct domain *d)
+ return 0;
+ }
+
+-
+-void evtchn_destroy(struct domain *d)
++int evtchn_destroy(struct domain *d)
+ {
+ unsigned int i;
+
+@@ -1385,14 +1403,29 @@ void evtchn_destroy(struct domain *d)
+ spin_barrier(&d->event_lock);
+
+ /* Close all existing event channels. */
+- for ( i = 0; port_is_valid(d, i); i++ )
++ for ( i = d->valid_evtchns; --i; )
++ {
+ evtchn_close(d, i, 0);
+
++ /*
++ * Avoid preempting when called from domain_create()'s error path,
++ * and don't check too often (choice of frequency is arbitrary).
++ */
++ if ( i && !(i & 0x3f) && d->is_dying != DOMDYING_dead &&
++ hypercall_preempt_check() )
++ {
++ write_atomic(&d->valid_evtchns, i);
++ return -ERESTART;
++ }
++ }
++
+ ASSERT(!d->active_evtchns);
+
+ clear_global_virq_handlers(d);
+
+ evtchn_fifo_destroy(d);
++
++ return 0;
+ }
+
+
+--- xen/include/xen/sched.h.orig
++++ xen/include/xen/sched.h
+@@ -135,7 +135,7 @@ struct evtchn
+ } __attribute__((aligned(64)));
+
+ int evtchn_init(struct domain *d); /* from domain_create */
+-void evtchn_destroy(struct domain *d); /* from domain_kill */
++int evtchn_destroy(struct domain *d); /* from domain_kill */
+ void evtchn_destroy_final(struct domain *d); /* from complete_domain_destroy */
+
+ struct waitqueue_vcpu;
+From: Jan Beulich <jbeulich%suse.com@localhost>
+Subject: evtchn: arrange for preemption in evtchn_reset()
+
+Like for evtchn_destroy() looping over all possible event channels to
+close them can take a significant amount of time. Unlike done there, we
+can't alter domain properties (i.e. d->valid_evtchns) here. Borrow, in a
+lightweight form, the paging domctl continuation concept, redirecting
+the continuations to different sub-ops. Just like there this is to be
+able to allow for predictable overall results of the involved sub-ops:
+Racing requests should either complete or be refused.
+
+Note that a domain can't interfere with an already started (by a remote
+domain) reset, due to being paused. It can prevent a remote reset from
+happening by leaving a reset unfinished, but that's only going to affect
+itself.
+
+This is part of XSA-344.
+
+Signed-off-by: Jan Beulich <jbeulich%suse.com@localhost>
+Acked-by: Julien Grall <jgrall%amazon.com@localhost>
+Reviewed-by: Stefano Stabellini <sstabellini%kernel.org@localhost>
+
+--- xen/common/domain.c.orig
++++ xen/common/domain.c
+@@ -1105,7 +1105,7 @@ void domain_unpause_except_self(struct d
+ domain_unpause(d);
+ }
+
+-int domain_soft_reset(struct domain *d)
++int domain_soft_reset(struct domain *d, bool resuming)
+ {
+ struct vcpu *v;
+ int rc;
+@@ -1119,7 +1119,7 @@ int domain_soft_reset(struct domain *d)
+ }
+ spin_unlock(&d->shutdown_lock);
+
+- rc = evtchn_reset(d);
++ rc = evtchn_reset(d, resuming);
+ if ( rc )
+ return rc;
+
+--- xen/common/domctl.c.orig
++++ xen/common/domctl.c
+@@ -648,12 +648,22 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
+ }
+
+ case XEN_DOMCTL_soft_reset:
++ case XEN_DOMCTL_soft_reset_cont:
+ if ( d == current->domain ) /* no domain_pause() */
+ {
+ ret = -EINVAL;
+ break;
+ }
+- ret = domain_soft_reset(d);
++ ret = domain_soft_reset(d, op->cmd == XEN_DOMCTL_soft_reset_cont);
++ if ( ret == -ERESTART )
++ {
++ op->cmd = XEN_DOMCTL_soft_reset_cont;
++ if ( !__copy_field_to_guest(u_domctl, op, cmd) )
++ ret = hypercall_create_continuation(__HYPERVISOR_domctl,
++ "h", u_domctl);
++ else
++ ret = -EFAULT;
++ }
+ break;
+
+ case XEN_DOMCTL_destroydomain:
+--- xen/common/event_channel.c.orig
++++ xen/common/event_channel.c
+@@ -1051,7 +1051,7 @@ int evtchn_unmask(unsigned int port)
+ return 0;
+ }
+
+-int evtchn_reset(struct domain *d)
++int evtchn_reset(struct domain *d, bool resuming)
+ {
+ unsigned int i;
+ int rc = 0;
+@@ -1059,11 +1059,40 @@ int evtchn_reset(struct domain *d)
+ if ( d != current->domain && !d->controller_pause_count )
+ return -EINVAL;
+
+- for ( i = 0; port_is_valid(d, i); i++ )
++ spin_lock(&d->event_lock);
++
++ /*
++ * If we are resuming, then start where we stopped. Otherwise, check
++ * that a reset operation is not already in progress, and if none is,
++ * record that this is now the case.
++ */
++ i = resuming ? d->next_evtchn : !d->next_evtchn;
++ if ( i > d->next_evtchn )
++ d->next_evtchn = i;
++
++ spin_unlock(&d->event_lock);
++
++ if ( !i )
++ return -EBUSY;
++
++ for ( ; port_is_valid(d, i); i++ )
++ {
+ evtchn_close(d, i, 1);
+
++ /* NB: Choice of frequency is arbitrary. */
++ if ( !(i & 0x3f) && hypercall_preempt_check() )
++ {
++ spin_lock(&d->event_lock);
++ d->next_evtchn = i;
++ spin_unlock(&d->event_lock);
++ return -ERESTART;
++ }
++ }
++
+ spin_lock(&d->event_lock);
+
++ d->next_evtchn = 0;
++
+ if ( d->active_evtchns > d->xen_evtchns )
+ rc = -EAGAIN;
+ else if ( d->evtchn_fifo )
+@@ -1198,7 +1227,8 @@ long do_event_channel_op(int cmd, XEN_GU
+ break;
+ }
+
+- case EVTCHNOP_reset: {
++ case EVTCHNOP_reset:
++ case EVTCHNOP_reset_cont: {
+ struct evtchn_reset reset;
+ struct domain *d;
+
+@@ -1211,9 +1241,13 @@ long do_event_channel_op(int cmd, XEN_GU
+
+ rc = xsm_evtchn_reset(XSM_TARGET, current->domain, d);
+ if ( !rc )
+- rc = evtchn_reset(d);
++ rc = evtchn_reset(d, cmd == EVTCHNOP_reset_cont);
+
+ rcu_unlock_domain(d);
++
++ if ( rc == -ERESTART )
++ rc = hypercall_create_continuation(__HYPERVISOR_event_channel_op,
++ "ih", EVTCHNOP_reset_cont, arg);
+ break;
+ }
+
+--- xen/include/public/domctl.h.orig
++++ xen/include/public/domctl.h
+@@ -1121,7 +1121,10 @@ struct xen_domctl {
+ #define XEN_DOMCTL_iomem_permission 20
+ #define XEN_DOMCTL_ioport_permission 21
+ #define XEN_DOMCTL_hypercall_init 22
+-#define XEN_DOMCTL_arch_setup 23 /* Obsolete IA64 only */
++#ifdef __XEN__
++/* #define XEN_DOMCTL_arch_setup 23 Obsolete IA64 only */
++#define XEN_DOMCTL_soft_reset_cont 23
++#endif
+ #define XEN_DOMCTL_settimeoffset 24
+ #define XEN_DOMCTL_getvcpuaffinity 25
+ #define XEN_DOMCTL_real_mode_area 26 /* Obsolete PPC only */
+--- xen/include/public/event_channel.h.orig
++++ xen/include/public/event_channel.h
+@@ -74,6 +74,9 @@
+ #define EVTCHNOP_init_control 11
+ #define EVTCHNOP_expand_array 12
+ #define EVTCHNOP_set_priority 13
++#ifdef __XEN__
++#define EVTCHNOP_reset_cont 14
++#endif
+ /* ` } */
+
+ typedef uint32_t evtchn_port_t;
+--- xen/include/xen/event.h.orig
++++ xen/include/xen/event.h
+@@ -163,7 +163,7 @@ void evtchn_check_pollers(struct domain
+ void evtchn_2l_init(struct domain *d);
+
+ /* Close all event channels and reset to 2-level ABI. */
+-int evtchn_reset(struct domain *d);
++int evtchn_reset(struct domain *d, bool resuming);
+
+ /*
+ * Low-level event channel port ops.
+--- xen/include/xen/sched.h.orig
++++ xen/include/xen/sched.h
+@@ -355,6 +355,8 @@ struct domain
+ * EVTCHNOP_reset). Read/write access like for active_evtchns.
+ */
+ unsigned int xen_evtchns;
++ /* Port to resume from in evtchn_reset(), when in a continuation. */
++ unsigned int next_evtchn;
+ spinlock_t event_lock;
+ const struct evtchn_port_ops *evtchn_port_ops;
+ struct evtchn_fifo_domain *evtchn_fifo;
+@@ -608,7 +610,7 @@ int domain_shutdown(struct domain *d, u8
+ void domain_resume(struct domain *d);
+ void domain_pause_for_debugger(void);
+
+-int domain_soft_reset(struct domain *d);
++int domain_soft_reset(struct domain *d, bool resuming);
+
+ int vcpu_start_shutdown_deferral(struct vcpu *v);
+ void vcpu_end_shutdown_deferral(struct vcpu *v);
Home |
Main Index |
Thread Index |
Old Index