pkgsrc-Changes archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
CVS commit: pkgsrc/sysutils
Module Name: pkgsrc
Committed By: bouyer
Date: Wed Nov 15 15:59:36 UTC 2023
Modified Files:
pkgsrc/sysutils/xenkernel415: Makefile distinfo
pkgsrc/sysutils/xentools415: Makefile PLIST distinfo
Added Files:
pkgsrc/sysutils/xenkernel415/patches: patch-XSA439 patch-XSA442
patch-XSA444 patch-XSA445 patch-XSA446
pkgsrc/sysutils/xentools415/patches: patch-XSA440 patch-XSA443
Log Message:
xen*415: apply upstream patches for Xen Security Advisory
XSA-439, XSA-440, XSA-442, XSA-443, XSA-444, XSA-445, XSA-446
bump PKGREVISIONs
To generate a diff of this commit:
cvs rdiff -u -r1.11 -r1.12 pkgsrc/sysutils/xenkernel415/Makefile
cvs rdiff -u -r1.10 -r1.11 pkgsrc/sysutils/xenkernel415/distinfo
cvs rdiff -u -r0 -r1.1 pkgsrc/sysutils/xenkernel415/patches/patch-XSA439 \
pkgsrc/sysutils/xenkernel415/patches/patch-XSA442 \
pkgsrc/sysutils/xenkernel415/patches/patch-XSA444 \
pkgsrc/sysutils/xenkernel415/patches/patch-XSA445 \
pkgsrc/sysutils/xenkernel415/patches/patch-XSA446
cvs rdiff -u -r1.27 -r1.28 pkgsrc/sysutils/xentools415/Makefile
cvs rdiff -u -r1.3 -r1.4 pkgsrc/sysutils/xentools415/PLIST
cvs rdiff -u -r1.13 -r1.14 pkgsrc/sysutils/xentools415/distinfo
cvs rdiff -u -r0 -r1.1 pkgsrc/sysutils/xentools415/patches/patch-XSA440 \
pkgsrc/sysutils/xentools415/patches/patch-XSA443
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:
Index: pkgsrc/sysutils/xenkernel415/Makefile
diff -u pkgsrc/sysutils/xenkernel415/Makefile:1.11 pkgsrc/sysutils/xenkernel415/Makefile:1.12
--- pkgsrc/sysutils/xenkernel415/Makefile:1.11 Thu Sep 21 10:39:45 2023
+++ pkgsrc/sysutils/xenkernel415/Makefile Wed Nov 15 15:59:36 2023
@@ -1,9 +1,9 @@
-# $NetBSD: Makefile,v 1.11 2023/09/21 10:39:45 bouyer Exp $
+# $NetBSD: Makefile,v 1.12 2023/11/15 15:59:36 bouyer Exp $
VERSION= 4.15.5
DISTNAME= xen-${VERSION}
PKGNAME= xenkernel415-${VERSION}
-PKGREVISION= 1
+PKGREVISION= 2
CATEGORIES= sysutils
MASTER_SITES= https://downloads.xenproject.org/release/xen/${VERSION}/
DIST_SUBDIR= xen415
Index: pkgsrc/sysutils/xenkernel415/distinfo
diff -u pkgsrc/sysutils/xenkernel415/distinfo:1.10 pkgsrc/sysutils/xenkernel415/distinfo:1.11
--- pkgsrc/sysutils/xenkernel415/distinfo:1.10 Thu Sep 21 10:39:45 2023
+++ pkgsrc/sysutils/xenkernel415/distinfo Wed Nov 15 15:59:36 2023
@@ -1,10 +1,15 @@
-$NetBSD: distinfo,v 1.10 2023/09/21 10:39:45 bouyer Exp $
+$NetBSD: distinfo,v 1.11 2023/11/15 15:59:36 bouyer Exp $
BLAKE2s (xen415/xen-4.15.5.tar.gz) = 85bef27c99fd9fd3037ec6df5e514289b650f2f073bcc543d13d5997c03332d4
SHA512 (xen415/xen-4.15.5.tar.gz) = 790f3d75df78f63f5b2ce3b99c1f2287f75ef5571d1b7a9bb9bac470bd28ccbd4816d07a1af8320eee4107626c75be029bd6dad1d99d58f3816906ed98d206d9
Size (xen415/xen-4.15.5.tar.gz) = 40835793 bytes
SHA1 (patch-Config.mk) = 9372a09efd05c9fbdbc06f8121e411fcb7c7ba65
SHA1 (patch-XSA438) = a8288bbbe8ffe799cebbf6bb184b1a2b59b59089
+SHA1 (patch-XSA439) = 5284e7801ed379aaac3c12dafc32283567bddd95
+SHA1 (patch-XSA442) = 170d94ed89a0d9ab210052fef0c8ae41a426374c
+SHA1 (patch-XSA444) = 5da1c79e811bebf5fee8416b00f76cfbc3946701
+SHA1 (patch-XSA445) = 85990f0ecd529b0c0b4cd9ab422d305bb94ae4b9
+SHA1 (patch-XSA446) = 7271a9afc134cca8c42e7a284f1761ddce2ac5ca
SHA1 (patch-xen_Makefile) = 465388d80de414ca3bb84faefa0f52d817e423a6
SHA1 (patch-xen_Rules.mk) = c743dc63f51fc280d529a7d9e08650292c171dac
SHA1 (patch-xen_arch_x86_Kconfig) = df14bfa09b9a0008ca59d53c938d43a644822dd9
Index: pkgsrc/sysutils/xentools415/Makefile
diff -u pkgsrc/sysutils/xentools415/Makefile:1.27 pkgsrc/sysutils/xentools415/Makefile:1.28
--- pkgsrc/sysutils/xentools415/Makefile:1.27 Mon Oct 23 06:37:53 2023
+++ pkgsrc/sysutils/xentools415/Makefile Wed Nov 15 15:59:36 2023
@@ -1,7 +1,7 @@
-# $NetBSD: Makefile,v 1.27 2023/10/23 06:37:53 wiz Exp $
+# $NetBSD: Makefile,v 1.28 2023/11/15 15:59:36 bouyer Exp $
#
# VERSION is set in version.mk as it is shared with other packages
-PKGREVISION= 1
+PKGREVISION= 2
.include "version.mk"
PKGNAME= xentools415-${VERSION}
Index: pkgsrc/sysutils/xentools415/PLIST
diff -u pkgsrc/sysutils/xentools415/PLIST:1.3 pkgsrc/sysutils/xentools415/PLIST:1.4
--- pkgsrc/sysutils/xentools415/PLIST:1.3 Sat Jan 15 17:44:35 2022
+++ pkgsrc/sysutils/xentools415/PLIST Wed Nov 15 15:59:36 2023
@@ -1,4 +1,4 @@
-@comment $NetBSD: PLIST,v 1.3 2022/01/15 17:44:35 wiz Exp $
+@comment $NetBSD: PLIST,v 1.4 2023/11/15 15:59:36 bouyer Exp $
${PYSITELIB}/grub/ExtLinuxConf.py
${PYSITELIB}/grub/ExtLinuxConf.pyc
${PYSITELIB}/grub/GrubConf.py
@@ -7,10 +7,10 @@ ${PYSITELIB}/grub/LiloConf.py
${PYSITELIB}/grub/LiloConf.pyc
${PYSITELIB}/grub/__init__.py
${PYSITELIB}/grub/__init__.pyc
-${PYSITELIB}/pygrub-0.6-py${PYVERSSUFFIX}.egg-info/PKG-INFO
-${PYSITELIB}/pygrub-0.6-py${PYVERSSUFFIX}.egg-info/SOURCES.txt
-${PYSITELIB}/pygrub-0.6-py${PYVERSSUFFIX}.egg-info/dependency_links.txt
-${PYSITELIB}/pygrub-0.6-py${PYVERSSUFFIX}.egg-info/top_level.txt
+${PYSITELIB}/pygrub-0.7-py${PYVERSSUFFIX}.egg-info/PKG-INFO
+${PYSITELIB}/pygrub-0.7-py${PYVERSSUFFIX}.egg-info/SOURCES.txt
+${PYSITELIB}/pygrub-0.7-py${PYVERSSUFFIX}.egg-info/dependency_links.txt
+${PYSITELIB}/pygrub-0.7-py${PYVERSSUFFIX}.egg-info/top_level.txt
${PYSITELIB}/xen-3.0-py${PYVERSSUFFIX}.egg-info/PKG-INFO
${PYSITELIB}/xen-3.0-py${PYVERSSUFFIX}.egg-info/SOURCES.txt
${PYSITELIB}/xen-3.0-py${PYVERSSUFFIX}.egg-info/dependency_links.txt
Index: pkgsrc/sysutils/xentools415/distinfo
diff -u pkgsrc/sysutils/xentools415/distinfo:1.13 pkgsrc/sysutils/xentools415/distinfo:1.14
--- pkgsrc/sysutils/xentools415/distinfo:1.13 Thu Aug 24 10:27:09 2023
+++ pkgsrc/sysutils/xentools415/distinfo Wed Nov 15 15:59:36 2023
@@ -1,4 +1,4 @@
-$NetBSD: distinfo,v 1.13 2023/08/24 10:27:09 bouyer Exp $
+$NetBSD: distinfo,v 1.14 2023/11/15 15:59:36 bouyer Exp $
BLAKE2s (xen415/ipxe-988d2c13cdf0f0b4140685af35ced70ac5b3283c.tar.gz) = 67ded947316100f4f66fa61fe156baf1620db575450f4dc0dd8dcb323e57970b
SHA512 (xen415/ipxe-988d2c13cdf0f0b4140685af35ced70ac5b3283c.tar.gz) = d888e0e653727ee9895fa866d8895e6d23a568b4e9e8439db4c4d790996700c60b0655e3a3129e599736ec2b4f7b987ce79d625ba208f06665fced8bddf94403
@@ -12,6 +12,8 @@ Size (xen415/xen-4.15.5.tar.gz) = 408357
SHA1 (patch-.._seabios-rel-1.16.0_src_string.c) = e82f2f16a236a3b878c07b4fb655998591717a73
SHA1 (patch-Config.mk) = d108a1743b5b5313d3ea957b02a005b49f5b3bf6
SHA1 (patch-Makefile) = 6c580cbea532d08a38cf5e54228bd0210a98da21
+SHA1 (patch-XSA440) = 92c21a9caab0292082799e357725345ac676db9e
+SHA1 (patch-XSA443) = 53ea19eb131c3a83b9ab586fc6632fa3704e4fc0
SHA1 (patch-docs_man_xl.1.pod.in) = 280a3717b9f15578d90f85392249ef97844b6765
SHA1 (patch-docs_man_xl.cfg.5.pod.in) = 5970961552f29c4536a884161a208a27a20dccf4
SHA1 (patch-docs_man_xlcpupool.cfg.5.pod) = ab3a2529cd10458948557fd7ab032e80df8b1d81
Added files:
Index: pkgsrc/sysutils/xenkernel415/patches/patch-XSA439
diff -u /dev/null pkgsrc/sysutils/xenkernel415/patches/patch-XSA439:1.1
--- /dev/null Wed Nov 15 15:59:36 2023
+++ pkgsrc/sysutils/xenkernel415/patches/patch-XSA439 Wed Nov 15 15:59:36 2023
@@ -0,0 +1,255 @@
+$NetBSD: patch-XSA439,v 1.1 2023/11/15 15:59:36 bouyer Exp $
+
+From d7b78041dc819efde0350f27754a61cb01a93496 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Date: Wed, 30 Aug 2023 20:24:25 +0100
+Subject: [PATCH 1/1] x86/spec-ctrl: Mitigate the Zen1 DIV leakage
+
+In the Zen1 microarchitecture, there is one divider in the pipeline which
+services uops from both threads. In the case of #DE, the latched result from
+the previous DIV to execute will be forwarded speculatively.
+
+This is an interesting covert channel that allows two threads to communicate
+without any system calls. It also allows userspace to obtain the result of
+the most recent DIV instruction executed (even speculatively) in the core,
+which can be from a higher privilege context.
+
+Scrub the result from the divider by executing a non-faulting divide. This
+needs performing on the exit-to-guest paths, and ist_exit-to-Xen.
+
+Alternatives in IST context is believed safe now that it's done in NMI
+context.
+
+This is XSA-439 / CVE-2023-20588.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Reviewed-by: Jan Beulich <jbeulich%suse.com@localhost>
+(cherry picked from commit b5926c6ecf05c28ee99c6248c42d691ccbf0c315)
+---
+ docs/misc/xen-command-line.pandoc | 6 +++-
+ xen/arch/x86/hvm/svm/entry.S | 1 +
+ xen/arch/x86/spec_ctrl.c | 49 ++++++++++++++++++++++++++++-
+ xen/include/asm-x86/cpufeatures.h | 3 +-
+ xen/include/asm-x86/spec_ctrl_asm.h | 17 ++++++++++
+ 5 files changed, 73 insertions(+), 3 deletions(-)
+
+diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
+index 16a61ad858..f3d1009f2d 100644
+--- docs/misc/xen-command-line.pandoc.orig
++++ docs/misc/xen-command-line.pandoc
+@@ -2189,7 +2189,7 @@ By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`).
+ > {msr-sc,rsb,md-clear,ibpb-entry}=<bool>|{pv,hvm}=<bool>,
+ > bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,psfd,
+ > eager-fpu,l1d-flush,branch-harden,srb-lock,
+-> unpriv-mmio,gds-mit}=<bool> ]`
++> unpriv-mmio,gds-mit,div-scrub}=<bool> ]`
+
+ Controls for speculative execution sidechannel mitigations. By default, Xen
+ will pick the most appropriate mitigations based on compiled in support,
+@@ -2309,6 +2309,10 @@ has elected not to lock the configuration, Xen will use GDS_CTRL to mitigate
+ GDS with. Otherwise, Xen will mitigate by disabling AVX, which blocks the use
+ of the AVX2 Gather instructions.
+
++On all hardware, the `div-scrub=` option can be used to force or prevent Xen
++from mitigating the DIV-leakage vulnerability. By default, Xen will mitigate
++DIV-leakage on hardware believed to be vulnerable.
++
+ ### sync_console
+ > `= <boolean>`
+
+diff --git a/xen/arch/x86/hvm/svm/entry.S b/xen/arch/x86/hvm/svm/entry.S
+index 0ff4008060..ad5ca50c12 100644
+--- xen/arch/x86/hvm/svm/entry.S.orig
++++ xen/arch/x86/hvm/svm/entry.S
+@@ -72,6 +72,7 @@ __UNLIKELY_END(nsvm_hap)
+ 1: /* No Spectre v1 concerns. Execution will hit VMRUN imminently. */
+ .endm
+ ALTERNATIVE "", svm_vmentry_spec_ctrl, X86_FEATURE_SC_MSR_HVM
++ ALTERNATIVE "", DO_SPEC_CTRL_DIV, X86_FEATURE_SC_DIV
+
+ pop %r15
+ pop %r14
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index e9cc6b586a..f75124117b 100644
+--- xen/arch/x86/spec_ctrl.c.orig
++++ xen/arch/x86/spec_ctrl.c
+@@ -22,6 +22,7 @@
+ #include <xen/param.h>
+ #include <xen/warning.h>
+
++#include <asm/amd.h>
+ #include <asm/hvm/svm/svm.h>
+ #include <asm/microcode.h>
+ #include <asm/msr.h>
+@@ -78,6 +79,7 @@ static int8_t __initdata opt_srb_lock = -1;
+ static bool __initdata opt_unpriv_mmio;
+ static bool __read_mostly opt_fb_clear_mmio;
+ static int8_t __initdata opt_gds_mit = -1;
++static int8_t __initdata opt_div_scrub = -1;
+
+ static int __init parse_spec_ctrl(const char *s)
+ {
+@@ -132,6 +134,7 @@ static int __init parse_spec_ctrl(const char *s)
+ opt_srb_lock = 0;
+ opt_unpriv_mmio = false;
+ opt_gds_mit = 0;
++ opt_div_scrub = 0;
+ }
+ else if ( val > 0 )
+ rc = -EINVAL;
+@@ -284,6 +287,8 @@ static int __init parse_spec_ctrl(const char *s)
+ opt_unpriv_mmio = val;
+ else if ( (val = parse_boolean("gds-mit", s, ss)) >= 0 )
+ opt_gds_mit = val;
++ else if ( (val = parse_boolean("div-scrub", s, ss)) >= 0 )
++ opt_div_scrub = val;
+ else
+ rc = -EINVAL;
+
+@@ -484,7 +489,7 @@ static void __init print_details(enum ind_thunk thunk)
+ "\n");
+
+ /* Settings for Xen's protection, irrespective of guests. */
+- printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s\n",
++ printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s%s\n",
+ thunk == THUNK_NONE ? "N/A" :
+ thunk == THUNK_RETPOLINE ? "RETPOLINE" :
+ thunk == THUNK_LFENCE ? "LFENCE" :
+@@ -509,6 +514,7 @@ static void __init print_details(enum ind_thunk thunk)
+ opt_l1d_flush ? " L1D_FLUSH" : "",
+ opt_md_clear_pv || opt_md_clear_hvm ||
+ opt_fb_clear_mmio ? " VERW" : "",
++ opt_div_scrub ? " DIV" : "",
+ opt_branch_harden ? " BRANCH_HARDEN" : "");
+
+ /* L1TF diagnostics, printed if vulnerable or PV shadowing is in use. */
+@@ -933,6 +939,45 @@ static void __init srso_calculations(bool hw_smt_enabled)
+ setup_force_cpu_cap(X86_FEATURE_SRSO_NO);
+ }
+
++/*
++ * The Div leakage issue is specific to the AMD Zen1 microarchitecture.
++ *
++ * However, there's no $FOO_NO bit defined, so if we're virtualised we have no
++ * hope of spotting the case where we might move to vulnerable hardware. We
++ * also can't make any useful conclusion about SMT-ness.
++ *
++ * Don't check the hypervisor bit, so at least we do the safe thing when
++ * booting on something that looks like a Zen1 CPU.
++ */
++static bool __init has_div_vuln(void)
++{
++ if ( !(boot_cpu_data.x86_vendor &
++ (X86_VENDOR_AMD | X86_VENDOR_HYGON)) )
++ return false;
++
++ if ( boot_cpu_data.x86 != 0x17 && boot_cpu_data.x86 != 0x18 )
++ return false;
++
++ return is_zen1_uarch();
++}
++
++static void __init div_calculations(bool hw_smt_enabled)
++{
++ bool cpu_bug_div = has_div_vuln();
++
++ if ( opt_div_scrub == -1 )
++ opt_div_scrub = cpu_bug_div;
++
++ if ( opt_div_scrub )
++ setup_force_cpu_cap(X86_FEATURE_SC_DIV);
++
++ if ( opt_smt == -1 && !cpu_has_hypervisor && cpu_bug_div && hw_smt_enabled )
++ warning_add(
++ "Booted on leaky-DIV hardware with SMT/Hyperthreading\n"
++ "enabled. Please assess your configuration and choose an\n"
++ "explicit 'smt=<bool>' setting. See XSA-439.\n");
++}
++
+ static void __init ibpb_calculations(void)
+ {
+ bool def_ibpb_entry = false;
+@@ -1644,6 +1689,8 @@ void __init init_speculation_mitigations(void)
+
+ ibpb_calculations();
+
++ div_calculations(hw_smt_enabled);
++
+ /* Check whether Eager FPU should be enabled by default. */
+ if ( opt_eager_fpu == -1 )
+ opt_eager_fpu = should_use_eager_fpu();
+diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h
+index bdb119a34c..d993e06e4c 100644
+--- xen/include/asm-x86/cpufeatures.h.orig
++++ xen/include/asm-x86/cpufeatures.h
+@@ -35,7 +35,8 @@ XEN_CPUFEATURE(SC_RSB_HVM, X86_SYNTH(19)) /* RSB overwrite needed for HVM
+ XEN_CPUFEATURE(XEN_SELFSNOOP, X86_SYNTH(20)) /* SELFSNOOP gets used by Xen itself */
+ XEN_CPUFEATURE(SC_MSR_IDLE, X86_SYNTH(21)) /* Clear MSR_SPEC_CTRL on idle */
+ XEN_CPUFEATURE(XEN_LBR, X86_SYNTH(22)) /* Xen uses MSR_DEBUGCTL.LBR */
+-/* Bits 23,24 unused. */
++XEN_CPUFEATURE(SC_DIV, X86_SYNTH(23)) /* DIV scrub needed */
++/* Bit 24 unused. */
+ XEN_CPUFEATURE(SC_VERW_IDLE, X86_SYNTH(25)) /* VERW used by Xen for idle */
+ XEN_CPUFEATURE(XEN_SHSTK, X86_SYNTH(26)) /* Xen uses CET Shadow Stacks */
+ XEN_CPUFEATURE(XEN_IBT, X86_SYNTH(27)) /* Xen uses CET Indirect Branch Tracking */
+
+--- xen/include/asm-x86/spec_ctrl_asm.h.orig 2023-08-07 14:08:26.000000000 +0200
++++ xen/include/asm-x86/spec_ctrl_asm.h 2023-11-15 14:50:58.771057793 +0100
+@@ -178,6 +178,19 @@
+ .L\@_verw_skip:
+ .endm
+
++.macro DO_SPEC_CTRL_DIV
++/*
++ * Requires nothing
++ * Clobbers %rax
++ *
++ * Issue a DIV for its flushing side effect (Zen1 uarch specific). Any
++ * non-faulting DIV will do; a byte DIV has least latency, and doesn't clobber
++ * %rdx.
++ */
++ mov $1, %eax
++ div %al
++.endm
++
+ .macro DO_SPEC_CTRL_ENTRY maybexen:req
+ /*
+ * Requires %rsp=regs (also cpuinfo if !maybexen)
+@@ -231,6 +244,7 @@
+ wrmsr
+
+ .L\@_skip:
++ ALTERNATIVE "", DO_SPEC_CTRL_DIV, X86_FEATURE_SC_DIV
+ .endm
+
+ .macro DO_SPEC_CTRL_EXIT_TO_GUEST
+@@ -277,7 +291,8 @@
+ #define SPEC_CTRL_EXIT_TO_PV \
+ ALTERNATIVE "", \
+ DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_PV; \
+- DO_SPEC_CTRL_COND_VERW
++ DO_SPEC_CTRL_COND_VERW; \
++ ALTERNATIVE "", DO_SPEC_CTRL_DIV, X86_FEATURE_SC_DIV
+
+ /*
+ * Use in IST interrupt/exception context. May interrupt Xen or PV context.
+--- xen/include/asm-x86/amd.h.orig 2023-11-15 15:16:19.642351562 +0100
++++ xen/include/asm-x86/amd.h 2023-11-15 15:17:10.878437198 +0100
+@@ -140,6 +140,17 @@
+ AMD_MODEL_RANGE(0x11, 0x0, 0x0, 0xff, 0xf), \
+ AMD_MODEL_RANGE(0x12, 0x0, 0x0, 0xff, 0xf))
+
++/*
++ * The Zen1 and Zen2 microarchitectures are implemented by AMD (Fam17h) and
++ * Hygon (Fam18h) but without simple model number rules. Instead, use STIBP
++ * as a heuristic that distinguishes the two.
++ *
++ * The caller is required to perform the appropriate vendor/family checks
++ * first.
++ */
++#define is_zen1_uarch() (!boot_cpu_has(X86_FEATURE_AMD_STIBP))
++#define is_zen2_uarch() boot_cpu_has(X86_FEATURE_AMD_STIBP)
++
+ struct cpuinfo_x86;
+ int cpu_has_amd_erratum(const struct cpuinfo_x86 *, int, ...);
+
Index: pkgsrc/sysutils/xenkernel415/patches/patch-XSA442
diff -u /dev/null pkgsrc/sysutils/xenkernel415/patches/patch-XSA442:1.1
--- /dev/null Wed Nov 15 15:59:36 2023
+++ pkgsrc/sysutils/xenkernel415/patches/patch-XSA442 Wed Nov 15 15:59:36 2023
@@ -0,0 +1,187 @@
+$NetBSD: patch-XSA442,v 1.1 2023/11/15 15:59:36 bouyer Exp $
+
+From 42614970833467d8b9aaf9def9f062c6c7425dad Mon Sep 17 00:00:00 2001
+From: Roger Pau Monne <roger.pau%citrix.com@localhost>
+Date: Tue, 13 Jun 2023 15:01:05 +0200
+Subject: [PATCH] iommu/amd-vi: flush IOMMU TLB when flushing the DTE
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+The caching invalidation guidelines from the AMD-Vi specification (48882—Rev
+3.07-PUB—Oct 2022) seem to be misleading on some hardware, as devices will
+malfunction (see stale DMA mappings) if some fields of the DTE are updated but
+the IOMMU TLB is not flushed. This has been observed in practice on AMD
+systems. Due to the lack of guidance from the currently published
+specification this patch aims to increase the flushing done in order to prevent
+device malfunction.
+
+In order to fix, issue an INVALIDATE_IOMMU_PAGES command from
+amd_iommu_flush_device(), flushing all the address space. Note this requires
+callers to be adjusted in order to pass the DomID on the DTE previous to the
+modification.
+
+Some call sites don't provide a valid DomID to amd_iommu_flush_device() in
+order to avoid the flush. That's because the device had address translations
+disabled and hence the previous DomID on the DTE is not valid. Note the
+current logic relies on the entity disabling address translations to also flush
+the TLB of the in use DomID.
+
+Device I/O TLB flushing when ATS are enabled is not covered by the current
+change, as ATS usage is not security supported.
+
+This is XSA-442 / CVE-2023-34326
+
+Signed-off-by: Roger Pau Monné <roger.pau%citrix.com@localhost>
+Reviewed-by: Jan Beulich <jbeulich%suse.com@localhost>
+---
+ xen/drivers/passthrough/amd/iommu.h | 3 ++-
+ xen/drivers/passthrough/amd/iommu_cmd.c | 10 +++++++++-
+ xen/drivers/passthrough/amd/iommu_guest.c | 5 +++--
+ xen/drivers/passthrough/amd/iommu_init.c | 6 +++++-
+ xen/drivers/passthrough/amd/pci_amd_iommu.c | 14 ++++++++++----
+ 5 files changed, 29 insertions(+), 9 deletions(-)
+
+diff --git a/xen/drivers/passthrough/amd/iommu.h b/xen/drivers/passthrough/amd/iommu.h
+index 0d9d976faaea..4e355ef4c12f 100644
+--- xen/drivers/passthrough/amd/iommu.h.orig
++++ xen/drivers/passthrough/amd/iommu.h
+@@ -265,7 +265,8 @@ void amd_iommu_flush_pages(struct domain *d, unsigned long dfn,
+ unsigned int order);
+ void amd_iommu_flush_iotlb(u8 devfn, const struct pci_dev *pdev,
+ uint64_t gaddr, unsigned int order);
+-void amd_iommu_flush_device(struct amd_iommu *iommu, uint16_t bdf);
++void amd_iommu_flush_device(struct amd_iommu *iommu, uint16_t bdf,
++ domid_t domid);
+ void amd_iommu_flush_intremap(struct amd_iommu *iommu, uint16_t bdf);
+ void amd_iommu_flush_all_caches(struct amd_iommu *iommu);
+
+diff --git a/xen/drivers/passthrough/amd/iommu_cmd.c b/xen/drivers/passthrough/amd/iommu_cmd.c
+index dfb8b1c860d1..196e3dce3aec 100644
+--- xen/drivers/passthrough/amd/iommu_cmd.c.orig
++++ xen/drivers/passthrough/amd/iommu_cmd.c
+@@ -362,12 +362,20 @@ void amd_iommu_flush_pages(struct domain *d,
+ _amd_iommu_flush_pages(d, __dfn_to_daddr(dfn), order);
+ }
+
+-void amd_iommu_flush_device(struct amd_iommu *iommu, uint16_t bdf)
++void amd_iommu_flush_device(struct amd_iommu *iommu, uint16_t bdf,
++ domid_t domid)
+ {
+ ASSERT( spin_is_locked(&iommu->lock) );
+
+ invalidate_dev_table_entry(iommu, bdf);
+ flush_command_buffer(iommu, 0);
++
++ /* Also invalidate IOMMU TLB entries when flushing the DTE. */
++ if ( domid != DOMID_INVALID )
++ {
++ invalidate_iommu_pages(iommu, INV_IOMMU_ALL_PAGES_ADDRESS, domid, 0);
++ flush_command_buffer(iommu, 0);
++ }
+ }
+
+ void amd_iommu_flush_intremap(struct amd_iommu *iommu, uint16_t bdf)
+diff --git a/xen/drivers/passthrough/amd/iommu_guest.c b/xen/drivers/passthrough/amd/iommu_guest.c
+index 00c5ccd7b5d2..f404e382f019 100644
+--- xen/drivers/passthrough/amd/iommu_guest.c.orig
++++ xen/drivers/passthrough/amd/iommu_guest.c
+@@ -385,7 +385,7 @@ static int do_completion_wait(struct domain *d, cmd_entry_t *cmd)
+
+ static int do_invalidate_dte(struct domain *d, cmd_entry_t *cmd)
+ {
+- uint16_t gbdf, mbdf, req_id, gdom_id, hdom_id;
++ uint16_t gbdf, mbdf, req_id, gdom_id, hdom_id, prev_domid;
+ struct amd_iommu_dte *gdte, *mdte, *dte_base;
+ struct amd_iommu *iommu = NULL;
+ struct guest_iommu *g_iommu;
+@@ -445,11 +445,12 @@ static int do_invalidate_dte(struct domain *d, cmd_entry_t *cmd)
+ req_id = get_dma_requestor_id(iommu->seg, mbdf);
+ dte_base = iommu->dev_table.buffer;
+ mdte = &dte_base[req_id];
++ prev_domid = mdte->domain_id;
+
+ spin_lock_irqsave(&iommu->lock, flags);
+ dte_set_gcr3_table(mdte, hdom_id, gcr3_mfn << PAGE_SHIFT, gv, glx);
+
+- amd_iommu_flush_device(iommu, req_id);
++ amd_iommu_flush_device(iommu, req_id, prev_domid);
+ spin_unlock_irqrestore(&iommu->lock, flags);
+
+ return 0;
+diff --git a/xen/drivers/passthrough/amd/iommu_init.c b/xen/drivers/passthrough/amd/iommu_init.c
+index bb52c181f8cd..4a96f7fbec3c 100644
+--- xen/drivers/passthrough/amd/iommu_init.c.orig
++++ xen/drivers/passthrough/amd/iommu_init.c
+@@ -1554,7 +1554,11 @@ static int _invalidate_all_devices(
+ if ( iommu )
+ {
+ spin_lock_irqsave(&iommu->lock, flags);
+- amd_iommu_flush_device(iommu, req_id);
++ /*
++ * IOMMU TLB flush performed separately (see
++ * invalidate_all_domain_pages()).
++ */
++ amd_iommu_flush_device(iommu, req_id, DOMID_INVALID);
+ amd_iommu_flush_intremap(iommu, req_id);
+ spin_unlock_irqrestore(&iommu->lock, flags);
+ }
+diff --git a/xen/drivers/passthrough/amd/pci_amd_iommu.c b/xen/drivers/passthrough/amd/pci_amd_iommu.c
+index e804fdc34fcd..872955566608 100644
+--- xen/drivers/passthrough/amd/pci_amd_iommu.c.orig
++++ xen/drivers/passthrough/amd/pci_amd_iommu.c
+@@ -183,10 +183,13 @@ static int __must_check amd_iommu_setup_domain_device(
+ iommu_has_cap(iommu, PCI_CAP_IOTLB_SHIFT) )
+ dte->i = ats_enabled;
+
+- amd_iommu_flush_device(iommu, req_id);
++ /* DTE didn't have DMA translations enabled, do not flush the TLB. */
++ amd_iommu_flush_device(iommu, req_id, DOMID_INVALID);
+ }
+ else if ( dte->pt_root != mfn_x(page_to_mfn(root_pg)) )
+ {
++ domid_t prev_domid = dte->domain_id;
++
+ /*
+ * Strictly speaking if the device is the only one with this requestor
+ * ID, it could be allowed to be re-assigned regardless of unity map
+@@ -240,7 +243,7 @@ static int __must_check amd_iommu_setup_domain_device(
+ iommu_has_cap(iommu, PCI_CAP_IOTLB_SHIFT) )
+ ASSERT(dte->i == ats_enabled);
+
+- amd_iommu_flush_device(iommu, req_id);
++ amd_iommu_flush_device(iommu, req_id, prev_domid);
+ }
+
+ spin_unlock_irqrestore(&iommu->lock, flags);
+@@ -389,6 +392,8 @@ static void amd_iommu_disable_domain_device(const struct domain *domain,
+ spin_lock_irqsave(&iommu->lock, flags);
+ if ( dte->tv || dte->v )
+ {
++ domid_t prev_domid = dte->domain_id;
++
+ /* See the comment in amd_iommu_setup_device_table(). */
+ dte->int_ctl = IOMMU_DEV_TABLE_INT_CONTROL_ABORTED;
+ smp_wmb();
+@@ -405,7 +410,7 @@ static void amd_iommu_disable_domain_device(const struct domain *domain,
+ smp_wmb();
+ dte->v = true;
+
+- amd_iommu_flush_device(iommu, req_id);
++ amd_iommu_flush_device(iommu, req_id, prev_domid);
+
+ AMD_IOMMU_DEBUG("Disable: device id = %#x, "
+ "domain = %d, paging mode = %d\n",
+@@ -578,7 +583,8 @@ static int amd_iommu_add_device(u8 devfn, struct pci_dev *pdev)
+ iommu->dev_table.buffer + (bdf * IOMMU_DEV_TABLE_ENTRY_SIZE),
+ ivrs_mappings[bdf].intremap_table, iommu, iommu_intremap);
+
+- amd_iommu_flush_device(iommu, bdf);
++ /* DTE didn't have DMA translations enabled, do not flush the TLB. */
++ amd_iommu_flush_device(iommu, bdf, DOMID_INVALID);
+
+ spin_unlock_irqrestore(&iommu->lock, flags);
+ }
+--
+2.42.0
+
Index: pkgsrc/sysutils/xenkernel415/patches/patch-XSA444
diff -u /dev/null pkgsrc/sysutils/xenkernel415/patches/patch-XSA444:1.1
--- /dev/null Wed Nov 15 15:59:36 2023
+++ pkgsrc/sysutils/xenkernel415/patches/patch-XSA444 Wed Nov 15 15:59:36 2023
@@ -0,0 +1,167 @@
+$NetBSD: patch-XSA444,v 1.1 2023/11/15 15:59:36 bouyer Exp $
+
+From: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Subject: x86/svm: Fix asymmetry with AMD DR MASK context switching
+
+The handling of MSR_DR{0..3}_MASK is asymmetric between PV and HVM guests.
+
+HVM guests context switch in based on the guest view of DBEXT, whereas PV
+guests switch in based on the host capability. Both guest types leave the
+context dirty for the next vCPU.
+
+This leads to the following issue:
+
+ * PV or HVM guest has debugging active (%dr7 + mask)
+ * Switch-out deactivates %dr7 but leaves other state stale in hardware
+ * Another HVM guest with masks unavailable has debugging active
+ * Switch in loads %dr7 but leaves the mask MSRs alone
+
+Now, the second guest's vCPU is operating in the context of the prior vCPU's
+mask MSR, while the environment the vCPU can see says there are no mask MSRs.
+
+As a stopgap, adjust the HVM path to switch in the masks based on host
+capabilities rather than guest visibility (i.e. like the PV path). Adjustment
+of the intercepts still needs to be dependent on the guest visibility of
+DBEXT.
+
+This is part of XSA-444 / CVE-2023-34327
+
+Fixes: c097f54912d3 ("x86/SVM: support data breakpoint extension registers")
+Signed-off-by: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Reviewed-by: Jan Beulich <jbeulich%suse.com@localhost>
+
+diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c
+index a019d196e071..ba4069f9100a 100644
+--- xen/arch/x86/hvm/svm/svm.c.orig
++++ xen/arch/x86/hvm/svm/svm.c
+@@ -185,6 +185,10 @@ static void svm_save_dr(struct vcpu *v)
+ v->arch.hvm.flag_dr_dirty = 0;
+ vmcb_set_dr_intercepts(vmcb, ~0u);
+
++ /*
++ * The guest can only have changed the mask MSRs if we previously dropped
++ * intercepts. Re-read them from hardware.
++ */
+ if ( v->domain->arch.cpuid->extd.dbext )
+ {
+ svm_intercept_msr(v, MSR_AMD64_DR0_ADDRESS_MASK, MSR_INTERCEPT_RW);
+@@ -216,17 +220,25 @@ static void __restore_debug_registers(struct vmcb_struct *vmcb, struct vcpu *v)
+
+ ASSERT(v == current);
+
+- if ( v->domain->arch.cpuid->extd.dbext )
++ /*
++ * Both the PV and HVM paths leave stale DR_MASK values in hardware on
++ * context-switch-out. If we're activating %dr7 for the guest, we must
++ * sync the DR_MASKs too, whether or not the guest can see them.
++ */
++ if ( boot_cpu_has(X86_FEATURE_DBEXT) )
+ {
+- svm_intercept_msr(v, MSR_AMD64_DR0_ADDRESS_MASK, MSR_INTERCEPT_NONE);
+- svm_intercept_msr(v, MSR_AMD64_DR1_ADDRESS_MASK, MSR_INTERCEPT_NONE);
+- svm_intercept_msr(v, MSR_AMD64_DR2_ADDRESS_MASK, MSR_INTERCEPT_NONE);
+- svm_intercept_msr(v, MSR_AMD64_DR3_ADDRESS_MASK, MSR_INTERCEPT_NONE);
+-
+ wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, v->arch.msrs->dr_mask[0]);
+ wrmsrl(MSR_AMD64_DR1_ADDRESS_MASK, v->arch.msrs->dr_mask[1]);
+ wrmsrl(MSR_AMD64_DR2_ADDRESS_MASK, v->arch.msrs->dr_mask[2]);
+ wrmsrl(MSR_AMD64_DR3_ADDRESS_MASK, v->arch.msrs->dr_mask[3]);
++
++ if ( v->domain->arch.cpuid->extd.dbext )
++ {
++ svm_intercept_msr(v, MSR_AMD64_DR0_ADDRESS_MASK, MSR_INTERCEPT_NONE);
++ svm_intercept_msr(v, MSR_AMD64_DR1_ADDRESS_MASK, MSR_INTERCEPT_NONE);
++ svm_intercept_msr(v, MSR_AMD64_DR2_ADDRESS_MASK, MSR_INTERCEPT_NONE);
++ svm_intercept_msr(v, MSR_AMD64_DR3_ADDRESS_MASK, MSR_INTERCEPT_NONE);
++ }
+ }
+
+ write_debugreg(0, v->arch.dr[0]);
+diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
+index f7992ff230b5..a142a63dd869 100644
+--- xen/arch/x86/traps.c.orig
++++ xen/arch/x86/traps.c
+@@ -2314,6 +2314,11 @@ void activate_debugregs(const struct vcpu *curr)
+ if ( curr->arch.dr7 & DR7_ACTIVE_MASK )
+ write_debugreg(7, curr->arch.dr7);
+
++ /*
++ * Both the PV and HVM paths leave stale DR_MASK values in hardware on
++ * context-switch-out. If we're activating %dr7 for the guest, we must
++ * sync the DR_MASKs too, whether or not the guest can see them.
++ */
+ if ( boot_cpu_has(X86_FEATURE_DBEXT) )
+ {
+ wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, curr->arch.msrs->dr_mask[0]);
+From: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Subject: x86/pv: Correct the auditing of guest breakpoint addresses
+
+The use of access_ok() is buggy, because it permits access to the compat
+translation area. 64bit PV guests don't use the XLAT area, but on AMD
+hardware, the DBEXT feature allows a breakpoint to match up to a 4G aligned
+region, allowing the breakpoint to reach outside of the XLAT area.
+
+Prior to c/s cda16c1bb223 ("x86: mirror compat argument translation area for
+32-bit PV"), the live GDT was within 4G of the XLAT area.
+
+All together, this allowed a malicious 64bit PV guest on AMD hardware to place
+a breakpoint over the live GDT, and trigger a #DB livelock (CVE-2015-8104).
+
+Introduce breakpoint_addr_ok() and explain why __addr_ok() happens to be an
+appropriate check in this case.
+
+For Xen 4.14 and later, this is a latent bug because the XLAT area has moved
+to be on its own with nothing interesting adjacent. For Xen 4.13 and older on
+AMD hardware, this fixes a PV-trigger-able DoS.
+
+This is part of XSA-444 / CVE-2023-34328.
+
+Fixes: 65e355490817 ("x86/PV: support data breakpoint extension registers")
+Signed-off-by: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Reviewed-by: Roger Pau Monné <roger.pau%citrix.com@localhost>
+Reviewed-by: Jan Beulich <jbeulich%suse.com@localhost>
+
+diff --git a/xen/arch/x86/pv/misc-hypercalls.c b/xen/arch/x86/pv/misc-hypercalls.c
+index 5dade2472687..681c16108fd1 100644
+--- xen/arch/x86/pv/misc-hypercalls.c.orig
++++ xen/arch/x86/pv/misc-hypercalls.c
+@@ -68,7 +68,7 @@ long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value)
+ switch ( reg )
+ {
+ case 0 ... 3:
+- if ( !access_ok(value, sizeof(long)) )
++ if ( !breakpoint_addr_ok(value) )
+ return -EPERM;
+
+ v->arch.dr[reg] = value;
+diff --git a/xen/include/asm-x86/debugreg.h b/xen/include/asm-x86/debugreg.h
+index c57914efc6e8..cc298265244b 100644
+--- xen/include/asm-x86/debugreg.h.orig
++++ xen/include/asm-x86/debugreg.h
+@@ -77,6 +77,26 @@
+ asm volatile ( "mov %%db" #reg ",%0" : "=r" (__val) ); \
+ __val; \
+ })
++
++/*
++ * Architecturally, %dr{0..3} can have any arbitrary value. However, Xen
++ * can't allow the guest to breakpoint the Xen address range, so we limit the
++ * guest to the lower canonical half, or above the Xen range in the higher
++ * canonical half.
++ *
++ * Breakpoint lengths are specified to mask the low order address bits,
++ * meaning all breakpoints are naturally aligned. With %dr7, the widest
++ * breakpoint is 8 bytes. With DBEXT, the widest breakpoint is 4G. Both of
++ * the Xen boundaries have >4G alignment.
++ *
++ * In principle we should account for HYPERVISOR_COMPAT_VIRT_START(d), but
++ * 64bit Xen has never enforced this for compat guests, and there's no problem
++ * (to Xen) if the guest breakpoints its alias of the M2P. Skipping this
++ * aspect simplifies the logic, and causes us not to reject a migrating guest
++ * which operated fine on prior versions of Xen.
++ */
++#define breakpoint_addr_ok(a) __addr_ok(a)
++
+ long set_debugreg(struct vcpu *, unsigned int reg, unsigned long value);
+ void activate_debugregs(const struct vcpu *);
+
Index: pkgsrc/sysutils/xenkernel415/patches/patch-XSA445
diff -u /dev/null pkgsrc/sysutils/xenkernel415/patches/patch-XSA445:1.1
--- /dev/null Wed Nov 15 15:59:36 2023
+++ pkgsrc/sysutils/xenkernel415/patches/patch-XSA445 Wed Nov 15 15:59:36 2023
@@ -0,0 +1,67 @@
+$NetBSD: patch-XSA445,v 1.1 2023/11/15 15:59:36 bouyer Exp $
+
+From 9877bb3af60ef2b543742835c49de7d0108cdca9 Mon Sep 17 00:00:00 2001
+From: Roger Pau Monne <roger.pau%citrix.com@localhost>
+Date: Wed, 11 Oct 2023 13:14:21 +0200
+Subject: [PATCH] iommu/amd-vi: use correct level for quarantine domain page
+ tables
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+The current setup of the quarantine page tables assumes that the quarantine
+domain (dom_io) has been initialized with an address width of
+DEFAULT_DOMAIN_ADDRESS_WIDTH (48).
+
+However dom_io being a PV domain gets the AMD-Vi IOMMU page tables levels based
+on the maximum (hot pluggable) RAM address, and hence on systems with no RAM
+above the 512GB mark only 3 page-table levels are configured in the IOMMU.
+
+On systems without RAM above the 512GB boundary amd_iommu_quarantine_init()
+will setup page tables for the scratch page with 4 levels, while the IOMMU will
+be configured to use 3 levels only. The page destined to be used as level 1,
+and to contain a directory of PTEs ends up being the address in a PTE itself,
+and thus level 1 page becomes the leaf page. Without the level mismatch it's
+level 0 page that should be the leaf page instead.
+
+The level 1 page won't be used as such, and hence it's not possible to use it
+to gain access to other memory on the system. However that page is not cleared
+in amd_iommu_quarantine_init() as part of re-initialization of the device
+quarantine page tables, and hence data on the level 1 page can be leaked
+between device usages.
+
+Fix this by making sure the paging levels setup by amd_iommu_quarantine_init()
+match the number configured on the IOMMUs.
+
+Note that IVMD regions are not affected by this issue, as those areas are
+mapped taking the configured paging levels into account.
+
+This is XSA-445 / CVE-2023-46835
+
+Fixes: ea38867831da ('x86 / iommu: set up a scratch page in the quarantine domain')
+Signed-off-by: Roger Pau Monné <roger.pau%citrix.com@localhost>
+Reviewed-by: Jan Beulich <jbeulich%suse.com@localhost>
+---
+ xen/drivers/passthrough/amd/iommu_map.c | 4 +---
+ 1 file changed, 1 insertion(+), 3 deletions(-)
+
+diff --git a/xen/drivers/passthrough/amd/iommu_map.c b/xen/drivers/passthrough/amd/iommu_map.c
+index b4c182449131..3473db4c1efc 100644
+--- xen/drivers/passthrough/amd/iommu_map.c.orig
++++ xen/drivers/passthrough/amd/iommu_map.c
+@@ -584,9 +584,7 @@ static int fill_qpt(union amd_iommu_pte *this, unsigned int level,
+ int amd_iommu_quarantine_init(struct pci_dev *pdev)
+ {
+ struct domain_iommu *hd = dom_iommu(dom_io);
+- unsigned long end_gfn =
+- 1ul << (DEFAULT_DOMAIN_ADDRESS_WIDTH - PAGE_SHIFT);
+- unsigned int level = amd_iommu_get_paging_mode(end_gfn);
++ unsigned int level = hd->arch.amd.paging_mode;
+ unsigned int req_id = get_dma_requestor_id(pdev->seg, pdev->sbdf.bdf);
+ const struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(pdev->seg);
+ int rc;
+
+base-commit: 4a4daf6bddbe8a741329df5cc8768f7dec664aed
+--
+2.30.2
+
Index: pkgsrc/sysutils/xenkernel415/patches/patch-XSA446
diff -u /dev/null pkgsrc/sysutils/xenkernel415/patches/patch-XSA446:1.1
--- /dev/null Wed Nov 15 15:59:36 2023
+++ pkgsrc/sysutils/xenkernel415/patches/patch-XSA446 Wed Nov 15 15:59:36 2023
@@ -0,0 +1,117 @@
+$NetBSD: patch-XSA446,v 1.1 2023/11/15 15:59:36 bouyer Exp $
+
+From 80d5aada598c3a800a350003d5d582931545e13c Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Date: Thu, 26 Oct 2023 14:37:38 +0100
+Subject: [PATCH] x86/spec-ctrl: Remove conditional IRQs-on-ness for INT
+ $0x80/0x82 paths
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Before speculation defences, some paths in Xen could genuinely get away with
+being IRQs-on at entry. But XPTI invalidated this property on most paths, and
+attempting to maintain it on the remaining paths was a mistake.
+
+Fast forward, and DO_SPEC_CTRL_COND_IBPB (protection for AMD BTC/SRSO) is not
+IRQ-safe, running with IRQs enabled in some cases. The other actions taken on
+these paths happen to be IRQ-safe.
+
+Make entry_int82() and int80_direct_trap() unconditionally Interrupt Gates
+rather than Trap Gates. Remove the conditional re-adjustment of
+int80_direct_trap() in smp_prepare_cpus(), and have entry_int82() explicitly
+enable interrupts when safe to do so.
+
+In smp_prepare_cpus(), with the conditional re-adjustment removed, the
+clearing of pv_cr3 is the only remaining action gated on XPTI, and it is out
+of place anyway, repeating work already done by smp_prepare_boot_cpu(). Drop
+the entire if() condition to avoid leaving an incorrect vestigial remnant.
+
+Also drop comments which make incorrect statements about when it's safe to
+enable interrupts.
+
+This is XSA-446 / CVE-2023-46836
+
+Signed-off-by: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Reviewed-by: Roger Pau Monné <roger.pau%citrix.com@localhost>
+---
+ xen/arch/x86/pv/traps.c | 4 ++--
+ xen/arch/x86/smpboot.c | 14 --------------
+ xen/arch/x86/x86_64/compat/entry.S | 2 ++
+ xen/arch/x86/x86_64/entry.S | 1 -
+ 4 files changed, 4 insertions(+), 17 deletions(-)
+
+diff --git a/xen/arch/x86/pv/traps.c b/xen/arch/x86/pv/traps.c
+index 74f333da7e1c..240d1a2db7a3 100644
+--- xen/arch/x86/pv/traps.c.orig
++++ xen/arch/x86/pv/traps.c
+@@ -139,11 +139,11 @@ void __init pv_trap_init(void)
+ #ifdef CONFIG_PV32
+ /* The 32-on-64 hypercall vector is only accessible from ring 1. */
+ _set_gate(idt_table + HYPERCALL_VECTOR,
+- SYS_DESC_trap_gate, 1, entry_int82);
++ SYS_DESC_irq_gate, 1, entry_int82);
+ #endif
+
+ /* Fast trap for int80 (faster than taking the #GP-fixup path). */
+- _set_gate(idt_table + LEGACY_SYSCALL_VECTOR, SYS_DESC_trap_gate, 3,
++ _set_gate(idt_table + LEGACY_SYSCALL_VECTOR, SYS_DESC_irq_gate, 3,
+ &int80_direct_trap);
+
+ open_softirq(NMI_SOFTIRQ, nmi_softirq);
+diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
+index 3a1a659082c6..4c54ecbc91d7 100644
+--- xen/arch/x86/smpboot.c.orig
++++ xen/arch/x86/smpboot.c
+@@ -1158,20 +1158,6 @@ void __init smp_prepare_cpus(void)
+
+ stack_base[0] = (void *)((unsigned long)stack_start & ~(STACK_SIZE - 1));
+
+- if ( opt_xpti_hwdom || opt_xpti_domu )
+- {
+- get_cpu_info()->pv_cr3 = 0;
+-
+-#ifdef CONFIG_PV
+- /*
+- * All entry points which may need to switch page tables have to start
+- * with interrupts off. Re-write what pv_trap_init() has put there.
+- */
+- _set_gate(idt_table + LEGACY_SYSCALL_VECTOR, SYS_DESC_irq_gate, 3,
+- &int80_direct_trap);
+-#endif
+- }
+-
+ set_nr_sockets();
+
+ socket_cpumask = xzalloc_array(cpumask_t *, nr_sockets);
+diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S
+index bd5abd8040bd..fcc3a721f147 100644
+--- xen/arch/x86/x86_64/compat/entry.S.orig
++++ xen/arch/x86/x86_64/compat/entry.S
+@@ -21,6 +21,8 @@ ENTRY(entry_int82)
+ SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */
+ /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */
+
++ sti
++
+ CR4_PV32_RESTORE
+
+ GET_CURRENT(bx)
+diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
+index 5ca74f5f62b2..9a7b129aa7e4 100644
+--- xen/arch/x86/x86_64/entry.S.orig
++++ xen/arch/x86/x86_64/entry.S
+@@ -327,7 +327,6 @@ ENTRY(sysenter_entry)
+ #ifdef CONFIG_XEN_SHSTK
+ ALTERNATIVE "", "setssbsy", X86_FEATURE_XEN_SHSTK
+ #endif
+- /* sti could live here when we don't switch page tables below. */
+ pushq $FLAT_USER_SS
+ pushq $0
+ pushfq
+
+base-commit: 7befef87cc9b1bb8ca15d866ce1ecd9165ccb58c
+prerequisite-patch-id: 142a87c707411d49e136c3fb76f1b14963ec6dc8
+--
+2.30.2
+
Index: pkgsrc/sysutils/xentools415/patches/patch-XSA440
diff -u /dev/null pkgsrc/sysutils/xentools415/patches/patch-XSA440:1.1
--- /dev/null Wed Nov 15 15:59:36 2023
+++ pkgsrc/sysutils/xentools415/patches/patch-XSA440 Wed Nov 15 15:59:36 2023
@@ -0,0 +1,60 @@
+$NetBSD: patch-XSA440,v 1.1 2023/11/15 15:59:36 bouyer Exp $
+
+From 5d8b3d1ec98e56155d9650d7f4a70cd8ba9dc27d Mon Sep 17 00:00:00 2001
+From: Julien Grall <jgrall%amazon.com@localhost>
+Date: Fri, 22 Sep 2023 11:32:16 +0100
+Subject: tools/xenstored: domain_entry_fix(): Handle conflicting transaction
+
+The function domain_entry_fix() will be initially called to check if the
+quota is correct before attempting to commit any nodes. So it would be
+possible that accounting is temporarily negative. This is the case
+in the following sequence:
+
+ 1) Create 50 nodes
+ 2) Start two transactions
+ 3) Delete all the nodes in each transaction
+ 4) Commit the two transactions
+
+Because the first transaction will have succeeded and updated the
+accounting, there is no guarantee that 'd->nbentry + num' will still
+be above 0. So the assert() would be triggered.
+The assert() was introduced in dbef1f748289 ("tools/xenstore: simplify
+and fix per domain node accounting") with the assumption that the
+value can't be negative. As this is not true revert to the original
+check but restricted to the path where we don't update. Take the
+opportunity to explain the rationale behind the check.
+
+This is CVE-2023-34323 / XSA-440.
+
+Reported-by: Stanislav Uschakow <suschako%amazon.de@localhost>
+Fixes: dbef1f748289 ("tools/xenstore: simplify and fix per domain node accounting")
+Signed-off-by: Julien Grall <jgrall%amazon.com@localhost>
+Reviewed-by: Juergen Gross <jgross%suse.com@localhost>
+
+diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c
+index aa86892fed9e..6074df210c6e 100644
+--- tools/xenstore/xenstored_domain.c.orig
++++ tools/xenstore/xenstored_domain.c
+@@ -1094,10 +1094,20 @@ int domain_entry_fix(unsigned int domid, int num, bool update)
+ }
+
+ cnt = d->nbentry + num;
+- assert(cnt >= 0);
+
+- if (update)
++ if (update) {
++ assert(cnt >= 0);
+ d->nbentry = cnt;
++ } else if (cnt < 0) {
++ /*
++ * In a transaction when a node is being added/removed AND
++ * the same node has been added/removed outside the
++ * transaction in parallel, the result value may be negative.
++ * This is no problem, as the transaction will fail due to
++ * the resulting conflict. So override 'cnt'.
++ */
++ cnt = 0;
++ }
+
+ return domid_is_unprivileged(domid) ? cnt : 0;
+ }
Index: pkgsrc/sysutils/xentools415/patches/patch-XSA443
diff -u /dev/null pkgsrc/sysutils/xentools415/patches/patch-XSA443:1.1
--- /dev/null Wed Nov 15 15:59:36 2023
+++ pkgsrc/sysutils/xentools415/patches/patch-XSA443 Wed Nov 15 15:59:36 2023
@@ -0,0 +1,1370 @@
+$NetBSD: patch-XSA443,v 1.1 2023/11/15 15:59:36 bouyer Exp $
+
+From c4d597f63832a53bbb1b826af7a4677e40e9fded Mon Sep 17 00:00:00 2001
+From: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Date: Thu, 14 Sep 2023 13:22:50 +0100
+Subject: [PATCH 01/11] libfsimage/xfs: Remove dead code
+
+xfs_info.agnolog (and related code) and XFS_INO_AGBNO_BITS are dead code
+that serve no purpose.
+
+This is part of XSA-443 / CVE-2023-34325
+
+Signed-off-by: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Reviewed-by: Jan Beulich <jbeulich%suse.com@localhost>
+---
+ tools/libfsimage/xfs/fsys_xfs.c | 18 ------------------
+ 1 file changed, 18 deletions(-)
+
+diff --git a/tools/libfsimage/xfs/fsys_xfs.c b/tools/libfsimage/xfs/fsys_xfs.c
+index d735a88e55f3..2800699f5985 100644
+--- tools/libfsimage/xfs/fsys_xfs.c.orig
++++ tools/libfsimage/xfs/fsys_xfs.c
+@@ -37,7 +37,6 @@ struct xfs_info {
+ int blklog;
+ int inopblog;
+ int agblklog;
+- int agnolog;
+ unsigned int nextents;
+ xfs_daddr_t next;
+ xfs_daddr_t daddr;
+@@ -65,9 +64,7 @@ static struct xfs_info xfs;
+
+ #define XFS_INO_MASK(k) ((xfs_uint32_t)((1ULL << (k)) - 1))
+ #define XFS_INO_OFFSET_BITS xfs.inopblog
+-#define XFS_INO_AGBNO_BITS xfs.agblklog
+ #define XFS_INO_AGINO_BITS (xfs.agblklog + xfs.inopblog)
+-#define XFS_INO_AGNO_BITS xfs.agnolog
+
+ static inline xfs_agblock_t
+ agino2agbno (xfs_agino_t agino)
+@@ -149,20 +146,6 @@ xt_len (xfs_bmbt_rec_32_t *r)
+ return le32(r->l3) & mask32lo(21);
+ }
+
+-static inline int
+-xfs_highbit32(xfs_uint32_t v)
+-{
+- int i;
+-
+- if (--v) {
+- for (i = 0; i < 31; i++, v >>= 1) {
+- if (v == 0)
+- return i;
+- }
+- }
+- return 0;
+-}
+-
+ static int
+ isinxt (xfs_fileoff_t key, xfs_fileoff_t offset, xfs_filblks_t len)
+ {
+@@ -472,7 +455,6 @@ xfs_mount (fsi_file_t *ffi, const char *options)
+
+ xfs.inopblog = super.sb_inopblog;
+ xfs.agblklog = super.sb_agblklog;
+- xfs.agnolog = xfs_highbit32 (le32(super.sb_agcount));
+
+ xfs.btnode_ptr0_off =
+ ((xfs.bsize - sizeof(xfs_btree_block_t)) /
+--
+2.42.0
+
+From f75b0a70da392672fb7d9feed2a9e9515d74df2c Mon Sep 17 00:00:00 2001
+From: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Date: Thu, 14 Sep 2023 13:22:51 +0100
+Subject: [PATCH 02/11] libfsimage/xfs: Amend mask32lo() to allow the value 32
+
+agblklog could plausibly be 32, but that would overflow this shift.
+Perform the shift as ULL and cast to u32 at the end instead.
+
+This is part of XSA-443 / CVE-2023-34325
+
+Signed-off-by: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Acked-by: Jan Beulich <jbeulich%suse.com@localhost>
+---
+ tools/libfsimage/xfs/fsys_xfs.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/tools/libfsimage/xfs/fsys_xfs.c b/tools/libfsimage/xfs/fsys_xfs.c
+index 2800699f5985..4720bb4505c8 100644
+--- tools/libfsimage/xfs/fsys_xfs.c.orig
++++ tools/libfsimage/xfs/fsys_xfs.c
+@@ -60,7 +60,7 @@ static struct xfs_info xfs;
+ #define inode ((xfs_dinode_t *)((char *)FSYS_BUF + 8192))
+ #define icore (inode->di_core)
+
+-#define mask32lo(n) (((xfs_uint32_t)1 << (n)) - 1)
++#define mask32lo(n) ((xfs_uint32_t)((1ull << (n)) - 1))
+
+ #define XFS_INO_MASK(k) ((xfs_uint32_t)((1ULL << (k)) - 1))
+ #define XFS_INO_OFFSET_BITS xfs.inopblog
+--
+2.42.0
+
+From 25fae23b32ee4d990ae11368ee21e28e66dbfa25 Mon Sep 17 00:00:00 2001
+From: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Date: Thu, 14 Sep 2023 13:22:52 +0100
+Subject: [PATCH 03/11] libfsimage/xfs: Sanity-check the superblock during
+ mounts
+
+Sanity-check the XFS superblock for wellformedness at the mount handler.
+This forces pygrub to abort parsing a potentially malformed filesystem and
+ensures the invariants assumed throughout the rest of the code hold.
+
+Also, derive parameters from previously sanitized parameters where possible
+(rather than reading them off the superblock)
+
+The code doesn't try to avoid overflowing the end of the disk, because
+that's an unlikely and benign error. Parameters used in calculations of
+xfs_daddr_t (like the root inode index) aren't in critical need of being
+sanitized.
+
+The sanitization of agblklog is basically checking that no obvious
+overflows happen on agblklog, and then ensuring agblocks is contained in
+the range (2^(sb_agblklog-1), 2^sb_agblklog].
+
+This is part of XSA-443 / CVE-2023-34325
+
+Reported-by: Ferdinand Nölscher <noelscher%google.com@localhost>
+Signed-off-by: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Reviewed-by: Jan Beulich <jbeulich%suse.com@localhost>
+---
+ tools/libfsimage/xfs/fsys_xfs.c | 48 ++++++++++++++++++++++++++-------
+ tools/libfsimage/xfs/xfs.h | 12 +++++++++
+ 2 files changed, 50 insertions(+), 10 deletions(-)
+
+diff --git a/tools/libfsimage/xfs/fsys_xfs.c b/tools/libfsimage/xfs/fsys_xfs.c
+index 4720bb4505c8..e4eb7e1ee26f 100644
+--- tools/libfsimage/xfs/fsys_xfs.c.orig
++++ tools/libfsimage/xfs/fsys_xfs.c
+@@ -17,6 +17,7 @@
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
++#include <stdbool.h>
+ #include <xenfsimage_grub.h>
+ #include "xfs.h"
+
+@@ -433,29 +434,56 @@ first_dentry (fsi_file_t *ffi, xfs_ino_t *ino)
+ return next_dentry (ffi, ino);
+ }
+
++static bool
++xfs_sb_is_invalid (const xfs_sb_t *super)
++{
++ return (le32(super->sb_magicnum) != XFS_SB_MAGIC)
++ || ((le16(super->sb_versionnum) & XFS_SB_VERSION_NUMBITS) !=
++ XFS_SB_VERSION_4)
++ || (super->sb_inodelog < XFS_SB_INODELOG_MIN)
++ || (super->sb_inodelog > XFS_SB_INODELOG_MAX)
++ || (super->sb_blocklog < XFS_SB_BLOCKLOG_MIN)
++ || (super->sb_blocklog > XFS_SB_BLOCKLOG_MAX)
++ || (super->sb_blocklog < super->sb_inodelog)
++ || (super->sb_agblklog > XFS_SB_AGBLKLOG_MAX)
++ || ((1ull << super->sb_agblklog) < le32(super->sb_agblocks))
++ || (((1ull << super->sb_agblklog) >> 1) >=
++ le32(super->sb_agblocks))
++ || ((super->sb_blocklog + super->sb_dirblklog) >=
++ XFS_SB_DIRBLK_NUMBITS);
++}
++
+ static int
+ xfs_mount (fsi_file_t *ffi, const char *options)
+ {
+ xfs_sb_t super;
+
+ if (!devread (ffi, 0, 0, sizeof(super), (char *)&super)
+- || (le32(super.sb_magicnum) != XFS_SB_MAGIC)
+- || ((le16(super.sb_versionnum)
+- & XFS_SB_VERSION_NUMBITS) != XFS_SB_VERSION_4) ) {
++ || xfs_sb_is_invalid(&super)) {
+ return 0;
+ }
+
+- xfs.bsize = le32 (super.sb_blocksize);
+- xfs.blklog = super.sb_blocklog;
+- xfs.bdlog = xfs.blklog - SECTOR_BITS;
++ /*
++ * Not sanitized. It's exclusively used to generate disk addresses,
++ * so it's not important from a security standpoint.
++ */
+ xfs.rootino = le64 (super.sb_rootino);
+- xfs.isize = le16 (super.sb_inodesize);
+- xfs.agblocks = le32 (super.sb_agblocks);
+- xfs.dirbsize = xfs.bsize << super.sb_dirblklog;
+
+- xfs.inopblog = super.sb_inopblog;
++ /*
++ * Sanitized to be consistent with each other, only used to
++ * generate disk addresses, so it's safe
++ */
++ xfs.agblocks = le32 (super.sb_agblocks);
+ xfs.agblklog = super.sb_agblklog;
+
++ /* Derived from sanitized parameters */
++ xfs.bsize = 1 << super.sb_blocklog;
++ xfs.blklog = super.sb_blocklog;
++ xfs.bdlog = super.sb_blocklog - SECTOR_BITS;
++ xfs.isize = 1 << super.sb_inodelog;
++ xfs.dirbsize = 1 << (super.sb_blocklog + super.sb_dirblklog);
++ xfs.inopblog = super.sb_blocklog - super.sb_inodelog;
++
+ xfs.btnode_ptr0_off =
+ ((xfs.bsize - sizeof(xfs_btree_block_t)) /
+ (sizeof (xfs_bmbt_key_t) + sizeof (xfs_bmbt_ptr_t)))
+diff --git a/tools/libfsimage/xfs/xfs.h b/tools/libfsimage/xfs/xfs.h
+index 40699281e44d..b87e37d3d7e9 100644
+--- tools/libfsimage/xfs/xfs.h.orig
++++ tools/libfsimage/xfs/xfs.h
+@@ -134,6 +134,18 @@ typedef struct xfs_sb
+ xfs_uint8_t sb_dummy[7]; /* padding */
+ } xfs_sb_t;
+
++/* Bound taken from xfs.c in GRUB2. It doesn't exist in the spec */
++#define XFS_SB_DIRBLK_NUMBITS 27
++/* Implied by the XFS specification. The minimum block size is 512 octets */
++#define XFS_SB_BLOCKLOG_MIN 9
++/* Implied by the XFS specification. The maximum block size is 65536 octets */
++#define XFS_SB_BLOCKLOG_MAX 16
++/* Implied by the XFS specification. The minimum inode size is 256 octets */
++#define XFS_SB_INODELOG_MIN 8
++/* Implied by the XFS specification. The maximum inode size is 2048 octets */
++#define XFS_SB_INODELOG_MAX 11
++/* High bound for sb_agblklog */
++#define XFS_SB_AGBLKLOG_MAX 32
+
+ /* those are from xfs_btree.h */
+
+--
+2.42.0
+
+From e72c68e702dd930bc6013182bb44d3e8fbbb6bf4 Mon Sep 17 00:00:00 2001
+From: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Date: Thu, 14 Sep 2023 13:22:53 +0100
+Subject: [PATCH 04/11] libfsimage/xfs: Add compile-time check to libfsimage
+
+Adds the common tools include folder to the -I compile flags
+of libfsimage. This allows us to use:
+ xen-tools/common-macros.h:BUILD_BUG_ON()
+
+With it, statically assert a sanitized "blocklog - SECTOR_BITS" cannot
+underflow.
+
+This is part of XSA-443 / CVE-2023-34325
+
+Signed-off-by: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Reviewed-by: Jan Beulich <jbeulich%suse.com@localhost>
+---
+ tools/libfsimage/Rules.mk | 2 +-
+ tools/libfsimage/xfs/fsys_xfs.c | 4 +++-
+ 2 files changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/tools/libfsimage/Rules.mk b/tools/libfsimage/Rules.mk
+index bb6d42abb494..80598fb70aa7 100644
+--- tools/libfsimage/Rules.mk.orig
++++ tools/libfsimage/Rules.mk
+@@ -1,6 +1,6 @@
+ include $(XEN_ROOT)/tools/Rules.mk
+
+-CFLAGS += -Wno-unknown-pragmas -I$(XEN_ROOT)/tools/libfsimage/common/ -DFSIMAGE_FSDIR=\"$(FSDIR)\"
++CFLAGS += -Wno-unknown-pragmas -I$(XEN_ROOT)/tools/libfsimage/common/ $(CFLAGS_xeninclude) -DFSIMAGE_FSDIR=\"$(FSDIR)\"
+ CFLAGS += -Werror -D_GNU_SOURCE
+ LDFLAGS += -L../common/
+
+diff --git a/tools/libfsimage/xfs/fsys_xfs.c b/tools/libfsimage/xfs/fsys_xfs.c
+index e4eb7e1ee26f..4a8dd6f2397b 100644
+--- tools/libfsimage/xfs/fsys_xfs.c.orig
++++ tools/libfsimage/xfs/fsys_xfs.c
+@@ -19,6 +19,7 @@
+
+ #include <stdbool.h>
+ #include <xenfsimage_grub.h>
++#include <xen-tools/libs.h>
+ #include "xfs.h"
+
+ #define MAX_LINK_COUNT 8
+@@ -477,9 +478,10 @@ xfs_mount (fsi_file_t *ffi, const char *options)
+ xfs.agblklog = super.sb_agblklog;
+
+ /* Derived from sanitized parameters */
++ BUILD_BUG_ON(XFS_SB_BLOCKLOG_MIN < SECTOR_BITS);
++ xfs.bdlog = super.sb_blocklog - SECTOR_BITS;
+ xfs.bsize = 1 << super.sb_blocklog;
+ xfs.blklog = super.sb_blocklog;
+- xfs.bdlog = super.sb_blocklog - SECTOR_BITS;
+ xfs.isize = 1 << super.sb_inodelog;
+ xfs.dirbsize = 1 << (super.sb_blocklog + super.sb_dirblklog);
+ xfs.inopblog = super.sb_blocklog - super.sb_inodelog;
+--
+2.42.0
+
+From 75fdc03c5a6b7fac0c3a5ac06a5beaac73aad36f Mon Sep 17 00:00:00 2001
+From: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Date: Mon, 25 Sep 2023 18:32:21 +0100
+Subject: [PATCH 05/11] tools/pygrub: Remove unnecessary hypercall
+
+There's a hypercall being issued in order to determine whether PV64 is
+supported, but since Xen 4.3 that's strictly true so it's not required.
+
+Plus, this way we can avoid mapping the privcmd interface altogether in the
+depriv pygrub.
+
+This is part of XSA-443 / CVE-2023-34325
+
+Signed-off-by: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Reviewed-by: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+---
+ tools/pygrub/src/pygrub | 12 +-----------
+ 1 file changed, 1 insertion(+), 11 deletions(-)
+
+diff --git a/tools/pygrub/src/pygrub b/tools/pygrub/src/pygrub
+index ce7ab0eb8cf3..ce4e07d3e823 100755
+--- tools/pygrub/src/pygrub.orig
++++ tools/pygrub/src/pygrub
+@@ -18,7 +18,6 @@ import os, sys, string, struct, tempfile, re, traceback, stat, errno
+ import copy
+ import logging
+ import platform
+-import xen.lowlevel.xc
+
+ import curses, _curses, curses.textpad, curses.ascii
+ import getopt
+@@ -668,14 +667,6 @@ def run_grub(file, entry, fs, cfg_args):
+
+ return grubcfg
+
+-def supports64bitPVguest():
+- xc = xen.lowlevel.xc.xc()
+- caps = xc.xeninfo()['xen_caps'].split(" ")
+- for cap in caps:
+- if cap == "xen-3.0-x86_64":
+- return True
+- return False
+-
+ # If nothing has been specified, look for a Solaris domU. If found, perform the
+ # necessary tweaks.
+ def sniff_solaris(fs, cfg):
+@@ -684,8 +675,7 @@ def sniff_solaris(fs, cfg):
+ return cfg
+
+ if not cfg["kernel"]:
+- if supports64bitPVguest() and \
+- fs.file_exists("/platform/i86xpv/kernel/amd64/unix"):
++ if fs.file_exists("/platform/i86xpv/kernel/amd64/unix"):
+ cfg["kernel"] = "/platform/i86xpv/kernel/amd64/unix"
+ cfg["ramdisk"] = "/platform/i86pc/amd64/boot_archive"
+ elif fs.file_exists("/platform/i86xpv/kernel/unix"):
+--
+2.42.0
+
+From 1083a16f63461e844e9515ac4d35d48bf55785af Mon Sep 17 00:00:00 2001
+From: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Date: Mon, 25 Sep 2023 18:32:22 +0100
+Subject: [PATCH 06/11] tools/pygrub: Small refactors
+
+Small tidy up to ensure output_directory always has a trailing '/' to ease
+concatenating paths and that `output` can only be a filename or None.
+
+This is part of XSA-443 / CVE-2023-34325
+
+Signed-off-by: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Acked-by: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+---
+ tools/pygrub/src/pygrub | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+diff --git a/tools/pygrub/src/pygrub b/tools/pygrub/src/pygrub
+index ce4e07d3e823..1042c05b8676 100755
+--- tools/pygrub/src/pygrub.orig
++++ tools/pygrub/src/pygrub
+@@ -793,7 +793,7 @@ if __name__ == "__main__":
+ debug = False
+ not_really = False
+ output_format = "sxp"
+- output_directory = "/var/run/xen/pygrub"
++ output_directory = "/var/run/xen/pygrub/"
+
+ # what was passed in
+ incfg = { "kernel": None, "ramdisk": None, "args": "" }
+@@ -815,7 +815,8 @@ if __name__ == "__main__":
+ usage()
+ sys.exit()
+ elif o in ("--output",):
+- output = a
++ if a != "-":
++ output = a
+ elif o in ("--kernel",):
+ incfg["kernel"] = a
+ elif o in ("--ramdisk",):
+@@ -847,12 +848,11 @@ if __name__ == "__main__":
+ if not os.path.isdir(a):
+ print("%s is not an existing directory" % a)
+ sys.exit(1)
+- output_directory = a
++ output_directory = a + '/'
+
+ if debug:
+ logging.basicConfig(level=logging.DEBUG)
+
+-
+ try:
+ os.makedirs(output_directory, 0o700)
+ except OSError as e:
+@@ -861,7 +861,7 @@ if __name__ == "__main__":
+ else:
+ raise
+
+- if output is None or output == "-":
++ if output is None:
+ fd = sys.stdout.fileno()
+ else:
+ fd = os.open(output, os.O_WRONLY)
+--
+2.42.0
+
+From 350db30e33f39af40c1e3752d73c0a30ef2d26e7 Mon Sep 17 00:00:00 2001
+From: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Date: Mon, 25 Sep 2023 18:32:23 +0100
+Subject: [PATCH 07/11] tools/pygrub: Open the output files earlier
+
+This patch allows pygrub to get ahold of every RW file descriptor it needs
+early on. A later patch will clamp the filesystem it can access so it can't
+obtain any others.
+
+This is part of XSA-443 / CVE-2023-34325
+
+Signed-off-by: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Acked-by: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+---
+ tools/pygrub/src/pygrub | 37 ++++++++++++++++++++++---------------
+ 1 file changed, 22 insertions(+), 15 deletions(-)
+
+diff --git a/tools/pygrub/src/pygrub b/tools/pygrub/src/pygrub
+index 1042c05b8676..91e2ec2ab105 100755
+--- tools/pygrub/src/pygrub.orig
++++ tools/pygrub/src/pygrub
+@@ -738,8 +738,7 @@ if __name__ == "__main__":
+ def usage():
+ print("Usage: %s [-q|--quiet] [-i|--interactive] [-l|--list-entries] [-n|--not-really] [--output=] [--kernel=] [--ramdisk=] [--args=] [--entry=] [--output-directory=] [--output-format=sxp|simple|simple0] [--offset=] <image>" %(sys.argv[0],), file=sys.stderr)
+
+- def copy_from_image(fs, file_to_read, file_type, output_directory,
+- not_really):
++ def copy_from_image(fs, file_to_read, file_type, fd_dst, path_dst, not_really):
+ if not_really:
+ if fs.file_exists(file_to_read):
+ return "<%s:%s>" % (file_type, file_to_read)
+@@ -750,21 +749,18 @@ if __name__ == "__main__":
+ except Exception as e:
+ print(e, file=sys.stderr)
+ sys.exit("Error opening %s in guest" % file_to_read)
+- (tfd, ret) = tempfile.mkstemp(prefix="boot_"+file_type+".",
+- dir=output_directory)
+ dataoff = 0
+ while True:
+ data = datafile.read(FS_READ_MAX, dataoff)
+ if len(data) == 0:
+- os.close(tfd)
++ os.close(fd_dst)
+ del datafile
+- return ret
++ return
+ try:
+- os.write(tfd, data)
++ os.write(fd_dst, data)
+ except Exception as e:
+ print(e, file=sys.stderr)
+- os.close(tfd)
+- os.unlink(ret)
++ os.unlink(path_dst)
+ del datafile
+ sys.exit("Error writing temporary copy of "+file_type)
+ dataoff += len(data)
+@@ -861,6 +857,14 @@ if __name__ == "__main__":
+ else:
+ raise
+
++ if not_really:
++ fd_kernel = path_kernel = fd_ramdisk = path_ramdisk = None
++ else:
++ (fd_kernel, path_kernel) = tempfile.mkstemp(prefix="boot_kernel.",
++ dir=output_directory)
++ (fd_ramdisk, path_ramdisk) = tempfile.mkstemp(prefix="boot_ramdisk.",
++ dir=output_directory)
++
+ if output is None:
+ fd = sys.stdout.fileno()
+ else:
+@@ -920,20 +924,23 @@ if __name__ == "__main__":
+ if fs is None:
+ raise RuntimeError("Unable to find partition containing kernel")
+
+- bootcfg["kernel"] = copy_from_image(fs, chosencfg["kernel"], "kernel",
+- output_directory, not_really)
++ copy_from_image(fs, chosencfg["kernel"], "kernel",
++ fd_kernel, path_kernel, not_really)
++ bootcfg["kernel"] = path_kernel
+
+ if chosencfg["ramdisk"]:
+ try:
+- bootcfg["ramdisk"] = copy_from_image(fs, chosencfg["ramdisk"],
+- "ramdisk", output_directory,
+- not_really)
++ copy_from_image(fs, chosencfg["ramdisk"], "ramdisk",
++ fd_ramdisk, path_ramdisk, not_really)
+ except:
+ if not not_really:
+- os.unlink(bootcfg["kernel"])
++ os.unlink(path_kernel)
+ raise
++ bootcfg["ramdisk"] = path_ramdisk
+ else:
+ initrd = None
++ if not not_really:
++ os.unlink(path_ramdisk)
+
+ args = None
+ if chosencfg["args"]:
+--
+2.42.0
+
+From 1548ad2291ec7a72ae6949c11d2e50cea135a48d Mon Sep 17 00:00:00 2001
+From: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Date: Mon, 25 Sep 2023 18:32:24 +0100
+Subject: [PATCH 08/11] tools/libfsimage: Export a new function to preload all
+ plugins
+
+This is work required in order to let pygrub operate in highly deprivileged
+chroot mode. This patch adds a function that preloads every plugin, hence
+ensuring that on function exit, every shared library is loaded in memory.
+
+The new "init" function is supposed to be used before depriv, but that's
+fine because it's not acting on untrusted data.
+
+This is part of XSA-443 / CVE-2023-34325
+
+Signed-off-by: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Acked-by: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+---
+ tools/libfsimage/common/fsimage_plugin.c | 4 ++--
+ tools/libfsimage/common/mapfile-GNU | 1 +
+ tools/libfsimage/common/mapfile-SunOS | 1 +
+ tools/libfsimage/common/xenfsimage.h | 8 ++++++++
+ tools/pygrub/src/fsimage/fsimage.c | 15 +++++++++++++++
+ 5 files changed, 27 insertions(+), 2 deletions(-)
+
+diff --git a/tools/libfsimage/common/fsimage_plugin.c b/tools/libfsimage/common/fsimage_plugin.c
+index de1412b4233a..d0cb9e96a654 100644
+--- tools/libfsimage/common/fsimage_plugin.c.orig
++++ tools/libfsimage/common/fsimage_plugin.c
+@@ -119,7 +119,7 @@ fail:
+ return (-1);
+ }
+
+-static int load_plugins(void)
++int fsi_init(void)
+ {
+ const char *fsdir = getenv("XEN_FSIMAGE_FSDIR");
+ struct dirent *dp = NULL;
+@@ -180,7 +180,7 @@ int find_plugin(fsi_t *fsi, const char *path, const char *options)
+ fsi_plugin_t *fp;
+ int ret = 0;
+
+- if (plugins == NULL && (ret = load_plugins()) != 0)
++ if (plugins == NULL && (ret = fsi_init()) != 0)
+ goto out;
+
+ for (fp = plugins; fp != NULL; fp = fp->fp_next) {
+diff --git a/tools/libfsimage/common/mapfile-GNU b/tools/libfsimage/common/mapfile-GNU
+index 26d4d7a69ec7..2d54d527d7f5 100644
+--- tools/libfsimage/common/mapfile-GNU.orig
++++ tools/libfsimage/common/mapfile-GNU
+@@ -1,6 +1,7 @@
+ VERSION {
+ libfsimage.so.1.0 {
+ global:
++ fsi_init;
+ fsi_open_fsimage;
+ fsi_close_fsimage;
+ fsi_file_exists;
+diff --git a/tools/libfsimage/common/mapfile-SunOS b/tools/libfsimage/common/mapfile-SunOS
+index e99b90b65077..48deedb4252f 100644
+--- tools/libfsimage/common/mapfile-SunOS.orig
++++ tools/libfsimage/common/mapfile-SunOS
+@@ -1,5 +1,6 @@
+ libfsimage.so.1.0 {
+ global:
++ fsi_init;
+ fsi_open_fsimage;
+ fsi_close_fsimage;
+ fsi_file_exists;
+diff --git a/tools/libfsimage/common/xenfsimage.h b/tools/libfsimage/common/xenfsimage.h
+index 201abd54f23a..341883b2d71a 100644
+--- tools/libfsimage/common/xenfsimage.h.orig
++++ tools/libfsimage/common/xenfsimage.h
+@@ -35,6 +35,14 @@ extern C {
+ typedef struct fsi fsi_t;
+ typedef struct fsi_file fsi_file_t;
+
++/*
++ * Optional initialization function. If invoked it loads the associated
++ * dynamic libraries for the backends ahead of time. This is required if
++ * the library is to run as part of a highly deprivileged executable, as
++ * the libraries may not be reachable after depriv.
++ */
++int fsi_init(void);
++
+ fsi_t *fsi_open_fsimage(const char *, uint64_t, const char *);
+ void fsi_close_fsimage(fsi_t *);
+
+diff --git a/tools/pygrub/src/fsimage/fsimage.c b/tools/pygrub/src/fsimage/fsimage.c
+index 2ebbbe35df92..92fbf2851f01 100644
+--- tools/pygrub/src/fsimage/fsimage.c.orig
++++ tools/pygrub/src/fsimage/fsimage.c
+@@ -286,6 +286,15 @@ fsimage_getbootstring(PyObject *o, PyObject *args)
+ return Py_BuildValue("s", bootstring);
+ }
+
++static PyObject *
++fsimage_init(PyObject *o, PyObject *args)
++{
++ if (!PyArg_ParseTuple(args, ""))
++ return (NULL);
++
++ return Py_BuildValue("i", fsi_init());
++}
++
+ PyDoc_STRVAR(fsimage_open__doc__,
+ "open(name, [offset=off]) - Open the given file as a filesystem image.\n"
+ "\n"
+@@ -297,7 +306,13 @@ PyDoc_STRVAR(fsimage_getbootstring__doc__,
+ "getbootstring(fs) - Return the boot string needed for this file system "
+ "or NULL if none is needed.\n");
+
++PyDoc_STRVAR(fsimage_init__doc__,
++ "init() - Loads every dynamic library contained in xenfsimage "
++ "into memory so that it can be used in chrooted environments.\n");
++
+ static struct PyMethodDef fsimage_module_methods[] = {
++ { "init", (PyCFunction)fsimage_init,
++ METH_VARARGS, fsimage_init__doc__ },
+ { "open", (PyCFunction)fsimage_open,
+ METH_VARARGS|METH_KEYWORDS, fsimage_open__doc__ },
+ { "getbootstring", (PyCFunction)fsimage_getbootstring,
+--
+2.42.0
+
+From 4d331b0b914dfc17bd2d883bc55aeb798930832a Mon Sep 17 00:00:00 2001
+From: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Date: Mon, 25 Sep 2023 18:32:25 +0100
+Subject: [PATCH 09/11] tools/pygrub: Deprivilege pygrub
+
+Introduce a --runas=<uid> flag to deprivilege pygrub on Linux and *BSDs. It
+also implicitly creates a chroot env where it drops a deprivileged forked
+process. The chroot itself is cleaned up at the end.
+
+If the --runas arg is present, then pygrub forks, leaving the child to
+deprivilege itself, and waiting for it to complete. When the child exits,
+the parent performs cleanup and exits with the same error code.
+
+This is roughly what the child does:
+ 1. Initialize libfsimage (this loads every .so in memory so the chroot
+ can avoid bind-mounting /{,usr}/lib*)
+ 2. Create a temporary empty chroot directory
+ 3. Mount tmpfs in it
+ 4. Bind mount the disk inside, because libfsimage expects a path, not a
+ file descriptor.
+ 5. Remount the root tmpfs to be stricter (ro,nosuid,nodev)
+ 6. Set RLIMIT_FSIZE to a sensibly high amount (128 MiB)
+ 7. Depriv gid, groups and uid
+
+With this scheme in place, the "output" files are writable (up to
+RLIMIT_FSIZE octets) and the exposed filesystem is immutable and contains
+the only file we can't easily get rid of (the disk).
+
+If running on Linux, the child process also unshares mount, IPC, and
+network namespaces before dropping its privileges.
+
+This is part of XSA-443 / CVE-2023-34325
+
+Signed-off-by: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Acked-by: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+---
+ tools/pygrub/setup.py | 2 +-
+ tools/pygrub/src/pygrub | 162 +++++++++++++++++++++++++++++++++++++---
+ 2 files changed, 154 insertions(+), 10 deletions(-)
+
+diff --git a/tools/pygrub/setup.py b/tools/pygrub/setup.py
+index b8f1dc4590cf..f16187b6d118 100644
+--- tools/pygrub/setup.py.orig
++++ tools/pygrub/setup.py
+@@ -17,7 +17,7 @@ xenfsimage = Extension("xenfsimage",
+ pkgs = [ 'grub' ]
+
+ setup(name='pygrub',
+- version='0.6',
++ version='0.7',
+ description='Boot loader that looks a lot like grub for Xen',
+ author='Jeremy Katz',
+ author_email='katzj%redhat.com@localhost',
+diff --git a/tools/pygrub/src/pygrub b/tools/pygrub/src/pygrub
+index 91e2ec2ab105..7cea496ade08 100755
+--- tools/pygrub/src/pygrub.orig
++++ tools/pygrub/src/pygrub
+@@ -16,8 +16,11 @@ from __future__ import print_function
+
+ import os, sys, string, struct, tempfile, re, traceback, stat, errno
+ import copy
++import ctypes, ctypes.util
+ import logging
+ import platform
++import resource
++import subprocess
+
+ import curses, _curses, curses.textpad, curses.ascii
+ import getopt
+@@ -27,10 +30,135 @@ import grub.GrubConf
+ import grub.LiloConf
+ import grub.ExtLinuxConf
+
+-PYGRUB_VER = 0.6
++PYGRUB_VER = 0.7
+ FS_READ_MAX = 1024 * 1024
+ SECTOR_SIZE = 512
+
++# Unless provided through the env variable PYGRUB_MAX_FILE_SIZE_MB, then
++# this is the maximum filesize allowed for files written by the depriv
++# pygrub
++LIMIT_FSIZE = 128 << 20
++
++CLONE_NEWNS = 0x00020000 # mount namespace
++CLONE_NEWNET = 0x40000000 # network namespace
++CLONE_NEWIPC = 0x08000000 # IPC namespace
++
++def unshare(flags):
++ if not sys.platform.startswith("linux"):
++ print("skip_unshare reason=not_linux platform=%s", sys.platform, file=sys.stderr)
++ return
++
++ libc = ctypes.CDLL(ctypes.util.find_library('c'), use_errno=True)
++ unshare_prototype = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, use_errno=True)
++ unshare = unshare_prototype(('unshare', libc))
++
++ if unshare(flags) < 0:
++ raise OSError(ctypes.get_errno(), os.strerror(ctypes.get_errno()))
++
++def bind_mount(src, dst, options):
++ open(dst, "a").close() # touch
++
++ rc = subprocess.call(["mount", "--bind", "-o", options, src, dst])
++ if rc != 0:
++ raise RuntimeError("bad_mount: src=%s dst=%s opts=%s" %
++ (src, dst, options))
++
++def downgrade_rlimits():
++ # Wipe the authority to use unrequired resources
++ resource.setrlimit(resource.RLIMIT_NPROC, (0, 0))
++ resource.setrlimit(resource.RLIMIT_CORE, (0, 0))
++ resource.setrlimit(resource.RLIMIT_MEMLOCK, (0, 0))
++
++ # py2's resource module doesn't know about resource.RLIMIT_MSGQUEUE
++ #
++ # TODO: Use resource.RLIMIT_MSGQUEUE after python2 is deprecated
++ if sys.platform.startswith('linux'):
++ RLIMIT_MSGQUEUE = 12
++ resource.setrlimit(RLIMIT_MSGQUEUE, (0, 0))
++
++ # The final look of the filesystem for this process is fully RO, but
++ # note we have some file descriptor already open (notably, kernel and
++ # ramdisk). In order to avoid a compromised pygrub from filling up the
++ # filesystem we set RLIMIT_FSIZE to a high bound, so that the file
++ # write permissions are bound.
++ fsize = LIMIT_FSIZE
++ if "PYGRUB_MAX_FILE_SIZE_MB" in os.environ.keys():
++ fsize = os.environ["PYGRUB_MAX_FILE_SIZE_MB"] << 20
++
++ resource.setrlimit(resource.RLIMIT_FSIZE, (fsize, fsize))
++
++def depriv(output_directory, output, device, uid, path_kernel, path_ramdisk):
++ # The only point of this call is to force the loading of libfsimage.
++ # That way, we don't need to bind-mount it into the chroot
++ rc = xenfsimage.init()
++ if rc != 0:
++ os.unlink(path_ramdisk)
++ os.unlink(path_kernel)
++ raise RuntimeError("bad_xenfsimage: rc=%d" % rc)
++
++ # Create a temporary directory for the chroot
++ chroot = tempfile.mkdtemp(prefix=str(uid)+'-', dir=output_directory) + '/'
++ device_path = '/device'
++
++ pid = os.fork()
++ if pid:
++ # parent
++ _, rc = os.waitpid(pid, 0)
++
++ for path in [path_kernel, path_ramdisk]:
++ # If the child didn't write anything, just get rid of it,
++ # otherwise we end up consuming a 0-size file when parsing
++ # systems without a ramdisk that the ultimate caller of pygrub
++ # may just be unaware of
++ if rc != 0 or os.path.getsize(path) == 0:
++ os.unlink(path)
++
++ # Normally, unshare(CLONE_NEWNS) will ensure this is not required.
++ # However, this syscall doesn't exist in *BSD systems and doesn't
++ # auto-unmount everything on older Linux kernels (At least as of
++ # Linux 4.19, but it seems fixed in 5.15). Either way,
++ # recursively unmount everything if needed. Quietly.
++ with open('/dev/null', 'w') as devnull:
++ subprocess.call(["umount", "-f", chroot + device_path],
++ stdout=devnull, stderr=devnull)
++ subprocess.call(["umount", "-f", chroot],
++ stdout=devnull, stderr=devnull)
++ os.rmdir(chroot)
++
++ sys.exit(rc)
++
++ # By unsharing the namespace we're making sure it's all bulk-released
++ # at the end, when the namespaces disappear. This means the kernel does
++ # (almost) all the cleanup for us and the parent just has to remove the
++ # temporary directory.
++ unshare(CLONE_NEWNS | CLONE_NEWIPC | CLONE_NEWNET)
++
++ # Set sensible limits using the setrlimit interface
++ downgrade_rlimits()
++
++ # We'll mount tmpfs on the chroot to ensure the deprivileged child
++ # cannot affect the persistent state. It's RW now in order to
++ # bind-mount the device, but note it's remounted RO after that.
++ rc = subprocess.call(["mount", "-t", "tmpfs", "none", chroot])
++ if rc != 0:
++ raise RuntimeError("mount_tmpfs rc=%d dst=\"%s\"" % (rc, chroot))
++
++ # Bind the untrusted device RO
++ bind_mount(device, chroot + device_path, "ro,nosuid,noexec")
++
++ rc = subprocess.call(["mount", "-t", "tmpfs", "-o", "remount,ro,nosuid,noexec,nodev", "none", chroot])
++ if rc != 0:
++ raise RuntimeError("remount_tmpfs rc=%d dst=\"%s\"" % (rc, chroot))
++
++ # Drop superpowers!
++ os.chroot(chroot)
++ os.chdir('/')
++ os.setgid(uid)
++ os.setgroups([uid])
++ os.setuid(uid)
++
++ return device_path
++
+ def read_size_roundup(fd, size):
+ if platform.system() != 'FreeBSD':
+ return size
+@@ -736,7 +864,7 @@ if __name__ == "__main__":
+ sel = None
+
+ def usage():
+- print("Usage: %s [-q|--quiet] [-i|--interactive] [-l|--list-entries] [-n|--not-really] [--output=] [--kernel=] [--ramdisk=] [--args=] [--entry=] [--output-directory=]
[--output-format=sxp|simple|simple0] [--offset=] <image>" %(sys.argv[0],), file=sys.stderr)
++ print("Usage: %s [-q|--quiet] [-i|--interactive] [-l|--list-entries] [-n|--not-really] [--output=] [--kernel=] [--ramdisk=] [--args=] [--entry=] [--output-directory=]
[--output-format=sxp|simple|simple0] [--runas=] [--offset=] <image>" %(sys.argv[0],), file=sys.stderr)
+
+ def copy_from_image(fs, file_to_read, file_type, fd_dst, path_dst, not_really):
+ if not_really:
+@@ -760,7 +888,8 @@ if __name__ == "__main__":
+ os.write(fd_dst, data)
+ except Exception as e:
+ print(e, file=sys.stderr)
+- os.unlink(path_dst)
++ if path_dst:
++ os.unlink(path_dst)
+ del datafile
+ sys.exit("Error writing temporary copy of "+file_type)
+ dataoff += len(data)
+@@ -769,7 +898,7 @@ if __name__ == "__main__":
+ opts, args = getopt.gnu_getopt(sys.argv[1:], 'qilnh::',
+ ["quiet", "interactive", "list-entries", "not-really", "help",
+ "output=", "output-format=", "output-directory=", "offset=",
+- "entry=", "kernel=",
++ "runas=", "entry=", "kernel=",
+ "ramdisk=", "args=", "isconfig", "debug"])
+ except getopt.GetoptError:
+ usage()
+@@ -790,6 +919,7 @@ if __name__ == "__main__":
+ not_really = False
+ output_format = "sxp"
+ output_directory = "/var/run/xen/pygrub/"
++ uid = None
+
+ # what was passed in
+ incfg = { "kernel": None, "ramdisk": None, "args": "" }
+@@ -813,6 +943,13 @@ if __name__ == "__main__":
+ elif o in ("--output",):
+ if a != "-":
+ output = a
++ elif o in ("--runas",):
++ try:
++ uid = int(a)
++ except ValueError:
++ print("runas value must be an integer user id")
++ usage()
++ sys.exit(1)
+ elif o in ("--kernel",):
+ incfg["kernel"] = a
+ elif o in ("--ramdisk",):
+@@ -849,6 +986,10 @@ if __name__ == "__main__":
+ if debug:
+ logging.basicConfig(level=logging.DEBUG)
+
++ if interactive and uid:
++ print("In order to use --runas, you must also set --entry or -q", file=sys.stderr)
++ sys.exit(1)
++
+ try:
+ os.makedirs(output_directory, 0o700)
+ except OSError as e:
+@@ -870,6 +1011,9 @@ if __name__ == "__main__":
+ else:
+ fd = os.open(output, os.O_WRONLY)
+
++ if uid:
++ file = depriv(output_directory, output, file, uid, path_kernel, path_ramdisk)
++
+ # debug
+ if isconfig:
+ chosencfg = run_grub(file, entry, fs, incfg["args"])
+@@ -925,21 +1069,21 @@ if __name__ == "__main__":
+ raise RuntimeError("Unable to find partition containing kernel")
+
+ copy_from_image(fs, chosencfg["kernel"], "kernel",
+- fd_kernel, path_kernel, not_really)
++ fd_kernel, None if uid else path_kernel, not_really)
+ bootcfg["kernel"] = path_kernel
+
+ if chosencfg["ramdisk"]:
+ try:
+ copy_from_image(fs, chosencfg["ramdisk"], "ramdisk",
+- fd_ramdisk, path_ramdisk, not_really)
++ fd_ramdisk, None if uid else path_ramdisk, not_really)
+ except:
+- if not not_really:
+- os.unlink(path_kernel)
++ if not uid and not not_really:
++ os.unlink(path_kernel)
+ raise
+ bootcfg["ramdisk"] = path_ramdisk
+ else:
+ initrd = None
+- if not not_really:
++ if not uid and not not_really:
+ os.unlink(path_ramdisk)
+
+ args = None
+--
+2.42.0
+
+From 576e7aa02ab838b6768b498f310c70ca49537202 Mon Sep 17 00:00:00 2001
+From: Roger Pau Monne <roger.pau%citrix.com@localhost>
+Date: Mon, 25 Sep 2023 14:30:20 +0200
+Subject: [PATCH 10/11] libxl: add support for running bootloader in restricted
+ mode
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Much like the device model depriv mode, add the same kind of support for the
+bootloader. Such feature allows passing a UID as a parameter for the
+bootloader to run as, together with the bootloader itself taking the necessary
+actions to isolate.
+
+Note that the user to run the bootloader as must have the right permissions to
+access the guest disk image (in read mode only), and that the bootloader will
+be run in non-interactive mode when restricted.
+
+If enabled, bootloader restrict mode will attempt to re-use the user(s) from the
+QEMU depriv implementation if no user is provided in the configuration file or
+the environment. See docs/features/qemu-deprivilege.pandoc for more
+information about how to set up those users.
+
+Bootloader restrict mode is not enabled by default as it requires certain
+setup to be done first (setup of the user(s) to use in restrict mode).
+
+This is part of XSA-443 / CVE-2023-34325
+
+Signed-off-by: Roger Pau Monné <roger.pau%citrix.com@localhost>
+Reviewed-by: Anthony PERARD <anthony.perard%citrix.com@localhost>
+---
+ docs/man/xl.1.pod.in | 33 +++++++++++
+ tools/libs/light/libxl_bootloader.c | 89 ++++++++++++++++++++++++++++-
+ tools/libs/light/libxl_dm.c | 8 +--
+ tools/libs/light/libxl_internal.h | 8 +++
+ 4 files changed, 131 insertions(+), 7 deletions(-)
+
+diff --git a/docs/man/xl.1.pod.in b/docs/man/xl.1.pod.in
+index 45e1430aeb74..96e6fb1c32a3 100644
+--- docs/man/xl.1.pod.in.orig
++++ docs/man/xl.1.pod.in
+@@ -1976,6 +1976,39 @@ ignored:
+
+ =back
+
++=head1 ENVIRONMENT VARIABLES
++
++The following environment variables shall affect the execution of xl:
++
++=over 4
++
++=item LIBXL_BOOTLOADER_RESTRICT
++
++Attempt to restrict the bootloader after startup, to limit the
++consequences of security vulnerabilities due to parsing guest
++owned image files.
++
++See docs/features/qemu-deprivilege.pandoc for more information
++on how to setup the unprivileged users.
++
++Note that running the bootloader in restricted mode also implies using
++non-interactive mode, and the disk image must be readable by the
++restricted user.
++
++Having this variable set is equivalent to enabling the option, even if the
++value is 0.
++
++=item LIBXL_BOOTLOADER_USER
++
++When using bootloader_restrict, run the bootloader as this user. If
++not set the default QEMU restrict users will be used.
++
++NOTE: Each domain MUST have a SEPARATE username.
++
++See docs/features/qemu-deprivilege.pandoc for more information.
++
++=back
++
+ =head1 SEE ALSO
+
+ The following man pages:
+diff --git a/tools/libs/light/libxl_bootloader.c b/tools/libs/light/libxl_bootloader.c
+index 18e9ebd7148c..97d9bf4ddc0a 100644
+--- tools/libs/light/libxl_bootloader.c.orig
++++ tools/libs/light/libxl_bootloader.c
+@@ -14,6 +14,7 @@
+
+ #include "libxl_osdeps.h" /* must come before any other headers */
+
++#include <pwd.h>
+ #include <termios.h>
+ #ifdef HAVE_UTMP_H
+ #include <utmp.h>
+@@ -46,8 +47,71 @@ static void bootloader_arg(libxl__bootloader_state *bl, const char *arg)
+ bl->args[bl->nargs++] = arg;
+ }
+
+-static void make_bootloader_args(libxl__gc *gc, libxl__bootloader_state *bl,
+- const char *bootloader_path)
++static int bootloader_uid(libxl__gc *gc, domid_t guest_domid,
++ const char *user, uid_t *intended_uid)
++{
++ struct passwd *user_base, user_pwbuf;
++ int rc;
++
++ if (user) {
++ rc = userlookup_helper_getpwnam(gc, user, &user_pwbuf, &user_base);
++ if (rc) return rc;
++
++ if (!user_base) {
++ LOGD(ERROR, guest_domid, "Couldn't find user %s", user);
++ return ERROR_INVAL;
++ }
++
++ *intended_uid = user_base->pw_uid;
++ return 0;
++ }
++
++ /* Re-use QEMU user range for the bootloader. */
++ rc = userlookup_helper_getpwnam(gc, LIBXL_QEMU_USER_RANGE_BASE,
++ &user_pwbuf, &user_base);
++ if (rc) return rc;
++
++ if (user_base) {
++ struct passwd *user_clash, user_clash_pwbuf;
++ uid_t temp_uid = user_base->pw_uid + guest_domid;
++
++ rc = userlookup_helper_getpwuid(gc, temp_uid, &user_clash_pwbuf,
++ &user_clash);
++ if (rc) return rc;
++
++ if (user_clash) {
++ LOGD(ERROR, guest_domid,
++ "wanted to use uid %ld (%s + %d) but that is user %s !",
++ (long)temp_uid, LIBXL_QEMU_USER_RANGE_BASE,
++ guest_domid, user_clash->pw_name);
++ return ERROR_INVAL;
++ }
++
++ *intended_uid = temp_uid;
++ return 0;
++ }
++
++ rc = userlookup_helper_getpwnam(gc, LIBXL_QEMU_USER_SHARED, &user_pwbuf,
++ &user_base);
++ if (rc) return rc;
++
++ if (user_base) {
++ LOGD(WARN, guest_domid, "Could not find user %s, falling back to %s",
++ LIBXL_QEMU_USER_RANGE_BASE, LIBXL_QEMU_USER_SHARED);
++ *intended_uid = user_base->pw_uid;
++
++ return 0;
++ }
++
++ LOGD(ERROR, guest_domid,
++ "Could not find user %s or range base pseudo-user %s, cannot restrict",
++ LIBXL_QEMU_USER_SHARED, LIBXL_QEMU_USER_RANGE_BASE);
++
++ return ERROR_INVAL;
++}
++
++static int make_bootloader_args(libxl__gc *gc, libxl__bootloader_state *bl,
++ const char *bootloader_path)
+ {
+ const libxl_domain_build_info *info = bl->info;
+
+@@ -65,6 +129,23 @@ static void make_bootloader_args(libxl__gc *gc, libxl__bootloader_state *bl,
+ ARG(GCSPRINTF("--ramdisk=%s", info->ramdisk));
+ if (info->cmdline && *info->cmdline != '\0')
+ ARG(GCSPRINTF("--args=%s", info->cmdline));
++ if (getenv("LIBXL_BOOTLOADER_RESTRICT") ||
++ getenv("LIBXL_BOOTLOADER_USER")) {
++ uid_t uid = -1;
++ int rc = bootloader_uid(gc, bl->domid, getenv("LIBXL_BOOTLOADER_USER"),
++ &uid);
++
++ if (rc) return rc;
++
++ assert(uid != -1);
++ if (!uid) {
++ LOGD(ERROR, bl->domid, "bootloader restrict UID is 0 (root)!");
++ return ERROR_INVAL;
++ }
++ LOGD(DEBUG, bl->domid, "using uid %ld", (long)uid);
++ ARG(GCSPRINTF("--runas=%ld", (long)uid));
++ ARG("--quiet");
++ }
+
+ ARG(GCSPRINTF("--output=%s", bl->outputpath));
+ ARG("--output-format=simple0");
+@@ -83,6 +164,7 @@ static void make_bootloader_args(libxl__gc *gc, libxl__bootloader_state *bl,
+ /* Sentinel for execv */
+ ARG(NULL);
+
++ return 0;
+ #undef ARG
+ }
+
+@@ -447,7 +529,8 @@ static void bootloader_disk_attached_cb(libxl__egc *egc,
+ bootloader = bltmp;
+ }
+
+- make_bootloader_args(gc, bl, bootloader);
++ rc = make_bootloader_args(gc, bl, bootloader);
++ if (rc) goto out;
+
+ bl->openpty.ao = ao;
+ bl->openpty.callback = bootloader_gotptys;
+diff --git a/tools/libs/light/libxl_dm.c b/tools/libs/light/libxl_dm.c
+index b86e8ccc858f..59de5c1ae22f 100644
+--- tools/libs/light/libxl_dm.c.orig
++++ tools/libs/light/libxl_dm.c
+@@ -80,10 +80,10 @@ static int libxl__create_qemu_logfile(libxl__gc *gc, char *name)
+ * On error, return a libxl-style error code.
+ */
+ #define DEFINE_USERLOOKUP_HELPER(NAME,SPEC_TYPE,STRUCTNAME,SYSCONF) \
+- static int userlookup_helper_##NAME(libxl__gc *gc, \
+- SPEC_TYPE spec, \
+- struct STRUCTNAME *resultbuf, \
+- struct STRUCTNAME **out) \
++ int userlookup_helper_##NAME(libxl__gc *gc, \
++ SPEC_TYPE spec, \
++ struct STRUCTNAME *resultbuf, \
++ struct STRUCTNAME **out) \
+ { \
+ struct STRUCTNAME *resultp = NULL; \
+ char *buf = NULL; \
+diff --git a/tools/libs/light/libxl_internal.h b/tools/libs/light/libxl_internal.h
+index cc27c72ecf30..8415d1feed16 100644
+--- tools/libs/light/libxl_internal.h.orig
++++ tools/libs/light/libxl_internal.h
+@@ -4864,6 +4864,14 @@ struct libxl__cpu_policy {
+ struct xc_msr *msr;
+ };
+
++struct passwd;
++_hidden int userlookup_helper_getpwnam(libxl__gc*, const char *user,
++ struct passwd *res,
++ struct passwd **out);
++_hidden int userlookup_helper_getpwuid(libxl__gc*, uid_t uid,
++ struct passwd *res,
++ struct passwd **out);
++
+ #endif
+
+ /*
+--
+2.42.0
+
+From 34221884752bb835bbdab66378b3cecbf133e3d3 Mon Sep 17 00:00:00 2001
+From: Roger Pau Monne <roger.pau%citrix.com@localhost>
+Date: Thu, 28 Sep 2023 12:22:35 +0200
+Subject: [PATCH 11/11] libxl: limit bootloader execution in restricted mode
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Introduce a timeout for bootloader execution when running in restricted mode.
+
+Allow overriding the default timeout with an environment-provided value.
+
+This is part of XSA-443 / CVE-2023-34325
+
+Signed-off-by: Roger Pau Monné <roger.pau%citrix.com@localhost>
+Reviewed-by: Anthony PERARD <anthony.perard%citrix.com@localhost>
+---
+ docs/man/xl.1.pod.in | 8 ++++++
+ tools/libs/light/libxl_bootloader.c | 40 +++++++++++++++++++++++++++++
+ tools/libs/light/libxl_internal.h | 2 ++
+ 3 files changed, 50 insertions(+)
+
+diff --git a/docs/man/xl.1.pod.in b/docs/man/xl.1.pod.in
+index 96e6fb1c32a3..8f056450a730 100644
+--- docs/man/xl.1.pod.in.orig
++++ docs/man/xl.1.pod.in
+@@ -2007,6 +2007,14 @@ NOTE: Each domain MUST have a SEPARATE username.
+
+ See docs/features/qemu-deprivilege.pandoc for more information.
+
++=item LIBXL_BOOTLOADER_TIMEOUT
++
++Timeout in seconds for bootloader execution when running in restricted mode.
++Otherwise the build time default in LIBXL_BOOTLOADER_TIMEOUT will be used.
++
++If defined the value must be an unsigned integer between 0 and INT_MAX,
++otherwise behavior is undefined. Setting to 0 disables the timeout.
++
+ =back
+
+ =head1 SEE ALSO
+diff --git a/tools/libs/light/libxl_bootloader.c b/tools/libs/light/libxl_bootloader.c
+index 97d9bf4ddc0a..3ca6463e5f63 100644
+--- tools/libs/light/libxl_bootloader.c.orig
++++ tools/libs/light/libxl_bootloader.c
+@@ -34,6 +34,8 @@ static void bootloader_keystrokes_copyfail(libxl__egc *egc,
+ libxl__datacopier_state *dc, int rc, int onwrite, int errnoval);
+ static void bootloader_display_copyfail(libxl__egc *egc,
+ libxl__datacopier_state *dc, int rc, int onwrite, int errnoval);
++static void bootloader_timeout(libxl__egc *egc, libxl__ev_time *ev,
++ const struct timeval *requested_abs, int rc);
+ static void bootloader_domaindeath(libxl__egc*, libxl__domaindeathcheck *dc,
+ int rc);
+ static void bootloader_finished(libxl__egc *egc, libxl__ev_child *child,
+@@ -301,6 +303,7 @@ void libxl__bootloader_init(libxl__bootloader_state *bl)
+ bl->ptys[0].master = bl->ptys[0].slave = 0;
+ bl->ptys[1].master = bl->ptys[1].slave = 0;
+ libxl__ev_child_init(&bl->child);
++ libxl__ev_time_init(&bl->time);
+ libxl__domaindeathcheck_init(&bl->deathcheck);
+ bl->keystrokes.ao = bl->ao; libxl__datacopier_init(&bl->keystrokes);
+ bl->display.ao = bl->ao; libxl__datacopier_init(&bl->display);
+@@ -318,6 +321,7 @@ static void bootloader_cleanup(libxl__egc *egc, libxl__bootloader_state *bl)
+ libxl__domaindeathcheck_stop(gc,&bl->deathcheck);
+ libxl__datacopier_kill(&bl->keystrokes);
+ libxl__datacopier_kill(&bl->display);
++ libxl__ev_time_deregister(gc, &bl->time);
+ for (i=0; i<2; i++) {
+ libxl__carefd_close(bl->ptys[i].master);
+ libxl__carefd_close(bl->ptys[i].slave);
+@@ -379,6 +383,7 @@ static void bootloader_stop(libxl__egc *egc,
+
+ libxl__datacopier_kill(&bl->keystrokes);
+ libxl__datacopier_kill(&bl->display);
++ libxl__ev_time_deregister(gc, &bl->time);
+ if (libxl__ev_child_inuse(&bl->child)) {
+ r = kill(bl->child.pid, SIGTERM);
+ if (r) LOGED(WARN, bl->domid, "%sfailed to kill bootloader [%lu]",
+@@ -641,6 +646,25 @@ static void bootloader_gotptys(libxl__egc *egc, libxl__openpty_state *op)
+
+ struct termios termattr;
+
++ if (getenv("LIBXL_BOOTLOADER_RESTRICT") ||
++ getenv("LIBXL_BOOTLOADER_USER")) {
++ const char *timeout_env = getenv("LIBXL_BOOTLOADER_TIMEOUT");
++ int timeout = timeout_env ? atoi(timeout_env)
++ : LIBXL_BOOTLOADER_TIMEOUT;
++
++ if (timeout) {
++ /* Set execution timeout */
++ rc = libxl__ev_time_register_rel(ao, &bl->time,
++ bootloader_timeout,
++ timeout * 1000);
++ if (rc) {
++ LOGED(ERROR, bl->domid,
++ "unable to register timeout for bootloader execution");
++ goto out;
++ }
++ }
++ }
++
+ pid_t pid = libxl__ev_child_fork(gc, &bl->child, bootloader_finished);
+ if (pid == -1) {
+ rc = ERROR_FAIL;
+@@ -706,6 +730,21 @@ static void bootloader_display_copyfail(libxl__egc *egc,
+ libxl__bootloader_state *bl = CONTAINER_OF(dc, *bl, display);
+ bootloader_copyfail(egc, "bootloader output", bl, 1, rc,onwrite,errnoval);
+ }
++static void bootloader_timeout(libxl__egc *egc, libxl__ev_time *ev,
++ const struct timeval *requested_abs, int rc)
++{
++ libxl__bootloader_state *bl = CONTAINER_OF(ev, *bl, time);
++ STATE_AO_GC(bl->ao);
++
++ libxl__ev_time_deregister(gc, &bl->time);
++
++ assert(libxl__ev_child_inuse(&bl->child));
++ LOGD(ERROR, bl->domid, "killing bootloader because of timeout");
++
++ libxl__ev_child_kill_deregister(ao, &bl->child, SIGKILL);
++
++ bootloader_callback(egc, bl, rc);
++}
+
+ static void bootloader_domaindeath(libxl__egc *egc,
+ libxl__domaindeathcheck *dc,
+@@ -722,6 +761,7 @@ static void bootloader_finished(libxl__egc *egc, libxl__ev_child *child,
+ STATE_AO_GC(bl->ao);
+ int rc;
+
++ libxl__ev_time_deregister(gc, &bl->time);
+ libxl__datacopier_kill(&bl->keystrokes);
+ libxl__datacopier_kill(&bl->display);
+
+diff --git a/tools/libs/light/libxl_internal.h b/tools/libs/light/libxl_internal.h
+index 8415d1feed16..a9581289f462 100644
+--- tools/libs/light/libxl_internal.h.orig
++++ tools/libs/light/libxl_internal.h
+@@ -103,6 +103,7 @@
+ #define LIBXL_QMP_CMD_TIMEOUT 10
+ #define LIBXL_STUBDOM_START_TIMEOUT 30
+ #define LIBXL_QEMU_BODGE_TIMEOUT 2
++#define LIBXL_BOOTLOADER_TIMEOUT 120
+ #define LIBXL_XENCONSOLE_LIMIT 1048576
+ #define LIBXL_XENCONSOLE_PROTOCOL "vt100"
+ #define LIBXL_MAXMEM_CONSTANT 1024
+@@ -3738,6 +3739,7 @@ struct libxl__bootloader_state {
+ libxl__openpty_state openpty;
+ libxl__openpty_result ptys[2]; /* [0] is for bootloader */
+ libxl__ev_child child;
++ libxl__ev_time time;
+ libxl__domaindeathcheck deathcheck;
+ int nargs, argsspace;
+ const char **args;
+--
+2.42.0
+
Home |
Main Index |
Thread Index |
Old Index