pkgsrc-Changes archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

CVS commit: pkgsrc/sysutils



Module Name:    pkgsrc
Committed By:   bouyer
Date:           Wed Nov 15 15:59:36 UTC 2023

Modified Files:
        pkgsrc/sysutils/xenkernel415: Makefile distinfo
        pkgsrc/sysutils/xentools415: Makefile PLIST distinfo
Added Files:
        pkgsrc/sysutils/xenkernel415/patches: patch-XSA439 patch-XSA442
            patch-XSA444 patch-XSA445 patch-XSA446
        pkgsrc/sysutils/xentools415/patches: patch-XSA440 patch-XSA443

Log Message:
xen*415: apply upstream patches for Xen Security Advisory
XSA-439, XSA-440, XSA-442, XSA-443, XSA-444, XSA-445, XSA-446
bump PKGREVISIONs


To generate a diff of this commit:
cvs rdiff -u -r1.11 -r1.12 pkgsrc/sysutils/xenkernel415/Makefile
cvs rdiff -u -r1.10 -r1.11 pkgsrc/sysutils/xenkernel415/distinfo
cvs rdiff -u -r0 -r1.1 pkgsrc/sysutils/xenkernel415/patches/patch-XSA439 \
    pkgsrc/sysutils/xenkernel415/patches/patch-XSA442 \
    pkgsrc/sysutils/xenkernel415/patches/patch-XSA444 \
    pkgsrc/sysutils/xenkernel415/patches/patch-XSA445 \
    pkgsrc/sysutils/xenkernel415/patches/patch-XSA446
cvs rdiff -u -r1.27 -r1.28 pkgsrc/sysutils/xentools415/Makefile
cvs rdiff -u -r1.3 -r1.4 pkgsrc/sysutils/xentools415/PLIST
cvs rdiff -u -r1.13 -r1.14 pkgsrc/sysutils/xentools415/distinfo
cvs rdiff -u -r0 -r1.1 pkgsrc/sysutils/xentools415/patches/patch-XSA440 \
    pkgsrc/sysutils/xentools415/patches/patch-XSA443

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: pkgsrc/sysutils/xenkernel415/Makefile
diff -u pkgsrc/sysutils/xenkernel415/Makefile:1.11 pkgsrc/sysutils/xenkernel415/Makefile:1.12
--- pkgsrc/sysutils/xenkernel415/Makefile:1.11  Thu Sep 21 10:39:45 2023
+++ pkgsrc/sysutils/xenkernel415/Makefile       Wed Nov 15 15:59:36 2023
@@ -1,9 +1,9 @@
-# $NetBSD: Makefile,v 1.11 2023/09/21 10:39:45 bouyer Exp $
+# $NetBSD: Makefile,v 1.12 2023/11/15 15:59:36 bouyer Exp $
 
 VERSION=       4.15.5
 DISTNAME=      xen-${VERSION}
 PKGNAME=       xenkernel415-${VERSION}
-PKGREVISION=   1
+PKGREVISION=   2
 CATEGORIES=    sysutils
 MASTER_SITES=  https://downloads.xenproject.org/release/xen/${VERSION}/
 DIST_SUBDIR=   xen415

Index: pkgsrc/sysutils/xenkernel415/distinfo
diff -u pkgsrc/sysutils/xenkernel415/distinfo:1.10 pkgsrc/sysutils/xenkernel415/distinfo:1.11
--- pkgsrc/sysutils/xenkernel415/distinfo:1.10  Thu Sep 21 10:39:45 2023
+++ pkgsrc/sysutils/xenkernel415/distinfo       Wed Nov 15 15:59:36 2023
@@ -1,10 +1,15 @@
-$NetBSD: distinfo,v 1.10 2023/09/21 10:39:45 bouyer Exp $
+$NetBSD: distinfo,v 1.11 2023/11/15 15:59:36 bouyer Exp $
 
 BLAKE2s (xen415/xen-4.15.5.tar.gz) = 85bef27c99fd9fd3037ec6df5e514289b650f2f073bcc543d13d5997c03332d4
 SHA512 (xen415/xen-4.15.5.tar.gz) = 790f3d75df78f63f5b2ce3b99c1f2287f75ef5571d1b7a9bb9bac470bd28ccbd4816d07a1af8320eee4107626c75be029bd6dad1d99d58f3816906ed98d206d9
 Size (xen415/xen-4.15.5.tar.gz) = 40835793 bytes
 SHA1 (patch-Config.mk) = 9372a09efd05c9fbdbc06f8121e411fcb7c7ba65
 SHA1 (patch-XSA438) = a8288bbbe8ffe799cebbf6bb184b1a2b59b59089
+SHA1 (patch-XSA439) = 5284e7801ed379aaac3c12dafc32283567bddd95
+SHA1 (patch-XSA442) = 170d94ed89a0d9ab210052fef0c8ae41a426374c
+SHA1 (patch-XSA444) = 5da1c79e811bebf5fee8416b00f76cfbc3946701
+SHA1 (patch-XSA445) = 85990f0ecd529b0c0b4cd9ab422d305bb94ae4b9
+SHA1 (patch-XSA446) = 7271a9afc134cca8c42e7a284f1761ddce2ac5ca
 SHA1 (patch-xen_Makefile) = 465388d80de414ca3bb84faefa0f52d817e423a6
 SHA1 (patch-xen_Rules.mk) = c743dc63f51fc280d529a7d9e08650292c171dac
 SHA1 (patch-xen_arch_x86_Kconfig) = df14bfa09b9a0008ca59d53c938d43a644822dd9

Index: pkgsrc/sysutils/xentools415/Makefile
diff -u pkgsrc/sysutils/xentools415/Makefile:1.27 pkgsrc/sysutils/xentools415/Makefile:1.28
--- pkgsrc/sysutils/xentools415/Makefile:1.27   Mon Oct 23 06:37:53 2023
+++ pkgsrc/sysutils/xentools415/Makefile        Wed Nov 15 15:59:36 2023
@@ -1,7 +1,7 @@
-# $NetBSD: Makefile,v 1.27 2023/10/23 06:37:53 wiz Exp $
+# $NetBSD: Makefile,v 1.28 2023/11/15 15:59:36 bouyer Exp $
 #
 # VERSION is set in version.mk as it is shared with other packages
-PKGREVISION=        1
+PKGREVISION=        2
 .include        "version.mk"
 PKGNAME=               xentools415-${VERSION}
 

Index: pkgsrc/sysutils/xentools415/PLIST
diff -u pkgsrc/sysutils/xentools415/PLIST:1.3 pkgsrc/sysutils/xentools415/PLIST:1.4
--- pkgsrc/sysutils/xentools415/PLIST:1.3       Sat Jan 15 17:44:35 2022
+++ pkgsrc/sysutils/xentools415/PLIST   Wed Nov 15 15:59:36 2023
@@ -1,4 +1,4 @@
-@comment $NetBSD: PLIST,v 1.3 2022/01/15 17:44:35 wiz Exp $
+@comment $NetBSD: PLIST,v 1.4 2023/11/15 15:59:36 bouyer Exp $
 ${PYSITELIB}/grub/ExtLinuxConf.py
 ${PYSITELIB}/grub/ExtLinuxConf.pyc
 ${PYSITELIB}/grub/GrubConf.py
@@ -7,10 +7,10 @@ ${PYSITELIB}/grub/LiloConf.py
 ${PYSITELIB}/grub/LiloConf.pyc
 ${PYSITELIB}/grub/__init__.py
 ${PYSITELIB}/grub/__init__.pyc
-${PYSITELIB}/pygrub-0.6-py${PYVERSSUFFIX}.egg-info/PKG-INFO
-${PYSITELIB}/pygrub-0.6-py${PYVERSSUFFIX}.egg-info/SOURCES.txt
-${PYSITELIB}/pygrub-0.6-py${PYVERSSUFFIX}.egg-info/dependency_links.txt
-${PYSITELIB}/pygrub-0.6-py${PYVERSSUFFIX}.egg-info/top_level.txt
+${PYSITELIB}/pygrub-0.7-py${PYVERSSUFFIX}.egg-info/PKG-INFO
+${PYSITELIB}/pygrub-0.7-py${PYVERSSUFFIX}.egg-info/SOURCES.txt
+${PYSITELIB}/pygrub-0.7-py${PYVERSSUFFIX}.egg-info/dependency_links.txt
+${PYSITELIB}/pygrub-0.7-py${PYVERSSUFFIX}.egg-info/top_level.txt
 ${PYSITELIB}/xen-3.0-py${PYVERSSUFFIX}.egg-info/PKG-INFO
 ${PYSITELIB}/xen-3.0-py${PYVERSSUFFIX}.egg-info/SOURCES.txt
 ${PYSITELIB}/xen-3.0-py${PYVERSSUFFIX}.egg-info/dependency_links.txt

Index: pkgsrc/sysutils/xentools415/distinfo
diff -u pkgsrc/sysutils/xentools415/distinfo:1.13 pkgsrc/sysutils/xentools415/distinfo:1.14
--- pkgsrc/sysutils/xentools415/distinfo:1.13   Thu Aug 24 10:27:09 2023
+++ pkgsrc/sysutils/xentools415/distinfo        Wed Nov 15 15:59:36 2023
@@ -1,4 +1,4 @@
-$NetBSD: distinfo,v 1.13 2023/08/24 10:27:09 bouyer Exp $
+$NetBSD: distinfo,v 1.14 2023/11/15 15:59:36 bouyer Exp $
 
 BLAKE2s (xen415/ipxe-988d2c13cdf0f0b4140685af35ced70ac5b3283c.tar.gz) = 67ded947316100f4f66fa61fe156baf1620db575450f4dc0dd8dcb323e57970b
 SHA512 (xen415/ipxe-988d2c13cdf0f0b4140685af35ced70ac5b3283c.tar.gz) = d888e0e653727ee9895fa866d8895e6d23a568b4e9e8439db4c4d790996700c60b0655e3a3129e599736ec2b4f7b987ce79d625ba208f06665fced8bddf94403
@@ -12,6 +12,8 @@ Size (xen415/xen-4.15.5.tar.gz) = 408357
 SHA1 (patch-.._seabios-rel-1.16.0_src_string.c) = e82f2f16a236a3b878c07b4fb655998591717a73
 SHA1 (patch-Config.mk) = d108a1743b5b5313d3ea957b02a005b49f5b3bf6
 SHA1 (patch-Makefile) = 6c580cbea532d08a38cf5e54228bd0210a98da21
+SHA1 (patch-XSA440) = 92c21a9caab0292082799e357725345ac676db9e
+SHA1 (patch-XSA443) = 53ea19eb131c3a83b9ab586fc6632fa3704e4fc0
 SHA1 (patch-docs_man_xl.1.pod.in) = 280a3717b9f15578d90f85392249ef97844b6765
 SHA1 (patch-docs_man_xl.cfg.5.pod.in) = 5970961552f29c4536a884161a208a27a20dccf4
 SHA1 (patch-docs_man_xlcpupool.cfg.5.pod) = ab3a2529cd10458948557fd7ab032e80df8b1d81

Added files:

Index: pkgsrc/sysutils/xenkernel415/patches/patch-XSA439
diff -u /dev/null pkgsrc/sysutils/xenkernel415/patches/patch-XSA439:1.1
--- /dev/null   Wed Nov 15 15:59:36 2023
+++ pkgsrc/sysutils/xenkernel415/patches/patch-XSA439   Wed Nov 15 15:59:36 2023
@@ -0,0 +1,255 @@
+$NetBSD: patch-XSA439,v 1.1 2023/11/15 15:59:36 bouyer Exp $
+
+From d7b78041dc819efde0350f27754a61cb01a93496 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Date: Wed, 30 Aug 2023 20:24:25 +0100
+Subject: [PATCH 1/1] x86/spec-ctrl: Mitigate the Zen1 DIV leakage
+
+In the Zen1 microarchitecure, there is one divider in the pipeline which
+services uops from both threads.  In the case of #DE, the latched result from
+the previous DIV to execute will be forwarded speculatively.
+
+This is an interesting covert channel that allows two threads to communicate
+without any system calls.  In also allows userspace to obtain the result of
+the most recent DIV instruction executed (even speculatively) in the core,
+which can be from a higher privilege context.
+
+Scrub the result from the divider by executing a non-faulting divide.  This
+needs performing on the exit-to-guest paths, and ist_exit-to-Xen.
+
+Alternatives in IST context is believed safe now that it's done in NMI
+context.
+
+This is XSA-439 / CVE-2023-20588.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Reviewed-by: Jan Beulich <jbeulich%suse.com@localhost>
+(cherry picked from commit b5926c6ecf05c28ee99c6248c42d691ccbf0c315)
+---
+ docs/misc/xen-command-line.pandoc   |  6 +++-
+ xen/arch/x86/hvm/svm/entry.S        |  1 +
+ xen/arch/x86/spec_ctrl.c            | 49 ++++++++++++++++++++++++++++-
+ xen/include/asm-x86/cpufeatures.h   |  3 +-
+ xen/include/asm-x86/spec_ctrl_asm.h | 17 ++++++++++
+ 5 files changed, 73 insertions(+), 3 deletions(-)
+
+diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
+index 16a61ad858..f3d1009f2d 100644
+--- docs/misc/xen-command-line.pandoc.orig
++++ docs/misc/xen-command-line.pandoc
+@@ -2189,7 +2189,7 @@ By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`).
+ >              {msr-sc,rsb,md-clear,ibpb-entry}=<bool>|{pv,hvm}=<bool>,
+ >              bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,psfd,
+ >              eager-fpu,l1d-flush,branch-harden,srb-lock,
+->              unpriv-mmio,gds-mit}=<bool> ]`
++>              unpriv-mmio,gds-mit,div-scrub}=<bool> ]`
+ 
+ Controls for speculative execution sidechannel mitigations.  By default, Xen
+ will pick the most appropriate mitigations based on compiled in support,
+@@ -2309,6 +2309,10 @@ has elected not to lock the configuration, Xen will use GDS_CTRL to mitigate
+ GDS with.  Otherwise, Xen will mitigate by disabling AVX, which blocks the use
+ of the AVX2 Gather instructions.
+ 
++On all hardware, the `div-scrub=` option can be used to force or prevent Xen
++from mitigating the DIV-leakage vulnerability.  By default, Xen will mitigate
++DIV-leakage on hardware believed to be vulnerable.
++
+ ### sync_console
+ > `= <boolean>`
+ 
+diff --git a/xen/arch/x86/hvm/svm/entry.S b/xen/arch/x86/hvm/svm/entry.S
+index 0ff4008060..ad5ca50c12 100644
+--- xen/arch/x86/hvm/svm/entry.S.orig
++++ xen/arch/x86/hvm/svm/entry.S
+@@ -72,6 +72,7 @@ __UNLIKELY_END(nsvm_hap)
+ 1:          /* No Spectre v1 concerns.  Execution will hit VMRUN imminently. */
+         .endm
+         ALTERNATIVE "", svm_vmentry_spec_ctrl, X86_FEATURE_SC_MSR_HVM
++        ALTERNATIVE "", DO_SPEC_CTRL_DIV, X86_FEATURE_SC_DIV
+ 
+         pop  %r15
+         pop  %r14
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index e9cc6b586a..f75124117b 100644
+--- xen/arch/x86/spec_ctrl.c.orig
++++ xen/arch/x86/spec_ctrl.c
+@@ -22,6 +22,7 @@
+ #include <xen/param.h>
+ #include <xen/warning.h>
+ 
++#include <asm/amd.h>
+ #include <asm/hvm/svm/svm.h>
+ #include <asm/microcode.h>
+ #include <asm/msr.h>
+@@ -78,6 +79,7 @@ static int8_t __initdata opt_srb_lock = -1;
+ static bool __initdata opt_unpriv_mmio;
+ static bool __read_mostly opt_fb_clear_mmio;
+ static int8_t __initdata opt_gds_mit = -1;
++static int8_t __initdata opt_div_scrub = -1;
+ 
+ static int __init parse_spec_ctrl(const char *s)
+ {
+@@ -132,6 +134,7 @@ static int __init parse_spec_ctrl(const char *s)
+             opt_srb_lock = 0;
+             opt_unpriv_mmio = false;
+             opt_gds_mit = 0;
++            opt_div_scrub = 0;
+         }
+         else if ( val > 0 )
+             rc = -EINVAL;
+@@ -284,6 +287,8 @@ static int __init parse_spec_ctrl(const char *s)
+             opt_unpriv_mmio = val;
+         else if ( (val = parse_boolean("gds-mit", s, ss)) >= 0 )
+             opt_gds_mit = val;
++        else if ( (val = parse_boolean("div-scrub", s, ss)) >= 0 )
++            opt_div_scrub = val;
+         else
+             rc = -EINVAL;
+ 
+@@ -484,7 +489,7 @@ static void __init print_details(enum ind_thunk thunk)
+                "\n");
+ 
+     /* Settings for Xen's protection, irrespective of guests. */
+-    printk("  Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s\n",
++    printk("  Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s%s\n",
+            thunk == THUNK_NONE      ? "N/A" :
+            thunk == THUNK_RETPOLINE ? "RETPOLINE" :
+            thunk == THUNK_LFENCE    ? "LFENCE" :
+@@ -509,6 +514,7 @@ static void __init print_details(enum ind_thunk thunk)
+            opt_l1d_flush                             ? " L1D_FLUSH" : "",
+            opt_md_clear_pv || opt_md_clear_hvm ||
+            opt_fb_clear_mmio                         ? " VERW"  : "",
++           opt_div_scrub                             ? " DIV" : "",
+            opt_branch_harden                         ? " BRANCH_HARDEN" : "");
+ 
+     /* L1TF diagnostics, printed if vulnerable or PV shadowing is in use. */
+@@ -933,6 +939,45 @@ static void __init srso_calculations(bool hw_smt_enabled)
+         setup_force_cpu_cap(X86_FEATURE_SRSO_NO);
+ }
+ 
++/*
++ * The Div leakage issue is specific to the AMD Zen1 microarchitecure.
++ *
++ * However, there's no $FOO_NO bit defined, so if we're virtualised we have no
++ * hope of spotting the case where we might move to vulnerable hardware.  We
++ * also can't make any useful conclusion about SMT-ness.
++ *
++ * Don't check the hypervisor bit, so at least we do the safe thing when
++ * booting on something that looks like a Zen1 CPU.
++ */
++static bool __init has_div_vuln(void)
++{
++    if ( !(boot_cpu_data.x86_vendor &
++           (X86_VENDOR_AMD | X86_VENDOR_HYGON)) )
++        return false;
++
++    if ( boot_cpu_data.x86 != 0x17 && boot_cpu_data.x86 != 0x18 )
++        return false;
++
++    return is_zen1_uarch();
++}
++
++static void __init div_calculations(bool hw_smt_enabled)
++{
++    bool cpu_bug_div = has_div_vuln();
++
++    if ( opt_div_scrub == -1 )
++        opt_div_scrub = cpu_bug_div;
++
++    if ( opt_div_scrub )
++        setup_force_cpu_cap(X86_FEATURE_SC_DIV);
++
++    if ( opt_smt == -1 && !cpu_has_hypervisor && cpu_bug_div && hw_smt_enabled )
++        warning_add(
++            "Booted on leaky-DIV hardware with SMT/Hyperthreading\n"
++            "enabled.  Please assess your configuration and choose an\n"
++            "explicit 'smt=<bool>' setting.  See XSA-439.\n");
++}
++
+ static void __init ibpb_calculations(void)
+ {
+     bool def_ibpb_entry = false;
+@@ -1644,6 +1689,8 @@ void __init init_speculation_mitigations(void)
+ 
+     ibpb_calculations();
+ 
++    div_calculations(hw_smt_enabled);
++
+     /* Check whether Eager FPU should be enabled by default. */
+     if ( opt_eager_fpu == -1 )
+         opt_eager_fpu = should_use_eager_fpu();
+diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h
+index bdb119a34c..d993e06e4c 100644
+--- xen/include/asm-x86/cpufeatures.h.orig
++++ xen/include/asm-x86/cpufeatures.h
+@@ -35,7 +35,8 @@ XEN_CPUFEATURE(SC_RSB_HVM,        X86_SYNTH(19)) /* RSB overwrite needed for HVM
+ XEN_CPUFEATURE(XEN_SELFSNOOP,     X86_SYNTH(20)) /* SELFSNOOP gets used by Xen itself */
+ XEN_CPUFEATURE(SC_MSR_IDLE,       X86_SYNTH(21)) /* Clear MSR_SPEC_CTRL on idle */
+ XEN_CPUFEATURE(XEN_LBR,           X86_SYNTH(22)) /* Xen uses MSR_DEBUGCTL.LBR */
+-/* Bits 23,24 unused. */
++XEN_CPUFEATURE(SC_DIV,            X86_SYNTH(23)) /* DIV scrub needed */
++/* Bit 24 unused. */
+ XEN_CPUFEATURE(SC_VERW_IDLE,      X86_SYNTH(25)) /* VERW used by Xen for idle */
+ XEN_CPUFEATURE(XEN_SHSTK,         X86_SYNTH(26)) /* Xen uses CET Shadow Stacks */
+ XEN_CPUFEATURE(XEN_IBT,           X86_SYNTH(27)) /* Xen uses CET Indirect Branch Tracking */
+
+--- xen/include/asm-x86/spec_ctrl_asm.h.orig   2023-08-07 14:08:26.000000000 +0200
++++ xen/include/asm-x86/spec_ctrl_asm.h        2023-11-15 14:50:58.771057793 +0100
+@@ -178,6 +178,19 @@
+ .L\@_verw_skip:
+ .endm
+ 
++.macro DO_SPEC_CTRL_DIV
++/*
++ * Requires nothing
++ * Clobbers %rax
++ *
++ * Issue a DIV for its flushing side effect (Zen1 uarch specific).  Any
++ * non-faulting DIV will do; a byte DIV has least latency, and doesn't clobber
++ * %rdx.
++ */
++    mov $1, %eax
++    div %al
++.endm
++
+ .macro DO_SPEC_CTRL_ENTRY maybexen:req
+ /*
+  * Requires %rsp=regs (also cpuinfo if !maybexen)
+@@ -231,6 +244,7 @@
+     wrmsr
+ 
+ .L\@_skip:
++    ALTERNATIVE "", DO_SPEC_CTRL_DIV, X86_FEATURE_SC_DIV
+ .endm
+ 
+ .macro DO_SPEC_CTRL_EXIT_TO_GUEST
+@@ -277,7 +291,8 @@
+ #define SPEC_CTRL_EXIT_TO_PV                                            \
+     ALTERNATIVE "",                                                     \
+         DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_PV;              \
+-    DO_SPEC_CTRL_COND_VERW
++    DO_SPEC_CTRL_COND_VERW;                                             \
++    ALTERNATIVE "", DO_SPEC_CTRL_DIV, X86_FEATURE_SC_DIV
+ 
+ /*
+  * Use in IST interrupt/exception context.  May interrupt Xen or PV context.
+--- xen/include/asm-x86/amd.h.orig     2023-11-15 15:16:19.642351562 +0100
++++ xen/include/asm-x86/amd.h  2023-11-15 15:17:10.878437198 +0100
+@@ -140,6 +140,17 @@
+                        AMD_MODEL_RANGE(0x11, 0x0, 0x0, 0xff, 0xf),    \
+                        AMD_MODEL_RANGE(0x12, 0x0, 0x0, 0xff, 0xf))
+ 
++/*
++ * The Zen1 and Zen2 microarchitectures are implemented by AMD (Fam17h) and
++ * Hygon (Fam18h) but without simple model number rules.  Instead, use STIBP
++ * as a heuristic that distinguishes the two.
++ *
++ * The caller is required to perform the appropriate vendor/family checks
++ * first.
++ */
++#define is_zen1_uarch() (!boot_cpu_has(X86_FEATURE_AMD_STIBP))
++#define is_zen2_uarch()   boot_cpu_has(X86_FEATURE_AMD_STIBP)
++
+ struct cpuinfo_x86;
+ int cpu_has_amd_erratum(const struct cpuinfo_x86 *, int, ...);
+ 
Index: pkgsrc/sysutils/xenkernel415/patches/patch-XSA442
diff -u /dev/null pkgsrc/sysutils/xenkernel415/patches/patch-XSA442:1.1
--- /dev/null   Wed Nov 15 15:59:36 2023
+++ pkgsrc/sysutils/xenkernel415/patches/patch-XSA442   Wed Nov 15 15:59:36 2023
@@ -0,0 +1,187 @@
+$NetBSD: patch-XSA442,v 1.1 2023/11/15 15:59:36 bouyer Exp $
+
+From 42614970833467d8b9aaf9def9f062c6c7425dad Mon Sep 17 00:00:00 2001
+From: Roger Pau Monne <roger.pau%citrix.com@localhost>
+Date: Tue, 13 Jun 2023 15:01:05 +0200
+Subject: [PATCH] iommu/amd-vi: flush IOMMU TLB when flushing the DTE
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+The caching invalidation guidelines from the AMD-Vi specification (48882—Rev
+3.07-PUB—Oct 2022) seem to be misleading on some hardware, as devices will
+malfunction (see stale DMA mappings) if some fields of the DTE are updated but
+the IOMMU TLB is not flushed. This has been observed in practice on AMD
+systems.  Due to the lack of guidance from the currently published
+specification this patch aims to increase the flushing done in order to prevent
+device malfunction.
+
+In order to fix, issue an INVALIDATE_IOMMU_PAGES command from
+amd_iommu_flush_device(), flushing all the address space.  Note this requires
+callers to be adjusted in order to pass the DomID on the DTE previous to the
+modification.
+
+Some call sites don't provide a valid DomID to amd_iommu_flush_device() in
+order to avoid the flush.  That's because the device had address translations
+disabled and hence the previous DomID on the DTE is not valid.  Note the
+current logic relies on the entity disabling address translations to also flush
+the TLB of the in use DomID.
+
+Device I/O TLB flushing when ATS are enabled is not covered by the current
+change, as ATS usage is not security supported.
+
+This is XSA-442 / CVE-2023-34326
+
+Signed-off-by: Roger Pau Monné <roger.pau%citrix.com@localhost>
+Reviewed-by: Jan Beulich <jbeulich%suse.com@localhost>
+---
+ xen/drivers/passthrough/amd/iommu.h         |  3 ++-
+ xen/drivers/passthrough/amd/iommu_cmd.c     | 10 +++++++++-
+ xen/drivers/passthrough/amd/iommu_guest.c   |  5 +++--
+ xen/drivers/passthrough/amd/iommu_init.c    |  6 +++++-
+ xen/drivers/passthrough/amd/pci_amd_iommu.c | 14 ++++++++++----
+ 5 files changed, 29 insertions(+), 9 deletions(-)
+
+diff --git a/xen/drivers/passthrough/amd/iommu.h b/xen/drivers/passthrough/amd/iommu.h
+index 0d9d976faaea..4e355ef4c12f 100644
+--- xen/drivers/passthrough/amd/iommu.h.orig
++++ xen/drivers/passthrough/amd/iommu.h
+@@ -265,7 +265,8 @@ void amd_iommu_flush_pages(struct domain *d, unsigned long dfn,
+                            unsigned int order);
+ void amd_iommu_flush_iotlb(u8 devfn, const struct pci_dev *pdev,
+                            uint64_t gaddr, unsigned int order);
+-void amd_iommu_flush_device(struct amd_iommu *iommu, uint16_t bdf);
++void amd_iommu_flush_device(struct amd_iommu *iommu, uint16_t bdf,
++                            domid_t domid);
+ void amd_iommu_flush_intremap(struct amd_iommu *iommu, uint16_t bdf);
+ void amd_iommu_flush_all_caches(struct amd_iommu *iommu);
+ 
+diff --git a/xen/drivers/passthrough/amd/iommu_cmd.c b/xen/drivers/passthrough/amd/iommu_cmd.c
+index dfb8b1c860d1..196e3dce3aec 100644
+--- xen/drivers/passthrough/amd/iommu_cmd.c.orig
++++ xen/drivers/passthrough/amd/iommu_cmd.c
+@@ -362,12 +362,20 @@ void amd_iommu_flush_pages(struct domain *d,
+     _amd_iommu_flush_pages(d, __dfn_to_daddr(dfn), order);
+ }
+ 
+-void amd_iommu_flush_device(struct amd_iommu *iommu, uint16_t bdf)
++void amd_iommu_flush_device(struct amd_iommu *iommu, uint16_t bdf,
++                            domid_t domid)
+ {
+     ASSERT( spin_is_locked(&iommu->lock) );
+ 
+     invalidate_dev_table_entry(iommu, bdf);
+     flush_command_buffer(iommu, 0);
++
++    /* Also invalidate IOMMU TLB entries when flushing the DTE. */
++    if ( domid != DOMID_INVALID )
++    {
++        invalidate_iommu_pages(iommu, INV_IOMMU_ALL_PAGES_ADDRESS, domid, 0);
++        flush_command_buffer(iommu, 0);
++    }
+ }
+ 
+ void amd_iommu_flush_intremap(struct amd_iommu *iommu, uint16_t bdf)
+diff --git a/xen/drivers/passthrough/amd/iommu_guest.c b/xen/drivers/passthrough/amd/iommu_guest.c
+index 00c5ccd7b5d2..f404e382f019 100644
+--- xen/drivers/passthrough/amd/iommu_guest.c.orig
++++ xen/drivers/passthrough/amd/iommu_guest.c
+@@ -385,7 +385,7 @@ static int do_completion_wait(struct domain *d, cmd_entry_t *cmd)
+ 
+ static int do_invalidate_dte(struct domain *d, cmd_entry_t *cmd)
+ {
+-    uint16_t gbdf, mbdf, req_id, gdom_id, hdom_id;
++    uint16_t gbdf, mbdf, req_id, gdom_id, hdom_id, prev_domid;
+     struct amd_iommu_dte *gdte, *mdte, *dte_base;
+     struct amd_iommu *iommu = NULL;
+     struct guest_iommu *g_iommu;
+@@ -445,11 +445,12 @@ static int do_invalidate_dte(struct domain *d, cmd_entry_t *cmd)
+     req_id = get_dma_requestor_id(iommu->seg, mbdf);
+     dte_base = iommu->dev_table.buffer;
+     mdte = &dte_base[req_id];
++    prev_domid = mdte->domain_id;
+ 
+     spin_lock_irqsave(&iommu->lock, flags);
+     dte_set_gcr3_table(mdte, hdom_id, gcr3_mfn << PAGE_SHIFT, gv, glx);
+ 
+-    amd_iommu_flush_device(iommu, req_id);
++    amd_iommu_flush_device(iommu, req_id, prev_domid);
+     spin_unlock_irqrestore(&iommu->lock, flags);
+ 
+     return 0;
+diff --git a/xen/drivers/passthrough/amd/iommu_init.c b/xen/drivers/passthrough/amd/iommu_init.c
+index bb52c181f8cd..4a96f7fbec3c 100644
+--- xen/drivers/passthrough/amd/iommu_init.c.orig
++++ xen/drivers/passthrough/amd/iommu_init.c
+@@ -1554,7 +1554,11 @@ static int _invalidate_all_devices(
+         if ( iommu )
+         {
+             spin_lock_irqsave(&iommu->lock, flags);
+-            amd_iommu_flush_device(iommu, req_id);
++            /*
++             * IOMMU TLB flush performed separately (see
++             * invalidate_all_domain_pages()).
++             */
++            amd_iommu_flush_device(iommu, req_id, DOMID_INVALID);
+             amd_iommu_flush_intremap(iommu, req_id);
+             spin_unlock_irqrestore(&iommu->lock, flags);
+         }
+diff --git a/xen/drivers/passthrough/amd/pci_amd_iommu.c b/xen/drivers/passthrough/amd/pci_amd_iommu.c
+index e804fdc34fcd..872955566608 100644
+--- xen/drivers/passthrough/amd/pci_amd_iommu.c.orig
++++ xen/drivers/passthrough/amd/pci_amd_iommu.c
+@@ -183,10 +183,13 @@ static int __must_check amd_iommu_setup_domain_device(
+              iommu_has_cap(iommu, PCI_CAP_IOTLB_SHIFT) )
+             dte->i = ats_enabled;
+ 
+-        amd_iommu_flush_device(iommu, req_id);
++        /* DTE didn't have DMA translations enabled, do not flush the TLB. */
++        amd_iommu_flush_device(iommu, req_id, DOMID_INVALID);
+     }
+     else if ( dte->pt_root != mfn_x(page_to_mfn(root_pg)) )
+     {
++        domid_t prev_domid = dte->domain_id;
++
+         /*
+          * Strictly speaking if the device is the only one with this requestor
+          * ID, it could be allowed to be re-assigned regardless of unity map
+@@ -240,7 +243,7 @@ static int __must_check amd_iommu_setup_domain_device(
+              iommu_has_cap(iommu, PCI_CAP_IOTLB_SHIFT) )
+             ASSERT(dte->i == ats_enabled);
+ 
+-        amd_iommu_flush_device(iommu, req_id);
++        amd_iommu_flush_device(iommu, req_id, prev_domid);
+     }
+ 
+     spin_unlock_irqrestore(&iommu->lock, flags);
+@@ -389,6 +392,8 @@ static void amd_iommu_disable_domain_device(const struct domain *domain,
+     spin_lock_irqsave(&iommu->lock, flags);
+     if ( dte->tv || dte->v )
+     {
++        domid_t prev_domid = dte->domain_id;
++
+         /* See the comment in amd_iommu_setup_device_table(). */
+         dte->int_ctl = IOMMU_DEV_TABLE_INT_CONTROL_ABORTED;
+         smp_wmb();
+@@ -405,7 +410,7 @@ static void amd_iommu_disable_domain_device(const struct domain *domain,
+         smp_wmb();
+         dte->v = true;
+ 
+-        amd_iommu_flush_device(iommu, req_id);
++        amd_iommu_flush_device(iommu, req_id, prev_domid);
+ 
+         AMD_IOMMU_DEBUG("Disable: device id = %#x, "
+                         "domain = %d, paging mode = %d\n",
+@@ -578,7 +583,8 @@ static int amd_iommu_add_device(u8 devfn, struct pci_dev *pdev)
+             iommu->dev_table.buffer + (bdf * IOMMU_DEV_TABLE_ENTRY_SIZE),
+             ivrs_mappings[bdf].intremap_table, iommu, iommu_intremap);
+ 
+-        amd_iommu_flush_device(iommu, bdf);
++        /* DTE didn't have DMA translations enabled, do not flush the TLB. */
++        amd_iommu_flush_device(iommu, bdf, DOMID_INVALID);
+ 
+         spin_unlock_irqrestore(&iommu->lock, flags);
+     }
+-- 
+2.42.0
+
Index: pkgsrc/sysutils/xenkernel415/patches/patch-XSA444
diff -u /dev/null pkgsrc/sysutils/xenkernel415/patches/patch-XSA444:1.1
--- /dev/null   Wed Nov 15 15:59:36 2023
+++ pkgsrc/sysutils/xenkernel415/patches/patch-XSA444   Wed Nov 15 15:59:36 2023
@@ -0,0 +1,167 @@
+$NetBSD: patch-XSA444,v 1.1 2023/11/15 15:59:36 bouyer Exp $
+
+From: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Subject: x86/svm: Fix asymmetry with AMD DR MASK context switching
+
+The handling of MSR_DR{0..3}_MASK is asymmetric between PV and HVM guests.
+
+HVM guests context switch in based on the guest view of DBEXT, whereas PV
+guest switch in base on the host capability.  Both guest types leave the
+context dirty for the next vCPU.
+
+This leads to the following issue:
+
+ * PV or HVM guest has debugging active (%dr7 + mask)
+ * Switch-out deactivates %dr7 but leaves other state stale in hardware
+ * Another HVM guest with masks unavailable has debugging active
+ * Switch in loads %dr7 but leaves the mask MSRs alone
+
+Now, the second guest's vCPU is operating in the context of the prior vCPU's
+mask MSR, while the environment the vCPU can see says there are no mask MSRs.
+
+As a stopgap, adjust the HVM path to switch in the masks based on host
+capabilities rather than guest visibility (i.e. like the PV path).  Adjustment
+of the intercepts still needs to be dependent on the guest visibility of
+DBEXT.
+
+This is part of XSA-444 / CVE-2023-34327
+
+Fixes: c097f54912d3 ("x86/SVM: support data breakpoint extension registers")
+Signed-off-by: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Reviewed-by: Jan Beulich <jbeulich%suse.com@localhost>
+
+diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c
+index a019d196e071..ba4069f9100a 100644
+--- xen/arch/x86/hvm/svm/svm.c.orig
++++ xen/arch/x86/hvm/svm/svm.c
+@@ -185,6 +185,10 @@ static void svm_save_dr(struct vcpu *v)
+     v->arch.hvm.flag_dr_dirty = 0;
+     vmcb_set_dr_intercepts(vmcb, ~0u);
+ 
++    /*
++     * The guest can only have changed the mask MSRs if we previous dropped
++     * intercepts.  Re-read them from hardware.
++     */
+     if ( v->domain->arch.cpuid->extd.dbext )
+     {
+         svm_intercept_msr(v, MSR_AMD64_DR0_ADDRESS_MASK, MSR_INTERCEPT_RW);
+@@ -216,17 +220,25 @@ static void __restore_debug_registers(struct vmcb_struct *vmcb, struct vcpu *v)
+ 
+     ASSERT(v == current);
+ 
+-    if ( v->domain->arch.cpuid->extd.dbext )
++    /*
++     * Both the PV and HVM paths leave stale DR_MASK values in hardware on
++     * context-switch-out.  If we're activating %dr7 for the guest, we must
++     * sync the DR_MASKs too, whether or not the guest can see them.
++     */
++    if ( boot_cpu_has(X86_FEATURE_DBEXT) )
+     {
+-        svm_intercept_msr(v, MSR_AMD64_DR0_ADDRESS_MASK, MSR_INTERCEPT_NONE);
+-        svm_intercept_msr(v, MSR_AMD64_DR1_ADDRESS_MASK, MSR_INTERCEPT_NONE);
+-        svm_intercept_msr(v, MSR_AMD64_DR2_ADDRESS_MASK, MSR_INTERCEPT_NONE);
+-        svm_intercept_msr(v, MSR_AMD64_DR3_ADDRESS_MASK, MSR_INTERCEPT_NONE);
+-
+         wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, v->arch.msrs->dr_mask[0]);
+         wrmsrl(MSR_AMD64_DR1_ADDRESS_MASK, v->arch.msrs->dr_mask[1]);
+         wrmsrl(MSR_AMD64_DR2_ADDRESS_MASK, v->arch.msrs->dr_mask[2]);
+         wrmsrl(MSR_AMD64_DR3_ADDRESS_MASK, v->arch.msrs->dr_mask[3]);
++
++        if ( v->domain->arch.cpuid->extd.dbext )
++        {
++            svm_intercept_msr(v, MSR_AMD64_DR0_ADDRESS_MASK, MSR_INTERCEPT_NONE);
++            svm_intercept_msr(v, MSR_AMD64_DR1_ADDRESS_MASK, MSR_INTERCEPT_NONE);
++            svm_intercept_msr(v, MSR_AMD64_DR2_ADDRESS_MASK, MSR_INTERCEPT_NONE);
++            svm_intercept_msr(v, MSR_AMD64_DR3_ADDRESS_MASK, MSR_INTERCEPT_NONE);
++        }
+     }
+ 
+     write_debugreg(0, v->arch.dr[0]);
+diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
+index f7992ff230b5..a142a63dd869 100644
+--- xen/arch/x86/traps.c.orig
++++ xen/arch/x86/traps.c
+@@ -2314,6 +2314,11 @@ void activate_debugregs(const struct vcpu *curr)
+     if ( curr->arch.dr7 & DR7_ACTIVE_MASK )
+         write_debugreg(7, curr->arch.dr7);
+ 
++    /*
++     * Both the PV and HVM paths leave stale DR_MASK values in hardware on
++     * context-switch-out.  If we're activating %dr7 for the guest, we must
++     * sync the DR_MASKs too, whether or not the guest can see them.
++     */
+     if ( boot_cpu_has(X86_FEATURE_DBEXT) )
+     {
+         wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, curr->arch.msrs->dr_mask[0]);
+From: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Subject: x86/pv: Correct the auditing of guest breakpoint addresses
+
+The use of access_ok() is buggy, because it permits access to the compat
+translation area.  64bit PV guests don't use the XLAT area, but on AMD
+hardware, the DBEXT feature allows a breakpoint to match up to a 4G aligned
+region, allowing the breakpoint to reach outside of the XLAT area.
+
+Prior to c/s cda16c1bb223 ("x86: mirror compat argument translation area for
+32-bit PV"), the live GDT was within 4G of the XLAT area.
+
+All together, this allowed a malicious 64bit PV guest on AMD hardware to place
+a breakpoint over the live GDT, and trigger a #DB livelock (CVE-2015-8104).
+
+Introduce breakpoint_addr_ok() and explain why __addr_ok() happens to be an
+appropriate check in this case.
+
+For Xen 4.14 and later, this is a latent bug because the XLAT area has moved
+to be on its own with nothing interesting adjacent.  For Xen 4.13 and older on
+AMD hardware, this fixes a PV-trigger-able DoS.
+
+This is part of XSA-444 / CVE-2023-34328.
+
+Fixes: 65e355490817 ("x86/PV: support data breakpoint extension registers")
+Signed-off-by: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Reviewed-by: Roger Pau Monné <roger.pau%citrix.com@localhost>
+Reviewed-by: Jan Beulich <jbeulich%suse.com@localhost>
+
+diff --git a/xen/arch/x86/pv/misc-hypercalls.c b/xen/arch/x86/pv/misc-hypercalls.c
+index 5dade2472687..681c16108fd1 100644
+--- xen/arch/x86/pv/misc-hypercalls.c.orig
++++ xen/arch/x86/pv/misc-hypercalls.c
+@@ -68,7 +68,7 @@ long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value)
+     switch ( reg )
+     {
+     case 0 ... 3:
+-        if ( !access_ok(value, sizeof(long)) )
++        if ( !breakpoint_addr_ok(value) )
+             return -EPERM;
+ 
+         v->arch.dr[reg] = value;
+diff --git a/xen/include/asm-x86/debugreg.h b/xen/include/asm-x86/debugreg.h
+index c57914efc6e8..cc298265244b 100644
+--- xen/include/asm-x86/debugreg.h.orig
++++ xen/include/asm-x86/debugreg.h
+@@ -77,6 +77,26 @@
+     asm volatile ( "mov %%db" #reg ",%0" : "=r" (__val) );  \
+     __val;                                                  \
+ })
++
++/*
++ * Architecturally, %dr{0..3} can have any arbitrary value.  However, Xen
++ * can't allow the guest to breakpoint the Xen address range, so we limit the
++ * guest to the lower canonical half, or above the Xen range in the higher
++ * canonical half.
++ *
++ * Breakpoint lengths are specified to mask the low order address bits,
++ * meaning all breakpoints are naturally aligned.  With %dr7, the widest
++ * breakpoint is 8 bytes.  With DBEXT, the widest breakpoint is 4G.  Both of
++ * the Xen boundaries have >4G alignment.
++ *
++ * In principle we should account for HYPERVISOR_COMPAT_VIRT_START(d), but
++ * 64bit Xen has never enforced this for compat guests, and there's no problem
++ * (to Xen) if the guest breakpoints it's alias of the M2P.  Skipping this
++ * aspect simplifies the logic, and causes us not to reject a migrating guest
++ * which operated fine on prior versions of Xen.
++ */
++#define breakpoint_addr_ok(a) __addr_ok(a)
++
+ long set_debugreg(struct vcpu *, unsigned int reg, unsigned long value);
+ void activate_debugregs(const struct vcpu *);
+ 
Index: pkgsrc/sysutils/xenkernel415/patches/patch-XSA445
diff -u /dev/null pkgsrc/sysutils/xenkernel415/patches/patch-XSA445:1.1
--- /dev/null   Wed Nov 15 15:59:36 2023
+++ pkgsrc/sysutils/xenkernel415/patches/patch-XSA445   Wed Nov 15 15:59:36 2023
@@ -0,0 +1,67 @@
+$NetBSD: patch-XSA445,v 1.1 2023/11/15 15:59:36 bouyer Exp $
+
+From 9877bb3af60ef2b543742835c49de7d0108cdca9 Mon Sep 17 00:00:00 2001
+From: Roger Pau Monne <roger.pau%citrix.com@localhost>
+Date: Wed, 11 Oct 2023 13:14:21 +0200
+Subject: [PATCH] iommu/amd-vi: use correct level for quarantine domain page
+ tables
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+The current setup of the quarantine page tables assumes that the quarantine
+domain (dom_io) has been initialized with an address width of
+DEFAULT_DOMAIN_ADDRESS_WIDTH (48).
+
+However dom_io being a PV domain gets the AMD-Vi IOMMU page tables levels based
+on the maximum (hot pluggable) RAM address, and hence on systems with no RAM
+above the 512GB mark only 3 page-table levels are configured in the IOMMU.
+
+On systems without RAM above the 512GB boundary amd_iommu_quarantine_init()
+will setup page tables for the scratch page with 4 levels, while the IOMMU will
+be configured to use 3 levels only.  The page destined to be used as level 1,
+and to contain a directory of PTEs ends up being the address in a PTE itself,
+and thus level 1 page becomes the leaf page.  Without the level mismatch it's
+level 0 page that should be the leaf page instead.
+
+The level 1 page won't be used as such, and hence it's not possible to use it
+to gain access to other memory on the system.  However that page is not cleared
+in amd_iommu_quarantine_init() as part of re-initialization of the device
+quarantine page tables, and hence data on the level 1 page can be leaked
+between device usages.
+
+Fix this by making sure the paging levels setup by amd_iommu_quarantine_init()
+match the number configured on the IOMMUs.
+
+Note that IVMD regions are not affected by this issue, as those areas are
+mapped taking the configured paging levels into account.
+
+This is XSA-445 / CVE-2023-46835
+
+Fixes: ea38867831da ('x86 / iommu: set up a scratch page in the quarantine domain')
+Signed-off-by: Roger Pau Monné <roger.pau%citrix.com@localhost>
+Reviewed-by: Jan Beulich <jbeulich%suse.com@localhost>
+---
+ xen/drivers/passthrough/amd/iommu_map.c | 4 +---
+ 1 file changed, 1 insertion(+), 3 deletions(-)
+
+diff --git a/xen/drivers/passthrough/amd/iommu_map.c b/xen/drivers/passthrough/amd/iommu_map.c
+index b4c182449131..3473db4c1efc 100644
+--- xen/drivers/passthrough/amd/iommu_map.c.orig
++++ xen/drivers/passthrough/amd/iommu_map.c
+@@ -584,9 +584,7 @@ static int fill_qpt(union amd_iommu_pte *this, unsigned int level,
+ int amd_iommu_quarantine_init(struct pci_dev *pdev)
+ {
+     struct domain_iommu *hd = dom_iommu(dom_io);
+-    unsigned long end_gfn =
+-        1ul << (DEFAULT_DOMAIN_ADDRESS_WIDTH - PAGE_SHIFT);
+-    unsigned int level = amd_iommu_get_paging_mode(end_gfn);
++    unsigned int level = hd->arch.amd.paging_mode;
+     unsigned int req_id = get_dma_requestor_id(pdev->seg, pdev->sbdf.bdf);
+     const struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(pdev->seg);
+     int rc;
+
+base-commit: 4a4daf6bddbe8a741329df5cc8768f7dec664aed
+-- 
+2.30.2
+
Index: pkgsrc/sysutils/xenkernel415/patches/patch-XSA446
diff -u /dev/null pkgsrc/sysutils/xenkernel415/patches/patch-XSA446:1.1
--- /dev/null   Wed Nov 15 15:59:36 2023
+++ pkgsrc/sysutils/xenkernel415/patches/patch-XSA446   Wed Nov 15 15:59:36 2023
@@ -0,0 +1,117 @@
+$NetBSD: patch-XSA446,v 1.1 2023/11/15 15:59:36 bouyer Exp $
+
+From 80d5aada598c3a800a350003d5d582931545e13c Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Date: Thu, 26 Oct 2023 14:37:38 +0100
+Subject: [PATCH] x86/spec-ctrl: Remove conditional IRQs-on-ness for INT
+ $0x80/0x82 paths
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Before speculation defences, some paths in Xen could genuinely get away with
+being IRQs-on at entry.  But XPTI invalidated this property on most paths, and
+attempting to maintain it on the remaining paths was a mistake.
+
+Fast forward, and DO_SPEC_CTRL_COND_IBPB (protection for AMD BTC/SRSO) is not
+IRQ-safe, running with IRQs enabled in some cases.  The other actions taken on
+these paths happen to be IRQ-safe.
+
+Make entry_int82() and int80_direct_trap() unconditionally Interrupt Gates
+rather than Trap Gates.  Remove the conditional re-adjustment of
+int80_direct_trap() in smp_prepare_cpus(), and have entry_int82() explicitly
+enable interrupts when safe to do so.
+
+In smp_prepare_cpus(), with the conditional re-adjustment removed, the
+clearing of pv_cr3 is the only remaining action gated on XPTI, and it is out
+of place anyway, repeating work already done by smp_prepare_boot_cpu().  Drop
+the entire if() condition to avoid leaving an incorrect vestigial remnant.
+
+Also drop comments which make incorrect statements about when its safe to
+enable interrupts.
+
+This is XSA-446 / CVE-2023-46836
+
+Signed-off-by: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Reviewed-by: Roger Pau Monné <roger.pau%citrix.com@localhost>
+---
+ xen/arch/x86/pv/traps.c            |  4 ++--
+ xen/arch/x86/smpboot.c             | 14 --------------
+ xen/arch/x86/x86_64/compat/entry.S |  2 ++
+ xen/arch/x86/x86_64/entry.S        |  1 -
+ 4 files changed, 4 insertions(+), 17 deletions(-)
+
+diff --git a/xen/arch/x86/pv/traps.c b/xen/arch/x86/pv/traps.c
+index 74f333da7e1c..240d1a2db7a3 100644
+--- xen/arch/x86/pv/traps.c.orig
++++ xen/arch/x86/pv/traps.c
+@@ -139,11 +139,11 @@ void __init pv_trap_init(void)
+ #ifdef CONFIG_PV32
+     /* The 32-on-64 hypercall vector is only accessible from ring 1. */
+     _set_gate(idt_table + HYPERCALL_VECTOR,
+-              SYS_DESC_trap_gate, 1, entry_int82);
++              SYS_DESC_irq_gate, 1, entry_int82);
+ #endif
+ 
+     /* Fast trap for int80 (faster than taking the #GP-fixup path). */
+-    _set_gate(idt_table + LEGACY_SYSCALL_VECTOR, SYS_DESC_trap_gate, 3,
++    _set_gate(idt_table + LEGACY_SYSCALL_VECTOR, SYS_DESC_irq_gate, 3,
+               &int80_direct_trap);
+ 
+     open_softirq(NMI_SOFTIRQ, nmi_softirq);
+diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
+index 3a1a659082c6..4c54ecbc91d7 100644
+--- xen/arch/x86/smpboot.c.orig
++++ xen/arch/x86/smpboot.c
+@@ -1158,20 +1158,6 @@ void __init smp_prepare_cpus(void)
+ 
+     stack_base[0] = (void *)((unsigned long)stack_start & ~(STACK_SIZE - 1));
+ 
+-    if ( opt_xpti_hwdom || opt_xpti_domu )
+-    {
+-        get_cpu_info()->pv_cr3 = 0;
+-
+-#ifdef CONFIG_PV
+-        /*
+-         * All entry points which may need to switch page tables have to start
+-         * with interrupts off. Re-write what pv_trap_init() has put there.
+-         */
+-        _set_gate(idt_table + LEGACY_SYSCALL_VECTOR, SYS_DESC_irq_gate, 3,
+-                  &int80_direct_trap);
+-#endif
+-    }
+-
+     set_nr_sockets();
+ 
+     socket_cpumask = xzalloc_array(cpumask_t *, nr_sockets);
+diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S
+index bd5abd8040bd..fcc3a721f147 100644
+--- xen/arch/x86/x86_64/compat/entry.S.orig
++++ xen/arch/x86/x86_64/compat/entry.S
+@@ -21,6 +21,8 @@ ENTRY(entry_int82)
+         SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */
+         /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */
+ 
++        sti
++
+         CR4_PV32_RESTORE
+ 
+         GET_CURRENT(bx)
+diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
+index 5ca74f5f62b2..9a7b129aa7e4 100644
+--- xen/arch/x86/x86_64/entry.S.orig
++++ xen/arch/x86/x86_64/entry.S
+@@ -327,7 +327,6 @@ ENTRY(sysenter_entry)
+ #ifdef CONFIG_XEN_SHSTK
+         ALTERNATIVE "", "setssbsy", X86_FEATURE_XEN_SHSTK
+ #endif
+-        /* sti could live here when we don't switch page tables below. */
+         pushq $FLAT_USER_SS
+         pushq $0
+         pushfq
+
+base-commit: 7befef87cc9b1bb8ca15d866ce1ecd9165ccb58c
+prerequisite-patch-id: 142a87c707411d49e136c3fb76f1b14963ec6dc8
+-- 
+2.30.2
+

Index: pkgsrc/sysutils/xentools415/patches/patch-XSA440
diff -u /dev/null pkgsrc/sysutils/xentools415/patches/patch-XSA440:1.1
--- /dev/null   Wed Nov 15 15:59:36 2023
+++ pkgsrc/sysutils/xentools415/patches/patch-XSA440    Wed Nov 15 15:59:36 2023
@@ -0,0 +1,60 @@
+$NetBSD: patch-XSA440,v 1.1 2023/11/15 15:59:36 bouyer Exp $
+
+From 5d8b3d1ec98e56155d9650d7f4a70cd8ba9dc27d Mon Sep 17 00:00:00 2001
+From: Julien Grall <jgrall%amazon.com@localhost>
+Date: Fri, 22 Sep 2023 11:32:16 +0100
+Subject: tools/xenstored: domain_entry_fix(): Handle conflicting transaction
+
+The function domain_entry_fix() will be initially called to check if the
+quota is correct before attempt to commit any nodes. So it would be
+possible that accounting is temporarily negative. This is the case
+in the following sequence:
+
+  1) Create 50 nodes
+  2) Start two transactions
+  3) Delete all the nodes in each transaction
+  4) Commit the two transactions
+
+Because the first transaction will have succeed and updated the
+accounting, there is no guarantee that 'd->nbentry + num' will still
+be above 0. So the assert() would be triggered.
+The assert() was introduced in dbef1f748289 ("tools/xenstore: simplify
+and fix per domain node accounting") with the assumption that the
+value can't be negative. As this is not true revert to the original
+check but restricted to the path where we don't update. Take the
+opportunity to explain the rationale behind the check.
+
+This CVE-2023-34323 / XSA-440.
+
+Reported-by: Stanislav Uschakow <suschako%amazon.de@localhost>
+Fixes: dbef1f748289 ("tools/xenstore: simplify and fix per domain node accounting")
+Signed-off-by: Julien Grall <jgrall%amazon.com@localhost>
+Reviewed-by: Juergen Gross <jgross%suse.com@localhost>
+
+diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c
+index aa86892fed9e..6074df210c6e 100644
+--- tools/xenstore/xenstored_domain.c.orig
++++ tools/xenstore/xenstored_domain.c
+@@ -1094,10 +1094,20 @@ int domain_entry_fix(unsigned int domid, int num, bool update)
+       }
+ 
+       cnt = d->nbentry + num;
+-      assert(cnt >= 0);
+ 
+-      if (update)
++      if (update) {
++              assert(cnt >= 0);
+               d->nbentry = cnt;
++      } else if (cnt < 0) {
++              /*
++               * In a transaction when a node is being added/removed AND
++               * the same node has been added/removed outside the
++               * transaction in parallel, the result value may be negative.
++               * This is no problem, as the transaction will fail due to
++               * the resulting conflict. So override 'cnt'.
++               */
++              cnt = 0;
++      }
+ 
+       return domid_is_unprivileged(domid) ? cnt : 0;
+ }
Index: pkgsrc/sysutils/xentools415/patches/patch-XSA443
diff -u /dev/null pkgsrc/sysutils/xentools415/patches/patch-XSA443:1.1
--- /dev/null   Wed Nov 15 15:59:36 2023
+++ pkgsrc/sysutils/xentools415/patches/patch-XSA443    Wed Nov 15 15:59:36 2023
@@ -0,0 +1,1370 @@
+$NetBSD: patch-XSA443,v 1.1 2023/11/15 15:59:36 bouyer Exp $
+
+From c4d597f63832a53bbb1b826af7a4677e40e9fded Mon Sep 17 00:00:00 2001
+From: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Date: Thu, 14 Sep 2023 13:22:50 +0100
+Subject: [PATCH 01/11] libfsimage/xfs: Remove dead code
+
+xfs_info.agnolog (and related code) and XFS_INO_AGBNO_BITS are dead code
+that serve no purpose.
+
+This is part of XSA-443 / CVE-2023-34325
+
+Signed-off-by: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Reviewed-by: Jan Beulich <jbeulich%suse.com@localhost>
+---
+ tools/libfsimage/xfs/fsys_xfs.c | 18 ------------------
+ 1 file changed, 18 deletions(-)
+
+diff --git a/tools/libfsimage/xfs/fsys_xfs.c b/tools/libfsimage/xfs/fsys_xfs.c
+index d735a88e55f3..2800699f5985 100644
+--- tools/libfsimage/xfs/fsys_xfs.c.orig
++++ tools/libfsimage/xfs/fsys_xfs.c
+@@ -37,7 +37,6 @@ struct xfs_info {
+       int blklog;
+       int inopblog;
+       int agblklog;
+-      int agnolog;
+       unsigned int nextents;
+       xfs_daddr_t next;
+       xfs_daddr_t daddr;
+@@ -65,9 +64,7 @@ static struct xfs_info xfs;
+ 
+ #define       XFS_INO_MASK(k)         ((xfs_uint32_t)((1ULL << (k)) - 1))
+ #define       XFS_INO_OFFSET_BITS     xfs.inopblog
+-#define       XFS_INO_AGBNO_BITS      xfs.agblklog
+ #define       XFS_INO_AGINO_BITS      (xfs.agblklog + xfs.inopblog)
+-#define       XFS_INO_AGNO_BITS       xfs.agnolog
+ 
+ static inline xfs_agblock_t
+ agino2agbno (xfs_agino_t agino)
+@@ -149,20 +146,6 @@ xt_len (xfs_bmbt_rec_32_t *r)
+       return le32(r->l3) & mask32lo(21);
+ }
+ 
+-static inline int
+-xfs_highbit32(xfs_uint32_t v)
+-{
+-      int i;
+-
+-      if (--v) {
+-              for (i = 0; i < 31; i++, v >>= 1) {
+-                      if (v == 0)
+-                              return i;
+-              }
+-      }
+-      return 0;
+-}
+-
+ static int
+ isinxt (xfs_fileoff_t key, xfs_fileoff_t offset, xfs_filblks_t len)
+ {
+@@ -472,7 +455,6 @@ xfs_mount (fsi_file_t *ffi, const char *options)
+ 
+       xfs.inopblog = super.sb_inopblog;
+       xfs.agblklog = super.sb_agblklog;
+-      xfs.agnolog = xfs_highbit32 (le32(super.sb_agcount));
+ 
+       xfs.btnode_ptr0_off =
+               ((xfs.bsize - sizeof(xfs_btree_block_t)) /
+-- 
+2.42.0
+
+From f75b0a70da392672fb7d9feed2a9e9515d74df2c Mon Sep 17 00:00:00 2001
+From: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Date: Thu, 14 Sep 2023 13:22:51 +0100
+Subject: [PATCH 02/11] libfsimage/xfs: Amend mask32lo() to allow the value 32
+
+agblklog could plausibly be 32, but that would overflow this shift.
+Perform the shift as ULL and cast to u32 at the end instead.
+
+This is part of XSA-443 / CVE-2023-34325
+
+Signed-off-by: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Acked-by: Jan Beulich <jbeulich%suse.com@localhost>
+---
+ tools/libfsimage/xfs/fsys_xfs.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/tools/libfsimage/xfs/fsys_xfs.c b/tools/libfsimage/xfs/fsys_xfs.c
+index 2800699f5985..4720bb4505c8 100644
+--- tools/libfsimage/xfs/fsys_xfs.c.orig
++++ tools/libfsimage/xfs/fsys_xfs.c
+@@ -60,7 +60,7 @@ static struct xfs_info xfs;
+ #define inode         ((xfs_dinode_t *)((char *)FSYS_BUF + 8192))
+ #define icore         (inode->di_core)
+ 
+-#define       mask32lo(n)     (((xfs_uint32_t)1 << (n)) - 1)
++#define       mask32lo(n)     ((xfs_uint32_t)((1ull << (n)) - 1))
+ 
+ #define       XFS_INO_MASK(k)         ((xfs_uint32_t)((1ULL << (k)) - 1))
+ #define       XFS_INO_OFFSET_BITS     xfs.inopblog
+-- 
+2.42.0
+
+From 25fae23b32ee4d990ae11368ee21e28e66dbfa25 Mon Sep 17 00:00:00 2001
+From: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Date: Thu, 14 Sep 2023 13:22:52 +0100
+Subject: [PATCH 03/11] libfsimage/xfs: Sanity-check the superblock during
+ mounts
+
+Sanity-check the XFS superblock for wellformedness at the mount handler.
+This forces pygrub to abort parsing a potentially malformed filesystem and
+ensures the invariants assumed throughout the rest of the code hold.
+
+Also, derive parameters from previously sanitized parameters where possible
+(rather than reading them off the superblock)
+
+The code doesn't try to avoid overflowing the end of the disk, because
+that's an unlikely and benign error. Parameters used in calculations of
+xfs_daddr_t (like the root inode index) aren't in critical need of being
+sanitized.
+
+The sanitization of agblklog is basically checking that no obvious
+overflows happen on agblklog, and then ensuring agblocks is contained in
+the range (2^(sb_agblklog-1), 2^sb_agblklog].
+
+This is part of XSA-443 / CVE-2023-34325
+
+Reported-by: Ferdinand Nölscher <noelscher%google.com@localhost>
+Signed-off-by: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Reviewed-by: Jan Beulich <jbeulich%suse.com@localhost>
+---
+ tools/libfsimage/xfs/fsys_xfs.c | 48 ++++++++++++++++++++++++++-------
+ tools/libfsimage/xfs/xfs.h      | 12 +++++++++
+ 2 files changed, 50 insertions(+), 10 deletions(-)
+
+diff --git a/tools/libfsimage/xfs/fsys_xfs.c b/tools/libfsimage/xfs/fsys_xfs.c
+index 4720bb4505c8..e4eb7e1ee26f 100644
+--- tools/libfsimage/xfs/fsys_xfs.c.orig
++++ tools/libfsimage/xfs/fsys_xfs.c
+@@ -17,6 +17,7 @@
+  *  along with this program; If not, see <http://www.gnu.org/licenses/>.
+  */
+ 
++#include <stdbool.h>
+ #include <xenfsimage_grub.h>
+ #include "xfs.h"
+ 
+@@ -433,29 +434,56 @@ first_dentry (fsi_file_t *ffi, xfs_ino_t *ino)
+       return next_dentry (ffi, ino);
+ }
+ 
++static bool
++xfs_sb_is_invalid (const xfs_sb_t *super)
++{
++      return (le32(super->sb_magicnum) != XFS_SB_MAGIC)
++          || ((le16(super->sb_versionnum) & XFS_SB_VERSION_NUMBITS) !=
++              XFS_SB_VERSION_4)
++          || (super->sb_inodelog < XFS_SB_INODELOG_MIN)
++          || (super->sb_inodelog > XFS_SB_INODELOG_MAX)
++          || (super->sb_blocklog < XFS_SB_BLOCKLOG_MIN)
++          || (super->sb_blocklog > XFS_SB_BLOCKLOG_MAX)
++          || (super->sb_blocklog < super->sb_inodelog)
++          || (super->sb_agblklog > XFS_SB_AGBLKLOG_MAX)
++          || ((1ull << super->sb_agblklog) < le32(super->sb_agblocks))
++          || (((1ull << super->sb_agblklog) >> 1) >=
++              le32(super->sb_agblocks))
++          || ((super->sb_blocklog + super->sb_dirblklog) >=
++              XFS_SB_DIRBLK_NUMBITS);
++}
++
+ static int
+ xfs_mount (fsi_file_t *ffi, const char *options)
+ {
+       xfs_sb_t super;
+ 
+       if (!devread (ffi, 0, 0, sizeof(super), (char *)&super)
+-          || (le32(super.sb_magicnum) != XFS_SB_MAGIC)
+-          || ((le16(super.sb_versionnum) 
+-              & XFS_SB_VERSION_NUMBITS) != XFS_SB_VERSION_4) ) {
++          || xfs_sb_is_invalid(&super)) {
+               return 0;
+       }
+ 
+-      xfs.bsize = le32 (super.sb_blocksize);
+-      xfs.blklog = super.sb_blocklog;
+-      xfs.bdlog = xfs.blklog - SECTOR_BITS;
++      /*
++       * Not sanitized. It's exclusively used to generate disk addresses,
++       * so it's not important from a security standpoint.
++       */
+       xfs.rootino = le64 (super.sb_rootino);
+-      xfs.isize = le16 (super.sb_inodesize);
+-      xfs.agblocks = le32 (super.sb_agblocks);
+-      xfs.dirbsize = xfs.bsize << super.sb_dirblklog;
+ 
+-      xfs.inopblog = super.sb_inopblog;
++      /*
++       * Sanitized to be consistent with each other, only used to
++       * generate disk addresses, so it's safe
++       */
++      xfs.agblocks = le32 (super.sb_agblocks);
+       xfs.agblklog = super.sb_agblklog;
+ 
++      /* Derived from sanitized parameters */
++      xfs.bsize = 1 << super.sb_blocklog;
++      xfs.blklog = super.sb_blocklog;
++      xfs.bdlog = super.sb_blocklog - SECTOR_BITS;
++      xfs.isize = 1 << super.sb_inodelog;
++      xfs.dirbsize = 1 << (super.sb_blocklog + super.sb_dirblklog);
++      xfs.inopblog = super.sb_blocklog - super.sb_inodelog;
++
+       xfs.btnode_ptr0_off =
+               ((xfs.bsize - sizeof(xfs_btree_block_t)) /
+               (sizeof (xfs_bmbt_key_t) + sizeof (xfs_bmbt_ptr_t)))
+diff --git a/tools/libfsimage/xfs/xfs.h b/tools/libfsimage/xfs/xfs.h
+index 40699281e44d..b87e37d3d7e9 100644
+--- tools/libfsimage/xfs/xfs.h.orig
++++ tools/libfsimage/xfs/xfs.h
+@@ -134,6 +134,18 @@ typedef struct xfs_sb
+         xfs_uint8_t       sb_dummy[7];    /* padding */
+ } xfs_sb_t;
+ 
++/* Bound taken from xfs.c in GRUB2. It doesn't exist in the spec */
++#define       XFS_SB_DIRBLK_NUMBITS   27
++/* Implied by the XFS specification. The minimum block size is 512 octets */
++#define       XFS_SB_BLOCKLOG_MIN     9
++/* Implied by the XFS specification. The maximum block size is 65536 octets */
++#define       XFS_SB_BLOCKLOG_MAX     16
++/* Implied by the XFS specification. The minimum inode size is 256 octets */
++#define       XFS_SB_INODELOG_MIN     8
++/* Implied by the XFS specification. The maximum inode size is 2048 octets */
++#define       XFS_SB_INODELOG_MAX     11
++/* High bound for sb_agblklog */
++#define       XFS_SB_AGBLKLOG_MAX     32
+ 
+ /* those are from xfs_btree.h */
+ 
+-- 
+2.42.0
+
+From e72c68e702dd930bc6013182bb44d3e8fbbb6bf4 Mon Sep 17 00:00:00 2001
+From: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Date: Thu, 14 Sep 2023 13:22:53 +0100
+Subject: [PATCH 04/11] libfsimage/xfs: Add compile-time check to libfsimage
+
+Adds the common tools include folder to the -I compile flags
+of libfsimage. This allows us to use:
+  xen-tools/common-macros.h:BUILD_BUG_ON()
+
+With it, statically assert a sanitized "blocklog - SECTOR_BITS" cannot
+underflow.
+
+This is part of XSA-443 / CVE-2023-34325
+
+Signed-off-by: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Reviewed-by: Jan Beulich <jbeulich%suse.com@localhost>
+---
+ tools/libfsimage/Rules.mk       | 2 +-
+ tools/libfsimage/xfs/fsys_xfs.c | 4 +++-
+ 2 files changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/tools/libfsimage/Rules.mk b/tools/libfsimage/Rules.mk
+index bb6d42abb494..80598fb70aa7 100644
+--- tools/libfsimage/Rules.mk.orig
++++ tools/libfsimage/Rules.mk
+@@ -1,6 +1,6 @@
+ include $(XEN_ROOT)/tools/Rules.mk
+ 
+-CFLAGS += -Wno-unknown-pragmas -I$(XEN_ROOT)/tools/libfsimage/common/ -DFSIMAGE_FSDIR=\"$(FSDIR)\"
++CFLAGS += -Wno-unknown-pragmas -I$(XEN_ROOT)/tools/libfsimage/common/ $(CFLAGS_xeninclude) -DFSIMAGE_FSDIR=\"$(FSDIR)\"
+ CFLAGS += -Werror -D_GNU_SOURCE
+ LDFLAGS += -L../common/
+ 
+diff --git a/tools/libfsimage/xfs/fsys_xfs.c b/tools/libfsimage/xfs/fsys_xfs.c
+index e4eb7e1ee26f..4a8dd6f2397b 100644
+--- tools/libfsimage/xfs/fsys_xfs.c.orig
++++ tools/libfsimage/xfs/fsys_xfs.c
+@@ -19,6 +19,7 @@
+ 
+ #include <stdbool.h>
+ #include <xenfsimage_grub.h>
++#include <xen-tools/libs.h>
+ #include "xfs.h"
+ 
+ #define MAX_LINK_COUNT        8
+@@ -477,9 +478,10 @@ xfs_mount (fsi_file_t *ffi, const char *options)
+       xfs.agblklog = super.sb_agblklog;
+ 
+       /* Derived from sanitized parameters */
++      BUILD_BUG_ON(XFS_SB_BLOCKLOG_MIN < SECTOR_BITS);
++      xfs.bdlog = super.sb_blocklog - SECTOR_BITS;
+       xfs.bsize = 1 << super.sb_blocklog;
+       xfs.blklog = super.sb_blocklog;
+-      xfs.bdlog = super.sb_blocklog - SECTOR_BITS;
+       xfs.isize = 1 << super.sb_inodelog;
+       xfs.dirbsize = 1 << (super.sb_blocklog + super.sb_dirblklog);
+       xfs.inopblog = super.sb_blocklog - super.sb_inodelog;
+-- 
+2.42.0
+
+From 75fdc03c5a6b7fac0c3a5ac06a5beaac73aad36f Mon Sep 17 00:00:00 2001
+From: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Date: Mon, 25 Sep 2023 18:32:21 +0100
+Subject: [PATCH 05/11] tools/pygrub: Remove unnecessary hypercall
+
+There's a hypercall being issued in order to determine whether PV64 is
+supported, but since Xen 4.3 that's strictly true so it's not required.
+
+Plus, this way we can avoid mapping the privcmd interface altogether in the
+depriv pygrub.
+
+This is part of XSA-443 / CVE-2023-34325
+
+Signed-off-by: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Reviewed-by: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+---
+ tools/pygrub/src/pygrub | 12 +-----------
+ 1 file changed, 1 insertion(+), 11 deletions(-)
+
+diff --git a/tools/pygrub/src/pygrub b/tools/pygrub/src/pygrub
+index ce7ab0eb8cf3..ce4e07d3e823 100755
+--- tools/pygrub/src/pygrub.orig
++++ tools/pygrub/src/pygrub
+@@ -18,7 +18,6 @@ import os, sys, string, struct, tempfile, re, traceback, stat, errno
+ import copy
+ import logging
+ import platform
+-import xen.lowlevel.xc
+ 
+ import curses, _curses, curses.textpad, curses.ascii
+ import getopt
+@@ -668,14 +667,6 @@ def run_grub(file, entry, fs, cfg_args):
+ 
+     return grubcfg
+ 
+-def supports64bitPVguest():
+-    xc = xen.lowlevel.xc.xc()
+-    caps = xc.xeninfo()['xen_caps'].split(" ")
+-    for cap in caps:
+-        if cap == "xen-3.0-x86_64":
+-            return True
+-    return False
+-
+ # If nothing has been specified, look for a Solaris domU. If found, perform the
+ # necessary tweaks.
+ def sniff_solaris(fs, cfg):
+@@ -684,8 +675,7 @@ def sniff_solaris(fs, cfg):
+         return cfg
+ 
+     if not cfg["kernel"]:
+-        if supports64bitPVguest() and \
+-          fs.file_exists("/platform/i86xpv/kernel/amd64/unix"):
++        if fs.file_exists("/platform/i86xpv/kernel/amd64/unix"):
+             cfg["kernel"] = "/platform/i86xpv/kernel/amd64/unix"
+             cfg["ramdisk"] = "/platform/i86pc/amd64/boot_archive"
+         elif fs.file_exists("/platform/i86xpv/kernel/unix"):
+-- 
+2.42.0
+
+From 1083a16f63461e844e9515ac4d35d48bf55785af Mon Sep 17 00:00:00 2001
+From: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Date: Mon, 25 Sep 2023 18:32:22 +0100
+Subject: [PATCH 06/11] tools/pygrub: Small refactors
+
+Small tidy up to ensure output_directory always has a trailing '/' to ease
+concatenating paths and that `output` can only be a filename or None.
+
+This is part of XSA-443 / CVE-2023-34325
+
+Signed-off-by: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Acked-by: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+---
+ tools/pygrub/src/pygrub | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+diff --git a/tools/pygrub/src/pygrub b/tools/pygrub/src/pygrub
+index ce4e07d3e823..1042c05b8676 100755
+--- tools/pygrub/src/pygrub.orig
++++ tools/pygrub/src/pygrub
+@@ -793,7 +793,7 @@ if __name__ == "__main__":
+     debug = False
+     not_really = False
+     output_format = "sxp"
+-    output_directory = "/var/run/xen/pygrub"
++    output_directory = "/var/run/xen/pygrub/"
+ 
+     # what was passed in
+     incfg = { "kernel": None, "ramdisk": None, "args": "" }
+@@ -815,7 +815,8 @@ if __name__ == "__main__":
+             usage()
+             sys.exit()
+         elif o in ("--output",):
+-            output = a
++            if a != "-":
++                output = a
+         elif o in ("--kernel",):
+             incfg["kernel"] = a
+         elif o in ("--ramdisk",):
+@@ -847,12 +848,11 @@ if __name__ == "__main__":
+             if not os.path.isdir(a):
+                 print("%s is not an existing directory" % a)
+                 sys.exit(1)
+-            output_directory = a
++            output_directory = a + '/'
+ 
+     if debug:
+         logging.basicConfig(level=logging.DEBUG)
+ 
+-
+     try:
+         os.makedirs(output_directory, 0o700)
+     except OSError as e:
+@@ -861,7 +861,7 @@ if __name__ == "__main__":
+         else:
+             raise
+ 
+-    if output is None or output == "-":
++    if output is None:
+         fd = sys.stdout.fileno()
+     else:
+         fd = os.open(output, os.O_WRONLY)
+-- 
+2.42.0
+
+From 350db30e33f39af40c1e3752d73c0a30ef2d26e7 Mon Sep 17 00:00:00 2001
+From: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Date: Mon, 25 Sep 2023 18:32:23 +0100
+Subject: [PATCH 07/11] tools/pygrub: Open the output files earlier
+
+This patch allows pygrub to get ahold of every RW file descriptor it needs
+early on. A later patch will clamp the filesystem it can access so it can't
+obtain any others.
+
+This is part of XSA-443 / CVE-2023-34325
+
+Signed-off-by: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Acked-by: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+---
+ tools/pygrub/src/pygrub | 37 ++++++++++++++++++++++---------------
+ 1 file changed, 22 insertions(+), 15 deletions(-)
+
+diff --git a/tools/pygrub/src/pygrub b/tools/pygrub/src/pygrub
+index 1042c05b8676..91e2ec2ab105 100755
+--- tools/pygrub/src/pygrub.orig
++++ tools/pygrub/src/pygrub
+@@ -738,8 +738,7 @@ if __name__ == "__main__":
+     def usage():
+         print("Usage: %s [-q|--quiet] [-i|--interactive] [-l|--list-entries] [-n|--not-really] [--output=] [--kernel=] [--ramdisk=] [--args=] [--entry=] [--output-directory=] 
[--output-format=sxp|simple|simple0] [--offset=] <image>" %(sys.argv[0],), file=sys.stderr)
+ 
+-    def copy_from_image(fs, file_to_read, file_type, output_directory,
+-                        not_really):
++    def copy_from_image(fs, file_to_read, file_type, fd_dst, path_dst, not_really):
+         if not_really:
+             if fs.file_exists(file_to_read):
+                 return "<%s:%s>" % (file_type, file_to_read)
+@@ -750,21 +749,18 @@ if __name__ == "__main__":
+         except Exception as e:
+             print(e, file=sys.stderr)
+             sys.exit("Error opening %s in guest" % file_to_read)
+-        (tfd, ret) = tempfile.mkstemp(prefix="boot_"+file_type+".",
+-                                      dir=output_directory)
+         dataoff = 0
+         while True:
+             data = datafile.read(FS_READ_MAX, dataoff)
+             if len(data) == 0:
+-                os.close(tfd)
++                os.close(fd_dst)
+                 del datafile
+-                return ret
++                return
+             try:
+-                os.write(tfd, data)
++                os.write(fd_dst, data)
+             except Exception as e:
+                 print(e, file=sys.stderr)
+-                os.close(tfd)
+-                os.unlink(ret)
++                os.unlink(path_dst)
+                 del datafile
+                 sys.exit("Error writing temporary copy of "+file_type)
+             dataoff += len(data)
+@@ -861,6 +857,14 @@ if __name__ == "__main__":
+         else:
+             raise
+ 
++    if not_really:
++        fd_kernel =  path_kernel = fd_ramdisk = path_ramdisk = None
++    else:
++        (fd_kernel, path_kernel) = tempfile.mkstemp(prefix="boot_kernel.",
++                                                    dir=output_directory)
++        (fd_ramdisk, path_ramdisk) = tempfile.mkstemp(prefix="boot_ramdisk.",
++                                                      dir=output_directory)
++
+     if output is None:
+         fd = sys.stdout.fileno()
+     else:
+@@ -920,20 +924,23 @@ if __name__ == "__main__":
+     if fs is None:
+         raise RuntimeError("Unable to find partition containing kernel")
+ 
+-    bootcfg["kernel"] = copy_from_image(fs, chosencfg["kernel"], "kernel",
+-                                        output_directory, not_really)
++    copy_from_image(fs, chosencfg["kernel"], "kernel",
++                    fd_kernel, path_kernel, not_really)
++    bootcfg["kernel"] = path_kernel
+ 
+     if chosencfg["ramdisk"]:
+         try:
+-            bootcfg["ramdisk"] = copy_from_image(fs, chosencfg["ramdisk"],
+-                                                 "ramdisk", output_directory,
+-                                                 not_really)
++            copy_from_image(fs, chosencfg["ramdisk"], "ramdisk",
++                            fd_ramdisk, path_ramdisk, not_really)
+         except:
+             if not not_really:
+-                os.unlink(bootcfg["kernel"])
++                os.unlink(path_kernel)
+             raise
++        bootcfg["ramdisk"] = path_ramdisk
+     else:
+         initrd = None
++        if not not_really:
++            os.unlink(path_ramdisk)
+ 
+     args = None
+     if chosencfg["args"]:
+-- 
+2.42.0
+
+From 1548ad2291ec7a72ae6949c11d2e50cea135a48d Mon Sep 17 00:00:00 2001
+From: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Date: Mon, 25 Sep 2023 18:32:24 +0100
+Subject: [PATCH 08/11] tools/libfsimage: Export a new function to preload all
+ plugins
+
+This is work required in order to let pygrub operate in highly deprivileged
+chroot mode. This patch adds a function that preloads every plugin, hence
+ensuring that a on function exit, every shared library is loaded in memory.
+
+The new "init" function is supposed to be used before depriv, but that's
+fine because it's not acting on untrusted data.
+
+This is part of XSA-443 / CVE-2023-34325
+
+Signed-off-by: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Acked-by: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+---
+ tools/libfsimage/common/fsimage_plugin.c |  4 ++--
+ tools/libfsimage/common/mapfile-GNU      |  1 +
+ tools/libfsimage/common/mapfile-SunOS    |  1 +
+ tools/libfsimage/common/xenfsimage.h     |  8 ++++++++
+ tools/pygrub/src/fsimage/fsimage.c       | 15 +++++++++++++++
+ 5 files changed, 27 insertions(+), 2 deletions(-)
+
+diff --git a/tools/libfsimage/common/fsimage_plugin.c b/tools/libfsimage/common/fsimage_plugin.c
+index de1412b4233a..d0cb9e96a654 100644
+--- tools/libfsimage/common/fsimage_plugin.c.orig
++++ tools/libfsimage/common/fsimage_plugin.c
+@@ -119,7 +119,7 @@ fail:
+       return (-1);
+ }
+ 
+-static int load_plugins(void)
++int fsi_init(void)
+ {
+       const char *fsdir = getenv("XEN_FSIMAGE_FSDIR");
+       struct dirent *dp = NULL;
+@@ -180,7 +180,7 @@ int find_plugin(fsi_t *fsi, const char *path, const char *options)
+       fsi_plugin_t *fp;
+       int ret = 0;
+ 
+-      if (plugins == NULL && (ret = load_plugins()) != 0)
++      if (plugins == NULL && (ret = fsi_init()) != 0)
+               goto out;
+ 
+       for (fp = plugins; fp != NULL; fp = fp->fp_next) {
+diff --git a/tools/libfsimage/common/mapfile-GNU b/tools/libfsimage/common/mapfile-GNU
+index 26d4d7a69ec7..2d54d527d7f5 100644
+--- tools/libfsimage/common/mapfile-GNU.orig
++++ tools/libfsimage/common/mapfile-GNU
+@@ -1,6 +1,7 @@
+ VERSION {
+       libfsimage.so.1.0 {
+               global:
++                      fsi_init;
+                       fsi_open_fsimage;
+                       fsi_close_fsimage;
+                       fsi_file_exists;
+diff --git a/tools/libfsimage/common/mapfile-SunOS b/tools/libfsimage/common/mapfile-SunOS
+index e99b90b65077..48deedb4252f 100644
+--- tools/libfsimage/common/mapfile-SunOS.orig
++++ tools/libfsimage/common/mapfile-SunOS
+@@ -1,5 +1,6 @@
+ libfsimage.so.1.0 {
+       global:
++              fsi_init;
+               fsi_open_fsimage;
+               fsi_close_fsimage;
+               fsi_file_exists;
+diff --git a/tools/libfsimage/common/xenfsimage.h b/tools/libfsimage/common/xenfsimage.h
+index 201abd54f23a..341883b2d71a 100644
+--- tools/libfsimage/common/xenfsimage.h.orig
++++ tools/libfsimage/common/xenfsimage.h
+@@ -35,6 +35,14 @@ extern C {
+ typedef struct fsi fsi_t;
+ typedef struct fsi_file fsi_file_t;
+ 
++/*
++ * Optional initialization function. If invoked it loads the associated
++ * dynamic libraries for the backends ahead of time. This is required if
++ * the library is to run as part of a highly deprivileged executable, as
++ * the libraries may not be reachable after depriv.
++ */
++int fsi_init(void);
++
+ fsi_t *fsi_open_fsimage(const char *, uint64_t, const char *);
+ void fsi_close_fsimage(fsi_t *);
+ 
+diff --git a/tools/pygrub/src/fsimage/fsimage.c b/tools/pygrub/src/fsimage/fsimage.c
+index 2ebbbe35df92..92fbf2851f01 100644
+--- tools/pygrub/src/fsimage/fsimage.c.orig
++++ tools/pygrub/src/fsimage/fsimage.c
+@@ -286,6 +286,15 @@ fsimage_getbootstring(PyObject *o, PyObject *args)
+       return Py_BuildValue("s", bootstring);
+ }
+ 
++static PyObject *
++fsimage_init(PyObject *o, PyObject *args)
++{
++      if (!PyArg_ParseTuple(args, ""))
++              return (NULL);
++
++      return Py_BuildValue("i", fsi_init());
++}
++
+ PyDoc_STRVAR(fsimage_open__doc__,
+     "open(name, [offset=off]) - Open the given file as a filesystem image.\n"
+     "\n"
+@@ -297,7 +306,13 @@ PyDoc_STRVAR(fsimage_getbootstring__doc__,
+     "getbootstring(fs) - Return the boot string needed for this file system "
+     "or NULL if none is needed.\n");
+ 
++PyDoc_STRVAR(fsimage_init__doc__,
++    "init() - Loads every dynamic library contained in xenfsimage "
++    "into memory so that it can be used in chrooted environments.\n");
++
+ static struct PyMethodDef fsimage_module_methods[] = {
++      { "init", (PyCFunction)fsimage_init,
++          METH_VARARGS, fsimage_init__doc__ },
+       { "open", (PyCFunction)fsimage_open,
+           METH_VARARGS|METH_KEYWORDS, fsimage_open__doc__ },
+       { "getbootstring", (PyCFunction)fsimage_getbootstring,
+-- 
+2.42.0
+
+From 4d331b0b914dfc17bd2d883bc55aeb798930832a Mon Sep 17 00:00:00 2001
+From: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Date: Mon, 25 Sep 2023 18:32:25 +0100
+Subject: [PATCH 09/11] tools/pygrub: Deprivilege pygrub
+
+Introduce a --runas=<uid> flag to deprivilege pygrub on Linux and *BSDs. It
+also implicitly creates a chroot env where it drops a deprivileged forked
+process. The chroot itself is cleaned up at the end.
+
+If the --runas arg is present, then pygrub forks, leaving the child to
+deprivilege itself, and waiting for it to complete. When the child exists,
+the parent performs cleanup and exits with the same error code.
+
+This is roughly what the child does:
+  1. Initialize libfsimage (this loads every .so in memory so the chroot
+     can avoid bind-mounting /{,usr}/lib*
+  2. Create a temporary empty chroot directory
+  3. Mount tmpfs in it
+  4. Bind mount the disk inside, because libfsimage expects a path, not a
+     file descriptor.
+  5. Remount the root tmpfs to be stricter (ro,nosuid,nodev)
+  6. Set RLIMIT_FSIZE to a sensibly high amount (128 MiB)
+  7. Depriv gid, groups and uid
+
+With this scheme in place, the "output" files are writable (up to
+RLIMIT_FSIZE octets) and the exposed filesystem is immutable and contains
+the single only file we can't easily get rid of (the disk).
+
+If running on Linux, the child process also unshares mount, IPC, and
+network namespaces before dropping its privileges.
+
+This is part of XSA-443 / CVE-2023-34325
+
+Signed-off-by: Alejandro Vallejo <alejandro.vallejo%cloud.com@localhost>
+Acked-by: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+---
+ tools/pygrub/setup.py   |   2 +-
+ tools/pygrub/src/pygrub | 162 +++++++++++++++++++++++++++++++++++++---
+ 2 files changed, 154 insertions(+), 10 deletions(-)
+
+diff --git a/tools/pygrub/setup.py b/tools/pygrub/setup.py
+index b8f1dc4590cf..f16187b6d118 100644
+--- tools/pygrub/setup.py.orig
++++ tools/pygrub/setup.py
+@@ -17,7 +17,7 @@ xenfsimage = Extension("xenfsimage",
+ pkgs = [ 'grub' ]
+ 
+ setup(name='pygrub',
+-      version='0.6',
++      version='0.7',
+       description='Boot loader that looks a lot like grub for Xen',
+       author='Jeremy Katz',
+       author_email='katzj%redhat.com@localhost',
+diff --git a/tools/pygrub/src/pygrub b/tools/pygrub/src/pygrub
+index 91e2ec2ab105..7cea496ade08 100755
+--- tools/pygrub/src/pygrub.orig
++++ tools/pygrub/src/pygrub
+@@ -16,8 +16,11 @@ from __future__ import print_function
+ 
+ import os, sys, string, struct, tempfile, re, traceback, stat, errno
+ import copy
++import ctypes, ctypes.util
+ import logging
+ import platform
++import resource
++import subprocess
+ 
+ import curses, _curses, curses.textpad, curses.ascii
+ import getopt
+@@ -27,10 +30,135 @@ import grub.GrubConf
+ import grub.LiloConf
+ import grub.ExtLinuxConf
+ 
+-PYGRUB_VER = 0.6
++PYGRUB_VER = 0.7
+ FS_READ_MAX = 1024 * 1024
+ SECTOR_SIZE = 512
+ 
++# Unless provided through the env variable PYGRUB_MAX_FILE_SIZE_MB, then
++# this is the maximum filesize allowed for files written by the depriv
++# pygrub
++LIMIT_FSIZE = 128 << 20
++
++CLONE_NEWNS = 0x00020000 # mount namespace
++CLONE_NEWNET = 0x40000000 # network namespace
++CLONE_NEWIPC = 0x08000000 # IPC namespace
++
++def unshare(flags):
++    if not sys.platform.startswith("linux"):
++        print("skip_unshare reason=not_linux platform=%s", sys.platform, file=sys.stderr)
++        return
++
++    libc = ctypes.CDLL(ctypes.util.find_library('c'), use_errno=True)
++    unshare_prototype = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, use_errno=True)
++    unshare = unshare_prototype(('unshare', libc))
++
++    if unshare(flags) < 0:
++        raise OSError(ctypes.get_errno(), os.strerror(ctypes.get_errno()))
++
++def bind_mount(src, dst, options):
++    open(dst, "a").close() # touch
++
++    rc = subprocess.call(["mount", "--bind", "-o", options, src, dst])
++    if rc != 0:
++        raise RuntimeError("bad_mount: src=%s dst=%s opts=%s" %
++                           (src, dst, options))
++
++def downgrade_rlimits():
++    # Wipe the authority to use unrequired resources
++    resource.setrlimit(resource.RLIMIT_NPROC,    (0, 0))
++    resource.setrlimit(resource.RLIMIT_CORE,     (0, 0))
++    resource.setrlimit(resource.RLIMIT_MEMLOCK,  (0, 0))
++
++    # py2's resource module doesn't know about resource.RLIMIT_MSGQUEUE
++    #
++    # TODO: Use resource.RLIMIT_MSGQUEUE after python2 is deprecated
++    if sys.platform.startswith('linux'):
++        RLIMIT_MSGQUEUE = 12
++        resource.setrlimit(RLIMIT_MSGQUEUE, (0, 0))
++
++    # The final look of the filesystem for this process is fully RO, but
++    # note we have some file descriptor already open (notably, kernel and
++    # ramdisk). In order to avoid a compromised pygrub from filling up the
++    # filesystem we set RLIMIT_FSIZE to a high bound, so that the file
++    # write permissions are bound.
++    fsize = LIMIT_FSIZE
++    if "PYGRUB_MAX_FILE_SIZE_MB" in os.environ.keys():
++        fsize = os.environ["PYGRUB_MAX_FILE_SIZE_MB"] << 20
++
++    resource.setrlimit(resource.RLIMIT_FSIZE, (fsize, fsize))
++
++def depriv(output_directory, output, device, uid, path_kernel, path_ramdisk):
++    # The only point of this call is to force the loading of libfsimage.
++    # That way, we don't need to bind-mount it into the chroot
++    rc = xenfsimage.init()
++    if rc != 0:
++        os.unlink(path_ramdisk)
++        os.unlink(path_kernel)
++        raise RuntimeError("bad_xenfsimage: rc=%d" % rc)
++
++    # Create a temporary directory for the chroot
++    chroot = tempfile.mkdtemp(prefix=str(uid)+'-', dir=output_directory) + '/'
++    device_path = '/device'
++
++    pid = os.fork()
++    if pid:
++        # parent
++        _, rc = os.waitpid(pid, 0)
++
++        for path in [path_kernel, path_ramdisk]:
++            # If the child didn't write anything, just get rid of it,
++            # otherwise we end up consuming a 0-size file when parsing
++            # systems without a ramdisk that the ultimate caller of pygrub
++            # may just be unaware of
++            if rc != 0 or os.path.getsize(path) == 0:
++                os.unlink(path)
++
++        # Normally, unshare(CLONE_NEWNS) will ensure this is not required.
++        # However, this syscall doesn't exist in *BSD systems and doesn't
++        # auto-unmount everything on older Linux kernels (At least as of
++        # Linux 4.19, but it seems fixed in 5.15). Either way,
++        # recursively unmount everything if needed. Quietly.
++        with open('/dev/null', 'w') as devnull:
++            subprocess.call(["umount", "-f", chroot + device_path],
++                            stdout=devnull, stderr=devnull)
++            subprocess.call(["umount", "-f", chroot],
++                            stdout=devnull, stderr=devnull)
++        os.rmdir(chroot)
++
++        sys.exit(rc)
++
++    # By unsharing the namespace we're making sure it's all bulk-released
++    # at the end, when the namespaces disappear. This means the kernel does
++    # (almost) all the cleanup for us and the parent just has to remove the
++    # temporary directory.
++    unshare(CLONE_NEWNS | CLONE_NEWIPC | CLONE_NEWNET)
++
++    # Set sensible limits using the setrlimit interface
++    downgrade_rlimits()
++
++    # We'll mount tmpfs on the chroot to ensure the deprivileged child
++    # cannot affect the persistent state. It's RW now in order to
++    # bind-mount the device, but note it's remounted RO after that.
++    rc = subprocess.call(["mount", "-t", "tmpfs", "none", chroot])
++    if rc != 0:
++        raise RuntimeError("mount_tmpfs rc=%d dst=\"%s\"" % (rc, chroot))
++
++    # Bind the untrusted device RO
++    bind_mount(device, chroot + device_path, "ro,nosuid,noexec")
++
++    rc = subprocess.call(["mount", "-t", "tmpfs", "-o", "remount,ro,nosuid,noexec,nodev", "none", chroot])
++    if rc != 0:
++        raise RuntimeError("remount_tmpfs rc=%d dst=\"%s\"" % (rc, chroot))
++
++    # Drop superpowers!
++    os.chroot(chroot)
++    os.chdir('/')
++    os.setgid(uid)
++    os.setgroups([uid])
++    os.setuid(uid)
++
++    return device_path
++
+ def read_size_roundup(fd, size):
+     if platform.system() != 'FreeBSD':
+         return size
+@@ -736,7 +864,7 @@ if __name__ == "__main__":
+     sel = None
+     
+     def usage():
+-        print("Usage: %s [-q|--quiet] [-i|--interactive] [-l|--list-entries] [-n|--not-really] [--output=] [--kernel=] [--ramdisk=] [--args=] [--entry=] [--output-directory=] 
[--output-format=sxp|simple|simple0] [--offset=] <image>" %(sys.argv[0],), file=sys.stderr)
++        print("Usage: %s [-q|--quiet] [-i|--interactive] [-l|--list-entries] [-n|--not-really] [--output=] [--kernel=] [--ramdisk=] [--args=] [--entry=] [--output-directory=] 
[--output-format=sxp|simple|simple0] [--runas=] [--offset=] <image>" %(sys.argv[0],), file=sys.stderr)
+ 
+     def copy_from_image(fs, file_to_read, file_type, fd_dst, path_dst, not_really):
+         if not_really:
+@@ -760,7 +888,8 @@ if __name__ == "__main__":
+                 os.write(fd_dst, data)
+             except Exception as e:
+                 print(e, file=sys.stderr)
+-                os.unlink(path_dst)
++                if path_dst:
++                    os.unlink(path_dst)
+                 del datafile
+                 sys.exit("Error writing temporary copy of "+file_type)
+             dataoff += len(data)
+@@ -769,7 +898,7 @@ if __name__ == "__main__":
+         opts, args = getopt.gnu_getopt(sys.argv[1:], 'qilnh::',
+                                    ["quiet", "interactive", "list-entries", "not-really", "help",
+                                     "output=", "output-format=", "output-directory=", "offset=",
+-                                    "entry=", "kernel=", 
++                                    "runas=", "entry=", "kernel=",
+                                     "ramdisk=", "args=", "isconfig", "debug"])
+     except getopt.GetoptError:
+         usage()
+@@ -790,6 +919,7 @@ if __name__ == "__main__":
+     not_really = False
+     output_format = "sxp"
+     output_directory = "/var/run/xen/pygrub/"
++    uid = None
+ 
+     # what was passed in
+     incfg = { "kernel": None, "ramdisk": None, "args": "" }
+@@ -813,6 +943,13 @@ if __name__ == "__main__":
+         elif o in ("--output",):
+             if a != "-":
+                 output = a
++        elif o in ("--runas",):
++            try:
++                uid = int(a)
++            except ValueError:
++                print("runas value must be an integer user id")
++                usage()
++                sys.exit(1)
+         elif o in ("--kernel",):
+             incfg["kernel"] = a
+         elif o in ("--ramdisk",):
+@@ -849,6 +986,10 @@ if __name__ == "__main__":
+     if debug:
+         logging.basicConfig(level=logging.DEBUG)
+ 
++    if interactive and uid:
++        print("In order to use --runas, you must also set --entry or -q", file=sys.stderr)
++        sys.exit(1)
++
+     try:
+         os.makedirs(output_directory, 0o700)
+     except OSError as e:
+@@ -870,6 +1011,9 @@ if __name__ == "__main__":
+     else:
+         fd = os.open(output, os.O_WRONLY)
+ 
++    if uid:
++        file = depriv(output_directory, output, file, uid, path_kernel, path_ramdisk)
++
+     # debug
+     if isconfig:
+         chosencfg = run_grub(file, entry, fs, incfg["args"])
+@@ -925,21 +1069,21 @@ if __name__ == "__main__":
+         raise RuntimeError("Unable to find partition containing kernel")
+ 
+     copy_from_image(fs, chosencfg["kernel"], "kernel",
+-                    fd_kernel, path_kernel, not_really)
++                    fd_kernel, None if uid else path_kernel, not_really)
+     bootcfg["kernel"] = path_kernel
+ 
+     if chosencfg["ramdisk"]:
+         try:
+             copy_from_image(fs, chosencfg["ramdisk"], "ramdisk",
+-                            fd_ramdisk, path_ramdisk, not_really)
++                            fd_ramdisk, None if uid else path_ramdisk, not_really)
+         except:
+-            if not not_really:
+-                os.unlink(path_kernel)
++            if not uid and not not_really:
++                    os.unlink(path_kernel)
+             raise
+         bootcfg["ramdisk"] = path_ramdisk
+     else:
+         initrd = None
+-        if not not_really:
++        if not uid and not not_really:
+             os.unlink(path_ramdisk)
+ 
+     args = None
+-- 
+2.42.0
+
+From 576e7aa02ab838b6768b498f310c70ca49537202 Mon Sep 17 00:00:00 2001
+From: Roger Pau Monne <roger.pau%citrix.com@localhost>
+Date: Mon, 25 Sep 2023 14:30:20 +0200
+Subject: [PATCH 10/11] libxl: add support for running bootloader in restricted
+ mode
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Much like the device model depriv mode, add the same kind of support for the
+bootloader.  Such feature allows passing a UID as a parameter for the
+bootloader to run as, together with the bootloader itself taking the necessary
+actions to isolate.
+
+Note that the user to run the bootloader as must have the right permissions to
+access the guest disk image (in read mode only), and that the bootloader will
+be run in non-interactive mode when restricted.
+
+If enabled bootloader restrict mode will attempt to re-use the user(s) from the
+QEMU depriv implementation if no user is provided on the configuration file or
+the environment.  See docs/features/qemu-deprivilege.pandoc for more
+information about how to setup those users.
+
+Bootloader restrict mode is not enabled by default as it requires certain
+setup to be done first (setup of the user(s) to use in restrict mode).
+
+This is part of XSA-443 / CVE-2023-34325
+
+Signed-off-by: Roger Pau Monné <roger.pau%citrix.com@localhost>
+Reviewed-by: Anthony PERARD <anthony.perard%citrix.com@localhost>
+---
+ docs/man/xl.1.pod.in                | 33 +++++++++++
+ tools/libs/light/libxl_bootloader.c | 89 ++++++++++++++++++++++++++++-
+ tools/libs/light/libxl_dm.c         |  8 +--
+ tools/libs/light/libxl_internal.h   |  8 +++
+ 4 files changed, 131 insertions(+), 7 deletions(-)
+
+diff --git a/docs/man/xl.1.pod.in b/docs/man/xl.1.pod.in
+index 45e1430aeb74..96e6fb1c32a3 100644
+--- docs/man/xl.1.pod.in.orig
++++ docs/man/xl.1.pod.in
+@@ -1976,6 +1976,39 @@ ignored:
+ 
+ =back
+ 
++=head1 ENVIRONMENT VARIABLES
++
++The following environment variables shall affect the execution of xl:
++
++=over 4
++
++=item LIBXL_BOOTLOADER_RESTRICT
++
++Attempt to restrict the bootloader after startup, to limit the
++consequences of security vulnerabilities due to parsing guest
++owned image files.
++
++See docs/features/qemu-deprivilege.pandoc for more information
++on how to setup the unprivileged users.
++
++Note that running the bootloader in restricted mode also implies using
++non-interactive mode, and the disk image must be readable by the
++restricted user.
++
++Having this variable set is equivalent to enabling the option, even if the
++value is 0.
++
++=item LIBXL_BOOTLOADER_USER
++
++When using bootloader_restrict, run the bootloader as this user.  If
++not set the default QEMU restrict users will be used.
++
++NOTE: Each domain MUST have a SEPARATE username.
++
++See docs/features/qemu-deprivilege.pandoc for more information.
++
++=back
++
+ =head1 SEE ALSO
+ 
+ The following man pages:
+diff --git a/tools/libs/light/libxl_bootloader.c b/tools/libs/light/libxl_bootloader.c
+index 18e9ebd7148c..97d9bf4ddc0a 100644
+--- tools/libs/light/libxl_bootloader.c.orig
++++ tools/libs/light/libxl_bootloader.c
+@@ -14,6 +14,7 @@
+ 
+ #include "libxl_osdeps.h" /* must come before any other headers */
+ 
++#include <pwd.h>
+ #include <termios.h>
+ #ifdef HAVE_UTMP_H
+ #include <utmp.h>
+@@ -46,8 +47,71 @@ static void bootloader_arg(libxl__bootloader_state *bl, const char *arg)
+     bl->args[bl->nargs++] = arg;
+ }
+ 
+-static void make_bootloader_args(libxl__gc *gc, libxl__bootloader_state *bl,
+-                                 const char *bootloader_path)
++static int bootloader_uid(libxl__gc *gc, domid_t guest_domid,
++                          const char *user, uid_t *intended_uid)
++{
++    struct passwd *user_base, user_pwbuf;
++    int rc;
++
++    if (user) {
++        rc = userlookup_helper_getpwnam(gc, user, &user_pwbuf, &user_base);
++        if (rc) return rc;
++
++        if (!user_base) {
++            LOGD(ERROR, guest_domid, "Couldn't find user %s", user);
++            return ERROR_INVAL;
++        }
++
++        *intended_uid = user_base->pw_uid;
++        return 0;
++    }
++
++    /* Re-use QEMU user range for the bootloader. */
++    rc = userlookup_helper_getpwnam(gc, LIBXL_QEMU_USER_RANGE_BASE,
++                                    &user_pwbuf, &user_base);
++    if (rc) return rc;
++
++    if (user_base) {
++        struct passwd *user_clash, user_clash_pwbuf;
++        uid_t temp_uid = user_base->pw_uid + guest_domid;
++
++        rc = userlookup_helper_getpwuid(gc, temp_uid, &user_clash_pwbuf,
++                                        &user_clash);
++        if (rc) return rc;
++
++        if (user_clash) {
++            LOGD(ERROR, guest_domid,
++                 "wanted to use uid %ld (%s + %d) but that is user %s !",
++                 (long)temp_uid, LIBXL_QEMU_USER_RANGE_BASE,
++                 guest_domid, user_clash->pw_name);
++            return ERROR_INVAL;
++        }
++
++        *intended_uid = temp_uid;
++        return 0;
++    }
++
++    rc = userlookup_helper_getpwnam(gc, LIBXL_QEMU_USER_SHARED, &user_pwbuf,
++                                    &user_base);
++    if (rc) return rc;
++
++    if (user_base) {
++        LOGD(WARN, guest_domid, "Could not find user %s, falling back to %s",
++             LIBXL_QEMU_USER_RANGE_BASE, LIBXL_QEMU_USER_SHARED);
++        *intended_uid = user_base->pw_uid;
++
++        return 0;
++    }
++
++    LOGD(ERROR, guest_domid,
++    "Could not find user %s or range base pseudo-user %s, cannot restrict",
++         LIBXL_QEMU_USER_SHARED, LIBXL_QEMU_USER_RANGE_BASE);
++
++    return ERROR_INVAL;
++}
++
++static int make_bootloader_args(libxl__gc *gc, libxl__bootloader_state *bl,
++                                const char *bootloader_path)
+ {
+     const libxl_domain_build_info *info = bl->info;
+ 
+@@ -65,6 +129,23 @@ static void make_bootloader_args(libxl__gc *gc, libxl__bootloader_state *bl,
+         ARG(GCSPRINTF("--ramdisk=%s", info->ramdisk));
+     if (info->cmdline && *info->cmdline != '\0')
+         ARG(GCSPRINTF("--args=%s", info->cmdline));
++    if (getenv("LIBXL_BOOTLOADER_RESTRICT") ||
++        getenv("LIBXL_BOOTLOADER_USER")) {
++        uid_t uid = -1;
++        int rc = bootloader_uid(gc, bl->domid, getenv("LIBXL_BOOTLOADER_USER"),
++                                &uid);
++
++        if (rc) return rc;
++
++        assert(uid != -1);
++        if (!uid) {
++            LOGD(ERROR, bl->domid, "bootloader restrict UID is 0 (root)!");
++            return ERROR_INVAL;
++        }
++        LOGD(DEBUG, bl->domid, "using uid %ld", (long)uid);
++        ARG(GCSPRINTF("--runas=%ld", (long)uid));
++        ARG("--quiet");
++    }
+ 
+     ARG(GCSPRINTF("--output=%s", bl->outputpath));
+     ARG("--output-format=simple0");
+@@ -83,6 +164,7 @@ static void make_bootloader_args(libxl__gc *gc, libxl__bootloader_state *bl,
+     /* Sentinel for execv */
+     ARG(NULL);
+ 
++    return 0;
+ #undef ARG
+ }
+ 
+@@ -447,7 +529,8 @@ static void bootloader_disk_attached_cb(libxl__egc *egc,
+             bootloader = bltmp;
+     }
+ 
+-    make_bootloader_args(gc, bl, bootloader);
++    rc = make_bootloader_args(gc, bl, bootloader);
++    if (rc) goto out;
+ 
+     bl->openpty.ao = ao;
+     bl->openpty.callback = bootloader_gotptys;
+diff --git a/tools/libs/light/libxl_dm.c b/tools/libs/light/libxl_dm.c
+index b86e8ccc858f..59de5c1ae22f 100644
+--- tools/libs/light/libxl_dm.c.orig
++++ tools/libs/light/libxl_dm.c
+@@ -80,10 +80,10 @@ static int libxl__create_qemu_logfile(libxl__gc *gc, char *name)
+  *  On error, return a libxl-style error code.
+  */
+ #define DEFINE_USERLOOKUP_HELPER(NAME,SPEC_TYPE,STRUCTNAME,SYSCONF)     \
+-    static int userlookup_helper_##NAME(libxl__gc *gc,                  \
+-                                        SPEC_TYPE spec,                 \
+-                                        struct STRUCTNAME *resultbuf,   \
+-                                        struct STRUCTNAME **out)        \
++    int userlookup_helper_##NAME(libxl__gc *gc,                         \
++                                 SPEC_TYPE spec,                        \
++                                 struct STRUCTNAME *resultbuf,          \
++                                 struct STRUCTNAME **out)               \
+     {                                                                   \
+         struct STRUCTNAME *resultp = NULL;                              \
+         char *buf = NULL;                                               \
+diff --git a/tools/libs/light/libxl_internal.h b/tools/libs/light/libxl_internal.h
+index cc27c72ecf30..8415d1feed16 100644
+--- tools/libs/light/libxl_internal.h.orig
++++ tools/libs/light/libxl_internal.h
+@@ -4864,6 +4864,14 @@ struct libxl__cpu_policy {
+     struct xc_msr *msr;
+ };
+ 
++struct passwd;
++_hidden int userlookup_helper_getpwnam(libxl__gc*, const char *user,
++                                       struct passwd *res,
++                                       struct passwd **out);
++_hidden int userlookup_helper_getpwuid(libxl__gc*, uid_t uid,
++                                       struct passwd *res,
++                                       struct passwd **out);
++
+ #endif
+ 
+ /*
+-- 
+2.42.0
+
+From 34221884752bb835bbdab66378b3cecbf133e3d3 Mon Sep 17 00:00:00 2001
+From: Roger Pau Monne <roger.pau%citrix.com@localhost>
+Date: Thu, 28 Sep 2023 12:22:35 +0200
+Subject: [PATCH 11/11] libxl: limit bootloader execution in restricted mode
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Introduce a timeout for bootloader execution when running in restricted mode.
+
+Allow overwriting the default time out with an environment provided value.
+
+This is part of XSA-443 / CVE-2023-34325
+
+Signed-off-by: Roger Pau Monné <roger.pau%citrix.com@localhost>
+Reviewed-by: Anthony PERARD <anthony.perard%citrix.com@localhost>
+---
+ docs/man/xl.1.pod.in                |  8 ++++++
+ tools/libs/light/libxl_bootloader.c | 40 +++++++++++++++++++++++++++++
+ tools/libs/light/libxl_internal.h   |  2 ++
+ 3 files changed, 50 insertions(+)
+
+diff --git a/docs/man/xl.1.pod.in b/docs/man/xl.1.pod.in
+index 96e6fb1c32a3..8f056450a730 100644
+--- docs/man/xl.1.pod.in.orig
++++ docs/man/xl.1.pod.in
+@@ -2007,6 +2007,14 @@ NOTE: Each domain MUST have a SEPARATE username.
+ 
+ See docs/features/qemu-deprivilege.pandoc for more information.
+ 
++=item LIBXL_BOOTLOADER_TIMEOUT
++
++Timeout in seconds for bootloader execution when running in restricted mode.
++Otherwise the build time default in LIBXL_BOOTLOADER_TIMEOUT will be used.
++
++If defined the value must be an unsigned integer between 0 and INT_MAX,
++otherwise behavior is undefined.  Setting to 0 disables the timeout.
++
+ =back
+ 
+ =head1 SEE ALSO
+diff --git a/tools/libs/light/libxl_bootloader.c b/tools/libs/light/libxl_bootloader.c
+index 97d9bf4ddc0a..3ca6463e5f63 100644
+--- tools/libs/light/libxl_bootloader.c.orig
++++ tools/libs/light/libxl_bootloader.c
+@@ -34,6 +34,8 @@ static void bootloader_keystrokes_copyfail(libxl__egc *egc,
+        libxl__datacopier_state *dc, int rc, int onwrite, int errnoval);
+ static void bootloader_display_copyfail(libxl__egc *egc,
+        libxl__datacopier_state *dc, int rc, int onwrite, int errnoval);
++static void bootloader_timeout(libxl__egc *egc, libxl__ev_time *ev,
++                               const struct timeval *requested_abs, int rc);
+ static void bootloader_domaindeath(libxl__egc*, libxl__domaindeathcheck *dc,
+                                    int rc);
+ static void bootloader_finished(libxl__egc *egc, libxl__ev_child *child,
+@@ -301,6 +303,7 @@ void libxl__bootloader_init(libxl__bootloader_state *bl)
+     bl->ptys[0].master = bl->ptys[0].slave = 0;
+     bl->ptys[1].master = bl->ptys[1].slave = 0;
+     libxl__ev_child_init(&bl->child);
++    libxl__ev_time_init(&bl->time);
+     libxl__domaindeathcheck_init(&bl->deathcheck);
+     bl->keystrokes.ao = bl->ao;  libxl__datacopier_init(&bl->keystrokes);
+     bl->display.ao = bl->ao;     libxl__datacopier_init(&bl->display);
+@@ -318,6 +321,7 @@ static void bootloader_cleanup(libxl__egc *egc, libxl__bootloader_state *bl)
+     libxl__domaindeathcheck_stop(gc,&bl->deathcheck);
+     libxl__datacopier_kill(&bl->keystrokes);
+     libxl__datacopier_kill(&bl->display);
++    libxl__ev_time_deregister(gc, &bl->time);
+     for (i=0; i<2; i++) {
+         libxl__carefd_close(bl->ptys[i].master);
+         libxl__carefd_close(bl->ptys[i].slave);
+@@ -379,6 +383,7 @@ static void bootloader_stop(libxl__egc *egc,
+ 
+     libxl__datacopier_kill(&bl->keystrokes);
+     libxl__datacopier_kill(&bl->display);
++    libxl__ev_time_deregister(gc, &bl->time);
+     if (libxl__ev_child_inuse(&bl->child)) {
+         r = kill(bl->child.pid, SIGTERM);
+         if (r) LOGED(WARN, bl->domid, "%sfailed to kill bootloader [%lu]",
+@@ -641,6 +646,25 @@ static void bootloader_gotptys(libxl__egc *egc, libxl__openpty_state *op)
+ 
+     struct termios termattr;
+ 
++    if (getenv("LIBXL_BOOTLOADER_RESTRICT") ||
++        getenv("LIBXL_BOOTLOADER_USER")) {
++        const char *timeout_env = getenv("LIBXL_BOOTLOADER_TIMEOUT");
++        int timeout = timeout_env ? atoi(timeout_env)
++                                  : LIBXL_BOOTLOADER_TIMEOUT;
++
++        if (timeout) {
++            /* Set execution timeout */
++            rc = libxl__ev_time_register_rel(ao, &bl->time,
++                                            bootloader_timeout,
++                                            timeout * 1000);
++            if (rc) {
++                LOGED(ERROR, bl->domid,
++                      "unable to register timeout for bootloader execution");
++                goto out;
++            }
++        }
++    }
++
+     pid_t pid = libxl__ev_child_fork(gc, &bl->child, bootloader_finished);
+     if (pid == -1) {
+         rc = ERROR_FAIL;
+@@ -706,6 +730,21 @@ static void bootloader_display_copyfail(libxl__egc *egc,
+     libxl__bootloader_state *bl = CONTAINER_OF(dc, *bl, display);
+     bootloader_copyfail(egc, "bootloader output", bl, 1, rc,onwrite,errnoval);
+ }
++static void bootloader_timeout(libxl__egc *egc, libxl__ev_time *ev,
++                               const struct timeval *requested_abs, int rc)
++{
++    libxl__bootloader_state *bl = CONTAINER_OF(ev, *bl, time);
++    STATE_AO_GC(bl->ao);
++
++    libxl__ev_time_deregister(gc, &bl->time);
++
++    assert(libxl__ev_child_inuse(&bl->child));
++    LOGD(ERROR, bl->domid, "killing bootloader because of timeout");
++
++    libxl__ev_child_kill_deregister(ao, &bl->child, SIGKILL);
++
++    bootloader_callback(egc, bl, rc);
++}
+ 
+ static void bootloader_domaindeath(libxl__egc *egc,
+                                    libxl__domaindeathcheck *dc,
+@@ -722,6 +761,7 @@ static void bootloader_finished(libxl__egc *egc, libxl__ev_child *child,
+     STATE_AO_GC(bl->ao);
+     int rc;
+ 
++    libxl__ev_time_deregister(gc, &bl->time);
+     libxl__datacopier_kill(&bl->keystrokes);
+     libxl__datacopier_kill(&bl->display);
+ 
+diff --git a/tools/libs/light/libxl_internal.h b/tools/libs/light/libxl_internal.h
+index 8415d1feed16..a9581289f462 100644
+--- tools/libs/light/libxl_internal.h.orig
++++ tools/libs/light/libxl_internal.h
+@@ -103,6 +103,7 @@
+ #define LIBXL_QMP_CMD_TIMEOUT 10
+ #define LIBXL_STUBDOM_START_TIMEOUT 30
+ #define LIBXL_QEMU_BODGE_TIMEOUT 2
++#define LIBXL_BOOTLOADER_TIMEOUT 120
+ #define LIBXL_XENCONSOLE_LIMIT 1048576
+ #define LIBXL_XENCONSOLE_PROTOCOL "vt100"
+ #define LIBXL_MAXMEM_CONSTANT 1024
+@@ -3738,6 +3739,7 @@ struct libxl__bootloader_state {
+     libxl__openpty_state openpty;
+     libxl__openpty_result ptys[2];  /* [0] is for bootloader */
+     libxl__ev_child child;
++    libxl__ev_time time;
+     libxl__domaindeathcheck deathcheck;
+     int nargs, argsspace;
+     const char **args;
+-- 
+2.42.0
+



Home | Main Index | Thread Index | Old Index