NetBSD-Bugs archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
Re: port-amd64/57661: Crash when booting on Xeon Silver 4416+ in KVM/Qemu
The following reply was made to PR port-amd64/57661; it has been noted by GNATS.
From: Taylor R Campbell <riastradh%NetBSD.org@localhost>
To: Harold Gutch <logix%foobar.franken.de@localhost>
Cc: gnats-bugs%netbsd.org@localhost, port-amd64-maintainer%netbsd.org@localhost,
gnats-admin%netbsd.org@localhost, netbsd-bugs%netbsd.org@localhost,
manu%NetBSD.org@localhost, andrew.cagney%gmail.com@localhost
Subject: Re: port-amd64/57661: Crash when booting on Xeon Silver 4416+ in KVM/Qemu
Date: Fri, 11 Apr 2025 21:09:44 +0000
This is a multi-part message in MIME format.
--=_9iRVuQui7SP0XI0my0i42KB+RbbJU+ZS
Thanks, can you try with this patch?
It incurs less cost on machines with smaller FPU save sizes -- a few
branches and an extra pointer indirection in the save/restore path,
but no pages of memory wasted on every thread. But it's a lot more
involved, so it's more likely I've screwed something up.
(Another alternative would be to dynamically change USPACE, but that
is also pretty involved and likely to get things wrong -- plus it
still requires allocating the pages for kernel threads which
will never use them.)
--=_9iRVuQui7SP0XI0my0i42KB+RbbJU+ZS
Content-Type: text/plain; charset="ISO-8859-1"; name="pr57661-x86fpusaveoverflow-separatefpusave"
Content-Transfer-Encoding: quoted-printable
Content-Disposition: attachment; filename="pr57661-x86fpusaveoverflow-separatefpusave.patch"
diff -r 1cb0546d18b6 sys/arch/amd64/amd64/genassym.cf
--- a/sys/arch/amd64/amd64/genassym.cf Thu Apr 10 18:53:29 2025 +0000
+++ b/sys/arch/amd64/amd64/genassym.cf Fri Apr 11 21:07:26 2025 +0000
@@ -186,7 +186,6 @@ define PCB_FLAGS offsetof(struct pcb, p
define PCB_COMPAT32 PCB_COMPAT32
define PCB_FS offsetof(struct pcb, pcb_fs)
define PCB_GS offsetof(struct pcb, pcb_gs)
-define PCB_SAVEFPU offsetof(struct pcb, pcb_savefpu)
=20
define TF_RDI offsetof(struct trapframe, tf_rdi)
define TF_RSI offsetof(struct trapframe, tf_rsi)
diff -r 1cb0546d18b6 sys/arch/amd64/include/param.h
--- a/sys/arch/amd64/include/param.h Thu Apr 10 18:53:29 2025 +0000
+++ b/sys/arch/amd64/include/param.h Fri Apr 11 21:07:26 2025 +0000
@@ -69,11 +69,27 @@
#define SINCR 1 /* increment of stack/NBPG */
=20
#if defined(KASAN) || defined(KMSAN)
-#define UPAGES 8
+#define UPAGES_KxSAN 2
+#else
+#define UPAGES_KxSAN 0
+#endif
+#if defined(SVS)
+#define UPAGES_SVS 1
+#else
+#define UPAGES_SVS 0
+#endif
+#define UPAGES_PCB 1 /* one page for the PCB */
+#define UPAGES_RED 1 /* one page for red zone between pcb/stack */
+#define UPAGES_STACK 3 /* three pages (12 KiB) of stack space */
+#define UPAGES \
+ (UPAGES_PCB + UPAGES_RED + UPAGES_STACK + UPAGES_SVS + UPAGES_KxSAN)
+
+#if defined(KASAN) || defined(KMSAN)
+__CTASSERT(UPAGES =3D=3D 8);
#elif defined(SVS)
-#define UPAGES 6 /* 1 page used internally by SVS */
+__CTASSERT(UPAGES =3D=3D 6);
#else
-#define UPAGES 5 /* pages of u-area (1 for redzone) */
+__CTASSERT(UPAGES =3D=3D 5);
#endif
#define USPACE (UPAGES * NBPG) /* total size of u-area */
=20
diff -r 1cb0546d18b6 sys/arch/amd64/include/pcb.h
--- a/sys/arch/amd64/include/pcb.h Thu Apr 10 18:53:29 2025 +0000
+++ b/sys/arch/amd64/include/pcb.h Fri Apr 11 21:07:26 2025 +0000
@@ -97,13 +97,15 @@ struct pcb {
struct dbreg *pcb_dbregs;
uint16_t pcb_fpu_dflt_cw;
int pcb_iopl;
+ union savefpu *pcb_savefpu;
=20
- uint32_t pcb_unused[8]; /* unused */
+ uint32_t pcb_unused[6]; /* unused */
=20
- union savefpu pcb_savefpu __aligned(64); /* floating point state */
+ union savefpu pcb_savefpusmall __aligned(64); /* small fpu state */
/* **** DO NOT ADD ANYTHING HERE **** */
};
#ifndef __lint__
+__CTASSERT(offsetof(struct pcb, pcb_savefpusmall) =3D=3D 128);
__CTASSERT(sizeof(struct pcb) - sizeof (union savefpu) =3D=3D 128);
#endif
=20
diff -r 1cb0546d18b6 sys/arch/i386/i386/genassym.cf
--- a/sys/arch/i386/i386/genassym.cf Thu Apr 10 18:53:29 2025 +0000
+++ b/sys/arch/i386/i386/genassym.cf Fri Apr 11 21:07:26 2025 +0000
@@ -192,7 +192,6 @@ define PCB_ESP0 offsetof(struct pcb, pc
define PCB_FSD offsetof(struct pcb, pcb_fsd)
define PCB_GSD offsetof(struct pcb, pcb_gsd)
define PCB_IOMAP offsetof(struct pcb, pcb_iomap)
-define PCB_SAVEFPU offsetof(struct pcb, pcb_savefpu)
=20
define TF_CS offsetof(struct trapframe, tf_cs)
define TF_EIP offsetof(struct trapframe, tf_eip)
diff -r 1cb0546d18b6 sys/arch/i386/include/pcb.h
--- a/sys/arch/i386/include/pcb.h Thu Apr 10 18:53:29 2025 +0000
+++ b/sys/arch/i386/include/pcb.h Fri Apr 11 21:07:26 2025 +0000
@@ -95,16 +95,18 @@ struct pcb {
=20
#define PCB_DBREGS 0x01
int pcb_flags;
-
- int not_used[15];
+ union savefpu *pcb_savefpu;
=20
- /* floating point state */
- union savefpu pcb_savefpu __aligned(64);
+ int not_used[14];
+
+ /* small fpu state */
+ union savefpu pcb_savefpusmall __aligned(64);
/* **** DO NOT ADD ANYTHING HERE **** */
=20
};
#ifndef __lint__
/* This doesn't really matter, but there is a lot of implied padding */
+__CTASSERT(offsetof(struct pcb, pcb_savefpusmall) =3D=3D 128);
__CTASSERT(sizeof(struct pcb) - sizeof (union savefpu) =3D=3D 128);
#endif
=20
diff -r 1cb0546d18b6 sys/arch/x86/include/cpu.h
--- a/sys/arch/x86/include/cpu.h Thu Apr 10 18:53:29 2025 +0000
+++ b/sys/arch/x86/include/cpu.h Fri Apr 11 21:07:26 2025 +0000
@@ -481,6 +481,7 @@ extern uint64_t x86_xsave_features;
extern size_t x86_xsave_offsets[];
extern size_t x86_xsave_sizes[];
extern uint32_t x86_fpu_mxcsr_mask;
+bool x86_fpu_save_separate_p(void);
=20
extern void (*x86_cpu_idle)(void);
#define cpu_idle() (*x86_cpu_idle)()
diff -r 1cb0546d18b6 sys/arch/x86/include/cpu_extended_state.h
--- a/sys/arch/x86/include/cpu_extended_state.h Thu Apr 10 18:53:29 2025 +0=
000
+++ b/sys/arch/x86/include/cpu_extended_state.h Fri Apr 11 21:07:26 2025 +0=
000
@@ -226,6 +226,8 @@ struct xstate {
* It is defined this way to separate the definitions and to
* minimise the number of union/struct selectors.
* NB: Some userspace stuff (eg firefox) uses it to parse ucontext.
+ * NB: This is not actually the largest possible save space;
+ * x86_fpu_save_size may be larger.
*/
union savefpu {
struct save87 sv_87;
diff -r 1cb0546d18b6 sys/arch/x86/x86/fpu.c
--- a/sys/arch/x86/x86/fpu.c Thu Apr 10 18:53:29 2025 +0000
+++ b/sys/arch/x86/x86/fpu.c Fri Apr 11 21:07:26 2025 +0000
@@ -136,11 +136,35 @@ void fpu_switch(struct lwp *, struct lwp
=20
uint32_t x86_fpu_mxcsr_mask __read_mostly =3D 0;
=20
+static const union savefpu safe_fpu_storage __aligned(64) =3D {
+ .sv_xmm =3D {
+ .fx_mxcsr =3D __SAFE_MXCSR__,
+ },
+};
+static const union savefpu zero_fpu_storage __aligned(64);
+
+static const void *safe_fpu __read_mostly =3D &safe_fpu_storage;
+static const void *zero_fpu __read_mostly =3D &zero_fpu_storage;
+
+/*
+ * x86_fpu_save_separate_p()
+ *
+ * True if we allocate the FPU save space separately, outside the
+ * struct pcb itself, because it doesn't fit in a single page.
+ */
+bool
+x86_fpu_save_separate_p(void)
+{
+
+ return x86_fpu_save_size >
+ PAGE_SIZE - offsetof(struct pcb, pcb_savefpusmall);
+}
+
static inline union savefpu *
fpu_lwp_area(struct lwp *l)
{
struct pcb *pcb =3D lwp_getpcb(l);
- union savefpu *area =3D &pcb->pcb_savefpu;
+ union savefpu *area =3D pcb->pcb_savefpu;
=20
KASSERT((l->l_flag & LW_SYSTEM) =3D=3D 0);
if (l =3D=3D curlwp) {
@@ -155,7 +179,7 @@ static inline void
fpu_save_lwp(struct lwp *l)
{
struct pcb *pcb =3D lwp_getpcb(l);
- union savefpu *area =3D &pcb->pcb_savefpu;
+ union savefpu *area =3D pcb->pcb_savefpu;
int s;
=20
s =3D splvm();
@@ -192,11 +216,55 @@ fpuinit(struct cpu_info *ci)
void
fpuinit_mxcsr_mask(void)
{
+ /*
+ * If the CPU's x86 fpu save size is larger than union savefpu,
+ * we have to allocate larger buffers for the safe and zero FPU
+ * states used here and by fpu_kern_enter/leave.
+ *
+ * Note: This is NOT the same as x86_fpu_save_separate_p(),
+ * which may have a little more space than union savefpu.
+ */
+ const bool allocfpusave =3D x86_fpu_save_size > sizeof(union savefpu);
+ vaddr_t va;
+
+#ifdef __i386__
+ if (x86_fpu_save_separate_p()) {
+ /*
+ * XXX Need to teach cpu_uarea_alloc/free to allocate a
+ * separate fpu save space -- currently only done on
+ * amd64, not on i386. Also need to make fpu_lwp_fork
+ * not clobber pcb_savefpu.
+ */
+ panic("NetBSD/i386 does not support fpu save size %u",
+ x86_fpu_save_size);
+ }
+#endif
+
#ifndef XENPV
- union savefpu fpusave __aligned(64);
+ union savefpu fpusave_stack __aligned(64);
+ union savefpu *fpusave;
u_long psl;
=20
- memset(&fpusave, 0, sizeof(fpusave));
+ /*
+ * Allocate a temporary save space from the stack if it fits,
+ * or from the heap otherwise, so we can query its mxcsr mask.
+ */
+ if (allocfpusave) {
+ /*
+ * Need 64-byte alignment for XSAVE instructions.
+ * kmem_* doesn't guarantee that and we don't have a
+ * handy posix_memalign in the kernel unless we hack it
+ * ourselves with vmem(9), so just ask for page
+ * alignment with uvm_km(9).
+ */
+ __CTASSERT(PAGE_SIZE >=3D 64);
+ va =3D uvm_km_alloc(kernel_map, x86_fpu_save_size, PAGE_SIZE,
+ UVM_KMF_WIRED|UVM_KMF_ZERO|UVM_KMF_WAITVA);
+ fpusave =3D (void *)va;
+ } else {
+ fpusave =3D &fpusave_stack;
+ memset(fpusave, 0, sizeof(*fpusave));
+ }
=20
/* Disable interrupts, and enable FPU */
psl =3D x86_read_psl();
@@ -204,16 +272,25 @@ fpuinit_mxcsr_mask(void)
clts();
=20
/* Fill in the FPU area */
- fxsave(&fpusave);
+ fxsave(fpusave);
=20
/* Restore previous state */
stts();
x86_write_psl(psl);
=20
- if (fpusave.sv_xmm.fx_mxcsr_mask =3D=3D 0) {
+ if (fpusave->sv_xmm.fx_mxcsr_mask =3D=3D 0) {
x86_fpu_mxcsr_mask =3D __INITIAL_MXCSR_MASK__;
} else {
- x86_fpu_mxcsr_mask =3D fpusave.sv_xmm.fx_mxcsr_mask;
+ x86_fpu_mxcsr_mask =3D fpusave->sv_xmm.fx_mxcsr_mask;
+ }
+
+ /*
+ * Free the temporary save space.
+ */
+ if (allocfpusave) {
+ uvm_km_free(kernel_map, va, x86_fpu_save_size, UVM_KMF_WIRED);
+ fpusave =3D NULL;
+ va =3D 0;
}
#else
/*
@@ -223,6 +300,26 @@ fpuinit_mxcsr_mask(void)
*/
x86_fpu_mxcsr_mask =3D __INITIAL_MXCSR_MASK__;
#endif
+
+ /*
+ * If necessary, allocate FPU save spaces for safe or zero FPU
+ * state, for fpu_kern_enter/leave.
+ */
+ if (allocfpusave) {
+ __CTASSERT(PAGE_SIZE >=3D 64);
+
+ va =3D uvm_km_alloc(kernel_map, x86_fpu_save_size, PAGE_SIZE,
+ UVM_KMF_WIRED|UVM_KMF_ZERO|UVM_KMF_WAITVA);
+ memcpy((void *)va, &safe_fpu_storage,
+ sizeof(safe_fpu_storage));
+ safe_fpu =3D (void *)va;
+
+ va =3D uvm_km_alloc(kernel_map, x86_fpu_save_size, PAGE_SIZE,
+ UVM_KMF_WIRED|UVM_KMF_ZERO|UVM_KMF_WAITVA);
+ memcpy((void *)va, &zero_fpu_storage,
+ sizeof(zero_fpu_storage));
+ zero_fpu =3D (void *)va;
+ }
}
=20
static inline void
@@ -305,7 +402,7 @@ void
fpu_handle_deferred(void)
{
struct pcb *pcb =3D lwp_getpcb(curlwp);
- fpu_area_restore(&pcb->pcb_savefpu, x86_xsave_features,
+ fpu_area_restore(pcb->pcb_savefpu, x86_xsave_features,
!(curlwp->l_proc->p_flag & PK_32));
}
=20
@@ -321,7 +418,7 @@ fpu_switch(struct lwp *oldlwp, struct lw
if (oldlwp->l_md.md_flags & MDL_FPU_IN_CPU) {
KASSERT(!(oldlwp->l_flag & LW_SYSTEM));
pcb =3D lwp_getpcb(oldlwp);
- fpu_area_save(&pcb->pcb_savefpu, x86_xsave_features,
+ fpu_area_save(pcb->pcb_savefpu, x86_xsave_features,
!(oldlwp->l_proc->p_flag & PK_32));
oldlwp->l_md.md_flags &=3D ~MDL_FPU_IN_CPU;
}
@@ -338,14 +435,28 @@ fpu_lwp_fork(struct lwp *l1, struct lwp=20
if (__predict_false(l2->l_flag & LW_SYSTEM)) {
return;
}
+
+#ifdef __i386__
+ /*
+ * XXX i386 currently never allocates separate FPU save space
+ * (so it's limited to 4096 - 128 bytes of space available in
+ * the pcb page). Changing this requires removing the panic at
+ * boot in fpuinit_mxcsr_mask and adopting something like the
+ * uarea allocation in cpu_uarea_alloc/free like amd64.
+ */
+ KASSERT(x86_fpu_save_size <=3D
+ PAGE_SIZE - offsetof(struct pcb, pcb_savefpusmall));
+ pcb2->pcb_savefpu =3D &pcb2->pcb_savefpusmall;
+#endif
+
/* For init(8). */
if (__predict_false(l1->l_flag & LW_SYSTEM)) {
- memset(&pcb2->pcb_savefpu, 0, x86_fpu_save_size);
+ memset(pcb2->pcb_savefpu, 0, x86_fpu_save_size);
return;
}
=20
fpu_save =3D fpu_lwp_area(l1);
- memcpy(&pcb2->pcb_savefpu, fpu_save, x86_fpu_save_size);
+ memcpy(pcb2->pcb_savefpu, fpu_save, x86_fpu_save_size);
l2->l_md.md_flags &=3D ~MDL_FPU_IN_CPU;
}
=20
@@ -378,11 +489,6 @@ fpu_lwp_abandon(struct lwp *l)
void
fpu_kern_enter(void)
{
- static const union savefpu safe_fpu __aligned(64) =3D {
- .sv_xmm =3D {
- .fx_mxcsr =3D __SAFE_MXCSR__,
- },
- };
struct lwp *l =3D curlwp;
struct cpu_info *ci;
int s;
@@ -421,7 +527,7 @@ fpu_kern_enter(void)
/*
* Zero the FPU registers and install safe control words.
*/
- fpu_area_restore(&safe_fpu, x86_xsave_features, /*is_64bit*/false);
+ fpu_area_restore(safe_fpu, x86_xsave_features, /*is_64bit*/false);
}
=20
/*
@@ -432,7 +538,6 @@ fpu_kern_enter(void)
void
fpu_kern_leave(void)
{
- static const union savefpu zero_fpu __aligned(64);
struct cpu_info *ci =3D curcpu();
int s;
=20
@@ -451,7 +556,7 @@ fpu_kern_leave(void)
* through Spectre-class attacks to userland, even if there are
* no bugs in fpu state management.
*/
- fpu_area_restore(&zero_fpu, x86_xsave_features, /*is_64bit*/false);
+ fpu_area_restore(zero_fpu, x86_xsave_features, /*is_64bit*/false);
=20
/*
* Set CR0_TS again so that the kernel can't accidentally use
diff -r 1cb0546d18b6 sys/arch/x86/x86/vm_machdep.c
--- a/sys/arch/x86/x86/vm_machdep.c Thu Apr 10 18:53:29 2025 +0000
+++ b/sys/arch/x86/x86/vm_machdep.c Fri Apr 11 21:07:26 2025 +0000
@@ -366,10 +366,42 @@ cpu_uarea_alloc(bool system)
{
vaddr_t base, va;
paddr_t pa;
+ struct pcb *pcb;
=20
base =3D uvm_km_alloc(kernel_map, USPACE + PAGE_SIZE, 0,
UVM_KMF_WIRED|UVM_KMF_WAITVA);
=20
+ /*
+ * Prepare the FPU save area:
+ *
+ * 1. If this is a system thread, no save area.
+ * XXX Allocate/free one in kthread_fpu_enter/exit_md.
+ *
+ * 2. If this is a user thread, and the fpu save size is large
+ * enough, allocate an extra block of memory for it.
+ *
+ * 3. Otherwise, this is a user thread and the fpu save size
+ * fits inside the pcb page, so use that.
+ *
+ * XXX Note that this is currently amd64-only -- if you extend
+ * this FPU save space allocation to i386, make sure to update
+ * fpu_lwp_fork so it doesn't clobber the pcb_savefpu pointer.
+ * We panic on i386 boot if the CPU needs more space in
+ * fpuinit_mxcsr_mask.
+ */
+ pcb =3D (void *)base;
+ if (system) { /* (1) */
+ pcb->pcb_savefpu =3D NULL;
+ } else if (x86_fpu_save_separate_p()) { /* (2) */
+ __CTASSERT(PAGE_SIZE >=3D 64);
+ /* No need to zero -- caller will initialize. */
+ va =3D uvm_km_alloc(kernel_map, x86_fpu_save_size, PAGE_SIZE,
+ UVM_KMF_WIRED|UVM_KMF_WAITVA);
+ pcb->pcb_savefpu =3D (void *)va;
+ } else { /* (3) */
+ pcb->pcb_savefpu =3D &pcb->pcb_savefpusmall;
+ }
+
/* Page[1] =3D RedZone */
va =3D base + PAGE_SIZE;
if (!pmap_extract(pmap_kernel(), va, &pa)) {
@@ -394,8 +426,20 @@ cpu_uarea_alloc(bool system)
bool
cpu_uarea_free(void *addr)
{
+ const struct pcb *const pcb =3D addr;
vaddr_t base =3D (vaddr_t)addr;
=20
+ /*
+ * If we allocated a separate FPU save area, free it.
+ */
+ if (pcb->pcb_savefpu !=3D NULL &&
+ pcb->pcb_savefpu !=3D &pcb->pcb_savefpusmall) {
+ KASSERTMSG(x86_fpu_save_separate_p(), "pcb=3D%p pcb_savefpu=3D%p",
+ pcb, pcb->pcb_savefpu);
+ uvm_km_free(kernel_map, (vaddr_t)pcb->pcb_savefpu,
+ x86_fpu_save_size, UVM_KMF_WIRED);
+ }
+
KASSERT(!pmap_extract(pmap_kernel(), base + PAGE_SIZE, NULL));
KASSERT(!pmap_extract(pmap_kernel(), base + USPACE, NULL));
uvm_km_free(kernel_map, base, USPACE + PAGE_SIZE, UVM_KMF_WIRED);
--=_9iRVuQui7SP0XI0my0i42KB+RbbJU+ZS--
Home |
Main Index |
Thread Index |
Old Index