NetBSD-Bugs archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

Re: port-amd64/57661: Crash when booting on Xeon Silver 4416+ in KVM/Qemu



Thanks, can you try with this patch?

It incurs less cost on machines with smaller FPU save sizes -- a few
branches and an extra pointer indirection in the save/restore path,
but no pages of memory wasted on every thread.  But it's a lot more
involved, so it's more likely I've screwed something up.

(Another alternative would be to dynamically change USPACE, but that
is also pretty involved and likely to get things wrong -- plus it
still requires allocating the pages for kernel threads which
will never use them.)
diff -r 1cb0546d18b6 sys/arch/amd64/amd64/genassym.cf
--- a/sys/arch/amd64/amd64/genassym.cf	Thu Apr 10 18:53:29 2025 +0000
+++ b/sys/arch/amd64/amd64/genassym.cf	Fri Apr 11 21:07:26 2025 +0000
@@ -186,7 +186,6 @@ define	PCB_FLAGS		offsetof(struct pcb, p
 define	PCB_COMPAT32		PCB_COMPAT32
 define	PCB_FS			offsetof(struct pcb, pcb_fs)
 define	PCB_GS			offsetof(struct pcb, pcb_gs)
-define	PCB_SAVEFPU		offsetof(struct pcb, pcb_savefpu)
 
 define	TF_RDI			offsetof(struct trapframe, tf_rdi)
 define	TF_RSI			offsetof(struct trapframe, tf_rsi)
diff -r 1cb0546d18b6 sys/arch/amd64/include/param.h
--- a/sys/arch/amd64/include/param.h	Thu Apr 10 18:53:29 2025 +0000
+++ b/sys/arch/amd64/include/param.h	Fri Apr 11 21:07:26 2025 +0000
@@ -69,11 +69,27 @@
 #define	SINCR		1		/* increment of stack/NBPG */
 
 #if defined(KASAN) || defined(KMSAN)
-#define	UPAGES		8
+#define UPAGES_KxSAN	2
+#else
+#define	UPAGES_KxSAN	0
+#endif
+#if defined(SVS)
+#define	UPAGES_SVS	1
+#else
+#define	UPAGES_SVS	0
+#endif
+#define	UPAGES_PCB	1	/* one page for the PCB */
+#define	UPAGES_RED	1	/* one page for red zone between pcb/stack */
+#define	UPAGES_STACK	3	/* three pages (12 KiB) of stack space */
+#define	UPAGES		\
+	(UPAGES_PCB + UPAGES_RED + UPAGES_STACK + UPAGES_SVS + UPAGES_KxSAN)
+
+#if defined(KASAN) || defined(KMSAN)
+__CTASSERT(UPAGES == 8);
 #elif defined(SVS)
-#define	UPAGES		6		/* 1 page used internally by SVS */
+__CTASSERT(UPAGES == 6);
 #else
-#define	UPAGES		5		/* pages of u-area (1 for redzone) */
+__CTASSERT(UPAGES == 5);
 #endif
 #define	USPACE		(UPAGES * NBPG)	/* total size of u-area */
 
diff -r 1cb0546d18b6 sys/arch/amd64/include/pcb.h
--- a/sys/arch/amd64/include/pcb.h	Thu Apr 10 18:53:29 2025 +0000
+++ b/sys/arch/amd64/include/pcb.h	Fri Apr 11 21:07:26 2025 +0000
@@ -97,13 +97,15 @@ struct pcb {
 	struct dbreg *pcb_dbregs;
 	uint16_t pcb_fpu_dflt_cw;
 	int pcb_iopl;
+	union savefpu *pcb_savefpu;
 
-	uint32_t pcb_unused[8];		/* unused */
+	uint32_t pcb_unused[6];		/* unused */
 
-	union savefpu	pcb_savefpu __aligned(64); /* floating point state */
+	union savefpu	pcb_savefpusmall __aligned(64); /* small fpu state */
 	/* **** DO NOT ADD ANYTHING HERE **** */
 };
 #ifndef __lint__
+__CTASSERT(offsetof(struct pcb, pcb_savefpusmall) == 128);
 __CTASSERT(sizeof(struct pcb) - sizeof (union savefpu) ==  128);
 #endif
 
diff -r 1cb0546d18b6 sys/arch/i386/i386/genassym.cf
--- a/sys/arch/i386/i386/genassym.cf	Thu Apr 10 18:53:29 2025 +0000
+++ b/sys/arch/i386/i386/genassym.cf	Fri Apr 11 21:07:26 2025 +0000
@@ -192,7 +192,6 @@ define	PCB_ESP0		offsetof(struct pcb, pc
 define	PCB_FSD			offsetof(struct pcb, pcb_fsd)
 define	PCB_GSD			offsetof(struct pcb, pcb_gsd)
 define	PCB_IOMAP		offsetof(struct pcb, pcb_iomap)
-define	PCB_SAVEFPU		offsetof(struct pcb, pcb_savefpu)
 
 define	TF_CS			offsetof(struct trapframe, tf_cs)
 define	TF_EIP			offsetof(struct trapframe, tf_eip)
diff -r 1cb0546d18b6 sys/arch/i386/include/pcb.h
--- a/sys/arch/i386/include/pcb.h	Thu Apr 10 18:53:29 2025 +0000
+++ b/sys/arch/i386/include/pcb.h	Fri Apr 11 21:07:26 2025 +0000
@@ -95,16 +95,18 @@ struct pcb {
 
 #define	PCB_DBREGS	0x01
 	int	pcb_flags;
-
-	int	not_used[15];
+	union savefpu	*pcb_savefpu;
 
-	/* floating point state */
-	union savefpu	pcb_savefpu __aligned(64);
+	int	not_used[14];
+
+	/* small fpu state */
+	union savefpu	pcb_savefpusmall __aligned(64);
 	/* **** DO NOT ADD ANYTHING HERE **** */
 
 };
 #ifndef __lint__
 /* This doesn't really matter, but there is a lot of implied padding */
+__CTASSERT(offsetof(struct pcb, pcb_savefpusmall) == 128);
 __CTASSERT(sizeof(struct pcb) - sizeof (union savefpu) == 128);
 #endif
 
diff -r 1cb0546d18b6 sys/arch/x86/include/cpu.h
--- a/sys/arch/x86/include/cpu.h	Thu Apr 10 18:53:29 2025 +0000
+++ b/sys/arch/x86/include/cpu.h	Fri Apr 11 21:07:26 2025 +0000
@@ -481,6 +481,7 @@ extern uint64_t x86_xsave_features;
 extern size_t x86_xsave_offsets[];
 extern size_t x86_xsave_sizes[];
 extern uint32_t x86_fpu_mxcsr_mask;
+bool x86_fpu_save_separate_p(void);
 
 extern void (*x86_cpu_idle)(void);
 #define	cpu_idle() (*x86_cpu_idle)()
diff -r 1cb0546d18b6 sys/arch/x86/include/cpu_extended_state.h
--- a/sys/arch/x86/include/cpu_extended_state.h	Thu Apr 10 18:53:29 2025 +0000
+++ b/sys/arch/x86/include/cpu_extended_state.h	Fri Apr 11 21:07:26 2025 +0000
@@ -226,6 +226,8 @@ struct xstate {
  * It is defined this way to separate the definitions and to
  * minimise the number of union/struct selectors.
  * NB: Some userspace stuff (eg firefox) uses it to parse ucontext.
+ * NB: This is not actually the largest possible save space;
+ *     x86_fpu_save_size may be larger.
  */
 union savefpu {
 	struct save87 sv_87;
diff -r 1cb0546d18b6 sys/arch/x86/x86/fpu.c
--- a/sys/arch/x86/x86/fpu.c	Thu Apr 10 18:53:29 2025 +0000
+++ b/sys/arch/x86/x86/fpu.c	Fri Apr 11 21:07:26 2025 +0000
@@ -136,11 +136,35 @@ void fpu_switch(struct lwp *, struct lwp
 
 uint32_t x86_fpu_mxcsr_mask __read_mostly = 0;
 
+static const union savefpu safe_fpu_storage __aligned(64) = {
+	.sv_xmm = {
+		.fx_mxcsr = __SAFE_MXCSR__,
+	},
+};
+static const union savefpu zero_fpu_storage __aligned(64);
+
+static const void *safe_fpu __read_mostly = &safe_fpu_storage;
+static const void *zero_fpu __read_mostly = &zero_fpu_storage;
+
+/*
+ * x86_fpu_save_separate_p()
+ *
+ *	True if we allocate the FPU save space separately, outside the
+ *	struct pcb itself, because it doesn't fit in a single page.
+ */
+bool
+x86_fpu_save_separate_p(void)
+{
+
+	return x86_fpu_save_size >
+	    PAGE_SIZE - offsetof(struct pcb, pcb_savefpusmall);
+}
+
 static inline union savefpu *
 fpu_lwp_area(struct lwp *l)
 {
 	struct pcb *pcb = lwp_getpcb(l);
-	union savefpu *area = &pcb->pcb_savefpu;
+	union savefpu *area = pcb->pcb_savefpu;
 
 	KASSERT((l->l_flag & LW_SYSTEM) == 0);
 	if (l == curlwp) {
@@ -155,7 +179,7 @@ static inline void
 fpu_save_lwp(struct lwp *l)
 {
 	struct pcb *pcb = lwp_getpcb(l);
-	union savefpu *area = &pcb->pcb_savefpu;
+	union savefpu *area = pcb->pcb_savefpu;
 	int s;
 
 	s = splvm();
@@ -192,11 +216,55 @@ fpuinit(struct cpu_info *ci)
 void
 fpuinit_mxcsr_mask(void)
 {
+	/*
+	 * If the CPU's x86 fpu save size is larger than union savefpu,
+	 * we have to allocate larger buffers for the safe and zero FPU
+	 * states used here and by fpu_kern_enter/leave.
+	 *
+	 * Note: This is NOT the same as x86_fpu_save_separate_p(),
+	 * which may have a little more space than union savefpu.
+	 */
+	const bool allocfpusave = x86_fpu_save_size > sizeof(union savefpu);
+	vaddr_t va;
+
+#ifdef __i386__
+	if (x86_fpu_save_separate_p()) {
+		/*
+		 * XXX Need to teach cpu_uarea_alloc/free to allocate a
+		 * separate fpu save space -- currently only done on
+		 * amd64, not on i386.  Also need to make fpu_lwp_fork
+		 * not clobber pcb_savefpu.
+		 */
+		panic("NetBSD/i386 does not support fpu save size %u",
+		    x86_fpu_save_size);
+	}
+#endif
+
 #ifndef XENPV
-	union savefpu fpusave __aligned(64);
+	union savefpu fpusave_stack __aligned(64);
+	union savefpu *fpusave;
 	u_long psl;
 
-	memset(&fpusave, 0, sizeof(fpusave));
+	/*
+	 * Allocate a temporary save space from the stack if it fits,
+	 * or from the heap otherwise, so we can query its mxcsr mask.
+	 */
+	if (allocfpusave) {
+		/*
+		 * Need 64-byte alignment for XSAVE instructions.
+		 * kmem_* doesn't guarantee that and we don't have a
+		 * handy posix_memalign in the kernel unless we hack it
+		 * ourselves with vmem(9), so just ask for page
+		 * alignment with uvm_km(9).
+		 */
+		__CTASSERT(PAGE_SIZE >= 64);
+		va = uvm_km_alloc(kernel_map, x86_fpu_save_size, PAGE_SIZE,
+		    UVM_KMF_WIRED|UVM_KMF_ZERO|UVM_KMF_WAITVA);
+		fpusave = (void *)va;
+	} else {
+		fpusave = &fpusave_stack;
+		memset(fpusave, 0, sizeof(*fpusave));
+	}
 
 	/* Disable interrupts, and enable FPU */
 	psl = x86_read_psl();
@@ -204,16 +272,25 @@ fpuinit_mxcsr_mask(void)
 	clts();
 
 	/* Fill in the FPU area */
-	fxsave(&fpusave);
+	fxsave(fpusave);
 
 	/* Restore previous state */
 	stts();
 	x86_write_psl(psl);
 
-	if (fpusave.sv_xmm.fx_mxcsr_mask == 0) {
+	if (fpusave->sv_xmm.fx_mxcsr_mask == 0) {
 		x86_fpu_mxcsr_mask = __INITIAL_MXCSR_MASK__;
 	} else {
-		x86_fpu_mxcsr_mask = fpusave.sv_xmm.fx_mxcsr_mask;
+		x86_fpu_mxcsr_mask = fpusave->sv_xmm.fx_mxcsr_mask;
+	}
+
+	/*
+	 * Free the temporary save space.
+	 */
+	if (allocfpusave) {
+		uvm_km_free(kernel_map, va, x86_fpu_save_size, UVM_KMF_WIRED);
+		fpusave = NULL;
+		va = 0;
 	}
 #else
 	/*
@@ -223,6 +300,26 @@ fpuinit_mxcsr_mask(void)
 	 */
 	x86_fpu_mxcsr_mask = __INITIAL_MXCSR_MASK__;
 #endif
+
+	/*
+	 * If necessary, allocate FPU save spaces for safe or zero FPU
+	 * state, for fpu_kern_enter/leave.
+	 */
+	if (allocfpusave) {
+		__CTASSERT(PAGE_SIZE >= 64);
+
+		va = uvm_km_alloc(kernel_map, x86_fpu_save_size, PAGE_SIZE,
+		    UVM_KMF_WIRED|UVM_KMF_ZERO|UVM_KMF_WAITVA);
+		memcpy((void *)va, &safe_fpu_storage,
+		    sizeof(safe_fpu_storage));
+		safe_fpu = (void *)va;
+
+		va = uvm_km_alloc(kernel_map, x86_fpu_save_size, PAGE_SIZE,
+		    UVM_KMF_WIRED|UVM_KMF_ZERO|UVM_KMF_WAITVA);
+		memcpy((void *)va, &zero_fpu_storage,
+		    sizeof(zero_fpu_storage));
+		zero_fpu = (void *)va;
+	}
 }
 
 static inline void
@@ -305,7 +402,7 @@ void
 fpu_handle_deferred(void)
 {
 	struct pcb *pcb = lwp_getpcb(curlwp);
-	fpu_area_restore(&pcb->pcb_savefpu, x86_xsave_features,
+	fpu_area_restore(pcb->pcb_savefpu, x86_xsave_features,
 	    !(curlwp->l_proc->p_flag & PK_32));
 }
 
@@ -321,7 +418,7 @@ fpu_switch(struct lwp *oldlwp, struct lw
 	if (oldlwp->l_md.md_flags & MDL_FPU_IN_CPU) {
 		KASSERT(!(oldlwp->l_flag & LW_SYSTEM));
 		pcb = lwp_getpcb(oldlwp);
-		fpu_area_save(&pcb->pcb_savefpu, x86_xsave_features,
+		fpu_area_save(pcb->pcb_savefpu, x86_xsave_features,
 		    !(oldlwp->l_proc->p_flag & PK_32));
 		oldlwp->l_md.md_flags &= ~MDL_FPU_IN_CPU;
 	}
@@ -338,14 +435,28 @@ fpu_lwp_fork(struct lwp *l1, struct lwp 
 	if (__predict_false(l2->l_flag & LW_SYSTEM)) {
 		return;
 	}
+
+#ifdef __i386__
+	/*
+	 * XXX i386 currently never allocates separate FPU save space
+	 * (so it's limited to 4096 - 128 bytes of space available in
+	 * the pcb page).  Changing this requires removing the panic at
+	 * boot in fpuinit_mxcsr_mask and adopting something like the
+	 * uarea allocation in cpu_uarea_alloc/free like amd64.
+	 */
+	KASSERT(x86_fpu_save_size <=
+	    PAGE_SIZE - offsetof(struct pcb, pcb_savefpusmall));
+	pcb2->pcb_savefpu = &pcb2->pcb_savefpusmall;
+#endif
+
 	/* For init(8). */
 	if (__predict_false(l1->l_flag & LW_SYSTEM)) {
-		memset(&pcb2->pcb_savefpu, 0, x86_fpu_save_size);
+		memset(pcb2->pcb_savefpu, 0, x86_fpu_save_size);
 		return;
 	}
 
 	fpu_save = fpu_lwp_area(l1);
-	memcpy(&pcb2->pcb_savefpu, fpu_save, x86_fpu_save_size);
+	memcpy(pcb2->pcb_savefpu, fpu_save, x86_fpu_save_size);
 	l2->l_md.md_flags &= ~MDL_FPU_IN_CPU;
 }
 
@@ -378,11 +489,6 @@ fpu_lwp_abandon(struct lwp *l)
 void
 fpu_kern_enter(void)
 {
-	static const union savefpu safe_fpu __aligned(64) = {
-		.sv_xmm = {
-			.fx_mxcsr = __SAFE_MXCSR__,
-		},
-	};
 	struct lwp *l = curlwp;
 	struct cpu_info *ci;
 	int s;
@@ -421,7 +527,7 @@ fpu_kern_enter(void)
 	/*
 	 * Zero the FPU registers and install safe control words.
 	 */
-	fpu_area_restore(&safe_fpu, x86_xsave_features, /*is_64bit*/false);
+	fpu_area_restore(safe_fpu, x86_xsave_features, /*is_64bit*/false);
 }
 
 /*
@@ -432,7 +538,6 @@ fpu_kern_enter(void)
 void
 fpu_kern_leave(void)
 {
-	static const union savefpu zero_fpu __aligned(64);
 	struct cpu_info *ci = curcpu();
 	int s;
 
@@ -451,7 +556,7 @@ fpu_kern_leave(void)
 	 * through Spectre-class attacks to userland, even if there are
 	 * no bugs in fpu state management.
 	 */
-	fpu_area_restore(&zero_fpu, x86_xsave_features, /*is_64bit*/false);
+	fpu_area_restore(zero_fpu, x86_xsave_features, /*is_64bit*/false);
 
 	/*
 	 * Set CR0_TS again so that the kernel can't accidentally use
diff -r 1cb0546d18b6 sys/arch/x86/x86/vm_machdep.c
--- a/sys/arch/x86/x86/vm_machdep.c	Thu Apr 10 18:53:29 2025 +0000
+++ b/sys/arch/x86/x86/vm_machdep.c	Fri Apr 11 21:07:26 2025 +0000
@@ -366,10 +366,42 @@ cpu_uarea_alloc(bool system)
 {
 	vaddr_t base, va;
 	paddr_t pa;
+	struct pcb *pcb;
 
 	base = uvm_km_alloc(kernel_map, USPACE + PAGE_SIZE, 0,
 	    UVM_KMF_WIRED|UVM_KMF_WAITVA);
 
+	/*
+	 * Prepare the FPU save area:
+	 *
+	 * 1. If this is a system thread, no save area.
+	 *    XXX Allocate/free one in kthread_fpu_enter/exit_md.
+	 *
+	 * 2. If this is a user thread, and the fpu save size is large
+	 *    enough, allocate an extra block of memory for it.
+	 *
+	 * 3. Otherwise, this is a user thread and the fpu save size
+	 *    fits inside the pcb page, so use that.
+	 *
+	 * XXX Note that this is currently amd64-only -- if you extend
+	 * this FPU save space allocation to i386, make sure to update
+	 * fpu_lwp_fork so it doesn't clobber the pcb_savefpu pointer.
+	 * We panic on i386 boot if the CPU needs more space in
+	 * fpuinit_mxcsr_mask.
+	 */
+	pcb = (void *)base;
+	if (system) {					/* (1) */
+		pcb->pcb_savefpu = NULL;
+	} else if (x86_fpu_save_separate_p()) {		/* (2) */
+		__CTASSERT(PAGE_SIZE >= 64);
+		/* No need to zero -- caller will initialize. */
+		va = uvm_km_alloc(kernel_map, x86_fpu_save_size, PAGE_SIZE,
+		    UVM_KMF_WIRED|UVM_KMF_WAITVA);
+		pcb->pcb_savefpu = (void *)va;
+	} else {					/* (3) */
+		pcb->pcb_savefpu = &pcb->pcb_savefpusmall;
+	}
+
 	/* Page[1] = RedZone */
 	va = base + PAGE_SIZE;
 	if (!pmap_extract(pmap_kernel(), va, &pa)) {
@@ -394,8 +426,20 @@ cpu_uarea_alloc(bool system)
 bool
 cpu_uarea_free(void *addr)
 {
+	const struct pcb *const pcb = addr;
 	vaddr_t base = (vaddr_t)addr;
 
+	/*
+	 * If we allocated a separate FPU save area, free it.
+	 */
+	if (pcb->pcb_savefpu != NULL &&
+	    pcb->pcb_savefpu != &pcb->pcb_savefpusmall) {
+		KASSERTMSG(x86_fpu_save_separate_p(), "pcb=%p pcb_savefpu=%p",
+		    pcb, pcb->pcb_savefpu);
+		uvm_km_free(kernel_map, (vaddr_t)pcb->pcb_savefpu,
+		    x86_fpu_save_size, UVM_KMF_WIRED);
+	}
+
 	KASSERT(!pmap_extract(pmap_kernel(), base + PAGE_SIZE, NULL));
 	KASSERT(!pmap_extract(pmap_kernel(), base + USPACE, NULL));
 	uvm_km_free(kernel_map, base, USPACE + PAGE_SIZE, UVM_KMF_WIRED);


Home | Main Index | Thread Index | Old Index