Re: port-mips/59064 (jemalloc switch to 5.3 broke userland)

To: Rin Okuyama <rokuyama.rk%gmail.com@localhost>, Martin Husemann <martin%duskware.de@localhost>, gnats-bugs%NetBSD.org@localhost, port-mips-maintainer%NetBSD.org@localhost, gnats-admin%NetBSD.org@localhost, netbsd-bugs%NetBSD.org@localhost, martin%NetBSD.org@localhost, simonb%NetBSD.org@localhost, joerg%NetBSD.org@localhost, dholland%NetBSD.org@localhost
Subject: Re: port-mips/59064 (jemalloc switch to 5.3 broke userland)
From: Taylor R Campbell <riastradh%NetBSD.org@localhost>
Date: Mon, 14 Apr 2025 23:39:09 +0000

I tried various experiments with the attached rdhwr.c program (badly
named and having some irrelevant stuff, sorry) and the following
changes to user_reserved_insn in mipsX_subr.S, each on its own:

1. Add a dozen ssnop instructions after each MFC0_HAZARD (expands to
   nothing) and COP0_SYNC (ehb), and before eret.

2. Use AT instead of v0 on lines 1317-1320.

3. Swap AT/v0 on lines 1326-1335.

4. Use AT instead of v0 on lines 1341-1344.

5. Insert  move v0,k0  at line 1348, and then load from v0 instead of
   k0 on lines 1349-1350.

6. Switch v0 -> v1, AT -> k0 throughout, and skip saving/restoring any
   registers.

7. Duplicate the PTR_L instruction to restore v0 on line 1350.

8. Insert another COP0_SYNC (ehb) on line 1350 just before eret.

9. Insert another COP0_SYNC (ehb) between lines 1327 and 1328, and on
   line 1329.

10. Instead of

1326	_MFC0	v0, MIPS_COP_0_EXC_PC
1327	MFC0_HAZARD
1328	INT_L	AT, 0(v0)

   do

	move	v1, k0
	_MFC0	k0, MIPS_COP_0_EXC_PC
	MFC0_HAZARD
	INT_L	AT, 0(k0)
	move	k0, v1

   to load the excepting instruction, so the load goes via k0 rather
   than v0.

11. Instead of

1326	_MFC0	v0, MIPS_COP_0_EXC_PC
1327	MFC0_HAZARD
1328	INT_L	AT, 0(v0)

   do

	move	v0, k0
	_MFC0	k0, MIPS_COP_0_EXC_PC
	MFC0_HAZARD
	INT_L	AT, 0(k0)
	move	k0, v0

   to do like (10) except with v0 as the temporary instead of v1 since
   v1 is not actually safe to use here.

12. Instead of

1326	_MFC0	v0, MIPS_COP_0_EXC_PC
1327	MFC0_HAZARD
1328	INT_L	AT, 0(v0)

   do

	_MFC0	k0, MIPS_COP_0_EXC_PC
	MFC0_HAZARD
	INT_L	AT, 0(k0)

   and then re-derive k0 := curlwp->l_addr + USPACE - TF_SIZ -
   CALLFRAME_SIZ at the end:

+	PTR_L	k0, L_PCB(k1)			# XXXuvm_lwp_getuarea
+	PTR_ADDU k0, USPACE - TF_SIZ - CALLFRAME_SIZ
 	REG_L	AT, CALLFRAME_SIZ+TF_REG_AST(k0)# restore reg
 	REG_L	v0, CALLFRAME_SIZ+TF_REG_V0(k0) # restore reg
 	eret

Experiments (3), (6), (10), and (12) avoided the segfaults.  The
others all produced no change -- the program intermittently segfaults,
about every other trial, with the RDHWR pc stored in v0, except for
(11) where v0 = 0x98000004109ffea0.

Unfortunately, experiment (3) probably just trashes AT rather than v0.
And (6) and (10) are broken because we can't alter v1 until we have
proven that the output of the instruction we have to emulate is v1;
until then, we're limited to k0 (can't alter k1 because various
exception vectors assume it holds curlwp).

So that leaves the somewhat kooky (12).  I hypothesize it works
because it ends up trashing k0 instead of anything else on eret -- and
k0 is private to the kernel, and assumed garbage on entry to kernel
exception vectors anyway, so trashing it is harmless.  Patch attached.

Reference point:

https://nxr.netbsd.org/xref/src/sys/arch/mips/mips/mipsX_subr.S?r=1.115#1297

         * We also tried saving k0 in v0 by surrounding this stanza in
         *
         *      move    v0, k0
         *      ...
         *      move    k0, v0
         *
         * rather than re-deriving k0 from k1, but that still trashed
         * userland's v0 on return.
         */
        _MFC0   k0, MIPS_COP_0_EXC_PC
        MFC0_HAZARD
        INT_L   AT, 0(k0)

#include <machine/lwp_private.h>

#include <link_elf.h>
#include <lwp.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

__thread int foo = 123;

struct tls_tcb tcb_storage[0x8000];

__noinline __used
static struct tls_tcb *
gettcb_rdhwr(void)
{
	struct tls_tcb *tcb;

	asm(".set push\n\t"
	    ".set mips32r2\n\t"
	    "rdhwr $3,$29\n\t"
	    "move %[tcb],$3\n\t"
	    ".set pop"
	    : [tcb]"=r"(tcb)
	    :
	    : "$3");

	return tcb - (TLS_TP_OFFSET/sizeof(*tcb) + 1);
}

static int
dlitercb(struct dl_phdr_info *dlpi, size_t size, void *cookie)
{
	char buf[1024];

	snprintf(buf, sizeof(buf),
	    "dlpi_addr=0x%lx\n"
	    "dlpi_name=%s\n"
	    "dlpi_phdr=%p\n"
	    "dlpi_phnum=%d\n"
	    "dlpi_adds=%lld\n"
	    "dlpi_subs=%lld\n"
	    "dlpi_tls_modid=0x%zx\n"
	    "dlpi_tls_data=%p\n"
	    "\n",
	    (long)dlpi->dlpi_addr,
	    dlpi->dlpi_name,
	    dlpi->dlpi_phdr,
	    dlpi->dlpi_phnum,
	    dlpi->dlpi_adds,
	    dlpi->dlpi_subs,
	    dlpi->dlpi_tls_modid,
	    dlpi->dlpi_tls_data);
	(void)write(STDOUT_FILENO, buf, strlen(buf));

	return 0;
}

int
main(void)
{
	char buf[1024];
	struct tls_tcb *tcb_rdhwr, *tcb_syscall;

	(void)dl_iterate_phdr(&dlitercb, NULL);

	tcb_syscall = __lwp_gettcb_fast();
	snprintf_ss(buf, sizeof(buf), "tcb_syscall %p\n", tcb_syscall);
	(void)write(STDOUT_FILENO, buf, strlen(buf));

	tcb_rdhwr = gettcb_rdhwr();
	snprintf_ss(buf, sizeof(buf), "tcb_rdhwr %p\n", tcb_rdhwr);
	(void)write(STDOUT_FILENO, buf, strlen(buf));

	printf("%p\n", malloc(1));
	fflush(stdout);

#if 1
	__USE(tcb_storage);
	__USE(tcb_syscall);
	__USE(tcb_rdhwr);
	__USE(buf);
#else
	__lwp_settcb(tcb_storage);

	tcb_syscall = __lwp_gettcb_fast();
	snprintf_ss(buf, sizeof(buf), "tcb_syscall %p\n", tcb_syscall);
	(void)write(STDOUT_FILENO, buf, strlen(buf));

	tcb_rdhwr = gettcb_rdhwr();
	snprintf_ss(buf, sizeof(buf), "tcb_rdhwr %p\n", tcb_rdhwr);
	(void)write(STDOUT_FILENO, buf, strlen(buf));
#endif

	return 0;
}

diff -r 5271608211c5 sys/arch/mips/mips/mipsX_subr.S
--- a/sys/arch/mips/mips/mipsX_subr.S	Mon Apr 14 02:13:34 2025 +0000
+++ b/sys/arch/mips/mips/mipsX_subr.S	Mon Apr 14 23:37:54 2025 +0000
@@ -1322,10 +1322,34 @@
 	/*
 	 * Get exception PC and fetch the instruction.  We know we can do
 	 * this since the instruction actually got read.
+	 *
+	 * Use k0, rather than v0, for the load address to work around a
+	 * likely Cavium bug that replaces v0 by the exception pc when
+	 * userland continues -- we hypothesize that this will trash k0
+	 * instead, which is harmless because kernel exception vectors
+	 * assume k0 is garbage anyway.
+	 *
+	 * It is tempting to use v1 because we are about to overwrite it
+	 * with the output (and that appears to avoid the Cavium bug) --
+	 * but only if the instruction's destination register is v1,
+	 * which we haven't determined yet.
+	 *
+	 * We also tried saving k0 in v0 by surrounding this stanza in
+	 *
+	 *	move	v0, k0
+	 *	...
+	 *	move	k0, v0
+	 *
+	 * rather than re-deriving k0 from k1, but that still trashed
+	 * userland's v0 on return.
+	 *
+	 * See PR port-mips/59064: `jemalloc switch to 5.3 broke
+	 * userland' <https://gnats.NetBSD.org/59064> for more
+	 * background.
 	 */
-	_MFC0	v0, MIPS_COP_0_EXC_PC
+	_MFC0	k0, MIPS_COP_0_EXC_PC
 	MFC0_HAZARD
-	INT_L	AT, 0(v0)
+	INT_L	AT, 0(k0)
 
 	/*
 	 * Was this rdhwr $3,$29?
@@ -1346,6 +1370,8 @@
 
 	PTR_L	v1, L_PRIVATE(k1)		# rdhwr $3,$29 updates v1
 
+	PTR_L	k0, L_PCB(k1)			# XXXuvm_lwp_getuarea
+	PTR_ADDU k0, USPACE - TF_SIZ - CALLFRAME_SIZ
 	REG_L	AT, CALLFRAME_SIZ+TF_REG_AST(k0)# restore reg
 	REG_L	v0, CALLFRAME_SIZ+TF_REG_V0(k0) # restore reg
 	eret

References:
- Re: port-mips/59064 (jemalloc switch to 5.3 broke userland)
  - From: Taylor R Campbell

Prev by Date: NetBSD Nightly Trouble Ticket Report
Next by Date: Re: port-mips/59064 (jemalloc switch to 5.3 broke userland)
Previous by Thread: Re: port-mips/59064 (jemalloc switch to 5.3 broke userland)
Next by Thread: Re: port-mips/59064 (jemalloc switch to 5.3 broke userland)
Indexes:

Home | Main Index | Thread Index | Old Index