Subject: lib/36722: x87 FPU stack overflows when many threads are starting to use a FPU.
To: None <lib-bug-people@netbsd.org, gnats-admin@netbsd.org,>
From: None <irino@src.ricoh.co.jp>
List: netbsd-bugs
Date: 08/02/2007 11:20:00
>Number: 36722
>Category: lib
>Synopsis: x87 FPU stack overflows when many threads are starting to use a FPU.
>Confidential: no
>Severity: non-critical
>Priority: low
>Responsible: lib-bug-people
>State: open
>Class: sw-bug
>Submitter-Id: net
>Arrival-Date: Thu Aug 02 11:20:00 +0000 2007
>Originator: Yoshiaki Irino
>Release: NetBSD 2.1, 3.0.2, 3.1
>Organization:
RICOH COMPANY, LTD.
>Environment:
NetBSD 3.1 (GENERIC) #0: Tue Oct 31 04:27:07 UTC 2006 builds@b0.netbsd.org:/home/builds/ab/netbsd-3-1-RELEASE/i386/200610302053Z-obj/home/builds/ab/netbsd-3-1-RELEASE/src/sys/arch/i386/compile/GENERIC
NetBSD 3.0.2 (GENERIC) #0: Wed Nov 1 00:26:54 UTC 2006 builds@b1.netbsd.org:/home/builds/ab/netbsd-3-0-2-RELEASE/i386/200610311952Z-obj/home/builds/ab/netbsd-3-0-2-RELEASE/src/sys/arch/i386/compile/GENERIC
NetBSD 2.1 (GENERIC) #0: Mon Oct 24 22:35:45 UTC 2005 jmc@faith.netbsd.org:/home/builds/ab/netbsd-2-1-RELEASE/i386/200510241747Z-obj/home/builds/ab/netbsd-2-1-RELEASE/src/sys/arch/i386/compile/GENERIC
>Description:
Under the following conditions,
(1) a thread is forced to switch to another thread while it is using the FPU, and
(2) the next thread has only just been created by pthread_create(),
the next thread inherits the FPU context of the previous thread,
because a thread that has just been created by pthread_create() has no FPU context of its own.
Because of this, if the previous thread had already filled the whole x87 FPU register stack,
the next thread gets a "stack overflow fault" when it pushes its first floating-point value.
I tested this problem on the following machines and releases, using the test program below.
CPU/NetBSD release 3.1 3.0.2 2.1
Pentium-MMX(w/o FXSR) NG NG NG
Pentium-4 (with FXSR) NG NG NG
>How-To-Repeat:
This test program fails at "thread4",
because each thread wants to keep two x87 FPU stack registers for its own calculation.
(The x87 FPU register stack has only 8 data registers.)
/*
 * Reproducer for the problem described above: start N_THREADS threads
 * that each spin forever on a floating-point comparison of two equal
 * doubles.  The comparison can only come out "not equal" if the FPU
 * misbehaves, in which case the thread reports its index and the whole
 * process exits with status 1.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>	/* exit() */
#include <unistd.h>	/* sleep() */

#define N_THREADS 8

void *dofpu(void *);

/*
 * Both operands hold the same value and are never written.  They are
 * volatile so that the compiler has to reload and compare them on every
 * iteration of the loop in dofpu() instead of hoisting the comparison
 * out of the loop.
 */
volatile double d1 = 0.12345;
volatile double d2 = 0.12345;

/*
 * Create the worker threads, passing each one its index, then give them
 * 60 seconds to hit the failure.  Exits 0 if no thread reported one.
 */
int
main(int ac, char *av[])
{
	int i;
	pthread_t th;

	(void)ac;
	(void)av;

	for (i = 0; i < N_THREADS; i++) {
		/* The index travels inside the pointer; go via intptr_t. */
		if (pthread_create(&th, NULL, dofpu,
		    (void *)(intptr_t)i) != 0) {
			fprintf(stderr, "pthread_create() failed\n");
			exit(1);
		}
	}
	sleep(60);
	exit(0);
}

/*
 * Thread body.  arg carries the thread index (an int stored in the
 * pointer value by main()).  Spins until d1 and d2 compare unequal,
 * then prints the index and terminates the process; never returns.
 */
void *
dofpu(void *arg)
{
	int id = (int)(intptr_t)arg;

	while (1) {
		if (d1 != d2)
			break;
	}
	printf("failed at thread%d\n", id);
	exit(1);
}
---
The following sample program prints the x87 FPU's control/status/tag words. Its output is shown first, followed by its source.
# /tmp/x87fail_analyze
FPU words when main thread started
control 0x127f, status 0x0000, tag 0x0000
FPU words when thread0 started
control 0x127f, status 0x3000, tag 0x00c0
FPU words when thread1 started
control 0x127f, status 0x6000, tag 0x00f0
FPU words when thread2 started
control 0x127f, status 0x5000, tag 0x00fc
FPU words when thread3 started
control 0x127f, status 0x4000, tag 0x00ff
FPU words when thread4 started
control 0x127f, status 0x7241, tag 0x00ff
FPU words when thread4 failed
control 0x127f, status 0x7541, tag 0x00ff
failed at thread4
#
/*
 * Diagnostic variant of the reproducer: in addition to spinning on the
 * floating-point comparison, every thread records an FXSAVE image when
 * it starts and on every loop iteration, and the failing thread prints
 * the control/status/tag words from those images.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>	/* exit() */
#include <unistd.h>	/* sleep() */

#define N_THREADS 8

void *dofpu(void *);
void xmm_dump(unsigned char *);

/*
 * only for CPUs with FXSR - XXX
 *
 * FXSAVE stores a 512-byte image at addr, which must be 16-byte
 * aligned.  The operand only names the first byte, so the "memory"
 * clobber tells the compiler that the rest of the buffer is written as
 * well, and __volatile keeps the instruction from being moved out of a
 * loop or dropped.
 */
#define fxsave(addr) \
	__asm __volatile("fxsave %0" : "=m" (*(addr)) : : "memory")

/* FXSAVE images; 512-byte rows keep every row 16-byte aligned. */
unsigned char save_xmm[N_THREADS][512] __attribute__((__aligned__(16)));
unsigned char fail_xmm[N_THREADS][512] __attribute__((__aligned__(16)));
unsigned char main_xmm[512] __attribute__((__aligned__(16)));

/* Equal and never written; volatile so every iteration re-compares. */
volatile double d1 = 0.12345;
volatile double d2 = 0.12345;

/*
 * Record the main thread's FPU image, create the worker threads
 * (passing each one its index), then give them 60 seconds to hit the
 * failure.  Exits 0 if no thread reported one.
 */
int
main(int ac, char *av[])
{
	int i;
	pthread_t th;

	(void)ac;
	(void)av;

	fxsave(main_xmm);
	for (i = 0; i < N_THREADS; i++) {
		/* The index travels inside the pointer; go via intptr_t. */
		if (pthread_create(&th, NULL, dofpu,
		    (void *)(intptr_t)i) != 0) {
			fprintf(stderr, "pthread_create() failed\n");
			exit(1);
		}
	}
	sleep(60);
	exit(0);
}

/*
 * Thread body.  arg carries the thread index (an int stored in the
 * pointer value by main()).  Saves the FPU image seen at thread start
 * into save_xmm[index], then spins, refreshing fail_xmm[index] on each
 * iteration, until d1 and d2 compare unequal.  On failure it dumps the
 * main thread's image, the start images of threads 0..index and its own
 * last image, then terminates the process; never returns.
 */
void *
dofpu(void *arg)
{
	int id = (int)(intptr_t)arg;
	unsigned char *save = save_xmm[id];
	unsigned char *fail = fail_xmm[id];
	int i;

	fxsave(save);
	while (1) {
		fxsave(fail);	/* must be here! */
		if (d1 != d2)
			break;
	}
	printf("FPU words when main thread started\n");
	xmm_dump(main_xmm);
	for (i = 0; i <= id; i++) {
		printf("FPU words when thread%d started\n", i);
		xmm_dump(save_xmm[i]);
	}
	printf("FPU words when thread%d failed\n", id);
	xmm_dump(fail);
	printf("failed at thread%d\n", id);
	exit(1);
}

/*
 * Print the first six bytes of the FXSAVE image at v as three 16-bit
 * words (high byte first), labelled control, status and tag.  When
 * built with VERBOSE, also print the first 160 bytes of the image as
 * rows of 16 bytes, highest-addressed byte first.
 */
void
xmm_dump(unsigned char *v)
{
	printf("control 0x%02x%02x, status 0x%02x%02x, tag 0x%02x%02x\n",
	    v[1], v[0], v[3], v[2], v[5], v[4]);
#ifdef VERBOSE
	{
		int i;

		/* show 160 bytes only */
		for (i = 0; i < 160; v += 16, i += 16) {
			printf("%02x %02x %02x %02x %02x %02x %02x %02x ",
			    v[15], v[14], v[13], v[12], v[11], v[10], v[9], v[8]);
			printf("%02x %02x %02x %02x %02x %02x %02x %02x\n",
			    v[7], v[6], v[5], v[4], v[3], v[2], v[1], v[0]);
		}
	}
#endif /* VERBOSE */
}
>Fix:
I checked this patch (for src/lib/libpthread/arch/i386/pthread_md.h) on the following machines and releases, using only the test program above,
but I am not sure that this patch causes no side effects.
CPU/NetBSD release 3.1 3.0.2 2.1
Pentium-MMX(w/o FXSR) OK OK OK
Pentium-4 (with FXSR) OK OK OK
--- pthread_md.h 2007-08-01 16:21:14.000000000 +0900
+++ pthread_md.h.sendpr 2007-08-01 16:10:29.000000000 +0900
@@ -72,6 +72,16 @@
#define pthread__uc_sp(ucp) ((ucp)->uc_mcontext.__gregs[_REG_UESP])
#define pthread__uc_pc(ucp) ((ucp)->uc_mcontext.__gregs[_REG_EIP])
+#define pthread__i386_fxsave(addr) __asm("fxsave %0" : "=m" (*addr))
+#define pthread__uc_xmm(ucp) \
+ ((ucp)->uc_mcontext.__fpregs.__fp_reg_set.__fp_xmm_state.__fp_xmm)
+
+#define pthread__i386_fnsave(addr) __asm("fnsave %0" : "=m" (*addr))
+#define pthread__i386_fwait() __asm("fwait")
+#define pthread__i386_frstor(addr) __asm("frstor %0" : "=m" (*addr))
+#define pthread__uc_s87(ucp) \
+ ((ucp)->uc_mcontext.__fpregs.__fp_reg_set.__fpchip_state.__fp_state)
+
/*
* Set initial, sane values for registers whose values aren't just
* "don't care".
@@ -86,7 +96,16 @@
(ucp)->uc_mcontext.__gregs[_REG_DS] = 0x2b, \
(ucp)->uc_mcontext.__gregs[_REG_CS] = 0x23, \
(ucp)->uc_mcontext.__gregs[_REG_SS] = 0x2b, \
- (ucp)->uc_mcontext.__gregs[_REG_EFL] = 0x202;
+ (ucp)->uc_mcontext.__gregs[_REG_EFL] = 0x202; \
+ if (_md_getcontext_u == _getcontext_u_xmm) { \
+ pthread__i386_fxsave(pthread__uc_xmm(ucp)); \
+ (ucp)->uc_flags |= _UC_FPU; \
+ } else if (_md_getcontext_u == _getcontext_u_s87) { \
+ pthread__i386_fnsave(pthread__uc_s87(ucp)); \
+ pthread__i386_fwait(); \
+ pthread__i386_frstor(pthread__uc_s87(ucp)); \
+ (ucp)->uc_flags |= _UC_FPU; \
+ }
/*
* Usable stack space below the ucontext_t.