Port-vax archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
My long fight with missed interrupts...
Finally! I've found the problem with my lost interrupts.
It turns out to be a vax-specific problem, and one related to spin mutexes.
More specific, the mutex_spin_enter function is not truly reentrant.
What can happen (and is happening sometimes) is that mutex_spin_enter is
called for a mutex. Between the check wether any other spin mutexes are
being held and the decrement of the counter, an interrupt from the clock
happens, which raises the IPL even higher than the spin mutex requires.
This interrupt also calls mutex_spin_enter, and at this point, it don't
look like any other mutex is being held, so the IPL of the clock
interrupt is saved as the old ipl level. The clock interrupt runs
through, restores the IPL to what it was before, and the interrupted
routine continues, at the right IPL level. However, at mute_spin_exit,
the routine will now restore the IPL that was before mutex_spin_enter,
but the value saved is the one from when the interrupt occured, meaning
that in the end, we will actually raise the IPL at mutex_spin_exit, and
will not recover properly from this situation for quite a while.
I don't know under which circumstances it eventually gets recovered, but
it do happen at some point. But the system is somewhat catatonic for
quite a while.
Here is a fix for this bug. It should be incorporated into the source
tree asap, I'd say. If anyone want to improve on it, feel free. I've
tested it for a while on a 8650 now, and it works fine there.
The fix is maybe crude. I raise the IPL to IPL_HIGH for maybe longer
than needed while acquiring a spin mutex, and raise it again at
releasing the mutex. Feel free to improve the code if anyone feel like it...
Johnny
===================================================================
RCS file: /cvsroot/src/sys/arch/vax/vax/lock_stubs.S,v
retrieving revision 1.15
diff -c -r1.15 lock_stubs.S
*** sys/arch/vax/vax/lock_stubs.S 25 May 2008 15:56:12 -0000 1.15
--- sys/arch/vax/vax/lock_stubs.S 14 Jun 2010 05:54:52 -0000
***************
*** 29,34 ****
--- 29,40 ----
* POSSIBILITY OF SUCH DAMAGE.
*/
+ /*
+ * Bugfix 2010 by Johnny Billquist
+ *
+ * mutex_spin_{enter|exit} functions were not thread safe
+ */
+
#include "opt_lockdebug.h"
#include "opt_multiprocessor.h"
#include <machine/asm.h>
***************
*** 87,102 ****
blbc (%r0), 3f
#endif
mfpr $PR_IPL, %r2 /* get current IPL */
! movzbl MTX_IPL(%r0), %r3
! cmpl %r3, %r2 /* does mutex have > IPL? */
! bleq 1f /* no, leave IPL alone */
! mtpr %r3, $PR_IPL /* yes, raise IPL */
! 1: mfpr $PR_SSP, %r4 /* get curlwp */
movl L_CPU(%r4),%r4 /* get cpu_info */
tstl CI_MTX_COUNT(%r4) /* any spin mutexes active? */
bneq 3f /* yep, don't save IPL */
movl %r2, CI_MTX_OLDSPL(%r4) /* nope, save old IPL */
3: decl CI_MTX_COUNT(%r4) /* decr mutex count */
#if defined(DIAGNOSTIC) || defined(MULTIPROCESSOR)
bbssi $0, MTX_LOCK(%r0), 4f /* take out mutex */
ret
--- 93,110 ----
blbc (%r0), 3f
#endif
mfpr $PR_IPL, %r2 /* get current IPL */
! mtpr $IPL_HIGH, $PR_IPL /* lock out all interrupts */
! mfpr $PR_SSP, %r4 /* get curlwp */
movl L_CPU(%r4),%r4 /* get cpu_info */
tstl CI_MTX_COUNT(%r4) /* any spin mutexes active? */
bneq 3f /* yep, don't save IPL */
movl %r2, CI_MTX_OLDSPL(%r4) /* nope, save old IPL */
3: decl CI_MTX_COUNT(%r4) /* decr mutex count */
+ movzbl MTX_IPL(%r0), %r3 /* Get mutex ipl. */
+ cmpl %r3, %r2 /* does mutex have > IPL? */
+ bleq 1f /* no, leave IPL alone */
+ movl %r3, %r2 /* yes, this will be our new
ipl */
+ 1: mtpr %r2, $PR_IPL /* set IPL */
#if defined(DIAGNOSTIC) || defined(MULTIPROCESSOR)
bbssi $0, MTX_LOCK(%r0), 4f /* take out mutex */
ret
***************
*** 119,131 ****
#else
clrb MTX_LOCK(%r0) /* for ddb use only */
#endif
mfpr $PR_SSP, %r4 /* get curlwp */
movl L_CPU(%r4), %r4 /* get cpu_info */
movl CI_MTX_OLDSPL(%r4), %r2 /* fetch oldspl */
incl CI_MTX_COUNT(%r4) /* incr mtx count */
bneq 1f /* still held? */
! mtpr %r2, $PR_IPL /* no, restore saved ipl */
! 1: ret
#if defined(DIAGNOSTIC) || defined(MULTIPROCESSOR)
2: callg (%ap), _C_LABEL(mutex_vector_exit) /* slow path */
--- 127,142 ----
#else
clrb MTX_LOCK(%r0) /* for ddb use only */
#endif
+ mfpr $PR_IPL,%r0 /* Get current ipl */
+ mtpr $IPL_HIGH,$PR_IPL /* Lock out all interrupts */
mfpr $PR_SSP, %r4 /* get curlwp */
movl L_CPU(%r4), %r4 /* get cpu_info */
movl CI_MTX_OLDSPL(%r4), %r2 /* fetch oldspl */
incl CI_MTX_COUNT(%r4) /* incr mtx count */
bneq 1f /* still held? */
! movl %r2, %r0 /* no, restpre saved ipl */
! 1: mtpr %r0, $PR_IPL /* restore ipl */
! ret
#if defined(DIAGNOSTIC) || defined(MULTIPROCESSOR)
2: callg (%ap), _C_LABEL(mutex_vector_exit) /* slow path */
Home |
Main Index |
Thread Index |
Old Index