Subject: port-arm/25512: switch_exit calls sched_lock_idle with IPL < splsched().
To: None <gnats-bugs@gnats.NetBSD.org>
From: Richard Earnshaw <rearnsha@buzzard.freeserve.co.uk>
List: netbsd-bugs
Date: 05/09/2004 17:11:13
>Number:         25512
>Category:       port-arm
>Synopsis:       switch_exit calls sched_lock_idle with IPL < splsched().
>Confidential:   no
>Severity:       critical
>Priority:       medium
>Responsible:    port-arm-maintainer
>State:          open
>Class:          sw-bug
>Submitter-Id:   net
>Arrival-Date:   Sun May 09 16:20:00 UTC 2004
>Closed-Date:
>Last-Modified:
>Originator:     Richard Earnshaw
>Release:        NetBSD 2.0_Beta
>Organization:
ARM
>Environment:
	
	
System: NetBSD 2.0_BETA (OSPREY.debug) #4: Sat May  8 16:37:15 BST 2004
Architecture: arm
Machine: cats
>Description:

	In a kernel built with LOCKDEBUG, switch_exit makes calls to
	sched_lock_idle with interrupts enabled and with the IPL set to less
	than splsched().  The result is that a panic can occur if an interrupt
	occurs that causes entry into the scheduler (via, for example,
	wakeup()), because assertions that sched_lock is not held fail.


	Attached is a stack backtrace (augmented to include the 
	missing non-frame layers in the trace):

netbsd:panic+0x10
        scp=0xf012e7bc rlv=0xf01ff928 (netbsd:__assert+0x34)
        rsp=0xf3c1ec34 rfp=0xf3c1ec48
        r7=0xf02decac r6=0xf165651c
        r5=0xf02decac r4=0x000002a6
netbsd:__assert+0xc
        scp=0xf01ff900 rlv=0xf011c748 (netbsd:wakeup+0x11c)
        rsp=0xf3c1ec4c rfp=0xf3c1ec68
        r4=0xf165651c
netbsd:wakeup+0xc
        scp=0xf011c638 rlv=0xf0158e64 (netbsd:biodone+0x16c)
        rsp=0xf3c1ec6c rfp=0xf3c1ec88
        r6=0xf1656524 r5=0x00100403
        r4=0xf165651c
netbsd:biodone+0xc
        scp=0xf0158d04 rlv=0xf01c6024 (netbsd:wddone+0x1e0)
        rsp=0xf3c1ec8c rfp=0xf3c1ecb8
        r7=0xf165651c r6=0xf1402800
        r5=0xf13760fc r4=0xf1401000
netbsd:wddone+0xc
        scp=0xf01c5e50 rlv=0xf01c92cc (netbsd:wdc_ata_bio_done+0x8c)
        rsp=0xf3c1ecbc rfp=0xf3c1ece0
        r8=0xf029ad78 r7=0x00000000
        r6=0xf140292c r5=0xf13760fc r4=0xf1401000
netbsd:wdc_ata_bio_done+0xc
        scp=0xf01c924c rlv=0xf01c8f28 (netbsd:wdc_ata_bio_intr+0x1bc)
        rsp=0xf3c1ece4 rfp=0xf3c1ed18
        r8=0xf1401000 r7=0xf1402938
        r6=0x00000000 r5=0x00000010 r4=0x00000000
netbsd:wdc_ata_bio_intr+0xc
        scp=0xf01c8d78 rlv=0xf0062198 (netbsd:wdcintr+0x114)
        rsp=0xf3c1ed1c rfp=0xf3c1ed38
        r10=0x00000000 r9=0x00000400
        r8=0x00000000 r7=0x00000000 r6=0xf1401000 r5=0xf1376000
        r4=0xf13760fc
netbsd:wdcintr+0xc
        scp=0xf0062090 rlv=0xf01dc10c (netbsd:isa_irqdispatch+0x78)
        rsp=0xf3c1ed3c rfp=0xf3c1ed58
        r7=0xf3c1ed94 r6=0x00000000
        r5=0xf1365da0 r4=0x00000000
netbsd:isa_irqdispatch+0xc
        scp=0xf01dc0a0 rlv=0xf01c2dfc (netbsd:footbridge_intr_dispatch+0x1a4)
        rsp=0xf3c1ed5c rfp=0xf3c1ed90
        r7=0xf02dd7dc r6=0x00100403
        r5=0x00000093 r4=0xf1365fa0
netbsd:footbridge_intr_dispatch+0xc
        scp=0xf01c2c64 rlv=0xf01c22b0 (netbsd:irq_entry+0xac)
        rsp=0xf3c1ed94 rfp=0xf3c1ee0c
        r10=0xf02decac r9=0x00000000
        r8=0x00000001 r7=0x00000299 r6=0x00000000 r5=0xf02ce5dc
        r4=0xf02decec
netbsd:irq_entry (frameless)
netbsd:_simple_lock+0xc
        scp=0xf0107b38 rlv=0xf01afdf0 (netbsd:switch_exit+0xc4)
        rsp=0xf3c1ee10 rfp=0xf3c1ee58
        r10=0xf44389b8 r8=0xf02de320
        r7=0xf3c1d000 r6=0xf0109730 r5=0xf02d98a8 r4=0xf44389b8
netbsd:switch_exit (frameless)

	And here's more detail of the stack between footbridge_intr_dispatch
	and switch_exit:


0xf3c1ed90:        f01c2c64     20000013              0       20000093
	a0:          300003          400       f02decec              0
	b0:        f0233268          299              1              0
	c0:        f02decac     f3c1ee0c        e7ffacb       efffd3b0
	d0:           1c050     f3c1ede0       ffffffff       f0107cb0
	e0:        f3c1ee0c     f3c1edf0       f44389b8       f02d98a8
	f0:        f0109730     f3c1d000       f02de320       f44389b8
0xf3c1ee00:        f3c1ee58     f3c1ee10       f01afdf0       f0107b38
	10:               0     f3c1ee68       f3c1ee24       f012bf78
	20:        f01084f8     f44389b8              0       f02d8c70
	30:               0            0       f39c33ac       f3c1ee68
	40:        f3c1ee4c     f445c7f0       f02da39c       f445c7f0
	50:        f02da39c     f02da40c       f0234714       f3c1ee88
	60:        f3c1ee6c     f012c2d4              0       f3c50928
	70:               0     f44389b8              0       f3c1eec8
	80:        f3c1ee8c     f0101ac4       f0108164       ffffffff
	90:        ffffffff                                

	switch_exit is called exclusively by cpu_exit, which in turn is
	called from exit1() and lwp_exit().  The critical bit of code is 
	here:

	IRQenableALL

	/*
	 * Schedule the vmspace and stack to be freed.
	 */
	mov	r0, r4			/* {lwp_}exit2(l) */
	mov	lr, pc
	mov	pc, r6

#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
	bl	_C_LABEL(sched_lock_idle)
#endif

	ldr	r7, .Lwhichqs		/* r7 = &whichqs */
	mov	r5, #0x00000000		/* r5 = old lwp = NULL */
	b	.Lswitch_search

>How-To-Repeat:

	Build and run a kernel built with LOCKDEBUG.  I don't think this
	problem is exclusive to CATS, but that's the only place I've seen
	it so far.

>Fix:

	Not entirely sure.  One method might be to block interrupts entirely
	before calling sched_lock_idle (after calling the exit func), but a
	better method might be to raise the IPL to splsched().  This is
	made complex by the fact that switch_search is entered from more than
	one routine.



>Release-Note:
>Audit-Trail:
>Unformatted: