Subject: vnlock deadlock examinations ..
To: None <tech-kern@netbsd.org>
From: Stephen Jones <smj@cirr.com>
List: tech-kern
Date: 01/11/2004 13:59:30
This is on a CS20 with the 1.6.1 release. I originally was using this
release before attempting to track
-current for a test machine, but have since reverted back to 1.6.1 to
avoid the problems being worked
out in -current. I'm hoping for a few pointers at debugging this as it
has been a frequent problem on
our NFS clients.
Below I've tried to document this the best I can. DEBUG/LOCKDEBUG and
-G are all set as options
and infact, this is the netbsd.gdb kernel running
What I'd like to get is a few suggestions on further tracing this
problem .. inparticular, how can I get
the 'vnode address' so that I can use 'show ncache' ?
This scenario is pretty much repeatable on our CS20:
Freshly booted .. netbsd.gdb
; I will now copy a large file (about 16mb) from the test client
; to the production fileserver.
(SSH session)
# cp netbsd.gdb /sys
load: 0.78 cmd: cp 226 [nfsaio] 0.00u 0.01s 0% 216k
; I will now drop to the debugger on the console
; to trace pid 226 (e2)
; I will then continue and login to get an nfsrcvlk for 'login'
(CONSOLE)
Stopped at cpu_Debugger+0x4: ret zero,(ra)
db> trace/t e2
trace: pid 226 at 0xfffffe0049ccf738
mi_switch() at mi_switch+0x290
ltsleep() at ltsleep+0x48c
nfs_asyncio() at nfs_asyncio+0x108
nfs_strategy() at nfs_strategy+0xa4
genfs_gop_write() at genfs_gop_write+0x478
nfs_gop_write() at nfs_gop_write+0x90
genfs_putpages() at genfs_putpages+0xb88
end() at 0xfffffc0000f02bc0
db> cont
NetBSD/alpha (iceland) (console)
login: smj
Password:
load: 0.56 cmd: login 201 [nfsrcvlk] 0.02u 0.01s 0% 2112k
setclock: 4/1/11/21/33/42
; I will now suspend the 'cp' waiting in 'nfsaio' and try
; a 'df' which will result in a 'vnlock' which can potentially deadlock.
(SSH session)
load: 0.36 cmd: cp 226 [nfsaio] 0.00u 0.02s 0% 216k
^Z[1] + Stopped cp netbsd.gdb /sys
# df
load: 1.26 cmd: df 227 [vnlock] 0.00u 0.00s 0% 112k
; after a bit of time, I drop to the debugger again
; and trace the pid of the 'df' 227 (e3) and I trace
; the 'login' 201 (c9) pid .. then I will show the
; process table.
(CONSOLE)
Stopped at cpu_Debugger+0x4: ret zero,(ra)
db> trace/t e3
trace: pid 227 at 0xfffffe0049cd3a68
mi_switch() at mi_switch+0x290
ltsleep() at ltsleep+0x48c
_lockmgr() at _lockmgr+0xfa8
genfs_lock() at genfs_lock+0x34
vn_lock() at vn_lock+0xf8
vget() at vget+0x1dc
nfs_root() at nfs_root+0x3c
lookup() at lookup+0x6c4
namei() at namei+0x4a8
sys_statfs() at sys_statfs+0x60
syscall_plain() at syscall_plain+0x154
XentSys() at XentSys+0x58
db> trace/t c9
trace: pid 201 at 0xfffffe00488715b8
mi_switch() at mi_switch+0x290
ltsleep() at ltsleep+0x48c
nfs_rcvlock() at nfs_rcvlock+0x100
nfs_reply() at nfs_reply+0x3c
nfs_request() at nfs_request+0x5cc
nfs_access() at nfs_access+0x764
nfs_lookup() at nfs_lookup+0x1d8
lookup() at lookup+0x49c
namei() at namei+0x4a8
vn_open() at vn_open+0x258
sys_open() at sys_open+0x10c
syscall_plain() at syscall_plain+0x154
XentSys() at XentSys+0x58
db> ps/n
PID PPID PGRP UID S FLAGS COMMAND
WAIT
227 224 227 0 3 0x4106 df
vnlock
226 224 226 0 4 0x5086 cp
224 203 224 0 3 0x4086 ksh
pause
203 202 203 100 3 0x4086 ksh
pause
202 164 202 0 3 0x184 sshd
select
201 1 201 0 3 0x4006 login
nfsrcvl
196 1 196 0 3 0x84 inetd
select
180 1 180 0 3 0x84 timed
select
164 1 164 0 3 0x184 sshd
select
144 1 144 0 3 0x84 rpc.lockd
select
107 1 107 0 3 0x84 ypbind
select
102 1 102 0 3 0x84 rpcbind
select
89 1 89 0 3 0x84 syslogd
select
53 0 0 0 3 0x20204 nfsio
nfsrcvl
52 0 0 0 3 0x20204 nfsio
nfsrcvl
51 0 0 0 3 0x20204 nfsio
nfsrcvl
50 0 0 0 3 0x20204 nfsio
nfsrcvl
49 0 0 0 3 0x20204 nfsio
nfsrcvl
48 0 0 0 3 0x20204 nfsio
nfsrcvl
47 0 0 0 3 0x20204 nfsio
nfsrcvl
46 0 0 0 3 0x20204 nfsio
nfsrcvl
45 0 0 0 3 0x20204 nfsio
nfsrcvl
44 0 0 0 3 0x20204 nfsio
netio
8 0 0 0 3 0x20204 aiodoned
aiodone
7 0 0 0 3 0x20204 ioflush
syncer
6 0 0 0 3 0x20204 reaper
reaper
5 0 0 0 3 0x20204 pagedaemon
pgdaemo
4 0 0 0 3 0x20204 scsibus2
sccomp
3 0 0 0 3 0x20204 scsibus1
sccomp
2 0 0 0 3 0x20204 scsibus0
sccomp
1 0 1 0 3 0x4084 init
wait
0 -1 0 0 3 0x20204 swapper
schedul
db> ps /w
PID COMMAND EMUL PRI UTIME STIME WAIT-MSG WAIT-CHANNEL
227 df netbsd 20 0.0 0.0 vnlock
0xfffffc000fbcccd8
226 cp netbsd 16 0.0 0.0
224 ksh netbsd 40 0.0 0.0 pause
0xfffffc0010f0c820
203 ksh netbsd 40 0.0 0.0 pause
0xfffffc000c409010
202 sshd netbsd 24 0.5 0.0 select selwait
201 login netbsd 21 0.0 0.0 nfsrcvlk
0xfffffe00003a1b08
196 inetd netbsd 24 0.0 0.0 select selwait
180 timed netbsd 24 0.0 0.0 select selwait
164 sshd netbsd 24 0.5 0.0 select selwait
144 rpc.lockd netbsd 24 0.0 0.0 select selwait
107 ypbind netbsd 24 0.0 0.0 select selwait
102 rpcbind netbsd 24 0.0 0.0 select selwait
89 syslogd netbsd 24 0.0 0.0 select selwait
53 nfsio netbsd 21 0.0 0.0 nfsrcvlk
0xfffffe00003a1b08
52 nfsio netbsd 21 0.0 0.0 nfsrcvlk
0xfffffe00003a1b08
51 nfsio netbsd 21 0.0 0.0 nfsrcvlk
0xfffffe00003a1b08
50 nfsio netbsd 21 0.0 0.0 nfsrcvlk
0xfffffe00003a1b08
49 nfsio netbsd 21 0.0 0.0 nfsrcvlk
0xfffffe00003a1b08
48 nfsio netbsd 21 0.0 0.0 nfsrcvlk
0xfffffe00003a1b08
47 nfsio netbsd 21 0.0 0.0 nfsrcvlk
0xfffffe00003a1b08
46 nfsio netbsd 21 0.0 0.0 nfsrcvlk
0xfffffe00003a1b08
45 nfsio netbsd 21 0.0 0.0 nfsrcvlk
0xfffffe00003a1b08
44 nfsio netbsd 24 0.0 0.0 netio
0xfffffc000c882d28
8 aiodoned netbsd 4 0.0 0.0 aiodoned uvm+0xf0
7 ioflush netbsd 40 0.0 0.0 syncer rushjob
6 reaper netbsd 4 0.0 0.0 reaper deadproc
5 pagedaemon netbsd 4 0.0 0.0 pgdaemon uvm+0xa8
4 scsibus2 netbsd 16 0.0 0.0 sccomp
0xfffffe0000217920
3 scsibus1 netbsd 16 0.0 0.0 sccomp
0xfffffe000003b520
2 scsibus0 netbsd 16 0.0 0.0 sccomp
0xfffffe000003b920
1 init netbsd 32 0.0 0.0 wait
0xfffffc000c400000
0 swapper netbsd 4 0.0 0.0 scheduler proc0
db> ps/a
PID COMMAND STRUCT PROC * UAREA *
VMSPACE/VM_MAP
227 df 0xfffffc000fa4ee70 0xfffffe0049cd0000
0xfffffc000c40b4a0
226 cp 0xfffffc000fa4f9f0 0xfffffe0049ccc000
0xfffffc000c40bad0
224 ksh 0xfffffc000fa4f710 0xfffffe0049cc8000
0xfffffc000c40bce0
203 ksh 0xfffffc000de55cc8 0xfffffe0049cc4000
0xfffffc000c40a420
202 sshd 0xfffffc000c4019e0 0xfffffe0048876000
0xfffffc000c40b080
201 login 0xfffffc000c401700 0xfffffe004886e000
0xfffffc000c40a210
196 inetd 0xfffffc000fa4f150 0xfffffe0049cc0000
0xfffffc000c40b6b0
180 timed 0xfffffc000fa4eb90 0xfffffe0049cbc000
0xfffffc000c40b290
164 sshd 0xfffffc000fa4e8b0 0xfffffe0049cb8000
0xfffffc000c40aa50
144 rpc.lockd 0xfffffc000fa4e5d0 0xfffffe0049cb4000
0xfffffc000c40ae70
107 ypbind 0xfffffc000fa4e2f0 0xfffffe0049cb0000
0xfffffc000c40ac60
102 rpcbind 0xfffffc000fa4e010 0xfffffe0049cac000
0xfffffc000c40a630
89 syslogd 0xfffffc000c401cc0 0xfffffe0049ca8000
0xfffffc000c40a840
53 nfsio 0xfffffc000de559e8 0xfffffe0049ca4000
0xfffffc00005e0da0
52 nfsio 0xfffffc000de55708 0xfffffe0049ca0000
0xfffffc00005e0da0
51 nfsio 0xfffffc000de55428 0xfffffe0049c9c000
0xfffffc00005e0da0
50 nfsio 0xfffffc000de55148 0xfffffe0049c98000
0xfffffc00005e0da0
49 nfsio 0xfffffc000de54e68 0xfffffe0049c94000
0xfffffc00005e0da0
48 nfsio 0xfffffc000de54b88 0xfffffe0049c90000
0xfffffc00005e0da0
47 nfsio 0xfffffc000de548a8 0xfffffe0049c8c000
0xfffffc00005e0da0
46 nfsio 0xfffffc000de545c8 0xfffffe0049c88000
0xfffffc00005e0da0
45 nfsio 0xfffffc000de542e8 0xfffffe0049c84000
0xfffffc00005e0da0
44 nfsio 0xfffffc000de54008 0xfffffe0049c80000
0xfffffc00005e0da0
8 aiodoned 0xfffffc000c401420 0xfffffe004886a000
0xfffffc00005e0da0
7 ioflush 0xfffffc000c401140 0xfffffe0048866000
0xfffffc00005e0da0
6 reaper 0xfffffc000c400e60 0xfffffe0048862000
0xfffffc00005e0da0
5 pagedaemon 0xfffffc000c400b80 0xfffffe004885e000
0xfffffc00005e0da0
4 scsibus2 0xfffffc000c4008a0 0xfffffe004885a000
0xfffffc00005e0da0
3 scsibus1 0xfffffc000c4005c0 0xfffffe0048856000
0xfffffc00005e0da0
2 scsibus0 0xfffffc000c4002e0 0xfffffe0048852000
0xfffffc00005e0da0
1 init 0xfffffc000c400000 0xfffffe004884e000
0xfffffc000c40a000
0 swapper 0xfffffc00005e0fb0 0xfffffc0000200000
0xfffffc00005e0da0