Subject: re: SMP enabled
To: Hauke Fath <hauke@Espresso.Rhein-Neckar.DE>
From: matthew green <mrg@eterna.com.au>
List: port-sparc
Date: 01/08/2003 18:59:49
With Paul's changes, I have managed to build a kernel with make -j3 on my
dual-SM71 ss10 just fine.
Then, I got adventurous and let the smp kernel (DIAGNOSTIC, DEBUG,
LOCKDEBUG) run for the nightly amanda backup. It stayed up for about half
an hour into the backup, spitting messages like
Jan 8 02:38:05 pizza /netbsd: xcall(cpu0,0xf000862c): couldn't ping cpus: cpu1
Jan 8 02:38:18 pizza /netbsd: xcall(cpu1,0xf000862c): couldn't ping cpus: cpu0
Jan 8 02:38:18 pizza /netbsd: xcall(cpu1,0xf01ce2ec): couldn't ping cpus: cpu0
Jan 8 02:38:18 pizza /netbsd: xcall(cpu0,0xf000862c): couldn't ping cpus: cpu1
i get these xcall() messages seemingly _only_ if i use LOCKDEBUG...
then broke into the debugger with something like
simple_lock: locking against myself
lock: 0xf023ecfc, currently at: /usr/src/sys/arch/sparc/sparc/pmap.c:689
on cpu 0
last locked: /usr/src/sys/arch/sparc/sparc/pmap.c:689
last unlocked: /usr/src/sys/arch/sparc/sparc/pmap.c:708
0x0(0xf0adf000, 0x1d, 0xf0ae7000, 0xf023e554, 0x0, 0xfffffffe) at
pmap_kremove4m+0x16c
pmap_kremove4m(0xf0249910, 0xf0adf000, 0xf0ae7000, 0xf0232424, 0x100,
0xf023e554) at uvm_unmap_remove+0x16c
uvm_unmap_remove(0xf0249910, 0xf0adf000, 0xf0ae7000, 0x408000e5, 0x33, 0x18060)
at uvm_unmap+0x110
uvm_unmap(0xf0249910, 0xf0adf000, 0x8000, 0xf028280c, 0xf0899910, 0xfff) at
uvm_km_free+0x14
uvm_km_free(0xf0adf000, 0x68, 0x0, 0xf573e000, 0xf08e1a50, 0x3d461400) at
free+0x118
free(0xf22abab0, 0xf00e8aa8, 0xf22abab0, 0x8000, 0xf22abab0, 0xf2284000) at
softdep_disk_write_complete+0x254
softdep_disk_write_complete(0xf22abab0, 0x500, 0x0, 0xf01bf3f0, 0xf01b1a94,
0xc3) at biodone+0x74
biodone(0x0, 0x22009, 0x22009, 0xe0000, 0xf08f9498, 0x8) at
scsipi_complete+0x4dc
scsipi_complete(0xf08f8f00, 0x5d73b, 0xf01b1a80, 0xf023e554, 0x0,
0xfffffffe) at
scsipi_done+0x168
scsipi_done(0xf08e3e00, 0xf08f9498, 0x14, 0xf01b1a94, 0x100, 0xf023e554) at
ncr53c9x_done+0x1c8
ncr53c9x_done(0x3, 0xf0087020, 0x500, 0x408000e5, 0x33, 0x18060) at
ncr53c9x_intr+0x12bc
ncr53c9x_intr(0x404000e6, 0xf02110c8, 0x2ff, 0xf5fb7000, 0xf089cf90, 0x4000) at
sparc_interrupt44c+0x150
sparc_interrupt44c(0xf01ce2ec, 0x1, 0x0, 0x0, 0x0, 0xffffffff) at xcall+0x3a0
xcall(0x0, 0xf02ee688, 0x0, 0x182e99a, 0x0, 0xf028bab4) at updatepte4m+0x50
updatepte4m(0xf74a2000, 0x182e900, 0x1, 0xf06fdb80, 0x2, 0xf7250a60) at
pmap_kenter_pa4m+0xcc
pmap_kenter_pa4m(0x4000, 0xf727fe28, 0xf09e0300, 0xf0205400, 0xf023b800,
0xf023d000) at sosend_loan+0x1e0
sosend_loan(0xf0aaef10, 0x0, 0xf727fe28, 0xf09e0300, 0x0, 0x0) at sosend+0x5c0
sosend(0xf718f158, 0xf718f180, 0xf727fe28, 0xf0ace700, 0x1, 0xf012edb0) at
soo_write+0x20
soo_write(0xf7250a60, 0x1, 0xf718f158, 0x4000, 0x4000, 0xf718f180) at
dofilewrite+0x8c
dofilewrite(0xf7250a60, 0xf727ff28, 0xf727ff20, 0xf012b274, 0xf727ff28,
0xf727ff20) at sys_write+0x70
sys_write(0x4, 0xf727ffb0, 0x0, 0x3991, 0x37e2c, 0xfffffffe) at syscall+0x1f4
syscall(0x1, 0x8d1d8, 0x4000, 0xf0002000, 0x3, 0xf06fdb80) at _syscall+0xcc
Stopped in pid 1677 (gzip) at cpu_Debugger+0x8: call
esigcode
db{0}> t
cpu_Debugger(0xf023ecfc, 0xf021ae18, 0x2b1, 0xdeadbeef, 0x0, 0xc3) at
_simple_lock+0x294
_simple_lock(0xf0adf000, 0xf02d3f7c, 0xffffffff, 0x0, 0x0, 0x2b) at
updatepte4m+0x24
updatepte4m(0xf0adf000, 0x1d, 0xf0ae7000, 0xf023e554, 0x0, 0xfffffffe) at
pmap_kremove4m+0x16c
pmap_kremove4m(0xf0249910, 0xf0adf000, 0xf0ae7000, 0xf0232424, 0x100,
0xf023e554) at uvm_unmap_remove+0x16c
uvm_unmap_remove(0xf0249910, 0xf0adf000, 0xf0ae7000, 0x408000e5, 0x33, 0x18060)
at uvm_unmap+0x110
uvm_unmap(0xf0249910, 0xf0adf000, 0x8000, 0xf028280c, 0xf0899910, 0xfff) at
uvm_km_free+0x14
uvm_km_free(0xf0adf000, 0x68, 0x0, 0xf573e000, 0xf08e1a50, 0x3d461400) at
free+0x118
free(0xf22abab0, 0xf00e8aa8, 0xf22abab0, 0x8000, 0xf22abab0, 0xf2284000) at
softdep_disk_write_complete+0x254
softdep_disk_write_complete(0xf22abab0, 0x500, 0x0, 0xf01bf3f0, 0xf01b1a94,
0xc3) at biodone+0x74
biodone(0x0, 0x22009, 0x22009, 0xe0000, 0xf08f9498, 0x8) at
scsipi_complete+0x4dc
scsipi_complete(0xf08f8f00, 0x5d73b, 0xf01b1a80, 0xf023e554, 0x0,
0xfffffffe) at
scsipi_done+0x168
scsipi_done(0xf08e3e00, 0xf08f9498, 0x14, 0xf01b1a94, 0x100, 0xf023e554) at
ncr53c9x_done+0x1c8
ncr53c9x_done(0x3, 0xf0087020, 0x500, 0x408000e5, 0x33, 0x18060) at
ncr53c9x_intr+0x12bc
ncr53c9x_intr(0x404000e6, 0xf02110c8, 0x2ff, 0xf5fb7000, 0xf089cf90,
0x4000) at sparc_interrupt44c+0x150
sparc_interrupt44c(0xf01ce2ec, 0x1, 0x0, 0x0, 0x0, 0xffffffff) at xcall+0x3a0
xcall(0x0, 0xf02ee688, 0x0, 0x182e99a, 0x0, 0xf028bab4) at updatepte4m+0x50
updatepte4m(0xf74a2000, 0x182e900, 0x1, 0xf06fdb80, 0x2, 0xf7250a60) at
pmap_kenter_pa4m+0xcc
pmap_kenter_pa4m(0x4000, 0xf727fe28, 0xf09e0300, 0xf0205400, 0xf023b800,
0xf023d000) at sosend_loan+0x1e0
sosend_loan(0xf0aaef10, 0x0, 0xf727fe28, 0xf09e0300, 0x0, 0x0) at sosend+0x5c0
sosend(0xf718f158, 0xf718f180, 0xf727fe28, 0xf0ace700, 0x1, 0xf012edb0) at
soo_write+0x20
soo_write(0xf7250a60, 0x1, 0xf718f158, 0x4000, 0x4000, 0xf718f180) at
dofilewrite+0x8c
dofilewrite(0xf7250a60, 0xf727ff28, 0xf727ff20, 0xf012b274, 0xf727ff28,
0xf727ff20) at sys_write+0x70
sys_write(0x4, 0xf727ffb0, 0x0, 0x3991, 0x37e2c, 0xfffffffe) at syscall+0x1f4
syscall(0x1, 0x8d1d8, 0x4000, 0xf0002000, 0x3, 0xf06fdb80) at _syscall+0xcc
db{0}>
Typing 'c' a few times would get the box going again, but it returned to
the debugger a few minutes later so I finally rebooted to a 1.6_STABLE
kernel.
Are these 'debugger' breaks really serious, or more of a diagnostics issue?
LOCKDEBUG is a diagnostic checking tool. the debugger break here is
because a bad condition occurred that _should never happen_ and so it
dropped to ddb so you could maybe figure out why... the issues are that
(a) sometimes IPIs don't work and (b) this causes locks to get broken
somehow, maybe...
i've spent a bunch of time trying to figure this out but i haven't yet.
.mrg.