Subject: re: SMP enabled
To: Hauke Fath <hauke@Espresso.Rhein-Neckar.DE>
From: matthew green <mrg@eterna.com.au>
List: port-sparc
Date: 01/08/2003 18:59:49
   
   With Paul's changes, I have managed to build a kernel with make -j3 on my
   dual-SM71 ss10 just fine.
   
   Then, I got adventurous and let the smp kernel (DIAGNOSTIC, DEBUG,
   LOCKDEBUG) run for the nightly amanda backup. It stayed up for about half
   an hour into the backup, spitting messages like
   
   Jan  8 02:38:05 pizza /netbsd: xcall(cpu0,0xf000862c): couldn't ping cpus: cpu1
   Jan  8 02:38:18 pizza /netbsd: xcall(cpu1,0xf000862c): couldn't ping cpus: cpu0
   Jan  8 02:38:18 pizza /netbsd: xcall(cpu1,0xf01ce2ec): couldn't ping cpus: cpu0
   Jan  8 02:38:18 pizza /netbsd: xcall(cpu0,0xf000862c): couldn't ping cpus: cpu1

i get these xcall() messages seemingly _only_ if i use LOCKDEBUG...
   
   then broke into the debugger with something like
   
   
   simple_lock: locking against myself
   lock: 0xf023ecfc, currently at: /usr/src/sys/arch/sparc/sparc/pmap.c:689
   on cpu 0
   last locked: /usr/src/sys/arch/sparc/sparc/pmap.c:689
   last unlocked: /usr/src/sys/arch/sparc/sparc/pmap.c:708
   0x0(0xf0adf000, 0x1d, 0xf0ae7000, 0xf023e554, 0x0, 0xfffffffe) at
   pmap_kremove4m+0x16c
   pmap_kremove4m(0xf0249910, 0xf0adf000, 0xf0ae7000, 0xf0232424, 0x100,
   0xf023e554) at uvm_unmap_remove+0x16c
   uvm_unmap_remove(0xf0249910, 0xf0adf000, 0xf0ae7000, 0x408000e5, 0x33, 0x18060)
   at uvm_unmap+0x110
   uvm_unmap(0xf0249910, 0xf0adf000, 0x8000, 0xf028280c, 0xf0899910, 0xfff) at
   uvm_km_free+0x14
   uvm_km_free(0xf0adf000, 0x68, 0x0, 0xf573e000, 0xf08e1a50, 0x3d461400) at
   free+0x118
   free(0xf22abab0, 0xf00e8aa8, 0xf22abab0, 0x8000, 0xf22abab0, 0xf2284000) at
   softdep_disk_write_complete+0x254
   softdep_disk_write_complete(0xf22abab0, 0x500, 0x0, 0xf01bf3f0, 0xf01b1a94,
   0xc3) at biodone+0x74
   biodone(0x0, 0x22009, 0x22009, 0xe0000, 0xf08f9498, 0x8) at
   scsipi_complete+0x4dc
   scsipi_complete(0xf08f8f00, 0x5d73b, 0xf01b1a80, 0xf023e554, 0x0,
   0xfffffffe) at
    scsipi_done+0x168
   scsipi_done(0xf08e3e00, 0xf08f9498, 0x14, 0xf01b1a94, 0x100, 0xf023e554) at
   ncr53c9x_done+0x1c8
   ncr53c9x_done(0x3, 0xf0087020, 0x500, 0x408000e5, 0x33, 0x18060) at
   ncr53c9x_intr+0x12bc
   ncr53c9x_intr(0x404000e6, 0xf02110c8, 0x2ff, 0xf5fb7000, 0xf089cf90, 0x4000) at
   sparc_interrupt44c+0x150
   sparc_interrupt44c(0xf01ce2ec, 0x1, 0x0, 0x0, 0x0, 0xffffffff) at xcall+0x3a0
   xcall(0x0, 0xf02ee688, 0x0, 0x182e99a, 0x0, 0xf028bab4) at updatepte4m+0x50
   updatepte4m(0xf74a2000, 0x182e900, 0x1, 0xf06fdb80, 0x2, 0xf7250a60) at
   pmap_kenter_pa4m+0xcc
   pmap_kenter_pa4m(0x4000, 0xf727fe28, 0xf09e0300, 0xf0205400, 0xf023b800,
   0xf023d000) at sosend_loan+0x1e0
   sosend_loan(0xf0aaef10, 0x0, 0xf727fe28, 0xf09e0300, 0x0, 0x0) at sosend+0x5c0
   sosend(0xf718f158, 0xf718f180, 0xf727fe28, 0xf0ace700, 0x1, 0xf012edb0) at
   soo_write+0x20
   soo_write(0xf7250a60, 0x1, 0xf718f158, 0x4000, 0x4000, 0xf718f180) at
   dofilewrite+0x8c
   dofilewrite(0xf7250a60, 0xf727ff28, 0xf727ff20, 0xf012b274, 0xf727ff28,
   0xf727ff20) at sys_write+0x70
   sys_write(0x4, 0xf727ffb0, 0x0, 0x3991, 0x37e2c, 0xfffffffe) at syscall+0x1f4
   syscall(0x1, 0x8d1d8, 0x4000, 0xf0002000, 0x3, 0xf06fdb80) at _syscall+0xcc
   Stopped in pid 1677 (gzip) at   cpu_Debugger+0x8:       call    esigcode
   
   db{0}> t
   cpu_Debugger(0xf023ecfc, 0xf021ae18, 0x2b1, 0xdeadbeef, 0x0, 0xc3) at
   _simple_lock+0x294
   _simple_lock(0xf0adf000, 0xf02d3f7c, 0xffffffff, 0x0, 0x0, 0x2b) at
   updatepte4m+0x24
   updatepte4m(0xf0adf000, 0x1d, 0xf0ae7000, 0xf023e554, 0x0, 0xfffffffe) at
   pmap_kremove4m+0x16c
   pmap_kremove4m(0xf0249910, 0xf0adf000, 0xf0ae7000, 0xf0232424, 0x100,
   0xf023e554) at uvm_unmap_remove+0x16c
   uvm_unmap_remove(0xf0249910, 0xf0adf000, 0xf0ae7000, 0x408000e5, 0x33, 0x18060)
   at uvm_unmap+0x110
   uvm_unmap(0xf0249910, 0xf0adf000, 0x8000, 0xf028280c, 0xf0899910, 0xfff) at
   uvm_km_free+0x14
   uvm_km_free(0xf0adf000, 0x68, 0x0, 0xf573e000, 0xf08e1a50, 0x3d461400) at
   free+0x118
   free(0xf22abab0, 0xf00e8aa8, 0xf22abab0, 0x8000, 0xf22abab0, 0xf2284000) at
   softdep_disk_write_complete+0x254
   softdep_disk_write_complete(0xf22abab0, 0x500, 0x0, 0xf01bf3f0, 0xf01b1a94,
   0xc3) at biodone+0x74
   biodone(0x0, 0x22009, 0x22009, 0xe0000, 0xf08f9498, 0x8) at
   scsipi_complete+0x4dc
   scsipi_complete(0xf08f8f00, 0x5d73b, 0xf01b1a80, 0xf023e554, 0x0,
   0xfffffffe) at
    scsipi_done+0x168
   scsipi_done(0xf08e3e00, 0xf08f9498, 0x14, 0xf01b1a94, 0x100, 0xf023e554) at
   ncr53c9x_done+0x1c8
   ncr53c9x_done(0x3, 0xf0087020, 0x500, 0x408000e5, 0x33, 0x18060) at
   ncr53c9x_intr+0x12bc
   ncr53c9x_intr(0x404000e6, 0xf02110c8, 0x2ff, 0xf5fb7000, 0xf089cf90,
   0x4000) at sparc_interrupt44c+0x150
   sparc_interrupt44c(0xf01ce2ec, 0x1, 0x0, 0x0, 0x0, 0xffffffff) at xcall+0x3a0
   xcall(0x0, 0xf02ee688, 0x0, 0x182e99a, 0x0, 0xf028bab4) at updatepte4m+0x50
   updatepte4m(0xf74a2000, 0x182e900, 0x1, 0xf06fdb80, 0x2, 0xf7250a60) at
   pmap_kenter_pa4m+0xcc
   pmap_kenter_pa4m(0x4000, 0xf727fe28, 0xf09e0300, 0xf0205400, 0xf023b800,
   0xf023d000) at sosend_loan+0x1e0
   sosend_loan(0xf0aaef10, 0x0, 0xf727fe28, 0xf09e0300, 0x0, 0x0) at sosend+0x5c0
   sosend(0xf718f158, 0xf718f180, 0xf727fe28, 0xf0ace700, 0x1, 0xf012edb0) at
   soo_write+0x20
   soo_write(0xf7250a60, 0x1, 0xf718f158, 0x4000, 0x4000, 0xf718f180) at
   dofilewrite+0x8c
   dofilewrite(0xf7250a60, 0xf727ff28, 0xf727ff20, 0xf012b274, 0xf727ff28,
   0xf727ff20) at sys_write+0x70
   sys_write(0x4, 0xf727ffb0, 0x0, 0x3991, 0x37e2c, 0xfffffffe) at syscall+0x1f4
   syscall(0x1, 0x8d1d8, 0x4000, 0xf0002000, 0x3, 0xf06fdb80) at _syscall+0xcc
   db{0}>
   
   
   Typing 'c' a few times would get the box going again, but it returned to
   the debugger a few minutes  later so I finally rebooted to a 1.6_STABLE
   kernel.
   
   Are these 'debugger' breaks really serious, or more of a diagnostics issue?

LOCKDEBUG is a diagnostic checking tool.  the debugger break here is
because a bad condition occurred that _should never happen_ and so it
dropped to ddb so you could maybe figure out why...  the issues are that
(a) sometimes IPIs don't work and (b) this causes locks to get broken
somehow, maybe...

i've spent a bunch of time trying to figure this out but i haven't yet.


.mrg.