Subject: kern/19288: patches for softdep problems
To: None <gnats-bugs@gnats.netbsd.org>
From: None <stephen@degler.net>
List: netbsd-bugs
Date: 12/04/2002 23:10:45
>Number: 19288
>Category: kern
>Synopsis: patches for softdep problems
>Confidential: no
>Severity: serious
>Priority: high
>Responsible: kern-bug-people
>State: open
>Class: sw-bug
>Submitter-Id: net
>Arrival-Date: Wed Dec 04 20:11:00 PST 2002
>Closed-Date:
>Last-Modified:
>Originator: Stephen Degler
>Release: NetBSD 1.6K
>Organization:
Very little, at best.
>Environment:
System: NetBSD kashmir.degler.net 1.6K NetBSD 1.6K (KASHMIR-$Revision: 1.500 $) #1: Fri Sep 27 00:21:32 EDT 2002 sdegler@bauhaus.degler.net:/vol1/NetBSD/kernels/KASHMIR i386
Architecture: i386
Machine: i386
>Description:
While stress testing softdeps on ffs with dbench, I encountered a
series of problems. Dbench can be found at ftp://samba.org/pub/tridge/dbench
Problems encountered.
1) After running dbench with 64 children or more, sub processes are
hung in disk wait state, deadlocked on busy buffers. This is due to
softdep_flush_indir. I'm not exactly sure what's wrong with this
routine, but trivial efforts to fix it did not change the problem. I
ended up just removing this routine and arranging for
softdep_setup_allocindir_page to sleep until a reasonable amount of
bufcache was available.
2) After fixing #1 I got a panic from softdep_allocindir_page, because
it called softdep_setup_pagecache with the lock held, but
softdep_setup_pagecache called pool_get with PR_WAITOK and then was
put to sleep due to memory exhaustion. The next process then found the
softdep lock held and panicked. Reordering the code fixed this.
3) Similar to #2 but in softdep_setup_allocdirect.
Finally, the code still seems dangerous, calling pool_get(xxx,PR_WAITOK)
with splbio held is going to result in bad things when memory is low.
>How-To-Repeat:
build a current kernel with SOFTDEPS
Running the following script on a filesystem mounted with softdeps
should trigger the problems:
#!/bin/ksh
ulimit -p 1024
ulimit -n 1024
for i in 256 1 2 4 8 16 32 64 128
do
dbench $i
done
I was using a pc with 1Gb ram.
>Fix:
Index: ffs_softdep.c
===================================================================
RCS file: /cvsroot/syssrc/sys/ufs/ffs/ffs_softdep.c,v
retrieving revision 1.35
diff -u -r1.35 ffs_softdep.c
--- ffs_softdep.c 2002/09/27 15:38:04 1.35
+++ ffs_softdep.c 2002/12/05 03:29:58
@@ -57,7 +57,7 @@
#include <uvm/uvm.h>
struct pool sdpcpool;
-u_int softdep_lockedbufs;
+volatile u_int softdep_lockedbufs;
/*
* For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide.
@@ -192,7 +192,6 @@
#endif
void softdep_pageiodone __P((struct buf *));
void softdep_flush_vnode __P((struct vnode *, ufs_lbn_t));
-static void softdep_flush_indir __P((struct vnode *));
/*
* Exported softdep operations.
@@ -244,6 +243,8 @@
static struct lockit {
int lkt_spl;
volatile pid_t lkt_held;
+ void *lkt_addr;
+ void *lkt_addr2;
} lk = { 0, -1 };
static int lockcnt;
@@ -263,12 +264,15 @@
{
if (lk->lkt_held != -1) {
if (lk->lkt_held == CURPROC_PID)
- panic("softdep_lock: locking against myself");
+ panic("softdep_lock: locking against myself: prev caller: %p %p",lk->lkt_addr,lk->lkt_addr2);
else
- panic("softdep_lock: lock held by %d", lk->lkt_held);
+ panic("softdep_lock: lock held by %d: prev caller: %p %p",
+ lk->lkt_held,lk->lkt_addr,lk->lkt_addr2);
}
lk->lkt_spl = splbio();
lk->lkt_held = CURPROC_PID;
+ lk->lkt_addr = __builtin_return_address(0);
+ lk->lkt_addr2 = __builtin_return_address(1);
lockcnt++;
}
@@ -290,13 +294,15 @@
{
if (lk->lkt_held != -1) {
if (lk->lkt_held == CURPROC_PID)
- panic("softdep_lock_interlocked: locking against self");
+ panic("softdep_lock_interlocked: locking against myself: prev caller: %p %p",lk->lkt_addr,lk->lkt_addr2);
else
- panic("softdep_lock_interlocked: lock held by %d",
- lk->lkt_held);
+ panic("softdep_lock_interlocked: lock held by %d: prev caller: %p %p",
+ lk->lkt_held,lk->lkt_addr,lk->lkt_addr2);
}
lk->lkt_spl = s;
lk->lkt_held = CURPROC_PID;
+ lk->lkt_addr = __builtin_return_address(0);
+ lk->lkt_addr2 = __builtin_return_address(1);
lockcnt++;
}
@@ -1408,6 +1414,7 @@
}
LIST_REMOVE(newblk, nb_hash);
pool_put(&newblk_pool, newblk);
+ FREE_LOCK(&lk);
/*
* If we were not passed a bp to attach the dep to,
@@ -1423,6 +1430,7 @@
UVMHIST_LOG(ubchist, "bp = %p, size = %d -> %d",
bp, (int)oldsize, (int)newsize, 0);
}
+ ACQUIRE_LOCK(&lk);
WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
if (lbn >= NDADDR) {
/* allocating an indirect block */
@@ -1661,17 +1669,18 @@
/*
* If we are already holding "many" buffers busy (as the safe copies
- * of indirect blocks) flush the dependency for one of those before
- * potentially tying up more. otherwise we could fill the
- * buffer cache with busy buffers and deadlock.
- * XXXUBC I'm sure there's a better way to deal with this.
+ * of indirect blocks) wait for them to complete. This avoids a
+ * buffer cache deadlock
*/
- while (softdep_lockedbufs > nbuf >> 2) {
- softdep_flush_indir(ITOV(ip));
+ while (softdep_lockedbufs > (nbuf >> 2) ) {
+ (void) tsleep((void *)&softdep_lockedbufs, PRIBIO+1, "softdbufs", 0);
}
aip = newallocindir(ip, ptrno, newblkno, oldblkno);
+ if (nbp == NULL) {
+ nbp = softdep_setup_pagecache(ip, lbn, ip->i_fs->fs_bsize);
+ }
ACQUIRE_LOCK(&lk);
/*
* If we are allocating a directory page, then we must
@@ -1681,9 +1690,6 @@
if ((ip->i_ffs_mode & IFMT) == IFDIR &&
pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
- if (nbp == NULL) {
- nbp = softdep_setup_pagecache(ip, lbn, ip->i_fs->fs_bsize);
- }
WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
FREE_LOCK(&lk);
setup_allocindir_phase2(bp, ip, aip);
@@ -1796,6 +1802,9 @@
brelse(newindirdep->ir_savebp);
KDASSERT(softdep_lockedbufs != 0);
softdep_lockedbufs--;
+ if ( softdep_lockedbufs == nbuf >> 2 ) {
+ wakeup((void *)&softdep_lockedbufs);
+ }
}
WORKITEM_FREE(newindirdep, D_INDIRDEP);
}
@@ -2455,6 +2464,9 @@
brelse(bp);
KDASSERT(softdep_lockedbufs != 0);
softdep_lockedbufs--;
+ if ( softdep_lockedbufs == nbuf >> 2 ) {
+ wakeup((void *)&softdep_lockedbufs);
+ }
return (allerror);
}
@@ -3288,7 +3300,9 @@
brelse(indirdep->ir_savebp);
KDASSERT(softdep_lockedbufs != 0);
softdep_lockedbufs--;
-
+ if ( softdep_lockedbufs == nbuf >> 2 ) {
+ wakeup((void *)&softdep_lockedbufs);
+ }
/* inline expand WORKLIST_REMOVE(wk); */
wk->wk_state &= ~ONWORKLIST;
LIST_REMOVE(wk, wk_list);
@@ -5283,7 +5297,7 @@
{
struct vnode *vp = ITOV(ip);
struct buf *bp;
- int s;
+ /* int s; */
UVMHIST_FUNC("softdep_setup_pagecache"); UVMHIST_CALLED(ubchist);
/*
@@ -5294,9 +5308,9 @@
bp = softdep_lookup_pcbp(vp, lbn);
if (bp == NULL) {
- s = splbio();
+ /* s = splbio(); */
bp = pool_get(&sdpcpool, PR_WAITOK);
- splx(s);
+ /* splx(s); */
bp->b_vp = vp;
bp->b_lblkno = lbn;
@@ -5358,36 +5372,6 @@
return (NULL);
}
-
-/*
- * Flush some dependent page cache data for any vnode *except*
- * the one specified.
- * XXXUBC this is a horrible hack and it's probably not too hard to deadlock
- * even with this, but it's better than nothing.
- */
-
-static void
-softdep_flush_indir(vp)
- struct vnode *vp;
-{
- struct buf *bp;
- int i;
-
- for (i = 0; i < PCBPHASHSIZE; i++) {
- LIST_FOREACH(bp, &pcbphashhead[i], b_hash) {
- if (bp->b_vp == vp ||
- LIST_FIRST(&bp->b_dep)->wk_type != D_ALLOCINDIR) {
- continue;
- }
-
- VOP_FSYNC(bp->b_vp, curproc->p_ucred, FSYNC_WAIT, 0, 0,
- curproc);
- return;
- }
- }
- printf("softdep_flush_indir: nothing to flush?\n");
-}
-
static struct buf *
softdep_lookup_pcbp(vp, lbn)
>Release-Note:
>Audit-Trail:
>Unformatted: