Subject: Re: PR/34293 CVS commit: src/sys/dev
To: None <kern-bug-people@netbsd.org, gnats-admin@netbsd.org,>
From: YAMAMOTO Takashi <yamt@mwd.biglobe.ne.jp>
List: netbsd-bugs
Date: 09/10/2006 06:45:03
The following reply was made to PR kern/34293; it has been noted by GNATS.

From: yamt@mwd.biglobe.ne.jp (YAMAMOTO Takashi)
To: mlelstv@serpens.de
Cc: gnats-bugs@NetBSD.org, kern-bug-people@NetBSD.org,
	gnats-admin@NetBSD.org, netbsd-bugs@NetBSD.org
Subject: Re: PR/34293 CVS commit: src/sys/dev
Date: Sun, 10 Sep 2006 15:40:06 +0900 (JST)

 --NextPart-20060910152350-0213200
 Content-Type: Text/Plain; charset=us-ascii
 
 > >  >  If I remember previous discussions there was some opposition in
 > >  >  restricting writes this way. Please check the discussions about
 > >  >  untarring source trees and waiting for X11 to respond again.
 > >  
 > >  can you provide a pointer?
 > 
 > There is http://mail-index.netbsd.org/current-users/2004/08/30/0033.html,
 > but there are earlier threads I have to look up.
 
 thanks.
 
 > If you limit dirty buffers to 1/2 of total buffer cache you will
 > still deadlock. It doesn't matter what limit you put on dirty
 > buffers. Whenever that limit it reached then vnd cannot process
 > them because for doing so it would have to exceed that limit. You
 > must make sure that the filesystem that vnd is calling through
 > VOP_STRATEGY _can_ allocate new buffers, even those that it
 > can use for writing.
 
 which filesystem are you talking about?
 
 > If my patch doesn't make sense, then try to understand why it works.
 
 now i've tried your patch.  it didn't work for me.
 vndthread just got deadlock on "getnewbuf" without your
 throttling code triggered at all.
 
 	(/bigfile is 512M file w/o holes on ufs.)
 	vnconfig vnd0 /bigfile
 	newfs -F /dev/rvnd0d
 	mount /dev/vnd0d /mnt
 	dd if=/dev/zero of=/mnt/a bs=1m seek=1m  (ENOSPC)
 	vnconfig vnd1 /mnt/a
 	dd if=/dev/zero of=/dev/vnd1d bs=1m seek=1m  (deadlock)
 
 (i can't follow your recipe in the PR because i don't have 10G disk space.)
 
 > No, if the blocks are allocated for another filesystem or another device,
 > it is perfectly possible for these to be processed. So you only want
 > to avoid blocks to be dirtied that end on the device that cannot
 > process them or at least cannot process them fast enough.
 
 yes, it's what i meant.  sorry if it wasn't clear.
 
 > >  "the writer" here includes attempts of cleaning buffers.
 > >  it just introduces another deadlock.
 > 
 > The writer does never clean blocks. The writer can only create
 > more dirty blocks. The blocks are "clean" again when they
 > are processed by the device, i.e. in some iodone routine
 > which is called in an interrupt. How should a sleeping process
 > prevent that? In particular, at that time the writer is stopped
 > in vndstrategy, we definitely know that the vndthread _is_ busy
 > cleaning buffers, because that is the stop condition. We therefore
 > make _sure_ that it can do so.
 
 i think that our definitions of "writer" are different.
 mine is "callers of vndstrategy."
 yours is "callers of write(2)".  right?
 
 i've got tired of this discussion and made a proof-of-concept
 patch. (attached)  i hope it's a clearer explanation. :)
 
 YAMAMOTO Takashi
 
 --NextPart-20060910152350-0213200
 Content-Type: Text/Plain; charset=us-ascii
 Content-Disposition: attachment; filename="a.diff"
 
 Index: sys/buf.h
 ===================================================================
 --- sys/buf.h	(revision 1786)
 +++ sys/buf.h	(working copy)
 @@ -213,11 +213,13 @@ do {									\
  #define	B_WRITE		0x00000000	/* Write buffer (pseudo flag). */
  #define	B_XXX		0x02000000	/* Debugging flag. */
  #define	B_VFLUSH	0x04000000	/* Buffer is being synced. */
 +#define	B_DIRTYACC	0x08000000	/* */
  
  #define BUF_FLAGBITS \
      "\20\1AGE\3ASYNC\4BAD\5BUSY\6SCANNED\7CALL\10DELWRI" \
      "\11DIRTY\12DONE\14ERROR\15GATHERED\16INVAL\17LOCKED\20NOCACHE" \
 -    "\22CACHE\23PHYS\24RAW\25READ\26TAPE\30WANTED\32XXX\33VFLUSH"
 +    "\22CACHE\23PHYS\24RAW\25READ\26TAPE\30WANTED\32XXX\33VFLUSH" \
 +    "\34DIRTYACC"
  
  
  /*
 @@ -283,6 +285,11 @@ struct buf *getblk(struct vnode *, daddr
  struct buf *geteblk(int);
  struct buf *incore(struct vnode *, daddr_t);
  
 +struct buf *getblkx(struct vnode *, daddr_t, int, int, int, int);
 +
 +#define	BUFF_READ	1
 +#define	BUFF_NOALLOC	2
 +
  void	minphys(struct buf *);
  int	physio(void (*)(struct buf *), struct buf *, dev_t, int,
  	       void (*)(struct buf *), struct uio *);
 Index: kern/vfs_bio.c
 ===================================================================
 --- kern/vfs_bio.c	(revision 1786)
 +++ kern/vfs_bio.c	(working copy)
 @@ -116,6 +116,7 @@ __KERNEL_RCSID(0, "$NetBSD: vfs_bio.c,v 
  u_int	nbuf;			/* XXX - for softdep_lockedbufs */
  u_int	bufpages = BUFPAGES;	/* optional hardwired count */
  u_int	bufcache = BUFCACHE;	/* max % of RAM to use for buffer cache */
 +static size_t buf_dirtybytes;
  
  /* Function prototypes */
  struct bqueue;
 @@ -127,7 +128,7 @@ static void bufpool_page_free(struct poo
  static inline struct buf *bio_doread(struct vnode *, daddr_t, int,
      kauth_cred_t, int);
  static struct buf *getnewbuf(int, int, int);
 -static int buf_lotsfree(void);
 +static int buf_lotsfree(int);
  static int buf_canrelease(void);
  static inline u_long buf_mempoolidx(u_long);
  static inline u_long buf_roundsize(u_long);
 @@ -139,6 +140,7 @@ int count_lock_queue(void); /* XXX */
  #ifdef DEBUG
  static int checkfreelist(struct buf *, struct bqueue *);
  #endif
 +static void allocbufx(struct buf *, int, int, int);
  
  /*
   * Definitions for the buffer hash lists.
 @@ -361,6 +363,35 @@ buf_memcalc(void)
  	return (n);
  }
  
 +static void
 +bufcache_dirtyacc_relse(struct buf *bp)
 +{
 +
 +	if ((bp->b_flags & (B_DIRTYACC | B_DELWRI)) == B_DIRTYACC) {
 +		KASSERT(buf_dirtybytes >= bp->b_bcount);
 +		bp->b_flags &= ~B_DIRTYACC;
 +		buf_dirtybytes -= bp->b_bcount;
 +	}
 +}
 +
 +static void
 +bufcache_dirtyacc_write(struct buf *bp)
 +{
 +
 +	if ((bp->b_flags & B_DIRTYACC) == 0) {
 +		bp->b_flags |= B_DIRTYACC;
 +		buf_dirtybytes += bp->b_bcount; /* XXX */
 +		KASSERT(buf_dirtybytes <= bufmem); /* XXX */
 +	}
 +}
 +
 +static boolean_t
 +bufcache_dirtyacc_needthrottle(int flags)
 +{
 +
 +	return (flags & BUFF_READ) == 0 && buf_dirtybytes > bufmem / 4 * 3;
 +}
 +
  /*
   * Initialize buffers and hash links for buffers.
   */
 @@ -430,7 +461,7 @@ bufinit(void)
  }
  
  static int
 -buf_lotsfree(void)
 +buf_lotsfree(int flags)
  {
  	int try, thresh;
  	struct lwp *l = curlwp;
 @@ -439,6 +470,9 @@ buf_lotsfree(void)
  	if (l->l_flag & L_COWINPROGRESS)
  		return 1;
  
 +	if (bufcache_dirtyacc_needthrottle(flags))
 +		return 0;
 +
  	/* Always allocate if less than the low water mark. */
  	if (bufmem < bufmem_lowater)
  		return 1;
 @@ -729,6 +763,7 @@ bwrite(struct buf *bp)
  
  	s = splbio();
  	simple_lock(&bp->b_interlock);
 +	bufcache_dirtyacc_write(bp);
  
  	wasdelayed = ISSET(bp->b_flags, B_DELWRI);
  
 @@ -821,6 +856,7 @@ bdwrite(struct buf *bp)
  		reassignbuf(bp, bp->b_vp);
  	}
  
 +	bufcache_dirtyacc_write(bp);
  	/* Otherwise, the "write" is done, so mark and release the buffer. */
  	CLR(bp->b_flags, B_DONE);
  	simple_unlock(&bp->b_interlock);
 @@ -842,6 +878,7 @@ bawrite(struct buf *bp)
  
  	KASSERT(ISSET(bp->b_flags, B_BUSY));
  
 +	bufcache_dirtyacc_write(bp);
  	SET(bp->b_flags, B_ASYNC);
  	simple_unlock(&bp->b_interlock);
  	splx(s);
 @@ -987,6 +1024,7 @@ already_queued:
  	CLR(bp->b_flags, B_AGE|B_ASYNC|B_BUSY|B_NOCACHE);
  	SET(bp->b_flags, B_CACHE);
  
 +	bufcache_dirtyacc_relse(bp);
  	/* Allow disk interrupts. */
  	simple_unlock(&bp->b_interlock);
  	simple_unlock(&bqueue_slock);
 @@ -1032,6 +1070,14 @@ incore(struct vnode *vp, daddr_t blkno)
  struct buf *
  getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
  {
 +
 +	return getblkx(vp, blkno, size, slpflag, slptimeo, 0);
 +}
 +
 +struct buf *
 +getblkx(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo,
 +    int flags)
 +{
  	struct buf *bp;
  	int s, err;
  	int preserve;
 @@ -1067,7 +1113,7 @@ start:
  		bremfree(bp);
  		preserve = 1;
  	} else {
 -		if ((bp = getnewbuf(slpflag, slptimeo, 0)) == NULL) {
 +		if ((bp = getnewbuf(slpflag, slptimeo, flags)) == NULL) {
  			simple_unlock(&bqueue_slock);
  			splx(s);
  			goto start;
 @@ -1126,12 +1172,23 @@ geteblk(int size)
   * start a write.  If the buffer grows, it's the callers
   * responsibility to fill out the buffer's additional contents.
   */
 +
  void
  allocbuf(struct buf *bp, int size, int preserve)
  {
 +
 +	allocbufx(bp, size, preserve, 0);
 +}
 +
 +static void
 +allocbufx(struct buf *bp, int size, int preserve, int flags)
 +{
  	vsize_t oldsize, desired_size;
  	caddr_t addr;
  	int s, delta;
 +	int dirtydelta = 0;
 +
 +	KASSERT((bp->b_flags & B_BUSY) != 0);
  
  	desired_size = buf_roundsize(size);
  	if (desired_size > MAXBSIZE)
 @@ -1159,10 +1216,19 @@ allocbuf(struct buf *bp, int size, int p
  	 * Update overall buffer memory counter (protected by bqueue_slock)
  	 */
  	delta = (long)desired_size - (long)oldsize;
 -
 +	if ((bp->b_flags & B_DIRTYACC) != 0) {
 +		dirtydelta -= oldsize;
 +		bp->b_flags &= ~B_DIRTYACC;
 +	}
 +	if ((flags & BUFF_READ) == 0) {
 +		dirtydelta += desired_size;
 +		bp->b_flags |= B_DIRTYACC;
 +	}
  	s = splbio();
  	simple_lock(&bqueue_slock);
 -	if ((bufmem += delta) > bufmem_hiwater) {
 +	bufmem += delta;
 +	buf_dirtybytes += dirtydelta;
 +	if (bufmem > bufmem_hiwater) {
  		/*
  		 * Need to trim overall memory usage.
  		 */
 @@ -1185,6 +1251,34 @@ allocbuf(struct buf *bp, int size, int p
  	splx(s);
  }
  
 +static struct buf *
 +bq_findvictim(struct bqueue *bq, int flags)
 +{
 +	struct buf *bp;
 +	const boolean_t throttledirty = bufcache_dirtyacc_needthrottle(flags);
 +
 +	TAILQ_FOREACH(bp, &bq->bq_queue, b_freelist) {
 +		simple_lock(&bp->b_interlock);
 +		if (!throttledirty || (bp->b_flags & B_DELWRI) != 0) {
 +			break;
 +		}
 +		simple_unlock(&bp->b_interlock);
 +	}
 +	return bp;
 +}
 +
 +static struct buf *
 +bufcache_findvictim(int flags)
 +{
 +	struct buf *bp;
 +
 +	bp = bq_findvictim(&bufqueues[BQ_AGE], flags);
 +	if (bp == NULL) {
 +		bp = bq_findvictim(&bufqueues[BQ_LRU], flags);
 +	}
 +	return bp;
 +}
 +
  /*
   * Find a buffer which is available for use.
   * Select something from a free list.
 @@ -1194,7 +1288,7 @@ allocbuf(struct buf *bp, int size, int p
   * Return buffer locked.
   */
  struct buf *
 -getnewbuf(int slpflag, int slptimeo, int from_bufq)
 +getnewbuf(int slpflag, int slptimeo, int flags)
  {
  	struct buf *bp;
  
 @@ -1205,7 +1299,7 @@ start:
  	 * Get a new buffer from the pool; but use NOWAIT because
  	 * we have the buffer queues locked.
  	 */
 -	if (!from_bufq && buf_lotsfree() &&
 +	if ((flags & BUFF_NOALLOC) == 0 && buf_lotsfree(flags) &&
  	    (bp = pool_get(&bufpool, PR_NOWAIT)) != NULL) {
  		memset((char *)bp, 0, sizeof(*bp));
  		BUF_INIT(bp);
 @@ -1219,15 +1313,11 @@ start:
  		return (bp);
  	}
  
 -	if ((bp = TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue)) != NULL ||
 -	    (bp = TAILQ_FIRST(&bufqueues[BQ_LRU].bq_queue)) != NULL) {
 -		simple_lock(&bp->b_interlock);
 +	bp = bufcache_findvictim(flags);
 +	if (bp != NULL) {
  		bremfree(bp);
  	} else {
 -		/*
 -		 * XXX: !from_bufq should be removed.
 -		 */
 -		if (!from_bufq || curproc != uvm.pagedaemon_proc) {
 +		if (curproc != uvm.pagedaemon_proc) {
  			/* wait for a free buffer of any kind */
  			needbuffer = 1;
  			ltsleep(&needbuffer, slpflag|(PRIBIO + 1),
 @@ -1305,10 +1395,11 @@ buf_trim(void)
  	long size = 0;
  
  	/* Instruct getnewbuf() to get buffers off the queues */
 -	if ((bp = getnewbuf(PCATCH, 1, 1)) == NULL)
 +	if ((bp = getnewbuf(PCATCH, 1, BUFF_NOALLOC | BUFF_READ)) == NULL)
  		return 0;
  
  	KASSERT(!ISSET(bp->b_flags, B_WANTED));
 +	KASSERT(!ISSET(bp->b_flags, B_DIRTYACC));
  	simple_unlock(&bp->b_interlock);
  	size = bp->b_bufsize;
  	bufmem -= size;
 Index: ufs/ufs/ufs_bmap.c
 ===================================================================
 --- ufs/ufs/ufs_bmap.c	(revision 1595)
 +++ ufs/ufs/ufs_bmap.c	(working copy)
 @@ -223,7 +223,8 @@ ufs_bmaparray(struct vnode *vp, daddr_t 
  			brelse(bp);
  
  		xap->in_exists = 1;
 -		bp = getblk(vp, metalbn, mp->mnt_stat.f_iosize, 0, 0);
 +		bp = getblkx(vp, metalbn, mp->mnt_stat.f_iosize, 0, 0,
 +		    BUFF_READ);
  		if (bp == NULL) {
  
  			/*
 
 --NextPart-20060910152350-0213200--