Subject: Re: buffer cache memory management revision
To: None <tech-kern@netbsd.org>
From: Paul Kranenburg <pk@cs.few.eur.nl>
List: tech-kern
Date: 11/23/2003 22:14:26
I frobbed vfs_bio.c some more. Here's the result for your inspection.
There's now support for small buffers of 1K and 2K in size, each
backed by its own pool.
The fixed `buf' array is gone. All buffers are allocated from the
`bufpool' pool. When a buffer's memory is taken away, its header is
returned to the pool, so there's no longer a queue of `empty'
buffers.
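To illustrate the small-buffer path: the size-class selection done by
buf_malloc()/buf_mrelease() in the diff below amounts to the sketch
here. This is a standalone userland mock-up, not kernel code; the
sample sizes are only for demonstration.

/*
 * Userland sketch of the buf_malloc()/buf_mrelease() size classes:
 * requests of up to 1K and 2K come from the dedicated buf1k/buf2k
 * pools, anything larger from the kernel map via uvm_km_alloc().
 * Sizes are rounded up to 1K multiples first, as buf_roundsize()
 * does in the patch.
 */
#include <stdio.h>

static unsigned long
buf_roundsize(unsigned long size)
{
	return (size + 1023) & ~1023UL;		/* round up to a 1K multiple */
}

static const char *
buf_backing(unsigned long size)
{
	if (size <= 1024)
		return "buf1k pool";
	if (size <= 2048)
		return "buf2k pool";
	return "uvm_km_alloc(kernel_map, ...)";
}

int
main(void)
{
	unsigned long sizes[] = { 512, 1024, 1536, 2048, 4096, 65536 };
	unsigned int i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("%6lu bytes -> %6lu rounded, from %s\n",
		    sizes[i], buf_roundsize(sizes[i]),
		    buf_backing(buf_roundsize(sizes[i])));
	return 0;
}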
Note that there's just a single knob to control how many buffers stay
around on the queues: the configured total amount of memory attached to
the buffers, which is still called `bufpages'.
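For reference, the sizing logic in the patched bufinit() reduces to
roughly this (again a standalone sketch; the 128MB machine and the
4K page size are assumptions picked for the example):

/*
 * Sketch of the bufpages sizing in the patched bufinit(): either the
 * BUFCACHE percentage of physical memory, or the traditional BSD
 * formula of 10% of the first 2MB and 5% of the rest, plus the
 * memory watermarks derived from it (bufpages +/- 1/8, in bytes).
 */
#include <stdio.h>

#define PAGE_SIZE	4096UL		/* assumed 4K pages for the example */
#define PAGE_SHIFT	12
#define btoc(x)		(((unsigned long)(x) + PAGE_SIZE - 1) >> PAGE_SHIFT)

int
main(void)
{
	unsigned long physmem = btoc(128UL * 1024 * 1024);	/* a 128MB machine */
	unsigned long bufcache = 0;	/* 0 = no BUFCACHE option, use the formula */
	unsigned long bufpages, hiwater, lowater;

	if (bufcache != 0) {
		/* options BUFCACHE: percentage of physical memory */
		bufpages = physmem / 100 * bufcache;
	} else if (physmem < btoc(2 * 1024 * 1024)) {
		/* tiny machine: 10% of everything */
		bufpages = physmem / 10;
	} else {
		/* 10% of the first 2MB plus 5% of the rest */
		bufpages = (btoc(2 * 1024 * 1024) + physmem) / 20;
	}

	/* watermarks: bufpages +/- 1/8, expressed in bytes */
	hiwater = (bufpages + (bufpages >> 3)) << PAGE_SHIFT;
	lowater = (bufpages - (bufpages >> 3)) << PAGE_SHIFT;

	printf("bufpages %lu (%lu KB), hiwater %lu KB, lowater %lu KB\n",
	    bufpages, (bufpages * PAGE_SIZE) / 1024,
	    hiwater / 1024, lowater / 1024);
	return 0;
}

With BUFCACHE set in the kernel config the percentage wins; otherwise
the traditional BSD formula applies, and the hi/lo watermarks bracket
the configured amount by 1/8 in either direction.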
-pk
Index: vfs_bio.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vfs_bio.c,v
retrieving revision 1.97
diff -c -r1.97 vfs_bio.c
*** vfs_bio.c 8 Nov 2003 04:22:35 -0000 1.97
--- vfs_bio.c 23 Nov 2003 21:11:57 -0000
***************
*** 77,82 ****
--- 77,83 ----
* UNIX Operating System (Addison Welley, 1989)
*/
+ #include "opt_bufcache.h"
#include "opt_softdep.h"
#include <sys/cdefs.h>
***************
*** 84,89 ****
--- 85,91 ----
#include <sys/param.h>
#include <sys/systm.h>
+ #include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
***************
*** 96,101 ****
--- 98,121 ----
#include <miscfs/specfs/specdev.h>
+ #ifndef BUFPAGES
+ # define BUFPAGES 0
+ #endif
+
+ #ifdef BUFCACHE
+ # if (BUFCACHE < 5) || (BUFCACHE > 95)
+ # error BUFCACHE is not between 5 and 95
+ # endif
+ #else
+ /* Default to 10% of first 2MB and 5% of remaining. */
+ # define BUFCACHE 0
+ #endif
+
+ u_int nbuf = 0; /* XXX - for softdep_lockedbufs */
+ u_int bufpages = BUFPAGES; /* optional hardwired count */
+ u_int bufcache = BUFCACHE; /* % of RAM to use for buffer cache */
+
+
/* Macros to clear/set/test flags. */
#define SET(t, f) (t) |= (f)
#define CLR(t, f) (t) &= ~(f)
***************
*** 142,147 ****
--- 162,175 ----
*/
struct pool bufpool;
+ /* Small buffer memory pools */
+ static struct pool buf1k, buf2k;
+
+ /* Buffer memory management variables */
+ u_long bufmem_hiwater;
+ u_long bufmem_lowater;
+ u_long bufmem;
+
/*
* bread()/breadn() helper.
*/
***************
*** 187,226 ****
void
bufinit()
{
- struct buf *bp;
struct bqueues *dp;
- u_int i, base, residual;
/*
! * Initialize the buffer pool. This pool is used for buffers
! * which are strictly I/O control blocks, not buffer cache
! * buffers.
*/
pool_init(&bufpool, sizeof(struct buf), 0, 0, 0, "bufpl", NULL);
for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
TAILQ_INIT(dp);
! bufhashtbl = hashinit(nbuf, HASH_LIST, M_CACHE, M_WAITOK, &bufhash);
! base = bufpages / nbuf;
! residual = bufpages % nbuf;
! for (i = 0; i < nbuf; i++) {
! bp = &buf[i];
! memset((char *)bp, 0, sizeof(*bp));
! BUF_INIT(bp);
! bp->b_dev = NODEV;
! bp->b_vnbufs.le_next = NOLIST;
! bp->b_data = buffers + i * MAXBSIZE;
! if (i < residual)
! bp->b_bufsize = (base + 1) * PAGE_SIZE;
! else
! bp->b_bufsize = base * PAGE_SIZE;
! bp->b_flags = B_INVAL;
! dp = bp->b_bufsize ? &bufqueues[BQ_AGE] : &bufqueues[BQ_EMPTY];
! binsheadfree(bp, dp);
! binshash(bp, &invalhash);
}
}
static __inline struct buf *
bio_doread(vp, blkno, size, cred, async)
struct vnode *vp;
--- 215,307 ----
void
bufinit()
{
struct bqueues *dp;
/*
! * Initialize the buffer pools.
*/
pool_init(&bufpool, sizeof(struct buf), 0, 0, 0, "bufpl", NULL);
+ pool_init(&buf1k, 1024, 0, 0, 0, "buf1k", NULL);
+ pool_init(&buf2k, 2048, 0, 0, 0, "buf2k", NULL);
for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
TAILQ_INIT(dp);
!
! /*
! * Determine how many buffers to allocate.
! *
! * - If bufcache is specified, use that % of memory
! * for the buffer cache.
! *
! * - Otherwise, we default to the traditional BSD
! * formula of 10% of the first 2MB and 5% of
! * the remaining.
! */
! if (bufpages == 0) {
! if (bufcache != 0) {
! if (bufcache < 5 || bufcache > 95)
! panic("bufcache is out of range (%d)",
! bufcache);
! bufpages = physmem / 100 * bufcache;
! } else {
! if (physmem < btoc(2 * 1024 * 1024))
! bufpages = physmem / 10;
! else
! bufpages = (btoc(2 * 1024 * 1024) + physmem) /
! 20;
! }
}
+
+ nbuf = bufpages; /* XXX - for softdep_lockedbufs */
+
+ /*
+ * Estimate hash table size based on the amount of memory we
+ * intend to use for the buffer cache and the distribution
+ * of that memory among small and large buffers.
+ * Assume that, on average, there will be about the same number
+ * of small and large buffers on the queues. This assumption
+ * seems ok for typical FFS filesystems but not for NFS which
+ * mostly uses large buffers.
+ */
+ bufhashtbl = hashinit(2 * bufpages, HASH_LIST, M_CACHE, M_WAITOK, &bufhash);
+
+ bufmem_hiwater = (bufpages + (bufpages >> 3)) << PAGE_SHIFT;
+ bufmem_lowater = (bufpages - (bufpages >> 3)) << PAGE_SHIFT;
+ bufmem = 0;
}
+
+ /*
+ * Buffer memory allocation helper functions
+ */
+ static __inline__ u_long buf_roundsize(u_long size)
+ {
+ return (size + 1023) & -1024;
+ }
+
+ static caddr_t buf_malloc(size_t size)
+ {
+ if (size <= 1024) {
+ return pool_get(&buf1k, PR_WAITOK);
+ } else if (size <= 2048) {
+ return pool_get(&buf2k, PR_WAITOK);
+ } else
+ return (caddr_t)uvm_km_alloc(kernel_map, size);
+ }
+
+ static void buf_mrelease(caddr_t addr, size_t size)
+ {
+ if (size <= 1024) {
+ pool_put(&buf1k, addr);
+ return;
+ } else if (size <= 2048) {
+ pool_put(&buf2k, addr);
+ return;
+ } else
+ uvm_km_free(kernel_map, (vaddr_t)addr, size);
+ }
+
+
static __inline struct buf *
bio_doread(vp, blkno, size, cred, async)
struct vnode *vp;
***************
*** 596,602 ****
}
if (bp->b_bufsize <= 0)
/* no data */
! bufq = &bufqueues[BQ_EMPTY];
else
/* invalid data */
bufq = &bufqueues[BQ_AGE];
--- 677,683 ----
}
if (bp->b_bufsize <= 0)
/* no data */
! goto already_queued;
else
/* invalid data */
bufq = &bufqueues[BQ_AGE];
***************
*** 639,644 ****
--- 720,727 ----
/* Allow disk interrupts. */
simple_unlock(&bp->b_interlock);
simple_unlock(&bqueue_slock);
+ if (bp->b_bufsize <= 0)
+ pool_put(&bufpool, bp);
splx(s);
}
***************
*** 682,687 ****
--- 765,771 ----
{
struct buf *bp;
int s, err;
+ int preserve;
start:
s = splbio();
***************
*** 711,718 ****
#endif
SET(bp->b_flags, B_BUSY);
bremfree(bp);
} else {
! if ((bp = getnewbuf(slpflag, slptimeo)) == NULL) {
simple_unlock(&bqueue_slock);
splx(s);
goto start;
--- 795,803 ----
#endif
SET(bp->b_flags, B_BUSY);
bremfree(bp);
+ preserve = 1;
} else {
! if ((bp = getnewbuf(slpflag, slptimeo, 0)) == NULL) {
simple_unlock(&bqueue_slock);
splx(s);
goto start;
***************
*** 721,726 ****
--- 806,812 ----
binshash(bp, BUFHASH(vp, blkno));
bp->b_blkno = bp->b_lblkno = bp->b_rawblkno = blkno;
bgetvp(vp, bp);
+ preserve = 0;
}
simple_unlock(&bp->b_interlock);
simple_unlock(&bqueue_slock);
***************
*** 732,738 ****
if (ISSET(bp->b_flags, B_LOCKED)) {
KASSERT(bp->b_bufsize >= size);
} else {
! allocbuf(bp, size);
}
return (bp);
}
--- 818,824 ----
if (ISSET(bp->b_flags, B_LOCKED)) {
KASSERT(bp->b_bufsize >= size);
} else {
! allocbuf(bp, size, preserve);
}
return (bp);
}
***************
*** 749,755 ****
s = splbio();
simple_lock(&bqueue_slock);
! while ((bp = getnewbuf(0, 0)) == 0)
;
SET(bp->b_flags, B_INVAL);
--- 835,841 ----
s = splbio();
simple_lock(&bqueue_slock);
! while ((bp = getnewbuf(0, 0, 0)) == 0)
;
SET(bp->b_flags, B_INVAL);
***************
*** 757,763 ****
simple_unlock(&bqueue_slock);
simple_unlock(&bp->b_interlock);
splx(s);
! allocbuf(bp, size);
return (bp);
}
--- 843,849 ----
simple_unlock(&bqueue_slock);
simple_unlock(&bp->b_interlock);
splx(s);
! allocbuf(bp, size, 0);
return (bp);
}
***************
*** 770,863 ****
* responsibility to fill out the buffer's additional contents.
*/
void
! allocbuf(bp, size)
struct buf *bp;
int size;
{
! struct buf *nbp;
! vsize_t desired_size;
! int s;
! desired_size = round_page((vsize_t)size);
if (desired_size > MAXBSIZE)
! panic("allocbuf: buffer larger than MAXBSIZE requested");
! if (bp->b_bufsize == desired_size)
! goto out;
/*
! * If the buffer is smaller than the desired size, we need to snarf
! * it from other buffers. Get buffers (via getnewbuf()), and
! * steal their pages.
*/
! while (bp->b_bufsize < desired_size) {
! int amt;
! /* find a buffer */
! s = splbio();
! simple_lock(&bqueue_slock);
! while ((nbp = getnewbuf(0, 0)) == NULL)
! ;
!
! SET(nbp->b_flags, B_INVAL);
! binshash(nbp, &invalhash);
!
! simple_unlock(&nbp->b_interlock);
! simple_unlock(&bqueue_slock);
! splx(s);
!
! /* and steal its pages, up to the amount we need */
! amt = min(nbp->b_bufsize, (desired_size - bp->b_bufsize));
! pagemove((nbp->b_data + nbp->b_bufsize - amt),
! bp->b_data + bp->b_bufsize, amt);
! bp->b_bufsize += amt;
! nbp->b_bufsize -= amt;
!
! /* reduce transfer count if we stole some data */
! if (nbp->b_bcount > nbp->b_bufsize)
! nbp->b_bcount = nbp->b_bufsize;
! #ifdef DIAGNOSTIC
! if (nbp->b_bufsize < 0)
! panic("allocbuf: negative bufsize");
! #endif
! brelse(nbp);
! }
/*
! * If we want a buffer smaller than the current size,
! * shrink this buffer. Grab a buf head from the EMPTY queue,
! * move a page onto it, and put it on front of the AGE queue.
! * If there are no free buffer headers, leave the buffer alone.
*/
! if (bp->b_bufsize > desired_size) {
! s = splbio();
! simple_lock(&bqueue_slock);
! if ((nbp = TAILQ_FIRST(&bufqueues[BQ_EMPTY])) == NULL) {
! /* No free buffer head */
simple_unlock(&bqueue_slock);
! splx(s);
! goto out;
}
! /* No need to lock nbp since it came from the empty queue */
! bremfree(nbp);
! SET(nbp->b_flags, B_BUSY | B_INVAL);
simple_unlock(&bqueue_slock);
! splx(s);
!
! /* move the page to it and note this change */
! pagemove(bp->b_data + desired_size,
! nbp->b_data, bp->b_bufsize - desired_size);
! nbp->b_bufsize = bp->b_bufsize - desired_size;
! bp->b_bufsize = desired_size;
! nbp->b_bcount = 0;
!
! /* release the newly-filled buffer and leave */
! brelse(nbp);
}
out:
! bp->b_bcount = size;
}
/*
--- 856,938 ----
* responsibility to fill out the buffer's additional contents.
*/
void
! allocbuf(bp, size, preserve)
struct buf *bp;
int size;
+ int preserve;
{
! vsize_t oldsize, desired_size;
! caddr_t addr;
! int s, delta;
! desired_size = buf_roundsize(size);
if (desired_size > MAXBSIZE)
! printf("allocbuf: buffer larger than MAXBSIZE requested");
! bp->b_bcount = size;
!
! oldsize = bp->b_bufsize;
! if (oldsize == desired_size)
! return;
/*
! * If we want a buffer of a different size, re-allocate the
! * buffer's memory; copy old content only if needed.
*/
! addr = buf_malloc(desired_size);
! if (preserve)
! memcpy(addr, bp->b_data, MIN(oldsize,desired_size));
! if (bp->b_data != NULL)
! buf_mrelease(bp->b_data, oldsize);
! bp->b_data = addr;
! bp->b_bufsize = desired_size;
! /*
! * Update overall buffer memory counter (protected by bqueue_slock)
! */
! delta = (long)desired_size - (long)oldsize;
! s = splbio();
! simple_lock(&bqueue_slock);
! if ((bufmem += delta) < bufmem_hiwater)
! goto out;
/*
! * Need to trim overall memory usage.
*/
! while (bufmem > bufmem_lowater) {
! long size;
! int wanted;
!
! /* Instruct getnewbuf() to get buffers off the queues */
! if ((bp = getnewbuf(PCATCH,1,1)) == NULL)
! break;
! wanted = ISSET(bp->b_flags, B_WANTED);
! simple_unlock(&bp->b_interlock);
! if (wanted) {
! printf("buftrim: got WANTED buffer\n");
! SET(bp->b_flags, B_INVAL);
! binshash(bp, &invalhash);
simple_unlock(&bqueue_slock);
! brelse(bp);
! simple_lock(&bqueue_slock);
! break;
}
! size = bp->b_bufsize;
! bufmem -= size;
simple_unlock(&bqueue_slock);
! if (size > 0) {
! buf_mrelease(bp->b_data, size);
! bp->b_bcount = bp->b_bufsize = 0;
! }
! /* brelse() will return the buffer to the global buffer pool */
! brelse(bp);
! simple_lock(&bqueue_slock);
}
out:
! simple_unlock(&bqueue_slock);
! splx(s);
}
/*
***************
*** 869,882 ****
* Return buffer locked.
*/
struct buf *
! getnewbuf(slpflag, slptimeo)
! int slpflag, slptimeo;
{
struct buf *bp;
start:
LOCK_ASSERT(simple_lock_held(&bqueue_slock));
if ((bp = TAILQ_FIRST(&bufqueues[BQ_AGE])) != NULL ||
(bp = TAILQ_FIRST(&bufqueues[BQ_LRU])) != NULL) {
simple_lock(&bp->b_interlock);
--- 944,971 ----
* Return buffer locked.
*/
struct buf *
! getnewbuf(slpflag, slptimeo, from_bufq)
! int slpflag, slptimeo, from_bufq;
{
struct buf *bp;
start:
LOCK_ASSERT(simple_lock_held(&bqueue_slock));
+ /*
+ * Get a new buffer from the pool; but use NOWAIT because
+ * we have buffer queues locked.
+ */
+ if (bufmem < bufmem_hiwater && !from_bufq &&
+ (bp = pool_get(&bufpool, PR_NOWAIT)) != NULL) {
+ memset((char *)bp, 0, sizeof(*bp));
+ BUF_INIT(bp);
+ bp->b_dev = NODEV;
+ bp->b_vnbufs.le_next = NOLIST;
+ bp->b_flags = B_BUSY;
+ return (bp);
+ }
+
if ((bp = TAILQ_FIRST(&bufqueues[BQ_AGE])) != NULL ||
(bp = TAILQ_FIRST(&bufqueues[BQ_LRU])) != NULL) {
simple_lock(&bp->b_interlock);
***************
*** 889,894 ****
--- 978,986 ----
return (NULL);
}
+ if (bp->b_bufsize <= 0)
+ printf("buffer %p: on queue but no mem\n", bp);
+
if (ISSET(bp->b_flags, B_VFLUSH)) {
/*
* This is a delayed write buffer being flushed to disk. Make
***************
*** 1039,1044 ****
--- 1131,1211 ----
n++;
simple_unlock(&bqueue_slock);
return (n);
+ }
+
+ /*
+ * Wait for all buffers to complete I/O
+ * Return the number of "stuck" buffers.
+ */
+ int
+ buf_syncwait(void)
+ {
+ struct buf *bp;
+ int iter, nbusy, nbusy_prev = 0, dcount, s, ihash;
+
+ dcount = 10000;
+ for (iter = 0; iter < 20;) {
+ s = splbio();
+ simple_lock(&bqueue_slock);
+ nbusy = 0;
+ for (ihash = 0; ihash < bufhash+1; ihash++) {
+ LIST_FOREACH(bp, &bufhashtbl[ihash], b_hash) {
+ if ((bp->b_flags & (B_BUSY|B_INVAL|B_READ)) == B_BUSY)
+ nbusy++;
+ /*
+ * With soft updates, some buffers that are
+ * written will be remarked as dirty until other
+ * buffers are written.
+ */
+ if (bp->b_vp && bp->b_vp->v_mount
+ && (bp->b_vp->v_mount->mnt_flag & MNT_SOFTDEP)
+ && (bp->b_flags & B_DELWRI)) {
+ simple_lock(&bp->b_interlock);
+ bremfree(bp);
+ bp->b_flags |= B_BUSY;
+ nbusy++;
+ simple_unlock(&bp->b_interlock);
+ simple_unlock(&bqueue_slock);
+ bawrite(bp);
+ if (dcount-- <= 0) {
+ printf("softdep ");
+ goto fail;
+ }
+ simple_lock(&bqueue_slock);
+ }
+ }
+ }
+
+ simple_unlock(&bqueue_slock);
+ splx(s);
+
+ if (nbusy == 0)
+ break;
+ if (nbusy_prev == 0)
+ nbusy_prev = nbusy;
+ printf("%d ", nbusy);
+ tsleep(&nbusy, PRIBIO, "bflush",
+ (iter == 0) ? 1 : hz / 25 * iter);
+ if (nbusy >= nbusy_prev) /* we didn't flush anything */
+ iter++;
+ else
+ nbusy_prev = nbusy;
+ }
+
+ if (nbusy) {
+ fail:;
+ #if defined(DEBUG) || defined(DEBUG_HALT_BUSY)
+ printf("giving up\nPrinting vnodes for busy buffers\n");
+ for (ihash = 0; ihash < bufhash+1; ihash++) {
+ LIST_FOREACH(bp, &bufhashtbl[ihash], b_hash) {
+ if ((bp->b_flags & (B_BUSY|B_INVAL|B_READ)) == B_BUSY)
+ vprint(NULL, bp->b_vp);
+ }
+ }
+ #endif
+ }
+
+ return nbusy;
}
#ifdef DEBUG
Index: vfs_subr.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vfs_subr.c,v
retrieving revision 1.209
diff -c -r1.209 vfs_subr.c
*** vfs_subr.c 12 Nov 2003 20:38:24 -0000 1.209
--- vfs_subr.c 23 Nov 2003 21:12:01 -0000
***************
*** 2600,2607 ****
void
vfs_shutdown()
{
- struct buf *bp;
- int iter, nbusy, nbusy_prev = 0, dcount, s;
struct lwp *l = curlwp;
struct proc *p;
--- 2600,2605 ----
***************
*** 2621,2681 ****
sys_sync(l, NULL, NULL);
/* Wait for sync to finish. */
! dcount = 10000;
! for (iter = 0; iter < 20;) {
! nbusy = 0;
! for (bp = &buf[nbuf]; --bp >= buf; ) {
! if ((bp->b_flags & (B_BUSY|B_INVAL|B_READ)) == B_BUSY)
! nbusy++;
! /*
! * With soft updates, some buffers that are
! * written will be remarked as dirty until other
! * buffers are written.
! */
! if (bp->b_vp && bp->b_vp->v_mount
! && (bp->b_vp->v_mount->mnt_flag & MNT_SOFTDEP)
! && (bp->b_flags & B_DELWRI)) {
! s = splbio();
! simple_lock(&bqueue_slock);
! bremfree(bp);
! simple_unlock(&bqueue_slock);
! bp->b_flags |= B_BUSY;
! splx(s);
! nbusy++;
! bawrite(bp);
! if (dcount-- <= 0) {
! printf("softdep ");
! goto fail;
! }
! }
! }
! if (nbusy == 0)
! break;
! if (nbusy_prev == 0)
! nbusy_prev = nbusy;
! printf("%d ", nbusy);
! tsleep(&nbusy, PRIBIO, "bflush",
! (iter == 0) ? 1 : hz / 25 * iter);
! if (nbusy >= nbusy_prev) /* we didn't flush anything */
! iter++;
! else
! nbusy_prev = nbusy;
! }
! if (nbusy) {
! fail:
! #if defined(DEBUG) || defined(DEBUG_HALT_BUSY)
! printf("giving up\nPrinting vnodes for busy buffers\n");
! for (bp = &buf[nbuf]; --bp >= buf; )
! if ((bp->b_flags & (B_BUSY|B_INVAL|B_READ)) == B_BUSY)
! vprint(NULL, bp->b_vp);
!
#if defined(DDB) && defined(DEBUG_HALT_BUSY)
Debugger();
#endif
-
- #else /* defined(DEBUG) || defined(DEBUG_HALT_BUSY) */
printf("giving up\n");
- #endif /* defined(DEBUG) || defined(DEBUG_HALT_BUSY) */
return;
} else
printf("done\n");
--- 2619,2629 ----
sys_sync(l, NULL, NULL);
/* Wait for sync to finish. */
! if (buf_syncwait() != 0) {
#if defined(DDB) && defined(DEBUG_HALT_BUSY)
Debugger();
#endif
printf("giving up\n");
return;
} else
printf("done\n");