Subject: Re: buffer cache memory management revision
To: None <pk@cs.few.eur.nl>
From: YAMAMOTO Takashi <yamt@mwd.biglobe.ne.jp>
List: tech-kern
Date: 11/20/2003 18:44:33
--NextPart-20031120184112-0180700
Content-Type: Text/Plain; charset=us-ascii
hi,
> Therefore, I propose to revisit the age-old memory management still
> employed by the buffer cache. In particular, I'd like to get rid of
> the MAXBSIZE reservation of virtual memory per buffer which is sparsely
> mapped by a privately managed pool of physical pages. Currently, this
> scheme stresses MMU resources on some platforms like sun4 & sun4c.
> It also wastes a large amount of kernel VM space on machines with lots of
> physical memory when the default buffer cache parameters are in use.
this reminds me of my old patches.. (attached)
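(for reference, the consumer-side pattern the attached bufcache6.diff
ends up with looks roughly like the sketch below; xxstrategy is a
hypothetical driver routine, the other names come from the diff.
buffers carry a page list (B_PAGES) and are only mapped into kernel
VA (B_MAPPED) when someone actually has to dereference b_data.)

#include <sys/buf.h>	/* struct buf, devbufmap, buf_mapin(), BUF_IS_ADDRESSABLE() as patched */

static void
xxstrategy(struct buf *bp)
{

	/*
	 * map the buffer only if a kernel-addressable b_data is
	 * really needed; otherwise it stays as a bare page list.
	 */
	if (!BUF_IS_ADDRESSABLE(bp)) {
		if (bp->b_map == NULL) {
			/* attach the generic device buffer mapper */
			bp->b_map = &devbufmap;
		}
		buf_mapin(bp);	/* allocate kva and map bp->b_pages */
	}

	/* ... hand the buffer to the hardware as before ... */
}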
YAMAMOTO Takashi
--NextPart-20031120184112-0180700
Content-Type: Text/Plain; charset=us-ascii
Content-Disposition: attachment; filename="bufcache6.diff"
Index: miscfs/specfs/spec_vnops.c
===================================================================
--- miscfs/specfs/spec_vnops.c (revision 283)
+++ miscfs/specfs/spec_vnops.c (working copy)
@@ -581,11 +581,35 @@ spec_strategy(v)
bp = ap->a_bp;
if (!(bp->b_flags & B_READ) &&
- (LIST_FIRST(&bp->b_dep)) != NULL && bioops.io_start)
+ (LIST_FIRST(&bp->b_dep)) != NULL && bioops.io_start) {
+ /*
+ * XXX softdep needs kernel-addressable buffers.
+ * XXX this should be pushed into each filesystem.
+ */
+ if (!BUF_IS_ADDRESSABLE(bp)) {
+ if (bp->b_map == NULL) {
+ /* attach */
+ bp->b_map = &devbufmap;
+ }
+ buf_mapin(bp);
+ }
(*bioops.io_start)(bp);
+ }
bdev = bdevsw_lookup(bp->b_dev);
- if (bdev != NULL)
+ if (bdev != NULL) {
+ /*
+ * XXX currently all drivers need kernel-addressable buffers.
+ * XXX this should be pushed into each driver.
+ */
+ if (!BUF_IS_ADDRESSABLE(bp)) {
+ if (bp->b_map == NULL) {
+ /* attach */
+ bp->b_map = &devbufmap;
+ }
+ buf_mapin(bp);
+ }
(*bdev->d_strategy)(bp);
+ }
return (0);
}
Index: conf/files
===================================================================
--- conf/files (revision 282)
+++ conf/files (working copy)
@@ -1157,6 +1157,7 @@ file kern/uipc_socket2.c
file kern/uipc_syscalls.c
file kern/uipc_usrreq.c
file kern/vfs_bio.c
+file kern/vfs_bufmap.c
file kern/vfs_cache.c
file kern/vfs_getcwd.c
file kern/vfs_init.c
Index: ufs/lfs/lfs_segment.c
===================================================================
--- ufs/lfs/lfs_segment.c (revision 266)
+++ ufs/lfs/lfs_segment.c (working copy)
@@ -101,6 +101,8 @@ __KERNEL_RCSID(0, "$NetBSD: lfs_segment.
#include <uvm/uvm.h>
#include <uvm/uvm_extern.h>
+void bufcache_notemappedfree(struct buf *); /* XXX */
+
MALLOC_DEFINE(M_SEGMENT, "LFS segment", "Segment for LFS");
extern int count_lock_queue(void);
@@ -1756,6 +1758,11 @@ lfs_writeseg(struct lfs *fs, struct segm
newbp = lfs_newbuf(fs, bp->b_vp, bp->b_lblkno,
bp->b_bcount, LFS_NB_IBLOCK);
newbp->b_blkno = bp->b_blkno;
+ /* XXX should use our own map? */
+ if (!BUF_IS_ADDRESSABLE(bp)) {
+ buf_mapin(bp);
+ bufcache_notemappedfree(bp);
+ }
memcpy(newbp->b_data, bp->b_data,
newbp->b_bcount);
@@ -1869,6 +1876,11 @@ lfs_writeseg(struct lfs *fs, struct segm
} else
#endif /* LFS_USE_B_INVAL */
{
+ /* XXX should use our own map? */
+ if (!BUF_IS_ADDRESSABLE(bp)) {
+ buf_mapin(bp);
+ bufcache_notemappedfree(bp);
+ }
memcpy(dp, (*bpp)->b_data + byteoffset,
el_size);
}
Index: ufs/lfs/lfs_bio.c
===================================================================
--- ufs/lfs/lfs_bio.c (revision 266)
+++ ufs/lfs/lfs_bio.c (working copy)
@@ -746,7 +746,7 @@ lfs_countlocked(int *count, long *bytes,
n++;
size += bp->b_bufsize;
#ifdef DEBUG_LOCKED_LIST
- if (n > nbuf)
+ if (n > nbufcache_min)
panic("lfs_countlocked: this can't happen: more"
" buffers locked than exist");
#endif
Index: ufs/lfs/lfs_vfsops.c
===================================================================
--- ufs/lfs/lfs_vfsops.c (revision 266)
+++ ufs/lfs/lfs_vfsops.c (working copy)
@@ -1296,12 +1296,12 @@ lfs_mountfs(struct vnode *devvp, struct
if (fs->lfs_ivnode->v_size / fs->lfs_bsize > LFS_MAX_BUFS) {
fs->lfs_flags |= LFS_WARNED;
printf("lfs_mountfs: please consider increasing NBUF to at least %lld\n",
- (long long)(fs->lfs_ivnode->v_size / fs->lfs_bsize) * (nbuf / LFS_MAX_BUFS));
+ (long long)(fs->lfs_ivnode->v_size / fs->lfs_bsize) * (nbufcache_min / LFS_MAX_BUFS));
}
if (fs->lfs_ivnode->v_size > LFS_MAX_BYTES) {
fs->lfs_flags |= LFS_WARNED;
printf("lfs_mountfs: please consider increasing BUFPAGES to at least %lld\n",
- (long long)fs->lfs_ivnode->v_size * bufpages / LFS_MAX_BYTES);
+ (long long)fs->lfs_ivnode->v_size * nbufcachepage_min / LFS_MAX_BYTES);
}
return (0);
@@ -1387,12 +1387,12 @@ lfs_unmount(struct mount *mp, int mntfla
" NBUF to at least %lld\n",
(long long)(fs->lfs_ivnode->v_size /
fs->lfs_bsize) *
- (long long)(nbuf / LFS_MAX_BUFS));
+ (long long)(nbufcache_min / LFS_MAX_BUFS));
if (fs->lfs_ivnode->v_size > LFS_MAX_BYTES)
printf("lfs_unmount: please consider increasing"
" BUFPAGES to at least %lld\n",
(long long)fs->lfs_ivnode->v_size *
- bufpages / LFS_MAX_BYTES);
+ nbufcachepage_min / LFS_MAX_BYTES);
}
/* Explicitly write the superblock, to update serial and pflags */
Index: ufs/lfs/lfs.h
===================================================================
--- ufs/lfs/lfs.h (revision 266)
+++ ufs/lfs/lfs.h (working copy)
@@ -109,11 +109,13 @@
#define PG_DELWRI PG_PAGER1 /* Local def for delayed pageout */
/* Resource limits */
-#define LFS_MAX_BUFS ((nbuf >> 2) - 10)
-#define LFS_WAIT_BUFS ((nbuf >> 1) - (nbuf >> 3) - 10)
-#define LFS_MAX_BYTES (((bufpages >> 2) - 10) * PAGE_SIZE)
-#define LFS_WAIT_BYTES (((bufpages >> 1) - (bufpages >> 3) - 10) \
- * PAGE_SIZE)
+#define LFS_MAX_BUFS ((nbufcache_min >> 2) - 10)
+#define LFS_WAIT_BUFS ((nbufcache_min >> 1) - (nbufcache_min >> 3) - 10)
+/* XXX must consider kva */
+#define LFS_MAX_BYTES \
+ (((nbufcachepage_min >> 2) - 10) * PAGE_SIZE)
+#define LFS_WAIT_BYTES \
+ (((nbufcachepage_min >> 1) - (nbufcachepage_min >> 3) - 10) * PAGE_SIZE)
#define LFS_MAX_DIROP ((desiredvnodes >> 2) + (desiredvnodes >> 3))
#define LFS_MAX_PAGES \
(((uvmexp.active + uvmexp.inactive + uvmexp.free) * uvmexp.filemin) >> 8)
Index: ufs/ext2fs/ext2fs_subr.c
===================================================================
--- ufs/ext2fs/ext2fs_subr.c (revision 266)
+++ ufs/ext2fs/ext2fs_subr.c (working copy)
@@ -121,6 +121,7 @@ ext2fs_checkoverlap(bp, ip)
struct buf *bp;
struct inode *ip;
{
+#if 0 /* XXX */
struct buf *ebp, *ep;
daddr_t start, last;
struct vnode *vp;
@@ -147,5 +148,6 @@ ext2fs_checkoverlap(bp, ip)
ep->b_blkno + btodb(ep->b_bcount) - 1);
panic("Disk buffer overlap");
}
+#endif
}
#endif
Index: ufs/ffs/ffs_softdep.c
===================================================================
--- ufs/ffs/ffs_softdep.c (revision 253)
+++ ufs/ffs/ffs_softdep.c (working copy)
@@ -5745,7 +5745,7 @@ softdep_trackbufs(int delta, boolean_t t
{
if (delta < 0) {
- if (softdep_lockedbufs < nbuf >> 2) {
+ if (softdep_lockedbufs < nbufcache_min >> 2) {
wakeup(&softdep_lockedbufs);
}
KASSERT(softdep_lockedbufs >= -delta);
@@ -5753,7 +5753,7 @@ softdep_trackbufs(int delta, boolean_t t
return;
}
- while (throttle && softdep_lockedbufs >= nbuf >> 2) {
+ while (throttle && softdep_lockedbufs >= nbufcache_min >> 2) {
speedup_syncer();
tsleep(&softdep_lockedbufs, PRIBIO, "softdbufs", 0);
}
Index: ufs/ffs/ffs_subr.c
===================================================================
--- ufs/ffs/ffs_subr.c (revision 266)
+++ ufs/ffs/ffs_subr.c (working copy)
@@ -201,6 +201,7 @@ ffs_checkoverlap(bp, ip)
struct buf *bp;
struct inode *ip;
{
+#if 0 /* XXX */
struct buf *ebp, *ep;
daddr_t start, last;
struct vnode *vp;
@@ -227,6 +228,7 @@ ffs_checkoverlap(bp, ip)
ep->b_blkno + btodb(ep->b_bcount) - 1);
panic("Disk buffer overlap");
}
+#endif
}
#endif /* _KERNEL && DIAGNOSTIC */
Index: kern/kern_allocsys.c
===================================================================
--- kern/kern_allocsys.c (revision 283)
+++ kern/kern_allocsys.c (working copy)
@@ -69,7 +69,6 @@
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_allocsys.c,v 1.24 2003/08/07 16:31:42 agc Exp $");
-#include "opt_bufcache.h"
#include "opt_sysv.h"
#include <sys/param.h>
@@ -85,29 +84,6 @@ __KERNEL_RCSID(0, "$NetBSD: kern_allocsy
#include <sys/shm.h>
#endif
#include <uvm/uvm_extern.h>
-/*
- * Declare these as initialized data so we can patch them.
- */
-#ifndef NBUF
-# define NBUF 0
-#endif
-
-#ifndef BUFPAGES
-# define BUFPAGES 0
-#endif
-
-#ifdef BUFCACHE
-# if (BUFCACHE < 5) || (BUFCACHE > 95)
-# error BUFCACHE is not between 5 and 95
-# endif
-#else
- /* Default to 10% of first 2MB and 5% of remaining. */
-# define BUFCACHE 0
-#endif
-
-u_int nbuf = NBUF;
-u_int bufpages = BUFPAGES; /* optional hardwired count */
-u_int bufcache = BUFCACHE; /* % of RAM to use for buffer cache */
/*
* Allocate space for system data structures. We are given
@@ -140,61 +116,5 @@ allocsys(caddr_t v, caddr_t (*mdcallback
ALLOCSYS(v, msqids, struct msqid_ds, msginfo.msgmni);
#endif
- /*
- * Determine how many buffers to allocate.
- *
- * - If bufcache is specified, use that % of memory
- * for the buffer cache.
- *
- * - Otherwise, we default to the traditional BSD
- * formula of 10% of the first 2MB and 5% of
- * the remaining.
- */
- if (bufpages == 0) {
- if (bufcache != 0) {
- if (bufcache < 5 || bufcache > 95)
- panic("bufcache is out of range (%d)",
- bufcache);
- bufpages = physmem / 100 * bufcache;
- } else {
- if (physmem < btoc(2 * 1024 * 1024))
- bufpages = physmem / 10;
- else
- bufpages = (btoc(2 * 1024 * 1024) + physmem) /
- 20;
- }
- }
-
-#ifdef DIAGNOSTIC
- if (bufpages == 0)
- panic("bufpages = 0");
-#endif
-
- /*
- * Call the mdcallback now; it may need to adjust bufpages.
- */
- if (mdcallback != NULL)
- v = mdcallback(v);
-
- /*
- * Ensure a minimum of 16 buffers.
- */
- if (nbuf == 0) {
- nbuf = bufpages;
- if (nbuf < 16)
- nbuf = 16;
- }
-
-#ifdef VM_MAX_KERNEL_BUF
- /*
- * XXX stopgap measure to prevent wasting too much KVM on
- * the sparsely filled buffer cache.
- */
- if (nbuf > VM_MAX_KERNEL_BUF / MAXBSIZE)
- nbuf = VM_MAX_KERNEL_BUF / MAXBSIZE;
-#endif
-
- ALLOCSYS(v, buf, struct buf, nbuf);
-
return (v);
}
Index: kern/vfs_bio.c
===================================================================
--- kern/vfs_bio.c (revision 283)
+++ kern/vfs_bio.c (working copy)
@@ -82,11 +82,14 @@
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_bio.c,v 1.93 2003/08/07 16:32:01 agc Exp $");
+#include "opt_bufcache.h"
+
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
+#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
@@ -96,6 +99,53 @@ __KERNEL_RCSID(0, "$NetBSD: vfs_bio.c,v
#include <miscfs/specfs/specdev.h>
+#include <machine/vmparam.h> /* VM_FREELIST_BUFCACHE */
+
+/*
+ * if the arch has a freelist that's preferable for bufcache, use it.
+ */
+#ifdef VM_FREELIST_BUFCACHE
+#define UVM_PGA_STRAT_BUFCACHE UVM_PGA_STRAT_FALLBACK
+#else /* VM_FREELIST_BUFCACHE */
+#define UVM_PGA_STRAT_BUFCACHE UVM_PGA_STRAT_NORMAL
+#define VM_FREELIST_BUFCACHE 0
+#endif /* VM_FREELIST_BUFCACHE */
+
+/*
+ * Declare these as initialized data so we can patch them.
+ */
+#ifndef NBUF
+# define NBUF 0
+#endif
+
+#ifndef BUFPAGES
+# define BUFPAGES 0
+#endif
+
+#ifdef BUFCACHE
+# if (BUFCACHE < 5) || (BUFCACHE > 95)
+# error BUFCACHE is not between 5 and 95
+# endif
+#else
+ /* Default to 10% of first 2MB and 5% of remaining. */
+# define BUFCACHE 0
+#endif
+
+/*
+ * lock for bufcache counts below.
+ */
+struct simplelock bufcache_count_slock = SIMPLELOCK_INITIALIZER;
+
+unsigned int nbufcache_min = NBUF;
+unsigned int nbufcache_max = NBUF * 2; /* XXX */
+unsigned int nbufcache;
+
+unsigned int nbufcachepage_min = BUFPAGES;
+unsigned int nbufcachepage_max = BUFPAGES * 2; /* XXX */
+unsigned int nbufcachepage;
+
+unsigned int bufcache = BUFCACHE; /* % of RAM to use for buffer cache */
+
/* Macros to clear/set/test flags. */
#define SET(t, f) (t) |= (f)
#define CLR(t, f) (t) &= ~(f)
@@ -105,7 +155,8 @@ __KERNEL_RCSID(0, "$NetBSD: vfs_bio.c,v
* Definitions for the buffer hash lists.
*/
#define BUFHASH(dvp, lbn) \
- (&bufhashtbl[(((long)(dvp) >> 8) + (int)(lbn)) & bufhash])
+ (&bufhashtbl[((((u_long)(uintptr_t)(dvp) >> PAGE_SHIFT) ^ \
+ ((u_long)(uintptr_t)(dvp) >> 3)) + (int)(lbn)) & bufhash])
LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
u_long bufhash;
#ifndef SOFTDEP
@@ -131,6 +182,10 @@ struct bio_ops bioops; /* I/O operation
TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
int needbuffer;
+TAILQ_HEAD(, buf) bufcache_iomap_lru;
+boolean_t bufcache_iomap_wanted;
+vsize_t bufcache_map_size;
+
/*
* Buffer queue lock.
* Take this lock first if also taking some buffer's b_interlock.
@@ -148,6 +203,26 @@ struct pool bufpool;
static __inline struct buf *bio_doread(struct vnode *, daddr_t, int,
struct ucred *, int);
int count_lock_queue(void);
+struct buf *getnewbuf(int, int, boolean_t);
+
+/*
+ */
+void bufcachemap_init(void);
+void bufcache_mapin(struct buf *);
+void bufcache_mapout(struct buf *);
+void bufcache_reclaimkva(struct bufmap *, vsize_t);
+int bufcache_allocpages(struct buf *, vsize_t);
+int bufcache_freepages(struct buf *, vsize_t);
+void bufcache_movepages(struct buf *, struct buf *, int);
+void bufcache_initparam(void);
+unsigned int bufcache_countfree(void);
+int bufcache_reclaim(int);
+#ifdef DEBUG
+void bufcache_debugdump(void);
+#endif
+struct bufmap bufcachemapper;
+
+void devbufmap_init(void); /* XXX */
/*
* Insq/Remq for the buffer free lists.
@@ -162,6 +237,9 @@ bremfree(bp)
{
struct bqueues *dp = NULL;
+ LOCK_ASSERT(simple_lock_held(&bqueue_slock));
+ LOCK_ASSERT(bp->b_bufsize == 0 || simple_lock_held(&bp->b_interlock));
+
/*
* We only calculate the head of the freelist when removing
* the last element of the list as that is the only time that
@@ -195,11 +273,24 @@ found:
#endif /* DEBUG_BUFCACHE */
KASSERT(dp == NULL || !(bp->b_flags & B_LOCKED) ||
- dp == &bufqueues[BQ_LOCKED]);
+ (bp->b_flags & B_INVAL) || dp == &bufqueues[BQ_LOCKED]);
+ KASSERT(dp == NULL || (bp->b_flags & B_LOCKED) ||
+ dp != &bufqueues[BQ_LOCKED]);
KASSERT(dp == NULL || bp->b_bufsize != 0 || dp == &bufqueues[BQ_EMPTY]);
KASSERT(dp == NULL || bp->b_bufsize == 0 || dp != &bufqueues[BQ_EMPTY]);
TAILQ_REMOVE(dp, bp, b_freelist);
+
+ /*
+ * Remove from the kva lru list.
+ */
+ if (bp->b_flags & B_MAPPED) {
+ KASSERT(bp->b_mappedlist.tqe_prev != NULL);
+ TAILQ_REMOVE(&bufcache_iomap_lru, bp, b_mappedlist);
+#ifdef DIAGNOSTIC
+ bp->b_mappedlist.tqe_prev = NULL;
+#endif
+ }
}
/*
@@ -213,33 +304,42 @@ bufinit()
u_int i, base, residual;
/*
- * Initialize the buffer pool. This pool is used for buffers
- * which are strictly I/O control blocks, not buffer cache
- * buffers.
+ * Initialize the buffer pool.
*/
pool_init(&bufpool, sizeof(struct buf), 0, 0, 0, "bufpl", NULL);
+ bufcache_initparam();
+ bufcachemap_init();
+ devbufmap_init();
for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
TAILQ_INIT(dp);
- bufhashtbl = hashinit(nbuf, HASH_LIST, M_CACHE, M_WAITOK, &bufhash);
- base = bufpages / nbuf;
- residual = bufpages % nbuf;
- for (i = 0; i < nbuf; i++) {
- bp = &buf[i];
- memset((char *)bp, 0, sizeof(*bp));
+ TAILQ_INIT(&bufcache_iomap_lru);
+ bufhashtbl =
+ hashinit(nbufcache_min, HASH_LIST, M_CACHE, M_WAITOK, &bufhash);
+ base = nbufcachepage_min / nbufcache_min;
+ residual = nbufcachepage_min % nbufcache_min;
+ for (i = 0; i < nbufcache_min; i++) {
+ vsize_t bufsize;
+
+ bp = pool_get(&bufpool, PR_NOWAIT);
+ memset(bp, 0, sizeof(*bp));
BUF_INIT(bp);
+ bp->b_map = &bufcachemapper;
bp->b_dev = NODEV;
bp->b_vnbufs.le_next = NOLIST;
- bp->b_data = buffers + i * MAXBSIZE;
if (i < residual)
- bp->b_bufsize = (base + 1) * PAGE_SIZE;
+ bufsize = (base + 1) * PAGE_SIZE;
else
- bp->b_bufsize = base * PAGE_SIZE;
- bp->b_flags = B_INVAL;
+ bufsize = base * PAGE_SIZE;
+ if (bufcache_allocpages(bp, bufsize))
+ panic("can't alloc buf page");
+ bp->b_flags = B_INVAL | B_PAGES;
dp = bp->b_bufsize ? &bufqueues[BQ_AGE] : &bufqueues[BQ_EMPTY];
binsheadfree(bp, dp);
binshash(bp, &invalhash);
+ nbufcache++;
}
+ KASSERT(nbufcache == nbufcache_min);
}
static __inline struct buf *
@@ -367,6 +467,7 @@ bwrite(bp)
struct mount *mp;
KASSERT(ISSET(bp->b_flags, B_BUSY));
+ KASSERT(ISSET(bp->b_flags, B_PAGES));
vp = bp->b_vp;
if (vp != NULL) {
@@ -471,6 +572,7 @@ bdwrite(bp)
int s;
KASSERT(ISSET(bp->b_flags, B_BUSY));
+ KASSERT(ISSET(bp->b_flags, B_PAGES));
/* If this is a tape block, write the block now. */
bdev = bdevsw_lookup(bp->b_dev);
@@ -512,6 +614,7 @@ bawrite(bp)
int s;
KASSERT(ISSET(bp->b_flags, B_BUSY));
+ KASSERT(ISSET(bp->b_flags, B_PAGES));
s = splbio();
simple_lock(&bp->b_interlock);
@@ -554,15 +657,16 @@ brelse(bp)
struct buf *bp;
{
struct bqueues *bufq;
+ boolean_t dofree = FALSE;
int s;
KASSERT(ISSET(bp->b_flags, B_BUSY));
+ KASSERT(ISSET(bp->b_flags, B_PAGES));
KASSERT(!ISSET(bp->b_flags, B_CALL));
/* Block disk interrupts. */
s = splbio();
simple_lock(&bqueue_slock);
- simple_lock(&bp->b_interlock);
/* Wake up any processes waiting for any buffer to become free. */
if (needbuffer) {
@@ -570,6 +674,13 @@ brelse(bp)
wakeup(&needbuffer);
}
+ if ((bp->b_flags & B_MAPPED) && bufcache_iomap_wanted) {
+ bufcache_iomap_wanted = FALSE;
+ wakeup(&bufcache_iomap_wanted);
+ }
+
+ simple_lock(&bp->b_interlock);
+
/* Wake up any proceeses waiting for _this_ buffer to become free. */
if (ISSET(bp->b_flags, B_WANTED)) {
CLR(bp->b_flags, B_WANTED|B_AGE);
@@ -598,11 +709,14 @@ brelse(bp)
CLR(bp->b_flags, B_VFLUSH);
if (!ISSET(bp->b_flags, B_ERROR|B_INVAL|B_LOCKED|B_AGE))
goto already_queued;
- else
- bremfree(bp);
+
+ bremfree(bp);
}
+ KASSERT(bp->b_mappedlist.tqe_prev == NULL);
+
if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
+ KASSERT(!(bp->b_flags & B_LOCKED));
/*
* If it's invalid or empty, dissociate it from its vnode
* and put on the head of the appropriate queue.
@@ -614,13 +728,25 @@ brelse(bp)
reassignbuf(bp, bp->b_vp);
brelvp(bp);
}
- if (bp->b_bufsize <= 0)
- /* no data */
- bufq = &bufqueues[BQ_EMPTY];
- else
+ if (bp->b_bufsize <= 0) {
+ simple_lock(&bufcache_count_slock);
+ if (nbufcache > nbufcache_min)
+ dofree = TRUE; /* put back to bufpool */
+ else
+ bufq = &bufqueues[BQ_EMPTY];
+ simple_unlock(&bufcache_count_slock);
+ } else {
/* invalid data */
+ /* XXX not worth caching unless B_MAPPED. */
bufq = &bufqueues[BQ_AGE];
- binsheadfree(bp, bufq);
+ if (bp->b_flags & B_MAPPED) {
+ KASSERT(bp->b_mappedlist.tqe_prev == NULL);
+ TAILQ_INSERT_HEAD(&bufcache_iomap_lru, bp,
+ b_mappedlist);
+ }
+ }
+ if (!dofree)
+ binsheadfree(bp, bufq);
} else {
/*
* It has valid data. Put it on the end of the appropriate
@@ -649,16 +775,29 @@ brelse(bp)
&bufqueues[BQ_AGE];
}
binstailfree(bp, bufq);
+ if (bp->b_flags & B_MAPPED) {
+ KASSERT(bp->b_mappedlist.tqe_prev == NULL);
+ TAILQ_INSERT_TAIL(&bufcache_iomap_lru, bp, b_mappedlist);
+ }
}
already_queued:
+ simple_unlock(&bqueue_slock);
/* Unlock the buffer. */
CLR(bp->b_flags, B_AGE|B_ASYNC|B_BUSY|B_NOCACHE);
SET(bp->b_flags, B_CACHE);
- /* Allow disk interrupts. */
simple_unlock(&bp->b_interlock);
- simple_unlock(&bqueue_slock);
+ if (dofree) {
+ KASSERT(bp->b_bufsize == 0);
+ bremhash(bp);
+ pool_put(&bufpool, bp);
+ simple_lock(&bufcache_count_slock);
+ KASSERT(nbufcache > nbufcache_min);
+ nbufcache--;
+ simple_unlock(&bufcache_count_slock);
+ }
+ /* Allow disk interrupts. */
splx(s);
}
@@ -732,7 +871,7 @@ start:
SET(bp->b_flags, B_BUSY);
bremfree(bp);
} else {
- if ((bp = getnewbuf(slpflag, slptimeo)) == NULL) {
+ if ((bp = getnewbuf(slpflag, slptimeo, TRUE)) == NULL) {
simple_unlock(&bqueue_slock);
splx(s);
goto start;
@@ -761,7 +900,7 @@ geteblk(size)
s = splbio();
simple_lock(&bqueue_slock);
- while ((bp = getnewbuf(0, 0)) == 0)
+ while ((bp = getnewbuf(0, 0, TRUE)) == 0)
;
SET(bp->b_flags, B_INVAL);
@@ -790,6 +929,9 @@ allocbuf(bp, size)
vsize_t desired_size;
int s;
+ KASSERT(bp->b_flags & B_PAGES);
+ KASSERT(0 <= size);
+
desired_size = round_page((vsize_t)size);
if (desired_size > MAXBSIZE)
panic("allocbuf: buffer larger than MAXBSIZE requested");
@@ -803,12 +945,15 @@ allocbuf(bp, size)
* steal their pages.
*/
while (bp->b_bufsize < desired_size) {
- int amt;
+ /* try to allocate new pages */
+ if (bufcache_allocpages(bp, desired_size) == 0)
+ break;
/* find a buffer */
s = splbio();
simple_lock(&bqueue_slock);
- while ((nbp = getnewbuf(0, 0)) == NULL)
+
+ while ((nbp = getnewbuf(0, 0, FALSE)) == NULL)
;
SET(nbp->b_flags, B_INVAL);
@@ -818,16 +963,19 @@ allocbuf(bp, size)
simple_unlock(&bqueue_slock);
splx(s);
+ if (nbp->b_bufsize == 0) {
+ /*
+ * race between bufcache_allocpages and getnewbuf.
+ * we don't want a buffer without pages.
+ */
+ printf("bufcache race\n");
+ brelse(nbp);
+ continue;
+ }
+
/* and steal its pages, up to the amount we need */
- amt = min(nbp->b_bufsize, (desired_size - bp->b_bufsize));
- pagemove((nbp->b_data + nbp->b_bufsize - amt),
- bp->b_data + bp->b_bufsize, amt);
- bp->b_bufsize += amt;
- nbp->b_bufsize -= amt;
-
- /* reduce transfer count if we stole some data */
- if (nbp->b_bcount > nbp->b_bufsize)
- nbp->b_bcount = nbp->b_bufsize;
+ bufcache_movepages(bp, nbp, desired_size - bp->b_bufsize);
+ KASSERT(bp->b_bufsize <= desired_size);
#ifdef DIAGNOSTIC
if (nbp->b_bufsize < 0)
@@ -836,6 +984,8 @@ allocbuf(bp, size)
brelse(nbp);
}
+ KASSERT(bp->b_bufsize >= desired_size);
+
/*
* If we want a buffer smaller than the current size,
* shrink this buffer. Grab a buf head from the EMPTY queue,
@@ -843,6 +993,9 @@ allocbuf(bp, size)
* If there are no free buffer headers, leave the buffer alone.
*/
if (bp->b_bufsize > desired_size) {
+ if (bufcache_freepages(bp, desired_size) == 0)
+ goto out;
+
s = splbio();
simple_lock(&bqueue_slock);
if ((nbp = TAILQ_FIRST(&bufqueues[BQ_EMPTY])) == NULL) {
@@ -852,16 +1005,15 @@ allocbuf(bp, size)
goto out;
}
/* No need to lock nbp since it came from the empty queue */
+ KASSERT(nbp->b_bufsize == 0);
+ KASSERT(!(nbp->b_flags & B_BUSY));
bremfree(nbp);
SET(nbp->b_flags, B_BUSY | B_INVAL);
simple_unlock(&bqueue_slock);
splx(s);
/* move the page to it and note this change */
- pagemove(bp->b_data + desired_size,
- nbp->b_data, bp->b_bufsize - desired_size);
- nbp->b_bufsize = bp->b_bufsize - desired_size;
- bp->b_bufsize = desired_size;
+ bufcache_movepages(nbp, bp, bp->b_bufsize - desired_size);
nbp->b_bcount = 0;
/* release the newly-filled buffer and leave */
@@ -870,6 +1022,8 @@ allocbuf(bp, size)
out:
bp->b_bcount = size;
+ if (!(bp->b_flags & B_MAPPED))
+ bufcache_mapin(bp);
}
/*
@@ -881,19 +1035,37 @@ out:
* Return buffer locked.
*/
struct buf *
-getnewbuf(slpflag, slptimeo)
+getnewbuf(slpflag, slptimeo, doalloc)
int slpflag, slptimeo;
+ boolean_t doalloc;
{
struct buf *bp;
start:
LOCK_ASSERT(simple_lock_held(&bqueue_slock));
- if ((bp = TAILQ_FIRST(&bufqueues[BQ_AGE])) != NULL ||
- (bp = TAILQ_FIRST(&bufqueues[BQ_LRU])) != NULL) {
- simple_lock(&bp->b_interlock);
- bremfree(bp);
- } else {
+ bp = TAILQ_FIRST(&bufqueues[BQ_AGE]);
+ if (doalloc && bp == NULL) {
+ simple_lock(&bufcache_count_slock);
+ if (nbufcache < nbufcache_max &&
+ nbufcachepage < nbufcachepage_max) {
+ nbufcache++;
+ simple_unlock(&bufcache_count_slock);
+ bp = pool_get(&bufpool, PR_NOWAIT);
+ memset(bp, 0, sizeof(*bp));
+ BUF_INIT(bp);
+ bp->b_map = &bufcachemapper;
+ simple_lock(&bp->b_interlock);
+ bp->b_flags = B_BUSY | B_PAGES;
+ bp->b_dev = NODEV;
+ return bp;
+ }
+ simple_unlock(&bufcache_count_slock);
+ }
+ if (bp == NULL)
+ bp = TAILQ_FIRST(&bufqueues[BQ_LRU]);
+
+ if (bp == NULL) {
/* wait for a free buffer of any kind */
needbuffer = 1;
ltsleep(&needbuffer, slpflag|(PRIBIO+1),
@@ -901,12 +1073,17 @@ start:
return (NULL);
}
+ simple_lock(&bp->b_interlock);
+ bremfree(bp);
+ KASSERT(bp->b_bufsize > 0);
+
if (ISSET(bp->b_flags, B_VFLUSH)) {
/*
* This is a delayed write buffer being flushed to disk. Make
* sure it gets aged out of the queue when it's finished, and
* leave it off the LRU queue.
*/
+ KASSERT(bp->b_flags & B_BUSY);
CLR(bp->b_flags, B_VFLUSH);
SET(bp->b_flags, B_AGE);
simple_unlock(&bp->b_interlock);
@@ -941,7 +1118,7 @@ start:
(*bioops.io_deallocate)(bp);
/* clear out various other fields */
- bp->b_flags = B_BUSY;
+ bp->b_flags = B_BUSY | (bp->b_flags & (B_PAGES | B_MAPPED));
bp->b_dev = NODEV;
bp->b_blkno = bp->b_lblkno = bp->b_rawblkno = 0;
bp->b_iodone = 0;
@@ -1022,6 +1199,8 @@ biodone(bp)
if (ISSET(bp->b_flags, B_CALL)) {
CLR(bp->b_flags, B_CALL); /* but note callout done */
simple_unlock(&bp->b_interlock);
+ if (bp->b_map != NULL)
+ buf_mapout(bp);
(*bp->b_iodone)(bp);
} else {
if (ISSET(bp->b_flags, B_ASYNC)) { /* if async, release */
@@ -1086,3 +1265,578 @@ vfs_bufstats()
}
}
#endif /* DEBUG */
+
+#include <uvm/uvm_iomap.h>
+unsigned int bufcachemap_size;
+
+struct uvm_iomap bufcache_iomap;
+
+void
+bufcachemap_init()
+{
+
+ memset(&bufcachemapper, 0, sizeof(bufcachemapper));
+ bufcachemapper.bm_mapin = bufcache_mapin;
+ bufcachemapper.bm_mapout = NULL;
+ bufcachemapper.bm_reclaim = bufcache_reclaimkva;
+ bufcachemapper.bm_iomap = &bufcache_iomap;
+
+ uvm_iomap_init(&bufcache_iomap, bufcache_map_size, round_page(MAXPHYS));
+}
+
+void
+bufcache_reclaimkva(struct bufmap *bmap, vsize_t size)
+{
+ int need = size;
+
+ KASSERT(bmap == &bufcachemapper);
+
+ do {
+ struct buf *victim;
+ int s;
+
+ /*
+ * pick a buffer from the top of kva lru list.
+ */
+ s = splbio();
+ simple_lock(&bqueue_slock);
+ victim = TAILQ_FIRST(&bufcache_iomap_lru);
+ if (victim == NULL) {
+#ifdef DEBUG
+ bufcache_debugdump();
+#endif
+ printf("no buf on kva lru; sleep\n");
+ bufcache_iomap_wanted = TRUE;
+ ltsleep(&bufcache_iomap_wanted,
+ (PRIBIO + 1) | PNORELOCK, "bufkva", 0,
+ &bqueue_slock);
+ splx(s);
+ printf("no buf on kva lru; woken\n");
+ continue;
+ }
+ /*
+ * lock the buffer and take it off the freelist.
+ */
+ simple_lock(&victim->b_interlock);
+ if (victim->b_flags & B_BUSY) {
+ simple_unlock(&bqueue_slock);
+ if (!(victim->b_flags & (B_VFLUSH|B_LOCKED)))
+ panic("%p: %lx\n",
+ victim, victim->b_flags);
+ KASSERT(victim->b_flags & (B_VFLUSH|B_LOCKED));
+ victim->b_flags |= B_WANTED;
+ ltsleep(victim, (PRIBIO + 1) | PNORELOCK,
+ "bunmap", 0, &victim->b_interlock);
+ splx(s);
+ continue;
+ }
+ bremfree(victim);
+ simple_unlock(&bqueue_slock);
+ victim->b_flags |= B_BUSY;
+ simple_unlock(&victim->b_interlock);
+ splx(s);
+
+ KASSERT(victim->b_flags & B_MAPPED);
+ KASSERT(!(victim->b_flags & B_VFLUSH));
+ KASSERT(victim->b_bufsize > 0);
+
+ /*
+ * unmap the buffer.
+ */
+ bufcache_mapout(victim);
+ brelse(victim);
+ need -= victim->b_bufsize;
+ if (need < 0)
+ need = 0;
+ } while (need > 0);
+}
+
+/*
+ * map a buffer.
+ * - allocate kva(b_data) and map the pages.
+ */
+void
+bufcache_mapin(struct buf *bp)
+{
+
+ LOCK_ASSERT(!simple_lock_held(&bqueue_slock));
+
+ genbuf_mapin(bp);
+ KASSERT(bp->b_map->bm_kva_used <= bufcache_map_size); /* XXX MP */
+
+ /*
+ * XXX B_VFLUSH buffers are on freelist.
+ * XXX note that getnewbuf does bremfree for B_VFLUSH buffers.
+ *
+ * XXX LFS maps B_LOCKED buffers to copy their contents to
+ * XXX a segment buffer.
+ */
+ if (bp->b_flags & B_VFLUSH) { /* racy check first */
+ int s;
+
+ s = splbio();
+ simple_lock(&bqueue_slock);
+ simple_lock(&bp->b_interlock);
+ bp->b_flags |= B_MAPPED;
+ if (bp->b_flags & B_VFLUSH) {
+ KASSERT(bp->b_mappedlist.tqe_prev == NULL);
+ TAILQ_INSERT_TAIL(&bufcache_iomap_lru, bp,
+ b_mappedlist);
+ }
+ simple_unlock(&bp->b_interlock);
+ simple_unlock(&bqueue_slock);
+ splx(s);
+ } else
+ bp->b_flags |= B_MAPPED;
+
+ KASSERT(bp->b_map->bm_kva_used <= bufcache_map_size); /* XXX MP */
+}
+
+void bufcache_notemappedfree(struct buf *); /* XXX */
+void
+bufcache_notemappedfree(struct buf *bp)
+{
+
+ KASSERT(bp->b_flags & B_BUSY);
+ KASSERT(bp->b_flags & (B_VFLUSH | B_LOCKED));
+ KASSERT(bp->b_mappedlist.tqe_prev == NULL);
+ TAILQ_INSERT_TAIL(&bufcache_iomap_lru, bp,
+ b_mappedlist);
+}
+
+/*
+ * unmap a buffer.
+ */
+void
+bufcache_mapout(struct buf *bp)
+{
+ int s;
+
+ KASSERT(bp->b_map == &bufcachemapper);
+ genbuf_mapout(bp);
+
+ s = splbio();
+ simple_lock(&bqueue_slock);
+ if (bufcache_iomap_wanted) {
+ bufcache_iomap_wanted = FALSE;
+ wakeup(&bufcache_iomap_wanted);
+ }
+ simple_unlock(&bqueue_slock);
+ splx(s);
+}
+
+int
+bufcache_allocpages(struct buf *bp, vsize_t size)
+{
+ int nalloc = (int)((int)size - bp->b_bufsize) >> PAGE_SHIFT;
+ int s;
+
+ KASSERT((size & PAGE_MASK) == 0);
+ KASSERT((bp->b_bufsize & PAGE_MASK) == 0);
+ KASSERT(size <= MAXBSIZE);
+ KASSERT(nalloc > 0);
+
+ s = splbio();
+ simple_lock(&bufcache_count_slock);
+ if (nbufcachepage + nalloc > nbufcachepage_max) {
+ nalloc = nbufcachepage_max - nbufcachepage;
+ }
+ nbufcachepage += nalloc;
+ simple_unlock(&bufcache_count_slock);
+ splx(s);
+
+ if (nalloc > 0 && bp->b_flags & B_MAPPED)
+ bufcache_mapout(bp);
+
+ for (; nalloc > 0; nalloc--) {
+ struct vm_page *pg;
+ int idx;
+
+ /*
+ * XXX need an md hook?
+ */
+ pg = uvm_pagealloc_strat(NULL, 0, NULL, 0,
+ UVM_PGA_STRAT_BUFCACHE, VM_FREELIST_BUFCACHE);
+ if (pg == NULL)
+ return ENOMEM;
+ idx = bp->b_bufsize >> PAGE_SHIFT;
+ KASSERT(bp->b_pages[idx] == NULL);
+ bp->b_pages[idx] = pg;
+ bp->b_bufsize += PAGE_SIZE;
+ }
+ KASSERT(bp->b_bufsize <= size);
+ if (bp->b_bufsize < size)
+ return ENOMEM;
+ return 0;
+}
+
+int
+bufcache_freepages(struct buf *bp, vsize_t size)
+{
+ int nfree = (int)(bp->b_bufsize - size) >> PAGE_SHIFT;
+ int s;
+
+ KASSERT((size & PAGE_MASK) == 0);
+ KASSERT((bp->b_bufsize & PAGE_MASK) == 0);
+ KASSERT(size <= MAXBSIZE);
+ KASSERT(nfree >= 0);
+
+ s = splbio();
+ simple_lock(&bufcache_count_slock);
+ if (nbufcachepage < nbufcachepage_min + nfree) {
+ nfree = nbufcachepage - nbufcachepage_min;
+ }
+ nbufcachepage -= nfree;
+ simple_unlock(&bufcache_count_slock);
+ splx(s);
+
+ if (nfree > 0 && bp->b_flags & B_MAPPED)
+ bufcache_mapout(bp);
+
+ for (; nfree > 0; nfree--) {
+ struct vm_page *pg;
+ int idx;
+
+ bp->b_bufsize -= PAGE_SIZE;
+ idx = bp->b_bufsize >> PAGE_SHIFT;
+ pg = bp->b_pages[idx];
+ KASSERT(pg != NULL);
+ uvm_pagefree(pg); /* XXX md hook? */
+#ifdef DIAGNOSTIC
+ bp->b_pages[idx] = NULL;
+#endif
+ }
+ KASSERT(bp->b_bufsize >= size);
+ if (bp->b_bufsize > size)
+ return ENOMEM; /* XXX */
+ return 0;
+}
+
+/*
+ * move pages from a buffer to another.
+ */
+void
+bufcache_movepages(struct buf *bp, struct buf *victim, int movesize)
+{
+ int npages;
+ int npages_victim;
+ int npages_move;
+
+ KASSERT(bp->b_flags & B_PAGES);
+ KASSERT(bp->b_flags & B_BUSY);
+ KASSERT((bp->b_bufsize & PAGE_MASK) == 0);
+ KASSERT(victim->b_flags & B_PAGES);
+ KASSERT(victim->b_flags & B_BUSY);
+ KASSERT((victim->b_bufsize & PAGE_MASK) == 0);
+ KASSERT(victim->b_bufsize > 0);
+ KASSERT(movesize > 0);
+ KASSERT((movesize & PAGE_MASK) == 0);
+ KASSERT(bp->b_bufsize + movesize <= MAXPHYS);
+
+ if (bp->b_flags & B_MAPPED)
+ bufcache_mapout(bp);
+ if (victim->b_flags & B_MAPPED)
+ bufcache_mapout(victim);
+
+ npages = bp->b_bufsize >> PAGE_SHIFT;
+ npages_victim = victim->b_bufsize >> PAGE_SHIFT;
+ npages_move = MIN(movesize >> PAGE_SHIFT, npages_victim);
+ while (npages_move > 0) {
+ npages_move--;
+ npages_victim--;
+ KASSERT(victim->b_pages[npages_victim]);
+ KASSERT(bp->b_pages[npages] == NULL);
+ bp->b_pages[npages] = victim->b_pages[npages_victim];
+#ifdef DIAGNOSTIC
+ victim->b_pages[npages_victim] = NULL;
+#endif
+ npages++;
+ }
+ bp->b_bufsize = npages << PAGE_SHIFT;
+ victim->b_bufsize = npages_victim << PAGE_SHIFT;
+
+ /* reduce transfer count if we stole some data */
+ if (victim->b_bcount > victim->b_bufsize)
+ victim->b_bcount = victim->b_bufsize;
+}
+
+/*
+ * count buffers on freelists.
+ */
+unsigned int
+bufcache_countfree()
+{
+ const struct bqueues *dp;
+ const struct buf *bp;
+ unsigned int nfree = 0;
+
+#if 0 /* used from ddb */
+ LOCK_ASSERT(simple_lock_held(&bqueue_slock));
+#endif
+
+ for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) {
+ TAILQ_FOREACH(bp, dp, b_freelist) {
+ nfree++;
+ }
+ }
+/* KASSERT(nbufcache >= nfree); */
+
+ return nfree;
+}
+
+/*
+ * flush out all buffer caches.
+ *
+ * XXX is this really needed?
+ */
+int
+bufcache_shutdown()
+{
+ struct bqueues *dp;
+ struct buf *bp;
+ int iter, nbusy, nbusy_prev = 0, dcount, s;
+ unsigned int nbusy2;
+
+ /* Wait for sync to finish. */
+ dcount = 10000;
+ for (iter = 0; iter < 20;) {
+ unsigned int nfree;
+ nbusy = 0;
+
+ /*
+ * XXX broken. generally, buffers in i/o are not on the freelist.
+ * XXX should we free buffers until nbufcache reaches zero?
+ */
+ for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) {
+ TAILQ_FOREACH(bp, dp, b_freelist) {
+ if ((bp->b_flags & (B_BUSY|B_INVAL|B_READ))
+ == B_BUSY) {
+ nbusy++;
+ printf("busy buffer\n");
+ }
+ /*
+ * With soft updates, some buffers that are
+ * written will be remarked as dirty until other
+ * buffers are written.
+ */
+ if (bp->b_vp && bp->b_vp->v_mount
+ && (bp->b_vp->v_mount->mnt_flag &
+ MNT_SOFTDEP)
+ && (bp->b_flags & B_DELWRI)) {
+ s = splbio();
+ simple_lock(&bqueue_slock);
+ simple_lock(&bp->b_interlock);
+ bremfree(bp);
+ simple_unlock(&bqueue_slock);
+ bp->b_flags |= B_BUSY;
+ simple_unlock(&bp->b_interlock);
+ splx(s);
+ nbusy++;
+ bawrite(bp);
+ printf("softdep dirty buffer\n");
+ if (dcount-- <= 0) {
+ printf("softdep ");
+ goto fail;
+ }
+ }
+ }
+ }
+
+ /*
+ * count buffers on freelists.
+ */
+ s = splbio();
+ simple_lock(&bqueue_slock);
+ simple_lock(&bufcache_count_slock);
+ nfree = bufcache_countfree();
+ nbusy2 = nbufcache - nfree;
+ simple_unlock(&bufcache_count_slock);
+ simple_unlock(&bqueue_slock);
+ splx(s);
+ printf("nbusy2=%u, busy=%d\n", nbusy2, nbusy);
+ if (nbusy2 == 0 && nbusy == 0)
+ break;
+ if (nbusy_prev == 0)
+ nbusy_prev = nbusy;
+ printf("%d ", nbusy);
+ tsleep(&nbusy, PRIBIO, "bflush",
+ (iter == 0) ? 1 : hz / 25 * iter);
+ if (nbusy >= nbusy_prev) /* we didn't flush anything */
+ iter++;
+ else
+ nbusy_prev = nbusy;
+ }
+ if (nbusy || nbusy2)
+ Debugger();
+ if (nbusy) {
+fail:
+#if defined(DEBUG) || defined(DEBUG_HALT_BUSY)
+ printf("giving up\nPrinting vnodes for busy buffers\n");
+#if 0 /* XXX */
+ for (bp = &buf[nbuf]; --bp >= buf; )
+ if ((bp->b_flags & (B_BUSY|B_INVAL|B_READ)) == B_BUSY)
+ vprint(NULL, bp->b_vp);
+#endif
+
+#if defined(DDB) && defined(DEBUG_HALT_BUSY)
+ Debugger();
+#endif
+
+#else /* defined(DEBUG) || defined(DEBUG_HALT_BUSY) */
+ printf("giving up\n");
+#endif /* defined(DEBUG) || defined(DEBUG_HALT_BUSY) */
+ return 1;
+ }
+
+ return 0;
+}
+
+void
+bufcache_initparam()
+{
+ char pbufmin[9];
+ char pbufmax[9];
+
+ /*
+ * Determine how many buffers to allocate.
+ *
+ * - If bufcache is specified, use that % of memory
+ * for the buffer cache.
+ *
+ * - Otherwise, we default to the traditional BSD
+ * formula of 10% of the first 2MB and 5% of
+ * the remaining.
+ */
+ if (nbufcachepage_min == 0) {
+ if (bufcache != 0) {
+ if (bufcache < 5 || bufcache > 95)
+ panic("bufcache is out of range (%d)",
+ bufcache);
+ nbufcachepage_min = physmem / 100 * bufcache;
+ } else {
+ if (physmem < btoc(2 * 1024 * 1024))
+ nbufcachepage_min = physmem / 10;
+ else
+ nbufcachepage_min =
+ (btoc(2 * 1024 * 1024) + physmem) / 20;
+ }
+ }
+
+#ifdef DIAGNOSTIC
+ if (nbufcachepage_min == 0)
+ panic("bufpages = 0");
+#endif
+
+#if 0 /* XXX XXX */
+ /*
+ * Call the mdcallback now; it may need to adjust bufpages.
+ */
+ if (mdcallback != NULL)
+ v = mdcallback(v);
+#endif
+
+ /*
+ * Ensure a minimum of 16 buffers.
+ */
+ if (nbufcache_min == 0) {
+ nbufcache_min = nbufcachepage_min;
+ if (nbufcache_min < 16)
+ nbufcache_min = 16;
+ }
+
+ nbufcache_max = nbufcache_min * 2; /* XXX XXX */
+ nbufcachepage_max = nbufcachepage_min * 2; /* XXX XXX */
+
+ /* XXX */
+ if (bufcache_map_size == 0)
+ bufcache_map_size = MIN(nbufcache_min * MAXBSIZE,
+ nbufcachepage_min * PAGE_SIZE);
+
+#ifdef VM_MAX_KERNEL_BUF
+ if (bufcache_map_size > VM_MAX_KERNEL_BUF) {
+ /* assuming VM_MAX_KERNEL_BUF is a reasonable value. */
+ bufcache_map_size = VM_MAX_KERNEL_BUF;
+ nbufcache_min =
+ MIN(nbufcache_min, bufcache_map_size / MAXBSIZE);
+ nbufcachepage_min =
+ MIN(nbufcachepage_min, bufcache_map_size / PAGE_SIZE);
+ }
+#endif
+
+ format_bytes(pbufmin, sizeof(pbufmin), nbufcachepage_min * PAGE_SIZE);
+ format_bytes(pbufmax, sizeof(pbufmax), nbufcachepage_max * PAGE_SIZE);
+ printf("using %d-%d buffers %s-%s of memory\n",
+ nbufcache_min, nbufcache_max, pbufmin, pbufmax);
+ /*
+ * XXX nbufcache*_min should be able to be a small constant
+ * but they can't for now because they are used by filesystems to
+ * throttle...
+ */
+ KDASSERT(nbufcache_min >= 16);
+ KDASSERT(nbufcachepage_min >= 16);
+ KDASSERT(nbufcache_max >= nbufcache_min);
+ KDASSERT(nbufcachepage_max >= nbufcachepage_min);
+}
+
+int
+bufcache_reclaim(int num)
+{
+ int error = 0;
+
+ while (num-- > 0) {
+ struct buf *bp;
+ int s;
+
+ while ((bp = getnewbuf(0, 0, FALSE)) == NULL)
+ ;
+
+ error = bufcache_freepages(bp, 0);
+ if (error)
+ break;
+
+ KASSERT(bp->b_bufsize == 0);
+ bremhash(bp);
+ s = splbio();
+ pool_put(&bufpool, bp);
+ simple_lock(&bufcache_count_slock);
+ KASSERT(nbufcache > nbufcache_min);
+ nbufcache--;
+ simple_unlock(&bufcache_count_slock);
+ splx(s);
+ }
+
+ return error;
+}
+
+#ifdef DEBUG
+void
+bufcache_debugdump()
+{
+ struct buf *it;
+ int n, m;
+
+ printf("nbuf=%d, npage=%d, nfree=%d, kva=%d/%d\n",
+ nbufcache, nbufcachepage, bufcache_countfree(),
+ (int)bufcachemapper.bm_kva_used, (int)bufcache_map_size);
+
+ n = m = 0;
+ TAILQ_FOREACH(it, &bufqueues[BQ_LRU], b_freelist){
+ if (it->b_flags & B_VFLUSH)
+ n++;
+ if ((it->b_flags & B_MAPPED) &&
+ !(it->b_flags & B_BUSY))
+ m++;
+ }
+ printf("LRU %d, %d\n", n, m);
+
+ n = m = 0;
+ TAILQ_FOREACH(it, &bufqueues[BQ_AGE], b_freelist){
+ if (it->b_flags & B_VFLUSH)
+ n++;
+ if ((it->b_flags & B_MAPPED) &&
+ !(it->b_flags & B_BUSY))
+ m++;
+ }
+ printf("AGE %d, %d\n", n, m);
+}
+#endif
Index: kern/vfs_subr.c
===================================================================
--- kern/vfs_subr.c (revision 283)
+++ kern/vfs_subr.c (working copy)
@@ -2591,8 +2591,6 @@ vfs_unmountall(p)
void
vfs_shutdown()
{
- struct buf *bp;
- int iter, nbusy, nbusy_prev = 0, dcount, s;
struct lwp *l = curlwp;
struct proc *p;
@@ -2611,62 +2609,7 @@ vfs_shutdown()
sys_sync(l, NULL, NULL);
- /* Wait for sync to finish. */
- dcount = 10000;
- for (iter = 0; iter < 20;) {
- nbusy = 0;
- for (bp = &buf[nbuf]; --bp >= buf; ) {
- if ((bp->b_flags & (B_BUSY|B_INVAL|B_READ)) == B_BUSY)
- nbusy++;
- /*
- * With soft updates, some buffers that are
- * written will be remarked as dirty until other
- * buffers are written.
- */
- if (bp->b_vp && bp->b_vp->v_mount
- && (bp->b_vp->v_mount->mnt_flag & MNT_SOFTDEP)
- && (bp->b_flags & B_DELWRI)) {
- s = splbio();
- bremfree(bp);
- bp->b_flags |= B_BUSY;
- splx(s);
- nbusy++;
- bawrite(bp);
- if (dcount-- <= 0) {
- printf("softdep ");
- goto fail;
- }
- }
- }
- if (nbusy == 0)
- break;
- if (nbusy_prev == 0)
- nbusy_prev = nbusy;
- printf("%d ", nbusy);
- tsleep(&nbusy, PRIBIO, "bflush",
- (iter == 0) ? 1 : hz / 25 * iter);
- if (nbusy >= nbusy_prev) /* we didn't flush anything */
- iter++;
- else
- nbusy_prev = nbusy;
- }
- if (nbusy) {
-fail:
-#if defined(DEBUG) || defined(DEBUG_HALT_BUSY)
- printf("giving up\nPrinting vnodes for busy buffers\n");
- for (bp = &buf[nbuf]; --bp >= buf; )
- if ((bp->b_flags & (B_BUSY|B_INVAL|B_READ)) == B_BUSY)
- vprint(NULL, bp->b_vp);
-
-#if defined(DDB) && defined(DEBUG_HALT_BUSY)
- Debugger();
-#endif
-
-#else /* defined(DEBUG) || defined(DEBUG_HALT_BUSY) */
- printf("giving up\n");
-#endif /* defined(DEBUG) || defined(DEBUG_HALT_BUSY) */
- return;
- } else
+ if (!bufcache_shutdown())
printf("done\n");
/*
@@ -2959,10 +2902,10 @@ set_statfs_info(const char *onp, int uko
#ifdef DDB
const char buf_flagbits[] =
- "\20\1AGE\2NEEDCOMMIT\3ASYNC\4BAD\5BUSY\6SCANNED\7CALL\10DELWRI"
+ "\20\1AGE\3ASYNC\4BAD\5BUSY\6SCANNED\7CALL\10DELWRI"
"\11DIRTY\12DONE\13EINTR\14ERROR\15GATHERED\16INVAL\17LOCKED\20NOCACHE"
- "\21ORDERED\22CACHE\23PHYS\24RAW\25READ\26TAPE\30WANTED"
- "\32XXX\33VFLUSH";
+ "\22CACHE\23PHYS\24RAW\25READ\26TAPE\30WANTED"
+ "\32XXX\33VFLUSH\34PAGES\35MAPPED";
void
vfs_buf_print(bp, full, pr)
Index: uvm/uvm_iomap.h
===================================================================
--- uvm/uvm_iomap.h (revision 198)
+++ uvm/uvm_iomap.h (working copy)
@@ -55,7 +55,7 @@ void uvm_iomap_init(struct uvm_iomap *,
vaddr_t uvm_iomap_alloc(struct uvm_iomap *, vsize_t, int);
void uvm_iomap_free(struct uvm_iomap *, vaddr_t, vsize_t);
-#define UVMIOMAP_WAITOK UVMPAGER_MAPIN_WAITOK
+#define UVMIOMAP_WAITOK 0x01
#define uvm_iomap_pmap(iomap) vm_map_pmap((iomap)->ui_map)
Index: uvm/uvm_glue.c
===================================================================
--- uvm/uvm_glue.c (revision 196)
+++ uvm/uvm_glue.c (working copy)
@@ -142,9 +142,11 @@ uvm_kernacc(addr, len, rw)
* or worse, inconsistencies at the pmap level. We only worry
* about the buffer cache for now.
*/
+#if 0
if (!readbuffers && rv && (eaddr > (vaddr_t)buffers &&
saddr < (vaddr_t)buffers + MAXBSIZE * nbuf))
rv = FALSE;
+#endif
return(rv);
}
Index: arch/i386/i386/machdep.c
===================================================================
--- arch/i386/i386/machdep.c (revision 281)
+++ arch/i386/i386/machdep.c (working copy)
@@ -284,7 +284,6 @@ cpu_startup()
caddr_t v;
int sz, x;
vaddr_t minaddr, maxaddr;
- vsize_t size;
char pbuf[9];
/*
@@ -325,37 +324,6 @@ cpu_startup()
panic("startup: table size inconsistency");
/*
- * Allocate virtual address space for the buffers. The area
- * is not managed by the VM system.
- */
- size = MAXBSIZE * nbuf;
- if (uvm_map(kernel_map, (vaddr_t *)(void *) &buffers, round_page(size),
- NULL, UVM_UNKNOWN_OFFSET, 0,
- UVM_MAPFLAG(UVM_PROT_NONE, UVM_PROT_NONE, UVM_INH_NONE,
- UVM_ADV_NORMAL, 0)) != 0)
- panic("cpu_startup: cannot allocate VM for buffers");
- minaddr = (vaddr_t)buffers;
- if ((bufpages / nbuf) >= btoc(MAXBSIZE)) {
- /* don't want to alloc more physical mem than needed */
- bufpages = btoc(MAXBSIZE) * nbuf;
- }
-
- /*
- * XXX We defer allocation of physical pages for buffers until
- * XXX after autoconfiguration has run. We must do this because
- * XXX on system with large amounts of memory or with large
- * XXX user-configured buffer caches, the buffer cache will eat
- * XXX up all of the lower 16M of RAM. This prevents ISA DMA
- * XXX maps from allocating bounce pages.
- *
- * XXX Note that nothing can use buffer cache buffers until after
- * XXX autoconfiguration completes!!
- *
- * XXX This is a hack, and needs to be replaced with a better
- * XXX solution! --thorpej@netbsd.org, December 6, 1997
- */
-
- /*
* Allocate a submap for exec arguments. This map effectively
* limits the number of processes exec'ing at any time.
*/
@@ -379,10 +347,8 @@ cpu_startup()
* XXX we need to account for those pages when printing
* XXX the amount of free memory.
*/
- format_bytes(pbuf, sizeof(pbuf), ptoa(uvmexp.free - bufpages));
+ format_bytes(pbuf, sizeof(pbuf), ptoa(uvmexp.free/* - bufpages*/));
printf("avail memory = %s\n", pbuf);
- format_bytes(pbuf, sizeof(pbuf), bufpages * PAGE_SIZE);
- printf("using %d buffers containing %s of memory\n", nbuf, pbuf);
/* Safe for i/o port / memory space allocation to use malloc now. */
x86_bus_space_mallocok();
@@ -441,55 +407,6 @@ i386_init_pcb_tss_ldt(ci)
}
/*
- * XXX Finish up the deferred buffer cache allocation and initialization.
- */
-void
-i386_bufinit()
-{
- int i, base, residual;
-
- base = bufpages / nbuf;
- residual = bufpages % nbuf;
- for (i = 0; i < nbuf; i++) {
- vsize_t curbufsize;
- vaddr_t curbuf;
- struct vm_page *pg;
-
- /*
- * Each buffer has MAXBSIZE bytes of VM space allocated. Of
- * that MAXBSIZE space, we allocate and map (base+1) pages
- * for the first "residual" buffers, and then we allocate
- * "base" pages for the rest.
- */
- curbuf = (vaddr_t) buffers + (i * MAXBSIZE);
- curbufsize = PAGE_SIZE * ((i < residual) ? (base+1) : base);
-
- while (curbufsize) {
- /*
- * Attempt to allocate buffers from the first
- * 16M of RAM to avoid bouncing file system
- * transfers.
- */
- pg = uvm_pagealloc_strat(NULL, 0, NULL, 0,
- UVM_PGA_STRAT_FALLBACK, VM_FREELIST_FIRST16);
- if (pg == NULL)
- panic("cpu_startup: not enough memory for "
- "buffer cache");
- pmap_kenter_pa(curbuf, VM_PAGE_TO_PHYS(pg),
- VM_PROT_READ|VM_PROT_WRITE);
- curbuf += PAGE_SIZE;
- curbufsize -= PAGE_SIZE;
- }
- }
- pmap_update(pmap_kernel());
-
- /*
- * Set up buffers, so they can be used to read disk labels.
- */
- bufinit();
-}
-
-/*
* machine dependent system variables.
*/
int
Index: arch/i386/i386/autoconf.c
===================================================================
--- arch/i386/i386/autoconf.c (revision 281)
+++ arch/i386/i386/autoconf.c (working copy)
@@ -155,8 +155,10 @@ cpu_configure(void)
lapic_tpr = 0;
#endif
- /* XXX Finish deferred buffer cache allocation. */
- i386_bufinit();
+ /*
+ * Set up buffers, so they can be used to read disk labels.
+ */
+ bufinit();
}
void
Index: sys/buf.h
===================================================================
--- sys/buf.h (revision 284)
+++ sys/buf.h (working copy)
@@ -76,10 +76,16 @@
#ifndef _SYS_BUF_H_
#define _SYS_BUF_H_
+#include <sys/param.h> /* for NULL */
#include <sys/pool.h>
#include <sys/queue.h>
#include <sys/lock.h>
+#include <uvm/uvm_param.h> /* for MIN_PAGE_SIZE */
+#include <machine/param.h> /* for MAXPHYS */
+
+#include <lib/libkern/libkern.h> /* for KASSERT */
+
struct buf;
struct mount;
struct vnode;
@@ -126,6 +132,24 @@ void bufq_free(struct bufq_state *);
#define BUFQ_PEEK(bufq) \
(*(bufq)->bq_get)((bufq), 0) /* Get buffer from queue */
+/*
+ * buffer mapper
+ *
+ * XXX it's intended to be put into each device/driver that
+ * XXX requires kernel-addressable buffers.
+ */
+struct bufmap {
+ struct uvm_iomap *bm_iomap;
+ void (*bm_mapin)(struct buf *);
+ void (*bm_mapout)(struct buf *);
+ void (*bm_reclaim)(struct bufmap *, vsize_t);
+ vsize_t bm_kva_used;
+};
+
+extern struct bufmap devbufmap;
+void genbuf_mapin(struct buf *);
+void genbuf_mapout(struct buf *);
+
#endif /* _KERNEL */
/*
@@ -153,7 +177,8 @@ struct buf {
LIST_ENTRY(buf) b_vnbufs; /* Buffer's associated vnode. */
TAILQ_ENTRY(buf) b_freelist; /* Free list position if not active. */
TAILQ_ENTRY(buf) b_actq; /* Device driver queue when active. */
- struct proc *b_proc; /* Associated proc if B_PHYS set. */
+ TAILQ_ENTRY(buf) b_mappedlist; /* LRU entry for mapped buffers. */
+ struct proc *b_proc; /* Associated proc if B_PHYS set. */
volatile long b_flags; /* B_* flags. */
struct simplelock b_interlock; /* Lock for b_flags changes */
int b_error; /* Errno value. */
@@ -172,16 +197,21 @@ struct buf {
number (not partition relative) */
/* Function to call upon completion. */
void (*b_iodone) __P((struct buf *));
- struct vnode *b_vp; /* File vnode. */
+ struct vnode *b_vp; /* File vnode. */
void *b_private; /* Private data for owner */
off_t b_dcookie; /* Offset cookie if dir block */
- struct workhead b_dep; /* List of filesystem dependencies. */
+ struct workhead b_dep; /* List of filesystem dependencies. */
+#ifdef _KERNEL /* XXX */
+ struct bufmap *b_map;
+ struct vm_page *b_pages[MAXPHYS/MIN_PAGE_SIZE];
+#endif /* _KERNEL */
};
#define BUF_INIT(bp) \
do { \
LIST_INIT(&(bp)->b_dep); \
simple_lock_init(&(bp)->b_interlock); \
+ (bp)->b_map = NULL; \
} while (/*CONSTCOND*/0)
/*
@@ -220,6 +250,12 @@ do { \
#define B_WRITE 0x00000000 /* Write buffer (pseudo flag). */
#define B_XXX 0x02000000 /* Debugging flag. */
#define B_VFLUSH 0x04000000 /* Buffer is being synced. */
+#define B_PAGES 0x08000000 /* b_pages is valid */
+#define B_MAPPED 0x10000000 /* b_pages are addressable by b_data */
+
+/* test if a buffer is kernel-addressable */
+#define BUF_IS_ADDRESSABLE(bp) \
+ (!((bp)->b_flags & B_PAGES) || ((bp)->b_flags & B_MAPPED))
/*
* This structure describes a clustered I/O. It is stored in the b_saveaddr
@@ -251,10 +287,16 @@ do { \
#ifdef _KERNEL
extern struct bio_ops bioops;
-extern u_int nbuf; /* The number of buffer headers */
-extern struct buf *buf; /* The buffer headers. */
-extern char *buffers; /* The buffer contents. */
-extern u_int bufpages; /* Number of memory pages in the buffer pool. */
+
+/* The number of buffer headers */
+extern unsigned int nbufcache_min;
+extern unsigned int nbufcache_max;
+extern unsigned int nbufcache;
+
+/* Number of memory pages in the buffer pool. */
+extern unsigned int nbufcachepage_min;
+extern unsigned int nbufcachepage_max;
+extern unsigned int nbufcachepage;
/*
* Pool of I/O buffers. Access to this pool must be protected with
@@ -285,7 +327,6 @@ int cluster_read __P((struct vnode *, u_
void cluster_write __P((struct buf *, u_quad_t));
struct buf *getblk __P((struct vnode *, daddr_t, int, int, int));
struct buf *geteblk __P((int));
-struct buf *getnewbuf __P((int, int));
struct buf *incore __P((struct vnode *, daddr_t));
void minphys __P((struct buf *));
@@ -295,9 +336,33 @@ int physio __P((void (*)(struct buf *),
void brelvp __P((struct buf *));
void reassignbuf __P((struct buf *, struct vnode *));
void bgetvp __P((struct vnode *, struct buf *));
+
+int bufcache_shutdown __P((void));
+
+static inline void
+buf_mapin(struct buf *bp)
+{
+
+ KASSERT(bp->b_map != NULL);
+ KASSERT(bp->b_map->bm_mapin != NULL);
+
+ bp->b_map->bm_mapin(bp);
+ KASSERT(BUF_IS_ADDRESSABLE(bp));
+}
+
+static inline void
+buf_mapout(struct buf *bp)
+{
+
+ KASSERT(bp->b_map != NULL);
+
+ if (bp->b_map->bm_mapout)
+ bp->b_map->bm_mapout(bp);
+}
+
#ifdef DDB
void vfs_buf_print __P((struct buf *, int, void (*)(const char *, ...)));
-#endif
+#endif /* DDB */
__END_DECLS
-#endif
+#endif /* _KERNEL */
#endif /* !_SYS_BUF_H_ */
--NextPart-20031120184112-0180700
Content-Type: Text/Plain; charset=us-ascii
Content-Disposition: attachment; filename="uvm.iomap.diff"
Index: uvm/uvm_iomap.h
===================================================================
--- uvm/uvm_iomap.h (revision 0)
+++ uvm/uvm_iomap.h (revision 198)
@@ -0,0 +1,62 @@
+/* $NetBSD$ */
+
+/*
+ *
+ * Copyright (c) 1997 Charles D. Cranor and Washington University.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Charles D. Cranor and
+ * Washington University.
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * from: Id: uvm_pager.c,v 1.1.2.23 1998/02/02 20:38:06 chuck Exp
+ */
+
+#ifndef _UVM_UVM_IOMAP_H_
+#define _UVM_UVM_IOMAP_H_
+
+/*
+ * uvm_iomap.h
+ */
+
+struct vm_map;
+
+struct uvm_iomap {
+ struct vm_map *ui_map;
+ struct simplelock ui_wanted_lock;
+ boolean_t ui_wanted;
+ vaddr_t ui_emergva;
+ boolean_t ui_emerginuse;
+};
+
+void uvm_iomap_init(struct uvm_iomap *, vsize_t, vsize_t);
+vaddr_t uvm_iomap_alloc(struct uvm_iomap *, vsize_t, int);
+void uvm_iomap_free(struct uvm_iomap *, vaddr_t, vsize_t);
+
+#define UVMIOMAP_WAITOK UVMPAGER_MAPIN_WAITOK
+
+#define uvm_iomap_pmap(iomap) vm_map_pmap((iomap)->ui_map)
+
+#endif /* _UVM_UVM_IOMAP_H_ */
Index: uvm/uvm_map.c
===================================================================
--- uvm/uvm_map.c (revision 197)
+++ uvm/uvm_map.c (revision 198)
@@ -98,8 +98,6 @@ __KERNEL_RCSID(0, "$NetBSD: uvm_map.c,v
#include <uvm/uvm_ddb.h>
#endif
-extern struct vm_map *pager_map;
-
struct uvm_cnt map_ubackmerge, map_uforwmerge;
struct uvm_cnt map_ubimerge, map_unomerge;
struct uvm_cnt map_kbackmerge, map_kforwmerge;
@@ -578,14 +576,14 @@ uvm_map(map, startp, size, uobj, uoffset
}
/*
- * for pager_map, allocate the new entry first to avoid sleeping
+ * for i/o map, allocate the new entry first to avoid sleeping
* for memory while we have the map locked.
*/
new_entry = NULL;
- if (map == pager_map) {
+ if (map->flags & VM_MAP_IOMAP) {
new_entry = uvm_mapent_alloc(map, (flags & UVM_FLAG_NOWAIT));
- if (__predict_false(new_entry == NULL))
+ if (__predict_false(new_entry == NULL))
return ENOMEM;
}
Index: uvm/uvm_map.h
===================================================================
--- uvm/uvm_map.h (revision 197)
+++ uvm/uvm_map.h (revision 198)
@@ -231,6 +231,7 @@ struct vm_map {
#define VM_MAP_WANTLOCK 0x10 /* rw: want to write-lock */
#define VM_MAP_DYING 0x20 /* rw: map is being destroyed */
#define VM_MAP_TOPDOWN 0x40 /* ro: arrange map top-down */
+#define VM_MAP_IOMAP 0x80 /* ro: map for i/o */
/* XXX: number of kernel maps and entries to statically allocate */
Index: uvm/uvm_pager.c
===================================================================
--- uvm/uvm_pager.c (revision 197)
+++ uvm/uvm_pager.c (revision 198)
@@ -52,6 +52,7 @@ __KERNEL_RCSID(0, "$NetBSD: uvm_pager.c,
#define UVM_PAGER
#include <uvm/uvm.h>
+#include <uvm/uvm_iomap.h>
struct pool *uvm_aiobuf_pool;
@@ -70,11 +71,7 @@ struct uvm_pagerops * const uvmpagerops[
* the pager map: provides KVA for I/O
*/
-struct vm_map *pager_map; /* XXX */
-struct simplelock pager_map_wanted_lock;
-boolean_t pager_map_wanted; /* locked by pager map */
-static vaddr_t emergva;
-static boolean_t emerginuse;
+struct uvm_iomap pager_kva;
/*
* uvm_pager_init: init pagers (at boot time)
@@ -84,19 +81,12 @@ void
uvm_pager_init()
{
u_int lcv;
- vaddr_t sva, eva;
/*
* init pager map
*/
- sva = 0;
- pager_map = uvm_km_suballoc(kernel_map, &sva, &eva, PAGER_MAP_SIZE, 0,
- FALSE, NULL);
- simple_lock_init(&pager_map_wanted_lock);
- pager_map_wanted = FALSE;
- emergva = uvm_km_valloc(kernel_map, round_page(MAXPHYS));
- emerginuse = FALSE;
+ uvm_iomap_init(&pager_kva, PAGER_MAP_SIZE, round_page(MAXPHYS));
/*
* init ASYNC I/O queue
@@ -136,6 +126,10 @@ uvm_pagermapin(pps, npages, flags)
UVMHIST_LOG(maphist,"(pps=0x%x, npages=%d)", pps, npages,0,0);
+ size = npages << PAGE_SHIFT;
+ kva = uvm_iomap_alloc(&pager_kva, size,
+ (flags & UVMPAGER_MAPIN_WAITOK) ? UVMIOMAP_WAITOK : 0);
+
/*
* compute protection. outgoing I/O only needs read
* access to the page, whereas incoming needs read/write.
@@ -145,48 +139,13 @@ uvm_pagermapin(pps, npages, flags)
if (flags & UVMPAGER_MAPIN_READ)
prot |= VM_PROT_WRITE;
-ReStart:
- size = npages << PAGE_SHIFT;
- kva = 0; /* let system choose VA */
-
- if (uvm_map(pager_map, &kva, size, NULL,
- UVM_UNKNOWN_OFFSET, 0, UVM_FLAG_NOMERGE) != 0) {
- if (curproc == uvm.pagedaemon_proc) {
- simple_lock(&pager_map_wanted_lock);
- if (emerginuse) {
- UVM_UNLOCK_AND_WAIT(&emergva,
- &pager_map_wanted_lock, FALSE,
- "emergva", 0);
- goto ReStart;
- }
- emerginuse = TRUE;
- simple_unlock(&pager_map_wanted_lock);
- kva = emergva;
- /* The shift implicitly truncates to PAGE_SIZE */
- KASSERT(npages <= (MAXPHYS >> PAGE_SHIFT));
- goto enter;
- }
- if ((flags & UVMPAGER_MAPIN_WAITOK) == 0) {
- UVMHIST_LOG(maphist,"<- NOWAIT failed", 0,0,0,0);
- return(0);
- }
- simple_lock(&pager_map_wanted_lock);
- pager_map_wanted = TRUE;
- UVMHIST_LOG(maphist, " SLEEPING on pager_map",0,0,0,0);
- UVM_UNLOCK_AND_WAIT(pager_map, &pager_map_wanted_lock, FALSE,
- "pager_map", 0);
- goto ReStart;
- }
-
-enter:
- /* got it */
for (cva = kva ; size != 0 ; size -= PAGE_SIZE, cva += PAGE_SIZE) {
pp = *pps++;
KASSERT(pp);
KASSERT(pp->flags & PG_BUSY);
pmap_kenter_pa(cva, VM_PAGE_TO_PHYS(pp), prot);
}
- pmap_update(vm_map_pmap(pager_map));
+ pmap_update(uvm_iomap_pmap(&pager_kva));
UVMHIST_LOG(maphist, "<- done (KVA=0x%x)", kva,0,0,0);
return(kva);
@@ -205,36 +164,11 @@ uvm_pagermapout(kva, npages)
int npages;
{
vsize_t size = npages << PAGE_SHIFT;
- struct vm_map_entry *entries;
UVMHIST_FUNC("uvm_pagermapout"); UVMHIST_CALLED(maphist);
UVMHIST_LOG(maphist, " (kva=0x%x, npages=%d)", kva, npages,0,0);
-
- /*
- * duplicate uvm_unmap, but add in pager_map_wanted handling.
- */
-
- pmap_kremove(kva, npages << PAGE_SHIFT);
- if (kva == emergva) {
- simple_lock(&pager_map_wanted_lock);
- emerginuse = FALSE;
- wakeup(&emergva);
- simple_unlock(&pager_map_wanted_lock);
- return;
- }
-
- vm_map_lock(pager_map);
- uvm_unmap_remove(pager_map, kva, kva + size, &entries);
- simple_lock(&pager_map_wanted_lock);
- if (pager_map_wanted) {
- pager_map_wanted = FALSE;
- wakeup(pager_map);
- }
- simple_unlock(&pager_map_wanted_lock);
- vm_map_unlock(pager_map);
- if (entries)
- uvm_unmap_detach(entries, 0);
- pmap_update(pmap_kernel());
+ pmap_kremove(kva, size);
+ uvm_iomap_free(&pager_kva, kva, size);
UVMHIST_LOG(maphist,"<- done",0,0,0,0);
}
Index: uvm/uvm_iomap.c
===================================================================
--- uvm/uvm_iomap.c (revision 0)
+++ uvm/uvm_iomap.c (revision 198)
@@ -0,0 +1,138 @@
+/* $NetBSD$ */
+
+/*
+ *
+ * Copyright (c) 1997 Charles D. Cranor and Washington University.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Charles D. Cranor and
+ * Washington University.
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * from: Id: uvm_pager.c,v 1.1.2.23 1998/02/02 20:38:06 chuck Exp
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD$");
+
+#include "opt_uvmhist.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/malloc.h>
+#include <sys/pool.h>
+#include <sys/vnode.h>
+
+#include <uvm/uvm.h>
+#include <uvm/uvm_iomap.h>
+
+void
+uvm_iomap_init(struct uvm_iomap *kp, vsize_t size, vsize_t emergsize)
+{
+ vaddr_t sva, eva;
+
+ sva = 0;
+ kp->ui_map = uvm_km_suballoc(kernel_map, &sva, &eva, size, 0, FALSE,
+ NULL);
+ kp->ui_map->flags |= VM_MAP_IOMAP;
+ simple_lock_init(&kp->ui_wanted_lock);
+ kp->ui_wanted = FALSE;
+ kp->ui_emergva = uvm_km_valloc(kernel_map, emergsize);
+ kp->ui_emerginuse = FALSE;
+}
+
+vaddr_t
+uvm_iomap_alloc(struct uvm_iomap *kp, vsize_t size, int flags)
+{
+ vaddr_t kva;
+ UVMHIST_FUNC("uvm_iomap_alloc"); UVMHIST_CALLED(maphist);
+
+ReStart:
+ kva = 0; /* let system choose VA */
+
+ if (uvm_map(kp->ui_map, &kva, size, NULL,
+ UVM_UNKNOWN_OFFSET, 0, UVM_FLAG_NOMERGE) != 0) {
+ if (curproc == uvm.pagedaemon_proc) {
+ simple_lock(&kp->ui_wanted_lock);
+ if (kp->ui_emerginuse) {
+ UVM_UNLOCK_AND_WAIT(&kp->ui_emergva,
+ &kp->ui_wanted_lock, FALSE,
+ "emergva", 0);
+ goto ReStart;
+ }
+ kp->ui_emerginuse = TRUE;
+ simple_unlock(&kp->ui_wanted_lock);
+ kva = kp->ui_emergva;
+ goto gotit;
+ }
+ if ((flags & UVMIOMAP_WAITOK) == 0) {
+ UVMHIST_LOG(maphist,"<- NOWAIT failed", 0,0,0,0);
+ return 0;
+ }
+ simple_lock(&kp->ui_wanted_lock);
+ kp->ui_wanted = TRUE;
+ UVMHIST_LOG(maphist, " SLEEPING on iomap",0,0,0,0);
+ UVM_UNLOCK_AND_WAIT(kp->ui_map, &kp->ui_wanted_lock, FALSE,
+ "iomap", 0);
+ goto ReStart;
+ }
+
+gotit:
+ UVMHIST_LOG(maphist, "<- done (KVA=0x%x)", kva,0,0,0);
+ return kva;
+}
+
+void
+uvm_iomap_free(struct uvm_iomap *kp, vaddr_t kva, vsize_t size)
+{
+ struct vm_map_entry *entries;
+
+ /*
+ * duplicate uvm_unmap, but add in ui_wanted handling.
+ */
+
+ if (kva == kp->ui_emergva) {
+ simple_lock(&kp->ui_wanted_lock);
+ kp->ui_emerginuse = FALSE;
+ wakeup(&kp->ui_emergva);
+ simple_unlock(&kp->ui_wanted_lock);
+ return;
+ }
+
+ vm_map_lock(kp->ui_map);
+ uvm_unmap_remove(kp->ui_map, kva, kva + size, &entries);
+ simple_lock(&kp->ui_wanted_lock);
+ if (kp->ui_wanted) {
+ kp->ui_wanted = FALSE;
+ wakeup(kp->ui_map);
+ }
+ simple_unlock(&kp->ui_wanted_lock);
+ vm_map_unlock(kp->ui_map);
+ if (entries)
+ uvm_unmap_detach(entries, 0);
+ pmap_update(pmap_kernel()); /* needed? */
+}
+
--NextPart-20031120184112-0180700
Content-Type: Text/Plain; charset=us-ascii
Content-Disposition: attachment; filename="vfs_bufmap.c"
/* $NetBSD$ */
/*
* XXX
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <uvm/uvm.h>
#include <uvm/uvm_iomap.h>
void devbufmap_init(void);
void bufmap_update(struct bufmap *);
struct uvm_iomap deviomap;
struct bufmap devbufmap;
int devbufcnt;
vsize_t deviomap_size = MAXPHYS * 128; /* XXX */
void
devbufmap_init()
{
memset(&devbufmap, 0, sizeof(devbufmap));
devbufmap.bm_mapin = genbuf_mapin;
devbufmap.bm_mapout = genbuf_mapout;
devbufmap.bm_iomap = &deviomap;
uvm_iomap_init(&deviomap, deviomap_size, round_page(MAXPHYS));
}
/*
* map a buffer.
* - allocate kva (b_data) and map the pages.
* - on kva shortage, unmap free buffers if the bm_reclaim callback is
*   specified; otherwise sleep until kva becomes available.
* (an illustrative caller sketch follows this function.)
*/
void
genbuf_mapin(struct buf *bp)
{
vaddr_t kva, eva;
vsize_t size;
struct vm_page **pgpp;
struct bufmap *bmap = bp->b_map;
struct uvm_iomap *iomap = bmap->bm_iomap;
int iomapflags;
KASSERT(bp->b_flags & B_PAGES);
KASSERT(bp->b_map != NULL);
KASSERT(bp->b_flags & B_BUSY);
KASSERT(!(bp->b_flags & B_MAPPED));
KASSERT((bp->b_bufsize & PAGE_MASK) == 0);
KASSERT(bp->b_data == NULL);
KASSERT(bp->b_bufsize > 0);
LOCK_ASSERT(!simple_lock_held(&bp->b_interlock));
if (bp->b_map == &devbufmap)
devbufcnt++;
size = bp->b_bufsize;
if (bmap->bm_reclaim) {
iomapflags = 0;
} else {
iomapflags = UVMIOMAP_WAITOK;
}
while ((kva = uvm_iomap_alloc(iomap, size, iomapflags)) == 0) {
/*
* kva shortage.
* try to unmap free buffers via the bm_reclaim callback
* (an illustrative sketch of such a callback follows genbuf_mapout).
*
* XXX need to consider kva fragmentation
*/
/*
* uvm_iomap_alloc shouldn't fail for the pagedaemon.
*/
KASSERT(curproc != uvm.pagedaemon_proc);
/*
* we can deadlock if the buffer we already hold is on the kva lru list.
*/
KASSERT(bp->b_mappedlist.tqe_prev == NULL);
KASSERT(bmap->bm_reclaim != NULL);
bmap->bm_reclaim(bmap, size);
}
bp->b_data = (void *)kva;
eva = kva + size;
for (pgpp = bp->b_pages; kva < eva; pgpp++, kva += PAGE_SIZE) {
const vm_prot_t prot = VM_PROT_READ | VM_PROT_WRITE;
struct vm_page *pg = *pgpp;
KASSERT(pg);
pmap_kenter_pa(kva, VM_PAGE_TO_PHYS(pg), prot);
}
bmap->bm_kva_used += size; /* XXX MP */
bufmap_update(bmap);
}
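/*
* Illustrative sketch only, not part of the patch: one way a caller
* might use the bufmap hooks around a page-backed buffer before
* handing it to a driver.  The function name example_strategy_prepare
* is hypothetical, and any B_MAPPED / mapped-buffer-list bookkeeping
* done by wrapper code elsewhere in the patch is glossed over here.
*/
#if 0
static void
example_strategy_prepare(struct buf *bp)
{
	KASSERT(bp->b_flags & B_BUSY);
	KASSERT(bp->b_flags & B_PAGES);
	if (bp->b_map == NULL)
		bp->b_map = &devbufmap;		/* attach the device buffer map */
	if (bp->b_data == NULL)
		(*bp->b_map->bm_mapin)(bp);	/* allocate kva and enter the pages */
	/* ... the driver's strategy routine can now use bp->b_data ... */
}
#endif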
/*
* unmap a buffer.
*/
void
genbuf_mapout(struct buf *bp)
{
vaddr_t kva;
vsize_t size;
struct bufmap *bmap = bp->b_map;
LOCK_ASSERT(!simple_lock_held(&bp->b_interlock));
KASSERT(bp->b_flags & B_BUSY);
KASSERT(bp->b_flags & B_PAGES);
KASSERT(bp->b_flags & B_MAPPED);
KASSERT(bp->b_bufsize > 0);
KASSERT(bp->b_mappedlist.tqe_prev == NULL);
KASSERT(bp->b_map != NULL);
kva = (vaddr_t)bp->b_data;
size = round_page(bp->b_bufsize);
pmap_kremove(kva, size);
uvm_iomap_free(bmap->bm_iomap, kva, size);
bp->b_flags &= ~B_MAPPED;
#ifdef DIAGNOSTIC
bp->b_data = NULL;
#endif
KASSERT(bmap->bm_kva_used >= size); /* XXX MP */
bmap->bm_kva_used -= size; /* XXX MP */
}
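/*
* Illustrative sketch only, not part of the patch: roughly what a
* bm_reclaim callback might look like.  The helpers
* example_get_idle_mapped_buf() and example_put_buf() are hypothetical
* names standing in for whatever mapped-buffer lru the buffer cache
* keeps: the former is assumed to mark an idle, mapped buffer B_BUSY
* and take it off that lru, the latter to release it again.
*/
#if 0
static struct buf *example_get_idle_mapped_buf(struct bufmap *);
static void example_put_buf(struct buf *);
static void
example_reclaim(struct bufmap *bmap, vsize_t wanted)
{
	struct buf *bp;
	vsize_t freed = 0;

	while (freed < wanted &&
	    (bp = example_get_idle_mapped_buf(bmap)) != NULL) {
		freed += round_page(bp->b_bufsize);
		genbuf_mapout(bp);	/* returns the kva to bmap->bm_iomap */
		example_put_buf(bp);
	}
}
#endif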
void
bufmap_update(struct bufmap *bm)
{
pmap_update(uvm_iomap_pmap(bm->bm_iomap));
}
--NextPart-20031120184112-0180700--