Subject: Re: buffer cache memory management revision
To: None <pk@cs.few.eur.nl>
From: YAMAMOTO Takashi <yamt@mwd.biglobe.ne.jp>
List: tech-kern
Date: 11/20/2003 18:44:33
--NextPart-20031120184112-0180700
Content-Type: Text/Plain; charset=us-ascii

hi,

> Therefore, I propose to revisit the age-old memory management still
> employed by the buffer cache. In particular, I'd like to get rid of
> the MAXBSIZE reservation of virtual memory per buffer which is sparsely
> mapped by a privately managed pool of physical pages. Currently, this
> scheme stresses MMU resources on some platforms like sun4 & sun4c.
> It also wastes a large amount of kernel VM space on machines with lots of
> physical memory when the default buffer cache parameters are in use.

this reminds me of my old patches (attached).
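
in short, the diffs drop the static nbuf * MAXBSIZE kva reservation and
give each buffer a page array (b_pages) that is mapped into kernel va
on demand through a per-consumer "bufmap"; total kva use is then bounded
by bufcache_map_size instead of growing with nbuf * MAXBSIZE.  a rough
sketch of the consumer-side pattern (names are the ones from
bufcache6.diff below; illustration only, not the exact code):

	/*
	 * sketch: before handing a buffer to a driver (or to softdep)
	 * that needs it kernel-addressable, attach a mapper and map it
	 * in.  this mirrors the spec_strategy() change below.
	 */
	if (!BUF_IS_ADDRESSABLE(bp)) {
		if (bp->b_map == NULL)
			bp->b_map = &devbufmap;	/* attach the default mapper */
		buf_mapin(bp);			/* allocate kva and map b_pages */
	}
	(*bdev->d_strategy)(bp);

unmapped buffers keep their pages but no kva; when the map fills up,
bufcache_reclaimkva() unmaps victims taken from the kva lru list.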

YAMAMOTO Takashi

--NextPart-20031120184112-0180700
Content-Type: Text/Plain; charset=us-ascii
Content-Disposition: attachment; filename="bufcache6.diff"

Index: miscfs/specfs/spec_vnops.c
===================================================================
--- miscfs/specfs/spec_vnops.c	(revision 283)
+++ miscfs/specfs/spec_vnops.c	(working copy)
@@ -581,11 +581,35 @@ spec_strategy(v)
 
 	bp = ap->a_bp;
 	if (!(bp->b_flags & B_READ) &&
-	    (LIST_FIRST(&bp->b_dep)) != NULL && bioops.io_start)
+	    (LIST_FIRST(&bp->b_dep)) != NULL && bioops.io_start) {
+		/*
+		 * XXX softdep needs buffers to be kernel-addressable.
+		 * XXX this should be pushed into each filesystem.
+		 */
+		if (!BUF_IS_ADDRESSABLE(bp)) {
+			if (bp->b_map == NULL) {
+				/* attach */
+				bp->b_map = &devbufmap; 
+			}
+			buf_mapin(bp);
+		}
 		(*bioops.io_start)(bp);
+	}
 	bdev = bdevsw_lookup(bp->b_dev);
-	if (bdev != NULL)
+	if (bdev != NULL) {
+		/*
+		 * XXX currently all drivers need buffers to be kernel-addressable.
+		 * XXX this should be pushed into each driver.
+		 */
+		if (!BUF_IS_ADDRESSABLE(bp)) {
+			if (bp->b_map == NULL) {
+				/* attach */
+				bp->b_map = &devbufmap; 
+			}
+			buf_mapin(bp);
+		}
 		(*bdev->d_strategy)(bp);
+	}
 	return (0);
 }
 
Index: conf/files
===================================================================
--- conf/files	(revision 282)
+++ conf/files	(working copy)
@@ -1157,6 +1157,7 @@ file	kern/uipc_socket2.c
 file	kern/uipc_syscalls.c
 file	kern/uipc_usrreq.c
 file	kern/vfs_bio.c
+file	kern/vfs_bufmap.c
 file	kern/vfs_cache.c
 file	kern/vfs_getcwd.c
 file	kern/vfs_init.c
Index: ufs/lfs/lfs_segment.c
===================================================================
--- ufs/lfs/lfs_segment.c	(revision 266)
+++ ufs/lfs/lfs_segment.c	(working copy)
@@ -101,6 +101,8 @@ __KERNEL_RCSID(0, "$NetBSD: lfs_segment.
 #include <uvm/uvm.h>
 #include <uvm/uvm_extern.h>
 
+void bufcache_notemappedfree(struct buf *); /* XXX */
+
 MALLOC_DEFINE(M_SEGMENT, "LFS segment", "Segment for LFS");
 
 extern int count_lock_queue(void);
@@ -1756,6 +1758,11 @@ lfs_writeseg(struct lfs *fs, struct segm
 			newbp = lfs_newbuf(fs, bp->b_vp, bp->b_lblkno,
 					   bp->b_bcount, LFS_NB_IBLOCK);
 			newbp->b_blkno = bp->b_blkno;
+			/* XXX should use our own map? */
+			if (!BUF_IS_ADDRESSABLE(bp)) {
+				buf_mapin(bp);
+				bufcache_notemappedfree(bp);
+			}
 			memcpy(newbp->b_data, bp->b_data,
 			       newbp->b_bcount);
 
@@ -1869,6 +1876,11 @@ lfs_writeseg(struct lfs *fs, struct segm
 			} else
 #endif /* LFS_USE_B_INVAL */
 			{
+				/* XXX should use our own map? */
+				if (!BUF_IS_ADDRESSABLE(bp)) {
+					buf_mapin(bp);
+					bufcache_notemappedfree(bp);
+				}
 				memcpy(dp, (*bpp)->b_data + byteoffset,
 				       el_size);
 			}
Index: ufs/lfs/lfs_bio.c
===================================================================
--- ufs/lfs/lfs_bio.c	(revision 266)
+++ ufs/lfs/lfs_bio.c	(working copy)
@@ -746,7 +746,7 @@ lfs_countlocked(int *count, long *bytes,
 		n++;
 		size += bp->b_bufsize;
 #ifdef DEBUG_LOCKED_LIST
-		if (n > nbuf)
+		if (n > nbufcache_min)
 			panic("lfs_countlocked: this can't happen: more"
 			      " buffers locked than exist");
 #endif
Index: ufs/lfs/lfs_vfsops.c
===================================================================
--- ufs/lfs/lfs_vfsops.c	(revision 266)
+++ ufs/lfs/lfs_vfsops.c	(working copy)
@@ -1296,12 +1296,12 @@ lfs_mountfs(struct vnode *devvp, struct 
 	if (fs->lfs_ivnode->v_size / fs->lfs_bsize > LFS_MAX_BUFS) {
 		fs->lfs_flags |= LFS_WARNED;
 		printf("lfs_mountfs: please consider increasing NBUF to at least %lld\n",
-			(long long)(fs->lfs_ivnode->v_size / fs->lfs_bsize) * (nbuf / LFS_MAX_BUFS));
+			(long long)(fs->lfs_ivnode->v_size / fs->lfs_bsize) * (nbufcache_min / LFS_MAX_BUFS));
 	}
 	if (fs->lfs_ivnode->v_size > LFS_MAX_BYTES) {
 		fs->lfs_flags |= LFS_WARNED;
 		printf("lfs_mountfs: please consider increasing BUFPAGES to at least %lld\n",
-			(long long)fs->lfs_ivnode->v_size * bufpages / LFS_MAX_BYTES);
+			(long long)fs->lfs_ivnode->v_size * nbufcachepage_min / LFS_MAX_BYTES);
 	}
 
 	return (0);
@@ -1387,12 +1387,12 @@ lfs_unmount(struct mount *mp, int mntfla
 				" NBUF to at least %lld\n",
 				(long long)(fs->lfs_ivnode->v_size /
 					    fs->lfs_bsize) *
-				(long long)(nbuf / LFS_MAX_BUFS));
+				(long long)(nbufcache_min / LFS_MAX_BUFS));
 		if (fs->lfs_ivnode->v_size > LFS_MAX_BYTES)
 			printf("lfs_unmount: please consider increasing"
 				" BUFPAGES to at least %lld\n",
 				(long long)fs->lfs_ivnode->v_size *
-				bufpages / LFS_MAX_BYTES);
+				nbufcachepage_min / LFS_MAX_BYTES);
 	}
 
 	/* Explicitly write the superblock, to update serial and pflags */
Index: ufs/lfs/lfs.h
===================================================================
--- ufs/lfs/lfs.h	(revision 266)
+++ ufs/lfs/lfs.h	(working copy)
@@ -109,11 +109,13 @@
 #define PG_DELWRI	PG_PAGER1	/* Local def for delayed pageout */
 
 /* Resource limits */
-#define LFS_MAX_BUFS	    ((nbuf >> 2) - 10)
-#define LFS_WAIT_BUFS	    ((nbuf >> 1) - (nbuf >> 3) - 10)
-#define LFS_MAX_BYTES	    (((bufpages >> 2) - 10) * PAGE_SIZE)
-#define LFS_WAIT_BYTES	    (((bufpages >> 1) - (bufpages >> 3) - 10) \
-			      * PAGE_SIZE)
+#define LFS_MAX_BUFS	    ((nbufcache_min >> 2) - 10)
+#define LFS_WAIT_BUFS	    ((nbufcache_min >> 1) - (nbufcache_min >> 3) - 10)
+/* XXX must consider kva */
+#define LFS_MAX_BYTES	\
+	(((nbufcachepage_min >> 2) - 10) * PAGE_SIZE)
+#define LFS_WAIT_BYTES	\
+	(((nbufcachepage_min >> 1) - (nbufcachepage_min >> 3) - 10) * PAGE_SIZE)
 #define LFS_MAX_DIROP	    ((desiredvnodes >> 2) + (desiredvnodes >> 3))
 #define LFS_MAX_PAGES \
      (((uvmexp.active + uvmexp.inactive + uvmexp.free) * uvmexp.filemin) >> 8)
Index: ufs/ext2fs/ext2fs_subr.c
===================================================================
--- ufs/ext2fs/ext2fs_subr.c	(revision 266)
+++ ufs/ext2fs/ext2fs_subr.c	(working copy)
@@ -121,6 +121,7 @@ ext2fs_checkoverlap(bp, ip)
 	struct buf *bp;
 	struct inode *ip;
 {
+#if 0 /* XXX */
 	struct buf *ebp, *ep;
 	daddr_t start, last;
 	struct vnode *vp;
@@ -147,5 +148,6 @@ ext2fs_checkoverlap(bp, ip)
 			ep->b_blkno + btodb(ep->b_bcount) - 1);
 		panic("Disk buffer overlap");
 	}
+#endif
 }
 #endif
Index: ufs/ffs/ffs_softdep.c
===================================================================
--- ufs/ffs/ffs_softdep.c	(revision 253)
+++ ufs/ffs/ffs_softdep.c	(working copy)
@@ -5745,7 +5745,7 @@ softdep_trackbufs(int delta, boolean_t t
 {
 
 	if (delta < 0) {
-		if (softdep_lockedbufs < nbuf >> 2) {
+		if (softdep_lockedbufs < nbufcache_min >> 2) {
 			wakeup(&softdep_lockedbufs);
 		}
 		KASSERT(softdep_lockedbufs >= -delta);
@@ -5753,7 +5753,7 @@ softdep_trackbufs(int delta, boolean_t t
 		return;
 	}
 
-	while (throttle && softdep_lockedbufs >= nbuf >> 2) {
+	while (throttle && softdep_lockedbufs >= nbufcache_min >> 2) {
 		speedup_syncer();
 		tsleep(&softdep_lockedbufs, PRIBIO, "softdbufs", 0);
 	}
Index: ufs/ffs/ffs_subr.c
===================================================================
--- ufs/ffs/ffs_subr.c	(revision 266)
+++ ufs/ffs/ffs_subr.c	(working copy)
@@ -201,6 +201,7 @@ ffs_checkoverlap(bp, ip)
 	struct buf *bp;
 	struct inode *ip;
 {
+#if 0 /* XXX */
 	struct buf *ebp, *ep;
 	daddr_t start, last;
 	struct vnode *vp;
@@ -227,6 +228,7 @@ ffs_checkoverlap(bp, ip)
 		    ep->b_blkno + btodb(ep->b_bcount) - 1);
 		panic("Disk buffer overlap");
 	}
+#endif
 }
 #endif /* _KERNEL && DIAGNOSTIC */
 
Index: kern/kern_allocsys.c
===================================================================
--- kern/kern_allocsys.c	(revision 283)
+++ kern/kern_allocsys.c	(working copy)
@@ -69,7 +69,6 @@
 #include <sys/cdefs.h>
 __KERNEL_RCSID(0, "$NetBSD: kern_allocsys.c,v 1.24 2003/08/07 16:31:42 agc Exp $");
 
-#include "opt_bufcache.h"
 #include "opt_sysv.h"
 
 #include <sys/param.h>
@@ -85,29 +84,6 @@ __KERNEL_RCSID(0, "$NetBSD: kern_allocsy
 #include <sys/shm.h>
 #endif
 #include <uvm/uvm_extern.h>
-/*
- * Declare these as initialized data so we can patch them.
- */
-#ifndef	NBUF
-# define NBUF 0
-#endif
-
-#ifndef	BUFPAGES
-# define BUFPAGES 0
-#endif
-
-#ifdef BUFCACHE
-# if (BUFCACHE < 5) || (BUFCACHE > 95)
-#  error BUFCACHE is not between 5 and 95
-# endif
-#else
-  /* Default to 10% of first 2MB and 5% of remaining. */
-# define BUFCACHE 0
-#endif
-
-u_int	nbuf = NBUF;
-u_int	bufpages = BUFPAGES;	/* optional hardwired count */
-u_int	bufcache = BUFCACHE;	/* % of RAM to use for buffer cache */
 
 /*
  * Allocate space for system data structures.  We are given
@@ -140,61 +116,5 @@ allocsys(caddr_t v, caddr_t (*mdcallback
 	ALLOCSYS(v, msqids, struct msqid_ds, msginfo.msgmni);
 #endif
 
-	/*
-	 * Determine how many buffers to allocate.
-	 *
-	 *	- If bufcache is specified, use that % of memory
-	 *	  for the buffer cache.
-	 *
-	 *	- Otherwise, we default to the traditional BSD
-	 *	  formula of 10% of the first 2MB and 5% of
-	 *	  the remaining.
-	 */
-	if (bufpages == 0) {
-		if (bufcache != 0) {
-			if (bufcache < 5 || bufcache > 95)
-				panic("bufcache is out of range (%d)",
-				    bufcache);
-			bufpages = physmem / 100 * bufcache;
-		} else {
-			if (physmem < btoc(2 * 1024 * 1024))
-				bufpages = physmem / 10;
-			else
-				bufpages = (btoc(2 * 1024 * 1024) + physmem) /
-				    20;
-		}
-	}
-
-#ifdef DIAGNOSTIC
-	if (bufpages == 0)
-		panic("bufpages = 0");
-#endif
-
-	/*
-	 * Call the mdcallback now; it may need to adjust bufpages.
-	 */
-	if (mdcallback != NULL)
-		v = mdcallback(v);
-
-	/* 
-	 * Ensure a minimum of 16 buffers.
-	 */
-	if (nbuf == 0) {
-		nbuf = bufpages;
-		if (nbuf < 16)
-			nbuf = 16;
-	}
-
-#ifdef VM_MAX_KERNEL_BUF
-	/*
-	 * XXX stopgap measure to prevent wasting too much KVM on
-	 * the sparsely filled buffer cache.
-	 */
-	if (nbuf > VM_MAX_KERNEL_BUF / MAXBSIZE)
-		nbuf = VM_MAX_KERNEL_BUF / MAXBSIZE;
-#endif
-
-	ALLOCSYS(v, buf, struct buf, nbuf);
-
 	return (v);
 }
Index: kern/vfs_bio.c
===================================================================
--- kern/vfs_bio.c	(revision 283)
+++ kern/vfs_bio.c	(working copy)
@@ -82,11 +82,14 @@
 #include <sys/cdefs.h>
 __KERNEL_RCSID(0, "$NetBSD: vfs_bio.c,v 1.93 2003/08/07 16:32:01 agc Exp $");
 
+#include "opt_bufcache.h"
+
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 #include <sys/buf.h>
 #include <sys/vnode.h>
+#include <sys/kernel.h>
 #include <sys/mount.h>
 #include <sys/malloc.h>
 #include <sys/resourcevar.h>
@@ -96,6 +99,53 @@ __KERNEL_RCSID(0, "$NetBSD: vfs_bio.c,v 
 
 #include <miscfs/specfs/specdev.h>
 
+#include <machine/vmparam.h>	/* VM_FREELIST_BUFCACHE */
+
+/*
+ * if the arch has a freelist that's preferable for bufcache, use it.
+ */
+#ifdef VM_FREELIST_BUFCACHE
+#define	UVM_PGA_STRAT_BUFCACHE	UVM_PGA_STRAT_FALLBACK
+#else /* VM_FREELIST_BUFCACHE */
+#define	UVM_PGA_STRAT_BUFCACHE	UVM_PGA_STRAT_NORMAL
+#define	VM_FREELIST_BUFCACHE	0
+#endif /* VM_FREELIST_BUFCACHE */
+
+/*
+ * Declare these as initialized data so we can patch them.
+ */
+#ifndef	NBUF
+# define NBUF 0
+#endif
+
+#ifndef	BUFPAGES
+# define BUFPAGES 0
+#endif
+
+#ifdef BUFCACHE
+# if (BUFCACHE < 5) || (BUFCACHE > 95)
+#  error BUFCACHE is not between 5 and 95
+# endif
+#else
+  /* Default to 10% of first 2MB and 5% of remaining. */
+# define BUFCACHE 0
+#endif
+
+/*
+ * lock for bufcache counts below.
+ */
+struct simplelock bufcache_count_slock = SIMPLELOCK_INITIALIZER;
+
+unsigned int nbufcache_min = NBUF;
+unsigned int nbufcache_max = NBUF * 2; /* XXX */
+unsigned int nbufcache;
+
+unsigned int nbufcachepage_min = BUFPAGES;
+unsigned int nbufcachepage_max = BUFPAGES * 2; /* XXX */
+unsigned int nbufcachepage;
+
+unsigned int bufcache = BUFCACHE;	/* % of RAM to use for buffer cache */
+
 /* Macros to clear/set/test flags. */
 #define	SET(t, f)	(t) |= (f)
 #define	CLR(t, f)	(t) &= ~(f)
@@ -105,7 +155,8 @@ __KERNEL_RCSID(0, "$NetBSD: vfs_bio.c,v 
  * Definitions for the buffer hash lists.
  */
 #define	BUFHASH(dvp, lbn)	\
-	(&bufhashtbl[(((long)(dvp) >> 8) + (int)(lbn)) & bufhash])
+	(&bufhashtbl[((((u_long)(uintptr_t)(dvp) >> PAGE_SHIFT) ^ \
+	((u_long)(uintptr_t)(dvp) >> 3)) + (int)(lbn)) & bufhash])
 LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
 u_long	bufhash;
 #ifndef SOFTDEP
@@ -131,6 +182,10 @@ struct bio_ops bioops;	/* I/O operation 
 TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
 int needbuffer;
 
+TAILQ_HEAD(, buf) bufcache_iomap_lru;
+boolean_t bufcache_iomap_wanted;
+vsize_t bufcache_map_size;
+
 /*
  * Buffer queue lock.
  * Take this lock first if also taking some buffer's b_interlock.
@@ -148,6 +203,26 @@ struct pool bufpool;
 static __inline struct buf *bio_doread(struct vnode *, daddr_t, int,
 					struct ucred *, int);
 int count_lock_queue(void);
+struct buf *getnewbuf(int, int, boolean_t);
+
+/*
+ */
+void bufcachemap_init(void);
+void bufcache_mapin(struct buf *);
+void bufcache_mapout(struct buf *);
+void bufcache_reclaimkva(struct bufmap *, vsize_t);
+int bufcache_allocpages(struct buf *, vsize_t);
+int bufcache_freepages(struct buf *, vsize_t);
+void bufcache_movepages(struct buf *, struct buf *, int);
+void bufcache_initparam(void);
+unsigned int bufcache_countfree(void);
+int bufcache_reclaim(int);
+#ifdef DEBUG
+void bufcache_debugdump(void);
+#endif
+struct bufmap bufcachemapper;
+
+void devbufmap_init(void); /* XXX */
 
 /*
  * Insq/Remq for the buffer free lists.
@@ -162,6 +237,9 @@ bremfree(bp)
 {
 	struct bqueues *dp = NULL;
 
+	LOCK_ASSERT(simple_lock_held(&bqueue_slock));
+	LOCK_ASSERT(bp->b_bufsize == 0 || simple_lock_held(&bp->b_interlock));
+
 	/*
 	 * We only calculate the head of the freelist when removing
 	 * the last element of the list as that is the only time that
@@ -195,11 +273,24 @@ found:
 #endif /* DEBUG_BUFCACHE */
 
 	KASSERT(dp == NULL || !(bp->b_flags & B_LOCKED) ||
-	    dp == &bufqueues[BQ_LOCKED]);
+	    (bp->b_flags & B_INVAL) || dp == &bufqueues[BQ_LOCKED]);
+	KASSERT(dp == NULL || (bp->b_flags & B_LOCKED) ||
+	    dp != &bufqueues[BQ_LOCKED]);
 	KASSERT(dp == NULL || bp->b_bufsize != 0 || dp == &bufqueues[BQ_EMPTY]);
 	KASSERT(dp == NULL || bp->b_bufsize == 0 || dp != &bufqueues[BQ_EMPTY]);
 
 	TAILQ_REMOVE(dp, bp, b_freelist);
+
+	/*
+	 * Remove from the kva lru list.
+	 */
+	if (bp->b_flags & B_MAPPED) {
+		KASSERT(bp->b_mappedlist.tqe_prev != NULL);
+		TAILQ_REMOVE(&bufcache_iomap_lru, bp, b_mappedlist);
+#ifdef DIAGNOSTIC
+		bp->b_mappedlist.tqe_prev = NULL;
+#endif
+	}
 }
 
 /*
@@ -213,33 +304,42 @@ bufinit()
 	u_int i, base, residual;
 
 	/*
-	 * Initialize the buffer pool.  This pool is used for buffers
-	 * which are strictly I/O control blocks, not buffer cache
-	 * buffers.
+	 * Initialize the buffer pool.
 	 */
 	pool_init(&bufpool, sizeof(struct buf), 0, 0, 0, "bufpl", NULL);
 
+	bufcache_initparam();
+	bufcachemap_init();
+	devbufmap_init();
 	for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
 		TAILQ_INIT(dp);
-	bufhashtbl = hashinit(nbuf, HASH_LIST, M_CACHE, M_WAITOK, &bufhash);
-	base = bufpages / nbuf;
-	residual = bufpages % nbuf;
-	for (i = 0; i < nbuf; i++) {
-		bp = &buf[i];
-		memset((char *)bp, 0, sizeof(*bp));
+	TAILQ_INIT(&bufcache_iomap_lru);
+	bufhashtbl =
+	    hashinit(nbufcache_min, HASH_LIST, M_CACHE, M_WAITOK, &bufhash);
+	base = nbufcachepage_min / nbufcache_min;
+	residual = nbufcachepage_min % nbufcache_min;
+	for (i = 0; i < nbufcache_min; i++) {
+		vsize_t bufsize;
+
+		bp = pool_get(&bufpool, PR_NOWAIT);
+		memset(bp, 0, sizeof(*bp));
 		BUF_INIT(bp);
+		bp->b_map = &bufcachemapper;
 		bp->b_dev = NODEV;
 		bp->b_vnbufs.le_next = NOLIST;
-		bp->b_data = buffers + i * MAXBSIZE;
 		if (i < residual)
-			bp->b_bufsize = (base + 1) * PAGE_SIZE;
+			bufsize = (base + 1) * PAGE_SIZE;
 		else
-			bp->b_bufsize = base * PAGE_SIZE;
-		bp->b_flags = B_INVAL;
+			bufsize = base * PAGE_SIZE;
+		if (bufcache_allocpages(bp, bufsize))
+			panic("can't alloc buf page");
+		bp->b_flags = B_INVAL | B_PAGES;
 		dp = bp->b_bufsize ? &bufqueues[BQ_AGE] : &bufqueues[BQ_EMPTY];
 		binsheadfree(bp, dp);
 		binshash(bp, &invalhash);
+		nbufcache++;
 	}
+	KASSERT(nbufcache == nbufcache_min);
 }
 
 static __inline struct buf *
@@ -367,6 +467,7 @@ bwrite(bp)
 	struct mount *mp;
 
 	KASSERT(ISSET(bp->b_flags, B_BUSY));
+	KASSERT(ISSET(bp->b_flags, B_PAGES));
 
 	vp = bp->b_vp;
 	if (vp != NULL) {
@@ -471,6 +572,7 @@ bdwrite(bp)
 	int s;
 
 	KASSERT(ISSET(bp->b_flags, B_BUSY));
+	KASSERT(ISSET(bp->b_flags, B_PAGES));
 
 	/* If this is a tape block, write the block now. */
 	bdev = bdevsw_lookup(bp->b_dev);
@@ -512,6 +614,7 @@ bawrite(bp)
 	int s;
 
 	KASSERT(ISSET(bp->b_flags, B_BUSY));
+	KASSERT(ISSET(bp->b_flags, B_PAGES));
 
 	s = splbio();
 	simple_lock(&bp->b_interlock);
@@ -554,15 +657,16 @@ brelse(bp)
 	struct buf *bp;
 {
 	struct bqueues *bufq;
+	boolean_t dofree = FALSE;
 	int s;
 
 	KASSERT(ISSET(bp->b_flags, B_BUSY));
+	KASSERT(ISSET(bp->b_flags, B_PAGES));
 	KASSERT(!ISSET(bp->b_flags, B_CALL));
 
 	/* Block disk interrupts. */
 	s = splbio();
 	simple_lock(&bqueue_slock);
-	simple_lock(&bp->b_interlock);
 
 	/* Wake up any processes waiting for any buffer to become free. */
 	if (needbuffer) {
@@ -570,6 +674,13 @@ brelse(bp)
 		wakeup(&needbuffer);
 	}
 
+	if ((bp->b_flags & B_MAPPED) && bufcache_iomap_wanted) {
+		bufcache_iomap_wanted = FALSE;
+		wakeup(&bufcache_iomap_wanted);
+	}
+
+	simple_lock(&bp->b_interlock);
+
 	/* Wake up any proceeses waiting for _this_ buffer to become free. */
 	if (ISSET(bp->b_flags, B_WANTED)) {
 		CLR(bp->b_flags, B_WANTED|B_AGE);
@@ -598,11 +709,14 @@ brelse(bp)
 		CLR(bp->b_flags, B_VFLUSH);
 		if (!ISSET(bp->b_flags, B_ERROR|B_INVAL|B_LOCKED|B_AGE))
 			goto already_queued;
-		else
-			bremfree(bp);
+
+		bremfree(bp);
 	}
 
+	KASSERT(bp->b_mappedlist.tqe_prev == NULL);
+
 	if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
+		KASSERT(!(bp->b_flags & B_LOCKED));
 		/*
 		 * If it's invalid or empty, dissociate it from its vnode
 		 * and put on the head of the appropriate queue.
@@ -614,13 +728,25 @@ brelse(bp)
 			reassignbuf(bp, bp->b_vp);
 			brelvp(bp);
 		}
-		if (bp->b_bufsize <= 0)
-			/* no data */
-			bufq = &bufqueues[BQ_EMPTY];
-		else
+		if (bp->b_bufsize <= 0) {
+			simple_lock(&bufcache_count_slock);
+			if (nbufcache > nbufcache_min)
+				dofree = TRUE; /* put back to bufpool */
+			else
+				bufq = &bufqueues[BQ_EMPTY];
+			simple_unlock(&bufcache_count_slock);
+		} else {
 			/* invalid data */
+			/* XXX not worth caching unless B_MAPPED. */
 			bufq = &bufqueues[BQ_AGE];
-		binsheadfree(bp, bufq);
+			if (bp->b_flags & B_MAPPED) {
+				KASSERT(bp->b_mappedlist.tqe_prev == NULL);
+				TAILQ_INSERT_HEAD(&bufcache_iomap_lru, bp,
+				    b_mappedlist);
+			}
+		}
+		if (!dofree)
+			binsheadfree(bp, bufq);
 	} else {
 		/*
 		 * It has valid data.  Put it on the end of the appropriate
@@ -649,16 +775,29 @@ brelse(bp)
 			    &bufqueues[BQ_AGE];
 		}
 		binstailfree(bp, bufq);
+		if (bp->b_flags & B_MAPPED) {
+			KASSERT(bp->b_mappedlist.tqe_prev == NULL);
+			TAILQ_INSERT_TAIL(&bufcache_iomap_lru, bp, b_mappedlist);
+		}
 	}
 
 already_queued:
+	simple_unlock(&bqueue_slock);
 	/* Unlock the buffer. */
 	CLR(bp->b_flags, B_AGE|B_ASYNC|B_BUSY|B_NOCACHE);
 	SET(bp->b_flags, B_CACHE);
 
-	/* Allow disk interrupts. */
 	simple_unlock(&bp->b_interlock);
-	simple_unlock(&bqueue_slock);
+	if (dofree) {
+		KASSERT(bp->b_bufsize == 0);
+		bremhash(bp);
+		pool_put(&bufpool, bp);
+		simple_lock(&bufcache_count_slock);
+		KASSERT(nbufcache > nbufcache_min);
+		nbufcache--;
+		simple_unlock(&bufcache_count_slock);
+	}
+	/* Allow disk interrupts. */
 	splx(s);
 }
 
@@ -732,7 +871,7 @@ start:
 		SET(bp->b_flags, B_BUSY);
 		bremfree(bp);
 	} else {
-		if ((bp = getnewbuf(slpflag, slptimeo)) == NULL) {
+		if ((bp = getnewbuf(slpflag, slptimeo, TRUE)) == NULL) {
 			simple_unlock(&bqueue_slock);
 			splx(s);
 			goto start;
@@ -761,7 +900,7 @@ geteblk(size)
 
 	s = splbio();
 	simple_lock(&bqueue_slock);
-	while ((bp = getnewbuf(0, 0)) == 0)
+	while ((bp = getnewbuf(0, 0, TRUE)) == 0)
 		;
 
 	SET(bp->b_flags, B_INVAL);
@@ -790,6 +929,9 @@ allocbuf(bp, size)
 	vsize_t desired_size;
 	int s;
 
+	KASSERT(bp->b_flags & B_PAGES);
+	KASSERT(0 <= size);
+
 	desired_size = round_page((vsize_t)size);
 	if (desired_size > MAXBSIZE)
 		panic("allocbuf: buffer larger than MAXBSIZE requested");
@@ -803,12 +945,15 @@ allocbuf(bp, size)
 	 * steal their pages.
 	 */
 	while (bp->b_bufsize < desired_size) {
-		int amt;
+		/* try to allocate new pages */
+		if (bufcache_allocpages(bp, desired_size) == 0)
+			break;
 
 		/* find a buffer */
 		s = splbio();
 		simple_lock(&bqueue_slock);
-		while ((nbp = getnewbuf(0, 0)) == NULL)
+
+		while ((nbp = getnewbuf(0, 0, FALSE)) == NULL)
 			;
 
 		SET(nbp->b_flags, B_INVAL);
@@ -818,16 +963,19 @@ allocbuf(bp, size)
 		simple_unlock(&bqueue_slock);
 		splx(s);
 
+		if (nbp->b_bufsize == 0) {
+			/*
+			 * race between bufcache_allocpages and getnewbuf.
+			 * we don't want a buffer without pages.
+			 */
+			printf("bufcache race\n");
+			brelse(nbp);
+			continue;
+		}
+
 		/* and steal its pages, up to the amount we need */
-		amt = min(nbp->b_bufsize, (desired_size - bp->b_bufsize));
-		pagemove((nbp->b_data + nbp->b_bufsize - amt),
-			 bp->b_data + bp->b_bufsize, amt);
-		bp->b_bufsize += amt;
-		nbp->b_bufsize -= amt;
-
-		/* reduce transfer count if we stole some data */
-		if (nbp->b_bcount > nbp->b_bufsize)
-			nbp->b_bcount = nbp->b_bufsize;
+		bufcache_movepages(bp, nbp, desired_size - bp->b_bufsize);
+		KASSERT(bp->b_bufsize <= desired_size);
 
 #ifdef DIAGNOSTIC
 		if (nbp->b_bufsize < 0)
@@ -836,6 +984,8 @@ allocbuf(bp, size)
 		brelse(nbp);
 	}
 
+	KASSERT(bp->b_bufsize >= desired_size);
+
 	/*
 	 * If we want a buffer smaller than the current size,
 	 * shrink this buffer.  Grab a buf head from the EMPTY queue,
@@ -843,6 +993,9 @@ allocbuf(bp, size)
 	 * If there are no free buffer headers, leave the buffer alone.
 	 */
 	if (bp->b_bufsize > desired_size) {
+		if (bufcache_freepages(bp, desired_size) == 0)
+			goto out;
+
 		s = splbio();
 		simple_lock(&bqueue_slock);
 		if ((nbp = TAILQ_FIRST(&bufqueues[BQ_EMPTY])) == NULL) {
@@ -852,16 +1005,15 @@ allocbuf(bp, size)
 			goto out;
 		}
 		/* No need to lock nbp since it came from the empty queue */
+		KASSERT(nbp->b_bufsize == 0);
+		KASSERT(!(nbp->b_flags & B_BUSY));
 		bremfree(nbp);
 		SET(nbp->b_flags, B_BUSY | B_INVAL);
 		simple_unlock(&bqueue_slock);
 		splx(s);
 
 		/* move the page to it and note this change */
-		pagemove(bp->b_data + desired_size,
-		    nbp->b_data, bp->b_bufsize - desired_size);
-		nbp->b_bufsize = bp->b_bufsize - desired_size;
-		bp->b_bufsize = desired_size;
+		bufcache_movepages(nbp, bp, bp->b_bufsize - desired_size);
 		nbp->b_bcount = 0;
 
 		/* release the newly-filled buffer and leave */
@@ -870,6 +1022,8 @@ allocbuf(bp, size)
 
 out:
 	bp->b_bcount = size;
+	if (!(bp->b_flags & B_MAPPED))
+		bufcache_mapin(bp);
 }
 
 /*
@@ -881,19 +1035,37 @@ out:
  * Return buffer locked.
  */
 struct buf *
-getnewbuf(slpflag, slptimeo)
+getnewbuf(slpflag, slptimeo, doalloc)
 	int slpflag, slptimeo;
+	boolean_t doalloc;
 {
 	struct buf *bp;
 
 start:
 	LOCK_ASSERT(simple_lock_held(&bqueue_slock));
 
-	if ((bp = TAILQ_FIRST(&bufqueues[BQ_AGE])) != NULL ||
-	    (bp = TAILQ_FIRST(&bufqueues[BQ_LRU])) != NULL) {
-		simple_lock(&bp->b_interlock);
-		bremfree(bp);
-	} else {
+	bp = TAILQ_FIRST(&bufqueues[BQ_AGE]);
+	if (doalloc && bp == NULL) {
+		simple_lock(&bufcache_count_slock);
+		if (nbufcache < nbufcache_max &&
+		    nbufcachepage < nbufcachepage_max) {
+			nbufcache++;
+			simple_unlock(&bufcache_count_slock);
+			bp = pool_get(&bufpool, PR_NOWAIT);
+			memset(bp, 0, sizeof(*bp));
+			BUF_INIT(bp);
+			bp->b_map = &bufcachemapper;
+			simple_lock(&bp->b_interlock);
+			bp->b_flags = B_BUSY | B_PAGES;
+			bp->b_dev = NODEV;
+			return bp;
+		}
+		simple_unlock(&bufcache_count_slock);
+	}
+	if (bp == NULL)
+		bp = TAILQ_FIRST(&bufqueues[BQ_LRU]);
+
+	if (bp == NULL) {
 		/* wait for a free buffer of any kind */
 		needbuffer = 1;
 		ltsleep(&needbuffer, slpflag|(PRIBIO+1),
@@ -901,12 +1073,17 @@ start:
 		return (NULL);
 	}
 
+	simple_lock(&bp->b_interlock);
+	bremfree(bp);
+	KASSERT(bp->b_bufsize > 0);
+
 	if (ISSET(bp->b_flags, B_VFLUSH)) {
 		/*
 		 * This is a delayed write buffer being flushed to disk.  Make
 		 * sure it gets aged out of the queue when it's finished, and
 		 * leave it off the LRU queue.
 		 */
+		KASSERT(bp->b_flags & B_BUSY);
 		CLR(bp->b_flags, B_VFLUSH);
 		SET(bp->b_flags, B_AGE);
 		simple_unlock(&bp->b_interlock);
@@ -941,7 +1118,7 @@ start:
 		(*bioops.io_deallocate)(bp);
 
 	/* clear out various other fields */
-	bp->b_flags = B_BUSY;
+	bp->b_flags = B_BUSY | (bp->b_flags & (B_PAGES | B_MAPPED));
 	bp->b_dev = NODEV;
 	bp->b_blkno = bp->b_lblkno = bp->b_rawblkno = 0;
 	bp->b_iodone = 0;
@@ -1022,6 +1199,8 @@ biodone(bp)
 	if (ISSET(bp->b_flags, B_CALL)) {
 		CLR(bp->b_flags, B_CALL);	/* but note callout done */
 		simple_unlock(&bp->b_interlock);
+		if (bp->b_map != NULL)
+			buf_mapout(bp);
 		(*bp->b_iodone)(bp);
 	} else {
 		if (ISSET(bp->b_flags, B_ASYNC)) {	/* if async, release */
@@ -1086,3 +1265,578 @@ vfs_bufstats()
 	}
 }
 #endif /* DEBUG */
+
+#include <uvm/uvm_iomap.h>
+unsigned int bufcachemap_size;
+
+struct uvm_iomap bufcache_iomap;
+
+void
+bufcachemap_init()
+{
+
+	memset(&bufcachemapper, 0, sizeof(bufcachemapper));
+	bufcachemapper.bm_mapin = bufcache_mapin;
+	bufcachemapper.bm_mapout = NULL;
+	bufcachemapper.bm_reclaim = bufcache_reclaimkva;
+	bufcachemapper.bm_iomap = &bufcache_iomap;
+
+	uvm_iomap_init(&bufcache_iomap, bufcache_map_size, round_page(MAXPHYS));
+}
+
+void
+bufcache_reclaimkva(struct bufmap *bmap, vsize_t size)
+{
+	int need = size;
+
+	KASSERT(bmap == &bufcachemapper);
+
+	do {
+		struct buf *victim;
+		int s;
+
+		/*
+		 * pick a buffer from the top of kva lru list.
+		 */
+		s = splbio();
+		simple_lock(&bqueue_slock);
+		victim = TAILQ_FIRST(&bufcache_iomap_lru);
+		if (victim == NULL) {
+#ifdef DEBUG
+			bufcache_debugdump();
+#endif
+			printf("no buf on kva lru; sleep\n");
+			bufcache_iomap_wanted = TRUE;
+			ltsleep(&bufcache_iomap_wanted,
+			    (PRIBIO + 1) | PNORELOCK, "bufkva", 0,
+			    &bqueue_slock);
+			splx(s);
+			printf("no buf on kva lru; woken\n");
+			continue;
+		}
+		/*
+		 * lock the buffer and take it off the freelist.
+		 */
+		simple_lock(&victim->b_interlock);
+		if (victim->b_flags & B_BUSY) {
+			simple_unlock(&bqueue_slock);
+			if (!(victim->b_flags & (B_VFLUSH|B_LOCKED)))
+				panic("%p: %lx\n",
+				    victim, victim->b_flags);
+			KASSERT(victim->b_flags & (B_VFLUSH|B_LOCKED));
+			victim->b_flags |= B_WANTED;
+			ltsleep(victim, (PRIBIO + 1) | PNORELOCK,
+			    "bunmap", 0, &victim->b_interlock);
+			splx(s);
+			continue;
+		}
+		bremfree(victim);
+		simple_unlock(&bqueue_slock);
+		victim->b_flags |= B_BUSY;
+		simple_unlock(&victim->b_interlock);
+		splx(s);
+
+		KASSERT(victim->b_flags & B_MAPPED);
+		KASSERT(!(victim->b_flags & B_VFLUSH));
+		KASSERT(victim->b_bufsize > 0);
+
+		/*
+		 * unmap the buffer.
+		 */
+		bufcache_mapout(victim);
+		brelse(victim);
+		need -= victim->b_bufsize;
+		if (need < 0)
+			need = 0;
+	} while (need > 0);
+}
+
+/*
+ * map a buffer.
+ * - allocate kva(b_data) and map the pages.
+ */
+void
+bufcache_mapin(struct buf *bp)
+{
+
+	LOCK_ASSERT(!simple_lock_held(&bqueue_slock));
+
+	genbuf_mapin(bp);
+	KASSERT(bp->b_map->bm_kva_used <= bufcache_map_size); /* XXX MP */
+
+	/*
+	 * XXX B_VFLUSH buffers are on freelist.
+	 * XXX note that getnewbuf does bremfree for B_VFLUSH buffers.
+	 *
+	 * XXX LFS maps B_LOCKED buffers to copy their contents to
+	 * XXX a segment buffer.
+	 */
+	if (bp->b_flags & B_VFLUSH) { /* racy check first */
+		int s;
+
+		s = splbio();
+		simple_lock(&bqueue_slock);
+		simple_lock(&bp->b_interlock);
+		bp->b_flags |= B_MAPPED;
+		if (bp->b_flags & B_VFLUSH) {
+			KASSERT(bp->b_mappedlist.tqe_prev == NULL);
+			TAILQ_INSERT_TAIL(&bufcache_iomap_lru, bp,
+			    b_mappedlist);
+		}
+		simple_unlock(&bp->b_interlock);
+		simple_unlock(&bqueue_slock);
+		splx(s);
+	} else
+		bp->b_flags |= B_MAPPED;
+
+	KASSERT(bp->b_map->bm_kva_used <= bufcache_map_size); /* XXX MP */
+}
+
+void bufcache_notemappedfree(struct buf *); /* XXX */
+void
+bufcache_notemappedfree(struct buf *bp)
+{
+
+	KASSERT(bp->b_flags & B_BUSY);
+	KASSERT(bp->b_flags & (B_VFLUSH | B_LOCKED));
+	KASSERT(bp->b_mappedlist.tqe_prev == NULL);
+	TAILQ_INSERT_TAIL(&bufcache_iomap_lru, bp,
+	    b_mappedlist);
+}
+
+/*
+ * unmap a buffer.
+ */
+void
+bufcache_mapout(struct buf *bp)
+{
+	int s;
+
+	KASSERT(bp->b_map == &bufcachemapper);
+	genbuf_mapout(bp);
+
+	s = splbio();
+	simple_lock(&bqueue_slock);
+	if (bufcache_iomap_wanted) {
+		bufcache_iomap_wanted = FALSE;
+		wakeup(&bufcache_iomap_wanted);
+	}
+	simple_unlock(&bqueue_slock);
+	splx(s);
+}
+
+int
+bufcache_allocpages(struct buf *bp, vsize_t size)
+{
+	int nalloc = (int)((int)size - bp->b_bufsize) >> PAGE_SHIFT;
+	int s;
+
+	KASSERT((size & PAGE_MASK) == 0);
+	KASSERT((bp->b_bufsize & PAGE_MASK) == 0);
+	KASSERT(size <= MAXBSIZE);
+	KASSERT(nalloc > 0);
+
+	s = splbio();
+	simple_lock(&bufcache_count_slock);
+	if (nbufcachepage + nalloc > nbufcachepage_max) {
+		nalloc = nbufcachepage_max - nbufcachepage;
+	}
+	nbufcachepage += nalloc;
+	simple_unlock(&bufcache_count_slock);
+	splx(s);
+
+	if (nalloc > 0 && bp->b_flags & B_MAPPED)
+		bufcache_mapout(bp);
+
+	for (; nalloc > 0; nalloc--) {
+		struct vm_page *pg;
+		int idx;
+
+		/*
+		 * XXX need an md hook?
+		 */
+		pg = uvm_pagealloc_strat(NULL, 0, NULL, 0,
+		    UVM_PGA_STRAT_BUFCACHE, VM_FREELIST_BUFCACHE);
+		if (pg == NULL)
+			return ENOMEM;
+		idx = bp->b_bufsize >> PAGE_SHIFT;
+		KASSERT(bp->b_pages[idx] == NULL);
+		bp->b_pages[idx] = pg;
+		bp->b_bufsize += PAGE_SIZE;
+	}
+	KASSERT(bp->b_bufsize <= size);
+	if (bp->b_bufsize < size)
+		return ENOMEM;
+	return 0;
+}
+
+int
+bufcache_freepages(struct buf *bp, vsize_t size)
+{
+	int nfree = (int)(bp->b_bufsize - size) >> PAGE_SHIFT;
+	int s;
+
+	KASSERT((size & PAGE_MASK) == 0);
+	KASSERT((bp->b_bufsize & PAGE_MASK) == 0);
+	KASSERT(size <= MAXBSIZE);
+	KASSERT(nfree >= 0);
+
+	s = splbio();
+	simple_lock(&bufcache_count_slock);
+	if (nbufcachepage < nbufcachepage_min + nfree) {
+		nfree = nbufcachepage - nbufcachepage_min;
+	}
+	nbufcachepage -= nfree;
+	simple_unlock(&bufcache_count_slock);
+	splx(s);
+
+	if (nfree > 0 && bp->b_flags & B_MAPPED)
+		bufcache_mapout(bp);
+
+	for (; nfree > 0; nfree--) {
+		struct vm_page *pg;
+		int idx;
+
+		bp->b_bufsize -= PAGE_SIZE;
+		idx = bp->b_bufsize >> PAGE_SHIFT;
+		pg = bp->b_pages[idx];
+		KASSERT(pg != NULL);
+		uvm_pagefree(pg); /* XXX md hook? */
+#ifdef DIAGNOSTIC
+		bp->b_pages[idx] = NULL;
+#endif
+	}
+	KASSERT(bp->b_bufsize >= size);
+	if (bp->b_bufsize > size)
+		return ENOMEM; /* XXX */
+	return 0;
+}
+
+/*
+ * move pages from a buffer to another.
+ */
+void
+bufcache_movepages(struct buf *bp, struct buf *victim, int movesize)
+{
+	int npages;
+	int npages_victim;
+	int npages_move;
+
+	KASSERT(bp->b_flags & B_PAGES);
+	KASSERT(bp->b_flags & B_BUSY);
+	KASSERT((bp->b_bufsize & PAGE_MASK) == 0);
+	KASSERT(victim->b_flags & B_PAGES);
+	KASSERT(victim->b_flags & B_BUSY);
+	KASSERT((victim->b_bufsize & PAGE_MASK) == 0);
+	KASSERT(victim->b_bufsize > 0);
+	KASSERT(movesize > 0);
+	KASSERT((movesize & PAGE_MASK) == 0);
+	KASSERT(bp->b_bufsize + movesize <= MAXPHYS);
+
+	if (bp->b_flags & B_MAPPED)
+		bufcache_mapout(bp);
+	if (victim->b_flags & B_MAPPED)
+		bufcache_mapout(victim);
+
+	npages = bp->b_bufsize >> PAGE_SHIFT;
+	npages_victim = victim->b_bufsize >> PAGE_SHIFT;
+	npages_move = MIN(movesize >> PAGE_SHIFT, npages_victim);
+	while (npages_move > 0) {
+		npages_move--;
+		npages_victim--;
+		KASSERT(victim->b_pages[npages_victim]);
+		KASSERT(bp->b_pages[npages] == NULL);
+		bp->b_pages[npages] = victim->b_pages[npages_victim];
+#ifdef DIAGNOSTIC
+		victim->b_pages[npages_victim] = NULL;
+#endif
+		npages++;
+	}
+	bp->b_bufsize = npages << PAGE_SHIFT;
+	victim->b_bufsize = npages_victim << PAGE_SHIFT;
+
+	/* reduce transfer count if we stole some data */
+	if (victim->b_bcount > victim->b_bufsize)
+		victim->b_bcount = victim->b_bufsize;
+}
+
+/*
+ * count buffers on freelists.
+ */
+unsigned int
+bufcache_countfree()
+{
+	const struct bqueues *dp;
+	const struct buf *bp;
+	unsigned int nfree = 0;
+
+#if 0 /* used from ddb */
+	LOCK_ASSERT(simple_lock_held(&bqueue_slock));
+#endif
+
+	for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) {
+		TAILQ_FOREACH(bp, dp, b_freelist) {
+			nfree++;
+		}
+	}
+/*	KASSERT(nbufcache >= nfree); */
+
+	return nfree;
+}
+
+/*
+ * flush out all buffer caches.
+ *
+ * XXX is this really needed?
+ */
+int
+bufcache_shutdown()
+{
+	struct bqueues *dp;
+	struct buf *bp;
+	int iter, nbusy, nbusy_prev = 0, dcount, s;
+	unsigned int nbusy2;
+
+	/* Wait for sync to finish. */
+	dcount = 10000;
+	for (iter = 0; iter < 20;) {
+		unsigned int nfree;
+		nbusy = 0;
+
+		/*
+		 * XXX broken.  generally, buffers in i/o are not on the freelist.
+		 * XXX should free buffers until nbufcache reaches zero?
+		 */
+		for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) {
+			TAILQ_FOREACH(bp, dp, b_freelist) {
+				if ((bp->b_flags & (B_BUSY|B_INVAL|B_READ))
+				    == B_BUSY) {
+					nbusy++;
+					printf("busy buffer\n");
+				}
+				/*
+				 * With soft updates, some buffers that are
+				 * written will be remarked as dirty until other
+				 * buffers are written.
+				 */
+				if (bp->b_vp && bp->b_vp->v_mount
+				    && (bp->b_vp->v_mount->mnt_flag &
+					MNT_SOFTDEP)
+				    && (bp->b_flags & B_DELWRI)) {
+					s = splbio();
+					simple_lock(&bqueue_slock);
+					simple_lock(&bp->b_interlock);
+					bremfree(bp);
+					simple_unlock(&bqueue_slock);
+					bp->b_flags |= B_BUSY;
+					simple_unlock(&bp->b_interlock);
+					splx(s);
+					nbusy++;
+					bawrite(bp);
+					printf("softdep dirty buffer\n");
+					if (dcount-- <= 0) {
+						printf("softdep ");
+						goto fail;
+					}
+				}
+			}
+		}
+
+		/*
+		 * count buffers on freelists.
+		 */
+		s = splbio();
+		simple_lock(&bqueue_slock);
+		simple_lock(&bufcache_count_slock);
+		nfree = bufcache_countfree();
+		nbusy2 = nbufcache - nfree;
+		simple_unlock(&bufcache_count_slock);
+		simple_unlock(&bqueue_slock);
+		splx(s);
+		printf("nbusy2=%u, busy=%d\n", nbusy2, nbusy);
+		if (nbusy2 == 0 && nbusy == 0)
+			break;
+		if (nbusy_prev == 0)
+			nbusy_prev = nbusy;
+		printf("%d ", nbusy);
+		tsleep(&nbusy, PRIBIO, "bflush",
+		    (iter == 0) ? 1 : hz / 25 * iter);
+		if (nbusy >= nbusy_prev) /* we didn't flush anything */
+			iter++;
+		else
+			nbusy_prev = nbusy;
+	}
+	if (nbusy || nbusy2)
+		Debugger();
+	if (nbusy) {
+fail:
+#if defined(DEBUG) || defined(DEBUG_HALT_BUSY)
+		printf("giving up\nPrinting vnodes for busy buffers\n");
+#if 0 /* XXX */
+		for (bp = &buf[nbuf]; --bp >= buf; )
+			if ((bp->b_flags & (B_BUSY|B_INVAL|B_READ)) == B_BUSY)
+				vprint(NULL, bp->b_vp);
+#endif
+
+#if defined(DDB) && defined(DEBUG_HALT_BUSY)
+		Debugger();
+#endif
+
+#else  /* defined(DEBUG) || defined(DEBUG_HALT_BUSY) */
+		printf("giving up\n");
+#endif /* defined(DEBUG) || defined(DEBUG_HALT_BUSY) */
+		return 1;
+	}
+
+	return 0;
+}
+
+void
+bufcache_initparam()
+{
+	char pbufmin[9];
+	char pbufmax[9];
+
+	/*
+	 * Determine how many buffers to allocate.
+	 *
+	 *	- If bufcache is specified, use that % of memory
+	 *	  for the buffer cache.
+	 *
+	 *	- Otherwise, we default to the traditional BSD
+	 *	  formula of 10% of the first 2MB and 5% of
+	 *	  the remaining.
+	 */
+	if (nbufcachepage_min == 0) {
+		if (bufcache != 0) {
+			if (bufcache < 5 || bufcache > 95)
+				panic("bufcache is out of range (%d)",
+				    bufcache);
+			nbufcachepage_min = physmem / 100 * bufcache;
+		} else {
+			if (physmem < btoc(2 * 1024 * 1024))
+				nbufcachepage_min = physmem / 10;
+			else
+				nbufcachepage_min =
+				    (btoc(2 * 1024 * 1024) + physmem) / 20;
+		}
+	}
+
+#ifdef DIAGNOSTIC
+	if (nbufcachepage_min == 0)
+		panic("bufpages = 0");
+#endif
+
+#if 0 /* XXX XXX */
+	/*
+	 * Call the mdcallback now; it may need to adjust bufpages.
+	 */
+	if (mdcallback != NULL)
+		v = mdcallback(v);
+#endif
+
+	/* 
+	 * Ensure a minimum of 16 buffers.
+	 */
+	if (nbufcache_min == 0) {
+		nbufcache_min = nbufcachepage_min;
+		if (nbufcache_min < 16)
+			nbufcache_min = 16;
+	}
+
+	nbufcache_max = nbufcache_min * 2; /* XXX XXX */
+	nbufcachepage_max = nbufcachepage_min * 2; /* XXX XXX */
+
+	/* XXX */
+	if (bufcache_map_size == 0)
+		bufcache_map_size = MIN(nbufcache_min * MAXBSIZE,
+		    nbufcachepage_min * PAGE_SIZE);
+
+#ifdef VM_MAX_KERNEL_BUF
+	if (bufcache_map_size > VM_MAX_KERNEL_BUF) {
+		/* assuming VM_MAX_KERNEL_BUF is a reasonable value. */
+		bufcache_map_size = VM_MAX_KERNEL_BUF;
+		nbufcache_min =
+		    MIN(nbufcache_min, bufcache_map_size / MAXBSIZE);
+		nbufcachepage_min =
+		    MIN(nbufcachepage_min, bufcache_map_size / PAGE_SIZE);
+	}
+#endif
+
+	format_bytes(pbufmin, sizeof(pbufmin), nbufcachepage_min * PAGE_SIZE);
+	format_bytes(pbufmax, sizeof(pbufmax), nbufcachepage_max * PAGE_SIZE);
+	printf("using %d-%d buffers %s-%s of memory\n",
+	    nbufcache_min, nbufcache_max, pbufmin, pbufmax);
+	/*
+	 * XXX nbufcache*_min should be able to be small constants
+	 * but they can't for now because they are used by filesystems to
+	 * throttle...
+	 */
+	KDASSERT(nbufcache_min >= 16);
+	KDASSERT(nbufcachepage_min >= 16);
+	KDASSERT(nbufcache_max >= nbufcache_min);
+	KDASSERT(nbufcachepage_max >= nbufcachepage_min);
+}
+
+int
+bufcache_reclaim(int num)
+{
+	int error = 0;
+
+	while (num-- > 0) {
+		struct buf *bp;
+		int s;
+
+		while ((bp = getnewbuf(0, 0, FALSE)) == NULL)
+			;
+
+		error = bufcache_freepages(bp, 0);
+		if (error)
+			break;
+
+		KASSERT(bp->b_bufsize == 0);
+		bremhash(bp);
+		s = splbio();
+		pool_put(&bufpool, bp);
+		simple_lock(&bufcache_count_slock);
+		KASSERT(nbufcache > nbufcache_min);
+		nbufcache--;
+		simple_unlock(&bufcache_count_slock);
+		splx(s);
+	}
+
+	return error;
+}
+
+#ifdef DEBUG
+void
+bufcache_debugdump()
+{
+	struct buf *it;
+	int n, m;
+
+	printf("nbuf=%d, npage=%d, nfree=%d, kva=%d/%d\n",
+	    nbufcache, nbufcachepage, bufcache_countfree(),
+	    (int)bufcachemapper.bm_kva_used, (int)bufcache_map_size);
+
+	n = m = 0;
+	TAILQ_FOREACH(it, &bufqueues[BQ_LRU], b_freelist){
+		if (it->b_flags & B_VFLUSH)
+			n++;
+		if ((it->b_flags & B_MAPPED) &&
+		    !(it->b_flags & B_BUSY))
+			m++;
+	}
+	printf("LRU %d, %d\n", n, m);
+
+	n = m = 0;
+	TAILQ_FOREACH(it, &bufqueues[BQ_AGE], b_freelist){
+		if (it->b_flags & B_VFLUSH)
+			n++;
+		if ((it->b_flags & B_MAPPED) &&
+		    !(it->b_flags & B_BUSY))
+			m++;
+	}
+	printf("AGE %d, %d\n", n, m);
+}
+#endif
Index: kern/vfs_subr.c
===================================================================
--- kern/vfs_subr.c	(revision 283)
+++ kern/vfs_subr.c	(working copy)
@@ -2591,8 +2591,6 @@ vfs_unmountall(p)
 void
 vfs_shutdown()
 {
-	struct buf *bp;
-	int iter, nbusy, nbusy_prev = 0, dcount, s;
 	struct lwp *l = curlwp;
 	struct proc *p;
 
@@ -2611,62 +2609,7 @@ vfs_shutdown()
 
 	sys_sync(l, NULL, NULL);
 
-	/* Wait for sync to finish. */
-	dcount = 10000;
-	for (iter = 0; iter < 20;) {
-		nbusy = 0;
-		for (bp = &buf[nbuf]; --bp >= buf; ) {
-			if ((bp->b_flags & (B_BUSY|B_INVAL|B_READ)) == B_BUSY)
-				nbusy++;
-			/*
-			 * With soft updates, some buffers that are
-			 * written will be remarked as dirty until other
-			 * buffers are written.
-			 */
-			if (bp->b_vp && bp->b_vp->v_mount
-			    && (bp->b_vp->v_mount->mnt_flag & MNT_SOFTDEP)
-			    && (bp->b_flags & B_DELWRI)) {
-				s = splbio();
-				bremfree(bp);
-				bp->b_flags |= B_BUSY;
-				splx(s);
-				nbusy++;
-				bawrite(bp);
-				if (dcount-- <= 0) {
-					printf("softdep ");
-					goto fail;
-				}
-			}
-		}
-		if (nbusy == 0)
-			break;
-		if (nbusy_prev == 0)
-			nbusy_prev = nbusy;
-		printf("%d ", nbusy);
-		tsleep(&nbusy, PRIBIO, "bflush",
-		    (iter == 0) ? 1 : hz / 25 * iter);
-		if (nbusy >= nbusy_prev) /* we didn't flush anything */
-			iter++;
-		else
-			nbusy_prev = nbusy;
-	}
-	if (nbusy) {
-fail:
-#if defined(DEBUG) || defined(DEBUG_HALT_BUSY)
-		printf("giving up\nPrinting vnodes for busy buffers\n");
-		for (bp = &buf[nbuf]; --bp >= buf; )
-			if ((bp->b_flags & (B_BUSY|B_INVAL|B_READ)) == B_BUSY)
-				vprint(NULL, bp->b_vp);
-
-#if defined(DDB) && defined(DEBUG_HALT_BUSY)
-		Debugger();
-#endif
-
-#else  /* defined(DEBUG) || defined(DEBUG_HALT_BUSY) */
-		printf("giving up\n");
-#endif /* defined(DEBUG) || defined(DEBUG_HALT_BUSY) */
-		return;
-	} else
+	if (!bufcache_shutdown())
 		printf("done\n");
 
 	/*
@@ -2959,10 +2902,10 @@ set_statfs_info(const char *onp, int uko
 
 #ifdef DDB
 const char buf_flagbits[] =
-	"\20\1AGE\2NEEDCOMMIT\3ASYNC\4BAD\5BUSY\6SCANNED\7CALL\10DELWRI"
+	"\20\1AGE\3ASYNC\4BAD\5BUSY\6SCANNED\7CALL\10DELWRI"
 	"\11DIRTY\12DONE\13EINTR\14ERROR\15GATHERED\16INVAL\17LOCKED\20NOCACHE"
-	"\21ORDERED\22CACHE\23PHYS\24RAW\25READ\26TAPE\30WANTED"
-	"\32XXX\33VFLUSH";
+	"\22CACHE\23PHYS\24RAW\25READ\26TAPE\30WANTED"
+	"\32XXX\33VFLUSH\34PAGES\35MAPPED";
 
 void
 vfs_buf_print(bp, full, pr)
Index: uvm/uvm_iomap.h
===================================================================
--- uvm/uvm_iomap.h	(revision 198)
+++ uvm/uvm_iomap.h	(working copy)
@@ -55,7 +55,7 @@ void uvm_iomap_init(struct uvm_iomap *, 
 vaddr_t uvm_iomap_alloc(struct uvm_iomap *, vsize_t, int);
 void uvm_iomap_free(struct uvm_iomap *, vaddr_t, vsize_t);
 
-#define	UVMIOMAP_WAITOK	UVMPAGER_MAPIN_WAITOK
+#define	UVMIOMAP_WAITOK	0x01
 
 #define	uvm_iomap_pmap(iomap)	vm_map_pmap((iomap)->ui_map)
 
Index: uvm/uvm_glue.c
===================================================================
--- uvm/uvm_glue.c	(revision 196)
+++ uvm/uvm_glue.c	(working copy)
@@ -142,9 +142,11 @@ uvm_kernacc(addr, len, rw)
 	 * or worse, inconsistencies at the pmap level.  We only worry
 	 * about the buffer cache for now.
 	 */
+#if 0
 	if (!readbuffers && rv && (eaddr > (vaddr_t)buffers &&
 			     saddr < (vaddr_t)buffers + MAXBSIZE * nbuf))
 		rv = FALSE;
+#endif
 	return(rv);
 }
 
Index: arch/i386/i386/machdep.c
===================================================================
--- arch/i386/i386/machdep.c	(revision 281)
+++ arch/i386/i386/machdep.c	(working copy)
@@ -284,7 +284,6 @@ cpu_startup()
 	caddr_t v;
 	int sz, x;
 	vaddr_t minaddr, maxaddr;
-	vsize_t size;
 	char pbuf[9];
 
 	/*
@@ -325,37 +324,6 @@ cpu_startup()
 		panic("startup: table size inconsistency");
 
 	/*
-	 * Allocate virtual address space for the buffers.  The area
-	 * is not managed by the VM system.
-	 */
-	size = MAXBSIZE * nbuf;
-	if (uvm_map(kernel_map, (vaddr_t *)(void *) &buffers, round_page(size),
-		    NULL, UVM_UNKNOWN_OFFSET, 0,
-		    UVM_MAPFLAG(UVM_PROT_NONE, UVM_PROT_NONE, UVM_INH_NONE,
-				UVM_ADV_NORMAL, 0)) != 0)
-		panic("cpu_startup: cannot allocate VM for buffers");
-	minaddr = (vaddr_t)buffers;
-	if ((bufpages / nbuf) >= btoc(MAXBSIZE)) {
-		/* don't want to alloc more physical mem than needed */
-		bufpages = btoc(MAXBSIZE) * nbuf;
-	}
-
-	/*
-	 * XXX We defer allocation of physical pages for buffers until
-	 * XXX after autoconfiguration has run.  We must do this because
-	 * XXX on system with large amounts of memory or with large
-	 * XXX user-configured buffer caches, the buffer cache will eat
-	 * XXX up all of the lower 16M of RAM.  This prevents ISA DMA
-	 * XXX maps from allocating bounce pages.
-	 *
-	 * XXX Note that nothing can use buffer cache buffers until after
-	 * XXX autoconfiguration completes!!
-	 *
-	 * XXX This is a hack, and needs to be replaced with a better
-	 * XXX solution!  --thorpej@netbsd.org, December 6, 1997
-	 */
-
-	/*
 	 * Allocate a submap for exec arguments.  This map effectively
 	 * limits the number of processes exec'ing at any time.
 	 */
@@ -379,10 +347,8 @@ cpu_startup()
 	 * XXX we need to account for those pages when printing
 	 * XXX the amount of free memory.
 	 */
-	format_bytes(pbuf, sizeof(pbuf), ptoa(uvmexp.free - bufpages));
+	format_bytes(pbuf, sizeof(pbuf), ptoa(uvmexp.free/* - bufpages*/));
 	printf("avail memory = %s\n", pbuf);
-	format_bytes(pbuf, sizeof(pbuf), bufpages * PAGE_SIZE);
-	printf("using %d buffers containing %s of memory\n", nbuf, pbuf);
 
 	/* Safe for i/o port / memory space allocation to use malloc now. */
 	x86_bus_space_mallocok();
@@ -441,55 +407,6 @@ i386_init_pcb_tss_ldt(ci)
 }
 
 /*
- * XXX Finish up the deferred buffer cache allocation and initialization.
- */
-void
-i386_bufinit()
-{
-	int i, base, residual;
-
-	base = bufpages / nbuf;
-	residual = bufpages % nbuf;
-	for (i = 0; i < nbuf; i++) {
-		vsize_t curbufsize;
-		vaddr_t curbuf;
-		struct vm_page *pg;
-
-		/*
-		 * Each buffer has MAXBSIZE bytes of VM space allocated.  Of
-		 * that MAXBSIZE space, we allocate and map (base+1) pages
-		 * for the first "residual" buffers, and then we allocate
-		 * "base" pages for the rest.
-		 */
-		curbuf = (vaddr_t) buffers + (i * MAXBSIZE);
-		curbufsize = PAGE_SIZE * ((i < residual) ? (base+1) : base);
-
-		while (curbufsize) {
-			/*
-			 * Attempt to allocate buffers from the first
-			 * 16M of RAM to avoid bouncing file system
-			 * transfers.
-			 */
-			pg = uvm_pagealloc_strat(NULL, 0, NULL, 0,
-			    UVM_PGA_STRAT_FALLBACK, VM_FREELIST_FIRST16);
-			if (pg == NULL)
-				panic("cpu_startup: not enough memory for "
-				    "buffer cache");
-			pmap_kenter_pa(curbuf, VM_PAGE_TO_PHYS(pg),
-			    VM_PROT_READ|VM_PROT_WRITE);
-			curbuf += PAGE_SIZE;
-			curbufsize -= PAGE_SIZE;
-		}
-	}
-	pmap_update(pmap_kernel());
-
-	/*
-	 * Set up buffers, so they can be used to read disk labels.
-	 */
-	bufinit();
-}
-
-/*
  * machine dependent system variables.
  */
 int
Index: arch/i386/i386/autoconf.c
===================================================================
--- arch/i386/i386/autoconf.c	(revision 281)
+++ arch/i386/i386/autoconf.c	(working copy)
@@ -155,8 +155,10 @@ cpu_configure(void)
 	lapic_tpr = 0;
 #endif
 
-	/* XXX Finish deferred buffer cache allocation. */
-	i386_bufinit();
+	/*
+	 * Set up buffers, so they can be used to read disk labels.
+	 */
+	bufinit();
 }
 
 void
Index: sys/buf.h
===================================================================
--- sys/buf.h	(revision 284)
+++ sys/buf.h	(working copy)
@@ -76,10 +76,16 @@
 #ifndef _SYS_BUF_H_
 #define	_SYS_BUF_H_
 
+#include <sys/param.h>	/* for NULL */
 #include <sys/pool.h>
 #include <sys/queue.h>
 #include <sys/lock.h>
 
+#include <uvm/uvm_param.h>	/* for MIN_PAGE_SIZE */
+#include <machine/param.h>	/* for MAXPHYS */
+
+#include <lib/libkern/libkern.h>	/* for KASSERT */
+
 struct buf;
 struct mount;
 struct vnode;
@@ -126,6 +132,24 @@ void	bufq_free(struct bufq_state *);
 #define BUFQ_PEEK(bufq) \
 	(*(bufq)->bq_get)((bufq), 0)	/* Get buffer from queue */
 
+/*
+ * buffer mapper
+ *
+ * XXX it's intended to be put into each device/driver which
+ * XXX requires kernel-addressable buffers.
+ */
+struct bufmap {
+	struct uvm_iomap *bm_iomap;
+	void (*bm_mapin)(struct buf *);
+	void (*bm_mapout)(struct buf *);
+	void (*bm_reclaim)(struct bufmap *, vsize_t);
+	vsize_t bm_kva_used;
+};
+
+extern struct bufmap devbufmap;
+void genbuf_mapin(struct buf *);
+void genbuf_mapout(struct buf *);
+
 #endif /* _KERNEL */
 
 /*
@@ -153,7 +177,8 @@ struct buf {
 	LIST_ENTRY(buf) b_vnbufs;	/* Buffer's associated vnode. */
 	TAILQ_ENTRY(buf) b_freelist;	/* Free list position if not active. */
 	TAILQ_ENTRY(buf) b_actq;	/* Device driver queue when active. */
-	struct  proc *b_proc;		/* Associated proc if B_PHYS set. */
+	TAILQ_ENTRY(buf) b_mappedlist;	/* LRU entry for mapped buffers. */
+	struct proc *b_proc;		/* Associated proc if B_PHYS set. */
 	volatile long	b_flags;	/* B_* flags. */
 	struct simplelock b_interlock;	/* Lock for b_flags changes */
 	int	b_error;		/* Errno value. */
@@ -172,16 +197,21 @@ struct buf {
 					   number (not partition relative) */
 					/* Function to call upon completion. */
 	void	(*b_iodone) __P((struct buf *));
-	struct	vnode *b_vp;		/* File vnode. */
+	struct vnode *b_vp;		/* File vnode. */
 	void	*b_private;		/* Private data for owner */
 	off_t	b_dcookie;		/* Offset cookie if dir block */
-	struct  workhead b_dep;		/* List of filesystem dependencies. */
+	struct workhead b_dep;		/* List of filesystem dependencies. */
+#ifdef _KERNEL /* XXX */
+	struct bufmap *b_map;
+	struct vm_page *b_pages[MAXPHYS/MIN_PAGE_SIZE];
+#endif /* _KERNEL */
 };
 
 #define	BUF_INIT(bp)							\
 do {									\
 	LIST_INIT(&(bp)->b_dep);					\
 	simple_lock_init(&(bp)->b_interlock);				\
+	(bp)->b_map = NULL;						\
 } while (/*CONSTCOND*/0)
 
 /*
@@ -220,6 +250,12 @@ do {									\
 #define	B_WRITE		0x00000000	/* Write buffer (pseudo flag). */
 #define	B_XXX		0x02000000	/* Debugging flag. */
 #define	B_VFLUSH	0x04000000	/* Buffer is being synced. */
+#define	B_PAGES		0x08000000	/* b_pages is valid */
+#define	B_MAPPED	0x10000000	/* b_pages are addressable by b_data */
+
+/* test if a buffer is kernel-addressable */
+#define	BUF_IS_ADDRESSABLE(bp) \
+	(!((bp)->b_flags & B_PAGES) || ((bp)->b_flags & B_MAPPED))
 
 /*
  * This structure describes a clustered I/O.  It is stored in the b_saveaddr
@@ -251,10 +287,16 @@ do {									\
 #ifdef _KERNEL
 
 extern	struct bio_ops bioops;
-extern	u_int nbuf;		/* The number of buffer headers */
-extern	struct buf *buf;	/* The buffer headers. */
-extern	char *buffers;		/* The buffer contents. */
-extern	u_int bufpages;		/* Number of memory pages in the buffer pool. */
+
+/* The number of buffer headers */
+extern unsigned int nbufcache_min;
+extern unsigned int nbufcache_max;
+extern unsigned int nbufcache;
+
+/* Number of memory pages in the buffer pool. */
+extern unsigned int nbufcachepage_min;
+extern unsigned int nbufcachepage_max;
+extern unsigned int nbufcachepage;
 
 /*
  * Pool of I/O buffers.  Access to this pool must be protected with
@@ -285,7 +327,6 @@ int	cluster_read __P((struct vnode *, u_
 void	cluster_write __P((struct buf *, u_quad_t));
 struct buf *getblk __P((struct vnode *, daddr_t, int, int, int));
 struct buf *geteblk __P((int));
-struct buf *getnewbuf __P((int, int));
 struct buf *incore __P((struct vnode *, daddr_t));
 
 void	minphys __P((struct buf *));
@@ -295,9 +336,33 @@ int	physio __P((void (*)(struct buf *), 
 void  brelvp __P((struct buf *));
 void  reassignbuf __P((struct buf *, struct vnode *));
 void  bgetvp __P((struct vnode *, struct buf *));
+
+int   bufcache_shutdown __P((void));
+
+static inline void
+buf_mapin(struct buf *bp)
+{
+
+	KASSERT(bp->b_map != NULL);
+	KASSERT(bp->b_map->bm_mapin != NULL);
+
+	bp->b_map->bm_mapin(bp);
+	KASSERT(BUF_IS_ADDRESSABLE(bp));
+}
+
+static inline void
+buf_mapout(struct buf *bp)
+{
+
+	KASSERT(bp->b_map != NULL);
+
+	if (bp->b_map->bm_mapout)
+		bp->b_map->bm_mapout(bp);
+}
+
 #ifdef DDB
 void	vfs_buf_print __P((struct buf *, int, void (*)(const char *, ...)));
-#endif
+#endif /* DDB */
 __END_DECLS
-#endif
+#endif /* _KERNEL */
 #endif /* !_SYS_BUF_H_ */

--NextPart-20031120184112-0180700
Content-Type: Text/Plain; charset=us-ascii
Content-Disposition: attachment; filename="uvm.iomap.diff"

Index: uvm/uvm_iomap.h
===================================================================
--- uvm/uvm_iomap.h	(revision 0)
+++ uvm/uvm_iomap.h	(revision 198)
@@ -0,0 +1,62 @@
+/*	$NetBSD$	*/
+
+/*
+ *
+ * Copyright (c) 1997 Charles D. Cranor and Washington University.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed by Charles D. Cranor and
+ *      Washington University.
+ * 4. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * from: Id: uvm_pager.c,v 1.1.2.23 1998/02/02 20:38:06 chuck Exp
+ */
+
+#ifndef _UVM_UVM_IOMAP_H_
+#define _UVM_UVM_IOMAP_H_
+
+/*
+ * uvm_iomap.h
+ */
+
+struct vm_map;
+
+struct uvm_iomap {
+	struct vm_map *ui_map;
+	struct simplelock ui_wanted_lock;
+	boolean_t ui_wanted;
+	vaddr_t ui_emergva;
+	boolean_t ui_emerginuse;
+};
+
+void uvm_iomap_init(struct uvm_iomap *, vsize_t, vsize_t);
+vaddr_t uvm_iomap_alloc(struct uvm_iomap *, vsize_t, int);
+void uvm_iomap_free(struct uvm_iomap *, vaddr_t, vsize_t);
+
+#define	UVMIOMAP_WAITOK	UVMPAGER_MAPIN_WAITOK
+
+#define	uvm_iomap_pmap(iomap)	vm_map_pmap((iomap)->ui_map)
+
+#endif /* _UVM_UVM_IOMAP_H_ */
Index: uvm/uvm_map.c
===================================================================
--- uvm/uvm_map.c	(revision 197)
+++ uvm/uvm_map.c	(revision 198)
@@ -98,8 +98,6 @@ __KERNEL_RCSID(0, "$NetBSD: uvm_map.c,v 
 #include <uvm/uvm_ddb.h>
 #endif
 
-extern struct vm_map *pager_map;
-
 struct uvm_cnt map_ubackmerge, map_uforwmerge;
 struct uvm_cnt map_ubimerge, map_unomerge;
 struct uvm_cnt map_kbackmerge, map_kforwmerge;
@@ -578,14 +576,14 @@ uvm_map(map, startp, size, uobj, uoffset
 	}
 
 	/*
-	 * for pager_map, allocate the new entry first to avoid sleeping
+	 * for i/o map, allocate the new entry first to avoid sleeping
 	 * for memory while we have the map locked.
 	 */
 
 	new_entry = NULL;
-	if (map == pager_map) {
+	if (map->flags & VM_MAP_IOMAP) {
 		new_entry = uvm_mapent_alloc(map, (flags & UVM_FLAG_NOWAIT));
-		 if (__predict_false(new_entry == NULL))
+		if (__predict_false(new_entry == NULL))
 			return ENOMEM;
 	}
 
Index: uvm/uvm_map.h
===================================================================
--- uvm/uvm_map.h	(revision 197)
+++ uvm/uvm_map.h	(revision 198)
@@ -231,6 +231,7 @@ struct vm_map {
 #define	VM_MAP_WANTLOCK		0x10		/* rw: want to write-lock */
 #define	VM_MAP_DYING		0x20		/* rw: map is being destroyed */
 #define	VM_MAP_TOPDOWN		0x40		/* ro: arrange map top-down */
+#define	VM_MAP_IOMAP		0x80		/* ro: map for i/o */
 
 /* XXX: number of kernel maps and entries to statically allocate */
 
Index: uvm/uvm_pager.c
===================================================================
--- uvm/uvm_pager.c	(revision 197)
+++ uvm/uvm_pager.c	(revision 198)
@@ -52,6 +52,7 @@ __KERNEL_RCSID(0, "$NetBSD: uvm_pager.c,
 
 #define UVM_PAGER
 #include <uvm/uvm.h>
+#include <uvm/uvm_iomap.h>
 
 struct pool *uvm_aiobuf_pool;
 
@@ -70,11 +71,7 @@ struct uvm_pagerops * const uvmpagerops[
  * the pager map: provides KVA for I/O
  */
 
-struct vm_map *pager_map;		/* XXX */
-struct simplelock pager_map_wanted_lock;
-boolean_t pager_map_wanted;	/* locked by pager map */
-static vaddr_t emergva;
-static boolean_t emerginuse;
+struct uvm_iomap pager_kva;
 
 /*
  * uvm_pager_init: init pagers (at boot time)
@@ -84,19 +81,12 @@ void
 uvm_pager_init()
 {
 	u_int lcv;
-	vaddr_t sva, eva;
 
 	/*
 	 * init pager map
 	 */
 
-	sva = 0;
-	pager_map = uvm_km_suballoc(kernel_map, &sva, &eva, PAGER_MAP_SIZE, 0,
-	    FALSE, NULL);
-	simple_lock_init(&pager_map_wanted_lock);
-	pager_map_wanted = FALSE;
-	emergva = uvm_km_valloc(kernel_map, round_page(MAXPHYS));
-	emerginuse = FALSE;
+	uvm_iomap_init(&pager_kva, PAGER_MAP_SIZE, round_page(MAXPHYS));
 
 	/*
 	 * init ASYNC I/O queue
@@ -136,6 +126,10 @@ uvm_pagermapin(pps, npages, flags)
 
 	UVMHIST_LOG(maphist,"(pps=0x%x, npages=%d)", pps, npages,0,0);
 
+	size = npages << PAGE_SHIFT;
+	kva = uvm_iomap_alloc(&pager_kva, size,
+	    (flags & UVMPAGER_MAPIN_WAITOK) ? UVMIOMAP_WAITOK : 0);
+
 	/*
 	 * compute protection.  outgoing I/O only needs read
 	 * access to the page, whereas incoming needs read/write.
@@ -145,48 +139,13 @@ uvm_pagermapin(pps, npages, flags)
 	if (flags & UVMPAGER_MAPIN_READ)
 		prot |= VM_PROT_WRITE;
 
-ReStart:
-	size = npages << PAGE_SHIFT;
-	kva = 0;			/* let system choose VA */
-
-	if (uvm_map(pager_map, &kva, size, NULL,
-	      UVM_UNKNOWN_OFFSET, 0, UVM_FLAG_NOMERGE) != 0) {
-		if (curproc == uvm.pagedaemon_proc) {
-			simple_lock(&pager_map_wanted_lock);
-			if (emerginuse) {
-				UVM_UNLOCK_AND_WAIT(&emergva,
-				    &pager_map_wanted_lock, FALSE,
-				    "emergva", 0);
-				goto ReStart;
-			}
-			emerginuse = TRUE;
-			simple_unlock(&pager_map_wanted_lock);
-			kva = emergva;
-			/* The shift implicitly truncates to PAGE_SIZE */
-			KASSERT(npages <= (MAXPHYS >> PAGE_SHIFT));
-			goto enter;
-		}
-		if ((flags & UVMPAGER_MAPIN_WAITOK) == 0) {
-			UVMHIST_LOG(maphist,"<- NOWAIT failed", 0,0,0,0);
-			return(0);
-		}
-		simple_lock(&pager_map_wanted_lock);
-		pager_map_wanted = TRUE;
-		UVMHIST_LOG(maphist, "  SLEEPING on pager_map",0,0,0,0);
-		UVM_UNLOCK_AND_WAIT(pager_map, &pager_map_wanted_lock, FALSE,
-		    "pager_map", 0);
-		goto ReStart;
-	}
-
-enter:
-	/* got it */
 	for (cva = kva ; size != 0 ; size -= PAGE_SIZE, cva += PAGE_SIZE) {
 		pp = *pps++;
 		KASSERT(pp);
 		KASSERT(pp->flags & PG_BUSY);
 		pmap_kenter_pa(cva, VM_PAGE_TO_PHYS(pp), prot);
 	}
-	pmap_update(vm_map_pmap(pager_map));
+	pmap_update(uvm_iomap_pmap(&pager_kva));
 
 	UVMHIST_LOG(maphist, "<- done (KVA=0x%x)", kva,0,0,0);
 	return(kva);
@@ -205,36 +164,11 @@ uvm_pagermapout(kva, npages)
 	int npages;
 {
 	vsize_t size = npages << PAGE_SHIFT;
-	struct vm_map_entry *entries;
 	UVMHIST_FUNC("uvm_pagermapout"); UVMHIST_CALLED(maphist);
 
 	UVMHIST_LOG(maphist, " (kva=0x%x, npages=%d)", kva, npages,0,0);
-
-	/*
-	 * duplicate uvm_unmap, but add in pager_map_wanted handling.
-	 */
-
-	pmap_kremove(kva, npages << PAGE_SHIFT);
-	if (kva == emergva) {
-		simple_lock(&pager_map_wanted_lock);
-		emerginuse = FALSE;
-		wakeup(&emergva);
-		simple_unlock(&pager_map_wanted_lock);
-		return;
-	}
-
-	vm_map_lock(pager_map);
-	uvm_unmap_remove(pager_map, kva, kva + size, &entries);
-	simple_lock(&pager_map_wanted_lock);
-	if (pager_map_wanted) {
-		pager_map_wanted = FALSE;
-		wakeup(pager_map);
-	}
-	simple_unlock(&pager_map_wanted_lock);
-	vm_map_unlock(pager_map);
-	if (entries)
-		uvm_unmap_detach(entries, 0);
-	pmap_update(pmap_kernel());
+	pmap_kremove(kva, size);
+	uvm_iomap_free(&pager_kva, kva, size);
 	UVMHIST_LOG(maphist,"<- done",0,0,0,0);
 }
 
Index: uvm/uvm_iomap.c
===================================================================
--- uvm/uvm_iomap.c	(revision 0)
+++ uvm/uvm_iomap.c	(revision 198)
@@ -0,0 +1,138 @@
+/*	$NetBSD$	*/
+
+/*
+ *
+ * Copyright (c) 1997 Charles D. Cranor and Washington University.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed by Charles D. Cranor and
+ *      Washington University.
+ * 4. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * from: Id: uvm_pager.c,v 1.1.2.23 1998/02/02 20:38:06 chuck Exp
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD$");
+
+#include "opt_uvmhist.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/malloc.h>
+#include <sys/pool.h>
+#include <sys/vnode.h>
+
+#include <uvm/uvm.h>
+#include <uvm/uvm_iomap.h>
+
+void
+uvm_iomap_init(struct uvm_iomap *kp, vsize_t size, vsize_t emergsize)
+{
+	vaddr_t sva, eva;
+
+	sva = 0;
+	kp->ui_map = uvm_km_suballoc(kernel_map, &sva, &eva, size, 0, FALSE,
+	    NULL);
+	kp->ui_map->flags |= VM_MAP_IOMAP;
+	simple_lock_init(&kp->ui_wanted_lock);
+	kp->ui_wanted = FALSE;
+	kp->ui_emergva = uvm_km_valloc(kernel_map, emergsize);
+	kp->ui_emerginuse = FALSE;
+}
+
+vaddr_t
+uvm_iomap_alloc(struct uvm_iomap *kp, vsize_t size, int flags)
+{
+	vaddr_t kva;
+	UVMHIST_FUNC("uvm_iomap_alloc"); UVMHIST_CALLED(maphist);
+
+ReStart:
+	kva = 0;			/* let system choose VA */
+
+	if (uvm_map(kp->ui_map, &kva, size, NULL,
+	      UVM_UNKNOWN_OFFSET, 0, UVM_FLAG_NOMERGE) != 0) {
+		if (curproc == uvm.pagedaemon_proc) {
+			simple_lock(&kp->ui_wanted_lock);
+			if (kp->ui_emerginuse) {
+				UVM_UNLOCK_AND_WAIT(&kp->ui_emergva,
+				    &kp->ui_wanted_lock, FALSE,
+				    "emergva", 0);
+				goto ReStart;
+			}
+			kp->ui_emerginuse = TRUE;
+			simple_unlock(&kp->ui_wanted_lock);
+			kva = kp->ui_emergva;
+			goto gotit;
+		}
+		if ((flags & UVMIOMAP_WAITOK) == 0) {
+			UVMHIST_LOG(maphist,"<- NOWAIT failed", 0,0,0,0);
+			return 0;
+		}
+		simple_lock(&kp->ui_wanted_lock);
+		kp->ui_wanted = TRUE;
+		UVMHIST_LOG(maphist, "  SLEEPING on iomap",0,0,0,0);
+		UVM_UNLOCK_AND_WAIT(kp->ui_map, &kp->ui_wanted_lock, FALSE,
+		    "iomap", 0);
+		goto ReStart;
+	}
+
+gotit:
+	UVMHIST_LOG(maphist, "<- done (KVA=0x%x)", kva,0,0,0);
+	return kva;
+}
+
+void
+uvm_iomap_free(struct uvm_iomap *kp, vaddr_t kva, vsize_t size)
+{
+	struct vm_map_entry *entries;
+
+	/*
+	 * duplicate uvm_unmap, but add in ui_wanted handling.
+	 */
+
+	if (kva == kp->ui_emergva) {
+		simple_lock(&kp->ui_wanted_lock);
+		kp->ui_emerginuse = FALSE;
+		wakeup(&kp->ui_emergva);
+		simple_unlock(&kp->ui_wanted_lock);
+		return;
+	}
+
+	vm_map_lock(kp->ui_map);
+	uvm_unmap_remove(kp->ui_map, kva, kva + size, &entries);
+	simple_lock(&kp->ui_wanted_lock);
+	if (kp->ui_wanted) {
+		kp->ui_wanted = FALSE;
+		wakeup(kp->ui_map);
+	}
+	simple_unlock(&kp->ui_wanted_lock);
+	vm_map_unlock(kp->ui_map);
+	if (entries)
+		uvm_unmap_detach(entries, 0);
+	pmap_update(pmap_kernel()); /* needed? */
+}
+

--NextPart-20031120184112-0180700
Content-Type: Text/Plain; charset=us-ascii
Content-Disposition: attachment; filename="vfs_bufmap.c"

/* $NetBSD$ */

/*
 * XXX
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/proc.h>

#include <uvm/uvm.h>
#include <uvm/uvm_iomap.h>

void devbufmap_init(void);
void bufmap_update(struct bufmap *);

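/*
 * devbufmap: the bufmap used for device-level i/o; its kva comes from
 * the deviomap submap set up in devbufmap_init() below.  devbufcnt
 * counts mapins done through devbufmap.
 */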
struct uvm_iomap deviomap;
struct bufmap devbufmap;
int devbufcnt;

vsize_t deviomap_size = MAXPHYS * 128; /* XXX */

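/*
 * devbufmap_init: initialize devbufmap with the generic mapin/mapout
 * routines and create its backing uvm_iomap.
 */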
void
devbufmap_init()
{

	memset(&devbufmap, 0, sizeof(devbufmap));
	devbufmap.bm_mapin = genbuf_mapin;
	devbufmap.bm_mapout = genbuf_mapout;
	devbufmap.bm_iomap = &deviomap;

	uvm_iomap_init(&deviomap, deviomap_size, round_page(MAXPHYS));
}

/*
 * map a buffer.
 * - allocate kva (b_data) and map the pages.
 * - on kva shortage, unmap free buffers if a bm_reclaim callback is specified.
 */
void
genbuf_mapin(struct buf *bp)
{
	vaddr_t kva, eva;
	vsize_t size;
	struct vm_page **pgpp;
	struct bufmap *bmap = bp->b_map;
	struct uvm_iomap *iomap = bmap->bm_iomap;
	int iomapflags;

	KASSERT(bp->b_flags & B_PAGES);
	KASSERT(bp->b_map != NULL);
	KASSERT(bp->b_flags & B_BUSY);
	KASSERT(!(bp->b_flags & B_MAPPED));
	KASSERT((bp->b_bufsize & PAGE_MASK) == 0);
	KASSERT(bp->b_data == NULL);
	KASSERT(bp->b_bufsize > 0);
	LOCK_ASSERT(!simple_lock_held(&bp->b_interlock));

	if (bp->b_map == &devbufmap)
		devbufcnt++;

	size = bp->b_bufsize;
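	/*
	 * sleep for kva only when there is no reclaim callback;
	 * otherwise fail the allocation and let the loop below unmap
	 * idle buffers and retry.
	 */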
	if (bmap->bm_reclaim) {
		iomapflags = 0;
	} else {
		iomapflags = UVMIOMAP_WAITOK;
	}
	while ((kva = uvm_iomap_alloc(iomap, size, iomapflags)) == 0) {
		/*
		 * kva shortage.
		 * try to unmap free buffers.
		 *
		 * XXX need to consider kva fragmentation
		 */

		/*
		 * uvm_iomap_alloc shouldn't fail for the pagedaemon.
		 */
		KASSERT(curproc != uvm.pagedaemon_proc);

		/*
		 * we can deadlock if the buffer we hold is on the kva lru list.
		 */
		KASSERT(bp->b_mappedlist.tqe_prev == NULL);
		KASSERT(bmap->bm_reclaim != NULL);
		bmap->bm_reclaim(bmap, size);
	}

	bp->b_data = (void *)kva;
	eva = kva + size;
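	/* enter each of the buffer's pages at its spot in the new kva */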
	for (pgpp = bp->b_pages; kva < eva; pgpp++, kva += PAGE_SIZE) {
		const vm_prot_t prot = VM_PROT_READ | VM_PROT_WRITE;
		struct vm_page *pg = *pgpp;

		KASSERT(pg);
		pmap_kenter_pa(kva, VM_PAGE_TO_PHYS(pg), prot);
	}

	bmap->bm_kva_used += size; /* XXX MP */

	bufmap_update(bmap);
}

/*
 * unmap a buffer.
 */
void
genbuf_mapout(struct buf *bp)
{
	vaddr_t kva;
	vsize_t size;
	struct bufmap *bmap = bp->b_map;

	LOCK_ASSERT(!simple_lock_held(&bp->b_interlock));

	KASSERT(bp->b_flags & B_BUSY);
	KASSERT(bp->b_flags & B_PAGES);
	KASSERT(bp->b_flags & B_MAPPED);
	KASSERT(bp->b_bufsize > 0);
	KASSERT(bp->b_mappedlist.tqe_prev == NULL);
	KASSERT(bp->b_map != NULL);

	kva = (vaddr_t)bp->b_data;
	size = round_page(bp->b_bufsize);
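	/* remove the mappings and return the kva to the backing iomap */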
	pmap_kremove(kva, size);
	uvm_iomap_free(bmap->bm_iomap, kva, size);

	bp->b_flags &= ~B_MAPPED;
#ifdef DIAGNOSTIC
	bp->b_data = NULL;
#endif

	KASSERT(bmap->bm_kva_used >= size); /* XXX MP */
	bmap->bm_kva_used -= size; /* XXX MP */
}

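/*
 * bufmap_update: flush any deferred pmap_kenter_pa() mappings for the
 * kva arena backing this bufmap.
 */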
void
bufmap_update(struct bufmap *bm)
{

	pmap_update(uvm_iomap_pmap(bm->bm_iomap));
}
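
/*
 * Usage sketch (illustrative only, not part of the patch): another
 * subsystem wanting its own kva arena could set up a private bufmap
 * much like devbufmap above.  The names myiomap/mybufmap and the
 * arena size here are assumptions made for the example.
 *
 *	static struct uvm_iomap myiomap;
 *	static struct bufmap mybufmap;
 *
 *	memset(&mybufmap, 0, sizeof(mybufmap));
 *	mybufmap.bm_mapin = genbuf_mapin;
 *	mybufmap.bm_mapout = genbuf_mapout;
 *	mybufmap.bm_iomap = &myiomap;
 *	uvm_iomap_init(&myiomap, MAXPHYS * 16, round_page(MAXPHYS));
 *
 * A buffer using this map would have bp->b_map set to &mybufmap before
 * its bm_mapin routine is called.
 */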


--NextPart-20031120184112-0180700--