Subject: buffer cache memory management revision
To: None <tech-kern@netbsd.org>
From: Paul Kranenburg <pk@cs.few.eur.nl>
List: tech-kern
Date: 11/20/2003 09:44:22
Since the introduction of UBC, the tasks left to the original buffer
cache system are much reduced and have also changed in pattern. The
buffer sizes requested are now mostly determined by the block size
parameters of the file systems using the cache for metadata; on my ffsv1
system, the sizes range from 1K to 8K.

Therefore, I propose to revisit the age-old memory management still
employed by the buffer cache. In particular, I'd like to get rid of
the MAXBSIZE reservation of virtual memory per buffer, which is sparsely
mapped by a privately managed pool of physical pages. Currently, this
scheme stresses MMU resources on some platforms, such as sun4 & sun4c.
It also wastes a large amount of kernel VM space on machines with lots of
physical memory when the default buffer cache parameters are in use.
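To put rough numbers on that waste (a sketch only; the actual reservation
happens in each port's machine-dependent startup code and the constants
vary per port):

	/*
	 * Today each of the nbuf buffer headers owns a MAXBSIZE-sized
	 * window of kernel VA, into which only `bufpages' physical pages
	 * are mapped in total.  With MAXBSIZE at 64K, a few thousand
	 * buffers tie up hundreds of megabytes of KVA.
	 */
	vsize_t kva_reserved = (vsize_t)nbuf * MAXBSIZE;
	vsize_t phys_mapped  = (vsize_t)bufpages << PAGE_SHIFT;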

Initially, I made some modifications to drop the VM reservation but still
used pagemove() to manage a fixed pool of dedicated physical buffer pages.
It turns out, however, that there is no real benefit in holding on to that
scheme, since it is very rare for a buffer to be resized while it holds
valid data; i.e. I've never observed an `incore' buffer that needed its
buffer size adjusted.  Note: this observation is based on ffsv1 file system
clients only.

So, I decided to simply do something like realloc() whenever a buffer
needs to be resized, with the existing buffer data copied only when the
caller requests it. The fixed pool of physical buffer pages is gone and
the MAXBSIZE limit is lifted.
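In outline, the resize path then becomes (a condensed sketch of the
allocbuf() change in the patch below, folding its grow and shrink cases
into one):

	/* Replace the buffer's memory wholesale, a la realloc(). */
	va = uvm_km_alloc(kernel_map, desired_size);
	if (preserve)
		memcpy((void *)va, bp->b_data, MIN(oldsize, desired_size));
	uvm_km_free(kernel_map, (vaddr_t)bp->b_data, oldsize);
	bp->b_data = (caddr_t)va;
	bp->b_bufsize = desired_size;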

Other things to consider: use the already existing `bufpool' for buffer
allocation, possibly inserting a pool cache on top of it, and then drop
the EMPTY & AGE queues, roughly as sketched below.
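For the buffer headers themselves that could look something like this (a
sketch against the existing pool_cache(9) interface; `buf_cache' is just
a name picked for illustration):

	static struct pool_cache buf_cache;

	/* In bufinit(): layer a cache on top of the existing bufpool. */
	pool_cache_init(&buf_cache, &bufpool, NULL, NULL, NULL);

	/* Buffer headers then come and go through the cache ... */
	bp = pool_cache_get(&buf_cache, PR_WAITOK);
	/* ... and return to it instead of sitting on BQ_EMPTY. */
	pool_cache_put(&buf_cache, bp);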

I've included a skeleton patch to illustrate this proposal.

Comments?

-pk



----
Index: vfs_bio.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vfs_bio.c,v
retrieving revision 1.97
diff -c -r1.97 vfs_bio.c
*** vfs_bio.c	8 Nov 2003 04:22:35 -0000	1.97
--- vfs_bio.c	20 Nov 2003 08:46:46 -0000
***************
*** 184,189 ****
--- 184,190 ----
  /*
   * Initialize buffers and hash links for buffers.
   */
+ #ifndef __NEW_BIO__
  void
  bufinit()
  {
***************
*** 220,225 ****
--- 221,263 ----
  		binshash(bp, &invalhash);
  	}
  }
+ #else /*__NEW_BIO__*/
+ int bufpages_hiwater;
+ int bufpages_lowater;
+ int nbufpages;
+ void
+ bufinit()
+ {
+ 	struct buf *bp;
+ 	struct bqueues *dp;
+ 	u_int i;
+ 
+ 	/*
+ 	 * Initialize the buffer pool.  This pool is used for buffers
+ 	 * which are strictly I/O control blocks, not buffer cache
+ 	 * buffers.
+ 	 */
+ 	pool_init(&bufpool, sizeof(struct buf), 0, 0, 0, "bufpl", NULL);
+ 
+ 	for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
+ 		TAILQ_INIT(dp);
+ 	bufhashtbl = hashinit(nbuf, HASH_LIST, M_CACHE, M_WAITOK, &bufhash);
+ 	for (i = 0; i < nbuf; i++) {
+ 		bp = &buf[i];
+ 		memset((char *)bp, 0, sizeof(*bp));
+ 		BUF_INIT(bp);
+ 		bp->b_dev = NODEV;
+ 		bp->b_vnbufs.le_next = NOLIST;
+ 		bp->b_flags = B_INVAL;
+ 		binsheadfree(bp, &bufqueues[BQ_EMPTY]);
+ 		binshash(bp, &invalhash);
+ 	}
+ 
+ 	bufpages_hiwater = bufpages + (bufpages >> 3);
+ 	bufpages_lowater = bufpages - (bufpages >> 3);
+ 	nbufpages = 0;
+ }
+ #endif /*__NEW_BIO__*/
  
  static __inline struct buf *
  bio_doread(vp, blkno, size, cred, async)
***************
*** 682,687 ****
--- 720,726 ----
  {
  	struct buf *bp;
  	int s, err;
+ 	int preserve;
  
  start:
  	s = splbio();
***************
*** 711,716 ****
--- 750,758 ----
  #endif
  		SET(bp->b_flags, B_BUSY);
  		bremfree(bp);
+ #ifdef __NEW_BIO__
+ 		preserve = 1;
+ #endif
  	} else {
  		if ((bp = getnewbuf(slpflag, slptimeo)) == NULL) {
  			simple_unlock(&bqueue_slock);
***************
*** 721,726 ****
--- 763,769 ----
  		binshash(bp, BUFHASH(vp, blkno));
  		bp->b_blkno = bp->b_lblkno = bp->b_rawblkno = blkno;
  		bgetvp(vp, bp);
+ 		preserve = 0;
  	}
  	simple_unlock(&bp->b_interlock);
  	simple_unlock(&bqueue_slock);
***************
*** 732,738 ****
  	if (ISSET(bp->b_flags, B_LOCKED)) {
  		KASSERT(bp->b_bufsize >= size);
  	} else {
! 		allocbuf(bp, size);
  	}
  	return (bp);
  }
--- 775,781 ----
  	if (ISSET(bp->b_flags, B_LOCKED)) {
  		KASSERT(bp->b_bufsize >= size);
  	} else {
! 		allocbuf(bp, size, preserve);
  	}
  	return (bp);
  }
***************
*** 757,763 ****
  	simple_unlock(&bqueue_slock);
  	simple_unlock(&bp->b_interlock);
  	splx(s);
! 	allocbuf(bp, size);
  	return (bp);
  }
  
--- 800,806 ----
  	simple_unlock(&bqueue_slock);
  	simple_unlock(&bp->b_interlock);
  	splx(s);
! 	allocbuf(bp, size, 0);
  	return (bp);
  }
  
***************
*** 769,774 ****
--- 812,818 ----
   * start a write.  If the buffer grows, it's the callers
   * responsibility to fill out the buffer's additional contents.
   */
+ #ifndef __NEW_BIO__
  void
  allocbuf(bp, size)
  	struct buf *bp;
***************
*** 859,864 ****
--- 903,991 ----
  out:
  	bp->b_bcount = size;
  }
+ #else
+ void
+ allocbuf(bp, size, preserve)
+ 	struct buf *bp;
+ 	int size;
+ 	int preserve;
+ {
+ 	vsize_t oldsize, desired_size;
+ 	vaddr_t va;
+ 	int s, delta;
+ 
+ 	desired_size = round_page((vsize_t)size);
+ 	if (desired_size > MAXBSIZE)
+ 		printf("allocbuf: buffer larger than MAXBSIZE requested\n");
+ 
+ 	bp->b_bcount = size;
+ 
+ 	oldsize = bp->b_bufsize;
+ 	if (oldsize == desired_size)
+ 		return;
+ 
+ 	/*
+ 	 * If the buffer is smaller than the desired size, re-allocate
+ 	 * the buffer's memory. Copy old content only if needed.
+ 	 */
+ 	if (oldsize < desired_size) {
+ 		va = uvm_km_alloc(kernel_map, desired_size);
+ 		if (preserve)
+ 			memcpy((void *)va, bp->b_data, oldsize);
+ 		uvm_km_free(kernel_map, (vaddr_t)bp->b_data, oldsize);
+ 		bp->b_data = (caddr_t)va;
+ 		bp->b_bufsize = desired_size;
+ 	}
+ 
+ 	/*
+ 	 * If we want a buffer smaller than the current size,
+ 	 * shrink this buffer.  Re-allocate the buffer's memory;
+ 	 * copy old content only if needed.
+ 	 */
+ 	if (oldsize > desired_size) {
+ 		va = uvm_km_alloc(kernel_map, desired_size);
+ 		if (preserve)
+ 			memcpy((void *)va, bp->b_data, desired_size);
+ 		uvm_km_free(kernel_map, (vaddr_t)bp->b_data, oldsize);
+ 		bp->b_data = (caddr_t)va;
+ 		bp->b_bufsize = desired_size;
+ 	}
+ 
+ 	delta = ((long)desired_size - (long)oldsize) >> PAGE_SHIFT;
+ 
+ 	s = splbio();
+ 	simple_lock(&bqueue_slock);
+ 	if ((nbufpages += delta) < bufpages_hiwater)
+ 		goto out;
+ 
+ 	/*
+ 	 * Need to trim overall memory usage.
+ 	 * XXX - relies on getnewbuf() not returning empty buffers
+ 	 *	 while we're above the low water mark.
+ 	 */
+ 	while (nbufpages > bufpages_lowater) {
+ 		vsize_t freedsize;	/* don't shadow the `size' parameter */
+ 		if ((bp = getnewbuf(PCATCH, 1)) == NULL)
+ 			break;
+ 		SET(bp->b_flags, B_INVAL);
+ 		binshash(bp, &invalhash);
+ 		simple_unlock(&bp->b_interlock);
+ 		simple_unlock(&bqueue_slock);
+ 		freedsize = bp->b_bufsize;
+ 		if (freedsize > 0) {
+ 			uvm_km_free(kernel_map, (vaddr_t)bp->b_data, freedsize);
+ 			bp->b_bcount = bp->b_bufsize = 0;
+ 		}
+ 		brelse(bp);
+ 		simple_lock(&bqueue_slock);
+ 		nbufpages -= (freedsize >> PAGE_SHIFT);
+ 	}
+ 
+ out:
+ 	simple_unlock(&bqueue_slock);
+ 	splx(s);
+ }
+ #endif
  
  /*
   * Find a buffer which is available for use.
***************
*** 878,883 ****
--- 1005,1014 ----
  	LOCK_ASSERT(simple_lock_held(&bqueue_slock));
  
  	if ((bp = TAILQ_FIRST(&bufqueues[BQ_AGE])) != NULL ||
+ #ifdef __NEW_BIO__
+ 	    ((bp = TAILQ_FIRST(&bufqueues[BQ_EMPTY])) != NULL &&
+ 		nbufpages <= bufpages_lowater) ||
+ #endif
  	    (bp = TAILQ_FIRST(&bufqueues[BQ_LRU])) != NULL) {
  		simple_lock(&bp->b_interlock);
  		bremfree(bp);