Subject: kern/6332: bug in buffer cache?
To: None <gnats-bugs@gnats.netbsd.org>
From: Alasdair Baird <alasdair@wildcat.demon.co.uk>
List: netbsd-bugs
Date: 10/19/1998 22:32:31
>Number:         6332
>Category:       kern
>Synopsis:       B_CACHE not correctly maintained
>Confidential:   no
>Severity:       serious
>Priority:       medium
>Responsible:    kern-bug-people (Kernel Bug People)
>State:          open
>Class:          sw-bug
>Submitter-Id:   net
>Arrival-Date:   Mon Oct 19 14:35:00 1998
>Last-Modified:
>Originator:     Alasdair Baird
>Organization:
Absolutely none whatsoever.
>Release:        -current
>Environment:
System: NetBSD wildcat.demon.co.uk 1.3H NetBSD 1.3H (WILDCAT) #223: Mon Oct 19 20:59:46 BST 1998 root@wildcat.demon.co.uk:/usr/src/sys/arch/i386/compile/WILDCAT i386


>Description:

I have for a while tried to run NetBSD in a large buffer cache
configuration (~50MB) without huge success.  It boots, it runs,
but typically during some repeated untarring of large archives (>10MB)
with lots of other file system activity the tar will die claiming the
archive being read from is corrupted.  This problem can be made to go
away by purging the cache by either a) rebooting or b) running a whole
load of other data through it; after such action the archive is found
to still be intact and perfectly useable.  As I haven't had time to
look at this, I worked around this annoying situation by running with
a substantially reduced buffer cache (~8MB) which has given me no bother
and plenty of time to nap.

Judging by the charateristics of the problem, namely it is a corruption
of in-situ data and is dependant upon the speed with which cache entries
get recycled, I went off browsing through the read code path and noticed
very quickly that the B_CACHE flag seems to be poorly maintained.

The B_CACHE flag is used in vfs_cluster.c/cluster_read() to determine
whether a block is in the cache---in fact there is only one place where
it is read.  The flag is set in vfs_bio.c/getblk() whenever a "good"
buffer header is found, irrespective of whether the header has sufficient
data associated with it to fulfil the request.  This made me wonder what
would happen if said header had had some of its data ripped off it by
vfs_bio.c/allocbuf() before it was rediscovered in the cache.  There
also is the possibility that the flag is totally bogus as once it is set
it is never cleared until vfs_bio.c/getnewbuf() recycles it.  Maybe a
slight problem, I though.

Rather than rip out the B_CACHE flag I decided to make its use a bit
more consistant; I made vfs_bio.c/getblk() only set it if the b_bcount
value indicated there was enough data in the buffer, I made vfs_bio.c/brelse()
remove it, and I made vfs_bio.c/*read() use it to determine whether to
wait for IO completion.  Out of paranoia I also made sure B_DONE was cleared
before starting a read.

With these changes my problem with a large buffer cache seem to have gone
away and all still seems to work.

The diffs to vfs_bio.c follow: I'd be interested to hear if anyone else
has seen this problem or if this patch seems to be of any use to cure it.


   Alasdair.



>How-To-Repeat:

Run with a big buffer cache and thrash it a bit.

>Fix:


*** /usr/src/sys/kern/vfs_bio.c	Thu Aug 13 12:11:36 1998
--- /usr/SRC/sys/kern/vfs_bio.c	Mon Oct 19 20:59:02 1998
*************** bio_doread(vp, blkno, size, cred, async)
*** 179,187 ****
  	 * Note that if buffer is B_INVAL, getblk() won't return it.
  	 * Therefore, it's valid if it's I/O has completed or been delayed.
  	 */
! 	if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
  		/* Start I/O for the buffer (keeping credentials). */
  		SET(bp->b_flags, B_READ | async);
  		if (cred != NOCRED && bp->b_rcred == NOCRED) {
  			crhold(cred);
  			bp->b_rcred = cred;
--- 179,188 ----
  	 * Note that if buffer is B_INVAL, getblk() won't return it.
  	 * Therefore, it's valid if it's I/O has completed or been delayed.
  	 */
! 	if (!ISSET(bp->b_flags, B_CACHE)) {
  		/* Start I/O for the buffer (keeping credentials). */
  		SET(bp->b_flags, B_READ | async);
+ 		CLR(bp->b_flags, B_DONE);
  		if (cred != NOCRED && bp->b_rcred == NOCRED) {
  			crhold(cred);
  			bp->b_rcred = cred;
*************** bread(vp, blkno, size, cred, bpp)
*** 215,225 ****
  	bp = *bpp = bio_doread(vp, blkno, size, cred, 0);
  
  	/*
! 	 * Delayed write buffers are found in the cache and have
! 	 * valid contents. Also, B_ERROR is not set, otherwise
! 	 * getblk() would not have returned them.
  	 */
! 	if (ISSET(bp->b_flags, B_DELWRI))
  		return (0);
  
  	/*
--- 216,224 ----
  	bp = *bpp = bio_doread(vp, blkno, size, cred, 0);
  
  	/*
! 	 * No need to wait if data was found in cache.	
  	 */
! 	if (ISSET(bp->b_flags, B_CACHE))
  		return (0);
  
  	/*
*************** breadn(vp, blkno, size, rablks, rasizes,
*** 260,271 ****
  	}
  
  	/*
! 	 * Delayed write buffers are found in the cache and have
! 	 * valid contents. Also, B_ERROR is not set, otherwise
! 	 * getblk() would not have returned them.
  	 */
! 	if (ISSET(bp->b_flags, B_DELWRI))
! 		SET(bp->b_flags, B_DONE);
  
  	/*
  	 * Otherwise, we had to start a read for it; wait until
--- 259,268 ----
  	}
  
  	/*
! 	 * No need to wait if data was found in cache.	
  	 */
! 	if (ISSET(bp->b_flags, B_CACHE))
! 		return (0);
  
  	/*
  	 * Otherwise, we had to start a read for it; wait until
*************** brelse(bp)
*** 439,445 ****
  
  	/* Wake up any proceeses waiting for _this_ buffer to become free. */
  	if (ISSET(bp->b_flags, B_WANTED)) {
! 		CLR(bp->b_flags, B_WANTED);
  		wakeup(bp);
  	}
  
--- 436,442 ----
  
  	/* Wake up any proceeses waiting for _this_ buffer to become free. */
  	if (ISSET(bp->b_flags, B_WANTED)) {
! 		CLR(bp->b_flags, (B_WANTED|B_AGE));
  		wakeup(bp);
  	}
  
*************** brelse(bp)
*** 506,512 ****
  
  already_queued:
  	/* Unlock the buffer. */
! 	CLR(bp->b_flags, (B_AGE | B_ASYNC | B_BUSY | B_NOCACHE));
  
  	/* Allow disk interrupts. */
  	splx(s);
--- 503,509 ----
  
  already_queued:
  	/* Unlock the buffer. */
! 	CLR(bp->b_flags, (B_AGE | B_ASYNC | B_BUSY | B_NOCACHE | B_CACHE));
  
  	/* Allow disk interrupts. */
  	splx(s);
*************** start:
*** 585,591 ****
  		}
  
  		if (!ISSET(bp->b_flags, B_INVAL)) {
! 			SET(bp->b_flags, (B_BUSY | B_CACHE));
  			bremfree(bp);
  			splx(s);
  			break;
--- 582,590 ----
  		}
  
  		if (!ISSET(bp->b_flags, B_INVAL)) {
! 			SET(bp->b_flags, B_BUSY);
! 			if (size <= bp->b_bcount)
! 				SET(bp->b_flags, B_CACHE);
  			bremfree(bp);
  			splx(s);
  			break;
*************** getnewbuf(slpflag, slptimeo)
*** 728,743 ****
  
  start:
  	s = splbio();
! 	if ((bp = bufqueues[BQ_AGE].tqh_first) != NULL ||
! 	    (bp = bufqueues[BQ_LRU].tqh_first) != NULL) {
! 		bremfree(bp);
! 	} else {
  		/* wait for a free buffer of any kind */
  		needbuffer = 1;
  		tsleep(&needbuffer, slpflag|(PRIBIO+1), "getnewbuf", slptimeo);
  		splx(s);
  		return (0);
  	}
  
  	if (ISSET(bp->b_flags, B_VFLUSH)) {
  		/*
--- 727,742 ----
  
  start:
  	s = splbio();
! 	if ((bp = bufqueues[BQ_AGE].tqh_first) == NULL &&
! 	    (bp = bufqueues[BQ_LRU].tqh_first) == NULL) {
  		/* wait for a free buffer of any kind */
  		needbuffer = 1;
  		tsleep(&needbuffer, slpflag|(PRIBIO+1), "getnewbuf", slptimeo);
  		splx(s);
  		return (0);
  	}
+ 
+ 	bremfree(bp);
  
  	if (ISSET(bp->b_flags, B_VFLUSH)) {
  		/*
>Audit-Trail:
>Unformatted: