Subject: New patch (Re: RFC: addition of B_NESTED to buf.h and vfs_bio.c)
To: Jason Thorpe <thorpej@shagadelic.org>
From: Reinoud Zandijk <reinoud@netbsd.org>
List: tech-kern
Date: 01/07/2006 03:09:11
--V88s5gaDVPzZ0KCq
Content-Type: multipart/mixed; boundary="98e8jtXdkpgskNou"
Content-Disposition: inline


--98e8jtXdkpgskNou
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

Dear folks,

i've got a new patch ready for the nesting of buffers complete with patch 
for genfs. I've also added a patch for LFS but since LFS is messing around 
with B_LOCKED buffers something went wrong. I've discussed with with Greg 
and it needs some more work. In the mean time LFS can still use the old 
method without problems; this means that the UVM patch can't be applied 
either since the function removed from UVM is needed still by LFS.

Basicly, i've tried to make the nested buffers interface agnostic as much 
as possible and indifferent to other layers they can be passed to. The 
buffers can even be brelse()'d or issued with B_ASYNC without a callback; 
the code will do the right thing. If code uses callbacks they can just 
brelse() them like it is a normal buffer; no need for special knowledge. 
This is documented in the source. A patch for manpages is not added yet.

If the splitup between iobuffers and cached buffers is made, this behaviour 
is then more logically implemented and can be easily taken care of.

I've also checked if nested buffers can be nested again and this has proven 
to work fine. Nested buffers can be used for read and for write actions 
with the appropriate write counter updated.

Also included - on request by Jason - is maintenance of a list of dependent 
buffers that is checked for sanity at biodone() time. This could also be 
used later to recall buffer operations if for such a thing is decided on.

I'd like to post the BIO en GENFS patch this week or so (if only to clean 
up my tree ;)). The LFS and its dependent UVM patch can be dealt with 
later.

With regards,
Reinoud


--98e8jtXdkpgskNou
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename=DIFF_BIO

Index: sys/sys/buf.h
===================================================================
RCS file: /cvsroot/src/sys/sys/buf.h,v
retrieving revision 1.86
diff -p -r1.86 buf.h
*** sys/sys/buf.h	7 Jan 2006 00:26:58 -0000	1.86
--- sys/sys/buf.h	7 Jan 2006 01:53:07 -0000
*************** struct buf {
*** 146,151 ****
--- 146,156 ----
  	struct  workhead b_dep;		/* List of filesystem dependencies. */
  	void	*b_saveaddr;		/* Original b_addr for physio. */
  
+ 	/* nested buffer support */
+ 	struct buf *b_masterbuf;	/* If nested points to masterbuf */
+ 	LIST_HEAD(, buf) b_nesters;	/* Keeps track of it's nested bufs */
+ 	LIST_ENTRY(buf) b_nestedlist;
+ 
  	/*
  	 * private data for owner.
  	 *  - buffer cache buffers are owned by corresponding filesystem.
*************** do {									\
*** 174,179 ****
--- 179,187 ----
  	LIST_INIT(&(bp)->b_dep);					\
  	simple_lock_init(&(bp)->b_interlock);				\
  	(bp)->b_dev = NODEV;						\
+ 	(bp)->b_vp = NULL;						\
+ 	(bp)->b_masterbuf = NULL;					\
+ 	LIST_INIT(&(bp)->b_nesters);					\
  	BIO_SETPRIO((bp), BPRIO_DEFAULT);				\
  } while (/*CONSTCOND*/0)
  
*************** struct buf *getblk(struct vnode *, daddr
*** 282,287 ****
--- 290,296 ----
  struct buf *geteblk(int);
  struct buf *getnewbuf(int, int, int);
  struct buf *incore(struct vnode *, daddr_t);
+ struct buf *nestiobuf(struct buf *, daddr_t, caddr_t, int);
  
  void	minphys(struct buf *);
  int	physio(void (*)(struct buf *), struct buf *, dev_t, int,
Index: sys/kern/vfs_bio.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vfs_bio.c,v
retrieving revision 1.151
diff -p -r1.151 vfs_bio.c
*** sys/kern/vfs_bio.c	7 Jan 2006 00:26:58 -0000	1.151
--- sys/kern/vfs_bio.c	7 Jan 2006 01:53:08 -0000
*************** void
*** 883,888 ****
--- 883,889 ----
  brelse(struct buf *bp)
  {
  	struct bqueue *bufq;
+ 	struct buf *mbp;
  	int s;
  
  	/* Block disk interrupts. */
*************** brelse(struct buf *bp)
*** 905,910 ****
--- 906,936 ----
  		wakeup(bp);
  	}
  
+ 	/* 
+ 	 * If it's a nested buffer, we could be called by biodone on an async
+ 	 * buffer without a call back. We thus only recycle struct buf since
+ 	 * it doesn't own its data block and should not be cached or reused in
+ 	 * the normal way. It also needs to be put on the bufiopool.
+ 	 */
+ 	mbp = bp->b_masterbuf;
+ 	if (mbp) {
+ 		LIST_REMOVE(bp, b_nestedlist);
+ 
+ 		/* sanity */
+ 		mbp = bp->b_masterbuf = NULL;
+ 		bp->b_bufsize = 0;
+ 
+ 		/* Allow disk interrupts again */
+ 		simple_unlock(&bp->b_interlock);
+ 		simple_unlock(&bqueue_slock);
+ 
+ 		/* recycle */
+ 		putiobuf(bp);
+ 		splx(s);
+ 
+ 		return;
+ 	};
+ 
  	/*
  	 * Determine which queue the buffer should be on, then put it there.
  	 */
*************** already_queued:
*** 1004,1009 ****
--- 1030,1086 ----
  }
  
  /*
+  * Get a new buffer nested into the specified buffer at the given offset and
+  * length. NO read/write actions ought to be caried out on the master buffer
+  * anymore only on the nested buffers as they effectively split up the master
+  * buffer's action.
+  *
+  * Bug alert: make sure all nested buffers cover the complete mbp->resid
+  * space.  If space is to be skipped, mbp->resid needs to be accounted for or
+  * the biodone on mbp won't be called!
+  */
+ struct buf *
+ nestiobuf(struct buf *mbp, daddr_t blkno, caddr_t base, int size)
+ {
+ 	struct buf *bp;
+ 	int s;
+ 
+ 	KASSERT(base != NULL);
+ 	KASSERT(size > 0);
+ 	KASSERT(mbp);
+ 	KASSERT(mbp->b_bcount >= size);
+ 
+ 	bp = getiobuf_nowait();
+ 
+ 	/*
+ 	 * Adjust relevant information from the master buffer and clear
+ 	 * callback info (!)
+ 	 */
+ 	memcpy(bp, mbp, sizeof(struct buf));
+ 	bp->b_flags &= ~B_CALL;
+ 	bp->b_blkno = blkno;
+ 	bp->b_data  = base;
+ 	bp->b_bufsize = bp->b_bcount = bp->b_resid = size;
+ 
+ 	/* avoid confusion for softdep? */
+ 	LIST_INIT(&bp->b_dep);
+ 
+ 	/* dependency administration */
+ 	bp->b_masterbuf = mbp;
+ 	LIST_INSERT_HEAD(&mbp->b_nesters, bp, b_nestedlist);
+ 	LIST_INIT(&bp->b_nesters);
+ 
+ 	/* for write requests we have to increase mbp's num_output */
+ 	if ((mbp->b_flags & B_READ) == 0) {
+ 		s = splbio();
+ 		V_INCR_NUMOUTPUT(mbp->b_vp);
+ 		splx(s);
+ 	};
+ 
+ 	return bp;
+ }
+ 
+ /*
   * Determine if a block is in the cache.
   * Just look on what would be its hash chain.  If it's there, return
   * a pointer to it, unless it's marked invalid.  If it's marked invalid,
*************** void
*** 1382,1387 ****
--- 1459,1465 ----
  biodone(struct buf *bp)
  {
  	int s = splbio();
+ 	struct buf *mbp;
  
  	simple_lock(&bp->b_interlock);
  	if (ISSET(bp->b_flags, B_DONE))
*************** biodone(struct buf *bp)
*** 1395,1400 ****
--- 1473,1489 ----
  	if (!ISSET(bp->b_flags, B_READ))	/* wake up reader */
  		vwakeup(bp);
  
+ 	/* Propagate status to master buffer if its a nested buffer */
+ 	mbp = bp->b_masterbuf;
+ 	if (mbp) {
+ 		KASSERT(mbp);
+ 		if (bp->b_flags & B_ERROR) {
+ 			mbp->b_flags |= B_ERROR;
+ 			mbp->b_error = bp->b_error;
+ 		};
+ 		mbp->b_resid -= bp->b_bcount;
+ 	};
+ 
  	/*
  	 * If necessary, call out.  Unlock the buffer before calling
  	 * iodone() as the buffer isn't valid any more when it return.
*************** biodone(struct buf *bp)
*** 1415,1420 ****
--- 1504,1518 ----
  	}
  
  	splx(s);
+ 
+ 	/* call biodone on master buffer if its completed by this op */
+ 	if (mbp) {
+ 		if (mbp->b_resid == 0) {
+ 			if (!LIST_EMPTY(&mbp->b_nesters))
+ 				panic("biodone: empty nested buf list isn't");
+ 			biodone(mbp);
+ 		};
+ 	};
  }
  
  /*

--98e8jtXdkpgskNou
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename=DIFF_GENFS

Index: sys/miscfs/genfs/genfs_vnops.c
===================================================================
RCS file: /cvsroot/src/sys/miscfs/genfs/genfs_vnops.c,v
retrieving revision 1.119
diff -p -r1.119 genfs_vnops.c
*** sys/miscfs/genfs/genfs_vnops.c	4 Jan 2006 10:13:06 -0000	1.119
--- sys/miscfs/genfs/genfs_vnops.c	7 Jan 2006 01:53:30 -0000
*************** genfs_getpages(void *v)
*** 799,814 ****
  		if (offset == startoffset && iobytes == bytes) {
  			bp = mbp;
  		} else {
! 			bp = getiobuf();
! 			bp->b_data = (char *)kva + offset - startoffset;
! 			bp->b_resid = bp->b_bcount = iobytes;
! 			bp->b_flags = B_BUSY|B_READ|B_CALL|B_ASYNC;
! 			bp->b_iodone = uvm_aio_biodone1;
! 			bp->b_vp = vp;
! 			bp->b_proc = NULL;
  		}
  		bp->b_lblkno = 0;
- 		bp->b_private = mbp;
  
  		/* adjust physical blkno for partial blocks */
  		bp->b_blkno = blkno + ((offset - ((off_t)lbn << fs_bshift)) >>
--- 799,811 ----
  		if (offset == startoffset && iobytes == bytes) {
  			bp = mbp;
  		} else {
! 			char *bufpos;
! 			
! 			bufpos = (char *)kva + offset - startoffset;
! 			bp = nestiobuf(mbp, 0, bufpos, iobytes);
! 			bp->b_flags |= B_ASYNC;
  		}
  		bp->b_lblkno = 0;
  
  		/* adjust physical blkno for partial blocks */
  		bp->b_blkno = blkno + ((offset - ((off_t)lbn << fs_bshift)) >>
*************** genfs_gop_write(struct vnode *vp, struct
*** 1501,1521 ****
  		if (offset == startoffset && iobytes == bytes) {
  			bp = mbp;
  		} else {
! 			s = splbio();
! 			V_INCR_NUMOUTPUT(vp);
! 			splx(s);
! 			bp = getiobuf();
  			UVMHIST_LOG(ubchist, "vp %p bp %p num now %d",
  			    vp, bp, vp->v_numoutput, 0);
- 			bp->b_data = (char *)kva +
- 			    (vaddr_t)(offset - pg->offset);
- 			bp->b_resid = bp->b_bcount = iobytes;
- 			bp->b_flags = B_BUSY|B_WRITE|B_CALL|B_ASYNC;
- 			bp->b_iodone = uvm_aio_biodone1;
- 			bp->b_vp = vp;
  		}
  		bp->b_lblkno = 0;
- 		bp->b_private = mbp;
  
  		/* adjust physical blkno for partial blocks */
  		bp->b_blkno = blkno + ((offset - ((off_t)lbn << fs_bshift)) >>
--- 1498,1513 ----
  		if (offset == startoffset && iobytes == bytes) {
  			bp = mbp;
  		} else {
! 			char *bufpos;
! 
! 			bufpos = (char *)kva + (vaddr_t)(offset - pg->offset);
! 			bp = nestiobuf(mbp, 0, bufpos, iobytes);
! 			bp->b_flags |= B_ASYNC;
! 
  			UVMHIST_LOG(ubchist, "vp %p bp %p num now %d",
  			    vp, bp, vp->v_numoutput, 0);
  		}
  		bp->b_lblkno = 0;
  
  		/* adjust physical blkno for partial blocks */
  		bp->b_blkno = blkno + ((offset - ((off_t)lbn << fs_bshift)) >>

--98e8jtXdkpgskNou
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename=DIFF_LFS

Index: sys/ufs/lfs/lfs_vfsops.c
===================================================================
RCS file: /cvsroot/src/sys/ufs/lfs/lfs_vfsops.c,v
retrieving revision 1.191
diff -p -b -r1.191 lfs_vfsops.c
*** sys/ufs/lfs/lfs_vfsops.c	4 Jan 2006 10:13:06 -0000	1.191
--- sys/ufs/lfs/lfs_vfsops.c	7 Jan 2006 02:03:18 -0000
*************** lfs_gop_write(struct vnode *vp, struct v
*** 2069,2077 ****
  	}
  
  	s = splbio();
! 	simple_lock(&global_v_numoutput_slock);
! 	vp->v_numoutput += 2; /* one for biodone, one for aiodone */
! 	simple_unlock(&global_v_numoutput_slock);
  	splx(s);
  
  	mbp = getiobuf();
--- 2069,2075 ----
  	}
  
  	s = splbio();
! 	V_INCR_NUMOUTPUT(vp);	 /* one for aoidone/biodone */
  	splx(s);
  
  	mbp = getiobuf();
*************** lfs_gop_write(struct vnode *vp, struct v
*** 2135,2165 ****
  		/* if it's really one i/o, don't make a second buf */
  		if (offset == startoffset && iobytes == bytes) {
  			bp = mbp;
- 			/* correct overcount if there is no second buffer */
- 			s = splbio();
- 			simple_lock(&global_v_numoutput_slock);
- 			--vp->v_numoutput;
- 			simple_unlock(&global_v_numoutput_slock);
- 			splx(s);
  		} else {
! 			bp = getiobuf();
  			UVMHIST_LOG(ubchist, "vp %p bp %p num now %d",
  			    vp, bp, vp->v_numoutput, 0);
! 			bp->b_data = (char *)kva +
! 			    (vaddr_t)(offset - pg->offset);
! 			bp->b_resid = bp->b_bcount = iobytes;
! 			bp->b_flags = B_BUSY|B_WRITE|B_CALL;
! 			bp->b_iodone = uvm_aio_biodone1;
  		}
  
  		/* XXX This is silly ... is this necessary? */
! 		bp->b_vp = NULL;
  		s = splbio();
! 		bgetvp(vp, bp);
  		splx(s);
  
  		bp->b_lblkno = lblkno(fs, offset);
- 		bp->b_private = mbp;
  		if (devvp->v_type == VBLK) {
  			bp->b_dev = devvp->v_rdev;
  		}
--- 2133,2156 ----
  		/* if it's really one i/o, don't make a second buf */
  		if (offset == startoffset && iobytes == bytes) {
  			bp = mbp;
  		} else {
! 			char *bufpos;
! 
  			UVMHIST_LOG(ubchist, "vp %p bp %p num now %d",
  				vp, bp, vp->v_numoutput, 0);
! 
! 			bufpos = (char *)kva + (vaddr_t)(offset - pg->offset);
! 			bp = nestiobuf(mbp, 0, bufpos, iobytes);
! 			/* CANT mark is async since LFS doesn't like that here */
  		}
  
  		/* XXX This is silly ... is this necessary? */
! 		mbp->b_vp = NULL;
  		s = splbio();
! 		bgetvp(vp, mbp);
  		splx(s);
  
  		bp->b_lblkno = lblkno(fs, offset);
  		if (devvp->v_type == VBLK) {
  			bp->b_dev = devvp->v_rdev;
  		}

--98e8jtXdkpgskNou
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename=DIFF_UVM

Index: sys/uvm/uvm_extern.h
===================================================================
RCS file: /cvsroot/src/sys/uvm/uvm_extern.h,v
retrieving revision 1.108
diff -p -r1.108 uvm_extern.h
*** sys/uvm/uvm_extern.h	21 Dec 2005 12:19:04 -0000	1.108
--- sys/uvm/uvm_extern.h	7 Jan 2006 01:54:00 -0000
*************** void			uvm_page_physload(paddr_t, paddr_
*** 672,678 ****
  void			uvm_setpagesize(void);
  
  /* uvm_pager.c */
- void			uvm_aio_biodone1(struct buf *);
  void			uvm_aio_biodone(struct buf *);
  void			uvm_aio_aiodone(struct buf *);
  
--- 672,677 ----
Index: sys/uvm/uvm_pager.c
===================================================================
RCS file: /cvsroot/src/sys/uvm/uvm_pager.c,v
retrieving revision 1.73
diff -p -r1.73 uvm_pager.c
*** sys/uvm/uvm_pager.c	4 Jan 2006 10:13:06 -0000	1.73
--- sys/uvm/uvm_pager.c	7 Jan 2006 01:54:00 -0000
*************** uvm_pagermapout(vaddr_t kva, int npages)
*** 240,268 ****
  }
  
  /*
-  * interrupt-context iodone handler for nested i/o bufs.
-  *
-  * => must be at splbio().
-  */
- 
- void
- uvm_aio_biodone1(struct buf *bp)
- {
- 	struct buf *mbp = bp->b_private;
- 
- 	KASSERT(mbp != bp);
- 	if (bp->b_flags & B_ERROR) {
- 		mbp->b_flags |= B_ERROR;
- 		mbp->b_error = bp->b_error;
- 	}
- 	mbp->b_resid -= bp->b_bcount;
- 	putiobuf(bp);
- 	if (mbp->b_resid == 0) {
- 		biodone(mbp);
- 	}
- }
- 
- /*
   * interrupt-context iodone handler for single-buf i/os
   * or the top-level buf of a nested-buf i/o.
   *
--- 240,245 ----

--98e8jtXdkpgskNou--

--V88s5gaDVPzZ0KCq
Content-Type: application/pgp-signature
Content-Disposition: inline

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.6 (NetBSD)

iQEVAwUBQ78iuYKcNwBDyKpoAQKGhQf/W9OUj9AUROVgvsRXrCel6OQ2eAKVSXrg
NeUyauDKG9/PHjN0Hbdg6RpoidbyTXnZA3fiOcBc7NhJAAEQggmabXE7medm+COs
FATyx5IDsR5lemPB2ycI7YAfXUzUWV8wp79mILQTVvEi9zg+VxUEpOEfnCD+f+bu
9QF0heaDqt3/kErhg+2SGVgLFzEpfU5KTy7inQzxEQUNWIa5SfCCUdkYxwr+dplF
4fPxgKIFrvXrvz5VUBsR9IbYAMr+aueIsm28rZzwzPuW5KAdzqhzX0gjONTnEna0
3TD4XkbL3s5R8IYU59draaqcaLgm4QZ8yAZUDEeN2ld7n+UD4sbr9Q==
=xu1W
-----END PGP SIGNATURE-----

--V88s5gaDVPzZ0KCq--