Subject: kernel thread helper for vnd(4)
To: None <tech-kern@netbsd.org>
From: Manuel Bouyer <bouyer@antioche.eu.org>
List: tech-kern
Date: 03/29/2005 22:13:21
--+HP7ph2BbKc20aGI
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

Hi,
quoting Jason in a previous mail on this list about vnd problems:
On Wed, Mar 23, 2005 at 04:52:53PM -0800, Jason Thorpe wrote:
> 
> This is really a known problem.  The vnd driver needs to be rewritten  
> badly.  It needs to queue up requests and defer everything to a  
> kthread that actually does the work.

I've changed vnd to use a kernel thread to perform the I/O.
in the current vnd version, vndstrategy() breaks the request in smaller
chunks, put these chunks in buffers and queue them up. vndstart() will
dequeue the buffers and call VOP_STRATEGY().

With my changes, vndstrategy() just queue the requests in a buffer queue
and wakes up the thread. The thread will breaks the request in smaller chunks
and call VOP_STRATEGY(), taking care to not have more than vnd->sc_maxactive
requests active in parallel. It will sleep waiting either for a buffer
in the queue, or for some request to complete if there are
vnd->sc_maxactive I/O active.
The thread is created when a file is attached to the vnd, and will exit when
the vnd is unconfigured.

Does this look good ?

Note: the attached diff is in diff -ub format, to make it smaller and easier
to read.

-- 
Manuel Bouyer <bouyer@antioche.eu.org>
     NetBSD: 26 ans d'experience feront toujours la difference
--

--+HP7ph2BbKc20aGI
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename=diff

Index: vnd.c
===================================================================
RCS file: /cvsroot/src/sys/dev/vnd.c,v
retrieving revision 1.111
diff -u -b -r1.111 vnd.c
--- vnd.c	28 Oct 2004 07:07:39 -0000	1.111
+++ vnd.c	29 Mar 2005 19:11:45 -0000
@@ -143,6 +143,7 @@
 #include <sys/systm.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
+#include <sys/kthread.h>
 #include <sys/errno.h>
 #include <sys/buf.h>
 #include <sys/bufq.h>
@@ -207,7 +208,6 @@
 int	vnddetach(void);
 
 static void	vndclear(struct vnd_softc *, int);
-static void	vndstart(struct vnd_softc *);
 static int	vndsetcred(struct vnd_softc *, struct ucred *);
 static void	vndthrottle(struct vnd_softc *, struct vnode *);
 static void	vndiodone(struct buf *);
@@ -221,6 +221,8 @@
 static int	vndlock(struct vnd_softc *);
 static void	vndunlock(struct vnd_softc *);
 
+void vndthread(void *);
+
 static dev_type_open(vndopen);
 static dev_type_close(vndclose);
 static dev_type_read(vndread);
@@ -393,38 +395,24 @@
 }
 
 /*
- * Break the request into bsize pieces and submit using VOP_BMAP/VOP_STRATEGY.
+ * Qeue the request, and wakeup the kernel thread to handle it.
  */
 static void
 vndstrategy(struct buf *bp)
 {
 	int unit = vndunit(bp->b_dev);
 	struct vnd_softc *vnd = &vnd_softc[unit];
-	struct vndxfer *vnx;
-	struct mount *mp;
-	int s, bsize, resid;
-	off_t bn;
-	caddr_t addr;
-	int sz, flags, error, wlabel;
-	struct disklabel *lp;
-	struct partition *pp;
+	struct disklabel *lp = vnd->sc_dkdev.dk_label;
+	int s = splbio();
+
+	bp->b_resid = bp->b_bcount;
 
-#ifdef DEBUG
-	if (vnddebug & VDB_FOLLOW)
-		printf("vndstrategy(%p): unit %d\n", bp, unit);
-#endif
 	if ((vnd->sc_flags & VNF_INITED) == 0) {
 		bp->b_error = ENXIO;
 		bp->b_flags |= B_ERROR;
 		goto done;
 	}
 
-	/* If it's a nil transfer, wake up the top half now. */
-	if (bp->b_bcount == 0)
-		goto done;
-
-	lp = vnd->sc_dkdev.dk_label;
-
 	/*
 	 * The transfer must be a whole number of blocks.
 	 */
@@ -435,15 +423,6 @@
 	}
 
 	/*
-	 * Do bounds checking and adjust transfer.  If there's an error,
-	 * the bounds check will flag that for us.
-	 */
-	wlabel = vnd->sc_flags & (VNF_WLABEL|VNF_LABELLING);
-	if (DISKPART(bp->b_dev) != RAW_PART)
-		if (bounds_check_with_label(&vnd->sc_dkdev, bp, wlabel) <= 0)
-			goto done;
-
-	/*
 	 * check if we're read-only.
 	 */
 	if ((vnd->sc_flags & VNF_READONLY) && !(bp->b_flags & B_READ)) {
@@ -452,6 +431,67 @@
 		goto done;
 	}
 
+	/*
+	 * Do bounds checking and adjust transfer.  If there's an error,
+	 * the bounds check will flag that for us.
+	 */
+	if (DISKPART(bp->b_dev) != RAW_PART) {
+		if (bounds_check_with_label(&vnd->sc_dkdev,
+		    bp, vnd->sc_flags & (VNF_WLABEL|VNF_LABELLING)) <= 0)
+			goto done;
+	}
+
+	/* If it's a nil transfer, wake up the top half now. */
+	if (bp->b_bcount == 0)
+		goto done;
+#ifdef DEBUG
+	if (vnddebug & VDB_FOLLOW)
+		printf("vndstrategy(%p): unit %d\n", bp, unit);
+#endif
+	BUFQ_PUT(&vnd->sc_tab, bp);
+	wakeup(&vnd->sc_tab);
+	splx(s);
+	return;
+done:
+	biodone(bp);
+	splx(s);
+}
+
+void
+vndthread(void *arg)
+{
+	struct vnd_softc *vnd = arg;
+	struct buf *bp;
+	struct vndxfer *vnx;
+	struct mount *mp;
+	int s, bsize, resid;
+	off_t bn;
+	caddr_t addr;
+	int sz, flags, error;
+	struct disklabel *lp;
+	struct partition *pp;
+
+	s = splbio();
+	vnd->sc_flags |= VNF_KTHREAD;
+	wakeup(&vnd->sc_kthread);
+
+	/*
+	 * Dequeue requests, break them into bsize pieces and submit using
+	 * VOP_BMAP/VOP_STRATEGY.
+	 */
+	while ((vnd->sc_flags & VNF_VUNCONF) == 0) {
+		bp = BUFQ_GET(&vnd->sc_tab);
+		if (bp == NULL) {
+			tsleep(&vnd->sc_tab, PRIBIO, "vndbp", 0);
+			continue;
+		};
+		splx(s);
+
+#ifdef DEBUG
+		if (vnddebug & VDB_FOLLOW)
+			printf("vndthread(%p\n", bp);
+#endif
+		lp = vnd->sc_dkdev.dk_label;
 	bp->b_resid = bp->b_bcount;
 
 	/*
@@ -464,7 +504,8 @@
 	 * Translate the partition-relative block number to an absolute.
 	 */
 	if (DISKPART(bp->b_dev) != RAW_PART) {
-		pp = &vnd->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
+			pp = &vnd->sc_dkdev.dk_label->d_partitions[
+			    DISKPART(bp->b_dev)];
 		bn += pp->p_offset;
 	}
 
@@ -480,7 +521,10 @@
 	addr = bp->b_data;
 	flags = (bp->b_flags & (B_READ|B_ASYNC)) | B_CALL;
 
-	/* Allocate a header for this transfer and link it to the buffer */
+		/*
+		 * Allocate a header for this transfer and link it to the
+		 * buffer
+		 */
 	s = splbio();
 	vnx = VND_GETXFER(vnd);
 	splx(s);
@@ -492,6 +536,12 @@
 	if ((flags & B_READ) == 0)
 		vn_start_write(vnd->sc_vp, &mp, V_WAIT);
 
+		/*
+		 * Feed requests sequentially.
+		 * We do it this way to keep from flooding NFS servers if we
+		 * are connected to an NFS file.  This places the burden on
+		 * the client rather than the server.
+		 */
 	for (resid = bp->b_resid; resid; resid -= sz) {
 		struct vndbuf *nbp;
 		struct vnode *vp;
@@ -540,6 +590,12 @@
 #endif
 
 		s = splbio();
+			while (vnd->sc_active >= vnd->sc_maxactive) {
+				tsleep(&vnd->sc_tab, PRIBIO, "vndac", 0);
+				if (vnd->sc_flags & VNF_VUNCONF)
+					goto kthread_end;
+			}
+			vnd->sc_active++;
 		nbp = VND_GETBUF(vnd);
 		splx(s);
 		BUF_INIT(&nbp->vb_buf);
@@ -567,9 +623,23 @@
 			goto out;
 		}
 		vnx->vx_pending++;
+#ifdef DEBUG
+			if (vnddebug & VDB_IO)
+				printf("vndstart(%ld): bp %p vp %p blkno "
+				    "0x%" PRIx64 " flags %x addr %p cnt 0x%x\n",
+				    (long) (vnd-vnd_softc), &nbp->vb_buf,
+				    nbp->vb_buf.b_vp, nbp->vb_buf.b_blkno,
+				    nbp->vb_buf.b_flags, nbp->vb_buf.b_data,
+				    nbp->vb_buf.b_bcount);
+#endif
+
+			/* Instrumentation. */
+			disk_busy(&vnd->sc_dkdev);
+
+			if ((nbp->vb_buf.b_flags & B_READ) == 0)
+				vp->v_numoutput++;
+			VOP_STRATEGY(vp, &nbp->vb_buf);
 
-		BUFQ_PUT(&vnd->sc_tab, &nbp->vb_buf);
-		vndstart(vnd);
 		splx(s);
 		bn += sz;
 		addr += sz;
@@ -589,57 +659,21 @@
 		VND_PUTXFER(vnd, vnx);
 		biodone(bp);
 	}
-	splx(s);
-	return;
-
- done:
+		continue;
+done:
 	biodone(bp);
-}
-
-/*
- * Feed requests sequentially.
- * We do it this way to keep from flooding NFS servers if we are connected
- * to an NFS file.  This places the burden on the client rather than the
- * server.
- */
-static void
-vndstart(struct vnd_softc *vnd)
-{
-	struct buf	*bp;
-
-	/*
-	 * Dequeue now since lower level strategy routine might
-	 * queue using same links
-	 */
-
-	if ((vnd->sc_flags & VNF_BUSY) != 0)
-		return;
-
-	vnd->sc_flags |= VNF_BUSY;
-
-	while (vnd->sc_active < vnd->sc_maxactive) {
-		bp = BUFQ_GET(&vnd->sc_tab);
-		if (bp == NULL)
-			break;
-		vnd->sc_active++;
-#ifdef DEBUG
-		if (vnddebug & VDB_IO)
-			printf("vndstart(%ld): bp %p vp %p blkno 0x%" PRIx64
-				" flags %x addr %p cnt 0x%x\n",
-			    (long) (vnd-vnd_softc), bp, bp->b_vp, bp->b_blkno,
-			    bp->b_flags, bp->b_data, bp->b_bcount);
-#endif
-
-		/* Instrumentation. */
-		disk_busy(&vnd->sc_dkdev);
-
-		if ((bp->b_flags & B_READ) == 0)
-			bp->b_vp->v_numoutput++;
-		VOP_STRATEGY(bp->b_vp, bp);
+		s = splbio();
+		continue;
 	}
-	vnd->sc_flags &= ~VNF_BUSY;
+
+kthread_end:
+	vnd->sc_flags &= (~VNF_KTHREAD | VNF_VUNCONF);
+	wakeup(&vnd->sc_kthread);
+	splx(s);
+	kthread_exit(0);
 }
 
+
 static void
 vndiodone(struct buf *bp)
 {
@@ -710,7 +744,7 @@
 	}
 
 	vnd->sc_active--;
-	vndstart(vnd);
+	wakeup(&vnd->sc_tab);
 	splx(s);
 }
 
@@ -906,9 +940,23 @@
 
 		if ((error = vndsetcred(vnd, p->p_ucred)) != 0)
 			goto close_and_exit;
+
+		memset(vnd->sc_xname, 0, sizeof(vnd->sc_xname)); /* XXX */
+		snprintf(vnd->sc_xname, sizeof(vnd->sc_xname), "vnd%d", unit);
+
+
 		vndthrottle(vnd, vnd->sc_vp);
 		vio->vnd_size = dbtob(vnd->sc_size);
 		vnd->sc_flags |= VNF_INITED;
+
+		/* create the kernel thread, wait for it to be up */
+		error = kthread_create1(vndthread, vnd, &vnd->sc_kthread,
+		    vnd->sc_xname);
+		if (error)
+			goto close_and_exit;
+		while ((vnd->sc_flags & VNF_KTHREAD) == 0) {
+			tsleep(&vnd->sc_kthread, PRIBIO, "vndthr", 0);
+		}
 #ifdef DEBUG
 		if (vnddebug & VDB_INIT)
 			printf("vndioctl: SET vp %p size 0x%lx %d/%d/%d/%d\n",
@@ -920,8 +968,6 @@
 #endif
 
 		/* Attach the disk. */
-		memset(vnd->sc_xname, 0, sizeof(vnd->sc_xname)); /* XXX */
-		snprintf(vnd->sc_xname, sizeof(vnd->sc_xname), "vnd%d", unit);
 		vnd->sc_dkdev.dk_name = vnd->sc_xname;
 		disk_attach(&vnd->sc_dkdev);
 
@@ -1214,9 +1260,16 @@
 
 	if ((vnd->sc_flags & VNF_READONLY) == 0)
 		fflags |= FWRITE;
-	vnd->sc_flags &= ~(VNF_INITED | VNF_READONLY | VNF_VLABEL);
+
+	vnd->sc_flags |= VNF_VUNCONF;
+	wakeup(&vnd->sc_tab);
+	while (vnd->sc_flags & VNF_KTHREAD)
+		tsleep(&vnd->sc_kthread, PRIBIO, "vnthr", 0);
+
+	vnd->sc_flags &=
+	    ~(VNF_INITED | VNF_READONLY | VNF_VLABEL | VNF_VUNCONF);
 	if (vp == (struct vnode *)0)
-		panic("vndioctl: null vp");
+		panic("vndclear: null vp");
 	(void) vn_close(vp, fflags, vnd->sc_cred, p);
 	crfree(vnd->sc_cred);
 	vnd->sc_vp = (struct vnode *)0;
Index: vndvar.h
===================================================================
RCS file: /cvsroot/src/sys/dev/vndvar.h,v
retrieving revision 1.13
diff -u -b -r1.13 vndvar.h
--- vndvar.h	27 Feb 2005 00:26:58 -0000	1.13
+++ vndvar.h	29 Mar 2005 19:11:45 -0000
@@ -162,6 +162,7 @@
 	struct vndgeom	 sc_geom;	/* virtual geometry */
 	struct pool	 sc_vxpool;	/* vndxfer pool */
 	struct pool	 sc_vbpool;	/* vndbuf pool */
+	struct proc 	*sc_kthread;	/* kernel thread */
 };
 #endif
 
@@ -171,10 +172,11 @@
 #define	VNF_LABELLING	0x004	/* unit is currently being labelled */
 #define	VNF_WANTED	0x008	/* someone is waiting to obtain a lock */
 #define	VNF_LOCKED	0x010	/* unit is locked */
-#define	VNF_BUSY	0x020	/* unit is busy */
-#define	VNF_READONLY	0x040	/* unit is read-only */
-#define	VNF_KLABEL	0x080	/* keep label on close */
-#define	VNF_VLABEL	0x100	/* label is valid */
+#define	VNF_READONLY	0x020	/* unit is read-only */
+#define	VNF_KLABEL	0x040	/* keep label on close */
+#define	VNF_VLABEL	0x080	/* label is valid */
+#define	VNF_KTHREAD	0x100	/* thread is running */
+#define	VNF_VUNCONF	0x200	/* device is unconfiguring */
 
 /*
  * A simple structure for describing which vnd units are in use.

--+HP7ph2BbKc20aGI--