Subject: Re: genfs_putpages with PGO_SYNCIO
To: None <enami@sm.sony.co.jp>
From: YAMAMOTO Takashi <yamt@mwd.biglobe.ne.jp>
List: tech-kern
Date: 01/22/2003 21:38:02
--NextPart-20030122212753-0037300
Content-Type: Text/Plain; charset=us-ascii

> > Yes.  IMHO, they (new one vs. struct buf and bio* routine) describe
> > different thing (the former is just related to GOP_PUTPAGES and
> > GOP_WRITE).  E.g., if one might want to limit the number of async
> > write issued at a time, the code will wait in the middle of loop in
> > _putpages routine.  For this, not using struct buf will be cleaner.

i made a new patch.  how about this one?

YAMAMOTO Takashi

--NextPart-20030122212753-0037300
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="genfs.syncio6.diff"

Index: sys/buf.h
===================================================================
RCS file: /cvs/NetBSD/src/sys/sys/buf.h,v
retrieving revision 1.55
diff -u -p -r1.55 buf.h
--- sys/buf.h	2002/10/06 17:05:56	1.55
+++ sys/buf.h	2003/01/22 12:34:32
@@ -217,6 +217,7 @@ struct buf {
 #define	B_WRITE		0x00000000	/* Write buffer (pseudo flag). */
 #define	B_XXX		0x02000000	/* Debugging flag. */
 #define	B_VFLUSH	0x04000000	/* Buffer is being synced. */
+#define	B_STABLE	0x08000000	/* Data should go to stable storage */
 
 /*
  * This structure describes a clustered I/O.  It is stored in the b_saveaddr
Index: nfs/nfs_bio.c
===================================================================
RCS file: /cvs/NetBSD/src/sys/nfs/nfs_bio.c,v
retrieving revision 1.86
diff -u -p -r1.86 nfs_bio.c
--- nfs/nfs_bio.c	2003/01/18 09:34:30	1.86
+++ nfs/nfs_bio.c	2003/01/22 12:34:32
@@ -991,7 +991,7 @@ nfs_doio(bp, p)
 	    struct vm_page *pgs[npages];
 	    boolean_t needcommit = TRUE;
 
-	    if ((bp->b_flags & B_ASYNC) != 0 && NFS_ISV3(vp)) {
+	    if ((bp->b_flags & B_STABLE) == 0 && NFS_ISV3(vp)) {
 		    iomode = NFSV3WRITE_UNSTABLE;
 	    } else {
 		    iomode = NFSV3WRITE_FILESYNC;
Index: nfs/nfs_node.c
===================================================================
RCS file: /cvs/NetBSD/src/sys/nfs/nfs_node.c,v
retrieving revision 1.56
diff -u -p -r1.56 nfs_node.c
--- nfs/nfs_node.c	2002/12/01 23:02:10	1.56
+++ nfs/nfs_node.c	2003/01/22 12:34:32
@@ -79,7 +79,8 @@ extern int prtactive;
 
 void nfs_gop_size(struct vnode *, off_t, off_t *);
 int nfs_gop_alloc(struct vnode *, off_t, off_t, int, struct ucred *);
-int nfs_gop_write(struct vnode *, struct vm_page **, int, int);
+int nfs_gop_write(struct vnode *, struct vm_page **, int, int,
+    struct genfs_iodesc *);
 
 struct genfs_ops nfs_genfsops = {
 	nfs_gop_size,
@@ -323,12 +324,13 @@ nfs_gop_alloc(struct vnode *vp, off_t of
 }
 
 int
-nfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags)
+nfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags,
+    struct genfs_iodesc *iodesc)
 {
 	int i;
 
 	for (i = 0; i < npages; i++) {
 		pmap_page_protect(pgs[i], VM_PROT_READ);
 	}
-	return genfs_gop_write(vp, pgs, npages, flags);
+	return genfs_gop_write(vp, pgs, npages, flags, iodesc);
 }
Index: nfs/nfsnode.h
===================================================================
RCS file: /cvs/NetBSD/src/sys/nfs/nfsnode.h,v
retrieving revision 1.37
diff -u -p -r1.37 nfsnode.h
--- nfs/nfsnode.h	2002/12/01 23:02:11	1.37
+++ nfs/nfsnode.h	2003/01/22 12:34:33
@@ -244,7 +244,8 @@ int	nfs_truncate	__P((void *));
 int	nfs_update	__P((void *));
 int	nfs_getpages	__P((void *));
 int	nfs_putpages	__P((void *));
-int	nfs_gop_write(struct vnode *, struct vm_page **, int, int);
+int	nfs_gop_write(struct vnode *, struct vm_page **, int, int,
+    struct genfs_iodesc *);
 int	nfs_kqfilter	__P((void *));
 
 extern int (**nfsv2_vnodeop_p) __P((void *));
Index: miscfs/genfs/genfs_node.h
===================================================================
RCS file: /cvs/NetBSD/src/sys/miscfs/genfs/genfs_node.h,v
retrieving revision 1.3
diff -u -p -r1.3 genfs_node.h
--- miscfs/genfs/genfs_node.h	2001/12/18 07:49:36	1.3
+++ miscfs/genfs/genfs_node.h	2003/01/22 12:34:33
@@ -35,18 +35,24 @@
 
 struct vm_page;
 
+struct genfs_iodesc {
+	volatile int gid_iocount;	/* number of pending i/o */
+	int gid_error;		/* error number if any */
+};
+
 struct genfs_ops {
 	void	(*gop_size)(struct vnode *, off_t, off_t *);
 	int	(*gop_alloc)(struct vnode *, off_t, off_t, int, struct ucred *);
-	int	(*gop_write)(struct vnode *, struct vm_page **, int, int);
+	int	(*gop_write)(struct vnode *, struct vm_page **, int, int,
+	    struct genfs_iodesc *);
 };
 
 #define GOP_SIZE(vp, size, eobp) \
 	(*VTOG(vp)->g_op->gop_size)((vp), (size), (eobp))
 #define GOP_ALLOC(vp, off, len, flags, cred) \
 	(*VTOG(vp)->g_op->gop_alloc)((vp), (off), (len), (flags), (cred))
-#define GOP_WRITE(vp, pgs, npages, flags) \
-	(*VTOG(vp)->g_op->gop_write)((vp), (pgs), (npages), (flags))
+#define GOP_WRITE(vp, pgs, npages, flags, iodesc) \
+	(*VTOG(vp)->g_op->gop_write)((vp), (pgs), (npages), (flags), (iodesc))
 
 struct genfs_node {
 	struct genfs_ops	*g_op;		/* ops vector */
@@ -57,7 +63,12 @@ struct genfs_node {
 
 void	genfs_size(struct vnode *, off_t, off_t *);
 void	genfs_node_init(struct vnode *, struct genfs_ops *);
-int	genfs_gop_write(struct vnode *, struct vm_page **, int, int);
-int	genfs_compat_gop_write(struct vnode *, struct vm_page **, int, int);
+int	genfs_gop_write(struct vnode *, struct vm_page **, int, int,
+    struct genfs_iodesc *);
+int	genfs_compat_gop_write(struct vnode *, struct vm_page **, int, int,
+    struct genfs_iodesc *);
+
+void	genfs_aio_biodone(struct buf *);
+int	genfs_iowait(const struct genfs_iodesc *);
 
 #endif	/* _MISCFS_GENFS_GENFS_NODE_H_ */
Index: miscfs/genfs/genfs_vnops.c
===================================================================
RCS file: /cvs/NetBSD/src/sys/miscfs/genfs/genfs_vnops.c,v
retrieving revision 1.70
diff -u -p -r1.70 genfs_vnops.c
--- miscfs/genfs/genfs_vnops.c	2003/01/21 00:01:14	1.70
+++ miscfs/genfs/genfs_vnops.c	2003/01/22 12:34:33
@@ -1028,6 +1028,7 @@ genfs_putpages(void *v)
 	struct vnode *vp = ap->a_vp;
 	struct uvm_object *uobj = &vp->v_uobj;
 	struct simplelock *slock = &uobj->vmobjlock;
+	struct genfs_iodesc *iodesc = NULL;
 	off_t startoff = ap->a_offlo;
 	off_t endoff = ap->a_offhi;
 	off_t off;
@@ -1039,6 +1040,7 @@ genfs_putpages(void *v)
 	boolean_t wasclean, by_list, needs_clean, yield;
 	boolean_t async = (flags & PGO_SYNCIO) == 0;
 	boolean_t pagedaemon = curproc == uvm.pagedaemon_proc;
+	boolean_t metbusy = FALSE;
 	struct lwp *l = curlwp ? curlwp : &lwp0;
 
 	UVMHIST_FUNC("genfs_putpages"); UVMHIST_CALLED(ubchist);
@@ -1108,18 +1110,28 @@ genfs_putpages(void *v)
 		KASSERT(pg == NULL ||
 		    (pg->flags & (PG_RELEASED|PG_PAGEOUT)) == 0 ||
 		    (pg->flags & PG_BUSY) != 0);
+
 		if (by_list) {
+			boolean_t outofrange;
+
 			if (pg == &endmp) {
 				break;
 			}
-			if (pg->offset < startoff || pg->offset >= endoff ||
+
+			outofrange = (pg->offset < startoff ||
+			    pg->offset >= endoff);
+			if (outofrange ||
 			    pg->flags & (PG_RELEASED|PG_PAGEOUT)) {
+				if (!outofrange)
+					metbusy = TRUE;
 				pg = TAILQ_NEXT(pg, listq);
 				continue;
 			}
 			off = pg->offset;
 		} else if (pg == NULL ||
 		    pg->flags & (PG_RELEASED|PG_PAGEOUT)) {
+			if (pg != NULL)
+				metbusy = TRUE;
 			off += PAGE_SIZE;
 			if (off < endoff) {
 				pg = uvm_pagelookup(uobj, off);
@@ -1287,7 +1299,22 @@ genfs_putpages(void *v)
 				    listq);
 			}
 			simple_unlock(slock);
-			error = GOP_WRITE(vp, pgs, npages, flags);
+			if (!async && iodesc == NULL) {
+				/*
+				 * XXX should use stack or pool? 
+				 */
+				MALLOC(iodesc, struct genfs_iodesc *,
+				    sizeof(*iodesc), M_TEMP, M_WAITOK);
+				/*
+				 * set gid_iocount to 1 now so that
+				 * i/o completion event can't occur
+				 * before we issue all writes.
+				 * see below.
+				 */
+				iodesc->gid_iocount = 1;
+				iodesc->gid_error = 0;
+			}
+			error = GOP_WRITE(vp, pgs, npages, flags, iodesc);
 			simple_lock(slock);
 			if (by_list) {
 				pg = TAILQ_NEXT(&curmp, listq);
@@ -1338,23 +1365,45 @@ genfs_putpages(void *v)
 		vp->v_flag &= ~VONWORKLST;
 		LIST_REMOVE(vp, v_synclist);
 	}
-	splx(s);
-	if (!wasclean && !async) {
-		s = splbio();
+	if (iodesc != NULL && metbusy) {
+		KASSERT(!async);
+		KASSERT(!wasclean);
+
+		/*
+		 * If we met pages being paged out by other threads,
+		 * we should wait for all outputs.
+		 */
 		while (vp->v_numoutput != 0) {
 			vp->v_flag |= VBWAIT;
 			UVM_UNLOCK_AND_WAIT(&vp->v_numoutput, slock, FALSE,
 			    "genput2", 0);
 			simple_lock(slock);
 		}
+		/*
+		 * As we drained all writes for this vnode,
+		 * ours was also completed, of course.
+		 */
+		KASSERT(iodesc->gid_iocount == 1);
+	}
+	splx(s);
+	simple_unlock(slock);
+	if (iodesc != NULL) {
+		KASSERT(!async);
+		KASSERT(!wasclean);
+
+		s = splbio();
+		KASSERT(iodesc->gid_iocount >= 1);
+		iodesc->gid_iocount--;
+		error = genfs_iowait(iodesc);
 		splx(s);
+		FREE(iodesc, M_TEMP);
 	}
-	simple_unlock(&uobj->vmobjlock);
 	return (error);
 }
 
 int
-genfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags)
+genfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags,
+    struct genfs_iodesc *iodesc)
 {
 	int s, error, run;
 	int fs_bshift, dev_bshift;
@@ -1367,6 +1416,7 @@ genfs_gop_write(struct vnode *vp, struct
 	struct vnode *devvp;
 	boolean_t async = (flags & PGO_SYNCIO) == 0;
 	UVMHIST_FUNC("genfs_gop_write"); UVMHIST_CALLED(ubchist);
+	KASSERT((async && iodesc == NULL) || (!async && iodesc != NULL));
 
 	UVMHIST_LOG(ubchist, "vp %p pgs %p npages %d flags 0x%x",
 	    vp, pgs, npages, flags);
@@ -1398,10 +1448,19 @@ genfs_gop_write(struct vnode *vp, struct
 	mbp->b_bufsize = npages << PAGE_SHIFT;
 	mbp->b_data = (void *)kva;
 	mbp->b_resid = mbp->b_bcount = bytes;
-	mbp->b_flags = B_BUSY|B_WRITE|B_AGE| (async ? (B_CALL|B_ASYNC) : 0);
-	mbp->b_iodone = uvm_aio_biodone;
+	mbp->b_flags = B_BUSY|B_WRITE|B_AGE|B_CALL|B_ASYNC;
 	mbp->b_vp = vp;
 	LIST_INIT(&mbp->b_dep);
+	if (iodesc) {
+		mbp->b_iodone = genfs_aio_biodone;
+		mbp->b_private = iodesc;
+		s = splbio();
+		iodesc->gid_iocount++;
+		splx(s);
+	}
+	else {
+		mbp->b_iodone = uvm_aio_biodone;
+	}
 
 	bp = NULL;
 	for (offset = startoffset;
@@ -1438,11 +1497,11 @@ genfs_gop_write(struct vnode *vp, struct
 			bp->b_resid = bp->b_bcount = iobytes;
 			bp->b_flags = B_BUSY|B_WRITE|B_CALL|B_ASYNC;
 			bp->b_iodone = uvm_aio_biodone1;
+			bp->b_private = mbp;
 			bp->b_vp = vp;
 			LIST_INIT(&bp->b_dep);
 		}
 		bp->b_lblkno = 0;
-		bp->b_private = mbp;
 		if (devvp->v_type == VBLK) {
 			bp->b_dev = devvp->v_rdev;
 		}
@@ -1450,6 +1509,9 @@ genfs_gop_write(struct vnode *vp, struct
 		/* adjust physical blkno for partial blocks */
 		bp->b_blkno = blkno + ((offset - ((off_t)lbn << fs_bshift)) >>
 		    dev_bshift);
+		if (!async) {
+			bp->b_flags |= B_STABLE;
+		}
 		UVMHIST_LOG(ubchist,
 		    "vp %p offset 0x%x bcount 0x%x blkno 0x%x",
 		    vp, offset, bp->b_bcount, bp->b_blkno);
@@ -1472,9 +1534,10 @@ genfs_gop_write(struct vnode *vp, struct
 		UVMHIST_LOG(ubchist, "returning 0 (async)", 0,0,0,0);
 		return (0);
 	}
-	UVMHIST_LOG(ubchist, "waiting for mbp %p", mbp,0,0,0);
-	error = biowait(mbp);
-	uvm_aio_aiodone(mbp);
+	/*
+	 * We don't have to wait for I/O completions even for PGO_SYNCIO
+	 * because genfs_putpages will wait for us.
+	 */
 	UVMHIST_LOG(ubchist, "returning, error %d", error,0,0,0);
 	return (error);
 }
@@ -1607,9 +1670,10 @@ genfs_compat_getpages(void *v)
 	return (error);
 }
 
+/* ARGSUSED */
 int
 genfs_compat_gop_write(struct vnode *vp, struct vm_page **pgs, int npages,
-    int flags)
+    int flags, struct genfs_iodesc *iodesc)
 {
 	off_t offset;
 	struct iovec iov;
@@ -1730,4 +1794,48 @@ genfs_kqfilter(void *v)
 	SLIST_INSERT_HEAD(&vp->v_klist, kn, kn_selnext);
 
 	return (0);
+}
+
+/*
+ * biodone routine for GOP_WRITE.
+ *
+ * must be called at splbio.
+ * XXX interlock
+ */
+void
+genfs_aio_biodone(struct buf *bp)
+{
+	struct genfs_iodesc *iodesc = bp->b_private;
+
+	KASSERT(iodesc->gid_iocount > 0);
+
+	if (bp->b_flags & B_ERROR) {
+		iodesc->gid_error = (bp->b_error ? bp->b_error : EIO);
+	}
+
+	if ((--iodesc->gid_iocount) == 0) {
+		wakeup(iodesc);
+	}
+
+	uvm_aio_biodone(bp);
+}
+
+/*
+ * wait for i/o completion.
+ *
+ * must be called at splbio.
+ * XXX interlock
+ */
+int
+genfs_iowait(const struct genfs_iodesc *iodesc)
+{
+
+	while (iodesc->gid_iocount > 0) {
+		/* LINTED const cast away */
+		void *waitpoint = (void *)iodesc;
+
+		(void)ltsleep(waitpoint, PRIBIO + 1, "giowait", 0, 0);
+	}
+
+	return iodesc->gid_error;
 }

--NextPart-20030122212753-0037300--