Subject: Re: genfs_putpages with PGO_SYNCIO
To: None <enami@sm.sony.co.jp>
From: YAMAMOTO Takashi <yamt@mwd.biglobe.ne.jp>
List: tech-kern
Date: 01/16/2003 23:19:09
--NextPart-20030116231758-0032500
Content-Type: Text/Plain; charset=us-ascii

hi.

> > - introduce B_STABLE so that nfs doesn't have to (ab)use B_ASYNC to determine
> >   if stable write is needed or not.
> > - always do async i/o for GOP_WRITE.
> 
> Is an error correctly reported?  I guess caller of
> VOP_{PUT,GET}PAGES() expects an error to be returned on synchronous
> I/O.

no. thanks.

how about attached one?
it introduces third level nested buf so that we can get error via B_ERROR.

YAMAMOTO Takashi

--NextPart-20030116231758-0032500
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="genfs.syncio3.diff"

Index: sys/buf.h
===================================================================
RCS file: /cvs/NetBSD/src/sys/sys/buf.h,v
retrieving revision 1.55
diff -u -p -r1.55 buf.h
--- sys/buf.h	2002/10/06 17:05:56	1.55
+++ sys/buf.h	2003/01/16 14:16:55
@@ -217,6 +217,7 @@ struct buf {
 #define	B_WRITE		0x00000000	/* Write buffer (pseudo flag). */
 #define	B_XXX		0x02000000	/* Debugging flag. */
 #define	B_VFLUSH	0x04000000	/* Buffer is being synced. */
+#define	B_STABLE	0x08000000	/* Data should go to stable storage */
 
 /*
  * This structure describes a clustered I/O.  It is stored in the b_saveaddr
Index: miscfs/genfs/genfs_node.h
===================================================================
RCS file: /cvs/NetBSD/src/sys/miscfs/genfs/genfs_node.h,v
retrieving revision 1.3
diff -u -p -r1.3 genfs_node.h
--- miscfs/genfs/genfs_node.h	2001/12/18 07:49:36	1.3
+++ miscfs/genfs/genfs_node.h	2003/01/16 14:16:55
@@ -38,15 +38,16 @@ struct vm_page;
 struct genfs_ops {
 	void	(*gop_size)(struct vnode *, off_t, off_t *);
 	int	(*gop_alloc)(struct vnode *, off_t, off_t, int, struct ucred *);
-	int	(*gop_write)(struct vnode *, struct vm_page **, int, int);
+	int	(*gop_write)(struct vnode *, struct vm_page **, int, int,
+	    struct buf *);
 };
 
 #define GOP_SIZE(vp, size, eobp) \
 	(*VTOG(vp)->g_op->gop_size)((vp), (size), (eobp))
 #define GOP_ALLOC(vp, off, len, flags, cred) \
 	(*VTOG(vp)->g_op->gop_alloc)((vp), (off), (len), (flags), (cred))
-#define GOP_WRITE(vp, pgs, npages, flags) \
-	(*VTOG(vp)->g_op->gop_write)((vp), (pgs), (npages), (flags))
+#define GOP_WRITE(vp, pgs, npages, flags, pbp) \
+	(*VTOG(vp)->g_op->gop_write)((vp), (pgs), (npages), (flags), (pbp))
 
 struct genfs_node {
 	struct genfs_ops	*g_op;		/* ops vector */
@@ -57,7 +58,9 @@ struct genfs_node {
 
 void	genfs_size(struct vnode *, off_t, off_t *);
 void	genfs_node_init(struct vnode *, struct genfs_ops *);
-int	genfs_gop_write(struct vnode *, struct vm_page **, int, int);
-int	genfs_compat_gop_write(struct vnode *, struct vm_page **, int, int);
+int	genfs_gop_write(struct vnode *, struct vm_page **, int, int,
+    struct buf *);
+int	genfs_compat_gop_write(struct vnode *, struct vm_page **, int, int,
+    struct buf *);
 
 #endif	/* _MISCFS_GENFS_GENFS_NODE_H_ */
Index: miscfs/genfs/genfs_vnops.c
===================================================================
RCS file: /cvs/NetBSD/src/sys/miscfs/genfs/genfs_vnops.c,v
retrieving revision 1.68
diff -u -p -r1.68 genfs_vnops.c
--- miscfs/genfs/genfs_vnops.c	2002/11/15 14:01:57	1.68
+++ miscfs/genfs/genfs_vnops.c	2003/01/16 14:16:55
@@ -1024,6 +1024,7 @@ genfs_putpages(void *v)
 	struct vnode *vp = ap->a_vp;
 	struct uvm_object *uobj = &vp->v_uobj;
 	struct simplelock *slock = &uobj->vmobjlock;
+	struct buf *pbp = NULL;
 	off_t startoff = ap->a_offlo;
 	off_t endoff = ap->a_offhi;
 	off_t off;
@@ -1035,6 +1036,7 @@ genfs_putpages(void *v)
 	boolean_t wasclean, by_list, needs_clean, yield;
 	boolean_t async = (flags & PGO_SYNCIO) == 0;
 	boolean_t pagedaemon = curproc == uvm.pagedaemon_proc;
+	boolean_t metbusy = FALSE;
 	UVMHIST_FUNC("genfs_putpages"); UVMHIST_CALLED(ubchist);
 
 	KASSERT(flags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE));
@@ -1102,18 +1104,23 @@ genfs_putpages(void *v)
 		KASSERT(pg == NULL ||
 		    (pg->flags & (PG_RELEASED|PG_PAGEOUT)) == 0 ||
 		    (pg->flags & PG_BUSY) != 0);
+
 		if (by_list) {
 			if (pg == &endmp) {
 				break;
 			}
 			if (pg->offset < startoff || pg->offset >= endoff ||
 			    pg->flags & (PG_RELEASED|PG_PAGEOUT)) {
+				if (pg->flags & (PG_RELEASED|PG_PAGEOUT))
+					metbusy = TRUE;
 				pg = TAILQ_NEXT(pg, listq);
 				continue;
 			}
 			off = pg->offset;
 		} else if (pg == NULL ||
 		    pg->flags & (PG_RELEASED|PG_PAGEOUT)) {
+			if (pg != NULL)
+				metbusy = TRUE;
 			off += PAGE_SIZE;
 			if (off < endoff) {
 				pg = uvm_pagelookup(uobj, off);
@@ -1275,13 +1282,33 @@ genfs_putpages(void *v)
 			 * start the i/o.  if we're traversing by list,
 			 * keep our place in the list with a marker page.
 			 */
+			if (!async && pbp == NULL) {
+				s = splbio();
+				vp->v_numoutput++;
+				pbp = pool_get(&bufpool, PR_WAITOK);
+				splx(s);
+				pbp->b_bufsize = 0;
+				pbp->b_data = NULL;
+				pbp->b_bcount = 0;
+				pbp->b_flags = B_BUSY|B_WRITE;
+				pbp->b_iodone = NULL;
+				pbp->b_vp = vp;
+				LIST_INIT(&pbp->b_dep);
+
+				/*
+				 * set b_resid to 1 now so that i/o completion
+				 * can't occur before we issue all writes.
+				 * see below.
+				 */
+				pbp->b_resid = 1;
+			}
 
 			if (by_list) {
 				TAILQ_INSERT_AFTER(&uobj->memq, pg, &curmp,
 				    listq);
 			}
 			simple_unlock(slock);
-			error = GOP_WRITE(vp, pgs, npages, flags);
+			error = GOP_WRITE(vp, pgs, npages, flags, pbp);
 			simple_lock(slock);
 			if (by_list) {
 				pg = TAILQ_NEXT(&curmp, listq);
@@ -1333,22 +1360,49 @@ genfs_putpages(void *v)
 		LIST_REMOVE(vp, v_synclist);
 	}
 	splx(s);
-	if (!wasclean && !async) {
+	if (pbp != NULL) {
+		KASSERT(!async);
+		KASSERT(!wasclean);
+
 		s = splbio();
-		while (vp->v_numoutput != 0) {
-			vp->v_flag |= VBWAIT;
-			UVM_UNLOCK_AND_WAIT(&vp->v_numoutput, slock, FALSE,
-			    "genput2", 0);
-			simple_lock(slock);
+		pbp->b_resid--;
+		/*
+		 * b_resid == 0 here means all i/o has already completed.
+		 */
+		if (pbp->b_resid == 0) {
+			biodone(pbp);
+		}
+
+		/*
+		 * If we met pages being paged out by other threads,
+		 * we should wait for all outputs.
+		 */
+		if (metbusy) {
+			while (vp->v_numoutput != 0) {
+				vp->v_flag |= VBWAIT;
+				UVM_UNLOCK_AND_WAIT(&vp->v_numoutput, slock,
+				    FALSE, "genput2", 0);
+				simple_lock(slock);
+			}
+			KASSERT(pbp->b_resid == 0);
 		}
 		splx(s);
 	}
-	simple_unlock(&uobj->vmobjlock);
+	simple_unlock(slock);
+	if (pbp != NULL) {
+		KASSERT(pbp->b_resid >= 0);
+
+		error = biowait(pbp);
+		s = splbio();
+		pool_put(&bufpool, pbp);
+		splx(s);
+	}
 	return (error);
 }
 
 int
-genfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags)
+genfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags,
+    struct buf *pbp)
 {
 	int s, error, run;
 	int fs_bshift, dev_bshift;
@@ -1361,6 +1415,7 @@ genfs_gop_write(struct vnode *vp, struct
 	struct vnode *devvp;
 	boolean_t async = (flags & PGO_SYNCIO) == 0;
 	UVMHIST_FUNC("genfs_gop_write"); UVMHIST_CALLED(ubchist);
+	KASSERT((async && pbp == NULL) || (!async && pbp != NULL));
 
 	UVMHIST_LOG(ubchist, "vp %p pgs %p npages %d flags 0x%x",
 	    vp, pgs, npages, flags);
@@ -1392,10 +1447,19 @@ genfs_gop_write(struct vnode *vp, struct
 	mbp->b_bufsize = npages << PAGE_SHIFT;
 	mbp->b_data = (void *)kva;
 	mbp->b_resid = mbp->b_bcount = bytes;
-	mbp->b_flags = B_BUSY|B_WRITE|B_AGE| (async ? (B_CALL|B_ASYNC) : 0);
-	mbp->b_iodone = uvm_aio_biodone;
+	mbp->b_flags = B_BUSY|B_WRITE|B_AGE|B_CALL|B_ASYNC;
 	mbp->b_vp = vp;
 	LIST_INIT(&mbp->b_dep);
+	if (pbp) {
+		mbp->b_iodone = uvm_aio_biodone2;
+		mbp->b_private = pbp;
+		s = splbio();
+		pbp->b_resid += mbp->b_bcount;
+		splx(s);
+	}
+	else {
+		mbp->b_iodone = uvm_aio_biodone;
+	}
 
 	bp = NULL;
 	for (offset = startoffset;
@@ -1432,11 +1496,11 @@ genfs_gop_write(struct vnode *vp, struct
 			bp->b_resid = bp->b_bcount = iobytes;
 			bp->b_flags = B_BUSY|B_WRITE|B_CALL|B_ASYNC;
 			bp->b_iodone = uvm_aio_biodone1;
+			bp->b_private = mbp;
 			bp->b_vp = vp;
 			LIST_INIT(&bp->b_dep);
 		}
 		bp->b_lblkno = 0;
-		bp->b_private = mbp;
 		if (devvp->v_type == VBLK) {
 			bp->b_dev = devvp->v_rdev;
 		}
@@ -1444,6 +1508,9 @@ genfs_gop_write(struct vnode *vp, struct
 		/* adjust physical blkno for partial blocks */
 		bp->b_blkno = blkno + ((offset - ((off_t)lbn << fs_bshift)) >>
 		    dev_bshift);
+		if (!async) {
+			bp->b_flags |= B_STABLE;
+		}
 		UVMHIST_LOG(ubchist,
 		    "vp %p offset 0x%x bcount 0x%x blkno 0x%x",
 		    vp, offset, bp->b_bcount, bp->b_blkno);
@@ -1466,9 +1533,10 @@ genfs_gop_write(struct vnode *vp, struct
 		UVMHIST_LOG(ubchist, "returning 0 (async)", 0,0,0,0);
 		return (0);
 	}
-	UVMHIST_LOG(ubchist, "waiting for mbp %p", mbp,0,0,0);
-	error = biowait(mbp);
-	uvm_aio_aiodone(mbp);
+	/*
+	 * We don't have to wait for I/O completions even for PGO_SYNCIO
+	 * because genfs_putpages will wait for us.
+	 */
 	UVMHIST_LOG(ubchist, "returning, error %d", error,0,0,0);
 	return (error);
 }
@@ -1601,9 +1669,10 @@ genfs_compat_getpages(void *v)
 	return (error);
 }
 
+/* ARGSUSED */
 int
 genfs_compat_gop_write(struct vnode *vp, struct vm_page **pgs, int npages,
-    int flags)
+    int flags, struct buf *pbp)
 {
 	off_t offset;
 	struct iovec iov;
Index: nfs/nfs_bio.c
===================================================================
RCS file: /cvs/NetBSD/src/sys/nfs/nfs_bio.c,v
retrieving revision 1.85
diff -u -p -r1.85 nfs_bio.c
--- nfs/nfs_bio.c	2002/10/29 10:15:16	1.85
+++ nfs/nfs_bio.c	2003/01/16 14:16:55
@@ -989,7 +989,7 @@ nfs_doio(bp, p)
 	    struct vm_page *pgs[npages];
 	    boolean_t needcommit = TRUE;
 
-	    if ((bp->b_flags & B_ASYNC) != 0 && NFS_ISV3(vp)) {
+	    if ((bp->b_flags & B_STABLE) == 0 && NFS_ISV3(vp)) {
 		    iomode = NFSV3WRITE_UNSTABLE;
 	    } else {
 		    iomode = NFSV3WRITE_FILESYNC;
Index: nfs/nfs_node.c
===================================================================
RCS file: /cvs/NetBSD/src/sys/nfs/nfs_node.c,v
retrieving revision 1.56
diff -u -p -r1.56 nfs_node.c
--- nfs/nfs_node.c	2002/12/01 23:02:10	1.56
+++ nfs/nfs_node.c	2003/01/16 14:16:55
@@ -79,7 +79,7 @@ extern int prtactive;
 
 void nfs_gop_size(struct vnode *, off_t, off_t *);
 int nfs_gop_alloc(struct vnode *, off_t, off_t, int, struct ucred *);
-int nfs_gop_write(struct vnode *, struct vm_page **, int, int);
+int nfs_gop_write(struct vnode *, struct vm_page **, int, int, struct buf *);
 
 struct genfs_ops nfs_genfsops = {
 	nfs_gop_size,
@@ -323,12 +323,13 @@ nfs_gop_alloc(struct vnode *vp, off_t of
 }
 
 int
-nfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags)
+nfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags,
+    struct buf *pbp)
 {
 	int i;
 
 	for (i = 0; i < npages; i++) {
 		pmap_page_protect(pgs[i], VM_PROT_READ);
 	}
-	return genfs_gop_write(vp, pgs, npages, flags);
+	return genfs_gop_write(vp, pgs, npages, flags, pbp);
 }
Index: nfs/nfsnode.h
===================================================================
RCS file: /cvs/NetBSD/src/sys/nfs/nfsnode.h,v
retrieving revision 1.37
diff -u -p -r1.37 nfsnode.h
--- nfs/nfsnode.h	2002/12/01 23:02:11	1.37
+++ nfs/nfsnode.h	2003/01/16 14:16:55
@@ -244,7 +244,8 @@ int	nfs_truncate	__P((void *));
 int	nfs_update	__P((void *));
 int	nfs_getpages	__P((void *));
 int	nfs_putpages	__P((void *));
-int	nfs_gop_write(struct vnode *, struct vm_page **, int, int);
+int	nfs_gop_write(struct vnode *, struct vm_page **, int, int,
+    struct buf *);
 int	nfs_kqfilter	__P((void *));
 
 extern int (**nfsv2_vnodeop_p) __P((void *));
Index: uvm/uvm_extern.h
===================================================================
RCS file: /cvs/NetBSD/src/sys/uvm/uvm_extern.h,v
retrieving revision 1.75
diff -u -p -r1.75 uvm_extern.h
--- uvm/uvm_extern.h	2002/12/11 07:10:20	1.75
+++ uvm/uvm_extern.h	2003/01/16 14:16:55
@@ -660,6 +660,7 @@ void			uvm_setpagesize __P((void));
 
 /* uvm_pager.c */
 void			uvm_aio_biodone1 __P((struct buf *));
+void			uvm_aio_biodone2 __P((struct buf *));
 void			uvm_aio_biodone __P((struct buf *));
 void			uvm_aio_aiodone __P((struct buf *));
 
Index: uvm/uvm_pager.c
===================================================================
RCS file: /cvs/NetBSD/src/sys/uvm/uvm_pager.c,v
retrieving revision 1.59
diff -u -p -r1.59 uvm_pager.c
--- uvm/uvm_pager.c	2002/11/09 20:09:52	1.59
+++ uvm/uvm_pager.c	2003/01/16 14:16:55
@@ -238,13 +238,13 @@ uvm_pagermapout(kva, npages)
 }
 
 /*
- * interrupt-context iodone handler for nested i/o bufs.
+ * common part of iodone handler for 'children' of nested i/o buf.
  *
  * => must be at splbio().
  */
 
 void
-uvm_aio_biodone1(bp)
+uvm_aio_nestedbiodone(bp)
 	struct buf *bp;
 {
 	struct buf *mbp = bp->b_private;
@@ -255,10 +255,40 @@ uvm_aio_biodone1(bp)
 		mbp->b_error = bp->b_error;
 	}
 	mbp->b_resid -= bp->b_bcount;
-	pool_put(&bufpool, bp);
+	KASSERT(mbp->b_resid >= 0);
 	if (mbp->b_resid == 0) {
 		biodone(mbp);
 	}
+}
+
+/*
+ * interrupt-context iodone handler for nested i/o bufs.
+ *
+ * => must be at splbio().
+ */
+
+void
+uvm_aio_biodone1(bp)
+	struct buf *bp;
+{
+
+	uvm_aio_nestedbiodone(bp);
+	pool_put(&bufpool, bp);
+}
+
+/*
+ * interrupt-context iodone handler for middle level bufs of nested i/o.
+ *
+ * => must be at splbio().
+ */
+
+void
+uvm_aio_biodone2(bp)
+	struct buf *bp;
+{
+
+	uvm_aio_nestedbiodone(bp);
+	uvm_aio_biodone(bp);
 }
 
 /*
Index: uvm/uvm_pager.h
===================================================================
RCS file: /cvs/NetBSD/src/sys/uvm/uvm_pager.h,v
retrieving revision 1.25
diff -u -p -r1.25 uvm_pager.h
--- uvm/uvm_pager.h	2002/03/25 02:08:10	1.25
+++ uvm/uvm_pager.h	2003/01/16 14:16:55
@@ -150,6 +150,7 @@ void	uvm_pager_init __P((void));
 PAGER_INLINE struct vm_page *uvm_pageratop __P((vaddr_t));
 vaddr_t	uvm_pagermapin __P((struct vm_page **, int, int));
 void	uvm_pagermapout __P((vaddr_t, int));
+void	uvm_aio_nestedbiodone __P((struct buf *));
 
 /* Flags to uvm_pagermapin() */
 #define	UVMPAGER_MAPIN_WAITOK	0x01	/* it's okay to wait */

--NextPart-20030116231758-0032500--