Subject: file overwrite performance after ubc
To: None <tech-kern@netbsd.org>
From: YAMAMOTO Takashi <yamt@mwd.biglobe.ne.jp>
List: tech-kern
Date: 05/21/2007 23:38:22
--NextPart-20070521233348-0389500
Content-Type: Text/Plain; charset=us-ascii

hi,

the attached patch is to improve file overwrite performance in common cases.
it also fixes PR/33152 and PR/36303.
(currently only ufs and nfs are implemented.)

YAMAMOTO Takashi

--NextPart-20070521233348-0389500
Content-Type: Text/Plain; charset=us-ascii
Content-Disposition: attachment; filename="a.diff"

Index: sys/vnode.h
===================================================================
RCS file: /cvsroot/src/sys/sys/vnode.h,v
retrieving revision 1.168
diff -u -p -r1.168 vnode.h
--- sys/vnode.h	8 Apr 2007 11:20:50 -0000	1.168
+++ sys/vnode.h	21 May 2007 14:29:36 -0000
@@ -98,6 +98,7 @@ struct vnode {
 #define	v_usecount	v_uobj.uo_refs
 #define	v_interlock	v_uobj.vmobjlock
 	voff_t		v_size;			/* size of file */
+	voff_t		v_writesize;		/* new size after write */
 	int		v_flag;			/* flags */
 	int		v_numoutput;		/* number of pending writes */
 	long		v_writecount;		/* reference count of writers */
Index: kern/vfs_subr.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vfs_subr.c,v
retrieving revision 1.287
diff -u -p -r1.287 vfs_subr.c
--- kern/vfs_subr.c	16 Apr 2007 05:14:54 -0000	1.287
+++ kern/vfs_subr.c	21 May 2007 14:29:37 -0000
@@ -594,7 +594,7 @@ getnewvnode(enum vtagtype tag, struct mo
 	KASSERT(uobj->pgops == &uvm_vnodeops);
 	KASSERT(uobj->uo_npages == 0);
 	KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
-	vp->v_size = VSIZENOTSET;
+	vp->v_size = vp->v_writesize = VSIZENOTSET;
 
 	if (mp && error != EDEADLK)
 		vfs_unbusy(mp);
@@ -2638,8 +2638,8 @@ vfs_vnode_print(struct vnode *vp, int fu
 	uvm_object_printit(&vp->v_uobj, full, pr);
 	bitmask_snprintf(vp->v_flag, vnode_flagbits, bf, sizeof(bf));
 	(*pr)("\nVNODE flags %s\n", bf);
-	(*pr)("mp %p numoutput %d size 0x%llx\n",
-	      vp->v_mount, vp->v_numoutput, vp->v_size);
+	(*pr)("mp %p numoutput %d size 0x%llx writesize 0x%llx\n",
+	      vp->v_mount, vp->v_numoutput, vp->v_size, vp->v_writesize);
 
 	(*pr)("data %p usecount %d writecount %ld holdcnt %ld numoutput %d\n",
 	      vp->v_data, vp->v_usecount, vp->v_writecount,
Index: uvm/uvm_bio.c
===================================================================
RCS file: /cvsroot/src/sys/uvm/uvm_bio.c,v
retrieving revision 1.57
diff -u -p -r1.57 uvm_bio.c
--- uvm/uvm_bio.c	7 May 2007 12:39:45 -0000	1.57
+++ uvm/uvm_bio.c	21 May 2007 14:29:37 -0000
@@ -632,6 +632,57 @@ ubc_release(void *va, int flags)
 	simple_unlock(&ubc_object.uobj.vmobjlock);
 }
 
+/*
+ * ubc_uiomove: move "todo" bytes between uio and uobj through ubc windows.
+ * after an error no more data is copied, but UBC_FAULTBUSY windows are
+ * still zeroed; only UBC_PARTIALOK ends the loop early.  returns the error.
+ */
+
+int
+ubc_uiomove(struct uvm_object *uobj, struct uio *uio, vsize_t todo, int flags)
+{
+	voff_t off;
+	const bool overwrite = (flags & UBC_FAULTBUSY) != 0;
+	int error;
+
+	KASSERT(todo <= uio->uio_resid);
+	KASSERT(((flags & UBC_WRITE) != 0 && uio->uio_rw == UIO_WRITE) ||
+	    ((flags & UBC_READ) != 0 && uio->uio_rw == UIO_READ));
+
+	off = uio->uio_offset;
+	error = 0;
+	while (todo > 0) {
+		vsize_t bytelen = todo;
+		void *win;
+
+		win = ubc_alloc(uobj, off, &bytelen, UVM_ADV_NORMAL, flags);
+		if (error == 0) {
+			error = uiomove(win, bytelen, uio);
+		}
+		if (error != 0 && overwrite) {
+			/*
+			 * if we haven't initialized the pages yet,
+			 * do it now.  it's safe to use memset here
+			 * because we just mapped the pages above.
+			 */
+			printf("%s: error=%d\n", __func__, error);
+			memset(win, 0, bytelen);
+		}
+		ubc_release(win, flags);
+		off += bytelen;
+		todo -= bytelen;
+		if (error != 0 && (flags & UBC_PARTIALOK) != 0) {
+			break;
+		}
+#if 0
+		if (!overwrite) {
+			break;
+		}
+#endif
+	}
+
+	return error;
+}
 
 #if 0 /* notused */
 /*
Index: uvm/uvm_extern.h
===================================================================
RCS file: /cvsroot/src/sys/uvm/uvm_extern.h,v
retrieving revision 1.129
diff -u -p -r1.129 uvm_extern.h
--- uvm/uvm_extern.h	24 Mar 2007 21:15:39 -0000	1.129
+++ uvm/uvm_extern.h	21 May 2007 14:29:37 -0000
@@ -189,14 +189,19 @@ typedef voff_t pgoff_t;		/* XXX: number 
 /*
  * flags for ubc_alloc()
  */
-#define UBC_READ	0x01
-#define UBC_WRITE	0x02
-#define UBC_FAULTBUSY	0x04
+#define UBC_READ	0x001
+#define UBC_WRITE	0x002
+#define UBC_FAULTBUSY	0x004
 
 /*
  * flags for ubc_release()
  */
-#define UBC_UNMAP	0x01
+#define UBC_UNMAP	0x010
+
+/*
+ * flags for ubc_uiomove()
+ */
+#define	UBC_PARTIALOK	0x100
 
 /*
  * helpers for calling ubc_release()
@@ -556,6 +561,8 @@ void *			ubc_alloc(struct uvm_object *, 
 			    int);
 void			ubc_release(void *, int);
 void			ubc_flush(struct uvm_object *, voff_t, voff_t);
+int			ubc_uiomove(struct uvm_object *, struct uio *, vsize_t,
+			    int);
 
 /* uvm_fault.c */
 #define uvm_fault(m, a, p) uvm_fault_internal(m, a, p, 0)
@@ -703,6 +710,7 @@ void			uvm_deallocate(struct vm_map *, v
 
 /* uvm_vnode.c */
 void			uvm_vnp_setsize(struct vnode *, voff_t);
+void			uvm_vnp_setwritesize(struct vnode *, voff_t);
 void			uvm_vnp_sync(struct mount *);
 struct uvm_object	*uvn_attach(void *, vm_prot_t);
 int			uvn_findpages(struct uvm_object *, voff_t,
Index: uvm/uvm_vnode.c
===================================================================
RCS file: /cvsroot/src/sys/uvm/uvm_vnode.c,v
retrieving revision 1.81
diff -u -p -r1.81 uvm_vnode.c
--- uvm/uvm_vnode.c	4 Mar 2007 06:03:49 -0000	1.81
+++ uvm/uvm_vnode.c	21 May 2007 14:29:37 -0000
@@ -212,7 +212,7 @@ uvn_attach(void *arg, vm_prot_t accesspr
 		UVMHIST_LOG(maphist,"<- done (VOP_GETATTR FAILED!)", 0,0,0,0);
 		return(NULL);
 	}
-	vp->v_size = used_vnode_size;
+	vp->v_size = vp->v_writesize = used_vnode_size;
 
 	}
 
@@ -470,14 +470,29 @@ uvm_vnp_setsize(struct vnode *vp, voff_t
 	 */
 
 	oldsize = vp->v_size;
+	KASSERT(oldsize <= vp->v_writesize);
+	KASSERT(oldsize == vp->v_writesize || vp->v_writesize == newsize);
 	if (oldsize > pgend && oldsize != VSIZENOTSET) {
 		(void) uvn_put(uobj, pgend, 0, PGO_FREE | PGO_SYNCIO);
 		simple_lock(&uobj->vmobjlock);
 	}
-	vp->v_size = newsize;
+	vp->v_size = vp->v_writesize = newsize;
 	simple_unlock(&uobj->vmobjlock);
 }
 
+void
+uvm_vnp_setwritesize(struct vnode *vp, voff_t newsize)
+{
+
+	simple_lock(&vp->v_interlock);
+	KASSERT(vp->v_size != VSIZENOTSET);	/* both sizes must be set */
+	KASSERT(vp->v_writesize != VSIZENOTSET);
+	KASSERT(vp->v_size <= vp->v_writesize);
+	KASSERT(vp->v_size <= newsize);		/* never below v_size */
+	vp->v_writesize = newsize;		/* v_size is not changed here */
+	simple_unlock(&vp->v_interlock);
+}
+
 /*
  * uvm_vnp_zerorange:  set a range of bytes in a file to zero.
  */
Index: miscfs/genfs/genfs_vnops.c
===================================================================
RCS file: /cvsroot/src/sys/miscfs/genfs/genfs_vnops.c,v
retrieving revision 1.153
diff -u -p -r1.153 genfs_vnops.c
--- miscfs/genfs/genfs_vnops.c	17 May 2007 07:26:22 -0000	1.153
+++ miscfs/genfs/genfs_vnops.c	21 May 2007 14:29:38 -0000
@@ -425,7 +425,7 @@ genfs_getpages(void *v)
 	int i, error, npages, orignpages, npgs, run, ridx, pidx, pcount;
 	int fs_bshift, fs_bsize, dev_bshift;
 	int flags = ap->a_flags;
-	size_t bytes, iobytes, tailbytes, totalbytes, skipbytes;
+	size_t bytes, iobytes, tailstart, tailbytes, totalbytes, skipbytes;
 	vaddr_t kva;
 	struct buf *bp, *mbp;
 	struct vnode *vp = ap->a_vp;
@@ -465,9 +465,19 @@ startover:
 	orignpages = *ap->a_count;
 	GOP_SIZE(vp, origvsize, &diskeof, 0);
 	if (flags & PGO_PASTEOF) {
+#if defined(DIAGNOSTIC)
+		off_t writeeof;
+#endif /* defined(DIAGNOSTIC) */
+
 		newsize = MAX(origvsize,
 		    origoffset + (orignpages << PAGE_SHIFT));
 		GOP_SIZE(vp, newsize, &memeof, GOP_SIZE_MEM);
+#if defined(DIAGNOSTIC)
+		GOP_SIZE(vp, vp->v_writesize, &writeeof, GOP_SIZE_MEM);
+		if (newsize > round_page(writeeof)) {
+			panic("%s: past eof", __func__);
+		}
+#endif /* defined(DIAGNOSTIC) */
 	} else {
 		GOP_SIZE(vp, origvsize, &memeof, GOP_SIZE_MEM);
 	}
@@ -727,21 +737,24 @@ startover:
 		BIO_SETPRIO(mbp, BPRIO_TIMECRITICAL);
 
 	/*
+	 * XXX update
 	 * if EOF is in the middle of the range, zero the part past EOF.
 	 * if the page including EOF is not PG_FAKE, skip over it since
 	 * in that case it has valid data that we need to preserve.
 	 */
 
-	if (tailbytes > 0) {
-		size_t tailstart = bytes;
-
-		if ((pgs[bytes >> PAGE_SHIFT]->flags & PG_FAKE) == 0) {
-			tailstart = round_page(tailstart);
-			tailbytes -= tailstart - bytes;
-		}
-		UVMHIST_LOG(ubchist, "tailbytes %p 0x%x 0x%x",
-		    kva, tailstart, tailbytes,0);
-		memset((void *)(kva + tailstart), 0, tailbytes);
+	tailstart = bytes;
+	while (tailbytes > 0) {
+		const int len = PAGE_SIZE - (tailstart & PAGE_MASK);
+
+		KASSERT(len <= tailbytes);
+		if ((pgs[tailstart >> PAGE_SHIFT]->flags & PG_FAKE) != 0) {
+			memset((void *)(kva + tailstart), 0, len);
+			UVMHIST_LOG(ubchist, "tailbytes %p 0x%x 0x%x",
+			    kva, tailstart, len, 0);
+		}
+		tailstart += len;
+		tailbytes -= len;
 	}
 
 	/*
@@ -1514,7 +1527,8 @@ genfs_do_io(struct vnode *vp, off_t off,
 	UVMHIST_LOG(ubchist, "vp %p kva %p len 0x%x flags 0x%x",
 	    vp, kva, len, flags);
 
-	GOP_SIZE(vp, vp->v_size, &eof, 0);
+	KASSERT(vp->v_size <= vp->v_writesize);
+	GOP_SIZE(vp, vp->v_writesize, &eof, 0);
 	if (vp->v_type != VBLK) {
 		fs_bshift = vp->v_mount->mnt_fs_bshift;
 		dev_bshift = vp->v_mount->mnt_dev_bshift;
Index: miscfs/specfs/spec_vnops.c
===================================================================
RCS file: /cvsroot/src/sys/miscfs/specfs/spec_vnops.c,v
retrieving revision 1.98
diff -u -p -r1.98 spec_vnops.c
--- miscfs/specfs/spec_vnops.c	4 Mar 2007 06:03:14 -0000	1.98
+++ miscfs/specfs/spec_vnops.c	21 May 2007 14:29:38 -0000
@@ -245,7 +245,8 @@ spec_open(v)
 	if (error)
 		return error;
 	if (!(*d_ioctl)(vp->v_rdev, DIOCGPART, (void *)&pi, FREAD, curlwp))
-		vp->v_size = (voff_t)pi.disklab->d_secsize * pi.part->p_size;
+		uvm_vnp_setsize(vp,
+		    (voff_t)pi.disklab->d_secsize * pi.part->p_size);
 	return 0;
 }
 
Index: ufs/ffs/ffs_vnops.c
===================================================================
RCS file: /cvsroot/src/sys/ufs/ffs/ffs_vnops.c,v
retrieving revision 1.87
diff -u -p -r1.87 ffs_vnops.c
--- ufs/ffs/ffs_vnops.c	17 May 2007 07:26:23 -0000	1.87
+++ ufs/ffs/ffs_vnops.c	21 May 2007 14:29:38 -0000
@@ -110,7 +110,7 @@ const struct vnodeopv_entry_desc ffs_vno
 	{ &vop_pathconf_desc, ufs_pathconf },		/* pathconf */
 	{ &vop_advlock_desc, ufs_advlock },		/* advlock */
 	{ &vop_bwrite_desc, vn_bwrite },		/* bwrite */
-	{ &vop_getpages_desc, ffs_getpages },		/* getpages */
+	{ &vop_getpages_desc, genfs_getpages },		/* getpages */
 	{ &vop_putpages_desc, genfs_putpages },		/* putpages */
 	{ &vop_openextattr_desc, ffs_openextattr },	/* openextattr */
 	{ &vop_closeextattr_desc, ffs_closeextattr },	/* closeextattr */
@@ -514,6 +514,7 @@ ffs_reclaim(void *v)
 	return (0);
 }
 
+#if 0
 int
 ffs_getpages(void *v)
 {
@@ -548,6 +549,7 @@ ffs_getpages(void *v)
 	}
 	return genfs_getpages(v);
 }
+#endif
 
 /*
  * Return the last logical file offset that should be written for this file
Index: ufs/ufs/ufs_readwrite.c
===================================================================
RCS file: /cvsroot/src/sys/ufs/ufs/ufs_readwrite.c,v
retrieving revision 1.78
diff -u -p -r1.78 ufs_readwrite.c
--- ufs/ufs/ufs_readwrite.c	17 May 2007 07:26:23 -0000	1.78
+++ ufs/ufs/ufs_readwrite.c	21 May 2007 14:29:38 -0000
@@ -214,9 +214,7 @@ WRITE(void *v)
 	off_t osize, origoff, oldoff, preallocoff, endallocoff, nsize;
 	int blkoffset, error, flags, ioflag, resid, size, xfersize;
 	int aflag;
-	int ubc_alloc_flags, ubc_release_flags;
 	int extended=0;
-	void *win;
 	vsize_t bytelen;
 	bool async;
 	bool usepc = false;
@@ -314,20 +312,20 @@ WRITE(void *v)
 		off_t eob;
 
 		eob = blkroundup(fs, osize);
+		uvm_vnp_setwritesize(vp, eob);
 		error = ufs_balloc_range(vp, osize, eob - osize, cred, aflag);
 		if (error)
 			goto out;
 		if (flags & B_SYNC) {
-			vp->v_size = eob;
 			simple_lock(&vp->v_interlock);
 			VOP_PUTPAGES(vp, trunc_page(osize & fs->fs_bmask),
 			    round_page(eob), PGO_CLEANIT | PGO_SYNCIO);
 		}
 	}
 
-	ubc_alloc_flags = UBC_WRITE;
 	while (uio->uio_resid > 0) {
-		bool extending; /* if we're extending a whole block */
+		int ubc_flags = UBC_WRITE;
+		bool overwrite; /* if we're overwriting a whole block */
 		off_t newoff;
 
 		if (ioflag & IO_DIRECT) {
@@ -348,15 +346,31 @@ WRITE(void *v)
 		 * since the new blocks will be inaccessible until the write
 		 * is complete.
 		 */
-		extending = uio->uio_offset >= preallocoff &&
+		overwrite = uio->uio_offset >= preallocoff &&
 		    uio->uio_offset < endallocoff;
+		if (!overwrite && (vp->v_flag & VMAPPED) == 0 &&
+		    blkoff(fs, uio->uio_offset) == 0 &&
+		    (uio->uio_offset & PAGE_MASK) == 0) {
+			vsize_t len;
+
+			len = trunc_page(bytelen);
+			len -= blkoff(fs, len);
+			if (len > 0) {
+				overwrite = true;
+				bytelen = len;
+			}
+		}
 
-		if (!extending) {
+		newoff = oldoff + bytelen;
+		if (vp->v_size < newoff) {
+			uvm_vnp_setwritesize(vp, newoff);
+		}
+
+		if (!overwrite) {
 			error = ufs_balloc_range(vp, uio->uio_offset, bytelen,
 			    cred, aflag);
 			if (error)
 				break;
-			ubc_alloc_flags &= ~UBC_FAULTBUSY;
 		} else {
 			genfs_node_wrlock(vp);
 			error = GOP_ALLOC(vp, uio->uio_offset, bytelen,
@@ -364,26 +378,15 @@ WRITE(void *v)
 			genfs_node_unlock(vp);
 			if (error)
 				break;
-			ubc_alloc_flags |= UBC_FAULTBUSY;
+			ubc_flags |= UBC_FAULTBUSY;
 		}
 
 		/*
 		 * copy the data.
 		 */
 
-		win = ubc_alloc(&vp->v_uobj, uio->uio_offset, &bytelen,
-		    UVM_ADV_NORMAL, ubc_alloc_flags);
-		error = uiomove(win, bytelen, uio);
-		if (error && extending) {
-			/*
-			 * if we haven't initialized the pages yet,
-			 * do it now.  it's safe to use memset here
-			 * because we just mapped the pages above.
-			 */
-			memset(win, 0, bytelen);
-		}
-		ubc_release_flags = UBC_WANT_UNMAP(vp) ? UBC_UNMAP : 0;
-		ubc_release(win, ubc_release_flags);
+		ubc_flags |= UBC_WANT_UNMAP(vp) ? UBC_UNMAP : 0;
+		error = ubc_uiomove(&vp->v_uobj, uio, bytelen, ubc_flags);
 
 		/*
 		 * update UVM's notion of the size now that we've
@@ -393,7 +396,6 @@ WRITE(void *v)
 		 * otherwise ffs_truncate can't flush soft update states.
 		 */
 
-		newoff = oldoff + bytelen;
 		if (vp->v_size < newoff) {
 			uvm_vnp_setsize(vp, newoff);
 			extended = 1;
Index: nfs/nfs_bio.c
===================================================================
RCS file: /cvsroot/src/sys/nfs/nfs_bio.c,v
retrieving revision 1.154
diff -u -p -r1.154 nfs_bio.c
--- nfs/nfs_bio.c	9 May 2007 23:17:45 -0000	1.154
+++ nfs/nfs_bio.c	21 May 2007 14:29:38 -0000
@@ -462,10 +462,9 @@ nfs_write(v)
 	kauth_cred_t cred = ap->a_cred;
 	struct vattr vattr;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
-	void *win;
 	voff_t oldoff, origoff;
 	vsize_t bytelen;
-	int flags, error = 0;
+	int error = 0;
 	int ioflag = ap->a_ioflag;
 	int extended = 0, wrotedata = 0;
 
@@ -519,7 +518,7 @@ nfs_write(v)
 
 	origoff = uio->uio_offset;
 	do {
-		bool extending; /* if we are extending whole pages */
+		bool overwrite; /* if we are overwriting whole pages */
 		u_quad_t oldsize;
 		oldoff = uio->uio_offset;
 		bytelen = uio->uio_resid;
@@ -531,17 +530,27 @@ nfs_write(v)
 		if (np->n_size < uio->uio_offset + bytelen) {
 			np->n_size = uio->uio_offset + bytelen;
 		}
-		extending = ((uio->uio_offset & PAGE_MASK) == 0 &&
-		    (bytelen & PAGE_MASK) == 0 &&
-		    uio->uio_offset >= vp->v_size);
-		win = ubc_alloc(&vp->v_uobj, uio->uio_offset, &bytelen,
-		    UVM_ADV_NORMAL,
-		    UBC_WRITE | (extending ? UBC_FAULTBUSY : 0));
-		error = uiomove(win, bytelen, uio);
-		flags = UBC_WANT_UNMAP(vp) ? UBC_UNMAP : 0;
-		ubc_release(win, flags);
+		overwrite = false;
+		if ((uio->uio_offset & PAGE_MASK) == 0) {
+			if ((vp->v_flag & VMAPPED) == 0 &&
+			    bytelen > PAGE_SIZE) {
+				bytelen = trunc_page(bytelen);
+				overwrite = true;
+			} else if ((bytelen & PAGE_MASK) == 0 &&
+			    uio->uio_offset >= vp->v_size) {
+				overwrite = true;
+			}
+		}
+		if (vp->v_size < uio->uio_offset + bytelen) {
+			uvm_vnp_setwritesize(vp, uio->uio_offset + bytelen);
+		}
+		error = ubc_uiomove(&vp->v_uobj, uio, bytelen,
+		    UBC_WRITE | UBC_PARTIALOK |
+		    (overwrite ? UBC_FAULTBUSY : 0) |
+		    (UBC_WANT_UNMAP(vp) ? UBC_UNMAP : 0));
 		if (error) {
-			if (extending) {
+			uvm_vnp_setwritesize(vp, vp->v_size);
+			if (overwrite && np->n_size != oldsize) {
 				/*
 				 * backout size and free pages past eof.
 				 */

--NextPart-20070521233348-0389500--