Subject: Re: poolifying fileassoc
To: Chuck Silvers <chuq@chuq.com>
From: Brett Lymn <blymn@baesystems.com.au>
List: tech-kern
Date: 01/01/2007 20:37:16

On Mon, Oct 09, 2006 at 09:16:58AM -0700, Chuck Silvers wrote:
> 
> I'd rather rearrange uvm_aio_aiodone() and genfs_getpages() to separate out
> the code that's more or less duplicated between them.  then the new function
> (which would be a uvm thing and not a genfs thing) would be the single place
> to put pagecache I/O completion hooks like this.
> 

It took me a while to get around to doing this, and a while more to
straighten out the mess I made.  The attached diff works for me with
LOCKDEBUG and DIAGNOSTIC on a multiprocessor machine.  I put the
veriexec page checking in there and that also seemed to work fine,
detecting and killing a modified binary on an NFS share.

Unfortunately, my original problem still exists: I think the
fileassoc code needs to use pools, because the veriexec check is done
with locks held, so I get panics of the "vmem_alloc with simple_lock
held" kind if fileassoc is not using pools for its working storage.
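
To be concrete about what "using pools" means here, a rough sketch is
below.  The struct and function names are invented for illustration
(the real fileassoc structures would be used instead), and pool_init()
is the 7-argument form in the tree I'm working against.  The point is
that pool_get() with PR_NOWAIT never sleeps - it just returns NULL
when the pool is empty - which is what makes it usable with the
simple_locks held:

#include <sys/param.h>
#include <sys/pool.h>

/* hypothetical fileassoc working-storage entry */
struct fileassoc_entry {
	void	*fe_data;
};

static struct pool fileassoc_entry_pool;

/* one-time setup, called from the fileassoc init path */
void
fileassoc_pool_init(void)
{
	pool_init(&fileassoc_entry_pool, sizeof(struct fileassoc_entry),
	    0, 0, 0, "fassocpl", NULL);
}

/*
 * Allocation that is safe with locks held: PR_NOWAIT fails with
 * NULL instead of sleeping, so callers have to handle ENOMEM.
 */
struct fileassoc_entry *
fileassoc_entry_get(void)
{
	return pool_get(&fileassoc_entry_pool, PR_NOWAIT);
}

void
fileassoc_entry_put(struct fileassoc_entry *fe)
{
	pool_put(&fileassoc_entry_pool, fe);
}

The only interesting question left after that is how to prime the
pool so that PR_NOWAIT failures stay rare.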

Apologies for the horrible diff - cvs has decided that I just renamed
the uvm_aio_aiodone function and then wrote a new one.

-- 
Brett Lymn

Content-Disposition: attachment; filename="iodone.diff"

Index: sys/uvm/uvm_pager.c
===================================================================
RCS file: /cvsroot/src/sys/uvm/uvm_pager.c,v
retrieving revision 1.79
diff -b -u -r1.79 uvm_pager.c
--- sys/uvm/uvm_pager.c	21 Dec 2006 15:55:26 -0000	1.79
+++ sys/uvm/uvm_pager.c	1 Jan 2007 09:50:22 -0000
@@ -43,6 +43,7 @@
 
 #include "opt_uvmhist.h"
 #include "opt_readahead.h"
+#include "veriexec.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -52,6 +53,7 @@
 #include <sys/vnode.h>
 
 #include <uvm/uvm.h>
+#include <sys/verified_exec.h>
 
 struct pool *uvm_aiobuf_pool;
 
@@ -279,65 +281,51 @@
 }
 
 /*
- * uvm_aio_aiodone: do iodone processing for async i/os.
- * this should be called in thread context, not interrupt context.
+ *
+ * uvm_io_done: Set page flag status when an i/o (synch or asynch) has
+ * completed or an async i/o is started.
+ *
+ * pgs - array of pages to handle
+ * npages - number of pages in request
+ * write - TRUE if a write request
+ * swap - TRUE if a swap request
+ * aio_complete - TRUE if called from aio completion
+ * overwrite - TRUE if overwriting pages
+ * ext_error - error status from caller
+ * ext_slock - ignored if swap is TRUE, otherwise the (held) uobj lock
+ * swslot - swap slot (used iff swap == TRUE)
+ * ridx - ?? extra pages to round to fs block size transaction?
+ *         (set to npages if no rounding done)
+ * orignpages - original transaction size before rounding.
+ * aio_initiate - TRUE if pages are being brought in asynchronously.
+ * vp - pointer to the vnode for the object.
+ *
+ * pageqlock must be held if swap is FALSE.  The locks will be released
+ * before return.
  */
-
-void
-uvm_aio_aiodone(struct buf *bp)
+int
+uvm_io_done(struct vm_page **pgs, int npages, boolean_t write, boolean_t swap,
+	    boolean_t aio_complete, boolean_t overwrite, int ext_error,
+	    struct simplelock *ext_slock, int swslot, int ridx,
+	    int orignpages, boolean_t aio_initiate, struct vnode *vp)
 {
-	int npages = bp->b_bufsize >> PAGE_SHIFT;
-	struct vm_page *pg, *pgs[npages];
-	struct uvm_object *uobj;
+	struct vm_page *pg;
+	int i, error;
 	struct simplelock *slock;
-	int s, i, error, swslot;
-	boolean_t write, swap;
-	UVMHIST_FUNC("uvm_aio_aiodone"); UVMHIST_CALLED(ubchist);
-	UVMHIST_LOG(ubchist, "bp %p", bp, 0,0,0);
-
-	error = (bp->b_flags & B_ERROR) ? (bp->b_error ? bp->b_error : EIO) : 0;
-	write = (bp->b_flags & B_READ) == 0;
-	/* XXXUBC B_NOCACHE is for swap pager, should be done differently */
-	if (write && !(bp->b_flags & B_NOCACHE) && bioops.io_pageiodone) {
-		(*bioops.io_pageiodone)(bp);
-	}
 
-	uobj = NULL;
-	for (i = 0; i < npages; i++) {
-		pgs[i] = uvm_pageratop((vaddr_t)bp->b_data + (i << PAGE_SHIFT));
-		UVMHIST_LOG(ubchist, "pgs[%d] = %p", i, pgs[i],0,0);
-	}
-	uvm_pagermapout((vaddr_t)bp->b_data, npages);
+#if NOTYET /*NVERIEXEC > 0*/
+	if (!ext_error && !swap && !write && !aio_initiate)
+		error = veriexec_block_verify(vp, pgs, npages);
+	else
+#endif
+		error = ext_error;
 
-	swslot = 0;
-	slock = NULL;
-	pg = pgs[0];
-	swap = (pg->uanon != NULL && pg->uobject == NULL) ||
-		(pg->pqflags & PQ_AOBJ) != 0;
-	if (!swap) {
-		uobj = pg->uobject;
-		slock = &uobj->vmobjlock;
-		simple_lock(slock);
-		uvm_lock_pageq();
-	} else {
-#if defined(VMSWAP)
-		if (error) {
-			if (pg->uobject != NULL) {
-				swslot = uao_find_swslot(pg->uobject,
-				    pg->offset >> PAGE_SHIFT);
-			} else {
-				KASSERT(pg->uanon != NULL);
-				swslot = pg->uanon->an_swslot;
-			}
-			KASSERT(swslot);
-		}
-#else /* defined(VMSWAP) */
-		panic("%s: swap", __func__);
-#endif /* defined(VMSWAP) */
-	}
+	slock = ext_slock;
 	for (i = 0; i < npages; i++) {
+		if (pgs[i] == NULL)
+			continue;
 		pg = pgs[i];
-		KASSERT(swap || pg->uobject == uobj);
+/*		KASSERT(swap || pg->uobject == uobj);*/
 		UVMHIST_LOG(ubchist, "pg %p", pg, 0,0,0);
 
 #if defined(VMSWAP)
@@ -347,11 +335,10 @@
 		 */
 
 		if (swap) {
-			if (pg->uobject != NULL) {
+			if (pg->uobject != NULL)
 				slock = &pg->uobject->vmobjlock;
-			} else {
+			else
 				slock = &pg->uanon->an_lock;
-			}
 			simple_lock(slock);
 			uvm_lock_pageq();
 		}
@@ -369,6 +356,8 @@
 		if (error) {
 			int slot;
 			if (!write) {
+				if (aio_complete ||
+				    (pg->flags & PG_FAKE))
 				pg->flags |= PG_RELEASED;
 				continue;
 			} else if (error == ENOMEM) {
@@ -405,16 +394,40 @@
 		 * still be set from the page's previous identity.
 		 */
 
+		if (pg->flags & PG_FAKE && !overwrite) {
+			pg->flags &= ~(PG_FAKE);
+			pmap_clear_modify(pgs[i]);
+		}
+
+		if (i < ridx || i >= ridx + orignpages || aio_initiate) {
+			UVMHIST_LOG(ubchist, "unbusy pg %p offset 0x%x",
+			    pg, pg->offset,0,0);
+			if ((pg->flags & PG_WANTED) && (!aio_complete)) {
+				wakeup(pg);
+			}
 		if (pg->flags & PG_FAKE) {
 			KASSERT(!write);
-			pg->flags &= ~PG_FAKE;
+				KASSERT(overwrite);
+				KASSERT((pg->flags & PG_CLEAN) != 0);
+				if (!aio_complete)
+					uvm_pagezero(pg);
+				else {
+					pmap_clear_modify(pg);
 #if defined(READAHEAD_STATS)
 			pg->pqflags |= PQ_READAHEAD;
 			uvm_ra_total.ev_count++;
 #endif /* defined(READAHEAD_STATS) */
-			KASSERT((pg->flags & PG_CLEAN) != 0);
+				}
+			}
+			if ((pg->flags & PG_RELEASED) && (!aio_complete)) {
+				uvm_pagefree(pg);
+				continue;
+			}
 			uvm_pageenqueue(pg);
-			pmap_clear_modify(pg);
+			if (!aio_complete)
+				pg->flags &= ~(PG_WANTED|PG_BUSY);
+			pg->flags &= ~PG_FAKE;
+			UVM_PAGE_OWN(pg, NULL);
 		}
 
 		/*
@@ -448,6 +461,7 @@
 #endif /* defined(VMSWAP) */
 	}
 	if (!swap) {
+		if (aio_complete || error)
 		uvm_page_unbusy(pgs, npages);
 		uvm_unlock_pageq();
 		simple_unlock(slock);
@@ -470,6 +484,70 @@
 		uvmexp.pdpending--;
 #endif /* defined(VMSWAP) */
 	}
+
+	return error;
+}
+
+/*
+ * uvm_aio_aiodone: do iodone processing for async i/os.
+ * this should be called in thread context, not interrupt context.
+ */
+
+void
+uvm_aio_aiodone(struct buf *bp)
+{
+	int npages = bp->b_bufsize >> PAGE_SHIFT;
+	struct vm_page *pg, *pgs[npages];
+	struct uvm_object *uobj;
+	struct simplelock *slock;
+	int s, i, error, swslot;
+	boolean_t write, swap;
+	UVMHIST_FUNC("uvm_aio_aiodone"); UVMHIST_CALLED(ubchist);
+	UVMHIST_LOG(ubchist, "bp %p", bp, 0,0,0);
+
+	error = (bp->b_flags & B_ERROR) ? (bp->b_error ? bp->b_error : EIO) : 0;
+	write = (bp->b_flags & B_READ) == 0;
+	/* XXXUBC B_NOCACHE is for swap pager, should be done differently */
+	if (write && !(bp->b_flags & B_NOCACHE) && bioops.io_pageiodone) {
+		(*bioops.io_pageiodone)(bp);
+	}
+
+	uobj = NULL;
+	for (i = 0; i < npages; i++) {
+		pgs[i] = uvm_pageratop((vaddr_t)bp->b_data + (i << PAGE_SHIFT));
+		UVMHIST_LOG(ubchist, "pgs[%d] = %p", i, pgs[i],0,0);
+	}
+	uvm_pagermapout((vaddr_t)bp->b_data, npages);
+
+	swslot = 0;
+	slock = NULL;
+	pg = pgs[0];
+	swap = (pg->uanon != NULL && pg->uobject == NULL) ||
+		(pg->pqflags & PQ_AOBJ) != 0;
+	if (!swap) {
+		uobj = pg->uobject;
+		slock = &uobj->vmobjlock;
+		simple_lock(slock);
+		uvm_lock_pageq();
+	} else {
+#if defined(VMSWAP)
+		if (error) {
+			if (pg->uobject != NULL) {
+				swslot = uao_find_swslot(pg->uobject,
+				    pg->offset >> PAGE_SHIFT);
+			} else {
+				KASSERT(pg->uanon != NULL);
+				swslot = pg->uanon->an_swslot;
+			}
+			KASSERT(swslot);
+		}
+#else /* defined(VMSWAP) */
+		panic("%s: swap", __func__);
+#endif /* defined(VMSWAP) */
+	}
+
+	error = uvm_io_done(pgs, npages, write, swap, TRUE, FALSE, error,
+			    slock, swslot, npages, npages, FALSE, bp->b_vp);
 	s = splbio();
 	if (write && (bp->b_flags & B_AGE) != 0) {
 		vwakeup(bp);
Index: sys/uvm/uvm_pager.h
===================================================================
RCS file: /cvsroot/src/sys/uvm/uvm_pager.h,v
retrieving revision 1.34
diff -b -u -r1.34 uvm_pager.h
--- sys/uvm/uvm_pager.h	22 Feb 2006 22:28:18 -0000	1.34
+++ sys/uvm/uvm_pager.h	1 Jan 2007 09:50:22 -0000
@@ -176,6 +176,9 @@
 struct vm_page *uvm_pageratop(vaddr_t);
 vaddr_t	uvm_pagermapin(struct vm_page **, int, int);
 void	uvm_pagermapout(vaddr_t, int);
+int	uvm_io_done(struct vm_page **, int, boolean_t, boolean_t,
+		    boolean_t, boolean_t, int, struct simplelock *,
+		    int, int, int, int, struct vnode *);
 
 /* Flags to uvm_pagermapin() */
 #define	UVMPAGER_MAPIN_WAITOK	0x01	/* it's okay to wait */
Index: sys/miscfs/genfs/genfs_vnops.c
===================================================================
RCS file: /cvsroot/src/sys/miscfs/genfs/genfs_vnops.c,v
retrieving revision 1.142
diff -b -u -r1.142 genfs_vnops.c
--- sys/miscfs/genfs/genfs_vnops.c	27 Dec 2006 12:10:09 -0000	1.142
+++ sys/miscfs/genfs/genfs_vnops.c	1 Jan 2007 09:50:22 -0000
@@ -891,77 +891,19 @@
 	lockmgr(&gp->g_glock, LK_RELEASE, NULL);
 	simple_lock(&uobj->vmobjlock);
 
-	/*
-	 * we're almost done!  release the pages...
-	 * for errors, we free the pages.
-	 * otherwise we activate them and mark them as valid and clean.
-	 * also, unbusy pages that were not actually requested.
-	 */
-
-	if (error) {
-		for (i = 0; i < npages; i++) {
-			if (pgs[i] == NULL) {
-				continue;
-			}
-			UVMHIST_LOG(ubchist, "examining pg %p flags 0x%x",
-			    pgs[i], pgs[i]->flags, 0,0);
-			if (pgs[i]->flags & PG_FAKE) {
-				pgs[i]->flags |= PG_RELEASED;
-			}
-		}
-		uvm_lock_pageq();
-		uvm_page_unbusy(pgs, npages);
-		uvm_unlock_pageq();
-		simple_unlock(&uobj->vmobjlock);
-		UVMHIST_LOG(ubchist, "returning error %d", error,0,0,0);
-		if (pgs != pgs_onstack)
-			kmem_free(pgs, pgs_size);
-		return (error);
-	}
-
 out:
-	UVMHIST_LOG(ubchist, "succeeding, npages %d", npages,0,0,0);
 	uvm_lock_pageq();
-	for (i = 0; i < npages; i++) {
-		pg = pgs[i];
-		if (pg == NULL) {
-			continue;
-		}
-		UVMHIST_LOG(ubchist, "examining pg %p flags 0x%x",
-		    pg, pg->flags, 0,0);
-		if (pg->flags & PG_FAKE && !overwrite) {
-			pg->flags &= ~(PG_FAKE);
-			pmap_clear_modify(pgs[i]);
-		}
-		KASSERT(!write || !blockalloc || (pg->flags & PG_RDONLY) == 0);
-		if (i < ridx || i >= ridx + orignpages || async) {
-			UVMHIST_LOG(ubchist, "unbusy pg %p offset 0x%x",
-			    pg, pg->offset,0,0);
-			if (pg->flags & PG_WANTED) {
-				wakeup(pg);
-			}
-			if (pg->flags & PG_FAKE) {
-				KASSERT(overwrite);
-				uvm_pagezero(pg);
-			}
-			if (pg->flags & PG_RELEASED) {
-				uvm_pagefree(pg);
-				continue;
-			}
-			uvm_pageenqueue(pg);
-			pg->flags &= ~(PG_WANTED|PG_BUSY|PG_FAKE);
-			UVM_PAGE_OWN(pg, NULL);
-		}
-	}
-	uvm_unlock_pageq();
-	simple_unlock(&uobj->vmobjlock);
-	if (ap->a_m != NULL) {
+	error = uvm_io_done(pgs, npages, FALSE, FALSE, FALSE, overwrite,
+			    error, &uobj->vmobjlock, 0, ridx, orignpages,
+			    async, vp);
+
+	if ((!error) && (ap->a_m != NULL)) {
 		memcpy(ap->a_m, &pgs[ridx],
 		    orignpages * sizeof(struct vm_page *));
 	}
 	if (pgs != pgs_onstack)
 		kmem_free(pgs, pgs_size);
-	return (0);
+	return (error);
 }
 
 /*
