Subject: Re: poolifying fileassoc
To: YAMAMOTO Takashi <yamt@mwd.biglobe.ne.jp>
From: Brett Lymn <blymn@baesystems.com.au>
List: tech-kern
Date: 01/21/2007 20:37:51
--OgqxwSJOaUobr8KG
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

On Sun, Jan 07, 2007 at 10:07:29PM +0900, YAMAMOTO Takashi wrote:
> 
> keeping pages locked (PG_BUSY) is enough for your check, isn't it?
>

Yes, it seems so.
 
> 
> i don't think we need such a hack in this case.
> 

OK, I have changed things around in uvm_io_done() so that the veriexec
check is done without locks held but before the pages are unlocked.
This seems to work ok for me with a kernel that has both LOCK_DEBUG
and DIAGNOSTIC enabled on a 2 processor machine (well, a 2 processor
qemu machine).  Diff is attached (again, sorry for the horrible diff -
I cannot seem to get cvs to give a better/clearer one)


-- 
Brett Lymn

--OgqxwSJOaUobr8KG
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="uvm_pager.diff"

Index: uvm_pager.c
===================================================================
RCS file: /cvsroot/src/sys/uvm/uvm_pager.c,v
retrieving revision 1.79
diff -u -r1.79 uvm_pager.c
--- uvm_pager.c	21 Dec 2006 15:55:26 -0000	1.79
+++ uvm_pager.c	21 Jan 2007 09:59:59 -0000
@@ -43,6 +43,7 @@
 
 #include "opt_uvmhist.h"
 #include "opt_readahead.h"
+#include "veriexec.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -52,6 +53,7 @@
 #include <sys/vnode.h>
 
 #include <uvm/uvm.h>
+#include <sys/verified_exec.h>
 
 struct pool *uvm_aiobuf_pool;
 
@@ -279,65 +281,53 @@
 }
 
 /*
- * uvm_aio_aiodone: do iodone processing for async i/os.
- * this should be called in thread context, not interrupt context.
+ *
+ * uvm_io_done: Set page flag status when an i/o (synch or asynch) has
+ * completed or an async i/o is started.  This function handles the
+ * following situations:
+ *
+ *  - Synchronous i/o has completed
+ *  - Asynchronous i/o has been initiated
+ *  - Asynchronous i/o has completed
+ *  - i/o is associated with swap
+ *
+ * In all these cases, what needs to be done to the page structures is
+ * very similar, with some small variations depending on the calling situation.
+ *
+ * pgs - array of pages to handle
+ * npages - number of pages in request
+ * write - TRUE if a write request
+ * swap - TRUE if a swap request
+ * aio_complete - TRUE if called from aio completion
+ * overwrite - TRUE if overwriting pages
+ * ext_error - error status from caller
+ * ext_slock - ignored iff swap == TRUE, otherwise the held uobj lock
+ * swslot - swap slot (used iff swap == TRUE)
+ * ridx - extra pages to round to fs block size transaction
+ *        (set to npages if no rounding done)
+ * orignpages - original transaction size before rounding.
+ * aio_initiate - TRUE if pages are being brought in asynchronously.
+ * vn - pointer to vnode for object.
+ *
+ * pageqlock and object locks must be held iff swap is FALSE.  The
+ * locks will be released before return.
  */
-
-void
-uvm_aio_aiodone(struct buf *bp)
+int
+uvm_io_done(struct vm_page **pgs, int npages, boolean_t write, boolean_t swap,
+	    boolean_t aio_complete, boolean_t overwrite, int ext_error,
+	    struct simplelock *ext_slock, int swslot, int ridx,
+	    int orignpages, boolean_t aio_initiate, struct vnode *vp)
 {
-	int npages = bp->b_bufsize >> PAGE_SHIFT;
-	struct vm_page *pg, *pgs[npages];
-	struct uvm_object *uobj;
+	struct vm_page *pg;
+	int i, error;
 	struct simplelock *slock;
-	int s, i, error, swslot;
-	boolean_t write, swap;
-	UVMHIST_FUNC("uvm_aio_aiodone"); UVMHIST_CALLED(ubchist);
-	UVMHIST_LOG(ubchist, "bp %p", bp, 0,0,0);
 
-	error = (bp->b_flags & B_ERROR) ? (bp->b_error ? bp->b_error : EIO) : 0;
-	write = (bp->b_flags & B_READ) == 0;
-	/* XXXUBC B_NOCACHE is for swap pager, should be done differently */
-	if (write && !(bp->b_flags & B_NOCACHE) && bioops.io_pageiodone) {
-		(*bioops.io_pageiodone)(bp);
-	}
-
-	uobj = NULL;
-	for (i = 0; i < npages; i++) {
-		pgs[i] = uvm_pageratop((vaddr_t)bp->b_data + (i << PAGE_SHIFT));
-		UVMHIST_LOG(ubchist, "pgs[%d] = %p", i, pgs[i],0,0);
-	}
-	uvm_pagermapout((vaddr_t)bp->b_data, npages);
-
-	swslot = 0;
-	slock = NULL;
-	pg = pgs[0];
-	swap = (pg->uanon != NULL && pg->uobject == NULL) ||
-		(pg->pqflags & PQ_AOBJ) != 0;
-	if (!swap) {
-		uobj = pg->uobject;
-		slock = &uobj->vmobjlock;
-		simple_lock(slock);
-		uvm_lock_pageq();
-	} else {
-#if defined(VMSWAP)
-		if (error) {
-			if (pg->uobject != NULL) {
-				swslot = uao_find_swslot(pg->uobject,
-				    pg->offset >> PAGE_SHIFT);
-			} else {
-				KASSERT(pg->uanon != NULL);
-				swslot = pg->uanon->an_swslot;
-			}
-			KASSERT(swslot);
-		}
-#else /* defined(VMSWAP) */
-		panic("%s: swap", __func__);
-#endif /* defined(VMSWAP) */
-	}
+	error = ext_error;
+	slock = ext_slock;
 	for (i = 0; i < npages; i++) {
+		if (pgs[i] == NULL)
+			continue;
 		pg = pgs[i];
-		KASSERT(swap || pg->uobject == uobj);
 		UVMHIST_LOG(ubchist, "pg %p", pg, 0,0,0);
 
 #if defined(VMSWAP)
@@ -347,11 +337,10 @@
 		 */
 
 		if (swap) {
-			if (pg->uobject != NULL) {
+			if (pg->uobject != NULL)
 				slock = &pg->uobject->vmobjlock;
-			} else {
+			else
 				slock = &pg->uanon->an_lock;
-			}
 			simple_lock(slock);
 			uvm_lock_pageq();
 		}
@@ -369,7 +358,9 @@
 		if (error) {
 			int slot;
 			if (!write) {
-				pg->flags |= PG_RELEASED;
+				if (aio_complete ||
+				    (pg->flags & PG_FAKE))
+					pg->flags |= PG_RELEASED;
 				continue;
 			} else if (error == ENOMEM) {
 				if (pg->flags & PG_PAGEOUT) {
@@ -405,16 +396,34 @@
 		 * still be set from the page's previous identity.
 		 */
 
-		if (pg->flags & PG_FAKE) {
-			KASSERT(!write);
-			pg->flags &= ~PG_FAKE;
+		if (pg->flags & PG_FAKE && !overwrite) {
+			pg->flags &= ~(PG_FAKE);
+			pmap_clear_modify(pgs[i]);
+		}
+
+		if (i < ridx || i >= ridx + orignpages || aio_initiate) {
+			UVMHIST_LOG(ubchist, "unbusy pg %p offset 0x%x",
+			    pg, pg->offset,0,0);
+			if (pg->flags & PG_FAKE) {
+				KASSERT(!write);
+				KASSERT(overwrite);
+				KASSERT((pg->flags & PG_CLEAN) != 0);
+				if (!aio_complete)
+					uvm_pagezero(pg);
+				else {
+					pmap_clear_modify(pg);
 #if defined(READAHEAD_STATS)
-			pg->pqflags |= PQ_READAHEAD;
-			uvm_ra_total.ev_count++;
+					pg->pqflags |= PQ_READAHEAD;
+					uvm_ra_total.ev_count++;
 #endif /* defined(READAHEAD_STATS) */
-			KASSERT((pg->flags & PG_CLEAN) != 0);
+				}
+			}
+			if ((pg->flags & PG_RELEASED) && (!aio_complete)) {
+				uvm_pagefree(pg);
+				continue;
+			}
 			uvm_pageenqueue(pg);
-			pmap_clear_modify(pg);
+			UVM_PAGE_OWN(pg, NULL);
 		}
 
 		/*
@@ -435,6 +444,17 @@
 		 */
 
 		if (swap) {
+			if (i < ridx || i >= ridx + orignpages
+			    || aio_initiate) {
+				if ((pg->flags & PG_WANTED) &&
+				    (!aio_complete)) {
+					wakeup(pg);
+				}
+			}
+			if (!aio_complete)
+				pg->flags &= ~(PG_WANTED|PG_BUSY);
+			pg->flags &= ~PG_FAKE;
+
 			if (pg->uobject == NULL && pg->uanon->an_ref == 0 &&
 			    (pg->flags & PG_RELEASED) != 0) {
 				uvm_unlock_pageq();
@@ -448,7 +468,40 @@
 #endif /* defined(VMSWAP) */
 	}
 	if (!swap) {
-		uvm_page_unbusy(pgs, npages);
+#if NVERIEXEC > 0
+		if (!error && !write && !aio_initiate) {
+			uvm_unlock_pageq();
+			simple_unlock(slock);
+			error = veriexec_block_verify(vp, pgs, npages);
+			simple_lock(slock);
+			uvm_lock_pageq();
+
+			if (error) {
+				for (i = 0; i < npages; i++) {
+					if (pgs[i] == NULL)
+						continue;
+					pgs[i]->flags |= PG_RELEASED;
+				}
+			}
+		}
+#endif
+		if (!aio_complete && !error) {
+			for (i = 0; i < npages; i++) {
+				if (pgs[i] == NULL)
+					continue;
+				pg = pgs[i];
+				if (i < ridx || i >= ridx + orignpages
+				    || aio_initiate) {
+					if (pg->flags & PG_WANTED)
+						wakeup(pg);
+					pg->flags &= ~(PG_WANTED|PG_BUSY);
+					pg->flags &= ~PG_FAKE;
+				}
+			}
+		}
+
+		if (aio_complete || error)
+			uvm_page_unbusy(pgs, npages);
 		uvm_unlock_pageq();
 		simple_unlock(slock);
 	} else {
@@ -470,6 +523,70 @@
 		uvmexp.pdpending--;
 #endif /* defined(VMSWAP) */
 	}
+
+	return error;
+}
+
+/*
+ * uvm_aio_aiodone: do iodone processing for async i/os.
+ * this should be called in thread context, not interrupt context.
+ */
+
+void
+uvm_aio_aiodone(struct buf *bp)
+{
+	int npages = bp->b_bufsize >> PAGE_SHIFT;
+	struct vm_page *pg, *pgs[npages];
+	struct uvm_object *uobj;
+	struct simplelock *slock;
+	int s, i, error, swslot;
+	boolean_t write, swap;
+	UVMHIST_FUNC("uvm_aio_aiodone"); UVMHIST_CALLED(ubchist);
+	UVMHIST_LOG(ubchist, "bp %p", bp, 0,0,0);
+
+	error = (bp->b_flags & B_ERROR) ? (bp->b_error ? bp->b_error : EIO) : 0;
+	write = (bp->b_flags & B_READ) == 0;
+	/* XXXUBC B_NOCACHE is for swap pager, should be done differently */
+	if (write && !(bp->b_flags & B_NOCACHE) && bioops.io_pageiodone) {
+		(*bioops.io_pageiodone)(bp);
+	}
+
+	uobj = NULL;
+	for (i = 0; i < npages; i++) {
+		pgs[i] = uvm_pageratop((vaddr_t)bp->b_data + (i << PAGE_SHIFT));
+		UVMHIST_LOG(ubchist, "pgs[%d] = %p", i, pgs[i],0,0);
+	}
+	uvm_pagermapout((vaddr_t)bp->b_data, npages);
+
+	swslot = 0;
+	slock = NULL;
+	pg = pgs[0];
+	swap = (pg->uanon != NULL && pg->uobject == NULL) ||
+		(pg->pqflags & PQ_AOBJ) != 0;
+	if (!swap) {
+		uobj = pg->uobject;
+		slock = &uobj->vmobjlock;
+		simple_lock(slock);
+		uvm_lock_pageq();
+	} else {
+#if defined(VMSWAP)
+		if (error) {
+			if (pg->uobject != NULL) {
+				swslot = uao_find_swslot(pg->uobject,
+				    pg->offset >> PAGE_SHIFT);
+			} else {
+				KASSERT(pg->uanon != NULL);
+				swslot = pg->uanon->an_swslot;
+			}
+			KASSERT(swslot);
+		}
+#else /* defined(VMSWAP) */
+		panic("%s: swap", __func__);
+#endif /* defined(VMSWAP) */
+	}
+
+	error = uvm_io_done(pgs, npages, write, swap, TRUE, FALSE, error,
+			    slock, swslot, npages, npages, FALSE, bp->b_vp);
 	s = splbio();
 	if (write && (bp->b_flags & B_AGE) != 0) {
 		vwakeup(bp);

--OgqxwSJOaUobr8KG--