Subject: Re: ffs with UBC rewrite performance improvement
To: YAMAMOTO Takashi <yamt@mwd.biglobe.ne.jp>
From: Isaku Yamahata <yamahata@private.email.ne.jp>
List: tech-kern
Date: 05/23/2003 00:37:12
--xHFwDpU9dbj6ez1V
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
Content-Transfer-Encoding: 7bit


Hello

> because ufs_balloc_range_with_pages keeps pages busy,
> it can deadlock if some of them are mapped to the userspace and
> a page fault occurs on one of them in uiomove.

I solved this deadlock by wiring down the userspace buffer with
uvm_vslock() beforehand, so the page fault can no longer happen while
the pages are kept busy.
I am attaching a new patch, this time against NetBSD 1.6.1.
Most of the patch is the same as the previous one.
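
The write path now goes through the new interface roughly like this
(a simplified sketch of the attached diff, with error handling and the
partial-window bookkeeping trimmed; the names are the ones used in the
patch):

    addr = uio->uio_iov[0].iov_base;
    len = bytelen;
    error = uvm_vslock(p, addr, len, VM_PROT_READ); /* wire the user pages */
    if (error)
            break;

    cookie = ubc_reserve(&vp->v_uobj, ubc_alloc_flags);
    error = ufs_balloc_range_with_pages(vp, uio->uio_offset, bytelen,
        cred, aflag | owrite_flag, pgs, &npages);   /* pages are left busy */

    win = ubc_pages_enter(cookie, uio->uio_offset, &ubc_bytelen,
        pgs, npages, &alloc_npages);                /* map the busy pages */
    error = uiomove(win, ubc_bytelen, uio);         /* no fault possible */
    ubc_pages_remove(cookie, pgs, alloc_npages);    /* unmap and unbusy */

    ubc_release_with_cookie(cookie);
    uvm_vsunlock(p, addr, len);                     /* unwire the user pages */

Since the user buffer is wired before ufs_balloc_range_with_pages() makes
the pages busy, uiomove() can no longer fault on a busy page.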

Does this look OK, or am I missing anything else?
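
For completeness, the read path changes as well: the window is allocated
with a cookie and the pages are mapped in explicitly (and left unbusy)
before copying, so a fault during uiomove() is harmless there.  Roughly,
from the diff:

    win = ubc_alloc_with_cookie(&vp->v_uobj, uio->uio_offset,
        &bytelen, UBC_READ, &cookie);
    error = ubc_pages_mapin(cookie, uio->uio_offset);
    if (error == 0)
            error = uiomove(win, bytelen, uio);
    ubc_release(win, 0);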

--  
Isaku Yamahata <yamahata@private.email.ne.jp>

--xHFwDpU9dbj6ez1V
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="ubc_1.diff"
Content-Transfer-Encoding: 7bit

Index: ufs/ufs/ufs_extern.h
===================================================================
RCS file: /usr/home/cvsroot/NetBSD/1.6/usr/src/sys/ufs/ufs/ufs_extern.h,v
retrieving revision 1.1.1.1
retrieving revision 1.1.1.1.8.1
diff -u -r1.1.1.1 -r1.1.1.1.8.1
--- ufs/ufs/ufs_extern.h	7 Nov 2002 04:54:38 -0000	1.1.1.1
+++ ufs/ufs/ufs_extern.h	22 May 2003 11:43:27 -0000	1.1.1.1.8.1
@@ -115,6 +115,7 @@
 /* ufs_inode.c */
 int ufs_reclaim __P((struct vnode *, struct proc *));
 int ufs_balloc_range __P((struct vnode *, off_t, off_t, struct ucred *, int));
+int ufs_balloc_range_with_pages(struct vnode *vp, off_t off, off_t len, struct ucred *cred, int flags, struct vm_page** pgs, int* npages); 
 
 /* ufs_lookup.c */
 void ufs_dirbad __P((struct inode *, doff_t, char *));
Index: ufs/ufs/ufs_inode.c
===================================================================
RCS file: /usr/home/cvsroot/NetBSD/1.6/usr/src/sys/ufs/ufs/ufs_inode.c,v
retrieving revision 1.1.1.1
retrieving revision 1.1.1.1.8.1
diff -u -r1.1.1.1 -r1.1.1.1.8.1
--- ufs/ufs/ufs_inode.c	7 Nov 2002 04:54:38 -0000	1.1.1.1
+++ ufs/ufs/ufs_inode.c	22 May 2003 11:43:27 -0000	1.1.1.1.8.1
@@ -172,21 +172,28 @@
  * accessible to others.
  */
 
-int
-ufs_balloc_range(vp, off, len, cred, flags)
+// the pgs array of struct vm_page * must hold at least
+// MAX(block size >> PAGE_SHIFT, 1) entries
+static int ufs_balloc_range_with_pages_internal(struct vnode *vp, off_t off, off_t len, struct ucred *cred, int flags, struct vm_page** pgs, int* npages, boolean_t partial_unbusy);
+
+
+static int
+ufs_balloc_range_with_pages_internal(vp, off, len, cred, flags, pgs, npages, partial_unbusy)
 	struct vnode *vp;
 	off_t off, len;
 	struct ucred *cred;
 	int flags;
+	struct vm_page** pgs;
+	int* npages;
+	boolean_t partial_unbusy;
 {
 	off_t oldeof, neweof, oldeob, neweob, pagestart;
 	struct uvm_object *uobj;
 	struct genfs_node *gp = VTOG(vp);
-	int i, delta, error, npages;
-	int bshift = vp->v_mount->mnt_fs_bshift;
-	int bsize = 1 << bshift;
-	int ppb = MAX(bsize >> PAGE_SHIFT, 1);
-	struct vm_page *pgs[ppb];
+	int i, delta, error;
+	const int bshift = vp->v_mount->mnt_fs_bshift;
+	const int bsize = 1 << bshift;
+	const int ppb = MAX(bsize >> PAGE_SHIFT, 1);
 	UVMHIST_FUNC("ufs_balloc_range"); UVMHIST_CALLED(ubchist);
 	UVMHIST_LOG(ubchist, "vp %p off 0x%x len 0x%x u_size 0x%x",
 		    vp, off, len, vp->v_size);
@@ -209,17 +216,29 @@
 	 */
 
 	pagestart = trunc_page(off) & ~(bsize - 1);
-	npages = MIN(ppb, (round_page(neweob) - pagestart) >> PAGE_SHIFT);
-	memset(pgs, 0, npages * sizeof(struct vm_page *));
+	*npages = MIN(ppb, (round_page(neweob) - pagestart) >> PAGE_SHIFT);
+	memset(pgs, 0, (*npages) * sizeof(struct vm_page *));
+#if 0	
+	printf("off = 0x%llx, len = 0x%llx, pagestart = 0x%llx, *npages = %d, "
+	       "bsize = %d, 0x%llx 0x%llx 0x%llx 0x%llx\n",
+	       off, len, pagestart, *npages, bsize,
+	       oldeof, oldeob, neweof, neweob);
+#endif	
 	simple_lock(&uobj->vmobjlock);
-	error = VOP_GETPAGES(vp, pagestart, pgs, &npages, 0,
-	    VM_PROT_READ, 0, PGO_SYNCIO|PGO_PASTEOF);
+	error = VOP_GETPAGES(vp, pagestart, pgs, npages, 0,
+	    VM_PROT_READ, 0, PGO_SYNCIO|PGO_PASTEOF | (flags & PGO_OVERWRITE));
 	if (error) {
 		return error;
 	}
 	simple_lock(&uobj->vmobjlock);
+#if 0	
+	printf("off = 0x%llx, len = 0x%llx, pagestart = 0x%llx, *npages = %d, "
+	       "bsize = %d, 0x%llx 0x%llx 0x%llx 0x%llx\n",
+	       off, len, pagestart, *npages, bsize,
+	       oldeof, oldeob, neweof, neweob);
+#endif	
 	uvm_lock_pageq();
-	for (i = 0; i < npages; i++) {
+	for (i = 0; i < *npages; i++) {
 		UVMHIST_LOG(ubchist, "got pgs[%d] %p", i, pgs[i],0,0);
 		KASSERT((pgs[i]->flags & PG_RELEASED) == 0);
 		pgs[i]->flags &= ~PG_CLEAN;
@@ -250,7 +269,7 @@
 	 */
 
 	simple_lock(&uobj->vmobjlock);
-	for (i = 0; i < npages; i++) {
+	for (i = 0; i < *npages; i++) {
 		pgs[i]->flags &= ~PG_RDONLY;
 		if (error) {
 			pgs[i]->flags |= PG_RELEASED;
@@ -258,11 +277,59 @@
 	}
 	if (error) {
 		uvm_lock_pageq();
-		uvm_page_unbusy(pgs, npages);
+		uvm_page_unbusy(pgs, *npages);
 		uvm_unlock_pageq();
 	} else {
-		uvm_page_unbusy(pgs, npages);
+		if (partial_unbusy) {
+			int start_npages = trunc_page(delta) >> PAGE_SHIFT;
+			int end_npages = round_page(len) >> PAGE_SHIFT;
+			if (start_npages > 0){
+				assert(!(flags & PGO_OVERWRITE));
+				uvm_page_unbusy(pgs, start_npages);
+			}
+			if (end_npages < *npages) {
+				assert(!(flags & PGO_OVERWRITE));
+				uvm_page_unbusy(pgs + end_npages,
+						*npages - end_npages);
+			}
+			*npages = end_npages - start_npages;
+			assert(*npages > 0);
+			memmove(pgs, pgs + start_npages,
+				sizeof(pgs[0]) * (*npages));
+		} else {
+			uvm_page_unbusy(pgs, *npages);
+		}
 	}
 	simple_unlock(&uobj->vmobjlock);
 	return error;
+}
+
+int
+ufs_balloc_range_with_pages(vp, off, len, cred, flags, pgs, npages)
+	struct vnode *vp;
+	off_t off, len;
+	struct ucred *cred;
+	int flags;
+	struct vm_page** pgs;
+	int* npages;
+{
+	return ufs_balloc_range_with_pages_internal(vp, off, len, cred, flags, pgs, npages, TRUE);
+}
+
+
+
+int
+ufs_balloc_range(vp, off, len, cred, flags)
+	struct vnode *vp;
+	off_t off, len;
+	struct ucred *cred;
+	int flags;
+{
+	const int bshift = vp->v_mount->mnt_fs_bshift;
+	const int bsize = 1 << bshift;
+	const int ppb = MAX(bsize >> PAGE_SHIFT, 1);
+	struct vm_page *pgs[ppb];
+	int npages;
+	
+	return ufs_balloc_range_with_pages_internal(vp, off, len, cred, flags, pgs, &npages, FALSE);
 }
Index: ufs/ufs/ufs_readwrite.c
===================================================================
RCS file: /usr/home/cvsroot/NetBSD/1.6/usr/src/sys/ufs/ufs/ufs_readwrite.c,v
retrieving revision 1.1.1.2
diff -u -r1.1.1.2 ufs_readwrite.c
--- ufs/ufs/ufs_readwrite.c	7 May 2003 09:04:01 -0000	1.1.1.2
+++ ufs/ufs/ufs_readwrite.c	22 May 2003 14:12:57 -0000
@@ -1,4 +1,4 @@
-/*	$NetBSD: ufs_readwrite.c,v 1.42.4.1 2002/10/21 01:54:27 lukem Exp $	*/
+/*	$NetBSD: ufs_readwrite.c,v 1.42 2002/03/25 02:23:56 chs Exp $	*/
 
 /*-
  * Copyright (c) 1993
@@ -36,7 +36,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(1, "$NetBSD: ufs_readwrite.c,v 1.42.4.1 2002/10/21 01:54:27 lukem Exp $");
+__KERNEL_RCSID(1, "$NetBSD: ufs_readwrite.c,v 1.42 2002/03/25 02:23:56 chs Exp $");
 
 #ifdef LFS_READWRITE
 #define	BLKSIZE(a, b, c)	blksize(a, b, c)
@@ -115,14 +115,17 @@
 #endif
 	if (usepc) {
 		while (uio->uio_resid > 0) {
+			void* cookie;
+		  
 			bytelen = MIN(ip->i_ffs_size - uio->uio_offset,
 			    uio->uio_resid);
 			if (bytelen == 0)
 				break;
-
-			win = ubc_alloc(&vp->v_uobj, uio->uio_offset,
-					&bytelen, UBC_READ);
-			error = uiomove(win, bytelen, uio);
+			win = ubc_alloc_with_cookie(&vp->v_uobj, uio->uio_offset, &bytelen, UBC_READ, &cookie);
+			error = ubc_pages_mapin(cookie, uio->uio_offset);
+			if (error == 0) {
+				error = uiomove(win, bytelen, uio);
+			}
 			ubc_release(win, 0);
 			if (error)
 				break;
@@ -211,6 +214,10 @@
 	boolean_t async;
 	boolean_t usepc = FALSE;
 
+	const int bshift = ap->a_vp->v_mount->mnt_fs_bshift;
+	const int ppb = MAX((1 << bshift) >> PAGE_SHIFT, 1);
+	struct vm_page *pgs[ppb];
+
 	cred = ap->a_cred;
 	ioflag = ap->a_ioflag;
 	uio = ap->a_uio;
@@ -312,10 +319,31 @@
 		boolean_t extending; /* if we're extending a whole block */
 		off_t newoff;
 
+		int owrite_flag;
+		int npages;
+		int ubc_npages;
+		void* cookie;
+
+		caddr_t addr;
+		size_t  len;
+		
 		oldoff = uio->uio_offset;
 		blkoffset = blkoff(fs, uio->uio_offset);
-		bytelen = MIN(fs->fs_bsize - blkoffset, uio->uio_resid);
-
+		while (uio->uio_iov[0].iov_len == 0) {
+			uio->uio_iov++;
+			uio->uio_iovcnt--;
+		}
+		bytelen = MIN(fs->fs_bsize - blkoffset, uio->uio_iov[0].iov_len);
+		/*
+		 * wire down the user buffer to avoid deadlocking on busy pages.
+		 */
+		addr = uio->uio_iov[0].iov_base;
+		len = bytelen;
+		error = uvm_vslock(p, addr, len, VM_PROT_READ);
+		if (error) {
+			break;
+		}
+		
 		/*
 		 * if we're filling in a hole, allocate the blocks now and
 		 * initialize the pages first.  if we're extending the file,
@@ -325,58 +353,68 @@
 		 */
 		extending = uio->uio_offset >= preallocoff &&
 		    uio->uio_offset < endallocoff;
-
-		if (!extending) {
-			error = ufs_balloc_range(vp, uio->uio_offset, bytelen,
-			    cred, aflag);
-			if (error) {
-				break;
-			}
-			ubc_alloc_flags &= ~UBC_FAULTBUSY;
+		if (blkoffset > 0 || bytelen < fs->fs_bsize) {
+			owrite_flag = 0;
 		} else {
-			lockmgr(&gp->g_glock, LK_EXCLUSIVE, NULL);
-			error = GOP_ALLOC(vp, uio->uio_offset, bytelen,
-			    aflag, cred);
-			lockmgr(&gp->g_glock, LK_RELEASE, NULL);
-			if (error) {
-				break;
-			}
-			ubc_alloc_flags |= UBC_FAULTBUSY;
+			owrite_flag = PGO_OVERWRITE;
 		}
-
-		/*
-		 * copy the data.
-		 */
-
-		win = ubc_alloc(&vp->v_uobj, uio->uio_offset, &bytelen,
-		    ubc_alloc_flags);
-		error = uiomove(win, bytelen, uio);
-		if (error && extending) {
-			/*
-			 * if we haven't initialized the pages yet,
-			 * do it now.  it's safe to use memset here
-			 * because we just mapped the pages above.
-			 */
-			memset(win, 0, bytelen);
+		
+		cookie = ubc_reserve(&vp->v_uobj, ubc_alloc_flags);
+		error = ufs_balloc_range_with_pages(vp, uio->uio_offset, bytelen, cred, aflag | owrite_flag, pgs, &npages);
+		if (error) {
+			uvm_vsunlock(p, addr, len);
+			ubc_release_with_cookie(cookie);
+			break;
 		}
-		ubc_release(win, 0);
 
 		/*
-		 * update UVM's notion of the size now that we've
-		 * copied the data into the vnode's pages.
-		 *
-		 * we should update the size even when uiomove failed.
-		 * otherwise ffs_truncate can't flush soft update states.
+		 * copy the data.
 		 */
-
-		newoff = oldoff + bytelen;
-		if (vp->v_size < newoff) {
-			uvm_vnp_setsize(vp, newoff);
+		ubc_npages = 0;
+		while (bytelen > 0) {
+			int alloc_npages;
+			vsize_t ubc_bytelen = bytelen;
+			off_t ubc_oldoff = uio->uio_offset;
+
+			win = ubc_pages_enter(cookie, uio->uio_offset, &ubc_bytelen, pgs + ubc_npages, npages - ubc_npages, &alloc_npages);
+			
+			assert(ubc_bytelen > 0);
+			assert(((round_page(uio->uio_offset + ubc_bytelen) - trunc_page(uio->uio_offset)) >> PAGE_SHIFT) == alloc_npages);
+			assert(alloc_npages == npages ||
+			       round_page(uio->uio_offset + ubc_bytelen) ==
+			       uio->uio_offset + ubc_bytelen);
+
+			error = uiomove(win, ubc_bytelen, uio);
+			if (error && extending) {
+				/*
+				 * if we haven't initialized the pages yet,
+				 * do it now.  it's safe to use memset here
+				 * because we just mapped the pages above.
+				 */
+				memset(win, 0, ubc_bytelen);
+			}
+			ubc_pages_remove(cookie, pgs + ubc_npages, alloc_npages);
+			
+			ubc_npages += alloc_npages;
+			bytelen -= ubc_bytelen;
+
+			newoff = ubc_oldoff + ubc_bytelen;
+			if (vp->v_size < newoff) {
+				uvm_vnp_setsize(vp, newoff);
+			}
+			if (error) {
+				uvm_page_unbusy(pgs + ubc_npages,
+						npages - ubc_npages);
+				break;
+			}
 		}
-
+		ubc_release_with_cookie(cookie);
+		uvm_vsunlock(p, addr, len);
+		
 		if (error) {
 			break;
 		}
+		assert(npages == ubc_npages);
 
 		/*
 		 * flush what we just wrote if necessary.
Index: uvm/uvm_bio.c
===================================================================
RCS file: /usr/home/cvsroot/NetBSD/1.6/usr/src/sys/uvm/uvm_bio.c,v
retrieving revision 1.1.1.1
retrieving revision 1.1.1.1.8.1
diff -u -r1.1.1.1 -r1.1.1.1.8.1
--- uvm/uvm_bio.c	7 Nov 2002 04:54:39 -0000	1.1.1.1
+++ uvm/uvm_bio.c	22 May 2003 11:43:27 -0000	1.1.1.1.8.1
@@ -71,11 +71,19 @@
 	(&ubc_object.inactive[(((u_long)(offset)) >> ubc_winshift) &	\
 			     (UBC_NQUEUES - 1)])
 
+#define UBC_QUEUE_WITH_INDEX(i) (&ubc_object.inactive[(i) & (UBC_NQUEUES - 1)])
+#define UBC_QUEUE_WITH_UMAP(u)	UBC_QUEUE_WITH_INDEX((u) - ubc_object.umap)
+
+
+
 #define UBC_UMAP_ADDR(u)						\
 	(vaddr_t)(ubc_object.kva + (((u) - ubc_object.umap) << ubc_winshift))
 
 
 #define UMAP_PAGES_LOCKED	0x0001
+#define UMAP_PAGES_MAPIN	0x1000
+#define UMAP_PAGES_RESERVED	0x2000
+#define UMAP_PAGES_ENTERED	0x4000
 #define UMAP_MAPPING_CACHED	0x0002
 
 struct ubc_map
@@ -102,7 +110,6 @@
 
 	TAILQ_HEAD(ubc_inactive_head, ubc_map) *inactive;
 					/* inactive queues for ubc_map's */
-
 } ubc_object;
 
 struct uvm_pagerops ubc_pager =
@@ -190,8 +197,7 @@
 	}
 	for (i = 0; i < ubc_nwins; i++) {
 		umap = &ubc_object.umap[i];
-		TAILQ_INSERT_TAIL(&ubc_object.inactive[i & (UBC_NQUEUES - 1)],
-				  umap, inactive);
+		TAILQ_INSERT_TAIL(UBC_QUEUE_WITH_INDEX(i), umap, inactive);
 	}
 
 	ubc_object.hash = hashinit(ubc_nwins, HASH_LIST, M_TEMP, M_NOWAIT,
@@ -257,6 +263,8 @@
 	uobj = umap->uobj;
 	vp = (struct vnode *)uobj;
 	KASSERT(vp != NULL);
+	assert(!(umap->flags & UMAP_PAGES_RESERVED));
+	assert(!(umap->flags & UMAP_PAGES_MAPIN));
 
 	npages = MIN(ubc_winsize - slot_offset,
 		     (round_page(MAX(vp->v_size, umap->offset +
@@ -347,7 +355,8 @@
 	struct ubc_map *umap;
 
 	LIST_FOREACH(umap, &ubc_object.hash[UBC_HASH(uobj, offset)], hash) {
-		if (umap->uobj == uobj && umap->offset == offset) {
+		if (umap->uobj == uobj && umap->offset == offset &&
+		    !(umap->flags & UMAP_PAGES_RESERVED)) {
 			return umap;
 		}
 	}
@@ -359,30 +368,21 @@
  * ubc interface functions
  */
 
-/*
- * ubc_alloc:  allocate a file mapping window
- */
-
-void *
-ubc_alloc(uobj, offset, lenp, flags)
-	struct uvm_object *uobj;
-	voff_t offset;
-	vsize_t *lenp;
-	int flags;
+static vaddr_t
+ubc_alloc_internal(struct uvm_object* uobj, voff_t offset, vsize_t *lenp,
+		   int flags,
+		   struct ubc_map** umap, vaddr_t* slot_offset)
 {
-	struct vnode *vp = (struct vnode *)uobj;
-	vaddr_t slot_offset, va;
-	struct ubc_map *umap;
+	vaddr_t va;
 	voff_t umap_offset;
-	int error;
-	UVMHIST_FUNC("ubc_alloc"); UVMHIST_CALLED(ubchist);
+	UVMHIST_FUNC(__FUNCTION__); UVMHIST_CALLED(ubchist);
 
 	UVMHIST_LOG(ubchist, "uobj %p offset 0x%lx len 0x%lx filesize 0x%x",
-	    uobj, offset, *lenp, vp->v_size);
+	    uobj, offset, *lenp, ((struct vnode*)uobj)->v_size);
 
 	umap_offset = (offset & ~((voff_t)ubc_winsize - 1));
-	slot_offset = (vaddr_t)(offset & ((voff_t)ubc_winsize - 1));
-	*lenp = MIN(*lenp, ubc_winsize - slot_offset);
+	*slot_offset = (vaddr_t)(offset & ((voff_t)ubc_winsize - 1));
+	*lenp = MIN(*lenp, ubc_winsize - *slot_offset);
 
 	/*
 	 * the vnode is always locked here, so we don't need to add a ref.
@@ -390,10 +390,10 @@
 
 again:
 	simple_lock(&ubc_object.uobj.vmobjlock);
-	umap = ubc_find_mapping(uobj, umap_offset);
-	if (umap == NULL) {
-		umap = TAILQ_FIRST(UBC_QUEUE(offset));
-		if (umap == NULL) {
+	(*umap) = ubc_find_mapping(uobj, umap_offset);
+	if ((*umap) == NULL) {
+		(*umap) = TAILQ_FIRST(UBC_QUEUE(offset));
+		if ((*umap) == NULL) {
 			simple_unlock(&ubc_object.uobj.vmobjlock);
 			tsleep(&lbolt, PVM, "ubc_alloc", 0);
 			goto again;
@@ -403,44 +403,74 @@
 		 * remove from old hash (if any), add to new hash.
 		 */
 
-		if (umap->uobj != NULL) {
-			LIST_REMOVE(umap, hash);
+		if ((*umap)->uobj != NULL) {
+			LIST_REMOVE(*umap, hash);
 		}
-		umap->uobj = uobj;
-		umap->offset = umap_offset;
+		(*umap)->uobj = uobj;
+		(*umap)->offset = umap_offset;
 		LIST_INSERT_HEAD(&ubc_object.hash[UBC_HASH(uobj, umap_offset)],
-		    umap, hash);
-		va = UBC_UMAP_ADDR(umap);
-		if (umap->flags & UMAP_MAPPING_CACHED) {
-			umap->flags &= ~UMAP_MAPPING_CACHED;
+				 *umap, hash);
+		va = UBC_UMAP_ADDR(*umap);
+		if ((*umap)->flags & UMAP_MAPPING_CACHED) {
+			(*umap)->flags &= ~UMAP_MAPPING_CACHED;
 			pmap_remove(pmap_kernel(), va, va + ubc_winsize);
 			pmap_update(pmap_kernel());
 		}
 	} else {
-		va = UBC_UMAP_ADDR(umap);
+		va = UBC_UMAP_ADDR(*umap);
 	}
 
-	if (umap->refcount == 0) {
-		TAILQ_REMOVE(UBC_QUEUE(offset), umap, inactive);
+	if ((*umap)->refcount == 0) {
+		TAILQ_REMOVE(UBC_QUEUE_WITH_UMAP(*umap), *umap, inactive);
 	}
 
 #ifdef DIAGNOSTIC
-	if ((flags & UBC_WRITE) && (umap->writeoff || umap->writelen)) {
+	if ((flags & UBC_WRITE) && ((*umap)->writeoff || (*umap)->writelen)) {
 		panic("ubc_fault: concurrent writes vp %p", uobj);
 	}
 #endif
 	if (flags & UBC_WRITE) {
-		umap->writeoff = slot_offset;
-		umap->writelen = *lenp;
+		(*umap)->writeoff = *slot_offset;
+		(*umap)->writelen = *lenp;
 	}
 
-	umap->refcount++;
+	(*umap)->refcount++;
 	simple_unlock(&ubc_object.uobj.vmobjlock);
 	UVMHIST_LOG(ubchist, "umap %p refs %d va %p flags 0x%x",
-	    umap, umap->refcount, va, flags);
+		    *umap, (*umap)->refcount, va, flags);
+	assert(!((*umap)->flags & UMAP_PAGES_RESERVED));
+	assert(!((*umap)->flags & UMAP_PAGES_ENTERED));	
+
+	return va;
+}
+
+
+/*
+ * ubc_alloc:  allocate a file mapping window
+ */
+
+void *
+ubc_alloc_with_cookie(uobj, offset, lenp, flags, cookie)
+	struct uvm_object *uobj;
+	voff_t offset;
+	vsize_t *lenp;
+	int flags;
+	void** cookie;
+{
+	struct vnode *vp = (struct vnode *)uobj;
+	vaddr_t slot_offset, va;
+	struct ubc_map *umap;
+	int error;
+	UVMHIST_FUNC(__FUNCTION__); UVMHIST_CALLED(ubchist);
+
+	UVMHIST_LOG(ubchist, "uobj %p offset 0x%lx len 0x%lx filesize 0x%x",
+	    uobj, offset, *lenp, vp->v_size);
+
+	va = ubc_alloc_internal(uobj, offset, lenp, flags, &umap,
+				&slot_offset);
 
 	if (flags & UBC_FAULTBUSY) {
-		int npages = (*lenp + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		int npages = (int)((round_page(*lenp + offset) - trunc_page(offset)) >> PAGE_SHIFT);
 		struct vm_page *pgs[npages];
 		int gpflags = PGO_SYNCIO|PGO_OVERWRITE|PGO_PASTEOF;
 		int i;
@@ -467,31 +497,62 @@
 	}
 
 out:
+	if (cookie != NULL) {
+		*cookie = umap;
+	}
 	return (void *)(va + slot_offset);
 }
 
-/*
- * ubc_release:  free a file mapping window.
- */
-
-void
-ubc_release(va, flags)
-	void *va;
+void *
+ubc_alloc(uobj, offset, lenp, flags)
+	struct uvm_object *uobj;
+	voff_t offset;
+	vsize_t *lenp;
 	int flags;
 {
-	struct ubc_map *umap;
-	struct uvm_object *uobj;
-	vaddr_t umapva;
-	boolean_t unmapped;
-	UVMHIST_FUNC("ubc_release"); UVMHIST_CALLED(ubchist);
-
-	UVMHIST_LOG(ubchist, "va %p", va, 0, 0, 0);
-	umap = &ubc_object.umap[((char *)va - ubc_object.kva) >> ubc_winshift];
-	umapva = UBC_UMAP_ADDR(umap);
-	uobj = umap->uobj;
-	KASSERT(uobj != NULL);
+	return ubc_alloc_with_cookie(uobj, offset, lenp, flags, NULL);
+}
 
-	if (umap->flags & UMAP_PAGES_LOCKED) {
+static void
+ubc_release_umap(struct ubc_map* umap)
+{
+	boolean_t unmapped;
+	struct uvm_object *uobj = umap->uobj;
+	const vaddr_t umapva = UBC_UMAP_ADDR(umap);
+	UVMHIST_FUNC(__FUNCTION__); UVMHIST_CALLED(ubchist);
+	UVMHIST_LOG(ubchist, "umap %p offset 0x%lx refcount %d flags 0x%x",
+		    umap, umap->offset, umap->refcount, umap->flags);
+
+	if (umap->flags & UMAP_PAGES_RESERVED) {
+		assert(umap->refcount == 1);
+		assert(!(umap->flags & UMAP_PAGES_ENTERED));
+		unmapped = TRUE;
+		umap->flags &= ~UMAP_PAGES_RESERVED;
+	} else if (umap->flags & UMAP_PAGES_MAPIN) {
+		int slot_offset = umap->writeoff;
+		int npages = (int)(round_page(umap->writeoff + umap->writelen)
+				   - trunc_page(umap->writeoff)) >> PAGE_SHIFT;
+		struct vm_page *pgs[npages];
+		paddr_t pa;
+		int i;
+		boolean_t rv;
+		
+		umap->flags &= ~UMAP_PAGES_MAPIN;
+		uvm_lock_pageq();
+		for (i = 0; i < npages; i++) {
+			rv = pmap_extract(pmap_kernel(),
+			    umapva + slot_offset + (i << PAGE_SHIFT), &pa);
+			KASSERT(rv);
+			pgs[i] = PHYS_TO_VM_PAGE(pa);
+			pgs[i]->flags &= ~(PG_FAKE|PG_CLEAN);
+			uvm_pageactivate(pgs[i]);
+		}
+		uvm_unlock_pageq();
+		pmap_kremove(umapva, ubc_winsize);
+		pmap_update(pmap_kernel());
+		uvm_page_unbusy(pgs, npages);
+		unmapped = TRUE;
+	} else if (umap->flags & UMAP_PAGES_LOCKED) {
 		int slot_offset = umap->writeoff;
 		int endoff = umap->writeoff + umap->writelen;
 		int zerolen = round_page(endoff) - endoff;
@@ -524,6 +585,8 @@
 		unmapped = FALSE;
 	}
 
+	assert(!(umap->flags & UMAP_PAGES_RESERVED));
+	assert(!(umap->flags & UMAP_PAGES_ENTERED));
 	simple_lock(&ubc_object.uobj.vmobjlock);
 	umap->writeoff = 0;
 	umap->writelen = 0;
@@ -549,14 +612,18 @@
 			pmap_update(pmap_kernel());
 			LIST_REMOVE(umap, hash);
 			umap->uobj = NULL;
-			TAILQ_INSERT_HEAD(UBC_QUEUE(umap->offset), umap,
-			    inactive);
+			TAILQ_INSERT_HEAD(UBC_QUEUE_WITH_UMAP(umap), umap,
+					  inactive);
+			
 		} else {
 			if (!unmapped) {
 				umap->flags |= UMAP_MAPPING_CACHED;
+				TAILQ_INSERT_TAIL(UBC_QUEUE_WITH_UMAP(umap),
+						  umap, inactive);
+			} else {
+				TAILQ_INSERT_HEAD(UBC_QUEUE_WITH_UMAP(umap),
+						  umap, inactive);
 			}
-			TAILQ_INSERT_TAIL(UBC_QUEUE(umap->offset), umap,
-			    inactive);
 		}
 	}
 	UVMHIST_LOG(ubchist, "umap %p refs %d", umap, umap->refcount, 0, 0);
@@ -565,6 +632,35 @@
 
 
 /*
+ * ubc_release:  free a file mapping window.
+ */
+
+void
+ubc_release(va, flags)
+	void *va;
+	int flags;
+{
+	struct ubc_map *umap;
+	struct uvm_object *uobj;
+	vaddr_t umapva;
+	UVMHIST_FUNC("ubc_release"); UVMHIST_CALLED(ubchist);
+
+	UVMHIST_LOG(ubchist, "va %p", va, 0, 0, 0);
+	umap = &ubc_object.umap[((char *)va - ubc_object.kva) >> ubc_winshift];
+	umapva = UBC_UMAP_ADDR(umap);
+	uobj = umap->uobj;
+	KASSERT(uobj != NULL);
+
+	ubc_release_umap(umap);
+}
+
+void
+ubc_release_with_cookie(void* cookie)
+{
+	ubc_release_umap((struct ubc_map*)cookie);
+}
+
+/*
  * removing a range of mappings from the ubc mapping cache.
  */
 
@@ -602,9 +698,244 @@
 
 		LIST_REMOVE(umap, hash);
 		umap->uobj = NULL;
-		TAILQ_REMOVE(UBC_QUEUE(umap->offset), umap, inactive);
-		TAILQ_INSERT_HEAD(UBC_QUEUE(umap->offset), umap, inactive);
+		TAILQ_REMOVE(UBC_QUEUE_WITH_UMAP(umap), umap, inactive);
+		TAILQ_INSERT_HEAD(UBC_QUEUE_WITH_UMAP(umap), umap, inactive);
+	}
+	pmap_update(pmap_kernel());
+	simple_unlock(&ubc_object.uobj.vmobjlock);
+}
+
+/*
+ * for read
+ */
+int
+ubc_pages_mapin(void* cookie, voff_t offset)
+{
+	struct uvm_object *uobj;
+	struct vnode *vp;
+	struct ubc_map* const umap = (struct ubc_map*)cookie;
+	vaddr_t va, eva, slot_offset;
+	int i, error, npages;
+	struct vm_page *pgs[ubc_winsize >> PAGE_SHIFT], *pg;
+	vm_prot_t prot;
+	vm_prot_t access_type = VM_PROT_READ;
+	int flags;
+	UVMHIST_FUNC(__FUNCTION__);  UVMHIST_CALLED(ubchist);
+
+	KASSERT(umap->refcount != 0);
+	slot_offset = offset & (ubc_winsize - 1);	
+
+	/* no umap locking needed since we have a ref on the umap */
+	//printf("mapin:umap = %p, cookie = %p\n", umap, cookie);
+	uobj = umap->uobj;
+	vp = (struct vnode *)uobj;
+	KASSERT(vp != NULL);
+	assert(!(umap->flags & UMAP_PAGES_MAPIN));
+
+	npages = MIN(ubc_winsize - slot_offset,
+		     (round_page(MAX(vp->v_size, umap->offset +
+				     umap->writeoff + umap->writelen)) -
+		      umap->offset)) >> PAGE_SHIFT;
+
+again:
+	memset(pgs, 0, sizeof (pgs));
+	simple_lock(&uobj->vmobjlock);
+
+	UVMHIST_LOG(ubchist, "slot_offset 0x%x writeoff 0x%x writelen 0x%x "
+	    "v_size 0x%x", slot_offset, umap->writeoff, umap->writelen,
+	    vp->v_size);
+	UVMHIST_LOG(ubchist, "getpages vp %p offset 0x%x npages %d",
+	    uobj, umap->offset + slot_offset, npages, 0);
+
+	flags = PGO_SYNCIO;
+	error = VOP_GETPAGES(vp, umap->offset + slot_offset, pgs, &npages, 0,
+	    access_type, 0, flags);
+	UVMHIST_LOG(ubchist, "getpages error %d npages %d", error, npages, 0,
+	    0);
+
+	if (error == EAGAIN) {
+		tsleep(&lbolt, PVM, "ubc_fault", 0);
+		goto again;
 	}
+	if (error) {
+		return error;
+	}
+
+	va = trunc_page(UBC_UMAP_ADDR(umap) + slot_offset);
+	eva = va + (npages << PAGE_SHIFT);
+
+	/*
+	 * for virtually-indexed, virtually-tagged caches we should avoid
+	 * creating writable mappings when we don't absolutely need them,
+	 * since the "compatible alias" trick doesn't work on such caches.
+	 * otherwise, we can always map the pages writable.
+	 */
+
+#ifdef PMAP_CACHE_VIVT
+	prot = VM_PROT_READ | access_type;
+#else
+	prot = VM_PROT_READ | VM_PROT_WRITE;
+#endif
+	UVMHIST_LOG(ubchist, "va 0x%lx eva 0x%lx", va, eva, 0, 0);
+	simple_lock(&uobj->vmobjlock);
+	uvm_lock_pageq();
+	for (i = 0; va < eva; i++, va += PAGE_SIZE) {
+		UVMHIST_LOG(ubchist, "pgs[%d] = %p", i, pgs[i], 0, 0);
+		pg = pgs[i];
+
+		if (pg == NULL || pg == PGO_DONTCARE) {
+			continue;
+		}
+		if (pg->flags & PG_WANTED) {
+			wakeup(pg);
+		}
+		KASSERT((pg->flags & PG_FAKE) == 0);
+		if (pg->flags & PG_RELEASED) {
+			uvm_pagefree(pg);
+			continue;
+		}
+		KASSERT(access_type == VM_PROT_READ ||
+		    (pg->flags & PG_RDONLY) == 0);
+
+		pmap_enter(pmap_kernel(), va, VM_PAGE_TO_PHYS(pg),
+		    (pg->flags & PG_RDONLY) ? prot & ~VM_PROT_WRITE : prot,
+		    access_type);
+
+		uvm_pageactivate(pg);
+		pg->flags &= ~(PG_BUSY);
+		UVM_PAGE_OWN(pg, NULL);
+	}
+	uvm_unlock_pageq();
+	simple_unlock(&uobj->vmobjlock);
+	pmap_update(pmap_kernel());
+	
+	return 0;
+}
+
+/*
+ * for write
+ */
+void*
+ubc_reserve(struct uvm_object* uobj, int flags)
+{
+	struct ubc_map *umap;
+	// these are protected by uobj.vmobjlock
+	// these are protected by ubc_object.uobj.vmobjlock
+	static voff_t umap_offset = 0;
+	vaddr_t va;
+	UVMHIST_FUNC(__FUNCTION__); UVMHIST_CALLED(ubchist);
+
+again:
+	simple_lock(&ubc_object.uobj.vmobjlock);
+	++queue_index;
+	umap_offset += PAGE_SIZE;
+	umap = TAILQ_FIRST(UBC_QUEUE_WITH_INDEX(queue_index));
+	if (umap == NULL) {
+		simple_unlock(&ubc_object.uobj.vmobjlock);
+		tsleep(&lbolt, PVM, __FUNCTION__, 0);
+		goto again;
+	}
+
+	/*
+	 * remove from old hash (if any)
+	 */
+
+	if (umap->uobj != NULL) {
+		LIST_REMOVE(umap, hash);
+	}
+	umap->uobj = uobj;
+	umap->offset = umap_offset;
+	LIST_INSERT_HEAD(&ubc_object.hash[UBC_HASH(umap->uobj, umap_offset)],
+			 umap, hash);
+	va = UBC_UMAP_ADDR(umap);
+	if (umap->flags & UMAP_MAPPING_CACHED) {
+		umap->flags &= ~UMAP_MAPPING_CACHED;
+		pmap_remove(pmap_kernel(), va, va + ubc_winsize);
+		pmap_update(pmap_kernel());
+	}
+	umap->flags |= UMAP_PAGES_RESERVED;
+	umap->flags &= ~UMAP_PAGES_ENTERED;
+	assert(umap->refcount == 0);
+	TAILQ_REMOVE(UBC_QUEUE_WITH_UMAP(umap), umap, inactive);
+	umap->refcount++;
+	simple_unlock(&ubc_object.uobj.vmobjlock);
+	UVMHIST_LOG(ubchist, "umap %p refs %d flags 0x%x",
+		    umap, umap->refcount, flags, 0);
+
+	return (void*)umap;
+}
+
+static void
+ubc_pages_umap_rehash(struct ubc_map* umap, voff_t umap_offset)
+{
+	LIST_REMOVE(umap, hash);
+	umap->offset = umap_offset;
+	assert(umap->uobj != NULL);
+	LIST_INSERT_HEAD(&ubc_object.hash[UBC_HASH(umap->uobj, umap_offset)],
+			 umap, hash);
+}
+
+void*
+ubc_pages_enter(void* cookie, voff_t offset, vsize_t* lenp, struct vm_page* pgs[], int npages, int* alloc_npages)
+{
+	struct ubc_map* const umap = (struct ubc_map*)cookie;
+	vaddr_t va = UBC_UMAP_ADDR(umap);
+	int i;
+	voff_t umap_offset = (offset & ~((voff_t)ubc_winsize - 1));
+	vaddr_t slot_offset = (vaddr_t)(offset & ((voff_t)ubc_winsize - 1));
+	
+	UVMHIST_FUNC(__FUNCTION__); UVMHIST_CALLED(ubchist);
+	UVMHIST_LOG(ubchist, "cookie 0x%x offset 0x%lx *lenp 0x%lx npages %d",
+		    cookie, offset, *lenp, npages);
+
+	*lenp = MIN(*lenp, ubc_winsize - slot_offset);
+
+	simple_lock(&ubc_object.uobj.vmobjlock);
+	umap->writeoff = offset;
+	umap->writelen = *lenp;
+	ubc_pages_umap_rehash(umap, umap_offset);
+	
+	*alloc_npages = (int)(round_page(*lenp + offset) - trunc_page(offset)) >> PAGE_SHIFT;
+	assert(*alloc_npages <= npages);
+	for (i = 0; i < *alloc_npages; ++i) {
+		pmap_kenter_pa(va + slot_offset + (i << PAGE_SHIFT),
+			       VM_PAGE_TO_PHYS(pgs[i]),
+			       VM_PROT_READ | VM_PROT_WRITE);
+	}
+	pmap_update(pmap_kernel());
+	assert(!(umap->flags & UMAP_PAGES_ENTERED));
+	umap->flags |= UMAP_PAGES_ENTERED;
+	simple_unlock(&ubc_object.uobj.vmobjlock);
+
+	UVMHIST_LOG(ubchist, "win 0x%x psg 0x%x *lenp 0x%lx *alloc_npages %d",
+		    (void*)(va + slot_offset), pgs, *lenp, *alloc_npages);
+	return (void*)(va + slot_offset);
+}
+
+void
+ubc_pages_remove(void* cookie, struct vm_page* pgs[], int npages)
+{
+	int i;
+	struct ubc_map* const umap = (struct ubc_map*)cookie;
+	const vaddr_t umapva = UBC_UMAP_ADDR(umap);
+	UVMHIST_FUNC(__FUNCTION__); UVMHIST_CALLED(ubchist);
+	UVMHIST_LOG(ubchist, "cookie %p pgs 0x%x npages %d",
+		    cookie, pgs, npages, 0);
+	
+	simple_lock(&ubc_object.uobj.vmobjlock);
+	assert(umap->flags & UMAP_PAGES_ENTERED);
+	umap->flags &= ~UMAP_PAGES_ENTERED;
+	umap->writeoff = 0;
+	umap->writelen = 0;
+	
+	uvm_lock_pageq();
+	for (i = 0; i < npages; i++) {
+		pgs[i]->flags &= ~(PG_FAKE|PG_CLEAN);
+		uvm_pageactivate(pgs[i]);
+	}
+	uvm_unlock_pageq();
+	pmap_kremove(umapva, ubc_winsize);
 	pmap_update(pmap_kernel());
 	simple_unlock(&ubc_object.uobj.vmobjlock);
+	uvm_page_unbusy(pgs, npages);
 }
Index: uvm/uvm_extern.h
===================================================================
RCS file: /usr/home/cvsroot/NetBSD/1.6/usr/src/sys/uvm/uvm_extern.h,v
retrieving revision 1.1.1.1
retrieving revision 1.1.1.1.8.1
diff -u -r1.1.1.1 -r1.1.1.1.8.1
--- uvm/uvm_extern.h	7 Nov 2002 04:54:39 -0000	1.1.1.1
+++ uvm/uvm_extern.h	22 May 2003 11:43:27 -0000	1.1.1.1.8.1
@@ -545,10 +545,20 @@
 
 /* uvm_bio.c */
 void			ubc_init __P((void));
+void *			ubc_alloc_with_cookie(struct uvm_object *uobj, voff_t offset, vsize_t *lenp, int flags, void** cookie);
+
 void *			ubc_alloc __P((struct uvm_object *, voff_t, vsize_t *,
 				       int));
+void*			ubc_alloc_with_pages __P((struct uvm_object* uobj, voff_t offset, vsize_t* lenp, int flags, struct vm_page* pgs[], int npages, int* alloc_npages));
 void			ubc_release __P((void *, int));
+void			ubc_release_with_cookie(void* cookie); 
 void			ubc_flush __P((struct uvm_object *, voff_t, voff_t));
+
+void*			ubc_reserve(struct uvm_object* uobj, int flags); 
+int			ubc_pages_mapin(void* cookie, voff_t offset);
+void*			ubc_pages_enter(void* cookie, voff_t offset, vsize_t* lenp, struct vm_page* pgs[], int npsages, int* alloc_npages);
+void			ubc_pages_remove(void* cookie, struct vm_page* pgs[], int npages); 
+
 
 /* uvm_fault.c */
 int			uvm_fault __P((struct vm_map *, vaddr_t, vm_fault_t,

--xHFwDpU9dbj6ez1V--