Subject: loaning for read() of regular files
To: None <tech-kern@netbsd.org>
From: Chuck Silvers <chuq@chuq.com>
List: tech-kern
Date: 02/15/2005 09:36:45
--3V7upXqbjpZ4EhLz
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

hi folks,

I've been fiddling with some changes that implement loaning pages for read()
of regular files for quite a while, and I figure it's time to share them.
there's a diff attached which adds this generic support and changes FFS
to use it.

the new interface is:

    int uvm_map_loanobj(struct vm_map *, struct uvm_object *, struct uio *);

this asks UVM to move data between the given object and the given map by
loaning pages instead of copying, using the addresses, lengths and offset
given by the uio (only the read direction is implemented so far).  it
returns 0 for success and an errno if we can't do it (such as EINVAL
because the request is not page-aligned).  this call can partially succeed,
in which case the uio is updated to indicate how far we got.  the expectation
is that the caller will complete the transfer in the normal fashion if this
operation does not transfer everything that was requested.  (currently I'm
ignoring the error return, maybe this should just be void.)

the benchmark I used to evaluate this was sequential read()s of a large
file that would fit completely in memory.  I used two different request
sizes: 4KB (1 page on a pc, the minimum where loaning is possible) and 1MB
(an arbitrary "large" size).  I ran each test case twice in a row, to see
the effects for both cold and warm caches.  the results were:


the current code:

# time pt -r -s 4096 -c 655360 /usr/file
2684354560 bytes transferred in 69.628 secs (38552690 bytes/sec)
0.226u 9.922s 1:09.63 14.5%     0+0k 43+0io 0pf+0w
# time pt -r -s 4096 -c 655360 /usr/file
2684354560 bytes transferred in 8.011 secs (335078017 bytes/sec)
0.210u 7.801s 0:08.01 100.0%    0+0k 0+0io 0pf+0w

# time pt -r -s 1048576 -c 2560 /usr/file
2684354560 bytes transferred in 69.640 secs (38546036 bytes/sec)
0.000u 17.310s 1:09.64 24.8%    0+0k 41+0io 0pf+0w
# time pt -r -s 1048576 -c 2560 /usr/file
2684354560 bytes transferred in 13.311 secs (201652336 bytes/sec)
0.000u 13.315s 0:13.31 100.0%   0+0k 0+0io 0pf+0w


the new code:

# time pt -r -s 4096 -c 655360 /usr/file
2684354560 bytes transferred in 69.661 secs (38534439 bytes/sec)
0.213u 3.316s 1:09.67 5.0%      0+0k 43+0io 0pf+0w
# time pt -r -s 4096 -c 655360 /usr/file
2684354560 bytes transferred in 2.979 secs (900835767 bytes/sec)
0.170u 2.811s 0:02.98 100.0%    0+0k 0+0io 0pf+0w

# time pt -r -s 1048576 -c 2560 /usr/file
2684354560 bytes transferred in 69.710 secs (38506963 bytes/sec)
0.000u 1.504s 1:09.71 2.1%      0+0k 41+0io 0pf+0w
# time pt -r -s 1048576 -c 2560 /usr/file
2684354560 bytes transferred in 0.974 secs (2755281003 bytes/sec)
0.000u 0.979s 0:00.97 100.0%    0+0k 0+0io 0pf+0w



comments?   if everyone is happy with this then I'll look into
loaning for write() also.

-Chuck

--3V7upXqbjpZ4EhLz
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="diff.uvm_readloan.6"

Index: ufs/ufs/ufs_readwrite.c
===================================================================
RCS file: /cvsroot/src/sys/ufs/ufs/ufs_readwrite.c,v
retrieving revision 1.60
diff -u -p -r1.60 ufs_readwrite.c
--- ufs/ufs/ufs_readwrite.c	9 Jan 2005 16:42:44 -0000	1.60
+++ ufs/ufs/ufs_readwrite.c	15 Feb 2005 16:38:53 -0000
@@ -112,6 +112,10 @@ READ(void *v)
 	usepc = vp->v_type == VREG;
 #endif /* !LFS_READWRITE */
 	if (usepc) {
+		if (uio->uio_offset + uio->uio_resid <= vp->v_size) {
+			error = uvm_map_loanobj(&curproc->p_vmspace->vm_map,
+						&vp->v_uobj, uio);
+		}
 		while (uio->uio_resid > 0) {
 			bytelen = MIN(ip->i_size - uio->uio_offset,
 			    uio->uio_resid);
Index: uvm/uvm_amap_i.h
===================================================================
RCS file: /cvsroot/src/sys/uvm/uvm_amap_i.h,v
retrieving revision 1.20
diff -u -p -r1.20 uvm_amap_i.h
--- uvm/uvm_amap_i.h	20 Dec 2002 18:21:13 -0000	1.20
+++ uvm/uvm_amap_i.h	15 Feb 2005 16:38:53 -0000
@@ -59,6 +59,7 @@ amap_lookup(aref, offset)
 	int slot;
 	struct vm_amap *amap = aref->ar_amap;
 	UVMHIST_FUNC("amap_lookup"); UVMHIST_CALLED(maphist);
+	LOCK_ASSERT(simple_lock_held(&amap->am_l));
 
 	AMAP_B2SLOT(slot, offset);
 	slot += aref->ar_pageoff;
@@ -87,6 +88,7 @@ amap_lookups(aref, offset, anons, npages
 	int slot;
 	struct vm_amap *amap = aref->ar_amap;
 	UVMHIST_FUNC("amap_lookups"); UVMHIST_CALLED(maphist);
+	LOCK_ASSERT(simple_lock_held(&amap->am_l));
 
 	AMAP_B2SLOT(slot, offset);
 	slot += aref->ar_pageoff;
@@ -120,6 +122,7 @@ amap_add(aref, offset, anon, replace)
 	int slot;
 	struct vm_amap *amap = aref->ar_amap;
 	UVMHIST_FUNC("amap_add"); UVMHIST_CALLED(maphist);
+	LOCK_ASSERT(simple_lock_held(&amap->am_l));
 
 	AMAP_B2SLOT(slot, offset);
 	slot += aref->ar_pageoff;
@@ -166,6 +169,7 @@ amap_unadd(aref, offset)
 	int ptr, slot;
 	struct vm_amap *amap = aref->ar_amap;
 	UVMHIST_FUNC("amap_unadd"); UVMHIST_CALLED(maphist);
+	LOCK_ASSERT(simple_lock_held(&amap->am_l));
 
 	AMAP_B2SLOT(slot, offset);
 	slot += aref->ar_pageoff;
Index: uvm/uvm_anon.c
===================================================================
RCS file: /cvsroot/src/sys/uvm/uvm_anon.c,v
retrieving revision 1.31
diff -u -p -r1.31 uvm_anon.c
--- uvm/uvm_anon.c	1 Sep 2004 11:53:38 -0000	1.31
+++ uvm/uvm_anon.c	15 Feb 2005 16:38:54 -0000
@@ -201,17 +201,6 @@ uvm_anfree(anon)
 
 	pg = anon->u.an_page;
 
-	/*
-	 * if there is a resident page and it is loaned, then anon may not
-	 * own it.   call out to uvm_anon_lockpage() to ensure the real owner
- 	 * of the page has been identified and locked.
-	 */
-
-	if (pg && pg->loan_count) {
-		simple_lock(&anon->an_lock);
-		pg = uvm_anon_lockloanpg(anon);
-		simple_unlock(&anon->an_lock);
-	}
 
 	/*
 	 * if we have a resident page, we must dispose of it before freeing
@@ -221,6 +210,18 @@ uvm_anfree(anon)
 	if (pg) {
 
 		/*
+		 * if there is a resident page and it is loaned, then anon
+		 * may not own it.  call out to uvm_anon_lockloanpg() to ensure
+		 * the real owner of the page has been identified and locked.
+		 */
+
+		if (pg->loan_count) {
+			simple_lock(&anon->an_lock);
+			pg = uvm_anon_lockloanpg(anon);
+			simple_unlock(&anon->an_lock);
+		}
+
+		/*
 		 * if the page is owned by a uobject (now locked), then we must
 		 * kill the loan on the page rather than free it.
 		 */
@@ -230,6 +231,7 @@ uvm_anfree(anon)
 			KASSERT(pg->loan_count > 0);
 			pg->loan_count--;
 			pg->uanon = NULL;
+			anon->u.an_page = NULL;
 			uvm_unlock_pageq();
 			simple_unlock(&pg->uobject->vmobjlock);
 		} else {
@@ -259,8 +261,7 @@ uvm_anfree(anon)
 			UVMHIST_LOG(maphist, "anon 0x%x, page 0x%x: "
 				    "freed now!", anon, pg, 0, 0);
 		}
-	}
-	if (pg == NULL && anon->an_swslot > 0) {
+	} else if (anon->an_swslot > 0) {
 		/* this page is no longer only in swap. */
 		simple_lock(&uvm.swap_data_lock);
 		KASSERT(uvmexp.swpgonly > 0);
Index: uvm/uvm_fault.c
===================================================================
RCS file: /cvsroot/src/sys/uvm/uvm_fault.c,v
retrieving revision 1.90
diff -u -p -r1.90 uvm_fault.c
--- uvm/uvm_fault.c	7 Feb 2005 11:57:38 -0000	1.90
+++ uvm/uvm_fault.c	15 Feb 2005 16:38:55 -0000
@@ -1210,7 +1210,7 @@ ReFault:
 		uvm_unlock_pageq();
 		UVM_PAGE_OWN(pg, NULL);
 		amap_add(&ufi.entry->aref, ufi.orig_rvaddr - ufi.entry->start,
-		    anon, 1);
+		    anon, TRUE);
 
 		/* deref: can not drop to zero here by defn! */
 		oanon->an_ref--;
@@ -1623,7 +1623,7 @@ Case2:
 			    anon, pg, 0, 0);
 		}
 		amap_add(&ufi.entry->aref, ufi.orig_rvaddr - ufi.entry->start,
-		    anon, 0);
+		    anon, FALSE);
 	}
 
 	/*
Index: uvm/uvm_map.c
===================================================================
RCS file: /cvsroot/src/sys/uvm/uvm_map.c,v
retrieving revision 1.184
diff -u -p -r1.184 uvm_map.c
--- uvm/uvm_map.c	11 Feb 2005 02:12:03 -0000	1.184
+++ uvm/uvm_map.c	15 Feb 2005 16:38:59 -0000
@@ -147,6 +147,8 @@ EVCNT_ATTACH_STATIC(uvm_mlk_hint);
 
 const char vmmapbsy[] = "vmmapbsy";
 
+boolean_t domaploanobj = TRUE;
+
 /*
  * pool for vmspace structures.
  */
@@ -261,6 +263,8 @@ static void	uvm_map_reference_amap(struc
 static int	uvm_map_space_avail(vaddr_t *, vsize_t, voff_t, vsize_t, int,
 		    struct vm_map_entry *);
 static void	uvm_map_unreference_amap(struct vm_map_entry *, int);
+static int	uvm_do_map_loanobj(struct vm_map *, vaddr_t, size_t,
+		    struct uvm_object *, off_t);
 
 int _uvm_tree_sanity(struct vm_map *, const char *);
 static vsize_t uvm_rb_subtree_space(const struct vm_map_entry *);
@@ -1904,6 +1908,7 @@ uvm_unmap_remove(struct vm_map *map, vad
 		if (VM_MAPENT_ISWIRED(entry)) {
 			uvm_map_entry_unwire(map, entry);
 		}
+
 		if ((map->flags & VM_MAP_PAGEABLE) == 0) {
 
 			/*
@@ -2000,7 +2005,7 @@ uvm_unmap_remove(struct vm_map *map, vad
 		 * that we've nuked.  then go to next entry.
 		 */
 
-		UVMHIST_LOG(maphist, "  removed map entry 0x%x", entry, 0, 0,0);
+		UVMHIST_LOG(maphist, "  removed map entry %p", entry, 0, 0,0);
 
 		/* critical!  prevents stale hint */
 		SAVE_HINT(map, entry, entry->prev);
@@ -3430,6 +3435,10 @@ uvm_map_clean(struct vm_map *map, vaddr_
 			vm_map_unlock_read(map);
 			return EINVAL;
 		}
+		if (flags & PGO_FREE && VM_MAPENT_ISWIRED(entry)) {
+			vm_map_unlock_read(map);
+			return EBUSY;
+		}
 		if (end <= current->end) {
 			break;
 		}
@@ -3595,6 +3604,264 @@ uvm_map_checkprot(struct vm_map *map, va
 	return (TRUE);
 }
 
+int
+uvm_map_loanobj(struct vm_map *map, struct uvm_object *uobj, struct uio *uio)
+{
+	struct iovec *iov;		/* iovec currently being loaned */
+	vaddr_t va;			/* user address of the next chunk */
+	size_t len;			/* size of the next chunk */
+	int i, error = 0;
+
+	/*
+	 * Loan uobj's pages into map as described by uio, unless disabled.
+	 */
+	if (!domaploanobj) {
+		return ENOSYS;
+	}
+
+	/* We only support loaning for reads currently. */
+	if (uio->uio_rw != UIO_READ) {
+		return ENOSYS;
+	}
+
+	/*
+	 * This interface is only for loaning to user space.
+	 * Loans to the kernel should be done with the kernel-specific
+	 * loaning interfaces since those are more efficient for that.
+	 */
+
+	if (uio->uio_segflg != UIO_USERSPACE) {
+		return ENOSYS;
+	}
+
+	/*
+	 * Check that the uio is page-aligned: offset, resid and every iovec.
+	 */
+
+	if (uio->uio_offset & PAGE_MASK || uio->uio_resid & PAGE_MASK) {
+		return EINVAL;
+	}
+	for (i = 0; i < uio->uio_iovcnt; i++) {
+		if (((vaddr_t)uio->uio_iov[i].iov_base & PAGE_MASK) ||
+		    (uio->uio_iov[i].iov_len & PAGE_MASK)) {
+			return EINVAL;
+		}
+	}
+
+	/*
+	 * Loan each iovec in chunks of up to MAXPHYS, advancing the uio.
+	 */
+
+	while (uio->uio_resid) {
+		iov = uio->uio_iov;
+		while (iov->iov_len) {
+			va = (vaddr_t)iov->iov_base;
+			len = MIN(iov->iov_len, MAXPHYS);
+			error = uvm_do_map_loanobj(map, va, len, uobj,
+						   uio->uio_offset);
+			if (error) {
+				return error;	/* uio shows how far we got */
+			}
+			iov->iov_base = (caddr_t)iov->iov_base + len;
+			iov->iov_len -= len;
+			uio->uio_offset += len;
+			uio->uio_resid -= len;
+		}
+		uio->uio_iov++;		/* this iovec is done */
+		uio->uio_iovcnt--;
+	}
+	return 0;
+}
+
+/*
+ * uvm_do_map_loanobj: loan the pages of "uobj" at [off, off + len) to the
+ * anons of the amap that backs [va, va + len) in "map".
+ *
+ * => returns 0 on success, or an errno if the range can't be loaned.
+ */
+static int
+uvm_do_map_loanobj(struct vm_map *map, vaddr_t va, size_t len,
+    struct uvm_object *uobj, off_t off)
+{
+	int npages = len >> PAGE_SHIFT;
+	struct vm_page *pgs[npages], *pg;
+	struct vm_aref aref;
+	struct vm_amap *amap;
+	struct vm_anon *anon, *oanons[npages];
+	struct vm_map_entry *entry;
+	unsigned int maptime;
+	int error, i, refs, aoff, oanon;
+	UVMHIST_FUNC("uvm_do_map_loanobj"); UVMHIST_CALLED(ubchist);
+
+	UVMHIST_LOG(ubchist, "map %p va 0x%x npages %d", map, va, npages, 0);
+	UVMHIST_LOG(ubchist, "uobj %p off 0x%x", uobj, off, 0, 0);
+	oanon = 0;
+	vm_map_lock_read(map);
+
+retry:
+	if (!uvm_map_lookup_entry(map, va, &entry)) {
+		vm_map_unlock_read(map);
+		UVMHIST_LOG(ubchist, "no entry", 0,0,0,0);
+		return EINVAL;
+	}
+	if (VM_MAPENT_ISWIRED(entry)) {
+		vm_map_unlock_read(map);
+		UVMHIST_LOG(ubchist, "entry is wired", 0,0,0,0);
+		return EBUSY;
+	}
+	if (!UVM_ET_ISCOPYONWRITE(entry)) {
+		vm_map_unlock_read(map);
+		UVMHIST_LOG(ubchist, "entry is not COW", 0,0,0,0);
+		return EINVAL;
+	}
+	if (entry->end < va + len) {
+		vm_map_unlock_read(map);
+		UVMHIST_LOG(ubchist, "chunk longer than entry", 0,0,0,0);
+		return EINVAL;
+	}
+
+	/*
+	 * None of the obvious reasons why we might not be able to do the loan
+	 * are true.  If we need to COW the amap, try to do it now.
+	 */
+
+	aref = entry->aref;
+	amap = aref.ar_amap;
+	KASSERT(amap || UVM_ET_ISNEEDSCOPY(entry));
+	if (amap == NULL) {
+		amap_copy(map, entry, M_WAITOK, TRUE, va, va + len);
+		if (UVM_ET_ISNEEDSCOPY(entry)) {
+			vm_map_unlock_read(map);
+			UVMHIST_LOG(ubchist, "amap COW failed", 0,0,0,0);
+			return ENOMEM;
+		}
+		aref = entry->aref;
+		amap = aref.ar_amap;
+		KASSERT(amap != NULL);
+		UVMHIST_LOG(ubchist, "amap has been COWed", 0,0,0,0);
+	}
+	aoff = va - entry->start;
+	maptime = map->timestamp;
+	vm_map_unlock_read(map);
+
+	/*
+	 * The map is all ready for us, now fetch the pages.
+	 * If the map changes out from under us, start over.
+	 */
+
+	simple_lock(&uobj->vmobjlock);
+	memset(pgs, 0, sizeof(pgs));
+	error = (*uobj->pgops->pgo_get)(uobj, off, pgs, &npages, 0,
+	    VM_PROT_READ, 0, PGO_SYNCIO);
+	if (error) {
+		UVMHIST_LOG(ubchist, "getpages -> %d", error,0,0,0);
+		return error;
+	}
+	vm_map_lock_read(map);
+	if (map->timestamp != maptime) {
+		simple_lock(&uobj->vmobjlock);
+		uvm_lock_pageq();
+		for (i = 0; i < npages; i++) {
+			uvm_pageactivate(pgs[i]);
+		}
+		uvm_page_unbusy(pgs, npages);
+		uvm_unlock_pageq();
+		simple_unlock(&uobj->vmobjlock);
+		goto retry;
+	}
+
+	/* If any of the pages we fetched are wired, don't try to loan them. */
+
+	simple_lock(&uobj->vmobjlock);
+	for (i = 0; i < npages; i++) {
+		if (pgs[i]->wire_count) {
+			uvm_lock_pageq();	/* for uvm_pageactivate() */
+			for (i = 0; i < npages; i++) {
+				uvm_pageactivate(pgs[i]);
+			}
+			uvm_page_unbusy(pgs, npages);
+			uvm_unlock_pageq();
+			simple_unlock(&uobj->vmobjlock);
+			vm_map_unlock_read(map);
+			return EBUSY;
+		}
+	}
+
+	/*
+	 * Both the map and the object pages are good to go.
+	 * Loan the pages to the anons in the amap.
+	 * The only thing that can go wrong now is that
+	 * we could run out of anons.  XXX handle this.
+	 */
+
+	memset(oanons, 0, sizeof(oanons));
+	amap_lock(amap);
+	uvm_lock_pageq();
+	for (i = 0; i < npages; i++) {
+		UVMHIST_LOG(ubchist, "pgs[%d] %p", i, pgs[i], 0,0);
+		pg = pgs[i];
+		pmap_page_protect(pg, VM_PROT_READ);
+		pg->loan_count++;
+		uvm_pageactivate(pg);
+		anon = amap_lookup(&aref, aoff + (i << PAGE_SHIFT));
+		if (anon) {
+			oanons[oanon++] = anon;
+			amap_unadd(&aref, aoff + (i << PAGE_SHIFT));
+		}
+		if (pg->uanon) {
+			anon = pg->uanon;
+			simple_lock(&anon->an_lock);
+			anon->an_ref++;
+		} else {
+			anon = uvm_analloc();
+			if (anon == NULL)
+				panic("uvm_do_map_loanobj: ran out of anons");
+			anon->u.an_page = pg;
+			pg->uanon = anon;
+		}
+		simple_unlock(&anon->an_lock);
+		amap_add(&aref, aoff + (i << PAGE_SHIFT), anon, FALSE);
+	}
+	uvm_unlock_pageq();
+	amap_unlock(amap);
+	simple_unlock(&uobj->vmobjlock);
+	vm_map_unlock_read(map);
+
+	/*
+	 * The map has all the new information now.
+	 * Enter the pages into the pmap to save likely faults later.
+	 */
+
+	for (i = 0; i < npages; i++) {
+		(void) pmap_enter(map->pmap, va + (i << PAGE_SHIFT),
+		    VM_PAGE_TO_PHYS(pgs[i]), VM_PROT_READ, PMAP_CANFAIL);
+	}
+
+	/* At this point we're done with the pages, unlock them now. */
+
+	simple_lock(&uobj->vmobjlock);
+	uvm_lock_pageq();
+	uvm_page_unbusy(pgs, npages);
+	uvm_unlock_pageq();
+	simple_unlock(&uobj->vmobjlock);
+
+	/* Finally, free any anons which we replaced in the map. */
+
+	for (i = 0; i < oanon; i++) {
+		anon = oanons[i];
+		if (!anon) {
+			continue;
+		}
+		simple_lock(&anon->an_lock);
+		refs = --anon->an_ref;
+		simple_unlock(&anon->an_lock);
+		if (refs == 0) {
+			uvm_anfree(anon);
+		}
+	}
+	return error;
+}
+
 /*
  * uvmspace_alloc: allocate a vmspace structure.
  *
@@ -3643,7 +3910,7 @@ uvmspace_init(struct vmspace *vm, struct
 /*
  * uvmspace_share: share a vmspace between two processes
  *
- * - used for vfork, threads(?)
+ * - used for vfork.
  */
 
 void
Index: uvm/uvm_map.h
===================================================================
RCS file: /cvsroot/src/sys/uvm/uvm_map.h,v
retrieving revision 1.45
diff -u -p -r1.45 uvm_map.h
--- uvm/uvm_map.h	11 Feb 2005 02:12:03 -0000	1.45
+++ uvm/uvm_map.h	15 Feb 2005 16:39:00 -0000
@@ -335,6 +335,8 @@ int		uvm_map_inherit(struct vm_map *, va
 		    vm_inherit_t);
 int		uvm_map_advice(struct vm_map *, vaddr_t, vaddr_t, int);
 void		uvm_map_init(void);
+int		uvm_map_loanobj(struct vm_map *, struct uvm_object *,
+		    struct uio *);
 boolean_t	uvm_map_lookup_entry(struct vm_map *, vaddr_t,
 		    struct vm_map_entry **);
 MAP_INLINE

--3V7upXqbjpZ4EhLz--