Subject: replacing page mappings behind uvm
To: tech-kern@netbsd.org
From: Manuel Bouyer <bouyer@antioche.eu.org>
List: tech-kern
Date: 11/28/2007 15:06:37

Hi,
For Xen dom0 support, we have to provide two ioctls to userland
(IOCTL_PRIVCMD_MMAP and IOCTL_PRIVCMD_MMAPBATCH). The caller program
allocates some pages (usually via mmap) and then calls the ioctl with the
VA of the memory area and a list of pages from a remote domain to map
in place of the dom0 pages that were just allocated.
The current code just calls pmap_enter_ma() with domid != DOMID_SELF for
each page provided by the caller. The new pages are wired and non-managed.
We usually don't have to unmap the old pages, because the memory area that
has just been mmap()ed has not been faulted in yet.
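
To make the calling convention concrete, here is a minimal userland
sketch of the first ioctl. The privcmd_mmap/privcmd_mmap_entry layout
is assumed from the Xen privcmd headers, and the helper itself is
hypothetical:

	#include <sys/types.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <unistd.h>
	#include <machine/xenio.h>	/* IOCTL_PRIVCMD_MMAP; assumed location */

	/* hypothetical helper: map one foreign frame of domain 'dom' */
	static void *
	map_foreign_page(int privcmd_fd, domid_t dom, unsigned long mfn)
	{
		long pgsz = sysconf(_SC_PAGESIZE);
		privcmd_mmap_entry_t ent;
		privcmd_mmap_t cmd;
		void *va;

		/* allocate fresh dom0 backing; nothing is faulted in yet */
		va = mmap(NULL, (size_t)pgsz, PROT_READ | PROT_WRITE,
		    MAP_ANON | MAP_SHARED, -1, 0);
		if (va == MAP_FAILED)
			return NULL;

		/* ask the kernel to put the foreign frame at that VA */
		ent.va = (unsigned long)va;
		ent.mfn = mfn;
		ent.npages = 1;
		cmd.num = 1;
		cmd.dom = dom;
		cmd.entry = &ent;
		if (ioctl(privcmd_fd, IOCTL_PRIVCMD_MMAP, &cmd) == -1) {
			(void)munmap(va, (size_t)pgsz);
			return NULL;
		}
		return va;
	}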

Although it's a gross hack, it works on Xen/i386. It also works on Xen/amd64
up to process exit: there it triggers an assertion in pmap_destroy(), because
pmap_remove() is not called on the hacked memory ranges.
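
For reference, the assertion that fires is the one visible in the
pmap_destroy() hunk of the attached patch: the page table pages left
behind by the never-removed mappings keep pm_obj[i].uo_npages non-zero.

	for (i = 0; i < PTP_LEVELS - 1; i++) {
		KASSERT(pmap->pm_obj[i].uo_npages == 0);
		KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq));
	}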

I added a call to uvm_map_pageable() in the ioctl handlers, so that the
vm_map is populated with these ranges and they are marked wired.
The calls to pmap_enter_ma() now replace a managed page with a non-managed
one; for this case I added a call to uvm_pagefree() to give the
now-unused dom0 page back to UVM. All these changes are in the attached patch.
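
In outline, each handler now does this (a simplified sketch of the
attached patch; the helper name and argument types are mine):

	/* assumes <uvm/uvm_extern.h> and the x86/xen pmap headers */
	static int
	privcmd_remap_range(struct vm_map *vmm, struct pmap *pmap, vaddr_t va,
	    paddr_t ma, int npages, vm_prot_t prot, int dom)
	{
		int error, j;

		/* wire the range; UVM_LK_EXIT leaves the map locked */
		error = uvm_map_pageable(vmm, va, va + (npages << PGSHIFT),
		    0 /* wire, don't unwire */, UVM_LK_EXIT);
		if (error) {
			vm_map_unlock(vmm);
			return error;
		}
		/*
		 * Each call replaces a wired managed dom0 page with a
		 * non-managed foreign one; pmap_enter_ma() now hands the
		 * old page back to UVM via uvm_pagefree().
		 */
		for (j = 0; j < npages; j++) {
			error = pmap_enter_ma(pmap, va, ma, 0, prot,
			    PMAP_WIRED | PMAP_CANFAIL, dom);
			if (error) {
				vm_map_unlock(vmm);
				return error;
			}
			va += PAGE_SIZE;
			ma += PAGE_SIZE;
		}
		vm_map_unlock(vmm);	/* drop the lock taken via UVM_LK_EXIT */
		return 0;
	}
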
This works fine; I can now start and stop HVM domains without triggering
a KASSERT when qemu-dm exits. But I can cause a lockdebug panic under
high paging activity, e.g. a dd if=/dev/zero of=some_file:
Reader / writer lock error: rw_vector_enter: locking against myself

lock address : 0xffffa0003020b6d0
current cpu  :                  0
current lwp  : 0xffffa000304cf180
owner/count  : 0xffffa000304cf180 flags    : 0x0000000000000004

panic: lock error
Stopped in pid 291.1 (qemu-dm) at       netbsd:breakpoint+0x1:  ret
breakpoint() at netbsd:breakpoint+0x1
lockdebug_abort() at netbsd:lockdebug_abort+0x63
rw_vector_enter() at netbsd:rw_vector_enter+0xfc
uvm_fault_internal() at netbsd:uvm_fault_internal+0xec
trap() at netbsd:trap+0x74f

It's always in qemu-dm, so I suspect the cause is these pmap manipulations
(and the "locking against myself" comes from an unexpected
"goto ReFault;" in uvm_fault_internal).

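The panic pattern itself is easy to reproduce: under LOCKDEBUG, any path
where curlwp re-enters a rwlock it already write-holds aborts exactly
like the trace above. A contrived illustration (not the actual
uvm_fault path):

	#include <sys/rwlock.h>

	static krwlock_t demo_lock;

	static void
	demo(void)
	{
		rw_init(&demo_lock);
		rw_enter(&demo_lock, RW_WRITER);
		/* a restarted code path that never did rw_exit() ... */
		rw_enter(&demo_lock, RW_READER);
		/* -> "rw_vector_enter: locking against myself" */
	}
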
Any idea what I'm doing wrong here?

-- 
Manuel Bouyer, LIP6, Universite Paris VI.           Manuel.Bouyer@lip6.fr
     NetBSD: 26 years of experience will always make the difference


Index: xen/xen/privcmd.c
===================================================================
RCS file: /cvsroot/src/sys/arch/xen/xen/privcmd.c,v
retrieving revision 1.20
diff -u -r1.20 privcmd.c
--- xen/xen/privcmd.c	22 Nov 2007 16:17:08 -0000	1.20
+++ xen/xen/privcmd.c	28 Nov 2007 13:27:08 -0000
@@ -159,7 +159,13 @@
 #endif
 			va = mentry.va & ~PAGE_MASK;
 			ma = mentry.mfn <<  PGSHIFT; /* XXX ??? */
-			vm_map_lock_read(vmm);
+			if ((error = uvm_map_pageable(vmm, va,
+			    va + (mentry.npages << PGSHIFT), 0,
+			    UVM_LK_EXIT)) != 0) {
+				vm_map_unlock(vmm);
+				printf("uvm_map_pageable error %d\n", error);
+				return error;
+			}
 			if (uvm_map_checkprot(vmm, va,
 			    va + (mentry.npages << PGSHIFT) - 1,
 			    VM_PROT_WRITE))
@@ -172,10 +178,9 @@
 				printf("uvm_map_checkprot 0x%lx -> 0x%lx "
 				    "failed\n",
 				    va, va + (mentry.npages << PGSHIFT) - 1);
-				vm_map_unlock_read(vmm);
+				vm_map_unlock(vmm);
 				return EINVAL;
 			}
-			vm_map_unlock_read(vmm);
 
 			for (j = 0; j < mentry.npages; j++) {
 				//printf("remap va 0x%lx to 0x%lx\n", va, ma);
@@ -183,11 +188,13 @@
 				    prot, PMAP_WIRED | PMAP_CANFAIL,
 				    mcmd->dom);
 				if (error != 0) {
+					vm_map_unlock(vmm);
 					return error;
 				}
 				va += PAGE_SIZE;
 				ma += PAGE_SIZE;
 			}
+			vm_map_unlock(vmm);
 		}
 		break;
 	}
@@ -211,7 +218,12 @@
 			return EINVAL;
 		
 		//printf("mmapbatch: va0=%lx num=%d dom=%d\n", va0, pmb->num, pmb->dom);
-		vm_map_lock_read(vmm);
+		if ((error = uvm_map_pageable(vmm, va0,
+		    va0 + (pmb->num << PGSHIFT), 0, UVM_LK_EXIT)) != 0) {
+			printf("uvm_map_pageable error %d\n", error);
+			vm_map_unlock(vmm);
+			return error;
+		}
 		if (uvm_map_checkprot(vmm,
 		    va0, va0 + (pmb->num << PGSHIFT) - 1,
 		    VM_PROT_WRITE))
@@ -224,10 +236,9 @@
 			printf("uvm_map_checkprot2 0x%lx -> 0x%lx "
 			    "failed\n",
 			    va0, va0 + (pmb->num << PGSHIFT) - 1);
-			vm_map_unlock_read(vmm);
+			vm_map_unlock(vmm);
 			return EINVAL;
 		}
-		vm_map_unlock_read(vmm);
 
 		for(i = 0; i < pmb->num; ++i) {
 			va = va0 + (i * PAGE_SIZE);
@@ -244,11 +255,12 @@
 			error = pmap_enter_ma(pmap, va, ma, 0, prot,
 			    PMAP_WIRED | PMAP_CANFAIL, pmb->dom);
 			if (error != 0) {
-				printf("mmapbatch: remap error %d!\n", error);
+				//printf("mmapbatch: remap error %d!\n", error);
 				mfn |= 0xF0000000;
 				copyout(&mfn, &pmb->arr[i], sizeof(mfn));
 			}
 		}
+		vm_map_unlock(vmm);
 		error = 0;
 		break;
 	}
Index: x86/x86/pmap.c
===================================================================
RCS file: /cvsroot/src/sys/arch/x86/x86/pmap.c,v
retrieving revision 1.9
diff -u -r1.9 pmap.c
--- x86/x86/pmap.c	23 Nov 2007 16:33:25 -0000	1.9
+++ x86/x86/pmap.c	28 Nov 2007 13:27:09 -0000
@@ -2012,6 +2022,8 @@
 	 */
 
 	for (i = 0; i < PTP_LEVELS - 1; i++) {
+		if (pmap->pm_obj[i].uo_npages)
+			printf("pmap_destroy %p pmap->pm_obj[%d].uo_npages == %d\n", pmap, i, pmap->pm_obj[i].uo_npages);
 		KASSERT(pmap->pm_obj[i].uo_npages == 0);
 		KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq));
 	}
@@ -3581,15 +3595,6 @@
 	 */
 
 	if (pmap_valid_entry(opte) && ((opte & PG_FRAME) == ma)) {
-
-		/*
-		 * first, calculate pm_stats updates.  resident count will not
-		 * change since we are replacing/changing a valid mapping.
-		 * wired count might change...
-		 */
-		pmap->pm_stats.wired_count +=
-		    ((npte & PG_W) ? 1 : 0 - (opte & PG_W) ? 1 : 0);
-
 		npte |= (opte & PG_PVLIST);
 
 		/* zap! */
@@ -3597,17 +3602,30 @@
 		if (domid != DOMID_SELF) {
 			int s = splvm();
 			opte = *ptep;
-			xpq_update_foreign((pt_entry_t *)vtomach((vaddr_t)ptep),
-			    npte, domid);
+			error = xpq_update_foreign(
+			    (pt_entry_t *)vtomach((vaddr_t)ptep), npte, domid);
 			splx(s);
+			if (error)
+			    goto out;
 		} else
 #endif /* XEN */
 			opte = pmap_pte_testset(ptep, npte);
 
 		/*
+		 * calculate pm_stats updates.  resident count will not
+		 * change since we are replacing/changing a valid mapping.
+		 * wired count might change...
+		 */
+		pmap->pm_stats.wired_count +=
+		    ((npte & PG_W) ? 1 : 0 - (opte & PG_W) ? 1 : 0);
+
+		/*
 		 * if this is on the PVLIST, sync R/M bit
 		 */
 		if (opte & PG_PVLIST) {
+#ifdef XEN
+			KASSERT(domid == DOMID_SELF);
+#endif
 			pg = PHYS_TO_VM_PAGE(pa);
 #ifdef DIAGNOSTIC
 			if (pg == NULL)
@@ -3683,8 +3701,34 @@
 			/* new_pvh is NULL if page will not be managed */
 			pmap_lock_pvhs(old_pvh, new_pvh);
 
-			/* zap! */
-			opte = pmap_pte_testset(ptep, npte);
+#ifdef XEN
+			if (domid != DOMID_SELF) {
+				int s = splvm();
+				opte = *ptep;
+				error = xpq_update_foreign(
+				    (pt_entry_t *)vtomach((vaddr_t)ptep),
+				    npte, domid);
+				if (error) {
+					pmap->pm_stats.wired_count -=
+					    ((npte & PG_W) ? 1 : 0 -
+					    (opte & PG_W) ? 1 : 0);
+					splx(s);
+					mutex_spin_exit(&old_pvh->pvh_lock);
+					goto out;
+				}
+				splx(s);
+
+#if 1
+				KASSERT(pg->uobject == NULL);
+				simple_lock(&pg->uanon->an_lock);
+				uvm_lock_pageq();
+				uvm_pagefree(pg);
+				uvm_unlock_pageq();
+				simple_unlock(&pg->uanon->an_lock);
+#endif
+			} else
+#endif /* XEN */
+				opte = pmap_pte_testset(ptep, npte); /* zap! */
 
 			pve = pmap_remove_pv(old_pvh, pmap, va);
 			KASSERT(pve != 0);
@@ -3707,23 +3751,38 @@
 			ptp->wire_count++;
 	}
 
-	if (new_pvh) {
-		mutex_spin_enter(&new_pvh->pvh_lock);
-		pmap_enter_pv(new_pvh, pve, pmap, va, ptp);
-		mutex_spin_exit(&new_pvh->pvh_lock);
-	}
 
 #ifdef XEN
 	if (domid != DOMID_SELF) {
 		int s = splvm();
 		opte = *ptep;
-		xpq_update_foreign((pt_entry_t *)vtomach((vaddr_t)ptep),
+		error = xpq_update_foreign((pt_entry_t *)vtomach((vaddr_t)ptep),
 		    npte, domid);
 		splx(s);
+		if (error) {
+			if (pmap_valid_entry(opte)) {
+				pmap->pm_stats.wired_count -=
+				    ((npte & PG_W) ? 1 : 0 -
+				    (opte & PG_W) ? 1 : 0);
+			} else {	/* opte not valid */
+				pmap->pm_stats.resident_count++;
+				if (wired) 
+					pmap->pm_stats.wired_count++;
+				if (ptp)
+					ptp->wire_count++;
+			}
+			goto out;
+		}
 	} else
 #endif /* XEN */
 		opte = pmap_pte_testset(ptep, npte);   /* zap! */
 
+	if (new_pvh) {
+		mutex_spin_enter(&new_pvh->pvh_lock);
+		pmap_enter_pv(new_pvh, pve, pmap, va, ptp);
+		mutex_spin_exit(&new_pvh->pvh_lock);
+	}
+
 shootdown_test:
 	/* Update page attributes if needed */
 	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
