Subject: Possible fix for resets under heavy load
To: port-amd64@netbsd.org
From: Christopher SEKIYA <wileyc@rezrov.net>
List: port-amd64
Date: 06/03/2004 11:07:00
All,

For those who were experiencing spontaneous reboots under heavy load, please
try the appended patch.  It brings the amd64 pmap into closer alignment with
the i386 pmap: the pv lists become splay trees (using the <sys/tree.h> SPLAY
macros), and the per-queue CPU simple-locks are now taken unconditionally
rather than only under MULTIPROCESSOR.  The patch also skips TLB shootdowns
for PTEs whose accessed bit (PG_U) was never set, turns a few "unmanaged
page" warnings into panics, and removes some i386-specific cruft.
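
For anyone who hasn't used the <sys/tree.h> SPLAY macros, here is a minimal
userland sketch (assuming a BSD <sys/tree.h>) of the pattern the patch
introduces.  The demo_* names are made up for illustration; only the macro
usage mirrors the new pv_entry code.

	#include <sys/tree.h>
	#include <stdio.h>

	struct demo_entry {
		SPLAY_ENTRY(demo_entry) node;	/* embedded tree linkage */
		unsigned long pmap;		/* stand-in for pv_pmap */
		unsigned long va;		/* stand-in for pv_va */
	};

	/* Same shape as pv_compare: order by pmap, then by va. */
	static int
	demo_compare(struct demo_entry *a, struct demo_entry *b)
	{
		if (a->pmap != b->pmap)
			return (a->pmap < b->pmap ? -1 : 1);
		if (a->va != b->va)
			return (a->va < b->va ? -1 : 1);
		return (0);
	}

	SPLAY_HEAD(demotree, demo_entry);
	SPLAY_PROTOTYPE(demotree, demo_entry, node, demo_compare);
	SPLAY_GENERATE(demotree, demo_entry, node, demo_compare);

	int
	main(void)
	{
		struct demotree head = SPLAY_INITIALIZER(&head);
		struct demo_entry e1 = { .pmap = 1, .va = 0x2000 };
		struct demo_entry e2 = { .pmap = 1, .va = 0x1000 };
		struct demo_entry key, *found;

		SPLAY_INSERT(demotree, &head, &e1);
		SPLAY_INSERT(demotree, &head, &e2);

		/*
		 * Amortized O(log n) lookup; this is what replaces the
		 * linear list walk in pmap_remove_pv().
		 */
		key.pmap = 1;
		key.va = 0x1000;
		found = SPLAY_FIND(demotree, &head, &key);
		printf("found va %#lx\n", found->va);

		SPLAY_REMOVE(demotree, &head, found);
		return (0);
	}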

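The shootdown filtering leans on the x86 accessed bit: the CPU sets PG_U in
a PTE when the translation is first loaded into a TLB, so a PTE with PG_U
still clear cannot be cached on any CPU and needs no IPI.  Condensed into a
hypothetical helper (needs_shootdown is not in the patch; the bit values
are the standard x86 PTE valid/accessed bits):

	#define PG_V	0x0000000000000001UL	/* valid (present) */
	#define PG_U	0x0000000000000020UL	/* used: set by MMU on access */

	/*
	 * Shoot down only mappings that were valid and have actually been
	 * loaded into a TLB somewhere; anything else is wasted IPI traffic.
	 * This is the pmap_kremove() form of the test.
	 */
	static int
	needs_shootdown(unsigned long opte)
	{
		return ((opte & (PG_V | PG_U)) == (PG_V | PG_U));
	}
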
It seems to do the right thing on my machine.

If there is no objection, I'll commit it over the weekend and request a
pullup.

-- Chris
	GPG key FEB9DE7F (91AF 4534 4529 4BCC 31A5  938E 023E EEFB FEB9 DE7F)


Index: amd64/pmap.c
===================================================================
RCS file: /cvsroot/src/sys/arch/amd64/amd64/pmap.c,v
retrieving revision 1.9.2.1
diff -u -r1.9.2.1 pmap.c
--- amd64/pmap.c	1 Jun 2004 04:34:52 -0000	1.9.2.1
+++ amd64/pmap.c	2 Jun 2004 08:00:37 -0000
@@ -435,6 +435,24 @@
 #define PVE_HIWAT (PVE_LOWAT + (PVE_PER_PVPAGE * 2))
 					/* high water mark */
 
+static __inline int
+pv_compare(struct pv_entry *a, struct pv_entry *b)
+{
+	if (a->pv_pmap < b->pv_pmap)
+		return (-1);
+	else if (a->pv_pmap > b->pv_pmap)
+		return (1);
+	else if (a->pv_va < b->pv_va)
+		return (-1);
+	else if (a->pv_va > b->pv_va)
+		return (1);
+	else
+		return (0);
+}
+
+SPLAY_PROTOTYPE(pvtree, pv_entry, pv_node, pv_compare);
+SPLAY_GENERATE(pvtree, pv_entry, pv_node, pv_compare);
+
 /*
  * linked list of all non-kernel pmaps
  */
@@ -693,13 +711,9 @@
 		if (pmap_is_active(pmap, ci->ci_cpuid)) {
 			pq = &pmap_tlb_shootdown_q[ci->ci_cpuid];
 			s = splipi();
-#ifdef MULTIPROCESSOR
 			__cpu_simple_lock(&pq->pq_slock);
-#endif
 			pq->pq_flushu++;
-#ifdef MULTIPROCESSOR
 			__cpu_simple_unlock(&pq->pq_slock);
-#endif
 			splx(s);
 			x86_send_ipi(ci, X86_IPI_TLB);
 		}
@@ -867,7 +881,8 @@
 			panic("pmap_kremove: PG_PVLIST mapping for 0x%lx",
 			      va);
 #endif
-		pmap_tlb_shootdown(pmap_kernel(), va, opte, &cpumask);
+		if ((opte & (PG_V | PG_U)) == (PG_V | PG_U))
+			pmap_tlb_shootdown(pmap_kernel(), va, opte, &cpumask);
 	}
 	pmap_tlb_shootnow(cpumask);
 }
@@ -1094,12 +1109,6 @@
 	idt_paddr = avail_start;			/* steal a page */
 	avail_start += 2 * PAGE_SIZE;
 
-#if defined(I586_CPU)
-	/* pentium f00f bug stuff */
-	pentium_idt_vaddr = virtual_avail;		/* don't need pte */
-	virtual_avail += PAGE_SIZE; pte++;
-#endif
-
 #ifdef _LP64
 	/*
 	 * Grab a page below 4G for things that need it (i.e.
@@ -1340,7 +1349,7 @@
 		}
 		pv = pvpage->pvinfo.pvpi_pvfree;
 		KASSERT(pv);
-		pvpage->pvinfo.pvpi_pvfree = pv->pv_next;
+		pvpage->pvinfo.pvpi_pvfree = SPLAY_RIGHT(pv, pv_node);
 		pv_nfpvents--;  /* took one from pool */
 	} else {
 		pv = NULL;		/* need more of them */
@@ -1400,7 +1409,7 @@
 		pvpage->pvinfo.pvpi_nfree--;	/* can't go to zero */
 		pv = pvpage->pvinfo.pvpi_pvfree;
 		KASSERT(pv);
-		pvpage->pvinfo.pvpi_pvfree = pv->pv_next;
+		pvpage->pvinfo.pvpi_pvfree = SPLAY_RIGHT(pv, pv_node);
 		pv_nfpvents--;  /* took one from pool */
 		return(pv);
 	}
@@ -1465,7 +1474,8 @@
 	pvp->pvinfo.pvpi_pvfree = NULL;
 	pvp->pvinfo.pvpi_nfree = tofree;
 	for (lcv = 0 ; lcv < tofree ; lcv++) {
-		pvp->pvents[lcv].pv_next = pvp->pvinfo.pvpi_pvfree;
+		SPLAY_RIGHT(&pvp->pvents[lcv], pv_node) =
+			pvp->pvinfo.pvpi_pvfree;
 		pvp->pvinfo.pvpi_pvfree = &pvp->pvents[lcv];
 	}
 	if (need_entry)
@@ -1501,7 +1511,7 @@
 	}
 
 	/* free it */
-	pv->pv_next = pvp->pvinfo.pvpi_pvfree;
+	SPLAY_RIGHT(pv, pv_node) = pvp->pvinfo.pvpi_pvfree;
 	pvp->pvinfo.pvpi_pvfree = pv;
 
 	/*
@@ -1555,7 +1565,7 @@
 	simple_lock(&pvalloc_lock);
 
 	for ( /* null */ ; pvs != NULL ; pvs = nextpv) {
-		nextpv = pvs->pv_next;
+		nextpv = SPLAY_RIGHT(pvs, pv_node);
 		pmap_free_pv_doit(pvs);
 	}
 
@@ -1652,10 +1662,7 @@
 	pve->pv_pmap = pmap;
 	pve->pv_va = va;
 	pve->pv_ptp = ptp;			/* NULL for kernel pmap */
-	simple_lock(&pvh->pvh_lock);		/* lock pv_head */
-	pve->pv_next = pvh->pvh_list;		/* add to ... */
-	pvh->pvh_list = pve;			/* ... locked list */
-	simple_unlock(&pvh->pvh_lock);		/* unlock, done! */
+	SPLAY_INSERT(pvtree, &pvh->pvh_root, pve); /* add to locked tree */
 }
 
 /*
@@ -1674,18 +1681,14 @@
 	struct pmap *pmap;
 	vaddr_t va;
 {
-	struct pv_entry *pve, **prevptr;
+	struct pv_entry tmp, *pve;
 
-	prevptr = &pvh->pvh_list;		/* previous pv_entry pointer */
-	pve = *prevptr;
-	while (pve) {
-		if (pve->pv_pmap == pmap && pve->pv_va == va) {	/* match? */
-			*prevptr = pve->pv_next;		/* remove it! */
-			break;
-		}
-		prevptr = &pve->pv_next;		/* previous pointer */
-		pve = pve->pv_next;			/* advance */
-	}
+	tmp.pv_pmap = pmap;
+	tmp.pv_va = va;
+	pve = SPLAY_FIND(pvtree, &pvh->pvh_root, &tmp);
+	if (pve == NULL)
+		return (NULL);
+	SPLAY_REMOVE(pvtree, &pvh->pvh_root, pve);
 	return(pve);				/* return removed pve */
 }
 
@@ -1703,13 +1706,11 @@
 	    pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) {
 		return (pmap->pm_ptphint[lidx]);
 	}
-	if (lidx == 0)
-		pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
-	else {
-		simple_lock(&pmap->pm_obj[lidx].vmobjlock);
-		pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
-		simple_unlock(&pmap->pm_obj[lidx].vmobjlock);
-	}
+
+	simple_lock(&pmap->pm_obj[lidx].vmobjlock);
+	pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
+	simple_unlock(&pmap->pm_obj[lidx].vmobjlock);
+
 	return pg;
 }
 
@@ -1723,14 +1724,13 @@
 
 	obj = &pmap->pm_obj[lidx];
 	pmap->pm_stats.resident_count--;
-	if (lidx != 0)
-		simple_lock(&obj->vmobjlock);
+
+	simple_lock(&obj->vmobjlock);
 	if (pmap->pm_ptphint[lidx] == ptp)
 		pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq);
 	ptp->wire_count = 0;
 	uvm_pagefree(ptp);
-	if (lidx != 0)
-		simple_unlock(&obj->vmobjlock);
+	simple_unlock(&obj->vmobjlock);
 }
 
 static void
@@ -2456,7 +2456,8 @@
 			pmap->pm_stats.wired_count--;
 		pmap->pm_stats.resident_count--;
 
-		pmap_tlb_shootdown(pmap, startva, opte, cpumaskp);
+		if (opte & PG_U)
+			pmap_tlb_shootdown(pmap, startva, opte, cpumaskp);
 
 		if (ptp)
 			ptp->wire_count--;		/* dropping a PTE */
@@ -2491,7 +2492,7 @@
 		simple_unlock(&vm_physmem[bank].pmseg.pvhead[off].pvh_lock);
 
 		if (pve) {
-			pve->pv_next = pv_tofree;
+			SPLAY_RIGHT(pve, pv_node) = pv_tofree;
 			pv_tofree = pve;
 		}
 
@@ -2541,7 +2542,8 @@
 	if (ptp)
 		ptp->wire_count--;		/* dropping a PTE */
 
-	pmap_tlb_shootdown(pmap, va, opte, cpumaskp);
+	if (opte & PG_U)
+		pmap_tlb_shootdown(pmap, va, opte, cpumaskp);
 
 	/*
 	 * if we are not on a pv_head list we are done.
@@ -2733,7 +2735,7 @@
 {
 	int bank, off;
 	struct pv_head *pvh;
-	struct pv_entry *pve, *npve, **prevptr, *killlist = NULL;
+	struct pv_entry *pve, *npve, *killlist = NULL;
 	pt_entry_t *ptes, opte;
 	pd_entry_t **pdes;
 #ifdef DIAGNOSTIC
@@ -2743,13 +2745,11 @@
 
 	/* XXX: vm_page should either contain pv_head or have a pointer to it */
 	bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off);
-	if (bank == -1) {
-		printf("pmap_page_remove: unmanaged page?\n");
-		return;
-	}
+	if (bank == -1)
+		panic("pmap_page_remove: unmanaged page?");
 
 	pvh = &vm_physmem[bank].pmseg.pvhead[off];
-	if (pvh->pvh_list == NULL) {
+	if (SPLAY_ROOT(&pvh->pvh_root) == NULL) {
 		return;
 	}
 
@@ -2759,9 +2759,8 @@
 	/* XXX: needed if we hold head->map lock? */
 	simple_lock(&pvh->pvh_lock);
 
-	for (prevptr = &pvh->pvh_list, pve = pvh->pvh_list;
-	    pve != NULL; pve = npve) {
-		npve = pve->pv_next;
+	for (pve = SPLAY_MIN(pvtree, &pvh->pvh_root); pve != NULL; pve = npve) {
+		npve = SPLAY_NEXT(pvtree, &pvh->pvh_root, pve);
 		pmap_map_ptes(pve->pv_pmap, &ptes, &pdes);	/* locks pmap */
 
 #ifdef DIAGNOSTIC
@@ -2785,7 +2784,9 @@
 			pve->pv_pmap->pm_stats.wired_count--;
 		pve->pv_pmap->pm_stats.resident_count--;
 
-		pmap_tlb_shootdown(pve->pv_pmap, pve->pv_va, opte, &cpumask);
+		if (opte & PG_U)
+			pmap_tlb_shootdown(pve->pv_pmap, pve->pv_va, opte,
+				&cpumask);
 
 		/* sync R/M bits */
 		vm_physmem[bank].pmseg.attrs[off] |= (opte & (PG_U|PG_M));
@@ -2799,12 +2800,11 @@
 			}
 		}
 		pmap_unmap_ptes(pve->pv_pmap);		/* unlocks pmap */
-		*prevptr = npve;			/* remove it */
-		pve->pv_next = killlist;		/* mark it for death */
+		SPLAY_REMOVE(pvtree, &pvh->pvh_root, pve); /* remove it */
+		SPLAY_RIGHT(pve, pv_node) = killlist;	/* mark it for death */
 		killlist = pve;
 	}
 	pmap_free_pvs(NULL, killlist);
-	pvh->pvh_list = NULL;
 	simple_unlock(&pvh->pvh_lock);
 	PMAP_HEAD_TO_MAP_UNLOCK();
 	pmap_tlb_shootnow(cpumask);
@@ -2837,10 +2837,8 @@
 
 	/* XXX: vm_page should either contain pv_head or have a pointer to it */
 	bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off);
-	if (bank == -1) {
-		printf("pmap_test_attrs: unmanaged page?\n");
-		return(FALSE);
-	}
+	if (bank == -1)
+		panic("pmap_test_attrs: unmanaged page?");
 
 	/*
 	 * before locking: see if attributes are already set and if so,
@@ -2853,7 +2851,7 @@
 
 	/* test to see if there is a list before bothering to lock */
 	pvh = &vm_physmem[bank].pmseg.pvhead[off];
-	if (pvh->pvh_list == NULL) {
+	if (SPLAY_ROOT(&pvh->pvh_root) == NULL) {
 		return(FALSE);
 	}
 
@@ -2862,8 +2860,9 @@
 	/* XXX: needed if we hold head->map lock? */
 	simple_lock(&pvh->pvh_lock);
 
-	for (pve = pvh->pvh_list; pve != NULL && (*myattrs & testbits) == 0;
-	     pve = pve->pv_next) {
+	for (pve = SPLAY_MIN(pvtree, &pvh->pvh_root);
+	     pve != NULL && (*myattrs & testbits) == 0;
+	     pve = SPLAY_NEXT(pvtree, &pvh->pvh_root, pve)) {
 		pmap_map_ptes(pve->pv_pmap, &ptes, &pdes);
 		pte = ptes[pl1_i(pve->pv_va)];
 		pmap_unmap_ptes(pve->pv_pmap);
@@ -2903,10 +2902,8 @@
 
 	/* XXX: vm_page should either contain pv_head or have a pointer to it */
 	bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off);
-	if (bank == -1) {
-		printf("pmap_change_attrs: unmanaged page?\n");
-		return(FALSE);
-	}
+	if (bank == -1)
+		panic("pmap_change_attrs: unmanaged page?");
 
 	PMAP_HEAD_TO_MAP_LOCK();
 	pvh = &vm_physmem[bank].pmseg.pvhead[off];
@@ -2917,7 +2914,7 @@
 	result = *myattrs & clearbits;
 	*myattrs &= ~clearbits;
 
-	for (pve = pvh->pvh_list; pve != NULL; pve = pve->pv_next) {
+	SPLAY_FOREACH(pve, pvtree, &pvh->pvh_root) {
 		pmap_map_ptes(pve->pv_pmap, &ptes, &pdes);	/* locks pmap */
 #ifdef DIAGNOSTIC
 		if (!pmap_pdes_valid(pve->pv_va, pdes, NULL))
@@ -3648,9 +3645,7 @@
 		if (ci != self && !(ci->ci_flags & CPUF_RUNNING))
 			continue;
 		pq = &pmap_tlb_shootdown_q[ci->ci_cpuid];
-#if defined(MULTIPROCESSOR)
 		__cpu_simple_lock(&pq->pq_slock);
-#endif
 
 		/*
 		 * If there's a global flush already queued, or a
@@ -3659,28 +3654,9 @@
 		 */
 		if (pq->pq_flushg > 0 ||
 		    (pq->pq_flushu > 0 && (pte & pmap_pg_g) == 0)) {
-#if defined(MULTIPROCESSOR)
 			__cpu_simple_unlock(&pq->pq_slock);
-#endif
-			continue;
-		}
-
-#ifdef I386_CPU
-		/*
-		 * i386 CPUs can't invalidate a single VA, only
-		 * flush the entire TLB, so don't bother allocating
-		 * jobs for them -- just queue a `flushu'.
-		 *
-		 * XXX note that this can be executed for non-i386
-		 * when called * early (before identifycpu() has set
-		 * cpu_class)
-		 */
-		if (cpu_class == CPUCLASS_386) {
-			pq->pq_flushu++;
-			*cpumaskp |= 1U << ci->ci_cpuid;
 			continue;
 		}
-#endif
 
 		pj = pmap_tlb_shootdown_job_get(pq);
 		pq->pq_pte |= pte;
@@ -3693,9 +3669,7 @@
 			 */
 			if (ci == self && pq->pq_count < PMAP_TLB_MAXJOBS) {
 				pmap_update_pg(va);
-#if defined(MULTIPROCESSOR)
 				__cpu_simple_unlock(&pq->pq_slock);
-#endif
 				continue;
 			} else {
 				if (pq->pq_pte & pmap_pg_g)
@@ -3717,9 +3691,7 @@
 			TAILQ_INSERT_TAIL(&pq->pq_head, pj, pj_list);
 			*cpumaskp |= 1U << ci->ci_cpuid;
 		}
-#if defined(MULTIPROCESSOR)
 		__cpu_simple_unlock(&pq->pq_slock);
-#endif
 	}
 	splx(s);
 }
@@ -3743,9 +3715,7 @@
 
 	s = splipi();
 
-#ifdef MULTIPROCESSOR
 	__cpu_simple_lock(&pq->pq_slock);
-#endif
 
 	if (pq->pq_flushg) {
 		COUNT(flushg);
@@ -3779,8 +3749,8 @@
 	for (CPU_INFO_FOREACH(cii, ci))
 		x86_atomic_clearbits_ul(&ci->ci_tlb_ipi_mask,
 		    (1U << cpu_id));
-	__cpu_simple_unlock(&pq->pq_slock);
 #endif
+	__cpu_simple_unlock(&pq->pq_slock);
 
 	splx(s);
 }
@@ -3825,9 +3795,7 @@
 	if (pq->pq_count >= PMAP_TLB_MAXJOBS)
 		return (NULL);
 
-#ifdef MULTIPROCESSOR
 	__cpu_simple_lock(&pmap_tlb_shootdown_job_lock);
-#endif
 	if (pj_free == NULL) {
 #ifdef MULTIPROCESSOR
 		__cpu_simple_unlock(&pmap_tlb_shootdown_job_lock);
@@ -3837,9 +3805,7 @@
 	pj = &pj_free->pja_job;
 	pj_free =
 	    (union pmap_tlb_shootdown_job_al *)pj_free->pja_job.pj_nextfree;
-#ifdef MULTIPROCESSOR
 	__cpu_simple_unlock(&pmap_tlb_shootdown_job_lock);
-#endif
 
 	pq->pq_count++;
 	return (pj);
@@ -3862,14 +3828,10 @@
 	if (pq->pq_count == 0)
 		panic("pmap_tlb_shootdown_job_put: queue length inconsistency");
 #endif
-#ifdef MULTIPROCESSOR
 	__cpu_simple_lock(&pmap_tlb_shootdown_job_lock);
-#endif
 	pj->pj_nextfree = &pj_free->pja_job;
 	pj_free = (union pmap_tlb_shootdown_job_al *)pj;
-#ifdef MULTIPROCESSOR
 	__cpu_simple_unlock(&pmap_tlb_shootdown_job_lock);
-#endif
 
 	pq->pq_count--;
 }
Index: include/pmap.h
===================================================================
RCS file: /cvsroot/src/sys/arch/amd64/include/pmap.h,v
retrieving revision 1.1
diff -u -r1.1 pmap.h
--- include/pmap.h	26 Apr 2003 18:39:46 -0000	1.1
+++ include/pmap.h	2 Jun 2004 08:00:37 -0000
@@ -334,19 +334,12 @@
  * describes one mapping).
  */
 
-struct pv_entry;
-
-struct pv_head {
-	struct simplelock pvh_lock;	/* locks every pv on this list */
-	struct pv_entry *pvh_list;	/* head of list (locked by pvh_lock) */
-};
-
-struct pv_entry {			/* locked by its list's pvh_lock */
-	struct pv_entry *pv_next;	/* next entry */
-	struct pmap *pv_pmap;		/* the pmap */
-	vaddr_t pv_va;			/* the virtual address */
-	struct vm_page *pv_ptp;		/* the vm_page of the PTP */
-};
+struct pv_entry {			/* locked by its tree's pvh_lock */
+	SPLAY_ENTRY(pv_entry) pv_node;	/* splay-tree node */
+	struct pmap *pv_pmap;		/* the pmap */
+	vaddr_t pv_va;			/* the virtual address */
+	struct vm_page *pv_ptp;		/* the vm_page of the PTP */
+};
 
 /*
  * pv_entrys are dynamically allocated in chunks from a single page.
Index: include/vmparam.h
===================================================================
RCS file: /cvsroot/src/sys/arch/amd64/include/vmparam.h,v
retrieving revision 1.4
diff -u -r1.4 vmparam.h
--- include/vmparam.h	23 Mar 2004 18:54:32 -0000	1.4
+++ include/vmparam.h	2 Jun 2004 08:00:37 -0000
@@ -37,6 +37,8 @@
 #ifndef _VMPARAM_H_
 #define _VMPARAM_H_
 
+#include <sys/tree.h>
+
 /*
  * Machine dependent constants for 386.
  */
@@ -123,6 +125,27 @@
 
 #define __HAVE_PMAP_PHYSSEG
 
+#define __HAVE_VM_PAGE_MD
+#define VM_MDPAGE_INIT(pg)						\
+do {									\
+	memset(&(pg)->mdpage, 0, sizeof((pg)->mdpage));			\
+	simple_lock_init(&(pg)->mdpage.mp_pvhead.pvh_lock);		\
+	SPLAY_INIT(&(pg)->mdpage.mp_pvhead.pvh_root);			\
+} while (/*CONSTCOND*/ 0)
+
+struct pv_entry;
+
+struct pv_head {
+	struct simplelock pvh_lock;	/* locks every pv in this tree */
+	SPLAY_HEAD(pvtree, pv_entry) pvh_root;
+					/* head of tree (locked by pvh_lock) */
+};
+
+struct vm_page_md {
+	struct pv_head mp_pvhead;
+	int mp_attrs;
+};
+
 /*
  * pmap specific data stored in the vm_physmem[] array
  */