Subject: pmap.c hacking...
To: None <port-amd64@netbsd.org>
From: Greg Oster <oster@cs.usask.ca>
List: port-amd64
Date: 03/28/2007 20:47:38
This is a multipart MIME message.

--==_Exmh_1175121370_267330
Content-Type: text/plain; charset=us-ascii


Hi

In an effort to reduce the amount of time spent in "system" when 
doing a pkgsrc build, I've been mucking around with pmap.c on amd64.  
One of the changes I've made is to have one page of 
pmap_tlb_shootdown_job's per CPU.  This eliminates the global variable 
used to hold the "free pool" of such jobs, and also eliminates all 
the locking contention associated with that free pool.  From the 
testing I've done, performance for building a set of 166 packages 
('time make -k -j 8') goes from this (baseline):
 
 18925.7u 25439.2s 3:05:03.98 399.5% 0+0k 229+2870194io 3371pf+57w

to this (when just doubling the number of shootdown job slots available 
per CPU):

 18776.5u 23900.5s 2:56:29.96 402.9% 0+0k 292+2864111io 3374pf+0w

to this (double the number of job slots, and eliminate the global 
"free queue"):

 17941.4u 20939.2s 2:43:56.35 395.2% 0+0k 6048+2639046io 6331pf+0w

with my most recent tweaks to pmap.c (included here).

Comments are more than welcome (yes, the diff would need to be 
cleaned up before it could be checked in :) )  A review by someone
more pmap/amd64-savvy would certainly be appreciated. :)  I suspect 
similar changes could be made to the i386 pmap.c. (I just havn't made 
time to make the changes there too and do testing)

Later...

Greg Oster



--==_Exmh_1175121370_267330
Content-Type: text/plain ; name="pmap.c.diff"; charset=us-ascii
Content-Description: pmap.c.diff

--- pmap.c.orig	2007-03-27 09:17:49.000000000 -0600
+++ pmap.c	2007-03-28 15:49:22.000000000 -0600
@@ -359,19 +359,24 @@ struct pmap_tlb_shootdown_q {
 	__cpu_simple_lock_t pq_slock;	/* spin lock on queue */
 	int pq_flushg;		/* pending flush global */
 	int pq_flushu;		/* pending flush user */
+	struct pmap_tlb_shootdown_job *pj_free;
+	union pmap_tlb_shootdown_job_al *pj_page;
 } pmap_tlb_shootdown_q[X86_MAXPROCS];
 
+#if 0
 #define	PMAP_TLB_MAXJOBS	16
+#endif
+#define	PMAP_TLB_MAXJOBS	32
 
 void	pmap_tlb_shootdown_q_drain __P((struct pmap_tlb_shootdown_q *));
 struct pmap_tlb_shootdown_job *pmap_tlb_shootdown_job_get
 	    __P((struct pmap_tlb_shootdown_q *));
 void	pmap_tlb_shootdown_job_put __P((struct pmap_tlb_shootdown_q *,
 	    struct pmap_tlb_shootdown_job *));
-
+#if 0
 __cpu_simple_lock_t pmap_tlb_shootdown_job_lock;
 union pmap_tlb_shootdown_job_al *pj_page, *pj_free;
-
+#endif
 /*
  * global data structures
  */
@@ -1089,7 +1094,9 @@ pmap_bootstrap(kva_start)
 	 * Initialize the TLB shootdown queues.
 	 */
 
+#if 0
 	__cpu_simple_lock_init(&pmap_tlb_shootdown_job_lock);
+#endif
 
 	for (i = 0; i < X86_MAXPROCS; i++) {
 		TAILQ_INIT(&pmap_tlb_shootdown_q[i].pq_head);
@@ -1148,6 +1155,9 @@ pmap_prealloc_lowmem_ptps(void)
 void
 pmap_init()
 {
+	struct pmap_tlb_shootdown_q *pq;
+	struct cpu_info *ci;
+	CPU_INFO_ITERATOR cii;
 	int npages, lcv, i;
 	vaddr_t addr;
 	vsize_t s;
@@ -1212,17 +1222,20 @@ pmap_init()
 
 	pv_nfpvents = 0;
 
-	pj_page = (void *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED);
-	if (pj_page == NULL)
-		panic("pmap_init: pj_page");
-
-	for (i = 0;
-	     i < (PAGE_SIZE / sizeof (union pmap_tlb_shootdown_job_al) - 1);
-	     i++)
-		pj_page[i].pja_job.pj_nextfree = &pj_page[i + 1].pja_job;
-	pj_page[i].pja_job.pj_nextfree = NULL;
-	pj_free = &pj_page[0];
+	for (CPU_INFO_FOREACH(cii, ci)) {
+		pq = &pmap_tlb_shootdown_q[ci->ci_cpuid];
+		pq->pj_page = (void *)uvm_km_alloc(kernel_map, PAGE_SIZE, 
+						   0, UVM_KMF_WIRED);
+		if (pq->pj_page == NULL)
+			panic("pmap_init: pj_page");
 
+		for (i = 0;
+		     i < (PAGE_SIZE / sizeof (union pmap_tlb_shootdown_job_al) - 1);
+		     i++)
+			pq->pj_page[i].pja_job.pj_nextfree = &pq->pj_page[i + 1].pja_job;
+		pq->pj_page[i].pja_job.pj_nextfree = NULL;
+		pq->pj_free = (struct pmap_tlb_shootdown_job *)&(pq->pj_page[0]);
+	}
 	/*
 	 * done: pmap module is up (and ready for business)
 	 */
@@ -3660,7 +3673,6 @@ pmap_do_tlb_shootdown(struct cpu_info *s
 			if ((!pq->pq_flushu && pmap_is_curpmap(pj->pj_pmap)) ||
 			    (pj->pj_pte & pmap_pg_g))
 				pmap_update_pg(pj->pj_va);
-
 			pmap_tlb_shootdown_job_put(pq, pj);
 		}
 
@@ -3717,22 +3729,12 @@ pmap_tlb_shootdown_job_get(pq)
 	if (pq->pq_count >= PMAP_TLB_MAXJOBS)
 		return (NULL);
 
-#ifdef MULTIPROCESSOR
-	__cpu_simple_lock(&pmap_tlb_shootdown_job_lock);
-#endif
-	if (pj_free == NULL) {
-#ifdef MULTIPROCESSOR
-		__cpu_simple_unlock(&pmap_tlb_shootdown_job_lock);
-#endif
+	if (pq->pj_free == NULL) {
 		return NULL;
+	} else {
+		pj = pq->pj_free;
+		pq->pj_free = pj->pj_nextfree;
 	}
-	pj = &pj_free->pja_job;
-	pj_free =
-	    (union pmap_tlb_shootdown_job_al *)pj_free->pja_job.pj_nextfree;
-#ifdef MULTIPROCESSOR
-	__cpu_simple_unlock(&pmap_tlb_shootdown_job_lock);
-#endif
-
 	pq->pq_count++;
 	return (pj);
 }
@@ -3754,14 +3756,7 @@ pmap_tlb_shootdown_job_put(pq, pj)
 	if (pq->pq_count == 0)
 		panic("pmap_tlb_shootdown_job_put: queue length inconsistency");
 #endif
-#ifdef MULTIPROCESSOR
-	__cpu_simple_lock(&pmap_tlb_shootdown_job_lock);
-#endif
-	pj->pj_nextfree = &pj_free->pja_job;
-	pj_free = (union pmap_tlb_shootdown_job_al *)pj;
-#ifdef MULTIPROCESSOR
-	__cpu_simple_unlock(&pmap_tlb_shootdown_job_lock);
-#endif
-
+	pj->pj_nextfree = pq->pj_free;
+	pq->pj_free = pj;
 	pq->pq_count--;
 }

--==_Exmh_1175121370_267330--