Subject: pmap.c hacking...
To: None <port-amd64@netbsd.org>
From: Greg Oster <oster@cs.usask.ca>
List: port-amd64
Date: 03/28/2007 20:47:38
This is a multipart MIME message.
--==_Exmh_1175121370_267330
Content-Type: text/plain; charset=us-ascii
Hi
In an effort to reduce the amount of time spent in "system" when
doing a pkgsrc build, I've been mucking around with pmap.c on amd64.
One of the changes I've made is to have one page of
pmap_tlb_shootdown_job's per CPU. This eliminates the global variable
used to hold the "free pool" of such jobs, and also eliminates all
the locking contention associated with that free pool. From the
testing I've done, performance for building a set of 166 packages
('time make -k -j 8') goes from this (baseline):
18925.7u 25439.2s 3:05:03.98 399.5% 0+0k 229+2870194io 3371pf+57w
to this (when just doubling the number of shootdown job slots available
per CPU):
18776.5u 23900.5s 2:56:29.96 402.9% 0+0k 292+2864111io 3374pf+0w
to this (double the number of job slots, and eliminate the global
"free queue"):
17941.4u 20939.2s 2:43:56.35 395.2% 0+0k 6048+2639046io 6331pf+0w
with my most recent tweaks to pmap.c (included here).
Comments are more than welcome (yes, the diff would need to be
cleaned up before it could be checked in :) ) A review by someone
more pmap/amd64-savvy would certainly be appreciated. :) I suspect
similar changes could be made to the i386 pmap.c. (I just havn't made
time to make the changes there too and do testing)
Later...
Greg Oster
--==_Exmh_1175121370_267330
Content-Type: text/plain ; name="pmap.c.diff"; charset=us-ascii
Content-Description: pmap.c.diff
--- pmap.c.orig 2007-03-27 09:17:49.000000000 -0600
+++ pmap.c 2007-03-28 15:49:22.000000000 -0600
@@ -359,19 +359,24 @@ struct pmap_tlb_shootdown_q {
__cpu_simple_lock_t pq_slock; /* spin lock on queue */
int pq_flushg; /* pending flush global */
int pq_flushu; /* pending flush user */
+ struct pmap_tlb_shootdown_job *pj_free;
+ union pmap_tlb_shootdown_job_al *pj_page;
} pmap_tlb_shootdown_q[X86_MAXPROCS];
+#if 0
#define PMAP_TLB_MAXJOBS 16
+#endif
+#define PMAP_TLB_MAXJOBS 32
void pmap_tlb_shootdown_q_drain __P((struct pmap_tlb_shootdown_q *));
struct pmap_tlb_shootdown_job *pmap_tlb_shootdown_job_get
__P((struct pmap_tlb_shootdown_q *));
void pmap_tlb_shootdown_job_put __P((struct pmap_tlb_shootdown_q *,
struct pmap_tlb_shootdown_job *));
-
+#if 0
__cpu_simple_lock_t pmap_tlb_shootdown_job_lock;
union pmap_tlb_shootdown_job_al *pj_page, *pj_free;
-
+#endif
/*
* global data structures
*/
@@ -1089,7 +1094,9 @@ pmap_bootstrap(kva_start)
* Initialize the TLB shootdown queues.
*/
+#if 0
__cpu_simple_lock_init(&pmap_tlb_shootdown_job_lock);
+#endif
for (i = 0; i < X86_MAXPROCS; i++) {
TAILQ_INIT(&pmap_tlb_shootdown_q[i].pq_head);
@@ -1148,6 +1155,9 @@ pmap_prealloc_lowmem_ptps(void)
void
pmap_init()
{
+ struct pmap_tlb_shootdown_q *pq;
+ struct cpu_info *ci;
+ CPU_INFO_ITERATOR cii;
int npages, lcv, i;
vaddr_t addr;
vsize_t s;
@@ -1212,17 +1222,20 @@ pmap_init()
pv_nfpvents = 0;
- pj_page = (void *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED);
- if (pj_page == NULL)
- panic("pmap_init: pj_page");
-
- for (i = 0;
- i < (PAGE_SIZE / sizeof (union pmap_tlb_shootdown_job_al) - 1);
- i++)
- pj_page[i].pja_job.pj_nextfree = &pj_page[i + 1].pja_job;
- pj_page[i].pja_job.pj_nextfree = NULL;
- pj_free = &pj_page[0];
+ for (CPU_INFO_FOREACH(cii, ci)) {
+ pq = &pmap_tlb_shootdown_q[ci->ci_cpuid];
+ pq->pj_page = (void *)uvm_km_alloc(kernel_map, PAGE_SIZE,
+ 0, UVM_KMF_WIRED);
+ if (pq->pj_page == NULL)
+ panic("pmap_init: pj_page");
+ for (i = 0;
+ i < (PAGE_SIZE / sizeof (union pmap_tlb_shootdown_job_al) - 1);
+ i++)
+ pq->pj_page[i].pja_job.pj_nextfree = &pq->pj_page[i + 1].pja_job;
+ pq->pj_page[i].pja_job.pj_nextfree = NULL;
+ pq->pj_free = (struct pmap_tlb_shootdown_job *)&(pq->pj_page[0]);
+ }
/*
* done: pmap module is up (and ready for business)
*/
@@ -3660,7 +3673,6 @@ pmap_do_tlb_shootdown(struct cpu_info *s
if ((!pq->pq_flushu && pmap_is_curpmap(pj->pj_pmap)) ||
(pj->pj_pte & pmap_pg_g))
pmap_update_pg(pj->pj_va);
-
pmap_tlb_shootdown_job_put(pq, pj);
}
@@ -3717,22 +3729,12 @@ pmap_tlb_shootdown_job_get(pq)
if (pq->pq_count >= PMAP_TLB_MAXJOBS)
return (NULL);
-#ifdef MULTIPROCESSOR
- __cpu_simple_lock(&pmap_tlb_shootdown_job_lock);
-#endif
- if (pj_free == NULL) {
-#ifdef MULTIPROCESSOR
- __cpu_simple_unlock(&pmap_tlb_shootdown_job_lock);
-#endif
+ if (pq->pj_free == NULL) {
return NULL;
+ } else {
+ pj = pq->pj_free;
+ pq->pj_free = pj->pj_nextfree;
}
- pj = &pj_free->pja_job;
- pj_free =
- (union pmap_tlb_shootdown_job_al *)pj_free->pja_job.pj_nextfree;
-#ifdef MULTIPROCESSOR
- __cpu_simple_unlock(&pmap_tlb_shootdown_job_lock);
-#endif
-
pq->pq_count++;
return (pj);
}
@@ -3754,14 +3756,7 @@ pmap_tlb_shootdown_job_put(pq, pj)
if (pq->pq_count == 0)
panic("pmap_tlb_shootdown_job_put: queue length inconsistency");
#endif
-#ifdef MULTIPROCESSOR
- __cpu_simple_lock(&pmap_tlb_shootdown_job_lock);
-#endif
- pj->pj_nextfree = &pj_free->pja_job;
- pj_free = (union pmap_tlb_shootdown_job_al *)pj;
-#ifdef MULTIPROCESSOR
- __cpu_simple_unlock(&pmap_tlb_shootdown_job_lock);
-#endif
-
+ pj->pj_nextfree = pq->pj_free;
+ pq->pj_free = pj;
pq->pq_count--;
}
--==_Exmh_1175121370_267330--