Subject: Re: pmap.c hacking...
To: Greg Oster <oster@cs.usask.ca>
From: Jason Thorpe <thorpej@shagadelic.org>
List: port-amd64
Date: 03/28/2007 20:56:29
On Mar 28, 2007, at 7:47 PM, Greg Oster wrote:
> In an effort to reduce the amount of time spent in "system" when
> doing a pkgsrc build, I've been mucking around with pmap.c on amd64.
> One of the changes I've made is to have one page of
> pmap_tlb_shootdown_job's per CPU. This eliminates the global variable
> used to hold the "free pool" of such jobs, and also eliminates all
> the locking contention associated with that free pool. From the
> testing I've done, performance for building a set of 166 packages
> ('time make -k -j 8') goes from this (baseline):
...
Cool.
This reminds me -- I really ought to change pool_cache to support per-
CPU caches. For another day...
Anyway, I think this is probably OK for now, but we should certainly
revisit TLB shootdown in general.
>
> 18925.7u 25439.2s 3:05:03.98 399.5% 0+0k 229+2870194io 3371pf+57w
>
> to this (when just doubling the number of shootdown job slots
> available
> per CPU):
>
> 18776.5u 23900.5s 2:56:29.96 402.9% 0+0k 292+2864111io 3374pf+0w
>
> to this (double the number of job slots, and eliminate the global
> "free queue"):
>
> 17941.4u 20939.2s 2:43:56.35 395.2% 0+0k 6048+2639046io 6331pf+0w
>
> with my most recent tweaks to pmap.c (included here).
>
> Comments are more than welcome (yes, the diff would need to be
> cleaned up before it could be checked in :) ) A review by someone
> more pmap/amd64-savvy would certainly be appreciated. :) I suspect
> similar changes could be made to the i386 pmap.c. (I just havn't made
> time to make the changes there too and do testing)
>
> Later...
>
> Greg Oster
>
>
> --- pmap.c.orig 2007-03-27 09:17:49.000000000 -0600
> +++ pmap.c 2007-03-28 15:49:22.000000000 -0600
> @@ -359,19 +359,24 @@ struct pmap_tlb_shootdown_q {
> __cpu_simple_lock_t pq_slock; /* spin lock on queue */
> int pq_flushg; /* pending flush global */
> int pq_flushu; /* pending flush user */
> + struct pmap_tlb_shootdown_job *pj_free;
> + union pmap_tlb_shootdown_job_al *pj_page;
> } pmap_tlb_shootdown_q[X86_MAXPROCS];
>
> +#if 0
> #define PMAP_TLB_MAXJOBS 16
> +#endif
> +#define PMAP_TLB_MAXJOBS 32
>
> void pmap_tlb_shootdown_q_drain __P((struct pmap_tlb_shootdown_q *));
> struct pmap_tlb_shootdown_job *pmap_tlb_shootdown_job_get
> __P((struct pmap_tlb_shootdown_q *));
> void pmap_tlb_shootdown_job_put __P((struct pmap_tlb_shootdown_q *,
> struct pmap_tlb_shootdown_job *));
> -
> +#if 0
> __cpu_simple_lock_t pmap_tlb_shootdown_job_lock;
> union pmap_tlb_shootdown_job_al *pj_page, *pj_free;
> -
> +#endif
> /*
> * global data structures
> */
> @@ -1089,7 +1094,9 @@ pmap_bootstrap(kva_start)
> * Initialize the TLB shootdown queues.
> */
>
> +#if 0
> __cpu_simple_lock_init(&pmap_tlb_shootdown_job_lock);
> +#endif
>
> for (i = 0; i < X86_MAXPROCS; i++) {
> TAILQ_INIT(&pmap_tlb_shootdown_q[i].pq_head);
> @@ -1148,6 +1155,9 @@ pmap_prealloc_lowmem_ptps(void)
> void
> pmap_init()
> {
> + struct pmap_tlb_shootdown_q *pq;
> + struct cpu_info *ci;
> + CPU_INFO_ITERATOR cii;
> int npages, lcv, i;
> vaddr_t addr;
> vsize_t s;
> @@ -1212,17 +1222,20 @@ pmap_init()
>
> pv_nfpvents = 0;
>
> - pj_page = (void *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
> UVM_KMF_WIRED);
> - if (pj_page == NULL)
> - panic("pmap_init: pj_page");
> -
> - for (i = 0;
> - i < (PAGE_SIZE / sizeof (union pmap_tlb_shootdown_job_al) - 1);
> - i++)
> - pj_page[i].pja_job.pj_nextfree = &pj_page[i + 1].pja_job;
> - pj_page[i].pja_job.pj_nextfree = NULL;
> - pj_free = &pj_page[0];
> + for (CPU_INFO_FOREACH(cii, ci)) {
> + pq = &pmap_tlb_shootdown_q[ci->ci_cpuid];
> + pq->pj_page = (void *)uvm_km_alloc(kernel_map, PAGE_SIZE,
> + 0, UVM_KMF_WIRED);
> + if (pq->pj_page == NULL)
> + panic("pmap_init: pj_page");
>
> + for (i = 0;
> + i < (PAGE_SIZE / sizeof (union pmap_tlb_shootdown_job_al) -
> 1);
> + i++)
> + pq->pj_page[i].pja_job.pj_nextfree = &pq->pj_page[i + 1].pja_job;
> + pq->pj_page[i].pja_job.pj_nextfree = NULL;
> + pq->pj_free = (struct pmap_tlb_shootdown_job *)&(pq->pj_page[0]);
> + }
> /*
> * done: pmap module is up (and ready for business)
> */
> @@ -3660,7 +3673,6 @@ pmap_do_tlb_shootdown(struct cpu_info *s
> if ((!pq->pq_flushu && pmap_is_curpmap(pj->pj_pmap)) ||
> (pj->pj_pte & pmap_pg_g))
> pmap_update_pg(pj->pj_va);
> -
> pmap_tlb_shootdown_job_put(pq, pj);
> }
>
> @@ -3717,22 +3729,12 @@ pmap_tlb_shootdown_job_get(pq)
> if (pq->pq_count >= PMAP_TLB_MAXJOBS)
> return (NULL);
>
> -#ifdef MULTIPROCESSOR
> - __cpu_simple_lock(&pmap_tlb_shootdown_job_lock);
> -#endif
> - if (pj_free == NULL) {
> -#ifdef MULTIPROCESSOR
> - __cpu_simple_unlock(&pmap_tlb_shootdown_job_lock);
> -#endif
> + if (pq->pj_free == NULL) {
> return NULL;
> + } else {
> + pj = pq->pj_free;
> + pq->pj_free = pj->pj_nextfree;
> }
> - pj = &pj_free->pja_job;
> - pj_free =
> - (union pmap_tlb_shootdown_job_al *)pj_free->pja_job.pj_nextfree;
> -#ifdef MULTIPROCESSOR
> - __cpu_simple_unlock(&pmap_tlb_shootdown_job_lock);
> -#endif
> -
> pq->pq_count++;
> return (pj);
> }
> @@ -3754,14 +3756,7 @@ pmap_tlb_shootdown_job_put(pq, pj)
> if (pq->pq_count == 0)
> panic("pmap_tlb_shootdown_job_put: queue length inconsistency");
> #endif
> -#ifdef MULTIPROCESSOR
> - __cpu_simple_lock(&pmap_tlb_shootdown_job_lock);
> -#endif
> - pj->pj_nextfree = &pj_free->pja_job;
> - pj_free = (union pmap_tlb_shootdown_job_al *)pj;
> -#ifdef MULTIPROCESSOR
> - __cpu_simple_unlock(&pmap_tlb_shootdown_job_lock);
> -#endif
> -
> + pj->pj_nextfree = pq->pj_free;
> + pq->pj_free = pj;
> pq->pq_count--;
> }
-- thorpej