Subject: Re: pmap.c hacking...
To: Greg Oster <oster@cs.usask.ca>
From: Jason Thorpe <thorpej@shagadelic.org>
List: tech-kern
Date: 03/28/2007 20:56:29
On Mar 28, 2007, at 7:47 PM, Greg Oster wrote:

> In an effort to reduce the amount of time spent in "system" when
> doing a pkgsrc build, I've been mucking around with pmap.c on amd64.
> One of the changes I've made is to have one page of
> pmap_tlb_shootdown_job's per CPU.  This eliminates the global variable
> used to hold the "free pool" of such jobs, and also eliminates all
> the locking contention associated with that free pool.  From the
> testing I've done, performance for building a set of 166 packages
> ('time make -k -j 8') goes from this (baseline):

...

Cool.

This reminds me -- I really ought to change pool_cache to support per- 
CPU caches.  For another day...

Anyway, I think this is probably OK for now, but we should certainly  
revisit TLB shootdown in general.


>
>  18925.7u 25439.2s 3:05:03.98 399.5% 0+0k 229+2870194io 3371pf+57w
>
> to this (when just doubling the number of shootdown job slots  
> available
> per CPU):
>
>  18776.5u 23900.5s 2:56:29.96 402.9% 0+0k 292+2864111io 3374pf+0w
>
> to this (double the number of job slots, and eliminate the global
> "free queue"):
>
>  17941.4u 20939.2s 2:43:56.35 395.2% 0+0k 6048+2639046io 6331pf+0w
>
> with my most recent tweaks to pmap.c (included here).
>
> Comments are more than welcome (yes, the diff would need to be
> cleaned up before it could be checked in :) )  A review by someone
> more pmap/amd64-savvy would certainly be appreciated. :)  I suspect
> similar changes could be made to the i386 pmap.c. (I just havn't made
> time to make the changes there too and do testing)
>
> Later...
>
> Greg Oster
>
>
> --- pmap.c.orig	2007-03-27 09:17:49.000000000 -0600
> +++ pmap.c	2007-03-28 15:49:22.000000000 -0600
> @@ -359,19 +359,24 @@ struct pmap_tlb_shootdown_q {
>  	__cpu_simple_lock_t pq_slock;	/* spin lock on queue */
>  	int pq_flushg;		/* pending flush global */
>  	int pq_flushu;		/* pending flush user */
> +	struct pmap_tlb_shootdown_job *pj_free;
> +	union pmap_tlb_shootdown_job_al *pj_page;
>  } pmap_tlb_shootdown_q[X86_MAXPROCS];
>
> +#if 0
>  #define	PMAP_TLB_MAXJOBS	16
> +#endif
> +#define	PMAP_TLB_MAXJOBS	32
>
>  void	pmap_tlb_shootdown_q_drain __P((struct pmap_tlb_shootdown_q *));
>  struct pmap_tlb_shootdown_job *pmap_tlb_shootdown_job_get
>  	    __P((struct pmap_tlb_shootdown_q *));
>  void	pmap_tlb_shootdown_job_put __P((struct pmap_tlb_shootdown_q *,
>  	    struct pmap_tlb_shootdown_job *));
> -
> +#if 0
>  __cpu_simple_lock_t pmap_tlb_shootdown_job_lock;
>  union pmap_tlb_shootdown_job_al *pj_page, *pj_free;
> -
> +#endif
>  /*
>   * global data structures
>   */
> @@ -1089,7 +1094,9 @@ pmap_bootstrap(kva_start)
>  	 * Initialize the TLB shootdown queues.
>  	 */
>
> +#if 0
>  	__cpu_simple_lock_init(&pmap_tlb_shootdown_job_lock);
> +#endif
>
>  	for (i = 0; i < X86_MAXPROCS; i++) {
>  		TAILQ_INIT(&pmap_tlb_shootdown_q[i].pq_head);
> @@ -1148,6 +1155,9 @@ pmap_prealloc_lowmem_ptps(void)
>  void
>  pmap_init()
>  {
> +	struct pmap_tlb_shootdown_q *pq;
> +	struct cpu_info *ci;
> +	CPU_INFO_ITERATOR cii;
>  	int npages, lcv, i;
>  	vaddr_t addr;
>  	vsize_t s;
> @@ -1212,17 +1222,20 @@ pmap_init()
>
>  	pv_nfpvents = 0;
>
> -	pj_page = (void *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0,  
> UVM_KMF_WIRED);
> -	if (pj_page == NULL)
> -		panic("pmap_init: pj_page");
> -
> -	for (i = 0;
> -	     i < (PAGE_SIZE / sizeof (union pmap_tlb_shootdown_job_al) - 1);
> -	     i++)
> -		pj_page[i].pja_job.pj_nextfree = &pj_page[i + 1].pja_job;
> -	pj_page[i].pja_job.pj_nextfree = NULL;
> -	pj_free = &pj_page[0];
> +	for (CPU_INFO_FOREACH(cii, ci)) {
> +		pq = &pmap_tlb_shootdown_q[ci->ci_cpuid];
> +		pq->pj_page = (void *)uvm_km_alloc(kernel_map, PAGE_SIZE,
> +						   0, UVM_KMF_WIRED);
> +		if (pq->pj_page == NULL)
> +			panic("pmap_init: pj_page");
>
> +		for (i = 0;
> +		     i < (PAGE_SIZE / sizeof (union pmap_tlb_shootdown_job_al) -  
> 1);
> +		     i++)
> +			pq->pj_page[i].pja_job.pj_nextfree = &pq->pj_page[i + 1].pja_job;
> +		pq->pj_page[i].pja_job.pj_nextfree = NULL;
> +		pq->pj_free = (struct pmap_tlb_shootdown_job *)&(pq->pj_page[0]);
> +	}
>  	/*
>  	 * done: pmap module is up (and ready for business)
>  	 */
> @@ -3660,7 +3673,6 @@ pmap_do_tlb_shootdown(struct cpu_info *s
>  			if ((!pq->pq_flushu && pmap_is_curpmap(pj->pj_pmap)) ||
>  			    (pj->pj_pte & pmap_pg_g))
>  				pmap_update_pg(pj->pj_va);
> -
>  			pmap_tlb_shootdown_job_put(pq, pj);
>  		}
>
> @@ -3717,22 +3729,12 @@ pmap_tlb_shootdown_job_get(pq)
>  	if (pq->pq_count >= PMAP_TLB_MAXJOBS)
>  		return (NULL);
>
> -#ifdef MULTIPROCESSOR
> -	__cpu_simple_lock(&pmap_tlb_shootdown_job_lock);
> -#endif
> -	if (pj_free == NULL) {
> -#ifdef MULTIPROCESSOR
> -		__cpu_simple_unlock(&pmap_tlb_shootdown_job_lock);
> -#endif
> +	if (pq->pj_free == NULL) {
>  		return NULL;
> +	} else {
> +		pj = pq->pj_free;
> +		pq->pj_free = pj->pj_nextfree;
>  	}
> -	pj = &pj_free->pja_job;
> -	pj_free =
> -	    (union pmap_tlb_shootdown_job_al *)pj_free->pja_job.pj_nextfree;
> -#ifdef MULTIPROCESSOR
> -	__cpu_simple_unlock(&pmap_tlb_shootdown_job_lock);
> -#endif
> -
>  	pq->pq_count++;
>  	return (pj);
>  }
> @@ -3754,14 +3756,7 @@ pmap_tlb_shootdown_job_put(pq, pj)
>  	if (pq->pq_count == 0)
>  		panic("pmap_tlb_shootdown_job_put: queue length inconsistency");
>  #endif
> -#ifdef MULTIPROCESSOR
> -	__cpu_simple_lock(&pmap_tlb_shootdown_job_lock);
> -#endif
> -	pj->pj_nextfree = &pj_free->pja_job;
> -	pj_free = (union pmap_tlb_shootdown_job_al *)pj;
> -#ifdef MULTIPROCESSOR
> -	__cpu_simple_unlock(&pmap_tlb_shootdown_job_lock);
> -#endif
> -
> +	pj->pj_nextfree = pq->pj_free;
> +	pq->pj_free = pj;
>  	pq->pq_count--;
>  }

-- thorpej