NetBSD-Bugs archive


Re: lib/50791: NetBSD's malloc has performance issues



The following reply was made to PR lib/50791; it has been noted by GNATS.

From: Joerg Sonnenberger <joerg%britannica.bec.de@localhost>
To: gnats-bugs%NetBSD.org@localhost
Cc: lib-bug-people%netbsd.org@localhost, gnats-admin%netbsd.org@localhost,
	netbsd-bugs%netbsd.org@localhost, Andreas Gustafsson <gson%gson.org@localhost>
Subject: Re: lib/50791: NetBSD's malloc has performance issues
Date: Fri, 19 Feb 2016 04:23:59 +0100

 On Thu, Feb 18, 2016 at 04:15:00PM +0000, Andreas Gustafsson wrote:
 >  I wonder how much faster release builds would be with a newer
 >  jemalloc in libc...
 
 I somehow doubt that it makes a big difference. The provided trace is
 somewhat atypical, as roughly half of the allocations are between 64KB
 and 256KB. For jemalloc, those are large allocations, so they are
 served as runs of pages carved out of the larger 1MB chunks. In the
 single-threaded case, or when the process always runs on the same CPU,
 only a single arena is used. The search for the next free run with
 enough pages walks the chunks in address order, so most allocations
 end up scanning most of the chunks. Keeping the chunk tree ordered by
 the estimated size (in pages) of the largest free run avoids this
 worst case, at the cost of shuffling tree nodes around a bit more
 often. A proof of concept is attached. I haven't done any exhaustive
 testing for this...
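 
 As an illustration of the idea, here is a minimal standalone sketch
 (not the libc code; the chunk/chunk_tree types and function names are
 made up for the example) of what the attached patch does: key the
 red-black tree on the largest free run and descend it for the
 smallest chunk that still fits, instead of iterating in address
 order.
 
 #include <sys/tree.h>
 #include <stddef.h>
 #include <stdint.h>
 
 struct chunk {
 	RB_ENTRY(chunk) link;
 	size_t max_frun_npages;	/* pages in the largest free run */
 };
 
 /* Order by largest free run first, then by address as tie-breaker. */
 static int
 chunk_cmp(struct chunk *a, struct chunk *b)
 {
 	if (a->max_frun_npages < b->max_frun_npages)
 		return -1;
 	if (a->max_frun_npages > b->max_frun_npages)
 		return 1;
 	if ((uintptr_t)a < (uintptr_t)b)
 		return -1;
 	return a == b ? 0 : 1;
 }
 
 RB_HEAD(chunk_tree, chunk);
 RB_GENERATE_STATIC(chunk_tree, chunk, link, chunk_cmp)
 
 /*
  * Return the chunk with the smallest free run that still holds
  * need_npages, or NULL if no chunk has a large enough run.
  */
 static struct chunk *
 chunk_best_fit(struct chunk_tree *tree, size_t need_npages)
 {
 	struct chunk *best = NULL;
 	struct chunk *iter = RB_ROOT(tree);
 
 	while (iter != NULL) {
 		if (iter->max_frun_npages == need_npages)
 			return iter;		/* exact fit, can't do better */
 		if (iter->max_frun_npages < need_npages) {
 			iter = RB_RIGHT(iter, link);	/* too small, go bigger */
 			continue;
 		}
 		best = iter;			/* fits; try for a tighter fit */
 		iter = RB_LEFT(iter, link);
 	}
 	return best;
 }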
 
 Joerg
 
 [Attachment: jemalloc.c.diff]
 
 Index: jemalloc.c
 ===================================================================
 RCS file: /home/joerg/repo/netbsd/src/lib/libc/stdlib/jemalloc.c,v
 retrieving revision 1.39
 diff -u -p -r1.39 jemalloc.c
 --- jemalloc.c	24 Jan 2016 21:56:43 -0000	1.39
 +++ jemalloc.c	19 Feb 2016 03:11:25 -0000
 @@ -1661,6 +1661,11 @@ arena_chunk_comp(arena_chunk_t *a, arena
  	assert(a != NULL);
  	assert(b != NULL);
  
 +	if (a->max_frun_npages < b->max_frun_npages)
 +		return -1;
 +	if (a->max_frun_npages > b->max_frun_npages)
 +		return 1;
 +
  	if ((uintptr_t)a < (uintptr_t)b)
  		return (-1);
  	else if (a == b)
 @@ -1912,9 +1917,6 @@ arena_chunk_alloc(arena_t *arena)
  
  		chunk->arena = arena;
  
 -		/* LINTED */
 -		RB_INSERT(arena_chunk_tree_s, &arena->chunks, chunk);
 -
  		/*
  		 * Claim that no pages are in use, since the header is merely
  		 * overhead.
 @@ -1925,6 +1927,10 @@ arena_chunk_alloc(arena_t *arena)
  		    arena_chunk_header_npages;
  		chunk->min_frun_ind = arena_chunk_header_npages;
  
 +		/* Insert now that max_frun_npages is up-to-date. */
 +		/* LINTED */
 +		RB_INSERT(arena_chunk_tree_s, &arena->chunks, chunk);
 +
  		/*
  		 * Initialize enough of the map to support one maximal free run.
  		 */
 @@ -1970,7 +1976,7 @@ arena_chunk_dealloc(arena_t *arena, aren
  static arena_run_t *
  arena_run_alloc(arena_t *arena, size_t size)
  {
 -	arena_chunk_t *chunk;
 +	arena_chunk_t *chunk, *chunk_tmp;
  	arena_run_t *run;
  	unsigned need_npages, limit_pages, compl_need_npages;
  
 @@ -1986,14 +1992,30 @@ arena_run_alloc(arena_t *arena, size_t s
  	limit_pages = chunk_npages - arena_chunk_header_npages;
  	compl_need_npages = limit_pages - need_npages;
  	/* LINTED */
 -	RB_FOREACH(chunk, arena_chunk_tree_s, &arena->chunks) {
 +	for (;;) {
 +		chunk_tmp = RB_ROOT(&arena->chunks);
 +		chunk = NULL;
 +		while (chunk_tmp) {
 +			if (chunk_tmp->max_frun_npages == need_npages) {
 +				chunk = chunk_tmp;
 +				break;
 +			}
 +			if (chunk_tmp->max_frun_npages < need_npages) {
 +				chunk_tmp = RB_RIGHT(chunk_tmp, link);
 +				continue;
 +			}
 +			chunk = chunk_tmp;
 +			chunk_tmp = RB_LEFT(chunk, link);
 +		}
 +		if (chunk == NULL)
 +			break;
  		/*
  		 * Avoid searching this chunk if there are not enough
  		 * contiguous free pages for there to possibly be a large
  		 * enough free run.
  		 */
 -		if (chunk->pages_used <= compl_need_npages &&
 -		    need_npages <= chunk->max_frun_npages) {
 +		assert(need_npages <= chunk->max_frun_npages);
 +		{
  			arena_chunk_map_t *mapelm;
  			unsigned i;
  			unsigned max_frun_npages = 0;
 @@ -2031,7 +2053,9 @@ arena_run_alloc(arena_t *arena, size_t s
  			 * chunk->min_frun_ind was already reset above (if
  			 * necessary).
  			 */
 +			RB_REMOVE(arena_chunk_tree_s, &arena->chunks, chunk);
  			chunk->max_frun_npages = max_frun_npages;
 +			RB_INSERT(arena_chunk_tree_s, &arena->chunks, chunk);
  		}
  	}
  
 @@ -2114,8 +2138,11 @@ arena_run_dalloc(arena_t *arena, arena_r
  		assert(chunk->map[run_ind + run_pages - 1].pos == POS_FREE);
  	}
  
 -	if (chunk->map[run_ind].npages > chunk->max_frun_npages)
 +	if (chunk->map[run_ind].npages > chunk->max_frun_npages) {
 +		RB_REMOVE(arena_chunk_tree_s, &arena->chunks, chunk);
  		chunk->max_frun_npages = chunk->map[run_ind].npages;
 +		RB_INSERT(arena_chunk_tree_s, &arena->chunks, chunk);
 +	}
  	if (run_ind < chunk->min_frun_ind)
  		chunk->min_frun_ind = run_ind;
  
 
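 One detail the patch has to respect: a node's key must not change
 while it is linked into the red-black tree, which is why
 max_frun_npages is only updated between an RB_REMOVE and an
 RB_INSERT. Reusing the illustrative chunk_tree declarations from the
 sketch earlier in this message (the helper name is made up), the
 re-keying step amounts to:
 
 static void
 chunk_update_max_frun(struct chunk_tree *tree, struct chunk *c,
     size_t new_npages)
 {
 	/* Unlink before touching the key the comparator relies on... */
 	RB_REMOVE(chunk_tree, tree, c);
 	c->max_frun_npages = new_npages;
 	/* ...and relink so the chunk lands at its new position. */
 	RB_INSERT(chunk_tree, tree, c);
 }
 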

