Subject: Re: kern/33185: kva shortage problems
To: None <kern-bug-people@netbsd.org, gnats-admin@netbsd.org>
From: YAMAMOTO Takashi <yamt@mwd.biglobe.ne.jp>
List: netbsd-bugs
Date: 04/06/2006 11:30:03
The following reply was made to PR kern/33185; it has been noted by GNATS.

From: YAMAMOTO Takashi <yamt@mwd.biglobe.ne.jp>
To: gnats-bugs@netbsd.org
Cc: tech-kern@netbsd.org
Subject: Re: kern/33185: kva shortage problems
Date: Thu, 06 Apr 2006 20:26:42 +0900

 --NextPart-20060406202103-1948600
 Content-Type: Text/Plain; charset=us-ascii
 
 > 	1. lack of kva reclamation mechanism.
 > 	   http://mail-index.NetBSD.org/tech-kern/2005/12/17/0028.html
 
 the attached diff is an attempt to fix this part.
 it basically moves the wait points for kva into vm_map and lets kva
 consumers register reclaim callbacks on the map.
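 
 for illustration, here's a minimal sketch (not part of the diff) of how a
 kva consumer other than pool(9) could hook into the new per-map reclaim
 callback chain.  my_init, my_reclaim_cb and my_cache_drain are hypothetical
 names; the rest is the api added by the diff, mirroring the uipc_socket.c
 change below:
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/callback.h>
 
 #include <uvm/uvm.h>
 
 static struct callback_entry my_reclaimerentry;
 
 static void my_cache_drain(void);	/* hypothetical kva-releasing routine */
 
 /*
  * called via uvm_km_va_drain() when the backing map runs short of kva.
  */
 static int
 my_reclaim_cb(struct callback_entry *ce, void *obj, void *arg)
 {
 
 	KASSERT(ce == &my_reclaimerentry);
 	KASSERT(obj == NULL);
 
 	my_cache_drain();
 	if (!vm_map_starved_p(kernel_map)) {
 		/* we freed enough; stop the round-robin for now. */
 		return CALLBACK_CHAIN_ABORT;
 	}
 	return CALLBACK_CHAIN_CONTINUE;
 }
 
 static void
 my_init(void)
 {
 
 	callback_register(&vm_map_to_kernel(kernel_map)->vmk_reclaim_callback,
 	    &my_reclaimerentry, NULL, my_reclaim_cb);
 }
 
 pools don't need to do this by hand; setting pa_backingmap (or
 pa_backingmapptr) on their pool_allocator makes subr_pool.c register
 pool_reclaim_callback for them.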
 
 comments?
 
 YAMAMOTO Takashi
 
 --NextPart-20060406202103-1948600
 Content-Type: Text/Plain; charset=us-ascii
 Content-Disposition: attachment; filename="a.diff"
 
 Index: sys/callback.h
 ===================================================================
 --- sys/callback.h	(revision 0)
 +++ sys/callback.h	(revision 0)
 @@ -0,0 +1,57 @@
 +/*	$NetBSD$	*/
 +
 +/*-
 + * Copyright (c)2006 YAMAMOTO Takashi,
 + * All rights reserved.
 + *
 + * Redistribution and use in source and binary forms, with or without
 + * modification, are permitted provided that the following conditions
 + * are met:
 + * 1. Redistributions of source code must retain the above copyright
 + *    notice, this list of conditions and the following disclaimer.
 + * 2. Redistributions in binary form must reproduce the above copyright
 + *    notice, this list of conditions and the following disclaimer in the
 + *    documentation and/or other materials provided with the distribution.
 + *
 + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 + * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 + * SUCH DAMAGE.
 + */
 +
 +#ifndef _SYS_CALLBACK_H_
 +#define	_SYS_CALLBACK_H_
 +
 +struct callback_entry {
 +	TAILQ_ENTRY(callback_entry) ce_q;
 +	int (*ce_func)(struct callback_entry *, void *, void *);
 +	void *ce_obj;
 +};
 +
 +struct callback_head {
 +	struct simplelock ch_lock;
 +	TAILQ_HEAD(, callback_entry) ch_q;
 +	struct callback_entry *ch_next;
 +	int ch_nentries;
 +	int ch_running;
 +	int ch_flags;
 +};
 +
 +/* return values of ce_func */
 +#define	CALLBACK_CHAIN_CONTINUE	0
 +#define	CALLBACK_CHAIN_ABORT	1
 +
 +int callback_run_roundrobin(struct callback_head *, void *);
 +void callback_register(struct callback_head *, struct callback_entry *,
 +    void *, int (*)(struct callback_entry *, void *, void *));
 +void callback_unregister(struct callback_head *, struct callback_entry *);
 +void callback_head_init(struct callback_head *);
 +
 +#endif /* !_SYS_CALLBACK_H_ */
 Index: sys/pool.h
 ===================================================================
 --- sys/pool.h	(revision 1574)
 +++ sys/pool.h	(working copy)
 @@ -53,6 +53,9 @@
  #include <sys/queue.h>
  #include <sys/time.h>
  #include <sys/tree.h>
 +#if defined(_KERNEL)
 +#include <sys/callback.h>
 +#endif /* defined(_KERNEL) */
  #endif
  
  #define	PCG_NOBJECTS		16
 @@ -108,9 +111,17 @@ struct pool_allocator {
  	TAILQ_HEAD(, pool) pa_list;	/* list of pools using this allocator */
  	int		pa_flags;
  #define	PA_INITIALIZED	0x01
 -#define	PA_WANT		0x02		/* wakeup any sleeping pools on free */
  	int		pa_pagemask;
  	int		pa_pageshift;
 +	struct vm_map *pa_backingmap;
 +#if defined(_KERNEL)
 +	struct {
 +		struct vm_map **i_backingmapptr;
 +		SLIST_ENTRY(pool_allocator) i_q;
 +	} pa_init;
 +#define	pa_q			pa_init.i_q
 +#define	pa_backingmapptr	pa_init.i_backingmapptr
 +#endif /* defined(_KERNEL) */
  };
  
  LIST_HEAD(pool_pagelist,pool_item_header);
 @@ -205,6 +216,8 @@ struct pool {
  
  	const char	*pr_entered_file; /* reentrancy check */
  	long		pr_entered_line;
 +
 +	struct callback_entry pr_reclaimerentry;
  };
  #endif /* __POOL_EXPOSE */
  
 Index: kern/subr_pool.c
 ===================================================================
 --- kern/subr_pool.c	(revision 1589)
 +++ kern/subr_pool.c	(working copy)
 @@ -82,12 +82,16 @@ static struct pool phpool[PHPOOL_MAX];
  static struct pool psppool;
  #endif
  
 +static SLIST_HEAD(, pool_allocator) pa_deferinitq =
 +    SLIST_HEAD_INITIALIZER(pa_deferinitq);
 +
  static void *pool_page_alloc_meta(struct pool *, int);
  static void pool_page_free_meta(struct pool *, void *);
  
  /* allocator for pool metadata */
  static struct pool_allocator pool_allocator_meta = {
 -	pool_page_alloc_meta, pool_page_free_meta
 +	pool_page_alloc_meta, pool_page_free_meta,
 +	.pa_backingmapptr = &kmem_map,
  };
  
  /* # of seconds to retain page after last use */
 @@ -184,8 +188,8 @@ static void	pool_prime_page(struct pool 
  static void	pool_update_curpage(struct pool *);
  
  static int	pool_grow(struct pool *, int);
 -void		*pool_allocator_alloc(struct pool *, int);
 -void		pool_allocator_free(struct pool *, void *);
 +static void	*pool_allocator_alloc(struct pool *, int);
 +static void	pool_allocator_free(struct pool *, void *);
  
  static void pool_print_pagelist(struct pool *, struct pool_pagelist *,
  	void (*)(const char *, ...));
 @@ -443,12 +447,106 @@ pr_rmpage(struct pool *pp, struct pool_i
  	pool_update_curpage(pp);
  }
  
 +static boolean_t
 +pa_starved_p(struct pool_allocator *pa)
 +{
 +
 +	if (pa->pa_backingmap != NULL) {
 +		return vm_map_starved_p(pa->pa_backingmap);
 +	}
 +	return FALSE;
 +}
 +
 +static int
 +pool_reclaim_callback(struct callback_entry *ce, void *obj, void *arg)
 +{
 +	struct pool *pp = obj;
 +	struct pool_allocator *pa = pp->pr_alloc;
 +#if 1
 +	unsigned long oidle = pp->pr_nidle;
 +	unsigned long nidle;
 +#endif
 +
 +	KASSERT(&pp->pr_reclaimerentry == ce);
 +
 +	pool_reclaim(pp);
 +
 +#if 1
 +	nidle = pp->pr_nidle;
 +	if (nidle != oidle) {
 +		printf("%s: '%s' %lu -> %lu\n",
 +		    __func__, pp->pr_wchan, oidle, nidle);
 +	}
 +#endif
 +
 +	if (!pa_starved_p(pa)) {
 +		return CALLBACK_CHAIN_ABORT;
 +	}
 +	return CALLBACK_CHAIN_CONTINUE;
 +}
 +
 +static void
 +pool_reclaim_register(struct pool *pp)
 +{
 +	struct vm_map *map = pp->pr_alloc->pa_backingmap;
 +	int s;
 +
 +	if (map == NULL) {
 +#if 1
 +		if (pp->pr_alloc->pa_backingmapptr == NULL) {
 +			printf("%s: pool %p '%s' doesn't have backing map\n",
 +			    __func__, pp, pp->pr_wchan);
 +		}
 +#endif
 +		return;
 +	}
 +
 +	s = splvm(); /* not necessary for INTRSAFE maps, but don't care. */
 +	callback_register(&vm_map_to_kernel(map)->vmk_reclaim_callback,
 +	    &pp->pr_reclaimerentry, pp, pool_reclaim_callback);
 +	splx(s);
 +}
 +
 +static void
 +pool_reclaim_unregister(struct pool *pp)
 +{
 +	struct vm_map *map = pp->pr_alloc->pa_backingmap;
 +	int s;
 +
 +	if (map == NULL) {
 +		return;
 +	}
 +
 +	s = splvm(); /* not necessary for INTRSAFE maps, but don't care. */
 +	callback_unregister(&vm_map_to_kernel(map)->vmk_reclaim_callback,
 +	    &pp->pr_reclaimerentry);
 +	splx(s);
 +}
 +
 +static void
 +pa_reclaim_register(struct pool_allocator *pa)
 +{
 +	struct vm_map *map = *pa->pa_backingmapptr;
 +	struct pool *pp;
 +
 +	KASSERT(pa->pa_backingmap == NULL);
 +	if (map == NULL) {
 +		SLIST_INSERT_HEAD(&pa_deferinitq, pa, pa_q);
 +		return;
 +	}
 +	pa->pa_backingmap = map;
 +	TAILQ_FOREACH(pp, &pa->pa_list, pr_alloc_list) {
 +		pool_reclaim_register(pp);
 +	}
 +}
 +
  /*
   * Initialize all the pools listed in the "pools" link set.
   */
  void
 -link_pool_init(void)
 +link_pool_init(void) /* XXX rename */
  {
 +	struct pool_allocator *pa;
  	__link_set_decl(pools, struct link_pool_init);
  	struct link_pool_init * const *pi;
  
 @@ -456,6 +554,14 @@ link_pool_init(void)
  		pool_init((*pi)->pp, (*pi)->size, (*pi)->align,
  		    (*pi)->align_offset, (*pi)->flags, (*pi)->wchan,
  		    (*pi)->palloc);
 +
 +	/* XXX XXX */
 +	while ((pa = SLIST_FIRST(&pa_deferinitq)) != NULL) {
 +		KASSERT(pa->pa_backingmapptr != NULL);
 +		KASSERT(*pa->pa_backingmapptr != NULL);
 +		SLIST_REMOVE_HEAD(&pa_deferinitq, pa_q);
 +		pa_reclaim_register(pa);
 +	}
  }
  
  /*
 @@ -502,6 +608,10 @@ pool_init(struct pool *pp, size_t size, 
  		simple_lock_init(&palloc->pa_slock);
  		palloc->pa_pagemask = ~(palloc->pa_pagesz - 1);
  		palloc->pa_pageshift = ffs(palloc->pa_pagesz) - 1;
 +
 +		if (palloc->pa_backingmapptr != NULL) {
 +			pa_reclaim_register(palloc);
 +		}
  		palloc->pa_flags |= PA_INITIALIZED;
  	}
  
 @@ -683,6 +793,7 @@ pool_init(struct pool *pp, size_t size, 
  	TAILQ_INSERT_TAIL(&palloc->pa_list, pp, pr_alloc_list);
  	simple_unlock(&palloc->pa_slock);
  	splx(s);
 +	pool_reclaim_register(pp);
  }
  
  /*
 @@ -703,6 +814,7 @@ pool_destroy(struct pool *pp)
  	simple_unlock(&pool_head_slock);
  
  	/* Remove this pool from its allocator's list of pools. */
 +	pool_reclaim_unregister(pp); /* XXX can sleep */
  	s = splvm();
  	simple_lock(&pp->pr_alloc->pa_slock);
  	TAILQ_REMOVE(&pp->pr_alloc->pa_list, pp, pr_alloc_list);
 @@ -897,24 +1009,10 @@ pool_get(struct pool *pp, int flags)
  			if (pp->pr_curpage != NULL)
  				goto startover;
  
 -			if ((flags & PR_WAITOK) == 0) {
 -				pp->pr_nfail++;
 -				pr_leave(pp);
 -				simple_unlock(&pp->pr_slock);
 -				return (NULL);
 -			}
 -
 -			/*
 -			 * Wait for items to be returned to this pool.
 -			 *
 -			 * wake up once a second and try again,
 -			 * as the check in pool_cache_put_paddr() is racy.
 -			 */
 -			pp->pr_flags |= PR_WANTED;
 -			/* PA_WANTED is already set on the allocator. */
 +			pp->pr_nfail++;
  			pr_leave(pp);
 -			ltsleep(pp, PSWP, pp->pr_wchan, hz, &pp->pr_slock);
 -			pr_enter(pp, file, line);
 +			simple_unlock(&pp->pr_slock);
 +			return (NULL);
  		}
  
  		/* Start the allocation process over. */
 @@ -1114,7 +1212,7 @@ pool_do_put(struct pool *pp, void *v, st
  		pp->pr_nidle++;
  		if (pp->pr_npages > pp->pr_minpages &&
  		    (pp->pr_npages > pp->pr_maxpages ||
 -		     (pp->pr_alloc->pa_flags & PA_WANT) != 0)) {
 +		     pa_starved_p(pp->pr_alloc))) {
  			pr_rmpage(pp, ph, pq);
  		} else {
  			LIST_REMOVE(ph, ph_pagelist);
 @@ -1483,7 +1581,8 @@ pool_reclaim(struct pool *pp)
  
  		KASSERT(ph->ph_nmissing == 0);
  		timersub(&curtime, &ph->ph_time, &diff);
 -		if (diff.tv_sec < pool_inactive_time)
 +		if (diff.tv_sec < pool_inactive_time
 +		    && !pa_starved_p(pp->pr_alloc))
  			continue;
  
  		/*
 @@ -2166,10 +2265,12 @@ void	pool_page_free(struct pool *, void 
  #ifdef POOL_SUBPAGE
  struct pool_allocator pool_allocator_kmem_fullpage = {
  	pool_page_alloc, pool_page_free, 0,
 +	.pa_backingmapptr = &kmem_map,
  };
  #else
  struct pool_allocator pool_allocator_kmem = {
  	pool_page_alloc, pool_page_free, 0,
 +	.pa_backingmapptr = &kmem_map,
  };
  #endif
  
 @@ -2179,10 +2280,12 @@ void	pool_page_free_nointr(struct pool *
  #ifdef POOL_SUBPAGE
  struct pool_allocator pool_allocator_nointr_fullpage = {
  	pool_page_alloc_nointr, pool_page_free_nointr, 0,
 +	.pa_backingmapptr = &kernel_map,
  };
  #else
  struct pool_allocator pool_allocator_nointr = {
  	pool_page_alloc_nointr, pool_page_free_nointr, 0,
 +	.pa_backingmapptr = &kernel_map,
  };
  #endif
  
 @@ -2192,6 +2295,7 @@ void	pool_subpage_free(struct pool *, vo
  
  struct pool_allocator pool_allocator_kmem = {
  	pool_subpage_alloc, pool_subpage_free, POOL_SUBPAGE,
 +	.pa_backingmapptr = &kmem_map,
  };
  
  void	*pool_subpage_alloc_nointr(struct pool *, int);
 @@ -2199,125 +2303,41 @@ void	pool_subpage_free_nointr(struct poo
  
  struct pool_allocator pool_allocator_nointr = {
  	pool_subpage_alloc, pool_subpage_free, POOL_SUBPAGE,
 +	.pa_backingmapptr = &kmem_map,
  };
  #endif /* POOL_SUBPAGE */
  
 -/*
 - * We have at least three different resources for the same allocation and
 - * each resource can be depleted.  First, we have the ready elements in the
 - * pool.  Then we have the resource (typically a vm_map) for this allocator.
 - * Finally, we have physical memory.  Waiting for any of these can be
 - * unnecessary when any other is freed, but the kernel doesn't support
 - * sleeping on multiple wait channels, so we have to employ another strategy.
 - *
 - * The caller sleeps on the pool (so that it can be awakened when an item
 - * is returned to the pool), but we set PA_WANT on the allocator.  When a
 - * page is returned to the allocator and PA_WANT is set, pool_allocator_free
 - * will wake up all sleeping pools belonging to this allocator.
 - *
 - * XXX Thundering herd.
 - */
 -void *
 -pool_allocator_alloc(struct pool *org, int flags)
 +static void *
 +pool_allocator_alloc(struct pool *pp, int flags)
  {
 -	struct pool_allocator *pa = org->pr_alloc;
 -	struct pool *pp, *start;
 -	int s, freed;
 +	struct pool_allocator *pa = pp->pr_alloc;
  	void *res;
  
 -	LOCK_ASSERT(!simple_lock_held(&org->pr_slock));
 +	LOCK_ASSERT(!simple_lock_held(&pp->pr_slock));
  
 -	do {
 -		if ((res = (*pa->pa_alloc)(org, flags)) != NULL)
 -			return (res);
 -		if ((flags & PR_WAITOK) == 0) {
 -			/*
 -			 * We only run the drain hookhere if PR_NOWAIT.
 -			 * In other cases, the hook will be run in
 -			 * pool_reclaim().
 -			 */
 -			if (org->pr_drain_hook != NULL) {
 -				(*org->pr_drain_hook)(org->pr_drain_hook_arg,
 -				    flags);
 -				if ((res = (*pa->pa_alloc)(org, flags)) != NULL)
 -					return (res);
 -			}
 -			break;
 -		}
 -
 +	res = (*pa->pa_alloc)(pp, flags);
 +	if (res == NULL && (flags & PR_WAITOK) == 0) {
  		/*
 -		 * Drain all pools, that use this allocator.
 -		 * We do this to reclaim VA space.
 -		 * pa_alloc is responsible for waiting for
 -		 * physical memory.
 -		 *
 -		 * XXX We risk looping forever if start if someone
 -		 * calls pool_destroy on "start".  But there is no
 -		 * other way to have potentially sleeping pool_reclaim,
 -		 * non-sleeping locks on pool_allocator, and some
 -		 * stirring of drained pools in the allocator.
 -		 *
 -		 * XXX Maybe we should use pool_head_slock for locking
 -		 * the allocators?
 +		 * We only run the drain hook here if PR_NOWAIT.
 +		 * In other cases, the hook will be run in
 +		 * pool_reclaim().
  		 */
 -		freed = 0;
 -
 -		s = splvm();
 -		simple_lock(&pa->pa_slock);
 -		pp = start = TAILQ_FIRST(&pa->pa_list);
 -		do {
 -			TAILQ_REMOVE(&pa->pa_list, pp, pr_alloc_list);
 -			TAILQ_INSERT_TAIL(&pa->pa_list, pp, pr_alloc_list);
 -			simple_unlock(&pa->pa_slock);
 -			freed = pool_reclaim(pp);
 -			simple_lock(&pa->pa_slock);
 -		} while ((pp = TAILQ_FIRST(&pa->pa_list)) != start &&
 -			 freed == 0);
 -
 -		if (freed == 0) {
 -			/*
 -			 * We set PA_WANT here, the caller will most likely
 -			 * sleep waiting for pages (if not, this won't hurt
 -			 * that much), and there is no way to set this in
 -			 * the caller without violating locking order.
 -			 */
 -			pa->pa_flags |= PA_WANT;
 +		if (pp->pr_drain_hook != NULL) {
 +			(*pp->pr_drain_hook)(pp->pr_drain_hook_arg, flags);
 +			res = (*pa->pa_alloc)(pp, flags);
  		}
 -		simple_unlock(&pa->pa_slock);
 -		splx(s);
 -	} while (freed);
 -	return (NULL);
 +	}
 +	return res;
  }
  
 -void
 +static void
  pool_allocator_free(struct pool *pp, void *v)
  {
  	struct pool_allocator *pa = pp->pr_alloc;
 -	int s;
  
  	LOCK_ASSERT(!simple_lock_held(&pp->pr_slock));
  
  	(*pa->pa_free)(pp, v);
 -
 -	s = splvm();
 -	simple_lock(&pa->pa_slock);
 -	if ((pa->pa_flags & PA_WANT) == 0) {
 -		simple_unlock(&pa->pa_slock);
 -		splx(s);
 -		return;
 -	}
 -
 -	TAILQ_FOREACH(pp, &pa->pa_list, pr_alloc_list) {
 -		simple_lock(&pp->pr_slock);
 -		if ((pp->pr_flags & PR_WANTED) != 0) {
 -			pp->pr_flags &= ~PR_WANTED;
 -			wakeup(pp);
 -		}
 -		simple_unlock(&pp->pr_slock);
 -	}
 -	pa->pa_flags &= ~PA_WANT;
 -	simple_unlock(&pa->pa_slock);
 -	splx(s);
  }
  
  void *
 Index: kern/vfs_bio.c
 ===================================================================
 --- kern/vfs_bio.c	(revision 1587)
 +++ kern/vfs_bio.c	(working copy)
 @@ -177,9 +177,9 @@ struct simplelock bqueue_slock = SIMPLEL
  
  /*
   * Buffer pool for I/O buffers.
 - * Access to this pool must be protected with splbio().
   */
 -static POOL_INIT(bufpool, sizeof(struct buf), 0, 0, 0, "bufpl", NULL);
 +static POOL_INIT(bufpool, sizeof(struct buf), 0, 0, 0, "bufpl",
 +    &pool_allocator_nointr);
  
  
  /* XXX - somewhat gross.. */
 @@ -375,8 +375,7 @@ bufinit(void)
  	if (bufmem_valimit != 0) {
  		vaddr_t minaddr = 0, maxaddr;
  		buf_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
 -					  bufmem_valimit, VM_MAP_PAGEABLE,
 -					  FALSE, 0);
 +					  bufmem_valimit, 0, FALSE, 0);
  		if (buf_map == NULL)
  			panic("bufinit: cannot allocate submap");
  	} else
 @@ -393,6 +392,7 @@ bufinit(void)
  	use_std = 1;
  #endif
  
 +	bufmempool_allocator.pa_backingmap = buf_map;
  	for (i = 0; i < NMEMPOOLS; i++) {
  		struct pool_allocator *pa;
  		struct pool *pp = &bmempools[i];
 @@ -985,13 +985,13 @@ already_queued:
  	/* Allow disk interrupts. */
  	simple_unlock(&bp->b_interlock);
  	simple_unlock(&bqueue_slock);
 +	splx(s);
  	if (bp->b_bufsize <= 0) {
  #ifdef DEBUG
  		memset((char *)bp, 0, sizeof(*bp));
  #endif
  		pool_put(&bufpool, bp);
  	}
 -	splx(s);
  }
  
  /*
 Index: kern/uipc_socket.c
 ===================================================================
 --- kern/uipc_socket.c	(revision 1590)
 +++ kern/uipc_socket.c	(working copy)
 @@ -126,16 +126,8 @@ EVCNT_ATTACH_STATIC(sosend_kvalimit);
  
  #endif /* SOSEND_COUNTERS */
  
 -void
 -soinit(void)
 -{
 +static struct callback_entry sokva_reclaimerentry;
  
 -	/* Set the initial adjusted socket buffer size. */
 -	if (sb_max_set(sb_max))
 -		panic("bad initial sb_max value: %lu", sb_max);
 -
 -}
 -
  #ifdef SOSEND_NO_LOAN
  int use_sosend_loan = 0;
  #else
 @@ -438,6 +430,32 @@ sosend_loan(struct socket *so, struct ui
  	}
  
  	return (space);
 +}
 +
 +static int
 +sokva_reclaim_callback(struct callback_entry *ce, void *obj, void *arg)
 +{
 +
 +	KASSERT(ce == &sokva_reclaimerentry);
 +	KASSERT(obj == NULL);
 +
 +	sodopendfree();
 +	if (!vm_map_starved_p(kernel_map)) {
 +		return CALLBACK_CHAIN_ABORT;
 +	}
 +	return CALLBACK_CHAIN_CONTINUE;
 +}
 +
 +void
 +soinit(void)
 +{
 +
 +	/* Set the initial adjusted socket buffer size. */
 +	if (sb_max_set(sb_max))
 +		panic("bad initial sb_max value: %lu", sb_max);
 +
 +	callback_register(&vm_map_to_kernel(kernel_map)->vmk_reclaim_callback,
 +	    &sokva_reclaimerentry, NULL, sokva_reclaim_callback);
  }
  
  /*
 Index: kern/uipc_mbuf.c
 ===================================================================
 --- kern/uipc_mbuf.c	(revision 1585)
 +++ kern/uipc_mbuf.c	(working copy)
 @@ -154,6 +154,7 @@ mbinit(void)
  	KASSERT(sizeof(struct _m_ext) <= MHLEN);
  	KASSERT(sizeof(struct mbuf) == MSIZE);
  
 +	mclpool_allocator.pa_backingmap = mb_map;
  	pool_init(&mbpool, msize, 0, 0, 0, "mbpl", NULL);
  	pool_init(&mclpool, mclbytes, 0, 0, 0, "mclpl", &mclpool_allocator);
  
 Index: kern/subr_callback.c
 ===================================================================
 --- kern/subr_callback.c	(revision 0)
 +++ kern/subr_callback.c	(revision 0)
 @@ -0,0 +1,139 @@
 +/*	$NetBSD$	*/
 +
 +/*-
 + * Copyright (c)2006 YAMAMOTO Takashi,
 + * All rights reserved.
 + *
 + * Redistribution and use in source and binary forms, with or without
 + * modification, are permitted provided that the following conditions
 + * are met:
 + * 1. Redistributions of source code must retain the above copyright
 + *    notice, this list of conditions and the following disclaimer.
 + * 2. Redistributions in binary form must reproduce the above copyright
 + *    notice, this list of conditions and the following disclaimer in the
 + *    documentation and/or other materials provided with the distribution.
 + *
 + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 + * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 + * SUCH DAMAGE.
 + */
 +
 +#include <sys/cdefs.h>
 +__KERNEL_RCSID(0, "$NetBSD$");
 +
 +#include <sys/param.h>
 +#include <sys/systm.h>
 +#include <sys/proc.h>
 +#include <sys/callback.h>
 +
 +#define	CH_WANT	1
 +
 +void
 +callback_head_init(struct callback_head *ch)
 +{
 +
 +	simple_lock_init(&ch->ch_lock);
 +	TAILQ_INIT(&ch->ch_q);
 +	ch->ch_next = NULL;
 +	ch->ch_nentries = 0;
 +}
 +
 +void
 +callback_register(struct callback_head *ch, struct callback_entry *ce,
 +    void *obj, int (*fn)(struct callback_entry *, void *, void *))
 +{
 +
 +	ce->ce_func = fn;
 +	ce->ce_obj = obj;
 +	simple_lock(&ch->ch_lock);
 +	TAILQ_INSERT_TAIL(&ch->ch_q, ce, ce_q);
 +	ch->ch_nentries++;
 +	simple_unlock(&ch->ch_lock);
 +}
 +
 +void
 +callback_unregister(struct callback_head *ch, struct callback_entry *ce)
 +{
 +
 +	simple_lock(&ch->ch_lock);
 +	while (ch->ch_running > 0) {
 +		ch->ch_flags |= CH_WANT;
 +		ltsleep(&ch->ch_running, PVM, "recunreg", 0, &ch->ch_lock);
 +	}
 +	if (__predict_false(ch->ch_next == ce)) {
 +		ch->ch_next = TAILQ_NEXT(ce, ce_q);
 +	}
 +	TAILQ_REMOVE(&ch->ch_q, ce, ce_q);
 +	ch->ch_nentries--;
 +	simple_unlock(&ch->ch_lock);
 +}
 +
 +static int
 +callback_runone(struct callback_head *ch, void *arg)
 +{
 +	struct callback_entry *ce;
 +	int result;
 +
 +	KASSERT(ch->ch_nentries > 0);
 +	KASSERT(ch->ch_running > 0);
 +
 +	ce = ch->ch_next;
 +	if (ce == NULL) {
 +		ce = TAILQ_FIRST(&ch->ch_q);
 +	}
 +	KASSERT(ce != NULL);
 +	result = (*ce->ce_func)(ce, ce->ce_obj, arg);
 +	ch->ch_next = TAILQ_NEXT(ce, ce_q);
 +	return result;
 +}
 +
 +static void
 +callback_run_enter(struct callback_head *ch)
 +{
 +
 +	simple_lock(&ch->ch_lock);
 +	ch->ch_running++;
 +	simple_unlock(&ch->ch_lock);
 +}
 +
 +static void
 +callback_run_leave(struct callback_head *ch)
 +{
 +
 +	simple_lock(&ch->ch_lock);
 +	KASSERT(ch->ch_running > 0);
 +	ch->ch_running--;
 +	if (ch->ch_running == 0 && (ch->ch_flags & CH_WANT) != 0) {
 +		ch->ch_flags &= ~CH_WANT;
 +		wakeup(&ch->ch_running);
 +	}
 +	simple_unlock(&ch->ch_lock);
 +}
 +
 +int
 +callback_run_roundrobin(struct callback_head *ch, void *arg)
 +{
 +	int i;
 +	int n;
 +	int result = 0;
 +
 +	callback_run_enter(ch);
 +	n = ch->ch_nentries;
 +	for (i = 0; i < n; i++) {
 +		result = callback_runone(ch, arg);
 +		if (result != CALLBACK_CHAIN_CONTINUE) {
 +			break;
 +		}
 +	}
 +	callback_run_leave(ch);
 +
 +	return result;
 +}
 Index: uvm/uvm_km.c
 ===================================================================
 --- uvm/uvm_km.c	(revision 1591)
 +++ uvm/uvm_km.c	(working copy)
 @@ -188,7 +188,8 @@ km_vacache_alloc(struct pool *pp, int fl
  	if (uvm_map(map, &va, size, NULL, UVM_UNKNOWN_OFFSET, size,
  	    UVM_MAPFLAG(UVM_PROT_ALL, UVM_PROT_ALL, UVM_INH_NONE,
  	    UVM_ADV_RANDOM, UVM_FLAG_QUANTUM |
 -	    ((flags & PR_WAITOK) ? 0 : UVM_FLAG_TRYLOCK | UVM_FLAG_NOWAIT))))
 +	    ((flags & PR_WAITOK) ? UVM_FLAG_WAITVA :
 +	    UVM_FLAG_TRYLOCK | UVM_FLAG_NOWAIT))))
  		return NULL;
  
  	return (void *)va;
 @@ -226,10 +227,9 @@ km_vacache_init(struct vm_map *map, cons
  	pa->pa_alloc = km_vacache_alloc;
  	pa->pa_free = km_vacache_free;
  	pa->pa_pagesz = (unsigned int)size;
 +	pa->pa_backingmap = map;
 +	pa->pa_backingmapptr = NULL;
  	pool_init(pp, PAGE_SIZE, 0, 0, PR_NOTOUCH | PR_RECURSIVE, name, pa);
 -
 -	/* XXX for now.. */
 -	pool_sethiwat(pp, 0);
  }
  
  void
 @@ -252,6 +252,30 @@ uvm_km_vacache_init(struct vm_map *map, 
  }
  
  #endif /* !defined(PMAP_MAP_POOLPAGE) */
 +
 +void
 +uvm_km_va_drain(struct vm_map *map, uvm_flag_t flags)
 +{
 +	struct vm_map_kernel *vmk = vm_map_to_kernel(map);
 +	const boolean_t intrsafe = (map->flags & VM_MAP_INTRSAFE) != 0;
 +#if 0
 +	const int rflags =
 +	    (flags & (UVM_FLAG_NOWAIT|UVM_FLAG_WAITVA)) == UVM_FLAG_WAITVA ?
 +	    0 : RECLAIM_FLAG_NOWAIT;
 +	struct reclaim_args args = {
 +		.ra_flags = rflags,
 +	};
 +#endif
 +	int s = 0xdeadbeaf; /* XXX: gcc */
 +
 +	if (intrsafe) {
 +		s = splvm();
 +	}
 +	callback_run_roundrobin(&vmk->vmk_reclaim_callback, NULL);
 +	if (intrsafe) {
 +		splx(s);
 +	}
 +}
  
  /*
   * uvm_km_init: init kernel maps and objects to reflect reality (i.e.
 Index: uvm/uvm_km.h
 ===================================================================
 --- uvm/uvm_km.h	(revision 1464)
 +++ uvm/uvm_km.h	(working copy)
 @@ -55,6 +55,7 @@ void uvm_km_check_empty(vaddr_t, vaddr_t
  #else
  #define	uvm_km_check_empty(a, b, c)	/* nothing */
  #endif /* defined(DEBUG) */
 +void uvm_km_va_drain(struct vm_map *, uvm_flag_t);
  
  #endif /* _KERNEL */
  
 Index: uvm/uvm_map.c
 ===================================================================
 --- uvm/uvm_map.c	(revision 1587)
 +++ uvm/uvm_map.c	(working copy)
 @@ -742,7 +742,17 @@ uvm_map_clip_end(struct vm_map *map, str
  	uvm_tree_sanity(map, "clip_end leave");
  }
  
 +static void
 +vm_map_drain(struct vm_map *map, uvm_flag_t flags)
 +{
  
 +	if (!VM_MAP_IS_KERNEL(map)) {
 +		return;
 +	}
 +
 +	uvm_km_va_drain(map, flags);
 +}
 +
  /*
   *   M A P   -   m a i n   e n t r y   p o i n t
   */
 @@ -875,16 +885,11 @@ retry:
  		}
  		vm_map_lock(map); /* could sleep here */
  	}
 -	if ((prev_entry = uvm_map_findspace(map, start, size, &start,
 -	    uobj, uoffset, align, flags)) == NULL) {
 +	prev_entry = uvm_map_findspace(map, start, size, &start,
 +	    uobj, uoffset, align, flags);
 +	if (prev_entry == NULL) {
  		unsigned int timestamp;
  
 -		if ((flags & UVM_FLAG_WAITVA) == 0) {
 -			UVMHIST_LOG(maphist,"<- uvm_map_findspace failed!",
 -			    0,0,0,0);
 -			vm_map_unlock(map);
 -			return ENOMEM;
 -		}
  		timestamp = map->timestamp;
  		UVMHIST_LOG(maphist,"waiting va timestamp=0x%x",
  			    timestamp,0,0,0);
 @@ -894,15 +899,24 @@ retry:
  		vm_map_unlock(map);
  
  		/*
 -		 * wait until someone does unmap.
 +		 * try to reclaim kva and wait until someone does unmap.
  		 * XXX fragile locking
  		 */
  
 +		vm_map_drain(map, flags);
 +
  		simple_lock(&map->flags_lock);
  		while ((map->flags & VM_MAP_WANTVA) != 0 &&
  		   map->timestamp == timestamp) {
 -			ltsleep(&map->header, PVM, "vmmapva", 0,
 -			    &map->flags_lock);
 +			if ((flags & UVM_FLAG_WAITVA) == 0) {
 +				simple_unlock(&map->flags_lock);
 +				UVMHIST_LOG(maphist,
 +				    "<- uvm_map_findspace failed!", 0,0,0,0);
 +				return ENOMEM;
 +			} else {
 +				ltsleep(&map->header, PVM, "vmmapva", 0,
 +				    &map->flags_lock);
 +			}
  		}
  		simple_unlock(&map->flags_lock);
  		goto retry;
 @@ -2655,6 +2669,7 @@ uvm_map_setup_kernel(struct vm_map_kerne
  
  	uvm_map_setup(&map->vmk_map, vmin, vmax, flags);
  
 +	callback_head_init(&map->vmk_reclaim_callback);
  	LIST_INIT(&map->vmk_kentry_free);
  	map->vmk_merged_entries = NULL;
  }
 @@ -4789,4 +4804,18 @@ vm_map_to_kernel(struct vm_map *map)
  	KASSERT(VM_MAP_IS_KERNEL(map));
  
  	return (struct vm_map_kernel *)map;
 +}
 +
 +boolean_t
 +vm_map_starved_p(struct vm_map *map)
 +{
 +
 +	if ((map->flags & VM_MAP_WANTVA) != 0) {
 +		return TRUE;
 +	}
 +	/* XXX */
 +	if ((vm_map_max(map) - vm_map_min(map)) / 16 * 15 < map->size) {
 +		return TRUE;
 +	}
 +	return FALSE;
  }
 Index: uvm/uvm_map.h
 ===================================================================
 --- uvm/uvm_map.h	(revision 1571)
 +++ uvm/uvm_map.h	(working copy)
 @@ -234,6 +234,9 @@ struct vm_map {
  };
  
  #if defined(_KERNEL)
 +
 +#include <sys/callback.h>
 +
  struct vm_map_kernel {
  	struct vm_map vmk_map;
  	LIST_HEAD(, uvm_kmapent_hdr) vmk_kentry_free;
 @@ -241,6 +244,7 @@ struct vm_map_kernel {
  	struct vm_map_entry	*vmk_merged_entries;
  			/* Merged entries, kept for later splitting */
  
 +	struct callback_head vmk_reclaim_callback;
  #if !defined(PMAP_MAP_POOLPAGE)
  	struct pool vmk_vacache; /* kva cache */
  	struct pool_allocator vmk_vacache_allocator; /* ... and its allocator */
 @@ -506,6 +510,8 @@ do {									\
  	if (oflags & VM_MAP_WANTLOCK)					\
  		wakeup(&(map)->flags);					\
  } while (/*CONSTCOND*/ 0)
 +
 +boolean_t vm_map_starved_p(struct vm_map *);
  
  #endif /* _KERNEL */
  
 
 --NextPart-20060406202103-1948600--