Subject: Re: kern/33185: kva shortage problems
To: None <gnats-bugs@netbsd.org>
From: YAMAMOTO Takashi <yamt@mwd.biglobe.ne.jp>
List: tech-kern
Date: 04/06/2006 20:26:42
--NextPart-20060406202103-1948600
Content-Type: Text/Plain; charset=us-ascii

> 	1. lack of kva reclamation mechanism.
> 	   http://mail-index.NetBSD.org/tech-kern/2005/12/17/0028.html

The attached diff is an attempt to fix this part.  It basically moves
the wait points for kva into vm_map: instead of pool_get() /
pool_allocator_alloc() sleeping and draining pools themselves via
PA_WANT/PR_WANTED, uvm_map() now runs per-map reclaim callbacks
(registered through the new sys/callback.h interface by the pool code
and the socket layer) and sleeps on the map until kva becomes
available.
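
For reference, a subsystem with freeable kva would hook into the new
chain roughly as below.  This is only a minimal sketch, not part of the
diff; the foo_* names are made up, and the real users in the diff are
the pool reclaimer and sokva_reclaim_callback in uipc_socket.c.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callback.h>
#include <uvm/uvm_extern.h>	/* kernel_map etc., as in uipc_socket.c */

static struct callback_entry foo_reclaim_entry;

static int
foo_reclaim_callback(struct callback_entry *ce, void *obj, void *arg)
{

	KASSERT(ce == &foo_reclaim_entry);
	KASSERT(obj == NULL);

	/* ... release whatever cached kva the subsystem can spare ... */

	if (!vm_map_starved_p(kernel_map)) {
		return CALLBACK_CHAIN_ABORT;	/* enough freed; stop the chain */
	}
	return CALLBACK_CHAIN_CONTINUE;	/* still starved; run the next entry */
}

void
foo_init(void)
{

	callback_register(&vm_map_to_kernel(kernel_map)->vmk_reclaim_callback,
	    &foo_reclaim_entry, NULL, foo_reclaim_callback);
}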

comments?

YAMAMOTO Takashi

--NextPart-20060406202103-1948600
Content-Type: Text/Plain; charset=us-ascii
Content-Disposition: attachment; filename="a.diff"

Index: sys/callback.h
===================================================================
--- sys/callback.h	(revision 0)
+++ sys/callback.h	(revision 0)
@@ -0,0 +1,57 @@
+/*	$NetBSD$	*/
+
+/*-
+ * Copyright (c)2006 YAMAMOTO Takashi,
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _SYS_CALLBACK_H_
+#define	_SYS_CALLBACK_H_
+
+struct callback_entry {
+	TAILQ_ENTRY(callback_entry) ce_q;
+	int (*ce_func)(struct callback_entry *, void *, void *);
+	void *ce_obj;
+};
+
+struct callback_head {
+	struct simplelock ch_lock;
+	TAILQ_HEAD(, callback_entry) ch_q;
+	struct callback_entry *ch_next;
+	int ch_nentries;
+	int ch_running;
+	int ch_flags;
+};
+
+/* return values of ce_func */
+#define	CALLBACK_CHAIN_CONTINUE	0
+#define	CALLBACK_CHAIN_ABORT	1
+
+int callback_run_roundrobin(struct callback_head *, void *);
+void callback_register(struct callback_head *, struct callback_entry *,
+    void *, int (*)(struct callback_entry *, void *, void *));
+void callback_unregister(struct callback_head *, struct callback_entry *);
+void callback_head_init(struct callback_head *);
+
+#endif /* !_SYS_CALLBACK_H_ */
Index: sys/pool.h
===================================================================
--- sys/pool.h	(revision 1574)
+++ sys/pool.h	(working copy)
@@ -53,6 +53,9 @@
 #include <sys/queue.h>
 #include <sys/time.h>
 #include <sys/tree.h>
+#if defined(_KERNEL)
+#include <sys/callback.h>
+#endif /* defined(_KERNEL) */
 #endif
 
 #define	PCG_NOBJECTS		16
@@ -108,9 +111,17 @@ struct pool_allocator {
 	TAILQ_HEAD(, pool) pa_list;	/* list of pools using this allocator */
 	int		pa_flags;
 #define	PA_INITIALIZED	0x01
-#define	PA_WANT		0x02		/* wakeup any sleeping pools on free */
 	int		pa_pagemask;
 	int		pa_pageshift;
+	struct vm_map *pa_backingmap;
+#if defined(_KERNEL)
+	struct {
+		struct vm_map **i_backingmapptr;
+		SLIST_ENTRY(pool_allocator) i_q;
+	} pa_init;
+#define	pa_q			pa_init.i_q
+#define	pa_backingmapptr	pa_init.i_backingmapptr
+#endif /* defined(_KERNEL) */
 };
 
 LIST_HEAD(pool_pagelist,pool_item_header);
@@ -205,6 +216,8 @@ struct pool {
 
 	const char	*pr_entered_file; /* reentrancy check */
 	long		pr_entered_line;
+
+	struct callback_entry pr_reclaimerentry;
 };
 #endif /* __POOL_EXPOSE */
 
Index: kern/subr_pool.c
===================================================================
--- kern/subr_pool.c	(revision 1589)
+++ kern/subr_pool.c	(working copy)
@@ -82,12 +82,16 @@ static struct pool phpool[PHPOOL_MAX];
 static struct pool psppool;
 #endif
 
+static SLIST_HEAD(, pool_allocator) pa_deferinitq =
+    SLIST_HEAD_INITIALIZER(pa_deferinitq);
+
 static void *pool_page_alloc_meta(struct pool *, int);
 static void pool_page_free_meta(struct pool *, void *);
 
 /* allocator for pool metadata */
 static struct pool_allocator pool_allocator_meta = {
-	pool_page_alloc_meta, pool_page_free_meta
+	pool_page_alloc_meta, pool_page_free_meta,
+	.pa_backingmapptr = &kmem_map,
 };
 
 /* # of seconds to retain page after last use */
@@ -184,8 +188,8 @@ static void	pool_prime_page(struct pool 
 static void	pool_update_curpage(struct pool *);
 
 static int	pool_grow(struct pool *, int);
-void		*pool_allocator_alloc(struct pool *, int);
-void		pool_allocator_free(struct pool *, void *);
+static void	*pool_allocator_alloc(struct pool *, int);
+static void	pool_allocator_free(struct pool *, void *);
 
 static void pool_print_pagelist(struct pool *, struct pool_pagelist *,
 	void (*)(const char *, ...));
@@ -443,12 +447,106 @@ pr_rmpage(struct pool *pp, struct pool_i
 	pool_update_curpage(pp);
 }
 
+static boolean_t
+pa_starved_p(struct pool_allocator *pa)
+{
+
+	if (pa->pa_backingmap != NULL) {
+		return vm_map_starved_p(pa->pa_backingmap);
+	}
+	return FALSE;
+}
+
+static int
+pool_reclaim_callback(struct callback_entry *ce, void *obj, void *arg)
+{
+	struct pool *pp = obj;
+	struct pool_allocator *pa = pp->pr_alloc;
+#if 1
+	unsigned long oidle = pp->pr_nidle;
+	unsigned long nidle;
+#endif
+
+	KASSERT(&pp->pr_reclaimerentry == ce);
+
+	pool_reclaim(pp);
+
+#if 1
+	nidle = pp->pr_nidle;
+	if (nidle != oidle) {
+		printf("%s: '%s' %lu -> %lu\n",
+		    __func__, pp->pr_wchan, oidle, nidle);
+	}
+#endif
+
+	if (!pa_starved_p(pa)) {
+		return CALLBACK_CHAIN_ABORT;
+	}
+	return CALLBACK_CHAIN_CONTINUE;
+}
+
+static void
+pool_reclaim_register(struct pool *pp)
+{
+	struct vm_map *map = pp->pr_alloc->pa_backingmap;
+	int s;
+
+	if (map == NULL) {
+#if 1
+		if (pp->pr_alloc->pa_backingmapptr == NULL) {
+			printf("%s: pool %p '%s' doesn't have backing map\n",
+			    __func__, pp, pp->pr_wchan);
+		}
+#endif
+		return;
+	}
+
+	s = splvm(); /* not necessary for INTRSAFE maps, but don't care. */
+	callback_register(&vm_map_to_kernel(map)->vmk_reclaim_callback,
+	    &pp->pr_reclaimerentry, pp, pool_reclaim_callback);
+	splx(s);
+}
+
+static void
+pool_reclaim_unregister(struct pool *pp)
+{
+	struct vm_map *map = pp->pr_alloc->pa_backingmap;
+	int s;
+
+	if (map == NULL) {
+		return;
+	}
+
+	s = splvm(); /* not necessary for INTRSAFE maps, but don't care. */
+	callback_unregister(&vm_map_to_kernel(map)->vmk_reclaim_callback,
+	    &pp->pr_reclaimerentry);
+	splx(s);
+}
+
+static void
+pa_reclaim_register(struct pool_allocator *pa)
+{
+	struct vm_map *map = *pa->pa_backingmapptr;
+	struct pool *pp;
+
+	KASSERT(pa->pa_backingmap == NULL);
+	if (map == NULL) {
+		SLIST_INSERT_HEAD(&pa_deferinitq, pa, pa_q);
+		return;
+	}
+	pa->pa_backingmap = map;
+	TAILQ_FOREACH(pp, &pa->pa_list, pr_alloc_list) {
+		pool_reclaim_register(pp);
+	}
+}
+
 /*
  * Initialize all the pools listed in the "pools" link set.
  */
 void
-link_pool_init(void)
+link_pool_init(void) /* XXX rename */
 {
+	struct pool_allocator *pa;
 	__link_set_decl(pools, struct link_pool_init);
 	struct link_pool_init * const *pi;
 
@@ -456,6 +554,14 @@ link_pool_init(void)
 		pool_init((*pi)->pp, (*pi)->size, (*pi)->align,
 		    (*pi)->align_offset, (*pi)->flags, (*pi)->wchan,
 		    (*pi)->palloc);
+
+	/* XXX XXX */
+	while ((pa = SLIST_FIRST(&pa_deferinitq)) != NULL) {
+		KASSERT(pa->pa_backingmapptr != NULL);
+		KASSERT(*pa->pa_backingmapptr != NULL);
+		SLIST_REMOVE_HEAD(&pa_deferinitq, pa_q);
+		pa_reclaim_register(pa);
+	}
 }
 
 /*
@@ -502,6 +608,10 @@ pool_init(struct pool *pp, size_t size, 
 		simple_lock_init(&palloc->pa_slock);
 		palloc->pa_pagemask = ~(palloc->pa_pagesz - 1);
 		palloc->pa_pageshift = ffs(palloc->pa_pagesz) - 1;
+
+		if (palloc->pa_backingmapptr != NULL) {
+			pa_reclaim_register(palloc);
+		}
 		palloc->pa_flags |= PA_INITIALIZED;
 	}
 
@@ -683,6 +793,7 @@ pool_init(struct pool *pp, size_t size, 
 	TAILQ_INSERT_TAIL(&palloc->pa_list, pp, pr_alloc_list);
 	simple_unlock(&palloc->pa_slock);
 	splx(s);
+	pool_reclaim_register(pp);
 }
 
 /*
@@ -703,6 +814,7 @@ pool_destroy(struct pool *pp)
 	simple_unlock(&pool_head_slock);
 
 	/* Remove this pool from its allocator's list of pools. */
+	pool_reclaim_unregister(pp); /* XXX can sleep */
 	s = splvm();
 	simple_lock(&pp->pr_alloc->pa_slock);
 	TAILQ_REMOVE(&pp->pr_alloc->pa_list, pp, pr_alloc_list);
@@ -897,24 +1009,10 @@ pool_get(struct pool *pp, int flags)
 			if (pp->pr_curpage != NULL)
 				goto startover;
 
-			if ((flags & PR_WAITOK) == 0) {
-				pp->pr_nfail++;
-				pr_leave(pp);
-				simple_unlock(&pp->pr_slock);
-				return (NULL);
-			}
-
-			/*
-			 * Wait for items to be returned to this pool.
-			 *
-			 * wake up once a second and try again,
-			 * as the check in pool_cache_put_paddr() is racy.
-			 */
-			pp->pr_flags |= PR_WANTED;
-			/* PA_WANTED is already set on the allocator. */
+			pp->pr_nfail++;
 			pr_leave(pp);
-			ltsleep(pp, PSWP, pp->pr_wchan, hz, &pp->pr_slock);
-			pr_enter(pp, file, line);
+			simple_unlock(&pp->pr_slock);
+			return (NULL);
 		}
 
 		/* Start the allocation process over. */
@@ -1114,7 +1212,7 @@ pool_do_put(struct pool *pp, void *v, st
 		pp->pr_nidle++;
 		if (pp->pr_npages > pp->pr_minpages &&
 		    (pp->pr_npages > pp->pr_maxpages ||
-		     (pp->pr_alloc->pa_flags & PA_WANT) != 0)) {
+		     pa_starved_p(pp->pr_alloc))) {
 			pr_rmpage(pp, ph, pq);
 		} else {
 			LIST_REMOVE(ph, ph_pagelist);
@@ -1483,7 +1581,8 @@ pool_reclaim(struct pool *pp)
 
 		KASSERT(ph->ph_nmissing == 0);
 		timersub(&curtime, &ph->ph_time, &diff);
-		if (diff.tv_sec < pool_inactive_time)
+		if (diff.tv_sec < pool_inactive_time
+		    && !pa_starved_p(pp->pr_alloc))
 			continue;
 
 		/*
@@ -2166,10 +2265,12 @@ void	pool_page_free(struct pool *, void 
 #ifdef POOL_SUBPAGE
 struct pool_allocator pool_allocator_kmem_fullpage = {
 	pool_page_alloc, pool_page_free, 0,
+	.pa_backingmapptr = &kmem_map,
 };
 #else
 struct pool_allocator pool_allocator_kmem = {
 	pool_page_alloc, pool_page_free, 0,
+	.pa_backingmapptr = &kmem_map,
 };
 #endif
 
@@ -2179,10 +2280,12 @@ void	pool_page_free_nointr(struct pool *
 #ifdef POOL_SUBPAGE
 struct pool_allocator pool_allocator_nointr_fullpage = {
 	pool_page_alloc_nointr, pool_page_free_nointr, 0,
+	.pa_backingmapptr = &kernel_map,
 };
 #else
 struct pool_allocator pool_allocator_nointr = {
 	pool_page_alloc_nointr, pool_page_free_nointr, 0,
+	.pa_backingmapptr = &kernel_map,
 };
 #endif
 
@@ -2192,6 +2295,7 @@ void	pool_subpage_free(struct pool *, vo
 
 struct pool_allocator pool_allocator_kmem = {
 	pool_subpage_alloc, pool_subpage_free, POOL_SUBPAGE,
+	.pa_backingmapptr = &kmem_map,
 };
 
 void	*pool_subpage_alloc_nointr(struct pool *, int);
@@ -2199,125 +2303,41 @@ void	pool_subpage_free_nointr(struct poo
 
 struct pool_allocator pool_allocator_nointr = {
 	pool_subpage_alloc, pool_subpage_free, POOL_SUBPAGE,
+	.pa_backingmapptr = &kmem_map,
 };
 #endif /* POOL_SUBPAGE */
 
-/*
- * We have at least three different resources for the same allocation and
- * each resource can be depleted.  First, we have the ready elements in the
- * pool.  Then we have the resource (typically a vm_map) for this allocator.
- * Finally, we have physical memory.  Waiting for any of these can be
- * unnecessary when any other is freed, but the kernel doesn't support
- * sleeping on multiple wait channels, so we have to employ another strategy.
- *
- * The caller sleeps on the pool (so that it can be awakened when an item
- * is returned to the pool), but we set PA_WANT on the allocator.  When a
- * page is returned to the allocator and PA_WANT is set, pool_allocator_free
- * will wake up all sleeping pools belonging to this allocator.
- *
- * XXX Thundering herd.
- */
-void *
-pool_allocator_alloc(struct pool *org, int flags)
+static void *
+pool_allocator_alloc(struct pool *pp, int flags)
 {
-	struct pool_allocator *pa = org->pr_alloc;
-	struct pool *pp, *start;
-	int s, freed;
+	struct pool_allocator *pa = pp->pr_alloc;
 	void *res;
 
-	LOCK_ASSERT(!simple_lock_held(&org->pr_slock));
+	LOCK_ASSERT(!simple_lock_held(&pp->pr_slock));
 
-	do {
-		if ((res = (*pa->pa_alloc)(org, flags)) != NULL)
-			return (res);
-		if ((flags & PR_WAITOK) == 0) {
-			/*
-			 * We only run the drain hookhere if PR_NOWAIT.
-			 * In other cases, the hook will be run in
-			 * pool_reclaim().
-			 */
-			if (org->pr_drain_hook != NULL) {
-				(*org->pr_drain_hook)(org->pr_drain_hook_arg,
-				    flags);
-				if ((res = (*pa->pa_alloc)(org, flags)) != NULL)
-					return (res);
-			}
-			break;
-		}
-
+	res = (*pa->pa_alloc)(pp, flags);
+	if (res == NULL && (flags & PR_WAITOK) == 0) {
 		/*
-		 * Drain all pools, that use this allocator.
-		 * We do this to reclaim VA space.
-		 * pa_alloc is responsible for waiting for
-		 * physical memory.
-		 *
-		 * XXX We risk looping forever if start if someone
-		 * calls pool_destroy on "start".  But there is no
-		 * other way to have potentially sleeping pool_reclaim,
-		 * non-sleeping locks on pool_allocator, and some
-		 * stirring of drained pools in the allocator.
-		 *
-		 * XXX Maybe we should use pool_head_slock for locking
-		 * the allocators?
+		 * We only run the drain hook here if PR_NOWAIT.
+		 * In other cases, the hook will be run in
+		 * pool_reclaim().
 		 */
-		freed = 0;
-
-		s = splvm();
-		simple_lock(&pa->pa_slock);
-		pp = start = TAILQ_FIRST(&pa->pa_list);
-		do {
-			TAILQ_REMOVE(&pa->pa_list, pp, pr_alloc_list);
-			TAILQ_INSERT_TAIL(&pa->pa_list, pp, pr_alloc_list);
-			simple_unlock(&pa->pa_slock);
-			freed = pool_reclaim(pp);
-			simple_lock(&pa->pa_slock);
-		} while ((pp = TAILQ_FIRST(&pa->pa_list)) != start &&
-			 freed == 0);
-
-		if (freed == 0) {
-			/*
-			 * We set PA_WANT here, the caller will most likely
-			 * sleep waiting for pages (if not, this won't hurt
-			 * that much), and there is no way to set this in
-			 * the caller without violating locking order.
-			 */
-			pa->pa_flags |= PA_WANT;
+		if (pp->pr_drain_hook != NULL) {
+			(*pp->pr_drain_hook)(pp->pr_drain_hook_arg, flags);
+			res = (*pa->pa_alloc)(pp, flags);
 		}
-		simple_unlock(&pa->pa_slock);
-		splx(s);
-	} while (freed);
-	return (NULL);
+	}
+	return res;
 }
 
-void
+static void
 pool_allocator_free(struct pool *pp, void *v)
 {
 	struct pool_allocator *pa = pp->pr_alloc;
-	int s;
 
 	LOCK_ASSERT(!simple_lock_held(&pp->pr_slock));
 
 	(*pa->pa_free)(pp, v);
-
-	s = splvm();
-	simple_lock(&pa->pa_slock);
-	if ((pa->pa_flags & PA_WANT) == 0) {
-		simple_unlock(&pa->pa_slock);
-		splx(s);
-		return;
-	}
-
-	TAILQ_FOREACH(pp, &pa->pa_list, pr_alloc_list) {
-		simple_lock(&pp->pr_slock);
-		if ((pp->pr_flags & PR_WANTED) != 0) {
-			pp->pr_flags &= ~PR_WANTED;
-			wakeup(pp);
-		}
-		simple_unlock(&pp->pr_slock);
-	}
-	pa->pa_flags &= ~PA_WANT;
-	simple_unlock(&pa->pa_slock);
-	splx(s);
 }
 
 void *
Index: kern/vfs_bio.c
===================================================================
--- kern/vfs_bio.c	(revision 1587)
+++ kern/vfs_bio.c	(working copy)
@@ -177,9 +177,9 @@ struct simplelock bqueue_slock = SIMPLEL
 
 /*
  * Buffer pool for I/O buffers.
- * Access to this pool must be protected with splbio().
  */
-static POOL_INIT(bufpool, sizeof(struct buf), 0, 0, 0, "bufpl", NULL);
+static POOL_INIT(bufpool, sizeof(struct buf), 0, 0, 0, "bufpl",
+    &pool_allocator_nointr);
 
 
 /* XXX - somewhat gross.. */
@@ -375,8 +375,7 @@ bufinit(void)
 	if (bufmem_valimit != 0) {
 		vaddr_t minaddr = 0, maxaddr;
 		buf_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
-					  bufmem_valimit, VM_MAP_PAGEABLE,
-					  FALSE, 0);
+					  bufmem_valimit, 0, FALSE, 0);
 		if (buf_map == NULL)
 			panic("bufinit: cannot allocate submap");
 	} else
@@ -393,6 +392,7 @@ bufinit(void)
 	use_std = 1;
 #endif
 
+	bufmempool_allocator.pa_backingmap = buf_map;
 	for (i = 0; i < NMEMPOOLS; i++) {
 		struct pool_allocator *pa;
 		struct pool *pp = &bmempools[i];
@@ -985,13 +985,13 @@ already_queued:
 	/* Allow disk interrupts. */
 	simple_unlock(&bp->b_interlock);
 	simple_unlock(&bqueue_slock);
+	splx(s);
 	if (bp->b_bufsize <= 0) {
 #ifdef DEBUG
 		memset((char *)bp, 0, sizeof(*bp));
 #endif
 		pool_put(&bufpool, bp);
 	}
-	splx(s);
 }
 
 /*
Index: kern/uipc_socket.c
===================================================================
--- kern/uipc_socket.c	(revision 1590)
+++ kern/uipc_socket.c	(working copy)
@@ -126,16 +126,8 @@ EVCNT_ATTACH_STATIC(sosend_kvalimit);
 
 #endif /* SOSEND_COUNTERS */
 
-void
-soinit(void)
-{
+static struct callback_entry sokva_reclaimerentry;
 
-	/* Set the initial adjusted socket buffer size. */
-	if (sb_max_set(sb_max))
-		panic("bad initial sb_max value: %lu", sb_max);
-
-}
-
 #ifdef SOSEND_NO_LOAN
 int use_sosend_loan = 0;
 #else
@@ -438,6 +430,32 @@ sosend_loan(struct socket *so, struct ui
 	}
 
 	return (space);
+}
+
+static int
+sokva_reclaim_callback(struct callback_entry *ce, void *obj, void *arg)
+{
+
+	KASSERT(ce == &sokva_reclaimerentry);
+	KASSERT(obj == NULL);
+
+	sodopendfree();
+	if (!vm_map_starved_p(kernel_map)) {
+		return CALLBACK_CHAIN_ABORT;
+	}
+	return CALLBACK_CHAIN_CONTINUE;
+}
+
+void
+soinit(void)
+{
+
+	/* Set the initial adjusted socket buffer size. */
+	if (sb_max_set(sb_max))
+		panic("bad initial sb_max value: %lu", sb_max);
+
+	callback_register(&vm_map_to_kernel(kernel_map)->vmk_reclaim_callback,
+	    &sokva_reclaimerentry, NULL, sokva_reclaim_callback);
 }
 
 /*
Index: kern/uipc_mbuf.c
===================================================================
--- kern/uipc_mbuf.c	(revision 1585)
+++ kern/uipc_mbuf.c	(working copy)
@@ -154,6 +154,7 @@ mbinit(void)
 	KASSERT(sizeof(struct _m_ext) <= MHLEN);
 	KASSERT(sizeof(struct mbuf) == MSIZE);
 
+	mclpool_allocator.pa_backingmap = mb_map;
 	pool_init(&mbpool, msize, 0, 0, 0, "mbpl", NULL);
 	pool_init(&mclpool, mclbytes, 0, 0, 0, "mclpl", &mclpool_allocator);
 
Index: kern/subr_callback.c
===================================================================
--- kern/subr_callback.c	(revision 0)
+++ kern/subr_callback.c	(revision 0)
@@ -0,0 +1,139 @@
+/*	$NetBSD$	*/
+
+/*-
+ * Copyright (c)2006 YAMAMOTO Takashi,
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/callback.h>
+
+#define	CH_WANT	1
+
+void
+callback_head_init(struct callback_head *ch)
+{
+
+	simple_lock_init(&ch->ch_lock);
+	TAILQ_INIT(&ch->ch_q);
+	ch->ch_next = NULL;
+	ch->ch_nentries = 0;
+}
+
+void
+callback_register(struct callback_head *ch, struct callback_entry *ce,
+    void *obj, int (*fn)(struct callback_entry *, void *, void *))
+{
+
+	ce->ce_func = fn;
+	ce->ce_obj = obj;
+	simple_lock(&ch->ch_lock);
+	TAILQ_INSERT_TAIL(&ch->ch_q, ce, ce_q);
+	ch->ch_nentries++;
+	simple_unlock(&ch->ch_lock);
+}
+
+void
+callback_unregister(struct callback_head *ch, struct callback_entry *ce)
+{
+
+	simple_lock(&ch->ch_lock);
+	while (ch->ch_running > 0) {
+		ch->ch_flags |= CH_WANT;
+		ltsleep(&ch->ch_running, PVM, "recunreg", 0, &ch->ch_lock);
+	}
+	if (__predict_false(ch->ch_next == ce)) {
+		ch->ch_next = TAILQ_NEXT(ce, ce_q);
+	}
+	TAILQ_REMOVE(&ch->ch_q, ce, ce_q);
+	ch->ch_nentries--;
+	simple_unlock(&ch->ch_lock);
+}
+
+static int
+callback_runone(struct callback_head *ch, void *arg)
+{
+	struct callback_entry *ce;
+	int result;
+
+	KASSERT(ch->ch_nentries > 0);
+	KASSERT(ch->ch_running > 0);
+
+	ce = ch->ch_next;
+	if (ce == NULL) {
+		ce = TAILQ_FIRST(&ch->ch_q);
+	}
+	KASSERT(ce != NULL);
+	result = (*ce->ce_func)(ce, ce->ce_obj, arg);
+	ch->ch_next = TAILQ_NEXT(ce, ce_q);
+	return result;
+}
+
+static void
+callback_run_enter(struct callback_head *ch)
+{
+
+	simple_lock(&ch->ch_lock);
+	ch->ch_running++;
+	simple_unlock(&ch->ch_lock);
+}
+
+static void
+callback_run_leave(struct callback_head *ch)
+{
+
+	simple_lock(&ch->ch_lock);
+	KASSERT(ch->ch_running > 0);
+	ch->ch_running--;
+	if (ch->ch_running == 0 && (ch->ch_flags & CH_WANT) != 0) {
+		ch->ch_flags &= ~CH_WANT;
+		wakeup(&ch->ch_running);
+	}
+	simple_unlock(&ch->ch_lock);
+}
+
+int
+callback_run_roundrobin(struct callback_head *ch, void *arg)
+{
+	int i;
+	int n;
+	int result = 0;
+
+	callback_run_enter(ch);
+	n = ch->ch_nentries;
+	for (i = 0; i < n; i++) {
+		result = callback_runone(ch, arg);
+		if (result != CALLBACK_CHAIN_CONTINUE) {
+			break;
+		}
+	}
+	callback_run_leave(ch);
+
+	return result;
+}
Index: uvm/uvm_km.c
===================================================================
--- uvm/uvm_km.c	(revision 1591)
+++ uvm/uvm_km.c	(working copy)
@@ -188,7 +188,8 @@ km_vacache_alloc(struct pool *pp, int fl
 	if (uvm_map(map, &va, size, NULL, UVM_UNKNOWN_OFFSET, size,
 	    UVM_MAPFLAG(UVM_PROT_ALL, UVM_PROT_ALL, UVM_INH_NONE,
 	    UVM_ADV_RANDOM, UVM_FLAG_QUANTUM |
-	    ((flags & PR_WAITOK) ? 0 : UVM_FLAG_TRYLOCK | UVM_FLAG_NOWAIT))))
+	    ((flags & PR_WAITOK) ? UVM_FLAG_WAITVA :
+	    UVM_FLAG_TRYLOCK | UVM_FLAG_NOWAIT))))
 		return NULL;
 
 	return (void *)va;
@@ -226,10 +227,9 @@ km_vacache_init(struct vm_map *map, cons
 	pa->pa_alloc = km_vacache_alloc;
 	pa->pa_free = km_vacache_free;
 	pa->pa_pagesz = (unsigned int)size;
+	pa->pa_backingmap = map;
+	pa->pa_backingmapptr = NULL;
 	pool_init(pp, PAGE_SIZE, 0, 0, PR_NOTOUCH | PR_RECURSIVE, name, pa);
-
-	/* XXX for now.. */
-	pool_sethiwat(pp, 0);
 }
 
 void
@@ -252,6 +252,30 @@ uvm_km_vacache_init(struct vm_map *map, 
 }
 
 #endif /* !defined(PMAP_MAP_POOLPAGE) */
+
+void
+uvm_km_va_drain(struct vm_map *map, uvm_flag_t flags)
+{
+	struct vm_map_kernel *vmk = vm_map_to_kernel(map);
+	const boolean_t intrsafe = (map->flags & VM_MAP_INTRSAFE) != 0;
+#if 0
+	const int rflags =
+	    (flags & (UVM_FLAG_NOWAIT|UVM_FLAG_WAITVA)) == UVM_FLAG_WAITVA ?
+	    0 : RECLAIM_FLAG_NOWAIT;
+	struct reclaim_args args = {
+		.ra_flags = rflags,
+	};
+#endif
+	int s = 0xdeadbeaf; /* XXX: gcc */
+
+	if (intrsafe) {
+		s = splvm();
+	}
+	callback_run_roundrobin(&vmk->vmk_reclaim_callback, NULL);
+	if (intrsafe) {
+		splx(s);
+	}
+}
 
 /*
  * uvm_km_init: init kernel maps and objects to reflect reality (i.e.
Index: uvm/uvm_km.h
===================================================================
--- uvm/uvm_km.h	(revision 1464)
+++ uvm/uvm_km.h	(working copy)
@@ -55,6 +55,7 @@ void uvm_km_check_empty(vaddr_t, vaddr_t
 #else
 #define	uvm_km_check_empty(a, b, c)	/* nothing */
 #endif /* defined(DEBUG) */
+void uvm_km_va_drain(struct vm_map *, uvm_flag_t);
 
 #endif /* _KERNEL */
 
Index: uvm/uvm_map.c
===================================================================
--- uvm/uvm_map.c	(revision 1587)
+++ uvm/uvm_map.c	(working copy)
@@ -742,7 +742,17 @@ uvm_map_clip_end(struct vm_map *map, str
 	uvm_tree_sanity(map, "clip_end leave");
 }
 
+static void
+vm_map_drain(struct vm_map *map, uvm_flag_t flags)
+{
 
+	if (!VM_MAP_IS_KERNEL(map)) {
+		return;
+	}
+
+	uvm_km_va_drain(map, flags);
+}
+
 /*
  *   M A P   -   m a i n   e n t r y   p o i n t
  */
@@ -875,16 +885,11 @@ retry:
 		}
 		vm_map_lock(map); /* could sleep here */
 	}
-	if ((prev_entry = uvm_map_findspace(map, start, size, &start,
-	    uobj, uoffset, align, flags)) == NULL) {
+	prev_entry = uvm_map_findspace(map, start, size, &start,
+	    uobj, uoffset, align, flags);
+	if (prev_entry == NULL) {
 		unsigned int timestamp;
 
-		if ((flags & UVM_FLAG_WAITVA) == 0) {
-			UVMHIST_LOG(maphist,"<- uvm_map_findspace failed!",
-			    0,0,0,0);
-			vm_map_unlock(map);
-			return ENOMEM;
-		}
 		timestamp = map->timestamp;
 		UVMHIST_LOG(maphist,"waiting va timestamp=0x%x",
 			    timestamp,0,0,0);
@@ -894,15 +899,24 @@ retry:
 		vm_map_unlock(map);
 
 		/*
-		 * wait until someone does unmap.
+		 * try to reclaim kva and wait until someone does unmap.
 		 * XXX fragile locking
 		 */
 
+		vm_map_drain(map, flags);
+
 		simple_lock(&map->flags_lock);
 		while ((map->flags & VM_MAP_WANTVA) != 0 &&
 		   map->timestamp == timestamp) {
-			ltsleep(&map->header, PVM, "vmmapva", 0,
-			    &map->flags_lock);
+			if ((flags & UVM_FLAG_WAITVA) == 0) {
+				simple_unlock(&map->flags_lock);
+				UVMHIST_LOG(maphist,
+				    "<- uvm_map_findspace failed!", 0,0,0,0);
+				return ENOMEM;
+			} else {
+				ltsleep(&map->header, PVM, "vmmapva", 0,
+				    &map->flags_lock);
+			}
 		}
 		simple_unlock(&map->flags_lock);
 		goto retry;
@@ -2655,6 +2669,7 @@ uvm_map_setup_kernel(struct vm_map_kerne
 
 	uvm_map_setup(&map->vmk_map, vmin, vmax, flags);
 
+	callback_head_init(&map->vmk_reclaim_callback);
 	LIST_INIT(&map->vmk_kentry_free);
 	map->vmk_merged_entries = NULL;
 }
@@ -4789,4 +4804,18 @@ vm_map_to_kernel(struct vm_map *map)
 	KASSERT(VM_MAP_IS_KERNEL(map));
 
 	return (struct vm_map_kernel *)map;
+}
+
+boolean_t
+vm_map_starved_p(struct vm_map *map)
+{
+
+	if ((map->flags & VM_MAP_WANTVA) != 0) {
+		return TRUE;
+	}
+	/* XXX */
+	if ((vm_map_max(map) - vm_map_min(map)) / 16 * 15 < map->size) {
+		return TRUE;
+	}
+	return FALSE;
 }
Index: uvm/uvm_map.h
===================================================================
--- uvm/uvm_map.h	(revision 1571)
+++ uvm/uvm_map.h	(working copy)
@@ -234,6 +234,9 @@ struct vm_map {
 };
 
 #if defined(_KERNEL)
+
+#include <sys/callback.h>
+
 struct vm_map_kernel {
 	struct vm_map vmk_map;
 	LIST_HEAD(, uvm_kmapent_hdr) vmk_kentry_free;
@@ -241,6 +244,7 @@ struct vm_map_kernel {
 	struct vm_map_entry	*vmk_merged_entries;
 			/* Merged entries, kept for later splitting */
 
+	struct callback_head vmk_reclaim_callback;
 #if !defined(PMAP_MAP_POOLPAGE)
 	struct pool vmk_vacache; /* kva cache */
 	struct pool_allocator vmk_vacache_allocator; /* ... and its allocator */
@@ -506,6 +510,8 @@ do {									\
 	if (oflags & VM_MAP_WANTLOCK)					\
 		wakeup(&(map)->flags);					\
 } while (/*CONSTCOND*/ 0)
+
+boolean_t vm_map_starved_p(struct vm_map *);
 
 #endif /* _KERNEL */
 

--NextPart-20060406202103-1948600--