Source-Changes-HG archive


[src/trunk]: src/sys/rump/librump/rumpkern Improve pagedaemon performance:



details:   https://anonhg.NetBSD.org/src/rev/a255ead8b5e3
branches:  trunk
changeset: 757629:a255ead8b5e3
user:      pooka <pooka%NetBSD.org@localhost>
date:      Wed Sep 08 21:02:11 2010 +0000

description:
Improve pagedaemon performance:
  * page out vnode objects
  * drain kmem/kernel_map

As long as there is a reasonable memory hardlimit (>600kB or so),
a rump kernel can now survive file system metadata access for an
arbitrary size file system (provided, of course, that the file
system does not use wired kernel memory for metadata ...).
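
For reference, a minimal sketch of how a hosting process could impose
such a hardlimit on its rump kernel.  It assumes the RUMP_MEMLIMIT
host environment variable is the bootstrap knob and that a plain byte
count is accepted; check rump(3)/rumpuser on your tree for the exact
parameter name and accepted formats.

    #include <err.h>
    #include <stdlib.h>

    #include <rump/rump.h>

    int
    main(void)
    {

            /*
             * Assumption: the rump kernel picks its memory hardlimit
             * up from RUMP_MEMLIMIT at bootstrap; 1MB given as a
             * plain byte count here.
             */
            if (setenv("RUMP_MEMLIMIT", "1048576", 1) == -1)
                    err(1, "setenv");

            if (rump_init() != 0)
                    errx(1, "rump_init");

            /*
             * Mount and exercise a file system here: metadata access
             * now runs against the ~1MB limit, with the pagedaemon
             * paging out vnode pages and draining kmem/kernel_map as
             * needed.
             */
            return 0;
    }

(Link against the usual rump libraries, e.g. -lrump -lrumpuser.)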

Data handling still needs a little give&take finetuning.  The
general problem is that a single vm object can easily be the owner
of all vm pages in a rump kernel.  Now, if a thread wants to allocate
memory while holding that object locked, there's very little the
pagedaemon can do to avoid deadlock.  But I think the problem can
be solved by making an object release a page when it wants to
allocate a page, if a) the system is short on memory and b) too many
pages belong to the object.  That still doesn't take care of the
pathological situation where 1000 threads hold an object with 1
page of memory locked and try to allocate more.  But then again,
running 1000 threads with <1MB of memory is an unlikely scenario.
And ultimately, I call upon the fundamental interaction which is
the basis of why any operating system works: luck.
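
To make the proposed give&take idea concrete, here is a purely
hypothetical sketch (not part of this change) of the check the
allocation path could make, as it would live in vm.c.  The helper
name and the one-half threshold are invented for illustration; it
leans on NEED_PAGEDAEMON() and vmpage_onqueue introduced in the diff
below.

    /*
     * HYPOTHETICAL: should an object recycle one of its own pages
     * before it is allowed to grow?
     */
    static bool
    uvm_obj_should_selfrecycle(struct uvm_object *uobj)
    {

            /* a) the system is short on memory ... */
            if (!NEED_PAGEDAEMON())
                    return false;

            /*
             * ... and b) "too many" pages belong to this object.
             * The one-half threshold is purely illustrative.
             */
            return uobj->uo_npages > vmpage_onqueue / 2;
    }

uvm_pagealloc_strat() could consult this before taking a fresh page
and, when it returns true, push the object's oldest page out with
pgo_put(PGO_CLEANIT|PGO_FREE).  The fiddly part is that pgo_put()
releases the object lock, so the caller would have to re-enter it
and revalidate its state before continuing.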

diffstat:

 sys/rump/librump/rumpkern/vm.c |  224 ++++++++++++++++++++++++++++++++++------
 1 files changed, 186 insertions(+), 38 deletions(-)

diffs (truncated from 347 to 300 lines):

diff -r c669a8301107 -r a255ead8b5e3 sys/rump/librump/rumpkern/vm.c
--- a/sys/rump/librump/rumpkern/vm.c    Wed Sep 08 20:40:24 2010 +0000
+++ b/sys/rump/librump/rumpkern/vm.c    Wed Sep 08 21:02:11 2010 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: vm.c,v 1.91 2010/09/07 21:11:10 pooka Exp $    */
+/*     $NetBSD: vm.c,v 1.92 2010/09/08 21:02:11 pooka Exp $    */
 
 /*
  * Copyright (c) 2007-2010 Antti Kantee.  All Rights Reserved.
@@ -41,7 +41,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vm.c,v 1.91 2010/09/07 21:11:10 pooka Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vm.c,v 1.92 2010/09/08 21:02:11 pooka Exp $");
 
 #include <sys/param.h>
 #include <sys/atomic.h>
@@ -84,6 +84,26 @@
 
 unsigned long rump_physmemlimit = RUMPMEM_UNLIMITED;
 static unsigned long curphysmem;
+static unsigned long dddlim;           /* 90% of memory limit used */
+#define NEED_PAGEDAEMON() \
+    (rump_physmemlimit != RUMPMEM_UNLIMITED && curphysmem > dddlim)
+
+/*
+ * Try to free two pages worth of pages from objects.
+ * If this successfully frees a full page cache page, we'll
+ * free the released page plus PAGE_SIZE*2/sizeof(vm_page).
+ */
+#define PAGEDAEMON_OBJCHUNK (2*PAGE_SIZE / sizeof(struct vm_page))
+
+/*
+ * Keep a list of least recently used pages.  Since the only way a
+ * rump kernel can "access" a page is via lookup, we put the page
+ * at the back of queue every time a lookup for it is done.  If the
+ * page is in front of this global queue and we're short of memory, 
+ * it's a candidate for pageout.
+ */
+static struct pglist vmpage_lruqueue;
+static unsigned vmpage_onqueue;
 
 static int
 pg_compare_key(const struct rb_node *n, const void *key)
@@ -135,13 +155,18 @@
 
 static struct pool_cache pagecache;
 
-/* called with the object locked */
+/*
+ * Called with the object locked.  We don't support anons.
+ */
 struct vm_page *
 uvm_pagealloc_strat(struct uvm_object *uobj, voff_t off, struct vm_anon *anon,
        int flags, int strat, int free_list)
 {
        struct vm_page *pg;
 
+       KASSERT(uobj && mutex_owned(&uobj->vmobjlock));
+       KASSERT(anon == NULL);
+
        pg = pool_cache_get(&pagecache, PR_WAITOK);
        pg->offset = off;
        pg->uobject = uobj;
@@ -154,6 +179,17 @@
        TAILQ_INSERT_TAIL(&uobj->memq, pg, listq.queue);
        rb_tree_insert_node(&uobj->rb_tree, &pg->rb_node);
 
+       /*
+        * Put vnodes on the LRU page queue.  we can't flush others,
+        * so don't bother with them.
+        */
+       if (UVM_OBJ_IS_VNODE(uobj)) {
+               atomic_inc_uint(&vmpage_onqueue);
+               mutex_enter(&uvm_pageqlock);
+               TAILQ_INSERT_TAIL(&vmpage_lruqueue, pg, pageq.queue);
+               mutex_exit(&uvm_pageqlock);
+       }
+
        uobj->uo_npages++;
 
        return pg;
@@ -169,12 +205,21 @@
 {
        struct uvm_object *uobj = pg->uobject;
 
+       KASSERT(mutex_owned(&uvm_pageqlock));
+
        if (pg->flags & PG_WANTED)
                wakeup(pg);
 
+       TAILQ_REMOVE(&uobj->memq, pg, listq.queue);
+
        uobj->uo_npages--;
        rb_tree_remove_node(&uobj->rb_tree, &pg->rb_node);
-       TAILQ_REMOVE(&uobj->memq, pg, listq.queue);
+
+       if (UVM_OBJ_IS_VNODE(uobj)) {
+               TAILQ_REMOVE(&vmpage_lruqueue, pg, pageq.queue);
+               atomic_dec_uint(&vmpage_onqueue);
+       }
+
        pool_cache_put(&pagecache, pg);
 }
 
@@ -207,11 +252,14 @@
                CTASSERT(sizeof(buf) >= HUMANIZE_BYTES);
                format_bytes(buf, HUMANIZE_BYTES, rump_physmemlimit);
 #undef HUMANIZE_BYTES
+               dddlim = 9 * (rump_physmemlimit / 10);
        } else {
                strlcpy(buf, "unlimited (host limit)", sizeof(buf));
        }
        aprint_verbose("total memory = %s\n", buf);
 
+       TAILQ_INIT(&vmpage_lruqueue);
+
        uvmexp.free = 1024*1024; /* XXX: arbitrary & not updated */
 
        mutex_init(&pagermtx, MUTEX_DEFAULT, 0);
@@ -416,8 +464,17 @@
 struct vm_page *
 uvm_pagelookup(struct uvm_object *uobj, voff_t off)
 {
+       struct vm_page *pg;
 
-       return (struct vm_page *)rb_tree_find_node(&uobj->rb_tree, &off);
+       pg = (struct vm_page *)rb_tree_find_node(&uobj->rb_tree, &off);
+       if (pg && UVM_OBJ_IS_VNODE(pg->uobject)) {
+               mutex_enter(&uvm_pageqlock);
+               TAILQ_REMOVE(&vmpage_lruqueue, pg, pageq.queue);
+               TAILQ_INSERT_TAIL(&vmpage_lruqueue, pg, pageq.queue);
+               mutex_exit(&uvm_pageqlock);
+       }
+
+       return pg;
 }
 
 void
@@ -755,30 +812,125 @@
 }
 
 /*
- * Under-construction page mistress.  This is lacking vfs support, namely:
- *
- *  1) draining vfs buffers
- *  2) paging out pages in vm vnode objects
- *     (we will not page out anon memory on the basis that
- *     that's the task of the host)
+ * The Diabolical pageDaemon Director (DDD).
  */
-
 void
 uvm_pageout(void *arg)
 {
+       struct vm_page *pg;
        struct pool *pp, *pp_first;
        uint64_t where;
        int timo = 0;
-       bool succ;
+       int cleaned, skip, skipped;
+       bool succ = false;
 
        mutex_enter(&pdaemonmtx);
        for (;;) {
-               cv_timedwait(&pdaemoncv, &pdaemonmtx, timo);
+               if (succ) {
+                       kernel_map->flags &= ~VM_MAP_WANTVA;
+                       kmem_map->flags &= ~VM_MAP_WANTVA;
+                       timo = 0;
+               }
+               succ = false;
+
+               /*
+                * Wake up everyone regardless of perceived success.
+                * They will just resleep if we're still out of juice.
+                */
+               if (pdaemon_waiters) {
+                       pdaemon_waiters = 0;
+                       cv_broadcast(&oomwait);
+               }
+
+               cv_timedwait(&pdaemoncv, &pdaemonmtx, 0);
                uvmexp.pdwoke++;
+
+               /* tell the world that we are hungry */
                kernel_map->flags |= VM_MAP_WANTVA;
+               kmem_map->flags |= VM_MAP_WANTVA;
+
+               if (pdaemon_waiters == 0 && !NEED_PAGEDAEMON())
+                       continue;
                mutex_exit(&pdaemonmtx);
 
-               succ = false;
+               /*
+                * step one: reclaim the page cache.  this should give
+                * us the biggest earnings since whole pages are released
+                * into backing memory.
+                */
+               pool_cache_reclaim(&pagecache);
+               if (!NEED_PAGEDAEMON()) {
+                       succ = true;
+                       mutex_enter(&pdaemonmtx);
+                       continue;
+               }
+
+               /*
+                * Ok, so that didn't help.  Next, try to hunt memory
+                * by pushing out vnode pages.  The pages might contain
+                * useful cached data, but we need the memory.
+                */
+               cleaned = 0;
+               skip = 0;
+ again:
+               mutex_enter(&uvm_pageqlock);
+               while (cleaned < PAGEDAEMON_OBJCHUNK) {
+                       skipped = 0;
+                       TAILQ_FOREACH(pg, &vmpage_lruqueue, pageq.queue) {
+                               struct uvm_object *uobj;
+
+                               /*
+                                * skip over pages we _might_ have tried
+                                * to handle earlier.  they might not be
+                                * exactly the same ones, but I'm not too
+                                * concerned.
+                                */
+                               while (skipped++ < skip)
+                                       continue;
+
+                               uobj = pg->uobject;
+                               if (mutex_tryenter(&uobj->vmobjlock)) {
+                                       if ((pg->flags & PG_BUSY) == 0) {
+                                               mutex_exit(&uvm_pageqlock);
+                                               uobj->pgops->pgo_put(uobj,
+                                                   pg->offset,
+                                                   pg->offset + PAGE_SIZE,
+                                                   PGO_CLEANIT|PGO_FREE);
+                                               cleaned++;
+                                               goto again;
+                                       }
+                               }
+
+                               skip++;
+                       }
+                       break;
+               }
+               mutex_exit(&uvm_pageqlock);
+
+               /*
+                * And of course we need to reclaim the page cache
+                * again to actually release memory.
+                */
+               pool_cache_reclaim(&pagecache);
+               if (!NEED_PAGEDAEMON()) {
+                       succ = true;
+                       mutex_enter(&pdaemonmtx);
+                       continue;
+               }
+
+               /*
+                * Still not there?  sleeves come off right about now.
+                * First: do reclaim on kernel/kmem map.
+                */
+               callback_run_roundrobin(&kernel_map_store.vmk_reclaim_callback,
+                   NULL);
+               callback_run_roundrobin(&kmem_map_store.vmk_reclaim_callback,
+                   NULL);
+
+               /*
+                * And then drain the pools.  Wipe them out ... all of them.
+                */
+
                pool_drain_start(&pp_first, &where);
                pp = pp_first;
                for (;;) {
@@ -792,44 +944,38 @@
                                break;
                        }
                }
-               mutex_enter(&pdaemonmtx);
+
+               /*
+                * Need to use PYEC on our bag of tricks.
+                * Unfortunately, the wife just borrowed it.
+                */
 
                if (!succ) {
                        rumpuser_dprintf("pagedaemoness: failed to reclaim "
                            "memory ... sleeping (deadlock?)\n");
-                       timo = hz;
-                       continue;
+                       kpause("dpdd", false, hz, NULL);
                }
-               kernel_map->flags &= ~VM_MAP_WANTVA;
-               timo = 0;
 
-               if (pdaemon_waiters) {
-                       pdaemon_waiters = 0;
-                       cv_broadcast(&oomwait);


