Source-Changes-HG archive


[src/ad-namecache]: src/sys namecache:



details:   https://anonhg.NetBSD.org/src/rev/d45134b4c97d
branches:  ad-namecache
changeset: 467125:d45134b4c97d
user:      ad <ad%NetBSD.org@localhost>
date:      Tue Jan 14 11:07:40 2020 +0000

description:
namecache:

This is working better than expected.  It seems to cut system time for
build.sh by ~10% on my test machine, and joerg@ is seeing better results with
pbulk.  Improve it a bit more without changing the basic idea:

- Split cache_list_lock into a per-vnode rwlock for reverse lookup and a
  lightly contended global lock on LRU state (cache_lru_lock).

- For LRU replacement, imitate the VM system's page replacement algorithm
  (see the sketch after this list).  This eliminates the writebacks to
  struct namecache that were used to track the time of the last hit.

- Dynamically allocate the per-directory lock, preparing the way for a
  "struct nchdir" or similar, which could contain things like alternative
  lookup structures, cached information to do the equivalent of VOP_ACCESS()
  from the cache, and so on.
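
To make the second item concrete, here is a minimal userland sketch of the
two-queue (ACTIVE/INACTIVE) replacement idea.  It is an illustration only:
the names and the standalone data structure are hypothetical stand-ins, and
the real kernel code (cache_activate()/cache_deactivate() in
sys/kern/vfs_cache.c) appears in the diff below.

/*
 * Minimal userland sketch of the two-queue (ACTIVE/INACTIVE) LRU idea.
 * Illustrative names only -- not the kernel implementation.
 */
#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

enum lru_id { LRU_ACTIVE, LRU_INACTIVE, LRU_COUNT };

struct entry {
        TAILQ_ENTRY(entry) e_lru;
        enum lru_id     e_list;
        int             e_key;
};

static struct {
        TAILQ_HEAD(, entry) list[LRU_COUNT];
        unsigned        count[LRU_COUNT];
} lru;

/* A hit: requeue the entry on the tail of the ACTIVE list. */
static void
lru_activate(struct entry *e)
{

        TAILQ_REMOVE(&lru.list[e->e_list], e, e_lru);
        lru.count[e->e_list]--;
        e->e_list = LRU_ACTIVE;
        TAILQ_INSERT_TAIL(&lru.list[LRU_ACTIVE], e, e_lru);
        lru.count[LRU_ACTIVE]++;
}

/* Rebalance: demote up to maxdeact entries from the head of ACTIVE. */
static void
lru_deactivate(int maxdeact)
{
        struct entry *e;
        int i;

        if (lru.count[LRU_ACTIVE] < lru.count[LRU_INACTIVE])
                return;
        for (i = 0; i < maxdeact; i++) {
                if ((e = TAILQ_FIRST(&lru.list[LRU_ACTIVE])) == NULL)
                        break;
                TAILQ_REMOVE(&lru.list[LRU_ACTIVE], e, e_lru);
                lru.count[LRU_ACTIVE]--;
                e->e_list = LRU_INACTIVE;
                TAILQ_INSERT_TAIL(&lru.list[LRU_INACTIVE], e, e_lru);
                lru.count[LRU_INACTIVE]++;
        }
}

int
main(void)
{
        struct entry *e;
        int i;

        TAILQ_INIT(&lru.list[LRU_ACTIVE]);
        TAILQ_INIT(&lru.list[LRU_INACTIVE]);

        /* Insert a few entries; the sketch simply starts them all ACTIVE. */
        for (i = 0; i < 8; i++) {
                e = calloc(1, sizeof(*e));
                e->e_key = i;
                e->e_list = LRU_ACTIVE;
                TAILQ_INSERT_TAIL(&lru.list[LRU_ACTIVE], e, e_lru);
                lru.count[LRU_ACTIVE]++;
        }

        lru_deactivate(2);                      /* demote the two oldest */
        lru_activate(TAILQ_FIRST(&lru.list[LRU_INACTIVE]));     /* a hit */

        printf("active=%u inactive=%u\n",
            lru.count[LRU_ACTIVE], lru.count[LRU_INACTIVE]);
        return 0;
}

On a hit an entry is simply moved to the tail of the ACTIVE list, and
rebalancing demotes the oldest ACTIVE entries, so there is no need to write
a per-entry timestamp back into struct namecache.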

diffstat:

 sys/kern/vfs_cache.c |  526 +++++++++++++++++++++++++++++++-------------------
 sys/sys/namei.src    |   11 +-
 sys/sys/vnode_impl.h |    7 +-
 3 files changed, 338 insertions(+), 206 deletions(-)

diffs (truncated from 991 to 300 lines):

diff -r 30e99cb5d2d0 -r d45134b4c97d sys/kern/vfs_cache.c
--- a/sys/kern/vfs_cache.c      Mon Jan 13 08:51:06 2020 +0000
+++ b/sys/kern/vfs_cache.c      Tue Jan 14 11:07:40 2020 +0000
@@ -1,7 +1,7 @@
-/*     $NetBSD: vfs_cache.c,v 1.126.2.4 2020/01/13 08:51:07 ad Exp $   */
+/*     $NetBSD: vfs_cache.c,v 1.126.2.5 2020/01/14 11:07:40 ad Exp $   */
 
 /*-
- * Copyright (c) 2008, 2019 The NetBSD Foundation, Inc.
+ * Copyright (c) 2008, 2019, 2020 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
@@ -71,6 +71,10 @@
  *     DELETE, or NOCACHE is set (rewrite), and name is located in the
  *     cache, it will be dropped.
  *
+ * Background:
+ *
+ *     XXX add a bit of history
+ *
  * Data structures:
  *
  *     The original BSD implementation used a global hash table, which
@@ -86,24 +90,16 @@
  *     utimately make use of some other data structure, perhaps a Robin
  *     Hood hash.  Insert blurb here when that happens.
  *
+ * Replacement:
+ *
+ *     XXX LRU blurb.
+ *
  * Concurrency:
  *
- *     There are two locks that are of particular interest:
- *
- *     nc_dvp->vi_nclock: a per-directory lock.  This is taken mainly
- *     during lookups.
+ *     XXX need new blurb here
  *
- *     cache_list_lock: a global lock for all lists, including per-vnode
- *     lists and the LRU queue.  This is taken mainly during insertion and
- *     removal, and when operating in the list -> tree direction.
- *
- *     vp->v_interlock: per vnode interlock taken when acquiring a ref.
- *
- *     Most all modifications are made holding both cache_list_lock and the
- *     directory lock write locked.  nc_hittime does not have any kind of
- *     serialization appliet to it - updates are racy, but since it's only
- *     used for pseudo-LRU replacement it doesn't matter.  See definition
- *     of "struct namecache" in src/sys/namei.src for the particulars.
+ *     See definition of "struct namecache" in src/sys/namei.src for the
+ *     particulars.
  *
  *     Per-CPU statistics, and "numcache" are read unlocked, since an
  *     approximate value is OK.  We maintain uintptr_t sized per-CPU
@@ -112,12 +108,14 @@
  *
  * Lock order:
  *
- *     1) nc_dvp->vi_nclock
+ *     1) nc_dvp->vi_ncdlock
  *     2) cache_list_lock
  *     3) vp->v_interlock
  *
  * Ugly ASCII diagram:
  *
+ *     XXX replace tabs with spaces, make less ugly
+ *
  *          ...
  *          |
  *     -----o-----
@@ -150,7 +148,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.126.2.4 2020/01/13 08:51:07 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.126.2.5 2020/01/14 11:07:40 ad Exp $");
 
 #define __NAMECACHE_PRIVATE
 #ifdef _KERNEL_OPT
@@ -176,13 +174,22 @@
 /* Per-CPU counters. */
 struct nchstats_percpu _NAMEI_CACHE_STATS(uintptr_t);
 
-/* Global lock, lists and pool. */
-static kmutex_t cache_list_lock __cacheline_aligned;
-static pool_cache_t namecache_cache __read_mostly;
-static TAILQ_HEAD(, namecache) nclruhead __cacheline_aligned;
+/* Global pool cache. */
+static pool_cache_t cache_pool __read_mostly;
 
-/* Number of cache entries allocated. */
-static u_int   numcache __cacheline_aligned;
+/* LRU replacement state. */
+enum cache_lru_id {
+       LRU_ACTIVE,
+       LRU_INACTIVE,
+       LRU_COUNT
+};
+
+static struct {
+       TAILQ_HEAD(, namecache) list[LRU_COUNT];
+       u_int                   count[LRU_COUNT];
+} cache_lru __cacheline_aligned;
+
+static kmutex_t cache_lru_lock __cacheline_aligned;
 
 /* Cache effectiveness statistics.  This holds total from per-cpu stats */
 struct nchstats        nchstats __cacheline_aligned;
@@ -195,14 +202,12 @@
 } while (/* CONSTCOND */ 0);
 
 /* Tunables */
-static const int cache_hottime = 5;    /* number of seconds */
-static const int cache_maxscan = 128;  /* reclaim: max entries to scan */
-static int doingcache = 1;             /* 1 => enable the cache */
+static const int cache_lru_maxdeact = 2;       /* max # to deactivate */
+static const int cache_lru_maxscan = 128;      /* max # to scan/reclaim */
+static int doingcache = 1;                     /* 1 => enable the cache */
 
 static void    cache_reclaim(void);
-static int     cache_ctor(void *, void *, int);
-static void    cache_dtor(void *, void *);
-static void    cache_remove(struct namecache *, bool);
+static void    cache_remove(struct namecache *, const bool);
 
 /* sysctl */
 static struct  sysctllog *sysctllog;
@@ -273,15 +278,16 @@
 };
 
 /*
- * Remove an entry from the cache.  The directory must be locked, and if
- * "inreverse" is false, cache_list_lock will be acquired when removing the
- * entry from the global lists.
+ * Remove an entry from the cache.  The directory lock must be held, and if
+ * "dirtovnode" is true (it usually will be), the vnode's cache lock will be
+ * acquired when removing the entry from the vnode list.
  */
 static void
-cache_remove(struct namecache *ncp, bool inreverse)
+cache_remove(struct namecache *ncp, const bool dirtovnode)
 {
+       krwlock_t *vlock;
 
-       KASSERT(rw_write_held(VNODE_TO_VIMPL(ncp->nc_dvp)->vi_nclock));
+       KASSERT(rw_write_held(VNODE_TO_VIMPL(ncp->nc_dvp)->vi_ncdlock));
 
        SDT_PROBE(vfs, namecache, invalidate, done, ncp->nc_dvp,
            0, 0, 0, 0);
@@ -289,29 +295,88 @@
        /* First remove from the directory's rbtree. */
        rb_tree_remove_node(&VNODE_TO_VIMPL(ncp->nc_dvp)->vi_nctree, ncp);
 
-       /* Then remove from the lists. */
-       if (!inreverse) {
-               mutex_enter(&cache_list_lock);
-       }
-       ncp->nc_dvp = NULL;
+       /* Then remove from the LRU lists. */
+       mutex_enter(&cache_lru_lock);
+       TAILQ_REMOVE(&cache_lru.list[ncp->nc_lrulist], ncp, nc_lru);
+       cache_lru.count[ncp->nc_lrulist]--;
+       mutex_exit(&cache_lru_lock);
+
+       /* Then remove from the vnode list. */
        if (ncp->nc_vp != NULL) {
+               vlock = VNODE_TO_VIMPL(ncp->nc_vp)->vi_ncvlock;
+               if (__predict_true(dirtovnode)) {
+                       rw_enter(vlock, RW_WRITER);
+               }
                TAILQ_REMOVE(&VNODE_TO_VIMPL(ncp->nc_vp)->vi_nclist, ncp,
                    nc_vlist);
+               if (__predict_true(dirtovnode)) {
+                       rw_exit(vlock);
+               }
                ncp->nc_vp = NULL;
        }
-       TAILQ_REMOVE(&nclruhead, ncp, nc_lru);  
-       numcache--;
-       if (!inreverse) {
-               mutex_exit(&cache_list_lock);
-       }
 
        /* Finally, free it. */
        if (ncp->nc_nlen > NCHNAMLEN) {
                size_t sz = offsetof(struct namecache, nc_name[ncp->nc_nlen]);
                kmem_free(ncp, sz);
        } else {
-               pool_cache_put(namecache_cache, ncp);
+               pool_cache_put(cache_pool, ncp);
+       }
+}
+
+/*
+ * Try to balance the LRU lists.  Pick some victim entries, and re-queue
+ * them from the head of the ACTIVE list to the tail of the INACTIVE list. 
+ */
+static void __noinline
+cache_deactivate(void)
+{
+       struct namecache *ncp;
+       int total, i;
+
+       KASSERT(mutex_owned(&cache_lru_lock));
+
+       /* If we're nowhere near budget yet, don't bother. */
+       total = cache_lru.count[LRU_ACTIVE] + cache_lru.count[LRU_INACTIVE];
+       if (total < (desiredvnodes >> 1)) {
+               return;
+       }
+
+       /* If not out of balance, don't bother. */
+       if (cache_lru.count[LRU_ACTIVE] < cache_lru.count[LRU_INACTIVE]) {
+               return;
        }
+
+       /* Move victim from head of ACTIVE list, to tail of INACTIVE. */
+       for (i = 0; i < cache_lru_maxdeact; i++) {
+               ncp = TAILQ_FIRST(&cache_lru.list[LRU_ACTIVE]);
+               if (ncp == NULL) {
+                       break;
+               }
+               KASSERT(ncp->nc_lrulist == LRU_ACTIVE);
+               ncp->nc_lrulist = LRU_INACTIVE;
+               TAILQ_REMOVE(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru);
+               TAILQ_INSERT_TAIL(&cache_lru.list[LRU_INACTIVE], ncp, nc_lru);
+               cache_lru.count[LRU_ACTIVE]--;
+               cache_lru.count[LRU_INACTIVE]++;
+       }
+}
+
+/*
+ * Re-queue an entry onto the correct LRU list, after it has scored a hit.
+ */
+static void __noinline
+cache_activate(struct namecache *ncp)
+{
+
+       mutex_enter(&cache_lru_lock);
+       /* Put on tail of ACTIVE queue, since it just scored a hit. */
+       TAILQ_REMOVE(&cache_lru.list[ncp->nc_lrulist], ncp, nc_lru);
+       TAILQ_INSERT_TAIL(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru);
+       cache_lru.count[ncp->nc_lrulist]--;
+       cache_lru.count[LRU_ACTIVE]++;
+       ncp->nc_lrulist = LRU_ACTIVE;
+       mutex_exit(&cache_lru_lock);
 }
 
 /*
@@ -324,7 +389,7 @@
        struct namecache *ncp;
        struct iovec iov;
 
-       KASSERT(rw_lock_held(VNODE_TO_VIMPL(dvp)->vi_nclock));
+       KASSERT(rw_lock_held(VNODE_TO_VIMPL(dvp)->vi_ncdlock));
 
        iov.iov_base = __UNCONST(name);
        iov.iov_len = namelen;
@@ -332,14 +397,9 @@
 
        if (ncp != NULL) {
                KASSERT(ncp->nc_dvp == dvp);
-               /*
-                * Avoid false sharing: don't write back to nc_hittime
-                * unless changed significantly.  This is an unlocked
-                * update and is racy, but it doesn't matter since it's
-                * only used for pseudo-LRU replacement.
-                */
-               if (((ncp->nc_hittime ^ hardclock_ticks) & ~31) != 0) {
-                       ncp->nc_hittime = hardclock_ticks;
+               /* If the entry is on the wrong LRU list, requeue it. */
+               if (__predict_false(ncp->nc_lrulist != LRU_ACTIVE)) {
+                       cache_activate(ncp);
                }
                SDT_PROBE(vfs, namecache, lookup, hit, dvp,
                    name, namelen, 0, 0);
@@ -407,7 +467,7 @@
 {
        struct namecache *ncp;
        struct vnode *vp;
-       krwlock_t *dirlock;
+       krwlock_t *dlock;
        int error;
        bool hit;
        krw_t op;
@@ -438,11 +498,16 @@
                op = RW_READER;
        }
 
-       dirlock = VNODE_TO_VIMPL(dvp)->vi_nclock;
-       rw_enter(dirlock, op);
+       /* If no directory lock yet, there's nothing in cache. */
+       dlock = VNODE_TO_VIMPL(dvp)->vi_ncdlock;
+       if (__predict_false(dlock == NULL)) {
+               return false;
+       }
+
+       rw_enter(dlock, op);


