Source-Changes-HG archive


[src/trunk]: src/sys Merge vfs_cache.c from the ad-namecache branch. With th...



details:   https://anonhg.NetBSD.org/src/rev/edfa53aabdae
branches:  trunk
changeset: 746118:edfa53aabdae
user:      ad <ad%NetBSD.org@localhost>
date:      Sun Mar 22 14:38:37 2020 +0000

description:
Merge vfs_cache.c from the ad-namecache branch.  With this the namecache
index becomes per-directory (initially, a red-black tree).  The remaining
changes on the branch to namei()/getcwd() will be merged in the future.
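
For a feel of the new index's shape: the comparator prototypes in the
diff below, cache_compare_key() and cache_compare_nodes(), have the
signatures of red-black tree ops in the style of NetBSD's rbtree(3).
Here is a minimal, hedged userland sketch of a per-directory name
index built on that API.  struct dirnode and the dn_* names are
invented for illustration, and where the real vfs_cache.c keys its
tree by a hash of the name, this sketch keys directly on the name
string:

    #include <sys/rbtree.h>     /* rbtree(3); part of libc on NetBSD */
    #include <stddef.h>
    #include <stdio.h>
    #include <string.h>

    /* One cached name in a directory's private index (illustrative). */
    struct dirnode {
        rb_node_t   dn_rbnode;  /* embedded red-black tree linkage */
        const char *dn_name;    /* the cached component name */
    };

    /* Order two resident nodes by name. */
    static int
    dn_compare_nodes(void *ctx, const void *a, const void *b)
    {
        const struct dirnode *da = a, *db = b;

        return strcmp(da->dn_name, db->dn_name);
    }

    /* Order a resident node against a bare lookup key (a name). */
    static int
    dn_compare_key(void *ctx, const void *n, const void *key)
    {
        const struct dirnode *dn = n;

        return strcmp(dn->dn_name, key);
    }

    static const rb_tree_ops_t dn_ops = {
        .rbto_compare_nodes = dn_compare_nodes,
        .rbto_compare_key = dn_compare_key,
        .rbto_node_offset = offsetof(struct dirnode, dn_rbnode),
        .rbto_context = NULL,
    };

    int
    main(void)
    {
        rb_tree_t dir;          /* one such tree per directory vnode */
        struct dirnode a = { .dn_name = "etc" };
        struct dirnode b = { .dn_name = "usr" };

        rb_tree_init(&dir, &dn_ops);
        rb_tree_insert_node(&dir, &a);
        rb_tree_insert_node(&dir, &b);

        /* Lookup touches only this directory's tree: no global table. */
        struct dirnode *hit = rb_tree_find_node(&dir, "etc");
        printf("%s\n", hit != NULL ? hit->dn_name : "(miss)");
        return 0;
    }

Because each directory owns its own tree (and locks), lookups in
different directories never contend on one global table, which is the
concurrency headache the new comment block below alludes to.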

diffstat:

 sys/kern/init_sysctl.c |     5 +-
 sys/kern/vfs_cache.c   |  1828 +++++++++++++++++++++++++----------------------
 sys/kern/vfs_getcwd.c  |     8 +-
 sys/kern/vfs_vnode.c   |    21 +-
 sys/sys/namei.src      |    70 +-
 sys/sys/vnode_impl.h   |    33 +-
 6 files changed, 1062 insertions(+), 903 deletions(-)

diffs (truncated from 2521 to 300 lines):

diff -r 23a267c50fba -r edfa53aabdae sys/kern/init_sysctl.c
--- a/sys/kern/init_sysctl.c    Sun Mar 22 14:27:33 2020 +0000
+++ b/sys/kern/init_sysctl.c    Sun Mar 22 14:38:37 2020 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: init_sysctl.c,v 1.224 2020/01/18 14:40:03 skrll Exp $ */
+/*     $NetBSD: init_sysctl.c,v 1.225 2020/03/22 14:38:37 ad Exp $ */
 
 /*-
  * Copyright (c) 2003, 2007, 2008, 2009 The NetBSD Foundation, Inc.
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: init_sysctl.c,v 1.224 2020/01/18 14:40:03 skrll Exp $");
+__KERNEL_RCSID(0, "$NetBSD: init_sysctl.c,v 1.225 2020/03/22 14:38:37 ad Exp $");
 
 #include "opt_sysv.h"
 #include "opt_compat_netbsd.h"
@@ -732,7 +732,6 @@
                return (error);
        }
        vfs_reinit();
-       nchreinit();
 
        return (0);
 }
diff -r 23a267c50fba -r edfa53aabdae sys/kern/vfs_cache.c
--- a/sys/kern/vfs_cache.c      Sun Mar 22 14:27:33 2020 +0000
+++ b/sys/kern/vfs_cache.c      Sun Mar 22 14:38:37 2020 +0000
@@ -1,9 +1,12 @@
-/*     $NetBSD: vfs_cache.c,v 1.127 2020/01/08 12:04:56 ad Exp $       */
+/*     $NetBSD: vfs_cache.c,v 1.128 2020/03/22 14:38:37 ad Exp $       */
 
 /*-
- * Copyright (c) 2008 The NetBSD Foundation, Inc.
+ * Copyright (c) 2008, 2019, 2020 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Andrew Doran.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -57,8 +60,119 @@
  *     @(#)vfs_cache.c 8.3 (Berkeley) 8/22/94
  */
 
+/*
+ * Name caching:
+ *
+ *     Names found by directory scans are retained in a cache for future
+ *     reference.  It is managed LRU, so frequently used names will hang
+ *     around.  The cache is indexed by hash value obtained from the name.
+ *
+ *     The name cache is the brainchild of Robert Elz and was introduced in
+ *     4.3BSD.  See "Using gprof to Tune the 4.2BSD Kernel", Marshall Kirk
+ *     McKusick, May 21 1984.
+ *
+ * Data structures:
+ *
+ *     Most Unix namecaches very sensibly use a global hash table to index
+ *     names.  The global hash table works well, but can cause concurrency
+ *     headaches for the kernel hacker.  In the NetBSD 10.0 implementation
+ *     we are not sensible, and use a per-directory data structure to index
+ *     names, but the cache otherwise functions the same.
+ *
+ *     The index is a red-black tree.  There are no special concurrency
+ *     requirements placed on it, because it's per-directory and protected
+ *     by the namecache's per-directory locks.  It should therefore not be
+ *     difficult to experiment with other types of index.
+ *
+ *     Each cached name is stored in a struct namecache, along with a
+ *     pointer to the associated vnode (nc_vp).  Names longer than a
+ *     maximum length of NCHNAMLEN are allocated with kmem_alloc(); they
+ *     occur infrequently, and names shorter than this are stored directly
+ *     in struct namecache.  If it is a "negative" entry (i.e. for a name
+ *     that is known NOT to exist), the vnode pointer will be NULL.
+ *
+ *     For a directory with 3 cached names for 3 distinct vnodes, the
+ *     various vnodes and namecache structs would be connected like this
+ *     (the root is at the bottom of the diagram):
+ *
+ *          ...
+ *           ^
+ *           |- vi_nc_tree
+ *           |                                                           
+ *      +----o----+               +---------+               +---------+
+ *      |  VDIR   |               |  VCHR   |               |  VREG   |
+ *      |  vnode  o-----+         |  vnode  o-----+         |  vnode  o------+
+ *      +---------+     |         +---------+     |         +---------+      |
+ *           ^          |              ^          |              ^           |
+ *           |- nc_vp   |- vi_nc_list  |- nc_vp   |- vi_nc_list  |- nc_vp    |
+ *           |          |              |          |              |           |
+ *      +----o----+     |         +----o----+     |         +----o----+      |
+ *  +---onamecache|<----+     +---onamecache|<----+     +---onamecache|<-----+
+ *  |   +---------+           |   +---------+           |   +---------+
+ *  |        ^                |        ^                |        ^
+ *  |        |                |        |                |        |
+ *  |        |  +----------------------+                |        |
+ *  |-nc_dvp | +-------------------------------------------------+
+ *  |        |/- vi_nc_tree   |                         |
+ *  |        |                |- nc_dvp                 |- nc_dvp
+ *  |   +----o----+           |                         |
+ *  +-->|  VDIR   |<----------+                         |
+ *      |  vnode  |<------------------------------------+
+ *      +---------+
+ *
+ *      START HERE
+ *
+ * Replacement:
+ *
+ *     As the cache becomes full, old and unused entries are purged as new
+ *     entries are added.  The synchronization overhead in maintaining a
+ *     strict ordering would be prohibitive, so the VM system's "clock" or
+ *     "second chance" page replacement algorithm is aped here.  New
+ *     entries go to the tail of the active list.  After they age out and
+ *     reach the head of the list, they are moved to the tail of the
+ *     inactive list.  Any use of the deactivated cache entry reactivates
+ *     it, saving it from impending doom; if not reactivated, the entry
+ *     eventually reaches the head of the inactive list and is purged.
+ *
+ * Concurrency:
+ *
+ *     From a performance perspective, cache_lookup(nameiop == LOOKUP) is
+ *     what really matters; insertion of new entries with cache_enter() is
+ *     comparatively infrequent, and overshadowed by the cost of expensive
+ *     file system metadata operations (which may involve disk I/O).  We
+ *     therefore want to keep everything as simple as possible in the
+ *     lookup path.
+ *
+ *     struct namecache is mostly stable except for list and tree related
+ *     entries, changes to which don't affect the cached name or vnode. 
+ *     For changes to name+vnode, entries are purged in preference to
+ *     modifying them.
+ *
+ *     Read access to namecache entries is made via tree, list, or LRU
+ *     list.  A lock corresponding to the direction of access should be
+ *     held.  See definition of "struct namecache" in src/sys/namei.src,
+ *     and the definition of "struct vnode" for the particulars.
+ *
+ *     Per-CPU statistics and LRU list totals are read unlocked, since
+ *     an approximate value is OK.  We maintain 32-bit sized per-CPU
+ *     counters and 64-bit global counters under the theory that 32-bit
+ *     sized counters are less likely to be hosed by nonatomic increment
+ *     (on 32-bit platforms).
+ *
+ *     The lock order is:
+ *
+ *     1) vi->vi_nc_lock       (tree or parent -> child direction,
+ *                              used during forward lookup)
+ *
+ *     2) vi->vi_nc_listlock   (list or child -> parent direction,
+ *                              used during reverse lookup)
+ *
+ *     3) cache_lru_lock       (LRU list direction, used during reclaim)
+ *
+ *     4) vp->v_interlock      (what the cache entry points to)
+ */
+
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.127 2020/01/08 12:04:56 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.128 2020/03/22 14:38:37 ad Exp $");
 
 #define __NAMECACHE_PRIVATE
 #ifdef _KERNEL_OPT
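
The "Replacement" section of the comment just added describes a
two-list clock ("second chance") scheme.  Below is a small sketch of
that discipline using the same TAILQ machinery as the cache_lru
structure that appears later in this diff.  struct entry and the lru_*
helpers are invented for illustration; the real code does all of this
under cache_lru_lock:

    #include <sys/queue.h>
    #include <stddef.h>
    #include <stdio.h>

    enum lru_id { LRU_ACTIVE, LRU_INACTIVE, LRU_COUNT };

    struct entry {
        TAILQ_ENTRY(entry) e_lru;   /* linkage on an LRU list */
        enum lru_id        e_list;  /* which list we are on */
        const char        *e_name;
    };

    static struct {
        TAILQ_HEAD(, entry) list[LRU_COUNT];
        unsigned            count[LRU_COUNT];
    } lru;

    /* New (and re-touched) entries go to the tail of the active list. */
    static void
    lru_insert(struct entry *e)
    {
        e->e_list = LRU_ACTIVE;
        TAILQ_INSERT_TAIL(&lru.list[LRU_ACTIVE], e, e_lru);
        lru.count[LRU_ACTIVE]++;
    }

    /* A cache hit re-queues the entry, rescuing it from reclaim. */
    static void
    lru_touch(struct entry *e)
    {
        TAILQ_REMOVE(&lru.list[e->e_list], e, e_lru);
        lru.count[e->e_list]--;
        lru_insert(e);
    }

    /* Ageing: head of the active list moves to the inactive tail. */
    static void
    lru_deactivate(void)
    {
        struct entry *e = TAILQ_FIRST(&lru.list[LRU_ACTIVE]);

        if (e == NULL)
            return;
        TAILQ_REMOVE(&lru.list[LRU_ACTIVE], e, e_lru);
        lru.count[LRU_ACTIVE]--;
        e->e_list = LRU_INACTIVE;
        TAILQ_INSERT_TAIL(&lru.list[LRU_INACTIVE], e, e_lru);
        lru.count[LRU_INACTIVE]++;
    }

    /* Reclaim: purge from the head of the inactive list (oldest). */
    static struct entry *
    lru_reclaim(void)
    {
        struct entry *e = TAILQ_FIRST(&lru.list[LRU_INACTIVE]);

        if (e != NULL) {
            TAILQ_REMOVE(&lru.list[LRU_INACTIVE], e, e_lru);
            lru.count[LRU_INACTIVE]--;
        }
        return e;   /* caller frees; NULL if nothing is purgeable */
    }

    int
    main(void)
    {
        struct entry a = { .e_name = "bin" }, b = { .e_name = "tmp" };

        TAILQ_INIT(&lru.list[LRU_ACTIVE]);
        TAILQ_INIT(&lru.list[LRU_INACTIVE]);
        lru_insert(&a);
        lru_insert(&b);
        lru_deactivate();   /* "bin" ages onto the inactive list */
        lru_touch(&a);      /* a hit rescues it back to active */
        printf("reclaim: %s\n",
            lru_reclaim() != NULL ? "purged one" : "nothing");
        return 0;
    }

A touched entry is simply re-queued at the active tail, so an entry is
only purged after surviving a full trip down both lists untouched.
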
@@ -66,16 +180,18 @@
 #include "opt_dtrace.h"
 #endif
 
-#include <sys/param.h>
+#include <sys/types.h>
 #include <sys/atomic.h>
+#include <sys/callout.h>
 #include <sys/cpu.h>
 #include <sys/errno.h>
 #include <sys/evcnt.h>
+#include <sys/hash.h>
 #include <sys/kernel.h>
-#include <sys/kthread.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
+#include <sys/param.h>
 #include <sys/pool.h>
 #include <sys/sdt.h>
 #include <sys/sysctl.h>
@@ -83,244 +199,77 @@
 #include <sys/time.h>
 #include <sys/vnode_impl.h>
 
-/*
- * Name caching works as follows:
- *
- * Names found by directory scans are retained in a cache
- * for future reference.  It is managed LRU, so frequently
- * used names will hang around.  Cache is indexed by hash value
- * obtained from (dvp, name) where dvp refers to the directory
- * containing name.
- *
- * Upon reaching the last segment of a path, if the reference
- * is for DELETE, or NOCACHE is set (rewrite), and the
- * name is located in the cache, it will be dropped.
- */
+#include <miscfs/genfs/genfs.h>
+
+static void    cache_activate(struct namecache *);
+static void    cache_update_stats(void *);
+static int     cache_compare_key(void *, const void *, const void *);
+static int     cache_compare_nodes(void *, const void *, const void *);
+static void    cache_deactivate(void);
+static void    cache_reclaim(void);
+static int     cache_stat_sysctl(SYSCTLFN_ARGS);
+
+/* Global pool cache. */
+static pool_cache_t cache_pool __read_mostly;
+
+/* LRU replacement. */
+enum cache_lru_id {
+       LRU_ACTIVE,
+       LRU_INACTIVE,
+       LRU_COUNT
+};
+
+static struct {
+       TAILQ_HEAD(, namecache) list[LRU_COUNT];
+       u_int                   count[LRU_COUNT];
+} cache_lru __cacheline_aligned;
 
-/*
- * Cache entry lifetime:
- *
- *     nonexistent
- *     ---create---> active
- *     ---invalidate---> queued
- *     ---reclaim---> nonexistent.
- *
- * States:
- * - Nonexistent.  Cache entry does not exist.
- *
- * - Active.  cache_lookup, cache_lookup_raw, cache_revlookup can look
- *   up, acquire references, and hand off references to vnodes,
- *   e.g. via v_interlock.  Marked by nonnull ncp->nc_dvp.
- *
- *   Queued.  Pending destruction by cache_reclaim.  Cannot be used by
- *   cache_lookup, cache_lookup_raw, or cache_revlookup.  May still be
- *   on lists.  Marked by null ncp->nc_dvp.
- *
- * Transitions:
- *
- * - Create: nonexistent--->active
- *
- *   Done by cache_enter(dvp, vp, name, namelen, cnflags), called by
- *   VOP_LOOKUP after the answer is found.  Allocates a struct
- *   namecache object, initializes it with the above fields, and
- *   activates it by inserting it into the forward and reverse tables.
- *
- * - Invalidate: active--->queued
- *
- *   Done by cache_invalidate.  If not already invalidated, nullify
- *   ncp->nc_dvp and add to cache_gcqueue.  Called,
- *   among various other places, in cache_lookup(dvp, name, namelen,
- *   nameiop, cnflags, &iswht, &vp) when MAKEENTRY is missing from
- *   cnflags.
- *
- * - Reclaim: queued--->nonexistent
- *
- *   Done by cache_reclaim.  Disassociate ncp from any lists it is on
- *   and free memory.
- */
+static kmutex_t cache_lru_lock __cacheline_aligned;
+
+/* Cache effectiveness statistics.  nchstats holds system-wide total. */
+struct nchstats        nchstats;
+struct nchstats_percpu _NAMEI_CACHE_STATS(uint32_t);
+struct nchcpu {
+       struct nchstats_percpu cur;
+       struct nchstats_percpu last;
+};
+static callout_t cache_stat_callout;
+static kmutex_t cache_stat_lock __cacheline_aligned;
+
+#define        COUNT(f)        do { \
+       lwp_t *l = curlwp; \
+       KPREEMPT_DISABLE(l); \
+       ((struct nchstats_percpu *)curcpu()->ci_data.cpu_nch)->f++; \
+       KPREEMPT_ENABLE(l); \
+} while (/* CONSTCOND */ 0);
+
+#define        UPDATE(nchcpu, f) do { \
+       uint32_t cur = atomic_load_relaxed(&nchcpu->cur.f); \
+       nchstats.f += cur - nchcpu->last.f; \
+       nchcpu->last.f = cur; \
+} while (/* CONSTCOND */ 0)
 
 /*
- * Locking.
- *
- * L namecache_lock            Global lock for namecache table and queues.
- * C struct nchcpu::cpu_lock   Per-CPU lock to reduce read contention.
- * N struct namecache::nc_lock Per-entry lock.


