Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src/sys Import thmap -- a concurrent trie-hash map, combining th...



details:   https://anonhg.NetBSD.org/src/rev/5c1cdd3b64ad
branches:  trunk
changeset: 446741:5c1cdd3b64ad
user:      rmind <rmind%NetBSD.org@localhost>
date:      Sun Dec 16 14:06:56 2018 +0000

description:
Import thmap -- a concurrent trie-hash map, combining the elements of
hashing and radix trie.  It supports lock-free lookups and concurrent
inserts/deletes.  It is designed to be optimal as a general purpose
*concurrent* associative array.

Upstream: https://github.com/rmind/thmap
Discussed on tech-kern@

diffstat:

 sys/kern/files.kern                         |    3 +-
 sys/kern/subr_thmap.c                       |  934 ++++++++++++++++++++++++++++
 sys/rump/librump/rumpkern/Makefile.rumpkern |    3 +-
 sys/sys/thmap.h                             |   62 +
 4 files changed, 1000 insertions(+), 2 deletions(-)

diffs (truncated from 1038 to 300 lines):

diff -r 6db077adc6bd -r 5c1cdd3b64ad sys/kern/files.kern
--- a/sys/kern/files.kern       Sun Dec 16 14:04:14 2018 +0000
+++ b/sys/kern/files.kern       Sun Dec 16 14:06:56 2018 +0000
@@ -1,4 +1,4 @@
-#      $NetBSD: files.kern,v 1.27 2018/12/03 00:11:02 christos Exp $
+#      $NetBSD: files.kern,v 1.28 2018/12/16 14:06:56 rmind Exp $
 
 #
 # kernel sources
@@ -142,6 +142,7 @@
 file   kern/subr_specificdata.c        kern
 file   kern/subr_tftproot.c            tftproot
 file   kern/subr_time.c                kern
+file   kern/subr_thmap.c               kern
 file   kern/subr_userconf.c            userconf
 file   kern/subr_vmem.c                kern
 file   kern/subr_workqueue.c           kern
diff -r 6db077adc6bd -r 5c1cdd3b64ad sys/kern/subr_thmap.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/kern/subr_thmap.c     Sun Dec 16 14:06:56 2018 +0000
@@ -0,0 +1,934 @@
+/*-
+ * Copyright (c) 2018 Mindaugas Rasiukevicius <rmind at noxt eu>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Upstream: https://github.com/rmind/thmap/
+ */
+
+/*
+ * Concurrent trie-hash map.
+ *
+ * The data structure is conceptually a radix trie on hashed keys.
+ * Keys are hashed using a 32-bit function.  The root level is a special
+ * case: it is managed using the compare-and-swap (CAS) atomic operation
+ * and has a fanout of 64.  The subsequent levels are constructed using
+ * intermediate nodes with a fanout of 16 (using 4 bits).  As more levels
+ * are created, more blocks of the 32-bit hash value might be generated
+ * by incrementing the seed parameter of the hash function.
+ *
+ * Concurrency
+ *
+ * - READERS: Descending is simply walking through the slot values of
+ *   the intermediate nodes.  It is lock-free as there is no intermediate
+ *   state: the slot is either empty or has a pointer to the child node.
+ *   The main assumptions here are the following:
+ *
+ *   i) modifications must preserve consistency with the respect to the
+ *   readers i.e. the readers can only see the valid node values;
+ *
+ *   ii) any invalid view must "fail" the reads, e.g. by making them
+ *   re-try from the root; this is a case for deletions and is achieved
+ *   using the NODE_DELETED flag.
+ *
+ *   iii) the node destruction must be synchronised with the readers,
+ *   e.g. by using the Epoch-based reclamation or other techniques.
+ *
+ * - WRITERS AND LOCKING: Each intermediate node has a spin-lock (which
+ *   is implemented using the NODE_LOCKED bit) -- it provides mutual
+ *   exclusion amongst concurrent writers.  The lock order for the nodes
+ *   is "bottom-up" i.e. they are locked as we ascend the trie.  A key
+ *   constraint here is that parent pointer never changes.
+ *
+ * - DELETES: In addition to writer's locking, the deletion keeps the
+ *   intermediate nodes in a valid state and sets the NODE_DELETED flag,
+ *   to indicate that the readers must re-start the walk from the root.
+ *   As the levels are collapsed, NODE_DELETED gets propagated up-tree.
+ *   The leaf nodes just stay as-is until they are reclaimed.
+ *
+ * - ROOT LEVEL: The root level is a special case, as it is implemented
+ *   as an array (rather than intermediate node).  The root-level slot can
+ *   only be set using CAS and it can only be set to a valid intermediate
+ *   node.  The root-level slot can only be cleared when the node it points
+ *   at becomes empty, is locked and marked as NODE_DELETED (this causes
+ *   the insert/delete operations to re-try until the slot is set to NULL).
+ *
+ * References:
+ *
+ *     W. Litwin, 1981, Trie Hashing.
+ *     Proceedings of the 1981 ACM SIGMOD, p. 19-29
+ *     https://dl.acm.org/citation.cfm?id=582322
+ *
+ *     P. L. Lehman and S. B. Yao.
+ *     Efficient locking for concurrent operations on B-trees.
+ *     ACM TODS, 6(4):650-670, 1981
+ *     https://www.csd.uoc.gr/~hy460/pdf/p650-lehman.pdf
+ */
+
+#ifdef _KERNEL
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/thmap.h>
+#include <sys/kmem.h>
+#include <sys/lock.h>
+#include <sys/atomic.h>
+#include <sys/hash.h>
+#else
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <inttypes.h>
+#include <string.h>
+#include <limits.h>
+
+#include "thmap.h"
+#include "utils.h"
+#endif
+
+/*
+ * NetBSD kernel wrappers
+ */
+#ifdef _KERNEL
+#define        ASSERT KASSERT
+#define        DEBUG 1
+#define        atomic_thread_fence(x) x
+#define        memory_order_stores membar_producer()
+#define        memory_order_loads membar_consumer()
+#define        atomic_cas_32_p(p, e, n) (atomic_cas_32((p), (e), (n)) == (e))
+#define        atomic_cas_ptr_p(p, e, n) \
+    (atomic_cas_ptr((p), (void *)(e), (void *)(n)) == (e))
+#define        atomic_exchange atomic_swap_ptr
+#define        murmurhash3 murmurhash2
+#endif
+
+/*
+ * The root level fanout is 64 (indexed by the last 6 bits of the hash
+ * value XORed with the length).  Each subsequent level, represented by
+ * intermediate nodes, has a fanout of 16 (using 4 bits).
+ *
+ * The hash function produces 32-bit values.
+ */
+
+#define        HASHVAL_BITS    (32)
+#define        HASHVAL_MOD     (HASHVAL_BITS - 1)
+#define        HASHVAL_SHIFT   (5)
+
+#define        ROOT_BITS       (6)
+#define        ROOT_SIZE       (1 << ROOT_BITS)
+#define        ROOT_MASK       (ROOT_SIZE - 1)
+#define        ROOT_MSBITS     (HASHVAL_BITS - ROOT_BITS)
+
+#define        LEVEL_BITS      (4)
+#define        LEVEL_SIZE      (1 << LEVEL_BITS)
+#define        LEVEL_MASK      (LEVEL_SIZE - 1)
+
+/*
+ * Instead of raw pointers, we use offsets from the base address.
+ * This accommodates the use of this data structure in shared memory,
+ * where mappings can be in different address spaces.
+ *
+ * The pointers must be aligned, since pointer tagging is used to
+ * differentiate the intermediate nodes from leaves.  We reserve the
+ * least significant bit.
+ */
+typedef uintptr_t thmap_ptr_t;
+
+#define        THMAP_NULL              ((thmap_ptr_t)0)
+
+#define        THMAP_LEAF_BIT          (0x1)
+
+#define        THMAP_ALIGNED_P(p)      (((uintptr_t)(p) & 3) == 0)
+#define        THMAP_ALIGN(p)          ((uintptr_t)(p) & ~(uintptr_t)3)
+#define        THMAP_INODE_P(p)        (((uintptr_t)(p) & THMAP_LEAF_BIT) == 0)
+
+#define        THMAP_GETPTR(th, p)     ((void *)((th)->baseptr + (uintptr_t)(p)))
+#define        THMAP_GETOFF(th, p)     ((thmap_ptr_t)((uintptr_t)(p) - (th)->baseptr))
+#define        THMAP_NODE(th, p)       THMAP_GETPTR(th, THMAP_ALIGN(p))
+
+/*
+ * State field.
+ */
+
+#define        NODE_LOCKED             (1U << 31)              // lock (writers)
+#define        NODE_DELETED            (1U << 30)              // node deleted
+#define        NODE_COUNT(s)           ((s) & 0x3fffffff)      // slot count mask
+
+/*
+ * There are two types of nodes:
+ * - Intermediate nodes -- arrays pointing to another level or a leaf;
+ * - Leaves, which store a key-value pair.
+ */
+
+typedef struct {
+       uint32_t        state;
+       thmap_ptr_t     parent;
+       thmap_ptr_t     slots[LEVEL_SIZE];
+} thmap_inode_t;
+
+#define        THMAP_INODE_LEN sizeof(thmap_inode_t)
+
+typedef struct {
+       thmap_ptr_t     key;
+       size_t          len;
+       void *          val;
+} thmap_leaf_t;
+
+typedef struct {
+       unsigned        rslot;          // root-level slot index
+       unsigned        level;          // current level in the tree
+       unsigned        hashidx;        // current hash index (block of bits)
+       uint32_t        hashval;        // current hash value
+} thmap_query_t;
+
+typedef struct {
+       uintptr_t       addr;
+       size_t          len;
+       void *          next;
+} thmap_gc_t;
+
+#define        THMAP_ROOT_LEN  (sizeof(thmap_ptr_t) * ROOT_SIZE)
+
+struct thmap {
+       uintptr_t       baseptr;
+       thmap_ptr_t *   root;
+       unsigned        flags;
+       const thmap_ops_t *ops;
+       thmap_gc_t *    gc_list;
+};
+
+static void    stage_mem_gc(thmap_t *, uintptr_t, size_t);
+
+/*
+ * A few low-level helper routines.
+ */
+
+static uintptr_t
+alloc_wrapper(size_t len)
+{
+       return (uintptr_t)kmem_intr_alloc(len, KM_SLEEP);
+}
+
+static void
+free_wrapper(uintptr_t addr, size_t len)
+{
+       kmem_intr_free((void *)addr, len);
+}
+
+static const thmap_ops_t thmap_default_ops = {
+       .alloc = alloc_wrapper,
+       .free = free_wrapper
+};
+
+/*
+ * NODE LOCKING.
+ */
+
+#ifdef DEBUG
+static inline bool
+node_locked_p(const thmap_inode_t *node)
+{
+       return (node->state & NODE_LOCKED) != 0;
+}
+#endif
+
+static void
+lock_node(thmap_inode_t *node)
+{
+       unsigned bcount = SPINLOCK_BACKOFF_MIN;
+       uint32_t s;
+again:
+       s = node->state;
+       if (s & NODE_LOCKED) {
+               SPINLOCK_BACKOFF(bcount);
+               goto again;
+       }
+       /*
+        * CAS will issue a full memory fence for us.
+        *
+        * WARNING: for optimisations purposes, callers rely on us
+        * issuing load and store fence
+        */
+       if (!atomic_cas_32_p(&node->state, s, s | NODE_LOCKED)) {
+               bcount = SPINLOCK_BACKOFF_MIN;
+               goto again;
+       }
+}
+
+static void



Home | Main Index | Thread Index | Old Index