Source-Changes-HG archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[src/trunk]: src/common/lib/libc/gen Delete the counter from "struct radix_tr...
details: https://anonhg.NetBSD.org/src/rev/fd364e0cdf8a
branches: trunk
changeset: 846966:fd364e0cdf8a
user: ad <ad%NetBSD.org@localhost>
date: Thu Dec 05 18:50:41 2019 +0000
description:
Delete the counter from "struct radix_tree_node", and in the one place we
need a non-zero check, substitute with a deterministic bitwise OR of all
values in the node. The structure then becomes cache line aligned.
For each node we now need only touch 2 cache lines instead of 3, which makes
all the operations faster (measured), amortises the cost of not having a
counter, and will avoid intra-pool-page false sharing on MP.
diffstat:
common/lib/libc/gen/radixtree.c | 101 +++++++++++++++++++++++++++------------
1 files changed, 69 insertions(+), 32 deletions(-)
diffs (225 lines):
diff -r 177716449d1a -r fd364e0cdf8a common/lib/libc/gen/radixtree.c
--- a/common/lib/libc/gen/radixtree.c Thu Dec 05 18:32:25 2019 +0000
+++ b/common/lib/libc/gen/radixtree.c Thu Dec 05 18:50:41 2019 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: radixtree.c,v 1.18 2019/12/05 18:32:25 ad Exp $ */
+/* $NetBSD: radixtree.c,v 1.19 2019/12/05 18:50:41 ad Exp $ */
/*-
* Copyright (c)2011,2012,2013 YAMAMOTO Takashi,
@@ -112,7 +112,7 @@
#include <sys/cdefs.h>
#if defined(_KERNEL) || defined(_STANDALONE)
-__KERNEL_RCSID(0, "$NetBSD: radixtree.c,v 1.18 2019/12/05 18:32:25 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: radixtree.c,v 1.19 2019/12/05 18:50:41 ad Exp $");
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/pool.h>
@@ -122,7 +122,7 @@
#include <lib/libsa/stand.h>
#endif /* defined(_STANDALONE) */
#else /* defined(_KERNEL) || defined(_STANDALONE) */
-__RCSID("$NetBSD: radixtree.c,v 1.18 2019/12/05 18:32:25 ad Exp $");
+__RCSID("$NetBSD: radixtree.c,v 1.19 2019/12/05 18:50:41 ad Exp $");
#include <assert.h>
#include <errno.h>
#include <stdbool.h>
@@ -185,11 +185,16 @@
* radix_tree_node: an intermediate node
*
* we don't care the type of leaf nodes. they are just void *.
+ *
+ * we used to maintain a count of non-NULL nodes in this structure, but it
+ * prevented it from being aligned to a cache line boundary; the performance
+ * benefit from being cache friendly is greater than the benefit of having
+ * a dedicated count value, especially in multi-processor situations where
+ * we need to avoid intra-pool-page false sharing.
*/
struct radix_tree_node {
void *n_ptrs[RADIX_TREE_PTR_PER_NODE];
- unsigned int n_nptrs; /* # of non-NULL pointers in n_ptrs */
};
/*
@@ -340,8 +345,8 @@
{
radix_tree_node_cache = pool_cache_init(sizeof(struct radix_tree_node),
- 0, 0, 0, "radix_tree_node", NULL, IPL_NONE, radix_tree_node_ctor,
- NULL, NULL);
+ coherency_unit, 0, 0, "radixnode", NULL, IPL_NONE,
+ radix_tree_node_ctor, NULL, NULL);
KASSERT(radix_tree_node_cache != NULL);
}
#endif /* defined(_KERNEL) */
@@ -349,17 +354,57 @@
static bool __unused
radix_tree_node_clean_p(const struct radix_tree_node *n)
{
+#if RADIX_TREE_PTR_PER_NODE > 16
unsigned int i;
- if (n->n_nptrs != 0) {
- return false;
- }
for (i = 0; i < RADIX_TREE_PTR_PER_NODE; i++) {
if (n->n_ptrs[i] != NULL) {
return false;
}
}
return true;
+#else /* RADIX_TREE_PTR_PER_NODE > 16 */
+ uintptr_t sum;
+
+ /*
+ * Unrolling the above is much better than a tight loop with two
+ * test+branch pairs. On x86 with gcc 5.5.0 this compiles into 19
+ * deterministic instructions including the "return" and prologue &
+ * epilogue.
+ */
+ sum = (uintptr_t)n->n_ptrs[0];
+ sum |= (uintptr_t)n->n_ptrs[1];
+ sum |= (uintptr_t)n->n_ptrs[2];
+ sum |= (uintptr_t)n->n_ptrs[3];
+#if RADIX_TREE_PTR_PER_NODE > 4
+ sum |= (uintptr_t)n->n_ptrs[4];
+ sum |= (uintptr_t)n->n_ptrs[5];
+ sum |= (uintptr_t)n->n_ptrs[6];
+ sum |= (uintptr_t)n->n_ptrs[7];
+#endif
+#if RADIX_TREE_PTR_PER_NODE > 8
+ sum |= (uintptr_t)n->n_ptrs[8];
+ sum |= (uintptr_t)n->n_ptrs[9];
+ sum |= (uintptr_t)n->n_ptrs[10];
+ sum |= (uintptr_t)n->n_ptrs[11];
+ sum |= (uintptr_t)n->n_ptrs[12];
+ sum |= (uintptr_t)n->n_ptrs[13];
+ sum |= (uintptr_t)n->n_ptrs[14];
+ sum |= (uintptr_t)n->n_ptrs[15];
+#endif
+ return sum == 0;
+#endif /* RADIX_TREE_PTR_PER_NODE > 16 */
+}
+
+static int __unused
+radix_tree_node_count_ptrs(const struct radix_tree_node *n)
+{
+ unsigned int i, c;
+
+ for (i = c = 0; i < RADIX_TREE_PTR_PER_NODE; i++) {
+ c += (n->n_ptrs[i] != NULL);
+ }
+ return c;
}
static struct radix_tree_node *
@@ -421,7 +466,6 @@
*/
return ENOMEM;
}
- n->n_nptrs = 1;
n->n_ptrs[0] = t->t_root;
t->t_root = entry_compose(n, tagmask);
t->t_height++;
@@ -516,10 +560,6 @@
return NULL;
}
*vpp = c;
- if (n != NULL) {
- KASSERT(n->n_nptrs < RADIX_TREE_PTR_PER_NODE);
- n->n_nptrs++;
- }
}
n = c;
vpp = &n->n_ptrs[i];
@@ -531,10 +571,6 @@
}
if (alloc) {
KASSERT(*vpp == NULL);
- if (n != NULL) {
- KASSERT(n->n_nptrs < RADIX_TREE_PTR_PER_NODE);
- n->n_nptrs++;
- }
}
if (path != NULL) {
path->p_lastidx = refs - path->p_refs;
@@ -634,9 +670,7 @@
entry = *pptr;
n = entry_ptr(entry);
KASSERT(n != NULL);
- KASSERT(n->n_nptrs > 0);
- n->n_nptrs--;
- if (n->n_nptrs > 0) {
+ if (!radix_tree_node_clean_p(n)) {
break;
}
radix_tree_free_node(n);
@@ -663,7 +697,7 @@
entry = *pptr;
n = entry_ptr(entry);
KASSERT(n != NULL);
- KASSERT(n->n_nptrs > 0);
+ KASSERT(!radix_tree_node_clean_p(n));
newmask = any_children_tagmask(n);
if (newmask == entry_tagmask(entry)) {
break;
@@ -702,10 +736,7 @@
{
void **vpp __unused;
-#if defined(DIAGNOSTIC)
- vpp =
-#endif /* defined(DIAGNOSTIC) */
- radix_tree_lookup_ptr(t, idx, path, false, tagmask);
+ vpp = radix_tree_lookup_ptr(t, idx, path, false, tagmask);
KASSERT(vpp == NULL ||
vpp == path_pptr(t, path, path->p_lastidx));
KASSERT(&t->t_root == path_pptr(t, path, 0));
@@ -795,7 +826,15 @@
break;
}
n = path_node(t, path, lastidx - 1);
- if (*vpp != NULL && n->n_nptrs == 1) {
+ /*
+ * we used to have an integer counter in the node, and this
+ * optimization made sense then, even though marginal. it
+ * no longer provides benefit with the structure cache line
+ * aligned and the counter replaced by an unrolled sequence
+ * testing the pointers in batch.
+ */
+#if 0
+ if (*vpp != NULL && radix_tree_node_count_ptrs(n) == 1) {
/*
* optimization; if the node has only a single pointer
* and we've already visited it, there's no point to
@@ -803,6 +842,7 @@
*/
goto no_siblings;
}
+#endif /* 0 */
for (i = vpp - n->n_ptrs + step; i != guard; i += step) {
KASSERT(i < RADIX_TREE_PTR_PER_NODE);
if (entry_match_p(n->n_ptrs[i], tagmask)) {
@@ -986,10 +1026,7 @@
int i;
KASSERT(tagmask != 0);
-#if defined(DIAGNOSTIC)
- vpp =
-#endif /* defined(DIAGNOSTIC) */
- radix_tree_lookup_ptr(t, idx, &path, false, 0);
+ vpp = radix_tree_lookup_ptr(t, idx, &path, false, 0);
KASSERT(vpp != NULL);
KASSERT(*vpp != NULL);
KASSERT(path.p_lastidx == t->t_height);
@@ -1087,7 +1124,7 @@
}
n = entry_ptr(vp);
assert(any_children_tagmask(n) == entry_tagmask(vp));
- printf(" (%u children)\n", n->n_nptrs);
+ printf(" (%u children)\n", radix_tree_node_count_ptrs(n));
for (i = 0; i < __arraycount(n->n_ptrs); i++) {
void *c;
Home |
Main Index |
Thread Index |
Old Index