Source-Changes-HG archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[src-draft/trunk]: src/lib/libc/cdb Optimize CPU and memory use of cdbw(3)
details: https://anonhg.NetBSD.org/src-all/rev/d2f0ca3ae8d2
branches: trunk
changeset: 949257:d2f0ca3ae8d2
user: Joerg Sonnenberger <joerg%bec.de@localhost>
date: Tue Jan 05 23:53:56 2021 +0100
description:
Optimize CPU and memory use of cdbw(3)
Reduce memory footprint and processing time by dropping the vertex parts
of the edges kept during the peeling. Hook up the
division-by-multiplication logic to help older platforms.
diffstat:
lib/libc/cdb/cdbw.c | 244 +++++++++++++++++++++++++++++++++------------------
1 files changed, 159 insertions(+), 85 deletions(-)
diffs (truncated from 357 to 300 lines):
diff -r 88478e8cd931 -r d2f0ca3ae8d2 lib/libc/cdb/cdbw.c
--- a/lib/libc/cdb/cdbw.c Mon Jan 04 03:00:46 2021 +0100
+++ b/lib/libc/cdb/cdbw.c Tue Jan 05 23:53:56 2021 +0100
@@ -40,6 +40,7 @@
#include "namespace.h"
+
#if !HAVE_NBTOOL_CONFIG_H || HAVE_SYS_ENDIAN_H
#include <sys/endian.h>
#endif
@@ -49,6 +50,74 @@
#include <string.h>
#include <unistd.h>
+#if !HAVE_NBTOOL_CONFIG_H
+#include <sys/bitops.h>
+#else
+static inline int
+my_fls32(uint32_t n)
+{
+ int v;
+
+ if (!n)
+ return 0;
+
+ v = 32;
+ if ((n & 0xFFFF0000U) == 0) {
+ n <<= 16;
+ v -= 16;
+ }
+ if ((n & 0xFF000000U) == 0) {
+ n <<= 8;
+ v -= 8;
+ }
+ if ((n & 0xF0000000U) == 0) {
+ n <<= 4;
+ v -= 4;
+ }
+ if ((n & 0xC0000000U) == 0) {
+ n <<= 2;
+ v -= 2;
+ }
+ if ((n & 0x80000000U) == 0) {
+ n <<= 1;
+ v -= 1;
+ }
+ return v;
+}
+
+static inline void
+fast_divide32_prepare(uint32_t div, uint32_t * m,
+ uint8_t *s1, uint8_t *s2)
+{
+ uint64_t mt;
+ int l;
+
+ l = my_fls32(div - 1);
+ mt = (uint64_t)(0x100000000ULL * ((1ULL << l) - div));
+ *m = (uint32_t)(mt / div + 1);
+ *s1 = (l > 1) ? 1U : (uint8_t)l;
+ *s2 = (l == 0) ? 0 : (uint8_t)(l - 1);
+}
+
+static inline uint32_t
+fast_divide32(uint32_t v, uint32_t div, uint32_t m, uint8_t s1,
+ uint8_t s2)
+{
+ uint32_t t;
+
+ t = (uint32_t)(((uint64_t)v * m) >> 32);
+ return (t + ((v - t) >> s1)) >> s2;
+}
+
+static inline uint32_t
+fast_remainder32(uint32_t v, uint32_t div, uint32_t m, uint8_t s1,
+ uint8_t s2)
+{
+
+ return v - div * fast_divide32(v, div, m, s1, s2);
+}
+#endif
+
#ifdef __weak_alias
__weak_alias(cdbw_close,_cdbw_close)
__weak_alias(cdbw_open,_cdbw_open)
@@ -279,30 +348,29 @@
}
/*
- * The algorithm below is based on paper
- * Cache-Oblivious Peeling of Random Hypergraphs by Djamal Belazzougui,
- * Paolo Boldi, Giuseppe Ottaviano, Rossano Venturini, and Sebastiano
- * Vigna.
- * http://zola.di.unipi.it/rossano/wp-content/papercite-data/pdf/dcc14.pdf
+ * For each vertex in the 3-graph, the incidence lists need to kept.
+ * Avoid storing the full list by just XORing the indices of the still
+ * incident edges and the number of such edges as that's all the peeling
+ * computation needs. This is inspired by:
+ * Cache-Oblivious Peeling of Random Hypergraphs by Djamal Belazzougui,
+ * Paolo Boldi, Giuseppe Ottaviano, Rossano Venturini, and Sebastiano
+ * Vigna. https://arxiv.org/abs/1312.0526
+ *
+ * Unlike in the paper, we don't care about external storage and have
+ * the edge list at hand all the time. As such, no ordering is necessary
+ * and the vertices of the edge don't have to be copied.
+ *
+ * The core observation of the paper above is that for a degree of one,
+ * the incident edge can be obtained directly.
*/
-
-/*
- * Data type for a valid oriented edge (v0, v1, v2), v1 < v2.
- * The first vertex v0 is implicit and is determined by an index
- * of the corresponding element in the state->oedges array.
- * If the degree of v0 is greater than 1, other members don't
- * make sense because they're a result of XORing multiple values.
- */
-struct oedge {
- uint32_t degree; /* Degree of v0. */
- uint32_t verts[2]; /* v1 and v2 */
- uint32_t edge;
+struct vertex {
+ uint32_t degree;
+ uint32_t edges;
};
struct edge {
+ uint32_t vertices[3];
uint32_t idx;
-
- uint32_t left, middle, right;
};
struct state {
@@ -314,48 +382,40 @@
uint32_t *g;
char *visited;
- struct oedge *oedges;
+ struct vertex *vertices;
struct edge *edges;
uint32_t output_index;
uint32_t *output_order;
};
/*
- * Add (delta == 1) or remove (delta == -1) the edge e from vertex v0.
+ * Add (delta == 1) or remove (delta == -1) the edge e
+ * from the incidence lists.
*/
static inline void
-add_remove_edge(struct oedge *o, int delta, uint32_t e,
- uint32_t v0, uint32_t v1, uint32_t v2)
+change_edge(struct state *state, int delta, uint32_t e)
{
+ int i;
+ struct vertex *v;
+ struct edge *e_ = &state->edges[e];
- o[v0].verts[v1 < v2 ? 0 : 1] ^= v1;
- o[v0].verts[v1 < v2 ? 1 : 0] ^= v2;
- o[v0].degree += delta;
- o[v0].edge ^= e;
+ for (i = 0; i < 3; ++i) {
+ v = &state->vertices[e_->vertices[i]];
+ v->edges ^= e;
+ v->degree += delta;
+ }
}
static inline void
-add_edge(struct oedge *o, uint32_t e,
- uint32_t v0, uint32_t v1, uint32_t v2)
-{
-
- add_remove_edge(o, 1, e, v0, v1, v2);
-}
-
-static inline void
-remove_vertex(struct state *state, uint32_t v0)
+remove_vertex(struct state *state, uint32_t v)
{
- uint32_t e, v1, v2;
- struct oedge *o = state->oedges;
+ struct vertex *v_ = &state->vertices[v];
+ uint32_t e;
- if (o[v0].degree == 1) {
- e = o[v0].edge;
- v1 = o[v0].verts[0];
- v2 = o[v0].verts[1];
- o[v0].degree = 0;
- add_remove_edge(o, -1, e, v1, v0, v2);
- add_remove_edge(o, -1, e, v2, v0, v1);
+ if (v_->degree == 1) {
+ e = v_->edges;
state->output_order[--state->output_index] = e;
+ change_edge(state, -1, e);
}
}
@@ -365,39 +425,48 @@
struct key_hash_head *head;
struct key_hash *key_hash;
struct edge *e;
+ uint32_t entries_m;
+ uint8_t entries_s1, entries_s2;
uint32_t hashes[3];
size_t i;
+ int j;
- memset(state->oedges, 0, sizeof(struct oedge) * state->entries);
+ memset(state->vertices, 0, sizeof(*state->vertices) * state->entries);
e = state->edges;
+ fast_divide32_prepare(state->entries, &entries_m, &entries_s1, &entries_s2);
+
for (i = 0; i < cdbw->hash_size; ++i) {
head = &cdbw->hash[i];
SLIST_FOREACH(key_hash, head, link) {
- e->idx = key_hash->idx;
mi_vector_hash(key_hash->key, key_hash->keylen,
state->seed, hashes);
- e->left = hashes[0] % state->entries;
- e->middle = hashes[1] % state->entries;
- e->right = hashes[2] % state->entries;
- if (e->left == e->middle)
+ for (j = 0; j < 3; ++j) {
+ e->vertices[j] = fast_remainder32(hashes[j],
+ state->entries, entries_m, entries_s1,
+ entries_s2);
+ }
+
+ if (e->vertices[0] == e->vertices[1])
return -1;
- add_edge(state->oedges, e - state->edges,
- e->right, e->left, e->middle);
- if (e->left == e->right)
+ if (e->vertices[0] == e->vertices[2])
return -1;
- add_edge(state->oedges, e - state->edges,
- e->middle, e->left, e->right);
- if (e->middle == e->right)
+ if (e->vertices[1] == e->vertices[2])
return -1;
- add_edge(state->oedges, e - state->edges,
- e->left, e->middle, e->right);
-
+ e->idx = key_hash->idx;
++e;
}
}
+ /*
+ * Do the edge processing separately as there is a good chance for
+ * the degraded edge case above to happen, so avoid unnecessary
+ * work.
+ */
+ for (i = 0; i < state->keys; ++i)
+ change_edge(state, 1, i);
+
state->output_index = state->keys;
for (i = 0; i < state->entries; ++i)
remove_vertex(state, i);
@@ -406,9 +475,8 @@
while (i > 0 && i > state->output_index) {
--i;
e = state->edges + state->output_order[i];
- remove_vertex(state, e->left);
- remove_vertex(state, e->middle);
- remove_vertex(state, e->right);
+ for (j = 0; j < 3; ++j)
+ remove_vertex(state, e->vertices[j]);
}
return state->output_index == 0 ? 0 : -1;
@@ -420,28 +488,34 @@
struct edge *e;
size_t i;
+ uint32_t v0, v1, v2, entries_m;
+ uint8_t entries_s1, entries_s2;
+
+ fast_divide32_prepare(state->data_entries, &entries_m, &entries_s1, &entries_s2);
+
for (i = 0; i < state->keys; ++i) {
e = state->edges + state->output_order[i];
-
- if (!state->visited[e->left]) {
- state->g[e->left] =
- (2 * state->data_entries + e->idx
- - state->g[e->middle] - state->g[e->right])
- % state->data_entries;
- } else if (!state->visited[e->middle]) {
- state->g[e->middle] =
- (2 * state->data_entries + e->idx
- - state->g[e->left] - state->g[e->right])
- % state->data_entries;
Home |
Main Index |
Thread Index |
Old Index