[src-draft/trunk]: src/lib/libc/cdb Optimize CPU and memory use of cdbw(3)

To: source-changes-hg%NetBSD.org@localhost
Subject: [src-draft/trunk]: src/lib/libc/cdb Optimize CPU and memory use of cdbw(3)
From: Joerg Sonnenberger <joerg%bec.de@localhost>
Date: Wed, 06 Jan 2021 13:37:24 +0000
details:   https://anonhg.NetBSD.org/src-all/rev/7aa6ef2acb5f
branches:  trunk
changeset: 949258:7aa6ef2acb5f
user:      Joerg Sonnenberger <joerg%bec.de@localhost>
date:      Tue Jan 05 23:53:56 2021 +0100

description:
Optimize CPU and memory use of cdbw(3)

Reduce memory footprint and processing time by dropping the vertex parts
of the edges kept during the peeling. Hook up the
division-by-multiplication logic to help older platforms.

diffstat:

 lib/libc/cdb/cdbw.c |  244 +++++++++++++++++++++++++++++++++------------------
 1 files changed, 159 insertions(+), 85 deletions(-)

diffs (truncated from 357 to 300 lines):

diff -r 88478e8cd931 -r 7aa6ef2acb5f lib/libc/cdb/cdbw.c
--- a/lib/libc/cdb/cdbw.c       Mon Jan 04 03:00:46 2021 +0100
+++ b/lib/libc/cdb/cdbw.c       Tue Jan 05 23:53:56 2021 +0100
@@ -40,6 +40,7 @@
 
 #include "namespace.h"
 
+
 #if !HAVE_NBTOOL_CONFIG_H || HAVE_SYS_ENDIAN_H
 #include <sys/endian.h>
 #endif
@@ -49,6 +50,74 @@
 #include <string.h>
 #include <unistd.h>
 
+#if !HAVE_NBTOOL_CONFIG_H
+#include <sys/bitops.h>
+#else
+static inline int
+my_fls32(uint32_t n)
+{
+       int v;
+
+       if (!n)
+               return 0;
+
+       v = 32;
+       if ((n & 0xFFFF0000U) == 0) {
+               n <<= 16;
+               v -= 16;
+       }
+       if ((n & 0xFF000000U) == 0) {
+               n <<= 8;
+               v -= 8;
+       }
+       if ((n & 0xF0000000U) == 0) {
+               n <<= 4;
+               v -= 4;
+       }
+       if ((n & 0xC0000000U) == 0) {
+               n <<= 2;
+               v -= 2;
+       }
+       if ((n & 0x80000000U) == 0) {
+               n <<= 1;
+               v -= 1;
+       }
+       return v;
+}
+
+static inline void
+fast_divide32_prepare(uint32_t div, uint32_t * m,
+    uint8_t *s1, uint8_t *s2)
+{
+       uint64_t mt;
+       int l;
+
+       l = my_fls32(div - 1);
+       mt = (uint64_t)(0x100000000ULL * ((1ULL << l) - div));
+       *m = (uint32_t)(mt / div + 1);
+       *s1 = (l > 1) ? 1U : (uint8_t)l;
+       *s2 = (l == 0) ? 0 : (uint8_t)(l - 1);
+}
+
+static inline uint32_t
+fast_divide32(uint32_t v, uint32_t div, uint32_t m, uint8_t s1,
+    uint8_t s2)
+{
+       uint32_t t;
+
+       t = (uint32_t)(((uint64_t)v * m) >> 32);
+       return (t + ((v - t) >> s1)) >> s2;
+}
+
+static inline uint32_t
+fast_remainder32(uint32_t v, uint32_t div, uint32_t m, uint8_t s1,
+    uint8_t s2)
+{
+
+       return v - div * fast_divide32(v, div, m, s1, s2);
+}
+#endif
+
 #ifdef __weak_alias
 __weak_alias(cdbw_close,_cdbw_close)
 __weak_alias(cdbw_open,_cdbw_open)
@@ -279,30 +348,29 @@
 }
 
 /*
- * The algorithm below is based on paper
- * Cache-Oblivious Peeling of Random Hypergraphs by Djamal Belazzougui,
- * Paolo Boldi, Giuseppe Ottaviano, Rossano Venturini, and Sebastiano
- * Vigna.
- * http://zola.di.unipi.it/rossano/wp-content/papercite-data/pdf/dcc14.pdf
+ * For each vertex in the 3-graph, the incidence lists needs to be kept.
+ * Avoid storing the full list by just XORing the indices of the still
+ * incident edges and remember the number of such edges as that's all
+ * the peeling computation needs. This is inspired by:
+ *   Cache-Oblivious Peeling of Random Hypergraphs by Djamal Belazzougui,
+ *   Paolo Boldi, Giuseppe Ottaviano, Rossano Venturini, and Sebastiano
+ *   Vigna. https://arxiv.org/abs/1312.0526
+ *
+ * Unlike in the paper, we don't care about external storage and have
+ * the edge list at hand all the time. As such, no ordering is necessary
+ * and the vertices of the edge don't have to be copied.
+ *
+ * The core observation of the paper above is that for a degree of one,
+ * the incident edge can be obtained directly.
  */
-
-/*
- * Data type for a valid oriented edge (v0, v1, v2), v1 < v2.
- * The first vertex v0 is implicit and is determined by an index
- * of the corresponding element in the state->oedges array.
- * If the degree of v0 is greater than 1, other members don't
- * make sense because they're a result of XORing multiple values.
- */
-struct oedge {
-       uint32_t degree;   /* Degree of v0. */
-       uint32_t verts[2]; /* v1 and v2 */
-       uint32_t edge;
+struct vertex {
+       uint32_t degree;
+       uint32_t edges;
 };
 
 struct edge {
+       uint32_t vertices[3];
        uint32_t idx;
-
-       uint32_t left, middle, right;
 };
 
 struct state {
@@ -314,48 +382,40 @@
        uint32_t *g;
        char *visited;
 
-       struct oedge *oedges;
+       struct vertex *vertices;
        struct edge *edges;
        uint32_t output_index;
        uint32_t *output_order;
 };
 
 /*
- * Add (delta == 1) or remove (delta == -1) the edge e from vertex v0.
+ * Add (delta == 1) or remove (delta == -1) the edge e
+ * from the incidence lists.
  */
 static inline void
-add_remove_edge(struct oedge *o, int delta, uint32_t e,
-    uint32_t v0, uint32_t v1, uint32_t v2)
+change_edge(struct state *state, int delta, uint32_t e)
 {
+       int i;
+       struct vertex *v;
+       struct edge *e_ = &state->edges[e];
 
-       o[v0].verts[v1 < v2 ? 0 : 1] ^= v1;
-       o[v0].verts[v1 < v2 ? 1 : 0] ^= v2;
-       o[v0].degree += delta;
-       o[v0].edge ^= e;
+       for (i = 0; i < 3; ++i) {
+               v = &state->vertices[e_->vertices[i]];
+               v->edges ^= e;
+               v->degree += delta;
+       }
 }
 
 static inline void
-add_edge(struct oedge *o, uint32_t e,
-    uint32_t v0, uint32_t v1, uint32_t v2)
-{
-
-       add_remove_edge(o, 1, e, v0, v1, v2);
-}
-
-static inline void
-remove_vertex(struct state *state, uint32_t v0)
+remove_vertex(struct state *state, uint32_t v)
 {
-       uint32_t e, v1, v2;
-       struct oedge *o = state->oedges;
+       struct vertex *v_ = &state->vertices[v];
+       uint32_t e;
 
-       if (o[v0].degree == 1) {
-               e = o[v0].edge;
-               v1 = o[v0].verts[0];
-               v2 = o[v0].verts[1];
-               o[v0].degree = 0;
-               add_remove_edge(o, -1, e, v1, v0, v2);
-               add_remove_edge(o, -1, e, v2, v0, v1);
+       if (v_->degree == 1) {
+               e = v_->edges;
                state->output_order[--state->output_index] = e;
+               change_edge(state, -1, e);
        }
 }
 
@@ -365,39 +425,48 @@
        struct key_hash_head *head;
        struct key_hash *key_hash;
        struct edge *e;
+       uint32_t entries_m;
+       uint8_t entries_s1, entries_s2;
        uint32_t hashes[3];
        size_t i;
+       int j;
 
-       memset(state->oedges, 0, sizeof(struct oedge) * state->entries);
+       memset(state->vertices, 0, sizeof(*state->vertices) * state->entries);
 
        e = state->edges;
+       fast_divide32_prepare(state->entries, &entries_m, &entries_s1, &entries_s2);
+
        for (i = 0; i < cdbw->hash_size; ++i) {
                head = &cdbw->hash[i];
                SLIST_FOREACH(key_hash, head, link) {
-                       e->idx = key_hash->idx;
                        mi_vector_hash(key_hash->key, key_hash->keylen,
                            state->seed, hashes);
-                       e->left = hashes[0] % state->entries;
-                       e->middle = hashes[1] % state->entries;
-                       e->right = hashes[2] % state->entries;
 
-                       if (e->left == e->middle)
+                       for (j = 0; j < 3; ++j) {
+                               e->vertices[j] = fast_remainder32(hashes[j],
+                                   state->entries, entries_m, entries_s1,
+                                   entries_s2);
+                       }
+
+                       if (e->vertices[0] == e->vertices[1])
                                return -1;
-                       add_edge(state->oedges, e - state->edges,
-                           e->right, e->left, e->middle);
-                       if (e->left == e->right)
+                       if (e->vertices[0] == e->vertices[2])
                                return -1;
-                       add_edge(state->oedges, e - state->edges,
-                           e->middle, e->left, e->right);
-                       if (e->middle == e->right)
+                       if (e->vertices[1] == e->vertices[2])
                                return -1;
-                       add_edge(state->oedges, e - state->edges,
-                           e->left, e->middle, e->right);
-
+                       e->idx = key_hash->idx;
                        ++e;
                }
        }
 
+       /*
+        * Do the edge processing separately as there is a good chance for
+        * the degraded edge case above to happen, so avoid unnecessary
+        * work.
+        */
+       for (i = 0; i < state->keys; ++i)
+               change_edge(state, 1, i);
+
        state->output_index = state->keys;
        for (i = 0; i < state->entries; ++i)
                remove_vertex(state, i);
@@ -406,9 +475,8 @@
        while (i > 0 && i > state->output_index) {
                --i;
                e = state->edges + state->output_order[i];
-               remove_vertex(state, e->left);
-               remove_vertex(state, e->middle);
-               remove_vertex(state, e->right);
+               for (j = 0; j < 3; ++j)
+                       remove_vertex(state, e->vertices[j]);
        }
 
        return state->output_index == 0 ? 0 : -1;
@@ -420,28 +488,34 @@
        struct edge *e;
        size_t i;
 
+       uint32_t v0, v1, v2, entries_m;
+       uint8_t entries_s1, entries_s2;
+
+       fast_divide32_prepare(state->data_entries, &entries_m, &entries_s1, &entries_s2);
+
        for (i = 0; i < state->keys; ++i) {
                e = state->edges + state->output_order[i];
-
-               if (!state->visited[e->left]) {
-                       state->g[e->left] =
-                           (2 * state->data_entries + e->idx
-                           - state->g[e->middle] - state->g[e->right])
-                           % state->data_entries;
-               } else if (!state->visited[e->middle]) {
-                       state->g[e->middle] =
-                           (2 * state->data_entries + e->idx
-                           - state->g[e->left] - state->g[e->right])
-                           % state->data_entries;
References:
- [src-draft/trunk]: src/lib/libc/cdb Optimize CPU and memory use of cdbw(3)
  - From: Joerg Sonnenberger
Prev by Date: [src-draft/trunk]: src/lib/libc/cdb Optimize CPU and memory use of cdbw(3)
Next by Date: [src/trunk]: src/sys/arch/amiga/dev Implement the WSDISPLAYIO_GET_FBINFO ioct...
Previous by Thread: [src-draft/trunk]: src/lib/libc/cdb Optimize CPU and memory use of cdbw(3)
Next by Thread: [src/trunk]: src/sys/arch/amiga/dev Implement the WSDISPLAYIO_GET_FBINFO ioct...
Indexes:
Home | Main Index | Thread Index | Old Index