tech-userlevel archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

cdbw optimisation in libc



Hi,

I recently came across an interesting article [1] which introduces
a simpler algorithm for chm3 hash generation. The key idea is that
incidence lists can be eliminated by using a clever XOR trick.

The patch of the new implementation is attached. It's faster than
the original algorithm but for some reason I only get about 20%
speedup on a Haswell notebook and 10% more by using fast_remainder32(3).

However, the new algorithm uses significantly less memory. In the
current implementation, edges, verts and output_order arrays take
16n 32-bit words (where n is a number of keys). I believe that there
is a typo in the edges array allocation (fixed in the attached patch)
and it allocates too much memory. The fix reduces memory size to
13.75n in the current implementation. My change brings it down to 9n.

Ok to commit?

[1] Cache-Oblivious Peeling of Random Hypergraphs by Djamal Belazzougui,
Paolo Boldi, Giuseppe Ottaviano, Rossano Venturini, and Sebastiano Vigna.

Alex
Index: cdb/cdbw.c
===================================================================
RCS file: /cvsroot/src/lib/libc/cdb/cdbw.c,v
retrieving revision 1.5
diff -p -u -u -r1.5 cdbw.c
--- cdb/cdbw.c	21 Jul 2012 22:49:37 -0000	1.5
+++ cdb/cdbw.c	2 Jul 2015 19:29:20 -0000
@@ -1,10 +1,10 @@
 /*	$NetBSD: cdbw.c,v 1.5 2012/07/21 22:49:37 joerg Exp $	*/
 /*-
- * Copyright (c) 2009, 2010 The NetBSD Foundation, Inc.
+ * Copyright (c) 2009, 2010, 2015 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
- * by Joerg Sonnenberger.
+ * by Joerg Sonnenberger and Alexander Nasonov.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -43,6 +43,7 @@ __RCSID("$NetBSD: cdbw.c,v 1.5 2012/07/2
 #if !HAVE_NBTOOL_CONFIG_H || HAVE_SYS_ENDIAN_H
 #include <sys/endian.h>
 #endif
+#include <sys/bitops.h>
 #include <sys/queue.h>
 #include <cdbw.h>
 #include <stdlib.h>
@@ -278,18 +279,31 @@ cdbw_stable_seeder(void)
 	return 0;
 }
 
-#define unused 0xffffffffU
+/*
+ * The algorithm below is based on paper
+ * Cache-Oblivious Peeling of Random Hypergraphs by Djamal Belazzougui,
+ * Paolo Boldi, Giuseppe Ottaviano, Rossano Venturini, and Sebastiano
+ * Vigna.
+ * http://zola.di.unipi.it/rossano/wp-content/papercite-data/pdf/dcc14.pdf
+ */
 
-struct vertex {
-	uint32_t l_edge, m_edge, r_edge;
+/*
+ * Data type for a valid oriented edge (v0, v1, v2), v1 < v2.
+ * The first vertex v0 is implicit and is determined by an index
+ * of the corresponding element in the state->oedges array.
+ * If the degree of v0 is greater than 1, other members don't make
+ * sense because they're a result of XORing multiple random values.
+ */
+struct oedge {
+	uint32_t degree;   /* Degree of v0. */
+	uint32_t verts[2]; /* v1 and v2 */
+	uint32_t edge;
 };
 
 struct edge {
 	uint32_t idx;
 
 	uint32_t left, middle, right;
-	uint32_t l_prev, m_prev, l_next;
-	uint32_t r_prev, m_next, r_next;
 };
 
 struct state {
@@ -301,69 +315,49 @@ struct state {
 	uint32_t *g;
 	char *visited;
 
-	struct vertex *verts;
+	struct oedge *oedges;
 	struct edge *edges;
 	uint32_t output_index;
 	uint32_t *output_order;
 };
 
-static void
-remove_vertex(struct state *state, struct vertex *v)
+/*
+ * Add (delta == 1) or remove (delta == -1) the edge e from vertex v0.
+ */
+static inline void
+add_remove_edge(struct oedge *o, int delta, uint32_t e,
+    uint32_t v0, uint32_t v1, uint32_t v2)
 {
-	struct edge *e;
-	struct vertex *vl, *vm, *vr;
 
-	if (v->l_edge != unused && v->m_edge != unused)
-		return;
-	if (v->l_edge != unused && v->r_edge != unused)
-		return;
-	if (v->m_edge != unused && v->r_edge != unused)
-		return;
-	if (v->l_edge == unused && v->m_edge == unused && v->r_edge == unused)
-		return;
-
-	if (v->l_edge != unused) {
-		e = &state->edges[v->l_edge];
-		if (e->l_next != unused)
-			return;
-	} else if (v->m_edge != unused) {
-		e = &state->edges[v->m_edge];
-		if (e->m_next != unused)
-			return;
-	} else {
-		if (v->r_edge == unused)
-			abort();
-		e = &state->edges[v->r_edge];
-		if (e->r_next != unused)
-			return;
-	}
-
-	state->output_order[--state->output_index] = e - state->edges;
-
-	vl = &state->verts[e->left];
-	vm = &state->verts[e->middle];
-	vr = &state->verts[e->right];
+	o[v0].verts[v1 < v2 ? 0 : 1] ^= v1;
+	o[v0].verts[v1 < v2 ? 1 : 0] ^= v2;
+	o[v0].degree += delta;
+	o[v0].edge ^= e;
+}
 
-	if (e->l_prev == unused)
-		vl->l_edge = e->l_next;
-	else
-		state->edges[e->l_prev].l_next = e->l_next;
-	if (e->l_next != unused)
-		state->edges[e->l_next].l_prev = e->l_prev;
+static inline void
+add_edge(struct oedge *o, uint32_t e,
+    uint32_t v0, uint32_t v1, uint32_t v2)
+{
 
-	if (e->m_prev == unused)
-		vm->m_edge = e->m_next;
-	else
-		state->edges[e->m_prev].m_next = e->m_next;
-	if (e->m_next != unused)
-		state->edges[e->m_next].m_prev = e->m_prev;
+	add_remove_edge(o, 1, e, v0, v1, v2);
+}
 
-	if (e->r_prev == unused)
-		vr->r_edge = e->r_next;
-	else
-		state->edges[e->r_prev].r_next = e->r_next;
-	if (e->r_next != unused)
-		state->edges[e->r_next].r_prev = e->r_prev;
+static inline void
+remove_vertex(struct state *state, uint32_t v0)
+{
+	uint32_t e, v1, v2;
+	struct oedge *o = state->oedges;
+
+	if (o[v0].degree == 1) {
+		e = o[v0].edge;
+		v1 = o[v0].verts[0];
+		v2 = o[v0].verts[1];
+		o[v0].degree = 0;
+		add_remove_edge(o, -1, e, v1, v0, v2);
+		add_remove_edge(o, -1, e, v2, v0, v1);
+		state->output_order[--state->output_index] = e;
+	}
 }
 
 static int
@@ -371,11 +365,16 @@ build_graph(struct cdbw *cdbw, struct st
 {
 	struct key_hash_head *head;
 	struct key_hash *key_hash;
-	struct vertex *v;
 	struct edge *e;
 	uint32_t hashes[3];
+	uint32_t remm; /* Magic values for a fast remainder. */
+	uint8_t rems1, rems2;
 	size_t i;
 
+	memset(state->oedges, 0, sizeof(struct oedge) * state->entries);
+
+	fast_divide32_prepare(state->entries, &remm, &rems1, &rems2);
+
 	e = state->edges;
 	for (i = 0; i < cdbw->hash_size; ++i) {
 		head = &cdbw->hash[i];
@@ -383,63 +382,41 @@ build_graph(struct cdbw *cdbw, struct st
 			e->idx = key_hash->idx;
 			mi_vector_hash(key_hash->key, key_hash->keylen,
 			    state->seed, hashes);
-			e->left = hashes[0] % state->entries;
-			e->middle = hashes[1] % state->entries;
-			e->right = hashes[2] % state->entries;
+			e->left = fast_remainder32(hashes[0],
+			    state->entries, remm, rems1, rems2);
+			e->middle = fast_remainder32(hashes[1],
+			    state->entries, remm, rems1, rems2);
+			e->right = fast_remainder32(hashes[2],
+			    state->entries, remm, rems1, rems2);
 
 			if (e->left == e->middle)
 				return -1;
+			add_edge(state->oedges, e - state->edges,
+			    e->right, e->left, e->middle);
 			if (e->left == e->right)
 				return -1;
+			add_edge(state->oedges, e - state->edges,
+			    e->middle, e->left, e->right);
 			if (e->middle == e->right)
 				return -1;
+			add_edge(state->oedges, e - state->edges,
+			    e->left, e->middle, e->right);
 
 			++e;
 		}
 	}
 
-	for (i = 0; i < state->entries; ++i) {
-		v = state->verts + i;
-		v->l_edge = unused;
-		v->m_edge = unused;
-		v->r_edge = unused;
-	}
-
-	for (i = 0; i < state->keys; ++i) {
-		e = state->edges + i;
-		v = state->verts + e->left;
-		if (v->l_edge != unused)
-			state->edges[v->l_edge].l_prev = i;
-		e->l_next = v->l_edge;
-		e->l_prev = unused;
-		v->l_edge = i;
-
-		v = &state->verts[e->middle];
-		if (v->m_edge != unused)
-			state->edges[v->m_edge].m_prev = i;
-		e->m_next = v->m_edge;
-		e->m_prev = unused;
-		v->m_edge = i;
-
-		v = &state->verts[e->right];
-		if (v->r_edge != unused)
-			state->edges[v->r_edge].r_prev = i;
-		e->r_next = v->r_edge;
-		e->r_prev = unused;
-		v->r_edge = i;
-	}
-
 	state->output_index = state->keys;
 	for (i = 0; i < state->entries; ++i)
-		remove_vertex(state, state->verts + i);
+		remove_vertex(state, i);
 
 	i = state->keys;
 	while (i > 0 && i > state->output_index) {
 		--i;
 		e = state->edges + state->output_order[i];
-		remove_vertex(state, state->verts + e->left);
-		remove_vertex(state, state->verts + e->middle);
-		remove_vertex(state, state->verts + e->right);
+		remove_vertex(state, e->left);
+		remove_vertex(state, e->middle);
+		remove_vertex(state, e->right);
 	}
 
 	return state->output_index == 0 ? 0 : -1;
@@ -450,25 +427,29 @@ assign_nodes(struct state *state)
 {
 	struct edge *e;
 	size_t i;
+	uint32_t remm; /* Magic values for a fast remainder. */
+	uint8_t rems1, rems2;
+
+	fast_divide32_prepare(state->data_entries, &remm, &rems1, &rems2);
 
 	for (i = 0; i < state->keys; ++i) {
 		e = state->edges + state->output_order[i];
 
 		if (!state->visited[e->left]) {
-			state->g[e->left] =
+			state->g[e->left] = fast_remainder32(
 			    (2 * state->data_entries + e->idx
-			    - state->g[e->middle] - state->g[e->right])
-			    % state->data_entries;
+			    - state->g[e->middle] - state->g[e->right]),
+			    state->data_entries, remm, rems1, rems2);
 		} else if (!state->visited[e->middle]) {
-			state->g[e->middle] =
+			state->g[e->middle] = fast_remainder32(
 			    (2 * state->data_entries + e->idx
-			    - state->g[e->left] - state->g[e->right])
-			    % state->data_entries;
+			    - state->g[e->left] - state->g[e->right]),
+			    state->data_entries, remm, rems1, rems2);
 		} else {
-			state->g[e->right] =
+			state->g[e->right] = fast_remainder32(
 			    (2 * state->data_entries + e->idx
-			    - state->g[e->left] - state->g[e->middle])
-			    % state->data_entries;
+			    - state->g[e->left] - state->g[e->middle]),
+			    state->data_entries, remm, rems1, rems2);
 		}
 		state->visited[e->left] = 1;
 		state->visited[e->middle] = 1;
@@ -590,12 +571,12 @@ cdbw_output(struct cdbw *cdbw, int fd, c
 #define	NALLOC(var, n)	var = calloc(sizeof(*var), n)
 	NALLOC(state.g, state.entries);
 	NALLOC(state.visited, state.entries);
-	NALLOC(state.verts, state.entries);
-	NALLOC(state.edges, state.entries);
+	NALLOC(state.oedges, state.entries);
+	NALLOC(state.edges, state.keys);
 	NALLOC(state.output_order, state.keys);
 #undef NALLOC
 
-	if (state.g == NULL || state.visited == NULL || state.verts == NULL ||
+	if (state.g == NULL || state.visited == NULL || state.oedges == NULL ||
 	    state.edges == NULL || state.output_order == NULL) {
 		rv = -1;
 		goto release;
@@ -615,7 +596,7 @@ cdbw_output(struct cdbw *cdbw, int fd, c
 release:
 	free(state.g);
 	free(state.visited);
-	free(state.verts);
+	free(state.oedges);
 	free(state.edges);
 	free(state.output_order);
 


Home | Main Index | Thread Index | Old Index