Hi all, attached is an implement of a constant database based on a minimal perfect hash table. This is a very compact read-only database. Two special design constraint is that more than one key can exist for a value and that the key itself is not stored by the database. Most users already do that one way or the other in the data, so it is just redundant and with a proper content format not even cheaper. A patch to modify services_mkdb for using the new format is attached as well to demonstrate the size difference. For this use case, the database size drops from 2.1MB (disk space, 4.2MB file size) to 320KB (disk space, 306KB file size). Creation time drops from 1.9s to 0.23s on AMD64. I would like to integrate this into libc for the various purposes at hand. Joerg
Attachment:
cdb.tar.gz
Description: application/tar-gz
Index: Makefile =================================================================== RCS file: /home/joerg/repo/netbsd/src/usr.sbin/services_mkdb/Makefile,v retrieving revision 1.6 diff -u -p -r1.6 Makefile --- Makefile 22 Apr 2009 15:23:08 -0000 1.6 +++ Makefile 3 Mar 2010 23:55:00 -0000 @@ -4,7 +4,7 @@ PROG= services_mkdb MAN= services_mkdb.8 SRCS= services_mkdb.c uniq.c -LDADD+= -lutil +LDADD+= -lutil -lcdbw DPADD+= ${LIBUTIL} #COPTS+=-g Index: services_mkdb.c =================================================================== RCS file: /home/joerg/repo/netbsd/src/usr.sbin/services_mkdb/services_mkdb.c,v retrieving revision 1.14 diff -u -p -r1.14 services_mkdb.c --- services_mkdb.c 28 Apr 2008 20:24:17 -0000 1.14 +++ services_mkdb.c 3 Mar 2010 23:54:31 -0000 @@ -37,7 +37,7 @@ __RCSID("$NetBSD: services_mkdb.c,v 1.14 #include <sys/param.h> #include <assert.h> -#include <db.h> +#include <cdbw.h> #include <err.h> #include <fcntl.h> #include <netdb.h> @@ -57,31 +57,21 @@ static char tname[MAXPATHLEN]; extern void uniq(const char *); -static void add(DB *, StringList *, size_t, const char *, size_t *, int); +static void add(StringList *, size_t, const char *, size_t *, int); static StringList ***parseservices(const char *, StringList *); static void cleanup(void); -static void store(DB *, DBT *, DBT *, int); -static void killproto(DBT *); static char *getstring(const char *, size_t, char **, const char *); static size_t getprotoindex(StringList *, const char *); static const char *getprotostr(StringList *, size_t); static const char *mkaliases(StringList *, char *, size_t); static void usage(void) __dead; -const HASHINFO hinfo = { - .bsize = 256, - .ffactor = 4, - .nelem = 32768, - .cachesize = 1024, - .hash = NULL, - .lorder = 0 -}; - +struct cdbw *cdbw; int main(int argc, char *argv[]) { - DB *db; + FILE *output; int ch; const char *fname = _PATH_SERVICES; const char *dbname = _PATH_SERVICES_DB; @@ -129,11 +119,13 @@ main(int argc, char *argv[]) err(1, "Cannot install exit handler"); (void)snprintf(tname, sizeof(tname), "%s.tmp", dbname); - db = dbopen(tname, O_RDWR | O_CREAT | O_EXCL, - (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH), DB_HASH, &hinfo); - if (!db) + output = fopen(tname, "w"); + if (output == NULL) err(1, "Error opening temporary database `%s'", tname); + cdbw = cdbw_open(); + if (cdbw == NULL) + err(1, "Error opening cdb writer"); for (port = 0; port < PMASK + 1; port++) { if (svc[port] == NULL) @@ -143,7 +135,7 @@ main(int argc, char *argv[]) StringList *s; if ((s = svc[port][proto]) == NULL) continue; - add(db, s, port, getprotostr(sl, proto), &cnt, warndup); + add(s, port, getprotostr(sl, proto), &cnt, warndup); } free(svc[port]); @@ -152,7 +144,13 @@ main(int argc, char *argv[]) free(svc); sl_free(sl, 1); - if ((db->close)(db)) + if (cdbw_output(cdbw, output, NULL)) { + err(1, "Error computing temporary database `%s'", tname); + } + + cdbw_close(cdbw); + + if (fclose(output)) err(1, "Error closing temporary database `%s'", tname); if (rename(tname, dbname) == -1) @@ -162,14 +160,14 @@ main(int argc, char *argv[]) } static void -add(DB *db, StringList *sl, size_t port, const char *proto, size_t *cnt, +add(StringList *sl, size_t port, const char *proto, size_t *cnt, int warndup) { + char abuf[BUFSIZ]; + char *data, *key; + size_t datalen, keylen; + uint32_t idx; size_t i; - char keyb[BUFSIZ], datab[BUFSIZ], abuf[BUFSIZ]; - DBT data, key; - key.data = keyb; - data.data = datab; #ifdef DEBUG (void)printf("add %s %zu %s [ ", sl->sl_str[0], port, proto); @@ -178,32 +176,31 @@ add(DB *db, StringList *sl, size_t port, (void)printf("]\n"); #endif - /* key `indirect key', data `full line' */ - data.size = snprintf(datab, sizeof(datab), "%zu", (*cnt)++) + 1; - key.size = snprintf(keyb, sizeof(keyb), "%s %zu/%s %s", - sl->sl_str[0], port, proto, mkaliases(sl, abuf, sizeof(abuf))) + 1; - store(db, &data, &key, warndup); - - /* key `\377port/proto', data = `indirect key' */ - key.size = snprintf(keyb, sizeof(keyb), "\377%zu/%s", - port, proto) + 1; - store(db, &key, &data, warndup); - - /* key `\377port', data = `indirect key' */ - killproto(&key); - store(db, &key, &data, warndup); + datalen = asprintf(&data, "%s %zu/%s %s", + sl->sl_str[0], port, proto, mkaliases(sl, abuf, sizeof(abuf))); + + if (cdbw_put_data(cdbw, data, datalen, &idx)) + err(1, "cdbw_put_data failed"); + + keylen = asprintf(&key, "\377%zu/%s", port, proto); + if (cdbw_put_key(cdbw, key, keylen, idx) && warndup) + warnx("Duplicate key: `%s'", key); + + keylen = asprintf(&key, "\377%zu", port); + if (cdbw_put_key(cdbw, key, keylen, idx) && warndup) + warnx("Duplicate key: `%s'", key); /* add references for service and all aliases */ for (i = 0; i < sl->sl_cur; i++) { - /* key `\376service/proto', data = `indirect key' */ - key.size = snprintf(keyb, sizeof(keyb), "\376%s/%s", - sl->sl_str[i], proto) + 1; - store(db, &key, &data, warndup); - - /* key `\376service', data = `indirect key' */ - killproto(&key); - store(db, &key, &data, warndup); + keylen = asprintf(&key, "\376%s/%s", sl->sl_str[i], proto); + if (cdbw_put_key(cdbw, key, keylen, idx) && warndup) + warnx("Duplicate key: `%s'", key); + + keylen = asprintf(&key, "\376%s", sl->sl_str[i]); + if (cdbw_put_key(cdbw, key, keylen, idx) && warndup) + warnx("Duplicate key: `%s'", key); } + sl_free(sl, 1); } @@ -318,44 +315,6 @@ getstring(const char *fname, size_t line return str; } -static void -killproto(DBT *key) -{ - char *p, *d = key->data; - - if ((p = strchr(d, '/')) == NULL) - abort(); - *p++ = '\0'; - key->size = p - d; -} - -static void -store(DB *db, DBT *key, DBT *data, int warndup) -{ -#ifdef DEBUG - int k = key->size - 1; - int d = data->size - 1; - (void)printf("store [%*.*s] [%*.*s]\n", - k, k, (char *)key->data + 1, - d, d, (char *)data->data + 1); -#endif - switch ((db->put)(db, key, data, R_NOOVERWRITE)) { - case 0: - break; - case 1: - if (warndup) - warnx("duplicate service `%s'", - &((char *)key->data)[1]); - break; - case -1: - err(1, "put"); - break; - default: - abort(); - break; - } -} - static size_t getprotoindex(StringList *sl, const char *str) { Index: uniq.c =================================================================== RCS file: /home/joerg/repo/netbsd/src/usr.sbin/services_mkdb/uniq.c,v retrieving revision 1.4 diff -u -p -r1.4 uniq.c --- uniq.c 28 Apr 2008 20:24:17 -0000 1.4 +++ uniq.c 3 Mar 2010 23:55:21 -0000 @@ -40,7 +40,14 @@ __RCSID("$NetBSD: uniq.c,v 1.4 2008/04/2 #include <ctype.h> #include <fcntl.h> -extern const HASHINFO hinfo; +static const HASHINFO hinfo = { + .bsize = 256, + .ffactor = 4, + .nelem = 32768, + .cachesize = 1024, + .hash = NULL, + .lorder = 0 +}; void uniq(const char *); static int comp(const char *, char **, size_t *);