Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src/lib/libc/string In wcscspn and wcspbrk, handle set size of 0...



details:   https://anonhg.NetBSD.org/src/rev/4a831f867cbc
branches:  trunk
changeset: 771545:4a831f867cbc
user:      joerg <joerg%NetBSD.org@localhost>
date:      Thu Nov 24 18:44:25 2011 +0000

description:
In wcscspn and wcspbrk, handle set size of 0 and 1 explicitly.
For larger sets, use a bloom filter to avoid the inner loop for most of
the input. The current implementation uses a simple modular hash as
first function (well suited for input e.g. in ISO Latin character sets)
and a more complex multiplicative hash as second function with a filter
size of 512 Bit. This reduces the typical run time to O(n+m).

diffstat:

 lib/libc/string/wcscspn.c       |  31 ++++++++++++--
 lib/libc/string/wcscspn_bloom.h |  84 +++++++++++++++++++++++++++++++++++++++++
 lib/libc/string/wcspbrk.c       |  27 ++++++++++--
 3 files changed, 132 insertions(+), 10 deletions(-)

diffs (210 lines):

diff -r ff7cfd2f2b68 -r 4a831f867cbc lib/libc/string/wcscspn.c
--- a/lib/libc/string/wcscspn.c Thu Nov 24 18:34:56 2011 +0000
+++ b/lib/libc/string/wcscspn.c Thu Nov 24 18:44:25 2011 +0000
@@ -1,7 +1,8 @@
-/*     $NetBSD: wcscspn.c,v 1.3 2011/11/21 15:02:48 joerg Exp $        */
+/*     $NetBSD: wcscspn.c,v 1.4 2011/11/24 18:44:25 joerg Exp $        */
 
 /*-
- * Copyright (c)1999 Citrus Project,
+ * Copyright (c) 1999 Citrus Project,
+ * Copyright (c) 2011 Joerg Sonnenberger,
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -29,25 +30,45 @@
  */
 
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: wcscspn.c,v 1.3 2011/11/21 15:02:48 joerg Exp $");
+__RCSID("$NetBSD: wcscspn.c,v 1.4 2011/11/24 18:44:25 joerg Exp $");
 
 #include <assert.h>
+#include <inttypes.h>
+#include <string.h>
 #include <wchar.h>
 
+#include "wcscspn_bloom.h"
+
 size_t
 wcscspn(const wchar_t *s, const wchar_t *set)
 {
+       size_t bloom[BLOOM_ARRAY_SIZE];
        const wchar_t *p;
        const wchar_t *q;
 
        _DIAGASSERT(s != NULL);
        _DIAGASSERT(set != NULL);
 
+       if (set[0] == '\0')
+               return wcslen(s);
+       if (set[1] == '\0') {
+               for (p = s; *p; ++p)
+                       if (*p == set[0])
+                               break;
+               return p - s;
+       }
+
+       wcsspn_bloom_init(bloom, set);
+
        for (p = s; *p; ++p) {
-               for (q = set; *q; ++q) {
+               if (!wcsspn_in_bloom(bloom, *p))
+                       continue;
+
+               q = set;
+               do {
                        if (*p == *q)
                                goto done;
-               }
+               } while (*++q);
        }
 
 done:
diff -r ff7cfd2f2b68 -r 4a831f867cbc lib/libc/string/wcscspn_bloom.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/libc/string/wcscspn_bloom.h   Thu Nov 24 18:44:25 2011 +0000
@@ -0,0 +1,84 @@
+/*     $NetBSD: wcscspn_bloom.h,v 1.1 2011/11/24 18:44:25 joerg Exp $  */
+
+/*-
+ * Copyright (c) 2011 Joerg Sonnenberger,
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Bloom filter for fast test if a given character is part of the reject set.
+ * The test may have false positives, but doesn't have false negatives.
+ * The first hash function is designed to be very fast to evaluate.
+ * It is collision free if the input is part of the same European language
+ * and shouldn't be too bad even other input.  The second hash function
+ * tries to provide a much better mixing, but involves the slower
+ * multiplication.
+ */
+
+#define        BLOOM_SIZE              64
+#define        BLOOM_ARRAY_SIZE        (BLOOM_SIZE / sizeof(size_t))
+#define        BLOOM_MASK              (BLOOM_SIZE * 8 - 1)
+#define        BLOOM_DIV               (sizeof(size_t) * 8)
+
+static inline size_t
+wcscspn_bloom1(size_t x)
+{
+       return x & BLOOM_MASK;
+}
+
+static inline size_t
+wcscspn_bloom2(size_t x)
+{
+       return (uint32_t)(x * 2654435761U) /
+           (0x100000000ULL / (BLOOM_MASK + 1));
+}
+
+static inline void
+wcsspn_bloom_init(size_t *bloom, const wchar_t *charset)
+{
+       size_t val;
+
+       memset(bloom, 0, BLOOM_SIZE);
+       do {
+               val = wcscspn_bloom1((size_t)*charset);
+               bloom[val / BLOOM_DIV] |= 1ULL << (val % BLOOM_DIV);
+               val = wcscspn_bloom2((size_t)*charset);
+               bloom[val / BLOOM_DIV] |= 1ULL << (val % BLOOM_DIV);
+       }
+       while (*++charset);
+}
+
+static inline int
+wcsspn_in_bloom(const size_t *bloom, wchar_t ch)
+{
+       size_t val;
+
+       val = wcscspn_bloom1((size_t)ch);
+       if (bloom[val / BLOOM_DIV] & (1ULL << (val % BLOOM_DIV)))
+               return 1;
+       val = wcscspn_bloom2((size_t)ch);
+       if (bloom[val / BLOOM_DIV] & (1ULL << (val % BLOOM_DIV)))
+               return 1;
+       return 0;
+}
diff -r ff7cfd2f2b68 -r 4a831f867cbc lib/libc/string/wcspbrk.c
--- a/lib/libc/string/wcspbrk.c Thu Nov 24 18:34:56 2011 +0000
+++ b/lib/libc/string/wcspbrk.c Thu Nov 24 18:44:25 2011 +0000
@@ -1,7 +1,8 @@
-/*     $NetBSD: wcspbrk.c,v 1.4 2011/11/21 15:02:48 joerg Exp $        */
+/*     $NetBSD: wcspbrk.c,v 1.5 2011/11/24 18:44:25 joerg Exp $        */
 
 /*-
- * Copyright (c)1999 Citrus Project,
+ * Copyright (c) 1999 Citrus Project,
+ * Copyright (c) 2011 Joerg Sonnenberger,
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -29,25 +30,41 @@
  */
 
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: wcspbrk.c,v 1.4 2011/11/21 15:02:48 joerg Exp $");
+__RCSID("$NetBSD: wcspbrk.c,v 1.5 2011/11/24 18:44:25 joerg Exp $");
 
 #include <assert.h>
+#include <inttypes.h>
+#include <string.h>
 #include <wchar.h>
 
+#include "wcscspn_bloom.h"
+
 wchar_t *
 wcspbrk(const wchar_t *s, const wchar_t *set)
 {
+       size_t bloom[BLOOM_ARRAY_SIZE];
        const wchar_t *p;
        const wchar_t *q;
 
        _DIAGASSERT(s != NULL);
        _DIAGASSERT(set != NULL);
 
+       if (set[0] == '\0')
+               return NULL;
+       if (set[1] == '\0')
+               return wcschr(s, set[0]);
+
+       wcsspn_bloom_init(bloom, set);
+
        for (p = s; *p; ++p) {
-               for (q = set; *q; ++q) {
+               if (!wcsspn_in_bloom(bloom, *p))
+                       continue;
+
+               q = set;
+               do {
                        if (*p == *q)
                                return __UNCONST(p);
-               }
+               } while (*++q);
        }
        return NULL;
 }



Home | Main Index | Thread Index | Old Index