Source-Changes-HG archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[src/trunk]: src/lib/libc/string In wcscspn and wcspbrk, handle set size of 0...
details: https://anonhg.NetBSD.org/src/rev/4a831f867cbc
branches: trunk
changeset: 771545:4a831f867cbc
user: joerg <joerg%NetBSD.org@localhost>
date: Thu Nov 24 18:44:25 2011 +0000
description:
In wcscspn and wcspbrk, handle set size of 0 and 1 explicitly.
For larger sets, use a bloom filter to avoid the inner loop for most of
the input. The current implementation uses a simple modular hash as
first function (well suited for input e.g. in ISO Latin character sets)
and a more complex multiplicative hash as second function with a filter
size of 512 Bit. This reduces the typical run time to O(n+m).
diffstat:
lib/libc/string/wcscspn.c | 31 ++++++++++++--
lib/libc/string/wcscspn_bloom.h | 84 +++++++++++++++++++++++++++++++++++++++++
lib/libc/string/wcspbrk.c | 27 ++++++++++--
3 files changed, 132 insertions(+), 10 deletions(-)
diffs (210 lines):
diff -r ff7cfd2f2b68 -r 4a831f867cbc lib/libc/string/wcscspn.c
--- a/lib/libc/string/wcscspn.c Thu Nov 24 18:34:56 2011 +0000
+++ b/lib/libc/string/wcscspn.c Thu Nov 24 18:44:25 2011 +0000
@@ -1,7 +1,8 @@
-/* $NetBSD: wcscspn.c,v 1.3 2011/11/21 15:02:48 joerg Exp $ */
+/* $NetBSD: wcscspn.c,v 1.4 2011/11/24 18:44:25 joerg Exp $ */
/*-
- * Copyright (c)1999 Citrus Project,
+ * Copyright (c) 1999 Citrus Project,
+ * Copyright (c) 2011 Joerg Sonnenberger,
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -29,25 +30,45 @@
*/
#include <sys/cdefs.h>
-__RCSID("$NetBSD: wcscspn.c,v 1.3 2011/11/21 15:02:48 joerg Exp $");
+__RCSID("$NetBSD: wcscspn.c,v 1.4 2011/11/24 18:44:25 joerg Exp $");
#include <assert.h>
+#include <inttypes.h>
+#include <string.h>
#include <wchar.h>
+#include "wcscspn_bloom.h"
+
size_t
wcscspn(const wchar_t *s, const wchar_t *set)
{
+ size_t bloom[BLOOM_ARRAY_SIZE];
const wchar_t *p;
const wchar_t *q;
_DIAGASSERT(s != NULL);
_DIAGASSERT(set != NULL);
+ if (set[0] == '\0')
+ return wcslen(s);
+ if (set[1] == '\0') {
+ for (p = s; *p; ++p)
+ if (*p == set[0])
+ break;
+ return p - s;
+ }
+
+ wcsspn_bloom_init(bloom, set);
+
for (p = s; *p; ++p) {
- for (q = set; *q; ++q) {
+ if (!wcsspn_in_bloom(bloom, *p))
+ continue;
+
+ q = set;
+ do {
if (*p == *q)
goto done;
- }
+ } while (*++q);
}
done:
diff -r ff7cfd2f2b68 -r 4a831f867cbc lib/libc/string/wcscspn_bloom.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/libc/string/wcscspn_bloom.h Thu Nov 24 18:44:25 2011 +0000
@@ -0,0 +1,84 @@
+/* $NetBSD: wcscspn_bloom.h,v 1.1 2011/11/24 18:44:25 joerg Exp $ */
+
+/*-
+ * Copyright (c) 2011 Joerg Sonnenberger,
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Bloom filter for fast test if a given character is part of the reject set.
+ * The test may have false positives, but doesn't have false negatives.
+ * The first hash function is designed to be very fast to evaluate.
+ * It is collision free if the input is part of the same European language
+ * and shouldn't be too bad even other input. The second hash function
+ * tries to provide a much better mixing, but involves the slower
+ * multiplication.
+ */
+
+#define BLOOM_SIZE 64
+#define BLOOM_ARRAY_SIZE (BLOOM_SIZE / sizeof(size_t))
+#define BLOOM_MASK (BLOOM_SIZE * 8 - 1)
+#define BLOOM_DIV (sizeof(size_t) * 8)
+
+static inline size_t
+wcscspn_bloom1(size_t x)
+{
+ return x & BLOOM_MASK;
+}
+
+static inline size_t
+wcscspn_bloom2(size_t x)
+{
+ return (uint32_t)(x * 2654435761U) /
+ (0x100000000ULL / (BLOOM_MASK + 1));
+}
+
+static inline void
+wcsspn_bloom_init(size_t *bloom, const wchar_t *charset)
+{
+ size_t val;
+
+ memset(bloom, 0, BLOOM_SIZE);
+ do {
+ val = wcscspn_bloom1((size_t)*charset);
+ bloom[val / BLOOM_DIV] |= 1ULL << (val % BLOOM_DIV);
+ val = wcscspn_bloom2((size_t)*charset);
+ bloom[val / BLOOM_DIV] |= 1ULL << (val % BLOOM_DIV);
+ }
+ while (*++charset);
+}
+
+static inline int
+wcsspn_in_bloom(const size_t *bloom, wchar_t ch)
+{
+ size_t val;
+
+ val = wcscspn_bloom1((size_t)ch);
+ if (bloom[val / BLOOM_DIV] & (1ULL << (val % BLOOM_DIV)))
+ return 1;
+ val = wcscspn_bloom2((size_t)ch);
+ if (bloom[val / BLOOM_DIV] & (1ULL << (val % BLOOM_DIV)))
+ return 1;
+ return 0;
+}
diff -r ff7cfd2f2b68 -r 4a831f867cbc lib/libc/string/wcspbrk.c
--- a/lib/libc/string/wcspbrk.c Thu Nov 24 18:34:56 2011 +0000
+++ b/lib/libc/string/wcspbrk.c Thu Nov 24 18:44:25 2011 +0000
@@ -1,7 +1,8 @@
-/* $NetBSD: wcspbrk.c,v 1.4 2011/11/21 15:02:48 joerg Exp $ */
+/* $NetBSD: wcspbrk.c,v 1.5 2011/11/24 18:44:25 joerg Exp $ */
/*-
- * Copyright (c)1999 Citrus Project,
+ * Copyright (c) 1999 Citrus Project,
+ * Copyright (c) 2011 Joerg Sonnenberger,
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -29,25 +30,41 @@
*/
#include <sys/cdefs.h>
-__RCSID("$NetBSD: wcspbrk.c,v 1.4 2011/11/21 15:02:48 joerg Exp $");
+__RCSID("$NetBSD: wcspbrk.c,v 1.5 2011/11/24 18:44:25 joerg Exp $");
#include <assert.h>
+#include <inttypes.h>
+#include <string.h>
#include <wchar.h>
+#include "wcscspn_bloom.h"
+
wchar_t *
wcspbrk(const wchar_t *s, const wchar_t *set)
{
+ size_t bloom[BLOOM_ARRAY_SIZE];
const wchar_t *p;
const wchar_t *q;
_DIAGASSERT(s != NULL);
_DIAGASSERT(set != NULL);
+ if (set[0] == '\0')
+ return NULL;
+ if (set[1] == '\0')
+ return wcschr(s, set[0]);
+
+ wcsspn_bloom_init(bloom, set);
+
for (p = s; *p; ++p) {
- for (q = set; *q; ++q) {
+ if (!wcsspn_in_bloom(bloom, *p))
+ continue;
+
+ q = set;
+ do {
if (*p == *q)
return __UNCONST(p);
- }
+ } while (*++q);
}
return NULL;
}
Home |
Main Index |
Thread Index |
Old Index