Source-Changes-D archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
Re: CVS commit: src/lib/libc/string
Am 24.11.11 19:44, schrieb Joerg Sonnenberger:
> Module Name: src
> Committed By: joerg
> Date: Thu Nov 24 18:44:25 UTC 2011
>
> Modified Files:
> src/lib/libc/string: wcscspn.c wcspbrk.c
> Added Files:
> src/lib/libc/string: wcscspn_bloom.h
This breaks the build on i386, at least.
>
> Log Message:
> In wcscspn and wcspbrk, handle set size of 0 and 1 explicitly.
> For larger sets, use a bloom filter to avoid the inner loop for most of
> the input. The current implementation uses a simple modular hash as
> first function (well suited for input e.g. in ISO Latin character sets)
> and a more complex multiplicative hash as second function with a filter
> size of 512 Bit. This reduces the typical run time to O(n+m).
>
>
> To generate a diff of this commit:
> cvs rdiff -u -r1.3 -r1.4 src/lib/libc/string/wcscspn.c
> cvs rdiff -u -r0 -r1.1 src/lib/libc/string/wcscspn_bloom.h
> cvs rdiff -u -r1.4 -r1.5 src/lib/libc/string/wcspbrk.c
>
> Please note that diffs are not public domain; they are subject to the
> copyright notices on the relevant files.
>
>
>
>
> Modified files:
>
> Index: src/lib/libc/string/wcscspn.c
> diff -u src/lib/libc/string/wcscspn.c:1.3 src/lib/libc/string/wcscspn.c:1.4
> --- src/lib/libc/string/wcscspn.c:1.3 Mon Nov 21 15:02:48 2011
> +++ src/lib/libc/string/wcscspn.c Thu Nov 24 18:44:25 2011
> @@ -1,7 +1,8 @@
> -/* $NetBSD: wcscspn.c,v 1.3 2011/11/21 15:02:48 joerg Exp $ */
> +/* $NetBSD: wcscspn.c,v 1.4 2011/11/24 18:44:25 joerg Exp $ */
>
> /*-
> - * Copyright (c)1999 Citrus Project,
> + * Copyright (c) 1999 Citrus Project,
> + * Copyright (c) 2011 Joerg Sonnenberger,
> * All rights reserved.
> *
> * Redistribution and use in source and binary forms, with or without
> @@ -29,25 +30,45 @@
> */
>
> #include <sys/cdefs.h>
> -__RCSID("$NetBSD: wcscspn.c,v 1.3 2011/11/21 15:02:48 joerg Exp $");
> +__RCSID("$NetBSD: wcscspn.c,v 1.4 2011/11/24 18:44:25 joerg Exp $");
>
> #include <assert.h>
> +#include <inttypes.h>
> +#include <string.h>
> #include <wchar.h>
>
> +#include "wcscspn_bloom.h"
> +
> size_t
> wcscspn(const wchar_t *s, const wchar_t *set)
> {
> + size_t bloom[BLOOM_ARRAY_SIZE];
> const wchar_t *p;
> const wchar_t *q;
>
> _DIAGASSERT(s != NULL);
> _DIAGASSERT(set != NULL);
>
> + if (set[0] == '\0')
> + return wcslen(s);
> + if (set[1] == '\0') {
> + for (p = s; *p; ++p)
> + if (*p == set[0])
> + break;
> + return p - s;
> + }
> +
> + wcsspn_bloom_init(bloom, set);
> +
> for (p = s; *p; ++p) {
> - for (q = set; *q; ++q) {
> + if (!wcsspn_in_bloom(bloom, *p))
> + continue;
> +
> + q = set;
> + do {
> if (*p == *q)
> goto done;
> - }
> + } while (*++q);
> }
>
> done:
>
> Index: src/lib/libc/string/wcspbrk.c
> diff -u src/lib/libc/string/wcspbrk.c:1.4 src/lib/libc/string/wcspbrk.c:1.5
> --- src/lib/libc/string/wcspbrk.c:1.4 Mon Nov 21 15:02:48 2011
> +++ src/lib/libc/string/wcspbrk.c Thu Nov 24 18:44:25 2011
> @@ -1,7 +1,8 @@
> -/* $NetBSD: wcspbrk.c,v 1.4 2011/11/21 15:02:48 joerg Exp $ */
> +/* $NetBSD: wcspbrk.c,v 1.5 2011/11/24 18:44:25 joerg Exp $ */
>
> /*-
> - * Copyright (c)1999 Citrus Project,
> + * Copyright (c) 1999 Citrus Project,
> + * Copyright (c) 2011 Joerg Sonnenberger,
> * All rights reserved.
> *
> * Redistribution and use in source and binary forms, with or without
> @@ -29,25 +30,41 @@
> */
>
> #include <sys/cdefs.h>
> -__RCSID("$NetBSD: wcspbrk.c,v 1.4 2011/11/21 15:02:48 joerg Exp $");
> +__RCSID("$NetBSD: wcspbrk.c,v 1.5 2011/11/24 18:44:25 joerg Exp $");
>
> #include <assert.h>
> +#include <inttypes.h>
> +#include <string.h>
> #include <wchar.h>
>
> +#include "wcscspn_bloom.h"
> +
> wchar_t *
> wcspbrk(const wchar_t *s, const wchar_t *set)
> {
> + size_t bloom[BLOOM_ARRAY_SIZE];
> const wchar_t *p;
> const wchar_t *q;
>
> _DIAGASSERT(s != NULL);
> _DIAGASSERT(set != NULL);
>
> + if (set[0] == '\0')
> + return NULL;
> + if (set[1] == '\0')
> + return wcschr(s, set[0]);
> +
> + wcsspn_bloom_init(bloom, set);
> +
> for (p = s; *p; ++p) {
> - for (q = set; *q; ++q) {
> + if (!wcsspn_in_bloom(bloom, *p))
> + continue;
> +
> + q = set;
> + do {
> if (*p == *q)
> return __UNCONST(p);
> - }
> + } while (*++q);
> }
> return NULL;
> }
>
> Added files:
>
> Index: src/lib/libc/string/wcscspn_bloom.h
> diff -u /dev/null src/lib/libc/string/wcscspn_bloom.h:1.1
> --- /dev/null Thu Nov 24 18:44:25 2011
> +++ src/lib/libc/string/wcscspn_bloom.h Thu Nov 24 18:44:25 2011
> @@ -0,0 +1,84 @@
> +/* $NetBSD: wcscspn_bloom.h,v 1.1 2011/11/24 18:44:25 joerg Exp $ */
> +
> +/*-
> + * Copyright (c) 2011 Joerg Sonnenberger,
> + * All rights reserved.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + * notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + * notice, this list of conditions and the following disclaimer in the
> + * documentation and/or other materials provided with the distribution.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
> + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
> + * SUCH DAMAGE.
> + */
> +
> +/*
> + * Bloom filter for fast test if a given character is part of the reject set.
> + * The test may have false positives, but doesn't have false negatives.
> + * The first hash function is designed to be very fast to evaluate.
> + * It is collision free if the input is part of the same European language
> + * and shouldn't be too bad even other input. The second hash function
> + * tries to provide a much better mixing, but involves the slower
> + * multiplication.
> + */
> +
> +#define BLOOM_SIZE 64
> +#define BLOOM_ARRAY_SIZE (BLOOM_SIZE / sizeof(size_t))
> +#define BLOOM_MASK (BLOOM_SIZE * 8 - 1)
> +#define BLOOM_DIV (sizeof(size_t) * 8)
> +
> +static inline size_t
> +wcscspn_bloom1(size_t x)
> +{
> + return x & BLOOM_MASK;
> +}
> +
> +static inline size_t
> +wcscspn_bloom2(size_t x)
> +{
> + return (uint32_t)(x * 2654435761U) /
> + (0x100000000ULL / (BLOOM_MASK + 1));
> +}
> +
> +static inline void
> +wcsspn_bloom_init(size_t *bloom, const wchar_t *charset)
> +{
> + size_t val;
> +
> + memset(bloom, 0, BLOOM_SIZE);
> + do {
> + val = wcscspn_bloom1((size_t)*charset);
> + bloom[val / BLOOM_DIV] |= 1ULL << (val % BLOOM_DIV);
> + val = wcscspn_bloom2((size_t)*charset);
> + bloom[val / BLOOM_DIV] |= 1ULL << (val % BLOOM_DIV);
> + }
> + while (*++charset);
> +}
> +
> +static inline int
> +wcsspn_in_bloom(const size_t *bloom, wchar_t ch)
> +{
> + size_t val;
> +
> + val = wcscspn_bloom1((size_t)ch);
> + if (bloom[val / BLOOM_DIV] & (1ULL << (val % BLOOM_DIV)))
> + return 1;
> + val = wcscspn_bloom2((size_t)ch);
> + if (bloom[val / BLOOM_DIV] & (1ULL << (val % BLOOM_DIV)))
> + return 1;
> + return 0;
> +}
>
--
\~~~~~. The NetBSD Foundation
\~~~~~' Marc Balmer, Developer / Marketing
NetBSD
\ mbalmer%NetBSD.org@localhost http://www.NetBSD.org/
Home |
Main Index |
Thread Index |
Old Index