Source-Changes-D archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

Re: CVS commit: src/lib/libc/string



Am 24.11.11 19:44, schrieb Joerg Sonnenberger:
> Module Name:  src
> Committed By: joerg
> Date:         Thu Nov 24 18:44:25 UTC 2011
> 
> Modified Files:
>       src/lib/libc/string: wcscspn.c wcspbrk.c
> Added Files:
>       src/lib/libc/string: wcscspn_bloom.h

This breaks the build on i386, at least.

> 
> Log Message:
> In wcscspn and wcspbrk, handle set size of 0 and 1 explicitly.
> For larger sets, use a bloom filter to avoid the inner loop for most of
> the input. The current implementation uses a simple modular hash as
> first function (well suited for input e.g. in ISO Latin character sets)
> and a more complex multiplicative hash as second function with a filter
> size of 512 Bit. This reduces the typical run time to O(n+m).
> 
> 
> To generate a diff of this commit:
> cvs rdiff -u -r1.3 -r1.4 src/lib/libc/string/wcscspn.c
> cvs rdiff -u -r0 -r1.1 src/lib/libc/string/wcscspn_bloom.h
> cvs rdiff -u -r1.4 -r1.5 src/lib/libc/string/wcspbrk.c
> 
> Please note that diffs are not public domain; they are subject to the
> copyright notices on the relevant files.
> 
> 
> 
> 
> Modified files:
> 
> Index: src/lib/libc/string/wcscspn.c
> diff -u src/lib/libc/string/wcscspn.c:1.3 src/lib/libc/string/wcscspn.c:1.4
> --- src/lib/libc/string/wcscspn.c:1.3 Mon Nov 21 15:02:48 2011
> +++ src/lib/libc/string/wcscspn.c     Thu Nov 24 18:44:25 2011
> @@ -1,7 +1,8 @@
> -/*   $NetBSD: wcscspn.c,v 1.3 2011/11/21 15:02:48 joerg Exp $        */
> +/*   $NetBSD: wcscspn.c,v 1.4 2011/11/24 18:44:25 joerg Exp $        */
>  
>  /*-
> - * Copyright (c)1999 Citrus Project,
> + * Copyright (c) 1999 Citrus Project,
> + * Copyright (c) 2011 Joerg Sonnenberger,
>   * All rights reserved.
>   *
>   * Redistribution and use in source and binary forms, with or without
> @@ -29,25 +30,45 @@
>   */
>  
>  #include <sys/cdefs.h>
> -__RCSID("$NetBSD: wcscspn.c,v 1.3 2011/11/21 15:02:48 joerg Exp $");
> +__RCSID("$NetBSD: wcscspn.c,v 1.4 2011/11/24 18:44:25 joerg Exp $");
>  
>  #include <assert.h>
> +#include <inttypes.h>
> +#include <string.h>
>  #include <wchar.h>
>  
> +#include "wcscspn_bloom.h"
> +
>  size_t
>  wcscspn(const wchar_t *s, const wchar_t *set)
>  {
> +     size_t bloom[BLOOM_ARRAY_SIZE];
>       const wchar_t *p;
>       const wchar_t *q;
>  
>       _DIAGASSERT(s != NULL);
>       _DIAGASSERT(set != NULL);
>  
> +     if (set[0] == '\0')
> +             return wcslen(s);
> +     if (set[1] == '\0') {
> +             for (p = s; *p; ++p)
> +                     if (*p == set[0])
> +                             break;
> +             return p - s;
> +     }
> +
> +     wcsspn_bloom_init(bloom, set);
> +
>       for (p = s; *p; ++p) {
> -             for (q = set; *q; ++q) {
> +             if (!wcsspn_in_bloom(bloom, *p))
> +                     continue;
> +
> +             q = set;
> +             do {
>                       if (*p == *q)
>                               goto done;
> -             }
> +             } while (*++q);
>       }
>  
>  done:
> 
> Index: src/lib/libc/string/wcspbrk.c
> diff -u src/lib/libc/string/wcspbrk.c:1.4 src/lib/libc/string/wcspbrk.c:1.5
> --- src/lib/libc/string/wcspbrk.c:1.4 Mon Nov 21 15:02:48 2011
> +++ src/lib/libc/string/wcspbrk.c     Thu Nov 24 18:44:25 2011
> @@ -1,7 +1,8 @@
> -/*   $NetBSD: wcspbrk.c,v 1.4 2011/11/21 15:02:48 joerg Exp $        */
> +/*   $NetBSD: wcspbrk.c,v 1.5 2011/11/24 18:44:25 joerg Exp $        */
>  
>  /*-
> - * Copyright (c)1999 Citrus Project,
> + * Copyright (c) 1999 Citrus Project,
> + * Copyright (c) 2011 Joerg Sonnenberger,
>   * All rights reserved.
>   *
>   * Redistribution and use in source and binary forms, with or without
> @@ -29,25 +30,41 @@
>   */
>  
>  #include <sys/cdefs.h>
> -__RCSID("$NetBSD: wcspbrk.c,v 1.4 2011/11/21 15:02:48 joerg Exp $");
> +__RCSID("$NetBSD: wcspbrk.c,v 1.5 2011/11/24 18:44:25 joerg Exp $");
>  
>  #include <assert.h>
> +#include <inttypes.h>
> +#include <string.h>
>  #include <wchar.h>
>  
> +#include "wcscspn_bloom.h"
> +
>  wchar_t *
>  wcspbrk(const wchar_t *s, const wchar_t *set)
>  {
> +     size_t bloom[BLOOM_ARRAY_SIZE];
>       const wchar_t *p;
>       const wchar_t *q;
>  
>       _DIAGASSERT(s != NULL);
>       _DIAGASSERT(set != NULL);
>  
> +     if (set[0] == '\0')
> +             return NULL;
> +     if (set[1] == '\0')
> +             return wcschr(s, set[0]);
> +
> +     wcsspn_bloom_init(bloom, set);
> +
>       for (p = s; *p; ++p) {
> -             for (q = set; *q; ++q) {
> +             if (!wcsspn_in_bloom(bloom, *p))
> +                     continue;
> +
> +             q = set;
> +             do {
>                       if (*p == *q)
>                               return __UNCONST(p);
> -             }
> +             } while (*++q);
>       }
>       return NULL;
>  }
> 
> Added files:
> 
> Index: src/lib/libc/string/wcscspn_bloom.h
> diff -u /dev/null src/lib/libc/string/wcscspn_bloom.h:1.1
> --- /dev/null Thu Nov 24 18:44:25 2011
> +++ src/lib/libc/string/wcscspn_bloom.h       Thu Nov 24 18:44:25 2011
> @@ -0,0 +1,84 @@
> +/*   $NetBSD: wcscspn_bloom.h,v 1.1 2011/11/24 18:44:25 joerg Exp $  */
> +
> +/*-
> + * Copyright (c) 2011 Joerg Sonnenberger,
> + * All rights reserved.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + *    notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + *    notice, this list of conditions and the following disclaimer in the
> + *    documentation and/or other materials provided with the distribution.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
> + * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
> + * SUCH DAMAGE.
> + */
> +
> +/*
> + * Bloom filter for fast test if a given character is part of the reject set.
> + * The test may have false positives, but doesn't have false negatives.
> + * The first hash function is designed to be very fast to evaluate.
> + * It is collision free if the input is part of the same European language
> + * and shouldn't be too bad even other input.  The second hash function
> + * tries to provide a much better mixing, but involves the slower
> + * multiplication.
> + */
> +
> +#define      BLOOM_SIZE              64
> +#define      BLOOM_ARRAY_SIZE        (BLOOM_SIZE / sizeof(size_t))
> +#define      BLOOM_MASK              (BLOOM_SIZE * 8 - 1)
> +#define      BLOOM_DIV               (sizeof(size_t) * 8)
> +
> +static inline size_t
> +wcscspn_bloom1(size_t x)
> +{
> +     return x & BLOOM_MASK;
> +}
> +
> +static inline size_t
> +wcscspn_bloom2(size_t x)
> +{
> +     return (uint32_t)(x * 2654435761U) /
> +         (0x100000000ULL / (BLOOM_MASK + 1));
> +}
> +
> +static inline void
> +wcsspn_bloom_init(size_t *bloom, const wchar_t *charset)
> +{
> +     size_t val;
> +
> +     memset(bloom, 0, BLOOM_SIZE);
> +     do {
> +             val = wcscspn_bloom1((size_t)*charset);
> +             bloom[val / BLOOM_DIV] |= 1ULL << (val % BLOOM_DIV);
> +             val = wcscspn_bloom2((size_t)*charset);
> +             bloom[val / BLOOM_DIV] |= 1ULL << (val % BLOOM_DIV);
> +     }
> +     while (*++charset);
> +}
> +
> +static inline int
> +wcsspn_in_bloom(const size_t *bloom, wchar_t ch)
> +{
> +     size_t val;
> +
> +     val = wcscspn_bloom1((size_t)ch);
> +     if (bloom[val / BLOOM_DIV] & (1ULL << (val % BLOOM_DIV)))
> +             return 1;
> +     val = wcscspn_bloom2((size_t)ch);
> +     if (bloom[val / BLOOM_DIV] & (1ULL << (val % BLOOM_DIV)))
> +             return 1;
> +     return 0;
> +}
> 


-- 
  \~~~~~.                The NetBSD Foundation
   \~~~~~'               Marc Balmer, Developer / Marketing
  NetBSD
     \                   mbalmer%NetBSD.org@localhost   http://www.NetBSD.org/


Home | Main Index | Thread Index | Old Index