tech-kern archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

Re: Patch: cprng_fast performance - please review.



On Wed, Apr 16, 2014 at 09:52:22PM -0400, Thor Lancelot Simon wrote:
> 
> Attached is a patch which makes cprng_fast per-CPU and lockless.  *IT IS NOT
> WELL TESTED YET (I haven't even run test vectors) AND IS ONLY FOR REVIEW.*

New diff, with some missing files and incorporating some more comments
from Taylor.

Thor
? kern/.init_main.c.swp
? sys/.cprng.h.swo
Index: conf/files
===================================================================
RCS file: /cvsroot/src/sys/conf/files,v
retrieving revision 1.1090
diff -u -p -r1.1090 files
--- conf/files  1 Apr 2014 17:49:30 -0000       1.1090
+++ conf/files  17 Apr 2014 03:17:18 -0000
@@ -160,6 +160,7 @@ include "crypto/cast128/files.cast128"
 include "crypto/rijndael/files.rijndael"
 include "crypto/skipjack/files.skipjack"
 include "crypto/camellia/files.camellia"
+include "crypto/hc128/files.hc128"
 # General-purpose crypto processing framework.
 include "opencrypto/files.opencrypto"
 
Index: crypto/hc128/files.hc128
===================================================================
RCS file: crypto/hc128/files.hc128
diff -N crypto/hc128/files.hc128
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ crypto/hc128/files.hc128    17 Apr 2014 03:17:18 -0000
@@ -0,0 +1,5 @@
+#      $NetBSD: $
+
+define hc128
+
+file   crypto/hc128/hc128.c
Index: crypto/hc128/hc128.c
===================================================================
RCS file: crypto/hc128/hc128.c
diff -N crypto/hc128/hc128.c
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ crypto/hc128/hc128.c        17 Apr 2014 03:17:18 -0000
@@ -0,0 +1,162 @@
+/*     $NetBSD: $ */
+
+/* Author: Lucas Clemente Vella
+ * Source code placed into public domain. */
+
+/*
+ * This is the HC-128 stream cipher, one of the eStream Profile 1
+ * selected ciphers.  It is based on SHA-256.
+ *
+ * This cipher has a very large key setup time (estimated at 74,000
+ * cycles on a modern x86 CPU) but is quite efficient once keyed:
+ * around 3 cycles per byte.  Since it produces a stream of 32-bit
+ * values, it is well suited for use as a kernel RNG, usually requiring
+ * no output buffering and wasting little of the output stream.
+ */
+
+#include <crypto/hc128/hc128.h>
+
+static inline uint32_t
+rotl(uint32_t x, unsigned int n)
+{
+       return (x << n) | (x >> (32-n));
+}
+
+static inline unsigned int
+m512(unsigned int x)
+{
+       static const unsigned int mask = 0x1ff; /* 511 mask, for mod 512 */
+       return x & mask;
+}
+
+static inline uint32_t
+f1(uint32_t x)
+{
+       return rotl(x, 25) ^ rotl(x, 14) ^ (x >> 3);
+}
+
+static inline uint32_t
+f2(uint32_t x)
+{
+       return rotl(x, 15) ^ rotl(x, 13) ^ (x >> 10);
+}
+
+static inline uint32_t
+g1(uint32_t x, uint32_t y, uint32_t z)
+{
+       return (rotl(x, 22) ^ rotl(z, 9)) + rotl(y, 24);
+}
+
+static inline uint32_t
+g2(uint32_t x, uint32_t y, uint32_t z)
+{
+       return (rotl(x, 10) ^ rotl(z, 23)) + rotl(y, 8);
+}
+
+static inline uint32_t
+h(const uint32_t *qp, uint32_t x)
+{
+       return qp[x & 0xFFu] + qp[256 + ((x >> 16) & 0xFFu)];
+}
+
+static inline uint32_t
+round_expression(uint32_t *pq, const uint32_t *qp,
+                uint32_t (*g)(uint32_t x, uint32_t y, uint32_t z),
+                uint16_t i)
+{
+       pq[i] += g(pq[m512(i-3u)], pq[m512(i-10u)], pq[m512(i+1u)]);
+       return pq[i] ^ h(qp, pq[m512(i-12u)]);
+}
+
+static inline uint32_t
+pack_littleendian(const uint8_t *v)
+{
+#if BYTE_ORDER == LITTLE_ENDIAN        /* LITTLE_ENDIAN alone is always defined */
+       return *((const uint32_t*)v);   /* fast path: v must be 4-byte aligned */
+#else
+       return (uint32_t)v[3] << 24
+           | (uint32_t)v[2] << 16
+           | (uint32_t)v[1] << 8
+           | (uint32_t)v[0];
+#endif
+}
+
+static inline void
+unpack_littleendian(uint32_t value, uint8_t *v)
+{
+#if BYTE_ORDER == LITTLE_ENDIAN
+       *((uint32_t*)v) = value;
+#else
+       int i;
+       for(i = 0; i < 4; ++i) {
+               v[i] = value >> (i * 8);
+       }
+#endif
+}
+
+/** Initialize HC-128 state with key and IV.
+ *
+ * Contrary to the other implemented algorithms, the key and IV are taken
+ * in a single function to initialize the state. This approach was chosen
+ * here because of the nature of the algorithm, that keeps no intermediate
+ * state between the key setting and the IV setting.
+ *
+ * Notice: an IV should never be reused.
+ *
+ * @param state The uninitialized state; it will be ready for
+ * encryption/decryption afterwards.
+ * @param key 16 bytes buffer containing the 128-bit key. The buffer must
+ * be aligned to at least 4 bytes (depending on the platform it may or may
+ * not work with unaligned memory).
+ * @param iv 16 bytes buffer containing the IV.
+ */
+void
+hc128_init(hc128_state_t *state, const uint8_t *key, const uint8_t *iv)
+{
+       unsigned int i;
+       uint32_t w[1280], *p = state->p, *q = state->q;
+
+       for(i = 0; i < 4; ++i) {
+               w[i] = w[i+4] = pack_littleendian(key + 4 * i);
+               w[i+8] = w[i+12] = pack_littleendian(iv + 4 * i);
+       }
+
+       for(i = 16; i < 1280; ++i) {
+               w[i] = f2(w[i-2]) + w[i-7] + f1(w[i-15]) + w[i-16] + i;
+       }
+
+       for(i = 0; i < 512; ++i) {
+               p[i] = w[i+256];
+               q[i] = w[i+768];
+       }
+
+       for(i = 0; i < 512; ++i) {
+               p[i] = round_expression(p, q, g1, i);
+       }
+
+       for(i = 0; i < 512; ++i) {
+               q[i] = round_expression(q, p, g2, i);
+       }
+
+       state->i = 0;
+}
+
+/** Performs one round of the algorithm.
+ *
+ * @param state The algorithm state.
+ * @param stream A 4 byte buffer where the generated stream will be stored.
+ * Must be aligned.
+ */
+void
+hc128_extract(hc128_state_t *state, uint8_t *stream)
+{
+       register uint32_t ret;
+
+       uint16_t i = state->i;
+       state->i = (i + 1u) & 1023u;
+
+       ret = (i < 512) ? round_expression(state->p, state->q, g1, i) :
+                         round_expression(state->q, state->p, g2, m512(i));
+
+       unpack_littleendian(ret, stream);
+}
Index: crypto/hc128/hc128.h
===================================================================
RCS file: crypto/hc128/hc128.h
diff -N crypto/hc128/hc128.h
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ crypto/hc128/hc128.h        17 Apr 2014 03:17:18 -0000
@@ -0,0 +1,22 @@
+/*     $NetBSD: $ */
+
+/* Author: Lucas Clemente Vella
+ * Source code placed into public domain. */
+
+#ifndef _CRYPTO_HC128_H_
+#define _CRYPTO_HC128_H_
+
+#include <sys/types.h>
+
+typedef struct
+{
+       uint32_t p[512];
+       uint32_t q[512];
+       uint16_t i;
+} hc128_state_t;
+
+void hc128_init(hc128_state_t *, const uint8_t *, const uint8_t *);
+
+void hc128_extract(hc128_state_t *, uint8_t *);
+
+#endif
Index: kern/init_main.c
===================================================================
RCS file: /cvsroot/src/sys/kern/init_main.c,v
retrieving revision 1.454.2.1
diff -u -p -r1.454.2.1 init_main.c
--- kern/init_main.c    7 Apr 2014 02:20:00 -0000       1.454.2.1
+++ kern/init_main.c    17 Apr 2014 03:17:19 -0000
@@ -497,6 +497,8 @@ main(void)
        /* Initialize the kernel strong PRNG. */
        kern_cprng = cprng_strong_create("kernel", IPL_VM,
                                         CPRNG_INIT_ANY|CPRNG_REKEY_ANY);
+
+       cprng_fast_init();
                                         
        /* Initialize interfaces. */
        ifinit1();
Index: kern/subr_cprng.c
===================================================================
RCS file: /cvsroot/src/sys/kern/subr_cprng.c,v
retrieving revision 1.23
diff -u -p -r1.23 subr_cprng.c
--- kern/subr_cprng.c   17 Jan 2014 02:12:48 -0000      1.23
+++ kern/subr_cprng.c   17 Apr 2014 03:17:19 -0000
@@ -43,6 +43,7 @@ __KERNEL_RCSID(0, "$NetBSD: subr_cprng.c
 #include <sys/kmem.h>
 #include <sys/lwp.h>
 #include <sys/once.h>
+#include <sys/percpu.h>
 #include <sys/poll.h>          /* XXX POLLIN/POLLOUT/&c. */
 #include <sys/select.h>
 #include <sys/systm.h>
@@ -54,6 +55,7 @@ __KERNEL_RCSID(0, "$NetBSD: subr_cprng.c
 #endif
 
 #include <crypto/nist_ctr_drbg/nist_ctr_drbg.h>
+#include <crypto/hc128/hc128.h>
 
 #if defined(__HAVE_CPU_COUNTER)
 #include <machine/cpu_counter.h>
@@ -72,6 +74,13 @@ static void  cprng_strong_rngtest(struct 
 
 static rndsink_callback_t      cprng_strong_rndsink_callback;
 
+percpu_t *percpu_cprng_fast_ctx;
+static int cprng_fast_initialized;
+
+static void cprng_fast_randrekey(cprng_fast_ctx_t *);
+
+void *cprng_fast_rekey_softintr = NULL;
+
 void
 cprng_init(void)
 {
@@ -103,10 +112,11 @@ cprng_counter(void)
                return cpu_counter32();
 #endif
        if (__predict_false(cold)) {
+               static int ctr;
                /* microtime unsafe if clock not running yet */
-               return 0;
+               return ctr++;
        }
-       microtime(&tv);
+       getmicrotime(&tv);
        return (tv.tv_sec * 1000000 + tv.tv_usec);
 }
 
@@ -532,8 +542,16 @@ sysctl_kern_urnd(SYSCTLFN_ARGS)
 }
 
 /*
- * sysctl helper routine for kern.arandom node. Picks a random number
- * for you.
+ * sysctl helper routine for kern.arandom node.  Fills the supplied
+ * structure with random data for you.
+ *
+ * This node was originally declared as type "int" but its implementation
+ * in OpenBSD, whence it came, would happily return up to 8K of data if
+ * requested.  Evidently this was used to key RC4 in userspace.
+ *
+ * In NetBSD, the libc stack-smash-protection code reads 64 bytes
+ * from here at every program startup.  So though it would be nice
+ * to make this node return only 32 or 64 bits, we can't.  Too bad!
  */
 static int
 sysctl_kern_arnd(SYSCTLFN_ARGS)
@@ -542,31 +560,145 @@ sysctl_kern_arnd(SYSCTLFN_ARGS)
        void *v;
        struct sysctlnode node = *rnode;
 
-       if (*oldlenp == 0)
+       switch (*oldlenp) {
+           case 0:
                return 0;
+           default:
+               if (*oldlenp > 256) {
+                       return E2BIG;
+               }
+               v = kmem_alloc(*oldlenp, KM_SLEEP);
+               cprng_fast(v, *oldlenp);
+               node.sysctl_data = v;
+               node.sysctl_size = *oldlenp;
+               error = sysctl_lookup(SYSCTLFN_CALL(&node));
+               kmem_free(v, *oldlenp);
+               return error;
+       }
+}
+
+static void
+cprng_fast_randrekey(cprng_fast_ctx_t *ctx)
+{
+       uint8_t key[16], iv[16];
+       hc128_state_t tempstate;
+       int s;
+
+       int have_initial = rnd_initial_entropy;
+
+       cprng_strong(kern_cprng, key, sizeof(key), FASYNC);
+       cprng_strong(kern_cprng, iv, sizeof(iv), FASYNC);
+
+       /* Key a scratch state - expensive, so don't do this at splhigh. */
+       hc128_init(&tempstate, key, iv);
+       explicit_memset(key, 0, sizeof(key));
+       explicit_memset(iv, 0, sizeof(iv));
+
+       s = splhigh();
+       memcpy(&ctx->hc128, &tempstate, sizeof(tempstate));
+       splx(s);
+       
+       explicit_memset(&tempstate, 0, sizeof(tempstate));
+
        /*
-        * This code used to allow sucking 8192 bytes at a time out
-        * of the kernel arc4random generator.  Evidently there is some
-        * very old OpenBSD application code that may try to do this.
-        *
-        * Note that this node is documented as type "INT" -- 4 or 8
-        * bytes, not 8192.
-        *
-        * We continue to support this abuse of the "len" pointer here
-        * but only 256 bytes at a time, as, anecdotally, the actual
-        * application use here was to generate RC4 keys in userspace.
-        *
-        * Support for such large requests will probably be removed
-        * entirely in the future.
+        * Reset for next reseed cycle.
         */
-       if (*oldlenp > 256)
-               return E2BIG;
+       ctx->nextreseed = time_uptime +
+           (have_initial ? CPRNGF_RESEED_SECONDS : 0);
+       ctx->numbytes = 0;
+}
+
+static void
+cprng_fast_init_ctx(void *v,
+             void *arg __unused,
+             struct cpu_info * ci __unused)
+{
+       cprng_fast_ctx_t *ctx = v;
+       cprng_fast_randrekey(ctx);
+}
+
+static void
+cprng_fast_rekey_one(void *arg __unused)
+{
+       cprng_fast_ctx_t *ctx = percpu_getref(percpu_cprng_fast_ctx);
+
+       cprng_fast_randrekey(ctx);
+       percpu_putref(percpu_cprng_fast_ctx);
+}
+
+void
+cprng_fast_init(void)
+{
+        percpu_cprng_fast_ctx = percpu_alloc(sizeof(cprng_fast_ctx_t));
+        percpu_foreach(percpu_cprng_fast_ctx, cprng_fast_init_ctx, NULL);
+       cprng_fast_rekey_softintr = softint_establish(SOFTINT_CLOCK|SOFTINT_MPSAFE,
+                                               cprng_fast_rekey_one, NULL);
+       cprng_fast_initialized++;       /* set last: rekey softint must exist */
+}
+
+size_t
+_cprng_fast_exact(void *p, size_t len)
+{
+       uint32_t *pi = p, *iter;
+       int s;
+       size_t ilen = len / sizeof(*pi);
+       cprng_fast_ctx_t *ctx = percpu_getref(percpu_cprng_fast_ctx);
+
+       KASSERT(cprng_fast_initialized);
+       KASSERT(0 == ((uintptr_t)p % sizeof(uint32_t)));
+       KASSERT(ilen * sizeof(*pi) == len);
+
+       _cprng_fast_checkrekey(ctx);
+
+       s = splhigh();
+       for (iter = pi; iter < pi + ilen; iter++) {
+               hc128_extract(&ctx->hc128, (uint8_t *)iter);
+       }
+       splx(s);
+
+       ctx->numbytes += len;
+       percpu_putref(percpu_cprng_fast_ctx);
+       return len;
+}
+
+size_t
+_cprng_fast_inexact(void *p, size_t len)
+{
+       uint8_t *pc = p;
+       uint32_t *pi = p, tmp, *iter;
+       int s;
+       size_t initial_len, aligned_len, final_len, main_len;
+       cprng_fast_ctx_t *ctx = percpu_getref(percpu_cprng_fast_ctx);
+
+       KASSERT(cprng_fast_initialized);
+
+       /* Head bytes before the first aligned word: 0..3, capped at len. */
+       initial_len = -(uintptr_t)pc % sizeof(uint32_t);
+       initial_len = initial_len > len ? len : initial_len;
+       aligned_len = len - initial_len;
+       final_len = aligned_len % sizeof(uint32_t);
+       main_len = aligned_len / sizeof(uint32_t);
+
+       _cprng_fast_checkrekey(ctx);
+
+       s = splhigh();
+       if (initial_len) {
+               hc128_extract(&ctx->hc128, (uint8_t *)&tmp);
+               memcpy(pc, &tmp, initial_len);
+               pi = (uint32_t *)(pc + initial_len);    /* skip the head */
+       }
+
+       for (iter = pi; iter < pi + main_len ; iter++) {
+               hc128_extract(&ctx->hc128, (uint8_t *)iter);
+       }
+
+       if (final_len) {
+               hc128_extract(&ctx->hc128, (uint8_t *)&tmp);
+               memcpy(pi + main_len, &tmp, final_len);
+       }
+       splx(s);
 
-       v = kmem_alloc(*oldlenp, KM_SLEEP);
-       cprng_fast(v, *oldlenp);
-       node.sysctl_data = v;
-       node.sysctl_size = *oldlenp;
-       error = sysctl_lookup(SYSCTLFN_CALL(&node));
-       kmem_free(v, *oldlenp);
-       return error;
+       ctx->numbytes += len;
+       percpu_putref(percpu_cprng_fast_ctx);
+       return len;
 }
Index: lib/libkern/Makefile.libkern
===================================================================
RCS file: /cvsroot/src/sys/lib/libkern/Makefile.libkern,v
retrieving revision 1.32.2.1
diff -u -p -r1.32.2.1 Makefile.libkern
--- lib/libkern/Makefile.libkern        7 Apr 2014 01:10:55 -0000       1.32.2.1
+++ lib/libkern/Makefile.libkern        17 Apr 2014 03:17:19 -0000
@@ -54,7 +54,7 @@ SRCS+=        cpuset.c inet_addr.c intoa.c
 SRCS+= bswap64.c
 .endif
 SRCS+= md4c.c md5c.c rmd160.c sha1.c sha2.c murmurhash.c
-SRCS+= pmatch.c arc4random.c bcd.c mcount.c mertwist.c crc32.c
+SRCS+= pmatch.c bcd.c mcount.c mertwist.c crc32.c
 
 SRCS+= ppath_kmem_alloc.c
 
Index: lib/libkern/arc4random.c
===================================================================
RCS file: lib/libkern/arc4random.c
diff -N lib/libkern/arc4random.c
--- lib/libkern/arc4random.c    24 Jun 2013 04:21:20 -0000      1.35
+++ /dev/null   1 Jan 1970 00:00:00 -0000
@@ -1,277 +0,0 @@
-/*     $NetBSD: arc4random.c,v 1.35 2013/06/24 04:21:20 riastradh Exp $        */
-
-/*-
- * Copyright (c) 2002, 2011 The NetBSD Foundation, Inc.
- * All rights reserved.
- *
- * This code is derived from software contributed to The NetBSD Foundation
- * by Thor Lancelot Simon.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
- * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
- * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*-
- * THE BEER-WARE LICENSE
- *
- * <dan%FreeBSD.ORG@localhost> wrote this file.  As long as you retain this notice you
- * can do whatever you want with this stuff.  If we meet some day, and you
- * think this stuff is worth it, you can buy me a beer in return.
- *
- * Dan Moschuk
- *
- * $FreeBSD: src/sys/libkern/arc4random.c,v 1.9 2001/08/30 12:30:58 bde Exp $
- */
-
-#include <sys/cdefs.h>
-
-#include <sys/param.h>
-#include <sys/types.h>
-#include <sys/rngtest.h>
-#include <sys/systm.h>
-#include <sys/time.h>
-
-#ifdef _STANDALONE
-/*
- * XXX This is a load of bollocks.  Standalone has no entropy source.
- * This module should be removed from libkern once we confirm nobody is
- * using it.
- */
-#define        time_uptime     1
-typedef struct kmutex  *kmutex_t;
-#define        MUTEX_DEFAULT   0
-#define        IPL_VM          0
-static void mutex_init(kmutex_t *m, int t, int i) {}
-static void mutex_spin_enter(kmutex_t *m) {}
-static void mutex_spin_exit(kmutex_t *m) {}
-typedef void rndsink_callback_t(void *, const void *, size_t);
-struct rndsink;
-static struct rndsink *rndsink_create(size_t n, rndsink_callback_t c, void *a)
-  { return NULL; }
-static bool rndsink_request(struct rndsink *s, void *b, size_t n)
-  { return true; }
-#else  /* !_STANDALONE */
-#include <sys/kernel.h>
-#include <sys/mutex.h>
-#include <sys/rndsink.h>
-#endif /* _STANDALONE */
-
-#include <lib/libkern/libkern.h>
-
-/*
- * The best known attack that distinguishes RC4 output from a random
- * bitstream requires 2^25 bytes.  (see Paul and Preneel, Analysis of
- * Non-fortuitous Predictive States of the RC4 Keystream Generator.
- * INDOCRYPT 2003, pp52 – 67).
- *
- * However, we discard the first 1024 bytes of output, avoiding the
- * biases detected in this paper.  The best current attack that
- * can distinguish this "RC4[drop]" output seems to be Fleuhrer &
- * McGrew's attack which requires 2^30.6 bytes of output:
- * Fluhrer and McGrew, Statistical Analysis of the Alleged RC4
- * Keystream Generator. FSE 2000, pp19 – 30
- *
- * We begin trying to rekey at 2^24 bytes, and forcibly rekey at 2^29 bytes
- * even if the resulting key cannot be guaranteed to have full entropy.
- */
-#define        ARC4_MAXBYTES           (16 * 1024 * 1024)
-#define ARC4_HARDMAX           (512 * 1024 * 1024)
-#define        ARC4_RESEED_SECONDS     300
-#define        ARC4_KEYBYTES           16 /* 128 bit key */
-
-static kmutex_t        arc4_mtx;
-static struct rndsink *arc4_rndsink;
-
-static u_int8_t arc4_i, arc4_j;
-static int arc4_initialized = 0;
-static int arc4_numbytes = 0;
-static u_int8_t arc4_sbox[256];
-static time_t arc4_nextreseed;
-
-static rndsink_callback_t arc4_rndsink_callback;
-static void arc4_randrekey(void);
-static void arc4_randrekey_from(const uint8_t[ARC4_KEYBYTES], bool);
-static void arc4_init(void);
-static inline u_int8_t arc4_randbyte(void);
-static inline void arc4randbytes_unlocked(void *, size_t);
-void _arc4randbytes(void *, size_t);
-uint32_t _arc4random(void);
-
-static inline void
-arc4_swap(u_int8_t *a, u_int8_t *b)
-{
-       u_int8_t c;
-
-       c = *a;
-       *a = *b;
-       *b = c;
-}
-
-static void
-arc4_rndsink_callback(void *context __unused, const void *seed, size_t bytes)
-{
-
-       KASSERT(bytes == ARC4_KEYBYTES);
-       arc4_randrekey_from(seed, true);
-}
-
-/*
- * Stir our S-box with whatever we can get from the system entropy pool
- * now.
- */
-static void
-arc4_randrekey(void)
-{
-       uint8_t seed[ARC4_KEYBYTES];
-
-       const bool full_entropy = rndsink_request(arc4_rndsink, seed,
-           sizeof(seed));
-       arc4_randrekey_from(seed, full_entropy);
-       explicit_memset(seed, 0, sizeof(seed));
-}
-
-/*
- * Stir our S-box with what's in seed.
- */
-static void
-arc4_randrekey_from(const uint8_t seed[ARC4_KEYBYTES], bool full_entropy)
-{
-       uint8_t key[256];
-       size_t n;
-
-       mutex_spin_enter(&arc4_mtx);
-
-       (void)memcpy(key, seed, ARC4_KEYBYTES);
-
-       /* Rekey the arc4 state.  */
-       for (n = ARC4_KEYBYTES; n < sizeof(key); n++)
-               key[n] = key[n % ARC4_KEYBYTES];
-
-       for (n = 0; n < 256; n++) {
-               arc4_j = (arc4_j + arc4_sbox[n] + key[n]) % 256;
-               arc4_swap(&arc4_sbox[n], &arc4_sbox[arc4_j]);
-       }
-       arc4_i = arc4_j;
-
-       explicit_memset(key, 0, sizeof(key));
-
-       /*
-        * Throw away the first N words of output, as suggested in the
-        * paper "Weaknesses in the Key Scheduling Algorithm of RC4" by
-        * Fluher, Mantin, and Shamir.  (N = 256 in our case.)
-        */
-       for (n = 0; n < 256 * 4; n++)
-               arc4_randbyte();
-
-       /*
-        * Reset for next reseed cycle.  If we don't have full entropy,
-        * caller has scheduled a reseed already.
-        */
-       arc4_nextreseed = time_uptime +
-           (full_entropy? ARC4_RESEED_SECONDS : 0);
-       arc4_numbytes = 0;
-
-#if 0                          /* XXX */
-       arc4_rngtest();
-#endif
-
-       mutex_spin_exit(&arc4_mtx);
-}
-
-/*
- * Initialize our S-box to its beginning defaults.
- */
-static void
-arc4_init(void)
-{
-       int n;
-
-       mutex_init(&arc4_mtx, MUTEX_DEFAULT, IPL_VM);
-       arc4_rndsink = rndsink_create(ARC4_KEYBYTES, &arc4_rndsink_callback,
-           NULL);
-
-       arc4_i = arc4_j = 0;
-       for (n = 0; n < 256; n++)
-               arc4_sbox[n] = (u_int8_t) n;
-
-       arc4_randrekey();
-       arc4_initialized = 1;
-}
-
-/*
- * Generate a random byte.
- */
-static inline u_int8_t
-arc4_randbyte(void)
-{
-       u_int8_t arc4_t;
-
-       arc4_i = (arc4_i + 1) % 256;
-       arc4_j = (arc4_j + arc4_sbox[arc4_i]) % 256;
-
-       arc4_swap(&arc4_sbox[arc4_i], &arc4_sbox[arc4_j]);
-
-       arc4_t = (arc4_sbox[arc4_i] + arc4_sbox[arc4_j]) % 256;
-       return arc4_sbox[arc4_t];
-}
-
-static inline void
-arc4randbytes_unlocked(void *p, size_t len)
-{
-       u_int8_t *buf = (u_int8_t *)p;
-       size_t i;
-
-       for (i = 0; i < len; buf[i] = arc4_randbyte(), i++)
-               continue;
-}
-
-void
-_arc4randbytes(void *p, size_t len)
-{
-       /* Initialize array if needed. */
-       if (!arc4_initialized) {
-               arc4_init();
-               /* avoid conditionalizing locking */
-               arc4randbytes_unlocked(p, len);
-               arc4_numbytes += len;
-               return;
-       }
-       mutex_spin_enter(&arc4_mtx);
-       arc4randbytes_unlocked(p, len);
-       arc4_numbytes += len;
-       mutex_spin_exit(&arc4_mtx);
-       if ((arc4_numbytes > ARC4_MAXBYTES) ||
-           (time_uptime > arc4_nextreseed)) {
-               arc4_randrekey();
-       }
-}
-
-u_int32_t
-_arc4random(void)
-{
-        u_int32_t ret;
-        u_int8_t *retc;
-
-        retc = (u_int8_t *)&ret;
-
-        _arc4randbytes(retc, sizeof(u_int32_t));
-        return ret;
-}
Index: nfs/nfs_subs.c
===================================================================
RCS file: /cvsroot/src/sys/nfs/nfs_subs.c,v
retrieving revision 1.225
diff -u -p -r1.225 nfs_subs.c
--- nfs/nfs_subs.c      17 Mar 2014 09:35:24 -0000      1.225
+++ nfs/nfs_subs.c      17 Apr 2014 03:17:19 -0000
@@ -1489,7 +1489,6 @@ nfs_init0(void)
        nfs_ticks = (hz * NFS_TICKINTVL + 500) / 1000;
        if (nfs_ticks < 1)
                nfs_ticks = 1;
-       nfs_xid = cprng_fast32();
        nfsdreq_init();
 
        /*
@@ -1994,6 +1993,10 @@ nfs_getxid(void)
 {
        u_int32_t newxid;
 
+       if (__predict_false(nfs_xid == 0)) {
+               nfs_xid = cprng_fast32();
+       }
+
        /* get next xid.  skip 0 */
        do {
                newxid = atomic_inc_32_nv(&nfs_xid);
Index: sys/cprng.h
===================================================================
RCS file: /cvsroot/src/sys/sys/cprng.h,v
retrieving revision 1.9
diff -u -p -r1.9 cprng.h
--- sys/cprng.h 17 Jan 2014 02:08:56 -0000      1.9
+++ sys/cprng.h 17 Apr 2014 03:17:19 -0000
@@ -41,42 +41,91 @@
 #include <sys/rnd.h>           /* XXX users bogusly transitively need this */
 
 #include <crypto/nist_ctr_drbg/nist_ctr_drbg.h>
+#include <crypto/hc128/hc128.h>
+#include <sys/percpu.h>
+#include <sys/intr.h>
 
 /*
  * NIST SP800-90 says 2^19 bytes per request for the CTR_DRBG.
  */
 #define CPRNG_MAX_LEN  524288
 
+#define CPRNGF_MAXBYTES           (512 * 1024 * 1024)
+#define CPRNGF_HARDMAX            (1 * 1024 * 1024 * 1024)
+#define CPRNGF_RESEED_SECONDS     600
+
+typedef struct  {
+        hc128_state_t   hc128;
+        int             numbytes;
+        time_t          nextreseed;
+} cprng_fast_ctx_t;
+
 /*
- * We do not want an arc4random() prototype available to anyone.
+ * This is a macro so we can skip any conditional logic at runtime if
+ * the size provided is a multiple of the underlying stream cipher
+ * blocksize, e.g. sizeof(padded struct).
  */
-void _arc4randbytes(void *, size_t);
-uint32_t _arc4random(void);
+#define cprng_fast(p, len) ((0 == ((len) % sizeof(uint32_t))) && \
+                           (0 == ((uintptr_t)(p) % sizeof(uint32_t))) ? \
+                           _cprng_fast_exact((p), (len)) : \
+                           _cprng_fast_inexact((p), (len)))
+
+size_t _cprng_fast_exact(void *, size_t);
+size_t _cprng_fast_inexact(void *, size_t);
 
-static inline size_t
-cprng_fast(void *p, size_t len)
+static inline void
+_cprng_fast_checkrekey(cprng_fast_ctx_t *ctx)
 {
-       _arc4randbytes(p, len);
-       return len;
+       extern void *cprng_fast_rekey_softintr;
+
+       if (__predict_false((ctx->numbytes > CPRNGF_MAXBYTES) ||
+                           (time_uptime > ctx->nextreseed))) {
+               /* Schedule a deferred reseed */
+               softint_schedule(cprng_fast_rekey_softintr);
+       }
 }
 
-static inline uint32_t
-cprng_fast32(void)
+static inline uint32_t cprng_fast32(void)
 {
-       return _arc4random();
+       uint32_t ret;
+       extern percpu_t *percpu_cprng_fast_ctx;
+       cprng_fast_ctx_t *ctx = percpu_getref(percpu_cprng_fast_ctx);
+       int s;
+
+       _cprng_fast_checkrekey(ctx);
+
+       s = splhigh();
+       hc128_extract(&ctx->hc128, (uint8_t *)&ret);
+       splx(s);
+
+       ctx->numbytes += sizeof(uint32_t);
+       percpu_putref(percpu_cprng_fast_ctx);
+       return ret;
 }
 
-static inline uint64_t
-cprng_fast64(void)
+static inline uint64_t cprng_fast64(void)
 {
-       uint64_t r;
-       _arc4randbytes(&r, sizeof(r));
-       return r;
+       uint64_t ret;
+       extern percpu_t *percpu_cprng_fast_ctx;
+       cprng_fast_ctx_t *ctx = percpu_getref(percpu_cprng_fast_ctx);
+       int s;
+
+       _cprng_fast_checkrekey(ctx);
+
+       s = splhigh();
+       hc128_extract(&ctx->hc128, (uint8_t *)&ret);
+       hc128_extract(&ctx->hc128, (uint8_t *)(((uint32_t *)&ret) + 1));
+       splx(s);
+
+       ctx->numbytes += sizeof(uint64_t);
+       percpu_putref(percpu_cprng_fast_ctx);
+       return ret;
 }
 
 typedef struct cprng_strong cprng_strong_t;
 
 void   cprng_init(void);
+void   cprng_fast_init(void);
 
 #define CPRNG_INIT_ANY         0x00000001
 #define CPRNG_REKEY_ANY                0x00000002


Home | Main Index | Thread Index | Old Index