tech-crypto archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

Patch: cprng_fast performance - please review.



On Wed, Apr 09, 2014 at 05:16:55PM +0100, Mindaugas Rasiukevicius wrote:
> 
> Perhaps I missed it, but were the benchmark results published?  Also, how
> about cprng_fast32/64()?  If the current mechanism is indeed faster, then
> I am glad that you have made an improvement.

To quote http://mail-index.netbsd.org/tech-crypto/2011/12/09/msg000538.html:

        This generator is approximately 20 times as fast as the old generator
        (dd with bs=64K yields 53MB/sec on 2Ghz Core2 instead of 2.5MB/sec)
        and also uses a separate mutex per instance so concurrency is greatly
        improved.

I made a number of other performance improvements after that, but unless you
insist let's leave off the subject of cprng_strong() for a bit.

Attached is a patch which makes cprng_fast per-CPU and lockless.  *IT IS NOT
WELL TESTED YET (I haven't even run test vectors) AND IS ONLY FOR REVIEW.*
The diff is against the head of the tls-earlyentropy branch.

I would *very* much appreciate comments of any kind on this patch.  Some
notes:

        1) This patch replaces RC4 with the HC-128 stream cipher.  HC-128
           is one of the selected eSTREAM ciphers.  It is mature, well
           analyzed, and produces an output stream of 32-bit values which
           makes it well suited to our application.

           Unlike many of the other eSTREAM final portfolio ciphers it
           appears to perform well in a fairly naive, portable C
           implementation.  Keying is slow but I've dealt with that.

           RC4 is at this point dangerously broken and HC-128 actually
           appears to be faster, too.

        2) This patch wraps the actual manipulation of the HC-128 state
           with splhigh().  The alternative would be to prohibit use of
           cprng_fast* from interrupt context.  Neither is great but either
           is better than a global spin mutex!  Comments on this in
           particular I would appreciate.  For reference, the operations in
           question cost about 4-6 cycles per output byte, and we don't
           usually ask for many output bytes, so we're not at IPL_HIGH
           for long.

        3) This patch replaces direct keying of cprng_fast from the
           entropy pool with indirect keying via the kernel_cprng.
           This is done from a soft interrupt which is scheduled when
           (long before, actually) rekeying is needed.  To make this work
           I had to adjust the initialization of the NFS filesystem
           module which was calling cprng_fast() *very* early.

        4) There are some moderately ugly macro tricks here to avoid
           copying output into alignment unless truly necessary and to avoid
           conditional execution at runtime.  I think they are worth it.
           The copy likely costs almost as much as the actual stream
           generation and we all know what branch mispredictions cost.

           The alignment math is expressed with / and % because I find it
           more clear but I've looked at the asm output and it appears the
           compiler does the obvious optimizations.

        5) The HC-128 implementation is Lucas Vella's from libestream,
           slightly reorganized and KNFed.

Patch is attached.  Thanks very much in advance for any comments.

Thor
? kern/.init_main.c.swp
? sys/.cprng.h.swo
Index: conf/files
===================================================================
RCS file: /cvsroot/src/sys/conf/files,v
retrieving revision 1.1090
diff -u -r1.1090 files
--- conf/files  1 Apr 2014 17:49:30 -0000       1.1090
+++ conf/files  17 Apr 2014 01:20:26 -0000
@@ -160,6 +160,7 @@
 include "crypto/rijndael/files.rijndael"
 include "crypto/skipjack/files.skipjack"
 include "crypto/camellia/files.camellia"
+include "crypto/hc128/files.hc128"
 # General-purpose crypto processing framework.
 include "opencrypto/files.opencrypto"
 
Index: kern/init_main.c
===================================================================
RCS file: /cvsroot/src/sys/kern/init_main.c,v
retrieving revision 1.454.2.1
diff -u -r1.454.2.1 init_main.c
--- kern/init_main.c    7 Apr 2014 02:20:00 -0000       1.454.2.1
+++ kern/init_main.c    17 Apr 2014 01:20:27 -0000
@@ -497,6 +497,8 @@
        /* Initialize the kernel strong PRNG. */
        kern_cprng = cprng_strong_create("kernel", IPL_VM,
                                         CPRNG_INIT_ANY|CPRNG_REKEY_ANY);
+
+       cprf_init();
                                         
        /* Initialize interfaces. */
        ifinit1();
Index: kern/subr_cprng.c
===================================================================
RCS file: /cvsroot/src/sys/kern/subr_cprng.c,v
retrieving revision 1.23
diff -u -r1.23 subr_cprng.c
--- kern/subr_cprng.c   17 Jan 2014 02:12:48 -0000      1.23
+++ kern/subr_cprng.c   17 Apr 2014 01:20:27 -0000
@@ -43,6 +43,7 @@
 #include <sys/kmem.h>
 #include <sys/lwp.h>
 #include <sys/once.h>
+#include <sys/percpu.h>
 #include <sys/poll.h>          /* XXX POLLIN/POLLOUT/&c. */
 #include <sys/select.h>
 #include <sys/systm.h>
@@ -54,6 +55,7 @@
 #endif
 
 #include <crypto/nist_ctr_drbg/nist_ctr_drbg.h>
+#include <crypto/hc128/hc128.h>
 
 #if defined(__HAVE_CPU_COUNTER)
 #include <machine/cpu_counter.h>
@@ -72,6 +74,13 @@
 
 static rndsink_callback_t      cprng_strong_rndsink_callback;
 
+percpu_t *percpu_cprf_ctx;
+static int cprf_initialized;
+
+static void cprf_randrekey(cprf_ctx_t *);
+
+void *cprf_rekey_softintr = NULL;
+
 void
 cprng_init(void)
 {
@@ -103,10 +112,11 @@
                return cpu_counter32();
 #endif
        if (__predict_false(cold)) {
+               static int ctr;
                /* microtime unsafe if clock not running yet */
-               return 0;
+               return ctr++;
        }
-       microtime(&tv);
+       getmicrotime(&tv);
        return (tv.tv_sec * 1000000 + tv.tv_usec);
 }
 
@@ -532,8 +542,16 @@
 }
 
 /*
- * sysctl helper routine for kern.arandom node. Picks a random number
- * for you.
+ * sysctl helper routine for kern.arandom node.  Fills the supplied
+ * structure with random data for you.
+ *
+ * This node was originally declared as type "int" but its implementation
+ * in OpenBSD, whence it came, would happily return up to 8K of data if
+ * requested.  Evidently this was used to key RC4 in userspace.
+ *
+ * In NetBSD, the libc stack-smash-protection code reads 64 bytes
+ * from here at every program startup.  So though it would be nice
+ * to make this node return only 32 or 64 bits, we can't.  Too bad!
  */
 static int
 sysctl_kern_arnd(SYSCTLFN_ARGS)
@@ -542,31 +560,143 @@
        void *v;
        struct sysctlnode node = *rnode;
 
-       if (*oldlenp == 0)
+       switch (*oldlenp) {
+           case 0:
                return 0;
+           default:
+               if (*oldlenp > 256) {
+                       return E2BIG;
+               }
+               v = kmem_alloc(*oldlenp, KM_SLEEP);
+               cprng_fast(v, *oldlenp);
+               node.sysctl_data = v;
+               node.sysctl_size = *oldlenp;
+               error = sysctl_lookup(SYSCTLFN_CALL(&node));
+               kmem_free(v, *oldlenp);
+               return error;
+       }
+}
+
+static void
+cprf_randrekey(cprf_ctx_t *ctx)
+{
+       uint8_t key[16], iv[16];
+       hc128_state_t tempstate;
+       int s;
+
+       int have_initial = rnd_initial_entropy;
+
+       cprng_strong(kern_cprng, key, sizeof(key), FASYNC);
+       cprng_strong(kern_cprng, iv, sizeof(iv), FASYNC);
+
+       /* Rekey the hc128 state - expensive, don't do this at splhigh.  */
+       hc128_init(&ctx->hc128, key, iv);
+       explicit_memset(key, 0, sizeof(key));
+       explicit_memset(iv, 0, sizeof(iv));
+
+       s = splhigh();
+       memcpy(&ctx->hc128, &tempstate, sizeof(tempstate));
+       splx(s);
+       
+       explicit_memset(&tempstate, 0, sizeof(tempstate));
+
        /*
-        * This code used to allow sucking 8192 bytes at a time out
-        * of the kernel arc4random generator.  Evidently there is some
-        * very old OpenBSD application code that may try to do this.
-        *
-        * Note that this node is documented as type "INT" -- 4 or 8
-        * bytes, not 8192.
-        *
-        * We continue to support this abuse of the "len" pointer here
-        * but only 256 bytes at a time, as, anecdotally, the actual
-        * application use here was to generate RC4 keys in userspace.
-        *
-        * Support for such large requests will probably be removed
-        * entirely in the future.
+        * Reset for next reseed cycle.
         */
-       if (*oldlenp > 256)
-               return E2BIG;
+       ctx->nextreseed = time_uptime +
+           (have_initial ? CPRF_RESEED_SECONDS : 0);
+       ctx->numbytes = 0;
+}
+
+static void
+cprf_init_ctx(void *v,
+             void *arg __unused,
+             struct cpu_info * ci __unused)
+{
+       cprf_ctx_t *ctx = v;
+       cprf_randrekey(ctx);
+}
+
+static void
+cprf_rekey_one(void *arg __unused)
+{
+       cprf_ctx_t *ctx = percpu_getref(percpu_cprf_ctx);
+       cprf_randrekey(ctx);
+}
+
+void
+cprf_init(void)
+{
+        percpu_cprf_ctx = percpu_alloc(sizeof(cprf_ctx_t));
+        percpu_foreach(percpu_cprf_ctx, cprf_init_ctx, NULL);
+       cprf_initialized++;
+       cprf_rekey_softintr = softint_establish(SOFTINT_CLOCK|SOFTINT_MPSAFE,
+                                               cprf_rekey_one, NULL);
+}
+
+size_t
+_cprng_fast_exact(void *p, size_t len)
+{
+       uint32_t *pi = p, *iter;
+       int s;
+       size_t ilen = len / sizeof(*pi);
+       cprf_ctx_t *ctx = percpu_getref(percpu_cprf_ctx);
+
+       KDASSERT(cprf_initialized);
+       KDASSERT(0 == ((ptrdiff_t)p % sizeof(uint32_t)));
+       KDASSERT(ilen * sizeof(*pi) == len);
+
+       _cprf_checkrekey(ctx);
+
+       s = splhigh();
+       for (iter = pi; iter < pi + ilen; iter++) {
+               hc128_extract(&ctx->hc128, (uint8_t *)iter);
+       }
+       splx(s);
+
+       ctx->numbytes += len;
+       percpu_putref(percpu_cprf_ctx);
+       return len;
+}
+
+size_t
+_cprng_fast_inexact(void *p, size_t len)
+{
+       uint8_t *pc = p;
+       uint32_t *pi = p, tmp, *iter;
+       int s;
+       size_t initial_len, aligned_len, final_len, main_len;
+       cprf_ctx_t *ctx = percpu_getref(percpu_cprf_ctx);
+
+       KDASSERT(cprf_initialized);
+
+       initial_len = sizeof(uint32_t) - ((ptrdiff_t)pc % sizeof(uint32_t));
+       aligned_len = len - initial_len;
+       final_len = aligned_len % sizeof(uint32_t);
+       main_len = aligned_len - final_len;
+
+       main_len /= sizeof(uint32_t);
+
+       _cprf_checkrekey(ctx);
+
+       s = splhigh();
+       if (initial_len) {
+               hc128_extract(&ctx->hc128, (uint8_t *)&tmp);
+               memcpy(pc, &tmp, initial_len);
+               pi = (uint32_t *)pc;
+       }
+
+       for (iter = pi; iter < pi + main_len ; iter++) {
+               hc128_extract(&ctx->hc128, (uint8_t *)iter);
+       }
+
+       if (final_len) {
+               hc128_extract(&ctx->hc128, (uint8_t *)&tmp);
+               memcpy(pi + main_len, &tmp, final_len);
+       }
+       splx(s);
 
-       v = kmem_alloc(*oldlenp, KM_SLEEP);
-       cprng_fast(v, *oldlenp);
-       node.sysctl_data = v;
-       node.sysctl_size = *oldlenp;
-       error = sysctl_lookup(SYSCTLFN_CALL(&node));
-       kmem_free(v, *oldlenp);
-       return error;
+       ctx->numbytes += len;
+       percpu_putref(percpu_cprf_ctx);
+       return len;
 }
Index: lib/libkern/Makefile.libkern
===================================================================
RCS file: /cvsroot/src/sys/lib/libkern/Makefile.libkern,v
retrieving revision 1.32.2.1
diff -u -r1.32.2.1 Makefile.libkern
--- lib/libkern/Makefile.libkern        7 Apr 2014 01:10:55 -0000       1.32.2.1
+++ lib/libkern/Makefile.libkern        17 Apr 2014 01:20:27 -0000
@@ -54,7 +54,7 @@
 SRCS+= bswap64.c
 .endif
 SRCS+= md4c.c md5c.c rmd160.c sha1.c sha2.c murmurhash.c
-SRCS+= pmatch.c arc4random.c bcd.c mcount.c mertwist.c crc32.c
+SRCS+= pmatch.c bcd.c mcount.c mertwist.c crc32.c
 
 SRCS+= ppath_kmem_alloc.c
 
Index: nfs/nfs_subs.c
===================================================================
RCS file: /cvsroot/src/sys/nfs/nfs_subs.c,v
retrieving revision 1.225
diff -u -r1.225 nfs_subs.c
--- nfs/nfs_subs.c      17 Mar 2014 09:35:24 -0000      1.225
+++ nfs/nfs_subs.c      17 Apr 2014 01:20:27 -0000
@@ -1489,7 +1489,6 @@
        nfs_ticks = (hz * NFS_TICKINTVL + 500) / 1000;
        if (nfs_ticks < 1)
                nfs_ticks = 1;
-       nfs_xid = cprng_fast32();
        nfsdreq_init();
 
        /*
@@ -1994,6 +1993,10 @@
 {
        u_int32_t newxid;
 
+       if (__predict_false(nfs_xid == 0)) {
+               nfs_xid = cprng_fast32();
+       }
+
        /* get next xid.  skip 0 */
        do {
                newxid = atomic_inc_32_nv(&nfs_xid);
Index: sys/cprng.h
===================================================================
RCS file: /cvsroot/src/sys/sys/cprng.h,v
retrieving revision 1.9
diff -u -r1.9 cprng.h
--- sys/cprng.h 17 Jan 2014 02:08:56 -0000      1.9
+++ sys/cprng.h 17 Apr 2014 01:20:27 -0000
@@ -41,42 +41,91 @@
 #include <sys/rnd.h>           /* XXX users bogusly transitively need this */
 
 #include <crypto/nist_ctr_drbg/nist_ctr_drbg.h>
+#include <crypto/hc128/hc128.h>
+#include <sys/percpu.h>
+#include <sys/intr.h>
 
 /*
  * NIST SP800-90 says 2^19 bytes per request for the CTR_DRBG.
  */
 #define CPRNG_MAX_LEN  524288
 
+#define CPRF_MAXBYTES           (512 * 1024 * 1024)
+#define CPRF_HARDMAX            (1 * 1024 * 1024 * 1024)
+#define CPRF_RESEED_SECONDS     600
+
+typedef struct  {
+        hc128_state_t   hc128;
+        int             numbytes;
+        time_t          nextreseed;
+} cprf_ctx_t;
+
 /*
- * We do not want an arc4random() prototype available to anyone.
+ * This is a macro so we can skip any conditional logic at runtime if
+ * the size provided is a multiple of the underlying stream cipher
+ * blocksize, e.g. sizeof(padded struct).
  */
-void _arc4randbytes(void *, size_t);
-uint32_t _arc4random(void);
+#define cprng_fast(p, len) ((0 == (len % sizeof(uint32_t))) && \
+                           (0 == ((ptrdiff_t)p % sizeof(uint32_t))) ? \
+                           _cprng_fast_exact(p, len) : \
+                           _cprng_fast_inexact(p, len))
+
+size_t _cprng_fast_exact(void *, size_t);
+size_t _cprng_fast_inexact(void *, size_t);
 
-static inline size_t
-cprng_fast(void *p, size_t len)
+static inline void
+_cprf_checkrekey(cprf_ctx_t *ctx)
 {
-       _arc4randbytes(p, len);
-       return len;
+       extern void *cprf_rekey_softintr;
+
+       if (__predict_false((ctx->numbytes > CPRF_MAXBYTES) ||
+                           (time_uptime > ctx->nextreseed))) {
+               /* Schedule a deferred reseed */
+               softint_schedule(cprf_rekey_softintr);
+       }
 }
 
-static inline uint32_t
-cprng_fast32(void)
+static inline uint32_t cprng_fast32(void)
 {
-       return _arc4random();
+       uint32_t ret;
+       extern percpu_t *percpu_cprf_ctx;
+       cprf_ctx_t *ctx = percpu_getref(percpu_cprf_ctx);
+       int s;
+
+       _cprf_checkrekey(ctx);
+
+       s = splhigh();
+       hc128_extract(&ctx->hc128, (uint8_t *)&ret);
+       splx(s);
+
+       ctx->numbytes += sizeof(uint32_t);
+       percpu_putref(percpu_cprf_ctx);
+       return ret;
 }
 
-static inline uint64_t
-cprng_fast64(void)
+static inline uint64_t cprng_fast64(void)
 {
-       uint64_t r;
-       _arc4randbytes(&r, sizeof(r));
-       return r;
+       uint64_t ret;
+       extern percpu_t *percpu_cprf_ctx;
+       cprf_ctx_t *ctx = percpu_getref(percpu_cprf_ctx);
+       int s;
+
+       _cprf_checkrekey(ctx);
+
+       s = splhigh();
+       hc128_extract(&ctx->hc128, (uint8_t *)&ret);
+       hc128_extract(&ctx->hc128, (uint8_t *)(((uint32_t *)&ret) + 1));
+       splx(s);
+
+       ctx->numbytes += sizeof(uint64_t);
+       percpu_putref(percpu_cprf_ctx);
+       return ret;
 }
 
 typedef struct cprng_strong cprng_strong_t;
 
 void   cprng_init(void);
+void   cprf_init(void);
 
 #define CPRNG_INIT_ANY         0x00000001
 #define CPRNG_REKEY_ANY                0x00000002


Home | Main Index | Thread Index | Old Index