Source-Changes-HG archive
[src/trunk]: src/sys/crypto/aes/arch/arm Draft 2x vectorized neon vpaes for a...
details: https://anonhg.NetBSD.org/src/rev/d4a30223e41e
branches: trunk
changeset: 936532:d4a30223e41e
user: riastradh <riastradh%NetBSD.org@localhost>
date: Tue Jul 28 20:11:09 2020 +0000
description:
Draft 2x vectorized neon vpaes for aarch64.
Gives a modest speed boost on rk3399 (Cortex-A53/A72), around 20% in
cgd tests, for parallelizable operations like CBC decryption; the same
improvement should probably carry over to the rpi4 CPU, which lacks
ARMv8.0-AES.
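CBC decryption parallelizes because each plaintext block depends only on
ciphertext, P[i] = Dec(C[i]) ^ C[i-1], so pairs of blocks can be pushed
through the interleaved pipeline together.  As a rough illustration of the
pairing, here is a simplified forward-walking sketch built on the
aes_neon_dec2 entry point added below; it assumes nbytes is a nonzero
multiple of 32 and reuses the loadblock/storeblock helpers from
aes_neon_subr.c, and the function name is illustrative only (the committed
aes_neon_cbc_dec instead walks the buffer from the end and handles an odd
trailing 16-byte block separately):

	/* Sketch only, not the committed code. */
	static void
	cbc_dec2_sketch(const struct aesdec *dec, const uint8_t *in,
	    uint8_t *out, size_t nbytes, uint8_t iv[16], unsigned nrounds)
	{
		uint8x16_t cv = loadblock(iv);	/* C[i-1], initially the IV */
		size_t off;

		for (off = 0; off < nbytes; off += 32) {
			uint8x16x2_t b2;
			uint8x16_t c0 = loadblock(in + off);
			uint8x16_t c1 = loadblock(in + off + 16);

			b2.val[0] = c0;
			b2.val[1] = c1;
			b2 = aes_neon_dec2(dec, b2, nrounds);
			/* P[i] = Dec(C[i]) ^ C[i-1], P[i+1] = Dec(C[i+1]) ^ C[i] */
			storeblock(out + off, b2.val[0] ^ cv);
			storeblock(out + off + 16, b2.val[1] ^ c0);
			cv = c1;
		}
		storeblock(iv, cv);	/* last ciphertext block chains the next call */
	}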
diffstat:
sys/crypto/aes/arch/arm/aes_neon.c | 113 +++++++++++++++++++++++++++++++-
sys/crypto/aes/arch/arm/aes_neon_impl.h | 31 ++++++++-
sys/crypto/aes/arch/arm/aes_neon_subr.c | 105 +++++++++++++++++++++++++----
sys/crypto/aes/arch/arm/arm_neon.h | 4 +-
4 files changed, 233 insertions(+), 20 deletions(-)
diffs (truncated from 386 to 300 lines):
diff -r f1b25b09d6b6 -r d4a30223e41e sys/crypto/aes/arch/arm/aes_neon.c
--- a/sys/crypto/aes/arch/arm/aes_neon.c Tue Jul 28 20:08:48 2020 +0000
+++ b/sys/crypto/aes/arch/arm/aes_neon.c Tue Jul 28 20:11:09 2020 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: aes_neon.c,v 1.3 2020/06/30 20:32:11 riastradh Exp $ */
+/* $NetBSD: aes_neon.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $ */
/*-
* Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -39,7 +39,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.3 2020/06/30 20:32:11 riastradh Exp $");
+__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $");
#include <sys/types.h>
@@ -589,6 +589,59 @@
return vqtbl1q_u8(x, sr[rmod4]);
}
+uint8x16x2_t
+aes_neon_enc2(const struct aesenc *enc, uint8x16x2_t x, unsigned nrounds)
+{
+ const uint32_t *rk32 = enc->aese_aes.aes_rk;
+ uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
+ uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
+ uint8x16_t sb1_0 = ((const volatile uint8x16_t *)sb1)[0];
+ uint8x16_t sb1_1 = ((const volatile uint8x16_t *)sb1)[1];
+ uint8x16_t sb2_0 = ((const volatile uint8x16_t *)sb2)[0];
+ uint8x16_t sb2_1 = ((const volatile uint8x16_t *)sb2)[1];
+ uint8x16_t x0 = x.val[0], x1 = x.val[1];
+ uint8x16_t io0, jo0, io1, jo1;
+ unsigned rmod4 = 0;
+
+ x0 = aes_schedule_transform(x0, ipt);
+ x1 = aes_schedule_transform(x1, ipt);
+ x0 ^= loadroundkey(rk32);
+ x1 ^= loadroundkey(rk32);
+ for (;;) {
+ uint8x16_t A_0, A2_0, A2_B_0, A2_B_D_0;
+ uint8x16_t A_1, A2_1, A2_B_1, A2_B_D_1;
+
+ subbytes(&io0, &jo0, x0, inv_, inva_);
+ subbytes(&io1, &jo1, x1, inv_, inva_);
+
+ rk32 += 4;
+ rmod4 = (rmod4 + 1) % 4;
+ if (--nrounds == 0)
+ break;
+
+ A_0 = vqtbl1q_u8(sb1_0, io0) ^ vqtbl1q_u8(sb1_1, jo0);
+ A_1 = vqtbl1q_u8(sb1_0, io1) ^ vqtbl1q_u8(sb1_1, jo1);
+ A_0 ^= loadroundkey(rk32);
+ A_1 ^= loadroundkey(rk32);
+ A2_0 = vqtbl1q_u8(sb2_0, io0) ^ vqtbl1q_u8(sb2_1, jo0);
+ A2_1 = vqtbl1q_u8(sb2_0, io1) ^ vqtbl1q_u8(sb2_1, jo1);
+ A2_B_0 = A2_0 ^ vqtbl1q_u8(A_0, mc_forward[rmod4]);
+ A2_B_1 = A2_1 ^ vqtbl1q_u8(A_1, mc_forward[rmod4]);
+ A2_B_D_0 = A2_B_0 ^ vqtbl1q_u8(A_0, mc_backward[rmod4]);
+ A2_B_D_1 = A2_B_1 ^ vqtbl1q_u8(A_1, mc_backward[rmod4]);
+ x0 = A2_B_D_0 ^ vqtbl1q_u8(A2_B_0, mc_forward[rmod4]);
+ x1 = A2_B_D_1 ^ vqtbl1q_u8(A2_B_1, mc_forward[rmod4]);
+ }
+ x0 = vqtbl1q_u8(sbo[0], io0) ^ vqtbl1q_u8(sbo[1], jo0);
+ x1 = vqtbl1q_u8(sbo[0], io1) ^ vqtbl1q_u8(sbo[1], jo1);
+ x0 ^= loadroundkey(rk32);
+ x1 ^= loadroundkey(rk32);
+ return (uint8x16x2_t) { .val = {
+ [0] = vqtbl1q_u8(x0, sr[rmod4]),
+ [1] = vqtbl1q_u8(x1, sr[rmod4]),
+ } };
+}
+
uint8x16_t
aes_neon_dec1(const struct aesdec *dec, uint8x16_t x, unsigned nrounds)
{
@@ -628,4 +681,60 @@
return vqtbl1q_u8(x, sr[i]);
}
+uint8x16x2_t
+aes_neon_dec2(const struct aesdec *dec, uint8x16x2_t x, unsigned nrounds)
+{
+ const uint32_t *rk32 = dec->aesd_aes.aes_rk;
+ unsigned i = 3 & ~(nrounds - 1);
+ uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
+ uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
+ uint8x16_t x0 = x.val[0], x1 = x.val[1];
+ uint8x16_t io0, jo0, io1, jo1, mc;
+
+ x0 = aes_schedule_transform(x0, dipt);
+ x1 = aes_schedule_transform(x1, dipt);
+ x0 ^= loadroundkey(rk32);
+ x1 ^= loadroundkey(rk32);
+ rk32 += 4;
+
+ mc = mc_forward[3];
+ for (;;) {
+ subbytes(&io0, &jo0, x0, inv_, inva_);
+ subbytes(&io1, &jo1, x1, inv_, inva_);
+ if (--nrounds == 0)
+ break;
+
+ x0 = vqtbl1q_u8(dsb9[0], io0) ^ vqtbl1q_u8(dsb9[1], jo0);
+ x1 = vqtbl1q_u8(dsb9[0], io1) ^ vqtbl1q_u8(dsb9[1], jo1);
+ x0 ^= loadroundkey(rk32);
+ x1 ^= loadroundkey(rk32);
+ rk32 += 4; /* next round key */
+
+ x0 = vqtbl1q_u8(x0, mc);
+ x1 = vqtbl1q_u8(x1, mc);
+ x0 ^= vqtbl1q_u8(dsbd[0], io0) ^ vqtbl1q_u8(dsbd[1], jo0);
+ x1 ^= vqtbl1q_u8(dsbd[0], io1) ^ vqtbl1q_u8(dsbd[1], jo1);
+
+ x0 = vqtbl1q_u8(x0, mc);
+ x1 = vqtbl1q_u8(x1, mc);
+ x0 ^= vqtbl1q_u8(dsbb[0], io0) ^ vqtbl1q_u8(dsbb[1], jo0);
+ x1 ^= vqtbl1q_u8(dsbb[0], io1) ^ vqtbl1q_u8(dsbb[1], jo1);
+
+ x0 = vqtbl1q_u8(x0, mc);
+ x1 = vqtbl1q_u8(x1, mc);
+ x0 ^= vqtbl1q_u8(dsbe[0], io0) ^ vqtbl1q_u8(dsbe[1], jo0);
+ x1 ^= vqtbl1q_u8(dsbe[0], io1) ^ vqtbl1q_u8(dsbe[1], jo1);
+
+ mc = vextq_u8(mc, mc, 12);
+ }
+ x0 = vqtbl1q_u8(dsbo[0], io0) ^ vqtbl1q_u8(dsbo[1], jo0);
+ x1 = vqtbl1q_u8(dsbo[0], io1) ^ vqtbl1q_u8(dsbo[1], jo1);
+ x0 ^= loadroundkey(rk32);
+ x1 ^= loadroundkey(rk32);
+ return (uint8x16x2_t) { .val = {
+ [0] = vqtbl1q_u8(x0, sr[i]),
+ [1] = vqtbl1q_u8(x1, sr[i]),
+ } };
+}
+
#endif
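The 2x routines above keep the vpaes round structure of the 1x versions
but carry two independent states through every step, so the long chains of
dependent vqtbl1q_u8 lookups for the two blocks can overlap in the
pipeline.  A hypothetical caller holding two unrelated blocks would use
the entry point like this (blk0/blk1 are placeholder variables; the real
callers are in aes_neon_subr.c below):

	/* Hypothetical usage sketch of the 2x encrypt entry point. */
	uint8x16x2_t b2 = { .val = { blk0, blk1 } };

	b2 = aes_neon_enc2(enc, b2, nrounds);
	blk0 = b2.val[0];
	blk1 = b2.val[1];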
diff -r f1b25b09d6b6 -r d4a30223e41e sys/crypto/aes/arch/arm/aes_neon_impl.h
--- a/sys/crypto/aes/arch/arm/aes_neon_impl.h Tue Jul 28 20:08:48 2020 +0000
+++ b/sys/crypto/aes/arch/arm/aes_neon_impl.h Tue Jul 28 20:11:09 2020 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: aes_neon_impl.h,v 1.1 2020/06/29 23:56:31 riastradh Exp $ */
+/* $NetBSD: aes_neon_impl.h,v 1.2 2020/07/28 20:11:09 riastradh Exp $ */
/*-
* Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -39,4 +39,33 @@
uint8x16_t aes_neon_enc1(const struct aesenc *, uint8x16_t, unsigned);
uint8x16_t aes_neon_dec1(const struct aesdec *, uint8x16_t, unsigned);
+#ifdef __aarch64__
+
+uint8x16x2_t aes_neon_enc2(const struct aesenc *, uint8x16x2_t, unsigned);
+uint8x16x2_t aes_neon_dec2(const struct aesdec *, uint8x16x2_t, unsigned);
+
+#else
+
+static inline uint8x16x2_t
+aes_neon_enc2(const struct aesenc *enc, uint8x16x2_t b2, unsigned nrounds)
+{
+
+ return (uint8x16x2_t) { .val = {
+ [0] = aes_neon_enc1(enc, b2.val[0], nrounds),
+ [1] = aes_neon_enc1(enc, b2.val[1], nrounds),
+ } };
+}
+
+static inline uint8x16x2_t
+aes_neon_dec2(const struct aesdec *dec, uint8x16x2_t b2, unsigned nrounds)
+{
+
+ return (uint8x16x2_t) { .val = {
+ [0] = aes_neon_dec1(dec, b2.val[0], nrounds),
+ [1] = aes_neon_dec1(dec, b2.val[1], nrounds),
+ } };
+}
+
+#endif
+
#endif /* _CRYPTO_AES_ARCH_ARM_AES_NEON_IMPL_H */
diff -r f1b25b09d6b6 -r d4a30223e41e sys/crypto/aes/arch/arm/aes_neon_subr.c
--- a/sys/crypto/aes/arch/arm/aes_neon_subr.c Tue Jul 28 20:08:48 2020 +0000
+++ b/sys/crypto/aes/arch/arm/aes_neon_subr.c Tue Jul 28 20:11:09 2020 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: aes_neon_subr.c,v 1.3 2020/07/25 22:36:06 riastradh Exp $ */
+/* $NetBSD: aes_neon_subr.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $ */
/*-
* Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -27,7 +27,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(1, "$NetBSD: aes_neon_subr.c,v 1.3 2020/07/25 22:36:06 riastradh Exp $");
+__KERNEL_RCSID(1, "$NetBSD: aes_neon_subr.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $");
#include <sys/endian.h>
@@ -111,14 +111,33 @@
cv = loadblock(in + nbytes - 16);
storeblock(iv, cv);
- for (;;) {
+ if (nbytes % 32) {
+ KASSERT(nbytes % 32 == 16);
b = aes_neon_dec1(dec, cv, nrounds);
if ((nbytes -= 16) == 0)
- break;
+ goto out;
cv = loadblock(in + nbytes - 16);
- storeblock(out + nbytes, b ^ cv);
+ storeblock(out + nbytes, cv ^ b);
}
- storeblock(out, b ^ iv0);
+
+ for (;;) {
+ uint8x16x2_t b2;
+
+ KASSERT(nbytes >= 32);
+
+ b2.val[1] = cv;
+ b2.val[0] = cv = loadblock(in + nbytes - 32);
+ b2 = aes_neon_dec2(dec, b2, nrounds);
+ storeblock(out + nbytes - 16, cv ^ b2.val[1]);
+ if ((nbytes -= 32) == 0) {
+ b = b2.val[0];
+ goto out;
+ }
+ cv = loadblock(in + nbytes - 16);
+ storeblock(out + nbytes, cv ^ b2.val[0]);
+ }
+
+out: storeblock(out, b ^ iv0);
}
static inline uint8x16_t
@@ -186,11 +205,28 @@
KASSERT(nbytes % 16 == 0);
t = loadblock(tweak);
- for (; nbytes; nbytes -= 16, in += 16, out += 16) {
+ if (nbytes % 32) {
+ KASSERT(nbytes % 32 == 16);
b = t ^ loadblock(in);
b = aes_neon_enc1(enc, b, nrounds);
storeblock(out, t ^ b);
t = aes_neon_xts_update(t);
+ nbytes -= 16;
+ in += 16;
+ out += 16;
+ }
+ for (; nbytes; nbytes -= 32, in += 32, out += 32) {
+ uint8x16_t t1;
+ uint8x16x2_t b2;
+
+ t1 = aes_neon_xts_update(t);
+ b2.val[0] = t ^ loadblock(in);
+ b2.val[1] = t1 ^ loadblock(in + 16);
+ b2 = aes_neon_enc2(enc, b2, nrounds);
+ storeblock(out, b2.val[0] ^ t);
+ storeblock(out + 16, b2.val[1] ^ t1);
+
+ t = aes_neon_xts_update(t1);
}
storeblock(tweak, t);
}
@@ -206,11 +242,28 @@
KASSERT(nbytes % 16 == 0);
t = loadblock(tweak);
- for (; nbytes; nbytes -= 16, in += 16, out += 16) {
+ if (nbytes % 32) {
+ KASSERT(nbytes % 32 == 16);
b = t ^ loadblock(in);
b = aes_neon_dec1(dec, b, nrounds);
storeblock(out, t ^ b);
t = aes_neon_xts_update(t);
+ nbytes -= 16;
+ in += 16;
+ out += 16;
+ }
+ for (; nbytes; nbytes -= 32, in += 32, out += 32) {
+ uint8x16_t t1;
+ uint8x16x2_t b2;
+
+ t1 = aes_neon_xts_update(t);
+ b2.val[0] = t ^ loadblock(in);
+ b2.val[1] = t1 ^ loadblock(in + 16);
+ b2 = aes_neon_dec2(dec, b2, nrounds);
+ storeblock(out, b2.val[0] ^ t);
+ storeblock(out + 16, b2.val[1] ^ t1);
+
+ t = aes_neon_xts_update(t1);
}
storeblock(tweak, t);
}
@@ -262,11 +315,16 @@
ctr_be = loadblock(authctr + 16);
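Returning to the XTS hunks above: the tweak for the second block of each
pair, t1, is computed before the 2x call, and the next iteration's tweak
is then derived from t1, so the two AES evaluations in a pair stay
independent.  aes_neon_xts_update is presumed to be the usual XTS tweak
doubling, multiplication by x in GF(2^128) with reduction modulo
x^128 + x^7 + x^2 + x + 1; a portable C sketch of that operation, with
illustrative names, for reference:

	#include <stdint.h>	/* or <sys/types.h> in the kernel */

	/*
	 * Multiply the 128-bit little-endian tweak by x in GF(2^128),
	 * reducing modulo x^128 + x^7 + x^2 + x + 1.  Sketch only; the
	 * committed aes_neon_xts_update does this with NEON intrinsics.
	 */
	static void
	xts_update_sketch(uint8_t t[16])
	{
		unsigned carry = 0, i;

		for (i = 0; i < 16; i++) {
			unsigned b = t[i];

			t[i] = (uint8_t)((b << 1) | carry);
			carry = b >> 7;
		}
		if (carry)
			t[0] ^= 0x87;	/* x^7 + x^2 + x + 1 */
	}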