Source-Changes-HG archive


[src-draft/trunk]: src/sys/crypto/aes/arch/aarch64 Vectorize AES-CBC on aarch64.



details:   https://anonhg.NetBSD.org/src-all/rev/d5bf2e196259
branches:  trunk
changeset: 934561:d5bf2e196259
user:      Taylor R Campbell <riastradh@NetBSD.org>
date:      Sat Jun 13 23:34:32 2020 +0000

description:
Vectorize AES-CBC on aarch64.

CBC encryption can't be parallelized across blocks, but we can at
least keep the chaining value in vector registers rather than loading
and storing it through memory on every block.
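
For context, a minimal C model of CBC is sketched below (illustration
only, not the committed code; aes_enc1, aes_dec1, and xor16 are
hypothetical stand-ins for the one-block primitives).  It shows why
encryption is serial while decryption parallelizes: ciphertext block
i-1 feeds the encryption of block i, but every decryption is
independent and only the final XOR needs the preceding ciphertext.

#include <stdint.h>
#include <string.h>

/* Hypothetical one-block primitives. */
void aes_enc1(const void *key, const uint8_t in[16], uint8_t out[16]);
void aes_dec1(const void *key, const uint8_t in[16], uint8_t out[16]);

static void
xor16(uint8_t *x, const uint8_t *a, const uint8_t *b)
{
        unsigned i;

        for (i = 0; i < 16; i++)
                x[i] = a[i] ^ b[i];
}

/*
 * CBC encrypt: serial -- the input of block i is its plaintext XORed
 * with the ciphertext of block i-1, so blocks cannot be processed in
 * parallel.
 */
static void
cbc_enc(const void *key, const uint8_t *in, uint8_t *out, size_t nblk,
    uint8_t iv[16])
{

        for (; nblk; nblk--, in += 16, out += 16) {
                xor16(iv, iv, in);
                aes_enc1(key, iv, out);
                memcpy(iv, out, 16);
        }
}

/*
 * CBC decrypt: every AES inversion is independent, so blocks can be
 * decrypted in batches (eight at a time in this commit) and XORed
 * with the preceding ciphertext afterward.  Assumes in and out do
 * not overlap.
 */
static void
cbc_dec(const void *key, const uint8_t *in, uint8_t *out, size_t nblk,
    uint8_t iv[16])
{
        size_t i;

        for (i = 0; i < nblk; i++)      /* parallelizable */
                aes_dec1(key, in + 16*i, out + 16*i);
        for (i = 0; i < nblk; i++)
                xor16(out + 16*i, out + 16*i, i ? in + 16*(i - 1) : iv);
        if (nblk)
                memcpy(iv, in + 16*(nblk - 1), 16);
}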

diffstat:

 sys/crypto/aes/arch/aarch64/aes_arm.c    |   30 +--
 sys/crypto/aes/arch/aarch64/aes_arm.h    |    7 +
 sys/crypto/aes/arch/aarch64/aesarmfunc.S |  220 +++++++++++++++++++++++++++++++
 3 files changed, 236 insertions(+), 21 deletions(-)

diffs (truncated from 306 to 300 lines):

diff -r b310f21a2976 -r d5bf2e196259 sys/crypto/aes/arch/aarch64/aes_arm.c
--- a/sys/crypto/aes/arch/aarch64/aes_arm.c     Sat Jun 13 16:43:32 2020 +0000
+++ b/sys/crypto/aes/arch/aarch64/aes_arm.c     Sat Jun 13 23:34:32 2020 +0000
@@ -88,35 +88,23 @@
 }
 
 static void
-aesarm_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16],
+aesarm_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16],
     uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
     uint32_t nrounds)
 {
 
        KASSERT(nbytes % 16 == 0);
 
-       for (; nbytes; nbytes -= 16, in += 16, out += 16) {
-               xor(iv, in, iv);
-               aesarm_enc(enc, iv, out, nrounds);
-               memcpy(iv, out, 16);
+       if (nbytes % 128) {
+               aesarm_cbc_dec1(dec, in, out, nbytes % 128, iv, nrounds);
+               in += nbytes % 128;
+               out += nbytes % 128;
+               nbytes -= nbytes % 128;
        }
-}
 
-static void
-aesarm_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16],
-    uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
-    uint32_t nrounds)
-{
-       uint8_t tmp[16];
-
-       KASSERT(nbytes % 16 == 0);
-
-       for (; nbytes; nbytes -= 16, in += 16, out += 16) {
-               memcpy(tmp, in, 16);
-               aesarm_dec(dec, in, out, nrounds);
-               xor(out, out, iv);
-               memcpy(iv, tmp, 16);
-       }
+       KASSERT(nbytes % 128 == 0);
+       if (nbytes)
+               aesarm_cbc_dec8(dec, in, out, nbytes, iv, nrounds);
 }
 
 static inline void
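
The new dispatch peels off the sub-128-byte head with the one-block
routine and hands the 128-byte-aligned remainder to the eight-block
routine.  For example, nbytes = 304 splits as 304 % 128 = 48:
aesarm_cbc_dec1 decrypts the first three blocks, leaving iv equal to
the third ciphertext block, and aesarm_cbc_dec8 then consumes the
remaining 256 bytes as two eight-block passes.  Note that both
CBC-decrypt routines below walk their range from the end backward,
which is why each stores the last ciphertext block of its range
through the iv pointer up front.
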
diff -r b310f21a2976 -r d5bf2e196259 sys/crypto/aes/arch/aarch64/aes_arm.h
--- a/sys/crypto/aes/arch/aarch64/aes_arm.h     Sat Jun 13 16:43:32 2020 +0000
+++ b/sys/crypto/aes/arch/aarch64/aes_arm.h     Sat Jun 13 23:34:32 2020 +0000
@@ -46,6 +46,13 @@
 void   aesarm_dec(const struct aesdec *, const uint8_t[static 16],
            uint8_t[static 16], uint32_t);
 
+void   aesarm_cbc_enc(const struct aesenc *, const uint8_t[static 16],
+           uint8_t[static 16], size_t, uint8_t[static 16], uint32_t);
+void   aesarm_cbc_dec1(const struct aesdec *, const uint8_t[static 16],
+           uint8_t[static 16], size_t, uint8_t[static 16], uint32_t);
+void   aesarm_cbc_dec8(const struct aesdec *, const uint8_t[static 128],
+           uint8_t[static 128], size_t, uint8_t[static 16], uint32_t);
+
 extern struct aes_impl aes_arm_impl;
 
 #endif /* _CRYPTO_AES_AES_ARCH_AARCH64_AES_ARM_H */
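
A side note on these prototypes: the C99 "[static 16]" array-parameter
syntax promises that the argument points to at least 16 valid bytes,
documenting the contract and letting compilers warn on short buffers.
A tiny self-contained illustration (not part of the commit):

#include <stdint.h>

/* C99 "[static 16]": buf must point to at least 16 valid bytes. */
static uint8_t
first_byte(const uint8_t buf[static 16])
{

        return buf[0];
}

int
main(void)
{
        uint8_t blk[16] = { 0 };

        return first_byte(blk);         /* OK: 16 elements available */
}
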
diff -r b310f21a2976 -r d5bf2e196259 sys/crypto/aes/arch/aarch64/aesarmfunc.S
--- a/sys/crypto/aes/arch/aarch64/aesarmfunc.S  Sat Jun 13 16:43:32 2020 +0000
+++ b/sys/crypto/aes/arch/aarch64/aesarmfunc.S  Sat Jun 13 23:34:32 2020 +0000
@@ -491,6 +491,132 @@
 END(aesarm_dec)
 
 /*
+ * aesarm_cbc_enc(const struct aesenc *enckey@x0, const uint8_t *in@x1,
+ *     uint8_t *out@x2, size_t nbytes@x3, uint8_t iv[16] @x4,
+ *     uint32_t nrounds@x5)
+ *
+ *     Encrypt a contiguous sequence of blocks with AES-CBC.
+ *
+ *     nbytes must be an integral multiple of 16.
+ *
+ *     Standard ABI calling convention.
+ */
+ENTRY(aesarm_cbc_enc)
+       stp     xzr, lr, [sp, #-16]!    /* push stack frame */
+       cbz     x3, 2f                  /* stop if nothing to do */
+       mov     x9, x0                  /* x9 := enckey */
+       mov     x10, x3                 /* x10 := nbytes */
+       ldr     q0, [x4]                /* q0 := chaining value */
+1:     ldr     q1, [x1], #0x10         /* q1 := plaintext block */
+       eor     v0.16b, v0.16b, v1.16b  /* q0 := cv ^ ptxt */
+       mov     x0, x9                  /* x0 := enckey */
+       mov     x3, x5                  /* x3 := nrounds */
+       bl      aesarm_enc1             /* q0 := ciphertext block */
+       subs    x10, x10, #0x10         /* count down nbytes */
+       str     q0, [x2], #0x10         /* store ciphertext block */
+       b.ne    1b                      /* repeat if x10 is nonzero */
+       str     q0, [x4]                /* store chaining value */
+2:     ldp     xzr, lr, [sp], #16      /* pop stack frame */
+       ret
+END(aesarm_cbc_enc)
+
+/*
+ * aesarm_cbc_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
+ *     uint8_t *out@x2, size_t nbytes@x3, uint8_t iv[16] @x4,
+ *     uint32_t nrounds@x5)
+ *
+ *     Decrypt a contiguous sequence of blocks with AES-CBC.
+ *
+ *     nbytes must be a positive integral multiple of 16.  This routine
+ *     handles one block at a time; use aesarm_cbc_dec8 for eight at once.
+ *
+ *     Standard ABI calling convention.
+ */
+ENTRY(aesarm_cbc_dec1)
+       stp     xzr, lr, [sp, #-32]!    /* push stack frame with uint128 */
+       ldr     q8, [x4]                /* q8 := iv */
+       str     q8, [sp, #16]           /* save iv */
+       mov     x9, x0                  /* x9 := deckey */
+       mov     x10, x3                 /* x10 := nbytes */
+       add     x1, x1, x3              /* x1 := pointer past end of in */
+       add     x2, x2, x3              /* x2 := pointer past end of out */
+       ldr     q0, [x1, #-0x10]!       /* q0 := last ciphertext block */
+       str     q0, [x4]                /* update iv */
+1:     mov     x0, x9                  /* x0 := deckey */
+       mov     x3, x5                  /* x3 := nrounds */
+       bl      aesarm_dec1             /* q0 := cv ^ ptxt; trash x0/x3 */
+       subs    x10, x10, #0x10         /* count down nbytes */
+       b.eq    2f                      /* stop if this is the first block */
+       ldr     q8, [x1, #-0x10]!       /* q8 := chaining value */
+       eor     v0.16b, v0.16b, v8.16b  /* q0 := plaintext block */
+       str     q0, [x2, #-0x10]!       /* store plaintext block */
+       mov     v0.16b, v8.16b          /* move cv = ciphertext block */
+       b       1b
+2:     ldr     q8, [sp, #16]           /* q8 := iv */
+       eor     v0.16b, v0.16b, v8.16b  /* q0 := first plaintext block */
+       str     q0, [x2, #-0x10]!       /* store first plaintext block */
+       ldp     xzr, lr, [sp], #32      /* pop stack frame */
+       ret
+END(aesarm_cbc_dec1)
+
+/*
+ * aesarm_cbc_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
+ *     uint8_t *out@x2, size_t nbytes@x3, uint8_t iv[16] @x4,
+ *     uint32_t nrounds@x5)
+ *
+ *     Decrypt a contiguous sequence of 8-block units with AES-CBC.
+ *
+ *     nbytes must be a positive integral multiple of 128.
+ *
+ *     Standard ABI calling convention.
+ */
+ENTRY(aesarm_cbc_dec8)
+       stp     xzr, lr, [sp, #-32]!    /* push stack frame with uint128 */
+       ldr     q8, [x4]                /* q8 := iv */
+       str     q8, [sp, #16]           /* save iv */
+       mov     x9, x0                  /* x9 := deckey */
+       mov     x10, x3                 /* x10 := nbytes */
+       add     x1, x1, x3              /* x1 := pointer past end of in */
+       add     x2, x2, x3              /* x2 := pointer past end of out */
+       ldp     q6, q7, [x1, #-0x20]!   /* q6, q7 := last ciphertext blocks */
+       str     q7, [x4]                /* update iv */
+1:     ldp     q4, q5, [x1, #-0x20]!
+       ldp     q2, q3, [x1, #-0x20]!
+       ldp     q0, q1, [x1, #-0x20]!
+       mov     v15.16b, v6.16b         /* q[8+i] := cv[i], 0<i<8 */
+       mov     v14.16b, v5.16b
+       mov     v13.16b, v4.16b
+       mov     v12.16b, v3.16b
+       mov     v11.16b, v2.16b
+       mov     v10.16b, v1.16b
+       mov     v9.16b, v0.16b
+       mov     x0, x9                  /* x0 := deckey */
+       mov     x3, x5                  /* x3 := nrounds */
+       bl      aesarm_dec8             /* q[i] := cv[i] ^ pt[i] */
+       eor     v7.16b, v7.16b, v15.16b /* q[i] := pt[i] */
+       eor     v6.16b, v6.16b, v14.16b
+       eor     v5.16b, v5.16b, v13.16b
+       eor     v4.16b, v4.16b, v12.16b
+       eor     v3.16b, v3.16b, v11.16b
+       eor     v2.16b, v2.16b, v10.16b
+       eor     v1.16b, v1.16b, v9.16b
+       subs    x10, x10, #0x80         /* count down nbytes */
+       stp     q6, q7, [x2, #-0x20]!   /* store plaintext blocks */
+       stp     q4, q5, [x2, #-0x20]!
+       stp     q2, q3, [x2, #-0x20]!
+       b.eq    2f                      /* stop if this is the first unit */
+       ldp     q6, q7, [x1, #-0x20]!
+       eor     v0.16b, v0.16b, v7.16b  /* q0 := pt0 */
+       stp     q0, q1, [x2, #-0x20]!
+       b       1b
+2:     ldr     q8, [sp, #16]           /* q8 := iv */
+       eor     v0.16b, v0.16b, v8.16b  /* q0 := pt0 */
+       stp     q0, q1, [x2, #-0x20]!   /* store first two plaintext blocks */
+       ldp     xzr, lr, [sp], #32      /* pop stack frame */
+       ret
+END(aesarm_cbc_dec8)
+
+/*
  * aesarm_enc1(const struct aesenc *enckey@x0,
  *     uint128_t block@q0, uint32_t nrounds@x3)
  *
@@ -516,6 +642,53 @@
 END(aesarm_enc1)
 
 /*
+ * aesarm_enc8(const struct aesenc *enckey@x0,
+ *     uint128_t block0@q0, ..., uint128_t block7@q7,
+ *     uint32_t nrounds@x3)
+ *
+ *     Encrypt eight AES blocks in q0 through q7 in parallel.
+ *
+ *     Internal ABI.  Uses q8 as temporary.  Destroys x0 and x3.
+ */
+       .text
+       _ALIGN_TEXT
+       .type   aesarm_enc8,@function
+aesarm_enc8:
+       ldr     q8, [x0], #0x10         /* load round key */
+1:     subs    x3, x3, #1
+       /* q[i] := ShiftRows(SubBytes(AddRoundKey_q8(q[i]))) */
+       aese    v0.16b, v8.16b
+       aese    v1.16b, v8.16b
+       aese    v2.16b, v8.16b
+       aese    v3.16b, v8.16b
+       aese    v4.16b, v8.16b
+       aese    v5.16b, v8.16b
+       aese    v6.16b, v8.16b
+       aese    v7.16b, v8.16b
+       ldr     q8, [x0], #0x10         /* load next round key */
+       b.eq    2f
+       /* q[i] := MixColumns(q[i]) */
+       aesmc   v0.16b, v0.16b
+       aesmc   v1.16b, v1.16b
+       aesmc   v2.16b, v2.16b
+       aesmc   v3.16b, v3.16b
+       aesmc   v4.16b, v4.16b
+       aesmc   v5.16b, v5.16b
+       aesmc   v6.16b, v6.16b
+       aesmc   v7.16b, v7.16b
+       b       1b
+2:     eor     v0.16b, v0.16b, v8.16b  /* AddRoundKey */
+       eor     v1.16b, v1.16b, v8.16b
+       eor     v2.16b, v2.16b, v8.16b
+       eor     v3.16b, v3.16b, v8.16b
+       eor     v4.16b, v4.16b, v8.16b
+       eor     v5.16b, v5.16b, v8.16b
+       eor     v6.16b, v6.16b, v8.16b
+       eor     v7.16b, v7.16b, v8.16b
+       ret
+END(aesarm_enc8)
+
+/*
  * aesarm_dec1(const struct aesdec *deckey@x0,
  *     uint128_t block@q0, uint32_t nrounds@x3)
  *
@@ -539,3 +712,50 @@
 2:     eor     v0.16b, v0.16b, v8.16b
        ret
 END(aesarm_dec1)
+
+/*
+ * aesarm_dec8(const struct aesdec *deckey@x0,
+ *     uint128_t block0@q0, ..., uint128_t block7@q7,
+ *     uint32_t nrounds@x3)
+ *
+ *     Decrypt eight AES blocks in q0 through q7 in parallel.
+ *
+ *     Internal ABI.  Uses q8 as temporary.  Destroys x0 and x3.
+ */
+       .text
+       _ALIGN_TEXT
+       .type   aesarm_dec8,@function
+aesarm_dec8:
+       ldr     q8, [x0], #0x10         /* load round key */
+1:     subs    x3, x3, #1
+       /* q[i] := InSubBytes(InShiftRows(AddRoundKey_q8(q[i]))) */
+       aesd    v0.16b, v8.16b
+       aesd    v1.16b, v8.16b
+       aesd    v2.16b, v8.16b
+       aesd    v3.16b, v8.16b
+       aesd    v4.16b, v8.16b
+       aesd    v5.16b, v8.16b
+       aesd    v6.16b, v8.16b
+       aesd    v7.16b, v8.16b
+       ldr     q8, [x0], #0x10         /* load next round key */
+       b.eq    2f
+       /* q[i] := InMixColumns(q[i]) */
+       aesimc  v0.16b, v0.16b
+       aesimc  v1.16b, v1.16b
+       aesimc  v2.16b, v2.16b
+       aesimc  v3.16b, v3.16b
+       aesimc  v4.16b, v4.16b
+       aesimc  v5.16b, v5.16b
+       aesimc  v6.16b, v6.16b
+       aesimc  v7.16b, v7.16b
+       b       1b
+2:     eor     v0.16b, v0.16b, v8.16b  /* AddRoundKey */
+       eor     v1.16b, v1.16b, v8.16b
+       eor     v2.16b, v2.16b, v8.16b
+       eor     v3.16b, v3.16b, v8.16b
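
The diff is cut off here by the 300-line truncation noted above, just
before aesarm_dec8's remaining AddRoundKey instructions.  For readers
mapping the aese/aesmc (and aesd/aesimc) pairs back to the AES round
structure, here is a one-block model using the ARMv8 Crypto Extensions
intrinsics (an illustrative sketch only; the commit uses hand-written
assembly, and rk stands in for the expanded key held in struct aesenc,
assumed to be nrounds + 1 round keys of 16 bytes each):

#include <arm_neon.h>           /* build with -march=armv8-a+crypto */
#include <stdint.h>

/*
 * One-block model of the aesarm_enc1/aesarm_enc8 round loop.  aese
 * performs AddRoundKey, then SubBytes and ShiftRows; aesmc is
 * MixColumns, which the last round skips before a final plain
 * AddRoundKey.
 */
static uint8x16_t
aes_enc_block(const uint8_t *rk, uint8x16_t block, uint32_t nrounds)
{
        uint32_t i;

        for (i = 0; i < nrounds - 1; i++) {
                block = vaeseq_u8(block, vld1q_u8(rk + 16*i));
                block = vaesmcq_u8(block);
        }
        block = vaeseq_u8(block, vld1q_u8(rk + 16*i));
        return veorq_u8(block, vld1q_u8(rk + 16*(i + 1)));
}

The decrypt path is the mirror image with vaesdq_u8/vaesimcq_u8,
matching the way aesarm_dec8 interleaves aesd and aesimc above.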


