Source-Changes-HG archive


[src-draft/trunk]: src/sys/crypto/aes/arch/aarch64 Vectorize AES-XTS on aarch64.



details:   https://anonhg.NetBSD.org/src-all/rev/75dbad3a3753
branches:  trunk
changeset: 934575:75dbad3a3753
user:      Taylor R Campbell <riastradh@NetBSD.org>
date:      Sun Jun 14 00:57:26 2020 +0000

description:
Vectorize AES-XTS on aarch64.
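
For reference, the tweak update at the heart of XTS is multiplication by x
in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1.  A minimal C sketch of that
step, equivalent to the scalar aesarm_xts_update removed below and to what
the new aesarm_xts_mulx assembly helper computes on q9 (the name xts_mulx
is illustrative, not from this change; le64dec/le64enc are the byte-order
helpers already used in the removed code):

#include <stdint.h>
#include <sys/endian.h>		/* le64dec, le64enc */

static inline void
xts_mulx(uint8_t tweak[static 16])
{
	uint64_t t0 = le64dec(tweak + 8*0);	/* low 64 bits */
	uint64_t t1 = le64dec(tweak + 8*1);	/* high 64 bits */
	unsigned s0 = t0 >> 63;		/* carry out of the low half */
	unsigned s1 = t1 >> 63;		/* carry out of the high half */

	/* Shift left one bit; fold the carry out of the top back in as 0x87. */
	t0 = (t0 << 1) ^ (-(uint64_t)s1 & 0x87);
	t1 = (t1 << 1) ^ s0;

	le64enc(tweak + 8*0, t0);
	le64enc(tweak + 8*1, t1);
}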

diffstat:

 sys/crypto/aes/arch/aarch64/aes_arm.c    |  102 ++++--------
 sys/crypto/aes/arch/aarch64/aes_arm.h    |   10 +
 sys/crypto/aes/arch/aarch64/aesarmfunc.S |  257 +++++++++++++++++++++++++++++++
 3 files changed, 301 insertions(+), 68 deletions(-)

diffs (truncated from 417 to 300 lines):

diff -r d2a7f58a15c1 -r 75dbad3a3753 sys/crypto/aes/arch/aarch64/aes_arm.c
--- a/sys/crypto/aes/arch/aarch64/aes_arm.c     Sat Jun 13 23:34:32 2020 +0000
+++ b/sys/crypto/aes/arch/aarch64/aes_arm.c     Sun Jun 14 00:57:26 2020 +0000
@@ -70,24 +70,6 @@
 }
 
 static void
-xor(uint8_t x[static 16],
-    const uint8_t a[static 16], const uint8_t b[static 16])
-{
-       uint64_t alo, ahi;
-       uint64_t blo, bhi;
-       uint64_t xlo, xhi;
-
-       memcpy(&alo, a, 8);
-       memcpy(&ahi, a + 8, 8);
-       memcpy(&blo, b, 8);
-       memcpy(&bhi, b + 8, 8);
-       xlo = alo ^ blo;
-       xhi = ahi ^ bhi;
-       memcpy(x, &xlo, 8);
-       memcpy(x + 8, &xhi, 8);
-}
-
-static void
 aesarm_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16],
     uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
     uint32_t nrounds)
@@ -107,22 +89,44 @@
                aesarm_cbc_dec8(dec, in, out, nbytes, iv, nrounds);
 }
 
-static inline void
-aesarm_xts_update(const uint8_t in[static 16], uint8_t out[static 16])
+static void
+aesarm_xts_enc(const struct aesenc *enc, const uint8_t in[static 16],
+    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
+    uint32_t nrounds)
 {
-       uint64_t t0, t1;
-       unsigned s0, s1;
+
+       KASSERT(nbytes % 16 == 0);
+
+       if (nbytes % 128) {
+               aesarm_xts_enc1(enc, in, out, nbytes % 128, tweak, nrounds);
+               in += nbytes % 128;
+               out += nbytes % 128;
+               nbytes -= nbytes % 128;
+       }
 
-       t0 = le64dec(in + 8*0);
-       t1 = le64dec(in + 8*1);
+       KASSERT(nbytes % 128 == 0);
+       if (nbytes)
+               aesarm_xts_enc8(enc, in, out, nbytes, tweak, nrounds);
+}
+
+static void
+aesarm_xts_dec(const struct aesdec *dec, const uint8_t in[static 16],
+    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
+    uint32_t nrounds)
+{
 
-       s0 = t0 >> 63;
-       s1 = t1 >> 63;
-       t0 = (t0 << 1) ^ (-s1 & 0x87);
-       t1 = (t1 << 1) ^ s0;
+       KASSERT(nbytes % 16 == 0);
 
-       le64enc(out + 8*0, t0);
-       le64enc(out + 8*1, t1);
+       if (nbytes % 128) {
+               aesarm_xts_dec1(dec, in, out, nbytes % 128, tweak, nrounds);
+               in += nbytes % 128;
+               out += nbytes % 128;
+               nbytes -= nbytes % 128;
+       }
+
+       KASSERT(nbytes % 128 == 0);
+       if (nbytes)
+               aesarm_xts_dec8(dec, in, out, nbytes, tweak, nrounds);
 }
 
 static int
@@ -155,44 +159,6 @@
        return 0;
 }
 
-static void
-aesarm_xts_enc(const struct aesenc *enc, const uint8_t in[static 16],
-    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
-    uint32_t nrounds)
-{
-       uint8_t tmp[16];
-
-       KASSERT(nbytes % 16 == 0);
-
-       for (; nbytes; nbytes -= 16, in += 16, out += 16) {
-               xor(tmp, in, tweak);
-               aesarm_enc(enc, tmp, tmp, nrounds);
-               xor(out, tmp, tweak);
-               aesarm_xts_update(tweak, tweak);
-       }
-
-       explicit_memset(tmp, 0, sizeof tmp);
-}
-
-static void
-aesarm_xts_dec(const struct aesdec *dec, const uint8_t in[static 16],
-    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
-    uint32_t nrounds)
-{
-       uint8_t tmp[16];
-
-       KASSERT(nbytes % 16 == 0);
-
-       for (; nbytes; nbytes -= 16, in += 16, out += 16) {
-               xor(tmp, in, tweak);
-               aesarm_dec(dec, tmp, tmp, nrounds);
-               xor(out, tmp, tweak);
-               aesarm_xts_update(tweak, tweak);
-       }
-
-       explicit_memset(tmp, 0, sizeof tmp);
-}
-
 static int
 aesarm_probe(void)
 {
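
In the rewritten dispatch above, the sub-128-byte head goes through the
one-block routine and the 128-byte-aligned bulk through the eight-block
routine; because the one-block routine stores the running tweak back
through its tweak argument, the bulk call continues the tweak sequence
where the head left off.  A minimal sketch of the split arithmetic
(xts_split is an illustrative name, not part of this change):

#include <assert.h>
#include <stddef.h>

/* Illustrative: how aesarm_xts_enc/aesarm_xts_dec divide nbytes. */
static void
xts_split(size_t nbytes, size_t *headp, size_t *bulkp)
{
	assert(nbytes % 16 == 0);	/* whole 16-byte blocks only */
	*headp = nbytes % 128;		/* 0-7 blocks: one-block routine */
	*bulkp = nbytes - *headp;	/* 8 blocks at a time: 8-block routine */
}

For example, 144 bytes split as head 16 (one block) and bulk 128 (eight
blocks), while a 512-byte sector goes entirely through the 8-block path.
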
diff -r d2a7f58a15c1 -r 75dbad3a3753 sys/crypto/aes/arch/aarch64/aes_arm.h
--- a/sys/crypto/aes/arch/aarch64/aes_arm.h     Sat Jun 13 23:34:32 2020 +0000
+++ b/sys/crypto/aes/arch/aarch64/aes_arm.h     Sun Jun 14 00:57:26 2020 +0000
@@ -53,6 +53,16 @@
 void   aesarm_cbc_dec8(const struct aesdec *, const uint8_t[static 128],
            uint8_t[static 128], size_t, const uint8_t[static 16], uint32_t);
 
+void   aesarm_xts_enc1(const struct aesenc *, const uint8_t[static 16],
+           uint8_t[static 16], size_t, uint8_t[static 16], uint32_t);
+void   aesarm_xts_enc8(const struct aesenc *, const uint8_t[static 128],
+           uint8_t[static 128], size_t, const uint8_t[static 16], uint32_t);
+void   aesarm_xts_dec1(const struct aesdec *, const uint8_t[static 16],
+           uint8_t[static 16], size_t, uint8_t[static 16], uint32_t);
+void   aesarm_xts_dec8(const struct aesdec *, const uint8_t[static 128],
+           uint8_t[static 128], size_t, const uint8_t[static 16], uint32_t);
+void   aesarm_xts_update(const uint8_t[static 16], uint8_t[static 16]);
+
 extern struct aes_impl aes_arm_impl;
 
 #endif /* _CRYPTO_AES_AES_ARCH_AARCH64_AES_ARM_H */
diff -r d2a7f58a15c1 -r 75dbad3a3753 sys/crypto/aes/arch/aarch64/aesarmfunc.S
--- a/sys/crypto/aes/arch/aarch64/aesarmfunc.S  Sat Jun 13 23:34:32 2020 +0000
+++ b/sys/crypto/aes/arch/aarch64/aesarmfunc.S  Sun Jun 14 00:57:26 2020 +0000
@@ -613,6 +613,263 @@
 END(aesarm_cbc_dec8)
 
 /*
+ * aesarm_xts_enc1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
+ *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
+ *     uint32_t nrounds@x5)
+ *
+ *     Encrypt a contiguous sequence of blocks with AES-XTS.
+ *
+ *     nbytes must be a positive integral multiple of 16.  This routine
+ *     is not vectorized; use aesarm_xts_enc8 for >=8 blocks at once.
+ *
+ *     Standard ABI calling convention.
+ */
+ENTRY(aesarm_xts_enc1)
+       stp     fp, lr, [sp, #-16]!     /* push stack frame */
+       mov     fp, sp
+       mov     x9, x0                  /* x9 := enckey */
+       mov     x10, x3                 /* x10 := nbytes */
+       ldr     q9, [x4]                /* q9 := tweak */
+1:     ldr     q0, [x1], #0x10         /* q0 := ptxt */
+       mov     x0, x9                  /* x0 := enckey */
+       mov     x3, x5                  /* x3 := nrounds */
+       eor     v0.16b, v0.16b, v9.16b  /* q0 := ptxt ^ tweak */
+       bl      aesarm_enc1             /* q0 := AES(ptxt ^ tweak) */
+       eor     v0.16b, v0.16b, v9.16b  /* q0 := AES(ptxt ^ tweak) ^ tweak */
+       str     q0, [x2], #0x10         /* store ciphertext block */
+       bl      aesarm_xts_mulx         /* q9 *= x; trash x0/q0/q1 */
+       subs    x10, x10, #0x10         /* count down nbytes */
+       b.ne    1b                      /* repeat if more blocks */
+       str     q9, [x4]                /* update tweak */
+       ldp     fp, lr, [sp], #16       /* pop stack frame */
+       ret
+END(aesarm_xts_enc1)
+
+/*
+ * aesarm_xts_enc8(const struct aesenc *enckey@x0, const uint8_t *in@x1,
+ *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
+ *     uint32_t nrounds@x5)
+ *
+ *     Encrypt a contiguous sequence of blocks with AES-XTS.
+ *
+ *     nbytes must be a positive integral multiple of 128.
+ *
+ *     Standard ABI calling convention.
+ */
+ENTRY(aesarm_xts_enc8)
+       stp     fp, lr, [sp, #-48]!     /* push stack frame with uint128[2] space */
+       mov     fp, sp
+       mov     x9, x0                  /* x9 := enckey */
+       mov     x10, x3                 /* x10 := nbytes */
+       ldr     q9, [x4]                /* q9 := tweak */
+1:     str     q9, [sp, #16]           /* save tweak[0] */
+       bl      aesarm_xts_mulx         /* q9 *= x; trash x0/q0/q1 */
+       str     q9, [sp, #32]           /* save tweak[1] */
+       bl      aesarm_xts_mulx         /* q9 *= x; trash x0/q0/q1 */
+       mov     v10.16b, v9.16b         /* q10 := tweak[2] */
+       bl      aesarm_xts_mulx         /* q9 *= x; trash x0/q0/q1 */
+       mov     v11.16b, v9.16b         /* q11 := tweak[3] */
+       bl      aesarm_xts_mulx         /* q9 *= x; trash x0/q0/q1 */
+       mov     v12.16b, v9.16b         /* q12 := tweak[4] */
+       bl      aesarm_xts_mulx         /* q9 *= x; trash x0/q0/q1 */
+       mov     v13.16b, v9.16b         /* q13 := tweak[5] */
+       bl      aesarm_xts_mulx         /* q9 *= x; trash x0/q0/q1 */
+       mov     v14.16b, v9.16b         /* q14 := tweak[6] */
+       bl      aesarm_xts_mulx         /* q9 *= x; trash x0/q0/q1 */
+       mov     v15.16b, v9.16b         /* q15 := tweak[7] */
+       ldp     q8, q9, [sp, #16]       /* q8 := tweak[0], q9 := tweak[1] */
+       ldp     q0, q1, [x1], #0x20     /* q[i] := pt[i] */
+       ldp     q2, q3, [x1], #0x20
+       ldp     q4, q5, [x1], #0x20
+       ldp     q6, q7, [x1], #0x20
+       eor     v0.16b, v0.16b, v8.16b  /* q[i] := pt[i] ^ tweak[i] */
+       eor     v1.16b, v1.16b, v9.16b
+       eor     v2.16b, v2.16b, v10.16b
+       eor     v3.16b, v3.16b, v11.16b
+       eor     v4.16b, v4.16b, v12.16b
+       eor     v5.16b, v5.16b, v13.16b
+       eor     v6.16b, v6.16b, v14.16b
+       eor     v7.16b, v7.16b, v15.16b
+       mov     x0, x9                  /* x0 := enckey */
+       mov     x3, x5                  /* x3 := nrounds */
+       bl      aesarm_enc8             /* encrypt q0,...,q7; trash x0/x3/q8 */
+       ldr     q8, [sp, #16]           /* reload q8 := tweak[0] */
+       eor     v1.16b, v1.16b, v9.16b  /* q[i] := AES(...) ^ tweak[i] */
+       eor     v2.16b, v2.16b, v10.16b
+       eor     v3.16b, v3.16b, v11.16b
+       eor     v0.16b, v0.16b, v8.16b
+       eor     v4.16b, v4.16b, v12.16b
+       eor     v5.16b, v5.16b, v13.16b
+       eor     v6.16b, v6.16b, v14.16b
+       eor     v7.16b, v7.16b, v15.16b
+       stp     q0, q1, [x2], #0x20     /* store ciphertext blocks */
+       stp     q2, q3, [x2], #0x20     /* store ciphertext blocks */
+       stp     q4, q5, [x2], #0x20     /* store ciphertext blocks */
+       stp     q6, q7, [x2], #0x20     /* store ciphertext blocks */
+       mov     v9.16b, v15.16b         /* q9 := q15 = tweak[7] */
+       bl      aesarm_xts_mulx         /* q9 *= x; trash x0/q0/q1 */
+       subs    x10, x10, #0x80         /* count down nbytes */
+       b.ne    1b                      /* repeat if more block groups */
+       str     q9, [x4]                /* update tweak */
+       ldp     fp, lr, [sp], #48       /* pop stack frame */
+       ret
+END(aesarm_xts_enc8)
+
+/*
+ * aesarm_xts_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
+ *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
+ *     uint32_t nrounds@x5)
+ *
+ *     Decrypt a contiguous sequence of blocks with AES-XTS.
+ *
+ *     nbytes must be a positive integral multiple of 16.  This routine
+ *     is not vectorized; use aesarm_xts_dec8 for >=8 blocks at once.
+ *
+ *     Standard ABI calling convention.
+ */
+ENTRY(aesarm_xts_dec1)
+       stp     fp, lr, [sp, #-16]!     /* push stack frame */
+       mov     fp, sp
+       mov     x9, x0                  /* x9 := deckey */
+       mov     x10, x3                 /* x10 := nbytes */
+       ldr     q9, [x4]                /* q9 := tweak */
+1:     ldr     q0, [x1], #0x10         /* q0 := ctxt */
+       mov     x0, x9                  /* x0 := deckey */
+       mov     x3, x5                  /* x3 := nrounds */
+       eor     v0.16b, v0.16b, v9.16b  /* q0 := ctxt ^ tweak */
+       bl      aesarm_dec1             /* q0 := AES^-1(ctxt ^ tweak) */
+       eor     v0.16b, v0.16b, v9.16b  /* q0 := AES^-1(ctxt ^ tweak) ^ tweak */
+       str     q0, [x2], #0x10         /* store plaintext block */
+       bl      aesarm_xts_mulx         /* q9 *= x; trash x0/q0/q1 */
+       subs    x10, x10, #0x10         /* count down nbytes */
+       b.ne    1b                      /* repeat if more blocks */
+       str     q9, [x4]                /* update tweak */
+       ldp     fp, lr, [sp], #16       /* pop stack frame */
+       ret
+END(aesarm_xts_dec1)
+
+/*
+ * aesarm_xts_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
+ *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
+ *     uint32_t nrounds@x5)
+ *
+ *     Decrypt a contiguous sequence of blocks with AES-XTS.
+ *
+ *     nbytes must be a positive integral multiple of 128.


