Source-Changes-HG archive
[src-draft/trunk]: src/sys/crypto/aes/arch/aarch64 Vectorize AES-XTS on aarch64.
details: https://anonhg.NetBSD.org/src-all/rev/75dbad3a3753
branches: trunk
changeset: 934575:75dbad3a3753
user: Taylor R Campbell <riastradh%NetBSD.org@localhost>
date: Sun Jun 14 00:57:26 2020 +0000
description:
Vectorize AES-XTS on aarch64.
diffstat:
sys/crypto/aes/arch/aarch64/aes_arm.c | 102 ++++--------
sys/crypto/aes/arch/aarch64/aes_arm.h | 10 +
sys/crypto/aes/arch/aarch64/aesarmfunc.S | 257 +++++++++++++++++++++++++++++++
3 files changed, 301 insertions(+), 68 deletions(-)
diffs (truncated from 417 to 300 lines):
diff -r d2a7f58a15c1 -r 75dbad3a3753 sys/crypto/aes/arch/aarch64/aes_arm.c
--- a/sys/crypto/aes/arch/aarch64/aes_arm.c Sat Jun 13 23:34:32 2020 +0000
+++ b/sys/crypto/aes/arch/aarch64/aes_arm.c Sun Jun 14 00:57:26 2020 +0000
@@ -70,24 +70,6 @@
}
static void
-xor(uint8_t x[static 16],
- const uint8_t a[static 16], const uint8_t b[static 16])
-{
- uint64_t alo, ahi;
- uint64_t blo, bhi;
- uint64_t xlo, xhi;
-
- memcpy(&alo, a, 8);
- memcpy(&ahi, a + 8, 8);
- memcpy(&blo, b, 8);
- memcpy(&bhi, b + 8, 8);
- xlo = alo ^ blo;
- xhi = ahi ^ bhi;
- memcpy(x, &xlo, 8);
- memcpy(x + 8, &xhi, 8);
-}
-
-static void
aesarm_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16],
uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
uint32_t nrounds)
@@ -107,22 +89,44 @@
aesarm_cbc_dec8(dec, in, out, nbytes, iv, nrounds);
}
-static inline void
-aesarm_xts_update(const uint8_t in[static 16], uint8_t out[static 16])
+static void
+aesarm_xts_enc(const struct aesenc *enc, const uint8_t in[static 16],
+ uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
+ uint32_t nrounds)
{
- uint64_t t0, t1;
- unsigned s0, s1;
+
+ KASSERT(nbytes % 16 == 0);
+
+ if (nbytes % 128) {
+ aesarm_xts_enc1(enc, in, out, nbytes % 128, tweak, nrounds);
+ in += nbytes % 128;
+ out += nbytes % 128;
+ nbytes -= nbytes % 128;
+ }
- t0 = le64dec(in + 8*0);
- t1 = le64dec(in + 8*1);
+ KASSERT(nbytes % 128 == 0);
+ if (nbytes)
+ aesarm_xts_enc8(enc, in, out, nbytes, tweak, nrounds);
+}
+
+static void
+aesarm_xts_dec(const struct aesdec *dec, const uint8_t in[static 16],
+ uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
+ uint32_t nrounds)
+{
- s0 = t0 >> 63;
- s1 = t1 >> 63;
- t0 = (t0 << 1) ^ (-s1 & 0x87);
- t1 = (t1 << 1) ^ s0;
+ KASSERT(nbytes % 16 == 0);
- le64enc(out + 8*0, t0);
- le64enc(out + 8*1, t1);
+ if (nbytes % 128) {
+ aesarm_xts_dec1(dec, in, out, nbytes % 128, tweak, nrounds);
+ in += nbytes % 128;
+ out += nbytes % 128;
+ nbytes -= nbytes % 128;
+ }
+
+ KASSERT(nbytes % 128 == 0);
+ if (nbytes)
+ aesarm_xts_dec8(dec, in, out, nbytes, tweak, nrounds);
}
static int
@@ -155,44 +159,6 @@
return 0;
}
-static void
-aesarm_xts_enc(const struct aesenc *enc, const uint8_t in[static 16],
- uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
- uint32_t nrounds)
-{
- uint8_t tmp[16];
-
- KASSERT(nbytes % 16 == 0);
-
- for (; nbytes; nbytes -= 16, in += 16, out += 16) {
- xor(tmp, in, tweak);
- aesarm_enc(enc, tmp, tmp, nrounds);
- xor(out, tmp, tweak);
- aesarm_xts_update(tweak, tweak);
- }
-
- explicit_memset(tmp, 0, sizeof tmp);
-}
-
-static void
-aesarm_xts_dec(const struct aesdec *dec, const uint8_t in[static 16],
- uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
- uint32_t nrounds)
-{
- uint8_t tmp[16];
-
- KASSERT(nbytes % 16 == 0);
-
- for (; nbytes; nbytes -= 16, in += 16, out += 16) {
- xor(tmp, in, tweak);
- aesarm_dec(dec, tmp, tmp, nrounds);
- xor(out, tmp, tweak);
- aesarm_xts_update(tweak, tweak);
- }
-
- explicit_memset(tmp, 0, sizeof tmp);
-}
-
static int
aesarm_probe(void)
{
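The hunks above move the tweak update from portable C into a new assembly helper, aesarm_xts_mulx. The operation is multiplication by x in GF(2^128) modulo the XTS polynomial x^128 + x^7 + x^2 + x + 1, whose low byte is the 0x87 constant in the removed code. As a minimal standalone sketch, reconstructed in portable C from the removed aesarm_xts_update above (le64dec/le64enc are NetBSD's <sys/endian.h> little-endian accessors):

	/* needs <sys/types.h> and <sys/endian.h> */
	static void
	xts_mulx(const uint8_t in[static 16], uint8_t out[static 16])
	{
		uint64_t t0, t1;
		unsigned s0, s1;

		t0 = le64dec(in + 8*0);		/* low half of the tweak */
		t1 = le64dec(in + 8*1);		/* high half of the tweak */

		s0 = t0 >> 63;			/* carry from low into high */
		s1 = t1 >> 63;			/* carry out of the top bit */
		t0 = (t0 << 1) ^ (-s1 & 0x87);	/* shift; fold carry back in */
		t1 = (t1 << 1) ^ s0;

		le64enc(out + 8*0, t0);
		le64enc(out + 8*1, t1);
	}

The carry out of bit 127 cannot simply be discarded: XORing it back in as 0x87 is what reduces the doubled tweak modulo the field polynomial.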
diff -r d2a7f58a15c1 -r 75dbad3a3753 sys/crypto/aes/arch/aarch64/aes_arm.h
--- a/sys/crypto/aes/arch/aarch64/aes_arm.h Sat Jun 13 23:34:32 2020 +0000
+++ b/sys/crypto/aes/arch/aarch64/aes_arm.h Sun Jun 14 00:57:26 2020 +0000
@@ -53,6 +53,16 @@
void aesarm_cbc_dec8(const struct aesdec *, const uint8_t[static 128],
uint8_t[static 128], size_t, const uint8_t[static 16], uint32_t);
+void aesarm_xts_enc1(const struct aesenc *, const uint8_t[static 16],
+ uint8_t[static 16], size_t, uint8_t[static 16], uint32_t);
+void aesarm_xts_enc8(const struct aesenc *, const uint8_t[static 128],
+ uint8_t[static 128], size_t, const uint8_t[static 16], uint32_t);
+void aesarm_xts_dec1(const struct aesdec *, const uint8_t[static 16],
+ uint8_t[static 16], size_t, uint8_t[static 16], uint32_t);
+void aesarm_xts_dec8(const struct aesdec *, const uint8_t[static 128],
+ uint8_t[static 128], size_t, const uint8_t[static 16], uint32_t);
+void aesarm_xts_update(const uint8_t[static 16], uint8_t[static 16]);
+
extern struct aes_impl aes_arm_impl;
#endif /* _CRYPTO_AES_AES_ARCH_AARCH64_AES_ARM_H */
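For comparison, the scalar XTS loop deleted in the first hunk (with its small 64-bit xor helper folded in) computes, per 16-byte block, out = AES_k(in ^ tweak) ^ tweak and then advances the tweak by the multiplication sketched above. Reconstructed in portable C from the removed aesarm_xts_enc; aesarm_enc is the existing one-block encryption routine in aes_arm.c, and KASSERT/explicit_memset are the kernel's assertion and secure-zeroing primitives:

	static void
	xts_enc_scalar(const struct aesenc *enc, const uint8_t in[static 16],
	    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
	    uint32_t nrounds)
	{
		uint8_t tmp[16];
		unsigned i;

		KASSERT(nbytes % 16 == 0);

		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
			for (i = 0; i < 16; i++)	/* tmp := ptxt ^ tweak */
				tmp[i] = in[i] ^ tweak[i];
			aesarm_enc(enc, tmp, tmp, nrounds);
			for (i = 0; i < 16; i++)	/* out := AES(...) ^ tweak */
				out[i] = tmp[i] ^ tweak[i];
			xts_mulx(tweak, tweak);		/* tweak *= x, as above */
		}

		explicit_memset(tmp, 0, sizeof tmp);	/* scrub plaintext copy */
	}

The vectorized aesarm_xts_enc8/aesarm_xts_dec8 routines below do the same per-block work eight blocks per loop iteration, keeping the eight tweaks live in q8-q15 so the tweak chain is computed once per pass rather than once per block.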
diff -r d2a7f58a15c1 -r 75dbad3a3753 sys/crypto/aes/arch/aarch64/aesarmfunc.S
--- a/sys/crypto/aes/arch/aarch64/aesarmfunc.S Sat Jun 13 23:34:32 2020 +0000
+++ b/sys/crypto/aes/arch/aarch64/aesarmfunc.S Sun Jun 14 00:57:26 2020 +0000
@@ -613,6 +613,263 @@
END(aesarm_cbc_dec8)
/*
+ * aesarm_xts_enc1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
+ * uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
+ * uint32_t nrounds@x5)
+ *
+ * Encrypt a contiguous sequence of blocks with AES-XTS.
+ *
+ * nbytes must be a positive integral multiple of 16. This routine
+ * is not vectorized; use aesarm_xts_enc8 for >=8 blocks at once.
+ *
+ * Standard ABI calling convention.
+ */
+ENTRY(aesarm_xts_enc1)
+ stp fp, lr, [sp, #-16]! /* push stack frame */
+ mov fp, sp
+ mov x9, x0 /* x9 := enckey */
+ mov x10, x3 /* x10 := nbytes */
+ ldr q9, [x4] /* q9 := tweak */
+1: ldr q0, [x1], #0x10 /* q0 := ptxt */
+ mov x0, x9 /* x0 := enckey */
+ mov x3, x5 /* x3 := nrounds */
+ eor v0.16b, v0.16b, v9.16b /* q0 := ptxt ^ tweak */
+ bl aesarm_enc1 /* q0 := AES(ptxt ^ tweak) */
+ eor v0.16b, v0.16b, v9.16b /* q0 := AES(ptxt ^ tweak) ^ tweak */
+ str q0, [x2], #0x10 /* store ciphertext block */
+ bl aesarm_xts_mulx /* q9 *= x; trash x0/q0/q1 */
+ subs x10, x10, #0x10 /* count down nbytes */
+ b.ne 1b /* repeat if more blocks */
+ str q9, [x4] /* update tweak */
+ ldp fp, lr, [sp], #16 /* pop stack frame */
+ ret
+END(aesarm_xts_enc1)
+
+/*
+ * aesarm_xts_enc8(const struct aesenc *enckey@x0, const uint8_t *in@x1,
+ * uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
+ * uint32_t nrounds@x5)
+ *
+ * Encrypt a contiguous sequence of blocks with AES-XTS.
+ *
+ * nbytes must be a positive integral multiple of 128.
+ *
+ * Standard ABI calling convention.
+ */
+ENTRY(aesarm_xts_enc8)
+ stp fp, lr, [sp, #-48]! /* push stack frame with uint128[2] */
+ mov fp, sp
+ mov x9, x0 /* x9 := enckey */
+ mov x10, x3 /* x10 := nbytes */
+ ldr q9, [x4] /* q9 := tweak */
+1: str q9, [sp, #16] /* save tweak[0] */
+ bl aesarm_xts_mulx /* q9 *= x; trash x0/q0/q1 */
+ str q9, [sp, #32] /* save tweak[1] */
+ bl aesarm_xts_mulx /* q9 *= x; trash x0/q0/q1 */
+ mov v10.16b, v9.16b /* q10 := tweak[2] */
+ bl aesarm_xts_mulx /* q9 *= x; trash x0/q0/q1 */
+ mov v11.16b, v9.16b /* q11 := tweak[3] */
+ bl aesarm_xts_mulx /* q9 *= x; trash x0/q0/q1 */
+ mov v12.16b, v9.16b /* q12 := tweak[4] */
+ bl aesarm_xts_mulx /* q9 *= x; trash x0/q0/q1 */
+ mov v13.16b, v9.16b /* q13 := tweak[5] */
+ bl aesarm_xts_mulx /* q9 *= x; trash x0/q0/q1 */
+ mov v14.16b, v9.16b /* q14 := tweak[6] */
+ bl aesarm_xts_mulx /* q9 *= x; trash x0/q0/q1 */
+ mov v15.16b, v9.16b /* q15 := tweak[7] */
+ ldp q8, q9, [sp, #16] /* q8 := tweak[0], q9 := tweak[1] */
+ ldp q0, q1, [x1], #0x20 /* q[i] := pt[i] */
+ ldp q2, q3, [x1], #0x20
+ ldp q4, q5, [x1], #0x20
+ ldp q6, q7, [x1], #0x20
+ eor v0.16b, v0.16b, v8.16b /* q[i] := pt[i] ^ tweak[i] */
+ eor v1.16b, v1.16b, v9.16b
+ eor v2.16b, v2.16b, v10.16b
+ eor v3.16b, v3.16b, v11.16b
+ eor v4.16b, v4.16b, v12.16b
+ eor v5.16b, v5.16b, v13.16b
+ eor v6.16b, v6.16b, v14.16b
+ eor v7.16b, v7.16b, v15.16b
+ mov x0, x9 /* x0 := enckey */
+ mov x3, x5 /* x3 := nrounds */
+ bl aesarm_enc8 /* encrypt q0,...,q7; trash x0/x3/q8 */
+ ldr q8, [sp, #16] /* reload q8 := tweak[0] */
+ eor v1.16b, v1.16b, v9.16b /* q[i] := AES(...) ^ tweak[i] */
+ eor v2.16b, v2.16b, v10.16b
+ eor v3.16b, v3.16b, v11.16b
+ eor v0.16b, v0.16b, v8.16b
+ eor v4.16b, v4.16b, v12.16b
+ eor v5.16b, v5.16b, v13.16b
+ eor v6.16b, v6.16b, v14.16b
+ eor v7.16b, v7.16b, v15.16b
+ stp q0, q1, [x2], #0x20 /* store ciphertext blocks */
+ stp q2, q3, [x2], #0x20 /* store ciphertext blocks */
+ stp q4, q5, [x2], #0x20 /* store ciphertext blocks */
+ stp q6, q7, [x2], #0x20 /* store ciphertext blocks */
+ mov v9.16b, v15.16b /* q9 := q15 = tweak[7] */
+ bl aesarm_xts_mulx /* q9 *= x; trash x0/q0/q1 */
+ subs x10, x10, #0x80 /* count down nbytes */
+ b.ne 1b /* repeat if more block groups */
+ str q9, [x4] /* update tweak */
+ ldp fp, lr, [sp], #48 /* pop stack frame */
+ ret
+END(aesarm_xts_enc8)
+
+/*
+ * aesarm_xts_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
+ * uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
+ * uint32_t nrounds@x5)
+ *
+ * Decrypt a contiguous sequence of blocks with AES-XTS.
+ *
+ * nbytes must be a positive integral multiple of 16. This routine
+ * is not vectorized; use aesarm_xts_dec8 for >=8 blocks at once.
+ *
+ * Standard ABI calling convention.
+ */
+ENTRY(aesarm_xts_dec1)
+ stp fp, lr, [sp, #-16]! /* push stack frame */
+ mov fp, sp
+ mov x9, x0 /* x9 := deckey */
+ mov x10, x3 /* x10 := nbytes */
+ ldr q9, [x4] /* q9 := tweak */
+1: ldr q0, [x1], #0x10 /* q0 := ctxt */
+ mov x0, x9 /* x0 := deckey */
+ mov x3, x5 /* x3 := nrounds */
+ eor v0.16b, v0.16b, v9.16b /* q0 := ctxt ^ tweak */
+ bl aesarm_dec1 /* q0 := AES^-1(ctxt ^ tweak) */
+ eor v0.16b, v0.16b, v9.16b /* q0 := AES^-1(ctxt ^ tweak) ^ tweak */
+ str q0, [x2], #0x10 /* store plaintext block */
+ bl aesarm_xts_mulx /* q9 *= x; trash x0/q0/q1 */
+ subs x10, x10, #0x10 /* count down nbytes */
+ b.ne 1b /* repeat if more blocks */
+ str q9, [x4] /* update tweak */
+ ldp fp, lr, [sp], #16 /* pop stack frame */
+ ret
+END(aesarm_xts_dec1)
+
+/*
+ * aesarm_xts_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
+ * uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
+ * uint32_t nrounds@x5)
+ *
+ * Decrypt a contiguous sequence of blocks with AES-XTS.
+ *
+ * nbytes must be a positive integral multiple of 128.