Source-Changes-HG archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[src-draft/trunk]: src/sys/crypto/aes/arch/aarch64 Vectorize AES-CBC on aarch64.
details: https://anonhg.NetBSD.org/src-all/rev/d2a7f58a15c1
branches: trunk
changeset: 934574:d2a7f58a15c1
user: Taylor R Campbell <riastradh%NetBSD.org@localhost>
date: Sat Jun 13 23:34:32 2020 +0000
description:
Vectorize AES-CBC on aarch64.
Can't parallelize encryption but can at least keep the state in
vector registers rather than loading and storing memory.
diffstat:
sys/crypto/aes/arch/aarch64/aes_arm.c | 30 +--
sys/crypto/aes/arch/aarch64/aes_arm.h | 7 +
sys/crypto/aes/arch/aarch64/aesarmfunc.S | 223 +++++++++++++++++++++++++++++++
3 files changed, 239 insertions(+), 21 deletions(-)
diffs (truncated from 309 to 300 lines):
diff -r b7efbff87b38 -r d2a7f58a15c1 sys/crypto/aes/arch/aarch64/aes_arm.c
--- a/sys/crypto/aes/arch/aarch64/aes_arm.c Sat Jun 13 16:43:32 2020 +0000
+++ b/sys/crypto/aes/arch/aarch64/aes_arm.c Sat Jun 13 23:34:32 2020 +0000
@@ -88,35 +88,23 @@
}
static void /* CBC-decrypt nbytes of in to out: odd leading blocks via aesarm_cbc_dec1, the rest via aesarm_cbc_dec8 */
-aesarm_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16],
+aesarm_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16],
uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
uint32_t nrounds)
{
KASSERT(nbytes % 16 == 0); /* whole 16-byte AES blocks only */
- for (; nbytes; nbytes -= 16, in += 16, out += 16) {
- xor(iv, in, iv);
- aesarm_enc(enc, iv, out, nrounds);
- memcpy(iv, out, 16);
+ if (nbytes % 128) { /* leading blocks that do not fill a 128-byte (8-block) unit */
+ aesarm_cbc_dec1(dec, in, out, nbytes % 128, iv, nrounds); /* also leaves the last ciphertext block it consumed in iv */
+ in += nbytes % 128; /* step past the part just decrypted */
+ out += nbytes % 128;
+ nbytes -= nbytes % 128; /* what remains is a multiple of 128 */
}
-}
-static void
-aesarm_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16],
- uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
- uint32_t nrounds)
-{
- uint8_t tmp[16];
-
- KASSERT(nbytes % 16 == 0);
-
- for (; nbytes; nbytes -= 16, in += 16, out += 16) {
- memcpy(tmp, in, 16);
- aesarm_dec(dec, in, out, nrounds);
- xor(out, out, iv);
- memcpy(iv, tmp, 16);
- }
+ KASSERT(nbytes % 128 == 0); /* only whole 8-block units are left */
+ if (nbytes) /* aesarm_cbc_dec8 is documented to require a positive nbytes */
+ aesarm_cbc_dec8(dec, in, out, nbytes, iv, nrounds);
}
static inline void
diff -r b7efbff87b38 -r d2a7f58a15c1 sys/crypto/aes/arch/aarch64/aes_arm.h
--- a/sys/crypto/aes/arch/aarch64/aes_arm.h Sat Jun 13 16:43:32 2020 +0000
+++ b/sys/crypto/aes/arch/aarch64/aes_arm.h Sat Jun 13 23:34:32 2020 +0000
@@ -46,6 +46,13 @@
void aesarm_dec(const struct aesdec *, const uint8_t[static 16],
uint8_t[static 16], uint32_t);
+/*
+ * CBC entry points implemented in aesarmfunc.S. Arguments, in order:
+ * key schedule, input, output, byte count, iv, number of rounds.
+ * All three routines store a new chaining value through the iv
+ * pointer, so iv is not const-qualified in any of them.
+ */
+void aesarm_cbc_enc(const struct aesenc *, const uint8_t[static 16],
+ uint8_t[static 16], size_t, uint8_t[static 16], uint32_t);
+void aesarm_cbc_dec1(const struct aesdec *, const uint8_t[static 16],
+ uint8_t[static 16], size_t, uint8_t[static 16], uint32_t);
+void aesarm_cbc_dec8(const struct aesdec *, const uint8_t[static 128],
+ uint8_t[static 128], size_t, uint8_t[static 16], uint32_t);
+
extern struct aes_impl aes_arm_impl;
#endif /* _CRYPTO_AES_AES_ARCH_AARCH64_AES_ARM_H */
diff -r b7efbff87b38 -r d2a7f58a15c1 sys/crypto/aes/arch/aarch64/aesarmfunc.S
--- a/sys/crypto/aes/arch/aarch64/aesarmfunc.S Sat Jun 13 16:43:32 2020 +0000
+++ b/sys/crypto/aes/arch/aarch64/aesarmfunc.S Sat Jun 13 23:34:32 2020 +0000
@@ -484,6 +484,135 @@
END(aesarm_dec)
/*
+ * aesarm_cbc_enc(const struct aesenc *enckey@x0, const uint8_t *in@x1,
+ * uint8_t *out@x2, size_t nbytes@x3, uint8_t iv[16] @x4,
+ * uint32_t nrounds@x5)
+ *
+ * Encrypt a contiguous sequence of blocks with AES-CBC.
+ *
+ * nbytes must be an integral multiple of 16. If nbytes is zero the
+ * routine returns at once without reading or writing iv. Otherwise
+ * iv supplies the initial chaining value and is overwritten with the
+ * last ciphertext block before returning.
+ *
+ * The chaining value lives in q0 for the whole loop: each ciphertext
+ * block produced by aesarm_enc1 is stored and then reused as the
+ * chaining value for the next block.
+ *
+ * Standard ABI calling convention.
+ *
+ * NOTE(review): aesarm_enc1 is not visible in this excerpt. This
+ * loop assumes it preserves x1, x2, x4, x5, x9 and x10 -- confirm.
+ * If it uses q8 as a temporary the way aesarm_enc8 documents, this
+ * routine clobbers a register whose low 64 bits AAPCS64 requires a
+ * callee to preserve -- confirm.
+ */
+ENTRY(aesarm_cbc_enc)
+ cbz x3, 2f /* stop if nothing to do */
+ stp fp, lr, [sp, #-16]! /* push stack frame */
+ mov fp, sp
+ mov x9, x0 /* x9 := enckey (x0 is reloaded per call) */
+ mov x10, x3 /* x10 := nbytes (x3 is reloaded per call) */
+ ldr q0, [x4] /* q0 := chaining value */
+1: ldr q1, [x1], #0x10 /* q1 := plaintext block; advance in */
+ eor v0.16b, v0.16b, v1.16b /* q0 := cv ^ ptxt */
+ mov x0, x9 /* x0 := enckey */
+ mov x3, x5 /* x3 := nrounds */
+ bl aesarm_enc1 /* q0 := ciphertext block */
+ subs x10, x10, #0x10 /* count down nbytes */
+ str q0, [x2], #0x10 /* store ciphertext block; advance out */
+ b.ne 1b /* repeat if x10 is nonzero */
+ str q0, [x4] /* store chaining value */
+ ldp fp, lr, [sp], #16 /* pop stack frame */
+2: ret
+END(aesarm_cbc_enc)
+
+/*
+ * aesarm_cbc_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
+ * uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4,
+ * uint32_t nrounds@x5)
+ *
+ * Decrypt a contiguous sequence of blocks with AES-CBC.
+ *
+ * nbytes must be a positive integral multiple of 16. This routine
+ * is not vectorized; use aesarm_cbc_dec8 for >=8 blocks at once.
+ *
+ * Blocks are processed from the last to the first. The incoming iv
+ * is stashed in the stack frame because [x4] is overwritten up front
+ * with the last ciphertext block, and q8 is reused inside the loop
+ * and by aesarm_dec1. Note that iv is therefore written even though
+ * the signature above marks it const.
+ *
+ * Standard ABI calling convention. AAPCS64 makes the low 64 bits of
+ * v8-v15 callee-saved, and q8 is scratch both here and in
+ * aesarm_dec1, so d8 is saved on entry and restored before return.
+ */
+ENTRY(aesarm_cbc_dec1)
+ stp fp, lr, [sp, #-48]! /* push stack frame with uint128, d8 */
+ mov fp, sp
+ str d8, [sp, #32] /* save callee-saved d8 */
+ ldr q8, [x4] /* q8 := iv */
+ str q8, [sp, #16] /* save iv */
+ mov x9, x0 /* x9 := deckey */
+ mov x10, x3 /* x10 := nbytes */
+ add x1, x1, x3 /* x1 := pointer past end of in */
+ add x2, x2, x3 /* x2 := pointer past end of out */
+ ldr q0, [x1, #-0x10]! /* q0 := last ciphertext block */
+ str q0, [x4] /* update iv */
+1: mov x0, x9 /* x0 := deckey */
+ mov x3, x5 /* x3 := nrounds */
+ bl aesarm_dec1 /* q0 := cv ^ ptxt; trash x0/x3/q8 */
+ subs x10, x10, #0x10 /* count down nbytes */
+ b.eq 2f /* stop if this is the first block */
+ ldr q8, [x1, #-0x10]! /* q8 := chaining value */
+ eor v0.16b, v0.16b, v8.16b /* q0 := plaintext block */
+ str q0, [x2, #-0x10]! /* store plaintext block */
+ mov v0.16b, v8.16b /* move cv = ciphertext block */
+ b 1b
+2: ldr q8, [sp, #16] /* q8 := iv */
+ eor v0.16b, v0.16b, v8.16b /* q0 := first plaintext block */
+ str q0, [x2, #-0x10]! /* store first plaintext block */
+ ldr d8, [sp, #32] /* restore callee-saved d8 */
+ ldp fp, lr, [sp], #48 /* pop stack frame */
+ ret
+END(aesarm_cbc_dec1)
+
+/*
+ * aesarm_cbc_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
+ * uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4,
+ * uint32_t nrounds@x5)
+ *
+ * Decrypt a contiguous sequence of 8-block units with AES-CBC.
+ *
+ * nbytes must be a positive integral multiple of 128.
+ *
+ * Units are processed from the last to the first. Within a unit,
+ * q9-q15 keep copies of ciphertext blocks 0-6, which are the chaining
+ * values for blocks 1-7. Block 0 of a unit is finished only once the
+ * last ciphertext block of the preceding unit has been loaded, or,
+ * for the first unit, with the iv stashed in the stack frame. The
+ * stash is needed because [x4] is overwritten up front with the last
+ * ciphertext block and q8 is aesarm_dec8's temporary. Note that iv
+ * is therefore written even though the signature above marks it
+ * const.
+ *
+ * Standard ABI calling convention. AAPCS64 makes the low 64 bits of
+ * v8-v15 callee-saved, and q8-q15 are all used as scratch, so d8-d15
+ * are saved on entry and restored before return.
+ */
+ENTRY(aesarm_cbc_dec8)
+ stp fp, lr, [sp, #-96]! /* push stack frame with uint128, d8-d15 */
+ mov fp, sp
+ stp d8, d9, [sp, #32] /* save callee-saved d8-d15 */
+ stp d10, d11, [sp, #48]
+ stp d12, d13, [sp, #64]
+ stp d14, d15, [sp, #80]
+ ldr q8, [x4] /* q8 := iv */
+ str q8, [sp, #16] /* save iv */
+ mov x9, x0 /* x9 := deckey */
+ mov x10, x3 /* x10 := nbytes */
+ add x1, x1, x3 /* x1 := pointer past end of in */
+ add x2, x2, x3 /* x2 := pointer past end of out */
+ ldp q6, q7, [x1, #-0x20]! /* q6, q7 := last ciphertext blocks */
+ str q7, [x4] /* update iv */
+1: ldp q4, q5, [x1, #-0x20]! /* q0-q5 := rest of this unit's ctxt */
+ ldp q2, q3, [x1, #-0x20]!
+ ldp q0, q1, [x1, #-0x20]!
+ mov v15.16b, v6.16b /* q[8+i] := cv[i], 0<i<8 */
+ mov v14.16b, v5.16b
+ mov v13.16b, v4.16b
+ mov v12.16b, v3.16b
+ mov v11.16b, v2.16b
+ mov v10.16b, v1.16b
+ mov v9.16b, v0.16b
+ mov x0, x9 /* x0 := deckey */
+ mov x3, x5 /* x3 := nrounds */
+ bl aesarm_dec8 /* q[i] := cv[i] ^ pt[i]; trash x0/x3/q8 */
+ eor v7.16b, v7.16b, v15.16b /* q[i] := pt[i] */
+ eor v6.16b, v6.16b, v14.16b
+ eor v5.16b, v5.16b, v13.16b
+ eor v4.16b, v4.16b, v12.16b
+ eor v3.16b, v3.16b, v11.16b
+ eor v2.16b, v2.16b, v10.16b
+ eor v1.16b, v1.16b, v9.16b
+ subs x10, x10, #0x80 /* count down nbytes */
+ stp q6, q7, [x2, #-0x20]! /* store plaintext blocks */
+ stp q4, q5, [x2, #-0x20]!
+ stp q2, q3, [x2, #-0x20]!
+ b.eq 2f /* stop if this is the first 8-block unit */
+ ldp q6, q7, [x1, #-0x20]! /* q6, q7 := end of preceding unit */
+ eor v0.16b, v0.16b, v7.16b /* q0 := pt0 */
+ stp q0, q1, [x2, #-0x20]!
+ b 1b
+2: ldr q8, [sp, #16] /* q8 := iv */
+ eor v0.16b, v0.16b, v8.16b /* q0 := pt0 */
+ stp q0, q1, [x2, #-0x20]! /* store first two plaintext blocks */
+ ldp d8, d9, [sp, #32] /* restore callee-saved d8-d15 */
+ ldp d10, d11, [sp, #48]
+ ldp d12, d13, [sp, #64]
+ ldp d14, d15, [sp, #80]
+ ldp fp, lr, [sp], #96 /* pop stack frame */
+ ret
+END(aesarm_cbc_dec8)
+
+/*
* aesarm_enc1(const struct aesenc *enckey@x0,
* uint128_t block@q0, uint32_t nrounds@x3)
*
@@ -509,6 +638,53 @@
END(aesarm_enc1)
/*
+ * aesarm_enc8(const struct aesenc *enckey@x0,
+ * uint128_t block0@q0, ..., uint128_t block7@q7,
+ * uint32_t nrounds@x3)
+ *
+ * Encrypt eight AES blocks in q0 through q7 in parallel.
+ *
+ * Reads nrounds + 1 consecutive 16-byte round keys starting at x0:
+ * one before the loop and one more on every iteration. nrounds must
+ * be nonzero, since the count is decremented before it is tested.
+ * Every round applies the same key to all eight blocks; the final
+ * round skips MixColumns and finishes with a plain AddRoundKey.
+ *
+ * Internal ABI. Uses q8 as temporary. Destroys x0 and x3.
+ */
+ .text
+ _ALIGN_TEXT
+ .type aesarm_enc8,@function
+aesarm_enc8:
+ ldr q8, [x0], #0x10 /* load round key */
+1: subs x3, x3, #1 /* count down rounds; tested at b.eq below */
+ /* q[i] := ShiftRows(SubBytes(AddRoundKey_q8(q[i]))) */
+ aese v0.16b, v8.16b
+ aese v1.16b, v8.16b
+ aese v2.16b, v8.16b
+ aese v3.16b, v8.16b
+ aese v4.16b, v8.16b
+ aese v5.16b, v8.16b
+ aese v6.16b, v8.16b
+ aese v7.16b, v8.16b
+ ldr q8, [x0], #0x10 /* load next round key */
+ b.eq 2f /* last round: no MixColumns */
+ /* q[i] := MixColumns(q[i]) */
+ aesmc v0.16b, v0.16b
+ aesmc v1.16b, v1.16b
+ aesmc v2.16b, v2.16b
+ aesmc v3.16b, v3.16b
+ aesmc v4.16b, v4.16b
+ aesmc v5.16b, v5.16b
+ aesmc v6.16b, v6.16b
+ aesmc v7.16b, v7.16b
+ b 1b
+2: eor v0.16b, v0.16b, v8.16b /* AddRoundKey with the final round key */
+ eor v1.16b, v1.16b, v8.16b
+ eor v2.16b, v2.16b, v8.16b
+ eor v3.16b, v3.16b, v8.16b
+ eor v4.16b, v4.16b, v8.16b
+ eor v5.16b, v5.16b, v8.16b
+ eor v6.16b, v6.16b, v8.16b
+ eor v7.16b, v7.16b, v8.16b
+ ret
+END(aesarm_enc8)
+
+/*
* aesarm_dec1(const struct aesdec *deckey@x0,
* uint128_t block@q0, uint32_t nrounds@x3)
*
@@ -532,3 +708,50 @@
2: eor v0.16b, v0.16b, v8.16b
ret
END(aesarm_dec1)
+
+/*
+ * aesarm_dec8(const struct aesdec *deckey@x0,
+ * uint128_t block0@q0, ..., uint128_t block7@q7,
+ * uint32_t nrounds@x3)
+ *
+ * Decrypt eight AES blocks in q0 through q7 in parallel.
+ *
+ * Internal ABI. Uses q8 as temporary. Destroys x0 and x3.
+ */
+ .text
+ _ALIGN_TEXT
+ .type aesarm_dec8,@function
+aesarm_dec8:
+ ldr q8, [x0], #0x10 /* load round key */
+1: subs x3, x3, #1
+ /* q[i] := InSubBytes(InShiftRows(AddRoundKey_q8(q[i]))) */
+ aesd v0.16b, v8.16b
+ aesd v1.16b, v8.16b
+ aesd v2.16b, v8.16b
+ aesd v3.16b, v8.16b
+ aesd v4.16b, v8.16b
+ aesd v5.16b, v8.16b
+ aesd v6.16b, v8.16b
+ aesd v7.16b, v8.16b
+ ldr q8, [x0], #0x10 /* load next round key */
+ b.eq 2f
+ /* q[i] := InMixColumns(q[i]) */
+ aesimc v0.16b, v0.16b
+ aesimc v1.16b, v1.16b
+ aesimc v2.16b, v2.16b
+ aesimc v3.16b, v3.16b
+ aesimc v4.16b, v4.16b
+ aesimc v5.16b, v5.16b
+ aesimc v6.16b, v6.16b
+ aesimc v7.16b, v7.16b
+ b 1b
+2: eor v0.16b, v0.16b, v8.16b /* AddRoundKey */
Home |
Main Index |
Thread Index |
Old Index