Source-Changes-HG archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[src/trunk]: src/sys/crypto Fix ARM NEON implementations of AES and ChaCha on big-endian ARM.
details: https://anonhg.NetBSD.org/src/rev/93dcbb5feb4f
branches: trunk
changeset: 936898:93dcbb5feb4f
user: riastradh <riastradh%NetBSD.org@localhost>
date: Sat Aug 08 14:47:01 2020 +0000
description:
Fix ARM NEON implementations of AES and ChaCha on big-endian ARM.
New macros such as VQ_N_U32(a,b,c,d) for NEON vector initializers.
These are needed because GCC and Clang disagree on the ordering of
lanes in vector initializers, depending on whether the target is
64-bit big-endian, 32-bit big-endian, or little-endian -- and,
bizarrely, both of them disagree with the architectural numbering of lanes.
Experimented with using:
    static const uint8_t x8[16] = {...};
    uint8x16_t x = vld1q_u8(x8);
This doesn't require knowing anything about the ordering of lanes,
but it generates considerably worse code and apparently confuses
GCC into not recognizing the constant value of x8.
Fix some clang mistakes while here too.
diffstat:
sys/crypto/aes/arch/arm/aes_armv8_64.S | 27 +---
sys/crypto/aes/arch/arm/aes_neon.c | 200 ++++++++++++++--------------
sys/crypto/aes/arch/arm/aes_neon_32.S | 74 +++++-----
sys/crypto/aes/arch/arm/aes_neon_impl.h | 3 +-
sys/crypto/aes/arch/arm/aes_neon_subr.c | 80 +++++-----
sys/crypto/aes/arch/arm/arm_neon.h | 171 ++++++++++++++++++++++-
sys/crypto/aes/arch/arm/arm_neon_imm.h | 80 +++++++++++
sys/crypto/chacha/arch/arm/arm_neon.h | 44 ++++-
sys/crypto/chacha/arch/arm/arm_neon_imm.h | 80 +++++++++++
sys/crypto/chacha/arch/arm/chacha_neon.c | 162 +++++++++------------
sys/crypto/chacha/arch/arm/chacha_neon_32.S | 178 +++++++-----------------
sys/crypto/chacha/arch/arm/chacha_neon_64.S | 132 +++++++++---------
12 files changed, 720 insertions(+), 511 deletions(-)
diffs (truncated from 2141 to 300 lines):
diff -r 19780c35d1f9 -r 93dcbb5feb4f sys/crypto/aes/arch/arm/aes_armv8_64.S
--- a/sys/crypto/aes/arch/arm/aes_armv8_64.S Sat Aug 08 14:43:28 2020 +0000
+++ b/sys/crypto/aes/arch/arm/aes_armv8_64.S Sat Aug 08 14:47:01 2020 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: aes_armv8_64.S,v 1.11 2020/07/27 20:57:23 riastradh Exp $ */
+/* $NetBSD: aes_armv8_64.S,v 1.12 2020/08/08 14:47:01 riastradh Exp $ */
/*-
* Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -26,11 +26,9 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
-#include <sys/endian.h>
-
#include <aarch64/asm.h>
-RCSID("$NetBSD: aes_armv8_64.S,v 1.11 2020/07/27 20:57:23 riastradh Exp $")
+RCSID("$NetBSD: aes_armv8_64.S,v 1.12 2020/08/08 14:47:01 riastradh Exp $")
.arch_extension aes
@@ -921,19 +919,13 @@
ld1 {v5.4s}, [x11] /* q5 := (0,0,0,1) (host-endian) */
mov x9, x0 /* x9 := enckey */
mov x10, x3 /* x10 := nbytes */
-#if _BYTE_ORDER == _LITTLE_ENDIAN
rev32 v2.16b, v2.16b /* q2 := ctr (host-endian) */
-#endif
_ALIGN_TEXT
1: ldr q3, [x1], #0x10 /* q3 := plaintext block */
add v2.4s, v2.4s, v5.4s /* increment ctr (32-bit) */
mov x0, x9 /* x0 := enckey */
mov x3, x5 /* x3 := nrounds */
-#if _BYTE_ORDER == _LITTLE_ENDIAN
rev32 v1.16b, v2.16b /* q1 := ctr (big-endian) */
-#else
- mov v1.16b, v2.16b /* q1 := ctr (big-endian) */
-#endif
eor v0.16b, v0.16b, v3.16b /* q0 := auth ^ ptxt */
bl aesarmv8_enc2 /* q0 := auth', q1 := pad;
* trash x0/x3/q16 */
@@ -941,9 +933,7 @@
subs x10, x10, #0x10 /* count down bytes */
str q3, [x2], #0x10 /* store ciphertext block */
b.ne 1b /* repeat if more blocks */
-#if _BYTE_ORDER == _LITTLE_ENDIAN
rev32 v2.16b, v2.16b /* q2 := ctr (big-endian) */
-#endif
stp q0, q2, [x4] /* store updated auth/ctr */
ldp fp, lr, [sp], #16 /* pop stack frame */
ret
@@ -968,18 +958,12 @@
ld1 {v5.4s}, [x11] /* q5 := (0,0,0,1) (host-endian) */
mov x9, x0 /* x9 := enckey */
mov x10, x3 /* x10 := nbytes */
-#if _BYTE_ORDER == _LITTLE_ENDIAN
rev32 v2.16b, v2.16b /* q2 := ctr (host-endian) */
-#endif
/* Decrypt the first block. */
add v2.4s, v2.4s, v5.4s /* increment ctr (32-bit) */
mov x3, x5 /* x3 := nrounds */
-#if _BYTE_ORDER == _LITTLE_ENDIAN
rev32 v0.16b, v2.16b /* q0 := ctr (big-endian) */
-#else
- mov v0.16b, v2.16b /* q0 := ctr (big-endian) */
-#endif
ldr q3, [x1], #0x10 /* q3 := ctxt */
bl aesarmv8_enc1 /* q0 := pad; trash x0/x3/q16 */
b 2f
@@ -995,11 +979,7 @@
add v2.4s, v2.4s, v5.4s /* increment ctr (32-bit) */
mov x0, x9 /* x0 := enckey */
mov x3, x5 /* x3 := nrounds */
-#if _BYTE_ORDER == _LITTLE_ENDIAN
rev32 v0.16b, v2.16b /* q0 := ctr (big-endian) */
-#else
- mov v0.16b, v2.16b /* q0 := ctr (big-endian) */
-#endif
ldr q3, [x1], #0x10 /* q3 := ctxt */
bl aesarmv8_enc2 /* q0 := pad, q1 := auth';
* trash x0/x3/q16 */
@@ -1009,15 +989,14 @@
eor v1.16b, v1.16b, v3.16b /* q1 := auth ^ ptxt */
b.ne 1b
-#if _BYTE_ORDER == _LITTLE_ENDIAN
rev32 v2.16b, v2.16b /* q2 := ctr (big-endian) */
-#endif
/* Authenticate the last block. */
mov x0, x9 /* x0 := enckey */
mov x3, x5 /* x3 := nrounds */
mov v0.16b, v1.16b /* q0 := auth ^ ptxt */
bl aesarmv8_enc1 /* q0 := auth'; trash x0/x3/q16 */
+
stp q0, q2, [x4] /* store updated auth/ctr */
ldp fp, lr, [sp], #16 /* pop stack frame */
ret
diff -r 19780c35d1f9 -r 93dcbb5feb4f sys/crypto/aes/arch/arm/aes_neon.c
--- a/sys/crypto/aes/arch/arm/aes_neon.c Sat Aug 08 14:43:28 2020 +0000
+++ b/sys/crypto/aes/arch/arm/aes_neon.c Sat Aug 08 14:47:01 2020 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: aes_neon.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $ */
+/* $NetBSD: aes_neon.c,v 1.5 2020/08/08 14:47:01 riastradh Exp $ */
/*-
* Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -39,7 +39,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.4 2020/07/28 20:11:09 riastradh Exp $");
+__KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.5 2020/08/08 14:47:01 riastradh Exp $");
#include <sys/types.h>
@@ -60,141 +60,141 @@
static const uint8x16_t
mc_forward[4] = {
- {0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04,
- 0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C},
- {0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08,
- 0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00},
- {0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C,
- 0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04},
- {0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00,
- 0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08},
+ VQ_N_U8(0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04,
+ 0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C),
+ VQ_N_U8(0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08,
+ 0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00),
+ VQ_N_U8(0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C,
+ 0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04),
+ VQ_N_U8(0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00,
+ 0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08),
},
mc_backward[4] __aarch64_used = {
- {0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06,
- 0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E},
- {0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02,
- 0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A},
- {0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E,
- 0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06},
- {0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A,
- 0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02},
+ VQ_N_U8(0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06,
+ 0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E),
+ VQ_N_U8(0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02,
+ 0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A),
+ VQ_N_U8(0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E,
+ 0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06),
+ VQ_N_U8(0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A,
+ 0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02),
},
ipt[2] __aarch64_used = {
- {0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2,
- 0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA},
- {0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C,
- 0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD},
+ VQ_N_U8(0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2,
+ 0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA),
+ VQ_N_U8(0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C,
+ 0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD),
},
opt[2] = {
- {0x00,0x60,0xB6,0xD6,0x29,0x49,0x9F,0xFF,
- 0x08,0x68,0xBE,0xDE,0x21,0x41,0x97,0xF7},
- {0x00,0xEC,0xBC,0x50,0x51,0xBD,0xED,0x01,
- 0xE0,0x0C,0x5C,0xB0,0xB1,0x5D,0x0D,0xE1},
+ VQ_N_U8(0x00,0x60,0xB6,0xD6,0x29,0x49,0x9F,0xFF,
+ 0x08,0x68,0xBE,0xDE,0x21,0x41,0x97,0xF7),
+ VQ_N_U8(0x00,0xEC,0xBC,0x50,0x51,0xBD,0xED,0x01,
+ 0xE0,0x0C,0x5C,0xB0,0xB1,0x5D,0x0D,0xE1),
},
dipt[2] __aarch64_used = {
- {0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F,
- 0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15},
- {0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86,
- 0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12},
+ VQ_N_U8(0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F,
+ 0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15),
+ VQ_N_U8(0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86,
+ 0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12),
},
sb1[2] __aarch64_used = {
- {0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1,
- 0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5},
- {0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36,
- 0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B},
+ VQ_N_U8(0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1,
+ 0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5),
+ VQ_N_U8(0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36,
+ 0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B),
},
sb2[2] __aarch64_used = {
- {0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2,
- 0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E},
- {0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69,
- 0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2},
+ VQ_N_U8(0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2,
+ 0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E),
+ VQ_N_U8(0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69,
+ 0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2),
},
sbo[2] __aarch64_used = {
- {0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0,
- 0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15},
- {0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF,
- 0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E},
+ VQ_N_U8(0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0,
+ 0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15),
+ VQ_N_U8(0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF,
+ 0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E),
},
dsb9[2] __aarch64_used = {
- {0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85,
- 0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA},
- {0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0,
- 0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72},
+ VQ_N_U8(0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85,
+ 0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA),
+ VQ_N_U8(0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0,
+ 0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72),
},
dsbd[2] __aarch64_used = {
- {0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D,
- 0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5},
- {0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C,
- 0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29},
+ VQ_N_U8(0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D,
+ 0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5),
+ VQ_N_U8(0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C,
+ 0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29),
},
dsbb[2] __aarch64_used = {
- {0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0,
- 0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60},
- {0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1,
- 0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3},
+ VQ_N_U8(0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0,
+ 0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60),
+ VQ_N_U8(0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1,
+ 0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3),
},
dsbe[2] __aarch64_used = {
- {0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46,
- 0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22},
- {0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C,
- 0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94},
+ VQ_N_U8(0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46,
+ 0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22),
+ VQ_N_U8(0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C,
+ 0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94),
},
dsbo[2] __aarch64_used = {
- {0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13,
- 0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7},
- {0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12,
- 0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA},
+ VQ_N_U8(0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13,
+ 0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7),
+ VQ_N_U8(0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12,
+ 0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA),
},
dks1[2] = {
- {0x00,0xA7,0xD9,0x7E,0xC8,0x6F,0x11,0xB6,
- 0xFC,0x5B,0x25,0x82,0x34,0x93,0xED,0x4A},
- {0x00,0x33,0x14,0x27,0x62,0x51,0x76,0x45,
- 0xCE,0xFD,0xDA,0xE9,0xAC,0x9F,0xB8,0x8B},
+ VQ_N_U8(0x00,0xA7,0xD9,0x7E,0xC8,0x6F,0x11,0xB6,
+ 0xFC,0x5B,0x25,0x82,0x34,0x93,0xED,0x4A),
+ VQ_N_U8(0x00,0x33,0x14,0x27,0x62,0x51,0x76,0x45,
+ 0xCE,0xFD,0xDA,0xE9,0xAC,0x9F,0xB8,0x8B),
},
dks2[2] = {
- {0x00,0x64,0xA8,0xCC,0xEB,0x8F,0x43,0x27,
- 0x61,0x05,0xC9,0xAD,0x8A,0xEE,0x22,0x46},
- {0x00,0xDD,0x92,0x4F,0xCE,0x13,0x5C,0x81,
- 0xF2,0x2F,0x60,0xBD,0x3C,0xE1,0xAE,0x73},
+ VQ_N_U8(0x00,0x64,0xA8,0xCC,0xEB,0x8F,0x43,0x27,
+ 0x61,0x05,0xC9,0xAD,0x8A,0xEE,0x22,0x46),
+ VQ_N_U8(0x00,0xDD,0x92,0x4F,0xCE,0x13,0x5C,0x81,
+ 0xF2,0x2F,0x60,0xBD,0x3C,0xE1,0xAE,0x73),
},
dks3[2] = {
- {0x00,0xC7,0xC6,0x01,0x02,0xC5,0xC4,0x03,
- 0xFB,0x3C,0x3D,0xFA,0xF9,0x3E,0x3F,0xF8},
- {0x00,0xF7,0xCF,0x38,0xD6,0x21,0x19,0xEE,
- 0x4B,0xBC,0x84,0x73,0x9D,0x6A,0x52,0xA5},
+ VQ_N_U8(0x00,0xC7,0xC6,0x01,0x02,0xC5,0xC4,0x03,
+ 0xFB,0x3C,0x3D,0xFA,0xF9,0x3E,0x3F,0xF8),
+ VQ_N_U8(0x00,0xF7,0xCF,0x38,0xD6,0x21,0x19,0xEE,
+ 0x4B,0xBC,0x84,0x73,0x9D,0x6A,0x52,0xA5),
},
dks4[2] = {
- {0x00,0x20,0x73,0x53,0xB0,0x90,0xC3,0xE3,
- 0x43,0x63,0x30,0x10,0xF3,0xD3,0x80,0xA0},
Home |
Main Index |
Thread Index |
Old Index