Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src-draft/trunk]: src/sys/crypto/aes/arch/arm Provide hand-written AES NEON ...



details:   https://anonhg.NetBSD.org/src-all/rev/7d34c99e3b9a
branches:  trunk
changeset: 935281:7d34c99e3b9a
user:      Taylor R Campbell <riastradh%NetBSD.org@localhost>
date:      Mon Jun 29 08:18:16 2020 +0000

description:
Provide hand-written AES NEON assembly for arm32.

gcc does a lousy job at compiling 128-bit NEON intrinsics on arm32;
hand-writing it made it about 12x faster, by avoiding a zillion loads
and stores to spill everything and the kitchen sink onto the stack.
(But gcc does fine on aarch64, presumably because it has twice as
many registers and doesn't have to deal with q2=d4/d5 overlapping.)

diffstat:

 sys/crypto/aes/arch/arm/aes_neon.c        |   40 +-
 sys/crypto/aes/arch/arm/aes_neon_encdec.S |  653 ++++++++++++++++++++++++++++++
 sys/crypto/aes/arch/arm/files.aesneon     |    4 +
 3 files changed, 685 insertions(+), 12 deletions(-)

diffs (truncated from 797 to 300 lines):

diff -r cd5bb502d348 -r 7d34c99e3b9a sys/crypto/aes/arch/arm/aes_neon.c
--- a/sys/crypto/aes/arch/arm/aes_neon.c        Sun Jun 28 15:25:29 2020 +0000
+++ b/sys/crypto/aes/arch/arm/aes_neon.c        Mon Jun 29 08:18:16 2020 +0000
@@ -47,6 +47,12 @@
 
 #include "aes_neon_internal.h"
 
+#ifdef __aarch64__
+#define        __aarch64_used
+#else
+#define        __aarch64_used  __unused
+#endif
+
 static const uint8x16_t
 mc_forward[4] = {
        {0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04,
@@ -58,7 +64,7 @@
        {0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00,
         0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08},
 },
-mc_backward[4] = {
+mc_backward[4] __aarch64_used = {
        {0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06,
         0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E},
        {0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02,
@@ -68,7 +74,7 @@
        {0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A,
         0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02},
 },
-ipt[2] = {
+ipt[2] __aarch64_used = {
        {0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2,
         0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA},
        {0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C,
@@ -80,55 +86,55 @@
        {0x00,0xEC,0xBC,0x50,0x51,0xBD,0xED,0x01,
         0xE0,0x0C,0x5C,0xB0,0xB1,0x5D,0x0D,0xE1},
 },
-dipt[2] = {
+dipt[2] __aarch64_used = {
        {0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F,
         0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15},
        {0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86,
         0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12},
 },
-sb1[2] = {
+sb1[2] __aarch64_used = {
        {0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1,
         0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5},
        {0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36,
         0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B},
 },
-sb2[2] = {
+sb2[2] __aarch64_used = {
        {0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2,
         0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E},
        {0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69,
         0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2},
 },
-sbo[2] = {
+sbo[2] __aarch64_used = {
        {0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0,
         0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15},
        {0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF,
         0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E},
 },
-dsb9[2] = {
+dsb9[2] __aarch64_used = {
        {0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85,
         0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA},
        {0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0,
         0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72},
 },
-dsbd[2] = {
+dsbd[2] __aarch64_used = {
        {0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D,
         0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5},
        {0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C,
         0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29},
 },
-dsbb[2] = {
+dsbb[2] __aarch64_used = {
        {0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0,
         0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60},
        {0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1,
         0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3},
 },
-dsbe[2] = {
+dsbe[2] __aarch64_used = {
        {0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46,
         0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22},
        {0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C,
         0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94},
 },
-dsbo[2] = {
+dsbo[2] __aarch64_used = {
        {0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13,
         0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7},
        {0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12,
@@ -164,7 +170,7 @@
        {0x00,0x69,0xEA,0x83,0xDC,0xB5,0x36,0x5F,
         0x77,0x1E,0x9D,0xF4,0xAB,0xC2,0x41,0x28},
 },
-sr[4] = {
+sr[4] __aarch64_used = {
        {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
         0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F},
        {0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03,
@@ -546,6 +552,14 @@
        storeroundkey(rk32, aes_schedule_mangle_last_dec(rk));
 }
 
+#ifdef __aarch64__
+
+/*
+ * GCC does a lousy job of compiling NEON intrinsics for arm32, so the
+ * C versions below are built only for aarch64; on arm32, encryption
+ * and decryption are done in hand-written assembly instead.
+ */
+
 uint8x16_t
 aes_neon_enc1(const struct aesenc *enc, uint8x16_t x, unsigned nrounds)
 {
@@ -616,3 +630,5 @@
        x ^= loadroundkey(rk32);
        return vqtbl1q_u8(x, sr[i]);
 }
+
+#endif
diff -r cd5bb502d348 -r 7d34c99e3b9a sys/crypto/aes/arch/arm/aes_neon_encdec.S
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/crypto/aes/arch/arm/aes_neon_encdec.S Mon Jun 29 08:18:16 2020 +0000
@@ -0,0 +1,653 @@
+/*     $NetBSD$        */
+
+/*-
+ * Copyright (c) 2020 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <arm/asm.h>
+
+       .fpu    neon
+
+       .section .rodata
+       .p2align 4
+
+       .type   inv,_ASM_TYPE_OBJECT
+inv:
+       .byte   0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E
+       .byte   0x02,0x0C,0x0B,0x0A,0x09,0x03,0x07,0x04
+END(inv)
+
+       .type   inva,_ASM_TYPE_OBJECT
+inva:
+       .byte   0x80,0x07,0x0B,0x0F,0x06,0x0A,0x04,0x01
+       .byte   0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03
+END(inva)
+
+       .type   mc_forward,_ASM_TYPE_OBJECT
+mc_forward:
+       .byte   0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04 /* 0 */
+       .byte   0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C
+
+       .byte   0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08 /* 1 */
+       .byte   0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00
+
+       .byte   0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C /* 2 */
+       .byte   0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04
+
+.Lmc_forward_3:
+       .byte   0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00 /* 3 */
+       .byte   0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08
+END(mc_forward)
+
+       .type   mc_backward,_ASM_TYPE_OBJECT
+mc_backward:
+       .byte   0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06 /* 0 */
+       .byte   0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E
+
+       .byte   0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02 /* 1 */
+       .byte   0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A
+
+       .byte   0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E /* 2 */
+       .byte   0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06
+
+       .byte   0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A /* 3 */
+       .byte   0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02
+END(mc_backward)
+
+       .type   sr,_ASM_TYPE_OBJECT
+sr:
+       .byte   0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 /* 0 */
+       .byte   0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F
+
+       .byte   0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03 /* 1 */
+       .byte   0x08,0x0D,0x02,0x07,0x0C,0x01,0x06,0x0B
+
+       .byte   0x00,0x09,0x02,0x0B,0x04,0x0D,0x06,0x0F /* 2 */
+       .byte   0x08,0x01,0x0A,0x03,0x0C,0x05,0x0E,0x07
+
+       .byte   0x00,0x0D,0x0A,0x07,0x04,0x01,0x0E,0x0B /* 3 */
+       .byte   0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03
+END(sr)
+
+       .type   iptlo,_ASM_TYPE_OBJECT
+iptlo:
+       .byte   0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2
+       .byte   0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA
+END(iptlo)
+
+       .type   ipthi,_ASM_TYPE_OBJECT
+ipthi:
+       .byte   0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C
+       .byte   0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD
+END(ipthi)
+
+       .type   sb1_0,_ASM_TYPE_OBJECT
+sb1_0:
+       .byte   0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1
+       .byte   0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5
+END(sb1_0)
+
+       .type   sb1_1,_ASM_TYPE_OBJECT
+sb1_1:
+       .byte   0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36
+       .byte   0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B
+END(sb1_1)
+
+       .type   sb2_0,_ASM_TYPE_OBJECT
+sb2_0:
+       .byte   0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2
+       .byte   0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E
+END(sb2_0)
+
+       .type   sb2_1,_ASM_TYPE_OBJECT
+sb2_1:
+       .byte   0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69
+       .byte   0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2
+END(sb2_1)
+
+       .type   sbo_0,_ASM_TYPE_OBJECT
+sbo_0:
+       .byte   0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0
+       .byte   0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15
+END(sbo_0)
+
+       .type   sbo_1,_ASM_TYPE_OBJECT
+sbo_1:
+       .byte   0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF
+       .byte   0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E
+END(sbo_1)
+
+       .type   diptlo,_ASM_TYPE_OBJECT
+diptlo:
+       .byte   0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F
+       .byte   0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15
+END(diptlo)
+
+       .type   dipthi,_ASM_TYPE_OBJECT
+dipthi:
+       .byte   0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86
+       .byte   0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12
+END(dipthi)
+
+       .type   dsb9_0,_ASM_TYPE_OBJECT
+dsb9_0:
+       .byte   0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85
+       .byte   0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA
+END(dsb9_0)
+
+       .type   dsb9_1,_ASM_TYPE_OBJECT
+dsb9_1:
+       .byte   0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0
+       .byte   0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72
+END(dsb9_1)
+
+       .type   dsbd_0,_ASM_TYPE_OBJECT
+dsbd_0:



Home | Main Index | Thread Index | Old Index