Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src/crypto/external/bsd/openssl/lib/libcrypto/arch Add new files



details:   https://anonhg.NetBSD.org/src/rev/d69becfba5c3
branches:  trunk
changeset: 374665:d69becfba5c3
user:      christos <christos%NetBSD.org@localhost>
date:      Tue May 09 17:22:43 2023 +0000

description:
Add new files

diffstat:

 crypto/external/bsd/openssl/lib/libcrypto/arch/aarch64/aes-gcm-armv8_64.S     |  6026 +++++++++
 crypto/external/bsd/openssl/lib/libcrypto/arch/arm/aes-gcm-armv8_64.S         |  6027 ++++++++++
 crypto/external/bsd/openssl/lib/libcrypto/arch/powerpc/ecp_nistp521-ppc64.S   |   347 +
 crypto/external/bsd/openssl/lib/libcrypto/arch/powerpc64/ecp_nistp521-ppc64.S |   357 +
 4 files changed, 12757 insertions(+), 0 deletions(-)

diffs (truncated from 12773 to 300 lines):

diff -r 333f2fdd3ef1 -r d69becfba5c3 crypto/external/bsd/openssl/lib/libcrypto/arch/aarch64/aes-gcm-armv8_64.S
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/crypto/external/bsd/openssl/lib/libcrypto/arch/aarch64/aes-gcm-armv8_64.S Tue May 09 17:22:43 2023 +0000
@@ -0,0 +1,6026 @@
+#include "arm_arch.h"
+
+#if __ARM_MAX_ARCH__>=8
+.fpu   neon
+#ifdef __thumb2__
+.syntax        unified
+.thumb
+# define INST(a,b,c,d)   c,0xef,a,b
+#else
+.code  32
+# define INST(a,b,c,d)   a,b,c,0xf2
+#endif
+
+.text
+.globl aes_gcm_enc_128_kernel
+.type  aes_gcm_enc_128_kernel,%function
+.align 4
+aes_gcm_enc_128_kernel:
+       cbz     r1, .L128_enc_ret
+       stp     r19, r20, [sp, #-112]!
+       mov     r16, r4
+       mov     r8, r5
+       stp     r21, r22, [sp, #16]
+       stp     r23, r24, [sp, #32]
+       stp     d8, d9, [sp, #48]
+       stp     d10, d11, [sp, #64]
+       stp     d12, d13, [sp, #80]
+       stp     d14, d15, [sp, #96]
+
+       ldp     r10, r11, [r16]              @ ctr96_b64, ctr96_t32
+       ldp     r13, r14, [r8, #160]                     @ load rk10
+
+       ld1     {v11.16b}, [r3]
+       ext     v11.16b, v11.16b, v11.16b, #8
+       rev64   v11.16b, v11.16b
+       lsr     r5, r1, #3              @ byte_len
+       mov     r15, r5
+
+       ldr     q27, [r8, #144]                                @ load rk9
+       add     r4, r0, r1, lsr #3   @ end_input_ptr
+       sub     r5, r5, #1      @ byte_len - 1
+
+       lsr     r12, r11, #32
+       ldr     q15, [r3, #112]                        @ load h4l | h4h
+       ext     v15.16b, v15.16b, v15.16b, #8
+
+       fmov    d1, r10                               @ CTR block 1
+       rev     r12, r12                                @ rev_ctr32
+
+       add     r12, r12, #1                            @ increment rev_ctr32
+       orr     r11, r11, r11
+       ldr     q18, [r8, #0]                                  @ load rk0
+
+       rev     r9, r12                                 @ CTR block 1
+       add     r12, r12, #1                            @ CTR block 1
+       fmov    d3, r10                               @ CTR block 3
+
+       orr     r9, r11, r9, lsl #32            @ CTR block 1
+       ld1     { q0}, [r16]                             @ special case vector load initial counter so we can start first AES block as quickly as possible
+
+       fmov    v1.d[1], r9                               @ CTR block 1
+       rev     r9, r12                                 @ CTR block 2
+
+       fmov    d2, r10                               @ CTR block 2
+       orr     r9, r11, r9, lsl #32            @ CTR block 2
+       add     r12, r12, #1                            @ CTR block 2
+
+       fmov    v2.d[1], r9                               @ CTR block 2
+       rev     r9, r12                                 @ CTR block 3
+
+       orr     r9, r11, r9, lsl #32            @ CTR block 3
+       ldr     q19, [r8, #16]                                 @ load rk1
+
+       add     r12, r12, #1                            @ CTR block 3
+       fmov    v3.d[1], r9                               @ CTR block 3
+
+       ldr     q14, [r3, #80]                         @ load h3l | h3h
+       ext     v14.16b, v14.16b, v14.16b, #8
+
+       aese    q1, v18.16b
+       aesmc   q1, q1          @ AES block 1 - round 0
+       ldr     q20, [r8, #32]                                 @ load rk2
+
+       aese    q2, v18.16b
+       aesmc   q2, q2          @ AES block 2 - round 0
+       ldr     q12, [r3, #32]                         @ load h1l | h1h
+       ext     v12.16b, v12.16b, v12.16b, #8
+
+       aese    q0, v18.16b
+       aesmc   q0, q0          @ AES block 0 - round 0
+       ldr     q26, [r8, #128]                                @ load rk8
+
+       aese    q3, v18.16b
+       aesmc   q3, q3          @ AES block 3 - round 0
+       ldr     q21, [r8, #48]                                 @ load rk3
+
+       aese    q2, v19.16b
+       aesmc   q2, q2          @ AES block 2 - round 1
+       trn2    v17.2d,  v14.2d,    v15.2d                      @ h4l | h3l
+
+       aese    q0, v19.16b
+       aesmc   q0, q0          @ AES block 0 - round 1
+       ldr     q24, [r8, #96]                                 @ load rk6
+
+       aese    q1, v19.16b
+       aesmc   q1, q1          @ AES block 1 - round 1
+       ldr     q25, [r8, #112]                                @ load rk7
+
+       aese    q3, v19.16b
+       aesmc   q3, q3          @ AES block 3 - round 1
+       trn1    q9, v14.2d,    v15.2d                      @ h4h | h3h
+
+       aese    q0, v20.16b
+       aesmc   q0, q0          @ AES block 0 - round 2
+       ldr     q23, [r8, #80]                                 @ load rk5
+
+       aese    q1, v20.16b
+       aesmc   q1, q1          @ AES block 1 - round 2
+       ldr     q13, [r3, #64]                         @ load h2l | h2h
+       ext     v13.16b, v13.16b, v13.16b, #8
+
+       aese    q3, v20.16b
+       aesmc   q3, q3          @ AES block 3 - round 2
+
+       aese    q2, v20.16b
+       aesmc   q2, q2          @ AES block 2 - round 2
+       eor     v17.16b, v17.16b, q9                  @ h4k | h3k
+
+       aese    q0, v21.16b
+       aesmc   q0, q0          @ AES block 0 - round 3
+
+       aese    q1, v21.16b
+       aesmc   q1, q1          @ AES block 1 - round 3
+
+       aese    q2, v21.16b
+       aesmc   q2, q2          @ AES block 2 - round 3
+       ldr     q22, [r8, #64]                                 @ load rk4
+
+       aese    q3, v21.16b
+       aesmc   q3, q3          @ AES block 3 - round 3
+
+       and     r5, r5, #0xffffffffffffffc0    @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+       trn2    v16.2d,  v12.2d,    v13.2d                      @ h2l | h1l
+
+       aese    q3, v22.16b
+       aesmc   q3, q3          @ AES block 3 - round 4
+       add     r5, r5, r0
+
+       aese    q2, v22.16b
+       aesmc   q2, q2          @ AES block 2 - round 4
+       cmp     r0, r5                   @ check if we have <= 4 blocks
+
+       aese    q0, v22.16b
+       aesmc   q0, q0          @ AES block 0 - round 4
+
+       aese    q3, v23.16b
+       aesmc   q3, q3          @ AES block 3 - round 5
+
+       aese    q2, v23.16b
+       aesmc   q2, q2          @ AES block 2 - round 5
+
+       aese    q0, v23.16b
+       aesmc   q0, q0          @ AES block 0 - round 5
+
+       aese    q3, v24.16b
+       aesmc   q3, q3          @ AES block 3 - round 6
+
+       aese    q1, v22.16b
+       aesmc   q1, q1          @ AES block 1 - round 4
+
+       aese    q2, v24.16b
+       aesmc   q2, q2          @ AES block 2 - round 6
+       trn1    q8,    v12.2d,    v13.2d                      @ h2h | h1h
+
+       aese    q0, v24.16b
+       aesmc   q0, q0          @ AES block 0 - round 6
+
+       aese    q1, v23.16b
+       aesmc   q1, q1          @ AES block 1 - round 5
+
+       aese    q3, v25.16b
+       aesmc   q3, q3          @ AES block 3 - round 7
+
+       aese    q0, v25.16b
+       aesmc   q0, q0          @ AES block 0 - round 7
+
+       aese    q1, v24.16b
+       aesmc   q1, q1          @ AES block 1 - round 6
+
+       aese    q2, v25.16b
+       aesmc   q2, q2          @ AES block 2 - round 7
+
+       aese    q0, v26.16b
+       aesmc   q0, q0          @ AES block 0 - round 8
+
+       aese    q1, v25.16b
+       aesmc   q1, q1          @ AES block 1 - round 7
+
+       aese    q2, v26.16b
+       aesmc   q2, q2          @ AES block 2 - round 8
+
+       aese    q3, v26.16b
+       aesmc   q3, q3          @ AES block 3 - round 8
+
+       aese    q1, v26.16b
+       aesmc   q1, q1          @ AES block 1 - round 8
+
+       aese    q2, v27.16b                                      @ AES block 2 - round 9
+
+       aese    q0, v27.16b                                      @ AES block 0 - round 9
+
+       eor     v16.16b, v16.16b, q8                     @ h2k | h1k
+
+       aese    q1, v27.16b                                      @ AES block 1 - round 9
+
+       aese    q3, v27.16b                                      @ AES block 3 - round 9
+       bge     .L128_enc_tail                                    @ handle tail
+
+       ldp     r6, r7, [r0, #0]            @ AES block 0 - load plaintext
+
+       ldp     r21, r22, [r0, #32]           @ AES block 2 - load plaintext
+
+       ldp     r19, r20, [r0, #16]           @ AES block 1 - load plaintext
+
+       ldp     r23, r24, [r0, #48]           @ AES block 3 - load plaintext
+
+       eor     r6, r6, r13                     @ AES block 0 - round 10 low
+       eor     r7, r7, r14                     @ AES block 0 - round 10 high
+
+       eor     r21, r21, r13                     @ AES block 2 - round 10 low
+       fmov    d4, r6                               @ AES block 0 - mov low
+
+       eor     r19, r19, r13                     @ AES block 1 - round 10 low
+       eor     r22, r22, r14                     @ AES block 2 - round 10 high
+       fmov    v4.d[1], r7                           @ AES block 0 - mov high
+
+       fmov    d5, r19                               @ AES block 1 - mov low
+       eor     r20, r20, r14                     @ AES block 1 - round 10 high
+
+       eor     r23, r23, r13                     @ AES block 3 - round 10 low
+       fmov    v5.d[1], r20                           @ AES block 1 - mov high
+
+       fmov    d6, r21                               @ AES block 2 - mov low
+       eor     r24, r24, r14                     @ AES block 3 - round 10 high
+       rev     r9, r12                                 @ CTR block 4
+
+       fmov    v6.d[1], r22                           @ AES block 2 - mov high
+       orr     r9, r11, r9, lsl #32            @ CTR block 4
+
+       eor     q4, q4, q0                          @ AES block 0 - result
+       fmov    d0, r10                               @ CTR block 4
+       add     r12, r12, #1                            @ CTR block 4
+
+       fmov    v0.d[1], r9                               @ CTR block 4
+       rev     r9, r12                                 @ CTR block 5
+
+       eor     q5, q5, q1                          @ AES block 1 - result
+       fmov    d1, r10                               @ CTR block 5
+       orr     r9, r11, r9, lsl #32            @ CTR block 5
+
+       add     r12, r12, #1                            @ CTR block 5
+       add     r0, r0, #64                       @ AES input_ptr update
+       fmov    v1.d[1], r9                               @ CTR block 5
+
+       fmov    d7, r23                               @ AES block 3 - mov low
+       rev     r9, r12                                 @ CTR block 6
+       st1     { q4}, [r2], #16                     @ AES block 0 - store result
+
+       fmov    v7.d[1], r24                           @ AES block 3 - mov high
+       orr     r9, r11, r9, lsl #32            @ CTR block 6
+
+       add     r12, r12, #1                            @ CTR block 6
+       eor     q6, q6, q2                          @ AES block 2 - result
+       st1     { q5}, [r2], #16                     @ AES block 1 - store result
+
+       fmov    d2, r10                               @ CTR block 6
+       cmp     r0, r5                   @ check if we have <= 8 blocks
+
+       fmov    v2.d[1], r9                               @ CTR block 6
+       rev     r9, r12                                 @ CTR block 7
+       st1     { q6}, [r2], #16                     @ AES block 2 - store result
+
+       orr     r9, r11, r9, lsl #32            @ CTR block 7
+
+       eor     q7, q7, q3                          @ AES block 3 - result
+       st1     { q7}, [r2], #16                     @ AES block 3 - store result
+       bge     .L128_enc_prepretail                              @ do prepretail
+
+.L128_enc_main_loop:@ main loop start
+       ldp     r23, r24, [r0, #48]           @ AES block 4k+3 - load plaintext
+       rev64   q4, q4                                    @ GHASH block 4k (only t0 is free)
+       rev64   q6, q6                                    @ GHASH block 4k+2 (t0, t1, and t2 free)
+
+       aese    q2, v18.16b
+       aesmc   q2, q2          @ AES block 4k+6 - round 0
+       fmov    d3, r10                               @ CTR block 4k+3



Home | Main Index | Thread Index | Old Index