Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src/sys/crypto/chacha/arch/arm Note that VSRI seems to hurt here.



details:   https://anonhg.NetBSD.org/src/rev/b81932f2cd0c
branches:  trunk
changeset: 936488:b81932f2cd0c
user:      riastradh <riastradh%NetBSD.org@localhost>
date:      Mon Jul 27 20:58:56 2020 +0000

description:
Note that VSRI seems to hurt here.

diffstat:

 sys/crypto/chacha/arch/arm/arm_neon.h    |  36 +++++++++++++++++++++++++++++++-
 sys/crypto/chacha/arch/arm/chacha_neon.c |  10 ++++++++-
 2 files changed, 44 insertions(+), 2 deletions(-)

diffs (75 lines):

diff -r eeda7666768a -r b81932f2cd0c sys/crypto/chacha/arch/arm/arm_neon.h
--- a/sys/crypto/chacha/arch/arm/arm_neon.h     Mon Jul 27 20:58:06 2020 +0000
+++ b/sys/crypto/chacha/arch/arm/arm_neon.h     Mon Jul 27 20:58:56 2020 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: arm_neon.h,v 1.2 2020/07/27 20:58:06 riastradh Exp $   */
+/*     $NetBSD: arm_neon.h,v 1.3 2020/07/27 20:58:56 riastradh Exp $   */
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -529,6 +529,40 @@
 #endif /* __LITTLE_ENDIAN__ */
 #endif
 
+/* VSRI: per lane, keep top __bits bits of __vins, fill the rest with __vsh >> __bits. */
+#if defined(__GNUC__) && !defined(__clang__)
+_INTRINSATTR
+static __inline uint32x4_t
+vsriq_n_u32(uint32x4_t __vins, uint32x4_t __vsh, uint8_t __bits)
+{
+#ifdef __aarch64__
+       return __builtin_aarch64_usri_nv4si_uuus(__vins, __vsh, __bits);
+#else
+       return (uint32x4_t)__builtin_neon_vsri_nv4si((int32x4_t)__vins,
+           (int32x4_t)__vsh, __bits);
+#endif
+}
+#elif defined(__clang__)
+#ifdef __LITTLE_ENDIAN__
+#define        vsriq_n_u32(__vins, __vsh, __bits)                                    \
+       ((uint32x4_t)__builtin_neon_vsriq_n_v((int32x4_t)(__vins),            \
+           (int32x4_t)(__vsh), (__bits), 34))
+#else  /* big-endian: reverse lanes around the builtin; __bits stays an immediate */
+#define        vsriq_n_u32(__vins, __vsh, __bits) (                                  \
+{                                                                            \
+       int32x4_t __tvins = (int32x4_t)(__vins);                              \
+       int32x4_t __tvsh = (int32x4_t)(__vsh);                                \
+       int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins,        \
+           3,2,1,0);                                                         \
+       int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh,           \
+           3,2,1,0);                                                         \
+       int32x4_t __r = __builtin_neon_vsriq_n_v(__vins_r, __vsh_r,           \
+           (__bits), 34);                                                    \
+       (uint32x4_t)__builtin_shufflevector(__r, __r, 3,2,1,0);               \
+})
+#endif
+#endif
+
 _INTRINSATTR
 static __inline void
 vst1q_u32(uint32_t *__p32, uint32x4_t __v)
diff -r eeda7666768a -r b81932f2cd0c sys/crypto/chacha/arch/arm/chacha_neon.c
--- a/sys/crypto/chacha/arch/arm/chacha_neon.c  Mon Jul 27 20:58:06 2020 +0000
+++ b/sys/crypto/chacha/arch/arm/chacha_neon.c  Mon Jul 27 20:58:56 2020 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: chacha_neon.c,v 1.4 2020/07/27 20:58:06 riastradh Exp $        */
+/*     $NetBSD: chacha_neon.c,v 1.5 2020/07/27 20:58:56 riastradh Exp $        */
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -36,7 +36,15 @@
 vrolq_n_u32(uint32x4_t x, uint8_t n)
 {
 
+       /*
+        * Tempting to use VSHL/VSRI instead of VSHL/VSHR/VORR, but in
+        * practice it hurts performance at least on Cortex-A8.
+        */
+#if 1
        return vshlq_n_u32(x, n) | vshrq_n_u32(x, 32 - n);
+#else
+       return vsriq_n_u32(vshlq_n_u32(x, n), x, 32 - n);
+#endif
 }
 
 static inline uint32x4_t



Home | Main Index | Thread Index | Old Index