Port-sparc archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

bswap is slow on SPARC



Hio,

While benchmarking disk encryption/decryption on a 500MHz
UltraSPARC IIe, riastradh and I noticed a bottleneck. On most
NetBSD architectures, GCC's __builtin_bswapX expands to fast inline
machine-dependent (MD) code. On SPARC, SPARC64, and VAX, it falls
back to a function call.

When encryption algorithms expect to be able to encode little-endian
integers in a tight loop, that function call overhead adds up to being
*slow*, especially now that NetBSD encrypts swap by default.

By reusing the existing inline code, and avoiding the function call,
I found that disk decryption throughput increased by as much as 1 MiB/s
(that matters when it's only about 4 MiB/s to begin with).

You can test this yourself by setting up a cgd on a vnd, e.g.

# dd if=/dev/zero of=/testfile bs=1m count=1024
# vndconfig vnd0 /testfile
# cgdconfig -s cgd0 /dev/vnd0c adiantum 256 < /dev/urandom
# dd if=/dev/rcgd0 bs=512k | progress dd of=/dev/null bs=512k

Patch attached.
Index: sys/sys/bswap.h
===================================================================
RCS file: /cvsroot/src/sys/sys/bswap.h,v
retrieving revision 1.19
diff -u -p -r1.19 bswap.h
--- sys/sys/bswap.h	12 Mar 2015 15:28:16 -0000	1.19
+++ sys/sys/bswap.h	24 Nov 2025 13:21:43 -0000
@@ -24,20 +24,7 @@ __END_DECLS
 
 #if defined(__GNUC__) && !defined(__lint__)
 
-/* machine/byte_swap.h might have defined inline versions */
-#ifndef __BYTE_SWAP_U64_VARIABLE
-#define	__BYTE_SWAP_U64_VARIABLE bswap64
-#endif
-
-#ifndef __BYTE_SWAP_U32_VARIABLE
-#define	__BYTE_SWAP_U32_VARIABLE bswap32
-#endif
-
-#ifndef __BYTE_SWAP_U16_VARIABLE
-#define	__BYTE_SWAP_U16_VARIABLE bswap16
-#endif
-
-#define	__byte_swap_u64_constant(x) \
+#define	__byte_swap_u64(x) \
 	(__CAST(uint64_t, \
 	 ((((x) & 0xff00000000000000ull) >> 56) | \
 	  (((x) & 0x00ff000000000000ull) >> 40) | \
@@ -48,29 +35,68 @@ __END_DECLS
 	  (((x) & 0x000000000000ff00ull) << 40) | \
 	  (((x) & 0x00000000000000ffull) << 56))))
 
-#define	__byte_swap_u32_constant(x) \
+#define	__byte_swap_u32(x) \
 	(__CAST(uint32_t, \
 	((((x) & 0xff000000) >> 24) | \
 	 (((x) & 0x00ff0000) >>  8) | \
 	 (((x) & 0x0000ff00) <<  8) | \
 	 (((x) & 0x000000ff) << 24))))
 
-#define	__byte_swap_u16_constant(x) \
+#define	__byte_swap_u16(x) \
 	(__CAST(uint16_t, \
 	((((x) & 0xff00) >> 8) | \
 	 (((x) & 0x00ff) << 8))))
 
+/*
+ * The compiler always generates an expensive function call to bswap
+ * on some architectures, we want the inline versions there.
+ */
+#ifdef _BSWAP_IS_SLOW
+
+static __inline uint64_t __byte_swap_u64_inline(uint64_t x) {
+	return __byte_swap_u64(x);
+}
+
+static __inline uint32_t __byte_swap_u32_inline(uint32_t x) {
+	return __byte_swap_u32(x);
+}
+
+static __inline uint16_t __byte_swap_u16_inline(uint16_t x) {
+	return __byte_swap_u16(x);
+}
+
+#define	__BYTE_SWAP_U64_VARIABLE __byte_swap_u64_inline
+#define	__BYTE_SWAP_U32_VARIABLE __byte_swap_u32_inline
+#define	__BYTE_SWAP_U16_VARIABLE __byte_swap_u16_inline
+
+#else
+
+/* allow machine/bswap.h to override these with inline versions */
+#ifndef __BYTE_SWAP_U64_VARIABLE
+#define	__BYTE_SWAP_U64_VARIABLE bswap64
+#endif
+
+#ifndef __BYTE_SWAP_U32_VARIABLE
+#define	__BYTE_SWAP_U32_VARIABLE bswap32
+#endif
+
+#ifndef __BYTE_SWAP_U16_VARIABLE
+#define	__BYTE_SWAP_U16_VARIABLE bswap16
+#endif
+
+#endif /* _BSWAP_IS_SLOW */
+
 #define	bswap64(x) \
 	__CAST(uint64_t, __builtin_constant_p((x)) ? \
-	 __byte_swap_u64_constant(x) : __BYTE_SWAP_U64_VARIABLE(x))
+	 __byte_swap_u64(x) : __BYTE_SWAP_U64_VARIABLE(x))
 
 #define	bswap32(x) \
 	__CAST(uint32_t, __builtin_constant_p((x)) ? \
-	 __byte_swap_u32_constant(x) : __BYTE_SWAP_U32_VARIABLE(x))
+	 __byte_swap_u32(x) : __BYTE_SWAP_U32_VARIABLE(x))
 
 #define	bswap16(x) \
 	__CAST(uint16_t, __builtin_constant_p((x)) ? \
-	 __byte_swap_u16_constant(x) : __BYTE_SWAP_U16_VARIABLE(x))
+	 __byte_swap_u16(x) : __BYTE_SWAP_U16_VARIABLE(x))
 
 #endif /* __GNUC__ && !__lint__ */
 #endif /* !_LOCORE */
Index: sys/arch/sparc/include/bswap.h
===================================================================
RCS file: /cvsroot/src/sys/arch/sparc/include/bswap.h,v
retrieving revision 1.2
diff -u -p -r1.2 bswap.h
--- sys/arch/sparc/include/bswap.h	21 Aug 1999 05:39:55 -0000	1.2
+++ sys/arch/sparc/include/bswap.h	24 Nov 2025 13:21:44 -0000
@@ -3,6 +3,12 @@
 #ifndef _MACHINE_BSWAP_H_
 #define	_MACHINE_BSWAP_H_
 
+/*
+ * GCC fails to generate inline calls to bswapX on sparc and instead
+ * generates function calls.
+ */
+#define _BSWAP_IS_SLOW 1
+
 #include <sys/bswap.h>
 
 #endif /* !_MACHINE_BSWAP_H_ */
Index: sys/arch/sparc64/include/bswap.h
===================================================================
RCS file: /cvsroot/src/sys/arch/sparc64/include/bswap.h,v
retrieving revision 1.2
diff -u -p -r1.2 bswap.h
--- sys/arch/sparc64/include/bswap.h	21 Aug 1999 05:39:55 -0000	1.2
+++ sys/arch/sparc64/include/bswap.h	24 Nov 2025 13:21:44 -0000
@@ -3,6 +3,12 @@
 #ifndef _MACHINE_BSWAP_H_
 #define	_MACHINE_BSWAP_H_
 
+/*
+ * GCC fails to generate inline calls to bswapX on sparc and instead
+ * generates function calls.
+ */
+#define _BSWAP_IS_SLOW 1
+
 #include <sys/bswap.h>
 
 #endif /* !_MACHINE_BSWAP_H_ */


Home | Main Index | Thread Index | Old Index