Port-sparc archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
Re: bswap is slow on SPARC
On 11/24/25 14:55, Sad Clouds wrote:
> On Mon, 24 Nov 2025 13:30:50 +0000
> nia <nia%NetBSD.org@localhost> wrote:
>> When encryption algorithms expect to be able to encode little-endian
>> integers in a tight loop, that adds up to being *slow*. Especially
>> when NetBSD encrypts swap by default now.
> I wonder, why store data in a little-endian byte order on a big-endian
> architecture? Is it that common to access the same swap partition
> across multiple architectures?
It can just make life easier to store data in the most commonly used
format. Swapping around 8 bytes from end to end is no big deal, and the
code all looks like this:
uint64_t result, b[8] = {0,0,0,0,0,0,0,0};
b[0] = ( ( x & 0xff ) << 56 );
b[1] = ( ( x & 0xff00 ) << 40 );
b[2] = ( ( x & 0xff0000 ) << 24 );
b[3] = ( ( x & 0xff000000 ) << 8 );
b[4] = ( ( x & 0xff00000000 ) >> 8 );
b[5] = ( ( x & 0xff0000000000 ) >> 24 );
b[6] = ( ( x & 0xff000000000000 ) >> 40 );
b[7] = ( x >> 56 );
result = b[0] | b[1] | b[2] | b[3] | b[4] | b[5] | b[6] | b[7];
The final output asm is generally about the same :
000118 ! predecessor blocks: .L14
000119
000120 .L15:
000121
000122 ! 50 ! uint64_t result, b[8] = {0,0,0,0,0,0,0,0};
000123
000124
000125 ! predecessor blocks: .L15
000126
000127 .L19:
000128 /* 0x0020 50 */ stx %g0,[%fp+1959]
000129 /* 0x0024 */ stx %g0,[%fp+1967]
000130 /* 0x0028 */ stx %g0,[%fp+1975]
000131 /* 0x002c */ stx %g0,[%fp+1983]
000132 /* 0x0030 */ stx %g0,[%fp+1991]
000133 /* 0x0034 */ stx %g0,[%fp+1999]
000134 /* 0x0038 */ stx %g0,[%fp+2007]
000135 /* 0x003c */ stx %g0,[%fp+2015]
000136
000137 ! 52 ! b[0] = ( ( x & 0xff ) << 56 );
000138
000139
000140 ! predecessor blocks: .L19
000141
000142 .L20:
000143 /* 0x0040 52 */ ldx [%fp+2039],%o0
000144 /* 0x0044 */ and %o0,255,%o0
000145 /* 0x0048 */ sllx %o0,56,%o0
000146 /* 0x004c */ stx %o0,[%fp+1959] ! volatile
000147
000148 ! 53 ! b[1] = ( ( x & 0xff00 ) << 40 );
000149
000150
000151 ! predecessor blocks: .L20
000152
000153 .L21:
000154 /* 0x0050 53 */ ldx [%fp+2039],%o0
000155 /* 0x0054 */ sethi %hi(0xfc00),%o1
000156 /* 0x0058 */ add %o1,768,%o1
000157 /* 0x005c */ and %o0,%o1,%o0
000158 /* 0x0060 */ sllx %o0,40,%o0
000159 /* 0x0064 */ stx %o0,[%fp+1967] ! volatile
000160
000161 ! 54 ! b[2] = ( ( x & 0xff0000 ) << 24 );
000162
000163
000164 ! predecessor blocks: .L21
000165
000166 .L22:
000167 /* 0x0068 54 */ ldx [%fp+2039],%o0
000168 /* 0x006c */ sethi %hi(0xff0000),%o1
000169 /* 0x0070 */ and %o0,%o1,%o0
000170 /* 0x0074 */ sllx %o0,24,%o0
000171 /* 0x0078 */ stx %o0,[%fp+1975] ! volatile
000172
000173 ! 55 ! b[3] = ( ( x & 0xff000000 ) << 8 );
000174
000175
000176 ! predecessor blocks: .L22
000177
000178 .L23:
000179 /* 0x007c 55 */ ldx [%fp+2039],%o0
000180 /* 0x0080 */ sethi %hi(0xff000000),%o1
000181 /* 0x0084 */ and %o0,%o1,%o0
000182 /* 0x0088 */ sllx %o0,8,%o0
000183 /* 0x008c */ stx %o0,[%fp+1983] ! volatile
000184
000185 ! 56 ! b[4] = ( ( x & 0xff00000000 ) >> 8 );
000186
000187
000188 ! predecessor blocks: .L23
000189
000190 .L24:
000191 /* 0x0090 56 */ ldx [%fp+2039],%o0
000192 /* 0x0094 */ sethi %hi(0x3fc00),%o1
000193 /* 0x0098 */ sllx %o1,22,%o1
000194 /* 0x009c */ and %o0,%o1,%o0
000195 /* 0x00a0 */ srlx %o0,8,%o0
000196 /* 0x00a4 */ stx %o0,[%fp+1991] ! volatile
000197
000198 ! 57 ! b[5] = ( ( x & 0xff0000000000 ) >> 24 );
000199
000200
000201 ! predecessor blocks: .L24
000202
000203 .L25:
000204 /* 0x00a8 57 */ ldx [%fp+2039],%o0
000205 /* 0x00ac */ sethi %hi(0x3fc0000),%o1
000206 /* 0x00b0 */ sllx %o1,22,%o1
000207 /* 0x00b4 */ and %o0,%o1,%o0
000208 /* 0x00b8 */ srlx %o0,24,%o0
000209 /* 0x00bc */ stx %o0,[%fp+1999] ! volatile
000210
000211 ! 58 ! b[6] = ( ( x & 0xff000000000000 ) >> 40 );
000212
000213
000214 ! predecessor blocks: .L25
000215
000216 .L26:
000217 /* 0x00c0 58 */ ldx [%fp+2039],%o0
000218 /* 0x00c4 */ sethi %hi(0xff000000),%o1
000219 /* 0x00c8 */ sllx %o1,24,%o1
000220 /* 0x00cc */ and %o0,%o1,%o0
000221 /* 0x00d0 */ srlx %o0,40,%o0
000222 /* 0x00d4 */ stx %o0,[%fp+2007] ! volatile
000223
000224 ! 59 ! b[7] = ( x >> 56 );
000225
000226
000227 ! predecessor blocks: .L26
000228
000229 .L27:
000230 /* 0x00d8 59 */ ldx [%fp+2039],%o0
000231 /* 0x00dc */ srlx %o0,56,%o0
000232 /* 0x00e0 */ stx %o0,[%fp+2015] ! volatile
000233
000234 ! 61 ! result = b[0] | b[1] | b[2] | b[3] | b[4] | b[5] | b[6] | b[7];
000235
000236
000237 ! predecessor blocks: .L27
000238
000239 .L28:
000240 /* 0x00e4 61 */ ldx [%fp+1959],%o0
000241 /* 0x00e8 */ ldx [%fp+1967],%o1
000242 /* 0x00ec */ or %o0,%o1,%o0
000243 /* 0x00f0 */ ldx [%fp+1975],%o1
000244 /* 0x00f4 */ or %o0,%o1,%o0
000245 /* 0x00f8 */ ldx [%fp+1983],%o1
000246 /* 0x00fc */ or %o0,%o1,%o0
000247 /* 0x0100 */ ldx [%fp+1991],%o1
000248 /* 0x0104 */ or %o0,%o1,%o0
000249 /* 0x0108 */ ldx [%fp+1999],%o1
000250 /* 0x010c */ or %o0,%o1,%o0
000251 /* 0x0110 */ ldx [%fp+2007],%o1
000252 /* 0x0114 */ or %o0,%o1,%o0
000253 /* 0x0118 */ ldx [%fp+2015],%o1
000254 /* 0x011c */ or %o0,%o1,%o0
000255 /* 0x0120 */ stx %o0,[%fp+2023]
000256
000257 ! 63 ! return result;
000258
000259
000260 ! predecessor blocks: .L28
000261
000262 .L29:
000263 /* 0x0124 63 */ ldx [%fp+2023],%o0
000264 /* 0x0128 */ stx %o0,[%fp+2031]
000265 /* 0x012c */ ba .L12
000266 /* 0x0130 */ nop
000267
000268
000269 .L30:
000270
000271 ! predecessor blocks: .L30
000272
000273 .L31:
000274
000275 ! predecessor blocks: .L29 .L31
000276
000277 .L12:
000278 /* 0x0134 63 */ ldx [%fp+2031],%i0
000279 /* 0x0138 */ ret ! Result = %i0
000280 /* 0x013c */ restore %g0,%g0,%g0
000281 /* 0x0140 0 */ .type swap_eight,#function
000282 /* 0x0140 0 */ .size swap_eight,(.-swap_eight)
000283
000284
That is from the ORACLE Studio 12.6 C90 clean compiler on a SPARC-S7, and
the output from GCC does pretty much the same thing. However, it does it
in a weird order, with the same result. The really tight optimal stuff
looks like this:
000096 ! predecessor blocks: swap_eight
000097
000098 .L900000109:
000099 /* 000000 0 */ sethi %hi(0xff000000),%g5
000100
000101 ! 50 ! uint64_t result, b[8] = {0,0,0,0,0,0,0,0};
000102 ! 52 ! b[0] = ( ( x & 0xff ) << 56 );
000103 ! 53 ! b[1] = ( ( x & 0xff00 ) << 40 );
000104 ! 54 ! b[2] = ( ( x & 0xff0000 ) << 24 );
000105 ! 55 ! b[3] = ( ( x & 0xff000000 ) << 8 );
000106 ! 56 ! b[4] = ( ( x & 0xff00000000 ) >> 8 );
000107 ! 57 ! b[5] = ( ( x & 0xff0000000000 ) >> 24 );
000108 ! 58 ! b[6] = ( ( x & 0xff000000000000 ) >> 40 );
000109 ! 59 ! b[7] = ( x >> 56 );
000110 ! 61 ! result = b[0] | b[1] | b[2] | b[3] | b[4] | b[5] | b[6] | b[7];
000111 ! 63 ! return result;
000112
000113 /* 0x0004 63 */ sethi %hi(0xfc00),%g4
000114 /* 0x0008 */ and %o0,%g5,%o5
000115 /* 0x000c */ sllx %g5,24,%g5
000116 /* 0x0010 */ sllx %o5,8,%o1
000117 /* 0x0014 */ sethi %hi(0xff0000),%o4
000118 /* 0x0018 */ add %g4,768,%g1
000119 /* 0x001c */ and %o0,%o4,%o3
000120 /* 0x0020 */ sllx %o3,24,%o2
000121 /* 0x0024 */ sllx %o0,56,%o3
000122 /* 0x0028 */ and %o0,%g1,%o5
000123 /* 0x002c */ or %o1,%o2,%o2
000124 /* 0x0030 */ sllx %o5,40,%o4
000125 /* 0x0034 */ sethi %hi(0x3fc0000),%g1
000126 /* 0x0038 */ sllx %g1,22,%o5
000127 /* 0x003c */ or %o3,%o4,%o1
000128 /* 0x0040 */ sethi %hi(0x3fc00),%o3
000129 /* 0x0044 */ or %o1,%o2,%g4
000130 /* 0x0048 */ sllx %o3,22,%o2
000131 /* 0x004c */ and %o0,%o5,%o4
000132 /* 0x0050 */ srlx %o4,24,%o1
000133 /* 0x0054 */ and %o0,%o2,%g1
000134 /* 0x0058 */ srlx %g1,8,%o5
000135 /* 0x005c */ and %o0,%g5,%o3
000136 /* 0x0060 */ srlx %o0,56,%o0
000137 /* 0x0064 */ srlx %o3,40,%o2
000138 /* 0x0068 */ or %o1,%g4,%o4
000139 /* 0x006c */ or %o0,%o2,%g1
000140 /* 0x0070 */ or %o5,%o4,%g4
000141
000142 ! 65 !}
000143
000144 /* 0x0074 65 */ retl ! Result = %o0
000145 /* 0x0078 63 */ or %g4,%g1,%o0
000146 /* 0x007c 0 */ .type swap_eight,#function
000147 /* 0x007c 0 */ .size swap_eight,(.-swap_eight)
000148
000149
However, truth be told, it is just plain convenient to store data in the
little-endian format when dealing with a whack of machines. Most of them
are going to be little endian in this millennium :)
--
--
Dennis Clarke
RISC-V/SPARC/PPC/ARM/CISC
UNIX and Linux spoken
ps: IBM POWER9 big endian hardware looks about the same.
Home |
Main Index |
Thread Index |
Old Index