Source-Changes-HG archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[src/trunk]: src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386 Optimize i386 support in libcrypto
details: https://anonhg.NetBSD.org/src/rev/b133edc94579
branches: trunk
changeset: 338259:b133edc94579
user: joerg <joerg%NetBSD.org@localhost>
date: Sat May 16 17:32:54 2015 +0000
description:
Optimize i386 support in libcrypto:
- Enable optional SSE2 assembler versions. Regenerate.
- Hook up assembler version of GCM.
diffstat:
crypto/external/bsd/openssl/lib/libcrypto/arch/i386/Makefile | 4 +-
crypto/external/bsd/openssl/lib/libcrypto/arch/i386/bn-586.S | 441 ++-
crypto/external/bsd/openssl/lib/libcrypto/arch/i386/ghash-x86.S | 1421 ++++++---
crypto/external/bsd/openssl/lib/libcrypto/arch/i386/modes.inc | 4 +
crypto/external/bsd/openssl/lib/libcrypto/arch/i386/sha1-586.S | 1265 ++++++++-
crypto/external/bsd/openssl/lib/libcrypto/arch/i386/sha512-586.S | 281 +-
crypto/external/bsd/openssl/lib/libcrypto/arch/i386/x86cpuid.S | 46 +-
7 files changed, 2850 insertions(+), 612 deletions(-)
diffs (truncated from 4229 to 300 lines):
diff -r e439f5190bc6 -r b133edc94579 crypto/external/bsd/openssl/lib/libcrypto/arch/i386/Makefile
--- a/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/Makefile Sat May 16 17:26:51 2015 +0000
+++ b/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/Makefile Sat May 16 17:32:54 2015 +0000
@@ -1,4 +1,4 @@
-# $NetBSD: Makefile,v 1.6 2012/07/30 10:25:24 christos Exp $
+# $NetBSD: Makefile,v 1.7 2015/05/16 17:32:54 joerg Exp $
.include "bsd.own.mk"
@@ -9,7 +9,7 @@
for i in $$(find ${OPENSSLSRC} -name \*86.pl) \
${OPENSSLSRC}/crypto/x86cpuid.pl; do \
perl -I${OPENSSLSRC}/crypto/perlasm \
- -I${OPENSSLSRC}/crypto/bn/asm $$i elf -fPIC \
+ -I${OPENSSLSRC}/crypto/bn/asm $$i elf -fPIC -DOPENSSL_IA32_SSE2 \
| sed -e 's,^\.file.*$$,#include <machine/asm.h>,' \
-e 's/ call OPENSSL_cpuid_setup/ PIC_PROLOGUE! call PIC_PLT(OPENSSL_cpuid_setup)! PIC_EPILOGUE/' | tr '!' '\n' \
> $$(basename $$i .pl).S; \
diff -r e439f5190bc6 -r b133edc94579 crypto/external/bsd/openssl/lib/libcrypto/arch/i386/bn-586.S
--- a/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/bn-586.S Sat May 16 17:26:51 2015 +0000
+++ b/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/bn-586.S Sat May 16 17:32:54 2015 +0000
@@ -5,6 +5,103 @@
.align 16
bn_mul_add_words:
.L_bn_mul_add_words_begin:
+ call .L000PIC_me_up
+.L000PIC_me_up:
+ popl %eax
+ leal _GLOBAL_OFFSET_TABLE_+[.-.L000PIC_me_up](%eax),%eax
+ movl OPENSSL_ia32cap_P@GOT(%eax),%eax
+ btl $26,(%eax)
+ jnc .L001maw_non_sse2
+ movl 4(%esp),%eax
+ movl 8(%esp),%edx
+ movl 12(%esp),%ecx
+ movd 16(%esp),%mm0
+ pxor %mm1,%mm1
+ jmp .L002maw_sse2_entry
+.align 16
+.L003maw_sse2_unrolled:
+ movd (%eax),%mm3
+ paddq %mm3,%mm1
+ movd (%edx),%mm2
+ pmuludq %mm0,%mm2
+ movd 4(%edx),%mm4
+ pmuludq %mm0,%mm4
+ movd 8(%edx),%mm6
+ pmuludq %mm0,%mm6
+ movd 12(%edx),%mm7
+ pmuludq %mm0,%mm7
+ paddq %mm2,%mm1
+ movd 4(%eax),%mm3
+ paddq %mm4,%mm3
+ movd 8(%eax),%mm5
+ paddq %mm6,%mm5
+ movd 12(%eax),%mm4
+ paddq %mm4,%mm7
+ movd %mm1,(%eax)
+ movd 16(%edx),%mm2
+ pmuludq %mm0,%mm2
+ psrlq $32,%mm1
+ movd 20(%edx),%mm4
+ pmuludq %mm0,%mm4
+ paddq %mm3,%mm1
+ movd 24(%edx),%mm6
+ pmuludq %mm0,%mm6
+ movd %mm1,4(%eax)
+ psrlq $32,%mm1
+ movd 28(%edx),%mm3
+ addl $32,%edx
+ pmuludq %mm0,%mm3
+ paddq %mm5,%mm1
+ movd 16(%eax),%mm5
+ paddq %mm5,%mm2
+ movd %mm1,8(%eax)
+ psrlq $32,%mm1
+ paddq %mm7,%mm1
+ movd 20(%eax),%mm5
+ paddq %mm5,%mm4
+ movd %mm1,12(%eax)
+ psrlq $32,%mm1
+ paddq %mm2,%mm1
+ movd 24(%eax),%mm5
+ paddq %mm5,%mm6
+ movd %mm1,16(%eax)
+ psrlq $32,%mm1
+ paddq %mm4,%mm1
+ movd 28(%eax),%mm5
+ paddq %mm5,%mm3
+ movd %mm1,20(%eax)
+ psrlq $32,%mm1
+ paddq %mm6,%mm1
+ movd %mm1,24(%eax)
+ psrlq $32,%mm1
+ paddq %mm3,%mm1
+ movd %mm1,28(%eax)
+ leal 32(%eax),%eax
+ psrlq $32,%mm1
+ subl $8,%ecx
+ jz .L004maw_sse2_exit
+.L002maw_sse2_entry:
+ testl $4294967288,%ecx
+ jnz .L003maw_sse2_unrolled
+.align 4
+.L005maw_sse2_loop:
+ movd (%edx),%mm2
+ movd (%eax),%mm3
+ pmuludq %mm0,%mm2
+ leal 4(%edx),%edx
+ paddq %mm3,%mm1
+ paddq %mm2,%mm1
+ movd %mm1,(%eax)
+ subl $1,%ecx
+ psrlq $32,%mm1
+ leal 4(%eax),%eax
+ jnz .L005maw_sse2_loop
+.L004maw_sse2_exit:
+ movd %mm1,%eax
+ emms
+ ret
+.align 16
+.L001maw_non_sse2:
pushl %ebp
pushl %ebx
pushl %esi
@@ -17,9 +114,9 @@
andl $4294967288,%ecx
movl 32(%esp),%ebp
pushl %ecx
- jz .L000maw_finish
+ jz .L006maw_finish
.align 16
-.L001maw_loop:
+.L007maw_loop:
movl (%ebx),%eax
mull %ebp
@@ -96,13 +193,13 @@
subl $8,%ecx
leal 32(%ebx),%ebx
leal 32(%edi),%edi
- jnz .L001maw_loop
-.L000maw_finish:
+ jnz .L007maw_loop
+.L006maw_finish:
movl 32(%esp),%ecx
andl $7,%ecx
- jnz .L002maw_finish2
- jmp .L003maw_end
-.L002maw_finish2:
+ jnz .L008maw_finish2
+ jmp .L009maw_end
+.L008maw_finish2:
movl (%ebx),%eax
mull %ebp
@@ -113,7 +210,7 @@
decl %ecx
movl %eax,(%edi)
movl %edx,%esi
- jz .L003maw_end
+ jz .L009maw_end
movl 4(%ebx),%eax
mull %ebp
@@ -124,7 +221,7 @@
decl %ecx
movl %eax,4(%edi)
movl %edx,%esi
- jz .L003maw_end
+ jz .L009maw_end
movl 8(%ebx),%eax
mull %ebp
@@ -135,7 +232,7 @@
decl %ecx
movl %eax,8(%edi)
movl %edx,%esi
- jz .L003maw_end
+ jz .L009maw_end
movl 12(%ebx),%eax
mull %ebp
@@ -146,7 +243,7 @@
decl %ecx
movl %eax,12(%edi)
movl %edx,%esi
- jz .L003maw_end
+ jz .L009maw_end
movl 16(%ebx),%eax
mull %ebp
@@ -157,7 +254,7 @@
decl %ecx
movl %eax,16(%edi)
movl %edx,%esi
- jz .L003maw_end
+ jz .L009maw_end
movl 20(%ebx),%eax
mull %ebp
@@ -168,7 +265,7 @@
decl %ecx
movl %eax,20(%edi)
movl %edx,%esi
- jz .L003maw_end
+ jz .L009maw_end
movl 24(%ebx),%eax
mull %ebp
@@ -178,7 +275,7 @@
adcl $0,%edx
movl %eax,24(%edi)
movl %edx,%esi
-.L003maw_end:
+.L009maw_end:
movl %esi,%eax
popl %ecx
popl %edi
@@ -192,6 +289,34 @@
.align 16
bn_mul_words:
.L_bn_mul_words_begin:
+ call .L010PIC_me_up
+.L010PIC_me_up:
+ popl %eax
+ leal _GLOBAL_OFFSET_TABLE_+[.-.L010PIC_me_up](%eax),%eax
+ movl OPENSSL_ia32cap_P@GOT(%eax),%eax
+ btl $26,(%eax)
+ jnc .L011mw_non_sse2
+ movl 4(%esp),%eax
+ movl 8(%esp),%edx
+ movl 12(%esp),%ecx
+ movd 16(%esp),%mm0
+ pxor %mm1,%mm1
+.align 16
+.L012mw_sse2_loop:
+ movd (%edx),%mm2
+ pmuludq %mm0,%mm2
+ leal 4(%edx),%edx
+ paddq %mm2,%mm1
+ movd %mm1,(%eax)
+ subl $1,%ecx
+ psrlq $32,%mm1
+ leal 4(%eax),%eax
+ jnz .L012mw_sse2_loop
+ movd %mm1,%eax
+ emms
+ ret
+.align 16
+.L011mw_non_sse2:
pushl %ebp
pushl %ebx
pushl %esi
@@ -203,8 +328,8 @@
movl 28(%esp),%ebp
movl 32(%esp),%ecx
andl $4294967288,%ebp
- jz .L004mw_finish
-.L005mw_loop:
+ jz .L013mw_finish
+.L014mw_loop:
movl (%ebx),%eax
mull %ecx
@@ -265,14 +390,14 @@
addl $32,%ebx
addl $32,%edi
subl $8,%ebp
- jz .L004mw_finish
- jmp .L005mw_loop
-.L004mw_finish:
+ jz .L013mw_finish
+ jmp .L014mw_loop
+.L013mw_finish:
movl 28(%esp),%ebp
andl $7,%ebp
- jnz .L006mw_finish2
- jmp .L007mw_end
-.L006mw_finish2:
+ jnz .L015mw_finish2
+ jmp .L016mw_end
+.L015mw_finish2:
movl (%ebx),%eax
mull %ecx
@@ -281,7 +406,7 @@
movl %eax,(%edi)
movl %edx,%esi
decl %ebp
- jz .L007mw_end
+ jz .L016mw_end
movl 4(%ebx),%eax
mull %ecx
@@ -290,7 +415,7 @@
movl %eax,4(%edi)
movl %edx,%esi
decl %ebp
- jz .L007mw_end
Home |
Main Index |
Thread Index |
Old Index