Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src/crypto/external/bsd/openssl/lib/libcrypto/arch/i386 Optimize i386 support in libcrypto



details:   https://anonhg.NetBSD.org/src/rev/b133edc94579
branches:  trunk
changeset: 338259:b133edc94579
user:      joerg <joerg%NetBSD.org@localhost>
date:      Sat May 16 17:32:54 2015 +0000

description:
Optimize i386 support in libcrypto:
- Enable optional SSE2 assembler versions. Regenerate.
- Hook up assembler version of GCM.

diffstat:

 crypto/external/bsd/openssl/lib/libcrypto/arch/i386/Makefile     |     4 +-
 crypto/external/bsd/openssl/lib/libcrypto/arch/i386/bn-586.S     |   441 ++-
 crypto/external/bsd/openssl/lib/libcrypto/arch/i386/ghash-x86.S  |  1421 ++++++---
 crypto/external/bsd/openssl/lib/libcrypto/arch/i386/modes.inc    |     4 +
 crypto/external/bsd/openssl/lib/libcrypto/arch/i386/sha1-586.S   |  1265 ++++++++-
 crypto/external/bsd/openssl/lib/libcrypto/arch/i386/sha512-586.S |   281 +-
 crypto/external/bsd/openssl/lib/libcrypto/arch/i386/x86cpuid.S   |    46 +-
 7 files changed, 2850 insertions(+), 612 deletions(-)

diffs (truncated from 4229 to 300 lines):

diff -r e439f5190bc6 -r b133edc94579 crypto/external/bsd/openssl/lib/libcrypto/arch/i386/Makefile
--- a/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/Makefile      Sat May 16 17:26:51 2015 +0000
+++ b/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/Makefile      Sat May 16 17:32:54 2015 +0000
@@ -1,4 +1,4 @@
-#      $NetBSD: Makefile,v 1.6 2012/07/30 10:25:24 christos Exp $
+#      $NetBSD: Makefile,v 1.7 2015/05/16 17:32:54 joerg Exp $
 
 .include "bsd.own.mk"
 
@@ -9,7 +9,7 @@
        for i in $$(find ${OPENSSLSRC} -name \*86.pl) \
                  ${OPENSSLSRC}/crypto/x86cpuid.pl; do \
                perl -I${OPENSSLSRC}/crypto/perlasm \
-               -I${OPENSSLSRC}/crypto/bn/asm $$i elf -fPIC \
+               -I${OPENSSLSRC}/crypto/bn/asm $$i elf -fPIC -DOPENSSL_IA32_SSE2 \
                | sed -e 's,^\.file.*$$,#include <machine/asm.h>,' \
                        -e 's/  call    OPENSSL_cpuid_setup/    PIC_PROLOGUE!   call    PIC_PLT(OPENSSL_cpuid_setup)!   PIC_EPILOGUE/' | tr '!' '\n' \
                > $$(basename $$i .pl).S; \
diff -r e439f5190bc6 -r b133edc94579 crypto/external/bsd/openssl/lib/libcrypto/arch/i386/bn-586.S
--- a/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/bn-586.S      Sat May 16 17:26:51 2015 +0000
+++ b/crypto/external/bsd/openssl/lib/libcrypto/arch/i386/bn-586.S      Sat May 16 17:32:54 2015 +0000
@@ -5,6 +5,103 @@
 .align 16
 bn_mul_add_words:
 .L_bn_mul_add_words_begin:
+       call    .L000PIC_me_up
+.L000PIC_me_up:
+       popl    %eax
+       leal    _GLOBAL_OFFSET_TABLE_+[.-.L000PIC_me_up](%eax),%eax
+       movl    OPENSSL_ia32cap_P@GOT(%eax),%eax
+       btl     $26,(%eax)
+       jnc     .L001maw_non_sse2
+       movl    4(%esp),%eax
+       movl    8(%esp),%edx
+       movl    12(%esp),%ecx
+       movd    16(%esp),%mm0
+       pxor    %mm1,%mm1
+       jmp     .L002maw_sse2_entry
+.align 16
+.L003maw_sse2_unrolled:
+       movd    (%eax),%mm3
+       paddq   %mm3,%mm1
+       movd    (%edx),%mm2
+       pmuludq %mm0,%mm2
+       movd    4(%edx),%mm4
+       pmuludq %mm0,%mm4
+       movd    8(%edx),%mm6
+       pmuludq %mm0,%mm6
+       movd    12(%edx),%mm7
+       pmuludq %mm0,%mm7
+       paddq   %mm2,%mm1
+       movd    4(%eax),%mm3
+       paddq   %mm4,%mm3
+       movd    8(%eax),%mm5
+       paddq   %mm6,%mm5
+       movd    12(%eax),%mm4
+       paddq   %mm4,%mm7
+       movd    %mm1,(%eax)
+       movd    16(%edx),%mm2
+       pmuludq %mm0,%mm2
+       psrlq   $32,%mm1
+       movd    20(%edx),%mm4
+       pmuludq %mm0,%mm4
+       paddq   %mm3,%mm1
+       movd    24(%edx),%mm6
+       pmuludq %mm0,%mm6
+       movd    %mm1,4(%eax)
+       psrlq   $32,%mm1
+       movd    28(%edx),%mm3
+       addl    $32,%edx
+       pmuludq %mm0,%mm3
+       paddq   %mm5,%mm1
+       movd    16(%eax),%mm5
+       paddq   %mm5,%mm2
+       movd    %mm1,8(%eax)
+       psrlq   $32,%mm1
+       paddq   %mm7,%mm1
+       movd    20(%eax),%mm5
+       paddq   %mm5,%mm4
+       movd    %mm1,12(%eax)
+       psrlq   $32,%mm1
+       paddq   %mm2,%mm1
+       movd    24(%eax),%mm5
+       paddq   %mm5,%mm6
+       movd    %mm1,16(%eax)
+       psrlq   $32,%mm1
+       paddq   %mm4,%mm1
+       movd    28(%eax),%mm5
+       paddq   %mm5,%mm3
+       movd    %mm1,20(%eax)
+       psrlq   $32,%mm1
+       paddq   %mm6,%mm1
+       movd    %mm1,24(%eax)
+       psrlq   $32,%mm1
+       paddq   %mm3,%mm1
+       movd    %mm1,28(%eax)
+       leal    32(%eax),%eax
+       psrlq   $32,%mm1
+       subl    $8,%ecx
+       jz      .L004maw_sse2_exit
+.L002maw_sse2_entry:
+       testl   $4294967288,%ecx
+       jnz     .L003maw_sse2_unrolled
+.align 4
+.L005maw_sse2_loop:
+       movd    (%edx),%mm2
+       movd    (%eax),%mm3
+       pmuludq %mm0,%mm2
+       leal    4(%edx),%edx
+       paddq   %mm3,%mm1
+       paddq   %mm2,%mm1
+       movd    %mm1,(%eax)
+       subl    $1,%ecx
+       psrlq   $32,%mm1
+       leal    4(%eax),%eax
+       jnz     .L005maw_sse2_loop
+.L004maw_sse2_exit:
+       movd    %mm1,%eax
+       emms
+       ret
+.align 16
+.L001maw_non_sse2:
        pushl   %ebp
        pushl   %ebx
        pushl   %esi
@@ -17,9 +114,9 @@
        andl    $4294967288,%ecx
        movl    32(%esp),%ebp
        pushl   %ecx
-       jz      .L000maw_finish
+       jz      .L006maw_finish
 .align 16
-.L001maw_loop:
+.L007maw_loop:
 
        movl    (%ebx),%eax
        mull    %ebp
@@ -96,13 +193,13 @@
        subl    $8,%ecx
        leal    32(%ebx),%ebx
        leal    32(%edi),%edi
-       jnz     .L001maw_loop
-.L000maw_finish:
+       jnz     .L007maw_loop
+.L006maw_finish:
        movl    32(%esp),%ecx
        andl    $7,%ecx
-       jnz     .L002maw_finish2
-       jmp     .L003maw_end
-.L002maw_finish2:
+       jnz     .L008maw_finish2
+       jmp     .L009maw_end
+.L008maw_finish2:
 
        movl    (%ebx),%eax
        mull    %ebp
@@ -113,7 +210,7 @@
        decl    %ecx
        movl    %eax,(%edi)
        movl    %edx,%esi
-       jz      .L003maw_end
+       jz      .L009maw_end
 
        movl    4(%ebx),%eax
        mull    %ebp
@@ -124,7 +221,7 @@
        decl    %ecx
        movl    %eax,4(%edi)
        movl    %edx,%esi
-       jz      .L003maw_end
+       jz      .L009maw_end
 
        movl    8(%ebx),%eax
        mull    %ebp
@@ -135,7 +232,7 @@
        decl    %ecx
        movl    %eax,8(%edi)
        movl    %edx,%esi
-       jz      .L003maw_end
+       jz      .L009maw_end
 
        movl    12(%ebx),%eax
        mull    %ebp
@@ -146,7 +243,7 @@
        decl    %ecx
        movl    %eax,12(%edi)
        movl    %edx,%esi
-       jz      .L003maw_end
+       jz      .L009maw_end
 
        movl    16(%ebx),%eax
        mull    %ebp
@@ -157,7 +254,7 @@
        decl    %ecx
        movl    %eax,16(%edi)
        movl    %edx,%esi
-       jz      .L003maw_end
+       jz      .L009maw_end
 
        movl    20(%ebx),%eax
        mull    %ebp
@@ -168,7 +265,7 @@
        decl    %ecx
        movl    %eax,20(%edi)
        movl    %edx,%esi
-       jz      .L003maw_end
+       jz      .L009maw_end
 
        movl    24(%ebx),%eax
        mull    %ebp
@@ -178,7 +275,7 @@
        adcl    $0,%edx
        movl    %eax,24(%edi)
        movl    %edx,%esi
-.L003maw_end:
+.L009maw_end:
        movl    %esi,%eax
        popl    %ecx
        popl    %edi
@@ -192,6 +289,34 @@
 .align 16
 bn_mul_words:
 .L_bn_mul_words_begin:
+       call    .L010PIC_me_up
+.L010PIC_me_up:
+       popl    %eax
+       leal    _GLOBAL_OFFSET_TABLE_+[.-.L010PIC_me_up](%eax),%eax
+       movl    OPENSSL_ia32cap_P@GOT(%eax),%eax
+       btl     $26,(%eax)
+       jnc     .L011mw_non_sse2
+       movl    4(%esp),%eax
+       movl    8(%esp),%edx
+       movl    12(%esp),%ecx
+       movd    16(%esp),%mm0
+       pxor    %mm1,%mm1
+.align 16
+.L012mw_sse2_loop:
+       movd    (%edx),%mm2
+       pmuludq %mm0,%mm2
+       leal    4(%edx),%edx
+       paddq   %mm2,%mm1
+       movd    %mm1,(%eax)
+       subl    $1,%ecx
+       psrlq   $32,%mm1
+       leal    4(%eax),%eax
+       jnz     .L012mw_sse2_loop
+       movd    %mm1,%eax
+       emms
+       ret
+.align 16
+.L011mw_non_sse2:
        pushl   %ebp
        pushl   %ebx
        pushl   %esi
@@ -203,8 +328,8 @@
        movl    28(%esp),%ebp
        movl    32(%esp),%ecx
        andl    $4294967288,%ebp
-       jz      .L004mw_finish
-.L005mw_loop:
+       jz      .L013mw_finish
+.L014mw_loop:
 
        movl    (%ebx),%eax
        mull    %ecx
@@ -265,14 +390,14 @@
        addl    $32,%ebx
        addl    $32,%edi
        subl    $8,%ebp
-       jz      .L004mw_finish
-       jmp     .L005mw_loop
-.L004mw_finish:
+       jz      .L013mw_finish
+       jmp     .L014mw_loop
+.L013mw_finish:
        movl    28(%esp),%ebp
        andl    $7,%ebp
-       jnz     .L006mw_finish2
-       jmp     .L007mw_end
-.L006mw_finish2:
+       jnz     .L015mw_finish2
+       jmp     .L016mw_end
+.L015mw_finish2:
 
        movl    (%ebx),%eax
        mull    %ecx
@@ -281,7 +406,7 @@
        movl    %eax,(%edi)
        movl    %edx,%esi
        decl    %ebp
-       jz      .L007mw_end
+       jz      .L016mw_end
 
        movl    4(%ebx),%eax
        mull    %ecx
@@ -290,7 +415,7 @@
        movl    %eax,4(%edi)
        movl    %edx,%esi
        decl    %ebp
-       jz      .L007mw_end



Home | Main Index | Thread Index | Old Index