Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src/lib/libc/arch/sparc64/string I seem to have stumbled on an even faster bcopy implementation....



details:   https://anonhg.NetBSD.org/src/rev/9c2491f711bb
branches:  trunk
changeset: 511990:9c2491f711bb
user:      eeh <eeh%NetBSD.org@localhost>
date:      Sun Jul 01 22:19:51 2001 +0000

description:
I seem to have stumbled on an even faster bcopy implementation....

diffstat:

 lib/libc/arch/sparc64/string/bcopy.S |  370 +++++++++++++++++++++-------------
 1 files changed, 231 insertions(+), 139 deletions(-)

diffs (truncated from 425 to 300 lines):

diff -r 0c3f04fb0032 -r 9c2491f711bb lib/libc/arch/sparc64/string/bcopy.S
--- a/lib/libc/arch/sparc64/string/bcopy.S      Sun Jul 01 21:41:58 2001 +0000
+++ b/lib/libc/arch/sparc64/string/bcopy.S      Sun Jul 01 22:19:51 2001 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: bcopy.S,v 1.1 2001/06/30 00:10:48 eeh Exp $    */
+/*     $NetBSD: bcopy.S,v 1.2 2001/07/01 22:19:51 eeh Exp $    */
 
 /*
  * Copyright (c) 2001  Eduardo E. Horvath
@@ -46,7 +46,7 @@
 #include <machine/psl.h>
 
 #if defined(LIBC_SCCS) && !defined(lint)
-       RCSID("$NetBSD: bcopy.S,v 1.1 2001/06/30 00:10:48 eeh Exp $")
+       RCSID("$NetBSD: bcopy.S,v 1.2 2001/07/01 22:19:51 eeh Exp $")
 #endif  /* LIBC_SCCS and not lint */
 
 #define        EMPTY   nop
@@ -101,7 +101,7 @@
         cmp    %o2, BCOPY_SMALL
 Lbcopy_start:
        bge     Lbcopy_fancy    ! if >= this many, go be fancy.
-        btst   7, %o0          ! (part of being fancy)
+        cmp    %o2, 256
 
        /*
         * Not much to copy, just do it a byte at a time.
@@ -124,147 +124,247 @@
        /*
         * Plenty of data to copy, so try to do it optimally.
         */
+1:
+#if 1
+       ! If it is big enough, use VIS instructions
+       bge     Lbcopy_block
+        nop
+#endif
 Lbcopy_fancy:
-       ! check for common case first: everything lines up.
-!      btst    7, %o0          ! done already
-       bne     1f
-        EMPTY
-       btst    7, %o1
-       be,a    Lbcopy_doubles
-        dec    8, %o2          ! if all lined up, len -= 8, goto bcopy_doubes
-1:
-       ! If it is big enough, use VIS instructions
-       cmp     %o2, 256
-       bge     Lbcopy_block
+
+       !!
+       !! First align the output to a 8-byte entity
+       !! 
+
+       save    %sp, -CC64FSZ, %sp
+       mov     %i0, %o0
+       mov     %i1, %o1
+       mov     %i2, %o2
+       
+       btst    1, %o1
+       bz,pt   %icc, 4f
+        btst   2, %o1
 
-       ! If the low bits match, we can make these line up.
-1:
-        xor    %o0, %o1, %o3   ! t = src ^ dst;
-       btst    1, %o3          ! if (t & 1) {
-       be      1f
-        btst   1, %o0          ! [delay slot: if (src & 1)]
+       ldub    [%o0], %o4                              ! Load 1st byte
+       dec     1, %o2
+       brlez,pn        %o2, Lbcopy_finish                      ! XXXX
+        inc    1, %o0
+       stb     %o4, [%o1]                              ! Store 1st byte
+       inc     1, %o1                                  ! Update address
+       btst    2, %o1
+4:     
+       bz,pt   %icc, 4f
+        btst   1, %o0
+       
+       bz,a    1f
+        lduh   [%o0], %o4                              ! Load short
 
-       ! low bits do not match, must copy by bytes.
-0:
-       ldsb    [%o0], %o4      !       do {
-       inc     %o0             !               (++dst)[-1] = *src++;
-       inc     %o1
-       deccc   %o2
-       bnz     0b              !       } while (--len != 0);
-        stb    %o4, [%o1 - 1]
-       retl
-        nop
-       NOTREACHED
+       ldub    [%o0], %o4                              ! Load bytes
+       ldub    [%o0+1], %o3
+       sllx    %o4, 8, %o4
+       or      %o3, %o4, %o4
+1:     
+       dec     2, %o2
+       brlez,pn        %o2, Lbcopy_finish                      ! XXXX
+        inc    2, %o0
+       sth     %o4, [%o1]                              ! Store 1st short
+       inc     2, %o1
+4:
+       btst    4, %o1
+       bz      4f
+        btst   3, %o0
 
-       ! lowest bit matches, so we can copy by words, if nothing else
-1:
-       be      1f              ! if (src & 1) {
-        btst   2, %o3          ! [delay slot: if (t & 2)]
+       bz,a    1f
+        lduw   [%o0], %o4                              ! Load word -1
 
-       ! although low bits match, both are 1: must copy 1 byte to align
-       ldsb    [%o0], %o4      !       *dst++ = *src++;
-       stb     %o4, [%o1]
-       inc     %o0
-       inc     %o1
-       dec     %o2             !       len--;
-       btst    2, %o3          ! } [if (t & 2)]
-1:
-       be      1f              ! if (t & 2) {
-        btst   2, %o0          ! [delay slot: if (src & 2)]
-       dec     2, %o2          !       len -= 2;
-0:
-       ldsh    [%o0], %o4      !       do {
-       sth     %o4, [%o1]      !               *(short *)dst = *(short *)src;
-       inc     2, %o0          !               dst += 2, src += 2;
-       deccc   2, %o2          !       } while ((len -= 2) >= 0);
-       bge     0b
-        inc    2, %o1
-       b       Lbcopy_mopb     !       goto mop_up_byte;
-        btst   1, %o2          ! } [delay slot: if (len & 1)]
-       NOTREACHED
+       btst    1, %o0
+       bz,a    2f
+        lduh   [%o0], %o4
+       
+       ldub    [%o0], %o4
+       lduh    [%o0+1], %o3
+       sllx    %o4, 16, %o4
+       or      %o4, %o3, %o4
+       ldub    [%o0+3], %o3
+       sllx    %o4, 8, %o4
+       ba      1f
+        or     %o4, %o3, %o4
+2:
+       lduh    [%o0+2], %o3
+       sllx    %o4, 16, %o4
+       or      %o4, %o3, %o4
+1:     
+       dec     4, %o2
+       brlez,pn        %o2, Lbcopy_finish              ! XXXX
+        inc    4, %o0
+       st      %o4, [%o1]                              ! Store word
+       inc     4, %o1
+4:
+       !!
+       !! We are now 32-bit aligned in the dest.
+       !!
+Lbcopy__common:        
 
-       ! low two bits match, so we can copy by longwords
-1:
-       be      1f              ! if (src & 2) {
-        btst   4, %o3          ! [delay slot: if (t & 4)]
+       and     %o0, 7, %o4                             ! Shift amount
+       andn    %o0, 7, %o3                             ! Source addr
+       sllx    %o4, 3, %o4                             ! In bits
+
+       brz     %o4, Lbcopy_noshift8
+        nop
+
+       ldx     [%o3], %l0                              ! Load word -1
+       add     %o3, 8, %o0                             ! now use %o0 for src
+       ldx     [%o0], %l1                              ! Load word 0
+
+       add     %o3, 8, %o0                             ! now use %o0 for src
+       sllx    %l0, %o4, %l0                           ! Shift high word
+       
+       mov     8<<3, %o3
+       sub     %o3, %o4, %o3                   ! Reverse shift
+       and     %o3, 0x38, %o3
+       !!
+       !! Continue until our dest is block aligned
+       !!
 
-       ! although low 2 bits match, they are 10: must copy one short to align
-       ldsh    [%o0], %o4      !       (*short *)dst = *(short *)src;
-       sth     %o4, [%o1]
-       inc     2, %o0          !       dst += 2;
-       inc     2, %o1          !       src += 2;
-       dec     2, %o2          !       len -= 2;
-       btst    4, %o3          ! } [if (t & 4)]
+       !! Unrolled 8 times
+Lbcopy_aligned8:       
+       brz     %o2, Lbcopy_finish
+        srlx   %l1, %o3, %o5                           ! Shift low word
+       
+       inc     8, %o0
+       ldx     [%o0], %l2                              ! Load next part
 1:
-       be      1f              ! if (t & 4) {
-        btst   4, %o0          ! [delay slot: if (src & 4)]
-       dec     4, %o2          !       len -= 4;
-0:
-       ld      [%o0], %o4      !       do {
-       st      %o4, [%o1]      !               *(int *)dst = *(int *)src;
-       inc     4, %o0          !               dst += 4, src += 4;
-       deccc   4, %o2          !       } while ((len -= 4) >= 0);
-       bge     0b
-        inc    4, %o1
-       b       Lbcopy_mopw     !       goto mop_up_word_and_byte;
-        btst   2, %o2          ! } [delay slot: if (len & 2)]
-       NOTREACHED
+       
+       dec     8, %o2
+       srlx    %l1, %o3, %o5                           ! Shift low word
+       brlez,pn        %o2, Lbcopy_finish      ! Should never happen
+        or     %o5, %l0, %o5                           ! Combine
+       
+       inc     8, %o0
+       sllx    %l1, %o4, %l0
+       
+       ldx     [%o0], %l3                              ! Load next part
+       stx     %o5, [%o1]                              ! Store result
+       inc     8, %o1
+
+       dec     8, %o2
+       srlx    %l2, %o3, %o5                           ! Shift low word
+       brlez,pn        %o2, Lbcopy_finish      ! Should never happen
+        or     %o5, %l0, %o5                           ! Combine
+
+       inc     8, %o0
+       sllx    %l2, %o4, %l0
+       
+       ldx     [%o0], %l4                              ! Load next part
+       stx     %o5, [%o1]                              ! Store result
+       inc     8, %o1
+
+       dec     8, %o2
+       srlx    %l3, %o3, %o5                           ! Shift low word
+       brlez,pn        %o2, Lbcopy_finish      ! Should never happen
+        or     %o5, %l0, %o5                           ! Combine
+
+       inc     8, %o0
+       sllx    %l3, %o4, %l0
+
+       ldx     [%o0], %l5                              ! Load next part
+       stx     %o5, [%o1]                              ! Store result
+       inc     8, %o1
+
+       dec     8, %o2
+       srlx    %l4, %o3, %o5                           ! Shift low word
+       brlez,pn        %o2, Lbcopy_finish      ! Should never happen
+        or     %o5, %l0, %o5                           ! Combine
 
-       ! low three bits match, so we can copy by doublewords
-1:
-       be      1f              ! if (src & 4) {
-        dec    8, %o2          ! [delay slot: len -= 8]
-       ld      [%o0], %o4      !       *(int *)dst = *(int *)src;
-       st      %o4, [%o1]
-       inc     4, %o0          !       dst += 4, src += 4, len -= 4;
-       inc     4, %o1
-       dec     4, %o2          ! }
-1:
-Lbcopy_doubles:
-       ldx     [%o0], %g5      ! do {
-       stx     %g5, [%o1]      !       *(double *)dst = *(double *)src;
-       inc     8, %o0          !       dst += 8, src += 8;
-       deccc   8, %o2          ! } while ((len -= 8) >= 0);
-       bge     Lbcopy_doubles
+       inc     8, %o0
+       sllx    %l4, %o4, %l0
+       
+       ldx     [%o0], %l6                              ! Load next part
+       stx     %o5, [%o1]                              ! Store result
+       inc     8, %o1
+
+       dec     8, %o2
+       srlx    %l5, %o3, %o5                           ! Shift low word
+       brlez,pn        %o2, Lbcopy_finish      ! Should never happen
+        or     %o5, %l0, %o5                           ! Combine
+
+       inc     8, %o0
+       sllx    %l5, %o4, %l0
+       
+       ldx     [%o0], %l7                              ! Load next part
+       stx     %o5, [%o1]                              ! Store result
+       inc     8, %o1
+
+       dec     8, %o2
+       srlx    %l6, %o3, %o5                           ! Shift low word
+       brlez,pn        %o2, Lbcopy_finish      ! Should never happen
+        or     %o5, %l0, %o5                           ! Combine
+



Home | Main Index | Thread Index | Old Index