Source-Changes-HG archive


[src/trunk]: src/lib/libc/arch/sparc64/string New bzero() using block store i...



details:   https://anonhg.NetBSD.org/src/rev/1956b56653ce
branches:  trunk
changeset: 480007:1956b56653ce
user:      eeh <eeh%NetBSD.org@localhost>
date:      Thu Dec 30 15:31:39 1999 +0000

description:
New bzero() using block store insns.
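
The memset entry point added in this change replicates the fill byte across all
64 bits of a register (a chain of sllx/or steps) before falling through to the
shared bzero path, so each aligned store writes eight bytes of the pattern at
once.  A minimal C sketch of that replication step, for illustration only (the
function name is mine, not from the source):

    #include <stdint.h>

    /* Spread the low byte of c across a 64-bit word, as the sllx/or
     * sequence in the memset entry of bzero.S does. */
    uint64_t
    spread_byte(int c)
    {
            uint64_t pat = (uint64_t)c & 0xff;

            pat |= pat << 8;        /* pattern now fills 16 bits */
            pat |= pat << 16;       /* 32 bits */
            pat |= pat << 32;       /* 64 bits */
            return (pat);
    }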

diffstat:

 lib/libc/arch/sparc64/string/bzero.S |  294 +++++++++++++++++++++++++---------
 1 files changed, 212 insertions(+), 82 deletions(-)

diffs (truncated from 335 to 300 lines):

diff -r 8a07920ad508 -r 1956b56653ce lib/libc/arch/sparc64/string/bzero.S
--- a/lib/libc/arch/sparc64/string/bzero.S      Thu Dec 30 15:30:26 1999 +0000
+++ b/lib/libc/arch/sparc64/string/bzero.S      Thu Dec 30 15:31:39 1999 +0000
@@ -1,7 +1,7 @@
-/*     $NetBSD: bzero.S,v 1.1 1998/09/11 04:56:32 eeh Exp $    */
+/*     $NetBSD: bzero.S,v 1.2 1999/12/30 15:31:39 eeh Exp $    */
 
 /*
- * Copyright (c) 1992, 1993
+ * Copyright (c) 1992, 1993, 1999
  *     The Regents of the University of California.  All rights reserved.
  *
  * This software was developed by the Computer Systems Engineering group
@@ -40,111 +40,241 @@
  */
 
 #include <machine/asm.h>
+#ifndef _LOCORE
+#define _LOCORE
+#endif
+#include <machine/ctlreg.h>
+#include <machine/frame.h>
+#include <machine/psl.h>
+
 #if defined(LIBC_SCCS) && !defined(lint)
 #if 0
        .asciz "@(#)bzero.s     8.1 (Berkeley) 6/4/93"
 #else
-       RCSID("$NetBSD: bzero.S,v 1.1 1998/09/11 04:56:32 eeh Exp $")
+       RCSID("$NetBSD: bzero.S,v 1.2 1999/12/30 15:31:39 eeh Exp $")
 #endif
 #endif  /* LIBC_SCCS and not lint */
 
+#ifdef MEMSET
+/*
+ * memset(addr, c, len)
+ *
+ * Duplicate the pattern so it fills 64-bits, then swap around the
+ * arguments and call bzero.
+ */
+ENTRY(memset)
+       and     %o1, 0x0ff, %o3
+       mov     %o2, %o1
+       sllx    %o3, 8, %o2
+       or      %o2, %o3, %o2
+       mov     %o0, %o4                ! Save original pointer
+       sllx    %o2, 16, %o3
+       or      %o2, %o3, %o2
+       sllx    %o2, 32, %o3
+       or      %o2, %o3, %o2
+#else
 /*
  * bzero(addr, len)
  *
- * We should unroll the loop, but at the moment this would
- * gain nothing since the `std' instructions are what limits us.
+ * We want to use VIS instructions if we're clearing out more than
+ * 256 bytes, but to do that we need to properly save and restore the
+ * FP registers.  Unfortunately the code to do that in the kernel needs
+ * to keep track of the current owner of the FPU, hence the different
+ * code.
+ *
  */
 ENTRY(bzero)
        ! %o0 = addr, %o1 = len
-
-       ! Optimize a common case: addr and len are both multiples of 8.
-       or      %o0, %o1, %o2
-       btst    7, %o2                  ! ((addr | len) & 7) != 0?
-       bnz,pt  %icc, 1f                ! if so, cannot optimize
-        cmp    %o1, 15                 ! len >= 15? -- 1st instr of 1: below
-
-       /* `Good' operands, can just store doubles. */
+       clr     %o2                     ! Initialize our pattern
+#endif
+Lbzero_internal:
+       brz,pn  %o1, Lbzero_done        ! No bytes to copy??
+!       cmp    %o1, 8                  ! Less than 8 bytes to go?
+!      ble,a,pn        %icc, Lbzero_small      ! Do it byte at a time.
+!       deccc  8, %o1                  ! pre-decrement
+       
+        btst   7, %o0                  ! 64-bit aligned?  Optimization
+       bz,pt   %xcc, 2f
+        btst   3, %o0                  ! 32-bit aligned?
+       bz,pt   %xcc, 1f
+        btst   1, %o0                  ! 16-bit aligned?
+       bz,pt   %xcc, 0f
+        btst   3, %o0
+       
+       !! unaligned -- store 1 byte
+       stb     %o2, [%o0]
+       dec     1, %o1                  ! Record storing 1 byte
+       inc     %o0
+       cmp     %o1, 2
+       bl,a,pn %icc, 7f                ! 1 or 0 left
+        dec    8, %o1                  ! Fixup count -8
 0:
-       deccc   8, %o1                  ! while ((len -= 8) >= 0)
-       bge,a   0b
-        stx    %g0, [%o0 + %o1]        !       *(quad *)(addr + len) = 0;
-       retl
-       nop
-
-       /*
-        * Either the address is unaligned, or the count is not a
-        * multiple of 8, or both.  We will have to align the address
-        * in order to use anything `better' than stb.
-        */
-1:
-!      cmp     %o1, 15                 ! len >= 15?
-       bge,a,pn        %xcc, Lstx                      ! yes, use stx
-        btst   1, %o0                  ! (but first check alignment)
-
-       ! not enough to bother: do byte-at-a-time loop.
-2:
-       deccc   %o1                     ! while (--len >= 0)
-       brnz,a,pt       %o1, 2b
-        stb    %g0, [%o0 + %o1]        !       addr[len] = 0;
-       retl
-        nop
+       btst    3, %o0
+       bz,pt   %xcc, 1f
+        btst   7, %o0                  ! 64-bit aligned?
 
-Lstx:
-       /*
-        * There are at least 15 bytes to zero.
-        * We may have to zero some initial stuff to align
-        * the address.
-        */
-       bz,a    %icc, 1f                ! if (addr & 1) {
-        btst   2, %o0
-       stb     %g0, [%o0]              !       *addr = 0;
-       inc     %o0                     !       addr++;
-       dec     %o1                     !       len--;
-       btst    2, %o0                  ! }
+       !! 16-bit aligned -- store half word
+       sth     %o2, [%o0]
+       dec     2, %o1                  ! Prepare to store 2 bytes
+       inc     2, %o0
+       cmp     %o1, 4
+       bl,a,pn %icc, 5f                ! Less than 4 left
+        dec    8, %o1                  ! Fixup count -8
 1:
-       bz,a    1f                      ! if (addr & 2) {
-        btst   4, %o0
-       sth     %g0, [%o0]              !       *(short *)addr = 0;
-       inc     2, %o0                  !       addr += 2;
-       dec     2, %o1                  !       len -= 2;
-       btst    4, %o0                  ! }
-1:
-       bz      1f                      ! if (addr & 4) {
-        dec    8, %o1
-       st      %g0, [%o0]              !       *(int *)addr = 0;
-       inc     4, %o0                  !       addr += 4;
-       dec     4, %o1                  !       len -= 4;
-                                       ! }
-       /*
-        * Address is double word aligned; len is 8 less than
-        * the number of bytes remaining (i.e., len is 0 if
-        * the remaining count is 8, 1 if it is 9, etc.).
-        */
-1:
-       stx     %g0, [%o0]              ! do {
-2:                                     !       *(quad *)addr = 0;
-       inc     8, %o0                  !       addr += 8;
-       deccc   8, %o1                  ! } while ((len -= 8) >= 0);
-       bge,a   2b
-        stx    %g0, [%o0]
+       btst    7, %o0                  ! 64-bit aligned?
+       bz,pt   %xcc, 2f
+        nop
+       !! 32-bit aligned -- store word
+       stw     %o2, [%o0]
+       dec     4, %o1
+       inc     4, %o0  
+       cmp     %o1, 8
+       bl,a,pn %icc, Lbzero_cleanup    ! Less than 8 left
+        dec    8, %o1                  ! Fixup count -8
+2:
+       !! Now we're 64-bit aligned
+       cmp     %o1, 256                ! Use block clear if len > 256
+       bge,pt  %xcc, Lbzero_block      ! use block store insns
+        deccc  8, %o1
+Lbzero_longs:
+       bl,pn   %xcc, Lbzero_cleanup    ! Less than 8 bytes left
+        nop
+3:     
+       stx     %o2, [%o0]              ! Do 1 longword at a time
+       deccc   8, %o1
+       bge,pt  %xcc, 3b
+        inc    8, %o0
 
        /*
         * Len is in [-8..-1] where -8 => done, -7 => 1 byte to zero,
         * -6 => two bytes, etc.  Mop up this remainder, if any.
         */
+Lbzero_cleanup:        
        btst    4, %o1
-       bz      1f                      ! if (len & 4) {
+       bz,pt   %xcc, 6f                ! if (len & 4) {
         btst   2, %o1
-       stw     %g0, [%o0]              !       *(int *)addr = 0;
+       stw     %o2, [%o0]              !       *(int *)addr = 0;
        inc     4, %o0                  !       addr += 4;
-1:
-       bz      1f                      ! if (len & 2) {
+5:     
+       btst    2, %o1
+6:
+       bz,pt   %xcc, 8f                ! if (len & 2) {
         btst   1, %o1
-       sth     %g0, [%o0]              !       *(short *)addr = 0;
+       sth     %o2, [%o0]              !       *(short *)addr = 0;
        inc     2, %o0                  !       addr += 2;
-1:
-       bnz,a   1f                      ! if (len & 1)
-        stb    %g0, [%o0]              !       *addr = 0;
-1:
+7:     
+       btst    1, %o1
+8:     
+       bnz,a   %icc, Lbzero_done       ! if (len & 1)
+        stb    %o2, [%o0]              !       *addr = 0;
+Lbzero_done:
        retl
+        mov    %o4, %o0                ! Restore pointer for memset (ugh)
+
+       /*
+        * Len is in [-8..-1] where -8 => done, -7 => 1 byte to zero,
+        * -6 => two bytes, etc. but we're potentially unaligned.
+        * Do byte stores since it's easiest.
+        */
+Lbzero_small:
+       inccc   8, %o1
+       bz,pn   %icc, Lbzero_done
+1:     
+        deccc  %o1
+       stb     %o2, [%o0]
+       bge,pt  %icc, 1b
+        inc    %o0
+       ba,a,pt %icc, Lbzero_done
+        nop                            ! XXX spitfire bug?
+       
+Lbzero_block:
+/*
+ * Userland:
+ *
+ * We allocate enough space on the stack to save our registers and save
+ * our floating point state.  We really don't need to do this if the
+ * registers were not in use before, but we can't really tell if they
+ * were in use or not.
+ *
+ * See locore.s for the kernel version.
+ *
+ */    
+       save    %sp, -(CC64FSZ+32*8+BLOCK_SIZE), %sp    ! Allocate an fpstate
+       add     %sp, (CC64FSZ+BLOCK_SIZE-1), %l0        ! Calculate pointer to fpstate
+       andn    %l0, BLOCK_ALIGN, %l0                   ! And make it block aligned
+       btst    1, %sp
+       add     %l0, BIAS, %l1                          ! Fixup 64-bit stack pointers
+       movnz   %xcc, %l1, %l0
+
+!      wr      %g0, FPRS_FEF, %fprs                    ! Enable FPU
+       stda    %f0, [%l0] ASI_BLK_P
+       add     %l0, BLOCK_SIZE, %l1
+       stda    %f16, [%l1] ASI_BLK_COMMIT_P            ! We only need two banks
+
+       !! We are now 8-byte aligned.  We need to become 64-byte aligned.
+       btst    63, %i0
+       bz,pt   %xcc, 2f
         nop
+1:     
+       stx     %i2, [%i0]
+       inc     8, %i0
+       btst    63, %i0
+       bnz,pt  %xcc, 1b
+        dec    8, %i1
+
+2:
+       brz,pt  %i2, 4f                                 ! Do we have a pattern to load?
+        fzero  %f0                                     ! Set up FPU
+
+       btst    1, %fp
+       bnz,pt  %icc, 3f                                ! 64-bit stack?
+        nop
+       stw     %i2, [%fp + 0x28]                       ! Flush this puppy to RAM
+       membar  #StoreLoad
+       ld      [%fp + 0x28], %f0
+       ba,pt   %icc, 4f
+        fmovsa %icc, %f0, %f1
+3:     
+       stx     %i2, [%fp + BIAS + 0x50]                ! Flush this puppy to RAM
+       membar  #StoreLoad
+       ldd     [%fp + BIAS + 0x50], %f0
+4:     
+       fmovda  %icc, %f0, %f2                          ! Duplicate the pattern
+       fmovda  %icc, %f0, %f4
+       fmovda  %icc, %f0, %f6
+       fmovda  %icc, %f0, %f8
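
Taken together, the new bzero path works roughly as follows: step the
destination up to 8-byte alignment with byte, halfword and word stores, use
plain 8-byte stx stores for shorter buffers, and switch to the VIS block-store
loop (stda with ASI_BLK_P on 64-byte aligned addresses, after saving the FP
state) once 256 or more bytes remain.  A rough C outline of that dispatch,
purely illustrative and not the NetBSD implementation (the block-store step has
no C equivalent, so groups of 8-byte stores stand in for it):

    #include <stddef.h>
    #include <stdint.h>

    /* Illustrative outline only; names and structure are not NetBSD's. */
    static void
    bzero_outline(void *addr, size_t len, uint64_t pat)
    {
            char *p = addr;

            /* Step up to an 8-byte boundary with small stores. */
            while (len > 0 && ((uintptr_t)p & 7) != 0) {
                    *p++ = (char)pat;
                    len--;
            }

            if (len >= 256) {
                    /*
                     * The assembly saves the FP state, loads the pattern
                     * into the FP registers, aligns to 64 bytes and issues
                     * stda ... ASI_BLK_P; plain 8-byte stores stand in here.
                     */
                    while (len >= 64) {
                            for (int i = 0; i < 8; i++)
                                    ((uint64_t *)(void *)p)[i] = pat;
                            p += 64;
                            len -= 64;
                    }
            }

            /* Remaining longwords, then the sub-8-byte tail. */
            while (len >= 8) {
                    *(uint64_t *)(void *)p = pat;
                    p += 8;
                    len -= 8;
            }
            while (len-- > 0)
                    *p++ = (char)pat;
    }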


