Subject: bcopy
To: None <port-alpha@NetBSD.ORG>
From: Trevor Blackwell <tlb@eecs.harvard.edu>
List: port-alpha
Date: 08/12/1995 07:30:55
Here's a good bcopy. I looked at the GNU bcopy, but it turned out not
to be as efficient as I was hoping. Even with gcc-2.7.0, it made a lot of
references to the stack. This one performs within 5 percent of the
OSF1 libc bcopy. I tested it very thoroughly for correctness.

The old one was actually incorrect, as overlapping source &
destination regions were not copied correctly (you have to work
backwards.)

Chris - you can put it under CMU copyright. In fact, let's just say you
can do that with any code I submit to this list unless I say
otherwise.


*** locore.s    Sat Aug 12 07:20:44 1995
--- locore.s-orig       Fri Mar 24 12:11:54 1995
***************
*** 1039,1265 ****
   *
   * int bcopy(char *from, char *to, u_int len);
   */
- #if 1
- LEAF(bcopy,3)
- 
-       /* Check for negative length */ 
-       ble     a2,bcopy_done
- 
- /* Check for overlap */
-          subq    a1,a0,t5
-         cmpult  t5,a2,t5
-         bne     t5,bcopy_overlap
- 
- /* a3 = end address */
-         addq    a0,a2,a3
- 
- /* Get the first word */
-         ldq_u   t2,0(a0)
- 
- /* Do they have the same alignment? */
-         xor     a0,a1,t0
-         and     t0,7,t0       
-         and     a1,7,t1
-         bne     t0,bcopy_different_alignment
- 
- /* src & dst have same alignment */
-         beq     t1,bcopy_all_aligned
-       
-         ldq_u   t3,0(a1)
-         addq    a2,t1,a2
-         mskqh   t2,a0,t2
-         mskql   t3,a0,t3
-         or      t2,t3,t2
- 
- /* Dst is 8-byte aligned */
- 
- /* If less than 8 bytes,skip loop */
- bcopy_all_aligned:
-         subq    a2,1,t0
-         and     a2,7,a2
-         bic     t0,7,t0
-         beq     t0,bcopy_samealign_lp_end
- 
- bcopy_samealign_lp:   
-         stq_u   t2,0(a1)
-         addq    a1,8,a1
-         ldq_u   t2,8(a0)
-         subq    t0,8,t0
-         addq    a0,8,a0
-         bne     t0,bcopy_samealign_lp
- 
- /* If we're done,exit */
- bcopy_samealign_lp_end:
-         bne     a2,bcopy_small_left
-         stq_u   t2,0(a1)
-         RET
- 
- bcopy_small_left:
-         mskql   t2,a2,t4
-         ldq_u   t3,0(a1)
-         mskqh   t3,a2,t3
-         or      t4,t3,t4
-         stq_u   t4,0(a1)
-         RET
- 
- /* this is the fun part */
- bcopy_different_alignment:    
-         addq    a0,a2,a3
-         cmpule  a2,8,t0
-         bne     t0,bcopy_da_finish
-       
-         beq     t1,bcopy_da_noentry
- 
- /* Do the initial partial word */
-         subq    zero,a1,t0
-         and     t0,7,t0
-         ldq_u   t3,7(a0)
-         extql   t2,a0,t2
-         extqh   t3,a0,t3
-         or      t2,t3,t5
-         insql   t5,a1,t5
-         ldq_u   t6,0(a1)
-         mskql   t6,a1,t6
-         or      t5,t6,t5
-         stq_u   t5,0(a1)
-         addq    a0,t0,a0
-         addq    a1,t0,a1
-         subq    a2,t0,a2
-         ldq_u   t2,0(a0)
-       
- bcopy_da_noentry:     
-         subq    a2,1,t0
-         bic     t0,7,t0
-         and     a2,7,a2
-         beq     t0,bcopy_da_finish2
- 
- bcopy_da_lp:  
-         ldq_u   t3,7(a0)
-         addq    a0,8,a0
-         extql   t2,a0,t4
-         extqh   t3,a0,t5
-         subq    t0,8,t0
-         or      t4,t5,t5
-         stq     t5,0(a1)
-         addq    a1,8,a1
-         beq     t0,bcopy_da_finish1
-         ldq_u   t2,7(a0)
-         addq    a0,8,a0
-         extql   t3,a0,t4
-         extqh   t2,a0,t5
-         subq    t0,8,t0
-         or      t4,t5,t5
-         stq     t5,0(a1)
-         addq    a1,8,a1
-         bne     t0,bcopy_da_lp
- 
- /* Do the last new word */
- bcopy_da_finish2:     
-         mov     t2,t3
- 
- /* Do the last partial word */
- bcopy_da_finish1:     
-         ldq_u   t2,-1(a3)
-         extql   t3,a0,t3
-         extqh   t2,a0,t2
-         or      t2,t3,t2
-         br      zero,bcopy_samealign_lp_end
- 
- /* Do the last word in the next source word */
- bcopy_da_finish:      
-         ldq_u   t3,-1(a3)
-         extql   t2,a0,t2
-         extqh   t3,a0,t3
-         or      t2,t3,t2
-         insqh   t2,a1,t3
-         insql   t2,a1,t2
-         lda     t4,-1(zero)
-         mskql   t4,a2,t5
-         cmovne  t5,t5,t4
-         insqh   t4,a1,t5
-         insql   t4,a1,t4
-         addq    a1,a2,a4
-         ldq_u   t6,0(a1)
-         ldq_u   t7,-1(a4)
-         bic     t6,t4,t6
-         bic     t7,t5,t7
-         and     t2,t4,t2
-         and     t3,t5,t3
-         or      t2,t6,t2
-         or      t3,t7,t3
-         stq_u   t3,-1(a4)
-         stq_u   t2,0(a1)
-         RET
- 
- /* Basically equivalent to previous case, only backwards.
-    Not quite as highly optimized */
- bcopy_overlap:                
-         addq    a0,a2,a3
-         addq    a1,a2,a4
- 
- /* less than 8 bytes - don't worry about overlap */
-         cmpule  a2,8,t0
-         bne     t0,bcopy_ov_short
- 
- /* Possibly do a partial first word */
-         and     a4,7,t4
-         beq     t4,bcopy_ov_nostart2
-         subq    a3,t4,a3
-         subq    a4,t4,a4
-         ldq_u   t1,0(a3)
-         subq    a2,t4,a2
-         ldq_u   t2,7(a3)
-         ldq     t3,0(a4)
-         extql   t1,a3,t1
-         extqh   t2,a3,t2
-         or      t1,t2,t1
-       mskqh   t3,t4,t3
-       mskql   t1,t4,t1
-       or      t1,t3,t1
-       stq     t1,0(a4)
- 
- bcopy_ov_nostart2:
-       bic     a2,7,t4
-       and     a2,7,a2
-       beq     t4,bcopy_ov_lp_end
- 
- /* This could be more pipelined, but it doesn't seem worth it */
- bcopy_ov_lp:  
-       ldq_u   t0,-8(a3)
-       subq    a4,8,a4
-       ldq_u   t1,-1(a3)
-       subq    a3,8,a3
-       extql   t0,a3,t0
-       extqh   t1,a3,t1
-       subq    t4,8,t4
-       or      t0,t1,t0
-       stq     t0,0(a4)
-       bne     t4,bcopy_ov_lp
- 
- bcopy_ov_lp_end:      
-       beq     a2,bcopy_done
-       
-       ldq_u   t0,0(a0)
-       ldq_u   t1,7(a0)
-       ldq_u   t2,0(a1)
-       extql   t0,a0,t0
-       extqh   t1,a0,t1
-       or      t0,t1,t0
-       insql   t0,a1,t0
-       mskql   t2,a1,t2
-       or      t2,t0,t2
-       stq_u   t2,0(a1)
-       
- bcopy_done:   
-       RET
- 
- bcopy_ov_short:       
-         ldq_u   t2,0(a0)
-         br      zero,bcopy_da_finish
-       
-       END(bcopy)
- 
- #else
  LEAF(bcopy, 3)
        SETGP(pv)
        mov     a2, t0                  /* t0 = i = len */
--- 1039,1044 ----
***************
*** 1283,1289 ****
        mov     zero, v0                /* return 0. */
        RET
        END(bcopy)
- #endif
  
  NESTED(copyin, 3, 16, ra, 0, 0)
        SETGP(pv)
--- 1062,1067 ----

--
Trevor Blackwell         tlb@eecs.harvard.edu          (617) 495-8912