Source-Changes-HG archive


[src/trunk]: src/sys/lib/libkern/arch/sh5 Replace the SuperH memcpy() with ho...



details:   https://anonhg.NetBSD.org/src/rev/f23534e851ea
branches:  trunk
changeset: 538542:f23534e851ea
user:      scw <scw%NetBSD.org@localhost>
date:      Tue Oct 22 12:25:18 2002 +0000

description:
Replace the SuperH memcpy() with homebrewed code. The former seems to have
a subtle failure mode which can result in corruption of memory outside the
bounds of the destination buffer.
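
For illustration only (this harness is hypothetical and not part of the commit),
one way to expose this class of bug from userland is to bracket the destination
buffer with guard bytes and sweep over small lengths and misalignments; any
guard byte that changes is exactly the out-of-bounds corruption described above.
The constants GUARD, MAXLEN and PAD below are made up for the example:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Hypothetical test harness, not part of this commit: copy every
 * combination of small length and destination misalignment through
 * memcpy() and check that the guard bytes around the copied region
 * are still intact afterwards.
 */
#define GUARD   0xA5
#define MAXLEN  64
#define PAD     16

int
main(void)
{
        unsigned char src[MAXLEN + PAD], dst[MAXLEN + 3 * PAD];
        size_t len, off, i;

        for (i = 0; i < sizeof(src); i++)
                src[i] = (unsigned char)i;

        for (len = 0; len <= MAXLEN; len++) {
                for (off = 0; off < 8; off++) {
                        memset(dst, GUARD, sizeof(dst));
                        memcpy(dst + PAD + off, src, len);

                        /*
                         * Any byte modified outside [PAD+off, PAD+off+len)
                         * is a write past the destination buffer.
                         */
                        for (i = 0; i < sizeof(dst); i++) {
                                if (i >= PAD + off && i < PAD + off + len)
                                        continue;
                                if (dst[i] != GUARD) {
                                        printf("overrun: len=%zu off=%zu "
                                            "index=%zu\n", len, off, i);
                                        return (1);
                                }
                        }
                }
        }
        printf("no out-of-bounds writes detected\n");
        return (0);
}

On a correct memcpy() this prints "no out-of-bounds writes detected"; a routine
with the failure mode described above would trip one of the guard checks.
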

diffstat:

 sys/lib/libkern/arch/sh5/memcpy.S |  337 ++++++++++++++++++-------------------
 1 files changed, 168 insertions(+), 169 deletions(-)

diffs (truncated from 366 to 300 lines):

diff -r 9a54e4a179e9 -r f23534e851ea sys/lib/libkern/arch/sh5/memcpy.S
--- a/sys/lib/libkern/arch/sh5/memcpy.S Tue Oct 22 12:25:17 2002 +0000
+++ b/sys/lib/libkern/arch/sh5/memcpy.S Tue Oct 22 12:25:18 2002 +0000
@@ -1,194 +1,193 @@
-/*     $NetBSD: memcpy.S,v 1.1 2002/10/17 11:53:33 scw Exp $   */
+/*     $NetBSD: memcpy.S,v 1.2 2002/10/22 12:25:18 scw Exp $   */
 
 /*
- * Fast SH5 memcpy, by J"orn Rennecke (joern.rennecke%superh.com@localhost)
- *
- * Copyright 2002 SuperH, Inc. All rights reserved
+ * Copyright 2002 Wasabi Systems, Inc.
+ * All rights reserved.
  *
- * This software is the property of SuperH, Inc (SuperH) which specifically
- * grants the user the right to modify, use and distribute this software
- * provided this notice is not removed or altered.  All other rights are
- * reserved by SuperH.
- *
- * SUPERH MAKES NO WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, WITH REGARD TO
- * THIS SOFTWARE.  IN NO EVENT SHALL SUPERH BE LIABLE FOR INDIRECT, SPECIAL, 
- * INCIDENTAL OR CONSEQUENTIAL DAMAGES IN CONNECTION WITH OR ARISING FROM
- * THE FURNISHING, PERFORMANCE, OR USE OF THIS SOFTWARE.
+ * Written by Steve C. Woodford for Wasabi Systems, Inc.
  *
- * So that all may benefit from your experience, please report any problems
- * or suggestions about this software to the SuperH Support Center via
- * e-mail at softwaresupport%superh.com@localhost .
- *
- * SuperH, Inc.
- * 405 River Oaks Parkway
- * San Jose
- * CA 95134
- * USA
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed for the NetBSD Project by
+ *      Wasabi Systems, Inc.
+ * 4. The name of Wasabi Systems, Inc. may not be used to endorse
+ *    or promote products derived from this software without specific prior
+ *    written permission.
  *
- * The code assumes that any quadword can be read in its
- * enirety if at least one byte is included in the copy.
- */
-
-/*
- * Slightly modified for use in NetBSD
- * by Steve Woodford (scw%wasabisystems.com@localhost):
- *  - LP64 support,
- *  - tweak register usage, mostly to avoid using r24.
+ * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <machine/asm.h>
 
-#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
-#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
-#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
-#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
+/*
+ * void *memcpy(void *dest, void *src, size_t bytes)
+ *
+ * This is a reasonably fast memcpy() routine.
+ *
+ * If the src/dest parameters are suitably aligned, it will try to align
+ * things such that "alloco" can be used to pre-allocate a cache-line for
+ * "dest".
+ *
+ * If the alignment of src and dest are different, the routine falls back
+ * to a byte-wise copy. This ain't great, but it serves the caller right.
+ *
+ * This algorithm could be improved upon, but I'm wary of trying to be
+ * too smart, given the lossage experienced with SuperH's memcpy() from
+ * newlib.
+ */
 
 ENTRY(memcpy)
 #ifndef _LP64
-       add.l   r2, r63, r2
+       add.l   r2, r63, r7
        add.l   r3, r63, r3
        addz.l  r4, r63, r4
+#else
+       add     r2, r63, r7
 #endif
-       ld.b    r3, 0, r63
-       pta/l   Large, tr0
-       movi    25, r0
-       bgeu/u  r4, r0, tr0
-       nsb     r4, r0
-       shlli   r0, 5, r0
-       movi    (L1 - L0 + 63*32 + 1) & 0xffff, r1
-       sub     r1, r0, r0
-L0:    ptrel   r0, tr0
-       add     r2, r4, r5
-       ptabs   r18, tr1
-       add     r3, r4, r6
-       blink   tr0, r63
+       ptabs/u r18, tr0
+       beq/u   r4, r63, tr0            /* Bail now if bytes == 0 */
+
+       /*
+        * First, try to align operands. This can only be done if the low 3
+        * bits match.
+        */
+       pta/l   Laligned, tr1
+       or      r7, r3, r1
+       andi    r1, 7, r1
+       beq/l   r1, r63, tr1            /* Operands are already aligned */
+
+       pta/u   Lbyte_copy, tr1
+       xor     r7, r3, r0
+       andi    r0, 7, r0               /* Operands misaligned differently? */
+       bne/u   r0, r63, tr1            /* Yup. Fallback to copying byte-wise */
+
+       add     r4, r1, r0
+       movi    8, r8
+       bgtu/l  r8, r0, tr1
+
+       ldlo.q  r3, 0, r0
+       stlo.q  r7, 0, r0
+       sub     r8, r1, r0
+       sub     r4, r0, r4
+       add     r7, r0, r7
+       add     r3, r0, r3
 
-       .balign 8
-L1:
-       /* 0 byte memcpy */
+       /*
+        * The buffers are quad aligned. Now align src to a 32-byte boundary
+        * if possible.
+        */
+Laligned:
+       movi    0x1f, r6
+       pta/u   Ltrailer, tr2
+       bgeu/u  r6, r4, tr2             /* Jump if less than 32 bytes left */
+       add     r7, r63, r5
+       add     r7, r6, r7
+       andc    r7, r6, r7              /* Round dst up to 32-byte boundary */
+       sub     r7, r5, r1
+       add     r3, r1, r3              /* Adjust src to match */
+       sub     r4, r1, r4
+       xor     r1, r6, r1
+       addi    r1, 2, r1
+       ptrel/l r1, tr1 
        blink   tr1, r63
+       ld.q    r3, -24, r0
+       st.q    r7, -24, r0
+       ld.q    r3, -16, r0
+       st.q    r7, -16, r0
+       ld.q    r3, -8, r0
+       st.q    r7, -8, r0
 
-L4_7:  /* 4..7 byte memcpy cntd. */
-       stlo.l  r2, 0, r0
-       or      r6, r7, r6
-       sthi.l  r5, -1, r6
-       stlo.l  r5, -4, r6
-       blink   tr1, r63
-
-L2_3:  /* 2 or 3 byte memcpy cntd. */
-       st.b    r5, -1, r6
+       /*
+        * "src" is now aligned to a multiple of 32 bytes
+        */
+       bgeu/u  r6, r4, tr2             /* Jump if less than 32 bytes left */
+       pta/l   Lcache_enter, tr1
+       pta/u   Lcache_loop, tr2
+       ld.q    r3, 0, r63              /* Prefetch one cache-line in advance */
+       alloco  r7, 0                   /* Allocate one cache-line in advance */
+       add     r7, r4, r5
+       and     r4, r6, r4
+       andc    r5, r6, r5
        blink   tr1, r63
 
-       /* 1 byte memcpy */
-       ld.b    r3, 0, r0
-       st.b    r2, 0, r0
-       blink   tr1, r63
+Lcache_loop:
+       ld.q    r3, 0, r63              /* Prefetch in advance */
+       alloco  r7, 0                   /* Allocate one cache-line in advance */
+       ld.q    r3, -32, r19
+       ld.q    r3, -24, r20
+       ld.q    r3, -16, r21
+       ld.q    r3, -8, r22
+       st.q    r7, -32, r19            /* Copy the previous cache-line */
+       st.q    r7, -24, r20
+       st.q    r7, -16, r21
+       st.q    r7, -8, r22
+Lcache_enter:
+       addi    r7, 32, r7              /* Next cache-line */
+       addi    r3, 32, r3
+       bne/l   r5, r7, tr2
 
-L8_15: /* 8..15 byte memcpy cntd. */
-       stlo.q  r2, 0, r0
-       or      r6, r7, r6
-       sthi.q  r5, -1, r6
-       stlo.q  r5, -8, r6
+       ld.q    r3, -32, r19
+       ld.q    r3, -24, r20
+       ld.q    r3, -16, r21
+       ld.q    r3, -8, r22
+       st.q    r7, -32, r19
+       st.q    r7, -24, r20
+       st.q    r7, -16, r21
+       st.q    r7, -8, r22
+
+       /*
+        * We have, at most, 31 bytes left to deal with.
+        */
+Ltrailer:
+       beq/u   r4, r63, tr0            /* Return to caller if done. */
+       add     r4, r7, r8
+       add     r4, r3, r9
+       andi    r4, 0x18, r4
+       add     r7, r4, r7
+       add     r3, r4, r3
+       xori    r4, 0x1f, r4
+       addi    r4, 2, r4
+       ptrel/l r4, tr1
        blink   tr1, r63
-       
-       /* 2 or 3 byte memcpy */
-       ld.b    r3, 0, r0
-       ld.b    r2, 0, r63
-       ld.b    r3, 1, r1
-       st.b    r2, 0, r0
-       pta/l   L2_3, tr0
-       ld.b    r6, -1, r6
-       st.b    r2, 1, r1
-       blink   tr0, r63
-
-       /* 4 .. 7 byte memcpy */
-       LDUAL   (r3, 0, r0, r1)
-       pta     L4_7, tr0
-       ldlo.l  r6, -4, r7
-       or      r0, r1, r0
-       sthi.l  r2, 3, r0
-       ldhi.l  r6, -1, r6
-       blink   tr0, r63
-
-       /* 8 .. 15 byte memcpy */
-       LDUAQ   (r3, 0, r0, r1)
-       pta     L8_15, tr0
-       ldlo.q  r6, -8, r7
-       or      r0, r1, r0
-       sthi.q  r2, 7, r0
-       ldhi.q  r6, -1, r6
+       ld.q    r3, -24, r0
+       st.q    r7, -24, r0
+       ld.q    r3, -16, r0
+       st.q    r7, -16, r0
+       ld.q    r3, -8, r0
+       st.q    r7, -8, r0
+       ldhi.q  r9, -1, r0
+       sthi.q  r8, -1, r0
        blink   tr0, r63
 
-       /* 16 .. 24 byte memcpy */
-       LDUAQ   (r3, 0, r0, r1)
-       LDUAQ   (r3, 8, r8, r9)
-       or      r0, r1, r0
-       sthi.q  r2, 7, r0
-       or      r8, r9, r8
-       sthi.q  r2, 15, r8
-       ldlo.q  r6, -8, r7
-       ldhi.q  r6, -1, r6
-       stlo.q  r2, 8, r8
-       stlo.q  r2, 0, r0
-       or      r6, r7, r6
-       sthi.q  r5, -1, r6
-       stlo.q  r5, -8, r6
-       blink   tr1, r63
 
-Large:
-       ld.b    r2, 0, r63
-       pta/l   Loop_ua, tr1
-       ori     r3, -8, r7


