Subject: port-arm/36513: Pre-cache load length exceeds source memory area in memcpy
To: None <port-arm-maintainer@netbsd.org, gnats-admin@netbsd.org,>
From: Hiroki Doshita <doshita@iij.ad.jp>
List: netbsd-bugs
Date: 06/20/2007 02:15:00
>Number:         36513
>Category:       port-arm
>Synopsis:       Pre-cache load length exceeds source memory area in memcpy
>Confidential:   no
>Severity:       non-critical
>Priority:       medium
>Responsible:    port-arm-maintainer
>State:          open
>Class:          sw-bug
>Submitter-Id:   net
>Arrival-Date:   Wed Jun 20 02:15:00 +0000 2007
>Originator:     Hiroki Doshita
>Release:        NetBSD 3.1
>Organization:
	Internet Initiative Japan, Inc.
>Environment:
Architecture: arm
Machine: armeb
>Description:
Pre-cache load length exceeds source memory area in memcpy.
An unrelated memory area is written back to memory, which causes
problems.
>How-To-Repeat:
>Fix:
Index: memcpy_xscale.S
===================================================================
RCS file: /cvs/cvsroot/src/common/lib/libc/arch/arm/string/memcpy_xscale.S,v
retrieving revision 1.1
diff -u -r1.1 memcpy_xscale.S
--- memcpy_xscale.S	20 Dec 2005 19:28:49 -0000	1.1
+++ memcpy_xscale.S	20 Jun 2007 02:08:52 -0000
@@ -247,13 +247,50 @@
 	str	r5, [r3], #0x04
 	str	r6, [r3], #0x04
 	str	r7, [r3], #0x04
-.Lmemcpy_bad1:
 	subs	r2, r2, #0x10
+
+.Lmemcpy_bad1:
+	cmp	r2, #0x20
 	bge	.Lmemcpy_bad1_loop16
+	cmp	r2, #0x10
+	blt	.Lmemcpy_bad1_loop16_short
 
-	adds	r2, r2, #0x10
+	/* copy last 16 bytes (without preload) */
+#ifdef __ARMEB__
+	mov	r4, ip, lsl #8
+#else
+	mov	r4, ip, lsr #8
+#endif
+	ldr	r5, [r1], #0x04
+	ldr	r6, [r1], #0x04
+	ldr	r7, [r1], #0x04
+	ldr	ip, [r1], #0x04
+#ifdef __ARMEB__
+	orr	r4, r4, r5, lsr #24
+	mov	r5, r5, lsl #8
+	orr	r5, r5, r6, lsr #24
+	mov	r6, r6, lsl #8
+	orr	r6, r6, r7, lsr #24
+	mov	r7, r7, lsl #8
+	orr	r7, r7, ip, lsr #24
+#else
+	orr	r4, r4, r5, lsl #24
+	mov	r5, r5, lsr #8
+	orr	r5, r5, r6, lsl #24
+	mov	r6, r6, lsr #8
+	orr	r6, r6, r7, lsl #24
+	mov	r7, r7, lsr #8
+	orr	r7, r7, ip, lsl #24
+#endif
+	str	r4, [r3], #0x04
+	str	r5, [r3], #0x04
+	str	r6, [r3], #0x04
+	str	r7, [r3], #0x04
+	subs	r2, r2, #0x10
 	ldmeqfd	sp!, {r4-r7}
 	bxeq	lr			/* Return now if done */
+
+.Lmemcpy_bad1_loop16_short:
 	subs	r2, r2, #0x04
 	sublt	r1, r1, #0x03
 	blt	.Lmemcpy_bad_done
@@ -308,13 +345,50 @@
 	str	r5, [r3], #0x04
 	str	r6, [r3], #0x04
 	str	r7, [r3], #0x04
-.Lmemcpy_bad2:
 	subs	r2, r2, #0x10
+
+.Lmemcpy_bad2:
+	cmp	r2, #0x20
 	bge	.Lmemcpy_bad2_loop16
+	cmp	r2, #0x10
+	blt	.Lmemcpy_bad2_loop16_short
 
-	adds	r2, r2, #0x10
+	/* copy last 16 bytes (without preload) */
+#ifdef __ARMEB__
+	mov	r4, ip, lsl #16
+#else
+	mov	r4, ip, lsr #16
+#endif
+	ldr	r5, [r1], #0x04
+	ldr	r6, [r1], #0x04
+	ldr	r7, [r1], #0x04
+	ldr	ip, [r1], #0x04
+#ifdef __ARMEB__
+	orr	r4, r4, r5, lsr #16
+	mov	r5, r5, lsl #16
+	orr	r5, r5, r6, lsr #16
+	mov	r6, r6, lsl #16
+	orr	r6, r6, r7, lsr #16
+	mov	r7, r7, lsl #16
+	orr	r7, r7, ip, lsr #16
+#else
+	orr	r4, r4, r5, lsl #16
+	mov	r5, r5, lsr #16
+	orr	r5, r5, r6, lsl #16
+	mov	r6, r6, lsr #16
+	orr	r6, r6, r7, lsl #16
+	mov	r7, r7, lsr #16
+	orr	r7, r7, ip, lsl #16
+#endif
+	str	r4, [r3], #0x04
+	str	r5, [r3], #0x04
+	str	r6, [r3], #0x04
+	str	r7, [r3], #0x04
+	subs	r2, r2, #0x10
 	ldmeqfd	sp!, {r4-r7}
 	bxeq	lr			/* Return now if done */
+
+.Lmemcpy_bad2_loop16_short:
 	subs	r2, r2, #0x04
 	sublt	r1, r1, #0x02
 	blt	.Lmemcpy_bad_done
@@ -369,13 +443,50 @@
 	str	r5, [r3], #0x04
 	str	r6, [r3], #0x04
 	str	r7, [r3], #0x04
-.Lmemcpy_bad3:
 	subs	r2, r2, #0x10
+
+.Lmemcpy_bad3:
+	cmp	r2, #0x20
 	bge	.Lmemcpy_bad3_loop16
+	cmp	r2, #0x10
+	blt	.Lmemcpy_bad3_loop16_short
 
-	adds	r2, r2, #0x10
+	/* copy last 16 bytes (without preload) */
+#ifdef __ARMEB__
+	mov	r4, ip, lsl #24
+#else
+	mov	r4, ip, lsr #24
+#endif
+	ldr	r5, [r1], #0x04
+	ldr	r6, [r1], #0x04
+	ldr	r7, [r1], #0x04
+	ldr	ip, [r1], #0x04
+#ifdef __ARMEB__
+	orr	r4, r4, r5, lsr #8
+	mov	r5, r5, lsl #24
+	orr	r5, r5, r6, lsr #8
+	mov	r6, r6, lsl #24
+	orr	r6, r6, r7, lsr #8
+	mov	r7, r7, lsl #24
+	orr	r7, r7, ip, lsr #8
+#else
+	orr	r4, r4, r5, lsl #8
+	mov	r5, r5, lsr #24
+	orr	r5, r5, r6, lsl #8
+	mov	r6, r6, lsr #24
+	orr	r6, r6, r7, lsl #8
+	mov	r7, r7, lsr #24
+	orr	r7, r7, ip, lsl #8
+#endif
+	str	r4, [r3], #0x04
+	str	r5, [r3], #0x04
+	str	r6, [r3], #0x04
+	str	r7, [r3], #0x04
+	subs	r2, r2, #0x10
 	ldmeqfd	sp!, {r4-r7}
 	bxeq	lr			/* Return now if done */
+
+.Lmemcpy_bad3_loop16_short:
 	subs	r2, r2, #0x04
 	sublt	r1, r1, #0x01
 	blt	.Lmemcpy_bad_done