Source-Changes-HG archive


[src/trunk]: src/sys/arch/arm/arm32 Speed up bcopy_page() on the XScale sligh...



details:   https://anonhg.NetBSD.org/src/rev/aa377ad8f791
branches:  trunk
changeset: 534984:aa377ad8f791
user:      thorpej <thorpej%NetBSD.org@localhost>
date:      Wed Aug 07 16:21:29 2002 +0000

description:
Speed up bcopy_page() on the XScale slightly by using the "pld"
(prefetch) instruction to look ahead to the next chunk while we
copy the current chunk.

This could probably use a bit more tuning.
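
The change (see the diff below) wraps each 32-byte ldmia/stmia pair in a
COPY_CHUNK macro that issues a "pld" for the following chunk before copying
the current one, so the next cache line is already in flight while the
current one is being loaded and stored.  For readers less familiar with ARM
assembly, here is a rough C sketch of the same idea; it is illustrative
only, and the __builtin_prefetch() calls, function name, and constants are
assumptions standing in for the real "pld" instruction and kernel macros:

    /*
     * Illustrative sketch only -- not the NetBSD assembly.  Prefetch the
     * next 32-byte chunk of the source while copying the current one.
     * Assumes GCC/Clang __builtin_prefetch() and a 4 KB page (NBPG).
     */
    #include <stddef.h>
    #include <stdint.h>

    #define PAGE_SIZE   4096
    #define CHUNK_SIZE  32                 /* XScale cache line size */

    void
    bcopy_page_sketch(const uint32_t *src, uint32_t *dst)
    {
            __builtin_prefetch(src);       /* cf. PREFETCH_FIRST_CHUNK */

            for (size_t off = 0; off < PAGE_SIZE; off += CHUNK_SIZE) {
                    /*
                     * cf. PREFETCH_NEXT_CHUNK: start fetching the chunk
                     * after this one.  On the last iteration this touches
                     * past the end of the page, which is harmless for a
                     * prefetch hint.
                     */
                    __builtin_prefetch((const char *)src + CHUNK_SIZE);

                    /* Copy the current 32-byte chunk (cf. ldmia/stmia of
                       eight registers). */
                    for (int i = 0; i < CHUNK_SIZE / 4; i++)
                            *dst++ = *src++;
            }
    }

Prefetching one chunk ahead, rather than the chunk being copied, means the
memory fetch overlaps with the loads and stores for the current chunk,
hiding part of the memory latency.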

diffstat:

 sys/arch/arm/arm32/bcopy_page.S |  78 ++++++++++++++++++++++------------------
 1 files changed, 43 insertions(+), 35 deletions(-)

diffs (111 lines):

diff -r 7c8376101bff -r aa377ad8f791 sys/arch/arm/arm32/bcopy_page.S
--- a/sys/arch/arm/arm32/bcopy_page.S   Wed Aug 07 15:39:43 2002 +0000
+++ b/sys/arch/arm/arm32/bcopy_page.S   Wed Aug 07 16:21:29 2002 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: bcopy_page.S,v 1.2 2001/08/11 12:44:42 chris Exp $     */
+/*     $NetBSD: bcopy_page.S,v 1.3 2002/08/07 16:21:29 thorpej Exp $   */
 
 /*
  * Copyright (c) 1995 Scott Stevens
@@ -57,8 +57,32 @@
  *   otherwise.
  */
 
+#define        CHUNK_SIZE      32
+
+#ifdef __XSCALE__
+       /* Conveniently, the chunk size is the XScale cache line size. */
+#define        PREFETCH_FIRST_CHUNK    pld     [r0]
+#define        PREFETCH_NEXT_CHUNK     pld     [r0, #(CHUNK_SIZE)]
+#else
+#define        PREFETCH_FIRST_CHUNK    /* nothing */
+#define        PREFETCH_NEXT_CHUNK     /* nothing */
+#endif
+
+#ifndef COPY_CHUNK
+#define        COPY_CHUNK \
+       PREFETCH_NEXT_CHUNK ; \
+       ldmia   r0!, {r3-r8,ip,lr} ; \
+       stmia   r1!, {r3-r8,ip,lr}
+#endif /* ! COPY_CHUNK */
+
+#ifndef SAVE_REGS
+#define        SAVE_REGS       stmfd   sp!, {r4-r8, lr}
+#define        RESTORE_REGS    ldmfd   sp!, {r4-r8, pc}
+#endif
+
 ENTRY(bcopy_page)
-       stmfd   sp!, {r4-r8, lr}
+       PREFETCH_FIRST_CHUNK
+       SAVE_REGS
 #ifdef BIG_LOOPS
        mov     r2, #(NBPG >> 9)
 #else
@@ -66,50 +90,34 @@
 #endif
 
 Lloopcopy:
-       ldmia   r0!, {r3-r8,ip,lr}
-       stmia   r1!, {r3-r8,ip,lr}
-       ldmia   r0!, {r3-r8,ip,lr}
-       stmia   r1!, {r3-r8,ip,lr}
-       ldmia   r0!, {r3-r8,ip,lr}
-       stmia   r1!, {r3-r8,ip,lr}
-       ldmia   r0!, {r3-r8,ip,lr}
-       stmia   r1!, {r3-r8,ip,lr}
+       COPY_CHUNK
+       COPY_CHUNK
+       COPY_CHUNK
+       COPY_CHUNK
 
 #ifdef BIG_LOOPS
        /* There is little point making the loop any larger; unless we are
           running with the cache off, the load/store overheads will
           completely dominate this loop.  */
-       ldmia   r0!, {r3-r8,ip,lr}
-       stmia   r1!, {r3-r8,ip,lr}
-       ldmia   r0!, {r3-r8,ip,lr}
-       stmia   r1!, {r3-r8,ip,lr}
-       ldmia   r0!, {r3-r8,ip,lr}
-       stmia   r1!, {r3-r8,ip,lr}
-       ldmia   r0!, {r3-r8,ip,lr}
-       stmia   r1!, {r3-r8,ip,lr}
+       COPY_CHUNK
+       COPY_CHUNK
+       COPY_CHUNK
+       COPY_CHUNK
 
-       ldmia   r0!, {r3-r8,ip,lr}
-       stmia   r1!, {r3-r8,ip,lr}
-       ldmia   r0!, {r3-r8,ip,lr}
-       stmia   r1!, {r3-r8,ip,lr}
-       ldmia   r0!, {r3-r8,ip,lr}
-       stmia   r1!, {r3-r8,ip,lr}
-       ldmia   r0!, {r3-r8,ip,lr}
-       stmia   r1!, {r3-r8,ip,lr}
+       COPY_CHUNK
+       COPY_CHUNK
+       COPY_CHUNK
+       COPY_CHUNK
 
-       ldmia   r0!, {r3-r8,ip,lr}
-       stmia   r1!, {r3-r8,ip,lr}
-       ldmia   r0!, {r3-r8,ip,lr}
-       stmia   r1!, {r3-r8,ip,lr}
-       ldmia   r0!, {r3-r8,ip,lr}
-       stmia   r1!, {r3-r8,ip,lr}
-       ldmia   r0!, {r3-r8,ip,lr}
-       stmia   r1!, {r3-r8,ip,lr}
+       COPY_CHUNK
+       COPY_CHUNK
+       COPY_CHUNK
+       COPY_CHUNK
 #endif
        subs    r2, r2, #1
        bne     Lloopcopy
 
-       ldmfd   sp!, {r4-r8, pc}
+       RESTORE_REGS            /* ...and return. */
 
 /*
  * bzero_page(dest)


