Source-Changes-HG archive


[src/trunk]: src/common/lib/libc/arch/arm/string This is a working version of...



details:   https://anonhg.NetBSD.org/src/rev/f4f4556a6d1f
branches:  trunk
changeset: 783670:f4f4556a6d1f
user:      matt <matt%NetBSD.org@localhost>
date:      Thu Jan 03 09:34:44 2013 +0000

description:
This is a working version of memcpy implemented using NEON instructions.
Still needs tuning as it is still about 15% slower than the non-NEON version.
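
The heavy lifting in the misaligned (incongruent) path is the vtbl.8
byte permute: two aligned dword loads are merged into the one dword of
source data that straddles them, using an index vector built from
.Ltbl_value plus the misalignment.  A rough little-endian C sketch of
the same idea, using the arm_neon.h intrinsics (the function and
variable names here are illustrative, not part of the commit):

    #include <arm_neon.h>
    #include <stdint.h>

    /* Sketch only: merge two aligned dword loads into the dword that
     * starts "off" bytes into the first one, as the committed code
     * does with vtbl.8 and a precomputed index vector. */
    static uint8x8_t
    merge_misaligned(uint8x8_t lo, uint8x8_t hi, unsigned off)
    {
            static const uint8_t base[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
            uint8x8_t idx = vadd_u8(vld1_u8(base), vdup_n_u8((uint8_t)off));
            uint8x8x2_t pair = { { lo, hi } };
            return vtbl2_u8(pair, idx); /* per-byte lookup across 16 bytes */
    }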

diffstat:

 common/lib/libc/arch/arm/string/memcpy_neon.S |  277 ++++++++++++++++++++++++++
 1 files changed, 277 insertions(+), 0 deletions(-)

diffs (281 lines):

diff -r ea690d1901b5 -r f4f4556a6d1f common/lib/libc/arch/arm/string/memcpy_neon.S
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/common/lib/libc/arch/arm/string/memcpy_neon.S     Thu Jan 03 09:34:44 2013 +0000
@@ -0,0 +1,277 @@
+/*-
+ * Copyright (c) 2013 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Matt Thomas of 3am Software Foundry.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: memcpy_neon.S,v 1.1 2013/01/03 09:34:44 matt Exp $")
+
+       .text
+ENTRY(memcpy)
+       teq     r2, #0                  /* 0 length? */
+       cmpne   r0, r1                  /*   if not, does src == dst? */
+       RETc(eq)                        /*   yes, (to either) return */
+
+       mov     r3, r0                  /* keep r0 unchanged */
+#if 0
+       cmp     r2, #16                 /* copy less than 16 bytes? */
+       bge     .Ldst_aligner           /*   nope, do it the long way */
+
+1:     ldrb    ip, [r1], #1            /* load a byte from src */
+       subs    r2, r2, #1              /* and more to transfer? */
+       strb    ip, [r3], #1            /* save it to dst */
+       bne     1b                      /*   yes, do next byte */
+       RET                             /* return */
+#endif
+
+.Ldst_aligner:
+       tst     r3, #7                  /* is dst pointer dword aligned? */
+       beq     .Lsrc_aligner           /*   yes, check src pointer */
+       /*
+        * Until the dst pointer is dword aligned, copy from src to dst
+        * byte by byte, until it is aligned or we've copied everything.
+        */
+       ldrb    ip, [r1], #1            /* load a byte from src */
+       strb    ip, [r3], #1            /* save the byte to dst */
+       subs    r2, r2, #1              /* end of transfer? */
+       bne     .Ldst_aligner           /*   no, try next byte */
+       RET                             /* yes, we're done! */
+
+.Lsrc_aligner:
+       push    {r4-r5}                 /* save some registers */
+       add     r4, r2, r3              /* keep a pointer to the end of dst */
+       ands    r5, r1, #7              /* get misalignment of src pointer */
+       beq     .Lcongruent_main        /*   aligned, do it the fast way */
+
+       vdup.8  d1, r5                  /* set offset for table */
+       rsb     r5, r5, #8              /* number of valid bytes in first dword */
+       bic     r1, r1, #7              /* dword align src pointer */
+
+       vldr    d0, .Ltbl_value         /* load table value */
+       vadd.u8 d0, d0, d1              /* add offset to it */
+
+       vld1.64 {d1}, [r1:64]!          /* load a dword from src */
+
+       cmp     r2, r5                  /* do we already have enough? */
+       bgt     .Lincongruent           /*   no, so read more */
+
+.Lincongruent_finish:
+       vtbl.8  d0, {d1-d2}, d0         /* merge last dwords */
+       cmp     r2, #8                  /* room for a full dword? */ 
+#ifdef __ARMEB__
+       vrev64.32 d0, d0                /* word swap to LE */
+#endif
+       blt     .Lfinish                /*   no, write final partial dword */
+       vst1.32 {d0}, [r3:64]           /*   yes, write final full dword */
+       b       .Ldone                  /* and we're done! */
+
+.Lincongruent:
+       vld1.64 {d2}, [r1:64]!          /* load a dword */
+       cmp     r2, #8                  /* can we write a full dword? */
+       blt     .Lincongruent_finish    /*   no, finish it. */
+       vtbl.8  d1, {d1-d2}, d0         /* reorder */
+       vst1.64 {d1}, [r3:64]!          /* store a dword */
+       subs    r2, r2, #8              /* have we written everything? */
+       beq     .Ldone                  /*   yes, we're done! */
+       vmov    d1, d2                  /* prepare for next dword */
+       tst     r3, #63                 /* are we 64-byte aligned? */
+       bne     .Lincongruent           /*   no, load next dword */
+
+       /*
+        * We are now 64-byte aligned so all writes should fill one or more
+        * cachelines.  Even if d1 has 7 bytes cached, to write 32 bytes we
+        * still need to read 4 dwords (3 full dwords and 1 dword for that
+        * last byte).
+        */
+       cmp     r2, #32                 /* can we write 4 more dwords? */
+       blt     .Lincongruent_dword     /*   no, handle dword by dword */
+       vld1.64 {d2-d5}, [r1:64]!       /* read 4 dwords */
+       cmp     r2, #64                 /* can we write 4 more dwords? */
+       blt     .Lincongruent_4dword    /*   no, handle it */
+
+1:     vld1.64 {d7-d10}, [r1:64]!      /* read 4 dwords */
+       vtbl.8  d1, {d1-d2}, d0         /* reorder */
+       vtbl.8  d2, {d2-d3}, d0         /* reorder */
+       vtbl.8  d3, {d3-d4}, d0         /* reorder */
+       vtbl.8  d4, {d4-d5}, d0         /* reorder */
+       vst1.64 {d1-d4}, [r3:64]!       /* write 4 dwords */
+       vmov    d6, d5                  /* move d5 out of the way of the load */
+       cmp     r2, #96                 /* have 8+4 dwords to write? */
+       blt     2f                      /*   no more data, skip the load */
+       vld1.64 {d2-d5}, [r1:64]!       /* more data, load 4 dwords */
+2:     vtbl.8  d6, {d6-d7}, d0         /* reorder */
+       vtbl.8  d7, {d7-d8}, d0         /* reorder */
+       vtbl.8  d8, {d8-d9}, d0         /* reorder */
+       vtbl.8  d9, {d9-d10}, d0        /* reorder */
+       vst1.64 {d6-d9}, [r3:64]!       /* write 4 dwords */
+       subs    r2, r2, #64
+       beq     .Ldone
+       vmov    d1, d10
+       cmp     r2, #64
+       bge     1b
+
+       /*
+        * We have leftovers in d1 and new untranslated data in d2-d5.
+        */
+.Lincongruent_4dword:
+       cmp     r2, #32
+       blt     .Lincongruent_dword
+
+       vtbl.8  d1, {d1-d2}, d0         /* reorder */
+       vtbl.8  d2, {d2-d3}, d0         /* reorder */
+       vtbl.8  d3, {d3-d4}, d0         /* reorder */
+       vtbl.8  d4, {d4-d5}, d0         /* reorder */
+       vst1.64 {d1-d4}, [r3:64]!       /* write 4 dwords */
+       vmov    d1, d5                  /* move leftovers */
+       subs    r2, r2, #32
+       beq     .Ldone
+
+.Lincongruent_dword:
+#if 0
+       cmp     r2, r5                  /* enough in leftovers? */
+       ble     .Lincongruent_finish    /*   yes, finish it. */
+       vld1.64 {d2}, [r1:64]!          /* load a dword */
+       cmp     r2, #8                  /* can we write a full dword? */
+       blt     .Lincongruent_finish    /*   no, finish it. */
+       vtbl.8  d1, {d1-d2}, d0         /* reorder */
+       vst1.64 {d1}, [r3:64]!          /* store a dword */
+       subs    r2, r2, #8              /* have we written everything? */
+       beq     .Ldone                  /*   yes, we're done! */
+       b       .Lincongruent_dword     /* and go get it */
+#else
+       cmp     r2, r5                  /* are the leftover bytes enough? */
+       ble     .Lincongruent_finish    /*   yes, finish it. */
+       mov     ip, r2                  /* get remaining count */
+       bic     ip, ip, #7              /* truncate to a dword */
+       rsb     ip, ip, #32             /* subtract from 32 */
+       ands    r2, r2, #7              /* count mod 8 */
+       add     pc, pc, ip, lsl #1      /* and jump! */
+       nop
+       vld1.64 {d2}, [r1:64]!          /* load a dword */
+       vtbl.8  d1, {d1-d2}, d0         /* reorder */
+       vst1.64 {d1}, [r3:64]!          /* store a dword */
+       vmov    d1, d2                  /* prepare for next dword */
+       vld1.64 {d2}, [r1:64]!          /* load a dword */
+       vtbl.8  d1, {d1-d2}, d0         /* reorder */
+       vst1.64 {d1}, [r3:64]!          /* store a dword */
+       vmov    d1, d2                  /* prepare for next dword */
+       vld1.64 {d2}, [r1:64]!          /* load a dword */
+       vtbl.8  d1, {d1-d2}, d0         /* reorder */
+       vst1.64 {d1}, [r3:64]!          /* store a dword */
+       vmov    d1, d2                  /* prepare for next dword */
+       vld1.64 {d2}, [r1:64]!          /* load a dword */
+       vtbl.8  d1, {d1-d2}, d0         /* reorder */
+       vst1.64 {d1}, [r3:64]!          /* store a dword */
+       vmov    d1, d2                  /* prepare for next dword */
+       beq     .Ldone
+       vld1.64 {d2}, [r1:64]!          /* load a dword */
+       b       .Lincongruent_finish    /* write last partial dword */
+#endif
+
+.Lcongruent_main:
+       vld1.32 {d0}, [r1:64]!          /* load next dword */
+       cmp     r2, #8                  /* can we write a full dword? */
+       blt     .Lfinish                /*   no, write final partial dword */
+       vst1.32 {d0}, [r3:64]!          /* store dword */
+       subs    r2, r2, #8              /* subtract it from the length */
+       beq     .Ldone                  /*   if 0, we're done! */
+       tst     r3, #63                 /* have we hit a 64-byte boundary? */
+       bne     .Lcongruent_main        /*   no, write next dword */
+
+       cmp     r2, #64                 /* can we write 8 dwords? */
+       blt     .Lcongruent_loop        /*   no, do this dword by dword */
+       vldm    r1!, {d0-d7}            /* load next 8 dwords */
+       cmp     r2, #128                /* can we write 16 dwords? */
+       blt     3f                      /*   no, then deal with 8 dwords */
+
+       /*
+        * The following loop interleaves 64-byte batches of loads and stores.
+        */
+1:     vldm    r1!, {d8-d15}           /* load next 8 dwords */
+       vstm    r3!, {d0-d7}            /* store 8 more dwords */
+       cmp     r2, #192                /* can we write 16+8 dwords? */
+       blt     2f                      /*   no, don't load the next 8 dwords */
+       vldm    r1!, {d0-d7}            /*   yes, load next 8 dwords */
+2:     vstm    r3!, {d8-d15}           /* store 8 more dwords */
+       sub     r2, r2, #128            /* we just stored 16 (8+8) dwords */
+       beq     .Ldone                  /*   if 0, we're done! */
+       cmp     r2, #128                /* can we write 16 dwords? */
+       bge     1b                      /*   yes, do it again */
+       cmp     r2, #64                 /* have we loaded 8 dwords? */
+       blt     .Lcongruent_loop        /*   no, proceed dword by dword */
+
+       /*
+        * We now have 8 dwords we can write in d0-d7.
+        */
+3:     vstm    r3!, {d0-d7}            /* store 8 more dwords */
+       subs    r2, r2, #64             /* we wrote 8 dwords */
+       beq     .Ldone                  /*   if 0, we're done! */
+
+.Lcongruent_loop:
+       vld1.32 {d0}, [r1]!             /* load dword from src */
+       cmp     r2, #8                  /* can we write a full dword? */
+       blt     .Lfinish                /*   no, write last partial dword */
+.Lcongruent_loop_start:
+       vst1.32 {d0}, [r3]!             /* store dword into dst */
+       subs    r2, r2, #8              /* subtract it from length */
+       beq     .Ldone                  /*   if 0, we're done! */
+       vld1.32 {d0}, [r1]!             /* load dword from src */
+       cmp     r2, #8                  /* can we write a full dword? */
+       bge     .Lcongruent_loop_start  /*   yes, so do it */
+
+.Lfinish:
+       vmov    r4, r5, d0              /* get last dword from NEON */
+       tst     r2, #4                  /* do we have at least 4 bytes left? */
+       strne   r4, [r3], #4            /* store the 1st word */
+       movne   r4, r5                  /* move 2nd word into place */
+       tst     r2, #2                  /* do we have at least 2 bytes left? */
+#ifdef __ARMEB__
+       movne   r4, r4, ror #16         /*   yes, swap halfwords */
+#endif
+       strneh  r4, [r3], #2            /*   yes, store the halfword */
+#ifdef __ARMEL__
+       movne   r4, r4, lsr #16         /*   yes, discard the bytes just written */
+#endif
+       tst     r2, #1                  /* do we have a final byte? */
+#ifdef __ARMEB__
+       movne   r4, r4, lsr #24         /*   yes, move MSB to LSB */
+#endif
+       strneb  r4, [r3], #1            /*   yes, store it */
+
+.Ldone:
+       pop     {r4-r5}                 /* restore registers */
+       RET
+
+       .p2align 3
+.Ltbl_value:
+#ifdef __ARMEL__
+       .quad   0x0706050403020100
+#else
+       .quad   0x0001020304050607
+#endif
+END(memcpy)
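
A note on the computed branch in .Lincongruent_dword: "add pc, pc,
ip, lsl #1" jumps into the middle of the unrolled tail so that only
the required dword copies execute.  The same dispatch is what a C
Duff's device expresses with switch fall-through; a simplified sketch
with plain copies standing in for the vtbl merge (the function name
is illustrative):

    #include <stddef.h>
    #include <stdint.h>

    /* Sketch only: at most 4 dwords remain; enter the unrolled
     * sequence at the matching depth. */
    static void
    copy_tail_dwords(uint64_t *dst, const uint64_t *src, size_t n)
    {
            switch (n) {
            case 4: *dst++ = *src++;    /* FALLTHROUGH */
            case 3: *dst++ = *src++;    /* FALLTHROUGH */
            case 2: *dst++ = *src++;    /* FALLTHROUGH */
            case 1: *dst++ = *src++;    /* FALLTHROUGH */
            case 0: break;
            }
    }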
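
The congruent path's 1:/2: loop is software pipelined: the first 8
dwords are loaded before the loop, and each iteration loads the next
batch while the previous one is being stored.  The data flow, though
not the hardware scheduling, looks like this in C (all names
illustrative; assumes nblocks >= 1 and a length that is a multiple of
128 bytes):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Sketch only: copy nblocks 128-byte blocks, double buffered. */
    static void
    copy_128byte_blocks(uint64_t *dst, const uint64_t *src, size_t nblocks)
    {
            uint64_t a[8], b[8];

            memcpy(a, src, 64); src += 8;   /* prime the pipeline */
            while (nblocks-- > 0) {
                    memcpy(b, src, 64); src += 8;
                    memcpy(dst, a, 64); dst += 8;
                    if (nblocks > 0) {      /* more data? reload "a" */
                            memcpy(a, src, 64); src += 8;
                    }
                    memcpy(dst, b, 64); dst += 8;
            }
    }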
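
Finally, .Lfinish moves the last partial dword back to the integer
registers and stores it a word, halfword, and byte at a time, testing
one bit of the residual count per store.  A little-endian-only C
rendering (function name illustrative; memcpy stands in for the
unaligned strne/strneh stores):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Sketch only: store the final n (n < 8) bytes held in the low
     * bytes of "last". */
    static void
    store_tail(uint8_t *dst, uint64_t last, size_t n)
    {
            if (n & 4) {
                    uint32_t w = (uint32_t)last;
                    memcpy(dst, &w, 4); dst += 4; last >>= 32;
            }
            if (n & 2) {
                    uint16_t h = (uint16_t)last;
                    memcpy(dst, &h, 2); dst += 2; last >>= 16;
            }
            if (n & 1)
                    *dst = (uint8_t)last;
    }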


