Source-Changes-HG archive
[src/trunk]: src/common/lib/libc/arch/arm/string Debug. This becomes faster ...
details: https://anonhg.NetBSD.org/src/rev/534f4eb554c4
branches: trunk
changeset: 783540:534f4eb554c4
user: matt <matt%NetBSD.org@localhost>
date: Fri Dec 28 05:15:08 2012 +0000
description:
Debug. This becomes faster than the normal strlen at about 80 characters.
diffstat:
common/lib/libc/arch/arm/string/strlen_neon.S | 53 ++++++++++++--------------
1 files changed, 25 insertions(+), 28 deletions(-)
diffs (87 lines):
diff -r 332729c890e9 -r 534f4eb554c4 common/lib/libc/arch/arm/string/strlen_neon.S
--- a/common/lib/libc/arch/arm/string/strlen_neon.S Fri Dec 28 03:48:00 2012 +0000
+++ b/common/lib/libc/arch/arm/string/strlen_neon.S Fri Dec 28 05:15:08 2012 +0000
@@ -29,32 +29,31 @@
#include <machine/asm.h>
-RCSID("$NetBSD: strlen_neon.S,v 1.2 2012/12/15 22:23:31 matt Exp $")
+RCSID("$NetBSD: strlen_neon.S,v 1.3 2012/12/28 05:15:08 matt Exp $")
.text
ENTRY(strlen)
- mov ip, r0 /* we r0 for return value */
+ mov ip, r0 /* we use r0 for return value */
ands r1, r0, #15 /* verify qword alignment */
neg r0, r1 /* subtract misalignment from count */
veor q2, q2, q2 /* clear mask */
mov r3, #7 /* NBBY - 1 */
vdup.32 q3, r3 /* dup throughout q3 */
- mov r3, #0x04 /* magic since there are 4 bytes per U32 */
- orr r3, r3, lsl #8 /* copy to next 8 bits */
+ movw r3, #0x0404 /* magic since there are 4 bytes per U32 */
orr r3, r3, lsl #16 /* copy to upper 16 bits */
beq .Lmain_loop
veor q0, q0, q0 /* clear q0 */
vmvn q2, q2 /* set all 16 bytes of mask to all 1s */
bic ip, ip, #15 /* qword align string address */
- lsl r2, r1, #3 /* convert to bits */
- neg r2, r2 /* make negative since we are shifting right */
- tst r1, #8 /* do we need to skip the first 8? */
- bne 1f /* yes, we need to skip */
- veor d4, d4, d4 /* clear lower 8 bytes (upper is set) */
- vmov s2, r2 /* set shift amount for upper half */
- b 2f
-1: vmov s0, r2 /* set shift amount for lower half */
-2: vshl.u64 q2, q2, q0 /* shift */
+ lsl r1, r1, #3 /* convert to bits */
+ cmp r1, #64
+ rsbgt r1, r1, #128 /* > 64? BE so we are shifting LSW right */
+ movgt r2, #0 /* > 64? leave MSW alone */
+ rsble r2, r1, #64 /* <=64? BE so we are shifting MSW right */
+ movle r1, #64 /* <=64? clear LSW */
+ vmov d0, r1, r2 /* set shifts for lower and upper halves */
+ vmovl.u32 q0, d0 /* 2 U32 -> 2 U64 */
+ vshl.u64 q2, q2, q0 /* shift */
/*
* Main loop. Load 16 bytes, do a clz,
*/
@@ -65,25 +64,23 @@
#endif
vswp d0, d1 /* swap dwords to get BE qword */
vorr q0, q0, q2 /* or "in" leading byte mask */
- veor q2, q2, q2 /* clear byte mask */
+ veor q2, q2, q2 /* clear leading byte mask */
vceq.i8 q1, q0, #0 /* test each byte for 0 */
/* Why couldn't there be a 64-bit CLZ? */
- vclz.i32 q1, q1 /* count leading zeroes to find the 0 byte */
- vadd.i32 q1, q1, q3 /* round up to byte boundary */
+ vclz.u32 q1, q1 /* count leading zeroes to find the 0 byte */
+ vadd.u32 q1, q1, q3 /* round up to byte boundary */
vshr.u32 q1, q1, #3 /* convert to bytes */
- vmovn.i32 d0, q1 /* 4 I32 -> 4 I16 */
- vmovn.i16 d0, q0 /* 4 I16 -> 4 I8 */
+ vmovn.u32 d0, q1 /* 4 I32 -> 4 I16 */
+ vmovn.u16 d0, q0 /* 4 I16 -> 4 I8 */
vmov r2, s0 /* get counts */
- cmp r2, r3 /* count eq 4 in each byte? */
- addeq r0, #16 /* no NULs */
+ eors r2, r2, r3 /* xor with 0x04040404 */
+ addeq r0, #16 /* 0? no NULs */
beq .Lmain_loop /* get next qword */
- /* r2[31:24] already has 1st word byte count */
- tst r2, #(4 << 24) /* first word has 4 non-NUL? */
- addne r2, r2, r2, lsl #8 /* add second word byte-count */
- tstne r2, #(4 << 16) /* second word has 4 non-NUL? */
- addne r2, r2, r2, lsl #16 /* add third word byte-count */
- tstne r2, #(4 << 8) /* third has 4 non-NULL? */
- addne r2, r2, r2, lsl #24 /* add fourth word byte-count */
- add r0, r0, r2, lsr #24 /* add accumulated byte-count to length */
+ clz ip, r2 /* count leading zeros */
+ mov r2, r2, lsl ip /* discard them */
+ mov ip, ip, lsr #3 /* divide leading zeroes by 8 */
+ add r0, r0, ip, lsl #2 /* multiply by 4 and add to count */
+ and r2, r2, #(3 << 29)
+ add r0, r0, r2, lsr #29
RET /* and return. */
END(strlen)
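
For readers who do not speak NEON assembly, here is a minimal C model of the approach used above, not the committed code: bias the length by the qword misalignment, build a mask that forces the bytes preceding the string inside its first aligned 16-byte chunk to be non-NUL, then scan aligned 16-byte chunks until a NUL is found. The function name strlen_neon_model and the byte-wise loops are illustrative only; the real routine does the per-byte work with vector instructions (vorr/vceq/vclz).

#include <stddef.h>	/* size_t */
#include <stdint.h>	/* uint8_t, uintptr_t */
#include <string.h>	/* memcpy, memset */

size_t
strlen_neon_model(const char *s)
{
	size_t misalign = (uintptr_t)s & 15;			/* ands r1, r0, #15 */
	const uint8_t *p = (const uint8_t *)s - misalign;	/* bic ip, ip, #15 */
	size_t len = 0 - misalign;				/* neg r0, r1 */
	uint8_t mask[16];
	unsigned i;

	/* Leading byte mask: 0xff over the bytes that precede the string. */
	memset(mask, 0, sizeof(mask));
	memset(mask, 0xff, misalign);

	for (;;) {
		uint8_t chunk[16];

		memcpy(chunk, p, 16);			/* the aligned 16-byte load */
		for (i = 0; i < 16; i++)
			chunk[i] |= mask[i];		/* vorr in the leading byte mask */
		memset(mask, 0, sizeof(mask));		/* veor: the mask applies only once */

		for (i = 0; i < 16; i++) {		/* vceq/vclz: find the first NUL */
			if (chunk[i] == '\0')
				return len + i;
		}
		len += 16;				/* addeq r0, #16: no NULs, next qword */
		p += 16;
	}
}

The aligned 16-byte over-read before the start of the string stays inside a single aligned block, so it never crosses a page boundary; that makes it safe on real hardware, although the C model above is not strictly conforming when it reads those bytes.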
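
The subtlest part of the new tail code is turning the four per-word byte counts, packed into r2 with the first word's count in bits 31:24 and each count in the range 0..4, into a byte offset with a single scalar clz. Below is a hedged sketch of that arithmetic, with an illustrative function name and GCC/Clang's __builtin_clz standing in for the ARM clz instruction.

#include <stdint.h>

/*
 * Illustrative sketch (not the committed code) of the count extraction.
 * 'counts' packs four per-word byte counts, first word of the chunk in
 * the most significant byte, each count 0..4.  The caller has already
 * taken the "all counts are 4, no NUL here" branch, so
 * counts ^ 0x04040404 is known to be nonzero.
 */
static unsigned
chunk_tail_bytes(uint32_t counts)
{
	uint32_t x = counts ^ 0x04040404; /* eors r2, r2, r3: full words become 0x00 */
	unsigned lz = __builtin_clz(x);	  /* clz ip, r2: equals 8*word_index + 5 */
	uint32_t rest = x << lz;	  /* mov r2, r2, lsl ip: discard leading zeros */
	unsigned word = lz >> 3;	  /* mov ip, ip, lsr #3: which word holds the NUL */
	unsigned byte = (rest >> 29) & 3; /* and/add: low two bits of count^4 == count */

	return (word << 2) + byte;	  /* 4 bytes per full word, plus the partial word */
}

/*
 * Example: counts = 0x04040200 (two full words, then a word with 2 bytes
 * before its NUL) yields 4*2 + 2 = 10 bytes ahead of the NUL in the chunk.
 */

This replaces the old conditional tst/addne accumulation with a clz-based computation.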