Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src/sys/arch/arm/arm When doing 16 bytes at a time, alternate register sets to reduce load stall times.



details:   https://anonhg.NetBSD.org/src/rev/d98c960917ef
branches:  trunk
changeset: 783449:d98c960917ef
user:      matt <matt%NetBSD.org@localhost>
date:      Sat Dec 22 08:12:26 2012 +0000

description:
When doing 16 bytes at a time, alternate register sets to reduce load stall
times.

diffstat:

 sys/arch/arm/arm/cpu_in_cksum_buffer.S |  142 +++++++++++++++++---------------
 1 files changed, 77 insertions(+), 65 deletions(-)

diffs (216 lines):

diff -r 6c7a9c0eba90 -r d98c960917ef sys/arch/arm/arm/cpu_in_cksum_buffer.S
--- a/sys/arch/arm/arm/cpu_in_cksum_buffer.S    Sat Dec 22 08:10:40 2012 +0000
+++ b/sys/arch/arm/arm/cpu_in_cksum_buffer.S    Sat Dec 22 08:12:26 2012 +0000
@@ -29,7 +29,7 @@
 
 #include <machine/asm.h>
 
-RCSID("$NetBSD: cpu_in_cksum_buffer.S,v 1.5 2012/12/22 08:10:40 matt Exp $")
+RCSID("$NetBSD: cpu_in_cksum_buffer.S,v 1.6 2012/12/22 08:12:26 matt Exp $")
 
 /*
  * Special note:
@@ -64,7 +64,7 @@
  */
 
 ENTRY(cpu_in_cksum_buffer)
-#ifdef _ARM_ARCH_DWORD_OK
+#if defined(_ARM_ARCH_DWORD_OK) && !defined(__OPTIMIZE_SIZE__)
        pld     [r0]                    /* prefetch the first data */
 #endif
        mov     ip, r2                  /* initialize accumulator */
@@ -73,18 +73,29 @@
        beq     .Lfold_nopop            /* fold the checksum */
        ands    r2, r0, #7              /* test for dword alignment */
        bne     .Ldword_misaligned      /*   no, fixup non dword aligned */
-
        push    {r4-r5}                 /* save temporaries */
+       sub     RLO, r1, #1             /* subtract 1 from length */
+       bics    RLO, RLO, #3            /* more than 1 word? */
+       beq     .Lfinal_word            /*   no, just load final word */
        add     r2, r1, r0              /* point r2 just past end */
        LOAD_DWORD_INTO_R4(r0)          /* load first dword */
-       sub     r1, r2, r0              /* we've read one dword */
+       sub     r1, r1, #8              /* we've read one dword */
+#if defined(_ARM_ARCH_DWORD_OK) && !defined(__OPTIMIZE_SIZE__)
+       pld     [r0, #32]               /* prefetch data */
+#endif
+       .p2align 3
 .Ldword_aligned_noload:
-       add     r3, r1, #3              /* round up to word length */
+       add     r1, r1, #3              /* round up word length */
+       bics    r3, r1, #15             /* at least 16 bytes to do? */
+       beq     3f
+       push    {r6-r7}
 #if !defined(__OPTIMIZE_SIZE__)
-       bics    r3, r3, #63             /* at least 64 bytes to do? */
-       beq     2f                      /*   no, then do final collection */
-       push    {r6-r7}
-1:     
+       bics    r3, r1, #63             /* at least 64 bytes to do? */
+       bne     .Lloop64                /*   yes, then do them */
+       tst     r1, #32                 /* what about 32 bytes */
+       bne     .Lloop32                /*   yes, then do them */
+       b       .Lloop16                /* then we must have 16 bytes */
+.Lloop64:
        LOAD_DWORD_INTO_R6(r0)          /* 8 dwords left */
        adcs    ip, ip, r4
        adcs    ip, ip, r5
@@ -97,12 +108,15 @@
        LOAD_DWORD_INTO_R4(r0)          /* 5 dwords left */
        adcs    ip, ip, r6
        adcs    ip, ip, r7
+.Lloop32:
        LOAD_DWORD_INTO_R6(r0)          /* 4 dwords left */
        adcs    ip, ip, r4
        adcs    ip, ip, r5
        LOAD_DWORD_INTO_R4(r0)          /* 3 dwords left */
        adcs    ip, ip, r6
        adcs    ip, ip, r7
+#endif /* !__OPTIMIZE_SIZE__ */
+.Lloop16:
        LOAD_DWORD_INTO_R6(r0)          /* 2 dwords left */
        adcs    ip, ip, r4
        adcs    ip, ip, r5
@@ -111,65 +125,61 @@
        adcs    ip, ip, r7
 
        sub     r1, r2, r0              /* find how much is left */
-       add     r3, r1, #3              /* round up to word length */
-       bics    r3, r3, #63             /* at least 64 bytes to do? */
-       bne     1b                      /*   yes, run the loop again */
+       add     r1, r1, #3              /* round up word length */
+#if !defined(__OPTIMIZE_SIZE__)
+       bics    r3, r1, #63             /* at least 64 bytes to do? */
+       bne     .Lloop64                /*   yes, run the loop again */
+       tst     r1, #32                 /* what about 32 bytes? */
+       bne     .Lloop32                /*   yes, do 32-bytes */
+#endif /* !__OPTIMIZE_SIZE__ */
+
+       bics    r3, r1, #15             /* at least 16 bytes to do? */
+       bne     .Lloop16                /*   yes, deal with them. */
 
        pop     {r6-r7}                 /* done with these so restore them */
-#endif /* __OPTIMIZE_SIZE__ */
 
-2:     teq     r1, #0                  /* at the end? */
-       beq     .Lfinal_add_one_dword   /*   yes, do the final add */
-       bmi     .Lfinal_dword_noload    /*   past it, handle the final dword */
-3:
-#ifdef _ARM_ARCH_DWORD_OK
-       pld     [r0, #32]               /* grab next cache line */
-#endif
-#ifndef __OPTIMIZE_SIZE__
-       add     r3, r1, #3              /* round to word length */
-       bic     r3, r3, #7              /* find out how many dwords to do */
-       rsb     r3, r3, #56             /* subtract from 56 */
-       add     r3, r3, r3, lsr #1      /* multiply by 1.5 */
-       add     pc, pc, r3              /* and jump! */
-       nop
-       adcs    ip, ip, r4              /* 7 dwords left */
-       adcs    ip, ip, r5
-       LOAD_DWORD_INTO_R4(r0)
-       adcs    ip, ip, r4              /* 6 dwords left */
+3:     sub     r1, r2, r0              /* find how much is left */
+       teq     r1, #0                  /* how much left?? */
+       beq     .Lfinal_add_one_dword   /*   = 0? do the final add */
+       bmi     .Lfinal_dword_noload    /*   < 0? trim last word */
+       /*
+        * We have from 1-12 bytes left to do.
+        */
+       add     r3, r1, #3              /* round up word length */
+       tst     r3, #8                  /* at least one dword (5+ bytes)? */
+       beq     .Lfinal_word            /*   no, deal with the final word. */
+       /*
+        * We have at least 5 bytes so we need to load at least 8 (maybe 12)
+        * so load 8.
+        */
+       adcs    ip, ip, r4
        adcs    ip, ip, r5
        LOAD_DWORD_INTO_R4(r0)
-       adcs    ip, ip, r4              /* 5 dwords left */
-       adcs    ip, ip, r5
-       LOAD_DWORD_INTO_R4(r0)
-       adcs    ip, ip, r4              /* 4 dwords left */
-       adcs    ip, ip, r5
-       LOAD_DWORD_INTO_R4(r0)
-       adcs    ip, ip, r4              /* 3 dwords left */
-       adcs    ip, ip, r5
-       LOAD_DWORD_INTO_R4(r0)
-       adcs    ip, ip, r4              /* 2 dwords left */
-       adcs    ip, ip, r5
-       LOAD_DWORD_INTO_R4(r0)
-#endif /* __OPTIMIZE_SIZE__ */
-       adcs    ip, ip, r4              /* 1 dword left */
-       adcs    ip, ip, r5
-       LOAD_DWORD_INTO_R4(r0)
-       sub     r1, r2, r0              /* find out much left to do? */
-       teq     r1, #0                  /* at the end? */
-       beq     .Lfinal_add_one_dword   /*   yep, proceed to fold */
-       bmi     .Lfinal_dword_noload    /*   past it, handle the final dword */
-#ifdef __OPTIMIZE_SIZE__
-       add     r3, r1, #3              /* round up to word length */
-       bics    r3, r3, #7              /* exhaust all dwords? */
-       bne     3b                      /*   not yet, do more */
-#endif
-       adcs    ip, ip, RHI             /*   > 0? add previous HI */
-       ldr     RHI, [r0]               /*   > 0? load new hi word */
-       tst     r1, #3
-       beq     .Lfinal_add_one_dword   /*   = 0? just add that word. */
+       sub     r1, r1, #8              /* subtract dword from length */
+       teq     r1, #0                  /* how much left?? */
+       beq     .Lfinal_add_one_dword   /*   = 0? do the final add */
+       bmi     .Lfinal_dword_noload    /*   < 0? trim last word */
+.Lfinal_word:
+       /*
+        * Finally we are at the word to load.
+        */
+       adcs    ip, ip, RHI             /* accumulate RHI */
+       ldr     RHI, [r0]               /* load last word */
+       tst     r1, #3                  /* are we word aligned */
+       beq     .Lfinal_add_one_dword   /*   yes, accumulate last dword */
 
 .Lfinal_dword_noload:
        rsb     r1, r1, #4              /* find out many bytes to discard */
+       and     r1, r1, #3              /* limit to a single word length */
+       mov     r1, r1, lsl #3          /* bytes -> bits */
+#ifdef __ARMEL__
+       mov     RHI, RHI, lsl r1        /* discard unneeded bits */
+       mov     RHI, RHI, lsr r1        /* replace with zero bits */
+#else
+       mov     RHI, RHI, lsr r1        /* discard unneeded bits */
+       mov     RHI, RHI, lsl r1        /* replace with zero bits */
+#endif
+#if 0 
        tst     r1, #2                  /* discard at least 2? */
 #ifdef __ARMEL__
        movne   RHI, RHI, lsl #16       /*   yes, discard upper halfword */
@@ -178,6 +188,7 @@
 #endif
        tst     r1, #1                  /* discard odd? */
        bicne   RHI, RHI, #BYTE3        /*   yes, discard odd byte */
+#endif
 .Lfinal_add_one_dword:
        adcs    ip, ip, RLO             /* add 1st to accumulator */
 .Lfinal_add_one_word:
@@ -235,14 +246,15 @@
        /*
         * We are now dword aligned.
         */
-       tst     r2, #2                  /* discard at least 2? */
+       and     r3, r2, #3              /* limit to a single word length */
+       mov     r3, r3, lsl #3          /* bytes -> bits */
 #ifdef __ARMEL__
-       movne   RLO, RLO, lsr #16       /*   yes, discard lower halfword */
+       mov     RLO, RLO, lsr r3        /* discard unneeded bits */
+       mov     RLO, RLO, lsl r3        /* replace with zero bits */
 #else
-       movne   RLO, RLO, lsl #16       /*   yes, discard upper halfword */
+       mov     RLO, RLO, lsl r3        /* discard unneeded bits */
+       mov     RLO, RLO, lsr r3        /* replace with zero bits */
 #endif
-       tst     r2, #1                  /* start odd? */
-       bicne   RLO, RLO, #BYTE0        /*   yes, discard even byte */
        /*
         * See if we have a least a full dword to process.  If we do, jump
         * into the main loop as if we just load a single dword.



Home | Main Index | Thread Index | Old Index