Source-Changes-HG archive


[src/trunk]: src/sys/arch/arm/cortex Rework considerably. Use alternating se...



details:   https://anonhg.NetBSD.org/src/rev/89ce5350b644
branches:  trunk
changeset: 783453:89ce5350b644
user:      matt <matt%NetBSD.org@localhost>
date:      Sat Dec 22 18:58:29 2012 +0000

description:
Rework considerably.  Use alternating sets of registers.
(Still not faster than normal ARM code).
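
For context: cpu_in_cksum_neon() accumulates the 16-bit Internet-checksum
partial sum over dlen bytes starting at dptr, and the rework keeps two
16-byte blocks in flight in alternating NEON register pairs (d6/d7 and
d8/d9) so each vld1 can overlap the widening adds of the other pair.
A minimal scalar sketch of the arithmetic being vectorized (not part of
the commit; names are illustrative, and the alignment, masking and
byte-order handling done by the assembly is omitted):

#include <stdint.h>
#include <stddef.h>

/*
 * Scalar reference for what the NEON loop accumulates: widen each
 * 16-bit word to 32 bits and add (vmovl.u16 + vadd.u32).  The real
 * loop loads 16 bytes at a time, ping-ponging between d6-d7 and
 * d8-d9 so a load is always outstanding while the previously loaded
 * block is being summed.
 */
static uint32_t
in_cksum_partial_sketch(const uint16_t *p, size_t nwords)
{
	uint32_t sum = 0;

	while (nwords-- > 0)
		sum += *p++;	/* each term fits in 16 bits */
	return sum;		/* folded to 16 bits by the caller */
}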

diffstat:

 sys/arch/arm/cortex/cpu_in_cksum_asm_neon.S |  218 ++++++++++++++++-----------
 1 files changed, 130 insertions(+), 88 deletions(-)

diffs (244 lines):

diff -r 9f077935454e -r 89ce5350b644 sys/arch/arm/cortex/cpu_in_cksum_asm_neon.S
--- a/sys/arch/arm/cortex/cpu_in_cksum_asm_neon.S       Sat Dec 22 17:51:19 2012 +0000
+++ b/sys/arch/arm/cortex/cpu_in_cksum_asm_neon.S       Sat Dec 22 18:58:29 2012 +0000
@@ -29,7 +29,7 @@
 
 #include <machine/asm.h>
 
-RCSID("$NetBSD: cpu_in_cksum_asm_neon.S,v 1.2 2012/12/18 06:05:56 matt Exp $")
+RCSID("$NetBSD: cpu_in_cksum_asm_neon.S,v 1.3 2012/12/22 18:58:29 matt Exp $")
 
 /*
  * uint32_t
@@ -39,102 +39,144 @@
  *     r1 = dlen
  */
 ENTRY(cpu_in_cksum_neon)
-       str             lr, [sp, #-8]!  /* save lr */
        mov             ip, r0          /* leave r0 as temp */
        add             r3, r1, ip      /* get end pointer */
-       ands            r1, ip, #15     /* get qword offset */
-       bic             ip, ip, #15     /* start on a qword boundary */
-       veor            q3, q3, q3      /* clear accumulator */
-       beq             .Lpre_main_loop /* ya, qword boundary start */
-
-       sub             r0, r3, ip      /* get length to qword start */
-       cmp             r0, #16         /* do we have at least a qword? */
-       andlt           r2, r3, #15     /* no, factor in trailing bytes */
-       blt             .Ltrailing_bytes /*   and do the last partial qword */
-       mov             r2, #0          /* yes, no trailing bytes */
-       bl              partial_qword   /* do the partial initial qword */
-       mov             r1, #0          /* no more leading bytes */
+       and             r1, ip, #7      /* get start offset (leading bytes) */
+       and             r2, r3, #7      /* get end offset (trailing bytes) */
+       bic             ip, ip, #7      /* start on a dword boundary */
+       add             r3, r3, #7      /* round up to a dword boundary */
+       bic             r3, r3, #7      /* end on a dword boundary */
+       veor            q2, q2, q2      /* clear accumulator */
+       vmvn.u64        q1, q2          /* create leading/trailing masks */
+       /*
+        * Normally the lower-addressed dword lives in d6, but here we
+        * reverse that: we might have only a single dword, and the final
+        * fold wants the dword to trim in d7, so keep the first dword in
+        * d7 until we know more than one will be read.
+        */
+       veor            d6, d6, d6      /* clear second dword */
+       vld1.64         {d7}, [ip:64]!  /* load first dword */
+       orrs            r0, r1, r2      /* do we have any offsets */
+       beq             .Lpre_main_loop /*   no, proceed to main loop. */
+       mov             r1, r1, lsl #3  /* leading bytes -> bits */
+       movs            r2, r2, lsl #3  /* trailing bytes -> bits */
+#ifdef __ARMEL__
+       subne           r2, r2, #64     /* trim trailing MSBs */
+#else
+       rsb             r1, r1, #0      /* trim leading MSBs */
+       rsbne           r2, r2, #64     /* trim trailing LSBs */
+#endif
+       vmov            d0, r1, r2      /* move shifts */
+       vmovl.u32       q0, d0          /* 2 U32 -> 2 U64 */
+       vshl.u64        q1, q1, q0      /* apply shifts to masks */
+       vand.u32        d7, d7, d2      /* apply leading mask to 1st dword */
+       tst             r1, #8          /* was the starting address odd? */
+       beq             .Lpre_main_loop /*   no, go to pre_main_loop */
+       veor            d2, d2, d2      /* clear d2 (indicate odd addr) */
 
 .Lpre_main_loop:
-       and             r2, r3, #15     /* trailing bytes */
-       bic             r3, r3, #15     /* last partial or empty qword */
-       cmp             ip, r3          /* at or past the end? */
-       bge             .Ltrailing_bytes /* yes, deal with any trailing bytes */
+       cmp             ip, r3          /* do we just have a single dword? */
+       beq             .Lfinish_up     /*   yes, let's finish up! */
+       vmov            d6, d7          /* move 1st dword to loaddr reg */
+       vld1.64         {d7}, [ip:64]!  /* read rest of initial qword */
 
 .Lmain_loop:
-       vld1.64         {d4-d5}, [ip:128]!
-       vmovl.u16       q0, d4          /* 4 U16 -> 4 U32 */
-       vadd.u32        q3, q3, q0      /* add 4 U32 to accumulator */
-       vmovl.u16       q0, d5          /* 4 U16 -> 4 U32 */
-       vadd.u32        q3, q3, q0      /* add 4 U32 to accumulator */
-       cmp             ip, r3
-       blt             .Lmain_loop
+       subs            r1, r3, ip      /* how much left to do? */
+       beq             .Lfinish_up     /*   = 0? we are done. */
+
+       bics            r0, r1, #31     /* we deal with octawords only */
+       beq             .Lloop_end      /*   no octawords? exit loop */
+       rsbs            r0, r0, #128    /* subtract from 128 */
+       ble             .Lloop128       /*   <= 0?, do 128 at a time. */
+       add             r0, r0, r0, lsr #2 /* multiply by 1.25 */
+       add             pc, pc, r0      /* and jump! */
+       nop
+
+.Lloop128:
+       vld1.64         {d8-d9}, [ip:64]!       /* 128 left */
+       vmovl.u16       q0, d6          /* 4 U16 -> 4 U32 */
+       vadd.u32        q2, q2, q0      /* add 4 U32 to accumulator */
+       vmovl.u16       q0, d7          /* 4 U16 -> 4 U32 */
+       vadd.u32        q2, q2, q0      /* add 4 U32 to accumulator */
+       vld1.64         {d6-d7}, [ip:64]!
+       vmovl.u16       q0, d8          /* 4 U16 -> 4 U32 */
+       vadd.u32        q2, q2, q0      /* add 4 U32 to accumulator */
+       vmovl.u16       q0, d9          /* 4 U16 -> 4 U32 */
+       vadd.u32        q2, q2, q0      /* add 4 U32 to accumulator */
+
+       vld1.64         {d8-d9}, [ip:64]!       /* 96 left */
+       vmovl.u16       q0, d6          /* 4 U16 -> 4 U32 */
+       vadd.u32        q2, q2, q0      /* add 4 U32 to accumulator */
+       vmovl.u16       q0, d7          /* 4 U16 -> 4 U32 */
+       vadd.u32        q2, q2, q0      /* add 4 U32 to accumulator */
+       vld1.64         {d6-d7}, [ip:64]!
+       vmovl.u16       q0, d8          /* 4 U16 -> 4 U32 */
+       vadd.u32        q2, q2, q0      /* add 4 U32 to accumulator */
+       vmovl.u16       q0, d9          /* 4 U16 -> 4 U32 */
+       vadd.u32        q2, q2, q0      /* add 4 U32 to accumulator */
 
-.Ltrailing_bytes:
-       cmp             r2, #0          /* any trailing bytes? */
-       blne            partial_qword   /* yes, do final qword */
-       ldr             lr, [sp], #8    /* fetch LR */
+       vld1.64         {d8-d9}, [ip:64]!       /* 64 left */
+       vmovl.u16       q0, d6          /* 4 U16 -> 4 U32 */
+       vadd.u32        q2, q2, q0      /* add 4 U32 to accumulator */
+       vmovl.u16       q0, d7          /* 4 U16 -> 4 U32 */
+       vadd.u32        q2, q2, q0      /* add 4 U32 to accumulator */
+       vld1.64         {d6-d7}, [ip:64]!
+       vmovl.u16       q0, d8          /* 4 U16 -> 4 U32 */
+       vadd.u32        q2, q2, q0      /* add 4 U32 to accumulator */
+       vmovl.u16       q0, d9          /* 4 U16 -> 4 U32 */
+       vadd.u32        q2, q2, q0      /* add 4 U32 to accumulator */
+
+       vld1.64         {d8-d9}, [ip:64]!       /* 32 left */
+       vmovl.u16       q0, d6          /* 4 U16 -> 4 U32 */
+       vadd.u32        q2, q2, q0      /* add 4 U32 to accumulator */
+       vmovl.u16       q0, d7          /* 4 U16 -> 4 U32 */
+       vadd.u32        q2, q2, q0      /* add 4 U32 to accumulator */
+       vld1.64         {d6-d7}, [ip:64]!
+       vmovl.u16       q0, d8          /* 4 U16 -> 4 U32 */
+       vadd.u32        q2, q2, q0      /* add 4 U32 to accumulator */
+       vmovl.u16       q0, d9          /* 4 U16 -> 4 U32 */
+       vadd.u32        q2, q2, q0      /* add 4 U32 to accumulator */
+
+       b               .Lmain_loop
 
-.Lfold_csum:
+.Lloop_end:
        /*
-        * We now have 4 32-bit sums in q3 (each is 20-bits or less).
+        * We have one to three more dwords to process
+        */
+       rsb             r0, r1, #24
+       add             r0, r0, r0, lsr #1
+       add             pc, pc, r0
+       nop
+       vmovl.u16       q0, d6          /* 4 U16 -> 4 U32 */
+       vadd.u32        q2, q2, q0      /* add 4 U32 to accumulator */
+       vld1.64         {d6}, [ip:64]!
+       vmovl.u16       q0, d6          /* 4 U16 -> 4 U32 */
+       vadd.u32        q2, q2, q0      /* add 4 U32 to accumulator */
+       vld1.64         {d6}, [ip:64]!
+       vmovl.u16       q0, d7          /* 4 U16 -> 4 U32 */
+       vadd.u32        q2, q2, q0      /* add 4 U32 to accumulator */
+       vld1.64         {d7}, [ip:64]!
+
+.Lfinish_up:
+       /*
+        * Apply remaining data in d6 and d7
+        */
+       vmovl.u16       q0, d6          /* 4 U16 -> 4 U32 */
+       vadd.u32        q2, q2, q0      /* add 4 U32 to accumulator */
+       vand            d7, d7, d3      /* apply trailing mask */
+       vmovl.u16       q0, d7          /* 4 U16 -> 4 U32 */
+       vadd.u32        q2, q2, q0      /* add 4 U32 to accumulator */
+
+       /*
+        * We now have 4 32-bit sums in q2 (each is 20 bits or less).
         * Now to get to 1 I32 bit sum.
         */
-       vadd.u32        d6, d6, d7      /* 4 I32 -> 2 I32 */
-       vmovl.u32       q3, d6          /* split two I32 into two I64 */
-       vadd.u32        d6, d6, d7      /* 2 I32 -> 1 I32 */
-       vmovl.u16       q3, d6          /* split two I16 into two I32 */
-       vmovl.u32       q3, d6          /* split two I32 into two I64 */
-       vadd.u32        d6, d6, d7      /* 2 I16 -> 1 I32 */
-       vmov            r0, s12         /* fetch csum from d6/q3 */
-       /*
-        * The result could be 0x10000 but we expect the caller to deal
-        * with it
-        */
-       RET
+       vadd.u32        d4, d4, d5      /* 4 I32 -> 2 I32 */
+       vmov            r2, s4          /* get flag for odd start */
+       teq             r2, #0          /* was start addr even? */
+       vmov            r0, r1, d4      /* extract two I32 */
+       rev16eq         r0, r0          /* byte swap if start was odd */
+       rev16eq         r1, r1          /* byte swap if start was odd */
+       adds            ip, r0, r1      /* add them producing carry */
+#include "arm/arm/cpu_in_cksum_fold.S"
 END(cpu_in_cksum_neon)
-
-/*
- * Handling partial qwords is tricky.
- */
-       .type           partial_qword, %function
-partial_qword:
-       str             lr, [sp, #-8]!  /* save LR */
-       vld1.64         {d4-d5}, [ip:128]!      /* fetch data */
-#ifdef __ARMEB__
-       vswp            d5, d4          /* on BE, MSW should be in d5 */
-#endif
-       veor            q0, q0, q0      /* create a null mask */
-       movs            r0, r1, lsl #3  /* any leading bytes? */
-       blne            _C_LABEL(__neon_leading_qword_bitmask)
-       vmvn.u64        q0, q0          /* invert leading mask to trailing */
-       vand.u32        q2, q2, q0      /* preserve them */
-       vmvn.u64        q0, #0          /* create mask */
-       movs            r0, r2, lsl #3  /* if equal, no trailing bytes */
-       blne            _C_LABEL(__neon_leading_qword_bitmask)
-       vand.u32        q2, q2, q0      /* preserve them */
-       ldr             lr, [sp], #8    /* Fetch LR */
-       vmovl.u16       q0, d4          /* 4 U16 -> 4 U32 */
-       vadd.u32        q3, q3, q0      /* add 4 U32 to accumulator */
-       vmovl.u16       q0, d5          /* 4 U16 -> 4 U32 */
-       vadd.u32        q3, q3, q0      /* add 4 U32 to accumulator */
-       RET
-       .size           partial_qword, . - partial_qword
-
-/*
- * uint32_t cpu_in_cksum_neon_v4hdr(void *dptr)
- */
-ENTRY(cpu_in_cksum_neon_v4hdr)
-       bic             ip, r0, #7
-       vld1.32         {d0-d2},[ip]    /* it must be in 24 bytes */
-       tst             r0, #4          /* depending on 64-bit alignment */
-       beq             1f
-       vmov            s0, s5          /* move last U32 to first U32 */
-1:     vmovl.u32       q1, d2          /* move s5 to d3 and clear s5 */
-       vmovl.u16       q3, d0          /* 4 U16 -> 4 U32 */
-       vmovl.u16       q2, d1          /* 4 U16 -> 4 U32 */
-       vadd.u32        q3, q3, q2      /* add 4 U32 to accumulator */
-       vmovl.u16       q2, d2          /* 4 U16 -> 4 U32 */
-       vadd.u32        q3, q3, q2      /* add 4 U32 to accumulator */
-       b               .Lfold_csum
-END(cpu_in_cksum_neon_v4hdr)
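
Two pieces of the new code worth calling out: the leading/trailing masks
are built by shifting an all-ones quadword by the start/end offsets
(converted to bits), so the first and last dwords can be loaded from
dword-aligned addresses and the out-of-range bytes zeroed before they
are summed; the four 32-bit partials in q2 are then added pairwise and
handed to the shared arm/arm/cpu_in_cksum_fold.S tail.  A rough C
equivalent of both steps, little-endian only, with made-up names and
purely for illustration:

#include <stdint.h>

/* Zero the 'skip' low-addressed (leading) bytes of an aligned LE dword. */
static inline uint64_t
mask_leading(uint64_t dword, unsigned skip)
{
	return skip ? dword & (~UINT64_C(0) << (skip * 8)) : dword;
}

/*
 * Keep only the first 'keep' bytes of the final LE dword; keep == 0
 * means the end fell on a dword boundary, so the whole dword is data.
 */
static inline uint64_t
mask_trailing(uint64_t dword, unsigned keep)
{
	return keep ? dword & (~UINT64_C(0) >> ((8 - keep) * 8)) : dword;
}

/*
 * Standard Internet-checksum fold of a 32-bit partial sum down to
 * 16 bits with end-around carry; the included cpu_in_cksum_fold.S
 * presumably performs the equivalent reduction on the value left in
 * ip by the adds above.
 */
static inline uint16_t
fold32(uint32_t sum)
{
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}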


