Source-Changes archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

Re: CVS commit: src/sys/arch/arm/ep93xx



> On Sun, May 25, 2008 at 03:57:22AM +0000, Katsuomi Hamajima wrote:
>> Modified Files:
>>      src/sys/arch/arm/ep93xx: epclk.c
>> 
>> Log Message:
>> speed up an initial value  calculation of "remaining" variable in delay().
> 
> Are you sure that GCC doesn't do exactly that? For unsigned arithmetic,
> GCC will normally use unsigned mul + shift and not a division. It would
> be strongly preferred to not have inline assembly here.

Sorry, I do not understand your point. I have attached the source and
disassembly of delay() below. Could you please explain in more detail?

/*
 * delay(n): busy-wait until the timer read through TIMER4VAL() has advanced
 * by n * TIMER_FREQ / 1000000 ticks.
 *
 * n is presumably a duration in microseconds (TIMER_FREQ would then be in
 * ticks per second) -- TODO confirm against the TIMER_FREQ definition.
 *
 * Returns nothing.  When DEBUG is defined and epclk_sc is still NULL, a
 * message is printed and the function returns without waiting.
 */
void
delay(unsigned int n)
{
        unsigned int cur_tick, initial_tick;
        int remaining;

#ifdef DEBUG
        /* Debug builds only: refuse to spin while epclk_sc is still NULL. */
        if (epclk_sc == NULL) {
                printf("delay: called before start epclk\n");
                return;
        }
#endif

        /*
         * Read the counter first, so that the rest of the setup overhead is
         * counted.
         */
        initial_tick = TIMER4VAL();

        /*
         * Convert n into timer ticks.  The guard selects the 32-bit path only
         * when n * TIMER_FREQ cannot overflow an unsigned int.
         */
        if (n <= UINT_MAX / TIMER_FREQ) {
                /*
                 * For unsigned arithmetic, division can be replaced with
                 * multiplication with the inverse and a shift.
                 *
                 * NOTE(review): the objdump listing that accompanies this
                 * code shows a call to __udivsi3 for this expression, so the
                 * compiler used for that build did not apply this
                 * transformation.
                 */
                remaining = n * TIMER_FREQ / 1000000;
        } else {
                /* This is a very long delay.
                 * Being slow here doesn't matter.
                 * The product is widened to unsigned long long so that it
                 * cannot overflow before the division.
                 *
                 * NOTE(review): remaining is a signed int; a tick count above
                 * INT_MAX would presumably become non-positive and skip the
                 * wait loop below -- confirm callers never pass such n.
                 */
                remaining = (unsigned long long) n * TIMER_FREQ / 1000000;
        }

        /*
         * Poll the counter and subtract the ticks elapsed since the previous
         * sample until the budget is used up.
         *
         * The else branch subtracts initial_tick - cur_tick, i.e. a decreasing
         * counter value is treated as normal progress, and an increasing
         * value is treated as a wrap.
         * NOTE(review): that is the accounting for a down-counting timer --
         * confirm the counting direction of the timer behind TIMER4VAL().
         * NOTE(review): on the wrap branch, UINT_MAX - (cur_tick -
         * initial_tick) is one less than the modular distance
         * initial_tick - cur_tick.
         */
        while (remaining > 0) {
                cur_tick = TIMER4VAL();
                if (cur_tick > initial_tick)
                        remaining -= UINT_MAX - (cur_tick - initial_tick);
                else
                        remaining -= initial_tick - cur_tick;
                /* Measure the next interval from this sample. */
                initial_tick = cur_tick;
        }
}

0000000c <delay>:
   c:   e1a0c00d        mov     ip, sp
  10:   e92dd8f0        stmdb   sp!, {r4, r5, r6, r7, fp, ip, lr, pc}
  14:   e3a02c11        mov     r2, #4352       ; 0x1100
  18:   e2822011        add     r2, r2, #17     ; 0x11
  1c:   e3a0320f        mov     r3, #-268435456 ; 0xf0000000
  20:   e2833811        add     r3, r3, #1114112        ; 0x110000
  24:   e1500002        cmp     r0, r2
  28:   e24cb004        sub     fp, ip, #4      ; 0x4
  2c:   e5937060        ldr     r7, [r3, #96]
  30:   e1a0c000        mov     ip, r0
  34:   8a000020        bhi     88 <delay+0x7c>
  38:   e1a00a00        mov     r0, r0, lsl #20
  3c:   e3a0193d        mov     r1, #999424     ; 0xf4000
  40:   e040080c        sub     r0, r0, ip, lsl #16
  44:   e2811d09        add     r1, r1, #576    ; 0x240
  48:   ebfffffe        bl      0 <__udivsi3>
  4c:   e3500000        cmp     r0, #0  ; 0x0
  50:   d89da8f0        ldmleia sp, {r4, r5, r6, r7, fp, sp, pc}
  54:   e3a0320f        mov     r3, #-268435456 ; 0xf0000000
  58:   e2833811        add     r3, r3, #1114112        ; 0x110000
  5c:   e5933060        ldr     r3, [r3, #96]
  60:   e0672003        rsb     r2, r7, r3
  64:   e1530007        cmp     r3, r7
  68:   e0631007        rsb     r1, r3, r7
  6c:   e1e02002        mvn     r2, r2
  70:   80620000        rsbhi   r0, r2, r0
  74:   90610000        rsbls   r0, r1, r0
  78:   e3500000        cmp     r0, #0  ; 0x0
  7c:   e1a07003        mov     r7, r3
  80:   ca000013        bgt     54 <delay+0x48>
  84:   e89da8f0        ldmia   sp, {r4, r5, r6, r7, fp, sp, pc}
  88:   e1a03000        mov     r3, r0
  8c:   e3a04000        mov     r4, #0  ; 0x0
  90:   e1a06804        mov     r6, r4, lsl #16
  94:   e1866823        orr     r6, r6, r3, lsr #16
  98:   e1a05803        mov     r5, r3, lsl #16
  9c:   e1a04206        mov     r4, r6, lsl #4
  a0:   e1a03205        mov     r3, r5, lsl #4
  a4:   e1844e25        orr     r4, r4, r5, lsr #28
  a8:   e1a00003        mov     r0, r3
  ac:   e1a01004        mov     r1, r4
  b0:   e3a0293d        mov     r2, #999424     ; 0xf4000
  b4:   e0500005        subs    r0, r0, r5
  b8:   e0c11006        sbc     r1, r1, r6
  bc:   e2822d09        add     r2, r2, #576    ; 0x240
  c0:   e3a03000        mov     r3, #0  ; 0x0
  c4:   ebfffffe        bl      0 <__udivdi3>
  c8:   e3500000        cmp     r0, #0  ; 0x0
  cc:   ca000013        bgt     54 <delay+0x48>
  d0:   e89da8f0        ldmia   sp, {r4, r5, r6, r7, fp, sp, pc}
/*
 * delay(n): busy-wait until the timer read through TIMER4VAL() has advanced
 * by approximately n * 983040 / 1000000 ticks.
 *
 * n is presumably a duration in microseconds, with 983040 being the timer
 * frequency in Hz -- TODO confirm against the timer definitions.
 *
 * The scaling is done with a single ARM umull instruction instead of a
 * division.  Returns nothing.  When DEBUG is defined and epclk_sc is still
 * NULL, a message is printed and the function returns without waiting.
 */
void
delay(unsigned int n)
{
        unsigned int cur_tick, initial_tick;
        int remaining;
        /*
         * 983040/1000000 as a 32-bit binary fraction:
         * 0.98304 * 2^32 = 4222124650.66..., truncated.
         */
        u_int32_t scalar = 4222124650UL;

#ifdef DEBUG
        /* Debug builds only: refuse to spin while epclk_sc is still NULL. */
        if (epclk_sc == NULL) {
                printf("delay: called before start epclk\n");
                return;
        }
#endif

        /*
         * Read the counter first, so that the rest of the setup overhead is
         * counted.
         */
        initial_tick = TIMER4VAL();

        /*
         * This is a quick ARM way to multiply by 983040/1000000
         *
         * umull produces the 64-bit unsigned product scalar * n: the low
         * word goes to %0 (n, which is not used again) and the high word,
         * i.e. the product divided by 2^32, goes to %1 (remaining).
         * The "0" constraint places the input n in the same register as
         * output %0; "=&r" (early clobber) keeps remaining out of the
         * registers that hold the inputs.
         *
         * NOTE(review): remaining is a signed int; a high word above INT_MAX
         * would presumably be seen as non-positive and skip the wait loop
         * below -- confirm callers never pass such n.
         */
        __asm volatile ("umull %0, %1, %2, %3;"
                        : "=r"(n), "=&r"(remaining)
                        : "r"((scalar)), "0"(n));

        /*
         * Poll the counter and subtract the ticks elapsed since the previous
         * sample until the budget is used up.  An increasing counter value is
         * treated as normal progress; a smaller value is treated as a wrap of
         * the 32-bit reading, for which the else branch computes the distance
         * across the wrap (ticks up to UINT_MAX, one more to reach 0, then
         * cur_tick).
         */
        while (remaining > 0) {
                cur_tick = TIMER4VAL();
                if (cur_tick >= initial_tick)
                        remaining -= cur_tick - initial_tick;
                else
                        remaining -= UINT_MAX - initial_tick + cur_tick + 1;
                /* Measure the next interval from this sample. */
                initial_tick = cur_tick;
        }
}

0000000c <delay>:
   c:   e52de004        str     lr, [sp, #-4]!
  10:   e3a0220f        mov     r2, #-268435456 ; 0xf0000000
  14:   e2822811        add     r2, r2, #1114112        ; 0x110000
  18:   e5921060        ldr     r1, [r2, #96]
  1c:   e59f303c        ldr     r3, [pc, #60]   ; 60 <.text+0x60>
  20:   e08c0093        umull   r0, ip, r3, r0
  24:   e35c0000        cmp     ip, #0  ; 0x0
  28:   d49df004        ldrle   pc, [sp], #4
  2c:   e1a0e002        mov     lr, r2
  30:   e59e2060        ldr     r2, [lr, #96]
  34:   e1e03001        mvn     r3, r1
  38:   e0823003        add     r3, r2, r3
  3c:   e1520001        cmp     r2, r1
  40:   e0610002        rsb     r0, r1, r2
  44:   e063300c        rsb     r3, r3, ip
  48:   3243c001        subcc   ip, r3, #1      ; 0x1
  4c:   2060c00c        rsbcs   ip, r0, ip
  50:   e35c0000        cmp     ip, #0  ; 0x0
  54:   e1a01002        mov     r1, r2
  58:   ca00000a        bgt     30 <delay+0x24>
  5c:   e49df004        ldr     pc, [sp], #4
  60:   fba8826a        blx     fea20a12 <_KERNEL_OPT_ARM_INTR_IMPL+0x793b03f1>


Home | Main Index | Thread Index | Old Index