Subject: New in_cksum/in4_cksum implementation
To: None <port-arm@netbsd.org>
From: Steve Woodford <scw@wasabisystems.com>
List: port-arm
Date: 09/11/2003 09:17:27
--Boundary-00=_X+CY/iV8AJzbxiI
Content-Type: text/plain;
  charset="us-ascii"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline

Hi folks,

I've been doing some Xscale optimisation work recently for Wasabi, 
part of which involved rewriting in_cksum/in4_cksum in assembly. 
While the resulting code is hand-crafted for Xscale, I've added the 
necessary tweaks to support vanilla ARM CPUs too. Thanks to Chris 
Gilbert for useful feedback on that side of things.
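
For anyone who wants to sanity-check the assembly against something 
simpler, here is a minimal C sketch of the 16-bit one's-complement 
sum (per RFC 1071) that both routines have to reproduce. It is purely 
illustrative (the function name is made up, and the real code walks 
an mbuf chain, summing in native word order and byte-swapping 
odd-offset partial sums), but the folding and final complement are 
the same idea:

#include <stddef.h>
#include <stdint.h>

/* Illustrative only: one's-complement sum over a flat buffer. */
static uint16_t
cksum_sketch(const uint8_t *buf, size_t len)
{
	uint32_t sum = 0;

	while (len > 1) {			/* sum 16-bit words */
		sum += ((uint32_t)buf[0] << 8) | buf[1];
		buf += 2;
		len -= 2;
	}
	if (len != 0)				/* trailing odd byte */
		sum += (uint32_t)buf[0] << 8;
	while ((sum >> 16) != 0)		/* fold carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;			/* one's complement */
}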

Benchmark tests with a gigabit Ethernet card (using 
pkgsrc/benchmarks/nttcp) show between 7% and 29% improvement in 
throughput over the old code, depending on data size. I don't have 
figures for regular ARM CPUs, since I don't have an ARM board with 
fast enough Ethernet, but I'd still expect to see some improvement 
there.

Wasabi would like to contribute this code back to NetBSD. If there are 
no objections, I'd like to commit the attached code to the NetBSD 
tree ASAP. I'd also like to see some figures from non-Xscale machines 
with decent Ethernet. :)

Comments?

Cheers, Steve

-- 

Wasabi Systems Inc. - The NetBSD Company - 
http://www.wasabisystems.com/


--Boundary-00=_X+CY/iV8AJzbxiI
Content-Type: text/x-csrc;
  charset="us-ascii";
  name="in_cksum_arm.S"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
	filename="in_cksum_arm.S"

/*	$NetBSD$	*/

/*
 * Copyright 2003 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Steve C. Woodford for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Hand-optimised in_cksum() and in4_cksum() implementations for ARM/Xscale
 */

#include "opt_inet.h"

#include <machine/asm.h>
#include "assym.h"


/*
 * int in_cksum(struct mbuf *m, int len)
 *
 * Entry:
 *	r0	m
 *	r1	len
 *
 * NOTE: Assumes 'm' is *never* NULL.
 */
/* LINTSTUB: Func: int in_cksum(struct mbuf *, int) */
ENTRY(in_cksum)
	stmfd	sp!, {r4-r11,lr}
	mov	r8, #0x00		/* Accumulate the sum in r8 */
	mov	r9, r1			/* r9 = remaining packet length */
	mov	r10, #0x00		/* r10 = running packet offset */
	mov	ip, r0			/* ip = current mbuf */

.Lin_cksum_loop:
	ldr	r1, [ip, #(M_LEN)]	/* r1 = length of this mbuf's data */
	ldr	r0, [ip, #(M_DATA)]	/* r0 = start of this mbuf's data */
	ldr	ip, [ip, #(M_NEXT)]	/* ip = next mbuf in the chain */
.Lin_cksum_entry4:
	cmp	r9, r1			/* Is the mbuf longer than we need? */
	movlt	r1, r9			/* Yes; clamp to what's left */
	sub	r9, r9, r1		/* Update remaining packet length */
	eor	r11, r10, r0		/* Packet-offset/address parity */
	add	r10, r10, r1		/* Update running packet offset */
	adds	r2, r1, #0x00		/* Clear carry; Z set if len == 0 */
	blne	_ASM_LABEL(L_cksumdata)	/* Sum this buffer into r2 */
	tst	r11, #0x01		/* Partial sum out of phase? */
	movne	r2, r2, ror #8		/* Yes; byte-swap it first */
	adds	r8, r8, r2		/* Accumulate, with ... */
	adc	r8, r8, #0x00		/* ... end-around carry */
	cmp	ip, #0x00		/* More mbufs in the chain? */
	bne	.Lin_cksum_loop

	/* Fold the 32-bit accumulator down to 16 bits and complement it */
	mov	r1, #0xff		/* r1 = 0x0000ffff */
	orr	r1, r1, #0xff00
	and	r0, r8, r1		/* Add the high and low halfwords */
	add	r0, r0, r8, lsr #16
	add	r0, r0, r0, lsr #16	/* Fold in any carry from that add */
	and	r0, r0, r1
	eor	r0, r0, r1		/* Return the one's complement */
	ldmfd	sp!, {r4-r11,pc}


#ifdef INET
/*
 * int in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len)
 *
 * Entry:
 *	r0	m
 *	r1	nxt
 *	r2	off
 *	r3	len
 */
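/*
 * The pseudo header is the usual IPv4 one (ip_src, ip_dst, the protocol
 * in 'nxt' and the payload length in 'len'). It is summed first; the
 * payload starting at 'off' is then fed through in_cksum's main loop
 * and the shared L_cksumdata code below.
 */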
/* LINTSTUB: Func: int in4_cksum(struct mbuf *, u_int8_t, int, int) */
ENTRY(in4_cksum)
	stmfd	sp!, {r4-r11,lr}
	mov	r8, #0x00		/* Accumulate sum in r8 */

	/*
	 * First, deal with a pseudo header, if present
	 */
	ldr	r6, [r0, #(M_DATA)]	/* r6 = start of the IP header */
	cmp	r1, #0x00		/* nxt == 0 means no pseudo header */
	beq	.Lin4_cksum_skip_entry
	add	r8, r1, r3		/* sum = nxt + len */

	add	r1, r6, #(IP_SRC)
	tst	r1, #0x03		/* Data 32-bit aligned? */
	ldreq	r5, [r6, #(IP_SRC)]	/* Yes; fetch src/dst word-wise */
	ldreq	r4, [r6, #(IP_DST)]
	beq	.Lin4_cksum_add_ips

/*
 * It would be nice to use this for more than just Xscale, but not all
 * ARM cpus (and acorn32's bus) can deal with half-word loads.
 */
#ifdef __XSCALE__
	tst	r1, #0x01		/* Data 16-bit aligned? */
	ldreqh	r5, [r6, #(IP_SRC)]
	ldreqh	r7, [r6, #(IP_DST + 2)]
	ldreq	r4, [r6, #(IP_SRC + 2)]
	orreq	r5, r5, r7, lsl #16
	beq	.Lin4_cksum_add_ips
#endif

	/* Data is aligned to an odd address. Bummer. Do it the slow way. */
	ldrb	r4, [r6, #(IP_SRC + 0)]
	ldrb	r1, [r6, #(IP_SRC + 1)]
	ldrb	r7, [r6, #(IP_SRC + 2)]
	ldrb	r9, [r6, #(IP_SRC + 3)]
#ifndef __ARMEB__
	orr	r4, r4, r1, lsl #8	/* ..10 */
	orr	r4, r4, r7, lsl #16	/* .210 */
	orr	r4, r4, r9, lsl #24	/* 3210 */
#else
	orr	r4, r9, r4, lsl #24	/* 0..3 */
	orr	r4, r4, r1, lsl #16	/* 01.3 */
	orr	r4, r4, r7, lsl #8	/* 0123 */
#endif
	ldrb	r5, [r6, #(IP_DST + 0)]
	ldrb	r1, [r6, #(IP_DST + 1)]
	ldrb	r7, [r6, #(IP_DST + 2)]
	ldrb	r9, [r6, #(IP_DST + 3)]
#ifndef __ARMEB__
	orr	r5, r5, r1, lsl #8	/* ..10 */
	orr	r5, r5, r7, lsl #16	/* .210 */
	orr	r5, r5, r9, lsl #24	/* 3210 */
#else
	orr	r5, r9, r5, lsl #24	/* 0..3 */
	orr	r5, r5, r1, lsl #16	/* 01.3 */
	orr	r5, r5, r7, lsl #8	/* 0123 */
#endif

.Lin4_cksum_add_ips:
	adds	r5, r5, r4		/* Add the source and dest addresses */
#ifndef __ARMEB__
	adcs	r8, r5, r8, lsl #8	/* Fold in nxt+len (byte-swapped, LE) */
#else
	adcs	r8, r5, r8		/* Fold in nxt+len */
#endif
	adc	r8, r8, #0x00		/* Pick up the final carry */
	mov	r1, #0x00		/* Prime the skip loop below */
	b	.Lin4_cksum_skip_entry

.Lin4_cksum_skip_loop:
	ldr	r1, [r0, #(M_LEN)]	/* r1 = length of this mbuf's data */
	ldr	r6, [r0, #(M_DATA)]	/* r6 = start of this mbuf's data */
	ldr	r0, [r0, #(M_NEXT)]	/* r0 = next mbuf in the chain */
.Lin4_cksum_skip_entry:
	subs	r2, r2, r1		/* Does 'off' end in this mbuf? */
	blt	.Lin4_cksum_skip_done	/* Yes */
	cmp	r0, #0x00		/* No. Any more mbufs? */
	bne	.Lin4_cksum_skip_loop
	b	.Lin4_cksum_whoops	/* Chain too short for 'off' */

.Lin4_cksum_skip_done:
	mov	ip, r0			/* ip = next mbuf, for the main loop */
	add	r0, r2, r6		/* r0 = data pointer at offset 'off' */
	add	r0, r0, r1
	rsb	r1, r2, #0x00		/* r1 = bytes left in this mbuf */
	mov	r9, r3			/* r9 = remaining packet length */
	mov	r10, #0x00		/* Initial packet offset is zero */
	b	.Lin_cksum_entry4

.Lin4_cksum_whoops:
	adr	r0, .Lin4_cksum_whoops_str
	bl	_C_LABEL(panic)
.Lin4_cksum_whoops_str:
	.asciz	"in4_cksum: out of mbufs\n"
	.align	5
#endif	/* INET */


/*
 * The main in*_cksum() workhorse...
 *
 * Entry parameters:
 *	r0	Pointer to buffer
 *	r1	Buffer length
 *	lr	Return address
 *
 * Returns:
 *	r2	Accumulated 32-bit sum
 *
 * Clobbers:
 *	r0-r7
 */
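/*
 * Overall strategy: word-align (and, on Xscale, quad-align) the buffer,
 * summing any leading bytes as we go, then run the 64-bytes-per-iteration
 * main loop, mop up a possible 32-byte chunk, and finally use a computed
 * jump to sum the remaining 8-byte blocks, one last word and any trailing
 * 1-3 bytes.
 */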
/* LINTSTUB: Ignore */
ASENTRY_NP(L_cksumdata)
#ifdef __XSCALE__
	pld	[r0]			/* Pre-fetch the start of the buffer */
#endif
	mov	r2, #0
	mov	r3, #0

	/* We first have to word-align the buffer.  */
	ands	r7, r0, #0x03
	beq	.Lcksumdata_wordaligned
	rsb	r7, r7, #0x04
	cmp	r1, r7			/* Enough bytes left to make it? */
	blt	.Lcksumdata_endgame
	cmp	r7, #0x02
	ldrb	r4, [r0], #0x01		/* Fetch 1st byte */
	ldrgeb	r5, [r0], #0x01		/* Fetch 2nd byte */
	movlt	r5, #0x00
	ldrgtb	r6, [r0], #0x01		/* Fetch 3rd byte */
	movle	r6, #0x00
	/* Combine the three bytes depending on endianness and alignment */
#ifdef __ARMEB__
	orreq	r2, r5, r4, lsl #8
	orreq	r2, r2, r6, lsl #24
	orrne	r2, r4, r5, lsl #8
	orrne	r2, r2, r6, lsl #16
#else
	orreq	r2, r4, r5, lsl #8
	orreq	r2, r2, r6, lsl #16
	orrne	r2, r5, r4, lsl #8
	orrne	r2, r2, r6, lsl #24
#endif
	subs	r1, r1, r7		/* Update length */
	moveq	pc, lr			/* All done? */

	/* Buffer is now word aligned */
.Lcksumdata_wordaligned:

#ifdef __XSCALE__
	cmp	r1, #0x04		/* Less than 4 bytes left? */
	blt	.Lcksumdata_endgame	/* Yup */

	tst	r0, #0x04		/* Now try to quad-align */
	subne	r1, r1, #0x04		/* Not quad-aligned; consume a word */
	ldrne	r3, [r0], #0x04		/* (r3 is summed at bigloop_end) */
	subs	r1, r1, #0x40		/* At least 64 bytes left? */
	blt	.Lcksumdata_bigloop_end	/* Nope */

	/*
	 * Buffer is now quad aligned. Sum 64 bytes at a time.
	 * Note: First ldrd is hoisted above the loop so that
	 * subsequent iterations avoid an additional one-cycle
	 * result penalty.
	 */
	ldrd	r4, [r0], #0x08
.Lcksumdata_bigloop:
	pld	[r0, #0x18]
	ldrd	r6, [r0], #0x08
	adds	r2, r2, r4
	adcs	r2, r2, r5
	ldrd	r4, [r0], #0x08
	adcs	r2, r2, r6
	adcs	r2, r2, r7
	ldrd	r6, [r0], #0x08
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldrd	r4, [r0], #0x08
	adcs	r2, r2, r6
	adcs	r2, r2, r7
	pld	[r0, #0x18]
	ldrd	r6, [r0], #0x08
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldrd	r4, [r0], #0x08
	adcs	r2, r2, r6
	adcs	r2, r2, r7
	ldrd	r6, [r0], #0x08
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, r6		/* XXX: 1-cycle result penalty :-/ */
	adcs	r2, r2, r7
	adc	r2, r2, #0x00
	subs	r1, r1, #0x40
	ldrged	r4, [r0], #0x08
	bge	.Lcksumdata_bigloop

#else	/* !__XSCALE__ */

	subs	r1, r1, #0x40
	blt	.Lcksumdata_bigloop_end

.Lcksumdata_bigloop:
	ldmia	r0!, {r4, r5, r6, r7}
	adds	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, r6
	adcs	r2, r2, r7
	ldmia	r0!, {r4, r5, r6, r7}
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, r6
	adcs	r2, r2, r7
	ldmia	r0!, {r4, r5, r6, r7}
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, r6
	adcs	r2, r2, r7
	ldmia	r0!, {r4, r5, r6, r7}
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, r6
	adcs	r2, r2, r7
	adc	r2, r2, #0x00
	subs	r1, r1, #0x40
	bge	.Lcksumdata_bigloop
#endif

.Lcksumdata_bigloop_end:
	adds	r2, r2, r3		/* Fold in the alignment word, if any */
	adc	r2, r2, #0x00
	adds	r1, r1, #0x40		/* Undo the length bias from above */
	moveq	pc, lr			/* All done? */
	cmp	r1, #0x20		/* At least 32 bytes left? */
	blt	.Lcksumdata_less_than_32

#ifdef __XSCALE__
	ldrd	r4, [r0], #0x08
	pld	[r0, #0x18]
	ldrd	r6, [r0], #0x08
	adds	r2, r2, r4
	adcs	r2, r2, r5
	ldrd	r4, [r0], #0x08
	adcs	r2, r2, r6
	adcs	r2, r2, r7
	ldrd	r6, [r0], #0x08
#else
	ldmia	r0!, {r4, r5, r6, r7}
	adds	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, r6
	adcs	r2, r2, r7
	ldmia	r0!, {r4, r5, r6, r7}
#endif
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, r6
	adcs	r2, r2, r7
	adc	r2, r2, #0x00
	subs	r1, r1, #0x20
	moveq	pc, lr

.Lcksumdata_less_than_32:
	/* There are less than 32 bytes left */
	and	r3, r1, #0x18		/* r3 = length & 0x18 (0, 8, 16, 24) */
	rsb	r4, r3, #0x18		/* r4 = data bytes covered by the
					 * blocks below we must skip */
	sub	r1, r1, r3		/* r1 = length & 0x07 */
	adds	r4, r4, r4, lsr #1	/* Scale: 8 data bytes = 12 code bytes.
					 * Side effect: Clear carry flag */
	add	pc, pc, r4		/* Jump over the blocks we don't need */
	nop				/* Padding for the pc+8 read-ahead */

/*
 * Note: We use ldm here, even on Xscale, since the combined issue/result
 * latencies for ldm and ldrd are the same. Using ldm avoids needless #ifdefs.
 */
	/* At least 24 bytes remaining... */
	ldmia	r0!, {r4, r5}
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* At least 16 bytes remaining... */
	ldmia	r0!, {r4, r5}
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* At least 8 bytes remaining... */
	ldmia	r0!, {r4, r5}
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* Less than 8 bytes remaining... */
	adc	r2, r2, #0x00		/* Fold in the carry from above */
	subs	r1, r1, #0x04		/* At least one whole word left? */
	blt	.Lcksumdata_lessthan4

	ldr	r4, [r0], #0x04		/* Yes; sum it */
	sub	r1, r1, #0x04		/* Compensate for the 'adds' below */
	adds	r2, r2, r4
	adc	r2, r2, #0x00

	/* Deal with < 4 bytes remaining */
.Lcksumdata_lessthan4:
	adds	r1, r1, #0x04
	moveq	pc, lr

	/* Deal with 1 to 3 remaining bytes, possibly misaligned */
.Lcksumdata_endgame:
	ldrb	r3, [r0]		/* Fetch first byte */
	cmp	r1, #0x02
	ldrgeb	r4, [r0, #0x01]		/* Fetch 2nd and 3rd as necessary */
	movlt	r4, #0x00
	ldrgtb	r5, [r0, #0x02]
	movle	r5, #0x00
	/* Combine the three bytes depending on endianness and alignment */
	tst	r0, #0x01
#ifdef __ARMEB__
	orreq	r3, r4, r3, lsl #8
	orreq	r3, r3, r5, lsl #24
	orrne	r3, r3, r4, lsl #8
	orrne	r3, r3, r5, lsl #16
#else
	orreq	r3, r3, r4, lsl #8
	orreq	r3, r3, r5, lsl #16
	orrne	r3, r4, r3, lsl #8
	orrne	r3, r3, r5, lsl #24
#endif
	adds	r2, r2, r3
	adc	r2, r2, #0x00
	mov	pc, lr

--Boundary-00=_X+CY/iV8AJzbxiI--