Subject: Re: FDDI on NetBSD/alpha: it runs!
To: Chris G Demetriou <Chris_G_Demetriou@ux2.sp.cs.cmu.edu>
From: Matt Thomas <matt@lkg.dec.com>
List: port-alpha
Date: 05/17/1996 22:48:43
This is a multipart MIME message.

--===_0_Fri_May_17_22:47:21__1996
Content-Type: text/plain; charset=us-ascii


In  <12772.832297223@UX2.SP.CS.CMU.EDU> , you wrote:

> > How fast is it?  Remember this is not compiled optimized.
> > Now without too much tuning I got ~60Mb/s.
> 
> How does Digital UNIX (Formerly DEC OSF/1 8-) do on similar hardware?

Digital UNIX can easily saturate an FDDI.  Once I recompiled the network
pieces -O2 and wrote a better (though not "good") in_cksum, performance jumped from
~55Mb/s to ~80Mb/s.  However, that's at 100% cpu utilization on the
sender and ~60% on the receiver (regardless whether either is the alpha
or the intel).  

> I'd be interested in seeing what causes any slowdown that might be
> seen...  Kernel profiling works, as far as I know, but the various
> assembly functions don't have proper profiling hooks...
> 
> "Oh, for an optimized in_cksum()!"  8-)

If gas had a brain and could do decent scheduling, my new in_cksum
probably would do better.  Using RPCC while running this in a test
environment under Digital UNIX shows my new in_cksum is about 50% faster
than the portable one.


-- 
Matt Thomas               Internet:   matt@3am-software.com
3am Software Foundry      WWW URL:    http://www.3am-software.com/bio/matt.html
Westford, MA              Disclaimer: I disavow all knowledge of this message


--===_0_Fri_May_17_22:47:21__1996
Content-Type: text/plain
Content-Description: in_cksum.c

/*	$NetBSD: in_cksum.c,v 1.11 1996/04/08 19:55:37 jonathan Exp $	*/

/*
 * Copyright (c) 1988, 1992, 1993
 *	The Regents of the University of California.  All rights reserved.
 * Copyright (c) 1996
 *	Matt Thomas <matt@3am-software.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)in_cksum.c	8.1 (Berkeley) 6/10/93
 */

#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/systm.h>
#include <netinet/in.h>

/*
 * Checksum routine for Internet Protocol family headers
 *    (Portable Alpha version).
 *
 * This routine is very heavily used in the network
 * code and should be modified for each CPU to be as fast as possible.
 */

/*
 * Folding helpers for the checksum accumulator.
 *
 * ADDCARRY(x)	If x exceeds 16 bits, subtract 65535 (an end-around carry).
 *		x must be a modifiable lvalue; it is evaluated more than once.
 *
 * REDUCE32	Replace `sum' with the sum of its two 32-bit halves.
 *		Needs a 64-bit `sum' and a `union q_util q_util' in scope.
 *		The halves are added in the 64-bit accumulator itself:
 *		adding two u_int32_t operands directly would be done in
 *		32-bit arithmetic and silently drop the carry out of bit 31.
 *
 * REDUCE16	Fold `sum' down to 16 bits: add its four 16-bit pieces,
 *		add the two halves of that result, then apply ADDCARRY.
 *		Needs `sum', `q_util' and a `union l_util l_util' in scope.
 *
 * The statement macros are wrapped in do { } while (0) so that
 * "REDUCE32;" behaves as a single statement, including directly under
 * an un-braced if/else.
 */
#define ADDCARRY(x)  ((x) > 65535 ? ((x) -= 65535) : (x))
#define REDUCE32							\
	do {								\
		q_util.q = sum;						\
		sum = q_util.l[0];					\
		sum += q_util.l[1];					\
	} while (0)
#define REDUCE16							\
	do {								\
		q_util.q = sum;						\
		l_util.l = q_util.s[0] + q_util.s[1]			\
			 + q_util.s[2] + q_util.s[3];			\
		sum = l_util.s[0] + l_util.s[1];			\
		ADDCARRY(sum);						\
	} while (0)

/*
 * Byte-select masks for a 32-bit word that is only partly inside the
 * buffer.  The table is four rows of four entries and is indexed as
 *
 *	in_masks[(offset << 2) + nbytes]
 *
 * where `offset' is the position (0-3) of the first wanted byte inside
 * the word and `nbytes' is how many bytes are wanted, capped at 3.
 * Byte 0 is the low-order byte of the word, i.e. the masks match a
 * little-endian layout.  A row saturates once it reaches the top byte
 * of the word.
 */
static const u_int32_t in_masks[] = {
	/* offset 0:  0 bytes,    1 byte,     2 bytes,    3 bytes */
	0x00000000, 0x000000ff, 0x0000ffff, 0x00ffffff,
	/* offset 1 */
	0x00000000, 0x0000ff00, 0x00ffff00, 0xffffff00,
	/* offset 2: at most two bytes remain in the word */
	0x00000000, 0x00ff0000, 0xffff0000, 0xffff0000,
	/* offset 3: at most one byte remains in the word */
	0x00000000, 0xff000000, 0xff000000, 0xff000000,
};

/*
 * Overlay of one 32-bit value with its two 16-bit halves.
 * REDUCE16 stores an intermediate sum through `l' and then adds the
 * two halves read back through `s'.
 */
union l_util {
	u_int16_t s[2];		/* the value as two 16-bit pieces */
	u_int32_t l;		/* the value as a single 32-bit word */
};
/*
 * Overlay of the 64-bit checksum accumulator with its 32-bit and
 * 16-bit pieces.  REDUCE32 and REDUCE16 store the running sum through
 * `q' and read the pieces back through `l' or `s'; every piece is
 * added, so the order of the pieces in memory does not matter to them.
 */
union q_util {
	u_int16_t s[4];		/* the value as four 16-bit pieces */
	u_int32_t l[2];		/* the value as two 32-bit pieces */
	u_int64_t q;		/* the value as a single 64-bit word */
};

/*
 * Add up the data in [buf, buf + len) as 32-bit words and return the
 * total after folding it with REDUCE32.
 *
 *	buf	start of the data; any byte alignment is accepted.
 *	len	number of bytes to sum; zero or a negative value yields 0.
 *
 * Memory is only ever read as whole 4-byte-aligned words.  A partial
 * word at either end of the buffer is read in full and the bytes that
 * lie outside the buffer are cleared with in_masks, so every byte
 * contributes at the position it occupies inside its aligned word.
 * in_cksum() shifts the value returned here by 8 bits when the parity
 * of the data address disagrees with the parity of the stream offset.
 */
u_int64_t
in_cksumdata(buf, len)
	register caddr_t buf;
	register int len;
{
	const u_int32_t *lw = (u_int32_t *) buf;
	u_int64_t sum = 0;
	u_int64_t prefilled;
	int offset;
	union q_util q_util;		/* scratch space for REDUCE32 */

	/*
	 * Fast path: a word-aligned block of exactly 20 bytes is summed
	 * straight-line (presumably an IP header without options).
	 */
	if ((3 & (long) lw) == 0 && len == 20) {
	     sum = (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3] + lw[4];
	     REDUCE32;
	     return sum;
	}

	/*
	 * Nothing to sum.  Returning here keeps the code below from
	 * loading a word for an empty buffer and from indexing in_masks
	 * with a negative length.
	 */
	if (len <= 0)
		return 0;

	/*
	 * Unaligned start: step back to the enclosing aligned word, load
	 * it, and keep only the bytes that belong to the buffer.
	 */
	if ((offset = 3 & (long) lw) != 0) {
		/* row of in_masks for this starting offset */
		const u_int32_t *masks = in_masks + (offset << 2);
		lw = (u_int32_t *) (((long) lw) - offset);
		sum = *lw++ & masks[len >= 3 ? 3 : len];
		len -= 4 - offset;
		if (len <= 0) {
			/* the whole buffer fit inside that first word */
			REDUCE32;
			return sum;
		}
	}
#if 0
	/*
	 * Force to cache line boundary.
	 */
	offset = 32 - (0x1f & (long) lw);
	if (offset < 32 && len > offset) {
		len -= offset;
		if (4 & offset)
			sum += (u_int64_t) lw[0], lw += 1;
		if (8 & offset)
			sum += (u_int64_t) lw[0] + lw[1], lw += 2;
		if (16 & offset)
			sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3], lw += 4;
	}
#endif
	/*
	 * Main loop, 32 bytes (8 words) per iteration.  The first word of
	 * the following 32-byte chunk (lw[8]) is loaded before the current
	 * chunk is added, to start the load of the next cache line early;
	 * that word is carried into the next iteration in `prefilled'.
	 * The ">= 4" test guarantees lw[8] is still entirely inside the
	 * buffer.
	 */
	prefilled = lw[0];
	while ((len -= 32) >= 4) {
		u_int64_t prefilling = lw[8];
		sum += prefilled + lw[1] + lw[2] + lw[3]
			+ lw[4] + lw[5] + lw[6] + lw[7];
		lw += 8;
		prefilled = prefilling;
	}
	if (len >= 0) {
		/* one last full 32-byte chunk, with 0-3 bytes after it */
		sum += prefilled + lw[1] + lw[2] + lw[3]
			+ lw[4] + lw[5] + lw[6] + lw[7];
		lw += 8;
	} else {
		/* fewer than 32 bytes left: undo the loop's subtraction */
		len += 32;
	}
	/* remaining 16-byte groups */
	while ((len -= 16) >= 0) {
		sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
		lw += 4;
	}
	len += 16;
	/* remaining whole words */
	while ((len -= 4) >= 0) {
		sum += (u_int64_t) *lw++;
	}
	len += 4;
	/* trailing 1-3 bytes: load the word and mask off what is past the end */
	if (len > 0)
		sum += (u_int64_t) (in_masks[len] & *lw);
	REDUCE32;
	return sum;
}

int
in_cksum(m, len)
	register struct mbuf *m;
	register int len;
{
	register u_int64_t sum = 0;
	register int mlen = 0;
	register int clen = 0;
	register caddr_t addr;
	union q_util q_util;
	union l_util l_util;

	for (;m && len; m = m->m_next) {
		if (m->m_len == 0)
			continue;
		mlen = m->m_len;
		if (len < mlen)
			mlen = len;
		addr = mtod(m, caddr_t);
		if ((clen ^ (long) addr) & 1)
		    sum += in_cksumdata(addr, mlen) << 8;
		else
		    sum += in_cksumdata(addr, mlen);

		clen += mlen;
		len -= mlen;
	}
	REDUCE16;
	return (~sum & 0xffff);
}

--===_0_Fri_May_17_22:47:21__1996--