Subject: Ethernet address copy performance tweak
To: None <tech-net@netbsd.org>
From: Jason R Thorpe <thorpej@wasabisystems.com>
List: tech-net
Date: 08/17/2002 10:52:18
--rJwd6BRFiFCcLxzm
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

Over the last week or so, Allen Briggs and I have been working on
improving the performance of Gigabit Ethernet on an ARM platform
with very limited memory bandwidth.  In that time, we've managed to
gain a bit over 300% improvement.

A few of the changes we have made have been micro-optimizations.  One
of these optimizations, which erked out another 2-3Mb/s (190Mb/s to
193Mb/s), was to ensure that copies of an Ethernet address in ether_output()
and ether_input() are always inlined.

The previous code was using memcpy() (actually bcopy(), which is a CPP
macro in the kernel) with a constant length.  On the i386, which can
do unaligned access, GCC was already inlining this copy (it ended up
being 3 insns).

However, on the ARM (and this applies to RISCs in general, really), the
copy was NOT being inlined -- instead, a function call was being made,
which meant that there was a 6 cycle penalty (function call and return,
each costing branch + 2 pipeline stalls).  That is 1/2 the cycles needed
to do an unrolled copy (there is also some stupidity in the ARM memcpy(),
but I won't bother with that analysis here).

My solution is ether_copy(), a optimized inline function to copy an Ethernet
address.  It is written in C, but has #ifdef's in it to select which C
implementation to use -- this is because the decision is going to be different
on RISC vs CISC.

The result is a 12 insn block with no data stalls on ARM, which is a
significant improvement.  For i386, the code is no worse.  Alpha, MIPS,
and PowerPC (according to Allen :-) can benefit from the "RISC" version.
There is also a block, #if 0'd, which may be good on some CISCs (like m68k),
although I can't easily test that right now.

This seems like a worthwhile optimization, esp. considering that 10Gig-E
is right around the corner.

If there are no serious objections, I'd like to check in the following
patch (I didn't bother inlining anywhere but the fast path).

-- 
        -- Jason R. Thorpe <thorpej@wasabisystems.com>

--rJwd6BRFiFCcLxzm
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="ether_copy.diff"

Index: if_ether.h
===================================================================
RCS file: /cvsroot/syssrc/sys/net/if_ether.h,v
retrieving revision 1.27
diff -u -r1.27 if_ether.h
--- if_ether.h	2002/03/05 04:13:00	1.27
+++ if_ether.h	2002/08/17 17:35:51
@@ -236,6 +236,50 @@
 }
 
 #ifdef _KERNEL
+/*
+ * ether_copy:
+ *
+ *	Routine to copy an Ethernet address from one place to
+ *	another.  This is a fairly common operation, and worth
+ *	inlining, esp. in the presence of high-speed networks.
+ */
+static __inline void __attribute__((__unused__))
+ether_copy(void *vdst, const void *vsrc)
+{
+	/* ...pretty much any RISC here... */
+#if defined(__alpha__) || defined(__arm__) || defined(__mips__) || \
+    defined(__powerpc__)
+	const unsigned char *src = vsrc;
+	unsigned char *dst = vdst;
+	unsigned char t0, t1, t2, t3;
+
+	t0 = src[0];
+	t1 = src[1];
+	t2 = src[2];
+	t3 = src[3];
+	dst[0] = t0;
+	dst[1] = t1;
+	t0 = src[4];
+	t1 = src[5];
+	dst[2] = t2;
+	dst[3] = t3;
+	dst[4] = t0;
+	dst[5] = t1;
+#elif 0	/* Some CISC's might like this, or a variant of it. */
+	const unsigned char *src = vsrc;
+	unsigned char *dst = vdst;
+
+	*dst++ = *src++;
+	*dst++ = *src++;
+	*dst++ = *src++;
+	*dst++ = *src++;
+	*dst++ = *src++;
+	*dst++ = *src++;
+#else	/* Let GCC try to inline... */
+	memcpy(vdst, vsrc, ETHER_ADDR_LEN);
+#endif
+}
+
 void	ether_ifattach(struct ifnet *, const u_int8_t *);
 void	ether_ifdetach(struct ifnet *);
 
Index: if_ethersubr.c
===================================================================
RCS file: /cvsroot/syssrc/sys/net/if_ethersubr.c,v
retrieving revision 1.95
diff -u -r1.95 if_ethersubr.c
--- if_ethersubr.c	2002/05/18 22:52:44	1.95
+++ if_ethersubr.c	2002/08/17 17:36:01
@@ -252,13 +252,10 @@
 #ifdef INET
 	case AF_INET:
 		if (m->m_flags & M_BCAST)
-                	bcopy((caddr_t)etherbroadcastaddr, (caddr_t)edst,
-				sizeof(edst));
-
+			ether_copy(edst, etherbroadcastaddr);
 		else if (m->m_flags & M_MCAST) {
 			ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr,
 			    (caddr_t)edst)
-
 		} else if (!arpresolve(ifp, rt, m, dst, edst))
 			return (0);	/* if not yet resolved */
 		/* If broadcasting on a simplex interface, loopback a copy */
@@ -270,12 +267,10 @@
 	case AF_ARP:
 		ah = mtod(m, struct arphdr *);
 		if (m->m_flags & M_BCAST)
-                	bcopy((caddr_t)etherbroadcastaddr, (caddr_t)edst,
-				sizeof(edst));
+			ether_copy(edst, etherbroadcastaddr);
 		else
-			bcopy((caddr_t)ar_tha(ah),
-				(caddr_t)edst, sizeof(edst));
-		
+			ether_copy(edst, ar_tha(ah));
+
 		ah->ar_hrd = htons(ARPHRD_ETHER);
 
 		switch(ntohs(ah->ar_op)) {
@@ -341,8 +336,8 @@
 #ifdef NS
 	case AF_NS:
 		etype = htons(ETHERTYPE_NS);
- 		bcopy((caddr_t)&(((struct sockaddr_ns *)dst)->sns_addr.x_host),
-		    (caddr_t)edst, sizeof (edst));
+		ether_copy(edst,
+		    &(((struct sockaddr_ns *)dst)->sns_addr.x_host));
 		if (!bcmp((caddr_t)edst, (caddr_t)&ns_thishost, sizeof(edst)))
 			return (looutput(ifp, m, dst, rt));
 		/* If broadcasting on a simplex interface, loopback a copy */
@@ -353,8 +348,8 @@
 #ifdef IPX
 	case AF_IPX:
 		etype = htons(ETHERTYPE_IPX);
- 		bcopy((caddr_t)&(((struct sockaddr_ipx *)dst)->sipx_addr.x_host),
-		    (caddr_t)edst, sizeof (edst));
+		ether_copy(edst,
+		    &(((struct sockaddr_ipx *)dst)->sipx_addr.x_host));
 		/* If broadcasting on a simplex interface, loopback a copy */
 		if ((m->m_flags & M_BCAST) && (ifp->if_flags & IFF_SIMPLEX))
 			mcopy = m_copy(m, 0, (int)M_COPYALL);
@@ -368,7 +363,7 @@
 
 		if (rt && (sdl = (struct sockaddr_dl *)rt->rt_gateway) &&
 		    sdl->sdl_family == AF_LINK && sdl->sdl_alen > 0) {
-			bcopy(LLADDR(sdl), (caddr_t)edst, sizeof(edst));
+			ether_copy(edst, LLADDR(sdl));
 		} else {
 			error = iso_snparesolve(ifp, (struct sockaddr_iso *)dst,
 						(char *)edst, &snpalen);
@@ -383,10 +378,9 @@
 			M_PREPEND(mcopy, sizeof (*eh), M_DONTWAIT);
 			if (mcopy) {
 				eh = mtod(mcopy, struct ether_header *);
-				bcopy((caddr_t)edst,
-				      (caddr_t)eh->ether_dhost, sizeof (edst));
-				bcopy(LLADDR(ifp->if_sadl),
-				      (caddr_t)eh->ether_shost, sizeof (edst));
+				ether_copy(eh->ether_dhost, edst);
+				ether_copy(eh->ether_shost,
+				    LLADDR(ifp->if_sadl));
 			}
 		}
 		M_PREPEND(m, 3, M_DONTWAIT);
@@ -414,18 +408,17 @@
 
 		if (sdl && sdl->sdl_family == AF_LINK
 		    && sdl->sdl_alen > 0) {
-			bcopy(LLADDR(sdl), (char *)edst,
-				sizeof(edst));
-		} else goto bad; /* Not a link interface ? Funny ... */
+			ether_copy(edst, LLADDR(sdl));
+		} else
+			goto bad; /* Not a link interface ? Funny ... */
 		if ((ifp->if_flags & IFF_SIMPLEX) && (*edst & 1) &&
 		    (mcopy = m_copy(m, 0, (int)M_COPYALL))) {
 			M_PREPEND(mcopy, sizeof (*eh), M_DONTWAIT);
 			if (mcopy) {
 				eh = mtod(mcopy, struct ether_header *);
-				bcopy((caddr_t)edst,
-				      (caddr_t)eh->ether_dhost, sizeof (edst));
-				bcopy(LLADDR(ifp->if_sadl),
-				      (caddr_t)eh->ether_shost, sizeof (edst));
+				ether_copy(eh->ether_dhost, edst);
+				ether_copy(eh->ether_shost,
+				    LLADDR(ifp->if_sadl));
 			}
 		}
 #ifdef LLC_DEBUG
@@ -448,12 +441,12 @@
 	case pseudo_AF_HDRCMPLT:
 		hdrcmplt = 1;
 		eh = (struct ether_header *)dst->sa_data;
-		bcopy((caddr_t)eh->ether_shost, (caddr_t)esrc, sizeof (esrc));
+		ether_copy(esrc, eh->ether_shost);
 		/* FALLTHROUGH */
 
 	case AF_UNSPEC:
 		eh = (struct ether_header *)dst->sa_data;
- 		bcopy((caddr_t)eh->ether_dhost, (caddr_t)edst, sizeof (edst));
+		ether_copy(edst, eh->ether_dhost);
 		/* AF_UNSPEC doesn't swap the byte order of the ether_type. */
 		etype = eh->ether_type;
 		break;
@@ -481,13 +474,11 @@
 	eh = mtod(m, struct ether_header *);
 	bcopy((caddr_t)&etype,(caddr_t)&eh->ether_type,
 		sizeof(eh->ether_type));
- 	bcopy((caddr_t)edst, (caddr_t)eh->ether_dhost, sizeof (edst));
+	ether_copy(eh->ether_dhost, edst);
 	if (hdrcmplt)
-		bcopy((caddr_t)esrc, (caddr_t)eh->ether_shost,
-		    sizeof(eh->ether_shost));
+		ether_copy(eh->ether_shost, esrc);
 	else
-	 	bcopy(LLADDR(ifp->if_sadl), (caddr_t)eh->ether_shost,
-		    sizeof(eh->ether_shost));
+		ether_copy(eh->ether_shost, LLADDR(ifp->if_sadl));
 
 #ifdef PFIL_HOOKS
 	if ((error = pfil_run_hooks(&ifp->if_pfil, &m, ifp, PFIL_OUT)) != 0)
@@ -950,8 +941,8 @@
 				l->llc_dsap = l->llc_ssap;
 				l->llc_ssap = c;
 				if (m->m_flags & (M_BCAST | M_MCAST))
-					bcopy(LLADDR(ifp->if_sadl),
-					      (caddr_t)eh->ether_dhost, 6);
+					ether_copy(eh->ether_dhost,
+					    LLADDR(ifp->if_sadl));
 				sa.sa_family = AF_UNSPEC;
 				sa.sa_len = sizeof(sa);
 				eh2 = (struct ether_header *)sa.sa_data;
@@ -1282,8 +1273,8 @@
 		splx(s);
 		return (ENOBUFS);
 	}
-	bcopy(addrlo, enm->enm_addrlo, 6);
-	bcopy(addrhi, enm->enm_addrhi, 6);
+	bcopy(addrlo, enm->enm_addrlo, ETHER_ADDR_LEN);
+	bcopy(addrhi, enm->enm_addrhi, ETHER_ADDR_LEN);
 	enm->enm_ec = ec;
 	enm->enm_refcount = 1;
 	LIST_INSERT_HEAD(&ec->ec_multiaddrs, enm, enm_list);

--rJwd6BRFiFCcLxzm--