Subject: Re: Ethernet address copy performance tweak
To: None <tech-net@netbsd.org>
From: Jason R Thorpe <thorpej@wasabisystems.com>
List: tech-net
Date: 08/17/2002 16:12:10
--hQiwHBbRI9kgIhsi
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

On Sat, Aug 17, 2002 at 10:52:18AM -0700, Jason R Thorpe wrote:

 > A few of the changes we have made have been micro-optimizations.  One
 > of these optimizations, which erked out another 2-3Mb/s (190Mb/s to
 > 193Mb/s), was to ensure that copies of an Ethernet address in ether_output()
 > and ether_input() are always inlined.

I'd like to follow up on this...

This change has a bit more impact now that we've fixed other performance
issues on the platform in question.  ether_copy() is now responsible for
about a 6Mb/s improvement (205Mb/s to 211Mb/s)[*].

I've also made an additional change which should shave off a bunch more
cycles.  How much difference it actually makes, I can't say for sure right
now, since I'm traveling and away from my test systems; I'll need to get
Allen to benchmark this change later.

The additional change is to force the copy of the ethertype into the
packet to be inlined.  This is a two byte copy.  On platforms with
no strict alignment constraints (e.g. i386), GCC happily inlines this
for us.  On ARM (and other RISCs), it does not, and makes a function call.
Again, the function call/return overhead is 6 cycles, plus whatever memcpy()
does internally.  Inlining this 2 byte copy is 3 cycles on ARM, and actually
is fewer instructions than the old memcpy() call site (no register shuffling
to get args in the right place).

Attached is an updated diff -- I also went ahead and just changed all the
bcopy() calls to memcpy().

[*] We actually have it going at about 218Mb/s now (not including the change
to inline the ethertype copy).  This is 28Mb/s faster than another Unix-like
OS can go on the same hardware.  And, FWIW, the same Ethernet interface (Intel
i82544EI) in an Athlon or P4 system can go > 900Mb/s.

-- 
        -- Jason R. Thorpe <thorpej@wasabisystems.com>

--hQiwHBbRI9kgIhsi
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="ether_copy.diff"

Index: if_ether.h
===================================================================
RCS file: /cvsroot/syssrc/sys/net/if_ether.h,v
retrieving revision 1.27
diff -u -r1.27 if_ether.h
--- if_ether.h	2002/03/05 04:13:00	1.27
+++ if_ether.h	2002/08/17 22:51:52
@@ -236,6 +236,50 @@
 }
 
 #ifdef _KERNEL
+/*
+ * ether_copy:
+ *
+ *	Routine to copy an Ethernet address from one place to
+ *	another.  This is a fairly common operation, and worth
+ *	inlining, esp. in the presence of high-speed networks.
+ */
+static __inline void __attribute__((__unused__))
+ether_copy(void *vdst, const void *vsrc)
+{
+	/* ...pretty much any RISC here... */
+#if defined(__alpha__) || defined(__arm__) || defined(__mips__) || \
+    defined(__powerpc__)
+	const unsigned char *src = vsrc;
+	unsigned char *dst = vdst;
+	unsigned char t0, t1, t2, t3;
+
+	t0 = src[0];
+	t1 = src[1];
+	t2 = src[2];
+	t3 = src[3];
+	dst[0] = t0;
+	dst[1] = t1;
+	t0 = src[4];
+	t1 = src[5];
+	dst[2] = t2;
+	dst[3] = t3;
+	dst[4] = t0;
+	dst[5] = t1;
+#elif 0	/* Some CISC's might like this, or a variant of it. */
+	const unsigned char *src = vsrc;
+	unsigned char *dst = vdst;
+
+	*dst++ = *src++;
+	*dst++ = *src++;
+	*dst++ = *src++;
+	*dst++ = *src++;
+	*dst++ = *src++;
+	*dst++ = *src++;
+#else	/* Let GCC try to inline... */
+	memcpy(vdst, vsrc, ETHER_ADDR_LEN);
+#endif
+}
+
 void	ether_ifattach(struct ifnet *, const u_int8_t *);
 void	ether_ifdetach(struct ifnet *);
 
Index: if_ethersubr.c
===================================================================
RCS file: /cvsroot/syssrc/sys/net/if_ethersubr.c,v
retrieving revision 1.95
diff -u -r1.95 if_ethersubr.c
--- if_ethersubr.c	2002/05/18 22:52:44	1.95
+++ if_ethersubr.c	2002/08/17 22:52:02
@@ -252,13 +252,10 @@
 #ifdef INET
 	case AF_INET:
 		if (m->m_flags & M_BCAST)
-                	bcopy((caddr_t)etherbroadcastaddr, (caddr_t)edst,
-				sizeof(edst));
-
+			ether_copy(edst, etherbroadcastaddr);
 		else if (m->m_flags & M_MCAST) {
 			ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr,
 			    (caddr_t)edst)
-
 		} else if (!arpresolve(ifp, rt, m, dst, edst))
 			return (0);	/* if not yet resolved */
 		/* If broadcasting on a simplex interface, loopback a copy */
@@ -270,12 +267,10 @@
 	case AF_ARP:
 		ah = mtod(m, struct arphdr *);
 		if (m->m_flags & M_BCAST)
-                	bcopy((caddr_t)etherbroadcastaddr, (caddr_t)edst,
-				sizeof(edst));
+			ether_copy(edst, etherbroadcastaddr);
 		else
-			bcopy((caddr_t)ar_tha(ah),
-				(caddr_t)edst, sizeof(edst));
-		
+			ether_copy(edst, ar_tha(ah));
+
 		ah->ar_hrd = htons(ARPHRD_ETHER);
 
 		switch(ntohs(ah->ar_op)) {
@@ -329,10 +324,10 @@
 			M_PREPEND(m, sizeof(struct llc), M_DONTWAIT);
 			llc.llc_dsap = llc.llc_ssap = LLC_SNAP_LSAP;
 			llc.llc_control = LLC_UI;
-			bcopy(at_org_code, llc.llc_snap_org_code,
+			memcpy(llc.llc_snap_org_code, at_org_code,
 			    sizeof(llc.llc_snap_org_code));
 			llc.llc_snap_ether_type = htons(ETHERTYPE_ATALK);
-			bcopy(&llc, mtod(m, caddr_t), sizeof(struct llc));
+			memcpy(mtod(m, caddr_t), &llc, sizeof(struct llc));
 		} else {
 			etype = htons(ETHERTYPE_ATALK);
 		}
@@ -341,8 +336,8 @@
 #ifdef NS
 	case AF_NS:
 		etype = htons(ETHERTYPE_NS);
- 		bcopy((caddr_t)&(((struct sockaddr_ns *)dst)->sns_addr.x_host),
-		    (caddr_t)edst, sizeof (edst));
+		ether_copy(edst,
+		    &(((struct sockaddr_ns *)dst)->sns_addr.x_host));
 		if (!bcmp((caddr_t)edst, (caddr_t)&ns_thishost, sizeof(edst)))
 			return (looutput(ifp, m, dst, rt));
 		/* If broadcasting on a simplex interface, loopback a copy */
@@ -353,8 +348,8 @@
 #ifdef IPX
 	case AF_IPX:
 		etype = htons(ETHERTYPE_IPX);
- 		bcopy((caddr_t)&(((struct sockaddr_ipx *)dst)->sipx_addr.x_host),
-		    (caddr_t)edst, sizeof (edst));
+		ether_copy(edst,
+		    &(((struct sockaddr_ipx *)dst)->sipx_addr.x_host));
 		/* If broadcasting on a simplex interface, loopback a copy */
 		if ((m->m_flags & M_BCAST) && (ifp->if_flags & IFF_SIMPLEX))
 			mcopy = m_copy(m, 0, (int)M_COPYALL);
@@ -368,7 +363,7 @@
 
 		if (rt && (sdl = (struct sockaddr_dl *)rt->rt_gateway) &&
 		    sdl->sdl_family == AF_LINK && sdl->sdl_alen > 0) {
-			bcopy(LLADDR(sdl), (caddr_t)edst, sizeof(edst));
+			ether_copy(edst, LLADDR(sdl));
 		} else {
 			error = iso_snparesolve(ifp, (struct sockaddr_iso *)dst,
 						(char *)edst, &snpalen);
@@ -383,10 +378,9 @@
 			M_PREPEND(mcopy, sizeof (*eh), M_DONTWAIT);
 			if (mcopy) {
 				eh = mtod(mcopy, struct ether_header *);
-				bcopy((caddr_t)edst,
-				      (caddr_t)eh->ether_dhost, sizeof (edst));
-				bcopy(LLADDR(ifp->if_sadl),
-				      (caddr_t)eh->ether_shost, sizeof (edst));
+				ether_copy(eh->ether_dhost, edst);
+				ether_copy(eh->ether_shost,
+				    LLADDR(ifp->if_sadl));
 			}
 		}
 		M_PREPEND(m, 3, M_DONTWAIT);
@@ -414,18 +408,17 @@
 
 		if (sdl && sdl->sdl_family == AF_LINK
 		    && sdl->sdl_alen > 0) {
-			bcopy(LLADDR(sdl), (char *)edst,
-				sizeof(edst));
-		} else goto bad; /* Not a link interface ? Funny ... */
+			ether_copy(edst, LLADDR(sdl));
+		} else
+			goto bad; /* Not a link interface ? Funny ... */
 		if ((ifp->if_flags & IFF_SIMPLEX) && (*edst & 1) &&
 		    (mcopy = m_copy(m, 0, (int)M_COPYALL))) {
 			M_PREPEND(mcopy, sizeof (*eh), M_DONTWAIT);
 			if (mcopy) {
 				eh = mtod(mcopy, struct ether_header *);
-				bcopy((caddr_t)edst,
-				      (caddr_t)eh->ether_dhost, sizeof (edst));
-				bcopy(LLADDR(ifp->if_sadl),
-				      (caddr_t)eh->ether_shost, sizeof (edst));
+				ether_copy(eh->ether_dhost, edst);
+				ether_copy(eh->ether_shost,
+				    LLADDR(ifp->if_sadl));
 			}
 		}
 #ifdef LLC_DEBUG
@@ -448,12 +441,12 @@
 	case pseudo_AF_HDRCMPLT:
 		hdrcmplt = 1;
 		eh = (struct ether_header *)dst->sa_data;
-		bcopy((caddr_t)eh->ether_shost, (caddr_t)esrc, sizeof (esrc));
+		ether_copy(esrc, eh->ether_shost);
 		/* FALLTHROUGH */
 
 	case AF_UNSPEC:
 		eh = (struct ether_header *)dst->sa_data;
- 		bcopy((caddr_t)eh->ether_dhost, (caddr_t)edst, sizeof (edst));
+		ether_copy(edst, eh->ether_dhost);
 		/* AF_UNSPEC doesn't swap the byte order of the ether_type. */
 		etype = eh->ether_type;
 		break;
@@ -479,15 +472,37 @@
 	if (m == 0)
 		senderr(ENOBUFS);
 	eh = mtod(m, struct ether_header *);
-	bcopy((caddr_t)&etype,(caddr_t)&eh->ether_type,
-		sizeof(eh->ether_type));
- 	bcopy((caddr_t)edst, (caddr_t)eh->ether_dhost, sizeof (edst));
+#ifdef __NO_STRICT_ALIGNMENT
+	/*
+	 * On platforms with no strict alignment constraints, the
+	 * compiler is likely to inline this copy.
+	 */
+	memcpy(&eh->ether_type, &etype, sizeof(eh->ether_type));
+#else
+	/*
+	 * On platforms with strict alignment constraints, the compiler is
+	 * probably not going to inline the copy of the ethertype.  However,
+	 * a 2 byte copy is hardly worth a function call, so we'll inline
+	 * it here.
+	 *
+	 * NOTE: "etype" is already in network byte order.
+	 */
+	{
+		uint8_t *dstp = (uint8_t *) &eh->ether_type;
+#if BYTE_ORDER == BIG_ENDIAN
+		dstp[0] = etype >> 8;
+		dstp[1] = etype;
+#else
+		dstp[0] = etype;
+		dstp[1] = etype >> 8;
+#endif
+	}
+#endif /* __NO_STRICT_ALIGNMENT */
+	ether_copy(eh->ether_dhost, edst);
 	if (hdrcmplt)
-		bcopy((caddr_t)esrc, (caddr_t)eh->ether_shost,
-		    sizeof(eh->ether_shost));
+		ether_copy(eh->ether_shost, esrc);
 	else
-	 	bcopy(LLADDR(ifp->if_sadl), (caddr_t)eh->ether_shost,
-		    sizeof(eh->ether_shost));
+		ether_copy(eh->ether_shost, LLADDR(ifp->if_sadl));
 
 #ifdef PFIL_HOOKS
 	if ((error = pfil_run_hooks(&ifp->if_pfil, &m, ifp, PFIL_OUT)) != 0)
@@ -950,8 +965,8 @@
 				l->llc_dsap = l->llc_ssap;
 				l->llc_ssap = c;
 				if (m->m_flags & (M_BCAST | M_MCAST))
-					bcopy(LLADDR(ifp->if_sadl),
-					      (caddr_t)eh->ether_dhost, 6);
+					ether_copy(eh->ether_dhost,
+					    LLADDR(ifp->if_sadl));
 				sa.sa_family = AF_UNSPEC;
 				sa.sa_len = sizeof(sa);
 				eh2 = (struct ether_header *)sa.sa_data;
@@ -1188,8 +1203,8 @@
 	switch (sa->sa_family) {
 
 	case AF_UNSPEC:
-		bcopy(sa->sa_data, addrlo, ETHER_ADDR_LEN);
-		bcopy(addrlo, addrhi, ETHER_ADDR_LEN);
+		memcpy(addrlo, sa->sa_data, ETHER_ADDR_LEN);
+		memcpy(addrhi, addrlo, ETHER_ADDR_LEN);
 		break;
 
 #ifdef INET
@@ -1202,12 +1217,11 @@
 			 * multicast addresses used for IP.
 			 * (This is for the sake of IP multicast routers.)
 			 */
-			bcopy(ether_ipmulticast_min, addrlo, ETHER_ADDR_LEN);
-			bcopy(ether_ipmulticast_max, addrhi, ETHER_ADDR_LEN);
-		}
-		else {
+			memcpy(addrlo, ether_ipmulticast_min, ETHER_ADDR_LEN);
+			memcpy(addrhi, ether_ipmulticast_max, ETHER_ADDR_LEN);
+		} else {
 			ETHER_MAP_IP_MULTICAST(&sin->sin_addr, addrlo);
-			bcopy(addrlo, addrhi, ETHER_ADDR_LEN);
+			memcpy(addrhi, addrlo, ETHER_ADDR_LEN);
 		}
 		break;
 #endif
@@ -1221,11 +1235,11 @@
 			 * address used for IP6.
 			 * (This is used for multicast routers.)
 			 */
-			bcopy(ether_ip6multicast_min, addrlo, ETHER_ADDR_LEN);
-			bcopy(ether_ip6multicast_max, addrhi, ETHER_ADDR_LEN);
+			memcpy(addrlo, ether_ip6multicast_min, ETHER_ADDR_LEN);
+			memcpy(addrhi, ether_ip6multicast_max, ETHER_ADDR_LEN);
 		} else {
 			ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, addrlo);
-			bcopy(addrlo, addrhi, ETHER_ADDR_LEN);
+			memcpy(addrhi, addrlo, ETHER_ADDR_LEN);
 		}
 		break;
 #endif
@@ -1282,8 +1296,8 @@
 		splx(s);
 		return (ENOBUFS);
 	}
-	bcopy(addrlo, enm->enm_addrlo, 6);
-	bcopy(addrhi, enm->enm_addrhi, 6);
+	memcpy(enm->enm_addrlo, addrlo, ETHER_ADDR_LEN);
+	memcpy(enm->enm_addrhi, addrhi, ETHER_ADDR_LEN);
 	enm->enm_ec = ec;
 	enm->enm_refcount = 1;
 	LIST_INSERT_HEAD(&ec->ec_multiaddrs, enm, enm_list);

--hQiwHBbRI9kgIhsi--