Subject: RTL8169 hw IP4CSUM_Tx workaround
To: None <tech-kern@NetBSD.org>
From: Izumi Tsutsui <tsutsui@ceres.dti.ne.jp>
List: tech-net
Date: 10/21/2006 15:15:32
Hi,

Currently hardware-assisted IPv4 header checksums for TX is
disabled on re(4) driver for RealTek 8169 chips, and a comment
in sys/dev/ic/rtl8169.c says that it may have some bugs on
sending small packets.

But according to a comment in FreeBSD's if_re.c,
the bug only happens if the re(4) chip is trying to autopad
small packets so we can avoid it by padding them manually:

>>	 * With some of the RealTek chips, using the checksum offload
>>	 * support in conjunction with the autopadding feature results
>>	 * in the transmission of corrupt frames. For example, if we
>>	 * need to send a really small IP fragment that's less than 60
>>	 * bytes in size, and IP header checksumming is enabled, the
>>	 * resulting ethernet frame that appears on the wire will
>>	 * have garbled payload.

FreeBSD's re(4) driver uses m_defrag() to allocate a new mbuf
and then adjusts packet size and zeroing data.
Since we don't have m_defrag() and I think a strategy used in
bge(4) driver seems better than always allocating a new mbuf.
(though some other drivers allocate own DMA memory for padding)

The attached patch seems to work on my RTL8169 and macppc,
but is there anyway to confirm if IFCAP_CSUM_IPv4_Tx
is actually working properly or not?
---
Izumi Tsutsui


Index: rtl8169.c
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/rtl8169.c,v
retrieving revision 1.43
diff -u -r1.43 rtl8169.c
--- rtl8169.c	21 Oct 2006 03:44:47 -0000	1.43
+++ rtl8169.c	21 Oct 2006 05:43:17 -0000
@@ -152,8 +152,11 @@
 
 #include <dev/ic/rtl8169var.h>
 
+#define	ETHER_PAD_LEN	(ETHER_MIN_LEN - ETHER_CRC_LEN)
+
 
 static int re_encap(struct rtk_softc *, struct mbuf *, int *);
+static inline int re_cksum_pad(struct mbuf *);
 
 static int re_newbuf(struct rtk_softc *, int, struct mbuf *);
 static int re_rx_list_init(struct rtk_softc *);
@@ -758,7 +761,7 @@
 	 */
 
 	ifp->if_capabilities |=
-	    /* IFCAP_CSUM_IPv4_Tx | */ IFCAP_CSUM_IPv4_Rx |
+	    IFCAP_CSUM_IPv4_Tx | IFCAP_CSUM_IPv4_Rx |
 	    IFCAP_CSUM_TCPv4_Tx | IFCAP_CSUM_TCPv4_Rx |
 	    IFCAP_CSUM_UDPv4_Tx | IFCAP_CSUM_UDPv4_Rx |
 	    IFCAP_TSOv4;
@@ -1499,6 +1502,52 @@
 	return handled;
 }
 
+/*
+ * This function is just taken and modified from bge(4) driver.
+ *
+ * Although re(4) chips support auto-padding small packets,
+ * but it seems to cause a bug on IPv4 hardware checksum offload.
+ * To avoid the bug, pad packets manually if ip4csum is enabled.
+ */
+static inline int
+re_cksum_pad(struct mbuf *pkt)
+{
+	struct mbuf *m_last, *m_pad;
+	int padlen;
+
+	padlen = ETHER_PAD_LEN - pkt->m_pkthdr.len;
+
+	/*
+	 * Walk packet chain to find last mbuf. We will either
+	 * pad there, or append a new mbuf and pad it.
+	 */
+	for (m_last = pkt; m_last->m_next != NULL; m_last = m_last->m_next)
+		continue;
+
+	/* `m_last' now points to last in chain. */
+	if (M_READONLY(m_last) || M_TRAILINGSPACE(m_last) < padlen) {
+		/*
+		 * Allocate new empty mbuf chain pad it. Compact later.
+		 *
+		 * XXX
+		 * Is it better to allocate a new mbuf by MCLGET(9)
+		 * and copy whole data to avoid one more fragment
+		 * since the packet size is small enough in this case.
+		 */
+		MGET(m_pad, M_DONTWAIT, MT_DATA);
+		if (m_pad == NULL)
+			return ENOMEM;
+		m_last->m_next = m_pad;
+		m_last = m_pad;
+	}
+
+	memset(mtod(m_last, char *) + m_last->m_len, 0, padlen);
+	m_last->m_len += padlen;
+	pkt->m_pkthdr.len += padlen;
+
+	return 0;
+}
+
 static int
 re_encap(struct rtk_softc *sc, struct mbuf *m, int *idx)
 {
@@ -1539,6 +1588,15 @@
 		if ((m->m_pkthdr.csum_flags &
 		    (M_CSUM_IPv4 | M_CSUM_TCPv4 | M_CSUM_UDPv4)) != 0) {
 			rtk_flags |= RTK_TDESC_CMD_IPCSUM;
+			/*
+			 * Pad small packets explicitly if ip4csum is enabled
+			 * to avoid a hardware bug around IPv4 outboard cksum.
+			 */
+			if (m->m_pkthdr.len < ETHER_PAD_LEN) {
+				error = re_cksum_pad(m);
+				if (error != 0)
+					return error;
+			}
 			if (m->m_pkthdr.csum_flags & M_CSUM_TCPv4) {
 				rtk_flags |= RTK_TDESC_CMD_TCPCSUM;
 			} else if (m->m_pkthdr.csum_flags & M_CSUM_UDPv4) {