Subject: Re: RTL8169 hw IP4CSUM_Tx workaround
To: None <tech-kern@NetBSD.org, tech-net@NetBSD.org>
From: Izumi Tsutsui <tsutsui@ceres.dti.ne.jp>
List: tech-net
Date: 11/12/2006 12:49:00
In article <061104203953.M0114017@mirage.ceres.dti.ne.jp>
I wrote:

> Then, what is the best way to workaround this
> (nasty but likely ;-p) hardware bug?

Both patches (one padding the mbuf chain, the other padding via zero'ed dmamem) are attached.
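
For reference, here is a tiny standalone sketch (not part of either patch)
of the length check both patches share: frames of RE_IP4CSUMTX_PADLEN
(sizeof(struct ether_header) + 28 = 42) bytes or less are padded out to
43 bytes whenever IPv4 TX checksum offload is requested for them.
ETHER_HDR_LEN below is just a stand-in for sizeof(struct ether_header).

/*
 * Illustration only, not driver code: shows how much padding the
 * workaround adds for small frames.  The values are taken from the
 * RE_IP4CSUMTX_* macros introduced by the patches below.
 */
#include <stdio.h>

#define ETHER_HDR_LEN		14	/* stand-in for sizeof(struct ether_header) */
#define RE_IP4CSUMTX_MINLEN	28	/* problematic IP packet length */
#define RE_IP4CSUMTX_PADLEN	(ETHER_HDR_LEN + RE_IP4CSUMTX_MINLEN)

int
main(void)
{
	int pktlen;

	for (pktlen = 40; pktlen <= 44; pktlen++) {
		if (pktlen <= RE_IP4CSUMTX_PADLEN)
			printf("%d byte frame: pad %d byte(s) -> %d bytes\n",
			    pktlen, RE_IP4CSUMTX_PADLEN + 1 - pktlen,
			    RE_IP4CSUMTX_PADLEN + 1);
		else
			printf("%d byte frame: no padding needed\n", pktlen);
	}
	return 0;
}

The only difference between the two patches is where the zero padding
comes from: patch 1 extends the mbuf chain itself (re_cksum_pad()),
while patch 2 leaves the mbuf alone and appends one extra TX descriptor
pointing at a zero'ed region after the RX list in DMA memory.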
---
Izumi Tsutsui


--- patch 1 (padding mbuf) ---

Index: rtl8169.c
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/rtl8169.c,v
retrieving revision 1.59
diff -u -r1.59 rtl8169.c
--- rtl8169.c	12 Nov 2006 03:09:37 -0000	1.59
+++ rtl8169.c	12 Nov 2006 03:25:18 -0000
@@ -153,6 +153,8 @@
 #include <dev/ic/rtl8169var.h>
 
 
+static inline int re_cksum_pad(struct mbuf *);
+
 static int re_newbuf(struct rtk_softc *, int, struct mbuf *);
 static int re_rx_list_init(struct rtk_softc *);
 static int re_tx_list_init(struct rtk_softc *);
@@ -765,7 +767,7 @@
 	 */
 
 	ifp->if_capabilities |=
-	    /* IFCAP_CSUM_IPv4_Tx | */ IFCAP_CSUM_IPv4_Rx |
+	    IFCAP_CSUM_IPv4_Tx | IFCAP_CSUM_IPv4_Rx |
 	    IFCAP_CSUM_TCPv4_Tx | IFCAP_CSUM_TCPv4_Rx |
 	    IFCAP_CSUM_UDPv4_Tx | IFCAP_CSUM_UDPv4_Rx |
 	    IFCAP_TSOv4;
@@ -1496,7 +1498,52 @@
 	return handled;
 }
 
+/*
+ * This function is taken and modified from the bge(4) driver.
+ *
+ * Although re(4) chips support auto-padding of small packets,
+ * it seems to trigger a bug in IPv4 hardware checksum offload.
+ * To avoid the bug, pad packets manually if ip4csum is enabled.
+ */
+static inline int
+re_cksum_pad(struct mbuf *pkt)
+{
+	struct mbuf *m_last, *m_pad;
+	int padlen;
+
+	padlen = RE_IP4CSUMTX_PADLEN + 1 - pkt->m_pkthdr.len;
+
+	/*
+	 * Walk packet chain to find last mbuf. We will either
+	 * pad there, or append a new mbuf and pad it.
+	 */
+	for (m_last = pkt; m_last->m_next != NULL; m_last = m_last->m_next)
+		continue;
+
+	/* `m_last' now points to last in chain. */
+	if (M_TRAILINGSPACE(m_last) < padlen) {
+		/*
+		 * Allocate a new empty mbuf, chain it, and pad it. Compact later.
+		 *
+		 * XXX
+		 * Would it be better to allocate a new mbuf with MCLGET(9)
+		 * and copy the whole data to avoid one more fragment,
+		 * since the packet size is small enough in this case?
+		 */
+		MGET(m_pad, M_DONTWAIT, MT_DATA);
+		if (m_pad == NULL)
+			return ENOMEM;
+		m_pad->m_len = 0;
+		m_last->m_next = m_pad;
+		m_last = m_pad;
+	}
 
+	memset(mtod(m_last, char *) + m_last->m_len, 0, padlen);
+	m_last->m_len += padlen;
+	pkt->m_pkthdr.len += padlen;
+
+	return 0;
+}
 
 /*
  * Main transmit routine for C+ and gigE NICs.
@@ -1556,6 +1603,16 @@
 			    (M_CSUM_IPv4 | M_CSUM_TCPv4 | M_CSUM_UDPv4))
 			    != 0) {
 				re_flags |= RE_TDESC_CMD_IPCSUM;
+				/*
+				 * Pad small packets explicitly if ip4csum is
+				 * enabled, to avoid the hardware ip4csum-tx bug.
+				 */
+				if (m->m_pkthdr.len <= RE_IP4CSUMTX_PADLEN) {
+					if (re_cksum_pad(m))
+						/* not enough memory */
+						break;
+				}
+
 				if (m->m_pkthdr.csum_flags & M_CSUM_TCPv4) {
 					re_flags |= RE_TDESC_CMD_TCPCSUM;
 				} else if (m->m_pkthdr.csum_flags &
Index: rtl81x9var.h
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/rtl81x9var.h,v
retrieving revision 1.34
diff -u -r1.34 rtl81x9var.h
--- rtl81x9var.h	12 Nov 2006 03:09:37 -0000	1.34
+++ rtl81x9var.h	12 Nov 2006 03:25:18 -0000
@@ -244,6 +244,14 @@
 	    sizeof(struct re_desc),					\
 	    (ops))
 
+/*
+ * re(4) hardware ip4csum-tx can be mangled on IP packets of 28 bytes or less
+ */
+#define RE_IP4CSUMTX_MINLEN	28
+#define RE_IP4CSUMTX_PADLEN	\
+	(sizeof(struct ether_header) + RE_IP4CSUMTX_MINLEN)
+
+
 #define RTK_ATTACHED 0x00000001 /* attach has succeeded */
 #define RTK_ENABLED  0x00000002 /* chip is enabled	*/
 
--- end of patch 1 ---


--- patch 2 (padding by zero'ed dmamem) ---

Index: rtl8169.c
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/rtl8169.c,v
retrieving revision 1.59
diff -u -r1.59 rtl8169.c
--- rtl8169.c	12 Nov 2006 03:09:37 -0000	1.59
+++ rtl8169.c	12 Nov 2006 03:39:01 -0000
@@ -690,7 +690,8 @@
 	}
 
 	/* Allocate DMA'able memory for the RX ring */
-	if ((error = bus_dmamem_alloc(sc->sc_dmat, RE_RX_LIST_SZ,
+	if ((error = bus_dmamem_alloc(sc->sc_dmat,
+	    RE_RX_LIST_SZ + RE_IP4CSUMTX_PADLEN,
 	    RE_RING_ALIGN, 0, &sc->re_ldata.re_rx_listseg, 1,
 	    &sc->re_ldata.re_rx_listnseg, BUS_DMA_NOWAIT)) != 0) {
 		aprint_error("%s: can't allocate rx listseg, error = %d\n",
@@ -700,17 +701,18 @@
 
 	/* Load the map for the RX ring. */
 	if ((error = bus_dmamem_map(sc->sc_dmat, &sc->re_ldata.re_rx_listseg,
-	    sc->re_ldata.re_rx_listnseg, RE_RX_LIST_SZ,
+	    sc->re_ldata.re_rx_listnseg, RE_RX_LIST_SZ + RE_IP4CSUMTX_PADLEN,
 	    (caddr_t *)&sc->re_ldata.re_rx_list,
 	    BUS_DMA_COHERENT | BUS_DMA_NOWAIT)) != 0) {
 		aprint_error("%s: can't map rx list, error = %d\n",
 		    sc->sc_dev.dv_xname, error);
 		goto fail_5;
 	}
-	memset(sc->re_ldata.re_rx_list, 0, RE_RX_LIST_SZ);
+	memset(sc->re_ldata.re_rx_list, 0, RE_RX_LIST_SZ + RE_IP4CSUMTX_PADLEN);
 
-	if ((error = bus_dmamap_create(sc->sc_dmat, RE_RX_LIST_SZ, 1,
-	    RE_RX_LIST_SZ, 0, 0,
+	if ((error = bus_dmamap_create(sc->sc_dmat,
+	    RE_RX_LIST_SZ + RE_IP4CSUMTX_PADLEN, 1,
+	    RE_RX_LIST_SZ + RE_IP4CSUMTX_PADLEN, 0, 0,
 	    &sc->re_ldata.re_rx_list_map)) != 0) {
 		aprint_error("%s: can't create rx list map, error = %d\n",
 		    sc->sc_dev.dv_xname, error);
@@ -719,7 +721,7 @@
 
 	if ((error = bus_dmamap_load(sc->sc_dmat,
 	    sc->re_ldata.re_rx_list_map, sc->re_ldata.re_rx_list,
-	    RE_RX_LIST_SZ, NULL, BUS_DMA_NOWAIT)) != 0) {
+	    RE_RX_LIST_SZ + RE_IP4CSUMTX_PADLEN, NULL, BUS_DMA_NOWAIT)) != 0) {
 		aprint_error("%s: can't load rx list, error = %d\n",
 		    sc->sc_dev.dv_xname, error);
 		goto fail_7;
@@ -765,7 +767,7 @@
 	 */
 
 	ifp->if_capabilities |=
-	    /* IFCAP_CSUM_IPv4_Tx | */ IFCAP_CSUM_IPv4_Rx |
+	    IFCAP_CSUM_IPv4_Tx | IFCAP_CSUM_IPv4_Rx |
 	    IFCAP_CSUM_TCPv4_Tx | IFCAP_CSUM_TCPv4_Rx |
 	    IFCAP_CSUM_UDPv4_Tx | IFCAP_CSUM_UDPv4_Rx |
 	    IFCAP_TSOv4;
@@ -1324,7 +1326,7 @@
 			break;
 		}
 
-		sc->re_ldata.re_tx_free += txq->txq_dmamap->dm_nsegs;
+		sc->re_ldata.re_tx_free += txq->txq_nsegs;
 		KASSERT(sc->re_ldata.re_tx_free <= RE_TX_DESC_CNT(sc));
 		bus_dmamap_sync(sc->sc_dmat, txq->txq_dmamap,
 		    0, txq->txq_dmamap->dm_mapsize, BUS_DMASYNC_POSTWRITE);
@@ -1514,8 +1516,9 @@
 	struct m_tag		*mtag;
 #endif
 	uint32_t		cmdstat, re_flags;
-	int			ofree, idx, error, seg;
+	int			ofree, idx, error, nsegs, seg;
 	int			startdesc, curdesc, lastdesc;
+	boolean_t		pad;
 
 	sc = ifp->if_softc;
 	ofree = sc->re_ldata.re_txq_free;
@@ -1581,7 +1584,15 @@
 			continue;
 		}
 
-		if (map->dm_nsegs > sc->re_ldata.re_tx_free - RE_NTXDESC_RSVD) {
+		nsegs = map->dm_nsegs;
+		pad = FALSE;
+		if (m->m_pkthdr.len <= RE_IP4CSUMTX_PADLEN &&
+		    (re_flags & RE_TDESC_CMD_IPCSUM) != 0) {
+			pad = TRUE;
+			nsegs++;
+		}
+
+		if (nsegs > sc->re_ldata.re_tx_free - RE_NTXDESC_RSVD) {
 			/*
 			 * Not enough free descriptors to transmit this packet.
 			 */
@@ -1640,7 +1651,7 @@
 				cmdstat |= RE_TDESC_CMD_OWN;
 			if (curdesc == (RE_TX_DESC_CNT(sc) - 1))
 				cmdstat |= RE_TDESC_CMD_EOR;
-			if (seg == map->dm_nsegs - 1) {
+			if (seg == nsegs - 1) {
 				cmdstat |= RE_TDESC_CMD_EOF;
 				lastdesc = curdesc;
 			}
@@ -1648,6 +1659,26 @@
 			RE_TXDESCSYNC(sc, curdesc,
 			    BUS_DMASYNC_PREREAD|BUS_DMASYNC_PREWRITE);
 		}
+		if (pad) {
+			bus_addr_t paddaddr;
+
+			d = &sc->re_ldata.re_tx_list[curdesc];
+			paddaddr =
+			    sc->re_ldata.re_rx_list_map->dm_segs[0].ds_addr +
+			    RE_TXPADOFF;
+			d->re_bufaddr_lo = htole32(RE_ADDR_LO(paddaddr));
+			d->re_bufaddr_hi = htole32(RE_ADDR_HI(paddaddr));
+			cmdstat = re_flags |
+			    RE_TDESC_CMD_OWN | RE_TDESC_CMD_EOF |
+			    (RE_IP4CSUMTX_PADLEN + 1 - m->m_pkthdr.len);
+			if (curdesc == (RE_TX_DESC_CNT(sc) - 1))
+				cmdstat |= RE_TDESC_CMD_EOR;
+			d->re_cmdstat = htole32(cmdstat);
+			RE_TXDESCSYNC(sc, curdesc,
+			    BUS_DMASYNC_PREREAD|BUS_DMASYNC_PREWRITE);
+			lastdesc = curdesc;
+			curdesc = RE_NEXT_TX_DESC(sc, curdesc);
+		}
 		KASSERT(lastdesc != -1);
 
 		/*
@@ -1674,9 +1705,10 @@
 		/* update info of TX queue and descriptors */
 		txq->txq_mbuf = m;
 		txq->txq_descidx = lastdesc;
+		txq->txq_nsegs = nsegs;
 
 		sc->re_ldata.re_txq_free--;
-		sc->re_ldata.re_tx_free -= map->dm_nsegs;
+		sc->re_ldata.re_tx_free -= nsegs;
 		sc->re_ldata.re_tx_nextfree = curdesc;
 
 #if NBPFILTER > 0
Index: rtl81x9var.h
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/rtl81x9var.h,v
retrieving revision 1.34
diff -u -r1.34 rtl81x9var.h
--- rtl81x9var.h	12 Nov 2006 03:09:37 -0000	1.34
+++ rtl81x9var.h	12 Nov 2006 03:39:01 -0000
@@ -148,6 +148,7 @@
 	struct mbuf *txq_mbuf;
 	bus_dmamap_t txq_dmamap;
 	int txq_descidx;
+	int txq_nsegs;
 };
 
 struct re_list_data {
@@ -244,6 +245,21 @@
 	    sizeof(struct re_desc),					\
 	    (ops))
 
+/*
+ * re(4) hardware ip4csum-tx can be mangled on IP packets of 28 bytes or less
+ */
+#define RE_IP4CSUMTX_MINLEN	28
+#define RE_IP4CSUMTX_PADLEN	\
+	(sizeof(struct ether_header) + RE_IP4CSUMTX_MINLEN)
+/*
+ * XXX
+ * We are allocating pad DMA buffer after RX DMA descs for now
+ * because RE_TX_LIST_SZ(sc) always occupies whole page but
+ * RE_RX_LIST_SZ is less than PAGE_SIZE so there is some unused region.
+ */
+#define RE_TXPADOFF		RE_RX_LIST_SZ
+
+
 #define RTK_ATTACHED 0x00000001 /* attach has succeeded */
 #define RTK_ENABLED  0x00000002 /* chip is enabled	*/
 
--- end of patch 2 ---