Subject: Re: RTL8169 hw IP4CSUM_Tx workaround
To: None <tech-kern@NetBSD.org, tech-net@NetBSD.org>
From: Izumi Tsutsui <tsutsui@ceres.dti.ne.jp>
List: tech-net
Date: 11/12/2006 12:49:00
In article <061104203953.M0114017@mirage.ceres.dti.ne.jp>
I wrote:
> Then, what is the best way to workaround this
> (nasty but likely ;-p) hardware bug?
Both patches (padding mbuf, or padding by zero'ed dmamem) are attached.
---
Izumi Tsutsui
--- patch 1 (padding mbuf) ---
Index: rtl8169.c
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/rtl8169.c,v
retrieving revision 1.59
diff -u -r1.59 rtl8169.c
--- rtl8169.c 12 Nov 2006 03:09:37 -0000 1.59
+++ rtl8169.c 12 Nov 2006 03:25:18 -0000
@@ -153,6 +153,8 @@
#include <dev/ic/rtl8169var.h>
+static inline int re_cksum_pad(struct mbuf *);
+
static int re_newbuf(struct rtk_softc *, int, struct mbuf *);
static int re_rx_list_init(struct rtk_softc *);
static int re_tx_list_init(struct rtk_softc *);
@@ -765,7 +767,7 @@
*/
ifp->if_capabilities |=
- /* IFCAP_CSUM_IPv4_Tx | */ IFCAP_CSUM_IPv4_Rx |
+ IFCAP_CSUM_IPv4_Tx | IFCAP_CSUM_IPv4_Rx |
IFCAP_CSUM_TCPv4_Tx | IFCAP_CSUM_TCPv4_Rx |
IFCAP_CSUM_UDPv4_Tx | IFCAP_CSUM_UDPv4_Rx |
IFCAP_TSOv4;
@@ -1496,7 +1498,52 @@
return handled;
}
+/*
+ * This function is just taken and modified from bge(4) driver.
+ *
+ * Although re(4) chips support auto-padding small packets,
+ * but it seems to cause a bug on IPv4 hardware checksum offload.
+ * To avoid the bug, pad packets manually if ip4csum is enabled.
+ */
+static inline int
+re_cksum_pad(struct mbuf *pkt)
+{
+ struct mbuf *m_last, *m_pad;
+ int padlen;
+
+ padlen = RE_IP4CSUMTX_PADLEN + 1 - pkt->m_pkthdr.len;
+
+ /*
+ * Walk packet chain to find last mbuf. We will either
+ * pad there, or append a new mbuf and pad it.
+ */
+ for (m_last = pkt; m_last->m_next != NULL; m_last = m_last->m_next)
+ continue;
+
+ /* `m_last' now points to last in chain. */
+ if (M_TRAILINGSPACE(m_last) < padlen) {
+ /*
+ * Allocate new empty mbuf chain pad it. Compact later.
+ *
+ * XXX
+ * Is it better to allocate a new mbuf by MCLGET(9)
+ * and copy whole data to avoid one more fragment
+ * since the packet size is small enough in this case.
+ */
+ MGET(m_pad, M_DONTWAIT, MT_DATA);
+ if (m_pad == NULL)
+ return ENOMEM;
+ m_pad->m_len = 0;
+ m_last->m_next = m_pad;
+ m_last = m_pad;
+ }
+ memset(mtod(m_last, char *) + m_last->m_len, 0, padlen);
+ m_last->m_len += padlen;
+ pkt->m_pkthdr.len += padlen;
+
+ return 0;
+}
/*
* Main transmit routine for C+ and gigE NICs.
@@ -1556,6 +1603,16 @@
(M_CSUM_IPv4 | M_CSUM_TCPv4 | M_CSUM_UDPv4))
!= 0) {
re_flags |= RE_TDESC_CMD_IPCSUM;
+ /*
+ * Pad small packets explicitly if ip4csum is
+ * enabled to avoid hardware ip4csum-tx bug.
+ */
+ if (m->m_pkthdr.len <= RE_IP4CSUMTX_PADLEN) {
+ if (re_cksum_pad(m))
+ /* not enough memory */
+ break;
+ }
+
if (m->m_pkthdr.csum_flags & M_CSUM_TCPv4) {
re_flags |= RE_TDESC_CMD_TCPCSUM;
} else if (m->m_pkthdr.csum_flags &
Index: rtl81x9var.h
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/rtl81x9var.h,v
retrieving revision 1.34
diff -u -r1.34 rtl81x9var.h
--- rtl81x9var.h 12 Nov 2006 03:09:37 -0000 1.34
+++ rtl81x9var.h 12 Nov 2006 03:25:18 -0000
@@ -244,6 +244,14 @@
sizeof(struct re_desc), \
(ops))
+/*
+ * re(4) hardware ip4csum-tx could be mangled with 28 byte or less IP packets
+ */
+#define RE_IP4CSUMTX_MINLEN 28
+#define RE_IP4CSUMTX_PADLEN \
+ (sizeof(struct ether_header) + RE_IP4CSUMTX_MINLEN)
+
+
#define RTK_ATTACHED 0x00000001 /* attach has succeeded */
#define RTK_ENABLED 0x00000002 /* chip is enabled */
--- end of patch 1 ---
--- patch 2 (padding by zero'ed dmamem) ---
Index: rtl8169.c
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/rtl8169.c,v
retrieving revision 1.59
diff -u -r1.59 rtl8169.c
--- rtl8169.c 12 Nov 2006 03:09:37 -0000 1.59
+++ rtl8169.c 12 Nov 2006 03:39:01 -0000
@@ -690,7 +690,8 @@
}
/* Allocate DMA'able memory for the RX ring */
- if ((error = bus_dmamem_alloc(sc->sc_dmat, RE_RX_LIST_SZ,
+ if ((error = bus_dmamem_alloc(sc->sc_dmat,
+ RE_RX_LIST_SZ + RE_IP4CSUMTX_PADLEN,
RE_RING_ALIGN, 0, &sc->re_ldata.re_rx_listseg, 1,
&sc->re_ldata.re_rx_listnseg, BUS_DMA_NOWAIT)) != 0) {
aprint_error("%s: can't allocate rx listseg, error = %d\n",
@@ -700,17 +701,18 @@
/* Load the map for the RX ring. */
if ((error = bus_dmamem_map(sc->sc_dmat, &sc->re_ldata.re_rx_listseg,
- sc->re_ldata.re_rx_listnseg, RE_RX_LIST_SZ,
+ sc->re_ldata.re_rx_listnseg, RE_RX_LIST_SZ + RE_IP4CSUMTX_PADLEN,
(caddr_t *)&sc->re_ldata.re_rx_list,
BUS_DMA_COHERENT | BUS_DMA_NOWAIT)) != 0) {
aprint_error("%s: can't map rx list, error = %d\n",
sc->sc_dev.dv_xname, error);
goto fail_5;
}
- memset(sc->re_ldata.re_rx_list, 0, RE_RX_LIST_SZ);
+ memset(sc->re_ldata.re_rx_list, 0, RE_RX_LIST_SZ + RE_IP4CSUMTX_PADLEN);
- if ((error = bus_dmamap_create(sc->sc_dmat, RE_RX_LIST_SZ, 1,
- RE_RX_LIST_SZ, 0, 0,
+ if ((error = bus_dmamap_create(sc->sc_dmat,
+ RE_RX_LIST_SZ + RE_IP4CSUMTX_PADLEN, 1,
+ RE_RX_LIST_SZ + RE_IP4CSUMTX_PADLEN, 0, 0,
&sc->re_ldata.re_rx_list_map)) != 0) {
aprint_error("%s: can't create rx list map, error = %d\n",
sc->sc_dev.dv_xname, error);
@@ -719,7 +721,7 @@
if ((error = bus_dmamap_load(sc->sc_dmat,
sc->re_ldata.re_rx_list_map, sc->re_ldata.re_rx_list,
- RE_RX_LIST_SZ, NULL, BUS_DMA_NOWAIT)) != 0) {
+ RE_RX_LIST_SZ + RE_IP4CSUMTX_PADLEN, NULL, BUS_DMA_NOWAIT)) != 0) {
aprint_error("%s: can't load rx list, error = %d\n",
sc->sc_dev.dv_xname, error);
goto fail_7;
@@ -765,7 +767,7 @@
*/
ifp->if_capabilities |=
- /* IFCAP_CSUM_IPv4_Tx | */ IFCAP_CSUM_IPv4_Rx |
+ IFCAP_CSUM_IPv4_Tx | IFCAP_CSUM_IPv4_Rx |
IFCAP_CSUM_TCPv4_Tx | IFCAP_CSUM_TCPv4_Rx |
IFCAP_CSUM_UDPv4_Tx | IFCAP_CSUM_UDPv4_Rx |
IFCAP_TSOv4;
@@ -1324,7 +1326,7 @@
break;
}
- sc->re_ldata.re_tx_free += txq->txq_dmamap->dm_nsegs;
+ sc->re_ldata.re_tx_free += txq->txq_nsegs;
KASSERT(sc->re_ldata.re_tx_free <= RE_TX_DESC_CNT(sc));
bus_dmamap_sync(sc->sc_dmat, txq->txq_dmamap,
0, txq->txq_dmamap->dm_mapsize, BUS_DMASYNC_POSTWRITE);
@@ -1514,8 +1516,9 @@
struct m_tag *mtag;
#endif
uint32_t cmdstat, re_flags;
- int ofree, idx, error, seg;
+ int ofree, idx, error, nsegs, seg;
int startdesc, curdesc, lastdesc;
+ boolean_t pad;
sc = ifp->if_softc;
ofree = sc->re_ldata.re_txq_free;
@@ -1581,7 +1584,15 @@
continue;
}
- if (map->dm_nsegs > sc->re_ldata.re_tx_free - RE_NTXDESC_RSVD) {
+ nsegs = map->dm_nsegs;
+ pad = FALSE;
+ if (m->m_pkthdr.len <= RE_IP4CSUMTX_PADLEN &&
+ (re_flags & RE_TDESC_CMD_IPCSUM) != 0) {
+ pad = TRUE;
+ nsegs++;
+ }
+
+ if (nsegs > sc->re_ldata.re_tx_free - RE_NTXDESC_RSVD) {
/*
* Not enough free descriptors to transmit this packet.
*/
@@ -1640,7 +1651,7 @@
cmdstat |= RE_TDESC_CMD_OWN;
if (curdesc == (RE_TX_DESC_CNT(sc) - 1))
cmdstat |= RE_TDESC_CMD_EOR;
- if (seg == map->dm_nsegs - 1) {
+ if (seg == nsegs - 1) {
cmdstat |= RE_TDESC_CMD_EOF;
lastdesc = curdesc;
}
@@ -1648,6 +1659,26 @@
RE_TXDESCSYNC(sc, curdesc,
BUS_DMASYNC_PREREAD|BUS_DMASYNC_PREWRITE);
}
+ if (pad) {
+ bus_addr_t paddaddr;
+
+ d = &sc->re_ldata.re_tx_list[curdesc];
+ paddaddr =
+ sc->re_ldata.re_rx_list_map->dm_segs[0].ds_addr +
+ RE_TXPADOFF;
+ d->re_bufaddr_lo = htole32(RE_ADDR_LO(paddaddr));
+ d->re_bufaddr_hi = htole32(RE_ADDR_HI(paddaddr));
+ cmdstat = re_flags |
+ RE_TDESC_CMD_OWN | RE_TDESC_CMD_EOF |
+ (RE_IP4CSUMTX_PADLEN + 1 - m->m_pkthdr.len);
+ if (curdesc == (RE_TX_DESC_CNT(sc) - 1))
+ cmdstat |= RE_TDESC_CMD_EOR;
+ d->re_cmdstat = htole32(cmdstat);
+ RE_TXDESCSYNC(sc, curdesc,
+ BUS_DMASYNC_PREREAD|BUS_DMASYNC_PREWRITE);
+ lastdesc = curdesc;
+ curdesc = RE_NEXT_TX_DESC(sc, curdesc);
+ }
KASSERT(lastdesc != -1);
/*
@@ -1674,9 +1705,10 @@
/* update info of TX queue and descriptors */
txq->txq_mbuf = m;
txq->txq_descidx = lastdesc;
+ txq->txq_nsegs = nsegs;
sc->re_ldata.re_txq_free--;
- sc->re_ldata.re_tx_free -= map->dm_nsegs;
+ sc->re_ldata.re_tx_free -= nsegs;
sc->re_ldata.re_tx_nextfree = curdesc;
#if NBPFILTER > 0
Index: rtl81x9var.h
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/rtl81x9var.h,v
retrieving revision 1.34
diff -u -r1.34 rtl81x9var.h
--- rtl81x9var.h 12 Nov 2006 03:09:37 -0000 1.34
+++ rtl81x9var.h 12 Nov 2006 03:39:01 -0000
@@ -148,6 +148,7 @@
struct mbuf *txq_mbuf;
bus_dmamap_t txq_dmamap;
int txq_descidx;
+ int txq_nsegs;
};
struct re_list_data {
@@ -244,6 +245,21 @@
sizeof(struct re_desc), \
(ops))
+/*
+ * re(4) hardware ip4csum-tx could be mangled with 28 byte or less IP packets
+ */
+#define RE_IP4CSUMTX_MINLEN 28
+#define RE_IP4CSUMTX_PADLEN \
+ (sizeof(struct ether_header) + RE_IP4CSUMTX_MINLEN)
+/*
+ * XXX
+ * We are allocating pad DMA buffer after RX DMA descs for now
+ * because RE_TX_LIST_SZ(sc) always occupies whole page but
+ * RE_RX_LIST_SZ is less than PAGE_SIZE so there is some unused region.
+ */
+#define RE_TXPADOFF RE_RX_LIST_SZ
+
+
#define RTK_ATTACHED 0x00000001 /* attach has succeeded */
#define RTK_ENABLED 0x00000002 /* chip is enabled */
--- end of patch 2---