tech-net archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

Re: NetBSD 5.1 TCP performance issue (lots of ACK)



On Thu, Oct 27, 2011 at 07:51:43PM +0200, Manuel Bouyer wrote:
> On Thu, Oct 27, 2011 at 12:00:33PM -0400, Thor Lancelot Simon wrote:
> > It's possible this has to do with the interrupt moderation tuning.  I
> > believe we've been pending the checkin of better values than the ones
> > I worked out from the documentation for quite some time -- there were
> > highly unobvious performance effects with small buffers.  Simon did
> > a bunch of testing and concluded, as I recall, that the values used
> > by Intel in the Linux driver were "magic" and that we should use
> > those, not mine.
> > 
> > If this hasn't been adjusted to match the Linux driver, you might
> > want to take a quick look at the values it uses and see whether
> > they yield better small-buffer performance in your case.
> 
> I looked quickly at this and came up with the attached patch.
> 
> With this (installed on both NetBSD hosts) I get mittiged results:
> - the NetBSD client against the linux server gets degranded and unstable
>   performances several runs gives large variations in speed
> - the NetBSD client against the NetBSD server gets better performances
>   in average (but still not in the 90MB range) and also with large
>   variations between runs
> - the linux client against the NetBSD server gets a little boost and
>   the speed is stll stable between runs
> - ttcp performances between NetBSD hosts gets a little boost too,
>   and the speed is stll stable between runs
> 
> But I do get Ierrs on both NetBSD hosts now, with the ttcp or glusterfs
> test. I don't know where these errors comes from. Linux has no errors.
> I don't think it's wm_add_rxbuf(), netstat -m and vmstat -m shows
> no issues with mbuf allocations.
> So I guess these are errors at the adapter level, we may need to change
> more things to match these values.
> Also, linux seems to be using more advanced features for these adapters,
> this is something we may have to look at too.

Here is an updated patch. The key point to avoid the receive errors is
to do another BUS_DMASYNC after reading wrx_status, before reading the
other values to avoid reading e.g. len before status gets updated.
The errors were because of 0-len receive descriptors.

With this I get 113MB/s with the ttcp test, and between 70 and 90MB/s
with glusterfs. the NetBSD client now gets the same speed with a
NetBSD or linux server.

In the patch there is changes for the WM_F_NEWQUEUE adapters but they
may not be correct. When using WM_F_NEWQUEUE for the i80003 (which
the linux driver does), performances are a little lower and I get
a high interrupt rate - just as if interrupt mitigation was not
working.

-- 
Manuel Bouyer <bouyer%antioche.eu.org@localhost>
     NetBSD: 26 ans d'experience feront toujours la difference
--
Index: sys/dev/pci/if_wm.c
===================================================================
RCS file: /cvsroot/src/sys/dev/pci/if_wm.c,v
retrieving revision 1.162.4.15
diff -u -p -u -r1.162.4.15 if_wm.c
--- sys/dev/pci/if_wm.c 7 Mar 2011 04:14:19 -0000       1.162.4.15
+++ sys/dev/pci/if_wm.c 28 Oct 2011 14:03:33 -0000
@@ -327,6 +327,15 @@ struct wm_softc {
        struct evcnt sc_ev_rx_xoff;     /* Rx PAUSE(!0) frames */
        struct evcnt sc_ev_rx_xon;      /* Rx PAUSE(0) frames */
        struct evcnt sc_ev_rx_macctl;   /* Rx Unsupported */
+
+       struct evcnt sc_ev_errcrc;      /* WMREG_CRCERRS */
+       struct evcnt sc_ev_erralg;      /* WMREG_ALGNERRC */
+       struct evcnt sc_ev_errsym;      /* WMREG_SYMERRC */
+       struct evcnt sc_ev_errrx;       /* WMREG_RXERRC */
+       struct evcnt sc_ev_errsec;      /* WMREG_SEC */
+       struct evcnt sc_ev_errcext;     /* WMREG_CEXTERR */
+       struct evcnt sc_ev_errrlec;     /* WMREG_RLEC */
+       struct evcnt sc_ev_erralloc;    /* mbuf alloc failed */
 #endif /* WM_EVENT_COUNTERS */
 
        bus_addr_t sc_tdt_reg;          /* offset of TDT register */
@@ -1116,7 +1125,8 @@ wm_attach(device_t parent, device_t self
        }
 
        if ((sc->sc_type == WM_T_82575) || (sc->sc_type == WM_T_82576)
-           || (sc->sc_type == WM_T_82580) || (sc->sc_type == WM_T_82580ER))
+           || (sc->sc_type == WM_T_82580) || (sc->sc_type == WM_T_82580ER)
+           /* || (sc->sc_type == WM_T_80003) */)
          sc->sc_flags |= WM_F_NEWQUEUE;
 
        /* Set device properties (mactype) */
@@ -1924,6 +1934,24 @@ wm_attach(device_t parent, device_t self
            NULL, xname, "rx_xon");
        evcnt_attach_dynamic(&sc->sc_ev_rx_macctl, EVCNT_TYPE_MISC,
            NULL, xname, "rx_macctl");
+
+       
+       evcnt_attach_dynamic(&sc->sc_ev_errcrc, EVCNT_TYPE_MISC,
+           NULL, xname, "errcrc");
+       evcnt_attach_dynamic(&sc->sc_ev_erralg, EVCNT_TYPE_MISC,
+           NULL, xname, "erralg");
+       evcnt_attach_dynamic(&sc->sc_ev_errsym, EVCNT_TYPE_MISC,
+           NULL, xname, "errsym");
+       evcnt_attach_dynamic(&sc->sc_ev_errrx, EVCNT_TYPE_MISC,
+           NULL, xname, "errrx");
+       evcnt_attach_dynamic(&sc->sc_ev_errsec, EVCNT_TYPE_MISC,
+           NULL, xname, "errsec");
+       evcnt_attach_dynamic(&sc->sc_ev_errcext, EVCNT_TYPE_MISC,
+           NULL, xname, "errcext");
+       evcnt_attach_dynamic(&sc->sc_ev_errrlec, EVCNT_TYPE_MISC,
+           NULL, xname, "errrlec");
+       evcnt_attach_dynamic(&sc->sc_ev_erralloc, EVCNT_TYPE_MISC,
+           NULL, xname, "erralloc");
 #endif /* WM_EVENT_COUNTERS */
 
        if (!pmf_device_register(self, NULL, NULL))
@@ -2879,11 +2907,7 @@ wm_rxintr(struct wm_softc *sc)
                    device_xname(sc->sc_dev), i));
 
                WM_CDRXSYNC(sc, i, BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE);
-
                status = sc->sc_rxdescs[i].wrx_status;
-               errors = sc->sc_rxdescs[i].wrx_errors;
-               len = le16toh(sc->sc_rxdescs[i].wrx_len);
-               vlantag = sc->sc_rxdescs[i].wrx_special;
 
                if ((status & WRX_ST_DD) == 0) {
                        /*
@@ -2892,6 +2916,14 @@ wm_rxintr(struct wm_softc *sc)
                        WM_CDRXSYNC(sc, i, BUS_DMASYNC_PREREAD);
                        break;
                }
+               /*
+                * sync again, to make sure the values below have been read
+                * after status.
+                */
+               WM_CDRXSYNC(sc, i, BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE);
+               errors = sc->sc_rxdescs[i].wrx_errors;
+               len = le16toh(sc->sc_rxdescs[i].wrx_len);
+               vlantag = sc->sc_rxdescs[i].wrx_special;
 
                if (__predict_false(sc->sc_rxdiscard)) {
                        DPRINTF(WM_DEBUG_RX,
@@ -2924,6 +2956,7 @@ wm_rxintr(struct wm_softc *sc)
                         * far, and discard the rest of the packet.
                         */
                        ifp->if_ierrors++;
+                       WM_EVCNT_INCR(&sc->sc_ev_erralloc);
                        bus_dmamap_sync(sc->sc_dmat, rxs->rxs_dmamap, 0,
                            rxs->rxs_dmamap->dm_mapsize, BUS_DMASYNC_PREREAD);
                        WM_INIT_RXDESC(sc, i);
@@ -3232,6 +3265,7 @@ wm_tick(void *arg)
 {
        struct wm_softc *sc = arg;
        struct ifnet *ifp = &sc->sc_ethercom.ec_if;
+       uint32_t reg;
        int s;
 
        s = splnet();
@@ -3245,14 +3279,20 @@ wm_tick(void *arg)
        }
 
        ifp->if_collisions += CSR_READ(sc, WMREG_COLC);
-       ifp->if_ierrors += 0ULL + /* ensure quad_t */
-           + CSR_READ(sc, WMREG_CRCERRS)
-           + CSR_READ(sc, WMREG_ALGNERRC)
-           + CSR_READ(sc, WMREG_SYMERRC)
-           + CSR_READ(sc, WMREG_RXERRC)
-           + CSR_READ(sc, WMREG_SEC)
-           + CSR_READ(sc, WMREG_CEXTERR)
-           + CSR_READ(sc, WMREG_RLEC);
+
+#define WM_IFERR(r, c) { \
+       reg = CSR_READ(sc, (r)); \
+       ifp->if_ierrors += 0ULL /* ensure quad_t */ + reg; \
+       WM_EVCNT_ADD((c), reg);\
+}
+       WM_IFERR(WMREG_CRCERRS, &sc->sc_ev_errcrc);
+       WM_IFERR(WMREG_ALGNERRC, &sc->sc_ev_erralg);
+       WM_IFERR(WMREG_SYMERRC, &sc->sc_ev_errsym);
+       WM_IFERR(WMREG_RXERRC, &sc->sc_ev_errrx);
+       WM_IFERR(WMREG_SEC, &sc->sc_ev_errsec);
+       WM_IFERR(WMREG_CEXTERR, &sc->sc_ev_errcext);
+       WM_IFERR(WMREG_RLEC, &sc->sc_ev_errrlec);
+#undef WM_IFERR
 
        if (sc->sc_flags & WM_F_HAS_MII)
                mii_tick(&sc->sc_mii);
@@ -3612,7 +3652,9 @@ wm_init(struct ifnet *ifp)
 
        /* update statistics before reset */
        ifp->if_collisions += CSR_READ(sc, WMREG_COLC);
-       ifp->if_ierrors += CSR_READ(sc, WMREG_RXERRC);
+       reg = CSR_READ(sc, WMREG_RXERRC);
+       ifp->if_ierrors += 0ULL + reg;
+       WM_EVCNT_ADD(&sc->sc_ev_errrx, reg);
 
        /* Reset the chip to a known state. */
        wm_reset(sc);
@@ -3664,14 +3706,23 @@ wm_init(struct ifnet *ifp)
                CSR_WRITE(sc, WMREG_TDLEN, WM_TXDESCSIZE(sc));
                CSR_WRITE(sc, WMREG_TDH, 0);
                CSR_WRITE(sc, WMREG_TDT, 0);
+#if 0
                CSR_WRITE(sc, WMREG_TIDV, 375);         /* ITR / 4 */
                CSR_WRITE(sc, WMREG_TADV, 375);         /* should be same */
+#endif
+               CSR_WRITE(sc, WMREG_TIDV, 8);           /* linux magic */
+               CSR_WRITE(sc, WMREG_TADV, 32);          /* ditto */
 
-               if ((sc->sc_flags & WM_F_NEWQUEUE) != 0)
-                       CSR_WRITE(sc, WMREG_TXDCTL, TXDCTL_QUEUE_ENABLE
-                           | TXDCTL_PTHRESH(0) | TXDCTL_HTHRESH(0)
-                           | TXDCTL_WTHRESH(0));
-               else {
+               if ((sc->sc_flags & WM_F_NEWQUEUE) != 0) {
+                       CSR_WRITE(sc, WMREG_TXDCTL_1,
+                             TXDCTL_COUNT_DESC | TXDCTL_DESC_GRAN
+                           | TXDCTL_PTHRESH(0x1f) | TXDCTL_HTHRESH(1)
+                           | TXDCTL_WTHRESH(1));
+                       CSR_WRITE(sc, WMREG_TXDCTL,
+                             TXDCTL_COUNT_DESC | TXDCTL_DESC_GRAN
+                           | TXDCTL_PTHRESH(0x1f) | TXDCTL_HTHRESH(1)
+                           | TXDCTL_WTHRESH(1));
+               } else {
                        CSR_WRITE(sc, WMREG_TXDCTL, TXDCTL_PTHRESH(0) |
                            TXDCTL_HTHRESH(0) | TXDCTL_WTHRESH(0));
                        CSR_WRITE(sc, WMREG_RXDCTL, RXDCTL_PTHRESH(0) |
@@ -3716,14 +3767,23 @@ wm_init(struct ifnet *ifp)
                                panic("%s: MCLBYTES %d unsupported for i2575 or 
higher\n", __func__, MCLBYTES);
                        CSR_WRITE(sc, WMREG_SRRCTL, SRRCTL_DESCTYPE_LEGACY
                            | (MCLBYTES >> SRRCTL_BSIZEPKT_SHIFT));
-                       CSR_WRITE(sc, WMREG_RXDCTL, RXDCTL_QUEUE_ENABLE
-                           | RXDCTL_PTHRESH(16) | RXDCTL_HTHRESH(8)
-                           | RXDCTL_WTHRESH(1));
+                       CSR_WRITE(sc, WMREG_RXDCTL, RXDCTL_GRAN
+                           | RXDCTL_PTHRESH(4) | RXDCTL_HTHRESH(0x20)
+                           | RXDCTL_WTHRESH(4));
+                       CSR_WRITE(sc, WMREG_RXDCTL_1, RXDCTL_GRAN
+                           | RXDCTL_PTHRESH(4) | RXDCTL_HTHRESH(0x20)
+                           | RXDCTL_WTHRESH(4));
+                       CSR_WRITE(sc, WMREG_RDTR, 0x20);/* linux magic */
+                       CSR_WRITE(sc, WMREG_RADV, 0x20);        /* ditto */
                } else {
                        CSR_WRITE(sc, WMREG_RDH, 0);
                        CSR_WRITE(sc, WMREG_RDT, 0);
+#if 0
                        CSR_WRITE(sc, WMREG_RDTR, 375 | RDTR_FPD);      /* 
ITR/4 */
                        CSR_WRITE(sc, WMREG_RADV, 375);         /* MUST be same 
*/
+#endif
+                       CSR_WRITE(sc, WMREG_RDTR, 0 | RDTR_FPD);/* linux magic 
*/
+                       CSR_WRITE(sc, WMREG_RADV, 8);           /* ditto */
                }
        }
        for (i = 0; i < WM_NRXDESC; i++) {
@@ -3892,7 +3952,10 @@ wm_init(struct ifnet *ifp)
                  * divided by 4 to get "simple timer" behavior.
                  */
 
+#if 0
                sc->sc_itr = 1500;              /* 2604 ints/sec */
+#endif
+               sc->sc_itr = 1000000000 / (20000 * 256); /* linux defaults */
                CSR_WRITE(sc, WMREG_ITR, sc->sc_itr);
        }
 
Index: sys/dev/pci/if_wmreg.h
===================================================================
RCS file: /cvsroot/src/sys/dev/pci/if_wmreg.h,v
retrieving revision 1.24.20.5
diff -u -p -u -r1.24.20.5 if_wmreg.h
--- sys/dev/pci/if_wmreg.h      19 Nov 2010 23:40:28 -0000      1.24.20.5
+++ sys/dev/pci/if_wmreg.h      28 Oct 2011 14:03:33 -0000
@@ -495,9 +495,10 @@ struct livengood_tcpip_ctxdesc {
 #define        WMREG_RDT_2     0x0c18  /* for 82576 ... */
 
 #define        WMREG_RXDCTL    0x2828  /* Receive Descriptor Control */
+#define        WMREG_RXDCTL_1  0x2928  /* Receive Descriptor Control - queue 1 
*/
 #define        WMREG_RXDCTL_2  0x0c28  /* for 82576 ... */
-#define        RXDCTL_PTHRESH(x) ((x) << 0)    /* prefetch threshold */
-#define        RXDCTL_HTHRESH(x) ((x) << 8)    /* host threshold */
+#define        RXDCTL_PTHRESH(x) ((x) << 8)    /* prefetch threshold */
+#define        RXDCTL_HTHRESH(x) ((x) << 0)    /* host threshold */
 #define        RXDCTL_WTHRESH(x) ((x) << 16)   /* write back threshold */
 #define        RXDCTL_GRAN     (1U << 24)      /* 0 = cacheline, 1 = 
descriptor */
 /* flags used starting with 82575 ... */
@@ -625,10 +626,13 @@ struct livengood_tcpip_ctxdesc {
 #define        WMREG_TIDV      0x3820
 
 #define        WMREG_TXDCTL    0x3828  /* Trandmit Descriptor Control */
+#define        WMREG_TXDCTL_1  0x3928  /* Trandmit Descriptor Control - queue 
1 */
 #define        TXDCTL_PTHRESH(x) ((x) << 0)    /* prefetch threshold */
 #define        TXDCTL_HTHRESH(x) ((x) << 8)    /* host threshold */
 #define        TXDCTL_WTHRESH(x) ((x) << 16)   /* write back threshold */
 /* flags used starting with 82575 ... */
+#define TXDCTL_COUNT_DESC    0x00400000 /* enable counting of desc left */
+#define TXDCTL_DESC_GRAN     0x01000000 /* descriptor granularity */
 #define TXDCTL_QUEUE_ENABLE  0x02000000 /* Enable specific Tx Queue */
 #define TXDCTL_SWFLSH        0x04000000 /* Tx Desc. write-back flushing */
 #define TXDCTL_PRIORITY      0x08000000


Home | Main Index | Thread Index | Old Index