Port-sparc64 archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

Revisiting gem(4) resets



Hi all,

Some time ago, we added a workaround for hardware receive lockups in the
gem(4) driver based on discussion in PR 46260 [*].  However, when I load
both CPU and network, I see a number of resets when the receiver isn't
locked up (i.e. in the "else" part of the check).

I don't think that we should reset in the cases where we are still making
progress (read/write pointers changing).  I also added a counter for the
overflow and I see that increase.  However, looking at the other counters,
 also see the > 63 descriptors counter increasing, so I suspect that we do
overflow the receive descriptor ring sometimes.  For example:

  # uptime; vmstat -e | grep gem
   9:22AM  up 23:23, 5 users, load averages: 4.50, 5.35, 5.48
  gem0 interrupts              298354724 3545 intr
  gem0 rx overflow                     3    0 intr
  gem0 tx interrupts           193911012 2304 intr
  gem0 rx interrupts           120872075 1436 intr
  gem0 rx 0desc                  1541503   18 intr
  gem0 rx 1desc                 32223639  382 intr
  gem0 rx 2desc                 23263318  276 intr
  gem0 rx 3desc                 51706004  614 intr
  gem0 rx >3desc                10289047  122 intr
  gem0 rx >7desc                 1847846   21 intr
  gem0 rx >15desc                    436    0 intr
  gem0 rx >31desc                    115    0 intr
  gem0 rx >63desc                    167    0 intr

where RX overflow has been reported 3 times, and we've processed more than
63 descriptors 167 times.

I would like to commit the attached patch, which still resets in the else
case if the read/write pointers aren't changing.

Regards,

Julian

PS.  I'm curious what causes the 0desc rate to increase, but I don't think
that it's relevant here.

[*]
  http://cvsweb.netbsd.org/bsdweb.cgi/src/sys/dev/ic/gem.c.diff?r1=1.98&r2=1.99&only_with_tag=MAIN&f=h
  http://gnats.netbsd.org/cgi-bin/query-pr-single.pl?number=46260
Index: src/sys/dev/ic/gem.c
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/gem.c,v
retrieving revision 1.132
diff -u -r1.132 gem.c
--- src/sys/dev/ic/gem.c	15 Sep 2020 08:33:40 -0000	1.132
+++ src/sys/dev/ic/gem.c	21 Jan 2021 08:25:05 -0000
@@ -78,6 +78,7 @@
 #include <dev/mii/miivar.h>
 #include <dev/mii/mii_bitbang.h>
 
 #include <dev/ic/gemreg.h>
 #include <dev/ic/gemvar.h>
 
@@ -165,10 +166,11 @@
 #ifdef GEM_COUNTERS
 		for (i = __arraycount(sc->sc_ev_rxhist); --i >= 0; )
 			evcnt_detach(&sc->sc_ev_rxhist[i]);
-		evcnt_detach(&sc->sc_ev_rxnobuf);
-		evcnt_detach(&sc->sc_ev_rxfull);
 		evcnt_detach(&sc->sc_ev_rxint);
 		evcnt_detach(&sc->sc_ev_txint);
+		evcnt_detach(&sc->sc_ev_rxfull);
+		evcnt_detach(&sc->sc_ev_rxnobuf);
+		evcnt_detach(&sc->sc_ev_rxoverflow);
 #endif
 		evcnt_detach(&sc->sc_ev_intr);
 
@@ -588,14 +590,16 @@
 	evcnt_attach_dynamic(&sc->sc_ev_intr, EVCNT_TYPE_INTR,
 	    NULL, device_xname(sc->sc_dev), "interrupts");
 #ifdef GEM_COUNTERS
+	evcnt_attach_dynamic(&sc->sc_ev_rxoverflow, EVCNT_TYPE_INTR,
+	    &sc->sc_ev_rxint, device_xname(sc->sc_dev), "rx overflow");
+	evcnt_attach_dynamic(&sc->sc_ev_rxnobuf, EVCNT_TYPE_INTR,
+	    &sc->sc_ev_rxint, device_xname(sc->sc_dev), "rx malloc failure");
+	evcnt_attach_dynamic(&sc->sc_ev_rxfull, EVCNT_TYPE_INTR,
+	    &sc->sc_ev_rxint, device_xname(sc->sc_dev), "rx ring full");
 	evcnt_attach_dynamic(&sc->sc_ev_txint, EVCNT_TYPE_INTR,
 	    &sc->sc_ev_intr, device_xname(sc->sc_dev), "tx interrupts");
 	evcnt_attach_dynamic(&sc->sc_ev_rxint, EVCNT_TYPE_INTR,
 	    &sc->sc_ev_intr, device_xname(sc->sc_dev), "rx interrupts");
-	evcnt_attach_dynamic(&sc->sc_ev_rxfull, EVCNT_TYPE_INTR,
-	    &sc->sc_ev_rxint, device_xname(sc->sc_dev), "rx ring full");
-	evcnt_attach_dynamic(&sc->sc_ev_rxnobuf, EVCNT_TYPE_INTR,
-	    &sc->sc_ev_rxint, device_xname(sc->sc_dev), "rx malloc failure");
 	evcnt_attach_dynamic(&sc->sc_ev_rxhist[0], EVCNT_TYPE_INTR,
 	    &sc->sc_ev_rxint, device_xname(sc->sc_dev), "rx 0desc");
 	evcnt_attach_dynamic(&sc->sc_ev_rxhist[1], EVCNT_TYPE_INTR,
@@ -1803,8 +1807,8 @@
 
 		if (rxstat & GEM_RD_BAD_CRC) {
 			if_statinc(ifp, if_ierrors);
-			aprint_error_dev(sc->sc_dev,
-			    "receive error: CRC error\n");
+			DPRINTF(sc, ("%s: receive error: CRC error\n",
+			    device_xname(sc->sc_dev)));
 			GEM_INIT_RXDESC(sc, i);
 			continue;
 		}
@@ -2215,8 +2219,11 @@
 		 */
 		if (rxstat & GEM_MAC_RX_OVERFLOW) {
 			if_statinc(ifp, if_ierrors);
+			GEM_COUNTER_INCR(sc, sc_ev_rxoverflow);
+#ifdef GEM_DEBUG
 			aprint_error_dev(sc->sc_dev,
 			    "receive error: RX overflow sc->rxptr %d, complete %d\n", sc->sc_rxptr, bus_space_read_4(t, h, GEM_RX_COMPLETION));
+#endif
 			sc->sc_rx_fifo_wr_ptr =
 				bus_space_read_4(t, h, GEM_RX_FIFO_WR_PTR);
 			sc->sc_rx_fifo_rd_ptr =
@@ -2258,6 +2265,7 @@
 	uint32_t rx_fifo_wr_ptr;
 	uint32_t rx_fifo_rd_ptr;
 	uint32_t state;
+	int needreset;
 
 	if ((ifp->if_flags & IFF_RUNNING) == 0) {
 		aprint_error_dev(sc->sc_dev, "receiver not running\n");
@@ -2282,25 +2290,33 @@
 		    "receiver stuck in overflow, resetting\n");
 		gem_init(ifp);
 	} else {
+		needreset = 1;
 		if ((state & GEM_MAC_STATE_OVERFLOW) != GEM_MAC_STATE_OVERFLOW) {
-			aprint_error_dev(sc->sc_dev,
-				"rx_watchdog: not in overflow state: 0x%x\n",
-				state);
+			DPRINTF(sc,
+			    ("%s: rx_watchdog: not in overflow state: 0x%x\n",
+			    device_xname(sc->sc_dev), state));
 		}
 		if (rx_fifo_wr_ptr != rx_fifo_rd_ptr) {
-			aprint_error_dev(sc->sc_dev,
-				"rx_watchdog: wr & rd ptr different\n");
+			DPRINTF(sc,
+			    ("%s: rx_watchdog: wr & rd ptr different\n",
+			    device_xname(sc->sc_dev), state));
+			needreset = 0;
 		}
 		if (sc->sc_rx_fifo_wr_ptr != rx_fifo_wr_ptr) {
-			aprint_error_dev(sc->sc_dev,
-				"rx_watchdog: wr pointer != saved\n");
+			DPRINTF(sc, ("%s: rx_watchdog: wr pointer != saved\n",
+			    device_xname(sc->sc_dev), state));
+			needreset = 0;
 		}
 		if (sc->sc_rx_fifo_rd_ptr != rx_fifo_rd_ptr) {
+			DPRINTF(sc, ("%s: rx_watchdog: rd pointer != saved\n",
+			    device_xname(sc->sc_dev), state));
+			needreset = 0;
+		}
+		if (needreset) {
 			aprint_error_dev(sc->sc_dev,
-				"rx_watchdog: rd pointer != saved\n");
+			    "rx_watchdog: resetting anyway\n");
+			gem_init(ifp);
 		}
-		aprint_error_dev(sc->sc_dev, "resetting anyway\n");
-		gem_init(ifp);
 	}
 }
 
Index: src/sys/dev/ic/gemvar.h
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/gemvar.h,v
retrieving revision 1.27
diff -u -r1.27 gemvar.h
--- src/sys/dev/ic/gemvar.h	1 Mar 2020 05:50:56 -0000	1.27
+++ src/sys/dev/ic/gemvar.h	21 Jan 2021 08:25:06 -0000
@@ -217,10 +217,11 @@
 
 	struct evcnt sc_ev_intr;
 #ifdef GEM_COUNTERS
-	struct evcnt sc_ev_txint;
-	struct evcnt sc_ev_rxint;
+	struct evcnt sc_ev_rxoverflow;
 	struct evcnt sc_ev_rxnobuf;
 	struct evcnt sc_ev_rxfull;
+	struct evcnt sc_ev_txint;
+	struct evcnt sc_ev_rxint;
 	struct evcnt sc_ev_rxhist[9];
 #endif
 


Home | Main Index | Thread Index | Old Index