Subject: Patch to dynamically adjust bge driver Rx interrupt thresholds
To: None <tech-kern@netbsd.org>
From: Jonathan Stone <jonathan@dsg.stanford.edu>
List: tech-kern
Date: 03/16/2004 09:54:06
The following patch is a back-port of proprietary code that allows for
dynamic adjustment of receive interrupt thresholds on the bge driver.

The original version used yet more proprietary code to monitor total
CPU usage, and auto-adjust the thresholds (up, down, or just fine)
accordingly. That code isn't released; so for NetBSD I'm assuming a
sysctl knob, either for all bge instances or (if there's an agreed
naming scheme, and template code I can reuse) for specific instances.

The iterator in the patch below shouldn't really be included; it just
iterates over all interfaces. I know there's a cleaner way to iterate
over just all bge instances (rusty memory of finding bge_cd and
iterating over cd_ndevs instances?), but I'd appreciate a quick
reminder of the currently-approved way to do that; I didn't find an
example in a very quick skim.  Examples with a permanent,
device-specific sysctl would be even nicer :-).

Other feedback welcomed, too, of course...

Index: if_bge.c
===================================================================
RCS file: /cvsroot/src/sys/dev/pci/if_bge.c,v
retrieving revision 1.60
diff -u -r1.60 if_bge.c
--- if_bge.c	10 Mar 2004 18:46:10 -0000	1.60
+++ if_bge.c	16 Mar 2004 17:45:48 -0000
@@ -125,6 +125,51 @@
 
 #define ETHER_MIN_NOPAD (ETHER_MIN_LEN - ETHER_CRC_LEN) /* i.e., 60 */
 
+
+/*
+ * Tunable thresholds for rx-side bge interrupt mitigation.
+ */
+
+/*
+ * The pairs of values below were obtained from empirical measurement
+ * on bcm5700 rev B2; they are designed to give roughly 1 receive
+ * interrupt for every N packets received, where N is, approximately,
+ * the second value (rx_max_bds) in each pair.  The values are chosen
+ * such that moving from one pair to the succeeding pair was observed
+ * to roughly halve interrupt rate under sustained input packet load.
+ * The values were empirically chosen to avoid overflowing internal
+ * limits on the bcm5700: increasing rx_ticks much beyond 600
+ * results in internal wrapping and higher interrupt rates.
+ * The limit of 46 frames was chosen to match NFS workloads.
+ * 
+ * These values also work well on bcm5701, bcm5704C, and (less
+ * tested) bcm5703.  On other chipsets, (including the Altima chip
+ * family), the larger values may overflow internal chip limits,
+ * leading to increasing interrupt rates rather than lower interrupt
+ * rates.
+ *
+ * Applications using heavy interrupt mitigation (interrupting every
+ * 32 or 46 frames) in both directions may need to increase the TCP
+ * windowsize to above 131072 bytes (e.g., to 199608 bytes) to sustain
+ * full link bandwidth, due to ACKs and window updates lingering 
+ * in the RX queue during the 30-to-40-frame interrupt-mitigation window.
+ */
+struct bge_load_rx_thresh {
+	int rx_ticks;		/* stored in sc->bge_rx_coal_ticks */
+	int rx_max_bds; }	/* stored in sc->bge_rx_max_coal_bds */
+bge_rx_threshes[] = {	/* indexed by level; { rx_ticks, rx_max_bds } */
+	{ 32,   2 },
+	{ 50,   4 },
+	{ 100,  8 },
+	{ 192, 16 },
+	{ 416, 32 },
+	{ 598, 46 }
+};
+#define NBGE_RX_THRESH (sizeof(bge_rx_threshes) / sizeof(bge_rx_threshes[0]))
+
+/* XXX patchable; should be sysctl'able */
+int	bge_auto_thresh = 0;
+
 int bge_probe(struct device *, struct cfdata *, void *);
 void bge_attach(struct device *, struct device *, void *);
 void bge_release_resources(struct bge_softc *);
@@ -189,6 +234,9 @@
 
 void bge_reset(struct bge_softc *);
 
+void	bge_set_thresh(struct ifnet *  /*ifp*/, int /*lvl*/);
+void	bge_update_all_threshes(int /*lvl*/);
+
 void bge_dump_status(struct bge_softc *);
 void bge_dump_rxbd(struct bge_rx_bd *);
 
@@ -544,6 +592,60 @@
 }
 
 /*
+ * Update rx threshold levels to values in a particular slot
+ * of the interrupt-mitigation table bge_rx_threshes.
+ */
+void
+bge_set_thresh(struct ifnet *ifp, int lvl)
+{
+	struct bge_softc *sc = ifp->if_softc;
+	const struct bge_load_rx_thresh *th = &bge_rx_threshes[lvl];
+	int s;
+
+	/*
+	 * Only record the new Rx-interrupt thresholds and mark the
+	 * update pending; the chip registers are not written here.
+	 * Writing them directly (even at splhigh()) was observed to
+	 * occasionally cause glitches where Rx-interrupts are not
+	 * honoured for up to 10 seconds. jonathan@netbsd.org, 2003-04-05
+	 */
+	s = splnet();
+	sc->bge_rx_coal_ticks = th->rx_ticks;
+	sc->bge_rx_max_coal_bds = th->rx_max_bds;
+	sc->bge_pending_rxintr_change = 1;
+	splx(s);
+}
+
+
+/*
+ * Update Rx thresholds of all bge devices
+ */
+void
+bge_update_all_threshes(int lvl)
+{
+	struct ifnet *ifp;
+	const char * const namebuf = "bge";
+	int namelen;
+
+	/* Clamp lvl to a valid index into bge_rx_threshes[]. */
+	if (lvl < 0)
+		lvl = 0;
+	else if (lvl >= (int)NBGE_RX_THRESH)
+		lvl = (int)NBGE_RX_THRESH - 1;
+
+	namelen = strlen(namebuf);
+	/* Walk every interface; act on those whose name begins "bge". */
+	TAILQ_FOREACH(ifp, &ifnet, if_list) {
+		if (strncmp(ifp->if_xname, namebuf, namelen) != 0)
+			continue;
+		/* Match.  Update only when auto-tuning is enabled; pass ifp
+		 * itself, since bge_set_thresh() derefs ifp->if_softc. */
+		if (bge_auto_thresh)
+			bge_set_thresh(ifp, lvl);
+	}
+}
+
+/*
  * Handle events that have triggered interrupts.
  */
 void
@@ -2765,6 +2867,7 @@
 	}
 
 	if (sc->bge_pending_rxintr_change) {
+
 		uint32_t rx_ticks = sc->bge_rx_coal_ticks;
 		uint32_t rx_bds = sc->bge_rx_max_coal_bds;
 		uint32_t junk;