Subject: Hashing IP reassembly queues, phase 1 of 2: hash.
To: None <tech-net@netbsd.org>
From: Jonathan Stone <jonathan@DSG.Stanford.EDU>
List: tech-net
Date: 12/05/2003 15:07:51
One further item I've observed over a couple of years of pounding NFS:
with moderate-to-large numbers of UDP clients, a hashed IP reassembly
queue (instead of a global linked list) makes a small but measurable
improvement.

The following patch is based on code I've used for some time,
re-whacked slightly to resemble the FreeBSD version of the same idea.
In particular, note that one would normally use a prime modulus for a
hash; but certain CPUs some of us care about don't have hardware
divide. Therefore I used a cheap, power-of-two hash (same as FreeBSD).

This is the first of two patches;. A companion patch adds robustness
against buffer exhaustion due to (observed) Linux lameness and (as-yet
hypothetical) denial-of-service attacks.  This patch adds just the
hashed list of reassembly queues; certain optimizations (such as
maintaining an up-to-date counter of fragments) are left for the
companion patch. (The drain code is untested, and is replaced entirely
in the companion patch).

I'd appreciate feedback on this (collective work-in-progress, not
finished product) but I would like to commit it this-yearish.


--- /sys/netinet/ip_var.h.orig	2003-11-17 14:07:26.000000000 -0800
+++ /sys/netinet/ip_var.h	2003-12-04 16:17:07.000000000 -0800
@@ -192,7 +192,7 @@
 #endif
 
 extern struct ipstat ipstat;		/* ip statistics */
-extern LIST_HEAD(ipqhead, ipq) ipq;	/* ip reass. queue */
+extern LIST_HEAD(ipqhead, ipq) ipq[];	/* ip reass. queue */
 extern int   ip_defttl;			/* default IP ttl */
 extern int   ipforwarding;		/* ip forwarding */
 extern int   ip_mtudisc;		/* mtu discovery */
@@ -227,7 +227,7 @@
 int	 ip_fragment(struct mbuf *, struct ifnet *, u_long);
 int	 ip_pcbopts __P((struct mbuf **, struct mbuf *));
 struct mbuf *
-	 ip_reass __P((struct ipqent *, struct ipq *));
+	 ip_reass __P((struct ipqent *, struct ipq *, struct ipqhead *));
 struct in_ifaddr *
 	 ip_rtaddr __P((struct in_addr));
 void	 ip_savecontrol __P((struct inpcb *, struct mbuf **, struct ip *,


--- /sys/netinet/ip_input.c.orig	2003-11-19 19:29:57.000000000 -0800
+++ /sys/netinet/ip_input.c	2003-12-04 22:58:18.000000000 -0800
@@ -238,7 +238,16 @@
 struct pfil_head inet_pfil_hook;
 #endif
 
-struct ipqhead ipq;
+/* IP datagram reassembly queues (hashed) */
+#define IPREASS_NHASH_LOG2      6
+#define IPREASS_NHASH           (1 << IPREASS_NHASH_LOG2)
+#define IPREASS_HMASK           (IPREASS_NHASH - 1)
+#define IPREASS_HASH(x,y) \
+	(((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK)
+struct ipqhead ipq[IPREASS_NHASH];
+#ifdef notyet
+static int    nipq = 0;         /* total # of reass queues */
+#endif
 int	ipq_locked;
 int	ip_nfragpackets = 0;
 int	ip_maxfragpackets = 200;
@@ -365,7 +374,9 @@
 		if (pr->pr_domain->dom_family == PF_INET &&
 		    pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW)
 			ip_protox[pr->pr_protocol] = pr - inetsw;
-	LIST_INIT(&ipq);
+	for (i = 0; i < IPREASS_NHASH; i++)
+	    	LIST_INIT(&ipq[i]);
+
 	ip_id = time.tv_sec & 0xfffff;
 	ipintrq.ifq_maxlen = ipqmaxlen;
 	TAILQ_INIT(&in_ifaddrhead);
@@ -439,6 +450,7 @@
 	int downmatch;
 	int checkif;
 	int srcrt = 0;
+	u_int hash;
 #ifdef FAST_IPSEC
 	struct m_tag *mtag;
 	struct tdb_ident *tdbi;
@@ -823,12 +835,17 @@
 		 * of this datagram.
 		 */
 		IPQ_LOCK();
-		LIST_FOREACH(fp, &ipq, ipq_q)
+		hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id);
+		/* XXX LIST_FOREACH(fp, &ipq[hash], ipq_q) */
+		for (fp = LIST_FIRST(&ipq[hash]); fp != NULL;
+		     fp = LIST_NEXT(fp, ipq_q)) {
 			if (ip->ip_id == fp->ipq_id &&
 			    in_hosteq(ip->ip_src, fp->ipq_src) &&
 			    in_hosteq(ip->ip_dst, fp->ipq_dst) &&
 			    ip->ip_p == fp->ipq_p)
 				goto found;
+
+		}
 		fp = 0;
 found:
 
@@ -869,7 +886,7 @@
 			ipqe->ipqe_mff = mff;
 			ipqe->ipqe_m = m;
 			ipqe->ipqe_ip = ip;
-			m = ip_reass(ipqe, fp);
+			m = ip_reass(ipqe, fp, &ipq[hash]);
 			if (m == 0) {
 				IPQ_UNLOCK();
 				return;
@@ -966,9 +983,10 @@
  * is given as fp; otherwise have to make a chain.
  */
 struct mbuf *
-ip_reass(ipqe, fp)
+ip_reass(ipqe, fp, ipqhead)
 	struct ipqent *ipqe;
 	struct ipq *fp;
+	struct ipqhead *ipqhead;
 {
 	struct mbuf *m = ipqe->ipqe_m;
 	struct ipqent *nq, *p, *q;
@@ -1005,7 +1023,7 @@
 		    M_FTABLE, M_NOWAIT);
 		if (fp == NULL)
 			goto dropfrag;
-		LIST_INSERT_HEAD(&ipq, fp, ipq_q);
+		LIST_INSERT_HEAD(ipqhead, fp, ipq_q);
 		fp->ipq_ttl = IPFRAGTTL;
 		fp->ipq_p = ipqe->ipqe_ip->ip_p;
 		fp->ipq_id = ipqe->ipqe_ip->ip_id;
@@ -1171,27 +1189,47 @@
 void
 ip_slowtimo()
 {
+	static unsigned dropscanidx = 0;
+	unsigned i;
 	struct ipq *fp, *nfp;
 	int s = splsoftnet();
 
 	IPQ_LOCK();
-	for (fp = LIST_FIRST(&ipq); fp != NULL; fp = nfp) {
-		nfp = LIST_NEXT(fp, ipq_q);
-		if (--fp->ipq_ttl == 0) {
-			ipstat.ips_fragtimeout++;
-			ip_freef(fp);
+	for (i = 0; i < IPREASS_NHASH; i++) {
+		for (fp = LIST_FIRST(&ipq[i]); fp != NULL; fp = nfp) {
+			nfp = LIST_NEXT(fp, ipq_q);
+			if (--fp->ipq_ttl == 0) {
+				ipstat.ips_fragtimeout++;
+				ip_freef(fp);
+			}
 		}
 	}
 	/*
 	 * If we are over the maximum number of fragments
 	 * (due to the limit being lowered), drain off
-	 * enough to get down to the new limit.
+	 * enough to get down to the new limit. Start draining
+	 * from the reassembly hashqueue most recently drained.
 	 */
 	if (ip_maxfragpackets < 0)
 		;
 	else {
-		while (ip_nfragpackets > ip_maxfragpackets && LIST_FIRST(&ipq))
-			ip_freef(LIST_FIRST(&ipq));
+		int wrapped = 0;
+
+		i = dropscanidx;
+		while (ip_nfragpackets > ip_maxfragpackets && wrapped == 0) {
+			while (LIST_FIRST(&ipq[i]) != NULL)
+				ip_freef(LIST_FIRST(&ipq[0]));
+			if (++i >= IPREASS_NHASH) {
+				i = 0;
+			}
+			/*
+			 * Dont scan forever even if fragment counters are
+			 * wrong: stop after scanning entire reassembly queue.
+			 */
+			if (i == dropscanidx)
+			    wrapped = 1;
+		}
+		dropscanidx = i;
 	}
 	IPQ_UNLOCK();
 #ifdef GATEWAY
@@ -1206,6 +1244,7 @@
 void
 ip_drain()
 {
+	int i;
 
 	/*
 	 * We may be called from a device's interrupt context.  If
@@ -1214,11 +1253,15 @@
 	if (ipq_lock_try() == 0)
 		return;
 
-	while (LIST_FIRST(&ipq) != NULL) {
-		ipstat.ips_fragdropped++;
-		ip_freef(LIST_FIRST(&ipq));
+	for (i = 0; i < IPREASS_NHASH; i++) {
+		struct ipqhead *ipqh = &ipq[i];
+		struct ipq *fp, *nfp;
+		for (fp = LIST_FIRST(ipqh); fp != NULL; fp = nfp) {
+			nfp = LIST_NEXT(fp, ipq_q);
+			ip_freef(fp);
+			ipstat.ips_fragdropped++;
+		}
 	}
-
 	IPQ_UNLOCK();
 }