Subject: Re: Hashing IP reassembly queues, phase 2 of 2: fragmeDoS
To: None <tech-net@NetBSD.org>
From: Jonathan Stone <jonathan@DSG.Stanford.EDU>
List: tech-net
Date: 12/13/2003 14:25:21
Here is a revised version of the patch which implements a `drop half'
strategy when the IP reassembly queue experiences mbuf fragment pressure.


This version includes the two improvements I mentioned before.  I
already checked in changes to -current to maintain both a total count of
all fragments in the reassembly queue, and a count of fragments in
each reassembly-queue chain (each fragmented packet).

Median computation is now folded into the loop which decrements
reassembly TTL once every ip_slowtimo(). Decrementing TTL requires
walking the chain of reassembly queues (fragmented packets) anyway,
and the marginal cost to compute median TTL is trivial.

There is also code in ip_reass(), inside ``#ifdef notyet'', which will
make every call to ip_reass() check for fragment overflow, and if
found, execute a drop-half phase. Once the code has settled in, I plan
to enable that code, and disable the ip_maxfragpackets limit by setting
ip_maxfragpackets to 0.
 
I typically need to be able to support a thousand NFS clients or
so, and the limit of 200 reassembly packets is far too low for worst-case
traffic in that configuration. If I up ip_maxfragpackets to 2000, it
becomes completely ineffective; whereas the ip_maxfrags/drop-half makes
it unnecessary, too.

Any further comments before I commit this?



Index: ip_input.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/ip_input.c,v
retrieving revision 1.192
diff -u -r1.192 ip_input.c
--- ip_input.c	2003/12/08 02:23:27	1.192
+++ ip_input.c	2003/12/13 04:02:13
@@ -247,7 +247,23 @@
 int	ip_nfragpackets = 0;
 int	ip_maxfragpackets = 200;
 int	ip_nfrags = 0;         /* total fragments in reass queues */
+int	ip_maxfrags = 0;         /* limit on total fragments in reass queues */
 
+/*
+ * Additive-Increase/Multiplicative-Decrease (AIMD) strategy for
+ * IP reassembly queue buffer management.
+ * 
+ * We keep a count of total IP fragments (NB: not fragmented packets!)
+ * awaiting reassembly (ip_nfrags) and a limit (ip_maxfrags) on fragments.
+ * If ip_nfrags exceeds the limit ip_maxfrags, we drop half the
+ * total fragments in reassembly queues. This AIMD policy avoids
+ * repeatedly deleting single packets under heavy fragmentation load
+ * (e.g., from lossy NFS peers).
+ */
+static u_int	ip_reass_ttl_decr __P((u_int ticks)); 
+static void	ip_reass_drophalf __P((void));
+
+
 static __inline int ipq_lock_try __P((void));
 static __inline void ipq_unlock __P((void));
 
@@ -375,7 +391,9 @@
 	    	LIST_INIT(&ipq[i]);
 
 	ip_id = time.tv_sec & 0xfffff;
+
 	ipintrq.ifq_maxlen = ipqmaxlen;
+	ip_maxfrags = nmbclusters / 4;
 	TAILQ_INIT(&in_ifaddrhead);
 	in_ifaddrhashtbl = hashinit(IN_IFADDR_HASH_SIZE, HASH_LIST, M_IFADDR,
 	    M_WAITOK, &in_ifaddrhash);
@@ -1003,6 +1021,11 @@
 	m->m_data += hlen;
 	m->m_len -= hlen;
 
+#ifdef	notyet
+	if (ip_nfrags >= ip_maxfrags)
+		ip_reass_drophalf();
+#endif
+
 	/*
 	 * We are about to add a fragment; increment frag count.
 	 */
@@ -1201,30 +1224,93 @@
 }
 
 /*
- * IP timer processing;
- * if a timer expires on a reassembly
- * queue, discard it.
+ * IP reassembly TTL machinery for multiplicative drop.
  */
-void
-ip_slowtimo()
+static u_int	fragttl_histo[(IPFRAGTTL+1)];
+
+
+/*
+ * Decrement TTL of all reassembly queue entries by `ticks'.
+ * Count number of distinct fragments (as opposed to partial, fragmented
+ * datagrams) in the reassembly queue.  While we traverse the entire
+ * reassembly queue, compute and return the median TTL over all fragments.
+ */
+static u_int
+ip_reass_ttl_decr(u_int ticks)
 {
-	static u_int dropscanidx = 0;
-	u_int i;
+	u_int i, nfrags, median;
 	struct ipq *fp, *nfp;
-	int s = splsoftnet();
 
-	IPQ_LOCK();
+	nfrags = 0;
+	memset(fragttl_histo, 0, sizeof fragttl_histo);
+	
 	for (i = 0; i < IPREASS_NHASH; i++) {
 		for (fp = LIST_FIRST(&ipq[i]); fp != NULL; fp = nfp) {
+			fp->ipq_ttl = ((fp->ipq_ttl  <= ticks) ?
+				       0 : fp->ipq_ttl - ticks);
 			nfp = LIST_NEXT(fp, ipq_q);
-			if (--fp->ipq_ttl == 0) {
+			if (fp->ipq_ttl == 0) {
 				ipstat.ips_fragtimeout++;
 				ip_freef(fp);
+			} else {
+				nfrags += fp->ipq_nfrags;
+				fragttl_histo[fp->ipq_ttl] += fp->ipq_nfrags;
 			}
 		}
 	}
+
+	KASSERT(ip_nfrags == nfrags);
+
+	/* find median in histogram */
+	for (i = 0, median = 0; i <= IPFRAGTTL; i++) {
+		median +=  fragttl_histo[i];
+		if (median * 2 >= ip_nfrags)
+			break;
+	}
+
+	return (u_int)i;
+}
+
+void
+ip_reass_drophalf(void)
+{
+
+	u_int median_ticks;
+	/*
+	 * Compute median TTL of all fragments, and count frags
+	 * with that TTL or lower (roughly half of all fragments).
+	 */
+	median_ticks = ip_reass_ttl_decr(0);
+
+	/* Drop half. */
+	median_ticks = ip_reass_ttl_decr(median_ticks);
+
+}
+
+/*
+ * IP timer processing;
+ * if a timer expires on a reassembly
+ * queue, discard it.
+ */
+void
+ip_slowtimo()
+{
+	static u_int dropscanidx = 0;
+	u_int i;
+	u_int median_ttl;
+	int s = splsoftnet();
+
+	IPQ_LOCK();
+
+	/* Age TTL of all fragments by 1 tick. */
+	median_ttl = ip_reass_ttl_decr(1);
+
+	/* If we have too many fragments, drop the older half. */
+	if (ip_nfrags > ip_maxfrags)
+		ip_reass_ttl_decr(median_ttl);
+
 	/*
-	 * If we are over the maximum number of fragments
+	 * If we are over the maximum number of fragmented packets
 	 * (due to the limit being lowered), drain off
 	 * enough to get down to the new limit. Start draining
 	 * from the reassembly hashqueue most recently drained.
@@ -1263,7 +1349,6 @@
 void
 ip_drain()
 {
-	int i;
 
 	/*
 	 * We may be called from a device's interrupt context.  If
@@ -1272,15 +1357,11 @@
 	if (ipq_lock_try() == 0)
 		return;
 
-	for (i = 0; i < IPREASS_NHASH; i++) {
-		struct ipqhead *ipqh = &ipq[i];
-		struct ipq *fp, *nfp;
-		for (fp = LIST_FIRST(ipqh); fp != NULL; fp = nfp) {
-			nfp = LIST_NEXT(fp, ipq_q);
-			ip_freef(fp);
-			ipstat.ips_fragdropped++;
-		}
-	}
+	/*
+	 * Drop half the total fragments now. If more mbufs are needed,
+	 * we will be called again soon.
+	 */
+	ip_reass_drophalf();
 
 	IPQ_UNLOCK();
 }