Subject: Hashing IP reassembly queues, phase 2 of 2: fragment DoS
To: tech-net@netbsd.org
From: Jonathan Stone <jonathan@DSG.Stanford.EDU>
List: tech-net
Date: 12/05/2003 16:37:29
Here is the second change to IP reassembly: each newly-arrived
fragment adds one to the total fragment count (an additive increase).
To guarantee stability (and to avoid consuming excessive time
"cleaning" fragments), the reassembly code now executes a
multiplicative drop (drop-half) when the total fragment count exceeds
a threshold.
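
In outline, the control law is just the following (a minimal sketch
with made-up names; the real patch below selects victims by residual
TTL rather than trimming a bare counter):

	#define DROP_THRESHOLD	256	/* stand-in for nmbclusters / 4 */

	static unsigned nfrags;		/* fragments currently queued */

	static void
	frag_arrived(void)
	{
		nfrags++;			/* additive increase ... */
		if (nfrags > DROP_THRESHOLD)
			nfrags -= nfrags / 2;	/* ... multiplicative decrease */
	}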

I implemented this multiplicative-drop strategy after observing
continual mbuf exhaustion caused by a small number of Linux (circa RH
7.3) clients: that is, the problem it solves is real, and sufficiently
well-known that commercial Unix/NFS-server vendors have released
patches to address it.  Other strategies did not solve the problem in
a satisfactory or scalable way.  The method works well empirically, in
addition to being founded on control-theory principles which are well
understood in the context of TCP (AIMD).

I decided to go further and drop the oldest half of all packets, based
on observed traffic behaviour from the aforementioned NFS clients.


Think of this patch as "shipped but unpolished". I know of two ways
it could be improved (sketched in code after the list):

1.   Maintain a count of total fragments: add one to the total
     for each new fragment inserted. Keep a counter in each
     reassembly entry, and decrement the total by that counter
     when a packet is completed or dropped.  Then we only execute
     the drop-half scan when the total exceeds the threshold; the
     incremental cost of keeping the counters is balanced against
     scanning the entire list in every call to ip_slowtimo().

2.  Make the "drop half" threshold a parameter.
    That will allow ip_slowtimo() to request "drop half the fragments
    from the queue, if and only if we exceed the threshold",
    whereas ip_drain() can request an unconditional drop
    (drop the oldest half, rather than dropping all fragments).
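
For concreteness, a rough sketch of both changes (ipq_nfrags and
ip_nfrags are hypothetical names, not in the patch below):

	static int ip_nfrags;	/* running total of fragments queued */

	/*
	 * Bookkeeping (improvement 1), so the common case never
	 * walks the reassembly lists:
	 *   on fragment insert:  fp->ipq_nfrags++; ip_nfrags++;
	 *   in ip_freef(fp):     ip_nfrags -= fp->ipq_nfrags;
	 */

	/*
	 * Improvement 2: ip_slowtimo() calls this with
	 * ip_frag_drainthresh; ip_drain() calls it with 0 to force
	 * an unconditional drop-half.
	 */
	static void
	ip_reass_drophalf(int threshold)
	{
		unsigned median_ticks;

		if (ip_nfrags <= threshold)
			return;

		/* age all fragments by the median TTL: drops ~half */
		ip_reass_ttl_median(&median_ticks);
		ip_reass_ttl_decr(median_ticks);
	}

Note the sketch checks the running total rather than the
at-or-below-median count the current patch uses, so the threshold
would need roughly doubling to keep the same trigger point.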

I plan to implement both of these, and also to clean up the total
reporting from the median-TTL computation.

Assuming that's done, any other comments before this is cleaned up
and committed?

--- sys/netinet/ip_input.c.HASH	2003-12-04 22:58:18.000000000 -0800
+++ sys/netinet/ip_input.c	2003-12-05 00:14:23.000000000 -0800
@@ -252,6 +252,21 @@
 int	ip_nfragpackets = 0;
 int	ip_maxfragpackets = 200;
 
+/*
+ * Additive-increase/multiplicative-decrease drop strategy for
+ * the IP reassembly queue:
+ * There is a limit on the number of IP fragments (NB: not fragmented
+ * packets!) we will hold for IP reassembly.  If we receive more
+ * fragments than the limit, we execute a multiplicative-decrease
+ * drop phase, to ensure stability under fragmentation denial-of-
+ * service attacks, or from peers whose "best effort" approaches a DoS.
+ */
+static int ip_frag_drainthresh;	/* nmbclusters / 4 */
+
+static int	ip_reass_ttl_decr __P((unsigned ticks)); 
+static unsigned	ip_reass_ttl_median __P((unsigned *ticks));
+static void	ip_reass_drophalf __P((void));
+
 static __inline int ipq_lock_try __P((void));
 static __inline void ipq_unlock __P((void));
 
@@ -1182,6 +1197,118 @@
 }
 
 /*
+ * IP reassembly TTL machinery for multiplicative drop.
+ */
+unsigned	fragttl_histo[(IPFRAGTTL+1)];
+
+
+/*
+ * Decrement TTL of all reassembly queue entries by `ticks'.
+ * Count number of distinct fragments (as opposed to partial, fragmented
+ * datagrams) in the reassembly queue. 
+ * XXX: might as well compute TTL histogram for TTL median computation?
+ */
+static int
+ip_reass_ttl_decr(unsigned ticks)
+{
+	int i;
+	register struct ipq *fp, *nfp;
+	int nfrags = 0;
+
+	for (i = 0; i < IPREASS_NHASH; i++) {
+		for (fp = LIST_FIRST(&ipq[i]); fp != NULL; fp = nfp) {
+			register struct ipqent *ipqe;
+			fp->ipq_ttl = ((fp->ipq_ttl <= ticks) ?
+				       0 : fp->ipq_ttl - ticks);
+			nfp = LIST_NEXT(fp, ipq_q);
+			if (fp->ipq_ttl == 0) {
+				ipstat.ips_fragtimeout++;
+				ip_freef(fp);
+			}
+			else
+				for (ipqe = TAILQ_FIRST(&fp->ipq_fragq);
+				     ipqe != NULL; ipqe = TAILQ_NEXT(ipqe, ipqe_q))
+					nfrags++;
+		}
+	}
+	return nfrags;
+}
+
+/*
+ * Compute median TTL of all fragments in reassembly queue.
+ * (Count individual fragments, not datagrams.)
+ */
+static unsigned
+ip_reass_ttl_median(unsigned *ticks)
+{
+	int i;
+	int nfragpkts, totfrags;
+	unsigned median;
+	bzero(fragttl_histo, sizeof fragttl_histo);
+
+	nfragpkts = 0;
+	totfrags = 0;
+
+	/* compute histogram */
+	for (i = 0; i < IPREASS_NHASH; i++) {
+		struct ipq *fp;
+		fp = LIST_FIRST(&ipq[i]);
+		if (fp == NULL)
+			continue;
+		for ( ; fp != NULL; ) {
+			register struct ipqent *ipqe;
+			int fp_nfrags = 0;
+
+			nfragpkts++;
+			for (ipqe = TAILQ_FIRST(&fp->ipq_fragq);
+			     ipqe != NULL; ipqe = TAILQ_NEXT(ipqe, ipqe_q))
+				fp_nfrags++;
+
+			fragttl_histo[fp->ipq_ttl] += fp_nfrags;
+			totfrags += fp_nfrags;
+			fp = LIST_NEXT(fp, ipq_q);
+		}
+	}
+
+	/* find median in histogram */
+	for (i = 0, median = 0; i <= IPFRAGTTL; i++) {
+		median += fragttl_histo[i];
+		if (median * 2 >= totfrags)
+			break;
+	}
+
+	if (ticks != NULL)
+		*ticks = i;
+	return median;
+}
+
+/*
+ * Drop half the total fragments in the IP reassembly queue.
+ * Compute the median ttl value over all fragments, and count the
+ * total number of fragments with TTLs at or below that median.
+ * If that count is above our threshold, age all fragment TTLs
+ * by the median TTL value.  Thus, if the threshold is exceeded,
+ * half of all fragments (packet buffers, typically mbuf clusters)
+ * will be dropped; and the dropped fragments will be the older ones
+ * (those with lowest residual TTL).
+ */
+static void 
+ip_reass_drophalf(void)
+{
+	unsigned half_total_frags;
+	unsigned median_ticks = 0;
+
+	/*
+	 * Compute median TTL of all fragments, and count frags
+	 * with that TTL or lower (roughly half of all fragments).
+	 */
+	half_total_frags = ip_reass_ttl_median(&median_ticks);
+
+	if (half_total_frags > ip_frag_drainthresh)   
+		ip_reass_ttl_decr(median_ticks);
+}
+
+/*
  * IP timer processing;
  * if a timer expires on a reassembly
  * queue, discard it.
@@ -1191,21 +1318,18 @@
 {
 	static unsigned dropscanidx = 0;
 	unsigned i;
-	struct ipq *fp, *nfp;
 	int s = splsoftnet();
 
 	IPQ_LOCK();
-	for (i = 0; i < IPREASS_NHASH; i++) {
-		for (fp = LIST_FIRST(&ipq[i]); fp != NULL; fp = nfp) {
-			nfp = LIST_NEXT(fp, ipq_q);
-			if (--fp->ipq_ttl == 0) {
-				ipstat.ips_fragtimeout++;
-				ip_freef(fp);
-			}
-		}
-	}
+
+	/* Age TTL of all fragments by 1 tick */
+	ip_reass_ttl_decr(1);
+
+	/* If too many fragments, drop the older half */
+	ip_reass_drophalf();
+
 	/*
-	 * If we are over the maximum number of fragments
+	 * If we are over the maximum number of fragmented packets
 	 * (due to the limit being lowered), drain off
 	 * enough to get down to the new limit. Start draining
 	 * from the reassembly hashqueue most recently drained.