Subject: Re: Hashing IP reassembly queues, phase 2 of 2: fragmeDoS
To: Jonathan Stone <jonathan@DSG.Stanford.EDU>
From: Jason Thorpe <thorpej@wasabisystems.com>
List: tech-net
Date: 12/13/2003 14:41:31
--Apple-Mail-18-706294300
Content-Transfer-Encoding: 7bit
Content-Type: text/plain; charset=US-ASCII; format=flowed


On Dec 13, 2003, at 2:25 PM, Jonathan Stone wrote:

> Here is a revised version of the patch which implements a `drop half'
> strategy when the IP reassembly queue experiences mbuf fragment 
> pressure.
>
>
> This version includes the two improvements I mentioned before.  I
> already checked in changes to -current to maintain both a total count 
> of
> all fragments in the reassembly queue, and a count of fragment in
> each reasembly-queue chain (each fragmented packet).

All good.  A couple of minor things below.  Otherwise, looks great to 
me!

> Index: ip_input.c
> ===================================================================
> RCS file: /cvsroot/src/sys/netinet/ip_input.c,v
> retrieving revision 1.192
> diff -u -r1.192 ip_input.c
> --- ip_input.c	2003/12/08 02:23:27	1.192
> +++ ip_input.c	2003/12/13 04:02:13
> @@ -247,7 +247,23 @@
>  int	ip_nfragpackets = 0;
>  int	ip_maxfragpackets = 200;
>  int	ip_nfrags = 0;         /* total fragments in reass queues */
> +int	ip_maxfrags = 0;         /* total fragments in reass queues */

Don't initialize ip_nfrags to 0.  Let it default to 0 by being in the 
BSS segment.  You would never want anyone to patch that vars, so don't 
make it patchable.  I think the same thing applies to ip_nfragpackets.


>
> +/*
> + * Additive-Increase/Multiplicative-Decrease (AIMD) strategy for
> + * IP reassembly queue buffer managment.
> + *
> + * We keep a count of total IP fragments (NB: not fragmented packets!)
> + * awaiting reassembly (ip_nfrags) and a limit (ip_maxfrags) on 
> fragments.
> + * If ip_nfrags exceeds ip_maxfrags the limit, we drop half the
> + * total fragments in  reassembly queues.This AIMD policy avoids
> + * repeatedly deleting single packets under heavy fragmentation load
> + * (e.g., from lossy NFS peers).
> + */
> +static u_int	ip_reass_ttl_decr __P((u_int ticks));
> +static void	ip_reass_drophalf __P((void));
> +
> +
>  static __inline int ipq_lock_try __P((void));
>  static __inline void ipq_unlock __P((void));
>
> @@ -375,7 +391,9 @@
>  	    	LIST_INIT(&ipq[i]);
>
>  	ip_id = time.tv_sec & 0xfffff;
> +
>  	ipintrq.ifq_maxlen = ipqmaxlen;
> +	ip_maxfrags = nmbclusters / 4;

On some arch's, nmbclusters can change at run-time (MIPS and Alpha fall 
into this category), so you might want to figure out some way to handle 
that.  Just put an XXX comment there for now, maybe.

>  	TAILQ_INIT(&in_ifaddrhead);
>  	in_ifaddrhashtbl = hashinit(IN_IFADDR_HASH_SIZE, HASH_LIST, M_IFADDR,
>  	    M_WAITOK, &in_ifaddrhash);
> @@ -1003,6 +1021,11 @@
>  	m->m_data += hlen;
>  	m->m_len -= hlen;
>
> +#ifdef	notyet
> +	if (ip_nfrags >= ip_maxfrags)
> +		ip_reass_drophalf(void);
> +#endif
> +
>  	/*
>  	 * We are about to add a fragment; increment frag count.
>  	 */
> @@ -1201,30 +1224,93 @@
>  }
>
>  /*
> - * IP timer processing;
> - * if a timer expires on a reassembly
> - * queue, discard it.
> + * IP reassembly TTL machinery for  multiplicative drop.
>   */
> -void
> -ip_slowtimo()
> +static u_int	fragttl_histo[(IPFRAGTTL+1)];
> +
> +
> +/*
> + * Decrement TTL of all reasembly queue entries by `ticks'.
> + * Count number of distinct fragments (as opposed to partial, 
> fragmented
> + * datagrams) in the reassembly queue.  While we  traverse the entire
> + * reassembly queue, compute and return the median TTL over all 
> fragments.
> + */
> +static u_int
> +ip_reass_ttl_decr(u_int ticks)
>  {
> -	static u_int dropscanidx = 0;
> -	u_int i;
> +	u_int i, nfrags, median;
>  	struct ipq *fp, *nfp;
> -	int s = splsoftnet();
>
> -	IPQ_LOCK();
> +	nfrags = 0;
> +	memset(fragttl_histo, 0, sizeof fragttl_histo);
> +	
>  	for (i = 0; i < IPREASS_NHASH; i++) {
>  		for (fp = LIST_FIRST(&ipq[i]); fp != NULL; fp = nfp) {
> +			fp->ipq_ttl = ((fp->ipq_ttl  <= ticks) ?
> +				       0 : fp->ipq_ttl - ticks);
>  			nfp = LIST_NEXT(fp, ipq_q);
> -			if (--fp->ipq_ttl == 0) {
> +			if (fp->ipq_ttl == 0) {
>  				ipstat.ips_fragtimeout++;
>  				ip_freef(fp);
> +			} else {
> +				nfrags += fp->ipq_nfrags;
> +				fragttl_histo[fp->ipq_ttl] += fp->ipq_nfrags;
>  			}
>  		}
>  	}
> +
> +	KASSERT(ip_nfrags == nfrags);
> +
> +	/* find median in histogram */
> +	for (i = 0, median = 0; i <= IPFRAGTTL; i++) {
> +		median +=  fragttl_histo[i];
> +		if (median * 2 >= ip_nfrags)
> +			break;
> +	}
> +
> +	return (u_int)i;
> +}
> +
> +void
> +ip_reass_drophalf(void)
> +{
> +
> +	u_int median_ticks;
> +	/*
> +	 * Compute median TTL of all fragments, and count frags
> +	 * with that TTL or lower (roughly half of all fragments).
> +	 */
> +	median_ticks = ip_reass_ttl_decr(0);
> +
> +	/* Drop half. */
> +	median_ticks = ip_reass_ttl_decr(median_ticks);
> +
> +}
> +
> +/*
> + * IP timer processing;
> + * if a timer expires on a reassembly
> + * queue, discard it.
> + */
> +void
> +ip_slowtimo()
> +{
> +	static u_int dropscanidx = 0;
> +	u_int i;
> +	u_int median_ttl;
> +	int s = splsoftnet();
> +
> +	IPQ_LOCK();
> +
> +	/* Age TTL of all fragments by 1 tick .*/
> +	median_ttl = ip_reass_ttl_decr(1);
> +
> +	/* If we have too many fragments, drop the older half. */
> +	if (ip_nfrags > ip_maxfrags)
> +		ip_reass_ttl_decr(median_ttl);
> +
>  	/*
> -	 * If we are over the maximum number of fragments
> +	 * If we are over the maximum number of fragmented packets
>  	 * (due to the limit being lowered), drain off
>  	 * enough to get down to the new limit. Start draining
>  	 * from the reassembly hashqueue most recently drained.
> @@ -1263,7 +1349,6 @@
>  void
>  ip_drain()
>  {
> -	int i;
>
>  	/*
>  	 * We may be called from a device's interrupt context.  If
> @@ -1272,15 +1357,11 @@
>  	if (ipq_lock_try() == 0)
>  		return;
>
> -	for (i = 0; i < IPREASS_NHASH; i++) {
> -		struct ipqhead *ipqh = &ipq[i];
> -		struct ipq *fp, *nfp;
> -		for (fp = LIST_FIRST(ipqh); fp != NULL; fp = nfp) {
> -			nfp = LIST_NEXT(fp, ipq_q);
> -			ip_freef(fp);
> -			ipstat.ips_fragdropped++;
> -		}
> -	}
> +	/*
> +	 * Drop half the total fragments now. If more mbufs are needed,
> +	 *  we will be called again soon.
> +	 */
> +	ip_reass_drophalf();
>
>  	IPQ_UNLOCK();
>  }
>
         -- Jason R. Thorpe <thorpej@wasabisystems.com>


--Apple-Mail-18-706294300
content-type: application/pgp-signature; x-mac-type=70674453;
	name=PGP.sig
content-description: This is a digitally signed message part
content-disposition: inline; filename=PGP.sig
content-transfer-encoding: 7bit

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.3 (Darwin)

iD8DBQE/25WbOpVKkaBm8XkRAnTIAJ4sSzKglxUh3ANEOQxG3lZJ2hcapACfa2PO
tkFpeeMS4SwgAJA5E+nX70Q=
=pbDz
-----END PGP SIGNATURE-----

--Apple-Mail-18-706294300--