Subject: Re: TCP ECN diff
To: Rui Paulo <rpaulo@fnop.net>
From: Rui Paulo <rpaulo@fnop.net>
List: tech-net
Date: 08/31/2006 20:55:27
--Apple-Mail-3-505784705
Content-Transfer-Encoding: 7bit
Content-Type: text/plain;
	charset=US-ASCII;
	format=flowed


On Aug 31, 2006, at 8:32 PM, Rui Paulo wrote:

> This is only the kernel patch.

Attached is the whole patch.


--Apple-Mail-3-505784705
Content-Transfer-Encoding: 7bit
Content-Type: application/octet-stream;
	x-unix-mode=0755;
	name=netbsd-tcp-ecn.diff
Content-Disposition: attachment;
	filename=netbsd-tcp-ecn.diff

Index: lib/libc/gen/sysctl.3
===================================================================
RCS file: /cvsroot/src/lib/libc/gen/sysctl.3,v
retrieving revision 1.178
diff -u -p -r1.178 sysctl.3
--- lib/libc/gen/sysctl.3	11 Aug 2006 19:17:47 -0000	1.178
+++ lib/libc/gen/sysctl.3	31 Aug 2006 19:53:47 -0000
@@ -1335,6 +1335,10 @@ Global number of TCP SACK holes.
 Global maximum number of TCP SACK holes.
 .It Li tcp.sack.maxholes
 Maximum number of TCP SACK holes allowed per connection.
+.It Li tcp.ecn.enable
+If set to 1, enables RFC 3168 Explicit Congestion Notification.
+.It Li tcp.ecn.maxretries
+Number of times to retry sending the ECN-setup packet.
 .It Li tcp.sendspace
 The default TCP send buffer size.
 .It Li tcp.slowhz
Index: sbin/sysctl/sysctl.8
===================================================================
RCS file: /cvsroot/src/sbin/sysctl/sysctl.8,v
retrieving revision 1.151
diff -u -p -r1.151 sysctl.8
--- sbin/sysctl/sysctl.8	8 Aug 2006 22:11:42 -0000	1.151
+++ sbin/sysctl/sysctl.8	31 Aug 2006 19:53:47 -0000
@@ -459,6 +459,8 @@ privilege can change the value.
 .It net.inet.tcp.sack.globalholes	integer	no
 .It net.inet.tcp.sack.globalmaxholes	integer	yes
 .It net.inet.tcp.sack.maxholes	integer	yes
+.It net.inet.tcp.ecn.enable	integer	yes
+.It net.inet.tcp.ecn.maxretries	integer	yes
 .It net.inet.tcp.sendspace	integer	yes
 .It net.inet.tcp.slowhz	integer	no
 .It net.inet.tcp.syn_bucket_limit	integer	yes
Index: sys/netinet/ip.h
===================================================================
RCS file: /cvsroot/src/sys/netinet/ip.h,v
retrieving revision 1.27
diff -u -p -r1.27 ip.h
--- sys/netinet/ip.h	10 Dec 2005 23:36:23 -0000	1.27
+++ sys/netinet/ip.h	31 Aug 2006 19:53:47 -0000
@@ -76,11 +76,6 @@ struct ip {
 #define	IPTOS_THROUGHPUT	0x08
 #define	IPTOS_RELIABILITY	0x04
 /*	IPTOS_LOWCOST		0x02 XXX */
-#if 1
-/* ECN RFC3168 obsoletes RFC2481, and these will be deprecated soon. */
-#define IPTOS_CE		0x01	/* congestion experienced */
-#define IPTOS_ECT		0x02	/* ECN-capable transport */
-#endif
 
 /*
  * Definitions for IP precedence (also in ip_tos) (hopefully unused)
Index: sys/netinet/ip_ecn.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/ip_ecn.c,v
retrieving revision 1.14
diff -u -p -r1.14 ip_ecn.c
--- sys/netinet/ip_ecn.c	11 Dec 2005 12:24:57 -0000	1.14
+++ sys/netinet/ip_ecn.c	31 Aug 2006 19:53:47 -0000
@@ -67,10 +67,10 @@ ip_ecn_ingress(int mode, u_int8_t *outer
 	*outer = *inner;
 	switch (mode) {
 	case ECN_ALLOWED:		/* ECN allowed */
-		*outer &= ~IPTOS_CE;
+		*outer &= ~IPTOS_ECN_CE;
 		break;
 	case ECN_FORBIDDEN:		/* ECN forbidden */
-		*outer &= ~(IPTOS_ECT | IPTOS_CE);
+		*outer &= ~(IPTOS_ECN_ECT0 | IPTOS_ECN_CE);
 		break;
 	case ECN_NOCARE:	/* no consideration to ECN */
 		break;
@@ -88,8 +88,8 @@ ip_ecn_egress(int mode, const u_int8_t *
 
 	switch (mode) {
 	case ECN_ALLOWED:
-		if (*outer & IPTOS_CE)
-			*inner |= IPTOS_CE;
+		if (*outer & IPTOS_ECN_CE)
+			*inner |= IPTOS_ECN_CE;
 		break;
 	case ECN_FORBIDDEN:		/* ECN forbidden */
 	case ECN_NOCARE:	/* no consideration to ECN */
Index: sys/netinet/tcp.h
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp.h,v
retrieving revision 1.21
diff -u -p -r1.21 tcp.h
--- sys/netinet/tcp.h	10 Dec 2005 23:36:23 -0000	1.21
+++ sys/netinet/tcp.h	31 Aug 2006 19:53:47 -0000
@@ -66,8 +66,8 @@ struct tcphdr {
 #define	TH_PUSH	  0x08
 #define	TH_ACK	  0x10
 #define	TH_URG	  0x20
-#define	TH_ECE	  0x40			/* (unimplemented) */
-#define	TH_CWR	  0x80			/* (unimplemented) */
+#define	TH_ECE	  0x40
+#define	TH_CWR	  0x80
 	u_int16_t th_win;			/* window */
 	u_int16_t th_sum;			/* checksum */
 	u_int16_t th_urp;			/* urgent pointer */
Index: sys/netinet/tcp_input.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_input.c,v
retrieving revision 1.243
diff -u -p -r1.243 tcp_input.c
--- sys/netinet/tcp_input.c	7 Jun 2006 22:34:01 -0000	1.243
+++ sys/netinet/tcp_input.c	31 Aug 2006 19:53:48 -0000
@@ -70,7 +70,7 @@
  */
 
 /*-
- * Copyright (c) 1997, 1998, 1999, 2001, 2005 The NetBSD Foundation, Inc.
+ * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
@@ -78,6 +78,8 @@
  * Facility, NASA Ames Research Center.
  * This code is derived from software contributed to The NetBSD Foundation
  * by Charles M. Hannum.
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Rui Paulo.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -236,6 +238,8 @@ __KERNEL_RCSID(0, "$NetBSD: tcp_input.c,
 #endif
 #endif	/* FAST_IPSEC*/
 
+static inline void tcp_congestion_exp(struct tcpcb *);
+
 int	tcprexmtthresh = 3;
 int	tcp_log_refused;
 
@@ -405,6 +409,28 @@ tcpipqent_free(struct ipqent *ipqe)
 	splx(s);
 }
 
+/*
+ * Halve the congestion window and reduce the
+ * slow start threshold.
+ *
+ * If ECN is permitted on the connection, also flag that a CWR must be sent.
+ */
+static inline void
+tcp_congestion_exp(struct tcpcb *tp)
+{
+	u_int win;
+
+	win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_segsz;
+	if (win < 2)
+		win = 2;
+
+	tp->snd_ssthresh = win * tp->t_segsz;
+	tp->snd_recover = tp->snd_max;
+	tp->snd_cwnd = tp->snd_ssthresh;
+	if (TCP_ECN_ALLOWED(tp))
+		tp->t_flags |= TF_ECN_SND_CWR;
+}
+
 int
 tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen)
 {
@@ -977,6 +1003,7 @@ tcp_input(struct mbuf *m, ...)
 	int af;		/* af on the wire */
 	struct mbuf *tcp_saveti = NULL;
 	uint32_t ts_rtt;
+	uint8_t iptos;
 
 	MCLAIM(m, &tcp_rx_mowner);
 	va_start(ap, m);
@@ -1033,6 +1060,7 @@ tcp_input(struct mbuf *m, ...)
 		/* We do the checksum after PCB lookup... */
 		len = ntohs(ip->ip_len);
 		tlen = len - toff;
+		iptos = ip->ip_tos;
 		break;
 #endif
 #ifdef INET6
@@ -1080,6 +1108,7 @@ tcp_input(struct mbuf *m, ...)
 		/* We do the checksum after PCB lookup... */
 		len = m->m_pkthdr.len;
 		tlen = len - toff;
+		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
 		break;
 #endif
 	default:
@@ -1587,6 +1616,31 @@ after_listen:
 		tcp_del_sackholes(tp, th);
 	}
 
+	if (TCP_ECN_ALLOWED(tp)) {
+		switch (iptos & IPTOS_ECN_MASK) {
+		case IPTOS_ECN_CE:
+			tp->t_flags |= TF_ECN_SND_ECE;
+			tcpstat.tcps_ecn_ce++;
+			break;
+		case IPTOS_ECN_ECT0:
+			tcpstat.tcps_ecn_ect++;
+			break;
+		case IPTOS_ECN_ECT1:
+			/* XXX: ignore for now -- rpaulo */
+			break;
+		}
+
+		if (tiflags & TH_CWR)
+			tp->t_flags &= ~TF_ECN_SND_ECE;
+
+		/*
+		 * Congestion experienced.
+		 * Ignore if we are already trying to recover.
+		 */
+		if ((tiflags & TH_ECE) && SEQ_GEQ(tp->snd_una, tp->snd_recover))
+			tcp_congestion_exp(tp);
+	}
+
 	if (opti.ts_present && opti.ts_ecr) {
 		/*
 		 * Calculate the RTT from the returned time stamp and the
@@ -1617,7 +1671,8 @@ after_listen:
 	 * the socket buffer and note that we need a delayed ack.
 	 */
 	if (tp->t_state == TCPS_ESTABLISHED &&
-	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
+	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK))
+	        == TH_ACK &&
 	    (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
 	    th->th_seq == tp->rcv_nxt &&
 	    tiwin && tiwin == tp->snd_wnd &&
@@ -1788,6 +1843,8 @@ after_listen:
 	 * Otherwise this is an acceptable SYN segment
 	 *	initialize tp->rcv_nxt and tp->irs
 	 *	if seg contains ack then advance tp->snd_una
+	 *	if seg contains an ECE and ECN support is enabled, the stream
+	 *	    is ECN capable.
 	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
 	 *	arrange for segment to be acked (eventually)
 	 *	continue processing rest of data/controls, beginning with URG
@@ -1811,6 +1868,12 @@ after_listen:
 			if (SEQ_LT(tp->snd_high, tp->snd_una))
 				tp->snd_high = tp->snd_una;
 			TCP_TIMER_DISARM(tp, TCPT_REXMT);
+
+			if ((tiflags & TH_ECE) && tcp_do_ecn) {
+				tp->t_flags |= TF_ECN_PERMIT;
+				tcpstat.tcps_ecn_shs++;
+			}
+
 		}
 		tp->irs = th->th_seq;
 		tcp_rcvseqinit(tp);
@@ -2200,6 +2263,9 @@ after_listen:
 				 * to keep a constant cwnd packets in the
 				 * network.
 				 *
+				 * When using TCP ECN, notify the peer that
+				 * we reduced the cwnd.
+				 *
 				 * If we are using TCP/SACK, then enter
 				 * Fast Recovery if the receiver SACKs
 				 * data that is tcprexmtthresh * MSS
@@ -2213,9 +2279,8 @@ after_listen:
 					 (++tp->t_dupacks == tcprexmtthresh ||
 					 TCP_FACK_FASTRECOV(tp))) {
 					tcp_seq onxt;
-					u_int win;
 
-					if (tcp_do_newreno &&
+					if ((tcp_do_newreno || tcp_do_ecn) &&
 					    SEQ_LT(th->th_ack, tp->snd_high)) {
 						/*
 						 * False fast retransmit after
@@ -2227,12 +2292,7 @@ after_listen:
 					}
 
 					onxt = tp->snd_nxt;
-					win = min(tp->snd_wnd, tp->snd_cwnd) /
-					    2 /	tp->t_segsz;
-					if (win < 2)
-						win = 2;
-					tp->snd_ssthresh = win * tp->t_segsz;
-					tp->snd_recover = tp->snd_max;
+					tcp_congestion_exp(tp);
 					tp->t_partialacks = 0;
 					TCP_TIMER_DISARM(tp, TCPT_REXMT);
 					tp->t_rtttime = 0;
@@ -3809,6 +3869,9 @@ syn_cache_get(struct sockaddr *src, stru
 	if ((sc->sc_flags & SCF_SACK_PERMIT) && tcp_do_sack)
 		tp->t_flags |= TF_WILL_SACK;
 
+	if ((sc->sc_flags & SCF_ECN_PERMIT) && tcp_do_ecn)
+		tp->t_flags |= TF_ECN_PERMIT;
+
 #ifdef TCP_SIGNATURE
 	if (sc->sc_flags & SCF_SIGNATURE)
 		tp->t_flags |= TF_SIGNATURE;
@@ -4101,6 +4164,13 @@ syn_cache_add(struct sockaddr *src, stru
 	}
 	if ((tb.t_flags & TF_SACK_PERMIT) && tcp_do_sack)
 		sc->sc_flags |= SCF_SACK_PERMIT;
+
+	/*
+	 * ECN setup packet received.
+	 */
+	if ((th->th_flags & (TH_ECE|TH_CWR)) && tcp_do_ecn)
+		sc->sc_flags |= SCF_ECN_PERMIT;
+
 #ifdef TCP_SIGNATURE
 	if (tb.t_flags & TF_SIGNATURE)
 		sc->sc_flags |= SCF_SIGNATURE;
@@ -4128,7 +4198,7 @@ syn_cache_respond(struct syn_cache *sc, 
 #ifdef INET6
 	struct ip6_hdr *ip6 = NULL;
 #endif
-	struct tcpcb *tp;
+	struct tcpcb *tp = NULL;
 	struct tcphdr *th;
 	u_int hlen;
 	struct socket *so;
@@ -4269,6 +4339,55 @@ syn_cache_respond(struct syn_cache *sc, 
 		optp += 4;
 	}
 
+	/*
+	 * Send ECN SYN-ACK setup packet.
+	 * Routes can be asymmetric, so, even if we receive a packet
+	 * with ECE and CWR set, we must not assume no one will block
+	 * the ECE packet we are about to send.
+	 */
+	if ((sc->sc_flags & SCF_ECN_PERMIT) && tp &&
+	    SEQ_GEQ(tp->snd_nxt, tp->snd_max)) {
+		th->th_flags |= TH_ECE;
+		tcpstat.tcps_ecn_shs++;
+
+		/*
+		 * draft-ietf-tcpm-ecnsyn-00.txt
+		 *
+		 * "[...] a TCP node MAY respond to an ECN-setup
+		 * SYN packet by setting ECT in the responding
+		 * ECN-setup SYN/ACK packet, indicating to routers 
+		 * that the SYN/ACK packet is ECN-Capable.
+		 * This allows a congested router along the path
+		 * to mark the packet instead of dropping the
+		 * packet as an indication of congestion."
+		 *
+		 * "[...] There can be a great benefit in setting
+		 * an ECN-capable codepoint in SYN/ACK packets [...]
+		 * Congestion is most likely to occur in
+		 * the server-to-client direction.  As a result,
+		 * setting an ECN-capable codepoint in SYN/ACK
+		 * packets can reduce the occurrence of three-second
+		 * retransmit timeouts resulting from the drop
+		 * of SYN/ACK packets."
+		 *
+		 * Page 4 and 6, January 2006.
+		 */
+
+		switch (sc->sc_src.sa.sa_family) {
+#ifdef INET
+		case AF_INET:
+			ip->ip_tos |= IPTOS_ECN_ECT0;
+			break;
+#endif
+#ifdef INET6
+		case AF_INET6:
+			ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
+			break;
+#endif
+		}
+		tcpstat.tcps_ecn_ect++;
+	}
+
 #ifdef TCP_SIGNATURE
 	if (sc->sc_flags & SCF_SIGNATURE) {
 		struct secasvar *sav;
Index: sys/netinet/tcp_output.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_output.c,v
retrieving revision 1.142
diff -u -p -r1.142 tcp_output.c
--- sys/netinet/tcp_output.c	25 Mar 2006 13:34:35 -0000	1.142
+++ sys/netinet/tcp_output.c	31 Aug 2006 19:53:49 -0000
@@ -70,7 +70,7 @@
  */
 
 /*-
- * Copyright (c) 1997, 1998, 2001, 2005 The NetBSD Foundation, Inc.
+ * Copyright (c) 1997, 1998, 2001, 2005, 2006 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
@@ -78,6 +78,8 @@
  * Facility, NASA Ames Research Center.
  * This code is derived from software contributed to The NetBSD Foundation
  * by Charles M. Hannum.
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Rui Paulo.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -1235,6 +1237,58 @@ send:
 	bcopy(mtod(tp->t_template, caddr_t), mtod(m, caddr_t), iphdrlen);
 
 	/*
+	 * If we are starting a connection, send ECN setup
+	 * SYN packet. If we are on a retransmit, we may
+	 * resend those bits a number of times as per
+	 * RFC 3168.
+	 */
+	if (tp->t_state == TCPS_SYN_SENT && tcp_do_ecn) {
+		if (tp->t_flags & TF_SYN_REXMT) {
+			if (tp->t_ecn_retries--)
+				flags |= TH_ECE|TH_CWR;
+		} else {
+			flags |= TH_ECE|TH_CWR;
+			tp->t_ecn_retries = tcp_ecn_maxretries;
+		}
+	}
+
+	if (TCP_ECN_ALLOWED(tp)) {
+		/*
+		 * If the peer has ECN, mark data packets
+		 * ECN capable. Ignore pure ack packets, retransmissions
+		 * and window probes.
+		 */
+		if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
+		    !(tp->t_force && len == 1)) {
+			switch (af) {
+#ifdef INET
+			case AF_INET:
+				tp->t_inpcb->inp_ip.ip_tos |= IPTOS_ECN_ECT0;
+				break;
+#endif
+#ifdef INET6
+			case AF_INET6:
+				ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
+				break;
+#endif
+			}
+			tcpstat.tcps_ecn_ect++;
+		}
+
+		/*
+		 * Reply with proper ECN notifications.
+		 */
+		if (tp->t_flags & TF_ECN_SND_CWR) {
+			flags |= TH_CWR;
+			tp->t_flags &= ~TF_ECN_SND_CWR;
+		} 
+		if (tp->t_flags & TF_ECN_SND_ECE) {
+			flags |= TH_ECE;
+		}
+	}
+
+
+	/*
 	 * If we are doing retransmissions, then snd_nxt will
 	 * not reflect the first unsent octet.  For ACK only
 	 * packets, we do not want the sequence number of the
Index: sys/netinet/tcp_subr.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_subr.c,v
retrieving revision 1.198
diff -u -p -r1.198 tcp_subr.c
--- sys/netinet/tcp_subr.c	15 Apr 2006 02:30:39 -0000	1.198
+++ sys/netinet/tcp_subr.c	31 Aug 2006 19:53:49 -0000
@@ -184,6 +184,7 @@ int	tcp_do_win_scale = 1;	/* RFC1323 win
 int	tcp_do_timestamps = 1;	/* RFC1323 timestamps */
 int	tcp_do_newreno = 1;	/* Use the New Reno algorithms */
 int	tcp_ack_on_push = 0;	/* set to enable immediate ACK-on-PUSH */
+int	tcp_do_ecn = 0;		/* Explicit Congestion Notification */
 #ifndef TCP_INIT_WIN
 #define	TCP_INIT_WIN	0	/* initial slow start window */
 #endif
@@ -204,6 +205,7 @@ int	tcp_do_loopback_cksum = 0;
 int	tcp_sack_tp_maxholes = 32;
 int	tcp_sack_globalmaxholes = 1024;
 int	tcp_sack_globalholes = 0;
+int	tcp_ecn_maxretries = 1;
 
 
 /* tcb hash */
Index: sys/netinet/tcp_usrreq.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_usrreq.c,v
retrieving revision 1.119
diff -u -p -r1.119 tcp_usrreq.c
--- sys/netinet/tcp_usrreq.c	23 Jul 2006 22:06:13 -0000	1.119
+++ sys/netinet/tcp_usrreq.c	31 Aug 2006 19:53:50 -0000
@@ -30,7 +30,7 @@
  */
 
 /*-
- * Copyright (c) 1997, 1998, 2005 The NetBSD Foundation, Inc.
+ * Copyright (c) 1997, 1998, 2005, 2006 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
@@ -38,6 +38,8 @@
  * Facility, NASA Ames Research Center.
  * This code is derived from software contributed to The NetBSD Foundation
  * by Charles M. Hannum.
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Rui Paulo.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -1384,7 +1386,7 @@ static void
 sysctl_net_inet_tcp_setup2(struct sysctllog **clog, int pf, const char *pfname,
 			   const char *tcpname)
 {
-	const struct sysctlnode *sack_node;
+	const struct sysctlnode *sack_node, *ecn_node;
 #ifdef TCP_DEBUG
 	extern struct tcp_debug tcp_debug[TCP_NDEBUG];
 	extern int tcp_debx;
@@ -1475,6 +1477,12 @@ sysctl_net_inet_tcp_setup2(struct sysctl
 		       SYSCTL_DESCR("RFC2018 Selective ACKnowledgement tunables"),
 		       NULL, 0, NULL, 0,
 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_SACK, CTL_EOL);
+	sysctl_createv(clog, 0, NULL, &ecn_node,
+	    	       CTLFLAG_PERMANENT,
+		       CTLTYPE_NODE, "ecn",
+	    	       SYSCTL_DESCR("RFC3168 Explicit Congestion Notification"),
+	    	       NULL, 0, NULL, 0,
+		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_ECN, CTL_EOL);
 	sysctl_createv(clog, 0, NULL, NULL,
 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
 		       CTLTYPE_INT, "win_scale",
@@ -1600,6 +1608,23 @@ sysctl_net_inet_tcp_setup2(struct sysctl
 		       CTL_NET, pf, IPPROTO_TCP, CTL_CREATE,
 		       CTL_EOL);
 
+	sysctl_createv(clog, 0, NULL, &ecn_node,
+	    	       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+		       CTLTYPE_INT, "enable",
+		       SYSCTL_DESCR("Enable TCP Explicit Congestion "
+			   "Notification"),
+	    	       NULL, 0, &tcp_do_ecn, 0,
+	    	       CTL_NET, pf, IPPROTO_TCP, TCPCTL_ECN,
+		       CTL_CREATE, CTL_EOL);
+	sysctl_createv(clog, 0, NULL, &ecn_node,
+	    	       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+		       CTLTYPE_INT, "maxretries",
+		       SYSCTL_DESCR("Number of times to retry ECN setup "
+			       "before disabling ECN on the connection"),
+	    	       NULL, 0, &tcp_ecn_maxretries, 0,
+	    	       CTL_NET, pf, IPPROTO_TCP, TCPCTL_ECN,
+		       CTL_CREATE, CTL_EOL);
+
 	/* SACK gets it's own little subtree. */
 	sysctl_createv(clog, 0, NULL, &sack_node,
 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
Index: sys/netinet/tcp_var.h
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_var.h,v
retrieving revision 1.136
diff -u -p -r1.136 tcp_var.h
--- sys/netinet/tcp_var.h	22 Jul 2006 17:45:03 -0000	1.136
+++ sys/netinet/tcp_var.h	31 Aug 2006 19:53:50 -0000
@@ -217,6 +217,9 @@ struct tcpcb {
 #define	TF_REASSEMBLING	0x1000		/* we're busy reassembling */
 #define	TF_DEAD		0x2000		/* dead and to-be-released */
 #define	TF_PMTUD_PEND	0x4000		/* Path MTU Discovery pending */
+#define	TF_ECN_PERMIT	0x10000		/* other side said it is ECN-ready */
+#define	TF_ECN_SND_CWR	0x20000		/* ECN CWR in queue */
+#define	TF_ECN_SND_ECE	0x40000		/* ECN ECE in queue */
 #define	TF_SIGNATURE	0x400000	/* require MD5 digests (RFC2385) */
 
 
@@ -320,9 +323,16 @@ struct tcpcb {
 	u_int	t_pmtud_nextmtu;	/* Advertised Next-Hop MTU from ICMP */
 	u_short	t_pmtud_ip_len;		/* IP length from ICMP payload */
 	u_short	t_pmtud_ip_hl;		/* IP header length from ICMP payload */
+
+	uint8_t t_ecn_retries;		/* # of ECN setup retries */
 };
 
 /*
+ * Macros to aid ECN TCP.
+ */
+#define TCP_ECN_ALLOWED(tp)	(tp->t_flags & TF_ECN_PERMIT)
+
+/*
  * Macros to aid SACK/FACK TCP.
  */
 #define TCP_SACK_ENABLED(tp)	(tp->t_flags & TF_WILL_SACK)
@@ -483,6 +493,7 @@ struct syn_cache {
 #define	SCF_TIMESTAMP		0x0002		/* peer will do timestamps */
 #define	SCF_DEAD		0x0004		/* this entry to be released */
 #define SCF_SACK_PERMIT		0x0008		/* peer will do SACK */
+#define SCF_ECN_PERMIT		0x0010		/* peer will do ECN */
 #define SCF_SIGNATURE	0x40			/* send MD5 digests */
 
 	struct mbuf *sc_ipopts;			/* IP options */
@@ -634,6 +645,10 @@ struct	tcpstat {
 	u_quad_t tcps_selfquench;	/* # of ENOBUFS we get on output */
 	u_quad_t tcps_badsig;		/* # of drops due to bad signature */
 	u_quad_t tcps_goodsig;		/* # of packets with good signature */
+
+	u_quad_t tcps_ecn_shs;		/* # of successful ECN handshakes */
+	u_quad_t tcps_ecn_ce;		/* # of packets with CE bit */
+	u_quad_t tcps_ecn_ect;		/* # of packets with ECT(0) bit */
 };
 
 /*
@@ -675,7 +690,8 @@ struct	tcpstat {
 #define	TCPCTL_STATS		30	/* TCP statistics */
 #define	TCPCTL_DEBUG		31	/* TCP debug sockets */
 #define	TCPCTL_DEBX		32	/* # of tcp debug sockets */
-#define	TCPCTL_MAXID		33
+#define	TCPCTL_ECN		33	/* RFC3168 ECN */
+#define	TCPCTL_MAXID		34
 
 #define	TCPCTL_NAMES { \
 	{ 0, 0 }, \
@@ -711,6 +727,7 @@ struct	tcpstat {
 	{ "stats", CTLTYPE_STRUCT }, \
 	{ "debug", CTLTYPE_STRUCT }, \
 	{ "debx", CTLTYPE_INT }, \
+	{ "ecn", CTLTYPE_NODE }, \
 }
 
 #ifdef _KERNEL
@@ -733,6 +750,8 @@ extern	int tcp_ack_on_push;	/* ACK immed
 extern	int tcp_syn_cache_limit; /* max entries for compressed state engine */
 extern	int tcp_syn_bucket_limit;/* max entries per hash bucket */
 extern	int tcp_log_refused;	/* log refused connections */
+extern	int tcp_do_ecn;		/* TCP ECN enabled/disabled? */
+extern	int tcp_ecn_maxretries;	/* Max ECN setup retries */
 extern int tcp_sack_tp_maxholes;	/* Max holes per connection. */
 extern int tcp_sack_globalmaxholes;	/* Max holes per system. */
 extern int tcp_sack_globalholes;	/* Number of holes present. */
Index: usr.bin/netstat/inet.c
===================================================================
RCS file: /cvsroot/src/usr.bin/netstat/inet.c,v
retrieving revision 1.75
diff -u -p -r1.75 inet.c
--- usr.bin/netstat/inet.c	17 Aug 2006 01:42:57 -0000	1.75
+++ usr.bin/netstat/inet.c	31 Aug 2006 19:53:51 -0000
@@ -391,6 +391,9 @@ tcp_stats(off, name)
 	p(tcps_badsig, "\t%llu packet%s with bad signature\n");
 	p(tcps_goodsig, "\t%llu packet%s with good signature\n");
 
+	p(tcps_ecn_shs, "\t%llu sucessful ECN handshake%s\n");
+	p(tcps_ecn_ce, "\t%llu packet%s with ECN CE bit\n");
+	p(tcps_ecn_ect, "\t%llu packet%s ECN ECT(0) bit\n");
 #undef p
 #undef ps
 #undef p2

--Apple-Mail-3-505784705
Content-Transfer-Encoding: 7bit
Content-Type: text/plain;
	charset=US-ASCII;
	format=flowed


	-- Rui Paulo



--Apple-Mail-3-505784705--