Subject: Re: Squashed Ack, was Re: Concerns about our NewReno code
To: Alistair Crooks <agc@pkgsrc.org>
From: Charles M. Hannum <abuse@spamalicious.com>
List: tech-net
Date: 01/26/2005 20:18:22
--Boundary-00=_Os/9BxwlpHW8l0w
Content-Type: text/plain;
  charset="iso-8859-1"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline

Without having done much testing, I *think* this is a fix for the NewReno 
bugs.  Note that it is not the same as the FreeBSD code; AFAICT, the FreeBSD 
code is broken in at least two ways.  (No, I do not intend to help them fix 
it.)

Basically, the changes are like this:

* t_dupacks always tells us whether we are in fast recovery.  On receipt of a 
partial ack, we do *not* clear t_dupacks.  We do clear it on receipt of a 
full ack, or if the counter is non-zero and we are not in fast recovery.  
Thus, testing to see if we're in fast recovery is simpler -- the checks which 
test snd_recover>=snd_una in FreeBSD can simply test t_dupacks>=rexxmtthresh 
in my code -- which conveniently means that the same tests work for Reno and 
NewReno.

* snd_recover is not updated on a partial ack any more.  This would cause us 
to prematurely exit fast recovery if we received two partial acks.  It still 
chases snd_una to prevent sequence wraparound.

* I added snd_high, as per RFC 2582, to keep track of the maximum sequence 
number sent after slow retransmit.  I had to add logic to prevent sequence 
wraparound, by having snd_high chase behind snd_una.

AFAICT, this should not affect the Reno case (which is the default), so it 
should be safe to commit.  I would appreciate it if people could review 
and/or test the change with NewReno enabled.

(Note: The attached diff includes the time stamp fix I previously sent out.)

--Boundary-00=_Os/9BxwlpHW8l0w
Content-Type: text/x-diff;
  charset="iso-8859-1";
  name="tcp.diff"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
	filename="tcp.diff"

Index: tcp_input.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_input.c,v
retrieving revision 1.212
diff -u -r1.212 tcp_input.c
--- tcp_input.c	21 Dec 2004 05:51:31 -0000	1.212
+++ tcp_input.c	26 Jan 2005 20:06:24 -0000
@@ -1495,6 +1495,21 @@
 		if (tcp_dooptions(tp, optp, optlen, th, m, toff, &opti) < 0)
 			goto drop;
 
+	if (opti.ts_present && opti.ts_ecr) {
+		u_int32_t now;
+
+		/*
+		 * Calculate the RTT from the returned time stamp and the
+		 * connection's time base.  If the time stamp is later than
+		 * the current time, fall back to non-1323 RTT calculation.
+		 */
+		now = TCP_TIMESTAMP(tp);
+		if (SEQ_GEQ(now, opti.ts_ecr))
+			opti.ts_ecr = now - opti.ts_ecr + 1;
+		else
+			opti.ts_ecr = 0;
+	}
+
 	/*
 	 * Header prediction: check for the two common cases
 	 * of a uni-directional data xfer.  If the packet has
@@ -1528,6 +1543,7 @@
 		}
 
 		if (tlen == 0) {
+			/* Ack prediction. */
 			if (SEQ_GT(th->th_ack, tp->snd_una) &&
 			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
 			    tp->snd_cwnd >= tp->snd_wnd &&
@@ -1537,8 +1553,7 @@
 				 */
 				++tcpstat.tcps_predack;
 				if (opti.ts_present && opti.ts_ecr)
-					tcp_xmit_timer(tp,
-					  TCP_TIMESTAMP(tp) - opti.ts_ecr + 1);
+					tcp_xmit_timer(tp, opti.ts_ecr);
 				else if (tp->t_rtttime &&
 				    SEQ_GT(th->th_ack, tp->t_rtseq))
 					tcp_xmit_timer(tp,
@@ -1556,9 +1571,13 @@
 				/*
 				 * We want snd_recover to track snd_una to
 				 * avoid sequence wraparound problems for
-				 * very large transfers.
+				 * very large transfers.  Note that since we
+				 * are not in fast recovery, we know that
+				 * snd_recover<=snd_una already here.
 				 */
 				tp->snd_una = tp->snd_recover = th->th_ack;
+				if (SEQ_LT(tp->snd_high, tp->snd_una - 1))
+					tp->snd_high = tp->snd_una - 1;
 				m_freem(m);
 
 				/*
@@ -1686,6 +1705,8 @@
 			tp->snd_una = tp->snd_recover = th->th_ack;
 			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
 				tp->snd_nxt = tp->snd_una;
+			if (SEQ_LT(tp->snd_high, tp->snd_una - 1))
+				tp->snd_high = tp->snd_una - 1;
 			TCP_TIMER_DISARM(tp, TCPT_REXMT);
 		}
 		tp->irs = th->th_seq;
@@ -2078,22 +2099,23 @@
 				    th->th_ack != tp->snd_una)
 					tp->t_dupacks = 0;
 				else if (++tp->t_dupacks == tcprexmtthresh) {
-					tcp_seq onxt = tp->snd_nxt;
-					u_int win =
-					    min(tp->snd_wnd, tp->snd_cwnd) /
-					    2 /	tp->t_segsz;
-					if (tcp_do_newreno && SEQ_LT(th->th_ack,
-					    tp->snd_recover)) {
+					tcp_seq onxt;
+					u_int win;
+
+					if (tcp_do_newreno &&
+					    SEQ_LEQ(th->th_ack, tp->snd_high)) {
 						/*
 						 * False fast retransmit after
-						 * timeout.  Do not cut window.
+						 * timeout.  Do not enter fast
+						 * recovery.
 						 */
-						tp->snd_cwnd += tp->t_segsz;
 						tp->t_dupacks = 0;
-						(void) tcp_output(tp);
-						goto drop;
+						break;
 					}
 
+					onxt = tp->snd_nxt;
+					win = min(tp->snd_wnd, tp->snd_cwnd) /
+					    2 /	tp->t_segsz;
 					if (win < 2)
 						win = 2;
 					tp->snd_ssthresh = win * tp->t_segsz;
@@ -2130,24 +2152,13 @@
 		 * If the congestion window was inflated to account
 		 * for the other side's cached packets, retract it.
 		 */
-		if (tcp_do_newreno == 0) {
+		if (!tcp_do_newreno) {
 			if (tp->t_dupacks >= tcprexmtthresh &&
 			    tp->snd_cwnd > tp->snd_ssthresh)
 				tp->snd_cwnd = tp->snd_ssthresh;
 			tp->t_dupacks = 0;
-		} else if (tp->t_dupacks >= tcprexmtthresh &&
-			   tcp_newreno(tp, th) == 0) {
-			tp->snd_cwnd = tp->snd_ssthresh;
-			/*
-			 * Window inflation should have left us with approx.
-			 * snd_ssthresh outstanding data.  But in case we
-			 * would be inclined to send a burst, better to do
-			 * it via the slow start mechanism.
-			 */
-			if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh)
-				tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack)
-				    + tp->t_segsz;
-			tp->t_dupacks = 0;
+		} else {
+			tcp_newreno(tp, th);
 		}
 		if (SEQ_GT(th->th_ack, tp->snd_max)) {
 			tcpstat.tcps_rcvacktoomuch++;
@@ -2167,7 +2178,7 @@
 		 * Recompute the initial retransmit timer.
 		 */
 		if (opti.ts_present && opti.ts_ecr)
-			tcp_xmit_timer(tp, TCP_TIMESTAMP(tp) - opti.ts_ecr + 1);
+			tcp_xmit_timer(tp, opti.ts_ecr);
 		else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
 			tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
 
@@ -2190,16 +2201,25 @@
 		 * (segsz^2 / cwnd per packet), plus a constant
 		 * fraction of a packet (segsz/8) to help larger windows
 		 * open quickly enough.
-		 */
-		{
-		u_int cw = tp->snd_cwnd;
-		u_int incr = tp->t_segsz;
-
-		if (cw > tp->snd_ssthresh)
-			incr = incr * incr / cw;
-		if (tcp_do_newreno == 0 || SEQ_GEQ(th->th_ack, tp->snd_recover))
+		 *
+		 * If we are still in fast recovery (meaning we are using
+		 * NewReno and we have only received partial acks), do not
+		 * inflate the window yet.
+		 */
+		if (tp->t_dupacks == 0) {
+			u_int cw = tp->snd_cwnd;
+			u_int incr = tp->t_segsz;
+
+			if (cw > tp->snd_ssthresh)
+				incr = incr * incr / cw;
 			tp->snd_cwnd = min(cw + incr,
 			    TCP_MAXWIN << tp->snd_scale);
+			/*
+			 * We want snd_recover to track snd_una to
+			 * avoid sequence wraparound problems for
+			 * very large transfers.
+			 */
+			tp->snd_recover = th->th_ack;
 		}
 		ND6_HINT(tp);
 		if (acked > so->so_snd.sb_cc) {
@@ -2215,14 +2235,11 @@
 			ourfinisacked = 0;
 		}
 		sowwakeup(so);
-		/*
-		 * We want snd_recover to track snd_una to
-		 * avoid sequence wraparound problems for
-		 * very large transfers.
-		 */
-		tp->snd_una = tp->snd_recover = th->th_ack;
+		tp->snd_una = th->th_ack;
 		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
 			tp->snd_nxt = tp->snd_una;
+		if (SEQ_LT(tp->snd_high, tp->snd_una - 1))
+			tp->snd_high = tp->snd_una - 1;
 
 		switch (tp->t_state) {
 
@@ -3041,16 +3058,30 @@
  * 1.  By setting snd_nxt to th_ack, this forces retransmission timer to
  * be started again.  If the ack advances at least to tp->snd_recover, return 0.
  */
-int
+void
 tcp_newreno(tp, th)
 	struct tcpcb *tp;
 	struct tcphdr *th;
 {
-	tcp_seq onxt = tp->snd_nxt;
-	u_long ocwnd = tp->snd_cwnd;
+	if (tp->t_dupacks < tcprexmtthresh) {
+		/*
+		 * We were not in fast recovery.  Reset the duplicate ack
+		 * counter.
+		 */
+		tp->t_dupacks = 0;
+		return;
+	}
 
 	if (SEQ_LT(th->th_ack, tp->snd_recover)) {
 		/*
+		 * This is a partial ack.  Retransmit the first unacknowledged
+		 * segment and deflate the congestion window by the amount of
+		 * unacknowledged data.  Do not exit fast recovery.
+		 */
+		tcp_seq onxt = tp->snd_nxt;
+		u_long ocwnd = tp->snd_cwnd;
+
+		/*
 		 * snd_una has not yet been updated and the socket's send
 		 * buffer has not yet drained off the ACK'd data, so we
 		 * have to leave snd_una as it was to get the correct data
@@ -3073,9 +3104,23 @@
 		 * not updated yet.
 		 */
 		tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_segsz);
-		return 1;
+	} else {
+		/*
+		 * Complete ack.  Inflate the congestion window to ssthresh
+		 * and exit fast recovery.
+		 *
+		 * Window inflation should have left us with approx.
+		 * snd_ssthresh outstanding data.  But in case we
+		 * would be inclined to send a burst, better to do
+		 * it via the slow start mechanism.
+		 */
+		if (SEQ_LT(tp->snd_max, th->th_ack + tp->snd_ssthresh))
+			tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack)
+			    + tp->t_segsz;
+		else
+			tp->snd_cwnd = tp->snd_ssthresh;
+		tp->t_dupacks = 0;
 	}
-	return 0;
 }
 
 
Index: tcp_timer.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_timer.c,v
retrieving revision 1.66
diff -u -r1.66 tcp_timer.c
--- tcp_timer.c	2 Jan 2004 15:51:04 -0000	1.66
+++ tcp_timer.c	26 Jan 2005 20:06:29 -0000
@@ -372,6 +372,7 @@
 		tp->t_srtt = 0;
 	}
 	tp->snd_nxt = tp->snd_una;
+	tp->snd_high = tp->snd_max;
 	/*
 	 * If timing a segment in this window, stop the timer.
 	 */
Index: tcp_var.h
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_var.h,v
retrieving revision 1.115
diff -u -r1.115 tcp_var.h
--- tcp_var.h	21 Dec 2004 05:51:31 -0000	1.115
+++ tcp_var.h	26 Jan 2005 20:06:29 -0000
@@ -217,6 +217,7 @@
 	tcp_seq	iss;			/* initial send sequence number */
 	u_long	snd_wnd;		/* send window */
 	tcp_seq snd_recover;		/* for use in fast recovery */
+	tcp_seq	snd_high;		/* NewReno false fast rexmit counter */
 /* receive sequence variables */
 	u_long	rcv_wnd;		/* receive window */
 	tcp_seq	rcv_nxt;		/* receive next */
@@ -822,7 +823,7 @@
 void	 syn_cache_timer(void *);
 void	 syn_cache_cleanup(struct tcpcb *);
 
-int	 tcp_newreno(struct tcpcb *, struct tcphdr *);
+void	 tcp_newreno(struct tcpcb *, struct tcphdr *);
 
 int	 tcp_input_checksum(int, struct mbuf *, const struct tcphdr *, int, int,
     int);

--Boundary-00=_Os/9BxwlpHW8l0w--