Subject: One last NewReno update
To: tech-net@netbsd.org
From: Charles M. Hannum <abuse@spamalicious.com>
List: tech-net
Date: 01/26/2005 21:15:49
--Boundary-00=_FiA+BORupEvly82
Content-Type: text/plain;
  charset="us-ascii"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline

It occurred to me that keeping track of fast recovery state in t_dupacks allows 
another simplification.  To wit:

1) We set snd_recover when we enter fast recovery, and do not update it until 
we exit fast recovery.

2) We only test snd_recover while we're in fast recovery.

3) Fast recovery is algorithmically constrained to terminate within one 
window -- in the Reno case because we receive an ack for the next packet, and 
in the NewReno case because we have acked all of the data that had been 
transmitted before fast recovery began.

Therefore, we do not need to update snd_recover at any time other than when we 
enter fast retransmit.  Sequence wraparound is simply a non-issue; the sketch 
at the end of this message illustrates why.

Removing that bookkeeping brings the "fixed" code back down to the same size 
as the broken code on my x86 box.
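
For the skeptical, here is a stand-alone user-space sketch of the invariant.
This is illustration only, not the kernel code -- struct tcb, TCPREXMTTHRESH,
and the SEQ_LT macro are cut-down stand-ins for the real definitions in
tcp_var.h and tcp_seq.h.  It starts a connection just below the 32-bit wrap
to show that snd_recover, written only on entry to fast recovery and read
only while t_dupacks >= the threshold, works fine across the wrap:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t tcp_seq;

/* Modular sequence comparison, as in netinet/tcp_seq.h. */
#define	SEQ_LT(a, b)	((int32_t)((a) - (b)) < 0)

#define	TCPREXMTTHRESH	3

struct tcb {
	tcp_seq	snd_una;	/* oldest unacknowledged seq number */
	tcp_seq	snd_max;	/* highest seq number sent */
	tcp_seq	snd_recover;	/* recovery point; valid only in recovery */
	int	t_dupacks;	/* doubles as the fast recovery flag */
};

/* Duplicate ack: snd_recover is written only on entry to fast recovery. */
static void
dupack(struct tcb *tp)
{

	if (++tp->t_dupacks == TCPREXMTTHRESH)
		tp->snd_recover = tp->snd_max;
}

/* New ack: snd_recover is read only while t_dupacks >= the threshold. */
static void
newack(struct tcb *tp, tcp_seq ack)
{

	if (tp->t_dupacks >= TCPREXMTTHRESH &&
	    SEQ_LT(ack, tp->snd_recover))
		printf("partial ack %" PRIu32 ": stay in recovery\n", ack);
	else
		tp->t_dupacks = 0;	/* exit recovery (or never entered) */
	tp->snd_una = ack;
}

int
main(void)
{
	struct tcb tcb = { 0xfffffff0, 0x00000010, 0, 0 };
	int i;

	for (i = 0; i < TCPREXMTTHRESH; i++)
		dupack(&tcb);		/* enter fast recovery */
	newack(&tcb, 0x00000000);	/* partial ack, across the wrap */
	newack(&tcb, 0x00000010);	/* complete ack; recovery ends */
	printf("t_dupacks = %d\n", tcb.t_dupacks);
	return 0;
}

Since any ack that terminates recovery also clears t_dupacks, a stale
snd_recover can never be consulted afterward, which is exactly the property
argued above.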

--Boundary-00=_FiA+BORupEvly82
Content-Type: text/x-diff;
  charset="us-ascii";
  name="tcp.diff"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
	filename="tcp.diff"

Index: tcp_input.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_input.c,v
retrieving revision 1.212
diff -u -r1.212 tcp_input.c
--- tcp_input.c	21 Dec 2004 05:51:31 -0000	1.212
+++ tcp_input.c	26 Jan 2005 21:11:51 -0000
@@ -1495,6 +1495,21 @@
 		if (tcp_dooptions(tp, optp, optlen, th, m, toff, &opti) < 0)
 			goto drop;
 
+	if (opti.ts_present && opti.ts_ecr) {
+		u_int32_t now;
+
+		/*
+		 * Calculate the RTT from the returned time stamp and the
+		 * connection's time base.  If the time stamp is later than
+		 * the current time, fall back to non-1323 RTT calculation.
+		 */
+		now = TCP_TIMESTAMP(tp);
+		if (SEQ_GEQ(now, opti.ts_ecr))
+			opti.ts_ecr = now - opti.ts_ecr + 1;
+		else
+			opti.ts_ecr = 0;
+	}
+
 	/*
 	 * Header prediction: check for the two common cases
 	 * of a uni-directional data xfer.  If the packet has
@@ -1528,6 +1543,7 @@
 		}
 
 		if (tlen == 0) {
+			/* Ack prediction. */
 			if (SEQ_GT(th->th_ack, tp->snd_una) &&
 			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
 			    tp->snd_cwnd >= tp->snd_wnd &&
@@ -1537,8 +1553,7 @@
 				 */
 				++tcpstat.tcps_predack;
 				if (opti.ts_present && opti.ts_ecr)
-					tcp_xmit_timer(tp,
-					  TCP_TIMESTAMP(tp) - opti.ts_ecr + 1);
+					tcp_xmit_timer(tp, opti.ts_ecr);
 				else if (tp->t_rtttime &&
 				    SEQ_GT(th->th_ack, tp->t_rtseq))
 					tcp_xmit_timer(tp,
@@ -1553,12 +1568,9 @@
 				sbdrop(&so->so_snd, acked);
 				tp->t_lastoff -= acked;
 
-				/*
-				 * We want snd_recover to track snd_una to
-				 * avoid sequence wraparound problems for
-				 * very large transfers.
-				 */
-				tp->snd_una = tp->snd_recover = th->th_ack;
+				tp->snd_una = th->th_ack;
+				if (SEQ_LT(tp->snd_high, tp->snd_una))
+					tp->snd_high = tp->snd_una;
 				m_freem(m);
 
 				/*
@@ -1683,9 +1695,11 @@
 		if ((tiflags & TH_SYN) == 0)
 			goto drop;
 		if (tiflags & TH_ACK) {
-			tp->snd_una = tp->snd_recover = th->th_ack;
+			tp->snd_una = th->th_ack;
 			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
 				tp->snd_nxt = tp->snd_una;
+			if (SEQ_LT(tp->snd_high, tp->snd_una))
+				tp->snd_high = tp->snd_una;
 			TCP_TIMER_DISARM(tp, TCPT_REXMT);
 		}
 		tp->irs = th->th_seq;
@@ -2078,22 +2092,23 @@
 				    th->th_ack != tp->snd_una)
 					tp->t_dupacks = 0;
 				else if (++tp->t_dupacks == tcprexmtthresh) {
-					tcp_seq onxt = tp->snd_nxt;
-					u_int win =
-					    min(tp->snd_wnd, tp->snd_cwnd) /
-					    2 /	tp->t_segsz;
-					if (tcp_do_newreno && SEQ_LT(th->th_ack,
-					    tp->snd_recover)) {
+					tcp_seq onxt;
+					u_int win;
+
+					if (tcp_do_newreno &&
+					    SEQ_LT(th->th_ack, tp->snd_high)) {
 						/*
 						 * False fast retransmit after
-						 * timeout.  Do not cut window.
+						 * timeout.  Do not enter fast
+						 * recovery.
 						 */
-						tp->snd_cwnd += tp->t_segsz;
 						tp->t_dupacks = 0;
-						(void) tcp_output(tp);
-						goto drop;
+						break;
 					}
 
+					onxt = tp->snd_nxt;
+					win = min(tp->snd_wnd, tp->snd_cwnd) /
+					    2 /	tp->t_segsz;
 					if (win < 2)
 						win = 2;
 					tp->snd_ssthresh = win * tp->t_segsz;
@@ -2130,24 +2145,13 @@
 		 * If the congestion window was inflated to account
 		 * for the other side's cached packets, retract it.
 		 */
-		if (tcp_do_newreno == 0) {
+		if (!tcp_do_newreno) {
 			if (tp->t_dupacks >= tcprexmtthresh &&
 			    tp->snd_cwnd > tp->snd_ssthresh)
 				tp->snd_cwnd = tp->snd_ssthresh;
 			tp->t_dupacks = 0;
-		} else if (tp->t_dupacks >= tcprexmtthresh &&
-			   tcp_newreno(tp, th) == 0) {
-			tp->snd_cwnd = tp->snd_ssthresh;
-			/*
-			 * Window inflation should have left us with approx.
-			 * snd_ssthresh outstanding data.  But in case we
-			 * would be inclined to send a burst, better to do
-			 * it via the slow start mechanism.
-			 */
-			if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh)
-				tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack)
-				    + tp->t_segsz;
-			tp->t_dupacks = 0;
+		} else {
+			tcp_newreno(tp, th);
 		}
 		if (SEQ_GT(th->th_ack, tp->snd_max)) {
 			tcpstat.tcps_rcvacktoomuch++;
@@ -2167,7 +2171,7 @@
 		 * Recompute the initial retransmit timer.
 		 */
 		if (opti.ts_present && opti.ts_ecr)
-			tcp_xmit_timer(tp, TCP_TIMESTAMP(tp) - opti.ts_ecr + 1);
+			tcp_xmit_timer(tp, opti.ts_ecr);
 		else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
 			tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
 
@@ -2190,14 +2194,17 @@
 		 * (segsz^2 / cwnd per packet), plus a constant
 		 * fraction of a packet (segsz/8) to help larger windows
 		 * open quickly enough.
-		 */
-		{
-		u_int cw = tp->snd_cwnd;
-		u_int incr = tp->t_segsz;
-
-		if (cw > tp->snd_ssthresh)
-			incr = incr * incr / cw;
-		if (tcp_do_newreno == 0 || SEQ_GEQ(th->th_ack, tp->snd_recover))
+		 *
+		 * If we are still in fast recovery (meaning we are using
+		 * NewReno and we have only received partial acks), do not
+		 * inflate the window yet.
+		 */
+		if (tp->t_dupacks == 0) {
+			u_int cw = tp->snd_cwnd;
+			u_int incr = tp->t_segsz;
+
+			if (cw > tp->snd_ssthresh)
+				incr = incr * incr / cw;
 			tp->snd_cwnd = min(cw + incr,
 			    TCP_MAXWIN << tp->snd_scale);
 		}
@@ -2215,14 +2222,11 @@
 			ourfinisacked = 0;
 		}
 		sowwakeup(so);
-		/*
-		 * We want snd_recover to track snd_una to
-		 * avoid sequence wraparound problems for
-		 * very large transfers.
-		 */
-		tp->snd_una = tp->snd_recover = th->th_ack;
+		tp->snd_una = th->th_ack;
 		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
 			tp->snd_nxt = tp->snd_una;
+		if (SEQ_LT(tp->snd_high, tp->snd_una))
+			tp->snd_high = tp->snd_una;
 
 		switch (tp->t_state) {
 
@@ -3036,20 +3040,29 @@
 }
 
 /*
- * Checks for partial ack.  If partial ack arrives, force the retransmission
- * of the next unacknowledged segment, do not clear tp->t_dupacks, and return
- * 1.  By setting snd_nxt to th_ack, this forces retransmission timer to
- * be started again.  If the ack advances at least to tp->snd_recover, return 0.
+ * Implement the NewReno response to a new ack, checking for partial acks in
+ * fast recovery.
  */
-int
+void
 tcp_newreno(tp, th)
 	struct tcpcb *tp;
 	struct tcphdr *th;
 {
-	tcp_seq onxt = tp->snd_nxt;
-	u_long ocwnd = tp->snd_cwnd;
+	if (tp->t_dupacks < tcprexmtthresh) {
+		/*
+		 * We were not in fast recovery.  Reset the duplicate ack
+		 * counter.
+		 */
+		tp->t_dupacks = 0;
+	} else if (SEQ_LT(th->th_ack, tp->snd_recover)) {
+		/*
+		 * This is a partial ack.  Retransmit the first unacknowledged
+		 * segment and deflate the congestion window by the amount of
+		 * acknowledged data.  Do not exit fast recovery.
+		 */
+		tcp_seq onxt = tp->snd_nxt;
+		u_long ocwnd = tp->snd_cwnd;
 
-	if (SEQ_LT(th->th_ack, tp->snd_recover)) {
 		/*
 		 * snd_una has not yet been updated and the socket's send
 		 * buffer has not yet drained off the ACK'd data, so we
@@ -3073,9 +3086,23 @@
 		 * not updated yet.
 		 */
 		tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_segsz);
-		return 1;
+	} else {
+		/*
+		 * Complete ack.  Inflate the congestion window to ssthresh
+		 * and exit fast recovery.
+		 *
+		 * Window inflation should have left us with approx.
+		 * snd_ssthresh outstanding data.  But in case we
+		 * would be inclined to send a burst, better to do
+		 * it via the slow start mechanism.
+		 */
+		if (SEQ_LT(tp->snd_max, th->th_ack + tp->snd_ssthresh))
+			tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack)
+			    + tp->t_segsz;
+		else
+			tp->snd_cwnd = tp->snd_ssthresh;
+		tp->t_dupacks = 0;
 	}
-	return 0;
 }
 
 
Index: tcp_timer.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_timer.c,v
retrieving revision 1.66
diff -u -r1.66 tcp_timer.c
--- tcp_timer.c	2 Jan 2004 15:51:04 -0000	1.66
+++ tcp_timer.c	26 Jan 2005 21:11:52 -0000
@@ -372,6 +372,7 @@
 		tp->t_srtt = 0;
 	}
 	tp->snd_nxt = tp->snd_una;
+	tp->snd_high = tp->snd_max;
 	/*
 	 * If timing a segment in this window, stop the timer.
 	 */
Index: tcp_var.h
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_var.h,v
retrieving revision 1.115
diff -u -r1.115 tcp_var.h
--- tcp_var.h	21 Dec 2004 05:51:31 -0000	1.115
+++ tcp_var.h	26 Jan 2005 21:11:53 -0000
@@ -217,6 +217,7 @@
 	tcp_seq	iss;			/* initial send sequence number */
 	u_long	snd_wnd;		/* send window */
 	tcp_seq snd_recover;		/* for use in fast recovery */
+	tcp_seq	snd_high;		/* NewReno false fast rexmit counter */
 /* receive sequence variables */
 	u_long	rcv_wnd;		/* receive window */
 	tcp_seq	rcv_nxt;		/* receive next */
@@ -822,7 +823,7 @@
 void	 syn_cache_timer(void *);
 void	 syn_cache_cleanup(struct tcpcb *);
 
-int	 tcp_newreno(struct tcpcb *, struct tcphdr *);
+void	 tcp_newreno(struct tcpcb *, struct tcphdr *);
 
 int	 tcp_input_checksum(int, struct mbuf *, const struct tcphdr *, int, int,
     int);

--Boundary-00=_FiA+BORupEvly82--