Subject: Re: Squashed Ack, was Re: Concerns about our NewReno code
To: Alistair Crooks <agc@pkgsrc.org>
From: Charles M. Hannum <abuse@spamalicious.com>
List: tech-net
Date: 01/26/2005 20:18:22
--Boundary-00=_Os/9BxwlpHW8l0w
Content-Type: text/plain;
charset="iso-8859-1"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline
Without having done much testing, I *think* this is a fix for the NewReno
bugs. Note that it is not the same as the FreeBSD code; AFAICT, the FreeBSD
code is broken in at least two ways. (No, I do not intend to help them fix
it.)
Basically, the changes are like this:
* t_dupacks always tells us whether we are in fast recovery. On receipt of a
partial ack, we do *not* clear t_dupacks. We do clear it on receipt of a
full ack, or if the counter is non-zero and we are not in fast recovery.
Thus, testing to see if we're in fast recovery is simpler -- the checks which
test snd_recover>=snd_una in FreeBSD can simply test t_dupacks>=tcprexmtthresh
in my code -- which conveniently means that the same tests work for Reno and
NewReno.
* snd_recover is not updated on a partial ack any more. This would cause us
to prematurely exit fast recovery if we received two partial acks. It still
chases snd_una to prevent sequence wraparound.
* I added snd_high, as per RFC 2582, to keep track of the maximum sequence
number sent after slow retransmit. I had to add logic to prevent sequence
wraparound, by having snd_high chase behind snd_una.
AFAICT, this should not affect the Reno case (which is the default), so it
should be safe to commit. I would appreciate it if people could review
and/or test the change with NewReno enabled.
(Note: The attached diff includes the time stamp fix I previously sent out.)
--Boundary-00=_Os/9BxwlpHW8l0w
Content-Type: text/x-diff;
charset="iso-8859-1";
name="tcp.diff"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
filename="tcp.diff"
Index: tcp_input.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_input.c,v
retrieving revision 1.212
diff -u -r1.212 tcp_input.c
--- tcp_input.c 21 Dec 2004 05:51:31 -0000 1.212
+++ tcp_input.c 26 Jan 2005 20:06:24 -0000
@@ -1495,6 +1495,21 @@
if (tcp_dooptions(tp, optp, optlen, th, m, toff, &opti) < 0)
goto drop;
+ if (opti.ts_present && opti.ts_ecr) {
+ u_int32_t now;
+
+ /*
+ * Calculate the RTT from the returned time stamp and the
+ * connection's time base. If the time stamp is later than
+ * the current time, fall back to non-1323 RTT calculation.
+ */
+ now = TCP_TIMESTAMP(tp);
+ if (SEQ_GEQ(now, opti.ts_ecr))
+ opti.ts_ecr = now - opti.ts_ecr + 1;
+ else
+ opti.ts_ecr = 0;
+ }
+
/*
* Header prediction: check for the two common cases
* of a uni-directional data xfer. If the packet has
@@ -1528,6 +1543,7 @@
}
if (tlen == 0) {
+ /* Ack prediction. */
if (SEQ_GT(th->th_ack, tp->snd_una) &&
SEQ_LEQ(th->th_ack, tp->snd_max) &&
tp->snd_cwnd >= tp->snd_wnd &&
@@ -1537,8 +1553,7 @@
*/
++tcpstat.tcps_predack;
if (opti.ts_present && opti.ts_ecr)
- tcp_xmit_timer(tp,
- TCP_TIMESTAMP(tp) - opti.ts_ecr + 1);
+ tcp_xmit_timer(tp, opti.ts_ecr);
else if (tp->t_rtttime &&
SEQ_GT(th->th_ack, tp->t_rtseq))
tcp_xmit_timer(tp,
@@ -1556,9 +1571,13 @@
/*
* We want snd_recover to track snd_una to
* avoid sequence wraparound problems for
- * very large transfers.
+ * very large transfers. Note that since we
+ * are not in fast recovery, we know that
+ * snd_recover<=snd_una already here.
*/
tp->snd_una = tp->snd_recover = th->th_ack;
+ if (SEQ_LT(tp->snd_high, tp->snd_una - 1))
+ tp->snd_high = tp->snd_una - 1;
m_freem(m);
/*
@@ -1686,6 +1705,8 @@
tp->snd_una = tp->snd_recover = th->th_ack;
if (SEQ_LT(tp->snd_nxt, tp->snd_una))
tp->snd_nxt = tp->snd_una;
+ if (SEQ_LT(tp->snd_high, tp->snd_una - 1))
+ tp->snd_high = tp->snd_una - 1;
TCP_TIMER_DISARM(tp, TCPT_REXMT);
}
tp->irs = th->th_seq;
@@ -2078,22 +2099,23 @@
th->th_ack != tp->snd_una)
tp->t_dupacks = 0;
else if (++tp->t_dupacks == tcprexmtthresh) {
- tcp_seq onxt = tp->snd_nxt;
- u_int win =
- min(tp->snd_wnd, tp->snd_cwnd) /
- 2 / tp->t_segsz;
- if (tcp_do_newreno && SEQ_LT(th->th_ack,
- tp->snd_recover)) {
+ tcp_seq onxt;
+ u_int win;
+
+ if (tcp_do_newreno &&
+ SEQ_LEQ(th->th_ack, tp->snd_high)) {
/*
* False fast retransmit after
- * timeout. Do not cut window.
+ * timeout. Do not enter fast
+ * recovery.
*/
- tp->snd_cwnd += tp->t_segsz;
tp->t_dupacks = 0;
- (void) tcp_output(tp);
- goto drop;
+ break;
}
+ onxt = tp->snd_nxt;
+ win = min(tp->snd_wnd, tp->snd_cwnd) /
+ 2 / tp->t_segsz;
if (win < 2)
win = 2;
tp->snd_ssthresh = win * tp->t_segsz;
@@ -2130,24 +2152,13 @@
* If the congestion window was inflated to account
* for the other side's cached packets, retract it.
*/
- if (tcp_do_newreno == 0) {
+ if (!tcp_do_newreno) {
if (tp->t_dupacks >= tcprexmtthresh &&
tp->snd_cwnd > tp->snd_ssthresh)
tp->snd_cwnd = tp->snd_ssthresh;
tp->t_dupacks = 0;
- } else if (tp->t_dupacks >= tcprexmtthresh &&
- tcp_newreno(tp, th) == 0) {
- tp->snd_cwnd = tp->snd_ssthresh;
- /*
- * Window inflation should have left us with approx.
- * snd_ssthresh outstanding data. But in case we
- * would be inclined to send a burst, better to do
- * it via the slow start mechanism.
- */
- if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh)
- tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack)
- + tp->t_segsz;
- tp->t_dupacks = 0;
+ } else {
+ tcp_newreno(tp, th);
}
if (SEQ_GT(th->th_ack, tp->snd_max)) {
tcpstat.tcps_rcvacktoomuch++;
@@ -2167,7 +2178,7 @@
* Recompute the initial retransmit timer.
*/
if (opti.ts_present && opti.ts_ecr)
- tcp_xmit_timer(tp, TCP_TIMESTAMP(tp) - opti.ts_ecr + 1);
+ tcp_xmit_timer(tp, opti.ts_ecr);
else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
@@ -2190,16 +2201,25 @@
* (segsz^2 / cwnd per packet), plus a constant
* fraction of a packet (segsz/8) to help larger windows
* open quickly enough.
- */
- {
- u_int cw = tp->snd_cwnd;
- u_int incr = tp->t_segsz;
-
- if (cw > tp->snd_ssthresh)
- incr = incr * incr / cw;
- if (tcp_do_newreno == 0 || SEQ_GEQ(th->th_ack, tp->snd_recover))
+ *
+ * If we are still in fast recovery (meaning we are using
+ * NewReno and we have only received partial acks), do not
+ * inflate the window yet.
+ */
+ if (tp->t_dupacks == 0) {
+ u_int cw = tp->snd_cwnd;
+ u_int incr = tp->t_segsz;
+
+ if (cw > tp->snd_ssthresh)
+ incr = incr * incr / cw;
tp->snd_cwnd = min(cw + incr,
TCP_MAXWIN << tp->snd_scale);
+ /*
+ * We want snd_recover to track snd_una to
+ * avoid sequence wraparound problems for
+ * very large transfers.
+ */
+ tp->snd_recover = th->th_ack;
}
ND6_HINT(tp);
if (acked > so->so_snd.sb_cc) {
@@ -2215,14 +2235,11 @@
ourfinisacked = 0;
}
sowwakeup(so);
- /*
- * We want snd_recover to track snd_una to
- * avoid sequence wraparound problems for
- * very large transfers.
- */
- tp->snd_una = tp->snd_recover = th->th_ack;
+ tp->snd_una = th->th_ack;
if (SEQ_LT(tp->snd_nxt, tp->snd_una))
tp->snd_nxt = tp->snd_una;
+ if (SEQ_LT(tp->snd_high, tp->snd_una - 1))
+ tp->snd_high = tp->snd_una - 1;
switch (tp->t_state) {
@@ -3041,16 +3058,30 @@
* 1. By setting snd_nxt to th_ack, this forces retransmission timer to
* be started again. If the ack advances at least to tp->snd_recover, return 0.
*/
-int
+void
tcp_newreno(tp, th)
struct tcpcb *tp;
struct tcphdr *th;
{
- tcp_seq onxt = tp->snd_nxt;
- u_long ocwnd = tp->snd_cwnd;
+ if (tp->t_dupacks < tcprexmtthresh) {
+ /*
+ * We were not in fast recovery. Reset the duplicate ack
+ * counter.
+ */
+ tp->t_dupacks = 0;
+ return;
+ }
if (SEQ_LT(th->th_ack, tp->snd_recover)) {
/*
+ * This is a partial ack. Retransmit the first unacknowledged
+ * segment and deflate the congestion window by the amount of
+ * unacknowledged data. Do not exit fast recovery.
+ */
+ tcp_seq onxt = tp->snd_nxt;
+ u_long ocwnd = tp->snd_cwnd;
+
+ /*
* snd_una has not yet been updated and the socket's send
* buffer has not yet drained off the ACK'd data, so we
* have to leave snd_una as it was to get the correct data
@@ -3073,9 +3104,23 @@
* not updated yet.
*/
tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_segsz);
- return 1;
+ } else {
+ /*
+ * Complete ack. Inflate the congestion window to ssthresh
+ * and exit fast recovery.
+ *
+ * Window inflation should have left us with approx.
+ * snd_ssthresh outstanding data. But in case we
+ * would be inclined to send a burst, better to do
+ * it via the slow start mechanism.
+ */
+ if (SEQ_LT(tp->snd_max, th->th_ack + tp->snd_ssthresh))
+ tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack)
+ + tp->t_segsz;
+ else
+ tp->snd_cwnd = tp->snd_ssthresh;
+ tp->t_dupacks = 0;
}
- return 0;
}
Index: tcp_timer.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_timer.c,v
retrieving revision 1.66
diff -u -r1.66 tcp_timer.c
--- tcp_timer.c 2 Jan 2004 15:51:04 -0000 1.66
+++ tcp_timer.c 26 Jan 2005 20:06:29 -0000
@@ -372,6 +372,7 @@
tp->t_srtt = 0;
}
tp->snd_nxt = tp->snd_una;
+ tp->snd_high = tp->snd_max;
/*
* If timing a segment in this window, stop the timer.
*/
Index: tcp_var.h
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_var.h,v
retrieving revision 1.115
diff -u -r1.115 tcp_var.h
--- tcp_var.h 21 Dec 2004 05:51:31 -0000 1.115
+++ tcp_var.h 26 Jan 2005 20:06:29 -0000
@@ -217,6 +217,7 @@
tcp_seq iss; /* initial send sequence number */
u_long snd_wnd; /* send window */
tcp_seq snd_recover; /* for use in fast recovery */
+ tcp_seq snd_high; /* NewReno false fast rexmit counter */
/* receive sequence variables */
u_long rcv_wnd; /* receive window */
tcp_seq rcv_nxt; /* receive next */
@@ -822,7 +823,7 @@
void syn_cache_timer(void *);
void syn_cache_cleanup(struct tcpcb *);
-int tcp_newreno(struct tcpcb *, struct tcphdr *);
+void tcp_newreno(struct tcpcb *, struct tcphdr *);
int tcp_input_checksum(int, struct mbuf *, const struct tcphdr *, int, int,
int);
--Boundary-00=_Os/9BxwlpHW8l0w--