Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src/sys/netinet Commit TCP SACK patches from Kentaro A. Kurahone...



details:   https://anonhg.NetBSD.org/src/rev/e606b3e4972b
branches:  trunk
changeset: 574450:e606b3e4972b
user:      jonathan <jonathan%NetBSD.org@localhost>
date:      Mon Feb 28 16:20:59 2005 +0000

description:
Commit TCP SACK patches from Kentaro A. Kurahone's patch at:
   http://www.sigusr1.org/~kurahone/tcp-sack-netbsd-02152005.diff.gz

Fixes in that patch for pre-existing TCP pcb initializations were already
committed to NetBSD-current, so are not included in this commit.

The SACK patch has been observed to correctly negotiate and respond
to SACKs in wide-area traffic.

There are two independently-observed, as-yet-unresolved anomalies:
First, seeing unexplained delays in fast retransmission
(potentially explainable by a 0.2sec RTT between adjacent
ethernet/wifi NICs); and second, peculiar and unexplained TCP
retransmits observed over an ath0 card.

After discussion with several interested developers, I'm committing
this now, as-is, for more eyes to use and look over.  Current hypothesis
is that the anomalies above may in fact be due to link-level (hardware,
driver, HAL, firmware) aberrations in the test setup, affecting both
Kentaro's wired-Ethernet NIC and my two (different) WiFi NICs.

diffstat:

 sys/netinet/files.netinet |    3 +-
 sys/netinet/tcp_input.c   |   84 +++++-
 sys/netinet/tcp_output.c  |  186 ++++++++++++++-
 sys/netinet/tcp_sack.c    |  547 ++++++++++++++++++++++++++++++++++++++++++++++
 sys/netinet/tcp_subr.c    |   16 +-
 sys/netinet/tcp_timer.c   |   17 +-
 sys/netinet/tcp_var.h     |   59 ++++-
 7 files changed, 859 insertions(+), 53 deletions(-)

diffs (truncated from 1316 to 300 lines):

diff -r 3f1b87aea669 -r e606b3e4972b sys/netinet/files.netinet
--- a/sys/netinet/files.netinet Mon Feb 28 16:16:19 2005 +0000
+++ b/sys/netinet/files.netinet Mon Feb 28 16:20:59 2005 +0000
@@ -1,4 +1,4 @@
-#      $NetBSD: files.netinet,v 1.9 2005/01/13 19:09:40 drochner Exp $
+#      $NetBSD: files.netinet,v 1.10 2005/02/28 16:20:59 jonathan Exp $
 
 defflag opt_tcp_debug.h                TCP_DEBUG
 defparam opt_tcp_debug.h       TCP_NDEBUG
@@ -30,6 +30,7 @@
 file   netinet/tcp_debug.c     (inet | inet6) & tcp_debug
 file   netinet/tcp_input.c     inet | inet6
 file   netinet/tcp_output.c    inet | inet6
+file   netinet/tcp_sack.c      inet | inet6
 file   netinet/tcp_subr.c      inet | inet6
 file   netinet/tcp_timer.c     inet | inet6
 file   netinet/tcp_usrreq.c    inet | inet6
diff -r 3f1b87aea669 -r e606b3e4972b sys/netinet/tcp_input.c
--- a/sys/netinet/tcp_input.c   Mon Feb 28 16:16:19 2005 +0000
+++ b/sys/netinet/tcp_input.c   Mon Feb 28 16:20:59 2005 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: tcp_input.c,v 1.221 2005/02/26 22:45:12 perry Exp $    */
+/*     $NetBSD: tcp_input.c,v 1.222 2005/02/28 16:20:59 jonathan Exp $ */
 
 /*
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
@@ -148,7 +148,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: tcp_input.c,v 1.221 2005/02/26 22:45:12 perry Exp $");
+__KERNEL_RCSID(0, "$NetBSD: tcp_input.c,v 1.222 2005/02/28 16:20:59 jonathan Exp $");
 
 #include "opt_inet.h"
 #include "opt_ipsec.h"
@@ -493,6 +493,7 @@
                    SEQ_GEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) {
                        tcpstat.tcps_rcvduppack++;
                        tcpstat.tcps_rcvdupbyte += pkt_len;
+                       tcp_new_dsack(tp, pkt_seq, pkt_len);
                        m_freem(m);
                        if (tiqe != NULL)
                                pool_put(&tcpipqent_pool, tiqe);
@@ -1484,6 +1485,10 @@
                if (tcp_dooptions(tp, optp, optlen, th, m, toff, &opti) < 0)
                        goto drop;
 
+       if (TCP_SACK_ENABLED(tp)) {
+               tcp_del_sackholes(tp, th);
+       }
+
        if (opti.ts_present && opti.ts_ecr) {
                /*
                 * Calculate the RTT from the returned time stamp and the
@@ -1556,6 +1561,7 @@
                                tp->t_lastoff -= acked;
 
                                tp->snd_una = th->th_ack;
+                               tp->snd_fack = tp->snd_una;
                                if (SEQ_LT(tp->snd_high, tp->snd_una))
                                        tp->snd_high = tp->snd_una;
                                m_freem(m);
@@ -1592,6 +1598,7 @@
                         * we have enough buffer space to take it.
                         */
                        ++tcpstat.tcps_preddat;
+                       tp->rcv_sack_num = 0;
                        tp->rcv_nxt += tlen;
                        tcpstat.tcps_rcvpack++;
                        tcpstat.tcps_rcvbyte += tlen;
@@ -1799,6 +1806,7 @@
                        tcpstat.tcps_rcvduppack++;
                        tcpstat.tcps_rcvdupbyte += tlen;
                        tcpstat.tcps_pawsdrop++;
+                       tcp_new_dsack(tp, th->th_seq, tlen);
                        goto dropafterack;
                }
        }
@@ -1847,6 +1855,7 @@
                        tcpstat.tcps_rcvpartduppack++;
                        tcpstat.tcps_rcvpartdupbyte += todrop;
                }
+               tcp_new_dsack(tp, th->th_seq, todrop);
                hdroptlen += todrop;    /*drop from head afterwards*/
                th->th_seq += todrop;
                tlen -= todrop;
@@ -2075,12 +2084,19 @@
                                 * so bump cwnd by the amount in the receiver
                                 * to keep a constant cwnd packets in the
                                 * network.
+                                *
+                                * If we are using TCP/SACK, then enter
+                                * Fast Recovery if the receiver SACKs
+                                * data that is tcprexmtthresh * MSS
+                                * bytes past the last ACKed segment,
+                                * irrespective of the number of DupAcks.
                                 */
                                if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 ||
                                    th->th_ack != tp->snd_una)
                                        tp->t_dupacks = 0;
-                               else if (++tp->t_dupacks == tcprexmtthresh &&
-                                        tp->t_partialacks < 0) {
+                               else if (tp->t_partialacks < 0 &&
+                                        (++tp->t_dupacks == tcprexmtthresh ||
+                                        TCP_FACK_FASTRECOV(tp))) {
                                        tcp_seq onxt;
                                        u_int win;
 
@@ -2105,6 +2121,13 @@
                                        tp->t_partialacks = 0;
                                        TCP_TIMER_DISARM(tp, TCPT_REXMT);
                                        tp->t_rtttime = 0;
+                                       if (TCP_SACK_ENABLED(tp)) {
+                                               tp->t_dupacks = tcprexmtthresh;
+                                               tp->sack_newdata = tp->snd_nxt;
+                                               tp->snd_cwnd = tp->t_segsz;
+                                               (void) tcp_output(tp);
+                                               goto drop;
+                                       }
                                        tp->snd_nxt = th->th_ack;
                                        tp->snd_cwnd = tp->t_segsz;
                                        (void) tcp_output(tp);
@@ -2138,10 +2161,12 @@
                 * If the congestion window was inflated to account
                 * for the other side's cached packets, retract it.
                 */
-               if (!tcp_do_newreno)
+               if (TCP_SACK_ENABLED(tp))
+                       tcp_sack_newack(tp, th);
+               else if (tcp_do_newreno)
+                       tcp_newreno_newack(tp, th);
+               else
                        tcp_reno_newack(tp, th);
-               else
-                       tcp_newreno_newack(tp, th);
                if (SEQ_GT(th->th_ack, tp->snd_max)) {
                        tcpstat.tcps_rcvacktoomuch++;
                        goto dropafterack;
@@ -2212,6 +2237,8 @@
                }
                sowwakeup(so);
                tp->snd_una = th->th_ack;
+               if (SEQ_GT(tp->snd_una, tp->snd_fack))
+                       tp->snd_fack = tp->snd_una;
                if (SEQ_LT(tp->snd_nxt, tp->snd_una))
                        tp->snd_nxt = tp->snd_una;
                if (SEQ_LT(tp->snd_high, tp->snd_una))
@@ -2406,6 +2433,7 @@
                } else {
                        m_adj(m, hdroptlen);
                        tiflags = tcp_reass(tp, th, m, &tlen);
+                       tcp_update_sack_list(tp);
                        tp->t_flags |= TF_ACKNOW;
                }
                TCP_REASS_UNLOCK(tp);
@@ -2478,8 +2506,10 @@
        /*
         * Return any desired output.
         */
-       if (needoutput || (tp->t_flags & TF_ACKNOW))
+       if (needoutput || (tp->t_flags & TF_ACKNOW)) {
+               tcp_update_sack_list(tp);
                (void) tcp_output(tp);
+       }
        if (tcp_saveti)
                m_freem(tcp_saveti);
        return;
@@ -2515,6 +2545,7 @@
 dropafterack2:
        m_freem(m);
        tp->t_flags |= TF_ACKNOW;
+       tcp_update_sack_list(tp);
        (void) tcp_output(tp);
        if (tcp_saveti)
                m_freem(tcp_saveti);
@@ -2817,24 +2848,14 @@
                                continue;
                        if (!(th->th_flags & TH_SYN))
                                continue;
-                       tp->t_flags &= ~TF_CANT_TXSACK;
+                       if (tcp_do_sack) {
+                               tp->t_flags |= TF_SACK_PERMIT;
+                               tp->t_flags |= TF_WILL_SACK;
+                       }
                        break;
 
                case TCPOPT_SACK:
-                       if (tp->t_flags & TF_IGNR_RXSACK)
-                               continue;
-                       if (optlen % 8 != 2 || optlen < 10)
-                               continue;
-                       cp += 2;
-                       optlen -= 2;
-                       for (; optlen > 0; cp -= 8, optlen -= 8) {
-                               tcp_seq lwe, rwe;
-                               bcopy((char *)cp, (char *) &lwe, sizeof(lwe));
-                               NTOHL(lwe);
-                               bcopy((char *)cp, (char *) &rwe, sizeof(rwe));
-                               NTOHL(rwe);
-                               /* tcp_mark_sacked(tp, lwe, rwe); */
-                       }
+                       tcp_sack_option(tp, th, cp, optlen);
                        break;
 #ifdef TCP_SIGNATURE
                case TCPOPT_SIGNATURE:
@@ -3663,6 +3684,9 @@
        TCP_TIMER_ARM(tp, TCPT_KEEP, TCPTV_KEEP_INIT);
        tcpstat.tcps_accepts++;
 
+       if ((sc->sc_flags & SCF_SACK_PERMIT) && tcp_do_sack)
+               tp->t_flags |= TF_WILL_SACK;
+
 #ifdef TCP_SIGNATURE
        if (sc->sc_flags & SCF_SIGNATURE)
                tp->t_flags |= TF_SIGNATURE;
@@ -3952,6 +3976,8 @@
                sc->sc_requested_s_scale = 15;
                sc->sc_request_r_scale = 15;
        }
+       if ((tb.t_flags & TF_SACK_PERMIT) && tcp_do_sack)
+               sc->sc_flags |= SCF_SACK_PERMIT;
 #ifdef TCP_SIGNATURE
        if (tb.t_flags & TF_SIGNATURE)
                sc->sc_flags |= SCF_SIGNATURE;
@@ -4003,6 +4029,7 @@
 
        /* Compute the size of the TCP options. */
        optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
+           ((sc->sc_flags & SCF_SACK_PERMIT) ? (TCPOLEN_SACK_PERMITTED + 2) : 0) +
 #ifdef TCP_SIGNATURE
            ((sc->sc_flags & SCF_SIGNATURE) ? (TCPOLEN_SIGNATURE + 2) : 0) +
 #endif
@@ -4108,6 +4135,17 @@
                optp += TCPOLEN_TSTAMP_APPA;
        }
 
+       if (sc->sc_flags & SCF_SACK_PERMIT) {
+               u_int8_t *p = optp;
+
+               /* Let the peer know that we will SACK. */
+               p[0] = TCPOPT_SACK_PERMITTED;
+               p[1] = 2;
+               p[2] = TCPOPT_NOP;
+               p[3] = TCPOPT_NOP;
+               optp += 4;
+       }
+
 #ifdef TCP_SIGNATURE
        if (sc->sc_flags & SCF_SIGNATURE) {
                struct secasvar *sav;
diff -r 3f1b87aea669 -r e606b3e4972b sys/netinet/tcp_output.c
--- a/sys/netinet/tcp_output.c  Mon Feb 28 16:16:19 2005 +0000
+++ b/sys/netinet/tcp_output.c  Mon Feb 28 16:20:59 2005 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: tcp_output.c,v 1.117 2005/02/26 22:45:12 perry Exp $   */
+/*     $NetBSD: tcp_output.c,v 1.118 2005/02/28 16:20:59 jonathan Exp $        */
 
 /*
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
@@ -138,7 +138,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: tcp_output.c,v 1.117 2005/02/26 22:45:12 perry Exp $");
+__KERNEL_RCSID(0, "$NetBSD: tcp_output.c,v 1.118 2005/02/28 16:20:59 jonathan Exp $");
 
 #include "opt_inet.h"
 #include "opt_ipsec.h"
@@ -203,7 +203,7 @@
 extern struct mbuf *m_copypack();
 #endif
 
-#define MAX_TCPOPTLEN  32      /* max # bytes that go in options */
+#define MAX_TCPOPTLEN  40      /* max # bytes that go in options */
 
 /*
  * Knob to enable Congestion Window Monitoring, and control the
@@ -554,6 +554,9 @@
        int maxburst = TCP_MAXBURST;
        int af;         /* address family on the wire */
        int iphdrlen;
+       int sack_rxmit;
+       int sack_bytes_rxmt;
+       struct sackhole *p;
 #ifdef TCP_SIGNATURE
        int sigoff = 0;
 #endif
@@ -654,12 +657,70 @@
         * flags that should be used.  If there is some data or critical
         * controls (SYN, RST) to send, then transmit; otherwise,
         * investigate further.
+        *
+        * Readjust SACK information to avoid resending duplicate data.
         */
+       if (TCP_SACK_ENABLED(tp) && SEQ_LT(tp->snd_nxt, tp->snd_max))
+               tcp_sack_adjust(tp);
        sendalot = 0;
        off = tp->snd_nxt - tp->snd_una;
        win = min(tp->snd_wnd, tp->snd_cwnd);
 



Home | Main Index | Thread Index | Old Index