tech-net archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

Re: Trimming TCP options



On 01/04/11 11:48, Mihai Chelaru wrote:
> I'll keep an eye on the last counter in the following days.

I need to reboot this machine, so I have attached the latest stats and the
sys/netinet diff. I haven't seen anything strange during usage so far.

-- 
Mihai
tcp:
        46317610 packets sent
                36988340 data packets (27717538314 bytes)
                195387 data packets (159339998 bytes) retransmitted
                6355078 ack-only packets (7820889 delayed)
                0 URG only packets
                1396 window probe packets
                2420913 window update packets
                356502 control packets
                0 send attempts resulted in self-quench
        40556719 packets received
                27918469 acks (for 27628909944 bytes)
                0 duplicate acks
                229 acks for unsent data
                11497073 packets (10555150094 bytes) received in-sequence
                677061 completely duplicate packets (36428938 bytes)
                473 old duplicate packets
                33390 packets with some dup. data (22800833 bytes duped)
                673787 out-of-order packets (194682332 bytes)
                3 packets (1 byte) of data after window
                1 window probe
                27609 window update packets
                174177 packets received after close
                48 discarded for bad checksums
                0 discarded for bad header offset fields
                0 discarded because packet too short
        63831 connection requests
        208515 connection accepts
        250098 connections established (including accepts)
        274252 connections closed (including 3614 drops)
        6689 embryonic connections dropped
        15832 SYN options degraded
        161 connected with no options
        0 delayed frees of tcpcb
        22994926 segments updated rtt (of 21916325 attempts)
        174389 retransmit timeouts
                1215 connections dropped by rexmit timeout
        1773 persist timeouts (resulting in 2 dropped connections)
        1337 keepalive timeouts
                77 keepalive probes sent
                1228 connections dropped by keepalive
        1842450 correct ACK header predictions
        9056728 correct data packet header predictions
        779054 PCB hash misses
        17220 dropped due to no socket
        167 connections drained due to memory shortage
        3847 PMTUD blackholes detected
        319271 bad connection attempts
        210313 SYN cache entries added
                0 hash collisions
                208515 completed
                0 aborted (no space to build PCB)
                1436 timed out
                0 dropped due to overflow
                0 dropped due to bucket overflow
                360 dropped due to RST
                0 dropped due to ICMP unreachable
                208938 delayed free of SYN cache entries
        9849 SYN,ACKs retransmitted
        3891 duplicate SYNs received for entries already in the cache
        61 SYNs dropped (no route or no space)
        0 packets with bad signature
        137693 packets with good signature
        0 sucessful ECN handshakes
        0 packets with ECN CE bit
        0 packets ECN ECT(0) bit
Index: tcp_input.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_input.c,v
retrieving revision 1.306
diff -u -p -r1.306 tcp_input.c
--- tcp_input.c 2 Dec 2010 19:07:27 -0000       1.306
+++ tcp_input.c 6 Jan 2011 17:22:27 -0000
@@ -1987,6 +1987,8 @@ after_listen:
                tcp_rmx_rtt(tp);
                if (tiflags & TH_ACK) {
                        TCP_STATINC(TCP_STAT_CONNECTS);
+                       if (tp->t_rxtshift > TCP_SYN_RET_NOOPT)
+                               TCP_STATINC(TCP_STAT_NOOPT_CON);
                        soisconnected(so);
                        tcp_established(tp);
                        /* Do window scaling on this connection? */
@@ -2312,6 +2314,8 @@ after_listen:
                    SEQ_GT(th->th_ack, tp->snd_max))
                        goto dropwithreset;
                TCP_STATINC(TCP_STAT_CONNECTS);
+               if (tp->t_rxtshift > TCP_SYN_RET_NOOPT)
+                       TCP_STATINC(TCP_STAT_NOOPT_CON);
                soisconnected(so);
                tcp_established(tp);
                /* Do window scaling? */
@@ -4347,30 +4351,24 @@ syn_cache_respond(struct syn_cache *sc, 
        *optp++ = sc->sc_ourmaxseg & 0xff;
 
        if (sc->sc_request_r_scale != 15) {
-               *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
-                   TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
-                   sc->sc_request_r_scale);
-               optp += 4;
+               *((u_int32_t *)optp) = htonl(TCPOPT_WINDOW << 24 |
+                   TCPOLEN_WINDOW << 16 | sc->sc_request_r_scale << 8);
+               optp += 3;
        }
 
        if (sc->sc_flags & SCF_TIMESTAMP) {
+               *optp++ = TCPOPT_TIMESTAMP;
+               *optp++ = TCPOLEN_TIMESTAMP;
                u_int32_t *lp = (u_int32_t *)(optp);
-               /* Form timestamp option as shown in appendix A of RFC 1323. */
-               *lp++ = htonl(TCPOPT_TSTAMP_HDR);
                *lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
                *lp   = htonl(sc->sc_timestamp);
-               optp += TCPOLEN_TSTAMP_APPA;
+               optp += TCPOLEN_TIMESTAMP - 2;
        }
 
        if (sc->sc_flags & SCF_SACK_PERMIT) {
-               u_int8_t *p = optp;
-
                /* Let the peer know that we will SACK. */
-               p[0] = TCPOPT_SACK_PERMITTED;
-               p[1] = 2;
-               p[2] = TCPOPT_NOP;
-               p[3] = TCPOPT_NOP;
-               optp += 4;
+               *optp++ = TCPOPT_SACK_PERMITTED;
+               *optp++ = 2;
        }
 
        /*
@@ -4440,8 +4438,6 @@ syn_cache_respond(struct syn_cache *sc, 
                sigp = optp;
                memset(optp, 0, TCP_SIGLEN);
                optp += TCP_SIGLEN;
-               *optp++ = TCPOPT_NOP;
-               *optp++ = TCPOPT_EOL;
 
                (void)tcp_signature(m, th, hlen, sav, sigp);
 
@@ -4453,7 +4449,12 @@ syn_cache_respond(struct syn_cache *sc, 
 #endif
        }
 #endif
-
+       /* Align options to 32-bit boundary */
+       if ((optp - (u_int8_t *)(th + 1)) % 4) {
+               *optp++ = TCPOPT_EOL;
+               while((optp - (u_int8_t *)(th + 1)) % 4)
+                       *optp++ = 0;
+       }
        /* Compute the packet's checksum. */
        switch (sc->sc_src.sa.sa_family) {
        case AF_INET:
Index: tcp_output.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_output.c,v
retrieving revision 1.169
diff -u -p -r1.169 tcp_output.c
--- tcp_output.c        26 Jan 2010 18:09:08 -0000      1.169
+++ tcp_output.c        6 Jan 2011 17:22:27 -0000
@@ -1134,28 +1134,31 @@ send:
                        opt[3] = tp->t_ourmss & 0xff;
                        optlen = 4;
 
+                       if (tp->t_rxtshift > TCP_SYN_RET_NOOPT) {
+                               tp->t_flags |= TF_NOOPT;
+                               TCP_STATINC(TCP_STAT_SYN_NOOPT);
+                               goto outopts;
+                       }
                        if ((tp->t_flags & TF_REQ_SCALE) &&
                            ((flags & TH_ACK) == 0 ||
                            (tp->t_flags & TF_RCVD_SCALE))) {
                                *((u_int32_t *) (opt + optlen)) = htonl(
-                                       TCPOPT_NOP << 24 |
-                                       TCPOPT_WINDOW << 16 |
-                                       TCPOLEN_WINDOW << 8 |
-                                       tp->request_r_scale);
-                               optlen += 4;
+                                       TCPOPT_WINDOW << 24 |
+                                       TCPOLEN_WINDOW << 16 |
+                                       tp->request_r_scale << 8);
+                               optlen += 3;
                        }
                        if (tcp_do_sack) {
                                u_int8_t *cp = (u_int8_t *)(opt + optlen);
 
                                cp[0] = TCPOPT_SACK_PERMITTED;
                                cp[1] = 2;
-                               cp[2] = TCPOPT_NOP;
-                               cp[3] = TCPOPT_NOP;
-                               optlen += 4;
+                               optlen += 2;
                        }
                }
        }
 
+outopts:
        /*
         * Send a timestamp and echo-reply if this is a SYN and our side
         * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
@@ -1165,13 +1168,24 @@ send:
             (flags & TH_RST) == 0 &&
            ((flags & (TH_SYN|TH_ACK)) == TH_SYN ||
             (tp->t_flags & TF_RCVD_TSTMP))) {
-               u_int32_t *lp = (u_int32_t *)(opt + optlen);
-
-               /* Form timestamp option as shown in appendix A of RFC 1323. */
-               *lp++ = htonl(TCPOPT_TSTAMP_HDR);
+               u_int32_t *lp;
+               if ((flags & (TH_SYN|TH_ACK)) == TH_SYN) {
+                       u_char *bp = (u_char *)(opt + optlen);
+                       bp[0] = TCPOPT_TIMESTAMP;
+                       bp[1] = TCPOLEN_TIMESTAMP;
+                       lp = (u_int32_t *)(opt + optlen + 2);
+                       optlen += TCPOLEN_TIMESTAMP;
+               } else {
+                       /*
+                        * Form timestamp option as shown in
+                        * appendix A of RFC 1323.
+                        */
+                       lp = (u_int32_t *)(opt + optlen);
+                       *lp++ = htonl(TCPOPT_TSTAMP_HDR);
+                       optlen += TCPOLEN_TSTAMP_APPA;
+               }
                *lp++ = htonl(TCP_TIMESTAMP(tp));
                *lp   = htonl(tp->ts_recent);
-               optlen += TCPOLEN_TSTAMP_APPA;
 
                /* Set receive buffer autosizing timestamp. */
                if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE))
@@ -1184,14 +1198,12 @@ send:
        if (sack_numblks) {
                int sack_len;
                u_char *bp = (u_char *)(opt + optlen);
-               u_int32_t *lp = (u_int32_t *)(bp + 4);
+               u_int32_t *lp = (u_int32_t *)(bp + 2);
                struct ipqent *tiqe;
 
                sack_len = sack_numblks * 8 + 2;
-               bp[0] = TCPOPT_NOP;
-               bp[1] = TCPOPT_NOP;
-               bp[2] = TCPOPT_SACK;
-               bp[3] = sack_len;
+               bp[0] = TCPOPT_SACK;
+               bp[1] = sack_len;
                if ((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) {
                        sack_numblks--;
                        *lp++ = htonl(tp->rcv_dsack_block.left);
@@ -1206,32 +1218,39 @@ send:
                        *lp++ = htonl(tiqe->ipqe_seq + tiqe->ipqe_len +
                            ((tiqe->ipqe_flags & TH_FIN) != 0 ? 1 : 0));
                }
-               optlen += sack_len + 2;
+               optlen += sack_len;
        }
        TCP_REASS_UNLOCK(tp);
 
 #ifdef TCP_SIGNATURE
        if (tp->t_flags & TF_SIGNATURE) {
                u_char *bp;
+               if (optlen + TCPOLEN_SIGNATURE > MAX_TCPOPTLEN)
+                       return EINVAL;
                /*
                 * Initialize TCP-MD5 option (RFC2385)
                 */
-               bp = (u_char *)opt + optlen;
+               bp = (u_char *)(opt + optlen);
                *bp++ = TCPOPT_SIGNATURE;
                *bp++ = TCPOLEN_SIGNATURE;
                sigoff = optlen + 2;
                memset(bp, 0, TCP_SIGLEN);
-               bp += TCP_SIGLEN;
                optlen += TCPOLEN_SIGNATURE;
-               /*
-                * Terminate options list and maintain 32-bit alignment.
-                */
-               *bp++ = TCPOPT_NOP;
-               *bp++ = TCPOPT_EOL;
-               optlen += 2;
        }
 #endif /* TCP_SIGNATURE */
 
+       /*
+        * Terminate options list and maintain 32-bit alignment.
+        */
+       if (optlen % 4) {
+               u_char *bp;
+               bp = (u_char*)opt + optlen;
+               *bp++ = TCPOPT_EOL;
+               optlen++;
+               for (; optlen % 4; optlen++, bp++)
+                       *bp = 0;
+       }
+
        hdrlen += optlen;
 
 #ifdef DIAGNOSTIC
Index: tcp_subr.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_subr.c,v
retrieving revision 1.238
diff -u -p -r1.238 tcp_subr.c
--- tcp_subr.c  16 Sep 2009 15:23:05 -0000      1.238
+++ tcp_subr.c  6 Jan 2011 17:22:28 -0000
@@ -731,6 +731,13 @@ tcp_respond(struct tcpcb *tp, struct mbu
                /* clear h/w csum flags inherited from rx packet */
                m->m_pkthdr.csum_flags = 0;
 
+#ifdef TCP_SIGNATURE
+               if (tp != NULL && (tp->t_flags & TF_SIGNATURE) != 0 &&
+                   ((flags & TH_SYN) == 0 ||
+                   sizeof(*th0) + TCPOLEN_SIGNATURE > (th0->th_off << 2)))
+                       tlen = sizeof(*th0) + TCPOLEN_SIGNATURE;
+               else
+#endif
                if ((flags & TH_SYN) == 0 || sizeof(*th0) > (th0->th_off << 2))
                        tlen = sizeof(*th0);
                else
@@ -773,6 +780,16 @@ tcp_respond(struct tcpcb *tp, struct mbu
                        m = n;
                        n = NULL;
                }
+#ifdef TCP_SIGNATURE
+               if (tp != NULL && (tp->t_flags & TF_SIGNATURE) != 0) {
+                       u_char *sigplace;
+                       sigplace = mtod(m, u_char *);
+                       sigplace += hlen + tlen - TCPOLEN_SIGLEN;
+                       memset(sigplace, 0, TCPOLEN_SIGLEN);
+                       sigplace[0] = TCPOPT_SIGNATURE;
+                       sigplace[1] = TCPOLEN_SIGNATURE;
+               }
+#endif
 
 #define xchg(a,b,type) { type t; t=a; a=b; b=t; }
                switch (family) {
@@ -814,6 +831,10 @@ tcp_respond(struct tcpcb *tp, struct mbu
                th->th_win = htons((u_int16_t)win);
                th->th_off = sizeof (struct tcphdr) >> 2;
                tlen += sizeof(*th);
+#ifdef TCP_SIGNATURE
+               if (tp != NULL && (tp->t_flags & TF_SIGNATURE) != 0)
+                       tlen += TCPOLEN_SIGNATURE;
+#endif
        } else
                tlen += th->th_off << 2;
        m->m_len = hlen + tlen;
Index: tcp_var.h
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_var.h,v
retrieving revision 1.162
diff -u -p -r1.162 tcp_var.h
--- tcp_var.h   16 Sep 2009 15:23:05 -0000      1.162
+++ tcp_var.h   6 Jan 2011 17:22:28 -0000
@@ -643,8 +643,17 @@ struct syn_cache_head {
 #define        TCP_STAT_ECN_SHS        73      /* # of successful ECN handshakes */
 #define        TCP_STAT_ECN_CE         74      /* # of packets with CE bit */
 #define        TCP_STAT_ECN_ECT        75      /* # of packets with ECT(0) bit */
+#define        TCP_STAT_SYN_NOOPT      76      /* SYN options downgrades */
+#define        TCP_STAT_NOOPT_CON      77      /* Conns established after RET_NOOPT */
 
-#define        TCP_NSTATS              76
+#define        TCP_NSTATS              78
+
+/*
+ * Two SYNs sent with full options
+ * The next one only with MSS
+ * The following sent with no options at all
+ */
+#define TCP_SYN_RET_NOOPT      1
 
 /*
  * Names for TCP sysctl objects.


Home | Main Index | Thread Index | Old Index