tech-net archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
TCP_INFO socket option
Hi,
I noticed that iperf3 on Linux (and presumably FreeBSD) can
display the number of retransmits and the current congestion
window for each measurement status interval for the TCP session
being tested.
The info for this comes from a TCP_INFO socket option initially
added in Linux 2.6. The attached diff tries to implement this
for NetBSD (pulled from FreeBSD and then adapted), and the
resulting diff relative to netbsd-7 follows below. The result
builds, at least -- that's as far as I've tested.
In order to implement this, I had to add three variables per
tcpcb. I've not been able to find where to initialize these, or
whether they would already be zeroed on allocation of the tcpcb.
Any comments?
Regards,
- Håvard
Index: tcp.h
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp.h,v
retrieving revision 1.30
diff -u -r1.30 tcp.h
--- tcp.h 7 Jan 2012 20:20:22 -0000 1.30
+++ tcp.h 6 Feb 2015 23:04:52 -0000
@@ -127,7 +127,80 @@
#ifdef notyet
#define TCP_NOOPT 8 /* reserved for FreeBSD compat */
#endif
+#define TCP_INFO 9 /* retrieve tcp_info structure */
#define TCP_MD5SIG 0x10 /* use MD5 digests (RFC2385) */
#define TCP_CONGCTL 0x20 /* selected congestion control */
+#define TCPI_OPT_TIMESTAMPS 0x01
+#define TCPI_OPT_SACK 0x02
+#define TCPI_OPT_WSCALE 0x04
+#define TCPI_OPT_ECN 0x08
+#define TCPI_OPT_TOE 0x10
+
+/*
+ * The TCP_INFO socket option comes from the Linux 2.6 TCP API, and permits
+ * the caller to query certain information about the state of a TCP
+ * connection. We provide an overlapping set of fields with the Linux
+ * implementation, but since this is a fixed size structure, room has been
+ * left for growth. In order to maximize potential future compatibility with
+ * the Linux API, the same variable names and order have been adopted, and
+ * padding left to make room for omitted fields in case they are added later.
+ * Fields named __tcpi_* are such placeholders; tcp_fill_info() zeroes them.
+ *
+ * XXX: This is an unstable ABI/API, in that it is expected to change.
+ */
+struct tcp_info {
+ uint8_t tcpi_state; /* TCP FSM state. */
+ uint8_t __tcpi_ca_state; /* Placeholder. */
+ uint8_t __tcpi_retransmits; /* Placeholder. */
+ uint8_t __tcpi_probes; /* Placeholder. */
+ uint8_t __tcpi_backoff; /* Placeholder. */
+ uint8_t tcpi_options; /* Options enabled on conn (TCPI_OPT_* bits). */
+ uint8_t tcpi_snd_wscale:4, /* RFC1323 send shift value. */
+ tcpi_rcv_wscale:4; /* RFC1323 recv shift value. */
+
+ uint32_t tcpi_rto; /* Retransmission timeout (usec). */
+ uint32_t __tcpi_ato; /* Placeholder. */
+ uint32_t tcpi_snd_mss; /* Max segment size for send. */
+ uint32_t tcpi_rcv_mss; /* Max segment size for receive. */
+
+ uint32_t __tcpi_unacked; /* Placeholder. */
+ uint32_t __tcpi_sacked; /* Placeholder. */
+ uint32_t __tcpi_lost; /* Placeholder. */
+ uint32_t __tcpi_retrans; /* Placeholder. */
+ uint32_t __tcpi_fackets; /* Placeholder. */
+
+ /* Times; measurements in usecs. */
+ uint32_t __tcpi_last_data_sent; /* Placeholder. */
+ uint32_t __tcpi_last_ack_sent; /* Also unimpl. on Linux? */
+ uint32_t tcpi_last_data_recv; /* Time since last recv data. */
+ uint32_t __tcpi_last_ack_recv; /* Placeholder. */
+
+ /* Metrics; variable units. */
+ uint32_t __tcpi_pmtu; /* Placeholder. */
+ uint32_t __tcpi_rcv_ssthresh; /* Placeholder. */
+ uint32_t tcpi_rtt; /* Smoothed RTT in usecs. */
+ uint32_t tcpi_rttvar; /* RTT variance in usecs. */
+ uint32_t tcpi_snd_ssthresh; /* Slow start threshold. */
+ uint32_t tcpi_snd_cwnd; /* Send congestion window. */
+ uint32_t __tcpi_advmss; /* Placeholder. */
+ uint32_t __tcpi_reordering; /* Placeholder. */
+
+ uint32_t __tcpi_rcv_rtt; /* Placeholder. */
+ uint32_t tcpi_rcv_space; /* Advertised recv window. */
+
+ /* FreeBSD/NetBSD extensions to tcp_info. */
+ uint32_t tcpi_snd_wnd; /* Advertised send window. */
+ uint32_t tcpi_snd_bwnd; /* No longer used. */
+ uint32_t tcpi_snd_nxt; /* Next egress seqno */
+ uint32_t tcpi_rcv_nxt; /* Next ingress seqno */
+ uint32_t tcpi_toe_tid; /* HWTID for TOE endpoints; not set by tcp_fill_info() */
+ uint32_t tcpi_snd_rexmitpack; /* Retransmitted packets */
+ uint32_t tcpi_rcv_ooopack; /* Out-of-order packets */
+ uint32_t tcpi_snd_zerowin; /* Zero-sized windows sent */
+
+ /* Padding to grow without breaking ABI. */
+ uint32_t __tcpi_pad[26]; /* Padding. */
+};
+
#endif /* !_NETINET_TCP_H_ */
Index: tcp_input.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_input.c,v
retrieving revision 1.334
diff -u -r1.334 tcp_input.c
--- tcp_input.c 8 Aug 2014 03:05:45 -0000 1.334
+++ tcp_input.c 6 Feb 2015 23:04:52 -0000
@@ -738,6 +738,7 @@
/*
* Update the counters.
*/
+ tp->t_rcvoopack++;
tcps = TCP_STAT_GETREF();
tcps[TCP_STAT_RCVOOPACK]++;
tcps[TCP_STAT_RCVOOBYTE] += rcvoobyte;
Index: tcp_output.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_output.c,v
retrieving revision 1.176.2.3
diff -u -r1.176.2.3 tcp_output.c
--- tcp_output.c 17 Jan 2015 12:10:53 -0000 1.176.2.3
+++ tcp_output.c 6 Feb 2015 23:04:52 -0000
@@ -439,6 +439,7 @@
if (tp->t_force && len == 1)
tcps[TCP_STAT_SNDPROBE]++;
else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
+ tp->t_sndrexmitpack++;
tcps[TCP_STAT_SNDREXMITPACK]++;
tcps[TCP_STAT_SNDREXMITBYTE] += len;
} else {
@@ -1401,6 +1402,9 @@
if (win < (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt))
win = (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt);
th->th_win = htons((u_int16_t) (win>>tp->rcv_scale));
+ if (th->th_win == 0) {
+ tp->t_sndzerowin++;
+ }
if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
u_int32_t urp = tp->snd_up - tp->snd_nxt;
if (urp > IP_MAXPACKET)
Index: tcp_usrreq.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_usrreq.c,v
retrieving revision 1.200.2.1
diff -u -r1.200.2.1 tcp_usrreq.c
--- tcp_usrreq.c 17 Jan 2015 12:10:53 -0000 1.200.2.1
+++ tcp_usrreq.c 6 Feb 2015 23:04:52 -0000
@@ -119,6 +119,7 @@
#include <sys/domain.h>
#include <sys/sysctl.h>
#include <sys/kauth.h>
+#include <sys/kernel.h>
#include <sys/uidinfo.h>
#include <net/if.h>
@@ -271,6 +272,61 @@
TCP_TIMER_ARM(tp, TCPT_2MSL, tp->t_maxidle);
}
+/*
+ * Fill *ti with a snapshot of tp's state for the TCP_INFO socket option, in
+ * the style of the Linux 2.6 API. Not ABI compatible with Linux: values are
+ * NetBSD-native, so e.g. the numeric values for tcpi_state differ. *ti is
+ * zeroed first, so every field not assigned below is reported as 0.
+ * NOTE(review): the time conversions assume tick-based units -- confirm.
+ */
+static void
+tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti)
+{
+
+ bzero(ti, sizeof(*ti)); /* Unassigned fields and padding read as 0. */
+
+ ti->tcpi_state = tp->t_state;
+ if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP)) /* Requested and received. */
+ ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
+ if (tp->t_flags & TF_SACK_PERMIT)
+ ti->tcpi_options |= TCPI_OPT_SACK;
+ if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) { /* Requested and received. */
+ ti->tcpi_options |= TCPI_OPT_WSCALE;
+ ti->tcpi_snd_wscale = tp->snd_scale; /* Stored in a 4-bit field. */
+ ti->tcpi_rcv_wscale = tp->rcv_scale; /* Stored in a 4-bit field. */
+ }
+ if (tp->t_flags & TF_ECN_PERMIT) {
+ ti->tcpi_options |= TCPI_OPT_ECN;
+ }
+
+ ti->tcpi_rto = tp->t_rxtcur * tick; /* NOTE(review): assumes t_rxtcur counts hardclock ticks and tick is usec per tick -- confirm. */
+ ti->tcpi_last_data_recv = (long)(hardclock_ticks -
+ (int)tp->t_rcvtime) * tick; /* NOTE(review): assumes t_rcvtime is in hardclock_ticks units -- confirm. */
+ ti->tcpi_rtt = ((u_int64_t)tp->t_srtt * tick) >> TCP_RTT_SHIFT; /* Widen to 64 bits before multiplying, then shift. */
+ ti->tcpi_rttvar = ((u_int64_t)tp->t_rttvar * tick) >> TCP_RTTVAR_SHIFT; /* Same widening, then shift. */
+
+ ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
+ ti->tcpi_snd_cwnd = tp->snd_cwnd;
+
+ /*
+ * Remaining fields, mostly the FreeBSD/NetBSD extensions to tcp_info.
+ */
+ ti->tcpi_rcv_space = tp->rcv_wnd;
+ ti->tcpi_rcv_nxt = tp->rcv_nxt;
+ ti->tcpi_snd_wnd = tp->snd_wnd;
+ ti->tcpi_snd_bwnd = 0; /* Unused, kept for compat. */
+ ti->tcpi_snd_nxt = tp->snd_nxt;
+ ti->tcpi_snd_mss = tp->t_segsz; /* Same t_segsz value reported ... */
+ ti->tcpi_rcv_mss = tp->t_segsz; /* ... for both directions. */
+#ifdef TF_TOE /* Compiled in only when TF_TOE is defined. */
+ if (tp->t_flags & TF_TOE)
+ ti->tcpi_options |= TCPI_OPT_TOE;
+#endif
+ ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack; /* Per-connection counters from the tcpcb. */
+ ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
+ ti->tcpi_snd_zerowin = tp->t_sndzerowin;
+}
+
int
tcp_ctloutput(int op, struct socket *so, struct sockopt *sopt)
{
@@ -280,6 +336,7 @@
struct in6pcb *in6p;
#endif
struct tcpcb *tp;
+ struct tcp_info ti;
u_int ui;
int family; /* family of the socket */
int level, optname, optval;
@@ -450,6 +507,10 @@
optval = tp->t_peermss;
error = sockopt_set(sopt, &optval, sizeof(optval));
break;
+ case TCP_INFO:
+ tcp_fill_info(tp, &ti);
+ error = sockopt_set(sopt, &ti, sizeof ti);
+ break;
#ifdef notyet
case TCP_CONGCTL:
break;
Index: tcp_var.h
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_var.h,v
retrieving revision 1.175
diff -u -r1.175 tcp_var.h
--- tcp_var.h 31 Jul 2014 03:39:35 -0000 1.175
+++ tcp_var.h 6 Feb 2015 23:04:52 -0000
@@ -364,6 +364,11 @@
u_int t_maxidle; /* t_keepcnt * t_keepintvl */
u_int t_msl; /* MSL to use for this connexion */
+
+ /* Per-connection counterparts of some global TCP statistics, for TCP_INFO. */
+ int t_rcvoopack; /* out-of-order packets received */
+ int t_sndrexmitpack; /* retransmit packets sent */
+ int t_sndzerowin; /* zero-window updates sent */
};
/*
Home |
Main Index |
Thread Index |
Old Index