tech-net archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

TCP_INFO socket option



Hi,

I noticed that iperf3 on Linux (and presumably FreeBSD) can
display the number of retransmits and the current congestion
window for each measurement status interval for the TCP session
being tested.

The info for this comes from the TCP_INFO socket option, initially
added in Linux 2.6.  I have tried to implement this for NetBSD
(pulled from FreeBSD and then adapted); the resulting diff relative
to netbsd-7 follows below.  The result builds, at least -- that's
as far as I've tested so far.

In order to implement this, I had to add three variables per
tcpcb.  I've not been able to find where to initialize these, or
whether they would already be zeroed on allocation of the tcpcb.

Any comments?

Regards,

- Håvard
Index: tcp.h
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp.h,v
retrieving revision 1.30
diff -u -r1.30 tcp.h
--- tcp.h	7 Jan 2012 20:20:22 -0000	1.30
+++ tcp.h	6 Feb 2015 23:04:52 -0000
@@ -127,7 +127,80 @@
 #ifdef notyet
 #define	TCP_NOOPT	8	/* reserved for FreeBSD compat */
 #endif
+#define	TCP_INFO	9	/* retrieve tcp_info structure */
 #define	TCP_MD5SIG	0x10	/* use MD5 digests (RFC2385) */
 #define	TCP_CONGCTL	0x20	/* selected congestion control */
 
+#define	TCPI_OPT_TIMESTAMPS	0x01	/* tcpi_options: timestamps */
+#define	TCPI_OPT_SACK		0x02	/* tcpi_options: SACK */
+#define	TCPI_OPT_WSCALE		0x04	/* tcpi_options: window scaling */
+#define	TCPI_OPT_ECN		0x08	/* tcpi_options: ECN */
+#define	TCPI_OPT_TOE		0x10	/* tcpi_options: TOE */
+
+/*
+ * The TCP_INFO socket option comes from the Linux 2.6 TCP API, and permits
+ * the caller to query certain information about the state of a TCP
+ * connection.  We provide an overlapping set of fields with the Linux
+ * implementation, but since this is a fixed size structure, room has been
+ * left for growth.  In order to maximize potential future compatibility with
+ * the Linux API, the same variable names and order have been adopted, and
+ * the __tcpi_* members reserve room for fields that are omitted for now.
+ *
+ * XXX: This is currently an unstable ABI/API, in that it is expected to
+ * change.
+ */
+struct tcp_info {
+	uint8_t		tcpi_state; /* TCP FSM state. */
+	uint8_t		__tcpi_ca_state;
+	uint8_t		__tcpi_retransmits;
+	uint8_t		__tcpi_probes;
+	uint8_t		__tcpi_backoff;
+	uint8_t		tcpi_options;	       /* Options enabled on conn. */
+	uint8_t		tcpi_snd_wscale:4,	/* RFC1323 send shift value. */
+			tcpi_rcv_wscale:4; /* RFC1323 recv shift value. */
+
+	uint32_t	tcpi_rto;		/* Retransmission timeout (usec). */
+	uint32_t	__tcpi_ato;
+	uint32_t	tcpi_snd_mss;		/* Max segment size for send. */
+	uint32_t	tcpi_rcv_mss;		/* Max segment size for receive. */
+
+	uint32_t	__tcpi_unacked;
+	uint32_t	__tcpi_sacked;
+	uint32_t	__tcpi_lost;
+	uint32_t	__tcpi_retrans;
+	uint32_t	__tcpi_fackets;
+
+	/* Times; measurements in usecs. */
+	uint32_t	__tcpi_last_data_sent;
+	uint32_t	__tcpi_last_ack_sent;	/* Also unimpl. on Linux? */
+	uint32_t	tcpi_last_data_recv;	/* Time since last recv data. */
+	uint32_t	__tcpi_last_ack_recv;
+
+	/* Metrics; variable units. */
+	uint32_t	__tcpi_pmtu;
+	uint32_t	__tcpi_rcv_ssthresh;
+	uint32_t	tcpi_rtt;		/* Smoothed RTT in usecs. */
+	uint32_t	tcpi_rttvar;		/* RTT variance in usecs. */
+	uint32_t	tcpi_snd_ssthresh;	/* Slow start threshold. */
+	uint32_t	tcpi_snd_cwnd;		/* Send congestion window. */
+	uint32_t	__tcpi_advmss;
+	uint32_t	__tcpi_reordering;
+
+	uint32_t	__tcpi_rcv_rtt;
+	uint32_t	tcpi_rcv_space;		/* Advertised recv window. */
+
+	/* FreeBSD/NetBSD extensions to tcp_info. */
+	uint32_t	tcpi_snd_wnd;		/* Advertised send window. */
+	uint32_t	tcpi_snd_bwnd;		/* No longer used. */
+	uint32_t	tcpi_snd_nxt;		/* Next egress seqno */
+	uint32_t	tcpi_rcv_nxt;		/* Next ingress seqno */
+	uint32_t	tcpi_toe_tid;		/* HWTID for TOE endpoints */
+	uint32_t	tcpi_snd_rexmitpack;	/* Retransmitted packets */
+	uint32_t	tcpi_rcv_ooopack;	/* Out-of-order packets */
+	uint32_t	tcpi_snd_zerowin;	/* Zero-sized windows sent */
+	
+	/* Padding to grow without breaking ABI. */
+	uint32_t	__tcpi_pad[26];		/* Padding. */
+};
+
 #endif /* !_NETINET_TCP_H_ */
Index: tcp_input.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_input.c,v
retrieving revision 1.334
diff -u -r1.334 tcp_input.c
--- tcp_input.c	8 Aug 2014 03:05:45 -0000	1.334
+++ tcp_input.c	6 Feb 2015 23:04:52 -0000
@@ -738,6 +738,7 @@
 	/*
 	 * Update the counters.
 	 */
+	tp->t_rcvoopack++;
 	tcps = TCP_STAT_GETREF();
 	tcps[TCP_STAT_RCVOOPACK]++;
 	tcps[TCP_STAT_RCVOOBYTE] += rcvoobyte;
Index: tcp_output.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_output.c,v
retrieving revision 1.176.2.3
diff -u -r1.176.2.3 tcp_output.c
--- tcp_output.c	17 Jan 2015 12:10:53 -0000	1.176.2.3
+++ tcp_output.c	6 Feb 2015 23:04:52 -0000
@@ -439,6 +439,7 @@
 	if (tp->t_force && len == 1)
 		tcps[TCP_STAT_SNDPROBE]++;
 	else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
+		tp->t_sndrexmitpack++;
 		tcps[TCP_STAT_SNDREXMITPACK]++;
 		tcps[TCP_STAT_SNDREXMITBYTE] += len;
 	} else {
@@ -1401,6 +1402,9 @@
 	if (win < (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt))
 		win = (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt);
 	th->th_win = htons((u_int16_t) (win>>tp->rcv_scale));
+	if (th->th_win == 0) {
+		tp->t_sndzerowin++;
+	}
 	if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
 		u_int32_t urp = tp->snd_up - tp->snd_nxt;
 		if (urp > IP_MAXPACKET)
Index: tcp_usrreq.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_usrreq.c,v
retrieving revision 1.200.2.1
diff -u -r1.200.2.1 tcp_usrreq.c
--- tcp_usrreq.c	17 Jan 2015 12:10:53 -0000	1.200.2.1
+++ tcp_usrreq.c	6 Feb 2015 23:04:52 -0000
@@ -119,6 +119,7 @@
 #include <sys/domain.h>
 #include <sys/sysctl.h>
 #include <sys/kauth.h>
+#include <sys/kernel.h>
 #include <sys/uidinfo.h>
 
 #include <net/if.h>
@@ -271,6 +272,61 @@
 		TCP_TIMER_ARM(tp, TCPT_2MSL, tp->t_maxidle);
 }
 
+/*
+ * Export TCP internal state via a struct tcp_info, based on the Linux 2.6
+ * API.  Not ABI compatible, as our native constants (TCP state machine, etc)
+ * are mapped differently: e.g. tcpi_state values differ from Linux.  *ti is
+ * zeroed first, so fields not assigned below read as 0.  NOTE(review): the
+ * scaling by tick assumes t_rxtcur/t_srtt/t_rttvar are in hz ticks; confirm.
+ */
+static void
+tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti)
+{
+
+	bzero(ti, sizeof(*ti));
+
+	ti->tcpi_state = tp->t_state;
+	if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
+		ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
+	if (tp->t_flags & TF_SACK_PERMIT)
+		ti->tcpi_options |= TCPI_OPT_SACK;
+	if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
+		ti->tcpi_options |= TCPI_OPT_WSCALE;
+		ti->tcpi_snd_wscale = tp->snd_scale;
+		ti->tcpi_rcv_wscale = tp->rcv_scale;
+	}
+	if (tp->t_flags & TF_ECN_PERMIT) {
+		ti->tcpi_options |= TCPI_OPT_ECN;
+	}
+
+	ti->tcpi_rto = tp->t_rxtcur * tick;
+	ti->tcpi_last_data_recv = (long)(hardclock_ticks -
+					 (int)tp->t_rcvtime) * tick;
+	ti->tcpi_rtt = ((u_int64_t)tp->t_srtt * tick) >> TCP_RTT_SHIFT;
+	ti->tcpi_rttvar = ((u_int64_t)tp->t_rttvar * tick) >> TCP_RTTVAR_SHIFT;
+
+	ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
+	ti->tcpi_snd_cwnd = tp->snd_cwnd;
+
+	/*
+	 * Remaining fields, including the FreeBSD/NetBSD extensions to tcp_info.
+	 */
+	ti->tcpi_rcv_space = tp->rcv_wnd;
+	ti->tcpi_rcv_nxt = tp->rcv_nxt;
+	ti->tcpi_snd_wnd = tp->snd_wnd;
+	ti->tcpi_snd_bwnd = 0;		/* Unused, kept for compat. */
+	ti->tcpi_snd_nxt = tp->snd_nxt;
+	ti->tcpi_snd_mss = tp->t_segsz;
+	ti->tcpi_rcv_mss = tp->t_segsz;
+#ifdef TF_TOE
+	if (tp->t_flags & TF_TOE)
+		ti->tcpi_options |= TCPI_OPT_TOE;
+#endif
+	ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
+	ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
+	ti->tcpi_snd_zerowin = tp->t_sndzerowin;
+}
+
 int
 tcp_ctloutput(int op, struct socket *so, struct sockopt *sopt)
 {
@@ -280,6 +336,7 @@
 	struct in6pcb *in6p;
 #endif
 	struct tcpcb *tp;
+	struct tcp_info ti;
 	u_int ui;
 	int family;	/* family of the socket */
 	int level, optname, optval;
@@ -450,6 +507,10 @@
 			optval = tp->t_peermss;
 			error = sockopt_set(sopt, &optval, sizeof(optval));
 			break;
+		case TCP_INFO:
+			tcp_fill_info(tp, &ti);
+			error = sockopt_set(sopt, &ti, sizeof ti);
+			break;
 #ifdef notyet
 		case TCP_CONGCTL:
 			break;
Index: tcp_var.h
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_var.h,v
retrieving revision 1.175
diff -u -r1.175 tcp_var.h
--- tcp_var.h	31 Jul 2014 03:39:35 -0000	1.175
+++ tcp_var.h	6 Feb 2015 23:04:52 -0000
@@ -364,6 +364,11 @@
 	u_int	t_maxidle;		/* t_keepcnt * t_keepintvl */
 
 	u_int	t_msl;			/* MSL to use for this connexion */
+
+	/* Per-connection counters, exported via the TCP_INFO socket option. */
+	int	t_rcvoopack;	 	/* out-of-order packets received */
+	int	t_sndrexmitpack; 	/* retransmit packets sent */
+	int	t_sndzerowin;		/* zero-window updates sent */
 };
 
 /*


Home | Main Index | Thread Index | Old Index