Subject: keepalive per socket settings patch
To: None <tech-net@netbsd.org>
From: Christos Zoulas <christos@zoulas.com>
List: tech-net
Date: 06/19/2007 21:28:10
Hi,

This patch adds per-socket TCP keepalive settings (TCP_KEEPIDLE,
TCP_KEEPINTVL, TCP_KEEPCNT and TCP_KEEPINIT) and is pretty straightforward.
The open question is: do we let setsockopt() specify arbitrary values, or do
we cap them at the global settings, as I do now? Comments? If there are no
objections, I will commit this after I write the documentation. If there
are, speak up now so that I don't bother with the docs.

christos

Index: tcp.h
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp.h,v
retrieving revision 1.25
diff -u -u -r1.25 tcp.h
--- tcp.h	9 Oct 2006 16:27:07 -0000	1.25
+++ tcp.h	20 Jun 2007 01:16:48 -0000
@@ -112,10 +112,19 @@
 /*
  * User-settable options (used with setsockopt).
  */
-#define	TCP_NODELAY	0x01	/* don't delay send to coalesce packets */
-#define	TCP_MAXSEG	0x02	/* set maximum segment size */
-/* Bits 0x04, 0x08 reserved for FreeBSD compatibility: TCP_NOPUSH, TCP_NOOPT */
-#define TCP_MD5SIG	0x10	/* use MD5 digests (RFC2385) */
+#define	TCP_NODELAY	1	/* don't delay send to coalesce packets */
+#define	TCP_MAXSEG	2	/* set maximum segment size */
+#define	TCP_KEEPIDLE	3	/* per-socket time before keepalive probes begin */
+#ifdef notyet
+#define	TCP_NOPUSH	4	/* reserved for FreeBSD compat */
+#endif
+#define	TCP_KEEPINTVL	5	/* per-socket time between keepalive probes */
+#define	TCP_KEEPCNT	6	/* per-socket number of keepalive probes */
+#define	TCP_KEEPINIT	7	/* per-socket initial connection timeout */
+#ifdef notyet
+#define	TCP_NOOPT	8	/* reserved for FreeBSD compat */
+#endif
+#define	TCP_MD5SIG	0x10	/* use MD5 digests (RFC2385) */
 #define	TCP_CONGCTL	0x20	/* selected congestion control */
 
 #endif /* !_NETINET_TCP_H_ */
Index: tcp_input.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_input.c,v
retrieving revision 1.266
diff -u -u -r1.266 tcp_input.c
--- tcp_input.c	18 May 2007 21:48:43 -0000	1.266
+++ tcp_input.c	20 Jun 2007 01:16:51 -0000
@@ -1606,7 +1606,7 @@
 	 */
 	tp->t_rcvtime = tcp_now;
 	if (TCPS_HAVEESTABLISHED(tp->t_state))
-		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
+		TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepidle);
 
 	/*
 	 * Process options.
@@ -2366,9 +2366,9 @@
 				 */
 				if (so->so_state & SS_CANTRCVMORE) {
 					soisdisconnected(so);
-					if (tcp_maxidle > 0)
+					if (tp->t_maxidle > 0)
 						TCP_TIMER_ARM(tp, TCPT_2MSL,
-						    tcp_maxidle);
+						    tp->t_maxidle);
 				}
 				tp->t_state = TCPS_FIN_WAIT_2;
 			}
@@ -3377,7 +3377,7 @@
 	 * than the keep alive timer would allow, expire it.
 	 */
 	sc->sc_rxttot += sc->sc_rxtcur;
-	if (sc->sc_rxttot >= TCPTV_KEEP_INIT)
+	if (sc->sc_rxttot >= tcp_keepinit)
 		goto dropit;
 
 	tcpstat.tcps_sc_retransmitted++;
@@ -3713,7 +3713,7 @@
 	tcp_sendseqinit(tp);
 	tcp_rcvseqinit(tp);
 	tp->t_state = TCPS_SYN_RECEIVED;
-	TCP_TIMER_ARM(tp, TCPT_KEEP, TCPTV_KEEP_INIT);
+	TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepinit);
 	tcpstat.tcps_accepts++;
 
 	if ((sc->sc_flags & SCF_SACK_PERMIT) && tcp_do_sack)
Index: tcp_subr.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_subr.c,v
retrieving revision 1.214
diff -u -u -r1.214 tcp_subr.c
--- tcp_subr.c	2 May 2007 20:40:25 -0000	1.214
+++ tcp_subr.c	20 Jun 2007 01:16:52 -0000
@@ -379,9 +379,6 @@
 {
 	int hlen;
 
-	/* Initialize the TCPCB template. */
-	tcp_tcpcb_template();
-
 	in_pcbinit(&tcbtable, tcbhashsize, tcbhashsize);
 
 	hlen = sizeof(struct ip) + sizeof(struct tcphdr);
@@ -410,6 +407,9 @@
 	/* Initialize the congestion control algorithms. */
 	tcp_congctl_init();
 
+	/* Initialize the TCPCB template. */
+	tcp_tcpcb_template();
+
 	MOWNER_ATTACH(&tcp_tx_mowner);
 	MOWNER_ATTACH(&tcp_rx_mowner);
 	MOWNER_ATTACH(&tcp_reass_mowner);
@@ -976,6 +976,13 @@
 	tp->t_rttvar = tcp_rttdflt * PR_SLOWHZ << (TCP_RTTVAR_SHIFT + 2 - 1);
 	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
 	    TCPTV_MIN, TCPTV_REXMTMAX);
+
+	/* Keep Alive */
+	tp->t_keepinit = tcp_keepinit;
+	tp->t_keepidle = tcp_keepidle;
+	tp->t_keepintvl = tcp_keepintvl;
+	tp->t_keepcnt = tcp_keepcnt;
+	tp->t_maxidle = tp->t_keepcnt * tp->t_keepintvl;
 }
 
 /*
@@ -1049,7 +1056,7 @@
 	
 	tp->t_congctl = tcp_congctl_global;
 	tp->t_congctl->refcnt++;
-	
+
 	return (tp);
 }
 
@@ -2016,7 +2023,7 @@
 #endif
 
 	tp->t_state = TCPS_ESTABLISHED;
-	TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
+	TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepidle);
 
 #ifdef RTV_RPIPE
 	if (rt != NULL && rt->rt_rmx.rmx_recvpipe != 0)
Index: tcp_timer.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_timer.c,v
retrieving revision 1.76
diff -u -u -r1.76 tcp_timer.c
--- tcp_timer.c	9 Oct 2006 16:27:07 -0000	1.76
+++ tcp_timer.c	20 Jun 2007 01:16:52 -0000
@@ -148,11 +148,12 @@
  * Various tunable timer parameters.  These are initialized in tcp_init(),
  * unless they are patched.
  */
-int	tcp_keepidle = 0;
-int	tcp_keepintvl = 0;
-int	tcp_keepcnt = 0;		/* max idle probes */
+u_int	tcp_keepinit = 0;
+u_int	tcp_keepidle = 0;
+u_int	tcp_keepintvl = 0;
+u_int	tcp_keepcnt = 0;		/* max idle probes */
+
 int	tcp_maxpersistidle = 0;		/* max idle time in persist */
-int	tcp_maxidle;			/* computed in tcp_slowtimo() */
 
 /*
  * Time to delay the ACK.  This is initialized in tcp_init(), unless
@@ -179,6 +180,9 @@
 tcp_timer_init(void)
 {
 
+	if (tcp_keepinit == 0)
+		tcp_keepinit = TCPTV_KEEP_INIT;
+
 	if (tcp_keepidle == 0)
 		tcp_keepidle = TCPTV_KEEP_IDLE;
 
@@ -251,7 +255,6 @@
 	int s;
 
 	s = splsoftnet();
-	tcp_maxidle = tcp_keepcnt * tcp_keepintvl;
 	tcp_iss_seq += TCP_ISSINCR;			/* increment iss */
 	tcp_now++;					/* for timestamps */
 	splx(s);
@@ -542,9 +545,9 @@
 	KASSERT(so != NULL);
 	if (so->so_options & SO_KEEPALIVE &&
 	    tp->t_state <= TCPS_CLOSE_WAIT) {
-	    	if ((tcp_maxidle > 0) &&
+	    	if ((tp->t_maxidle > 0) &&
 		    ((tcp_now - tp->t_rcvtime) >=
-		     tcp_keepidle + tcp_maxidle))
+		     tp->t_keepidle + tp->t_maxidle))
 			goto dropit;
 		/*
 		 * Send a packet designed to force a response
@@ -572,9 +575,9 @@
 			    (struct mbuf *)NULL, NULL, tp->rcv_nxt,
 			    tp->snd_una - 1, 0);
 		}
-		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepintvl);
+		TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepintvl);
 	} else
-		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
+		TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepidle);
 
 #ifdef TCP_DEBUG
 	if (tp && so->so_options & SO_DEBUG)
@@ -634,8 +637,9 @@
 	 * control block.  Otherwise, check again in a bit.
 	 */
 	if (tp->t_state != TCPS_TIME_WAIT &&
-	    ((tcp_maxidle == 0) || ((tcp_now - tp->t_rcvtime) <= tcp_maxidle)))
-		TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_keepintvl);
+	    ((tp->t_maxidle == 0) ||
+	    ((tcp_now - tp->t_rcvtime) <= tp->t_maxidle)))
+	    TCP_TIMER_ARM(tp, TCPT_2MSL, tp->t_keepintvl);
 	else
 		tp = tcp_close(tp);
 
Index: tcp_timer.h
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_timer.h,v
retrieving revision 1.24
diff -u -u -r1.24 tcp_timer.h
--- tcp_timer.h	26 Sep 2006 06:39:22 -0000	1.24
+++ tcp_timer.h	20 Jun 2007 01:16:52 -0000
@@ -182,11 +182,11 @@
 
 extern const tcp_timer_func_t tcp_timer_funcs[TCPT_NTIMERS];
 
-extern int tcp_keepidle;		/* time before keepalive probes begin */
-extern int tcp_keepintvl;		/* time between keepalive probes */
-extern int tcp_keepcnt;			/* number of keepalives, 0=infty */
+extern u_int tcp_keepinit;		/* time before initial connection times out */
+extern u_int tcp_keepidle;		/* time before keepalive probes begin */
+extern u_int tcp_keepintvl;		/* time between keepalive probes */
+extern u_int tcp_keepcnt;		/* number of keepalives, 0=infty */
 extern int tcp_maxpersistidle;		/* max idle time in persist */
-extern int tcp_maxidle;			/* time to drop after starting probes */
 extern int tcp_ttl;			/* time to live for TCP segs */
 extern const int tcp_backoff[];
 
Index: tcp_usrreq.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_usrreq.c,v
retrieving revision 1.131
diff -u -u -r1.131 tcp_usrreq.c
--- tcp_usrreq.c	4 Mar 2007 06:03:22 -0000	1.131
+++ tcp_usrreq.c	20 Jun 2007 01:16:53 -0000
@@ -436,7 +436,7 @@
 		soisconnecting(so);
 		tcpstat.tcps_connattempt++;
 		tp->t_state = TCPS_SYN_SENT;
-		TCP_TIMER_ARM(tp, TCPT_KEEP, TCPTV_KEEP_INIT);
+		TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepinit);
 		tp->iss = tcp_new_iss(tp, 0);
 		tcp_sendseqinit(tp);
 		error = tcp_output(tp);
@@ -614,6 +614,28 @@
 	return (error);
 }
 
+static void	/* recompute t_maxidle and restart the KEEP/2MSL timers after a keepalive setsockopt */
+change_keepalive(struct socket *so, struct tcpcb *tp)
+{
+	tp->t_maxidle = tp->t_keepcnt * tp->t_keepintvl;	/* derived value, kept in sync */
+	TCP_TIMER_DISARM(tp, TCPT_KEEP);
+	TCP_TIMER_DISARM(tp, TCPT_2MSL);	/* NOTE(review): only re-armed below for FIN_WAIT_2 -- confirm this is safe in other states */
+
+	if (tp->t_state == TCPS_SYN_RECEIVED ||
+	    tp->t_state == TCPS_SYN_SENT) {	/* these states use the keepinit timeout */
+		TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepinit);
+	} else if (so->so_options & SO_KEEPALIVE && 
+	    tp->t_state <= TCPS_CLOSE_WAIT) {	/* same test the keepalive timer uses before probing */
+		TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepintvl);
+	} else {
+		TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepidle);
+	}
+
+	if ((tp->t_state == TCPS_FIN_WAIT_2) && (tp->t_maxidle > 0))	/* restore the FIN_WAIT_2 timeout disarmed above */
+		TCP_TIMER_ARM(tp, TCPT_2MSL, tp->t_maxidle);
+}
+
+
 int
 tcp_ctloutput(int op, struct socket *so, int level, int optname,
     struct mbuf **mp)
@@ -715,7 +737,8 @@
 			break;
 
 		case TCP_MAXSEG:
-			if (m && (i = *mtod(m, int *)) > 0 &&
+			if (m && m->m_len >= sizeof(int) &&
+			    (i = *mtod(m, int *)) > 0 &&
 			    i <= tp->t_peermss)
 				tp->t_peermss = i;  /* limit on send size */
 			else
@@ -729,6 +752,46 @@
 #endif
 			break;
 
+		case TCP_KEEPIDLE:
+			if (m && m->m_len >= sizeof(int) &&
+			    (i = *mtod(m, int *)) >= 0 &&
+			    i <= tcp_keepidle) {
+				tp->t_keepidle = i;
+				change_keepalive(so, tp);
+			} else
+				error = EINVAL;
+			break;
+
+		case TCP_KEEPINTVL:
+			if (m && m->m_len >= sizeof(int) &&
+			    (i = *mtod(m, int *)) >= 0 &&
+			    i <= tcp_keepintvl) {
+				tp->t_keepintvl = i;
+				change_keepalive(so, tp);
+			} else
+				error = EINVAL;
+			break;
+
+		case TCP_KEEPCNT:
+			if (m && m->m_len >= sizeof(int) &&
+			    (i = *mtod(m, int *)) >= 0 &&
+			    i <= tcp_keepcnt) {
+				tp->t_keepcnt = i;
+				change_keepalive(so, tp);
+			} else
+				error = EINVAL;
+			break;
+
+		case TCP_KEEPINIT:
+			if (m && m->m_len >= sizeof(int) &&
+			    (i = *mtod(m, int *)) >= 0 &&
+			    i <= tcp_keepinit) {
+				tp->t_keepinit = i;
+				change_keepalive(so, tp);
+			} else
+				error = EINVAL;
+			break;
+
 		default:
 			error = ENOPROTOOPT;
 			break;
@@ -944,8 +1007,8 @@
 		 * a full close, we start a timer to make sure sockets are
 		 * not left in FIN_WAIT_2 forever.
 		 */
-		if ((tp->t_state == TCPS_FIN_WAIT_2) && (tcp_maxidle > 0))
-			TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
+		if ((tp->t_state == TCPS_FIN_WAIT_2) && (tp->t_maxidle > 0))
+			TCP_TIMER_ARM(tp, TCPT_2MSL, tp->t_maxidle);
 	}
 	return (tp);
 }
@@ -1418,6 +1481,27 @@
 	return error;
 }
 
+static int	/* sysctl handler for the keepidle/keepintvl/keepcnt/keepinit nodes */
+sysctl_tcp_keep(SYSCTLFN_ARGS)
+{  
+	int error;
+	u_int tmp;	/* scratch copy of the tunable behind this node */
+	struct sysctlnode node;
+
+	node = *rnode;
+	tmp = *(u_int *)rnode->sysctl_data;
+	node.sysctl_data = &tmp;	/* sysctl_lookup() operates on the copy, not the global */
+
+	error = sysctl_lookup(SYSCTLFN_CALL(&node));
+	if (error || newp == NULL)	/* lookup failed, or no new value was supplied */
+		return error;
+
+	*(u_int *)rnode->sysctl_data = tmp;	/* NOTE(review): stored without any range check */
+	tcp_tcpcb_template();	/* update the template */
+	return 0;
+}
+
+
 /*
  * this (second stage) setup routine is a replacement for tcp_sysctl()
  * (which is currently used for ipv4 and ipv6)
@@ -1585,19 +1669,19 @@
 		       CTLTYPE_INT, "keepidle",
 		       SYSCTL_DESCR("Allowed connection idle ticks before a "
 				    "keepalive probe is sent"),
-		       NULL, 0, &tcp_keepidle, 0,
+		       sysctl_tcp_keep, 0, &tcp_keepidle, 0,
 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_KEEPIDLE, CTL_EOL);
 	sysctl_createv(clog, 0, NULL, NULL,
 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
 		       CTLTYPE_INT, "keepintvl",
 		       SYSCTL_DESCR("Ticks before next keepalive probe is sent"),
-		       NULL, 0, &tcp_keepintvl, 0,
+		       sysctl_tcp_keep, 0, &tcp_keepintvl, 0,
 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_KEEPINTVL, CTL_EOL);
 	sysctl_createv(clog, 0, NULL, NULL,
 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
 		       CTLTYPE_INT, "keepcnt",
 		       SYSCTL_DESCR("Number of keepalive probes to send"),
-		       NULL, 0, &tcp_keepcnt, 0,
+		       sysctl_tcp_keep, 0, &tcp_keepcnt, 0,
 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_KEEPCNT, CTL_EOL);
 	sysctl_createv(clog, 0, NULL, NULL,
 		       CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE,
@@ -1658,6 +1742,12 @@
 		       sysctl_inpcblist, 0, &tcbtable, 0,
 		       CTL_NET, pf, IPPROTO_TCP, CTL_CREATE,
 		       CTL_EOL);
+	sysctl_createv(clog, 0, NULL, NULL,
+		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+		       CTLTYPE_INT, "keepinit",
+		       SYSCTL_DESCR("Ticks before initial tcp connection times out"),
+		       sysctl_tcp_keep, 0, &tcp_keepinit, 0,
+		       CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL);
 
 	/* ECN subtree */
 	sysctl_createv(clog, 0, NULL, &ecn_node,
Index: tcp_var.h
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_var.h,v
retrieving revision 1.146
diff -u -u -r1.146 tcp_var.h
--- tcp_var.h	2 May 2007 20:40:25 -0000	1.146
+++ tcp_var.h	20 Jun 2007 01:16:54 -0000
@@ -331,6 +331,14 @@
 	uint8_t t_ecn_retries;		/* # of ECN setup retries */
 	
 	struct tcp_congctl *t_congctl;	/* per TCB congctl algorithm */
+
+	/* Keepalive per socket */
+	u_int	t_keepinit;
+	u_int	t_keepidle;
+	u_int	t_keepintvl;
+	u_int	t_keepcnt;
+	u_int	t_maxidle;		/* t_keepcnt * t_keepintvl */
+
 };
 
 /*