Subject: Use callouts for SYN cache timers
To: None <tech-net@netbsd.org>
From: Jason R Thorpe <thorpej@wasabisystems.com>
List: tech-net
Date: 09/11/2001 12:19:59
--32u276st3Jlj2kUU
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

The following diff changes the TCP SYN cache to use callouts for
the timers, rather than multiple lists traversed via tcp_slowtimo().

-- 
        -- Jason R. Thorpe <thorpej@wasabisystems.com>

--32u276st3Jlj2kUU
Content-Type: text/plain; charset=us-ascii
Content-Description: syn_cache_timer.diff
Content-Disposition: attachment; filename=foo

Index: tcp_input.c
===================================================================
RCS file: /cvsroot/syssrc/sys/netinet/tcp_input.c,v
retrieving revision 1.129
diff -c -r1.129 tcp_input.c
*** tcp_input.c	2001/09/10 22:14:26	1.129
--- tcp_input.c	2001/09/11 19:15:10
***************
*** 2578,2595 ****
  	default:							\
  		hash = 0;						\
  	}								\
! } while (0)
  #endif /* INET6 */
  
  #define	SYN_CACHE_RM(sc)						\
  do {									\
! 	LIST_REMOVE((sc), sc_bucketq);					\
  	(sc)->sc_tp = NULL;						\
  	LIST_REMOVE((sc), sc_tpq);					\
  	tcp_syn_cache[(sc)->sc_bucketidx].sch_length--;			\
! 	TAILQ_REMOVE(&tcp_syn_cache_timeq[(sc)->sc_rxtshift], (sc), sc_timeq); \
  	syn_cache_count--;						\
! } while (0)
  
  #define	SYN_CACHE_PUT(sc)						\
  do {									\
--- 2578,2596 ----
  	default:							\
  		hash = 0;						\
  	}								\
! } while (/*CONSTCOND*/0)
  #endif /* INET6 */
  
  #define	SYN_CACHE_RM(sc)						\
  do {									\
! 	TAILQ_REMOVE(&tcp_syn_cache[(sc)->sc_bucketidx].sch_bucket,	\
! 	    (sc), sc_bucketq);						\
  	(sc)->sc_tp = NULL;						\
  	LIST_REMOVE((sc), sc_tpq);					\
  	tcp_syn_cache[(sc)->sc_bucketidx].sch_length--;			\
! 	callout_stop(&(sc)->sc_timer);					\
  	syn_cache_count--;						\
! } while (/*CONSTCOND*/0)
  
  #define	SYN_CACHE_PUT(sc)						\
  do {									\
***************
*** 2598,2622 ****
  	if ((sc)->sc_route4.ro_rt != NULL)				\
  		RTFREE((sc)->sc_route4.ro_rt);				\
  	pool_put(&syn_cache_pool, (sc));				\
! } while (0)
  
  struct pool syn_cache_pool;
  
  /*
   * We don't estimate RTT with SYNs, so each packet starts with the default
!  * RTT and each timer queue has a fixed timeout value.  This allows us to
!  * optimize the timer queues somewhat.
   */
  #define	SYN_CACHE_TIMER_ARM(sc)						\
  do {									\
  	TCPT_RANGESET((sc)->sc_rxtcur,					\
  	    TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN,	\
  	    TCPTV_REXMTMAX);						\
! 	PRT_SLOW_ARM((sc)->sc_rexmt, (sc)->sc_rxtcur);			\
! } while (0)
  
- TAILQ_HEAD(, syn_cache) tcp_syn_cache_timeq[TCP_MAXRXTSHIFT + 1];
- 
  #define	SYN_CACHE_TIMESTAMP(sc)	(tcp_now - (sc)->sc_timebase)
  
  void
--- 2599,2621 ----
  	if ((sc)->sc_route4.ro_rt != NULL)				\
  		RTFREE((sc)->sc_route4.ro_rt);				\
  	pool_put(&syn_cache_pool, (sc));				\
! } while (/*CONSTCOND*/0)
  
  struct pool syn_cache_pool;
  
  /*
   * We don't estimate RTT with SYNs, so each packet starts with the default
!  * RTT and each timer step has a fixed timeout value.
   */
  #define	SYN_CACHE_TIMER_ARM(sc)						\
  do {									\
  	TCPT_RANGESET((sc)->sc_rxtcur,					\
  	    TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN,	\
  	    TCPTV_REXMTMAX);						\
! 	callout_reset(&(sc)->sc_timer,					\
! 	    (sc)->sc_rxtcur * (hz / PR_SLOWHZ), syn_cache_timer, (sc));	\
! } while (/*CONSTCOND*/0)
  
  #define	SYN_CACHE_TIMESTAMP(sc)	(tcp_now - (sc)->sc_timebase)
  
  void
***************
*** 2626,2636 ****
  
  	/* Initialize the hash buckets. */
  	for (i = 0; i < tcp_syn_cache_size; i++)
! 		LIST_INIT(&tcp_syn_cache[i].sch_bucket);
! 
! 	/* Initialize the timer queues. */
! 	for (i = 0; i <= TCP_MAXRXTSHIFT; i++)
! 		TAILQ_INIT(&tcp_syn_cache_timeq[i]);
  
  	/* Initialize the syn cache pool. */
  	pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0,
--- 2625,2631 ----
  
  	/* Initialize the hash buckets. */
  	for (i = 0; i < tcp_syn_cache_size; i++)
! 		TAILQ_INIT(&tcp_syn_cache[i].sch_bucket);
  
  	/* Initialize the syn cache pool. */
  	pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0,
***************
*** 2644,2650 ****
  {
  	struct syn_cache_head *scp;
  	struct syn_cache *sc2;
! 	int s, i;
  
  	/*
  	 * If there are no entries in the hash table, reinitialize
--- 2639,2645 ----
  {
  	struct syn_cache_head *scp;
  	struct syn_cache *sc2;
! 	int s;
  
  	/*
  	 * If there are no entries in the hash table, reinitialize
***************
*** 2670,2741 ****
  		tcpstat.tcps_sc_bucketoverflow++;
  		/*
  		 * The bucket is full.  Toss the oldest element in the
! 		 * bucket.  This will be the entry with our bucket
! 		 * index closest to the front of the timer queue with
! 		 * the largest timeout value.
! 		 *
! 		 * Note: This timer queue traversal may be expensive, so
! 		 * we hope that this doesn't happen very often.  It is
! 		 * much more likely that we'll overflow the entire
! 		 * cache, which is much easier to handle; see below.
  		 */
! 		for (i = TCP_MAXRXTSHIFT; i >= 0; i--) {
! 			for (sc2 = TAILQ_FIRST(&tcp_syn_cache_timeq[i]);
! 			     sc2 != NULL;
! 			     sc2 = TAILQ_NEXT(sc2, sc_timeq)) {
! 				if (sc2->sc_bucketidx == sc->sc_bucketidx) {
! 					SYN_CACHE_RM(sc2);
! 					SYN_CACHE_PUT(sc2);
! 					goto insert;	/* 2 level break */
! 				}
! 			}
! 		}
  #ifdef DIAGNOSTIC
  		/*
  		 * This should never happen; we should always find an
  		 * entry in our bucket.
  		 */
! 		panic("syn_cache_insert: bucketoverflow: impossible");
  #endif
  	} else if (syn_cache_count >= tcp_syn_cache_limit) {
  		tcpstat.tcps_sc_overflowed++;
  		/*
  		 * The cache is full.  Toss the oldest entry in the
! 		 * entire cache.  This is the front entry in the
! 		 * first non-empty timer queue with the largest
! 		 * timeout value.
  		 */
! 		for (i = TCP_MAXRXTSHIFT; i >= 0; i--) {
! 			sc2 = TAILQ_FIRST(&tcp_syn_cache_timeq[i]);
! 			if (sc2 == NULL)
! 				continue;
! 			SYN_CACHE_RM(sc2);
! 			SYN_CACHE_PUT(sc2);
! 			goto insert;		/* symmetry with above */
! 		}
  #ifdef DIAGNOSTIC
! 		/*
! 		 * This should never happen; we should always find an
! 		 * entry in the cache.
! 		 */
! 		panic("syn_cache_insert: cache overflow: impossible");
  #endif
  	}
  
-  insert:
  	/*
  	 * Initialize the entry's timer.
  	 */
  	sc->sc_rxttot = 0;
  	sc->sc_rxtshift = 0;
  	SYN_CACHE_TIMER_ARM(sc);
- 	TAILQ_INSERT_TAIL(&tcp_syn_cache_timeq[sc->sc_rxtshift], sc, sc_timeq);
  
  	/* Link it from tcpcb entry */
  	LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq);
  
  	/* Put it into the bucket. */
! 	LIST_INSERT_HEAD(&scp->sch_bucket, sc, sc_bucketq);
  	scp->sch_length++;
  	syn_cache_count++;
  
--- 2665,2731 ----
  		tcpstat.tcps_sc_bucketoverflow++;
  		/*
  		 * The bucket is full.  Toss the oldest element in the
! 		 * bucket.  This will be the first entry in the bucket.
  		 */
! 		sc2 = TAILQ_FIRST(&scp->sch_bucket);
  #ifdef DIAGNOSTIC
  		/*
  		 * This should never happen; we should always find an
  		 * entry in our bucket.
  		 */
! 		if (sc2 == NULL)
! 			panic("syn_cache_insert: bucketoverflow: impossible");
  #endif
+ 		SYN_CACHE_RM(sc2);
+ 		SYN_CACHE_PUT(sc2);
  	} else if (syn_cache_count >= tcp_syn_cache_limit) {
+ 		struct syn_cache_head *scp2, *sce;
+ 
  		tcpstat.tcps_sc_overflowed++;
  		/*
  		 * The cache is full.  Toss the oldest entry in the
! 		 * first non-empty bucket we can find.
! 		 *
! 		 * XXX We would really like to toss the oldest
! 		 * entry in the cache, but we hope that this
! 		 * condition doesn't happen very often.
  		 */
! 		scp2 = scp;
! 		if (TAILQ_EMPTY(&scp2->sch_bucket)) {
! 			sce = &tcp_syn_cache[tcp_syn_cache_size];
! 			for (++scp2; scp2 != scp; scp2++) {
! 				if (scp2 >= sce)
! 					scp2 = &tcp_syn_cache[0];
! 				if (! TAILQ_EMPTY(&scp2->sch_bucket))
! 					break;
! 			}
  #ifdef DIAGNOSTIC
! 			/*
! 			 * This should never happen; we should always find a
! 			 * non-empty bucket.
! 			 */
! 			if (scp2 == scp)
! 				panic("syn_cache_insert: cacheoverflow: "
! 				    "impossible");
  #endif
+ 		}
+ 		sc2 = TAILQ_FIRST(&scp2->sch_bucket);
+ 		SYN_CACHE_RM(sc2);
+ 		SYN_CACHE_PUT(sc2);
  	}
  
  	/*
  	 * Initialize the entry's timer.
  	 */
  	sc->sc_rxttot = 0;
  	sc->sc_rxtshift = 0;
  	SYN_CACHE_TIMER_ARM(sc);
  
  	/* Link it from tcpcb entry */
  	LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq);
  
  	/* Put it into the bucket. */
! 	TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq);
  	scp->sch_length++;
  	syn_cache_count++;
  
***************
*** 2749,2808 ****
   * that entry.
   */
  void
! syn_cache_timer()
  {
! 	struct syn_cache *sc, *nsc;
! 	int i, s;
  
  	s = splsoftnet();
  
  	/*
! 	 * First, get all the entries that need to be retransmitted, or
! 	 * must be expired due to exceeding the initial keepalive time.
  	 */
! 	for (i = 0; i < TCP_MAXRXTSHIFT; i++) {
! 		for (sc = TAILQ_FIRST(&tcp_syn_cache_timeq[i]);
! 		     sc != NULL && PRT_SLOW_ISEXPIRED(sc->sc_rexmt);
! 		     sc = nsc) {
! 			nsc = TAILQ_NEXT(sc, sc_timeq);
  
! 			/*
! 			 * Compute the total amount of time this entry has
! 			 * been on a queue.  If this entry has been on longer
! 			 * than the keep alive timer would allow, expire it.
! 			 */
! 			sc->sc_rxttot += sc->sc_rxtcur;
! 			if (sc->sc_rxttot >= TCPTV_KEEP_INIT) {
! 				tcpstat.tcps_sc_timed_out++;
! 				SYN_CACHE_RM(sc);
! 				SYN_CACHE_PUT(sc);
! 				continue;
! 			}
  
! 			tcpstat.tcps_sc_retransmitted++;
! 			(void) syn_cache_respond(sc, NULL);
  
! 			/* Advance this entry onto the next timer queue. */
! 			TAILQ_REMOVE(&tcp_syn_cache_timeq[i], sc, sc_timeq);
! 			sc->sc_rxtshift = i + 1;
! 			SYN_CACHE_TIMER_ARM(sc);
! 			TAILQ_INSERT_TAIL(&tcp_syn_cache_timeq[sc->sc_rxtshift],
! 			    sc, sc_timeq);
! 		}
! 	}
  
! 	/*
! 	 * Now get all the entries that are expired due to too many
! 	 * retransmissions.
! 	 */
! 	for (sc = TAILQ_FIRST(&tcp_syn_cache_timeq[TCP_MAXRXTSHIFT]);
! 	     sc != NULL && PRT_SLOW_ISEXPIRED(sc->sc_rexmt);
! 	     sc = nsc) {
! 		nsc = TAILQ_NEXT(sc, sc_timeq);
! 		tcpstat.tcps_sc_timed_out++;
! 		SYN_CACHE_RM(sc);
! 		SYN_CACHE_PUT(sc);
! 	}
  	splx(s);
  }
  
--- 2739,2779 ----
   * that entry.
   */
  void
! syn_cache_timer(void *arg)
  {
! 	struct syn_cache *sc = arg;
! 	int s;
  
  	s = splsoftnet();
  
+ 	if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) {
+ 		/* Drop it -- too many retransmissions. */
+ 		goto dropit;
+ 	}
+ 
  	/*
! 	 * Compute the total amount of time this entry has
! 	 * been on a queue.  If this entry has been on longer
! 	 * than the keep alive timer would allow, expire it.
  	 */
! 	sc->sc_rxttot += sc->sc_rxtcur;
! 	if (sc->sc_rxttot >= TCPTV_KEEP_INIT)
! 		goto dropit;
  
! 	tcpstat.tcps_sc_retransmitted++;
! 	(void) syn_cache_respond(sc, NULL);
  
! 	/* Advance the timer back-off. */
! 	sc->sc_rxtshift++;
! 	SYN_CACHE_TIMER_ARM(sc);
  
! 	splx(s);
! 	return;
  
!  dropit:
! 	tcpstat.tcps_sc_timed_out++;
! 	SYN_CACHE_RM(sc);
! 	SYN_CACHE_PUT(sc);
  	splx(s);
  }
  
***************
*** 2855,2862 ****
  	scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
  	*headp = scp;
  	s = splsoftnet();
! 	for (sc = LIST_FIRST(&scp->sch_bucket); sc != NULL;
! 	     sc = LIST_NEXT(sc, sc_bucketq)) {
  		if (sc->sc_hash != hash)
  			continue;
  		if (!bcmp(&sc->sc_src, src, src->sa_len) &&
--- 2826,2833 ----
  	scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
  	*headp = scp;
  	s = splsoftnet();
! 	for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL;
! 	     sc = TAILQ_NEXT(sc, sc_bucketq)) {
  		if (sc->sc_hash != hash)
  			continue;
  		if (!bcmp(&sc->sc_src, src, src->sa_len) &&
***************
*** 3347,3352 ****
--- 3318,3324 ----
  	 * Fill in the cache, and put the necessary IP and TCP
  	 * options into the reply.
  	 */
  	bzero(sc, sizeof(struct syn_cache));
+ 	callout_init(&sc->sc_timer);
  	bcopy(src, &sc->sc_src, src->sa_len);
  	bcopy(dst, &sc->sc_dst, dst->sa_len);
Index: tcp_subr.c
===================================================================
RCS file: /cvsroot/syssrc/sys/netinet/tcp_subr.c,v
retrieving revision 1.118
diff -c -r1.118 tcp_subr.c
*** tcp_subr.c	2001/09/10 22:14:27	1.118
--- tcp_subr.c	2001/09/11 19:15:11
***************
*** 194,200 ****
  int	tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE;
  int	tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE;
  struct	syn_cache_head tcp_syn_cache[TCP_SYN_HASH_SIZE];
- int	tcp_syn_cache_interval = 1;	/* runs timer twice a second */
  
  int	tcp_freeq __P((struct tcpcb *));
  
--- 194,199 ----
Index: tcp_timer.c
===================================================================
RCS file: /cvsroot/syssrc/sys/netinet/tcp_timer.c,v
retrieving revision 1.54
diff -c -r1.54 tcp_timer.c
*** tcp_timer.c	2001/09/10 22:45:46	1.54
--- tcp_timer.c	2001/09/11 19:15:11
***************
*** 221,237 ****
  void
  tcp_slowtimo()
  {
- 	static int syn_cache_last = 0;
  	int s;
  
  	s = splsoftnet();
  	tcp_maxidle = tcp_keepcnt * tcp_keepintvl;
  	tcp_iss_seq += TCP_ISSINCR;			/* increment iss */
  	tcp_now++;					/* for timestamps */
- 	if (++syn_cache_last >= tcp_syn_cache_interval) {
- 		syn_cache_timer();
- 		syn_cache_last = 0;
- 	}
  	splx(s);
  }
  
--- 221,232 ----
Index: tcp_var.h
===================================================================
RCS file: /cvsroot/syssrc/sys/netinet/tcp_var.h,v
retrieving revision 1.86
diff -c -r1.86 tcp_var.h
*** tcp_var.h	2001/09/10 22:14:28	1.86
--- tcp_var.h	2001/09/11 19:15:12
***************
*** 352,359 ****
  };
  
  struct syn_cache {
! 	LIST_ENTRY(syn_cache) sc_bucketq;	/* link on bucket list */
! 	TAILQ_ENTRY(syn_cache) sc_timeq;	/* link on timer queue */
  	union {					/* cached route */
  		struct route route4;
  #ifdef INET6
--- 352,359 ----
  };
  
  struct syn_cache {
! 	TAILQ_ENTRY(syn_cache) sc_bucketq;	/* link on bucket list */
! 	struct callout sc_timer;		/* rexmt timer */
  	union {					/* cached route */
  		struct route route4;
  #ifdef INET6
***************
*** 373,379 ****
  	union syn_cache_sa sc_dst;
  	tcp_seq sc_irs;
  	tcp_seq sc_iss;
- 	u_int sc_rexmt;				/* retransmit timer */
  	u_int sc_rxtcur;			/* current rxt timeout */
  	u_int sc_rxttot;			/* total time spend on queues */
  	u_short sc_rxtshift;			/* for computing backoff */
--- 373,378 ----
***************
*** 393,399 ****
  };
  
  struct syn_cache_head {
! 	LIST_HEAD(, syn_cache) sch_bucket;	/* bucket entries */
  	u_short sch_length;			/* # entries in bucket */
  };
  
--- 392,398 ----
  };
  
  struct syn_cache_head {
! 	TAILQ_HEAD(, syn_cache) sch_bucket;	/* bucket entries */
  	u_short sch_length;			/* # entries in bucket */
  };
  
***************
*** 537,543 ****
--- 536,544 ----
  #define	TCPCTL_MSSDFLT		4	/* default seg size */
  #define	TCPCTL_SYN_CACHE_LIMIT	5	/* max size of comp. state engine */
  #define	TCPCTL_SYN_BUCKET_LIMIT	6	/* max size of hash bucket */
+ #if 0	/*obsoleted*/
  #define	TCPCTL_SYN_CACHE_INTER	7	/* interval of comp. state timer */
+ #endif
  #define	TCPCTL_INIT_WIN		8	/* initial window */
  #define	TCPCTL_MSS_IFMTU	9	/* mss from interface, not in_maxmtu */
  #define	TCPCTL_SACK		10	/* RFC2018 selective acknowledgement */
***************
*** 568,574 ****
  	{ "mssdflt",	CTLTYPE_INT }, \
  	{ "syn_cache_limit", CTLTYPE_INT }, \
  	{ "syn_bucket_limit", CTLTYPE_INT }, \
! 	{ "syn_cache_interval", CTLTYPE_INT },\
  	{ "init_win", CTLTYPE_INT }, \
  	{ "mss_ifmtu", CTLTYPE_INT }, \
  	{ "sack", CTLTYPE_INT }, \
--- 569,575 ----
  	{ "mssdflt",	CTLTYPE_INT }, \
  	{ "syn_cache_limit", CTLTYPE_INT }, \
  	{ "syn_bucket_limit", CTLTYPE_INT }, \
! 	{ 0, 0 },\
  	{ "init_win", CTLTYPE_INT }, \
  	{ "mss_ifmtu", CTLTYPE_INT }, \
  	{ "sack", CTLTYPE_INT }, \
***************
*** 610,616 ****
  extern	int tcp_ack_on_push;	/* ACK immediately on PUSH */
  extern	int tcp_syn_cache_limit; /* max entries for compressed state engine */
  extern	int tcp_syn_bucket_limit;/* max entries per hash bucket */
- extern	int tcp_syn_cache_interval; /* compressed state timer */
  extern	int tcp_log_refused;	/* log refused connections */
  
  extern	int tcp_rst_ppslim;
--- 611,616 ----
***************
*** 627,633 ****
  	{ 1, 0, &tcp_mssdflt },			\
  	{ 1, 0, &tcp_syn_cache_limit },		\
  	{ 1, 0, &tcp_syn_bucket_limit },	\
! 	{ 1, 0, &tcp_syn_cache_interval },	\
  	{ 1, 0, &tcp_init_win },		\
  	{ 1, 0, &tcp_mss_ifmtu },		\
  	{ 1, 0, &tcp_do_sack },			\
--- 627,633 ----
  	{ 1, 0, &tcp_mssdflt },			\
  	{ 1, 0, &tcp_syn_cache_limit },		\
  	{ 1, 0, &tcp_syn_bucket_limit },	\
! 	{ 0 },					\
  	{ 1, 0, &tcp_init_win },		\
  	{ 1, 0, &tcp_mss_ifmtu },		\
  	{ 1, 0, &tcp_do_sack },			\
***************
*** 720,726 ****
  void	 syn_cache_reset __P((struct sockaddr *, struct sockaddr *,
  		struct tcphdr *));
  int	 syn_cache_respond __P((struct syn_cache *, struct mbuf *));
! void	 syn_cache_timer __P((void));
  void	 syn_cache_cleanup __P((struct tcpcb *));
  
  int	tcp_newreno __P((struct tcpcb *, struct tcphdr *));
--- 720,726 ----
  void	 syn_cache_reset __P((struct sockaddr *, struct sockaddr *,
  		struct tcphdr *));
  int	 syn_cache_respond __P((struct syn_cache *, struct mbuf *));
! void	 syn_cache_timer __P((void *));
  void	 syn_cache_cleanup __P((struct tcpcb *));
  
  int	tcp_newreno __P((struct tcpcb *, struct tcphdr *));

--32u276st3Jlj2kUU--