Source-Changes-HG archive


[src/trunk]: src/sys/netinet Implement retransmit logic for the SYN cache engine



details:   https://anonhg.NetBSD.org/src/rev/7eb0647b815a
branches:  trunk
changeset: 472409:7eb0647b815a
user:      thorpej <thorpej@NetBSD.org>
date:      Thu Apr 29 03:54:22 1999 +0000

description:
Implement retransmit logic for the SYN cache engine.  Fixes a rare condition
in which one side can think a connection exists while the other side thinks
the connection was never established.

The original problem was first reported by Ty Sarna in PR #5909.  The fix I
originally made to the code didn't cover all cases; the remaining problem,
which this change addresses, was reported by Christoph Badura via private
e-mail.

Many thanks to Bill Sommerfeld for helping me to test this code, and
for finding a subtle bug.

diffstat:

 sys/netinet/in_proto.c  |    5 +-
 sys/netinet/tcp_input.c |  248 +++++++++++++++++++++++++++++------------------
 sys/netinet/tcp_var.h   |   23 ++-
 3 files changed, 168 insertions(+), 108 deletions(-)

diffs (truncated from 532 to 300 lines):

diff -r cb3cc27e7d04 -r 7eb0647b815a sys/netinet/in_proto.c
--- a/sys/netinet/in_proto.c    Thu Apr 29 03:38:39 1999 +0000
+++ b/sys/netinet/in_proto.c    Thu Apr 29 03:54:22 1999 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: in_proto.c,v 1.29 1999/01/14 01:16:55 thorpej Exp $    */
+/*     $NetBSD: in_proto.c,v 1.30 1999/04/29 03:54:22 thorpej Exp $    */
 
 /*
  * Copyright (c) 1982, 1986, 1993
@@ -188,5 +188,4 @@
 int    tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE;
 int    tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE;
 struct syn_cache_head tcp_syn_cache[TCP_SYN_HASH_SIZE];
-int    tcp_syn_cache_interval = 8;     /* runs timer every 4 seconds */
-int    tcp_syn_cache_timeo = TCPTV_KEEP_INIT;
+int    tcp_syn_cache_interval = 1;     /* runs timer twice a second */
diff -r cb3cc27e7d04 -r 7eb0647b815a sys/netinet/tcp_input.c
--- a/sys/netinet/tcp_input.c   Thu Apr 29 03:38:39 1999 +0000
+++ b/sys/netinet/tcp_input.c   Thu Apr 29 03:54:22 1999 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: tcp_input.c,v 1.79 1999/04/22 01:32:30 simonb Exp $    */
+/*     $NetBSD: tcp_input.c,v 1.80 1999/04/29 03:54:22 thorpej Exp $   */
 
 /*-
  * Copyright (c) 1997, 1998, 1999 The NetBSD Foundation, Inc.
@@ -1846,13 +1846,11 @@
        ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \
                                     ((u_int32_t)(sp)))^syn_hash2)))
 
-LIST_HEAD(, syn_cache_head) tcp_syn_cache_queue;
-
-#define        SYN_CACHE_RM(sc, scp)                                           \
+#define        SYN_CACHE_RM(sc)                                                \
 do {                                                                   \
-       TAILQ_REMOVE(&(scp)->sch_queue, (sc), sc_queue);                \
-       if (--(scp)->sch_length == 0)                                   \
-               LIST_REMOVE((scp), sch_headq);                          \
+       LIST_REMOVE((sc), sc_bucketq);                                  \
+       tcp_syn_cache[(sc)->sc_bucketidx].sch_length--;                 \
+       TAILQ_REMOVE(&tcp_syn_cache_timeq[(sc)->sc_rxtshift], (sc), sc_timeq); \
        syn_cache_count--;                                              \
 } while (0)
 
@@ -1867,17 +1865,33 @@
 
 struct pool syn_cache_pool;
 
+/*
+ * We don't estimate RTT with SYNs, so each packet starts with the default
+ * RTT and each timer queue has a fixed timeout value.  This allows us to
+ * optimize the timer queues somewhat.
+ */
+#define        SYN_CACHE_TIMER_ARM(sc)                                         \
+do {                                                                   \
+       TCPT_RANGESET((sc)->sc_rxtcur,                                  \
+           TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \
+           TCPTV_REXMTMAX);                                            \
+       PRT_SLOW_ARM((sc)->sc_rexmt, (sc)->sc_rxtcur);                  \
+} while (0)
+
+TAILQ_HEAD(, syn_cache) tcp_syn_cache_timeq[TCP_MAXRXTSHIFT + 1];
+
 void
 syn_cache_init()
 {
        int i;
 
-       /* Initialize the hash bucket queues. */
+       /* Initialize the hash buckets. */
        for (i = 0; i < tcp_syn_cache_size; i++)
-               TAILQ_INIT(&tcp_syn_cache[i].sch_queue);
+               LIST_INIT(&tcp_syn_cache[i].sch_bucket);
 
-       /* Initialize the active hash bucket cache. */
-       LIST_INIT(&tcp_syn_cache_queue);
+       /* Initialize the timer queues. */
+       for (i = 0; i <= TCP_MAXRXTSHIFT; i++)
+               TAILQ_INIT(&tcp_syn_cache_timeq[i]);
 
        /* Initialize the syn cache pool. */
        pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0,
@@ -1888,9 +1902,9 @@
 syn_cache_insert(sc)
        struct syn_cache *sc;
 {
-       struct syn_cache_head *scp, *scp2, *sce;
+       struct syn_cache_head *scp;
        struct syn_cache *sc2;
-       int s;
+       int s, i;
 
        /*
         * If there are no entries in the hash table, reinitialize
@@ -1904,7 +1918,8 @@
        }
 
        sc->sc_hash = SYN_HASH(&sc->sc_src, sc->sc_sport, sc->sc_dport);
-       scp = &tcp_syn_cache[sc->sc_hash % tcp_syn_cache_size];
+       sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size;
+       scp = &tcp_syn_cache[sc->sc_bucketidx];
 
        /*
         * Make sure that we don't overflow the per-bucket
@@ -1914,44 +1929,71 @@
        if (scp->sch_length >= tcp_syn_bucket_limit) {
                tcpstat.tcps_sc_bucketoverflow++;
                /*
-                * The bucket is full.  Toss the first (i.e. oldest)
-                * element in this bucket.
+                * The bucket is full.  Toss the oldest element in the
+                * bucket.  This will be the entry with our bucket
+                * index closest to the front of the timer queue with
+                * the largest timeout value.
+                *
+                * Note: This timer queue traversal may be expensive, so
+                * we hope that this doesn't happen very often.  It is
+                * much more likely that we'll overflow the entire
+                * cache, which is much easier to handle; see below.
                 */
-               sc2 = TAILQ_FIRST(&scp->sch_queue);
-               SYN_CACHE_RM(sc2, scp);
-               SYN_CACHE_PUT(sc2);
+               for (i = TCP_MAXRXTSHIFT; i >= 0; i--) {
+                       for (sc2 = TAILQ_FIRST(&tcp_syn_cache_timeq[i]);
+                            sc2 != NULL;
+                            sc2 = TAILQ_NEXT(sc2, sc_timeq)) {
+                               if (sc2->sc_bucketidx == sc->sc_bucketidx) {
+                                       SYN_CACHE_RM(sc2);
+                                       SYN_CACHE_PUT(sc2);
+                                       goto insert;    /* 2 level break */
+                               }
+                       }
+               }
+#ifdef DIAGNOSTIC
+               /*
+                * This should never happen; we should always find an
+                * entry in our bucket.
+                */
+               panic("syn_cache_insert: bucketoverflow: impossible");
+#endif
        } else if (syn_cache_count >= tcp_syn_cache_limit) {
                tcpstat.tcps_sc_overflowed++;
                /*
-                * The cache is full.  Toss the first (i.e. oldest)
-                * element in the first non-empty bucket we can find.
+                * The cache is full.  Toss the oldest entry in the
+                * entire cache.  This is the front entry in the
+                * first non-empty timer queue with the largest
+                * timeout value.
                 */
-               scp2 = scp;
-               if (TAILQ_FIRST(&scp2->sch_queue) == NULL) {
-                       sce = &tcp_syn_cache[tcp_syn_cache_size];
-                       for (++scp2; scp2 != scp; scp2++) {
-                               if (scp2 >= sce)
-                                       scp2 = &tcp_syn_cache[0];
-                               if (TAILQ_FIRST(&scp2->sch_queue) != NULL)
-                                       break;
-                       }
+               for (i = TCP_MAXRXTSHIFT; i >= 0; i--) {
+                       sc2 = TAILQ_FIRST(&tcp_syn_cache_timeq[i]);
+                       if (sc2 == NULL)
+                               continue;
+                       SYN_CACHE_RM(sc2);
+                       SYN_CACHE_PUT(sc2);
+                       goto insert;            /* symmetry with above */
                }
-               sc2 = TAILQ_FIRST(&scp2->sch_queue);
-               if (sc2 == NULL) {
-                       SYN_CACHE_PUT(sc);
-                       return;
-               }
-               SYN_CACHE_RM(sc2, scp2);
-               SYN_CACHE_PUT(sc2);
+#ifdef DIAGNOSTIC
+               /*
+                * This should never happen; we should always find an
+                * entry in the cache.
+                */
+               panic("syn_cache_insert: cache overflow: impossible");
+#endif
        }
 
-       /* Set entry's timer. */
-       PRT_SLOW_ARM(sc->sc_timer, tcp_syn_cache_timeo);
+ insert:
+       /*
+        * Initialize the entry's timer.
+        */
+       sc->sc_rxttot = 0;
+       sc->sc_rxtshift = 0;
+       SYN_CACHE_TIMER_ARM(sc);
+       TAILQ_INSERT_TAIL(&tcp_syn_cache_timeq[sc->sc_rxtshift], sc, sc_timeq);
 
        /* Put it into the bucket. */
-       TAILQ_INSERT_TAIL(&scp->sch_queue, sc, sc_queue);
-       if (++scp->sch_length == 1)
-               LIST_INSERT_HEAD(&tcp_syn_cache_queue, scp, sch_headq);
+       LIST_INSERT_HEAD(&scp->sch_bucket, sc, sc_bucketq);
+       scp->sch_length++;
        syn_cache_count++;
 
        tcpstat.tcps_sc_added++;
@@ -1959,31 +2001,64 @@
 }
 
 /*
- * Walk down the cache list, looking for expired entries in each bucket.
+ * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
+ * If we have retransmitted an entry the maximum number of times, expire
+ * that entry.
  */
 void
 syn_cache_timer()
 {
-       struct syn_cache_head *scp, *nscp;
        struct syn_cache *sc, *nsc;
-       int s;
+       int i, s;
 
        s = splsoftnet();
-       for (scp = LIST_FIRST(&tcp_syn_cache_queue); scp != NULL; scp = nscp) {
-#ifdef DIAGNOSTIC
-               if (TAILQ_FIRST(&scp->sch_queue) == NULL)
-                       panic("syn_cache_timer: queue inconsistency");
-#endif
-               nscp = LIST_NEXT(scp, sch_headq);
-               for (sc = TAILQ_FIRST(&scp->sch_queue);
-                    sc != NULL && PRT_SLOW_ISEXPIRED(sc->sc_timer);
+
+       /*
+        * First, get all the entries that need to be retransmitted, or
+        * must be expired due to exceeding the initial keepalive time.
+        */
+       for (i = 0; i < TCP_MAXRXTSHIFT; i++) {
+               for (sc = TAILQ_FIRST(&tcp_syn_cache_timeq[i]);
+                    sc != NULL && PRT_SLOW_ISEXPIRED(sc->sc_rexmt);
                     sc = nsc) {
-                       nsc = TAILQ_NEXT(sc, sc_queue);
-                       tcpstat.tcps_sc_timed_out++;
-                       SYN_CACHE_RM(sc, scp);
-                       SYN_CACHE_PUT(sc);
+                       nsc = TAILQ_NEXT(sc, sc_timeq);
+
+                       /*
+                        * Compute the total amount of time this entry has
+                        * been on a queue.  If this entry has been on longer
+                        * than the keep alive timer would allow, expire it.
+                        */
+                       sc->sc_rxttot += sc->sc_rxtcur;
+                       if (sc->sc_rxttot >= TCPTV_KEEP_INIT) {
+                               tcpstat.tcps_sc_timed_out++;
+                               SYN_CACHE_RM(sc);
+                               SYN_CACHE_PUT(sc);
+                               continue;
+                       }
+
+                       tcpstat.tcps_sc_retransmitted++;
+                       (void) syn_cache_respond(sc, NULL);
+
+                       /* Advance this entry onto the next timer queue. */
+                       TAILQ_REMOVE(&tcp_syn_cache_timeq[i], sc, sc_timeq);
+                       sc->sc_rxtshift = i + 1;
+                       SYN_CACHE_TIMER_ARM(sc);
+                       TAILQ_INSERT_TAIL(&tcp_syn_cache_timeq[sc->sc_rxtshift],
+                           sc, sc_timeq);
                }
        }
+
+       /*
+        * Now get all the entries that are expired due to too many
+        * retransmissions.
+        */
+       for (sc = TAILQ_FIRST(&tcp_syn_cache_timeq[TCP_MAXRXTSHIFT]);
+            sc != NULL && PRT_SLOW_ISEXPIRED(sc->sc_rexmt);
+            sc = nsc) {
+               tcpstat.tcps_sc_timed_out++;
+               SYN_CACHE_RM(sc);
+               SYN_CACHE_PUT(sc);
+       }
        splx(s);
 }
 
@@ -2005,8 +2080,8 @@
        scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
        *headp = scp;
        s = splsoftnet();
-       for (sc = TAILQ_FIRST(&scp->sch_queue); sc != NULL;
-            sc = TAILQ_NEXT(sc, sc_queue)) {
+       for (sc = LIST_FIRST(&scp->sch_bucket); sc != NULL;
+            sc = LIST_NEXT(sc, sc_bucketq)) {
                if (sc->sc_hash != hash)
                        continue;
                if (sc->sc_src.s_addr == ti->ti_src.s_addr &&
@@ -2056,7 +2131,6 @@
        register struct tcpiphdr *ti;
        struct sockaddr_in *sin;
        struct mbuf *am;
-       long win;
        int s;
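
Both overflow cases in syn_cache_insert() above lean on the same invariant:
every entry on a given timer queue shares one fixed timeout, so each queue
stays ordered by age, and the oldest entry in the whole cache is the front
entry of the first non-empty queue, scanning from the largest timeout down.
Here is a sketch of that eviction scan, assuming <sys/queue.h> TAILQs as in
the kernel; victim() and the struct fields are illustrative names, not the
tree's:

#include <sys/queue.h>
#include <stddef.h>

#define MAXRXTSHIFT     6

struct sc {
        int sc_bucketidx;               /* hash bucket of this entry */
        TAILQ_ENTRY(sc) sc_timeq;
};

TAILQ_HEAD(scq, sc);
static struct scq timeq[MAXRXTSHIFT + 1];

/*
 * With bucket < 0, return the oldest entry in the entire cache (the
 * head of the first non-empty queue).  Otherwise return the oldest
 * entry in the given hash bucket, which may have to walk whole queues;
 * this is the traversal the diff warns may be expensive.
 */
static struct sc *
victim(int bucket)
{
        struct sc *sc;
        int i;

        for (i = MAXRXTSHIFT; i >= 0; i--) {
                for (sc = TAILQ_FIRST(&timeq[i]); sc != NULL;
                     sc = TAILQ_NEXT(sc, sc_timeq)) {
                        if (bucket < 0 || sc->sc_bucketidx == bucket)
                                return sc;
                }
        }
        return NULL;
}

int
main(void)
{
        int i;

        for (i = 0; i <= MAXRXTSHIFT; i++)
                TAILQ_INIT(&timeq[i]);
        return (victim(-1) == NULL) ? 0 : 1;    /* empty cache: no victim */
}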


