Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src/sys/netinet * implement TCP CUBIC congestion control algorithm



details:   https://anonhg.NetBSD.org/src/rev/aeccba13f496
branches:  trunk
changeset: 791294:aeccba13f496
user:      kefren <kefren%NetBSD.org@localhost>
date:      Tue Nov 12 09:02:05 2013 +0000

description:
* implement TCP CUBIC congestion control algorithm
* move tcp_sack_newack bits inside reno and newreno_fast_retransmit_newack
* notify ECN peer about cwnd shrink in [new]reno_slow_retransmit

Based on the patch proposed on tech-net@ on Nov 7 with minor improvments:
 * adapt wmax for no-fast convergence case
 * correct cbrt calculation for big window sizes (>750KB)

diffstat:

 sys/netinet/tcp_congctl.c |  303 +++++++++++++++++++++++++++++++++++++++++----
 sys/netinet/tcp_congctl.h |    3 +-
 sys/netinet/tcp_input.c   |   12 +-
 sys/netinet/tcp_sack.c    |   67 +---------
 sys/netinet/tcp_subr.c    |    7 +-
 sys/netinet/tcp_var.h     |    8 +-
 6 files changed, 294 insertions(+), 106 deletions(-)

diffs (truncated from 612 to 300 lines):

diff -r 56f4e9a7b741 -r aeccba13f496 sys/netinet/tcp_congctl.c
--- a/sys/netinet/tcp_congctl.c Tue Nov 12 06:07:30 2013 +0000
+++ b/sys/netinet/tcp_congctl.c Tue Nov 12 09:02:05 2013 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: tcp_congctl.c,v 1.17 2013/10/25 16:29:20 martin Exp $  */
+/*     $NetBSD: tcp_congctl.c,v 1.18 2013/11/12 09:02:05 kefren Exp $  */
 
 /*-
  * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006 The NetBSD Foundation, Inc.
@@ -135,7 +135,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: tcp_congctl.c,v 1.17 2013/10/25 16:29:20 martin Exp $");
+__KERNEL_RCSID(0, "$NetBSD: tcp_congctl.c,v 1.18 2013/11/12 09:02:05 kefren Exp $");
 
 #include "opt_inet.h"
 #include "opt_tcp_debug.h"
@@ -194,6 +194,9 @@
  *   consider separating the actual implementations in another file.
  */
 
+static void tcp_common_congestion_exp(struct tcpcb *, int, int);
+
+static int  tcp_reno_do_fast_retransmit(struct tcpcb *, const struct tcphdr *);
 static int  tcp_reno_fast_retransmit(struct tcpcb *, const struct tcphdr *);
 static void tcp_reno_slow_retransmit(struct tcpcb *);
 static void tcp_reno_fast_retransmit_newack(struct tcpcb *,
@@ -206,6 +209,10 @@
        const struct tcphdr *);
 static void tcp_newreno_newack(struct tcpcb *, const struct tcphdr *);
 
+static int tcp_cubic_fast_retransmit(struct tcpcb *, const struct tcphdr *);
+static void tcp_cubic_slow_retransmit(struct tcpcb *tp);
+static void tcp_cubic_newack(struct tcpcb *, const struct tcphdr *);
+static void tcp_cubic_congestion_exp(struct tcpcb *);
 
 static void tcp_congctl_fillnames(void);
 
@@ -241,6 +248,8 @@
        KASSERT(r == 0);
        r = tcp_congctl_register("newreno", &tcp_newreno_ctl);
        KASSERT(r == 0);
+       r = tcp_congctl_register("cubic", &tcp_cubic_ctl);
+       KASSERT(r == 0);
 
        /* NewReno is the default. */
 #ifndef TCP_CONGCTL_DEFAULT
@@ -406,18 +415,28 @@
 /* ------------------------------------------------------------------------ */
 
 /*
- * TCP/Reno congestion control.
+ * Common stuff
  */
+
+/* Window reduction (1-beta) for [New]Reno: 0.5 */
+#define RENO_BETAA 1
+#define RENO_BETAB 2
+/* Window reduction (1-beta) for Cubic: 0.8 */
+#define CUBIC_BETAA 4
+#define CUBIC_BETAB 5
+/* Draft Rhee Section 4.1 */
+#define CUBIC_CA 4
+#define CUBIC_CB 10
+
 static void
-tcp_reno_congestion_exp(struct tcpcb *tp)
+tcp_common_congestion_exp(struct tcpcb *tp, int betaa, int betab)
 {
        u_int win;
 
        /* 
-        * Halve the congestion window and reduce the
-        * slow start threshold.
+        * Reduce the congestion window and the slow start threshold.
         */
-       win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_segsz;
+       win = min(tp->snd_wnd, tp->snd_cwnd) * betaa / betab / tp->t_segsz;
        if (win < 2)
                win = 2;
 
@@ -434,9 +453,20 @@
 }
 
 
+/* ------------------------------------------------------------------------ */
+
+/*
+ * TCP/Reno congestion control.
+ */
+static void
+tcp_reno_congestion_exp(struct tcpcb *tp)
+{
+
+       tcp_common_congestion_exp(tp, RENO_BETAA, RENO_BETAB);
+}
 
 static int
-tcp_reno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
+tcp_reno_do_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
 {
        /*
         * We know we're losing at the current
@@ -458,10 +488,8 @@
         * irrespective of the number of DupAcks.
         */
        
-       tcp_seq onxt;
-       
-       onxt = tp->snd_nxt;
-       tcp_reno_congestion_exp(tp);
+       tcp_seq onxt = tp->snd_nxt;
+
        tp->t_partialacks = 0;
        TCP_TIMER_DISARM(tp, TCPT_REXMT);
        tp->t_rtttime = 0;
@@ -482,6 +510,14 @@
        return 0;
 }
 
+static int
+tcp_reno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
+{
+
+       tcp_reno_congestion_exp(tp);
+       return tcp_reno_do_fast_retransmit(tp, th);
+}
+
 static void
 tcp_reno_slow_retransmit(struct tcpcb *tp)
 {
@@ -521,6 +557,9 @@
        tp->t_partialacks = -1;
        tp->t_dupacks = 0;
        tp->t_bytes_acked = 0;
+
+       if (TCP_ECN_ALLOWED(tp))
+               tp->t_flags |= TF_ECN_SND_CWR;
 }
 
 static void
@@ -543,6 +582,8 @@
                tp->t_partialacks = -1;
                tp->t_dupacks = 0;
                tp->t_bytes_acked = 0;
+               if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack))
+                       tp->snd_fack = th->th_ack;
        }
 }
 
@@ -653,6 +694,7 @@
                 */
                tcp_seq onxt = tp->snd_nxt;
                u_long ocwnd = tp->snd_cwnd;
+               int sack_num_segs = 1, sack_bytes_rxmt = 0;
 
                /*
                 * snd_una has not yet been updated and the socket's send
@@ -660,24 +702,52 @@
                 * have to leave snd_una as it was to get the correct data
                 * offset in tcp_output().
                 */
-               if (++tp->t_partialacks == 1)
-                       TCP_TIMER_DISARM(tp, TCPT_REXMT);
+               tp->t_partialacks++;
+               TCP_TIMER_DISARM(tp, TCPT_REXMT);
                tp->t_rtttime = 0;
                tp->snd_nxt = th->th_ack;
-               /*
-                * Set snd_cwnd to one segment beyond ACK'd offset.  snd_una
-                * is not yet updated when we're called.
-                */
-               tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una);
-               (void) tcp_output(tp);
-               tp->snd_cwnd = ocwnd;
-               if (SEQ_GT(onxt, tp->snd_nxt))
-                       tp->snd_nxt = onxt;
-               /*
-                * Partial window deflation.  Relies on fact that tp->snd_una
-                * not updated yet.
-                */
-               tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_segsz);
+
+               if (TCP_SACK_ENABLED(tp)) {
+                       /*
+                        * Partial ack handling within a sack recovery episode.
+                        * Keeping this very simple for now. When a partial ack
+                        * is received, force snd_cwnd to a value that will
+                        * allow the sender to transmit no more than 2 segments.
+                        * If necessary, a fancier scheme can be adopted at a
+                        * later point, but for now, the goal is to prevent the
+                        * sender from bursting a large amount of data in the
+                        * midst of sack recovery.
+                        */
+
+                       /*
+                        * send one or 2 segments based on how much
+                        * new data was acked
+                        */
+                       if (((th->th_ack - tp->snd_una) / tp->t_segsz) > 2)
+                               sack_num_segs = 2;
+                       (void)tcp_sack_output(tp, &sack_bytes_rxmt);
+                       tp->snd_cwnd = sack_bytes_rxmt +
+                           (tp->snd_nxt - tp->sack_newdata) +
+                           sack_num_segs * tp->t_segsz;
+                       tp->t_flags |= TF_ACKNOW;
+                       (void) tcp_output(tp);
+               } else {
+                       /*
+                        * Set snd_cwnd to one segment beyond ACK'd offset
+                        * snd_una is not yet updated when we're called
+                        */
+                       tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una);
+                       (void) tcp_output(tp);
+                       tp->snd_cwnd = ocwnd;
+                       if (SEQ_GT(onxt, tp->snd_nxt))
+                               tp->snd_nxt = onxt;
+                       /*
+                        * Partial window deflation.  Relies on fact that
+                        * tp->snd_una not updated yet.
+                        */
+                       tp->snd_cwnd -= (th->th_ack - tp->snd_una -
+                           tp->t_segsz);
+               }
        } else {
                /*
                 * Complete ack.  Inflate the congestion window to ssthresh
@@ -696,6 +766,8 @@
                tp->t_partialacks = -1;
                tp->t_dupacks = 0;
                tp->t_bytes_acked = 0;
+               if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack))
+                       tp->snd_fack = th->th_ack;
        }
 }
 
@@ -720,4 +792,179 @@
        .cong_exp = tcp_reno_congestion_exp,
 };
 
+/*
+ * CUBIC - http://tools.ietf.org/html/draft-rhee-tcpm-cubic-02
+ */
 
+/* Cubic prototypes */
+static void    tcp_cubic_update_ctime(struct tcpcb *tp);
+static uint32_t        tcp_cubic_diff_ctime(struct tcpcb *);
+static uint32_t        tcp_cubic_cbrt(uint32_t);
+static uint32_t        tcp_cubic_getW(struct tcpcb *);
+
+/* Cubic TIME functions - XXX I don't like using timevals and microuptime */
+/*
+ * Set congestion timer to now
+ */
+static void
+tcp_cubic_update_ctime(struct tcpcb *tp)
+{
+       struct timeval now_timeval;
+
+       getmicrouptime(&now_timeval);
+       tp->snd_cubic_ctime = now_timeval.tv_sec * 1000 +
+           now_timeval.tv_usec / 1000;
+}
+
+/*
+ * miliseconds from last congestion
+ */
+static uint32_t
+tcp_cubic_diff_ctime(struct tcpcb *tp)
+{
+       struct timeval now_timeval;
+
+       getmicrouptime(&now_timeval);
+       return now_timeval.tv_sec * 1000 + now_timeval.tv_usec / 1000 -
+           tp->snd_cubic_ctime;
+}
+
+/*
+ * Approximate cubic root
+ */
+#define CBRT_ROUNDS 30
+static uint32_t
+tcp_cubic_cbrt(uint32_t v)
+{
+       int i, rounds = CBRT_ROUNDS;
+       uint64_t x = v / 3;
+
+       /* We fail to calculate correct for small numbers */
+       if (v == 0)
+               return 0;
+       else if (v < 4)
+               return 1;
+
+       /*
+        * largest x that 2*x^3+3*x fits 64bit
+        * Avoid overflow for a time cost
+        */
+       if (x > 2097151)
+               rounds += 10;



Home | Main Index | Thread Index | Old Index