Subject: Re: Refactoring Congestion Control (take 2)
To: YAMAMOTO Takashi <yamt@mwd.biglobe.ne.jp>
From: Rui Paulo <rpaulo@fnop.net>
List: tech-net
Date: 09/23/2006 18:32:20
--Apple-Mail-1-336914240
Content-Transfer-Encoding: 7bit
Content-Type: text/plain;
	charset=US-ASCII;
	delsp=yes;
	format=flowed


On Sep 21, 2006, at 4:18 AM, YAMAMOTO Takashi wrote:

>> Any other comments?
>
> i think it's better to copy tcp_congctl_global to a member in tcpcb
> so that it's somewhat static for a given connection.
> switching the sysctl knob correctly when it affects existing  
> connections
> is a locking nightmare.
>

Maybe something like:


--Apple-Mail-1-336914240
Content-Transfer-Encoding: 7bit
Content-Type: application/octet-stream;
	x-unix-mode=0644;
	name=tcp_congctl.diff
Content-Disposition: attachment;
	filename=tcp_congctl.diff

Index: files.netinet
===================================================================
RCS file: /cvsroot/src/sys/netinet/files.netinet,v
retrieving revision 1.11
diff -u -p -r1.11 files.netinet
--- files.netinet	11 Dec 2005 12:24:57 -0000	1.11
+++ files.netinet	23 Sep 2006 17:28:35 -0000
@@ -15,6 +15,8 @@ defparam opt_tcp_space.h	TCP_RECVSPACE T
 defflag opt_inet_csum.h		INET_CSUM_COUNTERS TCP_CSUM_COUNTERS
 				UDP_CSUM_COUNTERS
 
+defparam opt_tcp_congctl.h	TCP_CONGCTL_DEFAULT
+
 file	netinet/igmp.c		inet
 file	netinet/in.c		inet
 file	netinet/in_pcb.c	inet
@@ -34,5 +36,6 @@ file	netinet/tcp_sack.c	inet | inet6
 file	netinet/tcp_subr.c	inet | inet6
 file	netinet/tcp_timer.c	inet | inet6
 file	netinet/tcp_usrreq.c	inet | inet6
+file	netinet/tcp_congctl.c	inet | inet6
 
 file	netinet/udp_usrreq.c	inet | inet6
Index: tcp.h
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp.h,v
retrieving revision 1.22
diff -u -p -r1.22 tcp.h
--- tcp.h	5 Sep 2006 00:29:36 -0000	1.22
+++ tcp.h	23 Sep 2006 17:28:35 -0000
@@ -118,5 +118,6 @@ struct tcphdr {
 #define	TCP_MAXSEG	0x02	/* set maximum segment size */
 /* Bits 0x04, 0x08 reserved for FreeBSD compatibility: TCP_NOPUSH, TCP_NOOPT */
 #define TCP_MD5SIG	0x10	/* use MD5 digests (RFC2385) */
+#define	TCP_CONGCTL	0x20	/* selected congestion control */
 
 #endif /* !_NETINET_TCP_H_ */
Index: tcp_congctl.c
===================================================================
RCS file: tcp_congctl.c
diff -N tcp_congctl.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ tcp_congctl.c	23 Sep 2006 17:28:36 -0000
@@ -0,0 +1,615 @@
+/*	$NetBSD$	*/
+
+/*
+ * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the project nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ *      @(#)COPYRIGHT   1.1 (NRL) 17 January 1995
+ *
+ * NRL grants permission for redistribution and use in source and binary
+ * forms, with or without modification, of the software and documentation
+ * created at NRL provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgements:
+ *      This product includes software developed by the University of
+ *      California, Berkeley and its contributors.
+ *      This product includes software developed at the Information
+ *      Technology Division, US Naval Research Laboratory.
+ * 4. Neither the name of the NRL nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
+ * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * The views and conclusions contained in the software and documentation
+ * are those of the authors and should not be interpreted as representing
+ * official policies, either expressed or implied, of the US Naval
+ * Research Laboratory (NRL).
+ */
+
+/*-
+ * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
+ * Facility, NASA Ames Research Center.
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Charles M. Hannum.
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Rui Paulo.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the NetBSD
+ *	Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD$");
+
+#include "opt_inet.h"
+#include "opt_tcp_debug.h"
+#include "opt_tcp_congctl.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/errno.h>
+#include <sys/syslog.h>
+#include <sys/pool.h>
+#include <sys/domain.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+
+#include <net/if.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_var.h>
+#include <netinet/ip_var.h>
+
+#ifdef INET6
+#ifndef INET
+#include <netinet/in.h>
+#endif
+#include <netinet/ip6.h>
+#include <netinet6/ip6_var.h>
+#include <netinet6/in6_pcb.h>
+#include <netinet6/ip6_var.h>
+#include <netinet6/in6_var.h>
+#include <netinet/icmp6.h>
+#include <netinet6/nd6.h>
+#endif
+
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcpip.h>
+#include <netinet/tcp_congctl.h>
+#ifdef TCP_DEBUG
+#include <netinet/tcp_debug.h>
+#endif
+
+/*
+ * TODO:
+ *   consider separating the actual implementations in another file.
+ */
+
+static int  tcp_reno_fast_retransmit(struct tcpcb *, struct tcphdr *);
+static void tcp_reno_slow_retransmit(struct tcpcb *);
+static void tcp_reno_cwnd_inflation(struct tcpcb *, struct tcphdr *);
+static void tcp_reno_new_data_acked(struct tcpcb *, struct tcphdr *);
+
+static int  tcp_newreno_fast_retransmit(struct tcpcb *, struct tcphdr *);
+static void tcp_newreno_cwnd_inflation(struct tcpcb *, struct tcphdr *);
+
+static void tcp_congctl_fillnames(void);
+
+extern int tcprexmtthresh;
+
+MALLOC_DEFINE(M_TCPCONGCTL, "tcpcongctl", "TCP congestion control structures");
+
+/*
+ * Registry entry binding an algorithm name to its function table.
+ */
+struct tcp_congctlent {
+	TAILQ_ENTRY(tcp_congctlent) congctl_ent;	/* tcp_congctlhd linkage */
+        char               congctl_name[TCPCC_MAXLEN];	/* lookup key */
+	struct tcp_congctl *congctl_ctl;	/* the algorithm's function table */
+};
+TAILQ_HEAD(, tcp_congctlent) tcp_congctlhd;	/* every registered algorithm */
+
+struct simplelock tcp_congctl_slock;
+
+void
+tcp_congctl_init(void)
+{
+	int r;
+	/* Called from tcp_init(): set up the registry and the refcnt lock. */
+	TAILQ_INIT(&tcp_congctlhd);
+	simple_lock_init(&tcp_congctl_slock);
+
+	/* Base algorithms; the first one registered also becomes the global. */
+	r = tcp_congctl_register("reno", &tcp_reno_ctl);
+	KASSERT(r == 0);
+	r = tcp_congctl_register("newreno", &tcp_newreno_ctl);
+	KASSERT(r == 0);
+
+	/* NewReno is the default unless TCP_CONGCTL_DEFAULT says otherwise. */
+#ifndef TCP_CONGCTL_DEFAULT
+#define TCP_CONGCTL_DEFAULT "newreno"
+#endif
+	/* A NULL tcpcb means "set the global algorithm". */
+	r = tcp_congctl_select(NULL, TCP_CONGCTL_DEFAULT);
+	KASSERT(r == 0);
+}
+
+/*
+ * Register algorithm `tcc' under `name' and select it if we have none.
+ * Returns 0, or EEXIST if `name' is taken; over-long names are truncated.
+ */
+int
+tcp_congctl_register(const char *name, struct tcp_congctl *tcc)
+{
+	struct tcp_congctlent *ntcc, *tccp;
+
+	/* Refuse a second registration under an existing name. */
+	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent)
+		if (!strcmp(name, tccp->congctl_name))
+			return EEXIST;
+
+	ntcc = malloc(sizeof(*ntcc), M_TCPCONGCTL, M_WAITOK);
+	/* strlcpy() takes the full buffer size and NUL-terminates itself. */
+	strlcpy(ntcc->congctl_name, name, sizeof(ntcc->congctl_name));
+	ntcc->congctl_ctl = tcc;
+
+	TAILQ_INSERT_TAIL(&tcp_congctlhd, ntcc, congctl_ent);
+	tcp_congctl_fillnames();
+
+	/* First entry on the list: make it the global algorithm. */
+	if (TAILQ_FIRST(&tcp_congctlhd) == ntcc)
+		tcp_congctl_select(NULL, name);
+	return 0;
+}
+
+int
+tcp_congctl_unregister(const char *name)
+{
+	struct tcp_congctlent *tccp, *rtccp;
+	unsigned int size;
+	/* Unregister `name': returns 0, ENOENT (unknown) or EBUSY (in use). */
+	rtccp = NULL;
+	size = 0;
+	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
+		if (!strcmp(name, tccp->congctl_name))
+			rtccp = tccp;
+		size++;
+	}
+	/* The loop never breaks early so that `size' counts every entry. */
+	if (!rtccp)
+		return ENOENT;
+	/* Keep the last remaining, the global, and any referenced algorithm. */
+	if (size <= 1 || tcp_congctl_global == rtccp->congctl_ctl ||
+	    rtccp->congctl_ctl->refcnt)
+		return EBUSY;
+
+	TAILQ_REMOVE(&tcp_congctlhd, rtccp, congctl_ent);
+	free(rtccp, M_TCPCONGCTL);
+	tcp_congctl_fillnames();
+
+	return 0;
+}
+
+/*
+ * Select a congestion algorithm by name, for connection `tp' or (when `tp'
+ * is NULL) globally.  Returns 0, or EINVAL if `name' is not registered.
+ */
+int
+tcp_congctl_select(struct tcpcb *tp, const char *name)
+{
+	struct tcp_congctlent *tccp;
+
+	KASSERT(name);
+	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent)
+		if (!strcmp(name, tccp->congctl_name)) {
+			if (tp) {
+				/* NOTE(review): needs tp->t_congctl != NULL. */
+				simple_lock(&tcp_congctl_slock);
+				tp->t_congctl->refcnt--;
+				tp->t_congctl = tccp->congctl_ctl;
+				tp->t_congctl->refcnt++;
+				simple_unlock(&tcp_congctl_slock);
+			} else {
+				tcp_congctl_global = tccp->congctl_ctl;
+				strlcpy(tcp_congctl_global_name,
+				    tccp->congctl_name,
+				    sizeof(tcp_congctl_global_name));
+			}
+			return 0;
+		}
+	return EINVAL;
+}
+
+/*
+ * Returns the registered name of function table `tcc', or NULL if none.
+ */
+const char *
+tcp_congctl_bystruct(const struct tcp_congctl *tcc)
+{
+	struct tcp_congctlent *tccp;
+	
+	KASSERT(tcc);
+	/* Reverse lookup: match on the table pointer, not on the name. */
+	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent)
+		if (tccp->congctl_ctl == tcc)
+			return tccp->congctl_name;
+
+	return NULL;
+}
+
+static void
+tcp_congctl_fillnames(void)
+{
+	struct tcp_congctlent *tccp;
+	const char *delim = " ";
+	/* Rebuild tcp_congctl_avail as a space-separated list of names. */
+	tcp_congctl_avail[0] = '\0';
+	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
+		/* strlcat() wants the full buffer size; it NUL-terminates. */
+		strlcat(tcp_congctl_avail, tccp->congctl_name,
+		    sizeof(tcp_congctl_avail));
+		if (TAILQ_NEXT(tccp, congctl_ent))
+			strlcat(tcp_congctl_avail, delim,
+			    sizeof(tcp_congctl_avail));
+	}
+}
+
+/* ------------------------------------------------------------------------ */
+
+void
+tcp_reno_congestion_exp(struct tcpcb *tp)
+{
+	u_int win;
+
+	/*
+	 * Halve the window in use (never below two segments).  Not `inline':
+	 * tcp_input.c calls this, so an out-of-line definition must exist.
+	 */
+	win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_segsz;
+	if (win < 2)
+		win = 2;
+	/* ssthresh and cwnd both drop to that size; record the recovery point. */
+	tp->snd_ssthresh = win * tp->t_segsz;
+	tp->snd_recover = tp->snd_max;
+	tp->snd_cwnd = tp->snd_ssthresh;
+	/* Set TF_ECN_SND_CWR when ECN is permitted on this connection. */
+	if (TCP_ECN_ALLOWED(tp))
+		tp->t_flags |= TF_ECN_SND_CWR;
+}
+
+
+/*
+ * TCP/Reno fast retransmit.  Always returns 0; the caller then drops the ACK.
+ */
+static int
+tcp_reno_fast_retransmit(struct tcpcb *tp, struct tcphdr *th)
+{
+	tcp_seq onxt;
+	/* Remember snd_nxt: it is rewound below to resend from th_ack. */
+	onxt = tp->snd_nxt;
+	tcp_reno_congestion_exp(tp);
+	tp->t_partialacks = 0;		/* >= 0 means "in fast recovery" */
+	TCP_TIMER_DISARM(tp, TCPT_REXMT);
+	tp->t_rtttime = 0;
+	if (TCP_SACK_ENABLED(tp)) {
+		tp->t_dupacks = tcprexmtthresh;
+		tp->sack_newdata = tp->snd_nxt;
+		tp->snd_cwnd = tp->t_segsz;
+		(void) tcp_output(tp);
+		return 0;
+	}
+	tp->snd_nxt = th->th_ack;
+	tp->snd_cwnd = tp->t_segsz;	/* let tcp_output() send one segment */
+	(void) tcp_output(tp);
+	tp->snd_cwnd = tp->snd_ssthresh + tp->t_segsz * tp->t_dupacks;
+	if (SEQ_GT(onxt, tp->snd_nxt))
+		tp->snd_nxt = onxt;
+	/* 0 = retransmit done (nonzero would mean "false fast retransmit"). */
+	return 0;
+}
+
+static void
+tcp_reno_slow_retransmit(struct tcpcb *tp)
+{
+	u_int win;
+
+	/*
+	 * Close the congestion window down to one segment
+	 * (we'll open it by one segment for each ack we get).
+	 * Since we probably have a window's worth of unacked
+	 * data accumulated, this "slow start" keeps us from
+	 * dumping all that data as back-to-back packets (which
+	 * might overwhelm an intermediate gateway).
+	 *
+	 * There are two phases to the opening: Initially we
+	 * open by one mss on each ack.  This makes the window
+	 * size increase exponentially with time.  If the
+	 * window is larger than the path can handle, this
+	 * exponential growth results in dropped packet(s)
+	 * almost immediately.  To get more time between
+	 * drops but still "push" the network to take advantage
+	 * of improving conditions, we switch from exponential
+	 * to linear window opening at some threshold size.
+	 * For a threshold, we use half the current window
+	 * size, truncated to a multiple of the mss.
+	 *
+	 * (the minimum cwnd that will give us exponential
+	 * growth is 2 mss.  We don't allow the threshold
+	 * to go below this.)
+	 */
+
+	win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_segsz;
+	if (win < 2)
+		win = 2;
+	/* Loss Window MUST be one segment. */
+	tp->snd_cwnd = tp->t_segsz;
+	tp->snd_ssthresh = win * tp->t_segsz;
+	tp->t_partialacks = -1;		/* negative: not in fast recovery */
+	tp->t_dupacks = 0;
+}
+
+/*
+ * Reno response to an ACK of new data: always reset the duplicate-ACK
+ * counter; if we were in fast recovery (t_partialacks >= 0), clamp cwnd
+ * to ssthresh and leave fast recovery.  `th' is not used here.
+ */
+static void
+tcp_reno_cwnd_inflation(struct tcpcb *tp, struct tcphdr *th)
+{
+	const int in_fast_recovery = (tp->t_partialacks >= 0);
+
+	if (in_fast_recovery) {
+		/* Clamp the window back to the crossover point. */
+		if (tp->snd_cwnd > tp->snd_ssthresh)
+			tp->snd_cwnd = tp->snd_ssthresh;
+		/* A negative count marks "not in fast recovery". */
+		tp->t_partialacks = -1;
+	}
+	/* Either way, start counting duplicate ACKs afresh. */
+	tp->t_dupacks = 0;
+}
+
+static void
+tcp_reno_new_data_acked(struct tcpcb *tp, struct tcphdr *th)
+{
+	u_int cw;
+	u_int incr;
+	
+	/*
+	 * When new data is acked, open the congestion window.
+	 * If cwnd is at or below ssthresh, open exponentially
+	 * (segsz per ACK).  Otherwise open linearly: about one
+	 * segsz per window (segsz^2 / cwnd per ACK).  The result
+	 * is capped at the largest scaled window.  NewReno only
+	 * grows the window once th_ack has reached snd_recover;
+	 * plain Reno always does.
+	 */
+	cw = tp->snd_cwnd;
+	incr = tp->t_segsz;
+	if (cw > tp->snd_ssthresh)
+		incr = incr * incr / cw;
+	if (tp->t_congctl == &tcp_reno_ctl ||
+	    SEQ_GEQ(th->th_ack, tp->snd_recover))
+		tp->snd_cwnd = min(cw + incr, TCP_MAXWIN << tp->snd_scale);
+}
+
+struct tcp_congctl tcp_reno_ctl = {	/* refcnt is implicitly zero */
+	.fast_retransmit = tcp_reno_fast_retransmit,
+	.slow_retransmit = tcp_reno_slow_retransmit,
+	.cwnd_inflation = tcp_reno_cwnd_inflation,
+	.new_data_acked = tcp_reno_new_data_acked,
+};
+
+/*
+ * TCP/NewReno congestion control.
+ *
+ * Fast retransmit.  Returns 1 for a false fast retransmit (nothing is
+ * sent, and the caller must not drop the segment); otherwise returns
+ * whatever the Reno fast retransmit returns.
+ */
+static int
+tcp_newreno_fast_retransmit(struct tcpcb *tp, struct tcphdr *th)
+{
+	if (SEQ_LT(th->th_ack, tp->snd_high)) {
+		/*
+		 * False fast retransmit after timeout.
+		 * Do not enter fast recovery.
+		 */
+		tp->t_dupacks = 0;
+		return 1;
+	}
+
+	/* Otherwise fast retransmit is the same as Reno. */
+	return tcp_reno_fast_retransmit(tp, th);
+}
+
+/*
+ * Implement the NewReno response to an ack of new data, checking for
+ * partial acks while in fast recovery (t_partialacks >= 0).
+ */
+static void
+tcp_newreno_cwnd_inflation(struct tcpcb *tp, struct tcphdr *th)
+{
+	if (tp->t_partialacks < 0) {
+		/*
+		 * We were not in fast recovery.  Reset the duplicate ack
+		 * counter.
+		 */
+		tp->t_dupacks = 0;
+	} else if (SEQ_LT(th->th_ack, tp->snd_recover)) {
+		/*
+		 * This is a partial ack.  Retransmit the first unacknowledged
+		 * segment and deflate the congestion window by the amount of
+		 * acknowledged data.  Do not exit fast recovery.
+		 */
+		tcp_seq onxt = tp->snd_nxt;
+		u_long ocwnd = tp->snd_cwnd;
+
+		/*
+		 * snd_una has not yet been updated and the socket's send
+		 * buffer has not yet drained off the ACK'd data, so we
+		 * have to leave snd_una as it was to get the correct data
+		 * offset in tcp_output().
+		 */
+		if (++tp->t_partialacks == 1)
+			TCP_TIMER_DISARM(tp, TCPT_REXMT);	/* first one only */
+		tp->t_rtttime = 0;
+		tp->snd_nxt = th->th_ack;
+		/*
+		 * Set snd_cwnd to one segment beyond ACK'd offset.  snd_una
+		 * is not yet updated when we're called.
+		 */
+		tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una);
+		(void) tcp_output(tp);
+		tp->snd_cwnd = ocwnd;
+		if (SEQ_GT(onxt, tp->snd_nxt))
+			tp->snd_nxt = onxt;
+		/*
+		 * Partial window deflation.  Relies on the fact that
+		 * tp->snd_una has not been updated yet.
+		 */
+		tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_segsz);
+	} else {
+		/*
+		 * Complete ack.  Set the congestion window to at most
+		 * ssthresh and exit fast recovery.
+		 *
+		 * Window inflation should have left us with approx.
+		 * snd_ssthresh outstanding data.  But in case we
+		 * would be inclined to send a burst, better to do
+		 * it via the slow start mechanism.
+		 */
+		if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh)
+			tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack)
+			    + tp->t_segsz;
+		else
+			tp->snd_cwnd = tp->snd_ssthresh;
+		tp->t_partialacks = -1;
+		tp->t_dupacks = 0;
+	}
+}
+
+struct tcp_congctl tcp_newreno_ctl = {	/* refcnt is implicitly zero */
+	.fast_retransmit = tcp_newreno_fast_retransmit,
+	.slow_retransmit = tcp_reno_slow_retransmit,	/* shared with Reno */
+	.cwnd_inflation = tcp_newreno_cwnd_inflation,
+	.new_data_acked = tcp_reno_new_data_acked,	/* shared with Reno */
+};
+
+
Index: tcp_congctl.h
===================================================================
RCS file: tcp_congctl.h
diff -N tcp_congctl.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ tcp_congctl.h	23 Sep 2006 17:28:36 -0000
@@ -0,0 +1,77 @@
+/*	$NetBSD$	*/
+
+/*
+ * Copyright (c) 2006 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Rui Paulo.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the NetBSD
+ *	Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _NETINET_TCP_CONGCTL_H
+#define _NETINET_TCP_CONGCTL_H
+
+/*
+ * Congestion control function table; one instance per algorithm.
+ */
+struct tcp_congctl {
+	/* A nonzero return from fast_retransmit means "false retransmit". */
+	int  (*fast_retransmit)(struct tcpcb *, struct tcphdr *);
+	void (*slow_retransmit)(struct tcpcb *);
+	void (*cwnd_inflation)(struct tcpcb *, struct tcphdr *);
+	void (*new_data_acked)(struct tcpcb *, struct tcphdr *);
+	int32_t refcnt;	/* tcpcbs using this table; see tcp_congctl_slock */
+};
+
+extern struct tcp_congctl tcp_reno_ctl;
+extern struct tcp_congctl tcp_newreno_ctl;
+
+extern struct simplelock tcp_congctl_slock;
+
+#define TCPCC_MAXLEN 12		/* name buffer size, including the NUL */
+/* XXX: the three objects below are definitions, not extern declarations. */
+/* currently selected global congestion control */
+struct tcp_congctl *tcp_congctl_global;
+char   tcp_congctl_global_name[TCPCC_MAXLEN];
+
+/* available global congestion control algorithms */
+char   tcp_congctl_avail[10 * TCPCC_MAXLEN];
+
+void   tcp_congctl_init(void);
+int    tcp_congctl_register(const char *, struct tcp_congctl *);
+int    tcp_congctl_unregister(const char *);
+int    tcp_congctl_select(struct tcpcb *, const char *);
+const char *
+       tcp_congctl_bystruct(const struct tcp_congctl *);
+/* Plain prototype: `inline' here would require a definition in every TU. */
+void   tcp_reno_congestion_exp(struct tcpcb *);
+
+#endif
Index: tcp_input.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_input.c,v
retrieving revision 1.244
diff -u -p -r1.244 tcp_input.c
--- tcp_input.c	5 Sep 2006 00:29:36 -0000	1.244
+++ tcp_input.c	23 Sep 2006 17:28:41 -0000
@@ -214,6 +214,7 @@ __KERNEL_RCSID(0, "$NetBSD: tcp_input.c,
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcpip.h>
+#include <netinet/tcp_congctl.h>
 #include <netinet/tcp_debug.h>
 
 #include <machine/stdarg.h>
@@ -238,8 +239,6 @@ __KERNEL_RCSID(0, "$NetBSD: tcp_input.c,
 #endif
 #endif	/* FAST_IPSEC*/
 
-static inline void tcp_congestion_exp(struct tcpcb *);
-
 int	tcprexmtthresh = 3;
 int	tcp_log_refused;
 
@@ -409,28 +408,6 @@ tcpipqent_free(struct ipqent *ipqe)
 	splx(s);
 }
 
-/*
- * Halve the congestion window and reduce the
- * slow start threshold.
- *
- * Optionally, mark the packet.
- */
-static inline void
-tcp_congestion_exp(struct tcpcb *tp)
-{
-	u_int win;
-
-	win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_segsz;
-	if (win < 2)
-		win = 2;
-
-	tp->snd_ssthresh = win * tp->t_segsz;
-	tp->snd_recover = tp->snd_max;
-	tp->snd_cwnd = tp->snd_ssthresh;
-	if (TCP_ECN_ALLOWED(tp))
-		tp->t_flags |= TF_ECN_SND_CWR;
-}
-
 int
 tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen)
 {
@@ -1638,7 +1615,7 @@ after_listen:
 		 * Ignore if we are already trying to recover.
 		 */
 		if ((tiflags & TH_ECE) && SEQ_GEQ(tp->snd_una, tp->snd_recover))
-			tcp_congestion_exp(tp);
+			tcp_reno_congestion_exp(tp);
 	}
 
 	if (opti.ts_present && opti.ts_ecr) {
@@ -2278,39 +2255,15 @@ after_listen:
 				else if (tp->t_partialacks < 0 &&
 					 (++tp->t_dupacks == tcprexmtthresh ||
 					 TCP_FACK_FASTRECOV(tp))) {
-					tcp_seq onxt;
-
-					if ((tcp_do_newreno || tcp_do_ecn) &&
-					    SEQ_LT(th->th_ack, tp->snd_high)) {
-						/*
-						 * False fast retransmit after
-						 * timeout.  Do not enter fast
-						 * recovery.
-						 */
-						tp->t_dupacks = 0;
+					/*
+					 * Do the fast retransmit, and adjust
+					 * congestion control parameters.
+					 */
+					if (tp->t_congctl->fast_retransmit(tp, th)) {
+						/* False fast retransmit */
 						break;
-					}
-
-					onxt = tp->snd_nxt;
-					tcp_congestion_exp(tp);
-					tp->t_partialacks = 0;
-					TCP_TIMER_DISARM(tp, TCPT_REXMT);
-					tp->t_rtttime = 0;
-					if (TCP_SACK_ENABLED(tp)) {
-						tp->t_dupacks = tcprexmtthresh;
-						tp->sack_newdata = tp->snd_nxt;
-						tp->snd_cwnd = tp->t_segsz;
-						(void) tcp_output(tp);
+					} else
 						goto drop;
-					}
-					tp->snd_nxt = th->th_ack;
-					tp->snd_cwnd = tp->t_segsz;
-					(void) tcp_output(tp);
-					tp->snd_cwnd = tp->snd_ssthresh +
-					       tp->t_segsz * tp->t_dupacks;
-					if (SEQ_GT(onxt, tp->snd_nxt))
-						tp->snd_nxt = onxt;
-					goto drop;
 				} else if (tp->t_dupacks > tcprexmtthresh) {
 					tp->snd_cwnd += tp->t_segsz;
 					(void) tcp_output(tp);
@@ -2336,12 +2289,12 @@ after_listen:
 		 * If the congestion window was inflated to account
 		 * for the other side's cached packets, retract it.
 		 */
+		/* XXX: make SACK have its own congestion control
+		 * struct -- rpaulo */
 		if (TCP_SACK_ENABLED(tp))
 			tcp_sack_newack(tp, th);
-		else if (tcp_do_newreno)
-			tcp_newreno_newack(tp, th);
 		else
-			tcp_reno_newack(tp, th);
+			tp->t_congctl->cwnd_inflation(tp, th);
 		if (SEQ_GT(th->th_ack, tp->snd_max)) {
 			tcpstat.tcps_rcvacktoomuch++;
 			goto dropafterack;
@@ -2375,26 +2328,12 @@ after_listen:
 			needoutput = 1;
 		} else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
 			TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
+
 		/*
-		 * When new data is acked, open the congestion window.
-		 * If the window gives us less than ssthresh packets
-		 * in flight, open exponentially (segsz per packet).
-		 * Otherwise open linearly: segsz per window
-		 * (segsz^2 / cwnd per packet).
-		 *
-		 * If we are still in fast recovery (meaning we are using
-		 * NewReno and we have only received partial acks), do not
-		 * inflate the window yet.
-		 */
-		if (tp->t_partialacks < 0) {
-			u_int cw = tp->snd_cwnd;
-			u_int incr = tp->t_segsz;
-
-			if (cw >= tp->snd_ssthresh)
-				incr = incr * incr / cw;
-			tp->snd_cwnd = min(cw + incr,
-			    TCP_MAXWIN << tp->snd_scale);
-		}
+		 * New data has been acked, adjust the congestion window.
+		 */
+		tp->t_congctl->new_data_acked(tp, th);
+
 		ND6_HINT(tp);
 		if (acked > so->so_snd.sb_cc) {
 			tp->snd_wnd -= so->so_snd.sb_cc;
@@ -3220,93 +3159,6 @@ tcp_xmit_timer(struct tcpcb *tp, uint32_
 	tp->t_softerror = 0;
 }
 
-void
-tcp_reno_newack(struct tcpcb *tp, struct tcphdr *th)
-{
-	if (tp->t_partialacks < 0) {
-		/*
-		 * We were not in fast recovery.  Reset the duplicate ack
-		 * counter.
-		 */
-		tp->t_dupacks = 0;
-	} else {
-		/*
-		 * Clamp the congestion window to the crossover point and
-		 * exit fast recovery.
-		 */
-		if (tp->snd_cwnd > tp->snd_ssthresh)
-			tp->snd_cwnd = tp->snd_ssthresh;
-		tp->t_partialacks = -1;
-		tp->t_dupacks = 0;
-	}
-}
-
-/*
- * Implement the NewReno response to a new ack, checking for partial acks in
- * fast recovery.
- */
-void
-tcp_newreno_newack(struct tcpcb *tp, struct tcphdr *th)
-{
-	if (tp->t_partialacks < 0) {
-		/*
-		 * We were not in fast recovery.  Reset the duplicate ack
-		 * counter.
-		 */
-		tp->t_dupacks = 0;
-	} else if (SEQ_LT(th->th_ack, tp->snd_recover)) {
-		/*
-		 * This is a partial ack.  Retransmit the first unacknowledged
-		 * segment and deflate the congestion window by the amount of
-		 * acknowledged data.  Do not exit fast recovery.
-		 */
-		tcp_seq onxt = tp->snd_nxt;
-		u_long ocwnd = tp->snd_cwnd;
-
-		/*
-		 * snd_una has not yet been updated and the socket's send
-		 * buffer has not yet drained off the ACK'd data, so we
-		 * have to leave snd_una as it was to get the correct data
-		 * offset in tcp_output().
-		 */
-		if (++tp->t_partialacks == 1)
-			TCP_TIMER_DISARM(tp, TCPT_REXMT);
-		tp->t_rtttime = 0;
-		tp->snd_nxt = th->th_ack;
-		/*
-		 * Set snd_cwnd to one segment beyond ACK'd offset.  snd_una
-		 * is not yet updated when we're called.
-		 */
-		tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una);
-		(void) tcp_output(tp);
-		tp->snd_cwnd = ocwnd;
-		if (SEQ_GT(onxt, tp->snd_nxt))
-			tp->snd_nxt = onxt;
-		/*
-		 * Partial window deflation.  Relies on fact that tp->snd_una
-		 * not updated yet.
-		 */
-		tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_segsz);
-	} else {
-		/*
-		 * Complete ack.  Inflate the congestion window to ssthresh
-		 * and exit fast recovery.
-		 *
-		 * Window inflation should have left us with approx.
-		 * snd_ssthresh outstanding data.  But in case we
-		 * would be inclined to send a burst, better to do
-		 * it via the slow start mechanism.
-		 */
-		if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh)
-			tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack)
-			    + tp->t_segsz;
-		else
-			tp->snd_cwnd = tp->snd_ssthresh;
-		tp->t_partialacks = -1;
-		tp->t_dupacks = 0;
-	}
-}
-
 
 /*
  * TCP compressed state engine.  Currently used to hold compressed
Index: tcp_output.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_output.c,v
retrieving revision 1.143
diff -u -p -r1.143 tcp_output.c
--- tcp_output.c	5 Sep 2006 00:29:36 -0000	1.143
+++ tcp_output.c	23 Sep 2006 17:28:50 -0000
@@ -196,6 +196,7 @@ __KERNEL_RCSID(0, "$NetBSD: tcp_output.c
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
+#include <netinet/tcp_congctl.h>
 #include <netinet/tcpip.h>
 #include <netinet/tcp_debug.h>
 #include <netinet/in_offload.h>
@@ -1604,7 +1605,7 @@ out:
 	if (maxburst < 0)
 		printf("tcp_output: maxburst exceeded by %d\n", -maxburst);
 #endif
-	if (sendalot && (!tcp_do_newreno || --maxburst))
+	if (sendalot && (tp->t_congctl == &tcp_reno_ctl || --maxburst))
 		goto again;
 	return (0);
 }
Index: tcp_subr.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_subr.c,v
retrieving revision 1.199
diff -u -p -r1.199 tcp_subr.c
--- tcp_subr.c	5 Sep 2006 00:29:36 -0000	1.199
+++ tcp_subr.c	23 Sep 2006 17:29:05 -0000
@@ -151,6 +151,7 @@ __KERNEL_RCSID(0, "$NetBSD: tcp_subr.c,v
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
+#include <netinet/tcp_congctl.h>
 #include <netinet/tcpip.h>
 
 #ifdef IPSEC
@@ -182,7 +183,6 @@ int	tcp_do_rfc1948 = 0;	/* ISS by crypto
 int	tcp_do_sack = 1;	/* selective acknowledgement */
 int	tcp_do_win_scale = 1;	/* RFC1323 window scaling */
 int	tcp_do_timestamps = 1;	/* RFC1323 timestamps */
-int	tcp_do_newreno = 1;	/* Use the New Reno algorithms */
 int	tcp_ack_on_push = 0;	/* set to enable immediate ACK-on-PUSH */
 int	tcp_do_ecn = 0;		/* Explicit Congestion Notification */
 #ifndef TCP_INIT_WIN
@@ -207,7 +207,6 @@ int	tcp_sack_globalmaxholes = 1024;
 int	tcp_sack_globalholes = 0;
 int	tcp_ecn_maxretries = 1;
 
-
 /* tcb hash */
 #ifndef TCBHASHSIZE
 #define	TCBHASHSIZE	128
@@ -402,6 +401,9 @@ tcp_init(void)
 	/* Initialize the compressed state engine. */
 	syn_cache_init();
 
+	/* Initialize the congestion control algorithms. */
+	tcp_congctl_init();
+
 	MOWNER_ATTACH(&tcp_tx_mowner);
 	MOWNER_ATTACH(&tcp_rx_mowner);
 	MOWNER_ATTACH(&tcp_mowner);
@@ -1034,7 +1036,10 @@ tcp_newtcpcb(int family, void *aux)
 	 * and thus how many TCP sequence increments have occurred.
 	 */
 	tp->ts_timebase = tcp_now;
-
+	
+	tp->t_congctl = tcp_congctl_global;
+	tp->t_congctl->refcnt++;
+	
 	return (tp);
 }
 
@@ -1211,6 +1216,8 @@ tcp_close(struct tcpcb *tp)
 
 	/* free the SACK holes list. */
 	tcp_free_sackholes(tp);
+	
+	tp->t_congctl->refcnt--;
 
 	tcp_canceltimers(tp);
 	TCP_CLEAR_DELACK(tp);
Index: tcp_timer.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_timer.c,v
retrieving revision 1.75
diff -u -p -r1.75 tcp_timer.c
--- tcp_timer.c	14 May 2006 21:19:34 -0000	1.75
+++ tcp_timer.c	23 Sep 2006 17:29:05 -0000
@@ -138,6 +138,7 @@ __KERNEL_RCSID(0, "$NetBSD: tcp_timer.c,
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
+#include <netinet/tcp_congctl.h>
 #include <netinet/tcpip.h>
 #ifdef TCP_DEBUG
 #include <netinet/tcp_debug.h>
@@ -416,40 +417,12 @@ tcp_timer_rexmt(void *arg)
 	 */
 	if (tp->t_state == TCPS_SYN_SENT)
 		tp->t_flags |= TF_SYN_REXMT;
+
 	/*
-	 * Close the congestion window down to one segment
-	 * (we'll open it by one segment for each ack we get).
-	 * Since we probably have a window's worth of unacked
-	 * data accumulated, this "slow start" keeps us from
-	 * dumping all that data as back-to-back packets (which
-	 * might overwhelm an intermediate gateway).
-	 *
-	 * There are two phases to the opening: Initially we
-	 * open by one mss on each ack.  This makes the window
-	 * size increase exponentially with time.  If the
-	 * window is larger than the path can handle, this
-	 * exponential growth results in dropped packet(s)
-	 * almost immediately.  To get more time between
-	 * drops but still "push" the network to take advantage
-	 * of improving conditions, we switch from exponential
-	 * to linear window opening at some threshhold size.
-	 * For a threshhold, we use half the current window
-	 * size, truncated to a multiple of the mss.
-	 *
-	 * (the minimum cwnd that will give us exponential
-	 * growth is 2 mss.  We don't allow the threshhold
-	 * to go below this.)
-	 */
-	{
-	u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_segsz;
-	if (win < 2)
-		win = 2;
-	/* Loss Window MUST be one segment. */
-	tp->snd_cwnd = tp->t_segsz;
-	tp->snd_ssthresh = win * tp->t_segsz;
-	tp->t_partialacks = -1;
-	tp->t_dupacks = 0;
-	}
+	 * Adjust congestion control parameters.
+	 */
+	tp->t_congctl->slow_retransmit(tp);
+
 	(void) tcp_output(tp);
 
  out:
Index: tcp_usrreq.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_usrreq.c,v
retrieving revision 1.122
diff -u -p -r1.122 tcp_usrreq.c
--- tcp_usrreq.c	13 Sep 2006 10:07:42 -0000	1.122
+++ tcp_usrreq.c	23 Sep 2006 17:29:08 -0000
@@ -149,6 +149,7 @@ __KERNEL_RCSID(0, "$NetBSD: tcp_usrreq.c
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
+#include <netinet/tcp_congctl.h>
 #include <netinet/tcpip.h>
 #include <netinet/tcp_debug.h>
 
@@ -716,6 +717,13 @@ tcp_ctloutput(int op, struct socket *so,
 			else
 				error = EINVAL;
 			break;
+#if 0
+		case TCP_CONGCTL:
+			/* Do not dereference a missing option buffer. */
+			error = (m == NULL) ? EINVAL :
+			    tcp_congctl_select(tp, mtod(m, char *));
+			break;
+#endif
 
 		default:
 			error = ENOPROTOOPT;
@@ -742,6 +750,10 @@ tcp_ctloutput(int op, struct socket *so,
 		case TCP_MAXSEG:
 			*mtod(m, int *) = tp->t_peermss;
 			break;
+#if 0
+		case TCP_CONGCTL:
+			break;
+#endif
 		default:
 			error = ENOPROTOOPT;
 			break;
@@ -1380,6 +1392,32 @@ sysctl_inpcblist(SYSCTLFN_ARGS)
 	return (error);
 }
 
+/*
+ * sysctl handler for the congctl "selected" node: exports the current
+ * algorithm name and passes a changed name to tcp_congctl_select().
+ */
+static int
+sysctl_tcp_congctl(SYSCTLFN_ARGS)
+{
+	struct sysctlnode node;
+	int error;
+	char newname[TCPCC_MAXLEN];
+
+	/* strlcpy() takes the full buffer size and always NUL-terminates. */
+	strlcpy(newname, tcp_congctl_global_name, sizeof(newname));
+	node = *rnode;
+	node.sysctl_data = newname;
+	node.sysctl_size = sizeof(newname);
+
+	error = sysctl_lookup(SYSCTLFN_CALL(&node));
+	if (error || newp == NULL ||
+	    strncmp(newname, tcp_congctl_global_name, sizeof(newname)) == 0)
+		return error;
+
+	/* error is 0 here, so the selection result is the final status. */
+	return tcp_congctl_select(NULL, newname);
+}
+
 /*
  * this (second stage) setup routine is a replacement for tcp_sysctl()
  * (which is currently used for ipv4 and ipv6)
@@ -1388,7 +1426,7 @@ static void
 sysctl_net_inet_tcp_setup2(struct sysctllog **clog, int pf, const char *pfname,
 			   const char *tcpname)
 {
-	int ecn_node;
+	int ecn_node, congctl_node;
 	const struct sysctlnode *sack_node, *node;
 #ifdef TCP_DEBUG
 	extern struct tcp_debug tcp_debug[TCP_NDEBUG];
@@ -1487,6 +1525,28 @@ sysctl_net_inet_tcp_setup2(struct sysctl
 	    	       NULL, 0, NULL, 0,
 		       CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL);
 	ecn_node = node->sysctl_num;
+	sysctl_createv(clog, 0, NULL, &node,
+		       CTLFLAG_PERMANENT,
+		       CTLTYPE_NODE, "congctl",
+		       SYSCTL_DESCR("TCP Congestion Control"),
+		       NULL, 0, NULL, 0,
+		       CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL);
+	congctl_node = node->sysctl_num;
+	sysctl_createv(clog, 0, NULL, NULL,
+		       CTLFLAG_PERMANENT,
+		       CTLTYPE_STRING, "available",
+		       SYSCTL_DESCR("Available Congestion Control Mechanisms"),
+		       NULL, 0, &tcp_congctl_avail, 0,
+		       CTL_NET, pf, IPPROTO_TCP, congctl_node,
+		       CTL_CREATE, CTL_EOL);
+	sysctl_createv(clog, 0, NULL, NULL,
+		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+		       CTLTYPE_STRING, "selected",
+		       SYSCTL_DESCR("Selected Congestion Control Mechanism"),
+		       sysctl_tcp_congctl, 0, &tcp_congctl_global_name, 0,
+		       CTL_NET, pf, IPPROTO_TCP, congctl_node,
+		       CTL_CREATE, CTL_EOL);
+
 	sysctl_createv(clog, 0, NULL, NULL,
 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
 		       CTLTYPE_INT, "win_scale",
@@ -1554,12 +1614,6 @@ sysctl_net_inet_tcp_setup2(struct sysctl
 		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_SLOWHZ, CTL_EOL);
 	sysctl_createv(clog, 0, NULL, NULL,
 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
-		       CTLTYPE_INT, "newreno",
-		       SYSCTL_DESCR("NewReno congestion control algorithm"),
-		       NULL, 0, &tcp_do_newreno, 0,
-		       CTL_NET, pf, IPPROTO_TCP, TCPCTL_NEWRENO, CTL_EOL);
-	sysctl_createv(clog, 0, NULL, NULL,
-		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
 		       CTLTYPE_INT, "log_refused",
 		       SYSCTL_DESCR("Log refused TCP connections"),
 		       NULL, 0, &tcp_log_refused, 0,
Index: tcp_var.h
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_var.h,v
retrieving revision 1.137
diff -u -p -r1.137 tcp_var.h
--- tcp_var.h	5 Sep 2006 00:29:36 -0000	1.137
+++ tcp_var.h	23 Sep 2006 17:29:09 -0000
@@ -325,6 +325,8 @@ struct tcpcb {
 	u_short	t_pmtud_ip_hl;		/* IP header length from ICMP payload */
 
 	uint8_t t_ecn_retries;		/* # of ECN setup retries */
+	
+	struct tcp_congctl *t_congctl;	/* per TCB congctl algorithm */
 };
 
 /*
@@ -714,7 +716,7 @@ struct	tcpstat {
 	{ "keepintvl",	CTLTYPE_INT }, \
 	{ "keepcnt",	CTLTYPE_INT }, \
 	{ "slowhz",	CTLTYPE_INT }, \
-	{ "newreno",	CTLTYPE_INT }, \
+	{ 0, 0 }, \
 	{ "log_refused",CTLTYPE_INT }, \
 	{ 0, 0 }, \
 	{ "rstppslimit", CTLTYPE_INT }, \
@@ -736,7 +738,6 @@ extern	int tcp_do_rfc1323;	/* enabled/di
 extern	int tcp_do_sack;	/* SACK enabled/disabled? */
 extern	int tcp_do_win_scale;	/* RFC1323 window scaling enabled/disabled? */
 extern	int tcp_do_timestamps;	/* RFC1323 timestamps enabled/disabled? */
-extern	int tcp_do_newreno;	/* Use the New Reno algorithms */
 extern	int tcp_mssdflt;	/* default seg size */
 extern	int tcp_init_win;	/* initial window */
 extern	int tcp_init_win_local;	/* initial window for local nets */
@@ -789,7 +790,7 @@ extern	struct mowner tcp_mowner;
 	{ 1, 0, &tcp_keepintvl },		\
 	{ 1, 0, &tcp_keepcnt },			\
 	{ 1, 1, 0, PR_SLOWHZ },			\
-	{ 1, 0, &tcp_do_newreno },		\
+	{ 0 },					\
 	{ 1, 0, &tcp_log_refused },		\
 	{ 0 },					\
 	{ 1, 0, &tcp_rst_ppslim },		\
@@ -909,9 +910,6 @@ int	 syn_cache_respond(struct syn_cache 
 void	 syn_cache_timer(void *);
 void	 syn_cache_cleanup(struct tcpcb *);
 
-void	 tcp_reno_newack(struct tcpcb *, struct tcphdr *);
-void	 tcp_newreno_newack(struct tcpcb *, struct tcphdr *);
-
 int	 tcp_input_checksum(int, struct mbuf *, const struct tcphdr *, int, int,
     int);
 #endif

--Apple-Mail-1-336914240
Content-Transfer-Encoding: 7bit
Content-Type: text/plain;
	charset=US-ASCII;
	format=flowed


--
Rui Paulo



--Apple-Mail-1-336914240--