Subject: Re: Refactoring Congestion Control (take 2)
To: YAMAMOTO Takashi <yamt@mwd.biglobe.ne.jp>
From: Rui Paulo <rpaulo@fnop.net>
List: tech-net
Date: 09/23/2006 18:32:20
--Apple-Mail-1-336914240
Content-Transfer-Encoding: 7bit
Content-Type: text/plain;
charset=US-ASCII;
delsp=yes;
format=flowed
On Sep 21, 2006, at 4:18 AM, YAMAMOTO Takashi wrote:
>> Any other comments?
>
> i think it's better to copy tcp_congctl_global to a member in tcpcb
> so that it's somewhat static for a given connection.
> switching the sysctl knob correctly when it affects existing
> connections
> is a locking nightmare.
>
Maybe something like:
--Apple-Mail-1-336914240
Content-Transfer-Encoding: 7bit
Content-Type: application/octet-stream;
x-unix-mode=0644;
name=tcp_congctl.diff
Content-Disposition: attachment;
filename=tcp_congctl.diff
Index: files.netinet
===================================================================
RCS file: /cvsroot/src/sys/netinet/files.netinet,v
retrieving revision 1.11
diff -u -p -r1.11 files.netinet
--- files.netinet 11 Dec 2005 12:24:57 -0000 1.11
+++ files.netinet 23 Sep 2006 17:28:35 -0000
@@ -15,6 +15,8 @@ defparam opt_tcp_space.h TCP_RECVSPACE T
defflag opt_inet_csum.h INET_CSUM_COUNTERS TCP_CSUM_COUNTERS
UDP_CSUM_COUNTERS
+defparam opt_tcp_congctl.h TCP_CONGCTL_DEFAULT
+
file netinet/igmp.c inet
file netinet/in.c inet
file netinet/in_pcb.c inet
@@ -34,5 +36,6 @@ file netinet/tcp_sack.c inet | inet6
file netinet/tcp_subr.c inet | inet6
file netinet/tcp_timer.c inet | inet6
file netinet/tcp_usrreq.c inet | inet6
+file netinet/tcp_congctl.c inet | inet6
file netinet/udp_usrreq.c inet | inet6
Index: tcp.h
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp.h,v
retrieving revision 1.22
diff -u -p -r1.22 tcp.h
--- tcp.h 5 Sep 2006 00:29:36 -0000 1.22
+++ tcp.h 23 Sep 2006 17:28:35 -0000
@@ -118,5 +118,6 @@ struct tcphdr {
#define TCP_MAXSEG 0x02 /* set maximum segment size */
/* Bits 0x04, 0x08 reserved for FreeBSD compatibility: TCP_NOPUSH, TCP_NOOPT */
#define TCP_MD5SIG 0x10 /* use MD5 digests (RFC2385) */
+#define TCP_CONGCTL 0x20 /* selected congestion control */
#endif /* !_NETINET_TCP_H_ */
Index: tcp_congctl.c
===================================================================
RCS file: tcp_congctl.c
diff -N tcp_congctl.c
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ tcp_congctl.c 23 Sep 2006 17:28:36 -0000
@@ -0,0 +1,615 @@
+/* $NetBSD$ */
+
+/*
+ * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the project nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
+ *
+ * NRL grants permission for redistribution and use in source and binary
+ * forms, with or without modification, of the software and documentation
+ * created at NRL provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgements:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * This product includes software developed at the Information
+ * Technology Division, US Naval Research Laboratory.
+ * 4. Neither the name of the NRL nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
+ * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * The views and conclusions contained in the software and documentation
+ * are those of the authors and should not be interpreted as representing
+ * official policies, either expressed or implied, of the US Naval
+ * Research Laboratory (NRL).
+ */
+
+/*-
+ * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
+ * Facility, NASA Ames Research Center.
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Charles M. Hannum.
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Rui Paulo.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the NetBSD
+ * Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD$");
+
+#include "opt_inet.h"
+#include "opt_tcp_debug.h"
+#include "opt_tcp_congctl.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/errno.h>
+#include <sys/syslog.h>
+#include <sys/pool.h>
+#include <sys/domain.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+
+#include <net/if.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_var.h>
+#include <netinet/ip_var.h>
+
+#ifdef INET6
+#ifndef INET
+#include <netinet/in.h>
+#endif
+#include <netinet/ip6.h>
+#include <netinet6/ip6_var.h>
+#include <netinet6/in6_pcb.h>
+#include <netinet6/ip6_var.h>
+#include <netinet6/in6_var.h>
+#include <netinet/icmp6.h>
+#include <netinet6/nd6.h>
+#endif
+
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcpip.h>
+#include <netinet/tcp_congctl.h>
+#ifdef TCP_DEBUG
+#include <netinet/tcp_debug.h>
+#endif
+
+/*
+ * TODO:
+ * consider separating the actual implementations in another file.
+ */
+
+static int tcp_reno_fast_retransmit(struct tcpcb *, struct tcphdr *);
+static void tcp_reno_slow_retransmit(struct tcpcb *);
+static void tcp_reno_cwnd_inflation(struct tcpcb *, struct tcphdr *);
+static void tcp_reno_new_data_acked(struct tcpcb *, struct tcphdr *);
+
+static int tcp_newreno_fast_retransmit(struct tcpcb *, struct tcphdr *);
+static void tcp_newreno_cwnd_inflation(struct tcpcb *, struct tcphdr *);
+
+static void tcp_congctl_fillnames(void);
+
+extern int tcprexmtthresh;
+
+MALLOC_DEFINE(M_TCPCONGCTL, "tcpcongctl", "TCP congestion control structures");
+
+/*
+ * Used to list the available congestion control algorithms.
+ * Each entry binds a registered name to its function table; entries are
+ * allocated in tcp_congctl_register() and freed in tcp_congctl_unregister().
+ */
+struct tcp_congctlent {
+ /* linkage on tcp_congctlhd */
+ TAILQ_ENTRY(tcp_congctlent) congctl_ent;
+ /* name the algorithm was registered under */
+ char congctl_name[TCPCC_MAXLEN];
+ /* function table supplied by the caller of tcp_congctl_register() */
+ struct tcp_congctl *congctl_ctl;
+};
+/* head of the list of registered algorithms */
+TAILQ_HEAD(, tcp_congctlent) tcp_congctlhd;
+
+struct simplelock tcp_congctl_slock;
+
+/* NewReno is the default unless the kernel configuration overrides it. */
+#ifndef TCP_CONGCTL_DEFAULT
+#define TCP_CONGCTL_DEFAULT "newreno"
+#endif
+
+/*
+ * Set up the algorithm list and its lock, register the built-in
+ * algorithms and make TCP_CONGCTL_DEFAULT the global selection.
+ */
+void
+tcp_congctl_init(void)
+{
+	int error;
+
+	simple_lock_init(&tcp_congctl_slock);
+	TAILQ_INIT(&tcp_congctlhd);
+
+	/* Base algorithms. */
+	error = tcp_congctl_register("reno", &tcp_reno_ctl);
+	KASSERT(error == 0);
+	error = tcp_congctl_register("newreno", &tcp_newreno_ctl);
+	KASSERT(error == 0);
+
+	/* Switch the global selection to the configured default. */
+	error = tcp_congctl_select(NULL, TCP_CONGCTL_DEFAULT);
+	KASSERT(error == 0);
+}
+
+/*
+ * Register a congestion control algorithm under `name'.
+ *
+ * Returns EEXIST if the name is already registered and 0 on success.
+ * If the new entry ends up first on the list (i.e. nothing was registered
+ * before), it is also made the global selection.
+ */
+int
+tcp_congctl_register(const char *name, struct tcp_congctl *tcc)
+{
+	struct tcp_congctlent *ntcc, *tccp;
+
+	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
+		if (!strcmp(name, tccp->congctl_name)) {
+			/* name already registered */
+			return EEXIST;
+		}
+	}
+
+	ntcc = malloc(sizeof(*ntcc), M_TCPCONGCTL, M_WAITOK);
+
+	/*
+	 * strlcpy() takes the full size of the destination and always
+	 * NUL-terminates.  Passing size - 1 would drop the last character
+	 * of a maximum-length name, after which lookups by that name fail.
+	 */
+	strlcpy(ntcc->congctl_name, name, sizeof(ntcc->congctl_name));
+	ntcc->congctl_ctl = tcc;
+
+	TAILQ_INSERT_TAIL(&tcp_congctlhd, ntcc, congctl_ent);
+	tcp_congctl_fillnames();
+
+	if (TAILQ_FIRST(&tcp_congctlhd) == ntcc)
+		(void)tcp_congctl_select(NULL, name);
+
+	return 0;
+}
+
+/*
+ * Remove the algorithm registered as `name'.
+ *
+ * Returns ENOENT if no such name is registered, and EBUSY if it is the
+ * only registered algorithm, the current global selection, or still has
+ * a non-zero reference count.  Returns 0 once the entry has been freed.
+ */
+int
+tcp_congctl_unregister(const char *name)
+{
+	struct tcp_congctlent *ent, *victim = NULL;
+	unsigned int nents = 0;
+
+	/* Walk the whole list: the total count is needed as well as the match. */
+	TAILQ_FOREACH(ent, &tcp_congctlhd, congctl_ent) {
+		if (strcmp(name, ent->congctl_name) == 0)
+			victim = ent;
+		nents++;
+	}
+
+	if (victim == NULL)
+		return ENOENT;
+
+	if (nents <= 1)
+		return EBUSY;
+	if (tcp_congctl_global == victim->congctl_ctl)
+		return EBUSY;
+	if (victim->congctl_ctl->refcnt)
+		return EBUSY;
+
+	TAILQ_REMOVE(&tcp_congctlhd, victim, congctl_ent);
+	free(victim, M_TCPCONGCTL);
+	tcp_congctl_fillnames();
+
+	return 0;
+}
+
+/*
+ * Select a congestion control algorithm by name.
+ *
+ * With a non-NULL tp the choice applies to that connection only, and the
+ * reference counts of the old and new algorithm are adjusted under
+ * tcp_congctl_slock.  With tp == NULL the global selection and its name
+ * string are changed instead.
+ *
+ * Returns 0 on success, or EINVAL if no algorithm with that name is
+ * registered.
+ */
+int
+tcp_congctl_select(struct tcpcb *tp, const char *name)
+{
+	struct tcp_congctlent *tccp;
+
+	KASSERT(name);
+
+	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
+		if (strcmp(name, tccp->congctl_name) != 0)
+			continue;
+
+		if (tp) {
+			simple_lock(&tcp_congctl_slock);
+			/*
+			 * A tcpcb that has no algorithm attached yet holds
+			 * no reference to drop.
+			 */
+			if (tp->t_congctl != NULL)
+				tp->t_congctl->refcnt--;
+			tp->t_congctl = tccp->congctl_ctl;
+			tp->t_congctl->refcnt++;
+			simple_unlock(&tcp_congctl_slock);
+		} else {
+			tcp_congctl_global = tccp->congctl_ctl;
+			/* strlcpy() wants the full destination size. */
+			strlcpy(tcp_congctl_global_name,
+			    tccp->congctl_name,
+			    sizeof(tcp_congctl_global_name));
+		}
+		return 0;
+	}
+
+	return EINVAL;
+}
+
+/*
+ * Map a function table back to the name it was registered under.
+ * Returns NULL when the table is not on the registration list.
+ */
+const char *
+tcp_congctl_bystruct(const struct tcp_congctl *tcc)
+{
+	struct tcp_congctlent *ent;
+	const char *found = NULL;
+
+	KASSERT(tcc);
+
+	TAILQ_FOREACH(ent, &tcp_congctlhd, congctl_ent) {
+		if (ent->congctl_ctl == tcc) {
+			found = ent->congctl_name;
+			break;
+		}
+	}
+
+	return found;
+}
+
+/*
+ * Rebuild tcp_congctl_avail as a space-separated list of the names of
+ * all registered algorithms, in list order.
+ */
+static void
+tcp_congctl_fillnames(void)
+{
+	struct tcp_congctlent *tccp;
+	const char *delim = " ";
+
+	tcp_congctl_avail[0] = '\0';
+	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
+		/*
+		 * strlcat() takes the full size of the buffer and keeps it
+		 * NUL-terminated; size - 1 only wastes the last byte.
+		 */
+		strlcat(tcp_congctl_avail, tccp->congctl_name,
+		    sizeof(tcp_congctl_avail));
+		if (TAILQ_NEXT(tccp, congctl_ent))
+			strlcat(tcp_congctl_avail, delim,
+			    sizeof(tcp_congctl_avail));
+	}
+}
+
+/* ------------------------------------------------------------------------ */
+
+/*
+ * Congestion experienced: halve the congestion window and reduce the
+ * slow start threshold.
+ *
+ * snd_ssthresh becomes half of min(snd_wnd, snd_cwnd), rounded down to a
+ * whole number of segments but never less than two segments.  snd_max is
+ * recorded in snd_recover, snd_cwnd is set to the new threshold and, when
+ * ECN is allowed on the connection, TF_ECN_SND_CWR is set in t_flags.
+ *
+ * Deliberately not `inline': the function is also called from
+ * tcp_input.c, and under C99 inline semantics a plain `inline' definition
+ * does not provide the external definition other files link against.
+ */
+void
+tcp_reno_congestion_exp(struct tcpcb *tp)
+{
+	u_int win;
+
+	win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_segsz;
+	if (win < 2)
+		win = 2;
+
+	tp->snd_ssthresh = win * tp->t_segsz;
+	tp->snd_recover = tp->snd_max;
+	tp->snd_cwnd = tp->snd_ssthresh;
+
+	if (TCP_ECN_ALLOWED(tp))
+		tp->t_flags |= TF_ECN_SND_CWR;
+}
+
+
+/*
+ * TCP/Reno congestion control.
+ */
+/*
+ * Reno fast retransmit (the fast_retransmit entry of tcp_reno_ctl).
+ *
+ * Applies tcp_reno_congestion_exp(), sets t_partialacks to 0 (the
+ * cwnd_inflation handlers treat a non-negative value as "in fast
+ * recovery"), disarms the retransmit timer, clears t_rtttime and then
+ * calls tcp_output() with the congestion window reduced to one segment.
+ *
+ * tp: connection control block.
+ * th: header of the segment being processed; th_ack is where the
+ *     retransmission starts on the non-SACK path.
+ *
+ * Always returns 0.
+ */
+static int
+tcp_reno_fast_retransmit(struct tcpcb *tp, struct tcphdr *th)
+{
+ tcp_seq onxt;
+
+ /* Remember snd_nxt so it can be put back after the retransmission. */
+ onxt = tp->snd_nxt;
+ tcp_reno_congestion_exp(tp);
+ tp->t_partialacks = 0;
+ TCP_TIMER_DISARM(tp, TCPT_REXMT);
+ tp->t_rtttime = 0;
+ if (TCP_SACK_ENABLED(tp)) {
+ /*
+ * SACK path: snd_nxt is left alone, cwnd stays at one segment
+ * and we return right after tcp_output().
+ */
+ tp->t_dupacks = tcprexmtthresh;
+ tp->sack_newdata = tp->snd_nxt;
+ tp->snd_cwnd = tp->t_segsz;
+ (void) tcp_output(tp);
+ return 0;
+ }
+ /* Send from the acked point with a one-segment window. */
+ tp->snd_nxt = th->th_ack;
+ tp->snd_cwnd = tp->t_segsz;
+ (void) tcp_output(tp);
+ /* Window becomes ssthresh plus one segment per duplicate ack counted. */
+ tp->snd_cwnd = tp->snd_ssthresh + tp->t_segsz * tp->t_dupacks;
+ /* Restore the old snd_nxt if it is ahead of the current one. */
+ if (SEQ_GT(onxt, tp->snd_nxt))
+ tp->snd_nxt = onxt;
+
+ return 0;
+}
+
+/*
+ * Reno slow_retransmit handler.
+ *
+ * Closes the congestion window down to a single segment so that the
+ * window's worth of unacknowledged data we probably hold is not dumped
+ * on the network as back-to-back packets (which might overwhelm an
+ * intermediate gateway).  The window is then reopened in two phases:
+ * by one mss per ack (exponential in time) up to a threshold, and
+ * linearly above it.  The threshold is half the current window,
+ * truncated to a multiple of the mss, and never below 2 mss, the
+ * minimum cwnd that still gives exponential growth.
+ */
+static void
+tcp_reno_slow_retransmit(struct tcpcb *tp)
+{
+	u_int halfwin;
+
+	/* Half the current window, counted in whole segments, minimum two. */
+	halfwin = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_segsz;
+	if (halfwin < 2)
+		halfwin = 2;
+	tp->snd_ssthresh = halfwin * tp->t_segsz;
+
+	/* Loss Window MUST be one segment. */
+	tp->snd_cwnd = tp->t_segsz;
+
+	/* Reset duplicate ack counting and leave fast recovery. */
+	tp->t_dupacks = 0;
+	tp->t_partialacks = -1;
+}
+
+/*
+ * Reno cwnd_inflation handler.
+ *
+ * If we were in fast recovery (t_partialacks >= 0), clamp the congestion
+ * window to the crossover point and exit fast recovery.  In every case
+ * the duplicate ack counter is reset.  `th' is not used by Reno.
+ */
+static void
+tcp_reno_cwnd_inflation(struct tcpcb *tp, struct tcphdr *th)
+{
+	if (tp->t_partialacks >= 0) {
+		if (tp->snd_cwnd > tp->snd_ssthresh)
+			tp->snd_cwnd = tp->snd_ssthresh;
+		tp->t_partialacks = -1;
+	}
+
+	tp->t_dupacks = 0;
+}
+
+/*
+ * Open the congestion window when new data is acked.
+ *
+ * While cwnd is at or below ssthresh the step is one segment per ack
+ * (exponential opening); above it the step is segsz^2 / cwnd per ack,
+ * i.e. roughly one segment per window (linear opening).  The result is
+ * capped at TCP_MAXWIN << snd_scale.
+ *
+ * The window is only touched when the connection uses tcp_reno_ctl or
+ * the ack is at or beyond snd_recover.
+ */
+static void
+tcp_reno_new_data_acked(struct tcpcb *tp, struct tcphdr *th)
+{
+	const u_int cwnd = tp->snd_cwnd;
+	u_int step = tp->t_segsz;
+
+	if (cwnd > tp->snd_ssthresh)
+		step = step * step / cwnd;
+
+	if (tp->t_congctl != &tcp_reno_ctl &&
+	    !SEQ_GEQ(th->th_ack, tp->snd_recover))
+		return;
+
+	tp->snd_cwnd = min(cwnd + step, TCP_MAXWIN << tp->snd_scale);
+}
+
+/*
+ * Function table for Reno; registered under the name "reno" by
+ * tcp_congctl_init().  refcnt is left to its zero initial value.
+ */
+struct tcp_congctl tcp_reno_ctl = {
+ .fast_retransmit = tcp_reno_fast_retransmit,
+ .slow_retransmit = tcp_reno_slow_retransmit,
+ .cwnd_inflation = tcp_reno_cwnd_inflation,
+ .new_data_acked = tcp_reno_new_data_acked,
+};
+
+/*
+ * TCP/NewReno Congestion control.
+ */
+/*
+ * NewReno fast retransmit.
+ *
+ * Returns 1, after clearing the duplicate ack counter, when the ack is
+ * below snd_high; the caller treats that as a false fast retransmit.
+ * Otherwise behaves exactly like Reno and returns its result.
+ */
+static int
+tcp_newreno_fast_retransmit(struct tcpcb *tp, struct tcphdr *th)
+{
+	if (SEQ_LT(th->th_ack, tp->snd_high)) {
+		/*
+		 * False fast retransmit after timeout.
+		 * Do not enter fast recovery
+		 */
+		tp->t_dupacks = 0;
+		return 1;
+	}
+
+	/* Fast retransmit is same as reno. */
+	return tcp_reno_fast_retransmit(tp, th);
+}
+
+/*
+ * Implement the NewReno response to a new ack, checking for partial acks in
+ * fast recovery.
+ *
+ * Three cases, keyed on t_partialacks and on th_ack relative to
+ * snd_recover:
+ *  - not in fast recovery (t_partialacks < 0): reset t_dupacks only;
+ *  - partial ack (th_ack below snd_recover): retransmit via tcp_output()
+ *    and deflate the window, staying in fast recovery;
+ *  - complete ack: set the final window and leave fast recovery.
+ *
+ * tp: connection control block; snd_una must not have been advanced for
+ *     this ack yet (the partial-ack arithmetic depends on it).
+ * th: header of the segment carrying the ack.
+ */
+static void
+tcp_newreno_cwnd_inflation(struct tcpcb *tp, struct tcphdr *th)
+{
+ if (tp->t_partialacks < 0) {
+ /*
+ * We were not in fast recovery. Reset the duplicate ack
+ * counter.
+ */
+ tp->t_dupacks = 0;
+ } else if (SEQ_LT(th->th_ack, tp->snd_recover)) {
+ /*
+ * This is a partial ack. Retransmit the first unacknowledged
+ * segment and deflate the congestion window by the amount of
+ * acknowledged data. Do not exit fast recovery.
+ */
+ tcp_seq onxt = tp->snd_nxt;
+ u_long ocwnd = tp->snd_cwnd;
+
+ /*
+ * snd_una has not yet been updated and the socket's send
+ * buffer has not yet drained off the ACK'd data, so we
+ * have to leave snd_una as it was to get the correct data
+ * offset in tcp_output().
+ */
+ /* The retransmit timer is disarmed on the first partial ack only. */
+ if (++tp->t_partialacks == 1)
+ TCP_TIMER_DISARM(tp, TCPT_REXMT);
+ tp->t_rtttime = 0;
+ tp->snd_nxt = th->th_ack;
+ /*
+ * Set snd_cwnd to one segment beyond ACK'd offset. snd_una
+ * is not yet updated when we're called.
+ */
+ tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una);
+ (void) tcp_output(tp);
+ /* Undo the temporary window and, if needed, the snd_nxt rewind. */
+ tp->snd_cwnd = ocwnd;
+ if (SEQ_GT(onxt, tp->snd_nxt))
+ tp->snd_nxt = onxt;
+ /*
+ * Partial window deflation. Relies on fact that tp->snd_una
+ * not updated yet.
+ */
+ tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_segsz);
+ } else {
+ /*
+ * Complete ack. Inflate the congestion window to ssthresh
+ * and exit fast recovery.
+ *
+ * Window inflation should have left us with approx.
+ * snd_ssthresh outstanding data. But in case we
+ * would be inclined to send a burst, better to do
+ * it via the slow start mechanism.
+ */
+ if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh)
+ tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack)
+ + tp->t_segsz;
+ else
+ tp->snd_cwnd = tp->snd_ssthresh;
+ tp->t_partialacks = -1;
+ tp->t_dupacks = 0;
+ }
+}
+
+/*
+ * Function table for NewReno; registered under the name "newreno" by
+ * tcp_congctl_init().  It shares the Reno slow_retransmit and
+ * new_data_acked handlers and overrides the other two.
+ */
+struct tcp_congctl tcp_newreno_ctl = {
+ .fast_retransmit = tcp_newreno_fast_retransmit,
+ .slow_retransmit = tcp_reno_slow_retransmit,
+ .cwnd_inflation = tcp_newreno_cwnd_inflation,
+ .new_data_acked = tcp_reno_new_data_acked,
+};
+
+
Index: tcp_congctl.h
===================================================================
RCS file: tcp_congctl.h
diff -N tcp_congctl.h
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ tcp_congctl.h 23 Sep 2006 17:28:36 -0000
@@ -0,0 +1,77 @@
+/* $NetBSD$ */
+
+/*
+ * Copyright (c) 2006 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Rui Paulo.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the NetBSD
+ * Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _NETINET_TCP_CONGCTL_H
+#define _NETINET_TCP_CONGCTL_H
+
+/*
+ * Congestion control function table.
+ * One instance exists per algorithm; a tcpcb points at the table it uses
+ * through t_congctl.
+ */
+struct tcp_congctl {
+ /*
+ * Called from tcp_input() when the duplicate ack threshold is hit.
+ * A non-zero return means "false fast retransmit"; on zero the
+ * caller drops the segment.
+ */
+ int (*fast_retransmit)(struct tcpcb *, struct tcphdr *);
+ /*
+ * Collapse the window after a loss.
+ * NOTE(review): the call site is not in this part of the patch;
+ * presumably the retransmit timer path -- confirm.
+ */
+ void (*slow_retransmit)(struct tcpcb *);
+ /* Called from tcp_input() on a new ack when SACK is not enabled. */
+ void (*cwnd_inflation)(struct tcpcb *, struct tcphdr *);
+ /* Called from tcp_input() once new data has been acked. */
+ void (*new_data_acked)(struct tcpcb *, struct tcphdr *);
+
+ /*
+ * Adjusted by tcp_congctl_select() when a tcpcb switches algorithm;
+ * tcp_congctl_unregister() returns EBUSY while it is non-zero.
+ */
+ int32_t refcnt;
+};
+
+extern struct tcp_congctl tcp_reno_ctl;
+extern struct tcp_congctl tcp_newreno_ctl;
+
+extern struct simplelock tcp_congctl_slock;
+
+#define TCPCC_MAXLEN 12
+
+/*
+ * NOTE(review): the three objects below are defined, not merely declared,
+ * in this header, so every file that includes it carries its own
+ * tentative definition and linking relies on common-symbol merging.
+ * Consider `extern' declarations here plus a single definition in
+ * tcp_congctl.c.
+ */
+/* currently selected global congestion control */
+struct tcp_congctl *tcp_congctl_global;
+char tcp_congctl_global_name[TCPCC_MAXLEN];
+
+/* available global congestion control algorithms */
+char tcp_congctl_avail[10 * TCPCC_MAXLEN];
+
+void tcp_congctl_init(void);
+int tcp_congctl_register(const char *, struct tcp_congctl *);
+int tcp_congctl_unregister(const char *);
+int tcp_congctl_select(struct tcpcb *, const char *);
+const char *
+ tcp_congctl_bystruct(const struct tcp_congctl *);
+
+/*
+ * Defined in tcp_congctl.c and also called from tcp_input.c.  Declared
+ * without `inline': an inline declaration with no definition in the
+ * including file is not valid C99, and files that only include this
+ * header have no body to inline anyway.
+ */
+void tcp_reno_congestion_exp(struct tcpcb *);
+
+#endif
Index: tcp_input.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_input.c,v
retrieving revision 1.244
diff -u -p -r1.244 tcp_input.c
--- tcp_input.c 5 Sep 2006 00:29:36 -0000 1.244
+++ tcp_input.c 23 Sep 2006 17:28:41 -0000
@@ -214,6 +214,7 @@ __KERNEL_RCSID(0, "$NetBSD: tcp_input.c,
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
+#include <netinet/tcp_congctl.h>
#include <netinet/tcp_debug.h>
#include <machine/stdarg.h>
@@ -238,8 +239,6 @@ __KERNEL_RCSID(0, "$NetBSD: tcp_input.c,
#endif
#endif /* FAST_IPSEC*/
-static inline void tcp_congestion_exp(struct tcpcb *);
-
int tcprexmtthresh = 3;
int tcp_log_refused;
@@ -409,28 +408,6 @@ tcpipqent_free(struct ipqent *ipqe)
splx(s);
}
-/*
- * Halve the congestion window and reduce the
- * slow start threshold.
- *
- * Optionally, mark the packet.
- */
-static inline void
-tcp_congestion_exp(struct tcpcb *tp)
-{
- u_int win;
-
- win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_segsz;
- if (win < 2)
- win = 2;
-
- tp->snd_ssthresh = win * tp->t_segsz;
- tp->snd_recover = tp->snd_max;
- tp->snd_cwnd = tp->snd_ssthresh;
- if (TCP_ECN_ALLOWED(tp))
- tp->t_flags |= TF_ECN_SND_CWR;
-}
-
int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen)
{
@@ -1638,7 +1615,7 @@ after_listen:
* Ignore if we are already trying to recover.
*/
if ((tiflags & TH_ECE) && SEQ_GEQ(tp->snd_una, tp->snd_recover))
- tcp_congestion_exp(tp);
+ tcp_reno_congestion_exp(tp);
}
if (opti.ts_present && opti.ts_ecr) {
@@ -2278,39 +2255,15 @@ after_listen:
else if (tp->t_partialacks < 0 &&
(++tp->t_dupacks == tcprexmtthresh ||
TCP_FACK_FASTRECOV(tp))) {
- tcp_seq onxt;
-
- if ((tcp_do_newreno || tcp_do_ecn) &&
- SEQ_LT(th->th_ack, tp->snd_high)) {
- /*
- * False fast retransmit after
- * timeout. Do not enter fast
- * recovery.
- */
- tp->t_dupacks = 0;
+ /*
+ * Do the fast retransmit, and adjust
+ * congestion control parameters.
+ */
+ if (tp->t_congctl->fast_retransmit(tp, th)) {
+ /* False fast retransmit */
break;
- }
-
- onxt = tp->snd_nxt;
- tcp_congestion_exp(tp);
- tp->t_partialacks = 0;
- TCP_TIMER_DISARM(tp, TCPT_REXMT);
- tp->t_rtttime = 0;
- if (TCP_SACK_ENABLED(tp)) {
- tp->t_dupacks = tcprexmtthresh;
- tp->sack_newdata = tp->snd_nxt;
- tp->snd_cwnd = tp->t_segsz;
- (void) tcp_output(tp);
+ } else
goto drop;
- }
- tp->snd_nxt = th->th_ack;
- tp->snd_cwnd = tp->t_segsz;
- (void) tcp_output(tp);
- tp->snd_cwnd = tp->snd_ssthresh +
- tp->t_segsz * tp->t_dupacks;
- if (SEQ_GT(onxt, tp->snd_nxt))
- tp->snd_nxt = onxt;
- goto drop;
} else if (tp->t_dupacks > tcprexmtthresh) {
tp->snd_cwnd += tp->t_segsz;
(void) tcp_output(tp);
@@ -2336,12 +2289,12 @@ after_listen:
* If the congestion window was inflated to account
* for the other side's cached packets, retract it.
*/
+ /* XXX: make SACK have its own congestion control
+ * struct -- rpaulo */
if (TCP_SACK_ENABLED(tp))
tcp_sack_newack(tp, th);
- else if (tcp_do_newreno)
- tcp_newreno_newack(tp, th);
else
- tcp_reno_newack(tp, th);
+ tp->t_congctl->cwnd_inflation(tp, th);
if (SEQ_GT(th->th_ack, tp->snd_max)) {
tcpstat.tcps_rcvacktoomuch++;
goto dropafterack;
@@ -2375,26 +2328,12 @@ after_listen:
needoutput = 1;
} else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
+
/*
- * When new data is acked, open the congestion window.
- * If the window gives us less than ssthresh packets
- * in flight, open exponentially (segsz per packet).
- * Otherwise open linearly: segsz per window
- * (segsz^2 / cwnd per packet).
- *
- * If we are still in fast recovery (meaning we are using
- * NewReno and we have only received partial acks), do not
- * inflate the window yet.
- */
- if (tp->t_partialacks < 0) {
- u_int cw = tp->snd_cwnd;
- u_int incr = tp->t_segsz;
-
- if (cw >= tp->snd_ssthresh)
- incr = incr * incr / cw;
- tp->snd_cwnd = min(cw + incr,
- TCP_MAXWIN << tp->snd_scale);
- }
+ * New data has been acked, adjust the congestion window.
+ */
+ tp->t_congctl->new_data_acked(tp, th);
+
ND6_HINT(tp);
if (acked > so->so_snd.sb_cc) {
tp->snd_wnd -= so->so_snd.sb_cc;
@@ -3220,93 +3159,6 @@ tcp_xmit_timer(struct tcpcb *tp, uint32_
tp->t_softerror = 0;
}
-void
-tcp_reno_newack(struct tcpcb *tp, struct tcphdr *th)
-{
- if (tp->t_partialacks < 0) {
- /*
- * We were not in fast recovery. Reset the duplicate ack
- * counter.
- */
- tp->t_dupacks = 0;
- } else {
- /*
- * Clamp the congestion window to the crossover point and
- * exit fast recovery.
- */
- if (tp->snd_cwnd > tp->snd_ssthresh)
- tp->snd_cwnd = tp->snd_ssthresh;
- tp->t_partialacks = -1;
- tp->t_dupacks = 0;
- }
-}
-
-/*
- * Implement the NewReno response to a new ack, checking for partial acks in
- * fast recovery.
- */
-void
-tcp_newreno_newack(struct tcpcb *tp, struct tcphdr *th)
-{
- if (tp->t_partialacks < 0) {
- /*
- * We were not in fast recovery. Reset the duplicate ack
- * counter.
- */
- tp->t_dupacks = 0;
- } else if (SEQ_LT(th->th_ack, tp->snd_recover)) {
- /*
- * This is a partial ack. Retransmit the first unacknowledged
- * segment and deflate the congestion window by the amount of
- * acknowledged data. Do not exit fast recovery.
- */
- tcp_seq onxt = tp->snd_nxt;
- u_long ocwnd = tp->snd_cwnd;
-
- /*
- * snd_una has not yet been updated and the socket's send
- * buffer has not yet drained off the ACK'd data, so we
- * have to leave snd_una as it was to get the correct data
- * offset in tcp_output().
- */
- if (++tp->t_partialacks == 1)
- TCP_TIMER_DISARM(tp, TCPT_REXMT);
- tp->t_rtttime = 0;
- tp->snd_nxt = th->th_ack;
- /*
- * Set snd_cwnd to one segment beyond ACK'd offset. snd_una
- * is not yet updated when we're called.
- */
- tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una);
- (void) tcp_output(tp);
- tp->snd_cwnd = ocwnd;
- if (SEQ_GT(onxt, tp->snd_nxt))
- tp->snd_nxt = onxt;
- /*
- * Partial window deflation. Relies on fact that tp->snd_una
- * not updated yet.
- */
- tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_segsz);
- } else {
- /*
- * Complete ack. Inflate the congestion window to ssthresh
- * and exit fast recovery.
- *
- * Window inflation should have left us with approx.
- * snd_ssthresh outstanding data. But in case we
- * would be inclined to send a burst, better to do
- * it via the slow start mechanism.
- */
- if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh)
- tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack)
- + tp->t_segsz;
- else
- tp->snd_cwnd = tp->snd_ssthresh;
- tp->t_partialacks = -1;
- tp->t_dupacks = 0;
- }
-}
-
/*
* TCP compressed state engine. Currently used to hold compressed
Index: tcp_output.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_output.c,v
retrieving revision 1.143
diff -u -p -r1.143 tcp_output.c
--- tcp_output.c 5 Sep 2006 00:29:36 -0000 1.143
+++ tcp_output.c 23 Sep 2006 17:28:50 -0000
@@ -196,6 +196,7 @@ __KERNEL_RCSID(0, "$NetBSD: tcp_output.c
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
+#include <netinet/tcp_congctl.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_debug.h>
#include <netinet/in_offload.h>
@@ -1604,7 +1605,7 @@ out:
if (maxburst < 0)
printf("tcp_output: maxburst exceeded by %d\n", -maxburst);
#endif
- if (sendalot && (!tcp_do_newreno || --maxburst))
+ if (sendalot && (tp->t_congctl == &tcp_reno_ctl || --maxburst))
goto again;
return (0);
}
Index: tcp_subr.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_subr.c,v
retrieving revision 1.199
diff -u -p -r1.199 tcp_subr.c
--- tcp_subr.c 5 Sep 2006 00:29:36 -0000 1.199
+++ tcp_subr.c 23 Sep 2006 17:29:05 -0000
@@ -151,6 +151,7 @@ __KERNEL_RCSID(0, "$NetBSD: tcp_subr.c,v
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
+#include <netinet/tcp_congctl.h>
#include <netinet/tcpip.h>
#ifdef IPSEC
@@ -182,7 +183,6 @@ int tcp_do_rfc1948 = 0; /* ISS by crypto
int tcp_do_sack = 1; /* selective acknowledgement */
int tcp_do_win_scale = 1; /* RFC1323 window scaling */
int tcp_do_timestamps = 1; /* RFC1323 timestamps */
-int tcp_do_newreno = 1; /* Use the New Reno algorithms */
int tcp_ack_on_push = 0; /* set to enable immediate ACK-on-PUSH */
int tcp_do_ecn = 0; /* Explicit Congestion Notification */
#ifndef TCP_INIT_WIN
@@ -207,7 +207,6 @@ int tcp_sack_globalmaxholes = 1024;
int tcp_sack_globalholes = 0;
int tcp_ecn_maxretries = 1;
-
/* tcb hash */
#ifndef TCBHASHSIZE
#define TCBHASHSIZE 128
@@ -402,6 +401,9 @@ tcp_init(void)
/* Initialize the compressed state engine. */
syn_cache_init();
+ /* Initialize the congestion control algorithms. */
+ tcp_congctl_init();
+
MOWNER_ATTACH(&tcp_tx_mowner);
MOWNER_ATTACH(&tcp_rx_mowner);
MOWNER_ATTACH(&tcp_mowner);
@@ -1034,7 +1036,10 @@ tcp_newtcpcb(int family, void *aux)
* and thus how many TCP sequence increments have occurred.
*/
tp->ts_timebase = tcp_now;
-
+
+ tp->t_congctl = tcp_congctl_global;
+ tp->t_congctl->refcnt++;
+
return (tp);
}
@@ -1211,6 +1216,8 @@ tcp_close(struct tcpcb *tp)
/* free the SACK holes list. */
tcp_free_sackholes(tp);
+
+ tp->t_congctl->refcnt--;
tcp_canceltimers(tp);
TCP_CLEAR_DELACK(tp);
Index: tcp_timer.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_timer.c,v
retrieving revision 1.75
diff -u -p -r1.75 tcp_timer.c
--- tcp_timer.c 14 May 2006 21:19:34 -0000 1.75
+++ tcp_timer.c 23 Sep 2006 17:29:05 -0000
@@ -138,6 +138,7 @@ __KERNEL_RCSID(0, "$NetBSD: tcp_timer.c,
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
+#include <netinet/tcp_congctl.h>
#include <netinet/tcpip.h>
#ifdef TCP_DEBUG
#include <netinet/tcp_debug.h>
@@ -416,40 +417,12 @@ tcp_timer_rexmt(void *arg)
*/
if (tp->t_state == TCPS_SYN_SENT)
tp->t_flags |= TF_SYN_REXMT;
+
/*
- * Close the congestion window down to one segment
- * (we'll open it by one segment for each ack we get).
- * Since we probably have a window's worth of unacked
- * data accumulated, this "slow start" keeps us from
- * dumping all that data as back-to-back packets (which
- * might overwhelm an intermediate gateway).
- *
- * There are two phases to the opening: Initially we
- * open by one mss on each ack. This makes the window
- * size increase exponentially with time. If the
- * window is larger than the path can handle, this
- * exponential growth results in dropped packet(s)
- * almost immediately. To get more time between
- * drops but still "push" the network to take advantage
- * of improving conditions, we switch from exponential
- * to linear window opening at some threshhold size.
- * For a threshhold, we use half the current window
- * size, truncated to a multiple of the mss.
- *
- * (the minimum cwnd that will give us exponential
- * growth is 2 mss. We don't allow the threshhold
- * to go below this.)
- */
- {
- u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_segsz;
- if (win < 2)
- win = 2;
- /* Loss Window MUST be one segment. */
- tp->snd_cwnd = tp->t_segsz;
- tp->snd_ssthresh = win * tp->t_segsz;
- tp->t_partialacks = -1;
- tp->t_dupacks = 0;
- }
+ * Adjust congestion control parameters.
+ */
+ tp->t_congctl->slow_retransmit(tp);
+
(void) tcp_output(tp);
out:
Index: tcp_usrreq.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_usrreq.c,v
retrieving revision 1.122
diff -u -p -r1.122 tcp_usrreq.c
--- tcp_usrreq.c 13 Sep 2006 10:07:42 -0000 1.122
+++ tcp_usrreq.c 23 Sep 2006 17:29:08 -0000
@@ -149,6 +149,7 @@ __KERNEL_RCSID(0, "$NetBSD: tcp_usrreq.c
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
+#include <netinet/tcp_congctl.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_debug.h>
@@ -716,6 +717,13 @@ tcp_ctloutput(int op, struct socket *so,
else
error = EINVAL;
break;
+#if 0
+ case TCP_CONGCTL:
+ if (m == NULL)
+ error = EINVAL;
+ error = tcp_congctl_select(tp, mtod(m, char *));
+#endif
+ break;
default:
error = ENOPROTOOPT;
@@ -742,6 +750,10 @@ tcp_ctloutput(int op, struct socket *so,
case TCP_MAXSEG:
*mtod(m, int *) = tp->t_peermss;
break;
+#if 0
+ case TCP_CONGCTL:
+ break;
+#endif
default:
error = ENOPROTOOPT;
break;
@@ -1380,6 +1392,32 @@ sysctl_inpcblist(SYSCTLFN_ARGS)
return (error);
}
+/*
+ * sysctl handler for the "selected" congestion control node.
+ *
+ * The current value of tcp_congctl_global_name is copied into a local
+ * buffer and a copy of the sysctl node is pointed at that buffer, so
+ * sysctl_lookup() operates on the copy rather than on the global.
+ *
+ * If the lookup fails, no new value was supplied (newp == NULL), or the
+ * supplied name equals the current one, the lookup result is returned
+ * unchanged.  Otherwise the new name is handed to tcp_congctl_select()
+ * with a NULL tcpcb (presumably "change the global default" -- confirm
+ * against tcp_congctl.c) and its result is returned.
+ */
+static int
+sysctl_tcp_congctl(SYSCTLFN_ARGS)
+{
+	struct sysctlnode node;
+	int error;
+	char newname[TCPCC_MAXLEN];
+
+	/*
+	 * strlcpy() takes the full destination size and always
+	 * NUL-terminates; subtracting one here would needlessly chop the
+	 * last character off a maximum-length name.
+	 */
+	strlcpy(newname, tcp_congctl_global_name, sizeof(newname));
+
+	node = *rnode;
+	node.sysctl_data = newname;
+	node.sysctl_size = sizeof(newname);
+
+	error = sysctl_lookup(SYSCTLFN_CALL(&node));
+
+	if (error ||
+	    newp == NULL ||
+	    strncmp(newname, tcp_congctl_global_name, sizeof(newname)) == 0)
+		return error;
+
+	/* error is 0 here, so the selection result is the final status. */
+	return tcp_congctl_select(NULL, newname);
+}
+
+
/*
* this (second stage) setup routine is a replacement for tcp_sysctl()
* (which is currently used for ipv4 and ipv6)
@@ -1388,7 +1426,7 @@ static void
sysctl_net_inet_tcp_setup2(struct sysctllog **clog, int pf, const char *pfname,
const char *tcpname)
{
- int ecn_node;
+ int ecn_node, congctl_node;
const struct sysctlnode *sack_node, *node;
#ifdef TCP_DEBUG
extern struct tcp_debug tcp_debug[TCP_NDEBUG];
@@ -1487,6 +1525,28 @@ sysctl_net_inet_tcp_setup2(struct sysctl
NULL, 0, NULL, 0,
CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL);
ecn_node = node->sysctl_num;
+ sysctl_createv(clog, 0, NULL, &node,
+ CTLFLAG_PERMANENT,
+ CTLTYPE_NODE, "congctl",
+ SYSCTL_DESCR("TCP Congestion Control"),
+ NULL, 0, NULL, 0,
+ CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL);
+ congctl_node = node->sysctl_num;
+ sysctl_createv(clog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT,
+ CTLTYPE_STRING, "available",
+ SYSCTL_DESCR("Available Congestion Control Mechanisms"),
+ NULL, 0, &tcp_congctl_avail, 0,
+ CTL_NET, pf, IPPROTO_TCP, congctl_node,
+ CTL_CREATE, CTL_EOL);
+ sysctl_createv(clog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+ CTLTYPE_STRING, "selected",
+ SYSCTL_DESCR("Selected Congestion Control Mechanism"),
+ sysctl_tcp_congctl, 0, &tcp_congctl_global_name, 0,
+ CTL_NET, pf, IPPROTO_TCP, congctl_node,
+ CTL_CREATE, CTL_EOL);
+
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "win_scale",
@@ -1554,12 +1614,6 @@ sysctl_net_inet_tcp_setup2(struct sysctl
CTL_NET, pf, IPPROTO_TCP, TCPCTL_SLOWHZ, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
- CTLTYPE_INT, "newreno",
- SYSCTL_DESCR("NewReno congestion control algorithm"),
- NULL, 0, &tcp_do_newreno, 0,
- CTL_NET, pf, IPPROTO_TCP, TCPCTL_NEWRENO, CTL_EOL);
- sysctl_createv(clog, 0, NULL, NULL,
- CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "log_refused",
SYSCTL_DESCR("Log refused TCP connections"),
NULL, 0, &tcp_log_refused, 0,
Index: tcp_var.h
===================================================================
RCS file: /cvsroot/src/sys/netinet/tcp_var.h,v
retrieving revision 1.137
diff -u -p -r1.137 tcp_var.h
--- tcp_var.h 5 Sep 2006 00:29:36 -0000 1.137
+++ tcp_var.h 23 Sep 2006 17:29:09 -0000
@@ -325,6 +325,8 @@ struct tcpcb {
u_short t_pmtud_ip_hl; /* IP header length from ICMP payload */
uint8_t t_ecn_retries; /* # of ECN setup retries */
+
+ struct tcp_congctl *t_congctl; /* per TCB congctl algorithm */
};
/*
@@ -714,7 +716,7 @@ struct tcpstat {
{ "keepintvl", CTLTYPE_INT }, \
{ "keepcnt", CTLTYPE_INT }, \
{ "slowhz", CTLTYPE_INT }, \
- { "newreno", CTLTYPE_INT }, \
+ { 0, 0 }, \
{ "log_refused",CTLTYPE_INT }, \
{ 0, 0 }, \
{ "rstppslimit", CTLTYPE_INT }, \
@@ -736,7 +738,6 @@ extern int tcp_do_rfc1323; /* enabled/di
extern int tcp_do_sack; /* SACK enabled/disabled? */
extern int tcp_do_win_scale; /* RFC1323 window scaling enabled/disabled? */
extern int tcp_do_timestamps; /* RFC1323 timestamps enabled/disabled? */
-extern int tcp_do_newreno; /* Use the New Reno algorithms */
extern int tcp_mssdflt; /* default seg size */
extern int tcp_init_win; /* initial window */
extern int tcp_init_win_local; /* initial window for local nets */
@@ -789,7 +790,7 @@ extern struct mowner tcp_mowner;
{ 1, 0, &tcp_keepintvl }, \
{ 1, 0, &tcp_keepcnt }, \
{ 1, 1, 0, PR_SLOWHZ }, \
- { 1, 0, &tcp_do_newreno }, \
+ { 0 }, \
{ 1, 0, &tcp_log_refused }, \
{ 0 }, \
{ 1, 0, &tcp_rst_ppslim }, \
@@ -909,9 +910,6 @@ int syn_cache_respond(struct syn_cache
void syn_cache_timer(void *);
void syn_cache_cleanup(struct tcpcb *);
-void tcp_reno_newack(struct tcpcb *, struct tcphdr *);
-void tcp_newreno_newack(struct tcpcb *, struct tcphdr *);
-
int tcp_input_checksum(int, struct mbuf *, const struct tcphdr *, int, int,
int);
#endif
--Apple-Mail-1-336914240
Content-Transfer-Encoding: 7bit
Content-Type: text/plain;
charset=US-ASCII;
format=flowed
--
Rui Paulo
--Apple-Mail-1-336914240--