Source-Changes-HG archive
[src/trunk]: src/sys/netinet * implement TCP CUBIC congestion control algorithm
details: https://anonhg.NetBSD.org/src/rev/aeccba13f496
branches: trunk
changeset: 791294:aeccba13f496
user: kefren <kefren%NetBSD.org@localhost>
date: Tue Nov 12 09:02:05 2013 +0000
description:
* implement TCP CUBIC congestion control algorithm
* move tcp_sack_newack bits inside reno and newreno_fast_retransmit_newack
* notify ECN peer about cwnd shrink in [new]reno_slow_retransmit
Based on the patch proposed on tech-net@ on Nov 7 with minor improvements:
* adapt wmax for the no-fast-convergence case
* correct cbrt calculation for big window sizes (>750KB)
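As background, draft-rhee-tcpm-cubic-02 grows the congestion window as a cubic
function of the time elapsed since the last congestion event,
W(t) = C*(t - K)^3 + Wmax with K = cbrt(Wmax*beta/C), which is why the patch
needs a cube-root helper and a Wmax state variable. Below is a minimal
user-space sketch in floating point, assuming the draft's constants C = 0.4 and
a multiplicative decrease of 0.2 (the window drops to 0.8*Wmax after a loss,
matching CUBIC_BETAA/CUBIC_BETAB = 4/5 in the diff, with CUBIC_CA/CUBIC_CB =
4/10 apparently encoding C); the kernel code uses fixed-point integer
arithmetic and tcp_cubic_cbrt() rather than libm:

/*
 * Illustrative sketch of the CUBIC window function from
 * draft-rhee-tcpm-cubic-02; names and floating point are for clarity
 * only and do not mirror the kernel implementation.
 */
#include <math.h>
#include <stdio.h>

#define CUBIC_C         0.4     /* scaling constant C from the draft */
#define CUBIC_BETA      0.2     /* multiplicative decrease; cwnd drops to 0.8*Wmax */

/* W(t) = C*(t - K)^3 + Wmax, with K = cbrt(Wmax*beta/C) */
static double
cubic_window(double wmax, double t)
{
        double k = cbrt(wmax * CUBIC_BETA / CUBIC_C);
        double d = t - k;

        return CUBIC_C * d * d * d + wmax;
}

int
main(void)
{
        double wmax = 100.0;    /* segments in flight before the last loss */
        double t;

        /* starts at 0.8*Wmax, plateaus near Wmax, then probes beyond it */
        for (t = 0.0; t <= 8.0; t += 1.0)
                printf("t=%.0fs W(t)=%.1f segments\n", t, cubic_window(wmax, t));
        return 0;
}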
diffstat:
sys/netinet/tcp_congctl.c | 303 +++++++++++++++++++++++++++++++++++++++++----
sys/netinet/tcp_congctl.h | 3 +-
sys/netinet/tcp_input.c | 12 +-
sys/netinet/tcp_sack.c | 67 +---------
sys/netinet/tcp_subr.c | 7 +-
sys/netinet/tcp_var.h | 8 +-
6 files changed, 294 insertions(+), 106 deletions(-)
diffs (truncated from 612 to 300 lines):
diff -r 56f4e9a7b741 -r aeccba13f496 sys/netinet/tcp_congctl.c
--- a/sys/netinet/tcp_congctl.c Tue Nov 12 06:07:30 2013 +0000
+++ b/sys/netinet/tcp_congctl.c Tue Nov 12 09:02:05 2013 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: tcp_congctl.c,v 1.17 2013/10/25 16:29:20 martin Exp $ */
+/* $NetBSD: tcp_congctl.c,v 1.18 2013/11/12 09:02:05 kefren Exp $ */
/*-
* Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006 The NetBSD Foundation, Inc.
@@ -135,7 +135,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: tcp_congctl.c,v 1.17 2013/10/25 16:29:20 martin Exp $");
+__KERNEL_RCSID(0, "$NetBSD: tcp_congctl.c,v 1.18 2013/11/12 09:02:05 kefren Exp $");
#include "opt_inet.h"
#include "opt_tcp_debug.h"
@@ -194,6 +194,9 @@
* consider separating the actual implementations in another file.
*/
+static void tcp_common_congestion_exp(struct tcpcb *, int, int);
+
+static int tcp_reno_do_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static int tcp_reno_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static void tcp_reno_slow_retransmit(struct tcpcb *);
static void tcp_reno_fast_retransmit_newack(struct tcpcb *,
@@ -206,6 +209,10 @@
const struct tcphdr *);
static void tcp_newreno_newack(struct tcpcb *, const struct tcphdr *);
+static int tcp_cubic_fast_retransmit(struct tcpcb *, const struct tcphdr *);
+static void tcp_cubic_slow_retransmit(struct tcpcb *tp);
+static void tcp_cubic_newack(struct tcpcb *, const struct tcphdr *);
+static void tcp_cubic_congestion_exp(struct tcpcb *);
static void tcp_congctl_fillnames(void);
@@ -241,6 +248,8 @@
KASSERT(r == 0);
r = tcp_congctl_register("newreno", &tcp_newreno_ctl);
KASSERT(r == 0);
+ r = tcp_congctl_register("cubic", &tcp_cubic_ctl);
+ KASSERT(r == 0);
/* NewReno is the default. */
#ifndef TCP_CONGCTL_DEFAULT
@@ -406,18 +415,28 @@
/* ------------------------------------------------------------------------ */
/*
- * TCP/Reno congestion control.
+ * Common stuff
*/
+
+/* Window reduction (1-beta) for [New]Reno: 0.5 */
+#define RENO_BETAA 1
+#define RENO_BETAB 2
+/* Window reduction (1-beta) for Cubic: 0.8 */
+#define CUBIC_BETAA 4
+#define CUBIC_BETAB 5
+/* Draft Rhee Section 4.1 */
+#define CUBIC_CA 4
+#define CUBIC_CB 10
+
static void
-tcp_reno_congestion_exp(struct tcpcb *tp)
+tcp_common_congestion_exp(struct tcpcb *tp, int betaa, int betab)
{
u_int win;
/*
- * Halve the congestion window and reduce the
- * slow start threshold.
+ * Reduce the congestion window and the slow start threshold.
*/
- win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_segsz;
+ win = min(tp->snd_wnd, tp->snd_cwnd) * betaa / betab / tp->t_segsz;
if (win < 2)
win = 2;
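(Worked example with hypothetical values, not from the patch: with
snd_cwnd = snd_wnd = 65536 bytes and t_segsz = 1460, the [New]Reno fraction
1/2 gives win = 65536 * 1 / 2 / 1460 = 22 segments, while the CUBIC fraction
4/5 gives win = 65536 * 4 / 5 / 1460 = 35 segments, i.e. a much gentler
reduction on congestion.)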
@@ -434,9 +453,20 @@
}
+/* ------------------------------------------------------------------------ */
+
+/*
+ * TCP/Reno congestion control.
+ */
+static void
+tcp_reno_congestion_exp(struct tcpcb *tp)
+{
+
+ tcp_common_congestion_exp(tp, RENO_BETAA, RENO_BETAB);
+}
static int
-tcp_reno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
+tcp_reno_do_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{
/*
* We know we're losing at the current
@@ -458,10 +488,8 @@
* irrespective of the number of DupAcks.
*/
- tcp_seq onxt;
-
- onxt = tp->snd_nxt;
- tcp_reno_congestion_exp(tp);
+ tcp_seq onxt = tp->snd_nxt;
+
tp->t_partialacks = 0;
TCP_TIMER_DISARM(tp, TCPT_REXMT);
tp->t_rtttime = 0;
@@ -482,6 +510,14 @@
return 0;
}
+static int
+tcp_reno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
+{
+
+ tcp_reno_congestion_exp(tp);
+ return tcp_reno_do_fast_retransmit(tp, th);
+}
+
static void
tcp_reno_slow_retransmit(struct tcpcb *tp)
{
@@ -521,6 +557,9 @@
tp->t_partialacks = -1;
tp->t_dupacks = 0;
tp->t_bytes_acked = 0;
+
+ if (TCP_ECN_ALLOWED(tp))
+ tp->t_flags |= TF_ECN_SND_CWR;
}
static void
@@ -543,6 +582,8 @@
tp->t_partialacks = -1;
tp->t_dupacks = 0;
tp->t_bytes_acked = 0;
+ if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack))
+ tp->snd_fack = th->th_ack;
}
}
@@ -653,6 +694,7 @@
*/
tcp_seq onxt = tp->snd_nxt;
u_long ocwnd = tp->snd_cwnd;
+ int sack_num_segs = 1, sack_bytes_rxmt = 0;
/*
* snd_una has not yet been updated and the socket's send
@@ -660,24 +702,52 @@
* have to leave snd_una as it was to get the correct data
* offset in tcp_output().
*/
- if (++tp->t_partialacks == 1)
- TCP_TIMER_DISARM(tp, TCPT_REXMT);
+ tp->t_partialacks++;
+ TCP_TIMER_DISARM(tp, TCPT_REXMT);
tp->t_rtttime = 0;
tp->snd_nxt = th->th_ack;
- /*
- * Set snd_cwnd to one segment beyond ACK'd offset. snd_una
- * is not yet updated when we're called.
- */
- tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una);
- (void) tcp_output(tp);
- tp->snd_cwnd = ocwnd;
- if (SEQ_GT(onxt, tp->snd_nxt))
- tp->snd_nxt = onxt;
- /*
- * Partial window deflation. Relies on fact that tp->snd_una
- * not updated yet.
- */
- tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_segsz);
+
+ if (TCP_SACK_ENABLED(tp)) {
+ /*
+ * Partial ack handling within a sack recovery episode.
+ * Keeping this very simple for now. When a partial ack
+ * is received, force snd_cwnd to a value that will
+ * allow the sender to transmit no more than 2 segments.
+ * If necessary, a fancier scheme can be adopted at a
+ * later point, but for now, the goal is to prevent the
+ * sender from bursting a large amount of data in the
+ * midst of sack recovery.
+ */
+
+ /*
+ * send one or 2 segments based on how much
+ * new data was acked
+ */
+ if (((th->th_ack - tp->snd_una) / tp->t_segsz) > 2)
+ sack_num_segs = 2;
+ (void)tcp_sack_output(tp, &sack_bytes_rxmt);
+ tp->snd_cwnd = sack_bytes_rxmt +
+ (tp->snd_nxt - tp->sack_newdata) +
+ sack_num_segs * tp->t_segsz;
+ tp->t_flags |= TF_ACKNOW;
+ (void) tcp_output(tp);
+ } else {
+ /*
+ * Set snd_cwnd to one segment beyond ACK'd offset
+ * snd_una is not yet updated when we're called
+ */
+ tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una);
+ (void) tcp_output(tp);
+ tp->snd_cwnd = ocwnd;
+ if (SEQ_GT(onxt, tp->snd_nxt))
+ tp->snd_nxt = onxt;
+ /*
+ * Partial window deflation. Relies on fact that
+ * tp->snd_una not updated yet.
+ */
+ tp->snd_cwnd -= (th->th_ack - tp->snd_una -
+ tp->t_segsz);
+ }
} else {
/*
* Complete ack. Inflate the congestion window to ssthresh
@@ -696,6 +766,8 @@
tp->t_partialacks = -1;
tp->t_dupacks = 0;
tp->t_bytes_acked = 0;
+ if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack))
+ tp->snd_fack = th->th_ack;
}
}
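(Worked example for the SACK partial-ack clamp above, with hypothetical
values: with t_segsz = 1460, sack_bytes_rxmt = 0, snd_nxt - sack_newdata =
2920, and a partial ack covering more than two segments of new data, so
sack_num_segs = 2, snd_cwnd becomes 0 + 2920 + 2 * 1460 = 5840 bytes, which
leaves room for at most two additional segments during this recovery
episode.)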
@@ -720,4 +792,179 @@
.cong_exp = tcp_reno_congestion_exp,
};
+/*
+ * CUBIC - http://tools.ietf.org/html/draft-rhee-tcpm-cubic-02
+ */
+/* Cubic prototypes */
+static void tcp_cubic_update_ctime(struct tcpcb *tp);
+static uint32_t tcp_cubic_diff_ctime(struct tcpcb *);
+static uint32_t tcp_cubic_cbrt(uint32_t);
+static uint32_t tcp_cubic_getW(struct tcpcb *);
+
+/* Cubic TIME functions - XXX I don't like using timevals and microuptime */
+/*
+ * Set congestion timer to now
+ */
+static void
+tcp_cubic_update_ctime(struct tcpcb *tp)
+{
+ struct timeval now_timeval;
+
+ getmicrouptime(&now_timeval);
+ tp->snd_cubic_ctime = now_timeval.tv_sec * 1000 +
+ now_timeval.tv_usec / 1000;
+}
+
+/*
+ * milliseconds since last congestion
+ */
+static uint32_t
+tcp_cubic_diff_ctime(struct tcpcb *tp)
+{
+ struct timeval now_timeval;
+
+ getmicrouptime(&now_timeval);
+ return now_timeval.tv_sec * 1000 + now_timeval.tv_usec / 1000 -
+ tp->snd_cubic_ctime;
+}
+
+/*
+ * Approximate cubic root
+ */
+#define CBRT_ROUNDS 30
+static uint32_t
+tcp_cubic_cbrt(uint32_t v)
+{
+ int i, rounds = CBRT_ROUNDS;
+ uint64_t x = v / 3;
+
+ /* We fail to calculate correctly for small numbers */
+ if (v == 0)
+ return 0;
+ else if (v < 4)
+ return 1;
+
+ /*
+ * largest x for which 2*x^3+3*x fits in 64 bits;
+ * Avoid overflow at the cost of extra iterations
+ */
+ if (x > 2097151)
+ rounds += 10;
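The diff is truncated here, just before the iteration loop of
tcp_cubic_cbrt(). For illustration only, here is a standalone user-space
sketch of a fixed-iteration Newton-Raphson integer cube root matching the
setup shown above (initial guess v/3, CBRT_ROUNDS passes, small inputs
special-cased); the loop body is an assumption and does not reproduce the
committed code:

/*
 * Newton-Raphson cube root of a 32-bit integer: iterate
 * x = (v/(x*x) + 2*x) / 3 a fixed number of times.
 */
#include <stdint.h>
#include <stdio.h>

#define CBRT_ROUNDS     30

static uint32_t
cbrt_newton(uint32_t v)
{
        uint64_t x = v / 3;     /* same initial guess as tcp_cubic_cbrt() */
        int i;

        if (v == 0)
                return 0;
        else if (v < 4)
                return 1;       /* iteration is not exact for tiny inputs */

        for (i = 0; i < CBRT_ROUNDS; i++)
                x = (v / (x * x) + 2 * x) / 3;

        return (uint32_t)x;
}

int
main(void)
{
        printf("cbrt(27) ~= %u\n", cbrt_newton(27));            /* 3 */
        printf("cbrt(1000000) ~= %u\n", cbrt_newton(1000000));  /* 100 */
        return 0;
}

Note that for very large inputs the starting guess v/3 is far from the root
and 30 rounds may not suffice, which presumably motivates the extra rounds
added above when x > 2097151.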