Source-Changes-HG archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[src/trunk]: src/sys/netinet Commit TCP SACK patches from Kentaro A. Kurahone...
details: https://anonhg.NetBSD.org/src/rev/e606b3e4972b
branches: trunk
changeset: 574450:e606b3e4972b
user: jonathan <jonathan%NetBSD.org@localhost>
date: Mon Feb 28 16:20:59 2005 +0000
description:
Commit TCP SACK patches from Kentaro A. Kurahone's patch at:
http://www.sigusr1.org/~kurahone/tcp-sack-netbsd-02152005.diff.gz
Fixes in that patch for pre-existing TCP pcb initializations were already
committed to NetBSD-current, so are not included in this commit.
The SACK patch has been observed to correctly negotiate and respond
to SACKs in wide-area traffic.
There are two independently observed, as-yet-unresolved anomalies:
first, unexplained delays in fast retransmission
(potentially explainable by a 0.2 sec RTT between adjacent
ethernet/wifi NICs); and second, peculiar and unexplained TCP
retransmits observed over an ath0 card.
After discussion with several interested developers, I'm committing
this now, as-is, for more eyes to use and look over. Current hypothesis
is that the anomalies above may in fact be due to link-level (hardware,
driver, HAL, firmware) aberrations in the test setup, affecting both
Kentaro's wired-Ethernet NIC and my two (different) WiFi NICs.
diffstat:
sys/netinet/files.netinet | 3 +-
sys/netinet/tcp_input.c | 84 +++++-
sys/netinet/tcp_output.c | 186 ++++++++++++++-
sys/netinet/tcp_sack.c | 547 ++++++++++++++++++++++++++++++++++++++++++++++
sys/netinet/tcp_subr.c | 16 +-
sys/netinet/tcp_timer.c | 17 +-
sys/netinet/tcp_var.h | 59 ++++-
7 files changed, 859 insertions(+), 53 deletions(-)
diffs (truncated from 1316 to 300 lines):
diff -r 3f1b87aea669 -r e606b3e4972b sys/netinet/files.netinet
--- a/sys/netinet/files.netinet Mon Feb 28 16:16:19 2005 +0000
+++ b/sys/netinet/files.netinet Mon Feb 28 16:20:59 2005 +0000
@@ -1,4 +1,4 @@
-# $NetBSD: files.netinet,v 1.9 2005/01/13 19:09:40 drochner Exp $
+# $NetBSD: files.netinet,v 1.10 2005/02/28 16:20:59 jonathan Exp $
defflag opt_tcp_debug.h TCP_DEBUG
defparam opt_tcp_debug.h TCP_NDEBUG
@@ -30,6 +30,7 @@
file netinet/tcp_debug.c (inet | inet6) & tcp_debug
file netinet/tcp_input.c inet | inet6
file netinet/tcp_output.c inet | inet6
+file netinet/tcp_sack.c inet | inet6
file netinet/tcp_subr.c inet | inet6
file netinet/tcp_timer.c inet | inet6
file netinet/tcp_usrreq.c inet | inet6
diff -r 3f1b87aea669 -r e606b3e4972b sys/netinet/tcp_input.c
--- a/sys/netinet/tcp_input.c Mon Feb 28 16:16:19 2005 +0000
+++ b/sys/netinet/tcp_input.c Mon Feb 28 16:20:59 2005 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: tcp_input.c,v 1.221 2005/02/26 22:45:12 perry Exp $ */
+/* $NetBSD: tcp_input.c,v 1.222 2005/02/28 16:20:59 jonathan Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
@@ -148,7 +148,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: tcp_input.c,v 1.221 2005/02/26 22:45:12 perry Exp $");
+__KERNEL_RCSID(0, "$NetBSD: tcp_input.c,v 1.222 2005/02/28 16:20:59 jonathan Exp $");
#include "opt_inet.h"
#include "opt_ipsec.h"
@@ -493,6 +493,7 @@
SEQ_GEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) {
tcpstat.tcps_rcvduppack++;
tcpstat.tcps_rcvdupbyte += pkt_len;
+ tcp_new_dsack(tp, pkt_seq, pkt_len);
m_freem(m);
if (tiqe != NULL)
pool_put(&tcpipqent_pool, tiqe);
@@ -1484,6 +1485,10 @@
if (tcp_dooptions(tp, optp, optlen, th, m, toff, &opti) < 0)
goto drop;
+ if (TCP_SACK_ENABLED(tp)) {
+ tcp_del_sackholes(tp, th);
+ }
+
if (opti.ts_present && opti.ts_ecr) {
/*
* Calculate the RTT from the returned time stamp and the
@@ -1556,6 +1561,7 @@
tp->t_lastoff -= acked;
tp->snd_una = th->th_ack;
+ tp->snd_fack = tp->snd_una;
if (SEQ_LT(tp->snd_high, tp->snd_una))
tp->snd_high = tp->snd_una;
m_freem(m);
@@ -1592,6 +1598,7 @@
* we have enough buffer space to take it.
*/
++tcpstat.tcps_preddat;
+ tp->rcv_sack_num = 0;
tp->rcv_nxt += tlen;
tcpstat.tcps_rcvpack++;
tcpstat.tcps_rcvbyte += tlen;
@@ -1799,6 +1806,7 @@
tcpstat.tcps_rcvduppack++;
tcpstat.tcps_rcvdupbyte += tlen;
tcpstat.tcps_pawsdrop++;
+ tcp_new_dsack(tp, th->th_seq, tlen);
goto dropafterack;
}
}
@@ -1847,6 +1855,7 @@
tcpstat.tcps_rcvpartduppack++;
tcpstat.tcps_rcvpartdupbyte += todrop;
}
+ tcp_new_dsack(tp, th->th_seq, todrop);
hdroptlen += todrop; /*drop from head afterwards*/
th->th_seq += todrop;
tlen -= todrop;
@@ -2075,12 +2084,19 @@
* so bump cwnd by the amount in the receiver
* to keep a constant cwnd packets in the
* network.
+ *
+ * If we are using TCP/SACK, then enter
+ * Fast Recovery if the receiver SACKs
+ * data that is tcprexmtthresh * MSS
+ * bytes past the last ACKed segment,
+ * irrespective of the number of DupAcks.
*/
if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 ||
th->th_ack != tp->snd_una)
tp->t_dupacks = 0;
- else if (++tp->t_dupacks == tcprexmtthresh &&
- tp->t_partialacks < 0) {
+ else if (tp->t_partialacks < 0 &&
+ (++tp->t_dupacks == tcprexmtthresh ||
+ TCP_FACK_FASTRECOV(tp))) {
tcp_seq onxt;
u_int win;
@@ -2105,6 +2121,13 @@
tp->t_partialacks = 0;
TCP_TIMER_DISARM(tp, TCPT_REXMT);
tp->t_rtttime = 0;
+ if (TCP_SACK_ENABLED(tp)) {
+ tp->t_dupacks = tcprexmtthresh;
+ tp->sack_newdata = tp->snd_nxt;
+ tp->snd_cwnd = tp->t_segsz;
+ (void) tcp_output(tp);
+ goto drop;
+ }
tp->snd_nxt = th->th_ack;
tp->snd_cwnd = tp->t_segsz;
(void) tcp_output(tp);
@@ -2138,10 +2161,12 @@
* If the congestion window was inflated to account
* for the other side's cached packets, retract it.
*/
- if (!tcp_do_newreno)
+ if (TCP_SACK_ENABLED(tp))
+ tcp_sack_newack(tp, th);
+ else if (tcp_do_newreno)
+ tcp_newreno_newack(tp, th);
+ else
tcp_reno_newack(tp, th);
- else
- tcp_newreno_newack(tp, th);
if (SEQ_GT(th->th_ack, tp->snd_max)) {
tcpstat.tcps_rcvacktoomuch++;
goto dropafterack;
@@ -2212,6 +2237,8 @@
}
sowwakeup(so);
tp->snd_una = th->th_ack;
+ if (SEQ_GT(tp->snd_una, tp->snd_fack))
+ tp->snd_fack = tp->snd_una;
if (SEQ_LT(tp->snd_nxt, tp->snd_una))
tp->snd_nxt = tp->snd_una;
if (SEQ_LT(tp->snd_high, tp->snd_una))
@@ -2406,6 +2433,7 @@
} else {
m_adj(m, hdroptlen);
tiflags = tcp_reass(tp, th, m, &tlen);
+ tcp_update_sack_list(tp);
tp->t_flags |= TF_ACKNOW;
}
TCP_REASS_UNLOCK(tp);
@@ -2478,8 +2506,10 @@
/*
* Return any desired output.
*/
- if (needoutput || (tp->t_flags & TF_ACKNOW))
+ if (needoutput || (tp->t_flags & TF_ACKNOW)) {
+ tcp_update_sack_list(tp);
(void) tcp_output(tp);
+ }
if (tcp_saveti)
m_freem(tcp_saveti);
return;
@@ -2515,6 +2545,7 @@
dropafterack2:
m_freem(m);
tp->t_flags |= TF_ACKNOW;
+ tcp_update_sack_list(tp);
(void) tcp_output(tp);
if (tcp_saveti)
m_freem(tcp_saveti);
@@ -2817,24 +2848,14 @@
continue;
if (!(th->th_flags & TH_SYN))
continue;
- tp->t_flags &= ~TF_CANT_TXSACK;
+ if (tcp_do_sack) {
+ tp->t_flags |= TF_SACK_PERMIT;
+ tp->t_flags |= TF_WILL_SACK;
+ }
break;
case TCPOPT_SACK:
- if (tp->t_flags & TF_IGNR_RXSACK)
- continue;
- if (optlen % 8 != 2 || optlen < 10)
- continue;
- cp += 2;
- optlen -= 2;
- for (; optlen > 0; cp -= 8, optlen -= 8) {
- tcp_seq lwe, rwe;
- bcopy((char *)cp, (char *) &lwe, sizeof(lwe));
- NTOHL(lwe);
- bcopy((char *)cp, (char *) &rwe, sizeof(rwe));
- NTOHL(rwe);
- /* tcp_mark_sacked(tp, lwe, rwe); */
- }
+ tcp_sack_option(tp, th, cp, optlen);
break;
#ifdef TCP_SIGNATURE
case TCPOPT_SIGNATURE:
@@ -3663,6 +3684,9 @@
TCP_TIMER_ARM(tp, TCPT_KEEP, TCPTV_KEEP_INIT);
tcpstat.tcps_accepts++;
+ if ((sc->sc_flags & SCF_SACK_PERMIT) && tcp_do_sack)
+ tp->t_flags |= TF_WILL_SACK;
+
#ifdef TCP_SIGNATURE
if (sc->sc_flags & SCF_SIGNATURE)
tp->t_flags |= TF_SIGNATURE;
@@ -3952,6 +3976,8 @@
sc->sc_requested_s_scale = 15;
sc->sc_request_r_scale = 15;
}
+ if ((tb.t_flags & TF_SACK_PERMIT) && tcp_do_sack)
+ sc->sc_flags |= SCF_SACK_PERMIT;
#ifdef TCP_SIGNATURE
if (tb.t_flags & TF_SIGNATURE)
sc->sc_flags |= SCF_SIGNATURE;
@@ -4003,6 +4029,7 @@
/* Compute the size of the TCP options. */
optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
+ ((sc->sc_flags & SCF_SACK_PERMIT) ? (TCPOLEN_SACK_PERMITTED + 2) : 0) +
#ifdef TCP_SIGNATURE
((sc->sc_flags & SCF_SIGNATURE) ? (TCPOLEN_SIGNATURE + 2) : 0) +
#endif
@@ -4108,6 +4135,17 @@
optp += TCPOLEN_TSTAMP_APPA;
}
+ if (sc->sc_flags & SCF_SACK_PERMIT) {
+ u_int8_t *p = optp;
+
+ /* Let the peer know that we will SACK. */
+ p[0] = TCPOPT_SACK_PERMITTED;
+ p[1] = 2;
+ p[2] = TCPOPT_NOP;
+ p[3] = TCPOPT_NOP;
+ optp += 4;
+ }
+
#ifdef TCP_SIGNATURE
if (sc->sc_flags & SCF_SIGNATURE) {
struct secasvar *sav;
diff -r 3f1b87aea669 -r e606b3e4972b sys/netinet/tcp_output.c
--- a/sys/netinet/tcp_output.c Mon Feb 28 16:16:19 2005 +0000
+++ b/sys/netinet/tcp_output.c Mon Feb 28 16:20:59 2005 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: tcp_output.c,v 1.117 2005/02/26 22:45:12 perry Exp $ */
+/* $NetBSD: tcp_output.c,v 1.118 2005/02/28 16:20:59 jonathan Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
@@ -138,7 +138,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: tcp_output.c,v 1.117 2005/02/26 22:45:12 perry Exp $");
+__KERNEL_RCSID(0, "$NetBSD: tcp_output.c,v 1.118 2005/02/28 16:20:59 jonathan Exp $");
#include "opt_inet.h"
#include "opt_ipsec.h"
@@ -203,7 +203,7 @@
extern struct mbuf *m_copypack();
#endif
-#define MAX_TCPOPTLEN 32 /* max # bytes that go in options */
+#define MAX_TCPOPTLEN 40 /* max # bytes that go in options */
/*
* Knob to enable Congestion Window Monitoring, and control the
@@ -554,6 +554,9 @@
int maxburst = TCP_MAXBURST;
int af; /* address family on the wire */
int iphdrlen;
+ int sack_rxmit;
+ int sack_bytes_rxmt;
+ struct sackhole *p;
#ifdef TCP_SIGNATURE
int sigoff = 0;
#endif
@@ -654,12 +657,70 @@
* flags that should be used. If there is some data or critical
* controls (SYN, RST) to send, then transmit; otherwise,
* investigate further.
+ *
+ * Readjust SACK information to avoid resending duplicate data.
*/
+ if (TCP_SACK_ENABLED(tp) && SEQ_LT(tp->snd_nxt, tp->snd_max))
+ tcp_sack_adjust(tp);
sendalot = 0;
off = tp->snd_nxt - tp->snd_una;
win = min(tp->snd_wnd, tp->snd_cwnd);
Home |
Main Index |
Thread Index |
Old Index