Subject: Re: Hashing IP reassembly queues, phase 2 of 2: fragmeDoS
To: None <tech-net@NetBSD.org>
From: Jonathan Stone <jonathan@DSG.Stanford.EDU>
List: tech-net
Date: 12/13/2003 14:25:21
Here is a revised version of the patch which implements a `drop half'
strategy when the IP reassembly queue experiences mbuf fragment pressure.
This version includes the two improvements I mentioned before. I
already checked in changes to -current to maintain both a total count of
all fragments in the reassembly queue, and a count of fragment in
each reasembly-queue chain (each fragmented packet).
Median computation is now folded into the loop which decrements
reassembly TTL once every ip_slowtimo(). Decrementing TTL requires
walking the chain of reassembly queues (fragmented packets) anyway,
and the marginal cost to compute medina TTL is trivial.
There is also code in ip_reass(), inside ``#ifdef notyet'', which will
make every call to ip_reass() check for fragment overflow, and if
found, execute a drop-half phase. Once the code has settled in, I plan
to enable that fragment, and disable the ip_maxfragpackets limit by setting
ip_maxfragpackets to 0.
I typically need to be able to support a thousand of NFS clients or
so, and the limit of 200 reassembly packets is far too low for worst-case
traffic in that configuration. If I up ip_maxfragpackets to 2000, it
becomes completely ineffective; whereas the ip_maxfrags/drop-half makes
it unecessary, too.
Any further comments before I commit this?
Index: ip_input.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/ip_input.c,v
retrieving revision 1.192
diff -u -r1.192 ip_input.c
--- ip_input.c 2003/12/08 02:23:27 1.192
+++ ip_input.c 2003/12/13 04:02:13
@@ -247,7 +247,23 @@
int ip_nfragpackets = 0;
int ip_maxfragpackets = 200;
int ip_nfrags = 0; /* total fragments in reass queues */
+int ip_maxfrags = 0; /* total fragments in reass queues */
+/*
+ * Additive-Increase/Multiplicative-Decrease (AIMD) strategy for
+ * IP reassembly queue buffer managment.
+ *
+ * We keep a count of total IP fragments (NB: not fragmented packets!)
+ * awaiting reassembly (ip_nfrags) and a limit (ip_maxfrags) on fragments.
+ * If ip_nfrags exceeds ip_maxfrags the limit, we drop half the
+ * total fragments in reassembly queues.This AIMD policy avoids
+ * repeatedly deleting single packets under heavy fragmentation load
+ * (e.g., from lossy NFS peers).
+ */
+static u_int ip_reass_ttl_decr __P((u_int ticks));
+static void ip_reass_drophalf __P((void));
+
+
static __inline int ipq_lock_try __P((void));
static __inline void ipq_unlock __P((void));
@@ -375,7 +391,9 @@
LIST_INIT(&ipq[i]);
ip_id = time.tv_sec & 0xfffff;
+
ipintrq.ifq_maxlen = ipqmaxlen;
+ ip_maxfrags = nmbclusters / 4;
TAILQ_INIT(&in_ifaddrhead);
in_ifaddrhashtbl = hashinit(IN_IFADDR_HASH_SIZE, HASH_LIST, M_IFADDR,
M_WAITOK, &in_ifaddrhash);
@@ -1003,6 +1021,11 @@
m->m_data += hlen;
m->m_len -= hlen;
+#ifdef notyet
+ if (ip_nfrags >= ip_maxfrags)
+ ip_reass_drophalf(void);
+#endif
+
/*
* We are about to add a fragment; increment frag count.
*/
@@ -1201,30 +1224,93 @@
}
/*
- * IP timer processing;
- * if a timer expires on a reassembly
- * queue, discard it.
+ * IP reassembly TTL machinery for multiplicative drop.
*/
-void
-ip_slowtimo()
+static u_int fragttl_histo[(IPFRAGTTL+1)];
+
+
+/*
+ * Decrement TTL of all reasembly queue entries by `ticks'.
+ * Count number of distinct fragments (as opposed to partial, fragmented
+ * datagrams) in the reassembly queue. While we traverse the entire
+ * reassembly queue, compute and return the median TTL over all fragments.
+ */
+static u_int
+ip_reass_ttl_decr(u_int ticks)
{
- static u_int dropscanidx = 0;
- u_int i;
+ u_int i, nfrags, median;
struct ipq *fp, *nfp;
- int s = splsoftnet();
- IPQ_LOCK();
+ nfrags = 0;
+ memset(fragttl_histo, 0, sizeof fragttl_histo);
+
for (i = 0; i < IPREASS_NHASH; i++) {
for (fp = LIST_FIRST(&ipq[i]); fp != NULL; fp = nfp) {
+ fp->ipq_ttl = ((fp->ipq_ttl <= ticks) ?
+ 0 : fp->ipq_ttl - ticks);
nfp = LIST_NEXT(fp, ipq_q);
- if (--fp->ipq_ttl == 0) {
+ if (fp->ipq_ttl == 0) {
ipstat.ips_fragtimeout++;
ip_freef(fp);
+ } else {
+ nfrags += fp->ipq_nfrags;
+ fragttl_histo[fp->ipq_ttl] += fp->ipq_nfrags;
}
}
}
+
+ KASSERT(ip_nfrags == nfrags);
+
+ /* find median in histogram */
+ for (i = 0, median = 0; i <= IPFRAGTTL; i++) {
+ median += fragttl_histo[i];
+ if (median * 2 >= ip_nfrags)
+ break;
+ }
+
+ return (u_int)i;
+}
+
+void
+ip_reass_drophalf(void)
+{
+
+ u_int median_ticks;
+ /*
+ * Compute median TTL of all fragments, and count frags
+ * with that TTL or lower (roughly half of all fragments).
+ */
+ median_ticks = ip_reass_ttl_decr(0);
+
+ /* Drop half. */
+ median_ticks = ip_reass_ttl_decr(median_ticks);
+
+}
+
+/*
+ * IP timer processing;
+ * if a timer expires on a reassembly
+ * queue, discard it.
+ */
+void
+ip_slowtimo()
+{
+ static u_int dropscanidx = 0;
+ u_int i;
+ u_int median_ttl;
+ int s = splsoftnet();
+
+ IPQ_LOCK();
+
+ /* Age TTL of all fragments by 1 tick .*/
+ median_ttl = ip_reass_ttl_decr(1);
+
+ /* If we have too many fragments, drop the older half. */
+ if (ip_nfrags > ip_maxfrags)
+ ip_reass_ttl_decr(median_ttl);
+
/*
- * If we are over the maximum number of fragments
+ * If we are over the maximum number of fragmented packets
* (due to the limit being lowered), drain off
* enough to get down to the new limit. Start draining
* from the reassembly hashqueue most recently drained.
@@ -1263,7 +1349,6 @@
void
ip_drain()
{
- int i;
/*
* We may be called from a device's interrupt context. If
@@ -1272,15 +1357,11 @@
if (ipq_lock_try() == 0)
return;
- for (i = 0; i < IPREASS_NHASH; i++) {
- struct ipqhead *ipqh = &ipq[i];
- struct ipq *fp, *nfp;
- for (fp = LIST_FIRST(ipqh); fp != NULL; fp = nfp) {
- nfp = LIST_NEXT(fp, ipq_q);
- ip_freef(fp);
- ipstat.ips_fragdropped++;
- }
- }
+ /*
+ * Drop half the total fragments now. If more mbufs are needed,
+ * we will be called again soon.
+ */
+ ip_reass_drophalf();
IPQ_UNLOCK();
}