Subject: Hashing IP reassembly queues, phase 2 of 2: fragmeDoS
To: None <tech-net@netbsd.org>
From: Jonathan Stone <jonathan@DSG.Stanford.EDU>
List: tech-net
Date: 12/05/2003 16:37:29
Here is the second change to IP reassembly: newly-arrived fragments
yield an additive increase. To guarantee stability (and to not consume
excessive time "cleaning" fragments), the reassembly code now executes
a multiplicative drop (drop-half) when the total fragment count
exceeds a threshold.
I implemented this multiplicative-drop strategy after observing continual
mbuf exhaustion, due to a small number of Linux (circa RH 7.3)
clients: that is, the problem it solves is real, and sufficiently
well-known that commercial Unix/NFS-server vendors have released
patches to address the problem. Other strategies did not solve that
problem in a satisfactory or scalable way. The method works fine
empirically, in addition to being founded on control-theory principles
which are well understood in the context of TCP (AIMD).
I decided to go further and drop the oldest half of all packets, based on
observed traffic behaviour from the aforementioned NFS clients.
Think of this patch as "shipped but unpolished". I know of two
ways it could be improved:
1. Maintain a count of total fragments: add one to the total
for each new fragment inserted. Keep a counter in each
reassembly entry, and decrement the total by that counter
when a packet is completed or dropped. Then we only execute
the drop-half pass when the total exceeds the threshold. The incremental cost
of keeping the counters is balanced against scanning the entire
list in every call to ip_slowtimo().
2. Make the "drop half" threshold a parameter.
That will allow ip_slowtimo() to request "drop half the fragments
from the queue, if and only if we exceed the threshold"
whereas ip_drain() can request an unconditional drop
(drop oldest half, rather than dropping all fragments).
I plan to implement both of these, and to also clean up the total
reporting from the median-TTL computation.
Assuming that's done, any other comments before this is
cleaned up and committed?
--- sys/netinet/ip_input.c.HASH 2003-12-04 22:58:18.000000000 -0800
+++ sys/netinet/ip_input.c 2003-12-05 00:14:23.000000000 -0800
@@ -252,6 +252,21 @@
int ip_nfragpackets = 0;
int ip_maxfragpackets = 200;
+/*
+ * Additive-increase/multiplicative-decrease drop strategy for
+ * IP reassembly queue:
+ * There is a limit on the number of IP fragments (NB: not fragmented
+ * packets!) we will hold for IP reassembly. If we receive more
+ * fragments than the limit, we execute a multiplicative-decrease
+ * drop phase, to ensure stability under fragmentation denial-of-service
+ * attacks, or from peers whose "best effort" approaches a DoS.
+ */
+static int ip_frag_drainthresh ; /* nmbclusters / 4 */
+
+static int ip_reass_ttl_decr __P((unsigned ticks));
+static unsigned ip_reass_ttl_median __P((unsigned *ticks));
+static void ip_reass_drophalf __P((void));
+
static __inline int ipq_lock_try __P((void));
static __inline void ipq_unlock __P((void));
@@ -1182,6 +1197,118 @@
}
/*
+ * IP reassembly TTL machinery for multiplicative drop.
+ */
+unsigned fragttl_histo[(IPFRAGTTL+1)];
+
+
+/*
+ * Decrement TTL of all reassembly queue entries by `ticks'.
+ * Count number of distinct fragments (as opposed to partial, fragmented
+ * datagrams) in the reassembly queue.
+ * XXX: might as well compute TTL histogram for TTL median computation?
+ */
+static int
+ip_reass_ttl_decr(unsigned ticks)
+{
+ int i;
+ register struct ipq *fp, *nfp;
+ int nfrags = 0;
+
+ for (i = 0; i < IPREASS_NHASH; i++) {
+ for (fp = LIST_FIRST(&ipq[i]); fp != NULL; fp = nfp) {
+ register struct ipqent *ipqe;
+ fp->ipq_ttl = ((fp->ipq_ttl <= ticks) ?
+ 0 : fp->ipq_ttl - ticks);
+ nfp = LIST_NEXT(fp, ipq_q);
+ if (fp->ipq_ttl == 0) {
+ ipstat.ips_fragtimeout++;
+ ip_freef(fp);
+ }
+ else
+ for (ipqe = TAILQ_FIRST(&fp->ipq_fragq);
+ ipqe!= NULL; ipqe = TAILQ_NEXT(ipqe, ipqe_q))
+ nfrags++;
+ }
+ }
+ return nfrags;
+}
+
+/*
+ * Compute median TTL of all fragments in reassembly queue.
+ * (Count individual fragments, not datagrams.)
+ */
+static unsigned
+ip_reass_ttl_median(unsigned *ticks)
+{
+ int i;
+ int nfragpkts, totfrags;
+ unsigned median;
+ bzero(fragttl_histo, sizeof fragttl_histo);
+
+ nfragpkts = 0;
+ totfrags = 0;
+
+ /* compute histogram */
+ for (i = 0; i < IPREASS_NHASH; i++) {
+ struct ipq *fp;
+ fp = LIST_FIRST(&ipq[i]);
+ if (fp == NULL)
+ continue;
+ for ( ; fp != NULL; ) {
+ register struct ipqent *ipqe;
+ int fp_nfrags = 0;
+
+ nfragpkts++;
+ for (ipqe = TAILQ_FIRST(&fp->ipq_fragq);
+ ipqe!= NULL; ipqe = TAILQ_NEXT(ipqe, ipqe_q))
+ fp_nfrags++;
+
+ fragttl_histo[fp->ipq_ttl] += fp_nfrags;
+ totfrags += fp_nfrags;
+ fp = LIST_NEXT(fp, ipq_q);
+ }
+ }
+
+ /* find median in histogram */
+ for (i = 0, median = 0; i <= IPFRAGTTL; i++) {
+ median += fragttl_histo[i];
+ if (median * 2 >= totfrags)
+ break;
+ }
+
+ if (ticks != NULL)
+ *ticks = i;
+ return median;
+}
+
+/*
+ * Drop half the total fragments in the IP reassembly queue.
+ * Compute the median ttl value over all fragments, and count the
+ * total number of fragments with TTLs at or below that median.
+ * If that count is above our threshold, age all fragment TTLs
+ * by the median TTL value. Thus, if the threshold is exceeded,
+ * half of all fragments (packet buffers, typically mbuf clusters)
+ * will be dropped; and the dropped fragments will be the older ones
+ * (those with lowest residual TTL).
+ */
+static void
+ip_reass_drophalf(void)
+{
+ unsigned half_total_frags;
+ unsigned median_ticks = 0;
+
+ /*
+ * Compute median TTL of all fragments, and count frags
+ * with that TTL or lower (roughly half of all fragments).
+ */
+ half_total_frags = ip_reass_ttl_median(&median_ticks);
+
+ if (half_total_frags > ip_frag_drainthresh)
+ ip_reass_ttl_decr(median_ticks);
+}
+
+/*
* IP timer processing;
* if a timer expires on a reassembly
* queue, discard it.
@@ -1191,21 +1318,18 @@
{
static unsigned dropscanidx = 0;
unsigned i;
- struct ipq *fp, *nfp;
int s = splsoftnet();
IPQ_LOCK();
- for (i = 0; i < IPREASS_NHASH; i++) {
- for (fp = LIST_FIRST(&ipq[i]); fp != NULL; fp = nfp) {
- nfp = LIST_NEXT(fp, ipq_q);
- if (--fp->ipq_ttl == 0) {
- ipstat.ips_fragtimeout++;
- ip_freef(fp);
- }
- }
- }
+
+ /* Age TTL of all fragments by 1 tick */
+ ip_reass_ttl_decr(1);
+
+ /* If too many fragments, drop the older half */
+ ip_reass_drophalf();
+
/*
- * If we are over the maximum number of fragments
+ * If we are over the maximum number of fragmented packets
* (due to the limit being lowered), drain off
* enough to get down to the new limit. Start draining
* from the reassembly hashqueue most recently drained.