Subject: Re: Hashing IP reassembly queues, phase 2 of 2: fragmeDoS
To: Jonathan Stone <jonathan@DSG.Stanford.EDU>
From: Jason Thorpe <thorpej@wasabisystems.com>
List: tech-net
Date: 12/13/2003 14:41:31
--Apple-Mail-18-706294300
Content-Transfer-Encoding: 7bit
Content-Type: text/plain; charset=US-ASCII; format=flowed
On Dec 13, 2003, at 2:25 PM, Jonathan Stone wrote:
> Here is a revised version of the patch which implements a `drop half'
> strategy when the IP reassembly queue experiences mbuf fragment
> pressure.
>
>
> This version includes the two improvements I mentioned before. I
> already checked in changes to -current to maintain both a total count
> of all fragments in the reassembly queue, and a count of fragments in
> each reassembly-queue chain (each fragmented packet).
All good. A couple of minor things below. Otherwise, looks great to
me!
> Index: ip_input.c
> ===================================================================
> RCS file: /cvsroot/src/sys/netinet/ip_input.c,v
> retrieving revision 1.192
> diff -u -r1.192 ip_input.c
> --- ip_input.c 2003/12/08 02:23:27 1.192
> +++ ip_input.c 2003/12/13 04:02:13
> @@ -247,7 +247,23 @@
> int ip_nfragpackets = 0;
> int ip_maxfragpackets = 200;
> int ip_nfrags = 0; /* total fragments in reass queues */
> +int ip_maxfrags = 0; /* limit on total fragments in reass queues */
Don't initialize ip_nfrags to 0. Let it default to 0 by being in the
BSS segment. You would never want anyone to patch that variable, so don't
make it patchable. I think the same thing applies to ip_nfragpackets.
>
> +/*
> + * Additive-Increase/Multiplicative-Decrease (AIMD) strategy for
> + * IP reassembly queue buffer management.
> + *
> + * We keep a count of total IP fragments (NB: not fragmented packets!)
> + * awaiting reassembly (ip_nfrags) and a limit (ip_maxfrags) on
> fragments.
> + * If ip_nfrags exceeds the ip_maxfrags limit, we drop half the
> + * total fragments in reassembly queues. This AIMD policy avoids
> + * repeatedly deleting single packets under heavy fragmentation load
> + * (e.g., from lossy NFS peers).
> + */
> +static u_int ip_reass_ttl_decr __P((u_int ticks));
> +static void ip_reass_drophalf __P((void));
> +
> +
> static __inline int ipq_lock_try __P((void));
> static __inline void ipq_unlock __P((void));
>
> @@ -375,7 +391,9 @@
> LIST_INIT(&ipq[i]);
>
> ip_id = time.tv_sec & 0xfffff;
> +
> ipintrq.ifq_maxlen = ipqmaxlen;
> + ip_maxfrags = nmbclusters / 4;
On some architectures, nmbclusters can change at run-time (MIPS and Alpha
fall into this category), so you might want to figure out some way to
handle that. Just put an XXX comment there for now, maybe.
> TAILQ_INIT(&in_ifaddrhead);
> in_ifaddrhashtbl = hashinit(IN_IFADDR_HASH_SIZE, HASH_LIST, M_IFADDR,
> M_WAITOK, &in_ifaddrhash);
> @@ -1003,6 +1021,11 @@
> m->m_data += hlen;
> m->m_len -= hlen;
>
> +#ifdef notyet
> + if (ip_nfrags >= ip_maxfrags)
> + ip_reass_drophalf(void);
> +#endif
> +
> /*
> * We are about to add a fragment; increment frag count.
> */
> @@ -1201,30 +1224,93 @@
> }
>
> /*
> - * IP timer processing;
> - * if a timer expires on a reassembly
> - * queue, discard it.
> + * IP reassembly TTL machinery for multiplicative drop.
> */
> -void
> -ip_slowtimo()
> +static u_int fragttl_histo[(IPFRAGTTL+1)];
> +
> +
> +/*
> + * Decrement TTL of all reassembly queue entries by `ticks'.
> + * Count number of distinct fragments (as opposed to partial,
> fragmented
> + * datagrams) in the reassembly queue. While we traverse the entire
> + * reassembly queue, compute and return the median TTL over all
> fragments.
> + */
> +static u_int
> +ip_reass_ttl_decr(u_int ticks)
> {
> - static u_int dropscanidx = 0;
> - u_int i;
> + u_int i, nfrags, median;
> struct ipq *fp, *nfp;
> - int s = splsoftnet();
>
> - IPQ_LOCK();
> + nfrags = 0;
> + memset(fragttl_histo, 0, sizeof fragttl_histo);
> +
> for (i = 0; i < IPREASS_NHASH; i++) {
> for (fp = LIST_FIRST(&ipq[i]); fp != NULL; fp = nfp) {
> + fp->ipq_ttl = ((fp->ipq_ttl <= ticks) ?
> + 0 : fp->ipq_ttl - ticks);
> nfp = LIST_NEXT(fp, ipq_q);
> - if (--fp->ipq_ttl == 0) {
> + if (fp->ipq_ttl == 0) {
> ipstat.ips_fragtimeout++;
> ip_freef(fp);
> + } else {
> + nfrags += fp->ipq_nfrags;
> + fragttl_histo[fp->ipq_ttl] += fp->ipq_nfrags;
> }
> }
> }
> +
> + KASSERT(ip_nfrags == nfrags);
> +
> + /* find median in histogram */
> + for (i = 0, median = 0; i <= IPFRAGTTL; i++) {
> + median += fragttl_histo[i];
> + if (median * 2 >= ip_nfrags)
> + break;
> + }
> +
> + return (u_int)i;
> +}
> +
> +void
> +ip_reass_drophalf(void)
> +{
> +
> + u_int median_ticks;
> + /*
> + * Compute median TTL of all fragments, and count frags
> + * with that TTL or lower (roughly half of all fragments).
> + */
> + median_ticks = ip_reass_ttl_decr(0);
> +
> + /* Drop half. */
> + median_ticks = ip_reass_ttl_decr(median_ticks);
> +
> +}
> +
> +/*
> + * IP timer processing;
> + * if a timer expires on a reassembly
> + * queue, discard it.
> + */
> +void
> +ip_slowtimo()
> +{
> + static u_int dropscanidx = 0;
> + u_int i;
> + u_int median_ttl;
> + int s = splsoftnet();
> +
> + IPQ_LOCK();
> +
> + /* Age TTL of all fragments by 1 tick .*/
> + median_ttl = ip_reass_ttl_decr(1);
> +
> + /* If we have too many fragments, drop the older half. */
> + if (ip_nfrags > ip_maxfrags)
> + ip_reass_ttl_decr(median_ttl);
> +
> /*
> - * If we are over the maximum number of fragments
> + * If we are over the maximum number of fragmented packets
> * (due to the limit being lowered), drain off
> * enough to get down to the new limit. Start draining
> * from the reassembly hashqueue most recently drained.
> @@ -1263,7 +1349,6 @@
> void
> ip_drain()
> {
> - int i;
>
> /*
> * We may be called from a device's interrupt context. If
> @@ -1272,15 +1357,11 @@
> if (ipq_lock_try() == 0)
> return;
>
> - for (i = 0; i < IPREASS_NHASH; i++) {
> - struct ipqhead *ipqh = &ipq[i];
> - struct ipq *fp, *nfp;
> - for (fp = LIST_FIRST(ipqh); fp != NULL; fp = nfp) {
> - nfp = LIST_NEXT(fp, ipq_q);
> - ip_freef(fp);
> - ipstat.ips_fragdropped++;
> - }
> - }
> + /*
> + * Drop half the total fragments now. If more mbufs are needed,
> + * we will be called again soon.
> + */
> + ip_reass_drophalf();
>
> IPQ_UNLOCK();
> }
>
-- Jason R. Thorpe <thorpej@wasabisystems.com>
--Apple-Mail-18-706294300
content-type: application/pgp-signature; x-mac-type=70674453;
name=PGP.sig
content-description: This is a digitally signed message part
content-disposition: inline; filename=PGP.sig
content-transfer-encoding: 7bit
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.3 (Darwin)
iD8DBQE/25WbOpVKkaBm8XkRAnTIAJ4sSzKglxUh3ANEOQxG3lZJ2hcapACfa2PO
tkFpeeMS4SwgAJA5E+nX70Q=
=pbDz
-----END PGP SIGNATURE-----
--Apple-Mail-18-706294300--