Source-Changes-HG archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[src/trunk]: src/usr.sbin/ypbind Instead of using magic numbers in what looks...
details: https://anonhg.NetBSD.org/src/rev/796b3e4dd37f
branches: trunk
changeset: 329813:796b3e4dd37f
user: dholland <dholland%NetBSD.org@localhost>
date: Tue Jun 10 17:19:22 2014 +0000
description:
Instead of using magic numbers in what looks like a boolean
(dom_alive), create a state enumeration (domainstates) and use it
instead.
Instead of three states (new, alive, and, effectively, 'troubled') go
to five: new, alive, pinging, lost, and dead.
Domains start in the NEW state. When we get a reply from a server, the
state goes to ALIVE. The state is set to PINGING when we ping the
server (once a minute normally) and if the ping times out, it goes to
LOST. If we stay lost for a minute, go to DEAD, and in DEAD, do
exponential backoff of nag_servers calls.
Getting rid of the broken logic attached to the 'troubled' state fixes
PR 15355 (ypbind defeats disk idle spindown) -- it will now only
rewrite the binding file when the binding changes.
Also, fix the HEURISTIC code so it doesn't trigger except in ALIVE
state. I think this was the source of a lot of the spamming behavior
seen in PR 32519, which is now fixed.
Might also fix PR 23135 (broadcast ypbind sometimes fails to find
servers).
diffstat:
usr.sbin/ypbind/ypbind.c | 234 +++++++++++++++++++++++++++++++++++++---------
1 files changed, 187 insertions(+), 47 deletions(-)
diffs (truncated from 379 to 300 lines):
diff -r 7f5739afc905 -r 796b3e4dd37f usr.sbin/ypbind/ypbind.c
--- a/usr.sbin/ypbind/ypbind.c Tue Jun 10 17:19:12 2014 +0000
+++ b/usr.sbin/ypbind/ypbind.c Tue Jun 10 17:19:22 2014 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: ypbind.c,v 1.95 2014/06/10 17:19:12 dholland Exp $ */
+/* $NetBSD: ypbind.c,v 1.96 2014/06/10 17:19:22 dholland Exp $ */
/*
* Copyright (c) 1992, 1993 Theo de Raadt <deraadt%fsa.ca@localhost>
@@ -28,7 +28,7 @@
#include <sys/cdefs.h>
#ifndef LINT
-__RCSID("$NetBSD: ypbind.c,v 1.95 2014/06/10 17:19:12 dholland Exp $");
+__RCSID("$NetBSD: ypbind.c,v 1.96 2014/06/10 17:19:22 dholland Exp $");
#endif
#include <sys/types.h>
@@ -84,16 +84,26 @@
YPBIND_DIRECT, YPBIND_BROADCAST,
} ypbind_mode_t;
+enum domainstates {
+ DOM_NEW, /* not yet bound */
+ DOM_ALIVE, /* bound and healthy */
+ DOM_PINGING, /* ping outstanding */
+ DOM_LOST, /* binding timed out, looking for a new one */
+ DOM_DEAD, /* long-term lost, in exponential backoff */
+};
+
struct domain {
struct domain *dom_next;
char dom_name[YPMAXDOMAIN + 1];
struct sockaddr_in dom_server_addr;
long dom_vers;
- time_t dom_checktime;
- time_t dom_asktime;
+ time_t dom_checktime; /* time of next check/contact */
+ time_t dom_asktime; /* time we were last DOMAIN'd */
+ time_t dom_losttime; /* time the binding was lost, or 0 */
+ unsigned dom_backofftime; /* current backoff period, when DEAD */
int dom_lockfd;
- int dom_alive;
+ enum domainstates dom_state;
uint32_t dom_xid;
FILE *dom_serversfile; /* /var/yp/binding/foo.ypservers */
int dom_been_ypset; /* ypset been done on this domain? */
@@ -145,6 +155,39 @@
return fd;
}
+/*
+ * Exponential backoff for pinging servers for a dead domain.
+ *
+ * We go 10 -> 20 -> 40 -> 60 seconds, then 2 -> 4 -> 8 -> 15 -> 30 ->
+ * 60 minutes, and stay at 60 minutes. This is overengineered.
+ *
+ * With a 60 minute max backoff the response time for when things come
+ * back is not awful, but we only try (and log) about 60 times even if
+ * things are down for a whole long weekend. This is an acceptable log
+ * load, I think.
+ */
+static void
+backoff(unsigned *psecs)
+{
+ unsigned secs;
+
+ secs = *psecs;
+ if (secs < 60) {
+ secs *= 2;
+ if (secs > 60) {
+ secs = 60;
+ }
+ } else if (secs < 60 * 15) {
+ secs *= 2;
+ if (secs > 60 * 15) {
+ secs = 60 * 15;
+ }
+ } else if (secs < 60 * 60) {
+ secs *= 2;
+ }
+ *psecs = secs;
+}
+
////////////////////////////////////////////////////////////
// logging
@@ -198,14 +241,28 @@
// struct domain
/*
- * State transition is done like this:
+ * The state transitions of a domain work as follows:
+ *
+ * in state NEW:
+ * nag_servers every 5 seconds
+ * upon answer, state is ALIVE
+ *
+ * in state ALIVE:
+ * every 60 seconds, send ping and switch to state PINGING
*
- * STATE EVENT ACTION NEWSTATE TIMEOUT
- * no binding timeout broadcast no binding 5 sec
- * no binding answer -- binding 60 sec
- * binding timeout ping server checking 5 sec
- * checking timeout ping server + broadcast checking 5 sec
- * checking answer -- binding 60 sec
+ * in state PINGING:
+ * upon answer, go to state ALIVE
+ * if no answer in 5 seconds, go to state LOST and do nag_servers
+ *
+ * in state LOST:
+ * do nag_servers every 5 seconds
+ * upon answer, go to state ALIVE
+ * if no answer in 60 seconds, go to state DEAD
+ *
+ * in state DEAD
+ * do nag_servers every backofftime seconds (starts at 10)
+ * upon answer go to state ALIVE
+ * backofftime doubles (approximately) each try, with a cap of 1 hour
*/
/*
@@ -263,8 +320,10 @@
dom->dom_vers = YPVERS;
dom->dom_checktime = 0;
dom->dom_asktime = 0;
+ dom->dom_losttime = 0;
+ dom->dom_backofftime = 10;
dom->dom_lockfd = -1;
- dom->dom_alive = 0;
+ dom->dom_state = DOM_NEW;
dom->dom_xid = unique_xid(dom);
dom->dom_been_ypset = 0;
dom->dom_serversfile = NULL;
@@ -456,36 +515,71 @@
}
/*
- * If the domain is alive and we aren't ypset, we don't need
- * to do anything.
- *
- * This code is unreachable (AFAIK) unless we receive an
- * unsolicited ping reply from the ypserver: because dom_alive
- * is 0 until we have a binding, but set from 1 to 2 when we
- * ping, it will never normally be 1 when a reply comes in,
- * even a reply to a ping. In the case where we lost the
- * binding and are getting a reply arising from nag_servers,
- * we lost the binding because we never got a ping response so
- * in that case dom_alive will also be 2.
- *
- * This logic is clearly wrong. XXX.
+ * If the domain is alive and we aren't being called by ypset,
+ * we shouldn't be getting a response at all. Log it, as it
+ * might be hostile.
*/
- if (dom->dom_alive == 1 && force == 0) {
+ if (dom->dom_state == DOM_ALIVE && force == 0) {
+ if (!memcmp(&dom->dom_server_addr, raddrp,
+ sizeof(dom->dom_server_addr))) {
+ yp_log(LOG_WARNING,
+ "Unexpected reply from server %s for domain %s",
+ inet_ntoa(dom->dom_server_addr.sin_addr),
+ dom->dom_name);
+ } else {
+ yp_log(LOG_WARNING,
+ "Falsified reply from %s for domain %s",
+ inet_ntoa(dom->dom_server_addr.sin_addr),
+ dom->dom_name);
+ }
+ return;
+ }
+
+ /*
+ * If we're expected a ping response, and we've got it
+ * (meaning we aren't being called by ypset), we don't need to
+ * do anything.
+ */
+ if (dom->dom_state == DOM_PINGING && force == 0) {
/*
* If the reply came from the server we expect, set
- * dom_alive back to 1 and ping again in 60 seconds.
+ * dom_state back to ALIVE and ping again in 60
+ * seconds.
*
- * If it came from somewhere else, ignore it.
+ * If it came from somewhere else, log it.
*/
if (!memcmp(&dom->dom_server_addr, raddrp,
sizeof(dom->dom_server_addr))) {
- dom->dom_alive = 1;
+ dom->dom_state = DOM_ALIVE;
/* recheck binding in 60 sec */
dom->dom_checktime = time(NULL) + 60;
+ } else {
+ yp_log(LOG_WARNING,
+ "Falsified reply from %s for domain %s",
+ inet_ntoa(dom->dom_server_addr.sin_addr),
+ dom->dom_name);
}
return;
}
+#ifdef HEURISTIC
+ /*
+ * If transitioning to the alive state from a non-alive state,
+ * clear dom_asktime. This will help prevent any requests that
+ * are still coming in from triggering unnecessary pings via
+ * the HEURISTIC code.
+ *
+ * XXX: this may not be an adequate measure; we may need to
+ * keep more state so we can disable the HEURISTIC code for
+ * the first few seconds after rebinding.
+ */
+ if (dom->dom_state == DOM_NEW ||
+ dom->dom_state == DOM_LOST ||
+ dom->dom_state == DOM_DEAD) {
+ dom->dom_asktime = 0;
+ }
+#endif
+
/*
* Take the address we got the message from (or in the case of
* ypset, the explicit address we were given) as the server
@@ -514,8 +608,7 @@
*
* 3. Either way we should not accept a response from an
* arbitrary host unless we don't currently have a binding.
- * (The logic in the previous if statement is probably
- * supposed to handle this, but it doesn't currently work.)
+ * (This is now fixed above.)
*
* Note that for a random unsolicited reply to work it has to
* carry the XID of one of the domains we know about; but
@@ -525,7 +618,11 @@
sizeof(dom->dom_server_addr));
/* recheck binding in 60 seconds */
dom->dom_checktime = time(NULL) + 60;
- dom->dom_alive = 1;
+ dom->dom_state = DOM_ALIVE;
+
+ /* Clear the dead/backoff state. */
+ dom->dom_losttime = 0;
+ dom->dom_backofftime = 10;
/*
* Generate a new binding file. If this fails, forget about it.
@@ -641,8 +738,8 @@
return NULL;
}
- if (dom->dom_alive == 0) {
- DPRINTF("dead domain %s\n", arg);
+ if (dom->dom_state == DOM_NEW) {
+ DPRINTF("new domain %s\n", arg);
return NULL;
}
@@ -655,9 +752,16 @@
* elsewhere uses the binding file.
*
* Note: HEURISTIC is enabled by default.
+ *
+ * dholland 20140609: I think this is part of the mechanism
+ * that causes ypbind to spam. I'm changing this logic so it
+ * only triggers when the state is DOM_ALIVE: if the domain
+ * is new, lost, or dead we shouldn't send more requests than
+ * the ones already scheduled, and if we're already in the
+ * middle of pinging there's no point doing it again.
*/
(void)time(&now);
- if (now < dom->dom_asktime + 5) {
+ if (dom->dom_state == DOM_ALIVE && now < dom->dom_asktime + 5) {
/*
* Hmm. More than 2 requests in 5 seconds have indicated
* that my binding is possibly incorrect.
@@ -1263,7 +1367,7 @@
removelock(dom);
}
- if (dom->dom_alive == 2) {
+ if (dom->dom_state == DOM_PINGING || dom->dom_state == DOM_LOST) {
/*
* This resolves the following situation:
* ypserver on other subnet was once bound,
@@ -1300,8 +1404,6 @@
/*
* Send a ping message to a domain's current ypserver.
- *
- * Note that dom_alive is 2 while waiting for the response.
*/
static int
ping(struct domain *dom)
@@ -1352,7 +1454,6 @@
}
AUTH_DESTROY(rpcua);
- dom->dom_alive = 2;
Home |
Main Index |
Thread Index |
Old Index