Subject: Re: bin/31502: rpc.statd doesn't save failed notifies
To: None <gnats-bugs@netbsd.org, gnats-admin@netbsd.org>
From: Christos Zoulas <christos@zoulas.com>
List: netbsd-bugs
Date: 10/07/2005 08:39:39
On Oct 7, 11:34am, xcc98be0c43465684@f4n.org (xcc98be0c43465684@f4n.org) wrote:
-- Subject: bin/31502: rpc.statd doesn't save failed notifies
Can you try this?
christos
Index: statd.c
===================================================================
RCS file: /cvsroot/src/usr.sbin/rpc.statd/statd.c,v
retrieving revision 1.23
diff -u -u -r1.23 statd.c
--- statd.c 14 Jan 2004 10:29:46 -0000 1.23
+++ statd.c 7 Oct 2005 12:39:10 -0000
@@ -459,54 +459,52 @@
time_t now = *(time_t *) ptr;
char *name = key->data;
DBT data;
+ int error;
if (hi->notifyReqd == 0 || hi->notifyReqd > now)
return 0;
- if (notify_one_host(name)) {
-give_up:
+ /*
+ * If one of the initial attempts fails, we wait
+ * for a while and have another go. This is necessary
+ * because when we have crashed, (eg. a power outage)
+ * it is quite possible that we won't be able to
+ * contact all monitored hosts immediately on restart,
+ * either because they crashed too and take longer
+ * to come up (in which case the notification isn't
+ * really required), or more importantly if some
+ * router etc. needed to reach the monitored host
+ * has not come back up yet. In this case, we will
+ * be a bit late in re-establishing locks (after the
+ * grace period) but that is the best we can do. We
+ * try 10 times at 5 sec intervals, 10 more times at
+ * 1 minute intervals, then 24 more times at hourly
+ * intervals, finally giving up altogether if the
+ * host hasn't come back to life after 24 hours.
+ */
+ if (notify_one_host(name) || hi->attempts++ >= 44) {
+ error = 0;
hi->notifyReqd = 0;
hi->attempts = 0;
- data.data = hi;
- data.size = sizeof(*hi);
- switch ((*db->put)(db, key, &data, 0)) {
- case -1:
- syslog(LOG_ERR, "Error storing %s (%m)", name);
- case 0:
- return 0;
-
- default:
- abort();
- }
- }
- else {
- /*
- * If one of the initial attempts fails, we wait
- * for a while and have another go. This is necessary
- * because when we have crashed, (eg. a power outage)
- * it is quite possible that we won't be able to
- * contact all monitored hosts immediately on restart,
- * either because they crashed too and take longer
- * to come up (in which case the notification isn't
- * really required), or more importantly if some
- * router etc. needed to reach the monitored host
- * has not come back up yet. In this case, we will
- * be a bit late in re-establishing locks (after the
- * grace period) but that is the best we can do. We
- * try 10 times at 5 sec intervals, 10 more times at
- * 1 minute intervals, then 24 more times at hourly
- * intervals, finally giving up altogether if the
- * host hasn't come back to life after 24 hours.
- */
- if (hi->attempts++ >= 44)
- goto give_up;
- else if (hi->attempts < 10)
+ } else {
+ error = -1;
+ if (hi->attempts < 10)
hi->notifyReqd += 5;
else if (hi->attempts < 20)
hi->notifyReqd += 60;
else
hi->notifyReqd += 60 * 60;
- return -1;
+ }
+ data.data = hi;
+ data.size = sizeof(*hi);
+ switch ((*db->put)(db, key, &data, 0)) {
+ case -1:
+ syslog(LOG_ERR, "Error storing %s (%m)", name);
+ case 0:
+ return error;
+
+ default:
+ abort();
}
}