Subject: Re: bin/31502: rpc.statd doesn't save failed notifies
To: None <gnats-bugs@netbsd.org, gnats-admin@netbsd.org,>
From: Christos Zoulas <christos@zoulas.com>
List: netbsd-bugs
Date: 10/07/2005 08:39:39
On Oct 7, 11:34am, xcc98be0c43465684@f4n.org (xcc98be0c43465684@f4n.org) wrote:
-- Subject: bin/31502: rpc.statd doesn't save failed notifies

Can you try this?

christos

Index: statd.c
===================================================================
RCS file: /cvsroot/src/usr.sbin/rpc.statd/statd.c,v
retrieving revision 1.23
diff -u -u -r1.23 statd.c
--- statd.c	14 Jan 2004 10:29:46 -0000	1.23
+++ statd.c	7 Oct 2005 12:39:10 -0000
@@ -459,54 +459,52 @@
 	time_t now = *(time_t *) ptr;
 	char *name = key->data;
 	DBT data;
+	int error;
 
 	if (hi->notifyReqd == 0 || hi->notifyReqd > now)
 		return 0;
 
-	if (notify_one_host(name)) {
-give_up:
+	/*
+	 * If one of the initial attempts fails, we wait
+	 * for a while and have another go.  This is necessary
+	 * because when we have crashed (e.g. a power outage),
+	 * it is quite possible that we won't be able to
+	 * contact all monitored hosts immediately on restart,
+	 * either because they crashed too and take longer
+	 * to come up (in which case the notification isn't
+	 * really required), or more importantly if some
+	 * router etc. needed to reach the monitored host
+	 * has not come back up yet.  In this case, we will
+	 * be a bit late in re-establishing locks (after the
+	 * grace period) but that is the best we can do.  We
+	 * try 10 times at 5 sec intervals, 10 more times at
+	 * 1 minute intervals, then 24 more times at hourly
+	 * intervals, finally giving up altogether if the
+	 * host hasn't come back to life after 24 hours.
+	 */
+	if (notify_one_host(name) || hi->attempts++ >= 44) {
+		error = 0;
 		hi->notifyReqd = 0;
 		hi->attempts = 0;
-		data.data = hi;
-		data.size = sizeof(*hi);
-		switch ((*db->put)(db, key, &data, 0)) {
-		case -1:
-			syslog(LOG_ERR, "Error storing %s (%m)", name);
-		case 0:
-			return 0;
-
-		default:
-			abort();
-		}
-	}
-	else {
-		/*
-		 * If one of the initial attempts fails, we wait
-		 * for a while and have another go.  This is necessary
-		 * because when we have crashed, (eg. a power outage)
-		 * it is quite possible that we won't be able to
-		 * contact all monitored hosts immediately on restart,
-		 * either because they crashed too and take longer
-		 * to come up (in which case the notification isn't
-		 * really required), or more importantly if some
-		 * router etc. needed to reach the monitored host
-		 * has not come back up yet.  In this case, we will
-		 * be a bit late in re-establishing locks (after the
-		 * grace period) but that is the best we can do.  We
-		 * try 10 times at 5 sec intervals, 10 more times at
-		 * 1 minute intervals, then 24 more times at hourly
-		 * intervals, finally giving up altogether if the
-		 * host hasn't come back to life after 24 hours.
-		 */
-		if (hi->attempts++ >= 44)
-			goto give_up;
-		else if (hi->attempts < 10)
+	} else {
+		error = -1;
+		if (hi->attempts < 10)
 			hi->notifyReqd += 5;
 		else if (hi->attempts < 20)
 			hi->notifyReqd += 60;
 		else
 			hi->notifyReqd += 60 * 60;
-		return -1;
+	}
+	data.data = hi;
+	data.size = sizeof(*hi);
+	switch ((*db->put)(db, key, &data, 0)) {
+	case -1:
+		syslog(LOG_ERR, "Error storing %s (%m)", name);
+	case 0:
+		return error;
+
+	default:
+		abort();
 	}
 }