Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src/usr.sbin/ypbind Instead of using magic numbers in what looks...



details:   https://anonhg.NetBSD.org/src/rev/796b3e4dd37f
branches:  trunk
changeset: 329813:796b3e4dd37f
user:      dholland <dholland%NetBSD.org@localhost>
date:      Tue Jun 10 17:19:22 2014 +0000

description:
Instead of using magic numbers in what looks like a boolean
(dom_alive), create a state enumeration (domainstates) and use it
instead.

Instead of three states (new, alive, and, effectively, 'troubled') go
to five: new, alive, pinging, lost, and dead.

Domains start in the NEW state. When we get a reply from a server, the
state goes to ALIVE. The state is set to PINGING when we ping the
server (once a minute normally) and if the ping times out, it goes to
LOST. If we stay lost for a minute, go to DEAD, and in DEAD, do
exponential backoff of nag_servers calls.

Getting rid of the broken logic attached to the 'troubled' state fixes
PR 15355 (ypbind defeats disk idle spindown) -- it will now only
rewrite the binding file when the binding changes.

Also, fix the HEURISTIC code so it doesn't trigger except in ALIVE
state. I think this was the source of a lot of the spamming behavior
seen in PR 32519, which is now fixed.

Might also fix PR 23135 (broadcast ypbind sometimes fails to find
servers).

diffstat:

 usr.sbin/ypbind/ypbind.c |  234 +++++++++++++++++++++++++++++++++++++---------
 1 files changed, 187 insertions(+), 47 deletions(-)

diffs (truncated from 379 to 300 lines):

diff -r 7f5739afc905 -r 796b3e4dd37f usr.sbin/ypbind/ypbind.c
--- a/usr.sbin/ypbind/ypbind.c  Tue Jun 10 17:19:12 2014 +0000
+++ b/usr.sbin/ypbind/ypbind.c  Tue Jun 10 17:19:22 2014 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: ypbind.c,v 1.95 2014/06/10 17:19:12 dholland Exp $     */
+/*     $NetBSD: ypbind.c,v 1.96 2014/06/10 17:19:22 dholland Exp $     */
 
 /*
  * Copyright (c) 1992, 1993 Theo de Raadt <deraadt%fsa.ca@localhost>
@@ -28,7 +28,7 @@
 
 #include <sys/cdefs.h>
 #ifndef LINT
-__RCSID("$NetBSD: ypbind.c,v 1.95 2014/06/10 17:19:12 dholland Exp $");
+__RCSID("$NetBSD: ypbind.c,v 1.96 2014/06/10 17:19:22 dholland Exp $");
 #endif
 
 #include <sys/types.h>
@@ -84,16 +84,26 @@
        YPBIND_DIRECT, YPBIND_BROADCAST,
 } ypbind_mode_t;
 
+enum domainstates {
+       DOM_NEW,                /* not yet bound */
+       DOM_ALIVE,              /* bound and healthy */
+       DOM_PINGING,            /* ping outstanding */
+       DOM_LOST,               /* binding timed out, looking for a new one */
+       DOM_DEAD,               /* long-term lost, in exponential backoff */
+};
+
 struct domain {
        struct domain *dom_next;
 
        char dom_name[YPMAXDOMAIN + 1];
        struct sockaddr_in dom_server_addr;
        long dom_vers;
-       time_t dom_checktime;
-       time_t dom_asktime;
+       time_t dom_checktime;           /* time of next check/contact */
+       time_t dom_asktime;             /* time we were last DOMAIN'd */
+       time_t dom_losttime;            /* time the binding was lost, or 0 */
+       unsigned dom_backofftime;       /* current backoff period, when DEAD */
        int dom_lockfd;
-       int dom_alive;
+       enum domainstates dom_state;
        uint32_t dom_xid;
        FILE *dom_serversfile;          /* /var/yp/binding/foo.ypservers */
        int dom_been_ypset;             /* ypset been done on this domain? */
@@ -145,6 +155,39 @@
        return fd;
 }
 
+/*
+ * Exponential backoff for pinging servers for a dead domain.
+ *
+ * We go 10 -> 20 -> 40 -> 60 seconds, then 2 -> 4 -> 8 -> 15 -> 30 ->
+ * 60 minutes, and stay at 60 minutes. This is overengineered.
+ *
+ * With a 60 minute max backoff the response time for when things come
+ * back is not awful, but we only try (and log) about 60 times even if
+ * things are down for a whole long weekend. This is an acceptable log
+ * load, I think.
+ */
+static void
+backoff(unsigned *psecs)
+{
+       unsigned secs;
+
+       secs = *psecs;
+       if (secs < 60) {
+               secs *= 2;
+               if (secs > 60) {
+                       secs = 60;
+               }
+       } else if (secs < 60 * 15) {
+               secs *= 2;
+               if (secs > 60 * 15) {
+                       secs = 60 * 15;
+               }
+       } else if (secs < 60 * 60) {
+               secs *= 2;
+       }
+       *psecs = secs;
+}
+
 ////////////////////////////////////////////////////////////
 // logging
 
@@ -198,14 +241,28 @@
 // struct domain
 
 /*
- * State transition is done like this: 
+ * The state transitions of a domain work as follows:
+ *
+ * in state NEW:
+ *    nag_servers every 5 seconds
+ *    upon answer, state is ALIVE
+ *
+ * in state ALIVE:
+ *    every 60 seconds, send ping and switch to state PINGING
  *
- * STATE       EVENT           ACTION                  NEWSTATE        TIMEOUT
- * no binding  timeout         broadcast               no binding      5 sec
- * no binding  answer          --                      binding         60 sec
- * binding     timeout         ping server             checking        5 sec
- * checking    timeout         ping server + broadcast checking        5 sec
- * checking    answer          --                      binding         60 sec
+ * in state PINGING:
+ *    upon answer, go to state ALIVE
+ *    if no answer in 5 seconds, go to state LOST and do nag_servers
+ *
+ * in state LOST:
+ *    do nag_servers every 5 seconds
+ *    upon answer, go to state ALIVE
+ *    if no answer in 60 seconds, go to state DEAD
+ *
+ * in state DEAD
+ *    do nag_servers every backofftime seconds (starts at 10)
+ *    upon answer go to state ALIVE
+ *    backofftime doubles (approximately) each try, with a cap of 1 hour
  */
 
 /*
@@ -263,8 +320,10 @@
        dom->dom_vers = YPVERS;
        dom->dom_checktime = 0;
        dom->dom_asktime = 0;
+       dom->dom_losttime = 0;
+       dom->dom_backofftime = 10;
        dom->dom_lockfd = -1;
-       dom->dom_alive = 0;
+       dom->dom_state = DOM_NEW;
        dom->dom_xid = unique_xid(dom);
        dom->dom_been_ypset = 0;
        dom->dom_serversfile = NULL;
@@ -456,36 +515,71 @@
        }
 
        /*
-        * If the domain is alive and we aren't ypset, we don't need
-        * to do anything.
-        *
-        * This code is unreachable (AFAIK) unless we receive an
-        * unsolicited ping reply from the ypserver: because dom_alive
-        * is 0 until we have a binding, but set from 1 to 2 when we
-        * ping, it will never normally be 1 when a reply comes in,
-        * even a reply to a ping. In the case where we lost the
-        * binding and are getting a reply arising from nag_servers,
-        * we lost the binding because we never got a ping response so
-        * in that case dom_alive will also be 2.
-        *
-        * This logic is clearly wrong. XXX.
+        * If the domain is alive and we aren't being called by ypset,
+        * we shouldn't be getting a response at all. Log it, as it
+        * might be hostile.
         */
-       if (dom->dom_alive == 1 && force == 0) {
+       if (dom->dom_state == DOM_ALIVE && force == 0) {
+               if (!memcmp(&dom->dom_server_addr, raddrp,
+                           sizeof(dom->dom_server_addr))) {
+                       yp_log(LOG_WARNING,
+                              "Unexpected reply from server %s for domain %s",
+                              inet_ntoa(dom->dom_server_addr.sin_addr),
+                              dom->dom_name);
+               } else {
+                       yp_log(LOG_WARNING,
+                              "Falsified reply from %s for domain %s",
+                              inet_ntoa(dom->dom_server_addr.sin_addr),
+                              dom->dom_name);
+               }
+               return;
+       }
+
+       /*
+        * If we're expected a ping response, and we've got it
+        * (meaning we aren't being called by ypset), we don't need to
+        * do anything.
+        */
+       if (dom->dom_state == DOM_PINGING && force == 0) {
                /*
                 * If the reply came from the server we expect, set
-                * dom_alive back to 1 and ping again in 60 seconds.
+                * dom_state back to ALIVE and ping again in 60
+                * seconds.
                 *
-                * If it came from somewhere else, ignore it.
+                * If it came from somewhere else, log it.
                 */
                if (!memcmp(&dom->dom_server_addr, raddrp,
                            sizeof(dom->dom_server_addr))) {
-                       dom->dom_alive = 1;
+                       dom->dom_state = DOM_ALIVE;
                        /* recheck binding in 60 sec */
                        dom->dom_checktime = time(NULL) + 60;
+               } else {
+                       yp_log(LOG_WARNING,
+                              "Falsified reply from %s for domain %s",
+                              inet_ntoa(dom->dom_server_addr.sin_addr),
+                              dom->dom_name);
                }
                return;
        }
 
+#ifdef HEURISTIC
+       /*
+        * If transitioning to the alive state from a non-alive state,
+        * clear dom_asktime. This will help prevent any requests that
+        * are still coming in from triggering unnecessary pings via
+        * the HEURISTIC code.
+        *
+        * XXX: this may not be an adequate measure; we may need to
+        * keep more state so we can disable the HEURISTIC code for
+        * the first few seconds after rebinding.
+        */
+       if (dom->dom_state == DOM_NEW ||
+           dom->dom_state == DOM_LOST ||
+           dom->dom_state == DOM_DEAD) {
+               dom->dom_asktime = 0;
+       }
+#endif
+
        /*
         * Take the address we got the message from (or in the case of
         * ypset, the explicit address we were given) as the server
@@ -514,8 +608,7 @@
         *
         * 3. Either way we should not accept a response from an
         * arbitrary host unless we don't currently have a binding.
-        * (The logic in the previous if statement is probably
-        * supposed to handle this, but it doesn't currently work.)
+        * (This is now fixed above.)
         *
         * Note that for a random unsolicited reply to work it has to
         * carry the XID of one of the domains we know about; but
@@ -525,7 +618,11 @@
            sizeof(dom->dom_server_addr));
        /* recheck binding in 60 seconds */
        dom->dom_checktime = time(NULL) + 60;
-       dom->dom_alive = 1;
+       dom->dom_state = DOM_ALIVE;
+
+       /* Clear the dead/backoff state. */
+       dom->dom_losttime = 0;
+       dom->dom_backofftime = 10;
 
        /*
         * Generate a new binding file. If this fails, forget about it.
@@ -641,8 +738,8 @@
                return NULL;
        }
 
-       if (dom->dom_alive == 0) {
-               DPRINTF("dead domain %s\n", arg);
+       if (dom->dom_state == DOM_NEW) {
+               DPRINTF("new domain %s\n", arg);
                return NULL;
        }
 
@@ -655,9 +752,16 @@
         * elsewhere uses the binding file.
         *
         * Note: HEURISTIC is enabled by default.
+        *
+        * dholland 20140609: I think this is part of the mechanism
+        * that causes ypbind to spam. I'm changing this logic so it
+        * only triggers when the state is DOM_ALIVE: if the domain
+        * is new, lost, or dead we shouldn't send more requests than
+        * the ones already scheduled, and if we're already in the
+        * middle of pinging there's no point doing it again.
         */
        (void)time(&now);
-       if (now < dom->dom_asktime + 5) {
+       if (dom->dom_state == DOM_ALIVE && now < dom->dom_asktime + 5) {
                /*
                 * Hmm. More than 2 requests in 5 seconds have indicated
                 * that my binding is possibly incorrect.
@@ -1263,7 +1367,7 @@
                removelock(dom);
        }
 
-       if (dom->dom_alive == 2) {
+       if (dom->dom_state == DOM_PINGING || dom->dom_state == DOM_LOST) {
                /*
                 * This resolves the following situation:
                 * ypserver on other subnet was once bound,
@@ -1300,8 +1404,6 @@
 
 /*
  * Send a ping message to a domain's current ypserver.
- *
- * Note that dom_alive is 2 while waiting for the response.
  */
 static int
 ping(struct domain *dom)
@@ -1352,7 +1454,6 @@
        }
        AUTH_DESTROY(rpcua);
 
-       dom->dom_alive = 2;



Home | Main Index | Thread Index | Old Index