Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src/sys/arch/sparc - return early in xcall() if the function is ...



details:   https://anonhg.NetBSD.org/src/rev/c6dfd7fca9a7
branches:  trunk
changeset: 828201:c6dfd7fca9a7
user:      macallan <macallan%NetBSD.org@localhost>
date:      Sat Dec 02 00:48:04 2017 +0000

description:
- return early in xcall() if the function is sparc_noop() instead of triggering
  the IPI and then ignoring responses (or lack thereof)
- write the .tag field last to avoid a race when polling for an incoming
  IPI
- add event counters for IPIs being caught with the mutex not held, and for
  messages that are already marked as completed
With this, my SS20 made it through 48 hours of pkgsrc with MAKE_JOBS=3 and a
pair of SM81s.
Hypersparcs still crash, but instead of craziness we get actual error messages:
apparently one CPU will occasionally do a watchdog reset, which according to
the manual is caused by catching a trap with traps disabled. Now to figure
out how that can even happen...

diffstat:

 sys/arch/sparc/include/cpu.h |   6 ++++--
 sys/arch/sparc/sparc/cpu.c   |  24 ++++++++++++++++++------
 sys/arch/sparc/sparc/intr.c  |  29 ++++++++++++++++++++++++++---
 3 files changed, 48 insertions(+), 11 deletions(-)

diffs (178 lines):

diff -r 53bd1cda0334 -r c6dfd7fca9a7 sys/arch/sparc/include/cpu.h
--- a/sys/arch/sparc/include/cpu.h      Sat Dec 02 00:00:57 2017 +0000
+++ b/sys/arch/sparc/include/cpu.h      Sat Dec 02 00:48:04 2017 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: cpu.h,v 1.98 2017/06/16 18:17:42 jdolecek Exp $ */
+/*     $NetBSD: cpu.h,v 1.99 2017/12/02 00:48:04 macallan Exp $ */
 
 /*
  * Copyright (c) 1992, 1993
@@ -164,7 +164,7 @@
         * the pending register to avoid a hardware bug.
         */
 #define raise_ipi(cpi,lvl)     do {                    \
-       int x;                                          \
+       volatile int x;                                         \
        (cpi)->intreg_4m->pi_set = PINTR_SINTRLEV(lvl); \
        x = (cpi)->intreg_4m->pi_pend; __USE(x);        \
 } while (0)
@@ -333,6 +333,8 @@
        struct evcnt ci_savefpstate_null;
        struct evcnt ci_xpmsg_mutex_fail;
        struct evcnt ci_xpmsg_mutex_fail_call;
+       struct evcnt ci_xpmsg_mutex_not_held;
+       struct evcnt ci_xpmsg_bogus;
        struct evcnt ci_intrcnt[16];
        struct evcnt ci_sintrcnt[16];
 };
diff -r 53bd1cda0334 -r c6dfd7fca9a7 sys/arch/sparc/sparc/cpu.c
--- a/sys/arch/sparc/sparc/cpu.c        Sat Dec 02 00:00:57 2017 +0000
+++ b/sys/arch/sparc/sparc/cpu.c        Sat Dec 02 00:48:04 2017 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: cpu.c,v 1.249 2017/01/18 21:33:25 macallan Exp $ */
+/*     $NetBSD: cpu.c,v 1.250 2017/12/02 00:48:05 macallan Exp $ */
 
 /*
  * Copyright (c) 1996
@@ -52,7 +52,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.249 2017/01/18 21:33:25 macallan Exp $");
+__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.250 2017/12/02 00:48:05 macallan Exp $");
 
 #include "opt_multiprocessor.h"
 #include "opt_lockdebug.h"
@@ -183,7 +183,7 @@
  * This must be locked around all message transactions to ensure only
  * one CPU is generating them.
  */
-static kmutex_t xpmsg_mutex;
+kmutex_t xpmsg_mutex;
 
 #endif /* MULTIPROCESSOR */
 
@@ -367,6 +367,10 @@
                             NULL, cpu_name(cpi), "IPI mutex_trylock fail");
        evcnt_attach_dynamic(&cpi->ci_xpmsg_mutex_fail_call, EVCNT_TYPE_MISC,
                             NULL, cpu_name(cpi), "IPI mutex_trylock fail/call");
+       evcnt_attach_dynamic(&cpi->ci_xpmsg_mutex_not_held, EVCNT_TYPE_MISC,
+                            NULL, cpu_name(cpi), "IPI with mutex not held");
+       evcnt_attach_dynamic(&cpi->ci_xpmsg_bogus, EVCNT_TYPE_MISC,
+                            NULL, cpu_name(cpi), "bogus IPI");
 
        /*
         * These are the per-cpu per-IPL hard & soft interrupt counters.
@@ -653,6 +657,8 @@
        char *bufp = errbuf;
        size_t bufsz = sizeof errbuf, wrsz;
 
+       if (is_noop) return;
+
        mybit = (1 << cpuinfo.ci_cpuid);
        callself = func && (cpuset & mybit) != 0;
        cpuset &= ~mybit;
@@ -714,7 +720,10 @@
                if ((cpuset & (1 << n)) == 0)
                        continue;
 
-               cpi->msg.tag = XPMSG_FUNC;
+               /*
+                * Write msg.tag last - if another CPU is polling above it may
+                * end up seeing an incomplete message. Not likely but still.
+                */ 
                cpi->msg.complete = 0;
                p = &cpi->msg.u.xpmsg_func;
                p->func = func;
@@ -722,6 +731,9 @@
                p->arg0 = arg0;
                p->arg1 = arg1;
                p->arg2 = arg2;
+               __insn_barrier();
+               cpi->msg.tag = XPMSG_FUNC;
+               __insn_barrier();
                /* Fast cross calls use interrupt level 14 */
                raise_ipi(cpi,13+fasttrap);/*xcall_cookie->pil*/
        }
@@ -737,7 +749,7 @@
         * have completed (bailing if it takes "too long", being loud about
         * this in the process).
         */
-       done = is_noop;
+       done = 0;
        i = 1000000;    /* time-out, not too long, but still an _AGE_ */
        while (!done) {
                if (--i < 0) {
@@ -774,7 +786,7 @@
 
        if (i >= 0 || debug_xcall == 0) {
                if (i < 0)
-                       printf_nolog("%s\n", errbuf);
+                       aprint_error("%s\n", errbuf);
                mutex_spin_exit(&xpmsg_mutex);
                return;
        }
diff -r 53bd1cda0334 -r c6dfd7fca9a7 sys/arch/sparc/sparc/intr.c
--- a/sys/arch/sparc/sparc/intr.c       Sat Dec 02 00:00:57 2017 +0000
+++ b/sys/arch/sparc/sparc/intr.c       Sat Dec 02 00:48:04 2017 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: intr.c,v 1.118 2013/11/16 23:54:01 mrg Exp $ */
+/*     $NetBSD: intr.c,v 1.119 2017/12/02 00:48:05 macallan Exp $ */
 
 /*
  * Copyright (c) 1992, 1993
@@ -41,7 +41,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: intr.c,v 1.118 2013/11/16 23:54:01 mrg Exp $");
+__KERNEL_RCSID(0, "$NetBSD: intr.c,v 1.119 2017/12/02 00:48:05 macallan Exp $");
 
 #include "opt_multiprocessor.h"
 #include "opt_sparc_arch.h"
@@ -76,6 +76,8 @@
 void *xcall_cookie;
 #endif
 
+extern kmutex_t xpmsg_mutex;
+
 void   strayintr(struct clockframe *);
 #ifdef DIAGNOSTIC
 void   bogusintr(struct clockframe *);
@@ -241,7 +243,7 @@
                        DELAY(1);
                        if (n-- > 0)
                                continue;
-                       printf("nmi_hard: SMP botch.");
+                       printf("nmi_hard: SMP botch.\n");
                        break;
                }
        }
@@ -364,6 +366,27 @@
        if (v != xcallintr)
                cpuinfo.ci_sintrcnt[13].ev_count++;
 
+       if (mutex_owned(&xpmsg_mutex) == 0) {
+               cpuinfo.ci_xpmsg_mutex_not_held.ev_count++;
+#ifdef DEBUG
+               printf("%s: mutex not held\n", __func__);
+#endif
+               cpuinfo.msg.complete = 1;
+               kpreempt_enable();
+               return;
+       }
+
+       if (cpuinfo.msg.complete != 0) {
+               cpuinfo.ci_xpmsg_bogus.ev_count++;
+#ifdef DEBUG
+               volatile struct xpmsg_func *p = &cpuinfo.msg.u.xpmsg_func;
+               printf("%s: bogus message %08x %08x %08x %08x\n", __func__,
+                   cpuinfo.msg.tag, (uint32_t)p->func, p->arg0, p->arg1);
+#endif
+               kpreempt_enable();
+               return;
+       }
+
        /* notyet - cpuinfo.msg.received = 1; */
        switch (cpuinfo.msg.tag) {
        case XPMSG_FUNC:



Home | Main Index | Thread Index | Old Index