tech-kern archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

xcall while cold == 1



 Hi.

 While debugging PR#52820 ("boot -1" panics on systems with ixgX interfaces),
I've noticed that xcall doesn't work while cold == 1.

 When I added softint_disestablish() near the end of ixgbe_attach(),
the following panic occurred:

panic: kernel diagnostic assertion "xc->xc_donep < xc->xc_headp" failed: file "../../../../kern/subr_xcall.c", line 278

This KASSERT is:

static inline uint64_t
xc_lowpri(xcfunc_t func, void *arg1, void *arg2, struct cpu_info *ci)
{
        xc_state_t *xc = &xc_low_pri;
        CPU_INFO_ITERATOR cii;
        uint64_t where;

        mutex_enter(&xc->xc_lock);
        while (xc->xc_headp != xc->xc_donep) {
                cv_wait(&xc->xc_busy, &xc->xc_lock);
        }
        xc->xc_arg1 = arg1;
        xc->xc_arg2 = arg2;
        xc->xc_func = func;
        if (ci == NULL) {
                xc_broadcast_ev.ev_count++;
                for (CPU_INFO_FOREACH(cii, ci)) {
                        if ((ci->ci_schedstate.spc_flags & SPCF_RUNNING) == 0)
                                continue;
                        xc->xc_headp += 1;
                        ci->ci_data.cpu_xcall_pending = true;
                        cv_signal(&ci->ci_data.cpu_xcall);
                }
        } else {
                xc_unicast_ev.ev_count++;
                xc->xc_headp += 1;
                ci->ci_data.cpu_xcall_pending = true;
                cv_signal(&ci->ci_data.cpu_xcall);
        }
        KASSERT(xc->xc_donep < xc->xc_headp);    <===== Here!
        where = xc->xc_headp;
        mutex_exit(&xc->xc_lock);

        /* Return a low priority ticket. */
        KASSERT((where & XC_PRI_BIT) == 0);
        return where;
}

So I added the following debug printf:
---------------------------------------------------------------------
@@ -252,6 +253,7 @@ xc_lowpri(xcfunc_t func, void *arg1, voi
        xc_state_t *xc = &xc_low_pri;
        CPU_INFO_ITERATOR cii;
        uint64_t where;
+       int i = -1;

        mutex_enter(&xc->xc_lock);
        while (xc->xc_headp != xc->xc_donep) {
@@ -263,8 +265,11 @@ xc_lowpri(xcfunc_t func, void *arg1, voi
        if (ci == NULL) {
                xc_broadcast_ev.ev_count++;
                for (CPU_INFO_FOREACH(cii, ci)) {
-                       if ((ci->ci_schedstate.spc_flags & SPCF_RUNNING) == 0)
+                       i++;
+                       if ((ci->ci_schedstate.spc_flags & SPCF_RUNNING) == 0) {
+                               printf("cpu %d: XXX not running\n", i);
                                continue;
+                       }
                        xc->xc_headp += 1;
                        ci->ci_data.cpu_xcall_pending = true;
                        cv_signal(&ci->ci_data.cpu_xcall);
@@ -275,6 +280,9 @@ xc_lowpri(xcfunc_t func, void *arg1, voi
                ci->ci_data.cpu_xcall_pending = true;
                cv_signal(&ci->ci_data.cpu_xcall);
        }
+       if (xc->xc_donep >= xc->xc_headp)
+               printf("XXX donep = %" PRIu64 ", headp = %" PRIu64
+                   ", ci = %p\n", xc->xc_donep, xc->xc_headp, ci);
        KASSERT(xc->xc_donep < xc->xc_headp);
        where = xc->xc_headp;
        mutex_exit(&xc->xc_lock);
-------------------------------------------------------------------------

The output says
cpu 0: XXX not running
cpu 1: XXX not running
cpu 2: XXX not running
cpu 3: XXX not running
XXX donep = 0, headp = 0, ci = 0x0
panic: kernel diagnostic assertion "xc->xc_donep < xc->xc_headp" failed: file "../../../../kern/subr_xcall.c", line 286

(Yes, the exact cause is not cold == 1 itself, but that SPCF_RUNNING is not set on any CPU.)


 Is this intended behavior? Is it possible to use xcall while
cold==1?

For softint_disestablish(), the following diff avoids the panic:
----------------------------------------
Index: kern_softint.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_softint.c,v
retrieving revision 1.44
diff -u -p -r1.44 kern_softint.c
--- kern_softint.c      22 Nov 2017 02:20:21 -0000      1.44
+++ kern_softint.c      25 Dec 2017 06:31:57 -0000
@@ -177,6 +177,7 @@ __KERNEL_RCSID(0, "$NetBSD: kern_softint
 #include <sys/intr.h>
 #include <sys/ipi.h>
 #include <sys/mutex.h>
+#include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/evcnt.h>
 #include <sys/cpu.h>
@@ -430,8 +431,10 @@ softint_disestablish(void *arg)
         * it again.  So, we are only looking for handler records with
         * SOFTINT_ACTIVE already set.
         */
-       where = xc_broadcast(0, (xcfunc_t)nullop, NULL, NULL);
-       xc_wait(where);
+       if (__predict_true(cold == 0)) {
+               where = xc_broadcast(0, (xcfunc_t)nullop, NULL, NULL);
+               xc_wait(where);
+       }

        for (;;) {
                /* Collect flag values from each CPU. */
----------------------------------------

 I don't know whether this is a good fix or not.

 One workaround is to not call softint_disestablish() in
xxx_attach() and to defer it using config_interrupts(), but
I think that's dirty.

 I'm now writing code for ixg(4) to fall back from MSI-X to
legacy interrupts when a resource shortage occurs. The shortage
may occur partway through establishing the MSI-X interrupts and
softints, so code to clean up the interrupts is required in
ixgbe_attach().

 Any advice? Of course, patches are welcome!

--
-----------------------------------------------
                SAITOH Masanobu (msaitoh%execsw.org@localhost
                                 msaitoh%netbsd.org@localhost)


Home | Main Index | Thread Index | Old Index