tech-kern archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
xcall while cold == 1
Hi.
While debugging PR#52820 ("boot -1" panics on systems with ixgX interfaces),
I've noticed that xcall doesn't work while cold == 1.
When I added a softint_disestablish() call near the end of ixgbe_attach(),
the following panic occurred:
panic: kernel diagnostic assertion "xc->xc_donep < xc->xc_headp" failed: file "../../../../kern/subr_xcall.c", line 278
This KASSERT is:
/*
 * xc_lowpri: post a low priority cross-call request.
 *
 * Waits on xc_busy until xc_headp == xc_donep, stores func/arg1/arg2 in
 * the shared low priority state, then marks and signals the target CPUs:
 * every CPU with SPCF_RUNNING set when ci == NULL, otherwise only ci.
 * Returns the updated xc_headp value as the ticket.
 */
static inline uint64_t
xc_lowpri(xcfunc_t func, void *arg1, void *arg2, struct cpu_info *ci)
{
xc_state_t *xc = &xc_low_pri;
CPU_INFO_ITERATOR cii;
uint64_t where;
mutex_enter(&xc->xc_lock);
/* Sleep until head and done counters match before reusing the state. */
while (xc->xc_headp != xc->xc_donep) {
cv_wait(&xc->xc_busy, &xc->xc_lock);
}
/* Publish the function and its arguments for the target CPUs. */
xc->xc_arg1 = arg1;
xc->xc_arg2 = arg2;
xc->xc_func = func;
if (ci == NULL) {
/* Broadcast: xc_headp advances once per CPU that is signalled. */
xc_broadcast_ev.ev_count++;
for (CPU_INFO_FOREACH(cii, ci)) {
/* CPUs without SPCF_RUNNING are skipped and not counted. */
if ((ci->ci_schedstate.spc_flags & SPCF_RUNNING) == 0)
continue;
xc->xc_headp += 1;
ci->ci_data.cpu_xcall_pending = true;
cv_signal(&ci->ci_data.cpu_xcall);
}
} else {
/* Unicast: exactly one request, for the CPU passed in. */
xc_unicast_ev.ev_count++;
xc->xc_headp += 1;
ci->ci_data.cpu_xcall_pending = true;
cv_signal(&ci->ci_data.cpu_xcall);
}
/*
 * If the broadcast loop skipped every CPU, xc_headp was never advanced
 * past xc_donep and this assertion fails.
 */
KASSERT(xc->xc_donep < xc->xc_headp); <===== Here!
where = xc->xc_headp;
mutex_exit(&xc->xc_lock);
/* Return a low priority ticket. */
/* The ticket value must not have XC_PRI_BIT set. */
KASSERT((where & XC_PRI_BIT) == 0);
return where;
}
So I added the following debug printf:
---------------------------------------------------------------------
@@ -252,6 +253,7 @@ xc_lowpri(xcfunc_t func, void *arg1, voi
xc_state_t *xc = &xc_low_pri;
CPU_INFO_ITERATOR cii;
uint64_t where;
+ int i = -1;
mutex_enter(&xc->xc_lock);
while (xc->xc_headp != xc->xc_donep) {
@@ -263,8 +265,11 @@ xc_lowpri(xcfunc_t func, void *arg1, voi
if (ci == NULL) {
xc_broadcast_ev.ev_count++;
for (CPU_INFO_FOREACH(cii, ci)) {
- if ((ci->ci_schedstate.spc_flags & SPCF_RUNNING) == 0)
+ i++;
+ if ((ci->ci_schedstate.spc_flags & SPCF_RUNNING) == 0) {
+ printf("cpu %d: XXX not running\n", i);
continue;
+ }
xc->xc_headp += 1;
ci->ci_data.cpu_xcall_pending = true;
cv_signal(&ci->ci_data.cpu_xcall);
@@ -275,6 +280,9 @@ xc_lowpri(xcfunc_t func, void *arg1, voi
ci->ci_data.cpu_xcall_pending = true;
cv_signal(&ci->ci_data.cpu_xcall);
}
+ if (xc->xc_donep >= xc->xc_headp)
+ printf("XXX donep = %" PRIu64 ", headp = %" PRIu64
+ ", ci = %p\n", xc->xc_donep, xc->xc_headp, ci);
KASSERT(xc->xc_donep < xc->xc_headp);
where = xc->xc_headp;
mutex_exit(&xc->xc_lock);
-------------------------------------------------------------------------
The output says
cpu 0: XXX not running
cpu 1: XXX not running
cpu 2: XXX not running
cpu 3: XXX not running
XXX donep = 0, headp = 0, ci = 0x0
panic: kernel diagnostic assertion "xc->xc_donep < xc->xc_headp" failed: file "../../../../kern/subr_xcall.c", line 286
(Yes, the exact reason is not cold == 1 itself, but that SPCF_RUNNING is not set on any CPU.)
Is this intended behavior? Is it possible to use xcall while
cold==1?
For softint_disestablish(), the following diff avoids the panic:
----------------------------------------
Index: kern_softint.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_softint.c,v
retrieving revision 1.44
diff -u -p -r1.44 kern_softint.c
--- kern_softint.c 22 Nov 2017 02:20:21 -0000 1.44
+++ kern_softint.c 25 Dec 2017 06:31:57 -0000
@@ -177,6 +177,7 @@ __KERNEL_RCSID(0, "$NetBSD: kern_softint
#include <sys/intr.h>
#include <sys/ipi.h>
#include <sys/mutex.h>
+#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/evcnt.h>
#include <sys/cpu.h>
@@ -430,8 +431,10 @@ softint_disestablish(void *arg)
* it again. So, we are only looking for handler records with
* SOFTINT_ACTIVE already set.
*/
- where = xc_broadcast(0, (xcfunc_t)nullop, NULL, NULL);
- xc_wait(where);
+ if (__predict_true(cold == 0)) {
+ where = xc_broadcast(0, (xcfunc_t)nullop, NULL, NULL);
+ xc_wait(where);
+ }
for (;;) {
/* Collect flag values from each CPU. */
----------------------------------------
I don't know whether this is a good fix or not.
One workaround is to not call softint_disestablish() in
xxx_attach() and to defer it using config_interrupts(), but
I think that's dirty.
I'm now writing code for ixg(4) to fall back from MSI-X to
legacy interrupts when a resource shortage occurs. The shortage
may occur between establishing the MSI-X interrupts and establishing
the softints, so code to clean up the interrupts is required in
ixgbe_attach().
Any advice? Of course, patches are welcome!
--
-----------------------------------------------
SAITOH Masanobu (msaitoh%execsw.org@localhost
msaitoh%netbsd.org@localhost)
Home |
Main Index |
Thread Index |
Old Index