NetBSD-Bugs archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
Re: kern/53202: Kernel hangs running t_ptrace_wait:resume1 test
The following reply was made to PR kern/53202; it has been noted by GNATS.
From: Andreas Gustafsson <gson%gson.org@localhost>
To: christos%NetBSD.org@localhost, gnats-bugs%NetBSD.org@localhost
Cc:
Subject: Re: kern/53202: Kernel hangs running t_ptrace_wait:resume1 test
Date: Mon, 23 Apr 2018 18:16:25 +0300
My analysis of this bug is that the kernel ends up in a loop where the
current lwp is waiting for another lwp to exit, but the loop never
yields the CPU by calling mi_switch(), so the other lwp never gets a
chance to run and exit, unless we happen to be running on a
multiprocessor.
Before kern_lwp.c 1.191, this worked because sleepq_block() was called
with catch_p=false, so the "early" flag in sleepq_block() was
false and sleepq_block() called mi_switch() in the "else" clause of
"if (early) ...". Now catch_p=true, "early" is true, and mi_switch()
is never called.
Below is a gdb transcript from single stepping around the entire loop,
beginning and ending at "goto retry".
Breakpoint 1, exit_lwps (l=0xc1f9b020) at /usr/src/sys/kern/kern_exit.c:637
637 goto retry;
(gdb) n
610 KASSERT(mutex_owned(p->p_lock));
(gdb)
616 LIST_FOREACH(l2, &p->p_lwps, l_sibling) {
(gdb)
617 if (l2 == l)
(gdb)
618 continue;
(gdb)
616 LIST_FOREACH(l2, &p->p_lwps, l_sibling) {
(gdb)
617 if (l2 == l)
(gdb)
619 lwp_lock(l2);
(gdb) n
620 l2->l_flag |= LW_WEXIT;
(gdb) n
621 if ((l2->l_stat == LSSLEEP && (l2->l_flag & LW_SINTR)) ||
(gdb) n
622 l2->l_stat == LSSUSPENDED || l2->l_stat == LSSTOP) {
(gdb) print /x l2->l_stat
$1 = 0x2
(gdb) print /x l2->l_flag
$2 = 0x1100000
(gdb) n
621 if ((l2->l_stat == LSSLEEP && (l2->l_flag & LW_SINTR)) ||
(gdb)
622 l2->l_stat == LSSUSPENDED || l2->l_stat == LSSTOP) {
(gdb)
627 lwp_unlock(l2);
(gdb)
616 LIST_FOREACH(l2, &p->p_lwps, l_sibling) {
(gdb)
635 while (p->p_nlwps > 1) {
(gdb)
636 if (lwp_wait(l, 0, NULL, true)) {
(gdb) s
lwp_wait (l=0xc1f9b020, lid=0, departed=0x0, exiting=true)
at /usr/src/sys/kern/kern_lwp.c:531
531 const lwpid_t curlid = l->l_lid;
(gdb) n
532 proc_t *p = l->l_proc;
(gdb)
536 KASSERT(mutex_owned(p->p_lock));
(gdb)
538 p->p_nlwpwait++;
(gdb) n
539 l->l_waitingfor = lid;
(gdb) print p->p_nlwpwait
$3 = 1
(gdb) n
550 if ((p->p_sflag & PS_WCORE) != 0) {
(gdb) n
560 while ((l2 = p->p_zomblwp) != NULL) {
(gdb)
571 nfound = 0;
(gdb)
572 error = 0;
(gdb)
573 LIST_FOREACH(l2, &p->p_lwps, l_sibling) {
(gdb)
583 if (l2->l_lid == lid && l2->l_waitingfor == curlid) {
(gdb)
587 if (l2 == l)
(gdb)
588 continue;
(gdb)
573 LIST_FOREACH(l2, &p->p_lwps, l_sibling) {
(gdb)
583 if (l2->l_lid == lid && l2->l_waitingfor == curlid) {
(gdb)
587 if (l2 == l)
(gdb)
589 if ((l2->l_prflag & LPR_DETACHED) != 0) {
(gdb)
593 if (lid != 0) {
(gdb)
602 } else if (l2->l_waiter != 0) {
(gdb)
612 nfound++;
(gdb)
615 if (l2->l_stat != LSZOMB)
(gdb)
616 continue;
(gdb)
573 LIST_FOREACH(l2, &p->p_lwps, l_sibling) {
(gdb)
635 if (error != 0)
(gdb)
637 if (nfound == 0) {
(gdb)
646 if (exiting) {
(gdb)
647 KASSERT(p->p_nlwps > 1);
(gdb)
648 error = cv_wait_sig(&p->p_lwpcv, p->p_lock);
(gdb) s
cv_wait_sig (cv=0xc20d6d80, mtx=0xc2604180)
at /usr/src/sys/kern/kern_condvar.c:266
266 lwp_t *l = curlwp;
(gdb) n
269 KASSERT(mutex_owned(mtx));
(gdb)
271 cv_enter(cv, mtx, l);
(gdb)
272 error = sleepq_block(0, true);
(gdb) s
sleepq_block (timo=0, catch_p=true) at /usr/src/sys/kern/kern_sleepq.c:235
235 int error = 0, sig;
(gdb) n
237 lwp_t *l = curlwp;
(gdb)
238 bool early = false;
(gdb)
239 int biglocks = l->l_biglocks;
(gdb)
241 ktrcsw(1, 0);
(gdb)
247 if (catch_p) {
(gdb)
248 l->l_flag |= LW_SINTR;
(gdb)
249 if ((l->l_flag & (LW_CANCELLED|LW_WEXIT|LW_WCORE)) != 0) {
(gdb)
253 } else if ((l->l_flag & LW_PENDSIG) != 0 && sigispending(l, 0))
(gdb)
254 early = true;
(gdb)
257 if (early) {
(gdb)
259 lwp_unsleep(l, true);
(gdb) s
lwp_unsleep (l=0xc1f9b020, cleanup=true) at /usr/src/sys/kern/kern_lwp.c:1525
1525 KASSERT(mutex_owned(l->l_mutex));
(gdb) n
1526 (*l->l_syncobj->sobj_unsleep)(l, cleanup);
(gdb) s
1527 }
(gdb) s
sleepq_block (timo=0, catch_p=true) at /usr/src/sys/kern/kern_sleepq.c:277
277 if (catch_p && error == 0) {
(gdb) print l->l_syncobj->sobj_unsleep
$4 = (void (*)(struct lwp *, _Bool)) 0xc0bdb63f <sched_unsleep>
(gdb) s
278 p = l->l_proc;
(gdb) s
279 if ((l->l_flag & (LW_CANCELLED | LW_WEXIT | LW_WCORE)) != 0)
(gdb) s
281 else if ((l->l_flag & LW_PENDSIG) != 0) {
(gdb) s
289 mutex_enter(p->p_lock);
(gdb) n
290 if (((sig = sigispending(l, 0)) != 0 &&
(gdb) n
291 (sigprop[sig] & SA_STOP) == 0) ||
(gdb) n
290 if (((sig = sigispending(l, 0)) != 0 &&
(gdb) n
293 error = sleepq_sigtoerror(l, sig);
(gdb) s
sleepq_sigtoerror (l=0xc1f9b020, sig=9) at /usr/src/sys/kern/kern_sleepq.c:387
387 struct proc *p = l->l_proc;
(gdb) n
390 KASSERT(mutex_owned(p->p_lock));
(gdb)
395 if ((SIGACTION(p, sig).sa_flags & SA_RESTART) == 0)
(gdb)
398 error = ERESTART;
(gdb)
400 return error;
(gdb)
401 }
(gdb)
sleepq_block (timo=0, catch_p=true) at /usr/src/sys/kern/kern_sleepq.c:294
294 mutex_exit(p->p_lock);
(gdb)
298 ktrcsw(0, 0);
(gdb) n
299 if (__predict_false(biglocks != 0)) {
(gdb) n
302 return error;
(gdb)
303 }
(gdb)
cv_wait_sig (cv=0xc20d6d80, mtx=0xc2604180)
at /usr/src/sys/kern/kern_condvar.c:273
273 return cv_exit(cv, mtx, l, error);
(gdb) n
274 }
(gdb) n
lwp_wait (l=0xc1f9b020, lid=0, departed=0x0, exiting=true)
at /usr/src/sys/kern/kern_lwp.c:649
649 if (error == 0)
(gdb) n
651 break;
(gdb)
685 if (lid != 0) {
(gdb)
694 p->p_nlwpwait--;
(gdb)
695 l->l_waitingfor = 0;
(gdb)
696 cv_broadcast(&p->p_lwpcv);
(gdb) n
698 return error;
(gdb)
699 }
(gdb)
Breakpoint 1, exit_lwps (l=0xc1f9b020) at /usr/src/sys/kern/kern_exit.c:637
637 goto retry;
(gdb)
Home |
Main Index |
Thread Index |
Old Index