NetBSD-Bugs archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

Re: kern/53202: Kernel hangs running t_ptrace_wait:resume1 test



The following reply was made to PR kern/53202; it has been noted by GNATS.

From: Andreas Gustafsson <gson%gson.org@localhost>
To: christos%NetBSD.org@localhost, gnats-bugs%NetBSD.org@localhost
Cc: 
Subject: Re: kern/53202: Kernel hangs running t_ptrace_wait:resume1 test
Date: Mon, 23 Apr 2018 18:16:25 +0300

 My analysis of this bug is that the kernel ends up in a loop where the
 current lwp is waiting for another lwp to exit, but the loop never
 yields the CPU by calling mi_switch(), so the other lwp never gets a
 chance to run and exit, unless we happen to be running on a
 multiprocessor.
 
 Before kern_lwp.c 1.191, this worked because sleepq_block() was called
 with catch_p=false, so the "early" flag in sleepq_block() was
 false and sleepq_block() called mi_switch() in the "else" clause of
 "if (early) ...".  Now catch_p=true, so with a signal pending (signal 9,
 SIGKILL, in the transcript below) "early" is set to true, and
 mi_switch() is never called.
 
 Below is a gdb transcript from single stepping around the entire loop,
 beginning and ending at "goto retry".
 
 Breakpoint 1, exit_lwps (l=0xc1f9b020) at /usr/src/sys/kern/kern_exit.c:637
 637                             goto retry;
 (gdb) n
 610             KASSERT(mutex_owned(p->p_lock));
 (gdb) 
 616             LIST_FOREACH(l2, &p->p_lwps, l_sibling) {
 (gdb) 
 617                     if (l2 == l)
 (gdb) 
 618                             continue;
 (gdb) 
 616             LIST_FOREACH(l2, &p->p_lwps, l_sibling) {
 (gdb) 
 617                     if (l2 == l)
 (gdb) 
 619                     lwp_lock(l2);
 (gdb) n
 620                     l2->l_flag |= LW_WEXIT;
 (gdb) n
 621                     if ((l2->l_stat == LSSLEEP && (l2->l_flag & LW_SINTR)) ||
 (gdb) n
 622                         l2->l_stat == LSSUSPENDED || l2->l_stat == LSSTOP) {
 (gdb) print /x l2->l_stat
 $1 = 0x2
 (gdb) print /x l2->l_flag
 $2 = 0x1100000
 (gdb) n
 621                     if ((l2->l_stat == LSSLEEP && (l2->l_flag & LW_SINTR)) ||
 (gdb) 
 622                         l2->l_stat == LSSUSPENDED || l2->l_stat == LSSTOP) {
 (gdb) 
 627                     lwp_unlock(l2);
 (gdb) 
 616             LIST_FOREACH(l2, &p->p_lwps, l_sibling) {
 (gdb) 
 635             while (p->p_nlwps > 1) {
 (gdb) 
 636                     if (lwp_wait(l, 0, NULL, true)) {
 (gdb) s
 lwp_wait (l=0xc1f9b020, lid=0, departed=0x0, exiting=true)
     at /usr/src/sys/kern/kern_lwp.c:531
 531             const lwpid_t curlid = l->l_lid;
 (gdb) n
 532             proc_t *p = l->l_proc;
 (gdb) 
 536             KASSERT(mutex_owned(p->p_lock));
 (gdb) 
 538             p->p_nlwpwait++;
 (gdb) n
 539             l->l_waitingfor = lid;
 (gdb) print p->p_nlwpwait
 $3 = 1
 (gdb) n
 550                     if ((p->p_sflag & PS_WCORE) != 0) {
 (gdb) n
 560                     while ((l2 = p->p_zomblwp) != NULL) {
 (gdb) 
 571                     nfound = 0;
 (gdb) 
 572                     error = 0;
 (gdb) 
 573                     LIST_FOREACH(l2, &p->p_lwps, l_sibling) {
 (gdb) 
 583                             if (l2->l_lid == lid && l2->l_waitingfor == curlid) {
 (gdb) 
 587                             if (l2 == l)
 (gdb) 
 588                                     continue;
 (gdb) 
 573                     LIST_FOREACH(l2, &p->p_lwps, l_sibling) {
 (gdb) 
 583                             if (l2->l_lid == lid && l2->l_waitingfor == curlid) {
 (gdb) 
 587                             if (l2 == l)
 (gdb) 
 589                             if ((l2->l_prflag & LPR_DETACHED) != 0) {
 (gdb) 
 593                             if (lid != 0) {
 (gdb) 
 602                             } else if (l2->l_waiter != 0) {
 (gdb) 
 612                             nfound++;
 (gdb) 
 615                             if (l2->l_stat != LSZOMB)
 (gdb) 
 616                                     continue;
 (gdb) 
 573                     LIST_FOREACH(l2, &p->p_lwps, l_sibling) {
 (gdb) 
 635                     if (error != 0)
 (gdb) 
 637                     if (nfound == 0) {
 (gdb) 
 646                     if (exiting) {
 (gdb) 
 647                             KASSERT(p->p_nlwps > 1);
 (gdb) 
 648                             error = cv_wait_sig(&p->p_lwpcv, p->p_lock);
 (gdb) s
 cv_wait_sig (cv=0xc20d6d80, mtx=0xc2604180)
     at /usr/src/sys/kern/kern_condvar.c:266
 266             lwp_t *l = curlwp;
 (gdb) n
 269             KASSERT(mutex_owned(mtx));
 (gdb) 
 271             cv_enter(cv, mtx, l);
 (gdb) 
 272             error = sleepq_block(0, true);
 (gdb) s
 sleepq_block (timo=0, catch_p=true) at /usr/src/sys/kern/kern_sleepq.c:235
 235             int error = 0, sig;
 (gdb) n
 237             lwp_t *l = curlwp;
 (gdb) 
 238             bool early = false;
 (gdb) 
 239             int biglocks = l->l_biglocks;
 (gdb) 
 241             ktrcsw(1, 0);
 (gdb) 
 247             if (catch_p) {
 (gdb) 
 248                     l->l_flag |= LW_SINTR;
 (gdb) 
 249                     if ((l->l_flag & (LW_CANCELLED|LW_WEXIT|LW_WCORE)) != 0) {
 (gdb) 
 253                     } else if ((l->l_flag & LW_PENDSIG) != 0 && sigispending(l, 0))
 (gdb) 
 254                             early = true;
 (gdb) 
 257             if (early) {
 (gdb) 
 259                     lwp_unsleep(l, true);
 (gdb) s
 lwp_unsleep (l=0xc1f9b020, cleanup=true) at /usr/src/sys/kern/kern_lwp.c:1525
 1525            KASSERT(mutex_owned(l->l_mutex));
 (gdb) n
 1526            (*l->l_syncobj->sobj_unsleep)(l, cleanup);
 (gdb) s
 1527    }
 (gdb) s
 sleepq_block (timo=0, catch_p=true) at /usr/src/sys/kern/kern_sleepq.c:277
 277             if (catch_p && error == 0) {
 (gdb) print l->l_syncobj->sobj_unsleep
 $4 = (void (*)(struct lwp *, _Bool)) 0xc0bdb63f <sched_unsleep>
 (gdb) s
 278                     p = l->l_proc;
 (gdb) s
 279                     if ((l->l_flag & (LW_CANCELLED | LW_WEXIT | LW_WCORE)) != 0)
 (gdb) s
 281                     else if ((l->l_flag & LW_PENDSIG) != 0) {
 (gdb) s
 289                             mutex_enter(p->p_lock);
 (gdb) n
 290                             if (((sig = sigispending(l, 0)) != 0 &&
 (gdb) n
 291                                 (sigprop[sig] & SA_STOP) == 0) ||
 (gdb) n
 290                             if (((sig = sigispending(l, 0)) != 0 &&
 (gdb) n
 293                                     error = sleepq_sigtoerror(l, sig);
 (gdb) s
 sleepq_sigtoerror (l=0xc1f9b020, sig=9) at /usr/src/sys/kern/kern_sleepq.c:387
 387             struct proc *p = l->l_proc;
 (gdb) n
 390             KASSERT(mutex_owned(p->p_lock));
 (gdb) 
 395             if ((SIGACTION(p, sig).sa_flags & SA_RESTART) == 0)
 (gdb) 
 398                     error = ERESTART;
 (gdb) 
 400             return error;
 (gdb) 
 401     }
 (gdb) 
 sleepq_block (timo=0, catch_p=true) at /usr/src/sys/kern/kern_sleepq.c:294
 294                             mutex_exit(p->p_lock);
 (gdb) 
 298             ktrcsw(0, 0);
 (gdb) n
 299             if (__predict_false(biglocks != 0)) {
 (gdb) n
 302             return error;
 (gdb) 
 303     }
 (gdb) 
 cv_wait_sig (cv=0xc20d6d80, mtx=0xc2604180)
     at /usr/src/sys/kern/kern_condvar.c:273
 273             return cv_exit(cv, mtx, l, error);
 (gdb) n
 274     }
 (gdb) n
 lwp_wait (l=0xc1f9b020, lid=0, departed=0x0, exiting=true)
     at /usr/src/sys/kern/kern_lwp.c:649
 649                             if (error == 0)
 (gdb) n
 651                             break;
 (gdb) 
 685             if (lid != 0) {
 (gdb) 
 694             p->p_nlwpwait--;
 (gdb) 
 695             l->l_waitingfor = 0;
 (gdb) 
 696             cv_broadcast(&p->p_lwpcv);
 (gdb) n
 698             return error;
 (gdb) 
 699     }
 (gdb) 
 
 Breakpoint 1, exit_lwps (l=0xc1f9b020) at /usr/src/sys/kern/kern_exit.c:637
 637                             goto retry;
 (gdb) 
 


Home | Main Index | Thread Index | Old Index