I'm retooling how threads in SA sleep, and I've run into a weird issue. With the attached WIP diff (complete with hard-coded printfs), I've had weird stuff happen when I resume a suspended program. With the new diff, I have specific unsleep operations for threads in lwpcache and lwpublk. These are sleepqueues, and essentially home-grown condvars. The main reason to not use condvars is so that sa_switch() won't run into locking issues when it wakes the waiting thread. All the home-grown unsleep routines do is decrement the counts of threads in state X then call the normal sleepq_unsleep(). A test that's failing is I have an empty named config. On boot, named starts then exits. When it's exiting, I get an assert failure in sa_lwpublk_unsleep(). The problem is that I don't have any threads in the unblock queue, so I don't see why that routine's getting called. I have a running thread, one in select, one in sigwait, and two in lwpcache. Any suggestions as to what I'm doing wrong? I don't see why the wait state's correct but the sleepq object calls are wrong. ?? Take care, Bill
? ID ? Rejects ? StallOut ? cscope.out ? plog ? secondUpdate ? arch/cscope.out ? arch/i386/compile/DIAGO ? arch/i386/compile/INSTALL_FLOPPY ? arch/i386/compile/MOOMOO ? arch/i386/conf/DIAGO ? arch/i386/conf/GENERIC_DIAGO ? arch/i386/conf/MOOMOO ? kern/.kern_sleepq.c.swo ? kern/.kern_sleepq.c.swp ? kern/ktrace.out ? kern/q ? kern/wchan.diff.txt Index: compat/sa/compat_sa.c =================================================================== RCS file: /cvsroot/src/sys/compat/sa/compat_sa.c,v retrieving revision 1.7 diff -u -p -r1.7 compat_sa.c --- compat/sa/compat_sa.c 1 Nov 2008 05:59:33 -0000 1.7 +++ compat/sa/compat_sa.c 10 Nov 2008 23:39:42 -0000 @@ -43,6 +43,8 @@ #include "opt_sa.h" __KERNEL_RCSID(0, "$NetBSD$"); +#define QUEUEDEBUG 1 + #include <sys/param.h> #include <sys/systm.h> #include <sys/cpu.h> @@ -105,7 +107,7 @@ static inline int sast_compare(struct sa static int sa_increaseconcurrency(struct lwp *, int); #endif static void sa_switchcall(void *); -static void sa_neverrun(void *); +static void sa_cache_lingerer(void *); static int sa_newcachelwp(struct lwp *, struct sadata_vp *); static void sa_makeupcalls(struct lwp *, struct sadata_upcall *); @@ -118,6 +120,8 @@ static void sa_upcall_getstate(union sau void sa_putcachelwp(struct proc *, struct lwp *); struct lwp *sa_getcachelwp(struct proc *, struct sadata_vp *); static void sa_setrunning(struct lwp *); +static u_int sa_lwpcache_unsleep(lwp_t *, bool); +static u_int sa_lwpublk_unsleep(lwp_t *, bool); #define SA_DEBUG @@ -130,9 +134,17 @@ int sadebug = 0; #define DPRINTFN(n,x) #endif -static syncobj_t sa_sobj = { +static syncobj_t sa_lwpcache_sobj = { + SOBJ_SLEEPQ_FIFO, + sa_lwpcache_unsleep, + sleepq_changepri, + sleepq_lendpri, + syncobj_noowner, +}; + +static syncobj_t sa_lwpublk_sobj = { SOBJ_SLEEPQ_FIFO, - sleepq_unsleep, + sa_lwpublk_unsleep, sleepq_changepri, sleepq_lendpri, syncobj_noowner, @@ -150,6 +162,10 @@ static const char *sa_lwpwoken_wmesg = " (l)->l_pflag ^= (f); \ } while 
(/*CONSTCOND*/ 0) +#define SA_LWP_SHOULD_EXIT(l, p) \ + (((l)->l_flag & (LW_WEXIT|LW_WCORE)) \ + || ((p)->p_sflag & (PS_WCORE|PS_WEXIT))) + RB_PROTOTYPE(sasttree, sastack, sast_node, sast_compare); RB_GENERATE(sasttree, sastack, sast_node, sast_compare); @@ -870,7 +886,7 @@ sa_increaseconcurrency(struct lwp *l, in mutex_exit(&vp->savp_mutex); lwp_lock(l2); l2->l_savp = l->l_savp; - cpu_setfunc(l2, sa_neverrun, NULL); + cpu_setfunc(l2, sa_cache_lingerer, NULL); lwp_unlock(l2); mutex_enter(&l->l_savp->savp_mutex); sa_putcachelwp(p, l2); @@ -1471,7 +1487,7 @@ sa_switch(struct lwp *l) DPRINTFN(4,("sa_switch(%d.%d VP %d)\n", p->p_pid, l->l_lid, vp->savp_lwp ? vp->savp_lwp->l_lid : 0)); - if ((l->l_flag & LW_WEXIT) || (p->p_sflag & (PS_WCORE | PS_WEXIT))) { + if (SA_LWP_SHOULD_EXIT(l, p)) { mi_switch(l); return; } @@ -1566,7 +1582,6 @@ sa_switch(struct lwp *l) */ l2 = sa_getcachelwp(p, vp); if (l2 == NULL) { - /* XXXSMP */ /* No upcall for you! */ /* XXX The consequences of this are more subtle and * XXX the recovery from this situation deserves @@ -1574,9 +1589,15 @@ sa_switch(struct lwp *l) */ /* XXXUPSXXX Should only happen with concurrency > 1 */ + /* + * WRS: can also happen if SIGCONT or SIGSTOP woke + * (all) our cached thread(s) and it/they haven't + * haven't gotten back to sleep yet. + */ + if (vp->savp_sleeper_upcall == NULL) + vp->savp_sleeper_upcall = sau; mutex_exit(&vp->savp_mutex); mi_switch(l); - sadata_upcall_free(sau); return; } @@ -1598,7 +1619,7 @@ sa_switch(struct lwp *l) */ if ((l->l_flag & LP_SA_PAGEFAULT) && sa_pagefault(l, &sau->sau_event.ss_captured.ss_ctx) != 0) { - cpu_setfunc(l2, sa_neverrun, NULL); + cpu_setfunc(l2, sa_cache_lingerer, NULL); sa_putcachelwp(p, l2); /* uvm_lwp_hold from sa_getcachelwp */ mutex_exit(&vp->savp_mutex); DPRINTFN(4,("sa_switch(%d.%d) Pagefault\n", @@ -1662,20 +1683,45 @@ sa_switch(struct lwp *l) } /* - * sa_neverrun + * sa_cache_lingerer * - * Routine for threads that have never run. Calls lwp_exit. 
- * New, never-run cache threads get pointed at this routine, which just runs - * and calls lwp_exit(). + * Routine for threads that have never run. Hangs around until + * either the lwp gets used (and gets set to a differet routine) or + * the process exits. Since our sleep is interruptable, we have to + * catch when the signal code wakes us (usually in resoinse to a SIGCONT). + * If we get woken too soon, just put ourselves back to sleep. */ static void -sa_neverrun(void *arg) +sa_cache_lingerer(void *arg) { struct lwp *l; + struct proc *p; + struct sadata_vp *vp; + sleepq_t *sq; + int f; l = curlwp; + p = l->l_proc; + + vp = l->l_savp; + sq = &vp->savp_lwpcache; - DPRINTFN(1,("sa_neverrun(%d.%d %x) exiting\n", l->l_proc->p_pid, + while (!(SA_LWP_SHOULD_EXIT(l, p))) { + /* Not time to go, back to sleep with us */ + mutex_enter(&vp->savp_mutex); +printf("sa_cache_lingerer(%d.%d) cachecount %d\n", l->l_proc->p_pid, l->l_lid,vp->savp_lwpcache_count); + sleepq_enter(&vp->savp_woken, l, &vp->savp_mutex); + sleepq_enqueue(sq, (void *)sq, sa_lwpcache_wmesg, &sa_lwpcache_sobj); + vp->savp_lwpcache_count++; + SA_LWP_STATE_LOCK(l, f); + l->l_flag &= ~LW_PENDSIG; + sleepq_block(0, true); +DPRINTF(("sa_cache_lingerer(%d.%d): back out, l %p \n", l->l_proc->p_pid, + l->l_lid, l)); + SA_LWP_STATE_UNLOCK(l, f); + } + + DPRINTFN(1,("sa_cache_lingerer(%d.%d %x) exiting\n", l->l_proc->p_pid, l->l_lid, l->l_flag)); lwp_exit(l); @@ -1707,7 +1753,7 @@ sa_switchcall(void *arg) lwp_lock(l2); KASSERT(vp->savp_lwp == l2); - if ((l2->l_flag & LW_WEXIT) || (p->p_sflag & (PS_WCORE | PS_WEXIT))) { + if (SA_LWP_SHOULD_EXIT(l2, p)) { lwp_unlock(l2); sadata_upcall_free(sau); lwp_exit(l2); @@ -1814,7 +1860,7 @@ sa_newcachelwp(struct lwp *l, struct sad } error = lwp_create(l, p, uaddr, inmem, 0, NULL, 0, - sa_neverrun, NULL, &l2, l->l_class); + sa_cache_lingerer, NULL, &l2, l->l_class); if (error) { uvm_uarea_free(uaddr, curcpu()); return error; @@ -1863,8 +1909,7 @@ sa_putcachelwp(struct proc *p, 
struct lw p->p_nlwps--; l->l_prflag |= LPR_DETACHED; #endif - l->l_flag |= LW_SA; - membar_producer(); + l->l_flag |= (LW_SA | LW_SINTR); DPRINTFN(5,("sa_putcachelwp(%d.%d) Adding LWP %d to cache\n", p->p_pid, curlwp->l_lid, l->l_lid)); @@ -1879,11 +1924,11 @@ sa_putcachelwp(struct proc *p, struct lw /* * XXXWRS: Following is a hand-rolled call of the form: - * sleepq_enqueue(sq, (void *)sq, "lwpcache", sa_sobj); but + * sleepq_enqueue(sq, (void *)sq, "lwpcache", sa_lwpcache_sobj); but * hand-done since l might not be curlwp. */ - l->l_syncobj = &sa_sobj; + l->l_syncobj = &sa_lwpcache_sobj; l->l_wchan = sq; l->l_sleepq = sq; l->l_wmesg = sa_lwpcache_wmesg; @@ -1892,7 +1937,38 @@ sa_putcachelwp(struct proc *p, struct lw l->l_sleeperr = 0; vp->savp_lwpcache_count++; - sleepq_insert(sq, l, &sa_sobj); + sleepq_insert(sq, l, &sa_lwpcache_sobj); + membar_producer(); +} + +/* + * sa_{lwpcache,lwpblk}_unsleep + * + * Another part of the kernel wants to wake up a thread + * in one of our special sleep queues. Since we keep a + * count of threads we've put in said queues, adjust it + * as part of removing it. 
+ */ +static u_int +sa_lwpcache_unsleep(lwp_t *l, bool cleanup) +{ + DPRINTFN(3,("sa_lwpcache_unsleep(%d.%d), flags %x count %d\n", + l->l_proc->p_pid, l->l_lid, + l->l_flag, l->l_savp->savp_lwpcache_count)); + KASSERT(l->l_savp->savp_lwpcache_count > 0); + l->l_savp->savp_lwpcache_count--; + return sleepq_unsleep(l, cleanup); +} + +static u_int +sa_lwpublk_unsleep(lwp_t *l, bool cleanup) +{ + DPRINTFN(3,("sa_lwpcache_unsleep(%d.%d), flags %x count %d\n", + l->l_proc->p_pid, l->l_lid, + l->l_flag, l->l_savp->savp_woken_count)); + KASSERT(l->l_savp->savp_woken_count > 0); + l->l_savp->savp_woken_count--; + return sleepq_unsleep(l, cleanup); } /* @@ -1907,7 +1983,9 @@ sa_getcachelwp(struct proc *p, struct sa sleepq_t *sq = &vp->savp_lwpcache; KASSERT(mutex_owned(&vp->savp_mutex)); - KASSERT(vp->savp_lwpcache_count > 0); + + if (vp->savp_lwpcache_count == 0) + return NULL; vp->savp_lwpcache_count--; l= TAILQ_FIRST(sq); @@ -1922,7 +2000,7 @@ sa_getcachelwp(struct proc *p, struct sa l->l_syncobj = &sched_syncobj; l->l_wchan = NULL; l->l_sleepq = NULL; - l->l_flag &= ~LW_SINTR; + l->l_flag &= ~(LW_SINTR | LW_PENDSIG); #if 0 /* Not now, for now leave lwps in lwp list */ LIST_INSERT_HEAD(&p->p_lwps, l, l_sibling); @@ -2062,6 +2140,9 @@ sa_upcall_userret(struct lwp *l) * we deliver the call. 
*/ l2 = TAILQ_FIRST(&vp->savp_woken); +DPRINTF(("sa_userret got l2 %p on count %d\n", l2, vp->savp_woken_count)); +DPRINTF(("sa_userret(%d.%d): just pulled l %p id %d mutex good %d\n", l->l_proc->p_pid, + l->l_lid, l2, l2->l_lid, l2->l_mutex == &vp->savp_mutex)); TAILQ_REMOVE(&vp->savp_woken, l2, l_sleepchain); vp->savp_woken_count--; mutex_exit(&vp->savp_mutex); @@ -2069,8 +2150,7 @@ sa_upcall_userret(struct lwp *l) DPRINTFN(9,("sa_upcall_userret(%d.%d) using stack %p\n", l->l_proc->p_pid, l->l_lid, sast->sast_stack.ss_sp)); - if ((l->l_flag & LW_WEXIT) - || (p->p_sflag & (PS_WCORE | PS_WEXIT))) { + if (SA_LWP_SHOULD_EXIT(l, p)) { lwp_exit(l); /* NOTREACHED */ } @@ -2079,8 +2159,7 @@ sa_upcall_userret(struct lwp *l) p->p_pid, l->l_lid, l2->l_lid)); sau = sadata_upcall_alloc(1); - if ((l->l_flag & LW_WEXIT) - || (p->p_sflag & (PS_WCORE | PS_WEXIT))) { + if (SA_LWP_SHOULD_EXIT(l, p)) { sadata_upcall_free(sau); lwp_exit(l); /* NOTREACHED */ @@ -2097,7 +2176,7 @@ sa_upcall_userret(struct lwp *l) l2->l_wchan = sq; l2->l_wmesg = sa_lwpcache_wmesg; vp->savp_lwpcache_count++; - sleepq_insert(sq, l2, &sa_sobj); + sleepq_insert(sq, l2, &sa_lwpcache_sobj); /* uvm_lwp_hold from sa_unblock_userret */ } else if (sast) sa_setstackfree(sast, sa); @@ -2207,7 +2286,6 @@ sa_makeupcalls(struct lwp *l, struct sad sas[2] = &sau->sau_interrupted.ss_captured.ss_sa; nint = 1; } -#if 0 /* For now, limit ourselves to one unblock at once. */ if (sau->sau_type == SA_UPCALL_UNBLOCKED) { mutex_enter(&vp->savp_mutex); @@ -2215,7 +2293,6 @@ sa_makeupcalls(struct lwp *l, struct sad mutex_exit(&vp->savp_mutex); /* XXX WRS Need to limit # unblocks we copy out at once! 
*/ } -#endif /* Copy out the activation's ucontext */ up = (void *)STACK_ALLOC(stack, ucsize); @@ -2290,8 +2367,9 @@ sa_makeupcalls(struct lwp *l, struct sad sq = &vp->savp_lwpcache; l2->l_wchan = sq; l2->l_wmesg = sa_lwpcache_wmesg; + cpu_setfunc(l2, sa_cache_lingerer, NULL); vp->savp_lwpcache_count++; - sleepq_insert(sq, l2, &sa_sobj); + sleepq_insert(sq, l2, &sa_lwpcache_sobj); /* uvm_lwp_hold from sa_unblock_userret */ mutex_exit(&vp->savp_mutex); @@ -2392,13 +2470,13 @@ sa_unblock_userret(struct lwp *l) struct proc *p; struct sadata *sa; struct sadata_vp *vp; - int swapper; + int swapper, do_once = 1; p = l->l_proc; sa = p->p_sa; vp = l->l_savp; - if ((l->l_flag & LW_WEXIT) || (p->p_sflag & (PS_WCORE | PS_WEXIT))) + if (SA_LWP_SHOULD_EXIT(l, p)) return; if ((l->l_flag & LW_SA_BLOCKING) == 0) @@ -2410,6 +2488,7 @@ sa_unblock_userret(struct lwp *l) p = l->l_proc; sa = p->p_sa; vp = l->l_savp; +retry: vp_lwp = vp->savp_lwp; l2 = NULL; swapper = 0; @@ -2527,8 +2606,13 @@ sa_unblock_userret(struct lwp *l) */ sleepq_enter(&vp->savp_woken, l, &vp->savp_mutex); sleepq_enqueue(&vp->savp_woken, &vp->savp_woken, sa_lwpwoken_wmesg, - &sa_sobj); - uvm_lwp_hold(l); + &sa_lwpublk_sobj); + l->l_flag |= LW_SINTR; + l->l_flag &= ~LW_PENDSIG; + if (do_once) { + uvm_lwp_hold(l); + do_once = 0; + } vp->savp_woken_count++; //l->l_stat = LSSUSPENDED; mi_switch(l); @@ -2542,6 +2626,21 @@ sa_unblock_userret(struct lwp *l) * In the normal lwp lifecycle, cpu_setfunc() will make this lwp * run in a different routine by the time we next run. */ + +printf("unblock back from miswitch\n"); + if (!(SA_LWP_SHOULD_EXIT(l, p))) { + /* + * Not time to go, back to sleep with us. However + * the blessed LWP may need a wakeup again to notice + * us. So try this routine again. 
+ */ + DPRINTFN(3,("sa_unblock_userret(%d.%d) going around again" + ", flags %x, vp %d\n", l->l_proc->p_pid, l->l_lid, + l->l_flag, vp_lwp->l_lid)); + goto retry; + } + +printf("unblock exiting\n"); lwp_exit(l); /* NOTREACHED */ } Index: kern/kern_exit.c =================================================================== RCS file: /cvsroot/src/sys/kern/kern_exit.c,v retrieving revision 1.215 diff -u -p -r1.215 kern_exit.c --- kern/kern_exit.c 1 Nov 2008 05:59:33 -0000 1.215 +++ kern/kern_exit.c 10 Nov 2008 23:39:48 -0000 @@ -595,34 +595,33 @@ exit_lwps(struct lwp *l) p = l->l_proc; KASSERT(mutex_owned(p->p_lock)); -#ifdef KERN_SA +#if 0 if (p->p_sa != NULL) { - struct sadata_vp *vp; SLIST_FOREACH(vp, &p->p_sa->sa_vps, savp_next) { /* - * Make SA-cached LWPs normal process interruptable - * so that the exit code can wake them. Locking - * savp_mutex locks all the lwps on this vp that - * we need to adjust. + * Make SA-cached LWPs normal process runnable + * LWPs so that they'll also self-destruct. */ - mutex_enter(&vp->savp_mutex); DPRINTF(("exit_lwps: Making cached LWPs of %d on " - "VP %d interruptable: ", p->p_pid, vp->savp_id)); - TAILQ_FOREACH(l2, &vp->savp_lwpcache, l_sleepchain) { - l2->l_flag |= LW_SINTR; - DPRINTF(("%d ", l2->l_lid)); - } - DPRINTF(("\n")); + "VP %d runnable: ", p->p_pid, vp->savp_id)); + while ((l2 = sa_getcachelwp(p, vp)) != 0) { + lwp_lock(l2); + l2->l_flag = (l2->l_flag & ~LW_SA) | LW_WEXIT; + l2->l_priority = MAXPRI_USER; /* XXX WRS needs thought */ - DPRINTF(("exit_lwps: Making unblocking LWPs of %d on " - "VP %d interruptable: ", p->p_pid, vp->savp_id)); - TAILQ_FOREACH(l2, &vp->savp_woken, l_sleepchain) { - vp->savp_woken_count--; - l2->l_flag |= LW_SINTR; + /* setrunnable() will release the mutex. */ + setrunnable(l2); DPRINTF(("%d ", l2->l_lid)); } DPRINTF(("\n")); - mutex_exit(&vp->savp_mutex); + + /* + * Clear wokenq, the LWPs on the queue will + * run below. 
Workes as these threads are still + * on the p_lwps list (even though they are no longer + * counted). + */ + TAILQ_INIT(&vp->savp_woken); } } #endif
Attachment:
pgpGOVYypATAK.pgp
Description: PGP signature