I'm retooling how threads in SA sleep, and I've run into a weird issue. With the attached WIP diff (complete with hard-coded printfs), I've had weird stuff happen when I resume a suspended program. With the new diff, I have specific unsleep operations for threads in lwpcache and lwpublk. These are sleepqueues, and essentially home-grown condvars. The main reason to not use condvars is so that sa_switch() won't run into locking issues when it wakes the waiting thread. All the home-grown unsleep routines do is decrement the counts of threads in state X then call the normal sleepq_unsleep(). A test that's failing is I have an empty named config. On boot, named starts then exits. When it's exiting, I get an assert failure in sa_lwpublk_unsleep(). The problem is that I don't have any threads in the unblock queue, so I don't see why that routine's getting called. I have a running thread, one in select, one in sigwait, and two in lwpcache. Any suggestions as to what I'm doing wrong? I don't see why the wait state's correct but the sleepq object calls are wrong. ?? Take care, Bill
? ID
? Rejects
? StallOut
? cscope.out
? plog
? secondUpdate
? arch/cscope.out
? arch/i386/compile/DIAGO
? arch/i386/compile/INSTALL_FLOPPY
? arch/i386/compile/MOOMOO
? arch/i386/conf/DIAGO
? arch/i386/conf/GENERIC_DIAGO
? arch/i386/conf/MOOMOO
? kern/.kern_sleepq.c.swo
? kern/.kern_sleepq.c.swp
? kern/ktrace.out
? kern/q
? kern/wchan.diff.txt
Index: compat/sa/compat_sa.c
===================================================================
RCS file: /cvsroot/src/sys/compat/sa/compat_sa.c,v
retrieving revision 1.7
diff -u -p -r1.7 compat_sa.c
--- compat/sa/compat_sa.c 1 Nov 2008 05:59:33 -0000 1.7
+++ compat/sa/compat_sa.c 10 Nov 2008 23:39:42 -0000
@@ -43,6 +43,8 @@
#include "opt_sa.h"
__KERNEL_RCSID(0, "$NetBSD$");
+#define QUEUEDEBUG 1
+
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/cpu.h>
@@ -105,7 +107,7 @@ static inline int sast_compare(struct sa
static int sa_increaseconcurrency(struct lwp *, int);
#endif
static void sa_switchcall(void *);
-static void sa_neverrun(void *);
+static void sa_cache_lingerer(void *);
static int sa_newcachelwp(struct lwp *, struct sadata_vp *);
static void sa_makeupcalls(struct lwp *, struct sadata_upcall *);
@@ -118,6 +120,8 @@ static void sa_upcall_getstate(union sau
void sa_putcachelwp(struct proc *, struct lwp *);
struct lwp *sa_getcachelwp(struct proc *, struct sadata_vp *);
static void sa_setrunning(struct lwp *);
+static u_int sa_lwpcache_unsleep(lwp_t *, bool);
+static u_int sa_lwpublk_unsleep(lwp_t *, bool);
#define SA_DEBUG
@@ -130,9 +134,17 @@ int sadebug = 0;
#define DPRINTFN(n,x)
#endif
-static syncobj_t sa_sobj = {
+static syncobj_t sa_lwpcache_sobj = {
+ SOBJ_SLEEPQ_FIFO,
+ sa_lwpcache_unsleep,
+ sleepq_changepri,
+ sleepq_lendpri,
+ syncobj_noowner,
+};
+
+static syncobj_t sa_lwpublk_sobj = {
SOBJ_SLEEPQ_FIFO,
- sleepq_unsleep,
+ sa_lwpublk_unsleep,
sleepq_changepri,
sleepq_lendpri,
syncobj_noowner,
@@ -150,6 +162,10 @@ static const char *sa_lwpwoken_wmesg = "
(l)->l_pflag ^= (f); \
} while (/*CONSTCOND*/ 0)
+#define SA_LWP_SHOULD_EXIT(l, p) \
+ (((l)->l_flag & (LW_WEXIT|LW_WCORE)) \
+ || ((p)->p_sflag & (PS_WCORE|PS_WEXIT)))
+
RB_PROTOTYPE(sasttree, sastack, sast_node, sast_compare);
RB_GENERATE(sasttree, sastack, sast_node, sast_compare);
@@ -870,7 +886,7 @@ sa_increaseconcurrency(struct lwp *l, in
mutex_exit(&vp->savp_mutex);
lwp_lock(l2);
l2->l_savp = l->l_savp;
- cpu_setfunc(l2, sa_neverrun, NULL);
+ cpu_setfunc(l2, sa_cache_lingerer, NULL);
lwp_unlock(l2);
mutex_enter(&l->l_savp->savp_mutex);
sa_putcachelwp(p, l2);
@@ -1471,7 +1487,7 @@ sa_switch(struct lwp *l)
DPRINTFN(4,("sa_switch(%d.%d VP %d)\n", p->p_pid, l->l_lid,
vp->savp_lwp ? vp->savp_lwp->l_lid : 0));
- if ((l->l_flag & LW_WEXIT) || (p->p_sflag & (PS_WCORE | PS_WEXIT))) {
+ if (SA_LWP_SHOULD_EXIT(l, p)) {
mi_switch(l);
return;
}
@@ -1566,7 +1582,6 @@ sa_switch(struct lwp *l)
*/
l2 = sa_getcachelwp(p, vp);
if (l2 == NULL) {
- /* XXXSMP */
/* No upcall for you! */
/* XXX The consequences of this are more subtle and
* XXX the recovery from this situation deserves
@@ -1574,9 +1589,15 @@ sa_switch(struct lwp *l)
*/
/* XXXUPSXXX Should only happen with concurrency > 1 */
+ /*
+ * WRS: can also happen if SIGCONT or SIGSTOP woke
+	 * (all) our cached thread(s) and it/they haven't
+	 * gotten back to sleep yet.
+ */
+ if (vp->savp_sleeper_upcall == NULL)
+ vp->savp_sleeper_upcall = sau;
mutex_exit(&vp->savp_mutex);
mi_switch(l);
- sadata_upcall_free(sau);
return;
}
@@ -1598,7 +1619,7 @@ sa_switch(struct lwp *l)
*/
if ((l->l_flag & LP_SA_PAGEFAULT) && sa_pagefault(l,
&sau->sau_event.ss_captured.ss_ctx) != 0) {
- cpu_setfunc(l2, sa_neverrun, NULL);
+ cpu_setfunc(l2, sa_cache_lingerer, NULL);
sa_putcachelwp(p, l2); /* uvm_lwp_hold from
sa_getcachelwp */
mutex_exit(&vp->savp_mutex);
DPRINTFN(4,("sa_switch(%d.%d) Pagefault\n",
@@ -1662,20 +1683,45 @@ sa_switch(struct lwp *l)
}
/*
- * sa_neverrun
+ * sa_cache_lingerer
*
- * Routine for threads that have never run. Calls lwp_exit.
- * New, never-run cache threads get pointed at this routine, which just runs
- * and calls lwp_exit().
+ * Routine for threads that have never run. Hangs around until
+ * either the lwp gets used (and gets set to a differet routine) or
+ * the process exits. Since our sleep is interruptable, we have to
+ * catch when the signal code wakes us (usually in resoinse to a SIGCONT).
+ * If we get woken too soon, just put ourselves back to sleep.
*/
static void
-sa_neverrun(void *arg)
+sa_cache_lingerer(void *arg)
{
struct lwp *l;
+ struct proc *p;
+ struct sadata_vp *vp;
+ sleepq_t *sq;
+ int f;
l = curlwp;
+ p = l->l_proc;
+
+ vp = l->l_savp;
+ sq = &vp->savp_lwpcache;
- DPRINTFN(1,("sa_neverrun(%d.%d %x) exiting\n", l->l_proc->p_pid,
+ while (!(SA_LWP_SHOULD_EXIT(l, p))) {
+ /* Not time to go, back to sleep with us */
+ mutex_enter(&vp->savp_mutex);
+printf("sa_cache_lingerer(%d.%d) cachecount %d\n", l->l_proc->p_pid,
l->l_lid,vp->savp_lwpcache_count);
+ sleepq_enter(&vp->savp_woken, l, &vp->savp_mutex);
+ sleepq_enqueue(sq, (void *)sq, sa_lwpcache_wmesg,
&sa_lwpcache_sobj);
+ vp->savp_lwpcache_count++;
+ SA_LWP_STATE_LOCK(l, f);
+ l->l_flag &= ~LW_PENDSIG;
+ sleepq_block(0, true);
+DPRINTF(("sa_cache_lingerer(%d.%d): back out, l %p \n", l->l_proc->p_pid,
+ l->l_lid, l));
+ SA_LWP_STATE_UNLOCK(l, f);
+ }
+
+ DPRINTFN(1,("sa_cache_lingerer(%d.%d %x) exiting\n", l->l_proc->p_pid,
l->l_lid, l->l_flag));
lwp_exit(l);
@@ -1707,7 +1753,7 @@ sa_switchcall(void *arg)
lwp_lock(l2);
KASSERT(vp->savp_lwp == l2);
- if ((l2->l_flag & LW_WEXIT) || (p->p_sflag & (PS_WCORE | PS_WEXIT))) {
+ if (SA_LWP_SHOULD_EXIT(l2, p)) {
lwp_unlock(l2);
sadata_upcall_free(sau);
lwp_exit(l2);
@@ -1814,7 +1860,7 @@ sa_newcachelwp(struct lwp *l, struct sad
}
error = lwp_create(l, p, uaddr, inmem, 0, NULL, 0,
- sa_neverrun, NULL, &l2, l->l_class);
+ sa_cache_lingerer, NULL, &l2, l->l_class);
if (error) {
uvm_uarea_free(uaddr, curcpu());
return error;
@@ -1863,8 +1909,7 @@ sa_putcachelwp(struct proc *p, struct lw
p->p_nlwps--;
l->l_prflag |= LPR_DETACHED;
#endif
- l->l_flag |= LW_SA;
- membar_producer();
+ l->l_flag |= (LW_SA | LW_SINTR);
DPRINTFN(5,("sa_putcachelwp(%d.%d) Adding LWP %d to cache\n",
p->p_pid, curlwp->l_lid, l->l_lid));
@@ -1879,11 +1924,11 @@ sa_putcachelwp(struct proc *p, struct lw
/*
* XXXWRS: Following is a hand-rolled call of the form:
- * sleepq_enqueue(sq, (void *)sq, "lwpcache", sa_sobj); but
+ * sleepq_enqueue(sq, (void *)sq, "lwpcache", sa_lwpcache_sobj); but
* hand-done since l might not be curlwp.
*/
- l->l_syncobj = &sa_sobj;
+ l->l_syncobj = &sa_lwpcache_sobj;
l->l_wchan = sq;
l->l_sleepq = sq;
l->l_wmesg = sa_lwpcache_wmesg;
@@ -1892,7 +1937,38 @@ sa_putcachelwp(struct proc *p, struct lw
l->l_sleeperr = 0;
vp->savp_lwpcache_count++;
- sleepq_insert(sq, l, &sa_sobj);
+ sleepq_insert(sq, l, &sa_lwpcache_sobj);
+ membar_producer();
+}
+
+/*
+ * sa_{lwpcache,lwpublk}_unsleep
+ *
+ * Another part of the kernel wants to wake up a thread
+ * in one of our special sleep queues. Since we keep a
+ * count of threads we've put in said queues, adjust it
+ * as part of removing it.
+ */
+static u_int
+sa_lwpcache_unsleep(lwp_t *l, bool cleanup)
+{
+ DPRINTFN(3,("sa_lwpcache_unsleep(%d.%d), flags %x count %d\n",
+ l->l_proc->p_pid, l->l_lid,
+ l->l_flag, l->l_savp->savp_lwpcache_count));
+ KASSERT(l->l_savp->savp_lwpcache_count > 0);
+ l->l_savp->savp_lwpcache_count--;
+ return sleepq_unsleep(l, cleanup);
+}
+
+static u_int
+sa_lwpublk_unsleep(lwp_t *l, bool cleanup)
+{
+ DPRINTFN(3,("sa_lwpcache_unsleep(%d.%d), flags %x count %d\n",
+ l->l_proc->p_pid, l->l_lid,
+ l->l_flag, l->l_savp->savp_woken_count));
+ KASSERT(l->l_savp->savp_woken_count > 0);
+ l->l_savp->savp_woken_count--;
+ return sleepq_unsleep(l, cleanup);
}
/*
@@ -1907,7 +1983,9 @@ sa_getcachelwp(struct proc *p, struct sa
sleepq_t *sq = &vp->savp_lwpcache;
KASSERT(mutex_owned(&vp->savp_mutex));
- KASSERT(vp->savp_lwpcache_count > 0);
+
+ if (vp->savp_lwpcache_count == 0)
+ return NULL;
vp->savp_lwpcache_count--;
l= TAILQ_FIRST(sq);
@@ -1922,7 +2000,7 @@ sa_getcachelwp(struct proc *p, struct sa
l->l_syncobj = &sched_syncobj;
l->l_wchan = NULL;
l->l_sleepq = NULL;
- l->l_flag &= ~LW_SINTR;
+ l->l_flag &= ~(LW_SINTR | LW_PENDSIG);
#if 0 /* Not now, for now leave lwps in lwp list */
LIST_INSERT_HEAD(&p->p_lwps, l, l_sibling);
@@ -2062,6 +2140,9 @@ sa_upcall_userret(struct lwp *l)
* we deliver the call.
*/
l2 = TAILQ_FIRST(&vp->savp_woken);
+DPRINTF(("sa_userret got l2 %p on count %d\n", l2, vp->savp_woken_count));
+DPRINTF(("sa_userret(%d.%d): just pulled l %p id %d mutex good %d\n",
l->l_proc->p_pid,
+ l->l_lid, l2, l2->l_lid, l2->l_mutex == &vp->savp_mutex));
TAILQ_REMOVE(&vp->savp_woken, l2, l_sleepchain);
vp->savp_woken_count--;
mutex_exit(&vp->savp_mutex);
@@ -2069,8 +2150,7 @@ sa_upcall_userret(struct lwp *l)
DPRINTFN(9,("sa_upcall_userret(%d.%d) using stack %p\n",
l->l_proc->p_pid, l->l_lid, sast->sast_stack.ss_sp));
- if ((l->l_flag & LW_WEXIT)
- || (p->p_sflag & (PS_WCORE | PS_WEXIT))) {
+ if (SA_LWP_SHOULD_EXIT(l, p)) {
lwp_exit(l);
/* NOTREACHED */
}
@@ -2079,8 +2159,7 @@ sa_upcall_userret(struct lwp *l)
p->p_pid, l->l_lid, l2->l_lid));
sau = sadata_upcall_alloc(1);
- if ((l->l_flag & LW_WEXIT)
- || (p->p_sflag & (PS_WCORE | PS_WEXIT))) {
+ if (SA_LWP_SHOULD_EXIT(l, p)) {
sadata_upcall_free(sau);
lwp_exit(l);
/* NOTREACHED */
@@ -2097,7 +2176,7 @@ sa_upcall_userret(struct lwp *l)
l2->l_wchan = sq;
l2->l_wmesg = sa_lwpcache_wmesg;
vp->savp_lwpcache_count++;
- sleepq_insert(sq, l2, &sa_sobj);
+ sleepq_insert(sq, l2, &sa_lwpcache_sobj);
/* uvm_lwp_hold from sa_unblock_userret */
} else if (sast)
sa_setstackfree(sast, sa);
@@ -2207,7 +2286,6 @@ sa_makeupcalls(struct lwp *l, struct sad
sas[2] = &sau->sau_interrupted.ss_captured.ss_sa;
nint = 1;
}
-#if 0
/* For now, limit ourselves to one unblock at once. */
if (sau->sau_type == SA_UPCALL_UNBLOCKED) {
mutex_enter(&vp->savp_mutex);
@@ -2215,7 +2293,6 @@ sa_makeupcalls(struct lwp *l, struct sad
mutex_exit(&vp->savp_mutex);
/* XXX WRS Need to limit # unblocks we copy out at once! */
}
-#endif
/* Copy out the activation's ucontext */
up = (void *)STACK_ALLOC(stack, ucsize);
@@ -2290,8 +2367,9 @@ sa_makeupcalls(struct lwp *l, struct sad
sq = &vp->savp_lwpcache;
l2->l_wchan = sq;
l2->l_wmesg = sa_lwpcache_wmesg;
+ cpu_setfunc(l2, sa_cache_lingerer, NULL);
vp->savp_lwpcache_count++;
- sleepq_insert(sq, l2, &sa_sobj);
+ sleepq_insert(sq, l2, &sa_lwpcache_sobj);
/* uvm_lwp_hold from sa_unblock_userret */
mutex_exit(&vp->savp_mutex);
@@ -2392,13 +2470,13 @@ sa_unblock_userret(struct lwp *l)
struct proc *p;
struct sadata *sa;
struct sadata_vp *vp;
- int swapper;
+ int swapper, do_once = 1;
p = l->l_proc;
sa = p->p_sa;
vp = l->l_savp;
- if ((l->l_flag & LW_WEXIT) || (p->p_sflag & (PS_WCORE | PS_WEXIT)))
+ if (SA_LWP_SHOULD_EXIT(l, p))
return;
if ((l->l_flag & LW_SA_BLOCKING) == 0)
@@ -2410,6 +2488,7 @@ sa_unblock_userret(struct lwp *l)
p = l->l_proc;
sa = p->p_sa;
vp = l->l_savp;
+retry:
vp_lwp = vp->savp_lwp;
l2 = NULL;
swapper = 0;
@@ -2527,8 +2606,13 @@ sa_unblock_userret(struct lwp *l)
*/
sleepq_enter(&vp->savp_woken, l, &vp->savp_mutex);
sleepq_enqueue(&vp->savp_woken, &vp->savp_woken, sa_lwpwoken_wmesg,
- &sa_sobj);
- uvm_lwp_hold(l);
+ &sa_lwpublk_sobj);
+ l->l_flag |= LW_SINTR;
+ l->l_flag &= ~LW_PENDSIG;
+ if (do_once) {
+ uvm_lwp_hold(l);
+ do_once = 0;
+ }
vp->savp_woken_count++;
//l->l_stat = LSSUSPENDED;
mi_switch(l);
@@ -2542,6 +2626,21 @@ sa_unblock_userret(struct lwp *l)
* In the normal lwp lifecycle, cpu_setfunc() will make this lwp
* run in a different routine by the time we next run.
*/
+
+printf("unblock back from miswitch\n");
+ if (!(SA_LWP_SHOULD_EXIT(l, p))) {
+ /*
+ * Not time to go, back to sleep with us. However
+ * the blessed LWP may need a wakeup again to notice
+ * us. So try this routine again.
+ */
+ DPRINTFN(3,("sa_unblock_userret(%d.%d) going around again"
+ ", flags %x, vp %d\n", l->l_proc->p_pid, l->l_lid,
+ l->l_flag, vp_lwp->l_lid));
+ goto retry;
+ }
+
+printf("unblock exiting\n");
lwp_exit(l);
/* NOTREACHED */
}
Index: kern/kern_exit.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_exit.c,v
retrieving revision 1.215
diff -u -p -r1.215 kern_exit.c
--- kern/kern_exit.c 1 Nov 2008 05:59:33 -0000 1.215
+++ kern/kern_exit.c 10 Nov 2008 23:39:48 -0000
@@ -595,34 +595,33 @@ exit_lwps(struct lwp *l)
p = l->l_proc;
KASSERT(mutex_owned(p->p_lock));
-#ifdef KERN_SA
+#if 0
if (p->p_sa != NULL) {
- struct sadata_vp *vp;
SLIST_FOREACH(vp, &p->p_sa->sa_vps, savp_next) {
/*
- * Make SA-cached LWPs normal process interruptable
- * so that the exit code can wake them. Locking
- * savp_mutex locks all the lwps on this vp that
- * we need to adjust.
+ * Make SA-cached LWPs normal process runnable
+ * LWPs so that they'll also self-destruct.
*/
- mutex_enter(&vp->savp_mutex);
DPRINTF(("exit_lwps: Making cached LWPs of %d on "
- "VP %d interruptable: ", p->p_pid, vp->savp_id));
- TAILQ_FOREACH(l2, &vp->savp_lwpcache, l_sleepchain) {
- l2->l_flag |= LW_SINTR;
- DPRINTF(("%d ", l2->l_lid));
- }
- DPRINTF(("\n"));
+ "VP %d runnable: ", p->p_pid, vp->savp_id));
+ while ((l2 = sa_getcachelwp(p, vp)) != 0) {
+ lwp_lock(l2);
+ l2->l_flag = (l2->l_flag & ~LW_SA) | LW_WEXIT;
+ l2->l_priority = MAXPRI_USER; /* XXX WRS needs
thought */
- DPRINTF(("exit_lwps: Making unblocking LWPs of %d on "
- "VP %d interruptable: ", p->p_pid, vp->savp_id));
- TAILQ_FOREACH(l2, &vp->savp_woken, l_sleepchain) {
- vp->savp_woken_count--;
- l2->l_flag |= LW_SINTR;
+ /* setrunnable() will release the mutex. */
+ setrunnable(l2);
DPRINTF(("%d ", l2->l_lid));
}
DPRINTF(("\n"));
- mutex_exit(&vp->savp_mutex);
+
+ /*
+ * Clear wokenq, the LWPs on the queue will
+	 * run below. Works because these threads are still
+ * on the p_lwps list (even though they are no longer
+ * counted).
+ */
+ TAILQ_INIT(&vp->savp_woken);
}
}
#endif
Attachment:
pgpGOVYypATAK.pgp
Description: PGP signature