I'm retooling how threads in SA sleep, and I've run into a weird issue. With the attached WIP diff (complete with hard-coded printfs), I've had weird stuff happen when I resume a suspended program. With the new diff, I have specific unsleep operations for threads in lwpcache and lwpublk. These are sleepqueues, and essentially home-grown condvars. The main reason to not use condvars is so that sa_switch() won't run into locking issues when it wakes the waiting thread. All the home-grown unsleep routines do is decrement the counts of threads in state X then call the normal sleepq_unsleep(). A test that's failing is I have an empty named config. On boot, named starts then exits. When it's exiting, I get an assert failure in sa_lwpublk_unsleep(). The problem is that I don't have any threads in the unblock queue, so I don't see why that routine's getting called. I have a running thread, one in select, one in sigwait, and two in lwpcache. Any suggestions as to what I'm doing wrong? I don't see why the wait state's correct but the sleepq object calls are wrong. ?? Take care, Bill
? ID ? Rejects ? StallOut ? cscope.out ? plog ? secondUpdate ? arch/cscope.out ? arch/i386/compile/DIAGO ? arch/i386/compile/INSTALL_FLOPPY ? arch/i386/compile/MOOMOO ? arch/i386/conf/DIAGO ? arch/i386/conf/GENERIC_DIAGO ? arch/i386/conf/MOOMOO ? kern/.kern_sleepq.c.swo ? kern/.kern_sleepq.c.swp ? kern/ktrace.out ? kern/q ? kern/wchan.diff.txt Index: compat/sa/compat_sa.c =================================================================== RCS file: /cvsroot/src/sys/compat/sa/compat_sa.c,v retrieving revision 1.7 diff -u -p -r1.7 compat_sa.c --- compat/sa/compat_sa.c 1 Nov 2008 05:59:33 -0000 1.7 +++ compat/sa/compat_sa.c 10 Nov 2008 23:39:42 -0000 @@ -43,6 +43,8 @@ #include "opt_sa.h" __KERNEL_RCSID(0, "$NetBSD$"); +#define QUEUEDEBUG 1 + #include <sys/param.h> #include <sys/systm.h> #include <sys/cpu.h> @@ -105,7 +107,7 @@ static inline int sast_compare(struct sa static int sa_increaseconcurrency(struct lwp *, int); #endif static void sa_switchcall(void *); -static void sa_neverrun(void *); +static void sa_cache_lingerer(void *); static int sa_newcachelwp(struct lwp *, struct sadata_vp *); static void sa_makeupcalls(struct lwp *, struct sadata_upcall *); @@ -118,6 +120,8 @@ static void sa_upcall_getstate(union sau void sa_putcachelwp(struct proc *, struct lwp *); struct lwp *sa_getcachelwp(struct proc *, struct sadata_vp *); static void sa_setrunning(struct lwp *); +static u_int sa_lwpcache_unsleep(lwp_t *, bool); +static u_int sa_lwpublk_unsleep(lwp_t *, bool); #define SA_DEBUG @@ -130,9 +134,17 @@ int sadebug = 0; #define DPRINTFN(n,x) #endif -static syncobj_t sa_sobj = { +static syncobj_t sa_lwpcache_sobj = { + SOBJ_SLEEPQ_FIFO, + sa_lwpcache_unsleep, + sleepq_changepri, + sleepq_lendpri, + syncobj_noowner, +}; + +static syncobj_t sa_lwpublk_sobj = { SOBJ_SLEEPQ_FIFO, - sleepq_unsleep, + sa_lwpublk_unsleep, sleepq_changepri, sleepq_lendpri, syncobj_noowner, @@ -150,6 +162,10 @@ static const char *sa_lwpwoken_wmesg = " (l)->l_pflag ^= (f); \ } while 
(/*CONSTCOND*/ 0) +#define SA_LWP_SHOULD_EXIT(l, p) \ + (((l)->l_flag & (LW_WEXIT|LW_WCORE)) \ + || ((p)->p_sflag & (PS_WCORE|PS_WEXIT))) + RB_PROTOTYPE(sasttree, sastack, sast_node, sast_compare); RB_GENERATE(sasttree, sastack, sast_node, sast_compare); @@ -870,7 +886,7 @@ sa_increaseconcurrency(struct lwp *l, in mutex_exit(&vp->savp_mutex); lwp_lock(l2); l2->l_savp = l->l_savp; - cpu_setfunc(l2, sa_neverrun, NULL); + cpu_setfunc(l2, sa_cache_lingerer, NULL); lwp_unlock(l2); mutex_enter(&l->l_savp->savp_mutex); sa_putcachelwp(p, l2); @@ -1471,7 +1487,7 @@ sa_switch(struct lwp *l) DPRINTFN(4,("sa_switch(%d.%d VP %d)\n", p->p_pid, l->l_lid, vp->savp_lwp ? vp->savp_lwp->l_lid : 0)); - if ((l->l_flag & LW_WEXIT) || (p->p_sflag & (PS_WCORE | PS_WEXIT))) { + if (SA_LWP_SHOULD_EXIT(l, p)) { mi_switch(l); return; } @@ -1566,7 +1582,6 @@ sa_switch(struct lwp *l) */ l2 = sa_getcachelwp(p, vp); if (l2 == NULL) { - /* XXXSMP */ /* No upcall for you! */ /* XXX The consequences of this are more subtle and * XXX the recovery from this situation deserves @@ -1574,9 +1589,15 @@ sa_switch(struct lwp *l) */ /* XXXUPSXXX Should only happen with concurrency > 1 */ + /* + * WRS: can also happen if SIGCONT or SIGSTOP woke + * (all) our cached thread(s) and it/they haven't + * haven't gotten back to sleep yet. + */ + if (vp->savp_sleeper_upcall == NULL) + vp->savp_sleeper_upcall = sau; mutex_exit(&vp->savp_mutex); mi_switch(l); - sadata_upcall_free(sau); return; } @@ -1598,7 +1619,7 @@ sa_switch(struct lwp *l) */ if ((l->l_flag & LP_SA_PAGEFAULT) && sa_pagefault(l, &sau->sau_event.ss_captured.ss_ctx) != 0) { - cpu_setfunc(l2, sa_neverrun, NULL); + cpu_setfunc(l2, sa_cache_lingerer, NULL); sa_putcachelwp(p, l2); /* uvm_lwp_hold from sa_getcachelwp */ mutex_exit(&vp->savp_mutex); DPRINTFN(4,("sa_switch(%d.%d) Pagefault\n", @@ -1662,20 +1683,45 @@ sa_switch(struct lwp *l) } /* - * sa_neverrun + * sa_cache_lingerer * - * Routine for threads that have never run. Calls lwp_exit. 
- * New, never-run cache threads get pointed at this routine, which just runs - * and calls lwp_exit(). + * Routine for threads that have never run. Hangs around until + * either the lwp gets used (and gets set to a differet routine) or + * the process exits. Since our sleep is interruptable, we have to + * catch when the signal code wakes us (usually in resoinse to a SIGCONT). + * If we get woken too soon, just put ourselves back to sleep. */ static void -sa_neverrun(void *arg) +sa_cache_lingerer(void *arg) { struct lwp *l; + struct proc *p; + struct sadata_vp *vp; + sleepq_t *sq; + int f; l = curlwp; + p = l->l_proc; + + vp = l->l_savp; + sq = &vp->savp_lwpcache; - DPRINTFN(1,("sa_neverrun(%d.%d %x) exiting\n", l->l_proc->p_pid, + while (!(SA_LWP_SHOULD_EXIT(l, p))) { + /* Not time to go, back to sleep with us */ + mutex_enter(&vp->savp_mutex); +printf("sa_cache_lingerer(%d.%d) cachecount %d\n", l->l_proc->p_pid, l->l_lid,vp->savp_lwpcache_count); + sleepq_enter(&vp->savp_woken, l, &vp->savp_mutex); + sleepq_enqueue(sq, (void *)sq, sa_lwpcache_wmesg, &sa_lwpcache_sobj); + vp->savp_lwpcache_count++; + SA_LWP_STATE_LOCK(l, f); + l->l_flag &= ~LW_PENDSIG; + sleepq_block(0, true); +DPRINTF(("sa_cache_lingerer(%d.%d): back out, l %p \n", l->l_proc->p_pid, + l->l_lid, l)); + SA_LWP_STATE_UNLOCK(l, f); + } + + DPRINTFN(1,("sa_cache_lingerer(%d.%d %x) exiting\n", l->l_proc->p_pid, l->l_lid, l->l_flag)); lwp_exit(l); @@ -1707,7 +1753,7 @@ sa_switchcall(void *arg) lwp_lock(l2); KASSERT(vp->savp_lwp == l2); - if ((l2->l_flag & LW_WEXIT) || (p->p_sflag & (PS_WCORE | PS_WEXIT))) { + if (SA_LWP_SHOULD_EXIT(l2, p)) { lwp_unlock(l2); sadata_upcall_free(sau); lwp_exit(l2); @@ -1814,7 +1860,7 @@ sa_newcachelwp(struct lwp *l, struct sad } error = lwp_create(l, p, uaddr, inmem, 0, NULL, 0, - sa_neverrun, NULL, &l2, l->l_class); + sa_cache_lingerer, NULL, &l2, l->l_class); if (error) { uvm_uarea_free(uaddr, curcpu()); return error; @@ -1863,8 +1909,7 @@ sa_putcachelwp(struct proc *p, 
struct lw p->p_nlwps--; l->l_prflag |= LPR_DETACHED; #endif - l->l_flag |= LW_SA; - membar_producer(); + l->l_flag |= (LW_SA | LW_SINTR); DPRINTFN(5,("sa_putcachelwp(%d.%d) Adding LWP %d to cache\n", p->p_pid, curlwp->l_lid, l->l_lid)); @@ -1879,11 +1924,11 @@ sa_putcachelwp(struct proc *p, struct lw /* * XXXWRS: Following is a hand-rolled call of the form: - * sleepq_enqueue(sq, (void *)sq, "lwpcache", sa_sobj); but + * sleepq_enqueue(sq, (void *)sq, "lwpcache", sa_lwpcache_sobj); but * hand-done since l might not be curlwp. */ - l->l_syncobj = &sa_sobj; + l->l_syncobj = &sa_lwpcache_sobj; l->l_wchan = sq; l->l_sleepq = sq; l->l_wmesg = sa_lwpcache_wmesg; @@ -1892,7 +1937,38 @@ sa_putcachelwp(struct proc *p, struct lw l->l_sleeperr = 0; vp->savp_lwpcache_count++; - sleepq_insert(sq, l, &sa_sobj); + sleepq_insert(sq, l, &sa_lwpcache_sobj); + membar_producer(); +} + +/* + * sa_{lwpcache,lwpblk}_unsleep + * + * Another part of the kernel wants to wake up a thread + * in one of our special sleep queues. Since we keep a + * count of threads we've put in said queues, adjust it + * as part of removing it. 
+ */ +static u_int +sa_lwpcache_unsleep(lwp_t *l, bool cleanup) +{ + DPRINTFN(3,("sa_lwpcache_unsleep(%d.%d), flags %x count %d\n", + l->l_proc->p_pid, l->l_lid, + l->l_flag, l->l_savp->savp_lwpcache_count)); + KASSERT(l->l_savp->savp_lwpcache_count > 0); + l->l_savp->savp_lwpcache_count--; + return sleepq_unsleep(l, cleanup); +} + +static u_int +sa_lwpublk_unsleep(lwp_t *l, bool cleanup) +{ + DPRINTFN(3,("sa_lwpcache_unsleep(%d.%d), flags %x count %d\n", + l->l_proc->p_pid, l->l_lid, + l->l_flag, l->l_savp->savp_woken_count)); + KASSERT(l->l_savp->savp_woken_count > 0); + l->l_savp->savp_woken_count--; + return sleepq_unsleep(l, cleanup); } /* @@ -1907,7 +1983,9 @@ sa_getcachelwp(struct proc *p, struct sa sleepq_t *sq = &vp->savp_lwpcache; KASSERT(mutex_owned(&vp->savp_mutex)); - KASSERT(vp->savp_lwpcache_count > 0); + + if (vp->savp_lwpcache_count == 0) + return NULL; vp->savp_lwpcache_count--; l= TAILQ_FIRST(sq); @@ -1922,7 +2000,7 @@ sa_getcachelwp(struct proc *p, struct sa l->l_syncobj = &sched_syncobj; l->l_wchan = NULL; l->l_sleepq = NULL; - l->l_flag &= ~LW_SINTR; + l->l_flag &= ~(LW_SINTR | LW_PENDSIG); #if 0 /* Not now, for now leave lwps in lwp list */ LIST_INSERT_HEAD(&p->p_lwps, l, l_sibling); @@ -2062,6 +2140,9 @@ sa_upcall_userret(struct lwp *l) * we deliver the call. 
*/ l2 = TAILQ_FIRST(&vp->savp_woken); +DPRINTF(("sa_userret got l2 %p on count %d\n", l2, vp->savp_woken_count)); +DPRINTF(("sa_userret(%d.%d): just pulled l %p id %d mutex good %d\n", l->l_proc->p_pid, + l->l_lid, l2, l2->l_lid, l2->l_mutex == &vp->savp_mutex)); TAILQ_REMOVE(&vp->savp_woken, l2, l_sleepchain); vp->savp_woken_count--; mutex_exit(&vp->savp_mutex); @@ -2069,8 +2150,7 @@ sa_upcall_userret(struct lwp *l) DPRINTFN(9,("sa_upcall_userret(%d.%d) using stack %p\n", l->l_proc->p_pid, l->l_lid, sast->sast_stack.ss_sp)); - if ((l->l_flag & LW_WEXIT) - || (p->p_sflag & (PS_WCORE | PS_WEXIT))) { + if (SA_LWP_SHOULD_EXIT(l, p)) { lwp_exit(l); /* NOTREACHED */ } @@ -2079,8 +2159,7 @@ sa_upcall_userret(struct lwp *l) p->p_pid, l->l_lid, l2->l_lid)); sau = sadata_upcall_alloc(1); - if ((l->l_flag & LW_WEXIT) - || (p->p_sflag & (PS_WCORE | PS_WEXIT))) { + if (SA_LWP_SHOULD_EXIT(l, p)) { sadata_upcall_free(sau); lwp_exit(l); /* NOTREACHED */ @@ -2097,7 +2176,7 @@ sa_upcall_userret(struct lwp *l) l2->l_wchan = sq; l2->l_wmesg = sa_lwpcache_wmesg; vp->savp_lwpcache_count++; - sleepq_insert(sq, l2, &sa_sobj); + sleepq_insert(sq, l2, &sa_lwpcache_sobj); /* uvm_lwp_hold from sa_unblock_userret */ } else if (sast) sa_setstackfree(sast, sa); @@ -2207,7 +2286,6 @@ sa_makeupcalls(struct lwp *l, struct sad sas[2] = &sau->sau_interrupted.ss_captured.ss_sa; nint = 1; } -#if 0 /* For now, limit ourselves to one unblock at once. */ if (sau->sau_type == SA_UPCALL_UNBLOCKED) { mutex_enter(&vp->savp_mutex); @@ -2215,7 +2293,6 @@ sa_makeupcalls(struct lwp *l, struct sad mutex_exit(&vp->savp_mutex); /* XXX WRS Need to limit # unblocks we copy out at once! 
*/ } -#endif /* Copy out the activation's ucontext */ up = (void *)STACK_ALLOC(stack, ucsize); @@ -2290,8 +2367,9 @@ sa_makeupcalls(struct lwp *l, struct sad sq = &vp->savp_lwpcache; l2->l_wchan = sq; l2->l_wmesg = sa_lwpcache_wmesg; + cpu_setfunc(l2, sa_cache_lingerer, NULL); vp->savp_lwpcache_count++; - sleepq_insert(sq, l2, &sa_sobj); + sleepq_insert(sq, l2, &sa_lwpcache_sobj); /* uvm_lwp_hold from sa_unblock_userret */ mutex_exit(&vp->savp_mutex); @@ -2392,13 +2470,13 @@ sa_unblock_userret(struct lwp *l) struct proc *p; struct sadata *sa; struct sadata_vp *vp; - int swapper; + int swapper, do_once = 1; p = l->l_proc; sa = p->p_sa; vp = l->l_savp; - if ((l->l_flag & LW_WEXIT) || (p->p_sflag & (PS_WCORE | PS_WEXIT))) + if (SA_LWP_SHOULD_EXIT(l, p)) return; if ((l->l_flag & LW_SA_BLOCKING) == 0) @@ -2410,6 +2488,7 @@ sa_unblock_userret(struct lwp *l) p = l->l_proc; sa = p->p_sa; vp = l->l_savp; +retry: vp_lwp = vp->savp_lwp; l2 = NULL; swapper = 0; @@ -2527,8 +2606,13 @@ sa_unblock_userret(struct lwp *l) */ sleepq_enter(&vp->savp_woken, l, &vp->savp_mutex); sleepq_enqueue(&vp->savp_woken, &vp->savp_woken, sa_lwpwoken_wmesg, - &sa_sobj); - uvm_lwp_hold(l); + &sa_lwpublk_sobj); + l->l_flag |= LW_SINTR; + l->l_flag &= ~LW_PENDSIG; + if (do_once) { + uvm_lwp_hold(l); + do_once = 0; + } vp->savp_woken_count++; //l->l_stat = LSSUSPENDED; mi_switch(l); @@ -2542,6 +2626,21 @@ sa_unblock_userret(struct lwp *l) * In the normal lwp lifecycle, cpu_setfunc() will make this lwp * run in a different routine by the time we next run. */ + +printf("unblock back from miswitch\n"); + if (!(SA_LWP_SHOULD_EXIT(l, p))) { + /* + * Not time to go, back to sleep with us. However + * the blessed LWP may need a wakeup again to notice + * us. So try this routine again. 
+ */ + DPRINTFN(3,("sa_unblock_userret(%d.%d) going around again" + ", flags %x, vp %d\n", l->l_proc->p_pid, l->l_lid, + l->l_flag, vp_lwp->l_lid)); + goto retry; + } + +printf("unblock exiting\n"); lwp_exit(l); /* NOTREACHED */ } Index: kern/kern_exit.c =================================================================== RCS file: /cvsroot/src/sys/kern/kern_exit.c,v retrieving revision 1.215 diff -u -p -r1.215 kern_exit.c --- kern/kern_exit.c 1 Nov 2008 05:59:33 -0000 1.215 +++ kern/kern_exit.c 10 Nov 2008 23:39:48 -0000 @@ -595,34 +595,33 @@ exit_lwps(struct lwp *l) p = l->l_proc; KASSERT(mutex_owned(p->p_lock)); -#ifdef KERN_SA +#if 0 if (p->p_sa != NULL) { - struct sadata_vp *vp; SLIST_FOREACH(vp, &p->p_sa->sa_vps, savp_next) { /* - * Make SA-cached LWPs normal process interruptable - * so that the exit code can wake them. Locking - * savp_mutex locks all the lwps on this vp that - * we need to adjust. + * Make SA-cached LWPs normal process runnable + * LWPs so that they'll also self-destruct. */ - mutex_enter(&vp->savp_mutex); DPRINTF(("exit_lwps: Making cached LWPs of %d on " - "VP %d interruptable: ", p->p_pid, vp->savp_id)); - TAILQ_FOREACH(l2, &vp->savp_lwpcache, l_sleepchain) { - l2->l_flag |= LW_SINTR; - DPRINTF(("%d ", l2->l_lid)); - } - DPRINTF(("\n")); + "VP %d runnable: ", p->p_pid, vp->savp_id)); + while ((l2 = sa_getcachelwp(p, vp)) != 0) { + lwp_lock(l2); + l2->l_flag = (l2->l_flag & ~LW_SA) | LW_WEXIT; + l2->l_priority = MAXPRI_USER; /* XXX WRS needs thought */ - DPRINTF(("exit_lwps: Making unblocking LWPs of %d on " - "VP %d interruptable: ", p->p_pid, vp->savp_id)); - TAILQ_FOREACH(l2, &vp->savp_woken, l_sleepchain) { - vp->savp_woken_count--; - l2->l_flag |= LW_SINTR; + /* setrunnable() will release the mutex. */ + setrunnable(l2); DPRINTF(("%d ", l2->l_lid)); } DPRINTF(("\n")); - mutex_exit(&vp->savp_mutex); + + /* + * Clear wokenq, the LWPs on the queue will + * run below. 
Workes as these threads are still + * on the p_lwps list (even though they are no longer + * counted). + */ + TAILQ_INIT(&vp->savp_woken); } } #endif
Attachment:
pgpGOVYypATAK.pgp
Description: PGP signature