Subject: alternate rough SA patch (works on SMP)
To: None <tech-kern@netbsd.org>
From: Christian Limpach <chris@pin.lu>
List: tech-kern
Date: 06/24/2003 02:45:59
Quoting Stephan Uphoff <ups@stups.com>:

> I mangled Nathan's SA kernel code to hopefully squash a few bugs 
> without introducing new ones and added a modified patch from 
> YAMAMOTO Takashi.

I've also come up with a patch for pthread/SMP.  My test program is 
transcode.  Without the patch it would not even print its copyright line, 
now it manages to at least decode about 1000 frames of an mpeg2 stream and 
then end with SEGV or panic: fp_save ipi didn't[1].  I believe transcode's 
thread usage is not very clean since it also causes many thread warnings when 
run on cygwin, yet it shouldn't panic our kernel.

I can't claim to know the pthread/SMP code well, I've only poked around in it 
over the weekend.  Here's what I noticed and patched:

- remrunqueue panic: one cause seems to be case 2 of sa_switch when the sa_vp 
lwp is not runnable (possibly kern/21927)

- panic: SA VP is in state 7/8: this seems to happen when upcall_userret 
handles the SA_BLOCKING case and the sa_vp is LSONPROC (possibly kern/20367)

- SA_UPCALL set but sa_upcalls empty: I tried to lock everything that touches 
the SA_UPCALL flag but it still happens, my fix for this is definitely a hack.

Is this going in the right direction?

Index: arch/i386/include/userret.h
===================================================================
RCS file: /cvs/netbsd/src/sys/arch/i386/include/userret.h,v
retrieving revision 1.3
diff -u -r1.3 userret.h
--- arch/i386/include/userret.h	17 Jan 2003 23:10:29 -0000	1.3
+++ arch/i386/include/userret.h	23 Jun 2003 22:55:21 -0000
@@ -96,8 +96,11 @@
 		(p->p_userret)(l, p->p_userret_arg);
 
 	/* Invoke any pending upcalls. */
-	while (l->l_flag & L_SA_UPCALL)
+	if (l->l_flag & (L_SA_UPCALL | L_SA_BLOCKING)) {
 		sa_upcall_userret(l);
+                while (l->l_flag & L_SA_UPCALL)
+                        sa_upcall_userret(l);
+        }
 
 	curcpu()->ci_schedstate.spc_curpriority = l->l_priority = l->l_usrpri;
 }
Index: kern/kern_sa.c
===================================================================
RCS file: /cvs/netbsd/src/sys/kern/kern_sa.c,v
retrieving revision 1.16
diff -u -r1.16 kern_sa.c
--- kern/kern_sa.c	28 May 2003 22:17:20 -0000	1.16
+++ kern/kern_sa.c	23 Jun 2003 23:00:57 -0000
@@ -316,9 +316,14 @@
 	
 		SCHED_LOCK(s);
 		l2 = sa->sa_woken;
-		sa->sa_woken = NULL;
-		sa->sa_vp = NULL;
-		p->p_nrlwps--;
+		if (l2 && l2->l_stat != LSRUN) {
+			KDASSERT((l2->l_stat == LSONPROC) || (l2->l_stat == 
LSSUSPENDED));
+			l2 = NULL;
+		} else {
+			sa->sa_woken = NULL;
+			sa->sa_vp = NULL;
+			p->p_nrlwps--;
+		}
 		sa_putcachelwp(p, l);
 		KDASSERT((l2 == NULL) || (l2->l_proc == l->l_proc));
 		KDASSERT((l2 == NULL) || (l2->l_stat == LSRUN));
@@ -371,6 +376,7 @@
 	struct sadata_upcall *sau;
 	struct sadata *sa = l->l_proc->p_sa;
 	stack_t st;
+	int ret, s;
 
 	l->l_flag &= ~L_SA; /* XXX prevent recursive upcalls if we sleep for
 			      memory */
@@ -388,7 +394,10 @@
 	DPRINTFN(9,("sa_upcall(%d.%d) nstacks--   = %2d\n", 
 	    l->l_proc->p_pid, l->l_lid, sa->sa_nstacks));
 
-	return sa_upcall0(l, type, event, interrupted, argsize, arg, sau, 
&st);
+	SCHED_LOCK(s);
+	ret = sa_upcall0(l, type, event, interrupted, argsize, arg, sau, &st);
+	SCHED_UNLOCK(s);
+	return ret;
 }
 
 int
@@ -471,7 +480,7 @@
 	struct sadata_upcall *sau;
 	struct lwp *l2;
 	stack_t st;
-	int error;
+	int error, s;
 
 	DPRINTFN(4,("sa_switch(%d.%d type %d VP %d)\n", p->p_pid, l->l_lid,
 	    type, sa->sa_vp ? sa->sa_vp->l_lid : 0));
@@ -562,7 +571,7 @@
 		 * go. If the LWP on the VP was idling, don't make it
 		 * run again, though.
 		 */
-		if (sa->sa_idle)
+		if (sa->sa_idle || sa->sa_vp->l_stat != LSRUN)
 			l2 = NULL;
 		else
 			l2 = sa->sa_vp;
@@ -619,10 +628,11 @@
 	 * for a SA_UNBLOCKED upcall when we get back to userlevel, provided
 	 * that the SA_BLOCKED upcall happened.
 	 */
-	if (l->l_flag & L_SA_BLOCKING)
-		l->l_flag |= L_SA_UPCALL;
-	else
+	if ((l->l_flag & L_SA_BLOCKING) == 0) {
+		SCHED_LOCK(s);
 		sa_vp_repossess(l);
+		SCHED_UNLOCK(s);
+	}
 }
 
 void
@@ -768,17 +778,20 @@
 	void *stack, *ap;
 	ucontext_t u, *up;
 	int i, nsas, nint, nevents, type;
+	int s;
 
 	p = l->l_proc;
 	sa = p->p_sa;
 
 	KERNEL_PROC_LOCK(l);
-	l->l_flag &= ~L_SA;
+	SCHED_LOCK(s);
+	l->l_flag &= ~(L_SA | L_SA_UPCALL);
 
 	DPRINTFN(7,("sa_upcall_userret(%d.%d %x) \n", p->p_pid, l->l_lid,
 	    l->l_flag));
 
-	if (l->l_flag & L_SA_BLOCKING) {
+	if ((l->l_flag & L_SA_BLOCKING) &&
+	    (sa->sa_vp == NULL || (sa->sa_vp->l_stat != LSONPROC))) {
 		/* Invoke an "unblocked" upcall */
 		struct lwp *l2;
 		DPRINTFN(8,("sa_upcall_userret(%d.%d) unblocking\n",
@@ -805,14 +818,14 @@
 			 * Ideally, tsleep() would have a variant that took
 			 * a LWP to switch to.
 			 */
-			l->l_flag &= ~L_SA;
+			/* l->l_flag &= ~L_SA; */
 			DPRINTFN(7, ("sa_upcall_userret(%d.%d) sleeping"
 			    " for stacks\n", l->l_proc->p_pid, l->l_lid));
 			tsleep((caddr_t) &sa->sa_nstacks, PWAIT|PCATCH, 
 			    "sastacks", 0);
 			if (p->p_flag & P_WEXIT)
 				lwp_exit(l);
-			l->l_flag |= L_SA;
+			/* l->l_flag |= L_SA; */
 		}
 		l2 = sa_vp_repossess(l);
 
@@ -831,18 +844,30 @@
 			printf("sa_upcall_userret: out of upcall resources"
 			    " for %d.%d\n", p->p_pid, l->l_lid);
 #endif
+			SCHED_UNLOCK(s);
 			sigexit(l, SIGABRT);
 			/* NOTREACHED */
 		}
 		l->l_flag &= ~L_SA_BLOCKING;
 	}
 
+	/* XXX figure out how SA_UPCALL gets set without something
+	 * XXX in sa_upcalls */
+	/* if ((l->l_flag & L_SA_UPCALL) == 0) { */
+	if (SIMPLEQ_EMPTY(&sa->sa_upcalls)) {
+		DPRINTFN(8,("sa_upcall_userret(%d.%d): abort L_SA_UPCALL 
cleared%s\n",
+			     p->p_pid, l->l_lid,
+			     (l->l_flag & L_SA_BLOCKING) ? " and sa->vp 
ONPROC" : ""));
+		l->l_flag |= L_SA;
+		SCHED_UNLOCK(s);
+		KERNEL_PROC_UNLOCK(l);
+		return;
+	}
+
 	KDASSERT(SIMPLEQ_EMPTY(&sa->sa_upcalls) == 0);
 
 	sau = SIMPLEQ_FIRST(&sa->sa_upcalls);
 	SIMPLEQ_REMOVE_HEAD(&sa->sa_upcalls, sau_next);
-	if (SIMPLEQ_EMPTY(&sa->sa_upcalls))
-		l->l_flag &= ~L_SA_UPCALL;
 
 	if (sau->sau_flags & SAU_FLAG_DEFERRED) {
 		sa_upcall_getstate(sau,
@@ -869,6 +894,7 @@
 			    " context of event LWP %d\n",
 			    p->p_pid, l->l_lid, sau-
>sau_state.captured.e_sa.sa_id);
 #endif
+			SCHED_UNLOCK(s);
 			sigexit(l, SIGILL);
 			/* NOTREACHED */
 		}
@@ -887,6 +913,7 @@
 			    " context of interrupted LWP %d\n",
 			    p->p_pid, l->l_lid, sau-
>sau_state.captured.i_sa.sa_id);
 #endif
+			SCHED_UNLOCK(s);
 			sigexit(l, SIGILL);
 			/* NOTREACHED */
 		}
@@ -906,6 +933,7 @@
 		printf("sa_upcall_userret: couldn't copyout activation"
 		    " ucontext for %d.%d\n", l->l_proc->p_pid, l->l_lid);
 #endif
+		SCHED_UNLOCK(s);
 		sigexit(l, SIGILL);
 		/* NOTREACHED */
 	}
@@ -925,6 +953,7 @@
 			printf("sa_upcall_userret: couldn't copyout sa_t "
 			    "%d for %d.%d\n", i, p->p_pid, l->l_lid);
 #endif
+			SCHED_UNLOCK(s);
 			sigexit(l, SIGILL);
 			/* NOTREACHED */
 		}
@@ -946,6 +975,7 @@
 			    p->p_pid, l->l_lid,
 			    sau->sau_arg, (long) sau->sau_argsize, ap);
 #endif
+			SCHED_UNLOCK(s);
 			sigexit(l, SIGILL);
 			/* NOTREACHED */
 		}
@@ -960,7 +990,10 @@
 	DPRINTFN(7,("sa_upcall_userret(%d.%d): type %d\n",p->p_pid,
 	    l->l_lid, type));
 
+	SCHED_UNLOCK(s);
 	cpu_upcall(l, type, nevents, nint, sapp, ap, stack, sa->sa_upcall);
+	if (SIMPLEQ_EMPTY(&sa->sa_upcalls) == 0)
+		l->l_flag |= L_SA_UPCALL;
 	l->l_flag |= L_SA;
 	KERNEL_PROC_UNLOCK(l);
 }
@@ -971,7 +1004,7 @@
 	struct lwp *l2;
 	struct proc *p = l->l_proc;
 	struct sadata *sa = p->p_sa;
-	int s;
+	/* int s; */
 
 	/*
 	 * Put ourselves on the virtual processor and note that the
@@ -984,7 +1017,7 @@
 
 	KDASSERT(l2 != l);
 	if (l2) {
-		SCHED_LOCK(s);
+		/* SCHED_LOCK(s); */
 		switch (l2->l_stat) {
 		case LSRUN:
 			remrunqueue(l2);
@@ -994,6 +1027,8 @@
 			unsleep(l2);
 			l2->l_flag &= ~L_SINTR;
 			break;
+		case LSSUSPENDED:
+			break;
 #ifdef DIAGNOSTIC
 		default:
 			panic("SA VP %d.%d is in state %d, not running"
@@ -1007,7 +1042,7 @@
 		 * captured before the upcall before we make it possible
 		 * for another processor to grab it.
 		 */
-		SCHED_UNLOCK(s);
+		/* SCHED_UNLOCK(s); */
 	}
 	return l2;
 }





[1] here's the stack trace for this panic, I couldn't get a dump but I've had 
this panic a second time so it's somewhat reproducible:

panic: fp_save ipi didn't
Stopped in pid 411.1 (transcode) at     netbsd:cpu_Debugger+0x4:        leave
db{0}> mach cpu 0
using cpu 0
db{0}> bt
cpu_Debugger(cb1fcd00,1,cb1fcd00,cb1fcd00,8c00c1) at netbsd:cpu_Debugger+0x4
panic(c03c03a6,c031fc22,8,286,cb1fcd00) at netbsd:panic+0xbd
npxsave_lwp(cb1fcd00,1,0,0,cb4e7668) at netbsd:npxsave_lwp+0xfe
cpu_getmcontext(cb1fcd00,cb4e768c,cb4e7668,cb4e7670,cb4e7648) at 
netbsd:cpu_getmcontext+0xfa
getucontext(cb1fcd00,cb4e7668,0,0,cb4e7648) at netbsd:getucontext+0x88
sa_upcall_getstate(cb4e7648,cb1fcd00,0,c0836900,cb1fce00) at 
netbsd:sa_upcall_getstate+0x23
sa_upcall0(cb1fce00,2,cb1fcd00,0,0) at netbsd:sa_upcall0+0x8f
sa_switch(cb1fcd00,2,0,1,cb214f88) at netbsd:sa_switch+0x132
ltsleep(c08743d4,110,c0385f1d,0,c08743d4) at netbsd:ltsleep+0x30e
pipe_read(cb214f88,cb214fb0,cb4dded0,c08c5500,1) at netbsd:pipe_read+0x1d0
dofileread(cb48bd28,3,cb214f88,8a2dc00,200) at netbsd:dofileread+0xa2
sys_read(cb1fcd00,cb4ddf80,cb4ddf78,c02e3aa7,cb1fcd00) at netbsd:sys_read+0x7e
syscall_plain(2b,2b,2b,2b,200) at netbsd:syscall_plain+0xc0
db{0}> mach cpu 1
cpu 1 not paused
db{0}> kgdb

0xc02d74c8 in cpu_Debugger () at machine/cpufunc.h:272
272             __asm __volatile("int $3");
(gdb) bt
#0  0xc02d74c8 in cpu_Debugger () at machine/cpufunc.h:272
#1  0xc026af85 in panic (fmt=0xc03c03a6 "fp_save ipi didn't")
    at ../../../../kern/subr_prf.c:230
#2  0xc031fc16 in npxsave_lwp (l=0xcb1fcd00, save=1) 
at ../../../../arch/i386/isa/npx.c:700
#3  0xc02db3da in cpu_getmcontext (l=0xcb1fcd00, mcp=0xcb4e768c, 
flags=0xcb4e7668)
    at ../../../../arch/i386/i386/machdep.c:2282
#4  0xc025ae44 in getucontext (l=0xcb1fcd00, ucp=0xcb4e7668)
    at ../../../../kern/kern_sig.c:1802
#5  0xc0257807 in sa_upcall_getstate (sau=0xcb4e7648, event=0xcb1fcd00, 
interrupted=0x0)
    at ../../../../kern/kern_sa.c:438
#6  0xc02577c3 in sa_upcall0 (l=0xcb1fce00, type=2, event=0xcb1fcd00, 
interrupted=0x0, 
    argsize=0, arg=0x0, sau=0xcb4e7648, st=0xcb4ddde4) 
at ../../../../kern/kern_sa.c:423
#7  0xc02579c2 in sa_switch (l=0xcb1fcd00, type=2) 
at ../../../../kern/kern_sa.c:547
#8  0xc025d136 in ltsleep (ident=0xc08743d4, priority=272, 
wmesg=0xc0385f1d "piperd", 
    timo=0, interlock=0xc08743d4) at ../../../../kern/kern_synch.c:493
#9  0xc0270f90 in pipe_read (fp=0xcb214f88, offset=0xcb214fb0, 
uio=0xcb4dded0, 
    cred=0xc08c5500, flags=1) at ../../../../kern/sys_pipe.c:542
#10 0xc026f032 in dofileread (p=0xcb48bd28, fd=3, fp=0xcb214f88, 
buf=0x8a2dc00, nbyte=512, 
    offset=0xcb214fb0, flags=1, retval=0xcb4ddf78) 
at ../../../../kern/sys_generic.c:148
#11 0xc026ef86 in sys_read (l=0xcb1fcd00, v=0xcb4ddf80, retval=0xcb4ddf78)
    at ../../../../kern/sys_generic.c:104
#12 0xc02e3ab8 in syscall_plain (frame={tf_gs = 43, tf_fs = 43, tf_es = 43, 
tf_ds = 43, 
      tf_edi = 512, tf_esi = 1215823872, tf_ebp = 1216085856, tf_ebx = 
1208730148, 
      tf_edx = 0, tf_ecx = 1, tf_eax = 3, tf_trapno = 3, tf_err = 2, tf_eip = 
1208842107, 
      tf_cs = 35, tf_eflags = 531, tf_esp = 1216085796, tf_ss = 43, 
tf_vm86_es = 0, 
      tf_vm86_ds = 0, tf_vm86_fs = 0, tf_vm86_gs = 0})
    at ../../../../arch/i386/i386/syscall.c:156
#13 0xc0100b79 in syscall1 ()
#14 0x8057949 in ?? ()
Cannot access memory at address 0x3
(gdb) f 2
#2  0xc031fc16 in npxsave_lwp (l=0xcb1fcd00, save=1) 
at ../../../../arch/i386/isa/npx.c:700
700                                     panic("fp_save ipi didn't");
(gdb) l
695                     while (l->l_addr->u_pcb.pcb_fpcpu != NULL)
696     #ifdef DIAGNOSTIC
697                     {
698                             spincount++;
699                             if (spincount > 10000000) {
700                                     panic("fp_save ipi didn't");
701                             }
702                     }
703     #else
704                     __splbarrier();         /* XXX replace by generic 
barrier */
(gdb) 

-- 
Christian Limpach <chris@pin.lu>