Subject: proposed fix for awkward allocation in sa_switch()
To: None <tech-kern@netbsd.org>
From: Nathan J. Williams <nathanw@wasabisystems.com>
List: tech-kern
Date: 03/03/2004 15:19:30
--=enigma-analyzer-Lon-Horiuchi-offensive-information-warfare-TWA-Consu


I recently went and re-examined a memory-allocation problem in
sa_switch(), where we need to allocate a structure while inside
tsleep(). Even with all the WAITOK flags turned off, this causes
problems when the pool code tries to wake up the pagedaemon with the
scheduler lock held.

There's a comment in the code that "It would be nice if we could
safely allocate the sadata_upcall structure on the stack". It can't be
allocated on the caller's stack, because the caller may wake up and
return to userland asynchronously with the new LWP using the
structure. However, since the new LWP is the one that needs the
structure, it can be allocated on the new LWP's stack, with some
slight abuse of proc_trampoline to clean up the stack space, and
adjusting the two users of proc_trampoline per architecture. It does
depend (as implemented so far) on being able to pass one more bit of
data to proc_trampoline; I haven't looked at all architectures yet,
but this seems doable.

Comments? 

        - Nathan


--=enigma-analyzer-Lon-Horiuchi-offensive-information-warfare-TWA-Consu
Content-Type: text/x-patch
Content-Disposition: attachment; filename=sadiff
Content-Description: sa upcall allocation patch

Index: sys/lwp.h
===================================================================
RCS file: /cvsroot/src/sys/sys/lwp.h,v
retrieving revision 1.19
diff -u -p -r1.19 lwp.h
--- sys/lwp.h	11 Jan 2004 18:37:52 -0000	1.19
+++ sys/lwp.h	3 Mar 2004 20:04:46 -0000
@@ -175,7 +175,7 @@ int newlwp(struct lwp *, struct proc *, 
 #define LWPWAIT_EXITCONTROL	0x00000001
 int 	lwp_wait1(struct lwp *, lwpid_t, lwpid_t *, int);
 void	lwp_continue(struct lwp *);
-void	cpu_setfunc(struct lwp *, void (*)(void *), void *);
+void	*cpu_setfunc(struct lwp *, void (*)(void *), size_t, void *);
 void	startlwp(void *);
 void	upcallret(struct lwp *);
 void	lwp_exit (struct lwp *);
Index: sys/savar.h
===================================================================
RCS file: /cvsroot/src/sys/sys/savar.h,v
retrieving revision 1.14
diff -u -p -r1.14 savar.h
--- sys/savar.h	2 Jan 2004 18:52:17 -0000	1.14
+++ sys/savar.h	3 Mar 2004 20:04:46 -0000
@@ -70,6 +70,7 @@ struct sadata_upcall {
 
 #define	SAU_FLAG_DEFERRED_EVENT		0x1
 #define	SAU_FLAG_DEFERRED_INTERRUPTED	0x2
+#define	SAU_FLAG_STACKALLOC		0x4
 
 #define	SA_UPCALL_TYPE_MASK		0x00FF
 
Index: kern/kern_sa.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_sa.c,v
retrieving revision 1.47
diff -u -p -r1.47 kern_sa.c
--- kern/kern_sa.c	2 Jan 2004 18:52:17 -0000	1.47
+++ kern/kern_sa.c	3 Mar 2004 20:04:46 -0000
@@ -138,7 +138,8 @@ sadata_upcall_free(struct sadata_upcall 
 		}
 	}
 
-	pool_put(&saupcall_pool, sau);
+	if ((sau->sau_flags & SAU_FLAG_STACKALLOC) == 0)
+		pool_put(&saupcall_pool, sau);
 }
 
 int
@@ -898,24 +899,8 @@ sa_switch(struct lwp *l, int type)
 			return;
 		}
 
-		/*
-		 * XXX We need to allocate the sadata_upcall structure here,
-		 * XXX since we can't sleep while waiting for memory inside
-		 * XXX sa_upcall().  It would be nice if we could safely
-		 * XXX allocate the sadata_upcall structure on the stack, here.
-		 */
-		sau = sadata_upcall_alloc(0);
-		if (sau == NULL) {
-#ifdef DIAGNOSTIC
-			printf("sa_switch(%d.%d): couldn't allocate upcall data.\n",
-			    p->p_pid, l->l_lid);
-#endif
-			sa_putcachelwp(p, l2); /* PHOLD from sa_getcachelwp */
-			mi_switch(l, NULL);
-			return;
-		}
-
-		cpu_setfunc(l2, sa_switchcall, sau);
+		sau = cpu_setfunc(l2, sa_switchcall,
+		    sizeof(struct sadata_upcall), NULL);
 		error = sa_upcall0(l2, SA_UPCALL_BLOCKED, l, NULL, 0, NULL,
 		    sau);
 		if (error) {
@@ -927,6 +912,7 @@ sa_switch(struct lwp *l, int type)
 			mi_switch(l, NULL);
 			return;
 		}
+		sau->sau_flags |= SAU_FLAG_STACKALLOC;
 
 		/* 
 		 * Perform the double/upcall pagefault check.
Index: arch/i386/i386/vm_machdep.c
===================================================================
RCS file: /cvsroot/src/sys/arch/i386/i386/vm_machdep.c,v
retrieving revision 1.116
diff -u -p -r1.116 vm_machdep.c
--- arch/i386/i386/vm_machdep.c	6 Feb 2004 10:28:03 -0000	1.116
+++ arch/i386/i386/vm_machdep.c	3 Mar 2004 20:04:47 -0000
@@ -198,24 +198,32 @@ cpu_lwp_fork(struct lwp *l1, struct lwp 
 
 	sf = (struct switchframe *)tf - 1;
 	sf->sf_esi = (int)func;
+	sf->sf_edi = 0;
 	sf->sf_ebx = (int)arg;
 	sf->sf_eip = (int)proc_trampoline;
 	pcb->pcb_esp = (int)sf;
 	pcb->pcb_ebp = 0;
 }
 
-void
-cpu_setfunc(struct lwp *l, void (*func)(void *), void *arg)
+void *
+cpu_setfunc(struct lwp *l, void (*func)(void *), size_t space, void *arg)
 {
 	struct pcb *pcb = &l->l_addr->u_pcb;
 	struct trapframe *tf = l->l_md.md_regs;
-	struct switchframe *sf = (struct switchframe *)tf - 1;
+	void *spaceptr = (char *)tf - space;
+	struct switchframe *sf = (struct switchframe *)spaceptr - 1;
+	
+	if (space != 0)
+		arg = spaceptr;
 
 	sf->sf_esi = (int)func;
+	sf->sf_edi = (int)space;
 	sf->sf_ebx = (int)arg;
 	sf->sf_eip = (int)proc_trampoline;
 	pcb->pcb_esp = (int)sf;
 	pcb->pcb_ebp = 0;
+
+	return arg;
 }	
 
 void
Index: arch/i386/i386/locore.S
===================================================================
RCS file: /cvsroot/src/sys/arch/i386/i386/locore.S,v
retrieving revision 1.24
diff -u -p -r1.24 locore.S
--- arch/i386/i386/locore.S	20 Feb 2004 17:35:01 -0000	1.24
+++ arch/i386/i386/locore.S	3 Mar 2004 20:04:47 -0000
@@ -701,6 +701,7 @@ NENTRY(proc_trampoline)
 	pushl	%ebx
 	call	*%esi
 	addl	$4,%esp
+	addl	%edi,%esp
 	DO_DEFERRED_SWITCH(%eax)
 	INTRFASTEXIT
 	/* NOTREACHED */
Index: arch/alpha/alpha/vm_machdep.c
===================================================================
RCS file: /cvsroot/src/sys/arch/alpha/alpha/vm_machdep.c,v
retrieving revision 1.82
diff -u -p -r1.82 vm_machdep.c
--- arch/alpha/alpha/vm_machdep.c	5 Jan 2004 23:51:19 -0000	1.82
+++ arch/alpha/alpha/vm_machdep.c	3 Mar 2004 20:04:47 -0000
@@ -209,35 +209,41 @@ cpu_lwp_fork(struct lwp *l1, struct lwp 
 		up->u_pcb.pcb_context[0] =
 		    (u_int64_t)func;			/* s0: pc */
 		up->u_pcb.pcb_context[1] =
-		    (u_int64_t)exception_return;	/* s1: ra */
+		    (u_int64_t)arg;			/* s1: arg */
 		up->u_pcb.pcb_context[2] =
-		    (u_int64_t)arg;			/* s2: arg */
+		    (u_int64_t)0;			/* s2: size */
 		up->u_pcb.pcb_context[7] =
 		    (u_int64_t)proc_trampoline;		/* ra: assembly magic */
 		up->u_pcb.pcb_context[8] = ALPHA_PSL_IPL_0; /* ps: IPL */
 	}
 }
 
-void
-cpu_setfunc(l, func, arg)
+void *
+cpu_setfunc(l, func, space, arg)
 	struct lwp *l;
 	void (*func) __P((void *));
+	size_t space;
 	void *arg;
 {
 	struct user *up = l->l_addr;
+	void *spaceptr = (char *)l->l_md.md_tf - space;
+
+	if (space != 0)
+		arg = spaceptr;
 
 	up->u_pcb.pcb_hw.apcb_ksp =
-	    (u_int64_t)l->l_md.md_tf;
+	    (u_int64_t)l->l_md.md_tf - space;
 	up->u_pcb.pcb_context[0] =
 	    (u_int64_t)func;			/* s0: pc */
 	up->u_pcb.pcb_context[1] =
-	    (u_int64_t)exception_return;	/* s1: ra */
+	    (u_int64_t)arg;			/* s1: arg */
 	up->u_pcb.pcb_context[2] =
-	    (u_int64_t)arg;			/* s2: arg */
+	    (u_int64_t)space;			/* s2: size */
 	up->u_pcb.pcb_context[7] =
 	    (u_int64_t)proc_trampoline;		/* ra: assembly magic */
 	up->u_pcb.pcb_context[8] = ALPHA_PSL_IPL_0; /* ps: IPL */
 
+	return arg;
 }	
 
 /*
Index: arch/alpha/alpha/locore.s
===================================================================
RCS file: /cvsroot/src/sys/arch/alpha/alpha/locore.s,v
retrieving revision 1.105
diff -u -p -r1.105 locore.s
--- arch/alpha/alpha/locore.s	4 Nov 2003 10:33:16 -0000	1.105
+++ arch/alpha/alpha/locore.s	3 Mar 2004 20:04:48 -0000
@@ -984,9 +984,10 @@ LEAF_NOPROFILE(proc_trampoline, 0)
 	CALL(proc_trampoline_mp)
 #endif
 	mov	s0, pv
-	mov	s1, ra
-	mov	s2, a0
-	jmp	zero, (pv)
+	mov	s1, a0
+	jsr	ra, (pv)
+	addq	sp, s2, sp
+	br	exception_return
 	END(proc_trampoline)
 
 /*
Index: arch/arm/arm32/vm_machdep.c
===================================================================
RCS file: /cvsroot/src/sys/arch/arm/arm32/vm_machdep.c,v
retrieving revision 1.31
diff -u -p -r1.31 vm_machdep.c
--- arch/arm/arm32/vm_machdep.c	4 Jan 2004 11:33:29 -0000	1.31
+++ arch/arm/arm32/vm_machdep.c	3 Mar 2004 20:04:48 -0000
@@ -199,21 +199,29 @@ cpu_lwp_fork(l1, l2, stack, stacksize, f
 	sf = (struct switchframe *)tf - 1;
 	sf->sf_r4 = (u_int)func;
 	sf->sf_r5 = (u_int)arg;
+	sf->sf_r6 = (u_int)0;
 	sf->sf_pc = (u_int)proc_trampoline;
 	pcb->pcb_un.un_32.pcb32_sp = (u_int)sf;
 }
 
-void
-cpu_setfunc(struct lwp *l, void (*func)(void *), void *arg)
+void *
+cpu_setfunc(struct lwp *l, void (*func)(void *), size_t space, void *arg)
 {
 	struct pcb *pcb = &l->l_addr->u_pcb;
 	struct trapframe *tf = pcb->pcb_tf;
-	struct switchframe *sf = (struct switchframe *)tf - 1;
+	void *spaceptr = (char *)tf - space;
+	struct switchframe *sf = (struct switchframe *)spaceptr - 1;
+
+	if (space != 0)
+		arg = spaceptr;
 
 	sf->sf_r4 = (u_int)func;
 	sf->sf_r5 = (u_int)arg;
+	sf->sf_r6 = (u_int)space;
 	sf->sf_pc = (u_int)proc_trampoline;
 	pcb->pcb_un.un_32.pcb32_sp = (u_int)sf;
+
+	return arg;
 }
 
 /*
Index: arch/arm/arm32/cpuswitch.S
===================================================================
RCS file: /cvsroot/src/sys/arch/arm/arm32/cpuswitch.S,v
retrieving revision 1.41
diff -u -p -r1.41 cpuswitch.S
--- arch/arm/arm32/cpuswitch.S	15 Nov 2003 08:44:18 -0000	1.41
+++ arch/arm/arm32/cpuswitch.S	3 Mar 2004 20:04:49 -0000
@@ -1023,6 +1023,7 @@ ENTRY(proc_trampoline)
 	mov	lr, pc
 	mov	pc, r4
 
+	add	sp, sp, r6
 	/* Kill irq's */
         mrs     r0, cpsr
         orr     r0, r0, #(I32_bit)

--=enigma-analyzer-Lon-Horiuchi-offensive-information-warfare-TWA-Consu--