Subject: kern/36969: locking cleanup patches for limit structures
To: None <kern-bug-people@netbsd.org, gnats-admin@netbsd.org,>
From: None <dholland@eecs.harvard.edu>
List: netbsd-bugs
Date: 09/11/2007 20:45:00
>Number:         36969
>Category:       kern
>Synopsis:       locking cleanup patches for limit structures
>Confidential:   no
>Severity:       serious
>Priority:       medium
>Responsible:    kern-bug-people
>State:          open
>Class:          sw-bug
>Submitter-Id:   net
>Arrival-Date:   Tue Sep 11 20:45:00 +0000 2007
>Originator:     David A. Holland <dholland@eecs.harvard.edu>
>Release:        NetBSD 4.99.30 (20070910)
>Organization:
   Harvard EECS
>Environment:
System: NetBSD tanaqui 4.99.30 NetBSD 4.99.30 (TANAQUI) #18: Tue Sep 4 19:02:21 EDT 2007 dholland@tanaqui:/usr/src/sys/arch/i386/compile/TANAQUI i386
Architecture: i386
Machine: i386
>Description:

I have, occasionally, been getting this DIAGNOSTIC panic:

panic: pool_get(plimitpl): free list modified: magic=deadbef4; page [...]; item addr [...]

(This then almost invariably leads to a deadlock trying to sync.)

Upon investigation, it turns out that the locking for the plimit
objects held in plimitpl is bodgy and inconsistent. I have reworked
it; patches follow.

>How-To-Repeat:

The panic is not readily repeatable. One may, however, read the code.

>Fix:

This patch is against -current of 20070910.

I haven't tested the compat_irix bit. However, as the currently
existing code there misuses p_refcnt and will panic, I doubt I've made
it any worse.


Index: sys/compat/irix/irix_prctl.c
===================================================================
RCS file: /cvsroot/src/sys/compat/irix/irix_prctl.c,v
retrieving revision 1.37
diff -u -p -r1.37 irix_prctl.c
--- sys/compat/irix/irix_prctl.c	6 Mar 2007 12:43:09 -0000	1.37
+++ sys/compat/irix/irix_prctl.c	11 Sep 2007 01:31:14 -0000
@@ -498,10 +498,8 @@ irix_sproc_child(isc)
 	 */
 	if (inh & IRIX_PR_SULIMIT) {
 		pl = p2->p_limit;
-		parent->p_limit->p_refcnt++;
-		p2->p_limit = parent->p_limit;
-		if(--pl->p_refcnt == 0)
-			limfree(pl);
+		p2->p_limit = limshare(parent);
+		limfree(pl);
 	}
 
 	/*
Index: sys/compat/netbsd32/netbsd32_netbsd.c
===================================================================
RCS file: /cvsroot/src/sys/compat/netbsd32/netbsd32_netbsd.c,v
retrieving revision 1.126
diff -u -p -r1.126 netbsd32_netbsd.c
--- sys/compat/netbsd32/netbsd32_netbsd.c	15 Aug 2007 12:07:31 -0000	1.126
+++ sys/compat/netbsd32/netbsd32_netbsd.c	11 Sep 2007 01:31:25 -0000
@@ -2298,13 +2298,7 @@ netbsd32_adjust_limits(struct proc *p)
 		return;
 	}
 
-	if (p->p_limit->p_refcnt > 1 &&
-	    (p->p_limit->p_lflags & PL_SHAREMOD) == 0) {
-		struct plimit *oldplim;
-		oldplim = p->p_limit;
-		p->p_limit = limcopy(p);
-		limfree(oldplim);
-	}
+	limprivatize(p);
 
 	for (i = 0; i < __arraycount(val); i++)
 		p->p_rlimit[lm[i].id] = val[i];
Index: sys/kern/kern_acct.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_acct.c,v
retrieving revision 1.75
diff -u -p -r1.75 kern_acct.c
--- sys/kern/kern_acct.c	9 Jul 2007 21:10:51 -0000	1.75
+++ sys/kern/kern_acct.c	11 Sep 2007 01:34:47 -0000
@@ -387,7 +387,6 @@ acct_process(struct lwp *l)
 	struct timeval ut, st, tmp;
 	struct rusage *r;
 	int t, error = 0;
-	struct plimit *oplim = NULL;
 	struct proc *p = l->l_proc;
 
 	mutex_enter(&acct_lock);
@@ -403,10 +402,7 @@ acct_process(struct lwp *l)
 	 * XXX We should think about the CPU limit, too.
 	 */
 	mutex_enter(&p->p_mutex);
-	if (p->p_limit->p_refcnt > 1) {
-		oplim = p->p_limit;
-		p->p_limit = limcopy(p);
-	}
+	limprivatize(p, 1 /* even when sharing changes */);
 	p->p_rlimit[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
 	mutex_exit(&p->p_mutex);
 
@@ -467,13 +463,6 @@ acct_process(struct lwp *l)
 	if (error != 0)
 		log(LOG_ERR, "Accounting: write failed %d\n", error);
 
-	if (oplim) {
-		mutex_enter(&p->p_mutex);
-		limfree(p->p_limit);
-		p->p_limit = oplim;
-		mutex_exit(&p->p_mutex);
-	}
-
  out:
 	mutex_exit(&acct_lock);
 	return (error);
Index: sys/kern/kern_core.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_core.c,v
retrieving revision 1.5
diff -u -p -r1.5 kern_core.c
--- sys/kern/kern_core.c	3 Apr 2007 16:11:31 -0000	1.5
+++ sys/kern/kern_core.c	11 Sep 2007 01:34:47 -0000
@@ -141,12 +141,15 @@ coredump(struct lwp *l, const char *patt
 	if ((p->p_flag & PK_SUGID) && security_setidcore_dump)
 		pattern = security_setidcore_path;
 
+	/* Could avoid taking p_limit->p_lock if we don't use pl_corename */
+	mutex_enter(&p->p_limit->p_lock);
 	if (pattern == NULL)
 		pattern = p->p_limit->pl_corename;
 	if (name == NULL) {
 		name = PNBUF_GET();
 	}
 	error = coredump_buildname(p, name, pattern, MAXPATHLEN);
+	mutex_exit(&p->p_limit->p_lock);
 	mutex_exit(&p->p_mutex);
 	mutex_exit(&proclist_lock);
 	if (error)
Index: sys/kern/kern_fork.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_fork.c,v
retrieving revision 1.142
diff -u -p -r1.142 kern_fork.c
--- sys/kern/kern_fork.c	15 Aug 2007 12:07:33 -0000	1.142
+++ sys/kern/kern_fork.c	11 Sep 2007 01:34:48 -0000
@@ -352,21 +352,13 @@ fork1(struct lwp *l1, int flags, int exi
 		p2->p_cwdi = cwdinit(p1);
 
 	/*
-	 * If p_limit is still copy-on-write, bump refcnt,
-	 * otherwise get a copy that won't be modified.
-	 * (If PL_SHAREMOD is clear, the structure is shared
-	 * copy-on-write.)
-	 */
-	if (p1->p_limit->p_lflags & PL_SHAREMOD) {
-		mutex_enter(&p1->p_mutex);
-		p2->p_limit = limcopy(p1);
-		mutex_exit(&p1->p_mutex);
-	} else {
-		mutex_enter(&p1->p_limit->p_lock);
-		p1->p_limit->p_refcnt++;
-		mutex_exit(&p1->p_limit->p_lock);
-		p2->p_limit = p1->p_limit;
-	}
+	 * Share p_limit copy-on-write.
+	 * (If flags grows a FORK_SHARELIMIT for compat_irix, use limshare
+	 * for that instead of limsharecow.)
+	 */
+	mutex_enter(&p1->p_mutex);
+	p2->p_limit = limsharecow(p1);
+	mutex_exit(&p1->p_mutex);
 
 	p2->p_sflag = ((flags & FORK_PPWAIT) ? PS_PPWAIT : 0);
 	p2->p_lflag = 0;
Index: sys/kern/kern_proc.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_proc.c,v
retrieving revision 1.115
diff -u -p -r1.115 kern_proc.c
--- sys/kern/kern_proc.c	6 Sep 2007 23:58:57 -0000	1.115
+++ sys/kern/kern_proc.c	11 Sep 2007 01:34:49 -0000
@@ -1325,13 +1325,12 @@ proc_crmod_enter(void)
 
 	/* Reset what needs to be reset in plimit. */
 	lim = p->p_limit;
+	mutex_enter(&lim->p_lock);
 	if (lim->pl_corename != defcorename) {
-		if (lim->p_refcnt > 1 &&
-		    (lim->p_lflags & PL_SHAREMOD) == 0) {
-			p->p_limit = limcopy(p);
-			limfree(lim);
-			lim = p->p_limit;
-		}
+		mutex_exit(&lim->p_lock);
+		limprivatize(p, 0);
+		/* refetch lim, it might have changed */
+		lim = p->p_limit;
 		mutex_enter(&lim->p_lock);
 		cn = lim->pl_corename;
 		lim->pl_corename = defcorename;
@@ -1339,6 +1338,9 @@ proc_crmod_enter(void)
 		if (cn != defcorename)
 			free(cn, M_TEMP);
 	}
+	else {
+		mutex_exit(&lim->p_lock);
+	}
 }
 
 /*
Index: sys/kern/kern_resource.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_resource.c,v
retrieving revision 1.120
diff -u -p -r1.120 kern_resource.c
--- sys/kern/kern_resource.c	6 Sep 2007 02:03:06 -0000	1.120
+++ sys/kern/kern_resource.c	11 Sep 2007 01:34:49 -0000
@@ -256,7 +256,6 @@ int
 dosetrlimit(struct lwp *l, struct proc *p, int which, struct rlimit *limp)
 {
 	struct rlimit *alimp;
-	struct plimit *oldplim;
 	int error;
 
 	if ((u_int)which >= RLIM_NLIMITS)
@@ -266,7 +265,7 @@ dosetrlimit(struct lwp *l, struct proc *
 		return (EINVAL);
 
 	alimp = &p->p_rlimit[which];
-	/* if we don't change the value, no need to limcopy() */
+	/* if we don't change the value, no need to limprivatize() */
 	if (limp->rlim_cur == alimp->rlim_cur &&
 	    limp->rlim_max == alimp->rlim_max)
 		return 0;
@@ -284,13 +283,9 @@ dosetrlimit(struct lwp *l, struct proc *
 			return (error);
 
 	mutex_enter(&p->p_mutex);
-	if (p->p_limit->p_refcnt > 1 &&
-	    (p->p_limit->p_lflags & PL_SHAREMOD) == 0) {
-	    	oldplim = p->p_limit;
-		p->p_limit = limcopy(p);
-		limfree(oldplim);
-		alimp = &p->p_rlimit[which];
-	}
+	limprivatize(p, 0);
+	/* refetch this in case it's moved */
+	alimp = &p->p_rlimit[which];
 
 	switch (which) {
 
@@ -527,63 +522,201 @@ ruadd(struct rusage *ru, struct rusage *
 }
 
 /*
- * Make a copy of the plimit structure.
- * We share these structures copy-on-write after fork,
- * and copy when a limit is changed.
+ * Make a copy of the target process's plimit structure.
+ *
+ * Must hold p->p_mutex, but *not* p->p_limit->p_lock.
  *
- * XXXSMP This is atrocious, need to simplify.
+ * We can't hold either of these locks while mallocing, so we have to
+ * do gross things. (The only reason we have p at all is so we can
+ * drop and reacquire p->p_mutex.)
+ *
+ * Note that p is not necessarily curproc.
  */
-struct plimit *
+static struct plimit *
 limcopy(struct proc *p)
 {
-	struct plimit *lim, *newlim;
-	char *corename;
-	size_t l;
+	struct plimit *oldlim;
+	struct plimit *newlim;
+	char *newcorename;
+	size_t newcorelen;
+	size_t needlen;
 
 	KASSERT(mutex_owned(&p->p_mutex));
 
-	mutex_exit(&p->p_mutex);
-	newlim = pool_get(&plimit_pool, PR_WAITOK);
+	oldlim = NULL;
+	newlim = NULL;
+	newcorename = NULL;
+	newcorelen = 0;
+
+	for (;;) {
+		/* Figure out what we're copying */
+		oldlim = p->p_limit;
+		mutex_enter(&oldlim->p_lock);
+
+		/* Work out what we need */
+		if (oldlim->pl_corename == defcorename) {
+			needlen = 0;
+		}
+		else {
+			needlen = strlen(oldlim->pl_corename) + 1;
+		}
+
+		/* If we have what we need, stop */
+		if (newlim != NULL && newcorelen == needlen) {
+			break;
+		}
+
+		/* Otherwise, try to get it */
+		mutex_exit(&oldlim->p_lock);
+		mutex_exit(&p->p_mutex);
+
+		if (newlim == NULL) {
+			newlim = pool_get(&plimit_pool, PR_WAITOK);
+		}
+
+		if (newcorename) {
+			free(newcorename, M_TEMP);
+		}
+		if (needlen > 0) {
+			newcorename = malloc(needlen, M_TEMP, M_WAITOK);
+		}
+		else {
+			newcorename = NULL;
+		}
+		newcorelen = needlen;
+
+		mutex_enter(&p->p_mutex);
+	}
+
 	mutex_init(&newlim->p_lock, MUTEX_DEFAULT, IPL_NONE);
 	newlim->p_lflags = 0;
 	newlim->p_refcnt = 1;
-	mutex_enter(&p->p_mutex);
+	memcpy(newlim->pl_rlimit, oldlim->pl_rlimit,
+	       sizeof(struct rlimit) * RLIM_NLIMITS);
+	if (newcorelen != 0) {
+		strlcpy(newcorename, oldlim->pl_corename, newcorelen);
+		newlim->pl_corename = newcorename;
+	}
+	else {
+		newlim->pl_corename = defcorename;
+	}
 
-	for (;;) {
-		lim = p->p_limit;
-		mutex_enter(&lim->p_lock);
-		if (lim->pl_corename != defcorename) {
-			l = strlen(lim->pl_corename) + 1;
+	mutex_exit(&oldlim->p_lock);
 
-			mutex_exit(&lim->p_lock);
-			mutex_exit(&p->p_mutex);
-			corename = malloc(l, M_TEMP, M_WAITOK);
-			mutex_enter(&p->p_mutex);
-			mutex_enter(&lim->p_lock);
-
-			if (l != strlen(lim->pl_corename) + 1) {
-				mutex_exit(&lim->p_lock);
-				mutex_exit(&p->p_mutex);
-				free(corename, M_TEMP);
-				mutex_enter(&p->p_mutex);
-				continue;
-			}
-		} else
-			l = 0;
-			
-		memcpy(newlim->pl_rlimit, lim->pl_rlimit,
-		    sizeof(struct rlimit) * RLIM_NLIMITS);
-		if (l != 0)
-			strlcpy(newlim->pl_corename, lim->pl_corename, l);
-		else
-			newlim->pl_corename = defcorename;
-		mutex_exit(&lim->p_lock);
-		break;
+	return (newlim);
+}
+
+/*
+ * Share a copy of p->p_limit, including future changes.
+ *
+ * Must hold p->p_mutex.
+ *
+ * Return the new reference.
+ */
+struct plimit *
+limshare(struct proc *p)
+{
+	struct plimit *lim;
+
+	KASSERT(mutex_owned(&p->p_mutex));
+
+	/* Must not affect anyone who has p->p_limit shared copy-on-write. */
+	limprivatize(p, 0);
+
+	lim = p->p_limit;
+	mutex_enter(&lim->p_lock);
+	lim->p_refcnt++;
+	lim->p_lflags |= PL_SHAREMOD;
+	mutex_exit(&lim->p_lock);
+
+	return lim;
+}
+
+/*
+ * Make a logically new plimit from p->p_limit.
+ *
+ * If the plimit structure is being shared for changes (PL_SHAREMOD
+ * set), copy it. Otherwise, share it. (If PL_SHAREMOD is not set,
+ * sharing is copy on write.)
+ *
+ * Must hold p->p_mutex.
+ *
+ * Return the new reference.
+ */
+struct plimit *
+limsharecow(struct proc *p)
+{
+	struct plimit *oldlim, *newlim;
+
+	KASSERT(mutex_owned(&p->p_mutex));
+
+	/*
+	 * Because limcopy drops locks, p->p_limit might change under
+	 * us. limcopy always copies the latest p_limit, however, so
+	 * even if PL_SHAREMOD ceases to be set (which shouldn't
+	 * happen anyhow) at worst we end up doing an unnecessary
+	 * copy. Note: reload oldlim after limcopy if it becomes needed.
+	 */
+
+	oldlim = p->p_limit;
+	mutex_enter(&oldlim->p_lock);
+	if (oldlim->p_lflags & PL_SHAREMOD) {
+		mutex_exit(&oldlim->p_lock);
+		newlim = limcopy(p);
+	}
+	else {
+		oldlim->p_refcnt++;
+		mutex_exit(&oldlim->p_lock);
+		newlim = oldlim;
 	}
 
-	return (newlim);
+	return newlim;
+}
+
+/*
+ * Give the (current) proc its own private plimit structure.
+ *
+ * We share these structures copy-on-write after fork, and copy when a
+ * limit is changed.
+ *
+ * Must hold p->p_mutex and *not* p->p_limit->p_lock.
+ */
+void
+limprivatize(struct proc *p, int unshare)
+{
+	struct plimit *oldlim, *newlim;
+
+	KASSERT(mutex_owned(&p->p_mutex));
+
+	/*
+	 * Because limcopy drops locks, someone else might call
+	 * limprivatize behind its back. limcopy always copies the
+	 * latest p->p_limit, as protected by p->p_mutex and
+	 * p->p_limit->p_lock; so as long as we're careful not to hold
+	 * onto a copy of an older p->p_limit, the worst that can
+	 * happen is that we make an unnecessary copy.
+	 */
+
+	oldlim = p->p_limit;
+	mutex_enter(&oldlim->p_lock);
+	if (oldlim->p_refcnt > 1 && 
+	    (unshare || (oldlim->p_lflags & PL_SHAREMOD)==0)) {
+		mutex_exit(&oldlim->p_lock);
+
+		newlim = limcopy(p);
+		/* refetch oldlim; it might have changed */
+		oldlim = p->p_limit;
+		p->p_limit = newlim;
+		limfree(oldlim);
+	}
+	else {
+		mutex_exit(&oldlim->p_lock);
+	}
 }
 
+/*
+ * decref (and optionally release) a plimit.
+ */
 void
 limfree(struct plimit *lim)
 {
@@ -598,6 +731,9 @@ limfree(struct plimit *lim)
 	if (n < 0)
 		panic("limfree");
 #endif
+	KASSERT(lim != &limit0);
+
+	/* no further locking necessary as we have the only ref */
 	if (lim->pl_corename != defcorename)
 		free(lim->pl_corename, M_TEMP);
 	mutex_destroy(&lim->p_lock);
@@ -694,7 +830,9 @@ sysctl_proc_corename(SYSCTLFN_ARGS)
 	 * let them modify a temporary copy of the core name
 	 */
 	node = *rnode;
+	mutex_enter(&ptmp->p_limit->p_lock);
 	strlcpy(cname, ptmp->p_limit->pl_corename, MAXPATHLEN);
+	mutex_exit(&ptmp->p_limit->p_lock);
 	node.sysctl_data = cname;
 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
 
@@ -702,10 +840,13 @@ sysctl_proc_corename(SYSCTLFN_ARGS)
 	 * if that failed, or they have nothing new to say, or we've
 	 * heard it before...
 	 */
+	mutex_enter(&ptmp->p_limit->p_lock);
 	if (error || newp == NULL ||
 	    strcmp(cname, ptmp->p_limit->pl_corename) == 0) {
+		mutex_exit(&ptmp->p_limit->p_lock);
 		goto done;
 	}
+	mutex_exit(&ptmp->p_limit->p_lock);
 
 	error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_CORENAME,
 	    ptmp, cname, NULL, NULL);
@@ -740,15 +881,13 @@ sysctl_proc_corename(SYSCTLFN_ARGS)
 	strlcpy(tmp, cname, len + 1);
 
 	mutex_enter(&ptmp->p_mutex);
+	limprivatize(ptmp, 0);
 	lim = ptmp->p_limit;
-	if (lim->p_refcnt > 1 && (lim->p_lflags & PL_SHAREMOD) == 0) {
-		ptmp->p_limit = limcopy(ptmp);
-		limfree(lim);
-		lim = ptmp->p_limit;
-	}
+	mutex_enter(&lim->p_lock);
 	if (lim->pl_corename != defcorename)
 		free(lim->pl_corename, M_TEMP);
 	lim->pl_corename = tmp;
+	mutex_exit(&lim->p_lock);
 	mutex_exit(&ptmp->p_mutex);
 done:
 	PNBUF_PUT(cname);
Index: sys/sys/resourcevar.h
===================================================================
RCS file: /cvsroot/src/sys/sys/resourcevar.h,v
retrieving revision 1.38
diff -u -p -r1.38 resourcevar.h
--- sys/sys/resourcevar.h	12 Jul 2007 11:05:42 -0000	1.38
+++ sys/sys/resourcevar.h	11 Sep 2007 01:35:28 -0000
@@ -67,6 +67,9 @@ struct pstats {
  * ("threads") share modifications, the PL_SHAREMOD flag is set,
  * and a copy must be made for the child of a new fork that isn't
  * sharing modifications to the limits.
+ *
+ * Because of this sharing, all members are locked by p_lock and never
+ * struct proc's p_mutex.
  */
 struct plimit {
 	struct	rlimit pl_rlimit[RLIM_NLIMITS];
@@ -121,7 +124,9 @@ void	 addupc_intr(struct lwp *, u_long);
 void	 addupc_task(struct lwp *, u_long, u_int);
 void	 calcru(struct proc *, struct timeval *, struct timeval *,
 	    struct timeval *, struct timeval *);
-struct plimit *limcopy(struct proc *);
+struct plimit *limshare(struct proc *);
+struct plimit *limsharecow(struct proc *);
+void limprivatize(struct proc *, int);
 void limfree(struct plimit *);
 void	ruadd(struct rusage *, struct rusage *);
 struct	pstats *pstatscopy(struct pstats *);