Subject: struct plimit locking
To: None <tech-kern@netbsd.org>
From: David Laight <david@l8s.co.uk>
List: tech-kern
Date: 09/22/2007 15:00:37
The patch below fixes (I hope) the locking for p->p_limit (aka process
rlimit values and corename).
Inspired by the patch in PR/36939, but I think it covers all the issues
and is MP safe.

Since a lot of code looks at p->p_limit->pl_rlimit[foo] without acquiring
any lock (usually as p->p_rlimit) I've taken care to ensure that the
value of p->p_limit will always be a pointer to a valid 'struct plimit'
until the process exits.

In order to modify a process's p_rlimit values, the p_limit structure
must be marked 'PL_WRITEABLE' (by taking a copy). When this is done the
old p_limit (which another thread might have an active pointer to) is
saved inside the new structure - only being released when the process dies.

During fork() the p_limit structure is copied if marked 'PL_WRITEABLE',
otherwise a ref count is incremented (as before).

The effect is that once the PL_WRITEABLE bit is set, the structure can
always be modified, and the bit can never get cleared.

Changes to, and accesses of, the pl-corename field must be done with
the pl_lock held.  Fortunately this doesn't affect much.

The code is made rather more complicated by the PL_SHAREMOD flag (limits
shared writable over fork).  From the code, irix emulation should have
set this, and the clone() for linux threads possibly set it.
In any case I've made it a feature of fork1().
The history of this flag is steeped in the deleted revisions of the
netbsd cvs repository (or may even predate it) - I can't find any hint
of any code that might have set it!

Initial tests show that the code isn't broken...

	David

Index: kern/kern_acct.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_acct.c,v
retrieving revision 1.76
diff -u -p -r1.76 kern_acct.c
--- kern/kern_acct.c	21 Sep 2007 19:19:20 -0000	1.76
+++ kern/kern_acct.c	22 Sep 2007 13:35:45 -0000
@@ -387,9 +387,12 @@ acct_process(struct lwp *l)
 	struct timeval ut, st, tmp;
 	struct rusage *r;
 	int t, error = 0;
-	struct plimit *oplim = NULL;
+	struct rlimit orlim;
 	struct proc *p = l->l_proc;
 
+	if (acct_state != ACCT_ACTIVE)
+		return 0;
+
 	mutex_enter(&acct_lock);
 
 	/* If accounting isn't enabled, don't bother */
@@ -397,18 +400,16 @@ acct_process(struct lwp *l)
 		goto out;
 
 	/*
-	 * Raise the file limit so that accounting can't be stopped by
-	 * the user.
+	 * Temporarily raise the file limit so that accounting can't
+	 * be stopped by the user.
 	 *
 	 * XXX We should think about the CPU limit, too.
 	 */
-	mutex_enter(&p->p_mutex);
-	if (p->p_limit->pl_refcnt > 1) {
-		oplim = p->p_limit;
-		p->p_limit = limcopy(p);
-	}
+	lim_privatise(p, false);
+	orlim = p->p_rlimit[RLIMIT_FSIZE];
+	/* Set current and max to avoid illegal values */
 	p->p_rlimit[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
-	mutex_exit(&p->p_mutex);
+	p->p_rlimit[RLIMIT_FSIZE].rlim_max = RLIM_INFINITY;
 
 	/*
 	 * Get process accounting information.
@@ -467,12 +468,8 @@ acct_process(struct lwp *l)
 	if (error != 0)
 		log(LOG_ERR, "Accounting: write failed %d\n", error);
 
-	if (oplim) {
-		mutex_enter(&p->p_mutex);
-		limfree(p->p_limit);
-		p->p_limit = oplim;
-		mutex_exit(&p->p_mutex);
-	}
+	/* Restore limit - rather pointless since process is about to exit */
+	p->p_rlimit[RLIMIT_FSIZE] = orlim;
 
  out:
 	mutex_exit(&acct_lock);
Index: kern/kern_fork.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_fork.c,v
retrieving revision 1.143
diff -u -p -r1.143 kern_fork.c
--- kern/kern_fork.c	21 Sep 2007 19:19:20 -0000	1.143
+++ kern/kern_fork.c	22 Sep 2007 13:35:45 -0000
@@ -215,6 +215,7 @@ fork1(struct lwp *l1, int flags, int exi
     struct proc **rnewprocp)
 {
 	struct proc	*p1, *p2, *parent;
+	struct plimit   *p1_lim;
 	uid_t		uid;
 	struct lwp	*l2;
 	int		count;
@@ -352,20 +353,20 @@ fork1(struct lwp *l1, int flags, int exi
 		p2->p_cwdi = cwdinit(p1);
 
 	/*
-	 * If p_limit is still copy-on-write, bump refcnt,
-	 * otherwise get a copy that won't be modified.
-	 * (If PL_SHAREMOD is clear, the structure is shared
-	 * copy-on-write.)
-	 */
-	if (p1->p_limit->pl_flags & PL_SHAREMOD) {
-		mutex_enter(&p1->p_mutex);
-		p2->p_limit = limcopy(p1);
-		mutex_exit(&p1->p_mutex);
-	} else {
-		mutex_enter(&p1->p_limit->pl_lock);
-		p1->p_limit->pl_refcnt++;
-		mutex_exit(&p1->p_limit->pl_lock);
-		p2->p_limit = p1->p_limit;
+	 * p_limit (rlimit stuff) is usually copy-on-write, so we just need
+	 * to bump pl_refcnt.
+	 * However in some cases (see compat irix, and plausibly from clone)
+	 * the parent and child share limits - in which case nothing else
+	 * must have a copy of the limits (PL_SHAREMOD is set).
+	 */
+	if (__predict_false(flags & FORK_SHARELIMIT))
+		lim_privatise(p1, 1);
+	p1_lim = p1->p_limit;
+	if (p1_lim->pl_flags & PL_WRITEABLE && !(flags & FORK_SHARELIMIT))
+		p2->p_limit = lim_copy(p1_lim);
+	else {
+		lim_addref(p1_lim);
+		p2->p_limit = p1_lim;
 	}
 
 	p2->p_sflag = ((flags & FORK_PPWAIT) ? PS_PPWAIT : 0);
Index: kern/kern_proc.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_proc.c,v
retrieving revision 1.116
diff -u -p -r1.116 kern_proc.c
--- kern/kern_proc.c	21 Sep 2007 19:19:20 -0000	1.116
+++ kern/kern_proc.c	22 Sep 2007 13:35:46 -0000
@@ -407,6 +407,7 @@ proc0_init(void)
 	limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = lim / 3;
 	limit0.pl_corename = defcorename;
 	limit0.pl_refcnt = 1;
+	limit0.pl_sv_limit = NULL;
 
 	/* Configure virtual memory system, set vm rlimits. */
 	uvm_init_limits(p);
@@ -1314,6 +1315,18 @@ proc_crmod_enter(void)
 	kauth_cred_t oc;
 	char *cn;
 
+	/* Reset what needs to be reset in plimit. */
+	if (p->p_limit->pl_corename != defcorename) {
+		lim_privatise(p, false);
+		lim = p->p_limit;
+		mutex_enter(&lim->pl_lock);
+		cn = lim->pl_corename;
+		lim->pl_corename = defcorename;
+		mutex_exit(&lim->pl_lock);
+		if (cn != defcorename)
+			free(cn, M_TEMP);
+	}
+
 	mutex_enter(&p->p_mutex);
 
 	/* Ensure the LWP cached credentials are up to date. */
@@ -1323,22 +1336,6 @@ proc_crmod_enter(void)
 		kauth_cred_free(oc);
 	}
 
-	/* Reset what needs to be reset in plimit. */
-	lim = p->p_limit;
-	if (lim->pl_corename != defcorename) {
-		if (lim->pl_refcnt > 1 &&
-		    (lim->pl_flags & PL_SHAREMOD) == 0) {
-			p->p_limit = limcopy(p);
-			limfree(lim);
-			lim = p->p_limit;
-		}
-		mutex_enter(&lim->pl_lock);
-		cn = lim->pl_corename;
-		lim->pl_corename = defcorename;
-		mutex_exit(&lim->pl_lock);
-		if (cn != defcorename)
-			free(cn, M_TEMP);
-	}
 }
 
 /*
Index: kern/kern_resource.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_resource.c,v
retrieving revision 1.121
diff -u -p -r1.121 kern_resource.c
--- kern/kern_resource.c	21 Sep 2007 19:19:21 -0000	1.121
+++ kern/kern_resource.c	22 Sep 2007 13:35:46 -0000
@@ -256,7 +256,6 @@ int
 dosetrlimit(struct lwp *l, struct proc *p, int which, struct rlimit *limp)
 {
 	struct rlimit *alimp;
-	struct plimit *oldplim;
 	int error;
 
 	if ((u_int)which >= RLIM_NLIMITS)
@@ -265,12 +264,6 @@ dosetrlimit(struct lwp *l, struct proc *
 	if (limp->rlim_cur < 0 || limp->rlim_max < 0)
 		return (EINVAL);
 
-	alimp = &p->p_rlimit[which];
-	/* if we don't change the value, no need to limcopy() */
-	if (limp->rlim_cur == alimp->rlim_cur &&
-	    limp->rlim_max == alimp->rlim_max)
-		return 0;
-
 	if (limp->rlim_cur > limp->rlim_max) {
 		/*
 		 * This is programming error. According to SUSv2, we should
@@ -278,19 +271,21 @@ dosetrlimit(struct lwp *l, struct proc *
 		 */
 		return (EINVAL);
 	}
+
+	alimp = &p->p_rlimit[which];
+	/* if we don't change the value, no need to limcopy() */
+	if (limp->rlim_cur == alimp->rlim_cur &&
+	    limp->rlim_max == alimp->rlim_max)
+		return 0;
+
 	error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_RLIMIT,
 	    p, limp, KAUTH_ARG(which), NULL);
 	if (error)
-			return (error);
+		return (error);
 
-	mutex_enter(&p->p_mutex);
-	if (p->p_limit->pl_refcnt > 1 &&
-	    (p->p_limit->pl_flags & PL_SHAREMOD) == 0) {
-	    	oldplim = p->p_limit;
-		p->p_limit = limcopy(p);
-		limfree(oldplim);
-		alimp = &p->p_rlimit[which];
-	}
+	lim_privatise(p, false);
+	/* p->p_limit is now unchangeable */
+	alimp = &p->p_rlimit[which];
 
 	switch (which) {
 
@@ -315,7 +310,6 @@ dosetrlimit(struct lwp *l, struct proc *
 		 */
 		if (limp->rlim_cur < p->p_vmspace->vm_ssize * PAGE_SIZE
 		    || limp->rlim_max < p->p_vmspace->vm_ssize * PAGE_SIZE) {
-			mutex_exit(&p->p_mutex);
 			return (EINVAL);
 		}
 
@@ -366,8 +360,10 @@ dosetrlimit(struct lwp *l, struct proc *
 			limp->rlim_max = maxproc;
 		break;
 	}
+
+	mutex_enter(&p->p_limit->pl_lock);
 	*alimp = *limp;
-	mutex_exit(&p->p_mutex);
+	mutex_exit(&p->p_limit->pl_lock);
 	return (0);
 }
 
@@ -531,77 +527,126 @@ ruadd(struct rusage *ru, struct rusage *
  * We share these structures copy-on-write after fork,
  * and copy when a limit is changed.
  *
- * XXXSMP This is atrocious, need to simplify.
+ * Unfortunately (due to PL_SHAREMOD) it is possibly for the structure
+ * we are copying to change beneath our feet!
  */
 struct plimit *
-limcopy(struct proc *p)
+lim_copy(struct plimit *lim)
 {
-	struct plimit *lim, *newlim;
+	struct plimit *newlim;
 	char *corename;
-	size_t l;
+	size_t alen, len;
 
-	KASSERT(mutex_owned(&p->p_mutex));
-
-	mutex_exit(&p->p_mutex);
 	newlim = pool_get(&plimit_pool, PR_WAITOK);
 	mutex_init(&newlim->pl_lock, MUTEX_DEFAULT, IPL_NONE);
 	newlim->pl_flags = 0;
 	newlim->pl_refcnt = 1;
-	mutex_enter(&p->p_mutex);
+	newlim->pl_sv_limit = NULL;
 
-	for (;;) {
-		lim = p->p_limit;
-		mutex_enter(&lim->pl_lock);
-		if (lim->pl_corename != defcorename) {
-			l = strlen(lim->pl_corename) + 1;
+	mutex_enter(&lim->pl_lock);
+	memcpy(newlim->pl_rlimit, lim->pl_rlimit,
+	    sizeof(struct rlimit) * RLIM_NLIMITS);
 
-			mutex_exit(&lim->pl_lock);
-			mutex_exit(&p->p_mutex);
-			corename = malloc(l, M_TEMP, M_WAITOK);
-			mutex_enter(&p->p_mutex);
-			mutex_enter(&lim->pl_lock);
-
-			if (l != strlen(lim->pl_corename) + 1) {
-				mutex_exit(&lim->pl_lock);
-				mutex_exit(&p->p_mutex);
-				free(corename, M_TEMP);
-				mutex_enter(&p->p_mutex);
-				continue;
-			}
-		} else
-			l = 0;
-			
-		memcpy(newlim->pl_rlimit, lim->pl_rlimit,
-		    sizeof(struct rlimit) * RLIM_NLIMITS);
-		if (l != 0)
-			strlcpy(newlim->pl_corename, lim->pl_corename, l);
-		else
+	alen = 0;
+	corename = NULL;
+	for (;;) {
+		if (lim->pl_corename == defcorename) {
 			newlim->pl_corename = defcorename;
+			break;
+		}
+		len = strlen(lim->pl_corename) + 1;
+		if (len <= alen) {
+			newlim->pl_corename = corename;
+			memcpy(corename, lim->pl_corename, len);
+			corename = NULL;
+			break;
+		}
 		mutex_exit(&lim->pl_lock);
-		break;
+		if (corename != NULL)
+			free(corename, M_TEMP);
+		alen = len;
+		corename = malloc(alen, M_TEMP, M_WAITOK);
+		mutex_enter(&lim->pl_lock);
+	}
+	mutex_exit(&lim->pl_lock);
+	if (corename != NULL)
+		free(corename, M_TEMP);
+	return newlim;
+}
+
+void
+lim_addref(struct plimit *lim)
+{
+	mutex_enter(&lim->pl_lock);
+	lim->pl_refcnt++;
+	mutex_exit(&lim->pl_lock);
+}
+
+/*
+ * Give a process it's own private plimit structure.
+ * This will only be shared (in fork) if modifications are to be shared.
+ */
+void
+lim_privatise(struct proc *p, bool set_shared)
+{
+	struct plimit *lim, *newlim;
+
+	lim = p->p_limit;
+	if (lim->pl_flags & PL_WRITEABLE) {
+		if (set_shared)
+			lim->pl_flags |= PL_SHAREMOD;
+		return;
+	}
+
+	if (set_shared && lim->pl_flags & PL_SHAREMOD)
+		return;
+
+	newlim = lim_copy(lim);
+
+	mutex_enter(&p->p_mutex);
+	if (p->p_limit->pl_flags & PL_WRITEABLE) {
+		/* Someone crept in while we were busy */
+		mutex_exit(&p->p_mutex);
+		limfree(newlim);
+		if (set_shared)
+			p->p_limit->pl_flags |= PL_SHAREMOD;
+		return;
 	}
 
-	return (newlim);
+	/*
+	 * Since most accesses to p->p_limit aren't locked, we must not
+	 * delete the old limit structure yet.
+	 */
+	newlim->pl_sv_limit = p->p_limit;
+	newlim->pl_flags |= PL_WRITEABLE;
+	if (set_shared)
+		newlim->pl_flags |= PL_SHAREMOD;
+	p->p_limit = newlim;
+	mutex_exit(&p->p_mutex);
 }
 
 void
 limfree(struct plimit *lim)
 {
+	struct plimit *sv_lim;
 	int n;
 
-	mutex_enter(&lim->pl_lock);
-	n = --lim->pl_refcnt;
-	mutex_exit(&lim->pl_lock);
-	if (n > 0)
-		return;
+	do {
+		mutex_enter(&lim->pl_lock);
+		n = --lim->pl_refcnt;
+		mutex_exit(&lim->pl_lock);
+		if (n > 0)
+			return;
 #ifdef DIAGNOSTIC
-	if (n < 0)
-		panic("limfree");
+		if (n < 0)
+			panic("limfree");
 #endif
-	if (lim->pl_corename != defcorename)
-		free(lim->pl_corename, M_TEMP);
-	mutex_destroy(&lim->pl_lock);
-	pool_put(&plimit_pool, lim);
+		if (lim->pl_corename != defcorename)
+			free(lim->pl_corename, M_TEMP);
+		sv_lim = lim->pl_sv_limit;
+		mutex_destroy(&lim->pl_lock);
+		pool_put(&plimit_pool, lim);
+	} while ((lim = sv_lim) != NULL);
 }
 
 struct pstats *
@@ -665,6 +710,7 @@ sysctl_proc_corename(SYSCTLFN_ARGS)
 	struct plimit *lim;
 	int error = 0, len;
 	char *cname;
+	char *ocore;
 	char *tmp;
 	struct sysctlnode node;
 
@@ -689,12 +735,16 @@ sysctl_proc_corename(SYSCTLFN_ARGS)
 	if (error)
 		return (error);
 
-	cname = PNBUF_GET();
 	/*
 	 * let them modify a temporary copy of the core name
 	 */
+	cname = PNBUF_GET();
+	lim = ptmp->p_limit;
+	mutex_enter(&lim->pl_lock);
+	strlcpy(cname, lim->pl_corename, MAXPATHLEN);
+	mutex_exit(&lim->pl_lock);
+
 	node = *rnode;
-	strlcpy(cname, ptmp->p_limit->pl_corename, MAXPATHLEN);
 	node.sysctl_data = cname;
 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
 
@@ -702,10 +752,15 @@ sysctl_proc_corename(SYSCTLFN_ARGS)
 	 * if that failed, or they have nothing new to say, or we've
 	 * heard it before...
 	 */
-	if (error || newp == NULL ||
-	    strcmp(cname, ptmp->p_limit->pl_corename) == 0) {
+	if (error || newp == NULL)
+		goto done;
+	lim = ptmp->p_limit;
+	mutex_enter(&lim->pl_lock);
+	error = strcmp(cname, lim->pl_corename);
+	mutex_exit(&lim->pl_lock);
+	if (error == 0)
+		/* Unchanged */
 		goto done;
-	}
 
 	error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_CORENAME,
 	    ptmp, cname, NULL, NULL);
@@ -737,19 +792,17 @@ sysctl_proc_corename(SYSCTLFN_ARGS)
 		error = ENOMEM;
 		goto done;
 	}
-	strlcpy(tmp, cname, len + 1);
+	memcpy(tmp, cname, len + 1);
 
-	mutex_enter(&ptmp->p_mutex);
+	lim_privatise(ptmp, false);
 	lim = ptmp->p_limit;
-	if (lim->pl_refcnt > 1 && (lim->pl_flags & PL_SHAREMOD) == 0) {
-		ptmp->p_limit = limcopy(ptmp);
-		limfree(lim);
-		lim = ptmp->p_limit;
-	}
-	if (lim->pl_corename != defcorename)
-		free(lim->pl_corename, M_TEMP);
+	mutex_enter(&lim->pl_lock);
+	ocore = lim->pl_corename;
 	lim->pl_corename = tmp;
-	mutex_exit(&ptmp->p_mutex);
+	mutex_exit(&lim->pl_lock);
+	if (ocore != defcorename)
+		free(ocore, M_TEMP);
+
 done:
 	PNBUF_PUT(cname);
 	return error;
Index: sys/proc.h
===================================================================
RCS file: /cvsroot/src/sys/sys/proc.h,v
retrieving revision 1.254
diff -u -p -r1.254 proc.h
--- sys/proc.h	7 Sep 2007 18:56:12 -0000	1.254
+++ sys/proc.h	22 Sep 2007 13:35:46 -0000
@@ -444,14 +444,15
 #define	FORK_SHARESIGS	0x10		/* Share signal actions */
 #define	FORK_NOWAIT	0x20		/* Make init the parent of the child */
 #define	FORK_CLEANFILES	0x40		/* Start with a clean descriptor set */
 #define	FORK_SYSTEM	0x80		/* Fork a kernel thread */
+#define	FORK_SHARELIMIT	0x0100		/* Share rlimit values */
 
 /*
  * Allow machine-dependent code to override curlwp in <machine/cpu.h> for
Index: sys/resourcevar.h
===================================================================
RCS file: /cvsroot/src/sys/sys/resourcevar.h,v
retrieving revision 1.39
diff -u -p -r1.39 resourcevar.h
--- sys/resourcevar.h	21 Sep 2007 19:19:20 -0000	1.39
+++ sys/resourcevar.h	22 Sep 2007 13:35:46 -0000
@@ -67,14 +67,19 @@ struct pstats {
  * ("threads") share modifications, the PL_SHAREMOD flag is set,
  * and a copy must be made for the child of a new fork that isn't
  * sharing modifications to the limits.
+ *
+ * The PL_xxx flags are never cleared, once either is set p->p_limit
+ * will never be changed again.
  */
 struct plimit {
 	struct	rlimit pl_rlimit[RLIM_NLIMITS];
 	char	*pl_corename;
 #define	PL_SHAREMOD	0x01		/* modifications are shared */
+#define	PL_WRITEABLE	0x02		/* private to this process */
 	int	pl_flags;
 	int	pl_refcnt;		/* number of references */
 	kmutex_t pl_lock;		/* mutex for pl_refcnt */
+	struct plimit *pl_sv_limit;	/* saved when PL_WRITEABLE set */
 };
 
 /* add user profiling from AST XXXSMP */
@@ -121,8 +126,12 @@ void	 addupc_intr(struct lwp *, u_long);
 void	 addupc_task(struct lwp *, u_long, u_int);
 void	 calcru(struct proc *, struct timeval *, struct timeval *,
 	    struct timeval *, struct timeval *);
-struct plimit *limcopy(struct proc *);
+struct plimit *lim_copy(struct plimit *lim);
+void lim_addref(struct plimit *lim);
+void lim_privatise(struct proc *p, bool set_shared);
 void limfree(struct plimit *);
+
 void	ruadd(struct rusage *, struct rusage *);
 struct	pstats *pstatscopy(struct pstats *);
 void 	pstatsfree(struct pstats *);
-- 
David Laight: david@l8s.co.uk