Subject: Re: Revising CPU usage calculations
To: tech-kern@netbsd.org
From: Daniel Sieger <dsieger@TechFak.Uni-Bielefeld.DE>
List: tech-kern
Date: 03/25/2007 15:09:55

Hi all,

The attached patch handles CPU usage calculations in a
scheduler-independent manner. In short, here is what it does:

- Move the ccpu sysctl back to init_sysctl.c, as it is not
  scheduler-dependent.
- Move the scheduler-independent parts of 4BSD's schedcpu() to
  kern_synch.c (the p_pctcpu decay that moves there is sketched
  after this list).
- Add a scheduler-specific hook to satisfy each scheduler's
  needs.
- Slightly reduce the locking burden.
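
For reference, here is a stand-alone sketch (hypothetical, not part of
the patch) of how the p_pctcpu decay that moves into kern_synch.c
behaves. It assumes FSHIFT is 11, as in sys/param.h; with
ccpu = exp(-1/20) * FSCALE, sixty once-per-second multiplications
decay a value to about 5% of its original size, which is where the
"decay 95% in 60 seconds" comment comes from:

/*
 * Hypothetical userland demo, not part of the patch: the fixed-point
 * p_pctcpu decay as applied once per second.
 */
#include <stdio.h>

#define FSHIFT	11		/* assumed value, as in sys/param.h */
#define FSCALE	(1 << FSHIFT)

typedef unsigned int fixpt_t;

int
main(void)
{
	const fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
	fixpt_t pctcpu = FSCALE;	/* start at "100%" */
	int i;

	for (i = 0; i < 60; i++)
		pctcpu = (pctcpu * ccpu) >> FSHIFT;

	/* Prints a value near 0.05, i.e. ~95% decayed; exp(-3) ~= 0.0498. */
	printf("after 60s: %.3f\n", (double)pctcpu / FSCALE);
	return 0;
}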

This adds two new functions to the scheduler API: sched_pstats()
(formerly schedcpu()) and sched_pstats_hook(). The naming is not
perfect, but unless someone has a better suggestion I'd like to stick
with it; IMHO it best describes what the code actually does.
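
To illustrate the contract, here is a hypothetical hook for a
scheduler that keeps no per-second state of its own (an example, not
code from the patch; 4BSD's real hook does autonice and
p_estcpu/priority recalculation). sched_pstats() invokes the hook once
per process, with p_smutex and p_stmutex held:

/*
 * Hypothetical sched_pstats_hook() for a minimal scheduler.  By the
 * time this runs, sched_pstats() has already updated sleep times,
 * p_pctcpu and the RLIMIT_CPU accounting.
 */
inline void
sched_pstats_hook(struct proc *p, int minslp, long runtm)
{

	/*
	 * minslp == 2 means every LWP slept through the last second
	 * or longer; skip those, recompute estimates for the rest.
	 */
	if (minslp <= 1) {
		/* e.g. decay this scheduler's CPU estimate for p here */
	}
}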

Any comments/suggestions?

Regards,
Daniel

-- 
Daniel Sieger
Faculty of Technology
Bielefeld University
wwwhomes.uni-bielefeld.de/dsieger

[Attachment: pstats.diff]

Index: kern/init_sysctl.c
===================================================================
RCS file: /cvsroot/src/sys/kern/init_sysctl.c,v
retrieving revision 1.96.2.4
diff -u -r1.96.2.4 init_sysctl.c
--- kern/init_sysctl.c	23 Mar 2007 15:56:07 -0000	1.96.2.4
+++ kern/init_sysctl.c	25 Mar 2007 12:53:08 -0000
@@ -328,6 +328,7 @@
 SYSCTL_SETUP(sysctl_kern_setup, "sysctl kern subtree setup")
 {
 	extern int kern_logsigexit;	/* defined in kern/kern_sig.c */
+	extern fixpt_t ccpu;		/* defined in kern/kern_synch.c */
 	extern int dumponpanic;		/* defined in kern/subr_prf.c */
 	const struct sysctlnode *rnode;
 
@@ -643,6 +644,12 @@
 		       CTL_KERN, KERN_FSCALE, CTL_EOL);
 	sysctl_createv(clog, 0, NULL, NULL,
 		       CTLFLAG_PERMANENT,
+		       CTLTYPE_INT, "ccpu",
+		       SYSCTL_DESCR("Scheduler exponential decay value"),
+		       NULL, 0, &ccpu, 0,
+		       CTL_KERN, KERN_CCPU, CTL_EOL);
+	sysctl_createv(clog, 0, NULL, NULL,
+		       CTLFLAG_PERMANENT,
 		       CTLTYPE_STRUCT, "cp_time",
 		       SYSCTL_DESCR("Clock ticks spent in different CPU states"),
 		       sysctl_kern_cptime, 0, NULL, 0,
Index: kern/kern_synch.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_synch.c,v
retrieving revision 1.177.2.20
diff -u -r1.177.2.20 kern_synch.c
--- kern/kern_synch.c	24 Mar 2007 17:13:14 -0000	1.177.2.20
+++ kern/kern_synch.c	25 Mar 2007 12:53:08 -0000
@@ -100,6 +100,9 @@
 
 #include <uvm/uvm_extern.h>
 
+struct callout sched_pstats_ch = CALLOUT_INITIALIZER_SETFUNC(sched_pstats, NULL);
+unsigned int sched_pstats_ticks;
+
 int	lbolt;			/* once a second sleep address */
 
 static void	sched_unsleep(struct lwp *);
@@ -777,3 +780,116 @@
 
 	return NULL;
 }
+
+
+/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
+fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */
+
+/*
+ * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
+ * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
+ * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
+ *
+ * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
+ *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
+ *
+ * If you don't want to bother with the faster/more-accurate formula, you
+ * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
+ * (more general) method of calculating the %age of CPU used by a process.
+ */
+#define	CCPU_SHIFT	(FSHIFT + 1)
+
+/*
+ * sched_pstats:
+ *
+ *	Update process statistics and check CPU resource allocation.
+ *	Call the scheduler-specific hook to adjust process/LWP
+ *	priorities as needed.
+ *
+ *	XXXSMP This needs to be reorganised in order to reduce the locking
+ *	burden.
+ */
+/* ARGSUSED */
+void
+sched_pstats(void *arg)
+{
+	struct rlimit *rlim;
+	struct lwp *l;
+	struct proc *p;
+	int minslp, sig, clkhz;
+	long runtm;
+
+	sched_pstats_ticks++;
+
+	mutex_enter(&proclist_mutex);
+	PROCLIST_FOREACH(p, &allproc) {
+		/*
+		 * Increment time in/out of memory and sleep time (if
+		 * sleeping).  We ignore overflow; with 16-bit int's
+		 * (remember them?) overflow takes 45 days.
+		 */
+		minslp = 2;
+		mutex_enter(&p->p_smutex);
+		runtm = p->p_rtime.tv_sec;
+		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
+			if ((l->l_flag & LW_IDLE) != 0)
+				continue;
+			lwp_lock(l);
+			runtm += l->l_rtime.tv_sec;
+			l->l_swtime++;
+			if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
+			    l->l_stat == LSSUSPENDED) {
+				l->l_slptime++;
+				minslp = min(minslp, l->l_slptime);
+			} else
+				minslp = 0;
+			lwp_unlock(l);
+		}
+
+		/*
+		 * Check if the process exceeds its CPU resource allocation.
+		 * If over max, kill it.
+		 */
+		rlim = &p->p_rlimit[RLIMIT_CPU];
+		sig = 0;
+		if (runtm >= rlim->rlim_cur) {
+			if (runtm >= rlim->rlim_max)
+				sig = SIGKILL;
+			else {
+				sig = SIGXCPU;
+				if (rlim->rlim_cur < rlim->rlim_max)
+					rlim->rlim_cur += 5;
+			}
+		}
+
+		mutex_spin_enter(&p->p_stmutex);
+		if (minslp < 1) {
+			/*
+			 * p_pctcpu is only for ps.
+			 */
+			p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
+			clkhz = stathz != 0 ? stathz : hz;
+#if	(FSHIFT >= CCPU_SHIFT)
+			p->p_pctcpu += (clkhz == 100)?
+			    ((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
+			    100 * (((fixpt_t) p->p_cpticks)
+			    << (FSHIFT - CCPU_SHIFT)) / clkhz;
+#else
+			p->p_pctcpu += ((FSCALE - ccpu) *
+			    (p->p_cpticks * FSCALE / clkhz)) >> FSHIFT;
+#endif
+			p->p_cpticks = 0;
+		}
+
+		sched_pstats_hook(p, minslp, runtm);
+		mutex_spin_exit(&p->p_stmutex);
+		mutex_exit(&p->p_smutex);
+		if (sig) {
+			psignal(p, sig);
+		}
+	}
+	mutex_exit(&proclist_mutex);
+	uvm_meter();
+	wakeup(&lbolt);
+	callout_schedule(&sched_pstats_ch, hz);
+}
Index: kern/sched_4bsd.c
===================================================================
RCS file: /cvsroot/src/sys/kern/Attic/sched_4bsd.c,v
retrieving revision 1.1.2.22
diff -u -r1.1.2.22 sched_4bsd.c
--- kern/sched_4bsd.c	24 Mar 2007 16:50:26 -0000	1.1.2.22
+++ kern/sched_4bsd.c	25 Mar 2007 12:53:09 -0000
@@ -120,13 +120,11 @@
 } runqueue_t;
 static runqueue_t global_queue; 
 
-static void schedcpu(void *);
 static void updatepri(struct lwp *);
 static void resetpriority(struct lwp *);
 static void resetprocpriority(struct proc *);
 
-struct callout schedcpu_ch = CALLOUT_INITIALIZER_SETFUNC(schedcpu, NULL);
-static unsigned int schedcpu_ticks;
+extern unsigned int sched_pstats_ticks; /* defined in kern_synch.c */
 
 /* The global scheduler state */
 kmutex_t sched_mutex;
@@ -275,141 +273,45 @@
 	return estcpu;
 }
 
-/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
-fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */
-
-/*
- * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
- * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
- * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
- *
- * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
- *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
- *
- * If you dont want to bother with the faster/more-accurate formula, you
- * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
- * (more general) method of calculating the %age of CPU used by a process.
- */
-#define	CCPU_SHIFT	11
-
 /*
- * schedcpu:
+ * sched_pstats_hook:
  *
- *	Recompute process priorities, every hz ticks.
- *
- *	XXXSMP This needs to be reorganised in order to reduce the locking
- *	burden.
+ *	Periodically called from sched_pstats().  Used for autonice
+ *	and priority recalculation.
  */
-/* ARGSUSED */
-static void
-schedcpu(void *arg)
+inline void
+sched_pstats_hook(struct proc *p, int minslp, long runtm)
 {
-	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
-	struct rlimit *rlim;
 	struct lwp *l;
-	struct proc *p;
-	int minslp, clkhz, sig;
-	long runtm;
-
-	schedcpu_ticks++;
-
-	mutex_enter(&proclist_mutex);
-	PROCLIST_FOREACH(p, &allproc) {
-		/*
-		 * Increment time in/out of memory and sleep time (if
-		 * sleeping).  We ignore overflow; with 16-bit int's
-		 * (remember them?) overflow takes 45 days.
-		 */
-		minslp = 2;
-		mutex_enter(&p->p_smutex);
-		runtm = p->p_rtime.tv_sec;
+	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
+
+	/*
+	 * If the process has run for more than autonicetime, reduce
+	 * priority to give others a chance.
+	 */
+	if (autonicetime && runtm > autonicetime && p->p_nice == NZERO
+	    && kauth_cred_geteuid(p->p_cred)) {
+		p->p_nice = autoniceval + NZERO;
+		resetprocpriority(p);
+	}
+
+	/*
+	 * If the process has slept the entire second,
+	 * stop recalculating its priority until it wakes up.
+	 */
+	if (minslp <= 1) {
+		p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
+
 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
 			if ((l->l_flag & LW_IDLE) != 0)
 				continue;
 			lwp_lock(l);
-			runtm += l->l_rtime.tv_sec;
-			l->l_swtime++;
-			if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
-			    l->l_stat == LSSUSPENDED) {
-				l->l_slptime++;
-				minslp = min(minslp, l->l_slptime);
-			} else
-				minslp = 0;
+			if (l->l_slptime <= 1 &&
+			    l->l_priority >= PUSER)
+				resetpriority(l);
 			lwp_unlock(l);
 		}
-		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
-
-		/*
-		 * Check if the process exceeds its CPU resource allocation.
-		 * If over max, kill it.
-		 */
-		rlim = &p->p_rlimit[RLIMIT_CPU];
-		sig = 0;
-		if (runtm >= rlim->rlim_cur) {
-			if (runtm >= rlim->rlim_max)
-				sig = SIGKILL;
-			else {
-				sig = SIGXCPU;
-				if (rlim->rlim_cur < rlim->rlim_max)
-					rlim->rlim_cur += 5;
-			}
-		}
-
-		/* 
-		 * If the process has run for more than autonicetime, reduce
-		 * priority to give others a chance.
-		 */
-		if (autonicetime && runtm > autonicetime && p->p_nice == NZERO
-		    && kauth_cred_geteuid(p->p_cred)) {
-			mutex_spin_enter(&p->p_stmutex);
-			p->p_nice = autoniceval + NZERO;
-			resetprocpriority(p);
-			mutex_spin_exit(&p->p_stmutex);
-		}
-
-		/*
-		 * If the process has slept the entire second,
-		 * stop recalculating its priority until it wakes up.
-		 */
-		if (minslp <= 1) {
-			/*
-			 * p_pctcpu is only for ps.
-			 */
-			mutex_spin_enter(&p->p_stmutex);
-			clkhz = stathz != 0 ? stathz : hz;
-#if	(FSHIFT >= CCPU_SHIFT)
-			p->p_pctcpu += (clkhz == 100)?
-			    ((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
-			    100 * (((fixpt_t) p->p_cpticks)
-			    << (FSHIFT - CCPU_SHIFT)) / clkhz;
-#else
-			p->p_pctcpu += ((FSCALE - ccpu) *
-			    (p->p_cpticks * FSCALE / clkhz)) >> FSHIFT;
-#endif
-			p->p_cpticks = 0;
-			p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
-
-			LIST_FOREACH(l, &p->p_lwps, l_sibling) {
-				if ((l->l_flag & LW_IDLE) != 0)
-					continue;
-				lwp_lock(l);
-				if (l->l_slptime <= 1 &&
-				    l->l_priority >= PUSER)
-					resetpriority(l);
-				lwp_unlock(l);
-			}
-			mutex_spin_exit(&p->p_stmutex);
-		}
-
-		mutex_exit(&p->p_smutex);
-		if (sig) {
-			psignal(p, sig);
-		}
 	}
-	mutex_exit(&proclist_mutex);
-	uvm_meter();
-	wakeup(&lbolt);
-	callout_schedule(&schedcpu_ch, hz);
 }
 
 /*
@@ -426,7 +328,7 @@
 
 	loadfac = loadfactor(averunnable.ldavg[0]);
 
-	l->l_slptime--; /* the first time was done in schedcpu */
+	l->l_slptime--; /* the first time was done in sched_pstats */
 	/* XXX NJWLWP */
 	/* XXXSMP occasionally unlocked, should be per-LWP */
 	p->p_estcpu = decay_cpu_batch(loadfac, p->p_estcpu, l->l_slptime);
@@ -619,7 +521,7 @@
 {
 
 	rrticks = hz / 10;
-	schedcpu(NULL);
+	sched_pstats(NULL);
 }
 
 void
@@ -728,7 +630,7 @@
 	LOCK_ASSERT(mutex_owned(&parent->p_smutex));
 
 	child->p_estcpu = child->p_estcpu_inherited = parent->p_estcpu;
-	child->p_forktime = schedcpu_ticks;
+	child->p_forktime = sched_pstats_ticks;
 }
 
 /*
@@ -746,7 +648,7 @@
 
 	mutex_spin_enter(&parent->p_stmutex);
 	estcpu = decay_cpu_batch(loadfac, child->p_estcpu_inherited,
-	    schedcpu_ticks - child->p_forktime);
+	    sched_pstats_ticks - child->p_forktime);
 	if (child->p_estcpu > estcpu)
 		parent->p_estcpu =
 		    ESTCPULIM(parent->p_estcpu + child->p_estcpu - estcpu);
@@ -803,7 +705,6 @@
 
 SYSCTL_SETUP(sysctl_sched_setup, "sysctl kern.sched subtree setup")
 {
-
 	sysctl_createv(clog, 0, NULL, NULL,
 		CTLFLAG_PERMANENT,
 		CTLTYPE_NODE, "kern", NULL,
@@ -820,12 +721,6 @@
 		CTLTYPE_STRING, "name", NULL,
 		NULL, 0, __UNCONST("4.4BSD"), 0,
 		CTL_KERN, KERN_SCHED, CTL_CREATE, CTL_EOL);
-	sysctl_createv(clog, 0, NULL, NULL,
-		CTLFLAG_PERMANENT,
-		CTLTYPE_INT, "ccpu",
-		SYSCTL_DESCR("Scheduler exponential decay value"),
-		NULL, 0, &ccpu, 0,
-		CTL_KERN, KERN_SCHED, CTL_CREATE, CTL_EOL);
 }
 
 #if defined(DDB)
Index: sys/sched.h
===================================================================
RCS file: /cvsroot/src/sys/sys/sched.h,v
retrieving revision 1.30.2.13
diff -u -r1.30.2.13 sched.h
--- sys/sched.h	24 Mar 2007 16:50:26 -0000	1.30.2.13
+++ sys/sched.h	25 Mar 2007 12:53:09 -0000
@@ -193,10 +193,12 @@
 void sched_slept(struct lwp *);
 void sched_setrunnable(struct lwp *);	/* Scheduler-specific actions for setrunnable() */
 void sched_print_runqueue(void (*pr)(const char *, ...));	/* Print runqueues in DDB */
+inline void sched_pstats_hook(struct proc *, int, long);
 
 /* Functions common to all scheduler implementations */
 void sched_wakeup(volatile const void *);
 pri_t sched_kpri(struct lwp *);
+void sched_pstats(void *);
 
 inline void resched_cpu(struct lwp *); /* Arrange reschedule */
 void setrunnable(struct lwp *);
