Subject: Re: Revising CPU usage calculations
To: tech-kern@netbsd.org
From: Daniel Sieger <dsieger@TechFak.Uni-Bielefeld.DE>
List: tech-kern
Date: 03/25/2007 15:09:55
Hi all,
The attached patch handles CPU usage calculations in a
scheduler-independent manner. In short, here is what it does:
- Move the ccpu sysctl back to init_sysctl.c, as it is not
scheduler-dependent.
- Move the scheduler-independent parts of 4BSD's schedcpu() to
kern_synch.c.
- Add a scheduler-specific hook to satisfy individual schedulers'
needs (a minimal example is sketched below).
- Slightly reduce the locking burden.
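To make the second item concrete: the scheduler-independent part of
schedcpu() is essentially the p_pctcpu bookkeeping, where p_pctcpu is
decayed by ccpu = exp(-1/20) once per second so that about 95% of it
is gone after 60 seconds. A quick userland sketch of that fixed-point
arithmetic (this assumes FSHIFT is 11, as on most ports, and spells
fixpt_t as uint32_t):

#include <stdio.h>
#include <stdint.h>

#define FSHIFT  11                      /* assumed; see <sys/param.h> */
#define FSCALE  (1 << FSHIFT)

int
main(void)
{
        uint32_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
        uint32_t pctcpu = FSCALE;       /* start at "100%" */
        int i;

        /* one decay step per second, as in sched_pstats() */
        for (i = 0; i < 60; i++)
                pctcpu = (pctcpu * ccpu) >> FSHIFT;

        /* roughly 5% should remain after 60 seconds */
        printf("remaining: %.4f\n", (double)pctcpu / FSCALE);
        return 0;
}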
The patch adds two new functions to the scheduler API: sched_pstats()
(formerly schedcpu()) and sched_pstats_hook(). The naming is not
perfect, but unless someone has a better suggestion I'd like to stick
with it; IMHO it best describes what these functions actually do.
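To illustrate the interface: sched_pstats() does the generic
once-per-second work (run/sleep time accounting, p_pctcpu decay,
RLIMIT_CPU checks) and then calls sched_pstats_hook() for each
process, with both p_smutex and p_stmutex held. The 4BSD version in
the patch uses the hook for autonice handling and estcpu decay; a
hypothetical scheduler that keeps no decaying per-process estimates
could simply provide an empty hook:

/* ARGSUSED */
inline void
sched_pstats_hook(struct proc *p, int minslp, long runtm)
{

        /* Nothing to do: no per-process estimates to age. */
}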
Any comments/suggestions?
Regards,
Daniel
--
Daniel Sieger
Faculty of Technology
Bielefeld University
wwwhomes.uni-bielefeld.de/dsieger
Content-Disposition: attachment; filename="pstats.diff"
Index: kern/init_sysctl.c
===================================================================
RCS file: /cvsroot/src/sys/kern/init_sysctl.c,v
retrieving revision 1.96.2.4
diff -u -r1.96.2.4 init_sysctl.c
--- kern/init_sysctl.c 23 Mar 2007 15:56:07 -0000 1.96.2.4
+++ kern/init_sysctl.c 25 Mar 2007 12:53:08 -0000
@@ -328,6 +328,7 @@
SYSCTL_SETUP(sysctl_kern_setup, "sysctl kern subtree setup")
{
extern int kern_logsigexit; /* defined in kern/kern_sig.c */
+ extern fixpt_t ccpu; /* defined in kern/kern_synch.c */
extern int dumponpanic; /* defined in kern/subr_prf.c */
const struct sysctlnode *rnode;
@@ -643,6 +644,12 @@
CTL_KERN, KERN_FSCALE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
+ CTLTYPE_INT, "ccpu",
+ SYSCTL_DESCR("Scheduler exponential decay value"),
+ NULL, 0, &ccpu, 0,
+ CTL_KERN, KERN_CCPU, CTL_EOL);
+ sysctl_createv(clog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "cp_time",
SYSCTL_DESCR("Clock ticks spent in different CPU states"),
sysctl_kern_cptime, 0, NULL, 0,
Index: kern/kern_synch.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_synch.c,v
retrieving revision 1.177.2.20
diff -u -r1.177.2.20 kern_synch.c
--- kern/kern_synch.c 24 Mar 2007 17:13:14 -0000 1.177.2.20
+++ kern/kern_synch.c 25 Mar 2007 12:53:08 -0000
@@ -100,6 +100,9 @@
#include <uvm/uvm_extern.h>
+struct callout sched_pstats_ch = CALLOUT_INITIALIZER_SETFUNC(sched_pstats, NULL);
+unsigned int sched_pstats_ticks;
+
int lbolt; /* once a second sleep address */
static void sched_unsleep(struct lwp *);
@@ -777,3 +780,116 @@
return NULL;
}
+
+
+/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
+fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
+
+/*
+ * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
+ * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
+ * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
+ *
+ * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
+ * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
+ *
+ * If you don't want to bother with the faster/more-accurate formula, you
+ * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
+ * (more general) method of calculating the %age of CPU used by a process.
+ */
+#define CCPU_SHIFT (FSHIFT + 1)
+
+/*
+ * sched_pstats:
+ *
+ * Update process statistics and check CPU resource allocation.
+ * Call the scheduler-specific hook to adjust process/LWP
+ * priorities where necessary.
+ *
+ * XXXSMP This needs to be reorganised in order to reduce the locking
+ * burden.
+ */
+/* ARGSUSED */
+void
+sched_pstats(void *arg)
+{
+ struct rlimit *rlim;
+ struct lwp *l;
+ struct proc *p;
+ int minslp, sig, clkhz;
+ long runtm;
+
+ sched_pstats_ticks++;
+
+ mutex_enter(&proclist_mutex);
+ PROCLIST_FOREACH(p, &allproc) {
+ /*
+ * Increment time in/out of memory and sleep time (if
+ * sleeping). We ignore overflow; with 16-bit int's
+ * (remember them?) overflow takes 45 days.
+ */
+ minslp = 2;
+ mutex_enter(&p->p_smutex);
+ runtm = p->p_rtime.tv_sec;
+ LIST_FOREACH(l, &p->p_lwps, l_sibling) {
+ if ((l->l_flag & LW_IDLE) != 0)
+ continue;
+ lwp_lock(l);
+ runtm += l->l_rtime.tv_sec;
+ l->l_swtime++;
+ if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
+ l->l_stat == LSSUSPENDED) {
+ l->l_slptime++;
+ minslp = min(minslp, l->l_slptime);
+ } else
+ minslp = 0;
+ lwp_unlock(l);
+ }
+
+ /*
+ * Check if the process exceeds its CPU resource allocation.
+ * If over max, kill it.
+ */
+ rlim = &p->p_rlimit[RLIMIT_CPU];
+ sig = 0;
+ if (runtm >= rlim->rlim_cur) {
+ if (runtm >= rlim->rlim_max)
+ sig = SIGKILL;
+ else {
+ sig = SIGXCPU;
+ if (rlim->rlim_cur < rlim->rlim_max)
+ rlim->rlim_cur += 5;
+ }
+ }
+
+ mutex_spin_enter(&p->p_stmutex);
+ if (minslp < 1) {
+ /*
+ * p_pctcpu is only for ps.
+ */
+ p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
+ clkhz = stathz != 0 ? stathz : hz;
+#if (FSHIFT >= CCPU_SHIFT)
+ p->p_pctcpu += (clkhz == 100)?
+ ((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
+ 100 * (((fixpt_t) p->p_cpticks)
+ << (FSHIFT - CCPU_SHIFT)) / clkhz;
+#else
+ p->p_pctcpu += ((FSCALE - ccpu) *
+ (p->p_cpticks * FSCALE / clkhz)) >> FSHIFT;
+#endif
+ p->p_cpticks = 0;
+ }
+
+ sched_pstats_hook(p, minslp, runtm);
+ mutex_spin_exit(&p->p_stmutex);
+ mutex_exit(&p->p_smutex);
+ if (sig) {
+ psignal(p, sig);
+ }
+ }
+ mutex_exit(&proclist_mutex);
+ uvm_meter();
+ wakeup(&lbolt);
+ callout_schedule(&sched_pstats_ch, hz);
+}
Index: kern/sched_4bsd.c
===================================================================
RCS file: /cvsroot/src/sys/kern/Attic/sched_4bsd.c,v
retrieving revision 1.1.2.22
diff -u -r1.1.2.22 sched_4bsd.c
--- kern/sched_4bsd.c 24 Mar 2007 16:50:26 -0000 1.1.2.22
+++ kern/sched_4bsd.c 25 Mar 2007 12:53:09 -0000
@@ -120,13 +120,11 @@
} runqueue_t;
static runqueue_t global_queue;
-static void schedcpu(void *);
static void updatepri(struct lwp *);
static void resetpriority(struct lwp *);
static void resetprocpriority(struct proc *);
-struct callout schedcpu_ch = CALLOUT_INITIALIZER_SETFUNC(schedcpu, NULL);
-static unsigned int schedcpu_ticks;
+extern unsigned int sched_pstats_ticks; /* defined in kern_synch.c */
/* The global scheduler state */
kmutex_t sched_mutex;
@@ -275,141 +273,45 @@
return estcpu;
}
-/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
-fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
-
-/*
- * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
- * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
- * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
- *
- * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
- * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
- *
- * If you dont want to bother with the faster/more-accurate formula, you
- * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
- * (more general) method of calculating the %age of CPU used by a process.
- */
-#define CCPU_SHIFT 11
-
/*
- * schedcpu:
+ * sched_pstats_hook:
*
- * Recompute process priorities, every hz ticks.
- *
- * XXXSMP This needs to be reorganised in order to reduce the locking
- * burden.
+ * Periodically called from sched_pstats(). Used for autonice
+ * and priority recalculation.
*/
-/* ARGSUSED */
-static void
-schedcpu(void *arg)
+inline void
+sched_pstats_hook(struct proc *p, int minslp, long runtm)
{
- fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
- struct rlimit *rlim;
struct lwp *l;
- struct proc *p;
- int minslp, clkhz, sig;
- long runtm;
-
- schedcpu_ticks++;
-
- mutex_enter(&proclist_mutex);
- PROCLIST_FOREACH(p, &allproc) {
- /*
- * Increment time in/out of memory and sleep time (if
- * sleeping). We ignore overflow; with 16-bit int's
- * (remember them?) overflow takes 45 days.
- */
- minslp = 2;
- mutex_enter(&p->p_smutex);
- runtm = p->p_rtime.tv_sec;
+ fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
+
+ /*
+ * If the process has run for more than autonicetime, reduce
+ * priority to give others a chance.
+ */
+ if (autonicetime && runtm > autonicetime && p->p_nice == NZERO
+ && kauth_cred_geteuid(p->p_cred)) {
+ p->p_nice = autoniceval + NZERO;
+ resetprocpriority(p);
+ }
+
+ /*
+ * If the process has slept the entire second,
+ * stop recalculating its priority until it wakes up.
+ */
+ if (minslp <= 1) {
+ p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
+
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
if ((l->l_flag & LW_IDLE) != 0)
continue;
lwp_lock(l);
- runtm += l->l_rtime.tv_sec;
- l->l_swtime++;
- if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
- l->l_stat == LSSUSPENDED) {
- l->l_slptime++;
- minslp = min(minslp, l->l_slptime);
- } else
- minslp = 0;
+ if (l->l_slptime <= 1 &&
+ l->l_priority >= PUSER)
+ resetpriority(l);
lwp_unlock(l);
}
- p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
-
- /*
- * Check if the process exceeds its CPU resource allocation.
- * If over max, kill it.
- */
- rlim = &p->p_rlimit[RLIMIT_CPU];
- sig = 0;
- if (runtm >= rlim->rlim_cur) {
- if (runtm >= rlim->rlim_max)
- sig = SIGKILL;
- else {
- sig = SIGXCPU;
- if (rlim->rlim_cur < rlim->rlim_max)
- rlim->rlim_cur += 5;
- }
- }
-
- /*
- * If the process has run for more than autonicetime, reduce
- * priority to give others a chance.
- */
- if (autonicetime && runtm > autonicetime && p->p_nice == NZERO
- && kauth_cred_geteuid(p->p_cred)) {
- mutex_spin_enter(&p->p_stmutex);
- p->p_nice = autoniceval + NZERO;
- resetprocpriority(p);
- mutex_spin_exit(&p->p_stmutex);
- }
-
- /*
- * If the process has slept the entire second,
- * stop recalculating its priority until it wakes up.
- */
- if (minslp <= 1) {
- /*
- * p_pctcpu is only for ps.
- */
- mutex_spin_enter(&p->p_stmutex);
- clkhz = stathz != 0 ? stathz : hz;
-#if (FSHIFT >= CCPU_SHIFT)
- p->p_pctcpu += (clkhz == 100)?
- ((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
- 100 * (((fixpt_t) p->p_cpticks)
- << (FSHIFT - CCPU_SHIFT)) / clkhz;
-#else
- p->p_pctcpu += ((FSCALE - ccpu) *
- (p->p_cpticks * FSCALE / clkhz)) >> FSHIFT;
-#endif
- p->p_cpticks = 0;
- p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
-
- LIST_FOREACH(l, &p->p_lwps, l_sibling) {
- if ((l->l_flag & LW_IDLE) != 0)
- continue;
- lwp_lock(l);
- if (l->l_slptime <= 1 &&
- l->l_priority >= PUSER)
- resetpriority(l);
- lwp_unlock(l);
- }
- mutex_spin_exit(&p->p_stmutex);
- }
-
- mutex_exit(&p->p_smutex);
- if (sig) {
- psignal(p, sig);
- }
}
- mutex_exit(&proclist_mutex);
- uvm_meter();
- wakeup(&lbolt);
- callout_schedule(&schedcpu_ch, hz);
}
/*
@@ -426,7 +328,7 @@
loadfac = loadfactor(averunnable.ldavg[0]);
- l->l_slptime--; /* the first time was done in schedcpu */
+ l->l_slptime--; /* the first time was done in sched_pstats */
/* XXX NJWLWP */
/* XXXSMP occasionally unlocked, should be per-LWP */
p->p_estcpu = decay_cpu_batch(loadfac, p->p_estcpu, l->l_slptime);
@@ -619,7 +521,7 @@
{
rrticks = hz / 10;
- schedcpu(NULL);
+ sched_pstats(NULL);
}
void
@@ -728,7 +630,7 @@
LOCK_ASSERT(mutex_owned(&parent->p_smutex));
child->p_estcpu = child->p_estcpu_inherited = parent->p_estcpu;
- child->p_forktime = schedcpu_ticks;
+ child->p_forktime = sched_pstats_ticks;
}
/*
@@ -746,7 +648,7 @@
mutex_spin_enter(&parent->p_stmutex);
estcpu = decay_cpu_batch(loadfac, child->p_estcpu_inherited,
- schedcpu_ticks - child->p_forktime);
+ sched_pstats_ticks - child->p_forktime);
if (child->p_estcpu > estcpu)
parent->p_estcpu =
ESTCPULIM(parent->p_estcpu + child->p_estcpu - estcpu);
@@ -803,7 +705,6 @@
SYSCTL_SETUP(sysctl_sched_setup, "sysctl kern.sched subtree setup")
{
-
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "kern", NULL,
@@ -820,12 +721,6 @@
CTLTYPE_STRING, "name", NULL,
NULL, 0, __UNCONST("4.4BSD"), 0,
CTL_KERN, KERN_SCHED, CTL_CREATE, CTL_EOL);
- sysctl_createv(clog, 0, NULL, NULL,
- CTLFLAG_PERMANENT,
- CTLTYPE_INT, "ccpu",
- SYSCTL_DESCR("Scheduler exponential decay value"),
- NULL, 0, &ccpu, 0,
- CTL_KERN, KERN_SCHED, CTL_CREATE, CTL_EOL);
}
#if defined(DDB)
Index: sys/sched.h
===================================================================
RCS file: /cvsroot/src/sys/sys/sched.h,v
retrieving revision 1.30.2.13
diff -u -r1.30.2.13 sched.h
--- sys/sched.h 24 Mar 2007 16:50:26 -0000 1.30.2.13
+++ sys/sched.h 25 Mar 2007 12:53:09 -0000
@@ -193,10 +193,12 @@
void sched_slept(struct lwp *);
void sched_setrunnable(struct lwp *); /* Scheduler-specific actions for setrunnable() */
void sched_print_runqueue(void (*pr)(const char *, ...)); /* Print runqueues in DDB */
+inline void sched_pstats_hook(struct proc *, int, long);
/* Functions common to all scheduler implementations */
void sched_wakeup(volatile const void *);
pri_t sched_kpri(struct lwp *);
+void sched_pstats(void *arg);
inline void resched_cpu(struct lwp *); /* Arrange reschedule */
void setrunnable(struct lwp *);
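With the patch applied, ccpu shows up as kern.ccpu again and can be
checked with sysctl(8) ("sysctl kern.ccpu"), or programmatically; a
small userland sketch using the existing KERN_CCPU MIB name:

#include <sys/param.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
        fixpt_t ccpu;
        size_t len = sizeof(ccpu);
        int mib[2] = { CTL_KERN, KERN_CCPU };

        if (sysctl(mib, 2, &ccpu, &len, NULL, 0) == -1) {
                perror("sysctl");
                return 1;
        }
        printf("kern.ccpu = %u (%.6f * FSCALE)\n",
            (unsigned int)ccpu, (double)ccpu / FSCALE);
        return 0;
}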