Subject: Scheduler API changes for yamt-idlelwp
To: None <tech-kern@netbsd.org>
From: Daniel Sieger <dsieger@TechFak.Uni-Bielefeld.DE>
List: tech-kern
Date: 02/17/2007 22:17:36
Hi folks,
Here's a proposed patch to introduce a new scheduler API in
Yamamoto-san's idle lwp branch. It is a slightly modified/improved
version of the patch I sent a couple of weeks ago (see [1]).
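For a quick overview, the proposed common scheduler interface boils
down to the entry points below (signatures copied from the sys/sched.h
hunk in the attached diff; the grouping comments are mine):

	/* Scheduler initialization */
	void sched_rqinit(void);		/* initialize the run queues */
	void sched_setup(void);			/* kick off timeout-driven events */

	/* Main scheduler functions */
	void sched_tick(struct cpu_info *);	/* periodic resched check */

	/* Runqueue handling */
	boolean_t sched_curcpu_runnable_p(void);
	struct lwp *sched_nextlwp(void);	/* pick the next LWP to run */
	void sched_enqueue(struct lwp *);	/* put an LWP on its runqueue */
	void sched_dequeue(struct lwp *);	/* take it off again */

	/* Priority adjustment */
	void sched_nice(struct proc *, int);
	void sched_changepri(struct lwp *, int);

	/* Fork/exit hooks and setrunnable() actions */
	void sched_proc_fork(struct proc *, struct proc *);
	void sched_proc_exit(struct proc *, struct proc *);
	void sched_setrunnable(struct lwp *);

The active implementation is selected at kernel configuration time via
"options SCHED_4BSD" (see the GENERIC and conf/files hunks).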
Note: schedcpu() will need some further work, as some parts of it are
scheduler-dependent and others are not. For example, schedcpu()
currently takes care of updating both p->p_estcpu and p->p_pctcpu.
While the former is highly specific to the 4BSD scheduler, the latter
is a POSIX requirement. Simply splitting schedcpu() into two distinct
functions is not an option, since it would have a negative impact on
performance. Backwards compatibility is also an issue here: since we
can't simply get rid of p_estcpu, we may need to retain schedcpu()
even for other schedulers, which is not very nice.
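To make the performance problem concrete, here is a rough sketch of
what such a split would look like (hypothetical function names, not
part of the patch; the two statements are taken verbatim from
schedcpu() as it stands). The POSIX part is essentially

	/* scheduler-independent: decay p_pctcpu, as required by
	   POSIX and reported by ps(1) */
	static void
	pctcpu_decay(struct proc *p)
	{
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
	}

while the 4BSD-specific part is the CPU usage estimator decay:

	/* 4BSD-specific: decay the CPU usage estimator */
	static void
	estcpu_decay(struct proc *p, fixpt_t loadfac)
	{
		p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
	}

The trouble is that both halves have to traverse allproc and take the
same per-process locks, so running them as two separate per-second
callouts roughly doubles the locking overhead compared to doing both
in a single pass.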
Comments, suggestions?
Regards,
Daniel
[1]: http://mail-index.netbsd.org/tech-kern/2007/01/14/0006.html
--
Daniel Sieger
Faculty of Technology
Bielefeld University
wwwhomes.uni-bielefeld.de/dsieger
[Attachment: csf.diff]
Index: arch/i386/conf/GENERIC
===================================================================
RCS file: /cvsroot/src/sys/arch/i386/conf/GENERIC,v
retrieving revision 1.808
diff -u -r1.808 GENERIC
--- arch/i386/conf/GENERIC 17 Feb 2007 00:28:23 -0000 1.808
+++ arch/i386/conf/GENERIC 17 Feb 2007 20:38:31 -0000
@@ -119,6 +119,9 @@
#options BUFQ_READPRIO
#options BUFQ_PRIOCSCAN
+# Scheduling algorithm
+options SCHED_4BSD
+
# Diagnostic/debugging support options
#options DIAGNOSTIC # expensive kernel consistency checks
#options DEBUG # expensive debugging checks/support
Index: conf/files
===================================================================
RCS file: /cvsroot/src/sys/conf/files,v
retrieving revision 1.831.2.1
diff -u -r1.831.2.1 files
--- conf/files 17 Feb 2007 10:30:49 -0000 1.831.2.1
+++ conf/files 17 Feb 2007 20:38:15 -0000
@@ -36,6 +36,8 @@
defflag BUFQ_READPRIO
defflag NEW_BUFQ_STRATEGY # same as BUFQ_READPRIO
+defflag opt_sched.h SCHED_4BSD
+
defparam SOMAXKVA
defflag opt_sock_counters.h SOSEND_COUNTERS
defflag opt_sosend_loan.h SOSEND_NO_LOAN
@@ -1318,6 +1320,7 @@
file kern/kern_uuid.c
file kern/kern_xxx.c
file kern/kgdb_stub.c kgdb
+file kern/sched_4bsd.c sched_4bsd
file kern/subr_autoconf.c
file kern/subr_blist.c vmswap
file kern/subr_bufq.c
Index: kern/init_main.c
===================================================================
RCS file: /cvsroot/src/sys/kern/init_main.c,v
retrieving revision 1.294.2.1
diff -u -r1.294.2.1 init_main.c
--- kern/init_main.c 17 Feb 2007 10:30:51 -0000 1.294.2.1
+++ kern/init_main.c 17 Feb 2007 20:38:16 -0000
@@ -120,6 +120,7 @@
#include <sys/sysctl.h>
#include <sys/event.h>
#include <sys/mbuf.h>
+#include <sys/sched.h>
#include <sys/sleepq.h>
#include <sys/iostat.h>
#ifdef FAST_IPSEC
@@ -242,7 +243,6 @@
struct pdevinit *pdev;
int s, error;
extern struct pdevinit pdevinit[];
- extern void schedcpu(void *);
#ifdef NVNODE_IMPLICIT
int usevnodes;
#endif
@@ -339,7 +339,7 @@
(void)chgproccnt(0, 1);
/* Initialize the run queues, turnstiles and sleep queues. */
- rqinit();
+ sched_rqinit();
turnstile_init();
sleeptab_init(&sleeptab);
@@ -467,8 +467,8 @@
/* Initialize system accouting. */
acct_init();
- /* Kick off timeout driven events by calling first time. */
- schedcpu(NULL);
+	/* Set up the scheduler */
+ sched_setup();
#ifdef KTRACE
/* Initialize ktrace. */
Index: kern/kern_clock.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_clock.c,v
retrieving revision 1.106.2.1
diff -u -r1.106.2.1 kern_clock.c
--- kern/kern_clock.c 17 Feb 2007 10:30:53 -0000 1.106.2.1
+++ kern/kern_clock.c 17 Feb 2007 20:38:19 -0000
@@ -331,7 +331,9 @@
int schedhz;
int profprocs;
int hardclock_ticks;
+#ifdef SCHED_4BSD
static int statscheddiv; /* stat => sched divider (used if schedhz == 0) */
+#endif
static int psdiv; /* prof => stat divider */
int psratio; /* ratio: prof / stat */
#ifndef __HAVE_TIMECOUNTER
@@ -404,19 +406,20 @@
cpu_initclocks();
/*
- * Compute profhz/stathz/rrticks, and fix profhz if needed.
+ * Compute profhz and stathz, fix profhz if needed.
*/
i = stathz ? stathz : hz;
if (profhz == 0)
profhz = i;
psratio = profhz / i;
- rrticks = hz / 10;
+#ifdef SCHED_4BSD
if (schedhz == 0) {
/* 16Hz is best */
statscheddiv = i / 16;
if (statscheddiv <= 0)
panic("statscheddiv");
}
+#endif /* SCHED_4BSD */
#ifndef __HAVE_TIMECOUNTER
#ifdef NTP
@@ -533,8 +536,8 @@
*/
if (stathz == 0)
statclock(frame);
- if ((--ci->ci_schedstate.spc_rrticks) <= 0)
- roundrobin(ci);
+ if ((--ci->ci_schedstate.spc_ticks) <= 0)
+ sched_tick(ci);
#if defined(MULTIPROCESSOR)
/*
@@ -1243,6 +1246,7 @@
++p->p_cpticks;
mutex_spin_exit(&p->p_stmutex);
+#ifdef SCHED_4BSD
/*
* If no separate schedclock is provided, call it here
* at about 16 Hz.
@@ -1253,6 +1257,7 @@
ci->ci_schedstate.spc_schedticks = statscheddiv;
}
}
+#endif /* SCHED_4BSD */
}
#ifndef __HAVE_TIMECOUNTER
Index: kern/kern_exit.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_exit.c,v
retrieving revision 1.166.2.1
diff -u -r1.166.2.1 kern_exit.c
--- kern/kern_exit.c 17 Feb 2007 10:30:54 -0000 1.166.2.1
+++ kern/kern_exit.c 17 Feb 2007 20:38:21 -0000
@@ -892,7 +892,7 @@
leavepgrp(p);
parent = p->p_pptr;
- scheduler_wait_hook(parent, p);
+ sched_proc_exit(parent, p);
ruadd(&parent->p_stats->p_cru, p->p_ru);
p->p_xstat = 0;
Index: kern/kern_fork.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_fork.c,v
retrieving revision 1.131.2.1
diff -u -r1.131.2.1 kern_fork.c
--- kern/kern_fork.c 17 Feb 2007 10:30:55 -0000 1.131.2.1
+++ kern/kern_fork.c 17 Feb 2007 20:38:21 -0000
@@ -397,7 +397,7 @@
p2->p_sigacts = sigactsinit(p1, flags & FORK_SHARESIGS);
p2->p_sflag |=
(p1->p_sflag & (PS_STOPFORK | PS_STOPEXEC | PS_NOCLDSTOP));
- scheduler_fork_hook(p1, p2);
+ sched_proc_fork(p1, p2);
mutex_exit(&p1->p_smutex);
p2->p_stflag = p1->p_stflag;
@@ -520,7 +520,7 @@
lwp_lock(l2);
l2->l_stat = LSRUN;
l2->l_flag |= tmp;
- setrunqueue(l2);
+ sched_enqueue(l2);
lwp_unlock(l2);
}
Index: kern/kern_resource.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_resource.c,v
retrieving revision 1.113
diff -u -r1.113 kern_resource.c
--- kern/kern_resource.c 9 Feb 2007 21:55:31 -0000 1.113
+++ kern/kern_resource.c 17 Feb 2007 20:38:22 -0000
@@ -230,8 +230,7 @@
mutex_spin_exit(&chgp->p_stmutex);
goto again;
}
- chgp->p_nice = n;
- (void)resetprocpriority(chgp);
+ sched_nice(chgp, n);
mutex_spin_exit(&chgp->p_stmutex);
return (0);
}
Index: kern/kern_sleepq.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_sleepq.c,v
retrieving revision 1.4
diff -u -r1.4 kern_sleepq.c
--- kern/kern_sleepq.c 15 Feb 2007 20:21:13 -0000 1.4
+++ kern/kern_sleepq.c 17 Feb 2007 20:38:23 -0000
@@ -171,12 +171,11 @@
* Set it running. We'll try to get the last CPU that ran
* this LWP to pick it up again.
*/
- if (l->l_slptime > 1)
- updatepri(l);
+ sched_setrunnable(l);
l->l_stat = LSRUN;
l->l_slptime = 0;
if ((l->l_flag & L_INMEM) != 0) {
- setrunqueue(l);
+ sched_enqueue(l);
if (l->l_priority < ci->ci_schedstate.spc_curpriority)
cpu_need_resched(ci);
sched_unlock(1);
Index: kern/kern_synch.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_synch.c,v
retrieving revision 1.177.2.3
diff -u -r1.177.2.3 kern_synch.c
--- kern/kern_synch.c 17 Feb 2007 11:13:51 -0000 1.177.2.3
+++ kern/kern_synch.c 17 Feb 2007 20:38:23 -0000
@@ -6,7 +6,8 @@
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
- * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
+ * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
+ * Daniel Sieger.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -105,23 +106,13 @@
#include <machine/cpu.h>
int lbolt; /* once a second sleep address */
-int rrticks; /* number of hardclock ticks per roundrobin() */
/*
* The global scheduler state.
*/
kmutex_t sched_mutex; /* global sched state mutex */
-struct prochd sched_qs[RUNQUE_NQS]; /* run queues */
-volatile uint32_t sched_whichqs; /* bitmap of non-empty queues */
-
-void schedcpu(void *);
-void updatepri(struct lwp *);
void sched_unsleep(struct lwp *);
-void sched_changepri(struct lwp *, int);
-
-struct callout schedcpu_ch = CALLOUT_INITIALIZER_SETFUNC(schedcpu, NULL);
-static unsigned int schedcpu_ticks;
syncobj_t sleep_syncobj = {
SOBJ_SLEEPQ_SORTED,
@@ -136,306 +127,6 @@
};
/*
- * Force switch among equal priority processes every 100ms.
- * Called from hardclock every hz/10 == rrticks hardclock ticks.
- */
-/* ARGSUSED */
-void
-roundrobin(struct cpu_info *ci)
-{
- struct schedstate_percpu *spc = &ci->ci_schedstate;
-
- spc->spc_rrticks = rrticks;
-
- if (!CURCPU_IDLE_P()) {
- if (spc->spc_flags & SPCF_SEENRR) {
- /*
- * The process has already been through a roundrobin
- * without switching and may be hogging the CPU.
- * Indicate that the process should yield.
- */
- spc->spc_flags |= SPCF_SHOULDYIELD;
- } else
- spc->spc_flags |= SPCF_SEENRR;
- }
- cpu_need_resched(curcpu());
-}
-
-#define PPQ (128 / RUNQUE_NQS) /* priorities per queue */
-#define NICE_WEIGHT 2 /* priorities per nice level */
-
-#define ESTCPU_SHIFT 11
-#define ESTCPU_MAX ((NICE_WEIGHT * PRIO_MAX - PPQ) << ESTCPU_SHIFT)
-#define ESTCPULIM(e) min((e), ESTCPU_MAX)
-
-/*
- * Constants for digital decay and forget:
- * 90% of (p_estcpu) usage in 5 * loadav time
- * 95% of (p_pctcpu) usage in 60 seconds (load insensitive)
- * Note that, as ps(1) mentions, this can let percentages
- * total over 100% (I've seen 137.9% for 3 processes).
- *
- * Note that hardclock updates p_estcpu and p_cpticks independently.
- *
- * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
- * That is, the system wants to compute a value of decay such
- * that the following for loop:
- * for (i = 0; i < (5 * loadavg); i++)
- * p_estcpu *= decay;
- * will compute
- * p_estcpu *= 0.1;
- * for all values of loadavg:
- *
- * Mathematically this loop can be expressed by saying:
- * decay ** (5 * loadavg) ~= .1
- *
- * The system computes decay as:
- * decay = (2 * loadavg) / (2 * loadavg + 1)
- *
- * We wish to prove that the system's computation of decay
- * will always fulfill the equation:
- * decay ** (5 * loadavg) ~= .1
- *
- * If we compute b as:
- * b = 2 * loadavg
- * then
- * decay = b / (b + 1)
- *
- * We now need to prove two things:
- * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
- * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
- *
- * Facts:
- * For x close to zero, exp(x) =~ 1 + x, since
- * exp(x) = 0! + x**1/1! + x**2/2! + ... .
- * therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
- * For x close to zero, ln(1+x) =~ x, since
- * ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1
- * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
- * ln(.1) =~ -2.30
- *
- * Proof of (1):
- * Solve (factor)**(power) =~ .1 given power (5*loadav):
- * solving for factor,
- * ln(factor) =~ (-2.30/5*loadav), or
- * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
- * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED
- *
- * Proof of (2):
- * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
- * solving for power,
- * power*ln(b/(b+1)) =~ -2.30, or
- * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. QED
- *
- * Actual power values for the implemented algorithm are as follows:
- * loadav: 1 2 3 4
- * power: 5.68 10.32 14.94 19.55
- */
-
-/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
-#define loadfactor(loadav) (2 * (loadav))
-
-static fixpt_t
-decay_cpu(fixpt_t loadfac, fixpt_t estcpu)
-{
-
- if (estcpu == 0) {
- return 0;
- }
-
-#if !defined(_LP64)
- /* avoid 64bit arithmetics. */
-#define FIXPT_MAX ((fixpt_t)((UINTMAX_C(1) << sizeof(fixpt_t) * CHAR_BIT) - 1))
- if (__predict_true(loadfac <= FIXPT_MAX / ESTCPU_MAX)) {
- return estcpu * loadfac / (loadfac + FSCALE);
- }
-#endif /* !defined(_LP64) */
-
- return (uint64_t)estcpu * loadfac / (loadfac + FSCALE);
-}
-
-/*
- * For all load averages >= 1 and max p_estcpu of (255 << ESTCPU_SHIFT),
- * sleeping for at least seven times the loadfactor will decay p_estcpu to
- * less than (1 << ESTCPU_SHIFT).
- *
- * note that our ESTCPU_MAX is actually much smaller than (255 << ESTCPU_SHIFT).
- */
-static fixpt_t
-decay_cpu_batch(fixpt_t loadfac, fixpt_t estcpu, unsigned int n)
-{
-
- if ((n << FSHIFT) >= 7 * loadfac) {
- return 0;
- }
-
- while (estcpu != 0 && n > 1) {
- estcpu = decay_cpu(loadfac, estcpu);
- n--;
- }
-
- return estcpu;
-}
-
-/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
-fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
-
-/*
- * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
- * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
- * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
- *
- * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
- * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
- *
- * If you dont want to bother with the faster/more-accurate formula, you
- * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
- * (more general) method of calculating the %age of CPU used by a process.
- */
-#define CCPU_SHIFT 11
-
-/*
- * schedcpu:
- *
- * Recompute process priorities, every hz ticks.
- *
- * XXXSMP This needs to be reorganised in order to reduce the locking
- * burden.
- */
-/* ARGSUSED */
-void
-schedcpu(void *arg)
-{
- fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
- struct rlimit *rlim;
- struct lwp *l;
- struct proc *p;
- int minslp, clkhz, sig;
- long runtm;
-
- schedcpu_ticks++;
-
- mutex_enter(&proclist_mutex);
- PROCLIST_FOREACH(p, &allproc) {
- /*
- * Increment time in/out of memory and sleep time (if
- * sleeping). We ignore overflow; with 16-bit int's
- * (remember them?) overflow takes 45 days.
- */
- minslp = 2;
- mutex_enter(&p->p_smutex);
- runtm = p->p_rtime.tv_sec;
- LIST_FOREACH(l, &p->p_lwps, l_sibling) {
- if ((l->l_flag & L_IDLE) != 0)
- continue;
- lwp_lock(l);
- runtm += l->l_rtime.tv_sec;
- l->l_swtime++;
- if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
- l->l_stat == LSSUSPENDED) {
- l->l_slptime++;
- minslp = min(minslp, l->l_slptime);
- } else
- minslp = 0;
- lwp_unlock(l);
- }
- p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
-
- /*
- * Check if the process exceeds its CPU resource allocation.
- * If over max, kill it.
- */
- rlim = &p->p_rlimit[RLIMIT_CPU];
- sig = 0;
- if (runtm >= rlim->rlim_cur) {
- if (runtm >= rlim->rlim_max)
- sig = SIGKILL;
- else {
- sig = SIGXCPU;
- if (rlim->rlim_cur < rlim->rlim_max)
- rlim->rlim_cur += 5;
- }
- }
-
- /*
- * If the process has run for more than autonicetime, reduce
- * priority to give others a chance.
- */
- if (autonicetime && runtm > autonicetime && p->p_nice == NZERO
- && kauth_cred_geteuid(p->p_cred)) {
- mutex_spin_enter(&p->p_stmutex);
- p->p_nice = autoniceval + NZERO;
- resetprocpriority(p);
- mutex_spin_exit(&p->p_stmutex);
- }
-
- /*
- * If the process has slept the entire second,
- * stop recalculating its priority until it wakes up.
- */
- if (minslp <= 1) {
- /*
- * p_pctcpu is only for ps.
- */
- mutex_spin_enter(&p->p_stmutex);
- clkhz = stathz != 0 ? stathz : hz;
-#if (FSHIFT >= CCPU_SHIFT)
- p->p_pctcpu += (clkhz == 100)?
- ((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
- 100 * (((fixpt_t) p->p_cpticks)
- << (FSHIFT - CCPU_SHIFT)) / clkhz;
-#else
- p->p_pctcpu += ((FSCALE - ccpu) *
- (p->p_cpticks * FSCALE / clkhz)) >> FSHIFT;
-#endif
- p->p_cpticks = 0;
- p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
-
- LIST_FOREACH(l, &p->p_lwps, l_sibling) {
- if ((l->l_flag & L_IDLE) != 0)
- continue;
- lwp_lock(l);
- if (l->l_slptime <= 1 &&
- l->l_priority >= PUSER)
- resetpriority(l);
- lwp_unlock(l);
- }
- mutex_spin_exit(&p->p_stmutex);
- }
-
- mutex_exit(&p->p_smutex);
- if (sig) {
- psignal(p, sig);
- }
- }
- mutex_exit(&proclist_mutex);
- uvm_meter();
- wakeup((caddr_t)&lbolt);
- callout_schedule(&schedcpu_ch, hz);
-}
-
-/*
- * Recalculate the priority of a process after it has slept for a while.
- */
-void
-updatepri(struct lwp *l)
-{
- struct proc *p = l->l_proc;
- fixpt_t loadfac;
-
- LOCK_ASSERT(lwp_locked(l, NULL));
- KASSERT(l->l_slptime > 1);
-
- loadfac = loadfactor(averunnable.ldavg[0]);
-
- l->l_slptime--; /* the first time was done in schedcpu */
- /* XXX NJWLWP */
- /* XXXSMP occasionally unlocked, should be per-LWP */
- p->p_estcpu = decay_cpu_batch(loadfac, p->p_estcpu, l->l_slptime);
- resetpriority(l);
-}
-
-/*
* During autoconfiguration or after a panic, a sleep will simply lower the
* priority briefly to allow interrupts, then return. The priority to be
* used (safepri) is machine-dependent, thus this value is initialized and
@@ -703,7 +394,7 @@
KASSERT(lwp_locked(l, &sched_mutex));
l->l_stat = LSRUN;
if ((l->l_flag & L_IDLE) == 0) {
- setrunqueue(l);
+ sched_enqueue(l);
}
}
uvmexp.swtch++;
@@ -732,11 +423,11 @@
#endif
if (newl == NULL) {
- newl = nextrunqueue();
+ newl = sched_nextlwp();
}
if (newl != NULL) {
KASSERT(lwp_locked(newl, &sched_mutex));
- remrunqueue(newl);
+ sched_dequeue(newl);
} else {
newl = l->l_cpu->ci_data.cpu_idlelwp;
KASSERT(newl != NULL);
@@ -790,54 +481,6 @@
}
/*
- * Initialize the (doubly-linked) run queues
- * to be empty.
- */
-void
-rqinit()
-{
- int i;
-
- for (i = 0; i < RUNQUE_NQS; i++)
- sched_qs[i].ph_link = sched_qs[i].ph_rlink =
- (struct lwp *)&sched_qs[i];
-
- mutex_init(&sched_mutex, MUTEX_SPIN, IPL_SCHED);
-}
-
-static inline void
-resched_lwp(struct lwp *l, u_char pri)
-{
- struct cpu_info *ci;
-
- /*
- * XXXSMP
- * Since l->l_cpu persists across a context switch,
- * this gives us *very weak* processor affinity, in
- * that we notify the CPU on which the process last
- * ran that it should try to switch.
- *
- * This does not guarantee that the process will run on
- * that processor next, because another processor might
- * grab it the next time it performs a context switch.
- *
- * This also does not handle the case where its last
- * CPU is running a higher-priority process, but every
- * other CPU is running a lower-priority process. There
- * are ways to handle this situation, but they're not
- * currently very pretty, and we also need to weigh the
- * cost of moving a process from one CPU to another.
- *
- * XXXSMP
- * There is also the issue of locking the other CPU's
- * sched state, which we currently do not do.
- */
- ci = (l->l_cpu != NULL) ? l->l_cpu : curcpu();
- if (pri < ci->ci_schedstate.spc_curpriority)
- cpu_need_resched(ci);
-}
-
-/*
* Change process state to be runnable, placing it on the run queue if it is
* in memory, and awakening the swapper if it isn't in memory.
*
@@ -920,14 +563,13 @@
* Set the LWP runnable. If it's swapped out, we need to wake the swapper
* to bring it back in. Otherwise, enter it into a run queue.
*/
- if (l->l_slptime > 1)
- updatepri(l);
+ sched_setrunnable(l);
l->l_stat = LSRUN;
l->l_slptime = 0;
if (l->l_flag & L_INMEM) {
- setrunqueue(l);
- resched_lwp(l, l->l_priority);
+ sched_enqueue(l);
+ resched_cpu(l, l->l_priority);
lwp_unlock(l);
} else {
lwp_unlock(l);
@@ -935,84 +577,6 @@
}
}
-boolean_t
-sched_curcpu_runnable_p(void)
-{
-
- return sched_whichqs != 0;
-}
-
-/*
- * Compute the priority of a process when running in user mode.
- * Arrange to reschedule if the resulting priority is better
- * than that of the current process.
- */
-void
-resetpriority(struct lwp *l)
-{
- unsigned int newpriority;
- struct proc *p = l->l_proc;
-
- /* XXXSMP LOCK_ASSERT(mutex_owned(&p->p_stmutex)); */
- LOCK_ASSERT(lwp_locked(l, NULL));
-
- if ((l->l_flag & L_SYSTEM) != 0)
- return;
-
- newpriority = PUSER + (p->p_estcpu >> ESTCPU_SHIFT) +
- NICE_WEIGHT * (p->p_nice - NZERO);
- newpriority = min(newpriority, MAXPRI);
- lwp_changepri(l, newpriority);
-}
-
-/*
- * Recompute priority for all LWPs in a process.
- */
-void
-resetprocpriority(struct proc *p)
-{
- struct lwp *l;
-
- LOCK_ASSERT(mutex_owned(&p->p_stmutex));
-
- LIST_FOREACH(l, &p->p_lwps, l_sibling) {
- lwp_lock(l);
- resetpriority(l);
- lwp_unlock(l);
- }
-}
-
-/*
- * We adjust the priority of the current process. The priority of a process
- * gets worse as it accumulates CPU time. The CPU usage estimator (p_estcpu)
- * is increased here. The formula for computing priorities (in kern_synch.c)
- * will compute a different value each time p_estcpu increases. This can
- * cause a switch, but unless the priority crosses a PPQ boundary the actual
- * queue will not change. The CPU usage estimator ramps up quite quickly
- * when the process is running (linearly), and decays away exponentially, at
- * a rate which is proportionally slower when the system is busy. The basic
- * principle is that the system will 90% forget that the process used a lot
- * of CPU time in 5 * loadav seconds. This causes the system to favor
- * processes which haven't run much recently, and to round-robin among other
- * processes.
- */
-
-void
-schedclock(struct lwp *l)
-{
- struct proc *p = l->l_proc;
-
- KASSERT(!CURCPU_IDLE_P());
- mutex_spin_enter(&p->p_stmutex);
- p->p_estcpu = ESTCPULIM(p->p_estcpu + (1 << ESTCPU_SHIFT));
- lwp_lock(l);
- resetpriority(l);
- mutex_spin_exit(&p->p_stmutex);
- if ((l->l_flag & L_SYSTEM) == 0 && l->l_priority >= PUSER)
- l->l_priority = l->l_usrpri;
- lwp_unlock(l);
-}
-
/*
* suspendsched:
*
@@ -1086,43 +650,6 @@
}
/*
- * scheduler_fork_hook:
- *
- * Inherit the parent's scheduler history.
- */
-void
-scheduler_fork_hook(struct proc *parent, struct proc *child)
-{
-
- LOCK_ASSERT(mutex_owned(&parent->p_smutex));
-
- child->p_estcpu = child->p_estcpu_inherited = parent->p_estcpu;
- child->p_forktime = schedcpu_ticks;
-}
-
-/*
- * scheduler_wait_hook:
- *
- * Chargeback parents for the sins of their children.
- */
-void
-scheduler_wait_hook(struct proc *parent, struct proc *child)
-{
- fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
- fixpt_t estcpu;
-
- /* XXX Only if parent != init?? */
-
- mutex_spin_enter(&parent->p_stmutex);
- estcpu = decay_cpu_batch(loadfac, child->p_estcpu_inherited,
- schedcpu_ticks - child->p_forktime);
- if (child->p_estcpu > estcpu)
- parent->p_estcpu =
- ESTCPULIM(parent->p_estcpu + child->p_estcpu - estcpu);
- mutex_spin_exit(&parent->p_stmutex);
-}
-
-/*
* sched_kpri:
*
* Scale a priority level to a kernel priority level, usually
@@ -1174,232 +701,34 @@
panic("sched_unsleep");
}
-/*
- * sched_changepri:
- *
- * Adjust the priority of an LWP.
- */
-void
-sched_changepri(struct lwp *l, int pri)
+inline void
+resched_cpu(struct lwp *l, u_char pri)
{
+ struct cpu_info *ci;
- LOCK_ASSERT(lwp_locked(l, &sched_mutex));
-
- l->l_usrpri = pri;
-
- if (l->l_priority < PUSER)
- return;
- if (l->l_stat != LSRUN || (l->l_flag & L_INMEM) == 0 ||
- (l->l_priority / PPQ) == (pri / PPQ)) {
- l->l_priority = pri;
- return;
- }
-
- remrunqueue(l);
- l->l_priority = pri;
- setrunqueue(l);
- resched_lwp(l, pri);
-}
-
-/*
- * On some architectures, it's faster to use a MSB ordering for the priorites
- * than the traditional LSB ordering.
- */
-#ifdef __HAVE_BIGENDIAN_BITOPS
-#define RQMASK(n) (0x80000000 >> (n))
-#else
-#define RQMASK(n) (0x00000001 << (n))
-#endif
-
-/*
- * Low-level routines to access the run queue. Optimised assembler
- * routines can override these.
- */
-
-#ifndef __HAVE_MD_RUNQUEUE
-
-/*
- * The primitives that manipulate the run queues. whichqs tells which
- * of the 32 queues qs have processes in them. Setrunqueue puts processes
- * into queues, remrunqueue removes them from queues. The running process is
- * on no queue, other processes are on a queue related to p->p_priority,
- * divided by 4 actually to shrink the 0-127 range of priorities into the 32
- * available queues.
- */
-#ifdef RQDEBUG
-static void
-checkrunqueue(int whichq, struct lwp *l)
-{
- const struct prochd * const rq = &sched_qs[whichq];
- struct lwp *l2;
- int found = 0;
- int die = 0;
- int empty = 1;
- for (l2 = rq->ph_link; l2 != (const void*) rq; l2 = l2->l_forw) {
- if (l2->l_stat != LSRUN) {
- printf("checkrunqueue[%d]: lwp %p state (%d) "
- " != LSRUN\n", whichq, l2, l2->l_stat);
- }
- if (l2->l_back->l_forw != l2) {
- printf("checkrunqueue[%d]: lwp %p back-qptr (%p) "
- "corrupt %p\n", whichq, l2, l2->l_back,
- l2->l_back->l_forw);
- die = 1;
- }
- if (l2->l_forw->l_back != l2) {
- printf("checkrunqueue[%d]: lwp %p forw-qptr (%p) "
- "corrupt %p\n", whichq, l2, l2->l_forw,
- l2->l_forw->l_back);
- die = 1;
- }
- if (l2 == l)
- found = 1;
- empty = 0;
- }
- if (empty && (sched_whichqs & RQMASK(whichq)) != 0) {
- printf("checkrunqueue[%d]: bit set for empty run-queue %p\n",
- whichq, rq);
- die = 1;
- } else if (!empty && (sched_whichqs & RQMASK(whichq)) == 0) {
- printf("checkrunqueue[%d]: bit clear for non-empty "
- "run-queue %p\n", whichq, rq);
- die = 1;
- }
- if (l != NULL && (sched_whichqs & RQMASK(whichq)) == 0) {
- printf("checkrunqueue[%d]: bit clear for active lwp %p\n",
- whichq, l);
- die = 1;
- }
- if (l != NULL && empty) {
- printf("checkrunqueue[%d]: empty run-queue %p with "
- "active lwp %p\n", whichq, rq, l);
- die = 1;
- }
- if (l != NULL && !found) {
- printf("checkrunqueue[%d]: lwp %p not in runqueue %p!",
- whichq, l, rq);
- die = 1;
- }
- if (die)
- panic("checkrunqueue: inconsistency found");
-}
-#endif /* RQDEBUG */
-
-void
-setrunqueue(struct lwp *l)
-{
- struct prochd *rq;
- struct lwp *prev;
- const int whichq = l->l_priority / PPQ;
-
- LOCK_ASSERT(lwp_locked(l, &sched_mutex));
-
-#ifdef RQDEBUG
- checkrunqueue(whichq, NULL);
-#endif
-#ifdef DIAGNOSTIC
- if (l->l_back != NULL || l->l_stat != LSRUN)
- panic("setrunqueue");
-#endif
- sched_whichqs |= RQMASK(whichq);
- rq = &sched_qs[whichq];
- prev = rq->ph_rlink;
- l->l_forw = (struct lwp *)rq;
- rq->ph_rlink = l;
- prev->l_forw = l;
- l->l_back = prev;
-#ifdef RQDEBUG
- checkrunqueue(whichq, l);
-#endif
-}
-
-/*
- * XXXSMP When LWP dispatch (cpu_switch()) is changed to use remrunqueue(),
- * drop of the effective priority level from kernel to user needs to be
- * moved here from userret(). The assignment in userret() is currently
- * done unlocked.
- */
-void
-remrunqueue(struct lwp *l)
-{
- struct lwp *prev, *next;
- const int whichq = l->l_priority / PPQ;
-
- LOCK_ASSERT(lwp_locked(l, &sched_mutex));
-
-#ifdef RQDEBUG
- checkrunqueue(whichq, l);
-#endif
-
-#if defined(DIAGNOSTIC)
- if (((sched_whichqs & RQMASK(whichq)) == 0) || l->l_back == NULL) {
- /* Shouldn't happen - interrupts disabled. */
- panic("remrunqueue: bit %d not set", whichq);
- }
-#endif
- prev = l->l_back;
- l->l_back = NULL;
- next = l->l_forw;
- prev->l_forw = next;
- next->l_back = prev;
- if (prev == next)
- sched_whichqs &= ~RQMASK(whichq);
-#ifdef RQDEBUG
- checkrunqueue(whichq, NULL);
-#endif
-}
-
-struct lwp *
-nextrunqueue(void)
-{
- const struct prochd *rq;
- struct lwp *l;
- int whichq;
-
- if (sched_whichqs == 0) {
- return NULL;
- }
-#ifdef __HAVE_BIGENDIAN_BITOPS
- for (whichq = 0; ; whichq++) {
- if ((sched_whichqs & RQMASK(whichq)) != 0) {
- break;
- }
- }
-#else
- whichq = ffs(sched_whichqs) - 1;
-#endif
- rq = &sched_qs[whichq];
- l = rq->ph_link;
- return l;
-}
-
-#endif /* !defined(__HAVE_MD_RUNQUEUE) */
-
-#if defined(DDB)
-void
-sched_print_runqueue(void (*pr)(const char *, ...))
-{
- struct prochd *ph;
- struct lwp *l;
- int i, first;
-
- for (i = 0; i < RUNQUE_NQS; i++)
- {
- first = 1;
- ph = &sched_qs[i];
- for (l = ph->ph_link; l != (void *)ph; l = l->l_forw) {
- if (first) {
- (*pr)("%c%d",
- (sched_whichqs & RQMASK(i))
- ? ' ' : '!', i);
- first = 0;
- }
- (*pr)("\t%d.%d (%s) pri=%d usrpri=%d\n",
- l->l_proc->p_pid,
- l->l_lid, l->l_proc->p_comm,
- (int)l->l_priority, (int)l->l_usrpri);
- }
- }
+ /*
+ * XXXSMP
+ * Since l->l_cpu persists across a context switch,
+ * this gives us *very weak* processor affinity, in
+ * that we notify the CPU on which the process last
+ * ran that it should try to switch.
+ *
+ * This does not guarantee that the process will run on
+ * that processor next, because another processor might
+ * grab it the next time it performs a context switch.
+ *
+ * This also does not handle the case where its last
+ * CPU is running a higher-priority process, but every
+ * other CPU is running a lower-priority process. There
+ * are ways to handle this situation, but they're not
+ * currently very pretty, and we also need to weigh the
+ * cost of moving a process from one CPU to another.
+ *
+ * XXXSMP
+ * There is also the issue of locking the other CPU's
+ * sched state, which we currently do not do.
+ */
+ ci = (l->l_cpu != NULL) ? l->l_cpu : curcpu();
+ if (pri < ci->ci_schedstate.spc_curpriority)
+ cpu_need_resched(ci);
}
-#endif /* defined(DDB) */
-#undef RQMASK
Index: kern/sched_4bsd.c
===================================================================
RCS file: kern/sched_4bsd.c
diff -N kern/sched_4bsd.c
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ kern/sched_4bsd.c 17 Feb 2007 20:38:25 -0000
@@ -0,0 +1,820 @@
+/* $NetBSD: kern_synch.c,v 1.176 2007/02/10 14:02:01 yamt Exp $ */
+
+/*-
+ * Copyright (c) 1999, 2000, 2004, 2006, 2007 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
+ * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran, and
+ * Daniel Sieger.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the NetBSD
+ * Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*-
+ * Copyright (c) 1982, 1986, 1990, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.176 2007/02/10 14:02:01 yamt Exp $");
+
+#include "opt_ddb.h"
+#include "opt_kstack.h"
+#include "opt_lockdebug.h"
+#include "opt_multiprocessor.h"
+#include "opt_perfctrs.h"
+
+#define __MUTEX_PRIVATE
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/callout.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/buf.h>
+#if defined(PERFCTRS)
+#include <sys/pmc.h>
+#endif
+#include <sys/signalvar.h>
+#include <sys/resourcevar.h>
+#include <sys/sched.h>
+#include <sys/kauth.h>
+#include <sys/sleepq.h>
+#include <sys/lockdebug.h>
+
+#include <uvm/uvm_extern.h>
+
+#include <machine/cpu.h>
+
+/*
+ * Run queues.
+ *
+ * We have 32 run queues in descending priority of 0..31. We maintain
+ * a bitmask of non-empty queues in order to speed up finding the first
+ * runnable process. The bitmask is maintained only by machine-dependent
+ * code, allowing the most efficient instructions to be used to find the
+ * first non-empty queue.
+ */
+
+
+#define RUNQUE_NQS 32 /* number of runqueues */
+#define PPQ (128 / RUNQUE_NQS) /* priorities per queue */
+
+struct prochd {
+ struct lwp *ph_link;
+ struct lwp *ph_rlink;
+};
+
+struct prochd sched_qs[RUNQUE_NQS]; /* run queues */
+volatile uint32_t sched_whichqs; /* bitmap of non-empty queues */
+
+void schedcpu(void *);
+void updatepri(struct lwp *);
+void resetpriority (struct lwp *);
+void resetprocpriority(struct proc *);
+
+struct callout schedcpu_ch = CALLOUT_INITIALIZER_SETFUNC(schedcpu, NULL);
+static unsigned int schedcpu_ticks;
+
+int rrticks; /* number of hardclock ticks per sched_tick() */
+
+/*
+ * Force switch among equal priority processes every 100ms.
+ * Called from hardclock every hz/10 == rrticks hardclock ticks.
+ */
+/* ARGSUSED */
+void
+sched_tick(struct cpu_info *ci)
+{
+ struct schedstate_percpu *spc = &ci->ci_schedstate;
+
+ spc->spc_ticks = rrticks;
+
+ if (!CURCPU_IDLE_P()) {
+ if (spc->spc_flags & SPCF_SEENRR) {
+ /*
+ * The process has already been through a roundrobin
+ * without switching and may be hogging the CPU.
+ * Indicate that the process should yield.
+ */
+ spc->spc_flags |= SPCF_SHOULDYIELD;
+ } else
+ spc->spc_flags |= SPCF_SEENRR;
+ }
+ cpu_need_resched(curcpu());
+}
+
+#define NICE_WEIGHT 2 /* priorities per nice level */
+
+#define ESTCPU_SHIFT 11
+#define ESTCPU_MAX ((NICE_WEIGHT * PRIO_MAX - PPQ) << ESTCPU_SHIFT)
+#define ESTCPULIM(e) min((e), ESTCPU_MAX)
+
+/*
+ * Constants for digital decay and forget:
+ * 90% of (p_estcpu) usage in 5 * loadav time
+ * 95% of (p_pctcpu) usage in 60 seconds (load insensitive)
+ * Note that, as ps(1) mentions, this can let percentages
+ * total over 100% (I've seen 137.9% for 3 processes).
+ *
+ * Note that hardclock updates p_estcpu and p_cpticks independently.
+ *
+ * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
+ * That is, the system wants to compute a value of decay such
+ * that the following for loop:
+ * for (i = 0; i < (5 * loadavg); i++)
+ * p_estcpu *= decay;
+ * will compute
+ * p_estcpu *= 0.1;
+ * for all values of loadavg:
+ *
+ * Mathematically this loop can be expressed by saying:
+ * decay ** (5 * loadavg) ~= .1
+ *
+ * The system computes decay as:
+ * decay = (2 * loadavg) / (2 * loadavg + 1)
+ *
+ * We wish to prove that the system's computation of decay
+ * will always fulfill the equation:
+ * decay ** (5 * loadavg) ~= .1
+ *
+ * If we compute b as:
+ * b = 2 * loadavg
+ * then
+ * decay = b / (b + 1)
+ *
+ * We now need to prove two things:
+ * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
+ * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
+ *
+ * Facts:
+ * For x close to zero, exp(x) =~ 1 + x, since
+ * exp(x) = 0! + x**1/1! + x**2/2! + ... .
+ * therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
+ * For x close to zero, ln(1+x) =~ x, since
+ * ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1
+ * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
+ * ln(.1) =~ -2.30
+ *
+ * Proof of (1):
+ * Solve (factor)**(power) =~ .1 given power (5*loadav):
+ * solving for factor,
+ * ln(factor) =~ (-2.30/5*loadav), or
+ * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
+ * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED
+ *
+ * Proof of (2):
+ * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
+ * solving for power,
+ * power*ln(b/(b+1)) =~ -2.30, or
+ * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. QED
+ *
+ * Actual power values for the implemented algorithm are as follows:
+ * loadav: 1 2 3 4
+ * power: 5.68 10.32 14.94 19.55
+ */
+
+/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
+#define loadfactor(loadav) (2 * (loadav))
+
+static fixpt_t
+decay_cpu(fixpt_t loadfac, fixpt_t estcpu)
+{
+
+ if (estcpu == 0) {
+ return 0;
+ }
+
+#if !defined(_LP64)
+	/* avoid 64-bit arithmetic. */
+#define FIXPT_MAX ((fixpt_t)((UINTMAX_C(1) << sizeof(fixpt_t) * CHAR_BIT) - 1))
+ if (__predict_true(loadfac <= FIXPT_MAX / ESTCPU_MAX)) {
+ return estcpu * loadfac / (loadfac + FSCALE);
+ }
+#endif /* !defined(_LP64) */
+
+ return (uint64_t)estcpu * loadfac / (loadfac + FSCALE);
+}
+
+/*
+ * For all load averages >= 1 and max p_estcpu of (255 << ESTCPU_SHIFT),
+ * sleeping for at least seven times the loadfactor will decay p_estcpu to
+ * less than (1 << ESTCPU_SHIFT).
+ *
+ * note that our ESTCPU_MAX is actually much smaller than (255 << ESTCPU_SHIFT).
+ */
+static fixpt_t
+decay_cpu_batch(fixpt_t loadfac, fixpt_t estcpu, unsigned int n)
+{
+
+ if ((n << FSHIFT) >= 7 * loadfac) {
+ return 0;
+ }
+
+ while (estcpu != 0 && n > 1) {
+ estcpu = decay_cpu(loadfac, estcpu);
+ n--;
+ }
+
+ return estcpu;
+}
+
+/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
+fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
+
+/*
+ * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
+ * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
+ * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
+ *
+ * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
+ * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
+ *
+ * If you don't want to bother with the faster/more-accurate formula, you
+ * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
+ * (more general) method of calculating the %age of CPU used by a process.
+ */
+#define CCPU_SHIFT 11
+
+/*
+ * schedcpu:
+ *
+ * Recompute process priorities, every hz ticks.
+ *
+ * XXXSMP This needs to be reorganised in order to reduce the locking
+ * burden.
+ */
+/* ARGSUSED */
+void
+schedcpu(void *arg)
+{
+ fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
+ struct rlimit *rlim;
+ struct lwp *l;
+ struct proc *p;
+ int minslp, clkhz, sig;
+ long runtm;
+
+ schedcpu_ticks++;
+
+ mutex_enter(&proclist_mutex);
+ PROCLIST_FOREACH(p, &allproc) {
+ /*
+ * Increment time in/out of memory and sleep time (if
+ * sleeping). We ignore overflow; with 16-bit int's
+ * (remember them?) overflow takes 45 days.
+ */
+ minslp = 2;
+ mutex_enter(&p->p_smutex);
+ runtm = p->p_rtime.tv_sec;
+ LIST_FOREACH(l, &p->p_lwps, l_sibling) {
+ if ((l->l_flag & L_IDLE) != 0)
+ continue;
+ lwp_lock(l);
+ runtm += l->l_rtime.tv_sec;
+ l->l_swtime++;
+ if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
+ l->l_stat == LSSUSPENDED) {
+ l->l_slptime++;
+ minslp = min(minslp, l->l_slptime);
+ } else
+ minslp = 0;
+ lwp_unlock(l);
+ }
+ p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
+
+ /*
+ * Check if the process exceeds its CPU resource allocation.
+ * If over max, kill it.
+ */
+ rlim = &p->p_rlimit[RLIMIT_CPU];
+ sig = 0;
+ if (runtm >= rlim->rlim_cur) {
+ if (runtm >= rlim->rlim_max)
+ sig = SIGKILL;
+ else {
+ sig = SIGXCPU;
+ if (rlim->rlim_cur < rlim->rlim_max)
+ rlim->rlim_cur += 5;
+ }
+ }
+
+ /*
+ * If the process has run for more than autonicetime, reduce
+ * priority to give others a chance.
+ */
+ if (autonicetime && runtm > autonicetime && p->p_nice == NZERO
+ && kauth_cred_geteuid(p->p_cred)) {
+ mutex_spin_enter(&p->p_stmutex);
+ p->p_nice = autoniceval + NZERO;
+ resetprocpriority(p);
+ mutex_spin_exit(&p->p_stmutex);
+ }
+
+ /*
+ * If the process has slept the entire second,
+ * stop recalculating its priority until it wakes up.
+ */
+ if (minslp <= 1) {
+ /*
+ * p_pctcpu is only for ps.
+ */
+ mutex_spin_enter(&p->p_stmutex);
+ clkhz = stathz != 0 ? stathz : hz;
+#if (FSHIFT >= CCPU_SHIFT)
+ p->p_pctcpu += (clkhz == 100)?
+ ((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
+ 100 * (((fixpt_t) p->p_cpticks)
+ << (FSHIFT - CCPU_SHIFT)) / clkhz;
+#else
+ p->p_pctcpu += ((FSCALE - ccpu) *
+ (p->p_cpticks * FSCALE / clkhz)) >> FSHIFT;
+#endif
+ p->p_cpticks = 0;
+ p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
+
+ LIST_FOREACH(l, &p->p_lwps, l_sibling) {
+ if ((l->l_flag & L_IDLE) != 0)
+ continue;
+ lwp_lock(l);
+ if (l->l_slptime <= 1 &&
+ l->l_priority >= PUSER)
+ resetpriority(l);
+ lwp_unlock(l);
+ }
+ mutex_spin_exit(&p->p_stmutex);
+ }
+
+ mutex_exit(&p->p_smutex);
+ if (sig) {
+ psignal(p, sig);
+ }
+ }
+ mutex_exit(&proclist_mutex);
+ uvm_meter();
+ wakeup((caddr_t)&lbolt);
+ callout_schedule(&schedcpu_ch, hz);
+}
+
+/*
+ * Recalculate the priority of a process after it has slept for a while.
+ */
+void
+updatepri(struct lwp *l)
+{
+ struct proc *p = l->l_proc;
+ fixpt_t loadfac;
+
+ LOCK_ASSERT(lwp_locked(l, NULL));
+ KASSERT(l->l_slptime > 1);
+
+ loadfac = loadfactor(averunnable.ldavg[0]);
+
+ l->l_slptime--; /* the first time was done in schedcpu */
+ /* XXX NJWLWP */
+ /* XXXSMP occasionally unlocked, should be per-LWP */
+ p->p_estcpu = decay_cpu_batch(loadfac, p->p_estcpu, l->l_slptime);
+ resetpriority(l);
+}
+
+/*
+ * Initialize the (doubly-linked) run queues
+ * to be empty.
+ */
+void
+sched_rqinit()
+{
+ int i;
+
+ for (i = 0; i < RUNQUE_NQS; i++)
+ sched_qs[i].ph_link = sched_qs[i].ph_rlink =
+ (struct lwp *)&sched_qs[i];
+
+ mutex_init(&sched_mutex, MUTEX_SPIN, IPL_SCHED);
+}
+
+void
+sched_setup()
+{
+ rrticks = hz / 10;
+
+ schedcpu(NULL);
+}
+
+void
+sched_setrunnable(struct lwp *l)
+{
+ if (l->l_slptime > 1)
+ updatepri(l);
+}
+
+boolean_t
+sched_curcpu_runnable_p(void)
+{
+
+ return sched_whichqs != 0;
+}
+
+void
+sched_nice(struct proc *chgp, int n)
+{
+ chgp->p_nice = n;
+ (void)resetprocpriority(chgp);
+}
+
+/*
+ * Compute the priority of a process when running in user mode.
+ * Arrange to reschedule if the resulting priority is better
+ * than that of the current process.
+ */
+void
+resetpriority(struct lwp *l)
+{
+ unsigned int newpriority;
+ struct proc *p = l->l_proc;
+
+ /* XXXSMP LOCK_ASSERT(mutex_owned(&p->p_stmutex)); */
+ LOCK_ASSERT(lwp_locked(l, NULL));
+
+ if ((l->l_flag & L_SYSTEM) != 0)
+ return;
+
+ newpriority = PUSER + (p->p_estcpu >> ESTCPU_SHIFT) +
+ NICE_WEIGHT * (p->p_nice - NZERO);
+ newpriority = min(newpriority, MAXPRI);
+ lwp_changepri(l, newpriority);
+}
+
+/*
+ * Recompute priority for all LWPs in a process.
+ */
+void
+resetprocpriority(struct proc *p)
+{
+ struct lwp *l;
+
+ LOCK_ASSERT(mutex_owned(&p->p_stmutex));
+
+ LIST_FOREACH(l, &p->p_lwps, l_sibling) {
+ lwp_lock(l);
+ resetpriority(l);
+ lwp_unlock(l);
+ }
+}
+
+/*
+ * We adjust the priority of the current process. The priority of a process
+ * gets worse as it accumulates CPU time. The CPU usage estimator (p_estcpu)
+ * is increased here. The formula for computing priorities (in kern_synch.c)
+ * will compute a different value each time p_estcpu increases. This can
+ * cause a switch, but unless the priority crosses a PPQ boundary the actual
+ * queue will not change. The CPU usage estimator ramps up quite quickly
+ * when the process is running (linearly), and decays away exponentially, at
+ * a rate which is proportionally slower when the system is busy. The basic
+ * principle is that the system will 90% forget that the process used a lot
+ * of CPU time in 5 * loadav seconds. This causes the system to favor
+ * processes which haven't run much recently, and to round-robin among other
+ * processes.
+ */
+
+void
+schedclock(struct lwp *l)
+{
+ struct proc *p = l->l_proc;
+
+ KASSERT(!CURCPU_IDLE_P());
+ mutex_spin_enter(&p->p_stmutex);
+ p->p_estcpu = ESTCPULIM(p->p_estcpu + (1 << ESTCPU_SHIFT));
+ lwp_lock(l);
+ resetpriority(l);
+ mutex_spin_exit(&p->p_stmutex);
+ if ((l->l_flag & L_SYSTEM) == 0 && l->l_priority >= PUSER)
+ l->l_priority = l->l_usrpri;
+ lwp_unlock(l);
+}
+
+/*
+ * scheduler_fork_hook:
+ *
+ * Inherit the parent's scheduler history.
+ */
+void
+sched_proc_fork(struct proc *parent, struct proc *child)
+{
+
+ LOCK_ASSERT(mutex_owned(&parent->p_smutex));
+
+ child->p_estcpu = child->p_estcpu_inherited = parent->p_estcpu;
+ child->p_forktime = schedcpu_ticks;
+}
+
+/*
+ * scheduler_wait_hook:
+ *
+ * Chargeback parents for the sins of their children.
+ */
+void
+sched_proc_exit(struct proc *parent, struct proc *child)
+{
+ fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
+ fixpt_t estcpu;
+
+ /* XXX Only if parent != init?? */
+
+ mutex_spin_enter(&parent->p_stmutex);
+ estcpu = decay_cpu_batch(loadfac, child->p_estcpu_inherited,
+ schedcpu_ticks - child->p_forktime);
+ if (child->p_estcpu > estcpu)
+ parent->p_estcpu =
+ ESTCPULIM(parent->p_estcpu + child->p_estcpu - estcpu);
+ mutex_spin_exit(&parent->p_stmutex);
+}
+
+/*
+ * sched_changepri:
+ *
+ * Adjust the priority of an LWP.
+ */
+void
+sched_changepri(struct lwp *l, int pri)
+{
+
+ LOCK_ASSERT(lwp_locked(l, &sched_mutex));
+
+ l->l_usrpri = pri;
+
+ if (l->l_priority < PUSER)
+ return;
+ if (l->l_stat != LSRUN || (l->l_flag & L_INMEM) == 0 ||
+ (l->l_priority / PPQ) == (pri / PPQ)) {
+ l->l_priority = pri;
+ return;
+ }
+
+ sched_dequeue(l);
+ l->l_priority = pri;
+ sched_enqueue(l);
+ resched_cpu(l, pri);
+}
+
+/*
+ * On some architectures, it's faster to use an MSB ordering for the priorities
+ * than the traditional LSB ordering.
+ */
+#ifdef __HAVE_BIGENDIAN_BITOPS
+#define RQMASK(n) (0x80000000 >> (n))
+#else
+#define RQMASK(n) (0x00000001 << (n))
+#endif
+
+/*
+ * Low-level routines to access the run queue. Optimised assembler
+ * routines can override these.
+ */
+
+#ifndef __HAVE_MD_RUNQUEUE
+
+/*
+ * The primitives that manipulate the run queues. whichqs tells which
+ * of the 32 queues qs have processes in them. Setrunqueue puts processes
+ * into queues, remrunqueue removes them from queues. The running process is
+ * on no queue, other processes are on a queue related to p->p_priority,
+ * divided by 4 actually to shrink the 0-127 range of priorities into the 32
+ * available queues.
+ */
+#ifdef RQDEBUG
+static void
+checkrunqueue(int whichq, struct lwp *l)
+{
+ const struct prochd * const rq = &sched_qs[whichq];
+ struct lwp *l2;
+ int found = 0;
+ int die = 0;
+ int empty = 1;
+ for (l2 = rq->ph_link; l2 != (const void*) rq; l2 = l2->l_forw) {
+ if (l2->l_stat != LSRUN) {
+ printf("checkrunqueue[%d]: lwp %p state (%d) "
+ " != LSRUN\n", whichq, l2, l2->l_stat);
+ }
+ if (l2->l_back->l_forw != l2) {
+ printf("checkrunqueue[%d]: lwp %p back-qptr (%p) "
+ "corrupt %p\n", whichq, l2, l2->l_back,
+ l2->l_back->l_forw);
+ die = 1;
+ }
+ if (l2->l_forw->l_back != l2) {
+ printf("checkrunqueue[%d]: lwp %p forw-qptr (%p) "
+ "corrupt %p\n", whichq, l2, l2->l_forw,
+ l2->l_forw->l_back);
+ die = 1;
+ }
+ if (l2 == l)
+ found = 1;
+ empty = 0;
+ }
+ if (empty && (sched_whichqs & RQMASK(whichq)) != 0) {
+ printf("checkrunqueue[%d]: bit set for empty run-queue %p\n",
+ whichq, rq);
+ die = 1;
+ } else if (!empty && (sched_whichqs & RQMASK(whichq)) == 0) {
+ printf("checkrunqueue[%d]: bit clear for non-empty "
+ "run-queue %p\n", whichq, rq);
+ die = 1;
+ }
+ if (l != NULL && (sched_whichqs & RQMASK(whichq)) == 0) {
+ printf("checkrunqueue[%d]: bit clear for active lwp %p\n",
+ whichq, l);
+ die = 1;
+ }
+ if (l != NULL && empty) {
+ printf("checkrunqueue[%d]: empty run-queue %p with "
+ "active lwp %p\n", whichq, rq, l);
+ die = 1;
+ }
+ if (l != NULL && !found) {
+ printf("checkrunqueue[%d]: lwp %p not in runqueue %p!",
+ whichq, l, rq);
+ die = 1;
+ }
+ if (die)
+ panic("checkrunqueue: inconsistency found");
+}
+#endif /* RQDEBUG */
+
+void
+sched_enqueue(struct lwp *l)
+{
+	struct prochd *rq;
+ struct lwp *prev;
+ const int whichq = l->l_priority / PPQ;
+
+ LOCK_ASSERT(lwp_locked(l, &sched_mutex));
+
+#ifdef RQDEBUG
+ checkrunqueue(whichq, NULL);
+#endif
+#ifdef DIAGNOSTIC
+ if (l->l_back != NULL || l->l_stat != LSRUN)
+ panic("setrunqueue");
+#endif
+ sched_whichqs |= RQMASK(whichq);
+ rq = &sched_qs[whichq];
+ prev = rq->ph_rlink;
+ l->l_forw = (struct lwp *)rq;
+ rq->ph_rlink = l;
+ prev->l_forw = l;
+ l->l_back = prev;
+#ifdef RQDEBUG
+ checkrunqueue(whichq, l);
+#endif
+}
+
+/*
+ * XXXSMP When LWP dispatch (cpu_switch()) is changed to use sched_dequeue(),
+ * drop of the effective priority level from kernel to user needs to be
+ * moved here from userret(). The assignment in userret() is currently
+ * done unlocked.
+ */
+void
+sched_dequeue(struct lwp *l)
+{
+ struct lwp *prev, *next;
+ const int whichq = l->l_priority / PPQ;
+
+ LOCK_ASSERT(lwp_locked(l, &sched_mutex));
+
+#ifdef RQDEBUG
+ checkrunqueue(whichq, l);
+#endif
+
+#if defined(DIAGNOSTIC)
+ if (((sched_whichqs & RQMASK(whichq)) == 0) || l->l_back == NULL) {
+ /* Shouldn't happen - interrupts disabled. */
+ panic("remrunqueue: bit %d not set", whichq);
+ }
+#endif
+ prev = l->l_back;
+ l->l_back = NULL;
+ next = l->l_forw;
+ prev->l_forw = next;
+ next->l_back = prev;
+ if (prev == next)
+ sched_whichqs &= ~RQMASK(whichq);
+#ifdef RQDEBUG
+ checkrunqueue(whichq, NULL);
+#endif
+}
+
+struct lwp *
+sched_nextlwp(void)
+{
+ const struct prochd *rq;
+ struct lwp *l;
+ int whichq;
+
+ if (sched_whichqs == 0) {
+ return NULL;
+ }
+#ifdef __HAVE_BIGENDIAN_BITOPS
+ for (whichq = 0; ; whichq++) {
+ if ((sched_whichqs & RQMASK(whichq)) != 0) {
+ break;
+ }
+ }
+#else
+ whichq = ffs(sched_whichqs) - 1;
+#endif
+ rq = &sched_qs[whichq];
+ l = rq->ph_link;
+ return l;
+}
+
+#endif /* !defined(__HAVE_MD_RUNQUEUE) */
+
+#if defined(DDB)
+void
+sched_print_runqueue(void (*pr)(const char *, ...))
+{
+ struct prochd *ph;
+ struct lwp *l;
+ int i, first;
+
+ for (i = 0; i < RUNQUE_NQS; i++)
+ {
+ first = 1;
+ ph = &sched_qs[i];
+ for (l = ph->ph_link; l != (void *)ph; l = l->l_forw) {
+ if (first) {
+ (*pr)("%c%d",
+ (sched_whichqs & RQMASK(i))
+ ? ' ' : '!', i);
+ first = 0;
+ }
+ (*pr)("\t%d.%d (%s) pri=%d usrpri=%d\n",
+ l->l_proc->p_pid,
+ l->l_lid, l->l_proc->p_comm,
+ (int)l->l_priority, (int)l->l_usrpri);
+ }
+ }
+}
+#endif /* defined(DDB) */
+#undef RQMASK
Index: kern/sys_lwp.c
===================================================================
RCS file: /cvsroot/src/sys/kern/sys_lwp.c,v
retrieving revision 1.3
diff -u -r1.3 sys_lwp.c
--- kern/sys_lwp.c 15 Feb 2007 20:21:13 -0000 1.3
+++ kern/sys_lwp.c 17 Feb 2007 20:38:25 -0000
@@ -151,7 +151,7 @@
LOCK_ASSERT(lwp_locked(l2, &sched_mutex));
p->p_nrlwps++;
l2->l_stat = LSRUN;
- setrunqueue(l2);
+ sched_enqueue(l2);
}
} else
l2->l_stat = LSSUSPENDED;
Index: sys/lwp.h
===================================================================
RCS file: /cvsroot/src/sys/sys/lwp.h,v
retrieving revision 1.48.2.1
diff -u -r1.48.2.1 lwp.h
--- sys/lwp.h 17 Feb 2007 10:31:03 -0000 1.48.2.1
+++ sys/lwp.h 17 Feb 2007 20:38:26 -0000
@@ -218,24 +218,6 @@
lwp_update_creds(l); \
} while (/* CONSTCOND */ 0)
-void preempt (void);
-int mi_switch (struct lwp *, struct lwp *);
-#ifndef remrunqueue
-void remrunqueue (struct lwp *);
-#endif
-void resetpriority (struct lwp *);
-void setrunnable (struct lwp *);
-#ifndef setrunqueue
-void setrunqueue (struct lwp *);
-#endif
-#ifndef nextrunqueue
-struct lwp *nextrunqueue(void);
-#endif
-void unsleep (struct lwp *);
-#ifndef cpu_switchto
-struct lwp *cpu_switchto(struct lwp *, struct lwp *);
-#endif
-
void lwp_startup(struct lwp *, struct lwp *);
int lwp_locked(struct lwp *, kmutex_t *);
Index: sys/proc.h
===================================================================
RCS file: /cvsroot/src/sys/sys/proc.h,v
retrieving revision 1.236.2.1
diff -u -r1.236.2.1 proc.h
--- sys/proc.h 17 Feb 2007 10:31:04 -0000 1.236.2.1
+++ sys/proc.h 17 Feb 2007 20:38:26 -0000
@@ -538,7 +538,6 @@
void yield(void);
void pgdelete(struct pgrp *);
void procinit(void);
-void resetprocpriority(struct proc *);
void suspendsched(void);
int ltsleep(wchan_t, int, const char *, int,
volatile struct simplelock *);
@@ -554,7 +553,6 @@
void exit_lwps(struct lwp *l);
int fork1(struct lwp *, int, int, void *, size_t,
void (*)(void *), void *, register_t *, struct proc **);
-void rqinit(void);
int pgid_in_session(struct proc *, pid_t);
#ifndef cpu_idle
void cpu_idle(void);
Index: sys/sched.h
===================================================================
RCS file: /cvsroot/src/sys/sys/sched.h,v
retrieving revision 1.30.2.1
diff -u -r1.30.2.1 sched.h
--- sys/sched.h 17 Feb 2007 10:31:06 -0000 1.30.2.1
+++ sys/sched.h 17 Feb 2007 20:38:27 -0000
@@ -5,7 +5,8 @@
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
- * by Ross Harvey, Jason R. Thorpe, Nathan J. Williams, and Andrew Doran.
+ * by Ross Harvey, Jason R. Thorpe, Nathan J. Williams, Andrew Doran and
+ * Daniel Sieger.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -80,8 +81,17 @@
#if defined(_KERNEL_OPT)
#include "opt_multiprocessor.h"
#include "opt_lockdebug.h"
+#include "opt_sched.h"
#endif
+#if defined(_KERNEL)
+
+#if defined(SCHED_4BSD)
+#include <sys/sched_4bsd.h>
+#endif
+
+#endif /* _KERNEL */
+
struct sched_param {
int sched_priority;
};
@@ -100,21 +110,6 @@
#include <sys/time.h>
/*
- * Run queues.
- *
- * We have 32 run queues in descending priority of 0..31. We maintain
- * a bitmask of non-empty queues in order speed up finding the first
- * runnable process. The bitmask is maintained only by machine-dependent
- * code, allowing the most efficient instructions to be used to find the
- * first non-empty queue.
- */
-#define RUNQUE_NQS 32
-struct prochd {
- struct lwp *ph_link;
- struct lwp *ph_rlink;
-};
-
-/*
* CPU states.
* XXX Not really scheduler state, but no other good place to put
* it right now, and it really is per-CPU.
@@ -135,7 +130,7 @@
u_int spc_schedticks; /* ticks for schedclock() */
uint64_t spc_cp_time[CPUSTATES]; /* CPU state statistics */
u_char spc_curpriority; /* usrpri of curproc */
- int spc_rrticks; /* ticks until roundrobin() */
+ int spc_ticks; /* ticks until sched_tick() */
int spc_pscnt; /* prof/stat counter */
int spc_psdiv; /* prof/stat divisor */
};
@@ -164,21 +159,45 @@
#ifdef _KERNEL
-extern int schedhz; /* ideally: 16 */
-extern int rrticks; /* ticks per roundrobin() */
-
struct proc;
struct cpu_info;
-void schedclock(struct lwp *);
-void roundrobin(struct cpu_info *);
-int sched_kpri(struct lwp *);
+/*
+ * Common Scheduler Interface
+ */
+
+/* Scheduler initialization */
+void sched_rqinit(void); /* Initialize runqueues */
+void sched_setup(void); /* Set up the scheduler, e.g. kick off timeout-driven events */
+
+/* Main scheduler functions */
+void sched_tick(struct cpu_info *); /* Maybe resched after spc_ticks hardclock() ticks */
+
+/* Runqueue-related functions */
+inline boolean_t sched_curcpu_runnable_p(void); /* Indicate runnable processes on current CPU */
+struct lwp * sched_nextlwp(void); /* Select LWP to run on the CPU next */
+void sched_enqueue(struct lwp *); /* Place a process on its runqueue */
+void sched_dequeue(struct lwp *); /* Remove a process from its runqueue */
+
+/* Priority adjustment */
+void sched_nice(struct proc *, int); /* Recalc priority according to its nice value */
+void sched_changepri(struct lwp *, int); /* Change priority of a LWP */
+
+/* General helper functions */
+void sched_proc_fork(struct proc *, struct proc *); /* Inherit scheduling history */
+void sched_proc_exit(struct proc *, struct proc *); /* Chargeback parents */
+void sched_print_runqueue(void (*pr)(const char *, ...)); /* Print runqueues in DDB */
+void sched_setrunnable(struct lwp *); /* Scheduler-specific actions for setrunnable() */
-void scheduler_fork_hook(struct proc *, struct proc *);
-void scheduler_wait_hook(struct proc *, struct proc *);
+/* Functions common to all scheduler implementations */
+void sched_wakeup(volatile const void *);
+int sched_kpri(struct lwp *);
-boolean_t sched_curcpu_runnable_p(void);
-void sched_print_runqueue(void (*)(const char *, ...));
+inline void resched_cpu(struct lwp *, u_char); /* Arrange reschedule */
+void setrunnable(struct lwp *);
+void preempt(void);
+int mi_switch(struct lwp *, struct lwp *);
+struct lwp *cpu_switchto(struct lwp *, struct lwp *);
/*
* Synchronisation object operations set.
Index: sys/sched_4bsd.h
===================================================================
RCS file: sys/sched_4bsd.h
diff -N sys/sched_4bsd.h
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ sys/sched_4bsd.h 17 Feb 2007 20:38:27 -0000
@@ -0,0 +1,83 @@
+/* $NetBSD: sched.h,v 1.29 2007/02/09 21:55:37 ad Exp $ */
+
+/*-
+ * Copyright (c) 1999, 2000, 2001, 2002 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Ross Harvey, Jason R. Thorpe, Nathan J. Williams, Andrew Doran and
+ * Daniel Sieger.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the NetBSD
+ * Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*-
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94
+ */
+
+#ifndef _SYS_SCHED_4BSD_H_
+#define _SYS_SCHED_4BSD_H_
+
+extern int schedhz; /* ideally: 16 */
+
+void schedclock(struct lwp *);
+
+#endif /* _SYS_SCHED_4BSD_H_ */
Index: uvm/uvm_glue.c
===================================================================
RCS file: /cvsroot/src/sys/uvm/uvm_glue.c,v
retrieving revision 1.99
diff -u -r1.99 uvm_glue.c
--- uvm/uvm_glue.c 15 Feb 2007 20:21:13 -0000 1.99
+++ uvm/uvm_glue.c 17 Feb 2007 20:38:32 -0000
@@ -443,7 +443,7 @@
cpu_swapin(l);
lwp_lock(l);
if (l->l_stat == LSRUN)
- setrunqueue(l);
+ sched_enqueue(l);
l->l_flag |= L_INMEM;
l->l_swtime = 0;
lwp_unlock(l);
@@ -698,7 +698,7 @@
l->l_flag &= ~L_INMEM;
l->l_swtime = 0;
if (l->l_stat == LSRUN)
- remrunqueue(l);
+ sched_dequeue(l);
lwp_unlock(l);
p->p_stats->p_ru.ru_nswap++; /* XXXSMP */
++uvmexp.swapouts;