Subject: Scheduler API changes for yamt-idlelwp
To: None <tech-kern@netbsd.org>
From: Daniel Sieger <dsieger@TechFak.Uni-Bielefeld.DE>
List: tech-kern
Date: 02/17/2007 22:17:36
--3uo+9/B/ebqu+fSQ
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

Hi folks,
Here's a proposed patch to introduce a new scheduler API in
Yamamoto-san's idle lwp branch. It is a slightly modified and
improved version of the patch I sent a couple of weeks ago (see [1]).
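
For quick reference, the common scheduler interface the patch adds to
sys/sched.h boils down to roughly the following set of entry points
(see the sched.h hunk in the attached diff for the full version):

	/* Initialization */
	void	sched_rqinit(void);		/* initialize run queues */
	void	sched_setup(void);		/* kick off timer-driven events */

	/* Periodic work and run queue access */
	void	sched_tick(struct cpu_info *);
	boolean_t sched_curcpu_runnable_p(void);
	struct lwp *sched_nextlwp(void);
	void	sched_enqueue(struct lwp *);
	void	sched_dequeue(struct lwp *);

	/* Priority adjustment */
	void	sched_nice(struct proc *, int);
	void	sched_changepri(struct lwp *, int);

	/* Fork/exit/setrunnable hooks */
	void	sched_proc_fork(struct proc *, struct proc *);
	void	sched_proc_exit(struct proc *, struct proc *);
	void	sched_setrunnable(struct lwp *);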

Note: schedcpu() will need some further work, since some parts of it
are scheduler-dependent and others are not. For example, schedcpu()
currently takes care of updating both p->p_estcpu and p->p_pctcpu.
While the former is highly specific to the 4BSD scheduler, the
latter is a POSIX requirement. Simply splitting schedcpu() into two
distinct functions is not an option, since it would have a negative
impact on performance (see the sketch below).
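
To make that concrete, here is a rough sketch (not part of the patch,
function names made up) of what a naive split would look like; each
half ends up doing its own pass over the process list, and the
per-process locking is elided for brevity:

	/* Hypothetical, for illustration only. */
	static void
	pctcpu_update(void *arg)	/* scheduler-independent part */
	{
		struct proc *p;

		mutex_enter(&proclist_mutex);
		PROCLIST_FOREACH(p, &allproc) {
			/* decay p_pctcpu, needed for ps(1)/POSIX */
			p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
		}
		mutex_exit(&proclist_mutex);
	}

	static void
	estcpu_update(void *arg)	/* 4BSD-specific part */
	{
		fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
		struct proc *p;

		mutex_enter(&proclist_mutex);
		PROCLIST_FOREACH(p, &allproc) {
			/* decay the 4BSD CPU usage estimator */
			p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
		}
		mutex_exit(&proclist_mutex);
	}

Both callouts would have to run once per second, so the process list
would get traversed twice where schedcpu() currently walks it once;
presumably that extra pass (and the extra locking) is where the
performance hit comes from.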

Backwards compatibility is also an issue here. Since we can't simply
get rid of p_estcpu, we may need to retain schedcpu() even for other
schedulers, which is not very nice.

Comments, suggestions?

Regards,
Daniel

[1]: http://mail-index.netbsd.org/tech-kern/2007/01/14/0006.html

-- 
Daniel Sieger
Faculty of Technology
Bielefeld University
wwwhomes.uni-bielefeld.de/dsieger

--3uo+9/B/ebqu+fSQ
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="csf.diff"

Index: arch/i386/conf/GENERIC
===================================================================
RCS file: /cvsroot/src/sys/arch/i386/conf/GENERIC,v
retrieving revision 1.808
diff -u -r1.808 GENERIC
--- arch/i386/conf/GENERIC	17 Feb 2007 00:28:23 -0000	1.808
+++ arch/i386/conf/GENERIC	17 Feb 2007 20:38:31 -0000
@@ -119,6 +119,9 @@
 #options 	BUFQ_READPRIO
 #options 	BUFQ_PRIOCSCAN
 
+# Scheduling algorithm
+options		SCHED_4BSD
+
 # Diagnostic/debugging support options
 #options 	DIAGNOSTIC	# expensive kernel consistency checks
 #options 	DEBUG		# expensive debugging checks/support
Index: conf/files
===================================================================
RCS file: /cvsroot/src/sys/conf/files,v
retrieving revision 1.831.2.1
diff -u -r1.831.2.1 files
--- conf/files	17 Feb 2007 10:30:49 -0000	1.831.2.1
+++ conf/files	17 Feb 2007 20:38:15 -0000
@@ -36,6 +36,8 @@
 defflag				BUFQ_READPRIO
 defflag				NEW_BUFQ_STRATEGY	# same as BUFQ_READPRIO
 
+defflag opt_sched.h             SCHED_4BSD
+
 defparam			SOMAXKVA
 defflag	opt_sock_counters.h	SOSEND_COUNTERS
 defflag	opt_sosend_loan.h	SOSEND_NO_LOAN
@@ -1318,6 +1320,7 @@
 file	kern/kern_uuid.c
 file	kern/kern_xxx.c
 file	kern/kgdb_stub.c		kgdb
+file    kern/sched_4bsd.c               sched_4bsd
 file	kern/subr_autoconf.c
 file	kern/subr_blist.c		vmswap
 file	kern/subr_bufq.c
Index: kern/init_main.c
===================================================================
RCS file: /cvsroot/src/sys/kern/init_main.c,v
retrieving revision 1.294.2.1
diff -u -r1.294.2.1 init_main.c
--- kern/init_main.c	17 Feb 2007 10:30:51 -0000	1.294.2.1
+++ kern/init_main.c	17 Feb 2007 20:38:16 -0000
@@ -120,6 +120,7 @@
 #include <sys/sysctl.h>
 #include <sys/event.h>
 #include <sys/mbuf.h>
+#include <sys/sched.h>
 #include <sys/sleepq.h>
 #include <sys/iostat.h>
 #ifdef FAST_IPSEC
@@ -242,7 +243,6 @@
 	struct pdevinit *pdev;
 	int s, error;
 	extern struct pdevinit pdevinit[];
-	extern void schedcpu(void *);
 #ifdef NVNODE_IMPLICIT
 	int usevnodes;
 #endif
@@ -339,7 +339,7 @@
 	(void)chgproccnt(0, 1);
 
 	/* Initialize the run queues, turnstiles and sleep queues. */
-	rqinit();
+	sched_rqinit();
 	turnstile_init();
 	sleeptab_init(&sleeptab);
 
@@ -467,8 +467,8 @@
 	/* Initialize system accouting. */
 	acct_init();
 
-	/* Kick off timeout driven events by calling first time. */
-	schedcpu(NULL);
+	/* Setup the scheduler */
+	sched_setup();
 
 #ifdef KTRACE
 	/* Initialize ktrace. */
Index: kern/kern_clock.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_clock.c,v
retrieving revision 1.106.2.1
diff -u -r1.106.2.1 kern_clock.c
--- kern/kern_clock.c	17 Feb 2007 10:30:53 -0000	1.106.2.1
+++ kern/kern_clock.c	17 Feb 2007 20:38:19 -0000
@@ -331,7 +331,9 @@
 int	schedhz;
 int	profprocs;
 int	hardclock_ticks;
+#ifdef SCHED_4BSD
 static int statscheddiv; /* stat => sched divider (used if schedhz == 0) */
+#endif
 static int psdiv;			/* prof => stat divider */
 int	psratio;			/* ratio: prof / stat */
 #ifndef __HAVE_TIMECOUNTER
@@ -404,19 +406,20 @@
 	cpu_initclocks();
 
 	/*
-	 * Compute profhz/stathz/rrticks, and fix profhz if needed.
+	 * Compute profhz and stathz, fix profhz if needed.
 	 */
 	i = stathz ? stathz : hz;
 	if (profhz == 0)
 		profhz = i;
 	psratio = profhz / i;
-	rrticks = hz / 10;
+#ifdef SCHED_4BSD
 	if (schedhz == 0) {
 		/* 16Hz is best */
 		statscheddiv = i / 16;
 		if (statscheddiv <= 0)
 			panic("statscheddiv");
 	}
+#endif /* SCHED_4BSD */
 
 #ifndef __HAVE_TIMECOUNTER
 #ifdef NTP
@@ -533,8 +536,8 @@
 	 */
 	if (stathz == 0)
 		statclock(frame);
-	if ((--ci->ci_schedstate.spc_rrticks) <= 0)
-		roundrobin(ci);
+	if ((--ci->ci_schedstate.spc_ticks) <= 0)
+		sched_tick(ci);
 
 #if defined(MULTIPROCESSOR)
 	/*
@@ -1243,6 +1246,7 @@
 	++p->p_cpticks;
 	mutex_spin_exit(&p->p_stmutex);
 
+#ifdef SCHED_4BSD
 	/*
 	 * If no separate schedclock is provided, call it here
 	 * at about 16 Hz.
@@ -1253,6 +1257,7 @@
 			ci->ci_schedstate.spc_schedticks = statscheddiv;
 		}
 	}
+#endif /* SCHED_4BSD */
 }
 
 #ifndef __HAVE_TIMECOUNTER
Index: kern/kern_exit.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_exit.c,v
retrieving revision 1.166.2.1
diff -u -r1.166.2.1 kern_exit.c
--- kern/kern_exit.c	17 Feb 2007 10:30:54 -0000	1.166.2.1
+++ kern/kern_exit.c	17 Feb 2007 20:38:21 -0000
@@ -892,7 +892,7 @@
 	leavepgrp(p);
 
 	parent = p->p_pptr;
-	scheduler_wait_hook(parent, p);
+	sched_proc_exit(parent, p);
 	ruadd(&parent->p_stats->p_cru, p->p_ru);
 	p->p_xstat = 0;
 
Index: kern/kern_fork.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_fork.c,v
retrieving revision 1.131.2.1
diff -u -r1.131.2.1 kern_fork.c
--- kern/kern_fork.c	17 Feb 2007 10:30:55 -0000	1.131.2.1
+++ kern/kern_fork.c	17 Feb 2007 20:38:21 -0000
@@ -397,7 +397,7 @@
 	p2->p_sigacts = sigactsinit(p1, flags & FORK_SHARESIGS);
 	p2->p_sflag |=
 	    (p1->p_sflag & (PS_STOPFORK | PS_STOPEXEC | PS_NOCLDSTOP));
-	scheduler_fork_hook(p1, p2);
+	sched_proc_fork(p1, p2);
 	mutex_exit(&p1->p_smutex);
 
 	p2->p_stflag = p1->p_stflag;
@@ -520,7 +520,7 @@
 		lwp_lock(l2);
 		l2->l_stat = LSRUN;
 		l2->l_flag |= tmp;
-		setrunqueue(l2);
+		sched_enqueue(l2);
 		lwp_unlock(l2);
 	}
 
Index: kern/kern_resource.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_resource.c,v
retrieving revision 1.113
diff -u -r1.113 kern_resource.c
--- kern/kern_resource.c	9 Feb 2007 21:55:31 -0000	1.113
+++ kern/kern_resource.c	17 Feb 2007 20:38:22 -0000
@@ -230,8 +230,7 @@
 		mutex_spin_exit(&chgp->p_stmutex);
 		goto again;
 	}
-	chgp->p_nice = n;
-	(void)resetprocpriority(chgp);
+	sched_nice(chgp, n);
 	mutex_spin_exit(&chgp->p_stmutex);
 	return (0);
 }
Index: kern/kern_sleepq.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_sleepq.c,v
retrieving revision 1.4
diff -u -r1.4 kern_sleepq.c
--- kern/kern_sleepq.c	15 Feb 2007 20:21:13 -0000	1.4
+++ kern/kern_sleepq.c	17 Feb 2007 20:38:23 -0000
@@ -171,12 +171,11 @@
 	 * Set it running.  We'll try to get the last CPU that ran
 	 * this LWP to pick it up again.
 	 */
-	if (l->l_slptime > 1)
-		updatepri(l);
+	sched_setrunnable(l);
 	l->l_stat = LSRUN;
 	l->l_slptime = 0;
 	if ((l->l_flag & L_INMEM) != 0) {
-		setrunqueue(l);
+		sched_enqueue(l);
 		if (l->l_priority < ci->ci_schedstate.spc_curpriority)
 			cpu_need_resched(ci);
 		sched_unlock(1);
Index: kern/kern_synch.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_synch.c,v
retrieving revision 1.177.2.3
diff -u -r1.177.2.3 kern_synch.c
--- kern/kern_synch.c	17 Feb 2007 11:13:51 -0000	1.177.2.3
+++ kern/kern_synch.c	17 Feb 2007 20:38:23 -0000
@@ -6,7 +6,8 @@
  *
  * This code is derived from software contributed to The NetBSD Foundation
  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
- * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
+ * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
+ * Daniel Sieger.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -105,23 +106,13 @@
 #include <machine/cpu.h>
 
 int	lbolt;			/* once a second sleep address */
-int	rrticks;		/* number of hardclock ticks per roundrobin() */
 
 /*
  * The global scheduler state.
  */
 kmutex_t	sched_mutex;		/* global sched state mutex */
-struct prochd	sched_qs[RUNQUE_NQS];	/* run queues */
-volatile uint32_t sched_whichqs;	/* bitmap of non-empty queues */
-
-void	schedcpu(void *);
-void	updatepri(struct lwp *);
 
 void	sched_unsleep(struct lwp *);
-void	sched_changepri(struct lwp *, int);
-
-struct callout schedcpu_ch = CALLOUT_INITIALIZER_SETFUNC(schedcpu, NULL);
-static unsigned int schedcpu_ticks;
 
 syncobj_t sleep_syncobj = {
 	SOBJ_SLEEPQ_SORTED,
@@ -136,306 +127,6 @@
 };
 
 /*
- * Force switch among equal priority processes every 100ms.
- * Called from hardclock every hz/10 == rrticks hardclock ticks.
- */
-/* ARGSUSED */
-void
-roundrobin(struct cpu_info *ci)
-{
-	struct schedstate_percpu *spc = &ci->ci_schedstate;
-
-	spc->spc_rrticks = rrticks;
-
-	if (!CURCPU_IDLE_P()) {
-		if (spc->spc_flags & SPCF_SEENRR) {
-			/*
-			 * The process has already been through a roundrobin
-			 * without switching and may be hogging the CPU.
-			 * Indicate that the process should yield.
-			 */
-			spc->spc_flags |= SPCF_SHOULDYIELD;
-		} else
-			spc->spc_flags |= SPCF_SEENRR;
-	}
-	cpu_need_resched(curcpu());
-}
-
-#define	PPQ	(128 / RUNQUE_NQS)	/* priorities per queue */
-#define	NICE_WEIGHT 2			/* priorities per nice level */
-
-#define	ESTCPU_SHIFT	11
-#define	ESTCPU_MAX	((NICE_WEIGHT * PRIO_MAX - PPQ) << ESTCPU_SHIFT)
-#define	ESTCPULIM(e)	min((e), ESTCPU_MAX)
-
-/*
- * Constants for digital decay and forget:
- *	90% of (p_estcpu) usage in 5 * loadav time
- *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
- *          Note that, as ps(1) mentions, this can let percentages
- *          total over 100% (I've seen 137.9% for 3 processes).
- *
- * Note that hardclock updates p_estcpu and p_cpticks independently.
- *
- * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
- * That is, the system wants to compute a value of decay such
- * that the following for loop:
- * 	for (i = 0; i < (5 * loadavg); i++)
- * 		p_estcpu *= decay;
- * will compute
- * 	p_estcpu *= 0.1;
- * for all values of loadavg:
- *
- * Mathematically this loop can be expressed by saying:
- * 	decay ** (5 * loadavg) ~= .1
- *
- * The system computes decay as:
- * 	decay = (2 * loadavg) / (2 * loadavg + 1)
- *
- * We wish to prove that the system's computation of decay
- * will always fulfill the equation:
- * 	decay ** (5 * loadavg) ~= .1
- *
- * If we compute b as:
- * 	b = 2 * loadavg
- * then
- * 	decay = b / (b + 1)
- *
- * We now need to prove two things:
- *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
- *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
- *
- * Facts:
- *         For x close to zero, exp(x) =~ 1 + x, since
- *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
- *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
- *         For x close to zero, ln(1+x) =~ x, since
- *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
- *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
- *         ln(.1) =~ -2.30
- *
- * Proof of (1):
- *    Solve (factor)**(power) =~ .1 given power (5*loadav):
- *	solving for factor,
- *      ln(factor) =~ (-2.30/5*loadav), or
- *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
- *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
- *
- * Proof of (2):
- *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
- *	solving for power,
- *      power*ln(b/(b+1)) =~ -2.30, or
- *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
- *
- * Actual power values for the implemented algorithm are as follows:
- *      loadav: 1       2       3       4
- *      power:  5.68    10.32   14.94   19.55
- */
-
-/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
-#define	loadfactor(loadav)	(2 * (loadav))
-
-static fixpt_t
-decay_cpu(fixpt_t loadfac, fixpt_t estcpu)
-{
-
-	if (estcpu == 0) {
-		return 0;
-	}
-
-#if !defined(_LP64)
-	/* avoid 64bit arithmetics. */
-#define	FIXPT_MAX ((fixpt_t)((UINTMAX_C(1) << sizeof(fixpt_t) * CHAR_BIT) - 1))
-	if (__predict_true(loadfac <= FIXPT_MAX / ESTCPU_MAX)) {
-		return estcpu * loadfac / (loadfac + FSCALE);
-	}
-#endif /* !defined(_LP64) */
-
-	return (uint64_t)estcpu * loadfac / (loadfac + FSCALE);
-}
-
-/*
- * For all load averages >= 1 and max p_estcpu of (255 << ESTCPU_SHIFT),
- * sleeping for at least seven times the loadfactor will decay p_estcpu to
- * less than (1 << ESTCPU_SHIFT).
- *
- * note that our ESTCPU_MAX is actually much smaller than (255 << ESTCPU_SHIFT).
- */
-static fixpt_t
-decay_cpu_batch(fixpt_t loadfac, fixpt_t estcpu, unsigned int n)
-{
-
-	if ((n << FSHIFT) >= 7 * loadfac) {
-		return 0;
-	}
-
-	while (estcpu != 0 && n > 1) {
-		estcpu = decay_cpu(loadfac, estcpu);
-		n--;
-	}
-
-	return estcpu;
-}
-
-/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
-fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */
-
-/*
- * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
- * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
- * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
- *
- * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
- *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
- *
- * If you dont want to bother with the faster/more-accurate formula, you
- * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
- * (more general) method of calculating the %age of CPU used by a process.
- */
-#define	CCPU_SHIFT	11
-
-/*
- * schedcpu:
- *
- *	Recompute process priorities, every hz ticks.
- *
- *	XXXSMP This needs to be reorganised in order to reduce the locking
- *	burden.
- */
-/* ARGSUSED */
-void
-schedcpu(void *arg)
-{
-	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
-	struct rlimit *rlim;
-	struct lwp *l;
-	struct proc *p;
-	int minslp, clkhz, sig;
-	long runtm;
-
-	schedcpu_ticks++;
-
-	mutex_enter(&proclist_mutex);
-	PROCLIST_FOREACH(p, &allproc) {
-		/*
-		 * Increment time in/out of memory and sleep time (if
-		 * sleeping).  We ignore overflow; with 16-bit int's
-		 * (remember them?) overflow takes 45 days.
-		 */
-		minslp = 2;
-		mutex_enter(&p->p_smutex);
-		runtm = p->p_rtime.tv_sec;
-		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
-			if ((l->l_flag & L_IDLE) != 0)
-				continue;
-			lwp_lock(l);
-			runtm += l->l_rtime.tv_sec;
-			l->l_swtime++;
-			if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
-			    l->l_stat == LSSUSPENDED) {
-				l->l_slptime++;
-				minslp = min(minslp, l->l_slptime);
-			} else
-				minslp = 0;
-			lwp_unlock(l);
-		}
-		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
-
-		/*
-		 * Check if the process exceeds its CPU resource allocation.
-		 * If over max, kill it.
-		 */
-		rlim = &p->p_rlimit[RLIMIT_CPU];
-		sig = 0;
-		if (runtm >= rlim->rlim_cur) {
-			if (runtm >= rlim->rlim_max)
-				sig = SIGKILL;
-			else {
-				sig = SIGXCPU;
-				if (rlim->rlim_cur < rlim->rlim_max)
-					rlim->rlim_cur += 5;
-			}
-		}
-
-		/* 
-		 * If the process has run for more than autonicetime, reduce
-		 * priority to give others a chance.
-		 */
-		if (autonicetime && runtm > autonicetime && p->p_nice == NZERO
-		    && kauth_cred_geteuid(p->p_cred)) {
-			mutex_spin_enter(&p->p_stmutex);
-			p->p_nice = autoniceval + NZERO;
-			resetprocpriority(p);
-			mutex_spin_exit(&p->p_stmutex);
-		}
-
-		/*
-		 * If the process has slept the entire second,
-		 * stop recalculating its priority until it wakes up.
-		 */
-		if (minslp <= 1) {
-			/*
-			 * p_pctcpu is only for ps.
-			 */
-			mutex_spin_enter(&p->p_stmutex);
-			clkhz = stathz != 0 ? stathz : hz;
-#if	(FSHIFT >= CCPU_SHIFT)
-			p->p_pctcpu += (clkhz == 100)?
-			    ((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
-			    100 * (((fixpt_t) p->p_cpticks)
-			    << (FSHIFT - CCPU_SHIFT)) / clkhz;
-#else
-			p->p_pctcpu += ((FSCALE - ccpu) *
-			    (p->p_cpticks * FSCALE / clkhz)) >> FSHIFT;
-#endif
-			p->p_cpticks = 0;
-			p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
-
-			LIST_FOREACH(l, &p->p_lwps, l_sibling) {
-				if ((l->l_flag & L_IDLE) != 0)
-					continue;
-				lwp_lock(l);
-				if (l->l_slptime <= 1 &&
-				    l->l_priority >= PUSER)
-					resetpriority(l);
-				lwp_unlock(l);
-			}
-			mutex_spin_exit(&p->p_stmutex);
-		}
-
-		mutex_exit(&p->p_smutex);
-		if (sig) {
-			psignal(p, sig);
-		}
-	}
-	mutex_exit(&proclist_mutex);
-	uvm_meter();
-	wakeup((caddr_t)&lbolt);
-	callout_schedule(&schedcpu_ch, hz);
-}
-
-/*
- * Recalculate the priority of a process after it has slept for a while.
- */
-void
-updatepri(struct lwp *l)
-{
-	struct proc *p = l->l_proc;
-	fixpt_t loadfac;
-
-	LOCK_ASSERT(lwp_locked(l, NULL));
-	KASSERT(l->l_slptime > 1);
-
-	loadfac = loadfactor(averunnable.ldavg[0]);
-
-	l->l_slptime--; /* the first time was done in schedcpu */
-	/* XXX NJWLWP */
-	/* XXXSMP occasionally unlocked, should be per-LWP */
-	p->p_estcpu = decay_cpu_batch(loadfac, p->p_estcpu, l->l_slptime);
-	resetpriority(l);
-}
-
-/*
  * During autoconfiguration or after a panic, a sleep will simply lower the
  * priority briefly to allow interrupts, then return.  The priority to be
  * used (safepri) is machine-dependent, thus this value is initialized and
@@ -703,7 +394,7 @@
 		KASSERT(lwp_locked(l, &sched_mutex));
 		l->l_stat = LSRUN;
 		if ((l->l_flag & L_IDLE) == 0) {
-			setrunqueue(l);
+			sched_enqueue(l);
 		}
 	}
 	uvmexp.swtch++;
@@ -732,11 +423,11 @@
 #endif
 
 	if (newl == NULL) {
-		newl = nextrunqueue();
+		newl = sched_nextlwp();
 	}
 	if (newl != NULL) {
 		KASSERT(lwp_locked(newl, &sched_mutex));
-		remrunqueue(newl);
+		sched_dequeue(newl);
 	} else {
 		newl = l->l_cpu->ci_data.cpu_idlelwp;
 		KASSERT(newl != NULL);
@@ -790,54 +481,6 @@
 }
 
 /*
- * Initialize the (doubly-linked) run queues
- * to be empty.
- */
-void
-rqinit()
-{
-	int i;
-
-	for (i = 0; i < RUNQUE_NQS; i++)
-		sched_qs[i].ph_link = sched_qs[i].ph_rlink =
-		    (struct lwp *)&sched_qs[i];
-
-	mutex_init(&sched_mutex, MUTEX_SPIN, IPL_SCHED);
-}
-
-static inline void
-resched_lwp(struct lwp *l, u_char pri)
-{
-	struct cpu_info *ci;
-
-	/*
-	 * XXXSMP
-	 * Since l->l_cpu persists across a context switch,
-	 * this gives us *very weak* processor affinity, in
-	 * that we notify the CPU on which the process last
-	 * ran that it should try to switch.
-	 *
-	 * This does not guarantee that the process will run on
-	 * that processor next, because another processor might
-	 * grab it the next time it performs a context switch.
-	 *
-	 * This also does not handle the case where its last
-	 * CPU is running a higher-priority process, but every
-	 * other CPU is running a lower-priority process.  There
-	 * are ways to handle this situation, but they're not
-	 * currently very pretty, and we also need to weigh the
-	 * cost of moving a process from one CPU to another.
-	 *
-	 * XXXSMP
-	 * There is also the issue of locking the other CPU's
-	 * sched state, which we currently do not do.
-	 */
-	ci = (l->l_cpu != NULL) ? l->l_cpu : curcpu();
-	if (pri < ci->ci_schedstate.spc_curpriority)
-		cpu_need_resched(ci);
-}
-
-/*
  * Change process state to be runnable, placing it on the run queue if it is
  * in memory, and awakening the swapper if it isn't in memory.
  *
@@ -920,14 +563,13 @@
 	 * Set the LWP runnable.  If it's swapped out, we need to wake the swapper
 	 * to bring it back in.  Otherwise, enter it into a run queue.
 	 */
-	if (l->l_slptime > 1)
-		updatepri(l);
+	sched_setrunnable(l);
 	l->l_stat = LSRUN;
 	l->l_slptime = 0;
 
 	if (l->l_flag & L_INMEM) {
-		setrunqueue(l);
-		resched_lwp(l, l->l_priority);
+		sched_enqueue(l);
+		resched_cpu(l, l->l_priority);
 		lwp_unlock(l);
 	} else {
 		lwp_unlock(l);
@@ -935,84 +577,6 @@
 	}
 }
 
-boolean_t
-sched_curcpu_runnable_p(void)
-{
-
-	return sched_whichqs != 0;
-}
-
-/*
- * Compute the priority of a process when running in user mode.
- * Arrange to reschedule if the resulting priority is better
- * than that of the current process.
- */
-void
-resetpriority(struct lwp *l)
-{
-	unsigned int newpriority;
-	struct proc *p = l->l_proc;
-
-	/* XXXSMP LOCK_ASSERT(mutex_owned(&p->p_stmutex)); */
-	LOCK_ASSERT(lwp_locked(l, NULL));
-
-	if ((l->l_flag & L_SYSTEM) != 0)
-		return;
-
-	newpriority = PUSER + (p->p_estcpu >> ESTCPU_SHIFT) +
-	    NICE_WEIGHT * (p->p_nice - NZERO);
-	newpriority = min(newpriority, MAXPRI);
-	lwp_changepri(l, newpriority);
-}
-
-/*
- * Recompute priority for all LWPs in a process.
- */
-void
-resetprocpriority(struct proc *p)
-{
-	struct lwp *l;
-
-	LOCK_ASSERT(mutex_owned(&p->p_stmutex));
-
-	LIST_FOREACH(l, &p->p_lwps, l_sibling) {
-		lwp_lock(l);
-		resetpriority(l);
-		lwp_unlock(l);
-	}
-}
-
-/*
- * We adjust the priority of the current process.  The priority of a process
- * gets worse as it accumulates CPU time.  The CPU usage estimator (p_estcpu)
- * is increased here.  The formula for computing priorities (in kern_synch.c)
- * will compute a different value each time p_estcpu increases. This can
- * cause a switch, but unless the priority crosses a PPQ boundary the actual
- * queue will not change.  The CPU usage estimator ramps up quite quickly
- * when the process is running (linearly), and decays away exponentially, at
- * a rate which is proportionally slower when the system is busy.  The basic
- * principle is that the system will 90% forget that the process used a lot
- * of CPU time in 5 * loadav seconds.  This causes the system to favor
- * processes which haven't run much recently, and to round-robin among other
- * processes.
- */
-
-void
-schedclock(struct lwp *l)
-{
-	struct proc *p = l->l_proc;
-
-	KASSERT(!CURCPU_IDLE_P());
-	mutex_spin_enter(&p->p_stmutex);
-	p->p_estcpu = ESTCPULIM(p->p_estcpu + (1 << ESTCPU_SHIFT));
-	lwp_lock(l);
-	resetpriority(l);
-	mutex_spin_exit(&p->p_stmutex);
-	if ((l->l_flag & L_SYSTEM) == 0 && l->l_priority >= PUSER)
-		l->l_priority = l->l_usrpri;
-	lwp_unlock(l);
-}
-
 /*
  * suspendsched:
  *
@@ -1086,43 +650,6 @@
 }
 
 /*
- * scheduler_fork_hook:
- *
- *	Inherit the parent's scheduler history.
- */
-void
-scheduler_fork_hook(struct proc *parent, struct proc *child)
-{
-
-	LOCK_ASSERT(mutex_owned(&parent->p_smutex));
-
-	child->p_estcpu = child->p_estcpu_inherited = parent->p_estcpu;
-	child->p_forktime = schedcpu_ticks;
-}
-
-/*
- * scheduler_wait_hook:
- *
- *	Chargeback parents for the sins of their children.
- */
-void
-scheduler_wait_hook(struct proc *parent, struct proc *child)
-{
-	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
-	fixpt_t estcpu;
-
-	/* XXX Only if parent != init?? */
-
-	mutex_spin_enter(&parent->p_stmutex);
-	estcpu = decay_cpu_batch(loadfac, child->p_estcpu_inherited,
-	    schedcpu_ticks - child->p_forktime);
-	if (child->p_estcpu > estcpu)
-		parent->p_estcpu =
-		    ESTCPULIM(parent->p_estcpu + child->p_estcpu - estcpu);
-	mutex_spin_exit(&parent->p_stmutex);
-}
-
-/*
  * sched_kpri:
  *
  *	Scale a priority level to a kernel priority level, usually
@@ -1174,232 +701,34 @@
 	panic("sched_unsleep");
 }
 
-/*
- * sched_changepri:
- *
- *	Adjust the priority of an LWP.
- */
-void
-sched_changepri(struct lwp *l, int pri)
+inline void
+resched_cpu(struct lwp *l, u_char pri)
 {
+	struct cpu_info *ci;
 
-	LOCK_ASSERT(lwp_locked(l, &sched_mutex));
-
-	l->l_usrpri = pri;
-
-	if (l->l_priority < PUSER)
-		return;
-	if (l->l_stat != LSRUN || (l->l_flag & L_INMEM) == 0 ||
-	    (l->l_priority / PPQ) == (pri / PPQ)) {
-		l->l_priority = pri;
-		return;
-	}
-
-	remrunqueue(l);
-	l->l_priority = pri;
-	setrunqueue(l);
-	resched_lwp(l, pri);
-}
-
-/*
- * On some architectures, it's faster to use a MSB ordering for the priorites
- * than the traditional LSB ordering.
- */
-#ifdef __HAVE_BIGENDIAN_BITOPS
-#define	RQMASK(n) (0x80000000 >> (n))
-#else
-#define	RQMASK(n) (0x00000001 << (n))
-#endif
-
-/*
- * Low-level routines to access the run queue.  Optimised assembler
- * routines can override these.
- */
-
-#ifndef __HAVE_MD_RUNQUEUE
-
-/*
- * The primitives that manipulate the run queues.  whichqs tells which
- * of the 32 queues qs have processes in them.  Setrunqueue puts processes
- * into queues, remrunqueue removes them from queues.  The running process is
- * on no queue, other processes are on a queue related to p->p_priority,
- * divided by 4 actually to shrink the 0-127 range of priorities into the 32
- * available queues.
- */
-#ifdef RQDEBUG
-static void
-checkrunqueue(int whichq, struct lwp *l)
-{
-	const struct prochd * const rq = &sched_qs[whichq];
-	struct lwp *l2;
-	int found = 0;
-	int die = 0;
-	int empty = 1;
-	for (l2 = rq->ph_link; l2 != (const void*) rq; l2 = l2->l_forw) {
-		if (l2->l_stat != LSRUN) {
-			printf("checkrunqueue[%d]: lwp %p state (%d) "
-			    " != LSRUN\n", whichq, l2, l2->l_stat);
-		}
-		if (l2->l_back->l_forw != l2) {
-			printf("checkrunqueue[%d]: lwp %p back-qptr (%p) "
-			    "corrupt %p\n", whichq, l2, l2->l_back,
-			    l2->l_back->l_forw);
-			die = 1;
-		}
-		if (l2->l_forw->l_back != l2) {
-			printf("checkrunqueue[%d]: lwp %p forw-qptr (%p) "
-			    "corrupt %p\n", whichq, l2, l2->l_forw,
-			    l2->l_forw->l_back);
-			die = 1;
-		}
-		if (l2 == l)
-			found = 1;
-		empty = 0;
-	}
-	if (empty && (sched_whichqs & RQMASK(whichq)) != 0) {
-		printf("checkrunqueue[%d]: bit set for empty run-queue %p\n",
-		    whichq, rq);
-		die = 1;
-	} else if (!empty && (sched_whichqs & RQMASK(whichq)) == 0) {
-		printf("checkrunqueue[%d]: bit clear for non-empty "
-		    "run-queue %p\n", whichq, rq);
-		die = 1;
-	}
-	if (l != NULL && (sched_whichqs & RQMASK(whichq)) == 0) {
-		printf("checkrunqueue[%d]: bit clear for active lwp %p\n",
-		    whichq, l);
-		die = 1;
-	}
-	if (l != NULL && empty) {
-		printf("checkrunqueue[%d]: empty run-queue %p with "
-		    "active lwp %p\n", whichq, rq, l);
-		die = 1;
-	}
-	if (l != NULL && !found) {
-		printf("checkrunqueue[%d]: lwp %p not in runqueue %p!",
-		    whichq, l, rq);
-		die = 1;
-	}
-	if (die)
-		panic("checkrunqueue: inconsistency found");
-}
-#endif /* RQDEBUG */
-
-void
-setrunqueue(struct lwp *l)
-{
-	struct prochd *rq;
-	struct lwp *prev;
-	const int whichq = l->l_priority / PPQ;
-
-	LOCK_ASSERT(lwp_locked(l, &sched_mutex));
-
-#ifdef RQDEBUG
-	checkrunqueue(whichq, NULL);
-#endif
-#ifdef DIAGNOSTIC
-	if (l->l_back != NULL || l->l_stat != LSRUN)
-		panic("setrunqueue");
-#endif
-	sched_whichqs |= RQMASK(whichq);
-	rq = &sched_qs[whichq];
-	prev = rq->ph_rlink;
-	l->l_forw = (struct lwp *)rq;
-	rq->ph_rlink = l;
-	prev->l_forw = l;
-	l->l_back = prev;
-#ifdef RQDEBUG
-	checkrunqueue(whichq, l);
-#endif
-}
-
-/*
- * XXXSMP When LWP dispatch (cpu_switch()) is changed to use remrunqueue(),
- * drop of the effective priority level from kernel to user needs to be
- * moved here from userret().  The assignment in userret() is currently
- * done unlocked.
- */
-void
-remrunqueue(struct lwp *l)
-{
-	struct lwp *prev, *next;
-	const int whichq = l->l_priority / PPQ;
-
-	LOCK_ASSERT(lwp_locked(l, &sched_mutex));
-
-#ifdef RQDEBUG
-	checkrunqueue(whichq, l);
-#endif
-
-#if defined(DIAGNOSTIC)
-	if (((sched_whichqs & RQMASK(whichq)) == 0) || l->l_back == NULL) {
-		/* Shouldn't happen - interrupts disabled. */
-		panic("remrunqueue: bit %d not set", whichq);
-	}
-#endif
-	prev = l->l_back;
-	l->l_back = NULL;
-	next = l->l_forw;
-	prev->l_forw = next;
-	next->l_back = prev;
-	if (prev == next)
-		sched_whichqs &= ~RQMASK(whichq);
-#ifdef RQDEBUG
-	checkrunqueue(whichq, NULL);
-#endif
-}
-
-struct lwp *
-nextrunqueue(void)
-{
-	const struct prochd *rq;
-	struct lwp *l;
-	int whichq;
-
-	if (sched_whichqs == 0) {
-		return NULL;
-	}
-#ifdef __HAVE_BIGENDIAN_BITOPS
-	for (whichq = 0; ; whichq++) {
-		if ((sched_whichqs & RQMASK(whichq)) != 0) {
-			break;
-		}
-	}
-#else
-	whichq = ffs(sched_whichqs) - 1;
-#endif
-	rq = &sched_qs[whichq];
-	l = rq->ph_link;
-	return l;
-}
-
-#endif /* !defined(__HAVE_MD_RUNQUEUE) */
-
-#if defined(DDB)
-void
-sched_print_runqueue(void (*pr)(const char *, ...))
-{
-	struct prochd *ph;
-	struct lwp *l;
-	int i, first;
-
-	for (i = 0; i < RUNQUE_NQS; i++)
-	{
-		first = 1;
-		ph = &sched_qs[i];
-		for (l = ph->ph_link; l != (void *)ph; l = l->l_forw) {
-			if (first) {
-				(*pr)("%c%d",
-				    (sched_whichqs & RQMASK(i))
-				    ? ' ' : '!', i);
-				first = 0;
-			}
-			(*pr)("\t%d.%d (%s) pri=%d usrpri=%d\n",
-			    l->l_proc->p_pid,
-			    l->l_lid, l->l_proc->p_comm,
-			    (int)l->l_priority, (int)l->l_usrpri);
-		}
-	}
+	/*
+	 * XXXSMP
+	 * Since l->l_cpu persists across a context switch,
+	 * this gives us *very weak* processor affinity, in
+	 * that we notify the CPU on which the process last
+	 * ran that it should try to switch.
+	 *
+	 * This does not guarantee that the process will run on
+	 * that processor next, because another processor might
+	 * grab it the next time it performs a context switch.
+	 *
+	 * This also does not handle the case where its last
+	 * CPU is running a higher-priority process, but every
+	 * other CPU is running a lower-priority process.  There
+	 * are ways to handle this situation, but they're not
+	 * currently very pretty, and we also need to weigh the
+	 * cost of moving a process from one CPU to another.
+	 *
+	 * XXXSMP
+	 * There is also the issue of locking the other CPU's
+	 * sched state, which we currently do not do.
+	 */
+	ci = (l->l_cpu != NULL) ? l->l_cpu : curcpu();
+	if (pri < ci->ci_schedstate.spc_curpriority)
+		cpu_need_resched(ci);
 }
-#endif /* defined(DDB) */
-#undef RQMASK
Index: kern/sched_4bsd.c
===================================================================
RCS file: kern/sched_4bsd.c
diff -N kern/sched_4bsd.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ kern/sched_4bsd.c	17 Feb 2007 20:38:25 -0000
@@ -0,0 +1,820 @@
+/*	$NetBSD: kern_synch.c,v 1.176 2007/02/10 14:02:01 yamt Exp $	*/
+
+/*-
+ * Copyright (c) 1999, 2000, 2004, 2006, 2007 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
+ * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran, and
+ * Daniel Sieger.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the NetBSD
+ *	Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*-
+ * Copyright (c) 1982, 1986, 1990, 1991, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.176 2007/02/10 14:02:01 yamt Exp $");
+
+#include "opt_ddb.h"
+#include "opt_kstack.h"
+#include "opt_lockdebug.h"
+#include "opt_multiprocessor.h"
+#include "opt_perfctrs.h"
+
+#define	__MUTEX_PRIVATE
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/callout.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/buf.h>
+#if defined(PERFCTRS)
+#include <sys/pmc.h>
+#endif
+#include <sys/signalvar.h>
+#include <sys/resourcevar.h>
+#include <sys/sched.h>
+#include <sys/kauth.h>
+#include <sys/sleepq.h>
+#include <sys/lockdebug.h>
+
+#include <uvm/uvm_extern.h>
+
+#include <machine/cpu.h>
+
+/*
+ * Run queues.
+ *
+ * We have 32 run queues in descending priority of 0..31.  We maintain
+ * a bitmask of non-empty queues in order to speed up finding the first
+ * runnable process.  The bitmask is maintained only by machine-dependent
+ * code, allowing the most efficient instructions to be used to find the
+ * first non-empty queue.
+ */
+
+
+#define	RUNQUE_NQS		32      /* number of runqueues */
+#define	PPQ	(128 / RUNQUE_NQS)	/* priorities per queue */
+
+struct prochd {
+	struct lwp *ph_link;
+	struct lwp *ph_rlink;
+};
+
+struct prochd sched_qs[RUNQUE_NQS];	/* run queues */
+volatile uint32_t sched_whichqs;	/* bitmap of non-empty queues */
+
+void schedcpu(void *);
+void updatepri(struct lwp *);
+void resetpriority (struct lwp *);
+void resetprocpriority(struct proc *);
+
+struct callout schedcpu_ch = CALLOUT_INITIALIZER_SETFUNC(schedcpu, NULL);
+static unsigned int schedcpu_ticks;
+
+int rrticks; /* number of hardclock ticks per sched_tick() */
+
+/*
+ * Force switch among equal priority processes every 100ms.
+ * Called from hardclock every hz/10 == rrticks hardclock ticks.
+ */
+/* ARGSUSED */
+void
+sched_tick(struct cpu_info *ci)
+{
+	struct schedstate_percpu *spc = &ci->ci_schedstate;
+
+	spc->spc_ticks = rrticks;
+
+	if (!CURCPU_IDLE_P()) {
+		if (spc->spc_flags & SPCF_SEENRR) {
+			/*
+			 * The process has already been through a roundrobin
+			 * without switching and may be hogging the CPU.
+			 * Indicate that the process should yield.
+			 */
+			spc->spc_flags |= SPCF_SHOULDYIELD;
+		} else
+			spc->spc_flags |= SPCF_SEENRR;
+	}
+	cpu_need_resched(curcpu());
+}
+
+#define	NICE_WEIGHT 2			/* priorities per nice level */
+
+#define	ESTCPU_SHIFT	11
+#define	ESTCPU_MAX	((NICE_WEIGHT * PRIO_MAX - PPQ) << ESTCPU_SHIFT)
+#define	ESTCPULIM(e)	min((e), ESTCPU_MAX)
+
+/*
+ * Constants for digital decay and forget:
+ *	90% of (p_estcpu) usage in 5 * loadav time
+ *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
+ *          Note that, as ps(1) mentions, this can let percentages
+ *          total over 100% (I've seen 137.9% for 3 processes).
+ *
+ * Note that hardclock updates p_estcpu and p_cpticks independently.
+ *
+ * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
+ * That is, the system wants to compute a value of decay such
+ * that the following for loop:
+ * 	for (i = 0; i < (5 * loadavg); i++)
+ * 		p_estcpu *= decay;
+ * will compute
+ * 	p_estcpu *= 0.1;
+ * for all values of loadavg:
+ *
+ * Mathematically this loop can be expressed by saying:
+ * 	decay ** (5 * loadavg) ~= .1
+ *
+ * The system computes decay as:
+ * 	decay = (2 * loadavg) / (2 * loadavg + 1)
+ *
+ * We wish to prove that the system's computation of decay
+ * will always fulfill the equation:
+ * 	decay ** (5 * loadavg) ~= .1
+ *
+ * If we compute b as:
+ * 	b = 2 * loadavg
+ * then
+ * 	decay = b / (b + 1)
+ *
+ * We now need to prove two things:
+ *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
+ *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
+ *
+ * Facts:
+ *         For x close to zero, exp(x) =~ 1 + x, since
+ *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
+ *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
+ *         For x close to zero, ln(1+x) =~ x, since
+ *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
+ *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
+ *         ln(.1) =~ -2.30
+ *
+ * Proof of (1):
+ *    Solve (factor)**(power) =~ .1 given power (5*loadav):
+ *	solving for factor,
+ *      ln(factor) =~ (-2.30/5*loadav), or
+ *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
+ *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
+ *
+ * Proof of (2):
+ *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
+ *	solving for power,
+ *      power*ln(b/(b+1)) =~ -2.30, or
+ *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
+ *
+ * Actual power values for the implemented algorithm are as follows:
+ *      loadav: 1       2       3       4
+ *      power:  5.68    10.32   14.94   19.55
+ */
+
+/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
+#define	loadfactor(loadav)	(2 * (loadav))
+
+static fixpt_t
+decay_cpu(fixpt_t loadfac, fixpt_t estcpu)
+{
+
+	if (estcpu == 0) {
+		return 0;
+	}
+
+#if !defined(_LP64)
+	/* avoid 64bit arithmetics. */
+#define	FIXPT_MAX ((fixpt_t)((UINTMAX_C(1) << sizeof(fixpt_t) * CHAR_BIT) - 1))
+	if (__predict_true(loadfac <= FIXPT_MAX / ESTCPU_MAX)) {
+		return estcpu * loadfac / (loadfac + FSCALE);
+	}
+#endif /* !defined(_LP64) */
+
+	return (uint64_t)estcpu * loadfac / (loadfac + FSCALE);
+}
+
+/*
+ * For all load averages >= 1 and max p_estcpu of (255 << ESTCPU_SHIFT),
+ * sleeping for at least seven times the loadfactor will decay p_estcpu to
+ * less than (1 << ESTCPU_SHIFT).
+ *
+ * note that our ESTCPU_MAX is actually much smaller than (255 << ESTCPU_SHIFT).
+ */
+static fixpt_t
+decay_cpu_batch(fixpt_t loadfac, fixpt_t estcpu, unsigned int n)
+{
+
+	if ((n << FSHIFT) >= 7 * loadfac) {
+		return 0;
+	}
+
+	while (estcpu != 0 && n > 1) {
+		estcpu = decay_cpu(loadfac, estcpu);
+		n--;
+	}
+
+	return estcpu;
+}
+
+/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
+fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */
+
+/*
+ * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
+ * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
+ * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
+ *
+ * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
+ *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
+ *
+ * If you dont want to bother with the faster/more-accurate formula, you
+ * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
+ * (more general) method of calculating the %age of CPU used by a process.
+ */
+#define	CCPU_SHIFT	11
+
+/*
+ * schedcpu:
+ *
+ *	Recompute process priorities, every hz ticks.
+ *
+ *	XXXSMP This needs to be reorganised in order to reduce the locking
+ *	burden.
+ */
+/* ARGSUSED */
+void
+schedcpu(void *arg)
+{
+	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
+	struct rlimit *rlim;
+	struct lwp *l;
+	struct proc *p;
+	int minslp, clkhz, sig;
+	long runtm;
+
+	schedcpu_ticks++;
+
+	mutex_enter(&proclist_mutex);
+	PROCLIST_FOREACH(p, &allproc) {
+		/*
+		 * Increment time in/out of memory and sleep time (if
+		 * sleeping).  We ignore overflow; with 16-bit int's
+		 * (remember them?) overflow takes 45 days.
+		 */
+		minslp = 2;
+		mutex_enter(&p->p_smutex);
+		runtm = p->p_rtime.tv_sec;
+		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
+			if ((l->l_flag & L_IDLE) != 0)
+				continue;
+			lwp_lock(l);
+			runtm += l->l_rtime.tv_sec;
+			l->l_swtime++;
+			if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
+			    l->l_stat == LSSUSPENDED) {
+				l->l_slptime++;
+				minslp = min(minslp, l->l_slptime);
+			} else
+				minslp = 0;
+			lwp_unlock(l);
+		}
+		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
+
+		/*
+		 * Check if the process exceeds its CPU resource allocation.
+		 * If over max, kill it.
+		 */
+		rlim = &p->p_rlimit[RLIMIT_CPU];
+		sig = 0;
+		if (runtm >= rlim->rlim_cur) {
+			if (runtm >= rlim->rlim_max)
+				sig = SIGKILL;
+			else {
+				sig = SIGXCPU;
+				if (rlim->rlim_cur < rlim->rlim_max)
+					rlim->rlim_cur += 5;
+			}
+		}
+
+		/* 
+		 * If the process has run for more than autonicetime, reduce
+		 * priority to give others a chance.
+		 */
+		if (autonicetime && runtm > autonicetime && p->p_nice == NZERO
+		    && kauth_cred_geteuid(p->p_cred)) {
+			mutex_spin_enter(&p->p_stmutex);
+			p->p_nice = autoniceval + NZERO;
+			resetprocpriority(p);
+			mutex_spin_exit(&p->p_stmutex);
+		}
+
+		/*
+		 * If the process has slept the entire second,
+		 * stop recalculating its priority until it wakes up.
+		 */
+		if (minslp <= 1) {
+			/*
+			 * p_pctcpu is only for ps.
+			 */
+			mutex_spin_enter(&p->p_stmutex);
+			clkhz = stathz != 0 ? stathz : hz;
+#if	(FSHIFT >= CCPU_SHIFT)
+			p->p_pctcpu += (clkhz == 100)?
+			    ((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
+			    100 * (((fixpt_t) p->p_cpticks)
+			    << (FSHIFT - CCPU_SHIFT)) / clkhz;
+#else
+			p->p_pctcpu += ((FSCALE - ccpu) *
+			    (p->p_cpticks * FSCALE / clkhz)) >> FSHIFT;
+#endif
+			p->p_cpticks = 0;
+			p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
+
+			LIST_FOREACH(l, &p->p_lwps, l_sibling) {
+				if ((l->l_flag & L_IDLE) != 0)
+					continue;
+				lwp_lock(l);
+				if (l->l_slptime <= 1 &&
+				    l->l_priority >= PUSER)
+					resetpriority(l);
+				lwp_unlock(l);
+			}
+			mutex_spin_exit(&p->p_stmutex);
+		}
+
+		mutex_exit(&p->p_smutex);
+		if (sig) {
+			psignal(p, sig);
+		}
+	}
+	mutex_exit(&proclist_mutex);
+	uvm_meter();
+	wakeup((caddr_t)&lbolt);
+	callout_schedule(&schedcpu_ch, hz);
+}
+
+/*
+ * Recalculate the priority of a process after it has slept for a while.
+ */
+void
+updatepri(struct lwp *l)
+{
+	struct proc *p = l->l_proc;
+	fixpt_t loadfac;
+
+	LOCK_ASSERT(lwp_locked(l, NULL));
+	KASSERT(l->l_slptime > 1);
+
+	loadfac = loadfactor(averunnable.ldavg[0]);
+
+	l->l_slptime--; /* the first time was done in schedcpu */
+	/* XXX NJWLWP */
+	/* XXXSMP occasionally unlocked, should be per-LWP */
+	p->p_estcpu = decay_cpu_batch(loadfac, p->p_estcpu, l->l_slptime);
+	resetpriority(l);
+}
+
+/*
+ * Initialize the (doubly-linked) run queues
+ * to be empty.
+ */
+void
+sched_rqinit()
+{
+	int i;
+
+	for (i = 0; i < RUNQUE_NQS; i++)
+		sched_qs[i].ph_link = sched_qs[i].ph_rlink =
+		    (struct lwp *)&sched_qs[i];
+
+	mutex_init(&sched_mutex, MUTEX_SPIN, IPL_SCHED);
+}
+
+void
+sched_setup()
+{
+	rrticks = hz / 10;
+
+	schedcpu(NULL);
+}
+
+void
+sched_setrunnable(struct lwp *l)
+{
+ 	if (l->l_slptime > 1)
+ 		updatepri(l);
+}
+
+boolean_t
+sched_curcpu_runnable_p(void)
+{
+
+	return sched_whichqs != 0;
+}
+
+void
+sched_nice(struct proc *chgp, int n)
+{
+	chgp->p_nice = n;
+	(void)resetprocpriority(chgp);
+}
+
+/*
+ * Compute the priority of a process when running in user mode.
+ * Arrange to reschedule if the resulting priority is better
+ * than that of the current process.
+ */
+void
+resetpriority(struct lwp *l)
+{
+	unsigned int newpriority;
+	struct proc *p = l->l_proc;
+
+	/* XXXSMP LOCK_ASSERT(mutex_owned(&p->p_stmutex)); */
+	LOCK_ASSERT(lwp_locked(l, NULL));
+
+	if ((l->l_flag & L_SYSTEM) != 0)
+		return;
+
+	newpriority = PUSER + (p->p_estcpu >> ESTCPU_SHIFT) +
+	    NICE_WEIGHT * (p->p_nice - NZERO);
+	newpriority = min(newpriority, MAXPRI);
+	lwp_changepri(l, newpriority);
+}
+
+/*
+ * Recompute priority for all LWPs in a process.
+ */
+void
+resetprocpriority(struct proc *p)
+{
+	struct lwp *l;
+
+	LOCK_ASSERT(mutex_owned(&p->p_stmutex));
+
+	LIST_FOREACH(l, &p->p_lwps, l_sibling) {
+		lwp_lock(l);
+		resetpriority(l);
+		lwp_unlock(l);
+	}
+}
+
+/*
+ * We adjust the priority of the current process.  The priority of a process
+ * gets worse as it accumulates CPU time.  The CPU usage estimator (p_estcpu)
+ * is increased here.  The formula for computing priorities (in kern_synch.c)
+ * will compute a different value each time p_estcpu increases. This can
+ * cause a switch, but unless the priority crosses a PPQ boundary the actual
+ * queue will not change.  The CPU usage estimator ramps up quite quickly
+ * when the process is running (linearly), and decays away exponentially, at
+ * a rate which is proportionally slower when the system is busy.  The basic
+ * principle is that the system will 90% forget that the process used a lot
+ * of CPU time in 5 * loadav seconds.  This causes the system to favor
+ * processes which haven't run much recently, and to round-robin among other
+ * processes.
+ */
+
+void
+schedclock(struct lwp *l)
+{
+	struct proc *p = l->l_proc;
+
+	KASSERT(!CURCPU_IDLE_P());
+	mutex_spin_enter(&p->p_stmutex);
+	p->p_estcpu = ESTCPULIM(p->p_estcpu + (1 << ESTCPU_SHIFT));
+	lwp_lock(l);
+	resetpriority(l);
+	mutex_spin_exit(&p->p_stmutex);
+	if ((l->l_flag & L_SYSTEM) == 0 && l->l_priority >= PUSER)
+		l->l_priority = l->l_usrpri;
+	lwp_unlock(l);
+}
+
+/*
+ * sched_proc_fork:
+ *
+ *	Inherit the parent's scheduler history.
+ */
+void
+sched_proc_fork(struct proc *parent, struct proc *child)
+{
+
+	LOCK_ASSERT(mutex_owned(&parent->p_smutex));
+
+	child->p_estcpu = child->p_estcpu_inherited = parent->p_estcpu;
+	child->p_forktime = schedcpu_ticks;
+}
+
+/*
+ * sched_proc_exit:
+ *
+ *	Chargeback parents for the sins of their children.
+ */
+void
+sched_proc_exit(struct proc *parent, struct proc *child)
+{
+	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
+	fixpt_t estcpu;
+
+	/* XXX Only if parent != init?? */
+
+	mutex_spin_enter(&parent->p_stmutex);
+	estcpu = decay_cpu_batch(loadfac, child->p_estcpu_inherited,
+	    schedcpu_ticks - child->p_forktime);
+	if (child->p_estcpu > estcpu)
+		parent->p_estcpu =
+		    ESTCPULIM(parent->p_estcpu + child->p_estcpu - estcpu);
+	mutex_spin_exit(&parent->p_stmutex);
+}
+
+/*
+ * sched_changepri:
+ *
+ *	Adjust the priority of an LWP.
+ */
+void
+sched_changepri(struct lwp *l, int pri)
+{
+
+	LOCK_ASSERT(lwp_locked(l, &sched_mutex));
+
+	l->l_usrpri = pri;
+
+	if (l->l_priority < PUSER)
+		return;
+	if (l->l_stat != LSRUN || (l->l_flag & L_INMEM) == 0 ||
+	    (l->l_priority / PPQ) == (pri / PPQ)) {
+		l->l_priority = pri;
+		return;
+	}
+
+	sched_dequeue(l);
+	l->l_priority = pri;
+	sched_enqueue(l);
+	resched_cpu(l, pri);
+}
+
+/*
+ * On some architectures, it's faster to use an MSB ordering for the priorities
+ * than the traditional LSB ordering.
+ */
+#ifdef __HAVE_BIGENDIAN_BITOPS
+#define	RQMASK(n) (0x80000000 >> (n))
+#else
+#define	RQMASK(n) (0x00000001 << (n))
+#endif
+
+/*
+ * Low-level routines to access the run queue.  Optimised assembler
+ * routines can override these.
+ */
+
+#ifndef __HAVE_MD_RUNQUEUE
+
+/*
+ * The primitives that manipulate the run queues.  whichqs tells which
+ * of the 32 queues qs have processes in them.  sched_enqueue() puts processes
+ * into queues, sched_dequeue() removes them from queues.  The running process is
+ * on no queue, other processes are on a queue related to p->p_priority,
+ * divided by 4 actually to shrink the 0-127 range of priorities into the 32
+ * available queues.
+ */
+#ifdef RQDEBUG
+static void
+checkrunqueue(int whichq, struct lwp *l)
+{
+	const struct prochd * const rq = &sched_qs[whichq];
+	struct lwp *l2;
+	int found = 0;
+	int die = 0;
+	int empty = 1;
+	for (l2 = rq->ph_link; l2 != (const void*) rq; l2 = l2->l_forw) {
+		if (l2->l_stat != LSRUN) {
+			printf("checkrunqueue[%d]: lwp %p state (%d) "
+			    " != LSRUN\n", whichq, l2, l2->l_stat);
+		}
+		if (l2->l_back->l_forw != l2) {
+			printf("checkrunqueue[%d]: lwp %p back-qptr (%p) "
+			    "corrupt %p\n", whichq, l2, l2->l_back,
+			    l2->l_back->l_forw);
+			die = 1;
+		}
+		if (l2->l_forw->l_back != l2) {
+			printf("checkrunqueue[%d]: lwp %p forw-qptr (%p) "
+			    "corrupt %p\n", whichq, l2, l2->l_forw,
+			    l2->l_forw->l_back);
+			die = 1;
+		}
+		if (l2 == l)
+			found = 1;
+		empty = 0;
+	}
+	if (empty && (sched_whichqs & RQMASK(whichq)) != 0) {
+		printf("checkrunqueue[%d]: bit set for empty run-queue %p\n",
+		    whichq, rq);
+		die = 1;
+	} else if (!empty && (sched_whichqs & RQMASK(whichq)) == 0) {
+		printf("checkrunqueue[%d]: bit clear for non-empty "
+		    "run-queue %p\n", whichq, rq);
+		die = 1;
+	}
+	if (l != NULL && (sched_whichqs & RQMASK(whichq)) == 0) {
+		printf("checkrunqueue[%d]: bit clear for active lwp %p\n",
+		    whichq, l);
+		die = 1;
+	}
+	if (l != NULL && empty) {
+		printf("checkrunqueue[%d]: empty run-queue %p with "
+		    "active lwp %p\n", whichq, rq, l);
+		die = 1;
+	}
+	if (l != NULL && !found) {
+		printf("checkrunqueue[%d]: lwp %p not in runqueue %p!",
+		    whichq, l, rq);
+		die = 1;
+	}
+	if (die)
+		panic("checkrunqueue: inconsistency found");
+}
+#endif /* RQDEBUG */
+
+void
+sched_enqueue(struct lwp *l)
+{
+	struct prochd *rq;
+	struct lwp *prev;
+	const int whichq = l->l_priority / PPQ;
+
+	LOCK_ASSERT(lwp_locked(l, &sched_mutex));
+
+#ifdef RQDEBUG
+	checkrunqueue(whichq, NULL);
+#endif
+#ifdef DIAGNOSTIC
+	if (l->l_back != NULL || l->l_stat != LSRUN)
+		panic("setrunqueue");
+#endif
+	sched_whichqs |= RQMASK(whichq);
+	rq = &sched_qs[whichq];
+	prev = rq->ph_rlink;
+	l->l_forw = (struct lwp *)rq;
+	rq->ph_rlink = l;
+	prev->l_forw = l;
+	l->l_back = prev;
+#ifdef RQDEBUG
+	checkrunqueue(whichq, l);
+#endif
+}
+
+/*
+ * XXXSMP When LWP dispatch (cpu_switch()) is changed to use sched_dequeue(),
+ * drop of the effective priority level from kernel to user needs to be
+ * moved here from userret().  The assignment in userret() is currently
+ * done unlocked.
+ */
+void
+sched_dequeue(struct lwp *l)
+{
+	struct lwp *prev, *next;
+	const int whichq = l->l_priority / PPQ;
+
+	LOCK_ASSERT(lwp_locked(l, &sched_mutex));
+
+#ifdef RQDEBUG
+	checkrunqueue(whichq, l);
+#endif
+
+#if defined(DIAGNOSTIC)
+	if (((sched_whichqs & RQMASK(whichq)) == 0) || l->l_back == NULL) {
+		/* Shouldn't happen - interrupts disabled. */
+		panic("remrunqueue: bit %d not set", whichq);
+	}
+#endif
+	prev = l->l_back;
+	l->l_back = NULL;
+	next = l->l_forw;
+	prev->l_forw = next;
+	next->l_back = prev;
+	if (prev == next)
+		sched_whichqs &= ~RQMASK(whichq);
+#ifdef RQDEBUG
+	checkrunqueue(whichq, NULL);
+#endif
+}
+
+struct lwp *
+sched_nextlwp(void)
+{
+	const struct prochd *rq;
+	struct lwp *l;
+	int whichq;
+
+	if (sched_whichqs == 0) {
+		return NULL;
+	}
+#ifdef __HAVE_BIGENDIAN_BITOPS
+	for (whichq = 0; ; whichq++) {
+		if ((sched_whichqs & RQMASK(whichq)) != 0) {
+			break;
+		}
+	}
+#else
+	whichq = ffs(sched_whichqs) - 1;
+#endif
+	rq = &sched_qs[whichq];
+	l = rq->ph_link;
+	return l;
+}
+
+#endif /* !defined(__HAVE_MD_RUNQUEUE) */
+
+#if defined(DDB)
+void
+sched_print_runqueue(void (*pr)(const char *, ...))
+{
+	struct prochd *ph;
+	struct lwp *l;
+	int i, first;
+
+	for (i = 0; i < RUNQUE_NQS; i++)
+	{
+		first = 1;
+		ph = &sched_qs[i];
+		for (l = ph->ph_link; l != (void *)ph; l = l->l_forw) {
+			if (first) {
+				(*pr)("%c%d",
+				    (sched_whichqs & RQMASK(i))
+				    ? ' ' : '!', i);
+				first = 0;
+			}
+			(*pr)("\t%d.%d (%s) pri=%d usrpri=%d\n",
+			    l->l_proc->p_pid,
+			    l->l_lid, l->l_proc->p_comm,
+			    (int)l->l_priority, (int)l->l_usrpri);
+		}
+	}
+}
+#endif /* defined(DDB) */
+#undef RQMASK
Index: kern/sys_lwp.c
===================================================================
RCS file: /cvsroot/src/sys/kern/sys_lwp.c,v
retrieving revision 1.3
diff -u -r1.3 sys_lwp.c
--- kern/sys_lwp.c	15 Feb 2007 20:21:13 -0000	1.3
+++ kern/sys_lwp.c	17 Feb 2007 20:38:25 -0000
@@ -151,7 +151,7 @@
 			LOCK_ASSERT(lwp_locked(l2, &sched_mutex));
 			p->p_nrlwps++;
 			l2->l_stat = LSRUN;
-			setrunqueue(l2);
+			sched_enqueue(l2);
 		}
 	} else
 		l2->l_stat = LSSUSPENDED;
Index: sys/lwp.h
===================================================================
RCS file: /cvsroot/src/sys/sys/lwp.h,v
retrieving revision 1.48.2.1
diff -u -r1.48.2.1 lwp.h
--- sys/lwp.h	17 Feb 2007 10:31:03 -0000	1.48.2.1
+++ sys/lwp.h	17 Feb 2007 20:38:26 -0000
@@ -218,24 +218,6 @@
 		lwp_update_creds(l);					\
 } while (/* CONSTCOND */ 0)
 
-void	preempt (void);
-int	mi_switch (struct lwp *, struct lwp *);
-#ifndef remrunqueue
-void	remrunqueue (struct lwp *);
-#endif
-void	resetpriority (struct lwp *);
-void	setrunnable (struct lwp *);
-#ifndef setrunqueue
-void	setrunqueue (struct lwp *);
-#endif
-#ifndef nextrunqueue
-struct lwp *nextrunqueue(void);
-#endif
-void	unsleep (struct lwp *);
-#ifndef cpu_switchto
-struct lwp *cpu_switchto(struct lwp *, struct lwp *);
-#endif
-
 void	lwp_startup(struct lwp *, struct lwp *);
 
 int	lwp_locked(struct lwp *, kmutex_t *);
Index: sys/proc.h
===================================================================
RCS file: /cvsroot/src/sys/sys/proc.h,v
retrieving revision 1.236.2.1
diff -u -r1.236.2.1 proc.h
--- sys/proc.h	17 Feb 2007 10:31:04 -0000	1.236.2.1
+++ sys/proc.h	17 Feb 2007 20:38:26 -0000
@@ -538,7 +538,6 @@
 void	yield(void);
 void	pgdelete(struct pgrp *);
 void	procinit(void);
-void	resetprocpriority(struct proc *);
 void	suspendsched(void);
 int	ltsleep(wchan_t, int, const char *, int,
 	    volatile struct simplelock *);
@@ -554,7 +553,6 @@
 void	exit_lwps(struct lwp *l);
 int	fork1(struct lwp *, int, int, void *, size_t,
 	    void (*)(void *), void *, register_t *, struct proc **);
-void	rqinit(void);
 int	pgid_in_session(struct proc *, pid_t);
 #ifndef cpu_idle
 void	cpu_idle(void);
Index: sys/sched.h
===================================================================
RCS file: /cvsroot/src/sys/sys/sched.h,v
retrieving revision 1.30.2.1
diff -u -r1.30.2.1 sched.h
--- sys/sched.h	17 Feb 2007 10:31:06 -0000	1.30.2.1
+++ sys/sched.h	17 Feb 2007 20:38:27 -0000
@@ -5,7 +5,8 @@
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
- * by Ross Harvey, Jason R. Thorpe, Nathan J. Williams, and Andrew Doran.
+ * by Ross Harvey, Jason R. Thorpe, Nathan J. Williams, Andrew Doran and
+ * Daniel Sieger.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -80,8 +81,17 @@
 #if defined(_KERNEL_OPT)
 #include "opt_multiprocessor.h"
 #include "opt_lockdebug.h"
+#include "opt_sched.h"
 #endif
 
+#if defined(_KERNEL)
+
+#if defined(SCHED_4BSD)
+#include <sys/sched_4bsd.h>
+#endif
+
+#endif /* _KERNEL */
+
 struct sched_param {
 	int	sched_priority;
 };
@@ -100,21 +110,6 @@
 #include <sys/time.h>
 
 /*
- * Run queues.
- *
- * We have 32 run queues in descending priority of 0..31.  We maintain
- * a bitmask of non-empty queues in order speed up finding the first
- * runnable process.  The bitmask is maintained only by machine-dependent
- * code, allowing the most efficient instructions to be used to find the
- * first non-empty queue.
- */
-#define	RUNQUE_NQS		32
-struct prochd {
-	struct lwp *ph_link;
-	struct lwp *ph_rlink;
-};
-
-/*
  * CPU states.
  * XXX Not really scheduler state, but no other good place to put
  * it right now, and it really is per-CPU.
@@ -135,7 +130,7 @@
 	u_int spc_schedticks;		/* ticks for schedclock() */
 	uint64_t spc_cp_time[CPUSTATES]; /* CPU state statistics */
 	u_char spc_curpriority;		/* usrpri of curproc */
-	int spc_rrticks;		/* ticks until roundrobin() */
+	int spc_ticks;			/* ticks until sched_tick() */
 	int spc_pscnt;			/* prof/stat counter */
 	int spc_psdiv;			/* prof/stat divisor */
 };
@@ -164,21 +159,45 @@
 
 #ifdef _KERNEL
 
-extern int schedhz;			/* ideally: 16 */
-extern int rrticks;			/* ticks per roundrobin() */
-
 struct proc;
 struct cpu_info;
 
-void schedclock(struct lwp *);
-void roundrobin(struct cpu_info *);
-int sched_kpri(struct lwp *);
+/*
+ * Common Scheduler Interface
+ */
+
+/* Scheduler initialization */
+void sched_rqinit(void);	/* Initialize runqueues */
+void sched_setup(void);		/* Set up the scheduler, e.g. kick off timeout-driven events */
+
+/* Main scheduler functions */
+void sched_tick(struct cpu_info *); /* Maybe resched after spc_ticks hardclock() ticks */
+
+/* Runqueue-related functions */
+inline boolean_t sched_curcpu_runnable_p(void); /* Is anything runnable on the current CPU? */
+struct lwp *sched_nextlwp(void);		/* Select the LWP to run on the CPU next */
+void sched_enqueue(struct lwp *);	/* Place an LWP on its runqueue */
+void sched_dequeue(struct lwp *);	/* Remove an LWP from its runqueue */
+
+/* Priority adjustment */
+void sched_nice(struct proc *, int);		/* Recompute priority after the process's nice value changes */
+void sched_changepri(struct lwp *, int);	/* Change the priority of an LWP */
+
+/* General helper functions */
+void sched_proc_fork(struct proc *, struct proc *);	/* Inherit scheduling history */
+void sched_proc_exit(struct proc *, struct proc *);	/* Charge the child's scheduling history back to the parent */
+void sched_print_runqueue(void (*pr)(const char *, ...));	/* Print runqueues in DDB */
+void sched_setrunnable(struct lwp *);	/* Scheduler-specific actions for setrunnable() */
 
-void scheduler_fork_hook(struct proc *, struct proc *);
-void scheduler_wait_hook(struct proc *, struct proc *);
+/* Functions common to all scheduler implementations */
+void sched_wakeup(volatile const void *);
+int sched_kpri(struct lwp *);
 
-boolean_t sched_curcpu_runnable_p(void);
-void sched_print_runqueue(void (*)(const char *, ...));
+inline void resched_cpu(struct lwp *, u_char); /* Arrange reschedule */
+void setrunnable(struct lwp *);
+void preempt(void);
+int mi_switch(struct lwp *, struct lwp *);
+struct lwp *cpu_switchto(struct lwp *, struct lwp *);
 
 /*
  * Synchronisation object operations set.
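
The declarations above are the whole contract a scheduler back-end has
to provide. To give an idea of how small that contract is, here is a
deliberately naive sketch of an alternative back-end built around a
single global FIFO. It is not part of the patch: foo_runq and l_runq
are made-up names, l_runq would be a new TAILQ_ENTRY in struct lwp,
and sched_mutex is assumed to be held by the callers, as in the 4BSD
code above.

	#include <sys/queue.h>

	/* Hypothetical back-end, for illustration only (not in this patch). */
	static TAILQ_HEAD(, lwp) foo_runq = TAILQ_HEAD_INITIALIZER(foo_runq);

	void
	sched_enqueue(struct lwp *l)
	{
		/* l_runq is an imaginary TAILQ_ENTRY in struct lwp. */
		TAILQ_INSERT_TAIL(&foo_runq, l, l_runq);
	}

	void
	sched_dequeue(struct lwp *l)
	{
		TAILQ_REMOVE(&foo_runq, l, l_runq);
	}

	struct lwp *
	sched_nextlwp(void)
	{
		/* Like the 4BSD version above, peek but do not dequeue. */
		return TAILQ_FIRST(&foo_runq);
	}

	boolean_t
	sched_curcpu_runnable_p(void)
	{
		return !TAILQ_EMPTY(&foo_runq);
	}
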
Index: sys/sched_4bsd.h
===================================================================
RCS file: sys/sched_4bsd.h
diff -N sys/sched_4bsd.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ sys/sched_4bsd.h	17 Feb 2007 20:38:27 -0000
@@ -0,0 +1,83 @@
+/* $NetBSD: sched.h,v 1.29 2007/02/09 21:55:37 ad Exp $ */
+
+/*-
+ * Copyright (c) 1999, 2000, 2001, 2002 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Ross Harvey, Jason R. Thorpe, Nathan J. Williams, Andrew Doran, and
+ * Daniel Sieger.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *        This product includes software developed by the NetBSD
+ *        Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*-
+ * Copyright (c) 1982, 1986, 1991, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)kern_clock.c	8.5 (Berkeley) 1/21/94
+ */
+
+#ifndef	_SYS_SCHED_4BSD_H_
+#define	_SYS_SCHED_4BSD_H_
+
+extern int schedhz;			/* ideally: 16 */
+
+void schedclock(struct lwp *);
+
+#endif /* _SYS_SCHED_4BSD_H_ */
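
Scheduler-private declarations live in a small per-scheduler header
like the one above, so nothing back-end specific leaks into
sys/sched.h. A second back-end would plug in the same way, via
opt_sched.h and an #if defined() block in sys/sched.h analogous to the
SCHED_4BSD one earlier in this patch; SCHED_FOO and sched_foo.h are
purely hypothetical names:

	#if defined(SCHED_FOO)
	#include <sys/sched_foo.h>
	#endif
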
Index: uvm/uvm_glue.c
===================================================================
RCS file: /cvsroot/src/sys/uvm/uvm_glue.c,v
retrieving revision 1.99
diff -u -r1.99 uvm_glue.c
--- uvm/uvm_glue.c	15 Feb 2007 20:21:13 -0000	1.99
+++ uvm/uvm_glue.c	17 Feb 2007 20:38:32 -0000
@@ -443,7 +443,7 @@
 	cpu_swapin(l);
 	lwp_lock(l);
 	if (l->l_stat == LSRUN)
-		setrunqueue(l);
+		sched_enqueue(l);
 	l->l_flag |= L_INMEM;
 	l->l_swtime = 0;
 	lwp_unlock(l);
@@ -698,7 +698,7 @@
 	l->l_flag &= ~L_INMEM;
 	l->l_swtime = 0;
 	if (l->l_stat == LSRUN)
-		remrunqueue(l);
+		sched_dequeue(l);
 	lwp_unlock(l);
 	p->p_stats->p_ru.ru_nswap++;	/* XXXSMP */
 	++uvmexp.swapouts;

--3uo+9/B/ebqu+fSQ--