Subject: Re: Moving scheduler semantics from cpu_switch() to kern_synch.c
To: None <garrett_damore@tadpole.com>
From: YAMAMOTO Takashi <yamt@mwd.biglobe.ne.jp>
List: tech-kern
Date: 10/01/2006 20:56:19
--NextPart-20060930211225-0902601
Content-Type: Text/Plain; charset=us-ascii

> > > => idle() (written in C, in MI code) implements idle policy.  This is
> > > where we can check for new processes to run, zero free pages, etc.  If
> > > there is no "idle time" work to do, then cpu_idle() is called.
> > >
> > > => cpu_idle() does NOT loop!  cpu_idle() simply does the truly MD
> > > things that idle would do, e.g. call the HLT instruction or do other
> > > idle-time power saving, etc.  Once that special instruction has
> > > finished executing, we know that something has happened (i.e. an
> > > interrupt that may have caused an LWP to become runnable), so we
> > > return back to idle(), which loops around again (thus checking for
> > > runnable LWPs... lather, rinse, repeat).
> 
> cpu "wakeup" code and the correcponding part of idle() should be MD.
> otherwise i agree.
> 
> > Yes, yes yes!  I agree with all of it!  So when can we do it? :-)
> 
> whenever we have enough volunteers for each of our too many ports.
> (as usual :-)
> 
> i think i had some old (pre-lwp i guess) idle thread code for i386
> somewhere in my home dir.  i'll try to dig it up unless anyone has
> a newer one.

i found and ported it to -current.  (attached)
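
fwiw, on a port without a usable halt/wait instruction the MD half can
start out as a no-op and idle_loop() will simply poll
sched_curcpu_runnable_p().  an untested sketch (not part of the diff;
compare the attached idle_machdep.c for the i386 version):

void
cpu_idle(void)
{

	/*
	 * nothing MD to do on this port; idle_loop() re-checks the
	 * run queues and loops around again.
	 */
}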

YAMAMOTO Takashi

--NextPart-20060930211225-0902601
Content-Type: Text/Plain; charset=us-ascii
Content-Disposition: attachment; filename="i.diff"

Index: conf/files
===================================================================
--- conf/files	(revision 1799)
+++ conf/files	(working copy)
@@ -1283,6 +1283,7 @@ file	kern/kern_event.c
 file	kern/kern_exec.c
 file	kern/kern_exit.c
 file	kern/kern_fork.c
+file	kern/kern_idle.c
 file	kern/kern_kcont.c		kcont
 file	kern/kern_kthread.c
 file	kern/kern_ktrace.c
Index: kern/kern_synch.c
===================================================================
--- kern/kern_synch.c	(revision 1794)
+++ kern/kern_synch.c	(working copy)
@@ -127,8 +127,8 @@ int	rrticks;		/* number of hardclock tic
 /*
  * The global scheduler state.
  */
-struct prochd sched_qs[RUNQUE_NQS];	/* run queues */
-volatile uint32_t sched_whichqs;	/* bitmap of non-empty queues */
+static struct prochd sched_qs[RUNQUE_NQS];	/* run queues */
+static volatile uint32_t sched_whichqs;	/* bitmap of non-empty queues */
 static struct slpque sched_slpque[SLPQUE_TABLESIZE]; /* sleep queues */
 
 struct simplelock sched_lock = SIMPLELOCK_INITIALIZER;
@@ -156,7 +156,7 @@ roundrobin(struct cpu_info *ci)
 
 	spc->spc_rrticks = rrticks;
 
-	if (curlwp != NULL) {
+	if (!CURCPU_IDLE_P()) {
 		if (spc->spc_flags & SPCF_SEENRR) {
 			/*
 			 * The process has already been through a roundrobin
@@ -327,6 +327,8 @@ schedcpu(void *arg)
 		 */
 		minslp = 2;
 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
+			if ((l->l_flag & L_IDLE) != 0)
+				continue;
 			l->l_swtime++;
 			if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
 			    l->l_stat == LSSUSPENDED) {
@@ -363,6 +365,8 @@ schedcpu(void *arg)
 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
 			if (l->l_slptime > 1)
 				continue;
+			if ((l->l_flag & L_IDLE) != 0)
+				continue;
 			resetpriority(l);
 			if (l->l_priority >= PUSER) {
 				if (l->l_stat == LSRUN &&
@@ -435,7 +439,7 @@ ltsleep(volatile const void *ident, int 
     volatile struct simplelock *interlock)
 {
 	struct lwp *l = curlwp;
-	struct proc *p = l ? l->l_proc : NULL;
+	struct proc *p = l->l_proc;
 	struct slpque *qp;
 	struct sadata_upcall *sau;
 	int sig, s;
@@ -451,7 +455,7 @@ ltsleep(volatile const void *ident, int 
 	 * in the shutdown case is disgusting but partly necessary given
 	 * how shutdown (barely) works.
 	 */
-	if (cold || (doing_shutdown && (panicstr || (l == NULL)))) {
+	if (cold || (doing_shutdown && (panicstr || CURCPU_IDLE_P()))) {
 		/*
 		 * After a panic, or during autoconfiguration,
 		 * just give interrupts a chance, then just return;
@@ -727,18 +731,11 @@ awaken(struct lwp *l)
 
 #if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
 void
-sched_unlock_idle(void)
+sched_unlock(void)
 {
 
 	simple_unlock(&sched_lock);
 }
-
-void
-sched_lock_idle(void)
-{
-
-	simple_lock(&sched_lock);
-}
 #endif /* MULTIPROCESSOR || LOCKDEBUG */
 
 /*
@@ -889,6 +886,7 @@ preempt(int more)
 	struct lwp *l = curlwp;
 	int r, s;
 
+	KASSERT(!CURCPU_IDLE_P());
 	SCHED_LOCK(s);
 	l->l_priority = l->l_usrpri;
 	l->l_stat = LSRUN;
@@ -901,6 +899,13 @@ preempt(int more)
 		sa_preempt(l);
 }
 
+boolean_t
+sched_curcpu_runnable_p(void)
+{
+
+	return sched_whichqs != 0;
+}
+
 /*
  * The machine independent parts of context switch.
  * Must be called at splsched() (no higher!) and with
@@ -940,47 +945,49 @@ mi_switch(struct lwp *l, struct lwp *new
 	simple_lock_switchcheck();
 #endif
 
-	/*
-	 * Compute the amount of time during which the current
-	 * process was running.
-	 */
-	microtime(&tv);
-	u = p->p_rtime.tv_usec +
-	    (tv.tv_usec - spc->spc_runtime.tv_usec);
-	s = p->p_rtime.tv_sec + (tv.tv_sec - spc->spc_runtime.tv_sec);
-	if (u < 0) {
-		u += 1000000;
-		s--;
-	} else if (u >= 1000000) {
-		u -= 1000000;
-		s++;
-	}
-	p->p_rtime.tv_usec = u;
-	p->p_rtime.tv_sec = s;
+	if ((l->l_flag & L_IDLE) == 0) {
+		/*
+		 * Compute the amount of time during which the current
+		 * process was running.
+		 */
+		microtime(&tv);
+		u = p->p_rtime.tv_usec +
+		    (tv.tv_usec - spc->spc_runtime.tv_usec);
+		s = p->p_rtime.tv_sec + (tv.tv_sec - spc->spc_runtime.tv_sec);
+		if (u < 0) {
+			u += 1000000;
+			s--;
+		} else if (u >= 1000000) {
+			u -= 1000000;
+			s++;
+		}
+		p->p_rtime.tv_usec = u;
+		p->p_rtime.tv_sec = s;
 
-	/*
-	 * Check if the process exceeds its CPU resource allocation.
-	 * If over max, kill it.  In any case, if it has run for more
-	 * than 10 minutes, reduce priority to give others a chance.
-	 */
-	rlim = &p->p_rlimit[RLIMIT_CPU];
-	if (s >= rlim->rlim_cur) {
 		/*
-		 * XXXSMP: we're inside the scheduler lock perimeter;
-		 * use sched_psignal.
+		 * Check if the process exceeds its CPU resource allocation.
+		 * If over max, kill it.  In any case, if it has run for more
+		 * than 10 minutes, reduce priority to give others a chance.
 		 */
-		if (s >= rlim->rlim_max)
-			sched_psignal(p, SIGKILL);
-		else {
-			sched_psignal(p, SIGXCPU);
-			if (rlim->rlim_cur < rlim->rlim_max)
-				rlim->rlim_cur += 5;
+		rlim = &p->p_rlimit[RLIMIT_CPU];
+		if (s >= rlim->rlim_cur) {
+			/*
+			 * XXXSMP: we're inside the scheduler lock perimeter;
+			 * use sched_psignal.
+			 */
+			if (s >= rlim->rlim_max)
+				sched_psignal(p, SIGKILL);
+			else {
+				sched_psignal(p, SIGXCPU);
+				if (rlim->rlim_cur < rlim->rlim_max)
+					rlim->rlim_cur += 5;
+			}
+		}
+		if (autonicetime && s > autonicetime &&
+		    kauth_cred_geteuid(p->p_cred) && p->p_nice == NZERO) {
+			p->p_nice = autoniceval + NZERO;
+			resetpriority(l);
 		}
-	}
-	if (autonicetime && s > autonicetime &&
-	    kauth_cred_geteuid(p->p_cred) && p->p_nice == NZERO) {
-		p->p_nice = autoniceval + NZERO;
-		resetpriority(l);
 	}
 
 	/*
@@ -1006,14 +1013,30 @@ mi_switch(struct lwp *l, struct lwp *new
 	 * Switch to the new current process.  When we
 	 * run again, we'll return back here.
 	 */
-	uvmexp.swtch++;
 	if (newl == NULL) {
-		retval = cpu_switch(l, NULL);
-	} else {
+		newl = nextrunqueue();
+	}
+	if (newl != NULL) {
 		remrunqueue(newl);
+	} else {
+		newl = l->l_cpu->ci_data.cpu_idlelwp;
+		KASSERT(newl != NULL);
+	}
+	newl->l_stat = LSONPROC;
+	if (l != newl) {
+		uvmexp.swtch++;
+		pmap_deactivate(l);
+		curlwp = newl;
+		newl->l_cpu = l->l_cpu;
 		cpu_switchto(l, newl);
+		KASSERT(curlwp == l);
+		pmap_activate(l);
+		retval = 1;
+	} else {
+		sched_unlock();
 		retval = 0;
 	}
+	spl0();
 
 	/*
 	 * If we are using h/w performance counters, restore context.
@@ -1037,7 +1060,9 @@ mi_switch(struct lwp *l, struct lwp *new
 	 */
 	KDASSERT(l->l_cpu != NULL);
 	KDASSERT(l->l_cpu == curcpu());
-	microtime(&l->l_cpu->ci_schedstate.spc_runtime);
+	if ((l->l_flag & L_IDLE) == 0) {
+		microtime(&l->l_cpu->ci_schedstate.spc_runtime);
+	}
 
 	/*
 	 * Reacquire the kernel_lock now.  We do this after we've
@@ -1045,6 +1070,7 @@ mi_switch(struct lwp *l, struct lwp *new
 	 * we reacquire the interlock.
 	 */
 	KERNEL_LOCK_ACQUIRE_COUNT(hold_count);
+	(void)splsched(); /* XXX */
 
 	return retval;
 }
@@ -1107,6 +1133,7 @@ setrunnable(struct lwp *l)
 
 	SCHED_ASSERT_LOCKED();
 
+	KASSERT((l->l_flag & L_IDLE) == 0);
 	switch (l->l_stat) {
 	case 0:
 	case LSRUN:
@@ -1205,6 +1232,7 @@ schedclock(struct lwp *l)
 	struct proc *p = l->l_proc;
 	int s;
 
+	KASSERT(!CURCPU_IDLE_P());
 	p->p_estcpu = ESTCPULIM(p->p_estcpu + (1 << ESTCPU_SHIFT));
 	SCHED_LOCK(s);
 	resetpriority(l);
@@ -1288,13 +1316,6 @@ scheduler_wait_hook(struct proc *parent,
 }
 
 /*
- * Low-level routines to access the run queue.  Optimised assembler
- * routines can override these.
- */
-
-#ifndef __HAVE_MD_RUNQUEUE
-
-/*
  * On some architectures, it's faster to use a MSB ordering for the priorites
  * than the traditional LSB ordering.
  */
@@ -1305,6 +1326,13 @@ scheduler_wait_hook(struct proc *parent,
 #endif
 
 /*
+ * Low-level routines to access the run queue.  Optimised assembler
+ * routines can override these.
+ */
+
+#ifndef __HAVE_MD_RUNQUEUE
+
+/*
  * The primitives that manipulate the run queues.  whichqs tells which
  * of the 32 queues qs have processes in them.  Setrunqueue puts processes
  * into queues, remrunqueue removes them from queues.  The running process is
@@ -1424,5 +1452,57 @@ remrunqueue(struct lwp *l)
 #endif
 }
 
-#undef RQMASK
+struct lwp *
+nextrunqueue(void)
+{
+	const struct prochd *rq;
+	struct lwp *l;
+	int whichq;
+
+	if (sched_whichqs == 0) {
+		return NULL;
+	}
+#ifdef __HAVE_BIGENDIAN_BITOPS
+	for (whichq = 0; ; whichq++) {
+		if ((sched_whichqs & RQMASK(whichq)) != 0) {
+			break;
+		}
+	}
+#else
+	whichq = ffs(sched_whichqs) - 1;
+#endif
+	rq = &sched_qs[whichq];
+	l = rq->ph_link;
+	return l;
+}
+
 #endif /* !defined(__HAVE_MD_RUNQUEUE) */
+
+#if defined(DDB)
+void
+sched_print_runqueue(void (*pr)(const char *, ...))
+{
+	struct prochd *ph;
+	struct lwp *l;
+	int i, first;
+
+	for (i = 0; i < RUNQUE_NQS; i++)
+	{
+		first = 1;
+		ph = &sched_qs[i];
+		for (l = ph->ph_link; l != (void *)ph; l = l->l_forw) {
+			if (first) {
+				(*pr)("%c%d",
+				    (sched_whichqs & RQMASK(i))
+				    ? ' ' : '!', i);
+				first = 0;
+			}
+			(*pr)("\t%d.%d (%s) pri=%d usrpri=%d\n",
+			    l->l_proc->p_pid,
+			    l->l_lid, l->l_proc->p_comm,
+			    (int)l->l_priority, (int)l->l_usrpri);
+		}
+	}
+}
+#endif /* defined(DDB) */
+#undef RQMASK
Index: kern/init_main.c
===================================================================
--- kern/init_main.c	(revision 1787)
+++ kern/init_main.c	(working copy)
@@ -92,6 +92,7 @@ __KERNEL_RCSID(0, "$NetBSD: init_main.c,
 #include <sys/file.h>
 #include <sys/errno.h>
 #include <sys/callout.h>
+#include <sys/idle.h>
 #include <sys/kernel.h>
 #include <sys/kcont.h>
 #include <sys/kmem.h>
@@ -278,6 +279,9 @@ main(void)
 	/* Create process 0 (the swapper). */
 	proc0_init();
 
+	error = create_idle_lwp(curcpu());
+	KASSERT(error == 0);
+
 	/*
 	 * Charge root for one process.
 	 */
Index: kern/kern_clock.c
===================================================================
--- kern/kern_clock.c	(revision 1785)
+++ kern/kern_clock.c	(working copy)
@@ -518,7 +518,7 @@ hardclock(struct clockframe *frame)
 #endif /* __HAVE_TIMECOUNTER */
 
 	l = curlwp;
-	if (l) {
+	if (!CURCPU_IDLE_P()) {
 		p = l->l_proc;
 		/*
 		 * Run current process's virtual and profile time, as needed.
@@ -1221,7 +1221,7 @@ statclock(struct clockframe *frame)
 			if (p != NULL)
 				p->p_iticks++;
 			spc->spc_cp_time[CP_INTR]++;
-		} else if (p != NULL) {
+		} else if (!CURCPU_IDLE_P()) {
 			p->p_sticks++;
 			spc->spc_cp_time[CP_SYS]++;
 		} else
@@ -1229,7 +1229,7 @@ statclock(struct clockframe *frame)
 	}
 	spc->spc_pscnt = psdiv;
 
-	if (p != NULL) {
+	if (p != NULL && !CURCPU_IDLE_P()) {
 		++p->p_cpticks;
 		/*
 		 * If no separate schedclock is provided, call it here
Index: kern/kern_lwp.c
===================================================================
--- kern/kern_lwp.c	(revision 1770)
+++ kern/kern_lwp.c	(working copy)
@@ -587,13 +587,13 @@ lwp_exit(struct lwp *l)
 
 	pmap_deactivate(l);
 
+	curlwp = curcpu()->ci_data.cpu_idlelwp;
 	if (l->l_flag & L_DETACHED) {
 		simple_lock(&p->p_lock);
 		LIST_REMOVE(l, l_sibling);
 		p->p_nlwps--;
 		simple_unlock(&p->p_lock);
 
-		curlwp = NULL;
 		l->l_proc = NULL;
 	}
 
@@ -605,8 +605,20 @@ lwp_exit(struct lwp *l)
 	/* This LWP no longer needs to hold the kernel lock. */
 	KERNEL_PROC_UNLOCK(l);
 
-	/* cpu_exit() will not return */
-	cpu_exit(l);
+	lwp_exit_switchaway(l);
+}
+
+void
+lwp_exit_switchaway(struct lwp *l)
+{
+	struct cpu_info *ci;
+
+	uvmexp.swtch++;
+	ci = curcpu();	
+	KASSERT(ci->ci_data.cpu_exitinglwp == NULL);
+	KASSERT(CURCPU_IDLE_P());
+	ci->ci_data.cpu_exitinglwp = l;
+	cpu_switchto(NULL, ci->ci_data.cpu_idlelwp);
 }
 
 /*
@@ -668,6 +680,9 @@ proc_representative_lwp(struct proc *p)
 		onproc = running = sleeping = stopped = suspended = NULL;
 		signalled = NULL;
 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
+			if ((l->l_flag & L_IDLE) != 0) {
+				continue;
+			}
 			if (l->l_lid == p->p_sigctx.ps_lwp)
 				signalled = l;
 			switch (l->l_stat) {
Index: kern/kern_proc.c
===================================================================
--- kern/kern_proc.c	(revision 1747)
+++ kern/kern_proc.c	(working copy)
@@ -319,6 +319,7 @@ proc0_init(void)
 	l->l_flag = L_INMEM;
 	l->l_stat = LSONPROC;
 	p->p_nrlwps = 1;
+	p->p_nlwpid = l->l_lid;
 
 	callout_init(&l->l_tsleep_ch);
 
Index: kern/kern_exit.c
===================================================================
--- kern/kern_exit.c	(revision 1799)
+++ kern/kern_exit.c	(working copy)
@@ -497,7 +497,7 @@ exit1(struct lwp *l, int rv)
 	 *
 	 * Other substructures are freed from wait().
 	 */
-	curlwp = NULL;
+	curlwp = curcpu()->ci_data.cpu_idlelwp;
 
 	/* Delay release until after dropping the proclist lock */
 	plim = p->p_limit;
@@ -537,26 +537,12 @@ exit1(struct lwp *l, int rv)
 	/* Release cached credentials. */
 	kauth_cred_free(l->l_cred);
 
-#ifdef DEBUG
-	/* Nothing should use the process link anymore */
 	l->l_proc = NULL;
-#endif
 
 	/* This process no longer needs to hold the kernel lock. */
 	KERNEL_PROC_UNLOCK(l);
 
-	/*
-	 * Finally, call machine-dependent code to switch to a new
-	 * context (possibly the idle context).  Once we are no longer
-	 * using the dead lwp's stack, lwp_exit2() will be called
-	 * to arrange for the resources to be released.
-	 *
-	 * Note that cpu_exit() will end with a call equivalent to
-	 * cpu_switch(), finishing our execution (pun intended).
-	 */
-
-	uvmexp.swtch++;
-	cpu_exit(l);
+	lwp_exit_switchaway(l);
 }
 
 void
Index: kern/kern_lock.c
===================================================================
--- kern/kern_lock.c	(revision 1799)
+++ kern/kern_lock.c	(working copy)
@@ -1424,8 +1424,8 @@ void
 assert_sleepable(struct simplelock *interlock, const char *msg)
 {
 
-	if (curlwp == NULL) {
-		panic("assert_sleepable: NULL curlwp");
+	if (CURCPU_IDLE_P()) {
+		panic("assert_sleepable: idle");
 	}
 	spinlock_switchcheck();
 	simple_lock_only_held(interlock, msg);
Index: uvm/uvm_page.c
===================================================================
--- uvm/uvm_page.c	(revision 1799)
+++ uvm/uvm_page.c	(working copy)
@@ -1523,8 +1523,7 @@ uvm_page_own(struct vm_page *pg, const c
  *
  * => try to complete one color bucket at a time, to reduce our impact
  *	on the CPU cache.
- * => we loop until we either reach the target or whichqs indicates that
- *	there is a process ready to run.
+ * => we loop until we either reach the target or there is a lwp ready to run.
  */
 void
 uvm_pageidlezero(void)
@@ -1538,8 +1537,9 @@ uvm_pageidlezero(void)
 	s = uvm_lock_fpageq();
 	firstbucket = nextbucket;
 	do {
-		if (sched_whichqs != 0)
+		if (sched_curcpu_runnable_p()) {
 			goto quit;
+		}
 		if (uvmexp.zeropages >= UVM_PAGEZERO_TARGET) {
 			uvm.page_idle_zero = FALSE;
 			goto quit;
@@ -1548,7 +1548,7 @@ uvm_pageidlezero(void)
 			pgfl = &uvm.page_free[free_list];
 			while ((pg = TAILQ_FIRST(&pgfl->pgfl_buckets[
 			    nextbucket].pgfl_queues[PGFL_UNKNOWN])) != NULL) {
-				if (sched_whichqs != 0)
+				if (sched_curcpu_runnable_p())
 					goto quit;
 
 				TAILQ_REMOVE(&pgfl->pgfl_buckets[
Index: arch/i386/include/frame.h
===================================================================
--- arch/i386/include/frame.h	(revision 1485)
+++ arch/i386/include/frame.h	(working copy)
@@ -139,7 +139,7 @@ struct intrframe {
 };
 
 /*
- * Stack frame inside cpu_switch()
+ * Stack frame inside cpu_switchto()
  */
 struct switchframe {
 	int	sf_edi;
Index: arch/i386/include/cpu.h
===================================================================
--- arch/i386/include/cpu.h	(revision 1785)
+++ arch/i386/include/cpu.h	(working copy)
@@ -101,10 +101,6 @@ struct cpu_info {
 #define	TLBSTATE_LAZY	1	/* tlbs are valid but won't be kept uptodate */
 #define	TLBSTATE_STALE	2	/* we might have stale user tlbs */
 
-	struct pcb *ci_curpcb;		/* VA of current HW PCB */
-	struct pcb *ci_idle_pcb;	/* VA of current PCB */
-	int ci_idle_tss_sel;		/* TSS selector of idle PCB */
-
 	struct intrsource *ci_isources[MAX_INTR_SOURCES];
 	uint32_t	ci_ipending;
 	int		ci_ilevel;
@@ -214,7 +210,7 @@ curcpu()
 extern	struct cpu_info *cpu_info[X86_MAXPROCS];
 
 void cpu_boot_secondary_processors(void);
-void cpu_init_idle_pcbs(void);
+void cpu_init_idle_lwps(void);
 
 /*
  * Preempt the current process if in interrupt from user mode,
@@ -242,7 +238,7 @@ extern void need_resched(struct cpu_info
 do {									\
 	struct cpu_info *__ci = (ci);					\
 	__ci->ci_want_resched = 1;					\
-	if (__ci->ci_curlwp != NULL)					\
+	if (__ci->ci_curlwp != __ci->ci_data.cpu_idlelwp)		\
 		aston(__ci->ci_curlwp->l_proc);       			\
 } while (/*CONSTCOND*/0)
 
@@ -252,8 +248,8 @@ do {									\
 
 extern uint32_t cpus_attached;
 
-#define	curpcb			curcpu()->ci_curpcb
 #define	curlwp			curcpu()->ci_curlwp
+#define	curpcb			&curlwp->l_addr->u_pcb
 
 /*
  * Arguments to hardclock, softclock and statclock
@@ -348,7 +344,6 @@ extern int i386_has_sse2;
 void	dumpconf(void);
 int	cpu_maxproc(void);
 void	cpu_reset(void);
-void	i386_init_pcb_tss_ldt(struct cpu_info *);
 void	i386_proc0_tss_ldt_init(void);
 
 /* identcpu.c */
Index: arch/i386/conf/files.i386
===================================================================
--- arch/i386/conf/files.i386	(revision 1785)
+++ arch/i386/conf/files.i386	(working copy)
@@ -73,6 +73,7 @@ file	arch/i386/i386/db_memrw.c	ddb | kgd
 file	arch/i386/i386/db_trace.c	ddb
 file	kern/subr_disk_mbr.c		disk
 file	arch/i386/i386/gdt.c
+file	arch/i386/i386/idle_machdep.c
 file	arch/i386/i386/in_cksum.S	inet | inet6
 file	arch/i386/i386/ipkdb_glue.c	ipkdb
 file	arch/i386/i386/kgdb_machdep.c	kgdb
Index: arch/i386/i386/copy.S
===================================================================
--- arch/i386/i386/copy.S	(revision 1464)
+++ arch/i386/i386/copy.S	(working copy)
@@ -80,7 +80,9 @@
 #include <machine/frameasm.h>
 #include <machine/cputypes.h>
 
-#define GET_CURPCB(reg)			movl	CPUVAR(CURPCB),reg
+#define GET_CURPCB(reg)	\
+	movl	CPUVAR(CURLWP), reg; \
+	movl	L_ADDR(reg), reg
 
 /*
  * The following primitives are used to fill and copy regions of memory.
Index: arch/i386/i386/machdep.c
===================================================================
--- arch/i386/i386/machdep.c	(revision 1799)
+++ arch/i386/i386/machdep.c	(working copy)
@@ -458,6 +458,9 @@ cpu_startup()
 
 	/* Safe for i/o port / memory space allocation to use malloc now. */
 	x86_bus_space_mallocok();
+
+	gdt_init();
+	i386_proc0_tss_ldt_init();
 }
 
 /*
@@ -466,13 +469,12 @@ cpu_startup()
 void
 i386_proc0_tss_ldt_init()
 {
+	struct lwp *l;
 	struct pcb *pcb;
 	int x;
 
-	gdt_init();
-
-	cpu_info_primary.ci_curpcb = pcb = &lwp0.l_addr->u_pcb;
-
+	l = &lwp0;
+	pcb = &l->l_addr->u_pcb;
 	pcb->pcb_tss.tss_ioopt =
 	    ((caddr_t)pcb->pcb_iomap - (caddr_t)&pcb->pcb_tss) << 16;
 
@@ -482,36 +484,15 @@ i386_proc0_tss_ldt_init()
 	pcb->pcb_ldt_sel = pmap_kernel()->pm_ldt_sel = GSEL(GLDT_SEL, SEL_KPL);
 	pcb->pcb_cr0 = rcr0();
 	pcb->pcb_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL);
-	pcb->pcb_tss.tss_esp0 = USER_TO_UAREA(lwp0.l_addr) + KSTACK_SIZE - 16;
-	lwp0.l_md.md_regs = (struct trapframe *)pcb->pcb_tss.tss_esp0 - 1;
-	lwp0.l_md.md_tss_sel = tss_alloc(pcb);
+	pcb->pcb_tss.tss_esp0 = USER_TO_UAREA(l->l_addr) + KSTACK_SIZE - 16;
+	l->l_md.md_regs = (struct trapframe *)pcb->pcb_tss.tss_esp0 - 1;
+	l->l_md.md_tss_sel = tss_alloc(pcb);
 
-	ltr(lwp0.l_md.md_tss_sel);
+	ltr(l->l_md.md_tss_sel);
 	lldt(pcb->pcb_ldt_sel);
 }
 
 /*
- * Set up TSS and LDT for a new PCB.
- */
-
-void
-i386_init_pcb_tss_ldt(struct cpu_info *ci)
-{
-	int x;
-	struct pcb *pcb = ci->ci_idle_pcb;
-
-	pcb->pcb_tss.tss_ioopt =
-	    ((caddr_t)pcb->pcb_iomap - (caddr_t)&pcb->pcb_tss) << 16;
-	for (x = 0; x < sizeof(pcb->pcb_iomap) / 4; x++)
-		pcb->pcb_iomap[x] = 0xffffffff;
-
-	pcb->pcb_ldt_sel = pmap_kernel()->pm_ldt_sel = GSEL(GLDT_SEL, SEL_KPL);
-	pcb->pcb_cr0 = rcr0();
-
-	ci->ci_idle_tss_sel = tss_alloc(pcb);
-}
-
-/*
  * sysctl helper routine for machdep.tm* nodes.
  */
 static int
@@ -1483,7 +1464,6 @@ init386(paddr_t first_avail)
 
 	proc0paddr = UAREA_TO_USER(proc0uarea);
 	lwp0.l_addr = proc0paddr;
-	cpu_info_primary.ci_curpcb = &lwp0.l_addr->u_pcb;
 
 	x86_bus_space_init();
 	consinit();	/* XXX SHOULD NOT BE DONE HERE */
@@ -2397,7 +2377,7 @@ need_resched(struct cpu_info *ci)
 		return;
 
 	ci->ci_want_resched = 1;
-	if ((ci)->ci_curlwp != NULL)
+	if (ci->ci_curlwp != ci->ci_data.cpu_idlelwp)
 		aston((ci)->ci_curlwp->l_proc);
 	else if (ci != curcpu())
 		x86_send_ipi(ci, 0);
Index: arch/i386/i386/autoconf.c
===================================================================
--- arch/i386/i386/autoconf.c	(revision 1664)
+++ arch/i386/i386/autoconf.c	(working copy)
@@ -107,8 +107,6 @@ cpu_configure(void)
 	pcibios_init();
 #endif
 
-	/* kvm86 needs a TSS */
-	i386_proc0_tss_ldt_init();
 #ifdef KVM86
 	kvm86_init();
 #endif
@@ -128,7 +126,7 @@ cpu_configure(void)
 	lwp0.l_addr->u_pcb.pcb_cr0 = rcr0();
 #ifdef MULTIPROCESSOR
 	/* propagate this to the idle pcb's. */
-	cpu_init_idle_pcbs();
+	cpu_init_idle_lwps();
 #endif
 
 #if defined(I586_CPU) || defined(I686_CPU)
Index: arch/i386/i386/cpu.c
===================================================================
--- arch/i386/i386/cpu.c	(revision 1785)
+++ arch/i386/i386/cpu.c	(working copy)
@@ -87,6 +87,7 @@ __KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.29
 #include <sys/systm.h>
 #include <sys/device.h>
 #include <sys/malloc.h>
+#include <sys/idle.h>
 
 #include <uvm/uvm_extern.h>
 
@@ -251,8 +252,6 @@ cpu_attach(parent, self, aux)
 	struct cpu_info *ci;
 #if defined(MULTIPROCESSOR)
 	int cpunum = caa->cpu_number;
-	vaddr_t kstack;
-	struct pcb *pcb;
 #endif
 
 	/*
@@ -304,30 +303,22 @@ cpu_attach(parent, self, aux)
 
 #if defined(MULTIPROCESSOR)
 	/*
-	 * Allocate UPAGES contiguous pages for the idle PCB and stack.
+	 * primary cpu has its idle lwp already allocated by init_main.
 	 */
-	kstack = uvm_km_alloc(kernel_map, USPACE, 0, UVM_KMF_WIRED);
-	if (kstack == 0) {
-		if (caa->cpu_role != CPU_ROLE_AP) {
-			printf("\n");
-			panic("cpu_attach: unable to allocate idle stack for"
-			    " primary");
+
+	if (caa->cpu_role == CPU_ROLE_AP) {
+		int error;
+
+		error = create_idle_lwp(ci);
+		if (error != 0) {
+			aprint_normal("\n");
+			aprint_error("%s: unable to allocate idle lwp\n",
+			    sc->sc_dev.dv_xname);
+			return;
 		}
-		aprint_normal("\n");
-		aprint_error("%s: unable to allocate idle stack\n",
-		    sc->sc_dev.dv_xname);
-		return;
+	} else {
+		KASSERT(ci->ci_data.cpu_idlelwp != NULL);
 	}
-	pcb = ci->ci_idle_pcb = (struct pcb *) kstack;
-	memset(pcb, 0, USPACE);
-
-	pcb->pcb_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL);
-	pcb->pcb_tss.tss_esp0 =
-	    kstack + USPACE - 16 - sizeof (struct trapframe);
-	pcb->pcb_tss.tss_esp =
-	    kstack + USPACE - 16 - sizeof (struct trapframe);
-	pcb->pcb_cr0 = rcr0();
-	pcb->pcb_cr3 = pmap_kernel()->pm_pdirpa;
 #endif
 	pmap_reference(pmap_kernel());
 	ci->ci_pmap = pmap_kernel();
@@ -397,10 +388,10 @@ cpu_attach(parent, self, aux)
 
 #if defined(MULTIPROCESSOR)
 	if (mp_verbose) {
-		aprint_verbose("%s: kstack at 0x%lx for %d bytes\n",
-		    sc->sc_dev.dv_xname, kstack, USPACE);
-		aprint_verbose("%s: idle pcb at %p, idle sp at 0x%x\n",
-		    sc->sc_dev.dv_xname, pcb, pcb->pcb_esp);
+		struct lwp *l = ci->ci_data.cpu_idlelwp;
+
+		aprint_verbose("%s: idle lwp at %p, idle sp at 0x%x\n",
+		    sc->sc_dev.dv_xname, l, l->l_addr->u_pcb.pcb_esp);
 	}
 #endif
 }
@@ -493,7 +484,7 @@ cpu_boot_secondary_processors()
 		ci = cpu_info[i];
 		if (ci == NULL)
 			continue;
-		if (ci->ci_idle_pcb == NULL)
+		if (ci->ci_data.cpu_idlelwp == NULL)
 			continue;
 		if ((ci->ci_flags & CPUF_PRESENT) == 0)
 			continue;
@@ -503,21 +494,30 @@ cpu_boot_secondary_processors()
 	}
 }
 
+static void
+cpu_init_idle_lwp(struct cpu_info *ci)
+{
+	struct lwp *l = ci->ci_data.cpu_idlelwp;
+	struct pcb *pcb = &l->l_addr->u_pcb;
+
+	pcb->pcb_cr0 = rcr0();
+}
+
 void
-cpu_init_idle_pcbs()
+cpu_init_idle_lwps()
 {
 	struct cpu_info *ci;
 	u_long i;
 
-	for (i=0; i < X86_MAXPROCS; i++) {
+	for (i = 0; i < X86_MAXPROCS; i++) {
 		ci = cpu_info[i];
 		if (ci == NULL)
 			continue;
-		if (ci->ci_idle_pcb == NULL)
+		if (ci->ci_data.cpu_idlelwp == NULL)
 			continue;
 		if ((ci->ci_flags & CPUF_PRESENT) == 0)
 			continue;
-		i386_init_pcb_tss_ldt(ci);
+		cpu_init_idle_lwp(ci);
 	}
 }
 
@@ -525,19 +525,17 @@ void
 cpu_start_secondary(ci)
 	struct cpu_info *ci;
 {
-	struct pcb *pcb;
 	int i;
 	struct pmap *kpm = pmap_kernel();
 	extern uint32_t mp_pdirpa;
 
 	mp_pdirpa = kpm->pm_pdirpa; /* XXX move elsewhere, not per CPU. */
 
-	pcb = ci->ci_idle_pcb;
-
 	ci->ci_flags |= CPUF_AP;
 
 	aprint_normal("%s: starting\n", ci->ci_dev->dv_xname);
 
+	ci->ci_curlwp = ci->ci_data.cpu_idlelwp;
 	CPU_STARTUP(ci);
 
 	/*
@@ -613,7 +611,7 @@ cpu_hatch(void *v)
 		panic("%s: already running!?", ci->ci_dev->dv_xname);
 #endif
 
-	lcr0(ci->ci_idle_pcb->pcb_cr0);
+	lcr0(ci->ci_data.cpu_idlelwp->l_addr->u_pcb.pcb_cr0);
 	cpu_init_idt();
 	lapic_set_lvt();
 	gdt_init_cpu(ci);
Index: arch/i386/i386/pmap.c
===================================================================
--- arch/i386/i386/pmap.c	(revision 1638)
+++ arch/i386/i386/pmap.c	(working copy)
@@ -1757,7 +1757,7 @@ pmap_ldt_cleanup(l)
 		ldt_free(pmap);
 		pmap->pm_ldt_sel = GSEL(GLDT_SEL, SEL_KPL);
 		pcb->pcb_ldt_sel = pmap->pm_ldt_sel;
-		if (pcb == curpcb)
+		if (l == curlwp)
 			lldt(pcb->pcb_ldt_sel);
 		old_ldt = pmap->pm_ldt;
 		len = pmap->pm_ldt_len * sizeof(union descriptor);
@@ -1889,8 +1889,7 @@ pmap_load()
 	KASSERT(pmap != pmap_kernel());
 	oldpmap = ci->ci_pmap;
 
-	pcb = ci->ci_curpcb;
-	KASSERT(pcb == &l->l_addr->u_pcb);
+	pcb = &l->l_addr->u_pcb;
 	/* loaded by pmap_activate */
 	KASSERT(pcb->pcb_ldt_sel == pmap->pm_ldt_sel);
 
@@ -2159,7 +2158,7 @@ pmap_pageidlezero(pa)
 	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
 	for (ptr = (int *) zerova, ep = ptr + PAGE_SIZE / sizeof(int);
 	    ptr < ep; ptr++) {
-		if (sched_whichqs != 0) {
+		if (sched_curcpu_runnable_p()) {
 
 			/*
 			 * A process has become ready.  Abort now,
Index: arch/i386/i386/sys_machdep.c
===================================================================
--- arch/i386/i386/sys_machdep.c	(revision 1799)
+++ arch/i386/i386/sys_machdep.c	(working copy)
@@ -325,9 +325,7 @@ i386_set_ldt(l, args, retval)
 			pmap->pm_flags |= PMF_USER_LDT;
 		ldt_alloc(pmap, new_ldt, new_len);
 		pcb->pcb_ldt_sel = pmap->pm_ldt_sel;
-		if (pcb == curpcb)
-			lldt(pcb->pcb_ldt_sel);
-
+		lldt(pcb->pcb_ldt_sel);
 	}
 copy:
 	/* Now actually replace the descriptors. */
Index: arch/i386/i386/mptramp.S
===================================================================
--- arch/i386/i386/mptramp.S	(revision 1464)
+++ arch/i386/i386/mptramp.S	(working copy)
@@ -221,7 +221,8 @@ _C_LABEL(cpu_spinup_trampoline_end):	#en
 mp_cont:
 	HALT(0x15)
 
-	movl	CPU_INFO_IDLE_PCB(%ecx),%esi
+	movl	CPU_INFO_IDLELWP(%ecx),%esi
+	movl	L_ADDR(%esi),%esi
 	
 # %esi now points at our PCB.
 		
@@ -250,8 +251,7 @@ mp_cont:
 	pushl	%ecx
 	call	_C_LABEL(cpu_hatch)
 	HALT(0x33)
-	xorl	%esi,%esi
-	jmp	_C_LABEL(mpidle)
+	jmp	_C_LABEL(idle_loop)
 	
 	.data
 _C_LABEL(mp_pdirpa):
Index: arch/i386/i386/locore.S
===================================================================
--- arch/i386/i386/locore.S	(revision 1747)
+++ arch/i386/i386/locore.S	(working copy)
@@ -107,24 +107,6 @@
 
 #include <machine/asm.h>
 
-#if defined(MULTIPROCESSOR)
-
-#define SET_CURLWP(lwp,cpu)				\
-	movl	CPUVAR(SELF),cpu		; 	\
-	movl	lwp,CPUVAR(CURLWP)	;	\
-	movl	cpu,L_CPU(lwp)
-
-#else
-
-#define SET_CURLWP(lwp,tcpu)		movl	lwp,CPUVAR(CURLWP)
-#define GET_CURLWP(reg)			movl	CPUVAR(CURLWP),reg
-
-#endif
-
-#define SET_CURPCB(reg)			movl	reg,CPUVAR(CURPCB)
-
-#define CLEAR_RESCHED(reg)		movl	reg,CPUVAR(RESCHED)
-
 /* XXX temporary kluge; these should not be here */
 /* Get definitions for IOM_BEGIN, IOM_END, and IOM_SIZE */
 #include <dev/isa/isareg.h>
@@ -657,10 +639,13 @@ begin:
  */
 /* LINTSTUB: Func: void proc_trampoline(void) */
 NENTRY(proc_trampoline)
+	movl	$IPL_NONE,CPUVAR(ILEVEL)
+	pushl	CPUVAR(CURLWP)
+	call	_C_LABEL(pmap_activate)
+	addl	$4,%esp
 #ifdef MULTIPROCESSOR
 	call	_C_LABEL(proc_trampoline_mp)
 #endif
-	movl	$IPL_NONE,CPUVAR(ILEVEL)
 	pushl	%ebx
 	call	*%esi
 	addl	$4,%esp
@@ -761,7 +746,6 @@ ENTRY(longjmp)
 
 /*****************************************************************************/
 
-	.globl	_C_LABEL(sched_whichqs),_C_LABEL(sched_qs)
 	.globl	_C_LABEL(uvmexp),_C_LABEL(panic)
 
 #ifdef DIAGNOSTIC
@@ -773,200 +757,27 @@ NENTRY(switch_error)
 #endif /* DIAGNOSTIC */
 
 /*
- * void cpu_switch(struct lwp *)
- * Find a runnable lwp and switch to it.  Wait if necessary.  If the new
- * lwp is the same as the old one, we short-circuit the context save and
- * restore.
+ * void cpu_switchto(struct lwp *oldlwp, struct lwp *newlwp)
+ *
+ *	1. if (oldlwp != NULL), save its context and call sched_unlock().
+ *	2. then, restore context of newlwp.
  *
  * Note that the stack frame layout is known to "struct switchframe"
- * in <machine/frame.h> and to the code in cpu_fork() which initializes
+ * in <machine/frame.h> and to the code in cpu_lwp_fork() which initializes
  * it for a new lwp.
  */
-ENTRY(cpu_switch)
+
+/*
+ * void cpu_switchto(struct lwp *current, struct lwp *next)
+ * Switch to the specified next LWP.
+ */
+ENTRY(cpu_switchto)
 	pushl	%ebx
 	pushl	%esi
 	pushl	%edi
 
-#ifdef DEBUG
-	cmpl	$IPL_SCHED,CPUVAR(ILEVEL)
-	jae	1f
-	pushl	$2f
-	call	_C_LABEL(panic)
-	/* NOTREACHED */
-2:	.asciz	"not splsched() in cpu_switch!"
-1:
-#endif /* DEBUG */
-
 	movl	16(%esp),%esi		# current
-
-	/*
-	 * Clear curlwp so that we don't accumulate system time while idle.
-	 * This also insures that schedcpu() will move the old lwp to
-	 * the correct queue if it happens to get called from the spllower()
-	 * below and changes the priority.  (See corresponding comment in
-	 * userret()).
-	 */
-	movl	$0,CPUVAR(CURLWP)
-	/*
-	 * First phase: find new lwp.
-	 *
-	 * Registers:
-	 *   %eax - queue head, scratch, then zero
-	 *   %ebx - queue number
-	 *   %ecx - cached value of whichqs
-	 *   %edx - next lwp in queue
-	 *   %esi - old lwp
-	 *   %edi - new lwp
-	 */
-
-	/* Look for new lwp. */
-	cli				# splhigh doesn't do a cli
-	movl	_C_LABEL(sched_whichqs),%ecx
-	bsfl	%ecx,%ebx		# find a full q
-	jnz	switch_dequeue
-
-	/*
-	 * idling:	save old context.
-	 *
-	 * Registers:
-	 *   %eax, %ecx - scratch
-	 *   %esi - old lwp, then old pcb
-	 *   %edi - idle pcb
-	 */
-
-	pushl	%esi
-	call	_C_LABEL(pmap_deactivate2)	# pmap_deactivate(oldproc)
-	addl	$4,%esp
-
-	movl	L_ADDR(%esi),%esi
-
-	/* Save stack pointers. */
-	movl	%esp,PCB_ESP(%esi)
-	movl	%ebp,PCB_EBP(%esi)
-
-	/* Find idle PCB for this CPU */
-#ifndef MULTIPROCESSOR
-	movl	$_C_LABEL(lwp0),%ebx
-	movl	L_ADDR(%ebx),%edi
-	movl	L_MD_TSS_SEL(%ebx),%edx
-#else
-	movl	CPUVAR(IDLE_PCB),%edi
-	movl	CPUVAR(IDLE_TSS_SEL),%edx
-#endif
-	movl	$0,CPUVAR(CURLWP)		/* In case we fault... */
-
-	/* Restore the idle context (avoid interrupts) */
-	cli
-
-	/* Restore stack pointers. */
-	movl	PCB_ESP(%edi),%esp
-	movl	PCB_EBP(%edi),%ebp
-
-	/* Switch TSS. Reset "task busy" flag before loading. */
-	movl	%cr3,%eax
-	movl	%eax,PCB_CR3(%edi)
-#ifdef MULTIPROCESSOR
-	movl	CPUVAR(GDT),%eax
-#else
-	movl	_C_LABEL(gdt),%eax
-#endif
-	andl	$~0x0200,4-SEL_KPL(%eax,%edx,1)
-	ltr	%dx
-
-	/* We're always in the kernel, so we don't need the LDT. */
-
-	/* Restore cr0 (including FPU state). */
-	movl	PCB_CR0(%edi),%ecx
-	movl	%ecx,%cr0
-
-	/* Record new pcb. */
-	SET_CURPCB(%edi)
-
-	xorl	%esi,%esi
-	sti
-idle_unlock:
-#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
-	call	_C_LABEL(sched_unlock_idle)
-#endif
-	/* Interrupts are okay again. */
-	pushl	$IPL_NONE		# spl0()
-	call	_C_LABEL(Xspllower)	# process pending interrupts
-	addl	$4,%esp
-	jmp	idle_start
-idle_zero:
-	sti
-	call	_C_LABEL(uvm_pageidlezero)
-	cli
-	cmpl	$0,_C_LABEL(sched_whichqs)
-	jnz	idle_exit
-idle_loop:
-	/* Try to zero some pages. */
-	movl	_C_LABEL(uvm)+UVM_PAGE_IDLE_ZERO,%ecx
-	testl	%ecx,%ecx
-	jnz	idle_zero
-	sti
-	hlt
-NENTRY(mpidle)
-idle_start:
-	cli
-	cmpl	$0,_C_LABEL(sched_whichqs)
-	jz	idle_loop
-idle_exit:
-	movl	$IPL_HIGH,CPUVAR(ILEVEL)		# splhigh
-	sti
-#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
-	call	_C_LABEL(sched_lock_idle)
-#endif
-	movl	_C_LABEL(sched_whichqs),%ecx
-	bsfl	%ecx,%ebx
-	jz	idle_unlock
-
-switch_dequeue:
-	/*
-	 * we're running at splhigh(), but it's otherwise okay to take
-	 * interrupts here.
-	 */
-	sti
-	leal	_C_LABEL(sched_qs)(,%ebx,8),%eax # select q
-
-	movl	L_FORW(%eax),%edi	# unlink from front of process q
-#ifdef	DIAGNOSTIC
-	cmpl	%edi,%eax		# linked to self (i.e. nothing queued)?
-	je	_C_LABEL(switch_error)	# not possible
-#endif /* DIAGNOSTIC */
-	movl	L_FORW(%edi),%edx
-	movl	%edx,L_FORW(%eax)
-	movl	%eax,L_BACK(%edx)
-
-	cmpl	%edx,%eax		# q empty?
-	jne	3f
-
-	btrl	%ebx,%ecx		# yes, clear to indicate empty
-	movl	%ecx,_C_LABEL(sched_whichqs) # update q status
-
-3:	/* We just did it. */
-	xorl	%eax,%eax
-	CLEAR_RESCHED(%eax)
-
-switch_resume:
-#ifdef	DIAGNOSTIC
-	cmpl	%eax,L_WCHAN(%edi)	# Waiting for something?
-	jne	_C_LABEL(switch_error)	# Yes; shouldn't be queued.
-	cmpb	$LSRUN,L_STAT(%edi)	# In run state?
-	jne	_C_LABEL(switch_error)	# No; shouldn't be queued.
-#endif /* DIAGNOSTIC */
-
-	/* Isolate lwp.  XXX Is this necessary? */
-	movl	%eax,L_BACK(%edi)
-
-	/* Record new lwp. */
-	movb	$LSONPROC,L_STAT(%edi)	# l->l_stat = LSONPROC
-	SET_CURLWP(%edi,%ecx)
-
-	/* Skip context switch if same lwp. */
-	xorl	%ebx,%ebx
-	cmpl	%edi,%esi
-	je	switch_return
+	movl	20(%esp),%edi		# next
 
 	/* If old lwp exited, don't bother. */
 	testl	%esi,%esi
@@ -981,16 +792,16 @@ switch_resume:
 	 *   %edi - new lwp
 	 */
 
-	pushl	%esi
-	call	_C_LABEL(pmap_deactivate2)	# pmap_deactivate(oldproc)
-	addl	$4,%esp
-
 	movl	L_ADDR(%esi),%esi
 
 	/* Save stack pointers. */
 	movl	%esp,PCB_ESP(%esi)
 	movl	%ebp,PCB_EBP(%esi)
 
+#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
+	call	_C_LABEL(sched_unlock)
+#endif
+
 switch_exited:
 	/*
 	 * Third phase: restore saved context.
@@ -1029,10 +840,6 @@ switch_exited:
 	andl	$~0x0200,4(%eax,%edx, 1)
 	ltr	%dx
 
-	pushl	%edi
-	call	_C_LABEL(pmap_activate)		# pmap_activate(p)
-	addl	$4,%esp
-
 #if 0
 switch_restored:
 #endif
@@ -1051,9 +858,6 @@ switch_restored:
 #endif
 	movl	%ecx,%cr0
 
-	/* Record new pcb. */
-	SET_CURPCB(%esi)
-
 	/* Interrupts are okay again. */
 	sti
 
@@ -1065,22 +869,6 @@ switch_restored:
 	cmpl	$0,P_RASLIST(%esi)
 	jne	2f
 1:
-	movl	$1,%ebx
-
-switch_return:
-#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
-	call    _C_LABEL(sched_unlock_idle)
-#endif
-	cmpl	$0,CPUVAR(IPENDING)
-	jz	3f
-	pushl	$IPL_NONE		# spl0()
-	call	_C_LABEL(Xspllower)	# process pending interrupts
-	addl	$4,%esp
-3:
-	movl	$IPL_HIGH,CPUVAR(ILEVEL)	# splhigh()
-
-	movl	%ebx,%eax
-
 	popl	%edi
 	popl	%esi
 	popl	%ebx
@@ -1099,116 +887,6 @@ switch_return:
 	jmp	1b
 
 /*
- * void cpu_switchto(struct lwp *current, struct lwp *next)
- * Switch to the specified next LWP.
- */
-ENTRY(cpu_switchto)
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-
-#ifdef DEBUG
-	cmpl	$IPL_SCHED,CPUVAR(ILEVEL)
-	jae	1f
-	pushl	$2f
-	call	_C_LABEL(panic)
-	/* NOTREACHED */
-2:	.asciz	"not splsched() in cpu_switchto!"
-1:
-#endif /* DEBUG */
-
-	movl	16(%esp),%esi		# current
-	movl	20(%esp),%edi		# next
-
-	/*
-	 * Clear curlwp so that we don't accumulate system time while idle.
-	 * This also insures that schedcpu() will move the old process to
-	 * the correct queue if it happens to get called from the spllower()
-	 * below and changes the priority.  (See corresponding comment in
-	 * usrret()).
-	 *
-	 * XXX Is this necessary?  We know we won't go idle.
-	 */
-	movl	$0,CPUVAR(CURLWP)
-
-	/*
-	 * We're running at splhigh(), but it's otherwise okay to take
-	 * interrupts here.
-	 */
-	sti
-
-	/* Jump into the middle of cpu_switch */
-	xorl	%eax,%eax
-	jmp	switch_resume
-
-/*
- * void cpu_exit(struct lwp *l)
- * Switch to the appropriate idle context (lwp0's if uniprocessor; the CPU's
- * if multiprocessor) and deallocate the address space and kernel stack for p.
- * Then jump into cpu_switch(), as if we were in the idle proc all along.
- */
-#ifndef MULTIPROCESSOR
-	.globl	_C_LABEL(lwp0)
-#endif
-/* LINTSTUB: Func: void cpu_exit(struct lwp *l) */
-ENTRY(cpu_exit)
-	movl	4(%esp),%edi		# old process
-#ifndef MULTIPROCESSOR
-	movl	$_C_LABEL(lwp0),%ebx
-	movl	L_ADDR(%ebx),%esi
-	movl	L_MD_TSS_SEL(%ebx),%edx
-#else
-	movl	CPUVAR(IDLE_PCB),%esi
-	movl	CPUVAR(IDLE_TSS_SEL),%edx
-#endif
-	/* In case we fault... */
-	movl	$0,CPUVAR(CURLWP)
-
-	/* Restore the idle context. */
-	cli
-
-	/* Restore stack pointers. */
-	movl	PCB_ESP(%esi),%esp
-	movl	PCB_EBP(%esi),%ebp
-
-	/* Switch TSS. Reset "task busy" flag before loading. */
-	movl	%cr3,%eax
-	movl	%eax,PCB_CR3(%esi)
-#ifdef MULTIPROCESSOR
-	movl	CPUVAR(GDT),%eax
-#else
-	/* Load TSS info. */
-	movl	_C_LABEL(gdt),%eax
-#endif
-
-	andl	$~0x0200,4-SEL_KPL(%eax,%edx,1)
-	ltr	%dx
-
-	/* We're always in the kernel, so we don't need the LDT. */
-
-	/* Restore cr0 (including FPU state). */
-	movl	PCB_CR0(%esi),%ecx
-	movl	%ecx,%cr0
-
-	/* Record new pcb. */
-	SET_CURPCB(%esi)
-
-	/* Interrupts are okay again. */
-	sti
-
-	/*
-	 * Schedule the dead LWP's stack to be freed.
-	 */
-	pushl	%edi
-	call	_C_LABEL(lwp_exit2)
-	addl	$4,%esp
-
-	/* Jump into cpu_switch() with the right state. */
-	xorl	%esi,%esi
-	movl	%esi,CPUVAR(CURLWP)
-	jmp	idle_start
-
-/*
  * void savectx(struct pcb *pcb);
  * Update pcb, saving current processor state.
  */
Index: arch/i386/i386/trap.c
===================================================================
--- arch/i386/i386/trap.c	(revision 1799)
+++ arch/i386/i386/trap.c	(working copy)
@@ -509,8 +509,10 @@ copyfault:
 			KERNEL_PROC_UNLOCK(l);
 		}
 		/* Allow a forced task switch. */
-		if (curcpu()->ci_want_resched) /* XXX CSE me? */
+		if (curcpu()->ci_want_resched) { /* XXX CSE me? */
+			curcpu()->ci_want_resched = 0;
 			preempt(0);
+		}
 		goto out;
 
 	case T_DNA|T_USER: {
Index: arch/i386/i386/genassym.cf
===================================================================
--- arch/i386/i386/genassym.cf	(revision 1728)
+++ arch/i386/i386/genassym.cf	(working copy)
@@ -283,10 +283,8 @@ define	CPU_INFO_WANT_PMAPLOAD	offsetof(s
 define	CPU_INFO_TLBSTATE	offsetof(struct cpu_info, ci_tlbstate)
 define	TLBSTATE_VALID		TLBSTATE_VALID
 define	CPU_INFO_CURLWP		offsetof(struct cpu_info, ci_curlwp)
-define	CPU_INFO_CURPCB		offsetof(struct cpu_info, ci_curpcb)
-define	CPU_INFO_IDLE_PCB	offsetof(struct cpu_info, ci_idle_pcb)
-define  CPU_INFO_IDLE_TSS_SEL	offsetof(struct cpu_info, ci_idle_tss_sel)
 define	CPU_INFO_ASTPENDING	offsetof(struct cpu_info, ci_astpending)
+define	CPU_INFO_IDLELWP	offsetof(struct cpu_info, ci_data.cpu_idlelwp)
 
 define	CPU_INFO_LEVEL		offsetof(struct cpu_info, ci_cpuid_level)
 define	CPU_INFO_VENDOR		offsetof(struct cpu_info, ci_vendor[0])
Index: ddb/db_xxx.c
===================================================================
--- ddb/db_xxx.c	(revision 1638)
+++ ddb/db_xxx.c	(working copy)
@@ -292,25 +292,6 @@ db_dmesg(db_expr_t addr, int haddr, db_e
 void
 db_show_sched_qs(db_expr_t addr, int haddr, db_expr_t count, const char *modif)
 {
-	struct prochd *ph;
-	struct lwp *l;
-	int i, first;
 
-	for (i = 0; i < RUNQUE_NQS; i++)
-	{
-		first = 1;
-		ph = &sched_qs[i];
-		for (l = ph->ph_link; l != (void *)ph; l = l->l_forw) {
-			if (first) {
-				db_printf("%c%d",
-				    (sched_whichqs & RQMASK(i))
-				    ? ' ' : '!', i);
-				first = 0;
-			}
-			db_printf("\t%d.%d (%s) pri=%d usrpri=%d\n",
-			    l->l_proc->p_pid,
-			    l->l_lid, l->l_proc->p_comm,
-			    (int)l->l_priority, (int)l->l_usrpri);
-		}
-	}
+	sched_print_runqueue(db_printf);
 }
Index: sys/lwp.h
===================================================================
--- sys/lwp.h	(revision 1747)
+++ sys/lwp.h	(working copy)
@@ -114,6 +114,7 @@ extern struct lwp lwp0;			/* LWP for pro
 #endif
 
 /* These flags are kept in l_flag. [*] is shared with p_flag */
+#define	L_IDLE		0x00000001
 #define	L_INMEM		0x00000004 /* [*] Loaded into memory. */
 #define	L_SELECT	0x00000040 /* [*] Selecting; wakeup/waiting danger. */
 #define	L_SINTR		0x00000080 /* [*] Sleep is interruptible. */
@@ -174,9 +175,6 @@ void	setrunqueue (struct lwp *);
 struct lwp *nextrunqueue(void);
 #endif
 void	unsleep (struct lwp *);
-#ifndef cpu_switch
-int	cpu_switch (struct lwp *, struct lwp *);
-#endif
 #ifndef cpu_switchto
 void	cpu_switchto (struct lwp *, struct lwp *);
 #endif
@@ -192,6 +190,7 @@ void	cpu_setfunc(struct lwp *, void (*)(
 void	startlwp(void *);
 void	upcallret(struct lwp *);
 void	lwp_exit (struct lwp *);
+void	lwp_exit_switchaway(struct lwp *);
 void	lwp_exit2 (struct lwp *);
 struct lwp *proc_representative_lwp(struct proc *);
 __inline int lwp_suspend(struct lwp *, struct lwp *);
Index: sys/cpu_data.h
===================================================================
--- sys/cpu_data.h	(revision 1787)
+++ sys/cpu_data.h	(working copy)
@@ -61,7 +61,9 @@ struct lwp;
  */
 
 struct cpu_data {
-	struct	schedstate_percpu cpu_schedstate; /* scheduler state */
+	struct schedstate_percpu cpu_schedstate; /* scheduler state */
+	struct lwp *cpu_idlelwp;	/* idle lwp */
+	struct lwp *cpu_exitinglwp;
 
 #if defined(MULTIPROCESSOR)
 	int	cpu_biglock_count;
Index: sys/sched.h
===================================================================
--- sys/sched.h	(revision 1638)
+++ sys/sched.h	(working copy)
@@ -175,17 +175,6 @@ struct schedstate_percpu {
 extern int schedhz;			/* ideally: 16 */
 extern int rrticks;			/* ticks per roundrobin() */
 
-/*
- * Global scheduler state.  We would like to group these all together
- * in a single structure to make them easier to find, but leaving
- * whichqs and qs as independent globals makes for more efficient
- * assembly language in the low-level context switch code.  So we
- * simply give them meaningful names; the globals are actually declared
- * in kern/kern_synch.c.
- */
-extern struct prochd sched_qs[];
-extern volatile uint32_t sched_whichqs;
-
 struct proc;
 struct cpu_info;
 
@@ -217,8 +206,9 @@ do {									\
 	splx(s);							\
 } while (/* CONSTCOND */ 0)
 
-void	sched_lock_idle(void);
-void	sched_unlock_idle(void);
+void	sched_unlock(void);
+boolean_t sched_curcpu_runnable_p(void);
+void sched_print_runqueue(void (*)(const char *, ...));
 
 #else /* ! MULTIPROCESSOR || LOCKDEBUG */
 
Index: sys/proc.h
===================================================================
--- sys/proc.h	(revision 1747)
+++ sys/proc.h	(working copy)
@@ -410,6 +410,8 @@ extern struct lwp	*curlwp;		/* Current r
 #endif /* MULTIPROCESSOR */
 #endif /* ! curproc */
 
+#define	CURCPU_IDLE_P()	(curlwp == curcpu()->ci_data.cpu_idlelwp)
+
 static struct proc *__curproc(void);
 
 static __inline struct proc *
@@ -465,7 +467,6 @@ int	inferior(struct proc *, struct proc 
 int	leavepgrp(struct proc *);
 void	sessdelete(struct session *);
 void	yield(void);
-struct lwp *chooselwp(void);
 void	pgdelete(struct pgrp *);
 void	procinit(void);
 void	resetprocpriority(struct proc *);

--NextPart-20060930211225-0902601
Content-Type: Text/Plain; charset=us-ascii
Content-Disposition: attachment; filename="kern_idle.c"

/*	$NetBSD$	*/

/*-
 * Copyright (c)2002, 2006 YAMAMOTO Takashi,
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>

__KERNEL_RCSID(0, "$NetBSD$");

#include <sys/param.h>
#include <sys/idle.h>
#include <sys/lwp.h>
#include <sys/proc.h>

#include <uvm/uvm.h>
#include <uvm/uvm_extern.h>

#define	PIDLELWP	(MAXPRI + 1)	/* lowest priority */

void
idle_loop(void *dummy)
{
	struct cpu_info *ci = curcpu();
	struct lwp *l = curlwp;

	(void)KERNEL_LOCK_RELEASE_ALL();
	l->l_usrpri = PIDLELWP;
	while (1 /* CONSTCOND */) {
		struct lwp *exiting;
		int s;

		KERNEL_LOCK_ASSERT_UNLOCKED();
		KASSERT((l->l_flag & L_IDLE) != 0);
		KASSERT(ci == curcpu());
		KASSERT(l == curlwp);
		KASSERT(CURCPU_IDLE_P());
		KASSERT(l->l_usrpri == PIDLELWP);

		if (uvm.page_idle_zero) {
			if (sched_curcpu_runnable_p()) {
				goto schedule;
			}
			uvm_pageidlezero();
		}
		if (!sched_curcpu_runnable_p()) {
			cpu_idle();
			if (!sched_curcpu_runnable_p()) {
				continue;
			}
		}
schedule:
		SCHED_LOCK(s);
		l->l_stat = LSRUN;
		mi_switch(l, NULL);
		splx(s);

		exiting = ci->ci_data.cpu_exitinglwp;
		if (exiting != NULL) {
			ci->ci_data.cpu_exitinglwp = NULL;
			lwp_exit2(exiting);
		}
	}
}

int
create_idle_lwp(struct cpu_info *ci)
{
	struct proc *p = &proc0;
	struct lwp *l;
	vaddr_t uaddr;
	boolean_t inmem;
	int error;

	/* XXX should use kthread_create1? */

	KASSERT(ci->ci_data.cpu_idlelwp == NULL);
	inmem = uvm_uarea_alloc(&uaddr);
	if (uaddr == 0) {
		return ENOMEM;
	}
	error = newlwp(&lwp0, p, uaddr, inmem, 0, NULL, 0, idle_loop, NULL, &l);
	if (error != 0) {
		panic("create_idle_lwp: newlwp failed");
	}
	PHOLD(l);
	l->l_stat = LSRUN;
	l->l_flag |= L_IDLE;
	l->l_cpu = ci;
	ci->ci_data.cpu_idlelwp = l;
	return error;
}

--NextPart-20060930211225-0902601
Content-Type: Text/Plain; charset=us-ascii
Content-Disposition: attachment; filename="idle_machdep.c"

/*	$NetBSD$	*/

/*-
 * Copyright (c)2002, 2006 YAMAMOTO Takashi,
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>

__KERNEL_RCSID(0, "$NetBSD$");

#include <sys/param.h>
#include <sys/proc.h>

#include <machine/cpufunc.h>

void
cpu_idle(void)
{
	struct cpu_info *ci = curcpu();

	disable_intr();
	__insn_barrier();
	if (__predict_false(ci->ci_want_resched) == 0) {
		__asm __volatile ("sti; hlt");
	} else {
		enable_intr();
	}
	ci->ci_want_resched = 0;
}

--NextPart-20060930211225-0902601--