Subject: kern/31542: scheduler doesn't work well with many runnable processes
To: None <kern-bug-people@netbsd.org, gnats-admin@netbsd.org,>
From: None <yamt@mwd.biglobe.ne.jp>
List: netbsd-bugs
Date: 10/10/2005 11:19:00
>Number:         31542
>Category:       kern
>Synopsis:       scheduler doesn't work well with many runnable processes
>Confidential:   no
>Severity:       serious
>Priority:       medium
>Responsible:    kern-bug-people
>State:          open
>Class:          sw-bug
>Submitter-Id:   net
>Arrival-Date:   Mon Oct 10 11:19:00 +0000 2005
>Originator:     YAMAMOTO Takashi <yamt@mwd.biglobe.ne.jp>
>Release:        NetBSD 3.99.8
>Organization:

>Environment:
	
	
System: NetBSD kaeru 3.99.8 NetBSD 3.99.8 (build.kaeru.xen.nodebug) #10: Thu Aug 25 20:00:08 JST 2005 takashi@kaeru:/home/takashi/work/kernel/build.kaeru.xen.nodebug i386
Architecture: i386
Machine: i386
>Description:
	- schedcpu() decays the p_estcpu of every process once per
	  second; because p_estcpu is an integer, any nonzero value
	  drops by at least 1, regardless of the load average.

	- schedclock() increments the p_estcpu of curproc by 1,
	  at roughly 16 Hz (schedhz).

	As a consequence, on a system with more than 16 processes that
	have runnable lwps, each process receives fewer schedclock()
	increments per second than schedcpu() takes away, so their
	p_estcpu values are unlikely to ever increase (see the sketch
	below).
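
	A minimal userland sketch of the old arithmetic (not the kernel
	code itself), assuming schedhz = 16 and 32 identical CPU-bound
	processes on one CPU, so this process gets a schedclock() tick
	about every other second and the load average is around 32;
	nice, sleeping and ESTCPULIM are ignored.  p_estcpu never gets
	off the ground, and the same happens for any load average,
	because the integer decay always rounds the lone tick back down
	to zero:

	#include <stdio.h>

	int
	main(void)
	{
		unsigned int estcpu = 0;	/* old u_int p_estcpu */
		int sec;

		for (sec = 0; sec < 10; sec++) {
			if (sec % 2 == 0)
				estcpu += 1;	/* schedclock(): our share of ~16 Hz */
			/* schedcpu(): decay_cpu() with load average 32 -> factor 64/65 */
			estcpu = estcpu * 64 / 65;
			printf("after %2ds: estcpu = %u\n", sec + 1, estcpu);
		}
		return 0;
	}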
>How-To-Repeat:
>Fix:
	make p_estcpu a fixpt_t scaled by 2^ESTCPU_SHIFT, so that the
	decay and the schedclock() increments are applied at sub-integer
	resolution (see the sketch and patch below).

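	A sketch of the same scenario with the proposed fixed-point
	p_estcpu (ESTCPU_SHIFT is taken from the patch below; the load
	average of 32 and the resulting 64/65 decay factor are the same
	assumptions as in the sketch above; ESTCPULIM is again ignored).
	The fractional share of CPU time is no longer rounded away, so
	the value that resetpriority() sees (p_estcpu >> ESTCPU_SHIFT)
	climbs well above zero instead of staying pinned at 0:

	#include <stdio.h>
	#include <stdint.h>

	#define	ESTCPU_SHIFT	11

	int
	main(void)
	{
		unsigned int estcpu = 0;	/* proposed fixpt_t p_estcpu */
		int sec;

		for (sec = 0; sec < 60; sec++) {
			if (sec % 2 == 0)
				estcpu += 1 << ESTCPU_SHIFT;	/* schedclock() */
			/* decay_cpu() with load average 32 -> factor 64/65 */
			estcpu = (uint64_t)estcpu * 64 / 65;
		}
		printf("estcpu = %u (priority units: %u)\n",
		    estcpu, estcpu >> ESTCPU_SHIFT);
		return 0;
	}
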
Index: sys/sched.h
===================================================================
--- sys/sched.h	(revision 1389)
+++ sys/sched.h	(working copy)
@@ -180,7 +180,8 @@ struct schedstate_percpu {
 
 #define	PPQ	(128 / RUNQUE_NQS)	/* priorities per queue */
 #define NICE_WEIGHT 2			/* priorities per nice level */
-#define	ESTCPULIM(e) min((e), NICE_WEIGHT * PRIO_MAX - PPQ)
+#define	ESTCPU_SHIFT	11
+#define	ESTCPULIM(e) min((e), (NICE_WEIGHT * PRIO_MAX - PPQ) << ESTCPU_SHIFT)
 
 extern int schedhz;			/* ideally: 16 */
 extern int rrticks;			/* ticks per roundrobin() */
Index: sys/proc.h
===================================================================
--- sys/proc.h	(revision 1335)
+++ sys/proc.h	(working copy)
@@ -201,7 +201,7 @@ struct proc {
 	struct sadata 	*p_sa;		/* Scheduler activation information */
 
 	/* scheduling */
-	u_int		p_estcpu;	/* Time averaged value of p_cpticks XXX belongs in p_startcopy section */
+	fixpt_t		p_estcpu;	/* Time averaged value of p_cpticks XXX belongs in p_startcopy section */
 	int		p_cpticks;	/* Ticks of CPU time */
 	fixpt_t		p_pctcpu;	/* %cpu for this process during p_swtime */
 
Index: kern/kern_synch.c
===================================================================
--- kern/kern_synch.c	(revision 1390)
+++ kern/kern_synch.c	(working copy)
@@ -223,8 +223,16 @@ roundrobin(struct cpu_info *ci)
 
 /* calculations for digital decay to forget 90% of usage in 5*loadav sec */
 #define	loadfactor(loadav)	(2 * (loadav))
-#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
 
+static __inline fixpt_t
+decay_cpu(fixpt_t loadfac, fixpt_t estcpu)
+{
+
+	/* XXX tweak shifts to avoid 64 bit arithmetics? */
+
+	return ((uint64_t)estcpu * loadfac / (loadfac + FSCALE));
+}
+
 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
 fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */
 
@@ -253,7 +261,6 @@ schedcpu(void *arg)
 	struct lwp *l;
 	struct proc *p;
 	int s, minslp;
-	unsigned int newcpu;
 	int clkhz;
 
 	proclist_lock_read();
@@ -295,8 +302,7 @@ schedcpu(void *arg)
 			(p->p_cpticks * FSCALE / clkhz)) >> FSHIFT;
 #endif
 		p->p_cpticks = 0;
-		newcpu = (u_int)decay_cpu(loadfac, p->p_estcpu);
-		p->p_estcpu = newcpu;
+		p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
 		splx(s);	/* Done with the process CPU ticks update */
 		SCHED_LOCK(s);
 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
@@ -325,13 +331,13 @@ schedcpu(void *arg)
 /*
  * Recalculate the priority of a process after it has slept for a while.
  * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
- * least six times the loadfactor will decay p_estcpu to zero.
+ * least six times the loadfactor will decay p_estcpu to zero. XXX XXX
  */
 void
 updatepri(struct lwp *l)
 {
 	struct proc *p = l->l_proc;
-	unsigned int newcpu;
+	fixpt_t newcpu;
 	fixpt_t loadfac;
 
 	SCHED_ASSERT_LOCKED();
@@ -344,7 +350,7 @@ updatepri(struct lwp *l)
 	else {
 		l->l_slptime--;	/* the first time was done in schedcpu */
 		while (newcpu && --l->l_slptime)
-			newcpu = (int) decay_cpu(loadfac, newcpu);
+			newcpu = decay_cpu(loadfac, newcpu);
 		p->p_estcpu = newcpu;
 	}
 	resetpriority(l);
@@ -1105,7 +1111,7 @@ resetpriority(struct lwp *l)
 
 	SCHED_ASSERT_LOCKED();
 
-	newpriority = PUSER + p->p_estcpu +
+	newpriority = PUSER + (p->p_estcpu >> ESTCPU_SHIFT) +
 			NICE_WEIGHT * (p->p_nice - NZERO);
 	newpriority = min(newpriority, MAXPRI);
 	l->l_usrpri = newpriority;
@@ -1145,7 +1151,7 @@ schedclock(struct lwp *l)
 	struct proc *p = l->l_proc;
 	int s;
 
-	p->p_estcpu = ESTCPULIM(p->p_estcpu + 1);
+	p->p_estcpu = ESTCPULIM(p->p_estcpu + (1 << ESTCPU_SHIFT));
 	SCHED_LOCK(s);
 	resetpriority(l);
 	SCHED_UNLOCK(s);
Index: kern/init_sysctl.c
===================================================================
--- kern/init_sysctl.c	(revision 1392)
+++ kern/init_sysctl.c	(working copy)
@@ -2782,7 +2782,7 @@ fill_kproc2(struct proc *p, struct kinfo
 		ki->p_tdev = NODEV;
 	}
 
-	ki->p_estcpu = p->p_estcpu;
+	ki->p_estcpu = p->p_estcpu >> ESTCPU_SHIFT;
 	ki->p_rtime_sec = p->p_rtime.tv_sec;
 	ki->p_rtime_usec = p->p_rtime.tv_usec;
 	ki->p_cpticks = p->p_cpticks;

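	The "XXX tweak shifts to avoid 64 bit arithmetics?" comment in
	decay_cpu() above refers to the intermediate product: assuming
	FSHIFT = 11 (FSCALE = 2048) as in sys/param.h, and PRIO_MAX = 20
	with PPQ = 4 for the ESTCPULIM cap, a p_estcpu at the cap times
	loadfactor() for a large load average no longer fits in 32 bits,
	hence the uint64_t cast.  A quick userland check of the numbers:

	#include <stdio.h>
	#include <stdint.h>

	#define	FSHIFT		11	/* assumed, as in sys/param.h */
	#define	FSCALE		(1 << FSHIFT)
	#define	ESTCPU_SHIFT	11

	int
	main(void)
	{
		/* ESTCPULIM cap: (NICE_WEIGHT * PRIO_MAX - PPQ) << ESTCPU_SHIFT */
		uint32_t estcpu = (2 * 20 - 4) << ESTCPU_SHIFT;
		/* loadfactor() for a load average of 100, in fixpt units */
		uint32_t loadfac = 2 * 100 * FSCALE;
		uint64_t prod = (uint64_t)estcpu * loadfac;

		printf("estcpu * loadfac = %llu, UINT32_MAX = %u\n",
		    (unsigned long long)prod, UINT32_MAX);
		return 0;
	}
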
>Unformatted: