Subject: Re: sobj_changepri (Re: CVS commit: [newlock2] src/sys)
To: None <ad@netbsd.org>
From: YAMAMOTO Takashi <yamt@mwd.biglobe.ne.jp>
List: tech-kern
Date: 02/23/2007 02:07:54
--NextPart-20070223020650-0958901
Content-Type: Text/Plain; charset=us-ascii

> On Thu, Feb 22, 2007 at 08:13:29PM +0900, YAMAMOTO Takashi wrote:
> 
> > > Log Message:
> > > - Change syncobj_t::sobj_changepri() to alter both the user priority and
> > >   the effective priority of LWPs. How the effective priority is adjusted
> > >   depends on the type of object.
> > 
> > i'm not sure if it was a good idea.
> > priority inheritance needs a way to change the effective priority only.
> 
> At the time, it seemed like a reasonable accommodation to make so that
> we could handle the different behaviour of tsleep and condition
> variables. I figured
> we would need another method like sobj_lendpri later on. If you can see a
> better way of doing it, please change it.

after some more thought, i've changed my mind.
it's cleaner to use a separate, dedicated priority member in struct lwp.
(see the attached patch.)
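
the idea, in short: l_priority keeps the lwp's own priority as before,
and a new member l_inheritedprio records the highest priority currently
lent to the lwp via turnstiles (MAXPRI, i.e. nothing lent, by default;
note that a lower value means a higher priority).  the scheduler and
the sleep queues then use the effective priority, which is simply the
minimum of the two.  the accessor, as it appears in the patch:

	static inline int
	lwp_eprio(struct lwp *l)
	{

		return MIN(l->l_inheritedprio, l->l_priority);
	}

when a turnstile stops lending (turnstile_wakeup), its entry is removed
from the lwp's l_pi_lenders list, and the next-highest priority still
being lent by the remaining turnstiles is re-installed with
lwp_inheritpri().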

YAMAMOTO Takashi

--NextPart-20070223020650-0958901
Content-Type: Text/Plain; charset=us-ascii
Content-Disposition: attachment; filename="pi.diff5"

Index: sys/lwp.h
===================================================================
RCS file: /cvsroot/src/sys/sys/lwp.h,v
retrieving revision 1.51
diff -u -p -r1.51 lwp.h
--- sys/lwp.h	22 Feb 2007 04:38:03 -0000	1.51
+++ sys/lwp.h	22 Feb 2007 16:56:17 -0000
@@ -83,8 +83,10 @@ struct	lwp {
 	u_int		l_swtime;	/* l: time swapped in or out */
 	int		l_holdcnt;	/* l: if non-zero, don't swap */
 	int		l_biglocks;	/* l: biglock count before sleep */
-	u_char		l_priority;	/* l: process priority */
-	u_char		l_usrpri;	/* l: user-priority */
+	int		l_priority;	/* l: process priority */
+	int		l_usrpri;	/* l: user-priority */
+	int		l_inheritedprio;/* l: inherited priority */
+	SLIST_HEAD(, turnstile) l_pi_lenders; /* l: ts lending us priority */
 	long		l_nvcsw;	/* l: voluntary context switches */
 	long		l_nivcsw;	/* l: involuntary context switches */
 
@@ -250,6 +252,7 @@ void	lwp_setlock(struct lwp *, kmutex_t 
 void	lwp_unlock_to(struct lwp *, kmutex_t *);
 void	lwp_lock_retry(struct lwp *, kmutex_t *);
 void	lwp_relock(struct lwp *, kmutex_t *);
+int	lwp_trylock(struct lwp *);
 void	lwp_addref(struct lwp *);
 void	lwp_delref(struct lwp *);
 void	lwp_drainrefs(struct lwp *);
@@ -322,10 +325,24 @@ lwp_changepri(struct lwp *l, int pri)
 {
 	LOCK_ASSERT(mutex_owned(l->l_mutex));
 
+	if (l->l_priority == pri)
+		return;
+
 	(*l->l_syncobj->sobj_changepri)(l, pri);
 }
 
 static inline void
+lwp_inheritpri(struct lwp *l, int pri)
+{
+	LOCK_ASSERT(mutex_owned(l->l_mutex));
+
+	if (l->l_inheritedprio == pri)
+		return;
+
+	(*l->l_syncobj->sobj_inheritpri)(l, pri);
+}
+
+static inline void
 lwp_unsleep(struct lwp *l)
 {
 	LOCK_ASSERT(mutex_owned(l->l_mutex));
@@ -333,6 +350,13 @@ lwp_unsleep(struct lwp *l)
 	(*l->l_syncobj->sobj_unsleep)(l);
 }
 
+static inline int
+lwp_eprio(struct lwp *l)
+{
+
+	return MIN(l->l_inheritedprio, l->l_priority);
+}
+
 int newlwp(struct lwp *, struct proc *, vaddr_t, bool, int,
     void *, size_t, void (*)(void *), void *, struct lwp **);
 
Index: sys/queue.h
===================================================================
RCS file: /cvsroot/src/sys/sys/queue.h,v
retrieving revision 1.45
diff -u -p -r1.45 queue.h
--- sys/queue.h	7 Mar 2006 17:56:00 -0000	1.45
+++ sys/queue.h	22 Feb 2007 16:56:17 -0000
@@ -222,6 +222,11 @@ struct {								\
 	}								\
 } while (/*CONSTCOND*/0)
 
+#define	SLIST_REMOVE_AFTER(slistelm, field) do {			\
+	(slistelm)->field.sle_next =					\
+	    SLIST_NEXT(SLIST_NEXT((slistelm), field), field);		\
+} while (/*CONSTCOND*/0)
+
 #define	SLIST_FOREACH(var, head, field)					\
 	for((var) = (head)->slh_first; (var); (var) = (var)->field.sle_next)
 
Index: sys/sched.h
===================================================================
RCS file: /cvsroot/src/sys/sys/sched.h,v
retrieving revision 1.30
diff -u -p -r1.30 sched.h
--- sys/sched.h	15 Feb 2007 15:09:16 -0000	1.30
+++ sys/sched.h	22 Feb 2007 16:56:17 -0000
@@ -195,13 +195,18 @@ typedef struct syncobj {
 	u_int	sobj_flag;
 	void	(*sobj_unsleep)(struct lwp *);
 	void	(*sobj_changepri)(struct lwp *, int);
+	void	(*sobj_inheritpri)(struct lwp *, int);
+	struct lwp *(*sobj_owner)(volatile const void *); /* XXX wchan_t */
 } syncobj_t;
 
+struct lwp *syncobj_noowner(volatile const void *); /* XXX wchan_t */
+
 #define	SOBJ_SLEEPQ_SORTED	0x01
 #define	SOBJ_SLEEPQ_FIFO	0x02
 
 extern syncobj_t	sched_syncobj;
-extern syncobj_t	turnstile_syncobj;
+extern syncobj_t	mutex_syncobj;
+extern syncobj_t	rw_syncobj;
 
 #endif	/* _KERNEL */
 #endif	/* _SYS_SCHED_H_ */
Index: sys/sleepq.h
===================================================================
RCS file: /cvsroot/src/sys/sys/sleepq.h,v
retrieving revision 1.2
diff -u -p -r1.2 sleepq.h
--- sys/sleepq.h	9 Feb 2007 21:55:37 -0000	1.2
+++ sys/sleepq.h	22 Feb 2007 16:56:17 -0000
@@ -95,8 +95,12 @@ void	sleepq_timeout(void *);
 void	sleepq_wake(sleepq_t *, wchan_t, u_int);
 int	sleepq_abort(kmutex_t *, int);
 void	sleepq_changepri(struct lwp *, int);
+void	sleepq_inheritpri(struct lwp *, int);
 int	sleepq_unblock(int, int);
-void	sleepq_insert(sleepq_t *, struct lwp *, int, syncobj_t *);
+void	sleepq_insert(sleepq_t *, struct lwp *, syncobj_t *);
+
+void	sleepq_enqueue(sleepq_t *, int, wchan_t, const char *, syncobj_t *);
+void	sleepq_switch(int, int);
 
 void	sleeptab_init(sleeptab_t *);
 
@@ -189,6 +193,11 @@ typedef struct turnstile {
 	struct turnstile	*ts_free;	/* turnstile free list */
 	wchan_t			ts_obj;		/* lock object */
 	sleepq_t		ts_sleepq[2];	/* sleep queues */
+
+	/* priority inheritance */
+	u_char			ts_eprio;
+	struct lwp		*ts_inheritor;
+	SLIST_ENTRY(turnstile)	ts_pichain;
 } turnstile_t;
 
 #if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
@@ -222,7 +231,7 @@ typedef struct tschain {
 void	turnstile_init(void);
 turnstile_t	*turnstile_lookup(wchan_t);
 void	turnstile_exit(wchan_t);
-void	turnstile_block(turnstile_t *, int, wchan_t);
+void	turnstile_block(turnstile_t *, int, wchan_t, syncobj_t *);
 void	turnstile_wakeup(turnstile_t *, int, int, struct lwp *);
 void	turnstile_print(volatile void *, void (*)(const char *, ...));
 
@@ -232,6 +241,9 @@ turnstile_unblock(void)
 	(void)sleepq_unblock(0, 0);
 }
 
+void	turnstile_unsleep(struct lwp *);
+void	turnstile_changepri(struct lwp *, int);
+
 extern struct pool_cache turnstile_cache;
 extern struct turnstile turnstile0;
 
Index: kern/kern_condvar.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_condvar.c,v
retrieving revision 1.3
diff -u -p -r1.3 kern_condvar.c
--- kern/kern_condvar.c	11 Feb 2007 15:41:53 -0000	1.3
+++ kern/kern_condvar.c	22 Feb 2007 16:56:17 -0000
@@ -61,6 +61,8 @@ syncobj_t cv_syncobj = {
 	SOBJ_SLEEPQ_SORTED,
 	cv_unsleep,
 	cv_changepri,
+	sleepq_inheritpri,
+	syncobj_noowner,
 };
 
 /*
@@ -149,13 +151,13 @@ cv_changepri(struct lwp *l, int pri)
 
 	KASSERT(lwp_locked(l, sq->sq_mutex));
 
-	opri = l->l_priority;
+	opri = lwp_eprio(l);
 	l->l_usrpri = pri;
 	l->l_priority = sched_kpri(l);
 
-	if (l->l_priority != opri) {
+	if (lwp_eprio(l) != opri) {
 		TAILQ_REMOVE(&sq->sq_queue, l, l_sleepchain);
-		sleepq_insert(sq, l, pri, l->l_syncobj);
+		sleepq_insert(sq, l, l->l_syncobj);
 	}
 }
 
Index: kern/kern_ktrace.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_ktrace.c,v
retrieving revision 1.116
diff -u -p -r1.116 kern_ktrace.c
--- kern/kern_ktrace.c	22 Feb 2007 06:34:43 -0000	1.116
+++ kern/kern_ktrace.c	22 Feb 2007 16:56:17 -0000
@@ -688,7 +688,7 @@ ktrcsw(struct lwp *l, int out, int user)
 	 * Don't record context switches resulting from blocking on 
 	 * locks; it's too easy to get duff results.
 	 */
-	if (l->l_syncobj == &turnstile_syncobj)
+	if (l->l_syncobj == &mutex_syncobj || l->l_syncobj == &rw_syncobj)
 		return;
 
 	/*
Index: kern/kern_lwp.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_lwp.c,v
retrieving revision 1.59
diff -u -p -r1.59 kern_lwp.c
--- kern/kern_lwp.c	21 Feb 2007 23:48:13 -0000	1.59
+++ kern/kern_lwp.c	22 Feb 2007 16:56:17 -0000
@@ -504,11 +504,14 @@ newlwp(struct lwp *l1, struct proc *p2, 
 		l2 = pool_get(&lwp_pool, PR_WAITOK);
 		memset(l2, 0, sizeof(*l2));
 		l2->l_ts = pool_cache_get(&turnstile_cache, PR_WAITOK);
+		SLIST_INIT(&l2->l_pi_lenders);
+		l2->l_inheritedprio = MAXPRI;
 	} else {
 		l2 = isfree;
 		ts = l2->l_ts;
 		memset(l2, 0, sizeof(*l2));
 		l2->l_ts = ts;
+		KASSERT(SLIST_EMPTY(&l2->l_pi_lenders));
 	}
 
 	l2->l_stat = LSIDL;
@@ -782,6 +785,8 @@ lwp_free(struct lwp *l, int recycle, int
 	cpu_lwp_free2(l);
 #endif
 	uvm_lwp_exit(l);
+	KASSERT(SLIST_EMPTY(&l->l_pi_lenders));
+	KASSERT(l->l_inheritedprio == MAXPRI);
 	if (!recycle)
 		pool_put(&lwp_pool, l);
 	KERNEL_UNLOCK_ONE(curlwp);	/* XXXSMP */
@@ -1065,6 +1070,24 @@ lwp_relock(struct lwp *l, kmutex_t *new)
 #endif
 }
 
+int
+lwp_trylock(struct lwp *l)
+{
+#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
+	kmutex_t *old;
+
+	for (;;) {
+		if (!mutex_tryenter(old = l->l_mutex))
+			return 0;
+		if (__predict_true(l->l_mutex == old))
+			return 1;
+		mutex_spin_exit(old);
+	}
+#else
+	return mutex_tryenter(l->l_mutex);
+#endif
+}
+
 /*
  * Handle exceptions for mi_userret().  Called if a member of LW_USERRET is
  * set.
Index: kern/kern_mutex.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_mutex.c,v
retrieving revision 1.4
diff -u -p -r1.4 kern_mutex.c
--- kern/kern_mutex.c	15 Feb 2007 15:49:27 -0000	1.4
+++ kern/kern_mutex.c	22 Feb 2007 16:56:17 -0000
@@ -236,6 +236,7 @@ __strong_alias(mutex_spin_exit, mutex_ve
 void	mutex_abort(kmutex_t *, const char *, const char *);
 void	mutex_dump(volatile void *);
 int	mutex_onproc(uintptr_t, struct cpu_info **);
+static struct lwp *mutex_getowner(wchan_t); /* XXX naming conflict */
 
 lockops_t mutex_spin_lockops = {
 	"Mutex",
@@ -249,6 +250,14 @@ lockops_t mutex_adaptive_lockops = {
 	mutex_dump
 };
 
+syncobj_t mutex_syncobj = {
+	SOBJ_SLEEPQ_SORTED,
+	turnstile_unsleep,
+	turnstile_changepri,
+	sleepq_inheritpri,
+	mutex_getowner,
+};
+
 /*
  * mutex_dump:
  *
@@ -637,7 +646,7 @@ mutex_vector_enter(kmutex_t *mtx)
 
 		LOCKSTAT_START_TIMER(lsflag, slptime);
 
-		turnstile_block(ts, TS_WRITER_Q, mtx);
+		turnstile_block(ts, TS_WRITER_Q, mtx, &mutex_syncobj);
 
 		LOCKSTAT_STOP_TIMER(lsflag, slptime);
 		LOCKSTAT_COUNT(slpcnt, 1);
@@ -761,6 +770,14 @@ mutex_owner(kmutex_t *mtx)
 	return (struct lwp *)MUTEX_OWNER(mtx->mtx_owner);
 }
 
+static struct lwp *
+mutex_getowner(wchan_t obj)
+{
+	kmutex_t *mtx = (void *)(uintptr_t)obj; /* discard qualifiers */
+
+	return mutex_owner(mtx);
+}
+
 /*
  * mutex_tryenter:
  *
Index: kern/kern_proc.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_proc.c,v
retrieving revision 1.104
diff -u -p -r1.104 kern_proc.c
--- kern/kern_proc.c	21 Feb 2007 23:48:14 -0000	1.104
+++ kern/kern_proc.c	22 Feb 2007 16:56:17 -0000
@@ -355,6 +355,8 @@ proc0_init(void)
 	l->l_cpu = curcpu();
 	l->l_priority = PRIBIO;
 	l->l_usrpri = PRIBIO;
+	l->l_inheritedprio = MAXPRI;
+	SLIST_INIT(&l->l_pi_lenders);
 
 	callout_init(&l->l_tsleep_ch);
 	cv_init(&l->l_sigcv, "sigwait");
Index: kern/kern_rwlock.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_rwlock.c,v
retrieving revision 1.3
diff -u -p -r1.3 kern_rwlock.c
--- kern/kern_rwlock.c	10 Feb 2007 21:07:52 -0000	1.3
+++ kern/kern_rwlock.c	22 Feb 2007 16:56:17 -0000
@@ -145,6 +145,7 @@ __strong_alias(rw_exit, rw_vector_exit);
 #endif
 
 void	rw_dump(volatile void *);
+static struct lwp *rw_owner(wchan_t);
 
 lockops_t rwlock_lockops = {
 	"Reader / writer lock",
@@ -152,6 +153,14 @@ lockops_t rwlock_lockops = {
 	rw_dump
 };
 
+syncobj_t rw_syncobj = {
+	SOBJ_SLEEPQ_SORTED,
+	turnstile_unsleep,
+	turnstile_changepri,
+	sleepq_inheritpri,
+	rw_owner,
+};
+
 /*
  * rw_dump:
  *
@@ -299,7 +308,7 @@ rw_vector_enter(krwlock_t *rw, const krw
 
 		LOCKSTAT_START_TIMER(lsflag, slptime);
 
-		turnstile_block(ts, queue, rw);
+		turnstile_block(ts, queue, rw, &rw_syncobj);
 
 		/* If we wake up and arrive here, we've been handed the lock. */
 		RW_RECEIVE(rw);
@@ -670,3 +679,15 @@ rw_lock_held(krwlock_t *rw)
 
 	return (rw->rw_owner & RW_THREAD) != 0;
 }
+
+static struct lwp *
+rw_owner(wchan_t obj)
+{
+	krwlock_t *rw = (void *)(uintptr_t)obj; /* discard qualifiers */
+	uintptr_t owner = rw->rw_owner;
+
+	if ((owner & RW_WRITE_LOCKED) == 0)
+		return NULL;
+
+	return (void *)(owner & RW_THREAD);
+}
Index: kern/kern_sleepq.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_sleepq.c,v
retrieving revision 1.5
diff -u -p -r1.5 kern_sleepq.c
--- kern/kern_sleepq.c	17 Feb 2007 22:31:43 -0000	1.5
+++ kern/kern_sleepq.c	22 Feb 2007 16:56:17 -0000
@@ -177,7 +177,7 @@ sleepq_remove(sleepq_t *sq, struct lwp *
 	l->l_slptime = 0;
 	if ((l->l_flag & LW_INMEM) != 0) {
 		setrunqueue(l);
-		if (l->l_priority < ci->ci_schedstate.spc_curpriority)
+		if (lwp_eprio(l) < ci->ci_schedstate.spc_curpriority)
 			cpu_need_resched(ci);
 		sched_unlock(1);
 		return 0;
@@ -193,13 +193,14 @@ sleepq_remove(sleepq_t *sq, struct lwp *
  *	Insert an LWP into the sleep queue, optionally sorting by priority.
  */
 inline void
-sleepq_insert(sleepq_t *sq, struct lwp *l, int pri, syncobj_t *sobj)
+sleepq_insert(sleepq_t *sq, struct lwp *l, syncobj_t *sobj)
 {
 	struct lwp *l2;
+	const int pri = lwp_eprio(l);
 
 	if ((sobj->sobj_flag & SOBJ_SLEEPQ_SORTED) != 0) {
 		TAILQ_FOREACH(l2, &sq->sq_queue, l_sleepchain) {
-			if (l2->l_priority > pri) {
+			if (lwp_eprio(l2) > pri) {
 				TAILQ_INSERT_BEFORE(l2, l, l_sleepchain);
 				return;
 			}
@@ -209,19 +210,9 @@ sleepq_insert(sleepq_t *sq, struct lwp *
 	TAILQ_INSERT_TAIL(&sq->sq_queue, l, l_sleepchain);
 }
 
-/*
- * sleepq_block:
- *
- *	Enter an LWP into the sleep queue and prepare for sleep.  The sleep
- *	queue must already be locked, and any interlock (such as the kernel
- *	lock) must have be released (see sleeptab_lookup(), sleepq_enter()).
- *
- * 	sleepq_block() may return early under exceptional conditions, for
- * 	example if the LWP's containing process is exiting.
- */
 void
-sleepq_block(sleepq_t *sq, int pri, wchan_t wchan, const char *wmesg, int timo,
-	     int catch, syncobj_t *sobj)
+sleepq_enqueue(sleepq_t *sq, int pri, wchan_t wchan, const char *wmesg,
+    syncobj_t *sobj)
 {
 	struct lwp *l = curlwp;
 
@@ -240,7 +231,13 @@ sleepq_block(sleepq_t *sq, int pri, wcha
 	l->l_nvcsw++;
 
 	sq->sq_waiters++;
-	sleepq_insert(sq, l, pri, sobj);
+	sleepq_insert(sq, l, sobj);
+}
+
+void
+sleepq_switch(int timo, int catch)
+{
+	struct lwp *l = curlwp;
 
 #ifdef KTRACE
 	if (KTRPOINT(l->l_proc, KTR_CSW))
@@ -281,6 +278,25 @@ sleepq_block(sleepq_t *sq, int pri, wcha
 }
 
 /*
+ * sleepq_block:
+ *
+ *	Enter an LWP into the sleep queue and prepare for sleep.  The sleep
+ *	queue must already be locked, and any interlock (such as the kernel
+ *	lock) must have been released (see sleeptab_lookup(), sleepq_enter()).
+ *
+ * 	sleepq_block() may return early under exceptional conditions, for
+ * 	example if the LWP's containing process is exiting.
+ */
+void
+sleepq_block(sleepq_t *sq, int pri, wchan_t wchan, const char *wmesg, int timo,
+	     int catch, syncobj_t *sobj)
+{
+
+	sleepq_enqueue(sq, pri, wchan, wmesg, sobj);
+	sleepq_switch(timo, catch);
+}
+
+/*
  * sleepq_unblock:
  *
  *	After any intermediate step such as updating statistics, re-acquire
@@ -484,3 +500,21 @@ sleepq_changepri(struct lwp *l, int pri)
 	KASSERT(lwp_locked(l, l->l_sleepq->sq_mutex));
 	l->l_usrpri = pri;
 }
+
+void
+sleepq_inheritpri(struct lwp *l, int pri)
+{
+	sleepq_t *sq = l->l_sleepq;
+	int opri;
+
+	KASSERT(lwp_locked(l, sq->sq_mutex));
+
+	opri = lwp_eprio(l);
+	l->l_inheritedprio = pri;
+
+	if (lwp_eprio(l) != opri &&
+	    (l->l_syncobj->sobj_flag & SOBJ_SLEEPQ_SORTED) != 0) {
+		TAILQ_REMOVE(&sq->sq_queue, l, l_sleepchain);
+		sleepq_insert(sq, l, l->l_syncobj);
+	}
+}
Index: kern/kern_synch.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_synch.c,v
retrieving revision 1.182
diff -u -p -r1.182 kern_synch.c
--- kern/kern_synch.c	21 Feb 2007 23:48:15 -0000	1.182
+++ kern/kern_synch.c	22 Feb 2007 16:56:17 -0000
@@ -120,6 +120,7 @@ void	updatepri(struct lwp *);
 
 void	sched_unsleep(struct lwp *);
 void	sched_changepri(struct lwp *, int);
+void	sched_inheritpri(struct lwp *, int);
 
 struct callout schedcpu_ch = CALLOUT_INITIALIZER_SETFUNC(schedcpu, NULL);
 static unsigned int schedcpu_ticks;
@@ -127,13 +128,17 @@ static unsigned int schedcpu_ticks;
 syncobj_t sleep_syncobj = {
 	SOBJ_SLEEPQ_SORTED,
 	sleepq_unsleep,
-	sleepq_changepri
+	sleepq_changepri,
+	sleepq_inheritpri,
+	syncobj_noowner,
 };
 
 syncobj_t sched_syncobj = {
 	SOBJ_SLEEPQ_SORTED,
 	sched_unsleep,
-	sched_changepri
+	sched_changepri,
+	sched_inheritpri,
+	syncobj_noowner,
 };
 
 /*
@@ -743,9 +748,10 @@ rqinit()
 }
 
 static inline void
-resched_lwp(struct lwp *l, u_char pri)
+resched_lwp(struct lwp *l)
 {
 	struct cpu_info *ci;
+	const int pri = lwp_eprio(l);
 
 	/*
 	 * XXXSMP
@@ -863,7 +869,7 @@ setrunnable(struct lwp *l)
 
 	if (l->l_flag & LW_INMEM) {
 		setrunqueue(l);
-		resched_lwp(l, l->l_priority);
+		resched_lwp(l);
 		lwp_unlock(l);
 	} else {
 		lwp_unlock(l);
@@ -1114,11 +1120,10 @@ sched_changepri(struct lwp *l, int pri)
 	LOCK_ASSERT(lwp_locked(l, &sched_mutex));
 
 	l->l_usrpri = pri;
-
 	if (l->l_priority < PUSER)
 		return;
-	if (l->l_stat != LSRUN || (l->l_flag & LW_INMEM) == 0 ||
-	    (l->l_priority / PPQ) == (pri / PPQ)) {
+
+	if (l->l_stat != LSRUN || (l->l_flag & LW_INMEM) == 0) {
 		l->l_priority = pri;
 		return;
 	}
@@ -1126,7 +1131,31 @@ sched_changepri(struct lwp *l, int pri)
 	remrunqueue(l);
 	l->l_priority = pri;
 	setrunqueue(l);
-	resched_lwp(l, pri);
+	resched_lwp(l);
+}
+
+void
+sched_inheritpri(struct lwp *l, int pri)
+{
+
+	LOCK_ASSERT(lwp_locked(l, &sched_mutex));
+
+	if (l->l_stat != LSRUN || (l->l_flag & LW_INMEM) == 0) {
+		l->l_inheritedprio = pri;
+		return;
+	}
+
+	remrunqueue(l);
+	l->l_inheritedprio = pri;
+	setrunqueue(l);
+	resched_lwp(l);
+}
+
+struct lwp *
+syncobj_noowner(wchan_t wchan)
+{
+
+	return NULL;
 }
 
 /*
@@ -1218,7 +1247,7 @@ setrunqueue(struct lwp *l)
 {
 	struct prochd *rq;
 	struct lwp *prev;
-	const int whichq = l->l_priority / PPQ;
+	const int whichq = lwp_eprio(l) / PPQ;
 
 	LOCK_ASSERT(lwp_locked(l, &sched_mutex));
 
@@ -1251,7 +1280,7 @@ void
 remrunqueue(struct lwp *l)
 {
 	struct lwp *prev, *next;
-	const int whichq = l->l_priority / PPQ;
+	const int whichq = lwp_eprio(l) / PPQ;
 
 	LOCK_ASSERT(lwp_locked(l, &sched_mutex));
 
Index: kern/kern_turnstile.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_turnstile.c,v
retrieving revision 1.3
diff -u -p -r1.3 kern_turnstile.c
--- kern/kern_turnstile.c	15 Feb 2007 20:21:13 -0000	1.3
+++ kern/kern_turnstile.c	22 Feb 2007 16:56:17 -0000
@@ -63,8 +63,7 @@
  * grabs a free turnstile off the free list.  Otherwise, it can take back
  * the active turnstile from the lock (thus deactivating the turnstile).
  *
- * Turnstiles are the place to do priority inheritence.  However, we do
- * not currently implement that.
+ * Turnstiles are the place to do priority inheritance.
  */
 
 #include <sys/cdefs.h>
@@ -77,6 +76,7 @@ __KERNEL_RCSID(0, "$NetBSD: kern_turnsti
 
 #include <sys/param.h>
 #include <sys/lock.h>
+#include <sys/lockdebug.h>
 #include <sys/pool.h>
 #include <sys/proc.h> 
 #include <sys/sleepq.h>
@@ -94,16 +94,13 @@ struct pool turnstile_pool;
 struct pool_cache turnstile_cache;
 
 int	turnstile_ctor(void *, void *, int);
-void	turnstile_unsleep(struct lwp *);
-void	turnstile_changepri(struct lwp *, int);
 
 extern turnstile_t turnstile0;
-
-syncobj_t turnstile_syncobj = {
-	SOBJ_SLEEPQ_FIFO,
-	turnstile_unsleep,
-	turnstile_changepri
-};
+#if 1
+int pi_lend1;
+int pi_lend2;
+int pi_waive;
+#endif
 
 /*
  * turnstile_init:
@@ -231,15 +228,18 @@ turnstile_exit(wchan_t obj)
  *	 LWP for sleep.
  */
 void
-turnstile_block(turnstile_t *ts, int q, wchan_t obj)
+turnstile_block(turnstile_t *ts, int q, wchan_t obj, syncobj_t *sobj)
 {
 	struct lwp *l;
+	struct lwp *cur; /* cached curlwp */
+	struct lwp *owner;
 	turnstile_t *ots;
 	tschain_t *tc;
 	sleepq_t *sq;
+	u_char prio;
 
 	tc = &turnstile_tab[TS_HASH(obj)];
-	l = curlwp;
+	l = cur = curlwp;
 
 	KASSERT(q == TS_READER_Q || q == TS_WRITER_Q);
 	KASSERT(mutex_owned(tc->tc_mutex));
@@ -255,6 +255,7 @@ turnstile_block(turnstile_t *ts, int q, 
 		KASSERT(TAILQ_EMPTY(&ts->ts_sleepq[TS_READER_Q].sq_queue) &&
 			TAILQ_EMPTY(&ts->ts_sleepq[TS_WRITER_Q].sq_queue));
 		ts->ts_obj = obj;
+		ts->ts_inheritor = NULL;
 		ts->ts_sleepq[TS_READER_Q].sq_mutex = tc->tc_mutex;
 		ts->ts_sleepq[TS_WRITER_Q].sq_mutex = tc->tc_mutex;
 		LIST_INSERT_HEAD(&tc->tc_chain, ts, ts_chain);
@@ -269,6 +270,7 @@ turnstile_block(turnstile_t *ts, int q, 
 		ts->ts_free = ots;
 		l->l_ts = ts;
 
+		KASSERT(ts->ts_obj == obj);
 		KASSERT(TS_ALL_WAITERS(ts) != 0);
 		KASSERT(!TAILQ_EMPTY(&ts->ts_sleepq[TS_READER_Q].sq_queue) ||
 			!TAILQ_EMPTY(&ts->ts_sleepq[TS_WRITER_Q].sq_queue));
@@ -276,8 +278,71 @@ turnstile_block(turnstile_t *ts, int q, 
 
 	sq = &ts->ts_sleepq[q];
 	sleepq_enter(sq, l);
-	sleepq_block(sq, sched_kpri(l), obj, "tstile", 0, 0,
-	    &turnstile_syncobj);
+	LOCKDEBUG_BARRIER(tc->tc_mutex, 1);
+	prio = lwp_eprio(l);
+	sleepq_enqueue(sq, prio, obj, "tstile", sobj);
+
+	/*
+	 * lend our priority to lwps on the blocking chain.
+	 */
+
+	for (;;) {
+		bool dolock;
+
+		if (l->l_wchan == NULL)
+			break;
+
+		owner = (*l->l_syncobj->sobj_owner)(l->l_wchan);
+		if (owner == NULL)
+			break;
+
+		KASSERT(l != owner);
+		KASSERT(cur != owner);
+
+		if (l->l_mutex != owner->l_mutex)
+			dolock = true;
+		else
+			dolock = false;
+		if (dolock && !lwp_trylock(owner)) {
+			/*
+			 * restart from curlwp.
+			 */
+			lwp_unlock(l);
+			l = cur;
+			lwp_lock(l);
+			prio = lwp_eprio(l);
+			continue;
+		}
+		if (prio >= lwp_eprio(owner)) {
+			if (dolock)
+				lwp_unlock(owner);
+			break;
+		}
+		ts = l->l_ts;
+		KASSERT(ts->ts_inheritor == owner || ts->ts_inheritor == NULL);
+		if (ts->ts_inheritor == NULL) {
+			ts->ts_inheritor = owner;
+			ts->ts_eprio = prio;
+			SLIST_INSERT_HEAD(&owner->l_pi_lenders, ts, ts_pichain);
+			lwp_inheritpri(owner, prio);
+			pi_lend1++;
+		} else if (prio < ts->ts_eprio) {
+			ts->ts_eprio = prio;
+			lwp_inheritpri(owner, prio);
+			pi_lend2++;
+		}
+		if (dolock)
+			lwp_unlock(l);
+		l = owner;
+	}
+	LOCKDEBUG_BARRIER(l->l_mutex, 1);
+	if (cur->l_mutex != l->l_mutex) {
+		lwp_unlock(l);
+		lwp_lock(cur);
+	}
+	LOCKDEBUG_BARRIER(cur->l_mutex, 1);
+
+	sleepq_switch(0, 0);
 }
 
 /*
@@ -301,6 +366,53 @@ turnstile_wakeup(turnstile_t *ts, int q,
 	KASSERT(q == TS_READER_Q || q == TS_WRITER_Q);
 	KASSERT(count > 0 && count <= TS_WAITERS(ts, q));
 	KASSERT(mutex_owned(tc->tc_mutex) && sq->sq_mutex == tc->tc_mutex);
+	KASSERT(ts->ts_inheritor == curlwp || ts->ts_inheritor == NULL);
+
+	/*
+	 * restore inherited priority if necessary.
+	 */
+
+	if (ts->ts_inheritor != NULL) {
+		turnstile_t *iter;
+		turnstile_t *next;
+		turnstile_t *prev = NULL;
+		u_char prio;
+
+		ts->ts_inheritor = NULL;
+		l = curlwp;
+		lwp_lock(l);
+
+		/*
+		 * the following loop does two things.
+		 *
+		 * - remove ts from the list.
+		 *
+		 * - from the rest of the list, find the highest priority.
+		 */
+
+		pi_waive++;
+		prio = MAXPRI;
+		KASSERT(!SLIST_EMPTY(&l->l_pi_lenders));
+		for (iter = SLIST_FIRST(&l->l_pi_lenders);
+		    iter != NULL; iter = next) {
+			KASSERT(lwp_eprio(l) <= ts->ts_eprio);
+			next = SLIST_NEXT(iter, ts_pichain);
+			if (iter == ts) {
+				if (prev == NULL) {
+					SLIST_REMOVE_HEAD(&l->l_pi_lenders,
+					    ts_pichain);
+				} else {
+					SLIST_REMOVE_AFTER(prev, ts_pichain);
+				}
+			} else if (prio > iter->ts_eprio) {
+				prio = iter->ts_eprio;
+			}
+			prev = iter;
+		}
+
+		lwp_inheritpri(l, prio);
+		lwp_unlock(l);
+	}
 
 	if (nl != NULL) {
 #if defined(DEBUG) || defined(LOCKDEBUG)
@@ -348,16 +460,14 @@ turnstile_unsleep(struct lwp *l)
 /*
  * turnstile_changepri:
  *
- *	Adjust the priority of an LWP residing on a turnstile.  Since we do
- *	not yet do priority inheritance, we mostly ignore this action.
+ *	Adjust the priority of an LWP residing on a turnstile.
  */
 void
 turnstile_changepri(struct lwp *l, int pri)
 {
 
-	/* LWPs on turnstiles always have kernel priority. */
-	l->l_usrpri = pri;
-	l->l_priority = sched_kpri(l);
+	/* XXX priority inheritance */
+	sleepq_changepri(l, pri);
 }
 
 #if defined(LOCKDEBUG)
Index: kern/sys_lwp.c
===================================================================
RCS file: /cvsroot/src/sys/kern/sys_lwp.c,v
retrieving revision 1.6
diff -u -p -r1.6 sys_lwp.c
--- kern/sys_lwp.c	21 Feb 2007 23:48:15 -0000	1.6
+++ kern/sys_lwp.c	22 Feb 2007 16:56:18 -0000
@@ -61,7 +61,9 @@ __KERNEL_RCSID(0, "$NetBSD: sys_lwp.c,v 
 syncobj_t lwp_park_sobj = {
 	SOBJ_SLEEPQ_SORTED,
 	sleepq_unsleep,
-	sleepq_changepri
+	sleepq_changepri,
+	sleepq_inheritpri,
+	syncobj_noowner,
 };
 
 sleeptab_t	lwp_park_tab;
Index: uvm/uvm_meter.c
===================================================================
RCS file: /cvsroot/src/sys/uvm/uvm_meter.c,v
retrieving revision 1.46
diff -u -p -r1.46 uvm_meter.c
--- uvm/uvm_meter.c	17 Feb 2007 22:31:45 -0000	1.46
+++ uvm/uvm_meter.c	22 Feb 2007 16:56:18 -0000
@@ -378,7 +378,7 @@ uvm_total(struct vmtotal *totalp)
 		case LSSLEEP:
 		case LSSTOP:
 			if (l->l_flag & LW_INMEM) {
-				if (l->l_priority <= PZERO)
+				if (lwp_eprio(l) <= PZERO)
 					totalp->t_dw++;
 				else if (l->l_slptime < maxslp)
 					totalp->t_sl++;

--NextPart-20070223020650-0958901--