tech-kern archive
Re: Problem with outstanding knotes and device detach - and a fix
> On Jul 13, 2022, at 7:18 PM, Jason Thorpe <thorpej%me.com@localhost> wrote:
>
> Ok, new version. Main differences:
Aaaaand another new version. This:
==> Creates a knote_impl structure, private to kern_event.c, that holds the new lock (a standalone sketch of this embedding pattern follows below). I took the opportunity to move kn_influx into knote_impl as well, since absolutely no one outside of kern_event.c should touch it. This change is ABI-compatible.
==> Improves the comments about the locking rules.
There are no functional changes since the last patch.
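For anyone who wants to poke at the embedding trick outside the kernel, here is a standalone userland sketch of the same pattern: the public structure sits as the first member of a private impl structure, and container_of() recovers the private state from the public pointer. The widget/widget_impl names are invented for the example and container_of() is defined locally so it builds with a plain C compiler; none of this is part of the patch.

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Public view: the only thing other compilation units ever see. */
struct widget {
	int w_value;
};

/* Private view: lives in exactly one .c file, invisible to the ABI. */
struct widget_impl {
	struct widget	wi_widget;	/* embedded public part, first, as ki_knote is */
	unsigned int	wi_influx;	/* private state, like ki_influx */
};

#define WIMPL_TO_WIDGET(wip)	(&(wip)->wi_widget)
#define WIDGET_TO_WIMPL(wp)	container_of((wp), struct widget_impl, wi_widget)

static struct widget *
widget_alloc(void)
{
	struct widget_impl *wi = calloc(1, sizeof(*wi));

	return wi != NULL ? WIMPL_TO_WIDGET(wi) : NULL;
}

static void
widget_free(struct widget *w)
{
	free(WIDGET_TO_WIMPL(w));
}

int
main(void)
{
	struct widget *w = widget_alloc();

	if (w == NULL)
		return 1;
	WIDGET_TO_WIMPL(w)->wi_influx++;	/* reach the private state */
	printf("influx = %u\n", WIDGET_TO_WIMPL(w)->wi_influx);
	widget_free(w);
	return 0;
}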
Index: kern/kern_event.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_event.c,v
retrieving revision 1.143
diff -u -p -r1.143 kern_event.c
--- kern/kern_event.c 13 Jul 2022 14:11:46 -0000 1.143
+++ kern/kern_event.c 17 Jul 2022 19:44:06 -0000
@@ -120,6 +120,61 @@ static void filt_userdetach(struct knote
static int filt_user(struct knote *, long hint);
static int filt_usertouch(struct knote *, struct kevent *, long type);
+/*
+ * Private knote state that should never be exposed outside
+ * of kern_event.c
+ *
+ * Field locking:
+ *
+ * q kn_kq->kq_lock
+ */
+struct knote_impl {
+ struct knote ki_knote;
+ unsigned int ki_influx; /* q: in-flux counter */
+ kmutex_t ki_foplock; /* for kn_filterops */
+};
+
+#define KIMPL_TO_KNOTE(kip) (&(kip)->ki_knote)
+#define KNOTE_TO_KIMPL(knp) container_of((knp), struct knote_impl, ki_knote)
+
+static inline struct knote *
+knote_alloc(bool sleepok)
+{
+ struct knote_impl *ki;
+
+ ki = kmem_zalloc(sizeof(*ki), sleepok ? KM_SLEEP : KM_NOSLEEP);
+ mutex_init(&ki->ki_foplock, MUTEX_DEFAULT, IPL_NONE);
+
+ return KIMPL_TO_KNOTE(ki);
+}
+
+static inline void
+knote_free(struct knote *kn)
+{
+ struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
+
+ mutex_destroy(&ki->ki_foplock);
+ kmem_free(ki, sizeof(*ki));
+}
+
+static inline void
+knote_foplock_enter(struct knote *kn)
+{
+ mutex_enter(&KNOTE_TO_KIMPL(kn)->ki_foplock);
+}
+
+static inline void
+knote_foplock_exit(struct knote *kn)
+{
+ mutex_exit(&KNOTE_TO_KIMPL(kn)->ki_foplock);
+}
+
+static inline bool
+knote_foplock_owned(struct knote *kn)
+{
+ return mutex_owned(&KNOTE_TO_KIMPL(kn)->ki_foplock);
+}
+
static const struct fileops kqueueops = {
.fo_name = "kqueue",
.fo_read = (void *)enxio,
@@ -133,6 +188,31 @@ static const struct fileops kqueueops =
.fo_restart = kqueue_restart,
};
+static void
+filt_nopdetach(struct knote *kn __unused)
+{
+}
+
+static int
+filt_nopevent(struct knote *kn __unused, long hint __unused)
+{
+ return 0;
+}
+
+static const struct filterops nop_fd_filtops = {
+ .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
+ .f_attach = NULL,
+ .f_detach = filt_nopdetach,
+ .f_event = filt_nopevent,
+};
+
+static const struct filterops nop_filtops = {
+ .f_flags = FILTEROP_MPSAFE,
+ .f_attach = NULL,
+ .f_detach = filt_nopdetach,
+ .f_event = filt_nopevent,
+};
+
static const struct filterops kqread_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
@@ -232,15 +312,41 @@ static size_t user_kfiltersz; /* size
* -> object lock (e.g., device driver lock, &c.)
* -> kn_kq->kq_lock
*
- * Locking rules:
+ * Locking rules. ==> indicates the lock is acquired by the backing
+ * object; locks listed before it are acquired before calling filter ops:
+ *
+ * f_attach: fdp->fd_lock -> knote foplock ->
+ * (maybe) KERNEL_LOCK ==> backing object lock
+ *
+ * f_detach: fdp->fd_lock -> knote foplock ->
+ * (maybe) KERNEL_LOCK ==> backing object lock
+ *
+ * f_event via kevent: fdp->fd_lock -> knote foplock ->
+ * (maybe) KERNEL_LOCK ==> backing object lock
+ * N.B. NOTE_SUBMIT will never be set in the "hint" argument
+ * in this case.
*
- * f_attach: fdp->fd_lock, KERNEL_LOCK
- * f_detach: fdp->fd_lock, KERNEL_LOCK
- * f_event(!NOTE_SUBMIT) via kevent: fdp->fd_lock, _no_ object lock
- * f_event via knote: whatever caller guarantees
- * Typically, f_event(NOTE_SUBMIT) via knote: object lock
- * f_event(!NOTE_SUBMIT) via knote: nothing,
- * acquires/releases object lock inside.
+ * f_event via knote (via backing object): Whatever caller guarantees.
+ * Typically:
+ * f_event(NOTE_SUBMIT): caller has already acquired backing
+ * object lock.
+ * f_event(!NOTE_SUBMIT): caller has not acquired the backing
+ * object lock, or has possibly acquired KERNEL_LOCK. Backing object
+ * lock may or may not be acquired as-needed.
+ * N.B. the knote foplock will **not** be acquired in this case. The
+ * caller guarantees that klist_fini() will not be called concurrently
+ * with knote().
+ *
+ * f_touch: fdp->fd_lock -> kn_kq->kq_lock (spin lock)
+ * N.B. knote foplock is **not** acquired in this case and
+ * the caller must guarantee that klist_fini() will never
+ * be called. kqueue_register() restricts filters that
+ * provide f_touch to known-safe cases.
+ *
+ * klist_fini(): Caller must guarantee that no more knotes can
+ * be attached to the klist, and must **not** hold the backing
+ * object's lock; klist_fini() itself will acquire the foplock
+ * of each knote on the klist.
*
* Locking rules when detaching knotes:
*
@@ -286,7 +392,7 @@ static inline bool
kn_in_flux(struct knote *kn)
{
KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
- return kn->kn_influx != 0;
+ return KNOTE_TO_KIMPL(kn)->ki_influx != 0;
}
static inline bool
@@ -298,8 +404,9 @@ kn_enter_flux(struct knote *kn)
return false;
}
- KASSERT(kn->kn_influx < UINT_MAX);
- kn->kn_influx++;
+ struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
+ KASSERT(ki->ki_influx < UINT_MAX);
+ ki->ki_influx++;
return true;
}
@@ -308,14 +415,17 @@ static inline bool
kn_leave_flux(struct knote *kn)
{
KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
- KASSERT(kn->kn_influx > 0);
- kn->kn_influx--;
- return kn->kn_influx == 0;
+
+ struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
+ KASSERT(ki->ki_influx > 0);
+ ki->ki_influx--;
+ return ki->ki_influx == 0;
}
static void
kn_wait_flux(struct knote *kn, bool can_loop)
{
+ struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
bool loop;
KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
@@ -325,7 +435,7 @@ kn_wait_flux(struct knote *kn, bool can_
* dropping the kq_lock. The caller has let us know in
* 'can_loop'.
*/
- for (loop = true; loop && kn->kn_influx != 0; loop = can_loop) {
+ for (loop = true; loop && ki->ki_influx != 0; loop = can_loop) {
KQ_FLUX_WAIT(kn->kn_kq);
}
}
@@ -423,33 +533,31 @@ knote_detach_quiesce(struct knote *kn)
return false;
}
-static inline struct knote *
-knote_alloc(bool sleepok)
-{
- struct knote *kn;
-
- kn = kmem_zalloc(sizeof(*kn), sleepok ? KM_SLEEP : KM_NOSLEEP);
-
- return kn;
-}
-
-static inline void
-knote_free(struct knote *kn)
-{
- kmem_free(kn, sizeof(*kn));
-}
+/*
+ * Calls into the filterops need to be resilient against things which
+ * destroy a klist, e.g. device detach, freeing a vnode, etc., to avoid
+ * chasing garbage pointers (to data, or even potentially code in a
+ * module about to be unloaded). To that end, we acquire the
+ * knote foplock before calling into the filter ops. When a driver
+ * (or anything else) is tearing down its klist, klist_fini() enumerates
+ * each knote, acquires its foplock, and replaces the filterops with a
+ * nop stub, allowing knote detach (when descriptors are closed) to safely
+ * proceed.
+ */
static int
filter_attach(struct knote *kn)
{
int rv;
+ KASSERT(knote_foplock_owned(kn));
KASSERT(kn->kn_fop != NULL);
KASSERT(kn->kn_fop->f_attach != NULL);
/*
* N.B. that kn->kn_fop may change as the result of calling
- * f_attach().
+ * f_attach(). After f_attach() returns, kn->kn_fop may not
+ * be modified by code outside of klist_fini().
*/
if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
rv = kn->kn_fop->f_attach(kn);
@@ -465,6 +573,8 @@ filter_attach(struct knote *kn)
static void
filter_detach(struct knote *kn)
{
+
+ KASSERT(knote_foplock_owned(kn));
KASSERT(kn->kn_fop != NULL);
KASSERT(kn->kn_fop->f_detach != NULL);
@@ -478,10 +588,12 @@ filter_detach(struct knote *kn)
}
static int
-filter_event(struct knote *kn, long hint)
+filter_event(struct knote *kn, long hint, bool submitting)
{
int rv;
+ /* See knote(). */
+ KASSERT(submitting || knote_foplock_owned(kn));
KASSERT(kn->kn_fop != NULL);
KASSERT(kn->kn_fop->f_event != NULL);
@@ -499,6 +611,19 @@ filter_event(struct knote *kn, long hint
static int
filter_touch(struct knote *kn, struct kevent *kev, long type)
{
+
+ /*
+ * XXX We cannot assert that the knote foplock is held here
+ * XXX because we cannot safely acquire it in all cases
+ * XXX where "touch" will be used in kqueue_scan(). We just
+ * XXX have to assume that f_touch will always be safe to call,
+ * XXX and kqueue_register() allows only the two known-safe
+ * XXX users of that op.
+ */
+
+ KASSERT(kn->kn_fop != NULL);
+ KASSERT(kn->kn_fop->f_touch != NULL);
+
return kn->kn_fop->f_touch(kn, kev, type);
}
@@ -1849,6 +1974,17 @@ kqueue_register(struct kqueue *kq, struc
KASSERT(kn->kn_fop != NULL);
/*
+ * XXX Allow only known-safe users of f_touch.
+ * XXX See filter_touch() for details.
+ */
+ if (kn->kn_fop->f_touch != NULL &&
+ kn->kn_fop != &timer_filtops &&
+ kn->kn_fop != &user_filtops) {
+ error = ENOTSUP;
+ goto fail_ev_add;
+ }
+
+ /*
* apply reference count to knote structure, and
* do not release it at the end of this routine.
*/
@@ -1880,6 +2016,7 @@ kqueue_register(struct kqueue *kq, struc
* N.B. kn->kn_fop may change as the result
* of filter_attach()!
*/
+ knote_foplock_enter(kn);
error = filter_attach(kn);
if (error != 0) {
#ifdef DEBUG
@@ -1893,6 +2030,7 @@ kqueue_register(struct kqueue *kq, struc
ft ? ft->f_ops->fo_name : "?", error);
#endif
+ fail_ev_add:
/*
* N.B. no need to check for this note to
* be in-flux, since it was never visible
@@ -1900,6 +2038,7 @@ kqueue_register(struct kqueue *kq, struc
*
* knote_detach() drops fdp->fd_lock
*/
+ knote_foplock_exit(kn);
mutex_enter(&kq->kq_lock);
KNOTE_WILLDETACH(kn);
KASSERT(kn_in_flux(kn) == false);
@@ -1957,6 +2096,7 @@ kqueue_register(struct kqueue *kq, struc
* initial EV_ADD, but doing so will not reset any
* filter which have already been triggered.
*/
+ knote_foplock_enter(kn);
kn->kn_kevent.udata = kev->udata;
KASSERT(kn->kn_fop != NULL);
if (!(kn->kn_fop->f_flags & FILTEROP_ISFD) &&
@@ -1967,6 +2107,7 @@ kqueue_register(struct kqueue *kq, struc
if (__predict_false(error != 0)) {
/* Never a new knote (which would consume newkn). */
KASSERT(newkn != NULL);
+ knote_foplock_exit(kn);
goto doneunlock;
}
} else {
@@ -1981,10 +2122,12 @@ kqueue_register(struct kqueue *kq, struc
* broken and does not return an error.
*/
done_ev_add:
- rv = filter_event(kn, 0);
+ rv = filter_event(kn, 0, false);
if (rv)
knote_activate(kn);
+ knote_foplock_exit(kn);
+
/* disable knote */
if ((kev->flags & EV_DISABLE)) {
mutex_spin_enter(&kq->kq_lock);
@@ -2256,7 +2399,9 @@ relock:
if ((kn->kn_flags & EV_ONESHOT) == 0) {
mutex_spin_exit(&kq->kq_lock);
KASSERT(mutex_owned(&fdp->fd_lock));
- rv = filter_event(kn, 0);
+ knote_foplock_enter(kn);
+ rv = filter_event(kn, 0, false);
+ knote_foplock_exit(kn);
mutex_spin_enter(&kq->kq_lock);
/* Re-poll if note was re-enqueued. */
if ((kn->kn_status & KN_QUEUED) != 0) {
@@ -2615,7 +2760,15 @@ knote(struct klist *list, long hint)
struct knote *kn, *tmpkn;
SLIST_FOREACH_SAFE(kn, list, kn_selnext, tmpkn) {
- if (filter_event(kn, hint)) {
+ /*
+ * We assume here that the backing object's lock is
+ * already held if we're traversing the klist, and
+ * so acquiring the knote foplock would create a
+ * deadlock scenario. But we also know that the klist
+ * won't disappear on us while we're here, so not
+ * acquiring it is safe.
+ */
+ if (filter_event(kn, hint, true)) {
knote_activate(kn);
}
}
@@ -2664,7 +2817,9 @@ knote_detach(struct knote *kn, filedesc_
/* Remove from monitored object. */
if (dofop) {
+ knote_foplock_enter(kn);
filter_detach(kn);
+ knote_foplock_exit(kn);
}
/* Remove from descriptor table. */
@@ -2829,7 +2984,26 @@ klist_init(struct klist *list)
void
klist_fini(struct klist *list)
{
- /* Nothing, for now. */
+ struct knote *kn;
+
+ /*
+ * Neuter all existing knotes on the klist because the list is
+ * being destroyed. The caller has guaranteed that no additional
+ * knotes will be added to the list, that the backing object's
+ * locks are not held (otherwise there is a locking order issue
+ * with acquiring the knote foplock), and that we can traverse
+ * the list safely in this state.
+ */
+ SLIST_FOREACH(kn, list, kn_selnext) {
+ knote_foplock_enter(kn);
+ KASSERT(kn->kn_fop != NULL);
+ if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
+ kn->kn_fop = &nop_fd_filtops;
+ } else {
+ kn->kn_fop = &nop_filtops;
+ }
+ knote_foplock_exit(kn);
+ }
}
/*
Index: sys/event.h
===================================================================
RCS file: /cvsroot/src/sys/sys/event.h,v
retrieving revision 1.53
diff -u -p -r1.53 event.h
--- sys/event.h 13 Jul 2022 14:11:46 -0000 1.53
+++ sys/event.h 17 Jul 2022 19:44:06 -0000
@@ -262,7 +262,6 @@ struct knote {
struct kfilter *kn_kfilter;
void *kn_hook;
int kn_hookid;
- unsigned int kn_influx; /* q: in-flux counter */
#define KN_ACTIVE 0x01U /* event has been triggered */
#define KN_QUEUED 0x02U /* event is on queue */
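Not part of the patch, but for readers who want the klist_fini() idea in isolation: below is a userland sketch of the "neuter the ops vector" trick. When the backing object goes away first, each note's ops pointer is swapped, under a per-note lock, to harmless stubs, so a later detach (e.g. at descriptor close) cannot jump into freed code or data. All of the names here (note, note_ops, note_neuter, ...) are invented for the example; build with -pthread.

#include <pthread.h>
#include <stdio.h>

struct note;

struct note_ops {
	void (*o_detach)(struct note *);
	int  (*o_event)(struct note *, long);
};

struct note {
	const struct note_ops *n_ops;
	pthread_mutex_t n_oplock;	/* plays the role of the knote foplock */
};

/* "Real" ops, standing in for a driver's filterops. */
static void
real_detach(struct note *n)
{
	(void)n;
	printf("real detach\n");
}

static int
real_event(struct note *n, long hint)
{
	(void)n;
	return hint != 0;
}

static const struct note_ops real_ops = {
	.o_detach = real_detach,
	.o_event  = real_event,
};

/* Harmless stubs, analogous to nop_filtops / nop_fd_filtops. */
static void
nop_detach(struct note *n)
{
	(void)n;
}

static int
nop_event(struct note *n, long hint)
{
	(void)n;
	(void)hint;
	return 0;
}

static const struct note_ops nop_ops = {
	.o_detach = nop_detach,
	.o_event  = nop_event,
};

/* Backing-object teardown: swap in the stubs under the per-note lock. */
static void
note_neuter(struct note *n)
{
	pthread_mutex_lock(&n->n_oplock);
	n->n_ops = &nop_ops;
	pthread_mutex_unlock(&n->n_oplock);
}

/* Later detach (e.g. descriptor close) calls through the same lock. */
static void
note_detach(struct note *n)
{
	pthread_mutex_lock(&n->n_oplock);
	n->n_ops->o_detach(n);		/* hits the stub after neutering */
	pthread_mutex_unlock(&n->n_oplock);
}

int
main(void)
{
	struct note n = { .n_ops = &real_ops };

	pthread_mutex_init(&n.n_oplock, NULL);
	note_neuter(&n);		/* the backing object went away first */
	note_detach(&n);		/* never reaches real_detach() */
	pthread_mutex_destroy(&n.n_oplock);
	return 0;
}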
-- thorpej