
Re: Problem with outstanding knotes and device detach - and a fix



> On Jul 13, 2022, at 12:02 PM, Jason Thorpe <thorpej%me.com@localhost> wrote:
> 
>> On Jul 13, 2022, at 11:25 AM, Taylor R Campbell <campbell+netbsd-tech-kern%mumble.net@localhost> wrote:
>> 
>> Sorry, haven't had time yet to do a full review, but I spot at least
>> one problem that means this won't fly as is: kqueue_register and
>> kqueue_scan both call filter_touch under a spin lock, but with your
>> patch filter_touch now takes an rwlock -- which is forbidden under a
>> spin lock (and it'll crash under LOCKDEBUG).
> 
> I found another issue as well and will post a follow-up later.

Ok, new version.  Main differences:

==> each knote gets its own mutex, rather than using a global rwlock.

==> Moved the locking outside of the filter_*() functions.

==> “Dealt” with the locking issue around the “touch” op by allowing only user_filtops and timer_filtops to use it; those two are known to be safe to call “touch” on without holding the knote foplock (they don’t put knotes on any lists).
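
For reference, the constraint Taylor pointed out upthread: kqueue_register()
and kqueue_scan() can invoke the touch op while holding the kq spin lock,
where taking the new adaptive foplock is forbidden.  A sketch of the
forbidden pattern (not actual patch code):

	mutex_spin_enter(&kq->kq_lock);
	/* ... */
	mutex_enter(&kn->kn_foplock);	/* adaptive mutex while holding a
					 * spin mutex -- LOCKDEBUG fires */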

The knote foplock got tossed onto the end of struct knote, the idea being that it doesn’t change the knote ABI for modules (they can’t allocate their own knotes).  If this gets aligned with another change that bumps the kernel version, I’ll move the foplock up next to kn_fop.

This passes the kqueue tests on a LOCKDEBUG kernel.
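
To make the teardown side of this concrete, here is a minimal sketch of how
a driver would use it on detach.  Everything named mydev/sc_* below is
illustrative, not from the patch; only klist_init()/klist_fini(), KNOTE(),
and the locking rules described above are the real interfaces:

	#include <sys/param.h>
	#include <sys/device.h>
	#include <sys/event.h>
	#include <sys/mutex.h>

	struct mydev_softc {
		device_t	sc_dev;
		kmutex_t	sc_lock;	/* the "object lock" */
		struct klist	sc_klist;	/* klist_init()'d at attach;
						 * fed by KNOTE() while the
						 * device is alive */
	};

	static int
	mydev_detach(device_t self, int flags)
	{
		struct mydev_softc *sc = device_private(self);

		/*
		 * Call WITHOUT sc_lock held: klist_fini() acquires each
		 * knote's foplock, and f_event(!NOTE_SUBMIT) callers hold
		 * the foplock while taking the object lock, so taking
		 * sc_lock first here would invert the lock order.
		 */
		klist_fini(&sc->sc_klist);

		/*
		 * Any knotes still attached by open descriptors now point
		 * at the nop filterops, so a later close() can't call
		 * back into this driver.
		 */
		return 0;
	}

(In a real driver the klist usually lives inside a struct selinfo; the
sketch flattens that for brevity.)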

Index: kern/kern_event.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_event.c,v
retrieving revision 1.143
diff -u -p -r1.143 kern_event.c
--- kern/kern_event.c	13 Jul 2022 14:11:46 -0000	1.143
+++ kern/kern_event.c	14 Jul 2022 02:01:44 -0000
@@ -133,6 +133,31 @@ static const struct fileops kqueueops = 
 	.fo_restart = kqueue_restart,
 };
 
+static void
+filt_nopdetach(struct knote *kn __unused)
+{
+}
+
+static int
+filt_nopevent(struct knote *kn __unused, long hint __unused)
+{
+	return 0;
+}
+
+static const struct filterops nop_fd_filtops = {
+	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
+	.f_attach = NULL,
+	.f_detach = filt_nopdetach,
+	.f_event = filt_nopevent,
+};
+
+static const struct filterops nop_filtops = {
+	.f_flags = FILTEROP_MPSAFE,
+	.f_attach = NULL,
+	.f_detach = filt_nopdetach,
+	.f_event = filt_nopevent,
+};
+
 static const struct filterops kqread_filtops = {
 	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
 	.f_attach = NULL,
@@ -235,12 +260,14 @@ static size_t		user_kfiltersz;		/* size 
  * Locking rules:
  *
  *	f_attach: fdp->fd_lock, KERNEL_LOCK
- *	f_detach: fdp->fd_lock, KERNEL_LOCK
- *	f_event(!NOTE_SUBMIT) via kevent: fdp->fd_lock, _no_ object lock
- *	f_event via knote: whatever caller guarantees
+ *	f_detach: fdp->fd_lock, knote foplock, KERNEL_LOCK
+ *	f_event(!NOTE_SUBMIT) via kevent: fdp->fd_lock, knote foplock,
+ *					  acquires/releases object lock inside
+ *	f_event via knote: whatever caller guarantees,
+ *					knote foplock _not_ acquired
  *		Typically,	f_event(NOTE_SUBMIT) via knote: object lock
- *				f_event(!NOTE_SUBMIT) via knote: nothing,
- *					acquires/releases object lock inside.
+ *				f_event(!NOTE_SUBMIT) via knote:
+ *					acquires/releases object lock inside
  *
  * Locking rules when detaching knotes:
  *
@@ -429,6 +456,7 @@ knote_alloc(bool sleepok)
 	struct knote *kn;
 
 	kn = kmem_zalloc(sizeof(*kn), sleepok ? KM_SLEEP : KM_NOSLEEP);
+	mutex_init(&kn->kn_foplock, MUTEX_DEFAULT, IPL_NONE);
 
 	return kn;
 }
@@ -436,14 +464,28 @@ knote_alloc(bool sleepok)
 static inline void
 knote_free(struct knote *kn)
 {
+	mutex_destroy(&kn->kn_foplock);
 	kmem_free(kn, sizeof(*kn));
 }
 
+/*
+ * Calls into the filterops need to be resilient against things which
+ * destroy a klist, e.g. device detach, freeing a vnode, etc., to avoid
+ * chasing garbage pointers (to data, or even potentially code in a
+ * module about to be unloaded).  To that end, we acquire the
+ * knote foplock before calling into the filter ops.  When a driver
+ * (or anything else) is tearing down its klist, klist_fini() enumerates
+ * each knote, acquires its foplock, and replaces the filterops with a
+ * nop stub, allowing knote detach (when descriptors are closed) to safely
+ * proceed.
+ */
+
 static int
 filter_attach(struct knote *kn)
 {
 	int rv;
 
+	KASSERT(mutex_owned(&kn->kn_foplock));
 	KASSERT(kn->kn_fop != NULL);
 	KASSERT(kn->kn_fop->f_attach != NULL);
 
@@ -465,6 +507,8 @@ filter_attach(struct knote *kn)
 static void
 filter_detach(struct knote *kn)
 {
+
+	KASSERT(mutex_owned(&kn->kn_foplock));
 	KASSERT(kn->kn_fop != NULL);
 	KASSERT(kn->kn_fop->f_detach != NULL);
 
@@ -478,10 +522,12 @@ filter_detach(struct knote *kn)
 }
 
 static int
-filter_event(struct knote *kn, long hint)
+filter_event(struct knote *kn, long hint, bool submitting)
 {
 	int rv;
 
+	/* See knote(). */
+	KASSERT(submitting || mutex_owned(&kn->kn_foplock));
 	KASSERT(kn->kn_fop != NULL);
 	KASSERT(kn->kn_fop->f_event != NULL);
 
@@ -499,6 +545,19 @@ filter_event(struct knote *kn, long hint
 static int
 filter_touch(struct knote *kn, struct kevent *kev, long type)
 {
+
+	/*
+	 * XXX We cannot assert that the knote foplock is held here
+	 * XXX because we cannot safely acquire it in all cases
+	 * XXX where "touch" will be used in kqueue_scan().  We just
+	 * XXX have to assume that f_touch will always be safe to call,
+	 * XXX and kqueue_register() allows only the two known-safe
+	 * XXX users of that op.
+	 */
+
+	KASSERT(kn->kn_fop != NULL);
+	KASSERT(kn->kn_fop->f_touch != NULL);
+
 	return kn->kn_fop->f_touch(kn, kev, type);
 }
 
@@ -1849,6 +1908,17 @@ kqueue_register(struct kqueue *kq, struc
 
 			KASSERT(kn->kn_fop != NULL);
 			/*
+			 * XXX Allow only known-safe users of f_touch.
+			 * XXX See filter_touch() for details.
+			 */
+			if (kn->kn_fop->f_touch != NULL &&
+			    kn->kn_fop != &timer_filtops &&
+			    kn->kn_fop != &user_filtops) {
+				error = ENOTSUP;
+				goto fail_ev_add;
+			}
+
+			/*
 			 * apply reference count to knote structure, and
 			 * do not release it at the end of this routine.
 			 */
@@ -1880,6 +1950,7 @@ kqueue_register(struct kqueue *kq, struc
 			 * N.B. kn->kn_fop may change as the result
 			 * of filter_attach()!
 			 */
+			mutex_enter(&kn->kn_foplock);
 			error = filter_attach(kn);
 			if (error != 0) {
 #ifdef DEBUG
@@ -1893,6 +1964,7 @@ kqueue_register(struct kqueue *kq, struc
 				    ft ? ft->f_ops->fo_name : "?", error);
 #endif
 
+ fail_ev_add:
 				/*
 				 * N.B. no need to check for this note to
 				 * be in-flux, since it was never visible
@@ -1900,6 +1972,7 @@ kqueue_register(struct kqueue *kq, struc
 				 *
 				 * knote_detach() drops fdp->fd_lock
 				 */
+				mutex_exit(&kn->kn_foplock);
 				mutex_enter(&kq->kq_lock);
 				KNOTE_WILLDETACH(kn);
 				KASSERT(kn_in_flux(kn) == false);
@@ -1957,6 +2030,7 @@ kqueue_register(struct kqueue *kq, struc
 	 * initial EV_ADD, but doing so will not reset any
 	 * filter which have already been triggered.
 	 */
+	mutex_enter(&kn->kn_foplock);
 	kn->kn_kevent.udata = kev->udata;
 	KASSERT(kn->kn_fop != NULL);
 	if (!(kn->kn_fop->f_flags & FILTEROP_ISFD) &&
@@ -1967,6 +2041,7 @@ kqueue_register(struct kqueue *kq, struc
 		if (__predict_false(error != 0)) {
 			/* Never a new knote (which would consume newkn). */
 			KASSERT(newkn != NULL);
+			mutex_exit(&kn->kn_foplock);
 			goto doneunlock;
 		}
 	} else {
@@ -1981,10 +2056,12 @@ kqueue_register(struct kqueue *kq, struc
 	 * broken and does not return an error.
 	 */
  done_ev_add:
-	rv = filter_event(kn, 0);
+	rv = filter_event(kn, 0, false);
 	if (rv)
 		knote_activate(kn);
 
+	mutex_exit(&kn->kn_foplock);
+
 	/* disable knote */
 	if ((kev->flags & EV_DISABLE)) {
 		mutex_spin_enter(&kq->kq_lock);
@@ -2256,7 +2333,9 @@ relock:
 		if ((kn->kn_flags & EV_ONESHOT) == 0) {
 			mutex_spin_exit(&kq->kq_lock);
 			KASSERT(mutex_owned(&fdp->fd_lock));
-			rv = filter_event(kn, 0);
+			mutex_enter(&kn->kn_foplock);
+			rv = filter_event(kn, 0, false);
+			mutex_exit(&kn->kn_foplock);
 			mutex_spin_enter(&kq->kq_lock);
 			/* Re-poll if note was re-enqueued. */
 			if ((kn->kn_status & KN_QUEUED) != 0) {
@@ -2615,7 +2694,15 @@ knote(struct klist *list, long hint)
 	struct knote *kn, *tmpkn;
 
 	SLIST_FOREACH_SAFE(kn, list, kn_selnext, tmpkn) {
-		if (filter_event(kn, hint)) {
+		/*
+		 * We assume here that the backing object's lock is
+		 * already held if we're traversing the klist, and
+		 * so acquiring the knote foplock would create a
+		 * deadlock scenario.  But we also know that the klist
+		 * won't disappear on us while we're here, so not
+		 * acquiring it is safe.
+		 */
+		if (filter_event(kn, hint, true)) {
 			knote_activate(kn);
 		}
 	}
@@ -2664,7 +2751,9 @@ knote_detach(struct knote *kn, filedesc_
 
 	/* Remove from monitored object. */
 	if (dofop) {
+		mutex_enter(&kn->kn_foplock);
 		filter_detach(kn);
+		mutex_exit(&kn->kn_foplock);
 	}
 
 	/* Remove from descriptor table. */
@@ -2829,7 +2918,26 @@ klist_init(struct klist *list)
 void
 klist_fini(struct klist *list)
 {
-	/* Nothing, for now. */
+	struct knote *kn;
+
+	/*
+	 * Neuter all existing knotes on the klist because the list is
+	 * being destroyed.  The caller has guaranteed that no additional
+	 * knotes will be added to the list, that the backing object's
+	 * locks are not held (otherwise there is a locking order issue
+	 * with acquiring the knote foplock), and that we can traverse
+	 * the list safely in this state.
+	 */
+	SLIST_FOREACH(kn, list, kn_selnext) {
+		mutex_enter(&kn->kn_foplock);
+		KASSERT(kn->kn_fop != NULL);
+		if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
+			kn->kn_fop = &nop_fd_filtops;
+		} else {
+			kn->kn_fop = &nop_filtops;
+		}
+		mutex_exit(&kn->kn_foplock);
+	}
 }
 
 /*
Index: sys/event.h
===================================================================
RCS file: /cvsroot/src/sys/sys/event.h,v
retrieving revision 1.53
diff -u -p -r1.53 event.h
--- sys/event.h	13 Jul 2022 14:11:46 -0000	1.53
+++ sys/event.h	14 Jul 2022 02:01:44 -0000
@@ -203,6 +203,8 @@ struct kfilter_mapping {
 
 #ifdef _KERNEL
 
+#include <sys/mutex.h>			/* needed by struct knote */
+
 #define	KNOTE(list, hint)	if (!SLIST_EMPTY(list)) knote(list, hint)
 
 /*
@@ -245,6 +247,7 @@ struct filterops {
  * f	kn_kq->kq_fdp->fd_lock
  * q	kn_kq->kq_lock
  * o	object mutex (e.g. device driver or vnode interlock)
+ * n	knote::kn_foplock
  */
 struct kfilter;
 
@@ -258,7 +261,7 @@ struct knote {
 	uint32_t		kn_sfflags;	/*    saved filter flags */
 	uintptr_t		kn_sdata;	/*    saved data field */
 	void			*kn_obj;	/*    monitored obj */
-	const struct filterops	*kn_fop;
+	const struct filterops	*kn_fop;	/* n */
 	struct kfilter		*kn_kfilter;
 	void 			*kn_hook;
 	int			kn_hookid;
@@ -294,6 +297,7 @@ struct knote {
 #define	kn_flags	kn_kevent.flags		/* q */
 #define	kn_fflags	kn_kevent.fflags	/* o */
 #define	kn_data		kn_kevent.data		/* o */
+	kmutex_t		kn_foplock;	/* for kn_filterops */
 };
 
 #include <sys/systm.h>	/* for copyin_t */
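
For anyone reading along, the userland-visible effect of the neutering is
roughly this (illustrative fragment; assume fd is open on a device that
detaches mid-loop):

	struct kevent ev;
	struct timespec ts = { 1, 0 };
	int kq = kqueue(), n;

	EV_SET(&ev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	kevent(kq, &ev, 1, NULL, 0, NULL);	/* registered while alive */

	/* ...device detaches; klist_fini() swaps in nop_fd_filtops... */

	n = kevent(kq, NULL, 0, &ev, 1, &ts);	/* filt_nopevent() returns 0,
						 * so this just times out */
	close(fd);				/* f_detach resolves to
						 * filt_nopdetach(), not the
						 * departed driver */
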
-- thorpej



