
Problem with outstanding knotes and device detach - and a fix



Background: kqueue events are represented by a “knote” that is kept on a list of knotes (a klist) by its respective driver (usually indirectly in the “selinfo” structure).  A similar situation exists for vnodes.  Then, when activated, they are queued onto the kqueue’s active note list.  The driver gets called to attach the knote in a generic fashion, but then provides its own ops vector for subsequent operations, including “detach”.  When a file descriptor for a given knote is closed, this driver-supplied “detach” function is called to remove it from whatever driver-specific klist the knote is on.
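
To make that concrete, here is a minimal sketch of the driver-side pattern (hypothetical mydev_* names; the usual headers, locking, and error handling are omitted): a knote is put on the driver's klist via the selinfo helpers when the filter is attached, and the driver-supplied "detach" op takes it off again.

struct mydev_softc {
	struct selinfo	sc_rsel;	/* contains the driver's klist */
	size_t		sc_nread;	/* bytes currently readable */
};

static void
filt_mydevrdetach(struct knote *kn)
{
	struct mydev_softc *sc = kn->kn_hook;

	/* Driver-supplied "detach": unlink the knote from our klist. */
	selremove_knote(&sc->sc_rsel, kn);
}

static int
filt_mydevread(struct knote *kn, long hint)
{
	struct mydev_softc *sc = kn->kn_hook;

	/* Non-zero return activates the knote. */
	kn->kn_data = sc->sc_nread;
	return kn->kn_data > 0;
}

static const struct filterops mydevread_filtops = {
	.f_flags = FILTEROP_ISFD,
	.f_detach = filt_mydevrdetach,
	.f_event = filt_mydevread,
};

static int
mydev_kqfilter(dev_t dev, struct knote *kn)
{
	struct mydev_softc *sc =
	    device_lookup_private(&mydev_cd, minor(dev));

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &mydevread_filtops;
		kn->kn_hook = sc;
		selrecord_knote(&sc->sc_rsel, kn);	/* onto the klist */
		return 0;
	default:
		return EINVAL;
	}
}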

When a driver detaches, it frees its softc structure, which includes the selinfo / klist.  This list might still have knotes on it… but that isn’t inherently problematic, because the driver owns the list and no one will ever traverse it again — the generic kqueue code doesn’t care about it.

But it is a problem when an outstanding file descriptor with a registered kqueue event (e.g. EVFILT_READ) for that device was still open when the device was detached.  In this case, when the file descriptor is closed, the knote, which still points to the now-detached device’s filterops, will call the “detach” operation.  For a driver that is still loaded into the kernel, we call code that’s still valid, but a use-after-free condition exists where the now-freed softc structure will be accessed to unlink the knote from the klist.  The situation is worse for modular drivers that may have been unloaded after the device detaches — jumping through function pointers to code that no longer exists is bad, m’kay?
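
Sketched against the hypothetical driver above, the failing sequence looks roughly like this (the call chain is abbreviated):

static int
mydev_detach(device_t self, int flags)
{
	struct mydev_softc *sc = device_private(self);

	/*
	 * Any knotes registered against the device are still linked
	 * into sc_rsel's klist here; nothing neuters them.
	 */
	seldestroy(&sc->sc_rsel);
	return 0;	/* autoconf then frees the softc */
}

/*
 * Later, when the still-open descriptor is finally closed:
 *
 *	close(fd)
 *	  -> filter_detach(kn)
 *	    -> kn->kn_fop->f_detach(kn), i.e. filt_mydevrdetach(kn)
 *	      -> selremove_knote(&sc->sc_rsel, kn)
 *
 * The filterops pointer is stale (possibly pointing into unloaded
 * module text), and the softc it dereferences has been freed.
 */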

The following patch rectifies this situation by having klist_fini() traverse the list of knotes and substitute their filterops with no-op stubs.  This requires synchronizing with any calls into the filterops themselves.  We have convenient funnel-points for this in kern_event.c already, so it’s not particularly difficult… the main issue is that the filterops themselves are allowed to block.  My solution to this was to have a single global rwlock, knote_filterops_lock, that’s acquired for reading when a call through a knote's filterops is made, and acquired for writing in klist_fini() for the short duration needed to stub out the filterops.  This lock should **very rarely** be contended, but it will be *hot* (lots of acquire-for-read), so it gets its own cache line.

If someone has ideas about another synchronization mechanism, I’m all ears… again, the main issue is that the filterops calls themselves are allowed to block, so I’m not sure that any of our passive serialization mechanisms would work here.
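
For reference, here is roughly what the caller side would look like with the new klist_fini(): the driver (or a helper such as seldestroy()) runs it over the klist before the backing memory goes away.  Again a sketch with the hypothetical names from above; it assumes the klist is reached through the selinfo's sel_klist member.

static int
mydev_detach(device_t self, int flags)
{
	struct mydev_softc *sc = device_private(self);

	/*
	 * Swap any outstanding knotes' filterops for the no-op stubs
	 * (done under knote_filterops_lock), so a later close() of a
	 * stale descriptor calls filt_nopdetach() instead of chasing
	 * freed driver state or unloaded module text.
	 */
	klist_fini(&sc->sc_rsel.sel_klist);
	seldestroy(&sc->sc_rsel);
	return 0;
}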

Index: kern/kern_event.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_event.c,v
retrieving revision 1.141
diff -u -p -r1.141 kern_event.c
--- kern/kern_event.c	24 May 2022 20:50:19 -0000	1.141
+++ kern/kern_event.c	11 Jul 2022 20:53:40 -0000
@@ -133,6 +133,31 @@ static const struct fileops kqueueops = 
 	.fo_restart = kqueue_restart,
 };
 
+static void
+filt_nopdetach(struct knote *kn __unused)
+{
+}
+
+static int
+filt_nopevent(struct knote *kn __unused, long hint __unused)
+{
+	return 0;
+}
+
+static const struct filterops nop_fd_filtops = {
+	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
+	.f_attach = NULL,
+	.f_detach = filt_nopdetach,
+	.f_event = filt_nopevent,
+};
+
+static const struct filterops nop_filtops = {
+	.f_flags = FILTEROP_MPSAFE,
+	.f_attach = NULL,
+	.f_detach = filt_nopdetach,
+	.f_event = filt_nopevent,
+};
+
 static const struct filterops kqread_filtops = {
 	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
 	.f_attach = NULL,
@@ -232,6 +257,9 @@ static size_t		user_kfiltersz;		/* size 
  *	-> object lock (e.g., device driver lock, &c.)
  *	-> kn_kq->kq_lock
  *
+ *	knote_filterops_lock (rw)
+ *	-> whatever will be acquired in filter_*()
+ *
  * Locking rules:
  *
  *	f_attach: fdp->fd_lock, KERNEL_LOCK
@@ -279,6 +307,24 @@ static size_t		user_kfiltersz;		/* size 
  */
 static krwlock_t	kqueue_filter_lock;	/* lock on filter lists */
 
+/*
+ * Calls into the filterops need to be resilient against things which
+ * destroy a klist, e.g. device detach, freeing a vnode, etc., to avoid
+ * chasing garbage pointers (to data, or even potentially code in a
+ * module about to be unloaded).  To that end, we acquire the
+ * knote_filterops_lock as a reader whenever calling into the filter
+ * ops (in the filter_*() functions).  When a driver (or anything else)
+ * is tearing down its klist, klist_fini() acquires knote_filterops_lock
+ * as a writer, and replaces all of the filterops in each knote with a
+ * stub, allowing knote detach (when descriptors are closed) to safely
+ * proceed.
+ *
+ * This lock should be very rarely *contended* (klist tear-down is fairly
+ * rare), but it will be *hot* (acquired as a reader for all filter calls),
+ * so give it its own cache line.
+ */
+static krwlock_t	knote_filterops_lock __cacheline_aligned;
+
 #define	KQ_FLUX_WAIT(kq)	(void)cv_wait(&kq->kq_cv, &kq->kq_lock)
 #define	KQ_FLUX_WAKEUP(kq)	cv_broadcast(&kq->kq_cv)
 
@@ -428,6 +474,13 @@ filter_attach(struct knote *kn)
 {
 	int rv;
 
+	/*
+	 * No need to acquire knote_filterops_lock here; if the
+	 * device is detaching, the driver already guarantees that
+	 * no new knote references can be created (same rule as
+	 * selinfo).
+	 */
+
 	KASSERT(kn->kn_fop != NULL);
 	KASSERT(kn->kn_fop->f_attach != NULL);
 
@@ -449,6 +502,9 @@ filter_attach(struct knote *kn)
 static void
 filter_detach(struct knote *kn)
 {
+
+	rw_enter(&knote_filterops_lock, RW_READER);
+
 	KASSERT(kn->kn_fop != NULL);
 	KASSERT(kn->kn_fop->f_detach != NULL);
 
@@ -459,6 +515,8 @@ filter_detach(struct knote *kn)
 		kn->kn_fop->f_detach(kn);
 		KERNEL_UNLOCK_ONE(NULL);
 	}
+
+	rw_exit(&knote_filterops_lock);
 }
 
 static int
@@ -466,6 +524,8 @@ filter_event(struct knote *kn, long hint
 {
 	int rv;
 
+	rw_enter(&knote_filterops_lock, RW_READER);
+
 	KASSERT(kn->kn_fop != NULL);
 	KASSERT(kn->kn_fop->f_event != NULL);
 
@@ -477,13 +537,36 @@ filter_event(struct knote *kn, long hint
 		KERNEL_UNLOCK_ONE(NULL);
 	}
 
+	rw_exit(&knote_filterops_lock);
+
 	return rv;
 }
 
 static int
 filter_touch(struct knote *kn, struct kevent *kev, long type)
 {
-	return kn->kn_fop->f_touch(kn, kev, type);
+	int rv;
+
+	/*
+	 * This one is a little tricky; we are called conditionally
+	 * (only if f_touch is non-NULL), but that test is done without
+	 * synchronizing with klist_fini(), so we need to check again
+	 * after acquiring knote_filterops_lock.
+	 */
+
+	rw_enter(&knote_filterops_lock, RW_READER);
+
+	KASSERT(kn->kn_fop != NULL);
+
+	if (__predict_true(kn->kn_fop->f_touch != NULL)) {
+		rv = kn->kn_fop->f_touch(kn, kev, type);
+	} else {
+		rv = 0;
+	}
+
+	rw_exit(&knote_filterops_lock);
+
+	return rv;
 }
 
 static kauth_listener_t	kqueue_listener;
@@ -518,6 +601,7 @@ kqueue_init(void)
 {
 
 	rw_init(&kqueue_filter_lock);
+	rw_init(&knote_filterops_lock);
 
 	kqueue_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
 	    kqueue_listener_cb, NULL);
@@ -2797,3 +2881,59 @@ knote_clear_eof(struct knote *kn)
 	kn->kn_flags &= ~EV_EOF;
 	mutex_spin_exit(&kq->kq_lock);
 }
+
+/*
+ * Initialize a klist.
+ */
+void
+klist_init(struct klist *list)
+{
+	SLIST_INIT(list);
+}
+
+/*
+ * Neuter all existing knotes on the klist because the list is
+ * being destroyed.  The caller has guaranteed that no additional
+ * knotes will be added to the list, that the backing object's
+ * locks are not held (otherwise there is a locking order issue
+ * with acquiring knote_filterops_lock), and that we can traverse
+ * the list safely in this state.
+ */
+void
+klist_fini(struct klist *list)
+{
+	struct knote *kn;
+
+	rw_enter(&knote_filterops_lock, RW_WRITER);
+
+	SLIST_FOREACH(kn, list, kn_selnext) {
+		KASSERT(kn->kn_fop != NULL);
+		if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
+			kn->kn_fop = &nop_fd_filtops;
+		} else {
+			kn->kn_fop = &nop_filtops;
+		}
+	}
+
+	rw_exit(&knote_filterops_lock);
+}
+
+/*
+ * Insert a knote into a klist.
+ */
+void
+klist_insert(struct klist *list, struct knote *kn)
+{
+	SLIST_INSERT_HEAD(list, kn, kn_selnext);
+}
+
+/*
+ * Remove a knote from a klist.  Returns true if the last
+ * knote was removed and the list is now empty.
+ */
+bool
+klist_remove(struct klist *list, struct knote *kn)
+{
+	SLIST_REMOVE(list, kn, knote, kn_selnext);
+	return SLIST_EMPTY(list);
+}
Index: sys/event.h
===================================================================
RCS file: /cvsroot/src/sys/sys/event.h,v
retrieving revision 1.52
diff -u -p -r1.52 event.h
--- sys/event.h	12 Feb 2022 15:51:29 -0000	1.52
+++ sys/event.h	11 Jul 2022 20:53:40 -0000
@@ -334,30 +334,10 @@ int	kfilter_unregister(const char *);
 int	filt_seltrue(struct knote *, long);
 extern const struct filterops seltrue_filtops;
 
-static inline void
-klist_init(struct klist *list)
-{
-	SLIST_INIT(list);
-}
-
-static inline void
-klist_fini(struct klist *list)
-{
-	/* Nothing, for now. */
-}
-
-static inline void
-klist_insert(struct klist *list, struct knote *kn)
-{
-	SLIST_INSERT_HEAD(list, kn, kn_selnext);
-}
-
-static inline bool
-klist_remove(struct klist *list, struct knote *kn)
-{
-	SLIST_REMOVE(list, kn, knote, kn_selnext);
-	return SLIST_EMPTY(list);
-}
+void	klist_init(struct klist *);
+void	klist_fini(struct klist *);
+void	klist_insert(struct klist *, struct knote *);
+bool	klist_remove(struct klist *, struct knote *);
 
 #else 	/* !_KERNEL */
 

-- thorpej