Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src/sys/kern Fix a problem whereby detaching a device that has o...



details:   https://anonhg.NetBSD.org/src/rev/f47fda3c4ebb
branches:  trunk
changeset: 368556:f47fda3c4ebb
user:      thorpej <thorpej%NetBSD.org@localhost>
date:      Tue Jul 19 01:03:05 2022 +0000

description:
Fix a problem whereby detaching a device that has open kevent
registrations can result in a UAF: When a device detaches, it
calls seldestroy(), which calls knote_fini(), and when that
returns, the softc that contained the selinfo and klist are freed.
However, any knotes that were registered still linger on with the
kq descriptor they were associated with, and when the file
descriptors close, those knotes will be f_detach'd, which will
call into the driver instance that no longer exists.

Address this problem by adding a "foplock" mutex to the knote.
This foplock must be held when calling into filter_attach(),
filter_detach(), and filter_event() (XXX not filter_touch();
see code for details).  Now, in klist_fini(), for each knote
that is on the klist that's about to be blown away, acquire
the foplock, replace the knote's filterops with a do-nothing
stub, and release the foplock.

The end result is that:
==> The foplock ensures that calls into filter_*() will get EITHER
    the real backing object's filterops OR the nop stubs.
==> Holding the foplock across the filter_*() calls ensures that
    klist_fini() will not complete until there are no callers inside
    the filterops that are about to be blown away.

diffstat:

 sys/kern/kern_event.c |  185 +++++++++++++++++++++++++++++++++++++++++++++----
 1 files changed, 169 insertions(+), 16 deletions(-)

diffs (truncated from 363 to 300 lines):

diff -r 2a4ccc8dd8ee -r f47fda3c4ebb sys/kern/kern_event.c
--- a/sys/kern/kern_event.c     Tue Jul 19 00:46:00 2022 +0000
+++ b/sys/kern/kern_event.c     Tue Jul 19 01:03:05 2022 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: kern_event.c,v 1.144 2022/07/19 00:46:00 thorpej Exp $ */
+/*     $NetBSD: kern_event.c,v 1.145 2022/07/19 01:03:05 thorpej Exp $ */
 
 /*-
  * Copyright (c) 2008, 2009, 2021 The NetBSD Foundation, Inc.
@@ -63,7 +63,7 @@
 #endif /* _KERNEL_OPT */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: kern_event.c,v 1.144 2022/07/19 00:46:00 thorpej Exp $");
+__KERNEL_RCSID(0, "$NetBSD: kern_event.c,v 1.145 2022/07/19 01:03:05 thorpej Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -131,6 +131,7 @@
 struct knote_impl {
        struct knote    ki_knote;
        unsigned int    ki_influx;      /* q: in-flux counter */
+       kmutex_t        ki_foplock;     /* for kn_filterops */
 };
 
 #define        KIMPL_TO_KNOTE(kip)     (&(kip)->ki_knote)
@@ -142,6 +143,7 @@
        struct knote_impl *ki;
 
        ki = kmem_zalloc(sizeof(*ki), sleepok ? KM_SLEEP : KM_NOSLEEP);
+       mutex_init(&ki->ki_foplock, MUTEX_DEFAULT, IPL_NONE);
 
        return KIMPL_TO_KNOTE(ki);
 }
@@ -151,9 +153,28 @@
 {
        struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
 
+       mutex_destroy(&ki->ki_foplock);
        kmem_free(ki, sizeof(*ki));
 }
 
+static inline void
+knote_foplock_enter(struct knote *kn)
+{
+       mutex_enter(&KNOTE_TO_KIMPL(kn)->ki_foplock);
+}
+
+static inline void
+knote_foplock_exit(struct knote *kn)
+{
+       mutex_exit(&KNOTE_TO_KIMPL(kn)->ki_foplock);
+}
+
+static inline bool
+knote_foplock_owned(struct knote *kn)
+{
+       return mutex_owned(&KNOTE_TO_KIMPL(kn)->ki_foplock);
+}
+
 static const struct fileops kqueueops = {
        .fo_name = "kqueue",
        .fo_read = (void *)enxio,
@@ -167,6 +188,31 @@
        .fo_restart = kqueue_restart,
 };
 
+static void
+filt_nopdetach(struct knote *kn __unused)
+{
+}
+
+static int
+filt_nopevent(struct knote *kn __unused, long hint __unused)
+{
+       return 0;
+}
+
+static const struct filterops nop_fd_filtops = {
+       .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
+       .f_attach = NULL,
+       .f_detach = filt_nopdetach,
+       .f_event = filt_nopevent,
+};
+
+static const struct filterops nop_filtops = {
+       .f_flags = FILTEROP_MPSAFE,
+       .f_attach = NULL,
+       .f_detach = filt_nopdetach,
+       .f_event = filt_nopevent,
+};
+
 static const struct filterops kqread_filtops = {
        .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
        .f_attach = NULL,
@@ -263,18 +309,45 @@
  *
  *     kqueue_filter_lock
  *     -> kn_kq->kq_fdp->fd_lock
+ *     -> knote foplock (if taken)
  *     -> object lock (e.g., device driver lock, &c.)
  *     -> kn_kq->kq_lock
  *
- * Locking rules:
+ * Locking rules.  ==> indicates the lock is acquired by the backing
+ * object, locks prior are acquired before calling filter ops:
+ *
+ *     f_attach: fdp->fd_lock -> knote foplock ->
+ *       (maybe) KERNEL_LOCK ==> backing object lock
+ *
+ *     f_detach: fdp->fd_lock -> knote foplock ->
+ *        (maybe) KERNEL_LOCK ==> backing object lock
+ *
+ *     f_event via kevent: fdp->fd_lock -> knote foplock ->
+ *        (maybe) KERNEL_LOCK ==> backing object lock
+ *        N.B. NOTE_SUBMIT will never be set in the "hint" argument
+ *        in this case.
  *
- *     f_attach: fdp->fd_lock, KERNEL_LOCK
- *     f_detach: fdp->fd_lock, KERNEL_LOCK
- *     f_event(!NOTE_SUBMIT) via kevent: fdp->fd_lock, _no_ object lock
- *     f_event via knote: whatever caller guarantees
- *             Typically,      f_event(NOTE_SUBMIT) via knote: object lock
- *                             f_event(!NOTE_SUBMIT) via knote: nothing,
- *                                     acquires/releases object lock inside.
+ *     f_event via knote (via backing object): Whatever caller guarantees.
+ *     Typically:
+ *             f_event(NOTE_SUBMIT): caller has already acquired backing
+ *                 object lock.
+ *             f_event(!NOTE_SUBMIT): caller has not acquired backing object
+ *                 lock, or has possibly acquired KERNEL_LOCK.  Backing object
+ *                 lock may or may not be acquired as-needed.
+ *     N.B. the knote foplock will **not** be acquired in this case.  The
+ *     caller guarantees that klist_fini() will not be called concurrently
+ *     with knote().
+ *
+ *     f_touch: fdp->fd_lock -> kn_kq->kq_lock (spin lock)
+ *         N.B. knote foplock is **not** acquired in this case and
+ *         the caller must guarantee that klist_fini() will never
+ *         be called.  kevent_register() restricts filters that
+ *         provide f_touch to known-safe cases.
+ *
+ *     klist_fini(): Caller must guarantee that no more knotes can
+ *         be attached to the klist, and must **not** hold the backing
+ *         object's lock; klist_fini() itself will acquire the foplock
+ *         of each knote on the klist.
  *
  * Locking rules when detaching knotes:
  *
@@ -461,17 +534,31 @@
        return false;
 }
 
+/*
+ * Calls into the filterops need to be resilient against things which
+ * destroy a klist, e.g. device detach, freeing a vnode, etc., to avoid
+ * chasing garbage pointers (to data, or even potentially code in a
+ * module about to be unloaded).  To that end, we acquire the
+ * knote foplock before calling into the filter ops.  When a driver
+ * (or anything else) is tearing down its klist, klist_fini() enumerates
+ * each knote, acquires its foplock, and replaces the filterops with a
+ * nop stub, allowing knote detach (when descriptors are closed) to safely
+ * proceed.
+ */
+
 static int
 filter_attach(struct knote *kn)
 {
        int rv;
 
+       KASSERT(knote_foplock_owned(kn));
        KASSERT(kn->kn_fop != NULL);
        KASSERT(kn->kn_fop->f_attach != NULL);
 
        /*
         * N.B. that kn->kn_fop may change as the result of calling
-        * f_attach().
+        * f_attach().  After f_attach() returns, kn->kn_fop may not
+        * be modified by code outside of klist_fini().
         */
        if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
                rv = kn->kn_fop->f_attach(kn);
@@ -487,6 +574,8 @@
 static void
 filter_detach(struct knote *kn)
 {
+
+       KASSERT(knote_foplock_owned(kn));
        KASSERT(kn->kn_fop != NULL);
        KASSERT(kn->kn_fop->f_detach != NULL);
 
@@ -500,10 +589,12 @@
 }
 
 static int
-filter_event(struct knote *kn, long hint)
+filter_event(struct knote *kn, long hint, bool submitting)
 {
        int rv;
 
+       /* See knote(). */
+       KASSERT(submitting || knote_foplock_owned(kn));
        KASSERT(kn->kn_fop != NULL);
        KASSERT(kn->kn_fop->f_event != NULL);
 
@@ -521,6 +612,19 @@
 static int
 filter_touch(struct knote *kn, struct kevent *kev, long type)
 {
+
+       /*
+        * XXX We cannot assert that the knote foplock is held here
+        * XXX because we cannot safely acquire it in all cases
+        * XXX where "touch" will be used in kqueue_scan().  We just
+        * XXX have to assume that f_touch will always be safe to call,
+        * XXX and kqueue_register() allows only the two known-safe
+        * XXX users of that op.
+        */
+
+       KASSERT(kn->kn_fop != NULL);
+       KASSERT(kn->kn_fop->f_touch != NULL);
+
        return kn->kn_fop->f_touch(kn, kev, type);
 }
 
@@ -1871,6 +1975,17 @@
 
                        KASSERT(kn->kn_fop != NULL);
                        /*
+                        * XXX Allow only known-safe users of f_touch.
+                        * XXX See filter_touch() for details.
+                        */
+                       if (kn->kn_fop->f_touch != NULL &&
+                           kn->kn_fop != &timer_filtops &&
+                           kn->kn_fop != &user_filtops) {
+                               error = ENOTSUP;
+                               goto fail_ev_add;
+                       }
+
+                       /*
                         * apply reference count to knote structure, and
                         * do not release it at the end of this routine.
                         */
@@ -1902,6 +2017,7 @@
                         * N.B. kn->kn_fop may change as the result
                         * of filter_attach()!
                         */
+                       knote_foplock_enter(kn);
                        error = filter_attach(kn);
                        if (error != 0) {
 #ifdef DEBUG
@@ -1915,6 +2031,7 @@
                                    ft ? ft->f_ops->fo_name : "?", error);
 #endif
 
+ fail_ev_add:
                                /*
                                 * N.B. no need to check for this note to
                                 * be in-flux, since it was never visible
@@ -1922,6 +2039,7 @@
                                 *
                                 * knote_detach() drops fdp->fd_lock
                                 */
+                               knote_foplock_exit(kn);
                                mutex_enter(&kq->kq_lock);
                                KNOTE_WILLDETACH(kn);
                                KASSERT(kn_in_flux(kn) == false);
@@ -1979,6 +2097,7 @@
         * initial EV_ADD, but doing so will not reset any
         * filter which have already been triggered.
         */
+       knote_foplock_enter(kn);
        kn->kn_kevent.udata = kev->udata;
        KASSERT(kn->kn_fop != NULL);
        if (!(kn->kn_fop->f_flags & FILTEROP_ISFD) &&
@@ -1989,6 +2108,7 @@
                if (__predict_false(error != 0)) {
                        /* Never a new knote (which would consume newkn). */
                        KASSERT(newkn != NULL);
+                       knote_foplock_exit(kn);
                        goto doneunlock;
                }
        } else {
@@ -2003,10 +2123,12 @@
         * broken and does not return an error.
         */
  done_ev_add:
-       rv = filter_event(kn, 0);
+       rv = filter_event(kn, 0, false);
        if (rv)
                knote_activate(kn);
 
+       knote_foplock_exit(kn);
+
        /* disable knote */
        if ((kev->flags & EV_DISABLE)) {
                mutex_spin_enter(&kq->kq_lock);
@@ -2279,7 +2401,9 @@
                if ((kn->kn_flags & EV_ONESHOT) == 0) {
                        mutex_spin_exit(&kq->kq_lock);



Home | Main Index | Thread Index | Old Index