tech-kern archive
Re: Problem with outstanding knotes and device detach - and a fix
> On Jul 13, 2022, at 7:18 PM, Jason Thorpe <thorpej%me.com@localhost> wrote:
>
> Ok, new version. Main differences:
Aaaaand another new version. This:
==> Creates a knote_impl structure, private to kern_event.c, that holds the new lock (a standalone sketch of this embedding pattern follows below). I took the opportunity to move kn_influx into knote_impl as well, since absolutely no one outside of kern_event.c should touch it. This change is ABI-compatible.
==> Improves the comments about the locking rules.
There are no functional changes since the last patch.
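For anyone who wants to poke at the embedding trick outside the kernel, here is a standalone userland sketch of the same pattern: the public structure sits as the first member of a private impl structure, and container_of() recovers the private state from the public pointer. The widget/widget_impl names are invented for the example and container_of() is defined locally so it builds with a plain C compiler; none of this is part of the patch.

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Public view: the only thing other compilation units ever see. */
struct widget {
	int w_value;
};

/* Private view: lives in exactly one .c file, invisible to the ABI. */
struct widget_impl {
	struct widget	wi_widget;	/* embedded public part, first, as ki_knote is */
	unsigned int	wi_influx;	/* private state, like ki_influx */
};

#define WIMPL_TO_WIDGET(wip)	(&(wip)->wi_widget)
#define WIDGET_TO_WIMPL(wp)	container_of((wp), struct widget_impl, wi_widget)

static struct widget *
widget_alloc(void)
{
	struct widget_impl *wi = calloc(1, sizeof(*wi));

	return wi != NULL ? WIMPL_TO_WIDGET(wi) : NULL;
}

static void
widget_free(struct widget *w)
{
	free(WIDGET_TO_WIMPL(w));
}

int
main(void)
{
	struct widget *w = widget_alloc();

	if (w == NULL)
		return 1;
	WIDGET_TO_WIMPL(w)->wi_influx++;	/* reach the private state */
	printf("influx = %u\n", WIDGET_TO_WIMPL(w)->wi_influx);
	widget_free(w);
	return 0;
}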
Index: kern/kern_event.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_event.c,v
retrieving revision 1.143
diff -u -p -r1.143 kern_event.c
--- kern/kern_event.c 13 Jul 2022 14:11:46 -0000 1.143
+++ kern/kern_event.c 17 Jul 2022 19:44:06 -0000
@@ -120,6 +120,61 @@ static void filt_userdetach(struct knote
static int filt_user(struct knote *, long hint);
static int filt_usertouch(struct knote *, struct kevent *, long type);
+/*
+ * Private knote state that should never be exposed outside
+ * of kern_event.c
+ *
+ * Field locking:
+ *
+ * q kn_kq->kq_lock
+ */
+struct knote_impl {
+ struct knote ki_knote;
+ unsigned int ki_influx; /* q: in-flux counter */
+ kmutex_t ki_foplock; /* for kn_filterops */
+};
+
+#define KIMPL_TO_KNOTE(kip) (&(kip)->ki_knote)
+#define KNOTE_TO_KIMPL(knp) container_of((knp), struct knote_impl, ki_knote)
+
+static inline struct knote *
+knote_alloc(bool sleepok)
+{
+ struct knote_impl *ki;
+
+ ki = kmem_zalloc(sizeof(*ki), sleepok ? KM_SLEEP : KM_NOSLEEP);
+ mutex_init(&ki->ki_foplock, MUTEX_DEFAULT, IPL_NONE);
+
+ return KIMPL_TO_KNOTE(ki);
+}
+
+static inline void
+knote_free(struct knote *kn)
+{
+ struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
+
+ mutex_destroy(&ki->ki_foplock);
+ kmem_free(ki, sizeof(*ki));
+}
+
+static inline void
+knote_foplock_enter(struct knote *kn)
+{
+ mutex_enter(&KNOTE_TO_KIMPL(kn)->ki_foplock);
+}
+
+static inline void
+knote_foplock_exit(struct knote *kn)
+{
+ mutex_exit(&KNOTE_TO_KIMPL(kn)->ki_foplock);
+}
+
+static inline bool
+knote_foplock_owned(struct knote *kn)
+{
+ return mutex_owned(&KNOTE_TO_KIMPL(kn)->ki_foplock);
+}
+
static const struct fileops kqueueops = {
.fo_name = "kqueue",
.fo_read = (void *)enxio,
@@ -133,6 +188,31 @@ static const struct fileops kqueueops =
.fo_restart = kqueue_restart,
};
+static void
+filt_nopdetach(struct knote *kn __unused)
+{
+}
+
+static int
+filt_nopevent(struct knote *kn __unused, long hint __unused)
+{
+ return 0;
+}
+
+static const struct filterops nop_fd_filtops = {
+ .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
+ .f_attach = NULL,
+ .f_detach = filt_nopdetach,
+ .f_event = filt_nopevent,
+};
+
+static const struct filterops nop_filtops = {
+ .f_flags = FILTEROP_MPSAFE,
+ .f_attach = NULL,
+ .f_detach = filt_nopdetach,
+ .f_event = filt_nopevent,
+};
+
static const struct filterops kqread_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
@@ -232,15 +312,41 @@ static size_t user_kfiltersz; /* size
* -> object lock (e.g., device driver lock, &c.)
* -> kn_kq->kq_lock
*
- * Locking rules:
+ * Locking rules. ==> indicates the lock is acquired by the backing
+ * object; locks listed before it are acquired before calling filter ops:
+ *
+ * f_attach: fdp->fd_lock -> knote foplock ->
+ * (maybe) KERNEL_LOCK ==> backing object lock
+ *
+ * f_detach: fdp->fd_lock -> knote foplock ->
+ * (maybe) KERNEL_LOCK ==> backing object lock
+ *
+ * f_event via kevent: fdp->fd_lock -> knote foplock ->
+ * (maybe) KERNEL_LOCK ==> backing object lock
+ * N.B. NOTE_SUBMIT will never be set in the "hint" argument
+ * in this case.
*
- * f_attach: fdp->fd_lock, KERNEL_LOCK
- * f_detach: fdp->fd_lock, KERNEL_LOCK
- * f_event(!NOTE_SUBMIT) via kevent: fdp->fd_lock, _no_ object lock
- * f_event via knote: whatever caller guarantees
- * Typically, f_event(NOTE_SUBMIT) via knote: object lock
- * f_event(!NOTE_SUBMIT) via knote: nothing,
- * acquires/releases object lock inside.
+ * f_event via knote (via backing object): Whatever caller guarantees.
+ * Typically:
+ * f_event(NOTE_SUBMIT): caller has already acquired backing
+ * object lock.
+ * f_event(!NOTE_SUBMIT): caller has not acquired the backing
+ * object lock, or has possibly acquired KERNEL_LOCK. Backing object
+ * lock may or may not be acquired as-needed.
+ * N.B. the knote foplock will **not** be acquired in this case. The
+ * caller guarantees that klist_fini() will not be called concurrently
+ * with knote().
+ *
+ * f_touch: fdp->fd_lock -> kn_kq->kq_lock (spin lock)
+ * N.B. knote foplock is **not** acquired in this case and
+ * the caller must guarantee that klist_fini() will never
+ * be called. kqueue_register() restricts filters that
+ * provide f_touch to known-safe cases.
+ *
+ * klist_fini(): Caller must guarantee that no more knotes can
+ * be attached to the klist, and must **not** hold the backing
+ * object's lock; klist_fini() itself will acquire the foplock
+ * of each knote on the klist.
*
* Locking rules when detaching knotes:
*
@@ -286,7 +392,7 @@ static inline bool
kn_in_flux(struct knote *kn)
{
KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
- return kn->kn_influx != 0;
+ return KNOTE_TO_KIMPL(kn)->ki_influx != 0;
}
static inline bool
@@ -298,8 +404,9 @@ kn_enter_flux(struct knote *kn)
return false;
}
- KASSERT(kn->kn_influx < UINT_MAX);
- kn->kn_influx++;
+ struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
+ KASSERT(ki->ki_influx < UINT_MAX);
+ ki->ki_influx++;
return true;
}
@@ -308,14 +415,17 @@ static inline bool
kn_leave_flux(struct knote *kn)
{
KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
- KASSERT(kn->kn_influx > 0);
- kn->kn_influx--;
- return kn->kn_influx == 0;
+
+ struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
+ KASSERT(ki->ki_influx > 0);
+ ki->ki_influx--;
+ return ki->ki_influx == 0;
}
static void
kn_wait_flux(struct knote *kn, bool can_loop)
{
+ struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
bool loop;
KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
@@ -325,7 +435,7 @@ kn_wait_flux(struct knote *kn, bool can_
* dropping the kq_lock. The caller has let us know in
* 'can_loop'.
*/
- for (loop = true; loop && kn->kn_influx != 0; loop = can_loop) {
+ for (loop = true; loop && ki->ki_influx != 0; loop = can_loop) {
KQ_FLUX_WAIT(kn->kn_kq);
}
}
@@ -423,33 +533,31 @@ knote_detach_quiesce(struct knote *kn)
return false;
}
-static inline struct knote *
-knote_alloc(bool sleepok)
-{
- struct knote *kn;
-
- kn = kmem_zalloc(sizeof(*kn), sleepok ? KM_SLEEP : KM_NOSLEEP);
-
- return kn;
-}
-
-static inline void
-knote_free(struct knote *kn)
-{
- kmem_free(kn, sizeof(*kn));
-}
+/*
+ * Calls into the filterops need to be resilient against things which
+ * destroy a klist, e.g. device detach, freeing a vnode, etc., to avoid
+ * chasing garbage pointers (to data, or even potentially code in a
+ * module about to be unloaded). To that end, we acquire the
+ * knote foplock before calling into the filter ops. When a driver
+ * (or anything else) is tearing down its klist, klist_fini() enumerates
+ * each knote, acquires its foplock, and replaces the filterops with a
+ * nop stub, allowing knote detach (when descriptors are closed) to safely
+ * proceed.
+ */
static int
filter_attach(struct knote *kn)
{
int rv;
+ KASSERT(knote_foplock_owned(kn));
KASSERT(kn->kn_fop != NULL);
KASSERT(kn->kn_fop->f_attach != NULL);
/*
* N.B. that kn->kn_fop may change as the result of calling
- * f_attach().
+ * f_attach(). After f_attach() returns, kn->kn_fop may not
+ * be modified by code outside of klist_fini().
*/
if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
rv = kn->kn_fop->f_attach(kn);
@@ -465,6 +573,8 @@ filter_attach(struct knote *kn)
static void
filter_detach(struct knote *kn)
{
+
+ KASSERT(knote_foplock_owned(kn));
KASSERT(kn->kn_fop != NULL);
KASSERT(kn->kn_fop->f_detach != NULL);
@@ -478,10 +588,12 @@ filter_detach(struct knote *kn)
}
static int
-filter_event(struct knote *kn, long hint)
+filter_event(struct knote *kn, long hint, bool submitting)
{
int rv;
+ /* See knote(). */
+ KASSERT(submitting || knote_foplock_owned(kn));
KASSERT(kn->kn_fop != NULL);
KASSERT(kn->kn_fop->f_event != NULL);
@@ -499,6 +611,19 @@ filter_event(struct knote *kn, long hint
static int
filter_touch(struct knote *kn, struct kevent *kev, long type)
{
+
+ /*
+ * XXX We cannot assert that the knote foplock is held here
+ * XXX because we cannot safely acquire it in all cases
+ * XXX where "touch" will be used in kqueue_scan(). We just
+ * XXX have to assume that f_touch will always be safe to call,
+ * XXX and kqueue_register() allows only the two known-safe
+ * XXX users of that op.
+ */
+
+ KASSERT(kn->kn_fop != NULL);
+ KASSERT(kn->kn_fop->f_touch != NULL);
+
return kn->kn_fop->f_touch(kn, kev, type);
}
@@ -1849,6 +1974,17 @@ kqueue_register(struct kqueue *kq, struc
KASSERT(kn->kn_fop != NULL);
/*
+ * XXX Allow only known-safe users of f_touch.
+ * XXX See filter_touch() for details.
+ */
+ if (kn->kn_fop->f_touch != NULL &&
+ kn->kn_fop != &timer_filtops &&
+ kn->kn_fop != &user_filtops) {
+ error = ENOTSUP;
+ goto fail_ev_add;
+ }
+
+ /*
* apply reference count to knote structure, and
* do not release it at the end of this routine.
*/
@@ -1880,6 +2016,7 @@ kqueue_register(struct kqueue *kq, struc
* N.B. kn->kn_fop may change as the result
* of filter_attach()!
*/
+ knote_foplock_enter(kn);
error = filter_attach(kn);
if (error != 0) {
#ifdef DEBUG
@@ -1893,6 +2030,7 @@ kqueue_register(struct kqueue *kq, struc
ft ? ft->f_ops->fo_name : "?", error);
#endif
+ fail_ev_add:
/*
* N.B. no need to check for this note to
* be in-flux, since it was never visible
@@ -1900,6 +2038,7 @@ kqueue_register(struct kqueue *kq, struc
*
* knote_detach() drops fdp->fd_lock
*/
+ knote_foplock_exit(kn);
mutex_enter(&kq->kq_lock);
KNOTE_WILLDETACH(kn);
KASSERT(kn_in_flux(kn) == false);
@@ -1957,6 +2096,7 @@ kqueue_register(struct kqueue *kq, struc
* initial EV_ADD, but doing so will not reset any
* filter which have already been triggered.
*/
+ knote_foplock_enter(kn);
kn->kn_kevent.udata = kev->udata;
KASSERT(kn->kn_fop != NULL);
if (!(kn->kn_fop->f_flags & FILTEROP_ISFD) &&
@@ -1967,6 +2107,7 @@ kqueue_register(struct kqueue *kq, struc
if (__predict_false(error != 0)) {
/* Never a new knote (which would consume newkn). */
KASSERT(newkn != NULL);
+ knote_foplock_exit(kn);
goto doneunlock;
}
} else {
@@ -1981,10 +2122,12 @@ kqueue_register(struct kqueue *kq, struc
* broken and does not return an error.
*/
done_ev_add:
- rv = filter_event(kn, 0);
+ rv = filter_event(kn, 0, false);
if (rv)
knote_activate(kn);
+ knote_foplock_exit(kn);
+
/* disable knote */
if ((kev->flags & EV_DISABLE)) {
mutex_spin_enter(&kq->kq_lock);
@@ -2256,7 +2399,9 @@ relock:
if ((kn->kn_flags & EV_ONESHOT) == 0) {
mutex_spin_exit(&kq->kq_lock);
KASSERT(mutex_owned(&fdp->fd_lock));
- rv = filter_event(kn, 0);
+ knote_foplock_enter(kn);
+ rv = filter_event(kn, 0, false);
+ knote_foplock_exit(kn);
mutex_spin_enter(&kq->kq_lock);
/* Re-poll if note was re-enqueued. */
if ((kn->kn_status & KN_QUEUED) != 0) {
@@ -2615,7 +2760,15 @@ knote(struct klist *list, long hint)
struct knote *kn, *tmpkn;
SLIST_FOREACH_SAFE(kn, list, kn_selnext, tmpkn) {
- if (filter_event(kn, hint)) {
+ /*
+ * We assume here that the backing object's lock is
+ * already held if we're traversing the klist, and
+ * so acquiring the knote foplock would create a
+ * deadlock scenario. But we also know that the klist
+ * won't disappear on us while we're here, so not
+ * acquiring it is safe.
+ */
+ if (filter_event(kn, hint, true)) {
knote_activate(kn);
}
}
@@ -2664,7 +2817,9 @@ knote_detach(struct knote *kn, filedesc_
/* Remove from monitored object. */
if (dofop) {
+ knote_foplock_enter(kn);
filter_detach(kn);
+ knote_foplock_exit(kn);
}
/* Remove from descriptor table. */
@@ -2829,7 +2984,26 @@ klist_init(struct klist *list)
void
klist_fini(struct klist *list)
{
- /* Nothing, for now. */
+ struct knote *kn;
+
+ /*
+ * Neuter all existing knotes on the klist because the list is
+ * being destroyed. The caller has guaranteed that no additional
+ * knotes will be added to the list, that the backing object's
+ * locks are not held (otherwise there is a locking order issue
+ * with acquiring the knote foplock), and that we can traverse
+ * the list safely in this state.
+ */
+ SLIST_FOREACH(kn, list, kn_selnext) {
+ knote_foplock_enter(kn);
+ KASSERT(kn->kn_fop != NULL);
+ if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
+ kn->kn_fop = &nop_fd_filtops;
+ } else {
+ kn->kn_fop = &nop_filtops;
+ }
+ knote_foplock_exit(kn);
+ }
}
/*
Index: sys/event.h
===================================================================
RCS file: /cvsroot/src/sys/sys/event.h,v
retrieving revision 1.53
diff -u -p -r1.53 event.h
--- sys/event.h 13 Jul 2022 14:11:46 -0000 1.53
+++ sys/event.h 17 Jul 2022 19:44:06 -0000
@@ -262,7 +262,6 @@ struct knote {
struct kfilter *kn_kfilter;
void *kn_hook;
int kn_hookid;
- unsigned int kn_influx; /* q: in-flux counter */
#define KN_ACTIVE 0x01U /* event has been triggered */
#define KN_QUEUED 0x02U /* event is on queue */
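Not part of the patch, but for readers who want the klist_fini() idea in isolation: below is a userland sketch of the "neuter the ops vector" trick. When the backing object goes away first, each note's ops pointer is swapped, under a per-note lock, to harmless stubs, so a later detach (e.g. at descriptor close) cannot jump into freed code or data. All of the names here (note, note_ops, note_neuter, ...) are invented for the example; build with -pthread.

#include <pthread.h>
#include <stdio.h>

struct note;

struct note_ops {
	void (*o_detach)(struct note *);
	int  (*o_event)(struct note *, long);
};

struct note {
	const struct note_ops *n_ops;
	pthread_mutex_t n_oplock;	/* plays the role of the knote foplock */
};

/* "Real" ops, standing in for a driver's filterops. */
static void
real_detach(struct note *n)
{
	(void)n;
	printf("real detach\n");
}

static int
real_event(struct note *n, long hint)
{
	(void)n;
	return hint != 0;
}

static const struct note_ops real_ops = {
	.o_detach = real_detach,
	.o_event  = real_event,
};

/* Harmless stubs, analogous to nop_filtops / nop_fd_filtops. */
static void
nop_detach(struct note *n)
{
	(void)n;
}

static int
nop_event(struct note *n, long hint)
{
	(void)n;
	(void)hint;
	return 0;
}

static const struct note_ops nop_ops = {
	.o_detach = nop_detach,
	.o_event  = nop_event,
};

/* Backing-object teardown: swap in the stubs under the per-note lock. */
static void
note_neuter(struct note *n)
{
	pthread_mutex_lock(&n->n_oplock);
	n->n_ops = &nop_ops;
	pthread_mutex_unlock(&n->n_oplock);
}

/* Later detach (e.g. descriptor close) calls through the same lock. */
static void
note_detach(struct note *n)
{
	pthread_mutex_lock(&n->n_oplock);
	n->n_ops->o_detach(n);		/* hits the stub after neutering */
	pthread_mutex_unlock(&n->n_oplock);
}

int
main(void)
{
	struct note n = { .n_ops = &real_ops };

	pthread_mutex_init(&n.n_oplock, NULL);
	note_neuter(&n);		/* the backing object went away first */
	note_detach(&n);		/* never reaches real_detach() */
	pthread_mutex_destroy(&n.n_oplock);
	return 0;
}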
-- thorpej