Source-Changes-HG archive
[src/trunk]: src/sys/miscfs/specfs specfs: Drain all I/O operations after last .d_close call
details: https://anonhg.NetBSD.org/src/rev/e6bf5a15b610
branches: trunk
changeset: 364513:e6bf5a15b610
user: riastradh <riastradh@NetBSD.org>
date: Mon Mar 28 12:36:51 2022 +0000
description:
specfs: Drain all I/O operations after last .d_close call.
New kind of I/O reference on specdevs, sd_iocnt. This could be done
with psref instead; I chose a reference count for now because we
already have to take a per-object lock anyway, v_interlock, for
vdead_check, so one more atomic is not likely to hurt much. We can
always change the mechanism inside spec_io_enter/exit/drain later
on.
Make sure every access to vp->v_rdev or vp->v_specnode, and every
call to a devsw operation, is protected by one of:
- by the vnode lock (with vdead_check if we unlocked/relocked),
- by positive sd_opencnt,
- by spec_io_enter/exit, or
- by sd_opencnt management in open/close.
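
As a sketch of what the spec_io_enter/exit bracketing looks like at a
call site, condensed from the spec_read change in the diff below
(hypothetical function name; VCHR case only; not a drop-in
replacement):

/*
 * Condensed sketch of the new I/O bracket in spec_read.  The I/O
 * reference taken by spec_io_enter lets a concurrent last-close
 * wait in spec_io_drain for this read to finish.
 */
static int
spec_read_vchr_sketch(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct specnode *sn;
	dev_t dev;
	int error;

	VOP_UNLOCK(vp);		/* cdev_read may sleep indefinitely (ttys) */
	error = spec_io_enter(vp, &sn, &dev);
	if (error)
		goto out;	/* vnode revoked while unlocked => ENXIO */
	error = cdev_read(dev, uio, ioflag);
	spec_io_exit(vp, sn);	/* last reference out wakes spec_io_drain */
out:	vn_lock(vp, LK_SHARED | LK_RETRY);
	return error;
}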
diffstat:
sys/miscfs/specfs/spec_vnops.c | 365 ++++++++++++++++++++++++++++++----------
sys/miscfs/specfs/specdev.h | 3 +-
2 files changed, 273 insertions(+), 95 deletions(-)
diffs (truncated from 586 to 300 lines):
diff -r d24736930cdc -r e6bf5a15b610 sys/miscfs/specfs/spec_vnops.c
--- a/sys/miscfs/specfs/spec_vnops.c Mon Mar 28 12:36:42 2022 +0000
+++ b/sys/miscfs/specfs/spec_vnops.c Mon Mar 28 12:36:51 2022 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: spec_vnops.c,v 1.201 2022/03/28 12:36:42 riastradh Exp $ */
+/* $NetBSD: spec_vnops.c,v 1.202 2022/03/28 12:36:51 riastradh Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
@@ -58,7 +58,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: spec_vnops.c,v 1.201 2022/03/28 12:36:42 riastradh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: spec_vnops.c,v 1.202 2022/03/28 12:36:51 riastradh Exp $");
#include <sys/param.h>
#include <sys/proc.h>
@@ -81,6 +81,7 @@
#include <sys/kauth.h>
#include <sys/fstrans.h>
#include <sys/module.h>
+#include <sys/atomic.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/specfs/specdev.h>
@@ -173,6 +174,7 @@
{ &spec_vnodeop_p, spec_vnodeop_entries };
static kauth_listener_t rawio_listener;
+static struct kcondvar specfs_iocv;
/* Returns true if vnode is /dev/mem or /dev/kmem. */
bool
@@ -218,6 +220,141 @@
rawio_listener = kauth_listen_scope(KAUTH_SCOPE_DEVICE,
rawio_listener_cb, NULL);
+ cv_init(&specfs_iocv, "specio");
+}
+
+/*
+ * spec_io_enter(vp, &sn, &dev)
+ *
+ * Enter an operation that may not hold vp's vnode lock or an
+ * fstrans on vp's mount. Until spec_io_exit, the vnode will not
+ * be revoked.
+ *
+ * On success, set sn to the specnode pointer and dev to the dev_t
+ * number and return zero. Caller must later call spec_io_exit
+ * when done.
+ *
+ * On failure, return ENXIO -- the device has been revoked and no
+ * longer exists.
+ */
+static int
+spec_io_enter(struct vnode *vp, struct specnode **snp, dev_t *devp)
+{
+ dev_t dev;
+ struct specnode *sn;
+ unsigned iocnt;
+ int error = 0;
+
+ mutex_enter(vp->v_interlock);
+
+ /*
+ * Extract all the info we need from the vnode, unless the
+ * vnode has already been reclaimed. This can happen if the
+ * underlying device has been removed and all the device nodes
+ * for it have been revoked. The caller may not hold a vnode
+ * lock or fstrans to prevent this from happening before it has
+ * had an opportunity to notice the vnode is dead.
+ */
+ if (vdead_check(vp, VDEAD_NOWAIT) != 0 ||
+ (sn = vp->v_specnode) == NULL ||
+ (dev = vp->v_rdev) == NODEV) {
+ error = ENXIO;
+ goto out;
+ }
+
+ /*
+ * Notify spec_close that we are doing an I/O operation which
+ * may not be bracketed by fstrans(9) and thus is not
+ * blocked by vfs suspension.
+ *
+ * We could hold this reference with psref(9) instead, but we
+ * already have to take the interlock for vdead_check, so
+ * there's not much more cost here to another atomic operation.
+ */
+ do {
+ iocnt = atomic_load_relaxed(&sn->sn_dev->sd_iocnt);
+ if (__predict_false(iocnt == UINT_MAX)) {
+ /*
+ * The I/O count is limited by the number of
+ * LWPs (which will never overflow this) --
+ * unless one driver uses another driver via
+ * specfs, which is rather unusual, but which
+ * could happen via pud(4) userspace drivers.
+ * We could use a 64-bit count, but can't use
+ * atomics for that on all platforms.
+ * (Probably better to switch to psref or
+ * localcount instead.)
+ */
+ error = EBUSY;
+ goto out;
+ }
+ } while (atomic_cas_uint(&sn->sn_dev->sd_iocnt, iocnt, iocnt + 1)
+ != iocnt);
+
+ /* Success! */
+ *snp = sn;
+ *devp = dev;
+ error = 0;
+
+out: mutex_exit(vp->v_interlock);
+ return error;
+}
+
+/*
+ * spec_io_exit(vp, sn)
+ *
+ * Exit an operation entered with a successful spec_io_enter --
+ * allow concurrent spec_node_revoke to proceed. The argument sn
+ * must match the struct specnode pointer returned by spec_io_enter
+ * for vp.
+ */
+static void
+spec_io_exit(struct vnode *vp, struct specnode *sn)
+{
+ struct specdev *sd = sn->sn_dev;
+ unsigned iocnt;
+
+ KASSERT(vp->v_specnode == sn);
+
+ /*
+ * We are done. Notify spec_close if appropriate. The
+ * transition of 1 -> 0 must happen under device_lock so
+ * spec_close doesn't miss a wakeup.
+ */
+ do {
+ iocnt = atomic_load_relaxed(&sd->sd_iocnt);
+ KASSERT(iocnt > 0);
+ if (iocnt == 1) {
+ mutex_enter(&device_lock);
+ if (atomic_dec_uint_nv(&sd->sd_iocnt) == 0)
+ cv_broadcast(&specfs_iocv);
+ mutex_exit(&device_lock);
+ break;
+ }
+ } while (atomic_cas_uint(&sd->sd_iocnt, iocnt, iocnt - 1) != iocnt);
+}
+
+/*
+ * spec_io_drain(sd)
+ *
+ * Wait for all existing spec_io_enter/exit sections to complete.
+ * Caller must ensure spec_io_enter will fail at this point.
+ */
+static void
+spec_io_drain(struct specdev *sd)
+{
+
+ /*
+ * I/O at the same time as closing is unlikely -- it often
+ * indicates an application bug.
+ */
+ if (__predict_true(atomic_load_relaxed(&sd->sd_iocnt) == 0))
+ return;
+
+ mutex_enter(&device_lock);
+ while (atomic_load_relaxed(&sd->sd_iocnt) > 0)
+ cv_wait(&specfs_iocv, &device_lock);
+ mutex_exit(&device_lock);
}
/*
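
The hunk above is the heart of the change. As a hedged illustration of
the same mechanism outside the kernel, here is a userspace analogue --
C11 atomics and POSIX condition variables standing in for
atomic_ops(3), condvar(9), and device_lock; all names are hypothetical,
and this is a sketch of the pattern, not the kernel code:

#include <limits.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

static atomic_uint iocnt;
static pthread_mutex_t iolock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t iocv = PTHREAD_COND_INITIALIZER;

/* Take an I/O reference, unless the count is saturated (~EBUSY). */
static bool
io_enter(void)
{
	unsigned c = atomic_load_explicit(&iocnt, memory_order_relaxed);

	do {
		if (c == UINT_MAX)
			return false;
	} while (!atomic_compare_exchange_weak(&iocnt, &c, c + 1));
	return true;
}

/* Drop an I/O reference; wake io_drain on the 1 -> 0 transition. */
static void
io_exit(void)
{
	unsigned c = atomic_load_explicit(&iocnt, memory_order_relaxed);

	do {
		if (c == 1) {
			/*
			 * Possibly the final reference: do the decrement
			 * under the lock so io_drain cannot load a nonzero
			 * count and then sleep through the wakeup.
			 */
			pthread_mutex_lock(&iolock);
			if (atomic_fetch_sub(&iocnt, 1) == 1)
				pthread_cond_broadcast(&iocv);
			pthread_mutex_unlock(&iolock);
			return;
		}
	} while (!atomic_compare_exchange_weak(&iocnt, &c, c - 1));
}

/* Wait until all references are gone; caller prevents new io_enter. */
static void
io_drain(void)
{
	pthread_mutex_lock(&iolock);
	while (atomic_load_explicit(&iocnt, memory_order_relaxed) > 0)
		pthread_cond_wait(&iocv, &iolock);
	pthread_mutex_unlock(&iolock);
}

The essential property, in the sketch as in the kernel code, is that
the 1 -> 0 transition and the drain-side re-check happen under the same
lock, so the drainer can never observe a nonzero count and then miss
the final broadcast.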
@@ -258,6 +395,7 @@
sd->sd_refcnt = 1;
sd->sd_opencnt = 0;
sd->sd_bdevvp = NULL;
+ sd->sd_iocnt = 0;
sd->sd_opened = false;
sn->sn_dev = sd;
sd = NULL;
@@ -472,6 +610,7 @@
/* If the device is no longer in use, destroy our record. */
if (refcnt == 1) {
+ KASSERT(sd->sd_iocnt == 0);
KASSERT(sd->sd_opencnt == 0);
KASSERT(sd->sd_bdevvp == NULL);
kmem_free(sd, sizeof(*sd));
@@ -509,29 +648,26 @@
int a_mode;
kauth_cred_t a_cred;
} */ *ap = v;
- struct lwp *l;
- struct vnode *vp;
+ struct lwp *l = curlwp;
+ struct vnode *vp = ap->a_vp;
dev_t dev;
int error;
enum kauth_device_req req;
specnode_t *sn;
specdev_t *sd;
spec_ioctl_t ioctl;
- u_int gen;
- const char *name;
+ u_int gen = 0;
+ const char *name = NULL;
bool needclose = false;
struct partinfo pi;
-
- l = curlwp;
- vp = ap->a_vp;
+
+ KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
+ KASSERTMSG(vp->v_type == VBLK || vp->v_type == VCHR, "type=%d",
+ vp->v_type);
+
dev = vp->v_rdev;
sn = vp->v_specnode;
sd = sn->sn_dev;
- name = NULL;
- gen = 0;
-
- KASSERTMSG(vp->v_type == VBLK || vp->v_type == VCHR, "type=%d",
- vp->v_type);
/*
* Don't allow open if fs is mounted -nodev.
@@ -797,6 +933,8 @@
struct vnode *vp = ap->a_vp;
struct uio *uio = ap->a_uio;
struct lwp *l = curlwp;
+ struct specnode *sn;
+ dev_t dev;
struct buf *bp;
daddr_t bn;
int bsize, bscale;
@@ -819,9 +957,27 @@
switch (vp->v_type) {
case VCHR:
+ /*
+ * Release the lock while we sleep -- possibly
+ * indefinitely, if this is, e.g., a tty -- in
+ * cdev_read, so we don't hold up everything else that
+ * might want access to the vnode.
+ *
+ * But before we issue the read, take an I/O reference
+ * to the specnode so close will know when we're done
+ * reading. Note that the moment we release the lock,
+ * the vnode's identity may change; hence spec_io_enter
+ * may fail, and the caller may have a dead vnode on
+ * their hands, if the file system on which vp lived
+ * has been unmounted.
+ */
VOP_UNLOCK(vp);
- error = cdev_read(vp->v_rdev, uio, ap->a_ioflag);
- vn_lock(vp, LK_SHARED | LK_RETRY);
+ error = spec_io_enter(vp, &sn, &dev);
+ if (error)
+ goto out;
+ error = cdev_read(dev, uio, ap->a_ioflag);
+ spec_io_exit(vp, sn);
+out: vn_lock(vp, LK_SHARED | LK_RETRY);
return (error);
case VBLK:
@@ -898,6 +1054,8 @@
struct vnode *vp = ap->a_vp;
struct uio *uio = ap->a_uio;
struct lwp *l = curlwp;
+ struct specnode *sn;
+ dev_t dev;
struct buf *bp;
daddr_t bn;
int bsize, bscale;
@@ -913,9 +1071,27 @@
switch (vp->v_type) {
case VCHR:
+ /*
+ * Release the lock while we sleep -- possibly
+ * indefinitely, if this is, e.g., a tty -- in
+ * cdev_write, so we don't hold up everything else that
+ * might want access to the vnode.
+ *
+ * But before we issue the write, take an I/O reference
+ * to the specnode so close will know when we're done
+ * writing. Note that the moment we release the lock,
+ * the vnode's identity may change; hence spec_io_enter
+ * may fail, and the caller may have a dead vnode on
+ * their hands, if the file system on which vp lived
+ * has been unmounted.
+ */
VOP_UNLOCK(vp);
- error = cdev_write(vp->v_rdev, uio, ap->a_ioflag);
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
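
The diff is truncated at this point, but the spec_write VCHR path
visibly begins the same unlock/enter/io/exit/relock sequence as
spec_read above, differing only in the devsw call and the relock mode
(LK_EXCLUSIVE rather than LK_SHARED). Purely as an illustration -- this
hypothetical helper is not part of the change -- the shared sequence
could be factored as:

/*
 * Hypothetical factoring of the sequence shared by the VCHR paths
 * of spec_read and spec_write.  `op' would be cdev_read or
 * cdev_write; `lkflags' LK_SHARED or LK_EXCLUSIVE respectively.
 */
static int
spec_cdev_io(struct vnode *vp, struct uio *uio, int ioflag, int lkflags,
    int (*op)(dev_t, struct uio *, int))
{
	struct specnode *sn;
	dev_t dev;
	int error;

	VOP_UNLOCK(vp);		/* don't hold the vnode lock across a sleep */
	error = spec_io_enter(vp, &sn, &dev);
	if (error)
		goto out;	/* vnode revoked while unlocked */
	error = (*op)(dev, uio, ioflag);
	spec_io_exit(vp, sn);
out:	vn_lock(vp, lkflags | LK_RETRY);
	return error;
}

Either way, the caller gets vp back locked, whether or not the device
was revoked while the lock was dropped.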