Source-Changes-HG archive


[src/trunk]: src/sys/kern driver(9): Fix synchronization of devsw_attach/look...



details:   https://anonhg.NetBSD.org/src/rev/f7fc1af35aca
branches:  trunk
changeset: 364490:f7fc1af35aca
user:      riastradh <riastradh%NetBSD.org@localhost>
date:      Mon Mar 28 12:33:32 2022 +0000

description:
driver(9): Fix synchronization of devsw_attach/lookup/detach.

(`dev' means either `bdev' or `cdev' for brevity here, e.g. in
`devsw_lookup' (bdevsw_lookup, cdevsw_lookup), `dev_open' (bdev_open,
cdev_open), `max_devsws', &c., except for `devsw_attach' and
`devsw_detach' which are taken literally.)

- Use atomic_store_release and atomic_load_consume for the devsw
  tables and their entries, which are read unlocked and thus require
  memory barriers to ensure ordering between initialization in
  devsw_attach and use in devsw_lookup.
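
For illustration only (not part of the commit): a minimal sketch of
the release/consume pairing described above, with a made-up table
`tab' of entry pointers that an attach path writes under device_lock
and a lookup path reads unlocked (`struct entry', `e_open', and
`mydev_open' are placeholders):

        /* writer (attach), under device_lock: init first, publish last */
        struct entry *e = kmem_zalloc(sizeof(*e), KM_SLEEP);
        e->e_open = mydev_open;                 /* ordinary stores... */
        atomic_store_release(&tab[major], e);   /* ...then release */

        /* reader (lookup), unlocked: consume pairs with the release */
        const struct entry *e = atomic_load_consume(&tab[major]);
        if (e != NULL)
                error = (*e->e_open)(dev, flag, mode, l); /* init visible */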

- Use pserialize(9) and localcount(9) to synchronize dev_open and
  devsw_detach.

  => Driver must ensure d_open fails and all open instances have been
     closed by the time it calls devsw_detach.

  => Bonus: dev_open is no longer globally serialized through
     device_lock.
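
For illustration only (not the committed code): a rough sketch, using
the names from this patch but with most details omitted, of how an
open path can combine pserialize(9) and localcount(9) so that
devsw_detach can wait out in-flight opens:

        /* look up the entry and pin it within a pserialize read section */
        int s = pserialize_read_enter();
        const struct cdevsw *d = atomic_load_consume(&cdevsw)[major];
        if (d != NULL)
                localcount_acquire(cdevswref[major].dr_lc);
        pserialize_read_exit(s);

        /* call the driver outside the read section; release when done */
        if (d != NULL) {
                error = (*d->d_open)(dev, flag, mode, l);
                localcount_release(cdevswref[major].dr_lc,
                    &devsw_cv, &device_lock);
        }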

- Use atomic_store_release and atomic_load_acquire for max_devsws,
  which is used in conditionals in the new devsw_lookup_acquire.

  => It is safe to use atomic_load_relaxed in devsw_lookup because
     the caller must guarantee the entry is stable, so any increase
     of max_devsws must have already happened.

  => devsw_lookup and devsw_lookup_acquire assume that max_devsws
     never goes down.  If you change this you must find some way to
     adapt the users, preferably without adding much overhead so that
     devsw operations are cheap.
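
For illustration only, the two load disciplines side by side in the
bdev case (the cdev case is symmetric):

        /*
         * devsw_lookup_acquire: acquire orders the bounds check
         * with a concurrent table extension in devsw_attach.
         */
        if (bmajor < 0 || bmajor >= atomic_load_acquire(&max_bdevsws))
                return NULL;

        /*
         * devsw_lookup: the caller keeps the entry stable, so any
         * growth of max_bdevsws already happened; relaxed suffices.
         */
        if (bmajor < 0 || bmajor >= atomic_load_relaxed(&max_bdevsws))
                return NULL;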

This change introduces an auxiliary table devswref mapping device
majors to localcounts of opens in progress.  The auxiliary table only
occupies one pointer's worth of memory in a monolithic kernel, and is
allocated on the fly for dynamically loaded modules.  We could ask
the module itself to reserve storage for it, but I don't see much
value in that, and it would require some changes to the ABI and to
config(8).
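
For illustration only, the lifetime of one major's slot in the
auxiliary table, condensed from the patch below (cdev side shown):

        /* cdevsw_attach: set up the count before publishing the devsw */
        lc = kmem_zalloc(sizeof(*lc), KM_SLEEP);
        localcount_init(lc);
        cdevswref[major].dr_lc = lc;

        /*
         * devsw_detach_locked: after clearing cdevsw[major] and waiting
         * for concurrent lookups with xc_barrier, drain and free it.
         */
        localcount_drain(lc, &devsw_cv, &device_lock);
        localcount_fini(lc);
        kmem_free(lc, sizeof(*lc));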

- Omit needless boolean indirection.

diffstat:

 sys/kern/subr_devsw.c |  310 ++++++++++++++++++++++++++++++++++++++++---------
 1 files changed, 251 insertions(+), 59 deletions(-)

diffs (truncated from 491 to 300 lines):

diff -r 7775bf1a281f -r f7fc1af35aca sys/kern/subr_devsw.c
--- a/sys/kern/subr_devsw.c     Mon Mar 28 12:33:20 2022 +0000
+++ b/sys/kern/subr_devsw.c     Mon Mar 28 12:33:32 2022 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: subr_devsw.c,v 1.39 2022/03/28 12:33:22 riastradh Exp $        */
+/*     $NetBSD: subr_devsw.c,v 1.40 2022/03/28 12:33:32 riastradh Exp $        */
 
 /*-
  * Copyright (c) 2001, 2002, 2007, 2008 The NetBSD Foundation, Inc.
@@ -69,7 +69,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: subr_devsw.c,v 1.39 2022/03/28 12:33:22 riastradh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: subr_devsw.c,v 1.40 2022/03/28 12:33:32 riastradh Exp $");
 
 #ifdef _KERNEL_OPT
 #include "opt_dtrace.h"
@@ -85,6 +85,10 @@
 #include <sys/buf.h>
 #include <sys/reboot.h>
 #include <sys/sdt.h>
+#include <sys/atomic.h>
+#include <sys/localcount.h>
+#include <sys/pserialize.h>
+#include <sys/xcall.h>
 
 #ifdef DEVSW_DEBUG
 #define        DPRINTF(x)      printf x
@@ -97,12 +101,21 @@
 #define        CDEVSW_SIZE     (sizeof(struct cdevsw *))
 #define        DEVSWCONV_SIZE  (sizeof(struct devsw_conv))
 
+struct devswref {
+       struct localcount       *dr_lc;
+};
+
+/* XXX bdevsw, cdevsw, max_bdevsws, and max_cdevsws should be volatile */
 extern const struct bdevsw **bdevsw, *bdevsw0[];
 extern const struct cdevsw **cdevsw, *cdevsw0[];
 extern struct devsw_conv *devsw_conv, devsw_conv0[];
 extern const int sys_bdevsws, sys_cdevsws;
 extern int max_bdevsws, max_cdevsws, max_devsw_convs;
 
+static struct devswref *cdevswref;
+static struct devswref *bdevswref;
+static kcondvar_t devsw_cv;
+
 static int bdevsw_attach(const struct bdevsw *, devmajor_t *);
 static int cdevsw_attach(const struct cdevsw *, devmajor_t *);
 static void devsw_detach_locked(const struct bdevsw *, const struct cdevsw *);
@@ -118,6 +131,8 @@
        KASSERT(sys_bdevsws < MAXDEVSW - 1);
        KASSERT(sys_cdevsws < MAXDEVSW - 1);
        mutex_init(&device_lock, MUTEX_DEFAULT, IPL_NONE);
+
+       cv_init(&devsw_cv, "devsw");
 }
 
 int
@@ -158,15 +173,13 @@
                        error = EEXIST;
                        goto fail;
                }
-
-               if (bdev != NULL)
-                       bdevsw[*bmajor] = bdev;
-               cdevsw[*cmajor] = cdev;
-
-               mutex_exit(&device_lock);
-               return (0);
+               break;
        }
 
+       /*
+        * XXX This should allocate what it needs up front so we never
+        * need to flail around trying to unwind.
+        */
        error = bdevsw_attach(bdev, bmajor);
        if (error != 0) 
                goto fail;
@@ -176,6 +189,13 @@
                goto fail;
        }
 
+       /*
+        * If we already found a conv, we're done.  Otherwise, find an
+        * empty slot or extend the table.
+        */
+       if (i == max_devsw_convs)
+               goto fail;
+
        for (i = 0 ; i < max_devsw_convs ; i++) {
                if (devsw_conv[i].d_name == NULL)
                        break;
@@ -224,7 +244,9 @@
 static int
 bdevsw_attach(const struct bdevsw *devsw, devmajor_t *devmajor)
 {
-       const struct bdevsw **newptr;
+       const struct bdevsw **newbdevsw = NULL;
+       struct devswref *newbdevswref = NULL;
+       struct localcount *lc;
        devmajor_t bmajor;
        int i;
 
@@ -253,20 +275,34 @@
                return (ENOMEM);
        }
 
+       if (bdevswref == NULL) {
+               newbdevswref = kmem_zalloc(MAXDEVSW * sizeof(newbdevswref[0]),
+                   KM_NOSLEEP);
+               if (newbdevswref == NULL)
+                       return ENOMEM;
+               atomic_store_release(&bdevswref, newbdevswref);
+       }
+
        if (*devmajor >= max_bdevsws) {
                KASSERT(bdevsw == bdevsw0);
-               newptr = kmem_zalloc(MAXDEVSW * BDEVSW_SIZE, KM_NOSLEEP);
-               if (newptr == NULL)
-                       return (ENOMEM);
-               memcpy(newptr, bdevsw, max_bdevsws * BDEVSW_SIZE);
-               bdevsw = newptr;
-               max_bdevsws = MAXDEVSW;
+               newbdevsw = kmem_zalloc(MAXDEVSW * sizeof(newbdevsw[0]),
+                   KM_NOSLEEP);
+               if (newbdevsw == NULL)
+                       return ENOMEM;
+               memcpy(newbdevsw, bdevsw, max_bdevsws * sizeof(bdevsw[0]));
+               atomic_store_release(&bdevsw, newbdevsw);
+               atomic_store_release(&max_bdevsws, MAXDEVSW);
        }
 
        if (bdevsw[*devmajor] != NULL)
                return (EEXIST);
 
-       bdevsw[*devmajor] = devsw;
+       KASSERT(bdevswref[*devmajor].dr_lc == NULL);
+       lc = kmem_zalloc(sizeof(*lc), KM_SLEEP);
+       localcount_init(lc);
+       bdevswref[*devmajor].dr_lc = lc;
+
+       atomic_store_release(&bdevsw[*devmajor], devsw);
 
        return (0);
 }
@@ -274,7 +310,9 @@
 static int
 cdevsw_attach(const struct cdevsw *devsw, devmajor_t *devmajor)
 {
-       const struct cdevsw **newptr;
+       const struct cdevsw **newcdevsw = NULL;
+       struct devswref *newcdevswref = NULL;
+       struct localcount *lc;
        devmajor_t cmajor;
        int i;
 
@@ -300,20 +338,34 @@
                return (ENOMEM);
        }
 
+       if (cdevswref == NULL) {
+               newcdevswref = kmem_zalloc(MAXDEVSW * sizeof(newcdevswref[0]),
+                   KM_NOSLEEP);
+               if (newcdevswref == NULL)
+                       return ENOMEM;
+               atomic_store_release(&cdevswref, newcdevswref);
+       }
+
        if (*devmajor >= max_cdevsws) {
                KASSERT(cdevsw == cdevsw0);
-               newptr = kmem_zalloc(MAXDEVSW * CDEVSW_SIZE, KM_NOSLEEP);
-               if (newptr == NULL)
-                       return (ENOMEM);
-               memcpy(newptr, cdevsw, max_cdevsws * CDEVSW_SIZE);
-               cdevsw = newptr;
-               max_cdevsws = MAXDEVSW;
+               newcdevsw = kmem_zalloc(MAXDEVSW * sizeof(newcdevsw[0]),
+                   KM_NOSLEEP);
+               if (newcdevsw == NULL)
+                       return ENOMEM;
+               memcpy(newcdevsw, cdevsw, max_cdevsws * sizeof(cdevsw[0]));
+               atomic_store_release(&cdevsw, newcdevsw);
+               atomic_store_release(&max_cdevsws, MAXDEVSW);
        }
 
        if (cdevsw[*devmajor] != NULL)
                return (EEXIST);
 
-       cdevsw[*devmajor] = devsw;
+       KASSERT(cdevswref[*devmajor].dr_lc == NULL);
+       lc = kmem_zalloc(sizeof(*lc), KM_SLEEP);
+       localcount_init(lc);
+       cdevswref[*devmajor].dr_lc = lc;
+
+       atomic_store_release(&cdevsw[*devmajor], devsw);
 
        return (0);
 }
@@ -321,25 +373,67 @@
 static void
 devsw_detach_locked(const struct bdevsw *bdev, const struct cdevsw *cdev)
 {
-       int i;
+       int bi, ci = -1/*XXXGCC*/;
 
        KASSERT(mutex_owned(&device_lock));
 
+       /* Prevent new references.  */
        if (bdev != NULL) {
-               for (i = 0 ; i < max_bdevsws ; i++) {
-                       if (bdevsw[i] != bdev)
+               for (bi = 0; bi < max_bdevsws; bi++) {
+                       if (bdevsw[bi] != bdev)
                                continue;
-                       bdevsw[i] = NULL;
+                       atomic_store_relaxed(&bdevsw[bi], NULL);
+                       break;
+               }
+               KASSERT(bi < max_bdevsws);
+       }
+       if (cdev != NULL) {
+               for (ci = 0; ci < max_cdevsws; ci++) {
+                       if (cdevsw[ci] != cdev)
+                               continue;
+                       atomic_store_relaxed(&cdevsw[ci], NULL);
                        break;
                }
+               KASSERT(ci < max_cdevsws);
+       }
+
+       if (bdev == NULL && cdev == NULL) /* XXX possible? */
+               return;
+
+       /*
+        * Wait for all bdevsw_lookup_acquire, cdevsw_lookup_acquire
+        * calls to notice that the devsw is gone.
+        *
+        * XXX Despite the use of the pserialize_read_enter/exit API
+        * elsewhere in this file, we use xc_barrier here instead of
+        * pserialize_perform -- because devsw_init is too early for
+        * pserialize_create.  Either pserialize_create should be made
+        * to work earlier, or it should be nixed altogether.  Until
+        * that is fixed, xc_barrier will serve the same purpose.
+        */
+       xc_barrier(0);
+
+       /*
+        * Wait for all references to drain.  It is the caller's
+        * responsibility to ensure that at this point, there are no
+        * extant open instances and all new d_open calls will fail.
+        *
+        * Note that localcount_drain may release and reacquire
+        * device_lock.
+        */
+       if (bdev != NULL) {
+               localcount_drain(bdevswref[bi].dr_lc,
+                   &devsw_cv, &device_lock);
+               localcount_fini(bdevswref[bi].dr_lc);
+               kmem_free(bdevswref[bi].dr_lc, sizeof(*bdevswref[bi].dr_lc));
+               bdevswref[bi].dr_lc = NULL;
        }
        if (cdev != NULL) {
-               for (i = 0 ; i < max_cdevsws ; i++) {
-                       if (cdevsw[i] != cdev)
-                               continue;
-                       cdevsw[i] = NULL;
-                       break;
-               }
+               localcount_drain(cdevswref[ci].dr_lc,
+                   &devsw_cv, &device_lock);
+               localcount_fini(cdevswref[ci].dr_lc);
+               kmem_free(cdevswref[ci].dr_lc, sizeof(*cdevswref[ci].dr_lc));
+               cdevswref[ci].dr_lc = NULL;
        }
 }
 
@@ -365,10 +459,59 @@
        if (dev == NODEV)
                return (NULL);
        bmajor = major(dev);
-       if (bmajor < 0 || bmajor >= max_bdevsws)
+       if (bmajor < 0 || bmajor >= atomic_load_relaxed(&max_bdevsws))
                return (NULL);
 
-       return (bdevsw[bmajor]);
+       return atomic_load_consume(&bdevsw)[bmajor];
+}
+
+static const struct bdevsw *
+bdevsw_lookup_acquire(dev_t dev, struct localcount **lcp)
+{
+       devmajor_t bmajor;
+       const struct bdevsw *bdev = NULL, *const *curbdevsw;
+       struct devswref *curbdevswref;
+       int s;
+
+       if (dev == NODEV)


