tech-kern archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[PATCH] Re: Locking in src/sys/dev/fss.c
Hello
I took more time on fss.c. I still have some trouble to grasp the
locking protocol in this file, but I have a proposal to address the
most offending issue: snapshot creation are serialized and the driver
locks fss_device_lock while they occur. On a big filesystem,
the operation can takes dozens of seconds while fssconfig(1)
commands get stuck in the kernel awaiting for fss_device_lock.
In the attached patch, I added a thread dedicated to snapshot
creation. This threads takes snapshot creation requests and
execute them one by one. The caller does not have to wait on
fss_device_lock.
While there, I added a few mutex_enter/mutex_exit for sc->sc_slock
where it looked inconsistent with other usages. I tried to avoid
holding the lock on long operation. One question pending: it is
okay to hold the lock during a copyin?
Opinions welcome.
--
Emmanuel Dreyfus
manu%netbsd.org@localhost
Index: fss.c
===================================================================
RCS file: /cvsroot/src/sys/dev/fss.c,v
retrieving revision 1.114
diff -U4 -r1.114 fss.c
--- fss.c 22 Mar 2023 21:14:46 -0000 1.114
+++ fss.c 9 Aug 2024 16:41:30 -0000
@@ -54,8 +54,9 @@
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/uio.h>
#include <sys/conf.h>
+#include <sys/queue.h>
#include <sys/kthread.h>
#include <sys/fstrans.h>
#include <sys/vfs_syscalls.h> /* For do_sys_unlink(). */
@@ -90,12 +91,23 @@
static void fss_bs_thread(void *);
static int fss_bs_io(struct fss_softc *, fss_io_type,
u_int32_t, off_t, int, void *, size_t *);
static u_int32_t *fss_bs_indir(struct fss_softc *, u_int32_t);
+static void fss_create_enqueue(struct fss_softc *, struct fss_set *);
+static void fss_create_thread(void *);
+
+SIMPLEQ_HEAD(fss_create_queue, fss_create_entry) fss_create_queue;
+struct fss_create_entry {
+ struct fss_softc *fce_sc;
+ struct fss_set fce_fss;
+ char fce_mount[MNAMELEN];
+ char fce_bstore[MNAMELEN];
+ SIMPLEQ_ENTRY(fss_create_entry) fce_link;
+};
static kmutex_t fss_device_lock; /* Protect all units. */
static kcondvar_t fss_device_cv; /* Serialize snapshot creation. */
-static bool fss_creating = false; /* Currently creating a snapshot. */
+lwp_t *fss_create_lwp = NULL; /* create thread lwp */
static int fss_num_attached = 0; /* Number of attached devices. */
static struct vfs_hooks fss_vfs_hooks = {
.vh_unmount = fss_unmount_hook
};
@@ -135,13 +147,38 @@
void
fssattach(int num)
{
+ int error;
mutex_init(&fss_device_lock, MUTEX_DEFAULT, IPL_NONE);
cv_init(&fss_device_cv, "snapwait");
- if (config_cfattach_attach(fss_cd.cd_name, &fss_ca))
+ SIMPLEQ_INIT(&fss_create_queue);
+
+ error = kthread_create(PRI_NONE, KTHREAD_MUSTJOIN, NULL,
+ fss_create_thread, NULL, &fss_create_lwp, "fss_create");
+ if (error) {
+ aprint_error("%s: kthread_create failed\n", fss_cd.cd_name);
+ goto out;
+ }
+
+ error = config_cfattach_attach(fss_cd.cd_name, &fss_ca);
+ if (error) {
+ aprint_error("%s: config_cfattach_attach failed\n",
+ fss_cd.cd_name);
+ goto out;
+ }
+
+out:
+ if (error) {
aprint_error("%s: unable to register\n", fss_cd.cd_name);
+ if (fss_create_lwp)
+ fss_create_thread_join();
+ cv_destroy(&fss_device_cv);
+ mutex_destroy(&fss_device_lock);
+ }
+ return;
+
}
static int
fss_match(device_t self, cfdata_t cfdata, void *aux)
@@ -219,9 +256,12 @@
if (sc == NULL) {
mutex_exit(&fss_device_lock);
return ENOMEM;
}
+
+ mutex_enter(&sc->sc_slock);
sc->sc_state = FSS_IDLE;
+ mutex_exit(&sc->sc_slock);
}
mutex_enter(&sc->sc_slock);
@@ -360,8 +400,9 @@
if (error == 0 && sc->sc_state != FSS_IDLE) {
error = EBUSY;
} else {
sc->sc_state = FSS_CREATING;
+ /* XXX copyin with mutex held? */
copyinstr(fss->fss_mount, sc->sc_mntname,
sizeof(sc->sc_mntname), NULL);
memset(&sc->sc_time, 0, sizeof(sc->sc_time));
sc->sc_clshift = 0;
@@ -372,38 +413,9 @@
/*
* Serialize snapshot creation.
*/
- mutex_enter(&fss_device_lock);
- while (fss_creating) {
- error = cv_wait_sig(&fss_device_cv, &fss_device_lock);
- if (error) {
- mutex_enter(&sc->sc_slock);
- KASSERT(sc->sc_state == FSS_CREATING);
- sc->sc_state = FSS_IDLE;
- mutex_exit(&sc->sc_slock);
- mutex_exit(&fss_device_lock);
- break;
- }
- }
- fss_creating = true;
- mutex_exit(&fss_device_lock);
-
- error = fss_create_snapshot(sc, fss, l);
- mutex_enter(&sc->sc_slock);
- if (error == 0) {
- KASSERT(sc->sc_state == FSS_ACTIVE);
- sc->sc_uflags = fss->fss_flags;
- } else {
- KASSERT(sc->sc_state == FSS_CREATING);
- sc->sc_state = FSS_IDLE;
- }
- mutex_exit(&sc->sc_slock);
-
- mutex_enter(&fss_device_lock);
- fss_creating = false;
- cv_broadcast(&fss_device_cv);
- mutex_exit(&fss_device_lock);
+ fss_create_enqueue(sc, fss);
break;
case FSSIOCCLR:
@@ -701,14 +713,14 @@
struct timespec ts;
/* distinguish lookup 1 from lookup 2 to reduce mistakes */
struct pathbuf *pb2;
struct vnode *vp, *vp2;
+ struct mount *mount;
/*
* Get the mounted file system.
*/
-
- error = namei_simple_user(fss->fss_mount,
+ error = namei_simple_kernel(fss->fss_mount,
NSM_FOLLOW_NOEMULROOT, &vp);
if (error != 0)
return error;
@@ -716,50 +728,61 @@
vrele(vp);
return EINVAL;
}
- sc->sc_mount = vp->v_mount;
- memcpy(sc->sc_mntname, sc->sc_mount->mnt_stat.f_mntonname, MNAMELEN);
+ mount = vp->v_mount;
+ mutex_enter(&sc->sc_slock);
+ sc->sc_mount = mount;
+ memcpy(sc->sc_mntname, mount->mnt_stat.f_mntonname, MNAMELEN);
+ mutex_exit(&sc->sc_slock);
vrele(vp);
/*
* Check for file system internal snapshot.
*/
- error = namei_simple_user(fss->fss_bstore,
+ error = namei_simple_kernel(fss->fss_bstore,
NSM_FOLLOW_NOEMULROOT, &vp);
if (error != 0)
return error;
- if (vp->v_type == VREG && vp->v_mount == sc->sc_mount) {
+ mutex_enter(&sc->sc_slock);
+ if (vp->v_type == VREG && vp->v_mount == mount) {
sc->sc_flags |= FSS_PERSISTENT;
sc->sc_bs_vp = vp;
- fsbsize = sc->sc_bs_vp->v_mount->mnt_stat.f_iosize;
+ fsbsize = mount->mnt_stat.f_iosize;
bits = sizeof(sc->sc_bs_bshift)*NBBY;
+
for (sc->sc_bs_bshift = 1; sc->sc_bs_bshift < bits;
sc->sc_bs_bshift++)
if (FSS_FSBSIZE(sc) == fsbsize)
break;
- if (sc->sc_bs_bshift >= bits)
+
+ if (sc->sc_bs_bshift >= bits) {
+ mutex_exit(&sc->sc_slock);
return EINVAL;
+ }
sc->sc_bs_bmask = FSS_FSBSIZE(sc)-1;
sc->sc_clshift = 0;
+ mutex_exit(&sc->sc_slock);
if ((fss->fss_flags & FSS_UNLINK_ON_CREATE) != 0) {
- error = do_sys_unlink(fss->fss_bstore, UIO_USERSPACE);
+ error = do_sys_unlink(fss->fss_bstore, UIO_SYSSPACE);
if (error)
return error;
}
error = vn_lock(vp, LK_EXCLUSIVE);
if (error != 0)
return error;
- error = VFS_SNAPSHOT(sc->sc_mount, sc->sc_bs_vp, &ts);
+ error = VFS_SNAPSHOT(mount, vp, &ts);
+ mutex_enter(&sc->sc_slock);
TIMESPEC_TO_TIMEVAL(&sc->sc_time, &ts);
+ mutex_exit(&sc->sc_slock);
- VOP_UNLOCK(sc->sc_bs_vp);
+ VOP_UNLOCK(vp);
return error;
}
vrele(vp);
@@ -767,12 +790,15 @@
/*
* Get the block device it is mounted on and its size.
*/
- error = spec_node_lookup_by_mount(sc->sc_mount, &vp);
+ error = spec_node_lookup_by_mount(mount, &vp);
if (error)
return error;
+
+ mutex_enter(&sc->sc_slock);
sc->sc_bdev = vp->v_rdev;
+ mutex_exit(&sc->sc_slock);
error = getdisksize(vp, &numsec, &secsize);
vrele(vp);
if (error)
@@ -784,19 +810,21 @@
* Get the backing store
*/
error = pathbuf_copyin(fss->fss_bstore, &pb2);
- if (error) {
+ if (error)
return error;
- }
+
error = vn_open(NULL, pb2, 0, FREAD|FWRITE, 0, &vp2, NULL, NULL);
if (error != 0) {
pathbuf_destroy(pb2);
return error;
}
VOP_UNLOCK(vp2);
+ mutex_enter(&sc->sc_slock);
sc->sc_bs_vp = vp2;
+ mutex_exit(&sc->sc_slock);
if (vp2->v_type != VREG && vp2->v_type != VCHR) {
vrele(vp2);
pathbuf_destroy(pb2);
@@ -804,27 +832,34 @@
}
pathbuf_destroy(pb2);
if ((fss->fss_flags & FSS_UNLINK_ON_CREATE) != 0) {
- error = do_sys_unlink(fss->fss_bstore, UIO_USERSPACE);
+ error = do_sys_unlink(fss->fss_bstore, UIO_SYSSPACE);
if (error)
return error;
}
+
+ mutex_enter(&sc->sc_slock);
if (sc->sc_bs_vp->v_type == VREG) {
fsbsize = sc->sc_bs_vp->v_mount->mnt_stat.f_iosize;
- if (fsbsize & (fsbsize-1)) /* No power of two */
+ if (fsbsize & (fsbsize-1)) { /* No power of two */
+ mutex_exit(&sc->sc_slock);
return EINVAL;
+ }
for (sc->sc_bs_bshift = 1; sc->sc_bs_bshift < 32;
sc->sc_bs_bshift++)
if (FSS_FSBSIZE(sc) == fsbsize)
break;
- if (sc->sc_bs_bshift >= 32)
+ if (sc->sc_bs_bshift >= 32) {
+ mutex_exit(&sc->sc_slock);
return EINVAL;
+ }
sc->sc_bs_bmask = FSS_FSBSIZE(sc)-1;
} else {
sc->sc_bs_bshift = DEV_BSHIFT;
sc->sc_bs_bmask = FSS_FSBSIZE(sc)-1;
}
+ mutex_exit(&sc->sc_slock);
return 0;
}
@@ -833,8 +868,10 @@
*/
static int
fss_create_snapshot(struct fss_softc *sc, struct fss_set *fss, struct lwp *l)
{
+ struct mount *mount = sc->sc_mount;
+ struct vnode *vp;
int len, error;
u_int32_t csize;
off_t bsize;
@@ -845,15 +882,16 @@
*/
if ((error = fss_create_files(sc, fss, &bsize, l)) != 0)
goto bad;
+ mutex_enter(&sc->sc_slock);
if (sc->sc_flags & FSS_PERSISTENT) {
- fss_softc_alloc(sc);
- mutex_enter(&sc->sc_slock);
sc->sc_state = FSS_ACTIVE;
mutex_exit(&sc->sc_slock);
+ fss_softc_alloc(sc);
return 0;
}
+ mutex_exit(&sc->sc_slock);
/*
* Set cluster size. Must be a power of two and
* a multiple of backing store block size.
@@ -864,13 +902,15 @@
csize = fss->fss_csize;
if (bsize/csize > FSS_CLUSTER_MAX)
csize = bsize/FSS_CLUSTER_MAX+1;
+ mutex_enter(&sc->sc_slock);
for (sc->sc_clshift = sc->sc_bs_bshift; sc->sc_clshift < 32;
sc->sc_clshift++)
if (FSS_CLSIZE(sc) >= csize)
break;
if (sc->sc_clshift >= 32) {
+ mutex_exit(&sc->sc_slock);
error = EINVAL;
goto bad;
}
sc->sc_clmask = FSS_CLSIZE(sc)-1;
@@ -898,52 +938,67 @@
sc->sc_indir_size = FSS_BTOCL(sc, len)+1;
sc->sc_clnext = sc->sc_indir_size;
sc->sc_indir_cur = 0;
- if ((error = fss_softc_alloc(sc)) != 0)
+ mutex_exit(&sc->sc_slock);
+
+ if ((error = fss_softc_alloc(sc)) != 0) {
goto bad;
+ }
/*
* Activate the snapshot.
*/
- if ((error = vfs_suspend(sc->sc_mount, 0)) != 0)
+ if ((error = vfs_suspend(mount, 0)) != 0)
goto bad;
+ mutex_enter(&sc->sc_slock);
microtime(&sc->sc_time);
+ mutex_exit(&sc->sc_slock);
- vrele_flush(sc->sc_mount);
- error = VFS_SYNC(sc->sc_mount, MNT_WAIT, curlwp->l_cred);
- if (error == 0)
- error = fscow_establish(sc->sc_mount, fss_copy_on_write, sc);
+ vrele_flush(mount);
+ error = VFS_SYNC(mount, MNT_WAIT, curlwp->l_cred);
+ if (error == 0)
+ error = fscow_establish(mount, fss_copy_on_write, sc);
if (error == 0) {
mutex_enter(&sc->sc_slock);
sc->sc_state = FSS_ACTIVE;
mutex_exit(&sc->sc_slock);
}
- vfs_resume(sc->sc_mount);
+ vfs_resume(mount);
if (error != 0)
goto bad;
+ mutex_enter(&sc->sc_slock);
aprint_debug_dev(sc->sc_dev, "%s snapshot active\n", sc->sc_mntname);
aprint_debug_dev(sc->sc_dev,
"%u clusters of %u, %u cache slots, %u indir clusters\n",
sc->sc_clcount, FSS_CLSIZE(sc),
sc->sc_cache_size, sc->sc_indir_size);
+ mutex_exit(&sc->sc_slock);
return 0;
bad:
fss_softc_free(sc);
- if (sc->sc_bs_vp != NULL) {
- if (sc->sc_flags & FSS_PERSISTENT)
- vrele(sc->sc_bs_vp);
- else
- vn_close(sc->sc_bs_vp, FREAD|FWRITE, l->l_cred);
+
+ mutex_enter(&sc->sc_slock);
+ if ((vp = sc->sc_bs_vp) != NULL) {
+ if (sc->sc_flags & FSS_PERSISTENT) {
+ mutex_exit(&sc->sc_slock);
+ vrele(vp);
+ } else {
+ mutex_exit(&sc->sc_slock);
+ vn_close(vp, FREAD|FWRITE, l->l_cred);
+ }
}
+
+ mutex_enter(&sc->sc_slock);
sc->sc_bs_vp = NULL;
+ mutex_exit(&sc->sc_slock);
return error;
}
@@ -1368,8 +1423,81 @@
mutex_enter(&sc->sc_slock);
}
}
+static void
+fss_create_enqueue(struct fss_softc *sc, struct fss_set *fss)
+{
+ struct fss_create_entry *fce;
+
+ fce = kmem_zalloc(sizeof(*fce), KM_SLEEP);
+ fce->fce_sc = sc;
+ if (fss) {
+ memcpy(&fce->fce_fss, fss, sizeof(*fss));
+
+ copyinstr(fss->fss_mount, fce->fce_mount,
+ sizeof(fce->fce_mount), NULL);
+ fce->fce_fss.fss_mount = fce->fce_mount;
+
+ copyinstr(fss->fss_bstore, fce->fce_bstore,
+ sizeof(fce->fce_bstore), NULL);
+ fce->fce_fss.fss_bstore = fce->fce_bstore;
+ }
+
+ mutex_enter(&fss_device_lock);
+ SIMPLEQ_INSERT_TAIL(&fss_create_queue, fce, fce_link);
+ mutex_exit(&fss_device_lock);
+
+ cv_signal(&fss_device_cv);
+
+ return;
+}
+
+/*
+ * Snapshot serialization thread, for all fss devices
+ */
+static void
+fss_create_thread(void *arg)
+{
+ struct fss_create_entry *fce;
+
+ do {
+ struct fss_softc *sc;
+ struct fss_set *fss;
+ int error;
+
+ mutex_enter(&fss_device_lock);
+ while ((fce = SIMPLEQ_FIRST(&fss_create_queue)) == NULL)
+ cv_wait(&fss_device_cv, &fss_device_lock);
+
+ SIMPLEQ_REMOVE_HEAD(&fss_create_queue, fce_link);
+
+ sc = fce->fce_sc;
+ fss = &fce->fce_fss;
+
+ if (sc == NULL) {
+ mutex_exit(&fss_device_lock);
+ kmem_free(fce, sizeof(*fce));
+ kthread_exit(0);
+ }
+
+ mutex_exit(&fss_device_lock);
+
+ error = fss_create_snapshot(sc, fss, curlwp);
+ mutex_enter(&sc->sc_slock);
+ if (error == 0) {
+ KASSERT(sc->sc_state == FSS_ACTIVE);
+ sc->sc_uflags = fss->fss_flags;
+ } else {
+ KASSERT(sc->sc_state == FSS_CREATING);
+ sc->sc_state = FSS_IDLE;
+ }
+ mutex_exit(&sc->sc_slock);
+
+ kmem_free(fce, sizeof(*fce));
+ } while (1);
+}
+
#ifdef _MODULE
#include <sys/module.h>
@@ -1377,36 +1505,55 @@
CFDRIVER_DECL(fss, DV_DISK, NULL);
devmajor_t fss_bmajor = -1, fss_cmajor = -1;
+static void
+fss_create_thread_join(void) {
+ fss_create_enqueue(NULL, NULL);
+ if (fss_create_lwp != NULL)
+ kthread_join(fss_create_lwp);
+ fss_create_lwp = NULL;
+}
+
static int
fss_modcmd(modcmd_t cmd, void *arg)
{
int error = 0;
switch (cmd) {
case MODULE_CMD_INIT:
+ error = kthread_create(PRI_NONE, KTHREAD_MUSTJOIN, NULL,
+ fss_create_thread, NULL, &fss_create_lwp, "fss_create");
+ if (error)
+ break;
+
mutex_init(&fss_device_lock, MUTEX_DEFAULT, IPL_NONE);
cv_init(&fss_device_cv, "snapwait");
error = devsw_attach(fss_cd.cd_name,
&fss_bdevsw, &fss_bmajor, &fss_cdevsw, &fss_cmajor);
if (error) {
+ fss_create_thread_join();
+ cv_destroy(&fss_device_cv);
mutex_destroy(&fss_device_lock);
break;
}
error = config_cfdriver_attach(&fss_cd);
if (error) {
devsw_detach(&fss_bdevsw, &fss_cdevsw);
+ fss_create_thread_join();
+ cv_destroy(&fss_device_cv);
mutex_destroy(&fss_device_lock);
break;
}
error = config_cfattach_attach(fss_cd.cd_name, &fss_ca);
if (error) {
config_cfdriver_detach(&fss_cd);
devsw_detach(&fss_bdevsw, &fss_cdevsw);
+ fss_create_thread_join();
+ cv_destroy(&fss_device_cv);
mutex_destroy(&fss_device_lock);
break;
}
@@ -1422,8 +1569,9 @@
config_cfattach_attach(fss_cd.cd_name, &fss_ca);
break;
}
devsw_detach(&fss_bdevsw, &fss_cdevsw);
+ fss_create_thread_join();
cv_destroy(&fss_device_cv);
mutex_destroy(&fss_device_lock);
break;
Home |
Main Index |
Thread Index |
Old Index