tech-kern archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[PATCH] Re: Locking in src/sys/dev/fss.c



Hello

I took more time on fss.c. I still have some trouble to grasp the 
locking protocol in this file, but I have a proposal to address the
most offending issue: snapshot creation are serialized and the driver
locks fss_device_lock while they occur. On a big filesystem, 
the operation can takes dozens of seconds while fssconfig(1) 
commands get stuck in the kernel awaiting for fss_device_lock.

In the attached patch, I added a thread dedicated to snapshot
creation. This threads takes snapshot creation requests and 
execute them one by one. The caller does not have to wait on
fss_device_lock. 

While there, I added a few mutex_enter/mutex_exit for sc->sc_slock
where it looked inconsistent with other usages. I tried to avoid
holding the lock on long operation. One question pending: it is
okay to hold the lock during a copyin? 

Opinions welcome.

-- 
Emmanuel Dreyfus
manu%netbsd.org@localhost
Index: fss.c
===================================================================
RCS file: /cvsroot/src/sys/dev/fss.c,v
retrieving revision 1.114
diff -U4 -r1.114 fss.c
--- fss.c	22 Mar 2023 21:14:46 -0000	1.114
+++ fss.c	9 Aug 2024 16:41:30 -0000
@@ -54,8 +54,9 @@
 #include <sys/vnode.h>
 #include <sys/file.h>
 #include <sys/uio.h>
 #include <sys/conf.h>
+#include <sys/queue.h>
 #include <sys/kthread.h>
 #include <sys/fstrans.h>
 #include <sys/vfs_syscalls.h>		/* For do_sys_unlink(). */
 
@@ -90,12 +91,23 @@
 static void fss_bs_thread(void *);
 static int fss_bs_io(struct fss_softc *, fss_io_type,
     u_int32_t, off_t, int, void *, size_t *);
 static u_int32_t *fss_bs_indir(struct fss_softc *, u_int32_t);
+static void fss_create_enqueue(struct fss_softc *, struct fss_set *);
+static void fss_create_thread(void *);
+
+SIMPLEQ_HEAD(fss_create_queue, fss_create_entry) fss_create_queue;
+struct fss_create_entry {
+	struct fss_softc *fce_sc;
+	struct fss_set fce_fss;
+	char fce_mount[MNAMELEN];
+	char fce_bstore[MNAMELEN];
+	SIMPLEQ_ENTRY(fss_create_entry) fce_link;
+};
 
 static kmutex_t fss_device_lock;	/* Protect all units. */
 static kcondvar_t fss_device_cv;	/* Serialize snapshot creation. */
-static bool fss_creating = false;	/* Currently creating a snapshot. */
+lwp_t *fss_create_lwp = NULL;		/* create thread lwp */
 static int fss_num_attached = 0;	/* Number of attached devices. */
 static struct vfs_hooks fss_vfs_hooks = {
 	.vh_unmount = fss_unmount_hook
 };
@@ -135,13 +147,38 @@
 
 void
 fssattach(int num)
 {
+	int error; 
 
 	mutex_init(&fss_device_lock, MUTEX_DEFAULT, IPL_NONE);
 	cv_init(&fss_device_cv, "snapwait");
-	if (config_cfattach_attach(fss_cd.cd_name, &fss_ca))
+	SIMPLEQ_INIT(&fss_create_queue);
+
+	error = kthread_create(PRI_NONE, KTHREAD_MUSTJOIN, NULL,
+	    fss_create_thread, NULL, &fss_create_lwp, "fss_create");
+	if (error) {
+		aprint_error("%s: kthread_create failed\n", fss_cd.cd_name);
+		goto out;
+	}
+
+	error = config_cfattach_attach(fss_cd.cd_name, &fss_ca);
+	if (error) {
+		aprint_error("%s: config_cfattach_attach failed\n", 
+		    fss_cd.cd_name);
+		goto out;
+	}
+
+out:
+	if (error) {
 		aprint_error("%s: unable to register\n", fss_cd.cd_name);
+		if (fss_create_lwp)
+			fss_create_thread_join();
+		cv_destroy(&fss_device_cv);
+		mutex_destroy(&fss_device_lock);
+	}
+	return;
+
 }
 
 static int
 fss_match(device_t self, cfdata_t cfdata, void *aux)
@@ -219,9 +256,12 @@
 		if (sc == NULL) {
 			mutex_exit(&fss_device_lock);
 			return ENOMEM;
 		}
+
+		mutex_enter(&sc->sc_slock);
 		sc->sc_state = FSS_IDLE;
+		mutex_exit(&sc->sc_slock);
 	}
 
 	mutex_enter(&sc->sc_slock);
 
@@ -360,8 +400,9 @@
 		if (error == 0 && sc->sc_state != FSS_IDLE) {
 			error = EBUSY;
 		} else {
 			sc->sc_state = FSS_CREATING;
+			/* XXX copyin with mutex held? */
 			copyinstr(fss->fss_mount, sc->sc_mntname,
 			    sizeof(sc->sc_mntname), NULL);
 			memset(&sc->sc_time, 0, sizeof(sc->sc_time));
 			sc->sc_clshift = 0;
@@ -372,38 +413,9 @@
 
 		/*
 		 * Serialize snapshot creation.
 		 */
-		mutex_enter(&fss_device_lock);
-		while (fss_creating) {
-			error = cv_wait_sig(&fss_device_cv, &fss_device_lock);
-			if (error) {
-				mutex_enter(&sc->sc_slock);
-				KASSERT(sc->sc_state == FSS_CREATING);
-				sc->sc_state = FSS_IDLE;
-				mutex_exit(&sc->sc_slock);
-				mutex_exit(&fss_device_lock);
-				break;
-			}
-		}
-		fss_creating = true;
-		mutex_exit(&fss_device_lock);
-
-		error = fss_create_snapshot(sc, fss, l);
-		mutex_enter(&sc->sc_slock);
-		if (error == 0) {
-			KASSERT(sc->sc_state == FSS_ACTIVE);
-			sc->sc_uflags = fss->fss_flags;
-		} else {
-			KASSERT(sc->sc_state == FSS_CREATING);
-			sc->sc_state = FSS_IDLE;
-		}
-		mutex_exit(&sc->sc_slock);
-
-		mutex_enter(&fss_device_lock);
-		fss_creating = false;
-		cv_broadcast(&fss_device_cv);
-		mutex_exit(&fss_device_lock);
+		fss_create_enqueue(sc, fss);
 
 		break;
 
 	case FSSIOCCLR:
@@ -701,14 +713,14 @@
 	struct timespec ts;
 	/* distinguish lookup 1 from lookup 2 to reduce mistakes */
 	struct pathbuf *pb2;
 	struct vnode *vp, *vp2;
+	struct mount *mount;
 
 	/*
 	 * Get the mounted file system.
 	 */
-
-	error = namei_simple_user(fss->fss_mount,
+	error = namei_simple_kernel(fss->fss_mount,
 				NSM_FOLLOW_NOEMULROOT, &vp);
 	if (error != 0)
 		return error;
 
@@ -716,50 +728,61 @@
 		vrele(vp);
 		return EINVAL;
 	}
 
-	sc->sc_mount = vp->v_mount;
-	memcpy(sc->sc_mntname, sc->sc_mount->mnt_stat.f_mntonname, MNAMELEN);
+	mount = vp->v_mount;
+	mutex_enter(&sc->sc_slock);
+	sc->sc_mount = mount;
+	memcpy(sc->sc_mntname, mount->mnt_stat.f_mntonname, MNAMELEN);
+	mutex_exit(&sc->sc_slock);
 
 	vrele(vp);
 
 	/*
 	 * Check for file system internal snapshot.
 	 */
 
-	error = namei_simple_user(fss->fss_bstore,
+	error = namei_simple_kernel(fss->fss_bstore,
 				NSM_FOLLOW_NOEMULROOT, &vp);
 	if (error != 0)
 		return error;
 
-	if (vp->v_type == VREG && vp->v_mount == sc->sc_mount) {
+	mutex_enter(&sc->sc_slock);
+	if (vp->v_type == VREG && vp->v_mount == mount) {
 		sc->sc_flags |= FSS_PERSISTENT;
 		sc->sc_bs_vp = vp;
 
-		fsbsize = sc->sc_bs_vp->v_mount->mnt_stat.f_iosize;
+		fsbsize = mount->mnt_stat.f_iosize;
 		bits = sizeof(sc->sc_bs_bshift)*NBBY;
+
 		for (sc->sc_bs_bshift = 1; sc->sc_bs_bshift < bits;
 		    sc->sc_bs_bshift++)
 			if (FSS_FSBSIZE(sc) == fsbsize)
 				break;
-		if (sc->sc_bs_bshift >= bits)
+
+		if (sc->sc_bs_bshift >= bits) {
+			mutex_exit(&sc->sc_slock);
 			return EINVAL;
+		}
 
 		sc->sc_bs_bmask = FSS_FSBSIZE(sc)-1;
 		sc->sc_clshift = 0;
+		mutex_exit(&sc->sc_slock);
 
 		if ((fss->fss_flags & FSS_UNLINK_ON_CREATE) != 0) {
-			error = do_sys_unlink(fss->fss_bstore, UIO_USERSPACE);
+			error = do_sys_unlink(fss->fss_bstore, UIO_SYSSPACE);
 			if (error)
 				return error;
 		}
 		error = vn_lock(vp, LK_EXCLUSIVE);
 		if (error != 0)
 			return error;
-		error = VFS_SNAPSHOT(sc->sc_mount, sc->sc_bs_vp, &ts);
+		error = VFS_SNAPSHOT(mount, vp, &ts);
+		mutex_enter(&sc->sc_slock);
 		TIMESPEC_TO_TIMEVAL(&sc->sc_time, &ts);
+		mutex_exit(&sc->sc_slock);
 
-		VOP_UNLOCK(sc->sc_bs_vp);
+		VOP_UNLOCK(vp);
 
 		return error;
 	}
 	vrele(vp);
@@ -767,12 +790,15 @@
 	/*
 	 * Get the block device it is mounted on and its size.
 	 */
 
-	error = spec_node_lookup_by_mount(sc->sc_mount, &vp);
+	error = spec_node_lookup_by_mount(mount, &vp);
 	if (error)
 		return error;
+
+	mutex_enter(&sc->sc_slock);
 	sc->sc_bdev = vp->v_rdev;
+	mutex_exit(&sc->sc_slock);
 
 	error = getdisksize(vp, &numsec, &secsize);
 	vrele(vp);
 	if (error)
@@ -784,19 +810,21 @@
 	 * Get the backing store
 	 */
 
 	error = pathbuf_copyin(fss->fss_bstore, &pb2);
-	if (error) {
+	if (error)
  		return error;
-	}
+
 	error = vn_open(NULL, pb2, 0, FREAD|FWRITE, 0, &vp2, NULL, NULL);
 	if (error != 0) {
 		pathbuf_destroy(pb2);
 		return error;
 	}
 	VOP_UNLOCK(vp2);
 
+	mutex_enter(&sc->sc_slock);
 	sc->sc_bs_vp = vp2;
+	mutex_exit(&sc->sc_slock);
 
 	if (vp2->v_type != VREG && vp2->v_type != VCHR) {
 		vrele(vp2);
 		pathbuf_destroy(pb2);
@@ -804,27 +832,34 @@
 	}
 	pathbuf_destroy(pb2);
 
 	if ((fss->fss_flags & FSS_UNLINK_ON_CREATE) != 0) {
-		error = do_sys_unlink(fss->fss_bstore, UIO_USERSPACE);
+		error = do_sys_unlink(fss->fss_bstore, UIO_SYSSPACE);
 		if (error)
 			return error;
 	}
+
+	mutex_enter(&sc->sc_slock);
 	if (sc->sc_bs_vp->v_type == VREG) {
 		fsbsize = sc->sc_bs_vp->v_mount->mnt_stat.f_iosize;
-		if (fsbsize & (fsbsize-1))	/* No power of two */
+		if (fsbsize & (fsbsize-1)) {	/* No power of two */
+			mutex_exit(&sc->sc_slock);
 			return EINVAL;
+		}
 		for (sc->sc_bs_bshift = 1; sc->sc_bs_bshift < 32;
 		    sc->sc_bs_bshift++)
 			if (FSS_FSBSIZE(sc) == fsbsize)
 				break;
-		if (sc->sc_bs_bshift >= 32)
+		if (sc->sc_bs_bshift >= 32) {
+			mutex_exit(&sc->sc_slock);
 			return EINVAL;
+		}
 		sc->sc_bs_bmask = FSS_FSBSIZE(sc)-1;
 	} else {
 		sc->sc_bs_bshift = DEV_BSHIFT;
 		sc->sc_bs_bmask = FSS_FSBSIZE(sc)-1;
 	}
+	mutex_exit(&sc->sc_slock);
 
 	return 0;
 }
 
@@ -833,8 +868,10 @@
  */
 static int
 fss_create_snapshot(struct fss_softc *sc, struct fss_set *fss, struct lwp *l)
 {
+	struct mount *mount = sc->sc_mount;
+	struct vnode *vp;
 	int len, error;
 	u_int32_t csize;
 	off_t bsize;
 
@@ -845,15 +882,16 @@
 	 */
 	if ((error = fss_create_files(sc, fss, &bsize, l)) != 0)
 		goto bad;
 
+	mutex_enter(&sc->sc_slock);
 	if (sc->sc_flags & FSS_PERSISTENT) {
-		fss_softc_alloc(sc);
-		mutex_enter(&sc->sc_slock);
 		sc->sc_state = FSS_ACTIVE;
 		mutex_exit(&sc->sc_slock);
+		fss_softc_alloc(sc);
 		return 0;
 	}
+	mutex_exit(&sc->sc_slock);
 
 	/*
 	 * Set cluster size. Must be a power of two and
 	 * a multiple of backing store block size.
@@ -864,13 +902,15 @@
 		csize = fss->fss_csize;
 	if (bsize/csize > FSS_CLUSTER_MAX)
 		csize = bsize/FSS_CLUSTER_MAX+1;
 
+	mutex_enter(&sc->sc_slock);
 	for (sc->sc_clshift = sc->sc_bs_bshift; sc->sc_clshift < 32;
 	    sc->sc_clshift++)
 		if (FSS_CLSIZE(sc) >= csize)
 			break;
 	if (sc->sc_clshift >= 32) {
+		mutex_exit(&sc->sc_slock);
 		error = EINVAL;
 		goto bad;
 	}
 	sc->sc_clmask = FSS_CLSIZE(sc)-1;
@@ -898,52 +938,67 @@
 	sc->sc_indir_size = FSS_BTOCL(sc, len)+1;
 	sc->sc_clnext = sc->sc_indir_size;
 	sc->sc_indir_cur = 0;
 
-	if ((error = fss_softc_alloc(sc)) != 0)
+	mutex_exit(&sc->sc_slock);
+
+	if ((error = fss_softc_alloc(sc)) != 0) {
 		goto bad;
+	}
 
 	/*
 	 * Activate the snapshot.
 	 */
 
-	if ((error = vfs_suspend(sc->sc_mount, 0)) != 0)
+	if ((error = vfs_suspend(mount, 0)) != 0)
 		goto bad;
 
+	mutex_enter(&sc->sc_slock);
 	microtime(&sc->sc_time);
+	mutex_exit(&sc->sc_slock);
 
-	vrele_flush(sc->sc_mount);
-	error = VFS_SYNC(sc->sc_mount, MNT_WAIT, curlwp->l_cred);
-	if (error == 0)
-		error = fscow_establish(sc->sc_mount, fss_copy_on_write, sc);
+	vrele_flush(mount);
+	error = VFS_SYNC(mount, MNT_WAIT, curlwp->l_cred);
+	if (error == 0) 
+		error = fscow_establish(mount, fss_copy_on_write, sc);
 	if (error == 0) {
 		mutex_enter(&sc->sc_slock);
 		sc->sc_state = FSS_ACTIVE;
 		mutex_exit(&sc->sc_slock);
 	}
 
-	vfs_resume(sc->sc_mount);
+	vfs_resume(mount);
 
 	if (error != 0)
 		goto bad;
 
+	mutex_enter(&sc->sc_slock);
 	aprint_debug_dev(sc->sc_dev, "%s snapshot active\n", sc->sc_mntname);
 	aprint_debug_dev(sc->sc_dev,
 	    "%u clusters of %u, %u cache slots, %u indir clusters\n",
 	    sc->sc_clcount, FSS_CLSIZE(sc),
 	    sc->sc_cache_size, sc->sc_indir_size);
+	mutex_exit(&sc->sc_slock);
 
 	return 0;
 
 bad:
 	fss_softc_free(sc);
-	if (sc->sc_bs_vp != NULL) {
-		if (sc->sc_flags & FSS_PERSISTENT)
-			vrele(sc->sc_bs_vp);
-		else
-			vn_close(sc->sc_bs_vp, FREAD|FWRITE, l->l_cred);
+
+	mutex_enter(&sc->sc_slock);
+	if ((vp = sc->sc_bs_vp) != NULL) {
+		if (sc->sc_flags & FSS_PERSISTENT) {
+			mutex_exit(&sc->sc_slock);
+			vrele(vp);
+		} else {
+			mutex_exit(&sc->sc_slock);
+			vn_close(vp, FREAD|FWRITE, l->l_cred);
+		}
 	}
+
+	mutex_enter(&sc->sc_slock);
 	sc->sc_bs_vp = NULL;
+	mutex_exit(&sc->sc_slock);
 
 	return error;
 }
 
@@ -1368,8 +1423,81 @@
 		mutex_enter(&sc->sc_slock);
 	}
 }
 
+static void
+fss_create_enqueue(struct fss_softc *sc, struct fss_set *fss)
+{
+	struct fss_create_entry *fce;
+	
+	fce = kmem_zalloc(sizeof(*fce), KM_SLEEP);
+	fce->fce_sc = sc;
+	if (fss) {
+		memcpy(&fce->fce_fss, fss, sizeof(*fss));
+
+		copyinstr(fss->fss_mount, fce->fce_mount,
+		    sizeof(fce->fce_mount), NULL);
+		fce->fce_fss.fss_mount = fce->fce_mount;
+
+		copyinstr(fss->fss_bstore, fce->fce_bstore,
+		    sizeof(fce->fce_bstore), NULL);
+		fce->fce_fss.fss_bstore = fce->fce_bstore;
+	}
+
+	mutex_enter(&fss_device_lock);
+	SIMPLEQ_INSERT_TAIL(&fss_create_queue, fce, fce_link);
+	mutex_exit(&fss_device_lock);
+
+	cv_signal(&fss_device_cv);
+
+	return;
+}
+
+/* 
+ * Snapshot serialization thread, for all fss devices
+ */
+static void
+fss_create_thread(void *arg)
+{
+	struct fss_create_entry *fce;
+
+	do {
+		struct fss_softc *sc;
+		struct fss_set *fss;
+		int error;
+
+		mutex_enter(&fss_device_lock);
+		while ((fce = SIMPLEQ_FIRST(&fss_create_queue)) == NULL)
+			cv_wait(&fss_device_cv, &fss_device_lock);
+
+		SIMPLEQ_REMOVE_HEAD(&fss_create_queue, fce_link);
+
+		sc = fce->fce_sc;
+		fss = &fce->fce_fss;
+
+		if (sc == NULL) {
+			mutex_exit(&fss_device_lock);
+			kmem_free(fce, sizeof(*fce));
+			kthread_exit(0);
+		}
+
+		mutex_exit(&fss_device_lock);
+
+		error = fss_create_snapshot(sc, fss, curlwp);
+		mutex_enter(&sc->sc_slock);
+		if (error == 0) {
+			KASSERT(sc->sc_state == FSS_ACTIVE);
+			sc->sc_uflags = fss->fss_flags;
+		} else {
+			KASSERT(sc->sc_state == FSS_CREATING);
+			sc->sc_state = FSS_IDLE;
+		}
+		mutex_exit(&sc->sc_slock);
+
+		kmem_free(fce, sizeof(*fce));
+	} while (1);
+}
+
 #ifdef _MODULE
 
 #include <sys/module.h>
 
@@ -1377,36 +1505,55 @@
 CFDRIVER_DECL(fss, DV_DISK, NULL);
 
 devmajor_t fss_bmajor = -1, fss_cmajor = -1;
 
+static void
+fss_create_thread_join(void) {
+	fss_create_enqueue(NULL, NULL);
+	if (fss_create_lwp != NULL)
+		kthread_join(fss_create_lwp);
+	fss_create_lwp = NULL;
+}
+
 static int
 fss_modcmd(modcmd_t cmd, void *arg)
 {
 	int error = 0;
 
 	switch (cmd) {
 	case MODULE_CMD_INIT:
+		error = kthread_create(PRI_NONE, KTHREAD_MUSTJOIN, NULL,
+		    fss_create_thread, NULL, &fss_create_lwp, "fss_create");
+		if (error)
+			break;
+
 		mutex_init(&fss_device_lock, MUTEX_DEFAULT, IPL_NONE);
 		cv_init(&fss_device_cv, "snapwait");
 
 		error = devsw_attach(fss_cd.cd_name,
 		    &fss_bdevsw, &fss_bmajor, &fss_cdevsw, &fss_cmajor);
 		if (error) {
+			fss_create_thread_join();
+			cv_destroy(&fss_device_cv);
 			mutex_destroy(&fss_device_lock);
 			break;
 		}
 
 		error = config_cfdriver_attach(&fss_cd);
 		if (error) {
 			devsw_detach(&fss_bdevsw, &fss_cdevsw);
+			fss_create_thread_join();
+			cv_destroy(&fss_device_cv);
 			mutex_destroy(&fss_device_lock);
 			break;
 		}
 
 		error = config_cfattach_attach(fss_cd.cd_name, &fss_ca);
 		if (error) {
 			config_cfdriver_detach(&fss_cd);
 			devsw_detach(&fss_bdevsw, &fss_cdevsw);
+			fss_create_thread_join();
+			cv_destroy(&fss_device_cv);
 			mutex_destroy(&fss_device_lock);
 			break;
 		}
 
@@ -1422,8 +1569,9 @@
 			config_cfattach_attach(fss_cd.cd_name, &fss_ca);
 			break;
 		}
 		devsw_detach(&fss_bdevsw, &fss_cdevsw);
+		fss_create_thread_join();
 		cv_destroy(&fss_device_cv);
 		mutex_destroy(&fss_device_lock);
 		break;
 


Home | Main Index | Thread Index | Old Index