tech-kern archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

Re: vnd locking



On Fri, May 22, 2015 at 04:56:57PM +0000, Taylor R Campbell wrote:
>    Date: Fri, 22 May 2015 13:09:58 +0100
>    From: Patrick Welche <prlw1%cam.ac.uk@localhost>
> 
>    There are a load of locking PRs involving vnd. I thought I would have a
>    stab at converting vnd to use condvars and mutexes (mutices?) as a first
>    step. I cobbled something together which allows me to
>    [...]
>    with a LOCKDEBUG kernel, so I may have guessed right, but I haven't found
>    any documentation on The Right Way, so please review carefully, assume
>    the worst, and tell me how to do it better!

I may have regressed: with the attached patch, I now get a locking
error in vndthread at:

        /*
         * Allocate a header for this transfer and link it to the
         * buffer
         */
        mutex_enter(&vnd->sc_sclock); <------------ XXX already locked?!
        vnx = VND_GETXFER(vnd); /* may sleep */
        mutex_exit(&vnd->sc_sclock);

(That is assuming that coredumps and gdb are accurate...)

> First thought: I'd suggest making a branch for this so you can work on
> it incrementally without breaking the world, or at least a local Git
> branch so you can work on separate commits for different issues.
> 
> Concur with all hannken@'s comments.  Small bug: you almost certainly
> need sc_sclock to be at IPL_BIO, not IPL_NONE, if you want it to
> replace splbio/splx pairs.  Minor nit: diff -pu, please.

Committed the whitespace trivia to remove distractions.
Changed to IPL_BIO - I wondered whether vnd shouldn't have a lower
priority than the filesystem of the file backing in it. I suppose
not as they both do i/o?
 
> You converted the VNF_LOCKED/VNF_WANTED flags into a mutex sc_sclock
> (why not just `sc_lock'?), but the semantics doesn't quite carry over:
> previously, vndlock would wait interruptibly, and the owner of
> VNF_LOCKED was allowed to do I/O.  But you can't do that with a mutex
> -- you can't wait interruptibly or do I/O while holding one.

I put vnd{,un}lock back, and hopefully converted it correctly to
condvar. So, after a call to vndlock, the lock was obtained
interruptibly and released.

> Right now you just drop sc_sclock around vndgetdisklabel, and don't
> take it for DIOCWDINFO &c., but then concurrent callers can stomp all
> over one another.
>
> You will probably have to introduce a new state, e.g. VNF_IO (or maybe
> just VNF_DISKLABELIO), and make anyone trying to use the disklabel or
> any other vnd-internal I/O wait for it to complete.

With vndlock back, the lock isn't held before vndgetdisklabel, so I
don't have to release it, and DIOCWDINFO is back within a vndlock/vndunlock
pair. If it was possible to acquire the lock, then nothing else should
be executing the code concurrently even though one isn't still holding
it? (readdisklabel calls vndstrategy which acquires the lock)

> 
>    - there was some sort of start kernel thread, thread sets variable,
>      meanwhile loop waiting for variable to be set. I removed that. On
>      the other hand I added a MUSTJOIN for the kernel thread exiting
>      side of things.
> 
> I suggest moving the kthread_join around a little bit.  Don't read
> sc_flags without the lock; instead, grab the thread while under the
> lock, and kthread_join it outside.

Is this what you mean:

                mutex_enter(&vnd->sc_sclock);
...
                kt = vnd->sc_kthread;
...
                mutex_exit(&vnd->sc_sclock);
                kthread_join(kt);


> I suspect you will need some way for vndset to wait until any pending
> vndclear is actually complete.  I don't see that immediately.  Note
> that you won't hit any of this in testing, no matter how hard you
> hammer on it, until you add D_MPSAFE to vnd_bdevsw/vnd_cdevsw.

I switched on all the MPSAFEty for all the fallout in this patch...
(Haven't considered vndset/clear yet)

>    - given that vnd is a pseudo-device, do I need an extra lock to care
>      of device creation/destruction? (What am I protecting against?
> 
> There is an MP safety issue with device_lookup_private: if you detach
> a driver just after someone does device_lookup_private, nothing causes
> the detach to block until the caller of device_lookup_private is done.
> But this is a general issue in autoconf, not a vnd-specific issue.  I
> don't, in my cursory glance, see any obvious MP safety issues in the
> attachment goo in your patch.

OK...

Thanks!

Patrick
Index: vndvar.h
===================================================================
RCS file: /cvsroot/src/sys/dev/vndvar.h,v
retrieving revision 1.34
diff -u -p -r1.34 vndvar.h
--- vndvar.h	25 May 2015 20:57:18 -0000	1.34
+++ vndvar.h	31 May 2015 11:52:59 -0000
@@ -107,14 +107,19 @@ struct vnode;
  */
 struct vnd_softc {
 	device_t	 sc_dev;
+	kmutex_t	 sc_sclock;	/* protect this softc */
 	int		 sc_flags;	/* flags */
+	kcondvar_t	 sc_intercv;	/* interruptible lock condvar */
 	uint64_t	 sc_size;	/* size of vnd */
 	struct vnode	*sc_vp;		/* vnode */
 	kauth_cred_t	 sc_cred;	/* credentials */
-	int		 sc_maxactive;	/* max # of active requests */
 	struct bufq_state *sc_tab;	/* transfer queue */
+	kcondvar_t	 sc_bufqcv;	/* queue's condvar */
 	int		 sc_pending;	/* number of pending transfers */
+	kcondvar_t	 sc_pendingcv;	/* pending transfer condvar */
+	int		 sc_maxactive;	/* max # of active requests */
 	int		 sc_active;	/* number of active transfers */
+	kcondvar_t	 sc_activitycv;	/* activity condvar */
 	struct disk	 sc_dkdev;	/* generic disk device info */
 	struct vndgeom	 sc_geom;	/* virtual geometry */
 	struct pool	 sc_vxpool;	/* vndxfer pool */
@@ -134,13 +139,11 @@ struct vnd_softc {
 #define	VNF_INITED	0x001	/* unit has been initialized */
 #define	VNF_WLABEL	0x002	/* label area is writable */
 #define	VNF_LABELLING	0x004	/* unit is currently being labelled */
-#define	VNF_WANTED	0x008	/* someone is waiting to obtain a lock */
 #define	VNF_LOCKED	0x010	/* unit is locked */
 #define	VNF_READONLY	0x020	/* unit is read-only */
 #define	VNF_KLABEL	0x040	/* keep label on close */
 #define	VNF_VLABEL	0x080	/* label is valid */
 #define	VNF_KTHREAD	0x100	/* thread is running */
-#define	VNF_VUNCONF	0x200	/* device is unconfiguring */
 #define VNF_COMP	0x400	/* file is compressed */
 #define VNF_CLEARING	0x800	/* unit is being torn down */
 #define VNF_USE_VN_RDWR	0x1000	/* have to use vn_rdwr() */
Index: vnd.c
===================================================================
RCS file: /cvsroot/src/sys/dev/vnd.c,v
retrieving revision 1.245
diff -u -p -r1.245 vnd.c
--- vnd.c	25 May 2015 21:02:37 -0000	1.245
+++ vnd.c	31 May 2015 11:53:00 -0000
@@ -204,7 +204,7 @@ const struct bdevsw vnd_bdevsw = {
 	.d_dump = vnddump,
 	.d_psize = vndsize,
 	.d_discard = nodiscard,
-	.d_flag = D_DISK
+	.d_flag = D_DISK | D_MPSAFE
 };
 
 const struct cdevsw vnd_cdevsw = {
@@ -219,7 +219,7 @@ const struct cdevsw vnd_cdevsw = {
 	.d_mmap = nommap,
 	.d_kqfilter = nokqfilter,
 	.d_discard = nodiscard,
-	.d_flag = D_DISK
+	.d_flag = D_DISK | D_MPSAFE
 };
 
 static int	vnd_match(device_t, cfdata_t, void *);
@@ -241,12 +241,9 @@ static struct	dkdriver vnddkdriver = {
 void
 vndattach(int num)
 {
-	int error;
 
-	error = config_cfattach_attach(vnd_cd.cd_name, &vnd_ca);
-	if (error)
-		aprint_error("%s: unable to register cfattach\n",
-		    vnd_cd.cd_name);
+	if (config_cfattach_attach(vnd_cd.cd_name, &vnd_ca))
+		aprint_error("%s: unable to register\n", vnd_cd.cd_name);
 }
 
 static int
@@ -265,6 +262,11 @@ vnd_attach(device_t parent, device_t sel
 	sc->sc_comp_offsets = NULL;
 	sc->sc_comp_buff = NULL;
 	sc->sc_comp_decombuf = NULL;
+	mutex_init(&sc->sc_sclock, MUTEX_DEFAULT, IPL_BIO);
+	cv_init(&sc->sc_intercv, "vndlck");
+	cv_init(&sc->sc_activitycv, "vndac");
+	cv_init(&sc->sc_bufqcv, "vndbp");
+	cv_init(&sc->sc_pendingcv, "vndpc");
 	bufq_alloc(&sc->sc_tab, "disksort", BUFQ_SORT_RAWBLOCK);
 	disk_init(&sc->sc_dkdev, device_xname(self), &vnddkdriver);
 	if (!pmf_device_register(self, NULL, NULL))
@@ -284,6 +286,11 @@ vnd_detach(device_t self, int flags)
 	}
 
 	pmf_device_deregister(self);
+	mutex_destroy(&sc->sc_sclock);
+	cv_destroy(&sc->sc_intercv);
+	cv_destroy(&sc->sc_activitycv);
+	cv_destroy(&sc->sc_bufqcv);
+	cv_destroy(&sc->sc_pendingcv);
 	bufq_free(sc->sc_tab);
 	disk_destroy(&sc->sc_dkdev);
 
@@ -374,7 +381,9 @@ vndopen(dev_t dev, int flags, int mode, 
 			 */
 			if ((sc->sc_flags & VNF_VLABEL) == 0) {
 				sc->sc_flags |= VNF_VLABEL;
+				/* mutex_exit(&sc->sc_sclock); */
 				vndgetdisklabel(dev, sc);
+				/* mutex_enter(&sc->sc_sclock); */
 			}
 		}
 	}
@@ -470,7 +479,8 @@ vndstrategy(struct buf *bp)
 	    device_lookup_private(&vnd_cd, unit);
 	struct disklabel *lp;
 	daddr_t blkno;
-	int s = splbio();
+
+	mutex_enter(&vnd->sc_sclock);
 
 	if (vnd == NULL) {
 		bp->b_error = ENXIO;
@@ -545,18 +555,20 @@ vndstrategy(struct buf *bp)
 		KASSERT(vnd->sc_pending >= 0 &&
 		    vnd->sc_pending <= VND_MAXPENDING(vnd));
 		while (vnd->sc_pending == VND_MAXPENDING(vnd))
-			tsleep(&vnd->sc_pending, PRIBIO, "vndpc", 0);
+			cv_wait(&vnd->sc_pendingcv, &vnd->sc_sclock);
 		vnd->sc_pending++;
 	}
+	mutex_exit(&vnd->sc_sclock);
 	bufq_put(vnd->sc_tab, bp);
-	wakeup(&vnd->sc_tab);
-	splx(s);
+	mutex_enter(&vnd->sc_sclock);
+	cv_signal(&vnd->sc_bufqcv);
+	mutex_exit(&vnd->sc_sclock);
 	return;
 
 done:
+	mutex_exit(&vnd->sc_sclock);
 	bp->b_resid = bp->b_bcount;
 	biodone(bp);
-	splx(s);
 }
 
 static bool
@@ -601,7 +613,8 @@ static void
 vndthread(void *arg)
 {
 	struct vnd_softc *vnd = arg;
-	int s;
+
+	mutex_enter(&vnd->sc_sclock);
 
 	/* Determine whether we can *use* VOP_BMAP and VOP_STRATEGY to
 	 * directly access the backing vnode.  If we can, use these two
@@ -620,31 +633,27 @@ vndthread(void *arg)
 		    "using read/write operations");
 #endif
 
-	s = splbio();
-	vnd->sc_flags |= VNF_KTHREAD;
-	wakeup(&vnd->sc_kthread);
-
 	/*
 	 * Dequeue requests and serve them depending on the available
 	 * vnode operations.
 	 */
-	while ((vnd->sc_flags & VNF_VUNCONF) == 0) {
+	while ((vnd->sc_flags & VNF_KTHREAD) != 0) {
 		struct vndxfer *vnx;
 		struct buf *obp;
 		struct buf *bp;
 
 		obp = bufq_get(vnd->sc_tab);
 		if (obp == NULL) {
-			tsleep(&vnd->sc_tab, PRIBIO, "vndbp", 0);
+			cv_wait(&vnd->sc_bufqcv, &vnd->sc_sclock);
 			continue;
 		};
 		if ((vnd->sc_flags & VNF_USE_VN_RDWR)) {
 			KASSERT(vnd->sc_pending > 0 &&
 			    vnd->sc_pending <= VND_MAXPENDING(vnd));
 			if (vnd->sc_pending-- == VND_MAXPENDING(vnd))
-				wakeup(&vnd->sc_pending);
+				cv_signal(&vnd->sc_pendingcv);
 		}
-		splx(s);
+		mutex_exit(&vnd->sc_sclock);
 #ifdef DEBUG
 		if (vnddebug & VDB_FOLLOW)
 			printf("vndthread(%p)\n", obp);
@@ -672,17 +681,17 @@ vndthread(void *arg)
 		 * Allocate a header for this transfer and link it to the
 		 * buffer
 		 */
-		s = splbio();
-		vnx = VND_GETXFER(vnd);
-		splx(s);
+		mutex_enter(&vnd->sc_sclock);
+		vnx = VND_GETXFER(vnd); /* may sleep */
+		mutex_exit(&vnd->sc_sclock);
 		vnx->vx_vnd = vnd;
 
-		s = splbio();
+		mutex_enter(&vnd->sc_sclock);
 		while (vnd->sc_active >= vnd->sc_maxactive) {
-			tsleep(&vnd->sc_tab, PRIBIO, "vndac", 0);
+			cv_wait(&vnd->sc_activitycv, &vnd->sc_sclock);
 		}
 		vnd->sc_active++;
-		splx(s);
+		mutex_exit(&vnd->sc_sclock);
 
 		/* Instrumentation. */
 		disk_busy(&vnd->sc_dkdev);
@@ -706,17 +715,15 @@ vndthread(void *arg)
 		else
 			handle_with_rdwr(vnd, obp, bp);
 
-		s = splbio();
+		mutex_enter(&vnd->sc_sclock);
 		continue;
 
 done:
 		biodone(obp);
-		s = splbio();
+		mutex_enter(&vnd->sc_sclock);
 	}
 
-	vnd->sc_flags &= (~VNF_KTHREAD | VNF_VUNCONF);
-	wakeup(&vnd->sc_kthread);
-	splx(s);
+	mutex_exit(&vnd->sc_sclock);
 	kthread_exit(0);
 }
 
@@ -794,7 +801,7 @@ handle_with_rdwr(struct vnd_softc *vnd, 
 }
 
 /*
- * Handes the read/write request given in 'bp' using the vnode's VOP_BMAP
+ * Handles the read/write request given in 'bp' using the vnode's VOP_BMAP
  * and VOP_STRATEGY operations.
  *
  * 'obp' is a pointer to the original request fed to the vnd device.
@@ -917,7 +924,8 @@ vndiodone(struct buf *bp)
 	struct vndxfer *vnx = VND_BUFTOXFER(bp);
 	struct vnd_softc *vnd = vnx->vx_vnd;
 	struct buf *obp = bp->b_private;
-	int s = splbio();
+
+	mutex_enter(&vnd->sc_sclock);
 
 	KASSERT(&vnx->vx_buf == bp);
 	KASSERT(vnd->sc_active > 0);
@@ -931,9 +939,9 @@ vndiodone(struct buf *bp)
 	    (bp->b_flags & B_READ));
 	vnd->sc_active--;
 	if (vnd->sc_active == 0) {
-		wakeup(&vnd->sc_tab);
+		cv_broadcast(&vnd->sc_activitycv);
 	}
-	splx(s);
+	mutex_exit(&vnd->sc_sclock);
 	obp->b_error = bp->b_error;
 	obp->b_resid = bp->b_resid;
 	buf_destroy(bp);
@@ -1163,7 +1171,7 @@ vndioctl(dev_t dev, u_long cmd, void *da
 		}
 		KASSERT(l);
 		error = VOP_GETATTR(nd.ni_vp, &vattr, l->l_cred);
-		if (!error && nd.ni_vp->v_type != VREG)
+		if (!error && vattr.va_type != VREG)
 			error = EOPNOTSUPP;
 		if (!error && vattr.va_bytes < vattr.va_size)
 			/* File is definitely sparse, use vn_rdwr() */
@@ -1359,13 +1367,14 @@ vndioctl(dev_t dev, u_long cmd, void *da
 			vio->vnd_size = dbtob(vnd->sc_size);
 		vnd->sc_flags |= VNF_INITED;
 
-		/* create the kernel thread, wait for it to be up */
-		error = kthread_create(PRI_NONE, 0, NULL, vndthread, vnd,
-		    &vnd->sc_kthread, "%s", device_xname(vnd->sc_dev));
-		if (error)
+		vnd->sc_flags |= VNF_KTHREAD;
+		error = kthread_create(PRI_BIO,
+		    KTHREAD_MPSAFE | KTHREAD_MUSTJOIN, NULL,
+		    vndthread, vnd, &vnd->sc_kthread, "%s",
+		    device_xname(vnd->sc_dev));
+		if (error) {
+			vnd->sc_flags &= ~VNF_KTHREAD;
 			goto close_and_exit;
-		while ((vnd->sc_flags & VNF_KTHREAD) == 0) {
-			tsleep(&vnd->sc_kthread, PRIBIO, "vndthr", 0);
 		}
 #ifdef DEBUG
 		if (vnddebug & VDB_INIT)
@@ -1657,10 +1666,10 @@ vndshutdown(void)
 static void
 vndclear(struct vnd_softc *vnd, int myminor)
 {
+	struct lwp *kt;
 	struct vnode *vp = vnd->sc_vp;
 	int fflags = FREAD;
 	int bmaj, cmaj, i, mn;
-	int s;
 
 #ifdef DEBUG
 	if (vnddebug & VDB_FOLLOW)
@@ -1678,17 +1687,25 @@ vndclear(struct vnd_softc *vnd, int mymi
 			vdevgone(cmaj, mn, mn, VCHR);
 	}
 
+	mutex_enter(&vnd->sc_sclock);
+
 	if ((vnd->sc_flags & VNF_READONLY) == 0)
 		fflags |= FWRITE;
 
-	s = splbio();
 	bufq_drain(vnd->sc_tab);
-	splx(s);
+	cv_broadcast(&vnd->sc_bufqcv);
 
-	vnd->sc_flags |= VNF_VUNCONF;
-	wakeup(&vnd->sc_tab);
-	while (vnd->sc_flags & VNF_KTHREAD)
-		tsleep(&vnd->sc_kthread, PRIBIO, "vnthr", 0);
+	mutex_exit(&vnd->sc_sclock);
+
+	if (vnd->sc_flags & VNF_KTHREAD) {
+		mutex_enter(&vnd->sc_sclock);
+		vnd->sc_flags &= ~VNF_KTHREAD;
+		kt = vnd->sc_kthread;
+		cv_signal(&vnd->sc_bufqcv);
+		cv_signal(&vnd->sc_activitycv);
+		mutex_exit(&vnd->sc_sclock);
+		kthread_join(kt);
+	}
 
 #ifdef VND_COMPRESSION
 	/* free the compressed file buffers */
@@ -1709,7 +1726,7 @@ vndclear(struct vnd_softc *vnd, int mymi
 #endif /* VND_COMPRESSION */
 	vnd->sc_flags &=
 	    ~(VNF_INITED | VNF_READONLY | VNF_KLABEL | VNF_VLABEL
-	      | VNF_VUNCONF | VNF_COMP | VNF_CLEARING);
+	      | VNF_COMP | VNF_CLEARING);
 	if (vp == NULL)
 		panic("vndclear: null vp");
 	(void) vn_close(vp, fflags, vnd->sc_cred);
@@ -1802,6 +1819,46 @@ vndgetdefaultlabel(struct vnd_softc *sc,
 }
 
 /*
+ * Wait interruptibly for an exclusive lock.
+ *
+ * XXX
+ * Several drivers do this; it should be abstracted and made MP-safe.
+ */
+static int
+vndlock(struct vnd_softc *sc)
+{
+	int error;
+
+	mutex_enter(&sc->sc_sclock);
+
+	error = 0;
+	while ((sc->sc_flags & VNF_LOCKED) != 0) {
+		error = cv_wait_sig(&sc->sc_intercv, &sc->sc_sclock);
+		if (error != 0)
+			break;
+	}
+	if (error == 0)
+		sc->sc_flags |= VNF_LOCKED;
+
+	mutex_exit(&sc->sc_sclock);
+
+	return error;
+}
+
+/*
+ * Unlock and wake up any waiters.
+ */
+static void
+vndunlock(struct vnd_softc *sc)
+{
+
+	mutex_enter(&sc->sc_sclock);
+	sc->sc_flags &= ~VNF_LOCKED;
+	cv_signal(&sc->sc_intercv);
+	mutex_exit(&sc->sc_sclock);
+}
+
+/*
  * Read the disklabel from a vnd.  If one is not present, create a fake one.
  */
 static void
@@ -1853,40 +1910,6 @@ vndgetdisklabel(dev_t dev, struct vnd_so
 	}
 }
 
-/*
- * Wait interruptibly for an exclusive lock.
- *
- * XXX
- * Several drivers do this; it should be abstracted and made MP-safe.
- */
-static int
-vndlock(struct vnd_softc *sc)
-{
-	int error;
-
-	while ((sc->sc_flags & VNF_LOCKED) != 0) {
-		sc->sc_flags |= VNF_WANTED;
-		if ((error = tsleep(sc, PRIBIO | PCATCH, "vndlck", 0)) != 0)
-			return error;
-	}
-	sc->sc_flags |= VNF_LOCKED;
-	return 0;
-}
-
-/*
- * Unlock and wake up any waiters.
- */
-static void
-vndunlock(struct vnd_softc *sc)
-{
-
-	sc->sc_flags &= ~VNF_LOCKED;
-	if ((sc->sc_flags & VNF_WANTED) != 0) {
-		sc->sc_flags &= ~VNF_WANTED;
-		wakeup(sc);
-	}
-}
-
 #ifdef VND_COMPRESSION
 /* compressed file read */
 static void
@@ -2051,7 +2074,7 @@ vnd_modcmd(modcmd_t cmd, void *arg)
 		error = config_cfattach_attach(vnd_cd.cd_name, &vnd_ca);
 	        if (error) {
 			config_cfdriver_detach(&vnd_cd);
-			aprint_error("%s: unable to register cfattach\n",
+			aprint_error("%s: unable to register\n",
 			    vnd_cd.cd_name);
 			break;
 		}


Home | Main Index | Thread Index | Old Index