tech-kern archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

Re: vnd locking



On Mon, Jun 01, 2015 at 01:11:44PM +0100, Patrick Welche wrote:
> On Fri, May 22, 2015 at 04:56:57PM +0000, Taylor R Campbell wrote:
> >    Date: Fri, 22 May 2015 13:09:58 +0100
> >    From: Patrick Welche <prlw1%cam.ac.uk@localhost>
> > 
> >    There are a load of locking PRs involving vnd. I thought I would have a
> >    stab at converting vnd to use condvars and mutexes (mutices?) as a first
> >    step. I cobbled something together which allows me to
> >    [...]
> >    with a LOCKDEBUG kernel, so I may have guessed right, but I haven't found
> >    any documentation on The Right Way, so please review carefully, assume
> >    the worst, and tell me how to do it better!
> 
> I may have regressed: with the attached patch, I now get a locking
> error in vndthread at:
> 
>         /*
>          * Allocate a header for this transfer and link it to the
>          * buffer
>          */
>         mutex_enter(&vnd->sc_sclock); <------------ XXX already locked?!
>         vnx = VND_GETXFER(vnd); /* may sleep */
>         mutex_exit(&vnd->sc_sclock);

Simply VND_GETXFER(vnd) is pool_get(&(vnd)->sc_vxpool, PR_WAITOK), and

                pool_init(&vnd->sc_vxpool, sizeof(struct vndxfer), 0,
                    0, 0, "vndxpl", NULL, IPL_BIO);

so I think the splbio replacement mutex_enter/exit is simply not
necessary. Removing them gets me a successful

  mkdir /mnt0 /mnt1 /mnt2 /mnt3
  dd if=/dev/zero of=/scratch/disk0.img bs=1m count=1100
  dd if=/dev/zero of=/scratch/disk1.img bs=1m count=1100
  dd if=/dev/zero of=/scratch2/disk2.img bs=1m count=1100
  dd if=/dev/zero of=/scratch3/disk3.img bs=1m count=1100
  vnconfig vnd0 /scratch/disk0.img
  vnconfig vnd1 /scratch/disk1.img
  vnconfig vnd2 /scratch2/disk2.img
  vnconfig vnd3 /scratch3/disk3.img
  for i in 0 1 2 3; do
	  disklabel -rw vnd$i vndtest;
	  newfs /dev/rvnd${i}a
	  mount /dev/vnd${i}a /mnt${i}
  done
  for i in 0 1 2 3; do
	  bonnie -d /mnt${i}&
  done

with a LOCKDEBUG kernel on a quad core computer, with scratch, scratch2
and scratch3 being on separate physical disks.

Cheers,

Patrick
Index: vndvar.h
===================================================================
RCS file: /cvsroot/src/sys/dev/vndvar.h,v
retrieving revision 1.34
diff -u -p -r1.34 vndvar.h
--- vndvar.h	25 May 2015 20:57:18 -0000	1.34
+++ vndvar.h	23 Jun 2015 17:16:50 -0000
@@ -107,14 +107,19 @@ struct vnode;
  */
 struct vnd_softc {
 	device_t	 sc_dev;
+	kmutex_t	 sc_sclock;	/* protect this softc */
 	int		 sc_flags;	/* flags */
+	kcondvar_t	 sc_intercv;	/* interruptible lock condvar */
 	uint64_t	 sc_size;	/* size of vnd */
 	struct vnode	*sc_vp;		/* vnode */
 	kauth_cred_t	 sc_cred;	/* credentials */
-	int		 sc_maxactive;	/* max # of active requests */
 	struct bufq_state *sc_tab;	/* transfer queue */
+	kcondvar_t	 sc_bufqcv;	/* queue's condvar */
 	int		 sc_pending;	/* number of pending transfers */
+	kcondvar_t	 sc_pendingcv;	/* pending transfer condvar */
+	int		 sc_maxactive;	/* max # of active requests */
 	int		 sc_active;	/* number of active transfers */
+	kcondvar_t	 sc_activitycv;	/* activity condvar */
 	struct disk	 sc_dkdev;	/* generic disk device info */
 	struct vndgeom	 sc_geom;	/* virtual geometry */
 	struct pool	 sc_vxpool;	/* vndxfer pool */
@@ -140,7 +145,6 @@ struct vnd_softc {
 #define	VNF_KLABEL	0x040	/* keep label on close */
 #define	VNF_VLABEL	0x080	/* label is valid */
 #define	VNF_KTHREAD	0x100	/* thread is running */
-#define	VNF_VUNCONF	0x200	/* device is unconfiguring */
 #define VNF_COMP	0x400	/* file is compressed */
 #define VNF_CLEARING	0x800	/* unit is being torn down */
 #define VNF_USE_VN_RDWR	0x1000	/* have to use vn_rdwr() */
Index: vnd.c
===================================================================
RCS file: /cvsroot/src/sys/dev/vnd.c,v
retrieving revision 1.245
diff -u -p -r1.245 vnd.c
--- vnd.c	25 May 2015 21:02:37 -0000	1.245
+++ vnd.c	23 Jun 2015 17:16:51 -0000
@@ -204,7 +204,7 @@ const struct bdevsw vnd_bdevsw = {
 	.d_dump = vnddump,
 	.d_psize = vndsize,
 	.d_discard = nodiscard,
-	.d_flag = D_DISK
+	.d_flag = D_DISK | D_MPSAFE
 };
 
 const struct cdevsw vnd_cdevsw = {
@@ -219,7 +219,7 @@ const struct cdevsw vnd_cdevsw = {
 	.d_mmap = nommap,
 	.d_kqfilter = nokqfilter,
 	.d_discard = nodiscard,
-	.d_flag = D_DISK
+	.d_flag = D_DISK | D_MPSAFE
 };
 
 static int	vnd_match(device_t, cfdata_t, void *);
@@ -241,12 +241,9 @@ static struct	dkdriver vnddkdriver = {
 void
 vndattach(int num)
 {
-	int error;
 
-	error = config_cfattach_attach(vnd_cd.cd_name, &vnd_ca);
-	if (error)
-		aprint_error("%s: unable to register cfattach\n",
-		    vnd_cd.cd_name);
+	if (config_cfattach_attach(vnd_cd.cd_name, &vnd_ca))
+		aprint_error("%s: unable to register\n", vnd_cd.cd_name);
 }
 
 static int
@@ -265,6 +262,11 @@ vnd_attach(device_t parent, device_t sel
 	sc->sc_comp_offsets = NULL;
 	sc->sc_comp_buff = NULL;
 	sc->sc_comp_decombuf = NULL;
+	mutex_init(&sc->sc_sclock, MUTEX_DEFAULT, IPL_BIO);
+	cv_init(&sc->sc_intercv, "vndlck");
+	cv_init(&sc->sc_activitycv, "vndac");
+	cv_init(&sc->sc_bufqcv, "vndbp");
+	cv_init(&sc->sc_pendingcv, "vndpc");
 	bufq_alloc(&sc->sc_tab, "disksort", BUFQ_SORT_RAWBLOCK);
 	disk_init(&sc->sc_dkdev, device_xname(self), &vnddkdriver);
 	if (!pmf_device_register(self, NULL, NULL))
@@ -284,6 +286,11 @@ vnd_detach(device_t self, int flags)
 	}
 
 	pmf_device_deregister(self);
+	mutex_destroy(&sc->sc_sclock);
+	cv_destroy(&sc->sc_intercv);
+	cv_destroy(&sc->sc_activitycv);
+	cv_destroy(&sc->sc_bufqcv);
+	cv_destroy(&sc->sc_pendingcv);
 	bufq_free(sc->sc_tab);
 	disk_destroy(&sc->sc_dkdev);
 
@@ -470,7 +477,8 @@ vndstrategy(struct buf *bp)
 	    device_lookup_private(&vnd_cd, unit);
 	struct disklabel *lp;
 	daddr_t blkno;
-	int s = splbio();
+
+	mutex_enter(&vnd->sc_sclock);
 
 	if (vnd == NULL) {
 		bp->b_error = ENXIO;
@@ -545,18 +553,20 @@ vndstrategy(struct buf *bp)
 		KASSERT(vnd->sc_pending >= 0 &&
 		    vnd->sc_pending <= VND_MAXPENDING(vnd));
 		while (vnd->sc_pending == VND_MAXPENDING(vnd))
-			tsleep(&vnd->sc_pending, PRIBIO, "vndpc", 0);
+			cv_wait(&vnd->sc_pendingcv, &vnd->sc_sclock);
 		vnd->sc_pending++;
 	}
+	mutex_exit(&vnd->sc_sclock);
 	bufq_put(vnd->sc_tab, bp);
-	wakeup(&vnd->sc_tab);
-	splx(s);
+	mutex_enter(&vnd->sc_sclock);
+	cv_signal(&vnd->sc_bufqcv);
+	mutex_exit(&vnd->sc_sclock);
 	return;
 
 done:
+	mutex_exit(&vnd->sc_sclock);
 	bp->b_resid = bp->b_bcount;
 	biodone(bp);
-	splx(s);
 }
 
 static bool
@@ -601,7 +611,8 @@ static void
 vndthread(void *arg)
 {
 	struct vnd_softc *vnd = arg;
-	int s;
+
+	mutex_enter(&vnd->sc_sclock);
 
 	/* Determine whether we can *use* VOP_BMAP and VOP_STRATEGY to
 	 * directly access the backing vnode.  If we can, use these two
@@ -620,31 +631,27 @@ vndthread(void *arg)
 		    "using read/write operations");
 #endif
 
-	s = splbio();
-	vnd->sc_flags |= VNF_KTHREAD;
-	wakeup(&vnd->sc_kthread);
-
 	/*
 	 * Dequeue requests and serve them depending on the available
 	 * vnode operations.
 	 */
-	while ((vnd->sc_flags & VNF_VUNCONF) == 0) {
+	while ((vnd->sc_flags & VNF_KTHREAD) != 0) {
 		struct vndxfer *vnx;
 		struct buf *obp;
 		struct buf *bp;
 
 		obp = bufq_get(vnd->sc_tab);
 		if (obp == NULL) {
-			tsleep(&vnd->sc_tab, PRIBIO, "vndbp", 0);
+			cv_wait(&vnd->sc_bufqcv, &vnd->sc_sclock);
 			continue;
 		};
 		if ((vnd->sc_flags & VNF_USE_VN_RDWR)) {
 			KASSERT(vnd->sc_pending > 0 &&
 			    vnd->sc_pending <= VND_MAXPENDING(vnd));
 			if (vnd->sc_pending-- == VND_MAXPENDING(vnd))
-				wakeup(&vnd->sc_pending);
+				cv_signal(&vnd->sc_pendingcv);
 		}
-		splx(s);
+		mutex_exit(&vnd->sc_sclock);
 #ifdef DEBUG
 		if (vnddebug & VDB_FOLLOW)
 			printf("vndthread(%p)\n", obp);
@@ -672,17 +679,17 @@ vndthread(void *arg)
 		 * Allocate a header for this transfer and link it to the
 		 * buffer
 		 */
-		s = splbio();
-		vnx = VND_GETXFER(vnd);
-		splx(s);
+		/* mutex_enter(&vnd->sc_sclock); */
+		vnx = VND_GETXFER(vnd); /* may sleep */
+		/* mutex_exit(&vnd->sc_sclock); */
 		vnx->vx_vnd = vnd;
 
-		s = splbio();
+		mutex_enter(&vnd->sc_sclock);
 		while (vnd->sc_active >= vnd->sc_maxactive) {
-			tsleep(&vnd->sc_tab, PRIBIO, "vndac", 0);
+			cv_wait(&vnd->sc_activitycv, &vnd->sc_sclock);
 		}
 		vnd->sc_active++;
-		splx(s);
+		mutex_exit(&vnd->sc_sclock);
 
 		/* Instrumentation. */
 		disk_busy(&vnd->sc_dkdev);
@@ -706,17 +713,15 @@ vndthread(void *arg)
 		else
 			handle_with_rdwr(vnd, obp, bp);
 
-		s = splbio();
+		mutex_enter(&vnd->sc_sclock);
 		continue;
 
 done:
 		biodone(obp);
-		s = splbio();
+		mutex_enter(&vnd->sc_sclock);
 	}
 
-	vnd->sc_flags &= (~VNF_KTHREAD | VNF_VUNCONF);
-	wakeup(&vnd->sc_kthread);
-	splx(s);
+	mutex_exit(&vnd->sc_sclock);
 	kthread_exit(0);
 }
 
@@ -794,7 +799,7 @@ handle_with_rdwr(struct vnd_softc *vnd, 
 }
 
 /*
- * Handes the read/write request given in 'bp' using the vnode's VOP_BMAP
+ * Handles the read/write request given in 'bp' using the vnode's VOP_BMAP
  * and VOP_STRATEGY operations.
  *
  * 'obp' is a pointer to the original request fed to the vnd device.
@@ -917,7 +922,8 @@ vndiodone(struct buf *bp)
 	struct vndxfer *vnx = VND_BUFTOXFER(bp);
 	struct vnd_softc *vnd = vnx->vx_vnd;
 	struct buf *obp = bp->b_private;
-	int s = splbio();
+
+	mutex_enter(&vnd->sc_sclock);
 
 	KASSERT(&vnx->vx_buf == bp);
 	KASSERT(vnd->sc_active > 0);
@@ -931,9 +937,9 @@ vndiodone(struct buf *bp)
 	    (bp->b_flags & B_READ));
 	vnd->sc_active--;
 	if (vnd->sc_active == 0) {
-		wakeup(&vnd->sc_tab);
+		cv_broadcast(&vnd->sc_activitycv);
 	}
-	splx(s);
+	mutex_exit(&vnd->sc_sclock);
 	obp->b_error = bp->b_error;
 	obp->b_resid = bp->b_resid;
 	buf_destroy(bp);
@@ -1163,7 +1169,7 @@ vndioctl(dev_t dev, u_long cmd, void *da
 		}
 		KASSERT(l);
 		error = VOP_GETATTR(nd.ni_vp, &vattr, l->l_cred);
-		if (!error && nd.ni_vp->v_type != VREG)
+		if (!error && vattr.va_type != VREG)
 			error = EOPNOTSUPP;
 		if (!error && vattr.va_bytes < vattr.va_size)
 			/* File is definitely sparse, use vn_rdwr() */
@@ -1359,13 +1365,14 @@ vndioctl(dev_t dev, u_long cmd, void *da
 			vio->vnd_size = dbtob(vnd->sc_size);
 		vnd->sc_flags |= VNF_INITED;
 
-		/* create the kernel thread, wait for it to be up */
-		error = kthread_create(PRI_NONE, 0, NULL, vndthread, vnd,
-		    &vnd->sc_kthread, "%s", device_xname(vnd->sc_dev));
-		if (error)
+		vnd->sc_flags |= VNF_KTHREAD;
+		error = kthread_create(PRI_BIO,
+		    KTHREAD_MPSAFE | KTHREAD_MUSTJOIN, NULL,
+		    vndthread, vnd, &vnd->sc_kthread, "%s",
+		    device_xname(vnd->sc_dev));
+		if (error) {
+			vnd->sc_flags &= ~VNF_KTHREAD;
 			goto close_and_exit;
-		while ((vnd->sc_flags & VNF_KTHREAD) == 0) {
-			tsleep(&vnd->sc_kthread, PRIBIO, "vndthr", 0);
 		}
 #ifdef DEBUG
 		if (vnddebug & VDB_INIT)
@@ -1657,10 +1664,10 @@ vndshutdown(void)
 static void
 vndclear(struct vnd_softc *vnd, int myminor)
 {
+	struct lwp *kt;
 	struct vnode *vp = vnd->sc_vp;
 	int fflags = FREAD;
 	int bmaj, cmaj, i, mn;
-	int s;
 
 #ifdef DEBUG
 	if (vnddebug & VDB_FOLLOW)
@@ -1678,17 +1685,25 @@ vndclear(struct vnd_softc *vnd, int mymi
 			vdevgone(cmaj, mn, mn, VCHR);
 	}
 
+	mutex_enter(&vnd->sc_sclock);
+
 	if ((vnd->sc_flags & VNF_READONLY) == 0)
 		fflags |= FWRITE;
 
-	s = splbio();
 	bufq_drain(vnd->sc_tab);
-	splx(s);
+	cv_broadcast(&vnd->sc_bufqcv);
+
+	mutex_exit(&vnd->sc_sclock);
 
-	vnd->sc_flags |= VNF_VUNCONF;
-	wakeup(&vnd->sc_tab);
-	while (vnd->sc_flags & VNF_KTHREAD)
-		tsleep(&vnd->sc_kthread, PRIBIO, "vnthr", 0);
+	if (vnd->sc_flags & VNF_KTHREAD) {
+		mutex_enter(&vnd->sc_sclock);
+		vnd->sc_flags &= ~VNF_KTHREAD;
+		kt = vnd->sc_kthread;
+		cv_signal(&vnd->sc_bufqcv);
+		cv_signal(&vnd->sc_activitycv);
+		mutex_exit(&vnd->sc_sclock);
+		kthread_join(kt);
+	}
 
 #ifdef VND_COMPRESSION
 	/* free the compressed file buffers */
@@ -1709,7 +1724,7 @@ vndclear(struct vnd_softc *vnd, int mymi
 #endif /* VND_COMPRESSION */
 	vnd->sc_flags &=
 	    ~(VNF_INITED | VNF_READONLY | VNF_KLABEL | VNF_VLABEL
-	      | VNF_VUNCONF | VNF_COMP | VNF_CLEARING);
+	      | VNF_COMP | VNF_CLEARING);
 	if (vp == NULL)
 		panic("vndclear: null vp");
 	(void) vn_close(vp, fflags, vnd->sc_cred);
@@ -1802,6 +1817,50 @@ vndgetdefaultlabel(struct vnd_softc *sc,
 }
 
 /*
+ * Wait interruptibly for an exclusive lock.
+ *
+ * XXX
+ * Several drivers do this; it should be abstracted and made MP-safe.
+ */
+static int
+vndlock(struct vnd_softc *sc)
+{
+	int error;
+
+	mutex_enter(&sc->sc_sclock);
+
+	error = 0;
+	while ((sc->sc_flags & VNF_LOCKED) != 0) {
+		sc->sc_flags |= VNF_WANTED;
+		error = cv_wait_sig(&sc->sc_intercv, &sc->sc_sclock);
+		if (error != 0)
+			break;
+	}
+	if (error == 0)
+		sc->sc_flags |= VNF_LOCKED;
+
+	mutex_exit(&sc->sc_sclock);
+
+	return error;
+}
+
+/*
+ * Unlock and wake up any waiters.
+ */
+static void
+vndunlock(struct vnd_softc *sc)
+{
+
+	mutex_enter(&sc->sc_sclock);
+	sc->sc_flags &= ~VNF_LOCKED;
+	if ((sc->sc_flags & VNF_WANTED) != 0) {
+		sc->sc_flags &= ~VNF_WANTED;
+		cv_broadcast(&sc->sc_intercv);
+	}
+	mutex_exit(&sc->sc_sclock);
+}
+
+/*
  * Read the disklabel from a vnd.  If one is not present, create a fake one.
  */
 static void
@@ -1853,40 +1912,6 @@ vndgetdisklabel(dev_t dev, struct vnd_so
 	}
 }
 
-/*
- * Wait interruptibly for an exclusive lock.
- *
- * XXX
- * Several drivers do this; it should be abstracted and made MP-safe.
- */
-static int
-vndlock(struct vnd_softc *sc)
-{
-	int error;
-
-	while ((sc->sc_flags & VNF_LOCKED) != 0) {
-		sc->sc_flags |= VNF_WANTED;
-		if ((error = tsleep(sc, PRIBIO | PCATCH, "vndlck", 0)) != 0)
-			return error;
-	}
-	sc->sc_flags |= VNF_LOCKED;
-	return 0;
-}
-
-/*
- * Unlock and wake up any waiters.
- */
-static void
-vndunlock(struct vnd_softc *sc)
-{
-
-	sc->sc_flags &= ~VNF_LOCKED;
-	if ((sc->sc_flags & VNF_WANTED) != 0) {
-		sc->sc_flags &= ~VNF_WANTED;
-		wakeup(sc);
-	}
-}
-
 #ifdef VND_COMPRESSION
 /* compressed file read */
 static void
@@ -2051,7 +2076,7 @@ vnd_modcmd(modcmd_t cmd, void *arg)
 		error = config_cfattach_attach(vnd_cd.cd_name, &vnd_ca);
 	        if (error) {
 			config_cfdriver_detach(&vnd_cd);
-			aprint_error("%s: unable to register cfattach\n",
+			aprint_error("%s: unable to register\n",
 			    vnd_cd.cd_name);
 			break;
 		}


Home | Main Index | Thread Index | Old Index