tech-kern: pr/35143 and layer_node

Subject: pr/35143 and layer_node_find()
To: None <tech-kern@netbsd.org>
From: Bill Studenmund <wrstuden@netbsd.org>
List: tech-kern
Date: 11/28/2006 12:47:32
--6e7ZaeXHKrTJCxdu
Content-Type: multipart/mixed; boundary="LTeJQqWS0MN7I/qa"
Content-Disposition: inline


--LTeJQqWS0MN7I/qa
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
Content-Transfer-Encoding: quoted-printable

Darrin Jewell and ad@ recently looked into a locking issue seen once on=20
ftp.n.o. I filed PR 35143 based on his investigation. I think I know=20
what's wrong, though I'm not 100% sure how it happened.

The problem is that we started vcleaning a vnode when the vnode stack is=20
in use. This behavior is something that will cause problems, and is why=20
there's an explicit check in getcleanvnode() to avoid locked layered=20
vnodes.

Somehow or other the test wasn't good enough. I'm attaching a patch which=
=20
I hope will fix things enough that we can eventually remove the test.

The deadlock issue is that vclean() sets VXLOCK then waits to get the=20
vnode lock, while layer_node_find() is called with the vnode stack locked=
=20
and calls vget(), which will stall on VXLOCK.

The patch does two things. The main thing it does is change the vget() in=
=20
layer_node_find() to a) not perform a lockmgr() op (I didn't know you=20
could do this, but it's exactly what I want) and b) set LK_NOWAIT. This=20
way, we either pull the vnode off the free list or we get an error=20
indicating that the node's being recycled.

The other thing the patch does is add layer_node_find1(), which now
contains all the code. The caller must hold lmp->layerm_hashlock
explicitly. The main reason for this is so layer_node_alloc() can hold it
the whole time. This way we close a window in which, if the locking
protocol were ever changed, two different upper nodes could be added (in
the same layer) above one given lower node. Since we want to change the=20
locking protocol in the future, I want to make this change. Also, it makes=
=20
some of the cleaning case issues clear in my head. :-)

Take care,

Bill

--LTeJQqWS0MN7I/qa
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="pr-35143.diff"
Content-Transfer-Encoding: quoted-printable

Index: miscfs/genfs/layer_subr.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: /cvsroot/src/sys/miscfs/genfs/layer_subr.c,v
retrieving revision 1.20
diff -u -p -r1.20 layer_subr.c
--- miscfs/genfs/layer_subr.c	25 Nov 2006 22:14:38 -0000	1.20
+++ miscfs/genfs/layer_subr.c	28 Nov 2006 19:22:16 -0000
@@ -88,6 +88,8 @@ __KERNEL_RCSID(0, "$NetBSD: layer_subr.c
 int layerfs_debug =3D 1;
 #endif
=20
+static struct vnode *layer_node_find1(mp, lowervp);
+
 /*
  * layer cache:
  * Each cache entry holds a reference to the lower vnode
@@ -122,11 +124,29 @@ layerfs_done()
=20
 /*
  * Return a locked, VREF'ed alias for lower vnode if already exists, else =
0.
+ *
+ * Lock wrapper around layer_node_find1() for lmp->layerm_hashlock.
  */
 struct vnode *
-layer_node_find(mp, lowervp)
-	struct mount *mp;
-	struct vnode *lowervp;
+layer_node_find(struct mount *mp, struct vnode *lowervp)
+{
+	struct vnode *rv;
+	struct layer_mount *lmp =3D MOUNTTOLAYERMOUNT(mp);
+
+	simple_lock(&lmp->layerm_hashlock);
+	rv =3D layer_node_find1(mp, lowervp);
+	simple_unlock(&lmp->layerm_hashlock);
+
+	return rv;
+}
+
+/*
+ * Return a locked, VREF'ed alias for lower vnode if already exists, else =
0.
+ *
+ * Caller must hold lmp->layerm_hashlock.
+ */
+static struct vnode *
+layer_node_find1(struct mount *mp, struct vnode *lowervp)
 {
 	struct layer_mount *lmp =3D MOUNTTOLAYERMOUNT(mp);
 	struct layer_node_hashhead *hd;
@@ -141,37 +161,43 @@ layer_node_find(mp, lowervp)
 	 * and return the vnode locked.
 	 */
 	hd =3D LAYER_NHASH(lmp, lowervp);
-loop:
-	simple_lock(&lmp->layerm_hashlock);
 	for (a =3D hd->lh_first; a !=3D 0; a =3D a->layer_hash.le_next) {
 		if (a->layer_lowervp =3D=3D lowervp && LAYERTOV(a)->v_mount =3D=3D mp) {
 			vp =3D LAYERTOV(a);
-			simple_unlock(&lmp->layerm_hashlock);
 			/*
 			 * We must be careful here as the fact the lower
 			 * vnode is locked will imply vp is locked unless
 			 * someone has decided to start vclean'ing either
-			 * vp or lowervp.
+			 * vp or lowervp. Given that lowervp was successfully
+			 * locked before getting to this call, lowervp will
+			 * be stable in face of vclean.
 			 *
-			 * So we try for an exclusive, recursive lock
-			 * on the upper vnode. If it fails, vcleaning
-			 * is in progress (so when we try again, we'll
-			 * fail). If it succeeds, we now have double
-			 * locked the bottom node. So we do an explicit
-			 * VOP_UNLOCK on it to keep the counts right. Note
-			 * that we will end up with the upper node and
-			 * the lower node locked once.
+			 * We don't use a LK_TYPE_MASK op, so vget()
+			 * will only perform its operations (pulling
+			 * off the free list, noticing VXLOCK). This
+			 * behavior is good as we already hold the lock
+			 * on the stack.
 			 */
-			if (vget(vp, LK_EXCLUSIVE | LK_CANRECURSE)) {
-				printf ("layer_node_find: vget failed.\n");
-				goto loop;
+			if (vget(vp, LK_NOWAIT)) {
+				/*
+				 * ok, we had a vnode above lowervp, but
+				 * it's being recycled. So skip this one.
+				 * We will need to allocate a new one.
+				 * Keep checking just in case something odd
+				 * happened and there is a real one
+				 * on the list after this one. Should never
+				 * happen, but it's better to handle
+				 * the odd case this way than to ever
+				 * actually get two layer nodes for the same
+				 * lower node.
+				 */
+				printf ("layer_node_find1: vget failed.\n");
+				continue;
 			};
-			VOP_UNLOCK(lowervp, 0);
 			return (vp);
 		}
 	}
=20
-	simple_unlock(&lmp->layerm_hashlock);
 	return NULL;
 }
=20
@@ -217,10 +243,12 @@ layer_node_alloc(mp, lowervp, vpp)
 	 * check to see if someone else has beaten us to it.
 	 * (We could have slept in MALLOC.)
 	 */
-	if ((nvp =3D layer_node_find(mp, lowervp)) !=3D NULL) {
+	simple_lock(&lmp->layerm_hashlock);
+	if ((nvp =3D layer_node_find1(mp, lowervp)) !=3D NULL) {
 		*vpp =3D nvp;
=20
 		/* free the substructures we've allocated. */
+		simple_unlock(&lmp->layerm_hashlock);
 		FREE(xp, M_TEMP);
 		if (vp->v_type =3D=3D VBLK || vp->v_type =3D=3D VCHR)
 			FREE(vp->v_specinfo, M_VNODE);
@@ -231,8 +259,6 @@ layer_node_alloc(mp, lowervp, vpp)
 		return (0);
 	}
=20
-	simple_lock(&lmp->layerm_hashlock);
-
 	/*
 	 * Now lock the new node. We rely on the fact that we were passed
 	 * a locked vnode. If the lower node is exporting a struct lock
@@ -300,7 +326,9 @@ layer_node_create(mp, lowervp, newvpp)
 	struct vnode *aliasvp;
 	struct layer_mount *lmp =3D MOUNTTOLAYERMOUNT(mp);
=20
-	if ((aliasvp =3D layer_node_find(mp, lowervp)) !=3D NULL) {
+	simple_lock(&lmp->layerm_hashlock);
+	if ((aliasvp =3D layer_node_find1(mp, lowervp)) !=3D NULL) {
+		simple_unlock(&lmp->layerm_hashlock);
 		/*
 		 * layer_node_find has taken another reference
 		 * to the alias vnode and moved the lock holding to
@@ -313,6 +341,8 @@ layer_node_create(mp, lowervp, newvpp)
 	} else {
 		int error;
=20
+		simple_unlock(&lmp->layerm_hashlock);
+
 		/*
 		 * Get new vnode.
 		 */

--LTeJQqWS0MN7I/qa--

--6e7ZaeXHKrTJCxdu
Content-Type: application/pgp-signature
Content-Disposition: inline

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.3 (NetBSD)

iD8DBQFFbKBkWz+3JHUci9cRAtjzAJ9aK/kHanuxVvRnq9l2hqk0wj7UuACfQWgQ
xlykdDp8pLzu8QmY0I8YWAQ=
=En7+
-----END PGP SIGNATURE-----

--6e7ZaeXHKrTJCxdu--