Subject: PR 32535
To: None <tech-kern@netbsd.org>
From: Bill Studenmund <wrstuden@netbsd.org>
List: tech-kern
Date: 10/23/2006 20:29:12
--2OzUYMsT4j3Kc+NU
Content-Type: multipart/mixed; boundary="Jtds+vpI57xq70EV"
Content-Disposition: inline


--Jtds+vpI57xq70EV
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
Content-Transfer-Encoding: quoted-printable

I have a proposed fix for PR 32535, and I'd like other folks to look it=20
over.

The issue is that when we are doing a directory lookup, after we get and
lock the child vnode, we vrele() the parent.  Normally this is great. =20
If the use count goes to zero, we lock the vnode and call VOP_INACTIVE()=20
on it.

Technically locking the parent with the child locked violates the locking=
=20
hierarchy. But if nothing is using the parent, there is no issue.

With layered file systems, though, we run into a problem. We try to lock=20
the parent when the layer vnode's use count goes to zero. But something=20
can be trying to use the underlying vnode (or if there are multiple null=20
layers, using another layer will count). Specifically, something can have=
=20
locked the parrent and be looking for the child.

My proposed fix is to add vrele2(), which is a form of vrele() that is=20
careful if we need to lock the being-released vnode. We pass it the to-be-=
=20
released vnode and another vnode that we have locked. If the reference=20
count goes to zero and the vnode's lock is held, we release the lock on=20
the other vnode, lock the being-released-one, VOP_INACTIVE(), and re-lock=
=20
the other vnode.

I also moved some code around. There were places where we release both=20
nodes, so I made it so we vput() the child before vrele()'ing the parent.=
=20
We then will have no problems (and it seems silly to do the relock dance=20
if we're about to vput()).

However I'd like other folks to look at this!

Once it's all 100%, pullup requests will follow. :-)

Take care,

Bill

--Jtds+vpI57xq70EV
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="pr-32535.patch"
Content-Transfer-Encoding: quoted-printable

? cvslog
? diffie
? arch/i386/compile/GENERIC
Index: kern/vfs_lookup.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: /cvsroot/src/sys/kern/vfs_lookup.c,v
retrieving revision 1.71
diff -p -u -r1.71 vfs_lookup.c
--- kern/vfs_lookup.c	23 Jul 2006 22:06:12 -0000	1.71
+++ kern/vfs_lookup.c	23 Oct 2006 20:40:36 -0000
@@ -364,8 +364,9 @@ namei(struct nameidata *ndp)
 		}
 	}
 	PNBUF_PUT(cnp->cn_pnbuf);
-	vrele(ndp->ni_dvp);
+	/* release out of hierarchical order so we don't need vrele2() */
 	vput(ndp->ni_vp);
+	vrele(ndp->ni_dvp);
 	ndp->ni_vp =3D NULL;
 	return (error);
 }
@@ -756,7 +757,11 @@ nextname:
 	 */
 	if (!(cnp->cn_flags & ISLASTCN)) {
 		cnp->cn_nameptr =3D ndp->ni_next;
-		vrele(ndp->ni_dvp);
+		if ((error =3D vrele2(ndp->ni_dvp, dp))) {
+			/* Oops! there was an error re-locking dp! */
+			dpunlocked =3D 1;
+			goto bad;
+		}
 		goto dirloop;
 	}
=20
@@ -773,28 +778,44 @@ terminal:
 		error =3D EROFS;
 		goto bad2;
 	}
+	if ((cnp->cn_flags & LOCKLEAF) =3D=3D 0)
+		VOP_UNLOCK(dp, 0);
 	if (ndp->ni_dvp !=3D NULL) {
 		if (cnp->cn_flags & SAVESTART) {
 			ndp->ni_startdir =3D ndp->ni_dvp;
 			VREF(ndp->ni_startdir);
 		}
-		if (!wantparent)
-			vrele(ndp->ni_dvp);
+		if (!wantparent) {
+			if ((cnp->cn_flags & LOCKLEAF) =3D=3D 0)
+				vrele(ndp->ni_dvp);
+			else
+				vrele2(ndp->ni_dvp, dp);
+		}
 	}
-	if ((cnp->cn_flags & LOCKLEAF) =3D=3D 0)
-		VOP_UNLOCK(dp, 0);
 	return (0);
=20
 bad2:
 	if ((cnp->cn_flags & LOCKPARENT) && (cnp->cn_flags & ISLASTCN) &&
 			((cnp->cn_flags & PDIRUNLOCK) =3D=3D 0))
-		VOP_UNLOCK(ndp->ni_dvp, 0);
-	vrele(ndp->ni_dvp);
+		vput(ndp->ni_dvp);
+	else if (!(dpunlocked)) {
+		/*
+		 * We're about to vrele(ndp->ni_dvp) while dp is locked. This
+		 * strictly-speaking needs vrele2(). However we will then
+		 * vput(dp). So instead, do operations in the opposite order,
+		 * making vrele(ndp->ni_dvp) sufficient.
+		 */
+		vput(dp);
+		vrele(ndp->ni_dvp);
+		goto bad1;
+	} else
+		vrele(ndp->ni_dvp);
 bad:
 	if (dpunlocked)
 		vrele(dp);
 	else
 		vput(dp);
+bad1:
 	ndp->ni_vp =3D NULL;
 	return (error);
 }
@@ -909,16 +930,29 @@ relookup(struct vnode *dvp, struct vnode
 	/* ASSERT(dvp =3D=3D ndp->ni_startdir) */
 	if (cnp->cn_flags & SAVESTART)
 		VREF(dvp);
-	if (!wantparent)
-		vrele(dvp);
-	if ((cnp->cn_flags & LOCKLEAF) =3D=3D 0)
+	/*
+	 * vrele(dvp), catching the case where we want the leaf locked.
+	 * In that case, be careful about deadlock in vrele().
+	 */
+	if ((cnp->cn_flags & LOCKLEAF) =3D=3D 0) {
 		VOP_UNLOCK(dp, 0);
+		if (!wantparent)
+			vrele(dvp);
+	} else {
+		if (!wantparent)
+			return (vrele2(dvp, dp));
+	}
 	return (0);
=20
 bad2:
+	vput(dp);
+	*vpp =3D NULL;
 	if ((cnp->cn_flags & LOCKPARENT) && (cnp->cn_flags & ISLASTCN))
-		VOP_UNLOCK(dvp, 0);
-	vrele(dvp);
+		vput(dvp);
+	else
+		vrele(dvp);
+	return (error);
+
 bad:
 	vput(dp);
 	*vpp =3D NULL;
Index: kern/vfs_subr.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: /cvsroot/src/sys/kern/vfs_subr.c,v
retrieving revision 1.274
diff -p -u -r1.274 vfs_subr.c
--- kern/vfs_subr.c	22 Oct 2006 00:48:14 -0000	1.274
+++ kern/vfs_subr.c	23 Oct 2006 20:40:38 -0000
@@ -1304,6 +1304,68 @@ vrele(struct vnode *vp)
 }
=20
 /*
+ * Vnode release with another vnode locked.
+ * If count drops to zero, call inactive routine and return to freelist.
+ * Same as vrele(), except that if we have to lock the vnode for
+ * VOP_INACTIVE(), unlock ovp before sleeping for vp's lock. Return
+ * value is either 0 or the return value from re-locking ovp.
+ */
+int
+vrele2(struct vnode *vp, struct vnode *ovp)
+{
+	int r;
+	struct lwp *l =3D curlwp;		/* XXX */
+
+#ifdef DIAGNOSTIC
+	if (vp =3D=3D NULL)
+		panic("vrele: null vp");
+#endif
+	simple_lock(&vp->v_interlock);
+	vp->v_usecount--;
+	if (vp->v_usecount > 0) {
+		simple_unlock(&vp->v_interlock);
+		return 0;
+	}
+#ifdef DIAGNOSTIC
+	if (vp->v_usecount < 0 || vp->v_writecount !=3D 0) {
+		vprint("vrele: bad ref count", vp);
+		panic("vrele: ref cnt vp %p", vp);
+	}
+#endif
+	/*
+	 * Insert at tail of LRU list.
+	 */
+	simple_lock(&vnode_free_list_slock);
+	if (vp->v_holdcnt > 0)
+		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
+	else
+		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
+	simple_unlock(&vnode_free_list_slock);
+	if (vp->v_flag & VEXECMAP) {
+		uvmexp.execpages -=3D vp->v_uobj.uo_npages;
+		uvmexp.filepages +=3D vp->v_uobj.uo_npages;
+	}
+	vp->v_flag &=3D ~(VTEXT|VEXECMAP|VWRITEMAP|VMAPPED);
+	r =3D vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT);
+	if (r =3D=3D 0) {
+		/* No issues aquiring lock, so VOP_INACTIVE() and exit */
+		VOP_INACTIVE(vp, l);
+		return 0;
+	}
+	/* Skip VOP_INACTIVE() if there's an error locking. */
+	if (r !=3D EBUSY)
+		return 0;
+	/*
+	 * At this point we know there's contention for vp, so do
+	 * the unlock/lock dance for ovp.
+	 */
+	VOP_UNLOCK(ovp, 0);
+	if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK) =3D=3D 0)
+		VOP_INACTIVE(vp, l);
+	return (vn_lock(ovp, LK_EXCLUSIVE));
+}
+
+/*
  * Page or buffer structure gets a reference.
  * Called with v_interlock held.
  */
Index: sys/vnode.h
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: /cvsroot/src/sys/sys/vnode.h,v
retrieving revision 1.159
diff -p -u -r1.159 vnode.h
--- sys/vnode.h	22 Oct 2006 22:49:38 -0000	1.159
+++ sys/vnode.h	23 Oct 2006 20:40:40 -0000
@@ -545,6 +545,7 @@ void	vprint(const char *, struct vnode *
 void 	vput(struct vnode *);
 int	vrecycle(struct vnode *, struct simplelock *, struct lwp *);
 void 	vrele(struct vnode *);
+int 	vrele2(struct vnode *, struct vnode *);
 int	vtruncbuf(struct vnode *, daddr_t, int, int);
 void	vwakeup(struct buf *);
=20

--Jtds+vpI57xq70EV--

--2OzUYMsT4j3Kc+NU
Content-Type: application/pgp-signature
Content-Disposition: inline

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.3 (NetBSD)

iD8DBQFFPYiIWz+3JHUci9cRAj+DAJ0X2By3uN+6BAtsRE9a7vHKBoB07ACdEagk
PTAXlRlVhnpiiK4hxmDjHRA=
=t+UT
-----END PGP SIGNATURE-----

--2OzUYMsT4j3Kc+NU--