
Fix for PR kern/56713



When I changed vnode kqueue events to be posted from common code at the VFS layer, I introduced a regression in sending those events on stacked file systems like nullfs.  The root cause is this: when knotes are attached to a vnode, that operation (by way of the VOP bypass mechanism in layerfs) drills all the way down to the bottom of the vnode stack and attaches them there, as intended.  But with event delivery now handled in vnode_if.c, it is the upper vnode that the generated VOP wrappers work with, and no knotes are attached there, so the events are never delivered.
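
For concreteness, here is a minimal userland sketch of the failure mode, along the lines of the t_nullmnt ATF test at the end of the patch (the paths are hypothetical; assume /lower_dir is null-mounted on /upper_dir).  Before the fix, the EVFILT_VNODE registration lands on the lower vnode while the write posts on the upper one, so kevent() times out instead of returning NOTE_WRITE:

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct kevent ev;
	struct timespec to = { 5, 0 };		/* don't wait forever */
	int kq, fd, wfd;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");

	/* Watch the file through the upper (nullfs) layer. */
	if ((fd = open("/upper_dir/file", O_RDONLY)) == -1)
		err(1, "open(watch)");
	EV_SET(&ev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR, NOTE_WRITE, 0, 0);
	if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent(register)");

	/* Modify the file, also through the upper layer. */
	if ((wfd = open("/upper_dir/file", O_WRONLY | O_APPEND)) == -1)
		err(1, "open(write)");
	if (write(wfd, "x", 1) != 1)
		err(1, "write");
	close(wfd);

	if (kevent(kq, NULL, 0, &ev, 1, &to) == 1 &&
	    (ev.fflags & NOTE_WRITE) != 0)
		printf("got NOTE_WRITE\n");	/* expected with the fix */
	else
		printf("no event (the PR kern/56713 symptom)\n");
	return 0;
}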

These stacked vnodes already share a bit of state (locks, etc.) with the bottom layer, so my solution is to allow a stacked vnode to also refer to the bottom vnode's klist.  Each vnode starts out referring to its own klist, but as a layerfs vnode is constructed, it is told to refer to the lower vnode's klist (just as it refers to the lower vnode's interlock and uvm_obj lock).  Various assertions ensure that knotes are never attached to a vnode that refers to another vnode's klist; they should always be attached to the bottom-most vnode.
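
For those who don't want to read the whole diff, the core of the scheme boils down to the following (all names here are taken from the patch below):

struct vnode_klist {
	struct klist	vk_klist;	/* i   notes attached to vnode */
	long		vk_interest;	/* i   what the notes are interested in */
};

/*
 * Every vnode's v_klist initially points at its own embedded
 * vnode_impl::vi_klist.  Constructing a layer vnode redirects it
 * to the lower vnode's state, so a knote() on either vnode
 * traverses the same list:
 */
void
vshareklist(vnode_t *tvp, vnode_t *fvp)
{
	/* Sharing klist state requires sharing the interlock, too. */
	KASSERT(tvp->v_interlock == fvp->v_interlock);
	tvp->v_klist = fvp->v_klist;
}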

(After applying the patch, vnode_if.c must be rebuilt.)

Index: sys/fs/union/union_subr.c
===================================================================
RCS file: /cvsroot/src/sys/fs/union/union_subr.c,v
retrieving revision 1.81
diff -u -p -r1.81 union_subr.c
--- sys/fs/union/union_subr.c	19 Mar 2022 13:53:32 -0000	1.81
+++ sys/fs/union/union_subr.c	12 Jul 2022 00:17:28 -0000
@@ -232,10 +232,11 @@ union_newupper(struct union_node *un, st
 	unlock_ap.a_desc = VDESC(vop_unlock);
 	unlock_ap.a_vp = UNIONTOV(un);
 	genfs_unlock(&unlock_ap);
-	/* Update union vnode interlock & vmobjlock. */
+	/* Update union vnode interlock, vmobjlock, & klist. */
 	vshareilock(UNIONTOV(un), uppervp);
 	rw_obj_hold(uppervp->v_uobj.vmobjlock);
 	uvm_obj_setlock(&UNIONTOV(un)->v_uobj, uppervp->v_uobj.vmobjlock);
+	vshareklist(UNIONTOV(un), uppervp);
 	mutex_exit(&un->un_lock);
 	if (ohash != nhash) {
 		LIST_INSERT_HEAD(&uhashtbl[nhash], un, un_cache);
@@ -577,6 +578,7 @@ union_loadvnode(struct mount *mp, struct
 	vshareilock(vp, svp);
 	rw_obj_hold(svp->v_uobj.vmobjlock);
 	uvm_obj_setlock(&vp->v_uobj, svp->v_uobj.vmobjlock);
+	vshareklist(vp, svp);
 
 	/* detect the root vnode (and aliases) */
 	if ((un->un_uppervp == um->um_uppervp) &&
Index: sys/miscfs/genfs/layer_vfsops.c
===================================================================
RCS file: /cvsroot/src/sys/miscfs/genfs/layer_vfsops.c,v
retrieving revision 1.54
diff -u -p -r1.54 layer_vfsops.c
--- sys/miscfs/genfs/layer_vfsops.c	23 Feb 2020 15:46:41 -0000	1.54
+++ sys/miscfs/genfs/layer_vfsops.c	12 Jul 2022 00:17:28 -0000
@@ -205,10 +205,11 @@ layerfs_loadvnode(struct mount *mp, stru
 
 	xp = kmem_alloc(lmp->layerm_size, KM_SLEEP);
 
-	/* Share the interlock and vmobjlock with the lower node. */
+	/* Share the interlock, vmobjlock, and klist with the lower node. */
 	vshareilock(vp, lowervp);
 	rw_obj_hold(lowervp->v_uobj.vmobjlock);
 	uvm_obj_setlock(&vp->v_uobj, lowervp->v_uobj.vmobjlock);
+	vshareklist(vp, lowervp);
 
 	vp->v_tag = lmp->layerm_tag;
 	vp->v_type = lowervp->v_type;
Index: sys/kern/vfs_vnode.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vfs_vnode.c,v
retrieving revision 1.143
diff -u -p -r1.143 vfs_vnode.c
--- sys/kern/vfs_vnode.c	9 Apr 2022 23:45:45 -0000	1.143
+++ sys/kern/vfs_vnode.c	12 Jul 2022 00:17:28 -0000
@@ -457,7 +457,8 @@ vnalloc_marker(struct mount *mp)
 	vp->v_mount = mp;
 	vp->v_type = VBAD;
 	vp->v_interlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
-	klist_init(&vp->v_klist);
+	klist_init(&vip->vi_klist.vk_klist);
+	vp->v_klist = &vip->vi_klist;
 	vip->vi_state = VS_MARKER;
 
 	return vp;
@@ -475,7 +476,7 @@ vnfree_marker(vnode_t *vp)
 	KASSERT(vip->vi_state == VS_MARKER);
 	mutex_obj_free(vp->v_interlock);
 	uvm_obj_destroy(&vp->v_uobj, true);
-	klist_fini(&vp->v_klist);
+	klist_fini(&vip->vi_klist.vk_klist);
 	pool_cache_put(vcache_pool, vip);
 }
 
@@ -1391,7 +1392,8 @@ vcache_alloc(void)
 	vp->v_interlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
 
 	uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 1);
-	klist_init(&vp->v_klist);
+	klist_init(&vip->vi_klist.vk_klist);
+	vp->v_klist = &vip->vi_klist;
 	cv_init(&vp->v_cv, "vnode");
 	cache_vnode_init(vp);
 
@@ -1453,7 +1455,9 @@ vcache_free(vnode_impl_t *vip)
 	mutex_obj_free(vp->v_interlock);
 	rw_destroy(&vip->vi_lock);
 	uvm_obj_destroy(&vp->v_uobj, true);
-	klist_fini(&vp->v_klist);
+	KASSERT(vp->v_klist == &vip->vi_klist ||
+		SLIST_EMPTY(&vip->vi_klist.vk_klist));
+	klist_fini(&vip->vi_klist.vk_klist);
 	cv_destroy(&vp->v_cv);
 	cache_vnode_fini(vp);
 	pool_cache_put(vcache_pool, vip);
@@ -1916,7 +1920,7 @@ vcache_reclaim(vnode_t *vp)
 	 * Don't check for interest in NOTE_REVOKE; it's always posted
 	 * because it sets EV_EOF.
 	 */
-	KNOTE(&vp->v_klist, NOTE_REVOKE);
+	KNOTE(&vp->v_klist->vk_klist, NOTE_REVOKE);
 	mutex_exit(vp->v_interlock);
 
 	/*
@@ -2095,3 +2099,28 @@ vshareilock(vnode_t *tvp, vnode_t *fvp)
 	tvp->v_interlock = fvp->v_interlock;
 	mutex_obj_free(oldlock);
 }
+
+void
+vshareklist(vnode_t *tvp, vnode_t *fvp)
+{
+	/*
+	 * If two vnodes share klist state, they must also share
+	 * an interlock.
+	 */
+	KASSERT(tvp->v_interlock == fvp->v_interlock);
+
+	/*
+	 * We make the following assumptions:
+	 *
+	 * ==> Some other synchronization is happening outside of
+	 *     our view to make this safe.
+	 *
+	 * ==> That the "to" vnode will have the necessary references
+	 *     on the "from" vnode so that the storage for the klist
+	 *     won't be yanked out from beneath us (the vnode_impl).
+	 *
+	 * ==> If "from" is also sharing, we then assume that "from"
+	 *     has the necessary references, and so on.
+	 */
+	tvp->v_klist = fvp->v_klist;
+}
Index: sys/kern/vfs_vnops.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vfs_vnops.c,v
retrieving revision 1.233
diff -u -p -r1.233 vfs_vnops.c
--- sys/kern/vfs_vnops.c	6 Jul 2022 13:52:24 -0000	1.233
+++ sys/kern/vfs_vnops.c	12 Jul 2022 00:17:28 -0000
@@ -79,7 +79,7 @@ __KERNEL_RCSID(0, "$NetBSD: vfs_vnops.c,
 #include <sys/proc.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
-#include <sys/vnode.h>
+#include <sys/vnode_impl.h>
 #include <sys/ioctl.h>
 #include <sys/tty.h>
 #include <sys/poll.h>
@@ -1428,9 +1428,17 @@ vn_knote_to_interest(const struct knote 
 void
 vn_knote_attach(struct vnode *vp, struct knote *kn)
 {
+	struct vnode_klist *vk = vp->v_klist;
 	long interest = 0;
 
 	/*
+	 * In the case of layered / stacked file systems, knotes
+	 * should only ever be associated with the base vnode.
+	 */
+	KASSERT(kn->kn_hook == vp);
+	KASSERT(vp->v_klist == &VNODE_TO_VIMPL(vp)->vi_klist);
+
+	/*
 	 * We maintain a bitmask of the kevents that there is interest in,
 	 * to minimize the impact of having watchers.  It's silly to have
 	 * to traverse vn_klist every time a read or write happens simply
@@ -1439,18 +1447,23 @@ vn_knote_attach(struct vnode *vp, struct
 	 */
 
 	mutex_enter(vp->v_interlock);
-	SLIST_INSERT_HEAD(&vp->v_klist, kn, kn_selnext);
-	SLIST_FOREACH(kn, &vp->v_klist, kn_selnext) {
+	SLIST_INSERT_HEAD(&vk->vk_klist, kn, kn_selnext);
+	SLIST_FOREACH(kn, &vk->vk_klist, kn_selnext) {
 		interest |= vn_knote_to_interest(kn);
 	}
-	vp->v_klist_interest = interest;
+	vk->vk_interest = interest;
 	mutex_exit(vp->v_interlock);
 }
 
 void
 vn_knote_detach(struct vnode *vp, struct knote *kn)
 {
-	int interest = 0;
+	struct vnode_klist *vk = vp->v_klist;
+	long interest = 0;
+
+	/* See above. */
+	KASSERT(kn->kn_hook == vp);
+	KASSERT(vp->v_klist == &VNODE_TO_VIMPL(vp)->vi_klist);
 
 	/*
 	 * We special case removing the head of the list, because:
@@ -1464,16 +1477,16 @@ vn_knote_detach(struct vnode *vp, struct
 	 */
 
 	mutex_enter(vp->v_interlock);
-	if (__predict_true(kn == SLIST_FIRST(&vp->v_klist))) {
-		SLIST_REMOVE_HEAD(&vp->v_klist, kn_selnext);
-		SLIST_FOREACH(kn, &vp->v_klist, kn_selnext) {
+	if (__predict_true(kn == SLIST_FIRST(&vk->vk_klist))) {
+		SLIST_REMOVE_HEAD(&vk->vk_klist, kn_selnext);
+		SLIST_FOREACH(kn, &vk->vk_klist, kn_selnext) {
 			interest |= vn_knote_to_interest(kn);
 		}
-		vp->v_klist_interest = interest;
+		vk->vk_interest = interest;
 	} else {
 		struct knote *thiskn, *nextkn, *prevkn = NULL;
 
-		SLIST_FOREACH_SAFE(thiskn, &vp->v_klist, kn_selnext, nextkn) {
+		SLIST_FOREACH_SAFE(thiskn, &vk->vk_klist, kn_selnext, nextkn) {
 			if (thiskn == kn) {
 				KASSERT(kn != NULL);
 				KASSERT(prevkn != NULL);
@@ -1484,7 +1497,7 @@ vn_knote_detach(struct vnode *vp, struct
 				prevkn = thiskn;
 			}
 		}
-		vp->v_klist_interest = interest;
+		vk->vk_interest = interest;
 	}
 	mutex_exit(vp->v_interlock);
 }
Index: sys/kern/vnode_if.sh
===================================================================
RCS file: /cvsroot/src/sys/kern/vnode_if.sh,v
retrieving revision 1.75
diff -u -p -r1.75 vnode_if.sh
--- sys/kern/vnode_if.sh	3 May 2022 13:54:18 -0000	1.75
+++ sys/kern/vnode_if.sh	12 Jul 2022 00:17:28 -0000
@@ -444,7 +444,7 @@ do {									\\
 	 */								\\
 	mutex_enter((thisvp)->v_interlock);				\\
 	if (__predict_true((e) == 0)) {					\\
-		knote(&(thisvp)->v_klist, (n));				\\
+		knote(&(thisvp)->v_klist->vk_klist, (n));		\\
 	}								\\
 	holdrelel((thisvp));						\\
 	mutex_exit((thisvp)->v_interlock);				\\
@@ -557,7 +557,7 @@ do {									\\
 		 * meaningless from the watcher's perspective.		\\
 		 */							\\
 		if (__predict_true(thisvp->v_op != dead_vnodeop_p)) {	\\
-			knote(&thisvp->v_klist,				\\
+			knote(&thisvp->v_klist->vk_klist,		\\
 			    ((ap)->a_fflag & FWRITE)			\\
 			    ? NOTE_CLOSE_WRITE : NOTE_CLOSE);		\\
 		}							\\
Index: sys/sys/param.h
===================================================================
RCS file: /cvsroot/src/sys/sys/param.h,v
retrieving revision 1.711
diff -u -p -r1.711 param.h
--- sys/sys/param.h	20 Jun 2022 08:38:56 -0000	1.711
+++ sys/sys/param.h	12 Jul 2022 00:17:28 -0000
@@ -67,7 +67,7 @@
  *	2.99.9		(299000900)
  */
 
-#define	__NetBSD_Version__	999009800	/* NetBSD 9.99.98 */
+#define	__NetBSD_Version__	999009900	/* NetBSD 9.99.99 */
 
 #define __NetBSD_Prereq__(M,m,p) (((((M) * 100000000) + \
     (m) * 1000000) + (p) * 100) <= __NetBSD_Version__)
Index: sys/sys/vnode.h
===================================================================
RCS file: /cvsroot/src/sys/sys/vnode.h,v
retrieving revision 1.301
diff -u -p -r1.301 vnode.h
--- sys/sys/vnode.h	25 Mar 2022 08:56:36 -0000	1.301
+++ sys/sys/vnode.h	12 Jul 2022 00:17:28 -0000
@@ -179,8 +179,8 @@ struct vnode {
 	enum vtype	v_type;			/* -   vnode type */
 	enum vtagtype	v_tag;			/* -   type of underlying data */
 	void 		*v_data;		/* -   private data for fs */
-	struct klist	v_klist;		/* i   notes attached to vnode */
-	long		v_klist_interest;	/* i   what the noes are interested in */
+	struct vnode_klist *v_klist;		/* i   kevent / knote info */
+
 	void		*v_segvguard;		/* e   for PAX_SEGVGUARD */
 };
 #define	v_mountedhere	v_un.vu_mountedhere
@@ -190,6 +190,19 @@ struct vnode {
 #define	v_ractx		v_un.vu_ractx
 
 typedef struct vnode vnode_t;
+
+/*
+ * Structure that encompasses the kevent state for a vnode.  This is
+ * carved out as a separate structure because some vnodes may share
+ * this state with one another.
+ *
+ * N.B. if two vnodes share a vnode_klist, then they must also share
+ * v_interlock.
+ */
+struct vnode_klist {
+	struct klist	vk_klist;	/* i   notes attached to vnode */
+	long		vk_interest;	/* i   what the notes are interested in */
+};
 #endif
 
 /*
@@ -415,7 +428,7 @@ void vref(struct vnode *);
  * Macro to determine kevent interest on a vnode.
  */
 #define	VN_KEVENT_INTEREST(vp, n)					\
-	((vp)->v_klist_interest != 0)
+	(((vp)->v_klist->vk_interest & (n)) != 0)
 
 static inline void
 VN_KNOTE(struct vnode *vp, long hint)
@@ -429,7 +442,7 @@ VN_KNOTE(struct vnode *vp, long hint)
 	 */
 	if (__predict_false(VN_KEVENT_INTEREST(vp, hint))) {
 		mutex_enter(vp->v_interlock);
-		knote(&vp->v_klist, hint);
+		knote(&vp->v_klist->vk_klist, hint);
 		mutex_exit(vp->v_interlock);
 	}
 }
@@ -594,6 +607,7 @@ int	vdead_check(struct vnode *, int);
 void	vrevoke(struct vnode *);
 void	vremfree(struct vnode *);
 void	vshareilock(struct vnode *, struct vnode *);
+void	vshareklist(struct vnode *, struct vnode *);
 int	vrefcnt(struct vnode *);
 int	vcache_get(struct mount *, const void *, size_t, struct vnode **);
 int	vcache_new(struct mount *, struct vnode *,
Index: sys/sys/vnode_impl.h
===================================================================
RCS file: /cvsroot/src/sys/sys/vnode_impl.h,v
retrieving revision 1.23
diff -u -p -r1.23 vnode_impl.h
--- sys/sys/vnode_impl.h	22 Mar 2020 14:38:37 -0000	1.23
+++ sys/sys/vnode_impl.h	12 Jul 2022 00:17:28 -0000
@@ -77,6 +77,12 @@ struct vnode_impl {
 	struct vcache_key vi_key;		/* c   vnode cache key */
 
 	/*
+	 * The vnode klist is accessed frequently, but rarely
+	 * modified.
+	 */
+	struct vnode_klist vi_klist;		/* i   kevent / knote state */
+
+	/*
 	 * vnode cache, LRU and syncer.  This all changes with some
 	 * regularity so keep it together.
 	 */
Index: tests/lib/libc/kevent_nullmnt/t_nullmnt.sh
===================================================================
RCS file: /cvsroot/src/tests/lib/libc/kevent_nullmnt/t_nullmnt.sh,v
retrieving revision 1.5
diff -u -p -r1.5 t_nullmnt.sh
--- tests/lib/libc/kevent_nullmnt/t_nullmnt.sh	4 Jun 2022 20:32:49 -0000	1.5
+++ tests/lib/libc/kevent_nullmnt/t_nullmnt.sh	12 Jul 2022 00:17:28 -0000
@@ -33,7 +33,6 @@ nullmnt_upper_lower_head()
 }
 nullmnt_upper_lower_body()
 {
-	atf_expect_fail "PR kern/56713"
 	nullmnt_common lower_dir upper_dir
 } 
 nullmnt_upper_lower_cleanup()
@@ -48,7 +47,6 @@ nullmnt_upper_upper_head()
 }
 nullmnt_upper_upper_body()
 {
-	atf_expect_fail "PR kern/56713"
 	nullmnt_common upper_dir upper_dir
 } 
 nullmnt_upper_upper_cleanup()
-- thorpej