Subject: Re: Redoing file system suspension API (update)
To: Bill Studenmund <wrstuden@netbsd.org>
From: Juergen Hannken-Illjes <hannken@eis.cs.tu-bs.de>
List: tech-kern
Date: 07/02/2006 11:22:55
--MGYHOYXEY6WxJCY8
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

On Fri, Jun 30, 2006 at 10:11:14AM -0700, Bill Studenmund wrote:
> On Fri, Jun 30, 2006 at 10:32:20AM +0200, Juergen Hannken-Illjes wrote:
> > On Thu, Jun 29, 2006 at 08:54:54PM -0700, Bill Studenmund wrote:
> > > On Thu, Jun 29, 2006 at 09:52:43PM +0200, Juergen Hannken-Illjes wrote:
> > > 
> > > Ok. That's a mess.
> > > 
> > > We should look at what Solaris does.
> > 
> > It does what we need :-)  When aquiring a shared lock it walks a (thread-owned)
> > list and if it finds a shared lock it returns "have a lock".
> > 
> > I can't think of a way to do this without direct or indirect thread specific
> > information.  Either
> > 
> >     Add a list of "struct mount" on which we own shared locks to "struct lwp".
> >     Length will be at most the depth of mount points.
> > 
> > or
> > 
> >     Add a list of "struct lwp" who own a shared lock to our lock.  This list
> >     will become too long to be implemented as a simple list.  Would need
> >     a hash tab or so.
> 
> Let's just do what Solaris does. Add info to struct lwp so we remember 
> shared locks we hold. Since a thread will hold far fewer shared locks than 
> threads will own a given lock, the list is shorter to associate it with 
> the lwp not struct lock.

Could we agree on the attached diff?  Compiled but not tested so far.

- Enter/leave with counter instead of "return have_the_lock".  Getting
  the lock always succeeds if a thread already owns a shared or exclusive lock.

- Old or new api is selected from a mount specific flag to allow successive
  migration.

-- 
Juergen Hannken-Illjes - hannken@eis.cs.tu-bs.de - TU Braunschweig (Germany)

--MGYHOYXEY6WxJCY8
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="vntrans.diff"

Index: sys/sys/fstypes.h
===================================================================
RCS file: /cvsroot/src/sys/sys/fstypes.h,v
retrieving revision 1.8
diff -p -u -4 -r1.8 fstypes.h
--- sys/sys/fstypes.h	12 Feb 2006 01:32:07 -0000	1.8
+++ sys/sys/fstypes.h	2 Jul 2006 09:05:14 -0000
@@ -193,8 +193,9 @@ typedef struct fhandle	fhandle_t;
 #define	IMNT_SUSPEND	0x00000008	/* request upper write suspension */
 #define	IMNT_SUSPENDLOW	0x00000010	/* request lower write suspension */
 #define	IMNT_SUSPENDED	0x00000020	/* write operations are suspended */
 #define	IMNT_DTYPE	0x00000040	/* returns d_type fields */
+#define	IMNT_HAS_TRANS	0x00000080	/* supports transactions */
 
 #define __MNT_FLAGS \
 	__MNT_BASIC_FLAGS \
 	__MNT_EXPORTED_FLAGS \
@@ -237,8 +238,9 @@ typedef struct fhandle	fhandle_t;
 	"\01MNT_RDONLY"
 
 #define __IMNT_FLAG_BITS \
 	"\20" \
+	"\10IMNT_HAS_TRANS" \
 	"\07IMNT_DTYPE" \
 	"\06IMNT_SUSPENDED" \
 	"\05IMNT_SUSPENDLOW" \
 	"\04IMNT_SUSPEND" \
Index: sys/sys/lwp.h
===================================================================
RCS file: /cvsroot/src/sys/sys/lwp.h,v
retrieving revision 1.37
diff -p -u -4 -r1.37 lwp.h
--- sys/sys/lwp.h	22 May 2006 13:43:54 -0000	1.37
+++ sys/sys/lwp.h	2 Jul 2006 09:05:14 -0000
@@ -72,8 +72,9 @@ struct	lwp {
 	int	l_holdcnt;	/* If non-zero, don't swap. */
 	void	*l_ctxlink;	/* uc_link {get,set}context */
 	int	l_dupfd;	/* Sideways return value from cloning devices XXX */
 	struct sadata_vp *l_savp; /* SA "virtual processor" */
+	void	*l_vntrans;	/* vn_trans private data */
 
 	int	l_locks;       	/* DEBUG: lockmgr count of held locks */
 	void	*l_private;	/* svr4-style lwp-private data */
 
Index: sys/sys/mount.h
===================================================================
RCS file: /cvsroot/src/sys/sys/mount.h,v
retrieving revision 1.142
diff -p -u -4 -r1.142 mount.h
--- sys/sys/mount.h	17 Jun 2006 07:06:50 -0000	1.142
+++ sys/sys/mount.h	2 Jul 2006 09:05:15 -0000
@@ -97,8 +97,9 @@ struct mount {
 	struct vnode	*mnt_vnodecovered;	/* vnode we mounted on */
 	struct vnode	*mnt_syncer;		/* syncer vnode */
 	struct vnodelst	mnt_vnodelist;		/* list of vnodes this mount */
 	struct lock	mnt_lock;		/* mount structure lock */
+	struct lock	mnt_trans_lock;		/* mount transaction lock */
 	int		mnt_flag;		/* flags */
 	int		mnt_iflag;		/* internal flags */
 	int		mnt_fs_bshift;		/* offset shift for lblkno */
 	int		mnt_dev_bshift;		/* shift for device sectors */
Index: sys/sys/vnode.h
===================================================================
RCS file: /cvsroot/src/sys/sys/vnode.h,v
retrieving revision 1.155
diff -p -u -4 -r1.155 vnode.h
--- sys/sys/vnode.h	23 Jun 2006 14:13:02 -0000	1.155
+++ sys/sys/vnode.h	2 Jul 2006 09:05:15 -0000
@@ -493,17 +493,8 @@ struct vop_generic_args {
 #define	VDESC(OP) (& __CONCAT(OP,_desc))
 #define	VOFFSET(OP) (VDESC(OP)->vdesc_offset)
 
 /*
- * Functions to gate filesystem write operations. Declared static inline
- * here because they usually go into time critical code paths.
- */
-#include <sys/mount.h>
-
-int vn_start_write(struct vnode *, struct mount **, int);
-void vn_finished_write(struct mount *, int);
-
-/*
  * Finally, include the default set of vnode operations.
  */
 #include <sys/vnode_if.h>
 
@@ -571,8 +562,13 @@ int	vn_cow_establish(struct vnode *, int
             void *);
 int	vn_cow_disestablish(struct vnode *, int (*)(void *, struct buf *),
             void *);
 void	vn_ra_allocctx(struct vnode *);
+void	vn_trans_destroy(struct lwp *);
+int	vn_trans_lock(struct mount *, int);
+void	vn_trans_unlock(struct mount *);
+int	vn_start_write(struct vnode *, struct mount **, int);
+void	vn_finished_write(struct mount *, int);
 
 /* initialise global vnode management */
 void	vntblinit(void);
 
Index: sys/kern/kern_exit.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_exit.c,v
retrieving revision 1.156
diff -p -u -4 -r1.156 kern_exit.c
--- sys/kern/kern_exit.c	14 May 2006 21:15:11 -0000	1.156
+++ sys/kern/kern_exit.c	2 Jul 2006 09:05:12 -0000
@@ -350,8 +350,10 @@ exit1(struct lwp *l, int rv)
 	 */
 #ifndef __NO_CPU_LWP_FREE
 	cpu_lwp_free(l, 1);
 #endif
+	if (l->l_vntrans)
+		vn_trans_destroy(l);
 
 	pmap_deactivate(l);
 
 	/*
Index: sys/kern/vfs_subr.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vfs_subr.c,v
retrieving revision 1.267
diff -p -u -4 -r1.267 vfs_subr.c
--- sys/kern/vfs_subr.c	23 Jun 2006 14:13:02 -0000	1.267
+++ sys/kern/vfs_subr.c	2 Jul 2006 09:05:13 -0000
@@ -358,8 +358,9 @@ vfs_rootmountalloc(const char *fstypenam
 		return (ENODEV);
 	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
 	memset((char *)mp, 0, (u_long)sizeof(struct mount));
 	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
+	lockinit(&mp->mnt_trans_lock, PVFS, "suspfs", 0, 0);
 	simple_lock_init(&mp->mnt_slock);
 	(void)vfs_busy(mp, LK_NOWAIT, 0);
 	LIST_INIT(&mp->mnt_vnodelist);
 	mp->mnt_op = vfsp;
Index: sys/kern/vfs_syscalls.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vfs_syscalls.c,v
retrieving revision 1.243
diff -p -u -4 -r1.243 vfs_syscalls.c
--- sys/kern/vfs_syscalls.c	17 Jun 2006 07:06:51 -0000	1.243
+++ sys/kern/vfs_syscalls.c	2 Jul 2006 09:05:13 -0000
@@ -319,8 +319,9 @@ sys_mount(struct lwp *l, void *v, regist
 	mp = (struct mount *)malloc((u_long)sizeof(struct mount),
 		M_MOUNT, M_WAITOK);
 	memset((char *)mp, 0, (u_long)sizeof(struct mount));
 	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
+	lockinit(&mp->mnt_trans_lock, PVFS, "suspfs", 0, 0);
 	simple_lock_init(&mp->mnt_slock);
 	(void)vfs_busy(mp, LK_NOWAIT, 0);
 	mp->mnt_op = vfs;
 	vfs->vfs_refcount++;
@@ -616,8 +617,9 @@ dounmount(struct mount *mp, int flags, s
 	if (LIST_FIRST(&mp->mnt_vnodelist) != NULL)
 		panic("unmount: dangling vnode");
 	mp->mnt_iflag |= IMNT_GONE;
 	lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK, &mountlist_slock);
+	lockmgr(&mp->mnt_trans_lock, LK_DRAIN, NULL);
 	if (used_syncer)
 		lockmgr(&syncer_lock, LK_RELEASE, NULL);
 	simple_lock(&mp->mnt_slock);
 	while (mp->mnt_wcnt > 0) {
Index: sys/kern/vfs_vnops.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vfs_vnops.c,v
retrieving revision 1.112
diff -p -u -4 -r1.112 vfs_vnops.c
--- sys/kern/vfs_vnops.c	27 May 2006 23:46:49 -0000	1.112
+++ sys/kern/vfs_vnops.c	2 Jul 2006 09:05:13 -0000
@@ -938,8 +938,105 @@ vn_extattr_rm(struct vnode *vp, int iofl
 	return (error);
 }
 
 /*
+ * vnode transaction interface.
+ */
+
+struct vn_trans_entry {
+	struct vn_trans_entry *vte_succ;
+	struct mount *vte_mount;
+	int vte_count;
+};
+
+POOL_INIT(vn_trans_pl, sizeof(struct vn_trans_entry), 0, 0, 0, "vntrans", NULL);
+
+void
+vn_trans_destroy(struct lwp *l)
+{
+	struct vn_trans_entry *vte, *vtn;
+
+	for (vte = l->l_vntrans; vte; vte = vtn) {
+		KASSERT(vte->vte_mount == NULL);
+		KASSERT(vte->vte_count == 0);
+		vtn = vte->vte_succ;
+		pool_put(&vn_trans_pl, vte);
+	}
+}
+
+int
+vn_trans_lock(struct mount *mp, int flags)
+{
+	int error, pflags;
+	struct vn_trans_entry *vte, *new_vte;
+
+	KASSERT((flags & ~(LK_SHARED|LK_EXCLUSIVE|LK_NOWAIT)) == 0);
+
+	if (mp == NULL)
+		return 0;
+	if ((mp->mnt_iflag & IMNT_HAS_TRANS) == 0)
+		return 0;
+
+	new_vte = NULL;
+	for (vte = curlwp->l_vntrans; vte; vte = vte->vte_succ) {
+		if (vte->vte_mount == NULL && new_vte == NULL)
+			new_vte = vte;
+		if (vte->vte_mount == mp) {
+			vte->vte_count += 1;
+			return 0;
+		}
+	}
+
+	if (new_vte == NULL) {
+		pflags = (flags & LK_NOWAIT) ? PR_NOWAIT : 0;
+		if ((new_vte = pool_get(&vn_trans_pl, pflags)) == NULL)
+			return EWOULDBLOCK;
+		new_vte->vte_mount = NULL;
+		new_vte->vte_count = 0;
+		new_vte->vte_succ = curlwp->l_vntrans;
+		curlwp->l_vntrans = new_vte;
+	} else {
+		KASSERT(new_vte->vte_mount == NULL);
+		KASSERT(new_vte->vte_count == 0);
+	}
+
+	if ((error = lockmgr(&mp->mnt_trans_lock, flags, NULL)) != 0)
+		return error;
+
+	new_vte->vte_mount = mp;
+	new_vte->vte_count = 1;
+
+	return 0;
+}
+
+void
+vn_trans_unlock(struct mount *mp)
+{
+	struct vn_trans_entry *vte;
+
+	if (mp == NULL)
+		return;
+	if ((mp->mnt_iflag & IMNT_HAS_TRANS) == 0)
+		return;
+
+	for (vte = curlwp->l_vntrans; vte; vte = vte->vte_succ) {
+		if (vte->vte_mount == mp) {
+			vte->vte_count -= 1;
+			if (vte->vte_count > 0)
+				return;
+			break;
+		}
+	}
+
+	KASSERT(vte != NULL);
+	KASSERT(vte->vte_mount == mp);
+	KASSERT(vte->vte_count == 0);
+	vte->vte_mount = NULL;
+
+	lockmgr(&mp->mnt_trans_lock, LK_RELEASE, NULL);
+}
+
+/*
  * Preparing to start a filesystem write operation. If the operation is
  * permitted, then we bump the count of operations in progress and
  * proceed. If a suspend request is in progress, we wait until the
  * suspension is over, and then proceed.
@@ -965,8 +1062,10 @@ vn_start_write(struct vnode *vp, struct 
 	}
 	if ((mp = *mpp) == NULL)
 		return (0);
 	mp = mp->mnt_leaf;
+	if ((mp->mnt_iflag & IMNT_HAS_TRANS) != 0)
+		return 0;
 	/*
 	 * Check on status of suspension.
 	 */
 	prio = PUSER - 1;
@@ -1006,8 +1105,10 @@ vn_finished_write(struct mount *mp, int 
 {
 	if (mp == NULL)
 		return;
 	mp = mp->mnt_leaf;
+	if ((mp->mnt_iflag & IMNT_HAS_TRANS) != 0)
+		return;
 	simple_lock(&mp->mnt_slock);
 	if ((flags & V_LOWER) == 0) {
 		mp->mnt_writeopcountupper--;
 		if (mp->mnt_writeopcountupper < 0)

--MGYHOYXEY6WxJCY8--