Subject: Re: Redoing file system suspension API (update)
To: Bill Studenmund <wrstuden@netbsd.org>
From: Juergen Hannken-Illjes <hannken@eis.cs.tu-bs.de>
List: tech-kern
Date: 07/02/2006 11:22:55
--MGYHOYXEY6WxJCY8
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
On Fri, Jun 30, 2006 at 10:11:14AM -0700, Bill Studenmund wrote:
> On Fri, Jun 30, 2006 at 10:32:20AM +0200, Juergen Hannken-Illjes wrote:
> > On Thu, Jun 29, 2006 at 08:54:54PM -0700, Bill Studenmund wrote:
> > > On Thu, Jun 29, 2006 at 09:52:43PM +0200, Juergen Hannken-Illjes wrote:
> > >
> > > Ok. That's a mess.
> > >
> > > We should look at what Solaris does.
> >
> > It does what we need :-) When aquiring a shared lock it walks a (thread-owned)
> > list and if it finds a shared lock it returns "have a lock".
> >
> > I can't think of a way to do this without direct or indirect thread specific
> > information. Either
> >
> > Add a list of "struct mount" on which we own shared locks to "struct lwp".
> > Length will be at most the depth of mount points.
> >
> > or
> >
> > Add a list of "struct lwp" who own a shared lock to our lock. This list
> > will become too long to be implemented as a simple list. Would need
> > a hash tab or so.
>
> Let's just do what Solaris does. Add info to struct lwp so we remember
> shared locks we hold. Since a thread will hold far fewer shared locks than
> threads will own a given lock, the list is shorter to associate it with
> the lwp not struct lock.
Could we agree on the attached diff? Compiled but not tested so far.
- Enter/leave with counter instead of "return have_the_lock". Getting
the lock always succeeds if a thread already owns a shared or exclusive lock.
- Old or new api is selected from a mount specific flag to allow successive
migration.
--
Juergen Hannken-Illjes - hannken@eis.cs.tu-bs.de - TU Braunschweig (Germany)
--MGYHOYXEY6WxJCY8
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="vntrans.diff"
Index: sys/sys/fstypes.h
===================================================================
RCS file: /cvsroot/src/sys/sys/fstypes.h,v
retrieving revision 1.8
diff -p -u -4 -r1.8 fstypes.h
--- sys/sys/fstypes.h 12 Feb 2006 01:32:07 -0000 1.8
+++ sys/sys/fstypes.h 2 Jul 2006 09:05:14 -0000
@@ -193,8 +193,9 @@ typedef struct fhandle fhandle_t;
#define IMNT_SUSPEND 0x00000008 /* request upper write suspension */
#define IMNT_SUSPENDLOW 0x00000010 /* request lower write suspension */
#define IMNT_SUSPENDED 0x00000020 /* write operations are suspended */
#define IMNT_DTYPE 0x00000040 /* returns d_type fields */
+#define IMNT_HAS_TRANS 0x00000080 /* supports transactions */
#define __MNT_FLAGS \
__MNT_BASIC_FLAGS \
__MNT_EXPORTED_FLAGS \
@@ -237,8 +238,9 @@ typedef struct fhandle fhandle_t;
"\01MNT_RDONLY"
#define __IMNT_FLAG_BITS \
"\20" \
+ "\10IMNT_HAS_TRANS" \
"\07IMNT_DTYPE" \
"\06IMNT_SUSPENDED" \
"\05IMNT_SUSPENDLOW" \
"\04IMNT_SUSPEND" \
Index: sys/sys/lwp.h
===================================================================
RCS file: /cvsroot/src/sys/sys/lwp.h,v
retrieving revision 1.37
diff -p -u -4 -r1.37 lwp.h
--- sys/sys/lwp.h 22 May 2006 13:43:54 -0000 1.37
+++ sys/sys/lwp.h 2 Jul 2006 09:05:14 -0000
@@ -72,8 +72,9 @@ struct lwp {
int l_holdcnt; /* If non-zero, don't swap. */
void *l_ctxlink; /* uc_link {get,set}context */
int l_dupfd; /* Sideways return value from cloning devices XXX */
struct sadata_vp *l_savp; /* SA "virtual processor" */
+ void *l_vntrans; /* vn_trans private data */
int l_locks; /* DEBUG: lockmgr count of held locks */
void *l_private; /* svr4-style lwp-private data */
Index: sys/sys/mount.h
===================================================================
RCS file: /cvsroot/src/sys/sys/mount.h,v
retrieving revision 1.142
diff -p -u -4 -r1.142 mount.h
--- sys/sys/mount.h 17 Jun 2006 07:06:50 -0000 1.142
+++ sys/sys/mount.h 2 Jul 2006 09:05:15 -0000
@@ -97,8 +97,9 @@ struct mount {
struct vnode *mnt_vnodecovered; /* vnode we mounted on */
struct vnode *mnt_syncer; /* syncer vnode */
struct vnodelst mnt_vnodelist; /* list of vnodes this mount */
struct lock mnt_lock; /* mount structure lock */
+ struct lock mnt_trans_lock; /* mount transaction lock */
int mnt_flag; /* flags */
int mnt_iflag; /* internal flags */
int mnt_fs_bshift; /* offset shift for lblkno */
int mnt_dev_bshift; /* shift for device sectors */
Index: sys/sys/vnode.h
===================================================================
RCS file: /cvsroot/src/sys/sys/vnode.h,v
retrieving revision 1.155
diff -p -u -4 -r1.155 vnode.h
--- sys/sys/vnode.h 23 Jun 2006 14:13:02 -0000 1.155
+++ sys/sys/vnode.h 2 Jul 2006 09:05:15 -0000
@@ -493,17 +493,8 @@ struct vop_generic_args {
#define VDESC(OP) (& __CONCAT(OP,_desc))
#define VOFFSET(OP) (VDESC(OP)->vdesc_offset)
/*
- * Functions to gate filesystem write operations. Declared static inline
- * here because they usually go into time critical code paths.
- */
-#include <sys/mount.h>
-
-int vn_start_write(struct vnode *, struct mount **, int);
-void vn_finished_write(struct mount *, int);
-
-/*
* Finally, include the default set of vnode operations.
*/
#include <sys/vnode_if.h>
@@ -571,8 +562,13 @@ int vn_cow_establish(struct vnode *, int
void *);
int vn_cow_disestablish(struct vnode *, int (*)(void *, struct buf *),
void *);
void vn_ra_allocctx(struct vnode *);
+void vn_trans_destroy(struct lwp *);
+int vn_trans_lock(struct mount *, int);
+void vn_trans_unlock(struct mount *);
+int vn_start_write(struct vnode *, struct mount **, int);
+void vn_finished_write(struct mount *, int);
/* initialise global vnode management */
void vntblinit(void);
Index: sys/kern/kern_exit.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_exit.c,v
retrieving revision 1.156
diff -p -u -4 -r1.156 kern_exit.c
--- sys/kern/kern_exit.c 14 May 2006 21:15:11 -0000 1.156
+++ sys/kern/kern_exit.c 2 Jul 2006 09:05:12 -0000
@@ -350,8 +350,10 @@ exit1(struct lwp *l, int rv)
*/
#ifndef __NO_CPU_LWP_FREE
cpu_lwp_free(l, 1);
#endif
+ if (l->l_vntrans)
+ vn_trans_destroy(l);
pmap_deactivate(l);
/*
Index: sys/kern/vfs_subr.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vfs_subr.c,v
retrieving revision 1.267
diff -p -u -4 -r1.267 vfs_subr.c
--- sys/kern/vfs_subr.c 23 Jun 2006 14:13:02 -0000 1.267
+++ sys/kern/vfs_subr.c 2 Jul 2006 09:05:13 -0000
@@ -358,8 +358,9 @@ vfs_rootmountalloc(const char *fstypenam
return (ENODEV);
mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
memset((char *)mp, 0, (u_long)sizeof(struct mount));
lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
+ lockinit(&mp->mnt_trans_lock, PVFS, "suspfs", 0, 0);
simple_lock_init(&mp->mnt_slock);
(void)vfs_busy(mp, LK_NOWAIT, 0);
LIST_INIT(&mp->mnt_vnodelist);
mp->mnt_op = vfsp;
Index: sys/kern/vfs_syscalls.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vfs_syscalls.c,v
retrieving revision 1.243
diff -p -u -4 -r1.243 vfs_syscalls.c
--- sys/kern/vfs_syscalls.c 17 Jun 2006 07:06:51 -0000 1.243
+++ sys/kern/vfs_syscalls.c 2 Jul 2006 09:05:13 -0000
@@ -319,8 +319,9 @@ sys_mount(struct lwp *l, void *v, regist
mp = (struct mount *)malloc((u_long)sizeof(struct mount),
M_MOUNT, M_WAITOK);
memset((char *)mp, 0, (u_long)sizeof(struct mount));
lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
+ lockinit(&mp->mnt_trans_lock, PVFS, "suspfs", 0, 0);
simple_lock_init(&mp->mnt_slock);
(void)vfs_busy(mp, LK_NOWAIT, 0);
mp->mnt_op = vfs;
vfs->vfs_refcount++;
@@ -616,8 +617,9 @@ dounmount(struct mount *mp, int flags, s
if (LIST_FIRST(&mp->mnt_vnodelist) != NULL)
panic("unmount: dangling vnode");
mp->mnt_iflag |= IMNT_GONE;
lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK, &mountlist_slock);
+ lockmgr(&mp->mnt_trans_lock, LK_DRAIN, NULL);
if (used_syncer)
lockmgr(&syncer_lock, LK_RELEASE, NULL);
simple_lock(&mp->mnt_slock);
while (mp->mnt_wcnt > 0) {
Index: sys/kern/vfs_vnops.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vfs_vnops.c,v
retrieving revision 1.112
diff -p -u -4 -r1.112 vfs_vnops.c
--- sys/kern/vfs_vnops.c 27 May 2006 23:46:49 -0000 1.112
+++ sys/kern/vfs_vnops.c 2 Jul 2006 09:05:13 -0000
@@ -938,8 +938,105 @@ vn_extattr_rm(struct vnode *vp, int iofl
return (error);
}
/*
+ * vnode transaction interface.
+ */
+
+struct vn_trans_entry {
+ struct vn_trans_entry *vte_succ;
+ struct mount *vte_mount;
+ int vte_count;
+};
+
+POOL_INIT(vn_trans_pl, sizeof(struct vn_trans_entry), 0, 0, 0, "vntrans", NULL);
+
+void
+vn_trans_destroy(struct lwp *l)
+{
+ struct vn_trans_entry *vte, *vtn;
+
+ for (vte = l->l_vntrans; vte; vte = vtn) {
+ KASSERT(vte->vte_mount == NULL);
+ KASSERT(vte->vte_count == 0);
+ vtn = vte->vte_succ;
+ pool_put(&vn_trans_pl, vte);
+ }
+}
+
+int
+vn_trans_lock(struct mount *mp, int flags)
+{
+ int error, pflags;
+ struct vn_trans_entry *vte, *new_vte;
+
+ KASSERT((flags & ~(LK_SHARED|LK_EXCLUSIVE|LK_NOWAIT)) == 0);
+
+ if (mp == NULL)
+ return 0;
+ if ((mp->mnt_iflag & IMNT_HAS_TRANS) == 0)
+ return 0;
+
+ new_vte = NULL;
+ for (vte = curlwp->l_vntrans; vte; vte = vte->vte_succ) {
+ if (vte->vte_mount == NULL && new_vte == NULL)
+ new_vte = vte;
+ if (vte->vte_mount == mp) {
+ vte->vte_count += 1;
+ return 0;
+ }
+ }
+
+ if (new_vte == NULL) {
+ pflags = (flags & LK_NOWAIT) ? PR_NOWAIT : 0;
+ if ((new_vte = pool_get(&vn_trans_pl, pflags)) == NULL)
+ return EWOULDBLOCK;
+ new_vte->vte_mount = NULL;
+ new_vte->vte_count = 0;
+ new_vte->vte_succ = curlwp->l_vntrans;
+ curlwp->l_vntrans = new_vte;
+ } else {
+ KASSERT(new_vte->vte_mount == NULL);
+ KASSERT(new_vte->vte_count == 0);
+ }
+
+ if ((error = lockmgr(&mp->mnt_trans_lock, flags, NULL)) != 0)
+ return error;
+
+ new_vte->vte_mount = mp;
+ new_vte->vte_count = 1;
+
+ return 0;
+}
+
+void
+vn_trans_unlock(struct mount *mp)
+{
+ struct vn_trans_entry *vte;
+
+ if (mp == NULL)
+ return;
+ if ((mp->mnt_iflag & IMNT_HAS_TRANS) == 0)
+ return;
+
+ for (vte = curlwp->l_vntrans; vte; vte = vte->vte_succ) {
+ if (vte->vte_mount == mp) {
+ vte->vte_count -= 1;
+ if (vte->vte_count > 0)
+ return;
+ break;
+ }
+ }
+
+ KASSERT(vte != NULL);
+ KASSERT(vte->vte_mount == mp);
+ KASSERT(vte->vte_count == 0);
+ vte->vte_mount = NULL;
+
+ lockmgr(&mp->mnt_trans_lock, LK_RELEASE, NULL);
+}
+
+/*
* Preparing to start a filesystem write operation. If the operation is
* permitted, then we bump the count of operations in progress and
* proceed. If a suspend request is in progress, we wait until the
* suspension is over, and then proceed.
@@ -965,8 +1062,10 @@ vn_start_write(struct vnode *vp, struct
}
if ((mp = *mpp) == NULL)
return (0);
mp = mp->mnt_leaf;
+ if ((mp->mnt_iflag & IMNT_HAS_TRANS) != 0)
+ return 0;
/*
* Check on status of suspension.
*/
prio = PUSER - 1;
@@ -1006,8 +1105,10 @@ vn_finished_write(struct mount *mp, int
{
if (mp == NULL)
return;
mp = mp->mnt_leaf;
+ if ((mp->mnt_iflag & IMNT_HAS_TRANS) != 0)
+ return;
simple_lock(&mp->mnt_slock);
if ((flags & V_LOWER) == 0) {
mp->mnt_writeopcountupper--;
if (mp->mnt_writeopcountupper < 0)
--MGYHOYXEY6WxJCY8--