tech-kern archive


fixing zfs



The attached patches fix a number of issues in our zfs port, mainly
having to do with locking and our (insane) vop protocols.  With them,
many of the zfs tests pass much more reliably, although a number
still fail, mainly having to do with permissions and file flags.

Currently zfs is badly hosed, and I am pretty confident that at least
the intent of these patches is correct even if I have made some
mistakes in the details.  So I strongly doubt that committing these
patches would make our zfs situation any worse than it currently is.

Any objections?  If not, I'll commit them tomorrow or in the next few
days.

Fix various issues in zfs life cycle and vops.

Restore some locking and unlocking that got lost randomly.

Fix zfs_netbsd_rename locking to sanitize the world first before
calling zfs_rename, so that we don't stupidly hold onto various locks
while zfs is doing its job.

Simplify zfs_netbsd_reclaim and make it work.  Now that getnewvnode
never tries to vclean itself, we don't need to worry about recursion of
zfs_obj_mutex locks.

Clarify and fix genfs node initialization and destruction.

Index: zfs_vnops.c
===================================================================
RCS file: /cvsroot/src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vnops.c,v
retrieving revision 1.11
diff -p -u -r1.11 zfs_vnops.c
--- zfs_vnops.c 1 Oct 2012 18:19:18 -0000       1.11
+++ zfs_vnops.c 14 Oct 2012 19:22:51 -0000
@@ -2033,6 +2033,12 @@ top:
        vnevent_rmdir(vp, dvp, name, ct);
 
        /*
+        * Grab a lock on the directory to make sure that no one is
+        * trying to add (or look up) entries while we are removing it.
+        */
+       rw_enter(&zp->z_name_lock, RW_WRITER);
+
+       /*
         * Grab a lock on the parent pointer to make sure we play well
         * with the treewalk and directory rename code.
         */
@@ -4899,10 +4905,35 @@ zfs_netbsd_mkdir(void *v)
 static int
 zfs_netbsd_rmdir(void *v)
 {
-       struct vop_rmdir_args *ap = v;
+       struct vop_rmdir_args /* {
+               struct vnode            *a_dvp;
+               struct vnode            *a_vp;
+               struct componentname    *a_cnp;
+       } */ *ap = v;
+       struct vnode *dvp = ap->a_dvp;
+       struct vnode *vp = ap->a_vp;
        struct componentname *cnp = ap->a_cnp;
+       int error;
+
+       KASSERT(dvp != NULL);
+       KASSERT(vp != NULL);
+       KASSERT(cnp != NULL);
+       KASSERT(cnp->cn_nameptr != NULL);
+       KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
+       KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
+
+       /*
+        * zfs_rmdir will look up the entry again itself, so ignore vp.
+        */
+       VOP_UNLOCK(vp);
+       VN_RELE(vp);
 
-       return (zfs_rmdir(ap->a_dvp, (char *)cnp->cn_nameptr, NULL, cnp->cn_cred, NULL, 0));
+       error = zfs_rmdir(dvp, __UNCONST(cnp->cn_nameptr), NULL, cnp->cn_cred,
+           NULL, 0);
+
+       VOP_UNLOCK(dvp);
+       VN_RELE(dvp);
+       return error;
 }
 
 static int
@@ -5053,21 +5084,63 @@ zfs_netbsd_rename(void *v)
        } */ *ap = v;
        vnode_t *fdvp = ap->a_fdvp;
        vnode_t *fvp = ap->a_fvp;
+       struct componentname *fcnp = ap->a_fcnp;
        vnode_t *tdvp = ap->a_tdvp;
        vnode_t *tvp = ap->a_tvp;
+       struct componentname *tcnp = ap->a_tcnp;
+       kauth_cred_t cred;
        int error;
 
-       error = zfs_rename(fdvp, (char *)ap->a_fcnp->cn_nameptr, tdvp,
-           (char *)ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred, NULL, 0);
+       KASSERT(fdvp != NULL);
+       KASSERT(fvp != NULL);
+       KASSERT(fcnp != NULL);
+       KASSERT(fcnp->cn_nameptr != NULL);
+       KASSERT(tdvp != NULL);
+       KASSERT(tcnp != NULL);
+       KASSERT(tcnp->cn_nameptr != NULL);
+       /* KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
+       /* KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
+       KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
+       KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
+       KASSERT(fdvp->v_type == VDIR);
+       KASSERT(tdvp->v_type == VDIR);
 
-       if (tdvp == tvp)
-               VN_RELE(tdvp);
-       else
-               VN_URELE(tdvp);
-       if (tvp)
-               VN_URELE(tvp);
-       VN_RELE(fdvp);
+       cred = fcnp->cn_cred;
+
+       /*
+        * XXX Want a better equality test.  `tcnp->cn_cred == cred'
+        * hoses p2k because puffs transmits the creds separately and
+        * allocates distinct but equivalent structures for them.
+        */
+       KASSERT(kauth_cred_uidmatch(cred, tcnp->cn_cred));
+
+       /*
+        * Drop the insane locks.
+        */
+       VOP_UNLOCK(tdvp);
+       if ((tvp != NULL) && (tvp != tdvp))
+               VOP_UNLOCK(tvp);
+
+       /*
+        * Release the source and target nodes; zfs_rename will look
+        * them up again once the locking situation is sane.
+        */
        VN_RELE(fvp);
+       if (tvp != NULL)
+               VN_RELE(tvp);
+
+       /*
+        * Do the rename ZFSly.
+        */
+       error = zfs_rename(fdvp, __UNCONST(fcnp->cn_nameptr), tdvp,
+           __UNCONST(tcnp->cn_nameptr), cred, NULL, 0);
+
+       /*
+        * Release the directories now too, because the VOP_RENAME
+        * protocol is insane.
+        */
+       VN_RELE(fdvp);
+       VN_RELE(tdvp);
 
        return (error);
 }
@@ -5286,91 +5359,67 @@ zfs_netbsd_inactive(void *v)
        return (0);
 }
 
-/*
- * Destroy znode from taskq thread without ZFS_OBJ_MUTEX held.
- */
-static void
-zfs_reclaim_deferred(void *arg)
-{
-       znode_t *zp = arg;
-       zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-       uint64_t z_id = zp->z_id;
-
-       /*
-        * Don't allow a zfs_zget() while were trying to release this znode
-        */
-       ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
-
-       /* Don't need to call ZFS_OBJ_HOLD_EXIT zfs_inactive did thatfor us. */
-       zfs_zinactive(zp);
-       
-}
-
 static int
 zfs_netbsd_reclaim(void *v)
 {
-       struct vop_reclaim_args *ap = v;
-       vnode_t *vp = ap->a_vp;
-       znode_t *zp = VTOZ(vp);
+       struct vop_reclaim_args /* {
+               struct vnode *a_vp;
+       } */ *ap = v;
+       struct vnode *vp = ap->a_vp;
+       znode_t *zp;
        zfsvfs_t *zfsvfs;
-       int locked;
-
-       locked = 0;
-       
-       ASSERT(zp != NULL);
-       KASSERT(!vn_has_cached_data(vp));
+       int error;
 
+       KASSERT(vp != NULL);
+       zp = VTOZ(vp);
+       KASSERT(zp != NULL);
        zfsvfs = zp->z_zfsvfs;
-       
-       mutex_enter(&zp->z_lock);
-       ASSERT(zp->z_phys);
+       KASSERT(zfsvfs != NULL);
 
-//     dprintf("destroying znode %p -- vnode %p -- zp->z_buf = %p\n", zp, ZTOV(zp), zp->z_dbuf);
-//     rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
-       genfs_node_destroy(vp);
-       cache_purge(vp);
+       /* Not until we get uvm and zfs talking to one another.  */
+       KASSERT(!vn_has_cached_data(vp));
 
+       rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
        if (zp->z_dbuf == NULL) {
                /*
                 * The fs has been unmounted, or we did a
                 * suspend/resume and this file no longer exists.
                 */
+               if (vn_has_cached_data(vp))
+                       /* zfs and uvm are hosed.  Should not happen.  */
+                       panic("zfs vnode has cached data (0): %p", vp);
                rw_exit(&zfsvfs->z_teardown_inactive_lock);
-               mutex_exit(&zp->z_lock);
                zfs_znode_free(zp);
                return (0);
        }
-       mutex_exit(&zp->z_lock);
 
-       mutex_enter(&zp->z_lock);
-       if (!zp->z_unlinked) {
-               /*
-                * XXX Hack because ZFS_OBJ_MUTEX is held we can't call zfs_zinactive
-                * now. I need to defer zfs_zinactive to another thread which doesn't hold this mutex.
-                */
-               locked = MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)) ? 2 :
-                   ZFS_OBJ_HOLD_TRYENTER(zfsvfs, zp->z_id);
-               if (locked == 0) {
-                       /*
-                        * Lock can't be obtained due to deadlock possibility,
-                        * so defer znode destruction.
-                        */
-                       taskq_dispatch(system_taskq, zfs_reclaim_deferred, zp, 0);
+       /*
+        * Attempt to push any data in the page cache.  If this fails
+        * we will get kicked out later in zfs_zinactive().
+        */
+       if (vn_has_cached_data(vp))
+               /* zfs and uvm are hosed.  Should not happen.  */
+               panic("zfs vnode has cached data (1): %p", vp);
+
+       if (zp->z_atime_dirty && zp->z_unlinked == 0) {
+               dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+
+               dmu_tx_hold_bonus(tx, zp->z_id);
+               error = dmu_tx_assign(tx, TXG_WAIT);
+               if (error) {
+                       dmu_tx_abort(tx);
                } else {
-                       zfs_znode_dmu_fini(zp);
-                       /* Our LWP is holding ZFS_OBJ_HELD mutex but it was locked before
-                          zfs_zinactive was called therefore we can't release it. */
-                       if (locked == 1)
-                               ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
-                       zfs_znode_free(zp);
+                       dmu_buf_will_dirty(zp->z_dbuf, tx);
+                       mutex_enter(&zp->z_lock);
+                       zp->z_atime_dirty = 0;
+                       mutex_exit(&zp->z_lock);
+                       dmu_tx_commit(tx);
                }
-       } else
-               mutex_exit(&zp->z_lock);
+       }
 
-       ZTOV(zp) = NULL;
-       vp->v_data = NULL; /* v_data must be NULL for a cleaned vnode. */
-       
-       return (0);
+       zfs_zinactive(zp);
+       rw_exit(&zfsvfs->z_teardown_inactive_lock);
+       return 0;
 }
 
 static int
Index: zfs_znode.c
===================================================================
RCS file: /cvsroot/src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_znode.c,v
retrieving revision 1.12
diff -p -u -r1.12 zfs_znode.c
--- zfs_znode.c 20 Nov 2011 02:54:25 -0000      1.12
+++ zfs_znode.c 14 Oct 2012 19:22:51 -0000
@@ -618,8 +618,8 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_bu
        int error;
 
        zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
-       for (;;) {
 
+       for (;;) {
                error = getnewvnode(VT_ZFS, zfsvfs->z_parent->z_vfs,
                    zfs_vnodeop_p, NULL, &zp->z_vnode);
                if (__predict_true(error == 0))
@@ -628,7 +628,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_bu
                    "error=%d\n", error);
                (void)kpause("zfsnewvn", false, hz, NULL);
        }
-       
+
        ASSERT(zp->z_dirlocks == NULL);
        ASSERT(zp->z_dbuf == NULL);
        ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
@@ -656,6 +656,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_bu
        vp->v_vfsp = zfsvfs->z_parent->z_vfs;
        vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
        vp->v_data = zp;
+       genfs_node_init(vp, &zfs_genfsops);
        switch (vp->v_type) {
        case VDIR:
                zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
@@ -686,6 +687,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_bu
        zp->z_zfsvfs = zfsvfs;
        mutex_exit(&zfsvfs->z_znodes_lock);
 
+       VFS_HOLD(zfsvfs->z_vfs);
        return (zp);
 }
 
@@ -998,11 +1000,6 @@ again:
        if (((znode_phys_t *)db->db_data)->zp_gen != 0) {
                zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size);
                *zpp = zp;
-
-               vp = ZTOV(zp);
-               genfs_node_init(vp, &zfs_genfsops);
-               VOP_UNLOCK(vp);
-
                err = 0;
        } else {
                dmu_buf_rele(db, NULL);
@@ -1076,17 +1073,13 @@ zfs_znode_delete(znode_t *zp, dmu_tx_t *
        zfs_znode_free(zp);
 }
 
-/*
- * zfs_zinactive must be called with ZFS_OBJ_HOLD_ENTER held. And this lock
- * will be released in zfs_zinactive.
- */
 void
 zfs_zinactive(znode_t *zp)
 {
        vnode_t *vp = ZTOV(zp);
        zfsvfs_t *zfsvfs = zp->z_zfsvfs;
        uint64_t z_id = zp->z_id;
-       
+
        ASSERT(zp->z_dbuf && zp->z_phys);
 
        /*
@@ -1107,7 +1100,7 @@ zfs_zinactive(znode_t *zp)
        }
 
        mutex_exit(&zp->z_lock);
-       /* XXX why disabled zfs_znode_dmu_fini(zp); */
+       zfs_znode_dmu_fini(zp);
        ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
        zfs_znode_free(zp);
 }
@@ -1116,7 +1109,14 @@ void
 zfs_znode_free(znode_t *zp)
 {
        zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-       ASSERT(ZTOV(zp) == NULL);
+       struct vnode *vp = ZTOV(zp);
+
+       /* XXX Not all callers are from VOP_RECLAIM.  What to do?  */
+       KASSERT(vp != NULL);
+       mutex_enter(vp->v_interlock); /* XXX Necessary?  */
+       genfs_node_destroy(vp);
+       vp->v_data = NULL;
+       mutex_exit(vp->v_interlock);
 
        dprintf("destroying znode %p\n", zp);
        //cpu_Debugger();
@@ -1403,6 +1403,8 @@ top:
 
        dmu_tx_commit(tx);
 
+       zfs_range_unlock(rl);
+
        /*
         * Clear any mapped pages in the truncated region.  This has to
         * happen outside of the transaction to avoid the possibility of

Do reference counting for zfs dirlock waiters.

Solaris relies on cv_broadcast(&cv); cv_destroy(&cv) working, but
that hoses our cv_wait, which needs to continue using cv after it is
woken.  Solaris's idiom is an abuse of the condvar abstraction, but
we can get the same effect with reference counting.
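
For reference, here is the shape of the fix as a minimal sketch.  The
names (struct waitable, waitable_wait, waitable_release) are invented
for illustration; the actual patches apply the same idea to
zfs_dirlock_t and rl_t via the hold/rele functions below.

/*
 * A waitable object protected by a caller-supplied mutex.  The
 * creator holds one reference and each waiter takes another across
 * its sleep, so whoever drops the count to zero, waker or woken,
 * destroys the condvar and frees the object.  That way cv_wait can
 * keep using the condvar after it is woken.
 */
struct waitable {
	kcondvar_t	w_cv;
	unsigned long	w_refcnt;	/* creator plus one per waiter */
};

static void
waitable_wait(struct waitable *w, kmutex_t *lock)
{

	KASSERT(mutex_owned(lock));
	KASSERT(w->w_refcnt < ULONG_MAX);

	w->w_refcnt++;
	cv_wait(&w->w_cv, lock);
	if (--w->w_refcnt == 0) {	/* creator already released it */
		cv_destroy(&w->w_cv);
		kmem_free(w, sizeof(*w));
	}
}

static void
waitable_release(struct waitable *w, kmutex_t *lock)
{

	KASSERT(mutex_owned(lock));
	KASSERT(w->w_refcnt > 0);

	cv_broadcast(&w->w_cv);
	if (--w->w_refcnt == 0) {	/* nobody was waiting */
		cv_destroy(&w->w_cv);
		kmem_free(w, sizeof(*w));
	}
}

Note that the creator must unlink the object from any shared data
structure before calling waitable_release, so that no new waiter can
take a reference to a dying object; zfs_dirent_unlock below does
exactly that before its cv_broadcast.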

Index: zfs_dir.c
===================================================================
RCS file: /cvsroot/src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_dir.c,v
retrieving revision 1.5
diff -p -u -r1.5 zfs_dir.c
--- zfs_dir.c   27 Feb 2010 23:43:53 -0000      1.5
+++ zfs_dir.c   14 Oct 2012 19:22:51 -0000
@@ -96,6 +96,43 @@ zfs_match_find(zfsvfs_t *zfsvfs, znode_t
 }
 
 /*
+ * Reference counting for dirlocks.  Solaris destroys the condvar as
+ * soon as it broadcasts, which works for them because cv_wait doesn't
+ * need to use the condvar after it is woken, but which is too fast and
+ * loose with the abstraction for us in NetBSD.
+ */
+
+static int
+zfs_dirlock_hold(zfs_dirlock_t *dl, znode_t *dzp)
+{
+
+       (void)dzp;              /* ignore */
+       KASSERT(mutex_owned(&dzp->z_lock));
+
+       if (dl->dl_refcnt >= ULONG_MAX) /* XXX Name this constant.  */
+               return (ENFILE);        /* XXX What to do?  */
+
+       dl->dl_refcnt++;
+       return (0);
+}
+
+static void
+zfs_dirlock_rele(zfs_dirlock_t *dl, znode_t *dzp)
+{
+
+       (void)dzp;              /* ignore */
+       KASSERT(mutex_owned(&dzp->z_lock));
+       KASSERT(dl->dl_refcnt > 0);
+
+       if (--dl->dl_refcnt == 0) {
+               if (dl->dl_namesize != 0)
+                       kmem_free(dl->dl_name, dl->dl_namesize);
+               cv_destroy(&dl->dl_cv);
+               kmem_free(dl, sizeof(*dl));
+       }
+}
+
+/*
  * Lock a directory entry.  A dirlock on <dzp, name> protects that name
  * in dzp's directory zap object.  As long as you hold a dirlock, you can
  * assume two things: (1) dzp cannot be reaped, and (2) no other thread
@@ -246,14 +283,23 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, zn
                        dl->dl_sharecnt = 0;
                        dl->dl_namelock = 0;
                        dl->dl_namesize = 0;
+                       dl->dl_refcnt = 1;
                        dl->dl_dzp = dzp;
                        dl->dl_next = dzp->z_dirlocks;
                        dzp->z_dirlocks = dl;
                        break;
-               } 
+               }
                if ((flag & ZSHARED) && dl->dl_sharecnt != 0)
                        break;
+               error = zfs_dirlock_hold(dl, dzp);
+               if (error) {
+                       mutex_exit(&dzp->z_lock);
+                       if (!(flag & ZHAVELOCK))
+                               rw_exit(&dzp->z_name_lock);
+                       return (error);
+               }
                cv_wait(&dl->dl_cv, &dzp->z_lock);
+               zfs_dirlock_rele(dl, dzp);
        }
 
        /*
@@ -355,12 +401,8 @@ zfs_dirent_unlock(zfs_dirlock_t *dl)
                prev_dl = &cur_dl->dl_next;
        *prev_dl = dl->dl_next;
        cv_broadcast(&dl->dl_cv);
+       zfs_dirlock_rele(dl, dzp);
        mutex_exit(&dzp->z_lock);
-
-       if (dl->dl_namesize != 0)
-               kmem_free(dl->dl_name, dl->dl_namesize);
-       cv_destroy(&dl->dl_cv);
-       kmem_free(dl, sizeof (*dl));
 }
 
 /*
Index: sys/zfs_znode.h
===================================================================
RCS file: /cvsroot/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_znode.h,v
retrieving revision 1.4
diff -p -u -r1.4 zfs_znode.h
--- sys/zfs_znode.h     27 Feb 2010 23:43:53 -0000      1.4
+++ sys/zfs_znode.h     14 Oct 2012 19:22:51 -0000
@@ -186,6 +186,7 @@ typedef struct zfs_dirlock {
        uint32_t        dl_sharecnt;    /* 0 if exclusive, > 0 if shared */
        uint8_t         dl_namelock;    /* 1 if z_name_lock is NOT held */
        uint16_t        dl_namesize;    /* set if dl_name was allocated */
+       unsigned long   dl_refcnt;      /* reference count */
        kcondvar_t      dl_cv;          /* wait for entry to be unlocked */
        struct znode    *dl_dzp;        /* directory znode */
        struct zfs_dirlock *dl_next;    /* next in z_dirlocks list */

Do reference counting for range lock waiters.

Avoid cv_broadcast(&cv); cv_destroy(&cv); which works in Solaris only
by abuse of the condvar abstraction.

There are parts of this code that should be factored into smaller
subroutines, mainly range lock allocation and initialization, but
that would make it harder to merge newer versions of zfs, so for now
I've just expanded those parts further in-line.
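
One point the diff below relies on but does not state: after
zfs_range_lock_rele, the rl_t may already be freed, so a woken waiter
must not touch it again.  Every wait is therefore followed by a retry
that searches the AVL tree afresh.  A sketch of the waiter side, where
`conflict' is a hypothetical name for whichever overlapping rl_t was
found (the real code calls it rl, prev, or next):

	/* z_range_lock held; conflict overlaps the range we want. */
	if (!conflict->r_write_wanted)
		conflict->r_write_wanted = B_TRUE;
	if (zfs_range_lock_hold(conflict) != 0)
		panic("too many waiters on zfs range lock %p", conflict);
	cv_wait(&conflict->r_wr_cv, &zp->z_range_lock);
	zfs_range_lock_rele(conflict);	/* conflict may be freed now */
	goto retry;			/* look the range up again */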

Index: zfs_rlock.c
===================================================================
RCS file: /cvsroot/src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_rlock.c,v
retrieving revision 1.1.1.2
diff -p -u -r1.1.1.2 zfs_rlock.c
--- zfs_rlock.c 27 Feb 2010 22:31:26 -0000      1.1.1.2
+++ zfs_rlock.c 14 Oct 2012 19:22:51 -0000
@@ -94,6 +94,33 @@
 
 #include <sys/zfs_rlock.h>
 
+static int
+zfs_range_lock_hold(rl_t *rl)
+{
+
+       KASSERT(mutex_owned(&rl->r_zp->z_range_lock));
+
+       if (rl->r_refcnt >= ULONG_MAX)
+               return (ENFILE); /* XXX What to do?  */
+
+       rl->r_refcnt++;
+       return (0);
+}
+
+static void
+zfs_range_lock_rele(rl_t *rl)
+{
+
+       KASSERT(mutex_owned(&rl->r_zp->z_range_lock));
+       KASSERT(rl->r_refcnt > 0);
+
+       if (--rl->r_refcnt == 0) {
+               cv_destroy(&rl->r_wr_cv);
+               cv_destroy(&rl->r_rd_cv);
+               kmem_free(rl, sizeof (rl_t));
+       }
+}
+
 /*
  * Check if a write lock can be grabbed, or wait and recheck until available.
  */
@@ -169,10 +196,12 @@ zfs_range_lock_writer(znode_t *zp, rl_t 
                return;
 wait:
                if (!rl->r_write_wanted) {
-                       cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL);
                        rl->r_write_wanted = B_TRUE;
                }
+               if (zfs_range_lock_hold(rl) != 0)
+                       panic("too many waiters on zfs range lock %p", rl);
                cv_wait(&rl->r_wr_cv, &zp->z_range_lock);
+               zfs_range_lock_rele(rl);
 
                /* reset to original */
                new->r_off = off;
@@ -205,8 +234,11 @@ zfs_range_proxify(avl_tree_t *tree, rl_t
        proxy->r_cnt = 1;
        proxy->r_type = RL_READER;
        proxy->r_proxy = B_TRUE;
+       cv_init(&proxy->r_wr_cv, NULL, CV_DEFAULT, NULL);
+       cv_init(&proxy->r_rd_cv, NULL, CV_DEFAULT, NULL);
        proxy->r_write_wanted = B_FALSE;
        proxy->r_read_wanted = B_FALSE;
+       proxy->r_refcnt = 1;
        avl_add(tree, proxy);
 
        return (proxy);
@@ -234,6 +266,9 @@ zfs_range_split(avl_tree_t *tree, rl_t *
        rear->r_cnt = rl->r_cnt;
        rear->r_type = RL_READER;
        rear->r_proxy = B_TRUE;
+       cv_init(&rear->r_wr_cv, NULL, CV_DEFAULT, NULL);
+       cv_init(&rear->r_rd_cv, NULL, CV_DEFAULT, NULL);
+       rear->r_refcnt = 1;
        rear->r_write_wanted = B_FALSE;
        rear->r_read_wanted = B_FALSE;
 
@@ -259,8 +294,11 @@ zfs_range_new_proxy(avl_tree_t *tree, ui
        rl->r_cnt = 1;
        rl->r_type = RL_READER;
        rl->r_proxy = B_TRUE;
+       cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL);
+       cv_init(&rl->r_rd_cv, NULL, CV_DEFAULT, NULL);
        rl->r_write_wanted = B_FALSE;
        rl->r_read_wanted = B_FALSE;
+       rl->r_refcnt = 1;
        avl_add(tree, rl);
 }
 
@@ -372,10 +410,13 @@ retry:
        if (prev && (off < prev->r_off + prev->r_len)) {
                if ((prev->r_type == RL_WRITER) || (prev->r_write_wanted)) {
                        if (!prev->r_read_wanted) {
-                               cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL);
                                prev->r_read_wanted = B_TRUE;
                        }
+                       if (zfs_range_lock_hold(prev) != 0)
+                               panic("too many waiters on zfs range lock %p",
+                                   prev);
                        cv_wait(&prev->r_rd_cv, &zp->z_range_lock);
+                       zfs_range_lock_rele(prev);
                        goto retry;
                }
                if (off + len < prev->r_off + prev->r_len)
@@ -395,10 +436,13 @@ retry:
                        goto got_lock;
                if ((next->r_type == RL_WRITER) || (next->r_write_wanted)) {
                        if (!next->r_read_wanted) {
-                               cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL);
                                next->r_read_wanted = B_TRUE;
                        }
+                       if (zfs_range_lock_hold(next) != 0)
+                               panic("too many waiters on zfs range lock %p",
+                                   next);
                        cv_wait(&next->r_rd_cv, &zp->z_range_lock);
+                       zfs_range_lock_rele(next);
                        goto retry;
                }
                if (off + len <= next->r_off + next->r_len)
@@ -435,20 +479,25 @@ zfs_range_lock(znode_t *zp, uint64_t off
        new->r_cnt = 1; /* assume it's going to be in the tree */
        new->r_type = type;
        new->r_proxy = B_FALSE;
+       cv_init(&new->r_wr_cv, NULL, CV_DEFAULT, NULL);
+       cv_init(&new->r_rd_cv, NULL, CV_DEFAULT, NULL);
        new->r_write_wanted = B_FALSE;
        new->r_read_wanted = B_FALSE;
+       new->r_refcnt = 1;
 
        mutex_enter(&zp->z_range_lock);
        if (type == RL_READER) {
                /*
                 * First check for the usual case of no locks
                 */
-               if (avl_numnodes(&zp->z_range_avl) == 0)
+               if (avl_numnodes(&zp->z_range_avl) == 0) {
                        avl_add(&zp->z_range_avl, new);
-               else
+               } else {
                        zfs_range_lock_reader(zp, new);
-       } else
+               }
+       } else {
                zfs_range_lock_writer(zp, new); /* RL_WRITER or RL_APPEND */
+       }
        mutex_exit(&zp->z_range_lock);
        return (new);
 }
@@ -474,11 +523,9 @@ zfs_range_unlock_reader(znode_t *zp, rl_
                avl_remove(tree, remove);
                if (remove->r_write_wanted) {
                        cv_broadcast(&remove->r_wr_cv);
-                       cv_destroy(&remove->r_wr_cv);
                }
                if (remove->r_read_wanted) {
                        cv_broadcast(&remove->r_rd_cv);
-                       cv_destroy(&remove->r_rd_cv);
                }
        } else {
                ASSERT3U(remove->r_cnt, ==, 0);
@@ -507,17 +554,15 @@ zfs_range_unlock_reader(znode_t *zp, rl_
                                avl_remove(tree, rl);
                                if (rl->r_write_wanted) {
                                        cv_broadcast(&rl->r_wr_cv);
-                                       cv_destroy(&rl->r_wr_cv);
                                }
                                if (rl->r_read_wanted) {
                                        cv_broadcast(&rl->r_rd_cv);
-                                       cv_destroy(&rl->r_rd_cv);
                                }
-                               kmem_free(rl, sizeof (rl_t));
+                               zfs_range_lock_rele(rl);
                        }
                }
        }
-       kmem_free(remove, sizeof (rl_t));
+       zfs_range_lock_rele(remove);
 }
 
 /*
@@ -536,16 +581,14 @@ zfs_range_unlock(rl_t *rl)
        if (rl->r_type == RL_WRITER) {
                /* writer locks can't be shared or split */
                avl_remove(&zp->z_range_avl, rl);
-               mutex_exit(&zp->z_range_lock);
                if (rl->r_write_wanted) {
                        cv_broadcast(&rl->r_wr_cv);
-                       cv_destroy(&rl->r_wr_cv);
                }
                if (rl->r_read_wanted) {
                        cv_broadcast(&rl->r_rd_cv);
-                       cv_destroy(&rl->r_rd_cv);
                }
-               kmem_free(rl, sizeof (rl_t));
+               zfs_range_lock_rele(rl);
+               mutex_exit(&zp->z_range_lock);
        } else {
                /*
                 * lock may be shared, let zfs_range_unlock_reader()
@@ -577,11 +620,11 @@ zfs_range_reduce(rl_t *rl, uint64_t off,
        mutex_enter(&zp->z_range_lock);
        rl->r_off = off;
        rl->r_len = len;
-       mutex_exit(&zp->z_range_lock);
        if (rl->r_write_wanted)
                cv_broadcast(&rl->r_wr_cv);
        if (rl->r_read_wanted)
                cv_broadcast(&rl->r_rd_cv);
+       mutex_exit(&zp->z_range_lock);
 }
 
 /*
Index: sys/zfs_rlock.h
===================================================================
RCS file: /cvsroot/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_rlock.h,v
retrieving revision 1.1.1.1
diff -p -u -r1.1.1.1 zfs_rlock.h
--- sys/zfs_rlock.h     7 Aug 2009 18:33:43 -0000       1.1.1.1
+++ sys/zfs_rlock.h     14 Oct 2012 19:22:51 -0000
@@ -54,6 +54,7 @@ typedef struct rl {
        uint8_t r_proxy;        /* acting for original range */
        uint8_t r_write_wanted; /* writer wants to lock this range */
        uint8_t r_read_wanted;  /* reader wants to lock this range */
+       unsigned long r_refcnt; /* reference count for cv waits */
 } rl_t;
 
 /*

