Re: kern/41417 (WAPBL: hang on tstile)

To: netbsd-bugs%netbsd.org@localhost
Subject: Re: kern/41417 (WAPBL: hang on tstile)
From: buhrow%lothlorien.nfbcal.org@localhost (Brian Buhrow)
Date: Mon, 7 May 2012 12:51:13 -0700
        Hello.  More testing has revealed a minor misunderstanding between the
vnode API in -current and 5.x.  The below patch, against NetBSD-5.1
sources, rolls all the accumulated patches into one patch set.  With this
patch, I believe you can now run with WAPBL, softdep or traditional ufs
semantics with heavy file loads and avoid panics due to resource exhaustion
and/or tstile deadlocks.  Testing has been done on I386, both uniprocessor
and multiprocessor, and on Sparc machines in uniprocessor mode, though I
think multiprocessor Sparc would be fine as well.  Since these changes are
machine independent, I don't anticipate any issues on any platform.  It is
my hope that, modulo any issues that come up in the final round of
testing I'm currently performing, these patches will be ready to be pulled
up into the NetBSD-5 branch.
        Finally, I'd like to thank mouse@ and hannken@ for their help and
patience in helping me track down and test the final versions of these
patches.  With their assistance, I'm confident these patches make NetBSD-5
a much more stable and robust operating environment in a variety of
settings.


-Brian



Index: ufs/lfs/lfs_vnops.c
===================================================================
RCS file: /cvsroot/src/sys/ufs/lfs/lfs_vnops.c,v
retrieving revision 1.218
diff -u -r1.218 lfs_vnops.c
--- ufs/lfs/lfs_vnops.c 24 Jun 2008 10:47:32 -0000      1.218
+++ ufs/lfs/lfs_vnops.c 7 May 2012 18:46:54 -0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: lfs_vnops.c,v 1.218 2008/06/24 10:47:32 gmcgarry Exp $ */
+/*     $NetBSD: lfs_vnops.c,v 1.237 2011/07/12 16:59:48 dholland Exp $ */
 
 /*-
  * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
@@ -602,13 +602,18 @@
        int error;
        struct mount    *mp;
        ino_t           ino;
+       struct ufs_lookup_results *ulr;
+
+       /* XXX should handle this material another way */
+       ulr = &VTOI(ap->a_dvp)->i_crap;
+       UFS_CHECK_CRAPCOUNTER(VTOI(ap->a_dvp));
 
        if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) {
                vput(ap->a_dvp);
                return error;
        }
        error = ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode),
-                             ap->a_dvp, vpp, ap->a_cnp);
+                             ap->a_dvp, ulr, vpp, ap->a_cnp);
 
        /* Either way we're done with the dirop at this point */
        SET_ENDOP_CREATE_AP(ap, "mknod");
Index: ufs/ufs/inode.h
===================================================================
RCS file: /cvsroot/src/sys/ufs/ufs/inode.h,v
retrieving revision 1.54.4.1
diff -u -r1.54.4.1 inode.h
--- ufs/ufs/inode.h     29 Nov 2008 23:10:18 -0000      1.54.4.1
+++ ufs/ufs/inode.h     7 May 2012 18:46:54 -0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: inode.h,v 1.54.4.1 2008/11/29 23:10:18 snj Exp $       */
+/*     $NetBSD: inode.h,v 1.58 2011/07/12 02:22:13 dholland Exp $      */
 
 /*
  * Copyright (c) 1982, 1989, 1993
@@ -47,6 +47,22 @@
 #include <miscfs/genfs/genfs_node.h>
 
 /*
+ * Lookup result state (other than the result inode). This is
+ * currently stashed in the vnode between VOP_LOOKUP and directory
+ * operation VOPs, which is gross.
+ */
+struct ufs_lookup_results {
+       int32_t   ulr_count;    /* Size of free slot in directory. */
+       doff_t    ulr_endoff;   /* End of useful stuff in directory. */
+       doff_t    ulr_diroff;   /* Offset in dir, where we found last entry. */
+       doff_t    ulr_offset;   /* Offset of free space in directory. */
+       u_int32_t ulr_reclen;   /* Size of found directory entry. */
+};
+
+/* notyet XXX */
+#define UFS_CHECK_CRAPCOUNTER(dp)
+
+/*
  * Per-filesystem inode extensions.
  */
 struct ffs_inode_ext {
@@ -98,14 +114,18 @@
        struct   lockf *i_lockf;/* Head of byte-level lock list. */
 
        /*
-        * Side effects; used during directory lookup.
+        * Side effects; used during (and after) directory lookup.
+        * XXX should not be here.
         */
-       int32_t   i_count;      /* Size of free slot in directory. */
-       doff_t    i_endoff;     /* End of useful stuff in directory. */
-       doff_t    i_diroff;     /* Offset in dir, where we found last entry. */
-       doff_t    i_offset;     /* Offset of free space in directory. */
-       u_int32_t i_reclen;     /* Size of found directory entry. */
+       struct ufs_lookup_results i_crap;
        int       i_ffs_effnlink;  /* i_nlink when I/O completes */
+
+#define i_count i_crap.ulr_count
+#define i_endoff i_crap.ulr_endoff
+#define i_diroff i_crap.ulr_diroff
+#define i_offset i_crap.ulr_offset
+#define i_reclen i_crap.ulr_reclen
+
        /*
         * Inode extensions
         */
Index: ufs/ufs/ufs_extern.h
===================================================================
RCS file: /cvsroot/src/sys/ufs/ufs/ufs_extern.h,v
retrieving revision 1.60
diff -u -r1.60 ufs_extern.h
--- ufs/ufs/ufs_extern.h        31 May 2008 21:37:08 -0000      1.60
+++ ufs/ufs/ufs_extern.h        7 May 2012 18:46:54 -0000
@@ -124,12 +124,14 @@
 int    ufs_dirbadentry(struct vnode *, struct direct *, int);
 void   ufs_makedirentry(struct inode *, struct componentname *,
                         struct direct *);
-int    ufs_direnter(struct vnode *, struct vnode *, struct direct *,
+int    ufs_direnter(struct vnode *, const struct ufs_lookup_results *, struct vnode *, struct direct *,
                     struct componentname *, struct buf *);
-int    ufs_dirremove(struct vnode *, struct inode *, int, int);
-int    ufs_dirrewrite(struct inode *, struct inode *, ino_t, int, int, int);
+int    ufs_dirremove(struct vnode *, const struct ufs_lookup_results *, struct inode *, int, int);
+int    ufs_dirrewrite(struct inode *, off_t, struct inode *, ino_t, int, int, int);
 int    ufs_dirempty(struct inode *, ino_t, kauth_cred_t);
 int    ufs_checkpath(struct inode *, struct inode *, kauth_cred_t);
+int    ufs_parentcheck(struct vnode *, struct vnode *, kauth_cred_t,
+                       int *, struct vnode **);
 int    ufs_blkatoff(struct vnode *, off_t, char **, struct buf **, bool);
 
 /* ufs_quota.c */
@@ -161,7 +163,7 @@
 /* ufs_vnops.c */
 void   ufs_vinit(struct mount *, int (**)(void *),
                  int (**)(void *), struct vnode **);
-int    ufs_makeinode(int, struct vnode *, struct vnode **,
+int    ufs_makeinode(int, struct vnode *, const struct ufs_lookup_results *, struct vnode **,
                      struct componentname *);
 int    ufs_gop_alloc(struct vnode *, off_t, off_t, int, kauth_cred_t);
 void   ufs_gop_markupdate(struct vnode *, int);
Index: ufs/ufs/ufs_lookup.c
===================================================================
RCS file: /cvsroot/src/sys/ufs/ufs/ufs_lookup.c,v
retrieving revision 1.99.4.1
diff -u -r1.99.4.1 ufs_lookup.c
--- ufs/ufs/ufs_lookup.c        14 Feb 2010 13:55:29 -0000      1.99.4.1
+++ ufs/ufs/ufs_lookup.c        7 May 2012 18:46:54 -0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: ufs_lookup.c,v 1.99.4.1 2010/02/14 13:55:29 bouyer Exp $       */
+/*     $NetBSD: ufs_lookup.c,v 1.111 2011/07/17 22:07:59 dholland Exp $        */
 
 /*
  * Copyright (c) 1989, 1993
@@ -146,6 +146,7 @@
        const int needswap = UFS_MPNEEDSWAP(ump);
        int dirblksiz = ump->um_dirblksiz;
        ino_t foundino;
+       struct ufs_lookup_results *results;
 
        flags = cnp->cn_flags;
 
@@ -153,6 +154,13 @@
        slotoffset = -1;
        *vpp = NULL;
        endsearch = 0; /* silence compiler warning */
+
+       /*
+        * Produce the auxiliary lookup results into i_crap.
+        * This should not be done this way. XXX.
+        */
+       results = &dp->i_crap;
+
        /*
         * Check accessiblity of directory.
         */
@@ -227,13 +235,13 @@
                numdirpasses = 1;
                entryoffsetinblock = 0; /* silence compiler warning */
                switch (ufsdirhash_lookup(dp, cnp->cn_nameptr, cnp->cn_namelen,
-                   &dp->i_offset, &bp, nameiop == DELETE ? &prevoff : NULL)) {
+                   &results->ulr_offset, &bp, nameiop == DELETE ? &prevoff : NULL)) {
                case 0:
                        ep = (struct direct *)((char *)bp->b_data +
-                           (dp->i_offset & bmask));
+                           (results->ulr_offset & bmask));
                        goto foundentry;
                case ENOENT:
-                       dp->i_offset = roundup(dp->i_size, dirblksiz);
+                       results->ulr_offset = roundup(dp->i_size, dirblksiz);
                        goto notfound;
                default:
                        /* Something failed; just do a linear search. */
@@ -242,35 +250,35 @@
        }
 #endif /* UFS_DIRHASH */
 
-       if (nameiop != LOOKUP || dp->i_diroff == 0 ||
-           dp->i_diroff >= dp->i_size) {
+       if (nameiop != LOOKUP || results->ulr_diroff == 0 ||
+           results->ulr_diroff >= dp->i_size) {
                entryoffsetinblock = 0;
-               dp->i_offset = 0;
+               results->ulr_offset = 0;
                numdirpasses = 1;
        } else {
-               dp->i_offset = dp->i_diroff;
-               if ((entryoffsetinblock = dp->i_offset & bmask) &&
-                   (error = ufs_blkatoff(vdp, (off_t)dp->i_offset,
+               results->ulr_offset = results->ulr_diroff;
+               if ((entryoffsetinblock = results->ulr_offset & bmask) &&
+                   (error = ufs_blkatoff(vdp, (off_t)results->ulr_offset,
                    NULL, &bp, false)))
                        goto out;
                numdirpasses = 2;
                nchstats.ncs_2passes++;
        }
-       prevoff = dp->i_offset;
+       prevoff = results->ulr_offset;
        endsearch = roundup(dp->i_size, dirblksiz);
        enduseful = 0;
 
 searchloop:
-       while (dp->i_offset < endsearch) {
+       while (results->ulr_offset < endsearch) {
                if (curcpu()->ci_schedstate.spc_flags & SPCF_SHOULDYIELD)
                        preempt();
                /*
                 * If necessary, get the next directory block.
                 */
-               if ((dp->i_offset & bmask) == 0) {
+               if ((results->ulr_offset & bmask) == 0) {
                        if (bp != NULL)
                                brelse(bp, 0);
-                       error = ufs_blkatoff(vdp, (off_t)dp->i_offset, NULL,
+                       error = ufs_blkatoff(vdp, (off_t)results->ulr_offset, NULL,
                            &bp, false);
                        if (error)
                                goto out;
@@ -298,9 +306,9 @@
                    (dirchk && ufs_dirbadentry(vdp, ep, entryoffsetinblock))) {
                        int i;
 
-                       ufs_dirbad(dp, dp->i_offset, "mangled entry");
+                       ufs_dirbad(dp, results->ulr_offset, "mangled entry");
                        i = dirblksiz - (entryoffsetinblock & (dirblksiz - 1));
-                       dp->i_offset += i;
+                       results->ulr_offset += i;
                        entryoffsetinblock += i;
                        continue;
                }
@@ -319,16 +327,16 @@
                        if (size > 0) {
                                if (size >= slotneeded) {
                                        slotstatus = FOUND;
-                                       slotoffset = dp->i_offset;
+                                       slotoffset = results->ulr_offset;
                                        slotsize = ufs_rw16(ep->d_reclen,
                                            needswap);
                                } else if (slotstatus == NONE) {
                                        slotfreespace += size;
                                        if (slotoffset == -1)
-                                               slotoffset = dp->i_offset;
+                                               slotoffset = results->ulr_offset;
                                        if (slotfreespace >= slotneeded) {
                                                slotstatus = COMPACT;
-                                               slotsize = dp->i_offset +
+                                               slotsize = results->ulr_offset +
                                                    ufs_rw16(ep->d_reclen,
                                                             needswap) -
                                                    slotoffset;
@@ -365,12 +373,12 @@
                                 */
                                if (!FSFMT(vdp) && ep->d_type == DT_WHT) {
                                        slotstatus = FOUND;
-                                       slotoffset = dp->i_offset;
+                                       slotoffset = results->ulr_offset;
                                        slotsize = ufs_rw16(ep->d_reclen,
                                            needswap);
-                                       dp->i_reclen = slotsize;
+                                       results->ulr_reclen = slotsize;
                                        /*
-                                        * This is used to set dp->i_endoff,
+                                        * This is used to set results->ulr_endoff,
                                         * which may be used by ufs_direnter2()
                                         * as a length to truncate the
                                         * directory to.  Therefore, it must
@@ -391,15 +399,15 @@
                                        goto notfound;
                                }
                                foundino = ufs_rw32(ep->d_ino, needswap);
-                               dp->i_reclen = ufs_rw16(ep->d_reclen, needswap);
+                               results->ulr_reclen = ufs_rw16(ep->d_reclen, needswap);
                                goto found;
                        }
                }
-               prevoff = dp->i_offset;
-               dp->i_offset += ufs_rw16(ep->d_reclen, needswap);
+               prevoff = results->ulr_offset;
+               results->ulr_offset += ufs_rw16(ep->d_reclen, needswap);
                entryoffsetinblock += ufs_rw16(ep->d_reclen, needswap);
                if (ep->d_ino)
-                       enduseful = dp->i_offset;
+                       enduseful = results->ulr_offset;
        }
 notfound:
        /*
@@ -408,8 +416,8 @@
         */
        if (numdirpasses == 2) {
                numdirpasses--;
-               dp->i_offset = 0;
-               endsearch = dp->i_diroff;
+               results->ulr_offset = 0;
+               endsearch = results->ulr_diroff;
                goto searchloop;
        }
        if (bp != NULL)
@@ -434,29 +442,29 @@
                /*
                 * Return an indication of where the new directory
                 * entry should be put.  If we didn't find a slot,
-                * then set dp->i_count to 0 indicating
+                * then set results->ulr_count to 0 indicating
                 * that the new slot belongs at the end of the
                 * directory. If we found a slot, then the new entry
-                * can be put in the range from dp->i_offset to
-                * dp->i_offset + dp->i_count.
+                * can be put in the range from results->ulr_offset to
+                * results->ulr_offset + results->ulr_count.
                 */
                if (slotstatus == NONE) {
-                       dp->i_offset = roundup(dp->i_size, dirblksiz);
-                       dp->i_count = 0;
-                       enduseful = dp->i_offset;
+                       results->ulr_offset = roundup(dp->i_size, dirblksiz);
+                       results->ulr_count = 0;
+                       enduseful = results->ulr_offset;
                } else if (nameiop == DELETE) {
-                       dp->i_offset = slotoffset;
-                       if ((dp->i_offset & (dirblksiz - 1)) == 0)
-                               dp->i_count = 0;
+                       results->ulr_offset = slotoffset;
+                       if ((results->ulr_offset & (dirblksiz - 1)) == 0)
+                               results->ulr_count = 0;
                        else
-                               dp->i_count = dp->i_offset - prevoff;
+                               results->ulr_count = results->ulr_offset - prevoff;
                } else {
-                       dp->i_offset = slotoffset;
-                       dp->i_count = slotsize;
+                       results->ulr_offset = slotoffset;
+                       results->ulr_count = slotsize;
                        if (enduseful < slotoffset + slotsize)
                                enduseful = slotoffset + slotsize;
                }
-               dp->i_endoff = roundup(enduseful, dirblksiz);
+               results->ulr_endoff = roundup(enduseful, dirblksiz);
 #if 0 /* commented out by dbj. none of the on disk fields changed */
                dp->i_flag |= IN_CHANGE | IN_UPDATE;
 #endif
@@ -492,9 +500,9 @@
         * Check that directory length properly reflects presence
         * of this entry.
         */
-       if (dp->i_offset + DIRSIZ(FSFMT(vdp), ep, needswap) > dp->i_size) {
-               ufs_dirbad(dp, dp->i_offset, "i_size too small");
-               dp->i_size = dp->i_offset + DIRSIZ(FSFMT(vdp), ep, needswap);
+       if (results->ulr_offset + DIRSIZ(FSFMT(vdp), ep, needswap) > dp->i_size) {
+               ufs_dirbad(dp, results->ulr_offset, "i_size too small");
+               dp->i_size = results->ulr_offset + DIRSIZ(FSFMT(vdp), ep, needswap);
                DIP_ASSIGN(dp, size, dp->i_size);
                dp->i_flag |= IN_CHANGE | IN_UPDATE;
                UFS_WAPBL_UPDATE(vdp, NULL, NULL, UPDATE_DIROP);
@@ -507,7 +515,7 @@
         * in the cache as to where the entry was found.
         */
        if ((flags & ISLASTCN) && nameiop == LOOKUP)
-               dp->i_diroff = dp->i_offset &~ (dirblksiz - 1);
+               results->ulr_diroff = results->ulr_offset &~ (dirblksiz - 1);
 
        /*
         * If deleting, and at end of pathname, return
@@ -522,15 +530,15 @@
                if (error)
                        goto out;
                /*
-                * Return pointer to current entry in dp->i_offset,
+                * Return pointer to current entry in results->ulr_offset,
                 * and distance past previous entry (if there
-                * is a previous entry in this block) in dp->i_count.
+                * is a previous entry in this block) in results->ulr_count.
                 * Save directory inode pointer in ndp->ni_dvp for dirremove().
                 */
-               if ((dp->i_offset & (dirblksiz - 1)) == 0)
-                       dp->i_count = 0;
+               if ((results->ulr_offset & (dirblksiz - 1)) == 0)
+                       results->ulr_count = 0;
                else
-                       dp->i_count = dp->i_offset - prevoff;
+                       results->ulr_count = results->ulr_offset - prevoff;
                if (dp->i_number == foundino) {
                        VREF(vdp);
                        *vpp = vdp;
@@ -718,8 +726,8 @@
 
 /*
  * Construct a new directory entry after a call to namei, using the
- * parameters that it left in the componentname argument cnp. The
- * argument ip is the inode to which the new directory entry will refer.
+ * name in the componentname argument cnp. The argument ip is the
+ * inode to which the new directory entry will refer.
  */
 void
 ufs_makedirentry(struct inode *ip, struct componentname *cnp,
@@ -741,15 +749,36 @@
 
 /*
  * Write a directory entry after a call to namei, using the parameters
- * that it left in nameidata. The argument dirp is the new directory
- * entry contents. Dvp is a pointer to the directory to be written,
- * which was left locked by namei. Remaining parameters (dp->i_offset,
- * dp->i_count) indicate how the space for the new entry is to be obtained.
- * Non-null bp indicates that a directory is being created (for the
- * soft dependency code).
+ * that ufs_lookup left in nameidata and in the ufs_lookup_results.
+ *
+ * DVP is the directory to be updated. It must be locked.
+ * ULR is the ufs_lookup_results structure from the final lookup step.
+ * TVP is not used. (XXX: why is it here? remove it)
+ * DIRP is the new directory entry contents.
+ * CNP is the componentname from the final lookup step.
+ * NEWDIRBP is not used and (XXX) should be removed. The previous
+ * comment here said it was used by the now-removed softupdates code.
+ *
+ * The link count of the target inode is *not* incremented; the
+ * caller does that.
+ *
+ * If ulr->ulr_count is 0, ufs_lookup did not find space to insert the
+ * directory entry. ulr_offset, which is the place to put the entry,
+ * should be on a block boundary (and should be at the end of the
+ * directory AFAIK) and a fresh block is allocated to put the new
+ * directory entry in.
+ *
+ * If ulr->ulr_count is not zero, ufs_lookup found a slot to insert
+ * the entry into. This slot ranges from ulr_offset to ulr_offset +
+ * ulr_count. However, this slot may already be partially populated
+ * requiring compaction. See notes below.
+ *
+ * Furthermore, if ulr_count is not zero and ulr_endoff is not the
+ * same as i_size, the directory is truncated to size ulr_endoff.
  */
 int
-ufs_direnter(struct vnode *dvp, struct vnode *tvp, struct direct *dirp,
+ufs_direnter(struct vnode *dvp, const struct ufs_lookup_results *ulr,
+    struct vnode *tvp, struct direct *dirp,
     struct componentname *cnp, struct buf *newdirbp)
 {
        kauth_cred_t cr;
@@ -774,26 +803,35 @@
 
        dp = VTOI(dvp);
        newentrysize = DIRSIZ(0, dirp, 0);
+       if (DOINGSOFTDEP(dvp))
+               dp->i_offset = ulr->ulr_offset; /* softdep tags buffers with this value */
 
-       if (dp->i_count == 0) {
+ #if 0
+       struct ufs_lookup_results *ulr;
+       /* XXX should handle this material another way */
+       ulr = &dp->i_crap;
+       UFS_CHECK_CRAPCOUNTER(dp);
+ #endif
+ 
+       if (ulr->ulr_count == 0) {
                /*
-                * If dp->i_count is 0, then namei could find no
-                * space in the directory. Here, dp->i_offset will
+                * If ulr_count is 0, then namei could find no
+                * space in the directory. Here, ulr_offset will
                 * be on a directory block boundary and we will write the
                 * new entry into a fresh block.
                 */
-               if (dp->i_offset & (dirblksiz - 1))
+               if (ulr->ulr_offset & (dirblksiz - 1))
                        panic("ufs_direnter: newblk");
                flags = B_CLRBUF;
                if (!DOINGSOFTDEP(dvp))
                        flags |= B_SYNC;
-               if ((error = UFS_BALLOC(dvp, (off_t)dp->i_offset, dirblksiz,
+               if ((error = UFS_BALLOC(dvp, (off_t)ulr->ulr_offset, dirblksiz,
                    cr, flags, &bp)) != 0) {
                        if (DOINGSOFTDEP(dvp) && newdirbp != NULL)
                                bdwrite(newdirbp);
                        return (error);
                }
-               dp->i_size = dp->i_offset + dirblksiz;
+               dp->i_size = ulr->ulr_offset + dirblksiz;
                DIP_ASSIGN(dp, size, dp->i_size);
                dp->i_flag |= IN_CHANGE | IN_UPDATE;
                uvm_vnp_setsize(dvp, dp->i_size);
@@ -810,14 +848,14 @@
                                dirp->d_type = tmp;
                        }
                }
-               blkoff = dp->i_offset & (ump->um_mountp->mnt_stat.f_iosize - 1);
+               blkoff = ulr->ulr_offset & (ump->um_mountp->mnt_stat.f_iosize - 1);
                memcpy((char *)bp->b_data + blkoff, dirp, newentrysize);
 #ifdef UFS_DIRHASH
                if (dp->i_dirhash != NULL) {
-                       ufsdirhash_newblk(dp, dp->i_offset);
-                       ufsdirhash_add(dp, dirp, dp->i_offset);
+                       ufsdirhash_newblk(dp, ulr->ulr_offset);
+                       ufsdirhash_add(dp, dirp, ulr->ulr_offset);
                        ufsdirhash_checkblock(dp, (char *)bp->b_data + blkoff,
-                           dp->i_offset);
+                           ulr->ulr_offset);
                }
 #endif
                if (DOINGSOFTDEP(dvp)) {
@@ -869,8 +907,8 @@
        }
 
        /*
-        * If dp->i_count is non-zero, then namei found space for the new
-        * entry in the range dp->i_offset to dp->i_offset + dp->i_count
+        * If ulr_count is non-zero, then namei found space for the new
+        * entry in the range ulr_offset to ulr_offset + ulr_count
         * in the directory. To use this space, we may have to compact
         * the entries located there, by copying them together towards the
         * beginning of the block, leaving the free space in one usable
@@ -884,8 +922,12 @@
         *
         * N.B. - THIS IS AN ARTIFACT OF 4.2 AND SHOULD NEVER HAPPEN.
         */
-       if (dp->i_offset + dp->i_count > dp->i_size) {
-               dp->i_size = dp->i_offset + dp->i_count;
+       if (ulr->ulr_offset + ulr->ulr_count > dp->i_size) {
+#ifdef DIAGNOSTIC
+               printf("ufs_direnter: reached 4.2-only block, "
+                      "not supposed to happen\n");
+#endif
+               dp->i_size = ulr->ulr_offset + ulr->ulr_count;
                DIP_ASSIGN(dp, size, dp->i_size);
                dp->i_flag |= IN_CHANGE | IN_UPDATE;
                UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP);
@@ -893,7 +935,7 @@
        /*
         * Get the block containing the space for the new directory entry.
         */
-       error = ufs_blkatoff(dvp, (off_t)dp->i_offset, &dirbuf, &bp, true);
+       error = ufs_blkatoff(dvp, (off_t)ulr->ulr_offset, &dirbuf, &bp, true);
        if (error) {
                if (DOINGSOFTDEP(dvp) && newdirbp != NULL)
                        bdwrite(newdirbp);
@@ -908,7 +950,7 @@
        ep = (struct direct *)dirbuf;
        dsize = (ep->d_ino != 0) ?  DIRSIZ(FSFMT(dvp), ep, needswap) : 0;
        spacefree = ufs_rw16(ep->d_reclen, needswap) - dsize;
-       for (loc = ufs_rw16(ep->d_reclen, needswap); loc < dp->i_count; ) {
+       for (loc = ufs_rw16(ep->d_reclen, needswap); loc < ulr->ulr_count; ) {
                uint16_t reclen;
 
                nep = (struct direct *)(dirbuf + loc);
@@ -938,8 +980,8 @@
 #ifdef UFS_DIRHASH
                if (dp->i_dirhash != NULL)
                        ufsdirhash_move(dp, nep,
-                           dp->i_offset + ((char *)nep - dirbuf),
-                           dp->i_offset + ((char *)ep - dirbuf));
+                           ulr->ulr_offset + ((char *)nep - dirbuf),
+                           ulr->ulr_offset + ((char *)ep - dirbuf));
 #endif
                if (DOINGSOFTDEP(dvp))
                        softdep_change_directoryentry_offset(dp, dirbuf,
@@ -985,14 +1027,14 @@
 #ifdef UFS_DIRHASH
        if (dp->i_dirhash != NULL && (ep->d_ino == 0 ||
            dirp->d_reclen == spacefree))
-               ufsdirhash_add(dp, dirp, dp->i_offset + ((char *)ep - dirbuf));
+               ufsdirhash_add(dp, dirp, ulr->ulr_offset + ((char *)ep - dirbuf));
 #endif
        memcpy((void *)ep, (void *)dirp, (u_int)newentrysize);
 #ifdef UFS_DIRHASH
        if (dp->i_dirhash != NULL)
                ufsdirhash_checkblock(dp, dirbuf -
-                   (dp->i_offset & (dirblksiz - 1)),
-                   dp->i_offset & ~(dirblksiz - 1));
+                   (ulr->ulr_offset & (dirblksiz - 1)),
+                   ulr->ulr_offset & ~(dirblksiz - 1));
 #endif
        if (DOINGSOFTDEP(dvp)) {
                softdep_setup_directory_add(bp, dp,
@@ -1010,14 +1052,14 @@
         * lock other inodes which can lead to deadlock if we also hold a
         * lock on the newly entered node.
         */
-       if (error == 0 && dp->i_endoff && dp->i_endoff < dp->i_size) {
+       if (error == 0 && ulr->ulr_endoff && ulr->ulr_endoff < dp->i_size) {
                if (DOINGSOFTDEP(dvp) && (tvp != NULL))
                        VOP_UNLOCK(tvp, 0);
 #ifdef UFS_DIRHASH
                if (dp->i_dirhash != NULL)
-                       ufsdirhash_dirtrunc(dp, dp->i_endoff);
+                       ufsdirhash_dirtrunc(dp, ulr->ulr_endoff);
 #endif
-               (void) UFS_TRUNCATE(dvp, (off_t)dp->i_endoff, IO_SYNC, cr);
+               (void) UFS_TRUNCATE(dvp, (off_t)ulr->ulr_endoff, IO_SYNC, cr);
                if (DOINGSOFTDEP(dvp) && (tvp != NULL))
                        vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY);
        }
@@ -1026,19 +1068,43 @@
 }
 
 /*
- * Remove a directory entry after a call to namei, using
- * the parameters which it left in nameidata. The entry
- * dp->i_offset contains the offset into the directory of the
- * entry to be eliminated.  The dp->i_count field contains the
- * size of the previous record in the directory.  If this
- * is 0, the first entry is being deleted, so we need only
- * zero the inode number to mark the entry as free.  If the
- * entry is not the first in the directory, we must reclaim
- * the space of the now empty record by adding the record size
- * to the size of the previous entry.
+  * Remove a directory entry after a call to namei, using the
+  * parameters that ufs_lookup left in nameidata and in the
+  * ufs_lookup_results.
+  *
+  * DVP is the directory to be updated. It must be locked.
+  * ULR is the ufs_lookup_results structure from the final lookup step.
+  * IP, if not null, is the inode being unlinked.
+  * FLAGS may contain DOWHITEOUT.
+  * ISRMDIR is not used and (XXX) should be removed.
+  *
+  * If FLAGS contains DOWHITEOUT the entry is replaced with a whiteout
+  * instead of being cleared.
+  *
+  * ulr->ulr_offset contains the position of the directory entry
+  * to be removed.
+  *
+  * ulr->ulr_reclen contains the size of the directory entry to be
+  * removed.
+  *
+  * ulr->ulr_count contains the size of the *previous* directory
+  * entry. This allows finding it, for free space management. If
+  * ulr_count is 0, the target entry is at the beginning of the
+  * directory. (Does this ever happen? The first entry should be ".",
+  * which should only be removed at rmdir time. Does rmdir come here
+  * to clear out the "." and ".." entries? Perhaps, but I doubt it.)
+  *
+  * The space is marked free by adding it to the record length (not
+  * name length) of the preceding entry. If the first entry becomes
+  * free, it is marked free by setting the inode number to 0.
+  *
+  * The link count of IP is decremented. Note that this is not the
+  * inverse behavior of ufs_direnter, which does not adjust link
+  * counts. Sigh.
  */
 int
-ufs_dirremove(struct vnode *dvp, struct inode *ip, int flags, int isrmdir)
+ ufs_dirremove(struct vnode *dvp, const struct ufs_lookup_results *ulr,
+             struct inode *ip, int flags, int isrmdir)
 {
        struct inode *dp = VTOI(dvp);
        struct direct *ep;
@@ -1054,7 +1120,7 @@
                /*
                 * Whiteout entry: set d_ino to WINO.
                 */
-               error = ufs_blkatoff(dvp, (off_t)dp->i_offset, (void *)&ep,
+               error = ufs_blkatoff(dvp, (off_t)ulr->ulr_offset, (void *)&ep,
                                     &bp, true);
                if (error)
                        return (error);
@@ -1064,7 +1130,7 @@
        }
 
        if ((error = ufs_blkatoff(dvp,
-           (off_t)(dp->i_offset - dp->i_count), (void *)&ep, &bp, true)) != 0)
+           (off_t)(ulr->ulr_offset - ulr->ulr_count), (void *)&ep, &bp, true)) != 0)
                return (error);
 
 #ifdef UFS_DIRHASH
@@ -1073,12 +1139,12 @@
         * that `ep' is the previous entry when dp->i_count != 0.
         */
        if (dp->i_dirhash != NULL)
-               ufsdirhash_remove(dp, (dp->i_count == 0) ? ep :
+               ufsdirhash_remove(dp, (ulr->ulr_count == 0) ? ep :
                   (struct direct *)((char *)ep +
-                  ufs_rw16(ep->d_reclen, needswap)), dp->i_offset);
+                  ufs_rw16(ep->d_reclen, needswap)), ulr->ulr_offset);
 #endif
 
-       if (dp->i_count == 0) {
+       if (ulr->ulr_count == 0) {
                /*
                 * First entry in block: set d_ino to zero.
                 */
@@ -1088,7 +1154,7 @@
                 * Collapse new free space into previous entry.
                 */
                ep->d_reclen =
-                   ufs_rw16(ufs_rw16(ep->d_reclen, needswap) + dp->i_reclen,
+                   ufs_rw16(ufs_rw16(ep->d_reclen, needswap) + ulr->ulr_reclen,
                        needswap);
        }
 
@@ -1096,8 +1162,8 @@
        if (dp->i_dirhash != NULL) {
                int dirblksiz = ip->i_ump->um_dirblksiz;
                ufsdirhash_checkblock(dp, (char *)ep -
-                   ((dp->i_offset - dp->i_count) & (dirblksiz - 1)),
-                   dp->i_offset & ~(dirblksiz - 1));
+                   ((ulr->ulr_offset - ulr->ulr_count) & (dirblksiz - 1)),
+                   ulr->ulr_offset & ~(dirblksiz - 1));
        }
 #endif
 
@@ -1106,6 +1172,11 @@
                if (ip) {
                        ip->i_ffs_effnlink--;
                        softdep_change_linkcnt(ip);
+                       dp->i_count = ulr->ulr_count;
+                       dp->i_endoff = ulr->ulr_endoff;
+                       dp->i_diroff = ulr->ulr_diroff;
+                       dp->i_offset = ulr->ulr_offset;
+                       dp->i_reclen = ulr->ulr_reclen;
                        softdep_setup_remove(bp, dp, ip, isrmdir);
                }
                bdwrite(bp);
@@ -1135,12 +1206,24 @@
 }
 
 /*
- * Rewrite an existing directory entry to point at the inode
- * supplied.  The parameters describing the directory entry are
- * set up by a call to namei.
+ * Rewrite an existing directory entry to point at the inode supplied.
+ *
+ * DP is the directory to update.
+ * OFFSET is the position of the entry in question. It may come
+ * from ulr_offset of a ufs_lookup_results.
+ * OIP is the old inode the directory previously pointed to.
+ * NEWINUM is the number of the new inode.
+ * NEWTYPE is the new value for the type field of the directory entry.
+ * (This is ignored if the fs doesn't support that.)
+ * ISRMDIR is not used and (XXX) should be removed.
+ * IFLAGS are added to DP's inode flags.
+ *
+ * The link count of OIP is decremented. Note that the link count of
+ * the new inode is *not* incremented. Yay for symmetry.
  */
 int
-ufs_dirrewrite(struct inode *dp, struct inode *oip, ino_t newinum, int newtype,
+ufs_dirrewrite(struct inode *dp, off_t offset,
+    struct inode *oip, ino_t newinum, int newtype,
     int isrmdir, int iflags)
 {
        struct buf *bp;
@@ -1148,7 +1231,7 @@
        struct vnode *vdp = ITOV(dp);
        int error;
 
-       error = ufs_blkatoff(vdp, (off_t)dp->i_offset, (void *)&ep, &bp, true);
+       error = ufs_blkatoff(vdp, offset, (void *)&ep, &bp, true);
        if (error)
                return (error);
        ep->d_ino = ufs_rw32(newinum, UFS_MPNEEDSWAP(dp->i_ump));
@@ -1157,6 +1240,7 @@
        oip->i_ffs_effnlink--;
        if (DOINGSOFTDEP(vdp)) {
                softdep_change_linkcnt(oip);
+               dp->i_offset = offset; /*softdep gets this from the inode*/
                softdep_setup_directory_change(bp, dp, oip, newinum, isrmdir);
                bdwrite(bp);
        } else {
@@ -1323,6 +1407,129 @@
        return (error);
 }
 
+/*
+ * Extract the inode number of ".." from a directory.
+ * Helper for ufs_parentcheck.
+ */
+static int
+ufs_readdotdot(struct vnode *vp, int needswap, kauth_cred_t cred, ino_t *result)
+{
+       struct dirtemplate dirbuf;
+       int namlen, error;
+
+       error = vn_rdwr(UIO_READ, vp, &dirbuf,
+                   sizeof (struct dirtemplate), (off_t)0, UIO_SYSSPACE,
+                   IO_NODELOCKED, cred, NULL, NULL);
+       if (error) {
+               return error;
+       }
+
+#if (BYTE_ORDER == LITTLE_ENDIAN)
+       if (FSFMT(vp) && needswap == 0)
+               namlen = dirbuf.dotdot_type;
+       else
+               namlen = dirbuf.dotdot_namlen;
+#else
+       if (FSFMT(vp) && needswap != 0)
+               namlen = dirbuf.dotdot_type;
+       else
+               namlen = dirbuf.dotdot_namlen;
+#endif
+       if (namlen != 2 ||
+           dirbuf.dotdot_name[0] != '.' ||
+           dirbuf.dotdot_name[1] != '.') {
+               printf("ufs_readdotdot: directory %llu contains "
+                      "garbage instead of ..\n",
+                      (unsigned long long) VTOI(vp)->i_number);
+               return ENOTDIR;
+       }
+       *result = ufs_rw32(dirbuf.dotdot_ino, needswap);
+       return 0;
+}
+
+/*
+ * Check if LOWER is a descendent of UPPER. If we find UPPER, return
+ * nonzero in FOUND and return a reference to the immediate descendent
+ * of UPPER in UPPERCHILD. If we don't find UPPER (that is, if we
+ * reach the volume root and that isn't UPPER), return zero in FOUND
+ * and null in UPPERCHILD.
+ *
+ * Neither UPPER nor LOWER should be locked.
+ *
+ * On error (such as a permissions error checking up the directory
+ * tree) fail entirely.
+ *
+ * Note that UPPER and LOWER must be on the same volume, and because
+ * we inspect only that volume NEEDSWAP can be constant.
+ */
+int
+ufs_parentcheck(struct vnode *upper, struct vnode *lower, kauth_cred_t cred,
+               int *found_ret, struct vnode **upperchild_ret)
+{
+       const int needswap = UFS_MPNEEDSWAP(VTOI(lower)->i_ump);
+       ino_t upper_ino, found_ino;
+       struct vnode *current, *next;
+       int error;
+
+       if (upper == lower) {
+               vref(upper);
+               *found_ret = 1;
+               *upperchild_ret = upper;
+               return 0;
+       }
+       if (VTOI(lower)->i_number == ROOTINO) {
+               *found_ret = 0;
+               *upperchild_ret = NULL;
+               return 0;
+       }
+
+       upper_ino = VTOI(upper)->i_number;
+
+       current = lower;
+       vref(current);
+       vn_lock(current, LK_EXCLUSIVE | LK_RETRY);
+
+       for (;;) {
+               error = ufs_readdotdot(current, needswap, cred, &found_ino);
+               if (error) {
+                       vput(current);
+                       return error;
+               }
+               if (found_ino == upper_ino) {
+                       VOP_UNLOCK(current, 0);
+                       *found_ret = 1;
+                       *upperchild_ret = current;
+                       return 0;
+               }
+               if (found_ino == ROOTINO) {
+                       vput(current);
+                       *found_ret = 0;
+                       *upperchild_ret = NULL;
+                       return 0;
+               }
+               VOP_UNLOCK(current, 0);
+               error = VFS_VGET(current->v_mount, found_ino, &next);
+               if (error) {
+                       vrele(current);
+                       return error;
+               }
+               KASSERT(VOP_ISLOCKED(next));
+               if (next->v_type != VDIR) {
+                       printf("ufs_parentcheck: inode %llu reached via .. of "
+                              "inode %llu is not a directory\n",
+                           (unsigned long long)VTOI(next)->i_number,
+                           (unsigned long long)VTOI(current)->i_number);
+                       vput(next);
+                       vrele(current);
+                       return ENOTDIR;
+               }
+               vrele(current);
+               current = next;
+       }
+
+       return 0;
+}
+
 #define        UFS_DIRRABLKS 0
 int ufs_dirrablks = UFS_DIRRABLKS;
 
Index: ufs/ufs/ufs_vnops.c
===================================================================
RCS file: /cvsroot/src/sys/ufs/ufs/ufs_vnops.c,v
retrieving revision 1.169.4.1
diff -u -r1.169.4.1 ufs_vnops.c
--- ufs/ufs/ufs_vnops.c 28 Mar 2010 17:31:55 -0000      1.169.4.1
+++ ufs/ufs/ufs_vnops.c 7 May 2012 18:46:54 -0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: ufs_vnops.c,v 1.169.4.1 2010/03/28 17:31:55 snj Exp $  */
+/*     $NetBSD: ufs_vnops.c,v 1.200 2011/07/18 06:45:47 dholland Exp $ */
 
 /*-
  * Copyright (c) 2008 The NetBSD Foundation, Inc.
@@ -135,22 +135,28 @@
                struct vattr            *a_vap;
        } */ *ap = v;
        int     error;
+       struct vnode *dvp = ap->a_dvp;
+       struct ufs_lookup_results *ulr;
+
+       /* XXX should handle this material another way */
+       ulr = &VTOI(dvp)->i_crap;
+       UFS_CHECK_CRAPCOUNTER(VTOI(dvp));
 
        /*
         * UFS_WAPBL_BEGIN1(dvp->v_mount, dvp) performed by successful
         * ufs_makeinode
         */
-       fstrans_start(ap->a_dvp->v_mount, FSTRANS_SHARED);
+       fstrans_start(dvp->v_mount, FSTRANS_SHARED);
        error =
            ufs_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode),
-                         ap->a_dvp, ap->a_vpp, ap->a_cnp);
+                         dvp, ulr, ap->a_vpp, ap->a_cnp);
        if (error) {
-               fstrans_done(ap->a_dvp->v_mount);
+               fstrans_done(dvp->v_mount);
                return (error);
        }
-       UFS_WAPBL_END1(ap->a_dvp->v_mount, ap->a_dvp);
-       fstrans_done(ap->a_dvp->v_mount);
-       VN_KNOTE(ap->a_dvp, NOTE_WRITE);
+       UFS_WAPBL_END1(dvp->v_mount, dvp);
+       fstrans_done(dvp->v_mount);
+       VN_KNOTE(dvp, NOTE_WRITE);
        return (0);
 }
 
@@ -173,10 +179,15 @@
        int             error;
        struct mount    *mp;
        ino_t           ino;
+       struct ufs_lookup_results *ulr;
 
        vap = ap->a_vap;
        vpp = ap->a_vpp;
 
+       /* XXX should handle this material another way */
+       ulr = &VTOI(ap->a_dvp)->i_crap;
+       UFS_CHECK_CRAPCOUNTER(VTOI(ap->a_dvp));
+
        /*
         * UFS_WAPBL_BEGIN1(dvp->v_mount, dvp) performed by successful
         * ufs_makeinode
@@ -184,7 +195,7 @@
        fstrans_start(ap->a_dvp->v_mount, FSTRANS_SHARED);
        if ((error =
            ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode),
-           ap->a_dvp, vpp, ap->a_cnp)) != 0)
+           ap->a_dvp, ulr, vpp, ap->a_cnp)) != 0)
                goto out;
        VN_KNOTE(ap->a_dvp, NOTE_WRITE);
        ip = VTOI(*vpp);
@@ -745,10 +756,16 @@
        struct inode    *ip;
        int             error;
        bool            pace;
+       struct ufs_lookup_results *ulr;
 
        vp = ap->a_vp;
        dvp = ap->a_dvp;
        ip = VTOI(vp);
+ 
+       /* XXX should handle this material another way */
+       ulr = &VTOI(dvp)->i_crap;
+       UFS_CHECK_CRAPCOUNTER(VTOI(dvp));
+ 
        fstrans_start(dvp->v_mount, FSTRANS_SHARED);
        if (vp->v_type == VDIR || (ip->i_flags & (IMMUTABLE | APPEND)) ||
            (VTOI(dvp)->i_flags & APPEND))
@@ -756,7 +773,8 @@
        else {
                error = UFS_WAPBL_BEGIN(dvp->v_mount);
                if (error == 0) {
-                       error = ufs_dirremove(dvp, ip, ap->a_cnp->cn_flags, 0);
+                       error = ufs_dirremove(dvp, ulr,
+                                             ip, ap->a_cnp->cn_flags, 0);
                        UFS_WAPBL_END(dvp->v_mount);
                }
        }
@@ -795,6 +813,7 @@
        struct inode            *ip;
        struct direct           *newdir;
        int                     error;
+       struct ufs_lookup_results *ulr;
 
        dvp = ap->a_dvp;
        vp = ap->a_vp;
@@ -803,6 +822,10 @@
        if ((cnp->cn_flags & HASBUF) == 0)
                panic("ufs_link: no name");
 #endif
+       /* XXX should handle this material another way */
+       ulr = &VTOI(dvp)->i_crap;
+       UFS_CHECK_CRAPCOUNTER(VTOI(dvp));
+ 
        fstrans_start(dvp->v_mount, FSTRANS_SHARED);
        if (vp->v_type == VDIR) {
                VOP_ABORTOP(dvp, cnp);
@@ -844,7 +867,7 @@
        if (!error) {
                newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK);
                ufs_makedirentry(ip, cnp, newdir);
-               error = ufs_direnter(dvp, vp, newdir, cnp, NULL);
+               error = ufs_direnter(dvp, ulr, vp, newdir, cnp, NULL);
                pool_cache_put(ufs_direct_cache, newdir);
        }
        if (error) {
@@ -885,6 +908,11 @@
        struct direct           *newdir;
        int                     error;
        struct ufsmount         *ump = VFSTOUFS(dvp->v_mount);
+       struct ufs_lookup_results *ulr;
+
+       /* XXX should handle this material another way */
+       ulr = &VTOI(dvp)->i_crap;
+       UFS_CHECK_CRAPCOUNTER(VTOI(dvp));
 
        error = 0;
        switch (ap->a_flags) {
@@ -911,7 +939,7 @@
                    (size_t)cnp->cn_namelen);
                newdir->d_name[cnp->cn_namelen] = '\0';
                newdir->d_type = DT_WHT;
-               error = ufs_direnter(dvp, NULL, newdir, cnp, NULL);
+               error = ufs_direnter(dvp, ulr, NULL, newdir, cnp, NULL);
                pool_cache_put(ufs_direct_cache, newdir);
                break;
 
@@ -924,7 +952,7 @@
 #endif
 
                cnp->cn_flags &= ~DOWHITEOUT;
-               error = ufs_dirremove(dvp, NULL, cnp->cn_flags, 0);
+               error = ufs_dirremove(dvp, ulr, NULL, cnp->cn_flags, 0);
                break;
        default:
                panic("ufs_whiteout: unknown op");
@@ -963,6 +991,338 @@
  *    is different from the source, patch the ".." entry in the
  *    directory.
  */
+
+/*
+ * Notes on rename locking:
+ *
+ * We lock parent vnodes before child vnodes. This means in particular
+ * that if A is above B in the directory tree then A must be locked
+ * before B. (This is true regardless of how many steps appear in
+ * between, because an arbitrary number of other processes could lock
+ * parent/child in between and establish a lock cycle and deadlock.)
+ *
+ * Therefore, if tdvp is above fdvp we must lock tdvp first; if fdvp
+ * is above tdvp we must lock fdvp first; and if they're
+ * incommensurate it doesn't matter. (But, we rely on the fact that
+ * there's a whole-volume rename lock to prevent deadlock among groups
+ * of renames upon overlapping sets of incommensurate vnodes.)
+ *
+ * In addition to establishing lock ordering the parent check also
+ * serves to rule out cases where someone tries to move a directory
+ * underneath itself, e.g. rename("a/b", "a/b/c"). If allowed to
+ * proceed such renames would detach portions of the directory tree
+ * and make fsck very unhappy.
+ *
+ * Note that it is an error for *fvp* to be above tdvp; however,
+ * *fdvp* can be above tdvp, as in rename("a/b", "a/c/d").
+ *
+ * The parent check searches up the tree from tdvp until it either
+ * finds fdvp or the root of the volume. It also returns the vnode it
+ * saw immediately before fdvp, if any. Later on (after looking up
+ * fvp) we will check to see if this *is* fvp and if so fail.
+ *
+ * If the parent check finds fdvp, it means fdvp is above tdvp, so we
+ * lock fdvp first and then tdvp. Otherwise, either tdvp is above fdvp
+ * or they're incommensurate and we lock tdvp first.
+ *
+ * In either case each of the child vnodes has to be looked up and
+ * locked immediately after its parent. The cases
+ *
+ *       fdvp/fvp/[.../]tdvp/tvp
+ *       tdvp/tvp/[.../]fdvp/fvp
+ *
+ * can cause deadlock otherwise. Note that both of these are error
+ * cases; the first fails the parent check and the second fails
+ * because tvp isn't empty. The parent check case is handled before
+ * we start locking; however, the nonempty case requires locking tvp
+ * to find out safely that it's nonempty.
+ *
+ * Therefore the procedure is either
+ *
+ *   lock fdvp
+ *   lookup fvp
+ *   lock fvp
+ *   lock tdvp
+ *   lookup tvp
+ *   lock tvp
+ *
+ * or
+ *
+ *   lock tdvp
+ *   lookup tvp
+ *   lock tvp
+ *   lock fdvp
+ *   lookup fvp
+ *   lock fvp
+ *
+ * This could in principle be simplified by always looking up fvp
+ * last; because of the parent check we know by the time we start
+ * locking that fvp cannot be directly above tdvp, so (given the
+ * whole-volume rename lock and other assumptions) it's safe to lock
+ * tdvp before fvp. This would allow the following scheme:
+ *
+ *   lock fdvp
+ *   lock tdvp
+ * or
+ *   lock tdvp
+ *   lock fdvp
+ *
+ * then
+ *   lookup tvp
+ *   lock tvp
+ *   lookup fvp
+ *   check if fvp is above tdvp, fail if so
+ *   lock fvp
+ *
+ * which is much, much simpler.
+ *
+ * However, current levels of vfs namei/lookup sanity do not permit
+ * this. It is impossible currently to look up fvp without locking it.
+ * (It gets locked regardless of whether LOCKLEAF is set; without
+ * LOCKLEAF it just gets unlocked again, which doesn't help.)
+ *
+ * Therefore, because we must look up fvp to know if it's above tdvp,
+ * which locks fvp, we must, at least in the case where fdvp is above
+ * tdvp, do that before locking tdvp. The longer scheme does that; the
+ * simpler scheme is not safe.
+ *
+ * Note that for now we aren't doing lookup() but relookup(); however,
+ * the differences are minor.
+ *
+ * On top of all the above, just to make everything more
+ * exciting, any two of the vnodes might end up being the same.
+ *
+ * FROMPARENT == FROMCHILD     mv a/. foo      is an error.
+ * FROMPARENT == TOPARENT      mv a/b a/c      is ok.
+ * FROMPARENT == TOCHILD       mv a/b/c a/b    will give ENOTEMPTY.
+ * FROMCHILD == TOPARENT       mv a/b a/b/c    fails the parent check.
+ * FROMCHILD == TOCHILD                mv a/b a/b      is ok.
+ * TOPARENT == TOCHILD         mv foo a/.      is an error.
+ *
+ * This introduces more cases in the locking, because each distinct
+ * vnode must be locked exactly once.
+ *
+ * When FROMPARENT == TOPARENT and FROMCHILD != TOCHILD we assume it
+ * doesn't matter what order the children are locked in, because the
+ * per-volume rename lock excludes other renames and no other
+ * operation locks two files in the same directory at once. (Note: if
+ * it turns out that link() does, link() is wrong.)
+ *
+ * Until such time as we can do lookups without the namei and lookup
+ * machinery "helpfully" locking the result vnode for us, we can't
+ * avoid tripping on cases where FROMCHILD == TOCHILD. Currently for
+ * non-directories we unlock the first one we lock while looking up
+ * the second, then relock it if necessary. This is more or less
+ * harmless since not much of interest can happen to the objects in
+ * that window while we have the containing directory locked; but it's
+ * not desirable and should be cleaned up when that becomes possible.
+ * The right way to do it is to check after looking the second one up
+ * and only lock it if it's different. (Note: for directories we don't
+ * do this dance because the same directory can't appear more than
+ * once.)
+ */
+
+/* XXX following lifted from ufs_lookup.c */
+#define        FSFMT(vp)       (((vp)->v_mount->mnt_iflag & IMNT_DTYPE) == 0)
+
+/*
+ * Check if either entry referred to by FROM_ULR is within the range
+ * of entries named by TO_ULR.
+ */
+static int
+ulr_overlap(const struct ufs_lookup_results *from_ulr,
+           const struct ufs_lookup_results *to_ulr)
+{
+       doff_t from_start, from_prevstart;
+       doff_t to_start, to_end;
+
+       /*
+        * FROM is a DELETE result; offset points to the entry to
+        * remove and subtracting count gives the previous entry.
+        */
+       from_start = from_ulr->ulr_offset - from_ulr->ulr_count;
+       from_prevstart = from_ulr->ulr_offset;
+
+       /*
+        * TO is a RENAME (thus non-DELETE) result; offset points
+        * to the beginning of a region to write in, and adding
+        * count gives the end of the region.
+        */
+       to_start = to_ulr->ulr_offset;
+       to_end = to_ulr->ulr_offset + to_ulr->ulr_count;
+
+       if (from_prevstart >= to_start && from_prevstart < to_end) {
+               return 1;
+       }
+       if (from_start >= to_start && from_start < to_end) {
+               return 1;
+       }
+       return 0;
+}
+
+/*
+ * Wrapper for relookup that also updates the supplemental results.
+ */
+static int
+do_relookup(struct vnode *dvp, struct ufs_lookup_results *ulr,
+           struct vnode **vp, struct componentname *cnp)
+{
+       int error, savestart;
+
+       savestart = cnp->cn_flags & SAVESTART;
+       cnp->cn_flags &= ~SAVESTART;
+       error = relookup(dvp, vp, cnp);
+       cnp->cn_flags |= savestart;
+       if (error) {
+               return error;
+       }
+       /* update the supplemental results */
+       *ulr = VTOI(dvp)->i_crap;
+       UFS_CHECK_CRAPCOUNTER(VTOI(dvp));
+       return 0;
+}
+
+/*
+ * Lock and relookup a sequence of two directories and two children.
+ *
+ */
+static int
+lock_vnode_sequence(struct vnode *d1, struct ufs_lookup_results *ulr1,
+                   struct vnode **v1_ret, struct componentname *cn1, 
+                   int v1_missing_ok,
+                   int overlap_error,
+                   struct vnode *d2, struct ufs_lookup_results *ulr2,
+                   struct vnode **v2_ret, struct componentname *cn2, 
+                   int v2_missing_ok)
+{
+       struct vnode *v1, *v2;
+       int error;
+
+       KASSERT(d1 != d2);
+
+       vn_lock(d1, LK_EXCLUSIVE | LK_RETRY);
+       if (VTOI(d1)->i_size == 0) {
+               /* d1 has been rmdir'd */
+               VOP_UNLOCK(d1, 0);
+               return ENOENT;
+       }
+       error = do_relookup(d1, ulr1, &v1, cn1);
+       if (v1_missing_ok) {
+               if (error == ENOENT) {
+                       /*
+                        * Note: currently if the name doesn't exist,
+                        * relookup succeeds (it intercepts the
+                        * EJUSTRETURN from VOP_LOOKUP) and sets v1
+                        * to NULL. Therefore, we will never get
+                        * ENOENT and this branch is not needed.
+                        * However, in a saner future the EJUSTRETURN
+                        * garbage will go away, so let's DTRT.
+                        */
+                       v1 = NULL;
+                       error = 0;
+               }
+       } else {
+               if (error == 0 && v1 == NULL) {
+                       /* This is what relookup sets if v1 disappeared. */
+                       error = ENOENT;
+               }
+       }
+       if (error) {
+               VOP_UNLOCK(d1, 0);
+               return error;
+       }
+       if (v1 && v1 == d2) {
+               VOP_UNLOCK(d1, 0);
+               VOP_UNLOCK(v1, 0);
+               vrele(v1);
+               return overlap_error;
+       }
+
+       /*
+        * The right way to do this is to do lookups without locking
+        * the results, and lock the results afterwards; then at the
+        * end we can avoid trying to lock v2 if v2 == v1.
+        *
+        * However, for the reasons described in the fdvp == tdvp case
+        * in rename below, we can't do that safely. So, in the case
+        * where v1 is not a directory, unlock it and lock it again
+        * afterwards. This is safe in locking order because a
+        * non-directory can't be above anything else in the tree. If
+        * v1 *is* a directory, that's not true, but then because d1
+        * != d2, v1 != v2.
+        */
+       if (v1 && v1->v_type != VDIR) {
+               VOP_UNLOCK(v1, 0);
+       }
+       vn_lock(d2, LK_EXCLUSIVE | LK_RETRY);
+       if (VTOI(d2)->i_size == 0) {
+               /* d2 has been rmdir'd */
+               VOP_UNLOCK(d2, 0);
+               if (v1 && v1->v_type == VDIR) {
+                       VOP_UNLOCK(v1, 0);
+               }
+               VOP_UNLOCK(d1, 0);
+               if (v1) {
+                       vrele(v1);
+               }
+               return ENOENT;
+       }
+       error = do_relookup(d2, ulr2, &v2, cn2);
+       if (v2_missing_ok) {
+               if (error == ENOENT) {
+                       /* as above */
+                       v2 = NULL;
+                       error = 0;
+               }
+       } else {
+               if (error == 0 && v2 == NULL) {
+                       /* This is what relookup sets if v2 disappeared. */
+                       error = ENOENT;
+               }
+       }
+       if (error) {
+               VOP_UNLOCK(d2, 0);
+               if (v1 && v1->v_type == VDIR) {
+                       VOP_UNLOCK(v1, 0);
+               }
+               VOP_UNLOCK(d1, 0);
+               if (v1) {
+                       vrele(v1);
+               }
+               return error;
+       }
+       if (v1 && v1->v_type != VDIR && v1 != v2) {
+               vn_lock(v1, LK_EXCLUSIVE | LK_RETRY);
+       }
+       *v1_ret = v1;
+       *v2_ret = v2;
+       return 0;
+}
+
+/*
+ * Rename vnode operation
+ *     rename("foo", "bar");
+ * is essentially
+ *     unlink("bar");
+ *     link("foo", "bar");
+ *     unlink("foo");
+ * but ``atomically''.  Can't do full commit without saving state in the
+ * inode on disk which isn't feasible at this time.  Best we can do is
+ * always guarantee the target exists.
+ *
+ * Basic algorithm is:
+ *
+ * 1) Bump link count on source while we're linking it to the
+ *    target.  This also ensures the inode won't be deleted out
+ *    from underneath us while we work (it may be truncated by
+ *    a concurrent `trunc' or `open' for creation).
+ * 2) Link source to destination.  If destination already exists,
+ *    delete it first.
+ * 3) Unlink source reference to inode if still around. If a
+ *    directory was moved and the parent of the destination
+ *    is different from the source, patch the ".." entry in the
+ *    directory.
+ */
 int
 ufs_rename(void *v)
 {
@@ -976,15 +1336,12 @@
        } */ *ap = v;
        struct vnode            *tvp, *tdvp, *fvp, *fdvp;
        struct componentname    *tcnp, *fcnp;
-       struct inode            *ip, *xp, *dp;
+       struct inode            *ip, *txp, *fxp, *tdp, *fdp;
        struct mount            *mp;
        struct direct           *newdir;
        int                     doingdirectory, oldparent, newparent, error;
 
-#ifdef WAPBL
-       if (ap->a_tdvp->v_mount->mnt_wapbl)
-               return wapbl_ufs_rename(v);
-#endif
+       struct ufs_lookup_results from_ulr, to_ulr;
 
        tvp = ap->a_tvp;
        tdvp = ap->a_tdvp;
@@ -999,103 +1356,297 @@
            (fcnp->cn_flags & HASBUF) == 0)
                panic("ufs_rename: no name");
 #endif
+
+       /* save the supplemental lookup results as they currently exist */
+       from_ulr = VTOI(fdvp)->i_crap;
+       to_ulr = VTOI(tdvp)->i_crap;
+       UFS_CHECK_CRAPCOUNTER(VTOI(fdvp));
+       UFS_CHECK_CRAPCOUNTER(VTOI(tdvp));
+ 
+       /*
+        * Owing to VFS oddities we are currently called with tdvp/tvp
+        * locked and not fdvp/fvp. In a sane world we'd be passed
+        * tdvp and fdvp only, unlocked, and two name strings. Pretend
+        * we have a sane world and unlock tdvp and tvp.
+        */
+       VOP_UNLOCK(tdvp, 0);
+       if (tvp && tvp != tdvp) {
+               VOP_UNLOCK(tvp, 0);
+       }
+ 
+       /* Also pretend we have a sane world and vrele fvp/tvp. */
+       vrele(fvp);
+       fvp = NULL;
+       if (tvp) {
+               vrele(tvp);
+               tvp = NULL;
+       }
+ 
        /*
         * Check for cross-device rename.
         */
-       if ((fvp->v_mount != tdvp->v_mount) ||
-           (tvp && (fvp->v_mount != tvp->v_mount))) {
+       if (fdvp->v_mount != tdvp->v_mount) {
                error = EXDEV;
- abortit:
-               VOP_ABORTOP(tdvp, tcnp); /* XXX, why not in NFS? */
-               if (tdvp == tvp)
-                       vrele(tdvp);
-               else
-                       vput(tdvp);
-               if (tvp)
-                       vput(tvp);
-               VOP_ABORTOP(fdvp, fcnp); /* XXX, why not in NFS? */
-               vrele(fdvp);
-               vrele(fvp);
-               return (error);
        }
 
        /*
-        * Check if just deleting a link name.
-        */
+        * Reject "." and ".."
+        */
+       if ((fcnp->cn_flags & ISDOTDOT) || (tcnp->cn_flags & ISDOTDOT) ||
+           (fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
+           (tcnp->cn_namelen == 1 && tcnp->cn_nameptr[0] == '.')) {
+               error = EINVAL;
+               goto abort;
+       }
+           
+       /*
+        * Get locks.
+        */
+ 
+       /* paranoia */
+       fcnp->cn_flags |= LOCKPARENT|LOCKLEAF;
+       tcnp->cn_flags |= LOCKPARENT|LOCKLEAF;
+ 
+       if (fdvp == tdvp) {
+               /* One directory. Lock it and relookup both children. */
+               vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY);
+ 
+               if (VTOI(fdvp)->i_size == 0) {
+                       /* directory has been rmdir'd */
+                       VOP_UNLOCK(fdvp, 0);
+                       error = ENOENT;
+                       goto abort;
+               }
+ 
+               error = do_relookup(fdvp, &from_ulr, &fvp, fcnp);
+               if (error == 0 && fvp == NULL) {
+                       /* relookup may produce this if fvp disappears */
+                       error = ENOENT;
+               }
+               if (error) {
+                       VOP_UNLOCK(fdvp, 0);
+                       goto abort;
+               }
+ 
+               /*
+                * The right way to do this is to look up both children
+                * without locking either, and then lock both unless they
+                * turn out to be the same. However, due to deep-seated
+                * VFS-level issues all lookups lock the child regardless
+                * of whether LOCKLEAF is set (if LOCKLEAF is not set,
+                * the child is locked during lookup and then unlocked)
+                * so it is not safe to look up tvp while fvp is locked.
+                *
+                * Unlocking fvp here temporarily is more or less safe,
+                * because with the directory locked there's not much
+                * that can happen to it. However, ideally it wouldn't
+                * be necessary. XXX.
+                */
+               VOP_UNLOCK(fvp, 0);
+               /* remember fdvp == tdvp so tdvp is locked */
+               error = do_relookup(tdvp, &to_ulr, &tvp, tcnp);
+               if (error && error != ENOENT) {
+                       VOP_UNLOCK(fdvp, 0);
+                       goto abort;
+               }
+               if (error == ENOENT) {
+                       /*
+                        * Note: currently if the name doesn't exist,
+                        * relookup succeeds (it intercepts the
+                        * EJUSTRETURN from VOP_LOOKUP) and sets tvp
+                        * to NULL. Therefore, we will never get
+                        * ENOENT and this branch is not needed.
+                        * However, in a saner future the EJUSTRETURN
+                        * garbage will go away, so let's DTRT.
+                        */
+                       tvp = NULL;
+               }
+ 
+               /* tvp is locked; lock fvp if necessary */
+               if (!tvp || tvp != fvp) {
+                       vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY);
+               }
+       } else {
+               int found_fdvp;
+               struct vnode *illegal_fvp;
+ 
+               /*
+                * The source must not be above the destination. (If
+                * it were, the rename would detach a section of the
+                * tree.)
+                *
+                * Look up the tree from tdvp to see if we find fdvp,
+                * and if so, return the immediate child of fdvp we're
+                * under; that must not turn out to be the same as
+                * fvp.
+                *
+                * The per-volume rename lock guarantees that the
+                * result of this check remains true until we finish
+                * looking up and locking.
+                */
+               error = ufs_parentcheck(fdvp, tdvp, fcnp->cn_cred,
+                                       &found_fdvp, &illegal_fvp);
+               if (error) {
+                       goto abort;
+               }
+ 
+               /* Must lock in tree order. */
+ 
+               if (found_fdvp) {
+                       /* fdvp -> fvp -> tdvp -> tvp */
+                       error = lock_vnode_sequence(fdvp, &from_ulr,
+                                                   &fvp, fcnp, 0,
+                                                   EINVAL,
+                                                   tdvp, &to_ulr,
+                                                   &tvp, tcnp, 1);
+               } else {
+                       /* tdvp -> tvp -> fdvp -> fvp */
+                       error = lock_vnode_sequence(tdvp, &to_ulr,
+                                                   &tvp, tcnp, 1,
+                                                   ENOTEMPTY,
+                                                   fdvp, &from_ulr,
+                                                   &fvp, fcnp, 0);
+               }
+               if (error) {
+                       if (illegal_fvp) {
+                               vrele(illegal_fvp);
+                       }
+                       goto abort;
+               }
+               KASSERT(fvp != NULL);
+ 
+               if (illegal_fvp && fvp == illegal_fvp) {
+                       vrele(illegal_fvp);
+                       error = EINVAL;
+                       goto abort_withlocks;
+               }
+ 
+               if (illegal_fvp) {
+                       vrele(illegal_fvp);
+               }
+       }
+ 
+       KASSERT(fdvp && VOP_ISLOCKED(fdvp));
+       KASSERT(fvp && VOP_ISLOCKED(fvp));
+       KASSERT(tdvp && VOP_ISLOCKED(tdvp));
+       KASSERT(tvp == NULL || VOP_ISLOCKED(tvp));
+ 
+       /* --- everything is now locked --- */
+ 
        if (tvp && ((VTOI(tvp)->i_flags & (IMMUTABLE | APPEND)) ||
            (VTOI(tdvp)->i_flags & APPEND))) {
                error = EPERM;
-               goto abortit;
+               goto abort_withlocks;
        }
+ 
+       /*
+        * Check if just deleting a link name.
+        */
        if (fvp == tvp) {
                if (fvp->v_type == VDIR) {
                        error = EINVAL;
-                       goto abortit;
+                       goto abort_withlocks;
                }
 
-               /* Release destination completely. */
+               /* Release destination completely. Leave fdvp locked. */
                VOP_ABORTOP(tdvp, tcnp);
-               vput(tdvp);
-               vput(tvp);
+               if (fdvp != tdvp) {
+                       VOP_UNLOCK(tdvp, 0);
+               }
+               VOP_UNLOCK(tvp, 0);
+               vrele(tdvp);
+               vrele(tvp);
 
                /* Delete source. */
+               /* XXX: do we really need to relookup again? */
+ 
+               /*
+                * fdvp is still locked, but we just unlocked fvp
+                * (because fvp == tvp) so just decref fvp
+                */
                vrele(fvp);
                fcnp->cn_flags &= ~(MODMASK | SAVESTART);
                fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
                fcnp->cn_nameiop = DELETE;
-               vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY);
                if ((error = relookup(fdvp, &fvp, fcnp))) {
                        vput(fdvp);
                        return (error);
                }
                return (VOP_REMOVE(fdvp, fvp, fcnp));
        }
-       if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0)
-               goto abortit;
-       dp = VTOI(fdvp);
+       fdp = VTOI(fdvp);
        ip = VTOI(fvp);
        if ((nlink_t) ip->i_nlink >= LINK_MAX) {
-               VOP_UNLOCK(fvp, 0);
                error = EMLINK;
-               goto abortit;
+               goto abort_withlocks;
        }
        if ((ip->i_flags & (IMMUTABLE | APPEND)) ||
-               (dp->i_flags & APPEND)) {
-               VOP_UNLOCK(fvp, 0);
+               (fdp->i_flags & APPEND)) {
                error = EPERM;
-               goto abortit;
+       goto abort_withlocks;
        }
        if ((ip->i_mode & IFMT) == IFDIR) {
                /*
                 * Avoid ".", "..", and aliases of "." for obvious reasons.
                 */
                if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
-                   dp == ip ||
+                   fdp == ip ||
                    (fcnp->cn_flags & ISDOTDOT) ||
                    (tcnp->cn_flags & ISDOTDOT) ||
                    (ip->i_flag & IN_RENAME)) {
-                       VOP_UNLOCK(fvp, 0);
                        error = EINVAL;
-                       goto abortit;
+                       goto abort_withlocks;
                }
                ip->i_flag |= IN_RENAME;
-               oldparent = dp->i_number;
                doingdirectory = 1;
        }
+       oldparent = fdp->i_number;
        VN_KNOTE(fdvp, NOTE_WRITE);             /* XXXLUKEM/XXX: right place? */
 
        /*
-        * When the target exists, both the directory
-        * and target vnodes are returned locked.
+        * Both the directory
+        * and target vnodes are locked.
         */
-       dp = VTOI(tdvp);
-       xp = NULL;
+       tdp = VTOI(tdvp);
+       txp = NULL;
        if (tvp)
-               xp = VTOI(tvp);
+               txp = VTOI(tvp);
 
        mp = fdvp->v_mount;
        fstrans_start(mp, FSTRANS_SHARED);
 
+       if (oldparent != tdp->i_number)
+               newparent = tdp->i_number;
+ 
+       /*
+        * If ".." must be changed (i.e. the directory gets a new
+        * parent) the user must have write permission in the source
+        * so as to be able to change "..".
+        */
+       if (doingdirectory && newparent) {
+               error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred);
+               if (error)
+                       goto out;
+       }
+ 
+       KASSERT(fdvp != tvp);
+ 
+       if (newparent) {
+               /* Check for the rename("foo/foo", "foo") case. */
+               if (fdvp == tvp) {
+                       error = doingdirectory ? ENOTEMPTY : EISDIR;
+                       goto out;
+               }
+       }
+ 
+       fxp = VTOI(fvp);
+       fdp = VTOI(fdvp);
+ 
+       error = UFS_WAPBL_BEGIN(fdvp->v_mount);
+       if (error)
+               goto out2;
+ 
+
        /*
         * 1) Bump link count while we're moving stuff
         *    around.  If we crash somewhere before
@@ -1109,55 +1660,18 @@
        if (DOINGSOFTDEP(fvp))
                softdep_change_linkcnt(ip);
        if ((error = UFS_UPDATE(fvp, NULL, NULL, UPDATE_DIROP)) != 0) {
-               VOP_UNLOCK(fvp, 0);
                goto bad;
        }
 
        /*
-        * If ".." must be changed (ie the directory gets a new
-        * parent) then the source directory must not be in the
-        * directory hierarchy above the target, as this would
-        * orphan everything below the source directory. Also
-        * the user must have write permission in the source so
-        * as to be able to change "..". We must repeat the call
-        * to namei, as the parent directory is unlocked by the
-        * call to checkpath().
-        */
-       error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred);
-       VOP_UNLOCK(fvp, 0);
-       if (oldparent != dp->i_number)
-               newparent = dp->i_number;
-       if (doingdirectory && newparent) {
-               if (error)      /* write access check above */
-                       goto bad;
-               if (xp != NULL)
-                       vput(tvp);
-               vref(tdvp);     /* compensate for the ref checkpath loses */
-               if ((error = ufs_checkpath(ip, dp, tcnp->cn_cred)) != 0) {
-                       vrele(tdvp);
-                       goto out;
-               }
-               tcnp->cn_flags &= ~SAVESTART;
-               vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY);
-               error = relookup(tdvp, &tvp, tcnp);
-               if (error != 0) {
-                       vput(tdvp);
-                       goto out;
-               }
-               dp = VTOI(tdvp);
-               xp = NULL;
-               if (tvp)
-                       xp = VTOI(tvp);
-       }
-       /*
         * 2) If target doesn't exist, link the target
         *    to the source and unlink the source.
         *    Otherwise, rewrite the target directory
         *    entry to reference the source inode and
         *    expunge the original entry's existence.
         */
-       if (xp == NULL) {
-               if (dp->i_dev != ip->i_dev)
+       if (txp == NULL) {
+               if (tdp->i_dev != ip->i_dev)
                        panic("rename: EXDEV");
                /*
                 * Account for ".." in new directory.
@@ -1165,53 +1679,53 @@
                 * parent we don't fool with the link count.
                 */
                if (doingdirectory && newparent) {
-                       if ((nlink_t)dp->i_nlink >= LINK_MAX) {
+                       if ((nlink_t)tdp->i_nlink >= LINK_MAX) {
                                error = EMLINK;
                                goto bad;
                        }
-                       dp->i_ffs_effnlink++;
-                       dp->i_nlink++;
-                       DIP_ASSIGN(dp, nlink, dp->i_nlink);
-                       dp->i_flag |= IN_CHANGE;
+                       tdp->i_ffs_effnlink++;
+                       tdp->i_nlink++;
+                       DIP_ASSIGN(tdp, nlink, tdp->i_nlink);
+                       tdp->i_flag |= IN_CHANGE;
                        if (DOINGSOFTDEP(tdvp))
-                               softdep_change_linkcnt(dp);
+                               softdep_change_linkcnt(tdp);
                        if ((error = UFS_UPDATE(tdvp, NULL, NULL,
                            UPDATE_DIROP)) != 0) {
-                               dp->i_ffs_effnlink--;
-                               dp->i_nlink--;
-                               DIP_ASSIGN(dp, nlink, dp->i_nlink);
-                               dp->i_flag |= IN_CHANGE;
+                               tdp->i_ffs_effnlink--;
+                       tdp->i_nlink--;
+                       DIP_ASSIGN(tdp, nlink, tdp->i_nlink);
+                       tdp->i_flag |= IN_CHANGE;
                                if (DOINGSOFTDEP(tdvp))
-                                       softdep_change_linkcnt(dp);
+                                       softdep_change_linkcnt(tdp);
                                goto bad;
                        }
                }
                newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK);
                ufs_makedirentry(ip, tcnp, newdir);
-               error = ufs_direnter(tdvp, NULL, newdir, tcnp, NULL);
+               error = ufs_direnter(tdvp, &to_ulr,
+                                    NULL, newdir, tcnp, NULL);
                pool_cache_put(ufs_direct_cache, newdir);
                if (error != 0) {
                        if (doingdirectory && newparent) {
-                               dp->i_ffs_effnlink--;
-                               dp->i_nlink--;
-                               DIP_ASSIGN(dp, nlink, dp->i_nlink);
-                               dp->i_flag |= IN_CHANGE;
+                               tdp->i_ffs_effnlink--;
+                               tdp->i_nlink--;
+                               DIP_ASSIGN(tdp, nlink, tdp->i_nlink);
+                               tdp->i_flag |= IN_CHANGE;
                                if (DOINGSOFTDEP(tdvp))
-                                       softdep_change_linkcnt(dp);
+                                       softdep_change_linkcnt(tdp);
                                (void)UFS_UPDATE(tdvp, NULL, NULL,
-                                                UPDATE_WAIT|UPDATE_DIROP);
+                                                UPDATE_WAIT | UPDATE_DIROP);
                        }
                        goto bad;
                }
                VN_KNOTE(tdvp, NOTE_WRITE);
-               vput(tdvp);
        } else {
-               if (xp->i_dev != dp->i_dev || xp->i_dev != ip->i_dev)
+               if (txp->i_dev != tdp->i_dev || txp->i_dev != ip->i_dev)
                        panic("rename: EXDEV");
                /*
                 * Short circuit rename(foo, foo).
                 */
-               if (xp->i_number == ip->i_number)
+               if (txp->i_number == ip->i_number)
                        panic("rename: same file");
                /*
                 * If the parent directory is "sticky", then the user must
@@ -1219,11 +1733,11 @@
                 * otherwise the destination may not be changed (except by
                 * root). This implements append-only directories.
                 */
-               if ((dp->i_mode & S_ISTXT) &&
+               if ((tdp->i_mode & S_ISTXT) &&
                    kauth_authorize_generic(tcnp->cn_cred,
                     KAUTH_GENERIC_ISSUSER, NULL) != 0 &&
-                   kauth_cred_geteuid(tcnp->cn_cred) != dp->i_uid &&
-                   xp->i_uid != kauth_cred_geteuid(tcnp->cn_cred)) {
+                   kauth_cred_geteuid(tcnp->cn_cred) != tdp->i_uid &&
+                   txp->i_uid != kauth_cred_geteuid(tcnp->cn_cred)) {
                        error = EPERM;
                        goto bad;
                }
@@ -1232,9 +1746,9 @@
                 * to it. Also, ensure source and target are compatible
                 * (both directories, or both not directories).
                 */
-               if ((xp->i_mode & IFMT) == IFDIR) {
-                       if (xp->i_ffs_effnlink > 2 ||
-                           !ufs_dirempty(xp, dp->i_number, tcnp->cn_cred)) {
+               if ((txp->i_mode & IFMT) == IFDIR) {
+                       if (txp->i_nlink > 2 ||
+                           !ufs_dirempty(txp, tdp->i_number, tcnp->cn_cred)) {
                                error = ENOTEMPTY;
                                goto bad;
                        }
@@ -1247,19 +1761,20 @@
                        error = EISDIR;
                        goto bad;
                }
-               if ((error = ufs_dirrewrite(dp, xp, ip->i_number,
+               if ((error = ufs_dirrewrite(tdp, to_ulr.ulr_offset,
+                   txp, ip->i_number,
                    IFTODT(ip->i_mode), doingdirectory && newparent ?
                    newparent : doingdirectory, IN_CHANGE | IN_UPDATE)) != 0)
                        goto bad;
                if (doingdirectory) {
                        if (!newparent) {
-                               dp->i_ffs_effnlink--;
+                               tdp->i_ffs_effnlink--;
                                if (DOINGSOFTDEP(tdvp))
-                                       softdep_change_linkcnt(dp);
+                                       softdep_change_linkcnt(tdp);
                        }
-                       xp->i_ffs_effnlink--;
+                       txp->i_ffs_effnlink--;
                        if (DOINGSOFTDEP(tvp))
-                               softdep_change_linkcnt(xp);
+                               softdep_change_linkcnt(txp);
                }
                if (doingdirectory && !DOINGSOFTDEP(tvp)) {
                        /*
@@ -1274,48 +1789,139 @@
                         * them now.
                         */
                        if (!newparent) {
-                               dp->i_nlink--;
-                               DIP_ASSIGN(dp, nlink, dp->i_nlink);
-                               dp->i_flag |= IN_CHANGE;
+                               tdp->i_nlink--;
+                               DIP_ASSIGN(tdp, nlink, tdp->i_nlink);
+                               tdp->i_flag |= IN_CHANGE;
+                               UFS_WAPBL_UPDATE(tdvp, NULL, NULL, 0);
                        }
-                       xp->i_nlink--;
-                       DIP_ASSIGN(xp, nlink, xp->i_nlink);
-                       xp->i_flag |= IN_CHANGE;
+                       txp->i_nlink--;
+                       DIP_ASSIGN(txp, nlink, txp->i_nlink);
+                       txp->i_flag |= IN_CHANGE;
                        if ((error = UFS_TRUNCATE(tvp, (off_t)0, IO_SYNC,
                            tcnp->cn_cred)))
                                goto bad;
                }
                VN_KNOTE(tdvp, NOTE_WRITE);
-               vput(tdvp);
                VN_KNOTE(tvp, NOTE_DELETE);
-               vput(tvp);
-               xp = NULL;
        }
 
        /*
-        * 3) Unlink the source.
-        */
-       fcnp->cn_flags &= ~(MODMASK | SAVESTART);
-       fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
-       vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY);
-       if ((error = relookup(fdvp, &fvp, fcnp))) {
-               vput(fdvp);
-               vrele(ap->a_fvp);
-               goto out2;
-       }
-       if (fvp != NULL) {
-               xp = VTOI(fvp);
-               dp = VTOI(fdvp);
-       } else {
+        * Handle case where the directory entry we need to remove,
+        * which is/was at from_ulr.ulr_offset, or the one before it,
+        * which is/was at from_ulr.ulr_offset - from_ulr.ulr_count,
+        * may have been moved when the directory insertion above
+        * performed compaction.
+        */
+       if (tdp->i_number == fdp->i_number &&
+           ulr_overlap(&from_ulr, &to_ulr)) {
+ 
+               struct buf *bp;
+               struct direct *ep;
+               struct ufsmount *ump = fdp->i_ump;
+               doff_t curpos;
+               doff_t endsearch;       /* offset to end directory search */
+               uint32_t prev_reclen;
+               int dirblksiz = ump->um_dirblksiz;
+               const int needswap = UFS_MPNEEDSWAP(ump);
+               u_long bmask;
+               int namlen, entryoffsetinblock;
+               char *dirbuf;
+ 
+               bmask = fdvp->v_mount->mnt_stat.f_iosize - 1;
+ 
                /*
-                * From name has disappeared.
+                * The fcnp entry will be somewhere between the start of
+                * compaction (to_ulr.ulr_offset) and the original location
+                * (from_ulr.ulr_offset).
                 */
-               if (doingdirectory)
-                       panic("rename: lost dir entry");
-               vrele(ap->a_fvp);
-               error = 0;
-               goto out2;
-       }
+               curpos = to_ulr.ulr_offset;
+               endsearch = from_ulr.ulr_offset + from_ulr.ulr_reclen;
+               entryoffsetinblock = 0;
+ 
+               /*
+                * Get the directory block containing the start of
+                * compaction.
+                */
+               error = ufs_blkatoff(fdvp, (off_t)to_ulr.ulr_offset, &dirbuf,
+                   &bp, false);
+               if (error)
+                       goto bad;
+ 
+               /*
+                * Keep existing ulr_count (length of previous record)
+                * for the case where compaction did not include the
+                * previous entry but started at the from-entry.
+                */
+               prev_reclen = from_ulr.ulr_count;
+ 
+               while (curpos < endsearch) {
+                       uint32_t reclen;
+ 
+                       /*
+                        * If necessary, get the next directory block.
+                        *
+                        * dholland 7/13/11 to the best of my understanding
+                        * this should never happen; compaction occurs only
+                        * within single blocks. I think.
+                        */
+                       if ((curpos & bmask) == 0) {
+                               if (bp != NULL)
+                                       brelse(bp, 0);
+                               error = ufs_blkatoff(fdvp, (off_t)curpos,
+                                   &dirbuf, &bp, false);
+                               if (error)
+                                       goto bad;
+                               entryoffsetinblock = 0;
+                       }
+ 
+                       KASSERT(bp != NULL);
+                       ep = (struct direct *)(dirbuf + entryoffsetinblock);
+                       reclen = ufs_rw16(ep->d_reclen, needswap);
+ 
+ #if (BYTE_ORDER == LITTLE_ENDIAN)
+                       if (FSFMT(fdvp) && needswap == 0)
+                               namlen = ep->d_type;
+                       else
+                               namlen = ep->d_namlen;
+ #else
+                       if (FSFMT(fdvp) && needswap != 0)
+                               namlen = ep->d_type;
+                       else
+                               namlen = ep->d_namlen;
+ #endif
+                       if ((ep->d_ino != 0) &&
+                           (ufs_rw32(ep->d_ino, needswap) != WINO) &&
+                           (namlen == fcnp->cn_namelen) &&
+                           memcmp(ep->d_name, fcnp->cn_nameptr, namlen) == 0) {
+                               from_ulr.ulr_reclen = reclen;
+                               break;
+                       }
+                       curpos += reclen;
+                       entryoffsetinblock += reclen;
+                       prev_reclen = reclen;
+               }
+ 
+               from_ulr.ulr_offset = curpos;
+               from_ulr.ulr_count = prev_reclen;
+ 
+               KASSERT(curpos <= endsearch);
+ 
+               /*
+                * If ulr_offset points to start of a directory block,
+                * clear ulr_count so ufs_dirremove() doesn't try to
+                * merge free space over a directory block boundary.
+                */
+               if ((from_ulr.ulr_offset & (dirblksiz - 1)) == 0)
+                       from_ulr.ulr_count = 0;
+ 
+               brelse(bp, 0);
+       }
+ 
+       /*
+        * 3) Unlink the source.
+        */
+ 
+ #if 0
        /*
         * Ensure that the directory entry still exists and has not
         * changed while the new name has been entered. If the source is
@@ -1325,58 +1931,93 @@
         * flag ensures that it cannot be moved by another rename or removed
         * by a rmdir.
         */
-       if (xp != ip) {
-               if (doingdirectory)
-                       panic("rename: lost dir entry");
-       } else {
-               /*
-                * If the source is a directory with a
-                * new parent, the link count of the old
-                * parent directory must be decremented
-                * and ".." set to point to the new parent.
-                */
-               if (doingdirectory && newparent) {
-                       KASSERT(dp != NULL);
-                       xp->i_offset = mastertemplate.dot_reclen;
-                       ufs_dirrewrite(xp, dp, newparent, DT_DIR, 0, IN_CHANGE);
-                       cache_purge(fdvp);
-               }
-               error = ufs_dirremove(fdvp, xp, fcnp->cn_flags, 0);
-               xp->i_flag &= ~IN_RENAME;
-       }
+ #endif
+       KASSERT(fxp == ip);
+ 
+       /*
+        * If the source is a directory with a new parent, the link
+        * count of the old parent directory must be decremented and
+        * ".." set to point to the new parent.
+        */
+       if (doingdirectory && newparent) {
+               KASSERT(fdp != NULL);
+               ufs_dirrewrite(fxp, mastertemplate.dot_reclen,
+                              fdp, newparent, DT_DIR, 0, IN_CHANGE);
+               cache_purge(fdvp);
+       }
+       error = ufs_dirremove(fdvp, &from_ulr,
+                             fxp, fcnp->cn_flags, 0);
+       fxp->i_flag &= ~IN_RENAME;
+ 
        VN_KNOTE(fvp, NOTE_RENAME);
-       if (dp)
-               vput(fdvp);
-       if (xp)
-               vput(fvp);
-       vrele(ap->a_fvp);
+       goto done;
+ 
+  out:
        goto out2;
 
        /* exit routines from steps 1 & 2 */
  bad:
-       if (xp)
-               vput(ITOV(xp));
-       vput(ITOV(dp));
- out:
        if (doingdirectory)
                ip->i_flag &= ~IN_RENAME;
-       if (vn_lock(fvp, LK_EXCLUSIVE) == 0) {
-               ip->i_ffs_effnlink--;
-               ip->i_nlink--;
-               DIP_ASSIGN(ip, nlink, ip->i_nlink);
-               ip->i_flag |= IN_CHANGE;
-               ip->i_flag &= ~IN_RENAME;
-               if (DOINGSOFTDEP(fvp))
-                       softdep_change_linkcnt(ip);
-               vput(fvp);
-       } else
-               vrele(fvp);
+       ip->i_ffs_effnlink--;
+       ip->i_nlink--;
+       DIP_ASSIGN(ip, nlink, ip->i_nlink);
+       ip->i_flag |= IN_CHANGE;
+       ip->i_flag &= ~IN_RENAME;
+       if (DOINGSOFTDEP(fvp))
+               softdep_change_linkcnt(ip);
+       UFS_WAPBL_UPDATE(fvp, NULL, NULL, 0);
+  done:
+       UFS_WAPBL_END(fdvp->v_mount);
+  out2:
+       /*
+        * clear IN_RENAME - some exit paths happen too early to go
+        * through the cleanup done in the "bad" case above, so we
+        * always do this mini-cleanup here.
+        */
+       ip->i_flag &= ~IN_RENAME;
+ 
+       VOP_UNLOCK(fdvp, 0);
+       if (tdvp != fdvp) {
+               VOP_UNLOCK(tdvp, 0);
+       }
+       VOP_UNLOCK(fvp, 0);
+       if (tvp && tvp != fvp) {
+               VOP_UNLOCK(tvp, 0);
+       }
+ 
        vrele(fdvp);
+       vrele(tdvp);
+       vrele(fvp);
+       if (tvp) {
+               vrele(tvp);
+       }
 
-       /* exit routines from step 3 */
- out2:
        fstrans_done(mp);
        return (error);
+ 
+  abort_withlocks:
+       VOP_UNLOCK(fdvp, 0);
+       if (tdvp != fdvp) {
+               VOP_UNLOCK(tdvp, 0);
+       }
+       VOP_UNLOCK(fvp, 0);
+       if (tvp && tvp != fvp) {
+               VOP_UNLOCK(tvp, 0);
+       }
+ 
+  abort:
+       VOP_ABORTOP(fdvp, fcnp); /* XXX, why not in NFS? */
+       VOP_ABORTOP(tdvp, tcnp); /* XXX, why not in NFS? */
+       vrele(tdvp);
+       if (tvp) {
+               vrele(tvp);
+       }
+       vrele(fdvp);
+       if (fvp) {
+               vrele(fvp);
+       }
+       return (error);
 }
 
 int
@@ -1398,9 +2039,15 @@
        int                     error, dmode, blkoff;
        struct ufsmount         *ump = dp->i_ump;
        int                     dirblksiz = ump->um_dirblksiz;
+       struct ufs_lookup_results *ulr;
 
        fstrans_start(dvp->v_mount, FSTRANS_SHARED);
 
+       /* XXX should handle this material another way */
+       ulr = &dp->i_crap;
+       UFS_CHECK_CRAPCOUNTER(dp);
+ 
+
 #ifdef DIAGNOSTIC
        if ((cnp->cn_flags & HASBUF) == 0)
                panic("ufs_mkdir: no name");
@@ -1418,11 +2065,16 @@
         */
        if ((error = UFS_VALLOC(dvp, dmode, cnp->cn_cred, ap->a_vpp)) != 0)
                goto out;
-       error = UFS_WAPBL_BEGIN(ap->a_dvp->v_mount);
-       if (error)
-               goto out;
+
        tvp = *ap->a_vpp;
        ip = VTOI(tvp);
+
+       error = UFS_WAPBL_BEGIN(ap->a_dvp->v_mount);
+       if (error) {
+               UFS_VFREE(tvp, ip->i_number, dmode);
+               vput(tvp);
+               goto out;
+       }
        ip->i_uid = kauth_cred_geteuid(cnp->cn_cred);
        DIP_ASSIGN(ip, uid, ip->i_uid);
        ip->i_gid = dp->i_gid;
@@ -1532,7 +2184,7 @@
        }
        newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK);
        ufs_makedirentry(ip, cnp, newdir);
-       error = ufs_direnter(dvp, tvp, newdir, cnp, bp);
+       error = ufs_direnter(dvp, ulr, tvp, newdir, cnp, bp);
        pool_cache_put(ufs_direct_cache, newdir);
  bad:
        if (error == 0) {
@@ -1583,6 +2235,7 @@
        struct componentname    *cnp;
        struct inode            *ip, *dp;
        int                     error;
+       struct ufs_lookup_results *ulr;
        bool                    pace;
 
        vp = ap->a_vp;
@@ -1590,14 +2243,19 @@
        cnp = ap->a_cnp;
        ip = VTOI(vp);
        dp = VTOI(dvp);
+ 
+       /* XXX should handle this material another way */
+       ulr = &dp->i_crap;
+       UFS_CHECK_CRAPCOUNTER(dp);
+ 
        /*
         * No rmdir "." or of mounted directories please.
         */
        if (dp == ip || vp->v_mountedhere != NULL) {
                if (dp == ip)
-                       vrele(vp);
+                       vrele(dvp);
                else
-                       vput(vp);
+                       vput(dvp);
                vput(vp);
                return (EINVAL);
        }
@@ -1639,7 +2297,7 @@
                softdep_change_linkcnt(dp);
                softdep_change_linkcnt(ip);
        }
-       error = ufs_dirremove(dvp, ip, cnp->cn_flags, 1);
+       error = ufs_dirremove(dvp, ulr, ip, cnp->cn_flags, 1);
        if (error) {
                if (DOINGSOFTDEP(vp)) {
                        dp->i_ffs_effnlink++;
@@ -1660,13 +2318,13 @@
         * when running with that code we avoid doing them now.
         */
        if (!DOINGSOFTDEP(vp)) {
-               dp->i_nlink--;
                dp->i_ffs_effnlink--;
+               dp->i_nlink--;
                DIP_ASSIGN(dp, nlink, dp->i_nlink);
                dp->i_flag |= IN_CHANGE;
                UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP);
-               ip->i_nlink--;
                ip->i_ffs_effnlink--;
+               ip->i_nlink--;
                DIP_ASSIGN(ip, nlink, ip->i_nlink);
                ip->i_flag |= IN_CHANGE;
                error = UFS_TRUNCATE(vp, (off_t)0, IO_SYNC, cnp->cn_cred);
@@ -1684,7 +2342,6 @@
  out:
        VN_KNOTE(vp, NOTE_DELETE);
        pace = DOINGSOFTDEP(dvp);
-       vput(dvp);
        vput(vp);
        if (pace) {
                /*
@@ -1694,6 +2351,7 @@
                softdep_pace_dirrem();
        }
        fstrans_done(dvp->v_mount);
+       vput(dvp);
        return (error);
 }
 
@@ -1713,14 +2371,20 @@
        struct vnode    *vp, **vpp;
        struct inode    *ip;
        int             len, error;
+       struct ufs_lookup_results *ulr;
 
        vpp = ap->a_vpp;
+
+       /* XXX should handle this material another way */
+       ulr = &VTOI(ap->a_dvp)->i_crap;
+       UFS_CHECK_CRAPCOUNTER(VTOI(ap->a_dvp));
+
        /*
         * UFS_WAPBL_BEGIN1(dvp->v_mount, dvp) performed by successful
         * ufs_makeinode
         */
        fstrans_start(ap->a_dvp->v_mount, FSTRANS_SHARED);
-       error = ufs_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp,
+       error = ufs_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp, ulr,
                              vpp, ap->a_cnp);
        if (error)
                goto out;
@@ -2239,8 +2903,8 @@
  * Allocate a new inode.
  */
 int
-ufs_makeinode(int mode, struct vnode *dvp, struct vnode **vpp,
-       struct componentname *cnp)
+ufs_makeinode(int mode, struct vnode *dvp, const struct ufs_lookup_results *ulr,
+       struct vnode **vpp, struct componentname *cnp)
 {
        struct inode    *ip, *pdir;
        struct direct   *newdir;
@@ -2318,7 +2982,7 @@
                goto bad;
        newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK);
        ufs_makedirentry(ip, cnp, newdir);
-       error = ufs_direnter(dvp, tvp, newdir, cnp, NULL);
+       error = ufs_direnter(dvp, ulr, tvp, newdir, cnp, NULL);
        pool_cache_put(ufs_direct_cache, newdir);
        if (error)
                goto bad;
Index: ufs/ufs/ufs_wapbl.c
===================================================================
RCS file: /cvsroot/src/sys/ufs/ufs/ufs_wapbl.c,v
retrieving revision 1.2.8.1
diff -u -r1.2.8.1 ufs_wapbl.c
--- ufs/ufs/ufs_wapbl.c 14 Dec 2008 11:56:04 -0000      1.2.8.1
+++ ufs/ufs/ufs_wapbl.c 7 May 2012 18:46:55 -0000
@@ -117,616 +117,6 @@
        0,      DIRBLKSIZ - 12, DT_DIR, 2,      ".."
 };
 
-/*
- * Rename vnode operation
- *     rename("foo", "bar");
- * is essentially
- *     unlink("bar");
- *     link("foo", "bar");
- *     unlink("foo");
- * but ``atomically''.  Can't do full commit without saving state in the
- * inode on disk which isn't feasible at this time.  Best we can do is
- * always guarantee the target exists.
- *
- * Basic algorithm is:
- *
- * 1) Bump link count on source while we're linking it to the
- *    target.  This also ensure the inode won't be deleted out
- *    from underneath us while we work (it may be truncated by
- *    a concurrent `trunc' or `open' for creation).
- * 2) Link source to destination.  If destination already exists,
- *    delete it first.
- * 3) Unlink source reference to inode if still around. If a
- *    directory was moved and the parent of the destination
- *    is different from the source, patch the ".." entry in the
- *    directory.
- *
- * WAPBL NOTE: wapbl_ufs_rename derived from ufs_rename in ufs_vnops.c
- * ufs_vnops.c netbsd cvs revision 1.108
- * which has the berkeley copyright above
- * changes introduced to ufs_rename since netbsd cvs revision 1.164
- * will need to be ported into wapbl_ufs_rename
- */
-int
-wapbl_ufs_rename(void *v)
-{
-       struct vop_rename_args  /* {
-               struct vnode            *a_fdvp;
-               struct vnode            *a_fvp;
-               struct componentname    *a_fcnp;
-               struct vnode            *a_tdvp;
-               struct vnode            *a_tvp;
-               struct componentname    *a_tcnp;
-       } */ *ap = v;
-       struct vnode            *tvp, *tdvp, *fvp, *fdvp;
-       struct componentname    *tcnp, *fcnp;
-       struct inode            *ip, *txp, *fxp, *tdp, *fdp;
-       struct mount            *mp;
-       struct direct           *newdir;
-       int                     doingdirectory, oldparent, newparent, error;
-
-       int32_t   saved_f_count;
-       doff_t    saved_f_diroff;
-       doff_t    saved_f_offset;
-       u_int32_t saved_f_reclen;
-       int32_t   saved_t_count;
-       doff_t    saved_t_endoff;
-       doff_t    saved_t_diroff;
-       doff_t    saved_t_offset;
-       u_int32_t saved_t_reclen;
-
-       tvp = ap->a_tvp;
-       tdvp = ap->a_tdvp;
-       fvp = ap->a_fvp;
-       fdvp = ap->a_fdvp;
-       tcnp = ap->a_tcnp;
-       fcnp = ap->a_fcnp;
-       doingdirectory = oldparent = newparent = error = 0;
-
-#ifdef DIAGNOSTIC
-       if ((tcnp->cn_flags & HASBUF) == 0 ||
-           (fcnp->cn_flags & HASBUF) == 0)
-               panic("ufs_rename: no name");
-#endif
-       /*
-        * Check for cross-device rename.
-        */
-       if ((fvp->v_mount != tdvp->v_mount) ||
-           (tvp && (fvp->v_mount != tvp->v_mount))) {
-               error = EXDEV;
- abortit:
-               VOP_ABORTOP(tdvp, tcnp); /* XXX, why not in NFS? */
-               if (tdvp == tvp)
-                       vrele(tdvp);
-               else
-                       vput(tdvp);
-               if (tvp)
-                       vput(tvp);
-               VOP_ABORTOP(fdvp, fcnp); /* XXX, why not in NFS? */
-               vrele(fdvp);
-               vrele(fvp);
-               return (error);
-       }
-
-       /*
-        * Check if just deleting a link name.
-        */
-       if (tvp && ((VTOI(tvp)->i_flags & (IMMUTABLE | APPEND)) ||
-           (VTOI(tdvp)->i_flags & APPEND))) {
-               error = EPERM;
-               goto abortit;
-       }
-       if (fvp == tvp) {
-               if (fvp->v_type == VDIR) {
-                       error = EINVAL;
-                       goto abortit;
-               }
-
-               /* Release destination completely. */
-               VOP_ABORTOP(tdvp, tcnp);
-               vput(tdvp);
-               vput(tvp);
-
-               /* Delete source. */
-               vrele(fvp);
-               fcnp->cn_flags &= ~(MODMASK | SAVESTART);
-               fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
-               fcnp->cn_nameiop = DELETE;
-               vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY);
-               if ((error = relookup(fdvp, &fvp, fcnp))) {
-                       vput(fdvp);
-                       return (error);
-               }
-               return (VOP_REMOVE(fdvp, fvp, fcnp));
-       }
-       if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0)
-               goto abortit;
-       fdp = VTOI(fdvp);
-       ip = VTOI(fvp);
-       if ((nlink_t) ip->i_nlink >= LINK_MAX) {
-               VOP_UNLOCK(fvp, 0);
-               error = EMLINK;
-               goto abortit;
-       }
-       if ((ip->i_flags & (IMMUTABLE | APPEND)) ||
-               (fdp->i_flags & APPEND)) {
-               VOP_UNLOCK(fvp, 0);
-               error = EPERM;
-               goto abortit;
-       }
-       if ((ip->i_mode & IFMT) == IFDIR) {
-               /*
-                * Avoid ".", "..", and aliases of "." for obvious reasons.
-                */
-               if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
-                   fdp == ip ||
-                   (fcnp->cn_flags & ISDOTDOT) ||
-                   (tcnp->cn_flags & ISDOTDOT) ||
-                   (ip->i_flag & IN_RENAME)) {
-                       VOP_UNLOCK(fvp, 0);
-                       error = EINVAL;
-                       goto abortit;
-               }
-               ip->i_flag |= IN_RENAME;
-               doingdirectory = 1;
-       }
-       oldparent = fdp->i_number;
-       VN_KNOTE(fdvp, NOTE_WRITE);             /* XXXLUKEM/XXX: right place? */
-
-       /*
-        * When the target exists, both the directory
-        * and target vnodes are returned locked.
-        */
-       tdp = VTOI(tdvp);
-       txp = NULL;
-       if (tvp)
-               txp = VTOI(tvp);
-
-       mp = fdvp->v_mount;
-       fstrans_start(mp, FSTRANS_SHARED);
-
-       /*
-        * If ".." must be changed (ie the directory gets a new
-        * parent) then the source directory must not be in the
-        * directory hierarchy above the target, as this would
-        * orphan everything below the source directory. Also
-        * the user must have write permission in the source so
-        * as to be able to change "..". We must repeat the call 
-        * to namei, as the parent directory is unlocked by the
-        * call to checkpath().
-        */
-       error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred);
-       VOP_UNLOCK(fvp, 0);
-       if (oldparent != tdp->i_number)
-               newparent = tdp->i_number;
-       if (doingdirectory && newparent) {
-               if (error)      /* write access check above */
-                       goto out;
-               if (txp != NULL)
-                       vput(tvp);
-               txp = NULL;
-               vref(tdvp);     /* compensate for the ref checkpath loses */
-               if ((error = ufs_checkpath(ip, tdp, tcnp->cn_cred)) != 0) {
-                       vrele(tdvp);
-                       tdp = NULL;
-                       goto out;
-               }
-               tcnp->cn_flags &= ~SAVESTART;
-               tdp = NULL;
-               vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY);
-               error = relookup(tdvp, &tvp, tcnp);
-               if (error != 0) {
-                       vput(tdvp);
-                       goto out;
-               }
-               tdp = VTOI(tdvp);
-               if (tvp)
-                       txp = VTOI(tvp);
-       }
-
-       /*
-        * XXX handle case where fdvp is parent of tdvp,
-        * by unlocking tdvp and regrabbing it with vget after?
-        */
-
-       /* save directory lookup information in case tdvp == fdvp */
-       saved_t_count  = tdp->i_count;
-       saved_t_endoff = tdp->i_endoff;
-       saved_t_diroff = tdp->i_diroff;
-       saved_t_offset = tdp->i_offset;
-       saved_t_reclen = tdp->i_reclen;
-
-       /*
-        * This was moved up to before the journal lock to
-        * avoid potential deadlock
-        */
-       fcnp->cn_flags &= ~(MODMASK | SAVESTART);
-       fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
-       if (newparent) {
-               /* Check for the rename("foo/foo", "foo") case. */
-               if (fdvp == tvp) {
-                       error = doingdirectory ? ENOTEMPTY : EISDIR;
-                       goto out;
-               }
-               vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY);
-               if ((error = relookup(fdvp, &fvp, fcnp))) {
-                       vput(fdvp);
-                       vrele(ap->a_fvp);
-                       goto out2;
-               }
-       } else {
-               error = VOP_LOOKUP(fdvp, &fvp, fcnp);
-               if (error && (error != EJUSTRETURN)) {
-                       vrele(ap->a_fvp);
-                       goto out2;
-               }
-               error = 0;
-       }
-       if (fvp != NULL) {
-               fxp = VTOI(fvp);
-               fdp = VTOI(fdvp);
-       } else {
-               /*
-                * From name has disappeared.
-                */
-               if (doingdirectory)
-                       panic("rename: lost dir entry");
-               vrele(ap->a_fvp);
-               error = ENOENT; /* XXX ufs_rename sets "0" here */
-               goto out2;
-       }
-       vrele(ap->a_fvp);
-
-       /* save directory lookup information in case tdvp == fdvp */
-       saved_f_count  = fdp->i_count;
-       saved_f_diroff = fdp->i_diroff;
-       saved_f_offset = fdp->i_offset;
-       saved_f_reclen = fdp->i_reclen;
-
-       /* restore directory lookup information in case tdvp == fdvp */
-       tdp->i_offset = saved_t_offset;
-       tdp->i_reclen = saved_t_reclen;
-       tdp->i_count  = saved_t_count;
-       tdp->i_endoff = saved_t_endoff;
-       tdp->i_diroff = saved_t_diroff;
-
-       error = UFS_WAPBL_BEGIN(fdvp->v_mount);
-       if (error)
-               goto out2;
-
-       /*
-        * 1) Bump link count while we're moving stuff
-        *    around.  If we crash somewhere before
-        *    completing our work, the link count
-        *    may be wrong, but correctable.
-        */
-       ip->i_ffs_effnlink++;
-       ip->i_nlink++;
-       DIP_ASSIGN(ip, nlink, ip->i_nlink);
-       ip->i_flag |= IN_CHANGE;
-       if (DOINGSOFTDEP(fvp))
-               softdep_change_linkcnt(ip);
-       if ((error = UFS_UPDATE(fvp, NULL, NULL, UPDATE_DIROP)) != 0) {
-               goto bad;
-       }
-
-       /*
-        * 2) If target doesn't exist, link the target
-        *    to the source and unlink the source.
-        *    Otherwise, rewrite the target directory
-        *    entry to reference the source inode and
-        *    expunge the original entry's existence.
-        */
-       if (txp == NULL) {
-               if (tdp->i_dev != ip->i_dev)
-                       panic("rename: EXDEV");
-               /*
-                * Account for ".." in new directory.
-                * When source and destination have the same
-                * parent we don't fool with the link count.
-                */
-               if (doingdirectory && newparent) {
-                       if ((nlink_t)tdp->i_nlink >= LINK_MAX) {
-                               error = EMLINK;
-                               goto bad;
-                       }
-                       tdp->i_ffs_effnlink++;
-                       tdp->i_nlink++;
-                       DIP_ASSIGN(tdp, nlink, tdp->i_nlink);
-                       tdp->i_flag |= IN_CHANGE;
-                       if (DOINGSOFTDEP(tdvp))
-                               softdep_change_linkcnt(tdp);
-                       if ((error = UFS_UPDATE(tdvp, NULL, NULL,
-                           UPDATE_DIROP)) != 0) {
-                               tdp->i_ffs_effnlink--;
-                               tdp->i_nlink--;
-                               DIP_ASSIGN(tdp, nlink, tdp->i_nlink);
-                               tdp->i_flag |= IN_CHANGE;
-                               if (DOINGSOFTDEP(tdvp))
-                                       softdep_change_linkcnt(tdp);
-                               goto bad;
-                       }
-               }
-               newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK);
-               ufs_makedirentry(ip, tcnp, newdir);
-               error = ufs_direnter(tdvp, NULL, newdir, tcnp, NULL);
-               pool_cache_put(ufs_direct_cache, newdir);
-               if (error != 0) {
-                       if (doingdirectory && newparent) {
-                               tdp->i_ffs_effnlink--;
-                               tdp->i_nlink--;
-                               DIP_ASSIGN(tdp, nlink, tdp->i_nlink);
-                               tdp->i_flag |= IN_CHANGE;
-                               if (DOINGSOFTDEP(tdvp))
-                                       softdep_change_linkcnt(tdp);
-                               (void)UFS_UPDATE(tdvp, NULL, NULL,
-                                                UPDATE_WAIT | UPDATE_DIROP);
-                       }
-                       goto bad;
-               }
-               VN_KNOTE(tdvp, NOTE_WRITE);
-       } else {
-               if (txp->i_dev != tdp->i_dev || txp->i_dev != ip->i_dev)
-                       panic("rename: EXDEV");
-               /*
-                * Short circuit rename(foo, foo).
-                */
-               if (txp->i_number == ip->i_number)
-                       panic("rename: same file");
-               /*
-                * If the parent directory is "sticky", then the user must
-                * own the parent directory, or the destination of the rename,
-                * otherwise the destination may not be changed (except by
-                * root). This implements append-only directories.
-                */
-               if ((tdp->i_mode & S_ISTXT) &&
-                   kauth_authorize_generic(tcnp->cn_cred,
-                    KAUTH_GENERIC_ISSUSER, NULL) != 0 &&
-                   kauth_cred_geteuid(tcnp->cn_cred) != tdp->i_uid &&
-                   txp->i_uid != kauth_cred_geteuid(tcnp->cn_cred)) {
-                       error = EPERM;
-                       goto bad;
-               }
-               /*
-                * Target must be empty if a directory and have no links
-                * to it. Also, ensure source and target are compatible
-                * (both directories, or both not directories).
-                */
-               if ((txp->i_mode & IFMT) == IFDIR) {
-                       if (txp->i_ffs_effnlink > 2 ||
-                           !ufs_dirempty(txp, tdp->i_number, tcnp->cn_cred)) {
-                               error = ENOTEMPTY;
-                               goto bad;
-                       }
-                       if (!doingdirectory) {
-                               error = ENOTDIR;
-                               goto bad;
-                       }
-                       cache_purge(tdvp);
-               } else if (doingdirectory) {
-                       error = EISDIR;
-                       goto bad;
-               }
-               if ((error = ufs_dirrewrite(tdp, txp, ip->i_number,
-                   IFTODT(ip->i_mode), doingdirectory && newparent ?
-                   newparent : doingdirectory, IN_CHANGE | IN_UPDATE)) != 0)
-                       goto bad;
-               if (doingdirectory) {
-                       if (!newparent) {
-                               tdp->i_ffs_effnlink--;
-                               if (DOINGSOFTDEP(tdvp))
-                                       softdep_change_linkcnt(tdp);
-                       }
-                       txp->i_ffs_effnlink--;
-                       if (DOINGSOFTDEP(tvp))
-                               softdep_change_linkcnt(txp);
-               }
-               if (doingdirectory && !DOINGSOFTDEP(tvp)) {
-                       /*
-                        * Truncate inode. The only stuff left in the directory
-                        * is "." and "..". The "." reference is inconsequential
-                        * since we are quashing it. We have removed the "."
-                        * reference and the reference in the parent directory,
-                        * but there may be other hard links. The soft
-                        * dependency code will arrange to do these operations
-                        * after the parent directory entry has been deleted on
-                        * disk, so when running with that code we avoid doing
-                        * them now.
-                        */
-                       if (!newparent) {
-                               tdp->i_nlink--;
-                               DIP_ASSIGN(tdp, nlink, tdp->i_nlink);
-                               tdp->i_flag |= IN_CHANGE;
-                               UFS_WAPBL_UPDATE(tdvp, NULL, NULL, 0);
-                       }
-                       txp->i_nlink--;
-                       DIP_ASSIGN(txp, nlink, txp->i_nlink);
-                       txp->i_flag |= IN_CHANGE;
-                       if ((error = UFS_TRUNCATE(tvp, (off_t)0, IO_SYNC,
-                           tcnp->cn_cred)))
-                               goto bad;
-               }
-               VN_KNOTE(tdvp, NOTE_WRITE);
-               VN_KNOTE(tvp, NOTE_DELETE);
-       }
-
-       /* restore directory lookup information in case tdvp == fdvp */
-       fdp->i_offset = saved_f_offset;
-       fdp->i_reclen = saved_f_reclen;
-       fdp->i_count  = saved_f_count;
-       fdp->i_diroff = saved_f_diroff;
-
-       /*
-        * Handle case where the directory we need to remove may have
-        * been moved when the directory insertion above performed compaction.
-        * or when i_count may be wrong due to insertion before this entry.
-        */
-       if ((tdp->i_number == fdp->i_number) &&
-               (((saved_f_offset >= saved_t_offset) &&
-                       (saved_f_offset < saved_t_offset + saved_t_count)) ||
-               ((saved_f_offset - saved_f_count >= saved_t_offset) &&
-                       (saved_f_offset - saved_f_count <
-                        saved_t_offset + saved_t_count)))) {
-               struct buf *bp;
-               struct direct *ep;
-               struct ufsmount *ump = fdp->i_ump;
-               doff_t endsearch;       /* offset to end directory search */
-               int dirblksiz = ump->um_dirblksiz;
-               const int needswap = UFS_MPNEEDSWAP(ump);
-               u_long bmask;
-               int namlen, entryoffsetinblock;
-               char *dirbuf;
-
-               bmask = fdvp->v_mount->mnt_stat.f_iosize - 1;
-
-               /*
-                * the fcnp entry will be somewhere between the start of
-                * compaction and the original location.
-                */
-               fdp->i_offset = saved_t_offset;
-               error = ufs_blkatoff(fdvp, (off_t)fdp->i_offset, &dirbuf, &bp,
-                   false);
-               if (error)
-                       goto bad;
-
-               /*
-                * keep existing fdp->i_count in case
-                * compaction started at the same location as the fcnp entry.
-                */
-               endsearch = saved_f_offset + saved_f_reclen;
-               entryoffsetinblock = 0;
-               while (fdp->i_offset < endsearch) {
-                       int reclen;
-
-                       /*
-                        * If necessary, get the next directory block.
-                        */
-                       if ((fdp->i_offset & bmask) == 0) {
-                               if (bp != NULL)
-                                       brelse(bp, 0);
-                               error = ufs_blkatoff(fdvp, (off_t)fdp->i_offset,
-                                   &dirbuf, &bp, false);
-                               if (error)
-                                       goto bad;
-                               entryoffsetinblock = 0;
-                       }
-
-                       KASSERT(bp != NULL);
-                       ep = (struct direct *)(dirbuf + entryoffsetinblock);
-                       reclen = ufs_rw16(ep->d_reclen, needswap);
-
-#if (BYTE_ORDER == LITTLE_ENDIAN)
-                       if (FSFMT(fdvp) && needswap == 0)
-                               namlen = ep->d_type;
-                       else
-                               namlen = ep->d_namlen;
-#else
-                       if (FSFMT(fdvp) && needswap != 0)
-                               namlen = ep->d_type;
-                       else
-                               namlen = ep->d_namlen;
-#endif
-                       if ((ep->d_ino != 0) &&
-                           (ufs_rw32(ep->d_ino, needswap) != WINO) &&
-                           (namlen == fcnp->cn_namelen) &&
-                           memcmp(ep->d_name, fcnp->cn_nameptr, namlen) == 0) {
-                               fdp->i_reclen = reclen;
-                               break;
-                       }
-                       fdp->i_offset += reclen;
-                       fdp->i_count = reclen;
-                       entryoffsetinblock += reclen;
-               }
-
-               KASSERT(fdp->i_offset <= endsearch);
-
-               /*
-                * If fdp->i_offset points to start of a directory block,
-                * set fdp->i_count so ufs_dirremove() doesn't compact over
-                * a directory block boundary.
-                */
-               if ((fdp->i_offset & (dirblksiz - 1)) == 0)
-                       fdp->i_count = 0;
-
-               brelse(bp, 0);
-       }
-
-       /*
-        * 3) Unlink the source.
-        */
-       /*
-        * Ensure that the directory entry still exists and has not
-        * changed while the new name has been entered. If the source is
-        * a file then the entry may have been unlinked or renamed. In
-        * either case there is no further work to be done. If the source
-        * is a directory then it cannot have been rmdir'ed; The IRENAME
-        * flag ensures that it cannot be moved by another rename or removed
-        * by a rmdir.
-        */
-       if (fxp != ip) {
-               if (doingdirectory)
-                       panic("rename: lost dir entry");
-       } else {
-               /*
-                * If the source is a directory with a
-                * new parent, the link count of the old
-                * parent directory must be decremented
-                * and ".." set to point to the new parent.
-                */
-               if (doingdirectory && newparent) {
-                       KASSERT(fdp != NULL);
-                       fxp->i_offset = mastertemplate.dot_reclen;
-                       ufs_dirrewrite(fxp, fdp, newparent, DT_DIR, 0, IN_CHANGE);
-                       cache_purge(fdvp);
-               }
-               error = ufs_dirremove(fdvp, fxp, fcnp->cn_flags, 0);
-               fxp->i_flag &= ~IN_RENAME;
-       }
-       VN_KNOTE(fvp, NOTE_RENAME);
-       goto done;
-
- out:
-       vrele(fvp);
-       vrele(fdvp);
-       goto out2;
-
-       /* exit routines from steps 1 & 2 */
- bad:
-       if (doingdirectory)
-               ip->i_flag &= ~IN_RENAME;
-       ip->i_ffs_effnlink--;
-       ip->i_nlink--;
-       DIP_ASSIGN(ip, nlink, ip->i_nlink);
-       ip->i_flag |= IN_CHANGE;
-       ip->i_flag &= ~IN_RENAME;
-       UFS_WAPBL_UPDATE(fvp, NULL, NULL, 0);
-       if (DOINGSOFTDEP(fvp))
-               softdep_change_linkcnt(ip);
- done:
-       UFS_WAPBL_END(fdvp->v_mount);
-       vput(fdvp);
-       vput(fvp);
- out2:
-       /*
-        * clear IN_RENAME - some exit paths happen too early to go
-        * through the cleanup done in the "bad" case above, so we
-        * always do this mini-cleanup here.
-        */
-       ip->i_flag &= ~IN_RENAME;
-
-       if (txp)
-               vput(ITOV(txp));
-       if (tdp) {
-               if (newparent)
-                       vput(ITOV(tdp));
-               else
-                       vrele(ITOV(tdp));
-       }
-
-       fstrans_done(mp);
-       return (error);
-}
-
 #ifdef WAPBL_DEBUG_INODES
 void
 ufs_wapbl_verify_inodes(struct mount *mp, const char *str)
Prev by Date: Re: kern/46424: hme(4) on Sun Netra T1 won't work - shows no carrier
Next by Date: NetBSD Nightly Trouble Ticket Report
Previous by Thread: Re: kern/41417 (WAPBL: hang on tstile)
Next by Thread: Re: kern/41417 (WAPBL: hang on tstile)
Indexes:
Home | Main Index | Thread Index | Old Index