Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src/sys/kern Disentangle do_sys_rename.



details:   https://anonhg.NetBSD.org/src/rev/51ded7191beb
branches:  trunk
changeset: 781992:51ded7191beb
user:      riastradh <riastradh%NetBSD.org@localhost>
date:      Fri Oct 12 02:37:20 2012 +0000

description:
Disentangle do_sys_rename.

Elide the fs-wide rename lock for single-directory renames.  This
required changing the order of lookups, so that we know what the
directories are before we lock the nodes.

Clean up error branches, explain why various nonsense happens and
what it does and doesn't do, and note some of what needs to change.

diffstat:

 sys/kern/vfs_syscalls.c |  424 +++++++++++++++++++++++++++++++++--------------
 1 files changed, 294 insertions(+), 130 deletions(-)

diffs (truncated from 487 to 300 lines):

diff -r adad5fe72f75 -r 51ded7191beb sys/kern/vfs_syscalls.c
--- a/sys/kern/vfs_syscalls.c   Thu Oct 11 20:05:50 2012 +0000
+++ b/sys/kern/vfs_syscalls.c   Fri Oct 12 02:37:20 2012 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: vfs_syscalls.c,v 1.457 2012/06/27 12:28:28 cheusov Exp $       */
+/*     $NetBSD: vfs_syscalls.c,v 1.458 2012/10/12 02:37:20 riastradh Exp $     */
 
 /*-
  * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
@@ -70,7 +70,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vfs_syscalls.c,v 1.457 2012/06/27 12:28:28 cheusov Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vfs_syscalls.c,v 1.458 2012/10/12 02:37:20 riastradh Exp $");
 
 #ifdef _KERNEL_OPT
 #include "opt_fileassoc.h"
@@ -3871,176 +3871,340 @@
  * (retain == 0)       deleted unless `from' and `to' refer to the same
  *                     object in the file system's name space (BSD).
  * (retain == 1)       always retained (POSIX).
+ *
+ * XXX Synchronize with nfsrv_rename in nfs_serv.c.
  */
 int
 do_sys_rename(const char *from, const char *to, enum uio_seg seg, int retain)
 {
-       struct vnode *tvp, *fvp, *tdvp;
-       struct pathbuf *frompb, *topb;
-       struct nameidata fromnd, tond;
-       struct mount *fs;
+       struct pathbuf *fpb, *tpb;
+       struct nameidata fnd, tnd;
+       struct vnode *fdvp, *fvp;
+       struct vnode *tdvp, *tvp;
+       struct mount *mp, *tmp;
        int error;
 
-       error = pathbuf_maybe_copyin(from, seg, &frompb);
-       if (error) {
-               return error;
-       }
-       error = pathbuf_maybe_copyin(to, seg, &topb);
-       if (error) {
-               pathbuf_destroy(frompb);
-               return error;
+       KASSERT(from != NULL);
+       KASSERT(to != NULL);
+
+       error = pathbuf_maybe_copyin(from, seg, &fpb);
+       if (error)
+               goto out0;
+       KASSERT(fpb != NULL);
+
+       error = pathbuf_maybe_copyin(to, seg, &tpb);
+       if (error)
+               goto out1;
+       KASSERT(tpb != NULL);
+
+       /*
+        * Lookup from.
+        *
+        * XXX LOCKPARENT is wrong because we don't actually want it
+        * locked yet, but (a) namei is insane, and (b) VOP_RENAME is
+        * insane, so for the time being we need to leave it like this.
+        */
+       NDINIT(&fnd, DELETE, (LOCKPARENT | TRYEMULROOT | INRENAME), fpb);
+       if ((error = namei(&fnd)) != 0)
+               goto out2;
+
+       /*
+        * Pull out the important results of the lookup, fdvp and fvp.
+        * Of course, fvp is bogus because we're about to unlock fdvp.
+        */
+       fdvp = fnd.ni_dvp;
+       fvp = fnd.ni_vp;
+       KASSERT(fdvp != NULL);
+       KASSERT(fvp != NULL);
+       KASSERT((fdvp == fvp) || (VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE));
+
+       /*
+        * Make sure neither fdvp nor fvp is locked.
+        */
+       if (fdvp != fvp)
+               VOP_UNLOCK(fdvp);
+       /* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
+       /* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
+
+       /*
+        * Reject renaming `.' and `..'.  Can't do this until after
+        * namei because we need namei's parsing to find the final
+        * component name.  (namei should just leave us with the final
+        * component name and not look it up itself, but anyway...)
+        *
+        * This was here before because we used to relookup from
+        * instead of to and relookup requires the caller to check
+        * this, but now file systems may depend on this check, so we
+        * must retain it until the file systems are all rototilled.
+        */
+       if (((fnd.ni_cnd.cn_namelen == 1) &&
+               (fnd.ni_cnd.cn_nameptr[0] == '.')) ||
+           ((fnd.ni_cnd.cn_namelen == 2) &&
+               (fnd.ni_cnd.cn_nameptr[0] == '.') &&
+               (fnd.ni_cnd.cn_nameptr[1] == '.'))) {
+               error = EINVAL; /* XXX EISDIR?  */
+               goto abort0;
        }
 
-       NDINIT(&fromnd, DELETE, LOCKPARENT | TRYEMULROOT | INRENAME,
-           frompb);
-       if ((error = namei(&fromnd)) != 0) {
-               pathbuf_destroy(frompb);
-               pathbuf_destroy(topb);
-               return (error);
-       }
-       if (fromnd.ni_dvp != fromnd.ni_vp)
-               VOP_UNLOCK(fromnd.ni_dvp);
-       fvp = fromnd.ni_vp;
-
-       fs = fvp->v_mount;
-       error = VFS_RENAMELOCK_ENTER(fs);
-       if (error) {
-               VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
-               vrele(fromnd.ni_dvp);
-               vrele(fvp);
-               goto out1;
+       /*
+        * Lookup to.
+        *
+        * XXX LOCKPARENT is wrong, but...insanity, &c.  Also, using
+        * fvp here to decide whether to add CREATEDIR is a load of
+        * bollocks because fvp might be the wrong node by now, since
+        * fdvp is unlocked.
+        *
+        * XXX Why not pass CREATEDIR always?
+        */
+       NDINIT(&tnd, RENAME,
+           (LOCKPARENT | NOCACHE | TRYEMULROOT | INRENAME |
+               ((fvp->v_type == VDIR)? CREATEDIR : 0)),
+           tpb);
+       if ((error = namei(&tnd)) != 0)
+               goto abort0;
+
+       /*
+        * Pull out the important results of the lookup, tdvp and tvp.
+        * Of course, tvp is bogus because we're about to unlock tdvp.
+        */
+       tdvp = tnd.ni_dvp;
+       tvp = tnd.ni_vp;
+       KASSERT(tdvp != NULL);
+       KASSERT((tdvp == tvp) || (VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE));
+
+       /*
+        * Make sure neither tdvp nor tvp is locked.
+        */
+       if (tdvp != tvp)
+               VOP_UNLOCK(tdvp);
+       /* XXX KASSERT(VOP_ISLOCKED(tdvp) != LK_EXCLUSIVE); */
+       /* XXX KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) != LK_EXCLUSIVE)); */
+
+       /*
+        * Reject renaming onto `.' or `..'.  relookup is unhappy with
+        * these, which is why we must do this here.  Once upon a time
+        * we relooked up from instead of to, and consequently didn't
+        * need this check, but now that we relookup to instead of
+        * from, we need this; and we shall need it forever forward
+        * until the VOP_RENAME protocol changes, because file systems
+        * will no doubt begin to depend on this check.
+        */
+       if (((tnd.ni_cnd.cn_namelen == 1) &&
+               (tnd.ni_cnd.cn_nameptr[0] == '.')) ||
+           ((tnd.ni_cnd.cn_namelen == 2) &&
+               (tnd.ni_cnd.cn_nameptr[0] == '.') &&
+               (tnd.ni_cnd.cn_nameptr[1] == '.'))) {
+               error = EINVAL; /* XXX EISDIR?  */
+               goto abort1;
        }
 
        /*
-        * close, partially, yet another race - ideally we should only
-        * go as far as getting fromnd.ni_dvp before getting the per-fs
-        * lock, and then continue to get fromnd.ni_vp, but we can't do
-        * that with namei as it stands.
-        *
-        * This still won't prevent rmdir from nuking fromnd.ni_vp
-        * under us. The real fix is to get the locks in the right
-        * order and do the lookups in the right places, but that's a
-        * major rototill.
+        * Get the mount point.  If the file system has been unmounted,
+        * which it may be because we're not holding any vnode locks,
+        * then v_mount will be NULL.  We're not really supposed to
+        * read v_mount without holding the vnode lock, but since we
+        * have fdvp referenced, if fdvp->v_mount changes then at worst
+        * it will be set to NULL, not changed to another mount point.
+        * And, of course, since it is up to the file system to
+        * determine the real lock order, we can't lock both fdvp and
+        * tdvp at the same time.
+        */
+       mp = fdvp->v_mount;
+       if (mp == NULL) {
+               error = ENOENT;
+               goto abort1;
+       }
+
+       /*
+        * Make sure the mount points match.  Again, although we don't
+        * hold any vnode locks, the v_mount fields may change -- but
+        * at worst they will change to NULL, so this will never become
+        * a cross-device rename, because we hold vnode references.
         *
-        * Note: this logic (as well as this whole function) is cloned
-        * in nfs_serv.c. Proceed accordingly.
+        * XXX Because nothing is locked and the compiler may reorder
+        * things here, unmounting the file system at an inopportune
+        * moment may cause rename to fail with ENXDEV when it really
+        * should fail with ENOENT.
         */
-       vrele(fvp);
-       if ((fromnd.ni_cnd.cn_namelen == 1 && 
-            fromnd.ni_cnd.cn_nameptr[0] == '.') ||
-           (fromnd.ni_cnd.cn_namelen == 2 && 
-            fromnd.ni_cnd.cn_nameptr[0] == '.' &&
-            fromnd.ni_cnd.cn_nameptr[1] == '.')) {
-               error = EINVAL;
-               VFS_RENAMELOCK_EXIT(fs);
-               VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
-               vrele(fromnd.ni_dvp);
-               goto out1;
+       tmp = tdvp->v_mount;
+       if (tmp == NULL) {
+               error = ENOENT;
+               goto abort1;
+       }
+
+       if (mp != tmp) {
+               error = EXDEV;
+               goto abort1;
+       }
+
+       /*
+        * Take the vfs rename lock to avoid cross-directory screw cases.
+        * Nothing is locked currently, so taking this lock is safe.
+        */
+       if (fdvp != tdvp) {
+               error = VFS_RENAMELOCK_ENTER(mp);
+               if (error)
+                       goto abort1;
        }
-       vn_lock(fromnd.ni_dvp, LK_EXCLUSIVE | LK_RETRY);
-       error = relookup(fromnd.ni_dvp, &fromnd.ni_vp, &fromnd.ni_cnd, 0);
-       if (error) {
-               VOP_UNLOCK(fromnd.ni_dvp);
-               VFS_RENAMELOCK_EXIT(fs);
-               VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
-               vrele(fromnd.ni_dvp);
-               goto out1;
-       }
-       VOP_UNLOCK(fromnd.ni_vp);
-       if (fromnd.ni_dvp != fromnd.ni_vp)
-               VOP_UNLOCK(fromnd.ni_dvp);
-       fvp = fromnd.ni_vp;
-
-       NDINIT(&tond, RENAME,
-           LOCKPARENT | LOCKLEAF | NOCACHE | TRYEMULROOT
-             | INRENAME | (fvp->v_type == VDIR ? CREATEDIR : 0),
-           topb);
-       if ((error = namei(&tond)) != 0) {
-               VFS_RENAMELOCK_EXIT(fs);
-               VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
-               vrele(fromnd.ni_dvp);
-               vrele(fvp);
-               goto out1;
-       }
-       tdvp = tond.ni_dvp;
-       tvp = tond.ni_vp;
-
+
+       /*
+        * Now fdvp, fvp, tdvp, and (if nonnull) tvp are referenced,
+        * and nothing is locked except for the vfs rename lock.
+        *
+        * The next step is a little rain dance to conform to the
+        * insane lock protocol, even though it does nothing to ward
+        * off race conditions.
+        *
+        * We need tdvp and tvp to be locked.  However, because we have
+        * unlocked tdvp in order to hold no locks while we take the
+        * vfs rename lock, tvp may be wrong here, and we can't safely
+        * lock it even if the sensible file systems will just unlock
+        * it straight away.  Consequently, we must lock tdvp and then
+        * relookup tvp to get it locked.
+        *
+        * Finally, because the VOP_RENAME protocol is brain-damaged
+        * and various file systems insanely depend on the semantics of
+        * this brain damage, the lookup of to must be the last lookup
+        * before VOP_RENAME.
+        */
+       vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY);
+       error = relookup(tdvp, &tnd.ni_vp, &tnd.ni_cnd, 0);



Home | Main Index | Thread Index | Old Index