Subject: Re: kern/33861
To: None <kern-bug-people@netbsd.org, gnats-admin@netbsd.org,>
From: Jed Davis [staff] <jld@panix.com>
List: netbsd-bugs
Date: 06/30/2006 06:20:02
The following reply was made to PR kern/33861; it has been noted by GNATS.

From: "Jed Davis [staff]" <jld@panix.com>
To: gnats-bugs@NetBSD.org
Cc: 
Subject: Re: kern/33861
Date: Fri, 30 Jun 2006 02:15:22 -0400

 I've looked at this a little more.  Trying to undo the sillyrename isn't
 the best idea; it leaves a brief window when the destination file isn't
 there (thus breaking the important part of rename), and if someone else
 creates a file in the absent destination's place, the act of renaming
 the .nfs file back will silently destroy the new file.
 
 Also, it leads to odd behavior in the (easily reproducible) case of
 rename("a/x","b/x") vs. rename("b/x","a/x") with both files existing: we
 currently delete both files and fail both calls with ENOENT, which would
 be changed to leaving both files in place and failing with ENOENT; this
 is better, but still not correct.
 
 Better would be to link the file to a temporary name rather than
 renaming, if supported by the server; thus the destination is always
 present, and survives whether or not the rename succeeds.  If the rename
 fails, this does cause the file to wind up with an extra link, to be
 removed when the vnode becomes inactive; this seems mostly harmless,
 and trying to fix it may cause more trouble than it's worth.  (This
 was first suggested to me by a coworker, and also happens to be what
 OpenSolaris does.)
 
 It's still worthwhile to double-check the existence of the source file
 before going ahead with the sillyrename, I feel, to avoid unnecessary
 silliness.  (It also happens that OpenSolaris does this, too.)
 
 So, I've included a patch which does those two things; I've done some
 simple tests with it, but it could use more looking-at, as I had to
 outline from nfs_link() the code that does the actual RPC, as opposed to
 vnode glue.
 
 
 Index: sys/nfs/nfs_var.h
 ===================================================================
 RCS file: /cvsroot/src/sys/nfs/nfs_var.h,v
 retrieving revision 1.49
 diff -u -r1.49 nfs_var.h
 --- sys/nfs/nfs_var.h	27 Jan 2005 11:33:26 -0000	1.49
 +++ sys/nfs/nfs_var.h	30 Jun 2006 05:07:16 -0000
 @@ -126,7 +126,7 @@
  int nfs_readdirrpc __P((struct vnode *, struct uio *, struct ucred *));
  int nfs_readdirplusrpc __P((struct vnode *, struct uio *, struct ucred *));
  int nfs_sillyrename __P((struct vnode *, struct vnode *,
 -			 struct componentname *));
 +			 struct componentname *, int));
  int nfs_lookitup __P((struct vnode *, const char *, int, struct ucred *,
  		      struct proc *, struct nfsnode **));
  int nfs_commit __P((struct vnode *, off_t, uint32_t, struct proc *));
 Index: sys/nfs/nfs_vnops.c
 ===================================================================
 RCS file: /cvsroot/src/sys/nfs/nfs_vnops.c,v
 retrieving revision 1.220.2.1
 diff -u -r1.220.2.1 nfs_vnops.c
 --- sys/nfs/nfs_vnops.c	27 Sep 2005 10:31:29 -0000	1.220.2.1
 +++ sys/nfs/nfs_vnops.c	30 Jun 2006 05:07:16 -0000
 @@ -1881,7 +1881,7 @@
  			error = nfs_removerpc(dvp, cnp->cn_nameptr,
  				cnp->cn_namelen, cnp->cn_cred, cnp->cn_proc);
  	} else if (!np->n_sillyrename)
 -		error = nfs_sillyrename(dvp, vp, cnp);
 +		error = nfs_sillyrename(dvp, vp, cnp, 0);
  	PNBUF_PUT(cnp->cn_pnbuf);
  	if (!error && nfs_getattrcache(vp, &vattr) == 0 &&
  	    vattr.va_nlink == 1) {
 @@ -2000,10 +2000,31 @@
  	 * rename of the new file over it.
  	 */
  	if (tvp && tvp->v_usecount > 1 && !VTONFS(tvp)->n_sillyrename &&
 -		tvp->v_type != VDIR && !nfs_sillyrename(tdvp, tvp, tcnp)) {
 -		VN_KNOTE(tvp, NOTE_DELETE);
 -		vput(tvp);
 -		tvp = NULL;
 +	    tvp->v_type != VDIR) {
 +		/*
 +		 * Make sure the "from" file still exists.
 +		 *
 +		 * Do this here, under the tdvp lock, so concurrent renames
 +		 * with the same args on the same client don't race.
 +		 * 
 +		 * Also, do this here, just before the rename, to minimize
 +		 * the race window against other clients.
 +		 */
 +		error = nfs_lookitup(fdvp, fcnp->cn_nameptr, fcnp->cn_namelen,
 +		    fcnp->cn_cred, fcnp->cn_proc, NULL);
 +		if (error != 0)
 +			goto out;
 +		/*
 +		 * Have sillyrename use link instead of rename if possible,
 +		 * so that we don't lose the file if the rename fails, and so
 +		 * that there's no window when the "to" file doesn't exist.
 +		 */
 +		if (!nfs_sillyrename(tdvp, tvp, tcnp, 1)) {
 +			/* XXX should note deletion only if rename succeeds */
 +			VN_KNOTE(tvp, NOTE_DELETE);
 +			vput(tvp);
 +			tvp = NULL;
 +		}
  	}
  
  	error = nfs_renamerpc(fdvp, fcnp->cn_nameptr, fcnp->cn_namelen,
 @@ -2106,20 +2127,13 @@
  }
  
  /*
 - * nfs hard link create call
 + * NFS link RPC, called from nfs_link below.
 + * Assumes dvp and vp locked, and leaves them that way.
   */
 -int
 -nfs_link(v)
 -	void *v;
 +static int
 +nfs_linkrpc(struct vnode *dvp, struct vnode *vp, const char *nameptr,
 +	    int namelen, struct ucred *cred, struct proc *proc)
  {
 -	struct vop_link_args /* {
 -		struct vnode *a_dvp;
 -		struct vnode *a_vp;
 -		struct componentname *a_cnp;
 -	} */ *ap = v;
 -	struct vnode *vp = ap->a_vp;
 -	struct vnode *dvp = ap->a_dvp;
 -	struct componentname *cnp = ap->a_cnp;
  	u_int32_t *tl;
  	caddr_t cp;
  #ifndef NFS_V2_ONLY
 @@ -2135,36 +2149,22 @@
  	int rexmit;
  	struct nfsnode *np;
  
 -	if (dvp->v_mount != vp->v_mount) {
 -		VOP_ABORTOP(dvp, cnp);
 -		vput(dvp);
 -		return (EXDEV);
 -	}
 -	if (dvp != vp) {
 -		error = vn_lock(vp, LK_EXCLUSIVE);
 -		if (error != 0) {
 -			VOP_ABORTOP(dvp, cnp);
 -			vput(dvp);
 -			return error;
 -		}
 -	}
 -
  	/*
  	 * Push all writes to the server, so that the attribute cache
  	 * doesn't get "out of sync" with the server.
  	 * XXX There should be a better way!
  	 */
 -	VOP_FSYNC(vp, cnp->cn_cred, FSYNC_WAIT, 0, 0, cnp->cn_proc);
 +	VOP_FSYNC(vp, cred, FSYNC_WAIT, 0, 0, proc);
  
  	v3 = NFS_ISV3(vp);
  	nfsstats.rpccnt[NFSPROC_LINK]++;
  	np = VTONFS(vp);
  	nfsm_reqhead(np, NFSPROC_LINK,
 -		NFSX_FH(v3)*2 + NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen));
 +		NFSX_FH(v3)*2 + NFSX_UNSIGNED + nfsm_rndup(namelen));
  	nfsm_fhtom(np, v3);
  	nfsm_fhtom(VTONFS(dvp), v3);
 -	nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
 -	nfsm_request1(np, NFSPROC_LINK, cnp->cn_proc, cnp->cn_cred, &rexmit);
 +	nfsm_strtom(nameptr, namelen, NFS_MAXNAMLEN);
 +	nfsm_request1(np, NFSPROC_LINK, proc, cred, &rexmit);
  #ifndef NFS_V2_ONLY
  	if (v3) {
  		nfsm_postop_attr(vp, attrflag, 0);
 @@ -2172,25 +2172,62 @@
  	}
  #endif
  	nfsm_reqdone;
 -	if (error == 0 || error == EEXIST)
 -		cache_purge1(dvp, cnp, 0);
 -	PNBUF_PUT(cnp->cn_pnbuf);
  	VTONFS(dvp)->n_flag |= NMODIFIED;
  	if (!attrflag)
  		NFS_INVALIDATE_ATTRCACHE(VTONFS(vp));
  	if (!wccflag)
  		NFS_INVALIDATE_ATTRCACHE(VTONFS(dvp));
 -	if (dvp != vp)
 -		VOP_UNLOCK(vp, 0);
  	VN_KNOTE(vp, NOTE_LINK);
  	VN_KNOTE(dvp, NOTE_WRITE);
 -	vput(dvp);
  	/*
  	 * Kludge: Map EEXIST => 0 assuming that it is a reply to a retry.
  	 */
  	if (rexmit && error == EEXIST)
  		error = 0;
 -	return (error);
 +	return error;
 +}
 +
 +/*
 + * nfs hard link create call
 + */
 +int
 +nfs_link(v)
 +	void *v;
 +{
 +	struct vop_link_args /* {
 +		struct vnode *a_dvp;
 +		struct vnode *a_vp;
 +		struct componentname *a_cnp;
 +	} */ *ap = v;
 +	struct vnode *vp = ap->a_vp;
 +	struct vnode *dvp = ap->a_dvp;
 +	struct componentname *cnp = ap->a_cnp;
 +	int error;
 +
 +	if (dvp->v_mount != vp->v_mount) {
 +		VOP_ABORTOP(dvp, cnp);
 +		vput(dvp);
 +		return (EXDEV);
 +	}
 +	if (dvp != vp) {
 +		error = vn_lock(vp, LK_EXCLUSIVE);
 +		if (error != 0) {
 +			VOP_ABORTOP(dvp, cnp);
 +			vput(dvp);
 +			return error;
 +		}
 +	}
 +
 +	error = nfs_linkrpc(dvp, vp, cnp->cn_nameptr, cnp->cn_namelen,
 +	    cnp->cn_cred, cnp->cn_proc);
 +
 +	if (error == 0 || error == EEXIST)
 +		cache_purge1(dvp, cnp, 0);
 +	PNBUF_PUT(cnp->cn_pnbuf);
 +	if (dvp != vp)
 +		VOP_UNLOCK(vp, 0);
 +	vput(dvp);
 +	return error;
  }
  
  /*
 @@ -3032,7 +3069,7 @@
   * nfs_rename() completes, but...
   */
  int
 -nfs_sillyrename(dvp, vp, cnp)
 +nfs_sillyrename(dvp, vp, cnp, use_link)
  	struct vnode *dvp, *vp;
  	struct componentname *cnp;
  {
 @@ -3071,7 +3108,14 @@
  			goto bad;
  		}
  	}
 -	error = nfs_renameit(dvp, cnp, sp);
 +	if (use_link) {
 +		error = nfs_linkrpc(dvp, vp, sp->s_name, sp->s_namlen,
 +		    sp->s_cred, cnp->cn_proc);
 +		if (error == EOPNOTSUPP)
 +			error = nfs_renameit(dvp, cnp, sp);
 +	} else {
 +		error = nfs_renameit(dvp, cnp, sp);
 +	}
  	if (error)
  		goto bad;
  	error = nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred,