Subject: emul vs. symlinks
To: None <tech-kern@NetBSD.org>
From: Quentin Garnier <cube@NetBSD.org>
List: tech-kern
Date: 07/09/2004 23:33:20
--Signature=_Fri__9_Jul_2004_23_33_20_+0200_2Knr28fY6H.k2E1U
Content-Type: multipart/mixed;
 boundary="Multipart=_Fri__9_Jul_2004_23_33_20_+0200_Ttc/IxCvF5Zr6T0R"


--Multipart=_Fri__9_Jul_2004_23_33_20_+0200_Ttc/IxCvF5Zr6T0R
Content-Type: text/plain; charset=US-ASCII
Content-Disposition: inline
Content-Transfer-Encoding: 7bit

Hi folks,

There is an issue with the way the emulation layer works with symlinks.
When opening a file or looking for an interpreter, the emul_find()
function is usually called, and its main purpose is to try and find a file
in the emulation root and fall back on the real root if it isn't found
there.

It doesn't work well with symlinks since that part is completely handled
by namei(), which doesn't know about, and therefore ignores, the emulation
root.

The result is that absolute symlinks are always resolved relative to the
real root, which has some side effects, notably in netbsd32 since we
switched to a dynamic root:  /usr/lib/libc.so (and friends) and
/usr/libexec/ld.elf_so are symlinks to their counterparts in /lib and
/libexec, and an unfortunate choice was made to use absolute symlinks.

That makes it impossible to use 32-bit binaries on a 64-bit system the
same way we use Linux binaries on i386 (Linux uses relative symlinks for
the lib stuff, so the issue never really arose).

My proposal is to add a function in sys/compat/compat_util.c that is a
complete rip-off of namei(), but customised to emul_find()'s needs.  It
actually needs two things:  to use the emulation root in the resolution
process to try and find a file inside that root first, but still fall back
on the real root.  Then, it also needs the path to the file that was found,
since the actual resolution will happen later using the real namei().

So far it works fairly well; I have tested it quickly but not thoroughly.
I have commented all the differences with the real namei(), so refer to
the code for details.

Opinions?

-- 
Quentin Garnier - cube@NetBSD.org
The NetBSD Project - http://www.NetBSD.org/

--Multipart=_Fri__9_Jul_2004_23_33_20_+0200_Ttc/IxCvF5Zr6T0R
Content-Type: text/plain;
 name="diff"
Content-Disposition: attachment;
 filename="diff"
Content-Transfer-Encoding: 7bit

Index: compat_util.c
===================================================================
RCS file: /cvsroot/src/sys/compat/common/compat_util.c,v
retrieving revision 1.27
diff -u -r1.27 compat_util.c
--- compat_util.c	29 Jun 2003 22:29:13 -0000	1.27
+++ compat_util.c	9 Jul 2004 21:06:34 -0000
@@ -39,6 +39,11 @@
 #include <sys/cdefs.h>
 __KERNEL_RCSID(0, "$NetBSD: compat_util.c,v 1.27 2003/06/29 22:29:13 fvdl Exp $");
 
+#if defined(_KERNEL_OPT)
+#include "opt_ktrace.h"
+#include "opt_systrace.h"
+#endif
+
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/namei.h>
@@ -54,9 +59,27 @@
 #include <sys/syslog.h>
 #include <sys/mount.h>
 
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+#ifdef SYSTRACE
+#include <sys/systrace.h>
+#endif
 
 #include <compat/common/compat_util.h>
 
+struct compat_nameidata {	/* argument block for compat_namei() */
+	struct nameidata	cni_ni;		/* ordinary namei state; this is what gets passed to lookup() */
+	struct vnode		*cni_altrootdir; /* vnode of the alternative root; absolute lookups start here */
+	int			cni_flags;	/* COMPAT_NAMEI_* bits, updated by compat_namei() */
+#define	COMPAT_NAMEI_ALTROOT	0x1	/* set while the lookup is rooted at cni_altrootdir */
+};
+
+static int	compat_namei(struct compat_nameidata *);
+#ifdef COMPAT_NAMEI_DEBUG
+int dni;
+#endif
+
 /*
  * Search an alternate path before passing pathname arguments on
  * to system calls. Useful for keeping a separate 'emulation tree'.
@@ -80,7 +103,8 @@
 	const char	**pbuf;
 	int		  sflag;
 {
-	struct nameidata	 nd;
+	struct compat_nameidata	 cnd;
+	struct nameidata	 *ndp = &cnd.cni_ni;
 	struct nameidata	 ndroot;
 	struct vattr		 vat;
 	struct vattr		 vatroot;
@@ -139,22 +163,50 @@
 			;
 		*cp = '\0';
 
-		NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, buf, p);
+		NDINIT(ndp, LOOKUP, FOLLOW, UIO_SYSSPACE, buf, p);
 
-		if ((error = namei(&nd)) != 0)
+		if ((error = namei(ndp)) != 0)
 			goto bad;
 
 		*cp = '/';
 		break;
 	case CHECK_ALT_FL_EXISTS:
 	case CHECK_ALT_FL_SYMLINK:
-		NDINIT(&nd, LOOKUP,	
-			(sflag == CHECK_ALT_FL_SYMLINK) ? NOFOLLOW : FOLLOW,
-			UIO_SYSSPACE, buf, p);
+		NDINIT(&ndroot, LOOKUP, FOLLOW, UIO_SYSSPACE, prefix, p);
 
-		if ((error = namei(&nd)) != 0)
+		if ((error = namei(&ndroot)) != 0)
 			goto bad;
 
+		/* We use SAVENAME because the returned nameidata structure will
+		 * contain the full path of the target, potentially relative to
+		 * the emul root. */
+		NDINIT(ndp, LOOKUP,	
+			((sflag == CHECK_ALT_FL_SYMLINK) ? NOFOLLOW : FOLLOW) | SAVENAME,
+			UIO_SYSSPACE, ptr, p);
+		cnd.cni_altrootdir = ndroot.ni_vp;
+
+#ifdef COMPAT_NAMEI_DEBUG
+		dni = 1;
+#endif
+		error = compat_namei(&cnd);
+#ifdef COMPAT_NAMEI_DEBUG
+		dni = 0;
+#endif
+		if (error != 0)
+			goto bad2;
+#ifdef COMPAT_NAMEI_DEBUG
+		if ((cnd.cni_flags & COMPAT_NAMEI_ALTROOT) == 0) {
+			printf("absolute path\n");
+			ptr = buf;
+		} else
+			printf("relative path\n");
+#else
+		if ((cnd.cni_flags & COMPAT_NAMEI_ALTROOT) == 0)
+			ptr = buf;
+#endif
+		copystr(cnd.cni_ni.ni_cnd.cn_pnbuf, ptr, sz, &len);
+		PNBUF_PUT(cnd.cni_ni.ni_cnd.cn_pnbuf);
+
 		/*
 		 * We now compare the vnode of the emulation root to the one
 		 * vnode asked. If they resolve to be the same, then we
@@ -163,12 +215,7 @@
 		 * root directory and never finding it, because "/" resolves
 		 * to the emulation root directory. This is expensive :-(
 		 */
-		NDINIT(&ndroot, LOOKUP, FOLLOW, UIO_SYSSPACE, prefix, p);
-
-		if ((error = namei(&ndroot)) != 0)
-			goto bad2;
-
-		if ((error = VOP_GETATTR(nd.ni_vp, &vat, p->p_ucred, p)) != 0)
+		if ((error = VOP_GETATTR(ndp->ni_vp, &vat, p->p_ucred, p)) != 0)
 			goto bad3;
 
 		if ((error = VOP_GETATTR(ndroot.ni_vp, &vatroot, p->p_ucred, p))
@@ -184,7 +231,7 @@
 		break;
 	}
 
-	vrele(nd.ni_vp);
+	vrele(ndp->ni_vp);
 	if (sflag == CHECK_ALT_FL_EXISTS)
 		vrele(ndroot.ni_vp);
 
@@ -202,16 +249,22 @@
 			*pbuf = path;
 			goto bad;
 		}
+#ifdef COMPAT_NAMEI_DEBUG
+		printf("sgp; buf = %s\n", buf);
+#endif
 		free(buf, M_TEMP);
 	}
 	return 0;
 
 bad3:
-	vrele(ndroot.ni_vp);
+	vrele(ndp->ni_vp);
 bad2:
-	vrele(nd.ni_vp);
+	vrele(ndroot.ni_vp);
 bad:
 	free(buf, M_TEMP);
+#ifdef COMPAT_NAMEI_DEBUG
+	printf("emul_find: returning %d\n", error);
+#endif
 	return error;
 }
 
@@ -316,3 +369,302 @@
 	    msg, mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname);
 	uprintf("%s: dir offset too large for emulated program\n", msg);
 }
+
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)vfs_lookup.c	8.10 (Berkeley) 5/27/95
+ */
+
+/*
+ * compat_namei: namei() variant used by emul_find().  Absolute paths and
+ * absolute symlink targets are looked up from cndp->cni_altrootdir first;
+ * if that lookup fails it is retried from the process's normal root.  On
+ * return, COMPAT_NAMEI_ALTROOT in cni_flags tells which root was used.
+ * Returns 0 or an errno; see the "namei difference" notes for pnbuf use.
+ */
+static int
+compat_namei(cndp)
+	struct compat_nameidata *cndp;
+{
+	struct cwdinfo *cwdi;		/* pointer to cwd state */
+	char *cp;			/* pointer into pathname argument */
+	struct vnode *dp;		/* the directory we are searching */
+	struct iovec aiov;		/* uio for reading symbolic links */
+	struct uio auio;
+	int error, linklen, restart;
+	struct nameidata *ndp = &cndp->cni_ni;
+	struct componentname *cnp = &ndp->ni_cnd;
+	size_t pathlen = 0;
+	const char *next = NULL;
+	u_long loopcnt = 0;
+
+#ifdef DIAGNOSTIC
+	if (!cnp->cn_cred || !cnp->cn_proc)
+		panic ("namei: bad cred/proc");
+	if (cnp->cn_nameiop & (~OPMASK))
+		panic ("namei: nameiop contaminated with flags");
+	if (cnp->cn_flags & OPMASK)
+		panic ("namei: flags contaminated with nameiops");
+#endif
+	cwdi = cnp->cn_proc->p_cwdi;
+
+	/*
+	 * Get a buffer for the name to be translated, and copy the
+	 * name into the buffer.
+	 */
+	if ((cnp->cn_flags & HASBUF) == 0)
+		cnp->cn_pnbuf = PNBUF_GET();
+	if (ndp->ni_segflg == UIO_SYSSPACE)
+		error = copystr(ndp->ni_dirp, cnp->cn_pnbuf,
+			    MAXPATHLEN, &ndp->ni_pathlen);
+	else
+		error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf,
+			    MAXPATHLEN, &ndp->ni_pathlen);
+
+	/*
+	 * POSIX.1 requirement: "" is not a valid file name.
+	 */
+	if (!error && ndp->ni_pathlen == 1)
+		error = ENOENT;
+
+	if (error) {
+		PNBUF_PUT(cnp->cn_pnbuf);
+		ndp->ni_vp = NULL;
+		return (error);
+	}
+	ndp->ni_loopcnt = 0;
+
+#ifdef KTRACE
+	if (KTRPOINT(cnp->cn_proc, KTR_NAMEI))
+		ktrnamei(cnp->cn_proc, cnp->cn_pnbuf);
+#endif
+#ifdef SYSTRACE
+	if (ISSET(cnp->cn_proc->p_flag, P_SYSTRACE))
+		systrace_namei(ndp);
+#endif
+
+	/* Get starting point for the translation. */
+	if ((ndp->ni_rootdir = cwdi->cwdi_rdir) == NULL)
+		ndp->ni_rootdir = rootvnode;
+	/* Check if starting from root directory or current directory. */
+	/* namei() difference:
+	 *
+	 * The COMPAT_NAMEI_ALTROOT flag is used to tell which root
+	 * we're using for the lookup.  As we're in the compat module,
+	 * we always want to start with the alternative root dir.  But
+	 * we don't necessarily have to, as we may be asked for a
+	 * relative path.
+	 */
+	if (cnp->cn_pnbuf[0] == '/') {
+		dp = cndp->cni_altrootdir;
+		VREF(dp);
+		cndp->cni_flags |= COMPAT_NAMEI_ALTROOT;
+	} else {
+		dp = cwdi->cwdi_cdir;
+		VREF(dp);
+		cndp->cni_flags &= ~COMPAT_NAMEI_ALTROOT;
+	}
+	/* namei() difference:
+	 *
+	 * We want to keep the full resolution of the path.  As an
+	 * intermediate step in the name resolution, we mostly want
+	 * to know if the file exists inside the alternative root.  The
+	 * actual vnode is not our main target.
+	 *
+	 * Keeping the full path (even at the cost of leaving symlinks
+	 * in the middle, we'll do the resolution again later anyway)
+	 * means we cannot simply erase the current path when we get
+	 * a symlink.  Therefore cn_nameptr must be handled with care,
+	 * as we might start from the middle of the string cn_pnbuf
+	 * points to.
+	 */
+	cnp->cn_nameptr = cnp->cn_pnbuf;
+	for (;;) {
+redolookup:
+		if (!dp->v_mount) {
+			/* Give up if the directory is no longer mounted */
+			PNBUF_PUT(cnp->cn_pnbuf);
+			return (ENOENT);
+		}
+		ndp->ni_startdir = dp;
+		/* namei difference:
+		 *
+		 * There might be another lookup, starting from the real
+		 * root dir in case we haven't found the target in the
+		 * alternative root.  So we have to keep around a few
+		 * values to start again.
+		 */
+		if ((cndp->cni_flags & COMPAT_NAMEI_ALTROOT) != 0) {
+			/* Save state in case we have to back out */
+			next = ndp->ni_next;
+			pathlen = ndp->ni_pathlen;
+			loopcnt = ndp->ni_loopcnt;
+		}
+#ifdef COMPAT_NAMEI_DEBUG
+		if (dni) {
+			printf("ni_pl = %lu\n", ndp->ni_pathlen);
+			printf("nameptr = %s\n", cnp->cn_nameptr);
+		}
+#endif
+		if ((error = lookup(ndp)) != 0) {
+			if ((cndp->cni_flags & COMPAT_NAMEI_ALTROOT) != 0) {
+				cndp->cni_flags &= ~COMPAT_NAMEI_ALTROOT;
+				ndp->ni_next = next;
+				ndp->ni_pathlen = pathlen;
+				ndp->ni_loopcnt = loopcnt;
+				dp = ndp->ni_rootdir;
+				VREF(dp);
+				goto redolookup;
+			}
+			PNBUF_PUT(cnp->cn_pnbuf);
+			return (error);
+		}
+		/* Check for symbolic link */
+		if ((cnp->cn_flags & ISSYMLINK) == 0) {
+			if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0)
+				PNBUF_PUT(cnp->cn_pnbuf);
+			else
+				cnp->cn_flags |= HASBUF;
+			return (0);
+		}
+		if ((cnp->cn_flags & LOCKPARENT) && (cnp->cn_flags & ISLASTCN))
+			VOP_UNLOCK(ndp->ni_dvp, 0);
+		if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
+			error = ELOOP;
+			break;
+		}
+		if (ndp->ni_vp->v_mount->mnt_flag & MNT_SYMPERM) {
+			error = VOP_ACCESS(ndp->ni_vp, VEXEC, cnp->cn_cred,
+			    cnp->cn_proc);
+			if (error != 0)
+				break;
+		}
+		/* namei difference
+		 *
+		 * The real namei() allocates a new pnbuf only if there
+		 * are remaining path components, so it can copy them after
+		 * the target of the current symlink.
+		 *
+		 * When it is the last component, namei() does not allocate
+		 * a pnbuf, and simply overwrites the current pnbuf.  We
+		 * cannot afford that here since we want to keep the first
+		 * part of the name resolution, so we unconditionally
+		 * allocate a pnbuf.
+		 *
+		 * Of course, if it is an absolute symlink we will overwrite
+		 * the current pnbuf, but we can't know that before reading
+		 * it.
+		 */
+		cp = PNBUF_GET();
+		aiov.iov_base = cp;
+		aiov.iov_len = MAXPATHLEN;
+		auio.uio_iov = &aiov;
+		auio.uio_iovcnt = 1;
+		auio.uio_offset = 0;
+		auio.uio_rw = UIO_READ;
+		auio.uio_segflg = UIO_SYSSPACE;
+		auio.uio_procp = (struct proc *)0;
+		auio.uio_resid = MAXPATHLEN;
+		error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred);
+		if (error) {
+		badlink:
+			PNBUF_PUT(cp);
+			break;
+		}
+		linklen = MAXPATHLEN - auio.uio_resid;
+		if (linklen == 0) {
+			error = ENOENT;
+			goto badlink;
+		}
+		/* namei difference:
+		 *
+		 * A relative symlink is spliced in at cn_nameptr, so the prefix
+		 * already in pnbuf counts against MAXPATHLEN.  An absolute one
+		 * replaces the whole pnbuf, so only link + remaining path count.
+		 */
+		if (linklen + ndp->ni_pathlen +
+		    (cp[0] != '/' ? (cnp->cn_nameptr - cnp->cn_pnbuf) : 0) >= MAXPATHLEN) {
+			error = ENAMETOOLONG;
+			goto badlink;
+		}
+#ifdef COMPAT_NAMEI_DEBUG
+		if (dni) printf("cp = %s, %d, %lu\n", cp, linklen, ndp->ni_pathlen);
+#endif
+		/* namei difference
+		 *
+		 * We want to keep the full path of the target in the pnbuf,
+		 * so we do whatever is necessary at that point:  we start by
+		 * copying the rest of the path after the resolution of the
+		 * symlink.  Then, if it is an absolute symlink, we erase the
+		 * current pnbuf, but if it is a relative one, we copy the
+		 * content of cp after the last resolved component of the
+		 * path.  That means we can end with '/../' in the path.
+		 *
+		 * While there, take note in 'restart' if it is an absolute
+		 * symlink:  we'll have to restart the resolution from the
+		 * alternative root.
+		 */
+		memcpy(cp + linklen, ndp->ni_next, ndp->ni_pathlen);
+		if (cp[0] == '/') {
+			PNBUF_PUT(cnp->cn_pnbuf);
+			cnp->cn_pnbuf = cp;
+			restart = 1;
+		} else {
+			memcpy((char *)cnp->cn_nameptr, cp, linklen+ndp->ni_pathlen);
+			PNBUF_PUT(cp);
+			restart = 0;
+		}
+		ndp->ni_pathlen += linklen;
+#ifdef COMPAT_NAMEI_DEBUG
+		if (dni) printf("buf = %s, %lu\n", cnp->cn_pnbuf, ndp->ni_pathlen);
+#endif
+		vput(ndp->ni_vp);
+		dp = ndp->ni_dvp;
+		/*
+		 * Check if root directory should replace current directory.
+		 */
+		if (restart) {
+			vrele(dp);
+			dp = cndp->cni_altrootdir;
+			cndp->cni_flags |= COMPAT_NAMEI_ALTROOT;
+			VREF(dp);
+			cnp->cn_nameptr = cnp->cn_pnbuf;
+		}
+	}
+	PNBUF_PUT(cnp->cn_pnbuf);
+	vrele(ndp->ni_dvp);
+	vput(ndp->ni_vp);
+	ndp->ni_vp = NULL;
+	return (error);
+}

--Multipart=_Fri__9_Jul_2004_23_33_20_+0200_Ttc/IxCvF5Zr6T0R--

--Signature=_Fri__9_Jul_2004_23_33_20_+0200_2Knr28fY6H.k2E1U
Content-Type: application/pgp-signature

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.4 (NetBSD)

iQEVAwUBQO8PJdgoQloHrPnoAQIr5AgAjAEGkwN2pzwC5r0zoe0LFDa+zETbIvgo
N4U28jXTr2dKJOrPKvKH3zoj01b5a5JPPrysL4qSRukJgU1NKZgRjAzoPiz96UEL
Zz4Tofj/CFexydtVHXa7EfVHi7Tyg4mlZdY6fQNiKCCW9GzCF4JuphHT1lWpo0XE
5Au0u7c4PkK49vvNwEzexVjfONyFUvuyR272cvS3Oj3WEXq2+LHfiuQnSLNtfX/s
iqH2+QLqvv8ZWCVnKv/12iUb+PdxcjQChu/9VhqkuhX3E/aRmtWGTbKFDD09qWQs
dIdquo4JiVHeL078VRkOClm37EnDYsmZxUjkrcwBORi8atex9G7n0Q==
=BZOS
-----END PGP SIGNATURE-----

--Signature=_Fri__9_Jul_2004_23_33_20_+0200_2Knr28fY6H.k2E1U--