Subject: Split of syscall code (please comment)
To: None <tech-kern@netbsd.org>
From: Jaromír Dolecek <dolecek@ics.muni.cz>
List: tech-kern
Date: 11/25/2000 10:29:32
--ELM975144572-885-0_
Content-Transfer-Encoding: 7bit
Content-Type: text/plain; charset=US-ASCII

Hi,
The compile-time COMPAT_FOO dependencies and comparisons against &emul_foo are
evil for LKMs. These are present in MD code, in trap.c or machdep.c.  Some
of those checks are "right" to be done via an emulation flag (like whether the
emulation has SYS___syscall, or stores the parent id for get*id() calls in the
return structure (see kern_prot.c)). However, many of those
emulation-specific quirks are not "right" to be done via flags,
since they are specific to that particular emulation, or are even just MD.
So I think it makes sense to split the syscall() code, so that each
emulation which needs a separate syscall() can provide one.

The question here is what the best way to achieve this is. Ideally,
there would be no code duplication at all; in any case
it's not wise to copy the syscall() code to n emulation-dependent files.

I've implemented and tested one of the possible ways for i386. It
doesn't feel right, but works OK. Frank van den Linden reviewed
the code and has similarly mixed feelings about it.

The e-mail attachment contains a patch to trap.c along with the new
linux_syscall.c and ibcs2_syscall.c code. For reference, I've also appended
the new trap.c as a whole.

Do you have a better solution, or do you think that the presented
solution is good enough?

Jaromir
-- 
Jaromir Dolecek <jdolecek@NetBSD.org>      http://www.ics.muni.cz/~dolecek/
@@@@  Wanna a real operating system ? Go and get NetBSD, damn!  @@@@

--ELM975144572-885-0_
Content-Transfer-Encoding: 7bit
Content-Type: text/plain; charset=ISO-8859-2
Content-Disposition: attachment; filename=trap.diff

Index: trap.c
===================================================================
RCS file: /cvsroot/syssrc/sys/arch/i386/i386/trap.c,v
retrieving revision 1.142
diff -u -r1.142 trap.c
--- trap.c	2000/11/21 21:27:04	1.142
+++ trap.c	2000/11/25 09:25:03
@@ -78,6 +78,7 @@
  * 386 Trap and System call handling
  */
 
+#if defined(_KERNEL) && !defined(_LKM)
 #include "opt_ddb.h"
 #include "opt_syscall_debug.h"
 #include "opt_execfmt.h"
@@ -85,10 +86,7 @@
 #include "opt_vm86.h"
 #include "opt_ktrace.h"
 #include "opt_cputype.h"
-#include "opt_compat_freebsd.h"
-#include "opt_compat_linux.h"
-#include "opt_compat_ibcs2.h"
-#include "opt_compat_aout.h"
+#endif
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -113,6 +111,7 @@
 #include <machine/db_machdep.h>
 #endif
 
+#if defined(_KERNEL) && !defined(_LKM)
 #include "mca.h"
 #if NMCA > 0
 #include <machine/mca_machdep.h>
@@ -123,36 +122,42 @@
 #ifdef KGDB
 #include <sys/kgdb.h>
 #endif
+
+#include "npx.h"
 
-#ifdef COMPAT_IBCS2
+#endif /* KERNEL && !LKM */
+
+#ifdef COMPAT_IBCS2_SOURCE
 #include <sys/exec_elf.h>
 #include <compat/ibcs2/ibcs2_errno.h>
 #include <compat/ibcs2/ibcs2_exec.h>
-extern struct emul emul_ibcs2;
+#define syscall		ibcs2_syscall
 #endif
 
-#ifdef COMPAT_LINUX
+#ifdef COMPAT_LINUX_SOURCE
 # include <sys/exec.h>
 # include <compat/linux/linux_syscall.h>
-extern struct emul emul_linux;
-#endif /* COMPAT_LINUX */
+#define syscall		linux_syscall
+#endif /* COMPAT_LINUX_SOURCE */
+
+static __inline void userret __P((struct proc *, int, u_quad_t));
 
-#ifdef COMPAT_FREEBSD
-extern struct emul emul_freebsd;
-#endif /* COMPAT_FREEBSD */
-
-#ifdef COMPAT_AOUT
-extern struct emul emul_netbsd_aout;
-#endif /* COMPAT_AOUT */
+#ifdef WANT_COMPAT_SYSCALL
 
-#include "npx.h"
+/* if compat syscall, the frame is passed via pointer */
+#define FRAME		(*frame)
+void syscall __P((struct trapframe *));
 
-static __inline void userret __P((struct proc *, int, u_quad_t));
+#else /* ! WANT_COMPAT_SYSCALL */
+
+#define FRAME		frame
+void syscall __P((struct trapframe));
 void trap __P((struct trapframe));
 #if defined(I386_CPU)
 int trapwrite __P((unsigned));
 #endif
-void syscall __P((struct trapframe));
+
+#endif /* WANT_COMPAT_SYSCALL */
 
 /*
  * Define the code needed before returning to user mode, for
@@ -191,6 +196,7 @@
 	curcpu()->ci_schedstate.spc_curpriority = p->p_priority;
 }
 
+#ifndef WANT_COMPAT_SYSCALL
 const char *trap_type[] = {
 	"privileged instruction fault",		/*  0 T_PRIVINFLT */
 	"breakpoint trap",			/*  1 T_BPTFLT */
@@ -588,6 +594,8 @@
 }
 #endif /* I386_CPU */
 
+#endif /*  WANT_COMPAT_SYSCALL */
+
 /*
  * syscall(frame):
  *	System call request from POSIX system call gate interface to kernel.
@@ -596,7 +604,7 @@
 /*ARGSUSED*/
 void
 syscall(frame)
-	struct trapframe frame;
+	struct trapframe FRAME;
 {
 	register caddr_t params;
 	register const struct sysent *callp;
@@ -605,40 +613,32 @@
 	size_t argsize;
 	register_t code, args[8], rval[2];
 	u_quad_t sticks;
-#ifdef COMPAT_LINUX
-	int linux;
-#endif /* COMPAT_LINUX */
-#ifdef COMPAT_FREEBSD
-	int freebsd;
-#endif /* COMPAT_FREEBSD */
+
+	p = curproc;
+#ifndef WANT_COMPAT_SYSCALL
+	if (p->p_emul->e_syscall) {
+		p->p_emul->e_syscall(&frame);
+		return;
+	}
+#endif
 
 	uvmexp.syscalls++;
-	if (!USERMODE(frame.tf_cs, frame.tf_eflags))
+	if (!USERMODE(FRAME.tf_cs, FRAME.tf_eflags))
 		panic("syscall");
-	p = curproc;
 	sticks = p->p_sticks;
-	p->p_md.md_regs = &frame;
-	opc = frame.tf_eip;
-	code = frame.tf_eax;
+	p->p_md.md_regs = &FRAME;
+	opc = FRAME.tf_eip;
+	code = FRAME.tf_eax;
 
 	nsys = p->p_emul->e_nsysent;
 	callp = p->p_emul->e_sysent;
 
-#ifdef COMPAT_LINUX
-	linux = (p->p_emul == &emul_linux);
-#endif /* COMPAT_LINUX */
-
-#ifdef COMPAT_FREEBSD
-	freebsd = (p->p_emul == &emul_freebsd);
-#endif /* COMPAT_FREEBSD */
-
-#ifdef COMPAT_IBCS2
-	if (p->p_emul == &emul_ibcs2) {
-		if (IBCS2_HIGH_SYSCALL(code))
-			code = IBCS2_CVT_HIGH_SYSCALL(code);
-	}
-#endif /* COMPAT_IBCS2 */
-	params = (caddr_t)frame.tf_esp + sizeof(int);
+#ifdef COMPAT_IBCS2_SOURCE
+	if (IBCS2_HIGH_SYSCALL(code))
+		code = IBCS2_CVT_HIGH_SYSCALL(code);
+#endif /* COMPAT_IBCS2_SOURCE */
+
+	params = (caddr_t)FRAME.tf_esp + sizeof(int);
 
 #ifdef VM86
 	/*
@@ -646,81 +646,72 @@
 	 * it get a SIGSYS and have the VM86 handler in the process take care
 	 * of it.
 	 */
-	if (frame.tf_eflags & PSL_VM)
+	if (FRAME.tf_eflags & PSL_VM)
 		code = -1;
 	else
 #endif /* VM86 */
 
 	switch (code) {
 	case SYS_syscall:
-#ifdef COMPAT_LINUX
+#ifdef COMPAT_LINUX_SOURCE
 		/* Linux has a special system setup call as number 0 */
-		if (linux)
-			break;
-#endif /* COMPAT_LINUX */
+		break;
+#endif /* COMPAT_LINUX_SOURCE */
 		/*
 		 * Code is first argument, followed by actual args.
 		 */
 		code = fuword(params);
 		params += sizeof(int);
 		break;
+#if !defined(COMPAT_IBCS2_SOURCE) && !defined(COMPAT_LINUX_SOURCE)
 	case SYS___syscall:
 		/*
 		 * Like syscall, but code is a quad, so as to maintain
 		 * quad alignment for the rest of the arguments.
 		 */
-		if (callp == sysent 	/* Native */
-#ifdef COMPAT_FREEBSD
-		    || freebsd		/* FreeBSD has the same function */
-#endif
-#ifdef COMPAT_AOUT
-		    || (p->p_emul == &emul_netbsd_aout)	/* Our a.out */
-#endif
-		    ) {
+		if (p->p_emul->e_flags & EMUL_HAS___SYSCALL) {
 			code = fuword(params + _QUAD_LOWWORD * sizeof(int));
 			params += sizeof(quad_t);
 		}
 		break;
+#endif /* !COMPAT_IBCS2_SOURCE && !COMPAT_LINUX_SOURCE */
 	default:
 		break;
 	}
-	if (code < 0 || code >= nsys)
+	if (code < 0 || code >= nsys) {
 		callp += p->p_emul->e_nosys;		/* illegal */
-	else
+		printf("p_emul->e_nosys\n");
+	} else
 		callp += code;
 	argsize = callp->sy_argsize;
 	if (argsize) {
-#ifdef COMPAT_LINUX
-		if (linux) {
-			/*
-			 * Linux passes the args in ebx, ecx, edx, esi, edi, in
-			 * increasing order.
-			 */
-			switch (argsize >> 2) {
-			case 5:
-				args[4] = frame.tf_edi;
-			case 4:
-				args[3] = frame.tf_esi;
-			case 3:
-				args[2] = frame.tf_edx;
-			case 2:
-				args[1] = frame.tf_ecx;
-			case 1:
-				args[0] = frame.tf_ebx;
-				break;
-			default:
-				panic("linux syscall bogus argument size %d",
-				    argsize);
-				break;
-			}
-		}
-		else
-#endif /* COMPAT_LINUX */
-		{
-			error = copyin(params, (caddr_t)args, argsize);
-			if (error)
-				goto bad;
+#ifdef COMPAT_LINUX_SOURCE
+		/*
+		 * Linux passes the args in ebx, ecx, edx, esi, edi, in
+		 * increasing order.
+		 */
+		switch (argsize >> 2) {
+		case 5:
+			args[4] = FRAME.tf_edi;
+		case 4:
+			args[3] = FRAME.tf_esi;
+		case 3:
+			args[2] = FRAME.tf_edx;
+		case 2:
+			args[1] = FRAME.tf_ecx;
+		case 1:
+			args[0] = FRAME.tf_ebx;
+			break;
+		default:
+			panic("linux syscall bogus argument size %d",
+			    argsize);
+			break;
 		}
+#else /* !COMPAT_LINUX_SOURCE */
+		error = copyin(params, (caddr_t)args, argsize);
+		if (error)
+			goto bad;
+#endif /* COMPAT_LINUX_SOURCE */
 	}
 #ifdef SYSCALL_DEBUG
 	scdebug_call(p, code, args);
@@ -730,7 +721,7 @@
 		ktrsyscall(p, code, argsize, args);
 #endif /* KTRACE */
 	rval[0] = 0;
-	rval[1] = frame.tf_edx;
+	rval[1] = FRAME.tf_edx;
 	error = (*callp->sy_call)(p, args, rval);
 	switch (error) {
 	case 0:
@@ -739,12 +730,11 @@
 		 * if this is a child returning from fork syscall.
 		 */
 		p = curproc;
-		frame.tf_eax = rval[0];
-#ifdef COMPAT_LINUX
-		if (!linux)
-#endif /* COMPAT_LINUX */
-			frame.tf_edx = rval[1];
-		frame.tf_eflags &= ~PSL_C;	/* carry bit */
+		FRAME.tf_eax = rval[0];
+#ifndef COMPAT_LINUX_SOURCE
+		FRAME.tf_edx = rval[1];
+#endif
+		FRAME.tf_eflags &= ~PSL_C;	/* carry bit */
 		break;
 	case ERESTART:
 		/*
@@ -752,30 +742,33 @@
 		 * the kernel through the trap or call gate.  We pushed the
 		 * size of the instruction into tf_err on entry.
 		 */
-		frame.tf_eip = opc - frame.tf_err;
+		FRAME.tf_eip = opc - FRAME.tf_err;
 		break;
 	case EJUSTRETURN:
 		/* nothing to do */
 		break;
 	default:
+#ifndef COMPAT_LINUX_SOURCE
 	bad:
+#endif
 		if (p->p_emul->e_errno)
 			error = p->p_emul->e_errno[error];
-		frame.tf_eax = error;
-		frame.tf_eflags |= PSL_C;	/* carry bit */
+		FRAME.tf_eax = error;
+		FRAME.tf_eflags |= PSL_C;	/* carry bit */
 		break;
 	}
 
 #ifdef SYSCALL_DEBUG
 	scdebug_ret(p, code, error, rval);
 #endif /* SYSCALL_DEBUG */
-	userret(p, frame.tf_eip, sticks);
+	userret(p, FRAME.tf_eip, sticks);
 #ifdef KTRACE
 	if (KTRPOINT(p, KTR_SYSRET))
 		ktrsysret(p, code, error, rval[0]);
 #endif /* KTRACE */
 }
 
+#ifndef WANT_COMPAT_SYSCALL
 void
 child_return(arg)
 	void *arg;
@@ -792,3 +785,4 @@
 		ktrsysret(p, SYS_fork, 0, 0);
 #endif
 }
+#endif /* WANT_COMPAT_SYSCALL */
Index: ibcs2_syscall.c
===================================================================
RCS file: ibcs2_syscall.c
diff -N ibcs2_syscall.c
--- /dev/null	Sat Nov 25 10:14:40 2000
+++ ibcs2_syscall.c	Sat Nov 25 11:25:03 2000
@@ -0,0 +1,42 @@
+/* $NetBSD$ */
+
+/*-
+ * Copyright (c) 2000 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed by the NetBSD  
+ *      Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * IBCS2 syscall code - common syscall() with IBCS2 quirks turned on.
+ */
+
+#define WANT_COMPAT_SYSCALL
+#define COMPAT_IBCS2_SOURCE
+
+#include <arch/i386/i386/trap.c>
Index: linux_syscall.c
===================================================================
RCS file: linux_syscall.c
diff -N linux_syscall.c
--- /dev/null	Sat Nov 25 10:14:40 2000
+++ linux_syscall.c	Sat Nov 25 11:25:03 2000
@@ -0,0 +1,42 @@
+/* $NetBSD$ */
+
+/*-
+ * Copyright (c) 2000 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed by the NetBSD  
+ *      Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Linux syscall code - common syscall() with Linux quirks turned on.
+ */
+
+#define WANT_COMPAT_SYSCALL
+#define COMPAT_LINUX_SOURCE
+
+#include <arch/i386/i386/trap.c>

--ELM975144572-885-0_
Content-Transfer-Encoding: 7bit
Content-Type: text/plain; charset=ISO-8859-2
Content-Disposition: attachment; filename=trap.c

/*	$NetBSD: trap.c,v 1.142 2000/11/21 21:27:04 jdolecek Exp $	*/

/*-
 * Copyright (c) 1998 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Charles M. Hannum.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *        This product includes software developed by the NetBSD
 *        Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the University of Utah, and William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)trap.c	7.4 (Berkeley) 5/13/91
 */

/*
 * 386 Trap and System call handling
 */

#if defined(_KERNEL) && !defined(_LKM)
#include "opt_ddb.h"
#include "opt_syscall_debug.h"
#include "opt_execfmt.h"
#include "opt_math_emulate.h"
#include "opt_vm86.h"
#include "opt_ktrace.h"
#include "opt_cputype.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/user.h>
#include <sys/acct.h>
#include <sys/kernel.h>
#include <sys/signal.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <sys/syscall.h>

#include <uvm/uvm_extern.h>

#include <machine/cpu.h>
#include <machine/cpufunc.h>
#include <machine/psl.h>
#include <machine/reg.h>
#include <machine/trap.h>
#ifdef DDB
#include <machine/db_machdep.h>
#endif

#if defined(_KERNEL) && !defined(_LKM)
#include "mca.h"
#if NMCA > 0
#include <machine/mca_machdep.h>
#endif

#include "isa.h"

#ifdef KGDB
#include <sys/kgdb.h>
#endif

#include "npx.h"

#endif /* KERNEL && !LKM */

#ifdef COMPAT_IBCS2_SOURCE
#include <sys/exec_elf.h>
#include <compat/ibcs2/ibcs2_errno.h>
#include <compat/ibcs2/ibcs2_exec.h>
#define syscall		ibcs2_syscall
#endif

#ifdef COMPAT_LINUX_SOURCE
# include <sys/exec.h>
# include <compat/linux/linux_syscall.h>
#define syscall		linux_syscall
#endif /* COMPAT_LINUX_SOURCE */

static __inline void userret __P((struct proc *, int, u_quad_t));

#ifdef WANT_COMPAT_SYSCALL

/* if compat syscall, the frame is passed via pointer */
#define FRAME		(*frame)
void syscall __P((struct trapframe *));

#else /* ! WANT_COMPAT_SYSCALL */

#define FRAME		frame
void syscall __P((struct trapframe));
void trap __P((struct trapframe));
#if defined(I386_CPU)
int trapwrite __P((unsigned));
#endif

#endif /* WANT_COMPAT_SYSCALL */

/*
 * Define the code needed before returning to user mode, for
 * trap and syscall.
 */
static __inline void
userret(p, pc, oticks)
	register struct proc *p;
	int pc;
	u_quad_t oticks;
{
	int signo;

	/* Deliver every signal that is currently pending. */
	for (signo = CURSIG(p); signo != 0; signo = CURSIG(p))
		postsig(signo);

	p->p_priority = p->p_usrpri;

	if (want_resched) {
		/*
		 * A reschedule was requested: give up the CPU, then
		 * deliver whatever signals became pending meanwhile.
		 */
		preempt(NULL);
		for (signo = CURSIG(p); signo != 0; signo = CURSIG(p))
			postsig(signo);
	}

	/*
	 * When the process is being profiled, bill the system ticks
	 * accumulated since `oticks' was sampled to the trapped pc.
	 */
	if (p->p_flag & P_PROFIL) {
		extern int psratio;
		int elapsed = (int)(p->p_sticks - oticks);

		addupc_task(p, pc, elapsed * psratio);
	}

	curcpu()->ci_schedstate.spc_curpriority = p->p_priority;
}

#ifndef WANT_COMPAT_SYSCALL
/*
 * Human-readable names for the trap numbers, indexed by trap number.
 * trap_types holds the number of entries so that callers can range-check
 * a trap number before indexing the table.
 */
const char *trap_type[] = {
	"privileged instruction fault",		/* T_PRIVINFLT	=  0 */
	"breakpoint trap",			/* T_BPTFLT	=  1 */
	"arithmetic trap",			/* T_ARITHTRAP	=  2 */
	"asynchronous system trap",		/* T_ASTFLT	=  3 */
	"protection fault",			/* T_PROTFLT	=  4 */
	"trace trap",				/* T_TRCTRAP	=  5 */
	"page fault",				/* T_PAGEFLT	=  6 */
	"alignment fault",			/* T_ALIGNFLT	=  7 */
	"integer divide fault",			/* T_DIVIDE	=  8 */
	"non-maskable interrupt",		/* T_NMI	=  9 */
	"overflow trap",			/* T_OFLOW	= 10 */
	"bounds check fault",			/* T_BOUND	= 11 */
	"FPU not available fault",		/* T_DNA	= 12 */
	"double fault",				/* T_DOUBLEFLT	= 13 */
	"FPU operand fetch fault",		/* T_FPOPFLT	= 14 */
	"invalid TSS fault",			/* T_TSSFLT	= 15 */
	"segment not present fault",		/* T_SEGNPFLT	= 16 */
	"stack fault",				/* T_STKFLT	= 17 */
	"reserved trap",			/* T_RESERVED	= 18 */
};
int	trap_types = sizeof(trap_type) / sizeof(trap_type[0]);

#ifdef DEBUG
/* When non-zero, trap() prints the frame of every trap it handles. */
int	trapdebug = 0;
#endif

/*
 * trap(frame):
 *	Exception, fault, and trap interface to BSD kernel. This
 * common code is called from assembly language IDT gate entry
 * routines that prepare a suitable stack frame, and restore this
 * frame after the exception has been processed. Note that the
 * effect is as if the arguments were passed call by reference.
 *
 * The trap number is taken from frame.tf_trapno; traps that did not
 * come from kernel mode additionally get T_USER or'ed in, so the
 * switch below has separate cases for kernel and user traps.  User
 * traps leave through userret(); kernel traps simply return.  Anything
 * that cannot be handled ends up at we_re_toast and panics.
 */
/*ARGSUSED*/
void
trap(frame)
	struct trapframe frame;
{
	register struct proc *p = curproc;
	int type = frame.tf_trapno;
	u_quad_t sticks;
	struct pcb *pcb = NULL;
	extern char fusubail[],
		    resume_iret[], resume_pop_ds[], resume_pop_es[];
	struct trapframe *vframe;
	int resume;

	uvmexp.traps++;

#ifdef DEBUG
	if (trapdebug) {
		printf("trap %d code %x eip %x cs %x eflags %x cr2 %x cpl %x\n",
		    frame.tf_trapno, frame.tf_err, frame.tf_eip, frame.tf_cs,
		    frame.tf_eflags, rcr2(), cpl);
		printf("curproc %p\n", curproc);
	}
#endif

	/*
	 * Not a kernel-mode trap: mark it as a user trap, remember the
	 * system tick count for profiling in userret(), and record where
	 * the process's saved registers live.
	 */
	if (!KERNELMODE(frame.tf_cs, frame.tf_eflags)) {
		type |= T_USER;
		sticks = p->p_sticks;
		p->p_md.md_regs = &frame;
	}
	else
		sticks = 0;

	switch (type) {

	default:
	we_re_toast:
		/*
		 * Unhandled or fatal trap.  Give the configured debuggers
		 * a chance to claim it; otherwise report and panic.
		 */
#ifdef KGDB
		if (kgdb_trap(type, &frame))
			return;
		else {
			/*
			 * If this is a breakpoint, don't panic
			 * if we're not connected.
			 */
			if (type == T_BPTFLT) {
				printf("kgdb: ignored %s\n", trap_type[type]);
				return;
			}
		}
#endif
#ifdef DDB
		if (kdb_trap(type, 0, &frame))
			return;
#endif
		if (frame.tf_trapno < trap_types)
			printf("fatal %s", trap_type[frame.tf_trapno]);
		else
			printf("unknown trap %d", frame.tf_trapno);
		printf(" in %s mode\n", (type & T_USER) ? "user" : "supervisor");
		printf("trap type %d code %x eip %x cs %x eflags %x cr2 %x cpl %x\n",
		    type, frame.tf_err, frame.tf_eip, frame.tf_cs, frame.tf_eflags, rcr2(), cpl);

		panic("trap");
		/*NOTREACHED*/

	case T_PROTFLT:
	case T_SEGNPFLT:
	case T_ALIGNFLT:
		/* Check for copyin/copyout fault. */
		pcb = &p->p_addr->u_pcb;
		if (pcb->pcb_onfault != 0) {
		copyfault:
			/*
			 * A fault handler is registered: resume there
			 * instead of at the faulting instruction.  Also
			 * reached by goto from the page fault cases.
			 */
			frame.tf_eip = (int)pcb->pcb_onfault;
			return;
		}

		/*
		 * Check for failure during return to user mode.
		 *
		 * We do this by looking at the instruction we faulted on.  The
		 * specific instructions we recognize only happen when
		 * returning from a trap, syscall, or interrupt.
		 *
		 * XXX
		 * The heuristic used here will currently fail for the case of
		 * one of the 2 pop instructions faulting when returning from a
		 * a fast interrupt.  This should not be possible.  It can be
		 * fixed by rearranging the trap frame so that the stack format
		 * at this point is the same as on exit from a `slow'
		 * interrupt.
		 */
		switch (*(u_char *)frame.tf_eip) {
		case 0xcf:	/* iret */
			vframe = (void *)((int)&frame.tf_esp - 44);
			resume = (int)resume_iret;
			break;
		case 0x1f:	/* popl %ds */
			vframe = (void *)((int)&frame.tf_esp - 4);
			resume = (int)resume_pop_ds;
			break;
		case 0x07:	/* popl %es */
			vframe = (void *)((int)&frame.tf_esp - 0);
			resume = (int)resume_pop_es;
			break;
		default:
			goto we_re_toast;
		}
		/*
		 * vframe is the frame that was being restored; if it is a
		 * kernel-mode frame this was not a return to user mode.
		 */
		if (KERNELMODE(vframe->tf_cs, vframe->tf_eflags))
			goto we_re_toast;

		frame.tf_eip = resume;
		return;

	case T_PROTFLT|T_USER:		/* protection fault */
#ifdef VM86
		if (frame.tf_eflags & PSL_VM) {
			vm86_gpfault(p, type & ~T_USER);
			goto out;
		}
#endif
		/* FALLTHROUGH */
	case T_TSSFLT|T_USER:
	case T_SEGNPFLT|T_USER:
	case T_STKFLT|T_USER:
	case T_ALIGNFLT|T_USER:
	case T_NMI|T_USER:
		trapsignal(p, SIGBUS, type &~ T_USER);
		goto out;

	case T_PRIVINFLT|T_USER:	/* privileged instruction fault */
	case T_FPOPFLT|T_USER:		/* coprocessor operand fault */
		trapsignal(p, SIGILL, type &~ T_USER);
		goto out;

	case T_ASTFLT|T_USER:		/* Allow process switch */
		uvmexp.softs++;
		if (p->p_flag & P_OWEUPC) {
			p->p_flag &= ~P_OWEUPC;
			ADDUPROF(p);
		}
		goto out;

	case T_DNA|T_USER: {
#ifdef MATH_EMULATE
		int rv;
		/*
		 * math_emulate() returns 0 on success, otherwise the
		 * signal number that is posted to the process below.
		 */
		if ((rv = math_emulate(&frame)) == 0) {
			if (frame.tf_eflags & PSL_T)
				goto trace;
			return;
		}
		trapsignal(p, rv, type &~ T_USER);
		goto out;
#else
		printf("pid %d killed due to lack of floating point\n",
		    p->p_pid);
		trapsignal(p, SIGKILL, type &~ T_USER);
		goto out;
#endif
	}

	case T_BOUND|T_USER:
	case T_OFLOW|T_USER:
	case T_DIVIDE|T_USER:
		trapsignal(p, SIGFPE, type &~ T_USER);
		goto out;

	case T_ARITHTRAP|T_USER:
		trapsignal(p, SIGFPE, frame.tf_err);
		goto out;

	case T_PAGEFLT:			/* allow page faults in kernel mode */
		if (p == 0)
			goto we_re_toast;
		pcb = &p->p_addr->u_pcb;
		/*
		 * fusubail is used by [fs]uswintr() to prevent page faulting
		 * from inside the profiling interrupt.
		 */
		if (pcb->pcb_onfault == fusubail)
			goto copyfault;
#if 0
		/* XXX - check only applies to 386's and 486's with WP off */
		if (frame.tf_err & PGEX_P)
			goto we_re_toast;
#endif
		/* FALLTHROUGH */

	case T_PAGEFLT|T_USER: {	/* page fault */
		register vaddr_t va;
		register struct vmspace *vm = p->p_vmspace;
		register vm_map_t map;
		int rv;
		vm_prot_t ftype;
		extern vm_map_t kernel_map;
		unsigned nss;

		if (vm == NULL)
			goto we_re_toast;
		/* Page-aligned faulting address, taken from %cr2. */
		va = trunc_page((vaddr_t)rcr2());
		/*
		 * It is only a kernel address space fault iff:
		 *	1. (type & T_USER) == 0  and
		 *	2. pcb_onfault not set or
		 *	3. pcb_onfault set but supervisor space fault
		 * The last can occur during an exec() copyin where the
		 * argument space is lazy-allocated.
		 */
		if (type == T_PAGEFLT && va >= KERNBASE)
			map = kernel_map;
		else
			map = &vm->vm_map;
		if (frame.tf_err & PGEX_W)
			ftype = VM_PROT_READ | VM_PROT_WRITE;
		else
			ftype = VM_PROT_READ;

#ifdef DIAGNOSTIC
		if (map == kernel_map && va == 0) {
			printf("trap: bad kernel access at %lx\n", va);
			goto we_re_toast;
		}
#endif

		/*
		 * nss stays 0 unless the address lies in the user stack
		 * region, in which case it is the stack size (in clicks)
		 * needed to cover the faulting address.
		 */
		nss = 0;
		if ((caddr_t)va >= vm->vm_maxsaddr
		    && (caddr_t)va < (caddr_t)VM_MAXUSER_ADDRESS
		    && map != kernel_map) {
			nss = btoc(USRSTACK-(unsigned)va);
			if (nss > btoc(p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
				/*
				 * We used to fail here. However, it may
				 * just have been an mmap()ed page low
				 * in the stack, which is legal. If it
				 * wasn't, uvm_fault() will fail below.
				 *
				 * Set nss to 0, since this case is not
				 * a "stack extension".
				 */
				nss = 0;
			}
		}

		/* Fault the original page in. */
		rv = uvm_fault(map, va, 0, ftype);
		if (rv == KERN_SUCCESS) {
			/* Record the new stack size if the stack grew. */
			if (nss > vm->vm_ssize)
				vm->vm_ssize = nss;

			if (type == T_PAGEFLT)
				return;
			goto out;
		}

		/*
		 * The fault could not be resolved.  In kernel mode, fall
		 * back to the registered fault handler or panic; in user
		 * mode, signal the process.
		 */
		if (type == T_PAGEFLT) {
			if (pcb->pcb_onfault != 0)
				goto copyfault;
			printf("uvm_fault(%p, 0x%lx, 0, %d) -> %x\n",
			    map, va, ftype, rv);
			goto we_re_toast;
		}
		if (rv == KERN_RESOURCE_SHORTAGE) {
			printf("UVM: pid %d (%s), uid %d killed: out of swap\n",
			       p->p_pid, p->p_comm,
			       p->p_cred && p->p_ucred ?
			       p->p_ucred->cr_uid : -1);
			trapsignal(p, SIGKILL, T_PAGEFLT);
		} else {
			trapsignal(p, SIGSEGV, T_PAGEFLT);
		}
		break;
	}

#if !defined(DDB) && !defined(KGDB)
	/* XXX need to deal with this when DDB is present, too */
	case T_TRCTRAP:	/* kernel trace trap; someone single stepping lcall's */
			/* syscall has to turn off the trace bit itself */
		return;
#endif

	case T_BPTFLT|T_USER:		/* bpt instruction fault */
	case T_TRCTRAP|T_USER:		/* trace trap */
#ifdef MATH_EMULATE
	trace:
#endif
		trapsignal(p, SIGTRAP, type &~ T_USER);
		break;

#if	NISA > 0 || NMCA > 0
	case T_NMI:
#if defined(KGDB) || defined(DDB)
		/* NMI can be hooked up to a pushbutton for debugging */
		printf ("NMI ... going to debugger\n");
#ifdef KGDB

		if (kgdb_trap(type, &frame))
			return;
#endif
#ifdef DDB
		if (kdb_trap(type, 0, &frame))
			return;
#endif
#endif /* KGDB || DDB */
		/* machine/parity/power fail/"kitchen sink" faults */

#if NMCA > 0
		/* mca_nmi() takes care to call isa_nmi() if appropriate */
		if (mca_nmi() != 0)
			goto we_re_toast;
		else
			return;
#else /* NISA > 0 */
		if (isa_nmi() != 0)
			goto we_re_toast;
		else
			return;
#endif /* NMCA > 0 */
#endif /* NISA > 0 || NMCA > 0 */
	}

	/* Cases that `break' out of the switch: only user traps need userret(). */
	if ((type & T_USER) == 0)
		return;
out:
	userret(p, frame.tf_eip, sticks);
}

#if defined(I386_CPU)
/*
 * Compensate for 386 brain damage (missing URKR)
 *
 * Fault the page containing user address `addr' into the current
 * process's map with read/write access, and record stack growth when
 * the page lies in the stack region and within the stack rlimit.
 *
 * Returns 0 on success, 1 if the address is not a user address or the
 * fault could not be resolved.
 */
int
trapwrite(addr)
	unsigned addr;
{
	struct proc *curp;
	struct vmspace *space;
	vaddr_t page;
	unsigned stackclicks;

	page = trunc_page((vaddr_t)addr);
	if (page >= VM_MAXUSER_ADDRESS)
		return 1;

	curp = curproc;
	space = curp->p_vmspace;

	/*
	 * Work out the stack size needed to reach this page; a size
	 * beyond the current stack limit is not treated as stack growth.
	 */
	stackclicks = 0;
	if ((caddr_t)page >= space->vm_maxsaddr) {
		stackclicks = btoc(USRSTACK-(unsigned)page);
		if (stackclicks > btoc(curp->p_rlimit[RLIMIT_STACK].rlim_cur))
			stackclicks = 0;
	}

	if (uvm_fault(&space->vm_map, page, 0,
	    VM_PROT_READ | VM_PROT_WRITE) != KERN_SUCCESS)
		return 1;

	if (stackclicks > space->vm_ssize)
		space->vm_ssize = stackclicks;

	return 0;
}
#endif /* I386_CPU */

#endif /*  WANT_COMPAT_SYSCALL */

/*
 * syscall(frame):
 *	System call request from POSIX system call gate interface to kernel.
 * Like trap(), argument is call by reference.
 *
 * This function is compiled more than once.  In the native build it
 * takes the trapframe by value and first hands the call to the
 * emulation's own e_syscall hook if one is set.  When WANT_COMPAT_SYSCALL
 * is defined the function is renamed (ibcs2_syscall or linux_syscall),
 * takes a pointer to the trapframe, and has that emulation's quirks
 * compiled in.  FRAME expands to the trapframe lvalue in both cases.
 *
 * On return the frame carries the result: on success eax = rval[0]
 * (and edx = rval[1], except for Linux) with the carry flag clear; on
 * failure eax = the error number, translated through the emulation's
 * e_errno table when it has one, with the carry flag set.
 */
/*ARGSUSED*/
void
syscall(frame)
	struct trapframe FRAME;
{
	register caddr_t params;
	register const struct sysent *callp;
	register struct proc *p;
	int error, opc, nsys;
	size_t argsize;
	register_t code, args[8], rval[2];
	u_quad_t sticks;

	p = curproc;
#ifndef WANT_COMPAT_SYSCALL
	/* An emulation with its own syscall entry point handles everything. */
	if (p->p_emul->e_syscall) {
		p->p_emul->e_syscall(&frame);
		return;
	}
#endif

	uvmexp.syscalls++;
	if (!USERMODE(FRAME.tf_cs, FRAME.tf_eflags))
		panic("syscall");
	sticks = p->p_sticks;
	p->p_md.md_regs = &FRAME;
	opc = FRAME.tf_eip;
	code = FRAME.tf_eax;

	nsys = p->p_emul->e_nsysent;
	callp = p->p_emul->e_sysent;

#ifdef COMPAT_IBCS2_SOURCE
	if (IBCS2_HIGH_SYSCALL(code))
		code = IBCS2_CVT_HIGH_SYSCALL(code);
#endif /* COMPAT_IBCS2_SOURCE */

	/* Arguments start on the user stack just above the return address. */
	params = (caddr_t)FRAME.tf_esp + sizeof(int);

#ifdef VM86
	/*
	 * VM86 mode application found our syscall trap gate by accident; let
	 * it get a SIGSYS and have the VM86 handler in the process take care
	 * of it.
	 */
	if (FRAME.tf_eflags & PSL_VM)
		code = -1;
	else
#endif /* VM86 */

	switch (code) {
	case SYS_syscall:
#ifdef COMPAT_LINUX_SOURCE
		/* Linux has a special system setup call as number 0 */
		break;
#else /* !COMPAT_LINUX_SOURCE */
		/*
		 * Code is first argument, followed by actual args.
		 */
		code = fuword(params);
		params += sizeof(int);
		break;
#endif /* COMPAT_LINUX_SOURCE */
#if !defined(COMPAT_IBCS2_SOURCE) && !defined(COMPAT_LINUX_SOURCE)
	case SYS___syscall:
		/*
		 * Like syscall, but code is a quad, so as to maintain
		 * quad alignment for the rest of the arguments.
		 */
		if (p->p_emul->e_flags & EMUL_HAS___SYSCALL) {
			code = fuword(params + _QUAD_LOWWORD * sizeof(int));
			params += sizeof(quad_t);
		}
		break;
#endif /* !COMPAT_IBCS2_SOURCE && !COMPAT_LINUX_SOURCE */
	default:
		break;
	}

	/*
	 * Out-of-range numbers are routed to the emulation's nosys entry.
	 * Nothing is printed here: the number comes straight from user
	 * space, so logging would let any process flood the console.
	 */
	if (code < 0 || code >= nsys)
		callp += p->p_emul->e_nosys;		/* illegal */
	else
		callp += code;
	argsize = callp->sy_argsize;
	if (argsize) {
#ifdef COMPAT_LINUX_SOURCE
		/*
		 * Linux passes the args in ebx, ecx, edx, esi, edi, in
		 * increasing order.
		 */
		switch (argsize >> 2) {
		case 5:
			args[4] = FRAME.tf_edi;
			/* FALLTHROUGH */
		case 4:
			args[3] = FRAME.tf_esi;
			/* FALLTHROUGH */
		case 3:
			args[2] = FRAME.tf_edx;
			/* FALLTHROUGH */
		case 2:
			args[1] = FRAME.tf_ecx;
			/* FALLTHROUGH */
		case 1:
			args[0] = FRAME.tf_ebx;
			break;
		default:
			panic("linux syscall bogus argument size %d",
			    (int)argsize);
			break;
		}
#else /* !COMPAT_LINUX_SOURCE */
		error = copyin(params, (caddr_t)args, argsize);
		if (error)
			goto bad;
#endif /* COMPAT_LINUX_SOURCE */
	}
#ifdef SYSCALL_DEBUG
	scdebug_call(p, code, args);
#endif /* SYSCALL_DEBUG */
#ifdef KTRACE
	if (KTRPOINT(p, KTR_SYSCALL))
		ktrsyscall(p, code, argsize, args);
#endif /* KTRACE */
	/*
	 * Preload rval[1] with the caller's edx so that a call which does
	 * not set a second return value leaves that register unchanged.
	 */
	rval[0] = 0;
	rval[1] = FRAME.tf_edx;
	error = (*callp->sy_call)(p, args, rval);
	switch (error) {
	case 0:
		/*
		 * Reinitialize proc pointer `p' as it may be different
		 * if this is a child returning from fork syscall.
		 */
		p = curproc;
		FRAME.tf_eax = rval[0];
#ifndef COMPAT_LINUX_SOURCE
		FRAME.tf_edx = rval[1];
#endif
		FRAME.tf_eflags &= ~PSL_C;	/* carry bit */
		break;
	case ERESTART:
		/*
		 * The offset to adjust the PC by depends on whether we entered
		 * the kernel through the trap or call gate.  We pushed the
		 * size of the instruction into tf_err on entry.
		 */
		FRAME.tf_eip = opc - FRAME.tf_err;
		break;
	case EJUSTRETURN:
		/* nothing to do */
		break;
	default:
#ifndef COMPAT_LINUX_SOURCE
	bad:
#endif
		/* Translate to the emulation's errno values if it has a table. */
		if (p->p_emul->e_errno)
			error = p->p_emul->e_errno[error];
		FRAME.tf_eax = error;
		FRAME.tf_eflags |= PSL_C;	/* carry bit */
		break;
	}

#ifdef SYSCALL_DEBUG
	scdebug_ret(p, code, error, rval);
#endif /* SYSCALL_DEBUG */
	userret(p, FRAME.tf_eip, sticks);
#ifdef KTRACE
	if (KTRPOINT(p, KTR_SYSRET))
		ktrsysret(p, code, error, rval[0]);
#endif /* KTRACE */
}

#ifndef WANT_COMPAT_SYSCALL
/*
 * child_return(arg):
 *	Finish a fork in the new process.  `arg' is the child's struct
 * proc.  The child's saved registers are set up so that the system call
 * appears to return 0 with the carry (error) flag clear, then the usual
 * return-to-user processing is done.
 */
void
child_return(arg)
	void *arg;
{
	struct proc *child = arg;
	struct trapframe *regs = child->p_md.md_regs;

	regs->tf_eflags &= ~PSL_C;
	regs->tf_eax = 0;

	userret(child, regs->tf_eip, 0);
#ifdef KTRACE
	if (KTRPOINT(child, KTR_SYSRET))
		ktrsysret(child, SYS_fork, 0, 0);
#endif
}
#endif /* WANT_COMPAT_SYSCALL */

--ELM975144572-885-0_--