Subject: Re: kernel stack overflow detection
To: None <cgd@broadcom.com>
From: YAMAMOTO Takashi <yamt@mwd.biglobe.ne.jp>
List: tech-kern
Date: 06/10/2002 21:47:20
----Next_Part(Mon_Jun_10_21:47:20_2002_157)--
Content-Type: Text/Plain; charset=us-ascii
Content-Transfer-Encoding: 7bit

From: cgd@broadcom.com
Date: 30 May 2002 09:43:14 -0700
> you've given yourself an extra page, so fill it all with the magic
> number.  It's possible for a stack overflow to get 'lucky' and miss
> a single magic location.
> 
> 
> Actually, while you're at it...
> 
> You might provide a more generally-useful mechanism for debugging and
> system analysis:
> 
> * fill _all_ kernel stack areas _completely_ with a magic number, and
> 
> * provide a mechanism to check all of them to see how much has been
>   used.

i slightly updated my patch as you suggest. (attached)

btw, kernel stack overflows i saw seems due to
ipsec+nfsclient. is anyone else using them?

---
YAMAMOTO Takashi<yamt@mwd.biglobe.ne.jp>

----Next_Part(Mon_Jun_10_21:47:20_2002_157)--
Content-Type: Text/Plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
Content-Disposition: inline; filename="kstack2.diff"

Index: arch/i386/conf/files.i386
===================================================================
RCS file: /cvs/cvsroot/syssrc/sys/arch/i386/conf/files.i386,v
retrieving revision 1.206
diff -u -p -r1.206 files.i386
--- arch/i386/conf/files.i386	2002/04/18 12:54:12	1.206
+++ arch/i386/conf/files.i386	2002/06/10 10:50:43
@@ -57,6 +57,10 @@ defparam opt_pcibios.h	PCIBIOS_IRQS_HINT
 # Large page size
 defflag			LARGEPAGES
 
+# kernel stack debug
+defflag	opt_kstack_safe.h		KSTACK_SAFE
+defflag	opt_kstack_dr0.h		KSTACK_CHECK_DR0
+
 file	arch/i386/i386/autoconf.c
 file	arch/i386/i386/bus_machdep.c
 file	arch/i386/i386/conf.c
Index: arch/i386/i386/pmap.c
===================================================================
RCS file: /cvs/cvsroot/syssrc/sys/arch/i386/i386/pmap.c,v
retrieving revision 1.136
diff -u -p -r1.136 pmap.c
--- arch/i386/i386/pmap.c	2002/03/27 04:47:28	1.136
+++ arch/i386/i386/pmap.c	2002/06/10 10:50:44
@@ -65,6 +65,7 @@ __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.1
 #include "opt_cputype.h"
 #include "opt_user_ldt.h"
 #include "opt_largepages.h"
+#include "opt_kstack_dr0.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -1654,6 +1655,11 @@ pmap_ldt_cleanup(p)
 }
 #endif /* USER_LDT */
 
+#ifdef KSTACK_CHECK_DR0
+/* XXX should be in cpufunc.h or somewhere */
+void dr0(caddr_t, u_int32_t, u_int32_t, u_int32_t);
+#endif
+
 /*
  * pmap_activate: activate a process' pmap (fill in %cr3 and LDT info)
  *
@@ -1675,6 +1681,16 @@ pmap_activate(p)
 		lcr3(pcb->pcb_cr3);
 	if (pcb == curpcb)
 		lldt(pcb->pcb_ldt_sel);
+
+#ifdef KSTACK_CHECK_DR0
+	/*
+	 * setup breakpoint on the top of stack
+	 */
+	if (p == &proc0)
+		dr0(0, 0, 0, 0);
+	else
+		dr0(KSTACK_END(p), 1, 3, 1);
+#endif
 }
 
 /*
Index: arch/i386/i386/trap.c
===================================================================
RCS file: /cvs/cvsroot/syssrc/sys/arch/i386/i386/trap.c,v
retrieving revision 1.165
diff -u -p -r1.165 trap.c
--- arch/i386/i386/trap.c	2002/02/18 15:58:02	1.165
+++ arch/i386/i386/trap.c	2002/06/10 10:50:44
@@ -86,6 +86,7 @@ __KERNEL_RCSID(0, "$NetBSD: trap.c,v 1.1
 #include "opt_math_emulate.h"
 #include "opt_vm86.h"
 #include "opt_cputype.h"
+#include "opt_kstack_dr0.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -155,6 +156,33 @@ int	trapdebug = 0;
 
 #define	IDTVEC(name)	__CONCAT(X, name)
 
+#ifdef KSTACK_CHECK_DR0
+/* XXX should be in cpufunc.h */
+static __inline u_int
+rdr6(void)
+{
+	u_int val;
+
+	__asm __volatile("movl %%dr6,%0" : "=r" (val));
+	return val;
+}
+
+/* XXX should be in cpufunc.h */
+static __inline void
+ldr6(u_int val)
+{
+
+	__asm __volatile("movl %0,%%dr6" : : "r" (val));
+}
+#endif
+
+#ifdef KSTACK_SAFE
+int kstack_check_dr0_panic;
+#ifdef DDB
+int kstack_check_dr0_debug;
+#endif
+#endif
+
 /*
  * trap(frame):
  *	Exception, fault, and trap interface to BSD kernel. This
@@ -202,6 +230,30 @@ trap(frame)
 
 	default:
 	we_re_toast:
+#ifdef KSTACK_CHECK_DR0
+		if (type == T_TRCTRAP) {
+			u_int mask, dr6 = rdr6();
+
+			mask = 1 << 0; /* dr0 */
+			if (dr6 & mask) {
+				/*
+				 * XXX should be just panic
+				 */
+#ifdef KSTACK_SAFE
+				if (kstack_check_dr0_panic)
+#endif
+					panic("trap on DR0: maybe kernel stack overflow\n");
+				printf("trap on DR0: maybe kernel stack overflow\n");
+				dr6 &= ~mask;
+				ldr6(dr6);
+#ifdef DDB
+				if (kstack_check_dr0_debug)
+					Debugger();
+#endif
+				return;
+			}
+		}
+#endif
 #ifdef KGDB
 		if (kgdb_trap(type, &frame))
 			return;
Index: arch/i386/include/param.h
===================================================================
RCS file: /cvs/cvsroot/syssrc/sys/arch/i386/include/param.h,v
retrieving revision 1.48
diff -u -p -r1.48 param.h
--- arch/i386/include/param.h	2002/02/26 15:13:23	1.48
+++ arch/i386/include/param.h	2002/06/10 10:50:44
@@ -90,7 +90,17 @@
 
 #define	SSIZE		1		/* initial stack size/NBPG */
 #define	SINCR		1		/* increment of stack/NBPG */
-#define	UPAGES		2		/* pages of u-area */
+
+#if defined(_KERNEL_OPT)
+#include "opt_kstack_safe.h"
+#endif
+#define SHAM_UPAGES	2
+#define	SHAM_USPACE	(SHAM_UPAGES * NBPG)
+#ifdef KSTACK_SAFE
+#define	UPAGES		(SHAM_UPAGES + 1) /* "real" pages of u-area */
+#else
+#define	UPAGES		SHAM_UPAGES	/* pages of u-area */
+#endif
 #define	USPACE		(UPAGES * NBPG)	/* total size of u-area */
 
 #ifndef MSGBUFSIZE
Index: conf/files
===================================================================
RCS file: /cvs/cvsroot/syssrc/sys/conf/files,v
retrieving revision 1.532
diff -u -p -r1.532 files
--- conf/files	2002/06/06 23:54:48	1.532
+++ conf/files	2002/06/10 10:51:04
@@ -138,6 +138,7 @@ defparam opt_kgdb.h		KGDB_DEV KGDB_DEVNA
 				KGDB_DEVADDR KGDB_DEVRATE KGDB_DEVMODE
 defflag				LOCKDEBUG
 defflag				SYSCALL_DEBUG
+defflag	opt_kstack.h		KSTACK_CHECK_MAGIC
 
 # memory (ram) disk options
 #
@@ -263,6 +264,13 @@ include "dev/raidframe/files.raidframe"
 #
 device	aic: scsi
 file	dev/ic/aic6360.c		aic
+
+define	scsi_low
+file	dev/scsipi/scsi_low.c	scsi_low
+
+# FutureDomain TMC18C[35]0
+device	stg: scsi, scsi_low
+file	dev/ic/tmc18c30.c	stg
 
 # SMC 93Cx6 Serial EEPROM devices
 #
Index: kern/kern_fork.c
===================================================================
RCS file: /cvs/cvsroot/syssrc/sys/kern/kern_fork.c,v
retrieving revision 1.88
diff -u -p -r1.88 kern_fork.c
--- kern/kern_fork.c	2001/12/08 00:35:30	1.88
+++ kern/kern_fork.c	2002/06/10 10:51:17
@@ -81,6 +81,7 @@
 __KERNEL_RCSID(0, "$NetBSD: kern_fork.c,v 1.88 2001/12/08 00:35:30 thorpej Exp $");
 
 #include "opt_ktrace.h"
+#include "opt_kstack.h"
 #include "opt_multiprocessor.h"
 
 #include <sys/param.h>
@@ -101,6 +102,96 @@ __KERNEL_RCSID(0, "$NetBSD: kern_fork.c,
 #include <sys/sched.h>
 #include <sys/signalvar.h>
 
+#ifdef KSTACK_CHECK_MAGIC
+#include <sys/user.h>
+
+#define KSTACK_MAGIC	0xdeadbeaf
+
+void kstack_setup_magic(const struct proc *);
+void kstack_check_magic(const struct proc *);
+
+#define	KSTACK_SIZE (SHAM_USPACE - sizeof(struct user))
+
+/* XXX should be per process basis? */
+int kstackleftmin = KSTACK_SIZE;
+int kstackleftthres = KSTACK_SIZE / 8; /* warn if remaining stack is less than this */
+
+void
+kstack_setup_magic(const struct proc *p)
+{
+	u_int32_t *ip;
+	u_int32_t const *end;
+
+	KASSERT(p != 0);
+	KASSERT(p != &proc0);
+
+	/*
+	 * fill all the stack with magic number
+	 * so that later modification on it can be detected.
+	 */
+	ip = (u_int32_t *)(KSTACK_END(p) + SHAM_USPACE - USPACE);
+	end = (u_int32_t *)((caddr_t)KSTACK_END(p) + KSTACK_SIZE / 2); /* XXX */
+	for (; ip < end; ip++) {
+		*ip = KSTACK_MAGIC;
+	}
+}
+
+void
+kstack_check_magic(const struct proc *p)
+{
+	static int dont_check;
+	u_int32_t const *ip, *end;
+	int stackleft;
+
+	KASSERT(p != 0);
+
+	/* don't check proc0 */ /*XXX*/
+	if (p == &proc0)
+		return;
+
+	ip = (u_int32_t *)(KSTACK_END(p) + SHAM_USPACE - USPACE);
+	end = (u_int32_t *)((caddr_t)p->p_addr + USPACE);
+	for (; ip < end; ip++)
+		if (*ip != KSTACK_MAGIC)
+			break;
+
+	stackleft = (caddr_t)ip - KSTACK_END(p);
+	if (kstackleftmin > stackleft) {
+		kstackleftmin = stackleft;
+		if (stackleft < kstackleftthres)
+			printf("warning: kernel stack left %d bytes(pid %u)\n",
+				stackleft, p->p_pid);
+	}
+
+	if (dont_check)
+		return;
+
+	if (stackleft <= 0) {
+#ifndef KSTACK_SAFE
+		panic("magic on the top of kernel stack changed for pid %u: "
+		    "maybe kernel stack overflow\n", p->p_pid);
+#else
+		printf("magic on the top of kernel stack changed for pid %u: "
+		    "maybe kernel stack overflow\n", p->p_pid);
+
+		/*
+		 * no more checks.
+		 * if kernel stack once overflow, it's enough bad..
+		 */
+		dont_check = 1;
+		printf("kernel stack magic check disabled\n");
+#if 0
+		/*
+		 * re-setup magic
+		 * XXX this will break stack more..
+		 */
+		kstack_setup_magic(p);
+#endif
+#endif
+	}
+}
+#endif /*KSTACK_CHECK_MAGIC*/
+
 #include <sys/syscallargs.h>
 
 #include <uvm/uvm_extern.h>
@@ -376,6 +467,9 @@ fork1(struct proc *p1, int flags, int ex
 	    stack, stacksize,
 	    (func != NULL) ? func : child_return,
 	    (arg != NULL) ? arg : p2);
+#ifdef KSTACK_CHECK_MAGIC
+	kstack_setup_magic(p2);
+#endif
 
 	/*
 	 * BEGIN PID ALLOCATION.
Index: kern/kern_synch.c
===================================================================
RCS file: /cvs/cvsroot/syssrc/sys/kern/kern_synch.c,v
retrieving revision 1.108
diff -u -p -r1.108 kern_synch.c
--- kern/kern_synch.c	2002/05/21 01:38:27	1.108
+++ kern/kern_synch.c	2002/06/10 10:51:17
@@ -82,6 +82,7 @@ __KERNEL_RCSID(0, "$NetBSD: kern_synch.c
 
 #include "opt_ddb.h"
 #include "opt_ktrace.h"
+#include "opt_kstack.h"
 #include "opt_lockdebug.h"
 #include "opt_multiprocessor.h"
 
@@ -95,6 +96,11 @@ __KERNEL_RCSID(0, "$NetBSD: kern_synch.c
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 
+#ifdef KSTACK_CHECK_MAGIC
+/* XXX should be in proc.h or user.h */
+void kstack_check_magic(const struct proc *);
+#endif
+
 #include <uvm/uvm_extern.h>
 
 #ifdef KTRACE
@@ -847,6 +853,10 @@ mi_switch(struct proc *p)
 	 * scheduling flags.
 	 */
 	spc->spc_flags &= ~SPCF_SWITCHCLEAR;
+
+#ifdef KSTACK_CHECK_MAGIC
+	kstack_check_magic(p);
+#endif
 
 	/*
 	 * Pick a new current process and switch to it.  When we
Index: sys/param.h
===================================================================
RCS file: /cvs/cvsroot/syssrc/sys/sys/param.h,v
retrieving revision 1.140
diff -u -p -r1.140 param.h
--- sys/param.h	2002/05/22 03:29:09	1.140
+++ sys/param.h	2002/06/10 10:51:17
@@ -277,6 +277,14 @@
 
 #ifdef _KERNEL
 /*
+ * kernel stack paramaters
+ */
+#ifndef SHAM_USPACE
+#define	SHAM_USPACE	USPACE
+#endif
+#define KSTACK_END(p)   ((caddr_t)ALIGN((p)->p_addr + 1) + USPACE - SHAM_USPACE)
+
+/*
  * macro to convert from milliseconds to hz without integer overflow
  * Default version using only 32bits arithmetics.
  * 64bit port can define 64bit version in their <machine/param.h>
Index: uvm/uvm_meter.c
===================================================================
RCS file: /cvs/cvsroot/syssrc/sys/uvm/uvm_meter.c,v
retrieving revision 1.23
diff -u -p -r1.23 uvm_meter.c
--- uvm/uvm_meter.c	2001/12/09 03:07:19	1.23
+++ uvm/uvm_meter.c	2002/06/10 10:51:18
@@ -215,7 +215,7 @@ uvm_sysctl(name, namelen, oldp, oldlenp,
 		return (sysctl_rdint(oldp, oldlenp, newp, maxslp));
 
 	case VM_USPACE:
-		return (sysctl_rdint(oldp, oldlenp, newp, USPACE));
+		return (sysctl_rdint(oldp, oldlenp, newp, SHAM_USPACE));
 
 	default:
 		return (EOPNOTSUPP);

----Next_Part(Mon_Jun_10_21:47:20_2002_157)----