Subject: Re: Getting "TLB IPI rendezvous failed..."
To: Stephan Uphoff <ups@tree.com>
From: Manuel Bouyer <bouyer@antioche.lip6.fr>
List: tech-kern
Date: 01/27/2005 16:30:00
--TB36FDmn/VVEgNH/
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

On Thu, Jan 27, 2005 at 03:18:15PM +0100, Manuel Bouyer wrote:
> > Just to make sure that this is not our problem can you replace the line
> > 	self->ci_tlb_ipi_mask = cpumask;
> > in pmap_tlb_shootnow with 
> > 	x86_atomic_setbits(&self->ci_tlb_ipi_mask,cpumask);
> 
> I'll try this too.

This didn't help. I'm now running with the attached patch, and got:
pmap_tlb_shootnow: CPU 0 interrupt level 0x6 pending 0x10000000 depth 1 ci_ipis 16
pmap_tlb_shootnow: CPU 0 interrupt level 0xd pending 0x400 depth 0 ci_ipis 16
pmap_tlb_shootnow: CPU 0 interrupt level 0x6 pending 0x0 depth 1 ci_ipis 16

But retrying the IPI seems to be enough: the box didn't panic and the dump
completed.
From a previous run I got:
CPU 0 interrupt level 0x6 pending 0x400 depth 1 ci_ipis 8
This was before I added the x86_atomic_setbits_l() call, and I hadn't yet put
the function name in the printf(), but I suspect this one is from npxsave_lwp().
Retrying the IPI seems to be enough to keep the box running here too.

-- 
Manuel Bouyer <bouyer@antioche.eu.org>
     NetBSD: 26 ans d'experience feront toujours la difference
--

--TB36FDmn/VVEgNH/
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename=diff

Index: i386/db_interface.c
===================================================================
RCS file: /cvsroot/src/sys/arch/i386/i386/db_interface.c,v
retrieving revision 1.42
diff -u -r1.42 db_interface.c
--- i386/db_interface.c	13 Feb 2004 11:36:13 -0000	1.42
+++ i386/db_interface.c	27 Jan 2005 15:24:02 -0000
@@ -262,6 +262,19 @@
 void
 cpu_Debugger()
 {
+	volatile struct cpu_info *ci;
+	int i;
+	
+	for (i=0; i < X86_MAXPROCS; i++) {
+		ci = cpu_info[i];
+		if (ci == NULL)
+			continue;
+
+		printf("CPU %ld interrupt level 0x%x pending 0x%x depth %d "
+		   "ci_ipis %d\n",
+		   ci->ci_cpuid, ci->ci_ilevel, ci->ci_ipending, ci->ci_idepth,
+		   ci->ci_ipis);
+	}
 	breakpoint();
 }
 
@@ -365,6 +378,9 @@
 			db_printf("CPU %ld not paused\n", addr);
 			return;
 		}
+		printf("CPU %ld current interrupt level 0x%x pending 0x%x "
+		    "depth %d ci_ipis %d\n", ci->ci_cpuid, ci->ci_ilevel,
+		    ci->ci_ipending, ci->ci_idepth, ci->ci_ipis);
 	}
 	if (ci->ci_ddb_regs == 0) {
 		db_printf("CPU %ld has no saved regs\n", addr);
Index: i386/machdep.c
===================================================================
RCS file: /cvsroot/src/sys/arch/i386/i386/machdep.c,v
retrieving revision 1.559
diff -u -r1.559 machdep.c
--- i386/machdep.c	20 Oct 2004 04:20:05 -0000	1.559
+++ i386/machdep.c	27 Jan 2005 15:24:02 -0000
@@ -1823,6 +1823,8 @@
 #endif
 	pmap_update(pmap_kernel());
 
+	memset(idt_allocmap, 0, sizeof(idt_allocmap));
+
 	tgdt = gdt;
 	gdt = (union descriptor *)
 		    ((char *)idt + NIDT * sizeof (struct gate_descriptor));
Index: i386/pmap.c
===================================================================
RCS file: /cvsroot/src/sys/arch/i386/i386/pmap.c,v
retrieving revision 1.180
diff -u -r1.180 pmap.c
--- i386/pmap.c	1 Jan 2005 21:00:06 -0000	1.180
+++ i386/pmap.c	27 Jan 2005 15:24:03 -0000
@@ -3659,6 +3659,7 @@
 	int s;
 #ifdef DIAGNOSTIC
 	int count = 0;
+	int ipi_retry = 0;
 #endif
 #endif
 
@@ -3668,7 +3669,7 @@
 	self = curcpu();
 #ifdef MULTIPROCESSOR
 	s = splipi();
-	self->ci_tlb_ipi_mask = cpumask;
+	x86_atomic_setbits_l(&self->ci_tlb_ipi_mask,cpumask);
 #endif
 
 	pmap_do_tlb_shootdown(self);	/* do *our* work. */
@@ -3679,6 +3680,7 @@
 	/*
 	 * Send the TLB IPI to other CPUs pending shootdowns.
 	 */
+ipi_again:
 	for (CPU_INFO_FOREACH(cii, ci)) {
 		if (ci == self)
 			continue;
@@ -3690,9 +3692,20 @@
 
 	while (self->ci_tlb_ipi_mask != 0) {
 #ifdef DIAGNOSTIC
-		if (count++ > 10000000)
+		if (count++ > 10000000) {
+			for (CPU_INFO_FOREACH(cii, ci)) {
+				if (ci == self)
+					continue;
+				printf("pmap_tlb_shootnow: CPU %ld interrupt level 0x%x pending "
+				    "0x%x depth %d ci_ipis %d\n", ci->ci_cpuid,
+				    ci->ci_ilevel, ci->ci_ipending,
+				    ci->ci_idepth, ci->ci_ipis);
+			}
+			if (ipi_retry++ < 5)
+				goto ipi_again;
 			panic("TLB IPI rendezvous failed (mask %x)",
 			    self->ci_tlb_ipi_mask);
+		}
 #endif
 		x86_pause();
 	}
Index: isa/npx.c
===================================================================
RCS file: /cvsroot/src/sys/arch/i386/isa/npx.c,v
retrieving revision 1.106
diff -u -r1.106 npx.c
--- isa/npx.c	6 Jul 2004 01:30:08 -0000	1.106
+++ isa/npx.c	27 Jan 2005 15:24:03 -0000
@@ -715,6 +715,7 @@
 {
 	struct cpu_info *ci = curcpu();
 	struct cpu_info *oci;
+	int ipi_retry = 0;
 
 	KDASSERT(l->l_addr != NULL);
 
@@ -740,6 +741,7 @@
 		    oci->ci_dev->dv_xname,
 		    save? "save" : "flush", l));
 
+ipi_again:
 		x86_send_ipi(oci,
 		    save ? X86_IPI_SYNCH_FPU : X86_IPI_FLUSH_FPU);
 
@@ -751,6 +753,19 @@
 #ifdef DIAGNOSTIC
 			spincount++;
 			if (spincount > 10000000) {
+				struct cpu_info *ci;
+				CPU_INFO_ITERATOR cii;
+				struct cpu_info *self = curcpu();
+				for (CPU_INFO_FOREACH(cii, ci)) {
+					if (ci == self)
+						continue;
+					printf("npxsave_lwp: CPU %ld interrupt level 0x%x pending "
+					    "0x%x depth %d ci_ipis %d\n", ci->ci_cpuid,
+					    ci->ci_ilevel, ci->ci_ipending,
+					    ci->ci_idepth, ci->ci_ipis);
+				}
+				if (ipi_retry++ < 5)
+					goto ipi_again;
 				panic("fp_save ipi didn't");
 			}
 #endif

--TB36FDmn/VVEgNH/--