Subject: Re: kern/25285: i386 MP panic: TLB IPI rendezvous failed (mask 1)
To: None <yamt@mwd.biglobe.ne.jp>
From: Paul Dokas <dokas@cs.umn.edu>
List: current-users
Date: 06/10/2004 12:00:55
On Thu, 10 Jun 2004 09:13:46 -0500 Paul Dokas <dokas@cs.umn.edu> wrote:
> 
> I've been running with this additional patch for over 12 hours under a
> moderate load and no panics so far.  I'll keep trying to crash the machine.

OK, it crashed.  CPU 0 shows the same panic location as in all of the previous
panics, so I'm not going to reproduce the whole trace here.  The other CPUs all
seem to have been attempting to acquire the kernel lock.

Here are the backtraces (copied by hand):

cpu 0:
  pmap_tlb_shootdown()
  .
  .
  .

cpu 2:
  acquire()
  spinlock_acquire_count()
  _kernel_lock_acquire_count()
  mi_switch()
  ltsleep()
  sbwat()
  so_receive()
  soo_read()
  dofileread()
  sys_read()
  syscall_plain()
  --- syscall (number 3) ---

cpu 4:
  acquire()
  spinlock_acquire_count()
  _kernel_lock_acquire_count()
  mi_switch()
  ltsleep()
  sys_nanosleep()
  syscall_plain()
  --- syscall (number 240) ---

cpu 6:
  acquire()
  spinlock_acquire_count()
  _kernel_lock_acquire_count()
  mi_switch()
  ltsleep()
  sched_sync()


And here's a summary of all of the patches that were applied when I got this panic:

*** ./arch/i386/i386/vector.S.orig      Tue May 18 10:03:48 2004
--- ./arch/i386/i386/vector.S   Wed Jun  9 21:47:15 2004
***************
*** 163,169 ****
        pushl   $0
        pushl   $T_ASTFLT
        INTRENTRY
-       movl    $0,_C_LABEL(local_apic)+LAPIC_EOI
        movl    CPUVAR(ILEVEL),%ebx
        cmpl    $IPL_IPI,%ebx
        jae     2f
--- 163,168 ----
***************
*** 173,178 ****
--- 172,178 ----
          sti
        pushl   %ebx
        call    _C_LABEL(x86_ipi_handler)
+       movl    $0,_C_LABEL(local_apic)+LAPIC_EOI
        jmp     _C_LABEL(Xdoreti)
  2:
        orl     $(1 << LIR_IPI),CPUVAR(IPENDING)
***************
*** 624,629 ****
--- 624,630 ----
  
  IDTVEC(softserial)
        movl    $IPL_SOFTSERIAL, CPUVAR(ILEVEL)
+       sti
        incl    CPUVAR(IDEPTH)
  #ifdef MULTIPROCESSOR
        call    _C_LABEL(x86_softintlock)
***************
*** 642,647 ****
--- 643,649 ----
  
  IDTVEC(softnet)
        movl    $IPL_SOFTNET, CPUVAR(ILEVEL)
+       sti
        incl    CPUVAR(IDEPTH)
  #ifdef MULTIPROCESSOR
        call    _C_LABEL(x86_softintlock)
***************
*** 673,678 ****
--- 675,681 ----
  
  IDTVEC(softclock)
        movl    $IPL_SOFTCLOCK, CPUVAR(ILEVEL)
+       sti
        incl    CPUVAR(IDEPTH)
  #ifdef MULTIPROCESSOR
        call    _C_LABEL(x86_softintlock)


*** ./arch/i386/i386/spl.S.orig Wed Jun  9 21:43:23 2004
--- ./arch/i386/i386/spl.S      Wed Jun  9 21:44:31 2004
***************
*** 109,118 ****
        cli
        andl    CPUVAR(IPENDING),%eax           # any non-masked bits left?
        jz      2f
-       sti
        bsrl    %eax,%eax
        btrl    %eax,CPUVAR(IPENDING)
-       jnc     1b
        movl    CPUVAR(ISOURCES)(,%eax,4),%eax
        jmp     *IS_RECURSE(%eax)
  2:
--- 109,116 ----
***************
*** 143,152 ****
        cli
        andl    CPUVAR(IPENDING),%eax
        jz      2f
-       sti
        bsrl    %eax,%eax               # slow, but not worth optimizing
        btrl    %eax,CPUVAR(IPENDING)
-       jnc     1b                      # some intr cleared the in-memory bit
        movl    CPUVAR(ISOURCES)(,%eax, 4),%eax
        jmp     *IS_RESUME(%eax)
  2:    /* Check for ASTs on exit to user mode. */
--- 141,148 ----


*** ./arch/x86/include/intr.h.orig      Fri Jun  4 13:15:11 2004
--- ./arch/x86/include/intr.h   Fri Jun  4 13:17:13 2004
***************
*** 156,171 ****
  spllower(int nlevel)
  {
        struct cpu_info *ci = curcpu();
  
        __splbarrier();
!       /*
!        * Since this should only lower the interrupt level,
!        * the XOR below should only show interrupts that
!        * are being unmasked.
!        */
!       ci->ci_ilevel = nlevel;
!       if (ci->ci_ipending & IUNMASK(ci,nlevel))
!               Xspllower(nlevel);
  }
  
  /*
--- 156,176 ----
  spllower(int nlevel)
  {
        struct cpu_info *ci = curcpu();
+       u_int32_t imask;
+       u_long psl;
  
        __splbarrier();
! 
!       imask = IUNMASK(ci, nlevel);
!       psl = read_psl();
!       disable_intr();
!       if (ci->ci_ipending & imask) {
!               Xspllower(nlevel);
!               /* Xspllower does enable_intr() */
!       } else {
!               ci->ci_ilevel = nlevel;
!               write_psl(psl);
!       }
  }
  
  /*

-- 
Paul Dokas                                            dokas@cs.umn.edu
======================================================================
Don Juan Matus:  "an enigma wrapped in mystery wrapped in a tortilla."