NetBSD-Bugs archive


Re: kern/47512: netbsd-6 system crash



The following reply was made to PR kern/47512; it has been noted by GNATS.

From: Chuck Silvers <chuq%chuq.com@localhost>
To: Hisashi T Fujinaka <htodd%twofifty.com@localhost>
Cc: kern-bug-people%netbsd.org@localhost, netbsd-bugs%netbsd.org@localhost,
        gnats-bugs%NetBSD.org@localhost
Subject: Re: kern/47512: netbsd-6 system crash
Date: Sun, 7 Apr 2013 08:38:53 -0700

 the reason for the first crash was that an amap has gotten corrupted:
 
 (gdb) bt
 #0  0xffffffff802cb09e in cpu_reboot (howto=260, bootstr=<optimized out>)
     at /usr/src/sys/arch/amd64/amd64/machdep.c:817
 #1  0xffffffff803c2084 in vpanic (fmt=0xffffffff8053627d "lock error", 
     ap=0xfffffe8000b7d410) at /usr/src/sys/kern/subr_prf.c:308
 #2  0xffffffff803c2159 in panic (fmt=<unavailable>)
     at /usr/src/sys/kern/subr_prf.c:205
 #3  0xffffffff803bba6a in lockdebug_abort (lock=0xfffffe80278b3250, 
     ops=0xffffffff806ecaa0, func=0xffffffff804d7320 "rw_vector_enter", 
     msg=0xffffffff8051a707 "locking against myself")
     at /usr/src/sys/kern/subr_lockdebug.c:858
 #4  0xffffffff8027d62f in rw_vector_enter (rw=0xfffffe80278b3250, op=RW_READER)
     at /usr/src/sys/kern/kern_rwlock.c:341
 #5  0xffffffff8043a8a3 in uvmfault_lookup (write_lock=false, 
     ufi=0xfffffe8000b7d6a0) at /usr/src/sys/uvm/uvm_fault_i.h:126
 #6  uvm_fault_check (maxprot=false, ranons=<optimized out>, 
     flt=0xfffffe8000b7d6e0, ufi=0xfffffe8000b7d6a0)
     at /usr/src/sys/uvm/uvm_fault.c:957
 #7  uvm_fault_internal (orig_map=<optimized out>, vaddr=<optimized out>, 
     access_type=<optimized out>, fault_flag=<optimized out>)
     at /usr/src/sys/uvm/uvm_fault.c:877
 #8  0xffffffff803faab7 in trap (frame=0xfffffe8000b7d840)
     at /usr/src/sys/arch/amd64/amd64/trap.c:568
 #9  0xffffffff80101046 in calltrap ()
 #10 0xfffffe8044908b48 in ?? ()
 #11 0x0000000000000000 in ?? ()
 
 (gdb) fr 8
 #8  0xffffffff803faab7 in trap (frame=0xfffffe8000b7d840)
     at /usr/src/sys/arch/amd64/amd64/trap.c:568
 568     /usr/src/sys/arch/amd64/amd64/trap.c: No such file or directory.
         in /usr/src/sys/arch/amd64/amd64/trap.c
 
 (gdb) x/i frame->tf_rip
    0xffffffff80435cd2 <amap_copy+175>:  mov    %rdi,(%rcx,%rsi,1)
 
 
 (gdb) p *(struct vm_amap *)frame->tf_rbx
 $6 = {
   am_lock = 0x0, 
   am_ref = 0x1, 
   am_flags = 0x0, 
   am_maxslot = 0x40, 
   am_nslot = 0x40, 
   am_nused = 0x0, 
   am_slots = 0xfffffe8019ec5808, 
   am_bckptr = 0xfffffe8019ec56c8, 
   am_anon = 0x8, 
   am_ppref = 0x0, 
   am_list = {
     le_next = 0x0, 
     le_prev = 0x7f7ff23094a8
   }
 }
 
 
 "am_anon" is a pointer, so "8" is not a good value.  we crashed deferencing 
this pointer.
 this amap is freshly allocated, and not yet visible to anything, so the 
problem is
 most likely with kernel memory allocation and not with the amap code.
 the memory that was just allocated for am_anon was 512 bytes, and assuming 
that you
 don't have any of the kmem debugging stuff enabled, this would have come from 
the
 512-byte kmem bucket.  the first place to look is the per-CPU caches for this 
kmem bucket:
 
 (gdb) p ncpu
 $1 = 2
 
 (gdb) p *(pool_cache_cpu_t *) kmem_cache[0x3f]->pc_cpus[0]
 $7 = {
   cc_misses = 483, 
   cc_hits = 197706, 
   cc_current = 0xffffffff807120e0, 
   cc_previous = 0xffffffff807120e0, 
   cc_cache = 0xfffffe807fe744c0, 
   cc_ipl = 6, 
   cc_cpuindex = 0, 
   cc_iplcookie = {
     _ipl = 6 '\006'
   }
 }
 (gdb) p *(pool_cache_cpu_t *) kmem_cache[0x3f]->pc_cpus[1]
 $8 = {
   cc_misses = 499, 
   cc_hits = 210399, 
   cc_current = 0xffffffff807120e0, 
   cc_previous = 0xffffffff807120e0, 
   cc_cache = 0xfffffe807fe744c0, 
   cc_ipl = 6, 
   cc_cpuindex = 1, 
   cc_iplcookie = {
     _ipl = 6 '\006'
   }
 }
 
 (gdb) p &pcg_dummy
 $10 = (const pcg_t *) 0xffffffff807120e0
 
 these all reference pcg_dummy, so there's nothing in the per-CPU pool caches.
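 a minimal model of why cc_current == &pcg_dummy means "empty" (this is not
 the real pool code, just the idea): the dummy group permanently advertises
 zero available objects, so the per-CPU fast path always misses and the
 allocator falls through to the shared pool_cache:
 
 ```c
 #include <assert.h>
 #include <stdbool.h>
 
 /* Toy model of the per-CPU pool cache fast path; names are
  * simplified and do not match the real structures exactly. */
 struct pcg {
 	unsigned pcg_avail;		/* objects available in this group */
 };
 
 /* sentinel group: never has anything available */
 static const struct pcg pcg_dummy = { 0 };
 
 struct pool_cache_cpu {
 	const struct pcg *cc_current;
 };
 
 static bool
 fastpath_hit(const struct pool_cache_cpu *cc)
 {
 	return cc->cc_current->pcg_avail > 0;
 }
 
 int
 main(void)
 {
 	struct pool_cache_cpu cc = { &pcg_dummy };
 
 	/* both CPUs in the dump look like this: must take the slow path */
 	assert(!fastpath_hit(&cc));
 	return 0;
 }
 ```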
 
 
 let's look at the underlying pool_cache and pool:
 
 (gdb) p *((pool_cache_cpu_t *) kmem_cache[0x3f]->pc_cpus[0])->cc_cache
 $27 = {
   pc_pool = {
     pr_poollist = {
       tqe_next = 0xfffffe807fe83180, 
       tqe_prev = 0xfffffe807fe82140
     }, 
     pr_emptypages = {
       lh_first = 0x0
     }, 
     pr_fullpages = {
       lh_first = 0x0
     }, 
     pr_partpages = {
       lh_first = 0xfffffe8004a84578
     }, 
     pr_curpage = 0xfffffe8004a84578, 
 ...
 
     pr_nget = 697, 
     pr_nfail = 0, 
     pr_nput = 695, 
     pr_npagealloc = 16, 
     pr_npagefree = 15, 
 ...
   }, 
   pc_lock = {
     u = {
       mtxa_owner = 0x601
     }
   }, 
   pc_cachelist = {
     tqe_next = 0xfffffe807fe83180, 
     tqe_prev = 0xfffffe807fe822e8
   }, 
   pc_emptygroups = 0x0, 
   pc_fullgroups = 0x0, 
   pc_partgroups = 0x0, 
   pc_pcgpool = 0xffffffff807122a0, 
   pc_pcgsize = 15, 
   pc_ncpu = 2, 
   pc_hits = 285, 
   pc_misses = 697, 
 
 }
 
 
 there's only one page in this pool, so it looks like all the cached items
 were recently freed up.  maybe the system is short on memory?  let's check:
 
 (gdb) p uvmexp
 $28 = {
   pagesize = 4096, 
   pagemask = 4095, 
   pageshift = 12, 
   npages = 507177, 
   free = 1, 
   paging = 2070, 
 
 }
 
 
 so yes, there are no pages free (except the one reserved for the pagedaemon),
 and we're currently paging out a bunch of pages.
 
 the thread which crashed is running on CPU 0; perhaps the pool reclamation
 thread is running on CPU 1?  let's check:
 
 (gdb) p cpu_info_primary->ci_next->ci_curlwp->l_name 
 $29 = 0xffffffff8051a043 "swapper"
 (gdb) p cpu_info_primary->ci_next->ci_curlwp    
 $30 = (struct lwp *) 0xffffffff806eb8e0
 (gdb) p &lwp0
 $31 = (lwp_t *) 0xffffffff806eb8e0
 
 
 (gdb) p uvm.pagedaemon_lwp
 $33 = (struct lwp *) 0xfffffe8004f8e1e0
 
 
 pool reclamation is done by the pagedaemon thread, so it's not running right now,
 but it has probably run recently since pageouts are in progress.
 let's check l_stime for the pagedaemon vs. the threads currently running:
 
 (gdb) p uvm.pagedaemon_lwp->l_stime
 $36 = {
   sec = 1282100, 
   frac = 15116554923636832317
 }
 
 (gdb) p cpu_info_primary->ci_curlwp->l_stime
 $39 = {
   sec = 1282100, 
   frac = 15116635311572007711
 }
 (gdb) p cpu_info_primary->ci_next->ci_curlwp->l_stime
 $40 = {
   sec = 1282101, 
   frac = 2945563514467659413
 }
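 these look like bintime-style values: "sec" is whole seconds and "frac" is a
 64-bit binary fraction of a second (frac / 2^64).  under that assumption the
 ordering can be checked directly, and the pagedaemon's l_stime turns out to
 precede the crashing thread's by only a few microseconds:
 
 ```c
 #include <assert.h>
 #include <stdint.h>
 #include <stdio.h>
 
 /* bintime-style timestamp: whole seconds plus a 64-bit binary
  * fraction of a second (frac / 2^64 seconds) */
 struct bt {
 	uint64_t sec;
 	uint64_t frac;
 };
 
 /* returns nonzero iff a is earlier than b */
 static int
 bt_before(struct bt a, struct bt b)
 {
 	return a.sec < b.sec || (a.sec == b.sec && a.frac < b.frac);
 }
 
 int
 main(void)
 {
 	struct bt pagedaemon = { 1282100, 15116554923636832317ULL };
 	struct bt cpu0_lwp   = { 1282100, 15116635311572007711ULL };
 	struct bt cpu1_lwp   = { 1282101,  2945563514467659413ULL };
 
 	/* pagedaemon was scheduled before either currently running thread */
 	assert(bt_before(pagedaemon, cpu0_lwp));
 	assert(bt_before(cpu0_lwp, cpu1_lwp));
 
 	/* gap between pagedaemon and the crashing thread, in seconds */
 	printf("gap: %.6f sec\n",
 	    (double)(cpu0_lwp.frac - pagedaemon.frac) / 18446744073709551616.0);
 	return 0;
 }
 ```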
 
 
 
 (gdb) p uvm.pagedaemon_lwp->l_stat 
 $43 = 2
 (gdb) p uvm.pagedaemon_lwp->l_cpu->ci_data.cpu_index
 $42 = 0
 
 
 so the pagedaemon thread has run recently and it is currently waiting for
 a chance to run on CPU 0.  it's possible that it recently ran on CPU 1.
 I'm not sure if there's a way to tell for sure.
 
 the most likely theory that I have for what caused this
 is a race between pool reclamation and pool allocation.
 I haven't yet looked through the pool code for something like this though.
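 the suspected interleaving, modeled single-threaded (the names here are
 hypothetical and do not correspond to the real pool code): if reclamation
 can free a page between the allocator picking an item on that page and the
 caller using it, the caller ends up holding recycled memory whose contents
 are garbage, much like am_anon == 0x8:
 
 ```c
 #include <assert.h>
 #include <stdlib.h>
 
 /* Single-threaded model of the suspected race; purely illustrative. */
 struct item {
 	void *ptr;
 };
 
 int
 main(void)
 {
 	struct item *page = calloc(4, sizeof(*page));	/* a pool page */
 	struct item *chosen = &page[0];			/* allocator picks an item */
 
 	/*
 	 * ... reclaim runs here, the page is released and its memory
 	 * reused; simulate the reuse scribbling over the item ...
 	 */
 	page[0].ptr = (void *)0x8;
 
 	/* the allocator's caller now sees the stale value */
 	assert(chosen->ptr == (void *)0x8);
 	free(page);
 	return 0;
 }
 ```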
 
 
 
 the second crash is exactly the same:
 
 (gdb) bt
 #0  0xffffffff802cb41e in cpu_reboot (howto=260, bootstr=<optimized out>)
     at /usr/src/sys/arch/amd64/amd64/machdep.c:817
 #1  0xffffffff803c25e4 in vpanic (fmt=0xffffffff80536abd "lock error", 
     ap=0xfffffe8000042410) at /usr/src/sys/kern/subr_prf.c:308
 #2  0xffffffff803c26b9 in panic (fmt=<unavailable>)
     at /usr/src/sys/kern/subr_prf.c:205
 #3  0xffffffff803bbfca in lockdebug_abort (lock=0xfffffe806efc31b8, 
     ops=0xffffffff806edaa0, func=0xffffffff804d7b00 "rw_vector_enter", 
     msg=0xffffffff8051af47 "locking against myself")
     at /usr/src/sys/kern/subr_lockdebug.c:858
 #4  0xffffffff8027d9cf in rw_vector_enter (rw=0xfffffe806efc31b8, op=RW_READER)
     at /usr/src/sys/kern/kern_rwlock.c:341
 #5  0xffffffff8043aeb3 in uvmfault_lookup (write_lock=false, 
     ufi=0xfffffe80000426a0) at /usr/src/sys/uvm/uvm_fault_i.h:126
 #6  uvm_fault_check (maxprot=false, ranons=<optimized out>, 
     flt=0xfffffe80000426e0, ufi=0xfffffe80000426a0)
     at /usr/src/sys/uvm/uvm_fault.c:957
 #7  uvm_fault_internal (orig_map=<optimized out>, vaddr=<optimized out>, 
     access_type=<optimized out>, fault_flag=<optimized out>)
     at /usr/src/sys/uvm/uvm_fault.c:877
 #8  0xffffffff803fb057 in trap (frame=0xfffffe8000042840)
     at /usr/src/sys/arch/amd64/amd64/trap.c:568
 #9  0xffffffff80101046 in calltrap ()
 #10 0xfffffe8039dd6d48 in ?? ()
 #11 0x0000000000000000 in ?? ()
 (gdb) fr 8
 #8  0xffffffff803fb057 in trap (frame=0xfffffe8000042840)
     at /usr/src/sys/arch/amd64/amd64/trap.c:568
 568     /usr/src/sys/arch/amd64/amd64/trap.c: No such file or directory.
         in /usr/src/sys/arch/amd64/amd64/trap.c
 
 (gdb) x/i frame->tf_rip
    0xffffffff804362e2 <amap_copy+175>:  mov    %rdi,(%rcx,%rsi,1)
 
 (gdb) p *(struct vm_amap *)frame->tf_rbx
 $2 = {
   am_lock = 0x0, 
   am_ref = 1, 
   am_flags = 0, 
   am_maxslot = 240, 
   am_nslot = 240, 
   am_nused = 0, 
   am_slots = 0xfffffe806ecd8c08, 
   am_bckptr = 0xfffffe806f6f0808, 
   am_anon = 0x8, 
   am_ppref = 0x0, 
   am_list = {
     le_next = 0x0, 
     le_prev = 0x0
   }
 }
 
 (gdb) p uvmexp
 $3 = {
   pagesize = 4096, 
   pagemask = 4095, 
   pageshift = 12, 
   npages = 507176, 
   free = 0, 
   paging = 1875, 
 ...
 }
 
 
 this time the pagedaemon is currently running on the other CPU:
 
 (gdb) p cpu_info_primary->ci_next->ci_curlwp->l_name
 $5 = 0xfffffe8004ffb688 "pgdaemon"
 
 I'm not sure how to get a stack trace out of a dump for something
 that was running on the other CPU.
 
 it's interesting that the bad address that we got both times was the same.
 
 
 -Chuck
 

