Subject: Re: tricky bug in ufs_lookup?
To: Karl Janmar <karl@utopiafoundation.org>
From: Karl Janmar <karl@utopiafoundation.org>
List: tech-kern
Date: 08/20/2005 14:04:11
Sorry, I was mixing up the kernel that I tried to debug...
So I have to use the kernel written by savecore, without symbols.

Anyway discard the 2 objdump outputs from the original message.

Backtrace:
(gdb) bt
#0  0xc09f1000 in ?? ()
#1  0xc046af8b in cpu_reboot ()
#2  0xc03ce835 in panic ()
#3  0xc04763f8 in trap ()
#4  0xc010ae53 in calltrap ()
#5  0xc03f77b2 in sys_sync ()
#6  0xc03f5f06 in vfs_shutdown ()
#7  0xc046af9f in cpu_reboot ()
#8  0xc03ce835 in panic ()
#9  0xc04763f8 in trap ()
#10 0xc010ae53 in calltrap ()
#11 0xc03ff03b in VOP_LOOKUP ()
#12 0xc03f279e in lookup ()
#13 0xc03f21db in namei ()
#14 0xc03fddf5 in vn_open ()
#15 0xc03f8431 in sys_open ()
#16 0xc0475d2a in syscall_plain ()


at stack level #12:
(gdb) x /10i $eip-30
0xc03f2780 <lookup+496>:        add    $0x89,%al
0xc03f2782 <lookup+498>:        jo     0xc03f2798 <lookup+520>
0xc03f2784 <lookup+500>:        movl   $0x0,0x10(%eax)
0xc03f278b <lookup+507>:        mov    0xffffffd0(%ebp),%edx
0xc03f278e <lookup+510>:        mov    %edi,0x8(%esp,1)
0xc03f2792 <lookup+514>:        mov    %esi,(%esp,1)
0xc03f2795 <lookup+517>:        mov    %edx,0x4(%esp,1)
0xc03f2799 <lookup+521>:        call   0xc03ff00b <VOP_LOOKUP>
0xc03f279e <lookup+526>:        test   %eax,%eax
(gdb) i reg
eax            0x0      0
ecx            0x0      0
edx            0x0      0
ebx            0xceb27ed8       -827162920
esp            0xceb27908       0xceb27908
ebp            0xceb27d5c       0xceb27d5c
esi            0xd45c3a80       -732153216
edi            0xccc802a0       -859307360
eip            0xc03f279e       0xc03f279e

And if we take a closer look at esi, that should be the variable dp in lookup (a
struct vnode *), the first argument for VOP_LOOKUP.
I do this by using my other kernel with symbol to get the struct decoding right:

(gdb) print *(struct vnode *)0xd45c3a80
$2 = {v_uobj = {vmobjlock = {lock_data = 0}, pgops = 0xc083f770, memq = {
      tqh_first = 0x0, tqh_last = 0xd45c3a88}, uo_npages = 0, uo_refs = 1},
  v_size = 2048, v_flag = 128, v_numoutput = 0, v_writecount = 0,
  v_holdcnt = 1, v_mount = 0xc2fc0000, v_op = 0xc2a42700, v_freelist = {
    tqe_next = 0xd1572938, tqe_prev = 0xd427a8d0}, v_mntvnodes = {
    le_next = 0xd4989dd8, le_prev = 0xd174f4f0}, v_cleanblkhd = {
    lh_first = 0xc322f428}, v_dirtyblkhd = {lh_first = 0x0}, v_synclist = {
    le_next = 0xd4989dd8, le_prev = 0xd2b39e2c}, v_dnclist = {
    lh_first = 0xd500add4}, v_nclist = {lh_first = 0xd1b33f30}, v_un = {
    vu_mountedhere = 0x0, vu_socket = 0x0, vu_specinfo = 0x0,
    vu_fifoinfo = 0x0}, v_lease = 0x0, v_type = VDIR, v_tag = VT_UFS,
  v_lock = {lk_interlock = {lock_data = 0}, lk_flags = 1024,
    lk_sharecount = 0, lk_exclusivecount = 1, lk_recurselevel = 0,
    lk_waitcount = 0, lk_wmesg = 0xc079d7dc "vnlock", lk_un = {lk_un_sleep = {
        lk_sleep_lockholder = 3785, lk_sleep_locklwp = 1, lk_sleep_prio = 20,
        lk_sleep_timo = 0, lk_newlock = 0x0}, lk_un_spin = {
        lk_spin_cpu = 3785}}}, v_vnlock = 0xd45c3af0, v_data = 0x5c518c,
  v_klist = {slh_first = 0x0}}


I switch back to my real kernel without debugging symbols and see where the v_op
points at.

(gdb) x /4x 0xc2a42700
0xc2a42700:     0xc03f1254      0xc03ee7f6      0xc0377fb0      0xc037b4bc

And for the vop_lookup, it has an offset of 2 so we are dig deeper at 0xc0377fb0:

(gdb) x /30i 0xc0377fb0
0xc0377fb0 <ufs_lookup>:        push   %ebp
0xc0377fb1 <ufs_lookup+1>:      mov    %esp,%ebp
0xc0377fb3 <ufs_lookup+3>:      push   %edi
0xc0377fb4 <ufs_lookup+4>:      push   %esi
0xc0377fb5 <ufs_lookup+5>:      push   %ebx
0xc0377fb6 <ufs_lookup+6>:      sub    $0xbc,%esp
0xc0377fbc <ufs_lookup+12>:     movl   $0xffffffff,0xffffffd4(%ebp)
0xc0377fc3 <ufs_lookup+19>:     movl   $0x0,0xffffffec(%ebp)
0xc0377fca <ufs_lookup+26>:     mov    0x8(%ebp),%eax
0xc0377fcd <ufs_lookup+29>:     mov    0x8(%ebp),%ebx
0xc0377fd0 <ufs_lookup+32>:     mov    0x8(%ebp),%ecx
0xc0377fd3 <ufs_lookup+35>:     mov    0xc(%ebx),%ebx
0xc0377fd6 <ufs_lookup+38>:     mov    0x4(%eax),%eax
0xc0377fd9 <ufs_lookup+41>:     mov    0x8(%ecx),%ecx
0xc0377fdc <ufs_lookup+44>:     mov    0xa0(%eax),%edx
0xc0377fe2 <ufs_lookup+50>:     mov    %eax,0xffffffe4(%ebp)
0xc0377fe5 <ufs_lookup+53>:     movl   $0x0,(%ecx)
0xc0377feb <ufs_lookup+59>:     mov    (%ebx),%eax
0xc0377fed <ufs_lookup+61>:     mov    %edx,0xffffffe0(%ebp)
0xc0377ff0 <ufs_lookup+64>:     mov    %ecx,0xffffffa8(%ebp)
0xc0377ff3 <ufs_lookup+67>:     mov    %eax,0xffffff9c(%ebp)
0xc0377ff6 <ufs_lookup+70>:     mov    0xc(%ebx),%edi
0xc0377ff9 <ufs_lookup+73>:     mov    %ebx,0xffffffa4(%ebp)
0xc0377ffc <ufs_lookup+76>:     mov    0x44(%edx),%eax  <---- here is the trap
0xc0377fff <ufs_lookup+79>:     mov    %edi,0xffffffa0(%ebp)
0xc0378002 <ufs_lookup+82>:     mov    0x16c(%eax),%eax
0xc0378008 <ufs_lookup+88>:     mov    %eax,0xffffff98(%ebp)
0xc037800b <ufs_lookup+91>:     mov    0x4(%ebx),%eax
0xc037800e <ufs_lookup+94>:     mov    %eax,%edx
0xc0378010 <ufs_lookup+96>:     mov    %eax,%ecx


At stack level #11:

(gdb) disassemble VOP_LOOKUP
Dump of assembler code for function VOP_LOOKUP:
0xc03ff00b <VOP_LOOKUP>:        push   %ebp
0xc03ff00c <VOP_LOOKUP+1>:      mov    %esp,%ebp
0xc03ff00e <VOP_LOOKUP+3>:      sub    $0x28,%esp
0xc03ff011 <VOP_LOOKUP+6>:      mov    0xc(%ebp),%eax
0xc03ff014 <VOP_LOOKUP+9>:      mov    0x8(%ebp),%edx
0xc03ff017 <VOP_LOOKUP+12>:     movl   $0xc068cb20,0xffffffe8(%ebp)
0xc03ff01e <VOP_LOOKUP+19>:     mov    %eax,0xfffffff0(%ebp)
0xc03ff021 <VOP_LOOKUP+22>:     mov    %edx,0xffffffec(%ebp)
0xc03ff024 <VOP_LOOKUP+25>:     mov    0x10(%ebp),%eax
0xc03ff027 <VOP_LOOKUP+28>:     mov    %eax,0xfffffff4(%ebp)
0xc03ff02a <VOP_LOOKUP+31>:     mov    0x34(%edx),%edx
0xc03ff02d <VOP_LOOKUP+34>:     lea    0xffffffe8(%ebp),%eax
0xc03ff030 <VOP_LOOKUP+37>:     mov    %eax,(%esp,1)
0xc03ff033 <VOP_LOOKUP+40>:     mov    0xc068cb20,%eax
0xc03ff038 <VOP_LOOKUP+45>:     call   *(%edx,%eax,4)
0xc03ff03b <VOP_LOOKUP+48>:     mov    %ebp,%esp
0xc03ff03d <VOP_LOOKUP+50>:     pop    %ebp
0xc03ff03e <VOP_LOOKUP+51>:     ret
End of assembler dump.
(gdb) i reg
eax            0x0      0
ecx            0x0      0
edx            0x0      0
ebx            0xceb27ed8       -827162920
esp            0xceb27908       0xceb27908
ebp            0xceb27d0c       0xceb27d0c
esi            0xd45c3a80       -732153216
edi            0xccc802a0       -859307360
eip            0xc03ff03b       0xc03ff03b

So here the call ends up in ufs_lookup that I disassembled above.
So the frame where the trap occured is lost?
Can somebody with sharp eyes spot something or want more information?

Regards,
Karl Janmar


Karl Janmar wrote:
> Hi,
> 
> I occasionally get panics by a page-fault in kernel-mode.
> This system is the netbsd-3 tag since 10 July.
> I could not diff against anoncvs.netbsd.org as it's down....do I miss some vital
> update?
> 
> uname -a
> NetBSD ngong.utopiafoundation.org 3.0_BETA NetBSD 3.0_BETA (NGONG) #32: Thu Jul
> 28 16:55:12 CEST 2005
> karl@ngong.utopiafoundation.org:/usr/src/sys/arch/i386/compile/NGONG i386
> 
> Unfortuantly I can't gdb generate a backtrace.
> It says:
> (gdb) target kcore /archive/crashes/netbsd.14.core
> can not access 0x4, invalid translation (invalid PTE)
> can not access 0x4, invalid translation (invalid PTE)
> warning: cannot read switchframe registers
> #0  0x00000008 in ?? ()
> It seems like something is trashed, when I read the msgbufp it reads
> zero, but the message-buffer is intact as the dmesg below shows.
> (gdb) print msgbufp
> $5 = (struct kern_msgbuf *) 0x0
> 
> 
> dmesg -M netbsd.14.core reveal:
> ...[cut]....
> uvm_fault(0xd8ebb380, 0x5c5000, 0, 1) -> 0xe
> fatal page fault in supervisor mode
> trap type 6 code 0 eip c0377ffc cs 8 eflags 10296 cr2 5c51d0 ilevel 0
> panic: trap
> syncing disks... uvm_fault(0xd8ebb380, 0x5c5000, 0, 1) -> 0xe
> fatal page fault in supervisor mode
> trap type 6 code 0 eip c035b275 cs 8 eflags 10202 cr2 5c51d8 ilevel 0
> panic: trap
> 
> dumping to dev 0,6 offset 4001797
> ....[cut]....
> 
> 
> However I can resolv the vm_map that belongs to the page fault, but I don't have
> a clue what to do with it:
> 
> (gdb) print *(struct vm_map *) 0xd8ebb380
> $6 = {pmap = 0xe6964510, lock = {lk_interlock = {lock_data = 0}, lk_flags = 0,
>     lk_sharecount = 0, lk_exclusivecount = 0, lk_recurselevel = 0,
>     lk_waitcount = 0, lk_wmesg = 0xc079f46a "vmmaplk", lk_un = {lk_un_sleep = {
>         lk_sleep_lockholder = -1, lk_sleep_locklwp = 0, lk_sleep_prio = 4,
>         lk_sleep_timo = 0, lk_newlock = 0x0}, lk_un_spin = {
>         lk_spin_cpu = 4294967295}}}, rbhead = {rbh_root = 0xd8e84420},
>   header = {rb_entry = {rbe_left = 0x0, rbe_right = 0x0, rbe_parent = 0x0,
>       rbe_color = 0}, ownspace = 0, space = 0, prev = 0xd8e848f0,
>     next = 0xd8e84790, start = 3217031168, end = 0, object = {uvm_obj = 0x0,
>       sub_map = 0x0}, offset = 0, etype = 0, protection = 0,
>     max_protection = 0, inheritance = 0, wired_count = 0, aref = {
>       ar_pageoff = 0, ar_amap = 0x0}, advice = 0, flags = 0 '\0'},
>   nentries = 23, size = 38744064, ref_count = 1, ref_lock = {lock_data = 0},
>   hint = 0xd8ebb3b4, hint_lock = {lock_data = 0}, first_free = 0xd8e84f20,
>   flags = 65, flags_lock = {lock_data = 0}, timestamp = 143}
> 
> Below is the code in my kernel that triggered the kernel fault. Is somebody able
> to read something out if this? or if you want more info please email me and I
> can fix that.
> 
> Objdump of the affected location:
> c0377fd4 <ufs_lookup>:
> c0377fd4:       55                      push   %ebp
> c0377fd5:       89 e5                   mov    %esp,%ebp
> c0377fd7:       57                      push   %edi
> c0377fd8:       56                      push   %esi
> c0377fd9:       53                      push   %ebx
> c0377fda:       81 ec bc 00 00 00       sub    $0xbc,%esp
> c0377fe0:       c7 45 d4 ff ff ff ff    movl   $0xffffffff,0xffffffd4(%ebp)
> c0377fe7:       c7 45 ec 00 00 00 00    movl   $0x0,0xffffffec(%ebp)
> c0377fee:       8b 45 08                mov    0x8(%ebp),%eax
> c0377ff1:       8b 5d 08                mov    0x8(%ebp),%ebx
> c0377ff4:       8b 4d 08                mov    0x8(%ebp),%ecx
> c0377ff7:       8b 5b 0c                mov    0xc(%ebx),%ebx
> c0377ffa:       8b 40 04                mov    0x4(%eax),%eax
> c0377ffd:       8b 49 08                mov    0x8(%ecx),%ecx
> c0378000:       8b 90 a0 00 00 00       mov    0xa0(%eax),%edx
> 
> and the second:
> 
> c035b0e3 <ffs_sync>:
> c035b0e3:       55                      push   %ebp
> c035b0e4:       89 e5                   mov    %esp,%ebp
> c035b0e6:       57                      push   %edi
> c035b0e7:       56                      push   %esi
> c035b0e8:       53                      push   %ebx
> c035b0e9:       83 ec 2c                sub    $0x2c,%esp
> c035b0ec:       c7 45 e8 00 00 00 00    movl   $0x0,0xffffffe8(%ebp)
> c035b0f3:       8b 45 08                mov    0x8(%ebp),%eax
> c035b0f6:       8b b8 08 09 00 00       mov    0x908(%eax),%edi
> c035b0fc:       8b 57 14                mov    0x14(%edi),%edx
> c035b0ff:       89 55 ec                mov    %edx,0xffffffec(%ebp)
> ...[cut]....
> c035b263:       89 44 24 08             mov    %eax,0x8(%esp)
> c035b267:       8d 45 f0                lea    0xfffffff0(%ebp),%eax
> c035b26a:       89 44 24 04             mov    %eax,0x4(%esp)
> c035b26e:       8b 07                   mov    (%edi),%eax
> c035b270:       89 04 24                mov    %eax,(%esp)
> c035b273:       e8 f2 4a ff ff          call   c034fd6a <softdep_flushworklist>
> c035b278:       85 c0                   test   %eax,%eax
> c035b27a:       0f 44 45 e8             cmove  0xffffffe8(%ebp),%eax
> c035b27e:       85 c0                   test   %eax,%eax
> c035b280:       89 45 e8                mov    %eax,0xffffffe8(%ebp)
> c035b283:       0f 85 c6 fe ff ff       jne    c035b14f <ffs_sync+0x6c>
> c035b289:       8b 4d f0                mov    0xfffffff0(%ebp),%ecx
> c035b28c:       85 c9                   test   %ecx,%ecx
> 
> 
> Regards,
> Karl Janmar
> - karl@utopiafoundation.org
>