Subject: Re: tricky bug in ufs_lookup?
To: Karl Janmar <karl@utopiafoundation.org>
From: Karl Janmar <karl@utopiafoundation.org>
List: tech-kern
Date: 08/20/2005 14:04:11
Sorry, I was mixing up the kernel that I tried to debug...
So I have to use the kernel written by savecore, without symbols.
Anyway discard the 2 objdump outputs from the original message.
Backtrace:
(gdb) bt
#0 0xc09f1000 in ?? ()
#1 0xc046af8b in cpu_reboot ()
#2 0xc03ce835 in panic ()
#3 0xc04763f8 in trap ()
#4 0xc010ae53 in calltrap ()
#5 0xc03f77b2 in sys_sync ()
#6 0xc03f5f06 in vfs_shutdown ()
#7 0xc046af9f in cpu_reboot ()
#8 0xc03ce835 in panic ()
#9 0xc04763f8 in trap ()
#10 0xc010ae53 in calltrap ()
#11 0xc03ff03b in VOP_LOOKUP ()
#12 0xc03f279e in lookup ()
#13 0xc03f21db in namei ()
#14 0xc03fddf5 in vn_open ()
#15 0xc03f8431 in sys_open ()
#16 0xc0475d2a in syscall_plain ()
at stack level #12:
(gdb) x /10i $eip-30
0xc03f2780 <lookup+496>: add $0x89,%al
0xc03f2782 <lookup+498>: jo 0xc03f2798 <lookup+520>
0xc03f2784 <lookup+500>: movl $0x0,0x10(%eax)
0xc03f278b <lookup+507>: mov 0xffffffd0(%ebp),%edx
0xc03f278e <lookup+510>: mov %edi,0x8(%esp,1)
0xc03f2792 <lookup+514>: mov %esi,(%esp,1)
0xc03f2795 <lookup+517>: mov %edx,0x4(%esp,1)
0xc03f2799 <lookup+521>: call 0xc03ff00b <VOP_LOOKUP>
0xc03f279e <lookup+526>: test %eax,%eax
(gdb) i reg
eax 0x0 0
ecx 0x0 0
edx 0x0 0
ebx 0xceb27ed8 -827162920
esp 0xceb27908 0xceb27908
ebp 0xceb27d5c 0xceb27d5c
esi 0xd45c3a80 -732153216
edi 0xccc802a0 -859307360
eip 0xc03f279e 0xc03f279e
And if we take a closer look at esi, that should be the variable dp in lookup (a
struct vnode *), the first argument for VOP_LOOKUP.
I do this by using my other kernel with symbol to get the struct decoding right:
(gdb) print *(struct vnode *)0xd45c3a80
$2 = {v_uobj = {vmobjlock = {lock_data = 0}, pgops = 0xc083f770, memq = {
tqh_first = 0x0, tqh_last = 0xd45c3a88}, uo_npages = 0, uo_refs = 1},
v_size = 2048, v_flag = 128, v_numoutput = 0, v_writecount = 0,
v_holdcnt = 1, v_mount = 0xc2fc0000, v_op = 0xc2a42700, v_freelist = {
tqe_next = 0xd1572938, tqe_prev = 0xd427a8d0}, v_mntvnodes = {
le_next = 0xd4989dd8, le_prev = 0xd174f4f0}, v_cleanblkhd = {
lh_first = 0xc322f428}, v_dirtyblkhd = {lh_first = 0x0}, v_synclist = {
le_next = 0xd4989dd8, le_prev = 0xd2b39e2c}, v_dnclist = {
lh_first = 0xd500add4}, v_nclist = {lh_first = 0xd1b33f30}, v_un = {
vu_mountedhere = 0x0, vu_socket = 0x0, vu_specinfo = 0x0,
vu_fifoinfo = 0x0}, v_lease = 0x0, v_type = VDIR, v_tag = VT_UFS,
v_lock = {lk_interlock = {lock_data = 0}, lk_flags = 1024,
lk_sharecount = 0, lk_exclusivecount = 1, lk_recurselevel = 0,
lk_waitcount = 0, lk_wmesg = 0xc079d7dc "vnlock", lk_un = {lk_un_sleep = {
lk_sleep_lockholder = 3785, lk_sleep_locklwp = 1, lk_sleep_prio = 20,
lk_sleep_timo = 0, lk_newlock = 0x0}, lk_un_spin = {
lk_spin_cpu = 3785}}}, v_vnlock = 0xd45c3af0, v_data = 0x5c518c,
v_klist = {slh_first = 0x0}}
I switch back to my real kernel without debugging symbols and see where the v_op
points at.
(gdb) x /4x 0xc2a42700
0xc2a42700: 0xc03f1254 0xc03ee7f6 0xc0377fb0 0xc037b4bc
And for the vop_lookup, it has an offset of 2 so we are dig deeper at 0xc0377fb0:
(gdb) x /30i 0xc0377fb0
0xc0377fb0 <ufs_lookup>: push %ebp
0xc0377fb1 <ufs_lookup+1>: mov %esp,%ebp
0xc0377fb3 <ufs_lookup+3>: push %edi
0xc0377fb4 <ufs_lookup+4>: push %esi
0xc0377fb5 <ufs_lookup+5>: push %ebx
0xc0377fb6 <ufs_lookup+6>: sub $0xbc,%esp
0xc0377fbc <ufs_lookup+12>: movl $0xffffffff,0xffffffd4(%ebp)
0xc0377fc3 <ufs_lookup+19>: movl $0x0,0xffffffec(%ebp)
0xc0377fca <ufs_lookup+26>: mov 0x8(%ebp),%eax
0xc0377fcd <ufs_lookup+29>: mov 0x8(%ebp),%ebx
0xc0377fd0 <ufs_lookup+32>: mov 0x8(%ebp),%ecx
0xc0377fd3 <ufs_lookup+35>: mov 0xc(%ebx),%ebx
0xc0377fd6 <ufs_lookup+38>: mov 0x4(%eax),%eax
0xc0377fd9 <ufs_lookup+41>: mov 0x8(%ecx),%ecx
0xc0377fdc <ufs_lookup+44>: mov 0xa0(%eax),%edx
0xc0377fe2 <ufs_lookup+50>: mov %eax,0xffffffe4(%ebp)
0xc0377fe5 <ufs_lookup+53>: movl $0x0,(%ecx)
0xc0377feb <ufs_lookup+59>: mov (%ebx),%eax
0xc0377fed <ufs_lookup+61>: mov %edx,0xffffffe0(%ebp)
0xc0377ff0 <ufs_lookup+64>: mov %ecx,0xffffffa8(%ebp)
0xc0377ff3 <ufs_lookup+67>: mov %eax,0xffffff9c(%ebp)
0xc0377ff6 <ufs_lookup+70>: mov 0xc(%ebx),%edi
0xc0377ff9 <ufs_lookup+73>: mov %ebx,0xffffffa4(%ebp)
0xc0377ffc <ufs_lookup+76>: mov 0x44(%edx),%eax <---- here is the trap
0xc0377fff <ufs_lookup+79>: mov %edi,0xffffffa0(%ebp)
0xc0378002 <ufs_lookup+82>: mov 0x16c(%eax),%eax
0xc0378008 <ufs_lookup+88>: mov %eax,0xffffff98(%ebp)
0xc037800b <ufs_lookup+91>: mov 0x4(%ebx),%eax
0xc037800e <ufs_lookup+94>: mov %eax,%edx
0xc0378010 <ufs_lookup+96>: mov %eax,%ecx
At stack level #11:
(gdb) disassemble VOP_LOOKUP
Dump of assembler code for function VOP_LOOKUP:
0xc03ff00b <VOP_LOOKUP>: push %ebp
0xc03ff00c <VOP_LOOKUP+1>: mov %esp,%ebp
0xc03ff00e <VOP_LOOKUP+3>: sub $0x28,%esp
0xc03ff011 <VOP_LOOKUP+6>: mov 0xc(%ebp),%eax
0xc03ff014 <VOP_LOOKUP+9>: mov 0x8(%ebp),%edx
0xc03ff017 <VOP_LOOKUP+12>: movl $0xc068cb20,0xffffffe8(%ebp)
0xc03ff01e <VOP_LOOKUP+19>: mov %eax,0xfffffff0(%ebp)
0xc03ff021 <VOP_LOOKUP+22>: mov %edx,0xffffffec(%ebp)
0xc03ff024 <VOP_LOOKUP+25>: mov 0x10(%ebp),%eax
0xc03ff027 <VOP_LOOKUP+28>: mov %eax,0xfffffff4(%ebp)
0xc03ff02a <VOP_LOOKUP+31>: mov 0x34(%edx),%edx
0xc03ff02d <VOP_LOOKUP+34>: lea 0xffffffe8(%ebp),%eax
0xc03ff030 <VOP_LOOKUP+37>: mov %eax,(%esp,1)
0xc03ff033 <VOP_LOOKUP+40>: mov 0xc068cb20,%eax
0xc03ff038 <VOP_LOOKUP+45>: call *(%edx,%eax,4)
0xc03ff03b <VOP_LOOKUP+48>: mov %ebp,%esp
0xc03ff03d <VOP_LOOKUP+50>: pop %ebp
0xc03ff03e <VOP_LOOKUP+51>: ret
End of assembler dump.
(gdb) i reg
eax 0x0 0
ecx 0x0 0
edx 0x0 0
ebx 0xceb27ed8 -827162920
esp 0xceb27908 0xceb27908
ebp 0xceb27d0c 0xceb27d0c
esi 0xd45c3a80 -732153216
edi 0xccc802a0 -859307360
eip 0xc03ff03b 0xc03ff03b
So here the call ends up in ufs_lookup that I disassembled above.
So the frame where the trap occured is lost?
Can somebody with sharp eyes spot something or want more information?
Regards,
Karl Janmar
Karl Janmar wrote:
> Hi,
>
> I occasionally get panics by a page-fault in kernel-mode.
> This system is the netbsd-3 tag since 10 July.
> I could not diff against anoncvs.netbsd.org as it's down....do I miss some vital
> update?
>
> uname -a
> NetBSD ngong.utopiafoundation.org 3.0_BETA NetBSD 3.0_BETA (NGONG) #32: Thu Jul
> 28 16:55:12 CEST 2005
> karl@ngong.utopiafoundation.org:/usr/src/sys/arch/i386/compile/NGONG i386
>
> Unfortuantly I can't gdb generate a backtrace.
> It says:
> (gdb) target kcore /archive/crashes/netbsd.14.core
> can not access 0x4, invalid translation (invalid PTE)
> can not access 0x4, invalid translation (invalid PTE)
> warning: cannot read switchframe registers
> #0 0x00000008 in ?? ()
> It seems like something is trashed, when I read the msgbufp it reads
> zero, but the message-buffer is intact as the dmesg below shows.
> (gdb) print msgbufp
> $5 = (struct kern_msgbuf *) 0x0
>
>
> dmesg -M netbsd.14.core reveal:
> ...[cut]....
> uvm_fault(0xd8ebb380, 0x5c5000, 0, 1) -> 0xe
> fatal page fault in supervisor mode
> trap type 6 code 0 eip c0377ffc cs 8 eflags 10296 cr2 5c51d0 ilevel 0
> panic: trap
> syncing disks... uvm_fault(0xd8ebb380, 0x5c5000, 0, 1) -> 0xe
> fatal page fault in supervisor mode
> trap type 6 code 0 eip c035b275 cs 8 eflags 10202 cr2 5c51d8 ilevel 0
> panic: trap
>
> dumping to dev 0,6 offset 4001797
> ....[cut]....
>
>
> However I can resolv the vm_map that belongs to the page fault, but I don't have
> a clue what to do with it:
>
> (gdb) print *(struct vm_map *) 0xd8ebb380
> $6 = {pmap = 0xe6964510, lock = {lk_interlock = {lock_data = 0}, lk_flags = 0,
> lk_sharecount = 0, lk_exclusivecount = 0, lk_recurselevel = 0,
> lk_waitcount = 0, lk_wmesg = 0xc079f46a "vmmaplk", lk_un = {lk_un_sleep = {
> lk_sleep_lockholder = -1, lk_sleep_locklwp = 0, lk_sleep_prio = 4,
> lk_sleep_timo = 0, lk_newlock = 0x0}, lk_un_spin = {
> lk_spin_cpu = 4294967295}}}, rbhead = {rbh_root = 0xd8e84420},
> header = {rb_entry = {rbe_left = 0x0, rbe_right = 0x0, rbe_parent = 0x0,
> rbe_color = 0}, ownspace = 0, space = 0, prev = 0xd8e848f0,
> next = 0xd8e84790, start = 3217031168, end = 0, object = {uvm_obj = 0x0,
> sub_map = 0x0}, offset = 0, etype = 0, protection = 0,
> max_protection = 0, inheritance = 0, wired_count = 0, aref = {
> ar_pageoff = 0, ar_amap = 0x0}, advice = 0, flags = 0 '\0'},
> nentries = 23, size = 38744064, ref_count = 1, ref_lock = {lock_data = 0},
> hint = 0xd8ebb3b4, hint_lock = {lock_data = 0}, first_free = 0xd8e84f20,
> flags = 65, flags_lock = {lock_data = 0}, timestamp = 143}
>
> Below is the code in my kernel that triggered the kernel fault. Is somebody able
> to read something out if this? or if you want more info please email me and I
> can fix that.
>
> Objdump of the affected location:
> c0377fd4 <ufs_lookup>:
> c0377fd4: 55 push %ebp
> c0377fd5: 89 e5 mov %esp,%ebp
> c0377fd7: 57 push %edi
> c0377fd8: 56 push %esi
> c0377fd9: 53 push %ebx
> c0377fda: 81 ec bc 00 00 00 sub $0xbc,%esp
> c0377fe0: c7 45 d4 ff ff ff ff movl $0xffffffff,0xffffffd4(%ebp)
> c0377fe7: c7 45 ec 00 00 00 00 movl $0x0,0xffffffec(%ebp)
> c0377fee: 8b 45 08 mov 0x8(%ebp),%eax
> c0377ff1: 8b 5d 08 mov 0x8(%ebp),%ebx
> c0377ff4: 8b 4d 08 mov 0x8(%ebp),%ecx
> c0377ff7: 8b 5b 0c mov 0xc(%ebx),%ebx
> c0377ffa: 8b 40 04 mov 0x4(%eax),%eax
> c0377ffd: 8b 49 08 mov 0x8(%ecx),%ecx
> c0378000: 8b 90 a0 00 00 00 mov 0xa0(%eax),%edx
>
> and the second:
>
> c035b0e3 <ffs_sync>:
> c035b0e3: 55 push %ebp
> c035b0e4: 89 e5 mov %esp,%ebp
> c035b0e6: 57 push %edi
> c035b0e7: 56 push %esi
> c035b0e8: 53 push %ebx
> c035b0e9: 83 ec 2c sub $0x2c,%esp
> c035b0ec: c7 45 e8 00 00 00 00 movl $0x0,0xffffffe8(%ebp)
> c035b0f3: 8b 45 08 mov 0x8(%ebp),%eax
> c035b0f6: 8b b8 08 09 00 00 mov 0x908(%eax),%edi
> c035b0fc: 8b 57 14 mov 0x14(%edi),%edx
> c035b0ff: 89 55 ec mov %edx,0xffffffec(%ebp)
> ...[cut]....
> c035b263: 89 44 24 08 mov %eax,0x8(%esp)
> c035b267: 8d 45 f0 lea 0xfffffff0(%ebp),%eax
> c035b26a: 89 44 24 04 mov %eax,0x4(%esp)
> c035b26e: 8b 07 mov (%edi),%eax
> c035b270: 89 04 24 mov %eax,(%esp)
> c035b273: e8 f2 4a ff ff call c034fd6a <softdep_flushworklist>
> c035b278: 85 c0 test %eax,%eax
> c035b27a: 0f 44 45 e8 cmove 0xffffffe8(%ebp),%eax
> c035b27e: 85 c0 test %eax,%eax
> c035b280: 89 45 e8 mov %eax,0xffffffe8(%ebp)
> c035b283: 0f 85 c6 fe ff ff jne c035b14f <ffs_sync+0x6c>
> c035b289: 8b 4d f0 mov 0xfffffff0(%ebp),%ecx
> c035b28c: 85 c9 test %ecx,%ecx
>
>
> Regards,
> Karl Janmar
> - karl@utopiafoundation.org
>