Subject: port-i386/20012: i386 MP kernel hang/deadlock post-SA integration
To: None <gnats-bugs@gnats.netbsd.org>
From: None <he@netbsd.org>
List: netbsd-bugs
Date: 01/23/2003 13:19:08
>Number:         20012
>Category:       port-i386
>Synopsis:       i386 MP kernel hang/deadlock post-SA integration
>Confidential:   no
>Severity:       serious
>Priority:       medium
>Responsible:    port-i386-maintainer
>State:          open
>Class:          sw-bug
>Submitter-Id:   net
>Arrival-Date:   Thu Jan 23 04:20:00 PST 2003
>Closed-Date:
>Last-Modified:
>Originator:     Havard Eidnes
>Release:        NetBSD 1.6M
>Organization:
	Unorganized, Inc.
>Environment:
System: NetBSD splitter-pine.urc.uninett.no 1.6M NetBSD 1.6M (GENERIC.MPDEBUG) #10: Tue Jan 21 22:09:26 CET 2003     fvdl@splitter-pine.urc.uninett.no:/usr/src/sys/arch/i386/compile/GENERIC.MPDEBUG i386
Architecture: i386
Machine: i386
>Description:

	Machine appeared to hang during a copy of a largish file with scp.
	Available memory was down to about 2M on this 1G memory machine, which
	may have been coincidental.

	Breaking out into DDB reveals the following state:

[BREAK]
Stopped in pid 8216.1 (scp) at  cpu_Debugger+0x4:       leave
db{3}> trace
cpu_Debugger(c1f4cde0,c,d,400006,e5942848) at cpu_Debugger+0x4
comintr(c1e2b400,c,10,30,400010) at comintr+0x128
Xintr_ioapic4() at Xintr_ioapic4+0x9c
--- interrupt ---
issignal(e5942848,c056b589,1cd,202,c21871b4) at issignal+0x2c
ltsleep(c21871b4,110,c056e414,0,c21871b4) at ltsleep+0x2c3
_lockmgr(c21871b4,2,0,c056e41b,211) at _lockmgr+0xbe7
pipe_direct_write(c2187160,e6643ed0,33d,c0345f31,e6643f80) at pipe_direct_write+0x2fa
pipe_write(e580f034,e580f05c,e6643ed0,c21e0680,1) at pipe_write+0x16f
dofilewrite(e590aa80,6,e580f034,8057000,2000) at dofilewrite+0x9b
sys_write(e5942848,e6643f80,e6643f78,c03b6553,e5942848) at sys_write+0x6b
syscall_plain(1f,1f,1f,bfbf001f,2000) at syscall_plain+0xc0
db{3}> machine cpu 0
using cpu 0
db{3}> trace
_spinlock_acquire_count(c07182e0,1,c056b589,3bc,e49e4318) at _spinlock_acquire_count+0x162
mi_switch(e49e4318,0,e49feb7c,c0311e68,e49e4318) at mi_switch+0x2c4
preempt(1,2c701,e49feccc,c03498f6,e63a9b74) at preempt+0x78
genfs_putpages(e49fed40,1ee,0,e49fed2c,0) at genfs_putpages+0x588
ffs_putpages(e49fed40,0,161,c03063ef,0) at ffs_putpages+0x117
VOP_PUTPAGES(e63a9b74,0,0,0,0) at VOP_PUTPAGES+0x49
ffs_full_fsync(e49fee58,0,e49fedec,c0347df8,0) at ffs_full_fsync+0xa1
ffs_fsync(e49fee58,10012,e5d85470,c05720e0,0) at ffs_fsync+0x3c
VOP_FSYNC(e63a9b74,c1e3cf00,0,0,0) at VOP_FSYNC+0x58
ffs_sync(c1fa8200,3,c1e3cf00,e49dba68,0) at ffs_sync+0x153
sync_fsync(e49fef28,0,c1e3da70,c031150f,0) at sync_fsync+0x7a
VOP_FSYNC(e560e2a4,c1e3cf00,8,0,0) at VOP_FSYNC+0x58
sched_sync(e49e4318,0,0,0,c010030c) at sched_sync+0x121
db{3}> machine cpu 1
using cpu 1
db{3}> trace
_lockmgr(c07182e0,400002,0,c056846c,533) at _lockmgr+0xadc
_kernel_proc_lock(e5942194,e676bf80,8,e5942194,0) at _kernel_proc_lock+0x56
syscall_plain(1f,1f,1f,bfbf001f,bfbfb3f4) at syscall_plain+0xaf
db{3}> machine cpu 2
using cpu 2
db{3}> trace
_spinlock_acquire_count(c07182e0,1,c056b589,3bc,e5942638) at _spinlock_acquire_count+0x15c
mi_switch(e5942638,0,e5a42e10,c0311e68,e5942638) at mi_switch+0x2c4
preempt(1,2,0,1,2000) at preempt+0x78
uiomove(e57b9000,2000,e5a42ed0,c050a1f2,e5a42f80) at uiomove+0xa1
pipe_read(e580f784,e580f7ac,e5a42ed0,c21e0680,1) at pipe_read+0xae
dofileread(e590a390,0,e580f784,bfbf932c,2000) at dofileread+0x9a
sys_read(e5942638,e5a42f80,e5a42f78,c03b6553,e5942638) at sys_read+0x6b
syscall_plain(80b001f,bfbf001f,bfbf001f,1f,8082cb0) at syscall_plain+0xc0
db{3}> machine cpu 3
using cpu 3
db{3}> trace
cpu_Debugger(c1f4cde0,c,d,400006,e5942848) at cpu_Debugger+0x4
comintr(c1e2b400,c,10,30,400010) at comintr+0x128
Xintr_ioapic4() at Xintr_ioapic4+0x9c
--- interrupt ---
issignal(e5942848,c056b589,1cd,202,c21871b4) at issignal+0x2c
ltsleep(c21871b4,110,c056e414,0,c21871b4) at ltsleep+0x2c3
_lockmgr(c21871b4,2,0,c056e41b,211) at _lockmgr+0xbe7
pipe_direct_write(c2187160,e6643ed0,33d,c0345f31,e6643f80) at pipe_direct_write+0x2fa
pipe_write(e580f034,e580f05c,e6643ed0,c21e0680,1) at pipe_write+0x16f
dofilewrite(e590aa80,6,e580f034,8057000,2000) at dofilewrite+0x9b
sys_write(e5942848,e6643f80,e6643f78,c03b6553,e5942848) at sys_write+0x6b
syscall_plain(1f,1f,1f,bfbf001f,2000) at syscall_plain+0xc0
db{3}> show regi
ds                0x10
es                0x10
fs                0x30
gs                0x10
edi         0xc1e2b400  end+0x16f43dc
esi                0xc
ebp         0xe6643c48
ebx         0xc1e2b5c6  end+0x16f45a2
edx             0xff00  gdt_desc+0xddd4
ecx                  0
eax               0x7f
eip         0xc03a3a3c  cpu_Debugger+0x4
cs                 0x8
eflags           0x202
esp         0xe6643c48
ss                0x10
cpu_Debugger+0x4:       leave
db{3}> machine cpu 0
using cpu 0
db{3}> show regi
ds                0x10
es                0x10
fs                0x30
gs                0x10
edi                0xd
esi                  0
ebp         0xe49feafc
ebx         0xc07182e0  kernel_lock
edx         0xc1f45000  end+0x180dfdc
ecx                  0
eax           0x400400  gdt_desc+0x3fe2d4
eip         0xc03067ae  _spinlock_acquire_count+0x162
cs                 0x8
eflags           0x202
esp         0xe49fead4
ss                0x10
_spinlock_acquire_count+0x162:  jz      _spinlock_acquire_count+0x1bc
db{3}> machine cpu 1
using cpu 1
db{3}> show regi
ds                0x10
es                0x10
fs                0x30
gs                0x10
edi         0xe5942194
esi                  0
ebp         0xe676bf00
ebx         0xc07182e0  kernel_lock
edx         0xc1f48800  end+0x18117dc
ecx                  0
eax           0x400400  gdt_desc+0x3fe2d4
eip         0xc03058a8  _lockmgr+0xadc
cs                 0x8
eflags           0x286
esp         0xe676beb8
ss                0x10
_lockmgr+0xadc: movl          0x1c(%ebx),%eax
db{3}> machine cpu 2
using cpu 2
db{3}> show regi
ds                0x10
es                0x10
fs                0x30
gs                0x10
edi                0xd
esi                  0
ebp         0xe5a42d90
ebx         0xc07182e0  kernel_lock
edx         0xc1f48000  end+0x1810fdc
ecx                  0
eax           0x400400  gdt_desc+0x3fe2d4
eip         0xc03067a8  _spinlock_acquire_count+0x15c
cs                 0x8
eflags           0x282
esp         0xe5a42d68
ss                0x10
_spinlock_acquire_count+0x15c:  movl          0x1c(%ebx),%eax
db{3}> ps
 PID           PPID     PGRP        UID S   FLAGS LWPS          COMMAND    WAIT
 8259          8153     8259       1000 2  0x4102    1           systat
 8217          8216     8216       1000 2  0x4002    1              ssh
*8216          7830     8216       1000 2  0x4002    1              scp  pipelk
 8153          8152     8153       1000 2  0x4002    1             tcsh   pause
 8152          8148     8148       1000 2  0x4100    1            xterm  select
 8148          8147     8148       1000 2  0x4000    1             tcsh   pause
 8147          8145     8145       1000 2   0x100    1             sshd
 8145           219     8145          0 2   0x100    1             sshd   netio
 7847          7830     7847          0 4  0x5002    1              csh
 7830          7829     7830       1000 2  0x4002    1             tcsh   pause
 7829          7826     7826       1000 2  0x4100    1            xterm  select
 7826          7825     7826       1000 2  0x4000    1             tcsh   pause
 7825          7823     7823       1000 2   0x100    1             sshd  select
 7823           219     7823          0 2   0x100    1             sshd   netio
 7982          7412     7982          0 2  0x4002    1             tail  kqread
 7419          7401     7419       1000 2  0x4102    1           systat
 7412          7388     7412          0 2  0x5002    1              csh   pause
 7401          7400     7401       1000 2  0x4002    1             tcsh   pause
 7400          7397     7397       1000 2  0x4100    1            xterm  select
 7397          7396     7397       1000 2  0x4000    1             tcsh   pause
 7396          7394     7394       1000 2   0x100    1             sshd  select
 7394           219     7394          0 2   0x100    1             sshd   netio
 7388          7387     7388       1000 2  0x4002    1             tcsh   pause
 7387          7384     7384       1000 2  0x4100    1            xterm  select
 7384          7383     7384       1000 2  0x4000    1             tcsh   pause
 7383          7381     7381       1000 2   0x100    1             sshd  select
 7381           219     7381          0 2   0x100    1             sshd   netio
 17390        28968    17390          0 2  0x5002    1             tcsh   ttyin
 28979            1    28977       1001 2  0x4003    1         gconfd-1  select
 28977        28968    28977       1001 2  0x4003    1       galeon-bin
 28976        28968    28976       1001 2  0x4002    1       junkbuster  netcon
 28968        28967    28968       1001 2  0x4002    1             bash    wait
 28967        28964    28964       1001 2  0x4100    1            xterm  select
 28964        28963    28964       1001 2  0x4000    1              csh   pause
 28963        28961    28961       1001 2   0x100    1             sshd
 28961          219    28961          0 2   0x100    1             sshd   netio
 19709            1    19709       1001 2  0x4000    1             oafd  select
 6331             1     6331       1000 2  0x4000    1             oafd  select
 259              1      245       1000 2  0x4000    1       setiathome
 258              1      246       1000 2  0x4000    1       setiathome
 257              1      244       1000 2  0x4000    1       setiathome
 256              1      243       1000 2  0x4000    1       setiathome
 251              1      251          0 2  0x4002    1            getty   ttyin
 250              1      250          0 2  0x4002    1            getty   ttyin
 249              1      249          0 2  0x4002    1            getty   ttyin
 248              1        1          0 2  0x4000    1            getty nanosle
 247              1      247          0 2  0x4002    1            getty
 237              1      237          0 2       0    1             cron
 233              1      233          0 2       0    1            inetd   pause
 219              1      219          0 2       0    1             sshd  select
 204              1      204          0 2       0    1             ntpd
 187            186        9       1002 2     0x2    1         postgres  select
 186            183        9       1002 2     0x2    1         postgres  select
 183              1        9       1002 2  0x4002    1         postgres  select
 128              1      128          0 2       0    1        mount_mfs  mfsidl
 106              1      106          0 2       0    1          syslogd  select
 8                0        0          0 2 0x20200    1         aiodoned aiodone
 7                0        0          0 2 0x20200    1          ioflush
 6                0        0          0 2 0x20200    1           reaper  reaper
 5                0        0          0 2 0x20200    1       pagedaemon pgdaemo
 4                0        0          0 2 0x20200    1         scsibus2  sccomp
 3                0        0          0 2 0x20200    1         scsibus1  sccomp
 2                0        0          0 2 0x20200    1         scsibus0  sccomp
 1                0        1          0 2  0x4000    1             init    wait
 0               -1        0          0 2 0x20200    1          swapper
 8277      kernel: page fault trap, code=0
Faulted in DDB; continuing...
db{3}>

	("*" above in ps listing was originally a >, replaced to
	placate gnats)

	To my untrained eye it would seem that two CPUs are inside
	_spinlock_acquire_count with an identical first argument.

	I tried to take a crash dump with "reboot 104", but that
	failed with a "locking against myself" error.


	It should also be noted that the comment above the mentioned
	function is misleading (doesn't return "N", since it's a void
	function), and the function name may also be sligthly
	misleading.


>How-To-Repeat:

	Sorry, don't know.  The machine had survived building a full
	release, so it's apparently not easily triggered, and so far
	I've only observed this once.

	If needed, remote console access could be a possibility should
	that be required to pinpoint this further, and if it should
	happen again.


>Fix:
	Sorry, don't know.
>Release-Note:
>Audit-Trail:
>Unformatted: