tech-kern: Re: panic: lockmgr: release of unlocked lock

Subject: Re: panic: lockmgr: release of unlocked lock
To: Dan McMahill <dmcmahill@netbsd.org>
From: Bill Studenmund <wrstuden@netbsd.org>
List: tech-kern
Date: 03/15/2005 08:41:48
--hHWLQfXTYDoKhP50
Content-Type: multipart/mixed; boundary="MGYHOYXEY6WxJCY8"
Content-Disposition: inline


--MGYHOYXEY6WxJCY8
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
Content-Transfer-Encoding: quoted-printable

On Tue, Mar 15, 2005 at 07:47:47AM -0500, Dan McMahill wrote:
> On Mon, Feb 07, 2005 at 11:06:46AM -0800, Bill Studenmund wrote:
> > On Mon, Feb 07, 2005 at 07:42:36PM +0100, Manuel Bouyer wrote:
> > > Could something with a vnlock try to allocate a new vnode ? If so this
> > > is probably the source of the problem.
> >
> > I doubt it. It could be true, but I expect it's more likely that the vnode
> > is locked while the fs is trying to do something that's blocking for
> > memory. Then something else ends up waiting on that vnode lock, then
> > something else on the second one, and then you run the race for root. Once
> > the race is done, the root vnode's locked and everything that looks up a
> > path beginning with "/" gets to wait. :-|
> >
>
> any further progress on this issue?  Manuel, do you still have this problem?
> I've been seeing it on my alpha too.

Can you either use kgdb or get a kernel core dump? I have a gdb script,
which I'll attach, that _should_ be able to figure out what's locked. The
idea is you'd run "lkchain some_vnode->v_vnlock". Well, I've never tested
the script, but that's the idea.

So it will print out a series of "lock foo held by proc bar lwp baz which
is in turn waiting on foo message bar." messages. The end of the list is a
process that's waiting on something other than a lock.

Thus if one proc is wedged and everything else is piled up, this should
tell you what's piled up on what.

Take care,

Bill

--MGYHOYXEY6WxJCY8
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename=lkchain

#	$NetBSD: $

#
# Routines to follow different lock chains, such as chains of vnodes
# sleeping on each other.
#

#
# pfind - Walk the list of procs and find pid
#
#	usage:  pfind <pid>
#	result: $p is left pointing at the struct proc whose p_pid equals
#	        <pid>, or is 0 if the walk reaches the end of the list.
define pfind
	# "proclist" is the struct tag of the list head type, not a variable;
	# the head of the all-processes list is allproc.
	# NOTE(review): confirm the variable name against sys/proc.h for the
	# kernel being debugged.
	set $p=(struct proc *)allproc.lh_first

	while ($p)
		if ($p->p_pid == $arg0)
			# In gdb a bare "break" sets a breakpoint; loop_break is
			# the command that leaves the enclosing while loop.
			loop_break
		end
		set $p=$p->p_list.le_next
	end
end

#
# lfind - given a process and an lid, find the lwp
#
#	usage:  lfind <struct proc *> <lid>
#	result: $l is left pointing at the struct lwp on the process's
#	        p_lwps list whose l_lid equals <lid>, or is 0 if none matches.
define lfind
	set $l=(struct lwp *)$arg0->p_lwps.lh_first

	while ($l)
		if ($l->l_lid == $arg1)
			# In gdb a bare "break" sets a breakpoint; loop_break is
			# the command that leaves the enclosing while loop.
			loop_break
		end
		# p_lwps is linked through l_sibling; l_list is the linkage for
		# the list of all LWPs and would walk into other processes.
		# NOTE(review): confirm the field name against sys/lwp.h for the
		# kernel being debugged.
		set $l=$l->l_sibling.le_next
	end
end

#
# lkchain - given a lock, find the process that holds it, then find what
# lock that process is in turn waiting on. Keep iterating.
#
#	usage:  lkchain <struct lock *>      e.g. lkchain some_vnode->v_vnlock
#
# Each step prints the lock, its holder (pid/lwp), and what that lwp is
# sleeping on. The walk stops when the holder cannot be found, is not
# sleeping, or is sleeping with a wait message different from the first
# lock's lk_wmesg (i.e. it is waiting on something other than this kind
# of lock).
#
define lkchain

	set $lk=(struct lock *)$arg0
	# Remember the first lock's wait message; it is used below to decide
	# whether the holder is itself asleep on another lock of the same kind.
	set $wc=$lk->lk_wmesg
	while ($lk->lk_wmesg == $wc)
		# First make sure lk is really locked exclusively.
		# 0x400 is intended to be LK_HAVE_EXCL; 0x02 is the LK_EXCLUSIVE
		# request type, which is not a state bit in lk_flags.
		# NOTE(review): confirm the value against sys/lock.h for the
		# kernel being debugged.
		if (($lk->lk_flags & 0x400) == 0)
			printf "Lock %p not exclusively locked\n", $lk
			loop_break
		end
		set $pid = $lk->lk_un.lk_un_sleep.lk_sleep_lockholder
		set $lid = $lk->lk_un.lk_un_sleep.lk_sleep_locklwp

		# pfind leaves its result in $p (0 when not found).
		pfind $pid
		if (!($p))
			printf "Error finding process %d\n", $pid
			loop_break
		end

		# lfind leaves its result in $l (0 when not found).
		lfind $p $lid
		if (!($l))
			printf "Error finding lwp %d in proc %d at %p\n", $lid, $pid, $p
			loop_break
		end

		# Tentatively treat the holder's wait channel as the next lock;
		# it is only followed if the wait message matches below.
		set $lknew = (struct lock *)$l->l_wchan

		printf "Lock %p held by proc %d (%p) lwp %d (%p)\n", $lk, $pid, $p, $lid, $l
		if ($l->l_wchan == 0)
			printf "\twhich doesn't seem to be sleeping. Exiting.\n"
			loop_break
		end

		# gdb commands are one per line, so the printf must not be split.
		printf "\twhich in turn is waiting on %p message %s.\n", $l->l_wchan, $l->l_wmesg

		# A different wait message means the holder is not asleep on a
		# lock like the one we started from; that is the end of the chain.
		if ($l->l_wmesg != $wc)
			loop_break
		end

		set $lk = $lknew
	end
end

--MGYHOYXEY6WxJCY8--

--hHWLQfXTYDoKhP50
Content-Type: application/pgp-signature
Content-Disposition: inline

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.3 (NetBSD)

iD8DBQFCNxBMWz+3JHUci9cRAgRWAJoDthbwEk5VQo2O8w4DllrME/5FVQCeNMkx
M/ul/H495T21ltGX9UV1rPQ=
=i6U0
-----END PGP SIGNATURE-----

--hHWLQfXTYDoKhP50--