NetBSD-Bugs archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

Re: kern/59339: heartbeat watchdog fires since 10.99.14



The following reply was made to PR kern/59339; it has been noted by GNATS.

From: Taylor R Campbell <riastradh%NetBSD.org@localhost>
To: Thomas Klausner <wiz%NetBSD.org@localhost>, Patrick Welche <prlw1%welche.eu@localhost>
Cc: gnats-bugs%NetBSD.org@localhost, netbsd-bugs%NetBSD.org@localhost
Subject: Re: kern/59339: heartbeat watchdog fires since 10.99.14
Date: Sat, 10 May 2025 01:03:40 +0000

 This is a multi-part message in MIME format.
 --=_8ZQlRQ4u9eg1J0DCFMQoE3LK0lMgXmEF
 
 Better yet, try the attached patch.  It will print a message to the
 console if the circumstances that I hypothesize _could_ cause the hang
 occur, but avoids those circumstances at the same time.
 
 In particular, in current, itimer_callout will:
 
 1. sample the clock giving now,
 2. decide at what timespec t to next schedule the itimer for based on
    the clock sample in (1), 
 3. sample the clock again giving now1,
 4. decide how many ticks to sleep from now1 to t.
 
 If the duration from (1) to (3) is long enough, even though the
 decision in (2) is designed to guarantee now < t (except in case of
 arithmetic overflow, not likely relevant unless someone is attacking
 your ntp client on the network), it may turn out that t < now1, which
 may lead the itimer to fire immediately again, possibly leading into
 some kind of infinite loop that triggers the heartbeat panic.
 
 However, that's the point at which my hypothesis breaks down: even if
 the itimer callout reschedules itself with callout_schedule(ch, 0),
 that shouldn't prevent the heartbeat callout from running, because (if
 I'm reading kern_timeout.c correctly) callouts at the same time run in
 FIFO order.  So I'm kinda grasping at straws here.
 
 The patch skips step (3) -- except to print as a diagnostic -- and
 uses the sample now from (1) in the decision at (4) of how many ticks
 to sleep until t.
 
 --=_8ZQlRQ4u9eg1J0DCFMQoE3LK0lMgXmEF
 Content-Type: text/plain; charset="ISO-8859-1"; name="pr59339-itimercalloutassert-v3"
 Content-Transfer-Encoding: quoted-printable
 Content-Disposition: attachment; filename="pr59339-itimercalloutassert-v3.patch"
 
 diff -r 55a6ded9e19e -r 2808a674f18f sys/kern/kern_time.c
 --- a/sys/kern/kern_time.c	Thu May 08 20:51:40 2025 +0000
 +++ b/sys/kern/kern_time.c	Sat May 10 00:37:46 2025 +0000
 @@ -837,6 +837,82 @@ itimer_arm_real(struct itimer * const it
  }
 =20
  /*
 + * itimer_rearm_real:
 + *
 + *	Re-arm a non-virtual timer, given the current clock readout.
 + */
 +static void
 +itimer_rearm_real(struct itimer * const it, const struct timespec * const =
 now)
 +{
 +	const struct timespec * const interval =3D &it->it_time.it_interval;
 +	const struct timespec * const next =3D &it->it_time.it_value;
 +	struct timespec delta;
 +	int ticks;
 +
 +	KASSERT(!it->it_dying);
 +	KASSERT(!CLOCK_VIRTUAL_P(it->it_clockid));
 +	KASSERT(!callout_pending(&it->it_ch));
 +
 +	if (__predict_true(timespecisset(next))) {
 +		struct timespec now1;
 +
 +		if (it->it_clockid =3D=3D CLOCK_MONOTONIC) {
 +			getnanouptime(&now1);
 +		} else {
 +			getnanotime(&now1);
 +		}
 +		if (timespeccmp(next, &now1, <=3D)) {
 +			printf("PR kern/59339 (heartbeat watchdog fires"
 +			    " since 10.99.14) would fire:"
 +			    " [clock %u]"
 +			    " [now=3D%lld.%09ld -> %lld.%09ld]"
 +			    " interval=3D%lld.%09ld next=3D%lld.%09ld\n",
 +			    it->it_clockid,
 +			    (long long)now->tv_sec, (long)now->tv_nsec,
 +			    (long long)now1.tv_sec, (long)now1.tv_nsec,
 +			    (long long)interval->tv_sec,
 +			    (long)interval->tv_nsec,
 +			    (long long)next->tv_sec, (long)next->tv_nsec);
 +		}
 +
 +		KASSERTMSG(timespeccmp(now, next, <),
 +		    "[clock %u]"
 +		    " now=3D%lld.%09ld interval=3D%lld.%09ld next=3D%lld.%09ld",
 +		    it->it_clockid,
 +		    (long long)now->tv_sec, (long)now->tv_nsec,
 +		    (long long)interval->tv_sec, (long)interval->tv_nsec,
 +		    (long long)next->tv_sec, (long)next->tv_nsec);
 +		timespecsub(next, now, &delta);
 +	} else {
 +		/*
 +		 * If the arithmetic overflowed, just take the interval
 +		 * as the time to wait.  This is unlikely to happen
 +		 * unless you are messing with your clock to set it
 +		 * ahead by hundreds of years.
 +		 */
 +		printf("[clock %u]"
 +		    " itimer arithmetic overflowed:"
 +		    " now=3D%lld.%09ld interval=3D%lld.%09ld\n",
 +		    it->it_clockid,
 +		    (long long)now->tv_sec, (long)now->tv_nsec,
 +		    (long long)interval->tv_sec, (long)interval->tv_nsec);
 +		delta =3D it->it_time.it_interval; /* overflow */
 +	}
 +	ticks =3D tstohz(&delta);
 +	KASSERTMSG(ticks > 0,
 +	    "[clock %u]"
 +	    " now=3D%lld.%09ld interval=3D%lld.%09ld next=3D%lld.%09ld"
 +	    " delta=3D%lld.%09ld ticks=3D%d",
 +	    it->it_clockid,
 +	    (long long)now->tv_sec, (long)now->tv_nsec,
 +	    (long long)interval->tv_sec, (long)interval->tv_nsec,
 +	    (long long)next->tv_sec, (long)next->tv_nsec,
 +	    (long long)delta.tv_sec, (long)delta.tv_nsec,
 +	    ticks);
 +	callout_schedule(&it->it_ch, ticks);
 +}
 +
 +/*
   * itimer_callout:
   *
   *	Callout to expire a non-virtual timer.  Queue it up for processing,
 @@ -872,6 +948,18 @@ itimer_callout(void *arg)
  	 * now, compute the next itimer value and count overruns.
  	 */
  	itimer_transition(&it->it_time, &now, &next, &overruns);
 +	KASSERTMSG(timespeccmp(&now, &next, <),
 +	    "[clock %u]"
 +	    " it->it_time.it_value=3D%lld.%09ld"
 +	    " it->it_time.it_interval=3D%lld.%09ld"
 +	    " now=3D%lld.%09ld next=3D%lld.%09ld",
 +	    it->it_clockid,
 +	    (long long)it->it_time.it_value.tv_sec,
 +	    (long)it->it_time.it_value.tv_nsec,
 +	    (long long)it->it_time.it_interval.tv_sec,
 +	    (long)it->it_time.it_interval.tv_nsec,
 +	    (long long)now.tv_sec, (long)now.tv_nsec,
 +	    (long long)next.tv_sec, (long)next.tv_nsec);
  	it->it_time.it_value =3D next;
  	it->it_overruns +=3D overruns;
 =20
 @@ -879,7 +967,7 @@ itimer_callout(void *arg)
  	 * Reset the callout, if it's not going away.
  	 */
  	if (!it->it_dying)
 -		itimer_arm_real(it);
 +		itimer_rearm_real(it, &now);
  	itimer_unlock();
  }
 =20
 
 --=_8ZQlRQ4u9eg1J0DCFMQoE3LK0lMgXmEF--
 


Home | Main Index | Thread Index | Old Index