Re: a real clue about the ~7.5 days! (was: timekeeping regression?)

To: NetBSD/xen Discussion List <port-xen%NetBSD.org@localhost>
Subject: Re: a real clue about the ~7.5 days! (was: timekeeping regression?)
From: "Greg A. Woods" <woods%planix.ca@localhost>
Date: Tue, 30 Jul 2024 13:38:49 -0700
/*	$NetBSD: xen_clock.c,v 1.18 2023/09/10 15:23:01 bouyer Exp $	*/

/*-
 * Copyright (c) 2017, 2018 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Taylor R. Campbell.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "opt_xen.h"

#ifndef XEN_CLOCK_DEBUG
#define	XEN_CLOCK_DEBUG	0
#endif

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: xen_clock.c,v 1.18 2023/09/10 15:23:01 bouyer Exp $");

#include <sys/param.h>
#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/callout.h>
#include <sys/cpu.h>
#include <sys/device.h>
#include <sys/evcnt.h>
#include <sys/intr.h>
#include <sys/kernel.h>
#include <sys/lwp.h>
#include <sys/proc.h>
#include <sys/sdt.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/timetc.h>

#include <dev/clock_subr.h>

#include <machine/cpu.h>
#include <machine/cpu_counter.h>
#include <machine/lock.h>

#include <xen/evtchn.h>
#include <xen/hypervisor.h>
#include <xen/include/public/vcpu.h>
#include <xen/xen.h>

#include <x86/rtc.h>

/*
 * Length of one tick in nanoseconds: 10^9 divided by hz (evaluated at
 * each use, since hz is a variable rather than a constant).
 */
#define NS_PER_TICK ((uint64_t)1000000000ULL/hz)

/* Forward declarations of file-local functions defined further below. */
static uint64_t	xen_vcputime_sched_systime_ns(void);
static uint64_t	xen_global_systime_ns(void);
static unsigned	xen_get_timecount(struct timecounter *);
static int	xen_timer_handler(void *, struct clockframe *);

/*
 * dtrace probes
 *
 *	Static probe points fired from xen_global_systime_ns() (the tsc
 *	and global_ns probes) and from xen_timer_handler() (the
 *	hardclock probes).  The comment following each argument type
 *	names the value passed in that position.
 */
SDT_PROBE_DEFINE2(sdt, xen, tsc, backwards,
    "uint64_t"/*tsc*/,
    "uint64_t"/*tsc_at_start*/);
SDT_PROBE_DEFINE3(sdt, xen, global_ns, ns_per_tick_diff,
    "uint64_t"/*global_ns*/,
    "uint64_t"/*local_ns*/,
    "uint64_t"/*ns_per_tick*/);
SDT_PROBE_DEFINE2(sdt, xen, global_ns, backwards,
    "uint64_t"/*local_ns*/,
    "uint64_t"/*global_ns*/);
SDT_PROBE_DEFINE2(sdt, xen, hardclock, systime__backwards,
    "uint64_t"/*last_systime_ns*/,
    "uint64_t"/*this_systime_ns*/);
SDT_PROBE_DEFINE2(sdt, xen, hardclock, tick,
    "uint64_t"/*last_systime_ns*/,
    "uint64_t"/*this_systime_ns*/);
SDT_PROBE_DEFINE3(sdt, xen, hardclock, jump,
    "uint64_t"/*last_systime_ns*/,
    "uint64_t"/*this_systime_ns*/,
    "uint64_t"/*nticks*/);
SDT_PROBE_DEFINE3(sdt, xen, hardclock, missed,
    "uint64_t"/*last_systime_ns*/,
    "uint64_t"/*this_systime_ns*/,
    "uint64_t"/*remaining_ns*/);

/*
 * xen timecounter:
 *
 *	Xen vCPU system time, plus an adjustment with rdtsc.
 *
 *	The counter is read through xen_get_timecount(), which returns
 *	the low 32 bits of the nanosecond system time; hence the all-ones
 *	unsigned mask and the 1 GHz nominal frequency.  Registered with
 *	tc_init() from xen_initclocks() on the primary CPU.
 */
static struct timecounter xen_timecounter = {
	.tc_get_timecount = xen_get_timecount,
	.tc_poll_pps = NULL,
	.tc_counter_mask = ~0U,
	.tc_frequency = 1000000000ULL,	/* 1 GHz, i.e. units of nanoseconds */
	.tc_name = "xen_system_time",	/* XXX "xen_TSC" */
	.tc_quality = 10000,
};

/*
 * xen_global_systime_ns_stamp
 *
 *	The latest Xen vCPU system time that has been observed on any
 *	CPU, for a global monotonic view of the Xen system time clock.
 *	Only advanced by the compare-and-swap in xen_global_systime_ns().
 *
 * xen_initial_systime_ns
 *
 *	Copy of xen_global_systime_ns_stamp taken each time
 *	xen_resumeclocks() runs.  NOTE(review): no reader is visible in
 *	this part of the file; presumably intended for xen_rtc_set() --
 *	confirm.
 */
static volatile uint64_t xen_global_systime_ns_stamp __cacheline_aligned;
static uint64_t xen_initial_systime_ns;

#ifdef DOM0OPS
/*
 * xen timepush state:
 *
 *	Callout to periodically, after a sysctl-configurable number of
 *	NetBSD ticks, set the Xen hypervisor's wall clock time.
 *
 * Linux does this once every 11 minutes, as well as any time settimeofday() is
 * called (and maybe every time the clock is jumped by NTP etc.)
 *
 * Since here in NetBSD we do it via the todr_chip_handle.todr_settime call,
 * i.e. resettodr(9), it will also occur any time either settimeofday() or
 * clock_settime(CLOCK_REALTIME, ...) are called.
 *
 * Pick a default frequency for timepush, but avoid an exact # of min/sec.
 *
 * NOTE(review): the current default of 10*hz ticks is an exact number of
 * seconds, unlike the original (53*hz + 3); confirm this is intentional.
 */
#define XEN_TIMEPUSH_TICKS	(10*hz) /* xxx orig nbsd: (53*hz + 3) */
static struct {
	struct callout	ch;	/* callout that runs xen_timepush_intr() */
	int		ticks;	/* period in ticks; 0 disables rescheduling */
} xen_timepush;

static void	xen_timepush_init(void);
static void	xen_timepush_intr(void *);
static int	sysctl_xen_timepush(SYSCTLFN_ARGS);
#endif


/*
 *	Xen "system time" and "wall clock time"
 *
 * Xen provides guests with two timestamp values, the system-time (time since
 * guest boot or resume) and the wall-clock time (time since the epoch at the
 * point when system-time was zero, i.e. Xen wall clock time is actually
 * boot-time (or resume time) for the guest, and for dom0 this is also very
 * close to the boot-time of the hypervisor).
 *
 * These are provided through a shared memory structure (shared_info_page, and
 * in the array of vcpu_time_info within).
 *
 * The system-time in the vCPU's vcpu_time_info is updated by Xen every time the
 * guest is being scheduled, along with a snapshot of the CPU's TSC register
 * value (and some related values for scaling the TSC to nanoseconds).  While
 * running the guest can get the current system-time by extrapolating from the
 * values in vcpu_time_info using the value of the TSC register (an x86 register
 * counting CPU clock cycles, often emulated in domUs).
 *
 * TSC values in Xen are obtained through the RDTSC instruction and are either
 * native, i.e. accessed directly from the CPU register (in dom0, and possibly
 * in some situations in domUs); or emulated, i.e. intercepted through a trap by
 * Xen (in domUs, e.g. on hardware without the TSC_INVARIANT CPU feature).  In
 * emulated mode the CPU clock is at a fictitious frequency of 1 GHz.  Either
 * way, multiplier and shift values are provided to adjust the TSC value to
 * nanoseconds so the frequency need not be measured (it was measured by Xen
 * when it first booted).
 *
 * XXX for SMP domains with multiple vCPUs it looks like the tsc_timestamp is
 * separately updated for each vCPU as the domain is scheduled to run so I think
 * we might want to be careful to read the TSC from the CPU associated with the
 * vcpu_time_info we're calculating "local" system-time from.
 */

/*
 * xen_rdtsc()
 *
 *	Read the local pCPU's tsc and return it as a single 64-bit
 *	value.
 *
 * RDTSC is possibly emulated in a domU, but is always "raw" in a dom0.
 */
static inline uint64_t
xen_rdtsc(void)
{
	uint32_t lo, hi;

	/* The instruction yields the low half in %eax and the high in %edx. */
	asm volatile("rdtsc" : "=a"(lo), "=d"(hi));

	return ((uint64_t)hi << 32) | lo;
}

/*
 * struct xen_vcputime_ticket
 *
 *	State for a vCPU read section, during which a caller may read
 *	from fields of a struct vcpu_time_info and call xen_rdtsc.
 *	Caller must enter with xen_vcputime_enter, exit with
 *	xen_vcputime_exit, and be prepared to retry if
 *	xen_vcputime_exit fails.
 */
struct xen_vcputime_ticket {
	/*
	 * Value of the vcpu_time_info version field sampled by
	 * xen_vcputime_enter (always even once enter returns); compared
	 * against a fresh read in xen_vcputime_exit.
	 */
	uint64_t	version;
};

/*
 * xen_vcputime_enter(tp)
 *
 *	Begin a read section on the current CPU's vcpu_time_info.
 *	Spins while the structure's version is odd, records the even
 *	version observed in *tp for later validation by
 *	xen_vcputime_exit, and returns a pointer to the structure.
 *	Caller must already be bound to the CPU.
 */
static inline volatile struct vcpu_time_info *
xen_vcputime_enter(struct xen_vcputime_ticket *tp)
{
	volatile struct vcpu_time_info *vt = &curcpu()->ci_vcpu->time;

	for (;;) {
		/* Sample the version; an odd value means "try again". */
		tp->version = vt->version;
		if (!__predict_false(tp->version & 1))
			break;
		SPINLOCK_BACKOFF_HOOK;
	}

	/*
	 * The version must be read before anything else in the read
	 * section (including the tsc on the local pCPU).  The only
	 * race is with interruption by the hypervisor, so a compiler
	 * barrier is enough.
	 */
	__insn_barrier();

	return vt;
}

/*
 * xen_vcputime_exit(vt, tp)
 *
 *	Finish a vCPU time read section started by xen_vcputime_enter.
 *	Returns true if the version in vt still matches the one stored
 *	in the ticket *tp (the values read are usable), false if the
 *	caller must retry the whole section.
 */
static inline bool
xen_vcputime_exit(volatile struct vcpu_time_info *vt,
    struct xen_vcputime_ticket *tp)
{
	bool unchanged;

	/*
	 * The assertion below has been seen to fire (in a possibly
	 * older kernel) during wonky times, with this backtrace:
	 *
	 * xen_global_systime_ns() at netbsd:xen_global_systime_ns+0x146
	 * xen_get_timecount() at netbsd:xen_get_timecount+0x12
	 * binuptime() at netbsd:binuptime+0x60
	 */
	KASSERT(vt == &curcpu()->ci_vcpu->time);

	/*
	 * Everything in the read section (including the tsc read on
	 * the local pCPU) must happen before the version is re-read.
	 * The only race is with interruption by the hypervisor, so a
	 * compiler barrier is enough.
	 */
	__insn_barrier();

	unchanged = (vt->version == tp->version);
	return unchanged;
}

/*
 * xen_tsc_to_ns_delta(delta_tsc, tsc_to_system_mul, tsc_shift)
 *
 *	Scale a tsc-unit interval into nanoseconds: shift delta_tsc
 *	left by tsc_shift (right when tsc_shift is negative), then
 *	multiply by the 32.32 fixed-point factor tsc_to_system_mul/2^32.
 *
 * from xen.h:
 *
 * Current system time:
 *   system_time +
 *   ((((tsc - tsc_timestamp) << tsc_shift) * tsc_to_system_mul) >> 32)
 * CPU frequency (Hz):
 *   ((10^9 << 32) / tsc_to_system_mul) >> tsc_shift
 */
static inline uint64_t
xen_tsc_to_ns_delta(uint64_t delta_tsc, uint32_t tsc_to_system_mul,
    int8_t tsc_shift)
{
	uint64_t scaled, upper_ns, lower_ns;

	if (delta_tsc == 0)
		return 0;

	scaled = (tsc_shift >= 0) ?
	    (delta_tsc << tsc_shift) : (delta_tsc >> -tsc_shift);

	/*
	 * Split the 64x32-bit product so it cannot overflow before the
	 * division by 2^32:
	 *
	 *	d*m/2^32 = (2^32 d_h + d_l)*m/2^32 = d_h*m + (d_l*m)/2^32
	 */
	upper_ns = (scaled >> 32) * tsc_to_system_mul;
	lower_ns = ((scaled & 0xffffffffULL) * tsc_to_system_mul) >> 32;

	return upper_ns + lower_ns;
}

/*
 * xen_vcputime_sched_systime_ns()
 *
 *	Return a snapshot of the current Xen system time to the
 *	resolution of the Xen hypervisor tick, in units of nanoseconds.
 *
 *	I.e. the Xen system time at the time this domain was last scheduled.
 *
 * N.B. it is assumed this is only called when premption is impossible.
 */
static uint64_t
xen_vcputime_sched_systime_ns(void)
{
	volatile struct vcpu_time_info *vt;
	struct xen_vcputime_ticket ticket;
	uint64_t sched_systime_ns;

	do {
		vt = xen_vcputime_enter(&ticket);
		sched_systime_ns = vt->system_time;
	} while (!xen_vcputime_exit(vt, &ticket));

	return sched_systime_ns;
}

/*
 * struct xen_wallclock_ticket
 *
 *	State for a wall clock read section, during which a caller may
 *	read from the wall clock fields of HYPERVISOR_shared_info.
 *	Caller must enter with xen_wallclock_enter, exit with
 *	xen_wallclock_exit, and be prepared to retry if
 *	xen_wallclock_exit fails.
 */
struct xen_wallclock_ticket {
	/*
	 * Value of HYPERVISOR_shared_info->wc_version sampled by
	 * xen_wallclock_enter (always even once enter returns).
	 */
	uint32_t version;
};

/*
 * xen_wallclock_enter(tp)
 *
 *	Begin a wall clock read section.  Spins while the shared
 *	wc_version is odd, then records the even version observed in
 *	*tp so that xen_wallclock_exit can validate the section.
 */
static inline void
xen_wallclock_enter(struct xen_wallclock_ticket *tp)
{

	for (;;) {
		/* Sample the version; an odd value means "try again". */
		tp->version = HYPERVISOR_shared_info->wc_version;
		if (!__predict_false(tp->version & 1))
			break;
		SPINLOCK_BACKOFF_HOOK;
	}

	/*
	 * The version must be loaded from memory before the timestamp
	 * is, since both may be written by another pCPU.
	 */
	membar_consumer();
}

/*
 * xen_wallclock_exit(tp)
 *
 *	Finish a wall clock read section started by
 *	xen_wallclock_enter.  Returns true if wc_version still matches
 *	the ticket *tp, false if the caller must retry.
 */
static inline bool
xen_wallclock_exit(struct xen_wallclock_ticket *tp)
{
	bool unchanged;

	/*
	 * The timestamp must be loaded from memory before the version
	 * is re-read, since both may be written by another pCPU.
	 */
	membar_consumer();

	unchanged = (HYPERVISOR_shared_info->wc_version == tp->version);
	return unchanged;
}

/*
 * xen_global_systime_ns()
 *
 *	Return a monotonic view of the system time (current domain's time since
 *	boot) in nanoseconds.
 *
 *	First compute the current vCPU's "system time", which is the vCPU
 *	"system time" (Xen's view of this domain's "system time" at the time
 *	this domain was scheduled), plus an adjustment based on the TSC offset
 *	since the time this domain was scheduled (scaled to nanoseconds using
 *	Xen's supplied scaling factors).
 *
 *	If this vCPU's current "system time" is greater than the last recorded
 *	"global system time" then store this as the new global system time and
 *	return it, else return the current "global system time" (thus keeping
 *	the global system time monotonically advancing).
 *
 *	Side effects: may advance xen_global_systime_ns_stamp, fires the
 *	tsc/global_ns SDT probes, and bumps the matching per-CPU event
 *	counters when an anomaly is seen.
 *
 * XXX FreeBSD's implementation does not worry about being preempted, nor does it
 * try to keep track of skew between vCPUs.
 *
 * see it in sys/dev/xen/timer/xen_timer.c:xentimer_get_timecount()
 * also see  sys/x86/x86/pvclock.c:pvclock_get_timecount()
 */
static uint64_t
xen_global_systime_ns(void)
{
	uint64_t local_ns, global_ns, result_ns;

	/*
	 * Outer loop: recompute local_ns and retry whenever the
	 * compare-and-swap at the bottom loses a race with another
	 * updater of xen_global_systime_ns_stamp.
	 *
	 * XXX Can we avoid retrying if the CAS fails?
	 *
	 * XXX Has enough time passed in this "loop" that we really need to
	 * fetch a new TSC value and calculate a new local_ns before trying to
	 * store it again?  I guess we could have been preempted....
	 */
	do {
		/* XXX this next hunk of code is partly copied in xen_delay() */
		volatile struct vcpu_time_info *vt;
		struct xen_vcputime_ticket ticket;
		uint64_t start_systime_ns, tsc_at_start, tsc, delta_tsc, delta_ns;
		uint32_t tsc_to_system_mul;
		int8_t tsc_shift;
		int s;
		struct cpu_info *ci;

		s = splsched(); /* make sure we won't be interrupted XXX splhigh()??? */
		ci = curcpu();
		/*
		 * Inner loop: retry the read section until the
		 * vcpu_time_info version is unchanged across it.
		 */
		do {
			vt = xen_vcputime_enter(&ticket);

			/*
			 * Grab Xen's snapshot of system time and the TSC value
			 * at the time this domain was last scheduled to run.
			 */
			start_systime_ns = vt->system_time;
			tsc_at_start = vt->tsc_timestamp;

			/* Get Xen's current idea of how fast the TSC is counting.  */
			/* xxx these should be static-enough and could be fetched outside this loop? */
			tsc_to_system_mul = vt->tsc_to_system_mul;
			tsc_shift = vt->tsc_shift;

			/*
			 * Read the CPU's current TSC (or the emulated one).
			 *
			 * xxx we need to do this on the same vCPU as is
			 * represented by vt, thus the splsched()/splx()
			 */
			tsc = xen_rdtsc();

		} while (!xen_vcputime_exit(vt, &ticket));
		splx(s);

		/*
		 * NOTE(review): ci was sampled under splsched() but is
		 * dereferenced below after splx(); confirm the event
		 * counter updates are acceptable if the lwp has moved.
		 */
		if (__predict_false(tsc < tsc_at_start)) {
			SDT_PROBE2(sdt, xen, tsc, backwards,
				   tsc, tsc_at_start);
#if XEN_CLOCK_DEBUG
			device_printf(ci->ci_dev,
				      "xen tsc ran backwards:" /* xxx or wrapped */
				      " tsc=%"PRIu64" tsc_at_start=%"PRIu64"\n",
				      tsc, tsc_at_start);
#endif
			ci->ci_xen_tsc_backwards_evcnt.ev_count++;
			/* Clamp so the delta below becomes zero, not huge. */
			tsc = tsc_at_start;
		}

		/* Find how far the CPU's TSC has advanced since we were scheduled.  */
		delta_tsc = tsc - tsc_at_start;

		/* Convert the TSC delta to a nanosecond delta.  */
		delta_ns = xen_tsc_to_ns_delta(delta_tsc, tsc_to_system_mul,
		    tsc_shift);

		/* Compute the TSC-adjusted system time for this vCPU. */
		local_ns = start_systime_ns + delta_ns;

		global_ns = atomic_load_acquire(&xen_global_systime_ns_stamp);
		/*
		 * The global_ns is sometimes (often, in (SMP-only?) dom0, even
		 * with vCPUs pinned) greater than the value computed from the
		 * current vCPU (local_ns) so we don't want to risk it going
		 * backwards (global_ns is (likely) the same value that was last
		 * returned to the timecounter by xen_get_timecount(), and may
		 * well be the next value returned too)
		 *
		 * This is more or less what FreeBSD does as well.
		 */
		if (__predict_false(local_ns <= global_ns)) {
			const uint64_t ns_per_tick = NS_PER_TICK;

			/* Count (and probe) lags of more than one tick. */
			if (__predict_false((global_ns - local_ns) > ns_per_tick)) {
				SDT_PROBE3(sdt, xen, global_ns, ns_per_tick_diff,
					   global_ns, local_ns, ns_per_tick);
#if 0 /* XEN_CLOCK_DEBUG XXX way too noisy, causes hangs! (only in dom0) */
				device_printf(ci->ci_dev,
					      "xen local_ns one tick or more behind global_ns:"
					      " global_ns=%"PRIu64" local_ns=%"PRIu64", diff=%"PRIu64" ticks\n",
					      global_ns, local_ns, global_ns - local_ns, (global_ns - local_ns) / ns_per_tick);
#endif
				ci->ci_xen_global_ns_per_tick_diff_evcnt.ev_count++;
			}
			/* don't complain if they are the same, just get out... */
			if (local_ns != global_ns) {
				SDT_PROBE2(sdt, xen, global_ns, backwards,
					   local_ns, global_ns);
#if 0 /* XEN_CLOCK_DEBUG XXX way too noisy, causes hangs! (esp. in dom0) */
				device_printf(ci->ci_dev,
					      "xen global_ns prevented from running backwards:"
					      " local_ns=%"PRIu64" global_ns=%"PRIu64"\n",
					      local_ns, global_ns);
#endif
				ci->ci_xen_global_ns_backwards_evcnt.ev_count++;
			}
			result_ns = global_ns;
			/*
			 * avoid saving a lower, or same, global_ns again
			 *
			 * NOTE(review): no store follows the barrier on
			 * this path (we break out without the CAS);
			 * confirm the barrier is actually needed here.
			 */
#if __NetBSD_Prereq__(9, 99, 97)	/* xxx only for my wonky -current */
			membar_release();
#else
			membar_exit();
#endif
			break;
		} else {
			result_ns = local_ns;
		}
		/* Release barrier ahead of publishing result_ns via the CAS. */
#if __NetBSD_Prereq__(9, 99, 97)	/* xxx only for my wonky -current */
		membar_release();
#else
		membar_exit();
#endif
	} while (atomic_cas_64(&xen_global_systime_ns_stamp, global_ns, result_ns)
	    != global_ns);

	return result_ns;
}

/*
 * xen_get_timecount(tc)
 *
 *	Return the low 32 bits of a global monotonic view of the Xen
 *	system time.  tc must be &xen_timecounter (asserted).
 *
 * This is the timecounter(9) hook for the "xen_system_time" timecounter.
 */
static unsigned
xen_get_timecount(struct timecounter *tc)
{

	KASSERT(tc == &xen_timecounter);

	/* The cast truncates the 64-bit nanosecond count to its low bits. */
	return (unsigned)xen_global_systime_ns();
}

/*
 * xen_delay(n)
 *
 *	Wait approximately n microseconds.
 *
 *	Returns immediately if the current CPU has no ci_vcpu yet.
 *	Delays below the threshold tested in the body busy-wait on the
 *	tsc; longer delays yield to the hypervisor and poll the Xen
 *	system time.
 */
void
xen_delay(unsigned us)
{
	int bound;

	/* Bind to the CPU so we don't compare tsc on different CPUs.  */
	bound = curlwp_bind();

	if (curcpu()->ci_vcpu == NULL) {
		curlwp_bindx(bound);
		return;
	}

	/*
	 * Short wait (<500000us) or long wait?
	 *
	 * NOTE(review): the comments in this function previously quoted
	 * 50000us and 500us for this threshold while the code tests
	 * 500000; confirm which value is intended.
	 */
	if (us < 500000) {
		/* XXX this next hunk of code is partly copied in xen_global_systime_ns() */
		/*
		 * Xen system time is not precise enough for short
		 * delays, so use the tsc instead.
		 *
		 * We work with the current tsc frequency, and figure
		 * that if it changes while we're delaying, we've
		 * probably delayed long enough -- up to the short-wait
		 * threshold above.
		 *
		 * We do not use cpu_frequency(ci), which uses a
		 * quantity detected at boot time, and which may have
		 * changed by now if Xen has migrated this vCPU to
		 * another pCPU.
		 *
		 * XXX How long does it take to migrate pCPUs?
		 */
		volatile struct vcpu_time_info *vt;
		struct xen_vcputime_ticket ticket;
		uint64_t tsc_at_start, last_tsc, tsc;
		uint32_t tsc_to_system_mul;
		int8_t tsc_shift;

		/* Get the starting tsc and tsc frequency.  */
		do {
			vt = xen_vcputime_enter(&ticket);
			tsc_at_start = last_tsc = xen_rdtsc();
			tsc_to_system_mul = vt->tsc_to_system_mul;
			tsc_shift = vt->tsc_shift;
		} while (!xen_vcputime_exit(vt, &ticket));

		/*
		 * Wait until as many tsc ticks as there are in n
		 * microseconds have elapsed, or the tsc has gone
		 * backwards meaning we've probably migrated pCPUs.
		 */
		for (;;) {
			tsc = xen_rdtsc();
			if (__predict_false(tsc < last_tsc))
				break;
			/* Elapsed tsc -> ns -> us, compared with the request. */
			if (xen_tsc_to_ns_delta(tsc - tsc_at_start,
				tsc_to_system_mul, tsc_shift)/1000 >= us) {
				break;
			}
			last_tsc = tsc;
		}
	} else {
		/*
		 * Use the Xen system time for delays at or above the
		 * threshold.  From my testing, it seems to sometimes
		 * run backward by about 110us, which is not so bad.
		 */
		uint64_t us_ns = 1000 * (uint64_t)us;
		uint64_t start_ns;

		/* Get the start time.  */
		start_ns = xen_vcputime_sched_systime_ns();

		/* Wait until the system time has passed the end.  */
		do {
			HYPERVISOR_yield();
		} while (xen_vcputime_sched_systime_ns() - start_ns < us_ns);
	}

	/* Unbind from the CPU if we weren't already bound.  */
	curlwp_bindx(bound);
}

/*
 * xen_suspendclocks(ci)
 *
 *	Stop handling the Xen timer event on the CPU of ci.  Caller
 *	must be running on and bound to ci's CPU.
 *
 *	Actually, caller must have kpreemption disabled, because that's
 *	easier to assert at the moment.
 *
 *	The steps are: unbind VIRQ_TIMER from its event channel, mask
 *	that channel, then remove xen_timer_handler from it.
 */
void
xen_suspendclocks(struct cpu_info *ci)
{
	int evtch;

	KASSERT(ci == curcpu());
	KASSERT(kpreempt_disabled());

	/*
	 * Find the VIRQ_TIMER event channel and close it so new timer
	 * interrupt events stop getting delivered to it.
	 *
	 * XXX Should this happen later?  This is not the reverse order
	 * of xen_resumeclocks.  It is apparently necessary in this
	 * order only because we don't stash evtchn anywhere, but we
	 * could stash it.
	 */
	evtch = unbind_virq_from_evtch(VIRQ_TIMER);
	KASSERT(evtch != -1);

	/*
	 * Mask the event channel so we stop getting new interrupts on
	 * it.
	 */
	hypervisor_mask_event(evtch);

	/*
	 * Now that we are no longer getting new interrupts, remove the
	 * handler and wait for any existing calls to the handler to
	 * complete.  After this point, there can be no concurrent
	 * calls to xen_timer_handler.
	 */
	event_remove_handler(evtch,
	    __FPTRCAST(int (*)(void *), xen_timer_handler), ci);

	aprint_verbose("Xen clock: removed event channel %d\n", evtch);

	/* We'd better not have switched CPUs.  */
	KASSERT(ci == curcpu());
}

/*
 * xen_resumeclocks(ci)
 *
 *	Start handling the Xen timer event on the CPU of ci.  Arm the
 *	Xen timer.  Caller must be running on and bound to ci's CPU.
 *
 *	Actually, caller must have kpreemption disabled, because that's
 *	easier to assert at the moment.
 *
 *	Panics if the timer interrupt handler cannot be established.
 */
void
xen_resumeclocks(struct cpu_info *ci)
{
	char intr_xname[INTRDEVNAMEBUF];
	int evtch;
	int error __diagused;

	KASSERT(ci == curcpu());
	KASSERT(kpreempt_disabled());

	/*
	 * Allocate an event channel to receive VIRQ_TIMER events.
	 */
	evtch = bind_virq_to_evtch(VIRQ_TIMER);
	KASSERT(evtch != -1);

	/*
	 * Set an event handler for VIRQ_TIMER events to call
	 * xen_timer_handler.
	 */
	snprintf(intr_xname, sizeof(intr_xname), "%s clock",
	    device_xname(ci->ci_dev));
	/* XXX sketchy function pointer cast -- fix the API, please */
	if (event_set_handler(evtch,
	    __FPTRCAST(int (*)(void *), xen_timer_handler),
	    ci, IPL_CLOCK, NULL, intr_xname, true, ci) == NULL)
		panic("failed to establish timer interrupt handler");

	aprint_verbose("Xen %s: using event channel %d\n", intr_xname, evtch);

	/*
	 * Disarm the periodic timer on Xen>=3.1 which is allegedly buggy.
	 *
	 * NOTE(review): the test is also true for any major < 3 with a
	 * non-zero minor; presumably such versions are never seen.
	 */
	if (XEN_MAJOR(xen_version) > 3 || XEN_MINOR(xen_version) > 0) {
		error = HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer,
		    ci->ci_vcpuid, NULL);
		KASSERT(error == 0);
	}

	/* Pretend the last hardclock happened when we were last scheduled.  */
	ci->ci_xen_hardclock_systime_ns = xen_vcputime_sched_systime_ns();

	/*
	 * xxx also save the initial systime_ns as global offset to add to
	 * nanouptime() in xen_rtc_set()
	 *
	 * NOTE(review): this is overwritten on every call (i.e. for each
	 * CPU and each resume), with whatever the global stamp holds at
	 * that moment; confirm that is the intended "initial" value.
	 */
	xen_initial_systime_ns = xen_global_systime_ns_stamp;

	/* Arm the one-shot timer.  */
	error = HYPERVISOR_set_timer_op(ci->ci_xen_hardclock_systime_ns +
	    NS_PER_TICK);
	KASSERT(error == 0);

	/*
	 * Ready to go.  Unmask the event.  After this point, Xen may
	 * start calling xen_timer_handler.
	 */
	hypervisor_unmask_event(evtch);

	/* We'd better not have switched CPUs.  */
	KASSERT(ci == curcpu());
}

/*
 * xen_timer_handler(cookie, frame)
 *
 *	Periodic Xen timer event handler for NetBSD hardclock.  Calls
 *	to this may get delayed(*), so we run hardclock as many times as
 *	we need to in order to cover the Xen system time that elapsed.
 *	After that, re-arm the timer to run again at the next tick.
 *	The cookie is the pointer to struct cpu_info.
 *
 *	Always returns 0.  Panics if the one-shot timer cannot be
 *	re-armed even for a time half a tick past "now".
 *
 * (*) n.b. comment in FreeBSD xen_timer.c says:
 *
 *	Xen timers may fire up to 100us off
 */
static int
xen_timer_handler(void *cookie, struct clockframe *frame)
{
	const uint64_t ns_per_tick = NS_PER_TICK;
	struct cpu_info *ci = curcpu();
	uint64_t last, now, delta, next;
	int error;

	KASSERT(cpu_intr_p());
	KASSERT(cookie == ci);

#if defined(XENPV)
	frame = NULL; /* We use values cached in curcpu()  */
#endif
	/*
	 * Find how many nanoseconds of Xen system time has elapsed
	 * since the last hardclock tick.
	 */
	last = ci->ci_xen_hardclock_systime_ns;
	now = xen_global_systime_ns();	/* xxx use global, avoid skew! */
	SDT_PROBE2(sdt, xen, hardclock, tick,  last, now);
	if (__predict_false(now < last)) {
		SDT_PROBE2(sdt, xen, hardclock, systime__backwards,
		    last, now);
#if XEN_CLOCK_DEBUG
		device_printf(ci->ci_dev, "xen systime ran backwards"
		    " in hardclock %"PRIu64"ns\n",
		    last - now);
#endif
		ci->ci_xen_systime_backwards_hardclock_evcnt.ev_count++;
		/*
		 * We've lost track of time.  Just pretend that one
		 * tick elapsed, and reset our idea of the last tick.
		 */
		ci->ci_xen_hardclock_systime_ns = last = now - ns_per_tick;
	}
	delta = now - last;

	if (__predict_false(delta >= 2*ns_per_tick)) {
		/*
		 * Warn if we violate timecounter(9) contract: with a
		 * k-bit timecounter (here k = 32), and timecounter
		 * frequency f (here f = 1 GHz), the maximum period
		 * between hardclock calls is 2^k / f.
		 */
		if (delta > xen_timecounter.tc_counter_mask) {
			SDT_PROBE3(sdt, xen, hardclock, jump,
				   last, now, delta/ns_per_tick);
			/*
			 * Argument order must follow the format: the
			 * skipped interval first, then the old and new
			 * system times, then the timecounter maximum.
			 */
			printf("WARNING: hardclock skipped %"PRIu64"ns"
			    " (%"PRIu64" -> %"PRIu64"),"
			    " exceeding maximum of %"PRIu32"ns"
			    " for timecounter(9)\n",
			    delta, last, now,
			    xen_timecounter.tc_counter_mask);
			ci->ci_xen_timecounter_jump_evcnt.ev_count++;
		}
		/* don't try to catch up more than one second at once */
		if (delta > 1000000000UL)
			delta = 1000000000UL;
	}
	/*
	 * Play hardclock catchup: run the hardclock timer as many
	 * times as appears necessary based on how much time has
	 * passed.
	 *
	 * XXX This happens extremely frequently -- ~50-80% of the HZ rate!
	 */
	while (delta >= ns_per_tick) {
		ci->ci_xen_hardclock_systime_ns += ns_per_tick;
		delta -= ns_per_tick;
		hardclock(frame);
		/* Still a whole tick or more behind: count it as missed. */
		if (__predict_false(delta >= ns_per_tick)) {
			SDT_PROBE3(sdt, xen, hardclock, missed,
			    last, now, delta);
			ci->ci_xen_missed_hardclock_evcnt.ev_count++;
		}
	}

	/*
	 * Re-arm the timer.  If it fails, it's probably because the desired
	 * time is in the past, possibly because we're in the process of
	 * catching up missed hardclock calls.  In this case schedule a tick in
	 * the near future.
	 */
	next = ci->ci_xen_hardclock_systime_ns + ns_per_tick;
	error = HYPERVISOR_set_timer_op(next);
	if (error) {
		/* xxx should there be an SDT_PROBE() here?  So far this event is always zero... */
		ci->ci_xen_next_hardclock_in_past_evcnt.ev_count++;
		next = now + ns_per_tick / 2;
		error = HYPERVISOR_set_timer_op(next);
		if (error) {
			panic("failed to re-arm Xen timer %d", error);
		}
	}

	/* Success!  */
	return 0;
}

/*
 * xen_initclocks()
 *
 *	Initialize the Xen clocks on the current CPU.
 */
void
xen_initclocks(void)
{
	struct cpu_info *ci = curcpu();

	/* If this is the primary CPU, do global initialization first.  */
	if (ci == &cpu_info_primary) {
		/* Initialize the systemwide Xen timecounter.  */
		tc_init(&xen_timecounter);
	}

	/* Attach the event counters.  */
	evcnt_attach_dynamic(&ci->ci_xen_systime_backwards_hardclock_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "xen systime went backwards in hardclock");
	evcnt_attach_dynamic(&ci->ci_xen_missed_hardclock_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "xen missed hardclock");
	evcnt_attach_dynamic(&ci->ci_xen_tsc_backwards_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "xen tsc ran backwards");
	evcnt_attach_dynamic(&ci->ci_xen_global_ns_per_tick_diff_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "xen local_ns one tick or more behind global_ns");
	evcnt_attach_dynamic(&ci->ci_xen_global_ns_backwards_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "xen global_ns prevented from running backwards");
	evcnt_attach_dynamic(&ci->ci_xen_timecounter_jump_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "xen hardclock jumped past timecounter max");
	evcnt_attach_dynamic(&ci->ci_xen_next_hardclock_in_past_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "xen next hardclock time was in the past");

	/* Fire up the clocks.  */
	xen_resumeclocks(ci);

#ifdef DOM0OPS
	/*
	 * If this is a privileged dom0, start pushing the wall
	 * clock time back to the Xen hypervisor.
	 */
	if (ci == &cpu_info_primary && xendomain_is_privileged())
		xen_timepush_init();

	/*
	 * and do it right away too, as otherwise we may have to wait as long as
	 * until ntpdate first runs....
	 */
	resettodr();
#endif
}

#ifdef DOM0OPS

/*
 * xen_timepush_init()
 *
 *	Set up the callout that periodically sets the Xen hypervisor's
 *	wall clock time, together with the machdep.xen.timepush_ticks
 *	sysctl knob that controls its period.  If either sysctl node
 *	cannot be created, the sysctl log is torn down and the callout
 *	is left unscheduled.
 */
static void
xen_timepush_init(void)
{
	const struct sysctlnode *xen_node = NULL;
	struct sysctllog *clog = NULL;
	int rv;

	/* Prepare the callout and its default period.  */
	callout_init(&xen_timepush.ch, 0);
	callout_setfunc(&xen_timepush.ch, xen_timepush_intr, NULL);
	xen_timepush.ticks = XEN_TIMEPUSH_TICKS;

	/*
	 * machdep.xen node.
	 *
	 * XXX Creation of the `machdep.xen' node should be elsewhere.
	 * It is!  See balloon.c, hypervisor.c, AND xen_machdep.c.
	 */
	rv = sysctl_createv(&clog, 0, NULL, &xen_node,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "xen",
	    SYSCTL_DESCR("Xen top level node"),
	    NULL, 0, NULL, 0,
	    CTL_MACHDEP, CTL_CREATE, CTL_EOL);
	if (rv == 0) {
		KASSERT(xen_node != NULL);

		/* int machdep.xen.timepush_ticks knob.  */
		rv = sysctl_createv(&clog, 0, &xen_node, NULL,
		    CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		    CTLTYPE_INT, "timepush_ticks",
		    SYSCTL_DESCR("How often to update the hypervisor's time-of-day;"
			" 0 to disable"),
		    sysctl_xen_timepush, 0, &xen_timepush.ticks, 0,
		    CTL_CREATE, CTL_EOL);
	}
	if (rv != 0) {
		/* Undo whatever sysctl state was created and give up.  */
		sysctl_teardown(&clog);
		return;
	}

	/* Everything is in place: start the periodic push.  */
	callout_schedule(&xen_timepush.ch, xen_timepush.ticks);
}

/*
 * xen_timepush_intr(cookie)
 *
 *	Callout handler for the timepush callout: push NetBSD's idea
 *	of the wall clock time (usually synchronized with NTP) back to
 *	the Xen hypervisor via resettodr(), then reschedule itself
 *	unless the period has been set to 0.  The cookie is unused.
 */
static void
xen_timepush_intr(void *cookie)
{

	resettodr();

	/* A period of zero means timepush has been disabled.  */
	if (xen_timepush.ticks == 0)
		return;

	callout_schedule(&xen_timepush.ch, xen_timepush.ticks);
}

/*
 * sysctl_xen_timepush(...)
 *
 *	Sysctl handler to set machdep.xen.timepush_ticks.
 */
static int
sysctl_xen_timepush(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int ticks;
	int error;

	ticks = xen_timepush.ticks;
	node = *rnode;
	node.sysctl_data = &ticks;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	if (ticks < 0)
		return EINVAL;

	if (ticks != xen_timepush.ticks) {
		xen_timepush.ticks = ticks;

		if (ticks == 0)
			callout_stop(&xen_timepush.ch);
		else
			callout_schedule(&xen_timepush.ch, ticks);
	}

	return 0;
}

#endif	/* DOM0OPS */

/* Forward declarations for the time-of-day register hooks below. */
static int	xen_rtc_get(struct todr_chip_handle *, struct timeval *);
static int	xen_rtc_set(struct todr_chip_handle *, struct timeval *);
static void	xen_wallclock_time(struct timespec *);
/*
 * xen time of day register:
 *
 *	Xen wall clock time, plus a Xen vCPU system time adjustment.
 *
 *	Attached with todr_attach() from xen_startrtclock().
 */
static struct todr_chip_handle xen_todr_chip = {
	.todr_gettime = xen_rtc_get,
	.todr_settime = xen_rtc_set,
};

/*
 * xen_startrtclock()
 *
 *	Initialize the real-time clock from x86 machdep autoconf, by
 *	attaching xen_todr_chip as a time-of-day register.
 */
void
xen_startrtclock(void)
{

	todr_attach(&xen_todr_chip);
}

/*
 * xen_rtc_get(todr, tv)
 *
 *	Get the current real-time clock from the Xen wall clock time
 *	and vCPU system time adjustment.
 *
 *	Stores the result in *tvp and always returns 0; the todr
 *	argument is unused.
 */
static int
xen_rtc_get(struct todr_chip_handle *todr, struct timeval *tvp)
{
	struct timespec ts;

	xen_wallclock_time(&ts);
	/* The todr interface wants a timeval, so convert from timespec. */
	TIMESPEC_TO_TIMEVAL(tvp, &ts);

	return 0;
}

/*
 * xen_rtc_set(todr, tv)
 *
 *	Set the Xen wall clock time, if we can.
 *
 *	On a DOM0OPS kernel running as the privileged domain this
 *	writes tv to the x86 ISA RTC and then submits it, together with
 *	the domain's system time, to the hypervisor via XENPF_settime.
 *	The return value is then whatever HYPERVISOR_platform_op()
 *	returned.  In every other case nothing is done and 0 is
 *	returned.
 *
 *	todr is unused.
 */
static int
xen_rtc_set(struct todr_chip_handle *todr, struct timeval *tvp)
{
#ifdef DOM0OPS
	if (xendomain_is_privileged()) {
		struct clock_ymdhms dt;
		xen_platform_op_t op;
		uint64_t systime_ns;
		int error;
		struct timespec ts;
		uint64_t nanouptime_ns;

		/* Convert to ymdhms and set the x86 ISA RTC.  */
		clock_secs_to_ymdhms(tvp->tv_sec, &dt);
		rtc_set_ymdhms(NULL, &dt);

		/*
		 * Get the domain's system time, to pass to XENPF_settime.
		 *
		 * Xen will subtract system_time from our current realtime clock
		 * to get this domain's boot time, which Xen calls "wall clock
		 * time", and it will then update its global wall clock time and
		 * the wall clock time for all domains, i.e. what
		 * xen_wallclock_time() and thus what xen_rtc_get() returns.
		 *
		 * NetBSD itself never uses todr_gettime() again after the
		 * initial use during boot, but it will be used by newly started
		 * NetBSD domains of course, and other guests may use it
		 * regularly.
		 */
		/*
		 * I'm not sure why XENPF_settime needs the current system_time
		 * value and can't just use Xen's internal value, e.g. from its
		 * own get_s_time() or the NOW() wrapper, just as it does itself
		 * during boot.  Perhaps they worry about the delay between when
		 * the realtime clock is read here and when do_settime() is
		 * executed in the Xen kernel?
		 *
		 * The system time submitted here is computed from nanouptime()
		 * plus xen_initial_systime_ns, the latter being intended to
		 * account for the time between when Xen started keeping time
		 * and when this dom0 kernel started keeping time.
		 *
		 * XXX The alternative, xen_global_systime_ns(), "loses" the
		 * time between when Xen last set the vcpu's system_time and
		 * when we send it back to Xen.  Perhaps using nanouptime()
		 * avoids some/most of this loss.
		 *
		 * (FreeBSD also just passes back vcpu->time, so nothing really
		 * different than using systime_ns here.)
		 */
		nanouptime(&ts);
		nanouptime_ns = ts.tv_sec * 1000000000ULL + ts.tv_nsec;
		systime_ns = nanouptime_ns + xen_initial_systime_ns;

		/* Set the hypervisor wall clock time.  */
		memset(&op, 0, sizeof(op));
		/*
		 * XXX NetBSD kernels are currently built with
		 * __XEN_INTERFACE_VERSION__ = 0x0003020a (see
		 * ../conf/std.xenversion), so we implicitly get
		 * XENPF_settime32 and struct xenpf_settime32.
		 *
		 * However it looks like with HYPERVISOR_platform_op() it should
		 * be possible to explicitly set op.interface_version to get the
		 * newer API/ABI so long as we explicitly use the version-
		 * specific opcode XENPF_settime64 and the new struct
		 * xenpf_settime64.
		 *
		 * xxx adjust the XEN_CLOCK_DEBUG printf below appropriately as
		 * well if this is ever enabled, since it reads op.u.settime.
		 */
# if 0
		op.interface_version = XENPF_INTERFACE_VERSION;
		op.cmd = XENPF_settime64;
		op.u.settime64.mbz = 0;
		op.u.settime64.secs = tvp->tv_sec;
		op.u.settime64.nsecs = tvp->tv_usec * 1000;
		op.u.settime64.system_time = systime_ns;
# else
		op.cmd = XENPF_settime;
		op.u.settime.secs = tvp->tv_sec;
		op.u.settime.nsecs = tvp->tv_usec * 1000;
		op.u.settime.system_time = systime_ns;
# endif

		/*
		 * XEN_CLOCK_DEBUG is always defined (it defaults to 0 at the
		 * top of this file), so it must be tested by value with #if;
		 * #ifdef would compile the debug output in unconditionally.
		 */
# if XEN_CLOCK_DEBUG
		{
			int64_t dt_ns;

			/*
			 * note xen_global_systime_ns_stamp was just
			 * updated/used by xen_get_timecount() via the call to
			 * nanouptime() above, so in theory there should never
			 * be any change in the difference calculated here.
			 *
			 * XXX The weird thing here is that sometime between
			 * 650k and 660k seconds after boot the result from
			 * nanouptime() seems to "drag" behind systime_ns, as if
			 * it's suddenly calculating ns from the timecounter
			 * with a different frequency (i.e. a different
			 * th_scale).  Sometimes it keeps time normally for a
			 * brief period, but then drags behind again.
			 *
			 * Also somehow despite nanouptime() dragging, the
			 * callout still keeps firing almost exactly every
			 * xen_timepush.ticks!
			 *
			 * xxx would calling xen_global_systime_ns() again
			 * directly be interesting?  Might it show any radical
			 * difference with the "previous" value of
			 * xen_global_systime_ns_stamp?
			 */
			dt_ns = (int64_t) xen_global_systime_ns_stamp -
				((int64_t) nanouptime_ns + xen_initial_systime_ns);
			/* xxx I suppose we could call xen_wallclock_time() to calc the adjustment... */
			/*
			 * The sign is printed separately from the magnitude so
			 * that a negative difference of less than one second
			 * is not shown as positive (its whole-seconds part is
			 * zero and so carries no sign of its own).
			 */
			printf("xen_rtc_set: Setting to %"PRIu32".%09"PRIu32" s"
			       " at systime %"PRIu64" ns"
			       " (initial: %"PRIu64" ns, nanouptime: %"PRIu64" ns)"
			       " diff(gst-(nt+init)): %s%"PRId64".%09"PRId64" s\n",
			       op.u.settime.secs,
			       op.u.settime.nsecs,
			       op.u.settime.system_time,
			       xen_initial_systime_ns,
			       nanouptime_ns,
			       (dt_ns < 0) ? "-" : "",
			       (int64_t) llabs(dt_ns / 1000000000LL),
			       (int64_t) llabs(dt_ns % 1000000000LL));
		}
# endif
		error = HYPERVISOR_platform_op(&op);
# if XEN_CLOCK_DEBUG	/* kern_todr.c:todr_save_systime() already reports with a printf */
		if (error) {
			printf("xen_rtc_set: XENPF_settime failed (%d)\n", error);
		}
# endif

		return error;
	}
#endif

	/* XXX Should this fail if not on privileged dom0?  */
	return 0;
}

/*
 * xen_wallclock_time(tsp)
 *
 *	Return the current low-resolution real wall clock
 *	time (boot time of the domain plus Xen system time), in tsp.
 *
 *	The hypervisor's wc_sec/wc_nsec sample and the Xen global system
 *	time are both read at splsched so the two reads are not separated
 *	by an interrupt; the sum is then normalized into *tsp.
 */
static void
xen_wallclock_time(struct timespec *tsp)
{
	struct xen_wallclock_ticket ticket;
	uint64_t total_ns;
	int s;

	s = splsched(); /* make sure we won't be interrupted */

	/*
	 * Read the last wall clock sample from the hypervisor, retrying
	 * until xen_wallclock_exit() accepts the ticket (presumably
	 * meaning the sample was not updated while we were reading it).
	 */
	for (;;) {
		xen_wallclock_enter(&ticket);
		tsp->tv_sec = HYPERVISOR_shared_info->wc_sec;
		tsp->tv_nsec = HYPERVISOR_shared_info->wc_nsec;
		if (xen_wallclock_exit(&ticket))
			break;
	}

	/* Get the Xen global system time.  */
	total_ns = xen_global_systime_ns();

	splx(s);

	/*
	 * Add the Xen system time (uptime) to the Xen "wall clock time" to
	 * get the current real wall clock time: fold the sample's
	 * nanoseconds into the running total, then split that total back
	 * into whole seconds and a sub-second remainder.
	 */
	total_ns += tsp->tv_nsec;
	tsp->tv_sec += total_ns / 1000000000ull;
	tsp->tv_nsec = total_ns % 1000000000ull;
}

#ifdef XENPV
/*
 * setstatclockrate(rate)
 *
 *	Set the statclock to run at rate, in units of ticks per second.
 *
 *	Currently Xen does not have a separate statclock, so this is a
 *	noop; instead the statclock runs in hardclock.  The rate
 *	argument is therefore ignored.
 */
void
setstatclockrate(int rate)
{
}
#endif /* XENPV */
Attachment: pgpuATx4wwk9g.pgp
Description: OpenPGP Digital Signature
References:
- Re: timekeeping regression?
  - From: Greg A. Woods
- Re: timekeeping regression?
  - From: Brad Spencer
- Re: timekeeping regression?
  - From: Greg A. Woods
- Re: timekeeping regression?
  - From: Greg A. Woods
- Re: timekeeping regression?
  - From: Greg A. Woods
- Re: timekeeping regression?
  - From: Brad Spencer
- Re: timekeeping regression?
  - From: Greg A. Woods
- Re: timekeeping regression?
  - From: Greg A. Woods
- Re: timekeeping regression?
  - From: Greg A. Woods
- Re: timekeeping regression? (a possible clue about the ~7.5 days)
  - From: Greg A. Woods
- a real clue about the ~7.5 days! (was: timekeeping regression?)
  - From: Greg A. Woods
Prev by Date: a real clue about the ~7.5 days! (was: timekeeping regression?)
Next by Date: 2nd round of xen benchmarking
Previous by Thread: a real clue about the ~7.5 days! (was: timekeeping regression?)
Next by Thread: Re: timekeeping regression?
Indexes:
Home | Main Index | Thread Index | Old Index