At Sat, 06 Jul 2024 21:26:42 -0700, "Greg A. Woods" <woods%planix.ca@localhost> wrote:
Subject: Re: timekeeping regression?
>
> I've been rewriting some chunks of xen_clock.c (in a separate post
> [[hopefully!]]) and getting some good results, but some confusion
> remains.
--
Greg A. Woods <gwoods%acm.org@localhost>
Kelowna, BC +1 250 762-7675 RoboHack <woods%robohack.ca@localhost>
Planix, Inc. <woods%planix.com@localhost> Avoncote Farms <woods%avoncote.ca@localhost>
/* $NetBSD: xen_clock.c,v 1.18 2023/09/10 15:23:01 bouyer Exp $ */
/*-
* Copyright (c) 2017, 2018 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Taylor R. Campbell.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "opt_xen.h"
#ifndef XEN_CLOCK_DEBUG
#define XEN_CLOCK_DEBUG 0
#endif
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: xen_clock.c,v 1.18 2023/09/10 15:23:01 bouyer Exp $");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/callout.h>
#include <sys/cpu.h>
#include <sys/device.h>
#include <sys/evcnt.h>
#include <sys/intr.h>
#include <sys/kernel.h>
#include <sys/lwp.h>
#include <sys/proc.h>
#include <sys/sdt.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/timetc.h>
#include <dev/clock_subr.h>
#include <machine/cpu.h>
#include <machine/cpu_counter.h>
#include <machine/lock.h>
#include <xen/evtchn.h>
#include <xen/hypervisor.h>
#include <xen/include/public/vcpu.h>
#include <xen/xen.h>
#include <x86/rtc.h>
#define NS_PER_TICK ((uint64_t)1000000000ULL/hz)
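/*
* For example (illustrative arithmetic only): with the NetBSD default
* HZ=100, NS_PER_TICK is 1000000000/100 = 10000000 ns, i.e. 10 ms
* between hardclock ticks.
*/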
static uint64_t xen_vcputime_sched_systime_ns(void);
static uint64_t xen_global_systime_ns(void);
static unsigned xen_get_timecount(struct timecounter *);
static int xen_timer_handler(void *, struct clockframe *);
/*
* dtrace probes
*/
SDT_PROBE_DEFINE2(sdt, xen, tsc, backwards,
"uint64_t"/*tsc*/,
"uint64_t"/*tsc_at_start*/);
SDT_PROBE_DEFINE2(sdt, xen, global_ns, backwards,
"uint64_t"/*local_ns*/,
"uint64_t"/*global_ns*/);
SDT_PROBE_DEFINE2(sdt, xen, hardclock, systime__backward,
"uint64_t"/*last_systime_ns*/,
"uint64_t"/*this_systime_ns*/);
SDT_PROBE_DEFINE2(sdt, xen, hardclock, tick,
"uint64_t"/*last_systime_ns*/,
"uint64_t"/*this_systime_ns*/);
SDT_PROBE_DEFINE3(sdt, xen, hardclock, jump,
"uint64_t"/*last_systime_ns*/,
"uint64_t"/*this_systime_ns*/,
"uint64_t"/*nticks*/);
SDT_PROBE_DEFINE3(sdt, xen, hardclock, missed,
"uint64_t"/*last_systime_ns*/,
"uint64_t"/*this_systime_ns*/,
"uint64_t"/*remaining_ns*/);
/*
* xen timecounter:
*
* Xen vCPU system time, plus an adjustment with rdtsc.
*/
static struct timecounter xen_timecounter = {
.tc_get_timecount = xen_get_timecount,
.tc_poll_pps = NULL,
.tc_counter_mask = ~0U,
.tc_frequency = 1000000000ULL, /* 1 GHz, i.e. units of nanoseconds */
.tc_name = "xen_system_time", /* XXX "xen_TSC" */
.tc_quality = 10000,
};
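/*
* Illustrative arithmetic (not code): with tc_frequency = 1 GHz and the
* full 32-bit tc_counter_mask above, the counter wraps every 2^32 ns,
* i.e. about every 4.29 seconds, so the timecounter(9) contract
* requires the counter to be read (e.g. from hardclock) at least that
* often -- see the overflow warning in xen_timer_handler() below.
*/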
/*
* xen_global_systime_ns_stamp
*
* The latest Xen vCPU system time that has been observed on any
* CPU, for a global monotonic view of the Xen system time clock.
*/
static volatile uint64_t xen_global_systime_ns_stamp __cacheline_aligned;
#ifdef DOM0OPS
/*
* xen timepush state:
*
* Callout to periodically, after a sysctl-configurable number of
* NetBSD ticks, set the Xen hypervisor's wall clock time.
*
* Linux does this once every 11 minutes, as well as any time settimeofday() is
* called (and maybe every time the clock is jumped by NTP etc.)
*/
static struct {
struct callout ch;
int ticks;
} xen_timepush;
static void xen_timepush_init(void);
static void xen_timepush_intr(void *);
static int sysctl_xen_timepush(SYSCTLFN_ARGS);
#endif
/*
* Xen "system time" and "wall clock time"
*
* Xen provides guests with two timestamp values, the system-time (time since
* guest boot or resume) and the wall-clock time (time since the epoch at the
* point when system-time was zero, i.e. Xen wall clock time is actually
* boot-time (or resume time) for the guest, and for dom0 this is also very
* close to the boot-time of the hypervisor).
*
* These are provided through a shared memory structure (shared_info_page, and
* in the array of vcpu_time_info within).
*
* The system-time in the vCPU's vcpu_time_info is updated by Xen every time the
* guest is scheduled, along with a snapshot of the CPU's TSC register
* value (and some related values for scaling the TSC to nanoseconds). While
* running, the guest can get the current system-time by extrapolating from the
* values in vcpu_time_info using the value of the TSC register (an x86 register
* counting CPU clock cycles, often emulated in domUs).
*
* TSC values in Xen are obtained through the RDTSC instruction and are either
* native, i.e. accessed directly from the CPU register (in dom0, and possibly
* in some situations in domUs); or emulated, i.e. intercepted through a trap by
* Xen (in domUs, e.g. on hardware without the TSC_INVARIANT CPU feature). In
* emulated mode the CPU clock runs at a fictitious frequency of 1 GHz. Either
* way, multiplier and shift values are provided to adjust the TSC value to
* nanoseconds so the frequency need not be measured (it was measured by Xen
* when it first booted).
*
* XXX For SMP domains with multiple vCPUs it looks like the tsc_timestamp is
* separately updated for each vCPU as the domain is scheduled to run, so I
* think we might want to be careful to read the TSC from the CPU associated
* with the vcpu_time_info we're calculating "local" system-time from.
*/
/*
* xen_rdtsc()
*
* Read the local pCPU's tsc.
*/
static inline uint64_t
xen_rdtsc(void)
{
uint32_t lo, hi;
asm volatile("rdtsc" : "=a"(lo), "=d"(hi));
return ((uint64_t)hi << 32) | lo;
}
/*
* struct xen_vcputime_ticket
*
* State for a vCPU read section, during which a caller may read
* from fields of a struct vcpu_time_info and call xen_rdtsc.
* Caller must enter with xen_vcputime_enter, exit with
* xen_vcputime_exit, and be prepared to retry if
* xen_vcputime_exit fails.
*/
struct xen_vcputime_ticket {
uint64_t version;
};
/*
* xen_vcputime_enter(tp)
*
* Enter a vCPU time read section and store a ticket in *tp, which
* the caller must use with xen_vcputime_exit. Return a pointer
* to the current CPU's vcpu_time_info structure. Caller must
* already be bound to the CPU.
*/
static inline volatile struct vcpu_time_info *
xen_vcputime_enter(struct xen_vcputime_ticket *tp)
{
volatile struct vcpu_time_info *vt = &curcpu()->ci_vcpu->time;
while (__predict_false(1 & (tp->version = vt->version)))
SPINLOCK_BACKOFF_HOOK;
/*
* Must read the version before reading the tsc on the local
* pCPU. We are racing only with interruption by the
* hypervisor, so no need for a stronger memory barrier.
*/
__insn_barrier();
return vt;
}
/*
* xen_vcputime_exit(vt, tp)
*
* Exit a vCPU time read section with the ticket in *tp from
* xen_vcputime_enter. Return true on success, false if caller
* must retry.
*/
static inline bool
xen_vcputime_exit(volatile struct vcpu_time_info *vt,
struct xen_vcputime_ticket *tp)
{
KASSERT(vt == &curcpu()->ci_vcpu->time);
/*
* Must read the tsc before re-reading the version on the local
* pCPU. We are racing only with interruption by the
* hypervisor, so no need for a stronger memory barrier.
*/
__insn_barrier();
return tp->version == vt->version;
}
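/*
* The intended usage is a read-retry loop, as in
* xen_vcputime_sched_systime_ns() below:
*
*   do {
*           vt = xen_vcputime_enter(&ticket);
*           ... read vt->system_time, vt->tsc_timestamp, etc. ...
*   } while (!xen_vcputime_exit(vt, &ticket));
*/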
/*
* xen_tsc_to_ns_delta(delta_tsc, mul_frac, shift)
*
* Convert a difference in tsc units to a difference in
* nanoseconds given a multiplier and shift for the unit
* conversion.
*
* from xen.h:
*
* Current system time:
* system_time +
* ((((tsc - tsc_timestamp) << tsc_shift) * tsc_to_system_mul) >> 32)
* CPU frequency (Hz):
* ((10^9 << 32) / tsc_to_system_mul) >> tsc_shift
*/
static inline uint64_t
xen_tsc_to_ns_delta(uint64_t delta_tsc, uint32_t tsc_to_system_mul,
int8_t tsc_shift)
{
uint32_t delta_tsc_hi, delta_tsc_lo;
if (delta_tsc == 0)
return 0;
if (tsc_shift < 0)
delta_tsc >>= -tsc_shift;
else
delta_tsc <<= tsc_shift;
delta_tsc_hi = delta_tsc >> 32;
delta_tsc_lo = delta_tsc & 0xffffffffUL;
/* d*m/2^32 = (2^32 d_h + d_l)*m/2^32 = d_h*m + (d_l*m)/2^32 */
return ((uint64_t)delta_tsc_hi * tsc_to_system_mul) +
(((uint64_t)delta_tsc_lo * tsc_to_system_mul) >> 32);
}
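/*
* A worked example with made-up (but self-consistent) scaling values:
* for a 4 GHz TSC, Xen would supply tsc_to_system_mul = 0x80000000 and
* tsc_shift = -1 (per the frequency formula above, with the negative
* shift reversing direction: ((10^9 << 32) / 0x80000000) << 1 = 4e9 Hz).
* A delta of 4000 TSC ticks then becomes
* ((4000 >> 1) * 0x80000000) >> 32 = 1000 ns, as expected for 4000
* cycles at 4 GHz.
*/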
/*
* xen_vcputime_sched_systime_ns()
*
* Return a snapshot of the current Xen system time to the
* resolution of the Xen hypervisor tick, in units of nanoseconds.
*
* I.e. the Xen system time at the time this domain was last scheduled.
*
* N.B. it is assumed this is only called when preemption is impossible.
*/
static uint64_t
xen_vcputime_sched_systime_ns(void)
{
volatile struct vcpu_time_info *vt;
struct xen_vcputime_ticket ticket;
uint64_t sched_systime_ns;
do {
vt = xen_vcputime_enter(&ticket);
sched_systime_ns = vt->system_time;
} while (!xen_vcputime_exit(vt, &ticket));
return sched_systime_ns;
}
/*
* struct xen_wallclock_ticket
*
* State for a wall clock read section, during which a caller may
* read from the wall clock fields of HYPERVISOR_shared_info.
* Caller must enter with xen_wallclock_enter, exit with
* xen_wallclock_exit, and be prepared to retry if
* xen_wallclock_exit fails.
*/
struct xen_wallclock_ticket {
uint32_t version;
};
/*
* xen_wallclock_enter(tp)
*
* Enter a wall clock read section and store a ticket in *tp,
* which the caller must use with xen_wallclock_exit.
*/
static inline void
xen_wallclock_enter(struct xen_wallclock_ticket *tp)
{
while (__predict_false(1 & (tp->version =
HYPERVISOR_shared_info->wc_version)))
SPINLOCK_BACKOFF_HOOK;
/*
* Must read the version from memory before reading the
* timestamp from memory, as written potentially by another
* pCPU.
*/
membar_consumer();
}
/*
* xen_wallclock_exit(tp)
*
* Exit a wall clock read section with the ticket in *tp from
* xen_wallclock_enter. Return true on success, false if caller
* must retry.
*/
static inline bool
xen_wallclock_exit(struct xen_wallclock_ticket *tp)
{
/*
* Must read the timestamp from memory before re-reading the
* version from memory, as written potentially by another pCPU.
*/
membar_consumer();
return tp->version == HYPERVISOR_shared_info->wc_version;
}
/*
* xen_global_systime_ns()
*
* Return a monotonic view of the system time (current domain's time since
* boot) in nanoseconds.
*
* First compute the current vCPU's "system time", which is the vCPU
* "system time" (Xen's view of this domain's "system time" at the time
* this domain was scheduled), plus an adjustment based on the TSC offset
* since the time this domain was scheduled (scaled to nanoseconds using
* Xen's supplied scaling factors).
*
* If this vCPU's current "system time" is greater than the last recorded
* "global system time" then store this as the new global system time and
* return it, else return the current "global system time" (thus keeping
* the global system time monotonically advancing).
*
* XXX FreeBSD's implementation does not worry about being preempted, nor does
* it try to keep track of skew between vCPUs.
*
* See FreeBSD's sys/dev/xen/timer/xen_timer.c:xentimer_get_timecount(),
* and also sys/x86/x86/pvclock.c:pvclock_get_timecount().
*/
static uint64_t
xen_global_systime_ns(void)
{
uint64_t local_ns, global_ns, result_ns;
/*
* XXX Can we avoid retrying if the CAS fails?
*
* XXX Has enough time passed in this "loop" that we really need to
* fetch a new TSC value and calculate a new local_ns before trying to
* store it again? I guess we could have been preempted....
*/
do {
/* XXX this next hunk of code is partly copied in xen_delay() */
volatile struct vcpu_time_info *vt;
struct xen_vcputime_ticket ticket;
uint64_t start_systime_ns, tsc_at_start, tsc, delta_tsc, delta_ns;
uint32_t tsc_to_system_mul;
int8_t tsc_shift;
int s;
struct cpu_info *ci;
s = splsched(); /* make sure we won't be interrupted XXX splhigh()??? */
ci = curcpu();
do {
vt = xen_vcputime_enter(&ticket);
/*
* Grab Xen's snapshot of system time and the TSC value
* at the time this domain was last scheduled to run.
*/
start_systime_ns = vt->system_time;
tsc_at_start = vt->tsc_timestamp;
/* Get Xen's current idea of how fast the TSC is counting. */
/* xxx these should be static-enough and could be fetched once outside this loop? */
tsc_to_system_mul = vt->tsc_to_system_mul;
tsc_shift = vt->tsc_shift;
/*
* Read the CPU's current TSC (or the emulated one).
*
* xxx we need to do this on the same vCPU as is
* represented by vt, thus the splsched()/splx()
*/
tsc = xen_rdtsc();
} while (!xen_vcputime_exit(vt, &ticket));
splx(s);
if (__predict_false(tsc < tsc_at_start)) {
SDT_PROBE2(sdt, xen, tsc, backwards,
tsc, tsc_at_start);
#if XEN_CLOCK_DEBUG
device_printf(ci->ci_dev,
"xen tsc ran backwards:"
" tsc=%"PRIu64" tsc_at_start=%"PRIu64"\n",
tsc, tsc_at_start);
#endif
ci->ci_xen_tsc_backwards_evcnt.ev_count++;
tsc = tsc_at_start;
}
/* Find how far the CPU's TSC has advanced since we were scheduled. */
delta_tsc = tsc - tsc_at_start;
/* Convert the TSC delta to a nanosecond delta. */
delta_ns = xen_tsc_to_ns_delta(delta_tsc, tsc_to_system_mul,
tsc_shift);
/* Compute the TSC-adjusted system time for this vCPU. */
local_ns = start_systime_ns + delta_ns;
global_ns = atomic_load_acquire(&xen_global_systime_ns_stamp);
/*
* The global_ns is sometimes (often, in (SMP-only?) dom0, even
* with vCPUs pinned) greater than the value computed from the
* current vCPU (local_ns), so we don't want to risk it going
* backwards.  global_ns is (likely) the same value that was last
* returned to the timecounter by xen_get_timecount(), and may
* well be the next value returned too.
*
* This is more or less what FreeBSD does as well.
*/
if (__predict_false(local_ns <= global_ns)) {
if (local_ns != global_ns) {
SDT_PROBE2(sdt, xen, global_ns, backwards,
local_ns, global_ns);
#if 0 /* XEN_CLOCK_DEBUG XXX way too noisy, causes hangs! (esp. in dom0) */
device_printf(ci->ci_dev,
"xen global_ns prevented from running backwards:"
" local_ns=%"PRIu64" global_ns=%"PRIu64"\n",
local_ns, global_ns);
#endif
ci->ci_xen_global_ns_backwards_evcnt.ev_count++;
}
result_ns = global_ns;
/* avoid saving a lower, or same, global_ns again */
#if __NetBSD_Prereq__(9, 99, 97) /* xxx only for my wonky -current */
membar_release();
#else
membar_exit();
#endif
break;
} else {
result_ns = local_ns;
}
#if __NetBSD_Prereq__(9, 99, 97) /* xxx only for my wonky -current */
membar_release();
#else
membar_exit();
#endif
} while (atomic_cas_64(&xen_global_systime_ns_stamp, global_ns, result_ns)
!= global_ns);
return result_ns;
}
/*
* xen_get_timecount(tc)
*
* Return the low 32 bits of a global monotonic view of the Xen
* system time.
*/
static unsigned
xen_get_timecount(struct timecounter *tc)
{
KASSERT(tc == &xen_timecounter);
return (unsigned)xen_global_systime_ns();
}
/*
* xen_delay(n)
*
* Wait approximately n microseconds.
*/
void
xen_delay(unsigned us)
{
int bound;
/* Bind to the CPU so we don't compare tsc on different CPUs. */
bound = curlwp_bind();
if (curcpu()->ci_vcpu == NULL) {
curlwp_bindx(bound);
return;
}
/* Short wait (<500000us) or long wait? */
if (us < 500000) {
/* XXX this next hunk of code is partly copied in xen_global_systime_ns() */
/*
* Xen system time is not precise enough for short
* delays, so use the tsc instead.
*
* We work with the current tsc frequency, and figure
* that if it changes while we're delaying, we've
* probably delayed long enough -- up to 500000us.
*
* We do not use cpu_frequency(ci), which uses a
* quantity detected at boot time, and which may have
* changed by now if Xen has migrated this vCPU to
* another pCPU.
*
* XXX How long does it take to migrate pCPUs?
*/
volatile struct vcpu_time_info *vt;
struct xen_vcputime_ticket ticket;
uint64_t tsc_at_start, last_tsc, tsc;
uint32_t tsc_to_system_mul;
int8_t tsc_shift;
/* Get the starting tsc and tsc frequency. */
do {
vt = xen_vcputime_enter(&ticket);
tsc_at_start = last_tsc = xen_rdtsc();
tsc_to_system_mul = vt->tsc_to_system_mul;
tsc_shift = vt->tsc_shift;
} while (!xen_vcputime_exit(vt, &ticket));
/*
* Wait until as many tsc ticks as there are in n
* microseconds have elapsed, or the tsc has gone
* backwards meaning we've probably migrated pCPUs.
*/
for (;;) {
tsc = xen_rdtsc();
if (__predict_false(tsc < last_tsc))
break;
if (xen_tsc_to_ns_delta(tsc - tsc_at_start,
tsc_to_system_mul, tsc_shift)/1000 >= us) {
break;
}
last_tsc = tsc;
}
} else {
/*
* Use the Xen system time for >=500000us delays. From my
* testing, it seems to sometimes run backward by about
* 110us, which is not so bad.
*/
uint64_t us_ns = 1000 * (uint64_t)us;
uint64_t start_ns;
/* Get the start time. */
start_ns = xen_vcputime_sched_systime_ns();
/* Wait until the system time has passed the end. */
do {
HYPERVISOR_yield();
} while (xen_vcputime_sched_systime_ns() - start_ns < us_ns);
}
/* Unbind from the CPU if we weren't already bound. */
curlwp_bindx(bound);
}
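/*
* Usage note (illustrative values): xen_delay(100) spins on the TSC for
* roughly 100us, while xen_delay(600000) takes the long-wait branch and
* yields to the hypervisor between samples of the vCPU system time.
*/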
/*
* xen_suspendclocks(ci)
*
* Stop handling the Xen timer event on the CPU of ci. Caller
* must be running on and bound to ci's CPU.
*
* Actually, caller must have kpreemption disabled, because that's
* easier to assert at the moment.
*/
void
xen_suspendclocks(struct cpu_info *ci)
{
int evtch;
KASSERT(ci == curcpu());
KASSERT(kpreempt_disabled());
/*
* Find the VIRQ_TIMER event channel and close it so new timer
* interrupt events stop getting delivered to it.
*
* XXX Should this happen later? This is not the reverse order
* of xen_resumeclocks. It is apparently necessary in this
* order only because we don't stash evtchn anywhere, but we
* could stash it.
*/
evtch = unbind_virq_from_evtch(VIRQ_TIMER);
KASSERT(evtch != -1);
/*
* Mask the event channel so we stop getting new interrupts on
* it.
*/
hypervisor_mask_event(evtch);
/*
* Now that we are no longer getting new interrupts, remove the
* handler and wait for any existing calls to the handler to
* complete. After this point, there can be no concurrent
* calls to xen_timer_handler.
*/
event_remove_handler(evtch,
__FPTRCAST(int (*)(void *), xen_timer_handler), ci);
aprint_verbose("Xen clock: removed event channel %d\n", evtch);
/* We'd better not have switched CPUs. */
KASSERT(ci == curcpu());
}
/*
* xen_resumeclocks(ci)
*
* Start handling the Xen timer event on the CPU of ci. Arm the
* Xen timer. Caller must be running on and bound to ci's CPU.
*
* Actually, caller must have kpreemption disabled, because that's
* easier to assert at the moment.
*/
void
xen_resumeclocks(struct cpu_info *ci)
{
char intr_xname[INTRDEVNAMEBUF];
int evtch;
int error __diagused;
KASSERT(ci == curcpu());
KASSERT(kpreempt_disabled());
/*
* Allocate an event channel to receive VIRQ_TIMER events.
*/
evtch = bind_virq_to_evtch(VIRQ_TIMER);
KASSERT(evtch != -1);
/*
* Set an event handler for VIRQ_TIMER events to call
* xen_timer_handler.
*/
snprintf(intr_xname, sizeof(intr_xname), "%s clock",
device_xname(ci->ci_dev));
/* XXX sketchy function pointer cast -- fix the API, please */
if (event_set_handler(evtch,
__FPTRCAST(int (*)(void *), xen_timer_handler),
ci, IPL_CLOCK, NULL, intr_xname, true, ci) == NULL)
panic("failed to establish timer interrupt handler");
aprint_verbose("Xen %s: using event channel %d\n", intr_xname, evtch);
/* Disarm the periodic timer on Xen>=3.1 which is allegedly buggy. */
if (XEN_MAJOR(xen_version) > 3 || XEN_MINOR(xen_version) > 0) {
error = HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer,
ci->ci_vcpuid, NULL);
KASSERT(error == 0);
}
/* Pretend the last hardclock happened when we were last scheduled. */
ci->ci_xen_hardclock_systime_ns = xen_vcputime_sched_systime_ns();
/* Arm the one-shot timer. */
error = HYPERVISOR_set_timer_op(ci->ci_xen_hardclock_systime_ns +
NS_PER_TICK);
KASSERT(error == 0);
/*
* Ready to go. Unmask the event. After this point, Xen may
* start calling xen_timer_handler.
*/
hypervisor_unmask_event(evtch);
/* We'd better not have switched CPUs. */
KASSERT(ci == curcpu());
}
/*
* xen_timer_handler(cookie, frame)
*
* Periodic Xen timer event handler for NetBSD hardclock. Calls
* to this may get delayed(*), so we run hardclock as many times as
* we need to in order to cover the Xen system time that elapsed.
* After that, re-arm the timer to run again at the next tick.
* The cookie is the pointer to struct cpu_info.
*
* (*) n.b. comment in FreeBSD xen_timer.c says:
*
* Xen timers may fire up to 100us off
*/
static int
xen_timer_handler(void *cookie, struct clockframe *frame)
{
const uint64_t ns_per_tick = NS_PER_TICK;
struct cpu_info *ci = curcpu();
uint64_t last, now, delta, next;
int error;
KASSERT(cpu_intr_p());
KASSERT(cookie == ci);
#if defined(XENPV)
frame = NULL; /* We use values cached in curcpu() */
#endif
/*
* Find how many nanoseconds of Xen system time has elapsed
* since the last hardclock tick.
*/
last = ci->ci_xen_hardclock_systime_ns;
now = xen_global_systime_ns(); /* xxx use global, avoid skew! */
SDT_PROBE2(sdt, xen, hardclock, tick, last, now);
if (__predict_false(now < last)) {
SDT_PROBE2(sdt, xen, hardclock, systime__backward,
last, now);
#if XEN_CLOCK_DEBUG
device_printf(ci->ci_dev, "xen systime ran backwards"
" in hardclock %"PRIu64"ns\n",
last - now);
#endif
ci->ci_xen_systime_backwards_hardclock_evcnt.ev_count++;
/*
* We've lost track of time.  Just pretend that one
* tick elapsed, and reset our idea of the last tick.
*/
ci->ci_xen_hardclock_systime_ns = last = now - ns_per_tick;
}
delta = now - last;
if (__predict_false(delta >= 2*ns_per_tick)) {
/*
* Warn if we violate the timecounter(9) contract: with a
* k-bit timecounter (here k = 32), and timecounter
* frequency f (here f = 1 GHz), the maximum period
* between hardclock calls is 2^k / f.
*/
if (delta > xen_timecounter.tc_counter_mask) {
SDT_PROBE3(sdt, xen, hardclock, jump,
last, now, delta/ns_per_tick);
printf("WARNING: hardclock skipped %"PRIu64"ns"
" (%"PRIu64" -> %"PRIu64"),"
" exceeding maximum of %"PRIu32"ns"
" for timecounter(9)\n",
delta, last, now,
xen_timecounter.tc_counter_mask);
ci->ci_xen_timecounter_jump_evcnt.ev_count++;
}
/* don't try to catch up more than one second at once */
if (delta > 1000000000UL)
delta = 1000000000UL;
}
/*
* Play hardclock catchup: run the hardclock timer as many
* times as appears necessary based on how much time has
* passed.
*
* XXX This happens extremely frequently -- ~50-80% of the HZ rate!
*/
while (delta >= ns_per_tick) {
ci->ci_xen_hardclock_systime_ns += ns_per_tick;
delta -= ns_per_tick;
hardclock(frame);
if (__predict_false(delta >= ns_per_tick)) {
SDT_PROBE3(sdt, xen, hardclock, missed,
last, now, delta);
ci->ci_xen_missed_hardclock_evcnt.ev_count++;
}
}
/*
* Re-arm the timer. If it fails, it's probably because the desired
* time is in the past, possibly because we're in the process of
* catching up missed hardclock calls. In this case schedule a tick in
* the near future.
*/
next = ci->ci_xen_hardclock_systime_ns + ns_per_tick;
error = HYPERVISOR_set_timer_op(next);
if (error) {
/* xxx should there be an SDT_PROBE() here? So far this event is always zero... */
ci->ci_xen_next_hardclock_in_past_evcnt.ev_count++;
next = now + ns_per_tick / 2;
error = HYPERVISOR_set_timer_op(next);
if (error) {
panic("failed to re-arm Xen timer %d", error);
}
}
/* Success! */
return 0;
}
/*
* xen_initclocks()
*
* Initialize the Xen clocks on the current CPU.
*/
void
xen_initclocks(void)
{
struct cpu_info *ci = curcpu();
/* If this is the primary CPU, do global initialization first. */
if (ci == &cpu_info_primary) {
/* Initialize the systemwide Xen timecounter. */
tc_init(&xen_timecounter);
}
/* Attach the event counters. */
evcnt_attach_dynamic(&ci->ci_xen_systime_backwards_hardclock_evcnt,
EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
"xen systime went backwards in hardclock");
evcnt_attach_dynamic(&ci->ci_xen_missed_hardclock_evcnt,
EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
"xen missed hardclock");
evcnt_attach_dynamic(&ci->ci_xen_tsc_backwards_evcnt,
EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
"xen tsc ran backwards");
evcnt_attach_dynamic(&ci->ci_xen_global_ns_backwards_evcnt,
EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
"xen global_ns prevented from running backwards");
evcnt_attach_dynamic(&ci->ci_xen_timecounter_jump_evcnt,
EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
"xen hardclock jumped past timecounter max");
evcnt_attach_dynamic(&ci->ci_xen_next_hardclock_in_past_evcnt,
EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
"xen next hardclock time was in the past");
/* Fire up the clocks. */
xen_resumeclocks(ci);
#ifdef DOM0OPS
/*
* If this is a privileged dom0, start pushing the wall
* clock time back to the Xen hypervisor.
*/
if (ci == &cpu_info_primary && xendomain_is_privileged())
xen_timepush_init();
#endif
}
#ifdef DOM0OPS
/*
* xen_timepush_init()
*
* Initialize callout to periodically set Xen hypervisor's wall
* clock time.
*/
static void
xen_timepush_init(void)
{
struct sysctllog *log = NULL;
const struct sysctlnode *node = NULL;
int error;
/* Start periodically updating the hypervisor's wall clock time. */
callout_init(&xen_timepush.ch, 0);
callout_setfunc(&xen_timepush.ch, xen_timepush_intr, NULL);
/* Pick a default frequency for timepush. (Linux uses 11 minutes) */
xen_timepush.ticks = 530*hz + 3; /* avoid exact # of min/sec */
/* Create machdep.xen node. */
/* XXX Creation of the `machdep.xen' node should be elsewhere. It is! See balloon.c, hypervisor.c, AND xen_machdep.c */
error = sysctl_createv(&log, 0, NULL, &node,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "xen",
SYSCTL_DESCR("Xen top level node"),
NULL, 0, NULL, 0,
CTL_MACHDEP, CTL_CREATE, CTL_EOL);
if (error)
goto fail;
KASSERT(node != NULL);
/* Create int machdep.xen.timepush_ticks knob. */
error = sysctl_createv(&log, 0, &node, NULL,
CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
CTLTYPE_INT, "timepush_ticks",
SYSCTL_DESCR("How often to update the hypervisor's time-of-day;"
" 0 to disable"),
sysctl_xen_timepush, 0, &xen_timepush.ticks, 0,
CTL_CREATE, CTL_EOL);
if (error)
goto fail;
/* Start the timepush callout. */
callout_schedule(&xen_timepush.ch, xen_timepush.ticks);
/* Success! */
return;
fail: sysctl_teardown(&log);
}
/*
* xen_timepush_intr(cookie)
*
* Callout interrupt handler to push NetBSD's idea of the wall
* clock time, usually synchronized with NTP, back to the Xen
* hypervisor.
*/
static void
xen_timepush_intr(void *cookie)
{
resettodr();
if (xen_timepush.ticks)
callout_schedule(&xen_timepush.ch, xen_timepush.ticks);
}
/*
* sysctl_xen_timepush(...)
*
* Sysctl handler to set machdep.xen.timepush_ticks.
*/
static int
sysctl_xen_timepush(SYSCTLFN_ARGS)
{
struct sysctlnode node;
int ticks;
int error;
ticks = xen_timepush.ticks;
node = *rnode;
node.sysctl_data = &ticks;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return error;
if (ticks < 0)
return EINVAL;
if (ticks != xen_timepush.ticks) {
xen_timepush.ticks = ticks;
if (ticks == 0)
callout_stop(&xen_timepush.ch);
else
callout_schedule(&xen_timepush.ch, ticks);
}
return 0;
}
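/*
* For example, from a dom0 shell the push interval can be inspected or
* disabled via the knob created above:
*
*   sysctl machdep.xen.timepush_ticks
*   sysctl -w machdep.xen.timepush_ticks=0
*/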
#endif /* DOM0OPS */
static int xen_rtc_get(struct todr_chip_handle *, struct timeval *);
static int xen_rtc_set(struct todr_chip_handle *, struct timeval *);
static void xen_wallclock_time(struct timespec *);
/*
* xen time of day register:
*
* Xen wall clock time, plus a Xen vCPU system time adjustment.
*/
static struct todr_chip_handle xen_todr_chip = {
.todr_gettime = xen_rtc_get,
.todr_settime = xen_rtc_set,
};
/*
* xen_startrtclock()
*
* Initialize the real-time clock from x86 machdep autoconf.
*/
void
xen_startrtclock(void)
{
todr_attach(&xen_todr_chip);
}
/*
* xen_rtc_get(todr, tv)
*
* Get the current real-time clock from the Xen wall clock time
* and vCPU system time adjustment.
*/
static int
xen_rtc_get(struct todr_chip_handle *todr, struct timeval *tvp)
{
struct timespec ts;
xen_wallclock_time(&ts);
TIMESPEC_TO_TIMEVAL(tvp, &ts);
return 0;
}
/*
* xen_rtc_set(todr, tv)
*
* Set the Xen wall clock time, if we can.
*/
static int
xen_rtc_set(struct todr_chip_handle *todr, struct timeval *tvp)
{
#ifdef DOM0OPS
if (xendomain_is_privileged()) {
struct clock_ymdhms dt;
xen_platform_op_t op;
uint64_t systime_ns;
int error;
# ifdef XEN_CLOCK_DEBUG
struct timespec ts;
uint64_t nanouptime_ns;
int64_t dt_ns;
# endif
/* Convert to ymdhms and set the x86 ISA RTC. */
clock_secs_to_ymdhms(tvp->tv_sec, &dt);
rtc_set_ymdhms(NULL, &dt);
/*
* Get the domain's system time, to pass to XENPF_settime.
*
* Xen will subtract this from our current wall clock time to
* get this domain's boot time, which Xen calls "wall clock
* time".
*/
/*
* xxx we could probably use nanouptime() here instead, but
* there's a weird so-far unaccounted-for difference -- it's too
* big, I think, to be the time between when Xen thinks it first
* created the domain, which should be systime_ns, and when the
* domain first started keeping time, which should be
* nanouptime_ns.
*/
systime_ns = xen_global_systime_ns();
/* Set the hypervisor wall clock time. */
memset(&op, 0, sizeof(op));
/*
* XXX NetBSD kernels are currently built with
* __XEN_INTERFACE_VERSION__ = 0x0003020a (see
* ../conf/std.xenversion), so we implicitly get
* XENPF_settime32 and struct xenpf_settime32.
*
* However it looks like with HYPERVISOR_platform_op() it should
* be possible to explicitly set op.interface_version to get the
* newer API/ABI so long as we explicitly use the version-
* specific opcode XENPF_settime64 and the new struct
* xenpf_settime64.
*
* xxx adjust XEN_CLOCK_DEBUG appropriately as well....
*/
# if 0
op.interface_version = XENPF_INTERFACE_VERSION;
op.cmd = XENPF_settime64;
op.u.settime64.mbz = 0;
op.u.settime64.secs = tvp->tv_sec;
op.u.settime64.nsecs = tvp->tv_usec * 1000;
op.u.settime64.system_time = systime_ns;
# else
op.cmd = XENPF_settime;
op.u.settime.secs = tvp->tv_sec;
op.u.settime.nsecs = tvp->tv_usec * 1000;
op.u.settime.system_time = systime_ns;
# endif
# ifdef XEN_CLOCK_DEBUG
nanouptime(&ts);
nanouptime_ns = ts.tv_sec * 1000000000ULL + ts.tv_nsec;
dt_ns = (int64_t) op.u.settime.system_time - (int64_t) nanouptime_ns;
printf("xen_rtc_set: Setting to %"PRIu32".%09"PRIu32" s at systime %"PRIu64
" ns (nanouptime: %"PRIu64" ns, diff(st-nt): %"PRId64".%09"PRId64" s)\n",
op.u.settime.secs,
op.u.settime.nsecs,
op.u.settime.system_time,
nanouptime_ns,
(int64_t) (dt_ns / 1000000000LL), (int64_t) (dt_ns % 1000000000LL));
# endif
error = HYPERVISOR_platform_op(&op);
# ifdef XEN_CLOCK_DEBUG /* kern_todr.c:todr_save_systime() already reports with a printf */
if (error) {
printf("xen_rtc_set: XENPF_settime failed (%d)\n", error);
}
# endif
return error;
}
#endif
/* XXX Should this fail if not on privileged dom0? */
return 0;
}
/*
* xen_wallclock_time(tsp)
*
* Return the current low-resolution wall clock
* time (boot time of the domain plus systime), in tsp.
*/
static void
xen_wallclock_time(struct timespec *tsp)
{
struct xen_wallclock_ticket ticket;
uint64_t systime_ns;
int s = splsched(); /* make sure we won't be interrupted */
/* Read the last wall clock sample from the hypervisor. */
do {
xen_wallclock_enter(&ticket);
tsp->tv_sec = HYPERVISOR_shared_info->wc_sec;
tsp->tv_nsec = HYPERVISOR_shared_info->wc_nsec;
} while (!xen_wallclock_exit(&ticket));
/* Get the global system time. */
systime_ns = xen_global_systime_ns();
splx(s);
/* Add the system time to the wall clock time. */
systime_ns += tsp->tv_nsec;
tsp->tv_sec += systime_ns / 1000000000ull;
tsp->tv_nsec = systime_ns % 1000000000ull;
}
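/*
* Illustrative arithmetic: if the hypervisor's wall clock sample is
* wc_sec=1720000000, wc_nsec=900000000 and xen_global_systime_ns()
* returns 250000000, then systime_ns becomes 1150000000, yielding
* tv_sec = 1720000001 and tv_nsec = 150000000 after normalization.
*/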
#ifdef XENPV
/*
* setstatclockrate(rate)
*
* Set the statclock to run at rate, in units of ticks per second.
*
* Currently Xen does not have a separate statclock, so this is a
* no-op; instead the statclock runs in hardclock.
*/
void
setstatclockrate(int rate)
{
}
#endif /* XENPV */