Subject: Re: Xen time issues
To: None <tech-kern@netbsd.org>
From: Jed Davis <jdev@panix.com>
List: tech-kern
Date: 08/01/2006 23:03:37
--=-=-=

Well, it's pretty clear that I haven't found time to look at this any
over the past few weeks, so I'm going to do what I should have done a
few weeks ago and send the diff I have for timecounters under Xen, so
other people can look at it if they want.

I've tested it on a Pentium 4, and it seems to work as well as the TSC
source on NetBSD/i386 does (i.e., not so well when hyperthreading is
on, and Xen has issues with it disabled); but it seems it won't handle
the mysterious backsteps that are being seen on some AMD platforms.

And, as always, there's a confused comment or two.

-- 
(let ((C call-with-current-continuation)) (apply (lambda (x y) (x y)) (map
((lambda (r) ((C C) (lambda (s) (r (lambda l (apply (s s) l))))))  (lambda
(f) (lambda (l) (if (null? l) C (lambda (k) (display (car l)) ((f (cdr l))
(C k)))))))    '((#\J #\d #\D #\v #\s) (#\e #\space #\a #\i #\newline)))))

--=-=-=
Content-Type: text/x-patch
Content-Disposition: attachment; filename=xen-tc.diff
Content-Description: Timecounters for /xen -- but maybe not robust enough.

Index: arch/xen/conf/files.xen
===================================================================
RCS file: /cvsroot/src/sys/arch/xen/conf/files.xen,v
retrieving revision 1.45
diff -u -p -r1.45 files.xen
--- arch/xen/conf/files.xen	12 Jul 2006 15:02:15 -0000	1.45
+++ arch/xen/conf/files.xen	1 Aug 2006 01:57:44 -0000
@@ -44,7 +44,6 @@ file	arch/xen/i386/machdep.c
 file	arch/xen/i386/identcpu.c
 file	arch/i386/i386/math_emulate.c	math_emulate
 file	arch/i386/i386/mem.c
-file	kern/kern_microtime.c		i586_cpu | i686_cpu
 file	arch/i386/i386/mtrr_k6.c	mtrr
 file	netns/ns_cksum.c		ns
 file	arch/xen/i386/pmap.c
Index: arch/xen/i386/machdep.c
===================================================================
RCS file: /cvsroot/src/sys/arch/xen/i386/machdep.c,v
retrieving revision 1.28
diff -u -p -r1.28 machdep.c
--- arch/xen/i386/machdep.c	22 May 2006 13:44:53 -0000	1.28
+++ arch/xen/i386/machdep.c	1 Aug 2006 01:57:46 -0000
@@ -281,7 +281,6 @@ void (*microtime_func)(struct timeval *)
 void (*initclock_func)(void) = i8254_initclocks;
 #else
 void (*delay_func)(int) = xen_delay;
-void (*microtime_func)(struct timeval *) = xen_microtime;
 void (*initclock_func)(void) = xen_initclocks;
 #endif
 
Index: arch/xen/include/cpu.h
===================================================================
RCS file: /cvsroot/src/sys/arch/xen/include/cpu.h,v
retrieving revision 1.12
diff -u -p -r1.12 cpu.h
--- arch/xen/include/cpu.h	16 Feb 2006 20:17:15 -0000	1.12
+++ arch/xen/include/cpu.h	1 Aug 2006 01:57:48 -0000
@@ -58,7 +58,6 @@
 #include <sys/device.h>
 #include <sys/lock.h>			/* will also get LOCKDEBUG */
 #include <sys/cpu_data.h>
-#include <sys/cc_microtime.h>
 
 #include <lib/libkern/libkern.h>	/* offsetof */
 
@@ -84,7 +83,6 @@ struct cpu_info {
 	cpuid_t ci_cpuid;		/* our CPU ID */
 	u_int ci_apicid;		/* our APIC ID */
 	struct cpu_data ci_data;	/* MI per-cpu data */
-	struct cc_microtime_state ci_cc;/* cc_microtime state */
 
 	/*
 	 * Private members.
@@ -296,12 +294,9 @@ struct clockframe {
  * We need a machine-independent name for this.
  */
 extern void (*delay_func)(int);
-struct timeval;
-extern void (*microtime_func)(struct timeval *);
 
 #define	DELAY(x)		(*delay_func)(x)
 #define delay(x)		(*delay_func)(x)
-#define microtime(tv)		(*microtime_func)(tv)
 
 /*
  * pull in #defines for kinds of processors
@@ -377,6 +372,7 @@ void	savectx(struct pcb *);
 void	proc_trampoline(void);
 
 /* clock.c */
+/* XXX: Are the i8254 funcs ever used under Xen?  Should they be? */
 #ifdef ISA_CLOCK
 void	initrtclock(void);
 void	startrtclock(void);
@@ -386,7 +382,6 @@ void	i8254_initclocks(void);
 #else
 void	startrtclock(void);
 void	xen_delay(int);
-void	xen_microtime(struct timeval *);
 void	xen_initclocks(void);
 #endif
 
Index: arch/xen/include/types.h
===================================================================
RCS file: /cvsroot/src/sys/arch/xen/include/types.h,v
retrieving revision 1.2
diff -u -p -r1.2 types.h
--- arch/xen/include/types.h	7 Jun 2006 22:45:21 -0000	1.2
+++ arch/xen/include/types.h	1 Aug 2006 01:57:48 -0000
@@ -77,4 +77,6 @@ typedef	volatile int		__cpu_simple_lock_
 #define __HAVE_RAS
 #endif
 
+#define __HAVE_TIMECOUNTER
+
 #endif	/* _MACHTYPES_H_ */
Index: arch/xen/xen/clock.c
===================================================================
RCS file: /cvsroot/src/sys/arch/xen/xen/clock.c,v
retrieving revision 1.27
diff -u -p -r1.27 clock.c
--- arch/xen/xen/clock.c	11 Jul 2006 12:26:58 -0000	1.27
+++ arch/xen/xen/clock.c	1 Aug 2006 01:57:49 -0000
@@ -39,6 +39,8 @@ __KERNEL_RCSID(0, "$NetBSD: clock.c,v 1.
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/time.h>
+#include <sys/timetc.h>
+#include <sys/timevar.h>
 #include <sys/kernel.h>
 #include <sys/device.h>
 #include <sys/sysctl.h>
@@ -58,17 +60,36 @@ __KERNEL_RCSID(0, "$NetBSD: clock.c,v 1.
 
 static int xen_timer_handler(void *, struct intrframe *);
 
-/* These are peridically updated in shared_info, and then copied here. */
+/* A timecounter: Xen system_time extrapolated with a TSC. */
+u_int xen_get_timecount(struct timecounter*);
+static struct timecounter xen_timecounter = {
+	.tc_get_timecount = xen_get_timecount,
+	.tc_poll_pps = NULL,
+	.tc_counter_mask = ~0U,
+	.tc_frequency = 1000000000ULL,
+	.tc_name = "xen_system_time",
+	.tc_quality = 800 /* ??? */
+};
+
+/* These are periodically updated in shared_info, and then copied here. */
 volatile static uint64_t shadow_tsc_stamp;
 volatile static uint64_t shadow_system_time;
 #ifndef XEN3
 volatile static unsigned long shadow_time_version;
 #endif
-volatile static struct timeval shadow_tv;
+volatile static struct timespec shadow_ts;
 
 static int timeset;
 
-static uint64_t processed_system_time;
+/* The time when the last hardclock(9) call should have taken place. */
+static volatile uint64_t processed_system_time;
+
+/*
+ * The clock (as returned by xen_get_timecount) may need to be held
+ * back to maintain the illusion that hardclock(9) was called when it
+ * was supposed to be, not when Xen got around to scheduling us.
+ */
+static volatile uint64_t xen_clock_bias = 0;
 
 #ifdef DOM0OPS
 /* If we're dom0, send our time to Xen every minute or so. */
@@ -80,7 +101,7 @@ static struct callout xen_timepush_co = 
 
 /*
  * Reads a consistent set of time-base values from Xen, into a shadow data
- * area.  Must be called at splclock.
+ * area.  Must be called at splhigh (per timecounter requirements).
  */
 static void
 get_time_values_from_xen(void)
@@ -99,25 +120,29 @@ get_time_values_from_xen(void)
 	do {
 		tversion = HYPERVISOR_shared_info->wc_version;
 		x86_lfence();
-		shadow_tv.tv_sec = HYPERVISOR_shared_info->wc_sec;
-		shadow_tv.tv_usec = HYPERVISOR_shared_info->wc_nsec;
+		shadow_ts.tv_sec = HYPERVISOR_shared_info->wc_sec;
+		shadow_ts.tv_nsec = HYPERVISOR_shared_info->wc_nsec;
 		x86_lfence();
 	} while ((HYPERVISOR_shared_info->wc_version & 1) ||
 	    (tversion != HYPERVISOR_shared_info->wc_version));
-	shadow_tv.tv_usec = shadow_tv.tv_usec / 1000;
 #else /* XEN3 */
 	do {
 		shadow_time_version = HYPERVISOR_shared_info->time_version2;
 		x86_lfence();
-		shadow_tv.tv_sec = HYPERVISOR_shared_info->wc_sec;
-		shadow_tv.tv_usec = HYPERVISOR_shared_info->wc_usec;
+		shadow_ts.tv_sec = HYPERVISOR_shared_info->wc_sec;
+		shadow_ts.tv_nsec = HYPERVISOR_shared_info->wc_usec;
 		shadow_tsc_stamp = HYPERVISOR_shared_info->tsc_timestamp;
 		shadow_system_time = HYPERVISOR_shared_info->system_time;
 		x86_lfence();
 	} while (shadow_time_version != HYPERVISOR_shared_info->time_version1);
+	shadow_ts.tv_nsec *= 1000;
 #endif
 }
 
+/* 
+ * Use cycle counter to determine ns elapsed since last Xen time update.
+ * Must be called at splhigh (per timecounter requirements).
+ */
 static uint64_t
 get_tsc_offset_ns(void)
 {
@@ -125,6 +150,10 @@ get_tsc_offset_ns(void)
 	struct cpu_info *ci = curcpu();
 
 	tsc_delta = cpu_counter() - shadow_tsc_stamp;
+	/*
+	 * XXX with Xen 3 updating the time info only once per second,
+	 * this will overflow with a ~18 GHz TSC.
+	 */
 	offset = tsc_delta * 1000000000ULL / cpu_frequency(ci);
 #ifdef XEN_CLOCK_DEBUG
 	if (offset > 10000000000ULL)
@@ -137,11 +166,8 @@ get_tsc_offset_ns(void)
 void
 inittodr(time_t base)
 {
+	struct timespec sts;
 	int s;
-	struct cpu_info *ci = curcpu();
-#if defined(XEN3)
-	uint64_t t;
-#endif /* defined(XEN3) */
 
 	/*
 	 * if the file system time is more than a year older than the
@@ -152,28 +178,16 @@ inittodr(time_t base)
 		base = CONFIG_TIME;
 	}
 
-	s = splclock();
+	s = splhigh();
 	get_time_values_from_xen();
+	sts = shadow_ts;
 	splx(s);
 
-#if defined(XEN3)
-	t = (shadow_tv.tv_sec + rtc_offset * 60) * UINT64_C(1000000) +
-	    shadow_tv.tv_usec + processed_system_time / 1000;
-	time.tv_usec = t % UINT64_C(1000000);
-	time.tv_sec = t / UINT64_C(1000000);
-#else /* defined(XEN3) */
-	time.tv_usec = shadow_tv.tv_usec;
-	time.tv_sec = shadow_tv.tv_sec + rtc_offset * 60;
-#endif /* defined(XEN3) */
-#ifdef XEN_CLOCK_DEBUG
-	printf("readclock: %ld (%ld)\n", time.tv_sec, base);
-#endif
-	/* reset microset, so that the next call to microset() will init */
-	ci->ci_cc.cc_denom = 0;
-
-	if (base != 0 && base < time.tv_sec - 5*SECYR)
+	tc_setclock(&sts); /* XXX what about rtc_offset? */
+	
+	if (base != 0 && base < time_second - 5*SECYR)
 		printf("WARNING: file system time much less than clock time\n");
-	else if (base > time.tv_sec + 5*SECYR) {
+	else if (base > time_second + 5*SECYR) {
 		printf("WARNING: clock time much less than file system time\n");
 		printf("WARNING: using file system time\n");
 		goto fstime;
@@ -184,12 +198,13 @@ inittodr(time_t base)
 
 fstime:
 	timeset = 1;
-	time.tv_sec = base;
+	sts.tv_sec = base;
+	tc_setclock(&sts);
 	printf("WARNING: CHECK AND RESET THE DATE!\n");
 }
 
-static void
-resettodr_i(void)
+void
+resettodr(void)
 {
 #ifdef DOM0OPS
 	dom0_op_t op;
@@ -206,8 +221,8 @@ resettodr_i(void)
 	if (!timeset)
 		return;
 
-#ifdef DEBUG_CLOCK
-        {
+#ifdef XXX_DEBUG_CLOCK
+        {       /* XXX annoying debug printf not yet timecounterized */
 		char pm;
  
 		if (timercmp(&time, &shadow_tv, >)) {
@@ -223,38 +238,26 @@ resettodr_i(void)
 #endif
 #ifdef DOM0OPS
 	if (xen_start_info.flags & SIF_PRIVILEGED) {
-		s = splclock();
+		struct timespec now;
 
 		op.cmd = DOM0_SETTIME;
-		op.u.settime.secs	 = time.tv_sec - rtc_offset * 60;
+		nanotime(&now);
+		/* XXX is rtc_offset handled correctly everywhere? */
+		op.u.settime.secs	 = now.tv_sec - rtc_offset * 60;
 #ifdef XEN3
-		op.u.settime.nsecs	 = time.tv_usec * 1000;
+		op.u.settime.nsecs	 = now.tv_nsec;
 #else
-		op.u.settime.usecs	 = time.tv_usec;
+		op.u.settime.usecs	 = now.tv_nsec / 1000;
 #endif
-		op.u.settime.system_time = processed_system_time;
-		HYPERVISOR_dom0_op(&op);
-
+		s = splhigh();
+		op.u.settime.system_time = shadow_system_time
+		    + get_tsc_offset_ns();
 		splx(s);
+		HYPERVISOR_dom0_op(&op);
 	}
 #endif
 }
 
-/*
- * When the clock is administratively set, in addition to resetting
- * Xen's clock if possible, we should also allow xen_microtime to 
- * step backwards without complaint.
- */
-void
-resettodr()
-{
-	CPU_INFO_ITERATOR cii;
-	struct cpu_info *ci;
-
-	resettodr_i();
-	for (CPU_INFO_FOREACH(cii, ci))
-		timerclear(&ci->ci_cc.cc_time);
-}
 
 void
 startrtclock()
@@ -292,78 +295,19 @@ xen_delay(int n)
 		return;
 	} else {
 		uint64_t when;
-
+		int s;
 		/* for large delays, shadow_system_time is OK */
+		
+		s = splhigh();
 		get_time_values_from_xen();
 		when = shadow_system_time + n * 1000;
-		while (shadow_system_time < when)
+		while (shadow_system_time < when) {
+			splx(s);
+			s = splhigh();
 			get_time_values_from_xen();
-	}
-}
-
-/*
- * A MD microtime for xen.
- *
- * This abuses/reuses the cc_microtime fields already in cpuinfo:
- *  cc_ms_delta = usec added to time(9) on last call to hardclock;
- *                this is used to scale the actual elapsed time 
- *        cc_cc = reference value of cpu_counter()
- *     cc_denom = nsec between last hardclock(9) and time of cc_cc setting
- *                (provided by Xen)
- *
- * We are taking Xen's word for the CPU frequency rather than trying to
- * time it ourselves like cc_microtime does, since Xen could reschedule
- * our virtual CPU(s) onto any physical CPU and only tell us afterwards
- * with a clock interrupt -- and that could invalidate all stored
- * cpu_counter values.
- */
-void
-xen_microtime(struct timeval *tv)
-{
-	int s = splclock();
-	struct cpu_info *ci = curcpu();
-	int64_t cycles;
-	
-	*tv = time;
-	/* Extrapolate from hardclock()'s last step. */
-	cycles = cpu_counter() - ci->ci_cc.cc_cc;
-#ifdef XEN_CLOCK_DEBUG
-	if (cycles <= 0) {
-		printf("xen_microtime: CPU counter has decreased by %" PRId64
-		    " since last hardclock(9)\n", -cycles);
- 	}
-#endif
-	cycles += ci->ci_cc.cc_denom * cpu_frequency(ci) / 1000000000LL;
-	tv->tv_usec += cycles * ci->ci_cc.cc_ms_delta * hz / cpu_frequency(ci);
-#ifdef XEN_CLOCK_DEBUG
-	if (tv->tv_usec >= 2000000)
-		printf("xen_microtime: unexpectedly large tv_usec %ld\n", tv->tv_usec);
-#endif
-	if (tv->tv_usec >= 1000000) {
-		tv->tv_sec++;
-		tv->tv_usec -= 1000000;
-	}
-	/* Avoid small backsteps, e.g. at the beginning of a negative adjustment. */
-	if (timerisset(&ci->ci_cc.cc_time) &&	
-	    timercmp(tv, &ci->ci_cc.cc_time, <)) {
-		struct timeval backstep;
-
-		/* XXXjld: not sure if this check can be safely removed now */
-		timersub(&ci->ci_cc.cc_time, tv, &backstep);
-		if (backstep.tv_sec == 0) { /* if it was < 1sec */
-			*tv = ci->ci_cc.cc_time;
-#ifdef XEN_CLOCK_DEBUG
-			printf("xen_microtime[%d]: clamping at %ld.%06ld (-%ldus)\n",
-			    (int)ci->ci_cpuid, tv->tv_sec, tv->tv_usec, backstep.tv_usec);
-		} else {
-			printf("xen_microtime[%d]: allowing large backstep "
-			    "%lds to %ld.%06ld\n", (int)ci->ci_cpuid,
-			    backstep.tv_sec, tv->tv_sec, tv->tv_usec);
-#endif
 		}
+		splx(s);
 	}
-	ci->ci_cc.cc_time = *tv;
-	splx(s);
 }
 
 #ifdef DOM0OPS
@@ -373,7 +317,7 @@ xen_timepush(void *arg)
 {
 	struct callout *co = arg;
 
-	resettodr_i();
+	resettodr();
 	if (xen_timepush_ticks > 0)
 		callout_schedule(co, xen_timepush_ticks);
 }
@@ -406,6 +350,16 @@ sysctl_xen_timepush(SYSCTLFN_ARGS)
 }
 #endif
 
+/* ARGSUSED */
+u_int
+xen_get_timecount(struct timecounter *tc)
+{
+	uint64_t ns;
+
+	ns = shadow_system_time + get_tsc_offset_ns() - xen_clock_bias;
+	return (u_int)ns;
+}
+
 void
 xen_initclocks()
 {
@@ -416,6 +370,8 @@ xen_initclocks()
 
 	get_time_values_from_xen();
 	processed_system_time = shadow_system_time;
+	tc_init(&xen_timecounter);
+	/* The splhigh requirements start here. */
 
 	event_set_handler(evtch, (int (*)(void *))xen_timer_handler,
 	    NULL, IPL_CLOCK, "clock");
@@ -435,47 +391,33 @@ xen_initclocks()
 #endif
 }
 
+/* ARGSUSED */
 static int
 xen_timer_handler(void *arg, struct intrframe *regs)
 {
-	int64_t delta, newcc;
-	int ticks_done;
-	struct timeval oldtime, elapsed;
-	struct cpu_info *ci = curcpu();
-	
-	get_time_values_from_xen();
-	newcc = cpu_counter();
+	int64_t delta;
+	int s;
 
-	ticks_done = 0;
+	s = splhigh();
+	get_time_values_from_xen();
 	delta = (int64_t)(shadow_system_time + get_tsc_offset_ns()
 	    - processed_system_time);
-	while (delta >= (int64_t)NS_PER_TICK) {
-		/* Have hardclock do its thing. */
-		oldtime = time;
-		hardclock((struct clockframe *)regs);
-		
-		/* Use that tick length for the coming tick's microtimes. */
-		timersub(&time, &oldtime, &elapsed);
-#ifdef XEN_CLOCK_DEBUG
-		if (elapsed.tv_sec != 0) {
-			printf("xen_timer_handler: hardclock(9) stepped by %ld.%06lds\n",
-			    elapsed.tv_sec, elapsed.tv_usec);
-		}
-#endif
-		ci->ci_cc.cc_ms_delta = elapsed.tv_usec;
+	splx(s);
 
-		delta -= NS_PER_TICK;
+	/* Several ticks may have passed without our being run; catch up. */
+	while (delta >= (int64_t)NS_PER_TICK) {
+		s = splhigh();
 		processed_system_time += NS_PER_TICK;
-		ticks_done++;
+		xen_clock_bias = (delta -= NS_PER_TICK);
+		splx(s);
+		hardclock((struct clockframe *)regs);
+	}
+	
+	if (xen_clock_bias) {
+		s = splhigh();
+ 		xen_clock_bias = 0;
+		splx(s);
 	}
-
-	/*
-	 * Right now, delta holds the number of ns elapsed from when the last
-	 * hardclock(9) allegedly was to when this domain/vcpu was actually
-	 * rescheduled.
-	 */
-	ci->ci_cc.cc_denom = delta;
-	ci->ci_cc.cc_cc = newcc;
 
 	return 0;
 }
@@ -488,12 +430,17 @@ setstatclockrate(int arg)
 void
 idle_block(void)
 {
+	uint64_t next_tick;
+	int s;
 	/*
 	 * We set the timer to when we expect the next timer
 	 * interrupt.  We could set the timer to later if we could
 	 * easily find out when we will have more work (callouts) to
 	 * process from hardclock.
 	 */
-	if (HYPERVISOR_set_timer_op(processed_system_time + NS_PER_TICK) == 0)
+	s = splhigh();
+	next_tick = processed_system_time + NS_PER_TICK;
+	splx(s); /* XXX should this go after the set_timer_op? */
+	if (HYPERVISOR_set_timer_op(next_tick) == 0)
 		HYPERVISOR_block();
 }

--=-=-=--