Subject: xen_microtime() revisited
To: None <port-xen@NetBSD.org>
From: Jed Davis <jdev@panix.com>
List: port-xen
Date: 02/28/2006 23:12:21
--=-=-=

I've attached a patch that changes the microtime function used under
Xen from cc_microtime to a xen_microtime that's actually useful -- it
extrapolates from the last step made by hardclock(9) by combining the
Xen system_time "left over" after the last call to hardclock(9) with
the CPU counter difference from when the system_time was last updated.

This works out to be a lot like cc_microtime, except that it uses
Xen's idea of the CPU frequency instead of trying to measure it using
the clock interrupt -- which latter, since the "clock interrupt" here
might be delivered with a certain amount of jitter even in
non-degenerate cases, seems like a bad idea.

There is also a check to keep the clock from going backwards by
clamping the returned time; this applies for only backsteps of <1s,
because (as the accompanying XXX comment states) I haven't attempted
to specifically exempt administrative settimeofday use, and also out
of general paranoia (i.e., something I haven't thought of goes wrong,
and then the clock spends a few hours stopped, and I find out the hard
way the next morning).  This will be triggered in the case where the
time(9) clock speed is reduced, e.g. by the start or end of an
adjtime(2) adjustment; the extrapolation overshoots what hardclock(9)
actually does at the end of that tick, and so winds up a few
microseconds fast just that once.  (For the rest of the adjustment,
it's fine.)  This could probably be handled more elegantly.

The XEN_CLOCK_DEBUG define may of course be removed if one doesn't
want to get a printf whenever the above check is triggered.

This may not be even remotely correct as far as MP is concerned.

I don't know what the implications of timecounters on this will be; in
any case, the release branches won't have timecounters.

-- 
(let ((C call-with-current-continuation)) (apply (lambda (x y) (x y)) (map
((lambda (r) ((C C) (lambda (s) (r (lambda l (apply (s s) l))))))  (lambda
(f) (lambda (l) (if (null? l) C (lambda (k) (display (car l)) ((f (cdr l))
(C k)))))))    '((#\J #\d #\D #\v #\s) (#\e #\space #\a #\i #\newline)))))

--=-=-=
Content-Type: text/x-patch
Content-Disposition: attachment; filename=xen-microtime.diff
Content-Description: New microtime for NetBSD/Xen

Index: arch/xen/xen/clock.c
===================================================================
RCS file: /cvsroot/src/sys/arch/xen/xen/clock.c,v
retrieving revision 1.16
diff -u -p -r1.16 clock.c
--- arch/xen/xen/clock.c	15 Jan 2006 22:09:52 -0000	1.16
+++ arch/xen/xen/clock.c	28 Feb 2006 22:57:24 -0000
@@ -51,6 +51,8 @@ __KERNEL_RCSID(0, "$NetBSD: clock.c,v 1.
 
 #include "config_time.h"		/* for CONFIG_TIME */
 
+#define XEN_CLOCK_DEBUG
+
 static int xen_timer_handler(void *, struct intrframe *);
 
 /* These are peridically updated in shared_info, and then copied here. */
@@ -110,11 +112,15 @@ get_time_values_from_xen(void)
 static uint64_t
 get_tsc_offset_ns(void)
 {
-	uint32_t tsc_delta;
+	uint64_t tsc_delta, offset;
 	struct cpu_info *ci = curcpu();
 
-	tsc_delta = cpu_counter32() - shadow_tsc_stamp;
-	return tsc_delta * 1000000000 / cpu_frequency(ci);
+	tsc_delta = cpu_counter() - shadow_tsc_stamp;
+	offset = tsc_delta * 1000000000ULL / cpu_frequency(ci);
+	if (offset > 10 * NS_PER_TICK)
+		printf("get_tsc_offset_ns: tsc_delta=%llu offset=%llu\n",
+		    tsc_delta, offset); 
+	return offset;
 }
 
 void
@@ -249,12 +255,59 @@ xen_delay(int n)
 	}
 }
 
+/*
+ * A MD microtime for xen.
+ *
+ * This abuses/reuses the cc_microtime fields already in cpuinfo:
+ *  cc_ms_delta = usec added to time(9) on last call to hardclock;
+ *                this is used to scale the actual elapsed time 
+ *        cc_cc = reference value of cpu_counter()
+ *     cc_denom = nsec between last hardclock(9) and time of cc_cc setting
+ *                (provided by Xen)
+ *
+ * We are taking Xen's word for the CPU frequency rather than trying to
+ * time it ourselves like cc_microtime does, since Xen could reschedule
+ * our virtual CPU(s) onto any physical CPU and only tell us afterwards
+ * with a clock interrupt -- and that could invalidate all stored
+ * cpu_counter values.
+ */
 void
 xen_microtime(struct timeval *tv)
 {
 	int s = splclock();
-	get_time_values_from_xen();
-	*tv = shadow_tv;
+	struct cpu_info *ci = curcpu();
+	int64_t cycles;
+	
+	*tv = time;
+	/* Extrapolate from hardclock()'s last step. */
+	cycles = cpu_counter() - ci->ci_cc.cc_cc;
+	KDASSERT(cycles > 0);
+	cycles += ci->ci_cc.cc_denom * cpu_frequency(ci) / 1000000000LL;
+	tv->tv_usec += cycles * ci->ci_cc.cc_ms_delta * hz / cpu_frequency(ci);
+	KDASSERT(tv->tv_usec < 2000000);
+	if (tv->tv_usec >= 1000000) {
+		tv->tv_sec++;
+		tv->tv_usec -= 1000000;
+	}
+	/* Avoid backsteps, e.g. at the beginning of a negative adjustment. */
+	if (timerisset(&ci->ci_cc.cc_time) &&	
+	    timercmp(tv, &ci->ci_cc.cc_time, <)) {
+		struct timeval backstep; /* XXX hook resettodr() instead of doing this. */
+
+		timersub(&ci->ci_cc.cc_time, tv, &backstep);
+		if (backstep.tv_sec == 0) { /* if it was < 1sec */
+			*tv = ci->ci_cc.cc_time;
+#ifdef XEN_CLOCK_DEBUG
+			printf("xen_microtime[%d]: clamping at %ld.%06ld (-%ldus)\n",
+			    (int)ci->ci_cpuid, tv->tv_sec, tv->tv_usec, backstep.tv_usec);
+		} else {
+			printf("xen_microtime[%d]: allowing large backstep "
+			    "%lds to %ld.%06ld\n", (int)ci->ci_cpuid,
+			    backstep.tv_sec, tv->tv_sec, tv->tv_usec);
+#endif
+		}
+	}
+	ci->ci_cc.cc_time = *tv;
 	splx(s);
 }
 
@@ -276,34 +329,40 @@ xen_initclocks()
 static int
 xen_timer_handler(void *arg, struct intrframe *regs)
 {
-	int64_t delta;
-	static int microset_iter = 0; /* call cc_microset once/sec */
+	int64_t delta, newcc;
+	int ticks_done;
+	struct timeval oldtime, elapsed;
 	struct cpu_info *ci = curcpu();
 	
 	get_time_values_from_xen();
+	newcc = cpu_counter();
 
-	delta = (int64_t)(shadow_system_time + get_tsc_offset_ns() -
-			  processed_system_time);
+	ticks_done = 0;
+	delta = (int64_t)(shadow_system_time + get_tsc_offset_ns()
+	    - processed_system_time);
 	while (delta >= NS_PER_TICK) {
-		if (ci->ci_feature_flags & CPUID_TSC) {
-			if (
-#if defined(MULTIPROCESSOR)
-		 	   CPU_IS_PRIMARY(ci) &&
-#endif
-			    (microset_iter--) == 0) {
-				microset_iter = hz - 1;
-#if defined(MULTIPROCESSOR)
-				x86_broadcast_ipi(X86_IPI_MICROSET);
-#endif
-				cc_microset_time = time;
-				cc_microset(ci);
-			}
-		}
+		/* Have hardclock do its thing. */
+		oldtime = time;
 		hardclock((struct clockframe *)regs);
+		
+		/* Use that tick length for the coming tick's microtimes. */
+		timersub(&time, &oldtime, &elapsed);
+		KDASSERT(elapsed.tv_sec == 0);
+		ci->ci_cc.cc_ms_delta = elapsed.tv_usec;
+
 		delta -= NS_PER_TICK;
 		processed_system_time += NS_PER_TICK;
+		ticks_done++;
 	}
 
+	/*
+	 * Right now, delta holds the number of ns elapsed from when the last
+	 * hardclock(9) allegedly was to when this domain/vcpu was actually
+	 * rescheduled.
+	 */
+	ci->ci_cc.cc_denom = delta;
+	ci->ci_cc.cc_cc = newcc;
+
 	return 0;
 }
 
Index: arch/xen/i386/identcpu.c
===================================================================
RCS file: /cvsroot/src/sys/arch/xen/i386/identcpu.c,v
retrieving revision 1.9
diff -u -p -r1.9 identcpu.c
--- arch/xen/i386/identcpu.c	15 Jan 2006 22:09:51 -0000	1.9
+++ arch/xen/i386/identcpu.c	28 Feb 2006 22:57:25 -0000
@@ -1174,9 +1174,11 @@ identifycpu(struct cpu_info *ci)
 		 * being probed.. */
 		ci->ci_tsc_freq = HYPERVISOR_shared_info->cpu_freq;
 #endif /* XEN3 */
+#if 0
 #ifndef NO_TSC_TIME
 		microtime_func = cc_microtime;
 #endif
+#endif
 	}
 
 	snprintf(cpu_model, sizeof(cpu_model), "%s%s%s%s%s%s%s (%s-class)",

--=-=-=--