Port-xen archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

Re: Xen time issues



Well, it's pretty clear that I haven't found time to look at this any
over the past few weeks, so I'm going to do what I should have done a
few weeks ago and send the diff I have for timecounters under Xen, so
other people can look at it if they want.

I've tested it on a Pentium 4, and it seems to work as well as the TSC
source on NetBSD/i386 does (i.e., not so well when hyperthreading is
on, and Xen has issues with it disabled); but it seems it won't handle
the mysterious backsteps that are being seen on some AMD platforms.

And, as always, there's a confused comment or two.

-- 
(let ((C call-with-current-continuation)) (apply (lambda (x y) (x y)) (map
((lambda (r) ((C C) (lambda (s) (r (lambda l (apply (s s) l))))))  (lambda
(f) (lambda (l) (if (null? l) C (lambda (k) (display (car l)) ((f (cdr l))
(C k)))))))    '((#\J #\d #\D #\v #\s) (#\e #\space #\a #\i #\newline)))))
Index: arch/xen/conf/files.xen
===================================================================
RCS file: /cvsroot/src/sys/arch/xen/conf/files.xen,v
retrieving revision 1.45
diff -u -p -r1.45 files.xen
--- arch/xen/conf/files.xen     12 Jul 2006 15:02:15 -0000      1.45
+++ arch/xen/conf/files.xen     1 Aug 2006 01:57:44 -0000
@@ -44,7 +44,6 @@ file  arch/xen/i386/machdep.c
 file   arch/xen/i386/identcpu.c
 file   arch/i386/i386/math_emulate.c   math_emulate
 file   arch/i386/i386/mem.c
-file   kern/kern_microtime.c           i586_cpu | i686_cpu
 file   arch/i386/i386/mtrr_k6.c        mtrr
 file   netns/ns_cksum.c                ns
 file   arch/xen/i386/pmap.c
Index: arch/xen/i386/machdep.c
===================================================================
RCS file: /cvsroot/src/sys/arch/xen/i386/machdep.c,v
retrieving revision 1.28
diff -u -p -r1.28 machdep.c
--- arch/xen/i386/machdep.c     22 May 2006 13:44:53 -0000      1.28
+++ arch/xen/i386/machdep.c     1 Aug 2006 01:57:46 -0000
@@ -281,7 +281,6 @@ void (*microtime_func)(struct timeval *)
 void (*initclock_func)(void) = i8254_initclocks;
 #else
 void (*delay_func)(int) = xen_delay;
-void (*microtime_func)(struct timeval *) = xen_microtime;
 void (*initclock_func)(void) = xen_initclocks;
 #endif
 
Index: arch/xen/include/cpu.h
===================================================================
RCS file: /cvsroot/src/sys/arch/xen/include/cpu.h,v
retrieving revision 1.12
diff -u -p -r1.12 cpu.h
--- arch/xen/include/cpu.h      16 Feb 2006 20:17:15 -0000      1.12
+++ arch/xen/include/cpu.h      1 Aug 2006 01:57:48 -0000
@@ -58,7 +58,6 @@
 #include <sys/device.h>
 #include <sys/lock.h>                  /* will also get LOCKDEBUG */
 #include <sys/cpu_data.h>
-#include <sys/cc_microtime.h>
 
 #include <lib/libkern/libkern.h>       /* offsetof */
 
@@ -84,7 +83,6 @@ struct cpu_info {
        cpuid_t ci_cpuid;               /* our CPU ID */
        u_int ci_apicid;                /* our APIC ID */
        struct cpu_data ci_data;        /* MI per-cpu data */
-       struct cc_microtime_state ci_cc;/* cc_microtime state */
 
        /*
         * Private members.
@@ -296,12 +294,9 @@ struct clockframe {
  * We need a machine-independent name for this.
  */
 extern void (*delay_func)(int);
-struct timeval;
-extern void (*microtime_func)(struct timeval *);
 
 #define        DELAY(x)                (*delay_func)(x)
 #define delay(x)               (*delay_func)(x)
-#define microtime(tv)          (*microtime_func)(tv)
 
 /*
  * pull in #defines for kinds of processors
@@ -377,6 +372,7 @@ void        savectx(struct pcb *);
 void   proc_trampoline(void);
 
 /* clock.c */
+/* XXX: Are the i8254 funcs ever used under Xen?  Should they be? */
 #ifdef ISA_CLOCK
 void   initrtclock(void);
 void   startrtclock(void);
@@ -386,7 +382,6 @@ void        i8254_initclocks(void);
 #else
 void   startrtclock(void);
 void   xen_delay(int);
-void   xen_microtime(struct timeval *);
 void   xen_initclocks(void);
 #endif
 
Index: arch/xen/include/types.h
===================================================================
RCS file: /cvsroot/src/sys/arch/xen/include/types.h,v
retrieving revision 1.2
diff -u -p -r1.2 types.h
--- arch/xen/include/types.h    7 Jun 2006 22:45:21 -0000       1.2
+++ arch/xen/include/types.h    1 Aug 2006 01:57:48 -0000
@@ -77,4 +77,6 @@ typedef       volatile int            __cpu_simple_lock_
 #define __HAVE_RAS
 #endif
 
+#define __HAVE_TIMECOUNTER
+
 #endif /* _MACHTYPES_H_ */
Index: arch/xen/xen/clock.c
===================================================================
RCS file: /cvsroot/src/sys/arch/xen/xen/clock.c,v
retrieving revision 1.27
diff -u -p -r1.27 clock.c
--- arch/xen/xen/clock.c        11 Jul 2006 12:26:58 -0000      1.27
+++ arch/xen/xen/clock.c        1 Aug 2006 01:57:49 -0000
@@ -39,6 +39,8 @@ __KERNEL_RCSID(0, "$NetBSD: clock.c,v 1.
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/time.h>
+#include <sys/timetc.h>
+#include <sys/timevar.h>
 #include <sys/kernel.h>
 #include <sys/device.h>
 #include <sys/sysctl.h>
@@ -58,17 +60,36 @@ __KERNEL_RCSID(0, "$NetBSD: clock.c,v 1.
 
 static int xen_timer_handler(void *, struct intrframe *);
 
-/* These are peridically updated in shared_info, and then copied here. */
+/* A timecounter: Xen system_time extrapolated with a TSC. */
+u_int xen_get_timecount(struct timecounter*);
+static struct timecounter xen_timecounter = {
+       .tc_get_timecount = xen_get_timecount,
+       .tc_poll_pps = NULL,
+       .tc_counter_mask = ~0U,
+       .tc_frequency = 1000000000ULL,
+       .tc_name = "xen_system_time",
+       .tc_quality = 800 /* ??? */
+};
+
+/* These are periodically updated in shared_info, and then copied here. */
 volatile static uint64_t shadow_tsc_stamp;
 volatile static uint64_t shadow_system_time;
 #ifndef XEN3
 volatile static unsigned long shadow_time_version;
 #endif
-volatile static struct timeval shadow_tv;
+volatile static struct timespec shadow_ts;
 
 static int timeset;
 
-static uint64_t processed_system_time;
+/* The time when the last hardclock(9) call should have taken place. */
+static volatile uint64_t processed_system_time;
+
+/*
+ * The clock (as returned by xen_get_timecount) may need to be held
+ * back to maintain the illusion that hardclock(9) was called when it
+ * was supposed to be, not when Xen got around to scheduling us.
+ */
+static volatile uint64_t xen_clock_bias = 0;
 
 #ifdef DOM0OPS
 /* If we're dom0, send our time to Xen every minute or so. */
@@ -80,7 +101,7 @@ static struct callout xen_timepush_co = 
 
 /*
  * Reads a consistent set of time-base values from Xen, into a shadow data
- * area.  Must be called at splclock.
+ * area.  Must be called at splhigh (per timecounter requirements).
  */
 static void
 get_time_values_from_xen(void)
@@ -99,25 +120,29 @@ get_time_values_from_xen(void)
        do {
                tversion = HYPERVISOR_shared_info->wc_version;
                x86_lfence();
-               shadow_tv.tv_sec = HYPERVISOR_shared_info->wc_sec;
-               shadow_tv.tv_usec = HYPERVISOR_shared_info->wc_nsec;
+               shadow_ts.tv_sec = HYPERVISOR_shared_info->wc_sec;
+               shadow_ts.tv_nsec = HYPERVISOR_shared_info->wc_nsec;
                x86_lfence();
        } while ((HYPERVISOR_shared_info->wc_version & 1) ||
            (tversion != HYPERVISOR_shared_info->wc_version));
-       shadow_tv.tv_usec = shadow_tv.tv_usec / 1000;
 #else /* XEN3 */
        do {
                shadow_time_version = HYPERVISOR_shared_info->time_version2;
                x86_lfence();
-               shadow_tv.tv_sec = HYPERVISOR_shared_info->wc_sec;
-               shadow_tv.tv_usec = HYPERVISOR_shared_info->wc_usec;
+               shadow_ts.tv_sec = HYPERVISOR_shared_info->wc_sec;
+               shadow_ts.tv_nsec = HYPERVISOR_shared_info->wc_usec;
                shadow_tsc_stamp = HYPERVISOR_shared_info->tsc_timestamp;
                shadow_system_time = HYPERVISOR_shared_info->system_time;
                x86_lfence();
        } while (shadow_time_version != HYPERVISOR_shared_info->time_version1);
+       shadow_ts.tv_nsec *= 1000;
 #endif
 }
 
+/* 
+ * Use cycle counter to determine ns elapsed since last Xen time update.
+ * Must be called at splhigh (per timecounter requirements).
+ */
 static uint64_t
 get_tsc_offset_ns(void)
 {
@@ -125,6 +150,10 @@ get_tsc_offset_ns(void)
        struct cpu_info *ci = curcpu();
 
        tsc_delta = cpu_counter() - shadow_tsc_stamp;
+       /*
+        * XXX with Xen 3 updating the time info only once per second,
+        * this will overflow with a ~18 GHz TSC.
+        */
        offset = tsc_delta * 1000000000ULL / cpu_frequency(ci);
 #ifdef XEN_CLOCK_DEBUG
        if (offset > 10000000000ULL)
@@ -137,11 +166,8 @@ get_tsc_offset_ns(void)
 void
 inittodr(time_t base)
 {
+       struct timespec sts;
        int s;
-       struct cpu_info *ci = curcpu();
-#if defined(XEN3)
-       uint64_t t;
-#endif /* defined(XEN3) */
 
        /*
         * if the file system time is more than a year older than the
@@ -152,28 +178,16 @@ inittodr(time_t base)
                base = CONFIG_TIME;
        }
 
-       s = splclock();
+       s = splhigh();
        get_time_values_from_xen();
+       sts = shadow_ts;
        splx(s);
 
-#if defined(XEN3)
-       t = (shadow_tv.tv_sec + rtc_offset * 60) * UINT64_C(1000000) +
-           shadow_tv.tv_usec + processed_system_time / 1000;
-       time.tv_usec = t % UINT64_C(1000000);
-       time.tv_sec = t / UINT64_C(1000000);
-#else /* defined(XEN3) */
-       time.tv_usec = shadow_tv.tv_usec;
-       time.tv_sec = shadow_tv.tv_sec + rtc_offset * 60;
-#endif /* defined(XEN3) */
-#ifdef XEN_CLOCK_DEBUG
-       printf("readclock: %ld (%ld)\n", time.tv_sec, base);
-#endif
-       /* reset microset, so that the next call to microset() will init */
-       ci->ci_cc.cc_denom = 0;
-
-       if (base != 0 && base < time.tv_sec - 5*SECYR)
+       tc_setclock(&sts); /* XXX what about rtc_offset? */
+       
+       if (base != 0 && base < time_second - 5*SECYR)
                printf("WARNING: file system time much less than clock time\n");
-       else if (base > time.tv_sec + 5*SECYR) {
+       else if (base > time_second + 5*SECYR) {
                printf("WARNING: clock time much less than file system time\n");
                printf("WARNING: using file system time\n");
                goto fstime;
@@ -184,12 +198,13 @@ inittodr(time_t base)
 
 fstime:
        timeset = 1;
-       time.tv_sec = base;
+       sts.tv_sec = base;
+       tc_setclock(&sts);
        printf("WARNING: CHECK AND RESET THE DATE!\n");
 }
 
-static void
-resettodr_i(void)
+void
+resettodr(void)
 {
 #ifdef DOM0OPS
        dom0_op_t op;
@@ -206,8 +221,8 @@ resettodr_i(void)
        if (!timeset)
                return;
 
-#ifdef DEBUG_CLOCK
-        {
+#ifdef XXX_DEBUG_CLOCK
+        {       /* XXX annoying debug printf not yet timecounterized */
                char pm;
  
                if (timercmp(&time, &shadow_tv, >)) {
@@ -223,38 +238,26 @@ resettodr_i(void)
 #endif
 #ifdef DOM0OPS
        if (xen_start_info.flags & SIF_PRIVILEGED) {
-               s = splclock();
+               struct timespec now;
 
                op.cmd = DOM0_SETTIME;
-               op.u.settime.secs        = time.tv_sec - rtc_offset * 60;
+               nanotime(&now);
+               /* XXX is rtc_offset handled correctly everywhere? */
+               op.u.settime.secs        = now.tv_sec - rtc_offset * 60;
 #ifdef XEN3
-               op.u.settime.nsecs       = time.tv_usec * 1000;
+               op.u.settime.nsecs       = now.tv_nsec;
 #else
-               op.u.settime.usecs       = time.tv_usec;
+               op.u.settime.usecs       = now.tv_nsec / 1000;
 #endif
-               op.u.settime.system_time = processed_system_time;
-               HYPERVISOR_dom0_op(&op);
-
+               s = splhigh();
+               op.u.settime.system_time = shadow_system_time
+                   + get_tsc_offset_ns();
                splx(s);
+               HYPERVISOR_dom0_op(&op);
        }
 #endif
 }
 
-/*
- * When the clock is administratively set, in addition to resetting
- * Xen's clock if possible, we should also allow xen_microtime to 
- * step backwards without complaint.
- */
-void
-resettodr()
-{
-       CPU_INFO_ITERATOR cii;
-       struct cpu_info *ci;
-
-       resettodr_i();
-       for (CPU_INFO_FOREACH(cii, ci))
-               timerclear(&ci->ci_cc.cc_time);
-}
 
 void
 startrtclock()
@@ -292,78 +295,19 @@ xen_delay(int n)
                return;
        } else {
                uint64_t when;
-
+               int s;
                /* for large delays, shadow_system_time is OK */
+               
+               s = splhigh();
                get_time_values_from_xen();
                when = shadow_system_time + n * 1000;
-               while (shadow_system_time < when)
+               while (shadow_system_time < when) {
+                       splx(s);
+                       s = splhigh();
                        get_time_values_from_xen();
-       }
-}
-
-/*
- * A MD microtime for xen.
- *
- * This abuses/reuses the cc_microtime fields already in cpuinfo:
- *  cc_ms_delta = usec added to time(9) on last call to hardclock;
- *                this is used to scale the actual elapsed time 
- *        cc_cc = reference value of cpu_counter()
- *     cc_denom = nsec between last hardclock(9) and time of cc_cc setting
- *                (provided by Xen)
- *
- * We are taking Xen's word for the CPU frequency rather than trying to
- * time it ourselves like cc_microtime does, since Xen could reschedule
- * our virtual CPU(s) onto any physical CPU and only tell us afterwards
- * with a clock interrupt -- and that could invalidate all stored
- * cpu_counter values.
- */
-void
-xen_microtime(struct timeval *tv)
-{
-       int s = splclock();
-       struct cpu_info *ci = curcpu();
-       int64_t cycles;
-       
-       *tv = time;
-       /* Extrapolate from hardclock()'s last step. */
-       cycles = cpu_counter() - ci->ci_cc.cc_cc;
-#ifdef XEN_CLOCK_DEBUG
-       if (cycles <= 0) {
-               printf("xen_microtime: CPU counter has decreased by %" PRId64
-                   " since last hardclock(9)\n", -cycles);
-       }
-#endif
-       cycles += ci->ci_cc.cc_denom * cpu_frequency(ci) / 1000000000LL;
-       tv->tv_usec += cycles * ci->ci_cc.cc_ms_delta * hz / cpu_frequency(ci);
-#ifdef XEN_CLOCK_DEBUG
-       if (tv->tv_usec >= 2000000)
-               printf("xen_microtime: unexpectedly large tv_usec %ld\n", 
tv->tv_usec);
-#endif
-       if (tv->tv_usec >= 1000000) {
-               tv->tv_sec++;
-               tv->tv_usec -= 1000000;
-       }
-       /* Avoid small backsteps, e.g. at the beginning of a negative 
adjustment. */
-       if (timerisset(&ci->ci_cc.cc_time) &&   
-           timercmp(tv, &ci->ci_cc.cc_time, <)) {
-               struct timeval backstep;
-
-               /* XXXjld: not sure if this check can be safely removed now */
-               timersub(&ci->ci_cc.cc_time, tv, &backstep);
-               if (backstep.tv_sec == 0) { /* if it was < 1sec */
-                       *tv = ci->ci_cc.cc_time;
-#ifdef XEN_CLOCK_DEBUG
-                       printf("xen_microtime[%d]: clamping at %ld.%06ld 
(-%ldus)\n",
-                           (int)ci->ci_cpuid, tv->tv_sec, tv->tv_usec, 
backstep.tv_usec);
-               } else {
-                       printf("xen_microtime[%d]: allowing large backstep "
-                           "%lds to %ld.%06ld\n", (int)ci->ci_cpuid,
-                           backstep.tv_sec, tv->tv_sec, tv->tv_usec);
-#endif
                }
+               splx(s);
        }
-       ci->ci_cc.cc_time = *tv;
-       splx(s);
 }
 
 #ifdef DOM0OPS
@@ -373,7 +317,7 @@ xen_timepush(void *arg)
 {
        struct callout *co = arg;
 
-       resettodr_i();
+       resettodr();
        if (xen_timepush_ticks > 0)
                callout_schedule(co, xen_timepush_ticks);
 }
@@ -406,6 +350,16 @@ sysctl_xen_timepush(SYSCTLFN_ARGS)
 }
 #endif
 
+/* ARGSUSED */
+u_int
+xen_get_timecount(struct timecounter *tc)
+{
+       uint64_t ns;
+
+       ns = shadow_system_time + get_tsc_offset_ns() - xen_clock_bias;
+       return (u_int)ns;
+}
+
 void
 xen_initclocks()
 {
@@ -416,6 +370,8 @@ xen_initclocks()
 
        get_time_values_from_xen();
        processed_system_time = shadow_system_time;
+       tc_init(&xen_timecounter);
+       /* The splhigh requirements start here. */
 
        event_set_handler(evtch, (int (*)(void *))xen_timer_handler,
            NULL, IPL_CLOCK, "clock");
@@ -435,47 +391,33 @@ xen_initclocks()
 #endif
 }
 
+/* ARGSUSED */
 static int
 xen_timer_handler(void *arg, struct intrframe *regs)
 {
-       int64_t delta, newcc;
-       int ticks_done;
-       struct timeval oldtime, elapsed;
-       struct cpu_info *ci = curcpu();
-       
-       get_time_values_from_xen();
-       newcc = cpu_counter();
+       int64_t delta;
+       int s;
 
-       ticks_done = 0;
+       s = splhigh();
+       get_time_values_from_xen();
        delta = (int64_t)(shadow_system_time + get_tsc_offset_ns()
            - processed_system_time);
-       while (delta >= (int64_t)NS_PER_TICK) {
-               /* Have hardclock do its thing. */
-               oldtime = time;
-               hardclock((struct clockframe *)regs);
-               
-               /* Use that tick length for the coming tick's microtimes. */
-               timersub(&time, &oldtime, &elapsed);
-#ifdef XEN_CLOCK_DEBUG
-               if (elapsed.tv_sec != 0) {
-                       printf("xen_timer_handler: hardclock(9) stepped by 
%ld.%06lds\n",
-                           elapsed.tv_sec, elapsed.tv_usec);
-               }
-#endif
-               ci->ci_cc.cc_ms_delta = elapsed.tv_usec;
+       splx(s);
 
-               delta -= NS_PER_TICK;
+       /* Several ticks may have passed without our being run; catch up. */
+       while (delta >= (int64_t)NS_PER_TICK) {
+               s = splhigh();
                processed_system_time += NS_PER_TICK;
-               ticks_done++;
+               xen_clock_bias = (delta -= NS_PER_TICK);
+               splx(s);
+               hardclock((struct clockframe *)regs);
+       }
+       
+       if (xen_clock_bias) {
+               s = splhigh();
+               xen_clock_bias = 0;
+               splx(s);
        }
-
-       /*
-        * Right now, delta holds the number of ns elapsed from when the last
-        * hardclock(9) allegedly was to when this domain/vcpu was actually
-        * rescheduled.
-        */
-       ci->ci_cc.cc_denom = delta;
-       ci->ci_cc.cc_cc = newcc;
 
        return 0;
 }
@@ -488,12 +430,17 @@ setstatclockrate(int arg)
 void
 idle_block(void)
 {
+       uint64_t next_tick;
+       int s;
        /*
         * We set the timer to when we expect the next timer
         * interrupt.  We could set the timer to later if we could
         * easily find out when we will have more work (callouts) to
         * process from hardclock.
         */
-       if (HYPERVISOR_set_timer_op(processed_system_time + NS_PER_TICK) == 0)
+       s = splhigh();
+       next_tick = processed_system_time + NS_PER_TICK;
+       splx(s); /* XXX should this go after the set_timer_op? */
+       if (HYPERVISOR_set_timer_op(next_tick) == 0)
                HYPERVISOR_block();
 }


Home | Main Index | Thread Index | Old Index