Subject: Re: Xen time issues
To: None <tech-kern@netbsd.org>
From: Jed Davis <jdev@panix.com>
List: tech-kern
Date: 08/01/2006 23:03:37
--=-=-=
Well, it's pretty clear that I haven't found time to look at this any
over the past few weeks, so I'm going to do what I should have done a
few weeks ago and send the diff I have for timecounters under Xen, so
other people can look at it if they want.
I've tested it on a Pentium 4, and it seems to work as well as the TSC
source on NetBSD/i386 does (i.e., not so well when hyperthreading is
on, and Xen has issues with it disabled); but it seems it won't handle
the mysterious backsteps that are being seen on some AMD platforms.
And, as always, there's a confused comment or two.
--
(let ((C call-with-current-continuation)) (apply (lambda (x y) (x y)) (map
((lambda (r) ((C C) (lambda (s) (r (lambda l (apply (s s) l)))))) (lambda
(f) (lambda (l) (if (null? l) C (lambda (k) (display (car l)) ((f (cdr l))
(C k))))))) '((#\J #\d #\D #\v #\s) (#\e #\space #\a #\i #\newline)))))
--=-=-=
Content-Type: text/x-patch
Content-Disposition: attachment; filename=xen-tc.diff
Content-Description: Timecounters for /xen -- but maybe not robust enough.
Index: arch/xen/conf/files.xen
===================================================================
RCS file: /cvsroot/src/sys/arch/xen/conf/files.xen,v
retrieving revision 1.45
diff -u -p -r1.45 files.xen
--- arch/xen/conf/files.xen 12 Jul 2006 15:02:15 -0000 1.45
+++ arch/xen/conf/files.xen 1 Aug 2006 01:57:44 -0000
@@ -44,7 +44,6 @@ file arch/xen/i386/machdep.c
file arch/xen/i386/identcpu.c
file arch/i386/i386/math_emulate.c math_emulate
file arch/i386/i386/mem.c
-file kern/kern_microtime.c i586_cpu | i686_cpu
file arch/i386/i386/mtrr_k6.c mtrr
file netns/ns_cksum.c ns
file arch/xen/i386/pmap.c
Index: arch/xen/i386/machdep.c
===================================================================
RCS file: /cvsroot/src/sys/arch/xen/i386/machdep.c,v
retrieving revision 1.28
diff -u -p -r1.28 machdep.c
--- arch/xen/i386/machdep.c 22 May 2006 13:44:53 -0000 1.28
+++ arch/xen/i386/machdep.c 1 Aug 2006 01:57:46 -0000
@@ -281,7 +281,6 @@ void (*microtime_func)(struct timeval *)
void (*initclock_func)(void) = i8254_initclocks;
#else
void (*delay_func)(int) = xen_delay;
-void (*microtime_func)(struct timeval *) = xen_microtime;
void (*initclock_func)(void) = xen_initclocks;
#endif
Index: arch/xen/include/cpu.h
===================================================================
RCS file: /cvsroot/src/sys/arch/xen/include/cpu.h,v
retrieving revision 1.12
diff -u -p -r1.12 cpu.h
--- arch/xen/include/cpu.h 16 Feb 2006 20:17:15 -0000 1.12
+++ arch/xen/include/cpu.h 1 Aug 2006 01:57:48 -0000
@@ -58,7 +58,6 @@
#include <sys/device.h>
#include <sys/lock.h> /* will also get LOCKDEBUG */
#include <sys/cpu_data.h>
-#include <sys/cc_microtime.h>
#include <lib/libkern/libkern.h> /* offsetof */
@@ -84,7 +83,6 @@ struct cpu_info {
cpuid_t ci_cpuid; /* our CPU ID */
u_int ci_apicid; /* our APIC ID */
struct cpu_data ci_data; /* MI per-cpu data */
- struct cc_microtime_state ci_cc;/* cc_microtime state */
/*
* Private members.
@@ -296,12 +294,9 @@ struct clockframe {
* We need a machine-independent name for this.
*/
extern void (*delay_func)(int);
-struct timeval;
-extern void (*microtime_func)(struct timeval *);
#define DELAY(x) (*delay_func)(x)
#define delay(x) (*delay_func)(x)
-#define microtime(tv) (*microtime_func)(tv)
/*
* pull in #defines for kinds of processors
@@ -377,6 +372,7 @@ void savectx(struct pcb *);
void proc_trampoline(void);
/* clock.c */
+/* XXX: Are the i8254 funcs ever used under Xen? Should they be? */
#ifdef ISA_CLOCK
void initrtclock(void);
void startrtclock(void);
@@ -386,7 +382,6 @@ void i8254_initclocks(void);
#else
void startrtclock(void);
void xen_delay(int);
-void xen_microtime(struct timeval *);
void xen_initclocks(void);
#endif
Index: arch/xen/include/types.h
===================================================================
RCS file: /cvsroot/src/sys/arch/xen/include/types.h,v
retrieving revision 1.2
diff -u -p -r1.2 types.h
--- arch/xen/include/types.h 7 Jun 2006 22:45:21 -0000 1.2
+++ arch/xen/include/types.h 1 Aug 2006 01:57:48 -0000
@@ -77,4 +77,6 @@ typedef volatile int __cpu_simple_lock_
#define __HAVE_RAS
#endif
+#define __HAVE_TIMECOUNTER
+
#endif /* _MACHTYPES_H_ */
Index: arch/xen/xen/clock.c
===================================================================
RCS file: /cvsroot/src/sys/arch/xen/xen/clock.c,v
retrieving revision 1.27
diff -u -p -r1.27 clock.c
--- arch/xen/xen/clock.c 11 Jul 2006 12:26:58 -0000 1.27
+++ arch/xen/xen/clock.c 1 Aug 2006 01:57:49 -0000
@@ -39,6 +39,8 @@ __KERNEL_RCSID(0, "$NetBSD: clock.c,v 1.
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/time.h>
+#include <sys/timetc.h>
+#include <sys/timevar.h>
#include <sys/kernel.h>
#include <sys/device.h>
#include <sys/sysctl.h>
@@ -58,17 +60,36 @@ __KERNEL_RCSID(0, "$NetBSD: clock.c,v 1.
static int xen_timer_handler(void *, struct intrframe *);
-/* These are peridically updated in shared_info, and then copied here. */
+/* A timecounter: Xen system_time extrapolated with a TSC. */
+u_int xen_get_timecount(struct timecounter*);
+static struct timecounter xen_timecounter = {
+ .tc_get_timecount = xen_get_timecount,
+ .tc_poll_pps = NULL,
+ .tc_counter_mask = ~0U,
+ .tc_frequency = 1000000000ULL,
+ .tc_name = "xen_system_time",
+ .tc_quality = 800 /* ??? */
+};
+
+/* These are periodically updated in shared_info, and then copied here. */
volatile static uint64_t shadow_tsc_stamp;
volatile static uint64_t shadow_system_time;
#ifndef XEN3
volatile static unsigned long shadow_time_version;
#endif
-volatile static struct timeval shadow_tv;
+volatile static struct timespec shadow_ts;
static int timeset;
-static uint64_t processed_system_time;
+/* The time when the last hardclock(9) call should have taken place. */
+static volatile uint64_t processed_system_time;
+
+/*
+ * The clock (as returned by xen_get_timecount) may need to be held
+ * back to maintain the illusion that hardclock(9) was called when it
+ * was supposed to be, not when Xen got around to scheduling us.
+ */
+static volatile uint64_t xen_clock_bias = 0;
#ifdef DOM0OPS
/* If we're dom0, send our time to Xen every minute or so. */
@@ -80,7 +101,7 @@ static struct callout xen_timepush_co =
/*
* Reads a consistent set of time-base values from Xen, into a shadow data
- * area. Must be called at splclock.
+ * area. Must be called at splhigh (per timecounter requirements).
*/
static void
get_time_values_from_xen(void)
@@ -99,25 +120,29 @@ get_time_values_from_xen(void)
do {
tversion = HYPERVISOR_shared_info->wc_version;
x86_lfence();
- shadow_tv.tv_sec = HYPERVISOR_shared_info->wc_sec;
- shadow_tv.tv_usec = HYPERVISOR_shared_info->wc_nsec;
+ shadow_ts.tv_sec = HYPERVISOR_shared_info->wc_sec;
+ shadow_ts.tv_nsec = HYPERVISOR_shared_info->wc_nsec;
x86_lfence();
} while ((HYPERVISOR_shared_info->wc_version & 1) ||
(tversion != HYPERVISOR_shared_info->wc_version));
- shadow_tv.tv_usec = shadow_tv.tv_usec / 1000;
#else /* XEN3 */
do {
shadow_time_version = HYPERVISOR_shared_info->time_version2;
x86_lfence();
- shadow_tv.tv_sec = HYPERVISOR_shared_info->wc_sec;
- shadow_tv.tv_usec = HYPERVISOR_shared_info->wc_usec;
+ shadow_ts.tv_sec = HYPERVISOR_shared_info->wc_sec;
+ shadow_ts.tv_nsec = HYPERVISOR_shared_info->wc_usec;
shadow_tsc_stamp = HYPERVISOR_shared_info->tsc_timestamp;
shadow_system_time = HYPERVISOR_shared_info->system_time;
x86_lfence();
} while (shadow_time_version != HYPERVISOR_shared_info->time_version1);
+ shadow_ts.tv_nsec *= 1000;
#endif
}
+/*
+ * Use cycle counter to determine ns elapsed since last Xen time update.
+ * Must be called at splhigh (per timecounter requirements).
+ */
static uint64_t
get_tsc_offset_ns(void)
{
@@ -125,6 +150,10 @@ get_tsc_offset_ns(void)
struct cpu_info *ci = curcpu();
tsc_delta = cpu_counter() - shadow_tsc_stamp;
+ /*
+ * XXX with Xen 3 updating the time info only once per second,
+ * this will overflow with a ~18 GHz TSC.
+ */
offset = tsc_delta * 1000000000ULL / cpu_frequency(ci);
#ifdef XEN_CLOCK_DEBUG
if (offset > 10000000000ULL)
@@ -137,11 +166,8 @@ get_tsc_offset_ns(void)
void
inittodr(time_t base)
{
+ struct timespec sts;
int s;
- struct cpu_info *ci = curcpu();
-#if defined(XEN3)
- uint64_t t;
-#endif /* defined(XEN3) */
/*
* if the file system time is more than a year older than the
@@ -152,28 +178,16 @@ inittodr(time_t base)
base = CONFIG_TIME;
}
- s = splclock();
+ s = splhigh();
get_time_values_from_xen();
+ sts = shadow_ts;
splx(s);
-#if defined(XEN3)
- t = (shadow_tv.tv_sec + rtc_offset * 60) * UINT64_C(1000000) +
- shadow_tv.tv_usec + processed_system_time / 1000;
- time.tv_usec = t % UINT64_C(1000000);
- time.tv_sec = t / UINT64_C(1000000);
-#else /* defined(XEN3) */
- time.tv_usec = shadow_tv.tv_usec;
- time.tv_sec = shadow_tv.tv_sec + rtc_offset * 60;
-#endif /* defined(XEN3) */
-#ifdef XEN_CLOCK_DEBUG
- printf("readclock: %ld (%ld)\n", time.tv_sec, base);
-#endif
- /* reset microset, so that the next call to microset() will init */
- ci->ci_cc.cc_denom = 0;
-
- if (base != 0 && base < time.tv_sec - 5*SECYR)
+ tc_setclock(&sts); /* XXX what about rtc_offset? */
+
+ if (base != 0 && base < time_second - 5*SECYR)
printf("WARNING: file system time much less than clock time\n");
- else if (base > time.tv_sec + 5*SECYR) {
+ else if (base > time_second + 5*SECYR) {
printf("WARNING: clock time much less than file system time\n");
printf("WARNING: using file system time\n");
goto fstime;
@@ -184,12 +198,13 @@ inittodr(time_t base)
fstime:
timeset = 1;
- time.tv_sec = base;
+ sts.tv_sec = base;
+ tc_setclock(&sts);
printf("WARNING: CHECK AND RESET THE DATE!\n");
}
-static void
-resettodr_i(void)
+void
+resettodr(void)
{
#ifdef DOM0OPS
dom0_op_t op;
@@ -206,8 +221,8 @@ resettodr_i(void)
if (!timeset)
return;
-#ifdef DEBUG_CLOCK
- {
+#ifdef XXX_DEBUG_CLOCK
+ { /* XXX annoying debug printf not yet timecounterized */
char pm;
if (timercmp(&time, &shadow_tv, >)) {
@@ -223,38 +238,26 @@ resettodr_i(void)
#endif
#ifdef DOM0OPS
if (xen_start_info.flags & SIF_PRIVILEGED) {
- s = splclock();
+ struct timespec now;
op.cmd = DOM0_SETTIME;
- op.u.settime.secs = time.tv_sec - rtc_offset * 60;
+ nanotime(&now);
+ /* XXX is rtc_offset handled correctly everywhere? */
+ op.u.settime.secs = now.tv_sec - rtc_offset * 60;
#ifdef XEN3
- op.u.settime.nsecs = time.tv_usec * 1000;
+ op.u.settime.nsecs = now.tv_nsec;
#else
- op.u.settime.usecs = time.tv_usec;
+ op.u.settime.usecs = now.tv_nsec / 1000;
#endif
- op.u.settime.system_time = processed_system_time;
- HYPERVISOR_dom0_op(&op);
-
+ s = splhigh();
+ op.u.settime.system_time = shadow_system_time
+ + get_tsc_offset_ns();
splx(s);
+ HYPERVISOR_dom0_op(&op);
}
#endif
}
-/*
- * When the clock is administratively set, in addition to resetting
- * Xen's clock if possible, we should also allow xen_microtime to
- * step backwards without complaint.
- */
-void
-resettodr()
-{
- CPU_INFO_ITERATOR cii;
- struct cpu_info *ci;
-
- resettodr_i();
- for (CPU_INFO_FOREACH(cii, ci))
- timerclear(&ci->ci_cc.cc_time);
-}
void
startrtclock()
@@ -292,78 +295,19 @@ xen_delay(int n)
return;
} else {
uint64_t when;
-
+ int s;
/* for large delays, shadow_system_time is OK */
+
+ s = splhigh();
get_time_values_from_xen();
when = shadow_system_time + n * 1000;
- while (shadow_system_time < when)
+ while (shadow_system_time < when) {
+ splx(s);
+ s = splhigh();
get_time_values_from_xen();
- }
-}
-
-/*
- * A MD microtime for xen.
- *
- * This abuses/reuses the cc_microtime fields already in cpuinfo:
- * cc_ms_delta = usec added to time(9) on last call to hardclock;
- * this is used to scale the actual elapsed time
- * cc_cc = reference value of cpu_counter()
- * cc_denom = nsec between last hardclock(9) and time of cc_cc setting
- * (provided by Xen)
- *
- * We are taking Xen's word for the CPU frequency rather than trying to
- * time it ourselves like cc_microtime does, since Xen could reschedule
- * our virtual CPU(s) onto any physical CPU and only tell us afterwards
- * with a clock interrupt -- and that could invalidate all stored
- * cpu_counter values.
- */
-void
-xen_microtime(struct timeval *tv)
-{
- int s = splclock();
- struct cpu_info *ci = curcpu();
- int64_t cycles;
-
- *tv = time;
- /* Extrapolate from hardclock()'s last step. */
- cycles = cpu_counter() - ci->ci_cc.cc_cc;
-#ifdef XEN_CLOCK_DEBUG
- if (cycles <= 0) {
- printf("xen_microtime: CPU counter has decreased by %" PRId64
- " since last hardclock(9)\n", -cycles);
- }
-#endif
- cycles += ci->ci_cc.cc_denom * cpu_frequency(ci) / 1000000000LL;
- tv->tv_usec += cycles * ci->ci_cc.cc_ms_delta * hz / cpu_frequency(ci);
-#ifdef XEN_CLOCK_DEBUG
- if (tv->tv_usec >= 2000000)
- printf("xen_microtime: unexpectedly large tv_usec %ld\n", tv->tv_usec);
-#endif
- if (tv->tv_usec >= 1000000) {
- tv->tv_sec++;
- tv->tv_usec -= 1000000;
- }
- /* Avoid small backsteps, e.g. at the beginning of a negative adjustment. */
- if (timerisset(&ci->ci_cc.cc_time) &&
- timercmp(tv, &ci->ci_cc.cc_time, <)) {
- struct timeval backstep;
-
- /* XXXjld: not sure if this check can be safely removed now */
- timersub(&ci->ci_cc.cc_time, tv, &backstep);
- if (backstep.tv_sec == 0) { /* if it was < 1sec */
- *tv = ci->ci_cc.cc_time;
-#ifdef XEN_CLOCK_DEBUG
- printf("xen_microtime[%d]: clamping at %ld.%06ld (-%ldus)\n",
- (int)ci->ci_cpuid, tv->tv_sec, tv->tv_usec, backstep.tv_usec);
- } else {
- printf("xen_microtime[%d]: allowing large backstep "
- "%lds to %ld.%06ld\n", (int)ci->ci_cpuid,
- backstep.tv_sec, tv->tv_sec, tv->tv_usec);
-#endif
}
+ splx(s);
}
- ci->ci_cc.cc_time = *tv;
- splx(s);
}
#ifdef DOM0OPS
@@ -373,7 +317,7 @@ xen_timepush(void *arg)
{
struct callout *co = arg;
- resettodr_i();
+ resettodr();
if (xen_timepush_ticks > 0)
callout_schedule(co, xen_timepush_ticks);
}
@@ -406,6 +350,16 @@ sysctl_xen_timepush(SYSCTLFN_ARGS)
}
#endif
+/* ARGSUSED */
+u_int
+xen_get_timecount(struct timecounter *tc)
+{
+ uint64_t ns;
+
+ ns = shadow_system_time + get_tsc_offset_ns() - xen_clock_bias;
+ return (u_int)ns;
+}
+
void
xen_initclocks()
{
@@ -416,6 +370,8 @@ xen_initclocks()
get_time_values_from_xen();
processed_system_time = shadow_system_time;
+ tc_init(&xen_timecounter);
+ /* The splhigh requirements start here. */
event_set_handler(evtch, (int (*)(void *))xen_timer_handler,
NULL, IPL_CLOCK, "clock");
@@ -435,47 +391,33 @@ xen_initclocks()
#endif
}
+/* ARGSUSED */
static int
xen_timer_handler(void *arg, struct intrframe *regs)
{
- int64_t delta, newcc;
- int ticks_done;
- struct timeval oldtime, elapsed;
- struct cpu_info *ci = curcpu();
-
- get_time_values_from_xen();
- newcc = cpu_counter();
+ int64_t delta;
+ int s;
- ticks_done = 0;
+ s = splhigh();
+ get_time_values_from_xen();
delta = (int64_t)(shadow_system_time + get_tsc_offset_ns()
- processed_system_time);
- while (delta >= (int64_t)NS_PER_TICK) {
- /* Have hardclock do its thing. */
- oldtime = time;
- hardclock((struct clockframe *)regs);
-
- /* Use that tick length for the coming tick's microtimes. */
- timersub(&time, &oldtime, &elapsed);
-#ifdef XEN_CLOCK_DEBUG
- if (elapsed.tv_sec != 0) {
- printf("xen_timer_handler: hardclock(9) stepped by %ld.%06lds\n",
- elapsed.tv_sec, elapsed.tv_usec);
- }
-#endif
- ci->ci_cc.cc_ms_delta = elapsed.tv_usec;
+ splx(s);
- delta -= NS_PER_TICK;
+ /* Several ticks may have passed without our being run; catch up. */
+ while (delta >= (int64_t)NS_PER_TICK) {
+ s = splhigh();
processed_system_time += NS_PER_TICK;
- ticks_done++;
+ xen_clock_bias = (delta -= NS_PER_TICK);
+ splx(s);
+ hardclock((struct clockframe *)regs);
+ }
+
+ if (xen_clock_bias) {
+ s = splhigh();
+ xen_clock_bias = 0;
+ splx(s);
}
-
- /*
- * Right now, delta holds the number of ns elapsed from when the last
- * hardclock(9) allegedly was to when this domain/vcpu was actually
- * rescheduled.
- */
- ci->ci_cc.cc_denom = delta;
- ci->ci_cc.cc_cc = newcc;
return 0;
}
@@ -488,12 +430,17 @@ setstatclockrate(int arg)
void
idle_block(void)
{
+ uint64_t next_tick;
+ int s;
/*
* We set the timer to when we expect the next timer
* interrupt. We could set the timer to later if we could
* easily find out when we will have more work (callouts) to
* process from hardclock.
*/
- if (HYPERVISOR_set_timer_op(processed_system_time + NS_PER_TICK) == 0)
+ s = splhigh();
+ next_tick = processed_system_time + NS_PER_TICK;
+ splx(s); /* XXX should this go after the set_timer_op? */
+ if (HYPERVISOR_set_timer_op(next_tick) == 0)
HYPERVISOR_block();
}
--=-=-=--