At Sat, 06 Jul 2024 21:26:42 -0700, "Greg A. Woods" <woods%planix.ca@localhost> wrote: Subject: Re: timekeeping regression? > > I've been rewriting some chunks of xen_clock.c (in a separate post > [[hopefully!]]) and getting some good results, but some confusion > remains. -- Greg A. Woods <gwoods%acm.org@localhost> Kelowna, BC +1 250 762-7675 RoboHack <woods%robohack.ca@localhost> Planix, Inc. <woods%planix.com@localhost> Avoncote Farms <woods%avoncote.ca@localhost> /* $NetBSD: xen_clock.c,v 1.18 2023/09/10 15:23:01 bouyer Exp $ */ /*- * Copyright (c) 2017, 2018 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Taylor R. Campbell. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
*/

#include "opt_xen.h"

/* Debug output is off unless the kernel configuration defines it otherwise. */
#ifndef XEN_CLOCK_DEBUG
#define XEN_CLOCK_DEBUG 0
#endif

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: xen_clock.c,v 1.18 2023/09/10 15:23:01 bouyer Exp $");

#include <sys/param.h>
#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/callout.h>
#include <sys/cpu.h>
#include <sys/device.h>
#include <sys/evcnt.h>
#include <sys/intr.h>
#include <sys/kernel.h>
#include <sys/lwp.h>
#include <sys/proc.h>
#include <sys/sdt.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/timetc.h>

#include <dev/clock_subr.h>

#include <machine/cpu.h>
#include <machine/cpu_counter.h>
#include <machine/lock.h>

#include <xen/evtchn.h>
#include <xen/hypervisor.h>
#include <xen/include/public/vcpu.h>
#include <xen/xen.h>

#include <x86/rtc.h>

/*
 * Nanoseconds per hardclock tick.  This expands to an expression that
 * reads `hz' every time it is used.
 */
#define NS_PER_TICK ((uint64_t)1000000000ULL/hz)

/* Forward declarations for functions referenced before their definition. */
static uint64_t xen_vcputime_sched_systime_ns(void);
static uint64_t xen_global_systime_ns(void);
static unsigned xen_get_timecount(struct timecounter *);
static int xen_timer_handler(void *, struct clockframe *);

/*
 * dtrace probes
 */

/* The tsc read by xen_global_systime_ns() was below Xen's tsc_timestamp. */
SDT_PROBE_DEFINE2(sdt, xen, tsc, backwards,
    "uint64_t"/*tsc*/,
    "uint64_t"/*tsc_at_start*/);

/* The per-vCPU time computed by xen_global_systime_ns() was behind the global stamp. */
SDT_PROBE_DEFINE2(sdt, xen, global_ns, backwards,
    "uint64_t"/*local_ns*/,
    "uint64_t"/*global_ns*/);

/* xen_timer_handler() saw a system time earlier than the last hardclock. */
SDT_PROBE_DEFINE2(sdt, xen, hardclock, systime__backward,
    "uint64_t"/*last_systime_ns*/,
    "uint64_t"/*this_systime_ns*/);

/* Fired on every entry to xen_timer_handler(). */
SDT_PROBE_DEFINE2(sdt, xen, hardclock, tick,
    "uint64_t"/*last_systime_ns*/,
    "uint64_t"/*this_systime_ns*/);

/* The gap between hardclock calls exceeded the timecounter mask. */
SDT_PROBE_DEFINE3(sdt, xen, hardclock, jump,
    "uint64_t"/*last_systime_ns*/,
    "uint64_t"/*this_systime_ns*/,
    "uint64_t"/*nticks*/);

/* At least one more tick remains to be replayed after a hardclock() call. */
SDT_PROBE_DEFINE3(sdt, xen, hardclock, missed,
    "uint64_t"/*last_systime_ns*/,
    "uint64_t"/*this_systime_ns*/,
    "uint64_t"/*remaining_ns*/);

/*
 * xen timecounter:
 *
 *	Xen vCPU system time, plus an adjustment with rdtsc.
*/
static struct timecounter xen_timecounter = {
	.tc_get_timecount = xen_get_timecount,
	.tc_poll_pps = NULL,
	.tc_counter_mask = ~0U,		/* all bits of the unsigned count are valid */
	.tc_frequency = 1000000000ULL,	/* 1 GHz, i.e. units of nanoseconds */
	.tc_name = "xen_system_time",	/* XXX "xen_TSC" */
	.tc_quality = 10000,
};

/*
 * xen_global_systime_ns_stamp
 *
 *	The latest Xen vCPU system time that has been observed on any
 *	CPU, for a global monotonic view of the Xen system time clock.
 *	Read and updated only by xen_global_systime_ns(), using
 *	atomic_load_acquire() and atomic_cas_64().
 */
static volatile uint64_t xen_global_systime_ns_stamp __cacheline_aligned;

#ifdef DOM0OPS
/*
 * xen timepush state:
 *
 *	Callout to periodically, after a sysctl-configurable number of
 *	NetBSD ticks, set the Xen hypervisor's wall clock time.
 *
 * Linux does this once every 11 minutes, as well as any time settimeofday() is
 * called (and maybe every time the clock is jumped by NTP etc.)
 */
static struct {
	struct callout ch;	/* callout that runs xen_timepush_intr() */
	int ticks;		/* period in ticks; 0 disables rescheduling */
} xen_timepush;

static void xen_timepush_init(void);
static void xen_timepush_intr(void *);
static int sysctl_xen_timepush(SYSCTLFN_ARGS);
#endif

/*
 * Xen "system time" and "wall clock time"
 *
 * Xen provides guests with two timestamp values, the system-time (time since
 * guest boot or resume) and the wall-clock time (time since the epoch at the
 * point when system-time was zero, i.e. Xen wall clock time is actually
 * boot-time (or resume time) for the guest, and for dom0 this is also very
 * close to the boot-time of the hypervisor).
 *
 * These are provided through a shared memory structure (shared_info_page, and
 * in the array of vcpu_time_info within).
 *
 * The system-time in the vCPU's vcpu_time_info is updated by Xen every time the
 * guest is being scheduled, along with a snapshot of the CPU's TSC register
 * value (and some related values for scaling the TSC to nanoseconds).  While
 * running the guest can get the current system-time by extrapolating from the
 * values in vcpu_time_info using the value of the TSC register (an x86 register
 * counting CPU clock cycles, often emulated in domUs).
*
 * TSC values in Xen are obtained through the RDTSC instruction and are either
 * native, i.e. accessed directly from the CPU register (in dom0, and possibly
 * in some situations in domUs); or emulated, i.e. intercepted through a trap by
 * Xen (in domUs, e.g. on hardware without the TSC_INVARIANT CPU feature).  In
 * emulated mode the CPU clock is at a fictitious frequency of 1 GHz.  Either
 * way multiplier and shift values are provided to adjust the TSC value to
 * nanoseconds so the frequency need not be measured (it was measured by Xen
 * when it first booted).
 *
 * XXX for SMP domains with multiple vCPUs it looks like the tsc_timestamp is
 * separately updated for each vCPU as the domain is scheduled to run so I think
 * we might want to be careful to read the TSC from the CPU associated with the
 * vcpu_time_info we're calculating "local" system-time from.
 */

/*
 * xen_rdtsc()
 *
 *	Read the local pCPU's tsc.
 *
 *	Returns the 64-bit counter assembled from the two 32-bit halves
 *	that the rdtsc instruction leaves in %edx:%eax.
 */
static inline uint64_t
xen_rdtsc(void)
{
	uint32_t lo, hi;

	asm volatile("rdtsc" : "=a"(lo), "=d"(hi));

	return ((uint64_t)hi << 32) | lo;
}

/*
 * struct xen_vcputime_ticket
 *
 *	State for a vCPU read section, during which a caller may read
 *	from fields of a struct vcpu_time_info and call xen_rdtsc.
 *	Caller must enter with xen_vcputime_enter, exit with
 *	xen_vcputime_exit, and be prepared to retry if
 *	xen_vcputime_exit fails.
 */
struct xen_vcputime_ticket {
	uint64_t	version;	/* vt->version as sampled on entry */
};

/*
 * xen_vcputime_enter(tp)
 *
 *	Enter a vCPU time read section and store a ticket in *tp, which
 *	the caller must use with xen_vcputime_exit.  Return a pointer
 *	to the current CPU's vcpu_time_info structure.  Caller must
 *	already be bound to the CPU.
 */
static inline volatile struct vcpu_time_info *
xen_vcputime_enter(struct xen_vcputime_ticket *tp)
{
	volatile struct vcpu_time_info *vt = &curcpu()->ci_vcpu->time;

	/* Spin for as long as the sampled version has its low bit set. */
	while (__predict_false(1 & (tp->version = vt->version)))
		SPINLOCK_BACKOFF_HOOK;

	/*
	 * Must read the version before reading the tsc on the local
	 * pCPU.  We are racing only with interruption by the
	 * hypervisor, so no need for a stronger memory barrier.
	 */
	__insn_barrier();

	return vt;
}

/*
 * xen_vcputime_exit(vt, tp)
 *
 *	Exit a vCPU time read section with the ticket in *tp from
 *	xen_vcputime_enter.  Return true on success, false if caller
 *	must retry.
 */
static inline bool
xen_vcputime_exit(volatile struct vcpu_time_info *vt,
    struct xen_vcputime_ticket *tp)
{

	KASSERT(vt == &curcpu()->ci_vcpu->time);

	/*
	 * Must read the tsc before re-reading the version on the local
	 * pCPU.  We are racing only with interruption by the
	 * hypervisor, so no need for a stronger memory barrier.
	 */
	__insn_barrier();

	/* Success only if the version is unchanged since entry. */
	return tp->version == vt->version;
}

/*
 * xen_tsc_to_ns_delta(delta_tsc, tsc_to_system_mul, tsc_shift)
 *
 *	Convert a difference in tsc units to a difference in
 *	nanoseconds given a multiplier and shift for the unit
 *	conversion.  A negative tsc_shift shifts right, otherwise left.
 *
 * from xen.h:
 *
 * Current system time:
 *   system_time +
 *      ((((tsc - tsc_timestamp) << tsc_shift) * tsc_to_system_mul) >> 32)
 * CPU frequency (Hz):
 *   ((10^9 << 32) / tsc_to_system_mul) >> tsc_shift
 */
static inline uint64_t
xen_tsc_to_ns_delta(uint64_t delta_tsc, uint32_t tsc_to_system_mul,
    int8_t tsc_shift)
{
	uint32_t delta_tsc_hi, delta_tsc_lo;

	if (delta_tsc == 0)
		return 0;

	if (tsc_shift < 0)
		delta_tsc >>= -tsc_shift;
	else
		delta_tsc <<= tsc_shift;

	/* Split into 32-bit halves so each product fits in 64 bits. */
	delta_tsc_hi = delta_tsc >> 32;
	delta_tsc_lo = delta_tsc & 0xffffffffUL;

	/* d*m/2^32 = (2^32 d_h + d_l)*m/2^32 = d_h*m + (d_l*m)/2^32 */
	return ((uint64_t)delta_tsc_hi * tsc_to_system_mul) +
	    (((uint64_t)delta_tsc_lo * tsc_to_system_mul) >> 32);
}

/*
 * xen_vcputime_sched_systime_ns()
 *
 *	Return a snapshot of the current Xen system time to the
 *	resolution of the Xen hypervisor tick, in units of nanoseconds.
 *
 *	I.e. the Xen system time at the time this domain was last scheduled.
 *
 *	N.B. it is assumed this is only called when preemption is impossible.
*/
static uint64_t
xen_vcputime_sched_systime_ns(void)
{
	volatile struct vcpu_time_info *vt;
	struct xen_vcputime_ticket ticket;
	uint64_t sched_systime_ns;

	/* Retry until system_time was read under an unchanged version. */
	do {
		vt = xen_vcputime_enter(&ticket);
		sched_systime_ns = vt->system_time;
	} while (!xen_vcputime_exit(vt, &ticket));

	return sched_systime_ns;
}

/*
 * struct xen_wallclock_ticket
 *
 *	State for a wall clock read section, during which a caller may
 *	read from the wall clock fields of HYPERVISOR_shared_info.
 *	Caller must enter with xen_wallclock_enter, exit with
 *	xen_wallclock_exit, and be prepared to retry if
 *	xen_wallclock_exit fails.
 */
struct xen_wallclock_ticket {
	uint32_t version;	/* wc_version as sampled on entry */
};

/*
 * xen_wallclock_enter(tp)
 *
 *	Enter a wall clock read section and store a ticket in *tp,
 *	which the caller must use with xen_wallclock_exit.
 */
static inline void
xen_wallclock_enter(struct xen_wallclock_ticket *tp)
{

	/* Spin for as long as the sampled version has its low bit set. */
	while (__predict_false(1 & (tp->version =
		    HYPERVISOR_shared_info->wc_version)))
		SPINLOCK_BACKOFF_HOOK;

	/*
	 * Must read the version from memory before reading the
	 * timestamp from memory, as written potentially by another
	 * pCPU.
	 */
	membar_consumer();
}

/*
 * xen_wallclock_exit(tp)
 *
 *	Exit a wall clock read section with the ticket in *tp from
 *	xen_wallclock_enter.  Return true on success, false if caller
 *	must retry.
 */
static inline bool
xen_wallclock_exit(struct xen_wallclock_ticket *tp)
{

	/*
	 * Must read the timestamp from memory before re-reading the
	 * version from memory, as written potentially by another pCPU.
	 */
	membar_consumer();

	/* Success only if the version is unchanged since entry. */
	return tp->version == HYPERVISOR_shared_info->wc_version;
}

/*
 * xen_global_systime_ns()
 *
 *	Return a monotonic view of the system time (current domain's time since
 *	boot) in nanoseconds.
* * First compute the current vCPU's "system time", which is the vCPU * "system time" (Xen's view of this domain's "system time" at the time * this domain was scheduled), plus an adjustment based on the TSC offset * since the time this domain was scheduled (scaled to nanoseconds using * Xen's supplied scaling factors). * * If this vCPU's current "system time" is greater than the last recorded * "global system time" then store this as the new global system time and * return it, else return the current "global system time" (thus keeping * the global system time monotonically advancing). * * XXX FreeBSD's implementation does not worry about being prempted, nor does it * try to keep track of skew between vCPUs. * * see it in sys/dev/xen/timer/xen_timer.c:xentimer_get_timecount() * also see sys/x86/x86/pvclock.c:pvclock_get_timecount() */ static uint64_t xen_global_systime_ns(void) { uint64_t local_ns, global_ns, result_ns; /* * XXX Can we avoid retrying if the CAS fails? * * XXX Has enough time passed in this "loop" that we really need to * fetch a new TSC value and calculate a new local_ns before trying to * store it again? I guess we could have been premepted.... */ do { /* XXX this next hunk of code is partly copied in xen_delay() */ volatile struct vcpu_time_info *vt; struct xen_vcputime_ticket ticket; uint64_t start_systime_ns, tsc_at_start, tsc, delta_tsc, delta_ns; uint32_t tsc_to_system_mul; int8_t tsc_shift; int s; struct cpu_info *ci; s = splsched(); /* make sure we won't be interrupted XXX splhigh()??? */ ci = curcpu(); do { vt = xen_vcputime_enter(&ticket); /* * Grab Xen's snapshot of system time and the TSC value * at the time this domain was last scheduled to run. */ start_systime_ns = vt->system_time; tsc_at_start = vt->tsc_timestamp; /* Get Xen's current idea of how fast the TSC is counting. */ /* xxx these should be static-enough and could be fetched done outside this loop? 
*/ tsc_to_system_mul = vt->tsc_to_system_mul; tsc_shift = vt->tsc_shift; /* * Read the CPU's current TSC (or the emulated one). * * xxx we need to do this on the same vCPU as is * represented by vt, thus the splsched()/splx() */ tsc = xen_rdtsc(); } while (!xen_vcputime_exit(vt, &ticket)); splx(s); if (__predict_false(tsc < tsc_at_start)) { SDT_PROBE2(sdt, xen, tsc, backwards, tsc, tsc_at_start); #if XEN_CLOCK_DEBUG device_printf(ci->ci_dev, "xen tsc ran backwards:" " tsc=%"PRIu64" tsc_at_start=%"PRIu64"\n", tsc, tsc_at_start); #endif ci->ci_xen_tsc_backwards_evcnt.ev_count++; tsc = tsc_at_start; } /* Find how far the CPU's TSC has advanced since we were scheduled. */ delta_tsc = tsc - tsc_at_start; /* Convert the TSC delta to a nanosecond delta. */ delta_ns = xen_tsc_to_ns_delta(delta_tsc, tsc_to_system_mul, tsc_shift); /* Compute the TSC-adjusted system time for this vCPU. */ local_ns = start_systime_ns + delta_ns; global_ns = atomic_load_acquire(&xen_global_systime_ns_stamp); /* * The global_ns is sometimes (often, in (SMP-only?) dom0, even * with vCPUs pinned) greater than the value computed from the * current vCPU (local_ns) so we don't want to risk it going * backwards (global_ns is (likely) the same value that was last * returned to the timecounter by xen_get_timecount(), and may * well be the next value returned too) * * This is more or less what FreeBSD does as well. */ if (__predict_false(local_ns <= global_ns)) { if (local_ns != global_ns) { SDT_PROBE2(sdt, xen, global_ns, backward, local_ns, global_ns); #if 0 /* XEN_CLOCK_DEBUG XXX way too noisy, causes hangs! (esp. 
in dom0) */ device_printf(ci->ci_dev, "xen global_ns prevented from running backwards:" " local_ns=%"PRIu64" global_ns=%"PRIu64"\n", local_ns, global_ns); #endif ci->ci_xen_global_ns_backwards_evcnt.ev_count++; } result_ns = global_ns; /* avoid saving a lower, or same, global_ns again */ #if __NetBSD_Prereq__(9, 99, 97) /* xxx only for my wonky -current */ membar_release(); #else membar_exit(); #endif break; } else { result_ns = local_ns; } #if __NetBSD_Prereq__(9, 99, 97) /* xxx only for my wonky -current */ membar_release(); #else membar_exit(); #endif } while (atomic_cas_64(&xen_global_systime_ns_stamp, global_ns, result_ns) != global_ns); return result_ns; } /* * xen_get_timecount(tc) * * Return the low 32 bits of a global monotonic view of the Xen * system time. */ static unsigned xen_get_timecount(struct timecounter *tc) { KASSERT(tc == &xen_timecounter); return (unsigned)xen_global_systime_ns(); } /* * xen_delay(n) * * Wait approximately n microseconds. */ void xen_delay(unsigned us) { int bound; /* Bind to the CPU so we don't compare tsc on different CPUs. */ bound = curlwp_bind(); if (curcpu()->ci_vcpu == NULL) { curlwp_bindx(bound); return; } /* Short wait (<50000us) or long wait? */ if (us < 500000) { /* XXX this next hunk of code is partly copied in xen_global_systime_ns() */ /* * Xen system time is not precise enough for short * delays, so use the tsc instead. * * We work with the current tsc frequency, and figure * that if it changes while we're delaying, we've * probably delayed long enough -- up to 500us. * * We do not use cpu_frequency(ci), which uses a * quantity detected at boot time, and which may have * changed by now if Xen has migrated this vCPU to * another pCPU. * * XXX How long does it take to migrate pCPUs? */ volatile struct vcpu_time_info *vt; struct xen_vcputime_ticket ticket; uint64_t tsc_at_start, last_tsc, tsc; uint32_t tsc_to_system_mul; int8_t tsc_shift; /* Get the starting tsc and tsc frequency. 
*/ do { vt = xen_vcputime_enter(&ticket); tsc_at_start = last_tsc = xen_rdtsc(); tsc_to_system_mul = vt->tsc_to_system_mul; tsc_shift = vt->tsc_shift; } while (!xen_vcputime_exit(vt, &ticket)); /* * Wait until as many tsc ticks as there are in n * microseconds have elapsed, or the tsc has gone * backwards meaning we've probably migrated pCPUs. */ for (;;) { tsc = xen_rdtsc(); if (__predict_false(tsc < last_tsc)) break; if (xen_tsc_to_ns_delta(tsc - tsc_at_start, tsc_to_system_mul, tsc_shift)/1000 >= us) { break; } last_tsc = tsc; } } else { /* * Use the Xen system time for >=50000us delays. From my * testing, it seems to sometimes run backward by about * 110us, which is not so bad. */ uint64_t us_ns = 1000 * (uint64_t)us; uint64_t start_ns; /* Get the start time. */ start_ns = xen_vcputime_sched_systime_ns(); /* Wait until the system time has passed the end. */ do { HYPERVISOR_yield(); } while (xen_vcputime_sched_systime_ns() - start_ns < us_ns); } /* Unbind from the CPU if we weren't already bound. */ curlwp_bindx(bound); } /* * xen_suspendclocks(ci) * * Stop handling the Xen timer event on the CPU of ci. Caller * must be running on and bound to ci's CPU. * * Actually, caller must have kpreemption disabled, because that's * easier to assert at the moment. */ void xen_suspendclocks(struct cpu_info *ci) { int evtch; KASSERT(ci == curcpu()); KASSERT(kpreempt_disabled()); /* * Find the VIRQ_TIMER event channel and close it so new timer * interrupt events stop getting delivered to it. * * XXX Should this happen later? This is not the reverse order * of xen_resumeclocks. It is apparently necessary in this * order only because we don't stash evtchn anywhere, but we * could stash it. */ evtch = unbind_virq_from_evtch(VIRQ_TIMER); KASSERT(evtch != -1); /* * Mask the event channel so we stop getting new interrupts on * it. 
*/ hypervisor_mask_event(evtch); /* * Now that we are no longer getting new interrupts, remove the * handler and wait for any existing calls to the handler to * complete. After this point, there can be no concurrent * calls to xen_timer_handler. */ event_remove_handler(evtch, __FPTRCAST(int (*)(void *), xen_timer_handler), ci); aprint_verbose("Xen clock: removed event channel %d\n", evtch); /* We'd better not have switched CPUs. */ KASSERT(ci == curcpu()); } /* * xen_resumeclocks(ci) * * Start handling the Xen timer event on the CPU of ci. Arm the * Xen timer. Caller must be running on and bound to ci's CPU. * * Actually, caller must have kpreemption disabled, because that's * easier to assert at the moment. */ void xen_resumeclocks(struct cpu_info *ci) { char intr_xname[INTRDEVNAMEBUF]; int evtch; int error __diagused; KASSERT(ci == curcpu()); KASSERT(kpreempt_disabled()); /* * Allocate an event channel to receive VIRQ_TIMER events. */ evtch = bind_virq_to_evtch(VIRQ_TIMER); KASSERT(evtch != -1); /* * Set an event handler for VIRQ_TIMER events to call * xen_timer_handler. */ snprintf(intr_xname, sizeof(intr_xname), "%s clock", device_xname(ci->ci_dev)); /* XXX sketchy function pointer cast -- fix the API, please */ if (event_set_handler(evtch, __FPTRCAST(int (*)(void *), xen_timer_handler), ci, IPL_CLOCK, NULL, intr_xname, true, ci) == NULL) panic("failed to establish timer interrupt handler"); aprint_verbose("Xen %s: using event channel %d\n", intr_xname, evtch); /* Disarm the periodic timer on Xen>=3.1 which is allegedly buggy. */ if (XEN_MAJOR(xen_version) > 3 || XEN_MINOR(xen_version) > 0) { error = HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, ci->ci_vcpuid, NULL); KASSERT(error == 0); } /* Pretend the last hardclock happened when we were last scheduled. */ ci->ci_xen_hardclock_systime_ns = xen_vcputime_sched_systime_ns(); /* Arm the one-shot timer. 
*/ error = HYPERVISOR_set_timer_op(ci->ci_xen_hardclock_systime_ns + NS_PER_TICK); KASSERT(error == 0); /* * Ready to go. Unmask the event. After this point, Xen may * start calling xen_timer_handler. */ hypervisor_unmask_event(evtch); /* We'd better not have switched CPUs. */ KASSERT(ci == curcpu()); } /* * xen_timer_handler(cookie, frame) * * Periodic Xen timer event handler for NetBSD hardclock. Calls * to this may get delayed(*), so we run hardclock as many times as * we need to in order to cover the Xen system time that elapsed. * After that, re-arm the timer to run again at the next tick. * The cookie is the pointer to struct cpu_info. * * (*) n.b. comment in FreeBSD xen_timer.c says: * * Xen timers may fire up to 100us off */ static int xen_timer_handler(void *cookie, struct clockframe *frame) { const uint64_t ns_per_tick = NS_PER_TICK; struct cpu_info *ci = curcpu(); uint64_t last, now, delta, next; int error; KASSERT(cpu_intr_p()); KASSERT(cookie == ci); #if defined(XENPV) frame = NULL; /* We use values cached in curcpu() */ #endif /* * Find how many nanoseconds of Xen system time has elapsed * since the last hardclock tick. */ last = ci->ci_xen_hardclock_systime_ns; now = xen_global_systime_ns(); /* xxx use global, avoid skew! */ SDT_PROBE2(sdt, xen, hardclock, tick, last, now); if (__predict_false(now < last)) { SDT_PROBE2(sdt, xen, hardclock, systime__backward, last, now); #if XEN_CLOCK_DEBUG device_printf(ci->ci_dev, "xen systime ran backwards" " in hardclock %"PRIu64"ns\n", last - now); #endif ci->ci_xen_systime_backwards_hardclock_evcnt.ev_count++; /* * we've lost track of time. Just pretends that one * tick elapsed, and reset our idea of last tick. 
*/ ci->ci_xen_hardclock_systime_ns = last = now - ns_per_tick; } delta = now - last; if (__predict_false(delta >= 2*ns_per_tick)) { /* * Warn if we violate timecounter(9) contract: with a * k-bit timeocunter (here k = 32), and timecounter * frequency f (here f = 1 GHz), the maximum period * between hardclock calls is 2^k / f. */ if (delta > xen_timecounter.tc_counter_mask) { SDT_PROBE3(sdt, xen, hardclock, jump, last, now, delta/ns_per_tick); printf("WARNING: hardclock skipped %"PRIu64"ns" " (%"PRIu64" -> %"PRIu64")," " exceeding maximum of %"PRIu32"ns" " for timecounter(9)\n", last, now, delta, xen_timecounter.tc_counter_mask); ci->ci_xen_timecounter_jump_evcnt.ev_count++; } /* don't try to catch up more than one second at once */ if (delta > 1000000000UL) delta = 1000000000UL; } /* * Play hardclock catchup: run the hardclock timer as many * times as appears necessary based on how much time has * passed. * * XXX This happens extremely frequently -- ~50-80% of the HZ rate! */ while (delta >= ns_per_tick) { ci->ci_xen_hardclock_systime_ns += ns_per_tick; delta -= ns_per_tick; hardclock(frame); if (__predict_false(delta >= ns_per_tick)) { SDT_PROBE3(sdt, xen, hardclock, missed, last, now, delta); ci->ci_xen_missed_hardclock_evcnt.ev_count++; } } /* * Re-arm the timer. If it fails, it's probably because the desired * time is in the past, possibly because we're in the process of * catching up missed hardclock calls. In this case schedule a tick in * the near future. */ next = ci->ci_xen_hardclock_systime_ns + ns_per_tick; error = HYPERVISOR_set_timer_op(next); if (error) { /* xxx should there be an SDT_PROBE() here? So far this event is always zero... */ ci->ci_xen_next_hardclock_in_past_evcnt.ev_count++; next = now + ns_per_tick / 2; error = HYPERVISOR_set_timer_op(next); if (error) { panic("failed to re-arm Xen timer %d", error); } } /* Success! */ return 0; } /* * xen_initclocks() * * Initialize the Xen clocks on the current CPU. 
*/
void
xen_initclocks(void)
{
	struct cpu_info *ci = curcpu();

	/* If this is the primary CPU, do global initialization first. */
	if (ci == &cpu_info_primary) {
		/* Initialize the systemwide Xen timecounter. */
		tc_init(&xen_timecounter);
	}

	/* Attach the event counters. */
	evcnt_attach_dynamic(&ci->ci_xen_systime_backwards_hardclock_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "xen systime went backwards in hardclock");
	evcnt_attach_dynamic(&ci->ci_xen_missed_hardclock_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "xen missed hardclock");
	evcnt_attach_dynamic(&ci->ci_xen_tsc_backwards_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "xen tsc ran backwards");
	evcnt_attach_dynamic(&ci->ci_xen_global_ns_backwards_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "xen global_ns prevented from running backwards");
	evcnt_attach_dynamic(&ci->ci_xen_timecounter_jump_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "xen hardclock jumped past timecounter max");
	evcnt_attach_dynamic(&ci->ci_xen_next_hardclock_in_past_evcnt,
	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
	    "xen next hardclock time was in the past");

	/* Fire up the clocks. */
	xen_resumeclocks(ci);

#ifdef DOM0OPS
	/*
	 * If this is a privileged dom0, start pushing the wall
	 * clock time back to the Xen hypervisor.
	 */
	if (ci == &cpu_info_primary && xendomain_is_privileged())
		xen_timepush_init();
#endif
}

#ifdef DOM0OPS

/*
 * xen_timepush_init()
 *
 *	Initialize callout to periodically set Xen hypervisor's wall
 *	clock time.  If either sysctl node cannot be created the
 *	sysctl log is torn down and the callout is never scheduled.
 */
static void
xen_timepush_init(void)
{
	struct sysctllog *log = NULL;
	const struct sysctlnode *node = NULL;
	int error;

	/* Start periodically updating the hypervisor's wall clock time. */
	callout_init(&xen_timepush.ch, 0);
	callout_setfunc(&xen_timepush.ch, xen_timepush_intr, NULL);

	/* Pick a default frequency for timepush. (Linux uses 11 minutes) */
	xen_timepush.ticks = 530*hz + 3; /* avoid exact # of min/sec */

	/* Create machdep.xen node. */
	/* XXX Creation of the `machdep.xen' node should be elsewhere.  it is! see balloon.c, hypervisor.c, AND xen_machdep.c */
	error = sysctl_createv(&log, 0, NULL, &node,
	    CTLFLAG_PERMANENT, CTLTYPE_NODE, "xen",
	    SYSCTL_DESCR("Xen top level node"),
	    NULL, 0, NULL, 0,
	    CTL_MACHDEP, CTL_CREATE, CTL_EOL);
	if (error)
		goto fail;
	KASSERT(node != NULL);

	/* Create int machdep.xen.timepush_ticks knob. */
	error = sysctl_createv(&log, 0, &node, NULL,
	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE, CTLTYPE_INT, "timepush_ticks",
	    SYSCTL_DESCR("How often to update the hypervisor's time-of-day;"
		" 0 to disable"),
	    sysctl_xen_timepush, 0, &xen_timepush.ticks, 0,
	    CTL_CREATE, CTL_EOL);
	if (error)
		goto fail;

	/* Start the timepush callout. */
	callout_schedule(&xen_timepush.ch, xen_timepush.ticks);

	/* Success! */
	return;

fail:	sysctl_teardown(&log);
}

/*
 * xen_timepush_intr(cookie)
 *
 *	Callout interrupt handler to push NetBSD's idea of the wall
 *	clock time, usually synchronized with NTP, back to the Xen
 *	hypervisor.  Reschedules itself unless the period is zero.
 */
static void
xen_timepush_intr(void *cookie)
{

	resettodr();
	if (xen_timepush.ticks)
		callout_schedule(&xen_timepush.ch, xen_timepush.ticks);
}

/*
 * sysctl_xen_timepush(...)
 *
 *	Sysctl handler to set machdep.xen.timepush_ticks.  Rejects
 *	negative values with EINVAL; zero stops the callout, and any
 *	other changed value reschedules it with the new period.
 */
static int
sysctl_xen_timepush(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int ticks;
	int error;

	/* Let sysctl_lookup() operate on a local copy of the value. */
	ticks = xen_timepush.ticks;
	node = *rnode;
	node.sysctl_data = &ticks;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	if (ticks < 0)
		return EINVAL;

	if (ticks != xen_timepush.ticks) {
		xen_timepush.ticks = ticks;

		if (ticks == 0)
			callout_stop(&xen_timepush.ch);
		else
			callout_schedule(&xen_timepush.ch, ticks);
	}

	return 0;
}

#endif	/* DOM0OPS */

static int xen_rtc_get(struct todr_chip_handle *, struct timeval *);
static int xen_rtc_set(struct todr_chip_handle *, struct timeval *);
static void xen_wallclock_time(struct timespec *);

/*
 * xen time of day register:
 *
 *	Xen wall clock time, plus a Xen vCPU system time adjustment.
*/ static struct todr_chip_handle xen_todr_chip = { .todr_gettime = xen_rtc_get, .todr_settime = xen_rtc_set, }; /* * xen_startrtclock() * * Initialize the real-time clock from x86 machdep autoconf. */ void xen_startrtclock(void) { todr_attach(&xen_todr_chip); } /* * xen_rtc_get(todr, tv) * * Get the current real-time clock from the Xen wall clock time * and vCPU system time adjustment. */ static int xen_rtc_get(struct todr_chip_handle *todr, struct timeval *tvp) { struct timespec ts; xen_wallclock_time(&ts); TIMESPEC_TO_TIMEVAL(tvp, &ts); return 0; } /* * xen_rtc_set(todr, tv) * * Set the Xen wall clock time, if we can. */ static int xen_rtc_set(struct todr_chip_handle *todr, struct timeval *tvp) { #ifdef DOM0OPS if (xendomain_is_privileged()) { struct clock_ymdhms dt; xen_platform_op_t op; uint64_t systime_ns; int error; # ifdef XEN_CLOCK_DEBUG struct timespec ts; uint64_t nanouptime_ns; int64_t dt_ns; # endif /* Convert to ymdhms and set the x86 ISA RTC. */ clock_secs_to_ymdhms(tvp->tv_sec, &dt); rtc_set_ymdhms(NULL, &dt); /* * * Get the domain's system time, to pass to XENPF_settime * * Xen will subtract this from our current wall clock time to * get this domain's boot time, which Xen calls "wall clock * time" */ /* * xxx we could probably use nanouptime() here instead, but * there's a weird so-far unaccounted-for difference -- it's too * big, I think, to be the time between when Xen thinks it first * created the domain, which should be systime_ns, and when the * domain first started keeping time, which should be * nanouptime_ns. */ systime_ns = xen_global_systime_ns(); /* Set the hypervisor wall clock time. */ memset(&op, 0, sizeof(op)); /* * XXX NetBSD kernels are currently built with * __XEN_INTERFACE_VERSION__ = 0x0003020a (see * ../conf/std.xenversion), so we implicitly get * XENPF_settime32 and struct xenpf_settime32. 
* * However it looks like with HYPERVISOR_platform_op() it should * be possible to explicitly set op.interface_version to get the * newer API/ABI so long as we explicitly use the version- * specific opcode XENPF_settime64 and the new struct * xenpf_settime64. * * xxx adjust XEN_CLOCK_DEBUG appropriately as well.... */ # if 0 op.interface_version = XENPF_INTERFACE_VERSION; op.cmd = XENPF_settime64; op.u.settime64.mbz = 0; op.u.settime64.secs = tvp->tv_sec; op.u.settime64.nsecs = tvp->tv_usec * 1000; op.u.settime64.system_time = systime_ns; # else op.cmd = XENPF_settime; op.u.settime.secs = tvp->tv_sec; op.u.settime.nsecs = tvp->tv_usec * 1000; op.u.settime.system_time = systime_ns; # endif # ifdef XEN_CLOCK_DEBUG nanouptime(&ts); nanouptime_ns = ts.tv_sec * 1000000000ULL + ts.tv_nsec; dt_ns = (int64_t) op.u.settime.system_time - (int64_t) nanouptime_ns; printf("xen_rtc_set: Setting to %"PRIu32".%09"PRIu32" s at systime %"PRIu64 " ns (nanouptime: %"PRIu64" ns, diff(st-nt): %"PRId64".%09"PRId64" s)\n", op.u.settime.secs, op.u.settime.nsecs, op.u.settime.system_time, nanouptime_ns, (int64_t) (dt_ns / 1000000000LL), (int64_t) (dt_ns % 1000000000LL)); # endif error = HYPERVISOR_platform_op(&op); # ifdef XEN_CLOCK_DEBUG /* kern_todr.c:todr_save_systime() already reports with a printf */ if (error) { printf("xen_rtc_set: XENPF_settime failed (%d)\n", error); } # endif return error; } #endif /* XXX Should this fail if not on privileged dom0? */ return 0; } /* * xen_wallclock_time(tsp) * * Return the current low-resolution wall clock * time (boot time of the domain plus systime), in tsp. */ static void xen_wallclock_time(struct timespec *tsp) { struct xen_wallclock_ticket ticket; uint64_t systime_ns; int s = splsched(); /* make sure we won't be interrupted */ /* Read the last wall clock sample from the hypervisor. 
*/ do { xen_wallclock_enter(&ticket); tsp->tv_sec = HYPERVISOR_shared_info->wc_sec; tsp->tv_nsec = HYPERVISOR_shared_info->wc_nsec; } while (!xen_wallclock_exit(&ticket)); /* Get the global system time. */ systime_ns = xen_global_systime_ns(); splx(s); /* Add the system time to the wall clock time. */ systime_ns += tsp->tv_nsec; tsp->tv_sec += systime_ns / 1000000000ull; tsp->tv_nsec = systime_ns % 1000000000ull; } #ifdef XENPV /* * setstatclockrate(rate) * * Set the statclock to run at rate, in units of ticks per second. * * Currently Xen does not have a separate statclock, so this is a * noop; instad the statclock runs in hardclock. */ void setstatclockrate(int rate) { } #endif /* XENPV */
Attachment:
pgpGgiZUJ0fW9.pgp
Description: OpenPGP Digital Signature