/*	$NetBSD$	*/

/*-
 * Copyright (c) 2017 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Taylor R. Campbell.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "opt_xen.h"

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD$");

#include <sys/param.h>
#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/callout.h>
#include <sys/cpu.h>
#include <sys/device.h>
#include <sys/evcnt.h>
#include <sys/intr.h>
#include <sys/kernel.h>
#include <sys/lwp.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/timetc.h>

#include <dev/clock_subr.h>

#include <machine/cpu.h>
#include <machine/cpu_counter.h>
#include <machine/lock.h>

#include <xen/xen.h>
#include <xen/hypervisor.h>
#include <xen/evtchn.h>
#include <xen/xen-public/vcpu.h>

#include <x86/rtc.h>

#define NS_PER_TICK (1000000000ULL/hz)

static uint64_t	xen_vcputime_systime_ns(volatile struct vcpu_time_info *);
static uint64_t	xen_vcputime_raw_systime_ns(volatile struct vcpu_time_info *);
static void	xen_wallclock_time(struct timespec *);
static unsigned	xen_get_timecount(struct timecounter *);
static int	xen_rtc_get(struct todr_chip_handle *, struct timeval *);
static int	xen_rtc_set(struct todr_chip_handle *, struct timeval *);
static int	xen_timer_handler(void *, struct intrframe *);

/*
 * xen timecounter:
 *
 *	Xen vCPU system time, plus an adjustment with rdtsc.
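 *
 *	The counter value is the low 32 bits of a nanosecond count, so
 *	at a tc_frequency of 1 GHz it wraps around roughly every 2^32
 *	ns, about 4.3 seconds; the timecounter(9) machinery copes with
 *	the wraparound as long as the counter is read more often than
 *	that.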
 */
static struct timecounter xen_timecounter = {
	.tc_get_timecount = xen_get_timecount,
	.tc_poll_pps = NULL,
	.tc_counter_mask = ~0U,
	.tc_frequency = 1000000000ULL,	/* 1 GHz, i.e. units of nanoseconds */
	.tc_name = "xen_system_time",
	.tc_quality = 10000,
};

/*
 * xen time of day register:
 *
 *	Xen wall clock time, plus a Xen vCPU system time adjustment.
 */
static struct todr_chip_handle xen_todr_chip = {
	.todr_gettime = xen_rtc_get,
	.todr_settime = xen_rtc_set,
};

#ifdef DOM0OPS
/*
 * xen timepush state:
 *
 *	Callout to periodically set the Xen hypervisor's wall clock
 *	time, every sysctl-configurable number of NetBSD ticks.
 */
static struct {
	struct callout	ch;
	int		ticks;
} xen_timepush;

static void	xen_timepush_init(void);
static void	xen_timepush_intr(void *);
static int	sysctl_xen_timepush(SYSCTLFN_ARGS);
#endif

/*
 * startrtclock()
 *
 *	Initialize the real-time clock from x86 machdep autoconf.
 */
void
startrtclock(void)
{

	todr_attach(&xen_todr_chip);
}

/*
 * setstatclockrate(rate)
 *
 *	Set the statclock to run at rate, in units of ticks per second.
 *
 *	Currently Xen does not have a separate statclock, so this is a
 *	noop; instead the statclock runs in hardclock.
 */
void
setstatclockrate(int rate)
{
}

/*
 * idle_block()
 *
 *	Called from the idle loop when we have nothing to do but wait
 *	for an interrupt.
 */
void
idle_block(void)
{

	KASSERT(curcpu()->ci_ipending == 0);
	HYPERVISOR_block();
}

/*****************************************************************************
 ###################### XXX BEGIN KLUDGERIFIC X86ISMS ########################
 *****************************************************************************/

/*
 * xen_rdtsc_fence()
 *
 *	Wait for all prior instructions to complete before allowing any
 *	subsequent xen_rdtsc() to begin.  Subsequent instructions may
 *	be reordered to start earlier, however.
 *
 *	In principle, this could be a noop if xen_rdtsc did rdtscp.
 *	However, I'm not sure we can rely on rdtscp in Xen, if we can
 *	use it at all.  On Intel CPUs, according to the manuals, LFENCE
 *	is enough; on AMD CPUs, according to the Linux source code,
 *	MFENCE is needed.
 */
static inline void
xen_rdtsc_fence(void)
{

	x86_mfence();
}

/*
 * xen_rdtsc()
 *
 *	Read the tsc after all instructions before the prior
 *	xen_rdtsc_fence() have completed.
 *
 *	In principle, this could be an rdtscp, and xen_rdtsc_fence
 *	could be a noop.  However, I'm not sure we can rely on rdtscp
 *	in Xen, if we can use it at all.
 */
static inline uint64_t
xen_rdtsc(void)
{
	uint32_t lo, hi;

	/* rdtsc puts the low 32 bits in %eax and the high 32 in %edx.  */
	asm volatile("rdtsc" : "=a"(lo), "=d"(hi));

	/* Add this CPU's tsc skew so values are comparable across CPUs.  */
	return curcpu()->ci_data.cpu_cc_skew + (((uint64_t)hi << 32) | lo);
}

/*****************************************************************************
 ###################### XXX END KLUDGERIFIC X86ISMS ##########################
 *****************************************************************************/

/*
 * struct xen_vcputime_ticket
 *
 *	State for a vCPU read section, during which a caller may read
 *	from fields of a struct vcpu_time_info.  Caller must enter with
 *	xen_vcputime_enter, exit with xen_vcputime_exit, and be
 *	prepared to retry if xen_vcputime_exit fails.
 */
struct xen_vcputime_ticket {
	uint64_t	version;
};
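
/*
 * A vCPU time read section, as in xen_vcputime_systime_ns below,
 * takes the form:
 *
 *	struct xen_vcputime_ticket ticket;
 *
 *	do {
 *		xen_vcputime_enter(vt, &ticket);
 *		... read vt->system_time, vt->tsc_timestamp, &c. ...
 *	} while (!xen_vcputime_exit(vt, &ticket));
 */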

/*
 * xen_vcputime_enter(vt, tp)
 *
 *	Enter a vCPU time read section and store a ticket in *tp, which
 *	the caller must use with xen_vcputime_exit.
 */
static inline void
xen_vcputime_enter(volatile struct vcpu_time_info *vt,
    struct xen_vcputime_ticket *tp)
{

	KASSERT(vt == &curcpu()->ci_vcpu->time);

	/*
	 * An odd version number means the hypervisor is in the middle
	 * of updating this record; spin until it is even again.
	 */
	while (1 & (tp->version = vt->version))
		SPINLOCK_BACKOFF_HOOK;
	membar_consumer();
}

/*
 * xen_vcputime_exit(vt, tp)
 *
 *	Exit a vCPU time read section with the ticket in *tp from
 *	xen_vcputime_enter.  Return true on success, false if caller
 *	must retry.
 */
static inline bool
xen_vcputime_exit(volatile struct vcpu_time_info *vt,
    struct xen_vcputime_ticket *tp)
{

	KASSERT(vt == &curcpu()->ci_vcpu->time);

	membar_consumer();
	return tp->version == vt->version;
}

/*
 * xen_tsc_to_ns_delta(delta_tsc, mul_frac, shift)
 *
 *	Convert a difference in tsc units to a difference in
 *	nanoseconds given a multiplier and shift for the unit
 *	conversion.
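 *
 *	For example, with a hypothetical 2 GHz tsc, Xen might provide
 *	tsc_shift = 0 and tsc_to_system_mul = 2^31, so that each tsc
 *	tick counts as 2^31/2^32 = 1/2 nanosecond; Xen picks whatever
 *	multiplier and shift match the tsc frequency it measured.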
 */
static inline uint64_t
xen_tsc_to_ns_delta(uint64_t delta_tsc, uint32_t tsc_to_system_mul,
    int8_t tsc_shift)
{
	uint32_t delta_tsc_hi, delta_tsc_lo;

	if (tsc_shift < 0)
		delta_tsc >>= -tsc_shift;
	else
		delta_tsc <<= tsc_shift;

	delta_tsc_hi = delta_tsc >> 32;
	delta_tsc_lo = delta_tsc & 0xffffffffUL;

	/* d*m/2^32 = (2^32 d_h + d_l)*m/2^32 = d_h*m + (d_l*m)/2^32 */
	return ((uint64_t)delta_tsc_hi * tsc_to_system_mul) +
	    (((uint64_t)delta_tsc_lo * tsc_to_system_mul) >> 32);
}

/*
 * xen_vcputime_systime_ns(vt)
 *
 *	Return a snapshot of the Xen system time plus an adjustment
 *	from the tsc, in units of nanoseconds.
 */
static uint64_t
xen_vcputime_systime_ns(volatile struct vcpu_time_info *vt)
{
	struct xen_vcputime_ticket ticket;
	uint64_t systime_ns, cached_tsc, fresh_tsc, delta_tsc, delta_ns;
	uint64_t cached_delta_tsc, cached_delta_ns;

	/* We'd better be bound to the CPU in _some_ way.  */
	KASSERT(cpu_intr_p() ||
	    cpu_softintr_p() ||
	    kpreempt_disabled() ||
	    (curlwp->l_flag & LP_BOUND));

	KASSERT(vt == &curcpu()->ci_vcpu->time);

	/*
	 * Repeatedly try to read the system time, corresponding tsc
	 * timestamp, and tsc frequency until we get a consistent view.
	 */
	do {
		xen_vcputime_enter(vt, &ticket);
		systime_ns = vt->system_time;
		cached_tsc = vt->tsc_timestamp;
		fresh_tsc = xen_rdtsc();
		delta_tsc = __predict_false(fresh_tsc < cached_tsc) ? 0 :
		    fresh_tsc - cached_tsc;
		delta_ns = xen_tsc_to_ns_delta(delta_tsc,
		    vt->tsc_to_system_mul, vt->tsc_shift);
		cached_delta_tsc =
		    __predict_false(cached_tsc <
			curcpu()->ci_xen_last_tsc_timestamp)
		    ? curcpu()->ci_xen_last_tsc_timestamp - cached_tsc : 0;
		cached_delta_ns = xen_tsc_to_ns_delta(cached_delta_tsc,
		    vt->tsc_to_system_mul, vt->tsc_shift);
	} while (!xen_vcputime_exit(vt, &ticket));

	/*
	 * Notify the console if the Xen hypervisor's raw system_time
	 * ran backwards.  This shouldn't happen because the Xen
	 * hypervisor is supposed to be smarter than that.
	 */
	if (__predict_false(systime_ns <
		curcpu()->ci_xen_last_raw_systime)) {
		printf("xen vcpu_time_info system_time ran backwards"
		    " %"PRIu64"ns\n",
		    curcpu()->ci_xen_last_raw_systime - systime_ns);
	} else if (__predict_false((systime_ns -
		    curcpu()->ci_xen_last_raw_systime)
		< cached_delta_ns)) {
		printf("xen system time advanced but tsc retreated more:"
		    " systime delta %"PRIu64"ns <"
		    " tsc timestamp delta %"PRIu64" ="
		    " %"PRIu64"ns\n",
		    systime_ns - curcpu()->ci_xen_last_raw_systime,
		    cached_delta_tsc,
		    cached_delta_ns);
	}

	/*
	 * Notify the console if the Xen tsc timestamp ran backwards
	 * while the system time remained the same.  This shouldn't
	 * happen because the Xen hypervisor is supposed to be smarter
	 * than that.
	 *
	 * XXX But this is also a rather rigid criterion.  What if the
	 * systime advances just a wee bit, but the tsc timestamp
	 * retreats a lot?
	 */
	if (__predict_false(systime_ns == curcpu()->ci_xen_last_raw_systime &&
		cached_tsc < curcpu()->ci_xen_last_tsc_timestamp)) {
		printf("xen vcpu_time_info tsc_timestamp ran backwards"
		    " %"PRIu64"\n",
		    curcpu()->ci_xen_last_tsc_timestamp - cached_tsc);
	}

	/*
	 * Notify the console if the CPU's tsc ran backwards.  This
	 * shouldn't happen because the CPU tsc isn't supposed to
	 * change, although maybe in cases of migration it will.
	 */
	if (__predict_false(fresh_tsc < curcpu()->ci_xen_last_tsc)) {
		printf("xen cpu tsc ran backwards %"PRIu64"\n",
		    curcpu()->ci_xen_last_tsc - fresh_tsc);
	}

	/*
	 * Notify the console if the CPU's tsc appeared to run behind
	 * Xen's idea of the tsc.  This shouldn't happen because the
	 * Xen hypervisor is supposed to have read the tsc _before_
	 * writing to the vcpu_time_info page, _before_ we read the
	 * tsc.  Further, if we switched pCPUs after reading the tsc
	 * timestamp but before reading the CPU's tsc, the hypervisor
	 * had better notify us by updating the version too and forcing
	 * us to retry the vCPU time read.
	 */
	if (__predict_false(fresh_tsc < cached_tsc)) {
		printf("xen cpu tsc %"PRIu64
		    " ran backwards from timestamp %"PRIu64
		    " by %"PRIu64"\n",
		    fresh_tsc, cached_tsc, cached_tsc - fresh_tsc);
	}

	/*
	 * Notify the console if the delta computation yielded a
	 * negative.
	 */
	if (__predict_false((int64_t)delta_ns < 0)) {
		printf("xen tsc delta in ns went negative: %"PRId64"\n",
		    (int64_t)delta_ns);
	}

	/*
	 * Notify the console if the addition will wrap around.
	 */
	if (__predict_false((systime_ns + delta_ns) < systime_ns)) {
		printf("xen systime + delta wrapped around:"
		    " %"PRIu64" + %"PRIu64" = %"PRIu64"\n",
		    systime_ns, delta_ns, systime_ns + delta_ns);
	}

	/*
	 * There remains one possibility we do NOT detect here: the Xen
	 * raw system time advances by some d_0 > 0 while the computed
	 * tsc delta shrinks by more than d_0 since the last call, so
	 * that systime_ns + delta_ns still goes backwards even though
	 * none of the checks above fires.
	 */

	/* Remember the various timestamps.  */
	curcpu()->ci_xen_last_raw_systime = systime_ns;
	curcpu()->ci_xen_last_tsc_timestamp = cached_tsc;
	curcpu()->ci_xen_last_tsc = fresh_tsc;

	KASSERT(vt == &curcpu()->ci_vcpu->time);

	/* Add the delta to the raw system, in nanoseconds.  */
	return systime_ns + delta_ns;
}

/*
 * xen_vcputime_raw_systime_ns(vt)
 *
 *	Return a snapshot of the current Xen system time to the
 *	resolution of the Xen hypervisor tick, in units of nanoseconds.
 */
static uint64_t
xen_vcputime_raw_systime_ns(volatile struct vcpu_time_info *vt)
{
	struct xen_vcputime_ticket ticket;
	uint64_t systime_ns;

	KASSERT(vt == &curcpu()->ci_vcpu->time);

	do {
		xen_vcputime_enter(vt, &ticket);
		systime_ns = vt->system_time;
	} while (!xen_vcputime_exit(vt, &ticket));

	KASSERT(vt == &curcpu()->ci_vcpu->time);

	return systime_ns;
}

/*
 * struct xen_wallclock_ticket
 *
 *	State for a wall clock read section, during which a caller may
 *	read from the wall clock fields of HYPERVISOR_shared_info.
 *	Caller must enter with xen_wallclock_enter, exit with
 *	xen_wallclock_exit, and be prepared to retry if
 *	xen_wallclock_exit fails.
 */
struct xen_wallclock_ticket {
	uint32_t version;
};
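
/*
 * A wall clock read section, as in xen_wallclock_time below, takes
 * the form:
 *
 *	struct xen_wallclock_ticket ticket;
 *
 *	do {
 *		xen_wallclock_enter(&ticket);
 *		... read HYPERVISOR_shared_info->wc_sec, wc_nsec ...
 *	} while (!xen_wallclock_exit(&ticket));
 */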

/*
 * xen_wallclock_enter(tp)
 *
 *	Enter a wall clock read section and store a ticket in *tp,
 *	which the caller must use with xen_wallclock_exit.  Caller must
 *	be prepared to retry if xen_wallclock_exit fails.  During a
 *	wall clock read section, caller may read from the wall clock
 *	fields of HYPERVISOR_shared_info.
 */
static inline void
xen_wallclock_enter(struct xen_wallclock_ticket *tp)
{

	/* An odd wc_version means an update is in progress; wait it out.  */
	while (1 & (tp->version = HYPERVISOR_shared_info->wc_version))
		SPINLOCK_BACKOFF_HOOK;
	membar_consumer();
}

/*
 * xen_wallclock_exit(tp)
 *
 *	Exit a wall clock read section with the ticket in *tp from
 *	xen_wallclock_enter.  Return true on success, false if caller
 *	must retry.
 */
static inline bool
xen_wallclock_exit(struct xen_wallclock_ticket *tp)
{

	membar_consumer();
	return tp->version == HYPERVISOR_shared_info->wc_version;
}

/*
 * xen_wallclock_time(tsp)
 *
 *	Return a snapshot of the current low-resolution wall clock
 *	time, as reported by the hypervisor, in tsp.
 */
static void
xen_wallclock_time(struct timespec *tsp)
{
	struct xen_wallclock_ticket ticket;
	uint64_t systime_ns;
	int bound;

	/* Prevent switching CPUs while we read the vCPU system time.  */
	bound = curlwp_bind();

	/* Get the vCPU system time.  */
	systime_ns = xen_vcputime_systime_ns(&curcpu()->ci_vcpu->time);

	/* Read the last wall clock sample from the hypervisor. */
	do {
		xen_wallclock_enter(&ticket);
		tsp->tv_sec = HYPERVISOR_shared_info->wc_sec;
		tsp->tv_nsec = HYPERVISOR_shared_info->wc_nsec;
	} while (!xen_wallclock_exit(&ticket));

	/* All done on the CPU.  */
	curlwp_bindx(bound);

	/* Add the system time to the wall clock time.  */
	systime_ns += tsp->tv_nsec;
	tsp->tv_sec += systime_ns / 1000000000ull;
	tsp->tv_nsec = systime_ns % 1000000000ull;
}

/*
 * xen_get_timecount(tc)
 *
 *	Return the Xen timecount, vCPU system time plus rdtsc
 *	adjustment.
 */
static unsigned
xen_get_timecount(struct timecounter *tc)
{
	struct cpu_info *ci;
	uint64_t ncsw;
	uint64_t last_timecount;
	uint64_t systime_ns, systime_delta_ns;
	int s;

	KASSERT(tc == &xen_timecounter);

	/*
	 * Bind to the current CPU and lock the systime delta by
	 * blocking xen_timer_handler, which adjusts the systime delta
	 * to fake incremental hardclock advances.
	 */
	s = splclock();
	ci = curcpu();
	ncsw = curlwp->l_ncsw;
	systime_ns = xen_vcputime_systime_ns(&curcpu()->ci_vcpu->time);
	systime_delta_ns = curcpu()->ci_xen_systime_delta_ns;
	last_timecount = curcpu()->ci_xen_last_timecount;
	curcpu()->ci_xen_last_timecount = systime_ns - systime_delta_ns;
	KASSERT(ncsw == curlwp->l_ncsw);
	KASSERT(ci == curcpu());
	splx(s);

	if (systime_ns - systime_delta_ns < last_timecount) {
		printf("xen 64-bit timecount ran backwards: %"PRIu64"ns"
		    " (true %"PRIu64", delta %"PRIu64")\n",
		    last_timecount - (systime_ns - systime_delta_ns),
		    systime_ns, systime_delta_ns);
	}

	return (unsigned)(systime_ns - systime_delta_ns);
}

/*
 * xen_rtc_get(todr, tv)
 *
 *	Get the current real-time clock from the Xen wall clock time
 *	and vCPU system time adjustment.
 */
static int
xen_rtc_get(struct todr_chip_handle *todr, struct timeval *tvp)
{
	struct timespec ts;

	xen_wallclock_time(&ts);
	TIMESPEC_TO_TIMEVAL(tvp, &ts);

	return 0;
}

/*
 * xen_rtc_set(todr, tv)
 *
 *	Set the Xen wall clock time, if we can.
 */
static int
xen_rtc_set(struct todr_chip_handle *todr, struct timeval *tvp)
{
#ifdef DOM0OPS
	struct clock_ymdhms dt;
#if __XEN_INTERFACE_VERSION__ < 0x00030204
	dom0_op_t op;
#else
	xen_platform_op_t op;
#endif
	uint64_t systime_ns;
	int bound;

	if (xendomain_is_privileged()) {
		/* Convert to ymdhms and set the x86 ISA RTC.  */
		clock_secs_to_ymdhms(tvp->tv_sec, &dt);
		rtc_set_ymdhms(NULL, &dt);

		/* Get the system time so we can preserve it.  */
		bound = curlwp_bind();
		systime_ns = xen_vcputime_systime_ns(&curcpu()->ci_vcpu->time);
		curlwp_bindx(bound);

		/* Set the hypervisor wall clock time.  */
		op.u.settime.secs = tvp->tv_sec;
		op.u.settime.nsecs = tvp->tv_usec * 1000;
		op.u.settime.system_time = systime_ns;
#if __XEN_INTERFACE_VERSION__ < 0x00030204
		op.cmd = DOM0_SETTIME;
		return HYPERVISOR_dom0_op(&op);
#else
		op.cmd = XENPF_settime;
		return HYPERVISOR_platform_op(&op);
#endif
	}
#endif

	/* XXX Should this fail if not on privileged dom0?  */
	return 0;
}

/*
 * xen_delay(n)
 *
 *	Wait approximately n microseconds.
 */
void
xen_delay(unsigned n)
{
	struct cpu_info *ci;
	int bound;

	/* Bind to the CPU so we don't compare tsc on different CPUs.  */
	bound = curlwp_bind();
	ci = curcpu();

	if (n < 500000) {
		/*
		 * Xen system time is not precise enough for short
		 * delays, so use the tsc instead.
		 */
		uint64_t start, end;

		/*
		 * Get the start and end times.
		 *
		 * XXX cpu_frequency(ci) can easily get stale, from my
		 * cursory read of cpu_get_tsc_freq.
		 */
		start = xen_rdtsc();
		end = start + ((uint64_t)n * cpu_frequency(ci))/1000000;

		/* If the end time wrapped around, wait for us to wrap.  */
		if (end < start) {
			do {
				xen_rdtsc_fence();
			} while (start < xen_rdtsc());
		}

		/* Wait until we've passed the end. */
		do {
			xen_rdtsc_fence();
		} while (xen_rdtsc() < end);
	} else {
		/* Use the Xen system time.  */
		volatile struct vcpu_time_info *t = &ci->ci_vcpu->time;
		uint64_t start, end;

		/*
		 * Get the start and end times.
		 *
		 * Nanoseconds since boot takes centuries to overflow,
		 * so no need to worry about wrapping.  We do not
		 * bother with tsc adjustment for delays this long.
		 *
		 * XXX Do we ever need to issue delays this long?  That
		 * seems likely to be a bug.
		 */
		start = xen_vcputime_raw_systime_ns(t);
		end = start + 1000*(uint64_t)n;

		/* Wait until the system time has passed the end.  */
		do {
			HYPERVISOR_yield();
		} while (xen_vcputime_raw_systime_ns(t) < end);
	}

	/* Unbind from the CPU if we weren't already bound.  */
	curlwp_bindx(bound);
}

/*
 * xen_suspendclocks(ci)
 *
 *	Stop handling the Xen timer event on the CPU of ci.  Caller
 *	must be running on and bound to ci's CPU.
 *
 *	Actually, caller must have kpreemption disabled, because that's
 *	easier to assert at the moment.
 */
void
xen_suspendclocks(struct cpu_info *ci)
{
	int evtch;

	KASSERT(ci == curcpu());
	KASSERT(kpreempt_disabled());

	evtch = unbind_virq_from_evtch(VIRQ_TIMER);
	KASSERT(evtch != -1);

	hypervisor_mask_event(evtch);
	event_remove_handler(evtch, (int (*)(void *))xen_timer_handler, ci);

	aprint_verbose("Xen clock: removed event channel %d\n", evtch);
}

/*
 * xen_resumeclocks(ci)
 *
 *	Start handling the Xen timer event on the CPU of ci.  Caller
 *	must be running on and bound to ci's CPU.
 *
 *	Actually, caller must have kpreemption disabled, because that's
 *	easier to assert at the moment.
 */
void
xen_resumeclocks(struct cpu_info *ci)
{
	int evtch;

	KASSERT(ci == curcpu());
	KASSERT(kpreempt_disabled());

	evtch = bind_virq_to_evtch(VIRQ_TIMER);
	KASSERT(evtch != -1);

	/* XXX sketchy function pointer cast */
	event_set_handler(evtch, (int (*)(void *))xen_timer_handler,
	    ci, IPL_CLOCK, "clock");
	hypervisor_enable_event(evtch);

	aprint_verbose("Xen clock: using event channel %d\n", evtch);
}

/*
 * xen_timer_handler(cookie, regs)
 *
 *	Periodic Xen timer event handler for NetBSD hardclock.  Calls
 *	to this may get delayed, so we run hardclock as many times as
 *	we need to in order to cover the Xen system time that elapsed.
 *	After that, re-arm the timer to run again at the next tick.
 */
static int
xen_timer_handler(void *cookie, struct intrframe *regs)
{
	struct cpu_info *ci = cookie;
	uint64_t last, now, delta, next;
	int error;

	KASSERT(cpu_intr_p());
	KASSERT(ci == curcpu());

again:
	/*
	 * Find out how many nanoseconds of Xen system time have
	 * elapsed since the last hardclock tick.
	 */
	last = ci->ci_xen_hardclock_systime_ns;
	now = xen_vcputime_systime_ns(&ci->ci_vcpu->time);
	if (now < last) {
		printf("xen systime ran backwards in hardclock %"PRIu64"ns\n",
		    last - now);
		now = last;
	}
	delta = now - last;

	/*
	 * XXX For now we run hardclock at most once per timer event
	 * and resynchronize our idea of the hardclock time to now,
	 * rather than run the catch-up loop in the disabled branch
	 * below that ticks hardclock once for every NS_PER_TICK that
	 * elapsed.
	 */
#if 1
	if (delta >= NS_PER_TICK) {
		ci->ci_xen_hardclock_systime_ns = now;
		ci->ci_xen_systime_delta_ns = 0;
		hardclock((struct clockframe *)regs);
		ci->ci_xen_hardclock_evcnt.ev_count++;
	}
#else
	/*
	 * Run the hardclock timer as many times as necessary.  We
	 * maintain the charade that the Xen system time is as if we
	 * ticked every NS_PER_TICK nanoseconds exactly, by setting
	 * ci->ci_xen_systime_delta_ns to the current delta from the
	 * theoretical hardclock tick system time and the current
	 * system time.
	 */
	while (delta >= NS_PER_TICK) {
		ci->ci_xen_hardclock_systime_ns += NS_PER_TICK;
		ci->ci_xen_systime_delta_ns = (delta -= NS_PER_TICK);
		hardclock((struct clockframe *)regs);
		ci->ci_xen_hardclock_evcnt.ev_count++;
	}
#endif

	/*
	 * Re-arm the timer.  If it fails, it's probably because the
	 * time is in the past, so update our idea of what the Xen
	 * system time is and try again.
	 */
	next = ci->ci_xen_hardclock_systime_ns + NS_PER_TICK;
	error = HYPERVISOR_set_timer_op(next);
	if (error)
		goto again;

	/*
	 * Done with the charade about the Xen system time.  Restore
	 * the Xen system time delta to zero.
	 */
	ci->ci_xen_systime_delta_ns = 0;

	/* Success!  */
	return 0;
}

/*
 * xen_initclocks()
 *
 *	Initialize the Xen clocks on the current CPU.
 */
void
xen_initclocks(void)
{
	struct cpu_info *ci = curcpu();

	/* If this is the primary CPU, do global initialization first.  */
	if (ci == &cpu_info_primary) {
		/* Initialize the systemwide Xen timecounter.  */
		tc_init(&xen_timecounter);

#ifdef DOM0OPS
		/*
		 * If this is a privileged dom0, start pushing the wall
		 * clock time back to the Xen hypervisor.
		 */
		if (xendomain_is_privileged())
			xen_timepush_init();
#endif
	}

	/* Pretend the last hardclock happened right now.  */
	ci->ci_xen_hardclock_systime_ns =
	    xen_vcputime_systime_ns(&ci->ci_vcpu->time);
	ci->ci_xen_systime_delta_ns = 0;

	/* Attach the hardclock event counter.  */
	evcnt_attach_dynamic(&ci->ci_xen_hardclock_evcnt, EVCNT_TYPE_INTR,
	    NULL, device_xname(ci->ci_dev), "hardclock");

	/* Fire up the clocks.  */
	xen_resumeclocks(ci);
}

#ifdef DOM0OPS

/*
 * xen_timepush_init()
 *
 *	Initialize callout to periodically set Xen hypervisor's wall
 *	clock time.
 */
static void
xen_timepush_init(void)
{
	struct sysctllog *log = NULL;
	const struct sysctlnode *node = NULL;
	int error;

	/* Start periodically updating the hypervisor's wall clock time.  */
	callout_init(&xen_timepush.ch, 0);
	callout_setfunc(&xen_timepush.ch, xen_timepush_intr, NULL);

	/* Pick a default frequency for timepush.  */
	xen_timepush.ticks = 53*hz + 3; /* avoid exact # of min/sec */

	/* Create machdep.xen node.  */
	/* XXX Creation of the `machdep.xen' node should be elsewhere.  */
	error = sysctl_createv(&log, 0, NULL, &node, 0,
	    CTLTYPE_NODE, "xen",
	    SYSCTL_DESCR("Xen top level node"),
	    NULL, 0, NULL, 0,
	    CTL_MACHDEP, CTL_CREATE, CTL_EOL);
	if (error)
		goto fail;
	KASSERT(node != NULL);

	/* Create int machdep.xen.timepush_ticks knob.  */
	error = sysctl_createv(&log, 0, &node, NULL, CTLFLAG_READWRITE,
	    CTLTYPE_INT, "timepush_ticks",
	    SYSCTL_DESCR("How often to update the hypervisor's time-of-day;"
		" 0 to disable"),
	    sysctl_xen_timepush, 0, &xen_timepush.ticks, 0,
	    CTL_CREATE, CTL_EOL);
	if (error)
		goto fail;

	/* Start the timepush callout.  */
	callout_schedule(&xen_timepush.ch, xen_timepush.ticks);

	/* Success!  */
	return;

fail:	sysctl_teardown(&log);
}

/*
 * xen_timepush_intr(cookie)
 *
 *	Callout interrupt handler to push NetBSD's idea of the wall
 *	clock time, usually synchronized with NTP, back to the Xen
 *	hypervisor.
 */
static void
xen_timepush_intr(void *cookie)
{

	resettodr();
	if (xen_timepush.ticks)
		callout_schedule(&xen_timepush.ch, xen_timepush.ticks);
}

/*
 * sysctl_xen_timepush(...)
 *
 *	Sysctl handler to set machdep.xen.timepush_ticks.
 */
static int
sysctl_xen_timepush(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int ticks;
	int error;

	ticks = xen_timepush.ticks;
	node = *rnode;
	node.sysctl_data = &ticks;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	if (ticks < 0)
		return EINVAL;

	if (ticks != xen_timepush.ticks) {
		xen_timepush.ticks = ticks;

		if (ticks == 0)
			callout_stop(&xen_timepush.ch);
		else
			callout_schedule(&xen_timepush.ch, ticks);
	}

	return 0;
}

#endif	/* DOM0OPS */