From: jbeulich@novell.com
Subject: replace Xen's custom time handling with one using the GENERIC_CLOCKEVENTS infrastructure
Patch-mainline: n/a

Once validated this could be merged into the 2.6.?? patch.

--- head-2011-03-11.orig/arch/x86/Kconfig	2011-02-01 16:43:32.000000000 +0100
+++ head-2011-03-11/arch/x86/Kconfig	2011-02-02 15:09:52.000000000 +0100
@@ -90,7 +90,6 @@ config CLOCKSOURCE_WATCHDOG

 config GENERIC_CLOCKEVENTS
 	def_bool y
-	depends on !XEN

 config GENERIC_CLOCKEVENTS_BROADCAST
 	def_bool y
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/hypervisor.h	2011-02-01 15:03:10.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/hypervisor.h	2011-02-02 15:09:52.000000000 +0100
@@ -74,7 +74,6 @@ extern start_info_t *xen_start_info;
 #define init_hypervisor_platform() init_hypervisor(&boot_cpu_data)

 DECLARE_PER_CPU(struct vcpu_runstate_info, runstate);
-struct vcpu_runstate_info *setup_runstate_area(unsigned int cpu);
 #define vcpu_running(cpu) (per_cpu(runstate.state, cpu) == RUNSTATE_running)

 /* arch/xen/kernel/evtchn.c */
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/irqflags.h	2011-02-01 15:09:47.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/irqflags.h	2011-02-02 15:09:52.000000000 +0100
@@ -4,6 +4,8 @@
 #include

 #ifndef __ASSEMBLY__
+#include
+#include
 /*
  * The use of 'barrier' in the following reflects their use as local-lock
  * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
@@ -43,10 +45,6 @@ do { \
 	force_evtchn_callback(); \
 } while (0)

-void xen_safe_halt(void);
-
-void xen_halt(void);
-
 #define arch_local_save_flags() xen_save_fl()

 #define arch_local_irq_restore(flags) xen_restore_fl(flags)
@@ -59,19 +57,16 @@ void xen_halt(void);
  * Used in the idle loop; sti takes one instruction cycle
  * to complete:
  */
-static inline void arch_safe_halt(void)
-{
-	xen_safe_halt();
-}
+#define arch_safe_halt HYPERVISOR_block

 /*
  * Used when interrupts are already enabled or to
  * shutdown the processor:
  */
-static inline void halt(void)
-{
-	xen_halt();
-}
+#define halt() VOID(irqs_disabled() \
+		? HYPERVISOR_vcpu_op(VCPUOP_down, \
+			smp_processor_id(), NULL) \
+		: 0)

 /*
  * For spinlocks, etc:
--- head-2011-03-11.orig/arch/x86/kernel/time-xen.c	2010-10-05 16:57:34.000000000 +0200
+++ head-2011-03-11/arch/x86/kernel/time-xen.c	2011-02-02 15:09:52.000000000 +0100
@@ -25,7 +25,7 @@
 #include
 #include

-#include
+#include

 #include
 #include
@@ -55,13 +55,7 @@ static DEFINE_PER_CPU(struct shadow_time
 static struct timespec shadow_tv;
 static u32 shadow_tv_version;

-/* Keep track of last time we did processing/updating of jiffies and xtime. */
-static u64 processed_system_time; /* System time (ns) at last processing. */
-static DEFINE_PER_CPU(u64, processed_system_time);
-
-/* How much CPU time was spent blocked and how much was 'stolen'? */
-static DEFINE_PER_CPU(u64, processed_stolen_time);
-static DEFINE_PER_CPU(u64, processed_blocked_time);
+static u64 jiffies_bias, system_time_bias;

 /* Current runstate of each CPU (updated automatically by the hypervisor). */
 DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
@@ -69,16 +63,6 @@ DEFINE_PER_CPU(struct vcpu_runstate_info
 /* Must be signed, as it's compared with s64 quantities which can be -ve. */
 #define NS_PER_TICK (1000000000LL/HZ)

-static struct vcpu_set_periodic_timer xen_set_periodic_tick = {
-	.period_ns = NS_PER_TICK
-};
-
-/*
- * GCC 4.3 can turn loops over an induction variable into division. We do
- * not support arbitrary 64-bit division, and so must break the induction.
- */
-#define clobber_induction_variable(v) asm ( "" : "+r" (v) )
-
 /* Does this guest OS track Xen time, or set its wall clock independently? */
 static int independent_wallclock = 0;
 static int __init __independent_wallclock(char *str)
@@ -185,6 +169,11 @@ static u64 get_nsec_offset(struct shadow
 	return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
 }

+static inline u64 processed_system_time(void)
+{
+	return (jiffies_64 - jiffies_bias) * NS_PER_TICK + system_time_bias;
+}
+
 static void update_wallclock(void)
 {
 	static DEFINE_MUTEX(uwc_mutex);
@@ -201,7 +190,7 @@ static void update_wallclock(void)
 	} while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version));

 	if (!independent_wallclock) {
-		u64 tmp = processed_system_time;
+		u64 tmp = processed_system_time();
 		long nsec = do_div(tmp, NSEC_PER_SEC);
 		struct timespec tv;

@@ -219,6 +208,13 @@ static void _update_wallclock(struct wor
 }
 static DECLARE_WORK(update_wallclock_work, _update_wallclock);

+void xen_check_wallclock_update(void)
+{
+	if (shadow_tv_version != HYPERVISOR_shared_info->wc_version
+	    && keventd_up())
+		schedule_work(&update_wallclock_work);
+}
+
 /*
  * Reads a consistent set of time-base values from Xen, into a shadow data
  * area.
@@ -285,7 +281,7 @@ static void sync_xen_wallclock(unsigned
 	op.cmd = XENPF_settime;
 	op.u.settime.secs = now.tv_sec;
 	op.u.settime.nsecs = now.tv_nsec;
-	op.u.settime.system_time = processed_system_time;
+	op.u.settime.system_time = processed_system_time();
 	WARN_ON(HYPERVISOR_platform_op(&op));

 	update_wallclock();
@@ -294,7 +290,7 @@ static void sync_xen_wallclock(unsigned
 	mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
 }

-static unsigned long long local_clock(void)
+unsigned long long xen_local_clock(void)
 {
 	unsigned int cpu = get_cpu();
 	struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
@@ -318,7 +314,7 @@ static unsigned long long local_clock(vo
 /*
  * Runstate accounting
  */
-static void get_runstate_snapshot(struct vcpu_runstate_info *res)
+void get_runstate_snapshot(struct vcpu_runstate_info *res)
 {
 	u64 state_time;
 	struct vcpu_runstate_info *state;
@@ -354,7 +350,7 @@ unsigned long long sched_clock(void)
 	 */
 	preempt_disable();

-	now = local_clock();
+	now = xen_local_clock();

 	get_runstate_snapshot(&runstate);

@@ -397,138 +393,6 @@ unsigned long profile_pc(struct pt_regs
 }
 EXPORT_SYMBOL(profile_pc);

-/*
- * Default timer interrupt handler
- */
-static irqreturn_t timer_interrupt(int irq, void *dev_id)
-{
-	s64 delta, delta_cpu, stolen, blocked;
-	unsigned int i, cpu = smp_processor_id();
-	struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
-	struct vcpu_runstate_info runstate;
-
-	/* Keep nmi watchdog up to date */
-	inc_irq_stat(irq0_irqs);
-
-	/*
-	 * Here we are in the timer irq handler. We just have irqs locally
-	 * disabled but we don't know if the timer_bh is running on the other
-	 * CPU. We need to avoid to SMP race with it. NOTE: we don' t need
-	 * the irq version of write_lock because as just said we have irq
-	 * locally disabled. -arca
-	 */
-	write_seqlock(&xtime_lock);
-
-	do {
-		get_time_values_from_xen(cpu);
-
-		/* Obtain a consistent snapshot of elapsed wallclock cycles. */
-		delta = delta_cpu =
-			shadow->system_timestamp + get_nsec_offset(shadow);
-		delta -= processed_system_time;
-		delta_cpu -= per_cpu(processed_system_time, cpu);
-
-		get_runstate_snapshot(&runstate);
-	} while (!time_values_up_to_date(cpu));
-
-	if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
-	     unlikely(delta_cpu < -(s64)permitted_clock_jitter))
-	    && printk_ratelimit()) {
-		printk("Timer ISR/%u: Time went backwards: "
-		       "delta=%lld delta_cpu=%lld shadow=%lld "
-		       "off=%lld processed=%lld cpu_processed=%lld\n",
-		       cpu, delta, delta_cpu, shadow->system_timestamp,
-		       (s64)get_nsec_offset(shadow),
-		       processed_system_time,
-		       per_cpu(processed_system_time, cpu));
-		for (i = 0; i < num_online_cpus(); i++)
-			printk(" %d: %lld\n", i,
-			       per_cpu(processed_system_time, i));
-	}
-
-	/* System-wide jiffy work. */
-	if (delta >= NS_PER_TICK) {
-		do_div(delta, NS_PER_TICK);
-		processed_system_time += delta * NS_PER_TICK;
-		while (delta > HZ) {
-			clobber_induction_variable(delta);
-			do_timer(HZ);
-			delta -= HZ;
-		}
-		do_timer(delta);
-	}
-
-	write_sequnlock(&xtime_lock);
-
-	if (shadow_tv_version != HYPERVISOR_shared_info->wc_version
-	    && keventd_up())
-		schedule_work(&update_wallclock_work);
-
-	/*
-	 * Account stolen ticks.
-	 * ensures that the ticks are accounted as stolen.
-	 */
-	stolen = runstate.time[RUNSTATE_runnable]
-		 + runstate.time[RUNSTATE_offline]
-		 - per_cpu(processed_stolen_time, cpu);
-	if ((stolen > 0) && (delta_cpu > 0)) {
-		delta_cpu -= stolen;
-		if (unlikely(delta_cpu < 0))
-			stolen += delta_cpu; /* clamp local-time progress */
-		do_div(stolen, NS_PER_TICK);
-		per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
-		per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
-		account_steal_ticks(stolen);
-	}
-
-	/*
-	 * Account blocked ticks.
-	 * ensures that the ticks are accounted as idle/wait.
-	 */
-	blocked = runstate.time[RUNSTATE_blocked]
-		  - per_cpu(processed_blocked_time, cpu);
-	if ((blocked > 0) && (delta_cpu > 0)) {
-		delta_cpu -= blocked;
-		if (unlikely(delta_cpu < 0))
-			blocked += delta_cpu; /* clamp local-time progress */
-		do_div(blocked, NS_PER_TICK);
-		per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
-		per_cpu(processed_system_time, cpu) += blocked * NS_PER_TICK;
-		account_idle_ticks(blocked);
-	}
-
-	/* Account user/system ticks. */
-	if (delta_cpu > 0) {
-		cputime_t ct;
-
-		do_div(delta_cpu, NS_PER_TICK);
-		per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
-		ct = jiffies_to_cputime(delta_cpu);
-		if (user_mode_vm(get_irq_regs()))
-			account_user_time(current, ct, cputime_to_scaled(ct));
-		else if (current != idle_task(cpu)
-			 || irq_count() != HARDIRQ_OFFSET)
-			account_system_time(current, HARDIRQ_OFFSET,
-					    ct, cputime_to_scaled(ct));
-		else
-			account_idle_ticks(delta_cpu);
-	}
-
-	/* Offlined for more than a few seconds? Avoid lockup warnings. */
-	if (stolen > 5*HZ)
-		touch_softlockup_watchdog();
-
-	/* Local timer processing (see update_process_times()). */
-	run_local_timers();
-	rcu_check_callbacks(cpu, user_mode_vm(get_irq_regs()));
-	printk_tick();
-	scheduler_tick();
-	run_posix_cpu_timers(current);
-	profile_tick(CPU_PROFILING);
-
-	return IRQ_HANDLED;
-}
-
 void mark_tsc_unstable(char *reason)
 {
 #ifndef CONFIG_XEN /* XXX Should tell the hypervisor about this fact. */
@@ -537,24 +401,13 @@ void mark_tsc_unstable(char *reason)
 }
 EXPORT_SYMBOL_GPL(mark_tsc_unstable);

-static void init_missing_ticks_accounting(unsigned int cpu)
-{
-	struct vcpu_runstate_info *runstate = setup_runstate_area(cpu);
-
-	per_cpu(processed_blocked_time, cpu) =
-		runstate->time[RUNSTATE_blocked];
-	per_cpu(processed_stolen_time, cpu) =
-		runstate->time[RUNSTATE_runnable] +
-		runstate->time[RUNSTATE_offline];
-}
-
 static cycle_t cs_last;

 static cycle_t xen_clocksource_read(struct clocksource *cs)
 {
 #ifdef CONFIG_SMP
 	cycle_t last = get64(&cs_last);
-	cycle_t ret = local_clock();
+	cycle_t ret = xen_local_clock();

 	if (unlikely((s64)(ret - last) < 0)) {
 		if (last - ret > permitted_clock_jitter
@@ -580,37 +433,28 @@ static cycle_t xen_clocksource_read(stru
 		last = cur;
 	}
 #else
-	return local_clock();
+	return xen_local_clock();
 #endif
 }

 /* No locking required. Interrupts are disabled on all CPUs. */
 static void xen_clocksource_resume(struct clocksource *cs)
 {
+	unsigned long seq;
 	unsigned int cpu;

 	init_cpu_khz();

-	for_each_online_cpu(cpu) {
-		switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, cpu,
-					   &xen_set_periodic_tick)) {
-		case 0:
-#if CONFIG_XEN_COMPAT <= 0x030004
-		case -ENOSYS:
-#endif
-			break;
-		default:
-			BUG();
-		}
+	for_each_online_cpu(cpu)
 		get_time_values_from_xen(cpu);
-		per_cpu(processed_system_time, cpu) =
-			per_cpu(shadow_time, 0).system_timestamp;
-		init_missing_ticks_accounting(cpu);
-	}

-	processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
+	do {
+		seq = read_seqbegin(&xtime_lock);
+		jiffies_bias = jiffies_64;
+	} while (read_seqretry(&xtime_lock, seq));
+	system_time_bias = per_cpu(shadow_time, 0).system_timestamp;

-	cs_last = local_clock();
+	cs_last = xen_local_clock();
 }

 static struct clocksource clocksource_xen = {
@@ -655,7 +499,7 @@ void xen_read_persistent_clock(struct ti
 		rmb();
 	} while ((s->wc_version & 1) | (version ^ s->wc_version));

-	delta = local_clock() + (u64)sec * NSEC_PER_SEC + nsec;
+	delta = xen_local_clock() + (u64)sec * NSEC_PER_SEC + nsec;
 	do_div(delta, NSEC_PER_SEC);

 	ts->tv_sec = delta;
@@ -670,24 +514,10 @@ int xen_update_persistent_clock(void)
 	return 0;
 }

-/* Dynamically-mapped IRQ. */
-static int __read_mostly timer_irq = -1;
-static struct irqaction timer_action = {
-	.handler = timer_interrupt,
-	.flags = IRQF_DISABLED|IRQF_TIMER,
-	.name = "timer"
-};
-
-static void __init setup_cpu0_timer_irq(void)
-{
-	timer_irq = bind_virq_to_irqaction(VIRQ_TIMER, 0, &timer_action);
-	BUG_ON(timer_irq < 0);
-}
-
 static void __init _late_time_init(void)
 {
 	update_wallclock();
-	setup_cpu0_timer_irq();
+	xen_clockevents_init();
 }

 void __init time_init(void)
@@ -696,22 +526,11 @@ void __init time_init(void)
 	printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n",
 	       cpu_khz / 1000, cpu_khz % 1000);

-	switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, 0,
-				   &xen_set_periodic_tick)) {
-	case 0:
-#if CONFIG_XEN_COMPAT <= 0x030004
-	case -ENOSYS:
-#endif
-		break;
-	default:
-		BUG();
-	}
-
+	setup_runstate_area(0);
 	get_time_values_from_xen(0);

-	processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
-	per_cpu(processed_system_time, 0) = processed_system_time;
-	init_missing_ticks_accounting(0);
+	jiffies_bias = jiffies_64;
+	system_time_bias = per_cpu(shadow_time, 0).system_timestamp;

 	clocksource_register(&clocksource_xen);

@@ -737,13 +556,13 @@ u64 jiffies_to_st(unsigned long j)
 		if (delta < 1) {
 			/* Triggers in some wrap-around cases, but that's okay:
 			 * we just end up with a shorter timeout. */
-			st = processed_system_time + NS_PER_TICK;
+			st = processed_system_time() + NS_PER_TICK;
 		} else if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0) {
 			/* Very long timeout means there is no pending timer.
 			 * We indicate this to Xen by passing zero timeout. */
 			st = 0;
 		} else {
-			st = processed_system_time + delta * (u64)NS_PER_TICK;
+			st = processed_system_time() + delta * (u64)NS_PER_TICK;
 		}
 	} while (read_seqretry(&xtime_lock, seq));

@@ -751,122 +570,6 @@ u64 jiffies_to_st(unsigned long j)
 }
 EXPORT_SYMBOL(jiffies_to_st);

-/*
- * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
- * These functions are based on implementations from arch/s390/kernel/time.c
- */
-static void stop_hz_timer(void)
-{
-	struct vcpu_set_singleshot_timer singleshot;
-	unsigned int cpu = smp_processor_id();
-	unsigned long j;
-	int rc;
-
-	cpumask_set_cpu(cpu, nohz_cpu_mask);
-
-	/* See matching smp_mb in rcu_start_batch in rcupdate.c. These mbs */
-	/* ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a */
-	/* value of rcp->cur that matches rdp->quiescbatch and allows us to */
-	/* stop the hz timer then the cpumasks created for subsequent values */
-	/* of cur in rcu_start_batch are guaranteed to pick up the updated */
-	/* nohz_cpu_mask and so will not depend on this cpu. */
-
-	smp_mb();
-
-	/* Leave ourselves in tick mode if rcu or softirq or timer pending. */
-	if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
-	    local_softirq_pending() ||
-	    (j = get_next_timer_interrupt(jiffies),
-	     time_before_eq(j, jiffies))) {
-		cpumask_clear_cpu(cpu, nohz_cpu_mask);
-		j = jiffies + 1;
-	}
-
-	singleshot.timeout_abs_ns = jiffies_to_st(j);
-	if (!singleshot.timeout_abs_ns)
-		return;
-	singleshot.timeout_abs_ns += NS_PER_TICK / 2;
-	singleshot.flags = 0;
-	rc = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &singleshot);
-#if CONFIG_XEN_COMPAT <= 0x030004
-	if (rc) {
-		BUG_ON(rc != -ENOSYS);
-		rc = HYPERVISOR_set_timer_op(singleshot.timeout_abs_ns);
-	}
-#endif
-	BUG_ON(rc);
-}
-
-static void start_hz_timer(void)
-{
-	unsigned int cpu = smp_processor_id();
-	int rc = HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL);
-
-#if CONFIG_XEN_COMPAT <= 0x030004
-	if (rc) {
-		BUG_ON(rc != -ENOSYS);
-		rc = HYPERVISOR_set_timer_op(0);
-	}
-#endif
-	BUG_ON(rc);
-	cpumask_clear_cpu(cpu, nohz_cpu_mask);
-}
-
-void xen_safe_halt(void)
-{
-	stop_hz_timer();
-	/* Blocking includes an implicit local_irq_enable(). */
-	HYPERVISOR_block();
-	start_hz_timer();
-}
-
-void xen_halt(void)
-{
-	if (irqs_disabled())
-		VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL));
-}
-
-#ifdef CONFIG_SMP
-int __cpuinit local_setup_timer(unsigned int cpu)
-{
-	int seq, irq;
-
-	BUG_ON(cpu == 0);
-
-	switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, cpu,
-				   &xen_set_periodic_tick)) {
-	case 0:
-#if CONFIG_XEN_COMPAT <= 0x030004
-	case -ENOSYS:
-#endif
-		break;
-	default:
-		BUG();
-	}
-
-	do {
-		seq = read_seqbegin(&xtime_lock);
-		/* Use cpu0 timestamp: cpu's shadow is not initialised yet. */
-		per_cpu(processed_system_time, cpu) =
-			per_cpu(shadow_time, 0).system_timestamp;
-		init_missing_ticks_accounting(cpu);
-	} while (read_seqretry(&xtime_lock, seq));
-
-	irq = bind_virq_to_irqaction(VIRQ_TIMER, cpu, &timer_action);
-	if (irq < 0)
-		return irq;
-	BUG_ON(timer_irq != irq);
-
-	return 0;
-}
-
-void __cpuinit local_teardown_timer(unsigned int cpu)
-{
-	BUG_ON(cpu == 0);
-	unbind_from_per_cpu_irq(timer_irq, cpu, &timer_action);
-}
-#endif
-
 #ifdef CONFIG_CPU_FREQ
 static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
 				 void *data)
--- head-2011-03-11.orig/drivers/xen/Kconfig	2011-02-03 14:49:15.000000000 +0100
+++ head-2011-03-11/drivers/xen/Kconfig	2011-02-17 10:32:19.000000000 +0100
@@ -356,9 +356,6 @@ endmenu
 config HAVE_IRQ_IGNORE_UNHANDLED
 	def_bool y

-config NO_IDLE_HZ
-	def_bool y
-
 config ARCH_HAS_WALK_MEMORY
 	def_bool y
 	depends on X86
--- head-2011-03-11.orig/drivers/xen/core/Makefile	2010-04-19 14:55:02.000000000 +0200
+++ head-2011-03-11/drivers/xen/core/Makefile	2011-02-02 15:09:52.000000000 +0100
@@ -12,6 +12,7 @@ obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o
 obj-$(CONFIG_XEN_SMPBOOT)	+= smpboot.o
 obj-$(CONFIG_SMP)		+= spinlock.o
 obj-$(CONFIG_KEXEC)		+= machine_kexec.o
+obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o
 obj-$(CONFIG_XEN_DOMCTL)	+= domctl.o
 CFLAGS_domctl.o := -D__XEN_PUBLIC_XEN_H__ -D__XEN_PUBLIC_GRANT_TABLE_H__
 CFLAGS_domctl.o += -D__XEN_TOOLS__ -imacros xen/interface/domctl.h
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ head-2011-03-11/drivers/xen/core/clockevents.c	2011-02-02 15:09:52.000000000 +0100
@@ -0,0 +1,298 @@
+/*
+ * Xen clockevent functions
+ *
+ * See arch/x86/xen/time.c for copyright and credits for derived
+ * portions of this file.
+ *
+ * Xen clockevent implementation
+ *
+ * Xen has two clockevent implementations:
+ *
+ * The old timer_op one works with all released versions of Xen prior
+ * to version 3.0.4. This version of the hypervisor provides a
+ * single-shot timer with nanosecond resolution. However, sharing the
+ * same event channel is a 100Hz tick which is delivered while the
+ * vcpu is running. We don't care about or use this tick, but it will
+ * cause the core time code to think the timer fired too soon, and
+ * will end up resetting it each time. It could be filtered, but
+ * doing so has complications when the ktime clocksource is not yet
+ * the xen clocksource (ie, at boot time).
+ *
+ * The new vcpu_op-based timer interface allows the tick timer period
+ * to be changed or turned off. The tick timer is not useful as a
+ * periodic timer because events are only delivered to running vcpus.
+ * The one-shot timer can report when a timeout is in the past, so
+ * set_next_event is capable of returning -ETIME when appropriate.
+ * This interface is used when available.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define XEN_SHIFT 22
+
+/* Xen may fire a timer up to this many ns early */
+#define TIMER_SLOP 100000
+#define NS_PER_TICK (1000000000LL / HZ)
+
+/*
+ * Get a hypervisor absolute time. In theory we could maintain an
+ * offset between the kernel's time and the hypervisor's time, and
+ * apply that to a kernel's absolute timeout. Unfortunately the
+ * hypervisor and kernel times can drift even if the kernel is using
+ * the Xen clocksource, because ntp can warp the kernel's clocksource.
+ */
+static u64 get_abs_timeout(unsigned long delta)
+{
+	return xen_local_clock() + delta;
+}
+
+#if CONFIG_XEN_COMPAT <= 0x030004
+static void timerop_set_mode(enum clock_event_mode mode,
+			     struct clock_event_device *evt)
+{
+	switch (mode) {
+	case CLOCK_EVT_MODE_PERIODIC:
+		WARN_ON(1); /* unsupported */
+		break;
+
+	case CLOCK_EVT_MODE_ONESHOT:
+	case CLOCK_EVT_MODE_RESUME:
+		break;
+
+	case CLOCK_EVT_MODE_UNUSED:
+	case CLOCK_EVT_MODE_SHUTDOWN:
+		if (HYPERVISOR_set_timer_op(0)) /* cancel timeout */
+			BUG();
+		break;
+	}
+}
+
+static int timerop_set_next_event(unsigned long delta,
+				  struct clock_event_device *evt)
+{
+	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
+
+	if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
+		BUG();
+
+	/*
+	 * We may have missed the deadline, but there's no real way of
+	 * knowing for sure. If the event was in the past, then we'll
+	 * get an immediate interrupt.
+	 */
+
+	return 0;
+}
+#endif
+
+static void vcpuop_set_mode(enum clock_event_mode mode,
+			    struct clock_event_device *evt)
+{
+	switch (mode) {
+	case CLOCK_EVT_MODE_PERIODIC:
+		WARN_ON(1); /* unsupported */
+		break;
+
+	case CLOCK_EVT_MODE_UNUSED:
+	case CLOCK_EVT_MODE_SHUTDOWN:
+		if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer,
+				       smp_processor_id(), NULL))
+			BUG();
+		/* fall through */
+	case CLOCK_EVT_MODE_ONESHOT:
+		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer,
+				       smp_processor_id(), NULL))
+			BUG();
+		break;
+
+	case CLOCK_EVT_MODE_RESUME:
+		break;
+	}
+}
+
+static int vcpuop_set_next_event(unsigned long delta,
+				 struct clock_event_device *evt)
+{
+	struct vcpu_set_singleshot_timer single;
+	int ret;
+
+	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
+
+	single.timeout_abs_ns = get_abs_timeout(delta);
+	single.flags = VCPU_SSHOTTMR_future;
+
+	ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer,
+				 smp_processor_id(), &single);
+
+	BUG_ON(ret != 0 && ret != -ETIME);
+
+	return ret;
+}
+
+static DEFINE_PER_CPU(struct clock_event_device, xen_clock_event) = {
+	.name		= "xen",
+	.features	= CLOCK_EVT_FEAT_ONESHOT,
+
+	.max_delta_ns	= 0xffffffff,
+	.min_delta_ns	= TIMER_SLOP,
+
+	.mult		= 1,
+	.shift		= 0,
+	.rating		= 500,
+
+	.irq		= -1,
+};
+
+/* snapshots of runstate info */
+static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot);
+
+/* unused ns of stolen and blocked time */
+static DEFINE_PER_CPU(unsigned int, xen_residual_stolen);
+static DEFINE_PER_CPU(unsigned int, xen_residual_blocked);
+
+static void init_missing_ticks_accounting(unsigned int cpu)
+{
+	per_cpu(xen_runstate_snapshot, cpu) = *setup_runstate_area(cpu);
+	if (cpu == smp_processor_id())
+		get_runstate_snapshot(&__get_cpu_var(xen_runstate_snapshot));
+	per_cpu(xen_residual_stolen, cpu) = 0;
+	per_cpu(xen_residual_blocked, cpu) = 0;
+}
+
+static irqreturn_t timer_interrupt(int irq, void *dev_id)
+{
+	struct clock_event_device *evt = &__get_cpu_var(xen_clock_event);
+	struct vcpu_runstate_info state, *snap;
+	s64 blocked, stolen;
+	irqreturn_t ret = IRQ_NONE;
+
+	if (evt->event_handler) {
+		evt->event_handler(evt);
+		ret = IRQ_HANDLED;
+	}
+
+	xen_check_wallclock_update();
+
+	get_runstate_snapshot(&state);
+	snap = &__get_cpu_var(xen_runstate_snapshot);
+
+	stolen = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable]
+		 + state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline]
+		 + percpu_read(xen_residual_stolen);
+
+	if (stolen >= NS_PER_TICK)
+		account_steal_ticks(div_u64_rem(stolen, NS_PER_TICK,
+			&__get_cpu_var(xen_residual_stolen)));
+	else
+		percpu_write(xen_residual_stolen, stolen > 0 ? stolen : 0);
+
+	blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked]
+		  + percpu_read(xen_residual_blocked);
+
+	if (blocked >= NS_PER_TICK)
+		account_idle_ticks(div_u64_rem(blocked, NS_PER_TICK,
+			&__get_cpu_var(xen_residual_blocked)));
+	else
+		percpu_write(xen_residual_blocked, blocked > 0 ? blocked : 0);
+
+	*snap = state;
+
+	return ret;
+}
+
+static struct irqaction timer_action = {
+	.handler = timer_interrupt,
+	.flags   = IRQF_DISABLED|IRQF_TIMER,
+	.name    = "timer"
+};
+
+void __cpuinit xen_setup_cpu_clockevents(void)
+{
+	unsigned int cpu = smp_processor_id();
+	struct clock_event_device *evt = &per_cpu(xen_clock_event, cpu);
+
+	init_missing_ticks_accounting(cpu);
+
+	evt->cpumask = cpumask_of(cpu);
+	clockevents_register_device(evt);
+}
+
+#ifdef CONFIG_SMP
+int __cpuinit local_setup_timer(unsigned int cpu)
+{
+	struct clock_event_device *evt = &per_cpu(xen_clock_event, cpu);
+
+	BUG_ON(cpu == smp_processor_id());
+
+	evt->irq = bind_virq_to_irqaction(VIRQ_TIMER, cpu, &timer_action);
+	if (evt->irq < 0)
+		return evt->irq;
+	BUG_ON(per_cpu(xen_clock_event.irq, 0) != evt->irq);
+
+	evt->set_mode = percpu_read(xen_clock_event.set_mode);
+	evt->set_next_event = percpu_read(xen_clock_event.set_next_event);
+
+	return 0;
+}
+
+void __cpuinit local_teardown_timer(unsigned int cpu)
+{
+	struct clock_event_device *evt = &per_cpu(xen_clock_event, cpu);
+
+	BUG_ON(cpu == 0);
+	unbind_from_per_cpu_irq(evt->irq, cpu, &timer_action);
+}
+#endif
+
+void xen_clockevents_resume(void)
+{
+	unsigned int cpu;
+
+	if (percpu_read(xen_clock_event.set_mode) != vcpuop_set_mode)
+		return;
+
+	for_each_online_cpu(cpu) {
+		init_missing_ticks_accounting(cpu);
+		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
+			BUG();
+	}
+}
+
+void __init xen_clockevents_init(void)
+{
+	unsigned int cpu = smp_processor_id();
+	struct clock_event_device *evt = &__get_cpu_var(xen_clock_event);
+
+	switch (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer,
+				   cpu, NULL)) {
+	case 0:
+		/*
+		 * Successfully turned off 100Hz tick, so we have the
+		 * vcpuop-based timer interface
+		 */
+		evt->set_mode = vcpuop_set_mode;
+		evt->set_next_event = vcpuop_set_next_event;
+		break;
+#if CONFIG_XEN_COMPAT <= 0x030004
+	case -ENOSYS:
+		printk(KERN_DEBUG "Xen: using timerop interface\n");
+		evt->set_mode = timerop_set_mode;
+		evt->set_next_event = timerop_set_next_event;
+		break;
+#endif
+	default:
+		BUG();
+	}
+
+	evt->irq = bind_virq_to_irqaction(VIRQ_TIMER, cpu, &timer_action);
+	BUG_ON(evt->irq < 0);
+
+	xen_setup_cpu_clockevents();
+}
--- head-2011-03-11.orig/drivers/xen/core/evtchn.c	2011-02-16 08:29:06.000000000 +0100
+++ head-2011-03-11/drivers/xen/core/evtchn.c	2011-02-02 15:09:52.000000000 +0100
@@ -382,6 +382,7 @@ asmlinkage void __irq_entry evtchn_do_up
 		wmb();
 #endif

+#ifndef CONFIG_NO_HZ
 		/*
 		 * Handle timer interrupts before all others, so that all
 		 * hardirq handlers see an up-to-date system time even if we
@@ -407,6 +408,7 @@ asmlinkage void __irq_entry evtchn_do_up
 				BUG();
 			}
 		}
+#endif /* CONFIG_NO_HZ */

 		l1 = xchg(&vcpu_info->evtchn_pending_sel, 0);
--- head-2011-03-11.orig/drivers/xen/core/machine_reboot.c	2011-01-13 16:21:42.000000000 +0100
+++ head-2011-03-11/drivers/xen/core/machine_reboot.c	2011-02-02 15:09:52.000000000 +0100
@@ -13,6 +13,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -158,10 +159,12 @@ static int take_machine_down(void *_susp
 	} else
 		BUG_ON(suspend_cancelled > 0);
 	suspend->resume_notifier(suspend_cancelled);
-	if (suspend_cancelled >= 0) {
+	if (suspend_cancelled >= 0)
 		post_suspend(suspend_cancelled);
+	if (!suspend_cancelled)
+		xen_clockevents_resume();
+	if (suspend_cancelled >= 0)
 		sysdev_resume();
-	}
 	if (!suspend_cancelled) {
 #ifdef __x86_64__
 		/*
--- head-2011-03-11.orig/drivers/xen/core/smpboot.c	2011-03-03 16:14:51.000000000 +0100
+++ head-2011-03-11/drivers/xen/core/smpboot.c	2011-02-07 12:28:20.000000000 +0100
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -146,6 +147,7 @@ static void __cpuinit cpu_bringup(void)
 	identify_secondary_cpu(__this_cpu_ptr(&cpu_info));
 	touch_softlockup_watchdog();
 	preempt_disable();
+	xen_setup_cpu_clockevents();
 	local_irq_enable();
 }

--- head-2011-03-11.orig/drivers/xen/core/spinlock.c	2011-03-15 16:18:17.000000000 +0100
+++ head-2011-03-11/drivers/xen/core/spinlock.c	2011-03-15 16:18:37.000000000 +0100
@@ -13,6 +13,7 @@
 #include
 #include
 #include
+#include
 #include

 struct spinning {
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ head-2011-03-11/include/xen/clock.h	2011-02-02 15:09:52.000000000 +0100
@@ -0,0 +1,19 @@
+#ifndef __XEN_CPU_CLOCK_H__
+#define __XEN_CPU_CLOCK_H__
+
+struct vcpu_runstate_info *setup_runstate_area(unsigned int cpu);
+void get_runstate_snapshot(struct vcpu_runstate_info *);
+
+unsigned long long xen_local_clock(void);
+void xen_check_wallclock_update(void);
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+void xen_clockevents_init(void);
+void xen_setup_cpu_clockevents(void);
+void xen_clockevents_resume(void);
+#else
+static inline void xen_setup_cpu_clockevents(void) {}
+static inline void xen_clockevents_resume(void) {}
+#endif
+
+#endif /* __XEN_CPU_CLOCK_H__ */