Subject: xen3 common From: http://xenbits.xensource.com/linux-2.6.18-xen.hg (tip 1017:948c933f8839) Patch-mainline: n/a Acked-by: jbeulich@novell.com List of files that don't require modification anymore (and hence got removed from this patch), for reference and in case upstream wants to pick up the forward-porting patches: 2.6.22/include/linux/sched.h 2.6.22/kernel/softlockup.c 2.6.22/kernel/timer.c 2.6.25/mm/highmem.c 2.6.30/include/linux/pci_regs.h --- head-2010-05-25.orig/drivers/Makefile 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/drivers/Makefile 2010-05-12 08:58:07.000000000 +0200 @@ -34,6 +34,7 @@ obj-$(CONFIG_PARPORT) += parport/ obj-y += base/ block/ misc/ mfd/ obj-$(CONFIG_NUBUS) += nubus/ obj-y += macintosh/ +obj-$(CONFIG_XEN) += xen/ obj-$(CONFIG_IDE) += ide/ obj-$(CONFIG_SCSI) += scsi/ obj-$(CONFIG_ATA) += ata/ --- head-2010-05-25.orig/drivers/acpi/Makefile 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/drivers/acpi/Makefile 2010-03-24 14:53:41.000000000 +0100 @@ -64,5 +64,8 @@ obj-$(CONFIG_ACPI_POWER_METER) += power_ processor-y := processor_driver.o processor_throttling.o processor-y += processor_idle.o processor_thermal.o processor-$(CONFIG_CPU_FREQ) += processor_perflib.o +ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL +processor-objs += processor_perflib.o processor_extcntl.o +endif obj-$(CONFIG_ACPI_PROCESSOR_AGGREGATOR) += acpi_pad.o --- head-2010-05-25.orig/drivers/acpi/acpica/hwsleep.c 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/drivers/acpi/acpica/hwsleep.c 2010-03-24 14:53:41.000000000 +0100 @@ -236,7 +236,11 @@ acpi_status asmlinkage acpi_enter_sleep_ u32 pm1b_control; struct acpi_bit_register_info *sleep_type_reg_info; struct acpi_bit_register_info *sleep_enable_reg_info; +#if !(defined(CONFIG_XEN) && defined(CONFIG_X86)) u32 in_value; +#else + int err; +#endif struct acpi_object_list arg_list; union acpi_object arg; acpi_status status; @@ -347,6 +351,7 @@ acpi_status asmlinkage acpi_enter_sleep_ /* Write #2: Write both SLP_TYP + SLP_EN */ +#if !(defined(CONFIG_XEN) && defined(CONFIG_X86)) status = acpi_hw_write_pm1_control(pm1a_control, pm1b_control); if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); @@ -386,6 +391,16 @@ acpi_status asmlinkage acpi_enter_sleep_ /* Spin until we wake */ } while (!in_value); +#else + /* PV ACPI only needs to check the hypercall return value */ + err = acpi_notify_hypervisor_state(sleep_state, + pm1a_control, pm1b_control); + if (err) { + ACPI_DEBUG_PRINT((ACPI_DB_ERROR, + "Hypervisor failure [%d]\n", err)); + return_ACPI_STATUS(AE_ERROR); + } +#endif return_ACPI_STATUS(AE_OK); } --- head-2010-05-25.orig/drivers/acpi/processor_driver.c 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/drivers/acpi/processor_driver.c 2010-05-25 09:19:10.000000000 +0200 @@ -443,7 +443,8 @@ static int acpi_processor_get_info(struc */ if (pr->id == -1) { if (ACPI_FAILURE - (acpi_processor_hotadd_init(pr->handle, &pr->id))) { + (acpi_processor_hotadd_init(pr->handle, &pr->id)) && + !processor_cntl_external()) { return -ENODEV; } } @@ -494,7 +495,11 @@ static int acpi_processor_get_info(struc return 0; } +#ifndef CONFIG_XEN static DEFINE_PER_CPU(void *, processor_device_array); +#else +static void *processor_device_array[NR_ACPI_CPUS]; +#endif static void acpi_processor_notify(struct acpi_device *device, u32 event) { @@ -575,8 +580,11 @@ static int __cpuinit acpi_processor_add( strcpy(acpi_device_class(device), ACPI_PROCESSOR_CLASS); device->driver_data = pr; + processor_extcntl_init(); + result = acpi_processor_get_info(device);
- if (result) { + if (result || + ((pr->id == -1) && !processor_cntl_external())) { /* Processor is physically not present */ return 0; } @@ -586,23 +594,36 @@ static int __cpuinit acpi_processor_add( return 0; #endif - BUG_ON((pr->id >= nr_cpu_ids) || (pr->id < 0)); + BUG_ON(!processor_cntl_external() && + ((pr->id >= nr_cpu_ids) || (pr->id < 0))); /* * Buggy BIOS check * ACPI id of processors can be reported wrongly by the BIOS. * Don't trust it blindly */ +#ifndef CONFIG_XEN if (per_cpu(processor_device_array, pr->id) != NULL && per_cpu(processor_device_array, pr->id) != device) { +#else + BUG_ON(pr->acpi_id >= NR_ACPI_CPUS); + if (processor_device_array[pr->acpi_id] != NULL && + processor_device_array[pr->acpi_id] != device) { +#endif printk(KERN_WARNING "BIOS reported wrong ACPI id " "for the processor\n"); result = -ENODEV; goto err_free_cpumask; } +#ifndef CONFIG_XEN per_cpu(processor_device_array, pr->id) = device; per_cpu(processors, pr->id) = pr; +#else + processor_device_array[pr->acpi_id] = device; + if (pr->id != -1) + per_cpu(processors, pr->id) = pr; +#endif result = acpi_processor_add_fs(device); if (result) @@ -614,15 +635,27 @@ static int __cpuinit acpi_processor_add( goto err_remove_fs; } -#ifdef CONFIG_CPU_FREQ +#if defined(CONFIG_CPU_FREQ) || defined(CONFIG_PROCESSOR_EXTERNAL_CONTROL) acpi_processor_ppc_has_changed(pr, 0); #endif - acpi_processor_get_throttling_info(pr); - acpi_processor_get_limit_info(pr); + /* + * pr->id may equal to -1 while processor_cntl_external enabled. + * throttle and thermal module don't support this case. + * Tx only works when dom0 vcpu == pcpu num by far, as we give + * control to dom0. + */ + if (pr->id != -1) { + acpi_processor_get_throttling_info(pr); + acpi_processor_get_limit_info(pr); + } acpi_processor_power_init(pr, device); + result = processor_extcntl_prepare(pr); + if (result) + goto end; + pr->cdev = thermal_cooling_device_register("Processor", device, &processor_cooling_ops); if (IS_ERR(pr->cdev)) { @@ -674,7 +707,7 @@ static int acpi_processor_remove(struct pr = acpi_driver_data(device); - if (pr->id >= nr_cpu_ids) + if (!processor_cntl_external() && pr->id >= nr_cpu_ids) goto free; if (type == ACPI_BUS_REMOVAL_EJECT) { @@ -695,8 +728,14 @@ static int acpi_processor_remove(struct pr->cdev = NULL; } +#ifndef CONFIG_XEN per_cpu(processors, pr->id) = NULL; per_cpu(processor_device_array, pr->id) = NULL; +#else + if (pr->id != -1) + per_cpu(processors, pr->id) = NULL; + processor_device_array[pr->acpi_id] = NULL; +#endif free: free_cpumask_var(pr->throttling.shared_cpu_map); @@ -752,6 +791,10 @@ int acpi_processor_device_add(acpi_handl return -ENODEV; } + if (processor_cntl_external() && acpi_driver_data(*device)) + processor_notify_external(acpi_driver_data(*device), + PROCESSOR_HOTPLUG, HOTPLUG_TYPE_ADD); + return 0; } @@ -781,6 +824,10 @@ static void __ref acpi_processor_hotplug "Unable to add the device\n"); break; } + pr = acpi_driver_data(device); + if (processor_cntl_external() && pr) + processor_notify_external(pr, + PROCESSOR_HOTPLUG, HOTPLUG_TYPE_ADD); break; case ACPI_NOTIFY_EJECT_REQUEST: ACPI_DEBUG_PRINT((ACPI_DB_INFO, @@ -797,6 +844,9 @@ static void __ref acpi_processor_hotplug "Driver data is NULL, dropping EJECT\n"); return; } + if (processor_cntl_external()) + processor_notify_external(pr, PROCESSOR_HOTPLUG, + HOTPLUG_TYPE_REMOVE); break; default: ACPI_DEBUG_PRINT((ACPI_DB_INFO, @@ -861,6 +911,11 @@ static acpi_status acpi_processor_hotadd static int acpi_processor_handle_eject(struct acpi_processor *pr) { 
+#ifdef CONFIG_XEN + if (pr->id == -1) + return (0); +#endif + if (cpu_online(pr->id)) cpu_down(pr->id); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ head-2010-05-25/drivers/acpi/processor_extcntl.c 2010-03-24 14:53:41.000000000 +0100 @@ -0,0 +1,241 @@ +/* + * processor_extcntl.c - channel to external control logic + * + * Copyright (C) 2008, Intel corporation + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. + * + */ + +#include +#include +#include +#include +#include +#include + +#include + +#define ACPI_PROCESSOR_COMPONENT 0x01000000 +#define ACPI_PROCESSOR_CLASS "processor" +#define ACPI_PROCESSOR_DRIVER_NAME "ACPI Processor Driver" +#define _COMPONENT ACPI_PROCESSOR_COMPONENT +ACPI_MODULE_NAME("acpi_processor") + +static int processor_extcntl_parse_csd(struct acpi_processor *pr); +static int processor_extcntl_get_performance(struct acpi_processor *pr); +/* + * External processor control logic may register with its own set of + * ops to get ACPI related notification. One example is like VMM. + */ +const struct processor_extcntl_ops *processor_extcntl_ops; +EXPORT_SYMBOL(processor_extcntl_ops); + +static int processor_notify_smm(void) +{ + acpi_status status; + static int is_done = 0; + + /* only need successfully notify BIOS once */ + /* avoid double notification which may lead to unexpected result */ + if (is_done) + return 0; + + /* Can't write pstate_cnt to smi_cmd if either value is zero */ + if ((!acpi_fadt.smi_cmd) || (!acpi_fadt.pstate_cnt)) { + ACPI_DEBUG_PRINT((ACPI_DB_INFO,"No SMI port or pstate_cnt\n")); + return 0; + } + + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "Writing pstate_cnt [0x%x] to smi_cmd [0x%x]\n", + acpi_fadt.pstate_cnt, acpi_fadt.smi_cmd)); + + /* FADT v1 doesn't support pstate_cnt, many BIOS vendors use + * it anyway, so we need to support it... 
*/ + if (acpi_fadt_is_v1) { + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "Using v1.0 FADT reserved value for pstate_cnt\n")); + } + + status = acpi_os_write_port(acpi_fadt.smi_cmd, + (u32) acpi_fadt.pstate_cnt, 8); + if (ACPI_FAILURE(status)) + return status; + + is_done = 1; + + return 0; +} + +int processor_notify_external(struct acpi_processor *pr, int event, int type) +{ + int ret = -EINVAL; + + if (!processor_cntl_external()) + return -EINVAL; + + switch (event) { + case PROCESSOR_PM_INIT: + case PROCESSOR_PM_CHANGE: + if ((type >= PM_TYPE_MAX) || + !processor_extcntl_ops->pm_ops[type]) + break; + + ret = processor_extcntl_ops->pm_ops[type](pr, event); + break; + case PROCESSOR_HOTPLUG: + if (processor_extcntl_ops->hotplug) + ret = processor_extcntl_ops->hotplug(pr, type); + break; + default: + printk(KERN_ERR "Unsupported processor event %d.\n", event); + break; + } + + return ret; +} + +/* + * External control logic can decide to grab full or part of physical + * processor control bits. Take a VMM for example: physical processors + * are owned by the VMM, and thus existence information like hotplug + * must always be forwarded to the VMM. Similarly, the processor idle + * state is also necessarily controlled by the VMM. But for other + * control bits like performance/throttle states, the VMM may choose + * whether or not to control them, based on its own policy. + */ +void processor_extcntl_init(void) +{ + if (!processor_extcntl_ops) + arch_acpi_processor_init_extcntl(&processor_extcntl_ops); +} + +/* + * This is called from ACPI processor init, and is meant to hold + * some tricky housekeeping jobs needed by the external control model. + * For example, we may put a dependency parsing stub here for idle + * and performance states. That information may not be available + * when split from dom0 control logic such as the cpufreq driver. + */ +int processor_extcntl_prepare(struct acpi_processor *pr) +{ + /* parse cstate dependency information */ + if (processor_pm_external()) + processor_extcntl_parse_csd(pr); + + /* Initialize performance states */ + if (processor_pmperf_external()) + processor_extcntl_get_performance(pr); + + return 0; +} + +/* + * Currently no _CSD is implemented, which is why the existing ACPI code + * doesn't parse _CSD at all. But to keep the interface to the + * external control logic complete, we put a placeholder here for future + * compatibility. + */ +static int processor_extcntl_parse_csd(struct acpi_processor *pr) +{ + int i; + + for (i = 0; i < pr->power.count; i++) { + if (!pr->power.states[i].valid) + continue; + + /* No dependency by default */ + pr->power.states[i].domain_info = NULL; + pr->power.states[i].csd_count = 0; + } + + return 0; +} + +/* + * The existing ACPI code does parse performance states at some point, + * when the acpi-cpufreq driver is loaded, which however is something + * we'd like to disable to avoid conflicts with the external control + * logic. So we have to collect the raw performance information here + * when the ACPI processor object is found and started.
+ */ +static int processor_extcntl_get_performance(struct acpi_processor *pr) +{ + int ret; + struct acpi_processor_performance *perf; + struct acpi_psd_package *pdomain; + + if (pr->performance) + return -EBUSY; + + perf = kzalloc(sizeof(struct acpi_processor_performance), GFP_KERNEL); + if (!perf) + return -ENOMEM; + + pr->performance = perf; + /* Get basic performance state information */ + ret = acpi_processor_get_performance_info(pr); + if (ret < 0) + goto err_out; + + /* + * Well, here we need retrieve performance dependency information + * from _PSD object. The reason why existing interface is not used + * is due to the reason that existing interface sticks to Linux cpu + * id to construct some bitmap, however we want to split ACPI + * processor objects from Linux cpu id logic. For example, even + * when Linux is configured as UP, we still want to parse all ACPI + * processor objects to external logic. In this case, it's preferred + * to use ACPI ID instead. + */ + pdomain = &pr->performance->domain_info; + pdomain->num_processors = 0; + ret = acpi_processor_get_psd(pr); + if (ret < 0) { + /* + * _PSD is optional - assume no coordination if absent (or + * broken), matching native kernels' behavior. + */ + pdomain->num_entries = ACPI_PSD_REV0_ENTRIES; + pdomain->revision = ACPI_PSD_REV0_REVISION; + pdomain->domain = pr->acpi_id; + pdomain->coord_type = DOMAIN_COORD_TYPE_SW_ALL; + pdomain->num_processors = 1; + } + + /* Some sanity check */ + if ((pdomain->revision != ACPI_PSD_REV0_REVISION) || + (pdomain->num_entries != ACPI_PSD_REV0_ENTRIES) || + ((pdomain->coord_type != DOMAIN_COORD_TYPE_SW_ALL) && + (pdomain->coord_type != DOMAIN_COORD_TYPE_SW_ANY) && + (pdomain->coord_type != DOMAIN_COORD_TYPE_HW_ALL))) { + ret = -EINVAL; + goto err_out; + } + + /* Last step is to notify BIOS that external logic exists */ + processor_notify_smm(); + + processor_notify_external(pr, PROCESSOR_PM_INIT, PM_TYPE_PERF); + + return 0; +err_out: + pr->performance = NULL; + kfree(perf); + return ret; +} --- head-2010-05-25.orig/drivers/acpi/processor_idle.c 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/drivers/acpi/processor_idle.c 2010-04-15 09:43:01.000000000 +0200 @@ -456,7 +456,8 @@ static int acpi_processor_get_power_info */ cx.entry_method = ACPI_CSTATE_HALT; snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI HLT"); - } else { + /* This doesn't apply to external control case */ + } else if (!processor_pm_external()) { continue; } if (cx.type == ACPI_STATE_C1 && @@ -495,6 +496,12 @@ static int acpi_processor_get_power_info cx.power = obj->integer.value; +#ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL + /* cache control methods to notify external logic */ + if (processor_pm_external()) + memcpy(&cx.reg, reg, sizeof(*reg)); +#endif + current_count++; memcpy(&(pr->power.states[current_count]), &cx, sizeof(cx)); @@ -1229,6 +1236,11 @@ int __cpuinit acpi_processor_power_init( if (!entry) return -EIO; #endif + + if (processor_pm_external()) + processor_notify_external(pr, + PROCESSOR_PM_INIT, PM_TYPE_IDLE); + return 0; } --- head-2010-05-25.orig/drivers/acpi/processor_perflib.c 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/drivers/acpi/processor_perflib.c 2010-04-15 09:43:05.000000000 +0200 @@ -79,6 +79,7 @@ MODULE_PARM_DESC(ignore_ppc, "If the fre static int acpi_processor_ppc_status; +#ifdef CONFIG_CPU_FREQ static int acpi_processor_ppc_notifier(struct notifier_block *nb, unsigned long event, void *data) { @@ -121,6 +122,7 @@ static int acpi_processor_ppc_notifier(s static struct notifier_block 
acpi_ppc_notifier_block = { .notifier_call = acpi_processor_ppc_notifier, }; +#endif /* CONFIG_CPU_FREQ */ static int acpi_processor_get_platform_limit(struct acpi_processor *pr) { @@ -209,7 +211,12 @@ int acpi_processor_ppc_has_changed(struc if (ret < 0) return (ret); else +#ifdef CONFIG_CPU_FREQ return cpufreq_update_policy(pr->id); +#elif defined(CONFIG_PROCESSOR_EXTERNAL_CONTROL) + return processor_notify_external(pr, + PROCESSOR_PM_CHANGE, PM_TYPE_PERF); +#endif } int acpi_processor_get_bios_limit(int cpu, unsigned int *limit) @@ -225,6 +232,7 @@ int acpi_processor_get_bios_limit(int cp } EXPORT_SYMBOL(acpi_processor_get_bios_limit); +#ifdef CONFIG_CPU_FREQ void acpi_processor_ppc_init(void) { if (!cpufreq_register_notifier @@ -243,6 +251,7 @@ void acpi_processor_ppc_exit(void) acpi_processor_ppc_status &= ~PPC_REGISTERED; } +#endif /* CONFIG_CPU_FREQ */ static int acpi_processor_get_performance_control(struct acpi_processor *pr) { @@ -390,7 +399,10 @@ static int acpi_processor_get_performanc return result; } -static int acpi_processor_get_performance_info(struct acpi_processor *pr) +#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL +static +#endif +int acpi_processor_get_performance_info(struct acpi_processor *pr) { int result = 0; acpi_status status = AE_OK; @@ -435,6 +447,7 @@ static int acpi_processor_get_performanc return result; } +#ifdef CONFIG_CPU_FREQ int acpi_processor_notify_smm(struct module *calling_module) { acpi_status status; @@ -495,8 +508,12 @@ int acpi_processor_notify_smm(struct mod } EXPORT_SYMBOL(acpi_processor_notify_smm); +#endif /* CONFIG_CPU_FREQ */ -static int acpi_processor_get_psd(struct acpi_processor *pr) +#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL +static +#endif +int acpi_processor_get_psd(struct acpi_processor *pr) { int result = 0; acpi_status status = AE_OK; --- head-2010-05-25.orig/drivers/acpi/sleep.c 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/drivers/acpi/sleep.c 2010-03-24 14:53:41.000000000 +0100 @@ -60,6 +60,7 @@ static struct notifier_block tts_notifie static int acpi_sleep_prepare(u32 acpi_state) { #ifdef CONFIG_ACPI_SLEEP +#ifndef CONFIG_ACPI_PV_SLEEP /* do we have a wakeup address for S2 and S3? */ if (acpi_state == ACPI_STATE_S3) { if (!acpi_wakeup_address) { @@ -69,6 +70,7 @@ static int acpi_sleep_prepare(u32 acpi_s (acpi_physical_address)acpi_wakeup_address); } +#endif ACPI_FLUSH_CPU_CACHE(); acpi_enable_wakeup_device_prep(acpi_state); #endif @@ -249,7 +251,14 @@ static int acpi_suspend_enter(suspend_st break; case ACPI_STATE_S3: +#ifdef CONFIG_ACPI_PV_SLEEP + /* The hypervisor will save and restore CPU context, + * so we can skip the low-level housekeeping here.
+ */ + acpi_enter_sleep_state(acpi_state); +#else do_suspend_lowlevel(); +#endif break; } --- head-2010-05-25.orig/drivers/char/agp/intel-agp.c 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/drivers/char/agp/intel-agp.c 2010-04-15 09:43:13.000000000 +0200 @@ -443,6 +443,13 @@ static struct page *i8xx_alloc_pages(voi if (page == NULL) return NULL; +#ifdef CONFIG_XEN + if (xen_create_contiguous_region((unsigned long)page_address(page), 2, 32)) { + __free_pages(page, 2); + return NULL; + } +#endif + if (set_pages_uc(page, 4) < 0) { set_pages_wb(page, 4); __free_pages(page, 2); @@ -459,6 +466,9 @@ static void i8xx_destroy_pages(struct pa return; set_pages_wb(page, 4); +#ifdef CONFIG_XEN + xen_destroy_contiguous_region((unsigned long)page_address(page), 2); +#endif put_page(page); __free_pages(page, 2); atomic_dec(&agp_bridge->current_memory_agp); --- head-2010-05-25.orig/drivers/char/mem.c 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/drivers/char/mem.c 2010-04-15 09:43:24.000000000 +0200 @@ -89,6 +89,7 @@ void __weak unxlate_dev_mem_ptr(unsigned { } +#ifndef ARCH_HAS_DEV_MEM /* * This funcion reads the *physical* memory. The f_pos points directly to the * memory location. @@ -211,6 +212,7 @@ static ssize_t write_mem(struct file *fi *ppos += written; return written; } +#endif int __weak phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot) @@ -337,6 +339,9 @@ static int mmap_mem(struct file *file, s static int mmap_kmem(struct file *file, struct vm_area_struct *vma) { unsigned long pfn; +#ifdef CONFIG_XEN + unsigned long i, count; +#endif /* Turn a kernel-virtual address into a physical page frame */ pfn = __pa((u64)vma->vm_pgoff << PAGE_SHIFT) >> PAGE_SHIFT; @@ -351,6 +356,13 @@ static int mmap_kmem(struct file *file, if (!pfn_valid(pfn)) return -EIO; +#ifdef CONFIG_XEN + count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + for (i = 0; i < count; i++) + if ((pfn + i) != mfn_to_local_pfn(pfn_to_mfn(pfn + i))) + return -EIO; +#endif + vma->vm_pgoff = pfn; return mmap_mem(file, vma); } @@ -845,6 +857,7 @@ static int open_port(struct inode * inod #define open_kmem open_mem #define open_oldmem open_mem +#ifndef ARCH_HAS_DEV_MEM static const struct file_operations mem_fops = { .llseek = memory_lseek, .read = read_mem, @@ -853,6 +866,9 @@ static const struct file_operations mem_ .open = open_mem, .get_unmapped_area = get_unmapped_area_mem, }; +#else +extern const struct file_operations mem_fops; +#endif #ifdef CONFIG_DEVKMEM static const struct file_operations kmem_fops = { --- head-2010-05-25.orig/drivers/char/tpm/Makefile 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/drivers/char/tpm/Makefile 2010-03-24 14:53:41.000000000 +0100 @@ -9,3 +9,5 @@ obj-$(CONFIG_TCG_TIS) += tpm_tis.o obj-$(CONFIG_TCG_NSC) += tpm_nsc.o obj-$(CONFIG_TCG_ATMEL) += tpm_atmel.o obj-$(CONFIG_TCG_INFINEON) += tpm_infineon.o +obj-$(CONFIG_TCG_XEN) += tpm_xenu.o +tpm_xenu-y = tpm_xen.o tpm_vtpm.o --- head-2010-05-25.orig/drivers/char/tpm/tpm.h 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/drivers/char/tpm/tpm.h 2010-03-24 14:53:41.000000000 +0100 @@ -108,6 +108,9 @@ struct tpm_chip { struct dentry **bios_dir; struct list_head list; +#ifdef CONFIG_XEN + void *priv; +#endif void (*release) (struct device *); }; @@ -266,6 +269,18 @@ struct tpm_cmd_t { ssize_t tpm_getcap(struct device *, __be32, cap_t *, const char *); +#ifdef CONFIG_XEN +static inline void *chip_get_private(const struct tpm_chip *chip) +{ + return chip->priv; +} + +static 
inline void chip_set_private(struct tpm_chip *chip, void *priv) +{ + chip->priv = priv; +} +#endif + extern void tpm_get_timeouts(struct tpm_chip *); extern void tpm_gen_interrupt(struct tpm_chip *); extern void tpm_continue_selftest(struct tpm_chip *); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ head-2010-05-25/drivers/char/tpm/tpm_vtpm.c 2010-03-24 14:53:41.000000000 +0100 @@ -0,0 +1,542 @@ +/* + * Copyright (C) 2006 IBM Corporation + * + * Authors: + * Stefan Berger + * + * Generic device driver part for device drivers in a virtualized + * environment. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + * + */ + +#include +#include +#include +#include +#include +#include "tpm.h" +#include "tpm_vtpm.h" + +/* read status bits */ +enum { + STATUS_BUSY = 0x01, + STATUS_DATA_AVAIL = 0x02, + STATUS_READY = 0x04 +}; + +struct transmission { + struct list_head next; + + unsigned char *request; + size_t request_len; + size_t request_buflen; + + unsigned char *response; + size_t response_len; + size_t response_buflen; + + unsigned int flags; +}; + +enum { + TRANSMISSION_FLAG_WAS_QUEUED = 0x1 +}; + + +enum { + DATAEX_FLAG_QUEUED_ONLY = 0x1 +}; + + +/* local variables */ + +/* local function prototypes */ +static int _vtpm_send_queued(struct tpm_chip *chip); + + +/* ============================================================= + * Some utility functions + * ============================================================= + */ +static void vtpm_state_init(struct vtpm_state *vtpms) +{ + vtpms->current_request = NULL; + spin_lock_init(&vtpms->req_list_lock); + init_waitqueue_head(&vtpms->req_wait_queue); + INIT_LIST_HEAD(&vtpms->queued_requests); + + vtpms->current_response = NULL; + spin_lock_init(&vtpms->resp_list_lock); + init_waitqueue_head(&vtpms->resp_wait_queue); + + vtpms->disconnect_time = jiffies; +} + + +static inline struct transmission *transmission_alloc(void) +{ + return kzalloc(sizeof(struct transmission), GFP_ATOMIC); +} + +static unsigned char * +transmission_set_req_buffer(struct transmission *t, + unsigned char *buffer, size_t len) +{ + if (t->request_buflen < len) { + kfree(t->request); + t->request = kmalloc(len, GFP_KERNEL); + if (!t->request) { + t->request_buflen = 0; + return NULL; + } + t->request_buflen = len; + } + + memcpy(t->request, buffer, len); + t->request_len = len; + + return t->request; +} + +static unsigned char * +transmission_set_res_buffer(struct transmission *t, + const unsigned char *buffer, size_t len) +{ + if (t->response_buflen < len) { + kfree(t->response); + t->response = kmalloc(len, GFP_ATOMIC); + if (!t->response) { + t->response_buflen = 0; + return NULL; + } + t->response_buflen = len; + } + + memcpy(t->response, buffer, len); + t->response_len = len; + + return t->response; +} + +static inline void transmission_free(struct transmission *t) +{ + kfree(t->request); + kfree(t->response); + kfree(t); +} + +/* ============================================================= + * Interface with the lower layer driver + * ============================================================= + */ +/* + * Lower layer uses this function to make a response available. 
+ */ +int vtpm_vd_recv(const struct tpm_chip *chip, + const unsigned char *buffer, size_t count, + void *ptr) +{ + unsigned long flags; + int ret_size = 0; + struct transmission *t; + struct vtpm_state *vtpms; + + vtpms = (struct vtpm_state *)chip_get_private(chip); + + /* + * The list with requests must contain one request + * only and the element there must be the one that + * was passed to me from the front-end. + */ + spin_lock_irqsave(&vtpms->resp_list_lock, flags); + if (vtpms->current_request != ptr) { + spin_unlock_irqrestore(&vtpms->resp_list_lock, flags); + return 0; + } + + if ((t = vtpms->current_request)) { + transmission_free(t); + vtpms->current_request = NULL; + } + + t = transmission_alloc(); + if (t) { + if (!transmission_set_res_buffer(t, buffer, count)) { + transmission_free(t); + spin_unlock_irqrestore(&vtpms->resp_list_lock, flags); + return -ENOMEM; + } + ret_size = count; + vtpms->current_response = t; + wake_up_interruptible(&vtpms->resp_wait_queue); + } + spin_unlock_irqrestore(&vtpms->resp_list_lock, flags); + + return ret_size; +} + + +/* + * Lower layer indicates its status (connected/disconnected) + */ +void vtpm_vd_status(const struct tpm_chip *chip, u8 vd_status) +{ + struct vtpm_state *vtpms; + + vtpms = (struct vtpm_state *)chip_get_private(chip); + + vtpms->vd_status = vd_status; + if ((vtpms->vd_status & TPM_VD_STATUS_CONNECTED) == 0) { + vtpms->disconnect_time = jiffies; + } +} + +/* ============================================================= + * Interface with the generic TPM driver + * ============================================================= + */ +static int vtpm_recv(struct tpm_chip *chip, u8 *buf, size_t count) +{ + int rc = 0; + unsigned long flags; + struct vtpm_state *vtpms; + + vtpms = (struct vtpm_state *)chip_get_private(chip); + + /* + * Check if the previous operation only queued the command + * In this case there won't be a response, so I just + * return from here and reset that flag. In any other + * case I should receive a response from the back-end. + */ + spin_lock_irqsave(&vtpms->resp_list_lock, flags); + if ((vtpms->flags & DATAEX_FLAG_QUEUED_ONLY) != 0) { + vtpms->flags &= ~DATAEX_FLAG_QUEUED_ONLY; + spin_unlock_irqrestore(&vtpms->resp_list_lock, flags); + /* + * The first few commands (measurements) must be + * queued since it might not be possible to talk to the + * TPM, yet. + * Return a response of up to 30 '0's. + */ + + count = min_t(size_t, count, 30); + memset(buf, 0x0, count); + return count; + } + /* + * Check whether something is in the responselist and if + * there's nothing in the list wait for something to appear. + */ + + if (!vtpms->current_response) { + spin_unlock_irqrestore(&vtpms->resp_list_lock, flags); + interruptible_sleep_on_timeout(&vtpms->resp_wait_queue, + 1000); + spin_lock_irqsave(&vtpms->resp_list_lock ,flags); + } + + if (vtpms->current_response) { + struct transmission *t = vtpms->current_response; + vtpms->current_response = NULL; + rc = min(count, t->response_len); + memcpy(buf, t->response, rc); + transmission_free(t); + } + + spin_unlock_irqrestore(&vtpms->resp_list_lock, flags); + return rc; +} + +static int vtpm_send(struct tpm_chip *chip, u8 *buf, size_t count) +{ + int rc = 0; + unsigned long flags; + struct transmission *t = transmission_alloc(); + struct vtpm_state *vtpms; + + vtpms = (struct vtpm_state *)chip_get_private(chip); + + if (!t) + return -ENOMEM; + /* + * If there's a current request, it must be the + * previous request that has timed out. 
+ */ + spin_lock_irqsave(&vtpms->req_list_lock, flags); + if (vtpms->current_request != NULL) { + printk("WARNING: Sending although there is a request outstanding.\n" + " Previous request must have timed out.\n"); + transmission_free(vtpms->current_request); + vtpms->current_request = NULL; + } + spin_unlock_irqrestore(&vtpms->req_list_lock, flags); + + /* + * Queue the packet if the driver below is not + * ready, yet, or there is any packet already + * in the queue. + * If the driver below is ready, unqueue all + * packets first before sending our current + * packet. + * For each unqueued packet, except for the + * last (=current) packet, call the function + * tpm_xen_recv to wait for the response to come + * back. + */ + if ((vtpms->vd_status & TPM_VD_STATUS_CONNECTED) == 0) { + if (time_after(jiffies, + vtpms->disconnect_time + HZ * 10)) { + rc = -ENOENT; + } else { + goto queue_it; + } + } else { + /* + * Send all queued packets. + */ + if (_vtpm_send_queued(chip) == 0) { + + vtpms->current_request = t; + + rc = vtpm_vd_send(vtpms->tpm_private, + buf, + count, + t); + /* + * The generic TPM driver will call + * the function to receive the response. + */ + if (rc < 0) { + vtpms->current_request = NULL; + goto queue_it; + } + } else { +queue_it: + if (!transmission_set_req_buffer(t, buf, count)) { + transmission_free(t); + rc = -ENOMEM; + goto exit; + } + /* + * An error occurred. Don't event try + * to send the current request. Just + * queue it. + */ + spin_lock_irqsave(&vtpms->req_list_lock, flags); + vtpms->flags |= DATAEX_FLAG_QUEUED_ONLY; + list_add_tail(&t->next, &vtpms->queued_requests); + spin_unlock_irqrestore(&vtpms->req_list_lock, flags); + } + } + +exit: + return rc; +} + + +/* + * Send all queued requests. + */ +static int _vtpm_send_queued(struct tpm_chip *chip) +{ + int rc; + int error = 0; + long flags; + unsigned char buffer[1]; + struct vtpm_state *vtpms; + vtpms = (struct vtpm_state *)chip_get_private(chip); + + spin_lock_irqsave(&vtpms->req_list_lock, flags); + + while (!list_empty(&vtpms->queued_requests)) { + /* + * Need to dequeue them. + * Read the result into a dummy buffer. + */ + struct transmission *qt = (struct transmission *) + vtpms->queued_requests.next; + list_del(&qt->next); + vtpms->current_request = qt; + spin_unlock_irqrestore(&vtpms->req_list_lock, flags); + + rc = vtpm_vd_send(vtpms->tpm_private, + qt->request, + qt->request_len, + qt); + + if (rc < 0) { + spin_lock_irqsave(&vtpms->req_list_lock, flags); + if ((qt = vtpms->current_request) != NULL) { + /* + * requeue it at the beginning + * of the list + */ + list_add(&qt->next, + &vtpms->queued_requests); + } + vtpms->current_request = NULL; + error = 1; + break; + } + /* + * After this point qt is not valid anymore! 
+ * It is freed when the front-end is delivering + * the data by calling tpm_recv + */ + /* + * Receive response into provided dummy buffer + */ + rc = vtpm_recv(chip, buffer, sizeof(buffer)); + spin_lock_irqsave(&vtpms->req_list_lock, flags); + } + + spin_unlock_irqrestore(&vtpms->req_list_lock, flags); + + return error; +} + +static void vtpm_cancel(struct tpm_chip *chip) +{ + unsigned long flags; + struct vtpm_state *vtpms = (struct vtpm_state *)chip_get_private(chip); + + spin_lock_irqsave(&vtpms->resp_list_lock,flags); + + if (!vtpms->current_response && vtpms->current_request) { + spin_unlock_irqrestore(&vtpms->resp_list_lock, flags); + interruptible_sleep_on(&vtpms->resp_wait_queue); + spin_lock_irqsave(&vtpms->resp_list_lock,flags); + } + + if (vtpms->current_response) { + struct transmission *t = vtpms->current_response; + vtpms->current_response = NULL; + transmission_free(t); + } + + spin_unlock_irqrestore(&vtpms->resp_list_lock,flags); +} + +static u8 vtpm_status(struct tpm_chip *chip) +{ + u8 rc = 0; + unsigned long flags; + struct vtpm_state *vtpms; + + vtpms = (struct vtpm_state *)chip_get_private(chip); + + spin_lock_irqsave(&vtpms->resp_list_lock, flags); + /* + * Data are available if: + * - there's a current response + * - the last packet was queued only (this is fake, but necessary to + * get the generic TPM layer to call the receive function.) + */ + if (vtpms->current_response || + 0 != (vtpms->flags & DATAEX_FLAG_QUEUED_ONLY)) { + rc = STATUS_DATA_AVAIL; + } else if (!vtpms->current_response && !vtpms->current_request) { + rc = STATUS_READY; + } + + spin_unlock_irqrestore(&vtpms->resp_list_lock, flags); + return rc; +} + +static struct file_operations vtpm_ops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .open = tpm_open, + .read = tpm_read, + .write = tpm_write, + .release = tpm_release, +}; + +static DEVICE_ATTR(pubek, S_IRUGO, tpm_show_pubek, NULL); +static DEVICE_ATTR(pcrs, S_IRUGO, tpm_show_pcrs, NULL); +static DEVICE_ATTR(enabled, S_IRUGO, tpm_show_enabled, NULL); +static DEVICE_ATTR(active, S_IRUGO, tpm_show_active, NULL); +static DEVICE_ATTR(owned, S_IRUGO, tpm_show_owned, NULL); +static DEVICE_ATTR(temp_deactivated, S_IRUGO, tpm_show_temp_deactivated, + NULL); +static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps, NULL); +static DEVICE_ATTR(cancel, S_IWUSR |S_IWGRP, NULL, tpm_store_cancel); + +static struct attribute *vtpm_attrs[] = { + &dev_attr_pubek.attr, + &dev_attr_pcrs.attr, + &dev_attr_enabled.attr, + &dev_attr_active.attr, + &dev_attr_owned.attr, + &dev_attr_temp_deactivated.attr, + &dev_attr_caps.attr, + &dev_attr_cancel.attr, + NULL, +}; + +static struct attribute_group vtpm_attr_grp = { .attrs = vtpm_attrs }; + +#define TPM_LONG_TIMEOUT (10 * 60 * HZ) + +static struct tpm_vendor_specific tpm_vtpm = { + .recv = vtpm_recv, + .send = vtpm_send, + .cancel = vtpm_cancel, + .status = vtpm_status, + .req_complete_mask = STATUS_BUSY | STATUS_DATA_AVAIL, + .req_complete_val = STATUS_DATA_AVAIL, + .req_canceled = STATUS_READY, + .attr_group = &vtpm_attr_grp, + .miscdev = { + .fops = &vtpm_ops, + }, + .duration = { + TPM_LONG_TIMEOUT, + TPM_LONG_TIMEOUT, + TPM_LONG_TIMEOUT, + }, +}; + +struct tpm_chip *init_vtpm(struct device *dev, + struct tpm_private *tp) +{ + long rc; + struct tpm_chip *chip; + struct vtpm_state *vtpms; + + vtpms = kzalloc(sizeof(struct vtpm_state), GFP_KERNEL); + if (!vtpms) + return ERR_PTR(-ENOMEM); + + vtpm_state_init(vtpms); + vtpms->tpm_private = tp; + + chip = tpm_register_hardware(dev, &tpm_vtpm); + if (!chip) { + rc = -ENODEV; 
+ goto err_free_mem; + } + + chip_set_private(chip, vtpms); + + return chip; + +err_free_mem: + kfree(vtpms); + + return ERR_PTR(rc); +} + +void cleanup_vtpm(struct device *dev) +{ + struct tpm_chip *chip = dev_get_drvdata(dev); + struct vtpm_state *vtpms = (struct vtpm_state*)chip_get_private(chip); + tpm_remove_hardware(dev); + kfree(vtpms); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ head-2010-05-25/drivers/char/tpm/tpm_vtpm.h 2010-03-24 14:53:41.000000000 +0100 @@ -0,0 +1,55 @@ +#ifndef TPM_VTPM_H +#define TPM_VTPM_H + +struct tpm_chip; +struct tpm_private; + +struct vtpm_state { + struct transmission *current_request; + spinlock_t req_list_lock; + wait_queue_head_t req_wait_queue; + + struct list_head queued_requests; + + struct transmission *current_response; + spinlock_t resp_list_lock; + wait_queue_head_t resp_wait_queue; // processes waiting for responses + + u8 vd_status; + u8 flags; + + unsigned long disconnect_time; + + /* + * The following is a private structure of the underlying + * driver. It is passed as parameter in the send function. + */ + struct tpm_private *tpm_private; +}; + + +enum vdev_status { + TPM_VD_STATUS_DISCONNECTED = 0x0, + TPM_VD_STATUS_CONNECTED = 0x1 +}; + +/* this function is called from tpm_vtpm.c */ +int vtpm_vd_send(struct tpm_private * tp, + const u8 * buf, size_t count, void *ptr); + +/* these functions are offered by tpm_vtpm.c */ +struct tpm_chip *init_vtpm(struct device *, + struct tpm_private *); +void cleanup_vtpm(struct device *); +int vtpm_vd_recv(const struct tpm_chip* chip, + const unsigned char *buffer, size_t count, void *ptr); +void vtpm_vd_status(const struct tpm_chip *, u8 status); + +static inline struct tpm_private *tpm_private_from_dev(struct device *dev) +{ + struct tpm_chip *chip = dev_get_drvdata(dev); + struct vtpm_state *vtpms = chip_get_private(chip); + return vtpms->tpm_private; +} + +#endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ head-2010-05-25/drivers/char/tpm/tpm_xen.c 2010-03-24 14:53:41.000000000 +0100 @@ -0,0 +1,722 @@ +/* + * Copyright (c) 2005, IBM Corporation + * + * Author: Stefan Berger, stefanb@us.ibm.com + * Grant table support: Mahadevan Gomathisankaran + * + * This code has been derived from drivers/xen/netfront/netfront.c + * + * Copyright (c) 2002-2004, K A Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "tpm.h" +#include "tpm_vtpm.h" + +#undef DEBUG + +/* local structures */ +struct tpm_private { + struct tpm_chip *chip; + + tpmif_tx_interface_t *tx; + atomic_t refcnt; + unsigned int irq; + u8 is_connected; + u8 is_suspended; + + spinlock_t tx_lock; + + struct tx_buffer *tx_buffers[TPMIF_TX_RING_SIZE]; + + atomic_t tx_busy; + void *tx_remember; + + domid_t backend_id; + wait_queue_head_t wait_q; + + struct xenbus_device *dev; + int ring_ref; +}; + +struct tx_buffer { + unsigned int size; // available space in data + unsigned int len; // used space in data + unsigned char *data; // pointer to a page +}; + + +/* locally visible variables */ +static grant_ref_t gref_head; +static struct tpm_private *my_priv; + +/* local function prototypes */ +static irqreturn_t tpmif_int(int irq, + void *tpm_priv, + struct pt_regs *ptregs); +static void tpmif_rx_action(unsigned long unused); +static int tpmif_connect(struct xenbus_device *dev, + struct tpm_private *tp, + domid_t domid); +static DECLARE_TASKLET(tpmif_rx_tasklet, tpmif_rx_action, 0); +static int tpmif_allocate_tx_buffers(struct tpm_private *tp); +static void tpmif_free_tx_buffers(struct tpm_private *tp); +static void tpmif_set_connected_state(struct tpm_private *tp, + u8 newstate); +static int tpm_xmit(struct tpm_private *tp, + const u8 * buf, size_t count, int userbuffer, + void *remember); +static void destroy_tpmring(struct tpm_private *tp); +void __exit tpmif_exit(void); + +#define DPRINTK(fmt, args...) \ + pr_debug("xen_tpm_fr (%s:%d) " fmt, __FUNCTION__, __LINE__, ##args) +#define IPRINTK(fmt, args...) \ + printk(KERN_INFO "xen_tpm_fr: " fmt, ##args) +#define WPRINTK(fmt, args...) 
\ + printk(KERN_WARNING "xen_tpm_fr: " fmt, ##args) + +#define GRANT_INVALID_REF 0 + + +static inline int +tx_buffer_copy(struct tx_buffer *txb, const u8 *src, int len, + int isuserbuffer) +{ + int copied = len; + + if (len > txb->size) + copied = txb->size; + if (isuserbuffer) { + if (copy_from_user(txb->data, src, copied)) + return -EFAULT; + } else { + memcpy(txb->data, src, copied); + } + txb->len = len; + return copied; +} + +static inline struct tx_buffer *tx_buffer_alloc(void) +{ + struct tx_buffer *txb; + + txb = kzalloc(sizeof(struct tx_buffer), GFP_KERNEL); + if (!txb) + return NULL; + + txb->len = 0; + txb->size = PAGE_SIZE; + txb->data = (unsigned char *)__get_free_page(GFP_KERNEL); + if (txb->data == NULL) { + kfree(txb); + txb = NULL; + } + + return txb; +} + + +static inline void tx_buffer_free(struct tx_buffer *txb) +{ + if (txb) { + free_page((long)txb->data); + kfree(txb); + } +} + +/************************************************************** + Utility function for the tpm_private structure +**************************************************************/ +static void tpm_private_init(struct tpm_private *tp) +{ + spin_lock_init(&tp->tx_lock); + init_waitqueue_head(&tp->wait_q); + atomic_set(&tp->refcnt, 1); +} + +static void tpm_private_put(void) +{ + if (!atomic_dec_and_test(&my_priv->refcnt)) + return; + + tpmif_free_tx_buffers(my_priv); + kfree(my_priv); + my_priv = NULL; +} + +static struct tpm_private *tpm_private_get(void) +{ + int err; + + if (my_priv) { + atomic_inc(&my_priv->refcnt); + return my_priv; + } + + my_priv = kzalloc(sizeof(struct tpm_private), GFP_KERNEL); + if (!my_priv) + return NULL; + + tpm_private_init(my_priv); + err = tpmif_allocate_tx_buffers(my_priv); + if (err < 0) + tpm_private_put(); + + return my_priv; +} + +/************************************************************** + + The interface to let the tpm plugin register its callback + function and send data to another partition using this module + +**************************************************************/ + +static DEFINE_MUTEX(suspend_lock); +/* + * Send data via this module by calling this function + */ +int vtpm_vd_send(struct tpm_private *tp, + const u8 * buf, size_t count, void *ptr) +{ + int sent; + + mutex_lock(&suspend_lock); + sent = tpm_xmit(tp, buf, count, 0, ptr); + mutex_unlock(&suspend_lock); + + return sent; +} + +/************************************************************** + XENBUS support code +**************************************************************/ + +static int setup_tpmring(struct xenbus_device *dev, + struct tpm_private *tp) +{ + tpmif_tx_interface_t *sring; + int err; + + tp->ring_ref = GRANT_INVALID_REF; + + sring = (void *)__get_free_page(GFP_KERNEL); + if (!sring) { + xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring"); + return -ENOMEM; + } + tp->tx = sring; + + err = xenbus_grant_ring(dev, virt_to_mfn(tp->tx)); + if (err < 0) { + free_page((unsigned long)sring); + tp->tx = NULL; + xenbus_dev_fatal(dev, err, "allocating grant reference"); + goto fail; + } + tp->ring_ref = err; + + err = tpmif_connect(dev, tp, dev->otherend_id); + if (err) + goto fail; + + return 0; +fail: + destroy_tpmring(tp); + return err; +} + + +static void destroy_tpmring(struct tpm_private *tp) +{ + tpmif_set_connected_state(tp, 0); + + if (tp->ring_ref != GRANT_INVALID_REF) { + gnttab_end_foreign_access(tp->ring_ref, (unsigned long)tp->tx); + tp->ring_ref = GRANT_INVALID_REF; + tp->tx = NULL; + } + + if (tp->irq) + unbind_from_irqhandler(tp->irq, tp); + + tp->irq = 0; 
+} + + +static int talk_to_backend(struct xenbus_device *dev, + struct tpm_private *tp) +{ + const char *message = NULL; + int err; + struct xenbus_transaction xbt; + + err = setup_tpmring(dev, tp); + if (err) { + xenbus_dev_fatal(dev, err, "setting up ring"); + goto out; + } + +again: + err = xenbus_transaction_start(&xbt); + if (err) { + xenbus_dev_fatal(dev, err, "starting transaction"); + goto destroy_tpmring; + } + + err = xenbus_printf(xbt, dev->nodename, + "ring-ref","%u", tp->ring_ref); + if (err) { + message = "writing ring-ref"; + goto abort_transaction; + } + + err = xenbus_printf(xbt, dev->nodename, "event-channel", "%u", + irq_to_evtchn_port(tp->irq)); + if (err) { + message = "writing event-channel"; + goto abort_transaction; + } + + err = xenbus_transaction_end(xbt, 0); + if (err == -EAGAIN) + goto again; + if (err) { + xenbus_dev_fatal(dev, err, "completing transaction"); + goto destroy_tpmring; + } + + xenbus_switch_state(dev, XenbusStateConnected); + + return 0; + +abort_transaction: + xenbus_transaction_end(xbt, 1); + if (message) + xenbus_dev_error(dev, err, "%s", message); +destroy_tpmring: + destroy_tpmring(tp); +out: + return err; +} + +/** + * Callback received when the backend's state changes. + */ +static void backend_changed(struct xenbus_device *dev, + enum xenbus_state backend_state) +{ + struct tpm_private *tp = tpm_private_from_dev(&dev->dev); + DPRINTK("\n"); + + switch (backend_state) { + case XenbusStateInitialising: + case XenbusStateInitWait: + case XenbusStateInitialised: + case XenbusStateReconfiguring: + case XenbusStateReconfigured: + case XenbusStateUnknown: + break; + + case XenbusStateConnected: + tpmif_set_connected_state(tp, 1); + break; + + case XenbusStateClosing: + tpmif_set_connected_state(tp, 0); + xenbus_frontend_closed(dev); + break; + + case XenbusStateClosed: + tpmif_set_connected_state(tp, 0); + if (tp->is_suspended == 0) + device_unregister(&dev->dev); + xenbus_frontend_closed(dev); + break; + } +} + +static int tpmfront_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + int err; + int handle; + struct tpm_private *tp = tpm_private_get(); + + if (!tp) + return -ENOMEM; + + tp->chip = init_vtpm(&dev->dev, tp); + if (IS_ERR(tp->chip)) + return PTR_ERR(tp->chip); + + err = xenbus_scanf(XBT_NIL, dev->nodename, + "handle", "%i", &handle); + if (XENBUS_EXIST_ERR(err)) + return err; + + if (err < 0) { + xenbus_dev_fatal(dev,err,"reading virtual-device"); + return err; + } + + tp->dev = dev; + + err = talk_to_backend(dev, tp); + if (err) { + tpm_private_put(); + return err; + } + + return 0; +} + + +static int tpmfront_remove(struct xenbus_device *dev) +{ + struct tpm_private *tp = tpm_private_from_dev(&dev->dev); + destroy_tpmring(tp); + cleanup_vtpm(&dev->dev); + return 0; +} + +static int tpmfront_suspend(struct xenbus_device *dev) +{ + struct tpm_private *tp = tpm_private_from_dev(&dev->dev); + u32 ctr; + + /* Take the lock, preventing any application from sending. */ + mutex_lock(&suspend_lock); + tp->is_suspended = 1; + + for (ctr = 0; atomic_read(&tp->tx_busy); ctr++) { + if ((ctr % 10) == 0) + printk("TPM-FE [INFO]: Waiting for outstanding " + "request.\n"); + /* Wait for a request to be responded to. */ + interruptible_sleep_on_timeout(&tp->wait_q, 100); + } + + return 0; +} + +static int tpmfront_suspend_finish(struct tpm_private *tp) +{ + tp->is_suspended = 0; + /* Allow applications to send again. 
*/ + mutex_unlock(&suspend_lock); + return 0; +} + +static int tpmfront_suspend_cancel(struct xenbus_device *dev) +{ + struct tpm_private *tp = tpm_private_from_dev(&dev->dev); + return tpmfront_suspend_finish(tp); +} + +static int tpmfront_resume(struct xenbus_device *dev) +{ + struct tpm_private *tp = tpm_private_from_dev(&dev->dev); + destroy_tpmring(tp); + return talk_to_backend(dev, tp); +} + +static int tpmif_connect(struct xenbus_device *dev, + struct tpm_private *tp, + domid_t domid) +{ + int err; + + tp->backend_id = domid; + + err = bind_listening_port_to_irqhandler( + domid, tpmif_int, SA_SAMPLE_RANDOM, "tpmif", tp); + if (err <= 0) { + WPRINTK("bind_listening_port_to_irqhandler failed " + "(err=%d)\n", err); + return err; + } + tp->irq = err; + + return 0; +} + +static struct xenbus_device_id tpmfront_ids[] = { + { "vtpm" }, + { "" } +}; + +static struct xenbus_driver tpmfront = { + .name = "vtpm", + .owner = THIS_MODULE, + .ids = tpmfront_ids, + .probe = tpmfront_probe, + .remove = tpmfront_remove, + .resume = tpmfront_resume, + .otherend_changed = backend_changed, + .suspend = tpmfront_suspend, + .suspend_cancel = tpmfront_suspend_cancel, +}; + +static void __init init_tpm_xenbus(void) +{ + xenbus_register_frontend(&tpmfront); +} + +static int tpmif_allocate_tx_buffers(struct tpm_private *tp) +{ + unsigned int i; + + for (i = 0; i < TPMIF_TX_RING_SIZE; i++) { + tp->tx_buffers[i] = tx_buffer_alloc(); + if (!tp->tx_buffers[i]) { + tpmif_free_tx_buffers(tp); + return -ENOMEM; + } + } + return 0; +} + +static void tpmif_free_tx_buffers(struct tpm_private *tp) +{ + unsigned int i; + + for (i = 0; i < TPMIF_TX_RING_SIZE; i++) + tx_buffer_free(tp->tx_buffers[i]); +} + +static void tpmif_rx_action(unsigned long priv) +{ + struct tpm_private *tp = (struct tpm_private *)priv; + int i = 0; + unsigned int received; + unsigned int offset = 0; + u8 *buffer; + tpmif_tx_request_t *tx = &tp->tx->ring[i].req; + + atomic_set(&tp->tx_busy, 0); + wake_up_interruptible(&tp->wait_q); + + received = tx->size; + + buffer = kmalloc(received, GFP_ATOMIC); + if (!buffer) + return; + + for (i = 0; i < TPMIF_TX_RING_SIZE && offset < received; i++) { + struct tx_buffer *txb = tp->tx_buffers[i]; + tpmif_tx_request_t *tx; + unsigned int tocopy; + + tx = &tp->tx->ring[i].req; + tocopy = tx->size; + if (tocopy > PAGE_SIZE) + tocopy = PAGE_SIZE; + + memcpy(&buffer[offset], txb->data, tocopy); + + gnttab_release_grant_reference(&gref_head, tx->ref); + + offset += tocopy; + } + + vtpm_vd_recv(tp->chip, buffer, received, tp->tx_remember); + kfree(buffer); +} + + +static irqreturn_t tpmif_int(int irq, void *tpm_priv, struct pt_regs *ptregs) +{ + struct tpm_private *tp = tpm_priv; + unsigned long flags; + + spin_lock_irqsave(&tp->tx_lock, flags); + tpmif_rx_tasklet.data = (unsigned long)tp; + tasklet_schedule(&tpmif_rx_tasklet); + spin_unlock_irqrestore(&tp->tx_lock, flags); + + return IRQ_HANDLED; +} + + +static int tpm_xmit(struct tpm_private *tp, + const u8 * buf, size_t count, int isuserbuffer, + void *remember) +{ + tpmif_tx_request_t *tx; + TPMIF_RING_IDX i; + unsigned int offset = 0; + + spin_lock_irq(&tp->tx_lock); + + if (unlikely(atomic_read(&tp->tx_busy))) { + printk("tpm_xmit: There's an outstanding request/response " + "on the way!\n"); + spin_unlock_irq(&tp->tx_lock); + return -EBUSY; + } + + if (tp->is_connected != 1) { + spin_unlock_irq(&tp->tx_lock); + return -EIO; + } + + for (i = 0; count > 0 && i < TPMIF_TX_RING_SIZE; i++) { + struct tx_buffer *txb = tp->tx_buffers[i]; + int copied; + + if (!txb) 
{ + DPRINTK("txb (i=%d) is NULL. buffers initilized?\n" + "Not transmitting anything!\n", i); + spin_unlock_irq(&tp->tx_lock); + return -EFAULT; + } + + copied = tx_buffer_copy(txb, &buf[offset], count, + isuserbuffer); + if (copied < 0) { + /* An error occurred */ + spin_unlock_irq(&tp->tx_lock); + return copied; + } + count -= copied; + offset += copied; + + tx = &tp->tx->ring[i].req; + tx->addr = virt_to_machine(txb->data); + tx->size = txb->len; + tx->unused = 0; + + DPRINTK("First 4 characters sent by TPM-FE are " + "0x%02x 0x%02x 0x%02x 0x%02x\n", + txb->data[0],txb->data[1],txb->data[2],txb->data[3]); + + /* Get the granttable reference for this page. */ + tx->ref = gnttab_claim_grant_reference(&gref_head); + if (tx->ref == -ENOSPC) { + spin_unlock_irq(&tp->tx_lock); + DPRINTK("Grant table claim reference failed in " + "func:%s line:%d file:%s\n", + __FUNCTION__, __LINE__, __FILE__); + return -ENOSPC; + } + gnttab_grant_foreign_access_ref(tx->ref, + tp->backend_id, + virt_to_mfn(txb->data), + 0 /*RW*/); + wmb(); + } + + atomic_set(&tp->tx_busy, 1); + tp->tx_remember = remember; + + mb(); + + notify_remote_via_irq(tp->irq); + + spin_unlock_irq(&tp->tx_lock); + return offset; +} + + +static void tpmif_notify_upperlayer(struct tpm_private *tp) +{ + /* Notify upper layer about the state of the connection to the BE. */ + vtpm_vd_status(tp->chip, (tp->is_connected + ? TPM_VD_STATUS_CONNECTED + : TPM_VD_STATUS_DISCONNECTED)); +} + + +static void tpmif_set_connected_state(struct tpm_private *tp, u8 is_connected) +{ + /* + * Don't notify upper layer if we are in suspend mode and + * should disconnect - assumption is that we will resume + * The mutex keeps apps from sending. + */ + if (is_connected == 0 && tp->is_suspended == 1) + return; + + /* + * Unlock the mutex if we are connected again + * after being suspended - now resuming. + * This also removes the suspend state. + */ + if (is_connected == 1 && tp->is_suspended == 1) + tpmfront_suspend_finish(tp); + + if (is_connected != tp->is_connected) { + tp->is_connected = is_connected; + tpmif_notify_upperlayer(tp); + } +} + + + +/* ================================================================= + * Initialization function. + * ================================================================= + */ + + +static int __init tpmif_init(void) +{ + struct tpm_private *tp; + + if (is_initial_xendomain()) + return -EPERM; + + tp = tpm_private_get(); + if (!tp) + return -ENOMEM; + + IPRINTK("Initialising the vTPM driver.\n"); + if (gnttab_alloc_grant_references(TPMIF_TX_RING_SIZE, + &gref_head) < 0) { + tpm_private_put(); + return -EFAULT; + } + + init_tpm_xenbus(); + return 0; +} + + +module_init(tpmif_init); + +MODULE_LICENSE("Dual BSD/GPL"); --- head-2010-05-25.orig/drivers/edac/edac_mc.c 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/drivers/edac/edac_mc.c 2010-03-24 14:53:41.000000000 +0100 @@ -602,6 +602,10 @@ static void edac_mc_scrub_block(unsigned debugf3("%s()\n", __func__); +#ifdef CONFIG_XEN + page = mfn_to_local_pfn(page); +#endif + /* ECC error page was not in our memory. Ignore it. 
*/ if (!pfn_valid(page)) return; --- head-2010-05-25.orig/drivers/firmware/dell_rbu.c 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/drivers/firmware/dell_rbu.c 2010-04-15 09:43:35.000000000 +0200 @@ -170,9 +170,28 @@ static int create_packet(void *data, siz spin_lock(&rbu_data.lock); goto out_alloc_packet_array; } +#ifdef CONFIG_XEN + if (ordernum && xen_create_contiguous_region( + (unsigned long)packet_data_temp_buf, ordernum, 0)) { + free_pages((unsigned long)packet_data_temp_buf, + ordernum); + printk(KERN_WARNING + "dell_rbu:%s: failed to adjust new " + "packet\n", __func__); + retval = -ENOMEM; + spin_lock(&rbu_data.lock); + goto out_alloc_packet_array; + } +#endif - if ((unsigned long)virt_to_phys(packet_data_temp_buf) + if ((unsigned long)virt_to_bus(packet_data_temp_buf) < allocation_floor) { +#ifdef CONFIG_XEN + if (ordernum) + xen_destroy_contiguous_region( + (unsigned long)packet_data_temp_buf, + ordernum); +#endif pr_debug("packet 0x%lx below floor at 0x%lx.\n", (unsigned long)virt_to_phys( packet_data_temp_buf), @@ -186,7 +205,7 @@ static int create_packet(void *data, siz newpacket->data = packet_data_temp_buf; pr_debug("create_packet: newpacket at physical addr %lx\n", - (unsigned long)virt_to_phys(newpacket->data)); + (unsigned long)virt_to_bus(newpacket->data)); /* packets may not have fixed size */ newpacket->length = length; @@ -205,7 +224,7 @@ out_alloc_packet_array: /* always free packet array */ for (;idx>0;idx--) { pr_debug("freeing unused packet below floor 0x%lx.\n", - (unsigned long)virt_to_phys( + (unsigned long)virt_to_bus( invalid_addr_packet_array[idx-1])); free_pages((unsigned long)invalid_addr_packet_array[idx-1], ordernum); @@ -349,6 +368,13 @@ static void packet_empty_list(void) * to make sure there are no stale RBU packets left in memory */ memset(newpacket->data, 0, rbu_data.packetsize); +#ifdef CONFIG_XEN + if (newpacket->ordernum) + xen_destroy_contiguous_region( + (unsigned long)newpacket->data, + newpacket->ordernum); +#endif + free_pages((unsigned long) newpacket->data, newpacket->ordernum); kfree(newpacket); @@ -403,7 +429,9 @@ static int img_update_realloc(unsigned l { unsigned char *image_update_buffer = NULL; unsigned long rc; +#ifndef CONFIG_XEN unsigned long img_buf_phys_addr; +#endif int ordernum; int dma_alloc = 0; @@ -434,15 +462,19 @@ static int img_update_realloc(unsigned l spin_unlock(&rbu_data.lock); +#ifndef CONFIG_XEN ordernum = get_order(size); image_update_buffer = (unsigned char *) __get_free_pages(GFP_KERNEL, ordernum); img_buf_phys_addr = - (unsigned long) virt_to_phys(image_update_buffer); + (unsigned long) virt_to_bus(image_update_buffer); if (img_buf_phys_addr > BIOS_SCAN_LIMIT) { free_pages((unsigned long) image_update_buffer, ordernum); +#else + { +#endif ordernum = -1; image_update_buffer = dma_alloc_coherent(NULL, size, &dell_rbu_dmaaddr, GFP_KERNEL); @@ -695,6 +727,12 @@ static struct bin_attribute rbu_packet_s static int __init dcdrbu_init(void) { int rc; + +#ifdef CONFIG_XEN + if (!is_initial_xendomain()) + return -ENODEV; +#endif + spin_lock_init(&rbu_data.lock); init_packet_head(); --- head-2010-05-25.orig/drivers/ide/ide-lib.c 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/drivers/ide/ide-lib.c 2010-03-24 14:53:41.000000000 +0100 @@ -18,12 +18,12 @@ void ide_toggle_bounce(ide_drive_t *driv { u64 addr = BLK_BOUNCE_HIGH; /* dma64_addr_t */ - if (!PCI_DMA_BUS_IS_PHYS) { - addr = BLK_BOUNCE_ANY; - } else if (on && drive->media == ide_disk) { + if (on && drive->media == ide_disk) { struct device *dev = 
drive->hwif->dev; - if (dev && dev->dma_mask) + if (!PCI_DMA_BUS_IS_PHYS) + addr = BLK_BOUNCE_ANY; + else if (dev && dev->dma_mask) addr = *dev->dma_mask; } --- head-2010-05-25.orig/drivers/oprofile/buffer_sync.c 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/drivers/oprofile/buffer_sync.c 2010-04-15 09:43:44.000000000 +0200 @@ -8,6 +8,10 @@ * @author Barry Kasindorf * @author Robert Richter * + * Modified by Aravind Menon for Xen + * These modifications are: + * Copyright (C) 2005 Hewlett-Packard Co. + * * This is the core of the buffer management. Each * CPU buffer is processed and entered into the * global event buffer. Such processing is necessary @@ -43,6 +47,8 @@ static cpumask_var_t marked_cpus; static DEFINE_SPINLOCK(task_mortuary); static void process_task_mortuary(void); +static int cpu_current_domain[NR_CPUS]; + /* Take ownership of the task struct and place it on the * list for processing. Only after two full buffer syncs * does the task eventually get freed, because by then @@ -61,7 +67,6 @@ task_free_notify(struct notifier_block * return NOTIFY_OK; } - /* The task is on its way out. A sync of the buffer means we can catch * any remaining samples for this task. */ @@ -154,6 +159,11 @@ static void end_sync(void) int sync_start(void) { int err; + int i; + + for (i = 0; i < NR_CPUS; i++) { + cpu_current_domain[i] = COORDINATOR_DOMAIN; + } if (!zalloc_cpumask_var(&marked_cpus, GFP_KERNEL)) return -ENOMEM; @@ -285,13 +295,29 @@ static void add_cpu_switch(int i) last_cookie = INVALID_COOKIE; } -static void add_kernel_ctx_switch(unsigned int in_kernel) +static void add_cpu_mode_switch(unsigned int cpu_mode) { add_event_entry(ESCAPE_CODE); - if (in_kernel) + switch (cpu_mode) { + case CPU_MODE_USER: + add_event_entry(USER_ENTER_SWITCH_CODE); + break; + case CPU_MODE_KERNEL: add_event_entry(KERNEL_ENTER_SWITCH_CODE); - else - add_event_entry(KERNEL_EXIT_SWITCH_CODE); + break; + case CPU_MODE_XEN: + add_event_entry(XEN_ENTER_SWITCH_CODE); + break; + default: + break; + } +} + +static void add_domain_switch(unsigned long domain_id) +{ + add_event_entry(ESCAPE_CODE); + add_event_entry(DOMAIN_SWITCH_CODE); + add_event_entry(domain_id); } static void @@ -372,12 +398,12 @@ static inline void add_sample_entry(unsi * for later lookup from userspace. Return 0 on failure. 
*/ static int -add_sample(struct mm_struct *mm, struct op_sample *s, int in_kernel) +add_sample(struct mm_struct *mm, struct op_sample *s, int cpu_mode) { unsigned long cookie; off_t offset; - if (in_kernel) { + if (cpu_mode >= CPU_MODE_KERNEL) { add_sample_entry(s->eip, s->event); return 1; } @@ -502,9 +528,10 @@ void sync_buffer(int cpu) unsigned long val; struct task_struct *new; unsigned long cookie = 0; - int in_kernel = 1; + int cpu_mode = CPU_MODE_KERNEL; sync_buffer_state state = sb_buffer_start; unsigned int i; + int domain_switch = 0; unsigned long available; unsigned long flags; struct op_entry entry; @@ -514,6 +541,11 @@ void sync_buffer(int cpu) add_cpu_switch(cpu); + /* We need to assign the first samples in this CPU buffer to the + same domain that we were processing at the last sync_buffer */ + if (cpu_current_domain[cpu] != COORDINATOR_DOMAIN) + add_domain_switch(cpu_current_domain[cpu]); + op_cpu_buffer_reset(cpu); available = op_cpu_buffer_entries(cpu); @@ -522,6 +554,13 @@ void sync_buffer(int cpu) if (!sample) break; + if (domain_switch) { + cpu_current_domain[cpu] = sample->eip; + add_domain_switch(sample->eip); + domain_switch = 0; + continue; + } + if (is_code(sample->eip)) { flags = sample->event; if (flags & TRACE_BEGIN) { @@ -530,10 +569,10 @@ void sync_buffer(int cpu) } if (flags & KERNEL_CTX_SWITCH) { /* kernel/userspace switch */ - in_kernel = flags & IS_KERNEL; + cpu_mode = flags & CPU_MODE_MASK; if (state == sb_buffer_start) state = sb_sample_start; - add_kernel_ctx_switch(flags & IS_KERNEL); + add_cpu_mode_switch(cpu_mode); } if (flags & USER_CTX_SWITCH && op_cpu_buffer_get_data(&entry, &val)) { @@ -546,16 +585,23 @@ void sync_buffer(int cpu) cookie = get_exec_dcookie(mm); add_user_ctx_switch(new, cookie); } + if (flags & DOMAIN_SWITCH) + domain_switch = 1; if (op_cpu_buffer_get_size(&entry)) add_data(&entry, mm); continue; } + if (cpu_current_domain[cpu] != COORDINATOR_DOMAIN) { + add_sample_entry(sample->eip, sample->event); + continue; + } + if (state < sb_bt_start) /* ignore sample */ continue; - if (add_sample(mm, sample, in_kernel)) + if (add_sample(mm, sample, cpu_mode)) continue; /* ignore backtraces if failed to add a sample */ @@ -566,6 +612,10 @@ void sync_buffer(int cpu) } release_mm(mm); + /* We reset domain to COORDINATOR at each CPU switch */ + if (cpu_current_domain[cpu] != COORDINATOR_DOMAIN) + add_domain_switch(COORDINATOR_DOMAIN); + mark_done(cpu); mutex_unlock(&buffer_mutex); --- head-2010-05-25.orig/drivers/oprofile/cpu_buffer.c 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/drivers/oprofile/cpu_buffer.c 2010-03-24 14:53:41.000000000 +0100 @@ -8,6 +8,10 @@ * @author Barry Kasindorf * @author Robert Richter * + * Modified by Aravind Menon for Xen + * These modifications are: + * Copyright (C) 2005 Hewlett-Packard Co. + * * Each CPU has a local buffer that stores PC value/event * pairs. We also log context switches when we notice them. 
* Eventually each CPU's buffer is processed into the global @@ -54,6 +58,8 @@ static void wq_sync_buffer(struct work_s #define DEFAULT_TIMER_EXPIRE (HZ / 10) static int work_enabled; +static int32_t current_domain = COORDINATOR_DOMAIN; + unsigned long oprofile_get_cpu_buffer_size(void) { return oprofile_cpu_buffer_size; @@ -97,7 +103,7 @@ int alloc_cpu_buffers(void) struct oprofile_cpu_buffer *b = &per_cpu(op_cpu_buffer, i); b->last_task = NULL; - b->last_is_kernel = -1; + b->last_cpu_mode = -1; b->tracing = 0; b->buffer_size = buffer_size; b->sample_received = 0; @@ -215,7 +221,7 @@ unsigned long op_cpu_buffer_entries(int static int op_add_code(struct oprofile_cpu_buffer *cpu_buf, unsigned long backtrace, - int is_kernel, struct task_struct *task) + int cpu_mode, struct task_struct *task) { struct op_entry entry; struct op_sample *sample; @@ -228,16 +234,15 @@ op_add_code(struct oprofile_cpu_buffer * flags |= TRACE_BEGIN; /* notice a switch from user->kernel or vice versa */ - is_kernel = !!is_kernel; - if (cpu_buf->last_is_kernel != is_kernel) { - cpu_buf->last_is_kernel = is_kernel; - flags |= KERNEL_CTX_SWITCH; - if (is_kernel) - flags |= IS_KERNEL; + if (cpu_buf->last_cpu_mode != cpu_mode) { + cpu_buf->last_cpu_mode = cpu_mode; + flags |= KERNEL_CTX_SWITCH | cpu_mode; } /* notice a task switch */ - if (cpu_buf->last_task != task) { + /* if not processing other domain samples */ + if (cpu_buf->last_task != task && + current_domain == COORDINATOR_DOMAIN) { cpu_buf->last_task = task; flags |= USER_CTX_SWITCH; } @@ -286,14 +291,14 @@ op_add_sample(struct oprofile_cpu_buffer /* * This must be safe from any context. * - * is_kernel is needed because on some architectures you cannot + * cpu_mode is needed because on some architectures you cannot * tell if you are in kernel or user space simply by looking at - * pc. We tag this in the buffer by generating kernel enter/exit - * events whenever is_kernel changes + * pc. 
We tag this in the buffer by generating kernel/user (and + * xen) enter events whenever cpu_mode changes */ static int log_sample(struct oprofile_cpu_buffer *cpu_buf, unsigned long pc, - unsigned long backtrace, int is_kernel, unsigned long event) + unsigned long backtrace, int cpu_mode, unsigned long event) { cpu_buf->sample_received++; @@ -302,7 +307,7 @@ log_sample(struct oprofile_cpu_buffer *c return 0; } - if (op_add_code(cpu_buf, backtrace, is_kernel, current)) + if (op_add_code(cpu_buf, backtrace, cpu_mode, current)) goto fail; if (op_add_sample(cpu_buf, pc, event)) @@ -457,6 +462,25 @@ fail: return; } +int oprofile_add_domain_switch(int32_t domain_id) +{ + struct oprofile_cpu_buffer * cpu_buf = &cpu_buffer[smp_processor_id()]; + + /* should have space for switching into and out of domain + (2 slots each) plus one sample and one cpu mode switch */ + if (((nr_available_slots(cpu_buf) < 6) && + (domain_id != COORDINATOR_DOMAIN)) || + (nr_available_slots(cpu_buf) < 2)) + return 0; + + add_code(cpu_buf, DOMAIN_SWITCH); + add_sample(cpu_buf, domain_id, 0); + + current_domain = domain_id; + + return 1; +} + /* * This serves to avoid cpu buffer overflow, and makes sure * the task mortuary progresses --- head-2010-05-25.orig/drivers/oprofile/cpu_buffer.h 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/drivers/oprofile/cpu_buffer.h 2010-03-24 14:53:41.000000000 +0100 @@ -40,7 +40,7 @@ struct op_entry; struct oprofile_cpu_buffer { unsigned long buffer_size; struct task_struct *last_task; - int last_is_kernel; + int last_cpu_mode; int tracing; unsigned long sample_received; unsigned long sample_lost_overflow; @@ -62,7 +62,7 @@ static inline void op_cpu_buffer_reset(i { struct oprofile_cpu_buffer *cpu_buf = &per_cpu(op_cpu_buffer, cpu); - cpu_buf->last_is_kernel = -1; + cpu_buf->last_cpu_mode = -1; cpu_buf->last_task = NULL; } @@ -112,9 +112,13 @@ int op_cpu_buffer_get_data(struct op_ent } /* extra data flags */ -#define KERNEL_CTX_SWITCH (1UL << 0) -#define IS_KERNEL (1UL << 1) +#define CPU_MODE_USER 0 +#define CPU_MODE_KERNEL 1 +#define CPU_MODE_XEN 2 +#define CPU_MODE_MASK 3 #define TRACE_BEGIN (1UL << 2) #define USER_CTX_SWITCH (1UL << 3) +#define KERNEL_CTX_SWITCH (1UL << 4) +#define DOMAIN_SWITCH (1UL << 5) #endif /* OPROFILE_CPU_BUFFER_H */ --- head-2010-05-25.orig/drivers/oprofile/event_buffer.h 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/drivers/oprofile/event_buffer.h 2010-03-24 14:53:41.000000000 +0100 @@ -30,6 +30,9 @@ void wake_up_buffer_waiter(void); #define INVALID_COOKIE ~0UL #define NO_COOKIE 0UL +/* Constant used to refer to coordinator domain (Xen) */ +#define COORDINATOR_DOMAIN -1 + extern const struct file_operations event_buffer_fops; /* mutex between sync_cpu_buffers() and the --- head-2010-05-25.orig/drivers/oprofile/oprof.c 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/drivers/oprofile/oprof.c 2010-03-24 14:53:41.000000000 +0100 @@ -5,6 +5,10 @@ * @remark Read the file COPYING * * @author John Levon + * + * Modified by Aravind Menon for Xen + * These modifications are: + * Copyright (C) 2005 Hewlett-Packard Co. 
*/ #include @@ -35,6 +39,32 @@ static DEFINE_MUTEX(start_mutex); */ static int timer = 0; +int oprofile_set_active(int active_domains[], unsigned int adomains) +{ + int err; + + if (!oprofile_ops.set_active) + return -EINVAL; + + mutex_lock(&start_mutex); + err = oprofile_ops.set_active(active_domains, adomains); + mutex_unlock(&start_mutex); + return err; +} + +int oprofile_set_passive(int passive_domains[], unsigned int pdomains) +{ + int err; + + if (!oprofile_ops.set_passive) + return -EINVAL; + + mutex_lock(&start_mutex); + err = oprofile_ops.set_passive(passive_domains, pdomains); + mutex_unlock(&start_mutex); + return err; +} + int oprofile_setup(void) { int err; --- head-2010-05-25.orig/drivers/oprofile/oprof.h 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/drivers/oprofile/oprof.h 2010-03-24 14:53:41.000000000 +0100 @@ -39,4 +39,7 @@ void oprofile_timer_init(struct oprofile int oprofile_set_backtrace(unsigned long depth); int oprofile_set_timeout(unsigned long time); +int oprofile_set_active(int active_domains[], unsigned int adomains); +int oprofile_set_passive(int passive_domains[], unsigned int pdomains); + #endif /* OPROF_H */ --- head-2010-05-25.orig/drivers/oprofile/oprofile_files.c 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/drivers/oprofile/oprofile_files.c 2010-03-24 14:53:41.000000000 +0100 @@ -5,11 +5,17 @@ * @remark Read the file COPYING * * @author John Levon + * + * Modified by Aravind Menon for Xen + * These modifications are: + * Copyright (C) 2005 Hewlett-Packard Co. */ #include #include #include +#include +#include #include "event_buffer.h" #include "oprofile_stats.h" @@ -165,6 +171,195 @@ static const struct file_operations dump .write = dump_write, }; +#define TMPBUFSIZE 512 + +static unsigned int adomains = 0; +static int active_domains[MAX_OPROF_DOMAINS + 1]; +static DEFINE_MUTEX(adom_mutex); + +static ssize_t adomain_write(struct file * file, char const __user * buf, + size_t count, loff_t * offset) +{ + char *tmpbuf; + char *startp, *endp; + int i; + unsigned long val; + ssize_t retval = count; + + if (*offset) + return -EINVAL; + if (count > TMPBUFSIZE - 1) + return -EINVAL; + + if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL))) + return -ENOMEM; + + if (copy_from_user(tmpbuf, buf, count)) { + kfree(tmpbuf); + return -EFAULT; + } + tmpbuf[count] = 0; + + mutex_lock(&adom_mutex); + + startp = tmpbuf; + /* Parse one more than MAX_OPROF_DOMAINS, for easy error checking */ + for (i = 0; i <= MAX_OPROF_DOMAINS; i++) { + val = simple_strtoul(startp, &endp, 0); + if (endp == startp) + break; + while (ispunct(*endp) || isspace(*endp)) + endp++; + active_domains[i] = val; + if (active_domains[i] != val) + /* Overflow, force error below */ + i = MAX_OPROF_DOMAINS + 1; + startp = endp; + } + /* Force error on trailing junk */ + adomains = *startp ? MAX_OPROF_DOMAINS + 1 : i; + + kfree(tmpbuf); + + if (adomains > MAX_OPROF_DOMAINS + || oprofile_set_active(active_domains, adomains)) { + adomains = 0; + retval = -EINVAL; + } + + mutex_unlock(&adom_mutex); + return retval; +} + +static ssize_t adomain_read(struct file * file, char __user * buf, + size_t count, loff_t * offset) +{ + char * tmpbuf; + size_t len; + int i; + ssize_t retval; + + if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL))) + return -ENOMEM; + + mutex_lock(&adom_mutex); + + len = 0; + for (i = 0; i < adomains; i++) + len += snprintf(tmpbuf + len, + len < TMPBUFSIZE ? 
TMPBUFSIZE - len : 0, + "%u ", active_domains[i]); + WARN_ON(len > TMPBUFSIZE); + if (len != 0 && len <= TMPBUFSIZE) + tmpbuf[len-1] = '\n'; + + mutex_unlock(&adom_mutex); + + retval = simple_read_from_buffer(buf, count, offset, tmpbuf, len); + + kfree(tmpbuf); + return retval; +} + + +static const struct file_operations active_domain_ops = { + .read = adomain_read, + .write = adomain_write, +}; + +static unsigned int pdomains = 0; +static int passive_domains[MAX_OPROF_DOMAINS]; +static DEFINE_MUTEX(pdom_mutex); + +static ssize_t pdomain_write(struct file * file, char const __user * buf, + size_t count, loff_t * offset) +{ + char *tmpbuf; + char *startp, *endp; + int i; + unsigned long val; + ssize_t retval = count; + + if (*offset) + return -EINVAL; + if (count > TMPBUFSIZE - 1) + return -EINVAL; + + if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL))) + return -ENOMEM; + + if (copy_from_user(tmpbuf, buf, count)) { + kfree(tmpbuf); + return -EFAULT; + } + tmpbuf[count] = 0; + + mutex_lock(&pdom_mutex); + + startp = tmpbuf; + /* Parse one more than MAX_OPROF_DOMAINS, for easy error checking */ + for (i = 0; i <= MAX_OPROF_DOMAINS; i++) { + val = simple_strtoul(startp, &endp, 0); + if (endp == startp) + break; + while (ispunct(*endp) || isspace(*endp)) + endp++; + passive_domains[i] = val; + if (passive_domains[i] != val) + /* Overflow, force error below */ + i = MAX_OPROF_DOMAINS + 1; + startp = endp; + } + /* Force error on trailing junk */ + pdomains = *startp ? MAX_OPROF_DOMAINS + 1 : i; + + kfree(tmpbuf); + + if (pdomains > MAX_OPROF_DOMAINS + || oprofile_set_passive(passive_domains, pdomains)) { + pdomains = 0; + retval = -EINVAL; + } + + mutex_unlock(&pdom_mutex); + return retval; +} + +static ssize_t pdomain_read(struct file * file, char __user * buf, + size_t count, loff_t * offset) +{ + char * tmpbuf; + size_t len; + int i; + ssize_t retval; + + if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL))) + return -ENOMEM; + + mutex_lock(&pdom_mutex); + + len = 0; + for (i = 0; i < pdomains; i++) + len += snprintf(tmpbuf + len, + len < TMPBUFSIZE ? 
TMPBUFSIZE - len : 0, + "%u ", passive_domains[i]); + WARN_ON(len > TMPBUFSIZE); + if (len != 0 && len <= TMPBUFSIZE) + tmpbuf[len-1] = '\n'; + + mutex_unlock(&pdom_mutex); + + retval = simple_read_from_buffer(buf, count, offset, tmpbuf, len); + + kfree(tmpbuf); + return retval; +} + +static const struct file_operations passive_domain_ops = { + .read = pdomain_read, + .write = pdomain_write, +}; + void oprofile_create_files(struct super_block *sb, struct dentry *root) { /* reinitialize default values */ @@ -175,6 +370,8 @@ void oprofile_create_files(struct super_ oprofilefs_create_file(sb, root, "enable", &enable_fops); oprofilefs_create_file_perm(sb, root, "dump", &dump_fops, 0666); + oprofilefs_create_file(sb, root, "active_domains", &active_domain_ops); + oprofilefs_create_file(sb, root, "passive_domains", &passive_domain_ops); oprofilefs_create_file(sb, root, "buffer", &event_buffer_fops); oprofilefs_create_ulong(sb, root, "buffer_size", &oprofile_buffer_size); oprofilefs_create_ulong(sb, root, "buffer_watershed", &oprofile_buffer_watershed); --- head-2010-05-25.orig/fs/aio.c 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/fs/aio.c 2010-03-24 14:53:41.000000000 +0100 @@ -40,6 +40,11 @@ #include #include +#ifdef CONFIG_EPOLL +#include +#include +#endif + #if DEBUG > 1 #define dprintk printk #else @@ -997,6 +1002,11 @@ put_rq: if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); +#ifdef CONFIG_EPOLL + if (ctx->file && waitqueue_active(&ctx->poll_wait)) + wake_up(&ctx->poll_wait); +#endif + spin_unlock_irqrestore(&ctx->ctx_lock, flags); return ret; } @@ -1005,6 +1015,8 @@ EXPORT_SYMBOL(aio_complete); /* aio_read_evt * Pull an event off of the ioctx's event ring. Returns the number of * events fetched (0 or 1 ;-) + * If ent parameter is 0, just returns the number of events that would + * be fetched. * FIXME: make this use cmpxchg. * TODO: make the ringbuffer user mmap()able (requires FIXME). */ @@ -1027,13 +1039,18 @@ static int aio_read_evt(struct kioctx *i head = ring->head % info->nr; if (head != ring->tail) { - struct io_event *evp = aio_ring_event(info, head, KM_USER1); - *ent = *evp; - head = (head + 1) % info->nr; - smp_mb(); /* finish reading the event before updatng the head */ - ring->head = head; - ret = 1; - put_aio_ring_event(evp, KM_USER1); + if (ent) { /* event requested */ + struct io_event *evp = + aio_ring_event(info, head, KM_USER1); + *ent = *evp; + head = (head + 1) % info->nr; + /* finish reading the event before updatng the head */ + smp_mb(); + ring->head = head; + ret = 1; + put_aio_ring_event(evp, KM_USER1); + } else /* only need to know availability */ + ret = 1; } spin_unlock(&info->ring_lock); @@ -1218,6 +1235,13 @@ static void io_destroy(struct kioctx *io aio_cancel_all(ioctx); wait_for_all_aios(ioctx); +#ifdef CONFIG_EPOLL + /* forget the poll file, but it's up to the user to close it */ + if (ioctx->file) { + ioctx->file->private_data = 0; + ioctx->file = 0; + } +#endif /* * Wake up any waiters. 
The setting of ctx->dead must be seen @@ -1228,6 +1252,67 @@ static void io_destroy(struct kioctx *io put_ioctx(ioctx); /* once for the lookup */ } +#ifdef CONFIG_EPOLL + +static int aio_queue_fd_close(struct inode *inode, struct file *file) +{ + struct kioctx *ioctx = file->private_data; + if (ioctx) { + file->private_data = 0; + spin_lock_irq(&ioctx->ctx_lock); + ioctx->file = 0; + spin_unlock_irq(&ioctx->ctx_lock); + } + return 0; +} + +static unsigned int aio_queue_fd_poll(struct file *file, poll_table *wait) +{ unsigned int pollflags = 0; + struct kioctx *ioctx = file->private_data; + + if (ioctx) { + + spin_lock_irq(&ioctx->ctx_lock); + /* Insert inside our poll wait queue */ + poll_wait(file, &ioctx->poll_wait, wait); + + /* Check our condition */ + if (aio_read_evt(ioctx, 0)) + pollflags = POLLIN | POLLRDNORM; + spin_unlock_irq(&ioctx->ctx_lock); + } + + return pollflags; +} + +static const struct file_operations aioq_fops = { + .release = aio_queue_fd_close, + .poll = aio_queue_fd_poll +}; + +/* make_aio_fd: + * Create a file descriptor that can be used to poll the event queue. + * Based and piggybacked on the excellent epoll code. + */ + +static int make_aio_fd(struct kioctx *ioctx) +{ + int error, fd; + struct inode *inode; + struct file *file; + + error = ep_getfd(&fd, &inode, &file, NULL, &aioq_fops); + if (error) + return error; + + /* associate the file with the IO context */ + file->private_data = ioctx; + ioctx->file = file; + init_waitqueue_head(&ioctx->poll_wait); + return fd; +} +#endif + /* sys_io_setup: * Create an aio_context capable of receiving at least nr_events. * ctxp must not point to an aio_context that already exists, and @@ -1240,18 +1325,30 @@ static void io_destroy(struct kioctx *io * resources are available. May fail with -EFAULT if an invalid * pointer is passed for ctxp. Will fail with -ENOSYS if not * implemented. + * + * To request a selectable fd, the user context has to be initialized + * to 1, instead of 0, and the return value is the fd. + * This keeps the system call compatible, since a non-zero value + * was not allowed so far. 
*/ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) { struct kioctx *ioctx = NULL; unsigned long ctx; long ret; + int make_fd = 0; ret = get_user(ctx, ctxp); if (unlikely(ret)) goto out; ret = -EINVAL; +#ifdef CONFIG_EPOLL + if (ctx == 1) { + make_fd = 1; + ctx = 0; + } +#endif if (unlikely(ctx || nr_events == 0)) { pr_debug("EINVAL: io_setup: ctx %lu nr_events %u\n", ctx, nr_events); @@ -1262,8 +1359,12 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_e ret = PTR_ERR(ioctx); if (!IS_ERR(ioctx)) { ret = put_user(ioctx->user_id, ctxp); - if (!ret) - return 0; +#ifdef CONFIG_EPOLL + if (make_fd && ret >= 0) + ret = make_aio_fd(ioctx); +#endif + if (ret >= 0) + return ret; get_ioctx(ioctx); /* io_destroy() expects us to hold a ref */ io_destroy(ioctx); --- head-2010-05-25.orig/fs/compat_ioctl.c 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/fs/compat_ioctl.c 2010-05-12 08:57:55.000000000 +0200 @@ -116,6 +116,13 @@ #include #endif +#ifdef CONFIG_XEN +#include +#include +#include +#include +#endif + static int w_long(unsigned int fd, unsigned int cmd, compat_ulong_t __user *argp) { @@ -1518,6 +1525,19 @@ IGNORE_IOCTL(FBIOGETCMAP32) IGNORE_IOCTL(FBIOSCURSOR32) IGNORE_IOCTL(FBIOGCURSOR32) #endif + +#ifdef CONFIG_XEN +HANDLE_IOCTL(IOCTL_PRIVCMD_MMAP_32, privcmd_ioctl_32) +HANDLE_IOCTL(IOCTL_PRIVCMD_MMAPBATCH_32, privcmd_ioctl_32) +HANDLE_IOCTL(IOCTL_PRIVCMD_MMAPBATCH_V2_32, privcmd_ioctl_32) +COMPATIBLE_IOCTL(IOCTL_PRIVCMD_HYPERCALL) +COMPATIBLE_IOCTL(IOCTL_EVTCHN_BIND_VIRQ) +COMPATIBLE_IOCTL(IOCTL_EVTCHN_BIND_INTERDOMAIN) +COMPATIBLE_IOCTL(IOCTL_EVTCHN_BIND_UNBOUND_PORT) +COMPATIBLE_IOCTL(IOCTL_EVTCHN_UNBIND) +COMPATIBLE_IOCTL(IOCTL_EVTCHN_NOTIFY) +COMPATIBLE_IOCTL(IOCTL_EVTCHN_RESET) +#endif }; /* --- head-2010-05-25.orig/include/acpi/processor.h 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/include/acpi/processor.h 2010-03-24 14:53:41.000000000 +0100 @@ -17,6 +17,12 @@ #define ACPI_PROCESSOR_MAX_THROTTLE 250 /* 25% */ #define ACPI_PROCESSOR_MAX_DUTY_WIDTH 4 +#ifdef CONFIG_XEN +#define NR_ACPI_CPUS (NR_CPUS < 256 ? 
256 : NR_CPUS) +#else +#define NR_ACPI_CPUS NR_CPUS +#endif /* CONFIG_XEN */ + #define ACPI_PDC_REVISION_ID 0x1 #define ACPI_PSD_REV0_REVISION 0 /* Support for _PSD as in ACPI 3.0 */ @@ -42,6 +48,17 @@ struct acpi_processor_cx; +#ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL +struct acpi_csd_package { + acpi_integer num_entries; + acpi_integer revision; + acpi_integer domain; + acpi_integer coord_type; + acpi_integer num_processors; + acpi_integer index; +} __attribute__ ((packed)); +#endif + struct acpi_power_register { u8 descriptor; u16 length; @@ -74,6 +91,12 @@ struct acpi_processor_cx { u32 power; u32 usage; u64 time; +#ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL + /* Require raw information for external control logic */ + struct acpi_power_register reg; + u32 csd_count; + struct acpi_csd_package *domain_info; +#endif struct acpi_processor_cx_policy promotion; struct acpi_processor_cx_policy demotion; char desc[ACPI_CX_DESC_LEN]; @@ -300,6 +323,9 @@ static inline void acpi_processor_ppc_ex { return; } +#ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL +int acpi_processor_ppc_has_changed(struct acpi_processor *pr); +#else static inline int acpi_processor_ppc_has_changed(struct acpi_processor *pr, int event_flag) { @@ -313,6 +339,7 @@ static inline int acpi_processor_ppc_has } return 0; } +#endif /* CONFIG_PROCESSOR_EXTERNAL_CONTROL */ static inline int acpi_processor_get_bios_limit(int cpu, unsigned int *limit) { return -ENODEV; @@ -366,4 +393,120 @@ static inline void acpi_thermal_cpufreq_ } #endif +/* + * Following are interfaces geared to external processor PM control + * logic like a VMM + */ +/* Events notified to external control logic */ +#define PROCESSOR_PM_INIT 1 +#define PROCESSOR_PM_CHANGE 2 +#define PROCESSOR_HOTPLUG 3 + +/* Objects for the PM events */ +#define PM_TYPE_IDLE 0 +#define PM_TYPE_PERF 1 +#define PM_TYPE_THR 2 +#define PM_TYPE_MAX 3 + +/* Processor hotplug events */ +#define HOTPLUG_TYPE_ADD 0 +#define HOTPLUG_TYPE_REMOVE 1 + +#ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL +struct processor_extcntl_ops { + /* Transfer processor PM events to external control logic */ + int (*pm_ops[PM_TYPE_MAX])(struct acpi_processor *pr, int event); + /* Notify physical processor status to external control logic */ + int (*hotplug)(struct acpi_processor *pr, int type); +}; +extern const struct processor_extcntl_ops *processor_extcntl_ops; + +static inline int processor_cntl_external(void) +{ + return (processor_extcntl_ops != NULL); +} + +static inline int processor_pm_external(void) +{ + return processor_cntl_external() && + (processor_extcntl_ops->pm_ops[PM_TYPE_IDLE] != NULL); +} + +static inline int processor_pmperf_external(void) +{ + return processor_cntl_external() && + (processor_extcntl_ops->pm_ops[PM_TYPE_PERF] != NULL); +} + +static inline int processor_pmthr_external(void) +{ + return processor_cntl_external() && + (processor_extcntl_ops->pm_ops[PM_TYPE_THR] != NULL); +} + +extern int processor_notify_external(struct acpi_processor *pr, + int event, int type); +extern void processor_extcntl_init(void); +extern int processor_extcntl_prepare(struct acpi_processor *pr); +extern int acpi_processor_get_performance_info(struct acpi_processor *pr); +extern int acpi_processor_get_psd(struct acpi_processor *pr); +void arch_acpi_processor_init_extcntl(const struct processor_extcntl_ops **); +#else +static inline int processor_cntl_external(void) {return 0;} +static inline int processor_pm_external(void) {return 0;} +static inline int processor_pmperf_external(void) {return 0;} +static inline int 
processor_pmthr_external(void) {return 0;} +static inline int processor_notify_external(struct acpi_processor *pr, + int event, int type) +{ + return 0; +} +static inline void processor_extcntl_init(void) {} +static inline int processor_extcntl_prepare(struct acpi_processor *pr) +{ + return 0; +} +#endif /* CONFIG_PROCESSOR_EXTERNAL_CONTROL */ + +#ifdef CONFIG_XEN +static inline void xen_convert_pct_reg(struct xen_pct_register *xpct, + struct acpi_pct_register *apct) +{ + xpct->descriptor = apct->descriptor; + xpct->length = apct->length; + xpct->space_id = apct->space_id; + xpct->bit_width = apct->bit_width; + xpct->bit_offset = apct->bit_offset; + xpct->reserved = apct->reserved; + xpct->address = apct->address; +} + +static inline void xen_convert_pss_states(struct xen_processor_px *xpss, + struct acpi_processor_px *apss, int state_count) +{ + int i; + for(i=0; i<state_count; i++) { + xpss->core_frequency = apss->core_frequency; + xpss->power = apss->power; + xpss->transition_latency = apss->transition_latency; + xpss->bus_master_latency = apss->bus_master_latency; + xpss->control = apss->control; + xpss->status = apss->status; + xpss++; + apss++; + } +} + +static inline void xen_convert_psd_pack(struct xen_psd_package *xpsd, + struct acpi_psd_package *apsd) +{ + xpsd->num_entries = apsd->num_entries; + xpsd->revision = apsd->revision; + xpsd->domain = apsd->domain; + xpsd->coord_type = apsd->coord_type; + xpsd->num_processors = apsd->num_processors; +} + +#endif /* CONFIG_XEN */ + #endif --- head-2010-05-25.orig/include/asm-generic/pgtable.h 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/include/asm-generic/pgtable.h 2010-03-24 14:53:41.000000000 +0100 @@ -99,6 +99,10 @@ static inline void ptep_set_wrprotect(st } #endif +#ifndef arch_change_pte_range +#define arch_change_pte_range(mm, pmd, addr, end, newprot) 0 +#endif + #ifndef __HAVE_ARCH_PTE_SAME #define pte_same(A,B) (pte_val(A) == pte_val(B)) #endif --- head-2010-05-25.orig/include/linux/aio.h 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/include/linux/aio.h 2010-03-24 14:53:41.000000000 +0100 @@ -199,6 +199,12 @@ struct kioctx { struct delayed_work wq; +#ifdef CONFIG_EPOLL + /* poll integration */ + wait_queue_head_t poll_wait; + struct file *file; +#endif + struct rcu_head rcu_head; }; --- head-2010-05-25.orig/include/linux/highmem.h 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/include/linux/highmem.h 2010-03-24 14:53:41.000000000 +0100 @@ -136,12 +136,14 @@ alloc_zeroed_user_highpage_movable(struc return __alloc_zeroed_user_highpage(__GFP_MOVABLE, vma, vaddr); } +#ifndef __HAVE_ARCH_CLEAR_HIGHPAGE static inline void clear_highpage(struct page *page) { void *kaddr = kmap_atomic(page, KM_USER0); clear_page(kaddr); kunmap_atomic(kaddr, KM_USER0); } +#endif static inline void zero_user_segments(struct page *page, unsigned start1, unsigned end1, @@ -195,6 +197,8 @@ static inline void copy_user_highpage(st #endif +#ifndef __HAVE_ARCH_COPY_HIGHPAGE + static inline void copy_highpage(struct page *to, struct page *from) { char *vfrom, *vto; @@ -206,4 +210,6 @@ static inline void copy_highpage(struct kunmap_atomic(vto, KM_USER1); } +#endif + #endif /* _LINUX_HIGHMEM_H */ --- head-2010-05-25.orig/include/linux/interrupt.h 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/include/linux/interrupt.h 2010-03-24 14:53:41.000000000 +0100 @@ -317,6 +317,12 @@ static inline int disable_irq_wake(unsig } #endif /* CONFIG_GENERIC_HARDIRQS */ +#ifdef CONFIG_HAVE_IRQ_IGNORE_UNHANDLED +int irq_ignore_unhandled(unsigned int irq); +#else +#define
irq_ignore_unhandled(irq) 0 +#endif + #ifndef __ARCH_SET_SOFTIRQ_PENDING #define set_softirq_pending(x) (local_softirq_pending() = (x)) #define or_softirq_pending(x) (local_softirq_pending() |= (x)) --- head-2010-05-25.orig/include/linux/kexec.h 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/include/linux/kexec.h 2010-03-24 14:53:41.000000000 +0100 @@ -46,6 +46,13 @@ KEXEC_CORE_NOTE_NAME_BYTES + \ KEXEC_CORE_NOTE_DESC_BYTES ) +#ifndef KEXEC_ARCH_HAS_PAGE_MACROS +#define kexec_page_to_pfn(page) page_to_pfn(page) +#define kexec_pfn_to_page(pfn) pfn_to_page(pfn) +#define kexec_virt_to_phys(addr) virt_to_phys(addr) +#define kexec_phys_to_virt(addr) phys_to_virt(addr) +#endif + /* * This structure is used to hold the arguments that are used when loading * kernel binaries. @@ -112,6 +119,12 @@ struct kimage { extern void machine_kexec(struct kimage *image); extern int machine_kexec_prepare(struct kimage *image); extern void machine_kexec_cleanup(struct kimage *image); +#ifdef CONFIG_XEN +extern int xen_machine_kexec_load(struct kimage *image); +extern void xen_machine_kexec_unload(struct kimage *image); +extern void xen_machine_kexec_setup_resources(void); +extern void xen_machine_kexec_register_resources(struct resource *res); +#endif extern asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, struct kexec_segment __user *segments, --- head-2010-05-25.orig/include/linux/mm.h 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/include/linux/mm.h 2010-03-24 14:53:41.000000000 +0100 @@ -103,7 +103,12 @@ extern unsigned int kobjsize(const void #define VM_CAN_NONLINEAR 0x08000000 /* Has ->fault & does nonlinear pages */ #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ +#ifndef CONFIG_XEN #define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */ +#else +#define VM_SAO 0 +#define VM_FOREIGN 0x20000000 /* Has pages belonging to another VM */ +#endif #define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ @@ -128,6 +133,12 @@ extern unsigned int kobjsize(const void */ #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) +#ifdef CONFIG_XEN +struct vm_foreign_map { + struct page **map; +}; +#endif + /* * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. @@ -199,6 +210,15 @@ struct vm_operations_struct { */ int (*access)(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); + + /* Area-specific function for clearing the PTE at @ptep. Returns the + * original value of @ptep. 
*/ + pte_t (*zap_pte)(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, int is_fullmm); + + /* called before close() to indicate no more pages should be mapped */ + void (*unmap)(struct vm_area_struct *area); + #ifdef CONFIG_NUMA /* * set_policy() op must add a reference to any non-NULL @new mempolicy --- head-2010-05-25.orig/include/linux/oprofile.h 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/include/linux/oprofile.h 2010-03-24 14:53:41.000000000 +0100 @@ -16,6 +16,8 @@ #include #include #include + +#include /* Each escaped entry is prefixed by ESCAPE_CODE * then one of the following codes, then the @@ -28,14 +30,18 @@ #define CPU_SWITCH_CODE 2 #define COOKIE_SWITCH_CODE 3 #define KERNEL_ENTER_SWITCH_CODE 4 -#define KERNEL_EXIT_SWITCH_CODE 5 +#define USER_ENTER_SWITCH_CODE 5 #define MODULE_LOADED_CODE 6 #define CTX_TGID_CODE 7 #define TRACE_BEGIN_CODE 8 #define TRACE_END_CODE 9 #define XEN_ENTER_SWITCH_CODE 10 +#ifndef CONFIG_XEN #define SPU_PROFILING_CODE 11 #define SPU_CTX_SWITCH_CODE 12 +#else +#define DOMAIN_SWITCH_CODE 11 +#endif #define IBS_FETCH_CODE 13 #define IBS_OP_CODE 14 @@ -49,6 +55,11 @@ struct oprofile_operations { /* create any necessary configuration files in the oprofile fs. * Optional. */ int (*create_files)(struct super_block * sb, struct dentry * root); + /* setup active domains with Xen */ + int (*set_active)(int *active_domains, unsigned int adomains); + /* setup passive domains with Xen */ + int (*set_passive)(int *passive_domains, unsigned int pdomains); + /* Do any necessary interrupt setup. Optional. */ int (*setup)(void); /* Do any necessary interrupt shutdown. Optional. */ @@ -110,6 +121,9 @@ void oprofile_add_pc(unsigned long pc, i /* add a backtrace entry, to be called from the ->backtrace callback */ void oprofile_add_trace(unsigned long eip); +/* add a domain switch entry */ +int oprofile_add_domain_switch(int32_t domain_id); + /** * Create a file of the given name as a child of the given root, with --- head-2010-05-25.orig/include/linux/page-flags.h 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/include/linux/page-flags.h 2010-03-24 14:53:41.000000000 +0100 @@ -109,6 +109,11 @@ enum pageflags { #ifdef CONFIG_MEMORY_FAILURE PG_hwpoison, /* hardware poisoned page. Don't touch */ #endif +#ifdef CONFIG_XEN + PG_foreign, /* Page is owned by foreign allocator. 
*/ + PG_netback, /* Page is owned by netback */ + PG_blkback, /* Page is owned by blkback */ +#endif __NR_PAGEFLAGS, /* Filesystems */ @@ -337,6 +342,27 @@ static inline void SetPageUptodate(struc CLEARPAGEFLAG(Uptodate, uptodate) +#define PageForeign(page) test_bit(PG_foreign, &(page)->flags) +#define SetPageForeign(_page, dtor) do { \ + set_bit(PG_foreign, &(_page)->flags); \ + BUG_ON((dtor) == (void (*)(struct page *, unsigned int))0); \ + (_page)->index = (long)(dtor); \ +} while (0) +#define ClearPageForeign(page) do { \ + clear_bit(PG_foreign, &(page)->flags); \ + (page)->index = 0; \ +} while (0) +#define PageForeignDestructor(_page, order) \ + ((void (*)(struct page *, unsigned int))(_page)->index)(_page, order) + +#define PageNetback(page) test_bit(PG_netback, &(page)->flags) +#define SetPageNetback(page) set_bit(PG_netback, &(page)->flags) +#define ClearPageNetback(page) clear_bit(PG_netback, &(page)->flags) + +#define PageBlkback(page) test_bit(PG_blkback, &(page)->flags) +#define SetPageBlkback(page) set_bit(PG_blkback, &(page)->flags) +#define ClearPageBlkback(page) clear_bit(PG_blkback, &(page)->flags) + extern void cancel_dirty_page(struct page *page, unsigned int account_size); int test_clear_page_writeback(struct page *page); @@ -413,6 +439,14 @@ PAGEFLAG_FALSE(MemError) #define __PG_MLOCKED 0 #endif +#if !defined(CONFIG_XEN) +# define __PG_XEN 0 +#elif defined(CONFIG_X86) +# define __PG_XEN ((1 << PG_pinned) | (1 << PG_foreign)) +#else +# define __PG_XEN (1 << PG_foreign) +#endif + /* * Flags checked when a page is freed. Pages being freed should not have * these flags set. It they are, there is a problem. @@ -422,7 +456,7 @@ PAGEFLAG_FALSE(MemError) 1 << PG_private | 1 << PG_private_2 | \ 1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \ 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ - 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON) + 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON | __PG_XEN) /* * Flags checked when a page is prepped for return by the page allocator. 
--- head-2010-05-25.orig/include/linux/pci.h 2010-03-24 13:55:21.000000000 +0100 +++ head-2010-05-25/include/linux/pci.h 2010-03-24 14:53:41.000000000 +0100 @@ -962,6 +962,11 @@ static inline int pci_msi_enabled(void) { return 0; } + +#ifdef CONFIG_XEN +#define register_msi_get_owner(func) 0 +#define unregister_msi_get_owner(func) 0 +#endif #else extern int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec); extern void pci_msi_shutdown(struct pci_dev *dev); @@ -974,6 +979,10 @@ extern void pci_disable_msix(struct pci_ extern void msi_remove_pci_irq_vectors(struct pci_dev *dev); extern void pci_restore_msi_state(struct pci_dev *dev); extern int pci_msi_enabled(void); +#ifdef CONFIG_XEN +extern int register_msi_get_owner(int (*func)(struct pci_dev *dev)); +extern int unregister_msi_get_owner(int (*func)(struct pci_dev *dev)); +#endif #endif #ifndef CONFIG_PCIEASPM --- head-2010-05-25.orig/include/linux/skbuff.h 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/include/linux/skbuff.h 2010-04-15 09:43:55.000000000 +0200 @@ -272,6 +272,8 @@ typedef unsigned char *sk_buff_data_t; * @local_df: allow local fragmentation * @cloned: Head may be cloned (check refcnt to be sure) * @nohdr: Payload reference only, must not modify header + * @proto_data_valid: Protocol data validated since arriving at localhost + * @proto_csum_blank: Protocol csum must be added before leaving localhost * @pkt_type: Packet class * @fclone: skbuff clone status * @ip_summed: Driver fed us an IP checksum @@ -377,9 +379,13 @@ struct sk_buff { #ifdef CONFIG_NETVM __u8 emergency:1; #endif +#ifdef CONFIG_XEN + __u8 proto_data_valid:1, + proto_csum_blank:1; +#endif kmemcheck_bitfield_end(flags2); - /* 0/14 bit hole */ + /* 0/9...15 bit hole */ #ifdef CONFIG_NET_DMA dma_cookie_t dma_cookie; --- head-2010-05-25.orig/include/linux/vermagic.h 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/include/linux/vermagic.h 2010-03-24 14:53:41.000000000 +0100 @@ -22,6 +22,11 @@ #else #define MODULE_VERMAGIC_MODVERSIONS "" #endif +#ifdef CONFIG_XEN +#define MODULE_VERMAGIC_XEN "Xen " +#else +#define MODULE_VERMAGIC_XEN +#endif #ifndef MODULE_ARCH_VERMAGIC #define MODULE_ARCH_VERMAGIC "" #endif @@ -30,5 +35,5 @@ UTS_RELEASE " " \ MODULE_VERMAGIC_SMP MODULE_VERMAGIC_PREEMPT \ MODULE_VERMAGIC_MODULE_UNLOAD MODULE_VERMAGIC_MODVERSIONS \ - MODULE_ARCH_VERMAGIC + MODULE_VERMAGIC_XEN MODULE_ARCH_VERMAGIC --- head-2010-05-25.orig/kernel/irq/spurious.c 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/kernel/irq/spurious.c 2010-03-24 14:53:41.000000000 +0100 @@ -233,7 +233,7 @@ void note_interrupt(unsigned int irq, st */ if (time_after(jiffies, desc->last_unhandled + HZ/10)) desc->irqs_unhandled = 1; - else + else if (!irq_ignore_unhandled(irq)) desc->irqs_unhandled++; desc->last_unhandled = jiffies; if (unlikely(action_ret != IRQ_NONE)) --- head-2010-05-25.orig/kernel/kexec.c 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/kernel/kexec.c 2010-03-24 14:53:41.000000000 +0100 @@ -360,13 +360,26 @@ static int kimage_is_destination_range(s return 0; } -static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) +static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order, unsigned long limit) { struct page *pages; pages = alloc_pages(gfp_mask, order); if (pages) { unsigned int count, i; +#ifdef CONFIG_XEN + int address_bits; + + if (limit == ~0UL) + address_bits = BITS_PER_LONG; + else + address_bits = long_log2(limit); + + if (xen_limit_pages_to_max_mfn(pages, order, address_bits) < 0) { + 
__free_pages(pages, order); + return NULL; + } +#endif pages->mapping = NULL; set_page_private(pages, order); count = 1 << order; @@ -430,10 +443,10 @@ static struct page *kimage_alloc_normal_ do { unsigned long pfn, epfn, addr, eaddr; - pages = kimage_alloc_pages(GFP_KERNEL, order); + pages = kimage_alloc_pages(GFP_KERNEL, order, KEXEC_CONTROL_MEMORY_LIMIT); if (!pages) break; - pfn = page_to_pfn(pages); + pfn = kexec_page_to_pfn(pages); epfn = pfn + count; addr = pfn << PAGE_SHIFT; eaddr = epfn << PAGE_SHIFT; @@ -467,6 +480,7 @@ static struct page *kimage_alloc_normal_ return pages; } +#ifndef CONFIG_XEN static struct page *kimage_alloc_crash_control_pages(struct kimage *image, unsigned int order) { @@ -520,7 +534,7 @@ static struct page *kimage_alloc_crash_c } /* If I don't overlap any segments I have found my hole! */ if (i == image->nr_segments) { - pages = pfn_to_page(hole_start >> PAGE_SHIFT); + pages = kexec_pfn_to_page(hole_start >> PAGE_SHIFT); break; } } @@ -547,6 +561,13 @@ struct page *kimage_alloc_control_pages( return pages; } +#else /* !CONFIG_XEN */ +struct page *kimage_alloc_control_pages(struct kimage *image, + unsigned int order) +{ + return kimage_alloc_normal_control_pages(image, order); +} +#endif static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) { @@ -562,7 +583,7 @@ static int kimage_add_entry(struct kimag return -ENOMEM; ind_page = page_address(page); - *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; + *image->entry = kexec_virt_to_phys(ind_page) | IND_INDIRECTION; image->entry = ind_page; image->last_entry = ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); @@ -621,13 +642,13 @@ static void kimage_terminate(struct kima #define for_each_kimage_entry(image, ptr, entry) \ for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ ptr = (entry & IND_INDIRECTION)? \ - phys_to_virt((entry & PAGE_MASK)): ptr +1) + kexec_phys_to_virt((entry & PAGE_MASK)): ptr +1) static void kimage_free_entry(kimage_entry_t entry) { struct page *page; - page = pfn_to_page(entry >> PAGE_SHIFT); + page = kexec_pfn_to_page(entry >> PAGE_SHIFT); kimage_free_pages(page); } @@ -639,6 +660,10 @@ static void kimage_free(struct kimage *i if (!image) return; +#ifdef CONFIG_XEN + xen_machine_kexec_unload(image); +#endif + kimage_free_extra_pages(image); for_each_kimage_entry(image, ptr, entry) { if (entry & IND_INDIRECTION) { @@ -714,7 +739,7 @@ static struct page *kimage_alloc_page(st * have a match. 
*/ list_for_each_entry(page, &image->dest_pages, lru) { - addr = page_to_pfn(page) << PAGE_SHIFT; + addr = kexec_page_to_pfn(page) << PAGE_SHIFT; if (addr == destination) { list_del(&page->lru); return page; @@ -725,16 +750,16 @@ static struct page *kimage_alloc_page(st kimage_entry_t *old; /* Allocate a page, if we run out of memory give up */ - page = kimage_alloc_pages(gfp_mask, 0); + page = kimage_alloc_pages(gfp_mask, 0, KEXEC_SOURCE_MEMORY_LIMIT); if (!page) return NULL; /* If the page cannot be used file it away */ - if (page_to_pfn(page) > + if (kexec_page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { list_add(&page->lru, &image->unuseable_pages); continue; } - addr = page_to_pfn(page) << PAGE_SHIFT; + addr = kexec_page_to_pfn(page) << PAGE_SHIFT; /* If it is the destination page we want use it */ if (addr == destination) @@ -757,7 +782,7 @@ static struct page *kimage_alloc_page(st struct page *old_page; old_addr = *old & PAGE_MASK; - old_page = pfn_to_page(old_addr >> PAGE_SHIFT); + old_page = kexec_pfn_to_page(old_addr >> PAGE_SHIFT); copy_highpage(page, old_page); *old = addr | (*old & ~PAGE_MASK); @@ -813,7 +838,7 @@ static int kimage_load_normal_segment(st result = -ENOMEM; goto out; } - result = kimage_add_page(image, page_to_pfn(page) + result = kimage_add_page(image, kexec_page_to_pfn(page) << PAGE_SHIFT); if (result < 0) goto out; @@ -845,6 +870,7 @@ out: return result; } +#ifndef CONFIG_XEN static int kimage_load_crash_segment(struct kimage *image, struct kexec_segment *segment) { @@ -867,7 +893,7 @@ static int kimage_load_crash_segment(str char *ptr; size_t uchunk, mchunk; - page = pfn_to_page(maddr >> PAGE_SHIFT); + page = kexec_pfn_to_page(maddr >> PAGE_SHIFT); if (!page) { result = -ENOMEM; goto out; @@ -916,6 +942,13 @@ static int kimage_load_segment(struct ki return result; } +#else /* CONFIG_XEN */ +static int kimage_load_segment(struct kimage *image, + struct kexec_segment *segment) +{ + return kimage_load_normal_segment(image, segment); +} +#endif /* * Exec Kernel system call: for obvious reasons only root may call it. @@ -1019,6 +1052,13 @@ SYSCALL_DEFINE4(kexec_load, unsigned lon } kimage_terminate(image); } +#ifdef CONFIG_XEN + if (image) { + result = xen_machine_kexec_load(image); + if (result) + goto out; + } +#endif /* Install the new kernel, and Uninstall the old */ image = xchg(dest_image, image); --- head-2010-05-25.orig/kernel/sysctl.c 2010-03-24 14:09:47.000000000 +0100 +++ head-2010-05-25/kernel/sysctl.c 2010-03-24 14:53:41.000000000 +0100 @@ -776,7 +776,7 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) +#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) && !defined(CONFIG_ACPI_PV_SLEEP) { .procname = "acpi_video_flags", .data = &acpi_realmode_flags, --- head-2010-05-25.orig/mm/memory.c 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/mm/memory.c 2010-04-15 09:44:04.000000000 +0200 @@ -599,6 +599,12 @@ struct page *vm_normal_page(struct vm_ar { unsigned long pfn = pte_pfn(pte); +#if defined(CONFIG_XEN) && defined(CONFIG_X86) + /* XEN: Covers user-space grant mappings (even of local pages). 
*/ + if (unlikely(vma->vm_flags & VM_FOREIGN)) + return NULL; +#endif + if (HAVE_PTE_SPECIAL) { if (likely(!pte_special(pte))) goto check_pfn; @@ -630,6 +636,9 @@ struct page *vm_normal_page(struct vm_ar return NULL; check_pfn: if (unlikely(pfn > highest_memmap_pfn)) { +#ifdef CONFIG_XEN + if (!(vma->vm_flags & VM_RESERVED)) +#endif print_bad_pte(vma, addr, pte, NULL); return NULL; } @@ -935,8 +944,12 @@ static unsigned long zap_pte_range(struc page->index > details->last_index)) continue; } - ptent = ptep_get_and_clear_full(mm, addr, pte, - tlb->fullmm); + if (unlikely(vma->vm_ops && vma->vm_ops->zap_pte)) + ptent = vma->vm_ops->zap_pte(vma, addr, pte, + tlb->fullmm); + else + ptent = ptep_get_and_clear_full(mm, addr, pte, + tlb->fullmm); tlb_remove_tlb_entry(tlb, pte, addr); if (unlikely(!page)) continue; @@ -1203,6 +1216,7 @@ unsigned long zap_page_range(struct vm_a tlb_finish_mmu(tlb, address, end); return end; } +EXPORT_SYMBOL(zap_page_range); /** * zap_vma_ptes - remove ptes mapping the vma @@ -1399,6 +1413,28 @@ int __get_user_pages(struct task_struct continue; } +#ifdef CONFIG_XEN + if (vma && (vma->vm_flags & VM_FOREIGN)) { + struct vm_foreign_map *foreign_map = + vma->vm_private_data; + struct page **map = foreign_map->map; + int offset = (start - vma->vm_start) >> PAGE_SHIFT; + if (map[offset] != NULL) { + if (pages) { + struct page *page = map[offset]; + + pages[i] = page; + get_page(page); + } + if (vmas) + vmas[i] = vma; + i++; + start += PAGE_SIZE; + len--; + continue; + } + } +#endif if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP)) || !(vm_flags & vma->vm_flags)) --- head-2010-05-25.orig/mm/mmap.c 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/mm/mmap.c 2010-04-29 09:42:36.000000000 +0200 @@ -1944,6 +1944,12 @@ static void unmap_region(struct mm_struc tlb_finish_mmu(tlb, start, end); } +static inline void unmap_vma(struct vm_area_struct *vma) +{ + if (unlikely(vma->vm_ops && vma->vm_ops->unmap)) + vma->vm_ops->unmap(vma); +} + /* * Create a list of vma's touched by the unmap, removing them from the mm's * vma list as we go.. @@ -1959,6 +1965,7 @@ detach_vmas_to_be_unmapped(struct mm_str insertion_point = (prev ? 
&prev->vm_next : &mm->mmap); do { rb_erase(&vma->vm_rb, &mm->mm_rb); + unmap_vma(vma); mm->map_count--; tail_vma = vma; vma = vma->vm_next; @@ -2297,6 +2304,9 @@ void exit_mmap(struct mm_struct *mm) arch_exit_mmap(mm); + for (vma = mm->mmap; vma; vma = vma->vm_next) + unmap_vma(vma); + vma = mm->mmap; if (!vma) /* Can happen if dup_mmap() received an OOM */ return; --- head-2010-05-25.orig/mm/mprotect.c 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/mm/mprotect.c 2010-04-15 09:44:14.000000000 +0200 @@ -90,6 +90,8 @@ static inline void change_pmd_range(stru next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; + if (arch_change_pte_range(mm, pmd, addr, next, newprot)) + continue; change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable); } while (pmd++, addr = next, addr != end); } --- head-2010-05-25.orig/mm/page_alloc.c 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/mm/page_alloc.c 2010-03-24 14:59:37.000000000 +0100 @@ -609,6 +609,13 @@ static void __free_pages_ok(struct page int bad = 0; int wasMlocked = __TestClearPageMlocked(page); +#ifdef CONFIG_XEN + if (PageForeign(page)) { + PageForeignDestructor(page, order); + return; + } +#endif + trace_mm_page_free_direct(page, order); kmemcheck_free_shadow(page, order); @@ -1110,6 +1117,13 @@ void free_hot_cold_page(struct page *pag int migratetype; int wasMlocked = __TestClearPageMlocked(page); +#ifdef CONFIG_XEN + if (PageForeign(page)) { + PageForeignDestructor(page, 0); + return; + } +#endif + trace_mm_page_free_direct(page, 0); kmemcheck_free_shadow(page, 0); --- head-2010-05-25.orig/net/core/dev.c 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/net/core/dev.c 2010-05-25 09:19:25.000000000 +0200 @@ -139,6 +139,12 @@ /* This should be increased if a protocol with a bigger head is added. */ #define GRO_MAX_HEAD (MAX_HEADER + 128) +#ifdef CONFIG_XEN +#include +#include +#include +#endif + /* * The list of packet types we will receive (as opposed to discard) * and the routines to invoke. @@ -2005,6 +2011,43 @@ static struct netdev_queue *dev_pick_tx( return netdev_get_tx_queue(dev, queue_index); } +#ifdef CONFIG_XEN +inline int skb_checksum_setup(struct sk_buff *skb) +{ + if (skb->proto_csum_blank) { + if (skb->protocol != htons(ETH_P_IP)) + goto out; + skb->h.raw = (unsigned char *)skb->nh.iph + 4*skb->nh.iph->ihl; + if (skb->h.raw >= skb->tail) + goto out; + switch (skb->nh.iph->protocol) { + case IPPROTO_TCP: + skb->csum = offsetof(struct tcphdr, check); + break; + case IPPROTO_UDP: + skb->csum = offsetof(struct udphdr, check); + break; + default: + if (net_ratelimit()) + printk(KERN_ERR "Attempting to checksum a non-" + "TCP/UDP packet, dropping a protocol" + " %d packet", skb->nh.iph->protocol); + goto out; + } + if ((skb->h.raw + skb->csum + 2) > skb->tail) + goto out; + skb->ip_summed = CHECKSUM_HW; + skb->proto_csum_blank = 0; + } + return 0; +out: + return -EPROTO; +} +#else +inline int skb_checksum_setup(struct sk_buff *skb) { return 0; } +#endif +EXPORT_SYMBOL(skb_checksum_setup); + static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq) @@ -2086,6 +2129,12 @@ int dev_queue_xmit(struct sk_buff *skb) struct Qdisc *q; int rc = -ENOMEM; + /* If a checksum-deferred packet is forwarded to a device that needs a + * checksum, correct the pointers and force checksumming. + */ + if (skb_checksum_setup(skb)) + goto out_kfree_skb; + /* GSO will handle the following emulations directly. 
*/ if (netif_needs_gso(dev, skb)) goto gso; @@ -2574,6 +2623,19 @@ int netif_receive_skb(struct sk_buff *sk } #endif +#ifdef CONFIG_XEN + switch (skb->ip_summed) { + case CHECKSUM_UNNECESSARY: + skb->proto_data_valid = 1; + break; + case CHECKSUM_HW: + /* XXX Implement me. */ + default: + skb->proto_data_valid = 0; + break; + } +#endif + if (skb_emergency(skb)) goto skip_taps; --- head-2010-05-25.orig/net/core/skbuff.c 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/net/core/skbuff.c 2010-03-24 14:53:41.000000000 +0100 @@ -645,6 +645,10 @@ static struct sk_buff *__skb_clone(struc n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len; n->cloned = 1; n->nohdr = 0; +#ifdef CONFIG_XEN + C(proto_data_valid); + C(proto_csum_blank); +#endif n->destructor = NULL; C(tail); C(end); --- head-2010-05-25.orig/net/ipv4/netfilter/nf_nat_proto_tcp.c 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/net/ipv4/netfilter/nf_nat_proto_tcp.c 2010-03-24 14:53:41.000000000 +0100 @@ -75,6 +75,9 @@ tcp_manip_pkt(struct sk_buff *skb, if (hdrsize < sizeof(*hdr)) return true; + if (skb_checksum_setup(skb)) + return false; + inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1); inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, 0); return true; --- head-2010-05-25.orig/net/ipv4/netfilter/nf_nat_proto_udp.c 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/net/ipv4/netfilter/nf_nat_proto_udp.c 2010-03-24 14:53:41.000000000 +0100 @@ -60,6 +60,10 @@ udp_manip_pkt(struct sk_buff *skb, newport = tuple->dst.u.udp.port; portptr = &hdr->dest; } + + if (skb_checksum_setup(skb)) + return false; + if (hdr->check || skb->ip_summed == CHECKSUM_PARTIAL) { inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1); inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, --- head-2010-05-25.orig/net/ipv4/xfrm4_output.c 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/net/ipv4/xfrm4_output.c 2010-03-24 14:53:41.000000000 +0100 @@ -81,7 +81,7 @@ static int xfrm4_output_finish(struct sk #endif skb->protocol = htons(ETH_P_IP); - return xfrm_output(skb); + return skb_checksum_setup(skb) ?: xfrm_output(skb); } int xfrm4_output(struct sk_buff *skb) --- head-2010-05-25.orig/scripts/Makefile.build 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/scripts/Makefile.build 2010-03-24 14:53:41.000000000 +0100 @@ -76,6 +76,21 @@ ifndef obj $(warning kbuild: Makefile.build is included improperly) endif +ifeq ($(CONFIG_XEN),y) +Makefile.xen := $(if $(KBUILD_EXTMOD),$(KBUILD_EXTMOD),$(objtree)/scripts)/Makefile.xen +$(Makefile.xen): $(srctree)/scripts/Makefile.xen.awk $(srctree)/scripts/Makefile.build + @echo ' Updating $@' + $(if $(shell echo a | $(AWK) '{ print gensub(/a/, "AA", "g"); }'),\ + ,$(error 'Your awk program does not define gensub. 
Use gawk or another awk with gensub')) + @$(AWK) -f $< $(filter-out $<,$^) >$@ + +xen-src-single-used-m := $(patsubst $(srctree)/%,%,$(wildcard $(addprefix $(srctree)/,$(single-used-m:.o=-xen.c)))) +xen-single-used-m := $(xen-src-single-used-m:-xen.c=.o) +single-used-m := $(filter-out $(xen-single-used-m),$(single-used-m)) + +-include $(Makefile.xen) +endif + # =========================================================================== ifneq ($(strip $(lib-y) $(lib-m) $(lib-n) $(lib-)),) --- head-2010-05-25.orig/scripts/Makefile.lib 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/scripts/Makefile.lib 2010-03-24 14:53:41.000000000 +0100 @@ -22,6 +22,12 @@ obj-m := $(filter-out $(obj-y),$(obj-m)) lib-y := $(filter-out $(obj-y), $(sort $(lib-y) $(lib-m))) +# Remove objects forcibly disabled + +obj-y := $(filter-out $(disabled-obj-y),$(obj-y)) +obj-m := $(filter-out $(disabled-obj-y),$(obj-m)) +lib-y := $(filter-out $(disabled-obj-y),$(lib-y)) + # Handle objects in subdirs # ---------------------------------------------------------------------------