From: www.kernel.org Subject: Linux 2.6.20 Patch-mainline: 2.6.20 Automatically created from "patches.kernel.org/patch-2.6.20" by xen-port-patches.py Acked-by: jbeulich@novell.com --- head-2011-02-17.orig/arch/x86/Kconfig 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/arch/x86/Kconfig 2011-01-31 17:32:16.000000000 +0100 @@ -1651,6 +1651,7 @@ config PHYSICAL_START config RELOCATABLE bool "Build a relocatable kernel" + depends on !X86_XEN default y ---help--- This builds a kernel image that retains relocation information @@ -1672,7 +1673,8 @@ config X86_NEED_RELOCS depends on X86_32 && RELOCATABLE config PHYSICAL_ALIGN - hex "Alignment value to which kernel should be aligned" if X86_32 + hex "Alignment value to which kernel should be aligned" if X86_32 && !XEN + default 0x2000 if XEN default "0x1000000" range 0x2000 0x1000000 ---help--- --- head-2011-02-17.orig/arch/x86/kernel/asm-offsets_32.c 2011-01-31 14:54:00.000000000 +0100 +++ head-2011-02-17/arch/x86/kernel/asm-offsets_32.c 2011-01-31 17:32:16.000000000 +0100 @@ -55,6 +55,7 @@ void foo(void) OFFSET(TI_exec_domain, thread_info, exec_domain); OFFSET(TI_flags, thread_info, flags); OFFSET(TI_status, thread_info, status); + OFFSET(TI_cpu, thread_info, cpu); OFFSET(TI_preempt_count, thread_info, preempt_count); OFFSET(TI_addr_limit, thread_info, addr_limit); OFFSET(TI_restart_block, thread_info, restart_block); @@ -108,6 +109,11 @@ void foo(void) OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); +#ifdef CONFIG_XEN + BLANK(); + OFFSET(XEN_START_mfn_list, start_info, mfn_list); +#endif + #ifdef CONFIG_PARAVIRT BLANK(); OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); --- head-2011-02-17.orig/arch/x86/kernel/cpu/common-xen.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/arch/x86/kernel/cpu/common-xen.c 2011-01-31 17:32:16.000000000 +0100 @@ -22,6 +22,7 @@ #define phys_pkg_id(a,b) a #endif #endif +#include #include #include "cpu.h" @@ -29,10 +30,8 @@ DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr); EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr); -#ifndef CONFIG_XEN -DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]); -EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack); -#endif +struct i386_pda *_cpu_pda[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(_cpu_pda); static int cachesize_override __cpuinitdata = -1; static int disable_x86_fxsr __cpuinitdata; @@ -60,7 +59,7 @@ static struct cpu_dev __cpuinitdata defa .c_init = default_init, .c_vendor = "Unknown", }; -static struct cpu_dev * this_cpu = &default_cpu; +static struct cpu_dev * this_cpu __cpuinitdata = &default_cpu; static int __init cachesize_setup(char *str) { @@ -242,29 +241,14 @@ static int __cpuinit have_cpuid_p(void) return flag_is_changeable_p(X86_EFLAGS_ID); } -/* Do minimum CPU detection early. - Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment. - The others are not touched to avoid unwanted side effects. - - WARNING: this function is only called on the BP. Don't add code here - that is supposed to run on all CPUs. 
*/ -static void __init early_cpu_detect(void) +void __init cpu_detect(struct cpuinfo_x86 *c) { - struct cpuinfo_x86 *c = &boot_cpu_data; - - c->x86_cache_alignment = 32; - - if (!have_cpuid_p()) - return; - /* Get vendor name */ cpuid(0x00000000, &c->cpuid_level, (int *)&c->x86_vendor_id[0], (int *)&c->x86_vendor_id[8], (int *)&c->x86_vendor_id[4]); - get_cpu_vendor(c, 1); - c->x86 = 4; if (c->cpuid_level >= 0x00000001) { u32 junk, tfms, cap0, misc; @@ -281,6 +265,26 @@ static void __init early_cpu_detect(void } } +/* Do minimum CPU detection early. + Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment. + The others are not touched to avoid unwanted side effects. + + WARNING: this function is only called on the BP. Don't add code here + that is supposed to run on all CPUs. */ +static void __init early_cpu_detect(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + c->x86_cache_alignment = 32; + + if (!have_cpuid_p()) + return; + + cpu_detect(c); + + get_cpu_vendor(c, 1); +} + static void __cpuinit generic_identify(struct cpuinfo_x86 * c) { u32 tfms, xlvl; @@ -317,6 +321,8 @@ static void __cpuinit generic_identify(s c->apicid = (ebx >> 24) & 0xFF; #endif #endif + if (c->x86_capability[0] & (1<<19)) + c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8; } else { /* Have CPUID level 0 only - unheard of */ c->x86 = 4; @@ -381,6 +387,7 @@ void __cpuinit identify_cpu(struct cpuin c->x86_vendor_id[0] = '\0'; /* Unset */ c->x86_model_id[0] = '\0'; /* Unset */ c->x86_max_cores = 1; + c->x86_clflush_size = 32; memset(&c->x86_capability, 0, sizeof c->x86_capability); if (!have_cpuid_p()) { @@ -601,61 +608,23 @@ void __init early_cpu_init(void) #endif } -static void __cpuinit cpu_gdt_init(const struct Xgt_desc_struct *gdt_descr) +/* Make sure %gs is initialized properly in idle threads */ +struct pt_regs * __devinit idle_regs(struct pt_regs *regs) { - unsigned long frames[16]; - unsigned long va; - int f; - - for (va = gdt_descr->address, f = 0; - va < gdt_descr->address + gdt_descr->size; - va += PAGE_SIZE, f++) { - frames[f] = virt_to_mfn(va); - make_lowmem_page_readonly( - (void *)va, XENFEAT_writable_descriptor_tables); - } - if (HYPERVISOR_set_gdt(frames, (gdt_descr->size + 1) / 8)) - BUG(); + memset(regs, 0, sizeof(struct pt_regs)); + regs->xgs = __KERNEL_PDA; + return regs; } -/* - * cpu_init() initializes state that is per-CPU. Some data is already - * initialized (naturally) in the bootstrap process, such as the GDT - * and IDT. We reload them nevertheless, this function acts as a - * 'CPU state barrier', nothing should get across. - */ -void __cpuinit cpu_init(void) +static __cpuinit int alloc_gdt(int cpu) { - int cpu = smp_processor_id(); -#ifndef CONFIG_X86_NO_TSS - struct tss_struct * t = &per_cpu(init_tss, cpu); -#endif - struct thread_struct *thread = ¤t->thread; - struct desc_struct *gdt; struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); + struct desc_struct *gdt; + struct i386_pda *pda; - if (cpu_test_and_set(cpu, cpu_initialized)) { - printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); - for (;;) local_irq_enable(); - } - printk(KERN_INFO "Initializing CPU#%d\n", cpu); - - if (cpu_has_vme || cpu_has_de) - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); - if (tsc_disable && cpu_has_tsc) { - printk(KERN_NOTICE "Disabling TSC...\n"); - /**** FIX-HPA: DOES THIS REALLY BELONG HERE? 
****/ - clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability); - set_in_cr4(X86_CR4_TSD); - } + gdt = (struct desc_struct *)cpu_gdt_descr->address; + pda = cpu_pda(cpu); -#ifndef CONFIG_XEN - /* The CPU hotplug case */ - if (cpu_gdt_descr->address) { - gdt = (struct desc_struct *)cpu_gdt_descr->address; - memset(gdt, 0, PAGE_SIZE); - goto old_gdt; - } /* * This is a horrible hack to allocate the GDT. The problem * is that cpu_init() is called really early for the boot CPU @@ -663,54 +632,141 @@ void __cpuinit cpu_init(void) * CPUs, when bootmem will have gone away */ if (NODE_DATA(0)->bdata->node_bootmem_map) { - gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE); - /* alloc_bootmem_pages panics on failure, so no check */ + BUG_ON(gdt != NULL || pda != NULL); + + gdt = alloc_bootmem_pages(PAGE_SIZE); + pda = alloc_bootmem(sizeof(*pda)); + /* alloc_bootmem(_pages) panics on failure, so no check */ + memset(gdt, 0, PAGE_SIZE); + memset(pda, 0, sizeof(*pda)); } else { - gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL); - if (unlikely(!gdt)) { - printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu); - for (;;) - local_irq_enable(); + /* GDT and PDA might already have been allocated if + this is a CPU hotplug re-insertion. */ + if (gdt == NULL) + gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL); + + if (pda == NULL) + pda = kmalloc_node(sizeof(*pda), GFP_KERNEL, cpu_to_node(cpu)); + + if (unlikely(!gdt || !pda)) { + free_pages((unsigned long)gdt, 0); + kfree(pda); + return 0; } } -old_gdt: + + cpu_gdt_descr->address = (unsigned long)gdt; + cpu_pda(cpu) = pda; + + return 1; +} + +/* Initial PDA used by boot CPU */ +struct i386_pda boot_pda = { + ._pda = &boot_pda, + .cpu_number = 0, + .pcurrent = &init_task, +}; + +static inline void set_kernel_gs(void) +{ + /* Set %gs for this CPU's PDA. Memory clobber is to create a + barrier with respect to any PDA operations, so the compiler + doesn't move any before here. */ + asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory"); +} + +/* Initialize the CPU's GDT and PDA. The boot CPU does this for + itself, but secondaries find this done for them. */ +__cpuinit int init_gdt(int cpu, struct task_struct *idle) +{ + struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); + struct desc_struct *gdt; + struct i386_pda *pda; + + /* For non-boot CPUs, the GDT and PDA should already have been + allocated. 
*/ + if (!alloc_gdt(cpu)) { + printk(KERN_CRIT "CPU%d failed to allocate GDT or PDA\n", cpu); + return 0; + } + + gdt = (struct desc_struct *)cpu_gdt_descr->address; + pda = cpu_pda(cpu); + + BUG_ON(gdt == NULL || pda == NULL); + /* * Initialize the per-CPU GDT with the boot GDT, * and set up the GDT descriptor: */ memcpy(gdt, cpu_gdt_table, GDT_SIZE); + cpu_gdt_descr->size = GDT_SIZE - 1; - /* Set up GDT entry for 16bit stack */ - *(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |= - ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) | - ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) | - (CPU_16BIT_STACK_SIZE - 1); + pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a, + (u32 *)&gdt[GDT_ENTRY_PDA].b, + (unsigned long)pda, sizeof(*pda) - 1, + 0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */ + + memset(pda, 0, sizeof(*pda)); + pda->_pda = pda; + pda->cpu_number = cpu; + pda->pcurrent = idle; - cpu_gdt_descr->size = GDT_SIZE - 1; - cpu_gdt_descr->address = (unsigned long)gdt; -#else - if (cpu == 0 && cpu_gdt_descr->address == 0) { - gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE); - /* alloc_bootmem_pages panics on failure, so no check */ - memset(gdt, 0, PAGE_SIZE); + return 1; +} - memcpy(gdt, cpu_gdt_table, GDT_SIZE); - - cpu_gdt_descr->size = GDT_SIZE; - cpu_gdt_descr->address = (unsigned long)gdt; +void __cpuinit cpu_set_gdt(int cpu) +{ + struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); + unsigned long va, frames[16]; + int f; + + for (va = cpu_gdt_descr->address, f = 0; + va < cpu_gdt_descr->address + cpu_gdt_descr->size; + va += PAGE_SIZE, f++) { + frames[f] = virt_to_mfn(va); + make_lowmem_page_readonly( + (void *)va, XENFEAT_writable_descriptor_tables); } + BUG_ON(HYPERVISOR_set_gdt(frames, (cpu_gdt_descr->size + 1) / 8)); + + set_kernel_gs(); +} + +/* Common CPU init for both boot and secondary CPUs */ +static void __cpuinit _cpu_init(int cpu, struct task_struct *curr) +{ +#ifndef CONFIG_X86_NO_TSS + struct tss_struct * t = &per_cpu(init_tss, cpu); #endif + struct thread_struct *thread = &curr->thread; + + if (cpu_test_and_set(cpu, cpu_initialized)) { + printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); + for (;;) local_irq_enable(); + } - cpu_gdt_init(cpu_gdt_descr); + printk(KERN_INFO "Initializing CPU#%d\n", cpu); + + if (cpu_has_vme || cpu_has_de) + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + if (tsc_disable && cpu_has_tsc) { + printk(KERN_NOTICE "Disabling TSC...\n"); + /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/ + clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability); + set_in_cr4(X86_CR4_TSD); + } /* * Set up and load the per-CPU TSS and LDT */ atomic_inc(&init_mm.mm_count); - current->active_mm = &init_mm; - BUG_ON(current->mm); - enter_lazy_tlb(&init_mm, current); + curr->active_mm = &init_mm; + if (curr->mm) + BUG(); + enter_lazy_tlb(&init_mm, curr); load_esp0(t, thread); @@ -721,8 +777,8 @@ old_gdt: __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); #endif - /* Clear %fs and %gs. */ - asm volatile ("movl %0, %%fs; movl %0, %%gs" : : "r" (0)); + /* Clear %fs. */ + asm volatile ("mov %0, %%fs" : : "r" (0)); /* Clear all 6 debug registers: */ set_debugreg(0, 0); @@ -740,6 +796,38 @@ old_gdt: mxcsr_feature_mask_init(); } +/* Entrypoint to initialize secondary CPU */ +void __cpuinit secondary_cpu_init(void) +{ + int cpu = smp_processor_id(); + struct task_struct *curr = current; + + _cpu_init(cpu, curr); +} + +/* + * cpu_init() initializes state that is per-CPU. 
Some data is already + * initialized (naturally) in the bootstrap process, such as the GDT + * and IDT. We reload them nevertheless, this function acts as a + * 'CPU state barrier', nothing should get across. + */ +void __cpuinit cpu_init(void) +{ + int cpu = smp_processor_id(); + struct task_struct *curr = current; + + /* Set up the real GDT and PDA, so we can transition from the + boot versions. */ + if (!init_gdt(cpu, curr)) { + /* failed to allocate something; not much we can do... */ + for (;;) + local_irq_enable(); + } + + cpu_set_gdt(cpu); + _cpu_init(cpu, curr); +} + #ifdef CONFIG_HOTPLUG_CPU void __cpuinit cpu_uninit(void) { --- head-2011-02-17.orig/arch/x86/kernel/cpu/mtrr/main-xen.c 2008-01-28 12:24:18.000000000 +0100 +++ head-2011-02-17/arch/x86/kernel/cpu/mtrr/main-xen.c 2011-01-31 17:32:16.000000000 +0100 @@ -12,7 +12,7 @@ static DEFINE_MUTEX(mtrr_mutex); void generic_get_mtrr(unsigned int reg, unsigned long *base, - unsigned int *size, mtrr_type * type) + unsigned long *size, mtrr_type * type) { struct xen_platform_op op; @@ -115,8 +115,7 @@ int mtrr_del_page(int reg, unsigned long { unsigned i; mtrr_type ltype; - unsigned long lbase; - unsigned int lsize; + unsigned long lbase, lsize; int error = -EINVAL; struct xen_platform_op op; --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ head-2011-02-17/arch/x86/kernel/e820_32-xen.c 2011-01-31 17:32:16.000000000 +0100 @@ -0,0 +1,1002 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#ifdef CONFIG_EFI +int efi_enabled = 0; +EXPORT_SYMBOL(efi_enabled); +#endif + +struct e820map e820; +struct change_member { + struct e820entry *pbios; /* pointer to original bios entry */ + unsigned long long addr; /* address for this change point */ +}; +static struct change_member change_point_list[2*E820MAX] __initdata; +static struct change_member *change_point[2*E820MAX] __initdata; +static struct e820entry *overlap_list[E820MAX] __initdata; +static struct e820entry new_bios[E820MAX] __initdata; +/* For PCI or other memory-mapped resources */ +unsigned long pci_mem_start = 0x10000000; +#ifdef CONFIG_PCI +EXPORT_SYMBOL(pci_mem_start); +#endif +extern int user_defined_memmap; +struct resource data_resource = { + .name = "Kernel data", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +struct resource code_resource = { + .name = "Kernel code", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource system_rom_resource = { + .name = "System ROM", + .start = 0xf0000, + .end = 0xfffff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource extension_rom_resource = { + .name = "Extension ROM", + .start = 0xe0000, + .end = 0xeffff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource adapter_rom_resources[] = { { + .name = "Adapter ROM", + .start = 0xc8000, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | 
IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +} }; + +static struct resource video_rom_resource = { + .name = "Video ROM", + .start = 0xc0000, + .end = 0xc7fff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource video_ram_resource = { + .name = "Video RAM area", + .start = 0xa0000, + .end = 0xbffff, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource standard_io_resources[] = { { + .name = "dma1", + .start = 0x0000, + .end = 0x001f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "pic1", + .start = 0x0020, + .end = 0x0021, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "timer0", + .start = 0x0040, + .end = 0x0043, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "timer1", + .start = 0x0050, + .end = 0x0053, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "keyboard", + .start = 0x0060, + .end = 0x006f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "dma page reg", + .start = 0x0080, + .end = 0x008f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "pic2", + .start = 0x00a0, + .end = 0x00a1, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "dma2", + .start = 0x00c0, + .end = 0x00df, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "fpu", + .start = 0x00f0, + .end = 0x00ff, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +} }; + +static int romsignature(const unsigned char *x) +{ + unsigned short sig; + int ret = 0; + if (probe_kernel_address((const unsigned short *)x, sig) == 0) + ret = (sig == 0xaa55); + return ret; +} + +static int __init romchecksum(unsigned char *rom, unsigned long length) +{ + unsigned char *p, sum = 0; + + for (p = rom; p < rom + length; p++) + sum += *p; + return sum == 0; +} + +static void __init probe_roms(void) +{ + unsigned long start, length, upper; + unsigned char *rom; + int i; + +#ifdef CONFIG_XEN + /* Nothing to do if not running in dom0. */ + if (!is_initial_xendomain()) + return; +#endif + + /* video rom */ + upper = adapter_rom_resources[0].start; + for (start = video_rom_resource.start; start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + video_rom_resource.start = start; + + /* 0 < length <= 0x7f * 512, historically */ + length = rom[2] * 512; + + /* if checksum okay, trust length byte */ + if (length && romchecksum(rom, length)) + video_rom_resource.end = start + length - 1; + + request_resource(&iomem_resource, &video_rom_resource); + break; + } + + start = (video_rom_resource.end + 1 + 2047) & ~2047UL; + if (start < upper) + start = upper; + + /* system rom */ + request_resource(&iomem_resource, &system_rom_resource); + upper = system_rom_resource.start; + + /* check for extension rom (ignore length byte!) 
*/ + rom = isa_bus_to_virt((unsigned long)extension_rom_resource.start); + if (romsignature(rom)) { + length = extension_rom_resource.end - extension_rom_resource.start + 1; + if (romchecksum(rom, length)) { + request_resource(&iomem_resource, &extension_rom_resource); + upper = extension_rom_resource.start; + } + } + + /* check for adapter roms on 2k boundaries */ + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = rom[2] * 512; + + /* but accept any length that fits if checksum okay */ + if (!length || start + length > upper || !romchecksum(rom, length)) + continue; + + adapter_rom_resources[i].start = start; + adapter_rom_resources[i].end = start + length - 1; + request_resource(&iomem_resource, &adapter_rom_resources[i]); + + start = adapter_rom_resources[i++].end & ~2047UL; + } +} + +#ifdef CONFIG_XEN +static struct e820map machine_e820 __initdata; +#define e820 machine_e820 +#endif + +/* + * Request address space for all standard RAM and ROM resources + * and also for regions reported as reserved by the e820. + */ +static void __init +legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource) +{ + int i; + + probe_roms(); + for (i = 0; i < e820.nr_map; i++) { + struct resource *res; +#ifndef CONFIG_RESOURCES_64BIT + if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL) + continue; +#endif + res = kzalloc(sizeof(struct resource), GFP_ATOMIC); + switch (e820.map[i].type) { + case E820_RAM: res->name = "System RAM"; break; + case E820_ACPI: res->name = "ACPI Tables"; break; + case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; + default: res->name = "reserved"; + } + res->start = e820.map[i].addr; + res->end = res->start + e820.map[i].size - 1; + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + if (request_resource(&iomem_resource, res)) { + kfree(res); + continue; + } + if (e820.map[i].type == E820_RAM) { + /* + * We don't know which RAM region contains kernel data, + * so we try it repeatedly and let the resource manager + * test it. + */ +#ifndef CONFIG_XEN + request_resource(res, code_resource); + request_resource(res, data_resource); +#endif +#ifdef CONFIG_KEXEC + request_resource(res, &crashk_res); +#ifdef CONFIG_XEN + xen_machine_kexec_register_resources(res); +#endif +#endif + } + } +} + +#undef e820 + +/* + * Request address space for all standard resources + * + * This is called just before pcibios_init(), which is also a + * subsys_initcall, but is linked in later (in arch/i386/pci/common.c). + */ +static int __init request_standard_resources(void) +{ + int i; + + /* Nothing to do if not running in dom0. 
*/ + if (!is_initial_xendomain()) + return 0; + + printk("Setting up standard PCI resources\n"); + if (efi_enabled) + efi_initialize_iomem_resources(&code_resource, &data_resource); + else + legacy_init_iomem_resources(&code_resource, &data_resource); + + /* EFI systems may still have VGA */ + request_resource(&iomem_resource, &video_ram_resource); + + /* request I/O space for devices used on all i[345]86 PCs */ + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) + request_resource(&ioport_resource, &standard_io_resources[i]); + return 0; +} + +subsys_initcall(request_standard_resources); + +void __init add_memory_region(unsigned long long start, + unsigned long long size, int type) +{ + int x; + + if (!efi_enabled) { + x = e820.nr_map; + + if (x == E820MAX) { + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); + return; + } + + e820.map[x].addr = start; + e820.map[x].size = size; + e820.map[x].type = type; + e820.nr_map++; + } +} /* add_memory_region */ + +/* + * Sanitize the BIOS e820 map. + * + * Some e820 responses include overlapping entries. The following + * replaces the original e820 map with a new one, removing overlaps. + * + */ +int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) +{ + struct change_member *change_tmp; + unsigned long current_type, last_type; + unsigned long long last_addr; + int chgidx, still_changing; + int overlap_entries; + int new_bios_entry; + int old_nr, new_nr, chg_nr; + int i; + + /* + Visually we're performing the following (1,2,3,4 = memory types)... + + Sample memory map (w/overlaps): + ____22__________________ + ______________________4_ + ____1111________________ + _44_____________________ + 11111111________________ + ____________________33__ + ___________44___________ + __________33333_________ + ______________22________ + ___________________2222_ + _________111111111______ + _____________________11_ + _________________4______ + + Sanitized equivalent (no overlap): + 1_______________________ + _44_____________________ + ___1____________________ + ____22__________________ + ______11________________ + _________1______________ + __________3_____________ + ___________44___________ + _____________33_________ + _______________2________ + ________________1_______ + _________________4______ + ___________________2____ + ____________________33__ + ______________________4_ + */ + printk("sanitize start\n"); + /* if there's only one memory region, don't bother */ + if (*pnr_map < 2) { + printk("sanitize bail 0\n"); + return -1; + } + + old_nr = *pnr_map; + + /* bail out if we find any unreasonable addresses in bios map */ + for (i=0; iaddr = biosmap[i].addr; + change_point[chgidx++]->pbios = &biosmap[i]; + change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; + change_point[chgidx++]->pbios = &biosmap[i]; + } + } + chg_nr = chgidx; /* true number of change-points */ + + /* sort change-point list by memory addresses (low -> high) */ + still_changing = 1; + while (still_changing) { + still_changing = 0; + for (i=1; i < chg_nr; i++) { + /* if > , swap */ + /* or, if current= & last=, swap */ + if ((change_point[i]->addr < change_point[i-1]->addr) || + ((change_point[i]->addr == change_point[i-1]->addr) && + (change_point[i]->addr == change_point[i]->pbios->addr) && + (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) + ) + { + change_tmp = change_point[i]; + change_point[i] = change_point[i-1]; + change_point[i-1] = change_tmp; + still_changing=1; + } + } + } + + /* create a new bios memory map, 
removing overlaps */ + overlap_entries=0; /* number of entries in the overlap table */ + new_bios_entry=0; /* index for creating new bios map entries */ + last_type = 0; /* start with undefined memory type */ + last_addr = 0; /* start with 0 as last starting address */ + /* loop through change-points, determining affect on the new bios map */ + for (chgidx=0; chgidx < chg_nr; chgidx++) + { + /* keep track of all overlapping bios entries */ + if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) + { + /* add map entry to overlap list (> 1 entry implies an overlap) */ + overlap_list[overlap_entries++]=change_point[chgidx]->pbios; + } + else + { + /* remove entry from list (order independent, so swap with last) */ + for (i=0; ipbios) + overlap_list[i] = overlap_list[overlap_entries-1]; + } + overlap_entries--; + } + /* if there are overlapping entries, decide which "type" to use */ + /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ + current_type = 0; + for (i=0; itype > current_type) + current_type = overlap_list[i]->type; + /* continue building up new bios map based on this information */ + if (current_type != last_type) { + if (last_type != 0) { + new_bios[new_bios_entry].size = + change_point[chgidx]->addr - last_addr; + /* move forward only if the new size was non-zero */ + if (new_bios[new_bios_entry].size != 0) + if (++new_bios_entry >= E820MAX) + break; /* no more space left for new bios entries */ + } + if (current_type != 0) { + new_bios[new_bios_entry].addr = change_point[chgidx]->addr; + new_bios[new_bios_entry].type = current_type; + last_addr=change_point[chgidx]->addr; + } + last_type = current_type; + } + } + new_nr = new_bios_entry; /* retain count for new bios entries */ + + /* copy new bios mapping into original location */ + memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); + *pnr_map = new_nr; + + printk("sanitize end\n"); + return 0; +} + +/* + * Copy the BIOS e820 map into a safe place. + * + * Sanity-check it while we're at it.. + * + * If we're lucky and live on a modern system, the setup code + * will have given us a memory map that we can use to properly + * set up memory. If we aren't, we'll fake a memory map. + * + * We check to see that the memory map contains at least 2 elements + * before we'll use it, because the detection code in setup.S may + * not be perfect and most every PC known to man has two memory + * regions: one from 0 to 640k, and one from 1mb up. (The IBM + * thinkpad 560x, for example, does not cooperate with the memory + * detection code.) + */ +int __init copy_e820_map(struct e820entry * biosmap, int nr_map) +{ +#ifndef CONFIG_XEN + /* Only one memory region (or negative)? Ignore it */ + if (nr_map < 2) + return -1; +#else + BUG_ON(nr_map < 1); +#endif + + do { + unsigned long long start = biosmap->addr; + unsigned long long size = biosmap->size; + unsigned long long end = start + size; + unsigned long type = biosmap->type; + printk("copy_e820_map() start: %016Lx size: %016Lx end: %016Lx type: %ld\n", start, size, end, type); + + /* Overflow in 64 bits? Ignore the memory map. */ + if (start > end) + return -1; + +#ifndef CONFIG_XEN + /* + * Some BIOSes claim RAM in the 640k - 1M region. + * Not right. Fix it up. 
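+	 * (E.g. a claimed RAM entry 0x9f000-0x100000 is clipped to end
+	 * at 0xa0000 below; an entry 0x9f000-0x200000 is clipped the
+	 * same way and then re-added as RAM from 0x100000 up.)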
+ */ + if (type == E820_RAM) { + printk("copy_e820_map() type is E820_RAM\n"); + if (start < 0x100000ULL && end > 0xA0000ULL) { + printk("copy_e820_map() lies in range...\n"); + if (start < 0xA0000ULL) { + printk("copy_e820_map() start < 0xA0000ULL\n"); + add_memory_region(start, 0xA0000ULL-start, type); + } + if (end <= 0x100000ULL) { + printk("copy_e820_map() end <= 0x100000ULL\n"); + continue; + } + start = 0x100000ULL; + size = end - start; + } + } +#endif + add_memory_region(start, size, type); + } while (biosmap++,--nr_map); + +#ifdef CONFIG_XEN + if (is_initial_xendomain()) { + struct xen_memory_map memmap; + + memmap.nr_entries = E820MAX; + set_xen_guest_handle(memmap.buffer, machine_e820.map); + + if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap)) + BUG(); + machine_e820.nr_map = memmap.nr_entries; + } else + machine_e820 = e820; +#endif + + return 0; +} + +/* + * Callback for efi_memory_walk. + */ +static int __init +efi_find_max_pfn(unsigned long start, unsigned long end, void *arg) +{ + unsigned long *max_pfn = arg, pfn; + + if (start < end) { + pfn = PFN_UP(end -1); + if (pfn > *max_pfn) + *max_pfn = pfn; + } + return 0; +} + +static int __init +efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg) +{ + memory_present(0, PFN_UP(start), PFN_DOWN(end)); + return 0; +} + +/* + * Find the highest page frame number we have available + */ +void __init find_max_pfn(void) +{ + int i; + + max_pfn = 0; + if (efi_enabled) { + efi_memmap_walk(efi_find_max_pfn, &max_pfn); + efi_memmap_walk(efi_memory_present_wrapper, NULL); + return; + } + + for (i = 0; i < e820.nr_map; i++) { + unsigned long start, end; + /* RAM? */ + if (e820.map[i].type != E820_RAM) + continue; + start = PFN_UP(e820.map[i].addr); + end = PFN_DOWN(e820.map[i].addr + e820.map[i].size); + if (start >= end) + continue; + if (end > max_pfn) + max_pfn = end; + memory_present(0, start, end); + } +} + +/* + * Free all available memory for boot time allocation. Used + * as a callback function by efi_memory_walk() + */ + +static int __init +free_available_memory(unsigned long start, unsigned long end, void *arg) +{ + /* check max_low_pfn */ + if (start >= (max_low_pfn << PAGE_SHIFT)) + return 0; + if (end >= (max_low_pfn << PAGE_SHIFT)) + end = max_low_pfn << PAGE_SHIFT; + if (start < end) + free_bootmem(start, end - start); + + return 0; +} +/* + * Register fully available low RAM pages with the bootmem allocator. + */ +void __init register_bootmem_low_pages(unsigned long max_low_pfn) +{ + int i; + + if (efi_enabled) { + efi_memmap_walk(free_available_memory, NULL); + return; + } + for (i = 0; i < e820.nr_map; i++) { + unsigned long curr_pfn, last_pfn, size; + /* + * Reserve usable low memory + */ + if (e820.map[i].type != E820_RAM) + continue; + /* + * We are rounding up the start address of usable memory: + */ + curr_pfn = PFN_UP(e820.map[i].addr); + if (curr_pfn >= max_low_pfn) + continue; + /* + * ... and at the end of the usable range downwards: + */ + last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size); + +#ifdef CONFIG_XEN + /* + * Truncate to the number of actual pages currently + * present. + */ + if (last_pfn > xen_start_info->nr_pages) + last_pfn = xen_start_info->nr_pages; +#endif + + if (last_pfn > max_low_pfn) + last_pfn = max_low_pfn; + + /* + * .. finally, did all the rounding and playing + * around just make the area go away? 
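+		 * (E.g. a RAM entry smaller than one page, or one already
+		 * truncated above by max_low_pfn or nr_pages, leaves
+		 * last_pfn <= curr_pfn here.)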
+ */ + if (last_pfn <= curr_pfn) + continue; + + size = last_pfn - curr_pfn; + free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size)); + } +} + +void __init e820_register_memory(void) +{ + unsigned long gapstart, gapsize, round; + unsigned long long last; + int i; + +#ifdef CONFIG_XEN +#define e820 machine_e820 +#endif + /* + * Search for the bigest gap in the low 32 bits of the e820 + * memory space. + */ + last = 0x100000000ull; + gapstart = 0x10000000; + gapsize = 0x400000; + i = e820.nr_map; + while (--i >= 0) { + unsigned long long start = e820.map[i].addr; + unsigned long long end = start + e820.map[i].size; + + /* + * Since "last" is at most 4GB, we know we'll + * fit in 32 bits if this condition is true + */ + if (last > end) { + unsigned long gap = last - end; + + if (gap > gapsize) { + gapsize = gap; + gapstart = end; + } + } + if (start < last) + last = start; + } +#undef e820 + + /* + * See how much we want to round up: start off with + * rounding to the next 1MB area. + */ + round = 0x100000; + while ((gapsize >> 4) > round) + round += round; + /* Fun with two's complement */ + pci_mem_start = (gapstart + round) & -round; + + printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n", + pci_mem_start, gapstart, gapsize); +} + +void __init print_memory_map(char *who) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + printk(" %s: %016Lx - %016Lx ", who, + e820.map[i].addr, + e820.map[i].addr + e820.map[i].size); + switch (e820.map[i].type) { + case E820_RAM: printk("(usable)\n"); + break; + case E820_RESERVED: + printk("(reserved)\n"); + break; + case E820_ACPI: + printk("(ACPI data)\n"); + break; + case E820_NVS: + printk("(ACPI NVS)\n"); + break; + default: printk("type %lu\n", e820.map[i].type); + break; + } + } +} + +static __init __always_inline void efi_limit_regions(unsigned long long size) +{ + unsigned long long current_addr = 0; + efi_memory_desc_t *md, *next_md; + void *p, *p1; + int i, j; + + j = 0; + p1 = memmap.map; + for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) { + md = p; + next_md = p1; + current_addr = md->phys_addr + + PFN_PHYS(md->num_pages); + if (is_available_memory(md)) { + if (md->phys_addr >= size) continue; + memcpy(next_md, md, memmap.desc_size); + if (current_addr >= size) { + next_md->num_pages -= + PFN_UP(current_addr-size); + } + p1 += memmap.desc_size; + next_md = p1; + j++; + } else if ((md->attribute & EFI_MEMORY_RUNTIME) == + EFI_MEMORY_RUNTIME) { + /* In order to make runtime services + * available we have to include runtime + * memory regions in memory map */ + memcpy(next_md, md, memmap.desc_size); + p1 += memmap.desc_size; + next_md = p1; + j++; + } + } + memmap.nr_map = j; + memmap.map_end = memmap.map + + (memmap.nr_map * memmap.desc_size); +} + +void __init limit_regions(unsigned long long size) +{ + unsigned long long current_addr = 0; + int i; + + print_memory_map("limit_regions start"); + if (efi_enabled) { + efi_limit_regions(size); + return; + } + for (i = 0; i < e820.nr_map; i++) { + current_addr = e820.map[i].addr + e820.map[i].size; + if (current_addr < size) + continue; + + if (e820.map[i].type != E820_RAM) + continue; + + if (e820.map[i].addr >= size) { + /* + * This region starts past the end of the + * requested size, skip it completely. 
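+			 * (Setting nr_map = i drops this and every later
+			 * entry; once sanitize_e820_map() has run, the map
+			 * is in ascending order, so the later entries all
+			 * start past the limit as well.)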
+ */ + e820.nr_map = i; + } else { + e820.nr_map = i + 1; + e820.map[i].size -= current_addr - size; + } + print_memory_map("limit_regions endfor"); + return; + } +#ifdef CONFIG_XEN + if (current_addr < size) { + /* + * The e820 map finished before our requested size so + * extend the final entry to the requested address. + */ + --i; + if (e820.map[i].type == E820_RAM) + e820.map[i].size -= current_addr - size; + else + add_memory_region(current_addr, size - current_addr, E820_RAM); + } +#endif + print_memory_map("limit_regions endfunc"); +} + +/* + * This function checks if any part of the range is mapped + * with type. + */ +int +e820_any_mapped(u64 start, u64 end, unsigned type) +{ + int i; + +#ifndef CONFIG_XEN + for (i = 0; i < e820.nr_map; i++) { + const struct e820entry *ei = &e820.map[i]; +#else + if (!is_initial_xendomain()) + return 0; + for (i = 0; i < machine_e820.nr_map; ++i) { + const struct e820entry *ei = &machine_e820.map[i]; +#endif + + if (type && ei->type != type) + continue; + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(e820_any_mapped); + + /* + * This function checks if the entire range is mapped with type. + * + * Note: this function only works correct if the e820 table is sorted and + * not-overlapping, which is the case + */ +int __init +e820_all_mapped(unsigned long s, unsigned long e, unsigned type) +{ + u64 start = s; + u64 end = e; + int i; + +#ifndef CONFIG_XEN + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; +#else + if (!is_initial_xendomain()) + return 0; + for (i = 0; i < machine_e820.nr_map; ++i) { + const struct e820entry *ei = &machine_e820.map[i]; +#endif + + if (type && ei->type != type) + continue; + /* is the region (part) in overlap with the current region ?*/ + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + /* if the region is at the beginning of we move + * start to the end of the region since it's ok until there + */ + if (ei->addr <= start) + start = ei->addr + ei->size; + /* if start is now at or beyond end, we're done, full + * coverage */ + if (start >= end) + return 1; /* we're done */ + } + return 0; +} + +static int __init parse_memmap(char *arg) +{ + if (!arg) + return -EINVAL; + + if (strcmp(arg, "exactmap") == 0) { +#ifdef CONFIG_CRASH_DUMP + /* If we are doing a crash dump, we + * still need to know the real mem + * size before original memory map is + * reset. + */ + find_max_pfn(); + saved_max_pfn = max_pfn; +#endif + e820.nr_map = 0; + user_defined_memmap = 1; + } else { + /* If the user specifies memory size, we + * limit the BIOS-provided memory map to + * that size. exactmap can be used to specify + * the exact map. mem=number can be used to + * trim the existing memory map. 
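+		 * (E.g. "memmap=exactmap memmap=640K@0 memmap=255M@1M"
+		 * replaces the BIOS map outright, "memmap=64M#0x4000000"
+		 * marks a range as ACPI data, "memmap=1M$0x48000000"
+		 * reserves one, and a bare "memmap=512M" truncates the
+		 * map via limit_regions().)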
+ */ + unsigned long long start_at, mem_size; + + mem_size = memparse(arg, &arg); + if (*arg == '@') { + start_at = memparse(arg+1, &arg); + add_memory_region(start_at, mem_size, E820_RAM); + } else if (*arg == '#') { + start_at = memparse(arg+1, &arg); + add_memory_region(start_at, mem_size, E820_ACPI); + } else if (*arg == '$') { + start_at = memparse(arg+1, &arg); + add_memory_region(start_at, mem_size, E820_RESERVED); + } else { + limit_regions(mem_size); + user_defined_memmap = 1; + } + } + return 0; +} +early_param("memmap", parse_memmap); --- head-2011-02-17.orig/arch/x86/kernel/entry_32-xen.S 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/arch/x86/kernel/entry_32-xen.S 2011-01-31 17:32:16.000000000 +0100 @@ -30,12 +30,13 @@ * 18(%esp) - %eax * 1C(%esp) - %ds * 20(%esp) - %es - * 24(%esp) - orig_eax - * 28(%esp) - %eip - * 2C(%esp) - %cs - * 30(%esp) - %eflags - * 34(%esp) - %oldesp - * 38(%esp) - %oldss + * 24(%esp) - %gs + * 28(%esp) - orig_eax + * 2C(%esp) - %eip + * 30(%esp) - %cs + * 34(%esp) - %eflags + * 38(%esp) - %oldesp + * 3C(%esp) - %oldss * * "current" is in register %ebx during any slow entries. */ @@ -48,27 +49,25 @@ #include #include #include +#include #include #include "irq_vectors.h" #include -#define nr_syscalls ((syscall_table_size)/4) +/* + * We use macros for low-level operations which need to be overridden + * for paravirtualization. The following will never clobber any registers: + * INTERRUPT_RETURN (aka. "iret") + * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") + * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). + * + * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must + * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). + * Allowing a register to be clobbered can shrink the paravirt replacement + * enough to patch inline, increasing performance. + */ -EBX = 0x00 -ECX = 0x04 -EDX = 0x08 -ESI = 0x0C -EDI = 0x10 -EBP = 0x14 -EAX = 0x18 -DS = 0x1C -ES = 0x20 -ORIG_EAX = 0x24 -EIP = 0x28 -CS = 0x2C -EFLAGS = 0x30 -OLDESP = 0x34 -OLDSS = 0x38 +#define nr_syscalls ((syscall_table_size)/4) CF_MASK = 0x00000001 TF_MASK = 0x00000100 @@ -79,61 +78,16 @@ VM_MASK = 0x00020000 /* Pseudo-eflags. */ NMI_MASK = 0x80000000 -#ifndef CONFIG_XEN -/* These are replaces for paravirtualization */ -#define DISABLE_INTERRUPTS cli -#define ENABLE_INTERRUPTS sti -#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit -#define INTERRUPT_RETURN iret -#define GET_CR0_INTO_EAX movl %cr0, %eax -#else -/* Offsets into shared_info_t. */ -#define evtchn_upcall_pending /* 0 */ -#define evtchn_upcall_mask 1 - -#define sizeof_vcpu_shift 6 - -#ifdef CONFIG_SMP -#define GET_VCPU_INFO movl TI_cpu(%ebp),%esi ; \ - shl $sizeof_vcpu_shift,%esi ; \ - addl HYPERVISOR_shared_info,%esi -#else -#define GET_VCPU_INFO movl HYPERVISOR_shared_info,%esi -#endif - -#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(%esi) -#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(%esi) -#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi) -#define DISABLE_INTERRUPTS GET_VCPU_INFO ; \ - __DISABLE_INTERRUPTS -#define ENABLE_INTERRUPTS GET_VCPU_INFO ; \ - __ENABLE_INTERRUPTS -#define ENABLE_INTERRUPTS_SYSEXIT __ENABLE_INTERRUPTS ; \ -sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \ - __TEST_PENDING ; \ - jnz 14f # process more events if necessary... 
; \ - movl ESI(%esp), %esi ; \ - sysexit ; \ -14: __DISABLE_INTERRUPTS ; \ - TRACE_IRQS_OFF ; \ -sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ ; \ - push %esp ; \ - call evtchn_do_upcall ; \ - add $4,%esp ; \ - jmp ret_from_intr -#define INTERRUPT_RETURN iret -#endif - #ifdef CONFIG_PREEMPT -#define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF +#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF #else -#define preempt_stop +#define preempt_stop(clobbers) #define resume_kernel restore_nocheck #endif .macro TRACE_IRQS_IRET #ifdef CONFIG_TRACE_IRQFLAGS - testl $IF_MASK,EFLAGS(%esp) # interrupts off? + testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off? jz 1f TRACE_IRQS_ON 1: @@ -148,6 +102,9 @@ sysexit_ecrit: /**** END OF SYSEXIT CRIT #define SAVE_ALL \ cld; \ + pushl %gs; \ + CFI_ADJUST_CFA_OFFSET 4;\ + /*CFI_REL_OFFSET gs, 0;*/\ pushl %es; \ CFI_ADJUST_CFA_OFFSET 4;\ /*CFI_REL_OFFSET es, 0;*/\ @@ -177,7 +134,9 @@ sysexit_ecrit: /**** END OF SYSEXIT CRIT CFI_REL_OFFSET ebx, 0;\ movl $(__USER_DS), %edx; \ movl %edx, %ds; \ - movl %edx, %es; + movl %edx, %es; \ + movl $(__KERNEL_PDA), %edx; \ + movl %edx, %gs #define RESTORE_INT_REGS \ popl %ebx; \ @@ -210,17 +169,22 @@ sysexit_ecrit: /**** END OF SYSEXIT CRIT 2: popl %es; \ CFI_ADJUST_CFA_OFFSET -4;\ /*CFI_RESTORE es;*/\ -.section .fixup,"ax"; \ -3: movl $0,(%esp); \ - jmp 1b; \ +3: popl %gs; \ + CFI_ADJUST_CFA_OFFSET -4;\ + /*CFI_RESTORE gs;*/\ +.pushsection .fixup,"ax"; \ 4: movl $0,(%esp); \ + jmp 1b; \ +5: movl $0,(%esp); \ jmp 2b; \ -.previous; \ +6: movl $0,(%esp); \ + jmp 3b; \ .section __ex_table,"a";\ .align 4; \ - .long 1b,3b; \ - .long 2b,4b; \ -.previous + .long 1b,4b; \ + .long 2b,5b; \ + .long 3b,6b; \ +.popsection #define RING0_INT_FRAME \ CFI_STARTPROC simple;\ @@ -239,18 +203,18 @@ sysexit_ecrit: /**** END OF SYSEXIT CRIT #define RING0_PTREGS_FRAME \ CFI_STARTPROC simple;\ CFI_SIGNAL_FRAME;\ - CFI_DEF_CFA esp, OLDESP-EBX;\ - /*CFI_OFFSET cs, CS-OLDESP;*/\ - CFI_OFFSET eip, EIP-OLDESP;\ - /*CFI_OFFSET es, ES-OLDESP;*/\ - /*CFI_OFFSET ds, DS-OLDESP;*/\ - CFI_OFFSET eax, EAX-OLDESP;\ - CFI_OFFSET ebp, EBP-OLDESP;\ - CFI_OFFSET edi, EDI-OLDESP;\ - CFI_OFFSET esi, ESI-OLDESP;\ - CFI_OFFSET edx, EDX-OLDESP;\ - CFI_OFFSET ecx, ECX-OLDESP;\ - CFI_OFFSET ebx, EBX-OLDESP + CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\ + /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\ + CFI_OFFSET eip, PT_EIP-PT_OLDESP;\ + /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\ + /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\ + CFI_OFFSET eax, PT_EAX-PT_OLDESP;\ + CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\ + CFI_OFFSET edi, PT_EDI-PT_OLDESP;\ + CFI_OFFSET esi, PT_ESI-PT_OLDESP;\ + CFI_OFFSET edx, PT_EDX-PT_OLDESP;\ + CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\ + CFI_OFFSET ebx, PT_EBX-PT_OLDESP ENTRY(ret_from_fork) CFI_STARTPROC @@ -278,17 +242,18 @@ ENTRY(ret_from_fork) ALIGN RING0_PTREGS_FRAME ret_from_exception: - preempt_stop + preempt_stop(CLBR_ANY) ret_from_intr: GET_THREAD_INFO(%ebp) check_userspace: - movl EFLAGS(%esp), %eax # mix EFLAGS and CS - movb CS(%esp), %al + movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS + movb PT_CS(%esp), %al andl $(VM_MASK | SEGMENT_RPL_MASK), %eax cmpl $USER_RPL, %eax jb resume_kernel # not returning to v8086 or userspace + ENTRY(resume_userspace) - DISABLE_INTERRUPTS # make sure we don't miss an interrupt + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret movl TI_flags(%ebp), %ecx @@ -299,14 +264,14 @@ ENTRY(resume_userspace) #ifdef CONFIG_PREEMPT 
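 	# (Kernel preemption path: the checks below only reschedule when
 	# preempt_count is zero and the interrupted context had
 	# interrupts enabled.)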
ENTRY(resume_kernel) - DISABLE_INTERRUPTS + DISABLE_INTERRUPTS(CLBR_ANY) cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? jnz restore_nocheck need_resched: movl TI_flags(%ebp), %ecx # need_resched set ? testb $_TIF_NEED_RESCHED, %cl jz restore_all - testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ? + testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off (exception path) ? jz restore_all call preempt_schedule_irq jmp need_resched @@ -328,7 +293,7 @@ sysenter_past_esp: * No need to follow this irqs on/off section: the syscall * disabled irqs and here we enable it straight after entry: */ - ENABLE_INTERRUPTS + ENABLE_INTERRUPTS(CLBR_NONE) pushl $(__USER_DS) CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET ss, 0*/ @@ -340,12 +305,16 @@ sysenter_past_esp: pushl $(__USER_CS) CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET cs, 0*/ +#ifndef CONFIG_COMPAT_VDSO /* * Push current_thread_info()->sysenter_return to the stack. * A tiny bit of offset fixup is necessary - 4*4 means the 4 words * pushed above; +8 corresponds to copy_thread's esp0 setting. */ pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) +#else + pushl $SYSENTER_RETURN +#endif CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET eip, 0 @@ -372,19 +341,27 @@ sysenter_past_esp: cmpl $(nr_syscalls), %eax jae syscall_badsys call *sys_call_table(,%eax,4) - movl %eax,EAX(%esp) - DISABLE_INTERRUPTS + movl %eax,PT_EAX(%esp) + DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX) TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx jne syscall_exit_work /* if something modifies registers it must also disable sysexit */ - movl EIP(%esp), %edx - movl OLDESP(%esp), %ecx + movl PT_EIP(%esp), %edx + movl PT_OLDESP(%esp), %ecx xorl %ebp,%ebp TRACE_IRQS_ON +1: mov PT_GS(%esp), %gs ENABLE_INTERRUPTS_SYSEXIT CFI_ENDPROC +.pushsection .fixup,"ax" +2: movl $0,PT_GS(%esp) + jmp 1b +.section __ex_table,"a" + .align 4 + .long 1b,2b +.popsection # pv sysenter call handler stub ENTRY(sysenter_entry_pv) @@ -419,7 +396,7 @@ ENTRY(system_call) CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL GET_THREAD_INFO(%ebp) - testl $TF_MASK,EFLAGS(%esp) + testl $TF_MASK,PT_EFLAGS(%esp) jz no_singlestep orl $_TIF_SINGLESTEP,TI_flags(%ebp) no_singlestep: @@ -431,9 +408,9 @@ no_singlestep: jae syscall_badsys syscall_call: call *sys_call_table(,%eax,4) - movl %eax,EAX(%esp) # store the return value + movl %eax,PT_EAX(%esp) # store the return value syscall_exit: - DISABLE_INTERRUPTS # make sure we don't miss an interrupt + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret TRACE_IRQS_OFF @@ -443,12 +420,12 @@ syscall_exit: restore_all: #ifndef CONFIG_XEN - movl EFLAGS(%esp), %eax # mix EFLAGS, SS and CS - # Warning: OLDSS(%esp) contains the wrong/random values if we + movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS + # Warning: PT_OLDSS(%esp) contains the wrong/random values if we # are returning to the kernel. # See comments in process.c:copy_thread() for details. 
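 	# (The EFLAGS/SS/CS mix built below feeds the check for a return
 	# to user space on an LDT stack segment - the dosemu/wine 16bit
 	# stack case - which branches off to ldt_ss.)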
- movb OLDSS(%esp), %ah - movb CS(%esp), %al + movb PT_OLDSS(%esp), %ah + movb PT_CS(%esp), %al andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax CFI_REMEMBER_STATE @@ -456,7 +433,7 @@ restore_all: restore_nocheck: #else restore_nocheck: - movl EFLAGS(%esp), %eax + movl PT_EFLAGS(%esp), %eax testl $(VM_MASK|NMI_MASK), %eax CFI_REMEMBER_STATE jnz hypervisor_iret @@ -470,13 +447,13 @@ restore_nocheck: TRACE_IRQS_IRET restore_nocheck_notrace: RESTORE_REGS - addl $4, %esp + addl $4, %esp # skip orig_eax/error_code CFI_ADJUST_CFA_OFFSET -4 1: INTERRUPT_RETURN .section .fixup,"ax" iret_exc: #ifndef CONFIG_XEN - ENABLE_INTERRUPTS + ENABLE_INTERRUPTS(CLBR_NONE) #endif pushl $0 # no error code pushl $do_iret_error @@ -490,33 +467,42 @@ iret_exc: CFI_RESTORE_STATE #ifndef CONFIG_XEN ldt_ss: - larl OLDSS(%esp), %eax + larl PT_OLDSS(%esp), %eax jnz restore_nocheck testl $0x00400000, %eax # returning to 32bit stack? jnz restore_nocheck # allright, normal return + +#ifdef CONFIG_PARAVIRT + /* + * The kernel can't run on a non-flat stack if paravirt mode + * is active. Rather than try to fixup the high bits of + * ESP, bypass this code entirely. This may break DOSemu + * and/or Wine support in a paravirt VM, although the option + * is still available to implement the setting of the high + * 16-bits in the INTERRUPT_RETURN paravirt-op. + */ + cmpl $0, paravirt_ops+PARAVIRT_enabled + jne restore_nocheck +#endif + /* If returning to userspace with 16bit stack, * try to fix the higher word of ESP, as the CPU * won't restore it. * This is an "official" bug of all the x86-compatible * CPUs, which we can try to work around to make * dosemu and wine happy. */ - subl $8, %esp # reserve space for switch16 pointer - CFI_ADJUST_CFA_OFFSET 8 - DISABLE_INTERRUPTS + movl PT_OLDESP(%esp), %eax + movl %esp, %edx + call patch_espfix_desc + pushl $__ESPFIX_SS + CFI_ADJUST_CFA_OFFSET 4 + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + DISABLE_INTERRUPTS(CLBR_EAX) TRACE_IRQS_OFF - movl %esp, %eax - /* Set up the 16bit stack frame with switch32 pointer on top, - * and a switch16 pointer on top of the current frame. 
*/ - call setup_x86_bogus_stack - CFI_ADJUST_CFA_OFFSET -8 # frame has moved - TRACE_IRQS_IRET - RESTORE_REGS - lss 20+4(%esp), %esp # switch to 16bit stack -1: INTERRUPT_RETURN -.section __ex_table,"a" - .align 4 - .long 1b,iret_exc -.previous + lss (%esp), %esp + CFI_ADJUST_CFA_OFFSET -8 + jmp restore_nocheck #else ALIGN restore_all_enable_events: @@ -540,7 +526,7 @@ ecrit: /**** END OF CRITICAL REGION *** CFI_RESTORE_STATE hypervisor_iret: - andl $~NMI_MASK, EFLAGS(%esp) + andl $~NMI_MASK, PT_EFLAGS(%esp) RESTORE_REGS addl $4, %esp CFI_ADJUST_CFA_OFFSET -4 @@ -556,7 +542,7 @@ work_pending: jz work_notifysig work_resched: call schedule - DISABLE_INTERRUPTS # make sure we don't miss an interrupt + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret TRACE_IRQS_OFF @@ -569,7 +555,8 @@ work_resched: work_notifysig: # deal with pending signals and # notify-resume requests - testl $VM_MASK, EFLAGS(%esp) +#ifdef CONFIG_VM86 + testl $VM_MASK, PT_EFLAGS(%esp) movl %esp, %eax jne work_notifysig_v86 # returning to kernel-space or # vm86-space @@ -579,29 +566,30 @@ work_notifysig: # deal with pending s ALIGN work_notifysig_v86: -#ifdef CONFIG_VM86 pushl %ecx # save ti_flags for do_notify_resume CFI_ADJUST_CFA_OFFSET 4 call save_v86_state # %eax contains pt_regs pointer popl %ecx CFI_ADJUST_CFA_OFFSET -4 movl %eax, %esp +#else + movl %esp, %eax +#endif xorl %edx, %edx call do_notify_resume jmp resume_userspace_sig -#endif # perform syscall exit tracing ALIGN syscall_trace_entry: - movl $-ENOSYS,EAX(%esp) + movl $-ENOSYS,PT_EAX(%esp) movl %esp, %eax xorl %edx,%edx call do_syscall_trace cmpl $0, %eax jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU, # so must skip actual syscall - movl ORIG_EAX(%esp), %eax + movl PT_ORIG_EAX(%esp), %eax cmpl $(nr_syscalls), %eax jnae syscall_call jmp syscall_exit @@ -612,7 +600,7 @@ syscall_exit_work: testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl jz work_pending TRACE_IRQS_ON - ENABLE_INTERRUPTS # could let do_syscall_trace() call + ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call # schedule() instead movl %esp, %eax movl $1, %edx @@ -626,40 +614,39 @@ syscall_fault: CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL GET_THREAD_INFO(%ebp) - movl $-EFAULT,EAX(%esp) + movl $-EFAULT,PT_EAX(%esp) jmp resume_userspace syscall_badsys: - movl $-ENOSYS,EAX(%esp) + movl $-ENOSYS,PT_EAX(%esp) jmp resume_userspace CFI_ENDPROC #ifndef CONFIG_XEN #define FIXUP_ESPFIX_STACK \ - movl %esp, %eax; \ - /* switch to 32bit stack using the pointer on top of 16bit stack */ \ - lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \ - /* copy data from 16bit stack to 32bit stack */ \ - call fixup_x86_bogus_stack; \ - /* put ESP to the proper location */ \ - movl %eax, %esp; -#define UNWIND_ESPFIX_STACK \ + /* since we are on a wrong stack, we cant make it a C code :( */ \ + movl %gs:PDA_cpu, %ebx; \ + PER_CPU(cpu_gdt_descr, %ebx); \ + movl GDS_address(%ebx), %ebx; \ + GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \ + addl %esp, %eax; \ + pushl $__KERNEL_DS; \ + CFI_ADJUST_CFA_OFFSET 4; \ pushl %eax; \ CFI_ADJUST_CFA_OFFSET 4; \ + lss (%esp), %esp; \ + CFI_ADJUST_CFA_OFFSET -8; +#define UNWIND_ESPFIX_STACK \ movl %ss, %eax; \ - /* see if on 16bit stack */ \ + /* see if on espfix stack */ \ cmpw $__ESPFIX_SS, %ax; \ - je 28f; \ -27: popl %eax; \ - CFI_ADJUST_CFA_OFFSET -4; \ -.section .fixup,"ax"; \ -28: movl $__KERNEL_DS, %eax; \ + jne 27f; \ + movl $__KERNEL_DS, %eax; \ movl 
%eax, %ds; \ movl %eax, %es; \ - /* switch to 32bit stack */ \ + /* switch to normal stack */ \ FIXUP_ESPFIX_STACK; \ - jmp 27b; \ -.previous +27:; /* * Build the entry stubs and pointer table with @@ -723,13 +710,16 @@ KPROBE_ENTRY(page_fault) CFI_ADJUST_CFA_OFFSET 4 ALIGN error_code: + /* the function address is in %gs's slot on the stack */ + pushl %es + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET es, 0*/ pushl %ds CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET ds, 0*/ pushl %eax CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET eax, 0 - xorl %eax, %eax pushl %ebp CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET ebp, 0 @@ -742,7 +732,6 @@ error_code: pushl %edx CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET edx, 0 - decl %eax # eax = -1 pushl %ecx CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET ecx, 0 @@ -750,18 +739,20 @@ error_code: CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET ebx, 0 cld - pushl %es + pushl %gs CFI_ADJUST_CFA_OFFSET 4 - /*CFI_REL_OFFSET es, 0*/ + /*CFI_REL_OFFSET gs, 0*/ + movl $(__KERNEL_PDA), %ecx + movl %ecx, %gs UNWIND_ESPFIX_STACK popl %ecx CFI_ADJUST_CFA_OFFSET -4 /*CFI_REGISTER es, ecx*/ - movl ES(%esp), %edi # get the function address - movl ORIG_EAX(%esp), %edx # get the error code - movl %eax, ORIG_EAX(%esp) - movl %ecx, ES(%esp) - /*CFI_REL_OFFSET es, ES*/ + movl PT_GS(%esp), %edi # get the function address + movl PT_ORIG_EAX(%esp), %edx # get the error code + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart + mov %ecx, PT_GS(%esp) + /*CFI_REL_OFFSET gs, ES*/ movl $(__USER_DS), %ecx movl %ecx, %ds movl %ecx, %es @@ -793,8 +784,8 @@ ENTRY(hypervisor_callback) pushl %eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL - movl CS(%esp),%ecx - movl EIP(%esp),%eax + movl PT_CS(%esp),%ecx + movl PT_EIP(%esp),%eax andl $SEGMENT_RPL_MASK,%ecx cmpl $USER_RPL,%ecx jae .Ldo_upcall @@ -808,7 +799,7 @@ ENTRY(hypervisor_callback) jb .Ldo_upcall cmpl $sysexit_ecrit,%eax ja .Ldo_upcall - addl $OLDESP,%esp # Remove eflags...ebx from stack frame. + addl $PT_OLDESP,%esp # Remove eflags...ebx from stack frame. #endif .Ldo_upcall: push %esp @@ -830,7 +821,7 @@ critical_region_fixup: movsbl critical_fixup_table-scrit(%eax),%ecx # %ecx contains num slots popped testl %ecx,%ecx leal (%esp,%ecx,4),%esi # %esi points at end of src region - leal OLDESP(%esp),%edi # %edi points at end of dst region + leal PT_OLDESP(%esp),%edi # %edi points at end of dst region jle 17f # skip loop if nothing to copy 16: subl $4,%esi # pre-decrementing copy loop subl $4,%edi @@ -853,8 +844,9 @@ critical_fixup_table: .byte 6 # pop %eax .byte 7 # pop %ds .byte 8 # pop %es - .byte 9,9,9 # add $4,%esp - .byte 10 # iret + .byte 9,9 # pop %gs + .byte 10,10,10 # add $4,%esp + .byte 11 # iret .byte -1,-1,-1,-1 # movb $1,1(%esi) = __DISABLE_INTERRUPTS .previous @@ -944,7 +936,7 @@ ENTRY(device_not_available) jmp ret_from_exception device_available_emulate: #endif - preempt_stop + preempt_stop(CLBR_ANY) call math_state_restore jmp ret_from_exception CFI_ENDPROC @@ -1014,7 +1006,7 @@ KPROBE_ENTRY(nmi) cmpw $__ESPFIX_SS, %ax popl %eax CFI_ADJUST_CFA_OFFSET -4 - je nmi_16bit_stack + je nmi_espfix_stack cmpl $sysenter_entry,(%esp) je nmi_stack_fixup pushl %eax @@ -1057,7 +1049,7 @@ nmi_debug_stack_check: FIX_STACK(24,nmi_stack_correct, 1) jmp nmi_stack_correct -nmi_16bit_stack: +nmi_espfix_stack: /* We have a RING0_INT_FRAME here. 
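 	 * (An NMI arrived while %ss was still __ESPFIX_SS, i.e. on the
 	 * espfix stack: FIXUP_ESPFIX_STACK below switches to the real
 	 * kernel stack for do_nmi, and the final lss switches back.)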
* * create the pointer to lss back @@ -1066,7 +1058,6 @@ nmi_16bit_stack: CFI_ADJUST_CFA_OFFSET 4 pushl %esp CFI_ADJUST_CFA_OFFSET 4 - movzwl %sp, %esp addw $4, (%esp) /* copy the iret frame of 12 bytes */ .rept 3 @@ -1077,11 +1068,11 @@ nmi_16bit_stack: CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL FIXUP_ESPFIX_STACK # %eax == %esp - CFI_ADJUST_CFA_OFFSET -20 # the frame has now moved xorl %edx,%edx # zero error code call do_nmi RESTORE_REGS - lss 12+4(%esp), %esp # back to 16bit stack + lss 12+4(%esp), %esp # back to espfix stack + CFI_ADJUST_CFA_OFFSET -24 1: INTERRUPT_RETURN CFI_ENDPROC .section __ex_table,"a" @@ -1097,12 +1088,25 @@ KPROBE_ENTRY(nmi) xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_nmi - orl $NMI_MASK, EFLAGS(%esp) + orl $NMI_MASK, PT_EFLAGS(%esp) jmp restore_all CFI_ENDPROC #endif KPROBE_END(nmi) +#ifdef CONFIG_PARAVIRT +ENTRY(native_iret) +1: iret +.section __ex_table,"a" + .align 4 + .long 1b,iret_exc +.previous + +ENTRY(native_irq_enable_sysexit) + sti + sysexit +#endif + KPROBE_ENTRY(int3) RING0_INT_FRAME pushl $-1 # mark this as an int @@ -1218,37 +1222,6 @@ ENTRY(spurious_interrupt_bug) CFI_ENDPROC #endif /* !CONFIG_XEN */ -#ifdef CONFIG_STACK_UNWIND -ENTRY(arch_unwind_init_running) - CFI_STARTPROC - movl 4(%esp), %edx - movl (%esp), %ecx - leal 4(%esp), %eax - movl %ebx, EBX(%edx) - xorl %ebx, %ebx - movl %ebx, ECX(%edx) - movl %ebx, EDX(%edx) - movl %esi, ESI(%edx) - movl %edi, EDI(%edx) - movl %ebp, EBP(%edx) - movl %ebx, EAX(%edx) - movl $__USER_DS, DS(%edx) - movl $__USER_DS, ES(%edx) - movl %ebx, ORIG_EAX(%edx) - movl %ecx, EIP(%edx) - movl 12(%esp), %ecx - movl $__KERNEL_CS, CS(%edx) - movl %ebx, EFLAGS(%edx) - movl %eax, OLDESP(%edx) - movl 8(%esp), %eax - movl %ecx, 8(%esp) - movl EBX(%edx), %ebx - movl $__KERNEL_DS, OLDSS(%edx) - jmpl *%eax - CFI_ENDPROC -ENDPROC(arch_unwind_init_running) -#endif - ENTRY(fixup_4gb_segment) RING0_EC_FRAME pushl $do_fixup_4gb_segment --- head-2011-02-17.orig/arch/x86/kernel/head_32-xen.S 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/arch/x86/kernel/head_32-xen.S 2011-01-31 17:32:16.000000000 +0100 @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -35,6 +36,8 @@ ENTRY(startup_32) /* Set up the stack pointer */ movl $(init_thread_union+THREAD_SIZE),%esp + call setup_pda + /* get vendor info */ xorl %eax,%eax # call CPUID with 0 -> return vendor ID XEN_CPUID @@ -57,14 +60,58 @@ ENTRY(startup_32) movb $1,X86_HARD_MATH - xorl %eax,%eax # Clear FS/GS and LDT + xorl %eax,%eax # Clear FS movl %eax,%fs - movl %eax,%gs + + movl $(__KERNEL_PDA),%eax + mov %eax,%gs + cld # gcc2 wants the direction flag cleared at all times pushl $0 # fake return address for unwinder jmp start_kernel +/* + * Point the GDT at this CPU's PDA. This will be + * cpu_gdt_table and boot_pda. + */ +setup_pda: + /* get the PDA pointer */ + movl $boot_pda, %eax + + /* slot the PDA address into the GDT */ + mov $cpu_gdt_table, %ecx + mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */ + shr $16, %eax + mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */ + mov %ah, (__KERNEL_PDA+4+3)(%ecx) /* base & 0xff000000 */ + + # %esi still points to start_info, and no registers + # need to be preserved. 
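+	# (What follows: look up the machine frame of cpu_gdt_table in
+	# the mfn_list, remap that page read-only via
+	# __HYPERVISOR_update_va_mapping - Xen refuses to install a GDT
+	# living in a writable page - then register it with
+	# __HYPERVISOR_set_gdt.)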
+ + movl XEN_START_mfn_list(%esi), %ebx + movl $(cpu_gdt_table - __PAGE_OFFSET), %eax + shrl $PAGE_SHIFT, %eax + movl (%ebx,%eax,4), %ecx + pushl %ecx # frame number for set_gdt below + + xorl %esi, %esi + xorl %edx, %edx + shldl $PAGE_SHIFT, %ecx, %edx + shll $PAGE_SHIFT, %ecx + orl $0x61, %ecx + movl $cpu_gdt_table, %ebx + movl $__HYPERVISOR_update_va_mapping, %eax + int $0x82 + + movl $(PAGE_SIZE_asm / 8), %ecx + movl %esp, %ebx + movl $__HYPERVISOR_set_gdt, %eax + int $0x82 + + popl %ecx + ret + #define HYPERCALL_PAGE_OFFSET 0x1000 .org HYPERCALL_PAGE_OFFSET ENTRY(hypercall_page) @@ -93,7 +140,8 @@ ENTRY(empty_zero_page) /* * The Global Descriptor Table contains 28 quadwords, per-CPU. */ - .align L1_CACHE_BYTES + .section .data.page_aligned, "aw" + .align PAGE_SIZE_asm ENTRY(cpu_gdt_table) .quad 0x0000000000000000 /* NULL descriptor */ .quad 0x0000000000000000 /* 0x0b reserved */ @@ -135,12 +183,13 @@ ENTRY(cpu_gdt_table) .quad 0x0000000000000000 /* 0xc0 APM CS 16 code (16 bit) */ .quad 0x0000000000000000 /* 0xc8 APM DS data */ - .quad 0x0000000000000000 /* 0xd0 - ESPFIX 16-bit SS */ - .quad 0x0000000000000000 /* 0xd8 - unused */ + .quad 0x0000000000000000 /* 0xd0 - ESPFIX SS */ + .quad 0x00cf92000000ffff /* 0xd8 - PDA */ .quad 0x0000000000000000 /* 0xe0 - unused */ .quad 0x0000000000000000 /* 0xe8 - unused */ .quad 0x0000000000000000 /* 0xf0 - unused */ .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */ + .align PAGE_SIZE_asm #if CONFIG_XEN_COMPAT <= 0x030002 /* @@ -165,9 +214,9 @@ ENTRY(cpu_gdt_table) .ascii ",ELF_PADDR_OFFSET=0x" utoa __PAGE_OFFSET .ascii ",VIRT_ENTRY=0x" - utoa (__PAGE_OFFSET + __PHYSICAL_START + VIRT_ENTRY_OFFSET) + utoa (__PAGE_OFFSET + LOAD_PHYSICAL_ADDR + VIRT_ENTRY_OFFSET) .ascii ",HYPERCALL_PAGE=0x" - utoa ((__PHYSICAL_START+HYPERCALL_PAGE_OFFSET)>>PAGE_SHIFT) + utoa ((LOAD_PHYSICAL_ADDR+HYPERCALL_PAGE_OFFSET)>>PAGE_SHIFT) .ascii ",FEATURES=writable_page_tables" .ascii "|writable_descriptor_tables" .ascii "|auto_translated_physmap" --- head-2011-02-17.orig/arch/x86/kernel/io_apic_32-xen.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/arch/x86/kernel/io_apic_32-xen.c 2011-01-31 17:32:16.000000000 +0100 @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -199,14 +200,20 @@ static struct IO_APIC_route_entry ioapic * the interrupt, and we need to make sure the entry is fully populated * before that happens. 
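
The next hunk splits that write into the usual lock-factoring pair: a bare __ioapic_write_entry() that assumes ioapic_lock is held and preserves the ordering (high word at 0x11 + 2*pin first, so a half-written entry can never be seen unmasked), and an ioapic_write_entry() wrapper that takes the lock for standalone callers. The shape of the pattern, with illustrative demo_* names:

static DEFINE_SPINLOCK(demo_ioapic_lock);

/* Caller holds demo_ioapic_lock; write word 1 before word 0 so the
 * entry is fully populated before it can take an interrupt. */
static void __demo_write_entry(int apic, int pin, u32 w1, u32 w2)
{
	io_apic_write(apic, 0x11 + 2 * pin, w2);
	io_apic_write(apic, 0x10 + 2 * pin, w1);
}

static void demo_write_entry(int apic, int pin, u32 w1, u32 w2)
{
	unsigned long flags;

	spin_lock_irqsave(&demo_ioapic_lock, flags);
	__demo_write_entry(apic, pin, w1, w2);
	spin_unlock_irqrestore(&demo_ioapic_lock, flags);
}
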
*/ -static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +static void +__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { - unsigned long flags; union entry_union eu; eu.entry = e; - spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic, 0x11 + 2*pin, eu.w2); io_apic_write(apic, 0x10 + 2*pin, eu.w1); +} + +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +{ + unsigned long flags; + spin_lock_irqsave(&ioapic_lock, flags); + __ioapic_write_entry(apic, pin, e); spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -889,8 +896,7 @@ static int __init find_isa_irq_pin(int i if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || mp_bus_id_to_type[lbus] == MP_BUS_EISA || - mp_bus_id_to_type[lbus] == MP_BUS_MCA || - mp_bus_id_to_type[lbus] == MP_BUS_NEC98 + mp_bus_id_to_type[lbus] == MP_BUS_MCA ) && (mp_irqs[i].mpc_irqtype == type) && (mp_irqs[i].mpc_srcbusirq == irq)) @@ -909,8 +915,7 @@ static int __init find_isa_irq_apic(int if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || mp_bus_id_to_type[lbus] == MP_BUS_EISA || - mp_bus_id_to_type[lbus] == MP_BUS_MCA || - mp_bus_id_to_type[lbus] == MP_BUS_NEC98 + mp_bus_id_to_type[lbus] == MP_BUS_MCA ) && (mp_irqs[i].mpc_irqtype == type) && (mp_irqs[i].mpc_srcbusirq == irq)) @@ -1043,12 +1048,6 @@ static int EISA_ELCR(unsigned int irq) #define default_MCA_trigger(idx) (1) #define default_MCA_polarity(idx) (0) -/* NEC98 interrupts are always polarity zero edge triggered, - * when listed as conforming in the MP table. */ - -#define default_NEC98_trigger(idx) (0) -#define default_NEC98_polarity(idx) (0) - static int __init MPBIOS_polarity(int idx) { int bus = mp_irqs[idx].mpc_srcbus; @@ -1083,11 +1082,6 @@ static int __init MPBIOS_polarity(int id polarity = default_MCA_polarity(idx); break; } - case MP_BUS_NEC98: /* NEC 98 pin */ - { - polarity = default_NEC98_polarity(idx); - break; - } default: { printk(KERN_WARNING "broken BIOS!!\n"); @@ -1157,11 +1151,6 @@ static int MPBIOS_trigger(int idx) trigger = default_MCA_trigger(idx); break; } - case MP_BUS_NEC98: /* NEC 98 pin */ - { - trigger = default_NEC98_trigger(idx); - break; - } default: { printk(KERN_WARNING "broken BIOS!!\n"); @@ -1223,7 +1212,6 @@ static int pin_2_irq(int idx, int apic, case MP_BUS_ISA: /* ISA pin */ case MP_BUS_EISA: case MP_BUS_MCA: - case MP_BUS_NEC98: { irq = mp_irqs[idx].mpc_srcbusirq; break; @@ -1291,7 +1279,7 @@ static inline int IO_APIC_irq_trigger(in } /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ -u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; /* = { FIRST_DEVICE_VECTOR , 0 }; */ +static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; /* = { FIRST_DEVICE_VECTOR , 0 }; */ static int __assign_irq_vector(int irq) { @@ -1417,8 +1405,8 @@ static void __init setup_IO_APIC_irqs(vo if (!apic && (irq < 16)) disable_8259A_irq(irq); } - ioapic_write_entry(apic, pin, entry); spin_lock_irqsave(&ioapic_lock, flags); + __ioapic_write_entry(apic, pin, entry); set_native_irq_info(irq, TARGET_CPUS); spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -1988,6 +1976,15 @@ static void __init setup_ioapic_ids_from #endif #ifndef CONFIG_XEN +static int no_timer_check __initdata; + +static int __init notimercheck(char *s) +{ + no_timer_check = 1; + return 1; +} +__setup("no_timer_check", notimercheck); + /* * There is a nasty bug in some older SMP boards, their mptable lies * about the timer IRQ. 
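
The no_timer_check knob added above follows the standard __setup() pattern: the callback runs during early parameter parsing, flips an __initdata flag, and returns 1 to mark the option consumed. A minimal instance of the same pattern (demo_* names are illustrative):

static int demo_flag __initdata;

static int __init demo_setup(char *s)
{
	demo_flag = 1;
	return 1;	/* option handled, do not pass it to init */
}
__setup("demo_option", demo_setup);
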
We do the following to work around the situation: @@ -1996,10 +1993,13 @@ static void __init setup_ioapic_ids_from * - if this function detects that timer IRQs are defunct, then we fall * back to ISA timer IRQs */ -static int __init timer_irq_works(void) +int __init timer_irq_works(void) { unsigned long t1 = jiffies; + if (no_timer_check) + return 1; + local_irq_enable(); /* Let ten ticks pass... */ mdelay((10 * 1000) / HZ); @@ -2226,9 +2226,15 @@ static inline void unlock_ExtINT_logic(v unsigned char save_control, save_freq_select; pin = find_isa_irq_pin(8, mp_INT); + if (pin == -1) { + WARN_ON_ONCE(1); + return; + } apic = find_isa_irq_apic(8, mp_INT); - if (pin == -1) + if (apic == -1) { + WARN_ON_ONCE(1); return; + } entry0 = ioapic_read_entry(apic, pin); clear_IO_APIC_pin(apic, pin); @@ -2273,7 +2279,7 @@ int timer_uses_ioapic_pin_0; * is so screwy. Thanks to Brian Perkins for testing/hacking this beast * fanatically on his truly buggy board. */ -static inline void check_timer(void) +static inline void __init check_timer(void) { int apic1, pin1, apic2, pin2; int vector; @@ -2558,7 +2564,7 @@ device_initcall(ioapic_init_sysfs); int create_irq(void) { /* Allocate an unused irq */ - int irq, new, vector; + int irq, new, vector = 0; unsigned long flags; irq = -ENOSPC; @@ -2939,8 +2945,8 @@ int io_apic_set_pci_routing (int ioapic, if (!ioapic && (irq < 16)) disable_8259A_irq(irq); - ioapic_write_entry(ioapic, pin, entry); spin_lock_irqsave(&ioapic_lock, flags); + __ioapic_write_entry(ioapic, pin, entry); set_native_irq_info(irq, TARGET_CPUS); spin_unlock_irqrestore(&ioapic_lock, flags); --- head-2011-02-17.orig/arch/x86/kernel/ldt_32-xen.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/arch/x86/kernel/ldt_32-xen.c 2011-01-31 17:32:16.000000000 +0100 @@ -177,16 +177,14 @@ static int read_default_ldt(void __user { int err; unsigned long size; - void *address; err = 0; - address = &default_ldt[0]; size = 5*sizeof(struct desc_struct); if (size > bytecount) size = bytecount; err = size; - if (copy_to_user(ptr, address, size)) + if (clear_user(ptr, size)) err = -EFAULT; return err; --- head-2011-02-17.orig/arch/x86/kernel/microcode-xen.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/arch/x86/kernel/microcode-xen.c 2011-01-31 17:32:16.000000000 +0100 @@ -1,7 +1,7 @@ /* * Intel CPU Microcode Update Driver for Linux * - * Copyright (C) 2000-2004 Tigran Aivazian + * Copyright (C) 2000-2006 Tigran Aivazian * 2006 Shaohua Li * * This driver allows to upgrade microcode on Intel processors @@ -43,7 +43,7 @@ #include MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver"); -MODULE_AUTHOR("Tigran Aivazian "); +MODULE_AUTHOR("Tigran Aivazian "); MODULE_LICENSE("GPL"); static int verbose; @@ -195,7 +195,7 @@ static int __init microcode_init (void) request_microcode(); printk(KERN_INFO - "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " \n"); + "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " \n"); return 0; } --- head-2011-02-17.orig/arch/x86/kernel/mpparse_32-xen.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/arch/x86/kernel/mpparse_32-xen.c 2011-01-31 17:32:16.000000000 +0100 @@ -36,7 +36,7 @@ /* Have we found an MP table */ int smp_found_config; -unsigned int __initdata maxcpus = NR_CPUS; +unsigned int __cpuinitdata maxcpus = NR_CPUS; /* * Various Linux-internal data structures created from the @@ -102,10 +102,10 @@ static int __init mpf_checksum(unsigned */ static int mpc_record; -static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] 
__initdata; +static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __cpuinitdata; #ifndef CONFIG_XEN -static void __devinit MP_processor_info (struct mpc_config_processor *m) +static void __cpuinit MP_processor_info (struct mpc_config_processor *m) { int ver, apicid; physid_mask_t phys_cpu; @@ -221,7 +221,7 @@ static void __devinit MP_processor_info bios_cpu_apicid[num_processors - 1] = m->mpc_apicid; } #else -void __init MP_processor_info (struct mpc_config_processor *m) +static void __cpuinit MP_processor_info (struct mpc_config_processor *m) { num_processors++; } @@ -256,8 +256,6 @@ static void __init MP_bus_info (struct m mp_current_pci_id++; } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) { mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; - } else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98; } else { printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); } @@ -842,7 +840,7 @@ void __init mp_register_lapic_address(u6 #endif } -void __devinit mp_register_lapic (u8 id, u8 enabled) +void __cpuinit mp_register_lapic (u8 id, u8 enabled) { struct mpc_config_processor processor; int boot_cpu = 0; --- head-2011-02-17.orig/arch/x86/kernel/pci-dma-xen.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/arch/x86/kernel/pci-dma-xen.c 2011-01-31 17:32:16.000000000 +0100 @@ -273,7 +273,7 @@ EXPORT_SYMBOL(dma_free_coherent); int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, dma_addr_t device_addr, size_t size, int flags) { - void __iomem *mem_base; + void __iomem *mem_base = NULL; int pages = size >> PAGE_SHIFT; int bitmap_size = (pages + 31)/32; @@ -290,14 +290,12 @@ int dma_declare_coherent_memory(struct d if (!mem_base) goto out; - dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); + dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); if (!dev->dma_mem) goto out; - memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem)); - dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL); + dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); if (!dev->dma_mem->bitmap) goto free1_out; - memset(dev->dma_mem->bitmap, 0, bitmap_size); dev->dma_mem->virt_base = mem_base; dev->dma_mem->device_base = device_addr; @@ -312,6 +310,8 @@ int dma_declare_coherent_memory(struct d free1_out: kfree(dev->dma_mem->bitmap); out: + if (mem_base) + iounmap(mem_base); return 0; } EXPORT_SYMBOL(dma_declare_coherent_memory); --- head-2011-02-17.orig/arch/x86/kernel/process_32-xen.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/arch/x86/kernel/process_32-xen.c 2011-01-31 17:32:16.000000000 +0100 @@ -60,6 +60,7 @@ #include #include +#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -104,28 +105,24 @@ EXPORT_SYMBOL(enable_hlt); */ static void poll_idle (void) { - local_irq_enable(); - - asm volatile( - "2:" - "testl %0, %1;" - "rep; nop;" - "je 2b;" - : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags)); + cpu_relax(); } static void xen_idle(void) { - local_irq_disable(); + current_thread_info()->status &= ~TS_POLLING; + /* + * TS_POLLING-cleared state must be visible before we + * test NEED_RESCHED: + */ + smp_mb(); - if (need_resched()) + local_irq_disable(); + if (!need_resched()) + safe_halt(); /* enables interrupts racelessly */ + else local_irq_enable(); - else { - current_thread_info()->status &= ~TS_POLLING; - smp_mb__after_clear_bit(); - safe_halt(); - current_thread_info()->status |= TS_POLLING; - } + 
current_thread_info()->status |= TS_POLLING; } #ifdef CONFIG_APM_MODULE EXPORT_SYMBOL(default_idle); @@ -250,8 +247,8 @@ void show_regs(struct pt_regs * regs) regs->eax,regs->ebx,regs->ecx,regs->edx); printk("ESI: %08lx EDI: %08lx EBP: %08lx", regs->esi, regs->edi, regs->ebp); - printk(" DS: %04x ES: %04x\n", - 0xffff & regs->xds,0xffff & regs->xes); + printk(" DS: %04x ES: %04x GS: %04x\n", + 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xgs); cr0 = read_cr0(); cr2 = read_cr2(); @@ -282,6 +279,7 @@ int kernel_thread(int (*fn)(void *), voi regs.xds = __USER_DS; regs.xes = __USER_DS; + regs.xgs = __KERNEL_PDA; regs.orig_eax = -1; regs.eip = (unsigned long) kernel_thread_helper; regs.xcs = __KERNEL_CS | get_kernel_rpl(); @@ -359,7 +357,6 @@ int copy_thread(int nr, unsigned long cl p->thread.eip = (unsigned long) ret_from_fork; savesegment(fs,p->thread.fs); - savesegment(gs,p->thread.gs); tsk = current; if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { @@ -438,7 +435,7 @@ void dump_thread(struct pt_regs * regs, dump->regs.ds = regs->xds; dump->regs.es = regs->xes; savesegment(fs,dump->regs.fs); - savesegment(gs,dump->regs.gs); + dump->regs.gs = regs->xgs; dump->regs.orig_eax = regs->orig_eax; dump->regs.eip = regs->eip; dump->regs.cs = regs->xcs; @@ -635,17 +632,19 @@ struct task_struct fastcall * __switch_t if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL))) BUG(); + /* we're going to use this soon, after a few expensive things */ + if (next_p->fpu_counter > 5) + prefetch(&next->i387.fxsave); + /* - * Restore %fs and %gs if needed. + * Restore %fs if needed. * - * Glibc normally makes %fs be zero, and %gs is one of - * the TLS segments. + * Glibc normally makes %fs be zero. */ if (unlikely(next->fs)) loadsegment(fs, next->fs); - if (next->gs) - loadsegment(gs, next->gs); + write_pda(pcurrent, next_p); /* * Now maybe handle debug registers @@ -655,6 +654,13 @@ struct task_struct fastcall * __switch_t disable_tsc(prev_p, next_p); + /* If the task has used fpu the last 5 timeslices, just do a full + * restore of the math state immediately to avoid the trap; the + * chances of needing FPU soon are obviously high now + */ + if (next_p->fpu_counter > 5) + math_state_restore(); + return prev_p; } --- head-2011-02-17.orig/arch/x86/kernel/setup_32-xen.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/arch/x86/kernel/setup_32-xen.c 2011-01-31 17:32:16.000000000 +0100 @@ -76,9 +76,6 @@ #include #endif -/* Forward Declaration. */ -void __init find_max_pfn(void); - static int xen_panic_event(struct notifier_block *, unsigned long, void *); static struct notifier_block xen_panic_block = { xen_panic_event, NULL, 0 /* try to go last */ @@ -89,14 +86,11 @@ int disable_pse __devinitdata = 0; /* * Machine setup.. 
*/ - -#ifdef CONFIG_EFI -int efi_enabled = 0; -EXPORT_SYMBOL(efi_enabled); -#endif +extern struct resource code_resource; +extern struct resource data_resource; /* cpu data as detected by the assembly code in head.S */ -struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; +struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; /* common cpu data for all cpus */ struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; EXPORT_SYMBOL(boot_cpu_data); @@ -112,12 +106,6 @@ unsigned int machine_submodel_id; unsigned int BIOS_revision; unsigned int mca_pentium_flag; -/* For PCI or other memory-mapped resources */ -unsigned long pci_mem_start = 0x10000000; -#ifdef CONFIG_PCI -EXPORT_SYMBOL(pci_mem_start); -#endif - /* Boot loader ID as an integer, for the benefit of proc_dointvec */ int bootloader_type; @@ -150,10 +138,6 @@ struct ist_info ist_info; defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE) EXPORT_SYMBOL(ist_info); #endif -struct e820map e820; -#ifdef CONFIG_XEN -struct e820map machine_e820; -#endif extern void early_cpu_init(void); extern int root_mountflags; @@ -168,209 +152,6 @@ static char command_line[COMMAND_LINE_SI unsigned char __initdata boot_params[PARAM_SIZE]; -static struct resource data_resource = { - .name = "Kernel data", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -static struct resource code_resource = { - .name = "Kernel code", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -static struct resource system_rom_resource = { - .name = "System ROM", - .start = 0xf0000, - .end = 0xfffff, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}; - -static struct resource extension_rom_resource = { - .name = "Extension ROM", - .start = 0xe0000, - .end = 0xeffff, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}; - -static struct resource adapter_rom_resources[] = { { - .name = "Adapter ROM", - .start = 0xc8000, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -} }; - -static struct resource video_rom_resource = { - .name = "Video ROM", - .start = 0xc0000, - .end = 0xc7fff, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}; - -static struct resource video_ram_resource = { - .name = "Video RAM area", - .start = 0xa0000, - .end = 0xbffff, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -static struct resource standard_io_resources[] = { { - .name = "dma1", - .start = 0x0000, - .end = 0x001f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "pic1", - .start = 0x0020, - .end = 0x0021, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "timer0", - .start = 0x0040, - .end = 0x0043, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "timer1", - .start = 0x0050, - .end = 0x0053, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "keyboard", - 
.start = 0x0060, - .end = 0x006f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "dma page reg", - .start = 0x0080, - .end = 0x008f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "pic2", - .start = 0x00a0, - .end = 0x00a1, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "dma2", - .start = 0x00c0, - .end = 0x00df, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "fpu", - .start = 0x00f0, - .end = 0x00ff, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -} }; - -#define romsignature(x) (*(unsigned short *)(x) == 0xaa55) - -static int __init romchecksum(unsigned char *rom, unsigned long length) -{ - unsigned char *p, sum = 0; - - for (p = rom; p < rom + length; p++) - sum += *p; - return sum == 0; -} - -static void __init probe_roms(void) -{ - unsigned long start, length, upper; - unsigned char *rom; - int i; - -#ifdef CONFIG_XEN - /* Nothing to do if not running in dom0. */ - if (!is_initial_xendomain()) - return; -#endif - - /* video rom */ - upper = adapter_rom_resources[0].start; - for (start = video_rom_resource.start; start < upper; start += 2048) { - rom = isa_bus_to_virt(start); - if (!romsignature(rom)) - continue; - - video_rom_resource.start = start; - - /* 0 < length <= 0x7f * 512, historically */ - length = rom[2] * 512; - - /* if checksum okay, trust length byte */ - if (length && romchecksum(rom, length)) - video_rom_resource.end = start + length - 1; - - request_resource(&iomem_resource, &video_rom_resource); - break; - } - - start = (video_rom_resource.end + 1 + 2047) & ~2047UL; - if (start < upper) - start = upper; - - /* system rom */ - request_resource(&iomem_resource, &system_rom_resource); - upper = system_rom_resource.start; - - /* check for extension rom (ignore length byte!) */ - rom = isa_bus_to_virt(extension_rom_resource.start); - if (romsignature(rom)) { - length = extension_rom_resource.end - extension_rom_resource.start + 1; - if (romchecksum(rom, length)) { - request_resource(&iomem_resource, &extension_rom_resource); - upper = extension_rom_resource.start; - } - } - - /* check for adapter roms on 2k boundaries */ - for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) { - rom = isa_bus_to_virt(start); - if (!romsignature(rom)) - continue; - - /* 0 < length <= 0x7f * 512, historically */ - length = rom[2] * 512; - - /* but accept any length that fits if checksum okay */ - if (!length || start + length > upper || !romchecksum(rom, length)) - continue; - - adapter_rom_resources[i].start = start; - adapter_rom_resources[i].end = start + length - 1; - request_resource(&iomem_resource, &adapter_rom_resources[i]); - - start = adapter_rom_resources[i++].end & ~2047UL; - } -} - /* * Point at the empty zero page to start with. We map the real shared_info * page as soon as fixmap is up and running. @@ -386,353 +167,6 @@ EXPORT_SYMBOL(phys_to_machine_mapping); start_info_t *xen_start_info; EXPORT_SYMBOL(xen_start_info); -void __init add_memory_region(unsigned long long start, - unsigned long long size, int type) -{ - int x; - - if (!efi_enabled) { - x = e820.nr_map; - - if (x == E820MAX) { - printk(KERN_ERR "Ooops! 
Too many entries in the memory map!\n"); - return; - } - - e820.map[x].addr = start; - e820.map[x].size = size; - e820.map[x].type = type; - e820.nr_map++; - } -} /* add_memory_region */ - -static void __init limit_regions(unsigned long long size) -{ - unsigned long long current_addr = 0; - int i; - - if (efi_enabled) { - efi_memory_desc_t *md; - void *p; - - for (p = memmap.map, i = 0; p < memmap.map_end; - p += memmap.desc_size, i++) { - md = p; - current_addr = md->phys_addr + (md->num_pages << 12); - if (md->type == EFI_CONVENTIONAL_MEMORY) { - if (current_addr >= size) { - md->num_pages -= - (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT); - memmap.nr_map = i + 1; - return; - } - } - } - } - for (i = 0; i < e820.nr_map; i++) { - current_addr = e820.map[i].addr + e820.map[i].size; - if (current_addr < size) - continue; - - if (e820.map[i].type != E820_RAM) - continue; - - if (e820.map[i].addr >= size) { - /* - * This region starts past the end of the - * requested size, skip it completely. - */ - e820.nr_map = i; - } else { - e820.nr_map = i + 1; - e820.map[i].size -= current_addr - size; - } - return; - } -#ifdef CONFIG_XEN - if (i==e820.nr_map && current_addr < size) { - /* - * The e820 map finished before our requested size so - * extend the final entry to the requested address. - */ - --i; - if (e820.map[i].type == E820_RAM) - e820.map[i].size -= current_addr - size; - else - add_memory_region(current_addr, size - current_addr, E820_RAM); - } -#endif -} - -#define E820_DEBUG 1 - -static void __init print_memory_map(char *who) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - printk(" %s: %016Lx - %016Lx ", who, - e820.map[i].addr, - e820.map[i].addr + e820.map[i].size); - switch (e820.map[i].type) { - case E820_RAM: printk("(usable)\n"); - break; - case E820_RESERVED: - printk("(reserved)\n"); - break; - case E820_ACPI: - printk("(ACPI data)\n"); - break; - case E820_NVS: - printk("(ACPI NVS)\n"); - break; - default: printk("type %lu\n", e820.map[i].type); - break; - } - } -} - -/* - * Sanitize the BIOS e820 map. - * - * Some e820 responses include overlapping entries. The following - * replaces the original e820 map with a new one, removing overlaps. - * - */ -struct change_member { - struct e820entry *pbios; /* pointer to original bios entry */ - unsigned long long addr; /* address for this change point */ -}; -static struct change_member change_point_list[2*E820MAX] __initdata; -static struct change_member *change_point[2*E820MAX] __initdata; -static struct e820entry *overlap_list[E820MAX] __initdata; -static struct e820entry new_bios[E820MAX] __initdata; - -int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) -{ - struct change_member *change_tmp; - unsigned long current_type, last_type; - unsigned long long last_addr; - int chgidx, still_changing; - int overlap_entries; - int new_bios_entry; - int old_nr, new_nr, chg_nr; - int i; - - /* - Visually we're performing the following (1,2,3,4 = memory types)... 
- - Sample memory map (w/overlaps): - ____22__________________ - ______________________4_ - ____1111________________ - _44_____________________ - 11111111________________ - ____________________33__ - ___________44___________ - __________33333_________ - ______________22________ - ___________________2222_ - _________111111111______ - _____________________11_ - _________________4______ - - Sanitized equivalent (no overlap): - 1_______________________ - _44_____________________ - ___1____________________ - ____22__________________ - ______11________________ - _________1______________ - __________3_____________ - ___________44___________ - _____________33_________ - _______________2________ - ________________1_______ - _________________4______ - ___________________2____ - ____________________33__ - ______________________4_ - */ - - /* if there's only one memory region, don't bother */ - if (*pnr_map < 2) - return -1; - - old_nr = *pnr_map; - - /* bail out if we find any unreasonable addresses in bios map */ - for (i=0; iaddr = biosmap[i].addr; - change_point[chgidx++]->pbios = &biosmap[i]; - change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; - change_point[chgidx++]->pbios = &biosmap[i]; - } - } - chg_nr = chgidx; /* true number of change-points */ - - /* sort change-point list by memory addresses (low -> high) */ - still_changing = 1; - while (still_changing) { - still_changing = 0; - for (i=1; i < chg_nr; i++) { - /* if > , swap */ - /* or, if current= & last=, swap */ - if ((change_point[i]->addr < change_point[i-1]->addr) || - ((change_point[i]->addr == change_point[i-1]->addr) && - (change_point[i]->addr == change_point[i]->pbios->addr) && - (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) - ) - { - change_tmp = change_point[i]; - change_point[i] = change_point[i-1]; - change_point[i-1] = change_tmp; - still_changing=1; - } - } - } - - /* create a new bios memory map, removing overlaps */ - overlap_entries=0; /* number of entries in the overlap table */ - new_bios_entry=0; /* index for creating new bios map entries */ - last_type = 0; /* start with undefined memory type */ - last_addr = 0; /* start with 0 as last starting address */ - /* loop through change-points, determining affect on the new bios map */ - for (chgidx=0; chgidx < chg_nr; chgidx++) - { - /* keep track of all overlapping bios entries */ - if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) - { - /* add map entry to overlap list (> 1 entry implies an overlap) */ - overlap_list[overlap_entries++]=change_point[chgidx]->pbios; - } - else - { - /* remove entry from list (order independent, so swap with last) */ - for (i=0; ipbios) - overlap_list[i] = overlap_list[overlap_entries-1]; - } - overlap_entries--; - } - /* if there are overlapping entries, decide which "type" to use */ - /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ - current_type = 0; - for (i=0; itype > current_type) - current_type = overlap_list[i]->type; - /* continue building up new bios map based on this information */ - if (current_type != last_type) { - if (last_type != 0) { - new_bios[new_bios_entry].size = - change_point[chgidx]->addr - last_addr; - /* move forward only if the new size was non-zero */ - if (new_bios[new_bios_entry].size != 0) - if (++new_bios_entry >= E820MAX) - break; /* no more space left for new bios entries */ - } - if (current_type != 0) { - new_bios[new_bios_entry].addr = change_point[chgidx]->addr; - new_bios[new_bios_entry].type = current_type; - 
last_addr=change_point[chgidx]->addr; - } - last_type = current_type; - } - } - new_nr = new_bios_entry; /* retain count for new bios entries */ - - /* copy new bios mapping into original location */ - memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); - *pnr_map = new_nr; - - return 0; -} - -/* - * Copy the BIOS e820 map into a safe place. - * - * Sanity-check it while we're at it.. - * - * If we're lucky and live on a modern system, the setup code - * will have given us a memory map that we can use to properly - * set up memory. If we aren't, we'll fake a memory map. - * - * We check to see that the memory map contains at least 2 elements - * before we'll use it, because the detection code in setup.S may - * not be perfect and most every PC known to man has two memory - * regions: one from 0 to 640k, and one from 1mb up. (The IBM - * thinkpad 560x, for example, does not cooperate with the memory - * detection code.) - */ -int __init copy_e820_map(struct e820entry * biosmap, int nr_map) -{ -#ifndef CONFIG_XEN - /* Only one memory region (or negative)? Ignore it */ - if (nr_map < 2) - return -1; -#else - BUG_ON(nr_map < 1); -#endif - - do { - unsigned long long start = biosmap->addr; - unsigned long long size = biosmap->size; - unsigned long long end = start + size; - unsigned long type = biosmap->type; - - /* Overflow in 64 bits? Ignore the memory map. */ - if (start > end) - return -1; - -#ifndef CONFIG_XEN - /* - * Some BIOSes claim RAM in the 640k - 1M region. - * Not right. Fix it up. - */ - if (type == E820_RAM) { - if (start < 0x100000ULL && end > 0xA0000ULL) { - if (start < 0xA0000ULL) - add_memory_region(start, 0xA0000ULL-start, type); - if (end <= 0x100000ULL) - continue; - start = 0x100000ULL; - size = end - start; - } - } -#endif - add_memory_region(start, size, type); - } while (biosmap++,--nr_map); - -#ifdef CONFIG_XEN - if (is_initial_xendomain()) { - struct xen_memory_map memmap; - - memmap.nr_entries = E820MAX; - set_xen_guest_handle(memmap.buffer, machine_e820.map); - - if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap)) - BUG(); - machine_e820.nr_map = memmap.nr_entries; - } else - machine_e820 = e820; -#endif - - return 0; -} - #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) struct edd edd; #ifdef CONFIG_EDD_MODULE @@ -758,7 +192,7 @@ static inline void copy_edd(void) } #endif -static int __initdata user_defined_memmap = 0; +int __initdata user_defined_memmap = 0; /* * "mem=nopentium" disables the 4MB page tables. @@ -795,51 +229,6 @@ static int __init parse_mem(char *arg) } early_param("mem", parse_mem); -static int __init parse_memmap(char *arg) -{ - if (!arg) - return -EINVAL; - - if (strcmp(arg, "exactmap") == 0) { -#ifdef CONFIG_CRASH_DUMP - /* If we are doing a crash dump, we - * still need to know the real mem - * size before original memory map is - * reset. - */ - find_max_pfn(); - saved_max_pfn = max_pfn; -#endif - e820.nr_map = 0; - user_defined_memmap = 1; - } else { - /* If the user specifies memory size, we - * limit the BIOS-provided memory map to - * that size. exactmap can be used to specify - * the exact map. mem=number can be used to - * trim the existing memory map. 
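
memparse() does the heavy lifting in that parser: it understands the K/M/G suffixes and advances the cursor, so a one-character lookahead is enough to pick the region type for memmap=SIZE@ADDR (RAM), #ADDR (ACPI data) and $ADDR (reserved). In outline (a sketch reusing this file's add_memory_region() and limit_regions() helpers):

static int __init demo_parse_memmap(char *arg)
{
	unsigned long long start, size;

	size = memparse(arg, &arg);	/* "64M@16M" -> size, arg -> "@16M" */
	if (*arg == '@') {
		start = memparse(arg + 1, &arg);
		add_memory_region(start, size, E820_RAM);
	} else if (*arg == '#') {
		start = memparse(arg + 1, &arg);
		add_memory_region(start, size, E820_ACPI);
	} else if (*arg == '$') {
		start = memparse(arg + 1, &arg);
		add_memory_region(start, size, E820_RESERVED);
	} else {
		limit_regions(size);	/* plain size: trim the map */
	}
	return 0;
}
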
- */ - unsigned long long start_at, mem_size; - - mem_size = memparse(arg, &arg); - if (*arg == '@') { - start_at = memparse(arg+1, &arg); - add_memory_region(start_at, mem_size, E820_RAM); - } else if (*arg == '#') { - start_at = memparse(arg+1, &arg); - add_memory_region(start_at, mem_size, E820_ACPI); - } else if (*arg == '$') { - start_at = memparse(arg+1, &arg); - add_memory_region(start_at, mem_size, E820_RESERVED); - } else { - limit_regions(mem_size); - user_defined_memmap = 1; - } - } - return 0; -} -early_param("memmap", parse_memmap); - #ifdef CONFIG_PROC_VMCORE /* elfcorehdr= specifies the location of elf core header * stored by the crashed kernel. @@ -906,127 +295,6 @@ early_param("reservetop", parse_reservet #endif /* - * Callback for efi_memory_walk. - */ -static int __init -efi_find_max_pfn(unsigned long start, unsigned long end, void *arg) -{ - unsigned long *max_pfn = arg, pfn; - - if (start < end) { - pfn = PFN_UP(end -1); - if (pfn > *max_pfn) - *max_pfn = pfn; - } - return 0; -} - -static int __init -efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg) -{ - memory_present(0, PFN_UP(start), PFN_DOWN(end)); - return 0; -} - -/* - * This function checks if any part of the range is mapped - * with type. - */ -int -e820_any_mapped(u64 start, u64 end, unsigned type) -{ - int i; - -#ifndef CONFIG_XEN - for (i = 0; i < e820.nr_map; i++) { - const struct e820entry *ei = &e820.map[i]; -#else - if (!is_initial_xendomain()) - return 0; - for (i = 0; i < machine_e820.nr_map; ++i) { - const struct e820entry *ei = &machine_e820.map[i]; -#endif - - if (type && ei->type != type) - continue; - if (ei->addr >= end || ei->addr + ei->size <= start) - continue; - return 1; - } - return 0; -} -EXPORT_SYMBOL_GPL(e820_any_mapped); - - /* - * This function checks if the entire range is mapped with type. - * - * Note: this function only works correct if the e820 table is sorted and - * not-overlapping, which is the case - */ -int __init -e820_all_mapped(unsigned long s, unsigned long e, unsigned type) -{ - u64 start = s; - u64 end = e; - int i; - -#ifndef CONFIG_XEN - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; -#else - if (!is_initial_xendomain()) - return 0; - for (i = 0; i < machine_e820.nr_map; ++i) { - const struct e820entry *ei = &machine_e820.map[i]; -#endif - if (type && ei->type != type) - continue; - /* is the region (part) in overlap with the current region ?*/ - if (ei->addr >= end || ei->addr + ei->size <= start) - continue; - /* if the region is at the beginning of we move - * start to the end of the region since it's ok until there - */ - if (ei->addr <= start) - start = ei->addr + ei->size; - /* if start is now at or beyond end, we're done, full - * coverage */ - if (start >= end) - return 1; /* we're done */ - } - return 0; -} - -/* - * Find the highest page frame number we have available - */ -void __init find_max_pfn(void) -{ - int i; - - max_pfn = 0; - if (efi_enabled) { - efi_memmap_walk(efi_find_max_pfn, &max_pfn); - efi_memmap_walk(efi_memory_present_wrapper, NULL); - return; - } - - for (i = 0; i < e820.nr_map; i++) { - unsigned long start, end; - /* RAM? 
*/ - if (e820.map[i].type != E820_RAM) - continue; - start = PFN_UP(e820.map[i].addr); - end = PFN_DOWN(e820.map[i].addr + e820.map[i].size); - if (start >= end) - continue; - if (end > max_pfn) - max_pfn = end; - memory_present(0, start, end); - } -} - -/* * Determine low and high memory ranges: */ unsigned long __init find_max_low_pfn(void) @@ -1085,77 +353,6 @@ unsigned long __init find_max_low_pfn(vo return max_low_pfn; } -/* - * Free all available memory for boot time allocation. Used - * as a callback function by efi_memory_walk() - */ - -static int __init -free_available_memory(unsigned long start, unsigned long end, void *arg) -{ - /* check max_low_pfn */ - if (start >= (max_low_pfn << PAGE_SHIFT)) - return 0; - if (end >= (max_low_pfn << PAGE_SHIFT)) - end = max_low_pfn << PAGE_SHIFT; - if (start < end) - free_bootmem(start, end - start); - - return 0; -} -/* - * Register fully available low RAM pages with the bootmem allocator. - */ -static void __init register_bootmem_low_pages(unsigned long max_low_pfn) -{ - int i; - - if (efi_enabled) { - efi_memmap_walk(free_available_memory, NULL); - return; - } - for (i = 0; i < e820.nr_map; i++) { - unsigned long curr_pfn, last_pfn, size; - /* - * Reserve usable low memory - */ - if (e820.map[i].type != E820_RAM) - continue; - /* - * We are rounding up the start address of usable memory: - */ - curr_pfn = PFN_UP(e820.map[i].addr); - if (curr_pfn >= max_low_pfn) - continue; - /* - * ... and at the end of the usable range downwards: - */ - last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size); - -#ifdef CONFIG_XEN - /* - * Truncate to the number of actual pages currently - * present. - */ - if (last_pfn > xen_start_info->nr_pages) - last_pfn = xen_start_info->nr_pages; -#endif - - if (last_pfn > max_low_pfn) - last_pfn = max_low_pfn; - - /* - * .. finally, did all the rounding and playing - * around just make the area go away? - */ - if (last_pfn <= curr_pfn) - continue; - - size = last_pfn - curr_pfn; - free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size)); - } -} - #ifndef CONFIG_XEN /* * workaround for Dell systems that neglect to reserve EBDA @@ -1247,8 +444,8 @@ void __init setup_bootmem_allocator(void * the (very unlikely) case of us accidentally initializing the * bootmem allocator with an invalid RAM area. */ - reserve_bootmem(__PHYSICAL_START, (PFN_PHYS(min_low_pfn) + - bootmap_size + PAGE_SIZE-1) - (__PHYSICAL_START)); + reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) + + bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text)); #ifndef CONFIG_XEN /* @@ -1330,160 +527,6 @@ void __init remapped_pgdat_init(void) } } -/* - * Request address space for all standard RAM and ROM resources - * and also for regions reported as reserved by the e820. 
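
Registering those ranges boils down to allocating a struct resource per e820 entry and handing it to request_resource(); RAM entries additionally get the kernel code/data resources nested inside them so /proc/iomem shows where the kernel sits. Reduced to a single entry (a sketch; demo_claim_region is an illustrative name):

static void __init demo_claim_region(u64 addr, u64 size, const char *name)
{
	struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC);

	if (!res)
		return;
	res->name  = name;
	res->start = addr;
	res->end   = addr + size - 1;
	res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
	if (request_resource(&iomem_resource, res))
		kfree(res);	/* range overlaps something already claimed */
}
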
- */ -static void __init -legacy_init_iomem_resources(struct e820entry *e820, int nr_map, - struct resource *code_resource, - struct resource *data_resource) -{ - int i; - - probe_roms(); - - for (i = 0; i < nr_map; i++) { - struct resource *res; -#ifndef CONFIG_RESOURCES_64BIT - if (e820[i].addr + e820[i].size > 0x100000000ULL) - continue; -#endif - res = kzalloc(sizeof(struct resource), GFP_ATOMIC); - switch (e820[i].type) { - case E820_RAM: res->name = "System RAM"; break; - case E820_ACPI: res->name = "ACPI Tables"; break; - case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; - default: res->name = "reserved"; - } - res->start = e820[i].addr; - res->end = res->start + e820[i].size - 1; - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; - if (request_resource(&iomem_resource, res)) { - kfree(res); - continue; - } - if (e820[i].type == E820_RAM) { - /* - * We don't know which RAM region contains kernel data, - * so we try it repeatedly and let the resource manager - * test it. - */ -#ifndef CONFIG_XEN - request_resource(res, code_resource); - request_resource(res, data_resource); -#endif -#ifdef CONFIG_KEXEC - if (crashk_res.start != crashk_res.end) - request_resource(res, &crashk_res); -#ifdef CONFIG_XEN - xen_machine_kexec_register_resources(res); -#endif -#endif - } - } -} - -/* - * Locate a unused range of the physical address space below 4G which - * can be used for PCI mappings. - */ -static void __init -e820_setup_gap(struct e820entry *e820, int nr_map) -{ - unsigned long gapstart, gapsize, round; - unsigned long long last; - int i; - - /* - * Search for the bigest gap in the low 32 bits of the e820 - * memory space. - */ - last = 0x100000000ull; - gapstart = 0x10000000; - gapsize = 0x400000; - i = nr_map; - while (--i >= 0) { - unsigned long long start = e820[i].addr; - unsigned long long end = start + e820[i].size; - - /* - * Since "last" is at most 4GB, we know we'll - * fit in 32 bits if this condition is true - */ - if (last > end) { - unsigned long gap = last - end; - - if (gap > gapsize) { - gapsize = gap; - gapstart = end; - } - } - if (start < last) - last = start; - } - - /* - * See how much we want to round up: start off with - * rounding to the next 1MB area. - */ - round = 0x100000; - while ((gapsize >> 4) > round) - round += round; - /* Fun with two's complement */ - pci_mem_start = (gapstart + round) & -round; - - printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n", - pci_mem_start, gapstart, gapsize); -} - -/* - * Request address space for all standard resources - * - * This is called just before pcibios_init(), which is also a - * subsys_initcall, but is linked in later (in arch/i386/pci/common.c). - */ -static int __init request_standard_resources(void) -{ - int i; - - /* Nothing to do if not running in dom0. 
*/ - if (!is_initial_xendomain()) - return 0; - - printk("Setting up standard PCI resources\n"); -#ifdef CONFIG_XEN - legacy_init_iomem_resources(machine_e820.map, machine_e820.nr_map, - &code_resource, &data_resource); -#else - if (efi_enabled) - efi_initialize_iomem_resources(&code_resource, &data_resource); - else - legacy_init_iomem_resources(e820.map, e820.nr_map, - &code_resource, &data_resource); -#endif - - /* EFI systems may still have VGA */ - request_resource(&iomem_resource, &video_ram_resource); - - /* request I/O space for devices used on all i[345]86 PCs */ - for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) - request_resource(&ioport_resource, &standard_io_resources[i]); - return 0; -} - -subsys_initcall(request_standard_resources); - -static void __init register_memory(void) -{ -#ifdef CONFIG_XEN - if (is_initial_xendomain()) - e820_setup_gap(machine_e820.map, machine_e820.nr_map); - else -#endif - e820_setup_gap(e820.map, e820.nr_map); -} - #ifdef CONFIG_MCA static void set_mca_bus(int x) { @@ -1493,6 +536,12 @@ static void set_mca_bus(int x) static void set_mca_bus(int x) { } #endif +/* Overridden in paravirt.c if CONFIG_PARAVIRT */ +char * __init __attribute__((weak)) memory_setup(void) +{ + return machine_specific_memory_setup(); +} + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -1580,7 +629,7 @@ void __init setup_arch(char **cmdline_p) efi_init(); else { printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - print_memory_map(machine_specific_memory_setup()); + print_memory_map(memory_setup()); } copy_edd(); @@ -1759,7 +808,7 @@ void __init setup_arch(char **cmdline_p) get_smp_config(); #endif - register_memory(); + e820_register_memory(); if (is_initial_xendomain()) { #ifdef CONFIG_VT --- head-2011-02-17.orig/arch/x86/kernel/smp_32-xen.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/arch/x86/kernel/smp_32-xen.c 2011-01-31 17:32:16.000000000 +0100 @@ -659,6 +659,10 @@ int smp_call_function_single(int cpu, vo put_cpu(); return -EBUSY; } + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + spin_lock_bh(&call_lock); __smp_call_function_single(cpu, func, info, nonatomic, wait); spin_unlock_bh(&call_lock); --- head-2011-02-17.orig/arch/x86/kernel/time-xen.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/arch/x86/kernel/time-xen.c 2011-01-31 17:32:16.000000000 +0100 @@ -60,6 +60,7 @@ #include #include #include +#include #include #include "mach_time.h" @@ -125,11 +126,11 @@ static DEFINE_PER_CPU(struct vcpu_runsta /* Must be signed, as it's compared with s64 quantities which can be -ve. */ #define NS_PER_TICK (1000000000LL/HZ) -static void __clock_was_set(void *unused) +static void __clock_was_set(struct work_struct *unused) { clock_was_set(); } -static DECLARE_WORK(clock_was_set_work, __clock_was_set, NULL); +static DECLARE_WORK(clock_was_set_work, __clock_was_set); /* * GCC 4.3 can turn loops over an induction variable into division. We do @@ -542,10 +543,7 @@ static int set_rtc_mmss(unsigned long no /* gets recalled with irq locally disabled */ /* XXX - does irqsave resolve this? 
-johnstul */ spin_lock_irqsave(&rtc_lock, flags); - if (efi_enabled) - retval = efi_set_rtc_mmss(nowtime); - else - retval = mach_set_rtc_mmss(nowtime); + retval = set_wallclock(nowtime); spin_unlock_irqrestore(&rtc_lock, flags); return retval; @@ -876,10 +874,7 @@ unsigned long get_cmos_time(void) spin_lock_irqsave(&rtc_lock, flags); - if (efi_enabled) - retval = efi_get_time(); - else - retval = mach_get_cmos_time(); + retval = get_wallclock(); spin_unlock_irqrestore(&rtc_lock, flags); @@ -981,7 +976,7 @@ static void __init hpet_time_init(void) printk("Using HPET for base-timer\n"); } - time_init_hook(); + do_time_init(); } #endif --- head-2011-02-17.orig/arch/x86/kernel/traps_32-xen.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/arch/x86/kernel/traps_32-xen.c 2011-01-31 17:32:16.000000000 +0100 @@ -29,6 +29,8 @@ #include #include #include +#include +#include #ifdef CONFIG_EISA #include @@ -61,9 +63,6 @@ int panic_on_unrecovered_nmi; asmlinkage int system_call(void); -struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 }, - { 0, 0 }, { 0, 0 } }; - /* Do we ignore FPU interrupts ? */ char ignore_fpu_irq = 0; @@ -100,12 +99,7 @@ asmlinkage void fixup_4gb_segment(void); #endif asmlinkage void machine_check(void); -static int kstack_depth_to_print = 24; -#ifdef CONFIG_STACK_UNWIND -static int call_trace = 1; -#else -#define call_trace (-1) -#endif +int kstack_depth_to_print = 24; ATOMIC_NOTIFIER_HEAD(i386die_chain); int register_die_notifier(struct notifier_block *nb) @@ -159,25 +153,7 @@ static inline unsigned long print_contex return ebp; } -struct ops_and_data { - struct stacktrace_ops *ops; - void *data; -}; - -static asmlinkage int -dump_trace_unwind(struct unwind_frame_info *info, void *data) -{ - struct ops_and_data *oad = (struct ops_and_data *)data; - int n = 0; - - while (unwind(info) == 0 && UNW_PC(info)) { - n++; - oad->ops->address(oad->data, UNW_PC(info)); - if (arch_unw_user_mode(info)) - break; - } - return n; -} +#define MSG(msg) ops->warning(data, msg) void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, @@ -188,39 +164,6 @@ void dump_trace(struct task_struct *task if (!task) task = current; - if (call_trace >= 0) { - int unw_ret = 0; - struct unwind_frame_info info; - struct ops_and_data oad = { .ops = ops, .data = data }; - - if (regs) { - if (unwind_init_frame_info(&info, task, regs) == 0) - unw_ret = dump_trace_unwind(&info, &oad); - } else if (task == current) - unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad); - else { - if (unwind_init_blocked(&info, task) == 0) - unw_ret = dump_trace_unwind(&info, &oad); - } - if (unw_ret > 0) { - if (call_trace == 1 && !arch_unw_user_mode(&info)) { - ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n", - UNW_PC(&info)); - if (UNW_SP(&info) >= PAGE_OFFSET) { - ops->warning(data, "Leftover inexact backtrace:\n"); - stack = (void *)UNW_SP(&info); - if (!stack) - return; - ebp = UNW_FP(&info); - } else - ops->warning(data, "Full inexact backtrace again:\n"); - } else if (call_trace >= 1) - return; - else - ops->warning(data, "Full inexact backtrace again:\n"); - } else - ops->warning(data, "Inexact backtrace:\n"); - } if (!stack) { unsigned long dummy; stack = &dummy; @@ -253,6 +196,7 @@ void dump_trace(struct task_struct *task stack = (unsigned long*)context->previous_esp; if (!stack) break; + touch_nmi_watchdog(); } } EXPORT_SYMBOL(dump_trace); @@ -385,7 +329,7 @@ void show_registers(struct pt_regs *regs * time of the fault.. 
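
The code dump below, like the prefetch-decoder change later in this patch, replaces __get_user() on kernel pointers with probe_kernel_address(), which performs the load with fault handling armed and returns nonzero instead of oopsing when the address is unmapped. Typical use (a sketch, assuming 2.6.20's linux/uaccess.h helper):

/* Fetch one byte from a possibly-unmapped kernel address. */
static int peek_kernel_byte(const unsigned char *addr, unsigned char *val)
{
	unsigned char c;

	if (probe_kernel_address(addr, c))
		return -EFAULT;		/* unmapped: report, no oops */
	*val = c;
	return 0;
}
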
*/ if (in_kernel) { - u8 __user *eip; + u8 *eip; int code_bytes = 64; unsigned char c; @@ -394,18 +338,20 @@ void show_registers(struct pt_regs *regs printk(KERN_EMERG "Code: "); - eip = (u8 __user *)regs->eip - 43; - if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) { + eip = (u8 *)regs->eip - 43; + if (eip < (u8 *)PAGE_OFFSET || + probe_kernel_address(eip, c)) { /* try starting at EIP */ - eip = (u8 __user *)regs->eip; + eip = (u8 *)regs->eip; code_bytes = 32; } for (i = 0; i < code_bytes; i++, eip++) { - if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) { + if (eip < (u8 *)PAGE_OFFSET || + probe_kernel_address(eip, c)) { printk(" Bad EIP value."); break; } - if (eip == (u8 __user *)regs->eip) + if (eip == (u8 *)regs->eip) printk("<%02x> ", c); else printk("%02x ", c); @@ -414,43 +360,22 @@ void show_registers(struct pt_regs *regs printk("\n"); } -static void handle_BUG(struct pt_regs *regs) +int is_valid_bugaddr(unsigned long eip) { - unsigned long eip = regs->eip; unsigned short ud2; if (eip < PAGE_OFFSET) - return; - if (probe_kernel_address((unsigned short __user *)eip, ud2)) - return; - if (ud2 != 0x0b0f) - return; + return 0; + if (probe_kernel_address((unsigned short *)eip, ud2)) + return 0; - printk(KERN_EMERG "------------[ cut here ]------------\n"); - -#ifdef CONFIG_DEBUG_BUGVERBOSE - do { - unsigned short line; - char *file; - char c; - - if (probe_kernel_address((unsigned short __user *)(eip + 2), - line)) - break; - if (__get_user(file, (char * __user *)(eip + 4)) || - (unsigned long)file < PAGE_OFFSET || __get_user(c, file)) - file = ""; - - printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line); - return; - } while (0); -#endif - printk(KERN_EMERG "Kernel BUG at [verbose debug info unavailable]\n"); + return ud2 == 0x0b0f; } -/* This is gone through when something in the kernel - * has done something bad and is about to be terminated. -*/ +/* + * This is gone through when something in the kernel has done something bad and + * is about to be terminated. + */ void die(const char * str, struct pt_regs * regs, long err) { static struct { @@ -458,7 +383,7 @@ void die(const char * str, struct pt_reg u32 lock_owner; int lock_owner_depth; } die = { - .lock = SPIN_LOCK_UNLOCKED, + .lock = __SPIN_LOCK_UNLOCKED(die.lock), .lock_owner = -1, .lock_owner_depth = 0 }; @@ -482,7 +407,8 @@ void die(const char * str, struct pt_reg unsigned long esp; unsigned short ss; - handle_BUG(regs); + report_bug(regs->eip); + printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter); #ifdef CONFIG_PREEMPT printk(KERN_EMERG "PREEMPT "); @@ -682,8 +608,7 @@ mem_parity_error(unsigned char reason, s { printk(KERN_EMERG "Uhhuh. 
NMI received for unknown reason %02x on " "CPU %d.\n", reason, smp_processor_id()); - printk(KERN_EMERG "You probably have a hardware problem with your RAM " - "chips\n"); + printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); if (panic_on_unrecovered_nmi) panic("NMI: Not continuing"); @@ -741,7 +666,6 @@ void __kprobes die_nmi(struct pt_regs *r printk(" on CPU%d, eip %08lx, registers:\n", smp_processor_id(), regs->eip); show_registers(regs); - printk(KERN_EMERG "console shuts up ...\n"); console_silent(); spin_unlock(&nmi_print_lock); bust_spinlocks(0); @@ -1057,49 +981,24 @@ fastcall void do_spurious_interrupt_bug( #endif } -fastcall void setup_x86_bogus_stack(unsigned char * stk) +fastcall unsigned long patch_espfix_desc(unsigned long uesp, + unsigned long kesp) { - unsigned long *switch16_ptr, *switch32_ptr; - struct pt_regs *regs; - unsigned long stack_top, stack_bot; - unsigned short iret_frame16_off; - int cpu = smp_processor_id(); - /* reserve the space on 32bit stack for the magic switch16 pointer */ - memmove(stk, stk + 8, sizeof(struct pt_regs)); - switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs)); - regs = (struct pt_regs *)stk; - /* now the switch32 on 16bit stack */ - stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu); - stack_top = stack_bot + CPU_16BIT_STACK_SIZE; - switch32_ptr = (unsigned long *)(stack_top - 8); - iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20; - /* copy iret frame on 16bit stack */ - memcpy((void *)(stack_bot + iret_frame16_off), ®s->eip, 20); - /* fill in the switch pointers */ - switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off; - switch16_ptr[1] = __ESPFIX_SS; - switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) + - 8 - CPU_16BIT_STACK_SIZE; - switch32_ptr[1] = __KERNEL_DS; -} - -fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp) -{ - unsigned long *switch32_ptr; - unsigned char *stack16, *stack32; - unsigned long stack_top, stack_bot; - int len; int cpu = smp_processor_id(); - stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu); - stack_top = stack_bot + CPU_16BIT_STACK_SIZE; - switch32_ptr = (unsigned long *)(stack_top - 8); - /* copy the data from 16bit stack to 32bit stack */ - len = CPU_16BIT_STACK_SIZE - 8 - sp; - stack16 = (unsigned char *)(stack_bot + sp); - stack32 = (unsigned char *) - (switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len); - memcpy(stack32, stack16, len); - return stack32; + struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); + struct desc_struct *gdt = (struct desc_struct *)cpu_gdt_descr->address; + unsigned long base = (kesp - uesp) & -THREAD_SIZE; + unsigned long new_kesp = kesp - base; + unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; + __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS]; + /* Set up base for espfix segment */ + desc &= 0x00f0ff0000000000ULL; + desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) | + ((((__u64)base) << 32) & 0xff00000000000000ULL) | + ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) | + (lim_pages & 0xffff); + *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc; + return new_kesp; } #endif @@ -1113,7 +1012,7 @@ fastcall unsigned char * fixup_x86_bogus * Must be called with kernel preemption disabled (in this case, * local interrupts are disabled at the call-site in entry.S). 
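
The descriptor surgery in patch_espfix_desc() above keeps only the attribute bits of the old GDT entry (the 0x00f0ff0000000000 mask) and splices in the new base plus a page-granular limit. The same packing as standalone C:

#include <stdint.h>

static uint64_t pack_espfix_desc(uint64_t old, uint32_t base, uint32_t lim_pages)
{
	uint64_t d = old & 0x00f0ff0000000000ULL;	/* keep type/flag bits */

	d |= ((uint64_t)base << 16) & 0x000000ffffff0000ULL;	  /* base 23:0   */
	d |= ((uint64_t)base << 32) & 0xff00000000000000ULL;	  /* base 31:24  */
	d |= ((uint64_t)lim_pages << 32) & 0x000f000000000000ULL; /* limit 19:16 */
	d |= lim_pages & 0xffff;				  /* limit 15:0  */
	return d;
}
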
*/ -asmlinkage void math_state_restore(struct pt_regs regs) +asmlinkage void math_state_restore(void) { struct thread_info *thread = current_thread_info(); struct task_struct *tsk = thread->task; @@ -1123,6 +1022,7 @@ asmlinkage void math_state_restore(struc init_fpu(tsk); restore_fpu(tsk); thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ + tsk->fpu_counter++; } #ifndef CONFIG_MATH_EMULATION @@ -1234,19 +1134,3 @@ static int __init kstack_setup(char *s) return 1; } __setup("kstack=", kstack_setup); - -#ifdef CONFIG_STACK_UNWIND -static int __init call_trace_setup(char *s) -{ - if (strcmp(s, "old") == 0) - call_trace = -1; - else if (strcmp(s, "both") == 0) - call_trace = 0; - else if (strcmp(s, "newfallback") == 0) - call_trace = 1; - else if (strcmp(s, "new") == 2) - call_trace = 2; - return 1; -} -__setup("call_trace=", call_trace_setup); -#endif --- head-2011-02-17.orig/arch/x86/kernel/vmlinux.lds.S 2011-02-17 09:59:45.000000000 +0100 +++ head-2011-02-17/arch/x86/kernel/vmlinux.lds.S 2011-01-31 17:32:16.000000000 +0100 @@ -84,6 +84,10 @@ SECTIONS { #ifdef CONFIG_X86_32 . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; +#if defined(CONFIG_XEN) && CONFIG_XEN_COMPAT <= 0x030002 +#undef LOAD_OFFSET +#define LOAD_OFFSET 0 +#endif phys_startup_32 = startup_32 - LOAD_OFFSET; #else . = __START_KERNEL; --- head-2011-02-17.orig/arch/x86/kvm/Kconfig 2011-02-17 09:59:45.000000000 +0100 +++ head-2011-02-17/arch/x86/kvm/Kconfig 2011-01-31 17:32:16.000000000 +0100 @@ -7,6 +7,7 @@ source "virt/kvm/Kconfig" menuconfig VIRTUALIZATION bool "Virtualization" depends on HAVE_KVM || X86 + depends on !XEN default y ---help--- Say Y here to get to see options for using your Linux host to run other --- head-2011-02-17.orig/arch/x86/mm/fault_32-xen.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/arch/x86/mm/fault_32-xen.c 2011-01-31 17:32:16.000000000 +0100 @@ -22,9 +22,9 @@ #include #include #include +#include #include -#include #include #include #include @@ -167,7 +167,7 @@ static inline unsigned long get_segment_ static int __is_prefetch(struct pt_regs *regs, unsigned long addr) { unsigned long limit; - unsigned long instr = get_segment_eip (regs, &limit); + unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit); int scan_more = 1; int prefetch = 0; int i; @@ -177,9 +177,9 @@ static int __is_prefetch(struct pt_regs unsigned char instr_hi; unsigned char instr_lo; - if (instr > limit) + if (instr > (unsigned char *)limit) break; - if (__get_user(opcode, (unsigned char __user *) instr)) + if (probe_kernel_address(instr, opcode)) break; instr_hi = opcode & 0xf0; @@ -204,9 +204,9 @@ static int __is_prefetch(struct pt_regs case 0x00: /* Prefetch instruction is 0x0F0D or 0x0F18 */ scan_more = 0; - if (instr > limit) + if (instr > (unsigned char *)limit) break; - if (__get_user(opcode, (unsigned char __user *) instr)) + if (probe_kernel_address(instr, opcode)) break; prefetch = (instr_lo == 0xF) && (opcode == 0x0D || opcode == 0x18); --- head-2011-02-17.orig/arch/x86/mm/highmem_32-xen.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/arch/x86/mm/highmem_32-xen.c 2011-01-31 17:32:16.000000000 +0100 @@ -32,7 +32,7 @@ static void *__kmap_atomic(struct page * unsigned long vaddr; /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ - inc_preempt_count(); + pagefault_disable(); if (!PageHighMem(page)) return page_address(page); @@ -63,26 +63,22 @@ void kunmap_atomic(void *kvaddr, enum km unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; enum fixed_addresses idx 
= type + KM_TYPE_NR*smp_processor_id(); -#ifdef CONFIG_DEBUG_HIGHMEM - if (vaddr >= PAGE_OFFSET && vaddr < (unsigned long)high_memory) { - dec_preempt_count(); - preempt_check_resched(); - return; - } - - if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)) - BUG(); -#endif /* * Force other mappings to Oops if they'll try to access this pte * without first remap it. Keeping stale mappings around is a bad idea * also, in case the page changes cacheability attributes or becomes * a protected page in a hypervisor. */ - kpte_clear_flush(kmap_pte-idx, vaddr); + if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) + kpte_clear_flush(kmap_pte-idx, vaddr); + else { +#ifdef CONFIG_DEBUG_HIGHMEM + BUG_ON(vaddr < PAGE_OFFSET); + BUG_ON(vaddr >= (unsigned long)high_memory); +#endif + } - dec_preempt_count(); - preempt_check_resched(); + pagefault_enable(); } /* This is the same as kmap_atomic() but can map memory that doesn't @@ -93,7 +89,7 @@ void *kmap_atomic_pfn(unsigned long pfn, enum fixed_addresses idx; unsigned long vaddr; - inc_preempt_count(); + pagefault_disable(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); --- head-2011-02-17.orig/arch/x86/mm/init_32-xen.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/arch/x86/mm/init_32-xen.c 2011-01-31 17:32:16.000000000 +0100 @@ -233,8 +233,6 @@ static inline int page_kills_ppro(unsign #endif -extern int is_available_memory(efi_memory_desc_t *); - int page_is_ram(unsigned long pagenr) { int i; @@ -326,7 +324,7 @@ void __init add_one_highpage_init(struct SetPageReserved(page); } -static int add_one_highpage_hotplug(struct page *page, unsigned long pfn) +static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long pfn) { free_new_highpage(page, pfn); totalram_pages++; @@ -343,7 +341,7 @@ static int add_one_highpage_hotplug(stru * has been added dynamically that would be * onlined here is in HIGHMEM */ -void online_page(struct page *page) +void __meminit online_page(struct page *page) { ClearPageReserved(page); add_one_highpage_hotplug(page, page_to_pfn(page)); @@ -738,16 +736,10 @@ void __init mem_init(void) set_bit(PG_pinned, &virt_to_page(init_mm.pgd)->flags); } -/* - * this is for the non-NUMA, single node SMP system case. - * Specifically, in the case of x86, we will always add - * memory to the highmem for now. 
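The highmem conversion above swaps the open-coded inc_preempt_count()/dec_preempt_count() pair for pagefault_disable()/pagefault_enable(), which in 2.6.20 also subsume the preempt_check_resched() on the way out. A minimal caller sketch, assuming the 2.6.20-era kmap_atomic() API with explicit KM_* slots (the helper name is illustrative, not from this patch):

    #include <linux/highmem.h>

    /*
     * Copy one word out of a possibly-highmem page.  kmap_atomic()
     * disables pagefaults (and thus preemption), so nothing between
     * the map and the unmap may sleep.
     */
    static unsigned long read_word_from_page(struct page *page, unsigned int off)
    {
            unsigned long *vaddr, val;

            vaddr = kmap_atomic(page, KM_USER0);    /* implies pagefault_disable() */
            val = vaddr[off / sizeof(*vaddr)];
            kunmap_atomic(vaddr, KM_USER0);         /* implies pagefault_enable() */
            return val;
    }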
- */ #ifdef CONFIG_MEMORY_HOTPLUG -#ifndef CONFIG_NEED_MULTIPLE_NODES int arch_add_memory(int nid, u64 start, u64 size) { - struct pglist_data *pgdata = &contig_page_data; + struct pglist_data *pgdata = NODE_DATA(nid); struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM; unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; @@ -759,11 +751,11 @@ int remove_memory(u64 start, u64 size) { return -EINVAL; } -#endif +EXPORT_SYMBOL_GPL(remove_memory); #endif -kmem_cache_t *pgd_cache; -kmem_cache_t *pmd_cache; +struct kmem_cache *pgd_cache; +struct kmem_cache *pmd_cache; void __init pgtable_cache_init(void) { --- head-2011-02-17.orig/arch/x86/mm/pgtable_32-xen.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/arch/x86/mm/pgtable_32-xen.c 2011-01-31 17:32:16.000000000 +0100 @@ -203,7 +203,7 @@ void pte_free(struct page *pte) __free_page(pte); } -void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags) +void pmd_ctor(void *pmd, struct kmem_cache *cache, unsigned long flags) { memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); } @@ -244,7 +244,7 @@ static inline void pgd_list_del(pgd_t *p page->mapping = NULL; } -void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) +void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused) { unsigned long flags; @@ -265,7 +265,7 @@ void pgd_ctor(void *pgd, kmem_cache_t *c } /* never called when PTRS_PER_PMD > 1 */ -void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused) +void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused) { unsigned long flags; /* can be called from interrupt context */ --- head-2011-02-17.orig/arch/x86/pci/irq-xen.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/arch/x86/pci/irq-xen.c 2011-01-31 17:32:16.000000000 +0100 @@ -768,7 +768,7 @@ static void __init pirq_find_router(stru DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n", rt->rtr_vendor, rt->rtr_device); - pirq_router_dev = pci_find_slot(rt->rtr_bus, rt->rtr_devfn); + pirq_router_dev = pci_get_bus_and_slot(rt->rtr_bus, rt->rtr_devfn); if (!pirq_router_dev) { DBG(KERN_DEBUG "PCI: Interrupt router not found at " "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn); @@ -788,6 +788,8 @@ static void __init pirq_find_router(stru pirq_router_dev->vendor, pirq_router_dev->device, pci_name(pirq_router_dev)); + + /* The device remains referenced for the kernel lifetime */ } static struct irq_info *pirq_get_info(struct pci_dev *dev) --- head-2011-02-17.orig/arch/x86/kernel/entry_64-xen.S 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/arch/x86/kernel/entry_64-xen.S 2011-01-31 17:32:16.000000000 +0100 @@ -261,7 +261,6 @@ ENTRY(system_call) movq %rax,ORIG_RAX-ARGOFFSET(%rsp) GET_THREAD_INFO(%rcx) testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx) - CFI_REMEMBER_STATE jnz tracesys cmpq $__NR_syscall_max,%rax ja badsys @@ -272,7 +271,6 @@ ENTRY(system_call) * Syscall return path ending with SYSRET (fast path) * Has incomplete stack frame and undefined top of stack. 
*/ - .globl ret_from_sys_call ret_from_sys_call: movl $_TIF_ALLWORK_MASK,%edi /* edi: flagmask */ @@ -282,8 +280,8 @@ sysret_check: TRACE_IRQS_OFF movl threadinfo_flags(%rcx),%edx andl %edi,%edx - CFI_REMEMBER_STATE jnz sysret_careful + CFI_REMEMBER_STATE /* * sysretq will re-enable interrupts: */ @@ -292,10 +290,10 @@ sysret_check: RESTORE_ARGS 0,8,0 HYPERVISOR_IRET VGCF_IN_SYSCALL + CFI_RESTORE_STATE /* Handle reschedules */ /* edx: work, edi: workmask */ sysret_careful: - CFI_RESTORE_STATE bt $TIF_NEED_RESCHED,%edx jnc sysret_signal TRACE_IRQS_ON @@ -334,7 +332,6 @@ badsys: /* Do syscall tracing */ tracesys: - CFI_RESTORE_STATE SAVE_REST movq $-ENOSYS,RAX(%rsp) FIXUP_TOP_OF_STACK %rdi @@ -350,32 +347,13 @@ tracesys: call *sys_call_table(,%rax,8) 1: movq %rax,RAX-ARGOFFSET(%rsp) /* Use IRET because user could have changed frame */ - jmp int_ret_from_sys_call - CFI_ENDPROC -END(system_call) /* * Syscall return path ending with IRET. * Has correct top of stack, but partial stack frame. - */ -ENTRY(int_ret_from_sys_call) - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,SS+8-ARGOFFSET - /*CFI_REL_OFFSET ss,SS-ARGOFFSET*/ - CFI_REL_OFFSET rsp,RSP-ARGOFFSET - /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/ - /*CFI_REL_OFFSET cs,CS-ARGOFFSET*/ - CFI_REL_OFFSET rip,RIP-ARGOFFSET - CFI_REL_OFFSET rdx,RDX-ARGOFFSET - CFI_REL_OFFSET rcx,RCX-ARGOFFSET - CFI_REL_OFFSET rax,RAX-ARGOFFSET - CFI_REL_OFFSET rdi,RDI-ARGOFFSET - CFI_REL_OFFSET rsi,RSI-ARGOFFSET - CFI_REL_OFFSET r8,R8-ARGOFFSET - CFI_REL_OFFSET r9,R9-ARGOFFSET - CFI_REL_OFFSET r10,R10-ARGOFFSET - CFI_REL_OFFSET r11,R11-ARGOFFSET + */ + .globl int_ret_from_sys_call +int_ret_from_sys_call: XEN_BLOCK_EVENTS(%rsi) TRACE_IRQS_OFF testb $3,CS-ARGOFFSET(%rsp) @@ -428,8 +406,6 @@ int_very_careful: popq %rdi CFI_ADJUST_CFA_OFFSET -8 andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi - XEN_BLOCK_EVENTS(%rsi) - TRACE_IRQS_OFF jmp int_restore_rest int_signal: @@ -445,7 +421,7 @@ int_restore_rest: TRACE_IRQS_OFF jmp int_with_check CFI_ENDPROC -END(int_ret_from_sys_call) +END(system_call) /* * Certain special system calls that need to save a complete full stack frame. 
@@ -1270,36 +1246,3 @@ ENTRY(call_softirq) ret CFI_ENDPROC ENDPROC(call_softirq) - -#ifdef CONFIG_STACK_UNWIND -ENTRY(arch_unwind_init_running) - CFI_STARTPROC - movq %r15, R15(%rdi) - movq %r14, R14(%rdi) - xchgq %rsi, %rdx - movq %r13, R13(%rdi) - movq %r12, R12(%rdi) - xorl %eax, %eax - movq %rbp, RBP(%rdi) - movq %rbx, RBX(%rdi) - movq (%rsp), %rcx - movq %rax, R11(%rdi) - movq %rax, R10(%rdi) - movq %rax, R9(%rdi) - movq %rax, R8(%rdi) - movq %rax, RAX(%rdi) - movq %rax, RCX(%rdi) - movq %rax, RDX(%rdi) - movq %rax, RSI(%rdi) - movq %rax, RDI(%rdi) - movq %rax, ORIG_RAX(%rdi) - movq %rcx, RIP(%rdi) - leaq 8(%rsp), %rcx - movq $__KERNEL_CS, CS(%rdi) - movq %rax, EFLAGS(%rdi) - movq %rcx, RSP(%rdi) - movq $__KERNEL_DS, SS(%rdi) - jmpq *%rdx - CFI_ENDPROC -ENDPROC(arch_unwind_init_running) -#endif --- head-2011-02-17.orig/arch/x86/kernel/head64-xen.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/arch/x86/kernel/head64-xen.c 2011-01-31 17:32:16.000000000 +0100 @@ -104,7 +104,10 @@ void __init x86_64_start_kernel(char * r machine_to_phys_order++; #if 0 - for (i = 0; i < 256; i++) + /* clear bss before set_intr_gate with early_idt_handler */ + clear_bss(); + + for (i = 0; i < IDT_ENTRIES; i++) set_intr_gate(i, early_idt_handler); asm volatile("lidt %0" :: "m" (idt_descr)); #endif --- head-2011-02-17.orig/arch/x86/kernel/io_apic_64-xen.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/arch/x86/kernel/io_apic_64-xen.c 2011-01-31 17:32:16.000000000 +0100 @@ -202,14 +202,20 @@ static struct IO_APIC_route_entry ioapic * the interrupt, and we need to make sure the entry is fully populated * before that happens. */ -static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +static void +__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { - unsigned long flags; union entry_union eu; eu.entry = e; - spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic, 0x11 + 2*pin, eu.w2); io_apic_write(apic, 0x10 + 2*pin, eu.w1); +} + +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +{ + unsigned long flags; + spin_lock_irqsave(&ioapic_lock, flags); + __ioapic_write_entry(apic, pin, e); spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -720,6 +726,22 @@ static int assign_irq_vector(int irq, cp } #ifndef CONFIG_XEN +static void __clear_irq_vector(int irq) +{ + cpumask_t mask; + int cpu, vector; + + BUG_ON(!irq_vector[irq]); + + vector = irq_vector[irq]; + cpus_and(mask, irq_domain[irq], cpu_online_map); + for_each_cpu_mask(cpu, mask) + per_cpu(vector_irq, cpu)[vector] = -1; + + irq_vector[irq] = 0; + irq_domain[irq] = CPU_MASK_NONE; +} + void __setup_vector_irq(int cpu) { /* Initialize vector_irq on a new cpu */ @@ -767,26 +789,65 @@ static void ioapic_register_intr(int irq #define ioapic_register_intr(irq, vector, trigger) evtchn_register_pirq(irq) #endif /* !CONFIG_XEN */ -static void __init setup_IO_APIC_irqs(void) +static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq) { struct IO_APIC_route_entry entry; - int apic, pin, idx, irq, first_notcon = 1, vector; + int vector; unsigned long flags; - apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); - for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + /* + * add it to the IO-APIC irq-routing table: + */ + memset(&entry,0,sizeof(entry)); - /* - * add it to the IO-APIC irq-routing table: - */ - memset(&entry,0,sizeof(entry)); + entry.delivery_mode = INT_DELIVERY_MODE; + entry.dest_mode = 
INT_DEST_MODE; + entry.mask = 0; /* enable IRQ */ + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); - entry.delivery_mode = INT_DELIVERY_MODE; - entry.dest_mode = INT_DEST_MODE; - entry.mask = 0; /* enable IRQ */ + entry.trigger = irq_trigger(idx); + entry.polarity = irq_polarity(idx); + + if (irq_trigger(idx)) { + entry.trigger = 1; + entry.mask = 1; entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + } + + if (/* !apic && */ !IO_APIC_IRQ(irq)) + return; + + if (IO_APIC_IRQ(irq)) { + cpumask_t mask; + vector = assign_irq_vector(irq, TARGET_CPUS, &mask); + if (vector < 0) + return; + + entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask); + entry.vector = vector; + + ioapic_register_intr(irq, vector, IOAPIC_AUTO); + if (!apic && (irq < 16)) + disable_8259A_irq(irq); + } + + ioapic_write_entry(apic, pin, entry); + + spin_lock_irqsave(&ioapic_lock, flags); + set_native_irq_info(irq, TARGET_CPUS); + spin_unlock_irqrestore(&ioapic_lock, flags); + +} + +static void __init setup_IO_APIC_irqs(void) +{ + int apic, pin, idx, irq, first_notcon = 1; + + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); + + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { idx = find_irq_entry(apic,pin,mp_INT); if (idx == -1) { @@ -798,39 +859,11 @@ static void __init setup_IO_APIC_irqs(vo continue; } - entry.trigger = irq_trigger(idx); - entry.polarity = irq_polarity(idx); - - if (irq_trigger(idx)) { - entry.trigger = 1; - entry.mask = 1; - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); - } - irq = pin_2_irq(idx, apic, pin); add_pin_to_irq(irq, apic, pin); - if (/* !apic && */ !IO_APIC_IRQ(irq)) - continue; - - if (IO_APIC_IRQ(irq)) { - cpumask_t mask; - vector = assign_irq_vector(irq, TARGET_CPUS, &mask); - if (vector < 0) - continue; - - entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask); - entry.vector = vector; + setup_IO_APIC_irq(apic, pin, idx, irq); - ioapic_register_intr(irq, vector, IOAPIC_AUTO); - if (!apic && (irq < 16)) - disable_8259A_irq(irq); - } - ioapic_write_entry(apic, pin, entry); - - spin_lock_irqsave(&ioapic_lock, flags); - set_native_irq_info(irq, TARGET_CPUS); - spin_unlock_irqrestore(&ioapic_lock, flags); } } @@ -1826,7 +1859,7 @@ void destroy_irq(unsigned int irq) dynamic_irq_cleanup(irq); spin_lock_irqsave(&vector_lock, flags); - irq_vector[irq] = 0; + __clear_irq_vector(irq); spin_unlock_irqrestore(&vector_lock, flags); } @@ -2131,7 +2164,15 @@ void __init setup_ioapic_dest(void) if (irq_entry == -1) continue; irq = pin_2_irq(irq_entry, ioapic, pin); - set_ioapic_affinity_irq(irq, TARGET_CPUS); + + /* setup_IO_APIC_irqs could fail to get vector for some device + * when you have too many devices, because at that time only boot + * cpu is online. + */ + if(!irq_vector[irq]) + setup_IO_APIC_irq(ioapic, pin, irq_entry, irq); + else + set_ioapic_affinity_irq(irq, TARGET_CPUS); } } --- head-2011-02-17.orig/arch/x86/kernel/mpparse_64-xen.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/arch/x86/kernel/mpparse_64-xen.c 2011-01-31 17:32:16.000000000 +0100 @@ -35,8 +35,6 @@ int smp_found_config; unsigned int __initdata maxcpus = NR_CPUS; -int acpi_found_madt; - /* * Various Linux-internal data structures created from the * MP-table. 
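The io_apic change above is the common lock-splitting idiom: the body moves into a lock-free __ioapic_write_entry() so that callers already holding ioapic_lock can reuse it, while the old name becomes a locking wrapper. The same shape in miniature (all names here are illustrative):

    #include <linux/spinlock.h>

    static DEFINE_SPINLOCK(table_lock);
    static unsigned int table[16];

    /* Caller must already hold table_lock. */
    static void __table_write(int idx, unsigned int val)
    {
            table[idx] = val;
    }

    /* Locking wrapper for callers that do not hold the lock. */
    static void table_write(int idx, unsigned int val)
    {
            unsigned long flags;

            spin_lock_irqsave(&table_lock, flags);
            __table_write(idx, val);
            spin_unlock_irqrestore(&table_lock, flags);
    }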
--- head-2011-02-17.orig/arch/x86/kernel/process_64-xen.c 2011-02-02 08:30:50.000000000 +0100 +++ head-2011-02-17/arch/x86/kernel/process_64-xen.c 2011-02-02 08:30:59.000000000 +0100 @@ -118,29 +118,23 @@ void exit_idle(void) static void poll_idle (void) { local_irq_enable(); - - asm volatile( - "2:" - "testl %0,%1;" - "rep; nop;" - "je 2b;" - : : - "i" (_TIF_NEED_RESCHED), - "m" (current_thread_info()->flags)); + cpu_relax(); } static void xen_idle(void) { + current_thread_info()->status &= ~TS_POLLING; + /* + * TS_POLLING-cleared state must be visible before we + * test NEED_RESCHED: + */ + smp_mb(); local_irq_disable(); - - if (need_resched()) - local_irq_enable(); - else { - current_thread_info()->status &= ~TS_POLLING; - smp_mb__after_clear_bit(); + if (!need_resched()) safe_halt(); - current_thread_info()->status |= TS_POLLING; - } + else + local_irq_enable(); + current_thread_info()->status |= TS_POLLING; } #ifdef CONFIG_HOTPLUG_CPU @@ -180,6 +174,12 @@ void cpu_idle (void) idle = xen_idle; /* no alternatives */ if (cpu_is_offline(smp_processor_id())) play_dead(); + /* + * Idle routines should keep interrupts disabled + * from here on, until they go to idle. + * Otherwise, idle callbacks can misfire. + */ + local_irq_disable(); enter_idle(); idle(); /* In many cases the interrupt that ended idle --- head-2011-02-17.orig/arch/x86/kernel/setup_64-xen.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/arch/x86/kernel/setup_64-xen.c 2011-01-31 17:32:16.000000000 +0100 @@ -574,8 +574,7 @@ void __init setup_arch(char **cmdline_p) if (LOADER_TYPE && INITRD_START) { if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) { reserve_bootmem_generic(INITRD_START, INITRD_SIZE); - initrd_start = - INITRD_START ? INITRD_START + PAGE_OFFSET : 0; + initrd_start = INITRD_START + PAGE_OFFSET; initrd_end = initrd_start+INITRD_SIZE; } else { @@ -991,11 +990,8 @@ static void __cpuinit init_amd(struct cp /* Fix cpuid4 emulation for more */ num_cache_leaves = 3; - /* When there is only one core no need to synchronize RDTSC */ - if (num_possible_cpus() == 1) - set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); - else - clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); + /* RDTSC can be speculated around */ + clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); } static void __cpuinit detect_ht(struct cpuinfo_x86 *c) @@ -1094,6 +1090,15 @@ static void __cpuinit init_intel(struct set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability); } + if (cpu_has_ds) { + unsigned int l1, l2; + rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); + if (!(l1 & (1<<11))) + set_bit(X86_FEATURE_BTS, c->x86_capability); + if (!(l1 & (1<<12))) + set_bit(X86_FEATURE_PEBS, c->x86_capability); + } + n = c->extended_cpuid_level; if (n >= 0x80000008) { unsigned eax = cpuid_eax(0x80000008); @@ -1113,7 +1118,10 @@ static void __cpuinit init_intel(struct set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); if (c->x86 == 6) set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); - set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); + if (c->x86 == 15) + set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); + else + clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); c->x86_max_cores = intel_num_cpu_cores(c); srat_detect_node(); --- head-2011-02-17.orig/arch/x86/kernel/smp_64-xen.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/arch/x86/kernel/smp_64-xen.c 2011-01-31 17:32:16.000000000 +0100 @@ -384,12 +384,17 @@ int smp_call_function_single (int cpu, v put_cpu(); return 0; } + + /* Can deadlock when called with interrupts 
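The xen_idle() rework above adopts the 2.6.20 idle-loop protocol: clear TS_POLLING, then issue a full memory barrier before testing need_resched(), so a remote CPU either still sees TS_POLLING set (and skips the wakeup IPI) or the local CPU sees the resched flag before halting. A condensed restatement of the ordering:

    #include <linux/sched.h>

    static void idle_halt_sketch(void)
    {
            current_thread_info()->status &= ~TS_POLLING;
            /*
             * The TS_POLLING clear must be globally visible before
             * NEED_RESCHED is tested, or a wakeup can be lost and
             * the CPU halts with work pending.
             */
            smp_mb();
            local_irq_disable();
            if (!need_resched())
                    safe_halt();            /* re-enables interrupts */
            else
                    local_irq_enable();
            current_thread_info()->status |= TS_POLLING;
    }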
disabled */ + WARN_ON(irqs_disabled()); + spin_lock_bh(&call_lock); __smp_call_function_single(cpu, func, info, nonatomic, wait); spin_unlock_bh(&call_lock); put_cpu(); return 0; } +EXPORT_SYMBOL(smp_call_function_single); /* * this function sends a 'generic call function' IPI to all other CPUs --- head-2011-02-17.orig/arch/x86/kernel/traps_64-xen.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/arch/x86/kernel/traps_64-xen.c 2011-01-31 17:32:16.000000000 +0100 @@ -30,9 +30,10 @@ #include #include #include +#include +#include #include -#include #include #include #include @@ -108,12 +109,7 @@ static inline void preempt_conditional_c preempt_enable_no_resched(); } -static int kstack_depth_to_print = 12; -#ifdef CONFIG_STACK_UNWIND -static int call_trace = 1; -#else -#define call_trace (-1) -#endif +int kstack_depth_to_print = 12; #ifdef CONFIG_KALLSYMS void printk_address(unsigned long address) @@ -218,24 +214,7 @@ static unsigned long *in_exception_stack return NULL; } -struct ops_and_data { - struct stacktrace_ops *ops; - void *data; -}; - -static int dump_trace_unwind(struct unwind_frame_info *info, void *context) -{ - struct ops_and_data *oad = (struct ops_and_data *)context; - int n = 0; - - while (unwind(info) == 0 && UNW_PC(info)) { - n++; - oad->ops->address(oad->data, UNW_PC(info)); - if (arch_unw_user_mode(info)) - break; - } - return n; -} +#define MSG(txt) ops->warning(data, txt) /* * x86-64 can have upto three kernel stacks: @@ -250,61 +229,24 @@ static inline int valid_stack_ptr(struct return p > t && p < t + THREAD_SIZE - 3; } -void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack, +void dump_trace(struct task_struct *tsk, struct pt_regs *regs, + unsigned long *stack, struct stacktrace_ops *ops, void *data) { - const unsigned cpu = smp_processor_id(); - unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; + const unsigned cpu = get_cpu(); + unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr; unsigned used = 0; struct thread_info *tinfo; if (!tsk) tsk = current; - if (call_trace >= 0) { - int unw_ret = 0; - struct unwind_frame_info info; - struct ops_and_data oad = { .ops = ops, .data = data }; - - if (regs) { - if (unwind_init_frame_info(&info, tsk, regs) == 0) - unw_ret = dump_trace_unwind(&info, &oad); - } else if (tsk == current) - unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad); - else { - if (unwind_init_blocked(&info, tsk) == 0) - unw_ret = dump_trace_unwind(&info, &oad); - } - if (unw_ret > 0) { - if (call_trace == 1 && !arch_unw_user_mode(&info)) { - ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n", - UNW_PC(&info)); - if ((long)UNW_SP(&info) < 0) { - ops->warning(data, "Leftover inexact backtrace:\n"); - stack = (unsigned long *)UNW_SP(&info); - if (!stack) - return; - } else - ops->warning(data, "Full inexact backtrace again:\n"); - } else if (call_trace >= 1) - return; - else - ops->warning(data, "Full inexact backtrace again:\n"); - } else - ops->warning(data, "Inexact backtrace:\n"); - } if (!stack) { unsigned long dummy; stack = &dummy; if (tsk && tsk != current) stack = (unsigned long *)tsk->thread.rsp; } - /* - * Align the stack pointer on word boundary, later loops - * rely on that (and corruption / debug info bugs can cause - * unaligned values here): - */ - stack = (unsigned long *)((unsigned long)stack & ~(sizeof(long)-1)); /* * Print function call entries within a stack. 
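dump_trace() above trades smp_processor_id() for get_cpu()/put_cpu() because it dereferences the per-CPU PDA's irqstackptr; holding the preemption reference keeps the task pinned to that CPU while the per-CPU data is in use. The general pattern, sketched with an illustrative per-CPU variable:

    #include <linux/percpu.h>
    #include <linux/smp.h>

    static DEFINE_PER_CPU(unsigned long, scratch);

    static unsigned long read_local_scratch(void)
    {
            unsigned long val;
            int cpu = get_cpu();    /* disables preemption */

            val = per_cpu(scratch, cpu);
            put_cpu();
            return val;
    }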
'cond' is the @@ -314,9 +256,9 @@ void dump_trace(struct task_struct *tsk, #define HANDLE_STACK(cond) \ do while (cond) { \ unsigned long addr = *stack++; \ - if (oops_in_progress ? \ - __kernel_text_address(addr) : \ - kernel_text_address(addr)) { \ + /* Use unlocked access here because except for NMIs \ + we should be already protected against module unloads */ \ + if (__kernel_text_address(addr)) { \ /* \ * If the address is either in the text segment of the \ * kernel, or in the region which contains vmalloc'ed \ @@ -379,9 +321,10 @@ void dump_trace(struct task_struct *tsk, /* * This handles the process stack: */ - tinfo = current_thread_info(); + tinfo = task_thread_info(tsk); HANDLE_STACK (valid_stack_ptr(tinfo, stack)); #undef HANDLE_STACK + put_cpu(); } EXPORT_SYMBOL(dump_trace); @@ -518,30 +461,15 @@ bad: printk("\n"); } -void handle_BUG(struct pt_regs *regs) -{ - struct bug_frame f; - long len; - const char *prefix = ""; +int is_valid_bugaddr(unsigned long rip) +{ + unsigned short ud2; - if (user_mode(regs)) - return; - if (__copy_from_user(&f, (const void __user *) regs->rip, - sizeof(struct bug_frame))) - return; - if (f.filename >= 0 || - f.ud2[0] != 0x0f || f.ud2[1] != 0x0b) - return; - len = __strnlen_user((char *)(long)f.filename, PATH_MAX) - 1; - if (len < 0 || len >= PATH_MAX) - f.filename = (int)(long)"unmapped filename"; - else if (len > 50) { - f.filename += len - 50; - prefix = "..."; - } - printk("----------- [cut here ] --------- [please bite here ] ---------\n"); - printk(KERN_ALERT "Kernel BUG at %s%.50s:%d\n", prefix, (char *)(long)f.filename, f.line); -} + if (__copy_from_user(&ud2, (const void __user *) rip, sizeof(ud2))) + return 0; + + return ud2 == 0x0b0f; +} #ifdef CONFIG_BUG void out_of_line_bug(void) @@ -621,7 +549,9 @@ void die(const char * str, struct pt_reg { unsigned long flags = oops_begin(); - handle_BUG(regs); + if (!user_mode(regs)) + report_bug(regs->rip); + __die(str, regs, err); oops_end(flags); do_exit(SIGSEGV); @@ -790,8 +720,7 @@ mem_parity_error(unsigned char reason, s { printk(KERN_EMERG "Uhhuh. 
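With the generic report_bug() machinery in 2.6.20, the arch hook shrinks to is_valid_bugaddr() above: it only has to confirm that the trapping %rip really holds a ud2 opcode, 0x0f 0x0b, which reads as 0x0b0f in a little-endian 16-bit load. A sketch of that check, mirroring the patch's exception-guarded __copy_from_user() read:

    #include <asm/uaccess.h>

    static int rip_is_ud2(unsigned long rip)
    {
            unsigned short insn;

            /* Guarded read: rip may point into an unmapped page. */
            if (__copy_from_user(&insn, (const void __user *)rip, sizeof(insn)))
                    return 0;
            return insn == 0x0b0f;  /* 0x0f 0x0b == ud2 */
    }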
NMI received for unknown reason %02x.\n", reason); - printk(KERN_EMERG "You probably have a hardware problem with your " - "RAM chips\n"); + printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); if (panic_on_unrecovered_nmi) panic("NMI: Not continuing"); @@ -1227,21 +1156,3 @@ static int __init kstack_setup(char *s) return 0; } early_param("kstack", kstack_setup); - -#ifdef CONFIG_STACK_UNWIND -static int __init call_trace_setup(char *s) -{ - if (!s) - return -EINVAL; - if (strcmp(s, "old") == 0) - call_trace = -1; - else if (strcmp(s, "both") == 0) - call_trace = 0; - else if (strcmp(s, "newfallback") == 0) - call_trace = 1; - else if (strcmp(s, "new") == 0) - call_trace = 2; - return 0; -} -early_param("call_trace", call_trace_setup); -#endif --- head-2011-02-17.orig/arch/x86/kernel/vsyscall_64-xen.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/arch/x86/kernel/vsyscall_64-xen.c 2011-01-31 17:32:16.000000000 +0100 @@ -42,6 +42,7 @@ #include #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) +#define __syscall_clobber "r11","rcx","memory" int __sysctl_vsyscall __section_sysctl_vsyscall = 1; seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED; @@ -224,8 +225,7 @@ out: static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, - void **context) + void __user *newval, size_t newlen) { return -ENOSYS; } @@ -277,7 +277,6 @@ static void __cpuinit cpu_vsyscall_init( vsyscall_set_cpu(raw_smp_processor_id()); } -#ifdef CONFIG_HOTPLUG_CPU static int __cpuinit cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) { @@ -286,13 +285,13 @@ cpu_vsyscall_notifier(struct notifier_bl smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1); return NOTIFY_DONE; } -#endif static void __init map_vsyscall(void) { extern char __vsyscall_0; unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); + /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. 
*/ __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); } --- head-2011-02-17.orig/arch/x86/mm/fault_64-xen.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/arch/x86/mm/fault_64-xen.c 2011-01-31 17:32:16.000000000 +0100 @@ -23,9 +23,9 @@ #include #include #include +#include #include -#include #include #include #include @@ -96,7 +96,7 @@ void bust_spinlocks(int yes) static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, unsigned long error_code) { - unsigned char __user *instr; + unsigned char *instr; int scan_more = 1; int prefetch = 0; unsigned char *max_instr; @@ -116,7 +116,7 @@ static noinline int is_prefetch(struct p unsigned char instr_hi; unsigned char instr_lo; - if (__get_user(opcode, (char __user *)instr)) + if (probe_kernel_address(instr, opcode)) break; instr_hi = opcode & 0xf0; @@ -154,7 +154,7 @@ static noinline int is_prefetch(struct p case 0x00: /* Prefetch instruction is 0x0F0D or 0x0F18 */ scan_more = 0; - if (__get_user(opcode, (char __user *)instr)) + if (probe_kernel_address(instr, opcode)) break; prefetch = (instr_lo == 0xF) && (opcode == 0x0D || opcode == 0x18); @@ -170,7 +170,7 @@ static noinline int is_prefetch(struct p static int bad_address(void *p) { unsigned long dummy; - return __get_user(dummy, (unsigned long __user *)p); + return probe_kernel_address((unsigned long *)p, dummy); } void dump_pagetable(unsigned long address) --- head-2011-02-17.orig/arch/x86/mm/init_64-xen.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/arch/x86/mm/init_64-xen.c 2011-01-31 17:32:16.000000000 +0100 @@ -1164,14 +1164,15 @@ static __init int x8664_sysctl_init(void __initcall(x8664_sysctl_init); #endif -/* A pseudo VMAs to allow ptrace access for the vsyscall page. This only +/* A pseudo VMA to allow ptrace access for the vsyscall page. This only covers the 64bit vsyscall page now. 32bit has a real VMA now and does not need special handling anymore. */ static struct vm_area_struct gate_vma = { .vm_start = VSYSCALL_START, - .vm_end = VSYSCALL_END, - .vm_page_prot = PAGE_READONLY + .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT), + .vm_page_prot = PAGE_READONLY_EXEC, + .vm_flags = VM_READ | VM_EXEC }; struct vm_area_struct *get_gate_vma(struct task_struct *tsk) --- head-2011-02-17.orig/arch/x86/mm/pageattr_64-xen.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/arch/x86/mm/pageattr_64-xen.c 2011-01-31 17:32:16.000000000 +0100 @@ -330,34 +330,40 @@ static struct page *split_large_page(uns return base; } - -static void flush_kernel_map(void *address) +static void cache_flush_page(void *adr) { - if (0 && address && cpu_has_clflush) { - /* is this worth it? 
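Both fault-path hunks above convert __get_user() on kernel pointers to probe_kernel_address(), which performs the same exception-protected read but is the documented interface for kernel-space addresses rather than __user ones. Typical use, as in the patched bad_address() helper:

    #include <linux/uaccess.h>

    /* Nonzero if the page-table word at p cannot be safely read. */
    static int pgtable_word_unreadable(void *p)
    {
            unsigned long dummy;

            return probe_kernel_address((unsigned long *)p, dummy);
    }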
*/ - int i; - for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size) - asm volatile("clflush (%0)" :: "r" (address + i)); - } else - asm volatile("wbinvd":::"memory"); - if (address) - __flush_tlb_one(address); - else - __flush_tlb_all(); + int i; + for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size) + asm volatile("clflush (%0)" :: "r" (adr + i)); } +static void flush_kernel_map(void *arg) +{ + struct list_head *l = (struct list_head *)arg; + struct page *pg; -static inline void flush_map(unsigned long address) + /* When clflush is available always use it because it is + much cheaper than WBINVD */ + if (!cpu_has_clflush) + asm volatile("wbinvd" ::: "memory"); + list_for_each_entry(pg, l, lru) { + void *adr = page_address(pg); + if (cpu_has_clflush) + cache_flush_page(adr); + __flush_tlb_one(adr); + } +} + +static inline void flush_map(struct list_head *l) { - on_each_cpu(flush_kernel_map, (void *)address, 1, 1); + on_each_cpu(flush_kernel_map, l, 1, 1); } -static struct page *deferred_pages; /* protected by init_mm.mmap_sem */ +static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */ static inline void save_page(struct page *fpage) { - fpage->lru.next = (struct list_head *)deferred_pages; - deferred_pages = fpage; + list_add(&fpage->lru, &deferred_pages); } /* @@ -487,18 +493,18 @@ int change_page_attr(struct page *page, void global_flush_tlb(void) { - struct page *dpage; + struct page *pg, *next; + struct list_head l; down_read(&init_mm.mmap_sem); - dpage = xchg(&deferred_pages, NULL); + list_replace_init(&deferred_pages, &l); up_read(&init_mm.mmap_sem); - flush_map((dpage && !dpage->lru.next) ? (unsigned long)page_address(dpage) : 0); - while (dpage) { - struct page *tmp = dpage; - dpage = (struct page *)dpage->lru.next; - ClearPagePrivate(tmp); - __free_page(tmp); + flush_map(&l); + + list_for_each_entry_safe(pg, next, &l, lru) { + ClearPagePrivate(pg); + __free_page(pg); } } --- head-2011-02-17.orig/drivers/pci/msi-xen.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/drivers/pci/msi-xen.c 2011-01-31 17:32:16.000000000 +0100 @@ -273,10 +273,8 @@ void disable_msi_mode(struct pci_dev *de pci_write_config_word(dev, msi_control_reg(pos), control); dev->msix_enabled = 0; } - if (pci_find_capability(dev, PCI_CAP_ID_EXP)) { - /* PCI Express Endpoint device detected */ - pci_intx(dev, 1); /* enable intx */ - } + + pci_intx(dev, 1); /* enable intx */ } static void enable_msi_mode(struct pci_dev *dev, int pos, int type) @@ -294,10 +292,8 @@ static void enable_msi_mode(struct pci_d pci_write_config_word(dev, msi_control_reg(pos), control); dev->msix_enabled = 1; } - if (pci_find_capability(dev, PCI_CAP_ID_EXP)) { - /* PCI Express Endpoint device detected */ - pci_intx(dev, 0); /* disable intx */ - } + + pci_intx(dev, 0); /* disable intx */ } #ifdef CONFIG_PM --- head-2011-02-17.orig/drivers/xen/balloon/balloon.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/drivers/xen/balloon/balloon.c 2011-01-31 17:32:16.000000000 +0100 @@ -106,8 +106,8 @@ static unsigned long __read_mostly total static LIST_HEAD(ballooned_pages); /* Main work function, always executed in process context. 
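The pageattr rework above prefers per-line clflush to a machine-wide wbinvd whenever the CPU advertises it, stepping through the page in boot_cpu_data.x86_clflush_size strides (the field generic_identify() now populates from CPUID). The per-page helper, restated with comments:

    #include <asm/page.h>
    #include <asm/processor.h>

    static void clflush_one_page(void *adr)
    {
            int i;

            /* One cache line per iteration; far cheaper than wbinvd. */
            for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
                    asm volatile("clflush (%0)" :: "r" (adr + i));
    }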
*/ -static void balloon_process(void *unused); -static DECLARE_WORK(balloon_worker, balloon_process, NULL); +static void balloon_process(struct work_struct *unused); +static DECLARE_WORK(balloon_worker, balloon_process); static struct timer_list balloon_timer; /* When ballooning out (allocating memory to return to Xen) we don't really @@ -414,7 +414,7 @@ static int decrease_reservation(unsigned * by the balloon lock), or with changes to the Xen hard limit, but we will * recover from these in time. */ -static void balloon_process(void *unused) +static void balloon_process(struct work_struct *unused) { int need_sleep = 0; long credit; --- head-2011-02-17.orig/drivers/xen/blkback/blkback.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/drivers/xen/blkback/blkback.c 2011-01-31 17:32:16.000000000 +0100 @@ -37,6 +37,7 @@ #include #include +#include #include #include #include --- head-2011-02-17.orig/drivers/xen/blkback/interface.c 2011-01-31 17:02:29.000000000 +0100 +++ head-2011-02-17/drivers/xen/blkback/interface.c 2011-01-31 17:32:16.000000000 +0100 @@ -35,7 +35,7 @@ #include #include -static kmem_cache_t *blkif_cachep; +static struct kmem_cache *blkif_cachep; blkif_t *blkif_alloc(domid_t domid) { --- head-2011-02-17.orig/drivers/xen/blkfront/blkfront.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/drivers/xen/blkfront/blkfront.c 2011-01-31 17:32:16.000000000 +0100 @@ -71,7 +71,7 @@ static int setup_blkring(struct xenbus_d static void kick_pending_request_queues(struct blkfront_info *); static irqreturn_t blkif_int(int irq, void *dev_id); -static void blkif_restart_queue(void *arg); +static void blkif_restart_queue(struct work_struct *arg); static int blkif_recover(struct blkfront_info *); static void blkif_completion(struct blk_shadow *); static void blkif_free(struct blkfront_info *, int); @@ -111,7 +111,7 @@ static int blkfront_probe(struct xenbus_ info->xbdev = dev; info->vdevice = vdevice; info->connected = BLKIF_STATE_DISCONNECTED; - INIT_WORK(&info->work, blkif_restart_queue, (void *)info); + INIT_WORK(&info->work, blkif_restart_queue); for (i = 0; i < BLK_RING_SIZE; i++) info->shadow[i].req.id = i+1; @@ -482,9 +482,9 @@ static void kick_pending_request_queues( } } -static void blkif_restart_queue(void *arg) +static void blkif_restart_queue(struct work_struct *arg) { - struct blkfront_info *info = (struct blkfront_info *)arg; + struct blkfront_info *info = container_of(arg, struct blkfront_info, work); spin_lock_irq(&blkif_io_lock); if (info->connected == BLKIF_STATE_CONNECTED) kick_pending_request_queues(info); --- head-2011-02-17.orig/drivers/xen/blktap/blktap.c 2011-02-17 10:07:17.000000000 +0100 +++ head-2011-02-17/drivers/xen/blktap/blktap.c 2011-01-31 17:32:16.000000000 +0100 @@ -40,6 +40,7 @@ #include #include +#include #include #include #include "common.h" --- head-2011-02-17.orig/drivers/xen/blktap/interface.c 2011-01-31 17:02:29.000000000 +0100 +++ head-2011-02-17/drivers/xen/blktap/interface.c 2011-01-31 17:32:16.000000000 +0100 @@ -35,7 +35,7 @@ #include #include -static kmem_cache_t *blkif_cachep; +static struct kmem_cache *blkif_cachep; blkif_t *tap_alloc_blkif(domid_t domid) { --- head-2011-02-17.orig/drivers/xen/char/mem.c 2007-08-06 15:10:49.000000000 +0200 +++ head-2011-02-17/drivers/xen/char/mem.c 2011-01-31 17:32:16.000000000 +0100 @@ -157,7 +157,7 @@ static loff_t memory_lseek(struct file * { loff_t ret; - mutex_lock(&file->f_dentry->d_inode->i_mutex); + mutex_lock(&file->f_path.dentry->d_inode->i_mutex); switch (orig) { case 0: file->f_pos = 
offset; @@ -172,7 +172,7 @@ static loff_t memory_lseek(struct file * default: ret = -EINVAL; } - mutex_unlock(&file->f_dentry->d_inode->i_mutex); + mutex_unlock(&file->f_path.dentry->d_inode->i_mutex); return ret; } --- head-2011-02-17.orig/drivers/xen/console/console.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/drivers/xen/console/console.c 2011-01-31 17:32:16.000000000 +0100 @@ -85,11 +85,6 @@ static int xc_num = -1; #define XEN_HVC_MAJOR 229 #define XEN_HVC_MINOR 0 -#ifdef CONFIG_MAGIC_SYSRQ -static unsigned long sysrq_requested; -extern int sysrq_enabled; -#endif - static int __init xencons_setup(char *str) { char *q; @@ -354,8 +349,8 @@ void __init dom0_init_screen_info(const #define DUMMY_TTY(_tty) ((xc_mode == XC_TTY) && \ ((_tty)->index != (xc_num - 1))) -static struct termios *xencons_termios[MAX_NR_CONSOLES]; -static struct termios *xencons_termios_locked[MAX_NR_CONSOLES]; +static struct ktermios *xencons_termios[MAX_NR_CONSOLES]; +static struct ktermios *xencons_termios_locked[MAX_NR_CONSOLES]; static struct tty_struct *xencons_tty; static int xencons_priv_irq; static char x_char; @@ -371,7 +366,9 @@ void xencons_rx(char *buf, unsigned len) for (i = 0; i < len; i++) { #ifdef CONFIG_MAGIC_SYSRQ - if (sysrq_enabled) { + if (sysrq_on()) { + static unsigned long sysrq_requested; + if (buf[i] == '\x0f') { /* ^O */ if (!sysrq_requested) { sysrq_requested = jiffies; --- head-2011-02-17.orig/drivers/xen/core/reboot.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/drivers/xen/core/reboot.c 2011-01-31 17:32:16.000000000 +0100 @@ -33,8 +33,8 @@ static int suspend_cancelled; /* Can we leave APs online when we suspend? */ static int fast_suspend; -static void __shutdown_handler(void *unused); -static DECLARE_WORK(shutdown_work, __shutdown_handler, NULL); +static void __shutdown_handler(struct work_struct *unused); +static DECLARE_DELAYED_WORK(shutdown_work, __shutdown_handler); static int setup_suspend_evtchn(void); @@ -104,7 +104,7 @@ static int xen_suspend(void *__unused) case SHUTDOWN_RESUMING: break; default: - schedule_work(&shutdown_work); + schedule_delayed_work(&shutdown_work, 0); break; } @@ -136,12 +136,12 @@ static void switch_shutdown_state(int ne /* Either we kick off the work, or we leave it to xen_suspend(). 
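Most of the driver churn from here on is the 2.6.20 workqueue API change: work handlers now receive the struct work_struct itself instead of a void *data cookie, per-object state is recovered with container_of(), and timed work becomes a struct delayed_work (hence reboot.c's DECLARE_DELAYED_WORK plus schedule_delayed_work(&work, 0) even for immediate scheduling). The new shape in miniature (the foo names are illustrative):

    #include <linux/workqueue.h>

    struct foo {
            int pending;
            struct work_struct work;        /* embedded, no data pointer */
    };

    static void foo_worker(struct work_struct *work)
    {
            struct foo *f = container_of(work, struct foo, work);

            f->pending = 0;
    }

    static void foo_kick(struct foo *f)
    {
            INIT_WORK(&f->work, foo_worker);        /* third argument is gone */
            f->pending = 1;
            schedule_work(&f->work);
    }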
*/ if (old_state == SHUTDOWN_INVALID) - schedule_work(&shutdown_work); + schedule_delayed_work(&shutdown_work, 0); else BUG_ON(old_state != SHUTDOWN_RESUMING); } -static void __shutdown_handler(void *unused) +static void __shutdown_handler(struct work_struct *unused) { int err; --- head-2011-02-17.orig/drivers/xen/core/smpboot.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/drivers/xen/core/smpboot.c 2011-01-31 17:32:16.000000000 +0100 @@ -158,7 +158,12 @@ static void xen_smp_intr_exit(unsigned i void __cpuinit cpu_bringup(void) { +#ifdef __i386__ + cpu_set_gdt(current_thread_info()->cpu); + secondary_cpu_init(); +#else cpu_init(); +#endif identify_cpu(cpu_data + smp_processor_id()); touch_softlockup_watchdog(); preempt_disable(); @@ -296,11 +301,12 @@ void __init smp_prepare_cpus(unsigned in if (cpu == 0) continue; + idle = fork_idle(cpu); + if (IS_ERR(idle)) + panic("failed fork for CPU %d", cpu); + #ifdef __x86_64__ gdt_descr = &cpu_gdt_descr[cpu]; -#else - gdt_descr = &per_cpu(cpu_gdt_descr, cpu); -#endif gdt_descr->address = get_zeroed_page(GFP_KERNEL); if (unlikely(!gdt_descr->address)) { printk(KERN_CRIT "CPU%d failed to allocate GDT\n", @@ -309,6 +315,11 @@ void __init smp_prepare_cpus(unsigned in } gdt_descr->size = GDT_SIZE; memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE); +#else + if (unlikely(!init_gdt(cpu, idle))) + continue; + gdt_descr = &per_cpu(cpu_gdt_descr, cpu); +#endif make_page_readonly( (void *)gdt_descr->address, XENFEAT_writable_descriptor_tables); @@ -327,10 +338,6 @@ void __init smp_prepare_cpus(unsigned in x86_cpu_to_apicid[cpu] = apicid; - idle = fork_idle(cpu); - if (IS_ERR(idle)) - panic("failed fork for CPU %d", cpu); - #ifdef __x86_64__ cpu_pda(cpu)->pcurrent = idle; cpu_pda(cpu)->cpunumber = cpu; --- head-2011-02-17.orig/drivers/xen/fbfront/xenfb.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/drivers/xen/fbfront/xenfb.c 2011-01-31 17:32:16.000000000 +0100 @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include --- head-2011-02-17.orig/drivers/xen/netback/loopback.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/drivers/xen/netback/loopback.c 2011-01-31 17:32:16.000000000 +0100 @@ -54,6 +54,7 @@ #include #include /* secpath_reset() */ #include /* is_initial_xendomain() */ +#include <../net/core/kmap_skb.h> /* k{,un}map_skb_frag() */ static int nloopbacks = -1; module_param(nloopbacks, int, 0); --- head-2011-02-17.orig/drivers/xen/pciback/conf_space_header.c 2010-03-02 09:56:10.000000000 +0100 +++ head-2011-02-17/drivers/xen/pciback/conf_space_header.c 2011-01-31 17:32:16.000000000 +0100 @@ -24,7 +24,7 @@ static int command_read(struct pci_dev * int ret; ret = pciback_read_config_word(dev, offset, value, data); - if (!dev->is_enabled) + if (!atomic_read(&dev->enable_cnt)) return ret; for (i = 0; i < PCI_ROM_RESOURCE; i++) { @@ -41,14 +41,14 @@ static int command_write(struct pci_dev { int err; - if (!dev->is_enabled && is_enable_cmd(value)) { + if (!atomic_read(&dev->enable_cnt) && is_enable_cmd(value)) { if (unlikely(verbose_request)) printk(KERN_DEBUG "pciback: %s: enable\n", pci_name(dev)); err = pci_enable_device(dev); if (err) return err; - } else if (dev->is_enabled && !is_enable_cmd(value)) { + } else if (atomic_read(&dev->enable_cnt) && !is_enable_cmd(value)) { if (unlikely(verbose_request)) printk(KERN_DEBUG "pciback: %s: disable\n", pci_name(dev)); --- head-2011-02-17.orig/drivers/xen/pciback/pciback.h 2011-01-31 17:29:16.000000000 +0100 +++ 
head-2011-02-17/drivers/xen/pciback/pciback.h 2011-01-31 17:32:16.000000000 +0100 @@ -100,7 +100,7 @@ void pciback_release_devices(struct pcib /* Handles events from front-end */ irqreturn_t pciback_handle_event(int irq, void *dev_id); -void pciback_do_op(void *data); +void pciback_do_op(struct work_struct *work); int pciback_xenbus_register(void); void pciback_xenbus_unregister(void); --- head-2011-02-17.orig/drivers/xen/pciback/pciback_ops.c 2011-02-17 10:07:33.000000000 +0100 +++ head-2011-02-17/drivers/xen/pciback/pciback_ops.c 2011-02-17 10:07:46.000000000 +0100 @@ -34,7 +34,7 @@ void pciback_reset_device(struct pci_dev pci_write_config_word(dev, PCI_COMMAND, 0); - dev->is_enabled = 0; + atomic_set(&dev->enable_cnt, 0); dev->is_busmaster = 0; } else { pci_read_config_word(dev, PCI_COMMAND, &cmd); @@ -75,9 +75,9 @@ void test_and_schedule_op(struct pciback * context because some of the pci_* functions can sleep (mostly due to ACPI * use of semaphores). This function is intended to be called from a work * queue in process context taking a struct pciback_device as a parameter */ -void pciback_do_op(void *data) +void pciback_do_op(struct work_struct *work) { - struct pciback_device *pdev = data; + struct pciback_device *pdev = container_of(work, struct pciback_device, op_work); struct pci_dev *dev; struct xen_pci_op *op = &pdev->sh_info->op; --- head-2011-02-17.orig/drivers/xen/pciback/xenbus.c 2009-04-07 13:58:48.000000000 +0200 +++ head-2011-02-17/drivers/xen/pciback/xenbus.c 2011-01-31 17:32:16.000000000 +0100 @@ -33,7 +33,7 @@ static struct pciback_device *alloc_pdev pdev->evtchn_irq = INVALID_EVTCHN_IRQ; pdev->be_watching = 0; - INIT_WORK(&pdev->op_work, pciback_do_op, pdev); + INIT_WORK(&pdev->op_work, pciback_do_op); if (pciback_init_devices(pdev)) { kfree(pdev); --- head-2011-02-17.orig/drivers/xen/pcifront/pci_op.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/drivers/xen/pcifront/pci_op.c 2011-01-31 17:32:16.000000000 +0100 @@ -634,9 +634,9 @@ static pci_ers_result_t pcifront_common_ } -void pcifront_do_aer(void *data) +void pcifront_do_aer(struct work_struct *data) { - struct pcifront_device *pdev = data; + struct pcifront_device *pdev = container_of(data, struct pcifront_device, op_work); int cmd = pdev->sh_info->aer_op.cmd; pci_channel_state_t state = (pci_channel_state_t)pdev->sh_info->aer_op.err; --- head-2011-02-17.orig/drivers/xen/pcifront/pcifront.h 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/drivers/xen/pcifront/pcifront.h 2011-01-31 17:32:16.000000000 +0100 @@ -49,7 +49,7 @@ int pcifront_rescan_root(struct pcifront unsigned int domain, unsigned int bus); void pcifront_free_roots(struct pcifront_device *pdev); -void pcifront_do_aer( void *data); +void pcifront_do_aer(struct work_struct *data); irqreturn_t pcifront_handler_aer(int irq, void *dev); --- head-2011-02-17.orig/drivers/xen/pcifront/xenbus.c 2010-10-05 09:58:12.000000000 +0200 +++ head-2011-02-17/drivers/xen/pcifront/xenbus.c 2011-01-31 17:32:16.000000000 +0100 @@ -50,7 +50,7 @@ static struct pcifront_device *alloc_pde pdev->gnt_ref = INVALID_GRANT_REF; pdev->irq = -1; - INIT_WORK(&pdev->op_work, pcifront_do_aer, pdev); + INIT_WORK(&pdev->op_work, pcifront_do_aer); dev_dbg(&xdev->dev, "Allocated pdev @ 0x%p pdev->sh_info @ 0x%p\n", pdev, pdev->sh_info); --- head-2011-02-17.orig/drivers/xen/scsiback/interface.c 2011-01-31 17:02:29.000000000 +0100 +++ head-2011-02-17/drivers/xen/scsiback/interface.c 2011-01-31 17:32:16.000000000 +0100 @@ -40,7 +40,7 @@ #include -static kmem_cache_t 
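pciback above follows the 2.6.20 move from pci_dev::is_enabled to the atomic, nestable enable_cnt: tests become atomic_read(), and only the reset path forces the count back to zero. Sketched usage, assuming a 2.6.20 struct pci_dev:

    #include <linux/pci.h>

    /* Enable the device only if nobody has enabled it yet. */
    static int enable_if_needed(struct pci_dev *dev)
    {
            if (atomic_read(&dev->enable_cnt))
                    return 0;
            return pci_enable_device(dev);  /* increments enable_cnt */
    }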
*scsiback_cachep; +static struct kmem_cache *scsiback_cachep; struct vscsibk_info *vscsibk_info_alloc(domid_t domid) { --- head-2011-02-17.orig/drivers/xen/scsiback/scsiback.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/drivers/xen/scsiback/scsiback.c 2011-01-31 17:32:16.000000000 +0100 @@ -341,13 +341,11 @@ static int scsiback_merge_bio(struct req if (!rq->bio) blk_rq_bio_prep(q, rq, bio); - else if (!q->back_merge_fn(q, rq, bio)) + else if (!ll_back_merge_fn(q, rq, bio)) return -EINVAL; else { rq->biotail->bi_next = bio; rq->biotail = bio; - rq->hard_nr_sectors += bio_sectors(bio); - rq->nr_sectors = rq->hard_nr_sectors; } return 0; --- head-2011-02-17.orig/drivers/xen/sfc_netfront/accel_vi.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/drivers/xen/sfc_netfront/accel_vi.c 2011-01-31 17:32:16.000000000 +0100 @@ -465,7 +465,7 @@ netfront_accel_enqueue_skb_multi(netfron if (skb->ip_summed == CHECKSUM_PARTIAL) { /* Set to zero to encourage falcon to work it out for us */ - *(u16*)(skb->h.raw + skb->csum) = 0; + *(u16*)(skb->h.raw + skb->csum_offset) = 0; } if (multi_post_start_new_buffer(vnic, &state)) { @@ -584,7 +584,7 @@ netfront_accel_enqueue_skb_single(netfro if (skb->ip_summed == CHECKSUM_PARTIAL) { /* Set to zero to encourage falcon to work it out for us */ - *(u16*)(skb->h.raw + skb->csum) = 0; + *(u16*)(skb->h.raw + skb->csum_offset) = 0; } NETFRONT_ACCEL_PKTBUFF_FOR_EACH_FRAGMENT (skb, idx, frag_data, frag_len, { --- head-2011-02-17.orig/drivers/xen/tpmback/interface.c 2011-01-31 17:02:29.000000000 +0100 +++ head-2011-02-17/drivers/xen/tpmback/interface.c 2011-01-31 17:32:16.000000000 +0100 @@ -16,7 +16,7 @@ #include #include -static kmem_cache_t *tpmif_cachep; +static struct kmem_cache *tpmif_cachep; int num_frontends = 0; LIST_HEAD(tpmif_list); --- head-2011-02-17.orig/drivers/xen/usbback/usbback.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/drivers/xen/usbback/usbback.c 2011-01-31 17:32:16.000000000 +0100 @@ -534,9 +534,10 @@ struct set_interface_request { struct work_struct work; }; -static void usbbk_set_interface_work(void *data) +static void usbbk_set_interface_work(struct work_struct *arg) { - struct set_interface_request *req = (struct set_interface_request *) data; + struct set_interface_request *req + = container_of(arg, struct set_interface_request, work); pending_req_t *pending_req = req->pending_req; struct usb_device *udev = req->pending_req->stub->udev; @@ -564,7 +565,7 @@ static int usbbk_set_interface(pending_r req->pending_req = pending_req; req->interface = interface; req->alternate = alternate; - INIT_WORK(&req->work, usbbk_set_interface_work, req); + INIT_WORK(&req->work, usbbk_set_interface_work); usb_get_dev(udev); schedule_work(&req->work); return 0; @@ -576,9 +577,10 @@ struct clear_halt_request { struct work_struct work; }; -static void usbbk_clear_halt_work(void *data) +static void usbbk_clear_halt_work(struct work_struct *arg) { - struct clear_halt_request *req = (struct clear_halt_request *) data; + struct clear_halt_request *req + = container_of(arg, struct clear_halt_request, work); pending_req_t *pending_req = req->pending_req; struct usb_device *udev = req->pending_req->stub->udev; int ret; @@ -604,7 +606,7 @@ static int usbbk_clear_halt(pending_req_ return -ENOMEM; req->pending_req = pending_req; req->pipe = pipe; - INIT_WORK(&req->work, usbbk_clear_halt_work, req); + INIT_WORK(&req->work, usbbk_clear_halt_work); usb_get_dev(udev); schedule_work(&req->work); @@ -617,9 +619,10 @@ struct port_reset_request { struct 
work_struct work; }; -static void usbbk_port_reset_work(void *data) +static void usbbk_port_reset_work(struct work_struct *arg) { - struct port_reset_request *req = (struct port_reset_request *) data; + struct port_reset_request *req + = container_of(arg, struct port_reset_request, work); pending_req_t *pending_req = req->pending_req; struct usb_device *udev = pending_req->stub->udev; int ret, ret_lock; @@ -648,7 +651,7 @@ static int usbbk_port_reset(pending_req_ return -ENOMEM; req->pending_req = pending_req; - INIT_WORK(&req->work, usbbk_port_reset_work, req); + INIT_WORK(&req->work, usbbk_port_reset_work); usb_get_dev(udev); schedule_work(&req->work); --- head-2011-02-17.orig/drivers/xen/xenbus/xenbus_comms.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/drivers/xen/xenbus/xenbus_comms.c 2011-01-31 17:32:16.000000000 +0100 @@ -49,8 +49,8 @@ static int xenbus_irq; -extern void xenbus_probe(void *); -static DECLARE_WORK(probe_work, xenbus_probe, NULL); +extern void xenbus_probe(struct work_struct *); +static DECLARE_WORK(probe_work, xenbus_probe); static DECLARE_WAIT_QUEUE_HEAD(xb_waitq); --- head-2011-02-17.orig/drivers/xen/xenbus/xenbus_probe.c 2011-01-31 17:51:15.000000000 +0100 +++ head-2011-02-17/drivers/xen/xenbus/xenbus_probe.c 2011-01-31 17:32:16.000000000 +0100 @@ -860,7 +860,7 @@ void unregister_xenstore_notifier(struct EXPORT_SYMBOL_GPL(unregister_xenstore_notifier); -void xenbus_probe(void *unused) +void xenbus_probe(struct work_struct *unused) { BUG_ON(!is_xenstored_ready()); --- head-2011-02-17.orig/arch/x86/include/mach-xen/asm/desc_32.h 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/arch/x86/include/mach-xen/asm/desc_32.h 2011-01-31 17:32:16.000000000 +0100 @@ -4,8 +4,6 @@ #include #include -#define CPU_16BIT_STACK_SIZE 1024 - #ifndef __ASSEMBLY__ #include @@ -15,8 +13,6 @@ extern struct desc_struct cpu_gdt_table[GDT_ENTRIES]; -DECLARE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]); - struct Xgt_desc_struct { unsigned short size; unsigned long address __attribute__((packed)); @@ -32,11 +28,6 @@ static inline struct desc_struct *get_cp return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address; } -/* - * This is the ldt that every process will get unless we need - * something other than this. - */ -extern struct desc_struct default_ldt[]; extern struct desc_struct idt_table[]; extern void set_intr_gate(unsigned int irq, void * addr); @@ -63,8 +54,8 @@ static inline void pack_gate(__u32 *a, _ #define DESCTYPE_DPL3 0x60 /* DPL-3 */ #define DESCTYPE_S 0x10 /* !system */ +#ifndef CONFIG_XEN #define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8)) -#define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8)) #define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr)) #define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr)) @@ -75,6 +66,7 @@ static inline void pack_gate(__u32 *a, _ #define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr)) #define store_tr(tr) __asm__ ("str %0":"=m" (tr)) #define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt)) +#endif #if TLS_SIZE != 24 # error update this code. 
@@ -90,22 +82,43 @@ static inline void load_TLS(struct threa } #ifndef CONFIG_XEN +#define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) +#define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) +#define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) + static inline void write_dt_entry(void *dt, int entry, __u32 entry_a, __u32 entry_b) { __u32 *lp = (__u32 *)((char *)dt + entry*8); *lp = entry_a; *(lp+1) = entry_b; } - -#define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) -#define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) +#define set_ldt native_set_ldt #else extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b); extern int write_gdt_entry(void *gdt, int entry, __u32 entry_a, __u32 entry_b); +#define set_ldt xen_set_ldt +#endif + +#ifndef CONFIG_XEN +static inline fastcall void native_set_ldt(const void *addr, + unsigned int entries) +{ + if (likely(entries == 0)) + __asm__ __volatile__("lldt %w0"::"q" (0)); + else { + unsigned cpu = smp_processor_id(); + __u32 a, b; + + pack_descriptor(&a, &b, (unsigned long)addr, + entries * sizeof(struct desc_struct) - 1, + DESCTYPE_LDT, 0); + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b); + __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8)); + } +} #endif -#ifndef CONFIG_X86_NO_IDT -#define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) +#ifndef CONFIG_X86_NO_IDT static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg) { __u32 a, b; @@ -125,14 +138,6 @@ static inline void __set_tss_desc(unsign } #endif -static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int entries) -{ - __u32 a, b; - pack_descriptor(&a, &b, (unsigned long)addr, - entries * sizeof(struct desc_struct) - 1, - DESCTYPE_LDT, 0); - write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b); -} #define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr) @@ -163,36 +168,22 @@ static inline void set_ldt_desc(unsigned static inline void clear_LDT(void) { - int cpu = get_cpu(); - - /* - * NB. We load the default_ldt for lcall7/27 handling on demand, as - * it slows down context switching. Noone uses it anyway. - */ - cpu = cpu; /* XXX avoid compiler warning */ - xen_set_ldt(NULL, 0); - put_cpu(); + set_ldt(NULL, 0); } /* * load one particular LDT into the current CPU */ -static inline void load_LDT_nolock(mm_context_t *pc, int cpu) +static inline void load_LDT_nolock(mm_context_t *pc) { - void *segments = pc->ldt; - int count = pc->size; - - if (likely(!count)) - segments = NULL; - - xen_set_ldt(segments, count); + set_ldt(pc->ldt, pc->size); } static inline void load_LDT(mm_context_t *pc) { - int cpu = get_cpu(); - load_LDT_nolock(pc, cpu); - put_cpu(); + preempt_disable(); + load_LDT_nolock(pc); + preempt_enable(); } static inline unsigned long get_desc_base(unsigned long *desc) @@ -204,6 +195,29 @@ static inline unsigned long get_desc_bas return base; } +#else /* __ASSEMBLY__ */ + +/* + * GET_DESC_BASE reads the descriptor base of the specified segment. + * + * Args: + * idx - descriptor index + * gdt - GDT pointer + * base - 32bit register to which the base will be written + * lo_w - lo word of the "base" register + * lo_b - lo byte of the "base" register + * hi_b - hi byte of the low word of the "base" register + * + * Example: + * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah) + * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax. 
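The desc_32.h rework above funnels both the native lldt sequence (native_set_ldt()) and the Xen hypercall path (xen_set_ldt()) through a single set_ldt macro, and load_LDT() becomes a thin preemption guard around load_LDT_nolock(). The wrapper, restated:

    #include <linux/preempt.h>

    static inline void load_ldt_sketch(mm_context_t *pc)
    {
            preempt_disable();              /* must not migrate mid-load */
            set_ldt(pc->ldt, pc->size);     /* lldt or Xen hypercall */
            preempt_enable();
    }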
+ */ +#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \ + movb idx*8+4(gdt), lo_b; \ + movb idx*8+7(gdt), hi_b; \ + shll $16, base; \ + movw idx*8+2(gdt), lo_w; + #endif /* !__ASSEMBLY__ */ #endif --- head-2011-02-17.orig/arch/x86/include/mach-xen/asm/fixmap_32.h 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/arch/x86/include/mach-xen/asm/fixmap_32.h 2011-01-31 17:32:16.000000000 +0100 @@ -13,13 +13,16 @@ #ifndef _ASM_FIXMAP_H #define _ASM_FIXMAP_H - /* used by vmalloc.c, vsyscall.lds.S. * * Leave one empty page between vmalloc'ed areas and * the start of the fixmap. */ extern unsigned long __FIXADDR_TOP; +#ifdef CONFIG_COMPAT_VDSO +#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO) +#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1) +#endif #ifndef __ASSEMBLY__ #include --- head-2011-02-17.orig/arch/x86/include/mach-xen/asm/highmem.h 2008-10-29 09:55:56.000000000 +0100 +++ head-2011-02-17/arch/x86/include/mach-xen/asm/highmem.h 2011-01-31 17:32:16.000000000 +0100 @@ -85,7 +85,7 @@ static inline void clear_user_highpage(s void copy_highpage(struct page *to, struct page *from); static inline void copy_user_highpage(struct page *to, struct page *from, - unsigned long vaddr) + unsigned long vaddr, struct vm_area_struct *vma) { copy_highpage(to, from); } --- head-2011-02-17.orig/arch/x86/include/mach-xen/asm/hypervisor.h 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/arch/x86/include/mach-xen/asm/hypervisor.h 2011-01-31 17:32:16.000000000 +0100 @@ -47,15 +47,6 @@ #include #include #include -#if defined(__i386__) -# ifdef CONFIG_X86_PAE -# include -# else -# include -# endif -#elif defined(__x86_64__) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11) -# include -#endif extern shared_info_t *HYPERVISOR_shared_info; --- head-2011-02-17.orig/arch/x86/include/mach-xen/asm/irqflags_32.h 2007-06-12 13:14:02.000000000 +0200 +++ head-2011-02-17/arch/x86/include/mach-xen/asm/irqflags_32.h 2011-01-31 17:32:16.000000000 +0100 @@ -22,9 +22,6 @@ #define __raw_local_save_flags() (current_vcpu_info()->evtchn_upcall_mask) -#define raw_local_save_flags(flags) \ - do { (flags) = __raw_local_save_flags(); } while (0) - #define raw_local_irq_restore(x) \ do { \ vcpu_info_t *_vcpu; \ @@ -66,18 +63,6 @@ void raw_safe_halt(void); */ void halt(void); -static inline int raw_irqs_disabled_flags(unsigned long flags) -{ - return (flags != 0); -} - -#define raw_irqs_disabled() \ -({ \ - unsigned long flags = __raw_local_save_flags(); \ - \ - raw_irqs_disabled_flags(flags); \ -}) - /* * For spinlocks, etc: */ @@ -90,9 +75,64 @@ static inline int raw_irqs_disabled_flag flags; \ }) +#else +/* Offsets into shared_info_t. */ +#define evtchn_upcall_pending /* 0 */ +#define evtchn_upcall_mask 1 + +#define sizeof_vcpu_shift 6 + +#ifdef CONFIG_SMP +#define GET_VCPU_INFO movl TI_cpu(%ebp),%esi ; \ + shl $sizeof_vcpu_shift,%esi ; \ + addl HYPERVISOR_shared_info,%esi +#else +#define GET_VCPU_INFO movl HYPERVISOR_shared_info,%esi +#endif + +#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(%esi) +#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(%esi) +#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi) +#define DISABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \ + __DISABLE_INTERRUPTS +#define ENABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \ + __ENABLE_INTERRUPTS +#define ENABLE_INTERRUPTS_SYSEXIT __ENABLE_INTERRUPTS ; \ +sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \ + __TEST_PENDING ; \ + jnz 14f /* process more events if necessary... 
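The assembly DISABLE_INTERRUPTS/ENABLE_INTERRUPTS macros above are the event-channel analogue of cli/sti: a Xen guest masks upcalls by storing a byte into its shared vcpu_info. A C-level sketch of the same operations, assuming this tree's current_vcpu_info() and force_evtchn_callback() helpers:

    static inline void xen_irq_disable_sketch(void)
    {
            current_vcpu_info()->evtchn_upcall_mask = 1;
            barrier();      /* keep the critical section below the store */
    }

    static inline void xen_irq_enable_sketch(void)
    {
            vcpu_info_t *vcpu = current_vcpu_info();

            barrier();
            vcpu->evtchn_upcall_mask = 0;
            barrier();      /* unmask before checking for pending events */
            if (unlikely(vcpu->evtchn_upcall_pending))
                    force_evtchn_callback();
    }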
*/ ; \ + movl PT_ESI(%esp), %esi ; \ + sysexit ; \ +14: __DISABLE_INTERRUPTS ; \ + TRACE_IRQS_OFF ; \ +sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ ; \ + mov $__KERNEL_PDA, %ecx ; \ + push %esp ; \ + mov %ecx, %gs ; \ + call evtchn_do_upcall ; \ + add $4,%esp ; \ + jmp ret_from_intr +#define INTERRUPT_RETURN iret +#endif /* __ASSEMBLY__ */ + +#ifndef __ASSEMBLY__ +#define raw_local_save_flags(flags) \ + do { (flags) = __raw_local_save_flags(); } while (0) + #define raw_local_irq_save(flags) \ do { (flags) = __raw_local_irq_save(); } while (0) +static inline int raw_irqs_disabled_flags(unsigned long flags) +{ + return (flags != 0); +} + +#define raw_irqs_disabled() \ +({ \ + unsigned long flags = __raw_local_save_flags(); \ + \ + raw_irqs_disabled_flags(flags); \ +}) #endif /* __ASSEMBLY__ */ /* --- head-2011-02-17.orig/arch/x86/include/mach-xen/asm/mmu_context_32.h 2007-06-12 13:14:02.000000000 +0200 +++ head-2011-02-17/arch/x86/include/mach-xen/asm/mmu_context_32.h 2011-01-31 17:32:16.000000000 +0100 @@ -27,14 +27,13 @@ static inline void enter_lazy_tlb(struct static inline void __prepare_arch_switch(void) { /* - * Save away %fs and %gs. No need to save %es and %ds, as those - * are always kernel segments while inside the kernel. Must - * happen before reload of cr3/ldt (i.e., not in __switch_to). + * Save away %fs. No need to save %gs, as it was saved on the + * stack on entry. No need to save %es and %ds, as those are + * always kernel segments while inside the kernel. */ - asm volatile ( "mov %%fs,%0 ; mov %%gs,%1" - : "=m" (current->thread.fs), - "=m" (current->thread.gs)); - asm volatile ( "movl %0,%%fs ; movl %0,%%gs" + asm volatile ( "mov %%fs,%0" + : "=m" (current->thread.fs)); + asm volatile ( "movl %0,%%fs" : : "r" (0) ); } @@ -89,14 +88,14 @@ static inline void switch_mm(struct mm_s * tlb flush IPI delivery. We must reload %cr3. */ load_cr3(next->pgd); - load_LDT_nolock(&next->context, cpu); + load_LDT_nolock(&next->context); } } #endif } -#define deactivate_mm(tsk, mm) \ - asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0)) +#define deactivate_mm(tsk, mm) \ + asm("movl %0,%%fs": :"r" (0)); static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next) { --- head-2011-02-17.orig/arch/x86/include/mach-xen/asm/pgtable-3level.h 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-02-17/arch/x86/include/mach-xen/asm/pgtable-3level.h 2011-01-31 17:32:16.000000000 +0100 @@ -1,8 +1,6 @@ #ifndef _I386_PGTABLE_3LEVEL_H #define _I386_PGTABLE_3LEVEL_H -#include - /* * Intel Physical Address Extension (PAE) Mode - three-level page * tables on PPro+ CPUs. @@ -75,6 +73,23 @@ static inline void set_pte(pte_t *ptep, xen_l3_entry_update((pudptr), (pudval)) /* + * For PTEs and PDEs, we must clear the P-bit first when clearing a page table + * entry, so clear the bottom half first and enforce ordering with a compiler + * barrier. + */ +static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + if ((mm != current->mm && mm != &init_mm) + || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) { + ptep->pte_low = 0; + smp_wmb(); + ptep->pte_high = 0; + } +} + +#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) + +/* * Pentium-II erratum A13: in PAE mode we explicitly have to flush * the TLB via cr3 if the top-level pgd is changed... 
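The irqflags_32.h hunks above swap the native EFLAGS-based cli/sti handling for a per-vCPU mask byte in the Xen shared info page: "interrupts disabled" simply means evtchn_upcall_mask != 0, and re-enabling must re-check evtchn_upcall_pending. A stand-alone sketch of that protocol, for illustration only (the struct layout below is simplified and is not the real shared_info ABI; the vcpu variable stands in for current_vcpu_info()):

#include <stdio.h>

struct vcpu_info {
	unsigned char evtchn_upcall_pending;
	unsigned char evtchn_upcall_mask;
};

static struct vcpu_info vcpu;		/* models current_vcpu_info() */

static unsigned long save_flags(void)
{
	return vcpu.evtchn_upcall_mask;	/* "flags" is the mask byte, not EFLAGS */
}

static void irq_disable(void)
{
	vcpu.evtchn_upcall_mask = 1;	/* the whole "cli": one store, no trap */
}

static void irq_enable(void)
{
	vcpu.evtchn_upcall_mask = 0;	/* "sti" */
	if (vcpu.evtchn_upcall_pending)	/* __TEST_PENDING in the asm macros */
		printf("would call evtchn_do_upcall()\n");
}

int main(void)
{
	irq_disable();
	vcpu.evtchn_upcall_pending = 1;	/* event arrives while masked */
	printf("irqs disabled: %lu\n", save_flags());
	irq_enable();			/* pending event is noticed here */
	return 0;
}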
--- head-2011-02-17.orig/arch/x86/include/mach-xen/asm/pgtable-3level.h	2011-01-31 17:29:16.000000000 +0100
+++ head-2011-02-17/arch/x86/include/mach-xen/asm/pgtable-3level.h	2011-01-31 17:32:16.000000000 +0100
@@ -1,8 +1,6 @@
 #ifndef _I386_PGTABLE_3LEVEL_H
 #define _I386_PGTABLE_3LEVEL_H

-#include 
-
 /*
  * Intel Physical Address Extension (PAE) Mode - three-level page
  * tables on PPro+ CPUs.
@@ -75,6 +73,23 @@ static inline void set_pte(pte_t *ptep,
 	xen_l3_entry_update((pudptr), (pudval))

 /*
+ * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
+ * entry, so clear the bottom half first and enforce ordering with a compiler
+ * barrier.
+ */
+static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+	if ((mm != current->mm && mm != &init_mm)
+	    || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
+		ptep->pte_low = 0;
+		smp_wmb();
+		ptep->pte_high = 0;
+	}
+}
+
+#define pmd_clear(xp)	do { set_pmd(xp, __pmd(0)); } while (0)
+
+/*
  * Pentium-II erratum A13: in PAE mode we explicitly have to flush
  * the TLB via cr3 if the top-level pgd is changed...
  * We do not let the generic code free and clear pgd entries due to
@@ -93,45 +108,16 @@ static inline void pud_clear (pud_t * pu
 #define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
 			pmd_index(address))

-static inline int pte_none(pte_t pte)
-{
-	return !(pte.pte_low | pte.pte_high);
-}
-
-/*
- * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
- * entry, so clear the bottom half first and enforce ordering with a compiler
- * barrier.
- */
-static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+static inline pte_t raw_ptep_get_and_clear(pte_t *ptep, pte_t res)
 {
-	if ((mm != current->mm && mm != &init_mm)
-	    || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
-		ptep->pte_low = 0;
-		smp_wmb();
+	uint64_t val = __pte_val(res);
+	if (__cmpxchg64(ptep, val, 0) != val) {
+		/* xchg acts as a barrier before the setting of the high bits */
+		res.pte_low = xchg(&ptep->pte_low, 0);
+		res.pte_high = ptep->pte_high;
 		ptep->pte_high = 0;
 	}
-}
-
-#define pmd_clear(xp)	do { set_pmd(xp, __pmd(0)); } while (0)
-
-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
-static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
-	pte_t pte = *ptep;
-	if (!pte_none(pte)) {
-		if ((mm != &init_mm) ||
-		    HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
-			uint64_t val = __pte_val(pte);
-			if (__cmpxchg64(ptep, val, 0) != val) {
-				/* xchg acts as a barrier before the setting of the high bits */
-				pte.pte_low = xchg(&ptep->pte_low, 0);
-				pte.pte_high = ptep->pte_high;
-				ptep->pte_high = 0;
-			}
-		}
-	}
-	return pte;
+	return res;
 }

 #define __HAVE_ARCH_PTEP_CLEAR_FLUSH
@@ -160,6 +146,11 @@ static inline int pte_same(pte_t a, pte_

 #define pte_page(x)	pfn_to_page(pte_pfn(x))

+static inline int pte_none(pte_t pte)
+{
+	return !(pte.pte_low | pte.pte_high);
+}
+
 #define __pte_mfn(_pte)	(((_pte).pte_low >> PAGE_SHIFT) | \
 			 ((_pte).pte_high << (32-PAGE_SHIFT)))
 #define pte_mfn(_pte)	((_pte).pte_low & _PAGE_PRESENT ? \
--- head-2011-02-17.orig/arch/x86/include/mach-xen/asm/pgtable_32.h	2011-02-07 15:37:16.000000000 +0100
+++ head-2011-02-17/arch/x86/include/mach-xen/asm/pgtable_32.h	2011-01-31 17:32:16.000000000 +0100
@@ -38,14 +38,14 @@ struct vm_area_struct;
 #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
 extern unsigned long empty_zero_page[1024];
 extern pgd_t *swapper_pg_dir;
-extern kmem_cache_t *pgd_cache;
-extern kmem_cache_t *pmd_cache;
+extern struct kmem_cache *pgd_cache;
+extern struct kmem_cache *pmd_cache;
 extern spinlock_t pgd_lock;
 extern struct page *pgd_list;

-void pmd_ctor(void *, kmem_cache_t *, unsigned long);
-void pgd_ctor(void *, kmem_cache_t *, unsigned long);
-void pgd_dtor(void *, kmem_cache_t *, unsigned long);
+void pmd_ctor(void *, struct kmem_cache *, unsigned long);
+void pgd_ctor(void *, struct kmem_cache *, unsigned long);
+void pgd_dtor(void *, struct kmem_cache *, unsigned long);
 void pgtable_cache_init(void);
 void paging_init(void);

@@ -276,7 +276,6 @@ static inline pte_t pte_mkhuge(pte_t pte
 #define pte_update(mm, addr, ptep)		do { } while (0)
 #define pte_update_defer(mm, addr, ptep)	do { } while (0)

-
 /*
  * We only update the dirty/accessed state if we set
  * the dirty bit by hand in the kernel, since the hardware
@@ -342,6 +341,19 @@ do {									\
 	__young;							\
 })

+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
+static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+	pte_t pte = *ptep;
+	if (!pte_none(pte)
+	    && (mm != &init_mm
+		|| HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) {
+		pte = raw_ptep_get_and_clear(ptep, pte);
+		pte_update(mm, addr, ptep);
+	}
+	return pte;
+}
+
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
 #define ptep_get_and_clear_full(mm, addr, ptep, full)			\
 	((full) ? ({							\
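The pgtable-3level.h and pgtable_32.h hunks above split the old ptep_get_and_clear() into a generic wrapper plus raw_ptep_get_and_clear(), and move pte_clear() next to the rule it implements: on PAE, the half carrying the present bit (pte_low) must be cleared before pte_high, with a barrier in between, so a concurrent walker never sees present=1 paired with a stale high word. A minimal userspace model of just that ordering rule (pte_t reduced to two 32-bit halves; __sync_synchronize() standing in for smp_wmb()):

#include <stdint.h>
#include <stdio.h>

typedef struct { uint32_t pte_low, pte_high; } pte_t;

#define smp_wmb() __sync_synchronize()	/* stand-in for the kernel barrier */

static void pte_clear(pte_t *ptep)
{
	ptep->pte_low = 0;	/* present bit (bit 0 of pte_low) goes first */
	smp_wmb();		/* keep the stores from being reordered */
	ptep->pte_high = 0;	/* high half only after present is gone */
}

int main(void)
{
	pte_t pte = { .pte_low = 0x1025, .pte_high = 0x1 };

	pte_clear(&pte);
	printf("pte now %08x:%08x\n", pte.pte_high, pte.pte_low);
	return 0;
}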
--- head-2011-02-17.orig/arch/x86/include/mach-xen/asm/processor_32.h	2011-01-31 17:29:16.000000000 +0100
+++ head-2011-02-17/arch/x86/include/mach-xen/asm/processor_32.h	2011-01-31 17:32:16.000000000 +0100
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 

 /* flag for disabling the tsc */
@@ -73,6 +74,7 @@ struct cpuinfo_x86 {
 #endif
 	unsigned char x86_max_cores;	/* cpuid returned max cores value */
 	unsigned char apicid;
+	unsigned short x86_clflush_size;
 #ifdef CONFIG_SMP
 	unsigned char booted_cores;	/* number of cores as seen by OS */
 	__u8 phys_proc_id; 		/* Physical processor id. */
@@ -114,6 +116,8 @@ extern struct cpuinfo_x86 cpu_data[];
 extern	int cpu_llc_id[NR_CPUS];
 extern	char ignore_fpu_irq;

+void __init cpu_detect(struct cpuinfo_x86 *c);
+
 extern void identify_cpu(struct cpuinfo_x86 *);
 extern void print_cpu_info(struct cpuinfo_x86 *);
 extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
@@ -146,8 +150,8 @@ static inline void detect_ht(struct cpui
 #define X86_EFLAGS_VIP	0x00100000 /* Virtual Interrupt Pending */
 #define X86_EFLAGS_ID	0x00200000 /* CPUID detection flag */

-static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
-			   unsigned int *ecx, unsigned int *edx)
+static inline fastcall void xen_cpuid(unsigned int *eax, unsigned int *ebx,
+				      unsigned int *ecx, unsigned int *edx)
 {
 	/* ecx is often an input as well as an output. */
 	__asm__(XEN_CPUID
@@ -158,59 +162,6 @@ static inline void cpuid(unsigned int
 		: "0" (*eax), "2" (*ecx));
 }

-/*
- * Generic CPUID function
- * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
- * resulting in stale register contents being returned.
- */
-static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx)
-{
-	*eax = op;
-	*ecx = 0;
-	__cpuid(eax, ebx, ecx, edx);
-}
-
-/* Some CPUID calls want 'count' to be placed in ecx */
-static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
-			       int *edx)
-{
-	*eax = op;
-	*ecx = count;
-	__cpuid(eax, ebx, ecx, edx);
-}
-
-/*
- * CPUID functions returning a single datum
- */
-static inline unsigned int cpuid_eax(unsigned int op)
-{
-	unsigned int eax, ebx, ecx, edx;
-
-	cpuid(op, &eax, &ebx, &ecx, &edx);
-	return eax;
-}
-static inline unsigned int cpuid_ebx(unsigned int op)
-{
-	unsigned int eax, ebx, ecx, edx;
-
-	cpuid(op, &eax, &ebx, &ecx, &edx);
-	return ebx;
-}
-static inline unsigned int cpuid_ecx(unsigned int op)
-{
-	unsigned int eax, ebx, ecx, edx;
-
-	cpuid(op, &eax, &ebx, &ecx, &edx);
-	return ecx;
-}
-static inline unsigned int cpuid_edx(unsigned int op)
-{
-	unsigned int eax, ebx, ecx, edx;
-
-	cpuid(op, &eax, &ebx, &ecx, &edx);
-	return edx;
-}
-
 #define load_cr3(pgdir) write_cr3(__pa(pgdir))

 /*
@@ -480,9 +431,9 @@ struct thread_struct {
 	.vm86_info = NULL,						\
 	.sysenter_cs = __KERNEL_CS,					\
 	.io_bitmap_ptr = NULL,						\
+	.gs = __KERNEL_PDA,						\
 }

-#ifndef CONFIG_X86_NO_TSS
 /*
  * Note that the .io_bitmap member must be extra-big. This is because
  * the CPU will access an additional byte beyond the end of the IO
@@ -497,26 +448,9 @@ struct thread_struct {
 	.io_bitmap	= { [ 0 ... IO_BITMAP_LONGS] = ~0 },		\
 }

-static inline void __load_esp0(struct tss_struct *tss, struct thread_struct *thread)
-{
-	tss->esp0 = thread->esp0;
-	/* This can only happen when SEP is enabled, no need to test "SEP"arately */
-	if (unlikely(tss->ss1 != thread->sysenter_cs)) {
-		tss->ss1 = thread->sysenter_cs;
-		wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
-	}
-}
-#define load_esp0(tss, thread) \
-	__load_esp0(tss, thread)
-#else
-#define load_esp0(tss, thread) do { \
-	if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)) \
-		BUG(); \
-} while (0)
-#endif
-
 #define start_thread(regs, new_eip, new_esp) do {		\
-	__asm__("movl %0,%%fs ; movl %0,%%gs": :"r" (0));	\
+	__asm__("movl %0,%%fs": :"r" (0));			\
+	regs->xgs = 0;						\
 	set_fs(USER_DS);					\
 	regs->xds = __USER_DS;					\
 	regs->xes = __USER_DS;					\
@@ -526,26 +460,6 @@ static inline void __load_esp0(struct ts
 	regs->esp = new_esp;					\
 } while (0)

-/*
- * These special macros can be used to get or set a debugging register
- */
-#define get_debugreg(var, register)				\
-	(var) = HYPERVISOR_get_debugreg((register))
-#define set_debugreg(value, register)			\
-	WARN_ON(HYPERVISOR_set_debugreg((register), (value)))
-
-/*
- * Set IOPL bits in EFLAGS from given mask
- */
-static inline void set_iopl_mask(unsigned mask)
-{
-	struct physdev_set_iopl set_iopl;
-
-	/* Force the change at ring 0. */
-	set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
-	WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
-}
-
 /* Forward declaration, a strange C thing */
 struct task_struct;
 struct mm_struct;
@@ -637,6 +551,105 @@ static inline void rep_nop(void)

 #define cpu_relax()	rep_nop()

+#define paravirt_enabled() 1
+#define __cpuid xen_cpuid
+
+#ifndef CONFIG_X86_NO_TSS
+static inline void __load_esp0(struct tss_struct *tss, struct thread_struct *thread)
+{
+	tss->esp0 = thread->esp0;
+	/* This can only happen when SEP is enabled, no need to test "SEP"arately */
+	if (unlikely(tss->ss1 != thread->sysenter_cs)) {
+		tss->ss1 = thread->sysenter_cs;
+		wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
+	}
+}
+#define load_esp0(tss, thread) \
+	__load_esp0(tss, thread)
+#else
+#define load_esp0(tss, thread) do { \
+	if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)) \
+		BUG(); \
+} while (0)
+#endif
+
+
+/*
+ * These special macros can be used to get or set a debugging register
+ */
+#define get_debugreg(var, register)				\
+	(var) = HYPERVISOR_get_debugreg(register)
+#define set_debugreg(value, register)			\
+	WARN_ON(HYPERVISOR_set_debugreg(register, value))
+
+#define set_iopl_mask xen_set_iopl_mask
+
+/*
+ * Set IOPL bits in EFLAGS from given mask
+ */
+static inline void xen_set_iopl_mask(unsigned mask)
+{
+	struct physdev_set_iopl set_iopl;
+
+	/* Force the change at ring 0. */
+	set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
+	WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
+}
+
+
+/*
+ * Generic CPUID function
+ * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
+ * resulting in stale register contents being returned.
+ */
+static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx)
+{
+	*eax = op;
+	*ecx = 0;
+	__cpuid(eax, ebx, ecx, edx);
+}
+
+/* Some CPUID calls want 'count' to be placed in ecx */
+static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
+			       int *edx)
+{
+	*eax = op;
+	*ecx = count;
+	__cpuid(eax, ebx, ecx, edx);
+}
+
+/*
+ * CPUID functions returning a single datum
+ */
+static inline unsigned int cpuid_eax(unsigned int op)
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	cpuid(op, &eax, &ebx, &ecx, &edx);
+	return eax;
+}
+static inline unsigned int cpuid_ebx(unsigned int op)
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	cpuid(op, &eax, &ebx, &ecx, &edx);
+	return ebx;
+}
+static inline unsigned int cpuid_ecx(unsigned int op)
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	cpuid(op, &eax, &ebx, &ecx, &edx);
+	return ecx;
+}
+static inline unsigned int cpuid_edx(unsigned int op)
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	cpuid(op, &eax, &ebx, &ecx, &edx);
+	return edx;
+}
+
 /* generic versions from gas */
 #define GENERIC_NOP1	".byte 0x90\n"
 #define GENERIC_NOP2	".byte 0x89,0xf6\n"
@@ -736,4 +749,8 @@ extern unsigned long boot_option_idle_ov
 extern void enable_sep_cpu(void);
 extern int sysenter_setup(void);

+extern int init_gdt(int cpu, struct task_struct *idle);
+extern void cpu_set_gdt(int);
+extern void secondary_cpu_init(void);
+
 #endif /* __ASM_I386_PROCESSOR_H */
--- head-2011-02-17.orig/arch/x86/include/mach-xen/asm/smp_32.h	2011-01-31 17:29:16.000000000 +0100
+++ head-2011-02-17/arch/x86/include/mach-xen/asm/smp_32.h	2011-01-31 17:32:16.000000000 +0100
@@ -8,6 +8,7 @@
 #include <linux/kernel.h>
 #include <linux/threads.h>
 #include <linux/cpumask.h>
+#include <asm/pda.h>
 #endif

 #ifdef CONFIG_X86_LOCAL_APIC
@@ -56,7 +57,7 @@ extern void cpu_uninit(void);
  * from the initial startup.  We map APIC_BASE very early in page_setup(),
  * so this is correct in the x86 case.
  */
-#define raw_smp_processor_id() (current_thread_info()->cpu)
+#define raw_smp_processor_id() (read_pda(cpu_number))

 extern cpumask_t cpu_possible_map;
 #define cpu_callin_map cpu_possible_map
--- head-2011-02-17.orig/arch/x86/include/mach-xen/asm/system_32.h	2011-01-31 17:29:16.000000000 +0100
+++ head-2011-02-17/arch/x86/include/mach-xen/asm/system_32.h	2011-01-31 17:32:16.000000000 +0100
@@ -139,17 +139,17 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t"
 #define write_cr4(x) \
 	__asm__ __volatile__("movl %0,%%cr4": :"r" (x))

-/*
- * Clear and set 'TS' bit respectively
- */
+#define wbinvd() \
+	__asm__ __volatile__ ("wbinvd": : :"memory")
+
+/* Clear the 'TS' bit */
 #define clts() (HYPERVISOR_fpu_taskswitch(0))
+
+/* Set the 'TS' bit */
 #define stts() (HYPERVISOR_fpu_taskswitch(1))

 #endif	/* __KERNEL__ */

-#define wbinvd() \
-	__asm__ __volatile__ ("wbinvd": : :"memory")
-
 static inline unsigned long get_limit(unsigned long segment)
 {
 	unsigned long __limit;
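The processor_32.h hunk above keeps the generic cpuid() helpers but moves them below the point where __cpuid is aliased to xen_cpuid, so they expand to the hypervisor-safe XEN_CPUID sequence. On native hardware the same helpers boil down to the following user-mode sketch (x86 only; the raw instruction stands where XEN_CPUID would be):

#include <stdio.h>
#include <string.h>

static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx,
			 unsigned int *ecx, unsigned int *edx)
{
	*eax = op;
	*ecx = 0;	/* Cyrix MII workaround, per the comment above */
	__asm__("cpuid"
		: "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
		: "0" (*eax), "2" (*ecx));
}

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	char vendor[13];

	cpuid(0, &eax, &ebx, &ecx, &edx);	/* leaf 0: max leaf + vendor */
	memcpy(vendor + 0, &ebx, 4);
	memcpy(vendor + 4, &edx, 4);
	memcpy(vendor + 8, &ecx, 4);
	vendor[12] = '\0';
	printf("max basic leaf %u, vendor \"%s\"\n", eax, vendor);
	return 0;
}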
--- head-2011-02-17.orig/arch/x86/include/mach-xen/asm/desc_64.h	2008-01-28 12:24:19.000000000 +0100
+++ head-2011-02-17/arch/x86/include/mach-xen/asm/desc_64.h	2011-01-31 17:32:16.000000000 +0100
@@ -9,62 +9,11 @@
 #include <linux/string.h>
 #include <linux/smp.h>
+#include <asm/desc_defs.h>
 #include <asm/segment.h>
 #include <asm/mmu.h>

-// 8 byte segment descriptor
-struct desc_struct {
-	u16 limit0;
-	u16 base0;
-	unsigned base1 : 8, type : 4, s : 1, dpl : 2, p : 1;
-	unsigned limit : 4, avl : 1, l : 1, d : 1, g : 1, base2 : 8;
-} __attribute__((packed));
-
-struct n_desc_struct {
-	unsigned int a,b;
-};
-
-enum {
-	GATE_INTERRUPT = 0xE,
-	GATE_TRAP = 0xF,
-	GATE_CALL = 0xC,
-};
-
-// 16byte gate
-struct gate_struct {
-	u16 offset_low;
-	u16 segment;
-	unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1;
-	u16 offset_middle;
-	u32 offset_high;
-	u32 zero1;
-} __attribute__((packed));
-
-#define PTR_LOW(x) ((unsigned long)(x) & 0xFFFF)
-#define PTR_MIDDLE(x) (((unsigned long)(x) >> 16) & 0xFFFF)
-#define PTR_HIGH(x) ((unsigned long)(x) >> 32)
-
-enum {
-	DESC_TSS = 0x9,
-	DESC_LDT = 0x2,
-};
-
-// LDT or TSS descriptor in the GDT. 16 bytes.
-struct ldttss_desc {
-	u16 limit0;
-	u16 base0;
-	unsigned base1 : 8, type : 5, dpl : 2, p : 1;
-	unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
-	u32 base3;
-	u32 zero1;
-} __attribute__((packed));
-
-struct desc_ptr {
-	unsigned short size;
-	unsigned long address;
-} __attribute__((packed)) ;
-
 extern struct desc_ptr idt_descr, cpu_gdt_descr[NR_CPUS];

 extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
--- head-2011-02-17.orig/arch/x86/include/mach-xen/asm/pgtable_64.h	2011-02-07 15:37:14.000000000 +0100
+++ head-2011-02-17/arch/x86/include/mach-xen/asm/pgtable_64.h	2011-01-31 17:32:16.000000000 +0100
@@ -237,19 +237,18 @@ extern unsigned int __kernel_page_user;

 static inline unsigned long pgd_bad(pgd_t pgd)
 {
-	unsigned long val = __pgd_val(pgd);
-	val &= ~PTE_MASK;
-	val &= ~(_PAGE_USER | _PAGE_DIRTY);
-	return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED);
+	return __pgd_val(pgd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
 }

-static inline unsigned long pud_bad(pud_t pud)
-{
-	unsigned long val = __pud_val(pud);
-	val &= ~PTE_MASK;
-	val &= ~(_PAGE_USER | _PAGE_DIRTY);
-	return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED);
-}
+static inline unsigned long pud_bad(pud_t pud)
+{
+	return __pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
+}
+
+static inline unsigned long pmd_bad(pmd_t pmd)
+{
+	return __pmd_val(pmd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
+}

 #define set_pte_at(_mm,addr,ptep,pteval) do {				\
 	if (((_mm) != current->mm && (_mm) != &init_mm) ||		\
@@ -404,8 +403,6 @@ static inline int pmd_large(pmd_t pte) {
 #define pmd_present(x)	(__pmd_val(x) & _PAGE_PRESENT)
 #endif
 #define pmd_clear(xp)	do { set_pmd(xp, __pmd(0)); } while (0)
-#define	pmd_bad(x) ((__pmd_val(x) & ~(PTE_MASK | _PAGE_USER | _PAGE_PRESENT)) \
-		    != (_KERNPG_TABLE & ~(_PAGE_USER | _PAGE_PRESENT)))
 #define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
 #define pmd_pfn(x)  ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)

--- head-2011-02-17.orig/arch/x86/include/mach-xen/asm/processor_64.h	2011-01-31 17:29:16.000000000 +0100
+++ head-2011-02-17/arch/x86/include/mach-xen/asm/processor_64.h	2011-01-31 17:32:16.000000000 +0100
@@ -484,6 +484,14 @@ static inline void __mwait(unsigned long
 		: :"a" (eax), "c" (ecx));
 }

+static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
+{
+	/* "mwait %eax,%ecx;" */
+	asm volatile(
+		"sti; .byte 0x0f,0x01,0xc9;"
+		: :"a" (eax), "c" (ecx));
+}
+
 extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);

 #define stack_current() \
--- head-2011-02-17.orig/arch/x86/include/mach-xen/asm/smp_64.h	2011-01-31 17:29:16.000000000 +0100
+++ head-2011-02-17/arch/x86/include/mach-xen/asm/smp_64.h	2011-01-31 17:32:16.000000000 +0100
@@ -88,11 +88,6 @@ extern u8 x86_cpu_to_log_apicid[NR_CPUS]
 extern u8 bios_cpu_apicid[];

 #ifdef CONFIG_X86_LOCAL_APIC
-static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
-{
-	return cpus_addr(cpumask)[0];
-}
-
 static inline int cpu_present_to_apicid(int mps_cpu)
 {
 	if (mps_cpu < NR_CPUS)
@@ -127,13 +122,6 @@ static __inline int logical_smp_processo
 #define cpu_physical_id(cpu)	x86_cpu_to_apicid[cpu]
 #else
 #define cpu_physical_id(cpu)	boot_cpu_id
-static inline int smp_call_function_single(int cpuid, void (*func) (void *info),
-					   void *info, int retry, int wait)
-{
-	/* Disable interrupts here? */
-	func(info);
-	return 0;
-}
 #endif /* !CONFIG_SMP */
 #endif
--- head-2011-02-17.orig/include/xen/net-util.h	2011-02-09 15:49:42.000000000 +0100
+++ head-2011-02-17/include/xen/net-util.h	2011-02-09 15:50:19.000000000 +0100
@@ -39,12 +39,12 @@ static inline int skb_checksum_setup(str
 	switch (iph->protocol) {
 	case IPPROTO_TCP:
-		skb->csum = offsetof(struct tcphdr, check);
+		skb->csum_offset = offsetof(struct tcphdr, check);
 		if (csum)
 			csum = &skb->h.th->check;
 		break;
 	case IPPROTO_UDP:
-		skb->csum = offsetof(struct udphdr, check);
+		skb->csum_offset = offsetof(struct udphdr, check);
 		if (csum)
 			csum = &skb->h.uh->check;
 		break;
@@ -56,7 +56,7 @@ static inline int skb_checksum_setup(str
 		goto out;
 	}

-	if ((skb->h.raw + skb->csum + sizeof(*csum)) > skb->tail)
+	if ((skb->h.raw + skb->csum_offset + sizeof(*csum)) > skb->tail)
 		goto out;

 	if (csum) {
--- head-2011-02-17.orig/kernel/kexec.c	2011-01-31 17:01:49.000000000 +0100
+++ head-2011-02-17/kernel/kexec.c	2011-01-31 17:32:16.000000000 +0100
@@ -371,7 +371,7 @@ static struct page *kimage_alloc_pages(g
 	if (limit == ~0UL)
 		address_bits = BITS_PER_LONG;
 	else
-		address_bits = long_log2(limit);
+		address_bits = ilog2(limit);

 	if (xen_limit_pages_to_max_mfn(pages, order, address_bits) < 0) {
 		__free_pages(pages, order);
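The net-util.h hunk above follows the 2.6.20 rename of the "offset of the checksum field within the L4 header" from skb->csum to skb->csum_offset; the stored value itself is unchanged, a plain offsetof() into the protocol header. A small sketch of the values involved (userspace, glibc's netinet headers assumed):

#include <stdio.h>
#include <stddef.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>

int main(void)
{
	/* the values stored into skb->csum_offset above */
	printf("tcp check offset: %zu\n", offsetof(struct tcphdr, check));
	printf("udp check offset: %zu\n", offsetof(struct udphdr, check));
	return 0;
}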