From: kernel.org Subject: 2.6.26 Patch-mainline: 2.6.26 Acked-by: Jeff Mahoney <jeffm@suse.com> Automatically created from "patches.kernel.org/patch-2.6.26" by xen-port-patches.py 2.6.35: arch/x86/include/asm/scatterlist.h change removed (would need to be reverted there) --- head-2011-03-11.orig/arch/x86/Kconfig 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/Kconfig 2011-01-31 18:07:35.000000000 +0100 @@ -41,7 +41,7 @@ config X86 select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE select HAVE_SYSCALL_TRACEPOINTS select HAVE_KVM if !XEN - select HAVE_ARCH_KGDB + select HAVE_ARCH_KGDB if !XEN select HAVE_ARCH_TRACEHOOK select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_EFFICIENT_UNALIGNED_ACCESS @@ -609,6 +609,7 @@ config NO_BOOTMEM config MEMTEST bool "Memtest" + depends on !XEN ---help--- This option adds a kernel parameter 'memtest', which allows memtest to be set. @@ -1193,7 +1194,7 @@ config ARCH_DMA_ADDR_T_64BIT config DIRECT_GBPAGES bool "Enable 1GB pages for kernel pagetables" if EXPERT default y - depends on X86_64 + depends on X86_64 && !XEN ---help--- Allow the kernel linear mapping to use 1GB pages on CPUs that support it. This can improve the kernel's performance a tiny bit by @@ -2267,6 +2268,4 @@ source "crypto/Kconfig" source "arch/x86/kvm/Kconfig" -source "drivers/xen/Kconfig" - source "lib/Kconfig" --- head-2011-03-11.orig/arch/x86/ia32/ia32entry-xen.S 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/ia32/ia32entry-xen.S 2011-01-31 18:07:35.000000000 +0100 @@ -129,12 +129,14 @@ sysenter_tracesys: SAVE_REST CLEAR_RREGS movq %r9,R9(%rsp) - movq $-ENOSYS,RAX(%rsp) /* really needed? */ + movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ RESTORE_REST xchgl %ebp,%r9d + cmpl $(IA32_NR_syscalls-1),%eax + ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ jmp sysenter_do_call CFI_ENDPROC ENDPROC(ia32_sysenter_target) @@ -200,13 +202,15 @@ cstar_tracesys: SAVE_REST CLEAR_RREGS movq %r9,R9(%rsp) - movq $-ENOSYS,RAX(%rsp) /* really needed? */ + movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ RESTORE_REST xchgl %ebp,%r9d movl RSP-ARGOFFSET(%rsp), %r8d + cmpl $(IA32_NR_syscalls-1),%eax + ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ jmp cstar_do_call END(ia32_cstar_target) @@ -264,7 +268,7 @@ ENTRY(ia32_syscall) jnz ia32_tracesys ia32_do_syscall: cmpl $(IA32_NR_syscalls-1),%eax - ja ia32_badsys + ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */ IA32_ARG_FIXUP call *ia32_sys_call_table(,%rax,8) # xxx: rip relative ia32_sysret: @@ -274,7 +278,7 @@ ia32_sysret: ia32_tracesys: SAVE_REST CLEAR_RREGS - movq $-ENOSYS,RAX(%rsp) /* really needed? 
*/ + movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ @@ -365,7 +369,7 @@ ia32_sys_call_table: .quad sys_setuid16 .quad sys_getuid16 .quad compat_sys_stime /* stime */ /* 25 */ - .quad sys32_ptrace /* ptrace */ + .quad compat_sys_ptrace /* ptrace */ .quad sys_alarm .quad sys_fstat /* (old)fstat */ .quad sys_pause --- head-2011-03-11.orig/arch/x86/kernel/Makefile 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/Makefile 2011-01-31 18:07:35.000000000 +0100 @@ -127,8 +127,7 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_XEN) += nmi_64.o time_64-$(CONFIG_XEN) += time_32.o - pci-dma_64-$(CONFIG_XEN) += pci-dma_32.o endif -disabled-obj-$(CONFIG_XEN) := early-quirks.o hpet.o i8253.o i8259_$(BITS).o reboot.o \ - smpboot_$(BITS).o tsc_$(BITS).o tsc_sync.o +disabled-obj-$(CONFIG_XEN) := crash.o early-quirks.o hpet.o i8253.o i8259_$(BITS).o \ + pci-swiotlb_64.o reboot.o smpboot.o tlb_$(BITS).o tsc_$(BITS).o tsc_sync.o vsmp_64.o --- head-2011-03-11.orig/arch/x86/kernel/acpi/Makefile 2011-01-31 14:53:50.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/acpi/Makefile 2011-01-31 18:07:35.000000000 +0100 @@ -15,4 +15,4 @@ $(obj)/wakeup_rm.o: $(obj)/realmode/w $(obj)/realmode/wakeup.bin: FORCE $(Q)$(MAKE) $(build)=$(obj)/realmode -disabled-obj-$(CONFIG_XEN) := cstate.o wakeup_$(BITS).o +disabled-obj-$(CONFIG_XEN) := cstate.o wakeup_%.o --- head-2011-03-11.orig/arch/x86/kernel/acpi/boot.c 2011-03-11 10:56:21.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/acpi/boot.c 2011-03-11 10:59:02.000000000 +0100 @@ -206,6 +206,7 @@ static int __init acpi_parse_madt(struct static void __cpuinit acpi_register_lapic(int id, u8 enabled) { +#ifndef CONFIG_XEN unsigned int ver = 0; if (id >= (MAX_LOCAL_APIC-1)) { @@ -222,6 +223,7 @@ static void __cpuinit acpi_register_lapi ver = apic_version[boot_cpu_physical_apicid]; generic_processor_info(id, ver); +#endif } static int __init --- head-2011-03-11.orig/arch/x86/kernel/acpi/sleep-xen.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/acpi/sleep-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -10,15 +10,19 @@ #include <linux/dmi.h> #include <linux/cpumask.h> -#include <asm/smp.h> +#include "realmode/wakeup.h" +#include "sleep.h" #ifndef CONFIG_ACPI_PV_SLEEP -/* address in low memory of the wakeup routine. */ -unsigned long acpi_wakeup_address = 0; +unsigned long acpi_wakeup_address; unsigned long acpi_realmode_flags; -extern char wakeup_start, wakeup_end; -extern unsigned long acpi_copy_wakeup_routine(unsigned long); +/* address in low memory of the wakeup routine. */ +static unsigned long acpi_realmode; + +#ifdef CONFIG_64BIT +static char temp_stack[10240]; +#endif #endif /** @@ -26,17 +30,69 @@ extern unsigned long acpi_copy_wakeup_ro * * Create an identity mapped page table and copy the wakeup routine to * low memory. + * + * Note that this is too late to change acpi_wakeup_address. 
*/ int acpi_save_state_mem(void) { #ifndef CONFIG_ACPI_PV_SLEEP - if (!acpi_wakeup_address) { - printk(KERN_ERR "Could not allocate memory during boot, S3 disabled\n"); + struct wakeup_header *header; + + if (!acpi_realmode) { + printk(KERN_ERR "Could not allocate memory during boot, " + "S3 disabled\n"); return -ENOMEM; } - memcpy((void *)acpi_wakeup_address, &wakeup_start, - &wakeup_end - &wakeup_start); - acpi_copy_wakeup_routine(acpi_wakeup_address); + memcpy((void *)acpi_realmode, &wakeup_code_start, WAKEUP_SIZE); + + header = (struct wakeup_header *)(acpi_realmode + HEADER_OFFSET); + if (header->signature != 0x51ee1111) { + printk(KERN_ERR "wakeup header does not match\n"); + return -EINVAL; + } + + header->video_mode = saved_video_mode; + + header->wakeup_jmp_seg = acpi_wakeup_address >> 4; + /* GDT[0]: GDT self-pointer */ + header->wakeup_gdt[0] = + (u64)(sizeof(header->wakeup_gdt) - 1) + + ((u64)(acpi_wakeup_address + + ((char *)&header->wakeup_gdt - (char *)acpi_realmode)) + << 16); + /* GDT[1]: real-mode-like code segment */ + header->wakeup_gdt[1] = (0x009bULL << 40) + + ((u64)acpi_wakeup_address << 16) + 0xffff; + /* GDT[2]: real-mode-like data segment */ + header->wakeup_gdt[2] = (0x0093ULL << 40) + + ((u64)acpi_wakeup_address << 16) + 0xffff; + +#ifndef CONFIG_64BIT + store_gdt((struct desc_ptr *)&header->pmode_gdt); + + header->pmode_efer_low = nx_enabled; + if (header->pmode_efer_low & 1) { + /* This is strange, why not save efer, always? */ + rdmsr(MSR_EFER, header->pmode_efer_low, + header->pmode_efer_high); + } +#endif /* !CONFIG_64BIT */ + + header->pmode_cr0 = read_cr0(); + header->pmode_cr4 = read_cr4(); + header->realmode_flags = acpi_realmode_flags; + header->real_magic = 0x12345678; + +#ifndef CONFIG_64BIT + header->pmode_entry = (u32)&wakeup_pmode_return; + header->pmode_cr3 = (u32)(swsusp_pg_dir - __PAGE_OFFSET); + saved_magic = 0x12345678; +#else /* CONFIG_64BIT */ + header->trampoline_segment = setup_trampoline() >> 4; + init_rsp = (unsigned long)temp_stack + 4096; + initial_code = (unsigned long)wakeup_long64; + saved_magic = 0x123456789abcdef0; +#endif /* CONFIG_64BIT */ #endif return 0; @@ -61,15 +117,20 @@ void acpi_restore_state_mem(void) void __init acpi_reserve_bootmem(void) { #ifndef CONFIG_ACPI_PV_SLEEP - if ((&wakeup_end - &wakeup_start) > PAGE_SIZE*2) { + if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) { printk(KERN_ERR "ACPI: Wakeup code way too big, S3 disabled.\n"); return; } - acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2); - if (!acpi_wakeup_address) + acpi_realmode = (unsigned long)alloc_bootmem_low(WAKEUP_SIZE); + + if (!acpi_realmode) { printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); + return; + } + + acpi_wakeup_address = virt_to_phys((void *)acpi_realmode); #endif } --- head-2011-03-11.orig/arch/x86/kernel/cpu/common-xen.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/cpu/common-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -5,7 +5,6 @@ #include <linux/module.h> #include <linux/percpu.h> #include <linux/bootmem.h> -#include <asm/semaphore.h> #include <asm/processor.h> #include <asm/i387.h> #include <asm/msr.h> @@ -13,6 +12,7 @@ #include <asm/mmu_context.h> #include <asm/mtrr.h> #include <asm/mce.h> +#include <asm/pat.h> #ifdef CONFIG_X86_LOCAL_APIC #include <asm/mpspec.h> #include <asm/apic.h> @@ -69,9 +69,9 @@ __u32 cleared_cpu_caps[NCAPINTS] __cpuin static int cachesize_override __cpuinitdata = -1; static int disable_x86_serial_nr __cpuinitdata = 1; -struct cpu_dev 
* cpu_devs[X86_VENDOR_NUM] = {}; +struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; -static void __cpuinit default_init(struct cpuinfo_x86 * c) +static void __cpuinit default_init(struct cpuinfo_x86 *c) { /* Not much we can do here... */ /* Check if at least it has cpuid */ @@ -88,11 +88,11 @@ static struct cpu_dev __cpuinitdata defa .c_init = default_init, .c_vendor = "Unknown", }; -static struct cpu_dev * this_cpu __cpuinitdata = &default_cpu; +static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; static int __init cachesize_setup(char *str) { - get_option (&str, &cachesize_override); + get_option(&str, &cachesize_override); return 1; } __setup("cachesize=", cachesize_setup); @@ -114,12 +114,12 @@ int __cpuinit get_model_name(struct cpui /* Intel chips right-justify this string for some dumb reason; undo that brain damage */ p = q = &c->x86_model_id[0]; - while ( *p == ' ' ) + while (*p == ' ') p++; - if ( p != q ) { - while ( *p ) + if (p != q) { + while (*p) *q++ = *p++; - while ( q <= &c->x86_model_id[48] ) + while (q <= &c->x86_model_id[48]) *q++ = '\0'; /* Zero-pad the rest */ } @@ -137,7 +137,7 @@ void __cpuinit display_cacheinfo(struct cpuid(0x80000005, &dummy, &dummy, &ecx, &edx); printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); - c->x86_cache_size=(ecx>>24)+(edx>>24); + c->x86_cache_size = (ecx>>24)+(edx>>24); } if (n < 0x80000006) /* Some chips just has a large L1. */ @@ -145,16 +145,16 @@ void __cpuinit display_cacheinfo(struct ecx = cpuid_ecx(0x80000006); l2size = ecx >> 16; - + /* do processor-specific cache resizing */ if (this_cpu->c_size_cache) - l2size = this_cpu->c_size_cache(c,l2size); + l2size = this_cpu->c_size_cache(c, l2size); /* Allow user to override all this if necessary. */ if (cachesize_override != -1) l2size = cachesize_override; - if ( l2size == 0 ) + if (l2size == 0) return; /* Again, no L2 cache is possible */ c->x86_cache_size = l2size; @@ -163,16 +163,19 @@ void __cpuinit display_cacheinfo(struct l2size, ecx & 0xFF); } -/* Naming convention should be: <Name> [(<Codename>)] */ -/* This table only is used unless init_<vendor>() below doesn't set it; */ -/* in particular, if CPUID levels 0x80000002..4 are supported, this isn't used */ +/* + * Naming convention should be: <Name> [(<Codename>)] + * This table only is used unless init_<vendor>() below doesn't set it; + * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used + * + */ /* Look up CPU names by table lookup. 
*/ static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) { struct cpu_model_info *info; - if ( c->x86_model >= 16 ) + if (c->x86_model >= 16) return NULL; /* Range check */ if (!this_cpu) @@ -197,9 +200,9 @@ static void __cpuinit get_cpu_vendor(str for (i = 0; i < X86_VENDOR_NUM; i++) { if (cpu_devs[i]) { - if (!strcmp(v,cpu_devs[i]->c_ident[0]) || - (cpu_devs[i]->c_ident[1] && - !strcmp(v,cpu_devs[i]->c_ident[1]))) { + if (!strcmp(v, cpu_devs[i]->c_ident[0]) || + (cpu_devs[i]->c_ident[1] && + !strcmp(v, cpu_devs[i]->c_ident[1]))) { c->x86_vendor = i; if (!early) this_cpu = cpu_devs[i]; @@ -217,7 +220,7 @@ static void __cpuinit get_cpu_vendor(str } -static int __init x86_fxsr_setup(char * s) +static int __init x86_fxsr_setup(char *s) { setup_clear_cpu_cap(X86_FEATURE_FXSR); setup_clear_cpu_cap(X86_FEATURE_XMM); @@ -226,7 +229,7 @@ static int __init x86_fxsr_setup(char * __setup("nofxsr", x86_fxsr_setup); -static int __init x86_sep_setup(char * s) +static int __init x86_sep_setup(char *s) { setup_clear_cpu_cap(X86_FEATURE_SEP); return 1; @@ -315,12 +318,15 @@ static void __cpuinit early_get_cap(stru } -/* Do minimum CPU detection early. - Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment. - The others are not touched to avoid unwanted side effects. - - WARNING: this function is only called on the BP. Don't add code here - that is supposed to run on all CPUs. */ +/* + * Do minimum CPU detection early. + * Fields really needed: vendor, cpuid_level, family, model, mask, + * cache alignment. + * The others are not touched to avoid unwanted side effects. + * + * WARNING: this function is only called on the BP. Don't add code here + * that is supposed to run on all CPUs. + */ static void __init early_cpu_detect(void) { struct cpuinfo_x86 *c = &boot_cpu_data; @@ -335,19 +341,14 @@ static void __init early_cpu_detect(void get_cpu_vendor(c, 1); - switch (c->x86_vendor) { - case X86_VENDOR_AMD: - early_init_amd(c); - break; - case X86_VENDOR_INTEL: - early_init_intel(c); - break; - } + if (c->x86_vendor != X86_VENDOR_UNKNOWN && + cpu_devs[c->x86_vendor]->c_early_init) + cpu_devs[c->x86_vendor]->c_early_init(c); early_get_cap(c); } -static void __cpuinit generic_identify(struct cpuinfo_x86 * c) +static void __cpuinit generic_identify(struct cpuinfo_x86 *c) { u32 tfms, xlvl; unsigned int ebx; @@ -358,13 +359,12 @@ static void __cpuinit generic_identify(s (unsigned int *)&c->x86_vendor_id[0], (unsigned int *)&c->x86_vendor_id[8], (unsigned int *)&c->x86_vendor_id[4]); - + get_cpu_vendor(c, 0); /* Initialize the standard set of capabilities */ /* Note that the vendor-specific code below might override */ - /* Intel-defined flags: level 0x00000001 */ - if ( c->cpuid_level >= 0x00000001 ) { + if (c->cpuid_level >= 0x00000001) { u32 capability, excap; cpuid(0x00000001, &tfms, &ebx, &excap, &capability); c->x86_capability[0] = capability; @@ -377,13 +377,15 @@ static void __cpuinit generic_identify(s c->x86_model += ((tfms >> 16) & 0xF) << 4; c->x86_mask = tfms & 15; #ifndef CONFIG_XEN + c->initial_apicid = (ebx >> 24) & 0xFF; #ifdef CONFIG_X86_HT - c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0); + c->apicid = phys_pkg_id(c->initial_apicid, 0); + c->phys_proc_id = c->initial_apicid; #else - c->apicid = (ebx >> 24) & 0xFF; + c->apicid = c->initial_apicid; #endif #endif - if (c->x86_capability[0] & (1<<19)) + if (test_cpu_cap(c, X86_FEATURE_CLFLSH)) c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8; } else { /* Have CPUID level 0 only - unheard of */ @@ -392,33 +394,30 @@ 
static void __cpuinit generic_identify(s /* AMD-defined flags: level 0x80000001 */ xlvl = cpuid_eax(0x80000000); - if ( (xlvl & 0xffff0000) == 0x80000000 ) { - if ( xlvl >= 0x80000001 ) { + if ((xlvl & 0xffff0000) == 0x80000000) { + if (xlvl >= 0x80000001) { c->x86_capability[1] = cpuid_edx(0x80000001); c->x86_capability[6] = cpuid_ecx(0x80000001); } - if ( xlvl >= 0x80000004 ) + if (xlvl >= 0x80000004) get_model_name(c); /* Default name */ } init_scattered_cpuid_features(c); } -#ifdef CONFIG_X86_HT - c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff; -#endif } static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) { - if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr ) { + if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) { /* Disable processor serial number */ - unsigned long lo,hi; - rdmsr(MSR_IA32_BBL_CR_CTL,lo,hi); + unsigned long lo, hi; + rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi); lo |= 0x200000; - wrmsr(MSR_IA32_BBL_CR_CTL,lo,hi); + wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi); printk(KERN_NOTICE "CPU serial number disabled.\n"); - clear_bit(X86_FEATURE_PN, c->x86_capability); + clear_cpu_cap(c, X86_FEATURE_PN); /* Disabling the serial number may affect the cpuid level */ c->cpuid_level = cpuid_eax(0); @@ -455,9 +454,11 @@ void __cpuinit identify_cpu(struct cpuin memset(&c->x86_capability, 0, sizeof c->x86_capability); if (!have_cpuid_p()) { - /* First of all, decide if this is a 486 or higher */ - /* It's a 486 if we can modify the AC flag */ - if ( flag_is_changeable_p(X86_EFLAGS_AC) ) + /* + * First of all, decide if this is a 486 or higher + * It's a 486 if we can modify the AC flag + */ + if (flag_is_changeable_p(X86_EFLAGS_AC)) c->x86 = 4; else c->x86 = 3; @@ -490,10 +491,10 @@ void __cpuinit identify_cpu(struct cpuin */ /* If the model name is still unset, do table lookup. */ - if ( !c->x86_model_id[0] ) { + if (!c->x86_model_id[0]) { char *p; p = table_lookup_model(c); - if ( p ) + if (p) strcpy(c->x86_model_id, p); else /* Last resort... */ @@ -507,9 +508,9 @@ void __cpuinit identify_cpu(struct cpuin * common between the CPUs. The first time this routine gets * executed, c == &boot_cpu_data. 
*/ - if ( c != &boot_cpu_data ) { + if (c != &boot_cpu_data) { /* AND the already accumulated flags with these */ - for ( i = 0 ; i < NCAPINTS ; i++ ) + for (i = 0 ; i < NCAPINTS ; i++) boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; } @@ -553,7 +554,7 @@ void __cpuinit detect_ht(struct cpuinfo_ if (smp_num_siblings == 1) { printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); - } else if (smp_num_siblings > 1 ) { + } else if (smp_num_siblings > 1) { if (smp_num_siblings > NR_CPUS) { printk(KERN_WARNING "CPU: Unsupported number of the " @@ -563,7 +564,7 @@ void __cpuinit detect_ht(struct cpuinfo_ } index_msb = get_count_order(smp_num_siblings); - c->phys_proc_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb); + c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb); printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id); @@ -574,7 +575,7 @@ void __cpuinit detect_ht(struct cpuinfo_ core_bits = get_count_order(c->x86_max_cores); - c->cpu_core_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) & + c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) & ((1 << core_bits) - 1); if (c->x86_max_cores > 1) @@ -608,7 +609,7 @@ void __cpuinit print_cpu_info(struct cpu else printk("%s", c->x86_model_id); - if (c->x86_mask || c->cpuid_level >= 0) + if (c->x86_mask || c->cpuid_level >= 0) printk(" stepping %02x\n", c->x86_mask); else printk("\n"); @@ -627,24 +628,17 @@ __setup("clearcpuid=", setup_disablecpui cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; -/* This is hacky. :) - * We're emulating future behavior. - * In the future, the cpu-specific init functions will be called implicitly - * via the magic of initcalls. - * They will insert themselves into the cpu_devs structure. - * Then, when cpu_init() is called, we can just iterate over that array. 
- */ void __init early_cpu_init(void) { - intel_cpu_init(); - cyrix_init_cpu(); - nsc_init_cpu(); - amd_init_cpu(); - centaur_init_cpu(); - transmeta_init_cpu(); - nexgen_init_cpu(); - umc_init_cpu(); + struct cpu_vendor_dev *cvdev; + + for (cvdev = __x86cpuvendor_start ; + cvdev < __x86cpuvendor_end ; + cvdev++) + cpu_devs[cvdev->vendor] = cvdev->cpu_dev; + early_cpu_detect(); + validate_pat_support(&boot_cpu_data); } /* Make sure %fs is initialized properly in idle threads */ @@ -689,7 +683,7 @@ void __cpuinit cpu_init(void) int cpu = smp_processor_id(); struct task_struct *curr = current; #ifndef CONFIG_X86_NO_TSS - struct tss_struct * t = &per_cpu(init_tss, cpu); + struct tss_struct *t = &per_cpu(init_tss, cpu); #endif struct thread_struct *thread = &curr->thread; @@ -742,7 +736,7 @@ void __cpuinit cpu_init(void) mxcsr_feature_mask_init(); } -#ifdef CONFIG_HOTPLUG_CPU +#if defined(CONFIG_HOTPLUG_CPU) && !defined(CONFIG_XEN) void __cpuinit cpu_uninit(void) { int cpu = raw_smp_processor_id(); --- head-2011-03-11.orig/arch/x86/kernel/cpu/mtrr/main-xen.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/cpu/mtrr/main-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -35,6 +35,8 @@ struct mtrr_ops *mtrr_if = &generic_mtrr unsigned int num_var_ranges; unsigned int mtrr_usage_table[MAX_VAR_RANGES]; +static u64 tom2; + static void __init set_num_var_ranges(void) { struct xen_platform_op op; @@ -162,8 +164,144 @@ mtrr_del(int reg, unsigned long base, un EXPORT_SYMBOL(mtrr_add); EXPORT_SYMBOL(mtrr_del); +/* + * Returns the effective MTRR type for the region + * Error returns: + * - 0xFE - when the range is "not entirely covered" by _any_ var range MTRR + * - 0xFF - when MTRR is not enabled + */ +u8 mtrr_type_lookup(u64 start, u64 end) +{ + int i, error; + u64 start_mfn, end_mfn, base_mfn, top_mfn; + u8 prev_match, curr_match; + struct xen_platform_op op; + + if (!is_initial_xendomain()) + return MTRR_TYPE_WRBACK; + + if (!num_var_ranges) + return 0xFF; + + start_mfn = start >> PAGE_SHIFT; + /* Make end inclusive end, instead of exclusive */ + end_mfn = --end >> PAGE_SHIFT; + + /* Look in fixed ranges. 
Just return the type as per start */ + if (start_mfn < 0x100) { +#if 0//todo + op.cmd = XENPF_read_memtype; + op.u.read_memtype.reg = ???; + error = HYPERVISOR_platform_op(&op); + if (!error) + return op.u.read_memtype.type; +#endif + return MTRR_TYPE_UNCACHABLE; + } + + /* + * Look in variable ranges + * Look of multiple ranges matching this address and pick type + * as per MTRR precedence + */ + prev_match = 0xFF; + for (i = 0; i < num_var_ranges; ++i) { + op.cmd = XENPF_read_memtype; + op.u.read_memtype.reg = i; + error = HYPERVISOR_platform_op(&op); + + if (error || !op.u.read_memtype.nr_mfns) + continue; + + base_mfn = op.u.read_memtype.mfn; + top_mfn = base_mfn + op.u.read_memtype.nr_mfns - 1; + + if (base_mfn > end_mfn || start_mfn > top_mfn) { + continue; + } + + if (base_mfn > start_mfn || end_mfn > top_mfn) { + return 0xFE; + } + + curr_match = op.u.read_memtype.type; + if (prev_match == 0xFF) { + prev_match = curr_match; + continue; + } + + if (prev_match == MTRR_TYPE_UNCACHABLE || + curr_match == MTRR_TYPE_UNCACHABLE) { + return MTRR_TYPE_UNCACHABLE; + } + + if ((prev_match == MTRR_TYPE_WRBACK && + curr_match == MTRR_TYPE_WRTHROUGH) || + (prev_match == MTRR_TYPE_WRTHROUGH && + curr_match == MTRR_TYPE_WRBACK)) { + prev_match = MTRR_TYPE_WRTHROUGH; + curr_match = MTRR_TYPE_WRTHROUGH; + } + + if (prev_match != curr_match) { + return MTRR_TYPE_UNCACHABLE; + } + } + + if (tom2) { + if (start >= (1ULL<<32) && (end < tom2)) + return MTRR_TYPE_WRBACK; + } + + if (prev_match != 0xFF) + return prev_match; + +#if 0//todo + op.cmd = XENPF_read_def_memtype; + error = HYPERVISOR_platform_op(&op); + if (!error) + return op.u.read_def_memtype.type; +#endif + return MTRR_TYPE_UNCACHABLE; +} + +/* + * Newer AMD K8s and later CPUs have a special magic MSR way to force WB + * for memory >4GB. Check for that here. + * Note this won't check if the MTRRs < 4GB where the magic bit doesn't + * apply to are wrong, but so far we don't know of any such case in the wild. + */ +#define Tom2Enabled (1U << 21) +#define Tom2ForceMemTypeWB (1U << 22) + +int __init amd_special_default_mtrr(void) +{ + u32 l, h; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return 0; + if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) + return 0; + /* In case some hypervisor doesn't pass SYSCFG through */ + if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) + return 0; + /* + * Memory between 4GB and top of mem is forced WB by this magic bit. + * Reserved before K8RevF, but should be zero there. 
+ */ + if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) == + (Tom2Enabled | Tom2ForceMemTypeWB)) + return 1; + return 0; +} + void __init mtrr_bp_init(void) { + if (amd_special_default_mtrr()) { + /* TOP_MEM2 */ + rdmsrl(MSR_K8_TOP_MEM2, tom2); + tom2 &= 0xffffff8000000ULL; + } } void mtrr_ap_init(void) --- head-2011-03-11.orig/arch/x86/kernel/cpu/bugs.c 2010-08-02 00:11:14.000000000 +0200 +++ head-2011-03-11/arch/x86/kernel/cpu/bugs.c 2011-03-03 16:41:48.000000000 +0100 @@ -17,6 +17,7 @@ #include <asm/paravirt.h> #include <asm/alternative.h> +#ifndef CONFIG_XEN static int __init no_halt(char *s) { boot_cpu_data.hlt_works_ok = 0; @@ -24,6 +25,7 @@ static int __init no_halt(char *s) } __setup("no-hlt", no_halt); +#endif static int __init no_387(char *s) { @@ -79,13 +81,16 @@ static void __init check_fpu(void) : "=m" (*&fdiv_bug) : "m" (*&x), "m" (*&y)); +#ifndef CONFIG_XEN boot_cpu_data.fdiv_bug = fdiv_bug; if (boot_cpu_data.fdiv_bug) printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n"); +#endif } static void __init check_hlt(void) { +#ifndef CONFIG_XEN if (boot_cpu_data.x86 >= 5 || paravirt_enabled()) return; @@ -99,6 +104,7 @@ static void __init check_hlt(void) halt(); halt(); printk(KERN_CONT "OK.\n"); +#endif } /* --- head-2011-03-11.orig/arch/x86/kernel/cpu/proc.c 2011-03-11 10:41:54.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/cpu/proc.c 2011-03-03 16:38:42.000000000 +0100 @@ -10,7 +10,7 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c, unsigned int cpu) { -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) if (c->x86_max_cores * smp_num_siblings > 1) { seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); seq_printf(m, "siblings\t: %d\n", @@ -32,18 +32,22 @@ static void show_cpuinfo_misc(struct seq */ int fpu_exception = c->hard_math && (ignore_fpu_irq || cpu_has_fpu); seq_printf(m, +#ifndef CONFIG_XEN "fdiv_bug\t: %s\n" "hlt_bug\t\t: %s\n" "f00f_bug\t: %s\n" "coma_bug\t: %s\n" +#endif "fpu\t\t: %s\n" "fpu_exception\t: %s\n" "cpuid level\t: %d\n" "wp\t\t: %s\n", +#ifndef CONFIG_XEN c->fdiv_bug ? "yes" : "no", c->hlt_works_ok ? "no" : "yes", c->f00f_bug ? "yes" : "no", c->coma_bug ? "yes" : "no", +#endif c->hard_math ? "yes" : "no", fpu_exception ? "yes" : "no", c->cpuid_level, --- head-2011-03-11.orig/arch/x86/kernel/e820_32-xen.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/e820_32-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -469,7 +469,7 @@ int __init sanitize_e820_map(struct e820 * thinkpad 560x, for example, does not cooperate with the memory * detection code.) */ -int __init copy_e820_map(struct e820entry * biosmap, int nr_map) +int __init copy_e820_map(struct e820entry *biosmap, int nr_map) { #ifndef CONFIG_XEN /* Only one memory region (or negative)? Ignore it */ @@ -480,33 +480,17 @@ int __init copy_e820_map(struct e820entr #endif do { - unsigned long long start = biosmap->addr; - unsigned long long size = biosmap->size; - unsigned long long end = start + size; - unsigned long type = biosmap->type; + u64 start = biosmap->addr; + u64 size = biosmap->size; + u64 end = start + size; + u32 type = biosmap->type; /* Overflow in 64 bits? Ignore the memory map. */ if (start > end) return -1; -#ifndef CONFIG_XEN - /* - * Some BIOSes claim RAM in the 640k - 1M region. - * Not right. Fix it up. 
- */ - if (type == E820_RAM) { - if (start < 0x100000ULL && end > 0xA0000ULL) { - if (start < 0xA0000ULL) - add_memory_region(start, 0xA0000ULL-start, type); - if (end <= 0x100000ULL) - continue; - start = 0x100000ULL; - size = end - start; - } - } -#endif add_memory_region(start, size, type); - } while (biosmap++,--nr_map); + } while (biosmap++, --nr_map); #ifdef CONFIG_XEN if (is_initial_xendomain()) { @@ -528,7 +512,7 @@ int __init copy_e820_map(struct e820entr /* * Find the highest page frame number we have available */ -void __init find_max_pfn(void) +void __init propagate_e820_map(void) { int i; @@ -801,7 +785,7 @@ static int __init parse_memmap(char *arg * size before original memory map is * reset. */ - find_max_pfn(); + propagate_e820_map(); saved_max_pfn = max_pfn; #endif e820.nr_map = 0; --- head-2011-03-11.orig/arch/x86/kernel/e820_64-xen.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/e820_64-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -41,11 +41,11 @@ unsigned long end_pfn; #ifndef CONFIG_XEN /* - * end_pfn only includes RAM, while end_pfn_map includes all e820 entries. - * The direct mapping extends to end_pfn_map, so that we can directly access + * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. + * The direct mapping extends to max_pfn_mapped, so that we can directly access * apertures, ACPI and other tables without having to play with fixmaps. */ -unsigned long end_pfn_map; +unsigned long max_pfn_mapped; #endif /* @@ -65,8 +65,8 @@ struct early_res { static struct early_res early_res[MAX_EARLY_RES] __initdata = { #ifndef CONFIG_XEN { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */ -#ifdef CONFIG_SMP - { SMP_TRAMPOLINE_BASE, SMP_TRAMPOLINE_BASE + 2*PAGE_SIZE, "SMP_TRAMPOLINE" }, +#ifdef CONFIG_X86_TRAMPOLINE + { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" }, #endif #endif {} @@ -91,19 +91,47 @@ void __init reserve_early(unsigned long strncpy(r->name, name, sizeof(r->name) - 1); } -void __init early_res_to_bootmem(void) +void __init free_early(unsigned long start, unsigned long end) +{ + struct early_res *r; + int i, j; + + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + r = &early_res[i]; + if (start == r->start && end == r->end) + break; + } + if (i >= MAX_EARLY_RES || !early_res[i].end) + panic("free_early on not reserved area: %lx-%lx!", start, end); + + for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++) + ; + + memmove(&early_res[i], &early_res[i + 1], + (j - 1 - i) * sizeof(struct early_res)); + + early_res[j - 1].end = 0; +} + +void __init early_res_to_bootmem(unsigned long start, unsigned long end) { int i; + unsigned long final_start, final_end; for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { struct early_res *r = &early_res[i]; - printk(KERN_INFO "early res: %d [%lx-%lx] %s\n", i, - r->start, r->end - 1, r->name); - reserve_bootmem_generic(r->start, r->end - r->start); + final_start = max(start, r->start); + final_end = min(end, r->end); + if (final_start >= final_end) + continue; + printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i, + final_start, final_end - 1, r->name); + reserve_bootmem_generic(final_start, final_end - final_start); } } /* Check for already reserved areas */ -static inline int bad_addr(unsigned long *addrp, unsigned long size) +static inline int __init +bad_addr(unsigned long *addrp, unsigned long size, unsigned long align) { int i; unsigned long addr = *addrp, last; @@ -113,7 +141,7 @@ again: for (i = 0; i < MAX_EARLY_RES && 
early_res[i].end; i++) { struct early_res *r = &early_res[i]; if (last >= r->start && addr < r->end) { - *addrp = addr = r->end; + *addrp = addr = round_up(r->end, align); changed = 1; goto again; } @@ -121,6 +149,40 @@ again: return changed; } +/* Check for already reserved areas */ +static inline int __init +bad_addr_size(unsigned long *addrp, unsigned long *sizep, unsigned long align) +{ + int i; + unsigned long addr = *addrp, last; + unsigned long size = *sizep; + int changed = 0; +again: + last = addr + size; + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + struct early_res *r = &early_res[i]; + if (last > r->start && addr < r->start) { + size = r->start - addr; + changed = 1; + goto again; + } + if (last > r->end && addr < r->end) { + addr = round_up(r->end, align); + size = last - addr; + changed = 1; + goto again; + } + if (last <= r->end && addr >= r->start) { + (*sizep)++; + return 0; + } + } + if (changed) { + *addrp = addr; + *sizep = size; + } + return changed; +} /* * This function checks if any part of the range <start,end> is mapped * with type. @@ -196,26 +258,27 @@ int __init e820_all_mapped(unsigned long * Find a free area with specified alignment in a specific range. */ unsigned long __init find_e820_area(unsigned long start, unsigned long end, - unsigned size, unsigned long align) + unsigned long size, unsigned long align) { int i; - unsigned long mask = ~(align - 1); for (i = 0; i < e820.nr_map; i++) { struct e820entry *ei = &e820.map[i]; - unsigned long addr = ei->addr, last; + unsigned long addr, last; + unsigned long ei_last; if (ei->type != E820_RAM) continue; + addr = round_up(ei->addr, align); + ei_last = ei->addr + ei->size; if (addr < start) - addr = start; - if (addr > ei->addr + ei->size) + addr = round_up(start, align); + if (addr >= ei_last) continue; - while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size) + while (bad_addr(&addr, size, align) && addr+size <= ei_last) ; - addr = (addr + align - 1) & mask; last = addr + size; - if (last > ei->addr + ei->size) + if (last > ei_last) continue; if (last > end) continue; @@ -225,6 +288,40 @@ unsigned long __init find_e820_area(unsi } /* + * Find next free range after *start + */ +unsigned long __init find_e820_area_size(unsigned long start, + unsigned long *sizep, + unsigned long align) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + unsigned long addr, last; + unsigned long ei_last; + + if (ei->type != E820_RAM) + continue; + addr = round_up(ei->addr, align); + ei_last = ei->addr + ei->size; + if (addr < start) + addr = round_up(start, align); + if (addr >= ei_last) + continue; + *sizep = ei_last - addr; + while (bad_addr_size(&addr, sizep, align) && + addr + *sizep <= ei_last) + ; + last = addr + *sizep; + if (last > ei_last) + continue; + return addr; + } + return -1UL; + +} +/* * Find the highest page frame number we have available */ unsigned long __init e820_end_of_ram(void) @@ -233,31 +330,29 @@ unsigned long __init e820_end_of_ram(voi end_pfn = find_max_pfn_with_active_regions(); - if (end_pfn > end_pfn_map) - end_pfn_map = end_pfn; - if (end_pfn_map > MAXMEM>>PAGE_SHIFT) - end_pfn_map = MAXMEM>>PAGE_SHIFT; + if (end_pfn > max_pfn_mapped) + max_pfn_mapped = end_pfn; + if (max_pfn_mapped > MAXMEM>>PAGE_SHIFT) + max_pfn_mapped = MAXMEM>>PAGE_SHIFT; if (end_pfn > end_user_pfn) end_pfn = end_user_pfn; - if (end_pfn > end_pfn_map) - end_pfn = end_pfn_map; + if (end_pfn > max_pfn_mapped) + end_pfn = max_pfn_mapped; - printk(KERN_INFO 
"end_pfn_map = %lu\n", end_pfn_map); + printk(KERN_INFO "max_pfn_mapped = %lu\n", max_pfn_mapped); return end_pfn; } /* * Mark e820 reserved areas as busy for the resource manager. */ -void __init e820_reserve_resources(struct e820entry *e820, int nr_map, - struct resource *code_resource, - struct resource *data_resource, - struct resource *bss_resource) +void __init e820_reserve_resources(struct e820entry *e820, int nr_map) { int i; + struct resource *res; + + res = alloc_bootmem_low(sizeof(struct resource) * nr_map); for (i = 0; i < nr_map; i++) { - struct resource *res; - res = alloc_bootmem_low(sizeof(struct resource)); switch (e820[i].type) { case E820_RAM: res->name = "System RAM"; break; case E820_ACPI: res->name = "ACPI Tables"; break; @@ -267,26 +362,8 @@ void __init e820_reserve_resources(struc res->start = e820[i].addr; res->end = res->start + e820[i].size - 1; res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; - request_resource(&iomem_resource, res); - if (e820[i].type == E820_RAM) { - /* - * We don't know which RAM region contains kernel data, - * so we try it repeatedly and let the resource manager - * test it. - */ -#ifndef CONFIG_XEN - request_resource(res, code_resource); - request_resource(res, data_resource); - request_resource(res, bss_resource); -#endif -#ifdef CONFIG_KEXEC - if (crashk_res.start != crashk_res.end) - request_resource(res, &crashk_res); -#ifdef CONFIG_XEN - xen_machine_kexec_register_resources(res); -#endif -#endif - } + insert_resource(&iomem_resource, res); + res++; } } @@ -345,9 +422,9 @@ static int __init e820_find_active_regio if (*ei_startpfn >= *ei_endpfn) return 0; - /* Check if end_pfn_map should be updated */ - if (ei->type != E820_RAM && *ei_endpfn > end_pfn_map) - end_pfn_map = *ei_endpfn; + /* Check if max_pfn_mapped should be updated */ + if (ei->type != E820_RAM && *ei_endpfn > max_pfn_mapped) + max_pfn_mapped = *ei_endpfn; /* Skip if map is outside the node */ if (ei->type != E820_RAM || *ei_endpfn <= start_pfn || @@ -678,10 +755,10 @@ static int __init copy_e820_map(struct e #endif do { - unsigned long start = biosmap->addr; - unsigned long size = biosmap->size; - unsigned long end = start + size; - unsigned long type = biosmap->type; + u64 start = biosmap->addr; + u64 size = biosmap->size; + u64 end = start + size; + u32 type = biosmap->type; /* Overflow in 64 bits? Ignore the memory map. */ if (start > end) @@ -731,11 +808,7 @@ char * __init machine_specific_memory_se char *who = "Xen"; int rc; struct xen_memory_map memmap; - /* - * This is rather large for a stack variable but this early in - * the boot process we know we have plenty slack space. 
- */ - struct e820entry map[E820MAX]; + static struct e820entry __initdata map[E820MAX]; memmap.nr_entries = E820MAX; set_xen_guest_handle(memmap.buffer, map); @@ -812,7 +885,7 @@ static int __init parse_memmap_opt(char saved_max_pfn = e820_end_of_ram(); remove_all_active_ranges(); #endif - end_pfn_map = 0; + max_pfn_mapped = 0; e820.nr_map = 0; userdef = 1; return 0; --- head-2011-03-11.orig/arch/x86/kernel/early_printk-xen.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/early_printk-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -13,7 +13,7 @@ #define VGABASE (__ISA_IO_base + 0xb8000) static int max_ypos = 25, max_xpos = 80; -static int current_ypos = 25, current_xpos = 0; +static int current_ypos = 25, current_xpos; static void early_vga_write(struct console *con, const char *str, unsigned n) { @@ -108,12 +108,12 @@ static __init void early_serial_init(cha if (*s) { unsigned port; - if (!strncmp(s,"0x",2)) { + if (!strncmp(s, "0x", 2)) { early_serial_base = simple_strtoul(s, &e, 16); } else { static int bases[] = { 0x3f8, 0x2f8 }; - if (!strncmp(s,"ttyS",4)) + if (!strncmp(s, "ttyS", 4)) s += 4; port = simple_strtoul(s, &e, 10); if (port > 1 || s == e) @@ -223,7 +223,7 @@ static struct console simnow_console = { /* Direct interface for emergencies */ static struct console *early_console = &early_vga_console; -static int early_console_initialized = 0; +static int early_console_initialized; void early_printk(const char *fmt, ...) { @@ -231,9 +231,9 @@ void early_printk(const char *fmt, ...) int n; va_list ap; - va_start(ap,fmt); - n = vscnprintf(buf,512,fmt,ap); - early_console->write(early_console,buf,n); + va_start(ap, fmt); + n = vscnprintf(buf, 512, fmt, ap); + early_console->write(early_console, buf, n); va_end(ap); } @@ -259,16 +259,16 @@ static int __init setup_early_printk(cha early_console = &early_serial_console; } else if (!strncmp(buf, "vga", 3)) { #ifndef CONFIG_XEN - && boot_params.screen_info.orig_video_isVGA == 1) { + && boot_params.screen_info.orig_video_isVGA == 1) { max_xpos = boot_params.screen_info.orig_video_cols; max_ypos = boot_params.screen_info.orig_video_lines; current_ypos = boot_params.screen_info.orig_y; #endif early_console = &early_vga_console; - } else if (!strncmp(buf, "simnow", 6)) { - simnow_init(buf + 6); - early_console = &simnow_console; - keep_early = 1; + } else if (!strncmp(buf, "simnow", 6)) { + simnow_init(buf + 6); + early_console = &simnow_console; + keep_early = 1; #ifdef CONFIG_XEN } else if (!strncmp(buf, "xen", 3)) { early_console = &xenboot_console; --- head-2011-03-11.orig/arch/x86/kernel/entry_32-xen.S 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/entry_32-xen.S 2011-01-31 18:07:35.000000000 +0100 @@ -1,5 +1,4 @@ /* - * linux/arch/i386/entry.S * * Copyright (C) 1991, 1992 Linus Torvalds */ @@ -51,6 +50,7 @@ #include <asm/desc.h> #include <asm/percpu.h> #include <asm/dwarf2.h> +#include <asm/processor-flags.h> #include "irq_vectors.h" #include <xen/interface/xen.h> @@ -69,12 +69,6 @@ #define nr_syscalls ((syscall_table_size)/4) -CF_MASK = 0x00000001 -TF_MASK = 0x00000100 -IF_MASK = 0x00000200 -DF_MASK = 0x00000400 -NT_MASK = 0x00004000 -VM_MASK = 0x00020000 /* Pseudo-eflags. */ NMI_MASK = 0x80000000 @@ -87,7 +81,7 @@ NMI_MASK = 0x80000000 .macro TRACE_IRQS_IRET #ifdef CONFIG_TRACE_IRQFLAGS - testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off? + testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off? 
jz 1f TRACE_IRQS_ON 1: @@ -249,7 +243,7 @@ ret_from_intr: check_userspace: movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS movb PT_CS(%esp), %al - andl $(VM_MASK | SEGMENT_RPL_MASK), %eax + andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax cmpl $USER_RPL, %eax jb resume_kernel # not returning to v8086 or userspace @@ -258,6 +252,7 @@ ENTRY(resume_userspace) DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret + TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx andl $_TIF_WORK_MASK, %ecx # is there any work to be done on # int/exception return? @@ -274,7 +269,7 @@ need_resched: movl TI_flags(%ebp), %ecx # need_resched set ? testb $_TIF_NEED_RESCHED, %cl jz restore_all - testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off (exception path) ? + testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? jz restore_all call preempt_schedule_irq jmp need_resched @@ -299,10 +294,10 @@ ENTRY(ia32_sysenter_target) movl SYSENTER_stack_sp0(%esp),%esp sysenter_past_esp: /* - * No need to follow this irqs on/off section: the syscall - * disabled irqs and here we enable it straight after entry: + * Interrupts are disabled here, but we can't trace it until + * enough kernel state to call TRACE_IRQS_OFF can be called - but + * we immediately enable interrupts at that point anyway. */ - ENABLE_INTERRUPTS(CLBR_NONE) pushl $(__USER_DS) CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET ss, 0*/ @@ -310,6 +305,7 @@ sysenter_past_esp: CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET esp, 0 pushfl + orl $X86_EFLAGS_IF, (%esp) CFI_ADJUST_CFA_OFFSET 4 pushl $(__USER_CS) CFI_ADJUST_CFA_OFFSET 4 @@ -323,6 +319,11 @@ sysenter_past_esp: CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET eip, 0 + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + ENABLE_INTERRUPTS(CLBR_NONE) + /* * Load the potential sixth argument from user stack. * Careful about security. @@ -330,14 +331,12 @@ sysenter_past_esp: cmpl $__PAGE_OFFSET-3,%ebp jae syscall_fault 1: movl (%ebp),%ebp + movl %ebp,PT_EBP(%esp) .section __ex_table,"a" .align 4 .long 1b,syscall_fault .previous - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 - SAVE_ALL GET_THREAD_INFO(%ebp) test_tif %ebp jnz syscall_trace_entry @@ -414,7 +413,7 @@ syscall_exit: # setting need_resched or sigpending # between sampling and the iret TRACE_IRQS_OFF - testl $TF_MASK,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit + testl $X86_EFLAGS_TF,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit jz no_singlestep orl $_TIF_SINGLESTEP,TI_flags(%ebp) no_singlestep: @@ -430,7 +429,7 @@ restore_all: # See comments in process.c:copy_thread() for details. 
movb PT_OLDSS(%esp), %ah movb PT_CS(%esp), %al - andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax + andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax CFI_REMEMBER_STATE je ldt_ss # returning to user-space with LDT SS @@ -438,7 +437,7 @@ restore_nocheck: #else restore_nocheck: movl PT_EFLAGS(%esp), %eax - testl $(VM_MASK|NMI_MASK), %eax + testl $(X86_EFLAGS_VM|NMI_MASK), %eax CFI_REMEMBER_STATE jnz hypervisor_iret shr $9, %eax # EAX[0] == IRET_EFLAGS.IF @@ -456,7 +455,7 @@ restore_nocheck_notrace: irq_return: INTERRUPT_RETURN .section .fixup,"ax" -iret_exc: +ENTRY(iret_exc) pushl $0 # no error code pushl $do_iret_error jmp error_code @@ -560,7 +559,7 @@ work_resched: work_notifysig: # deal with pending signals and # notify-resume requests #ifdef CONFIG_VM86 - testl $VM_MASK, PT_EFLAGS(%esp) + testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) movl %esp, %eax jne work_notifysig_v86 # returning to kernel-space or # vm86-space @@ -617,9 +616,6 @@ END(syscall_exit_work) RING0_INT_FRAME # can't unwind into user space anyway syscall_fault: - pushl %eax # save orig_eax - CFI_ADJUST_CFA_OFFSET 4 - SAVE_ALL GET_THREAD_INFO(%ebp) movl $-EFAULT,PT_EAX(%esp) jmp resume_userspace --- head-2011-03-11.orig/arch/x86/kernel/entry_64-xen.S 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/entry_64-xen.S 2011-01-31 18:07:35.000000000 +0100 @@ -331,19 +331,17 @@ badsys: /* Do syscall tracing */ tracesys: SAVE_REST - movq $-ENOSYS,RAX(%rsp) + movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ FIXUP_TOP_OF_STACK %rdi movq %rsp,%rdi call syscall_trace_enter LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ RESTORE_REST cmpq $__NR_syscall_max,%rax - movq $-ENOSYS,%rcx - cmova %rcx,%rax - ja 1f + ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */ movq %r10,%rcx /* fixup for C */ call *sys_call_table(,%rax,8) -1: movq %rax,RAX-ARGOFFSET(%rsp) + movq %rax,RAX-ARGOFFSET(%rsp) /* Use IRET because user could have changed frame */ /* --- head-2011-03-11.orig/arch/x86/kernel/head64-xen.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/head64-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -17,6 +17,7 @@ #include <linux/string.h> #include <linux/percpu.h> #include <linux/start_kernel.h> +#include <linux/io.h> #include <linux/module.h> #include <asm/processor.h> @@ -29,6 +30,7 @@ #include <asm/sections.h> #include <asm/kdebug.h> #include <asm/e820.h> +#include <asm/bios_ebda.h> unsigned long start_pfn; @@ -75,34 +77,75 @@ EXPORT_SYMBOL(machine_to_phys_mapping); unsigned int machine_to_phys_order; EXPORT_SYMBOL(machine_to_phys_order); -#define EBDA_ADDR_POINTER 0x40E +#define BIOS_LOWMEM_KILOBYTES 0x413 -static __init void reserve_ebda(void) +/* + * The BIOS places the EBDA/XBDA at the top of conventional + * memory, and usually decreases the reported amount of + * conventional memory (int 0x12) too. This also contains a + * workaround for Dell systems that neglect to reserve EBDA. + * The same workaround also avoids a problem with the AMD768MPX + * chipset: reserve a page before VGA to prevent PCI prefetch + * into it (errata #56). Usually the page is reserved anyways, + * unless you have no PS/2 mouse plugged in. 
+ */ +static void __init reserve_ebda_region(void) { #ifndef CONFIG_XEN - unsigned ebda_addr, ebda_size; + unsigned int lowmem, ebda_addr; - /* - * there is a real-mode segmented pointer pointing to the - * 4K EBDA area at 0x40E - */ - ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER); - ebda_addr <<= 4; - - if (!ebda_addr) + /* To determine the position of the EBDA and the */ + /* end of conventional memory, we need to look at */ + /* the BIOS data area. In a paravirtual environment */ + /* that area is absent. We'll just have to assume */ + /* that the paravirt case can handle memory setup */ + /* correctly, without our help. */ + if (paravirt_enabled()) return; - ebda_size = *(unsigned short *)__va(ebda_addr); + /* end of low (conventional) memory */ + lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES); + lowmem <<= 10; + + /* start of EBDA area */ + ebda_addr = get_bios_ebda(); + + /* Fixup: bios puts an EBDA in the top 64K segment */ + /* of conventional memory, but does not adjust lowmem. */ + if ((lowmem - ebda_addr) <= 0x10000) + lowmem = ebda_addr; + + /* Fixup: bios does not report an EBDA at all. */ + /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */ + if ((ebda_addr == 0) && (lowmem >= 0x9f000)) + lowmem = 0x9f000; + + /* Paranoia: should never happen, but... */ + if ((lowmem == 0) || (lowmem >= 0x100000)) + lowmem = 0x9f000; - /* Round EBDA up to pages */ - if (ebda_size == 0) - ebda_size = 1; - ebda_size <<= 10; - ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE); - if (ebda_size > 64*1024) - ebda_size = 64*1024; + /* reserve all memory between lowmem and the 1MB mark */ + reserve_early(lowmem, 0x100000, "BIOS reserved"); +#endif +} - reserve_early(ebda_addr, ebda_addr + ebda_size, "EBDA"); +static void __init reserve_setup_data(void) +{ +#ifndef CONFIG_XEN + struct setup_data *data; + unsigned long pa_data; + char buf[32]; + + if (boot_params.hdr.version < 0x0209) + return; + pa_data = boot_params.hdr.setup_data; + while (pa_data) { + data = early_ioremap(pa_data, sizeof(*data)); + sprintf(buf, "setup data %x", data->type); + reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); + pa_data = data->next; + early_iounmap(data, sizeof(*data)); + } #endif } @@ -112,6 +155,19 @@ void __init x86_64_start_kernel(char * r unsigned long machine_to_phys_nr_ents; int i; + /* + * Build-time sanity checks on the kernel image and module + * area mappings. 
(these are purely build-time and produce no code) + */ + BUILD_BUG_ON(MODULES_VADDR < KERNEL_IMAGE_START); + BUILD_BUG_ON(MODULES_VADDR-KERNEL_IMAGE_START < KERNEL_IMAGE_SIZE); + BUILD_BUG_ON(MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE); + BUILD_BUG_ON((KERNEL_IMAGE_START & ~PMD_MASK) != 0); + BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0); + BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); + BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == + (__START_KERNEL & PGDIR_MASK))); + xen_setup_features(); xen_start_info = (struct start_info *)real_mode_data; @@ -140,7 +196,7 @@ void __init x86_64_start_kernel(char * r /* Cleanup the over mapped high alias */ cleanup_highmap(); - for (i = 0; i < IDT_ENTRIES; i++) { + for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) { #ifdef CONFIG_EARLY_PRINTK set_intr_gate(i, &early_idt_handlers[i]); #else @@ -163,7 +219,8 @@ void __init x86_64_start_kernel(char * r reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE), start_pfn << PAGE_SHIFT, "Xen provided"); - reserve_ebda(); + reserve_ebda_region(); + reserve_setup_data(); /* * At this point everything still needed from the boot loader --- head-2011-03-11.orig/arch/x86/kernel/head_32-xen.S 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/head_32-xen.S 2011-01-31 18:07:35.000000000 +0100 @@ -69,7 +69,7 @@ ENTRY(startup_32) cld # gcc2 wants the direction flag cleared at all times pushl $0 # fake return address for unwinder - jmp start_kernel + jmp i386_start_kernel #define HYPERCALL_PAGE_OFFSET 0x1000 .org HYPERCALL_PAGE_OFFSET --- head-2011-03-11.orig/arch/x86/kernel/io_apic_32-xen.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/io_apic_32-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -88,6 +88,16 @@ int sis_apic_bug = -1; */ int nr_ioapic_registers[MAX_IO_APICS]; +/* I/O APIC entries */ +struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; +int nr_ioapics; + +/* MP IRQ source entries */ +struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; + +/* # of MP IRQ source entries */ +int mp_irq_entries; + static int disable_timer_pin_1 __initdata; /* @@ -863,10 +873,7 @@ static int __init find_isa_irq_pin(int i for (i = 0; i < mp_irq_entries; i++) { int lbus = mp_irqs[i].mpc_srcbus; - if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || - mp_bus_id_to_type[lbus] == MP_BUS_EISA || - mp_bus_id_to_type[lbus] == MP_BUS_MCA - ) && + if (test_bit(lbus, mp_bus_not_pci) && (mp_irqs[i].mpc_irqtype == type) && (mp_irqs[i].mpc_srcbusirq == irq)) @@ -882,10 +889,7 @@ static int __init find_isa_irq_apic(int for (i = 0; i < mp_irq_entries; i++) { int lbus = mp_irqs[i].mpc_srcbus; - if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || - mp_bus_id_to_type[lbus] == MP_BUS_EISA || - mp_bus_id_to_type[lbus] == MP_BUS_MCA - ) && + if (test_bit(lbus, mp_bus_not_pci) && (mp_irqs[i].mpc_irqtype == type) && (mp_irqs[i].mpc_srcbusirq == irq)) break; @@ -926,7 +930,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, mp_irqs[i].mpc_dstapic == MP_APIC_ALL) break; - if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) && + if (!test_bit(lbus, mp_bus_not_pci) && !mp_irqs[i].mpc_irqtype && (bus == lbus) && (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { @@ -977,6 +981,7 @@ void __init setup_ioapic_dest(void) #endif /* !CONFIG_XEN */ #endif +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) /* * EISA Edge/Level control register, ELCR */ @@ -990,6 +995,13 @@ static int EISA_ELCR(unsigned int irq) "Broken MPtable reports ISA irq %d\n", irq); return 0; } +#endif + +/* ISA interrupts are always polarity zero edge triggered, + * when 
listed as conforming in the MP table. */ + +#define default_ISA_trigger(idx) (0) +#define default_ISA_polarity(idx) (0) /* EISA interrupts are always polarity zero and can be edge or level * trigger depending on the ELCR value. If an interrupt is listed as @@ -997,13 +1009,7 @@ static int EISA_ELCR(unsigned int irq) * be read in from the ELCR */ #define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq)) -#define default_EISA_polarity(idx) (0) - -/* ISA interrupts are always polarity zero edge triggered, - * when listed as conforming in the MP table. */ - -#define default_ISA_trigger(idx) (0) -#define default_ISA_polarity(idx) (0) +#define default_EISA_polarity(idx) default_ISA_polarity(idx) /* PCI interrupts are always polarity one level triggered, * when listed as conforming in the MP table. */ @@ -1015,7 +1021,7 @@ static int EISA_ELCR(unsigned int irq) * when listed as conforming in the MP table. */ #define default_MCA_trigger(idx) (1) -#define default_MCA_polarity(idx) (0) +#define default_MCA_polarity(idx) default_ISA_polarity(idx) static int MPBIOS_polarity(int idx) { @@ -1029,35 +1035,9 @@ static int MPBIOS_polarity(int idx) { case 0: /* conforms, ie. bus-type dependent polarity */ { - switch (mp_bus_id_to_type[bus]) - { - case MP_BUS_ISA: /* ISA pin */ - { - polarity = default_ISA_polarity(idx); - break; - } - case MP_BUS_EISA: /* EISA pin */ - { - polarity = default_EISA_polarity(idx); - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - polarity = default_PCI_polarity(idx); - break; - } - case MP_BUS_MCA: /* MCA pin */ - { - polarity = default_MCA_polarity(idx); - break; - } - default: - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - } + polarity = test_bit(bus, mp_bus_not_pci)? + default_ISA_polarity(idx): + default_PCI_polarity(idx); break; } case 1: /* high active */ @@ -1098,11 +1078,15 @@ static int MPBIOS_trigger(int idx) { case 0: /* conforms, ie. bus-type dependent */ { + trigger = test_bit(bus, mp_bus_not_pci)? 
+ default_ISA_trigger(idx): + default_PCI_trigger(idx); +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) switch (mp_bus_id_to_type[bus]) { case MP_BUS_ISA: /* ISA pin */ { - trigger = default_ISA_trigger(idx); + /* set before the switch */ break; } case MP_BUS_EISA: /* EISA pin */ @@ -1112,7 +1096,7 @@ static int MPBIOS_trigger(int idx) } case MP_BUS_PCI: /* PCI pin */ { - trigger = default_PCI_trigger(idx); + /* set before the switch */ break; } case MP_BUS_MCA: /* MCA pin */ @@ -1127,6 +1111,7 @@ static int MPBIOS_trigger(int idx) break; } } +#endif break; } case 1: /* edge */ @@ -1176,39 +1161,22 @@ static int pin_2_irq(int idx, int apic, if (mp_irqs[idx].mpc_dstirq != pin) printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); - switch (mp_bus_id_to_type[bus]) - { - case MP_BUS_ISA: /* ISA pin */ - case MP_BUS_EISA: - case MP_BUS_MCA: - { - irq = mp_irqs[idx].mpc_srcbusirq; - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - /* - * PCI IRQs are mapped in order - */ - i = irq = 0; - while (i < apic) - irq += nr_ioapic_registers[i++]; - irq += pin; - - /* - * For MPS mode, so far only needed by ES7000 platform - */ - if (ioapic_renumber_irq) - irq = ioapic_renumber_irq(apic, irq); + if (test_bit(bus, mp_bus_not_pci)) + irq = mp_irqs[idx].mpc_srcbusirq; + else { + /* + * PCI IRQs are mapped in order + */ + i = irq = 0; + while (i < apic) + irq += nr_ioapic_registers[i++]; + irq += pin; - break; - } - default: - { - printk(KERN_ERR "unknown bus type %d.\n",bus); - irq = 0; - break; - } + /* + * For MPS mode, so far only needed by ES7000 platform + */ + if (ioapic_renumber_irq) + irq = ioapic_renumber_irq(apic, irq); } /* @@ -1314,7 +1282,6 @@ static void __init setup_IO_APIC_irqs(vo { struct IO_APIC_route_entry entry; int apic, pin, idx, irq, first_notcon = 1, vector; - unsigned long flags; apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); @@ -1380,9 +1347,7 @@ static void __init setup_IO_APIC_irqs(vo if (!apic && (irq < 16)) disable_8259A_irq(irq); } - spin_lock_irqsave(&ioapic_lock, flags); - __ioapic_write_entry(apic, pin, entry); - spin_unlock_irqrestore(&ioapic_lock, flags); + ioapic_write_entry(apic, pin, entry); } } @@ -1577,8 +1542,8 @@ void /*__init*/ print_local_APIC(void * printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", smp_processor_id(), hard_smp_processor_id()); - v = apic_read(APIC_ID); - printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v)); + printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, + GET_APIC_ID(read_apic_id())); v = apic_read(APIC_LVR); printk(KERN_INFO "... APIC VERSION: %08x\n", v); ver = GET_APIC_VERSION(v); @@ -1791,7 +1756,7 @@ void disable_IO_APIC(void) entry.delivery_mode = dest_ExtINT; /* ExtInt */ entry.vector = 0; entry.dest.physical.physical_dest = - GET_APIC_ID(apic_read(APIC_ID)); + GET_APIC_ID(read_apic_id()); /* * Add it to the IO-APIC irq-routing table: @@ -2090,8 +2055,7 @@ static inline void init_IO_APIC_traps(vo * 0x80, because int 0x80 is hm, kind of importantish. ;) */ for (irq = 0; irq < NR_IRQS ; irq++) { - int tmp = irq; - if (IO_APIC_IRQ(tmp) && !irq_vector[tmp]) { + if (IO_APIC_IRQ(irq) && !irq_vector[irq]) { /* * Hmm.. We don't have an entry for this, * so default to an old-fashioned 8259 @@ -2166,7 +2130,7 @@ static void __init setup_nmi(void) * cycles as some i82489DX-based boards have glue logic that keeps the * 8259A interrupt line asserted until INTA. 
--macro */ -static inline void unlock_ExtINT_logic(void) +static inline void __init unlock_ExtINT_logic(void) { int apic, pin, i; struct IO_APIC_route_entry entry0, entry1; @@ -2218,8 +2182,6 @@ static inline void unlock_ExtINT_logic(v ioapic_write_entry(apic, pin, entry0); } -int timer_uses_ioapic_pin_0; - /* * This code may look a bit paranoid, but it's supposed to cooperate with * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ @@ -2259,9 +2221,6 @@ static inline void __init check_timer(vo pin2 = ioapic_i8259.pin; apic2 = ioapic_i8259.apic; - if (pin1 == 0) - timer_uses_ioapic_pin_0 = 1; - printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", vector, apic1, pin1, apic2, pin2); @@ -2555,6 +2514,7 @@ void destroy_irq(unsigned int irq) dynamic_irq_cleanup(irq); spin_lock_irqsave(&vector_lock, flags); + clear_bit(irq_vector[irq], used_vectors); irq_vector[irq] = 0; spin_unlock_irqrestore(&vector_lock, flags); } @@ -2871,7 +2831,6 @@ int __init io_apic_get_redir_entries (in int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low) { struct IO_APIC_route_entry entry; - unsigned long flags; if (!IO_APIC_IRQ(irq)) { printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", @@ -2912,9 +2871,7 @@ int io_apic_set_pci_routing (int ioapic, if (!ioapic && (irq < 16)) disable_8259A_irq(irq); - spin_lock_irqsave(&ioapic_lock, flags); - __ioapic_write_entry(ioapic, pin, entry); - spin_unlock_irqrestore(&ioapic_lock, flags); + ioapic_write_entry(ioapic, pin, entry); return 0; } --- head-2011-03-11.orig/arch/x86/kernel/io_apic_64-xen.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/io_apic_64-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -43,13 +43,15 @@ #include <asm/smp.h> #include <asm/desc.h> #include <asm/proto.h> -#include <asm/mach_apic.h> #include <asm/acpi.h> #include <asm/dma.h> #include <asm/nmi.h> #include <asm/msidef.h> #include <asm/hypertransport.h> +#include <mach_ipi.h> +#include <mach_apic.h> + struct irq_cfg { #ifndef CONFIG_XEN cpumask_t domain; @@ -101,6 +103,16 @@ DEFINE_SPINLOCK(vector_lock); */ int nr_ioapic_registers[MAX_IO_APICS]; +/* I/O APIC entries */ +struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; +int nr_ioapics; + +/* MP IRQ source entries */ +struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; + +/* # of MP IRQ source entries */ +int mp_irq_entries; + /* * Rough estimation of how many shared IRQs there are, can * be changed anytime. @@ -181,11 +193,10 @@ static inline void io_apic_modify(unsign writel(value, &io_apic->data); } -static int io_apic_level_ack_pending(unsigned int irq) +static bool io_apic_level_ack_pending(unsigned int irq) { struct irq_pin_list *entry; unsigned long flags; - int pending = 0; spin_lock_irqsave(&ioapic_lock, flags); entry = irq_2_pin + irq; @@ -198,13 +209,17 @@ static int io_apic_level_ack_pending(uns break; reg = io_apic_read(entry->apic, 0x10 + pin*2); /* Is the remote IRR bit set? 
*/ - pending |= (reg >> 14) & 1; + if ((reg >> 14) & 1) { + spin_unlock_irqrestore(&ioapic_lock, flags); + return true; + } if (!entry->next) break; entry = irq_2_pin + entry->next; } spin_unlock_irqrestore(&ioapic_lock, flags); - return pending; + + return false; } #endif @@ -762,7 +777,7 @@ static void __clear_irq_vector(int irq) per_cpu(vector_irq, cpu)[vector] = -1; cfg->vector = 0; - cfg->domain = CPU_MASK_NONE; + cpus_clear(cfg->domain); } void __setup_vector_irq(int cpu) @@ -899,9 +914,8 @@ static void __init setup_IO_APIC_irqs(vo static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector) { struct IO_APIC_route_entry entry; - unsigned long flags; - memset(&entry,0,sizeof(entry)); + memset(&entry, 0, sizeof(entry)); disable_8259A_irq(0); @@ -929,10 +943,7 @@ static void __init setup_ExtINT_IRQ0_pin /* * Add it to the IO-APIC irq-routing table: */ - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); - spin_unlock_irqrestore(&ioapic_lock, flags); + ioapic_write_entry(apic, pin, entry); enable_8259A_irq(0); } @@ -1061,8 +1072,7 @@ void __apicdebuginit print_local_APIC(vo printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", smp_processor_id(), hard_smp_processor_id()); - v = apic_read(APIC_ID); - printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v)); + printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id())); v = apic_read(APIC_LVR); printk(KERN_INFO "... APIC VERSION: %08x\n", v); ver = GET_APIC_VERSION(v); @@ -1260,7 +1270,7 @@ void disable_IO_APIC(void) entry.dest_mode = 0; /* Physical */ entry.delivery_mode = dest_ExtINT; /* ExtInt */ entry.vector = 0; - entry.dest = GET_APIC_ID(apic_read(APIC_ID)); + entry.dest = GET_APIC_ID(read_apic_id()); /* * Add it to the IO-APIC irq-routing table: @@ -1353,9 +1363,7 @@ static int ioapic_retrigger_irq(unsigned unsigned long flags; spin_lock_irqsave(&vector_lock, flags); - cpus_clear(mask); - cpu_set(first_cpu(cfg->domain), mask); - + mask = cpumask_of_cpu(first_cpu(cfg->domain)); send_IPI_mask(mask, cfg->vector); spin_unlock_irqrestore(&vector_lock, flags); @@ -1519,8 +1527,7 @@ static inline void init_IO_APIC_traps(vo * 0x80, because int 0x80 is hm, kind of importantish. ;) */ for (irq = 0; irq < NR_IRQS ; irq++) { - int tmp = irq; - if (IO_APIC_IRQ(tmp) && !irq_cfg[tmp].vector) { + if (IO_APIC_IRQ(irq) && !irq_cfg[irq].vector) { /* * Hmm.. We don't have an entry for this, * so default to an old-fashioned 8259 @@ -1597,22 +1604,19 @@ static void __init setup_nmi(void) * cycles as some i82489DX-based boards have glue logic that keeps the * 8259A interrupt line asserted until INTA. 
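
The io_apic_level_ack_pending() rework above trades the OR-accumulator for an early return: as soon as one chained pin shows its remote-IRR bit set, the lock is dropped and true comes back, instead of the scan continuing to the end. A reduced sketch of that shape, with a pthread mutex and a toy pin list standing in for ioapic_lock and the irq_2_pin chain:

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct pin {
	unsigned irr_set : 1;	/* stands in for the remote-IRR register bit */
	struct pin *next;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static bool ack_pending(struct pin *entry)
{
	pthread_mutex_lock(&lock);
	for (; entry; entry = entry->next) {
		if (entry->irr_set) {
			/* first hit: unlock on the early path, as the hunk now does */
			pthread_mutex_unlock(&lock);
			return true;
		}
	}
	pthread_mutex_unlock(&lock);
	return false;
}

int main(void)
{
	struct pin b = { 1, NULL }, a = { 0, &b };
	return ack_pending(&a) ? 0 : 1;	/* exits 0: second pin is pending */
}
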
--macro */ -static inline void unlock_ExtINT_logic(void) +static inline void __init unlock_ExtINT_logic(void) { int apic, pin, i; struct IO_APIC_route_entry entry0, entry1; unsigned char save_control, save_freq_select; - unsigned long flags; pin = find_isa_irq_pin(8, mp_INT); apic = find_isa_irq_apic(8, mp_INT); if (pin == -1) return; - spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin); - *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); + entry0 = ioapic_read_entry(apic, pin); + clear_IO_APIC_pin(apic, pin); memset(&entry1, 0, sizeof(entry1)); @@ -1625,10 +1629,7 @@ static inline void unlock_ExtINT_logic(v entry1.trigger = 0; entry1.vector = 0; - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1)); - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0)); - spin_unlock_irqrestore(&ioapic_lock, flags); + ioapic_write_entry(apic, pin, entry1); save_control = CMOS_READ(RTC_CONTROL); save_freq_select = CMOS_READ(RTC_FREQ_SELECT); @@ -1647,10 +1648,7 @@ static inline void unlock_ExtINT_logic(v CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); clear_IO_APIC_pin(apic, pin); - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1)); - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0)); - spin_unlock_irqrestore(&ioapic_lock, flags); + ioapic_write_entry(apic, pin, entry0); } /* @@ -2327,7 +2325,6 @@ static struct resource * __init ioapic_s res = (void *)mem; if (mem != NULL) { - memset(mem, 0, n); mem += sizeof(struct resource) * nr_ioapics; for (i = 0; i < nr_ioapics; i++) { --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ head-2011-03-11/arch/x86/kernel/apic/ipi-xen.c 2011-02-21 13:56:33.000000000 +0100 @@ -0,0 +1,63 @@ +#include <linux/cpumask.h> +#include <linux/interrupt.h> + +#include <asm/smp.h> + +#ifdef CONFIG_X86_32 +#include <xen/evtchn.h> + +DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]); + +static inline void __send_IPI_one(unsigned int cpu, int vector) +{ + int irq = per_cpu(ipi_to_irq, cpu)[vector]; + BUG_ON(irq < 0); + notify_remote_via_irq(irq); +} + +void __send_IPI_shortcut(unsigned int shortcut, int vector) +{ + int cpu; + + switch (shortcut) { + case APIC_DEST_SELF: + __send_IPI_one(smp_processor_id(), vector); + break; + case APIC_DEST_ALLBUT: + for_each_online_cpu(cpu) + if (cpu != smp_processor_id()) + __send_IPI_one(cpu, vector); + break; + default: + printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut, + vector); + break; + } +} + +void send_IPI_self(int vector) +{ + __send_IPI_shortcut(APIC_DEST_SELF, vector); +} + +void send_IPI_mask_bitmask(cpumask_t cpumask, int vector) +{ + cpumask_t mask; + unsigned int cpu; + + cpus_andnot(mask, cpumask, cpu_online_map); + WARN_ON(!cpus_empty(mask)); + for_each_online_cpu(cpu) + if (cpu_isset(cpu, cpumask)) + __send_IPI_one(cpu, vector); +} + +void send_IPI_mask_sequence(cpumask_t mask, int vector) +{ + send_IPI_mask_bitmask(mask, vector); +} + +/* must come after the send_IPI functions above for inlining */ +#include <mach_ipi.h> + +#endif --- head-2011-03-11.orig/arch/x86/kernel/machine_kexec_64.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/machine_kexec_64.c 2011-01-31 18:07:35.000000000 +0100 @@ -114,8 +114,6 @@ int __init machine_kexec_setup_resources return 0; } -void machine_kexec_register_resources(struct resource *res) { ; } - #else /* CONFIG_XEN */ #define 
x__pmd(x) __pmd(x) --- head-2011-03-11.orig/arch/x86/kernel/microcode-xen.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/microcode-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -162,7 +162,7 @@ static int request_microcode(void) c->x86, c->x86_model, c->x86_mask); error = request_firmware(&firmware, name, &microcode_pdev->dev); if (error) { - pr_debug("ucode data file %s load failed\n", name); + pr_debug("microcode: ucode data file %s load failed\n", name); return error; } --- head-2011-03-11.orig/arch/x86/kernel/mmconf-fam10h_64.c 2011-03-11 10:41:54.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/mmconf-fam10h_64.c 2011-01-31 18:07:35.000000000 +0100 @@ -205,12 +205,20 @@ void __cpuinit fam10h_check_enable_mmcfg return; } +#ifndef CONFIG_XEN printk(KERN_INFO "Enable MMCONFIG on AMD Family 10h\n"); val &= ~((FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT) | (FAM10H_MMIO_CONF_BUSRANGE_MASK<<FAM10H_MMIO_CONF_BUSRANGE_SHIFT)); val |= fam10h_pci_mmconf_base | (8 << FAM10H_MMIO_CONF_BUSRANGE_SHIFT) | FAM10H_MMIO_CONF_ENABLE; wrmsrl(address, val); +#else + if ((val & ~((FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT) | + (FAM10H_MMIO_CONF_BUSRANGE_MASK<<FAM10H_MMIO_CONF_BUSRANGE_SHIFT))) + != (fam10h_pci_mmconf_base | (8 << FAM10H_MMIO_CONF_BUSRANGE_SHIFT) | + FAM10H_MMIO_CONF_ENABLE)) + pci_probe &= ~PCI_CHECK_ENABLE_AMD_MMCONF; +#endif } static int __init set_check_enable_amd_mmconf(const struct dmi_system_id *d) --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ head-2011-03-11/arch/x86/kernel/mpparse-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -0,0 +1,1117 @@ +/* + * Intel Multiprocessor Specification 1.1 and 1.4 + * compliant MP-table parsing routines. + * + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> + * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com> + * (c) 2008 Alexey Starikovskiy <astarikovskiy@suse.de> + */ + +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/bootmem.h> +#include <linux/kernel_stat.h> +#include <linux/mc146818rtc.h> +#include <linux/bitops.h> +#include <linux/acpi.h> +#include <linux/module.h> + +#include <asm/smp.h> +#include <asm/mtrr.h> +#include <asm/mpspec.h> +#include <asm/pgalloc.h> +#include <asm/io_apic.h> +#include <asm/proto.h> +#include <asm/acpi.h> +#include <asm/bios_ebda.h> + +#include <mach_apic.h> +#ifdef CONFIG_X86_32 +#include <mach_apicdef.h> +#include <mach_mpparse.h> +#endif + +/* Have we found an MP table */ +int smp_found_config; + +/* + * Various Linux-internal data structures created from the + * MP-table. + */ +#if defined (CONFIG_MCA) || defined (CONFIG_EISA) +int mp_bus_id_to_type[MAX_MP_BUSSES]; +#endif + +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); +int mp_bus_id_to_pci_bus[MAX_MP_BUSSES] = {[0 ... MAX_MP_BUSSES - 1] = -1 }; + +static int mp_current_pci_id; + +int pic_mode; + +/* + * Intel MP BIOS table parsing routines: + */ + +/* + * Checksum an MP configuration block. + */ + +static int __init mpf_checksum(unsigned char *mp, int len) +{ + int sum = 0; + + while (len--) + sum += *mp++; + + return sum & 0xFF; +} + +#ifdef CONFIG_X86_NUMAQ +/* + * Have to match translation table entries to main table entries by counter + * hence the mpc_record variable .... can't see a less disgusting way of + * doing this ....
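
The new ipi-xen.c introduced a little further up replaces APIC IPI delivery with Xen event-channel notification: each (cpu, vector) pair was bound to a per-CPU interrupt at setup time, so sending an IPI is nothing more than a notify on that binding, with the APIC_DEST_SELF/ALLBUT shortcuts mapped onto loops over the online CPUs. A compressed userspace model of that dispatch; NCPUS, NR_IPIS, the irq numbering and the notify stub are all illustrative:

#include <assert.h>
#include <stdio.h>

#define NCPUS	4
#define NR_IPIS	8

/* per-CPU table: which event-channel irq carries each IPI vector */
static int ipi_to_irq[NCPUS][NR_IPIS];

static void notify_remote_via_irq(int irq)
{
	printf("kick irq %d\n", irq);	/* stands in for the event-channel kick */
}

static void send_ipi_one(unsigned cpu, int vector)
{
	int irq = ipi_to_irq[cpu][vector];

	assert(irq >= 0);	/* mirrors the BUG_ON(irq < 0) above */
	notify_remote_via_irq(irq);
}

/* the APIC_DEST_ALLBUT shortcut becomes a loop skipping the sender */
static void send_ipi_allbutself(unsigned self, int vector)
{
	for (unsigned cpu = 0; cpu < NCPUS; cpu++)
		if (cpu != self)
			send_ipi_one(cpu, vector);
}

int main(void)
{
	for (unsigned c = 0; c < NCPUS; c++)
		for (int v = 0; v < NR_IPIS; v++)
			ipi_to_irq[c][v] = 100 + (int)c * NR_IPIS + v;

	send_ipi_allbutself(0, 2);	/* kicks CPUs 1..3 */
	return 0;
}
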
+ */ + +static int mpc_record; +static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] + __cpuinitdata; +#endif + +static void __cpuinit MP_processor_info(struct mpc_config_processor *m) +{ +#ifndef CONFIG_XEN + int apicid; + char *bootup_cpu = ""; + + if (!(m->mpc_cpuflag & CPU_ENABLED)) { + disabled_cpus++; + return; + } +#ifdef CONFIG_X86_NUMAQ + apicid = mpc_apic_id(m, translation_table[mpc_record]); +#else + apicid = m->mpc_apicid; +#endif + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { + bootup_cpu = " (Bootup-CPU)"; + boot_cpu_physical_apicid = m->mpc_apicid; + } + + printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu); + generic_processor_info(apicid, m->mpc_apicver); +#else /* CONFIG_XEN */ + num_processors++; +#endif +} + +static void __init MP_bus_info(struct mpc_config_bus *m) +{ + char str[7]; + + memcpy(str, m->mpc_bustype, 6); + str[6] = 0; + +#ifdef CONFIG_X86_NUMAQ + mpc_oem_bus_info(m, str, translation_table[mpc_record]); +#else + Dprintk("Bus #%d is %s\n", m->mpc_busid, str); +#endif + +#if MAX_MP_BUSSES < 256 + if (m->mpc_busid >= MAX_MP_BUSSES) { + printk(KERN_WARNING "MP table busid value (%d) for bustype %s " + " is too large, max. supported is %d\n", + m->mpc_busid, str, MAX_MP_BUSSES - 1); + return; + } +#endif + + if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) { + set_bit(m->mpc_busid, mp_bus_not_pci); +#if defined(CONFIG_EISA) || defined (CONFIG_MCA) + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; +#endif + } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { +#ifdef CONFIG_X86_NUMAQ + mpc_oem_pci_bus(m, translation_table[mpc_record]); +#endif + clear_bit(m->mpc_busid, mp_bus_not_pci); + mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; + mp_current_pci_id++; +#if defined(CONFIG_EISA) || defined (CONFIG_MCA) + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; + } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA; + } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA) - 1) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; +#endif + } else + printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); +} + +#ifdef CONFIG_X86_IO_APIC + +static int bad_ioapic(unsigned long address) +{ + if (nr_ioapics >= MAX_IO_APICS) { + printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " + "(found %d)\n", MAX_IO_APICS, nr_ioapics); + panic("Recompile kernel with bigger MAX_IO_APICS!\n"); + } + if (!address) { + printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" + " found in table, skipping!\n"); + return 1; + } + return 0; +} + +static void __init MP_ioapic_info(struct mpc_config_ioapic *m) +{ + if (!(m->mpc_flags & MPC_APIC_USABLE)) + return; + + printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n", + m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); + + if (bad_ioapic(m->mpc_apicaddr)) + return; + + mp_ioapics[nr_ioapics] = *m; + nr_ioapics++; +} + +static void __init MP_intsrc_info(struct mpc_config_intsrc *m) +{ + mp_irqs[mp_irq_entries] = *m; + Dprintk("Int: type %d, pol %d, trig %d, bus %d," + " IRQ %02x, APIC ID %x, APIC INT %02x\n", + m->mpc_irqtype, m->mpc_irqflag & 3, + (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, + m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!!\n"); +} + +#endif + +static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m) +{ + Dprintk("Lint: type %d, pol %d, trig %d, bus %d," + " IRQ %02x, APIC ID %x, APIC LINT %02x\n", 
+ m->mpc_irqtype, m->mpc_irqflag & 3, + (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid, + m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); +} + +#ifdef CONFIG_X86_NUMAQ +static void __init MP_translation_info(struct mpc_config_translation *m) +{ + printk(KERN_INFO + "Translation: record %d, type %d, quad %d, global %d, local %d\n", + mpc_record, m->trans_type, m->trans_quad, m->trans_global, + m->trans_local); + + if (mpc_record >= MAX_MPC_ENTRY) + printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n"); + else + translation_table[mpc_record] = m; /* stash this for later */ + if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad)) + node_set_online(m->trans_quad); +} + +/* + * Read/parse the MPC oem tables + */ + +static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, + unsigned short oemsize) +{ + int count = sizeof(*oemtable); /* the header size */ + unsigned char *oemptr = ((unsigned char *)oemtable) + count; + + mpc_record = 0; + printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", + oemtable); + if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) { + printk(KERN_WARNING + "SMP mpc oemtable: bad signature [%c%c%c%c]!\n", + oemtable->oem_signature[0], oemtable->oem_signature[1], + oemtable->oem_signature[2], oemtable->oem_signature[3]); + return; + } + if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) { + printk(KERN_WARNING "SMP oem mptable: checksum error!\n"); + return; + } + while (count < oemtable->oem_length) { + switch (*oemptr) { + case MP_TRANSLATION: + { + struct mpc_config_translation *m = + (struct mpc_config_translation *)oemptr; + MP_translation_info(m); + oemptr += sizeof(*m); + count += sizeof(*m); + ++mpc_record; + break; + } + default: + { + printk(KERN_WARNING + "Unrecognised OEM table entry type! - %d\n", + (int)*oemptr); + return; + } + } + } +} + +static inline void mps_oem_check(struct mp_config_table *mpc, char *oem, + char *productid) +{ + if (strncmp(oem, "IBM NUMA", 8)) + printk("Warning! 
May not be a NUMA-Q system!\n"); + if (mpc->mpc_oemptr) + smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr, + mpc->mpc_oemsize); +} +#endif /* CONFIG_X86_NUMAQ */ + +/* + * Read/parse the MPC + */ + +static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) +{ + char str[16]; + char oem[10]; + int count = sizeof(*mpc); + unsigned char *mpt = ((unsigned char *)mpc) + count; + + if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) { + printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n", + mpc->mpc_signature[0], mpc->mpc_signature[1], + mpc->mpc_signature[2], mpc->mpc_signature[3]); + return 0; + } + if (mpf_checksum((unsigned char *)mpc, mpc->mpc_length)) { + printk(KERN_ERR "MPTABLE: checksum error!\n"); + return 0; + } + if (mpc->mpc_spec != 0x01 && mpc->mpc_spec != 0x04) { + printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n", + mpc->mpc_spec); + return 0; + } + if (!mpc->mpc_lapic) { + printk(KERN_ERR "MPTABLE: null local APIC address!\n"); + return 0; + } + memcpy(oem, mpc->mpc_oem, 8); + oem[8] = 0; + printk(KERN_INFO "MPTABLE: OEM ID: %s ", oem); + + memcpy(str, mpc->mpc_productid, 12); + str[12] = 0; + printk("Product ID: %s ", str); + +#ifdef CONFIG_X86_32 + mps_oem_check(mpc, oem, str); +#endif + printk(KERN_INFO "MPTABLE: Product ID: %s ", str); + +#ifndef CONFIG_XEN + printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic); + + /* save the local APIC address, it might be non-default */ + if (!acpi_lapic) + mp_lapic_addr = mpc->mpc_lapic; +#endif + + if (early) + return 1; + + /* + * Now process the configuration blocks. + */ +#ifdef CONFIG_X86_NUMAQ + mpc_record = 0; +#endif + while (count < mpc->mpc_length) { + switch (*mpt) { + case MP_PROCESSOR: + { + struct mpc_config_processor *m = + (struct mpc_config_processor *)mpt; + /* ACPI may have already provided this data */ + if (!acpi_lapic) + MP_processor_info(m); + mpt += sizeof(*m); + count += sizeof(*m); + break; + } + case MP_BUS: + { + struct mpc_config_bus *m = + (struct mpc_config_bus *)mpt; + MP_bus_info(m); + mpt += sizeof(*m); + count += sizeof(*m); + break; + } + case MP_IOAPIC: + { +#ifdef CONFIG_X86_IO_APIC + struct mpc_config_ioapic *m = + (struct mpc_config_ioapic *)mpt; + MP_ioapic_info(m); +#endif + mpt += sizeof(struct mpc_config_ioapic); + count += sizeof(struct mpc_config_ioapic); + break; + } + case MP_INTSRC: + { +#ifdef CONFIG_X86_IO_APIC + struct mpc_config_intsrc *m = + (struct mpc_config_intsrc *)mpt; + + MP_intsrc_info(m); +#endif + mpt += sizeof(struct mpc_config_intsrc); + count += sizeof(struct mpc_config_intsrc); + break; + } + case MP_LINTSRC: + { + struct mpc_config_lintsrc *m = + (struct mpc_config_lintsrc *)mpt; + MP_lintsrc_info(m); + mpt += sizeof(*m); + count += sizeof(*m); + break; + } + default: + /* wrong mptable */ + printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"); + printk(KERN_ERR "type %x\n", *mpt); + print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16, + 1, mpc, mpc->mpc_length, 1); + count = mpc->mpc_length; + break; + } +#ifdef CONFIG_X86_NUMAQ + ++mpc_record; +#endif + } + setup_apic_routing(); + if (!num_processors) + printk(KERN_ERR "MPTABLE: no processors registered!\n"); + return num_processors; +} + +#ifdef CONFIG_X86_IO_APIC + +static int __init ELCR_trigger(unsigned int irq) +{ + unsigned int port; + + port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; +} + +static void __init construct_default_ioirq_mptable(int mpc_default_type) +{ + struct mpc_config_intsrc intsrc; + int i; + int ELCR_fallback = 0; 
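
smp_read_mpc() above gates all parsing on three cheap header checks: a four-byte signature, a byte-sum checksum over mpc_length that must come out zero mod 256, and a known spec revision (0x01 or 0x04). A self-contained sketch of that validation pipeline; the trimmed struct keeps only what the checks touch and is not the real mp_config_table layout:

#include <stddef.h>
#include <stdio.h>
#include <string.h>

struct mp_table_hdr {
	char		signature[4];	/* "PCMP" (MPC_SIGNATURE) */
	unsigned short	length;		/* bytes covered by the checksum */
	unsigned char	spec;		/* 0x01 or 0x04 */
	unsigned char	checksum;	/* makes the byte sum 0 mod 256 */
};

static int checksum_ok(const unsigned char *p, int len)
{
	unsigned sum = 0;

	while (len--)
		sum += *p++;
	return (sum & 0xFF) == 0;
}

static int table_ok(const struct mp_table_hdr *h)
{
	if (memcmp(h->signature, "PCMP", 4))
		return 0;	/* bad signature */
	if (!checksum_ok((const unsigned char *)h, h->length))
		return 0;	/* checksum error */
	if (h->spec != 0x01 && h->spec != 0x04)
		return 0;	/* unknown table version */
	return 1;
}

int main(void)
{
	struct mp_table_hdr h = { { 'P', 'C', 'M', 'P' }, sizeof(h), 0x04, 0 };
	const unsigned char *p = (const unsigned char *)&h;
	unsigned sum = 0;

	for (size_t i = 0; i < sizeof(h); i++)
		sum += p[i];
	h.checksum = (unsigned char)(0x100 - (sum & 0xFF));	/* fix up */
	printf("valid: %d\n", table_ok(&h));	/* prints "valid: 1" */
	return 0;
}
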
+ + intsrc.mpc_type = MP_INTSRC; + intsrc.mpc_irqflag = 0; /* conforming */ + intsrc.mpc_srcbus = 0; + intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid; + + intsrc.mpc_irqtype = mp_INT; + + /* + * If true, we have an ISA/PCI system with no IRQ entries + * in the MP table. To prevent the PCI interrupts from being set up + * incorrectly, we try to use the ELCR. The sanity check to see if + * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can + * never be level sensitive, so we simply see if the ELCR agrees. + * If it does, we assume it's valid. + */ + if (mpc_default_type == 5) { + printk(KERN_INFO "ISA/PCI bus type with no IRQ information... " + "falling back to ELCR\n"); + + if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || + ELCR_trigger(13)) + printk(KERN_ERR "ELCR contains invalid data... " + "not using ELCR\n"); + else { + printk(KERN_INFO + "Using ELCR to identify PCI interrupts\n"); + ELCR_fallback = 1; + } + } + + for (i = 0; i < 16; i++) { + switch (mpc_default_type) { + case 2: + if (i == 0 || i == 13) + continue; /* IRQ0 & IRQ13 not connected */ + /* fall through */ + default: + if (i == 2) + continue; /* IRQ2 is never connected */ + } + + if (ELCR_fallback) { + /* + * If the ELCR indicates a level-sensitive interrupt, we + * copy that information over to the MP table in the + * irqflag field (level sensitive, active high polarity). + */ + if (ELCR_trigger(i)) + intsrc.mpc_irqflag = 13; + else + intsrc.mpc_irqflag = 0; + } + + intsrc.mpc_srcbusirq = i; + intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ + MP_intsrc_info(&intsrc); + } + + intsrc.mpc_irqtype = mp_ExtINT; + intsrc.mpc_srcbusirq = 0; + intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */ + MP_intsrc_info(&intsrc); +} + +#endif + +static inline void __init construct_default_ISA_mptable(int mpc_default_type) +{ + struct mpc_config_processor processor; + struct mpc_config_bus bus; +#ifdef CONFIG_X86_IO_APIC + struct mpc_config_ioapic ioapic; +#endif + struct mpc_config_lintsrc lintsrc; + int linttypes[2] = { mp_ExtINT, mp_NMI }; + int i; + +#ifndef CONFIG_XEN + /* + * local APIC has default address + */ + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; +#endif + + /* + * 2 CPUs, numbered 0 & 1. + */ + processor.mpc_type = MP_PROCESSOR; + /* Either an integrated APIC or a discrete 82489DX. */ + processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; + processor.mpc_cpuflag = CPU_ENABLED; + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; + processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; + processor.mpc_reserved[0] = 0; + processor.mpc_reserved[1] = 0; + for (i = 0; i < 2; i++) { + processor.mpc_apicid = i; + MP_processor_info(&processor); + } + + bus.mpc_type = MP_BUS; + bus.mpc_busid = 0; + switch (mpc_default_type) { + default: + printk(KERN_ERR "???\nUnknown standard configuration %d\n", + mpc_default_type); + /* fall through */ + case 1: + case 5: + memcpy(bus.mpc_bustype, "ISA ", 6); + break; + case 2: + case 6: + case 3: + memcpy(bus.mpc_bustype, "EISA ", 6); + break; + case 4: + case 7: + memcpy(bus.mpc_bustype, "MCA ", 6); + } + MP_bus_info(&bus); + if (mpc_default_type > 4) { + bus.mpc_busid = 1; + memcpy(bus.mpc_bustype, "PCI ", 6); + MP_bus_info(&bus); + } + +#ifdef CONFIG_X86_IO_APIC + ioapic.mpc_type = MP_IOAPIC; + ioapic.mpc_apicid = 2; + ioapic.mpc_apicver = mpc_default_type > 4 ? 
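
The ELCR fallback above rests on one hardware fact: the EISA edge/level control register pair at ports 0x4d0/0x4d1 holds one bit per ISA IRQ (set means level triggered), and IRQs 0, 1, 2 and 13 can never legitimately be level, which is the entire sanity check. A sketch with the register pair modeled as a latched 16-bit value in place of inb():

#include <stdio.h>

static unsigned elcr_image;	/* latched contents of ports 0x4d0/0x4d1 */

static int elcr_trigger(unsigned irq)
{
	return (elcr_image >> irq) & 1;	/* 1 = level, 0 = edge */
}

/* timer, keyboard, cascade and FPU interrupts can never be level */
static int elcr_sane(void)
{
	return !(elcr_trigger(0) || elcr_trigger(1) ||
		 elcr_trigger(2) || elcr_trigger(13));
}

int main(void)
{
	elcr_image = 1u << 9 | 1u << 11;	/* two level (PCI-style) IRQs */
	printf("ELCR usable: %d, IRQ9 level: %d\n",
	       elcr_sane(), elcr_trigger(9));
	return 0;
}
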
0x10 : 0x01; + ioapic.mpc_flags = MPC_APIC_USABLE; + ioapic.mpc_apicaddr = 0xFEC00000; + MP_ioapic_info(&ioapic); + + /* + * We set up most of the low 16 IO-APIC pins according to MPS rules. + */ + construct_default_ioirq_mptable(mpc_default_type); +#endif + lintsrc.mpc_type = MP_LINTSRC; + lintsrc.mpc_irqflag = 0; /* conforming */ + lintsrc.mpc_srcbusid = 0; + lintsrc.mpc_srcbusirq = 0; + lintsrc.mpc_destapic = MP_APIC_ALL; + for (i = 0; i < 2; i++) { + lintsrc.mpc_irqtype = linttypes[i]; + lintsrc.mpc_destapiclint = i; + MP_lintsrc_info(&lintsrc); + } +} + +static struct intel_mp_floating *mpf_found; + +/* + * Scan the memory blocks for an SMP configuration block. + */ +#ifndef CONFIG_XEN +static void __init __get_smp_config(unsigned early) +#else +void __init get_smp_config(void) +#define early 0 +#endif +{ + struct intel_mp_floating *mpf = mpf_found; + + if (acpi_lapic && early) + return; + /* + * ACPI supports both logical (e.g. Hyper-Threading) and physical + * processors, where MPS only supports physical. + */ + if (acpi_lapic && acpi_ioapic) { + printk(KERN_INFO "Using ACPI (MADT) for SMP configuration " + "information\n"); + return; + } else if (acpi_lapic) + printk(KERN_INFO "Using ACPI for processor (LAPIC) " + "configuration information\n"); + + printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", + mpf->mpf_specification); +#ifdef CONFIG_X86_32 + if (mpf->mpf_feature2 & (1 << 7)) { + printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); + pic_mode = 1; + } else { + printk(KERN_INFO " Virtual Wire compatibility mode.\n"); + pic_mode = 0; + } +#endif + /* + * Now see if we need to read further. + */ + if (mpf->mpf_feature1 != 0) { +#ifndef CONFIG_XEN + if (early) { + /* + * local APIC has default address + */ + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + return; + } +#endif + + printk(KERN_INFO "Default MP configuration #%d\n", + mpf->mpf_feature1); + construct_default_ISA_mptable(mpf->mpf_feature1); + + } else if (mpf->mpf_physptr) { + + /* + * Read the physical hardware table. Anything here will + * override the defaults. + */ + if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr), early)) { + smp_found_config = 0; + printk(KERN_ERR + "BIOS bug, MP table errors detected!...\n"); + printk(KERN_ERR "... disabling SMP support. " + "(tell your hw vendor)\n"); + return; + } + + if (early) + return; +#ifdef CONFIG_X86_IO_APIC + /* + * If there are no explicit MP IRQ entries, then we are + * broken. We set up most of the low 16 IO-APIC pins to + * ISA defaults and hope it will work. + */ + if (!mp_irq_entries) { + struct mpc_config_bus bus; + + printk(KERN_ERR "BIOS bug, no explicit IRQ entries, " + "using default mptable. " + "(tell your hw vendor)\n"); + + bus.mpc_type = MP_BUS; + bus.mpc_busid = 0; + memcpy(bus.mpc_bustype, "ISA ", 6); + MP_bus_info(&bus); + + construct_default_ioirq_mptable(0); + } +#endif + } else + BUG(); + + if (!early) + printk(KERN_INFO "Processors: %d\n", num_processors); + /* + * Only use the first configuration found. 
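
__get_smp_config() above encodes a strict precedence: a full ACPI MADT wins outright, otherwise a nonzero mpf_feature1 selects one of the canned default configurations, otherwise mpf_physptr points at a real table to read, and a floating pointer with neither is a BUG(). The same decision order as a pure function; the early-boot and Xen wrinkles are deliberately dropped:

#include <stdio.h>
#include <stdlib.h>

enum cfg_source { CFG_ACPI_MADT, CFG_DEFAULT_MPTABLE, CFG_MPC_TABLE };

static enum cfg_source pick_source(int acpi_lapic, int acpi_ioapic,
				   unsigned feature1, unsigned long physptr)
{
	if (acpi_lapic && acpi_ioapic)
		return CFG_ACPI_MADT;		/* MADT supersedes MPS */
	if (feature1)
		return CFG_DEFAULT_MPTABLE;	/* canned ISA configuration */
	if (physptr)
		return CFG_MPC_TABLE;		/* read the hardware table */
	abort();	/* neither: the kernel BUG()s here */
}

int main(void)
{
	printf("%d\n", pick_source(0, 0, 5, 0));	/* default config #5 */
	return 0;
}
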
+ */ +#undef early +} + +#ifndef CONFIG_XEN +void __init early_get_smp_config(void) +{ + __get_smp_config(1); +} + +void __init get_smp_config(void) +{ + __get_smp_config(0); +} +#endif + +static int __init smp_scan_config(unsigned long base, unsigned long length, + unsigned reserve) +{ + unsigned int *bp = isa_bus_to_virt(base); + struct intel_mp_floating *mpf; + + Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length); + BUILD_BUG_ON(sizeof(*mpf) != 16); + + while (length > 0) { + mpf = (struct intel_mp_floating *)bp; + if ((*bp == SMP_MAGIC_IDENT) && + (mpf->mpf_length == 1) && + !mpf_checksum((unsigned char *)bp, 16) && + ((mpf->mpf_specification == 1) + || (mpf->mpf_specification == 4))) { + + smp_found_config = 1; + mpf_found = mpf; +#ifdef CONFIG_X86_32 +#ifndef CONFIG_XEN + printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n", + mpf, virt_to_phys(mpf)); + reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE, + BOOTMEM_DEFAULT); + if (mpf->mpf_physptr) { + /* + * We cannot access to MPC table to compute + * table size yet, as only few megabytes from + * the bottom is mapped now. + * PC-9800's MPC table places on the very last + * of physical memory; so that simply reserving + * PAGE_SIZE from mpg->mpf_physptr yields BUG() + * in reserve_bootmem. + */ + unsigned long size = PAGE_SIZE; + unsigned long end = max_low_pfn * PAGE_SIZE; + if (mpf->mpf_physptr + size > end) + size = end - mpf->mpf_physptr; + reserve_bootmem(mpf->mpf_physptr, size, + BOOTMEM_DEFAULT); + } +#else + printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n", + mpf, ((void *)bp - isa_bus_to_virt(base)) + base); +#endif +#elif !defined(CONFIG_XEN) + if (!reserve) + return 1; + + reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE); + if (mpf->mpf_physptr) + reserve_bootmem_generic(mpf->mpf_physptr, + PAGE_SIZE); +#endif + return 1; + } + bp += 4; + length -= 16; + } + return 0; +} + +static void __init __find_smp_config(unsigned reserve) +{ +#ifndef CONFIG_XEN + unsigned int address; +#endif + + /* + * FIXME: Linux assumes you have 640K of base ram.. + * this continues the error... + * + * 1) Scan the bottom 1K for a signature + * 2) Scan the top 1K of base RAM + * 3) Scan the 64K of bios + */ + if (smp_scan_config(0x0, 0x400, reserve) || + smp_scan_config(639 * 0x400, 0x400, reserve) || + smp_scan_config(0xF0000, 0x10000, reserve)) + return; + /* + * If it is an SMP machine we should know now, unless the + * configuration is in an EISA/MCA bus machine with an + * extended bios data area. + * + * there is a real-mode segmented pointer pointing to the + * 4K EBDA area at 0x40E, calculate and scan it here. + * + * NOTE! There are Linux loaders that will corrupt the EBDA + * area, and as such this kind of SMP config may be less + * trustworthy, simply because the SMP table may have been + * stomped on during early boot. These loaders are buggy and + * should be fixed. + * + * MP1.4 SPEC states to only scan first 1K of 4K EBDA. 
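
smp_scan_config() above hunts for the MP floating pointer in the classic BIOS areas with four tests on each 16-byte-aligned candidate: the "_MP_" signature, a length field of 1 (one 16-byte unit), a byte sum of zero mod 256, and a spec revision of 1 or 4. A standalone scanner over an in-memory buffer applying the same tests; the byte offsets follow struct intel_mp_floating (length at 8, revision at 9, checksum at 10):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int sum_ok(const uint8_t *p, int len)
{
	unsigned sum = 0;

	while (len--)
		sum += *p++;
	return (sum & 0xFF) == 0;
}

static long scan_mpf(const uint8_t *base, size_t len)
{
	for (size_t off = 0; off + 16 <= len; off += 16) {
		const uint8_t *p = base + off;

		if (!memcmp(p, "_MP_", 4) && p[8] == 1 &&
		    sum_ok(p, 16) && (p[9] == 1 || p[9] == 4))
			return (long)off;
	}
	return -1;
}

int main(void)
{
	uint8_t area[64] = { 0 };
	unsigned sum = 0;

	memcpy(area + 32, "_MP_", 4);
	area[32 + 8] = 1;	/* length: one 16-byte unit */
	area[32 + 9] = 4;	/* MP spec revision 1.4 */
	for (int i = 0; i < 16; i++)
		sum += area[32 + i];
	area[32 + 10] = (uint8_t)(0x100 - (sum & 0xFF));	/* fix checksum */
	printf("found at offset %ld\n", scan_mpf(area, sizeof(area)));
	return 0;
}
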
+ */ + +#ifndef CONFIG_XEN + address = get_bios_ebda(); + if (address) + smp_scan_config(address, 0x400, reserve); +#endif +} + +void __init early_find_smp_config(void) +{ + __find_smp_config(0); +} + +void __init find_smp_config(void) +{ + __find_smp_config(1); +} + +/* -------------------------------------------------------------------------- + ACPI-based MP Configuration + -------------------------------------------------------------------------- */ + +/* + * Keep this outside and initialized to 0, for !CONFIG_ACPI builds: + */ +int es7000_plat; + +#ifdef CONFIG_ACPI + +#ifdef CONFIG_X86_IO_APIC + +#define MP_ISA_BUS 0 + +extern struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS]; + +static int mp_find_ioapic(int gsi) +{ + int i = 0; + + /* Find the IOAPIC that manages this GSI. */ + for (i = 0; i < nr_ioapics; i++) { + if ((gsi >= mp_ioapic_routing[i].gsi_base) + && (gsi <= mp_ioapic_routing[i].gsi_end)) + return i; + } + + printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); + return -1; +} + +static u8 __init uniq_ioapic_id(u8 id) +{ +#ifdef CONFIG_X86_32 +#ifndef CONFIG_XEN + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && + !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + return io_apic_get_unique_id(nr_ioapics, id); + else +#endif + return id; +#else + int i; + DECLARE_BITMAP(used, 256); + bitmap_zero(used, 256); + for (i = 0; i < nr_ioapics; i++) { + struct mpc_config_ioapic *ia = &mp_ioapics[i]; + __set_bit(ia->mpc_apicid, used); + } + if (!test_bit(id, used)) + return id; + return find_first_zero_bit(used, 256); +#endif +} + +void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) +{ + int idx = 0; + + if (bad_ioapic(address)) + return; + + idx = nr_ioapics; + + mp_ioapics[idx].mpc_type = MP_IOAPIC; + mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; + mp_ioapics[idx].mpc_apicaddr = address; + +#ifndef CONFIG_XEN + set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); +#endif + mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id); +#ifdef CONFIG_X86_32 + mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); +#else + mp_ioapics[idx].mpc_apicver = 0; +#endif + /* + * Build basic GSI lookup table to facilitate gsi->io_apic lookups + * and to prevent reprogramming of IOAPIC pins (PCI GSIs). + */ + mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; + mp_ioapic_routing[idx].gsi_base = gsi_base; + mp_ioapic_routing[idx].gsi_end = gsi_base + + io_apic_get_redir_entries(idx); + + printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " + "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, + mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, + mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end); + + nr_ioapics++; +} + +void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) +{ + struct mpc_config_intsrc intsrc; + int ioapic = -1; + int pin = -1; + + /* + * Convert 'gsi' to 'ioapic.pin'. + */ + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + return; + pin = gsi - mp_ioapic_routing[ioapic].gsi_base; + + /* + * TBD: This check is for faulty timer entries, where the override + * erroneously sets the trigger to level, resulting in a HUGE + * increase of timer interrupts! 
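
uniq_ioapic_id() above (the 64-bit branch) records every APIC id already claimed in a 256-bit map, keeps the requested id when it is still free, and otherwise falls back to the first unused one. The same idea with a byte-per-id table in place of DECLARE_BITMAP()/find_first_zero_bit():

#include <stdio.h>

#define NR_IDS 256

static int uniq_id(int id, const int *taken, int n)
{
	unsigned char used[NR_IDS] = { 0 };

	for (int i = 0; i < n; i++)
		used[taken[i]] = 1;
	if (!used[id])
		return id;	/* requested id is free: keep it */
	for (int i = 0; i < NR_IDS; i++)
		if (!used[i])
			return i;	/* first free, as find_first_zero_bit() */
	return -1;	/* all 256 ids taken */
}

int main(void)
{
	int taken[] = { 0, 1, 2 };

	printf("%d %d\n", uniq_id(5, taken, 3), uniq_id(2, taken, 3));
	return 0;	/* prints "5 3" */
}
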
+ */ + if ((bus_irq == 0) && (trigger == 3)) + trigger = 1; + + intsrc.mpc_type = MP_INTSRC; + intsrc.mpc_irqtype = mp_INT; + intsrc.mpc_irqflag = (trigger << 2) | polarity; + intsrc.mpc_srcbus = MP_ISA_BUS; + intsrc.mpc_srcbusirq = bus_irq; /* IRQ */ + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ + intsrc.mpc_dstirq = pin; /* INTIN# */ + + MP_intsrc_info(&intsrc); +} + +void __init mp_config_acpi_legacy_irqs(void) +{ + struct mpc_config_intsrc intsrc; + int i = 0; + int ioapic = -1; + +#if defined (CONFIG_MCA) || defined (CONFIG_EISA) + /* + * Fabricate the legacy ISA bus (bus #31). + */ + mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; +#endif + set_bit(MP_ISA_BUS, mp_bus_not_pci); + Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); + + /* + * Older generations of ES7000 have no legacy identity mappings + */ + if (es7000_plat == 1) + return; + + /* + * Locate the IOAPIC that manages the ISA IRQs (0-15). + */ + ioapic = mp_find_ioapic(0); + if (ioapic < 0) + return; + + intsrc.mpc_type = MP_INTSRC; + intsrc.mpc_irqflag = 0; /* Conforming */ + intsrc.mpc_srcbus = MP_ISA_BUS; +#ifdef CONFIG_X86_IO_APIC + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; +#endif + /* + * Use the default configuration for the IRQs 0-15. Unless + * overridden by (MADT) interrupt source override entries. + */ + for (i = 0; i < 16; i++) { + int idx; + + for (idx = 0; idx < mp_irq_entries; idx++) { + struct mpc_config_intsrc *irq = mp_irqs + idx; + + /* Do we already have a mapping for this ISA IRQ? */ + if (irq->mpc_srcbus == MP_ISA_BUS + && irq->mpc_srcbusirq == i) + break; + + /* Do we already have a mapping for this IOAPIC pin */ + if ((irq->mpc_dstapic == intsrc.mpc_dstapic) && + (irq->mpc_dstirq == i)) + break; + } + + if (idx != mp_irq_entries) { + printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); + continue; /* IRQ already used */ + } + + intsrc.mpc_irqtype = mp_INT; + intsrc.mpc_srcbusirq = i; /* Identity mapped */ + intsrc.mpc_dstirq = i; + + MP_intsrc_info(&intsrc); + } +} + +int mp_register_gsi(u32 gsi, int triggering, int polarity) +{ + int ioapic; + int ioapic_pin; +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN) +#define MAX_GSI_NUM 4096 +#define IRQ_COMPRESSION_START 64 + + static int pci_irq = IRQ_COMPRESSION_START; + /* + * Mapping between Global System Interrupts, which + * represent all possible interrupts, and IRQs + * assigned to actual devices. + */ + static int gsi_to_irq[MAX_GSI_NUM]; +#else + + if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) + return gsi; +#endif + + /* Don't set up the ACPI SCI because it's already set up */ + if (acpi_gbl_FADT.sci_interrupt == gsi) + return gsi; + + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) { + printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi); + return gsi; + } + + ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base; + +#ifndef CONFIG_X86_32 + if (ioapic_renumber_irq) + gsi = ioapic_renumber_irq(ioapic, gsi); +#endif + + /* + * Avoid pin reprogramming. PRTs typically include entries + * with redundant pin->gsi mappings (but unique PCI devices); + * we only program the IOAPIC on the first. 
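
The 32-bit !XEN branch of mp_register_gsi() that follows compresses sparse level-triggered GSIs at or above 64 into a dense IRQ space: each new GSI takes the next free pci_irq, stepping over the ACPI SCI, and the mapping is remembered in gsi_to_irq[] so repeat registrations resolve to the same number. A reduced model of the allocator; the table sizes and the SCI value (placed inside the window purely so the skip is exercised) are illustrative:

#include <stdio.h>

#define MAX_GSI		4096
#define IRQ_START	64	/* compression begins here */

static int gsi_to_irq[MAX_GSI];
static int next_irq = IRQ_START;
static int sci_irq = 65;	/* illustrative; real SCIs are usually low GSIs */

static int register_gsi(int gsi, int level)
{
	if (gsi < IRQ_START || !level)
		return gsi;	/* low or edge GSIs keep their own number */
	if (gsi >= MAX_GSI)
		return gsi;	/* "GSI too high": left uncompressed */

	int irq = next_irq++;
	if (irq == sci_irq)
		irq = next_irq++;	/* never hand out the ACPI SCI */
	gsi_to_irq[gsi] = irq;
	return irq;
}

int main(void)
{
	printf("%d %d %d\n",
	       register_gsi(70, 1),	/* 64 */
	       register_gsi(500, 1),	/* 66: 65 is the SCI, skipped */
	       register_gsi(17, 1));	/* 17: below the window */
	return 0;
}
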
+ */ + if (ioapic_pin > MP_MAX_IOAPIC_PIN) { + printk(KERN_ERR "Invalid reference to IOAPIC pin " + "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, + ioapic_pin); + return gsi; + } + if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) { + Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", + mp_ioapic_routing[ioapic].apic_id, ioapic_pin); +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN) + return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]); +#else + return gsi; +#endif + } + + set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed); +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN) + /* + * For GSI >= 64, use IRQ compression + */ + if ((gsi >= IRQ_COMPRESSION_START) + && (triggering == ACPI_LEVEL_SENSITIVE)) { + /* + * For PCI devices assign IRQs in order, avoiding gaps + * due to unused I/O APIC pins. + */ + int irq = gsi; + if (gsi < MAX_GSI_NUM) { + /* + * Retain the VIA chipset work-around (gsi > 15), but + * avoid a problem where the 8254 timer (IRQ0) is setup + * via an override (so it's not on pin 0 of the ioapic), + * and at the same time, the pin 0 interrupt is a PCI + * type. The gsi > 15 test could cause these two pins + * to be shared as IRQ0, and they are not shareable. + * So test for this condition, and if necessary, avoid + * the pin collision. + */ + gsi = pci_irq++; + /* + * Don't assign IRQ used by ACPI SCI + */ + if (gsi == acpi_gbl_FADT.sci_interrupt) + gsi = pci_irq++; + gsi_to_irq[irq] = gsi; + } else { + printk(KERN_ERR "GSI %u is too high\n", gsi); + return gsi; + } + } +#endif + io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, + triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, + polarity == ACPI_ACTIVE_HIGH ? 0 : 1); + return gsi; +} + +#endif /* CONFIG_X86_IO_APIC */ +#endif /* CONFIG_ACPI */ --- head-2011-03-11.orig/arch/x86/kernel/mpparse_32-xen.c 2011-01-31 18:01:51.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,1163 +0,0 @@ -/* - * Intel Multiprocessor Specification 1.1 and 1.4 - * compliant MP-table parsing routines. - * - * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> - * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com> - * - * Fixes - * Erich Boleyn : MP v1.4 and additional changes. - * Alan Cox : Added EBDA scanning - * Ingo Molnar : various cleanups and rewrites - * Maciej W. Rozycki: Bits for default MP configurations - * Paul Diefenbaugh: Added full ACPI support - */ - -#include <linux/mm.h> -#include <linux/init.h> -#include <linux/acpi.h> -#include <linux/delay.h> -#include <linux/bootmem.h> -#include <linux/kernel_stat.h> -#include <linux/mc146818rtc.h> -#include <linux/bitops.h> - -#include <asm/smp.h> -#include <asm/acpi.h> -#include <asm/mtrr.h> -#include <asm/mpspec.h> -#include <asm/io_apic.h> - -#include <mach_apic.h> -#include <mach_apicdef.h> -#include <mach_mpparse.h> -#include <bios_ebda.h> - -/* Have we found an MP table */ -int smp_found_config; -unsigned int __cpuinitdata maxcpus = NR_CPUS; - -/* - * Various Linux-internal data structures created from the - * MP-table. - */ -int apic_version [MAX_APICS]; -int mp_bus_id_to_type [MAX_MP_BUSSES]; -int mp_bus_id_to_node [MAX_MP_BUSSES]; -int mp_bus_id_to_local [MAX_MP_BUSSES]; -int quad_local_to_mp_bus_id [NR_CPUS/4][4]; -int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... 
MAX_MP_BUSSES-1] = -1 }; -static int mp_current_pci_id; - -/* I/O APIC entries */ -struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; - -/* # of MP IRQ source entries */ -struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; - -/* MP IRQ source entries */ -int mp_irq_entries; - -int nr_ioapics; - -int pic_mode; -unsigned long mp_lapic_addr; - -unsigned int def_to_bigsmp = 0; - -#ifndef CONFIG_XEN -/* Processor that is doing the boot up */ -unsigned int boot_cpu_physical_apicid = -1U; -#endif -/* Internal processor count */ -unsigned int num_processors; - -/* Bitmask of physically existing CPUs */ -physid_mask_t phys_cpu_present_map; - -u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; - -/* - * Intel MP BIOS table parsing routines: - */ - - -/* - * Checksum an MP configuration block. - */ - -static int __init mpf_checksum(unsigned char *mp, int len) -{ - int sum = 0; - - while (len--) - sum += *mp++; - - return sum & 0xFF; -} - -/* - * Have to match translation table entries to main table entries by counter - * hence the mpc_record variable .... can't see a less disgusting way of - * doing this .... - */ - -static int mpc_record; -static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __cpuinitdata; - -#ifndef CONFIG_XEN -static void __cpuinit MP_processor_info (struct mpc_config_processor *m) -{ - int ver, apicid; - physid_mask_t phys_cpu; - - if (!(m->mpc_cpuflag & CPU_ENABLED)) - return; - - apicid = mpc_apic_id(m, translation_table[mpc_record]); - - if (m->mpc_featureflag&(1<<0)) - Dprintk(" Floating point unit present.\n"); - if (m->mpc_featureflag&(1<<7)) - Dprintk(" Machine Exception supported.\n"); - if (m->mpc_featureflag&(1<<8)) - Dprintk(" 64 bit compare & exchange supported.\n"); - if (m->mpc_featureflag&(1<<9)) - Dprintk(" Internal APIC present.\n"); - if (m->mpc_featureflag&(1<<11)) - Dprintk(" SEP present.\n"); - if (m->mpc_featureflag&(1<<12)) - Dprintk(" MTRR present.\n"); - if (m->mpc_featureflag&(1<<13)) - Dprintk(" PGE present.\n"); - if (m->mpc_featureflag&(1<<14)) - Dprintk(" MCA present.\n"); - if (m->mpc_featureflag&(1<<15)) - Dprintk(" CMOV present.\n"); - if (m->mpc_featureflag&(1<<16)) - Dprintk(" PAT present.\n"); - if (m->mpc_featureflag&(1<<17)) - Dprintk(" PSE present.\n"); - if (m->mpc_featureflag&(1<<18)) - Dprintk(" PSN present.\n"); - if (m->mpc_featureflag&(1<<19)) - Dprintk(" Cache Line Flush Instruction present.\n"); - /* 20 Reserved */ - if (m->mpc_featureflag&(1<<21)) - Dprintk(" Debug Trace and EMON Store present.\n"); - if (m->mpc_featureflag&(1<<22)) - Dprintk(" ACPI Thermal Throttle Registers present.\n"); - if (m->mpc_featureflag&(1<<23)) - Dprintk(" MMX present.\n"); - if (m->mpc_featureflag&(1<<24)) - Dprintk(" FXSR present.\n"); - if (m->mpc_featureflag&(1<<25)) - Dprintk(" XMM present.\n"); - if (m->mpc_featureflag&(1<<26)) - Dprintk(" Willamette New Instructions present.\n"); - if (m->mpc_featureflag&(1<<27)) - Dprintk(" Self Snoop present.\n"); - if (m->mpc_featureflag&(1<<28)) - Dprintk(" HT present.\n"); - if (m->mpc_featureflag&(1<<29)) - Dprintk(" Thermal Monitor present.\n"); - /* 30, 31 Reserved */ - - - if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { - Dprintk(" Bootup CPU\n"); - boot_cpu_physical_apicid = m->mpc_apicid; - } - - ver = m->mpc_apicver; - - /* - * Validate version - */ - if (ver == 0x0) { - printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! " - "fixing up to 0x10. 
(tell your hw vendor)\n", - m->mpc_apicid); - ver = 0x10; - } - apic_version[m->mpc_apicid] = ver; - - phys_cpu = apicid_to_cpu_present(apicid); - physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu); - - if (num_processors >= NR_CPUS) { - printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." - " Processor ignored.\n", NR_CPUS); - return; - } - - if (num_processors >= maxcpus) { - printk(KERN_WARNING "WARNING: maxcpus limit of %i reached." - " Processor ignored.\n", maxcpus); - return; - } - - cpu_set(num_processors, cpu_possible_map); - num_processors++; - - /* - * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y - * but we need to work other dependencies like SMP_SUSPEND etc - * before this can be done without some confusion. - * if (CPU_HOTPLUG_ENABLED || num_processors > 8) - * - Ashok Raj <ashok.raj@intel.com> - */ - if (num_processors > 8) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - if (!APIC_XAPIC(ver)) { - def_to_bigsmp = 0; - break; - } - /* If P4 and above fall through */ - case X86_VENDOR_AMD: - def_to_bigsmp = 1; - } - } - bios_cpu_apicid[num_processors - 1] = m->mpc_apicid; -} -#else -static void __cpuinit MP_processor_info (struct mpc_config_processor *m) -{ - num_processors++; -} -#endif /* CONFIG_XEN */ - -static void __init MP_bus_info (struct mpc_config_bus *m) -{ - char str[7]; - - memcpy(str, m->mpc_bustype, 6); - str[6] = 0; - - mpc_oem_bus_info(m, str, translation_table[mpc_record]); - -#if MAX_MP_BUSSES < 256 - if (m->mpc_busid >= MAX_MP_BUSSES) { - printk(KERN_WARNING "MP table busid value (%d) for bustype %s " - " is too large, max. supported is %d\n", - m->mpc_busid, str, MAX_MP_BUSSES - 1); - return; - } -#endif - - if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; - } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA; - } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) { - mpc_oem_pci_bus(m, translation_table[mpc_record]); - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; - mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; - mp_current_pci_id++; - } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; - } else { - printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); - } -} - -static void __init MP_ioapic_info (struct mpc_config_ioapic *m) -{ - if (!(m->mpc_flags & MPC_APIC_USABLE)) - return; - - printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n", - m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); - if (nr_ioapics >= MAX_IO_APICS) { - printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n", - MAX_IO_APICS, nr_ioapics); - panic("Recompile kernel with bigger MAX_IO_APICS!.\n"); - } - if (!m->mpc_apicaddr) { - printk(KERN_ERR "WARNING: bogus zero I/O APIC address" - " found in MP table, skipping!\n"); - return; - } - mp_ioapics[nr_ioapics] = *m; - nr_ioapics++; -} - -static void __init MP_intsrc_info (struct mpc_config_intsrc *m) -{ - mp_irqs [mp_irq_entries] = *m; - Dprintk("Int: type %d, pol %d, trig %d, bus %d," - " IRQ %02x, APIC ID %x, APIC INT %02x\n", - m->mpc_irqtype, m->mpc_irqflag & 3, - (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, - m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); - if (++mp_irq_entries == MAX_IRQ_SOURCES) - panic("Max # of irq sources exceeded!!\n"); -} - -static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m) -{ - Dprintk("Lint: type %d, pol %d, 
trig %d, bus %d," - " IRQ %02x, APIC ID %x, APIC LINT %02x\n", - m->mpc_irqtype, m->mpc_irqflag & 3, - (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, - m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); -} - -#ifdef CONFIG_X86_NUMAQ -static void __init MP_translation_info (struct mpc_config_translation *m) -{ - printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local); - - if (mpc_record >= MAX_MPC_ENTRY) - printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n"); - else - translation_table[mpc_record] = m; /* stash this for later */ - if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad)) - node_set_online(m->trans_quad); -} - -/* - * Read/parse the MPC oem tables - */ - -static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, \ - unsigned short oemsize) -{ - int count = sizeof (*oemtable); /* the header size */ - unsigned char *oemptr = ((unsigned char *)oemtable)+count; - - mpc_record = 0; - printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", oemtable); - if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4)) - { - printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n", - oemtable->oem_signature[0], - oemtable->oem_signature[1], - oemtable->oem_signature[2], - oemtable->oem_signature[3]); - return; - } - if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length)) - { - printk(KERN_WARNING "SMP oem mptable: checksum error!\n"); - return; - } - while (count < oemtable->oem_length) { - switch (*oemptr) { - case MP_TRANSLATION: - { - struct mpc_config_translation *m= - (struct mpc_config_translation *)oemptr; - MP_translation_info(m); - oemptr += sizeof(*m); - count += sizeof(*m); - ++mpc_record; - break; - } - default: - { - printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr); - return; - } - } - } -} - -static inline void mps_oem_check(struct mp_config_table *mpc, char *oem, - char *productid) -{ - if (strncmp(oem, "IBM NUMA", 8)) - printk("Warning! May not be a NUMA-Q system!\n"); - if (mpc->mpc_oemptr) - smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr, - mpc->mpc_oemsize); -} -#endif /* CONFIG_X86_NUMAQ */ - -/* - * Read/parse the MPC - */ - -static int __init smp_read_mpc(struct mp_config_table *mpc) -{ - char str[16]; - char oem[10]; - int count=sizeof(*mpc); - unsigned char *mpt=((unsigned char *)mpc)+count; - - if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) { - printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n", - *(u32 *)mpc->mpc_signature); - return 0; - } - if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) { - printk(KERN_ERR "SMP mptable: checksum error!\n"); - return 0; - } - if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) { - printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n", - mpc->mpc_spec); - return 0; - } - if (!mpc->mpc_lapic) { - printk(KERN_ERR "SMP mptable: null local APIC address!\n"); - return 0; - } - memcpy(oem,mpc->mpc_oem,8); - oem[8]=0; - printk(KERN_INFO "OEM ID: %s ",oem); - - memcpy(str,mpc->mpc_productid,12); - str[12]=0; - printk("Product ID: %s ",str); - - mps_oem_check(mpc, oem, str); - - printk("APIC at: 0x%X\n", mpc->mpc_lapic); - - /* - * Save the local APIC address (it might be non-default) -- but only - * if we're not using ACPI. - */ - if (!acpi_lapic) - mp_lapic_addr = mpc->mpc_lapic; - - /* - * Now process the configuration blocks. 
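
The parse loop being removed here, like its replacement earlier in the patch, is a plain tag-length walk: the first byte of each record selects the handler and the distance to advance, and the walk continues until mpc_length is consumed. A generic sketch of that dispatch using the MP-spec record sizes (20 bytes for processor records, 8 for all the others):

#include <stdio.h>

/* record tags, mirroring MP_PROCESSOR, MP_BUS, MP_IOAPIC, ... */
enum { REC_CPU, REC_BUS, REC_IOAPIC, REC_INTSRC, REC_LINT };

static const int rec_size[] = { 20, 8, 8, 8, 8 };

static void walk(const unsigned char *tbl, int len)
{
	int count = 0;

	while (count < len) {
		unsigned type = tbl[count];

		if (type > REC_LINT) {
			/* unknown tag: abandon the rest, as the parsers do */
			fprintf(stderr, "bad record type %u\n", type);
			return;
		}
		printf("record type %u, %d bytes\n", type, rec_size[type]);
		count += rec_size[type];
	}
}

int main(void)
{
	unsigned char tbl[36] = { 0 };	/* one CPU record, then two buses */

	tbl[20] = REC_BUS;
	tbl[28] = REC_BUS;
	walk(tbl, sizeof(tbl));
	return 0;
}
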
-	 */
-	mpc_record = 0;
-	while (count < mpc->mpc_length) {
-		switch(*mpt) {
-			case MP_PROCESSOR:
-			{
-				struct mpc_config_processor *m=
-					(struct mpc_config_processor *)mpt;
-				/* ACPI may have already provided this data */
-				if (!acpi_lapic)
-					MP_processor_info(m);
-				mpt += sizeof(*m);
-				count += sizeof(*m);
-				break;
-			}
-			case MP_BUS:
-			{
-				struct mpc_config_bus *m=
-					(struct mpc_config_bus *)mpt;
-				MP_bus_info(m);
-				mpt += sizeof(*m);
-				count += sizeof(*m);
-				break;
-			}
-			case MP_IOAPIC:
-			{
-				struct mpc_config_ioapic *m=
-					(struct mpc_config_ioapic *)mpt;
-				MP_ioapic_info(m);
-				mpt+=sizeof(*m);
-				count+=sizeof(*m);
-				break;
-			}
-			case MP_INTSRC:
-			{
-				struct mpc_config_intsrc *m=
-					(struct mpc_config_intsrc *)mpt;
-
-				MP_intsrc_info(m);
-				mpt+=sizeof(*m);
-				count+=sizeof(*m);
-				break;
-			}
-			case MP_LINTSRC:
-			{
-				struct mpc_config_lintsrc *m=
-					(struct mpc_config_lintsrc *)mpt;
-				MP_lintsrc_info(m);
-				mpt+=sizeof(*m);
-				count+=sizeof(*m);
-				break;
-			}
-			default:
-			{
-				count = mpc->mpc_length;
-				break;
-			}
-		}
-		++mpc_record;
-	}
-	setup_apic_routing();
-	if (!num_processors)
-		printk(KERN_ERR "SMP mptable: no processors registered!\n");
-	return num_processors;
-}
-
-static int __init ELCR_trigger(unsigned int irq)
-{
-	unsigned int port;
-
-	port = 0x4d0 + (irq >> 3);
-	return (inb(port) >> (irq & 7)) & 1;
-}
-
-static void __init construct_default_ioirq_mptable(int mpc_default_type)
-{
-	struct mpc_config_intsrc intsrc;
-	int i;
-	int ELCR_fallback = 0;
-
-	intsrc.mpc_type = MP_INTSRC;
-	intsrc.mpc_irqflag = 0;			/* conforming */
-	intsrc.mpc_srcbus = 0;
-	intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
-
-	intsrc.mpc_irqtype = mp_INT;
-
-	/*
-	 *  If true, we have an ISA/PCI system with no IRQ entries
-	 *  in the MP table. To prevent the PCI interrupts from being set up
-	 *  incorrectly, we try to use the ELCR. The sanity check to see if
-	 *  there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
-	 *  never be level sensitive, so we simply see if the ELCR agrees.
-	 *  If it does, we assume it's valid.
-	 */
-	if (mpc_default_type == 5) {
-		printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
-
-		if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
-			printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n");
-		else {
-			printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
-			ELCR_fallback = 1;
-		}
-	}
-
-	for (i = 0; i < 16; i++) {
-		switch (mpc_default_type) {
-		case 2:
-			if (i == 0 || i == 13)
-				continue;	/* IRQ0 & IRQ13 not connected */
-			/* fall through */
-		default:
-			if (i == 2)
-				continue;	/* IRQ2 is never connected */
-		}
-
-		if (ELCR_fallback) {
-			/*
-			 *  If the ELCR indicates a level-sensitive interrupt, we
-			 *  copy that information over to the MP table in the
-			 *  irqflag field (level sensitive, active high polarity).
-			 */
-			if (ELCR_trigger(i))
-				intsrc.mpc_irqflag = 13;
-			else
-				intsrc.mpc_irqflag = 0;
-		}
-
-		intsrc.mpc_srcbusirq = i;
-		intsrc.mpc_dstirq = i ? i : 2;	/* IRQ0 to INTIN2 */
-		MP_intsrc_info(&intsrc);
-	}
-
-	intsrc.mpc_irqtype = mp_ExtINT;
-	intsrc.mpc_srcbusirq = 0;
-	intsrc.mpc_dstirq = 0;			/* 8259A to INTIN0 */
-	MP_intsrc_info(&intsrc);
-}
-
-static inline void __init construct_default_ISA_mptable(int mpc_default_type)
-{
-	struct mpc_config_processor processor;
-	struct mpc_config_bus bus;
-	struct mpc_config_ioapic ioapic;
-	struct mpc_config_lintsrc lintsrc;
-	int linttypes[2] = { mp_ExtINT, mp_NMI };
-	int i;
-
-	/*
-	 * local APIC has default address
-	 */
-	mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-
-	/*
-	 * 2 CPUs, numbered 0 & 1.
-	 */
-	processor.mpc_type = MP_PROCESSOR;
-	/* Either an integrated APIC or a discrete 82489DX. */
-	processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
-	processor.mpc_cpuflag = CPU_ENABLED;
-	processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
-				   (boot_cpu_data.x86_model << 4) |
-				   boot_cpu_data.x86_mask;
-	processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
-	processor.mpc_reserved[0] = 0;
-	processor.mpc_reserved[1] = 0;
-	for (i = 0; i < 2; i++) {
-		processor.mpc_apicid = i;
-		MP_processor_info(&processor);
-	}
-
-	bus.mpc_type = MP_BUS;
-	bus.mpc_busid = 0;
-	switch (mpc_default_type) {
-	default:
-		printk("???\n");
-		printk(KERN_ERR "Unknown standard configuration %d\n",
-			mpc_default_type);
-		/* fall through */
-	case 1:
-	case 5:
-		memcpy(bus.mpc_bustype, "ISA   ", 6);
-		break;
-	case 2:
-	case 6:
-	case 3:
-		memcpy(bus.mpc_bustype, "EISA  ", 6);
-		break;
-	case 4:
-	case 7:
-		memcpy(bus.mpc_bustype, "MCA   ", 6);
-	}
-	MP_bus_info(&bus);
-	if (mpc_default_type > 4) {
-		bus.mpc_busid = 1;
-		memcpy(bus.mpc_bustype, "PCI   ", 6);
-		MP_bus_info(&bus);
-	}
-
-	ioapic.mpc_type = MP_IOAPIC;
-	ioapic.mpc_apicid = 2;
-	ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
-	ioapic.mpc_flags = MPC_APIC_USABLE;
-	ioapic.mpc_apicaddr = 0xFEC00000;
-	MP_ioapic_info(&ioapic);
-
-	/*
-	 * We set up most of the low 16 IO-APIC pins according to MPS rules.
-	 */
-	construct_default_ioirq_mptable(mpc_default_type);
-
-	lintsrc.mpc_type = MP_LINTSRC;
-	lintsrc.mpc_irqflag = 0;		/* conforming */
-	lintsrc.mpc_srcbusid = 0;
-	lintsrc.mpc_srcbusirq = 0;
-	lintsrc.mpc_destapic = MP_APIC_ALL;
-	for (i = 0; i < 2; i++) {
-		lintsrc.mpc_irqtype = linttypes[i];
-		lintsrc.mpc_destapiclint = i;
-		MP_lintsrc_info(&lintsrc);
-	}
-}
-
-static struct intel_mp_floating *mpf_found;
-
-/*
- * Scan the memory blocks for an SMP configuration block.
- */
-void __init get_smp_config (void)
-{
-	struct intel_mp_floating *mpf = mpf_found;
-
-	/*
-	 * ACPI supports both logical (e.g. Hyper-Threading) and physical
-	 * processors, where MPS only supports physical.
-	 */
-	if (acpi_lapic && acpi_ioapic) {
-		printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
-		return;
-	}
-	else if (acpi_lapic)
-		printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
-
-	printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
-	if (mpf->mpf_feature2 & (1<<7)) {
-		printk(KERN_INFO "    IMCR and PIC compatibility mode.\n");
-		pic_mode = 1;
-	} else {
-		printk(KERN_INFO "    Virtual Wire compatibility mode.\n");
-		pic_mode = 0;
-	}
-
-	/*
-	 * Now see if we need to read further.
-	 */
-	if (mpf->mpf_feature1 != 0) {
-
-		printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
-		construct_default_ISA_mptable(mpf->mpf_feature1);
-
-	} else if (mpf->mpf_physptr) {
-
-		/*
-		 * Read the physical hardware table.  Anything here will
-		 * override the defaults.
-		 */
-		if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
-			smp_found_config = 0;
-			printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
-			printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
-			return;
-		}
-		/*
-		 * If there are no explicit MP IRQ entries, then we are
-		 * broken.  We set up most of the low 16 IO-APIC pins to
-		 * ISA defaults and hope it will work.
-		 */
-		if (!mp_irq_entries) {
-			struct mpc_config_bus bus;
-
-			printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
-
-			bus.mpc_type = MP_BUS;
-			bus.mpc_busid = 0;
-			memcpy(bus.mpc_bustype, "ISA   ", 6);
-			MP_bus_info(&bus);
-
-			construct_default_ioirq_mptable(0);
-		}
-
-	} else
-		BUG();
-
-	printk(KERN_INFO "Processors: %d\n", num_processors);
-	/*
-	 * Only use the first configuration found.
-	 */
-}
-
-static int __init smp_scan_config (unsigned long base, unsigned long length)
-{
-	unsigned long *bp = isa_bus_to_virt(base);
-	struct intel_mp_floating *mpf;
-
-	printk(KERN_INFO "Scan SMP from %p for %ld bytes.\n", bp,length);
-	if (sizeof(*mpf) != 16)
-		printk("Error: MPF size\n");
-
-	while (length > 0) {
-		mpf = (struct intel_mp_floating *)bp;
-		if ((*bp == SMP_MAGIC_IDENT) &&
-			(mpf->mpf_length == 1) &&
-			!mpf_checksum((unsigned char *)bp, 16) &&
-			((mpf->mpf_specification == 1)
-				|| (mpf->mpf_specification == 4)) ) {
-
-			smp_found_config = 1;
-#ifndef CONFIG_XEN
-			printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
-				mpf, virt_to_phys(mpf));
-			reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
-					BOOTMEM_DEFAULT);
-			if (mpf->mpf_physptr) {
-				/*
-				 * We cannot access to MPC table to compute
-				 * table size yet, as only few megabytes from
-				 * the bottom is mapped now.
-				 * PC-9800's MPC table places on the very last
-				 * of physical memory; so that simply reserving
-				 * PAGE_SIZE from mpg->mpf_physptr yields BUG()
-				 * in reserve_bootmem.
-				 */
-				unsigned long size = PAGE_SIZE;
-				unsigned long end = max_low_pfn * PAGE_SIZE;
-				if (mpf->mpf_physptr + size > end)
-					size = end - mpf->mpf_physptr;
-				reserve_bootmem(mpf->mpf_physptr, size,
-						BOOTMEM_DEFAULT);
-			}
-#else
-			printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
-				mpf, ((void *)bp - isa_bus_to_virt(base)) + base);
-#endif
-
-			mpf_found = mpf;
-			return 1;
-		}
-		bp += 4;
-		length -= 16;
-	}
-	return 0;
-}
-
-void __init find_smp_config (void)
-{
-#ifndef CONFIG_XEN
-	unsigned int address;
-#endif
-
-	/*
-	 * FIXME: Linux assumes you have 640K of base ram..
-	 * this continues the error...
-	 *
-	 * 1) Scan the bottom 1K for a signature
-	 * 2) Scan the top 1K of base RAM
-	 * 3) Scan the 64K of bios
-	 */
-	if (smp_scan_config(0x0,0x400) ||
-		smp_scan_config(639*0x400,0x400) ||
-			smp_scan_config(0xF0000,0x10000))
-		return;
-	/*
-	 * If it is an SMP machine we should know now, unless the
-	 * configuration is in an EISA/MCA bus machine with an
-	 * extended bios data area.
-	 *
-	 * there is a real-mode segmented pointer pointing to the
-	 * 4K EBDA area at 0x40E, calculate and scan it here.
-	 *
-	 * NOTE! There are Linux loaders that will corrupt the EBDA
-	 * area, and as such this kind of SMP config may be less
-	 * trustworthy, simply because the SMP table may have been
-	 * stomped on during early boot. These loaders are buggy and
-	 * should be fixed.
-	 *
-	 * MP1.4 SPEC states to only scan first 1K of 4K EBDA.
-	 */
-
-#ifndef CONFIG_XEN
-	address = get_bios_ebda();
-	if (address)
-		smp_scan_config(address, 0x400);
-#endif
-}
-
-int es7000_plat;
-
-/* --------------------------------------------------------------------------
-                            ACPI-based MP Configuration
-   -------------------------------------------------------------------------- */
-
-#ifdef CONFIG_ACPI
-
-void __init mp_register_lapic_address(u64 address)
-{
-#ifndef CONFIG_XEN
-	mp_lapic_addr = (unsigned long) address;
-
-	set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
-
-	if (boot_cpu_physical_apicid == -1U)
-		boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
-
-	Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
-#endif
-}
-
-void __cpuinit mp_register_lapic (u8 id, u8 enabled)
-{
-	struct mpc_config_processor processor;
-#ifndef CONFIG_XEN
-	int boot_cpu = 0;
-
-	if (MAX_APICS - id <= 0) {
-		printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
-			id, MAX_APICS);
-		return;
-	}
-
-	if (id == boot_cpu_physical_apicid)
-		boot_cpu = 1;
-
-	processor.mpc_type = MP_PROCESSOR;
-	processor.mpc_apicid = id;
-	processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
-	processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
-	processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
-	processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
-		(boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
-	processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
-	processor.mpc_reserved[0] = 0;
-	processor.mpc_reserved[1] = 0;
-#endif
-
-	MP_processor_info(&processor);
-}
-
-#ifdef CONFIG_X86_IO_APIC
-
-#define MP_ISA_BUS		0
-#define MP_MAX_IOAPIC_PIN	127
-
-static struct mp_ioapic_routing {
-	int			apic_id;
-	int			gsi_base;
-	int			gsi_end;
-	u32			pin_programmed[4];
-} mp_ioapic_routing[MAX_IO_APICS];
-
-static int mp_find_ioapic (int gsi)
-{
-	int i = 0;
-
-	/* Find the IOAPIC that manages this GSI. */
-	for (i = 0; i < nr_ioapics; i++) {
-		if ((gsi >= mp_ioapic_routing[i].gsi_base)
-			&& (gsi <= mp_ioapic_routing[i].gsi_end))
-			return i;
-	}
-
-	printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
-
-	return -1;
-}
-
-void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
-{
-	int idx = 0;
-	int tmpid;
-
-	if (nr_ioapics >= MAX_IO_APICS) {
-		printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
-			"(found %d)\n", MAX_IO_APICS, nr_ioapics);
-		panic("Recompile kernel with bigger MAX_IO_APICS!\n");
-	}
-	if (!address) {
-		printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
-			" found in MADT table, skipping!\n");
-		return;
-	}
-
-	idx = nr_ioapics++;
-
-	mp_ioapics[idx].mpc_type = MP_IOAPIC;
-	mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
-	mp_ioapics[idx].mpc_apicaddr = address;
-
-#ifndef CONFIG_XEN
-	set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
-	if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
-		&& !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
-		tmpid = io_apic_get_unique_id(idx, id);
-	else
-#endif
-		tmpid = id;
-	if (tmpid == -1) {
-		nr_ioapics--;
-		return;
-	}
-	mp_ioapics[idx].mpc_apicid = tmpid;
-	mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
-
-	/*
-	 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
-	 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
- */ - mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; - mp_ioapic_routing[idx].gsi_base = gsi_base; - mp_ioapic_routing[idx].gsi_end = gsi_base + - io_apic_get_redir_entries(idx); - - printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " - "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, - mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, - mp_ioapic_routing[idx].gsi_base, - mp_ioapic_routing[idx].gsi_end); -} - -void __init -mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) -{ - struct mpc_config_intsrc intsrc; - int ioapic = -1; - int pin = -1; - - /* - * Convert 'gsi' to 'ioapic.pin'. - */ - ioapic = mp_find_ioapic(gsi); - if (ioapic < 0) - return; - pin = gsi - mp_ioapic_routing[ioapic].gsi_base; - - /* - * TBD: This check is for faulty timer entries, where the override - * erroneously sets the trigger to level, resulting in a HUGE - * increase of timer interrupts! - */ - if ((bus_irq == 0) && (trigger == 3)) - trigger = 1; - - intsrc.mpc_type = MP_INTSRC; - intsrc.mpc_irqtype = mp_INT; - intsrc.mpc_irqflag = (trigger << 2) | polarity; - intsrc.mpc_srcbus = MP_ISA_BUS; - intsrc.mpc_srcbusirq = bus_irq; /* IRQ */ - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ - intsrc.mpc_dstirq = pin; /* INTIN# */ - - Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n", - intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, - (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, - intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq); - - mp_irqs[mp_irq_entries] = intsrc; - if (++mp_irq_entries == MAX_IRQ_SOURCES) - panic("Max # of irq sources exceeded!\n"); -} - -void __init mp_config_acpi_legacy_irqs (void) -{ - struct mpc_config_intsrc intsrc; - int i = 0; - int ioapic = -1; - - /* - * Fabricate the legacy ISA bus (bus #31). - */ - mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; - Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); - - /* - * Older generations of ES7000 have no legacy identity mappings - */ - if (es7000_plat == 1) - return; - - /* - * Locate the IOAPIC that manages the ISA IRQs (0-15). - */ - ioapic = mp_find_ioapic(0); - if (ioapic < 0) - return; - - intsrc.mpc_type = MP_INTSRC; - intsrc.mpc_irqflag = 0; /* Conforming */ - intsrc.mpc_srcbus = MP_ISA_BUS; - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; - - /* - * Use the default configuration for the IRQs 0-15. Unless - * overridden by (MADT) interrupt source override entries. - */ - for (i = 0; i < 16; i++) { - int idx; - - for (idx = 0; idx < mp_irq_entries; idx++) { - struct mpc_config_intsrc *irq = mp_irqs + idx; - - /* Do we already have a mapping for this ISA IRQ? 
*/ - if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i) - break; - - /* Do we already have a mapping for this IOAPIC pin */ - if ((irq->mpc_dstapic == intsrc.mpc_dstapic) && - (irq->mpc_dstirq == i)) - break; - } - - if (idx != mp_irq_entries) { - printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); - continue; /* IRQ already used */ - } - - intsrc.mpc_irqtype = mp_INT; - intsrc.mpc_srcbusirq = i; /* Identity mapped */ - intsrc.mpc_dstirq = i; - - Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, " - "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, - (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, - intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, - intsrc.mpc_dstirq); - - mp_irqs[mp_irq_entries] = intsrc; - if (++mp_irq_entries == MAX_IRQ_SOURCES) - panic("Max # of irq sources exceeded!\n"); - } -} - -#define MAX_GSI_NUM 4096 -#define IRQ_COMPRESSION_START 64 - -int mp_register_gsi(u32 gsi, int triggering, int polarity) -{ - int ioapic = -1; - int ioapic_pin = 0; - int idx, bit = 0; - static int pci_irq = IRQ_COMPRESSION_START; - /* - * Mapping between Global System Interrupts, which - * represent all possible interrupts, and IRQs - * assigned to actual devices. - */ - static int gsi_to_irq[MAX_GSI_NUM]; - - /* Don't set up the ACPI SCI because it's already set up */ - if (acpi_gbl_FADT.sci_interrupt == gsi) - return gsi; - - ioapic = mp_find_ioapic(gsi); - if (ioapic < 0) { - printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi); - return gsi; - } - - ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base; - - if (ioapic_renumber_irq) - gsi = ioapic_renumber_irq(ioapic, gsi); - - /* - * Avoid pin reprogramming. PRTs typically include entries - * with redundant pin->gsi mappings (but unique PCI devices); - * we only program the IOAPIC on the first. - */ - bit = ioapic_pin % 32; - idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32); - if (idx > 3) { - printk(KERN_ERR "Invalid reference to IOAPIC pin " - "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, - ioapic_pin); - return gsi; - } - if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) { - Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", - mp_ioapic_routing[ioapic].apic_id, ioapic_pin); - return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]); - } - - mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit); - - /* - * For GSI >= 64, use IRQ compression - */ - if ((gsi >= IRQ_COMPRESSION_START) - && (triggering == ACPI_LEVEL_SENSITIVE)) { - /* - * For PCI devices assign IRQs in order, avoiding gaps - * due to unused I/O APIC pins. - */ - int irq = gsi; - if (gsi < MAX_GSI_NUM) { - /* - * Retain the VIA chipset work-around (gsi > 15), but - * avoid a problem where the 8254 timer (IRQ0) is setup - * via an override (so it's not on pin 0 of the ioapic), - * and at the same time, the pin 0 interrupt is a PCI - * type. The gsi > 15 test could cause these two pins - * to be shared as IRQ0, and they are not shareable. - * So test for this condition, and if necessary, avoid - * the pin collision. - */ - if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0)) - gsi = pci_irq++; - /* - * Don't assign IRQ used by ACPI SCI - */ - if (gsi == acpi_gbl_FADT.sci_interrupt) - gsi = pci_irq++; - gsi_to_irq[irq] = gsi; - } else { - printk(KERN_ERR "GSI %u is too high\n", gsi); - return gsi; - } - } - - io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, - triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, - polarity == ACPI_ACTIVE_HIGH ? 
0 : 1); - return gsi; -} - -#endif /* CONFIG_X86_IO_APIC */ -#endif /* CONFIG_ACPI */ --- head-2011-03-11.orig/arch/x86/kernel/mpparse_64-xen.c 2011-01-31 18:01:51.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,880 +0,0 @@ -/* - * Intel Multiprocessor Specification 1.1 and 1.4 - * compliant MP-table parsing routines. - * - * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> - * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com> - * - * Fixes - * Erich Boleyn : MP v1.4 and additional changes. - * Alan Cox : Added EBDA scanning - * Ingo Molnar : various cleanups and rewrites - * Maciej W. Rozycki: Bits for default MP configurations - * Paul Diefenbaugh: Added full ACPI support - */ - -#include <linux/mm.h> -#include <linux/init.h> -#include <linux/delay.h> -#include <linux/bootmem.h> -#include <linux/kernel_stat.h> -#include <linux/mc146818rtc.h> -#include <linux/acpi.h> -#include <linux/module.h> - -#include <asm/smp.h> -#include <asm/mtrr.h> -#include <asm/mpspec.h> -#include <asm/pgalloc.h> -#include <asm/io_apic.h> -#include <asm/proto.h> -#include <asm/acpi.h> - -/* Have we found an MP table */ -int smp_found_config; - -/* - * Various Linux-internal data structures created from the - * MP-table. - */ -DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); -int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; - -static int mp_current_pci_id = 0; -/* I/O APIC entries */ -struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; - -/* # of MP IRQ source entries */ -struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; - -/* MP IRQ source entries */ -int mp_irq_entries; - -int nr_ioapics; -unsigned long mp_lapic_addr = 0; - - -#ifndef CONFIG_XEN -/* Processor that is doing the boot up */ -unsigned int boot_cpu_id = -1U; -EXPORT_SYMBOL(boot_cpu_id); -#endif - -/* Internal processor count */ -unsigned int num_processors; - -unsigned disabled_cpus __cpuinitdata; - -/* Bitmask of physically existing CPUs */ -physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; - -#ifndef CONFIG_XEN -u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata - = { [0 ... NR_CPUS-1] = BAD_APICID }; -void *x86_bios_cpu_apicid_early_ptr; -#endif -DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID; -EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid); - - -/* - * Intel MP BIOS table parsing routines: - */ - -/* - * Checksum an MP configuration block. - */ - -static int __init mpf_checksum(unsigned char *mp, int len) -{ - int sum = 0; - - while (len--) - sum += *mp++; - - return sum & 0xFF; -} - -#ifndef CONFIG_XEN -static void __cpuinit MP_processor_info(struct mpc_config_processor *m) -{ - int cpu; - cpumask_t tmp_map; - char *bootup_cpu = ""; - - if (!(m->mpc_cpuflag & CPU_ENABLED)) { - disabled_cpus++; - return; - } - if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { - bootup_cpu = " (Bootup-CPU)"; - boot_cpu_id = m->mpc_apicid; - } - - printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu); - - if (num_processors >= NR_CPUS) { - printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." - " Processor ignored.\n", NR_CPUS); - return; - } - - num_processors++; - cpus_complement(tmp_map, cpu_present_map); - cpu = first_cpu(tmp_map); - - physid_set(m->mpc_apicid, phys_cpu_present_map); - if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { - /* - * x86_bios_cpu_apicid is required to have processors listed - * in same order as logical cpu numbers. Hence the first - * entry is BSP, and so on. - */ - cpu = 0; - } - /* are we being called early in kernel startup? 
*/ - if (x86_cpu_to_apicid_early_ptr) { - u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr; - u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr; - - cpu_to_apicid[cpu] = m->mpc_apicid; - bios_cpu_apicid[cpu] = m->mpc_apicid; - } else { - per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid; - per_cpu(x86_bios_cpu_apicid, cpu) = m->mpc_apicid; - } - - cpu_set(cpu, cpu_possible_map); - cpu_set(cpu, cpu_present_map); -} -#else -static void __cpuinit MP_processor_info(struct mpc_config_processor *m) -{ - num_processors++; -} -#endif /* CONFIG_XEN */ - -static void __init MP_bus_info (struct mpc_config_bus *m) -{ - char str[7]; - - memcpy(str, m->mpc_bustype, 6); - str[6] = 0; - Dprintk("Bus #%d is %s\n", m->mpc_busid, str); - - if (strncmp(str, "ISA", 3) == 0) { - set_bit(m->mpc_busid, mp_bus_not_pci); - } else if (strncmp(str, "PCI", 3) == 0) { - clear_bit(m->mpc_busid, mp_bus_not_pci); - mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; - mp_current_pci_id++; - } else { - printk(KERN_ERR "Unknown bustype %s\n", str); - } -} - -static int bad_ioapic(unsigned long address) -{ - if (nr_ioapics >= MAX_IO_APICS) { - printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " - "(found %d)\n", MAX_IO_APICS, nr_ioapics); - panic("Recompile kernel with bigger MAX_IO_APICS!\n"); - } - if (!address) { - printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" - " found in table, skipping!\n"); - return 1; - } - return 0; -} - -static void __init MP_ioapic_info (struct mpc_config_ioapic *m) -{ - if (!(m->mpc_flags & MPC_APIC_USABLE)) - return; - - printk("I/O APIC #%d at 0x%X.\n", - m->mpc_apicid, m->mpc_apicaddr); - - if (bad_ioapic(m->mpc_apicaddr)) - return; - - mp_ioapics[nr_ioapics] = *m; - nr_ioapics++; -} - -static void __init MP_intsrc_info (struct mpc_config_intsrc *m) -{ - mp_irqs [mp_irq_entries] = *m; - Dprintk("Int: type %d, pol %d, trig %d, bus %d," - " IRQ %02x, APIC ID %x, APIC INT %02x\n", - m->mpc_irqtype, m->mpc_irqflag & 3, - (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, - m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); - if (++mp_irq_entries >= MAX_IRQ_SOURCES) - panic("Max # of irq sources exceeded!!\n"); -} - -static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m) -{ - Dprintk("Lint: type %d, pol %d, trig %d, bus %d," - " IRQ %02x, APIC ID %x, APIC LINT %02x\n", - m->mpc_irqtype, m->mpc_irqflag & 3, - (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, - m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); -} - -/* - * Read/parse the MPC - */ - -static int __init smp_read_mpc(struct mp_config_table *mpc) -{ - char str[16]; - int count=sizeof(*mpc); - unsigned char *mpt=((unsigned char *)mpc)+count; - - if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) { - printk("MPTABLE: bad signature [%c%c%c%c]!\n", - mpc->mpc_signature[0], - mpc->mpc_signature[1], - mpc->mpc_signature[2], - mpc->mpc_signature[3]); - return 0; - } - if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) { - printk("MPTABLE: checksum error!\n"); - return 0; - } - if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) { - printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n", - mpc->mpc_spec); - return 0; - } - if (!mpc->mpc_lapic) { - printk(KERN_ERR "MPTABLE: null local APIC address!\n"); - return 0; - } - memcpy(str,mpc->mpc_oem,8); - str[8] = 0; - printk(KERN_INFO "MPTABLE: OEM ID: %s ",str); - - memcpy(str,mpc->mpc_productid,12); - str[12] = 0; - printk("MPTABLE: Product ID: %s ",str); - - printk("MPTABLE: APIC at: 0x%X\n",mpc->mpc_lapic); - - /* save the local APIC address, it might be non-default */ - if 
(!acpi_lapic) - mp_lapic_addr = mpc->mpc_lapic; - - /* - * Now process the configuration blocks. - */ - while (count < mpc->mpc_length) { - switch(*mpt) { - case MP_PROCESSOR: - { - struct mpc_config_processor *m= - (struct mpc_config_processor *)mpt; - if (!acpi_lapic) - MP_processor_info(m); - mpt += sizeof(*m); - count += sizeof(*m); - break; - } - case MP_BUS: - { - struct mpc_config_bus *m= - (struct mpc_config_bus *)mpt; - MP_bus_info(m); - mpt += sizeof(*m); - count += sizeof(*m); - break; - } - case MP_IOAPIC: - { - struct mpc_config_ioapic *m= - (struct mpc_config_ioapic *)mpt; - MP_ioapic_info(m); - mpt += sizeof(*m); - count += sizeof(*m); - break; - } - case MP_INTSRC: - { - struct mpc_config_intsrc *m= - (struct mpc_config_intsrc *)mpt; - - MP_intsrc_info(m); - mpt += sizeof(*m); - count += sizeof(*m); - break; - } - case MP_LINTSRC: - { - struct mpc_config_lintsrc *m= - (struct mpc_config_lintsrc *)mpt; - MP_lintsrc_info(m); - mpt += sizeof(*m); - count += sizeof(*m); - break; - } - } - } - setup_apic_routing(); - if (!num_processors) - printk(KERN_ERR "MPTABLE: no processors registered!\n"); - return num_processors; -} - -static int __init ELCR_trigger(unsigned int irq) -{ - unsigned int port; - - port = 0x4d0 + (irq >> 3); - return (inb(port) >> (irq & 7)) & 1; -} - -static void __init construct_default_ioirq_mptable(int mpc_default_type) -{ - struct mpc_config_intsrc intsrc; - int i; - int ELCR_fallback = 0; - - intsrc.mpc_type = MP_INTSRC; - intsrc.mpc_irqflag = 0; /* conforming */ - intsrc.mpc_srcbus = 0; - intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid; - - intsrc.mpc_irqtype = mp_INT; - - /* - * If true, we have an ISA/PCI system with no IRQ entries - * in the MP table. To prevent the PCI interrupts from being set up - * incorrectly, we try to use the ELCR. The sanity check to see if - * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can - * never be level sensitive, so we simply see if the ELCR agrees. - * If it does, we assume it's valid. - */ - if (mpc_default_type == 5) { - printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n"); - - if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13)) - printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n"); - else { - printk(KERN_INFO "Using ELCR to identify PCI interrupts\n"); - ELCR_fallback = 1; - } - } - - for (i = 0; i < 16; i++) { - switch (mpc_default_type) { - case 2: - if (i == 0 || i == 13) - continue; /* IRQ0 & IRQ13 not connected */ - /* fall through */ - default: - if (i == 2) - continue; /* IRQ2 is never connected */ - } - - if (ELCR_fallback) { - /* - * If the ELCR indicates a level-sensitive interrupt, we - * copy that information over to the MP table in the - * irqflag field (level sensitive, active high polarity). - */ - if (ELCR_trigger(i)) - intsrc.mpc_irqflag = 13; - else - intsrc.mpc_irqflag = 0; - } - - intsrc.mpc_srcbusirq = i; - intsrc.mpc_dstirq = i ? 
i : 2; /* IRQ0 to INTIN2 */ - MP_intsrc_info(&intsrc); - } - - intsrc.mpc_irqtype = mp_ExtINT; - intsrc.mpc_srcbusirq = 0; - intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */ - MP_intsrc_info(&intsrc); -} - -static inline void __init construct_default_ISA_mptable(int mpc_default_type) -{ - struct mpc_config_processor processor; - struct mpc_config_bus bus; - struct mpc_config_ioapic ioapic; - struct mpc_config_lintsrc lintsrc; - int linttypes[2] = { mp_ExtINT, mp_NMI }; - int i; - - /* - * local APIC has default address - */ - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - - /* - * 2 CPUs, numbered 0 & 1. - */ - processor.mpc_type = MP_PROCESSOR; - processor.mpc_apicver = 0; - processor.mpc_cpuflag = CPU_ENABLED; - processor.mpc_cpufeature = 0; - processor.mpc_featureflag = 0; - processor.mpc_reserved[0] = 0; - processor.mpc_reserved[1] = 0; - for (i = 0; i < 2; i++) { - processor.mpc_apicid = i; - MP_processor_info(&processor); - } - - bus.mpc_type = MP_BUS; - bus.mpc_busid = 0; - switch (mpc_default_type) { - default: - printk(KERN_ERR "???\nUnknown standard configuration %d\n", - mpc_default_type); - /* fall through */ - case 1: - case 5: - memcpy(bus.mpc_bustype, "ISA ", 6); - break; - } - MP_bus_info(&bus); - if (mpc_default_type > 4) { - bus.mpc_busid = 1; - memcpy(bus.mpc_bustype, "PCI ", 6); - MP_bus_info(&bus); - } - - ioapic.mpc_type = MP_IOAPIC; - ioapic.mpc_apicid = 2; - ioapic.mpc_apicver = 0; - ioapic.mpc_flags = MPC_APIC_USABLE; - ioapic.mpc_apicaddr = 0xFEC00000; - MP_ioapic_info(&ioapic); - - /* - * We set up most of the low 16 IO-APIC pins according to MPS rules. - */ - construct_default_ioirq_mptable(mpc_default_type); - - lintsrc.mpc_type = MP_LINTSRC; - lintsrc.mpc_irqflag = 0; /* conforming */ - lintsrc.mpc_srcbusid = 0; - lintsrc.mpc_srcbusirq = 0; - lintsrc.mpc_destapic = MP_APIC_ALL; - for (i = 0; i < 2; i++) { - lintsrc.mpc_irqtype = linttypes[i]; - lintsrc.mpc_destapiclint = i; - MP_lintsrc_info(&lintsrc); - } -} - -static struct intel_mp_floating *mpf_found; - -/* - * Scan the memory blocks for an SMP configuration block. - */ -void __init get_smp_config (void) -{ - struct intel_mp_floating *mpf = mpf_found; - - /* - * ACPI supports both logical (e.g. Hyper-Threading) and physical - * processors, where MPS only supports physical. - */ - if (acpi_lapic && acpi_ioapic) { - printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n"); - return; - } - else if (acpi_lapic) - printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n"); - - printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); - - /* - * Now see if we need to read further. - */ - if (mpf->mpf_feature1 != 0) { - - printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1); - construct_default_ISA_mptable(mpf->mpf_feature1); - - } else if (mpf->mpf_physptr) { - - /* - * Read the physical hardware table. Anything here will - * override the defaults. - */ - if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) { - smp_found_config = 0; - printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); - printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n"); - return; - } - /* - * If there are no explicit MP IRQ entries, then we are - * broken. We set up most of the low 16 IO-APIC pins to - * ISA defaults and hope it will work. - */ - if (!mp_irq_entries) { - struct mpc_config_bus bus; - - printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. 
(tell your hw vendor)\n"); - - bus.mpc_type = MP_BUS; - bus.mpc_busid = 0; - memcpy(bus.mpc_bustype, "ISA ", 6); - MP_bus_info(&bus); - - construct_default_ioirq_mptable(0); - } - - } else - BUG(); - - printk(KERN_INFO "Processors: %d\n", num_processors); - /* - * Only use the first configuration found. - */ -} - -static int __init smp_scan_config (unsigned long base, unsigned long length) -{ - extern void __bad_mpf_size(void); - unsigned int *bp = isa_bus_to_virt(base); - struct intel_mp_floating *mpf; - - Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length); - if (sizeof(*mpf) != 16) - __bad_mpf_size(); - - while (length > 0) { - mpf = (struct intel_mp_floating *)bp; - if ((*bp == SMP_MAGIC_IDENT) && - (mpf->mpf_length == 1) && - !mpf_checksum((unsigned char *)bp, 16) && - ((mpf->mpf_specification == 1) - || (mpf->mpf_specification == 4)) ) { - - smp_found_config = 1; - mpf_found = mpf; - return 1; - } - bp += 4; - length -= 16; - } - return 0; -} - -void __init find_smp_config(void) -{ - unsigned int address; - - /* - * FIXME: Linux assumes you have 640K of base ram.. - * this continues the error... - * - * 1) Scan the bottom 1K for a signature - * 2) Scan the top 1K of base RAM - * 3) Scan the 64K of bios - */ - if (smp_scan_config(0x0,0x400) || - smp_scan_config(639*0x400,0x400) || - smp_scan_config(0xF0000,0x10000)) - return; - /* - * If it is an SMP machine we should know now. - * - * there is a real-mode segmented pointer pointing to the - * 4K EBDA area at 0x40E, calculate and scan it here. - * - * NOTE! There are Linux loaders that will corrupt the EBDA - * area, and as such this kind of SMP config may be less - * trustworthy, simply because the SMP table may have been - * stomped on during early boot. These loaders are buggy and - * should be fixed. - */ - - address = *(unsigned short *)phys_to_virt(0x40E); - address <<= 4; - if (smp_scan_config(address, 0x1000)) - return; - - /* If we have come this far, we did not find an MP table */ - printk(KERN_INFO "No mptable found.\n"); -} - -/* -------------------------------------------------------------------------- - ACPI-based MP Configuration - -------------------------------------------------------------------------- */ - -#ifdef CONFIG_ACPI - -void __init mp_register_lapic_address(u64 address) -{ -#ifndef CONFIG_XEN - mp_lapic_addr = (unsigned long) address; - set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); - if (boot_cpu_id == -1U) - boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); -#endif -} - -void __cpuinit mp_register_lapic (u8 id, u8 enabled) -{ - struct mpc_config_processor processor; -#ifndef CONFIG_XEN - int boot_cpu = 0; - - if (id == boot_cpu_id) - boot_cpu = 1; - - processor.mpc_type = MP_PROCESSOR; - processor.mpc_apicid = id; - processor.mpc_apicver = 0; - processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0); - processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0); - processor.mpc_cpufeature = 0; - processor.mpc_featureflag = 0; - processor.mpc_reserved[0] = 0; - processor.mpc_reserved[1] = 0; -#endif - - MP_processor_info(&processor); -} - -#define MP_ISA_BUS 0 -#define MP_MAX_IOAPIC_PIN 127 - -static struct mp_ioapic_routing { - int apic_id; - int gsi_start; - int gsi_end; - u32 pin_programmed[4]; -} mp_ioapic_routing[MAX_IO_APICS]; - -static int mp_find_ioapic(int gsi) -{ - int i = 0; - - /* Find the IOAPIC that manages this GSI. 
*/ - for (i = 0; i < nr_ioapics; i++) { - if ((gsi >= mp_ioapic_routing[i].gsi_start) - && (gsi <= mp_ioapic_routing[i].gsi_end)) - return i; - } - - printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); - return -1; -} - -static u8 uniq_ioapic_id(u8 id) -{ - int i; - DECLARE_BITMAP(used, 256); - bitmap_zero(used, 256); - for (i = 0; i < nr_ioapics; i++) { - struct mpc_config_ioapic *ia = &mp_ioapics[i]; - __set_bit(ia->mpc_apicid, used); - } - if (!test_bit(id, used)) - return id; - return find_first_zero_bit(used, 256); -} - -void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) -{ - int idx = 0; - - if (bad_ioapic(address)) - return; - - idx = nr_ioapics; - - mp_ioapics[idx].mpc_type = MP_IOAPIC; - mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; - mp_ioapics[idx].mpc_apicaddr = address; - -#ifndef CONFIG_XEN - set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); -#endif - mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id); - mp_ioapics[idx].mpc_apicver = 0; - - /* - * Build basic IRQ lookup table to facilitate gsi->io_apic lookups - * and to prevent reprogramming of IOAPIC pins (PCI IRQs). - */ - mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; - mp_ioapic_routing[idx].gsi_start = gsi_base; - mp_ioapic_routing[idx].gsi_end = gsi_base + - io_apic_get_redir_entries(idx); - - printk(KERN_INFO "IOAPIC[%d]: apic_id %d, address 0x%x, " - "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, - mp_ioapics[idx].mpc_apicaddr, - mp_ioapic_routing[idx].gsi_start, - mp_ioapic_routing[idx].gsi_end); - - nr_ioapics++; -} - -void __init -mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) -{ - struct mpc_config_intsrc intsrc; - int ioapic = -1; - int pin = -1; - - /* - * Convert 'gsi' to 'ioapic.pin'. - */ - ioapic = mp_find_ioapic(gsi); - if (ioapic < 0) - return; - pin = gsi - mp_ioapic_routing[ioapic].gsi_start; - - /* - * TBD: This check is for faulty timer entries, where the override - * erroneously sets the trigger to level, resulting in a HUGE - * increase of timer interrupts! - */ - if ((bus_irq == 0) && (trigger == 3)) - trigger = 1; - - intsrc.mpc_type = MP_INTSRC; - intsrc.mpc_irqtype = mp_INT; - intsrc.mpc_irqflag = (trigger << 2) | polarity; - intsrc.mpc_srcbus = MP_ISA_BUS; - intsrc.mpc_srcbusirq = bus_irq; /* IRQ */ - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ - intsrc.mpc_dstirq = pin; /* INTIN# */ - - Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n", - intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, - (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, - intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq); - - mp_irqs[mp_irq_entries] = intsrc; - if (++mp_irq_entries == MAX_IRQ_SOURCES) - panic("Max # of irq sources exceeded!\n"); -} - -void __init mp_config_acpi_legacy_irqs(void) -{ - struct mpc_config_intsrc intsrc; - int i = 0; - int ioapic = -1; - - /* - * Fabricate the legacy ISA bus (bus #31). - */ - set_bit(MP_ISA_BUS, mp_bus_not_pci); - - /* - * Locate the IOAPIC that manages the ISA IRQs (0-15). - */ - ioapic = mp_find_ioapic(0); - if (ioapic < 0) - return; - - intsrc.mpc_type = MP_INTSRC; - intsrc.mpc_irqflag = 0; /* Conforming */ - intsrc.mpc_srcbus = MP_ISA_BUS; - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; - - /* - * Use the default configuration for the IRQs 0-15. Unless - * overridden by (MADT) interrupt source override entries. 
-	 */
-	for (i = 0; i < 16; i++) {
-		int idx;
-
-		for (idx = 0; idx < mp_irq_entries; idx++) {
-			struct mpc_config_intsrc *irq = mp_irqs + idx;
-
-			/* Do we already have a mapping for this ISA IRQ? */
-			if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
-				break;
-
-			/* Do we already have a mapping for this IOAPIC pin */
-			if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
-				(irq->mpc_dstirq == i))
-				break;
-		}
-
-		if (idx != mp_irq_entries) {
-			printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
-			continue;		/* IRQ already used */
-		}
-
-		intsrc.mpc_irqtype = mp_INT;
-		intsrc.mpc_srcbusirq = i;	/* Identity mapped */
-		intsrc.mpc_dstirq = i;
-
-		Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
-			"%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
-			(intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
-			intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
-			intsrc.mpc_dstirq);
-
-		mp_irqs[mp_irq_entries] = intsrc;
-		if (++mp_irq_entries == MAX_IRQ_SOURCES)
-			panic("Max # of irq sources exceeded!\n");
-	}
-}
-
-int mp_register_gsi(u32 gsi, int triggering, int polarity)
-{
-	int ioapic = -1;
-	int ioapic_pin = 0;
-	int idx, bit = 0;
-
-	if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
-		return gsi;
-
-	/* Don't set up the ACPI SCI because it's already set up */
-	if (acpi_gbl_FADT.sci_interrupt == gsi)
-		return gsi;
-
-	ioapic = mp_find_ioapic(gsi);
-	if (ioapic < 0) {
-		printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
-		return gsi;
-	}
-
-	ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
-
-	/*
-	 * Avoid pin reprogramming.  PRTs typically include entries
-	 * with redundant pin->gsi mappings (but unique PCI devices);
-	 * we only program the IOAPIC on the first.
-	 */
-	bit = ioapic_pin % 32;
-	idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
-	if (idx > 3) {
-		printk(KERN_ERR "Invalid reference to IOAPIC pin "
-			"%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
-			ioapic_pin);
-		return gsi;
-	}
-	if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
-		Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
-			mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
-		return gsi;
-	}
-
-	mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
-
-	io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
-		triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
-		polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
-	return gsi;
-}
-#endif /*CONFIG_ACPI*/
--- head-2011-03-11.orig/arch/x86/kernel/pci-dma-xen.c	2011-01-31 18:01:51.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/pci-dma-xen.c	2011-01-31 18:07:35.000000000 +0100
@@ -1,280 +1,251 @@
-/*
- * Dynamic DMA mapping support.
- *
- * On i386 there is no hardware dynamic DMA address translation,
- * so consistent alloc/free are merely page allocation/freeing.
- * The rest of the dynamic DMA mapping interface is implemented
- * in asm/pci.h.
- */ - -#include <linux/types.h> -#include <linux/mm.h> -#include <linux/string.h> +#include <linux/dma-mapping.h> +#include <linux/dmar.h> +#include <linux/bootmem.h> #include <linux/pci.h> -#include <linux/module.h> -#include <linux/version.h> -#include <asm/io.h> -#include <xen/balloon.h> -#include <xen/gnttab.h> -#include <asm/swiotlb.h> -#include <asm/tlbflush.h> -#include <asm/swiotlb_32.h> -#include <asm/gnttab_dma.h> -#include <asm/bug.h> -#ifdef __x86_64__ -#include <asm/iommu.h> +#include <asm/proto.h> +#include <asm/dma.h> +#include <asm/gart.h> +#include <asm/calgary.h> + +int forbid_dac __read_mostly; +EXPORT_SYMBOL(forbid_dac); + +const struct dma_mapping_ops *dma_ops; +EXPORT_SYMBOL(dma_ops); + +static int iommu_sac_force __read_mostly; + +#ifdef CONFIG_IOMMU_DEBUG +int panic_on_overflow __read_mostly = 1; +int force_iommu __read_mostly = 1; +#else +int panic_on_overflow __read_mostly = 0; +int force_iommu __read_mostly = 0; +#endif int iommu_merge __read_mostly = 0; -EXPORT_SYMBOL(iommu_merge); -dma_addr_t bad_dma_address __read_mostly; -EXPORT_SYMBOL(bad_dma_address); +int no_iommu __read_mostly; +/* Set this to 1 if there is a HW IOMMU in the system */ +int iommu_detected __read_mostly = 0; /* This tells the BIO block layer to assume merging. Default to off because we cannot guarantee merging later. */ int iommu_bio_merge __read_mostly = 0; EXPORT_SYMBOL(iommu_bio_merge); -int force_iommu __read_mostly= 0; +dma_addr_t bad_dma_address __read_mostly = 0; +EXPORT_SYMBOL(bad_dma_address); -__init int iommu_setup(char *p) -{ - return 1; -} +/* Dummy device used for NULL arguments (normally ISA). Better would + be probably a smaller DMA mask, but this is bug-to-bug compatible + to older i386. */ +struct device fallback_dev = { + .bus_id = "fallback device", + .coherent_dma_mask = DMA_32BIT_MASK, + .dma_mask = &fallback_dev.coherent_dma_mask, +}; -void __init pci_iommu_alloc(void) +int dma_set_mask(struct device *dev, u64 mask) { -#ifdef CONFIG_SWIOTLB - pci_swiotlb_init(); -#endif -} + if (!dev->dma_mask || !dma_supported(dev, mask)) + return -EIO; + + *dev->dma_mask = mask; -static int __init pci_iommu_init(void) -{ - no_iommu_init(); return 0; } +EXPORT_SYMBOL(dma_set_mask); -/* Must execute after PCI subsystem */ -fs_initcall(pci_iommu_init); -#endif - -struct dma_coherent_mem { - void *virt_base; - u32 device_base; - int size; - int flags; - unsigned long *bitmap; -}; - -#define IOMMU_BUG_ON(test) \ -do { \ - if (unlikely(test)) { \ - printk(KERN_ALERT "Fatal DMA error! 
" \ - "Please use 'swiotlb=force'\n"); \ - BUG(); \ - } \ -} while (0) +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) +static __initdata void *dma32_bootmem_ptr; +static unsigned long dma32_bootmem_size __initdata = (128ULL<<20); -static int check_pages_physically_contiguous(unsigned long pfn, - unsigned int offset, - size_t length) +static int __init parse_dma32_size_opt(char *p) { - unsigned long next_mfn; - int i; - int nr_pages; - - next_mfn = pfn_to_mfn(pfn); - nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT; - - for (i = 1; i < nr_pages; i++) { - if (pfn_to_mfn(++pfn) != ++next_mfn) - return 0; - } - return 1; + if (!p) + return -EINVAL; + dma32_bootmem_size = memparse(p, &p); + return 0; } +early_param("dma32_size", parse_dma32_size_opt); -int range_straddles_page_boundary(paddr_t p, size_t size) +void __init dma32_reserve_bootmem(void) { - unsigned long pfn = p >> PAGE_SHIFT; - unsigned int offset = p & ~PAGE_MASK; + unsigned long size, align; + if (end_pfn <= MAX_DMA32_PFN) + return; - return ((offset + size > PAGE_SIZE) && - !check_pages_physically_contiguous(pfn, offset, size)); + align = 64ULL<<20; + size = round_up(dma32_bootmem_size, align); + dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, + __pa(MAX_DMA_ADDRESS)); + if (dma32_bootmem_ptr) + dma32_bootmem_size = size; + else + dma32_bootmem_size = 0; } - -int -dma_map_sg(struct device *hwdev, struct scatterlist *sgl, int nents, - enum dma_data_direction direction) +static void __init dma32_free_bootmem(void) { - int i, rc; + int node; + + if (end_pfn <= MAX_DMA32_PFN) + return; - BUG_ON(!valid_dma_direction(direction)); - WARN_ON(nents == 0 || sgl->length == 0); + if (!dma32_bootmem_ptr) + return; - if (swiotlb) { - rc = swiotlb_map_sg(hwdev, sgl, nents, direction); - } else { - struct scatterlist *sg; - - for_each_sg(sgl, sg, nents, i) { - BUG_ON(!sg_page(sg)); - sg->dma_address = - gnttab_dma_map_page(sg_page(sg)) + sg->offset; - sg->dma_length = sg->length; - IOMMU_BUG_ON(address_needs_mapping( - hwdev, sg->dma_address)); - IOMMU_BUG_ON(range_straddles_page_boundary( - page_to_pseudophys(sg_page(sg)) + sg->offset, - sg->length)); - } - rc = nents; - } + for_each_online_node(node) + free_bootmem_node(NODE_DATA(node), __pa(dma32_bootmem_ptr), + dma32_bootmem_size); - flush_write_buffers(); - return rc; + dma32_bootmem_ptr = NULL; + dma32_bootmem_size = 0; } -EXPORT_SYMBOL(dma_map_sg); +#else +#define dma32_free_bootmem() ((void)0) +#endif -void -dma_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nents, - enum dma_data_direction direction) -{ - int i; +static const struct dma_mapping_ops swiotlb_dma_ops = { + .mapping_error = swiotlb_dma_mapping_error, + .map_single = swiotlb_map_single_phys, + .unmap_single = swiotlb_unmap_single, + .sync_single_for_cpu = swiotlb_sync_single_for_cpu, + .sync_single_for_device = swiotlb_sync_single_for_device, + .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu, + .sync_single_range_for_device = swiotlb_sync_single_range_for_device, + .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, + .sync_sg_for_device = swiotlb_sync_sg_for_device, + .map_sg = swiotlb_map_sg, + .unmap_sg = swiotlb_unmap_sg, + .dma_supported = swiotlb_dma_supported +}; - BUG_ON(!valid_dma_direction(direction)); - if (swiotlb) - swiotlb_unmap_sg(hwdev, sgl, nents, direction); - else { - struct scatterlist *sg; +void __init pci_iommu_alloc(void) +{ + /* free the range so iommu could get some range less than 4G */ + dma32_free_bootmem(); + /* + * The order of these functions is 
important for + * fall-back/fail-over reasons + */ +#ifdef CONFIG_GART_IOMMU + gart_iommu_hole_init(); +#endif - for_each_sg(sgl, sg, nents, i) - gnttab_dma_unmap_page(sg->dma_address); - } -} -EXPORT_SYMBOL(dma_unmap_sg); +#ifdef CONFIG_CALGARY_IOMMU + detect_calgary(); +#endif -#ifdef CONFIG_HIGHMEM -dma_addr_t -dma_map_page(struct device *dev, struct page *page, unsigned long offset, - size_t size, enum dma_data_direction direction) -{ - dma_addr_t dma_addr; + detect_intel_iommu(); - BUG_ON(!valid_dma_direction(direction)); +#ifdef CONFIG_SWIOTLB + swiotlb_init(); if (swiotlb) { - dma_addr = swiotlb_map_page( - dev, page, offset, size, direction); - } else { - dma_addr = gnttab_dma_map_page(page) + offset; - IOMMU_BUG_ON(address_needs_mapping(dev, dma_addr)); + printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); + dma_ops = &swiotlb_dma_ops; } - - return dma_addr; +#endif } -EXPORT_SYMBOL(dma_map_page); -void -dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, - enum dma_data_direction direction) +/* + * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter + * documentation. + */ +static __init int iommu_setup(char *p) { - BUG_ON(!valid_dma_direction(direction)); - if (swiotlb) - swiotlb_unmap_page(dev, dma_address, size, direction); - else - gnttab_dma_unmap_page(dma_address); -} -EXPORT_SYMBOL(dma_unmap_page); -#endif /* CONFIG_HIGHMEM */ + iommu_merge = 1; -int -dma_mapping_error(dma_addr_t dma_addr) -{ - if (swiotlb) - return swiotlb_dma_mapping_error(dma_addr); - return 0; -} -EXPORT_SYMBOL(dma_mapping_error); + if (!p) + return -EINVAL; -int -dma_supported(struct device *dev, u64 mask) -{ - if (swiotlb) - return swiotlb_dma_supported(dev, mask); - /* - * By default we'll BUG when an infeasible DMA is requested, and - * request swiotlb=force (see IOMMU_BUG_ON). - */ - return 1; -} -EXPORT_SYMBOL(dma_supported); + while (*p) { + if (!strncmp(p, "off", 3)) + no_iommu = 1; + /* gart_parse_options has more force support */ + if (!strncmp(p, "force", 5)) + force_iommu = 1; + if (!strncmp(p, "noforce", 7)) { + iommu_merge = 0; + force_iommu = 0; + } -void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp) -{ - void *ret; - struct dma_coherent_mem *mem = dev ? 
dev->dma_mem : NULL; - unsigned int order = get_order(size); - unsigned long vstart; - u64 mask; + if (!strncmp(p, "biomerge", 8)) { + iommu_bio_merge = 4096; + iommu_merge = 1; + force_iommu = 1; + } + if (!strncmp(p, "panic", 5)) + panic_on_overflow = 1; + if (!strncmp(p, "nopanic", 7)) + panic_on_overflow = 0; + if (!strncmp(p, "merge", 5)) { + iommu_merge = 1; + force_iommu = 1; + } + if (!strncmp(p, "nomerge", 7)) + iommu_merge = 0; + if (!strncmp(p, "forcesac", 8)) + iommu_sac_force = 1; + if (!strncmp(p, "allowdac", 8)) + forbid_dac = 0; + if (!strncmp(p, "nodac", 5)) + forbid_dac = -1; + if (!strncmp(p, "usedac", 6)) { + forbid_dac = -1; + return 1; + } +#ifdef CONFIG_SWIOTLB + if (!strncmp(p, "soft", 4)) + swiotlb = 1; +#endif - /* ignore region specifiers */ - gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); +#ifdef CONFIG_GART_IOMMU + gart_parse_options(p); +#endif - if (mem) { - int page = bitmap_find_free_region(mem->bitmap, mem->size, - order); - if (page >= 0) { - *dma_handle = mem->device_base + (page << PAGE_SHIFT); - ret = mem->virt_base + (page << PAGE_SHIFT); - memset(ret, 0, size); - return ret; - } - if (mem->flags & DMA_MEMORY_EXCLUSIVE) - return NULL; +#ifdef CONFIG_CALGARY_IOMMU + if (!strncmp(p, "calgary", 7)) + use_calgary = 1; +#endif /* CONFIG_CALGARY_IOMMU */ + + p += strcspn(p, ","); + if (*p == ',') + ++p; } + return 0; +} +early_param("iommu", iommu_setup); - vstart = __get_free_pages(gfp, order); - ret = (void *)vstart; +static int check_pages_physically_contiguous(unsigned long pfn, + unsigned int offset, + size_t length) +{ + unsigned long next_mfn; + int i; + int nr_pages; - if (dev != NULL && dev->coherent_dma_mask) - mask = dev->coherent_dma_mask; - else - mask = 0xffffffff; + next_mfn = pfn_to_mfn(pfn); + nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT; - if (ret != NULL) { - if (xen_create_contiguous_region(vstart, order, - fls64(mask)) != 0) { - free_pages(vstart, order); - return NULL; - } - memset(ret, 0, size); - *dma_handle = virt_to_bus(ret); + for (i = 1; i < nr_pages; i++) { + if (pfn_to_mfn(++pfn) != ++next_mfn) + return 0; } - return ret; + return 1; } -EXPORT_SYMBOL(dma_alloc_coherent); -void dma_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle) +int range_straddles_page_boundary(paddr_t p, size_t size) { - struct dma_coherent_mem *mem = dev ? 
dev->dma_mem : NULL; - int order = get_order(size); - - WARN_ON(irqs_disabled()); /* for portability */ - if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) { - int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; + unsigned long pfn = p >> PAGE_SHIFT; + unsigned int offset = p & ~PAGE_MASK; - bitmap_release_region(mem->bitmap, page, order); - } else { - xen_destroy_contiguous_region((unsigned long)vaddr, order); - free_pages((unsigned long)vaddr, order); - } + return ((offset + size > PAGE_SIZE) && + !check_pages_physically_contiguous(pfn, offset, size)); } -EXPORT_SYMBOL(dma_free_coherent); -#ifdef ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY +#ifdef CONFIG_X86_32 int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, dma_addr_t device_addr, size_t size, int flags) { @@ -324,8 +295,8 @@ EXPORT_SYMBOL(dma_declare_coherent_memor void dma_release_declared_memory(struct device *dev) { struct dma_coherent_mem *mem = dev->dma_mem; - - if(!mem) + + if (!mem) return; dev->dma_mem = NULL; iounmap(mem->virt_base); @@ -338,8 +309,10 @@ void *dma_mark_declared_memory_occupied( dma_addr_t device_addr, size_t size) { struct dma_coherent_mem *mem = dev->dma_mem; - int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT; int pos, err; + int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1); + + pages >>= PAGE_SHIFT; if (!mem) return ERR_PTR(-EINVAL); @@ -351,103 +324,270 @@ void *dma_mark_declared_memory_occupied( return mem->virt_base + (pos << PAGE_SHIFT); } EXPORT_SYMBOL(dma_mark_declared_memory_occupied); -#endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */ - -#if defined(CONFIG_PCI) && !defined(CONFIG_XEN) -/* Many VIA bridges seem to corrupt data for DAC. Disable it here */ - -int forbid_dac; -EXPORT_SYMBOL(forbid_dac); -static __devinit void via_no_dac(struct pci_dev *dev) +static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size, + dma_addr_t *dma_handle, void **ret) { - if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { - printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n"); - forbid_dac = 1; + struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; + int order = get_order(size); + + if (mem) { + int page = bitmap_find_free_region(mem->bitmap, mem->size, + order); + if (page >= 0) { + *dma_handle = mem->device_base + (page << PAGE_SHIFT); + *ret = mem->virt_base + (page << PAGE_SHIFT); + memset(*ret, 0, size); + } + if (mem->flags & DMA_MEMORY_EXCLUSIVE) + *ret = NULL; } + return (mem != NULL); } -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac); -static int check_iommu(char *s) +static int dma_release_coherent(struct device *dev, int order, void *vaddr) { - if (!strcmp(s, "usedac")) { - forbid_dac = -1; + struct dma_coherent_mem *mem = dev ? 
dev->dma_mem : NULL; + + if (mem && vaddr >= mem->virt_base && vaddr < + (mem->virt_base + (mem->size << PAGE_SHIFT))) { + int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; + + bitmap_release_region(mem->bitmap, page, order); return 1; } return 0; } -__setup("iommu=", check_iommu); +#else +#define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0) +#define dma_release_coherent(dev, order, vaddr) (0) +#endif /* CONFIG_X86_32 */ + +int dma_supported(struct device *dev, u64 mask) +{ +#ifdef CONFIG_PCI + if (mask > 0xffffffff && forbid_dac > 0) { + printk(KERN_INFO "PCI: Disallowing DAC for device %s\n", + dev->bus_id); + return 0; + } #endif -dma_addr_t -dma_map_single(struct device *dev, void *ptr, size_t size, - enum dma_data_direction direction) + if (dma_ops->dma_supported) + return dma_ops->dma_supported(dev, mask); + + /* Copied from i386. Doesn't make much sense, because it will + only work for pci_alloc_coherent. + The caller just has to use GFP_DMA in this case. */ + if (mask < DMA_24BIT_MASK) + return 0; + + /* Tell the device to use SAC when IOMMU force is on. This + allows the driver to use cheaper accesses in some cases. + + Problem with this is that if we overflow the IOMMU area and + return DAC as fallback address the device may not handle it + correctly. + + As a special case some controllers have a 39bit address + mode that is as efficient as 32bit (aic79xx). Don't force + SAC for these. Assume all masks <= 40 bits are of this + type. Normally this doesn't make any difference, but gives + more gentle handling of IOMMU overflow. */ + if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) { + printk(KERN_INFO "%s: Force SAC with mask %Lx\n", + dev->bus_id, mask); + return 0; + } + + return 1; +} +EXPORT_SYMBOL(dma_supported); + +/* Allocate DMA memory on node near device */ +static struct page * +dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order) { - dma_addr_t dma; + int node; - BUG_ON(!valid_dma_direction(direction)); - WARN_ON(size == 0); + node = dev_to_node(dev); - if (swiotlb) { - dma = swiotlb_map_single(dev, ptr, size, direction); - } else { - dma = gnttab_dma_map_page(virt_to_page(ptr)) + - offset_in_page(ptr); - IOMMU_BUG_ON(range_straddles_page_boundary(__pa(ptr), size)); - IOMMU_BUG_ON(address_needs_mapping(dev, dma)); - } - - flush_write_buffers(); - return dma; -} -EXPORT_SYMBOL(dma_map_single); - -void -dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, - enum dma_data_direction direction) -{ - BUG_ON(!valid_dma_direction(direction)); - if (swiotlb) - swiotlb_unmap_single(dev, dma_addr, size, direction); - else - gnttab_dma_unmap_page(dma_addr); + return alloc_pages_node(node, gfp, order); } -EXPORT_SYMBOL(dma_unmap_single); -void -dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction) +/* + * Allocate memory for a coherent mapping. + */ +void * +dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, + gfp_t gfp) +{ + void *memory = NULL; + struct page *page; + unsigned long dma_mask = 0; + int noretry = 0; + unsigned int order = get_order(size); + + /* ignore region specifiers */ + gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + + if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory)) + return memory; + + if (!dev) { + dev = &fallback_dev; + gfp |= GFP_DMA; + } + dma_mask = dev->coherent_dma_mask; + if (dma_mask == 0) + dma_mask = (gfp & GFP_DMA) ? 
DMA_24BIT_MASK : DMA_32BIT_MASK; + + /* Device not DMA able */ + if (dev->dma_mask == NULL) + return NULL; + +#ifdef CONFIG_XEN + gfp &= ~(__GFP_DMA | __GFP_DMA32); +#else + /* Don't invoke OOM killer or retry in lower 16MB DMA zone */ + if (gfp & __GFP_DMA) + noretry = 1; + +#ifdef CONFIG_X86_64 + /* Why <=? Even when the mask is smaller than 4GB it is often + larger than 16MB and in this case we have a chance of + finding fitting memory in the next higher zone first. If + not retry with true GFP_DMA. -AK */ + if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) { + gfp |= GFP_DMA32; +#endif + + again: +#endif + page = dma_alloc_pages(dev, + noretry ? gfp | __GFP_NORETRY : gfp, order); + if (page == NULL) + return NULL; + +#ifndef CONFIG_XEN + { + int high, mmu; + dma_addr_t bus = page_to_phys(page); + memory = page_address(page); + high = (bus + size) >= dma_mask; + mmu = high; + if (force_iommu && !(gfp & GFP_DMA)) + mmu = 1; + else if (high) { + free_pages((unsigned long)memory, order); + + /* Don't use the 16MB ZONE_DMA unless absolutely + needed. It's better to use remapping first. */ + if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) { + gfp = (gfp & ~GFP_DMA32) | GFP_DMA; + goto again; + } + + /* Let low level make its own zone decisions */ + gfp &= ~(GFP_DMA32|GFP_DMA); + + if (dma_ops->alloc_coherent) + return dma_ops->alloc_coherent(dev, size, + dma_handle, gfp); + return NULL; + } + + memset(memory, 0, size); + if (!mmu) { + *dma_handle = bus; + return memory; + } + } + + if (dma_ops->alloc_coherent) { + free_pages((unsigned long)memory, order); + gfp &= ~(GFP_DMA|GFP_DMA32); + return dma_ops->alloc_coherent(dev, size, dma_handle, gfp); + } + + if (dma_ops->map_simple) { + *dma_handle = dma_ops->map_simple(dev, virt_to_bus(memory), + size, + PCI_DMA_BIDIRECTIONAL); + if (*dma_handle != bad_dma_address) + return memory; + } +#else + memory = page_address(page); + if (xen_create_contiguous_region((unsigned long)memory, order, + fls64(dma_mask)) == 0) { + memset(memory, 0, size); + *dma_handle = virt_to_bus(memory); + return memory; + } +#endif + + if (panic_on_overflow) + panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n", + (unsigned long)size); + free_pages((unsigned long)memory, order); + return NULL; +} +EXPORT_SYMBOL(dma_alloc_coherent); + +/* + * Unmap coherent memory. + * The caller must ensure that the device has finished accessing the mapping. 
+ */ +void dma_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t bus) { - if (swiotlb) - swiotlb_sync_single_for_cpu(dev, dma_handle, size, direction); + int order = get_order(size); + WARN_ON(irqs_disabled()); /* for portability */ + if (dma_release_coherent(dev, order, vaddr)) + return; +#ifndef CONFIG_XEN + if (dma_ops->unmap_single) + dma_ops->unmap_single(dev, bus, size, 0); +#endif + xen_destroy_contiguous_region((unsigned long)vaddr, order); + free_pages((unsigned long)vaddr, order); } -EXPORT_SYMBOL(dma_sync_single_for_cpu); +EXPORT_SYMBOL(dma_free_coherent); -void -dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction) +static int __init pci_iommu_init(void) { - if (swiotlb) - swiotlb_sync_single_for_device(dev, dma_handle, size, direction); +#ifdef CONFIG_CALGARY_IOMMU + calgary_iommu_init(); +#endif + + intel_iommu_init(); + +#ifdef CONFIG_GART_IOMMU + gart_iommu_init(); +#endif + + no_iommu_init(); + return 0; } -EXPORT_SYMBOL(dma_sync_single_for_device); -void -dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, - enum dma_data_direction direction) +void pci_iommu_shutdown(void) { - if (swiotlb) - swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction); - flush_write_buffers(); + gart_iommu_shutdown(); } -EXPORT_SYMBOL(dma_sync_sg_for_cpu); +/* Must execute after PCI subsystem */ +fs_initcall(pci_iommu_init); + +#ifdef CONFIG_PCI +/* Many VIA bridges seem to corrupt data for DAC. Disable it here */ -void -dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, - enum dma_data_direction direction) +static __devinit void via_no_dac(struct pci_dev *dev) { - if (swiotlb) - swiotlb_sync_sg_for_device(dev,sg,nelems,direction); - flush_write_buffers(); + if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { + printk(KERN_INFO "PCI: VIA PCI bridge detected." + "Disabling DAC.\n"); + forbid_dac = 1; + } } -EXPORT_SYMBOL(dma_sync_sg_for_device); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac); +#endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ head-2011-03-11/arch/x86/kernel/pci-nommu-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -0,0 +1,108 @@ +#include <linux/dma-mapping.h> +#include <linux/dmar.h> +#include <linux/bootmem.h> +#include <linux/pci.h> + +#include <xen/gnttab.h> + +#include <asm/proto.h> +#include <asm/dma.h> +#include <asm/swiotlb.h> +#include <asm/tlbflush.h> +#include <asm/gnttab_dma.h> +#include <asm/bug.h> + +#define IOMMU_BUG_ON(test) \ +do { \ + if (unlikely(test)) { \ + printk(KERN_ALERT "Fatal DMA error! 
" \ + "Please use 'swiotlb=force'\n"); \ + BUG(); \ + } \ +} while (0) + +static int +gnttab_map_sg(struct device *hwdev, struct scatterlist *sgl, int nents, + int direction) +{ + unsigned int i; + struct scatterlist *sg; + + WARN_ON(nents == 0 || sgl->length == 0); + + for_each_sg(sgl, sg, nents, i) { + BUG_ON(!sg_page(sg)); + sg->dma_address = + gnttab_dma_map_page(sg_page(sg)) + sg->offset; + sg->dma_length = sg->length; + IOMMU_BUG_ON(address_needs_mapping( + hwdev, sg->dma_address)); + IOMMU_BUG_ON(range_straddles_page_boundary( + page_to_pseudophys(sg_page(sg)) + sg->offset, + sg->length)); + } + + return nents; +} + +static void +gnttab_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nents, + int direction) +{ + unsigned int i; + struct scatterlist *sg; + + for_each_sg(sgl, sg, nents, i) + gnttab_dma_unmap_page(sg->dma_address); +} + +static dma_addr_t +gnttab_map_single(struct device *dev, phys_addr_t paddr, size_t size, + int direction) +{ + dma_addr_t dma; + + WARN_ON(size == 0); + + dma = gnttab_dma_map_page(pfn_to_page(paddr >> PAGE_SHIFT)) + + offset_in_page(paddr); + IOMMU_BUG_ON(range_straddles_page_boundary(paddr, size)); + IOMMU_BUG_ON(address_needs_mapping(dev, dma)); + + return dma; +} + +static void +gnttab_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, + int direction) +{ + gnttab_dma_unmap_page(dma_addr); +} + +static int nommu_dma_supported(struct device *hwdev, u64 mask) +{ + return 1; +} + +static int nommu_mapping_error(dma_addr_t dma_addr) +{ + return (dma_addr == bad_dma_address); +} + +static const struct dma_mapping_ops nommu_dma_ops = { + .map_single = gnttab_map_single, + .unmap_single = gnttab_unmap_single, + .map_sg = gnttab_map_sg, + .unmap_sg = gnttab_unmap_sg, + .dma_supported = nommu_dma_supported, + .mapping_error = nommu_mapping_error +}; + +void __init no_iommu_init(void) +{ + if (dma_ops) + return; + + force_iommu = 0; /* no HW IOMMU */ + dma_ops = &nommu_dma_ops; +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ head-2011-03-11/arch/x86/kernel/process-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -0,0 +1,188 @@ +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/module.h> +#include <linux/pm.h> + +struct kmem_cache *task_xstate_cachep; + +int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) +{ + *dst = *src; + if (src->thread.xstate) { + dst->thread.xstate = kmem_cache_alloc(task_xstate_cachep, + GFP_KERNEL); + if (!dst->thread.xstate) + return -ENOMEM; + WARN_ON((unsigned long)dst->thread.xstate & 15); + memcpy(dst->thread.xstate, src->thread.xstate, xstate_size); + } + return 0; +} + +void free_thread_xstate(struct task_struct *tsk) +{ + if (tsk->thread.xstate) { + kmem_cache_free(task_xstate_cachep, tsk->thread.xstate); + tsk->thread.xstate = NULL; + } +} + +void free_thread_info(struct thread_info *ti) +{ + free_thread_xstate(ti->task); + free_pages((unsigned long)ti, get_order(THREAD_SIZE)); +} + +void arch_task_cache_init(void) +{ + task_xstate_cachep = + kmem_cache_create("task_xstate", xstate_size, + __alignof__(union thread_xstate), + SLAB_PANIC, NULL); +} + +static void do_nothing(void *unused) +{ +} + +/* + * cpu_idle_wait - Used to ensure that all the CPUs discard old value of + * pm_idle and update to new pm_idle value. Required while changing pm_idle + * handler on SMP systems. + * + * Caller must have changed pm_idle to the new value before the call. 
Old + * pm_idle value will not be used by any CPU after the return of this function. + */ +void cpu_idle_wait(void) +{ + smp_mb(); + /* kick all the CPUs so that they exit out of pm_idle */ + smp_call_function(do_nothing, NULL, 0, 1); +} +EXPORT_SYMBOL_GPL(cpu_idle_wait); + +#ifndef CONFIG_XEN +/* + * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, + * which can obviate IPI to trigger checking of need_resched. + * We execute MONITOR against need_resched and enter optimized wait state + * through MWAIT. Whenever someone changes need_resched, we would be woken + * up from MWAIT (without an IPI). + * + * New with Core Duo processors, MWAIT can take some hints based on CPU + * capability. + */ +void mwait_idle_with_hints(unsigned long ax, unsigned long cx) +{ + if (!need_resched()) { + __monitor((void *)¤t_thread_info()->flags, 0, 0); + smp_mb(); + if (!need_resched()) + __mwait(ax, cx); + } +} + +/* Default MONITOR/MWAIT with no hints, used for default C1 state */ +static void mwait_idle(void) +{ + if (!need_resched()) { + __monitor((void *)¤t_thread_info()->flags, 0, 0); + smp_mb(); + if (!need_resched()) + __sti_mwait(0, 0); + else + local_irq_enable(); + } else + local_irq_enable(); +} +#endif + +/* + * On SMP it's slightly faster (but much more power-consuming!) + * to poll the ->work.need_resched flag instead of waiting for the + * cross-CPU IPI to arrive. Use this option with caution. + */ +static void poll_idle(void) +{ + local_irq_enable(); + cpu_relax(); +} + +#ifndef CONFIG_XEN +/* + * mwait selection logic: + * + * It depends on the CPU. For AMD CPUs that support MWAIT this is + * wrong. Family 0x10 and 0x11 CPUs will enter C1 on HLT. Powersavings + * then depend on a clock divisor and current Pstate of the core. If + * all cores of a processor are in halt state (C1) the processor can + * enter the C1E (C1 enhanced) state. If mwait is used this will never + * happen. + * + * idle=mwait overrides this decision and forces the usage of mwait. + */ +static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) +{ + if (force_mwait) + return 1; + + if (c->x86_vendor == X86_VENDOR_AMD) { + switch(c->x86) { + case 0x10: + case 0x11: + return 0; + } + } + return 1; +} +#endif + +void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) +{ +#ifndef CONFIG_XEN + static int selected; + + if (selected) + return; +#ifdef CONFIG_X86_SMP + if (pm_idle == poll_idle && smp_num_siblings > 1) { + printk(KERN_WARNING "WARNING: polling idle and HT enabled," + " performance may degrade.\n"); + } +#endif + if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) { + /* + * Skip, if setup has overridden idle. 
+ * One CPU supports mwait => All CPUs supports mwait + */ + if (!pm_idle) { + printk(KERN_INFO "using mwait in idle threads.\n"); + pm_idle = mwait_idle; + } + } + selected = 1; +#endif +} + +static int __init idle_setup(char *str) +{ + if (!strcmp(str, "poll")) { + printk("using polling idle threads.\n"); + pm_idle = poll_idle; + } +#ifndef CONFIG_XEN + else if (!strcmp(str, "mwait")) + force_mwait = 1; +#endif + else + return -1; + + boot_option_idle_override = 1; + return 0; +} +early_param("idle", idle_setup); + --- head-2011-03-11.orig/arch/x86/kernel/process_32-xen.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/process_32-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -36,6 +36,7 @@ #include <linux/personality.h> #include <linux/tick.h> #include <linux/percpu.h> +#include <linux/prctl.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -45,7 +46,6 @@ #include <asm/processor.h> #include <asm/i387.h> #include <asm/desc.h> -#include <asm/vm86.h> #ifdef CONFIG_MATH_EMULATION #include <asm/math_emu.h> #endif @@ -102,16 +102,6 @@ void enable_hlt(void) EXPORT_SYMBOL(enable_hlt); -/* - * On SMP it's slightly faster (but much more power-consuming!) - * to poll the ->work.need_resched flag instead of waiting for the - * cross-CPU IPI to arrive. Use this option with caution. - */ -static void poll_idle(void) -{ - cpu_relax(); -} - static void xen_idle(void) { current_thread_info()->status &= ~TS_POLLING; @@ -121,20 +111,10 @@ static void xen_idle(void) */ smp_mb(); - local_irq_disable(); - if (!need_resched()) { - ktime_t t0, t1; - u64 t0n, t1n; - - t0 = ktime_get(); - t0n = ktime_to_ns(t0); + if (!need_resched()) safe_halt(); /* enables interrupts racelessly */ - local_irq_disable(); - t1 = ktime_get(); - t1n = ktime_to_ns(t1); - sched_clock_idle_wakeup_event(t1n - t0n); - } - local_irq_enable(); + else + local_irq_enable(); current_thread_info()->status |= TS_POLLING; } #ifdef CONFIG_APM_MODULE @@ -142,7 +122,6 @@ EXPORT_SYMBOL(default_idle); #endif #ifdef CONFIG_HOTPLUG_CPU -extern cpumask_t cpu_initialized; static inline void play_dead(void) { idle_task_exit(); @@ -187,6 +166,7 @@ void cpu_idle(void) if (cpu_is_offline(cpu)) play_dead(); + local_irq_disable(); __get_cpu_var(irq_stat).idle_timestamp = jiffies; idle(); } @@ -197,44 +177,6 @@ void cpu_idle(void) } } -static void do_nothing(void *unused) -{ -} - -/* - * cpu_idle_wait - Used to ensure that all the CPUs discard old value of - * pm_idle and update to new pm_idle value. Required while changing pm_idle - * handler on SMP systems. - * - * Caller must have changed pm_idle to the new value before the call. Old - * pm_idle value will not be used by any CPU after the return of this function. 
- */ -void cpu_idle_wait(void) -{ - smp_mb(); - /* kick all the CPUs so that they exit out of pm_idle */ - smp_call_function(do_nothing, NULL, 0, 1); -} -EXPORT_SYMBOL_GPL(cpu_idle_wait); - -void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) -{ -} - -static int __init idle_setup(char *str) -{ - if (!strcmp(str, "poll")) { - printk("using polling idle threads.\n"); - pm_idle = poll_idle; - } - else - return -1; - - boot_option_idle_override = 1; - return 0; -} -early_param("idle", idle_setup); - void __show_registers(struct pt_regs *regs, int all) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; @@ -260,7 +202,7 @@ void __show_registers(struct pt_regs *re init_utsname()->version); printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", - 0xffff & regs->cs, regs->ip, regs->flags, + (u16)regs->cs, regs->ip, regs->flags, smp_processor_id()); print_symbol("EIP is at %s\n", regs->ip); @@ -269,8 +211,7 @@ void __show_registers(struct pt_regs *re printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", regs->si, regs->di, regs->bp, sp); printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", - regs->ds & 0xffff, regs->es & 0xffff, - regs->fs & 0xffff, gs, ss); + (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss); if (!all) return; @@ -367,6 +308,7 @@ void flush_thread(void) /* * Forget coprocessor state.. */ + tsk->fpu_counter = 0; clear_fpu(tsk); clear_used_math(); } @@ -437,11 +379,30 @@ int copy_thread(int nr, unsigned long cl return err; } -#ifdef CONFIG_SECCOMP +void +start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) +{ + __asm__("movl %0, %%gs" :: "r"(0)); + regs->fs = 0; + set_fs(USER_DS); + regs->ds = __USER_DS; + regs->es = __USER_DS; + regs->ss = __USER_DS; + regs->cs = __USER_CS; + regs->ip = new_ip; + regs->sp = new_sp; + /* + * Free the old FP and other extended state + */ + free_thread_xstate(current); +} +EXPORT_SYMBOL_GPL(start_thread); + static void hard_disable_TSC(void) { write_cr4(read_cr4() | X86_CR4_TSD); } + void disable_TSC(void) { preempt_disable(); @@ -453,11 +414,47 @@ void disable_TSC(void) hard_disable_TSC(); preempt_enable(); } + static void hard_enable_TSC(void) { write_cr4(read_cr4() & ~X86_CR4_TSD); } -#endif /* CONFIG_SECCOMP */ + +static void enable_TSC(void) +{ + preempt_disable(); + if (test_and_clear_thread_flag(TIF_NOTSC)) + /* + * Must flip the CPU state synchronously with + * TIF_NOTSC in the current running context. 
+ */ + hard_enable_TSC(); + preempt_enable(); +} + +int get_tsc_mode(unsigned long adr) +{ + unsigned int val; + + if (test_thread_flag(TIF_NOTSC)) + val = PR_TSC_SIGSEGV; + else + val = PR_TSC_ENABLE; + + return put_user(val, (unsigned int __user *)adr); +} + +int set_tsc_mode(unsigned int val) +{ + if (val == PR_TSC_SIGSEGV) + disable_TSC(); + else if (val == PR_TSC_ENABLE) + enable_TSC(); + else + return -EINVAL; + + return 0; +} static noinline void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) @@ -473,12 +470,12 @@ __switch_to_xtra(struct task_struct *pre /* we clear debugctl to make sure DS * is not in use when we change it */ debugctl = 0; - wrmsrl(MSR_IA32_DEBUGCTLMSR, 0); + update_debugctlmsr(0); wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0); } if (next->debugctlmsr != debugctl) - wrmsr(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr, 0); + update_debugctlmsr(next->debugctlmsr); if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { set_debugreg(next->debugreg0, 0); @@ -490,7 +487,6 @@ __switch_to_xtra(struct task_struct *pre set_debugreg(next->debugreg7, 7); } -#ifdef CONFIG_SECCOMP if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ test_tsk_thread_flag(next_p, TIF_NOTSC)) { /* prev and next are different */ @@ -499,7 +495,6 @@ __switch_to_xtra(struct task_struct *pre else hard_enable_TSC(); } -#endif #ifdef X86_BTS if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) @@ -637,7 +632,7 @@ struct task_struct * __switch_to(struct /* we're going to use this soon, after a few expensive things */ if (next_p->fpu_counter > 5) - prefetch(&next->i387.fxsave); + prefetch(next->xstate); /* * Now maybe handle debug registers @@ -658,8 +653,11 @@ struct task_struct * __switch_to(struct /* If the task has used fpu the last 5 timeslices, just do a full * restore of the math state immediately to avoid the trap; the * chances of needing FPU soon are obviously high now + * + * tsk_used_math() checks prevent calling math_state_restore(), + * which can sleep in the case of !tsk_used_math() */ - if (next_p->fpu_counter > 5) + if (tsk_used_math(next_p) && next_p->fpu_counter > 5) math_state_restore(); /* --- head-2011-03-11.orig/arch/x86/kernel/process_64-xen.c 2011-02-02 08:32:11.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/process_64-xen.c 2011-02-02 08:32:46.000000000 +0100 @@ -39,6 +39,7 @@ #include <linux/kprobes.h> #include <linux/kdebug.h> #include <linux/tick.h> +#include <linux/prctl.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -101,17 +102,6 @@ void exit_idle(void) __exit_idle(); } -/* - * On SMP it's slightly faster (but much more power-consuming!) - * to poll the ->need_resched flag instead of waiting for the - * cross-CPU IPI to arrive. Use this option with caution. 
- */ -static void poll_idle(void) -{ - local_irq_enable(); - cpu_relax(); -} - static void xen_idle(void) { current_thread_info()->status &= ~TS_POLLING; @@ -120,20 +110,10 @@ static void xen_idle(void) * test NEED_RESCHED: */ smp_mb(); - local_irq_disable(); - if (!need_resched()) { - ktime_t t0, t1; - u64 t0n, t1n; - - t0 = ktime_get(); - t0n = ktime_to_ns(t0); + if (!need_resched()) safe_halt(); /* enables interrupts racelessly */ - local_irq_disable(); - t1 = ktime_get(); - t1n = ktime_to_ns(t1); - sched_clock_idle_wakeup_event(t1n - t0n); - } - local_irq_enable(); + else + local_irq_enable(); current_thread_info()->status |= TS_POLLING; } @@ -194,45 +174,6 @@ void cpu_idle(void) } } -static void do_nothing(void *unused) -{ -} - -/* - * cpu_idle_wait - Used to ensure that all the CPUs discard old value of - * pm_idle and update to new pm_idle value. Required while changing pm_idle - * handler on SMP systems. - * - * Caller must have changed pm_idle to the new value before the call. Old - * pm_idle value will not be used by any CPU after the return of this function. - */ -void cpu_idle_wait(void) -{ - smp_mb(); - /* kick all the CPUs so that they exit out of pm_idle */ - smp_call_function(do_nothing, NULL, 0, 1); -} -EXPORT_SYMBOL_GPL(cpu_idle_wait); - -void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) -{ -} - -static int __init idle_setup(char *str) -{ - if (!strcmp(str, "poll")) { - printk("using polling idle threads.\n"); - pm_idle = poll_idle; - } else if (!strcmp(str, "mwait")) - force_mwait = 1; - else - return -1; - - boot_option_idle_override = 1; - return 0; -} -early_param("idle", idle_setup); - /* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs * regs) { @@ -359,6 +300,7 @@ void flush_thread(void) /* * Forget coprocessor state.. */ + tsk->fpu_counter = 0; clear_fpu(tsk); clear_used_math(); } @@ -470,6 +412,82 @@ out: return err; } +void +start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) +{ + asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0)); + load_gs_index(0); + regs->ip = new_ip; + regs->sp = new_sp; + regs->cs = __USER_CS; + regs->ss = __USER_DS; + regs->flags = 0x200; + set_fs(USER_DS); + /* + * Free the old FP and other extended state + */ + free_thread_xstate(current); +} +EXPORT_SYMBOL_GPL(start_thread); + +static void hard_disable_TSC(void) +{ + write_cr4(read_cr4() | X86_CR4_TSD); +} + +void disable_TSC(void) +{ + preempt_disable(); + if (!test_and_set_thread_flag(TIF_NOTSC)) + /* + * Must flip the CPU state synchronously with + * TIF_NOTSC in the current running context. + */ + hard_disable_TSC(); + preempt_enable(); +} + +static void hard_enable_TSC(void) +{ + write_cr4(read_cr4() & ~X86_CR4_TSD); +} + +static void enable_TSC(void) +{ + preempt_disable(); + if (test_and_clear_thread_flag(TIF_NOTSC)) + /* + * Must flip the CPU state synchronously with + * TIF_NOTSC in the current running context. 
+ */ + hard_enable_TSC(); + preempt_enable(); +} + +int get_tsc_mode(unsigned long adr) +{ + unsigned int val; + + if (test_thread_flag(TIF_NOTSC)) + val = PR_TSC_SIGSEGV; + else + val = PR_TSC_ENABLE; + + return put_user(val, (unsigned int __user *)adr); +} + +int set_tsc_mode(unsigned int val) +{ + if (val == PR_TSC_SIGSEGV) + disable_TSC(); + else if (val == PR_TSC_ENABLE) + enable_TSC(); + else + return -EINVAL; + + return 0; +} + /* * This special macro can be used to load a debugging register */ @@ -489,12 +507,12 @@ static inline void __switch_to_xtra(stru /* we clear debugctl to make sure DS * is not in use when we change it */ debugctl = 0; - wrmsrl(MSR_IA32_DEBUGCTLMSR, 0); + update_debugctlmsr(0); wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr); } if (next->debugctlmsr != debugctl) - wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr); + update_debugctlmsr(next->debugctlmsr); if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { loaddebug(next, 0); @@ -506,6 +524,15 @@ static inline void __switch_to_xtra(stru loaddebug(next, 7); } + if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ + test_tsk_thread_flag(next_p, TIF_NOTSC)) { + /* prev and next are different */ + if (test_tsk_thread_flag(next_p, TIF_NOTSC)) + hard_disable_TSC(); + else + hard_enable_TSC(); + } + #ifdef X86_BTS if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); @@ -545,7 +572,7 @@ __switch_to(struct task_struct *prev_p, /* we're going to use this soon, after a few expensive things */ if (next_p->fpu_counter>5) - prefetch(&next->i387.fxsave); + prefetch(next->xstate); /* * This is basically '__unlazy_fpu', except that we queue a @@ -676,8 +703,11 @@ __switch_to(struct task_struct *prev_p, /* If the task has used fpu the last 5 timeslices, just do a full * restore of the math state immediately to avoid the trap; the * chances of needing FPU soon are obviously high now + * + * tsk_used_math() checks prevent calling math_state_restore(), + * which can sleep in the case of !tsk_used_math() */ - if (next_p->fpu_counter>5) + if (tsk_used_math(next_p) && next_p->fpu_counter > 5) math_state_restore(); return prev_p; } --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ head-2011-03-11/arch/x86/kernel/setup-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -0,0 +1,143 @@ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/percpu.h> +#include <asm/smp.h> +#include <asm/percpu.h> +#include <asm/sections.h> +#include <asm/processor.h> +#include <asm/setup.h> +#include <asm/topology.h> +#include <asm/mpspec.h> +#include <asm/apicdef.h> + +#ifdef CONFIG_X86_LOCAL_APIC +unsigned int num_processors; +unsigned disabled_cpus __cpuinitdata; +#ifndef CONFIG_XEN +/* Processor that is doing the boot up */ +unsigned int boot_cpu_physical_apicid = -1U; +EXPORT_SYMBOL(boot_cpu_physical_apicid); + +DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID; +EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid); + +/* Bitmask of physically existing CPUs */ +physid_mask_t phys_cpu_present_map; +#endif +#endif + +#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP) +/* + * Copy data used in early init routines from the initial arrays to the + * per cpu data areas. These arrays then become expendable and the + * *_early_ptr's are zeroed indicating that the static arrays are gone. 
+ */ +static void __init setup_per_cpu_maps(void) +{ +#ifndef CONFIG_XEN + int cpu; + + for_each_possible_cpu(cpu) { + per_cpu(x86_cpu_to_apicid, cpu) = x86_cpu_to_apicid_init[cpu]; + per_cpu(x86_bios_cpu_apicid, cpu) = + x86_bios_cpu_apicid_init[cpu]; +#ifdef CONFIG_NUMA + per_cpu(x86_cpu_to_node_map, cpu) = + x86_cpu_to_node_map_init[cpu]; +#endif + } + + /* indicate the early static arrays will soon be gone */ + x86_cpu_to_apicid_early_ptr = NULL; + x86_bios_cpu_apicid_early_ptr = NULL; +#ifdef CONFIG_NUMA + x86_cpu_to_node_map_early_ptr = NULL; +#endif +#endif +} + +#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP +cpumask_t *cpumask_of_cpu_map __read_mostly; +EXPORT_SYMBOL(cpumask_of_cpu_map); + +/* requires nr_cpu_ids to be initialized */ +static void __init setup_cpumask_of_cpu(void) +{ + int i; + + /* alloc_bootmem zeroes memory */ + cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids); + for (i = 0; i < nr_cpu_ids; i++) + cpu_set(i, cpumask_of_cpu_map[i]); +} +#else +static inline void setup_cpumask_of_cpu(void) { } +#endif + +#ifdef CONFIG_X86_32 +/* + * Great future not-so-futuristic plan: make i386 and x86_64 do it + * the same way + */ +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(__per_cpu_offset); +#endif + +/* + * Great future plan: + * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. + * Always point %gs to its beginning + */ +void __init setup_per_cpu_areas(void) +{ + int i, highest_cpu = 0; + unsigned long size; + +#ifdef CONFIG_HOTPLUG_CPU + prefill_possible_map(); +#endif + + /* Copy section for each CPU (we discard the original) */ + size = PERCPU_ENOUGH_ROOM; + printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", + size); + + for_each_possible_cpu(i) { + char *ptr; +#ifndef CONFIG_NEED_MULTIPLE_NODES + ptr = alloc_bootmem_pages(size); +#else + int node = early_cpu_to_node(i); + if (!node_online(node) || !NODE_DATA(node)) { + ptr = alloc_bootmem_pages(size); + printk(KERN_INFO + "cpu %d has no node or node-local memory\n", i); + } + else + ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); +#endif + if (!ptr) + panic("Cannot allocate cpu data for CPU %d\n", i); +#ifdef CONFIG_X86_64 + cpu_pda(i)->data_offset = ptr - __per_cpu_start; +#else + __per_cpu_offset[i] = ptr - __per_cpu_start; +#endif + memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); + + highest_cpu = i; + } + + nr_cpu_ids = highest_cpu + 1; + printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d\n", NR_CPUS, nr_cpu_ids); + + /* Setup percpu data maps */ + setup_per_cpu_maps(); + + /* Setup cpumask_of_cpu map */ + setup_cpumask_of_cpu(); +} + +#endif --- head-2011-03-11.orig/arch/x86/kernel/setup64-xen.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/setup64-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -15,6 +15,7 @@ #include <linux/bootmem.h> #include <linux/bitops.h> #include <linux/module.h> +#include <linux/kgdb.h> #include <asm/pda.h> #include <asm/pgtable.h> #include <asm/processor.h> @@ -27,6 +28,7 @@ #include <asm/proto.h> #include <asm/sections.h> #include <asm/setup.h> +#include <asm/genapic.h> #ifdef CONFIG_XEN #include <asm/hypervisor.h> #endif @@ -81,8 +83,8 @@ int force_personality32 = 0; Control non executable heap for 32bit processes. 
To control the stack too use noexec=off -on PROT_READ does not imply PROT_EXEC for 32bit processes -off PROT_READ implies PROT_EXEC (default) +on PROT_READ does not imply PROT_EXEC for 32bit processes (default) +off PROT_READ implies PROT_EXEC */ static int __init nonx32_setup(char *str) { @@ -94,85 +96,6 @@ static int __init nonx32_setup(char *str } __setup("noexec32=", nonx32_setup); -/* - * Copy data used in early init routines from the initial arrays to the - * per cpu data areas. These arrays then become expendable and the - * *_early_ptr's are zeroed indicating that the static arrays are gone. - */ -static void __init setup_per_cpu_maps(void) -{ -#ifndef CONFIG_XEN - int cpu; - - for_each_possible_cpu(cpu) { -#ifdef CONFIG_SMP - if (per_cpu_offset(cpu)) { -#endif - per_cpu(x86_cpu_to_apicid, cpu) = - x86_cpu_to_apicid_init[cpu]; - per_cpu(x86_bios_cpu_apicid, cpu) = - x86_bios_cpu_apicid_init[cpu]; -#ifdef CONFIG_NUMA - per_cpu(x86_cpu_to_node_map, cpu) = - x86_cpu_to_node_map_init[cpu]; -#endif -#ifdef CONFIG_SMP - } - else - printk(KERN_NOTICE "per_cpu_offset zero for cpu %d\n", - cpu); -#endif - } - - /* indicate the early static arrays will soon be gone */ - x86_cpu_to_apicid_early_ptr = NULL; - x86_bios_cpu_apicid_early_ptr = NULL; -#ifdef CONFIG_NUMA - x86_cpu_to_node_map_early_ptr = NULL; -#endif -#endif -} - -/* - * Great future plan: - * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. - * Always point %gs to its beginning - */ -void __init setup_per_cpu_areas(void) -{ - int i; - unsigned long size; - -#ifdef CONFIG_HOTPLUG_CPU - prefill_possible_map(); -#endif - - /* Copy section for each CPU (we discard the original) */ - size = PERCPU_ENOUGH_ROOM; - - printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size); - for_each_cpu_mask (i, cpu_possible_map) { - char *ptr; -#ifndef CONFIG_NEED_MULTIPLE_NODES - ptr = alloc_bootmem_pages(size); -#else - int node = early_cpu_to_node(i); - - if (!node_online(node) || !NODE_DATA(node)) - ptr = alloc_bootmem_pages(size); - else - ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); -#endif - if (!ptr) - panic("Cannot allocate cpu data for CPU %d\n", i); - cpu_pda(i)->data_offset = ptr - __per_cpu_start; - memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); - } - - /* setup percpu data maps early */ - setup_per_cpu_maps(); -} - #ifdef CONFIG_XEN static void __init_refok switch_pt(int cpu) { @@ -410,6 +333,17 @@ void __cpuinit cpu_init (void) #endif load_LDT(&init_mm.context); +#ifdef CONFIG_KGDB + /* + * If the kgdb is connected no debug regs should be altered. This + * is only applicable when KGDB and a KGDB I/O module are built + * into the kernel and you are using early debugging with + * kgdbwait. KGDB will control the kernel HW breakpoint registers. + */ + if (kgdb_connected && arch_kgdb_ops.correct_hw_break) + arch_kgdb_ops.correct_hw_break(); + else { +#endif /* * Clear all 6 debug registers: */ @@ -420,10 +354,17 @@ void __cpuinit cpu_init (void) set_debugreg(0UL, 3); set_debugreg(0UL, 6); set_debugreg(0UL, 7); +#ifdef CONFIG_KGDB + /* If the kgdb is connected no debug regs should be altered. 
*/ + } +#endif fpu_init(); asm ("pushfq; popq %0" : "=rm" (kernel_eflags)); if (raw_irqs_disabled()) kernel_eflags &= ~X86_EFLAGS_IF; + + if (is_uv_system()) + uv_cpu_init(); } --- head-2011-03-11.orig/arch/x86/kernel/setup_32-xen.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/setup_32-xen.c 2011-03-04 15:07:31.000000000 +0100 @@ -39,6 +39,7 @@ #include <linux/efi.h> #include <linux/init.h> #include <linux/edd.h> +#include <linux/iscsi_ibft.h> #include <linux/nodemask.h> #include <linux/kernel.h> #include <linux/percpu.h> @@ -49,6 +50,7 @@ #include <linux/pfn.h> #include <linux/pci.h> #include <linux/init_ohci1394_dma.h> +#include <linux/kvm_para.h> #include <video/edid.h> @@ -70,8 +72,9 @@ #include <xen/firmware.h> #include <xen/xencons.h> #include <setup_arch.h> -#include <bios_ebda.h> +#include <asm/bios_ebda.h> #include <asm/cacheflush.h> +#include <asm/processor.h> #ifdef CONFIG_XEN #include <xen/interface/kexec.h> @@ -136,7 +139,12 @@ static struct resource standard_io_resou }, { .name = "keyboard", .start = 0x0060, - .end = 0x006f, + .end = 0x0060, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "keyboard", + .start = 0x0064, + .end = 0x0064, .flags = IORESOURCE_BUSY | IORESOURCE_IO }, { .name = "dma page reg", @@ -161,11 +169,13 @@ static struct resource standard_io_resou } }; /* cpu data as detected by the assembly code in head.S */ -struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; +struct cpuinfo_x86 new_cpu_data __cpuinitdata = { .wp_works_ok = 1 }; /* common cpu data for all cpus */ -struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; +struct cpuinfo_x86 boot_cpu_data __read_mostly = { .wp_works_ok = 1 }; EXPORT_SYMBOL(boot_cpu_data); +unsigned int def_to_bigsmp; + #ifndef CONFIG_X86_PAE unsigned long mmu_cr4_features; #else @@ -204,7 +214,7 @@ EXPORT_SYMBOL(ist_info); extern void early_cpu_init(void); extern int root_mountflags; -unsigned long saved_videomode; +unsigned long saved_video_mode; #define RAMDISK_IMAGE_START_MASK 0x07FF #define RAMDISK_PROMPT_FLAG 0x8000 @@ -259,7 +269,7 @@ static inline void copy_edd(void) } #endif -int __initdata user_defined_memmap = 0; +int __initdata user_defined_memmap; /* * "mem=nopentium" disables the 4MB page tables. @@ -420,20 +430,59 @@ unsigned long __init find_max_low_pfn(vo } #ifndef CONFIG_XEN +#define BIOS_LOWMEM_KILOBYTES 0x413 + /* - * workaround for Dell systems that neglect to reserve EBDA + * The BIOS places the EBDA/XBDA at the top of conventional + * memory, and usually decreases the reported amount of + * conventional memory (int 0x12) too. This also contains a + * workaround for Dell systems that neglect to reserve EBDA. + * The same workaround also avoids a problem with the AMD768MPX + * chipset: reserve a page before VGA to prevent PCI prefetch + * into it (errata #56). Usually the page is reserved anyways, + * unless you have no PS/2 mouse plugged in. */ static void __init reserve_ebda_region(void) { - unsigned int addr; - addr = get_bios_ebda(); - if (addr) - reserve_bootmem(addr, PAGE_SIZE, BOOTMEM_DEFAULT); + unsigned int lowmem, ebda_addr; + + /* To determine the position of the EBDA and the */ + /* end of conventional memory, we need to look at */ + /* the BIOS data area. In a paravirtual environment */ + /* that area is absent. We'll just have to assume */ + /* that the paravirt case can handle memory setup */ + /* correctly, without our help. 
*/ + if (paravirt_enabled()) + return; + + /* end of low (conventional) memory */ + lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES); + lowmem <<= 10; + + /* start of EBDA area */ + ebda_addr = get_bios_ebda(); + + /* Fixup: bios puts an EBDA in the top 64K segment */ + /* of conventional memory, but does not adjust lowmem. */ + if ((lowmem - ebda_addr) <= 0x10000) + lowmem = ebda_addr; + + /* Fixup: bios does not report an EBDA at all. */ + /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */ + if ((ebda_addr == 0) && (lowmem >= 0x9f000)) + lowmem = 0x9f000; + + /* Paranoia: should never happen, but... */ + if ((lowmem == 0) || (lowmem >= 0x100000)) + lowmem = 0x9f000; + + /* reserve all memory between lowmem and the 1MB mark */ + reserve_bootmem(lowmem, 0x100000 - lowmem, BOOTMEM_DEFAULT); } #endif #ifndef CONFIG_NEED_MULTIPLE_NODES -void __init setup_bootmem_allocator(void); +static void __init setup_bootmem_allocator(void); static unsigned long __init setup_memory(void) { /* @@ -469,7 +518,7 @@ static unsigned long __init setup_memory return max_low_pfn; } -void __init zone_sizes_init(void) +static void __init zone_sizes_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); @@ -523,10 +572,16 @@ static void __init reserve_crashkernel(v (unsigned long)(crash_size >> 20), (unsigned long)(crash_base >> 20), (unsigned long)(total_mem >> 20)); + + if (reserve_bootmem(crash_base, crash_size, + BOOTMEM_EXCLUSIVE) < 0) { + printk(KERN_INFO "crashkernel reservation " + "failed - memory is in use\n"); + return; + } + crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; - reserve_bootmem(crash_base, crash_size, - BOOTMEM_DEFAULT); } else printk(KERN_INFO "crashkernel reservation failed - " "you have to specify a base address\n"); @@ -660,16 +715,9 @@ void __init setup_bootmem_allocator(void */ reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT); - /* reserve EBDA region, it's a 4K region */ + /* reserve EBDA region */ reserve_ebda_region(); - /* could be an AMD 768MPX chipset. Reserve a page before VGA to prevent - PCI prefetch into it (errata #56). Usually the page is reserved anyways, - unless you have no PS/2 mouse plugged in. */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 == 6) - reserve_bootmem(0xa0000 - 4096, 4096, BOOTMEM_DEFAULT); - #ifdef CONFIG_SMP /* * But first pinch a few for the stack/trampoline stuff @@ -691,6 +739,8 @@ void __init setup_bootmem_allocator(void #endif numa_kva_reserve(); reserve_crashkernel(); + + reserve_ibft_region(); } /* @@ -726,6 +776,18 @@ char * __init __attribute__((weak)) memo return machine_specific_memory_setup(); } +#ifdef CONFIG_NUMA +/* + * In the golden day, when everything among i386 and x86_64 will be + * integrated, this will not live here + */ +void *x86_cpu_to_node_map_early_ptr; +int x86_cpu_to_node_map_init[NR_CPUS] = { + [0 ... NR_CPUS-1] = NUMA_NO_NODE +}; +DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE; +#endif + /* * Determine if we were loaded by an EFI loader. 
If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -775,7 +837,7 @@ void __init setup_arch(char **cmdline_p) copy_edid(); apm_info.bios = boot_params.apm_bios_info; ist_info = boot_params.ist_info; - saved_videomode = boot_params.hdr.vid_mode; + saved_video_mode = boot_params.hdr.vid_mode; if( boot_params.sys_desc_table.length != 0 ) { set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2); machine_id = boot_params.sys_desc_table.table[0]; @@ -842,15 +904,19 @@ void __init setup_arch(char **cmdline_p) efi_init(); /* update e820 for memory not covered by WB MTRRs */ - find_max_pfn(); + propagate_e820_map(); mtrr_bp_init(); #ifndef CONFIG_XEN if (mtrr_trim_uncached_memory(max_pfn)) - find_max_pfn(); + propagate_e820_map(); #endif max_low_pfn = setup_memory(); +#ifdef CONFIG_KVM_CLOCK + kvmclock_init(); +#endif + #ifdef CONFIG_VMI /* * Must be after max_low_pfn is determined, and before kernel @@ -858,6 +924,7 @@ void __init setup_arch(char **cmdline_p) */ vmi_init(); #endif + kvm_guest_init(); /* * NOTE: before this point _nobody_ is allowed to allocate @@ -979,6 +1046,18 @@ void __init setup_arch(char **cmdline_p) io_delay_init(); +#if defined(CONFIG_X86_SMP) && !defined(CONFIG_XEN) + /* + * setup to use the early static init tables during kernel startup + * X86_SMP will exclude sub-arches that don't deal well with it. + */ + x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init; + x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init; +#ifdef CONFIG_NUMA + x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init; +#endif +#endif + #ifdef CONFIG_X86_GENERICARCH generic_apic_probe(); #endif --- head-2011-03-11.orig/arch/x86/kernel/setup_64-xen.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/setup_64-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -29,18 +29,22 @@ #include <linux/crash_dump.h> #include <linux/root_dev.h> #include <linux/pci.h> +#include <asm/pci-direct.h> #include <linux/efi.h> #include <linux/acpi.h> #include <linux/kallsyms.h> #include <linux/edd.h> +#include <linux/iscsi_ibft.h> #include <linux/mmzone.h> #include <linux/kexec.h> #include <linux/cpufreq.h> #include <linux/dmi.h> #include <linux/dma-mapping.h> #include <linux/ctype.h> +#include <linux/sort.h> #include <linux/uaccess.h> #include <linux/init_ohci1394_dma.h> +#include <linux/kvm_para.h> #include <asm/mtrr.h> #include <asm/uaccess.h> @@ -58,7 +62,6 @@ #include <asm/mmu_context.h> #include <asm/proto.h> #include <asm/setup.h> -#include <asm/mach_apic.h> #include <asm/numa.h> #include <asm/sections.h> #include <asm/dmi.h> @@ -66,6 +69,9 @@ #include <asm/mce.h> #include <asm/ds.h> #include <asm/topology.h> +#include <asm/pat.h> + +#include <mach_apic.h> #ifdef CONFIG_XEN #include <linux/percpu.h> #include <xen/interface/physdev.h> @@ -149,7 +155,7 @@ extern int root_mountflags; char __initdata command_line[COMMAND_LINE_SIZE]; -struct resource standard_io_resources[] = { +static struct resource standard_io_resources[] = { { .name = "dma1", .start = 0x00, .end = 0x1f, .flags = IORESOURCE_BUSY | IORESOURCE_IO }, { .name = "pic1", .start = 0x20, .end = 0x21, @@ -158,7 +164,9 @@ struct resource standard_io_resources[] .flags = IORESOURCE_BUSY | IORESOURCE_IO }, { .name = "timer1", .start = 0x50, .end = 0x53, .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "keyboard", .start = 0x60, .end = 0x6f, + { .name = "keyboard", .start = 0x60, .end = 0x60, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = 
"keyboard", .start = 0x64, .end = 0x64, .flags = IORESOURCE_BUSY | IORESOURCE_IO }, { .name = "dma page reg", .start = 0x80, .end = 0x8f, .flags = IORESOURCE_BUSY | IORESOURCE_IO }, @@ -224,10 +232,10 @@ contig_initmem_init(unsigned long start_ e820_register_active_regions(0, start_pfn, end_pfn); #ifdef CONFIG_XEN if (xen_start_info->nr_pages < end_pfn) - free_bootmem_with_active_regions(0, xen_start_info->nr_pages); - else + end_pfn = xen_start_info->nr_pages; #endif free_bootmem_with_active_regions(0, end_pfn); + early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT); reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT); } #endif @@ -291,6 +299,7 @@ static void __init reserve_crashkernel(v (unsigned long)(total_mem >> 20)); crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; + insert_resource(&iomem_resource, &crashk_res); } } #else @@ -307,6 +316,40 @@ void __attribute__((weak)) __init memory machine_specific_memory_setup(); } +static void __init parse_setup_data(void) +{ + struct setup_data *data; + unsigned long pa_data; + + if (boot_params.hdr.version < 0x0209) + return; + pa_data = boot_params.hdr.setup_data; + while (pa_data) { + data = early_ioremap(pa_data, PAGE_SIZE); + switch (data->type) { + default: + break; + } +#ifndef CONFIG_DEBUG_BOOT_PARAMS + free_early(pa_data, pa_data+sizeof(*data)+data->len); +#endif + pa_data = data->next; + early_iounmap(data, PAGE_SIZE); + } +} + +#ifdef CONFIG_PCI_MMCONFIG +extern void __cpuinit fam10h_check_enable_mmcfg(void); +extern void __init check_enable_amd_mmconf_dmi(void); +#else +void __cpuinit fam10h_check_enable_mmcfg(void) +{ +} +void __init check_enable_amd_mmconf_dmi(void) +{ +} +#endif + /* * setup_arch - architecture-specific boot-time initializations * @@ -390,6 +433,8 @@ void __init setup_arch(char **cmdline_p) strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; + parse_setup_data(); + parse_early_param(); #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT @@ -399,6 +444,13 @@ void __init setup_arch(char **cmdline_p) finish_e820_parsing(); +#ifndef CONFIG_XEN + /* after parse_early_param, so could debug it */ + insert_resource(&iomem_resource, &code_resource); + insert_resource(&iomem_resource, &data_resource); + insert_resource(&iomem_resource, &bss_resource); +#endif + early_gart_iommu_check(); e820_register_active_regions(0, 0, -1UL); @@ -421,15 +473,23 @@ void __init setup_arch(char **cmdline_p) check_efer(); - init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT)); + max_pfn_mapped = init_memory_mapping(0, (max_pfn_mapped << PAGE_SHIFT)); if (efi_enabled) efi_init(); +#ifndef CONFIG_XEN + vsmp_init(); +#endif + if (is_initial_xendomain()) dmi_scan_machine(); io_delay_init(); +#ifdef CONFIG_KVM_CLOCK + kvmclock_init(); +#endif + #if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* setup to use the early static init tables during kernel startup */ x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init; @@ -460,9 +520,9 @@ void __init setup_arch(char **cmdline_p) contig_initmem_init(0, end_pfn); #endif - early_res_to_bootmem(); - #ifndef CONFIG_XEN + dma32_reserve_bootmem(); + #ifdef CONFIG_ACPI_SLEEP /* * Reserve low memory region for sleep support. 
@@ -488,16 +548,17 @@ void __init setup_arch(char **cmdline_p) unsigned long end_of_mem = end_pfn << PAGE_SHIFT; if (ramdisk_end <= end_of_mem) { -#ifndef CONFIG_XEN - reserve_bootmem_generic(ramdisk_image, ramdisk_size); -#endif + /* + * don't need to reserve again, already reserved early + * in x86_64_start_kernel, and early_res_to_bootmem + * convert that to reserved in bootmem + */ initrd_start = ramdisk_image + PAGE_OFFSET; initrd_end = initrd_start+ramdisk_size; #ifdef CONFIG_XEN initrd_below_start_ok = 1; #endif } else { - /* Assumes everything on node 0 */ free_bootmem(ramdisk_image, ramdisk_size); printk(KERN_ERR "initrd extends beyond end of memory " "(0x%08lx > 0x%08lx)\ndisabling initrd\n", @@ -507,6 +568,9 @@ void __init setup_arch(char **cmdline_p) } #endif reserve_crashkernel(); + + reserve_ibft_region(); + paging_init(); map_vsyscall(); #ifdef CONFIG_X86_LOCAL_APIC @@ -634,16 +698,16 @@ void __init setup_arch(char **cmdline_p) prefill_possible_map(); #endif + kvm_guest_init(); + /* * We trust e820 completely. No explicit ROM probing in memory. */ #ifdef CONFIG_XEN if (is_initial_xendomain()) - e820_reserve_resources(machine_e820.map, machine_e820.nr_map, - &code_resource, &data_resource, &bss_resource); + e820_reserve_resources(machine_e820.map, machine_e820.nr_map); #else - e820_reserve_resources(e820.map, e820.nr_map, - &code_resource, &data_resource, &bss_resource); + e820_reserve_resources(e820.map, e820.nr_map); e820_mark_nosave_regions(); #endif @@ -691,6 +755,9 @@ void __init setup_arch(char **cmdline_p) #endif #endif /* !CONFIG_XEN */ + + /* do this before identify_cpu for boot cpu */ + check_enable_amd_mmconf_dmi(); } #ifdef CONFIG_XEN @@ -787,9 +854,9 @@ static void __cpuinit amd_detect_cmp(str bits = c->x86_coreid_bits; /* Low order bits define the core id (index of core in socket) */ - c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1); - /* Convert the APIC ID into the socket ID */ - c->phys_proc_id = phys_pkg_id(bits); + c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); + /* Convert the initial APIC ID into the socket ID */ + c->phys_proc_id = c->initial_apicid >> bits; #ifdef CONFIG_NUMA node = c->phys_proc_id; @@ -806,7 +873,7 @@ static void __cpuinit amd_detect_cmp(str If that doesn't result in a usable node fall back to the path for the previous case. */ - int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits); + int ht_nodeid = c->initial_apicid; if (ht_nodeid >= 0 && apicid_to_node[ht_nodeid] != NUMA_NO_NODE) @@ -914,7 +981,7 @@ static void __cpuinit init_amd(struct cp /* Bit 31 in normal CPUID used for nonstandard 3DNow ID; 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ - clear_bit(0*32+31, (unsigned long *)&c->x86_capability); + clear_cpu_cap(c, 0*32+31); /* On C+ stepping K8 rep microcode works well for copy/memset */ level = cpuid_eax(1); @@ -956,9 +1023,25 @@ static void __cpuinit init_amd(struct cp /* MFENCE stops RDTSC speculation */ set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); + if (c->x86 == 0x10) + fam10h_check_enable_mmcfg(); + #ifndef CONFIG_XEN if (amd_apic_timer_broken()) disable_apic_timer = 1; + + if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) { + unsigned long long tseg; + + /* + * Split up direct mapping around the TSEG SMM area. + * Don't do it for gbpages because there seems very little + * benefit in doing so. 
+ */ + if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) && + (tseg >> PMD_SHIFT) < (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT))) + set_memory_4k((unsigned long)__va(tseg), 1); + } #endif } @@ -1054,7 +1137,7 @@ static void __cpuinit early_init_intel(s { if ((c->x86 == 0xf && c->x86_model >= 0x03) || (c->x86 == 0x6 && c->x86_model >= 0x0e)) - set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); } static void __cpuinit init_intel(struct cpuinfo_x86 *c) @@ -1097,9 +1180,6 @@ static void __cpuinit init_intel(struct if (c->x86 == 15) c->x86_cache_alignment = c->x86_clflush_size * 2; - if ((c->x86 == 0xf && c->x86_model >= 0x03) || - (c->x86 == 0x6 && c->x86_model >= 0x0e)) - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); if (c->x86 == 6) set_cpu_cap(c, X86_FEATURE_REP_GOOD); set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); @@ -1110,6 +1190,32 @@ static void __cpuinit init_intel(struct srat_detect_node(); } +static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) +{ + if (c->x86 == 0x6 && c->x86_model >= 0xf) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); +} + +static void __cpuinit init_centaur(struct cpuinfo_x86 *c) +{ + /* Cache sizes */ + unsigned n; + + n = c->extended_cpuid_level; + if (n >= 0x80000008) { + unsigned eax = cpuid_eax(0x80000008); + c->x86_virt_bits = (eax >> 8) & 0xff; + c->x86_phys_bits = eax & 0xff; + } + + if (c->x86 == 0x6 && c->x86_model >= 0xf) { + c->x86_cache_alignment = c->x86_clflush_size * 2; + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + } + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); +} + static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) { char *v = c->x86_vendor_id; @@ -1118,6 +1224,8 @@ static void __cpuinit get_cpu_vendor(str c->x86_vendor = X86_VENDOR_AMD; else if (!strcmp(v, "GenuineIntel")) c->x86_vendor = X86_VENDOR_INTEL; + else if (!strcmp(v, "CentaurHauls")) + c->x86_vendor = X86_VENDOR_CENTAUR; else c->x86_vendor = X86_VENDOR_UNKNOWN; } @@ -1167,15 +1275,18 @@ static void __cpuinit early_identify_cpu c->x86 += (tfms >> 20) & 0xff; if (c->x86 >= 0x6) c->x86_model += ((tfms >> 16) & 0xF) << 4; - if (c->x86_capability[0] & (1<<19)) + if (test_cpu_cap(c, X86_FEATURE_CLFLSH)) c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; } else { /* Have CPUID level 0 only - unheard of */ c->x86 = 4; } -#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) - c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff; +#ifndef CONFIG_XEN + c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff; +#ifdef CONFIG_SMP + c->phys_proc_id = c->initial_apicid; +#endif #endif /* AMD-defined flags: level 0x80000001 */ xlvl = cpuid_eax(0x80000000); @@ -1208,8 +1319,12 @@ static void __cpuinit early_identify_cpu case X86_VENDOR_INTEL: early_init_intel(c); break; + case X86_VENDOR_CENTAUR: + early_init_centaur(c); + break; } + validate_pat_support(c); } /* @@ -1246,6 +1361,10 @@ void __cpuinit identify_cpu(struct cpuin init_intel(c); break; + case X86_VENDOR_CENTAUR: + init_centaur(c); + break; + case X86_VENDOR_UNKNOWN: default: display_cacheinfo(c); @@ -1275,14 +1394,24 @@ void __cpuinit identify_cpu(struct cpuin #endif select_idle_routine(c); - if (c != &boot_cpu_data) - mtrr_ap_init(); #ifdef CONFIG_NUMA numa_add_cpu(smp_processor_id()); #endif } +void __cpuinit identify_boot_cpu(void) +{ + identify_cpu(&boot_cpu_data); +} + +void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) +{ + BUG_ON(c == &boot_cpu_data); + identify_cpu(c); + mtrr_ap_init(); +} + static __init int setup_noclflush(char *arg) { 
setup_clear_cpu_cap(X86_FEATURE_CLFLSH); @@ -1311,123 +1440,3 @@ static __init int setup_disablecpuid(cha return 1; } __setup("clearcpuid=", setup_disablecpuid); - -/* - * Get CPU information for use by the procfs. - */ - -static int show_cpuinfo(struct seq_file *m, void *v) -{ - struct cpuinfo_x86 *c = v; - int cpu = 0, i; - -#ifdef CONFIG_SMP - cpu = c->cpu_index; -#endif - - seq_printf(m, "processor\t: %u\n" - "vendor_id\t: %s\n" - "cpu family\t: %d\n" - "model\t\t: %d\n" - "model name\t: %s\n", - (unsigned)cpu, - c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown", - c->x86, - (int)c->x86_model, - c->x86_model_id[0] ? c->x86_model_id : "unknown"); - - if (c->x86_mask || c->cpuid_level >= 0) - seq_printf(m, "stepping\t: %d\n", c->x86_mask); - else - seq_printf(m, "stepping\t: unknown\n"); - - if (cpu_has(c, X86_FEATURE_TSC)) { - unsigned int freq = cpufreq_quick_get((unsigned)cpu); - - if (!freq) - freq = cpu_khz; - seq_printf(m, "cpu MHz\t\t: %u.%03u\n", - freq / 1000, (freq % 1000)); - } - - /* Cache size */ - if (c->x86_cache_size >= 0) - seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size); - -#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) - if (smp_num_siblings * c->x86_max_cores > 1) { - seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); - seq_printf(m, "siblings\t: %d\n", - cpus_weight(per_cpu(cpu_core_map, cpu))); - seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); - seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); - } -#endif - - seq_printf(m, - "fpu\t\t: yes\n" - "fpu_exception\t: yes\n" - "cpuid level\t: %d\n" - "wp\t\t: yes\n" - "flags\t\t:", - c->cpuid_level); - - for (i = 0; i < 32*NCAPINTS; i++) - if (cpu_has(c, i) && x86_cap_flags[i] != NULL) - seq_printf(m, " %s", x86_cap_flags[i]); - - seq_printf(m, "\nbogomips\t: %lu.%02lu\n", - c->loops_per_jiffy/(500000/HZ), - (c->loops_per_jiffy/(5000/HZ)) % 100); - - if (c->x86_tlbsize > 0) - seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); - seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size); - seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); - - seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", - c->x86_phys_bits, c->x86_virt_bits); - - seq_printf(m, "power management:"); - for (i = 0; i < 32; i++) { - if (c->x86_power & (1 << i)) { - if (i < ARRAY_SIZE(x86_power_flags) && - x86_power_flags[i]) - seq_printf(m, "%s%s", - x86_power_flags[i][0]?" ":"", - x86_power_flags[i]); - else - seq_printf(m, " [%d]", i); - } - } - - seq_printf(m, "\n\n"); - - return 0; -} - -static void *c_start(struct seq_file *m, loff_t *pos) -{ - if (*pos == 0) /* just in case, cpu 0 is not the first */ - *pos = first_cpu(cpu_online_map); - if ((*pos) < NR_CPUS && cpu_online(*pos)) - return &cpu_data(*pos); - return NULL; -} - -static void *c_next(struct seq_file *m, void *v, loff_t *pos) -{ - *pos = next_cpu(*pos, cpu_online_map); - return c_start(m, pos); -} - -static void c_stop(struct seq_file *m, void *v) -{ -} - -const struct seq_operations cpuinfo_op = { - .start = c_start, - .next = c_next, - .stop = c_stop, - .show = show_cpuinfo, -}; --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ head-2011-03-11/arch/x86/kernel/smp-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -0,0 +1,327 @@ +/* + * Intel SMP support routines. + * + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> + * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com> + * (c) 2002,2003 Andi Kleen, SuSE Labs. 
+ * + * i386 and x86_64 integration by Glauber Costa <gcosta@redhat.com> + * + * This code is released under the GNU General Public License version 2 or + * later. + */ + +#include <linux/init.h> + +#include <linux/mm.h> +#include <linux/delay.h> +#include <linux/spinlock.h> +#include <linux/kernel_stat.h> +#include <linux/mc146818rtc.h> +#include <linux/cache.h> +#include <linux/interrupt.h> +#include <linux/cpu.h> + +#include <asm/mtrr.h> +#include <asm/tlbflush.h> +#include <asm/mmu_context.h> +#include <asm/proto.h> +#include <mach_ipi.h> +#include <xen/evtchn.h> +/* + * Some notes on x86 processor bugs affecting SMP operation: + * + * Pentium, Pentium Pro, II, III (and all CPUs) have bugs. + * The Linux implications for SMP are handled as follows: + * + * Pentium III / [Xeon] + * None of the E1AP-E3AP errata are visible to the user. + * + * E1AP. see PII A1AP + * E2AP. see PII A2AP + * E3AP. see PII A3AP + * + * Pentium II / [Xeon] + * None of the A1AP-A3AP errata are visible to the user. + * + * A1AP. see PPro 1AP + * A2AP. see PPro 2AP + * A3AP. see PPro 7AP + * + * Pentium Pro + * None of 1AP-9AP errata are visible to the normal user, + * except occasional delivery of 'spurious interrupt' as trap #15. + * This is very rare and a non-problem. + * + * 1AP. Linux maps APIC as non-cacheable + * 2AP. worked around in hardware + * 3AP. fixed in C0 and above steppings microcode update. + * Linux does not use excessive STARTUP_IPIs. + * 4AP. worked around in hardware + * 5AP. symmetric IO mode (normal Linux operation) not affected. + * 'noapic' mode has vector 0xf filled out properly. + * 6AP. 'noapic' mode might be affected - fixed in later steppings + * 7AP. We do not assume writes to the LVT deassering IRQs + * 8AP. We do not enable low power mode (deep sleep) during MP bootup + * 9AP. We do not use mixed mode + * + * Pentium + * There is a marginal case where REP MOVS on 100MHz SMP + * machines with B stepping processors can fail. XXX should provide + * an L1cache=Writethrough or L1cache=off option. + * + * B stepping CPUs may hang. There are hardware work arounds + * for this. We warn about it in case your board doesn't have the work + * arounds. Basically that's so I can tell anyone with a B stepping + * CPU and SMP problems "tough". + * + * Specific items [From Pentium Processor Specification Update] + * + * 1AP. Linux doesn't use remote read + * 2AP. Linux doesn't trust APIC errors + * 3AP. We work around this + * 4AP. Linux never generated 3 interrupts of the same priority + * to cause a lost local interrupt. + * 5AP. Remote read is never used + * 6AP. not affected - worked around in hardware + * 7AP. not affected - worked around in hardware + * 8AP. worked around in hardware - we get explicit CS errors if not + * 9AP. only 'noapic' mode affected. Might generate spurious + * interrupts, we log only the first one and count the + * rest silently. + * 10AP. not affected - worked around in hardware + * 11AP. Linux reads the APIC between writes to avoid this, as per + * the documentation. Make sure you preserve this as it affects + * the C stepping chips too. + * 12AP. not affected - worked around in hardware + * 13AP. not affected - worked around in hardware + * 14AP. we always deassert INIT during bootup + * 15AP. not affected - worked around in hardware + * 16AP. not affected - worked around in hardware + * 17AP. not affected - worked around in hardware + * 18AP. not affected - worked around in hardware + * 19AP. 
not affected - worked around in BIOS + * + * If this sounds worrying believe me these bugs are either ___RARE___, + * or are signal timing bugs worked around in hardware and there's + * about nothing of note with C stepping upwards. + */ + +/* + * this function sends a 'reschedule' IPI to another CPU. + * it goes straight through and wastes no time serializing + * anything. Worst case is that we lose a reschedule ... + */ +void xen_smp_send_reschedule(int cpu) +{ + if (unlikely(cpu_is_offline(cpu))) { + WARN_ON(1); + return; + } + send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); +} + +/* + * Structure and data for smp_call_function(). This is designed to minimise + * static memory requirements. It also looks cleaner. + */ +static DEFINE_SPINLOCK(call_lock); + +struct call_data_struct { + void (*func) (void *info); + void *info; + atomic_t started; + atomic_t finished; + int wait; +}; + +void lock_ipi_call_lock(void) +{ + spin_lock_irq(&call_lock); +} + +void unlock_ipi_call_lock(void) +{ + spin_unlock_irq(&call_lock); +} + +static struct call_data_struct *call_data; + +static void __smp_call_function(void (*func) (void *info), void *info, + int nonatomic, int wait) +{ + struct call_data_struct data; + int cpus = num_online_cpus() - 1; + + if (!cpus) + return; + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + call_data = &data; + mb(); + + /* Send a message to all other CPUs and wait for them to respond */ + send_IPI_allbutself(CALL_FUNCTION_VECTOR); + + /* Wait for response */ + while (atomic_read(&data.started) != cpus) + cpu_relax(); + + if (wait) + while (atomic_read(&data.finished) != cpus) + cpu_relax(); +} + + +/** + * smp_call_function_mask(): Run a function on a set of other CPUs. + * @mask: The set of cpus to run on. Must not include the current cpu. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @wait: If true, wait (atomically) until function has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. + * + * If @wait is true, then returns once @func has returned; otherwise + * it returns just before the target cpu calls @func. + * + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. + */ +int +xen_smp_call_function_mask(cpumask_t mask, + void (*func)(void *), void *info, + int wait) +{ + struct call_data_struct data; + cpumask_t allbutself; + int cpus; + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + + /* Holding any lock stops cpus from going down. 
*/ + spin_lock(&call_lock); + + allbutself = cpu_online_map; + cpu_clear(smp_processor_id(), allbutself); + + cpus_and(mask, mask, allbutself); + cpus = cpus_weight(mask); + + if (!cpus) { + spin_unlock(&call_lock); + return 0; + } + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + call_data = &data; + wmb(); + + /* Send a message to other CPUs */ + if (cpus_equal(mask, allbutself) && + cpus_equal(cpu_online_map, cpu_callout_map)) + send_IPI_allbutself(CALL_FUNCTION_VECTOR); + else + send_IPI_mask(mask, CALL_FUNCTION_VECTOR); + + /* Wait for response */ + while (atomic_read(&data.started) != cpus) + cpu_relax(); + + if (wait) + while (atomic_read(&data.finished) != cpus) + cpu_relax(); + spin_unlock(&call_lock); + + return 0; +} + +static void stop_this_cpu(void *dummy) +{ + local_irq_disable(); + /* + * Remove this CPU: + */ + cpu_clear(smp_processor_id(), cpu_online_map); + disable_all_local_evtchn(); + if (hlt_works(smp_processor_id())) + for (;;) halt(); + for (;;); +} + +/* + * this function calls the 'stop' function on all other CPUs in the system. + */ + +void xen_smp_send_stop(void) +{ + int nolock; + unsigned long flags; + + /* Don't deadlock on the call lock in panic */ + nolock = !spin_trylock(&call_lock); + local_irq_save(flags); + __smp_call_function(stop_this_cpu, NULL, 0, 0); + if (!nolock) + spin_unlock(&call_lock); + disable_all_local_evtchn(); + local_irq_restore(flags); +} + +/* + * Reschedule call back. Nothing to do, + * all the work is done automatically when + * we return from the interrupt. + */ +irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id) +{ +#ifdef CONFIG_X86_32 + __get_cpu_var(irq_stat).irq_resched_count++; +#else + add_pda(irq_resched_count, 1); +#endif + return IRQ_HANDLED; +} + +irqreturn_t smp_call_function_interrupt(int irq, void *dev_id) +{ + void (*func) (void *info) = call_data->func; + void *info = call_data->info; + int wait = call_data->wait; + + /* + * Notify initiating CPU that I've grabbed the data and am + * about to execute the function + */ + mb(); + atomic_inc(&call_data->started); + /* + * At this point the info structure may be out of scope unless wait==1 + */ + (*func)(info); +#ifdef CONFIG_X86_32 + __get_cpu_var(irq_stat).irq_call_count++; +#else + add_pda(irq_call_count, 1); +#endif + + if (wait) { + mb(); + atomic_inc(&call_data->finished); + } + + return IRQ_HANDLED; +} --- head-2011-03-11.orig/arch/x86/kernel/smp_32-xen.c 2011-01-31 18:01:51.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,647 +0,0 @@ -/* - * Intel SMP support routines. - * - * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> - * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com> - * - * This code is released under the GNU General Public License version 2 or - * later. - */ - -#include <linux/init.h> - -#include <linux/mm.h> -#include <linux/delay.h> -#include <linux/spinlock.h> -#include <linux/kernel_stat.h> -#include <linux/mc146818rtc.h> -#include <linux/cache.h> -#include <linux/interrupt.h> -#include <linux/cpu.h> -#include <linux/module.h> - -#include <asm/mtrr.h> -#include <asm/tlbflush.h> -#include <asm/mmu_context.h> -#if 0 -#include <mach_apic.h> -#endif -#include <xen/evtchn.h> - -/* - * Some notes on x86 processor bugs affecting SMP operation: - * - * Pentium, Pentium Pro, II, III (and all CPUs) have bugs. 
- * The Linux implications for SMP are handled as follows: - * - * Pentium III / [Xeon] - * None of the E1AP-E3AP errata are visible to the user. - * - * E1AP. see PII A1AP - * E2AP. see PII A2AP - * E3AP. see PII A3AP - * - * Pentium II / [Xeon] - * None of the A1AP-A3AP errata are visible to the user. - * - * A1AP. see PPro 1AP - * A2AP. see PPro 2AP - * A3AP. see PPro 7AP - * - * Pentium Pro - * None of 1AP-9AP errata are visible to the normal user, - * except occasional delivery of 'spurious interrupt' as trap #15. - * This is very rare and a non-problem. - * - * 1AP. Linux maps APIC as non-cacheable - * 2AP. worked around in hardware - * 3AP. fixed in C0 and above steppings microcode update. - * Linux does not use excessive STARTUP_IPIs. - * 4AP. worked around in hardware - * 5AP. symmetric IO mode (normal Linux operation) not affected. - * 'noapic' mode has vector 0xf filled out properly. - * 6AP. 'noapic' mode might be affected - fixed in later steppings - * 7AP. We do not assume writes to the LVT deassering IRQs - * 8AP. We do not enable low power mode (deep sleep) during MP bootup - * 9AP. We do not use mixed mode - * - * Pentium - * There is a marginal case where REP MOVS on 100MHz SMP - * machines with B stepping processors can fail. XXX should provide - * an L1cache=Writethrough or L1cache=off option. - * - * B stepping CPUs may hang. There are hardware work arounds - * for this. We warn about it in case your board doesn't have the work - * arounds. Basically that's so I can tell anyone with a B stepping - * CPU and SMP problems "tough". - * - * Specific items [From Pentium Processor Specification Update] - * - * 1AP. Linux doesn't use remote read - * 2AP. Linux doesn't trust APIC errors - * 3AP. We work around this - * 4AP. Linux never generated 3 interrupts of the same priority - * to cause a lost local interrupt. - * 5AP. Remote read is never used - * 6AP. not affected - worked around in hardware - * 7AP. not affected - worked around in hardware - * 8AP. worked around in hardware - we get explicit CS errors if not - * 9AP. only 'noapic' mode affected. Might generate spurious - * interrupts, we log only the first one and count the - * rest silently. - * 10AP. not affected - worked around in hardware - * 11AP. Linux reads the APIC between writes to avoid this, as per - * the documentation. Make sure you preserve this as it affects - * the C stepping chips too. - * 12AP. not affected - worked around in hardware - * 13AP. not affected - worked around in hardware - * 14AP. we always deassert INIT during bootup - * 15AP. not affected - worked around in hardware - * 16AP. not affected - worked around in hardware - * 17AP. not affected - worked around in hardware - * 18AP. not affected - worked around in hardware - * 19AP. not affected - worked around in BIOS - * - * If this sounds worrying believe me these bugs are either ___RARE___, - * or are signal timing bugs worked around in hardware and there's - * about nothing of note with C stepping upwards. - */ - -DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, }; - -/* - * the following functions deal with sending IPIs between CPUs. - * - * We use 'broadcast', CPU->CPU IPIs and self-IPIs too. 
- */ - -static inline int __prepare_ICR (unsigned int shortcut, int vector) -{ - unsigned int icr = shortcut | APIC_DEST_LOGICAL; - - switch (vector) { - default: - icr |= APIC_DM_FIXED | vector; - break; - case NMI_VECTOR: - icr |= APIC_DM_NMI; - break; - } - return icr; -} - -static inline int __prepare_ICR2 (unsigned int mask) -{ - return SET_APIC_DEST_FIELD(mask); -} - -DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]); - -static inline void __send_IPI_one(unsigned int cpu, int vector) -{ - int irq = per_cpu(ipi_to_irq, cpu)[vector]; - BUG_ON(irq < 0); - notify_remote_via_irq(irq); -} - -void __send_IPI_shortcut(unsigned int shortcut, int vector) -{ - int cpu; - - switch (shortcut) { - case APIC_DEST_SELF: - __send_IPI_one(smp_processor_id(), vector); - break; - case APIC_DEST_ALLBUT: - for (cpu = 0; cpu < NR_CPUS; ++cpu) { - if (cpu == smp_processor_id()) - continue; - if (cpu_isset(cpu, cpu_online_map)) { - __send_IPI_one(cpu, vector); - } - } - break; - default: - printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut, - vector); - break; - } -} - -void send_IPI_self(int vector) -{ - __send_IPI_shortcut(APIC_DEST_SELF, vector); -} - -/* - * This is only used on smaller machines. - */ -void send_IPI_mask_bitmask(cpumask_t mask, int vector) -{ - unsigned long flags; - unsigned int cpu; - - local_irq_save(flags); - WARN_ON(cpus_addr(mask)[0] & ~cpus_addr(cpu_online_map)[0]); - - for (cpu = 0; cpu < NR_CPUS; ++cpu) { - if (cpu_isset(cpu, mask)) { - __send_IPI_one(cpu, vector); - } - } - - local_irq_restore(flags); -} - -void send_IPI_mask_sequence(cpumask_t mask, int vector) -{ - - send_IPI_mask_bitmask(mask, vector); -} - -#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */ - -#if 0 /* XEN */ -/* - * Smarter SMP flushing macros. - * c/o Linus Torvalds. - * - * These mean you can really definitely utterly forget about - * writing to user space from interrupts. (Its not allowed anyway). - * - * Optimizations Manfred Spraul <manfred@colorfullife.com> - */ - -static cpumask_t flush_cpumask; -static struct mm_struct * flush_mm; -static unsigned long flush_va; -static DEFINE_SPINLOCK(tlbstate_lock); - -/* - * We cannot call mmdrop() because we are in interrupt context, - * instead update mm->cpu_vm_mask. - * - * We need to reload %cr3 since the page tables may be going - * away from under us.. - */ -void leave_mm(int cpu) -{ - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) - BUG(); - cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask); - load_cr3(swapper_pg_dir); -} -EXPORT_SYMBOL_GPL(leave_mm); - -/* - * - * The flush IPI assumes that a thread switch happens in this order: - * [cpu0: the cpu that switches] - * 1) switch_mm() either 1a) or 1b) - * 1a) thread switch to a different mm - * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); - * Stop ipi delivery for the old mm. This is not synchronized with - * the other cpus, but smp_invalidate_interrupt ignore flush ipis - * for the wrong mm, and in the worst case we perform a superfluous - * tlb flush. - * 1a2) set cpu_tlbstate to TLBSTATE_OK - * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 - * was in lazy tlb mode. - * 1a3) update cpu_tlbstate[].active_mm - * Now cpu0 accepts tlb flushes for the new mm. - * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); - * Now the other cpus will send tlb flush ipis. - * 1a4) change cr3. - * 1b) thread switch without mm change - * cpu_tlbstate[].active_mm is correct, cpu0 already handles - * flush ipis. 
- * 1b1) set cpu_tlbstate to TLBSTATE_OK - * 1b2) test_and_set the cpu bit in cpu_vm_mask. - * Atomically set the bit [other cpus will start sending flush ipis], - * and test the bit. - * 1b3) if the bit was 0: leave_mm was called, flush the tlb. - * 2) switch %%esp, ie current - * - * The interrupt must handle 2 special cases: - * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. - * - the cpu performs speculative tlb reads, i.e. even if the cpu only - * runs in kernel space, the cpu could load tlb entries for user space - * pages. - * - * The good news is that cpu_tlbstate is local to each cpu, no - * write/read ordering problems. - */ - -/* - * TLB flush IPI: - * - * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. - * 2) Leave the mm if we are in the lazy tlb mode. - */ - -irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id) -{ - unsigned long cpu; - - cpu = get_cpu(); - - if (!cpu_isset(cpu, flush_cpumask)) - goto out; - /* - * This was a BUG() but until someone can quote me the - * line from the intel manual that guarantees an IPI to - * multiple CPUs is retried _only_ on the erroring CPUs - * its staying as a return - * - * BUG(); - */ - - if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) { - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) { - if (flush_va == TLB_FLUSH_ALL) - local_flush_tlb(); - else - __flush_tlb_one(flush_va); - } else - leave_mm(cpu); - } - smp_mb__before_clear_bit(); - cpu_clear(cpu, flush_cpumask); - smp_mb__after_clear_bit(); -out: - put_cpu_no_resched(); - __get_cpu_var(irq_stat).irq_tlb_count++; - - return IRQ_HANDLED; -} - -void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, - unsigned long va) -{ - cpumask_t cpumask = *cpumaskp; - - /* - * A couple of (to be removed) sanity checks: - * - * - current CPU must not be in mask - * - mask must exist :) - */ - BUG_ON(cpus_empty(cpumask)); - BUG_ON(cpu_isset(smp_processor_id(), cpumask)); - BUG_ON(!mm); - -#ifdef CONFIG_HOTPLUG_CPU - /* If a CPU which we ran on has gone down, OK. */ - cpus_and(cpumask, cpumask, cpu_online_map); - if (unlikely(cpus_empty(cpumask))) - return; -#endif - - /* - * i'm not happy about this global shared spinlock in the - * MM hot path, but we'll see how contended it is. - * AK: x86-64 has a faster method that could be ported. - */ - spin_lock(&tlbstate_lock); - - flush_mm = mm; - flush_va = va; - cpus_or(flush_cpumask, cpumask, flush_cpumask); - /* - * We have to send the IPI only to - * CPUs affected. - */ - send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR); - - while (!cpus_empty(flush_cpumask)) - /* nothing. 
lockup detection does not belong here */ - cpu_relax(); - - flush_mm = NULL; - flush_va = 0; - spin_unlock(&tlbstate_lock); -} - -void flush_tlb_current_task(void) -{ - struct mm_struct *mm = current->mm; - cpumask_t cpu_mask; - - preempt_disable(); - cpu_mask = mm->cpu_vm_mask; - cpu_clear(smp_processor_id(), cpu_mask); - - local_flush_tlb(); - if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); - preempt_enable(); -} - -void flush_tlb_mm (struct mm_struct * mm) -{ - cpumask_t cpu_mask; - - preempt_disable(); - cpu_mask = mm->cpu_vm_mask; - cpu_clear(smp_processor_id(), cpu_mask); - - if (current->active_mm == mm) { - if (current->mm) - local_flush_tlb(); - else - leave_mm(smp_processor_id()); - } - if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); - - preempt_enable(); -} - -void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) -{ - struct mm_struct *mm = vma->vm_mm; - cpumask_t cpu_mask; - - preempt_disable(); - cpu_mask = mm->cpu_vm_mask; - cpu_clear(smp_processor_id(), cpu_mask); - - if (current->active_mm == mm) { - if(current->mm) - __flush_tlb_one(va); - else - leave_mm(smp_processor_id()); - } - - if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, va); - - preempt_enable(); -} -EXPORT_SYMBOL(flush_tlb_page); - -static void do_flush_tlb_all(void* info) -{ - unsigned long cpu = smp_processor_id(); - - __flush_tlb_all(); - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY) - leave_mm(cpu); -} - -void flush_tlb_all(void) -{ - on_each_cpu(do_flush_tlb_all, NULL, 1, 1); -} - -#endif /* XEN */ - -/* - * this function sends a 'reschedule' IPI to another CPU. - * it goes straight through and wastes no time serializing - * anything. Worst case is that we lose a reschedule ... - */ -void xen_smp_send_reschedule(int cpu) -{ - WARN_ON(cpu_is_offline(cpu)); - send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); -} - -/* - * Structure and data for smp_call_function(). This is designed to minimise - * static memory requirements. It also looks cleaner. - */ -static DEFINE_SPINLOCK(call_lock); - -struct call_data_struct { - void (*func) (void *info); - void *info; - atomic_t started; - atomic_t finished; - int wait; -}; - -void lock_ipi_call_lock(void) -{ - spin_lock_irq(&call_lock); -} - -void unlock_ipi_call_lock(void) -{ - spin_unlock_irq(&call_lock); -} - -static struct call_data_struct *call_data; - -static void __smp_call_function(void (*func) (void *info), void *info, - int nonatomic, int wait) -{ - struct call_data_struct data; - int cpus = num_online_cpus() - 1; - - if (!cpus) - return; - - data.func = func; - data.info = info; - atomic_set(&data.started, 0); - data.wait = wait; - if (wait) - atomic_set(&data.finished, 0); - - call_data = &data; - mb(); - - /* Send a message to all other CPUs and wait for them to respond */ - send_IPI_allbutself(CALL_FUNCTION_VECTOR); - - /* Wait for response */ - while (atomic_read(&data.started) != cpus) - cpu_relax(); - - if (wait) - while (atomic_read(&data.finished) != cpus) - cpu_relax(); -} - - -/** - * smp_call_function_mask(): Run a function on a set of other CPUs. - * @mask: The set of cpus to run on. Must not include the current cpu. - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @wait: If true, wait (atomically) until function has completed on other CPUs. - * - * Returns 0 on success, else a negative status code. 
- * - * If @wait is true, then returns once @func has returned; otherwise - * it returns just before the target cpu calls @func. - * - * You must not call this function with disabled interrupts or from a - * hardware interrupt handler or from a bottom half handler. - */ -int -xen_smp_call_function_mask(cpumask_t mask, - void (*func)(void *), void *info, - int wait) -{ - struct call_data_struct data; - cpumask_t allbutself; - int cpus; - - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); - - /* Holding any lock stops cpus from going down. */ - spin_lock(&call_lock); - - allbutself = cpu_online_map; - cpu_clear(smp_processor_id(), allbutself); - - cpus_and(mask, mask, allbutself); - cpus = cpus_weight(mask); - - if (!cpus) { - spin_unlock(&call_lock); - return 0; - } - - data.func = func; - data.info = info; - atomic_set(&data.started, 0); - data.wait = wait; - if (wait) - atomic_set(&data.finished, 0); - - call_data = &data; - mb(); - - /* Send a message to other CPUs */ - if (cpus_equal(mask, allbutself)) - send_IPI_allbutself(CALL_FUNCTION_VECTOR); - else - send_IPI_mask(mask, CALL_FUNCTION_VECTOR); - - /* Wait for response */ - while (atomic_read(&data.started) != cpus) - cpu_relax(); - - if (wait) - while (atomic_read(&data.finished) != cpus) - cpu_relax(); - spin_unlock(&call_lock); - - return 0; -} - -static void stop_this_cpu (void * dummy) -{ - local_irq_disable(); - /* - * Remove this CPU: - */ - cpu_clear(smp_processor_id(), cpu_online_map); - disable_all_local_evtchn(); - if (cpu_data(smp_processor_id()).hlt_works_ok) - for(;;) halt(); - for (;;); -} - -/* - * this function calls the 'stop' function on all other CPUs in the system. - */ - -void xen_smp_send_stop(void) -{ - /* Don't deadlock on the call lock in panic */ - int nolock = !spin_trylock(&call_lock); - unsigned long flags; - - local_irq_save(flags); - __smp_call_function(stop_this_cpu, NULL, 0, 0); - if (!nolock) - spin_unlock(&call_lock); - disable_all_local_evtchn(); - local_irq_restore(flags); -} - -/* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. - */ -irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id) -{ - __get_cpu_var(irq_stat).irq_resched_count++; - - return IRQ_HANDLED; -} - -#include <linux/kallsyms.h> -irqreturn_t smp_call_function_interrupt(int irq, void *dev_id) -{ - void (*func) (void *info) = call_data->func; - void *info = call_data->info; - int wait = call_data->wait; - - /* - * Notify initiating CPU that I've grabbed the data and am - * about to execute the function - */ - mb(); - atomic_inc(&call_data->started); - /* - * At this point the info structure may be out of scope unless wait==1 - */ - irq_enter(); - (*func)(info); - __get_cpu_var(irq_stat).irq_call_count++; - irq_exit(); - - if (wait) { - mb(); - atomic_inc(&call_data->finished); - } - - return IRQ_HANDLED; -} --- head-2011-03-11.orig/arch/x86/kernel/smp_64-xen.c 2011-01-31 18:01:51.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,554 +0,0 @@ -/* - * Intel SMP support routines. - * - * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> - * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com> - * (c) 2002,2003 Andi Kleen, SuSE Labs. - * - * This code is released under the GNU General Public License version 2 or - * later. 
- */ - -#include <linux/init.h> - -#include <linux/mm.h> -#include <linux/delay.h> -#include <linux/spinlock.h> -#include <linux/smp.h> -#include <linux/kernel_stat.h> -#include <linux/mc146818rtc.h> -#include <linux/interrupt.h> - -#include <asm/mtrr.h> -#include <asm/pgalloc.h> -#include <asm/tlbflush.h> -#include <asm/mach_apic.h> -#include <asm/mmu_context.h> -#include <asm/proto.h> -#include <asm/apicdef.h> -#include <asm/idle.h> -#ifdef CONFIG_XEN -#include <xen/evtchn.h> -#endif - -#ifndef CONFIG_XEN -/* - * Smarter SMP flushing macros. - * c/o Linus Torvalds. - * - * These mean you can really definitely utterly forget about - * writing to user space from interrupts. (Its not allowed anyway). - * - * Optimizations Manfred Spraul <manfred@colorfullife.com> - * - * More scalable flush, from Andi Kleen - * - * To avoid global state use 8 different call vectors. - * Each CPU uses a specific vector to trigger flushes on other - * CPUs. Depending on the received vector the target CPUs look into - * the right per cpu variable for the flush data. - * - * With more than 8 CPUs they are hashed to the 8 available - * vectors. The limited global vector space forces us to this right now. - * In future when interrupts are split into per CPU domains this could be - * fixed, at the cost of triggering multiple IPIs in some cases. - */ - -union smp_flush_state { - struct { - cpumask_t flush_cpumask; - struct mm_struct *flush_mm; - unsigned long flush_va; - spinlock_t tlbstate_lock; - }; - char pad[SMP_CACHE_BYTES]; -} ____cacheline_aligned; - -/* State is put into the per CPU data section, but padded - to a full cache line because other CPUs can access it and we don't - want false sharing in the per cpu data segment. */ -static DEFINE_PER_CPU(union smp_flush_state, flush_state); - -/* - * We cannot call mmdrop() because we are in interrupt context, - * instead update mm->cpu_vm_mask. - */ -void leave_mm(int cpu) -{ - if (read_pda(mmu_state) == TLBSTATE_OK) - BUG(); - cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask); - load_cr3(swapper_pg_dir); -} -EXPORT_SYMBOL_GPL(leave_mm); - -/* - * - * The flush IPI assumes that a thread switch happens in this order: - * [cpu0: the cpu that switches] - * 1) switch_mm() either 1a) or 1b) - * 1a) thread switch to a different mm - * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); - * Stop ipi delivery for the old mm. This is not synchronized with - * the other cpus, but smp_invalidate_interrupt ignore flush ipis - * for the wrong mm, and in the worst case we perform a superfluous - * tlb flush. - * 1a2) set cpu mmu_state to TLBSTATE_OK - * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 - * was in lazy tlb mode. - * 1a3) update cpu active_mm - * Now cpu0 accepts tlb flushes for the new mm. - * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); - * Now the other cpus will send tlb flush ipis. - * 1a4) change cr3. - * 1b) thread switch without mm change - * cpu active_mm is correct, cpu0 already handles - * flush ipis. - * 1b1) set cpu mmu_state to TLBSTATE_OK - * 1b2) test_and_set the cpu bit in cpu_vm_mask. - * Atomically set the bit [other cpus will start sending flush ipis], - * and test the bit. - * 1b3) if the bit was 0: leave_mm was called, flush the tlb. - * 2) switch %%esp, ie current - * - * The interrupt must handle 2 special cases: - * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. - * - the cpu performs speculative tlb reads, i.e. even if the cpu only - * runs in kernel space, the cpu could load tlb entries for user space - * pages. 
- * - * The good news is that cpu mmu_state is local to each cpu, no - * write/read ordering problems. - */ - -/* - * TLB flush IPI: - * - * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. - * 2) Leave the mm if we are in the lazy tlb mode. - * - * Interrupts are disabled. - */ - -asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs) -{ - int cpu; - int sender; - union smp_flush_state *f; - - cpu = smp_processor_id(); - /* - * orig_rax contains the negated interrupt vector. - * Use that to determine where the sender put the data. - */ - sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START; - f = &per_cpu(flush_state, sender); - - if (!cpu_isset(cpu, f->flush_cpumask)) - goto out; - /* - * This was a BUG() but until someone can quote me the - * line from the intel manual that guarantees an IPI to - * multiple CPUs is retried _only_ on the erroring CPUs - * its staying as a return - * - * BUG(); - */ - - if (f->flush_mm == read_pda(active_mm)) { - if (read_pda(mmu_state) == TLBSTATE_OK) { - if (f->flush_va == TLB_FLUSH_ALL) - local_flush_tlb(); - else - __flush_tlb_one(f->flush_va); - } else - leave_mm(cpu); - } -out: - ack_APIC_irq(); - cpu_clear(cpu, f->flush_cpumask); - add_pda(irq_tlb_count, 1); -} - -void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, - unsigned long va) -{ - int sender; - union smp_flush_state *f; - cpumask_t cpumask = *cpumaskp; - - /* Caller has disabled preemption */ - sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; - f = &per_cpu(flush_state, sender); - - /* - * Could avoid this lock when - * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is - * probably not worth checking this for a cache-hot lock. - */ - spin_lock(&f->tlbstate_lock); - - f->flush_mm = mm; - f->flush_va = va; - cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask); - - /* - * We have to send the IPI only to - * CPUs affected. 
- */ - send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender); - - while (!cpus_empty(f->flush_cpumask)) - cpu_relax(); - - f->flush_mm = NULL; - f->flush_va = 0; - spin_unlock(&f->tlbstate_lock); -} - -int __cpuinit init_smp_flush(void) -{ - int i; - - for_each_cpu_mask(i, cpu_possible_map) { - spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock); - } - return 0; -} -core_initcall(init_smp_flush); - -void flush_tlb_current_task(void) -{ - struct mm_struct *mm = current->mm; - cpumask_t cpu_mask; - - preempt_disable(); - cpu_mask = mm->cpu_vm_mask; - cpu_clear(smp_processor_id(), cpu_mask); - - local_flush_tlb(); - if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); - preempt_enable(); -} - -void flush_tlb_mm (struct mm_struct * mm) -{ - cpumask_t cpu_mask; - - preempt_disable(); - cpu_mask = mm->cpu_vm_mask; - cpu_clear(smp_processor_id(), cpu_mask); - - if (current->active_mm == mm) { - if (current->mm) - local_flush_tlb(); - else - leave_mm(smp_processor_id()); - } - if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); - - preempt_enable(); -} - -void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) -{ - struct mm_struct *mm = vma->vm_mm; - cpumask_t cpu_mask; - - preempt_disable(); - cpu_mask = mm->cpu_vm_mask; - cpu_clear(smp_processor_id(), cpu_mask); - - if (current->active_mm == mm) { - if(current->mm) - __flush_tlb_one(va); - else - leave_mm(smp_processor_id()); - } - - if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, va); - - preempt_enable(); -} - -static void do_flush_tlb_all(void* info) -{ - unsigned long cpu = smp_processor_id(); - - __flush_tlb_all(); - if (read_pda(mmu_state) == TLBSTATE_LAZY) - leave_mm(cpu); -} - -void flush_tlb_all(void) -{ - on_each_cpu(do_flush_tlb_all, NULL, 1, 1); -} -#endif /* Xen */ - -/* - * this function sends a 'reschedule' IPI to another CPU. - * it goes straight through and wastes no time serializing - * anything. Worst case is that we lose a reschedule ... - */ - -void smp_send_reschedule(int cpu) -{ - send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); -} - -/* - * Structure and data for smp_call_function(). This is designed to minimise - * static memory requirements. It also looks cleaner. - */ -static DEFINE_SPINLOCK(call_lock); - -struct call_data_struct { - void (*func) (void *info); - void *info; - atomic_t started; - atomic_t finished; - int wait; -}; - -static struct call_data_struct * call_data; - -void lock_ipi_call_lock(void) -{ - spin_lock_irq(&call_lock); -} - -void unlock_ipi_call_lock(void) -{ - spin_unlock_irq(&call_lock); -} - -/* - * this function sends a 'generic call function' IPI to all other CPU - * of the system defined in the mask. 
- */ -static int __smp_call_function_mask(cpumask_t mask, - void (*func)(void *), void *info, - int wait) -{ - struct call_data_struct data; - cpumask_t allbutself; - int cpus; - - allbutself = cpu_online_map; - cpu_clear(smp_processor_id(), allbutself); - - cpus_and(mask, mask, allbutself); - cpus = cpus_weight(mask); - - if (!cpus) - return 0; - - data.func = func; - data.info = info; - atomic_set(&data.started, 0); - data.wait = wait; - if (wait) - atomic_set(&data.finished, 0); - - call_data = &data; - wmb(); - - /* Send a message to other CPUs */ - if (cpus_equal(mask, allbutself)) - send_IPI_allbutself(CALL_FUNCTION_VECTOR); - else - send_IPI_mask(mask, CALL_FUNCTION_VECTOR); - - /* Wait for response */ - while (atomic_read(&data.started) != cpus) - cpu_relax(); - - if (!wait) - return 0; - - while (atomic_read(&data.finished) != cpus) - cpu_relax(); - - return 0; -} -/** - * smp_call_function_mask(): Run a function on a set of other CPUs. - * @mask: The set of cpus to run on. Must not include the current cpu. - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @wait: If true, wait (atomically) until function has completed on other CPUs. - * - * Returns 0 on success, else a negative status code. - * - * If @wait is true, then returns once @func has returned; otherwise - * it returns just before the target cpu calls @func. - * - * You must not call this function with disabled interrupts or from a - * hardware interrupt handler or from a bottom half handler. - */ -int smp_call_function_mask(cpumask_t mask, - void (*func)(void *), void *info, - int wait) -{ - int ret; - - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); - - spin_lock(&call_lock); - ret = __smp_call_function_mask(mask, func, info, wait); - spin_unlock(&call_lock); - return ret; -} -EXPORT_SYMBOL(smp_call_function_mask); - -/* - * smp_call_function_single - Run a function on a specific CPU - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @nonatomic: Currently unused. - * @wait: If true, wait until function has completed on other CPUs. - * - * Retrurns 0 on success, else a negative status code. - * - * Does not return until the remote CPU is nearly ready to execute <func> - * or is or has executed. - */ - -int smp_call_function_single (int cpu, void (*func) (void *info), void *info, - int nonatomic, int wait) -{ - /* prevent preemption and reschedule on another processor */ - int ret, me = get_cpu(); - - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); - - if (cpu == me) { - local_irq_disable(); - func(info); - local_irq_enable(); - put_cpu(); - return 0; - } - - ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait); - - put_cpu(); - return ret; -} -EXPORT_SYMBOL(smp_call_function_single); - -/* - * smp_call_function - run a function on all other CPUs. - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @nonatomic: currently unused. - * @wait: If true, wait (atomically) until function has completed on other - * CPUs. - * - * Returns 0 on success, else a negative status code. Does not return until - * remote CPUs are nearly ready to execute func or are or have executed. - * - * You must not call this function with disabled interrupts or from a - * hardware interrupt handler or from a bottom half handler. 
- * Actually there are a few legal cases, like panic. - */ -int smp_call_function (void (*func) (void *info), void *info, int nonatomic, - int wait) -{ - return smp_call_function_mask(cpu_online_map, func, info, wait); -} -EXPORT_SYMBOL(smp_call_function); - -static void stop_this_cpu(void *dummy) -{ - local_irq_disable(); - /* - * Remove this CPU: - */ - cpu_clear(smp_processor_id(), cpu_online_map); - disable_all_local_evtchn(); - for (;;) - halt(); -} - -void smp_send_stop(void) -{ - int nolock; - unsigned long flags; - -#ifndef CONFIG_XEN - if (reboot_force) - return; -#endif - - /* Don't deadlock on the call lock in panic */ - nolock = !spin_trylock(&call_lock); - local_irq_save(flags); - __smp_call_function_mask(cpu_online_map, stop_this_cpu, NULL, 0); - if (!nolock) - spin_unlock(&call_lock); - disable_all_local_evtchn(); - local_irq_restore(flags); -} - -/* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. - */ -#ifndef CONFIG_XEN -asmlinkage void smp_reschedule_interrupt(void) -#else -asmlinkage irqreturn_t smp_reschedule_interrupt(int irq, void *ctx) -#endif -{ -#ifndef CONFIG_XEN - ack_APIC_irq(); -#endif - add_pda(irq_resched_count, 1); -#ifdef CONFIG_XEN - return IRQ_HANDLED; -#endif -} - -#ifndef CONFIG_XEN -asmlinkage void smp_call_function_interrupt(void) -#else -asmlinkage irqreturn_t smp_call_function_interrupt(int irq, void *ctx) -#endif -{ - void (*func) (void *info) = call_data->func; - void *info = call_data->info; - int wait = call_data->wait; - -#ifndef CONFIG_XEN - ack_APIC_irq(); -#endif - /* - * Notify initiating CPU that I've grabbed the data and am - * about to execute the function - */ - mb(); - atomic_inc(&call_data->started); - /* - * At this point the info structure may be out of scope unless wait==1 - */ - exit_idle(); - irq_enter(); - (*func)(info); - add_pda(irq_call_count, 1); - irq_exit(); - if (wait) { - mb(); - atomic_inc(&call_data->finished); - } -#ifdef CONFIG_XEN - return IRQ_HANDLED; -#endif -} --- head-2011-03-11.orig/arch/x86/kernel/time-xen.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/time-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -703,8 +703,6 @@ int xen_update_persistent_clock(void) return 0; } -extern void (*late_time_init)(void); - /* Dynamically-mapped IRQ. */ DEFINE_PER_CPU(int, timer_irq); @@ -933,7 +931,7 @@ int __cpuinit local_setup_timer(unsigned return 0; } -void __cpuexit local_teardown_timer(unsigned int cpu) +void __cpuinit local_teardown_timer(unsigned int cpu) { BUG_ON(cpu == 0); unbind_from_irqhandler(per_cpu(timer_irq, cpu), NULL); --- head-2011-03-11.orig/arch/x86/kernel/traps_32-xen.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/traps_32-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -9,26 +9,28 @@ * 'Traps.c' handles hardware traps and faults after we have saved some * state in 'asm.s'. 
*/ -#include <linux/sched.h> +#include <linux/interrupt.h> +#include <linux/kallsyms.h> +#include <linux/spinlock.h> +#include <linux/highmem.h> +#include <linux/kprobes.h> +#include <linux/uaccess.h> +#include <linux/utsname.h> +#include <linux/kdebug.h> #include <linux/kernel.h> +#include <linux/module.h> +#include <linux/ptrace.h> #include <linux/string.h> +#include <linux/unwind.h> +#include <linux/delay.h> #include <linux/errno.h> +#include <linux/kexec.h> +#include <linux/sched.h> #include <linux/timer.h> -#include <linux/mm.h> #include <linux/init.h> -#include <linux/delay.h> -#include <linux/spinlock.h> -#include <linux/interrupt.h> -#include <linux/highmem.h> -#include <linux/kallsyms.h> -#include <linux/ptrace.h> -#include <linux/utsname.h> -#include <linux/kprobes.h> -#include <linux/kexec.h> -#include <linux/unwind.h> -#include <linux/uaccess.h> -#include <linux/nmi.h> #include <linux/bug.h> +#include <linux/nmi.h> +#include <linux/mm.h> #ifdef CONFIG_EISA #include <linux/ioport.h> @@ -43,21 +45,18 @@ #include <linux/edac.h> #endif +#include <asm/arch_hooks.h> +#include <asm/stacktrace.h> #include <asm/processor.h> -#include <asm/system.h> -#include <asm/io.h> -#include <asm/atomic.h> #include <asm/debugreg.h> +#include <asm/atomic.h> +#include <asm/system.h> +#include <asm/unwind.h> #include <asm/desc.h> #include <asm/i387.h> #include <asm/nmi.h> -#include <asm/unwind.h> #include <asm/smp.h> -#include <asm/arch_hooks.h> -#include <linux/kdebug.h> -#include <asm/stacktrace.h> - -#include <linux/module.h> +#include <asm/io.h> #include "mach_traps.h" @@ -71,7 +70,7 @@ EXPORT_SYMBOL_GPL(used_vectors); asmlinkage int system_call(void); /* Do we ignore FPU interrupts ? */ -char ignore_fpu_irq = 0; +char ignore_fpu_irq; #ifndef CONFIG_X86_NO_IDT /* @@ -113,12 +112,13 @@ static unsigned int code_bytes = 64; void printk_address(unsigned long address, int reliable) { #ifdef CONFIG_KALLSYMS - unsigned long offset = 0, symsize; + char namebuf[KSYM_NAME_LEN]; + unsigned long offset = 0; + unsigned long symsize; const char *symname; - char *modname; - char *delim = ":"; - char namebuf[128]; char reliab[4] = ""; + char *delim = ":"; + char *modname; symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf); @@ -146,13 +146,14 @@ static inline int valid_stack_ptr(struct /* The form of the top of the frame on the stack */ struct stack_frame { - struct stack_frame *next_frame; - unsigned long return_address; + struct stack_frame *next_frame; + unsigned long return_address; }; -static inline unsigned long print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data) +static inline unsigned long +print_context_stack(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data) { struct stack_frame *frame = (struct stack_frame *)bp; @@ -174,7 +175,7 @@ static inline unsigned long print_contex return bp; } -#define MSG(msg) ops->warning(data, msg) +#define MSG(msg) ops->warning(data, msg) void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, @@ -185,6 +186,7 @@ void dump_trace(struct task_struct *task if (!stack) { unsigned long dummy; + stack = &dummy; if (task != current) stack = (unsigned long *)task->thread.sp; @@ -194,7 +196,7 @@ void dump_trace(struct task_struct *task if (!bp) { if (task == current) { /* Grab bp right from our regs */ - asm ("movl %%ebp, %0" : "=r" (bp) : ); + asm("movl %%ebp, %0" : 
"=r" (bp) :); } else { /* bp is the last reg pushed by switch_to */ bp = *(unsigned long *) task->thread.sp; @@ -204,15 +206,18 @@ void dump_trace(struct task_struct *task while (1) { struct thread_info *context; + context = (struct thread_info *) ((unsigned long)stack & (~(THREAD_SIZE - 1))); bp = print_context_stack(context, stack, bp, ops, data); - /* Should be after the line below, but somewhere - in early boot context comes out corrupted and we - can't reference it -AK */ + /* + * Should be after the line below, but somewhere + * in early boot context comes out corrupted and we + * can't reference it: + */ if (ops->stack(data, "IRQ") < 0) break; - stack = (unsigned long*)context->previous_esp; + stack = (unsigned long *)context->previous_esp; if (!stack) break; touch_nmi_watchdog(); @@ -251,15 +256,15 @@ static void print_trace_address(void *da } static const struct stacktrace_ops print_trace_ops = { - .warning = print_trace_warning, - .warning_symbol = print_trace_warning_symbol, - .stack = print_trace_stack, - .address = print_trace_address, + .warning = print_trace_warning, + .warning_symbol = print_trace_warning_symbol, + .stack = print_trace_stack, + .address = print_trace_address, }; static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl) + unsigned long *stack, unsigned long bp, char *log_lvl) { dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); printk("%s =======================\n", log_lvl); @@ -271,21 +276,22 @@ void show_trace(struct task_struct *task show_trace_log_lvl(task, regs, stack, bp, ""); } -static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) +static void +show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, unsigned long bp, char *log_lvl) { unsigned long *stack; int i; if (sp == NULL) { if (task) - sp = (unsigned long*)task->thread.sp; + sp = (unsigned long *)task->thread.sp; else sp = (unsigned long *)&sp; } stack = sp; - for(i = 0; i < kstack_depth_to_print; i++) { + for (i = 0; i < kstack_depth_to_print; i++) { if (kstack_end(stack)) break; if (i && ((i % 8) == 0)) @@ -293,6 +299,7 @@ static void show_stack_log_lvl(struct ta printk("%08lx ", *stack++); } printk("\n%sCall Trace:\n", log_lvl); + show_trace_log_lvl(task, regs, sp, bp, log_lvl); } @@ -307,8 +314,8 @@ void show_stack(struct task_struct *task */ void dump_stack(void) { - unsigned long stack; unsigned long bp = 0; + unsigned long stack; #ifdef CONFIG_FRAME_POINTER if (!bp) @@ -320,6 +327,7 @@ void dump_stack(void) init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); + show_trace(current, NULL, &stack, bp); } @@ -331,6 +339,7 @@ void show_registers(struct pt_regs *regs print_modules(); __show_registers(regs, 0); + printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", TASK_COMM_LEN, current->comm, task_pid_nr(current), current_thread_info(), current, task_thread_info(current)); @@ -339,10 +348,10 @@ void show_registers(struct pt_regs *regs * time of the fault.. 
*/ if (!user_mode_vm(regs)) { - u8 *ip; unsigned int code_prologue = code_bytes * 43 / 64; unsigned int code_len = code_bytes; unsigned char c; + u8 *ip; printk("\n" KERN_EMERG "Stack: "); show_stack_log_lvl(NULL, regs, ®s->sp, 0, KERN_EMERG); @@ -369,7 +378,7 @@ void show_registers(struct pt_regs *regs } } printk("\n"); -} +} int is_valid_bugaddr(unsigned long ip) { @@ -385,10 +394,10 @@ int is_valid_bugaddr(unsigned long ip) static int die_counter; -int __kprobes __die(const char * str, struct pt_regs * regs, long err) +int __kprobes __die(const char *str, struct pt_regs *regs, long err) { - unsigned long sp; unsigned short ss; + unsigned long sp; printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); #ifdef CONFIG_PREEMPT @@ -403,8 +412,8 @@ int __kprobes __die(const char * str, st printk("\n"); if (notify_die(DIE_OOPS, str, regs, err, - current->thread.trap_no, SIGSEGV) != - NOTIFY_STOP) { + current->thread.trap_no, SIGSEGV) != NOTIFY_STOP) { + show_registers(regs); /* Executive summary in case the oops scrolled away */ sp = (unsigned long) (®s->sp); @@ -416,17 +425,18 @@ int __kprobes __die(const char * str, st printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); print_symbol("%s", regs->ip); printk(" SS:ESP %04x:%08lx\n", ss, sp); + return 0; - } else { - return 1; } + + return 1; } /* - * This is gone through when something in the kernel has done something bad and - * is about to be terminated. + * This is gone through when something in the kernel has done something bad + * and is about to be terminated: */ -void die(const char * str, struct pt_regs * regs, long err) +void die(const char *str, struct pt_regs *regs, long err) { static struct { raw_spinlock_t lock; @@ -448,8 +458,9 @@ void die(const char * str, struct pt_reg die.lock_owner = smp_processor_id(); die.lock_owner_depth = 0; bust_spinlocks(1); - } else + } else { raw_local_irq_save(flags); + } if (++die.lock_owner_depth < 3) { report_bug(regs->ip, regs); @@ -482,19 +493,20 @@ void die(const char * str, struct pt_reg do_exit(SIGSEGV); } -static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err) +static inline void +die_if_kernel(const char *str, struct pt_regs *regs, long err) { if (!user_mode_vm(regs)) die(str, regs, err); } -static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86, - struct pt_regs * regs, long error_code, - siginfo_t *info) +static void __kprobes +do_trap(int trapnr, int signr, char *str, int vm86, struct pt_regs *regs, + long error_code, siginfo_t *info) { struct task_struct *tsk = current; - if (regs->flags & VM_MASK) { + if (regs->flags & X86_VM_MASK) { if (vm86) goto vm86_trap; goto trap_signal; @@ -503,109 +515,112 @@ static void __kprobes do_trap(int trapnr if (!user_mode(regs)) goto kernel_trap; - trap_signal: { - /* - * We want error_code and trap_no set for userspace faults and - * kernelspace faults which result in die(), but not - * kernelspace faults which are fixed up. die() gives the - * process no chance to handle the signal and notice the - * kernel fault information, so that won't result in polluting - * the information about previously queued, but not yet - * delivered, faults. See also do_general_protection below. - */ - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; +trap_signal: + /* + * We want error_code and trap_no set for userspace faults and + * kernelspace faults which result in die(), but not + * kernelspace faults which are fixed up. 
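+	 * ("Fixed up" means the faulting instruction matched an entry in
+	 * the exception table, so fixup_exception() in the kernel_trap path
+	 * below redirects execution instead of raising a signal.)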
die() gives the + * process no chance to handle the signal and notice the + * kernel fault information, so that won't result in polluting + * the information about previously queued, but not yet + * delivered, faults. See also do_general_protection below. + */ + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; - if (info) - force_sig_info(signr, info, tsk); - else - force_sig(signr, tsk); - return; - } + if (info) + force_sig_info(signr, info, tsk); + else + force_sig(signr, tsk); + return; - kernel_trap: { - if (!fixup_exception(regs)) { - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; - die(str, regs, error_code); - } - return; +kernel_trap: + if (!fixup_exception(regs)) { + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; + die(str, regs, error_code); } + return; - vm86_trap: { - int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr); - if (ret) goto trap_signal; - return; - } +vm86_trap: + if (handle_vm86_trap((struct kernel_vm86_regs *) regs, + error_code, trapnr)) + goto trap_signal; + return; } -#define DO_ERROR(trapnr, signr, str, name) \ -void do_##name(struct pt_regs * regs, long error_code) \ -{ \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ -} - -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \ -void do_##name(struct pt_regs * regs, long error_code) \ -{ \ - siginfo_t info; \ - if (irq) \ - local_irq_enable(); \ - info.si_signo = signr; \ - info.si_errno = 0; \ - info.si_code = sicode; \ - info.si_addr = (void __user *)siaddr; \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - do_trap(trapnr, signr, str, 0, regs, error_code, &info); \ -} - -#define DO_VM86_ERROR(trapnr, signr, str, name) \ -void do_##name(struct pt_regs * regs, long error_code) \ -{ \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ -} - -#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ -void do_##name(struct pt_regs * regs, long error_code) \ -{ \ - siginfo_t info; \ - info.si_signo = signr; \ - info.si_errno = 0; \ - info.si_code = sicode; \ - info.si_addr = (void __user *)siaddr; \ - trace_hardirqs_fixup(); \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ +#define DO_ERROR(trapnr, signr, str, name) \ +void do_##name(struct pt_regs *regs, long error_code) \ +{ \ + trace_hardirqs_fixup(); \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ + do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ +} + +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \ +void do_##name(struct pt_regs *regs, long error_code) \ +{ \ + siginfo_t info; \ + if (irq) \ + local_irq_enable(); \ + info.si_signo = signr; \ + info.si_errno = 0; \ + info.si_code = sicode; \ + info.si_addr = (void __user *)siaddr; \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ + do_trap(trapnr, signr, str, 0, regs, error_code, &info); \ +} + +#define DO_VM86_ERROR(trapnr, signr, str, name) \ +void do_##name(struct pt_regs *regs, long error_code) \ +{ \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == 
NOTIFY_STOP) \ + return; \ + do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ +} + +#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ +void do_##name(struct pt_regs *regs, long error_code) \ +{ \ + siginfo_t info; \ + info.si_signo = signr; \ + info.si_errno = 0; \ + info.si_code = sicode; \ + info.si_addr = (void __user *)siaddr; \ + trace_hardirqs_fixup(); \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ + do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ } -DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) +DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) #ifndef CONFIG_KPROBES -DO_VM86_ERROR( 3, SIGTRAP, "int3", int3) +DO_VM86_ERROR(3, SIGTRAP, "int3", int3) #endif -DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow) -DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) -DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) +DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow) +DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds) +DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) +DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) DO_ERROR(12, SIGBUS, "stack segment", stack_segment) DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) -DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1) +DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1) void __kprobes do_general_protection(struct pt_regs * regs, long error_code) { - if (regs->flags & VM_MASK) + struct thread_struct *thread; + + thread = ¤t->thread; + + if (regs->flags & X86_VM_MASK) goto gp_in_vm86; if (!user_mode(regs)) @@ -613,6 +628,7 @@ void __kprobes do_general_protection(str current->thread.error_code = error_code; current->thread.trap_no = 13; + if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) && printk_ratelimit()) { printk(KERN_INFO @@ -642,22 +658,25 @@ gp_in_kernel: } } -static __kprobes void -mem_parity_error(unsigned char reason, struct pt_regs * regs) +static notrace __kprobes void +mem_parity_error(unsigned char reason, struct pt_regs *regs) { - printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on " - "CPU %d.\n", reason, smp_processor_id()); - printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); + printk(KERN_EMERG + "Uhhuh. 
NMI received for unknown reason %02x on CPU %d.\n", + reason, smp_processor_id()); + + printk(KERN_EMERG + "You have some hardware problem, likely on the PCI bus.\n"); #if defined(CONFIG_EDAC) - if(edac_handler_set()) { + if (edac_handler_set()) { edac_atomic_assert_error(); return; } #endif if (panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); + panic("NMI: Not continuing"); printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); @@ -665,8 +684,8 @@ mem_parity_error(unsigned char reason, s clear_mem_error(reason); } -static __kprobes void -io_check_error(unsigned char reason, struct pt_regs * regs) +static notrace __kprobes void +io_check_error(unsigned char reason, struct pt_regs *regs) { printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); show_registers(regs); @@ -675,38 +694,43 @@ io_check_error(unsigned char reason, str clear_io_check_error(reason); } -static __kprobes void -unknown_nmi_error(unsigned char reason, struct pt_regs * regs) +static notrace __kprobes void +unknown_nmi_error(unsigned char reason, struct pt_regs *regs) { + if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) + return; #ifdef CONFIG_MCA - /* Might actually be able to figure out what the guilty party - * is. */ - if( MCA_bus ) { + /* + * Might actually be able to figure out what the guilty party + * is: + */ + if (MCA_bus) { mca_handle_nmi(); return; } #endif - printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on " - "CPU %d.\n", reason, smp_processor_id()); + printk(KERN_EMERG + "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", + reason, smp_processor_id()); + printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); if (panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); + panic("NMI: Not continuing"); printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); } static DEFINE_SPINLOCK(nmi_print_lock); -void __kprobes die_nmi(struct pt_regs *regs, const char *msg) +void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg) { - if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == - NOTIFY_STOP) + if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == NOTIFY_STOP) return; spin_lock(&nmi_print_lock); /* * We are in trouble anyway, lets at least try - * to get a message out. + * to get a message out: */ bust_spinlocks(1); printk(KERN_EMERG "%s", msg); @@ -717,9 +741,10 @@ void __kprobes die_nmi(struct pt_regs *r spin_unlock(&nmi_print_lock); bust_spinlocks(0); - /* If we are in kernel we are probably nested up pretty bad - * and might aswell get out now while we still can. - */ + /* + * If we are in kernel we are probably nested up pretty bad + * and might as well get out now while we still can: + */ if (!user_mode_vm(regs)) { current->thread.trap_no = 2; crash_kexec(regs); @@ -728,14 +753,14 @@ void __kprobes die_nmi(struct pt_regs *r do_exit(SIGSEGV); } -static __kprobes void default_do_nmi(struct pt_regs * regs) +static notrace __kprobes void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; - /* Only the BSP gets external NMIs from the system. 
*/ + /* Only the BSP gets external NMIs from the system: */ if (!smp_processor_id()) reason = get_nmi_reason(); - + if (!(reason & 0xc0)) { if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) == NOTIFY_STOP) @@ -748,8 +773,10 @@ static __kprobes void default_do_nmi(str if (nmi_watchdog_tick(regs, reason)) return; if (!do_nmi_callback(regs, smp_processor_id())) -#endif unknown_nmi_error(reason, regs); +#else + unknown_nmi_error(reason, regs); +#endif return; } @@ -761,14 +788,14 @@ static __kprobes void default_do_nmi(str io_check_error(reason, regs); /* * Reassert NMI in case it became active meanwhile - * as it's edge-triggered. + * as it's edge-triggered: */ reassert_nmi(); } static int ignore_nmis; -__kprobes void do_nmi(struct pt_regs * regs, long error_code) +notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) { int cpu; @@ -804,9 +831,12 @@ void __kprobes do_int3(struct pt_regs *r if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) return; - /* This is an interrupt gate, because kprobes wants interrupts - disabled. Normal trap handlers don't. */ + /* + * This is an interrupt gate, because kprobes wants interrupts + * disabled. Normal trap handlers don't. + */ restore_interrupts(regs); + do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL); } #endif @@ -821,7 +851,7 @@ void __kprobes do_int3(struct pt_regs *r * from user space. Such code must not hold kernel locks (since it * can equally take a page fault), therefore it is safe to call * force_sig_info even though that claims and releases locks. - * + * * Code in ./signal.c ensures that the debug control register * is restored before we deliver any signal, and therefore that * user code runs with the correct debug control register even though @@ -833,10 +863,10 @@ void __kprobes do_int3(struct pt_regs *r * find every occurrence of the TF bit that could be saved away even * by user code) */ -void __kprobes do_debug(struct pt_regs * regs, long error_code) +void __kprobes do_debug(struct pt_regs *regs, long error_code) { - unsigned int condition; struct task_struct *tsk = current; + unsigned int condition; trace_hardirqs_fixup(); @@ -861,7 +891,7 @@ void __kprobes do_debug(struct pt_regs * goto clear_dr7; } - if (regs->flags & VM_MASK) + if (regs->flags & X86_VM_MASK) goto debug_vm86; /* Save debug status register where ptrace can see it */ @@ -884,7 +914,8 @@ void __kprobes do_debug(struct pt_regs * /* Ok, finally something we can handle */ send_sigtrap(tsk, regs, error_code); - /* Disable additional traps. They'll be re-enabled when + /* + * Disable additional traps. They'll be re-enabled when * the signal is delivered. */ clear_dr7: @@ -897,7 +928,7 @@ debug_vm86: clear_TF_reenable: set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - regs->flags &= ~TF_MASK; + regs->flags &= ~X86_EFLAGS_TF; return; } @@ -908,9 +939,10 @@ clear_TF_reenable: */ void math_error(void __user *ip) { - struct task_struct * task; + struct task_struct *task; + unsigned short cwd; + unsigned short swd; siginfo_t info; - unsigned short cwd, swd; /* * Save the info for the exception handler and clear the error. 
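
(The hunk below only re-indents the exception-classification switch in math_error(); the logic is unchanged. As a standalone illustration of that classification, here is a sketch that compiles on its own; the helper name fpu_exception_name() and the test values are invented for this example, not taken from the kernel:)

#include <stdio.h>

/*
 * Sketch of the classification used by math_error(): 'swd & ~cwd & 0x3f'
 * keeps only x87 exceptions that are both raised in the status word and
 * unmasked in the control word.
 */
static const char *fpu_exception_name(unsigned short cwd, unsigned short swd)
{
	switch (swd & ~cwd & 0x3f) {
	case 0x000:
		return "no unmasked exception";
	case 0x001:	/* invalid operation */
		return "FPE_FLTINV";
	case 0x002:	/* denormalized operand */
	case 0x010:	/* underflow */
		return "FPE_FLTUND";
	case 0x004:	/* zero divide */
		return "FPE_FLTDIV";
	case 0x008:	/* overflow */
		return "FPE_FLTOVF";
	case 0x020:	/* precision loss */
		return "FPE_FLTRES";
	default:	/* multiple exceptions at once */
		return "multiple exceptions";
	}
}

int main(void)
{
	/* Example: overflow (bit 3) raised, nothing masked by the cwd. */
	printf("%s\n", fpu_exception_name(0x0000, 0x0008));
	return 0;
}
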
@@ -936,36 +968,36 @@ void math_error(void __user *ip) cwd = get_fpu_cwd(task); swd = get_fpu_swd(task); switch (swd & ~cwd & 0x3f) { - case 0x000: /* No unmasked exception */ - return; - default: /* Multiple exceptions */ - break; - case 0x001: /* Invalid Op */ - /* - * swd & 0x240 == 0x040: Stack Underflow - * swd & 0x240 == 0x240: Stack Overflow - * User must clear the SF bit (0x40) if set - */ - info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ - info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ - info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ - info.si_code = FPE_FLTRES; - break; + case 0x000: /* No unmasked exception */ + return; + default: /* Multiple exceptions */ + break; + case 0x001: /* Invalid Op */ + /* + * swd & 0x240 == 0x040: Stack Underflow + * swd & 0x240 == 0x240: Stack Overflow + * User must clear the SF bit (0x40) if set + */ + info.si_code = FPE_FLTINV; + break; + case 0x002: /* Denormalize */ + case 0x010: /* Underflow */ + info.si_code = FPE_FLTUND; + break; + case 0x004: /* Zero Divide */ + info.si_code = FPE_FLTDIV; + break; + case 0x008: /* Overflow */ + info.si_code = FPE_FLTOVF; + break; + case 0x020: /* Precision */ + info.si_code = FPE_FLTRES; + break; } force_sig_info(SIGFPE, &info, task); } -void do_coprocessor_error(struct pt_regs * regs, long error_code) +void do_coprocessor_error(struct pt_regs *regs, long error_code) { ignore_fpu_irq = 1; math_error((void __user *)regs->ip); @@ -973,9 +1005,9 @@ void do_coprocessor_error(struct pt_regs static void simd_math_error(void __user *ip) { - struct task_struct * task; - siginfo_t info; + struct task_struct *task; unsigned short mxcsr; + siginfo_t info; /* * Save the info for the exception handler and clear the error. @@ -996,84 +1028,82 @@ static void simd_math_error(void __user */ mxcsr = get_fpu_mxcsr(task); switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { - case 0x000: - default: - break; - case 0x001: /* Invalid Op */ - info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ - info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ - info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ - info.si_code = FPE_FLTRES; - break; + case 0x000: + default: + break; + case 0x001: /* Invalid Op */ + info.si_code = FPE_FLTINV; + break; + case 0x002: /* Denormalize */ + case 0x010: /* Underflow */ + info.si_code = FPE_FLTUND; + break; + case 0x004: /* Zero Divide */ + info.si_code = FPE_FLTDIV; + break; + case 0x008: /* Overflow */ + info.si_code = FPE_FLTOVF; + break; + case 0x020: /* Precision */ + info.si_code = FPE_FLTRES; + break; } force_sig_info(SIGFPE, &info, task); } -void do_simd_coprocessor_error(struct pt_regs * regs, - long error_code) +void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) { if (cpu_has_xmm) { /* Handle SIMD FPU exceptions on PIII+ processors. */ ignore_fpu_irq = 1; simd_math_error((void __user *)regs->ip); - } else { - /* - * Handle strange cache flush from user space exception - * in all other cases. This is undocumented behaviour. 
- */ - if (regs->flags & VM_MASK) { - handle_vm86_fault((struct kernel_vm86_regs *)regs, - error_code); - return; - } - current->thread.trap_no = 19; - current->thread.error_code = error_code; - die_if_kernel("cache flush denied", regs, error_code); - force_sig(SIGSEGV, current); + return; + } + /* + * Handle strange cache flush from user space exception + * in all other cases. This is undocumented behaviour. + */ + if (regs->flags & X86_VM_MASK) { + handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code); + return; } + current->thread.trap_no = 19; + current->thread.error_code = error_code; + die_if_kernel("cache flush denied", regs, error_code); + force_sig(SIGSEGV, current); } #ifndef CONFIG_XEN -void do_spurious_interrupt_bug(struct pt_regs * regs, - long error_code) +void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) { #if 0 /* No need to warn about this any longer. */ - printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); + printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); #endif } -unsigned long patch_espfix_desc(unsigned long uesp, - unsigned long kesp) +unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) { struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt; unsigned long base = (kesp - uesp) & -THREAD_SIZE; unsigned long new_kesp = kesp - base; unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS]; + /* Set up base for espfix segment */ - desc &= 0x00f0ff0000000000ULL; - desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) | + desc &= 0x00f0ff0000000000ULL; + desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) | ((((__u64)base) << 32) & 0xff00000000000000ULL) | ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) | (lim_pages & 0xffff); *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc; + return new_kesp; } #endif /* - * 'math_state_restore()' saves the current math information in the + * 'math_state_restore()' saves the current math information in the * old math state array, and gets the new ones from the current task * * Careful.. There are problems with IBM-designed IRQ13 behaviour. @@ -1087,9 +1117,22 @@ asmlinkage void math_state_restore(void) struct thread_info *thread = current_thread_info(); struct task_struct *tsk = thread->task; + if (!tsk_used_math(tsk)) { + local_irq_enable(); + /* + * does a slab alloc which can sleep + */ + if (init_fpu(tsk)) { + /* + * ran out of memory! + */ + do_group_exit(SIGKILL); + return; + } + local_irq_disable(); + } + /* NB. 'clts' is done for us by Xen during virtual trap. */ - if (!tsk_used_math(tsk)) - init_fpu(tsk); restore_fpu(tsk); thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ tsk->fpu_counter++; @@ -1100,15 +1143,15 @@ EXPORT_SYMBOL_GPL(math_state_restore); asmlinkage void math_emulate(long arg) { - printk(KERN_EMERG "math-emulation not enabled and no coprocessor found.\n"); - printk(KERN_EMERG "killing %s.\n",current->comm); - force_sig(SIGFPE,current); + printk(KERN_EMERG + "math-emulation not enabled and no coprocessor found.\n"); + printk(KERN_EMERG "killing %s.\n", current->comm); + force_sig(SIGFPE, current); schedule(); } #endif /* CONFIG_MATH_EMULATION */ - /* * NB. All these are "trap gates" (i.e. events_mask isn't set) except * for those that specify <dpl>|4 in the second field. 
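The trap-gate note above refers to the trap_info entries that trap_init() hands to HYPERVISOR_set_trap_table in the next hunk: bits 0-1 of the second field carry the handler's DPL, and OR-ing in 4 asks Xen to enter the handler with event delivery (virtual interrupts) masked. A minimal sketch of the table's shape, assuming the field layout of the Xen public ABI; the selector and handler addresses below are dummies:

	#include <stdint.h>
	#include <stdio.h>

	/* Mirrors the Xen ABI's struct trap_info; illustrative only. */
	struct trap_info {
		uint8_t  vector;       /* exception vector number */
		uint8_t  flags;        /* bits 0-1: DPL; bit 2 (|4): mask events */
		uint16_t cs;           /* code segment selector (dummy here) */
		unsigned long address; /* handler entry point (dummy here) */
	};

	int main(void)
	{
		static const struct trap_info table[] = {
			{  1, 0 | 4, 0x10, 0x1000 }, /* debug: DPL 0, events masked */
			{  3, 3 | 4, 0x10, 0x2000 }, /* int3: DPL 3, events masked */
			{ 14, 0 | 4, 0x10, 0x3000 }, /* page fault: events masked */
			{  0, 0, 0, 0 }              /* zero terminator */
		};
		printf("%zu entries\n", sizeof(table) / sizeof(table[0]) - 1);
		return 0;
	}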
@@ -1146,25 +1189,21 @@ void __init trap_init(void) if (ret) printk("HYPERVISOR_set_trap_table failed: error %d\n", ret); - /* - * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned. - * Generate a build-time error if the alignment is wrong. - */ - BUILD_BUG_ON(offsetof(struct task_struct, thread.i387.fxsave) & 15); if (cpu_has_fxsr) { printk(KERN_INFO "Enabling fast FPU save and restore... "); set_in_cr4(X86_CR4_OSFXSR); printk("done.\n"); } if (cpu_has_xmm) { - printk(KERN_INFO "Enabling unmasked SIMD FPU exception " - "support... "); + printk(KERN_INFO + "Enabling unmasked SIMD FPU exception support... "); set_in_cr4(X86_CR4_OSXMMEXCPT); printk("done.\n"); } + init_thread_xstate(); /* - * Should be a barrier for any external CPU state. + * Should be a barrier for any external CPU state: */ cpu_init(); } @@ -1183,6 +1222,7 @@ void __cpuinit smp_trap_init(trap_info_t static int __init kstack_setup(char *s) { kstack_depth_to_print = simple_strtoul(s, NULL, 0); + return 1; } __setup("kstack=", kstack_setup); --- head-2011-03-11.orig/arch/x86/kernel/traps_64-xen.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/traps_64-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -33,6 +33,8 @@ #include <linux/kdebug.h> #include <linux/utsname.h> +#include <mach_traps.h> + #if defined(CONFIG_EDAC) #include <linux/edac.h> #endif @@ -601,10 +603,16 @@ void die(const char * str, struct pt_reg } #if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_SYSCTL) -void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) +notrace __kprobes void +die_nmi(char *str, struct pt_regs *regs, int do_panic) { - unsigned long flags = oops_begin(); + unsigned long flags; + + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == + NOTIFY_STOP) + return; + flags = oops_begin(); /* * We are in trouble anyway, lets at least try * to get a message out. @@ -769,7 +777,7 @@ asmlinkage void __kprobes do_general_pro die("general protection fault", regs, error_code); } -static __kprobes void +static notrace __kprobes void mem_parity_error(unsigned char reason, struct pt_regs * regs) { printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", @@ -792,7 +800,7 @@ mem_parity_error(unsigned char reason, s clear_mem_error(reason); } -static __kprobes void +static notrace __kprobes void io_check_error(unsigned char reason, struct pt_regs * regs) { printk("NMI: IOCK error (debug interrupt?)\n"); @@ -802,9 +810,11 @@ io_check_error(unsigned char reason, str clear_io_check_error(reason); } -static __kprobes void +static notrace __kprobes void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) { + if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) + return; printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", reason); printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); @@ -817,7 +827,7 @@ unknown_nmi_error(unsigned char reason, /* Runs on IST stack. This code must keep interrupts off all the time. Nested NMIs are prevented by the CPU. */ -asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs) +asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; int cpu; @@ -1117,11 +1127,25 @@ asmlinkage void __attribute__((weak)) mc asmlinkage void math_state_restore(void) { struct task_struct *me = current; + + if (!used_math()) { + local_irq_enable(); + /* + * does a slab alloc which can sleep + */ + if (init_fpu(me)) { + /* + * ran out of memory! 
+ */ + do_group_exit(SIGKILL); + return; + } + local_irq_disable(); + } + /* clts(); */ /* 'clts' is done for us by Xen during virtual trap. */ - if (!used_math()) - init_fpu(me); - restore_fpu_checking(&me->thread.i387.fxsave); + restore_fpu_checking(&me->thread.xstate->fxsave); task_thread_info(me)->status |= TS_USEDFPU; me->fpu_counter++; } @@ -1168,6 +1192,10 @@ void __init trap_init(void) printk("HYPERVISOR_set_trap_table failed: error %d\n", ret); /* + * initialize the per thread extended state: + */ + init_thread_xstate(); + /* * Should be a barrier for any external CPU state. */ cpu_init(); --- head-2011-03-11.orig/arch/x86/kernel/vsyscall_64-xen.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/vsyscall_64-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -216,7 +216,7 @@ vgetcpu(unsigned *cpu, unsigned *node, s return 0; } -long __vsyscall(3) venosys_1(void) +static long __vsyscall(3) venosys_1(void) { return -ENOSYS; } --- head-2011-03-11.orig/arch/x86/mach-xen/setup.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/mach-xen/setup.c 2011-01-31 18:07:35.000000000 +0100 @@ -56,11 +56,7 @@ char * __init machine_specific_memory_se { int rc; struct xen_memory_map memmap; - /* - * This is rather large for a stack variable but this early in - * the boot process we know we have plenty slack space. - */ - struct e820entry map[E820MAX]; + static struct e820entry __initdata map[E820MAX]; memmap.nr_entries = E820MAX; set_xen_guest_handle(memmap.buffer, map); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ head-2011-03-11/arch/x86/mm/dump_pagetables-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -0,0 +1,371 @@ +/* + * Debug helper to dump the current kernel pagetables of the system + * so that we can see what the various memory ranges are set to. + * + * (C) Copyright 2008 Intel Corporation + * + * Author: Arjan van de Ven <arjan@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/debugfs.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/seq_file.h> + +#include <xen/interface/xen.h> + +#include <asm/pgtable.h> + +/* + * The dumper groups pagetable entries of the same type into one, and for + * that it needs to keep some state when walking, and flush this state + * when a "break" in the continuity is found. 
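+ * A "break" is, for example, a run of read-write kernel PTEs followed by a read-only one, or stepping across an address-space marker such as the start of the vmalloc area.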
+ */ +struct pg_state { + int level; + pgprot_t current_prot; + unsigned long start_address; + unsigned long current_address; + const struct addr_marker *marker; +}; + +struct addr_marker { + unsigned long start_address; + const char *name; +}; + +/* Address space marker hints */ +static struct addr_marker address_markers[] = { + { 0, "User Space" }, +#ifdef CONFIG_X86_64 + { HYPERVISOR_VIRT_START, "Hypervisor Space" }, + { HYPERVISOR_VIRT_END, "Low Kernel Mapping" }, + { VMALLOC_START, "vmalloc() Area" }, + { VMEMMAP_START, "Vmemmap" }, + { __START_KERNEL_map, "High Kernel Mapping" }, + { MODULES_VADDR, "Modules" }, + { MODULES_END, "End Modules" }, +#else + { PAGE_OFFSET, "Kernel Mapping" }, + { 0/* VMALLOC_START */, "vmalloc() Area" }, + { 0/*VMALLOC_END*/, "vmalloc() End" }, +# ifdef CONFIG_HIGHMEM + { 0/*PKMAP_BASE*/, "Persistent kmap() Area" }, +# endif + { 0/*FIXADDR_START*/, "Fixmap Area" }, + { 0/*HYPERVISOR_VIRT_START*/, "Hypervisor Space" }, +#endif + { -1, NULL } /* End of list */ +}; + +static inline bool hypervisor_space(unsigned long addr) { +#ifdef CONFIG_X86_64 + return addr >= HYPERVISOR_VIRT_START && addr < HYPERVISOR_VIRT_END; +#else + return addr >= hypervisor_virt_start; +#endif +} + +/* Multipliers for offsets within the PTEs */ +#define PTE_LEVEL_MULT (PAGE_SIZE) +#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT) +#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT) +#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT) + +/* + * Print a readable form of a pgprot_t to the seq_file + */ +static void printk_prot(struct seq_file *m, pgprot_t prot, int level) +{ + pgprotval_t pr = pgprot_val(prot); + static const char * const level_name[] = + { "cr3", "pgd", "pud", "pmd", "pte" }; + + if (!pgprot_val(prot)) { + /* Not present */ + seq_printf(m, " "); + } else { + if (pr & _PAGE_USER) + seq_printf(m, "USR "); + else + seq_printf(m, " "); + if (pr & _PAGE_RW) + seq_printf(m, "RW "); + else + seq_printf(m, "ro "); + if (pr & _PAGE_PWT) + seq_printf(m, "PWT "); + else + seq_printf(m, " "); + if (pr & _PAGE_PCD) + seq_printf(m, "PCD "); + else + seq_printf(m, " "); + + /* Bit 9 has a different meaning on level 3 vs 4 */ + if (level <= 3) { + if (pr & _PAGE_PSE) + seq_printf(m, "PSE "); + else + seq_printf(m, " "); + } else { + if (pr & _PAGE_PAT) + seq_printf(m, "pat "); + else + seq_printf(m, " "); + } + if (pr & _PAGE_GLOBAL) + seq_printf(m, "GLB "); + else + seq_printf(m, " "); + if (pr & _PAGE_NX) + seq_printf(m, "NX "); + else + seq_printf(m, "x "); + } + seq_printf(m, "%s\n", level_name[level]); +} + +/* + * On 64 bits, sign-extend the 48 bit address to 64 bit + */ +static unsigned long normalize_addr(unsigned long u) +{ +#ifdef CONFIG_X86_64 + return (signed long)(u << 16) >> 16; +#else + return u; +#endif +} + +/* + * This function gets called on a break in a continuous series + * of PTE entries; the next one is different so we need to + * print what we collected so far. + */ +static void note_page(struct seq_file *m, struct pg_state *st, + pgprot_t new_prot, int level) +{ + pgprotval_t prot, cur; + static const char units[] = "KMGTPE"; + + /* + * If we have a "break" in the series, we need to flush the state that + * we have now. "break" is either changing perms, levels or + * address space marker.
+ */ + prot = pgprot_val(new_prot) & ~(PTE_MASK); + cur = pgprot_val(st->current_prot) & ~(PTE_MASK); + + if (!st->level) { + /* First entry */ + st->current_prot = new_prot; + st->level = level; + st->marker = address_markers; + seq_printf(m, "---[ %s ]---\n", st->marker->name); + } else if (prot != cur || level != st->level || + st->current_address >= st->marker[1].start_address) { + const char *unit = units; + unsigned long delta; + + /* + * Now print the actual finished series + */ + seq_printf(m, "0x%p-0x%p ", + (void *)st->start_address, + (void *)st->current_address); + + delta = (st->current_address - st->start_address) >> 10; + while (!(delta & 1023) && unit[1]) { + delta >>= 10; + unit++; + } + seq_printf(m, "%9lu%c ", delta, *unit); + printk_prot(m, st->current_prot, st->level); + + /* + * We print markers for special areas of address space, + * such as the start of vmalloc space etc. + * This helps in the interpretation. + */ + if (st->current_address >= st->marker[1].start_address) { + st->marker++; + seq_printf(m, "---[ %s ]---\n", st->marker->name); + } + + st->start_address = st->current_address; + st->current_prot = new_prot; + st->level = level; + } +} + +static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, + unsigned long P) +{ + int i; + pte_t *start; + + start = (pte_t *) pmd_page_vaddr(addr); + for (i = 0; i < PTRS_PER_PTE; i++) { + pgprot_t prot = pte_pgprot(*start); + + st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT); + note_page(m, st, prot, 4); + start++; + } +} + +#if PTRS_PER_PMD > 1 + +static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, + unsigned long P) +{ + int i; + pmd_t *start; + + start = (pmd_t *) pud_page_vaddr(addr); + for (i = 0; i < PTRS_PER_PMD; i++) { + st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); + if (!hypervisor_space(st->current_address) + && !pmd_none(*start)) { + pgprotval_t prot = __pmd_val(*start) & ~PTE_MASK; + + if (pmd_large(*start) || !pmd_present(*start)) + note_page(m, st, __pgprot(prot), 3); + else + walk_pte_level(m, st, *start, + P + i * PMD_LEVEL_MULT); + } else + note_page(m, st, __pgprot(0), 3); + start++; + } +} + +#else +#define walk_pmd_level(m,s,a,p) walk_pte_level(m,s,__pmd(pud_val(a)),p) +#define pud_large(a) pmd_large(__pmd(pud_val(a))) +#define pud_none(a) pmd_none(__pmd(pud_val(a))) +#endif + +#if PTRS_PER_PUD > 1 + +static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, + unsigned long P) +{ + int i; + pud_t *start; + + start = (pud_t *) pgd_page_vaddr(addr); + + for (i = 0; i < PTRS_PER_PUD; i++) { + st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); + if (!hypervisor_space(st->current_address) + && !pud_none(*start)) { + pgprotval_t prot = __pud_val(*start) & ~PTE_MASK; + + if (pud_large(*start) || !pud_present(*start)) + note_page(m, st, __pgprot(prot), 2); + else + walk_pmd_level(m, st, *start, + P + i * PUD_LEVEL_MULT); + } else + note_page(m, st, __pgprot(0), 2); + + start++; + } +} + +#else +#define __pud_ma(x) ((pud_t){ __pgd_ma(x) }) +#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud_ma(__pgd_val(a)),p) +#define pgd_large(a) pud_large(__pud_ma(__pgd_val(a))) +#define pgd_none(a) pud_none(__pud_ma(__pgd_val(a))) +#endif + +static void walk_pgd_level(struct seq_file *m) +{ +#ifdef CONFIG_X86_64 + pgd_t *start = (pgd_t *) &init_level4_pgt; +#else + pgd_t *start = swapper_pg_dir; +#endif + int i; + struct pg_state st; + + memset(&st, 0, sizeof(st)); + + for (i = 0; i < PTRS_PER_PGD; i++) 
{ + st.current_address = normalize_addr(i * PGD_LEVEL_MULT); + if (!pgd_none(*start)) { + pgprotval_t prot = __pgd_val(*start) & ~PTE_MASK; + + if (pgd_large(*start) || !pgd_present(*start)) + note_page(m, &st, __pgprot(prot), 1); + else + walk_pud_level(m, &st, *start, + i * PGD_LEVEL_MULT); + } else + note_page(m, &st, __pgprot(0), 1); + + start++; + } + + /* Flush out the last page */ + st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT); + note_page(m, &st, __pgprot(0), 0); +} + +static int ptdump_show(struct seq_file *m, void *v) +{ + walk_pgd_level(m); + return 0; +} + +static int ptdump_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, ptdump_show, NULL); +} + +static const struct file_operations ptdump_fops = { + .open = ptdump_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init pt_dump_init(void) +{ + struct dentry *pe; + +#ifdef CONFIG_X86_32 + /* Not a compile-time constant on x86-32 */ + address_markers[2].start_address = VMALLOC_START; + address_markers[3].start_address = VMALLOC_END; +# ifdef CONFIG_HIGHMEM + address_markers[4].start_address = PKMAP_BASE; + address_markers[5].start_address = FIXADDR_START; + address_markers[6].start_address = hypervisor_virt_start; +# else + address_markers[4].start_address = FIXADDR_START; + address_markers[5].start_address = hypervisor_virt_start; +# endif +#endif + + pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL, + &ptdump_fops); + if (!pe) + return -ENOMEM; + + return 0; +} + +__initcall(pt_dump_init); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>"); +MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables"); --- head-2011-03-11.orig/arch/x86/mm/fault-xen.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/mm/fault-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -510,6 +510,11 @@ static int vmalloc_fault(unsigned long a unsigned long pgd_paddr; pmd_t *pmd_k; pte_t *pte_k; + + /* Make sure we are in vmalloc area */ + if (!(address >= VMALLOC_START && address < VMALLOC_END)) + return -1; + /* * Synchronize this task's top level page-table * with the 'reference' page table. @@ -670,7 +675,7 @@ void __kprobes do_page_fault(struct pt_r #ifdef CONFIG_X86_32 /* It's safe to allow irq's after cr2 has been saved and the vmalloc fault has been handled. */ - if (regs->flags & (X86_EFLAGS_IF|VM_MASK)) + if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK)) local_irq_enable(); /* @@ -1028,9 +1033,5 @@ void vmalloc_sync_all(void) if (address == start) start = address + PGDIR_SIZE; } - /* Check that there is no need to do the same for the modules area. 
*/ - BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); - BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == - (__START_KERNEL & PGDIR_MASK))); #endif } --- head-2011-03-11.orig/arch/x86/mm/highmem_32-xen.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/mm/highmem_32-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -200,6 +200,8 @@ EXPORT_SYMBOL(kmap); EXPORT_SYMBOL(kunmap); EXPORT_SYMBOL(kmap_atomic); EXPORT_SYMBOL(kunmap_atomic); +#ifdef CONFIG_HIGHPTE EXPORT_SYMBOL(kmap_atomic_to_page); +#endif EXPORT_SYMBOL(clear_highpage); EXPORT_SYMBOL(copy_highpage); --- head-2011-03-11.orig/arch/x86/mm/init_32-xen.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/mm/init_32-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -1,5 +1,4 @@ /* - * linux/arch/i386/mm/init.c * * Copyright (C) 1995 Linus Torvalds * @@ -22,6 +21,7 @@ #include <linux/init.h> #include <linux/highmem.h> #include <linux/pagemap.h> +#include <linux/pci.h> #include <linux/pfn.h> #include <linux/poison.h> #include <linux/bootmem.h> @@ -54,6 +54,8 @@ unsigned int __VMALLOC_RESERVE = 128 << 20; +unsigned long max_pfn_mapped; + DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; @@ -73,7 +75,7 @@ static pmd_t * __init one_md_table_init( if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) { pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); - paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); + paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); pud = pud_offset(pgd, 0); @@ -107,7 +109,7 @@ static pte_t * __init one_page_table_ini (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); } - paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT); + paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); make_lowmem_page_readonly(page_table, XENFEAT_writable_page_tables); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); @@ -209,8 +211,13 @@ static void __init kernel_physical_mappi /* * Map with big pages if possible, otherwise * create normal page tables: + * + * Don't use a large page for the first 2/4MB of memory + * because there are often fixed size MTRRs in there + * and overlapping MTRRs into large pages can cause + * slowdowns. */ - if (cpu_has_pse) { + if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) { unsigned int addr2; pgprot_t prot = PAGE_KERNEL_LARGE; @@ -224,6 +231,7 @@ static void __init kernel_physical_mappi set_pmd(pmd, pfn_pmd(pfn, prot)); pfn += PTRS_PER_PTE; + max_pfn_mapped = pfn; continue; } pte = one_page_table_init(pmd); @@ -241,6 +249,7 @@ static void __init kernel_physical_mappi set_pte(pte, pfn_pte(pfn, prot)); } + max_pfn_mapped = pfn; pte_ofs = 0; } pmd_idx = 0; @@ -262,6 +271,25 @@ static inline int page_kills_ppro(unsign #endif +/* + * devmem_is_allowed() checks to see if /dev/mem access to a certain address + * is valid. The argument is a physical page number. + * + * + * On x86, access has to be given to the first megabyte of ram because that area + * contains bios code and data regions used by X and dosemu and similar apps. + * Access has to be given to non-kernel-ram areas as well, these contain the PCI + * mmio resources as well as potential bios/acpi data regions. 
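+ * For example, pagenr 160 (physical address 0xA0000, in the legacy VGA window below the 1 MiB boundary at pagenr 256) is allowed, and so is any page whose local pfn lies at or beyond max_pfn, since that cannot be kernel RAM.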
+ */ +int devmem_is_allowed(unsigned long pagenr) +{ + if (pagenr <= 256) + return 1; + if (mfn_to_local_pfn(pagenr) >= max_pfn) + return 1; + return 0; +} + #ifdef CONFIG_HIGHMEM pte_t *kmap_pte; pgprot_t kmap_prot; @@ -303,47 +331,17 @@ static void __init permanent_kmaps_init( pkmap_page_table = pte; } -static void __meminit free_new_highpage(struct page *page, int pfn) -{ - init_page_count(page); - __free_page(page); - totalhigh_pages++; -} - void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) { if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { ClearPageReserved(page); - free_new_highpage(page, pfn); + init_page_count(page); + __free_page(page); + totalhigh_pages++; } else SetPageReserved(page); } -static int __meminit -add_one_highpage_hotplug(struct page *page, unsigned long pfn) -{ - free_new_highpage(page, pfn); - totalram_pages++; -#ifdef CONFIG_FLATMEM - max_mapnr = max(pfn, max_mapnr); -#endif - num_physpages++; - - return 0; -} - -/* - * Not currently handling the NUMA case. - * Assuming single node and all memory that - * has been added dynamically that would be - * onlined here is in HIGHMEM. - */ -void __meminit online_page(struct page *page) -{ - ClearPageReserved(page); - add_one_highpage_hotplug(page, page_to_pfn(page)); -} - #ifndef CONFIG_NUMA static void __init set_highmem_pages_init(int bad_ppro) { @@ -466,15 +464,13 @@ void zap_low_mappings(void) { int i; - save_pg_dir(); - /* * Zap initial low-memory mappings. * * Note that "pgd_clear()" doesn't do it for * us, because pgd_clear() is a no-op on i386. */ - for (i = 0; i < USER_PTRS_PER_PGD; i++) { + for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) { #if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN) set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); #else @@ -579,9 +575,9 @@ void __init paging_init(void) /* * Test if the WP bit works in supervisor mode. It isn't supported on 386's - * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This - * used to involve black magic jumps to work around some nasty CPU bugs, - * but fortunately the switch to using exceptions got rid of all that. + * and also on some strange 486's. All 586+'s are OK. This used to involve + * black magic jumps to work around some nasty CPU bugs, but fortunately the + * switch to using exceptions got rid of all that. */ static void __init test_wp_bit(void) { @@ -612,9 +608,7 @@ void __init mem_init(void) int tmp, bad_ppro; unsigned long pfn; -#if defined(CONFIG_SWIOTLB) - swiotlb_init(); -#endif + pci_iommu_alloc(); #ifdef CONFIG_FLATMEM BUG_ON(!mem_map); @@ -716,16 +710,8 @@ void __init mem_init(void) test_wp_bit(); cpa_init(); - - /* - * Subtle. SMP is doing it's boot stuff late (because it has to - * fork idle threads) - but it also needs low mappings for the - * protected-mode entry to work. We zap these entries only after - * the WP-bit has been tested. - */ -#ifndef CONFIG_SMP + save_pg_dir(); zap_low_mappings(); -#endif SetPagePinned(virt_to_page(init_mm.pgd)); } @@ -775,25 +761,17 @@ void mark_rodata_ro(void) unsigned long start = PFN_ALIGN(_text); unsigned long size = PFN_ALIGN(_etext) - start; -#ifndef CONFIG_KPROBES -#ifdef CONFIG_HOTPLUG_CPU - /* It must still be possible to apply SMP alternatives. 
*/ - if (num_possible_cpus() <= 1) -#endif - { - set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); - printk(KERN_INFO "Write protecting the kernel text: %luk\n", - size >> 10); + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); + printk(KERN_INFO "Write protecting the kernel text: %luk\n", + size >> 10); #ifdef CONFIG_CPA_DEBUG - printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n", - start, start+size); - set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT); + printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n", + start, start+size); + set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT); - printk(KERN_INFO "Testing CPA: write protecting again\n"); - set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); -#endif - } + printk(KERN_INFO "Testing CPA: write protecting again\n"); + set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); #endif start += size; size = (unsigned long)__end_rodata - start; --- head-2011-03-11.orig/arch/x86/mm/init_64-xen.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/mm/init_64-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -52,9 +52,6 @@ #include <xen/features.h> -const struct dma_mapping_ops *dma_ops; -EXPORT_SYMBOL(dma_ops); - #if CONFIG_XEN_COMPAT <= 0x030002 unsigned int __kernel_page_user; EXPORT_SYMBOL(__kernel_page_user); @@ -120,6 +117,28 @@ void __meminit early_make_page_readonly( BUG(); } +#ifndef CONFIG_XEN +int direct_gbpages __meminitdata +#ifdef CONFIG_DIRECT_GBPAGES + = 1 +#endif +; + +static int __init parse_direct_gbpages_off(char *arg) +{ + direct_gbpages = 0; + return 0; +} +early_param("nogbpages", parse_direct_gbpages_off); + +static int __init parse_direct_gbpages_on(char *arg) +{ + direct_gbpages = 1; + return 0; +} +early_param("gbpages", parse_direct_gbpages_on); +#endif + /* * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the * physical space so we can cache the place of the first one and move @@ -135,9 +154,6 @@ void show_mem(void) printk(KERN_INFO "Mem-info:\n"); show_free_areas(); - printk(KERN_INFO "Free swap: %6ldkB\n", - nr_swap_pages << (PAGE_SHIFT-10)); - for_each_online_pgdat(pgdat) { for (i = 0; i < pgdat->node_spanned_pages; ++i) { /* @@ -328,7 +344,7 @@ void __init cleanup_highmap(void) pmd_t *last_pmd = pmd + PTRS_PER_PMD; for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) { - if (!pmd_present(*pmd)) + if (pmd_none(*pmd)) continue; if (vaddr < (unsigned long) _text || vaddr > end) set_pmd(pmd, __pmd(0)); @@ -337,8 +353,7 @@ void __init cleanup_highmap(void) #endif /* NOTE: this is meant to be run only at boot */ -void __init -__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot) +void __init __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot) { unsigned long address = __fix_to_virt(idx); @@ -463,7 +478,7 @@ __meminit void early_iounmap(void *addr, } #endif -static void __meminit +static unsigned long __meminit phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) { int i = pmd_index(address); @@ -503,21 +518,26 @@ phys_pmd_init(pmd_t *pmd_page, unsigned set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE)); } } + return address; } -static void __meminit +static unsigned long __meminit phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) { pmd_t *pmd = pmd_offset(pud, 0); + unsigned long last_map_addr; + spin_lock(&init_mm.page_table_lock); - phys_pmd_init(pmd, address, end); + last_map_addr = phys_pmd_init(pmd, address, end); spin_unlock(&init_mm.page_table_lock); __flush_tlb_all(); + return last_map_addr; } -static 
void __meminit +static unsigned long __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) { + unsigned long last_map_addr = end; int i = pud_index(addr); for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) { @@ -529,7 +549,15 @@ phys_pud_init(pud_t *pud_page, unsigned break; if (__pud_val(*pud)) { - phys_pmd_update(pud, addr, end); + if (!pud_large(*pud)) + last_map_addr = phys_pmd_update(pud, addr, end); + continue; + } + + if (direct_gbpages) { + set_pte((pte_t *)pud, + pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); + last_map_addr = (addr & PUD_MASK) + PUD_SIZE; continue; } @@ -537,12 +565,14 @@ phys_pud_init(pud_t *pud_page, unsigned spin_lock(&init_mm.page_table_lock); *pud = __pud(pmd_phys | _KERNPG_TABLE); - phys_pmd_init(pmd, addr, end); + last_map_addr = phys_pmd_init(pmd, addr, end); spin_unlock(&init_mm.page_table_lock); early_make_page_readonly(pmd, XENFEAT_writable_page_tables); } __flush_tlb_all(); + + return last_map_addr >> PAGE_SHIFT; } void __init xen_init_pt(void) @@ -785,16 +815,138 @@ static void __init xen_finish_init_mappi table_end = start_pfn; } +static void __init init_gbpages(void) +{ +#ifndef CONFIG_XEN + if (direct_gbpages && cpu_has_gbpages) + printk(KERN_INFO "Using GB pages for direct mapping\n"); + else + direct_gbpages = 0; +#endif +} + +#ifdef CONFIG_MEMTEST_BOOTPARAM + +static void __init memtest(unsigned long start_phys, unsigned long size, + unsigned pattern) +{ + unsigned long i; + unsigned long *start; + unsigned long start_bad; + unsigned long last_bad; + unsigned long val; + unsigned long start_phys_aligned; + unsigned long count; + unsigned long incr; + + switch (pattern) { + case 0: + val = 0UL; + break; + case 1: + val = -1UL; + break; + case 2: + val = 0x5555555555555555UL; + break; + case 3: + val = 0xaaaaaaaaaaaaaaaaUL; + break; + default: + return; + } + + incr = sizeof(unsigned long); + start_phys_aligned = ALIGN(start_phys, incr); + count = (size - (start_phys_aligned - start_phys))/incr; + start = __va(start_phys_aligned); + start_bad = 0; + last_bad = 0; + + for (i = 0; i < count; i++) + start[i] = val; + for (i = 0; i < count; i++, start++, start_phys_aligned += incr) { + if (*start != val) { + if (start_phys_aligned == last_bad + incr) { + last_bad += incr; + } else { + if (start_bad) { + printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved", + val, start_bad, last_bad + incr); + reserve_early(start_bad, last_bad - start_bad, "BAD RAM"); + } + start_bad = last_bad = start_phys_aligned; + } + } + } + if (start_bad) { + printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved", + val, start_bad, last_bad + incr); + reserve_early(start_bad, last_bad - start_bad, "BAD RAM"); + } + +} + +static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE; + +static int __init parse_memtest(char *arg) +{ + if (arg) + memtest_pattern = simple_strtoul(arg, NULL, 0); + return 0; +} + +early_param("memtest", parse_memtest); + +static void __init early_memtest(unsigned long start, unsigned long end) +{ + u64 t_start, t_size; + unsigned pattern; + + if (!memtest_pattern) + return; + + printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern); + for (pattern = 0; pattern < memtest_pattern; pattern++) { + t_start = start; + t_size = 0; + while (t_start < end) { + t_start = find_e820_area_size(t_start, &t_size, 1); + + /* done ? 
*/ + if (t_start >= end) + break; + if (t_start + t_size > end) + t_size = end - t_start; + + printk(KERN_CONT "\n %016llx - %016llx pattern %d", + (unsigned long long)t_start, + (unsigned long long)t_start + t_size, pattern); + + memtest(t_start, t_size, pattern); + + t_start += t_size; + } + } + printk(KERN_CONT "\n"); +} +#else +static void __init early_memtest(unsigned long start, unsigned long end) +{ +} +#endif + /* * Setup the direct mapping of the physical memory at PAGE_OFFSET. * This runs before bootmem is initialized and gets pages directly from * the physical memory. To access them they are temporarily mapped. */ -void __init_refok init_memory_mapping(unsigned long start, unsigned long end) +unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end) { - unsigned long next; + unsigned long next, last_map_addr = end; + unsigned long start_phys = start, end_phys = end; - pr_debug("init_memory_mapping\n"); + printk(KERN_INFO "init_memory_mapping\n"); /* * Find space for the kernel direct mapping tables. @@ -803,8 +955,10 @@ void __init_refok init_memory_mapping(un * memory mapped. Unfortunately this is done currently before the * nodes are discovered. */ - if (!after_bootmem) + if (!after_bootmem) { + init_gbpages(); find_early_table_space(end); + } start = (unsigned long)__va(start); end = (unsigned long)__va(end); @@ -821,7 +975,7 @@ void __init_refok init_memory_mapping(un next = start + PGDIR_SIZE; if (next > end) next = end; - phys_pud_init(pud, __pa(start), __pa(next)); + last_map_addr = phys_pud_init(pud, __pa(start), __pa(next)); if (!after_bootmem) { early_make_page_readonly(pud, XENFEAT_writable_page_tables); set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); @@ -838,6 +992,11 @@ void __init_refok init_memory_mapping(un if (!after_bootmem) reserve_early(table_start << PAGE_SHIFT, table_end << PAGE_SHIFT, "PGTABLE"); + + if (!after_bootmem) + early_memtest(start_phys, end_phys); + + return last_map_addr; } #ifndef CONFIG_NUMA @@ -861,15 +1020,6 @@ void __init paging_init(void) /* * Memory hotplug specific functions */ -void online_page(struct page *page) -{ - ClearPageReserved(page); - init_page_count(page); - __free_page(page); - totalram_pages++; - num_physpages++; -} - #ifdef CONFIG_MEMORY_HOTPLUG /* * Memory is added always to NORMAL zone. This means you will never get @@ -879,11 +1029,13 @@ int arch_add_memory(int nid, u64 start, { struct pglist_data *pgdat = NODE_DATA(nid); struct zone *zone = pgdat->node_zones + ZONE_NORMAL; - unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - init_memory_mapping(start, start + size-1); + last_mapped_pfn = init_memory_mapping(start, start + size-1); + if (last_mapped_pfn > max_pfn_mapped) + max_pfn_mapped = last_mapped_pfn; ret = __add_pages(zone, start_pfn, nr_pages); WARN_ON(1); @@ -902,6 +1054,26 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to #endif /* CONFIG_MEMORY_HOTPLUG */ +/* + * devmem_is_allowed() checks to see if /dev/mem access to a certain address + * is valid. The argument is a physical page number. + * + * + * On x86, access has to be given to the first megabyte of ram because that area + * contains bios code and data regions used by X and dosemu and similar apps. + * Access has to be given to non-kernel-ram areas as well, these contain the PCI + * mmio resources as well as potential bios/acpi data regions. 
+ */ +int devmem_is_allowed(unsigned long pagenr) +{ + if (pagenr <= 256) + return 1; + if (mfn_to_local_pfn(pagenr) >= max_pfn) + return 1; + return 0; +} + + static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, kcore_vsyscall; @@ -1009,24 +1181,7 @@ EXPORT_SYMBOL_GPL(rodata_test_data); void mark_rodata_ro(void) { - unsigned long start = (unsigned long)_stext, end; - -#ifdef CONFIG_HOTPLUG_CPU - /* It must still be possible to apply SMP alternatives. */ - if (num_possible_cpus() > 1) - start = (unsigned long)_etext; -#endif - -#ifdef CONFIG_KPROBES - start = (unsigned long)__start_rodata; -#endif - - end = (unsigned long)__end_rodata; - start = (start + PAGE_SIZE - 1) & PAGE_MASK; - end &= PAGE_MASK; - if (end <= start) - return; - + unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata); printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); @@ -1049,6 +1204,7 @@ void mark_rodata_ro(void) set_memory_ro(start, (end-start) >> PAGE_SHIFT); #endif } + #endif #ifdef CONFIG_BLK_DEV_INITRD @@ -1061,7 +1217,7 @@ void free_initrd_mem(unsigned long start void __init reserve_bootmem_generic(unsigned long phys, unsigned len) { #ifdef CONFIG_NUMA - int nid = phys_to_nid(phys); + int nid, next_nid; #endif unsigned long pfn = phys >> PAGE_SHIFT; @@ -1070,7 +1226,7 @@ void __init reserve_bootmem_generic(unsi * This can happen with kdump kernels when accessing * firmware tables: */ - if (pfn < end_pfn_map) + if (pfn < max_pfn_mapped) return; printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", @@ -1080,10 +1236,16 @@ void __init reserve_bootmem_generic(unsi /* Should check here against the e820 map to avoid double free */ #ifdef CONFIG_NUMA - reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT); + nid = phys_to_nid(phys); + next_nid = phys_to_nid(phys + len - 1); + if (nid == next_nid) + reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT); + else + reserve_bootmem(phys, len, BOOTMEM_DEFAULT); #else reserve_bootmem(phys, len, BOOTMEM_DEFAULT); #endif + #ifndef CONFIG_XEN if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { dma_reserve += len / PAGE_SIZE; @@ -1192,6 +1354,10 @@ const char *arch_vma_name(struct vm_area /* * Initialise the sparsemem vmemmap using huge-pages at the PMD level. 
*/ +static long __meminitdata addr_start, addr_end; +static void __meminitdata *p_start, *p_end; +static int __meminitdata node_start; + int __meminit vmemmap_populate(struct page *start_page, unsigned long size, int node) { @@ -1226,12 +1392,32 @@ vmemmap_populate(struct page *start_page PAGE_KERNEL_LARGE); set_pmd(pmd, __pmd_ma(__pte_val(entry))); - printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n", - addr, addr + PMD_SIZE - 1, p, node); + /* check to see if we have contiguous blocks */ + if (p_end != p || node_start != node) { + if (p_start) + printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n", + addr_start, addr_end-1, p_start, p_end-1, node_start); + addr_start = addr; + node_start = node; + p_start = p; + } + addr_end = addr + PMD_SIZE; + p_end = p + PMD_SIZE; } else { vmemmap_verify((pte_t *)pmd, node, addr, next); } } return 0; } + +void __meminit vmemmap_populate_print_last(void) +{ + if (p_start) { + printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n", + addr_start, addr_end-1, p_start, p_end-1, node_start); + p_start = NULL; + p_end = NULL; + node_start = 0; + } +} #endif --- head-2011-03-11.orig/arch/x86/mm/ioremap-xen.c 2011-02-07 15:38:58.000000000 +0100 +++ head-2011-03-11/arch/x86/mm/ioremap-xen.c 2011-02-07 15:39:13.000000000 +0100 @@ -20,14 +20,11 @@ #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/pgalloc.h> +#include <asm/pat.h> -enum ioremap_mode { - IOR_MODE_UNCACHED, - IOR_MODE_CACHED, -}; - -#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) +#ifdef CONFIG_X86_64 +#ifndef CONFIG_XEN unsigned long __phys_addr(unsigned long x) { if (x >= __START_KERNEL_map) @@ -35,6 +32,19 @@ unsigned long __phys_addr(unsigned long return x - PAGE_OFFSET; } EXPORT_SYMBOL(__phys_addr); +#endif + +static inline int phys_addr_valid(unsigned long addr) +{ + return addr < (1UL << boot_cpu_data.x86_phys_bits); +} + +#else + +static inline int phys_addr_valid(unsigned long addr) +{ + return 1; +} #endif @@ -92,7 +102,8 @@ static int __direct_remap_pfn_range(stru * Fill in the machine address: PTE ptr is done later by * apply_to_page_range(). */ - v->val = __pte_val(pfn_pte_ma(mfn, prot)) | _PAGE_IO; + pgprot_val(prot) |= _PAGE_IO; + v->val = __pte_val(pte_mkspecial(pfn_pte_ma(mfn, prot))); mfn++; address += PAGE_SIZE; @@ -170,10 +181,9 @@ int create_lookup_pte_addr(struct mm_str EXPORT_SYMBOL(create_lookup_pte_addr); -#ifdef CONFIG_X86_32 int page_is_ram(unsigned long pagenr) { - unsigned long addr, end; + resource_size_t addr, end; int i; #ifndef CONFIG_XEN @@ -209,31 +219,51 @@ int page_is_ram(unsigned long pagenr) } return 0; } -#endif /* * Fix up the linear direct mapping of the kernel to avoid cache attribute * conflicts. 
*/ static int ioremap_change_attr(unsigned long vaddr, unsigned long size, - enum ioremap_mode mode) + unsigned long prot_val) { unsigned long nrpages = size >> PAGE_SHIFT; int err; - switch (mode) { - case IOR_MODE_UNCACHED: + switch (prot_val) { + case _PAGE_CACHE_UC: default: - err = set_memory_uc(vaddr, nrpages); + err = _set_memory_uc(vaddr, nrpages); + break; + case _PAGE_CACHE_WC: + err = _set_memory_wc(vaddr, nrpages); break; - case IOR_MODE_CACHED: - err = set_memory_wb(vaddr, nrpages); + case _PAGE_CACHE_WB: + err = _set_memory_wb(vaddr, nrpages); break; } return err; } +int ioremap_check_change_attr(unsigned long mfn, unsigned long size, + unsigned long prot_val) +{ + unsigned long sz; + int rc; + + for (sz = rc = 0; sz < size && !rc; ++mfn, sz += PAGE_SIZE) { + unsigned long pfn = mfn_to_local_pfn(mfn); + + if (pfn >= max_pfn_mapped) + continue; + rc = ioremap_change_attr((unsigned long)__va(pfn << PAGE_SHIFT), + PAGE_SIZE, prot_val); + } + + return rc; +} + /* * Remap an arbitrary physical address space into the kernel virtual * address space. Needed when the kernel wants to access high addresses @@ -243,13 +273,15 @@ static int ioremap_change_attr(unsigned * have to convert them into an offset in a page-aligned mapping, but the * caller shouldn't need to know that small detail. */ -static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, - enum ioremap_mode mode) +static void __iomem *__ioremap_caller(resource_size_t phys_addr, + unsigned long size, unsigned long prot_val, void *caller) { unsigned long offset, vaddr; phys_addr_t mfn, last_addr; struct vm_struct *area; + unsigned long new_prot_val; pgprot_t prot; + int retval; domid_t domid = DOMID_IO; /* Don't allow wraparound or zero size */ @@ -257,6 +289,13 @@ static void __iomem *__ioremap(resource_ if (!size || last_addr < phys_addr) return NULL; + if (!phys_addr_valid(phys_addr)) { + printk(KERN_WARNING "ioremap: invalid physical address %llx\n", + (unsigned long long)phys_addr); + WARN_ON_ONCE(1); + return NULL; + } + /* * Don't remap the low PCI/ISA area, it's always mapped.. 
*/ @@ -269,55 +308,86 @@ static void __iomem *__ioremap(resource_ for (mfn = PFN_DOWN(phys_addr); mfn < PFN_UP(last_addr); mfn++) { unsigned long pfn = mfn_to_local_pfn(mfn); - if (pfn >= max_pfn) - continue; + if (pfn_valid(pfn)) { + if (!PageReserved(pfn_to_page(pfn))) + return NULL; + domid = DOMID_SELF; + } + } + WARN_ON_ONCE(domid == DOMID_SELF); - domid = DOMID_SELF; + /* + * Mappings have to be page-aligned + */ + offset = phys_addr & ~PAGE_MASK; + phys_addr &= PAGE_MASK; + size = PAGE_ALIGN(last_addr+1) - phys_addr; - if (pfn >= max_pfn_mapped) /* bogus */ - continue; + retval = reserve_memtype(phys_addr, phys_addr + size, + prot_val, &new_prot_val); + if (retval) { + pr_debug("Warning: reserve_memtype returned %d\n", retval); + return NULL; + } - if (pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn))) + if (prot_val != new_prot_val) { + /* + * Do not fallback to certain memory types with certain + * requested type: + * - request is uc-, return cannot be write-back + * - request is uc-, return cannot be write-combine + * - request is write-combine, return cannot be write-back + */ + if ((prot_val == _PAGE_CACHE_UC_MINUS && + (new_prot_val == _PAGE_CACHE_WB || + new_prot_val == _PAGE_CACHE_WC)) || + (prot_val == _PAGE_CACHE_WC && + new_prot_val == _PAGE_CACHE_WB)) { + pr_debug( + "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n", + (unsigned long long)phys_addr, + (unsigned long long)(phys_addr + size), + prot_val, new_prot_val); + free_memtype(phys_addr, phys_addr + size); return NULL; + } + prot_val = new_prot_val; } - switch (mode) { - case IOR_MODE_UNCACHED: + switch (prot_val) { + case _PAGE_CACHE_UC: default: - /* - * FIXME: we will use UC MINUS for now, as video fb drivers - * depend on it. Upcoming ioremap_wc() will fix this behavior. - */ + prot = PAGE_KERNEL_NOCACHE; + break; + case _PAGE_CACHE_UC_MINUS: prot = PAGE_KERNEL_UC_MINUS; break; - case IOR_MODE_CACHED: + case _PAGE_CACHE_WC: + prot = PAGE_KERNEL_WC; + break; + case _PAGE_CACHE_WB: prot = PAGE_KERNEL; break; } /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr+1) - phys_addr; - - /* * Ok, go for it.. */ - area = get_vm_area(size, VM_IOREMAP | (mode << 20)); + area = get_vm_area_caller(size, VM_IOREMAP, caller); if (!area) return NULL; area->phys_addr = phys_addr; vaddr = (unsigned long) area->addr; if (__direct_remap_pfn_range(&init_mm, vaddr, PFN_DOWN(phys_addr), size, prot, domid)) { + free_memtype(phys_addr, phys_addr + size); free_vm_area(area); return NULL; } - if (ioremap_change_attr(vaddr, size, mode) < 0) { - iounmap((void __iomem *) vaddr); + if (ioremap_change_attr(vaddr, size, prot_val) < 0) { + free_memtype(phys_addr, phys_addr + size); + vunmap(area->addr); return NULL; } @@ -347,16 +417,72 @@ static void __iomem *__ioremap(resource_ */ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) { - return __ioremap(phys_addr, size, IOR_MODE_UNCACHED); + /* + * Ideally, this should be: + * pat_wc_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS; + * + * Till we fix all X drivers to use ioremap_wc(), we will use + * UC MINUS. 
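+ * (Unlike plain UC, UC MINUS can still be overridden to WC by an MTRR, which is what those framebuffer drivers rely on.)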
+ */ + unsigned long val = _PAGE_CACHE_UC_MINUS; + + return __ioremap_caller(phys_addr, size, val, + __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap_nocache); +/** + * ioremap_wc - map memory into CPU space write combined + * @offset: bus address of the memory + * @size: size of the resource to map + * + * This version of ioremap ensures that the memory is marked write combining. + * Write combining allows faster writes to some hardware devices. + * + * Must be freed with iounmap. + */ +void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size) +{ + if (pat_wc_enabled) + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC, + __builtin_return_address(0)); + else + return ioremap_nocache(phys_addr, size); +} +EXPORT_SYMBOL(ioremap_wc); + void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size) { - return __ioremap(phys_addr, size, IOR_MODE_CACHED); + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WB, + __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap_cache); +#ifndef CONFIG_XEN +static void __iomem *ioremap_default(resource_size_t phys_addr, + unsigned long size) +{ + unsigned long flags; + void *ret; + int err; + + /* + * - WB for WB-able memory and no other conflicting mappings + * - UC_MINUS for non-WB-able memory with no other conflicting mappings + * - Inherit from conflicting mappings otherwise + */ + err = reserve_memtype(phys_addr, phys_addr + size, -1, &flags); + if (err < 0) + return NULL; + + ret = (void *) __ioremap_caller(phys_addr, size, flags, + __builtin_return_address(0)); + + free_memtype(phys_addr, phys_addr + size); + return (void __iomem *)ret; +} +#endif + /** * iounmap - Free a IO remapping * @addr: virtual address from ioremap_* @@ -399,15 +525,7 @@ void iounmap(volatile void __iomem *addr return; } - if ((p->flags >> 20) != IOR_MODE_CACHED) { - unsigned long n = get_vm_area_size(p) >> PAGE_SHIFT; - unsigned long mfn = p->phys_addr; - unsigned long va = (unsigned long)addr; - - for (; n > 0; n--, mfn++, va += PAGE_SIZE) - if (mfn_to_local_pfn(mfn) < max_pfn) - set_memory_wb(va, 1); - } + free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p)); /* Finally remove it */ o = remove_vm_area((void *)addr); @@ -416,6 +534,37 @@ void iounmap(volatile void __iomem *addr } EXPORT_SYMBOL(iounmap); +#ifndef CONFIG_XEN +/* + * Convert a physical pointer to a virtual kernel pointer for /dev/mem + * access + */ +void *xlate_dev_mem_ptr(unsigned long phys) +{ + void *addr; + unsigned long start = phys & PAGE_MASK; + + /* If page is RAM, we can use __va. Otherwise ioremap and unmap.
*/ + if (page_is_ram(start >> PAGE_SHIFT)) + return __va(phys); + + addr = (void *)ioremap_default(start, PAGE_SIZE); + if (addr) + addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); + + return addr; +} + +void unxlate_dev_mem_ptr(unsigned long phys, void *addr) +{ + if (page_is_ram(phys >> PAGE_SHIFT)) + return; + + iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK)); + return; +} +#endif + int __initdata early_ioremap_debug; static int __init early_ioremap_debug_setup(char *str) @@ -427,8 +576,8 @@ static int __init early_ioremap_debug_se early_param("early_ioremap_debug", early_ioremap_debug_setup); static __initdata int after_paging_init; -static __initdata pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] - __attribute__((aligned(PAGE_SIZE))); +static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] + __section(.bss.page_aligned); #ifdef CONFIG_X86_32 static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) @@ -443,8 +592,8 @@ static inline pmd_t * __init early_iorem } #else #define early_ioremap_pmd early_get_pmd +#undef make_lowmem_page_readonly #define make_lowmem_page_readonly early_make_page_readonly -#define make_lowmem_page_writable make_page_writable #endif static inline pte_t * __init early_ioremap_pte(unsigned long addr) @@ -494,7 +643,7 @@ void __init early_ioremap_clear(void) pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); pmd_clear(pmd); make_lowmem_page_writable(bm_pte, XENFEAT_writable_page_tables); - /* paravirt_release_pt(__pa(bm_pte) >> PAGE_SHIFT); */ + /* paravirt_release_pte(__pa(bm_pte) >> PAGE_SHIFT); */ __flush_tlb_all(); } @@ -636,10 +785,11 @@ void __init early_iounmap(void *addr, un unsigned long offset; unsigned int nrpages; enum fixed_addresses idx; - unsigned int nesting; + int nesting; nesting = --early_ioremap_nested; - WARN_ON(nesting < 0); + if (WARN_ON(nesting < 0)) + return; if (early_ioremap_debug) { printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr, --- head-2011-03-11.orig/arch/x86/mm/pageattr-xen.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/mm/pageattr-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -9,6 +9,8 @@ #include <linux/slab.h> #include <linux/mm.h> #include <linux/interrupt.h> +#include <linux/seq_file.h> +#include <linux/debugfs.h> #include <asm/e820.h> #include <asm/processor.h> @@ -17,371 +19,7 @@ #include <asm/uaccess.h> #include <asm/pgalloc.h> #include <asm/proto.h> -#include <asm/mmu_context.h> - -#ifndef CONFIG_X86_64 -#define TASK_SIZE64 TASK_SIZE -#endif - -static void _pin_lock(struct mm_struct *mm, int lock) { - if (lock) - spin_lock(&mm->page_table_lock); -#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS - /* While mm->page_table_lock protects us against insertions and - * removals of higher level page table pages, it doesn't protect - * against updates of pte-s. Such updates, however, require the - * pte pages to be in consistent state (unpinned+writable or - * pinned+readonly). The pinning and attribute changes, however - * cannot be done atomically, which is why such updates must be - * prevented from happening concurrently. - * Note that no pte lock can ever elsewhere be acquired nesting - * with an already acquired one in the same mm, or with the mm's - * page_table_lock already acquired, as that would break in the - * non-split case (where all these are actually resolving to the - * one page_table_lock). Thus acquiring all of them here is not - * going to result in dead locks, and the order of acquires - * doesn't matter. 
- */ - { - pgd_t *pgd = mm->pgd; - unsigned g; - - for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) { - pud_t *pud; - unsigned u; - - if (pgd_none(*pgd)) - continue; - pud = pud_offset(pgd, 0); - for (u = 0; u < PTRS_PER_PUD; u++, pud++) { - pmd_t *pmd; - unsigned m; - - if (pud_none(*pud)) - continue; - pmd = pmd_offset(pud, 0); - for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { - spinlock_t *ptl; - - if (pmd_none(*pmd)) - continue; - ptl = pte_lockptr(0, pmd); - if (lock) - spin_lock(ptl); - else - spin_unlock(ptl); - } - } - } - } -#endif - if (!lock) - spin_unlock(&mm->page_table_lock); -} -#define pin_lock(mm) _pin_lock(mm, 1) -#define pin_unlock(mm) _pin_lock(mm, 0) - -#define PIN_BATCH sizeof(void *) -static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl); - -static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags, - unsigned int cpu, unsigned int seq) -{ - unsigned long pfn = page_to_pfn(page); - - if (PageHighMem(page)) { - if (pgprot_val(flags) & _PAGE_RW) - ClearPagePinned(page); - else - SetPagePinned(page); - } else { - MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq, - (unsigned long)__va(pfn << PAGE_SHIFT), - pfn_pte(pfn, flags), 0); - if (unlikely(++seq == PIN_BATCH)) { - if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu), - PIN_BATCH, NULL))) - BUG(); - seq = 0; - } - } - - return seq; -} - -static void pgd_walk(pgd_t *pgd_base, pgprot_t flags) -{ - pgd_t *pgd = pgd_base; - pud_t *pud; - pmd_t *pmd; - int g,u,m; - unsigned int cpu, seq; - multicall_entry_t *mcl; - - if (xen_feature(XENFEAT_auto_translated_physmap)) - return; - - cpu = get_cpu(); - - /* - * Cannot iterate up to USER_PTRS_PER_PGD on x86-64 as these pagetables - * may not be the 'current' task's pagetables (e.g., current may be - * 32-bit, but the pagetables may be for a 64-bit task). - * Subtracting 1 from TASK_SIZE64 means the loop limit is correct - * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE. 
- */ - for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) { - if (pgd_none(*pgd)) - continue; - pud = pud_offset(pgd, 0); - if (PTRS_PER_PUD > 1) /* not folded */ - seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq); - for (u = 0; u < PTRS_PER_PUD; u++, pud++) { - if (pud_none(*pud)) - continue; - pmd = pmd_offset(pud, 0); - if (PTRS_PER_PMD > 1) /* not folded */ - seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq); - for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { - if (pmd_none(*pmd)) - continue; - seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq); - } - } - } - - mcl = per_cpu(pb_mcl, cpu); -#ifdef CONFIG_X86_64 - if (unlikely(seq > PIN_BATCH - 2)) { - if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL))) - BUG(); - seq = 0; - } - MULTI_update_va_mapping(mcl + seq, - (unsigned long)__user_pgd(pgd_base), - pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags), - 0); - MULTI_update_va_mapping(mcl + seq + 1, - (unsigned long)pgd_base, - pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags), - UVMF_TLB_FLUSH); - if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL))) - BUG(); -#else - if (likely(seq != 0)) { - MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq, - (unsigned long)pgd_base, - pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags), - UVMF_TLB_FLUSH); - if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu), - seq + 1, NULL))) - BUG(); - } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base, - pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags), - UVMF_TLB_FLUSH)) - BUG(); -#endif - - put_cpu(); -} - -static void __pgd_pin(pgd_t *pgd) -{ - pgd_walk(pgd, PAGE_KERNEL_RO); - kmap_flush_unused(); - xen_pgd_pin(__pa(pgd)); /* kernel */ -#ifdef CONFIG_X86_64 - xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */ -#endif - SetPagePinned(virt_to_page(pgd)); -} - -static void __pgd_unpin(pgd_t *pgd) -{ - xen_pgd_unpin(__pa(pgd)); -#ifdef CONFIG_X86_64 - xen_pgd_unpin(__pa(__user_pgd(pgd))); -#endif - pgd_walk(pgd, PAGE_KERNEL); - ClearPagePinned(virt_to_page(pgd)); -} - -void pgd_test_and_unpin(pgd_t *pgd) -{ - if (PagePinned(virt_to_page(pgd))) - __pgd_unpin(pgd); -} - -void mm_pin(struct mm_struct *mm) -{ - if (xen_feature(XENFEAT_writable_page_tables)) - return; - - pin_lock(mm); - __pgd_pin(mm->pgd); - pin_unlock(mm); -} - -void mm_unpin(struct mm_struct *mm) -{ - if (xen_feature(XENFEAT_writable_page_tables)) - return; - - pin_lock(mm); - __pgd_unpin(mm->pgd); - pin_unlock(mm); -} - -void mm_pin_all(void) -{ - struct page *page; - unsigned long flags; - - if (xen_feature(XENFEAT_writable_page_tables)) - return; - - /* - * Allow uninterrupted access to the pgd_list. Also protects - * __pgd_pin() by disabling preemption. - * All other CPUs must be at a safe point (e.g., in stop_machine - * or offlined entirely). - */ - spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { - if (!PagePinned(page)) - __pgd_pin((pgd_t *)page_address(page)); - } - spin_unlock_irqrestore(&pgd_lock, flags); -} - -void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) -{ - if (!PagePinned(virt_to_page(mm->pgd))) - mm_pin(mm); -} - -void arch_exit_mmap(struct mm_struct *mm) -{ - struct task_struct *tsk = current; - - task_lock(tsk); - - /* - * We aggressively remove defunct pgd from cr3. We execute unmap_vmas() - * *much* faster this way, as no tlb flushes means bigger wrpt batches. 
- */ - if (tsk->active_mm == mm) { - tsk->active_mm = &init_mm; - atomic_inc(&init_mm.mm_count); - - switch_mm(mm, &init_mm, tsk); - - atomic_dec(&mm->mm_count); - BUG_ON(atomic_read(&mm->mm_count) == 0); - } - - task_unlock(tsk); - - if (PagePinned(virt_to_page(mm->pgd)) - && atomic_read(&mm->mm_count) == 1 - && !mm->context.has_foreign_mappings) - mm_unpin(mm); -} - -static void _pte_free(struct page *page, unsigned int order) -{ - BUG_ON(order); - __pte_free(page); -} - -pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) -{ - struct page *pte; - -#ifdef CONFIG_HIGHPTE - pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); -#else - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); -#endif - if (pte) { - pgtable_page_ctor(pte); - SetPageForeign(pte, _pte_free); - init_page_count(pte); - } - return pte; -} - -void __pte_free(pgtable_t pte) -{ - if (!PageHighMem(pte)) { - unsigned long va = (unsigned long)page_address(pte); - unsigned int level; - pte_t *ptep = lookup_address(va, &level); - - BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep)); - if (!pte_write(*ptep) - && HYPERVISOR_update_va_mapping(va, - mk_pte(pte, PAGE_KERNEL), - 0)) - BUG(); - } else -#ifdef CONFIG_HIGHPTE - ClearPagePinned(pte); -#else - BUG(); -#endif - - ClearPageForeign(pte); - init_page_count(pte); - pgtable_page_dtor(pte); - __free_page(pte); -} - -#if PAGETABLE_LEVELS >= 3 -static void _pmd_free(struct page *page, unsigned int order) -{ - BUG_ON(order); - __pmd_free(page); -} - -pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) -{ - struct page *pmd; - - pmd = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); - if (!pmd) - return NULL; - SetPageForeign(pmd, _pmd_free); - init_page_count(pmd); - return page_address(pmd); -} - -void __pmd_free(pgtable_t pmd) -{ - unsigned long va = (unsigned long)page_address(pmd); - unsigned int level; - pte_t *ptep = lookup_address(va, &level); - - BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep)); - if (!pte_write(*ptep) - && HYPERVISOR_update_va_mapping(va, mk_pte(pmd, PAGE_KERNEL), 0)) - BUG(); - - ClearPageForeign(pmd); - init_page_count(pmd); - __free_page(pmd); -} -#endif - -/* blktap and gntdev need this, as otherwise they would implicitly (and - * needlessly, as they never use it) reference init_mm. */ -pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep, int full) -{ - return ptep_get_and_clear_full(vma ? vma->vm_mm : &init_mm, - addr, ptep, full); -} -EXPORT_SYMBOL_GPL(xen_ptep_get_and_clear_full); +#include <asm/pat.h> /* * The current flushing context - we pass it instead of 5 arguments: @@ -393,6 +31,7 @@ struct cpa_data { int numpages; int flushtlb; unsigned long pfn; + unsigned force_split : 1; }; #ifdef CONFIG_X86_64 @@ -638,6 +277,9 @@ try_preserve_large_page(pte_t *kpte, uns int i, do_split = 1; unsigned int level; + if (cpa->force_split) + return 1; + spin_lock_irqsave(&pgd_lock, flags); /* * Check for races, another CPU might have split this page @@ -857,9 +499,7 @@ static int split_large_page(pte_t *kpte, goto out_unlock; pbase = (pte_t *)page_address(base); -#ifdef CONFIG_X86_32 - paravirt_alloc_pt(&init_mm, page_to_pfn(base)); -#endif + paravirt_alloc_pte(&init_mm, page_to_pfn(base)); ref_prot = pte_pgprot(pte_clrhuge(*kpte)); #ifdef CONFIG_X86_64 @@ -920,7 +560,7 @@ static int __change_page_attr(struct cpa repeat: kpte = lookup_address(address, &level); if (!kpte) - return primary ? 
-EINVAL : 0; + return 0; old_pte = *kpte; if (!__pte_val(old_pte)) { @@ -1079,7 +719,8 @@ static inline int cache_attr(pgprot_t at } static int change_page_attr_set_clr(unsigned long addr, int numpages, - pgprot_t mask_set, pgprot_t mask_clr) + pgprot_t mask_set, pgprot_t mask_clr, + int force_split) { struct cpa_data cpa; int ret, cache, checkalias; @@ -1090,7 +731,7 @@ static int change_page_attr_set_clr(unsi */ mask_set = canon_pgprot(mask_set); mask_clr = canon_pgprot(mask_clr); - if (!pgprot_val(mask_set) && !pgprot_val(mask_clr)) + if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split) return 0; /* Ensure we are PAGE_SIZE aligned */ @@ -1107,6 +748,7 @@ static int change_page_attr_set_clr(unsi cpa.mask_set = mask_set; cpa.mask_clr = mask_clr; cpa.flushtlb = 0; + cpa.force_split = force_split; /* No alias checking for _NX bit modifications */ checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; @@ -1145,26 +787,67 @@ out: static inline int change_page_attr_set(unsigned long addr, int numpages, pgprot_t mask) { - return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0)); + return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0); } static inline int change_page_attr_clear(unsigned long addr, int numpages, pgprot_t mask) { - return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask); + return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0); } -int set_memory_uc(unsigned long addr, int numpages) +int _set_memory_uc(unsigned long addr, int numpages) { + /* + * for now UC MINUS. see comments in ioremap_nocache() + */ return change_page_attr_set(addr, numpages, - __pgprot(_PAGE_PCD)); + __pgprot(_PAGE_CACHE_UC_MINUS)); +} + +int set_memory_uc(unsigned long addr, int numpages) +{ + /* + * for now UC MINUS. 
see comments in ioremap_nocache() + */ + if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, + _PAGE_CACHE_UC_MINUS, NULL)) + return -EINVAL; + + return _set_memory_uc(addr, numpages); } EXPORT_SYMBOL(set_memory_uc); -int set_memory_wb(unsigned long addr, int numpages) +int _set_memory_wc(unsigned long addr, int numpages) +{ + return change_page_attr_set(addr, numpages, + __pgprot(_PAGE_CACHE_WC)); +} + +int set_memory_wc(unsigned long addr, int numpages) +{ + if (!pat_wc_enabled) + return set_memory_uc(addr, numpages); + + if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, + _PAGE_CACHE_WC, NULL)) + return -EINVAL; + + return _set_memory_wc(addr, numpages); +} +EXPORT_SYMBOL(set_memory_wc); + +int _set_memory_wb(unsigned long addr, int numpages) { return change_page_attr_clear(addr, numpages, - __pgprot(_PAGE_PCD | _PAGE_PWT)); + __pgprot(_PAGE_CACHE_MASK)); +} + +int set_memory_wb(unsigned long addr, int numpages) +{ + free_memtype(addr, addr + numpages * PAGE_SIZE); + + return _set_memory_wb(addr, numpages); } EXPORT_SYMBOL(set_memory_wb); @@ -1195,6 +878,12 @@ int set_memory_np(unsigned long addr, in return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT)); } +int set_memory_4k(unsigned long addr, int numpages) +{ + return change_page_attr_set_clr(addr, numpages, __pgprot(0), + __pgprot(0), 1); +} + int set_pages_uc(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); @@ -1304,6 +993,45 @@ void kernel_map_pages(struct page *page, cpa_fill_pool(NULL); } +#ifdef CONFIG_DEBUG_FS +static int dpa_show(struct seq_file *m, void *v) +{ + seq_puts(m, "DEBUG_PAGEALLOC\n"); + seq_printf(m, "pool_size : %lu\n", pool_size); + seq_printf(m, "pool_pages : %lu\n", pool_pages); + seq_printf(m, "pool_low : %lu\n", pool_low); + seq_printf(m, "pool_used : %lu\n", pool_used); + seq_printf(m, "pool_failed : %lu\n", pool_failed); + + return 0; +} + +static int dpa_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, dpa_show, NULL); +} + +static const struct file_operations dpa_fops = { + .open = dpa_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init debug_pagealloc_proc_init(void) +{ + struct dentry *de; + + de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL, + &dpa_fops); + if (!de) + return -ENOMEM; + + return 0; +} +__initcall(debug_pagealloc_proc_init); +#endif + #ifdef CONFIG_HIBERNATION bool kernel_page_present(struct page *page) --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ head-2011-03-11/arch/x86/mm/pat-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -0,0 +1,602 @@ +/* + * Handle caching attributes in page tables (PAT) + * + * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> + * Suresh B Siddha <suresh.b.siddha@intel.com> + * + * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen. 
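Before the new pat-xen.c body begins, the pageattr hunks above are worth restating: the reworked set_memory_uc()/set_memory_wb() entry points now bracket every attribute change with memtype bookkeeping, reserving the range before flipping page attributes and releasing it when returning to write-back. A minimal user-space sketch of that pairing; tracker_reserve(), tracker_release() and apply_attr() are illustrative stubs standing in for reserve_memtype(), free_memtype() and change_page_attr_set_clr():

enum { WB, UC_MINUS };

static int tracker_reserve(unsigned long start, unsigned long end, int type)
{
	return 0;	/* stub: the real code refuses conflicting ranges */
}

static void tracker_release(unsigned long start, unsigned long end) { }

static int apply_attr(unsigned long addr, int numpages, int type)
{
	return 0;	/* stub: the real code rewrites page-table entries */
}

static int set_range_uc(unsigned long addr, int numpages)
{
	unsigned long end = addr + numpages * 4096UL;

	if (tracker_reserve(addr, end, UC_MINUS))
		return -1;	/* conflicting existing mapping */
	return apply_attr(addr, numpages, UC_MINUS);
}

static int set_range_wb(unsigned long addr, int numpages)
{
	tracker_release(addr, addr + numpages * 4096UL);
	return apply_attr(addr, numpages, WB);
}

int main(void)
{
	return set_range_uc(0x100000UL, 16) || set_range_wb(0x100000UL, 16);
}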
+ */ + +#include <linux/mm.h> +#include <linux/kernel.h> +#include <linux/gfp.h> +#include <linux/fs.h> +#include <linux/bootmem.h> + +#include <asm/msr.h> +#include <asm/tlbflush.h> +#include <asm/processor.h> +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/pat.h> +#include <asm/e820.h> +#include <asm/cacheflush.h> +#include <asm/fcntl.h> +#include <asm/mtrr.h> +#include <asm/io.h> + +#ifdef CONFIG_X86_PAT +int __read_mostly pat_wc_enabled = 1; + +void __cpuinit pat_disable(char *reason) +{ + pat_wc_enabled = 0; + printk(KERN_INFO "%s\n", reason); +} + +static int __init nopat(char *str) +{ + pat_disable("PAT support disabled."); + return 0; +} +early_param("nopat", nopat); +#endif + +static u64 __read_mostly boot_pat_state; + +enum { + PAT_UC = 0, /* uncached */ + PAT_WC = 1, /* Write combining */ + PAT_WT = 4, /* Write Through */ + PAT_WP = 5, /* Write Protected */ + PAT_WB = 6, /* Write Back (default) */ + PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */ +}; + +#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8)) + +void pat_init(void) +{ + u64 pat; + + if (!pat_wc_enabled) + return; + + /* Paranoia check. */ + if (!cpu_has_pat) { + printk(KERN_ERR "PAT enabled, but CPU feature cleared\n"); + /* + * Panic if this happens on the secondary CPU, and we + * switched to PAT on the boot CPU. We have no way to + * undo PAT. + */ + BUG_ON(boot_pat_state); + } + +#ifndef CONFIG_XEN + /* Set PWT to Write-Combining. All other bits stay the same */ + /* + * PTE encoding used in Linux: + * PAT + * |PCD + * ||PWT + * ||| + * 000 WB _PAGE_CACHE_WB + * 001 WC _PAGE_CACHE_WC + * 010 UC- _PAGE_CACHE_UC_MINUS + * 011 UC _PAGE_CACHE_UC + * PAT bit unused + */ + pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) | + PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC); + + /* Boot CPU check */ + if (!boot_pat_state) + rdmsrl(MSR_IA32_CR_PAT, boot_pat_state); + + wrmsrl(MSR_IA32_CR_PAT, pat); +#else + /* + * PAT settings are part of the hypervisor interface, and their + * assignment cannot be changed. + */ + rdmsrl(MSR_IA32_CR_PAT, pat); + if (!boot_pat_state) + boot_pat_state = pat; +#endif + printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n", + smp_processor_id(), boot_pat_state, pat); +} + +#undef PAT + +static char *cattr_name(unsigned long flags) +{ + switch (flags & _PAGE_CACHE_MASK) { + case _PAGE_CACHE_UC: return "uncached"; + case _PAGE_CACHE_UC_MINUS: return "uncached-minus"; + case _PAGE_CACHE_WB: return "write-back"; + case _PAGE_CACHE_WC: return "write-combining"; + case _PAGE_CACHE_WP: return "write-protected"; + case _PAGE_CACHE_WT: return "write-through"; + default: return "broken"; + } +} + +/* + * The global memtype list keeps track of memory type for specific + * physical memory areas. Conflicting memory types in different + * mappings can cause CPU cache corruption. To avoid this we keep track. + * + * The list is sorted based on starting address and can contain multiple + * entries for each address (this allows reference counting for overlapping + * areas). All the aliases have the same cache attributes of course. + * Zero attributes are represented as holes. + * + * Currently the data structure is a list because the number of mappings + * is expected to be relatively small. If this should be a problem + * it could be changed to an rbtree or similar. + * + * memtype_lock protects the whole list.
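A minimal user-space model of the list described in this comment: entries kept sorted by start address, with an insertion refused when it overlaps an entry of a different cache type. This is a deliberate simplification; the real list also supports reference-counted aliases and is guarded by memtype_lock:

#include <stdlib.h>

struct range_type {
	unsigned long long start, end;	/* covers [start, end) */
	unsigned long type;
	struct range_type *next;	/* singly linked, sorted by start */
};

static struct range_type *range_list;

/* Insert [start, end) with the given type; fail on a conflicting overlap. */
static int range_reserve(unsigned long long start, unsigned long long end,
			 unsigned long type)
{
	struct range_type **pp = &range_list, *p, *n;

	for (p = range_list; p && p->start < end; p = p->next) {
		if (p->end > start && p->type != type)
			return -1;	/* overlapping range, different type */
		if (p->start <= start)
			pp = &p->next;	/* insertion point keeps the sort */
	}
	n = malloc(sizeof(*n));
	if (!n)
		return -1;
	n->start = start;
	n->end = end;
	n->type = type;
	n->next = *pp;
	*pp = n;
	return 0;
}

int main(void)
{
	/* Reserve 0x0-0x1000 as type 0; an overlapping type-1 request fails. */
	return range_reserve(0x0, 0x1000, 0) == 0 &&
	       range_reserve(0x800, 0x1800, 1) == -1 ? 0 : 1;
}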
+ */ + +struct memtype { + u64 start; + u64 end; + unsigned long type; + struct list_head nd; +}; + +static LIST_HEAD(memtype_list); +static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ + +/* + * Intersects the PAT memory type with the MTRR memory type and returns + * the resulting memory type as PAT understands it. + * (PAT and MTRR types do not share the same numeric values.) + * The intersection is based on the "Effective Memory Type" tables in the + * IA-32 SDM vol 3a. + */ +static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot, + unsigned long *ret_prot) +{ + unsigned long pat_type; + u8 mtrr_type; + + pat_type = prot & _PAGE_CACHE_MASK; + prot &= (~_PAGE_CACHE_MASK); + + /* + * We return the PAT request directly for types where PAT takes + * precedence with respect to MTRR and for UC_MINUS. + * Consistency checks with other PAT requests are done later + * while going through the memtype list. + */ + if (pat_type == _PAGE_CACHE_WC) { + *ret_prot = prot | _PAGE_CACHE_WC; + return 0; + } else if (pat_type == _PAGE_CACHE_UC_MINUS) { + *ret_prot = prot | _PAGE_CACHE_UC_MINUS; + return 0; + } else if (pat_type == _PAGE_CACHE_UC) { + *ret_prot = prot | _PAGE_CACHE_UC; + return 0; + } + + /* + * Look for an MTRR hint to get the effective type in case where the + * PAT request is for WB. + */ + mtrr_type = mtrr_type_lookup(start, end); + + if (mtrr_type == MTRR_TYPE_UNCACHABLE) { + *ret_prot = prot | _PAGE_CACHE_UC; + } else if (mtrr_type == MTRR_TYPE_WRCOMB) { + *ret_prot = prot | _PAGE_CACHE_WC; + } else { + *ret_prot = prot | _PAGE_CACHE_WB; + } + + return 0; +} + +/* + * req_type typically has one of the following values: + * - _PAGE_CACHE_WB + * - _PAGE_CACHE_WC + * - _PAGE_CACHE_UC_MINUS + * - _PAGE_CACHE_UC + * + * req_type will have the special case value '-1' when the requester wants to + * inherit the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS. + * + * If ret_type is NULL, the function will return an error if it cannot reserve the + * region with req_type. If ret_type is non-null, the function will return the + * available type in ret_type in case of no error. In case of any error + * it will return a negative return value. + */ +int reserve_memtype(u64 start, u64 end, unsigned long req_type, + unsigned long *ret_type) +{ + struct memtype *new_entry = NULL; + struct memtype *parse; + unsigned long actual_type; + int err = 0; + + /* Only track when pat_wc_enabled */ + if (!pat_wc_enabled) { + /* This is identical to page table setting without PAT */ + if (ret_type) { + if (req_type == -1) { + *ret_type = _PAGE_CACHE_WB; + } else { + *ret_type = req_type; + } + } + return 0; + } + + /* Low ISA region is always mapped WB in page table. No need to track */ + if (start >= ISA_START_ADDRESS && (end - 1) <= ISA_END_ADDRESS) { + if (ret_type) + *ret_type = _PAGE_CACHE_WB; + + return 0; + } + + if (req_type == -1) { + /* + * Call mtrr_lookup to get the type hint. This is an + * optimization for /dev/mem mmap'ers into WB memory (BIOS + * tools and ACPI tools). Use WB request for WB memory and use + * UC_MINUS otherwise.
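The precedence rules in pat_x_mtrr_type() above reduce to a small pure function: explicit WC, UC- and UC requests win outright, and only a WB request consults the MTRR hint. A user-space restatement, with illustrative enums standing in for the _PAGE_CACHE_* and MTRR_TYPE_* constants:

#include <assert.h>

enum cache_type { WB, WC, UC_MINUS, UC };
enum mtrr_hint { MTRR_WRBACK, MTRR_WRCOMB, MTRR_UNCACHABLE };

static enum cache_type effective_type(enum cache_type pat_req,
				      enum mtrr_hint mtrr)
{
	if (pat_req != WB)
		return pat_req;		/* PAT request takes precedence */
	switch (mtrr) {			/* WB request: defer to MTRR */
	case MTRR_UNCACHABLE:
		return UC;
	case MTRR_WRCOMB:
		return WC;
	default:
		return WB;
	}
}

int main(void)
{
	assert(effective_type(WC, MTRR_UNCACHABLE) == WC);
	assert(effective_type(WB, MTRR_WRCOMB) == WC);
	assert(effective_type(WB, MTRR_WRBACK) == WB);
	return 0;
}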
+ */ + u8 mtrr_type = mtrr_type_lookup(start, end); + + if (mtrr_type == MTRR_TYPE_WRBACK) { + req_type = _PAGE_CACHE_WB; + actual_type = _PAGE_CACHE_WB; + } else { + req_type = _PAGE_CACHE_UC_MINUS; + actual_type = _PAGE_CACHE_UC_MINUS; + } + } else { + req_type &= _PAGE_CACHE_MASK; + err = pat_x_mtrr_type(start, end, req_type, &actual_type); + } + + if (err) { + if (ret_type) + *ret_type = actual_type; + + return -EINVAL; + } + + new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL); + if (!new_entry) + return -ENOMEM; + + new_entry->start = start; + new_entry->end = end; + new_entry->type = actual_type; + + if (ret_type) + *ret_type = actual_type; + + spin_lock(&memtype_lock); + + /* Search for existing mapping that overlaps the current range */ + list_for_each_entry(parse, &memtype_list, nd) { + struct memtype *saved_ptr; + + if (parse->start >= end) { + pr_debug("New Entry\n"); + list_add(&new_entry->nd, parse->nd.prev); + new_entry = NULL; + break; + } + + if (start <= parse->start && end >= parse->start) { + if (actual_type != parse->type && ret_type) { + actual_type = parse->type; + *ret_type = actual_type; + new_entry->type = actual_type; + } + + if (actual_type != parse->type) { + printk( + KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", + current->comm, current->pid, + start, end, + cattr_name(actual_type), + cattr_name(parse->type)); + err = -EBUSY; + break; + } + + saved_ptr = parse; + /* + * Check to see whether the request overlaps more + * than one entry in the list + */ + list_for_each_entry_continue(parse, &memtype_list, nd) { + if (end <= parse->start) { + break; + } + + if (actual_type != parse->type) { + printk( + KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", + current->comm, current->pid, + start, end, + cattr_name(actual_type), + cattr_name(parse->type)); + err = -EBUSY; + break; + } + } + + if (err) { + break; + } + + pr_debug("Overlap at 0x%Lx-0x%Lx\n", + saved_ptr->start, saved_ptr->end); + /* No conflict. Go ahead and add this new entry */ + list_add(&new_entry->nd, saved_ptr->nd.prev); + new_entry = NULL; + break; + } + + if (start < parse->end) { + if (actual_type != parse->type && ret_type) { + actual_type = parse->type; + *ret_type = actual_type; + new_entry->type = actual_type; + } + + if (actual_type != parse->type) { + printk( + KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", + current->comm, current->pid, + start, end, + cattr_name(actual_type), + cattr_name(parse->type)); + err = -EBUSY; + break; + } + + saved_ptr = parse; + /* + * Check to see whether the request overlaps more + * than one entry in the list + */ + list_for_each_entry_continue(parse, &memtype_list, nd) { + if (end <= parse->start) { + break; + } + + if (actual_type != parse->type) { + printk( + KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", + current->comm, current->pid, + start, end, + cattr_name(actual_type), + cattr_name(parse->type)); + err = -EBUSY; + break; + } + } + + if (err) { + break; + } + + pr_debug(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n", + saved_ptr->start, saved_ptr->end); + /* No conflict. Go ahead and add this new entry */ + list_add(&new_entry->nd, &saved_ptr->nd); + new_entry = NULL; + break; + } + } + + if (err) { + printk(KERN_INFO + "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n", + start, end, cattr_name(new_entry->type), + cattr_name(req_type)); + kfree(new_entry); + spin_unlock(&memtype_lock); + return err; + } + + if (new_entry) { + /* No conflict. Not yet added to the list. 
Add to the tail */ + list_add_tail(&new_entry->nd, &memtype_list); + pr_debug("New Entry\n"); + } + + if (ret_type) { + pr_debug( + "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", + start, end, cattr_name(actual_type), + cattr_name(req_type), cattr_name(*ret_type)); + } else { + pr_debug( + "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n", + start, end, cattr_name(actual_type), + cattr_name(req_type)); + } + + spin_unlock(&memtype_lock); + return err; +} + +int free_memtype(u64 start, u64 end) +{ + struct memtype *ml; + int err = -EINVAL; + + /* Only track when pat_wc_enabled */ + if (!pat_wc_enabled) { + return 0; + } + + /* Low ISA region is always mapped WB. No need to track */ + if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) { + return 0; + } + + spin_lock(&memtype_lock); + list_for_each_entry(ml, &memtype_list, nd) { + if (ml->start == start && ml->end == end) { + list_del(&ml->nd); + kfree(ml); + err = 0; + break; + } + } + spin_unlock(&memtype_lock); + + if (err) { + printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n", + current->comm, current->pid, start, end); + } + + pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end); + return err; +} + + +/* + * /dev/mem mmap interface. The memtype used for mapping varies: + * - Use UC for mappings with O_SYNC flag + * - Without O_SYNC flag, if there is any conflict in reserve_memtype, + * inherit the memtype from existing mapping. + * - Else use UC_MINUS memtype (for backward compatibility with existing + * X drivers. + */ +pgprot_t phys_mem_access_prot(struct file *file, unsigned long mfn, + unsigned long size, pgprot_t vma_prot) +{ + return vma_prot; +} + +#ifdef CONFIG_NONPROMISC_DEVMEM +/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/ +static inline int range_is_allowed(unsigned long mfn, unsigned long size) +{ + return 1; +} +#else +static inline int range_is_allowed(unsigned long mfn, unsigned long size) +{ + u64 from = ((u64)mfn) << PAGE_SHIFT; + u64 to = from + size; + u64 cursor = from; + + while (cursor < to) { + if (!devmem_is_allowed(mfn)) { + printk(KERN_INFO + "Program %s tried to access /dev/mem between %Lx->%Lx.\n", + current->comm, from, to); + return 0; + } + cursor += PAGE_SIZE; + mfn++; + } + return 1; +} +#endif /* CONFIG_NONPROMISC_DEVMEM */ + +int phys_mem_access_prot_allowed(struct file *file, unsigned long mfn, + unsigned long size, pgprot_t *vma_prot) +{ + u64 addr = (u64)mfn << PAGE_SHIFT; + unsigned long flags = _PAGE_CACHE_UC_MINUS; + int retval; + + if (!range_is_allowed(mfn, size)) + return 0; + + if (file->f_flags & O_SYNC) { + flags = _PAGE_CACHE_UC; + } + +#ifndef CONFIG_X86_32 +#ifndef CONFIG_XEN /* Xen sets correct MTRR type on non-RAM for us. */ + /* + * On the PPro and successors, the MTRRs are used to set + * memory types for physical addresses outside main memory, + * so blindly setting UC or PWT on those pages is wrong. + * For Pentiums and earlier, the surround logic should disable + * caching for the high addresses through the KEN pin, but + * we maintain the tradition of paranoia in this code. + */ + if (!pat_wc_enabled && + ! 
( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) || + test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) || + test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) || + test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) && + (pfn << PAGE_SHIFT) >= __pa(high_memory)) { + flags = _PAGE_CACHE_UC; + } +#endif +#endif + + /* + * With O_SYNC, we can only take a UC mapping. Fail if we cannot. + * Without O_SYNC, we want to get + * - WB for WB-able memory and no other conflicting mappings + * - UC_MINUS for non-WB-able memory with no other conflicting mappings + * - Inherit from conflicting mappings otherwise + */ + if (flags != _PAGE_CACHE_UC_MINUS) { + retval = reserve_memtype(addr, addr + size, flags, NULL); + } else { + retval = reserve_memtype(addr, addr + size, -1, &flags); + } + + if (retval < 0) + return 0; + + if (ioremap_check_change_attr(mfn, size, flags) < 0) { + free_memtype(addr, addr + size); + printk(KERN_INFO + "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n", + current->comm, current->pid, + cattr_name(flags), + addr, addr + size); + return 0; + } + + *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) | + flags); + return 1; +} + +void map_devmem(unsigned long mfn, unsigned long size, pgprot_t vma_prot) +{ + u64 addr = (u64)mfn << PAGE_SHIFT; + unsigned long flags; + unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK); + + reserve_memtype(addr, addr + size, want_flags, &flags); + if (flags != want_flags) { + printk(KERN_INFO + "%s:%d /dev/mem expected mapping type %s for %Lx-%Lx, got %s\n", + current->comm, current->pid, + cattr_name(want_flags), + addr, (unsigned long long)(addr + size), + cattr_name(flags)); + } +} + +void unmap_devmem(unsigned long mfn, unsigned long size, pgprot_t vma_prot) +{ + u64 addr = (u64)mfn << PAGE_SHIFT; + + free_memtype(addr, addr + size); +} + --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ head-2011-03-11/arch/x86/mm/pgtable-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -0,0 +1,713 @@ +#include <linux/mm.h> +#include <linux/module.h> +#include <xen/features.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <asm/tlb.h> +#include <asm/hypervisor.h> +#include <asm/mmu_context.h> + +pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) +{ + pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); + if (pte) + make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables); + return pte; +} + +static void _pte_free(struct page *page, unsigned int order) +{ + BUG_ON(order); + __pte_free(page); +} + +pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) +{ + struct page *pte; + +#ifdef CONFIG_HIGHPTE + pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); +#else + pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); +#endif + if (pte) { + pgtable_page_ctor(pte); + SetPageForeign(pte, _pte_free); + init_page_count(pte); + } + return pte; +} + +void __pte_free(pgtable_t pte) +{ + if (!PageHighMem(pte)) { + unsigned long va = (unsigned long)page_address(pte); + unsigned int level; + pte_t *ptep = lookup_address(va, &level); + + BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep)); + if (!pte_write(*ptep) + && HYPERVISOR_update_va_mapping(va, + mk_pte(pte, PAGE_KERNEL), + 0)) + BUG(); + } else +#ifdef CONFIG_HIGHPTE + ClearPagePinned(pte); +#else + BUG(); +#endif + + ClearPageForeign(pte); + init_page_count(pte); + pgtable_page_dtor(pte); + __free_page(pte); +} + +void
__pte_free_tlb(struct mmu_gather *tlb, struct page *pte) +{ + pgtable_page_dtor(pte); + paravirt_release_pte(page_to_pfn(pte)); + tlb_remove_page(tlb, pte); +} + +#if PAGETABLE_LEVELS > 2 +static void _pmd_free(struct page *page, unsigned int order) +{ + BUG_ON(order); + __pmd_free(page); +} + +pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) +{ + struct page *pmd; + + pmd = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); + if (!pmd) + return NULL; + SetPageForeign(pmd, _pmd_free); + init_page_count(pmd); + return page_address(pmd); +} + +void __pmd_free(pgtable_t pmd) +{ + unsigned long va = (unsigned long)page_address(pmd); + unsigned int level; + pte_t *ptep = lookup_address(va, &level); + + BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep)); + if (!pte_write(*ptep) + && HYPERVISOR_update_va_mapping(va, mk_pte(pmd, PAGE_KERNEL), 0)) + BUG(); + + ClearPageForeign(pmd); + init_page_count(pmd); + __free_page(pmd); +} + +void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) +{ + paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); + tlb_remove_page(tlb, virt_to_page(pmd)); +} + +#if PAGETABLE_LEVELS > 3 +void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) +{ + paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); + tlb_remove_page(tlb, virt_to_page(pud)); +} +#endif /* PAGETABLE_LEVELS > 3 */ +#endif /* PAGETABLE_LEVELS > 2 */ + +#ifndef CONFIG_X86_64 +#define TASK_SIZE64 TASK_SIZE +#endif + +static void _pin_lock(struct mm_struct *mm, int lock) { + if (lock) + spin_lock(&mm->page_table_lock); +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS + /* While mm->page_table_lock protects us against insertions and + * removals of higher level page table pages, it doesn't protect + * against updates of pte-s. Such updates, however, require the + * pte pages to be in a consistent state (unpinned+writable or + * pinned+readonly). The pinning and attribute changes, however, + * cannot be done atomically, which is why such updates must be + * prevented from happening concurrently. + * Note that no pte lock is ever acquired elsewhere nested inside + * an already acquired one in the same mm, or with the mm's + * page_table_lock already acquired, as that would break in the + * non-split case (where all these actually resolve to the + * one page_table_lock). Thus acquiring all of them here is not + * going to result in deadlocks, and the order of acquires + * doesn't matter.
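The scheme this comment describes, one coarse lock plus a set of fine-grained locks where the pin/unpin path freezes all updaters by taking every lock, can be modelled in a few lines of pthreads. A sketch under the same no-nesting assumption; the lock count is illustrative:

#include <pthread.h>

#define NLOCKS 8	/* stands in for the per-page pte locks */

static pthread_mutex_t outer = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t fine[NLOCKS];

static void locks_init(void)
{
	int i;

	for (i = 0; i < NLOCKS; i++)
		pthread_mutex_init(&fine[i], NULL);
}

/* Normal updaters take a single fine lock; the pin/unpin path freezes
 * everything by taking all of them. Because no thread ever nests two
 * fine locks, the acquisition order is irrelevant and cannot deadlock,
 * which is the property argued in the comment above. */
static void freeze_all(void)
{
	int i;

	pthread_mutex_lock(&outer);
	for (i = 0; i < NLOCKS; i++)
		pthread_mutex_lock(&fine[i]);
}

static void thaw_all(void)
{
	int i;

	for (i = 0; i < NLOCKS; i++)
		pthread_mutex_unlock(&fine[i]);
	pthread_mutex_unlock(&outer);
}

int main(void)
{
	locks_init();
	freeze_all();	/* all updaters excluded, state is frozen */
	thaw_all();
	return 0;
}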
+ */ + { + pgd_t *pgd = mm->pgd; + unsigned g; + + for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) { + pud_t *pud; + unsigned u; + + if (pgd_none(*pgd)) + continue; + pud = pud_offset(pgd, 0); + for (u = 0; u < PTRS_PER_PUD; u++, pud++) { + pmd_t *pmd; + unsigned m; + + if (pud_none(*pud)) + continue; + pmd = pmd_offset(pud, 0); + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { + spinlock_t *ptl; + + if (pmd_none(*pmd)) + continue; + ptl = pte_lockptr(0, pmd); + if (lock) + spin_lock(ptl); + else + spin_unlock(ptl); + } + } + } + } +#endif + if (!lock) + spin_unlock(&mm->page_table_lock); +} +#define pin_lock(mm) _pin_lock(mm, 1) +#define pin_unlock(mm) _pin_lock(mm, 0) + +#define PIN_BATCH sizeof(void *) +static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl); + +static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags, + unsigned int cpu, unsigned int seq) +{ + unsigned long pfn = page_to_pfn(page); + + if (PageHighMem(page)) { + if (pgprot_val(flags) & _PAGE_RW) + ClearPagePinned(page); + else + SetPagePinned(page); + } else { + MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq, + (unsigned long)__va(pfn << PAGE_SHIFT), + pfn_pte(pfn, flags), 0); + if (unlikely(++seq == PIN_BATCH)) { + if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu), + PIN_BATCH, NULL))) + BUG(); + seq = 0; + } + } + + return seq; +} + +static void pgd_walk(pgd_t *pgd_base, pgprot_t flags) +{ + pgd_t *pgd = pgd_base; + pud_t *pud; + pmd_t *pmd; + int g,u,m; + unsigned int cpu, seq; + multicall_entry_t *mcl; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + cpu = get_cpu(); + + /* + * Cannot iterate up to USER_PTRS_PER_PGD on x86-64 as these pagetables + * may not be the 'current' task's pagetables (e.g., current may be + * 32-bit, but the pagetables may be for a 64-bit task). + * Subtracting 1 from TASK_SIZE64 means the loop limit is correct + * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE. 
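pgd_walk_set_prot() above shows the batching pattern this file uses throughout: queue hypercall arguments into a fixed-size per-CPU array and submit them as one multicall when the array fills, leaving the caller to flush any tail entries. Stripped of the Xen specifics, the pattern looks like the sketch below; submit() is an illustrative stand-in for HYPERVISOR_multicall_check(), and the batch size is arbitrary:

#define BATCH 8

struct entry { unsigned long va, pte; };

static struct entry batch[BATCH];

static void submit(struct entry *e, unsigned int n)
{
	/* stand-in for one hypercall covering n queued updates */
}

static unsigned int queue_update(unsigned int seq,
				 unsigned long va, unsigned long pte)
{
	batch[seq].va = va;
	batch[seq].pte = pte;
	if (++seq == BATCH) {	/* batch full: flush in one call */
		submit(batch, BATCH);
		seq = 0;
	}
	return seq;
}

int main(void)
{
	unsigned int seq = 0, i;

	for (i = 0; i < 19; i++)	/* two full batches plus a tail */
		seq = queue_update(seq, i * 4096UL, i);
	if (seq)			/* caller flushes the remainder */
		submit(batch, seq);
	return 0;
}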
+ */ + for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) { + if (pgd_none(*pgd)) + continue; + pud = pud_offset(pgd, 0); + if (PTRS_PER_PUD > 1) /* not folded */ + seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq); + for (u = 0; u < PTRS_PER_PUD; u++, pud++) { + if (pud_none(*pud)) + continue; + pmd = pmd_offset(pud, 0); + if (PTRS_PER_PMD > 1) /* not folded */ + seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq); + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { + if (pmd_none(*pmd)) + continue; + seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq); + } + } + } + + mcl = per_cpu(pb_mcl, cpu); +#ifdef CONFIG_X86_64 + if (unlikely(seq > PIN_BATCH - 2)) { + if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL))) + BUG(); + seq = 0; + } + MULTI_update_va_mapping(mcl + seq, + (unsigned long)__user_pgd(pgd_base), + pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags), + 0); + MULTI_update_va_mapping(mcl + seq + 1, + (unsigned long)pgd_base, + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags), + UVMF_TLB_FLUSH); + if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL))) + BUG(); +#else + if (likely(seq != 0)) { + MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq, + (unsigned long)pgd_base, + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags), + UVMF_TLB_FLUSH); + if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu), + seq + 1, NULL))) + BUG(); + } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base, + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags), + UVMF_TLB_FLUSH)) + BUG(); +#endif + + put_cpu(); +} + +static void __pgd_pin(pgd_t *pgd) +{ + pgd_walk(pgd, PAGE_KERNEL_RO); + kmap_flush_unused(); + xen_pgd_pin(__pa(pgd)); /* kernel */ +#ifdef CONFIG_X86_64 + xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */ +#endif + SetPagePinned(virt_to_page(pgd)); +} + +static void __pgd_unpin(pgd_t *pgd) +{ + xen_pgd_unpin(__pa(pgd)); +#ifdef CONFIG_X86_64 + xen_pgd_unpin(__pa(__user_pgd(pgd))); +#endif + pgd_walk(pgd, PAGE_KERNEL); + ClearPagePinned(virt_to_page(pgd)); +} + +static void pgd_test_and_unpin(pgd_t *pgd) +{ + if (PagePinned(virt_to_page(pgd))) + __pgd_unpin(pgd); +} + +void mm_pin(struct mm_struct *mm) +{ + if (xen_feature(XENFEAT_writable_page_tables)) + return; + + pin_lock(mm); + __pgd_pin(mm->pgd); + pin_unlock(mm); +} + +void mm_unpin(struct mm_struct *mm) +{ + if (xen_feature(XENFEAT_writable_page_tables)) + return; + + pin_lock(mm); + __pgd_unpin(mm->pgd); + pin_unlock(mm); +} + +void mm_pin_all(void) +{ + struct page *page; + unsigned long flags; + + if (xen_feature(XENFEAT_writable_page_tables)) + return; + + /* + * Allow uninterrupted access to the pgd_list. Also protects + * __pgd_pin() by disabling preemption. + * All other CPUs must be at a safe point (e.g., in stop_machine + * or offlined entirely). + */ + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + if (!PagePinned(page)) + __pgd_pin((pgd_t *)page_address(page)); + } + spin_unlock_irqrestore(&pgd_lock, flags); +} + +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) +{ + if (!PagePinned(virt_to_page(mm->pgd))) + mm_pin(mm); +} + +void arch_exit_mmap(struct mm_struct *mm) +{ + struct task_struct *tsk = current; + + task_lock(tsk); + + /* + * We aggressively remove defunct pgd from cr3. We execute unmap_vmas() + * *much* faster this way, as no tlb flushes means bigger wrpt batches. 
+ */ + if (tsk->active_mm == mm) { + tsk->active_mm = &init_mm; + atomic_inc(&init_mm.mm_count); + + switch_mm(mm, &init_mm, tsk); + + atomic_dec(&mm->mm_count); + BUG_ON(atomic_read(&mm->mm_count) == 0); + } + + task_unlock(tsk); + + if (PagePinned(virt_to_page(mm->pgd)) + && atomic_read(&mm->mm_count) == 1 + && !mm->context.has_foreign_mappings) + mm_unpin(mm); +} + +static inline void pgd_list_add(pgd_t *pgd) +{ + struct page *page = virt_to_page(pgd); + + list_add(&page->lru, &pgd_list); +} + +static inline void pgd_list_del(pgd_t *pgd) +{ + struct page *page = virt_to_page(pgd); + + list_del(&page->lru); +} + +#define UNSHARED_PTRS_PER_PGD \ + (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) + +static void pgd_ctor(void *p) +{ + pgd_t *pgd = p; + unsigned long flags; + + pgd_test_and_unpin(pgd); + + /* Clear usermode parts of PGD */ + memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t)); + + spin_lock_irqsave(&pgd_lock, flags); + + /* If the pgd points to a shared pagetable level (either the + ptes in non-PAE, or shared PMD in PAE), then just copy the + references from swapper_pg_dir. */ + if (PAGETABLE_LEVELS == 2 || + (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || + PAGETABLE_LEVELS == 4) { + clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); + paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT, + __pa(swapper_pg_dir) >> PAGE_SHIFT, + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); + } + +#ifdef CONFIG_X86_64 + /* set level3_user_pgt for vsyscall area */ + __user_pgd(pgd)[pgd_index(VSYSCALL_START)] = + __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE); +#endif + +#ifndef CONFIG_X86_PAE + /* list required to sync kernel mapping updates */ + if (!SHARED_KERNEL_PMD) + pgd_list_add(pgd); +#endif + + spin_unlock_irqrestore(&pgd_lock, flags); +} + +static void pgd_dtor(void *pgd) +{ + unsigned long flags; /* can be called from interrupt context */ + + if (!SHARED_KERNEL_PMD) { + spin_lock_irqsave(&pgd_lock, flags); + pgd_list_del(pgd); + spin_unlock_irqrestore(&pgd_lock, flags); + } + + pgd_test_and_unpin(pgd); +} + +/* + * List of all pgd's needed for non-PAE so it can invalidate entries + * in both cached and uncached pgd's; not needed for PAE since the + * kernel pmd is shared. If PAE were not to share the pmd a similar + * tactic would be needed. This is essentially codepath-based locking + * against pageattr.c; it is the unique case in which a valid change + * of kernel pagetables can't be lazily synchronized by vmalloc faults. + * vmalloc faults work because attached pagetables are never freed. + * -- wli + */ + +#ifdef CONFIG_X86_PAE +/* + * Mop up any pmd pages which may still be attached to the pgd. + * Normally they will be freed by munmap/exit_mmap, but any pmd we + * preallocate which never got a corresponding vma will need to be + * freed manually. + */ +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) +{ + int i; + + for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) { + pgd_t pgd = pgdp[i]; + + if (__pgd_val(pgd) != 0) { + pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); + + pgdp[i] = xen_make_pgd(0); + + paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); + pmd_free(mm, pmd); + } + } + + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) + xen_destroy_contiguous_region((unsigned long)pgdp, 0); +} + +/* + * In PAE mode, we need to do a cr3 reload (=tlb flush) when + * updating the top-level pagetable entries to guarantee the + * processor notices the update. 
Since this is expensive, and + * all 4 top-level entries are used almost immediately in a + * new process's life, we just pre-populate them here. + * + * Also, if we're in a paravirt environment where the kernel pmd is + * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate + * and initialize the kernel pmds here. + */ +static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) +{ + pud_t *pud; + pmd_t *pmds[UNSHARED_PTRS_PER_PGD]; + unsigned long addr, flags; + int i; + + /* + * We can race save/restore (if we sleep during a GFP_KERNEL memory + * allocation). We therefore store virtual addresses of pmds as they + * do not change across save/restore, and poke the machine addresses + * into the pgdir under the pgd_lock. + */ + for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) { + pmds[i] = pmd_alloc_one(mm, addr); + if (!pmds[i]) + goto out_oom; + } + + spin_lock_irqsave(&pgd_lock, flags); + + /* Protect against save/restore: move below 4GB under pgd_lock. */ + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb) + && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) { + spin_unlock_irqrestore(&pgd_lock, flags); +out_oom: + while (i--) + pmd_free(mm, pmds[i]); + return 0; + } + + /* Copy kernel pmd contents and write-protect the new pmds. */ + pud = pud_offset(pgd, 0); + for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; + i++, pud++, addr += PUD_SIZE) { + if (i >= KERNEL_PGD_BOUNDARY) { + memcpy(pmds[i], + (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), + sizeof(pmd_t) * PTRS_PER_PMD); + make_lowmem_page_readonly( + pmds[i], XENFEAT_writable_page_tables); + } + + /* It is safe to poke machine addresses of pmds under the pgd_lock. */ + pud_populate(mm, pud, pmds[i]); + } + + /* List required to sync kernel mapping updates and + * to pin/unpin on save/restore. */ + pgd_list_add(pgd); + + spin_unlock_irqrestore(&pgd_lock, flags); + + return 1; +} + +void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) +{ + struct page *page = virt_to_page(pmd); + unsigned long pfn = page_to_pfn(page); + + paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); + + /* Note: almost everything apart from _PAGE_PRESENT is + reserved at the pmd (PDPT) level. */ + if (PagePinned(virt_to_page(mm->pgd))) { + BUG_ON(PageHighMem(page)); + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + pfn_pte(pfn, PAGE_KERNEL_RO), 0)); + set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); + } else + *pudp = __pud(__pa(pmd) | _PAGE_PRESENT); + + /* + * According to Intel App note "TLBs, Paging-Structure Caches, + * and Their Invalidation", April 2007, document 317080-001, + * section 8.1: in PAE mode we explicitly have to flush the + * TLB via cr3 if the top-level pgd is changed... + */ + if (mm == current->active_mm) + xen_tlb_flush(); +} +#else /* !CONFIG_X86_PAE */ +/* No need to prepopulate any pagetable entries in non-PAE modes. */ +static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) +{ + return 1; +} + +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd) +{ +} +#endif /* CONFIG_X86_PAE */ + +#ifdef CONFIG_X86_64 +/* We allocate two contiguous pages for kernel and user. */ +#define PGD_ORDER 1 +#else +#define PGD_ORDER 0 +#endif + +pgd_t *pgd_alloc(struct mm_struct *mm) +{ + pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER); + + /* so that alloc_pd can use it */ + mm->pgd = pgd; + if (pgd) { + /* Store a back link for vmalloc_sync_all(). 
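The back link mentioned here is the pgd page's otherwise-unused private field doubling as an owner pointer, so a later walker such as vmalloc_sync_all() can get from a pgd page back to its mm. A short kernel-style sketch of the round trip, assuming the usual set_page_private()/page_private() accessors; the surrounding helpers are illustrative:

static void remember_owner(pgd_t *pgd, struct mm_struct *mm)
{
	/* stash the owning mm in the pgd page's private field */
	set_page_private(virt_to_page(pgd), (unsigned long)mm);
}

static struct mm_struct *owner_of(struct page *pgd_page)
{
	/* recover it when walking the pgd_list later */
	return (struct mm_struct *)page_private(pgd_page);
}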
*/ + set_page_private(virt_to_page(pgd), (unsigned long)mm); + pgd_ctor(pgd); + } + + if (pgd && !pgd_prepopulate_pmd(mm, pgd)) { + free_pages((unsigned long)pgd, PGD_ORDER); + pgd = NULL; + } + + return pgd; +} + +void pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ + /* + * After this the pgd should not be pinned for the duration of this + * function's execution. We should never sleep and thus never race: + * 1. User pmds will not become write-protected under our feet due + * to a concurrent mm_pin_all(). + * 2. The machine addresses in PGD entries will not become invalid + * due to a concurrent save/restore. + */ + pgd_dtor(pgd); + + pgd_mop_up_pmds(mm, pgd); + free_pages((unsigned long)pgd, PGD_ORDER); +} + +/* blktap and gntdev need this, as otherwise they would implicitly (and + * needlessly, as they never use it) reference init_mm. */ +pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, int full) +{ + return ptep_get_and_clear_full(vma ? vma->vm_mm : &init_mm, + addr, ptep, full); +} +EXPORT_SYMBOL_GPL(xen_ptep_get_and_clear_full); + +int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) +{ + int changed = !pte_same(*ptep, entry); + + if (changed && dirty) { + if (likely(vma->vm_mm == current->mm)) { + if (HYPERVISOR_update_va_mapping(address, + entry, + uvm_multi(vma->vm_mm->cpu_vm_mask) | + UVMF_INVLPG)) + BUG(); + } else { + xen_l1_entry_update(ptep, entry); + flush_tlb_page(vma, address); + } + } + + return changed; +} + +int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + int ret = 0; + + if (pte_young(*ptep)) + ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, + &ptep->pte); + + if (ret) + pte_update(vma->vm_mm, addr, ptep); + + return ret; +} + +int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + pte_t pte = *ptep; + int young = pte_young(pte); + + pte = pte_mkold(pte); + if (PagePinned(virt_to_page(vma->vm_mm->pgd))) + ptep_set_access_flags(vma, address, ptep, pte, young); + else if (young) + ptep->pte_low = pte.pte_low; + + return young; +} --- head-2011-03-11.orig/arch/x86/mm/pgtable_32-xen.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/mm/pgtable_32-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -1,7 +1,3 @@ -/* - * linux/arch/i386/mm/pgtable.c - */ - #include <linux/sched.h> #include <linux/kernel.h> #include <linux/errno.h> @@ -41,7 +37,6 @@ void show_mem(void) printk(KERN_INFO "Mem-info:\n"); show_free_areas(); - printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_online_pgdat(pgdat) { pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; ++i) { @@ -157,246 +152,6 @@ void __init reserve_top_address(unsigned __VMALLOC_RESERVE += reserve; } -pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) -{ - pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); - if (pte) - make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables); - return pte; -} - -/* - * List of all pgd's needed for non-PAE so it can invalidate entries - * in both cached and uncached pgd's; not needed for PAE since the - * kernel pmd is shared. If PAE were not to share the pmd a similar - * tactic would be needed. This is essentially codepath-based locking - * against pageattr.c; it is the unique case in which a valid change - * of kernel pagetables can't be lazily synchronized by vmalloc faults. 
- * vmalloc faults work because attached pagetables are never freed. - * -- wli - */ -static inline void pgd_list_add(pgd_t *pgd) -{ - struct page *page = virt_to_page(pgd); - - list_add(&page->lru, &pgd_list); -} - -static inline void pgd_list_del(pgd_t *pgd) -{ - struct page *page = virt_to_page(pgd); - - list_del(&page->lru); -} - -#define UNSHARED_PTRS_PER_PGD \ - (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD) - -static void pgd_ctor(void *p) -{ - pgd_t *pgd = p; - unsigned long flags; - - pgd_test_and_unpin(pgd); - - /* Clear usermode parts of PGD */ - memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); - - spin_lock_irqsave(&pgd_lock, flags); - - /* If the pgd points to a shared pagetable level (either the - ptes in non-PAE, or shared PMD in PAE), then just copy the - references from swapper_pg_dir. */ - if (PAGETABLE_LEVELS == 2 || - (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) { - clone_pgd_range(pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - KERNEL_PGD_PTRS); - paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, - __pa(swapper_pg_dir) >> PAGE_SHIFT, - USER_PTRS_PER_PGD, - KERNEL_PGD_PTRS); - } - - /* list required to sync kernel mapping updates */ - if (PAGETABLE_LEVELS == 2) - pgd_list_add(pgd); - - spin_unlock_irqrestore(&pgd_lock, flags); -} - -static void pgd_dtor(void *pgd) -{ - unsigned long flags; /* can be called from interrupt context */ - - if (!SHARED_KERNEL_PMD) { - spin_lock_irqsave(&pgd_lock, flags); - pgd_list_del(pgd); - spin_unlock_irqrestore(&pgd_lock, flags); - } - - pgd_test_and_unpin(pgd); -} - -#ifdef CONFIG_X86_PAE -/* - * Mop up any pmd pages which may still be attached to the pgd. - * Normally they will be freed by munmap/exit_mmap, but any pmd we - * preallocate which never got a corresponding vma will need to be - * freed manually. - */ -static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) -{ - int i; - - for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) { - pgd_t pgd = pgdp[i]; - - if (__pgd_val(pgd) != 0) { - pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); - - pgdp[i] = xen_make_pgd(0); - - paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT); - pmd_free(mm, pmd); - } - } -} - -/* - * In PAE mode, we need to do a cr3 reload (=tlb flush) when - * updating the top-level pagetable entries to guarantee the - * processor notices the update. Since this is expensive, and - * all 4 top-level entries are used almost immediately in a - * new process's life, we just pre-populate them here. - * - * Also, if we're in a paravirt environment where the kernel pmd is - * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate - * and initialize the kernel pmds here. - */ -static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) -{ - pud_t *pud; - pmd_t *pmds[UNSHARED_PTRS_PER_PGD]; - unsigned long addr, flags; - int i; - - /* - * We can race save/restore (if we sleep during a GFP_KERNEL memory - * allocation). We therefore store virtual addresses of pmds as they - * do not change across save/restore, and poke the machine addresses - * into the pgdir under the pgd_lock. - */ - for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) { - pmds[i] = pmd_alloc_one(mm, addr); - if (!pmds[i]) - goto out_oom; - } - - spin_lock_irqsave(&pgd_lock, flags); - - /* Protect against save/restore: move below 4GB under pgd_lock. 
*/ - if (!xen_feature(XENFEAT_pae_pgdir_above_4gb) - && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) { - spin_unlock_irqrestore(&pgd_lock, flags); -out_oom: - while (i--) - pmd_free(mm, pmds[i]); - return 0; - } - - /* Copy kernel pmd contents and write-protect the new pmds. */ - pud = pud_offset(pgd, 0); - for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; - i++, pud++, addr += PUD_SIZE) { - if (i >= USER_PTRS_PER_PGD) { - memcpy(pmds[i], - (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), - sizeof(pmd_t) * PTRS_PER_PMD); - make_lowmem_page_readonly( - pmds[i], XENFEAT_writable_page_tables); - } - - /* It is safe to poke machine addresses of pmds under the pgd_lock. */ - pud_populate(mm, pud, pmds[i]); - } - - /* List required to sync kernel mapping updates and - * to pin/unpin on save/restore. */ - pgd_list_add(pgd); - - spin_unlock_irqrestore(&pgd_lock, flags); - - return 1; -} -#else /* !CONFIG_X86_PAE */ -/* No need to prepopulate any pagetable entries in non-PAE modes. */ -static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) -{ - return 1; -} - -static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) -{ -} -#endif /* CONFIG_X86_PAE */ - -pgd_t *pgd_alloc(struct mm_struct *mm) -{ - pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); - - /* so that alloc_pd can use it */ - mm->pgd = pgd; - if (pgd) { - /* Store a back link for vmalloc_sync_all(). */ - set_page_private(virt_to_page(pgd), (unsigned long)mm); - pgd_ctor(pgd); - } - - if (pgd && !pgd_prepopulate_pmd(mm, pgd)) { - free_page((unsigned long)pgd); - pgd = NULL; - } - - return pgd; -} - -void pgd_free(struct mm_struct *mm, pgd_t *pgd) -{ - /* - * After this the pgd should not be pinned for the duration of this - * function's execution. We should never sleep and thus never race: - * 1. User pmds will not become write-protected under our feet due - * to a concurrent mm_pin_all(). - * 2. The machine addresses in PGD entries will not become invalid - * due to a concurrent save/restore. 
- */ - pgd_dtor(pgd); - - if (PTRS_PER_PMD > 1 && !xen_feature(XENFEAT_pae_pgdir_above_4gb)) - xen_destroy_contiguous_region((unsigned long)pgd, 0); - - pgd_mop_up_pmds(mm, pgd); - free_page((unsigned long)pgd); -} - -void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) -{ - pgtable_page_dtor(pte); - paravirt_release_pt(page_to_pfn(pte)); - tlb_remove_page(tlb, pte); -} - -#ifdef CONFIG_X86_PAE - -void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) -{ - paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); - tlb_remove_page(tlb, virt_to_page(pmd)); -} - -#endif - void make_lowmem_page_readonly(void *va, unsigned int feature) { pte_t *pte; --- head-2011-03-11.orig/arch/x86/pci/irq-xen.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/pci/irq-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -140,9 +140,11 @@ static void __init pirq_peer_trick(void) busmap[e->bus] = 1; } for(i = 1; i < 256; i++) { + int node; if (!busmap[i] || pci_find_bus(0, i)) continue; - if (pci_scan_bus_with_sysdata(i)) + node = get_mp_bus_to_node(i); + if (pci_scan_bus_on_node(i, &pci_root_ops, node)) printk(KERN_INFO "PCI: Discovered primary peer " "bus %02x [IRQ]\n", i); } @@ -204,7 +206,7 @@ static int pirq_ali_get(struct pci_dev * { static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 }; - WARN_ON_ONCE(pirq >= 16); + WARN_ON_ONCE(pirq > 16); return irqmap[read_config_nybble(router, 0x48, pirq-1)]; } @@ -213,7 +215,7 @@ static int pirq_ali_set(struct pci_dev * static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 }; unsigned int val = irqmap[irq]; - WARN_ON_ONCE(pirq >= 16); + WARN_ON_ONCE(pirq > 16); if (val) { write_config_nybble(router, 0x48, pirq-1, val); return 1; @@ -264,7 +266,7 @@ static int pirq_via586_get(struct pci_de { static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 }; - WARN_ON_ONCE(pirq >= 5); + WARN_ON_ONCE(pirq > 5); return read_config_nybble(router, 0x55, pirqmap[pirq-1]); } @@ -272,7 +274,7 @@ static int pirq_via586_set(struct pci_de { static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 }; - WARN_ON_ONCE(pirq >= 5); + WARN_ON_ONCE(pirq > 5); write_config_nybble(router, 0x55, pirqmap[pirq-1], irq); return 1; } @@ -286,7 +288,7 @@ static int pirq_ite_get(struct pci_dev * { static const unsigned char pirqmap[4] = { 1, 0, 2, 3 }; - WARN_ON_ONCE(pirq >= 4); + WARN_ON_ONCE(pirq > 4); return read_config_nybble(router,0x43, pirqmap[pirq-1]); } @@ -294,7 +296,7 @@ static int pirq_ite_set(struct pci_dev * { static const unsigned char pirqmap[4] = { 1, 0, 2, 3 }; - WARN_ON_ONCE(pirq >= 4); + WARN_ON_ONCE(pirq > 4); write_config_nybble(router, 0x43, pirqmap[pirq-1], irq); return 1; } @@ -623,6 +625,13 @@ static __init int via_router_probe(struc */ device = PCI_DEVICE_ID_VIA_8235; break; + case PCI_DEVICE_ID_VIA_8237: + /** + * Asus a7v600 bios wrongly reports 8237 + * as 586-compatible + */ + device = PCI_DEVICE_ID_VIA_8237; + break; } } --- head-2011-03-11.orig/arch/x86/vdso/vdso32-setup-xen.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/vdso/vdso32-setup-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -164,7 +164,7 @@ static __init void relocate_vdso(Elf32_E Elf32_Shdr *shdr; int i; - BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 || + BUG_ON(memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0 || !elf_check_arch_ia32(ehdr) || ehdr->e_type != ET_DYN); @@ -233,8 +233,12 @@ void syscall32_cpu_init(void) BUG(); #endif - if (use_sysenter < 0) - use_sysenter = (boot_cpu_data.x86_vendor == 
X86_VENDOR_INTEL); + if (use_sysenter < 0) { + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + use_sysenter = 1; + if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR) + use_sysenter = 1; + } } #define compat_uses_vma 1 @@ -337,8 +341,6 @@ int __init sysenter_setup(void) #ifdef CONFIG_X86_32 gate_vma_init(); - - printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO)); #endif #if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT < 0x030200 @@ -383,6 +385,9 @@ int arch_setup_additional_pages(struct l int ret = 0; bool compat; + if (vdso_enabled == VDSO_DISABLED) + return 0; + down_write(&mm->mmap_sem); /* Test compat mode once here, in case someone --- head-2011-03-11.orig/drivers/acpi/processor_driver.c 2011-01-31 14:53:38.000000000 +0100 +++ head-2011-03-11/drivers/acpi/processor_driver.c 2011-01-31 18:07:35.000000000 +0100 @@ -371,7 +371,7 @@ static int acpi_processor_get_info(struc * of /proc/cpuinfo */ status = acpi_evaluate_object(pr->handle, "_SUN", NULL, &buffer); - if (ACPI_SUCCESS(status)) + if (ACPI_SUCCESS(status) && pr->id != -1) arch_fix_phys_package_id(pr->id, object.integer.value); return 0; --- head-2011-03-11.orig/drivers/firmware/Kconfig 2011-03-11 10:41:54.000000000 +0100 +++ head-2011-03-11/drivers/firmware/Kconfig 2011-01-31 18:07:35.000000000 +0100 @@ -115,7 +115,7 @@ config DMIID config ISCSI_IBFT_FIND bool "iSCSI Boot Firmware Table Attributes" - depends on X86 + depends on X86 && !XEN_UNPRIVILEGED_GUEST default n help This option enables the kernel to find the region of memory --- head-2011-03-11.orig/drivers/input/xen-kbdfront.c 2011-03-11 10:41:54.000000000 +0100 +++ head-2011-03-11/drivers/input/xen-kbdfront.c 2011-01-31 18:07:35.000000000 +0100 @@ -331,7 +331,6 @@ static const struct xenbus_device_id xen static struct xenbus_driver xenkbd_driver = { .name = "vkbd", - .owner = THIS_MODULE, .ids = xenkbd_ids, .probe = xenkbd_probe, .remove = xenkbd_remove, --- head-2011-03-11.orig/drivers/oprofile/cpu_buffer.c 2011-01-31 17:56:27.000000000 +0100 +++ head-2011-03-11/drivers/oprofile/cpu_buffer.c 2011-01-31 18:07:35.000000000 +0100 @@ -475,7 +475,7 @@ fail: #ifdef CONFIG_XEN int oprofile_add_domain_switch(int32_t domain_id) { - struct oprofile_cpu_buffer * cpu_buf = &cpu_buffer[smp_processor_id()]; + struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer); /* should have space for switching into and out of domain (2 slots each) plus one sample and one cpu mode switch */ --- head-2011-03-11.orig/drivers/pci/msi-xen.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/drivers/pci/msi-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -504,7 +504,7 @@ int pci_enable_msi(struct pci_dev* dev) EXPORT_SYMBOL(pci_enable_msi); extern void pci_frontend_disable_msi(struct pci_dev* dev); -void pci_disable_msi(struct pci_dev* dev) +void pci_msi_shutdown(struct pci_dev* dev) { int pirq; struct msi_dev_list *msi_dev_entry = get_msi_dev_pirq_list(dev); @@ -532,6 +532,10 @@ void pci_disable_msi(struct pci_dev* dev pci_intx_for_msi(dev, 1); dev->msi_enabled = 0; } +void pci_disable_msi(struct pci_dev* dev) +{ + pci_msi_shutdown(dev); +} EXPORT_SYMBOL(pci_disable_msi); /** @@ -636,7 +640,7 @@ int pci_enable_msix(struct pci_dev* dev, EXPORT_SYMBOL(pci_enable_msix); extern void pci_frontend_disable_msix(struct pci_dev* dev); -void pci_disable_msix(struct pci_dev* dev) +void pci_msix_shutdown(struct pci_dev* dev) { if (!pci_msi_enable || !dev || !dev->msix_enabled) return; @@ -669,6 +673,10 @@ void pci_disable_msix(struct pci_dev* de pci_intx_for_msi(dev, 1); 
dev->msix_enabled = 0; } +void pci_disable_msix(struct pci_dev* dev) +{ + pci_msix_shutdown(dev); +} EXPORT_SYMBOL(pci_disable_msix); /** --- head-2011-03-11.orig/drivers/video/Kconfig 2011-03-11 10:41:54.000000000 +0100 +++ head-2011-03-11/drivers/video/Kconfig 2011-01-31 18:07:35.000000000 +0100 @@ -2253,7 +2253,7 @@ config FB_VIRTUAL config XEN_FBDEV_FRONTEND tristate "Xen virtual frame buffer support" - depends on FB && XEN + depends on FB && PARAVIRT_XEN select FB_SYS_FILLRECT select FB_SYS_COPYAREA select FB_SYS_IMAGEBLIT --- head-2011-03-11.orig/drivers/video/xen-fbfront.c 2011-03-11 10:41:54.000000000 +0100 +++ head-2011-03-11/drivers/video/xen-fbfront.c 2011-01-31 18:07:35.000000000 +0100 @@ -679,7 +679,6 @@ static struct xenbus_device_id xenfb_ids static struct xenbus_driver xenfb_driver = { .name = "vfb", - .owner = THIS_MODULE, .ids = xenfb_ids, .probe = xenfb_probe, .remove = xenfb_remove, --- head-2011-03-11.orig/drivers/xen/Kconfig 2011-01-31 17:49:31.000000000 +0100 +++ head-2011-03-11/drivers/xen/Kconfig 2011-01-31 18:07:35.000000000 +0100 @@ -2,8 +2,6 @@ # This Kconfig describe xen options # -mainmenu "Xen Configuration" - config XEN bool --- head-2011-03-11.orig/drivers/xen/Makefile 2011-01-31 17:49:31.000000000 +0100 +++ head-2011-03-11/drivers/xen/Makefile 2011-02-28 15:13:33.000000000 +0100 @@ -1,5 +1,7 @@ -obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o +obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o features.o events.o +xen-balloon-$(CONFIG_PARAVIRT_XEN) := balloon.o +xen-balloon-$(CONFIG_XEN) := balloon/ obj-$(CONFIG_XEN) += core/ obj-$(CONFIG_XEN) += console/ obj-$(CONFIG_XEN) += evtchn/ @@ -8,8 +10,9 @@ obj-$(CONFIG_XEN) += char/ xen-backend-$(CONFIG_XEN_BACKEND) := util.o -obj-$(CONFIG_XEN) += $(xen-backend-y) $(xen-backend-m) -obj-$(CONFIG_XEN_BALLOON) += balloon/ +obj-$(CONFIG_XEN) += features.o $(xen-backend-y) $(xen-backend-m) +obj-$(CONFIG_XEN_XENCOMM) += xencomm.o +obj-$(CONFIG_XEN_BALLOON) += $(xen-balloon-y) obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/ obj-$(CONFIG_XEN_BLKDEV_TAP) += blktap/ obj-$(CONFIG_XEN_BLKDEV_TAP2) += blktap2/ blktap2-new/ --- head-2011-03-11.orig/drivers/xen/blkfront/blkfront.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/drivers/xen/blkfront/blkfront.c 2011-01-31 18:07:35.000000000 +0100 @@ -285,7 +285,11 @@ static void backend_changed(struct xenbu break; case XenbusStateClosing: - bd = bdget(info->dev); + if (!info->gd) { + xenbus_frontend_closed(dev); + break; + } + bd = bdget_disk(info->gd, 0); if (bd == NULL) { xenbus_dev_fatal(dev, -ENODEV, "bdget failed"); break; --- head-2011-03-11.orig/drivers/xen/blkfront/block.h 2011-01-31 17:56:27.000000000 +0100 +++ head-2011-03-11/drivers/xen/blkfront/block.h 2011-01-31 18:07:35.000000000 +0100 @@ -97,7 +97,6 @@ struct blk_shadow { struct blkfront_info { struct xenbus_device *xbdev; - dev_t dev; struct gendisk *gd; int vdevice; blkif_vdev_t handle; --- head-2011-03-11.orig/drivers/xen/blkfront/vbd.c 2011-01-31 17:56:27.000000000 +0100 +++ head-2011-03-11/drivers/xen/blkfront/vbd.c 2011-01-31 18:07:35.000000000 +0100 @@ -328,17 +328,33 @@ xlvbd_init_blk_queue(struct gendisk *gd, return 0; } -static int -xlvbd_alloc_gendisk(int major, int minor, blkif_sector_t capacity, int vdevice, - u16 vdisk_info, u16 sector_size, - struct blkfront_info *info) +int +xlvbd_add(blkif_sector_t capacity, int vdevice, u16 vdisk_info, + u16 sector_size, struct blkfront_info *info) { + int major, minor; struct gendisk *gd; struct xlbd_major_info *mi; int nr_minors = 1; int err = -ENODEV; unsigned int 
offset; + if ((vdevice>>EXT_SHIFT) > 1) { + /* this is above the extended range; something is wrong */ + pr_warning("blkfront: vdevice %#x is above the extended range;" + " ignoring\n", vdevice); + return -ENODEV; + } + + if (!VDEV_IS_EXTENDED(vdevice)) { + major = BLKIF_MAJOR(vdevice); + minor = BLKIF_MINOR(vdevice); + } + else { + major = 202; + minor = BLKIF_MINOR_EXT(vdevice); + } + BUG_ON(info->gd != NULL); BUG_ON(info->mi != NULL); BUG_ON(info->rq != NULL); @@ -426,42 +442,6 @@ xlvbd_alloc_gendisk(int major, int minor return err; } -int -xlvbd_add(blkif_sector_t capacity, int vdevice, u16 vdisk_info, - u16 sector_size, struct blkfront_info *info) -{ - struct block_device *bd; - int err = 0; - int major, minor; - - if ((vdevice>>EXT_SHIFT) > 1) { - /* this is above the extended range; something is wrong */ - pr_warning("blkfront: vdevice %#x is above the extended" - " range; ignoring\n", vdevice); - return -ENODEV; - } - - if (!VDEV_IS_EXTENDED(vdevice)) { - major = BLKIF_MAJOR(vdevice); - minor = BLKIF_MINOR(vdevice); - } - else { - major = 202; - minor = BLKIF_MINOR_EXT(vdevice); - } - - info->dev = MKDEV(major, minor); - bd = bdget(info->dev); - if (bd == NULL) - return -ENODEV; - - err = xlvbd_alloc_gendisk(major, minor, capacity, vdevice, vdisk_info, - sector_size, info); - - bdput(bd); - return err; -} - void xlvbd_del(struct blkfront_info *info) { --- head-2011-03-11.orig/drivers/xen/blktap/blktap.c 2011-02-17 10:10:56.000000000 +0100 +++ head-2011-03-11/drivers/xen/blktap/blktap.c 2011-02-17 10:11:08.000000000 +0100 @@ -110,6 +110,7 @@ typedef struct tap_blkif { unsigned long mode; /*current switching mode */ int minor; /*Minor number for tapdisk device */ pid_t pid; /*tapdisk process id */ + struct pid_namespace *pid_ns; /*... and its corresponding namespace */ enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace shutdown */ struct idx_map { @@ -281,16 +282,14 @@ static inline unsigned int OFFSET_TO_SEG * BLKTAP VM OPS */ -static struct page *blktap_nopage(struct vm_area_struct *vma, - unsigned long address, - int *type) +static int blktap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { /* * if the page has not been mapped in by the driver then return - * NOPAGE_SIGBUS to the domain. + * VM_FAULT_SIGBUS to the domain. */ - return NOPAGE_SIGBUS; + return VM_FAULT_SIGBUS; } static pte_t blktap_clear_pte(struct vm_area_struct *vma, @@ -407,7 +406,7 @@ static void blktap_vma_close(struct vm_a } static struct vm_operations_struct blktap_vm_ops = { - nopage: blktap_nopage, + fault: blktap_fault, zap_pte: blktap_clear_pte, open: blktap_vma_open, close: blktap_vma_close, @@ -502,9 +501,8 @@ found: tapfds[minor] = info; if ((class = get_xen_class()) != NULL) - class_device_create(class, NULL, - MKDEV(blktap_major, minor), NULL, - "blktap%d", minor); + device_create(class, NULL, MKDEV(blktap_major, minor), + "blktap%d", minor); } out: @@ -547,7 +545,7 @@ void signal_tapdisk(int idx) return; if (info->pid > 0) { - ptask = find_task_by_pid(info->pid); + ptask = find_task_by_pid_ns(info->pid, info->pid_ns); if (ptask) info->status = CLEANSHUTDOWN; } @@ -790,8 +788,9 @@ static int blktap_ioctl(struct inode *in { if (info) { info->pid = (pid_t)arg; - DPRINTK("blktap: pid received %d\n", - info->pid); + info->pid_ns = current->nsproxy->pid_ns; + DPRINTK("blktap: pid received %p:%d\n", + info->pid_ns, info->pid); } return 0; } @@ -1744,9 +1743,7 @@ static int __init blkif_init(void) * We only create the device when a request of a new device is * made. 
*/ - class_device_create(class, NULL, - MKDEV(blktap_major, 0), NULL, - "blktap0"); + device_create(class, NULL, MKDEV(blktap_major, 0), "blktap0"); } else { /* this is bad, but not fatal */ WPRINTK("blktap: sysfs xen_class not created\n"); --- head-2011-03-11.orig/drivers/xen/blktap2/blktap.h 2011-01-31 17:49:31.000000000 +0100 +++ head-2011-03-11/drivers/xen/blktap2/blktap.h 2011-01-31 18:07:35.000000000 +0100 @@ -127,7 +127,7 @@ struct blktap_ring { wait_queue_head_t poll_wait; dev_t devno; - struct class_device *dev; + struct device *dev; atomic_t sysfs_refcnt; struct mutex sysfs_mutex; }; --- head-2011-03-11.orig/drivers/xen/blktap2/device.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/drivers/xen/blktap2/device.c 2011-01-31 18:07:35.000000000 +0100 @@ -567,7 +567,8 @@ blktap_map(struct blktap *tap, if (!xen_feature(XENFEAT_auto_translated_physmap)) { pte = mk_pte(page, ring->vma->vm_page_prot); - blktap_map_uaddr(ring->vma, uvaddr, pte_mkwrite(pte)); + blktap_map_uaddr(ring->vma, uvaddr, + pte_mkspecial(pte_mkwrite(pte))); flush_tlb_page(ring->vma, uvaddr); blktap_map_uaddr(NULL, kvaddr, mk_pte(page, PAGE_KERNEL)); flush_tlb_kernel_page(kvaddr); --- head-2011-03-11.orig/drivers/xen/blktap2/ring.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/drivers/xen/blktap2/ring.c 2011-01-31 18:07:35.000000000 +0100 @@ -66,16 +66,15 @@ blktap_read_ring(struct blktap *tap) return 0; } -static struct page * -blktap_ring_nopage(struct vm_area_struct *vma, - unsigned long address, int *type) +static int +blktap_ring_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { /* * if the page has not been mapped in by the driver then return - * NOPAGE_SIGBUS to the domain. + * VM_FAULT_SIGBUS to the domain. */ - return NOPAGE_SIGBUS; + return VM_FAULT_SIGBUS; } static pte_t @@ -202,7 +201,7 @@ blktap_ring_vm_close(struct vm_area_stru static struct vm_operations_struct blktap_ring_vm_operations = { .close = blktap_ring_vm_close, .unmap = blktap_ring_vm_unmap, - .nopage = blktap_ring_nopage, + .fault = blktap_ring_fault, .zap_pte = blktap_ring_clear_pte, }; --- head-2011-03-11.orig/drivers/xen/blktap2/sysfs.c 2011-01-31 17:49:31.000000000 +0100 +++ head-2011-03-11/drivers/xen/blktap2/sysfs.c 2011-03-11 10:58:58.000000000 +0100 @@ -36,16 +36,21 @@ blktap_sysfs_exit(struct blktap *tap) blktap_sysfs_put(tap); } -static ssize_t blktap_sysfs_pause_device(struct class_device *, const char *, size_t); -static CLASS_DEVICE_ATTR(pause, S_IWUSR, NULL, blktap_sysfs_pause_device); -static ssize_t blktap_sysfs_resume_device(struct class_device *, const char *, size_t); -static CLASS_DEVICE_ATTR(resume, S_IWUSR, NULL, blktap_sysfs_resume_device); +static ssize_t blktap_sysfs_pause_device(struct device *, + struct device_attribute *, + const char *, size_t); +static DEVICE_ATTR(pause, S_IWUSR, NULL, blktap_sysfs_pause_device); +static ssize_t blktap_sysfs_resume_device(struct device *, + struct device_attribute *, + const char *, size_t); +static DEVICE_ATTR(resume, S_IWUSR, NULL, blktap_sysfs_resume_device); static ssize_t -blktap_sysfs_set_name(struct class_device *dev, const char *buf, size_t size) +blktap_sysfs_set_name(struct device *dev, struct device_attribute *attr, + const char *buf, size_t size) { int err; - struct blktap *tap = (struct blktap *)dev->class_data; + struct blktap *tap = dev_get_drvdata(dev); blktap_sysfs_enter(tap); @@ -79,10 +84,11 @@ out: } static ssize_t -blktap_sysfs_get_name(struct class_device *dev, char *buf) +blktap_sysfs_get_name(struct device *dev, struct 
device_attribute *attr, + char *buf) { ssize_t size; - struct blktap *tap = (struct blktap *)dev->class_data; + struct blktap *tap = dev_get_drvdata(dev); blktap_sysfs_enter(tap); @@ -97,15 +103,15 @@ blktap_sysfs_get_name(struct class_devic return size; } -static CLASS_DEVICE_ATTR(name, S_IRUSR | S_IWUSR, - blktap_sysfs_get_name, blktap_sysfs_set_name); +static DEVICE_ATTR(name, S_IRUSR | S_IWUSR, + blktap_sysfs_get_name, blktap_sysfs_set_name); static ssize_t -blktap_sysfs_remove_device(struct class_device *dev, +blktap_sysfs_remove_device(struct device *dev, struct device_attribute *attr, const char *buf, size_t size) { int err; - struct blktap *tap = (struct blktap *)dev->class_data; + struct blktap *tap = dev_get_drvdata(dev); if (!tap->ring.dev) return size; @@ -117,14 +123,14 @@ blktap_sysfs_remove_device(struct class_ return (err ? : size); } -static CLASS_DEVICE_ATTR(remove, S_IWUSR, NULL, blktap_sysfs_remove_device); +static DEVICE_ATTR(remove, S_IWUSR, NULL, blktap_sysfs_remove_device); static ssize_t -blktap_sysfs_pause_device(struct class_device *dev, +blktap_sysfs_pause_device(struct device *dev, struct device_attribute *attr, const char *buf, size_t size) { int err; - struct blktap *tap = (struct blktap *)dev->class_data; + struct blktap *tap = dev_get_drvdata(dev); blktap_sysfs_enter(tap); @@ -149,8 +155,8 @@ blktap_sysfs_pause_device(struct class_d err = blktap_device_pause(tap); if (!err) { - class_device_remove_file(dev, &class_device_attr_pause); - err = class_device_create_file(dev, &class_device_attr_resume); + device_remove_file(dev, &dev_attr_pause); + err = device_create_file(dev, &dev_attr_resume); } out: @@ -160,11 +166,11 @@ out: } static ssize_t -blktap_sysfs_resume_device(struct class_device *dev, +blktap_sysfs_resume_device(struct device *dev, struct device_attribute *attr, const char *buf, size_t size) { int err; - struct blktap *tap = (struct blktap *)dev->class_data; + struct blktap *tap = dev_get_drvdata(dev); blktap_sysfs_enter(tap); @@ -181,8 +187,8 @@ blktap_sysfs_resume_device(struct class_ err = blktap_device_resume(tap); if (!err) { - class_device_remove_file(dev, &class_device_attr_resume); - err = class_device_create_file(dev, &class_device_attr_pause); + device_remove_file(dev, &dev_attr_resume); + err = device_create_file(dev, &dev_attr_pause); } out: @@ -194,12 +200,13 @@ out: #ifdef ENABLE_PASSTHROUGH static ssize_t -blktap_sysfs_enable_passthrough(struct class_device *dev, +blktap_sysfs_enable_passthrough(struct device *dev, + struct device_attribute *attr, const char *buf, size_t size) { int err; unsigned major, minor; - struct blktap *tap = (struct blktap *)dev->class_data; + struct blktap *tap = dev_get_drvdata(dev); BTINFO("passthrough request enabled\n"); @@ -237,11 +244,12 @@ out: #endif static ssize_t -blktap_sysfs_debug_device(struct class_device *dev, char *buf) +blktap_sysfs_debug_device(struct device *dev, struct device_attribute *attr, + char *buf) { char *tmp; int i, ret; - struct blktap *tap = (struct blktap *)dev->class_data; + struct blktap *tap = dev_get_drvdata(dev); tmp = buf; blktap_sysfs_get(tap); @@ -285,13 +293,13 @@ out: return ret; } -static CLASS_DEVICE_ATTR(debug, S_IRUSR, blktap_sysfs_debug_device, NULL); +static DEVICE_ATTR(debug, S_IRUSR, blktap_sysfs_debug_device, NULL); int blktap_sysfs_create(struct blktap *tap) { struct blktap_ring *ring; - struct class_device *dev; + struct device *dev; int err, state = 0; if (!class) @@ -299,53 +307,52 @@ blktap_sysfs_create(struct blktap *tap) ring = &tap->ring; - dev = 
class_device_create(class, NULL, ring->devno, - NULL, "blktap%d", tap->minor); + dev = device_create_drvdata(class, NULL, ring->devno, tap, + "blktap%d", tap->minor); if (IS_ERR(dev)) return PTR_ERR(dev); - ring->dev = dev; - dev->class_data = tap; + ring->dev = dev; mutex_init(&ring->sysfs_mutex); atomic_set(&ring->sysfs_refcnt, 0); set_bit(BLKTAP_SYSFS, &tap->dev_inuse); - err = class_device_create_file(dev, &class_device_attr_name); + err = device_create_file(dev, &dev_attr_name); if (!err) { ++state; - err = class_device_create_file(dev, &class_device_attr_remove); + err = device_create_file(dev, &dev_attr_remove); } if (!err) { ++state; - err = class_device_create_file(dev, &class_device_attr_pause); + err = device_create_file(dev, &dev_attr_pause); } if (!err) { ++state; - err = class_device_create_file(dev, &class_device_attr_debug); + err = device_create_file(dev, &dev_attr_debug); } switch (state * !!err) { - case 3: class_device_remove_file(dev, &class_device_attr_pause); - case 2: class_device_remove_file(dev, &class_device_attr_remove); - case 1: class_device_remove_file(dev, &class_device_attr_name); + case 3: device_remove_file(dev, &dev_attr_pause); + case 2: device_remove_file(dev, &dev_attr_remove); + case 1: device_remove_file(dev, &dev_attr_name); } return err; } static void -_blktap_sysfs_destroy(struct device *_dev) +_blktap_sysfs_destroy(struct device *dev) { - struct class_device *dev = _dev->???; - struct blktap *tap = dev->class_data; + struct blktap *tap = dev_get_drvdata(dev); - class_device_remove_file(dev, &class_device_attr_name); - class_device_remove_file(dev, &class_device_attr_remove); - class_device_remove_file(dev, &class_device_attr_pause); - class_device_remove_file(dev, &class_device_attr_resume); - class_device_remove_file(dev, &class_device_attr_debug); - class_device_unregister(dev); + device_remove_file(dev, &dev_attr_name); + device_remove_file(dev, &dev_attr_remove); + device_remove_file(dev, &dev_attr_pause); + device_remove_file(dev, &dev_attr_resume); + device_remove_file(dev, &dev_attr_debug); + + device_unregister(dev); clear_bit(BLKTAP_SYSFS, &tap->dev_inuse); @@ -356,7 +363,7 @@ int blktap_sysfs_destroy(struct blktap *tap) { struct blktap_ring *ring; - struct class_device *dev; + struct device *dev; ring = &tap->ring; dev = ring->dev; @@ -368,7 +375,7 @@ blktap_sysfs_destroy(struct blktap *tap) !atomic_read(&tap->ring.sysfs_refcnt))) return -EAGAIN; - return device_schedule_callback(dev->???, _blktap_sysfs_destroy); + return device_schedule_callback(dev, _blktap_sysfs_destroy); } static ssize_t --- head-2011-03-11.orig/drivers/xen/char/mem.c 2011-01-31 17:32:29.000000000 +0100 +++ head-2011-03-11/drivers/xen/char/mem.c 2011-01-31 18:07:35.000000000 +0100 @@ -33,6 +33,27 @@ static inline int uncached_access(struct return 0; } +static inline int range_is_allowed(unsigned long pfn, unsigned long size) +{ +#ifdef CONFIG_NONPROMISC_DEVMEM + u64 from = ((u64)pfn) << PAGE_SHIFT; + u64 to = from + size; + u64 cursor = from; + + while (cursor < to) { + if (!devmem_is_allowed(pfn)) { + printk(KERN_INFO + "Program %s tried to access /dev/mem between %Lx->%Lx.\n", + current->comm, from, to); + return 0; + } + cursor += PAGE_SIZE; + pfn++; + } +#endif + return 1; +} + /* * This funcion reads the *physical* memory. The f_pos points directly to the * memory location. 
@@ -55,6 +76,9 @@ static ssize_t read_mem(struct file * fi sz = min_t(unsigned long, sz, count); + if (!range_is_allowed(p >> PAGE_SHIFT, count)) + return -EPERM; + v = ioremap(p, sz); if (IS_ERR(v) || v == NULL) { /* @@ -103,6 +127,9 @@ static ssize_t write_mem(struct file * f sz = min_t(unsigned long, sz, count); + if (!range_is_allowed(p >> PAGE_SHIFT, sz)) + return -EPERM; + v = ioremap(p, sz); if (v == NULL) break; @@ -131,6 +158,23 @@ static ssize_t write_mem(struct file * f } #ifndef ARCH_HAS_DEV_MEM_MMAP_MEM +static void mmap_mem_open(struct vm_area_struct *vma) +{ + map_devmem(vma->vm_pgoff, vma->vm_end - vma->vm_start, + vma->vm_page_prot); +} + +static void mmap_mem_close(struct vm_area_struct *vma) +{ + unmap_devmem(vma->vm_pgoff, vma->vm_end - vma->vm_start, + vma->vm_page_prot); +} + +static struct vm_operations_struct mmap_mem_ops = { + .open = mmap_mem_open, + .close = mmap_mem_close +}; + static int xen_mmap_mem(struct file * file, struct vm_area_struct * vma) { size_t size = vma->vm_end - vma->vm_start; @@ -138,6 +182,15 @@ static int xen_mmap_mem(struct file * fi if (uncached_access(file)) vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + if (!range_is_allowed(vma->vm_pgoff, size)) + return -EPERM; + + if (!phys_mem_access_prot_allowed(file, vma->vm_pgoff, size, + &vma->vm_page_prot)) + return -EINVAL; + + vma->vm_ops = &mmap_mem_ops; + /* We want to return the real error code, not EAGAIN. */ return direct_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, size, vma->vm_page_prot, DOMID_IO); --- head-2011-03-11.orig/drivers/xen/console/console.c 2011-01-31 17:56:27.000000000 +0100 +++ head-2011-03-11/drivers/xen/console/console.c 2011-01-31 18:07:35.000000000 +0100 @@ -551,16 +551,18 @@ static int xencons_write( return i; } -static void xencons_put_char(struct tty_struct *tty, u_char ch) +static int xencons_put_char(struct tty_struct *tty, u_char ch) { unsigned long flags; + int ret; if (DUMMY_TTY(tty)) - return; + return 0; spin_lock_irqsave(&xencons_lock, flags); - (void)__xencons_put_char(ch); + ret = __xencons_put_char(ch); spin_unlock_irqrestore(&xencons_lock, flags); + return ret; } static void xencons_flush_chars(struct tty_struct *tty) @@ -582,7 +584,7 @@ static void xencons_wait_until_sent(stru if (DUMMY_TTY(tty)) return; - while (DRV(tty->driver)->chars_in_buffer(tty)) { + while (tty_chars_in_buffer(tty)) { set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(1); if (signal_pending(current)) @@ -631,8 +633,7 @@ static void xencons_close(struct tty_str tty->closing = 1; tty_wait_until_sent(tty, 0); - if (DRV(tty->driver)->flush_buffer != NULL) - DRV(tty->driver)->flush_buffer(tty); + tty_driver_flush_buffer(tty); if (tty->ldisc.flush_buffer != NULL) tty->ldisc.flush_buffer(tty); tty->closing = 0; --- head-2011-03-11.orig/drivers/xen/core/Makefile 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/drivers/xen/core/Makefile 2011-01-31 18:07:35.000000000 +0100 @@ -2,7 +2,7 @@ # Makefile for the linux kernel. 
# -obj-y := evtchn.o gnttab.o features.o reboot.o machine_reboot.o firmware.o +obj-y := evtchn.o gnttab.o reboot.o machine_reboot.o firmware.o obj-$(CONFIG_PCI) += pci.o obj-$(CONFIG_PROC_FS) += xen_proc.o @@ -12,4 +12,3 @@ obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o obj-$(CONFIG_XEN_SMPBOOT) += smpboot.o obj-$(CONFIG_X86_SMP) += spinlock.o obj-$(CONFIG_KEXEC) += machine_kexec.o -obj-$(CONFIG_XEN_XENCOMM) += xencomm.o --- head-2011-03-11.orig/drivers/xen/core/machine_kexec.c 2011-01-31 17:56:27.000000000 +0100 +++ head-2011-03-11/drivers/xen/core/machine_kexec.c 2011-01-31 18:07:35.000000000 +0100 @@ -5,6 +5,7 @@ #include <linux/kexec.h> #include <xen/interface/kexec.h> +#include <linux/reboot.h> #include <linux/mm.h> #include <linux/bootmem.h> @@ -90,6 +91,9 @@ void __init xen_machine_kexec_setup_reso xen_hypervisor_res.start = range.start; xen_hypervisor_res.end = range.start + range.size - 1; xen_hypervisor_res.flags = IORESOURCE_BUSY | IORESOURCE_MEM; +#ifdef CONFIG_X86_64 + insert_resource(&iomem_resource, &xen_hypervisor_res); +#endif /* fill in crashk_res if range is reserved by hypervisor */ @@ -102,6 +106,9 @@ void __init xen_machine_kexec_setup_reso if (range.size) { crashk_res.start = range.start; crashk_res.end = range.start + range.size - 1; +#ifdef CONFIG_X86_64 + insert_resource(&iomem_resource, &crashk_res); +#endif } /* get physical address of vmcoreinfo */ @@ -134,6 +141,14 @@ void __init xen_machine_kexec_setup_reso xen_max_nr_phys_cpus)) goto err; +#ifdef CONFIG_X86_64 + for (k = 0; k < xen_max_nr_phys_cpus; k++) { + res = xen_phys_cpus + k; + if (!res->parent) /* outside of xen_hypervisor_res range */ + insert_resource(&iomem_resource, res); + } +#endif + #ifdef CONFIG_X86 if (xen_create_contiguous_region((unsigned long)&vmcoreinfo_note, get_order(sizeof(vmcoreinfo_note)), @@ -153,6 +168,7 @@ void __init xen_machine_kexec_setup_reso return; } +#ifndef CONFIG_X86_64 void __init xen_machine_kexec_register_resources(struct resource *res) { int k; @@ -166,6 +182,7 @@ void __init xen_machine_kexec_register_r } machine_kexec_register_resources(res); } +#endif static void setup_load_arg(xen_kexec_image_t *xki, struct kimage *image) { @@ -236,6 +253,11 @@ void machine_shutdown(void) /* do nothing */ } +void machine_crash_shutdown(struct pt_regs *regs) +{ + /* The kernel is broken so disable interrupts */ + local_irq_disable(); +} /* * Local variables: --- head-2011-03-11.orig/drivers/xen/core/smpboot.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/drivers/xen/core/smpboot.c 2011-01-31 18:07:35.000000000 +0100 @@ -120,7 +120,7 @@ static int __cpuinit xen_smp_intr_init(u } #ifdef CONFIG_HOTPLUG_CPU -static void xen_smp_intr_exit(unsigned int cpu) +static void __cpuinit xen_smp_intr_exit(unsigned int cpu) { if (cpu != 0) local_teardown_timer(cpu); @@ -324,7 +324,7 @@ static int __init initialize_cpu_present } core_initcall(initialize_cpu_present_map); -int __cpu_disable(void) +int __cpuinit __cpu_disable(void) { cpumask_t map = cpu_online_map; unsigned int cpu = smp_processor_id(); @@ -339,7 +339,7 @@ int __cpu_disable(void) return 0; } -void __cpu_die(unsigned int cpu) +void __cpuinit __cpu_die(unsigned int cpu) { while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) { current->state = TASK_UNINTERRUPTIBLE; --- head-2011-03-11.orig/drivers/xen/core/xen_proc.c 2007-06-12 13:13:44.000000000 +0200 +++ head-2011-03-11/drivers/xen/core/xen_proc.c 2011-01-31 18:07:35.000000000 +0100 @@ -8,7 +8,7 @@ static struct proc_dir_entry *xen_base; struct proc_dir_entry 
*create_xen_proc_entry(const char *name, mode_t mode) { if ( xen_base == NULL ) - if ( (xen_base = proc_mkdir("xen", &proc_root)) == NULL ) + if ( (xen_base = proc_mkdir("xen", NULL)) == NULL ) panic("Couldn't create /proc/xen"); return create_proc_entry(name, mode, xen_base); } --- head-2011-03-11.orig/drivers/xen/fbfront/xenfb.c 2011-01-31 17:56:27.000000000 +0100 +++ head-2011-03-11/drivers/xen/fbfront/xenfb.c 2011-01-31 18:07:35.000000000 +0100 @@ -93,7 +93,7 @@ struct xenfb_info * only mappings. The former creates unfaulted pages. Preserves * invariant. The latter removes pages. Preserves invariant. * - * 3. Holding both locks: xenfb_vm_nopage(). Extends the dirty + * 3. Holding both locks: xenfb_vm_fault(). Extends the dirty * rectangle and updates mappings consistently. Preserves * invariant. * @@ -112,13 +112,13 @@ struct xenfb_info * * But FIXME: the invariant is too weak. It misses that the fault * record in mappings must be consistent with the mapping of pages in - * the associated address space! do_no_page() updates the PTE after - * xenfb_vm_nopage() returns, i.e. outside the critical region. This + * the associated address space! __do_fault() updates the PTE after + * xenfb_vm_fault() returns, i.e. outside the critical region. This * allows the following race: * * X writes to some address in the Xen frame buffer - * Fault - call do_no_page() - * call xenfb_vm_nopage() + * Fault - call __do_fault() + * call xenfb_vm_fault() * grab mm_lock * map->faults++; * release mm_lock @@ -387,18 +387,17 @@ static void xenfb_vm_close(struct vm_are mutex_unlock(&info->mm_lock); } -static struct page *xenfb_vm_nopage(struct vm_area_struct *vma, - unsigned long vaddr, int *type) +static int xenfb_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct xenfb_mapping *map = vma->vm_private_data; struct xenfb_info *info = map->info; - int pgnr = (vaddr - vma->vm_start) >> PAGE_SHIFT; + int pgnr = ((long)vmf->virtual_address - vma->vm_start) >> PAGE_SHIFT; unsigned long flags; struct page *page; int y1, y2; if (pgnr >= info->nr_pages) - return NOPAGE_SIGBUS; + return VM_FAULT_SIGBUS; mutex_lock(&info->mm_lock); spin_lock_irqsave(&info->dirty_lock, flags); @@ -414,16 +413,15 @@ static struct page *xenfb_vm_nopage(stru spin_unlock_irqrestore(&info->dirty_lock, flags); mutex_unlock(&info->mm_lock); - if (type) - *type = VM_FAULT_MINOR; + vmf->page = page; - return page; + return VM_FAULT_MINOR; } static struct vm_operations_struct xenfb_vm_ops = { .open = xenfb_vm_open, .close = xenfb_vm_close, - .nopage = xenfb_vm_nopage, + .fault = xenfb_vm_fault, }; static int xenfb_mmap(struct fb_info *fb_info, struct vm_area_struct *vma) --- head-2011-03-11.orig/drivers/xen/features.c 2011-03-11 10:41:54.000000000 +0100 +++ head-2011-03-11/drivers/xen/features.c 2011-01-31 18:07:35.000000000 +0100 @@ -9,14 +9,21 @@ #include <linux/cache.h> #include <linux/module.h> +#ifdef CONFIG_PARAVIRT_XEN #include <asm/xen/hypercall.h> +#else +#include <asm/hypervisor.h> +#endif +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif #include <xen/interface/xen.h> #include <xen/interface/version.h> #include <xen/features.h> u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly; -EXPORT_SYMBOL_GPL(xen_features); +EXPORT_SYMBOL(xen_features); void xen_setup_features(void) { --- head-2011-03-11.orig/drivers/xen/gntdev/gntdev.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/drivers/xen/gntdev/gntdev.c 2011-01-31 18:07:35.000000000 +0100 @@ -375,7 +375,7 @@ nomem_out: static int __init 
gntdev_init(void) { struct class *class; - struct class_device *device; + struct device *device; if (!is_running_on_xen()) { pr_err("You must be running Xen to use gntdev\n"); @@ -399,8 +399,8 @@ static int __init gntdev_init(void) return 0; } - device = class_device_create(class, NULL, MKDEV(gntdev_major, 0), - NULL, GNTDEV_NAME); + device = device_create(class, NULL, MKDEV(gntdev_major, 0), + GNTDEV_NAME); if (IS_ERR(device)) { pr_err("Error creating gntdev device in xen_class\n"); pr_err("gntdev created, major number = %d\n", gntdev_major); @@ -416,7 +416,7 @@ static void __exit gntdev_exit(void) { struct class *class; if ((class = get_xen_class()) != NULL) - class_device_destroy(class, MKDEV(gntdev_major, 0)); + device_destroy(class, MKDEV(gntdev_major, 0)); unregister_chrdev(gntdev_major, GNTDEV_NAME); } --- head-2011-03-11.orig/drivers/xen/netfront/netfront.c 2011-02-09 16:00:35.000000000 +0100 +++ head-2011-03-11/drivers/xen/netfront/netfront.c 2011-02-09 16:04:02.000000000 +0100 @@ -1452,8 +1452,7 @@ err: } } - while ((skb = __skb_dequeue(&errq))) - kfree_skb(skb); + __skb_queue_purge(&errq); while ((skb = __skb_dequeue(&rxq)) != NULL) { struct page *page = NETFRONT_SKB_CB(skb)->page; @@ -1623,8 +1622,7 @@ static void netif_release_rx_bufs_flip(s } } - while ((skb = __skb_dequeue(&free_list)) != NULL) - dev_kfree_skb(skb); + __skb_queue_purge(&free_list); spin_unlock_bh(&np->rx_lock); } --- head-2011-03-11.orig/drivers/xen/pciback/pci_stub.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/drivers/xen/pciback/pci_stub.c 2011-01-31 18:07:35.000000000 +0100 @@ -5,7 +5,9 @@ * Chris Bookholt <hap10@epoch.ncsc.mil> */ #include <linux/module.h> +#include <linux/gfp.h> #include <linux/init.h> +#include <linux/kernel.h> #include <linux/rwsem.h> #include <linux/list.h> #include <linux/spinlock.h> @@ -507,34 +509,46 @@ static void kill_domain_by_device(struct { struct xenbus_transaction xbt; int err; - char nodename[1024]; + char *nodename; - if (!psdev) + if (!psdev) { dev_err(&psdev->dev->dev, "device is NULL when do AER recovery/kill_domain\n"); - sprintf(nodename, "/local/domain/0/backend/pci/%d/0", - psdev->pdev->xdev->otherend_id); - nodename[strlen(nodename)] = '\0'; - -again: - err = xenbus_transaction_start(&xbt); - if (err) - { - dev_err(&psdev->dev->dev, - "error %d when start xenbus transaction\n", err); return; } - /*PV AER handlers will set this flag*/ - xenbus_printf(xbt, nodename, "aerState" , "aerfail" ); - err = xenbus_transaction_end(xbt, 0); - if (err) - { - if (err == -EAGAIN) - goto again; + + nodename = kasprintf(GFP_KERNEL, + "/local/domain/0/backend/pci/%d/0", + psdev->pdev->xdev->otherend_id); + if (!nodename) { dev_err(&psdev->dev->dev, - "error %d when end xenbus transaction\n", err); + "not enough memory\n"); return; } + + do { + err = xenbus_transaction_start(&xbt); + if (err) { + dev_err(&psdev->dev->dev, + "error %d starting xenbus transaction\n", err); + break; + } + + /* PV AER handlers will set this flag */ + xenbus_printf(xbt, nodename, "aerState" , "aerfail" ); + + err = xenbus_transaction_end(xbt, 0); + switch (err) { + default: + dev_err(&psdev->dev->dev, + "error %d ending xenbus transaction\n", err); + break; + case 0: + case -EAGAIN: + break; + } + } while (err == -EAGAIN); + kfree(nodename); } /* For each aer recovery step error_detected, mmio_enabled, etc, front_end and --- head-2011-03-11.orig/drivers/xen/privcmd/privcmd.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-03-11/drivers/xen/privcmd/privcmd.c 2011-01-31 18:07:35.000000000 
+0100 @@ -401,15 +401,13 @@ static long privcmd_ioctl(struct file *f } #ifndef HAVE_ARCH_PRIVCMD_MMAP -static struct page *privcmd_nopage(struct vm_area_struct *vma, - unsigned long address, - int *type) +static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { - return NOPAGE_SIGBUS; + return VM_FAULT_SIGBUS; } static struct vm_operations_struct privcmd_vm_ops = { - .nopage = privcmd_nopage + .fault = privcmd_fault }; static int privcmd_mmap(struct file * file, struct vm_area_struct * vma) --- head-2011-03-11.orig/drivers/xen/xenbus/xenbus_client.c 2011-01-31 17:49:31.000000000 +0100 +++ head-2011-03-11/drivers/xen/xenbus/xenbus_client.c 2011-01-31 18:07:35.000000000 +0100 @@ -437,7 +437,7 @@ int xenbus_map_ring_valloc(struct xenbus *vaddr = NULL; - area = alloc_vm_area(PAGE_SIZE); + area = xen_alloc_vm_area(PAGE_SIZE); if (!area) return -ENOMEM; @@ -447,7 +447,7 @@ int xenbus_map_ring_valloc(struct xenbus BUG(); if (op.status != GNTST_okay) { - free_vm_area(area); + xen_free_vm_area(area); xenbus_dev_fatal(dev, op.status, "mapping in shared page %d from domain %d", gnt_ref, dev->otherend_id); @@ -546,7 +546,7 @@ int xenbus_unmap_ring_vfree(struct xenbu BUG(); if (op.status == GNTST_okay) - free_vm_area(area); + xen_free_vm_area(area); else xenbus_dev_error(dev, op.status, "unmapping page at handle %d error %d", --- head-2011-03-11.orig/drivers/xen/xenbus/xenbus_probe.c 2011-01-31 17:56:27.000000000 +0100 +++ head-2011-03-11/drivers/xen/xenbus/xenbus_probe.c 2011-01-31 18:07:35.000000000 +0100 @@ -175,7 +175,7 @@ static int read_backend_details(struct x return read_otherend_details(xendev, "backend-id", "backend"); } -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) && (defined(CONFIG_XEN) || defined(MODULE)) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) static int xenbus_uevent_frontend(struct device *dev, struct kobj_uevent_env *env) { struct xenbus_device *xdev; @@ -187,8 +187,10 @@ static int xenbus_uevent_frontend(struct return -ENODEV; /* stuff we want to pass to /sbin/hotplug */ +#if defined(CONFIG_XEN) || defined(MODULE) add_uevent_var(env, "XENBUS_TYPE=%s", xdev->devicetype); add_uevent_var(env, "XENBUS_PATH=%s", xdev->nodename); +#endif add_uevent_var(env, "MODALIAS=xen:%s", xdev->devicetype); return 0; @@ -209,10 +211,8 @@ static struct xen_bus_type xenbus_fronte .probe = xenbus_dev_probe, .remove = xenbus_dev_remove, .shutdown = xenbus_dev_shutdown, -#if defined(CONFIG_XEN) || defined(MODULE) .uevent = xenbus_uevent_frontend, #endif -#endif }, #if defined(CONFIG_XEN) || defined(MODULE) .dev = { @@ -531,6 +531,15 @@ static ssize_t xendev_show_devtype(struc } static DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL); +static ssize_t xendev_show_modalias(struct device *dev, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,13) + struct device_attribute *attr, +#endif + char *buf) +{ + return sprintf(buf, "xen:%s\n", to_xenbus_device(dev)->devicetype); +} +static DEVICE_ATTR(modalias, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_modalias, NULL); int xenbus_probe_node(struct xen_bus_type *bus, const char *type, @@ -591,10 +600,16 @@ int xenbus_probe_node(struct xen_bus_typ err = device_create_file(&xendev->dev, &dev_attr_devtype); if (err) - goto fail_remove_file; + goto fail_remove_nodename; + + err = device_create_file(&xendev->dev, &dev_attr_modalias); + if (err) + goto fail_remove_devtype; return 0; -fail_remove_file: +fail_remove_devtype: + device_remove_file(&xendev->dev, &dev_attr_devtype); +fail_remove_nodename: 
device_remove_file(&xendev->dev, &dev_attr_nodename); fail_unregister: device_unregister(&xendev->dev); --- head-2011-03-11.orig/fs/aio.c 2011-03-11 10:55:30.000000000 +0100 +++ head-2011-03-11/fs/aio.c 2011-03-11 10:58:46.000000000 +0100 @@ -1243,6 +1243,7 @@ static void io_destroy(struct kioctx *io #ifdef CONFIG_EPOLL /* forget the poll file, but it's up to the user to close it */ if (ioctx->file) { + fput(ioctx->file); ioctx->file->private_data = 0; ioctx->file = 0; } @@ -1267,6 +1268,7 @@ static int aio_queue_fd_close(struct ino spin_lock_irq(&ioctx->ctx_lock); ioctx->file = 0; spin_unlock_irq(&ioctx->ctx_lock); + fput(file); } return 0; } @@ -1302,16 +1304,17 @@ static const struct file_operations aioq static int make_aio_fd(struct kioctx *ioctx) { - int error, fd; - struct inode *inode; + int fd; struct file *file; - error = anon_inode_getfd(&fd, &inode, &file, "[aioq]", - &aioq_fops, ioctx); - if (error) - return error; + fd = anon_inode_getfd("[aioq]", &aioq_fops, ioctx); + if (fd < 0) + return fd; /* associate the file with the IO context */ + file = fget(fd); + if (!file) + return -EBADF; file->private_data = ioctx; ioctx->file = file; init_waitqueue_head(&ioctx->poll_wait); --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/desc.h 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/desc.h 2011-01-31 18:07:35.000000000 +0100 @@ -64,8 +64,8 @@ static inline struct desc_struct *get_cp } static inline void pack_gate(gate_desc *gate, unsigned char type, - unsigned long base, unsigned dpl, unsigned flags, unsigned short seg) - + unsigned long base, unsigned dpl, unsigned flags, + unsigned short seg) { gate->a = (seg << 16) | (base & 0xffff); gate->b = (base & 0xffff0000) | @@ -84,22 +84,23 @@ static inline int desc_empty(const void #define load_TR_desc() native_load_tr_desc() #define load_gdt(dtr) native_load_gdt(dtr) #define load_idt(dtr) native_load_idt(dtr) -#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr)) -#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt)) +#define load_tr(tr) asm volatile("ltr %0"::"m" (tr)) +#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt)) #define store_gdt(dtr) native_store_gdt(dtr) #define store_idt(dtr) native_store_idt(dtr) #define store_tr(tr) (tr = native_store_tr()) -#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt)) +#define store_ldt(ldt) asm("sldt %0":"=m" (ldt)) #define load_TLS(t, cpu) native_load_tls(t, cpu) #define set_ldt native_set_ldt -#define write_ldt_entry(dt, entry, desc) \ - native_write_ldt_entry(dt, entry, desc) -#define write_gdt_entry(dt, entry, desc, type) \ - native_write_gdt_entry(dt, entry, desc, type) -#define write_idt_entry(dt, entry, g) native_write_idt_entry(dt, entry, g) +#define write_ldt_entry(dt, entry, desc) \ + native_write_ldt_entry(dt, entry, desc) +#define write_gdt_entry(dt, entry, desc, type) \ + native_write_gdt_entry(dt, entry, desc, type) +#define write_idt_entry(dt, entry, g) \ + native_write_idt_entry(dt, entry, g) static inline void native_write_idt_entry(gate_desc *idt, int entry, const gate_desc *gate) @@ -138,8 +139,8 @@ static inline void pack_descriptor(struc { desc->a = ((base & 0xffff) << 16) | (limit & 0xffff); desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) | - (limit & 0x000f0000) | ((type & 0xff) << 8) | - ((flags & 0xf) << 20); + (limit & 0x000f0000) | ((type & 0xff) << 8) | + ((flags & 0xf) << 20); desc->p = 1; } @@ -160,7 +161,6 @@ static inline void set_tssldt_descriptor desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF; 
desc->base3 = PTR_HIGH(addr); #else - pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0); #endif } @@ -178,7 +178,8 @@ static inline void __set_tss_desc(unsign * last valid byte */ set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS, - IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1); + IO_BITMAP_OFFSET + IO_BITMAP_BYTES + + sizeof(unsigned long) - 1); write_gdt_entry(d, entry, &tss, DESC_TSS); } @@ -187,16 +188,16 @@ static inline void __set_tss_desc(unsign static inline void native_set_ldt(const void *addr, unsigned int entries) { if (likely(entries == 0)) - __asm__ __volatile__("lldt %w0"::"q" (0)); + asm volatile("lldt %w0"::"q" (0)); else { unsigned cpu = smp_processor_id(); ldt_desc ldt; - set_tssldt_descriptor(&ldt, (unsigned long)addr, - DESC_LDT, entries * sizeof(ldt) - 1); + set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT, + entries * LDT_ENTRY_SIZE - 1); write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, &ldt, DESC_LDT); - __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8)); + asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8)); } } @@ -261,15 +262,15 @@ static inline void xen_load_tls(struct t } #endif -#define _LDT_empty(info) (\ - (info)->base_addr == 0 && \ - (info)->limit == 0 && \ - (info)->contents == 0 && \ - (info)->read_exec_only == 1 && \ - (info)->seg_32bit == 0 && \ - (info)->limit_in_pages == 0 && \ - (info)->seg_not_present == 1 && \ - (info)->useable == 0) +#define _LDT_empty(info) \ + ((info)->base_addr == 0 && \ + (info)->limit == 0 && \ + (info)->contents == 0 && \ + (info)->read_exec_only == 1 && \ + (info)->seg_32bit == 0 && \ + (info)->limit_in_pages == 0 && \ + (info)->seg_not_present == 1 && \ + (info)->useable == 0) #ifdef CONFIG_X86_64 #define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0)) @@ -309,7 +310,7 @@ static inline unsigned long get_desc_lim #ifndef CONFIG_X86_NO_IDT static inline void _set_gate(int gate, unsigned type, void *addr, - unsigned dpl, unsigned ist, unsigned seg) + unsigned dpl, unsigned ist, unsigned seg) { gate_desc s; pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg); @@ -393,10 +394,10 @@ static inline void set_system_gate_ist(i * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax. 
*/ #define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \ - movb idx*8+4(gdt), lo_b; \ - movb idx*8+7(gdt), hi_b; \ - shll $16, base; \ - movw idx*8+2(gdt), lo_w; + movb idx * 8 + 4(gdt), lo_b; \ + movb idx * 8 + 7(gdt), hi_b; \ + shll $16, base; \ + movw idx * 8 + 2(gdt), lo_w; #endif /* __ASSEMBLY__ */ --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/dma-mapping.h 2011-01-31 17:56:27.000000000 +0100 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/dma-mapping.h 2011-01-31 18:07:35.000000000 +0100 @@ -1,5 +1,17 @@ -#ifdef CONFIG_X86_32 -# include "dma-mapping_32.h" -#else -# include "dma-mapping_64.h" -#endif +#ifndef _ASM_DMA_MAPPING_H_ + +#include_next <asm/dma-mapping.h> + +static inline int +address_needs_mapping(struct device *hwdev, dma_addr_t addr) +{ + dma_addr_t mask = 0xffffffff; + /* If the device has a mask, use it, otherwise default to 32 bits */ + if (hwdev && hwdev->dma_mask) + mask = *hwdev->dma_mask; + return (addr & ~mask) != 0; +} + +extern int range_straddles_page_boundary(paddr_t p, size_t size); + +#endif /* _ASM_DMA_MAPPING_H_ */ --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/fixmap.h 2011-01-31 17:56:27.000000000 +0100 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/fixmap.h 2011-01-31 18:07:35.000000000 +0100 @@ -1,5 +1,13 @@ +#ifndef _ASM_FIXMAP_H +#define _ASM_FIXMAP_H + #ifdef CONFIG_X86_32 # include "fixmap_32.h" #else # include "fixmap_64.h" #endif + +#define clear_fixmap(idx) \ + __set_fixmap(idx, 0, __pgprot(0)) + +#endif --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/fixmap_32.h 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/fixmap_32.h 2011-01-31 18:07:35.000000000 +0100 @@ -10,8 +10,8 @@ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 */ -#ifndef _ASM_FIXMAP_H -#define _ASM_FIXMAP_H +#ifndef _ASM_FIXMAP_32_H +#define _ASM_FIXMAP_32_H /* used by vmalloc.c, vsyscall.lds.S. * @@ -102,8 +102,7 @@ enum fixed_addresses { */ #define NR_FIX_BTMAPS 64 #define FIX_BTMAPS_NESTING 4 - FIX_BTMAP_END = - __end_of_permanent_fixed_addresses + 512 - + FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 512 - (__end_of_permanent_fixed_addresses & 511), FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1, FIX_WP_TEST, @@ -114,19 +113,16 @@ enum fixed_addresses { }; extern void __set_fixmap(enum fixed_addresses idx, - maddr_t phys, pgprot_t flags); + maddr_t phys, pgprot_t flags); extern void reserve_top_address(unsigned long reserve); -#define set_fixmap(idx, phys) \ - __set_fixmap(idx, phys, PAGE_KERNEL) +#define set_fixmap(idx, phys) \ + __set_fixmap(idx, phys, PAGE_KERNEL) /* * Some hardware wants to get fixmapped without caching. 
*/ -#define set_fixmap_nocache(idx, phys) \ - __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE) - -#define clear_fixmap(idx) \ - __set_fixmap(idx, 0, __pgprot(0)) +#define set_fixmap_nocache(idx, phys) \ + __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE) #define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP) @@ -159,7 +155,7 @@ static __always_inline unsigned long fix if (idx >= __end_of_fixed_addresses) __this_fixmap_does_not_exist(); - return __fix_to_virt(idx); + return __fix_to_virt(idx); } static inline unsigned long virt_to_fix(const unsigned long vaddr) --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/fixmap_64.h 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/fixmap_64.h 2011-01-31 18:07:35.000000000 +0100 @@ -8,8 +8,8 @@ * Copyright (C) 1998 Ingo Molnar */ -#ifndef _ASM_FIXMAP_H -#define _ASM_FIXMAP_H +#ifndef _ASM_FIXMAP_64_H +#define _ASM_FIXMAP_64_H #include <linux/kernel.h> #include <asm/apicdef.h> @@ -35,7 +35,8 @@ enum fixed_addresses { VSYSCALL_LAST_PAGE, - VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1, + VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE + + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1, VSYSCALL_HPET, FIX_DBGP_BASE, FIX_EARLYCON_MEM_BASE, @@ -45,11 +46,12 @@ enum fixed_addresses { #endif #ifndef CONFIG_XEN FIX_IO_APIC_BASE_0, - FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1, + FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, #endif #ifdef CONFIG_EFI FIX_EFI_IO_MAP_LAST_PAGE, - FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE+MAX_EFI_IO_PAGES-1, + FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE + + MAX_EFI_IO_PAGES - 1, #endif #ifdef CONFIG_ACPI FIX_ACPI_BEGIN, @@ -79,19 +81,16 @@ enum fixed_addresses { __end_of_fixed_addresses }; -extern void __set_fixmap (enum fixed_addresses idx, - unsigned long phys, pgprot_t flags); +extern void __set_fixmap(enum fixed_addresses idx, + unsigned long phys, pgprot_t flags); -#define set_fixmap(idx, phys) \ - __set_fixmap(idx, phys, PAGE_KERNEL) +#define set_fixmap(idx, phys) \ + __set_fixmap(idx, phys, PAGE_KERNEL) /* * Some hardware wants to get fixmapped without caching. */ -#define set_fixmap_nocache(idx, phys) \ - __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE) - -#define clear_fixmap(idx) \ - __set_fixmap(idx, 0, __pgprot(0)) +#define set_fixmap_nocache(idx, phys) \ + __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE) #define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE) #define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/highmem.h 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/highmem.h 2011-01-31 18:07:35.000000000 +0100 @@ -8,7 +8,7 @@ * Gerhard.Wichert@pdb.siemens.de * * - * Redesigned the x86 32-bit VM architecture to deal with + * Redesigned the x86 32-bit VM architecture to deal with * up to 16 Terabyte physical memory. With current x86 CPUs * we now support up to 64 Gigabytes physical RAM. 
* --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/io.h 2011-01-31 17:56:27.000000000 +0100 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/io.h 2011-01-31 18:07:35.000000000 +0100 @@ -1,5 +1,22 @@ +#ifndef _ASM_X86_IO_H +#define _ASM_X86_IO_H + +#define ARCH_HAS_IOREMAP_WC + #ifdef CONFIG_X86_32 # include "io_32.h" #else # include "io_64.h" #endif + +extern void *xlate_dev_mem_ptr(unsigned long phys); +extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr); + +extern void map_devmem(unsigned long pfn, unsigned long len, pgprot_t); +extern void unmap_devmem(unsigned long pfn, unsigned long len, pgprot_t); + +extern int ioremap_check_change_attr(unsigned long mfn, unsigned long size, + unsigned long prot_val); +extern void __iomem *ioremap_wc(unsigned long offset, unsigned long size); + +#endif /* _ASM_X86_IO_H */ --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/irqflags.h 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/irqflags.h 2011-01-31 18:07:35.000000000 +0100 @@ -139,11 +139,11 @@ sysexit_ecrit: /**** END OF SYSEXIT CRIT #endif /* __ASSEMBLY__ */ #ifndef __ASSEMBLY__ -#define raw_local_save_flags(flags) \ - do { (flags) = __raw_local_save_flags(); } while (0) +#define raw_local_save_flags(flags) \ + do { (flags) = __raw_local_save_flags(); } while (0) -#define raw_local_irq_save(flags) \ - do { (flags) = __raw_local_irq_save(); } while (0) +#define raw_local_irq_save(flags) \ + do { (flags) = __raw_local_irq_save(); } while (0) static inline int raw_irqs_disabled_flags(unsigned long flags) { --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/mmu_context_32.h 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/mmu_context_32.h 2011-01-31 18:07:35.000000000 +0100 @@ -94,7 +94,7 @@ static inline void switch_mm(struct mm_s BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next); if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { - /* We were in lazy tlb mode and leave_mm disabled + /* We were in lazy tlb mode and leave_mm disabled * tlb flush IPI delivery. We must reload %cr3. 
*/ load_cr3(next->pgd); @@ -107,10 +107,10 @@ static inline void switch_mm(struct mm_s #define deactivate_mm(tsk, mm) \ asm("movl %0,%%gs": :"r" (0)); -#define activate_mm(prev, next) \ - do { \ - xen_activate_mm(prev, next); \ - switch_mm((prev),(next),NULL); \ - } while(0) +#define activate_mm(prev, next) \ +do { \ + xen_activate_mm(prev, next); \ + switch_mm((prev), (next), NULL); \ +} while (0) #endif --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/mmu_context_64.h 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/mmu_context_64.h 2011-01-31 18:07:35.000000000 +0100 @@ -21,7 +21,7 @@ void destroy_context(struct mm_struct *m static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { #if defined(CONFIG_SMP) && !defined(CONFIG_XEN) - if (read_pda(mmu_state) == TLBSTATE_OK) + if (read_pda(mmu_state) == TLBSTATE_OK) write_pda(mmu_state, TLBSTATE_LAZY); #endif } @@ -62,7 +62,7 @@ extern void mm_pin(struct mm_struct *mm) extern void mm_unpin(struct mm_struct *mm); void mm_pin_all(void); -static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, +static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { unsigned cpu = smp_processor_id(); @@ -106,7 +106,7 @@ static inline void switch_mm(struct mm_s if (read_pda(active_mm) != next) BUG(); if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { - /* We were in lazy tlb mode and leave_mm disabled + /* We were in lazy tlb mode and leave_mm disabled * tlb flush IPI delivery. We must reload CR3 * to make sure to use no freed page tables. */ @@ -118,10 +118,11 @@ static inline void switch_mm(struct mm_s #endif } -#define deactivate_mm(tsk,mm) do { \ - load_gs_index(0); \ - asm volatile("movl %0,%%fs"::"r"(0)); \ -} while(0) +#define deactivate_mm(tsk, mm) \ +do { \ + load_gs_index(0); \ + asm volatile("movl %0,%%fs"::"r"(0)); \ +} while (0) static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next) { --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/pci.h 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/pci.h 2011-01-31 18:07:35.000000000 +0100 @@ -8,14 +8,13 @@ #include <asm/scatterlist.h> #include <asm/io.h> - #ifdef __KERNEL__ struct pci_sysdata { int domain; /* PCI domain */ int node; /* NUMA node */ #ifdef CONFIG_X86_64 - void* iommu; /* IOMMU private data */ + void *iommu; /* IOMMU private data */ #endif #ifdef CONFIG_XEN_PCIDEV_FRONTEND struct pcifront_device *pdev; @@ -23,6 +22,8 @@ struct pci_sysdata { }; /* scan a bus after allocating a pci_sysdata for it */ +extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, + int node); extern struct pci_bus *pci_scan_bus_with_sysdata(int busno); static inline int pci_domain_nr(struct pci_bus *bus) @@ -36,6 +37,7 @@ static inline int pci_proc_domain(struct return pci_domain_nr(bus); } +extern void pci_iommu_alloc(void); /* Can be used to override the logic in pci_scan_bus for skipping already-configured bus numbers - to be used for buggy BIOSes @@ -57,7 +59,7 @@ extern unsigned long pci_mem_start; #define PCIBIOS_MIN_CARDBUS_IO 0x4000 void pcibios_config_init(void); -struct pci_bus * pcibios_scan_root(int bus); +struct pci_bus *pcibios_scan_root(int bus); void pcibios_set_master(struct pci_dev *dev); void pcibios_penalize_isa_irq(int irq, int active); @@ -67,7 +69,8 @@ int pcibios_set_irq_routing(struct pci_d #define HAVE_PCI_MMAP extern int pci_mmap_page_range(struct pci_dev *dev, struct 
vm_area_struct *vma, - enum pci_mmap_state mmap_state, int write_combine); + enum pci_mmap_state mmap_state, + int write_combine); #ifdef CONFIG_PCI --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/pgalloc.h 2011-01-31 17:56:27.000000000 +0100 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/pgalloc.h 2011-01-31 18:07:35.000000000 +0100 @@ -1,5 +1,149 @@ -#ifdef CONFIG_X86_32 -# include "pgalloc_32.h" -#else -# include "pgalloc_64.h" +#ifndef _ASM_X86_PGALLOC_H +#define _ASM_X86_PGALLOC_H + +#include <linux/threads.h> +#include <linux/mm.h> /* for struct page */ +#include <linux/pagemap.h> + +#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */ + +static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {} +static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {} +static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn, + unsigned long start, unsigned long count) {} +static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {} +static inline void paravirt_release_pte(unsigned long pfn) {} +static inline void paravirt_release_pmd(unsigned long pfn) {} +static inline void paravirt_release_pud(unsigned long pfn) {} + +#ifdef CONFIG_X86_64 +void early_make_page_readonly(void *va, unsigned int feature); +pmd_t *early_get_pmd(unsigned long va); +#define make_lowmem_page_readonly make_page_readonly +#define make_lowmem_page_writable make_page_writable #endif + +/* + * Allocate and free page tables. + */ +extern pgd_t *pgd_alloc(struct mm_struct *); +extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); + +extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); +extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long); + +/* Should really implement gc for free page table pages. This could be + done with a reference count in struct page. 
*/ + +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) +{ + BUG_ON((unsigned long)pte & (PAGE_SIZE-1)); + make_lowmem_page_writable(pte, XENFEAT_writable_page_tables); + free_page((unsigned long)pte); +} + +extern void __pte_free(pgtable_t); +static inline void pte_free(struct mm_struct *mm, struct page *pte) +{ + __pte_free(pte); +} + +extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte); + +static inline void pmd_populate_kernel(struct mm_struct *mm, + pmd_t *pmd, pte_t *pte) +{ + paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT); + set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); +} + +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, + struct page *pte) +{ + unsigned long pfn = page_to_pfn(pte); + + paravirt_alloc_pte(mm, pfn); + if (PagePinned(virt_to_page(mm->pgd))) { + if (!PageHighMem(pte)) + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + pfn_pte(pfn, PAGE_KERNEL_RO), 0)); +#ifndef CONFIG_X86_64 + else if (!TestSetPagePinned(pte)) + kmap_flush_unused(); +#endif + set_pmd(pmd, __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE)); + } else + *pmd = __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE); +} + +#define pmd_pgtable(pmd) pmd_page(pmd) + +#if PAGETABLE_LEVELS > 2 +extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr); +extern void __pmd_free(pgtable_t); + +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) +{ + BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); + __pmd_free(virt_to_page(pmd)); +} + +extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); + +#ifdef CONFIG_X86_PAE +extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd); +#else /* !CONFIG_X86_PAE */ +static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) +{ + paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); + if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) { + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)pmd, + pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT, + PAGE_KERNEL_RO), 0)); + set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd))); + } else + *pud = __pud(_PAGE_TABLE | __pa(pmd)); +} +#endif /* CONFIG_X86_PAE */ + +#if PAGETABLE_LEVELS > 3 +#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD) + +/* + * We need to use the batch mode here, but pgd_populate() won't + * be called frequently.
+ */ +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) +{ + paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT); + if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) { + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)pud, + pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT, + PAGE_KERNEL_RO), 0)); + set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud))); + set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud))); + } else { + *(pgd) = __pgd(_PAGE_TABLE | __pa(pud)); + *__user_pgd(pgd) = *(pgd); + } +} + +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + return (pud_t *)pmd_alloc_one(mm, addr); +} + +static inline void pud_free(struct mm_struct *mm, pud_t *pud) +{ + BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); + __pmd_free(virt_to_page(pud)); +} + +extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud); +#endif /* PAGETABLE_LEVELS > 3 */ +#endif /* PAGETABLE_LEVELS > 2 */ + +#endif /* _ASM_X86_PGALLOC_H */ --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/pgalloc_32.h 2011-01-31 18:01:51.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,111 +0,0 @@ -#ifndef _I386_PGALLOC_H -#define _I386_PGALLOC_H - -#include <linux/threads.h> -#include <linux/mm.h> /* for struct page */ -#include <linux/pagemap.h> -#include <asm/tlb.h> -#include <asm-generic/tlb.h> -#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */ - -#define paravirt_alloc_pt(mm, pfn) do { } while (0) -#define paravirt_alloc_pd(mm, pfn) do { } while (0) -#define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0) -#define paravirt_release_pt(pfn) do { } while (0) -#define paravirt_release_pd(pfn) do { } while (0) - -static inline void pmd_populate_kernel(struct mm_struct *mm, - pmd_t *pmd, pte_t *pte) -{ - paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT); - set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); -} - -static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) -{ - unsigned long pfn = page_to_pfn(pte); - - paravirt_alloc_pt(mm, pfn); - if (PagePinned(virt_to_page(mm->pgd))) { - if (!PageHighMem(pte)) - BUG_ON(HYPERVISOR_update_va_mapping( - (unsigned long)__va(pfn << PAGE_SHIFT), - pfn_pte(pfn, PAGE_KERNEL_RO), 0)); - else if (!test_and_set_bit(PG_pinned, &pte->flags)) - kmap_flush_unused(); - set_pmd(pmd, __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE)); - } else - *pmd = __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE); -} -#define pmd_pgtable(pmd) pmd_page(pmd) - -/* - * Allocate and free page tables. - */ -extern void pgd_test_and_unpin(pgd_t *); -extern pgd_t *pgd_alloc(struct mm_struct *); -extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); - -extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); -extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long); - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - make_lowmem_page_writable(pte, XENFEAT_writable_page_tables); - free_page((unsigned long)pte); -} - -extern void __pte_free(pgtable_t); -static inline void pte_free(struct mm_struct *mm, pgtable_t pte) -{ - __pte_free(pte); -} - - -extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte); - -#ifdef CONFIG_X86_PAE -/* - * In the PAE case we free the pmds as part of the pgd. 
- */ -extern pmd_t *pmd_alloc_one(struct mm_struct *, unsigned long); - -extern void __pmd_free(pgtable_t); -static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) -{ - BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); - __pmd_free(virt_to_page(pmd)); -} - -extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); - -static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) -{ - struct page *page = virt_to_page(pmd); - unsigned long pfn = page_to_pfn(page); - - paravirt_alloc_pd(mm, pfn); - - /* Note: almost everything apart from _PAGE_PRESENT is - reserved at the pmd (PDPT) level. */ - if (PagePinned(virt_to_page(mm->pgd))) { - BUG_ON(PageHighMem(page)); - BUG_ON(HYPERVISOR_update_va_mapping( - (unsigned long)__va(pfn << PAGE_SHIFT), - pfn_pte(pfn, PAGE_KERNEL_RO), 0)); - set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); - } else - *pudp = __pud(__pa(pmd) | _PAGE_PRESENT); - - /* - * According to Intel App note "TLBs, Paging-Structure Caches, - * and Their Invalidation", April 2007, document 317080-001, - * section 8.1: in PAE mode we explicitly have to flush the - * TLB via cr3 if the top-level pgd is changed... - */ - if (mm == current->active_mm) - xen_tlb_flush(); -} -#endif /* CONFIG_X86_PAE */ - -#endif /* _I386_PGALLOC_H */ --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/pgalloc_64.h 2011-01-31 18:01:51.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,182 +0,0 @@ -#ifndef _X86_64_PGALLOC_H -#define _X86_64_PGALLOC_H - -#include <asm/pda.h> -#include <linux/threads.h> -#include <linux/mm.h> -#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */ - -pmd_t *early_get_pmd(unsigned long va); -void early_make_page_readonly(void *va, unsigned int feature); - -#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD) - -#define pmd_populate_kernel(mm, pmd, pte) \ - set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte))) - -static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) -{ - if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) { - BUG_ON(HYPERVISOR_update_va_mapping( - (unsigned long)pmd, - pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT, - PAGE_KERNEL_RO), 0)); - set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd))); - } else { - *(pud) = __pud(_PAGE_TABLE | __pa(pmd)); - } -} - -/* - * We need to use the batch mode here, but pgd_pupulate() won't be - * be called frequently. 
- */ -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) -{ - if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) { - BUG_ON(HYPERVISOR_update_va_mapping( - (unsigned long)pud, - pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT, - PAGE_KERNEL_RO), 0)); - set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud))); - set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud))); - } else { - *(pgd) = __pgd(_PAGE_TABLE | __pa(pud)); - *(__user_pgd(pgd)) = *(pgd); - } -} - -#define pmd_pgtable(pmd) pmd_page(pmd) - -static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) -{ - if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) { - BUG_ON(HYPERVISOR_update_va_mapping( - (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT), - pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0)); - set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT))); - } else { - *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)); - } -} - -extern void __pmd_free(pgtable_t); -static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) -{ - BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); - __pmd_free(virt_to_page(pmd)); -} - -extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr); - -static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) -{ - return (pud_t *)pmd_alloc_one(mm, addr); -} - -static inline void pud_free(struct mm_struct *mm, pud_t *pud) -{ - BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); - __pmd_free(virt_to_page(pud)); -} - -static inline void pgd_list_add(pgd_t *pgd, struct mm_struct *mm) -{ - struct page *page = virt_to_page(pgd); - unsigned long flags; - - /* Store a back link for vmalloc_sync_all(). */ - set_page_private(page, (unsigned long)mm); - - spin_lock_irqsave(&pgd_lock, flags); - list_add(&page->lru, &pgd_list); - spin_unlock_irqrestore(&pgd_lock, flags); -} - -static inline void pgd_list_del(pgd_t *pgd) -{ - struct page *page = virt_to_page(pgd); - unsigned long flags; - - spin_lock_irqsave(&pgd_lock, flags); - list_del(&page->lru); - spin_unlock_irqrestore(&pgd_lock, flags); -} - -extern void pgd_test_and_unpin(pgd_t *); - -static inline pgd_t *pgd_alloc(struct mm_struct *mm) -{ - /* - * We allocate two contiguous pages for kernel and user. - */ - unsigned boundary; - pgd_t *pgd; - - pgd = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 1); - if (!pgd) - return NULL; - pgd_test_and_unpin(pgd); - pgd_list_add(pgd, mm); - /* - * Copy kernel pointers in from init. - * Could keep a freelist or slab cache of those because the kernel - * part never changes. - */ - boundary = pgd_index(__PAGE_OFFSET); - memcpy(pgd + boundary, - init_level4_pgt + boundary, - (PTRS_PER_PGD - boundary) * sizeof(pgd_t)); - - /* - * Set level3_user_pgt for vsyscall area - */ - __user_pgd(pgd)[pgd_index(VSYSCALL_START)] = - __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE); - return pgd; -} - -static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) -{ - pgd_list_del(pgd); - pgd_test_and_unpin(pgd); - free_pages((unsigned long)pgd, 1); -} - -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) -{ - pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); - if (pte) - make_page_readonly(pte, XENFEAT_writable_page_tables); - - return pte; -} - -extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr); - -/* Should really implement gc for free page table pages. This could be - done with a reference count in struct page. 
*/ - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - BUG_ON((unsigned long)pte & (PAGE_SIZE-1)); - make_page_writable(pte, XENFEAT_writable_page_tables); - free_page((unsigned long)pte); -} - -extern void __pte_free(pgtable_t); -static inline void pte_free(struct mm_struct *mm, pgtable_t pte) -{ - __pte_free(pte); -} - -#define __pte_free_tlb(tlb,pte) \ -do { \ - pgtable_page_dtor((pte)); \ - tlb_remove_page((tlb), (pte)); \ -} while (0) - -#define __pmd_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x)) -#define __pud_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x)) - -#endif /* _X86_64_PGALLOC_H */ --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/pgtable.h 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/pgtable.h 2011-01-31 18:07:35.000000000 +0100 @@ -1,17 +1,15 @@ #ifndef _ASM_X86_PGTABLE_H #define _ASM_X86_PGTABLE_H -#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1) #define FIRST_USER_ADDRESS 0 -#define _PAGE_BIT_PRESENT 0 -#define _PAGE_BIT_RW 1 -#define _PAGE_BIT_USER 2 -#define _PAGE_BIT_PWT 3 -#define _PAGE_BIT_PCD 4 -#define _PAGE_BIT_ACCESSED 5 -#define _PAGE_BIT_DIRTY 6 -#define _PAGE_BIT_FILE 6 +#define _PAGE_BIT_PRESENT 0 /* is present */ +#define _PAGE_BIT_RW 1 /* writeable */ +#define _PAGE_BIT_USER 2 /* userspace addressable */ +#define _PAGE_BIT_PWT 3 /* page write through */ +#define _PAGE_BIT_PCD 4 /* page cache disabled */ +#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */ +#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */ #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ #define _PAGE_BIT_PAT 7 /* on 4KB pages */ #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ @@ -22,6 +20,14 @@ #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ +/* If _PAGE_BIT_PRESENT is clear, we use these: */ + +/* set: nonlinear file mapping, saved PTE; unset:swap */ +#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY + +/* if the user mapped it with PROT_NONE; pte_present gives true */ +#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL + /* * Note: we use _AC(1, L) instead of _AC(1, UL) so that we get a * sign-extended value on 32-bit with all 1's in the upper word, @@ -48,10 +54,8 @@ #define _PAGE_NX 0 #endif -/* If _PAGE_PRESENT is clear, we use these: */ -#define _PAGE_FILE _PAGE_DIRTY /* nonlinear file mapping, saved PTE; unset:swap */ -#define _PAGE_PROTNONE _PAGE_PSE /* if the user mapped it with PROT_NONE; - pte_present gives true */ +#define _PAGE_FILE (_AC(1, L)<<_PAGE_BIT_FILE) +#define _PAGE_PROTNONE (_AC(1, L)<<_PAGE_BIT_PROTNONE) #ifndef __ASSEMBLY__ #if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002 @@ -61,20 +65,42 @@ extern unsigned int __kernel_page_user; #endif #endif -#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY) -#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user) +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_DIRTY) +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ + _PAGE_DIRTY | __kernel_page_user) + +/* Set of bits not changed in pte_modify */ +#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_CACHE_MASK | _PAGE_IO | \ + _PAGE_ACCESSED | _PAGE_DIRTY) -#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO) +/* + * PAT settings are part of the hypervisor interface, which sets the + * MSR to 0x050100070406 (i.e. 
WB, WT, UC-, UC, WC, WP [, UC, UC]). + */ +#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT | _PAGE_PAT) +#define _PAGE_CACHE_WB (0) +#define _PAGE_CACHE_WT (_PAGE_PWT) +#define _PAGE_CACHE_WC (_PAGE_PAT) +#define _PAGE_CACHE_WP (_PAGE_PAT | _PAGE_PWT) +#define _PAGE_CACHE_UC_MINUS (_PAGE_PCD) +#define _PAGE_CACHE_UC (_PAGE_PCD | _PAGE_PWT) #define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) -#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) +#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_NX) -#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED) -#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) -#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) +#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \ + _PAGE_USER | _PAGE_ACCESSED) +#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_NX) +#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ + _PAGE_ACCESSED) #define PAGE_COPY PAGE_COPY_NOEXEC -#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) -#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) +#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_NX) +#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ + _PAGE_ACCESSED) #ifdef CONFIG_X86_32 #define _PAGE_KERNEL_EXEC \ @@ -93,6 +119,7 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KE #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW) #define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW) #define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT) +#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC) #define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT) #define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD) #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) @@ -109,6 +136,7 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KE #define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO) #define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC) #define PAGE_KERNEL_RX MAKE_GLOBAL(__PAGE_KERNEL_RX) +#define PAGE_KERNEL_WC MAKE_GLOBAL(__PAGE_KERNEL_WC) #define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE) #define PAGE_KERNEL_UC_MINUS MAKE_GLOBAL(__PAGE_KERNEL_UC_MINUS) #define PAGE_KERNEL_EXEC_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE) @@ -142,7 +170,7 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KE * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. */ -extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)]; +extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) extern spinlock_t pgd_lock; @@ -152,30 +180,111 @@ extern struct list_head pgd_list; * The following only work if pte_present() is true. * Undefined behaviour if not.. 
*/ -static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; } -static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; } -static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; } -static inline int pte_file(pte_t pte) { return __pte_val(pte) & _PAGE_FILE; } -static inline int pte_huge(pte_t pte) { return __pte_val(pte) & _PAGE_PSE; } -static inline int pte_global(pte_t pte) { return 0; } -static inline int pte_exec(pte_t pte) { return !(__pte_val(pte) & _PAGE_NX); } - -static inline int pmd_large(pmd_t pte) { - return (__pmd_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) == - (_PAGE_PSE|_PAGE_PRESENT); -} - -static inline pte_t pte_mkclean(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY); } -static inline pte_t pte_mkold(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED); } -static inline pte_t pte_wrprotect(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW); } -static inline pte_t pte_mkexec(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX); } -static inline pte_t pte_mkdirty(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_DIRTY); } -static inline pte_t pte_mkyoung(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_ACCESSED); } -static inline pte_t pte_mkwrite(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_RW); } -static inline pte_t pte_mkhuge(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_PSE); } -static inline pte_t pte_clrhuge(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE); } -static inline pte_t pte_mkglobal(pte_t pte) { return pte; } -static inline pte_t pte_clrglobal(pte_t pte) { return pte; } +static inline int pte_dirty(pte_t pte) +{ + return __pte_val(pte) & _PAGE_DIRTY; +} + +static inline int pte_young(pte_t pte) +{ + return __pte_val(pte) & _PAGE_ACCESSED; +} + +static inline int pte_write(pte_t pte) +{ + return __pte_val(pte) & _PAGE_RW; +} + +static inline int pte_file(pte_t pte) +{ + return __pte_val(pte) & _PAGE_FILE; +} + +static inline int pte_huge(pte_t pte) +{ + return __pte_val(pte) & _PAGE_PSE; +} + +static inline int pte_global(pte_t pte) +{ + return 0; +} + +static inline int pte_exec(pte_t pte) +{ + return !(__pte_val(pte) & _PAGE_NX); +} + +static inline int pte_special(pte_t pte) +{ + return 0; +} + +static inline int pmd_large(pmd_t pte) +{ + return (__pmd_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) == + (_PAGE_PSE | _PAGE_PRESENT); +} + +static inline pte_t pte_mkclean(pte_t pte) +{ + return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY); +} + +static inline pte_t pte_mkold(pte_t pte) +{ + return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED); +} + +static inline pte_t pte_wrprotect(pte_t pte) +{ + return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW); +} + +static inline pte_t pte_mkexec(pte_t pte) +{ + return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX); +} + +static inline pte_t pte_mkdirty(pte_t pte) +{ + return __pte_ma(__pte_val(pte) | _PAGE_DIRTY); +} + +static inline pte_t pte_mkyoung(pte_t pte) +{ + return __pte_ma(__pte_val(pte) | _PAGE_ACCESSED); +} + +static inline pte_t pte_mkwrite(pte_t pte) +{ + return __pte_ma(__pte_val(pte) | _PAGE_RW); +} + +static inline pte_t pte_mkhuge(pte_t pte) +{ + return __pte_ma(__pte_val(pte) | _PAGE_PSE); +} + +static inline pte_t pte_clrhuge(pte_t pte) +{ + return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE); +} + +static inline pte_t pte_mkglobal(pte_t pte) +{ + return pte; +} + +static inline pte_t pte_clrglobal(pte_t pte) +{ + return 
pte; +} + +static inline pte_t pte_mkspecial(pte_t pte) +{ + return pte; +} extern pteval_t __supported_pte_mask; @@ -202,15 +311,33 @@ static inline pte_t pte_modify(pte_t pte pteval_t val = pte_val(pte); val &= _PAGE_CHG_MASK; - val |= pgprot_val(newprot) & __supported_pte_mask; + val |= pgprot_val(newprot) & (~_PAGE_CHG_MASK) & __supported_pte_mask; return __pte(val); } -#define pte_pgprot(x) __pgprot(pte_val(x) & (0xfff | _PAGE_NX)) +/* mprotect needs to preserve PAT bits when updating vm_page_prot */ +#define pgprot_modify pgprot_modify +static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) +{ + pgprotval_t preservebits = pgprot_val(oldprot) & _PAGE_CHG_MASK; + pgprotval_t addbits = pgprot_val(newprot); + return __pgprot(preservebits | addbits); +} + +#define pte_pgprot(x) __pgprot(__pte_val(x) & ~PTE_MASK) #define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask) +#ifndef __ASSEMBLY__ +#define __HAVE_PHYS_MEM_ACCESS_PROT +struct file; +pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t vma_prot); +int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t *vma_prot); +#endif + #define set_pte(ptep, pte) xen_set_pte(ptep, pte) #define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte) @@ -246,6 +373,9 @@ static inline pte_t pte_modify(pte_t pte # include "pgtable_64.h" #endif +#define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET) +#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY) + #ifndef __ASSEMBLY__ enum { @@ -312,46 +442,17 @@ static inline void xen_pte_clear(struct * bit at the same time. */ #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS -#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \ -({ \ - int __changed = !pte_same(*(ptep), entry); \ - if (__changed && (dirty)) { \ - if ( likely((vma)->vm_mm == current->mm) ) { \ - BUG_ON(HYPERVISOR_update_va_mapping(address, \ - entry, \ - uvm_multi((vma)->vm_mm->cpu_vm_mask) | \ - UVMF_INVLPG)); \ - } else { \ - xen_l1_entry_update(ptep, entry); \ - flush_tlb_page(vma, address); \ - } \ - } \ - __changed; \ -}) +extern int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty); #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -#define ptep_test_and_clear_young(vma, addr, ptep) ({ \ - int __ret = 0; \ - if (pte_young(*(ptep))) \ - __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, \ - &(ptep)->pte); \ - if (__ret) \ - pte_update((vma)->vm_mm, addr, ptep); \ - __ret; \ -}) +extern int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -#define ptep_clear_flush_young(vma, address, ptep) \ -({ \ - pte_t __pte = *(ptep); \ - int __young = pte_young(__pte); \ - __pte = pte_mkold(__pte); \ - if (PagePinned(virt_to_page((vma)->vm_mm->pgd))) \ - (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \ - else if (__young) \ - (ptep)->pte_low = __pte.pte_low; \ - __young; \ -}) +extern int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep); #define __HAVE_ARCH_PTEP_CLEAR_FLUSH #define ptep_clear_flush(vma, addr, ptep) \ @@ -370,7 +471,8 @@ static inline void xen_pte_clear(struct }) #define __HAVE_ARCH_PTEP_GET_AND_CLEAR -static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) { 
pte_t pte = *ptep;
	if (!pte_none(pte)
@@ -398,13 +500,29 @@ static inline pte_t ptep_get_and_clear(s
 pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *, unsigned long, pte_t *, int);
 #define __HAVE_ARCH_PTEP_SET_WRPROTECT
-static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+static inline void ptep_set_wrprotect(struct mm_struct *mm,
+				      unsigned long addr, pte_t *ptep)
 {
	pte_t pte = *ptep;
	if (pte_write(pte))
		set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
 }
+/*
+ * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
+ *
+ * dst - pointer to pgd range anywhere on a pgd page
+ * src - ""
+ * count - the number of pgds to copy.
+ *
+ * dst and src can be on the same page, but the range must not overlap,
+ * and must not cross a page boundary.
+ */
+static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
+{
+	memcpy(dst, src, count * sizeof(pgd_t));
+}
+
 #define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
	xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/pgtable-3level.h	2011-01-31 18:01:51.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/pgtable-3level.h	2011-01-31 18:07:35.000000000 +0100
@@ -8,25 +8,28 @@
 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
 */
-#define pte_ERROR(e) \
-	printk("%s:%d: bad pte %p(%016Lx pfn %08lx).\n", __FILE__, __LINE__, \
-	       &(e), __pte_val(e), pte_pfn(e))
-#define pmd_ERROR(e) \
-	printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \
-	       &(e), __pmd_val(e), (pmd_val(e) & PTE_MASK) >> PAGE_SHIFT)
-#define pgd_ERROR(e) \
-	printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \
-	       &(e), __pgd_val(e), (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
-
+#define pte_ERROR(e) \
+	printk("%s:%d: bad pte %p(%016Lx pfn %08lx).\n", \
+	       __FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e))
+#define pmd_ERROR(e) \
+	printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", \
+	       __FILE__, __LINE__, &(e), __pmd_val(e), \
+	       (pmd_val(e) & PTE_MASK) >> PAGE_SHIFT)
+#define pgd_ERROR(e) \
+	printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", \
+	       __FILE__, __LINE__, &(e), __pgd_val(e), \
+	       (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
 static inline int pud_none(pud_t pud)
 {
	return __pud_val(pud) == 0;
+
 }
 static inline int pud_bad(pud_t pud)
 {
	return (__pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
 }
+
 static inline int pud_present(pud_t pud)
 {
	return __pud_val(pud) & _PAGE_PRESENT;
@@ -48,12 +51,14 @@ static inline void xen_set_pte(pte_t *pt
 static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
 {
-	set_64bit((unsigned long long *)(ptep),__pte_val(pte));
+	set_64bit((unsigned long long *)(ptep), __pte_val(pte));
 }
+
 static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd)
 {
	xen_l2_entry_update(pmdp, pmd);
 }
+
 static inline void xen_set_pud(pud_t *pudp, pud_t pud)
 {
	xen_l3_entry_update(pudp, pud);
@@ -92,20 +97,19 @@ static inline void pud_clear(pud_t *pudp
 * current pgd to avoid unnecessary TLB flushes.
*/ pgd = read_cr3(); - if (__pa(pudp) >= pgd && __pa(pudp) < (pgd + sizeof(pgd_t)*PTRS_PER_PGD)) + if (__pa(pudp) >= pgd && __pa(pudp) < + (pgd + sizeof(pgd_t)*PTRS_PER_PGD)) xen_tlb_flush(); } -#define pud_page(pud) \ -((struct page *) __va(pud_val(pud) & PAGE_MASK)) +#define pud_page(pud) ((struct page *) __va(pud_val(pud) & PTE_MASK)) -#define pud_page_vaddr(pud) \ -((unsigned long) __va(pud_val(pud) & PAGE_MASK)) +#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_MASK)) /* Find an entry in the second-level page table.. */ -#define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \ - pmd_index(address)) +#define pmd_offset(pud, address) ((pmd_t *)pud_page(*(pud)) + \ + pmd_index(address)) #ifdef CONFIG_SMP static inline pte_t xen_ptep_get_and_clear(pte_t *ptep, pte_t res) @@ -150,7 +154,8 @@ static inline int pte_none(pte_t pte) * put the 32 bits of offset into the high part. */ #define pte_to_pgoff(pte) ((pte).pte_high) -#define pgoff_to_pte(off) ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } }) +#define pgoff_to_pte(off) \ + ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } }) #define PTE_FILE_MAX_BITS 32 /* Encode and de-code a swap entry */ --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/pgtable_32.h 2011-02-07 15:38:52.000000000 +0100 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/pgtable_32.h 2011-01-31 18:07:35.000000000 +0100 @@ -38,16 +38,13 @@ void paging_init(void); #ifdef CONFIG_X86_PAE # include <asm/pgtable-3level-defs.h> # define PMD_SIZE (1UL << PMD_SHIFT) -# define PMD_MASK (~(PMD_SIZE-1)) +# define PMD_MASK (~(PMD_SIZE - 1)) #else # include <asm/pgtable-2level-defs.h> #endif #define PGDIR_SIZE (1UL << PGDIR_SHIFT) -#define PGDIR_MASK (~(PGDIR_SIZE-1)) - -#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT) -#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS) +#define PGDIR_MASK (~(PGDIR_SIZE - 1)) /* Just any arbitrary offset to the start of the vmalloc VM area: the * current 8MB value just means that there will be a 8MB "hole" after the @@ -56,21 +53,22 @@ void paging_init(void); * The vmalloc() routines leaves a hole of 4kB between each vmalloced * area for the same reason. ;) */ -#define VMALLOC_OFFSET (8*1024*1024) -#define VMALLOC_START (((unsigned long) high_memory + \ - 2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1)) +#define VMALLOC_OFFSET (8 * 1024 * 1024) +#define VMALLOC_START (((unsigned long)high_memory + 2 * VMALLOC_OFFSET - 1) \ + & ~(VMALLOC_OFFSET - 1)) #ifdef CONFIG_X86_PAE #define LAST_PKMAP 512 #else #define LAST_PKMAP 1024 #endif -#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK) +#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE * (LAST_PKMAP + 1)) \ + & PMD_MASK) #ifdef CONFIG_HIGHMEM -# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE) +# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE) #else -# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE) +# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE) #endif /* @@ -91,10 +89,10 @@ extern unsigned long pg0[]; /* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t. can temporarily clear it. 
*/ #define pmd_present(x) (__pmd_val(x)) -#define pmd_bad(x) ((__pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT)) +#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT)) #else #define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT) -#define pmd_bad(x) ((__pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) +#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) #endif @@ -107,32 +105,18 @@ extern unsigned long pg0[]; #endif /* - * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); - * - * dst - pointer to pgd range anwhere on a pgd page - * src - "" - * count - the number of pgds to copy. - * - * dst and src can be on the same page, but the range must not overlap, - * and must not cross a page boundary. + * Macro to mark a page protection value as "uncacheable". + * On processors which do not support it, this is a no-op. */ -static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) -{ - memcpy(dst, src, count * sizeof(pgd_t)); -} - -/* - * Macro to mark a page protection value as "uncacheable". On processors which do not support - * it, this is a no-op. - */ -#define pgprot_noncached(prot) ((boot_cpu_data.x86 > 3) \ - ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) : (prot)) +#define pgprot_noncached(prot) \ + ((boot_cpu_data.x86 > 3) \ + ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) \ + : (prot)) /* * Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. */ - #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) /* @@ -141,20 +125,20 @@ static inline void clone_pgd_range(pgd_t * this macro returns the index of the entry in the pgd page which would * control the given virtual address */ -#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) -#define pgd_index_k(addr) pgd_index(addr) +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)) +#define pgd_index_k(addr) pgd_index((addr)) /* * pgd_offset() returns a (pgd_t *) * pgd_index() is used get the offset into the pgd page's array of pgd_t's; */ -#define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address)) +#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address))) /* * a shortcut which implies the use of the kernel's pgd, instead * of a process's */ -#define pgd_offset_k(address) pgd_offset(&init_mm, address) +#define pgd_offset_k(address) pgd_offset(&init_mm, (address)) static inline int pud_large(pud_t pud) { return 0; } @@ -164,8 +148,8 @@ static inline int pud_large(pud_t pud) { * this macro returns the index of the entry in the pmd page which would * control the given virtual address */ -#define pmd_index(address) \ - (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) +#define pmd_index(address) \ + (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)) /* * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE] @@ -173,33 +157,36 @@ static inline int pud_large(pud_t pud) { * this macro returns the index of the entry in the pte page which would * control the given virtual address */ -#define pte_index(address) \ - (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) -#define pte_offset_kernel(dir, address) \ - ((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(address)) +#define pte_index(address) \ + (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +#define pte_offset_kernel(dir, address) \ + ((pte_t *)pmd_page_vaddr(*(dir)) + 
pte_index((address))) -#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) +#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT)) -#define pmd_page_vaddr(pmd) \ - ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK)) +#define pmd_page_vaddr(pmd) \ + ((unsigned long)__va(pmd_val((pmd)) & PTE_MASK)) #if defined(CONFIG_HIGHPTE) -#define pte_offset_map(dir, address) \ - ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + pte_index(address)) -#define pte_offset_map_nested(dir, address) \ - ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE1) + pte_index(address)) -#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0) -#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1) -#else -#define pte_offset_map(dir, address) \ - ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address)) -#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address) +#define pte_offset_map(dir, address) \ + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE0) + \ + pte_index((address))) +#define pte_offset_map_nested(dir, address) \ + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \ + pte_index((address))) +#define pte_unmap(pte) kunmap_atomic((pte), KM_PTE0) +#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1) +#else +#define pte_offset_map(dir, address) \ + ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address))) +#define pte_offset_map_nested(dir, address) pte_offset_map((dir), (address)) #define pte_unmap(pte) do { } while (0) #define pte_unmap_nested(pte) do { } while (0) #endif /* Clear a kernel PTE and flush it from the TLB */ -#define kpte_clear_flush(ptep, vaddr) do { \ +#define kpte_clear_flush(ptep, vaddr) \ +do { \ if (HYPERVISOR_update_va_mapping(vaddr, __pte(0), UVMF_INVLPG)) \ BUG(); \ } while (0) @@ -208,7 +195,7 @@ static inline int pud_large(pud_t pud) { * The i386 doesn't have any external MMU info: the kernel page * tables contain all the necessary information. 
*/ -#define update_mmu_cache(vma,address,pte) do { } while (0) +#define update_mmu_cache(vma, address, pte) do { } while (0) void make_lowmem_page_readonly(void *va, unsigned int feature); void make_lowmem_page_writable(void *va, unsigned int feature); @@ -225,7 +212,7 @@ void make_lowmem_page_writable(void *va, #define kern_addr_valid(kaddr) (0) #endif -#define io_remap_pfn_range(vma,from,pfn,size,prot) \ -direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO) +#define io_remap_pfn_range(vma, from, pfn, size, prot) \ + direct_remap_pfn_range(vma, from, pfn, size, prot, DOMID_IO) #endif /* _I386_PGTABLE_H */ --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/pgtable_64.h 2011-02-07 15:38:47.000000000 +0100 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/pgtable_64.h 2011-01-31 18:07:35.000000000 +0100 @@ -31,7 +31,7 @@ extern void paging_init(void); #endif /* !__ASSEMBLY__ */ -#define SHARED_KERNEL_PMD 1 +#define SHARED_KERNEL_PMD 0 /* * PGDIR_SHIFT determines what a top-level page table entry can map @@ -59,18 +59,20 @@ extern void paging_init(void); #ifndef __ASSEMBLY__ -#define pte_ERROR(e) \ - printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \ - &(e), __pte_val(e), pte_pfn(e)) -#define pmd_ERROR(e) \ - printk("%s:%d: bad pmd %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \ - &(e), __pmd_val(e), pmd_pfn(e)) -#define pud_ERROR(e) \ - printk("%s:%d: bad pud %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \ - &(e), __pud_val(e), (pud_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT) -#define pgd_ERROR(e) \ - printk("%s:%d: bad pgd %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \ - &(e), __pgd_val(e), (pgd_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT) +#define pte_ERROR(e) \ + printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", \ + __FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e)) +#define pmd_ERROR(e) \ + printk("%s:%d: bad pmd %p(%016lx pfn %010lx).\n", \ + __FILE__, __LINE__, &(e), __pmd_val(e), pmd_pfn(e)) +#define pud_ERROR(e) \ + printk("%s:%d: bad pud %p(%016lx pfn %010lx).\n", \ + __FILE__, __LINE__, &(e), __pud_val(e), \ + (pud_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT) +#define pgd_ERROR(e) \ + printk("%s:%d: bad pgd %p(%016lx pfn %010lx).\n", \ + __FILE__, __LINE__, &(e), __pgd_val(e), \ + (pgd_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT) #define pgd_none(x) (!__pgd_val(x)) #define pud_none(x) (!__pud_val(x)) @@ -125,7 +127,7 @@ static inline void xen_set_pgd(pgd_t *pg xen_l4_entry_update(pgdp, pgd); } -static inline void xen_pgd_clear(pgd_t * pgd) +static inline void xen_pgd_clear(pgd_t *pgd) { xen_set_pgd(pgd, xen_make_pgd(0)); xen_set_pgd(__user_pgd(pgd), xen_make_pgd(0)); @@ -135,43 +137,43 @@ static inline void xen_pgd_clear(pgd_t * #endif /* !__ASSEMBLY__ */ -#define PMD_SIZE (_AC(1,UL) << PMD_SHIFT) -#define PMD_MASK (~(PMD_SIZE-1)) -#define PUD_SIZE (_AC(1,UL) << PUD_SHIFT) -#define PUD_MASK (~(PUD_SIZE-1)) -#define PGDIR_SIZE (_AC(1,UL) << PGDIR_SHIFT) -#define PGDIR_MASK (~(PGDIR_SIZE-1)) +#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT) +#define PMD_MASK (~(PMD_SIZE - 1)) +#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT) +#define PUD_MASK (~(PUD_SIZE - 1)) +#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) +#define PGDIR_MASK (~(PGDIR_SIZE - 1)) -#define MAXMEM _AC(0x6fffffffff, UL) +#define MAXMEM _AC(0x0000006fffffffff, UL) #define VMALLOC_START _AC(0xffffc20000000000, UL) #define VMALLOC_END _AC(0xffffe1ffffffffff, UL) #define VMEMMAP_START _AC(0xffffe20000000000, UL) -#define MODULES_VADDR _AC(0xffffffff88000000, UL) +#define MODULES_VADDR _AC(0xffffffffa0000000, UL) 
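The MODULES_VADDR bump above (0xffffffff88000000 to 0xffffffffa0000000, with MODULES_END left at 0xffffffffff000000) gives up roughly 384 MB of module address space; a minimal stand-alone sketch of the resulting MODULES_LEN values, with the constants restated from the defines in this hunk (assumes an LP64 host so unsigned long is 64 bits):

#include <stdio.h>

int main(void)
{
	/* Values copied from the MODULES_VADDR/MODULES_END defines above. */
	unsigned long modules_end = 0xffffffffff000000UL;
	unsigned long old_vaddr   = 0xffffffff88000000UL;
	unsigned long new_vaddr   = 0xffffffffa0000000UL;

	/* Module mapping space before and after the move. */
	printf("old MODULES_LEN: %lu MiB\n", (modules_end - old_vaddr) >> 20); /* 1904 */
	printf("new MODULES_LEN: %lu MiB\n", (modules_end - new_vaddr) >> 20); /* 1520 */
	return 0;
}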
#define MODULES_END _AC(0xffffffffff000000, UL) #define MODULES_LEN (MODULES_END - MODULES_VADDR) #ifndef __ASSEMBLY__ -static inline unsigned long pgd_bad(pgd_t pgd) +static inline int pgd_bad(pgd_t pgd) { - return __pgd_val(pgd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER); + return (__pgd_val(pgd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE; } -static inline unsigned long pud_bad(pud_t pud) +static inline int pud_bad(pud_t pud) { - return __pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER); + return (__pud_val(pud) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE; } -static inline unsigned long pmd_bad(pmd_t pmd) +static inline int pmd_bad(pmd_t pmd) { - return __pmd_val(pmd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER); + return (__pmd_val(pmd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE; } #define pte_none(x) (!(x).pte) #define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE)) -#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) /* FIXME: is this right? */ +#define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */ #define __pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT) #define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \ @@ -181,13 +183,13 @@ static inline unsigned long pmd_bad(pmd_ mfn_to_local_pfn(__pte_mfn(_pte)) : \ __pte_mfn(_pte)) -#define pte_page(x) pfn_to_page(pte_pfn(x)) +#define pte_page(x) pfn_to_page(pte_pfn((x))) /* * Macro to mark a page protection value as "uncacheable". */ -#define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) - +#define pgprot_noncached(prot) \ + (__pgprot(pgprot_val((prot)) | _PAGE_PCD | _PAGE_PWT)) /* * Conversion functions: convert a page and protection to a page entry, @@ -197,36 +199,39 @@ static inline unsigned long pmd_bad(pmd_ /* * Level 4 access. */ -#define pgd_page_vaddr(pgd) ((unsigned long) __va(pgd_val(pgd) & PTE_MASK)) -#define pgd_page(pgd) (pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT)) -#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) -#define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr)) -#define pgd_offset_k(address) (init_level4_pgt + pgd_index(address)) +#define pgd_page_vaddr(pgd) \ + ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_MASK)) +#define pgd_page(pgd) (pfn_to_page(pgd_val((pgd)) >> PAGE_SHIFT)) +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)) +#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address))) +#define pgd_offset_k(address) (init_level4_pgt + pgd_index((address))) #define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT) static inline int pgd_large(pgd_t pgd) { return 0; } #define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE) /* PUD - Level3 access */ /* to find an entry in a page-table-directory. 
*/ -#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK)) -#define pud_page(pud) (pfn_to_page(pud_val(pud) >> PAGE_SHIFT)) -#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) -#define pud_offset(pgd, address) ((pud_t *) pgd_page_vaddr(*(pgd)) + pud_index(address)) +#define pud_page_vaddr(pud) \ + ((unsigned long)__va(pud_val((pud)) & PHYSICAL_PAGE_MASK)) +#define pud_page(pud) (pfn_to_page(pud_val((pud)) >> PAGE_SHIFT)) +#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)) +#define pud_offset(pgd, address) \ + ((pud_t *)pgd_page_vaddr(*(pgd)) + pud_index((address))) #define pud_present(pud) (__pud_val(pud) & _PAGE_PRESENT) static inline int pud_large(pud_t pte) { - return (__pud_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) == - (_PAGE_PSE|_PAGE_PRESENT); + return (__pud_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) == + (_PAGE_PSE | _PAGE_PRESENT); } /* PMD - Level 2 access */ -#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK)) -#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) +#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_MASK)) +#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT)) -#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) -#define pmd_offset(dir, address) ((pmd_t *) pud_page_vaddr(*(dir)) + \ - pmd_index(address)) +#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)) +#define pmd_offset(dir, address) ((pmd_t *)pud_page_vaddr(*(dir)) + \ + pmd_index(address)) #define pmd_none(x) (!__pmd_val(x)) #if CONFIG_XEN_COMPAT <= 0x030002 /* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t. @@ -235,43 +240,56 @@ static inline int pud_large(pud_t pte) #else #define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT) #endif -#define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot))) -#define pmd_pfn(x) ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT) +#define pfn_pmd(nr, prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val((prot)))) +#define pmd_pfn(x) ((pmd_val((x)) & __PHYSICAL_MASK) >> PAGE_SHIFT) #define pte_to_pgoff(pte) ((__pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT) -#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | _PAGE_FILE }) +#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | \ + _PAGE_FILE }) #define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT /* PTE - Level 1 access. */ /* page, protection -> pte */ -#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) - -#define pte_index(address) \ - (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +#define mk_pte(page, pgprot) pfn_pte(page_to_pfn((page)), (pgprot)) + +#define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) #define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \ - pte_index(address)) + pte_index((address))) /* x86-64 always has all page tables mapped. 
*/ -#define pte_offset_map(dir,address) pte_offset_kernel(dir,address) -#define pte_offset_map_nested(dir,address) pte_offset_kernel(dir,address) +#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) +#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address)) #define pte_unmap(pte) /* NOP */ -#define pte_unmap_nested(pte) /* NOP */ +#define pte_unmap_nested(pte) /* NOP */ + +#define update_mmu_cache(vma, address, pte) do { } while (0) -#define update_mmu_cache(vma,address,pte) do { } while (0) +#define direct_gbpages 0 /* Encode and de-code a swap entry */ -#define __swp_type(x) (((x).val >> 1) & 0x3f) -#define __swp_offset(x) ((x).val >> 8) -#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) }) +#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE +#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) +#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) +#else +#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1) +#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1) +#endif + +#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \ + & ((1U << SWP_TYPE_BITS) - 1)) +#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT) +#define __swp_entry(type, offset) ((swp_entry_t) { \ + ((type) << (_PAGE_BIT_PRESENT + 1)) \ + | ((offset) << SWP_OFFSET_SHIFT) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { __pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) -extern int kern_addr_valid(unsigned long addr); +extern int kern_addr_valid(unsigned long addr); extern void cleanup_highmap(void); -#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ - direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO) +#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ + direct_remap_pfn_range(vma, vaddr, pfn, size, prot, DOMID_IO) #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN @@ -284,8 +302,10 @@ extern void cleanup_highmap(void); /* fs/proc/kcore.c */ #define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK) -#define kc_offset_to_vaddr(o) \ - (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o)) +#define kc_offset_to_vaddr(o) \ + (((o) & (1UL << (__VIRTUAL_MASK_SHIFT - 1))) \ + ? 
((o) | ~__VIRTUAL_MASK) \ + : (o)) #define __HAVE_ARCH_PTE_SAME #endif /* !__ASSEMBLY__ */ --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/processor.h 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/processor.h 2011-03-03 16:42:13.000000000 +0100 @@ -3,10 +3,6 @@ #include <asm/processor-flags.h> -/* migration helpers, for KVM - will be removed in 2.6.25: */ -#include <asm/vm86.h> -#define Xgt_desc_struct desc_ptr - /* Forward declaration, a strange C thing */ struct task_struct; struct mm_struct; @@ -24,6 +20,7 @@ struct mm_struct; #include <asm/msr.h> #include <asm/desc_defs.h> #include <asm/nops.h> + #include <linux/personality.h> #include <linux/cpumask.h> #include <linux/cache.h> @@ -38,16 +35,18 @@ struct mm_struct; static inline void *current_text_addr(void) { void *pc; - asm volatile("mov $1f,%0\n1:":"=r" (pc)); + + asm volatile("mov $1f, %0; 1:":"=r" (pc)); + return pc; } #ifdef CONFIG_X86_VSMP -#define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT) -#define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT) +# define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT) +# define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT) #else -#define ARCH_MIN_TASKALIGN 16 -#define ARCH_MIN_MMSTRUCT_ALIGN 0 +# define ARCH_MIN_TASKALIGN 16 +# define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif /* @@ -57,74 +56,90 @@ static inline void *current_text_addr(vo */ struct cpuinfo_x86 { - __u8 x86; /* CPU family */ - __u8 x86_vendor; /* CPU vendor */ - __u8 x86_model; - __u8 x86_mask; + __u8 x86; /* CPU family */ + __u8 x86_vendor; /* CPU vendor */ + __u8 x86_model; + __u8 x86_mask; #ifdef CONFIG_X86_32 - char wp_works_ok; /* It doesn't on 386's */ - char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */ - char hard_math; - char rfu; - char fdiv_bug; - char f00f_bug; - char coma_bug; - char pad0; + char wp_works_ok; /* It doesn't on 386's */ + + /* Problems on some 486Dx4's and old 386's: */ +#ifndef CONFIG_XEN + char hlt_works_ok; +#endif + char hard_math; +#ifndef CONFIG_XEN + char rfu; + char fdiv_bug; + char f00f_bug; +#endif + char coma_bug; + char pad0; #else - /* number of 4K pages in DTLB/ITLB combined(in pages)*/ - int x86_tlbsize; - __u8 x86_virt_bits, x86_phys_bits; + /* Number of 4K pages in DTLB/ITLB combined(in pages): */ + int x86_tlbsize; + __u8 x86_virt_bits; + __u8 x86_phys_bits; #ifndef CONFIG_XEN - /* cpuid returned core id bits */ - __u8 x86_coreid_bits; + /* CPUID returned core id bits: */ + __u8 x86_coreid_bits; #endif - /* Max extended CPUID function supported */ - __u32 extended_cpuid_level; + /* Max extended CPUID function supported: */ + __u32 extended_cpuid_level; #endif - int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */ - __u32 x86_capability[NCAPINTS]; - char x86_vendor_id[16]; - char x86_model_id[64]; - int x86_cache_size; /* in KB - valid for CPUS which support this - call */ - int x86_cache_alignment; /* In bytes */ - int x86_power; - unsigned long loops_per_jiffy; + /* Maximum supported CPUID level, -1=no CPUID: */ + int cpuid_level; + __u32 x86_capability[NCAPINTS]; + char x86_vendor_id[16]; + char x86_model_id[64]; + /* in KB - valid for CPUS which support this call: */ + int x86_cache_size; + int x86_cache_alignment; /* In bytes */ + int x86_power; + unsigned long loops_per_jiffy; #ifndef CONFIG_XEN #ifdef CONFIG_SMP - cpumask_t llc_shared_map; /* cpus sharing the last level cache */ + /* cpus sharing the last level cache: */ + cpumask_t llc_shared_map; #endif - u16 x86_max_cores; /* cpuid returned 
max cores value */ - u16 apicid; + /* cpuid returned max cores value: */ + u16 x86_max_cores; + u16 apicid; + u16 initial_apicid; #endif - u16 x86_clflush_size; + u16 x86_clflush_size; #ifdef CONFIG_X86_HT - u16 booted_cores; /* number of cores as seen by OS */ - u16 phys_proc_id; /* Physical processor id. */ - u16 cpu_core_id; /* Core id */ + /* number of cores as seen by the OS: */ + u16 booted_cores; + /* Physical processor id: */ + u16 phys_proc_id; + /* Core id: */ + u16 cpu_core_id; #endif #ifdef CONFIG_SMP - u16 cpu_index; /* index into per_cpu list */ + /* Index into per_cpu list: */ + u16 cpu_index; #endif } __attribute__((__aligned__(SMP_CACHE_BYTES))); -#define X86_VENDOR_INTEL 0 -#define X86_VENDOR_CYRIX 1 -#define X86_VENDOR_AMD 2 -#define X86_VENDOR_UMC 3 -#define X86_VENDOR_NEXGEN 4 -#define X86_VENDOR_CENTAUR 5 -#define X86_VENDOR_TRANSMETA 7 -#define X86_VENDOR_NSC 8 -#define X86_VENDOR_NUM 9 -#define X86_VENDOR_UNKNOWN 0xff +#define X86_VENDOR_INTEL 0 +#define X86_VENDOR_CYRIX 1 +#define X86_VENDOR_AMD 2 +#define X86_VENDOR_UMC 3 +#define X86_VENDOR_CENTAUR 5 +#define X86_VENDOR_TRANSMETA 7 +#define X86_VENDOR_NSC 8 +#define X86_VENDOR_NUM 9 + +#define X86_VENDOR_UNKNOWN 0xff /* * capabilities of CPUs */ -extern struct cpuinfo_x86 boot_cpu_data; -extern struct cpuinfo_x86 new_cpu_data; -extern __u32 cleared_cpu_caps[NCAPINTS]; +extern struct cpuinfo_x86 boot_cpu_data; +extern struct cpuinfo_x86 new_cpu_data; + +extern __u32 cleared_cpu_caps[NCAPINTS]; #ifdef CONFIG_SMP DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info); @@ -135,7 +150,18 @@ DECLARE_PER_CPU(struct cpuinfo_x86, cpu_ #define current_cpu_data boot_cpu_data #endif -void cpu_detect(struct cpuinfo_x86 *c); +static inline int hlt_works(int cpu) +{ +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN) + return cpu_data(cpu).hlt_works_ok; +#else + return 1; +#endif +} + +#define cache_line_size() (boot_cpu_data.x86_cache_alignment) + +extern void cpu_detect(struct cpuinfo_x86 *c); extern void identify_cpu(struct cpuinfo_x86 *); extern void identify_boot_cpu(void); @@ -155,12 +181,12 @@ static inline void xen_cpuid(unsigned in unsigned int *ecx, unsigned int *edx) { /* ecx is often an input as well as an output. */ - __asm__(XEN_CPUID - : "=a" (*eax), - "=b" (*ebx), - "=c" (*ecx), - "=d" (*edx) - : "0" (*eax), "2" (*ecx)); + asm(XEN_CPUID + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx)); } static inline void load_cr3(pgd_t *pgdir) @@ -172,57 +198,70 @@ static inline void load_cr3(pgd_t *pgdir #ifdef CONFIG_X86_32 /* This is the TSS defined by the hardware. 
*/ struct x86_hw_tss { - unsigned short back_link, __blh; - unsigned long sp0; - unsigned short ss0, __ss0h; - unsigned long sp1; - unsigned short ss1, __ss1h; /* ss1 caches MSR_IA32_SYSENTER_CS */ - unsigned long sp2; - unsigned short ss2, __ss2h; - unsigned long __cr3; - unsigned long ip; - unsigned long flags; - unsigned long ax, cx, dx, bx; - unsigned long sp, bp, si, di; - unsigned short es, __esh; - unsigned short cs, __csh; - unsigned short ss, __ssh; - unsigned short ds, __dsh; - unsigned short fs, __fsh; - unsigned short gs, __gsh; - unsigned short ldt, __ldth; - unsigned short trace, io_bitmap_base; + unsigned short back_link, __blh; + unsigned long sp0; + unsigned short ss0, __ss0h; + unsigned long sp1; + /* ss1 caches MSR_IA32_SYSENTER_CS: */ + unsigned short ss1, __ss1h; + unsigned long sp2; + unsigned short ss2, __ss2h; + unsigned long __cr3; + unsigned long ip; + unsigned long flags; + unsigned long ax; + unsigned long cx; + unsigned long dx; + unsigned long bx; + unsigned long sp; + unsigned long bp; + unsigned long si; + unsigned long di; + unsigned short es, __esh; + unsigned short cs, __csh; + unsigned short ss, __ssh; + unsigned short ds, __dsh; + unsigned short fs, __fsh; + unsigned short gs, __gsh; + unsigned short ldt, __ldth; + unsigned short trace; + unsigned short io_bitmap_base; + } __attribute__((packed)); extern struct tss_struct doublefault_tss; #else struct x86_hw_tss { - u32 reserved1; - u64 sp0; - u64 sp1; - u64 sp2; - u64 reserved2; - u64 ist[7]; - u32 reserved3; - u32 reserved4; - u16 reserved5; - u16 io_bitmap_base; + u32 reserved1; + u64 sp0; + u64 sp1; + u64 sp2; + u64 reserved2; + u64 ist[7]; + u32 reserved3; + u32 reserved4; + u16 reserved5; + u16 io_bitmap_base; + } __attribute__((packed)) ____cacheline_aligned; #endif #endif /* CONFIG_X86_NO_TSS */ /* - * Size of io_bitmap. + * IO-bitmap sizes: */ -#define IO_BITMAP_BITS 65536 -#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8) -#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long)) -#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap) -#define INVALID_IO_BITMAP_OFFSET 0x8000 -#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000 +#define IO_BITMAP_BITS 65536 +#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8) +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long)) +#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap) +#define INVALID_IO_BITMAP_OFFSET 0x8000 +#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000 #ifndef CONFIG_X86_NO_TSS struct tss_struct { - struct x86_hw_tss x86_tss; + /* + * The hardware state: + */ + struct x86_hw_tss x86_tss; /* * The extra 1 is there because the CPU will access an @@ -230,135 +269,161 @@ struct tss_struct { * bitmap. The extra byte must be all 1 bits, and must * be within the limit. */ - unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; + unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; /* * Cache the current maximum and the last task that used the bitmap: */ - unsigned long io_bitmap_max; - struct thread_struct *io_bitmap_owner; + unsigned long io_bitmap_max; + struct thread_struct *io_bitmap_owner; + /* - * pads the TSS to be cacheline-aligned (size is 0x100) + * Pad the TSS to be cacheline-aligned (size is 0x100): */ - unsigned long __cacheline_filler[35]; + unsigned long __cacheline_filler[35]; /* - * .. and then another 0x100 bytes for emergency kernel stack + * .. 
and then another 0x100 bytes for the emergency kernel stack: */ - unsigned long stack[64]; + unsigned long stack[64]; + } __attribute__((packed)); DECLARE_PER_CPU(struct tss_struct, init_tss); -/* Save the original ist values for checking stack pointers during debugging */ +/* + * Save the original ist values for checking stack pointers during debugging + */ struct orig_ist { - unsigned long ist[7]; + unsigned long ist[7]; }; #endif /* CONFIG_X86_NO_TSS */ #define MXCSR_DEFAULT 0x1f80 struct i387_fsave_struct { - u32 cwd; - u32 swd; - u32 twd; - u32 fip; - u32 fcs; - u32 foo; - u32 fos; - u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ - u32 status; /* software status information */ + u32 cwd; /* FPU Control Word */ + u32 swd; /* FPU Status Word */ + u32 twd; /* FPU Tag Word */ + u32 fip; /* FPU IP Offset */ + u32 fcs; /* FPU IP Selector */ + u32 foo; /* FPU Operand Pointer Offset */ + u32 fos; /* FPU Operand Pointer Selector */ + + /* 8*10 bytes for each FP-reg = 80 bytes: */ + u32 st_space[20]; + + /* Software status information [not touched by FSAVE ]: */ + u32 status; }; struct i387_fxsave_struct { - u16 cwd; - u16 swd; - u16 twd; - u16 fop; + u16 cwd; /* Control Word */ + u16 swd; /* Status Word */ + u16 twd; /* Tag Word */ + u16 fop; /* Last Instruction Opcode */ union { struct { - u64 rip; - u64 rdp; + u64 rip; /* Instruction Pointer */ + u64 rdp; /* Data Pointer */ }; struct { - u32 fip; - u32 fcs; - u32 foo; - u32 fos; + u32 fip; /* FPU IP Offset */ + u32 fcs; /* FPU IP Selector */ + u32 foo; /* FPU Operand Offset */ + u32 fos; /* FPU Operand Selector */ }; }; - u32 mxcsr; - u32 mxcsr_mask; - u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ - u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ - u32 padding[24]; + u32 mxcsr; /* MXCSR Register State */ + u32 mxcsr_mask; /* MXCSR Mask */ + + /* 8*16 bytes for each FP-reg = 128 bytes: */ + u32 st_space[32]; + + /* 16*16 bytes for each XMM-reg = 256 bytes: */ + u32 xmm_space[64]; + + u32 padding[24]; + } __attribute__((aligned(16))); struct i387_soft_struct { - u32 cwd; - u32 swd; - u32 twd; - u32 fip; - u32 fcs; - u32 foo; - u32 fos; - u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ - u8 ftop, changed, lookahead, no_update, rm, alimit; - struct info *info; - u32 entry_eip; + u32 cwd; + u32 swd; + u32 twd; + u32 fip; + u32 fcs; + u32 foo; + u32 fos; + /* 8*10 bytes for each FP-reg = 80 bytes: */ + u32 st_space[20]; + u8 ftop; + u8 changed; + u8 lookahead; + u8 no_update; + u8 rm; + u8 alimit; + struct info *info; + u32 entry_eip; }; -union i387_union { +union thread_xstate { struct i387_fsave_struct fsave; struct i387_fxsave_struct fxsave; - struct i387_soft_struct soft; + struct i387_soft_struct soft; }; -#ifdef CONFIG_X86_32 -DECLARE_PER_CPU(u8, cpu_llc_id); -#elif !defined(CONFIG_X86_NO_TSS) +#if defined(CONFIG_X86_64) && !defined(CONFIG_X86_NO_TSS) DECLARE_PER_CPU(struct orig_ist, orig_ist); #endif extern void print_cpu_info(struct cpuinfo_x86 *); +extern unsigned int xstate_size; +extern void free_thread_xstate(struct task_struct *); +extern struct kmem_cache *task_xstate_cachep; extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); extern unsigned short num_cache_leaves; struct thread_struct { -/* cached TLS descriptors. 
*/ - struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; - unsigned long sp0; - unsigned long sp; + /* Cached TLS descriptors: */ + struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; + unsigned long sp0; + unsigned long sp; #ifdef CONFIG_X86_32 - unsigned long sysenter_cs; + unsigned long sysenter_cs; #else - unsigned short es, ds, fsindex, gsindex; -#endif - unsigned long ip; - unsigned long fs; - unsigned long gs; -/* Hardware debugging registers */ - unsigned long debugreg0; - unsigned long debugreg1; - unsigned long debugreg2; - unsigned long debugreg3; - unsigned long debugreg6; - unsigned long debugreg7; -/* fault info */ - unsigned long cr2, trap_no, error_code; -/* floating point info */ - union i387_union i387 __attribute__((aligned(16)));; + unsigned short es; + unsigned short ds; + unsigned short fsindex; + unsigned short gsindex; +#endif + unsigned long ip; + unsigned long fs; + unsigned long gs; + /* Hardware debugging registers: */ + unsigned long debugreg0; + unsigned long debugreg1; + unsigned long debugreg2; + unsigned long debugreg3; + unsigned long debugreg6; + unsigned long debugreg7; + /* Fault info: */ + unsigned long cr2; + unsigned long trap_no; + unsigned long error_code; + /* floating point and extended processor state */ + union thread_xstate *xstate; #ifdef CONFIG_X86_32 -/* virtual 86 mode info */ + /* Virtual 86 mode info */ struct vm86_struct __user *vm86_info; unsigned long screen_bitmap; unsigned long v86flags, v86mask, saved_sp0; unsigned int saved_fs, saved_gs; #endif -/* IO permissions */ - unsigned long *io_bitmap_ptr; - unsigned long iopl; -/* max allowed port in the bitmap, in bytes: */ - unsigned io_bitmap_max; + /* IO permissions: */ + unsigned long *io_bitmap_ptr; + unsigned long iopl; + /* Max allowed port in the bitmap, in bytes: */ + unsigned io_bitmap_max; /* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */ unsigned long debugctlmsr; /* Debug Store - if not 0 points to a DS Save Area configuration; @@ -389,12 +454,12 @@ static inline void xen_set_iopl_mask(uns } #ifndef CONFIG_X86_NO_TSS -static inline void native_load_sp0(struct tss_struct *tss, - struct thread_struct *thread) +static inline void +native_load_sp0(struct tss_struct *tss, struct thread_struct *thread) { tss->x86_tss.sp0 = thread->sp0; #ifdef CONFIG_X86_32 - /* Only happens when SEP is enabled, no need to test "SEP"arately */ + /* Only happens when SEP is enabled, no need to test "SEP"arately: */ if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { tss->x86_tss.ss1 = thread->sysenter_cs; wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); @@ -408,8 +473,8 @@ static inline void native_load_sp0(struc } while (0) #endif -#define __cpuid xen_cpuid -#define paravirt_enabled() 1 +#define __cpuid xen_cpuid +#define paravirt_enabled() 1 /* * These special macros can be used to get or set a debugging register @@ -429,11 +494,12 @@ static inline void native_load_sp0(struc * enable), so that any CPU's that boot up * after us can get the correct flags. 
*/ -extern unsigned long mmu_cr4_features; +extern unsigned long mmu_cr4_features; static inline void set_in_cr4(unsigned long mask) { unsigned cr4; + mmu_cr4_features |= mask; cr4 = read_cr4(); cr4 |= mask; @@ -443,6 +509,7 @@ static inline void set_in_cr4(unsigned l static inline void clear_in_cr4(unsigned long mask) { unsigned cr4; + mmu_cr4_features &= ~mask; cr4 = read_cr4(); cr4 &= ~mask; @@ -450,42 +517,42 @@ static inline void clear_in_cr4(unsigned } struct microcode_header { - unsigned int hdrver; - unsigned int rev; - unsigned int date; - unsigned int sig; - unsigned int cksum; - unsigned int ldrver; - unsigned int pf; - unsigned int datasize; - unsigned int totalsize; - unsigned int reserved[3]; + unsigned int hdrver; + unsigned int rev; + unsigned int date; + unsigned int sig; + unsigned int cksum; + unsigned int ldrver; + unsigned int pf; + unsigned int datasize; + unsigned int totalsize; + unsigned int reserved[3]; }; struct microcode { - struct microcode_header hdr; - unsigned int bits[0]; + struct microcode_header hdr; + unsigned int bits[0]; }; -typedef struct microcode microcode_t; -typedef struct microcode_header microcode_header_t; +typedef struct microcode microcode_t; +typedef struct microcode_header microcode_header_t; /* microcode format is extended from prescott processors */ struct extended_signature { - unsigned int sig; - unsigned int pf; - unsigned int cksum; + unsigned int sig; + unsigned int pf; + unsigned int cksum; }; struct extended_sigtable { - unsigned int count; - unsigned int cksum; - unsigned int reserved[3]; + unsigned int count; + unsigned int cksum; + unsigned int reserved[3]; struct extended_signature sigs[0]; }; typedef struct { - unsigned long seg; + unsigned long seg; } mm_segment_t; @@ -497,7 +564,7 @@ extern int kernel_thread(int (*fn)(void /* Free all resources held by a thread. */ extern void release_thread(struct task_struct *); -/* Prepare to copy thread state - unlazy all lazy status */ +/* Prepare to copy thread state - unlazy all lazy state */ extern void prepare_to_copy(struct task_struct *tsk); unsigned long get_wchan(struct task_struct *p); @@ -534,118 +601,138 @@ static inline unsigned int cpuid_eax(uns unsigned int eax, ebx, ecx, edx; cpuid(op, &eax, &ebx, &ecx, &edx); + return eax; } + static inline unsigned int cpuid_ebx(unsigned int op) { unsigned int eax, ebx, ecx, edx; cpuid(op, &eax, &ebx, &ecx, &edx); + return ebx; } + static inline unsigned int cpuid_ecx(unsigned int op) { unsigned int eax, ebx, ecx, edx; cpuid(op, &eax, &ebx, &ecx, &edx); + return ecx; } + static inline unsigned int cpuid_edx(unsigned int op) { unsigned int eax, ebx, ecx, edx; cpuid(op, &eax, &ebx, &ecx, &edx); + return edx; } /* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. 
*/ static inline void rep_nop(void) { - __asm__ __volatile__("rep;nop": : :"memory"); + asm volatile("rep; nop" ::: "memory"); +} + +static inline void cpu_relax(void) +{ + rep_nop(); } -/* Stop speculative execution */ +/* Stop speculative execution: */ static inline void sync_core(void) { int tmp; + asm volatile("cpuid" : "=a" (tmp) : "0" (1) - : "ebx", "ecx", "edx", "memory"); + : "ebx", "ecx", "edx", "memory"); } -#define cpu_relax() rep_nop() - static inline void __monitor(const void *eax, unsigned long ecx, - unsigned long edx) + unsigned long edx) { - /* "monitor %eax,%ecx,%edx;" */ - asm volatile( - ".byte 0x0f,0x01,0xc8;" - : :"a" (eax), "c" (ecx), "d"(edx)); + /* "monitor %eax, %ecx, %edx;" */ + asm volatile(".byte 0x0f, 0x01, 0xc8;" + :: "a" (eax), "c" (ecx), "d"(edx)); } static inline void __mwait(unsigned long eax, unsigned long ecx) { - /* "mwait %eax,%ecx;" */ - asm volatile( - ".byte 0x0f,0x01,0xc9;" - : :"a" (eax), "c" (ecx)); + /* "mwait %eax, %ecx;" */ + asm volatile(".byte 0x0f, 0x01, 0xc9;" + :: "a" (eax), "c" (ecx)); } static inline void __sti_mwait(unsigned long eax, unsigned long ecx) { - /* "mwait %eax,%ecx;" */ - asm volatile( - "sti; .byte 0x0f,0x01,0xc9;" - : :"a" (eax), "c" (ecx)); + trace_hardirqs_on(); + /* "mwait %eax, %ecx;" */ + asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" + :: "a" (eax), "c" (ecx)); } extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx); -extern int force_mwait; +extern int force_mwait; extern void select_idle_routine(const struct cpuinfo_x86 *c); -extern unsigned long boot_option_idle_override; +extern unsigned long boot_option_idle_override; extern void enable_sep_cpu(void); extern int sysenter_setup(void); /* Defined in head.S */ -extern struct desc_ptr early_gdt_descr; +extern struct desc_ptr early_gdt_descr; extern void cpu_set_gdt(int); extern void switch_to_new_gdt(void); extern void cpu_init(void); extern void init_gdt(int cpu); -/* from system description table in BIOS. Mostly for MCA use, but - * others may find it useful. */ -extern unsigned int machine_id; -extern unsigned int machine_submodel_id; -extern unsigned int BIOS_revision; +static inline void update_debugctlmsr(unsigned long debugctlmsr) +{ +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return; +#endif + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); +} -/* Boot loader type from the setup header */ -extern int bootloader_type; +/* + * from system description table in BIOS. Mostly for MCA use, but + * others may find it useful: + */ +extern unsigned int machine_id; +extern unsigned int machine_submodel_id; +extern unsigned int BIOS_revision; -extern char ignore_fpu_irq; -#define cache_line_size() (boot_cpu_data.x86_cache_alignment) +/* Boot loader type from the setup header: */ +extern int bootloader_type; + +extern char ignore_fpu_irq; #define HAVE_ARCH_PICK_MMAP_LAYOUT 1 #define ARCH_HAS_PREFETCHW #define ARCH_HAS_SPINLOCK_PREFETCH #ifdef CONFIG_X86_32 -#define BASE_PREFETCH ASM_NOP4 -#define ARCH_HAS_PREFETCH +# define BASE_PREFETCH ASM_NOP4 +# define ARCH_HAS_PREFETCH #else -#define BASE_PREFETCH "prefetcht0 (%1)" +# define BASE_PREFETCH "prefetcht0 (%1)" #endif -/* Prefetch instructions for Pentium III and AMD Athlon */ -/* It's not worth to care about 3dnow! prefetches for the K6 - because they are microcoded there and very slow. - However we don't do prefetches for pre XP Athlons currently - That should be fixed. 
*/ +/* + * Prefetch instructions for Pentium III (+) and AMD Athlon (+) + * + * It's not worth to care about 3dnow prefetches for the K6 + * because they are microcoded there and very slow. + */ static inline void prefetch(const void *x) { alternative_input(BASE_PREFETCH, @@ -654,8 +741,11 @@ static inline void prefetch(const void * "r" (x)); } -/* 3dnow! prefetch to get an exclusive cache line. Useful for - spinlocks to avoid one state transition in the cache coherency protocol. */ +/* + * 3dnow prefetch to get an exclusive cache line. + * Useful for spinlocks to avoid one state transition in the + * cache coherency protocol: + */ static inline void prefetchw(const void *x) { alternative_input(BASE_PREFETCH, @@ -664,21 +754,25 @@ static inline void prefetchw(const void "r" (x)); } -#define spin_lock_prefetch(x) prefetchw(x) +static inline void spin_lock_prefetch(const void *x) +{ + prefetchw(x); +} + #ifdef CONFIG_X86_32 /* * User space process size: 3GB (default). */ -#define TASK_SIZE (PAGE_OFFSET) -#define STACK_TOP TASK_SIZE -#define STACK_TOP_MAX STACK_TOP - -#define INIT_THREAD { \ - .sp0 = sizeof(init_stack) + (long)&init_stack, \ - .vm86_info = NULL, \ - .sysenter_cs = __KERNEL_CS, \ - .io_bitmap_ptr = NULL, \ - .fs = __KERNEL_PERCPU, \ +#define TASK_SIZE PAGE_OFFSET +#define STACK_TOP TASK_SIZE +#define STACK_TOP_MAX STACK_TOP + +#define INIT_THREAD { \ + .sp0 = sizeof(init_stack) + (long)&init_stack, \ + .vm86_info = NULL, \ + .sysenter_cs = __KERNEL_CS, \ + .io_bitmap_ptr = NULL, \ + .fs = __KERNEL_PERCPU, \ } /* @@ -687,28 +781,15 @@ static inline void prefetchw(const void * permission bitmap. The extra byte must be all 1 bits, and must * be within the limit. */ -#define INIT_TSS { \ - .x86_tss = { \ +#define INIT_TSS { \ + .x86_tss = { \ .sp0 = sizeof(init_stack) + (long)&init_stack, \ - .ss0 = __KERNEL_DS, \ - .ss1 = __KERNEL_CS, \ - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \ - }, \ - .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \ -} - -#define start_thread(regs, new_eip, new_esp) do { \ - __asm__("movl %0,%%gs": :"r" (0)); \ - regs->fs = 0; \ - set_fs(USER_DS); \ - regs->ds = __USER_DS; \ - regs->es = __USER_DS; \ - regs->ss = __USER_DS; \ - regs->cs = __USER_CS; \ - regs->ip = new_eip; \ - regs->sp = new_esp; \ -} while (0) - + .ss0 = __KERNEL_DS, \ + .ss1 = __KERNEL_CS, \ + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \ + }, \ + .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \ +} extern unsigned long thread_saved_pc(struct task_struct *tsk); @@ -740,18 +821,18 @@ extern unsigned long thread_saved_pc(str /* * User space process size. 47bits minus one guard page. */ -#define TASK_SIZE64 (0x800000000000UL - 4096) +#define TASK_SIZE64 ((1UL << 47) - PAGE_SIZE) /* This decides where the kernel will search for a free chunk of vm * space during mmap's. */ -#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \ - 0xc0000000 : 0xFFFFe000) +#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \ + 0xc0000000 : 0xFFFFe000) -#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \ - IA32_PAGE_OFFSET : TASK_SIZE64) -#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \ - IA32_PAGE_OFFSET : TASK_SIZE64) +#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \ + IA32_PAGE_OFFSET : TASK_SIZE64) +#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? 
\ + IA32_PAGE_OFFSET : TASK_SIZE64) #define STACK_TOP TASK_SIZE #define STACK_TOP_MAX TASK_SIZE64 @@ -764,32 +845,32 @@ extern unsigned long thread_saved_pc(str .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ } -#define start_thread(regs, new_rip, new_rsp) do { \ - asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \ - load_gs_index(0); \ - (regs)->ip = (new_rip); \ - (regs)->sp = (new_rsp); \ - (regs)->cs = __USER_CS; \ - (regs)->ss = __USER_DS; \ - (regs)->flags = 0x200; \ - set_fs(USER_DS); \ -} while (0) - /* * Return saved PC of a blocked thread. * What is this good for? it will be always the scheduler or ret_from_fork. */ -#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8)) +#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8)) -#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) +#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) #endif /* CONFIG_X86_64 */ -/* This decides where the kernel will search for a free chunk of vm +extern void start_thread(struct pt_regs *regs, unsigned long new_ip, + unsigned long new_sp); + +/* + * This decides where the kernel will search for a free chunk of vm * space during mmap's. */ #define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) -#define KSTK_EIP(task) (task_pt_regs(task)->ip) -#define KSTK_ESP(task) (task_pt_regs(task)->sp) +#define KSTK_EIP(task) (task_pt_regs(task)->ip) +#define KSTK_ESP(task) (task_pt_regs(task)->sp) + +/* Get/set a process' ability to use the timestamp counter instruction */ +#define GET_TSC_CTL(adr) get_tsc_mode((adr)) +#define SET_TSC_CTL(val) set_tsc_mode((val)) + +extern int get_tsc_mode(unsigned long adr); +extern int set_tsc_mode(unsigned int val); #endif --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/smp.h 2011-01-31 17:56:27.000000000 +0100 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/smp.h 2011-01-31 18:07:35.000000000 +0100 @@ -1,5 +1,225 @@ -#ifdef CONFIG_X86_32 -# include "smp_32.h" +#ifndef _ASM_X86_SMP_H_ +#define _ASM_X86_SMP_H_ +#ifndef __ASSEMBLY__ +#include <linux/cpumask.h> +#include <linux/init.h> +#include <asm/percpu.h> + +/* + * We need the APIC definitions automatically as part of 'smp.h' + */ +#ifdef CONFIG_X86_LOCAL_APIC +# include <asm/mpspec.h> +# include <asm/apic.h> +# ifdef CONFIG_X86_IO_APIC +# include <asm/io_apic.h> +# endif +#endif +#include <asm/pda.h> +#include <asm/thread_info.h> + +#define cpu_callout_map cpu_possible_map +extern cpumask_t cpu_initialized; +#define cpu_callin_map cpu_possible_map + +extern void (*mtrr_hook)(void); +extern void zap_low_mappings(void); + +extern unsigned int num_processors; +extern cpumask_t cpu_initialized; + +#ifndef CONFIG_XEN +#ifdef CONFIG_SMP +extern u16 x86_cpu_to_apicid_init[]; +extern u16 x86_bios_cpu_apicid_init[]; +extern void *x86_cpu_to_apicid_early_ptr; +extern void *x86_bios_cpu_apicid_early_ptr; #else -# include "smp_64.h" +#define x86_cpu_to_apicid_early_ptr NULL +#define x86_bios_cpu_apicid_early_ptr NULL +#endif + +DECLARE_PER_CPU(cpumask_t, cpu_sibling_map); +DECLARE_PER_CPU(cpumask_t, cpu_core_map); +DECLARE_PER_CPU(u16, cpu_llc_id); +DECLARE_PER_CPU(u16, x86_cpu_to_apicid); +DECLARE_PER_CPU(u16, x86_bios_cpu_apicid); +#endif + +#ifdef CONFIG_SMP + +#ifndef CONFIG_XEN + +/* Static state in head.S used to set up a CPU */ +extern struct { + void *sp; + unsigned short ss; +} stack_start; + +struct smp_ops { + void (*smp_prepare_boot_cpu)(void); + void (*smp_prepare_cpus)(unsigned max_cpus); + int (*cpu_up)(unsigned cpu); + 
void (*smp_cpus_done)(unsigned max_cpus); + + void (*smp_send_stop)(void); + void (*smp_send_reschedule)(int cpu); + int (*smp_call_function_mask)(cpumask_t mask, + void (*func)(void *info), void *info, + int wait); +}; + +/* Globals due to paravirt */ +extern void set_cpu_sibling_map(int cpu); + +extern struct smp_ops smp_ops; + +static inline void smp_send_stop(void) +{ + smp_ops.smp_send_stop(); +} + +static inline void smp_prepare_boot_cpu(void) +{ + smp_ops.smp_prepare_boot_cpu(); +} + +static inline void smp_prepare_cpus(unsigned int max_cpus) +{ + smp_ops.smp_prepare_cpus(max_cpus); +} + +static inline void smp_cpus_done(unsigned int max_cpus) +{ + smp_ops.smp_cpus_done(max_cpus); +} + +static inline int __cpu_up(unsigned int cpu) +{ + return smp_ops.cpu_up(cpu); +} + +static inline void smp_send_reschedule(int cpu) +{ + smp_ops.smp_send_reschedule(cpu); +} + +static inline int smp_call_function_mask(cpumask_t mask, + void (*func) (void *info), void *info, + int wait) +{ + return smp_ops.smp_call_function_mask(mask, func, info, wait); +} + +void native_smp_prepare_boot_cpu(void); +void native_smp_prepare_cpus(unsigned int max_cpus); +void native_smp_cpus_done(unsigned int max_cpus); +int native_cpu_up(unsigned int cpunum); + +#else /* CONFIG_XEN */ + +void xen_smp_send_stop(void); +void xen_smp_send_reschedule(int cpu); +int xen_smp_call_function_mask(cpumask_t mask, + void (*func) (void *info), void *info, + int wait); + +#define smp_send_stop xen_smp_send_stop +#define smp_send_reschedule xen_smp_send_reschedule +#define smp_call_function_mask xen_smp_call_function_mask + +extern void prefill_possible_map(void); + +#endif /* CONFIG_XEN */ + +extern int __cpu_disable(void); +extern void __cpu_die(unsigned int cpu); + +extern void prefill_possible_map(void); + +void smp_store_cpu_info(int id); +#define cpu_physical_id(cpu) (cpu) + +/* We don't mark CPUs online until __cpu_up(), so we need another measure */ +static inline int num_booting_cpus(void) +{ + return cpus_weight(cpu_callout_map); +} +#endif /* CONFIG_SMP */ + +extern unsigned disabled_cpus __cpuinitdata; + +#ifdef CONFIG_X86_32_SMP +/* + * This function is needed by all SMP systems. It must _always_ be valid + * from the initial startup. We map APIC_BASE very early in page_setup(), + * so this is correct in the x86 case. 
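+ *
+ * With the per-CPU cpu_number declared below, raw_smp_processor_id()
+ * then boils down to a single %fs-relative load, e.g. (illustrative):
+ *
+ *	int cpu = raw_smp_processor_id();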
+ */ +DECLARE_PER_CPU(int, cpu_number); +#define raw_smp_processor_id() (x86_read_percpu(cpu_number)) +#define safe_smp_processor_id() smp_processor_id() + +#elif defined(CONFIG_X86_64_SMP) +#define raw_smp_processor_id() read_pda(cpunumber) + +#define stack_smp_processor_id() \ +({ \ + struct thread_info *ti; \ + __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \ + ti->cpu; \ +}) +#define safe_smp_processor_id() smp_processor_id() + +#else /* !CONFIG_X86_32_SMP && !CONFIG_X86_64_SMP */ +#define cpu_physical_id(cpu) 0 +#define safe_smp_processor_id() 0 +#define stack_smp_processor_id() 0 +#endif + +#ifdef CONFIG_X86_LOCAL_APIC + +static inline int logical_smp_processor_id(void) +{ + /* we don't want to mark this access volatile - bad code generation */ + return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR)); +} + +#ifndef CONFIG_X86_64 +static inline unsigned int read_apic_id(void) +{ + return *(u32 *)(APIC_BASE + APIC_ID); +} +#else +extern unsigned int read_apic_id(void); +#endif + + +# ifdef APIC_DEFINITION +extern int hard_smp_processor_id(void); +# else +# include <mach_apicdef.h> +static inline int hard_smp_processor_id(void) +{ + /* we don't want to mark this access volatile - bad code generation */ + return GET_APIC_ID(read_apic_id()); +} +# endif /* APIC_DEFINITION */ + +#else /* CONFIG_X86_LOCAL_APIC */ + +# ifndef CONFIG_SMP +# define hard_smp_processor_id() 0 +# endif + +#endif /* CONFIG_X86_LOCAL_APIC */ + +#ifdef CONFIG_HOTPLUG_CPU +extern void cpu_exit_clear(void); +extern void cpu_uninit(void); +#endif + +extern void smp_alloc_memory(void); +extern void lock_ipi_call_lock(void); +extern void unlock_ipi_call_lock(void); +#endif /* __ASSEMBLY__ */ #endif --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/smp_32.h 2011-01-31 18:01:51.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,173 +0,0 @@ -#ifndef __ASM_SMP_H -#define __ASM_SMP_H - -#ifndef __ASSEMBLY__ -#include <linux/cpumask.h> -#include <linux/init.h> - -/* - * We need the APIC definitions automatically as part of 'smp.h' - */ -#ifdef CONFIG_X86_LOCAL_APIC -# include <asm/mpspec.h> -# include <asm/apic.h> -# ifdef CONFIG_X86_IO_APIC -# include <asm/io_apic.h> -# endif -#endif - -#define cpu_callout_map cpu_possible_map -#define cpu_callin_map cpu_possible_map - -extern unsigned int num_processors; - -extern void smp_alloc_memory(void); -extern void lock_ipi_call_lock(void); -extern void unlock_ipi_call_lock(void); - -extern void (*mtrr_hook) (void); -extern void zap_low_mappings (void); - -#ifndef CONFIG_XEN -DECLARE_PER_CPU(u8, cpu_llc_id); -DECLARE_PER_CPU(u8, x86_cpu_to_apicid); -#endif - -#ifdef CONFIG_HOTPLUG_CPU -extern void cpu_exit_clear(void); -extern void cpu_uninit(void); -#endif - -#ifdef CONFIG_SMP - -#ifndef CONFIG_XEN - -/* Globals due to paravirt */ -extern void set_cpu_sibling_map(int cpu); - -struct smp_ops -{ - void (*smp_prepare_boot_cpu)(void); - void (*smp_prepare_cpus)(unsigned max_cpus); - int (*cpu_up)(unsigned cpu); - void (*smp_cpus_done)(unsigned max_cpus); - - void (*smp_send_stop)(void); - void (*smp_send_reschedule)(int cpu); - int (*smp_call_function_mask)(cpumask_t mask, - void (*func)(void *info), void *info, - int wait); -}; - -extern struct smp_ops smp_ops; - -static inline void smp_prepare_boot_cpu(void) -{ - smp_ops.smp_prepare_boot_cpu(); -} -static inline void smp_prepare_cpus(unsigned int max_cpus) -{ - smp_ops.smp_prepare_cpus(max_cpus); -} -static inline int __cpu_up(unsigned int cpu) -{ - return smp_ops.cpu_up(cpu); -} -static inline 
void smp_cpus_done(unsigned int max_cpus) -{ - smp_ops.smp_cpus_done(max_cpus); -} - -static inline void smp_send_stop(void) -{ - smp_ops.smp_send_stop(); -} -static inline void smp_send_reschedule(int cpu) -{ - smp_ops.smp_send_reschedule(cpu); -} -static inline int smp_call_function_mask(cpumask_t mask, - void (*func) (void *info), void *info, - int wait) -{ - return smp_ops.smp_call_function_mask(mask, func, info, wait); -} - -void native_smp_prepare_boot_cpu(void); -void native_smp_prepare_cpus(unsigned int max_cpus); -int native_cpu_up(unsigned int cpunum); -void native_smp_cpus_done(unsigned int max_cpus); - -#else /* CONFIG_XEN */ - -void xen_smp_send_stop(void); -void xen_smp_send_reschedule(int cpu); -int xen_smp_call_function_mask(cpumask_t mask, - void (*func) (void *info), void *info, - int wait); - -#define smp_send_stop xen_smp_send_stop -#define smp_send_reschedule xen_smp_send_reschedule -#define smp_call_function_mask xen_smp_call_function_mask - -extern void prefill_possible_map(void); - -#endif /* CONFIG_XEN */ - -extern int __cpu_disable(void); -extern void __cpu_die(unsigned int cpu); - -/* - * This function is needed by all SMP systems. It must _always_ be valid - * from the initial startup. We map APIC_BASE very early in page_setup(), - * so this is correct in the x86 case. - */ -DECLARE_PER_CPU(int, cpu_number); -#define raw_smp_processor_id() (x86_read_percpu(cpu_number)) - -#define cpu_physical_id(cpu) (cpu) - -#define safe_smp_processor_id() smp_processor_id() - -/* We don't mark CPUs online until __cpu_up(), so we need another measure */ -static inline int num_booting_cpus(void) -{ - return cpus_weight(cpu_callout_map); -} - -#else /* CONFIG_SMP */ - -#define safe_smp_processor_id() 0 -#define cpu_physical_id(cpu) 0 - -#endif /* !CONFIG_SMP */ - -#ifdef CONFIG_X86_LOCAL_APIC - -static __inline int logical_smp_processor_id(void) -{ - /* we don't want to mark this access volatile - bad code generation */ - return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR)); -} - -# ifdef APIC_DEFINITION -extern int hard_smp_processor_id(void); -# else -# include <mach_apicdef.h> -static inline int hard_smp_processor_id(void) -{ - /* we don't want to mark this access volatile - bad code generation */ - return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID)); -} -# endif /* APIC_DEFINITION */ - -#else /* CONFIG_X86_LOCAL_APIC */ - -# ifndef CONFIG_SMP -# define hard_smp_processor_id() 0 -# endif - -#endif /* CONFIG_X86_LOCAL_APIC */ - -#endif /* !ASSEMBLY */ -#endif --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/smp_64.h 2011-01-31 18:01:51.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,101 +0,0 @@ -#ifndef __ASM_SMP_H -#define __ASM_SMP_H - -#include <linux/cpumask.h> -#include <linux/init.h> - -#ifdef CONFIG_X86_LOCAL_APIC -/* - * We need the APIC definitions automatically as part of 'smp.h' - */ -#include <asm/apic.h> -#ifdef CONFIG_X86_IO_APIC -#include <asm/io_apic.h> -#endif -#include <asm/mpspec.h> -#endif -#include <asm/pda.h> -#include <asm/thread_info.h> - -extern cpumask_t cpu_initialized; - -extern unsigned int num_processors; - -extern void smp_alloc_memory(void); -extern void lock_ipi_call_lock(void); -extern void unlock_ipi_call_lock(void); - -extern int smp_call_function_mask(cpumask_t mask, void (*func)(void *), - void *info, int wait); - -#ifndef CONFIG_XEN -DECLARE_PER_CPU(u16, cpu_llc_id); -DECLARE_PER_CPU(u16, x86_cpu_to_apicid); -DECLARE_PER_CPU(u16, x86_bios_cpu_apicid); -#endif - -#ifdef CONFIG_X86_LOCAL_APIC -static inline 
int cpu_present_to_apicid(int mps_cpu) -{ - if (cpu_present(mps_cpu)) - return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu); - else - return BAD_APICID; -} -#endif - -#ifdef CONFIG_SMP - -#define SMP_TRAMPOLINE_BASE 0x6000 - -extern int __cpu_disable(void); -extern void __cpu_die(unsigned int cpu); -extern void prefill_possible_map(void); -extern unsigned __cpuinitdata disabled_cpus; - -#define raw_smp_processor_id() read_pda(cpunumber) -#define cpu_physical_id(cpu) (cpu) - -#define stack_smp_processor_id() \ - ({ \ - struct thread_info *ti; \ - __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \ - ti->cpu; \ -}) - -/* - * On x86 all CPUs are mapped 1:1 to the APIC space. This simplifies - * scheduling and IPI sending and compresses data structures. - */ -static inline int num_booting_cpus(void) -{ - return cpus_weight(cpu_possible_map); -} - -extern void smp_send_reschedule(int cpu); - -#else /* CONFIG_SMP */ - -#define cpu_physical_id(cpu) 0 -#define stack_smp_processor_id() 0 - -#endif /* !CONFIG_SMP */ - -#define safe_smp_processor_id() smp_processor_id() - -#ifdef CONFIG_X86_LOCAL_APIC -static __inline int logical_smp_processor_id(void) -{ - /* we don't want to mark this access volatile - bad code generation */ - return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR)); -} - -static inline int hard_smp_processor_id(void) -{ - /* we don't want to mark this access volatile - bad code generation */ - return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID)); -} -#endif - -#endif - --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/spinlock.h 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/spinlock.h 2011-01-31 18:07:35.000000000 +0100 @@ -95,7 +95,7 @@ void xen_spin_kick(raw_spinlock_t *, uns : \ : "memory", "cc") -static inline int __raw_spin_trylock(raw_spinlock_t *lock) +static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock) { int tmp, new; @@ -158,7 +158,7 @@ static inline int __raw_spin_trylock(raw : "memory", "cc"); \ } while (0) -static inline int __raw_spin_trylock(raw_spinlock_t *lock) +static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock) { int tmp; int new; @@ -183,19 +183,19 @@ static inline int __raw_spin_trylock(raw static inline int __raw_spin_is_locked(raw_spinlock_t *lock) { - int tmp = *(volatile signed int *)(&(lock)->slock); + int tmp = ACCESS_ONCE(lock->slock); return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1)); } static inline int __raw_spin_is_contended(raw_spinlock_t *lock) { - int tmp = *(volatile signed int *)(&(lock)->slock); + int tmp = ACCESS_ONCE(lock->slock); return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1; } -static inline void __raw_spin_lock(raw_spinlock_t *lock) +static __always_inline void __raw_spin_lock(raw_spinlock_t *lock) { unsigned int token, count; unsigned int flags = __raw_local_irq_save(); @@ -214,8 +214,8 @@ static inline void __raw_spin_lock(raw_s } while (unlikely(!count) && !xen_spin_wait(lock, &token, flags)); } -static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, - unsigned long flags) +static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock, + unsigned long flags) { unsigned int token, count; bool free; @@ -230,7 +230,7 @@ static inline void __raw_spin_lock_flags } while (unlikely(!count) && !xen_spin_wait(lock, &token, flags)); } -static inline void __raw_spin_unlock(raw_spinlock_t *lock) +static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock) { unsigned int token; bool kick; --- 
head-2011-03-11.orig/arch/x86/include/mach-xen/asm/swiotlb.h 2011-01-31 17:56:27.000000000 +0100 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/swiotlb.h 2011-01-31 18:07:35.000000000 +0100 @@ -1,5 +1,4 @@ -#ifdef CONFIG_X86_32 -# include "swiotlb_32.h" -#else -# include_next <asm/swiotlb.h> -#endif +#include_next <asm/swiotlb.h> + +dma_addr_t swiotlb_map_single_phys(struct device *, phys_addr_t, size_t size, + int dir); --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/system.h 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/system.h 2011-01-31 18:07:35.000000000 +0100 @@ -28,22 +28,44 @@ struct task_struct *__switch_to(struct t * Saving eflags is important. It switches not only IOPL between tasks, * it also protects other tasks from NT leaking through sysenter etc. */ -#define switch_to(prev, next, last) do { \ - unsigned long esi, edi; \ - asm volatile("pushfl\n\t" /* Save flags */ \ - "pushl %%ebp\n\t" \ - "movl %%esp,%0\n\t" /* save ESP */ \ - "movl %5,%%esp\n\t" /* restore ESP */ \ - "movl $1f,%1\n\t" /* save EIP */ \ - "pushl %6\n\t" /* restore EIP */ \ - "jmp __switch_to\n" \ +#define switch_to(prev, next, last) \ +do { \ + /* \ + * Context-switching clobbers all registers, so we clobber \ + * them explicitly, via unused output variables. \ + * (EAX and EBP is not listed because EBP is saved/restored \ + * explicitly for wchan access and EAX is the return value of \ + * __switch_to()) \ + */ \ + unsigned long ebx, ecx, edx, esi, edi; \ + \ + asm volatile("pushfl\n\t" /* save flags */ \ + "pushl %%ebp\n\t" /* save EBP */ \ + "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \ + "movl %[next_sp],%%esp\n\t" /* restore ESP */ \ + "movl $1f,%[prev_ip]\n\t" /* save EIP */ \ + "pushl %[next_ip]\n\t" /* restore EIP */ \ + "jmp __switch_to\n" /* regparm call */ \ "1:\t" \ - "popl %%ebp\n\t" \ - "popfl" \ - :"=m" (prev->thread.sp), "=m" (prev->thread.ip), \ - "=a" (last), "=S" (esi), "=D" (edi) \ - :"m" (next->thread.sp), "m" (next->thread.ip), \ - "2" (prev), "d" (next)); \ + "popl %%ebp\n\t" /* restore EBP */ \ + "popfl\n" /* restore flags */ \ + \ + /* output parameters */ \ + : [prev_sp] "=m" (prev->thread.sp), \ + [prev_ip] "=m" (prev->thread.ip), \ + "=a" (last), \ + \ + /* clobbered output registers: */ \ + "=b" (ebx), "=c" (ecx), "=d" (edx), \ + "=S" (esi), "=D" (edi) \ + \ + /* input parameters: */ \ + : [next_sp] "m" (next->thread.sp), \ + [next_ip] "m" (next->thread.ip), \ + \ + /* regparm parameters for __switch_to(): */ \ + [prev] "a" (prev), \ + [next] "d" (next)); \ } while (0) /* @@ -123,30 +145,29 @@ extern void load_gs_index(unsigned); */ #define loadsegment(seg, value) \ asm volatile("\n" \ - "1:\t" \ - "movl %k0,%%" #seg "\n" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3:\t" \ - "movl %k1, %%" #seg "\n\t" \ - "jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(1b,3b) \ - : :"r" (value), "r" (0)) + "1:\t" \ + "movl %k0,%%" #seg "\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3:\t" \ + "movl %k1, %%" #seg "\n\t" \ + "jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b,3b) \ + : :"r" (value), "r" (0)) /* * Save a segment register away */ -#define savesegment(seg, value) \ +#define savesegment(seg, value) \ asm volatile("mov %%" #seg ",%0":"=rm" (value)) static inline unsigned long get_limit(unsigned long segment) { unsigned long __limit; - __asm__("lsll %1,%0" - :"=r" (__limit):"r" (segment)); - return __limit+1; + asm("lsll %1,%0" : "=r" (__limit) : "r" (segment)); + return __limit + 1; } static inline void xen_clts(void) @@ -171,13 
+192,13 @@ static unsigned long __force_order; static inline unsigned long xen_read_cr0(void) { unsigned long val; - asm volatile("mov %%cr0,%0\n\t" :"=r" (val), "=m" (__force_order)); + asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order)); return val; } static inline void xen_write_cr0(unsigned long val) { - asm volatile("mov %0,%%cr0": :"r" (val), "m" (__force_order)); + asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order)); } #define xen_read_cr2() (current_vcpu_info()->arch.cr2) @@ -186,7 +207,7 @@ static inline void xen_write_cr0(unsigne static inline unsigned long xen_read_cr3(void) { unsigned long val; - asm volatile("mov %%cr3,%0\n\t" :"=r" (val), "=m" (__force_order)); + asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order)); #ifdef CONFIG_X86_32 return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT; #else @@ -201,13 +222,13 @@ static inline void xen_write_cr3(unsigne #else val = phys_to_machine(val); #endif - asm volatile("mov %0,%%cr3": :"r" (val), "m" (__force_order)); + asm volatile("mov %0,%%cr3": : "r" (val), "m" (__force_order)); } static inline unsigned long xen_read_cr4(void) { unsigned long val; - asm volatile("mov %%cr4,%0\n\t" :"=r" (val), "=m" (__force_order)); + asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order)); return val; } @@ -215,7 +236,7 @@ static inline unsigned long xen_read_cr4 static inline void xen_write_cr4(unsigned long val) { - asm volatile("mov %0,%%cr4": :"r" (val), "m" (__force_order)); + asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order)); } #ifdef CONFIG_X86_64 @@ -234,6 +255,7 @@ static inline void xen_wbinvd(void) { asm volatile("wbinvd": : :"memory"); } + #define read_cr0() (xen_read_cr0()) #define write_cr0(x) (xen_write_cr0(x)) #define read_cr2() (xen_read_cr2()) @@ -260,7 +282,7 @@ static inline void clflush(volatile void asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p)); } -#define nop() __asm__ __volatile__ ("nop") +#define nop() asm volatile ("nop") void disable_hlt(void); void enable_hlt(void); @@ -280,16 +302,7 @@ void default_idle(void); */ #ifdef CONFIG_X86_32 /* - * For now, "wmb()" doesn't actually do anything, as all - * Intel CPU's follow what Intel calls a *Processor Order*, - * in which all writes are seen in the program order even - * outside the CPU. - * - * I expect future Intel CPU's to have a weaker ordering, - * but I'd also expect them to finally get their act together - * and add some real memory barriers if so. - * - * Some non intel clones support out of order store. wmb() ceases to be a + * Some non-Intel clones support out of order store. wmb() ceases to be a * nop for these. 
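 *
 * The classic producer pattern these macros order (sketch, names are
 * illustrative):
 *
 *	buf->data = payload;
 *	wmb();
 *	buf->ready = 1;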
*/ #define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) @@ -368,7 +381,7 @@ void default_idle(void); # define smp_wmb() barrier() #endif #define smp_read_barrier_depends() read_barrier_depends() -#define set_mb(var, value) do { (void) xchg(&var, value); } while (0) +#define set_mb(var, value) do { (void)xchg(&var, value); } while (0) #else #define smp_mb() barrier() #define smp_rmb() barrier() --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/tlbflush.h 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/tlbflush.h 2011-01-31 18:07:35.000000000 +0100 @@ -86,8 +86,7 @@ static inline void flush_tlb_range(struc #define TLBSTATE_LAZY 2 #ifdef CONFIG_X86_32 -struct tlb_state -{ +struct tlb_state { struct mm_struct *active_mm; int state; char __cacheline_padding[L1_CACHE_BYTES-8]; --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/vga.h 2007-06-12 13:14:02.000000000 +0200 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/vga.h 2011-01-31 18:07:35.000000000 +0100 @@ -12,9 +12,9 @@ * access the videoram directly without any black magic. */ -#define VGA_MAP_MEM(x,s) (unsigned long)isa_bus_to_virt(x) +#define VGA_MAP_MEM(x, s) (unsigned long)isa_bus_to_virt(x) #define vga_readb(x) (*(x)) -#define vga_writeb(x,y) (*(y) = (x)) +#define vga_writeb(x, y) (*(y) = (x)) #endif --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/xor_64.h 2007-06-12 13:14:13.000000000 +0200 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/xor_64.h 2011-01-31 18:07:35.000000000 +0100 @@ -1,20 +1,23 @@ /* - * x86-64 changes / gcc fixes from Andi Kleen. + * x86-64 changes / gcc fixes from Andi Kleen. * Copyright 2002 Andi Kleen, SuSE Labs. * * This hasn't been optimized for the hammer yet, but there are likely * no advantages to be gotten from x86-64 here anyways. */ -typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t; +typedef struct { + unsigned long a, b; +} __attribute__((aligned(16))) xmm_store_t; -/* Doesn't use gcc to save the XMM registers, because there is no easy way to +/* Doesn't use gcc to save the XMM registers, because there is no easy way to tell it to do a clts before the register saving. 
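   (When the current task was not using the FPU, clts() clears CR0.TS up
   front so the movups saves cannot fault with device-not-available;
   XMMS_RESTORE calls stts() to set TS again in that case.)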
*/ -#define XMMS_SAVE do { \ +#define XMMS_SAVE \ +do { \ preempt_disable(); \ if (!(current_thread_info()->status & TS_USEDFPU)) \ clts(); \ - __asm__ __volatile__ ( \ + asm volatile( \ "movups %%xmm0,(%1) ;\n\t" \ "movups %%xmm1,0x10(%1) ;\n\t" \ "movups %%xmm2,0x20(%1) ;\n\t" \ @@ -22,10 +25,11 @@ typedef struct { unsigned long a,b; } __ : "=&r" (cr0) \ : "r" (xmm_save) \ : "memory"); \ -} while(0) +} while (0) -#define XMMS_RESTORE do { \ - asm volatile ( \ +#define XMMS_RESTORE \ +do { \ + asm volatile( \ "sfence ;\n\t" \ "movups (%1),%%xmm0 ;\n\t" \ "movups 0x10(%1),%%xmm1 ;\n\t" \ @@ -37,72 +41,72 @@ typedef struct { unsigned long a,b; } __ if (!(current_thread_info()->status & TS_USEDFPU)) \ stts(); \ preempt_enable(); \ -} while(0) +} while (0) #define OFFS(x) "16*("#x")" #define PF_OFFS(x) "256+16*("#x")" #define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n" -#define LD(x,y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n" -#define ST(x,y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n" +#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n" +#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n" #define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n" #define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n" #define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n" #define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n" #define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n" -#define XO1(x,y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n" -#define XO2(x,y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n" -#define XO3(x,y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n" -#define XO4(x,y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n" -#define XO5(x,y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n" +#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n" +#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n" +#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n" +#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n" +#define XO5(x, y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n" static void xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) { - unsigned int lines = bytes >> 8; + unsigned int lines = bytes >> 8; unsigned long cr0; xmm_store_t xmm_save[4]; XMMS_SAVE; - asm volatile ( + asm volatile( #undef BLOCK #define BLOCK(i) \ - LD(i,0) \ - LD(i+1,1) \ + LD(i, 0) \ + LD(i + 1, 1) \ PF1(i) \ - PF1(i+2) \ - LD(i+2,2) \ - LD(i+3,3) \ - PF0(i+4) \ - PF0(i+6) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ - ST(i,0) \ - ST(i+1,1) \ - ST(i+2,2) \ - ST(i+3,3) \ + PF1(i + 2) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ + PF0(i + 4) \ + PF0(i + 6) \ + XO1(i, 0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ + ST(i, 0) \ + ST(i + 1, 1) \ + ST(i + 2, 2) \ + ST(i + 3, 3) \ PF0(0) PF0(2) " .align 32 ;\n" - " 1: ;\n" + " 1: ;\n" BLOCK(0) BLOCK(4) BLOCK(8) BLOCK(12) - " addq %[inc], %[p1] ;\n" - " addq %[inc], %[p2] ;\n" + " addq %[inc], %[p1] ;\n" + " addq %[inc], %[p2] ;\n" " decl %[cnt] ; jnz 1b" : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines) - : [inc] "r" (256UL) - : "memory"); + : [inc] "r" (256UL) + : "memory"); XMMS_RESTORE; } @@ -117,52 +121,52 @@ xor_sse_3(unsigned long bytes, unsigned XMMS_SAVE; - __asm__ __volatile__ ( + asm volatile( #undef BLOCK #define BLOCK(i) \ PF1(i) \ - PF1(i+2) \ - LD(i,0) \ - LD(i+1,1) \ - LD(i+2,2) \ - LD(i+3,3) \ + PF1(i + 2) \ + LD(i, 0) \ + LD(i + 1, 1) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ PF2(i) \ - PF2(i+2) \ - PF0(i+4) \ - PF0(i+6) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ - XO2(i,0) \ - XO2(i+1,1) \ - XO2(i+2,2) \ - XO2(i+3,3) \ - 
ST(i,0) \ - ST(i+1,1) \ - ST(i+2,2) \ - ST(i+3,3) \ + PF2(i + 2) \ + PF0(i + 4) \ + PF0(i + 6) \ + XO1(i, 0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ + XO2(i, 0) \ + XO2(i + 1, 1) \ + XO2(i + 2, 2) \ + XO2(i + 3, 3) \ + ST(i, 0) \ + ST(i + 1, 1) \ + ST(i + 2, 2) \ + ST(i + 3, 3) \ PF0(0) PF0(2) " .align 32 ;\n" - " 1: ;\n" + " 1: ;\n" BLOCK(0) BLOCK(4) BLOCK(8) BLOCK(12) - " addq %[inc], %[p1] ;\n" - " addq %[inc], %[p2] ;\n" - " addq %[inc], %[p3] ;\n" + " addq %[inc], %[p1] ;\n" + " addq %[inc], %[p2] ;\n" + " addq %[inc], %[p3] ;\n" " decl %[cnt] ; jnz 1b" : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) : [inc] "r" (256UL) - : "memory"); + : "memory"); XMMS_RESTORE; } @@ -171,64 +175,64 @@ xor_sse_4(unsigned long bytes, unsigned unsigned long *p3, unsigned long *p4) { unsigned int lines = bytes >> 8; - xmm_store_t xmm_save[4]; + xmm_store_t xmm_save[4]; unsigned long cr0; XMMS_SAVE; - __asm__ __volatile__ ( + asm volatile( #undef BLOCK #define BLOCK(i) \ PF1(i) \ - PF1(i+2) \ - LD(i,0) \ - LD(i+1,1) \ - LD(i+2,2) \ - LD(i+3,3) \ + PF1(i + 2) \ + LD(i, 0) \ + LD(i + 1, 1) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ PF2(i) \ - PF2(i+2) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ + PF2(i + 2) \ + XO1(i, 0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ PF3(i) \ - PF3(i+2) \ - PF0(i+4) \ - PF0(i+6) \ - XO2(i,0) \ - XO2(i+1,1) \ - XO2(i+2,2) \ - XO2(i+3,3) \ - XO3(i,0) \ - XO3(i+1,1) \ - XO3(i+2,2) \ - XO3(i+3,3) \ - ST(i,0) \ - ST(i+1,1) \ - ST(i+2,2) \ - ST(i+3,3) \ + PF3(i + 2) \ + PF0(i + 4) \ + PF0(i + 6) \ + XO2(i, 0) \ + XO2(i + 1, 1) \ + XO2(i + 2, 2) \ + XO2(i + 3, 3) \ + XO3(i, 0) \ + XO3(i + 1, 1) \ + XO3(i + 2, 2) \ + XO3(i + 3, 3) \ + ST(i, 0) \ + ST(i + 1, 1) \ + ST(i + 2, 2) \ + ST(i + 3, 3) \ PF0(0) PF0(2) " .align 32 ;\n" - " 1: ;\n" + " 1: ;\n" BLOCK(0) BLOCK(4) BLOCK(8) BLOCK(12) - " addq %[inc], %[p1] ;\n" - " addq %[inc], %[p2] ;\n" - " addq %[inc], %[p3] ;\n" - " addq %[inc], %[p4] ;\n" + " addq %[inc], %[p1] ;\n" + " addq %[inc], %[p2] ;\n" + " addq %[inc], %[p3] ;\n" + " addq %[inc], %[p4] ;\n" " decl %[cnt] ; jnz 1b" : [cnt] "+c" (lines), [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4) : [inc] "r" (256UL) - : "memory" ); + : "memory" ); XMMS_RESTORE; } @@ -237,70 +241,70 @@ static void xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, unsigned long *p3, unsigned long *p4, unsigned long *p5) { - unsigned int lines = bytes >> 8; + unsigned int lines = bytes >> 8; xmm_store_t xmm_save[4]; unsigned long cr0; XMMS_SAVE; - __asm__ __volatile__ ( + asm volatile( #undef BLOCK #define BLOCK(i) \ PF1(i) \ - PF1(i+2) \ - LD(i,0) \ - LD(i+1,1) \ - LD(i+2,2) \ - LD(i+3,3) \ + PF1(i + 2) \ + LD(i, 0) \ + LD(i + 1, 1) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ PF2(i) \ - PF2(i+2) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ + PF2(i + 2) \ + XO1(i, 0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ PF3(i) \ - PF3(i+2) \ - XO2(i,0) \ - XO2(i+1,1) \ - XO2(i+2,2) \ - XO2(i+3,3) \ + PF3(i + 2) \ + XO2(i, 0) \ + XO2(i + 1, 1) \ + XO2(i + 2, 2) \ + XO2(i + 3, 3) \ PF4(i) \ - PF4(i+2) \ - PF0(i+4) \ - PF0(i+6) \ - XO3(i,0) \ - XO3(i+1,1) \ - XO3(i+2,2) \ - XO3(i+3,3) \ - XO4(i,0) \ - XO4(i+1,1) \ - XO4(i+2,2) \ - XO4(i+3,3) \ - ST(i,0) \ - ST(i+1,1) \ - ST(i+2,2) \ - ST(i+3,3) \ + PF4(i + 2) \ + PF0(i + 4) \ + PF0(i + 6) \ + XO3(i, 0) \ + XO3(i + 1, 1) \ + XO3(i + 2, 2) \ + XO3(i + 3, 3) \ + XO4(i, 0) \ + XO4(i + 1, 1) \ + XO4(i + 2, 2) \ + XO4(i + 3, 3) \ + ST(i, 0) \ + ST(i + 1, 1) \ + 
ST(i + 2, 2) \ + ST(i + 3, 3) \ PF0(0) PF0(2) " .align 32 ;\n" - " 1: ;\n" + " 1: ;\n" BLOCK(0) BLOCK(4) BLOCK(8) BLOCK(12) - " addq %[inc], %[p1] ;\n" - " addq %[inc], %[p2] ;\n" - " addq %[inc], %[p3] ;\n" - " addq %[inc], %[p4] ;\n" - " addq %[inc], %[p5] ;\n" + " addq %[inc], %[p1] ;\n" + " addq %[inc], %[p2] ;\n" + " addq %[inc], %[p3] ;\n" + " addq %[inc], %[p4] ;\n" + " addq %[inc], %[p5] ;\n" " decl %[cnt] ; jnz 1b" : [cnt] "+c" (lines), - [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4), + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5) : [inc] "r" (256UL) : "memory"); @@ -309,18 +313,18 @@ xor_sse_5(unsigned long bytes, unsigned } static struct xor_block_template xor_block_sse = { - .name = "generic_sse", - .do_2 = xor_sse_2, - .do_3 = xor_sse_3, - .do_4 = xor_sse_4, - .do_5 = xor_sse_5, + .name = "generic_sse", + .do_2 = xor_sse_2, + .do_3 = xor_sse_3, + .do_4 = xor_sse_4, + .do_5 = xor_sse_5, }; #undef XOR_TRY_TEMPLATES -#define XOR_TRY_TEMPLATES \ - do { \ - xor_speed(&xor_block_sse); \ - } while (0) +#define XOR_TRY_TEMPLATES \ +do { \ + xor_speed(&xor_block_sse); \ +} while (0) /* We force the use of the SSE xor block because it can write around L2. We may also be able to load into the L1 only depending on how the cpu --- head-2011-03-11.orig/include/linux/page-flags.h 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/include/linux/page-flags.h 2011-01-31 18:07:35.000000000 +0100 @@ -345,28 +345,27 @@ static inline void SetPageUptodate(struc CLEARPAGEFLAG(Uptodate, uptodate) -#define PageForeign(page) test_bit(PG_foreign, &(page)->flags) -#define SetPageForeign(_page, dtor) do { \ - set_bit(PG_foreign, &(_page)->flags); \ - BUG_ON((dtor) == (void (*)(struct page *, unsigned int))0); \ - (_page)->index = (long)(dtor); \ -} while (0) -#define ClearPageForeign(page) do { \ - clear_bit(PG_foreign, &(page)->flags); \ - (page)->index = 0; \ -} while (0) -#define PageForeignDestructor(_page, order) \ - ((void (*)(struct page *, unsigned int))(_page)->index)(_page, order) - -#if 0 -#define PageNetback(page) test_bit(PG_netback, &(page)->flags) -#define SetPageNetback(page) set_bit(PG_netback, &(page)->flags) -#define ClearPageNetback(page) clear_bit(PG_netback, &(page)->flags) -#endif - -#define PageBlkback(page) test_bit(PG_blkback, &(page)->flags) -#define SetPageBlkback(page) set_bit(PG_blkback, &(page)->flags) -#define ClearPageBlkback(page) clear_bit(PG_blkback, &(page)->flags) +#ifdef CONFIG_XEN +TESTPAGEFLAG(Foreign, foreign) +static inline void SetPageForeign(struct page *page, + void (*dtor)(struct page *, unsigned int)) +{ + BUG_ON(!dtor); + set_bit(PG_foreign, &page->flags); + page->index = (long)dtor; +} +static inline void ClearPageForeign(struct page *page) +{ + clear_bit(PG_foreign, &page->flags); + page->index = 0; +} +static inline void PageForeignDestructor(struct page *page, unsigned int order) +{ + ((void (*)(struct page *, unsigned int))page->index)(page, order); +} +/*PAGEFLAG(Netback, netback)*/ +PAGEFLAG(Blkback, blkback) +#endif extern void cancel_dirty_page(struct page *page, unsigned int account_size); --- head-2011-03-11.orig/include/xen/balloon.h 2007-06-12 13:14:19.000000000 +0200 +++ head-2011-03-11/include/xen/balloon.h 2011-01-31 18:07:35.000000000 +0100 @@ -31,9 +31,12 @@ * IN THE SOFTWARE. 
*/ -#ifndef __ASM_BALLOON_H__ -#define __ASM_BALLOON_H__ +#ifndef __XEN_BALLOON_H__ +#define __XEN_BALLOON_H__ +#include <linux/spinlock.h> + +#if !defined(CONFIG_PARAVIRT_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H) /* * Inform the balloon driver that it should allow some slop for device-driver * memory activities. @@ -53,5 +56,6 @@ void balloon_release_driver_page(struct extern spinlock_t balloon_lock; #define balloon_lock(__flags) spin_lock_irqsave(&balloon_lock, __flags) #define balloon_unlock(__flags) spin_unlock_irqrestore(&balloon_lock, __flags) +#endif -#endif /* __ASM_BALLOON_H__ */ +#endif /* __XEN_BALLOON_H__ */ --- head-2011-03-11.orig/include/xen/interface/grant_table.h 2011-01-31 15:14:12.000000000 +0100 +++ head-2011-03-11/include/xen/interface/grant_table.h 2011-01-31 18:07:35.000000000 +0100 @@ -288,6 +288,7 @@ struct gnttab_map_grant_ref { grant_handle_t handle; uint64_t dev_bus_addr; }; +DEFINE_GUEST_HANDLE_STRUCT(gnttab_map_grant_ref); typedef struct gnttab_map_grant_ref gnttab_map_grant_ref_t; DEFINE_XEN_GUEST_HANDLE(gnttab_map_grant_ref_t); @@ -311,6 +312,7 @@ struct gnttab_unmap_grant_ref { /* OUT parameters. */ int16_t status; /* GNTST_* */ }; +DEFINE_GUEST_HANDLE_STRUCT(gnttab_unmap_grant_ref); typedef struct gnttab_unmap_grant_ref gnttab_unmap_grant_ref_t; DEFINE_XEN_GUEST_HANDLE(gnttab_unmap_grant_ref_t); @@ -332,6 +334,7 @@ struct gnttab_setup_table { int16_t status; /* GNTST_* */ XEN_GUEST_HANDLE(ulong) frame_list; }; +DEFINE_GUEST_HANDLE_STRUCT(gnttab_setup_table); typedef struct gnttab_setup_table gnttab_setup_table_t; DEFINE_XEN_GUEST_HANDLE(gnttab_setup_table_t); @@ -346,6 +349,7 @@ struct gnttab_dump_table { /* OUT parameters. */ int16_t status; /* GNTST_* */ }; +DEFINE_GUEST_HANDLE_STRUCT(gnttab_dump_table); typedef struct gnttab_dump_table gnttab_dump_table_t; DEFINE_XEN_GUEST_HANDLE(gnttab_dump_table_t); @@ -366,6 +370,7 @@ struct gnttab_transfer { /* OUT parameters. */ int16_t status; }; +DEFINE_GUEST_HANDLE_STRUCT(gnttab_transfer); typedef struct gnttab_transfer gnttab_transfer_t; DEFINE_XEN_GUEST_HANDLE(gnttab_transfer_t); @@ -411,6 +416,7 @@ typedef struct gnttab_copy { /* OUT parameters. */ int16_t status; } gnttab_copy_t; +DEFINE_GUEST_HANDLE_STRUCT(gnttab_copy); DEFINE_XEN_GUEST_HANDLE(gnttab_copy_t); /* @@ -429,6 +435,7 @@ struct gnttab_query_size { uint32_t max_nr_frames; int16_t status; /* GNTST_* */ }; +DEFINE_GUEST_HANDLE_STRUCT(gnttab_query_size); typedef struct gnttab_query_size gnttab_query_size_t; DEFINE_XEN_GUEST_HANDLE(gnttab_query_size_t); --- head-2011-03-11.orig/include/xen/interface/io/fbif.h 2011-01-31 15:14:12.000000000 +0100 +++ head-2011-03-11/include/xen/interface/io/fbif.h 2011-01-31 18:07:35.000000000 +0100 @@ -150,7 +150,12 @@ struct xenfb_page * framebuffer with a max resolution of 12,800x10,240. Should * be enough for a while with room leftover for expansion. */ +#ifndef CONFIG_PARAVIRT_XEN unsigned long pd[256]; +#else + /* Two directory pages should be enough for a while. 
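+	 * (Each pd[] entry names one page of mapping entries, so pd[2]
+	 *  covers 2 * PAGE_SIZE / sizeof(unsigned long) framebuffer pages:
+	 *  roughly 8 MiB with 32-bit longs and 4 MiB with 64-bit ones.
+	 *  Rough sizing only, not part of the interface.)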
*/ + unsigned long pd[2]; +#endif }; /* --- head-2011-03-11.orig/include/xen/interface/memory.h 2011-01-31 17:49:31.000000000 +0100 +++ head-2011-03-11/include/xen/interface/memory.h 2011-01-31 18:07:35.000000000 +0100 @@ -88,7 +88,6 @@ struct xen_memory_reservation { */ domid_t domid; }; -DEFINE_GUEST_HANDLE_STRUCT(xen_memory_reservation); typedef struct xen_memory_reservation xen_memory_reservation_t; DEFINE_XEN_GUEST_HANDLE(xen_memory_reservation_t); @@ -174,7 +173,11 @@ struct xen_machphys_mfn_list { * any large discontiguities in the machine address space, 2MB gaps in * the machphys table will be represented by an MFN base of zero. */ +#ifndef CONFIG_PARAVIRT_XEN XEN_GUEST_HANDLE(xen_pfn_t) extent_start; +#else + ulong extent_start; +#endif /* * Number of extents written to the above array. This will be smaller @@ -182,7 +185,6 @@ struct xen_machphys_mfn_list { */ unsigned int nr_extents; }; -DEFINE_GUEST_HANDLE_STRUCT(xen_machphys_mfn_list); typedef struct xen_machphys_mfn_list xen_machphys_mfn_list_t; DEFINE_XEN_GUEST_HANDLE(xen_machphys_mfn_list_t); @@ -224,7 +226,6 @@ struct xen_add_to_physmap { /* GPFN where the source mapping page should appear. */ xen_pfn_t gpfn; }; -DEFINE_GUEST_HANDLE_STRUCT(xen_add_to_physmap); typedef struct xen_add_to_physmap xen_add_to_physmap_t; DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_t); --- head-2011-03-11.orig/include/xen/interface/vcpu.h 2011-01-31 15:14:12.000000000 +0100 +++ head-2011-03-11/include/xen/interface/vcpu.h 2011-01-31 18:07:35.000000000 +0100 @@ -87,6 +87,7 @@ struct vcpu_runstate_info { */ uint64_t time[4]; }; +DEFINE_GUEST_HANDLE_STRUCT(vcpu_runstate_info); typedef struct vcpu_runstate_info vcpu_runstate_info_t; DEFINE_XEN_GUEST_HANDLE(vcpu_runstate_info_t); @@ -142,6 +143,7 @@ DEFINE_XEN_GUEST_HANDLE(vcpu_register_ru struct vcpu_set_periodic_timer { uint64_t period_ns; }; +DEFINE_GUEST_HANDLE_STRUCT(vcpu_set_periodic_timer); typedef struct vcpu_set_periodic_timer vcpu_set_periodic_timer_t; DEFINE_XEN_GUEST_HANDLE(vcpu_set_periodic_timer_t); @@ -155,6 +157,7 @@ struct vcpu_set_singleshot_timer { uint64_t timeout_abs_ns; /* Absolute system time value in nanoseconds. */ uint32_t flags; /* VCPU_SSHOTTMR_??? */ }; +DEFINE_GUEST_HANDLE_STRUCT(vcpu_set_singleshot_timer); typedef struct vcpu_set_singleshot_timer vcpu_set_singleshot_timer_t; DEFINE_XEN_GUEST_HANDLE(vcpu_set_singleshot_timer_t); @@ -178,6 +181,7 @@ struct vcpu_register_vcpu_info { uint32_t offset; /* offset within page */ uint32_t rsvd; /* unused */ }; +DEFINE_GUEST_HANDLE_STRUCT(vcpu_register_vcpu_info); typedef struct vcpu_register_vcpu_info vcpu_register_vcpu_info_t; DEFINE_XEN_GUEST_HANDLE(vcpu_register_vcpu_info_t); --- head-2011-03-11.orig/lib/swiotlb-xen.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-11/lib/swiotlb-xen.c 2011-01-31 18:07:35.000000000 +0100 @@ -20,6 +20,7 @@ #include <linux/ctype.h> #include <linux/init.h> #include <linux/bootmem.h> +#include <linux/iommu-helper.h> #include <linux/highmem.h> #include <asm/io.h> #include <asm/pci.h> @@ -296,15 +297,6 @@ __sync_single(struct phys_addr buffer, c } } -static inline unsigned int is_span_boundary(unsigned int index, - unsigned int nslots, - unsigned long offset_slots, - unsigned long max_slots) -{ - unsigned long offset = (offset_slots + index) & (max_slots - 1); - return offset + nslots > max_slots; -} - /* * Allocates bounce buffer and returns its kernel virtual address. */ @@ -343,61 +335,53 @@ map_single(struct device *hwdev, struct * request and allocate a buffer from that IO TLB pool. 
*/ spin_lock_irqsave(&io_tlb_lock, flags); - { - index = ALIGN(io_tlb_index, stride); - if (index >= iotlb_nslabs) - index = 0; - wrap = index; + index = ALIGN(io_tlb_index, stride); + if (index >= iotlb_nslabs) + index = 0; + wrap = index; - do { - while (is_span_boundary(index, nslots, offset_slots, - max_slots)) { - index += stride; - if (index >= iotlb_nslabs) - index = 0; - if (index == wrap) - goto not_found; - } + do { + while (iommu_is_span_boundary(index, nslots, offset_slots, + max_slots)) { + index += stride; + if (index >= iotlb_nslabs) + index = 0; + if (index == wrap) + goto not_found; + } + + /* + * If we find a slot that indicates we have 'nslots' number of + * contiguous buffers, we allocate the buffers from that slot + * and mark the entries as '0' indicating unavailable. + */ + if (io_tlb_list[index] >= nslots) { + int count = 0; + + for (i = index; i < (int) (index + nslots); i++) + io_tlb_list[i] = 0; + for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--) + io_tlb_list[i] = ++count; + dma_addr = iotlb_virt_start + (index << IO_TLB_SHIFT); /* - * If we find a slot that indicates we have 'nslots' - * number of contiguous buffers, we allocate the - * buffers from that slot and mark the entries as '0' - * indicating unavailable. + * Update the indices to avoid searching in the next + * round. */ - if (io_tlb_list[index] >= nslots) { - int count = 0; + io_tlb_index = ((index + nslots) < iotlb_nslabs + ? (index + nslots) : 0); - for (i = index; i < (int)(index + nslots); i++) - io_tlb_list[i] = 0; - for (i = index - 1; - (OFFSET(i, IO_TLB_SEGSIZE) != - IO_TLB_SEGSIZE -1) && io_tlb_list[i]; - i--) - io_tlb_list[i] = ++count; - dma_addr = iotlb_virt_start + - (index << IO_TLB_SHIFT); - - /* - * Update the indices to avoid searching in - * the next round. - */ - io_tlb_index = - ((index + nslots) < iotlb_nslabs - ? (index + nslots) : 0); - - goto found; - } - index += stride; - if (index >= iotlb_nslabs) - index = 0; - } while (index != wrap); + goto found; + } + index += stride; + if (index >= iotlb_nslabs) + index = 0; + } while (index != wrap); - not_found: - spin_unlock_irqrestore(&io_tlb_lock, flags); - return NULL; - } - found: +not_found: + spin_unlock_irqrestore(&io_tlb_lock, flags); + return NULL; +found: spin_unlock_irqrestore(&io_tlb_lock, flags); /* @@ -525,11 +509,13 @@ swiotlb_full(struct device *dev, size_t * Once the device is given the dma address, the device owns this memory until * either swiotlb_unmap_single or swiotlb_dma_sync_single is performed. */ -dma_addr_t -swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir) -{ - dma_addr_t dev_addr = gnttab_dma_map_page(virt_to_page(ptr)) + - offset_in_page(ptr); +static dma_addr_t +_swiotlb_map_single(struct device *hwdev, phys_addr_t paddr, size_t size, + int dir, struct dma_attrs *attrs) +{ + struct page *page = pfn_to_page(paddr >> PAGE_SHIFT); + dma_addr_t dev_addr = gnttab_dma_map_page(page) + + offset_in_page(paddr); void *map; struct phys_addr buffer; @@ -540,7 +526,7 @@ swiotlb_map_single(struct device *hwdev, * we can safely return the device addr and not worry about bounce * buffering it. */ - if (!range_straddles_page_boundary(__pa(ptr), size) && + if (!range_straddles_page_boundary(paddr, size) && !address_needs_mapping(hwdev, dev_addr)) return dev_addr; @@ -548,8 +534,8 @@ swiotlb_map_single(struct device *hwdev, * Oh well, have to allocate and map a bounce buffer. 
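 *
 * (Bounce sketch: map_single() below copies the caller's page into an
 *  aperture slot and the function returns that slot's bus address, so
 *  the device DMAs to the bounce copy rather than to the original
 *  buffer.)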
*/ gnttab_dma_unmap_page(dev_addr); - buffer.page = virt_to_page(ptr); - buffer.offset = (unsigned long)ptr & ~PAGE_MASK; + buffer.page = page; + buffer.offset = offset_in_page(paddr); map = map_single(hwdev, buffer, size, dir); if (!map) { swiotlb_full(hwdev, size, dir, 1); @@ -560,6 +546,26 @@ swiotlb_map_single(struct device *hwdev, return dev_addr; } +dma_addr_t +swiotlb_map_single_attrs(struct device *hwdev, void *ptr, size_t size, + int dir, struct dma_attrs *attrs) +{ + return _swiotlb_map_single(hwdev, virt_to_phys(ptr), size, dir, attrs); +} +EXPORT_SYMBOL(swiotlb_map_single_attrs); + +dma_addr_t +swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir) +{ + return _swiotlb_map_single(hwdev, virt_to_phys(ptr), size, dir, NULL); +} + +dma_addr_t +swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size, int dir) +{ + return _swiotlb_map_single(hwdev, paddr, size, dir, NULL); +} + /* * Unmap a single streaming mode DMA translation. The dma_addr and size must * match what was provided for in a previous swiotlb_map_single call. All @@ -569,8 +575,8 @@ swiotlb_map_single(struct device *hwdev, * whatever the device wrote there. */ void -swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size, - int dir) +swiotlb_unmap_single_attrs(struct device *hwdev, dma_addr_t dev_addr, + size_t size, int dir, struct dma_attrs *attrs) { BUG_ON(dir == DMA_NONE); if (in_swiotlb_aperture(dev_addr)) @@ -578,7 +584,14 @@ swiotlb_unmap_single(struct device *hwde else gnttab_dma_unmap_page(dev_addr); } +EXPORT_SYMBOL(swiotlb_unmap_single_attrs); +void +swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size, + int dir) +{ + return swiotlb_unmap_single_attrs(hwdev, dev_addr, size, dir, NULL); +} /* * Make physical memory consistent for a single streaming mode DMA translation * after a transfer. @@ -613,6 +626,38 @@ swiotlb_sync_single_for_device(struct de } /* + * Same as above, but for a sub-range of the mapping. + */ +static void +swiotlb_sync_single_range(struct device *hwdev, dma_addr_t dev_addr, + unsigned long offset, size_t size, + int dir, int target) +{ + BUG_ON(dir == DMA_NONE); + if (in_swiotlb_aperture(dev_addr)) + sync_single(hwdev, bus_to_virt(dev_addr + offset), size, + dir, target); +} + +void +swiotlb_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dev_addr, + unsigned long offset, size_t size, int dir) +{ + swiotlb_sync_single_range(hwdev, dev_addr, offset, size, dir, + SYNC_FOR_CPU); +} + +void +swiotlb_sync_single_range_for_device(struct device *hwdev, dma_addr_t dev_addr, + unsigned long offset, size_t size, int dir) +{ + swiotlb_sync_single_range(hwdev, dev_addr, offset, size, dir, + SYNC_FOR_DEVICE); +} + +void swiotlb_unmap_sg_attrs(struct device *, struct scatterlist *, int, int, + struct dma_attrs *); +/* * Map a set of buffers described by scatterlist in streaming mode for DMA. * This is the scatter-gather version of the above swiotlb_map_single * interface. Here the scatter gather list elements are each tagged with the @@ -629,8 +674,8 @@ swiotlb_sync_single_for_device(struct de * same here. */ int -swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, - int dir) +swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, + int dir, struct dma_attrs *attrs) { struct scatterlist *sg; struct phys_addr buffer; @@ -654,7 +699,8 @@ swiotlb_map_sg(struct device *hwdev, str /* Don't panic here, we expect map_sg users to do proper error handling. 
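			   A caller-side sketch of that error handling
			   (illustrative):
			       if (!dma_map_sg(dev, sgl, nents, dir))
			           goto unwind;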
*/ swiotlb_full(hwdev, sg->length, dir, 0); - swiotlb_unmap_sg(hwdev, sgl, i, dir); + swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir, + attrs); sgl[0].dma_length = 0; return 0; } @@ -665,14 +711,22 @@ swiotlb_map_sg(struct device *hwdev, str } return nelems; } +EXPORT_SYMBOL(swiotlb_map_sg_attrs); + +int +swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, + int dir) +{ + return swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL); +} /* * Unmap a set of streaming mode DMA translations. Again, cpu read rules * concerning calls here are the same as for swiotlb_unmap_single() above. */ void -swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, - int dir) +swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, + int nelems, int dir, struct dma_attrs *attrs) { struct scatterlist *sg; int i; @@ -687,6 +741,14 @@ swiotlb_unmap_sg(struct device *hwdev, s gnttab_dma_unmap_page(sg->dma_address); } } +EXPORT_SYMBOL(swiotlb_unmap_sg_attrs); + +void +swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, + int dir) +{ + return swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL); +} /* * Make physical memory consistent for a set of streaming mode DMA translations @@ -725,46 +787,6 @@ swiotlb_sync_sg_for_device(struct device swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE); } -#ifdef CONFIG_HIGHMEM - -dma_addr_t -swiotlb_map_page(struct device *hwdev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction direction) -{ - struct phys_addr buffer; - dma_addr_t dev_addr; - char *map; - - dev_addr = gnttab_dma_map_page(page) + offset; - if (address_needs_mapping(hwdev, dev_addr)) { - gnttab_dma_unmap_page(dev_addr); - buffer.page = page; - buffer.offset = offset; - map = map_single(hwdev, buffer, size, direction); - if (!map) { - swiotlb_full(hwdev, size, direction, 1); - map = io_tlb_overflow_buffer; - } - dev_addr = (dma_addr_t)virt_to_bus(map); - } - - return dev_addr; -} - -void -swiotlb_unmap_page(struct device *hwdev, dma_addr_t dma_address, - size_t size, enum dma_data_direction direction) -{ - BUG_ON(direction == DMA_NONE); - if (in_swiotlb_aperture(dma_address)) - unmap_single(hwdev, bus_to_virt(dma_address), size, direction); - else - gnttab_dma_unmap_page(dma_address); -} - -#endif - int swiotlb_dma_mapping_error(dma_addr_t dma_addr) {
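	/*
	 * (A failed mapping is conventionally signalled by handing back the
	 *  bus address of io_tlb_overflow_buffer, as swiotlb_full() above
	 *  arranges, so that is the value dma_addr gets compared against.)
	 */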