From: Linux Kernel Mailing List Subject: [PATCH] Linux: Update to 2.6.28 Patch-mainline: 2.6.28 This patch contains the differences between Linux 2.6.27 and 2.6.28. Acked-by: Jeff Mahoney Automatically created from "patches.kernel.org/patch-2.6.28" by xen-port-patches.py --- head-2011-03-17.orig/arch/ia64/Kconfig 2011-03-17 14:35:46.000000000 +0100 +++ head-2011-03-17/arch/ia64/Kconfig 2011-02-01 14:39:24.000000000 +0100 @@ -230,7 +230,7 @@ config IA64_HP_SIM config IA64_XEN_GUEST bool "Xen guest" select SWIOTLB - depends on XEN + depends on PARAVIRT_XEN help Build a kernel that runs on Xen guest domain. At this moment only 16KB page size in supported. --- head-2011-03-17.orig/arch/ia64/Makefile 2011-03-17 14:35:46.000000000 +0100 +++ head-2011-03-17/arch/ia64/Makefile 2011-02-01 14:39:24.000000000 +0100 @@ -55,7 +55,7 @@ core-$(CONFIG_IA64_XEN_GUEST) += arch/ia core-$(CONFIG_IA64_SGI_SN2) += arch/ia64/sn/ core-$(CONFIG_IA64_SGI_UV) += arch/ia64/uv/ core-$(CONFIG_KVM) += arch/ia64/kvm/ -core-$(CONFIG_XEN) += arch/ia64/xen/ +core-$(CONFIG_PARAVIRT_XEN) += arch/ia64/xen/ drivers-$(CONFIG_PCI) += arch/ia64/pci/ drivers-$(CONFIG_IA64_HP_SIM) += arch/ia64/hp/sim/ --- head-2011-03-17.orig/arch/ia64/include/asm/xen/hypervisor.h 2011-03-17 14:35:46.000000000 +0100 +++ head-2011-03-17/arch/ia64/include/asm/xen/hypervisor.h 2011-02-01 14:39:24.000000000 +0100 @@ -40,7 +40,7 @@ #include #include -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN extern struct shared_info *HYPERVISOR_shared_info; extern struct start_info *xen_start_info; --- head-2011-03-17.orig/arch/ia64/include/asm/xen/interface.h 2011-03-17 14:35:46.000000000 +0100 +++ head-2011-03-17/arch/ia64/include/asm/xen/interface.h 2011-02-01 14:39:24.000000000 +0100 @@ -56,29 +56,21 @@ #ifndef _ASM_IA64_XEN_INTERFACE_H #define _ASM_IA64_XEN_INTERFACE_H -#define __DEFINE_GUEST_HANDLE(name, type) \ +#define __DEFINE_XEN_GUEST_HANDLE(name, type) \ typedef struct { type *p; } __guest_handle_ ## name #define DEFINE_GUEST_HANDLE_STRUCT(name) \ - __DEFINE_GUEST_HANDLE(name, struct name) -#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name) -#define GUEST_HANDLE(name) __guest_handle_ ## name -#define GUEST_HANDLE_64(name) GUEST_HANDLE(name) + __DEFINE_XEN_GUEST_HANDLE(name, struct name) +#define DEFINE_XEN_GUEST_HANDLE(name) __DEFINE_XEN_GUEST_HANDLE(name, name) +#define XEN_GUEST_HANDLE(name) __guest_handle_ ## name +#define XEN_GUEST_HANDLE_64(name) XEN_GUEST_HANDLE(name) #define set_xen_guest_handle(hnd, val) do { (hnd).p = val; } while (0) #ifndef __ASSEMBLY__ -/* Guest handles for primitive C types. */ -__DEFINE_GUEST_HANDLE(uchar, unsigned char); -__DEFINE_GUEST_HANDLE(uint, unsigned int); -__DEFINE_GUEST_HANDLE(ulong, unsigned long); -__DEFINE_GUEST_HANDLE(u64, unsigned long); -DEFINE_GUEST_HANDLE(char); -DEFINE_GUEST_HANDLE(int); -DEFINE_GUEST_HANDLE(long); -DEFINE_GUEST_HANDLE(void); +__DEFINE_XEN_GUEST_HANDLE(u64, unsigned long); +typedef unsigned long xen_ulong_t; typedef unsigned long xen_pfn_t; -DEFINE_GUEST_HANDLE(xen_pfn_t); #define PRI_xen_pfn "lx" #endif @@ -90,7 +82,7 @@ DEFINE_GUEST_HANDLE(xen_pfn_t); /* Maximum number of virtual CPUs in multi-processor guests. */ /* keep sizeof(struct shared_page) <= PAGE_SIZE. * this is checked in arch/ia64/xen/hypervisor.c. */ -#define MAX_VIRT_CPUS 64 +#define XEN_LEGACY_MAX_VCPUS 64 #ifndef __ASSEMBLY__ --- head-2011-03-17.orig/arch/ia64/kernel/asm-offsets.c 2011-03-17 14:35:46.000000000 +0100 +++ head-2011-03-17/arch/ia64/kernel/asm-offsets.c 2011-02-01 14:39:24.000000000 +0100 @@ -290,7 +290,7 @@ void foo(void) DEFINE(IA64_ITC_LASTCYCLE_OFFSET, offsetof (struct itc_jitter_data_t, itc_lastcycle)); -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN BLANK(); DEFINE(XEN_NATIVE_ASM, XEN_NATIVE); --- head-2011-03-17.orig/arch/ia64/xen/Kconfig 2011-03-17 14:35:46.000000000 +0100 +++ head-2011-03-17/arch/ia64/xen/Kconfig 2011-02-02 15:36:46.000000000 +0100 @@ -2,7 +2,7 @@ # This Kconfig describes xen/ia64 options # -config XEN +config PARAVIRT_XEN bool "Xen hypervisor support" default y depends on PARAVIRT && MCKINLEY && IA64_PAGE_SIZE_16KB && EXPERIMENTAL @@ -16,10 +16,6 @@ config XEN Enable Xen hypervisor support. Resulting kernel runs both as a guest OS on Xen and natively on hardware. -config XEN_XENCOMM - depends on XEN - bool - config NO_IDLE_HZ - depends on XEN + depends on PARAVIRT_XEN bool --- head-2011-03-17.orig/arch/ia64/xen/xcom_hcall.c 2011-03-17 14:35:46.000000000 +0100 +++ head-2011-03-17/arch/ia64/xen/xcom_hcall.c 2011-02-01 14:39:24.000000000 +0100 @@ -343,7 +343,7 @@ xencommize_memory_reservation(struct xen int xencomm_hypercall_memory_op(unsigned int cmd, void *arg) { - GUEST_HANDLE(xen_pfn_t) extent_start_va[2] = { {NULL}, {NULL} }; + XEN_GUEST_HANDLE(xen_pfn_t) extent_start_va[2] = { {NULL}, {NULL} }; struct xen_memory_reservation *xmr = NULL; int rc; struct xencomm_handle *desc; --- head-2011-03-17.orig/arch/x86/Kconfig 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/Kconfig 2011-02-01 14:39:24.000000000 +0100 @@ -1028,7 +1028,7 @@ config MICROCODE config MICROCODE_INTEL bool "Intel microcode patch loading support" - depends on MICROCODE + depends on MICROCODE && !XEN default MICROCODE select FW_LOADER ---help--- @@ -1041,7 +1041,7 @@ config MICROCODE_INTEL config MICROCODE_AMD bool "AMD microcode patch loading support" - depends on MICROCODE + depends on MICROCODE && !XEN select FW_LOADER ---help--- If you select this option, microcode patch loading support for AMD @@ -1342,6 +1342,7 @@ config HIGHPTE config X86_CHECK_BIOS_CORRUPTION bool "Check for low memory corruption" + depends on !XEN ---help--- Periodically check for memory corruption in low memory, which is suspected to be caused by BIOS. Even when enabled in the @@ -1372,6 +1373,7 @@ config X86_BOOTPARAM_MEMORY_CORRUPTION_C config X86_RESERVE_LOW int "Amount of low memory, in kilobytes, to reserve for the BIOS" + depends on !XEN default 64 range 4 640 ---help--- @@ -1495,8 +1497,8 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAU config X86_PAT def_bool y - prompt "x86 PAT support" if EXPERT - depends on MTRR + prompt "x86 PAT support" if EXPERT || XEN_UNPRIVILEGED_GUEST + depends on MTRR || (XEN_UNPRIVILEGED_GUEST && XEN_PCIDEV_FRONTEND) ---help--- Use PAT attributes to setup page level cache control. @@ -2091,7 +2093,7 @@ config DMAR_FLOPPY_WA config INTR_REMAP bool "Support for Interrupt Remapping (EXPERIMENTAL)" - depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL + depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && !XEN && EXPERIMENTAL ---help--- Supports Interrupt remapping for IO-APIC and MSI devices. To use x2apic mode in the CPU's which support x2APIC enhancements or --- head-2011-03-17.orig/arch/x86/Kconfig.cpu 2011-03-03 17:48:58.000000000 +0100 +++ head-2011-03-17/arch/x86/Kconfig.cpu 2011-03-03 16:02:15.000000000 +0100 @@ -446,7 +446,7 @@ config CPU_SUP_INTEL config CPU_SUP_CYRIX_32 default y bool "Support Cyrix processors" if PROCESSOR_SELECT - depends on !64BIT + depends on !64BIT && !XEN ---help--- This enables detection, tunings and quirks for Cyrix processors @@ -486,7 +486,7 @@ config CPU_SUP_CENTAUR config CPU_SUP_TRANSMETA_32 default y bool "Support Transmeta processors" if PROCESSOR_SELECT - depends on !64BIT + depends on !64BIT && !XEN ---help--- This enables detection, tunings and quirks for Transmeta processors @@ -500,7 +500,7 @@ config CPU_SUP_TRANSMETA_32 config CPU_SUP_UMC_32 default y bool "Support UMC processors" if PROCESSOR_SELECT - depends on !64BIT + depends on !64BIT && !XEN ---help--- This enables detection, tunings and quirks for UMC processors --- head-2011-03-17.orig/arch/x86/Makefile 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/Makefile 2011-02-01 14:39:24.000000000 +0100 @@ -117,7 +117,7 @@ endif KBUILD_CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) # Xen subarch support -mflags-$(CONFIG_XEN) := -Iinclude/asm-x86/mach-xen +mflags-$(CONFIG_XEN) := -Iarch/x86/include/mach-xen mcore-$(CONFIG_XEN) := arch/x86/mach-xen/ KBUILD_CFLAGS += $(mflags-y) @@ -159,7 +159,7 @@ PHONY += bzImage vmlinuz $(BOOT_TARGETS) ifdef CONFIG_XEN KBUILD_CPPFLAGS := -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION) \ - -Iinclude$(if $(KBUILD_SRC),2)/asm/mach-xen $(KBUILD_CPPFLAGS) + -I$(srctree)/arch/x86/include/mach-xen $(KBUILD_CPPFLAGS) ifdef CONFIG_X86_64 LDFLAGS_vmlinux := -e startup_64 --- head-2011-03-17.orig/arch/x86/ia32/ia32entry-xen.S 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/ia32/ia32entry-xen.S 2011-02-01 14:39:24.000000000 +0100 @@ -39,11 +39,11 @@ .endm /* clobbers %eax */ - .macro CLEAR_RREGS + .macro CLEAR_RREGS _r9=rax xorl %eax,%eax movq %rax,R11(%rsp) movq %rax,R10(%rsp) - movq %rax,R9(%rsp) + movq %\_r9,R9(%rsp) movq %rax,R8(%rsp) .endm @@ -52,11 +52,10 @@ * We don't reload %eax because syscall_trace_enter() returned * the value it wants us to use in the table lookup. */ - .macro LOAD_ARGS32 offset - movl \offset(%rsp),%r11d - movl \offset+8(%rsp),%r10d + .macro LOAD_ARGS32 offset, _r9=0 + .if \_r9 movl \offset+16(%rsp),%r9d - movl \offset+24(%rsp),%r8d + .endif movl \offset+40(%rsp),%ecx movl \offset+48(%rsp),%edx movl \offset+56(%rsp),%esi @@ -135,7 +134,7 @@ ENTRY(ia32_sysenter_target) SAVE_ARGS 0,0,1 /* no need to do an access_ok check here because rbp has been 32bit zero extended */ -1: movl (%rbp),%r9d +1: movl (%rbp),%ebp .section __ex_table,"a" .quad 1b,ia32_badarg .previous @@ -146,7 +145,7 @@ ENTRY(ia32_sysenter_target) cmpl $(IA32_NR_syscalls-1),%eax ja ia32_badsys sysenter_do_call: - IA32_ARG_FIXUP 1 + IA32_ARG_FIXUP sysenter_dispatch: call *ia32_sys_call_table(,%rax,8) movq %rax,RAX-ARGOFFSET(%rsp) @@ -204,20 +203,17 @@ sysexit_audit: #endif sysenter_tracesys: - xchgl %r9d,%ebp #ifdef CONFIG_AUDITSYSCALL testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) jz sysenter_auditsys #endif SAVE_REST CLEAR_RREGS - movq %r9,R9(%rsp) movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ RESTORE_REST - xchgl %ebp,%r9d cmpl $(IA32_NR_syscalls-1),%eax ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ jmp sysenter_do_call @@ -272,9 +268,9 @@ ENTRY(ia32_cstar_target) orl $TS_COMPAT,TI_status(%r10) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) jnz cstar_tracesys -cstar_do_call: cmpl $IA32_NR_syscalls-1,%eax ja ia32_badsys +cstar_do_call: IA32_ARG_FIXUP 1 cstar_dispatch: call *ia32_sys_call_table(,%rax,8) @@ -303,15 +299,13 @@ cstar_tracesys: #endif xchgl %r9d,%ebp SAVE_REST - CLEAR_RREGS - movq %r9,R9(%rsp) + CLEAR_RREGS r9 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter - LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ + LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */ RESTORE_REST xchgl %ebp,%r9d - movl RSP-ARGOFFSET(%rsp), %r8d cmpl $(IA32_NR_syscalls-1),%eax ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ jmp cstar_do_call @@ -522,8 +516,8 @@ ia32_sys_call_table: .quad compat_sys_setrlimit /* 75 */ .quad compat_sys_old_getrlimit /* old_getrlimit */ .quad compat_sys_getrusage - .quad sys32_gettimeofday - .quad sys32_settimeofday + .quad compat_sys_gettimeofday + .quad compat_sys_settimeofday .quad sys_getgroups16 /* 80 */ .quad sys_setgroups16 .quad sys32_old_select --- head-2011-03-17.orig/arch/x86/include/asm/agp.h 2011-03-17 14:35:46.000000000 +0100 +++ head-2011-03-17/arch/x86/include/asm/agp.h 2011-02-01 14:39:24.000000000 +0100 @@ -15,6 +15,9 @@ #define map_page_into_agp(page) set_pages_uc(page, 1) #define unmap_page_from_agp(page) set_pages_wb(page, 1) +#define map_pages_into_agp set_pages_array_uc +#define unmap_pages_from_agp set_pages_array_wb + /* * Could use CLFLUSH here if the cpu supports it. But then it would * need to be called for each cacheline of the whole page so it may --- head-2011-03-17.orig/arch/x86/include/asm/cpufeature.h 2011-03-17 14:35:46.000000000 +0100 +++ head-2011-03-17/arch/x86/include/asm/cpufeature.h 2011-02-01 14:39:24.000000000 +0100 @@ -276,7 +276,11 @@ extern const char * const x86_power_flag #define cpu_has_xmm4_1 boot_cpu_has(X86_FEATURE_XMM4_1) #define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2) #define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) +#ifndef CONFIG_XEN #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) +#else +#define cpu_has_xsave boot_cpu_has(X86_FEATURE_OSXSAVE) +#endif #define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) #define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ) --- head-2011-03-17.orig/arch/x86/include/asm/hw_irq.h 2011-03-17 14:35:46.000000000 +0100 +++ head-2011-03-17/arch/x86/include/asm/hw_irq.h 2011-02-01 14:39:24.000000000 +0100 @@ -128,6 +128,7 @@ extern void smp_error_interrupt(struct p extern asmlinkage void smp_irq_move_cleanup_interrupt(void); #endif #ifdef CONFIG_SMP +#ifndef CONFIG_XEN extern void smp_reschedule_interrupt(struct pt_regs *); extern void smp_call_function_interrupt(struct pt_regs *); extern void smp_call_function_single_interrupt(struct pt_regs *); @@ -136,6 +137,12 @@ extern void smp_invalidate_interrupt(str #else extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *); #endif +#else +#include +extern irqreturn_t smp_reschedule_interrupt(int, void *); +extern irqreturn_t smp_call_function_interrupt(int, void *); +extern irqreturn_t smp_call_function_single_interrupt(int, void *); +#endif #endif extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void); --- head-2011-03-17.orig/arch/x86/include/asm/segment.h 2011-03-17 14:35:46.000000000 +0100 +++ head-2011-03-17/arch/x86/include/asm/segment.h 2011-02-01 14:39:24.000000000 +0100 @@ -186,7 +186,9 @@ #define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8+3) #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8+3) -#ifndef CONFIG_PARAVIRT +#if defined(CONFIG_X86_XEN) +#define get_kernel_rpl() (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) +#elif !defined(CONFIG_PARAVIRT) #define get_kernel_rpl() 0 #endif --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/agp.h 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/agp.h 2011-02-01 14:39:24.000000000 +0100 @@ -21,6 +21,23 @@ /* only a fallback: xen_destroy_contiguous_region uses PAGE_KERNEL */ \ set_pages_wb(page, 1)) +#define map_pages_into_agp(pages, nr) ({ \ + __typeof__(nr) n__; \ + int rc__ = 0; \ + for (n__ = 0; n__ < (nr) && !rc__; ++n__) \ + rc__ = xen_create_contiguous_region( \ + (unsigned long)page_address((pages)[n__]), 0, 32); \ + rc__ ?: set_pages_array_uc(pages, nr); \ +}) +#define unmap_pages_from_agp(pages, nr) ({ \ + __typeof__(nr) n__; \ + for (n__ = 0; n__ < nr; ++n__) \ + xen_destroy_contiguous_region( \ + (unsigned long)page_address((pages)[n__]), 0); \ + /* only a fallback: xen_destroy_contiguous_region uses PAGE_KERNEL */ \ + set_pages_array_wb(pages, nr); \ +}) + /* * Could use CLFLUSH here if the cpu supports it. But then it would * need to be called for each cacheline of the whole page so it may @@ -40,4 +57,4 @@ #define free_gatt_pages(table, order) \ dma_free_coherent(NULL,PAGE_SIZE<<(order),(table),virt_to_bus(table)) -#endif +#endif /* _ASM_X86_AGP_H */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/desc.h 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/desc.h 2011-02-01 14:39:24.000000000 +0100 @@ -1,5 +1,5 @@ -#ifndef _ASM_DESC_H_ -#define _ASM_DESC_H_ +#ifndef _ASM_X86_DESC_H +#define _ASM_X86_DESC_H #ifndef __ASSEMBLY__ #include @@ -24,6 +24,11 @@ static inline void fill_ldt(struct desc_ desc->d = info->seg_32bit; desc->g = info->limit_in_pages; desc->base2 = (info->base_addr & 0xff000000) >> 24; + /* + * Don't allow setting of the lm bit. It is useless anyway + * because 64bit system calls require __USER_CS: + */ + desc->l = 0; } #ifndef CONFIG_X86_NO_IDT @@ -98,6 +103,14 @@ static inline int desc_empty(const void #define write_idt_entry(dt, entry, g) \ native_write_idt_entry(dt, entry, g) +static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries) +{ +} + +static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries) +{ +} + static inline void native_write_idt_entry(gate_desc *idt, int entry, const gate_desc *gate) { @@ -360,20 +373,16 @@ static inline void set_system_intr_gate( _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS); } -static inline void set_trap_gate(unsigned int n, void *addr) +static inline void set_system_trap_gate(unsigned int n, void *addr) { BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS); + _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS); } -static inline void set_system_gate(unsigned int n, void *addr) +static inline void set_trap_gate(unsigned int n, void *addr) { BUG_ON((unsigned)n > 0xFF); -#ifdef CONFIG_X86_32 - _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS); -#else - _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS); -#endif + _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS); } static inline void set_task_gate(unsigned int n, unsigned int gdt_entry) @@ -388,7 +397,7 @@ static inline void set_intr_gate_ist(int _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS); } -static inline void set_system_gate_ist(int n, void *addr, unsigned ist) +static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist) { BUG_ON((unsigned)n > 0xFF); _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); @@ -420,4 +429,4 @@ static inline void set_system_gate_ist(i #endif /* __ASSEMBLY__ */ -#endif +#endif /* _ASM_X86_DESC_H */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/dma-mapping.h 2011-01-31 18:07:35.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/dma-mapping.h 2011-02-01 14:39:24.000000000 +0100 @@ -1,17 +1,12 @@ -#ifndef _ASM_DMA_MAPPING_H_ +#ifndef _ASM_X86_DMA_MAPPING_H_ #include_next -static inline int -address_needs_mapping(struct device *hwdev, dma_addr_t addr) -{ - dma_addr_t mask = 0xffffffff; - /* If the device has a mask, use it, otherwise default to 32 bits */ - if (hwdev && hwdev->dma_mask) - mask = *hwdev->dma_mask; - return (addr & ~mask) != 0; -} +void dma_generic_free_coherent(struct device *, size_t, void *, dma_addr_t); + +#define address_needs_mapping(hwdev, addr, size) \ + !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size) extern int range_straddles_page_boundary(paddr_t p, size_t size); -#endif /* _ASM_DMA_MAPPING_H_ */ +#endif /* _ASM_X86_DMA_MAPPING_H_ */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/fixmap.h 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/fixmap.h 2011-02-01 14:39:24.000000000 +0100 @@ -1,5 +1,5 @@ -#ifndef _ASM_FIXMAP_H -#define _ASM_FIXMAP_H +#ifndef _ASM_X86_FIXMAP_H +#define _ASM_X86_FIXMAP_H #ifdef CONFIG_X86_32 # include "fixmap_32.h" @@ -9,6 +9,10 @@ extern int fixmaps_set; +extern pte_t *kmap_pte; +extern pgprot_t kmap_prot; +extern pte_t *pkmap_page_table; + void xen_set_fixmap(enum fixed_addresses, maddr_t, pgprot_t); static inline void __set_fixmap(enum fixed_addresses idx, @@ -61,4 +65,4 @@ static inline unsigned long virt_to_fix( BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START); return __virt_to_fix(vaddr); } -#endif +#endif /* _ASM_X86_FIXMAP_H */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/fixmap_32.h 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/fixmap_32.h 2011-02-01 14:39:24.000000000 +0100 @@ -10,8 +10,8 @@ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 */ -#ifndef _ASM_FIXMAP_32_H -#define _ASM_FIXMAP_32_H +#ifndef _ASM_X86_FIXMAP_32_H +#define _ASM_X86_FIXMAP_32_H /* used by vmalloc.c, vsyscall.lds.S. * @@ -27,10 +27,8 @@ extern unsigned long __FIXADDR_TOP; #include #include #include -#ifdef CONFIG_HIGHMEM #include #include -#endif /* * Here we define all the compile-time 'special' virtual @@ -81,10 +79,8 @@ enum fixed_addresses { #ifdef CONFIG_X86_CYCLONE_TIMER FIX_CYCLONE_TIMER, /*cyclone timer register*/ #endif -#ifdef CONFIG_HIGHMEM FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, -#endif #ifdef CONFIG_PCI_MMCONFIG FIX_PCIE_MCFG, #endif @@ -100,10 +96,10 @@ enum fixed_addresses { * can have a single pgd entry and a single pte table: */ #define NR_FIX_BTMAPS 64 -#define FIX_BTMAPS_NESTING 4 +#define FIX_BTMAPS_SLOTS 4 FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 - (__end_of_permanent_fixed_addresses & 255), - FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1, + FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1, FIX_WP_TEST, #ifdef CONFIG_ACPI FIX_ACPI_BEGIN, @@ -126,4 +122,4 @@ extern void reserve_top_address(unsigned #define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE) #endif /* !__ASSEMBLY__ */ -#endif +#endif /* _ASM_X86_FIXMAP_32_H */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/fixmap_64.h 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/fixmap_64.h 2011-02-01 14:39:24.000000000 +0100 @@ -8,8 +8,8 @@ * Copyright (C) 1998 Ingo Molnar */ -#ifndef _ASM_FIXMAP_64_H -#define _ASM_FIXMAP_64_H +#ifndef _ASM_X86_FIXMAP_64_H +#define _ASM_X86_FIXMAP_64_H #include #include @@ -47,6 +47,10 @@ enum fixed_addresses { #ifndef CONFIG_XEN FIX_IO_APIC_BASE_0, FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, +#else +#define NR_FIX_ISAMAPS 256 + FIX_ISAMAP_END, + FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1, #endif #ifdef CONFIG_EFI FIX_EFI_IO_MAP_LAST_PAGE, @@ -58,29 +62,26 @@ enum fixed_addresses { #else FIX_SHARED_INFO, #endif + __end_of_permanent_fixed_addresses, #ifdef CONFIG_ACPI FIX_ACPI_BEGIN, FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1, #endif -#define NR_FIX_ISAMAPS 256 - FIX_ISAMAP_END, - FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1, #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT FIX_OHCI1394_BASE, #endif - __end_of_permanent_fixed_addresses, /* * 256 temporary boot-time mappings, used by early_ioremap(), * before ioremap() is functional. * - * We round it up to the next 512 pages boundary so that we + * We round it up to the next 256 pages boundary so that we * can have a single pgd entry and a single pte table: */ #define NR_FIX_BTMAPS 64 -#define FIX_BTMAPS_NESTING 4 - FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 512 - - (__end_of_permanent_fixed_addresses & 511), - FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1, +#define FIX_BTMAPS_SLOTS 4 + FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 - + (__end_of_permanent_fixed_addresses & 255), + FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1, __end_of_fixed_addresses }; @@ -92,4 +93,4 @@ enum fixed_addresses { #define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL) #define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE) -#endif +#endif /* _ASM_X86_FIXMAP_64_H */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/highmem.h 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/highmem.h 2011-02-01 14:39:24.000000000 +0100 @@ -15,8 +15,8 @@ * Copyright (C) 1999 Ingo Molnar */ -#ifndef _ASM_HIGHMEM_H -#define _ASM_HIGHMEM_H +#ifndef _ASM_X86_HIGHMEM_H +#define _ASM_X86_HIGHMEM_H #ifdef __KERNEL__ @@ -24,14 +24,11 @@ #include #include #include +#include /* declarations for highmem.c */ extern unsigned long highstart_pfn, highend_pfn; -extern pte_t *kmap_pte; -extern pgprot_t kmap_prot; -extern pte_t *pkmap_page_table; - /* * Right now we initialize only a single pte table. It can be extended * easily, subsequent pte tables have to be allocated in one physical @@ -95,4 +92,4 @@ static inline void copy_user_highpage(st #endif /* __KERNEL__ */ -#endif /* _ASM_HIGHMEM_H */ +#endif /* _ASM_X86_HIGHMEM_H */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/io.h 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/io.h 2011-02-01 14:39:24.000000000 +0100 @@ -5,20 +5,6 @@ #include -/* - * early_ioremap() and early_iounmap() are for temporary early boot-time - * mappings, before the real ioremap() is functional. - * A boot-time mapping is currently limited to at most 16 pages. - */ -#ifndef __ASSEMBLY__ -extern void early_ioremap_init(void); -extern void early_ioremap_clear(void); -extern void early_ioremap_reset(void); -extern void *early_ioremap(unsigned long offset, unsigned long size); -extern void early_iounmap(void *addr, unsigned long size); -extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys); -#endif - #define build_mmio_read(name, size, type, reg, barrier) \ static inline type name(const volatile void __iomem *addr) \ { type ret; asm volatile("mov" size " %1,%0":reg (ret) \ @@ -73,12 +59,14 @@ build_mmio_write(__writeq, "q", unsigned #define writeq writeq #endif +extern int iommu_bio_merge; + #define native_io_delay xen_io_delay #ifdef CONFIG_X86_32 -# include "../../io_32.h" +# include "../../asm/io_32.h" #else -# include "../../io_64.h" +# include "../../asm/io_64.h" #endif #if defined(__KERNEL__) && !defined(__ASSEMBLY__) @@ -95,7 +83,7 @@ build_mmio_write(__writeq, "q", unsigned (unsigned long)(bv)->bv_offset) #define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ - (bvec_to_phys(vec1) + (vec1)->bv_len == bvec_to_phys(vec2) \ + (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ && bvec_to_pseudophys(vec1) + (vec1)->bv_len \ == bvec_to_pseudophys(vec2)) @@ -134,8 +122,9 @@ extern void __iomem *ioremap_wc(unsigned extern void early_ioremap_init(void); extern void early_ioremap_clear(void); extern void early_ioremap_reset(void); -extern void *early_ioremap(unsigned long offset, unsigned long size); -extern void early_iounmap(void *addr, unsigned long size); +extern void __iomem *early_ioremap(unsigned long offset, unsigned long size); +extern void __iomem *early_memremap(unsigned long offset, unsigned long size); +extern void early_iounmap(void __iomem *addr, unsigned long size); extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys); --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/irq_vectors.h 2011-02-15 17:27:18.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/irq_vectors.h 2011-02-15 17:31:50.000000000 +0100 @@ -1,5 +1,5 @@ -#ifndef _ASM_IRQ_VECTORS_H -#define _ASM_IRQ_VECTORS_H +#ifndef _ASM_X86_IRQ_VECTORS_H +#define _ASM_X86_IRQ_VECTORS_H #ifdef CONFIG_X86_32 # define SYSCALL_VECTOR 0x80 @@ -47,6 +47,5 @@ #define NR_DYNIRQS 256 #define NR_IRQS (NR_PIRQS + NR_DYNIRQS) -#define NR_IRQ_VECTORS NR_IRQS -#endif /* _ASM_IRQ_VECTORS_H */ +#endif /* _ASM_X86_IRQ_VECTORS_H */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/irqflags.h 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/irqflags.h 2011-02-01 14:39:24.000000000 +0100 @@ -157,23 +157,6 @@ static inline int raw_irqs_disabled_flag raw_irqs_disabled_flags(flags); \ }) -/* - * makes the traced hardirq state match with the machine state - * - * should be a rarely used function, only in places where its - * otherwise impossible to know the irq state, like in traps. - */ -static inline void trace_hardirqs_fixup_flags(unsigned long flags) -{ - if (raw_irqs_disabled_flags(flags)) - trace_hardirqs_off(); - else - trace_hardirqs_on(); -} - -#define trace_hardirqs_fixup() \ - trace_hardirqs_fixup_flags(__raw_local_save_flags()) - #else #ifdef CONFIG_X86_64 --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/maddr.h 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/maddr.h 2011-02-01 14:39:24.000000000 +0100 @@ -59,10 +59,10 @@ static inline unsigned long mfn_to_pfn(u /* The array access can fail (e.g., device space beyond end of RAM). */ asm ( - "1: "_ASM_MOV_UL" %1,%0\n" + "1: "_ASM_MOV" %1,%0\n" "2:\n" ".section .fixup,\"ax\"\n" - "3: "_ASM_MOV_UL" %2,%0\n" + "3: "_ASM_MOV" %2,%0\n" " jmp 2b\n" ".previous\n" _ASM_EXTABLE(1b,3b) --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/mmu_context.h 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/mmu_context.h 2011-02-01 14:39:24.000000000 +0100 @@ -1,5 +1,5 @@ -#ifndef __ASM_X86_MMU_CONTEXT_H -#define __ASM_X86_MMU_CONTEXT_H +#ifndef _ASM_X86_MMU_CONTEXT_H +#define _ASM_X86_MMU_CONTEXT_H #include #include @@ -39,4 +39,4 @@ do { \ } while (0); -#endif /* __ASM_X86_MMU_CONTEXT_H */ +#endif /* _ASM_X86_MMU_CONTEXT_H */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/mmu_context_32.h 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/mmu_context_32.h 2011-02-01 14:39:24.000000000 +0100 @@ -1,5 +1,5 @@ -#ifndef __I386_SCHED_H -#define __I386_SCHED_H +#ifndef _ASM_X86_MMU_CONTEXT_32_H +#define _ASM_X86_MMU_CONTEXT_32_H static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { @@ -81,4 +81,4 @@ static inline void switch_mm(struct mm_s #define deactivate_mm(tsk, mm) \ asm("movl %0,%%gs": :"r" (0)); -#endif +#endif /* _ASM_X86_MMU_CONTEXT_32_H */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/mmu_context_64.h 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/mmu_context_64.h 2011-02-01 14:39:24.000000000 +0100 @@ -1,5 +1,5 @@ -#ifndef __X86_64_MMU_CONTEXT_H -#define __X86_64_MMU_CONTEXT_H +#ifndef _ASM_X86_MMU_CONTEXT_64_H +#define _ASM_X86_MMU_CONTEXT_64_H static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { @@ -103,4 +103,4 @@ do { \ asm volatile("movl %0,%%fs"::"r"(0)); \ } while (0) -#endif +#endif /* _ASM_X86_MMU_CONTEXT_64_H */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/pci.h 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/pci.h 2011-02-01 14:39:24.000000000 +0100 @@ -1,5 +1,5 @@ -#ifndef __x86_PCI_H -#define __x86_PCI_H +#ifndef _ASM_X86_PCI_H +#define _ASM_X86_PCI_H #include /* for struct page */ #include @@ -93,7 +93,7 @@ static inline void early_quirks(void) { #ifdef CONFIG_X86_32 # include "pci_32.h" #else -# include "pci_64.h" +# include "../../asm/pci_64.h" #endif /* implement the pci_ DMA API in terms of the generic device dma_ one */ @@ -117,4 +117,4 @@ static inline cpumask_t __pcibus_to_cpum } #endif -#endif +#endif /* _ASM_X86_PCI_H */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/pgalloc.h 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/pgalloc.h 2011-02-01 14:39:24.000000000 +0100 @@ -149,4 +149,4 @@ extern void __pud_free_tlb(struct mmu_ga #endif /* PAGETABLE_LEVELS > 3 */ #endif /* PAGETABLE_LEVELS > 2 */ -#endif /* _ASM_X86_PGALLOC_H */ +#endif /* _ASM_X86_PGALLOC_H */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/pgtable.h 2011-02-07 15:40:30.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/pgtable.h 2011-02-07 15:41:11.000000000 +0100 @@ -14,11 +14,11 @@ #define _PAGE_BIT_PAT 7 /* on 4KB pages */ #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ #define _PAGE_BIT_UNUSED1 9 /* available for programmer */ -#define _PAGE_BIT_UNUSED2 10 -#define _PAGE_BIT_IO 11 /* Mapped page is I/O or foreign and - * has no associated page struct. */ +#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */ +#define _PAGE_BIT_UNUSED3 11 #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ #define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 +#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ /* If _PAGE_BIT_PRESENT is clear, we use these: */ @@ -39,11 +39,12 @@ #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) #define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) -#define _PAGE_UNUSED2 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED2) -#define _PAGE_IO (_AT(pteval_t, 1) << _PAGE_BIT_IO) +#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) +#define _PAGE_UNUSED3 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED3) #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) +#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) #define __HAVE_ARCH_PTE_SPECIAL #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) @@ -69,7 +70,7 @@ extern unsigned int __kernel_page_user; _PAGE_DIRTY | __kernel_page_user) /* Set of bits not changed in pte_modify */ -#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_CACHE_MASK | _PAGE_IO | \ +#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_CACHE_MASK | _PAGE_IOMAP | \ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY) /* @@ -116,6 +117,11 @@ extern unsigned int __kernel_page_user; #define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE) #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) +#define __PAGE_KERNEL_IO (__PAGE_KERNEL | _PAGE_IOMAP) +#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE | _PAGE_IOMAP) +#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS | _PAGE_IOMAP) +#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC | _PAGE_IOMAP) + #define PAGE_KERNEL __pgprot(__PAGE_KERNEL) #define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) #define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) @@ -130,6 +136,11 @@ extern unsigned int __kernel_page_user; #define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) #define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE) +#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) +#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) +#define PAGE_KERNEL_IO_UC_MINUS __pgprot(__PAGE_KERNEL_IO_UC_MINUS) +#define PAGE_KERNEL_IO_WC __pgprot(__PAGE_KERNEL_IO_WC) + /* xwr */ #define __P000 PAGE_NONE #define __P001 PAGE_READONLY @@ -149,6 +160,22 @@ extern unsigned int __kernel_page_user; #define __S110 PAGE_SHARED_EXEC #define __S111 PAGE_SHARED_EXEC +/* + * early identity mapping pte attrib macros. + */ +#ifdef CONFIG_X86_64 +#define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC +#else +/* + * For PDE_IDENT_ATTR include USER bit. As the PDE and PTE protection + * bits are combined, this will alow user to access the high address mapped + * VDSO in the presence of CONFIG_COMPAT_VDSO + */ +#define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */ +#define PDE_IDENT_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */ +#define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */ +#endif + #ifndef __ASSEMBLY__ /* @@ -205,6 +232,15 @@ static inline int pte_special(pte_t pte) return pte_flags(pte) & _PAGE_SPECIAL; } +#define pte_mfn(_pte) ((_pte).pte_low & _PAGE_PRESENT ? \ + __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte))) +#define pte_pfn(_pte) ((_pte).pte_low & _PAGE_IOMAP ? max_mapnr : \ + (_pte).pte_low & _PAGE_PRESENT ? \ + mfn_to_local_pfn(__pte_mfn(_pte)) : \ + __pte_mfn(_pte)) + +#define pte_page(pte) pfn_to_page(pte_pfn(pte)) + static inline int pmd_large(pmd_t pte) { return (__pmd_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) == @@ -347,6 +383,9 @@ static inline void xen_pagetable_setup_s static inline void xen_pagetable_setup_done(pgd_t *base) {} #endif +struct seq_file; +extern void arch_report_meminfo(struct seq_file *m); + #define set_pte(ptep, pte) xen_set_pte(ptep, pte) #define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte) @@ -638,4 +677,4 @@ int create_lookup_pte_addr(struct mm_str #endif /* __ASSEMBLY__ */ -#endif /* _ASM_X86_PGTABLE_H */ +#endif /* _ASM_X86_PGTABLE_H */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/pgtable-3level.h 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/pgtable-3level.h 2011-02-01 14:39:24.000000000 +0100 @@ -1,5 +1,5 @@ -#ifndef _I386_PGTABLE_3LEVEL_H -#define _I386_PGTABLE_3LEVEL_H +#ifndef _ASM_X86_PGTABLE_3LEVEL_H +#define _ASM_X86_PGTABLE_3LEVEL_H /* * Intel Physical Address Extension (PAE) Mode - three-level page @@ -102,13 +102,13 @@ static inline void pud_clear(pud_t *pudp xen_tlb_flush(); } -#define pud_page(pud) ((struct page *) __va(pud_val(pud) & PTE_PFN_MASK)) +#define pud_page(pud) pfn_to_page(pud_val(pud) >> PAGE_SHIFT) #define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_PFN_MASK)) /* Find an entry in the second-level page table.. */ -#define pmd_offset(pud, address) ((pmd_t *)pud_page(*(pud)) + \ +#define pmd_offset(pud, address) ((pmd_t *)pud_page_vaddr(*(pud)) + \ pmd_index(address)) #ifdef CONFIG_SMP @@ -133,8 +133,6 @@ static inline int pte_same(pte_t a, pte_ return a.pte_low == b.pte_low && a.pte_high == b.pte_high; } -#define pte_page(x) pfn_to_page(pte_pfn(x)) - static inline int pte_none(pte_t pte) { return !(pte.pte_low | pte.pte_high); @@ -142,12 +140,6 @@ static inline int pte_none(pte_t pte) #define __pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) | \ ((_pte).pte_high << (32-PAGE_SHIFT))) -#define pte_mfn(_pte) ((_pte).pte_low & _PAGE_PRESENT ? \ - __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte))) -#define pte_pfn(_pte) ((_pte).pte_low & _PAGE_IO ? max_mapnr : \ - (_pte).pte_low & _PAGE_PRESENT ? \ - mfn_to_local_pfn(__pte_mfn(_pte)) : \ - __pte_mfn(_pte)) /* * Bits 0, 6 and 7 are taken in the low part of the pte, @@ -165,4 +157,4 @@ static inline int pte_none(pte_t pte) #define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high }) #define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } }) -#endif /* _I386_PGTABLE_3LEVEL_H */ +#endif /* _ASM_X86_PGTABLE_3LEVEL_H */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/pgtable-3level-defs.h 2011-01-31 17:32:29.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/pgtable-3level-defs.h 2011-02-01 14:39:24.000000000 +0100 @@ -1,5 +1,5 @@ -#ifndef _I386_PGTABLE_3LEVEL_DEFS_H -#define _I386_PGTABLE_3LEVEL_DEFS_H +#ifndef _ASM_X86_PGTABLE_3LEVEL_DEFS_H +#define _ASM_X86_PGTABLE_3LEVEL_DEFS_H #define SHARED_KERNEL_PMD 0 @@ -21,4 +21,4 @@ */ #define PTRS_PER_PTE 512 -#endif /* _I386_PGTABLE_3LEVEL_DEFS_H */ +#endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/pgtable_32.h 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/pgtable_32.h 2011-02-01 14:39:24.000000000 +0100 @@ -1,5 +1,5 @@ -#ifndef _I386_PGTABLE_H -#define _I386_PGTABLE_H +#ifndef _ASM_X86_PGTABLE_32_H +#define _ASM_X86_PGTABLE_32_H /* * The Linux memory management assumes a three-level page table setup. On @@ -29,6 +29,7 @@ static inline void pgtable_cache_init(vo static inline void check_pgt_cache(void) { } void paging_init(void); +extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t); /* * The Linux x86 paging architecture is 'compile-time dual-mode', it @@ -54,8 +55,7 @@ void paging_init(void); * area for the same reason. ;) */ #define VMALLOC_OFFSET (8 * 1024 * 1024) -#define VMALLOC_START (((unsigned long)high_memory + 2 * VMALLOC_OFFSET - 1) \ - & ~(VMALLOC_OFFSET - 1)) +#define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET) #ifdef CONFIG_X86_PAE #define LAST_PKMAP 512 #else @@ -71,6 +71,8 @@ void paging_init(void); # define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE) #endif +#define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE) + /* * Define this if things work differently on an i386 and an i486: * it will (on an i486) warn about kernel memory accesses that are @@ -195,4 +197,4 @@ void make_lowmem_page_writable(void *va, #define io_remap_pfn_range(vma, from, pfn, size, prot) \ direct_remap_pfn_range(vma, from, pfn, size, prot, DOMID_IO) -#endif /* _I386_PGTABLE_H */ +#endif /* _ASM_X86_PGTABLE_32_H */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/pgtable_64.h 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/pgtable_64.h 2011-02-01 14:39:24.000000000 +0100 @@ -1,5 +1,5 @@ -#ifndef _X86_64_PGTABLE_H -#define _X86_64_PGTABLE_H +#ifndef _ASM_X86_PGTABLE_64_H +#define _ASM_X86_PGTABLE_64_H #include #ifndef __ASSEMBLY__ @@ -65,14 +65,14 @@ extern void paging_init(void); printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", \ __FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e)) #define pmd_ERROR(e) \ - printk("%s:%d: bad pmd %p(%016lx pfn %010lx).\n", \ + printk("%s:%d: bad pmd %p(%016lx pfn %010Lx).\n", \ __FILE__, __LINE__, &(e), __pmd_val(e), pmd_pfn(e)) #define pud_ERROR(e) \ - printk("%s:%d: bad pud %p(%016lx pfn %010lx).\n", \ + printk("%s:%d: bad pud %p(%016lx pfn %010Lx).\n", \ __FILE__, __LINE__, &(e), __pud_val(e), \ (pud_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT) #define pgd_ERROR(e) \ - printk("%s:%d: bad pgd %p(%016lx pfn %010lx).\n", \ + printk("%s:%d: bad pgd %p(%016lx pfn %010Lx).\n", \ __FILE__, __LINE__, &(e), __pgd_val(e), \ (pgd_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT) @@ -181,14 +181,6 @@ static inline int pmd_bad(pmd_t pmd) #define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */ #define __pte_mfn(_pte) (((_pte).pte & PTE_PFN_MASK) >> PAGE_SHIFT) -#define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \ - __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte))) -#define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? max_mapnr : \ - (_pte).pte & _PAGE_PRESENT ? \ - mfn_to_local_pfn(__pte_mfn(_pte)) : \ - __pte_mfn(_pte)) - -#define pte_page(x) pfn_to_page(pte_pfn((x))) /* * Macro to mark a page protection value as "uncacheable". @@ -312,4 +304,4 @@ extern void cleanup_highmap(void); #define __HAVE_ARCH_PTE_SAME #endif /* !__ASSEMBLY__ */ -#endif /* _X86_64_PGTABLE_H */ +#endif /* _ASM_X86_PGTABLE_64_H */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/processor.h 2011-03-03 16:44:23.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/processor.h 2011-03-03 16:45:14.000000000 +0100 @@ -1,5 +1,5 @@ -#ifndef __ASM_X86_PROCESSOR_H -#define __ASM_X86_PROCESSOR_H +#ifndef _ASM_X86_PROCESSOR_H +#define _ASM_X86_PROCESSOR_H #include @@ -20,6 +20,7 @@ struct mm_struct; #include #include #include +#include #include #include @@ -72,21 +73,21 @@ struct cpuinfo_x86 { char rfu; char fdiv_bug; char f00f_bug; -#endif char coma_bug; char pad0; +#endif #else /* Number of 4K pages in DTLB/ITLB combined(in pages): */ int x86_tlbsize; __u8 x86_virt_bits; __u8 x86_phys_bits; +#endif #ifndef CONFIG_XEN /* CPUID returned core id bits: */ __u8 x86_coreid_bits; #endif /* Max extended CPUID function supported: */ __u32 extended_cpuid_level; -#endif /* Maximum supported CPUID level, -1=no CPUID: */ int cpuid_level; __u32 x86_capability[NCAPINTS]; @@ -150,6 +151,8 @@ DECLARE_PER_CPU(struct cpuinfo_x86, cpu_ #define current_cpu_data boot_cpu_data #endif +extern const struct seq_operations cpuinfo_op; + static inline int hlt_works(int cpu) { #if defined(CONFIG_X86_32) && !defined(CONFIG_XEN) @@ -163,6 +166,8 @@ static inline int hlt_works(int cpu) extern void cpu_detect(struct cpuinfo_x86 *c); +extern struct pt_regs *idle_regs(struct pt_regs *); + extern void early_cpu_init(void); extern void identify_boot_cpu(void); extern void identify_secondary_cpu(struct cpuinfo_x86 *); @@ -171,11 +176,8 @@ extern void init_scattered_cpuid_feature extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); extern unsigned short num_cache_leaves; -#if defined(CONFIG_X86_HT) || defined(CONFIG_X86_64) +extern void detect_extended_topology(struct cpuinfo_x86 *c); extern void detect_ht(struct cpuinfo_x86 *c); -#else -static inline void detect_ht(struct cpuinfo_x86 *c) {} -#endif static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) @@ -337,7 +339,12 @@ struct i387_fxsave_struct { /* 16*16 bytes for each XMM-reg = 256 bytes: */ u32 xmm_space[64]; - u32 padding[24]; + u32 padding[12]; + + union { + u32 padding1[12]; + u32 sw_reserved[12]; + }; } __attribute__((aligned(16))); @@ -361,10 +368,23 @@ struct i387_soft_struct { u32 entry_eip; }; +struct xsave_hdr_struct { + u64 xstate_bv; + u64 reserved1[2]; + u64 reserved2[5]; +} __attribute__((packed)); + +struct xsave_struct { + struct i387_fxsave_struct i387; + struct xsave_hdr_struct xsave_hdr; + /* new processor state extensions will go here */ +} __attribute__ ((packed, aligned (64))); + union thread_xstate { struct i387_fsave_struct fsave; struct i387_fxsave_struct fxsave; struct i387_soft_struct soft; + struct xsave_struct xsave; }; #if defined(CONFIG_X86_64) && !defined(CONFIG_X86_NO_TSS) @@ -422,9 +442,14 @@ struct thread_struct { unsigned io_bitmap_max; /* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */ unsigned long debugctlmsr; -/* Debug Store - if not 0 points to a DS Save Area configuration; - * goes into MSR_IA32_DS_AREA */ - unsigned long ds_area_msr; +#ifdef CONFIG_X86_DS +/* Debug Store context; see include/asm-x86/ds.h; goes into MSR_IA32_DS_AREA */ + struct ds_context *ds_ctx; +#endif /* CONFIG_X86_DS */ +#ifdef CONFIG_X86_PTRACE_BTS +/* the signal to send on a bts buffer overflow */ + unsigned int bts_ovfl_signal; +#endif /* CONFIG_X86_PTRACE_BTS */ }; static inline unsigned long xen_get_debugreg(int regno) @@ -512,41 +537,6 @@ static inline void clear_in_cr4(unsigned write_cr4(cr4); } -struct microcode_header { - unsigned int hdrver; - unsigned int rev; - unsigned int date; - unsigned int sig; - unsigned int cksum; - unsigned int ldrver; - unsigned int pf; - unsigned int datasize; - unsigned int totalsize; - unsigned int reserved[3]; -}; - -struct microcode { - struct microcode_header hdr; - unsigned int bits[0]; -}; - -typedef struct microcode microcode_t; -typedef struct microcode_header microcode_header_t; - -/* microcode format is extended from prescott processors */ -struct extended_signature { - unsigned int sig; - unsigned int pf; - unsigned int cksum; -}; - -struct extended_sigtable { - unsigned int count; - unsigned int cksum; - unsigned int reserved[3]; - struct extended_signature sigs[0]; -}; - typedef struct { unsigned long seg; } mm_segment_t; @@ -894,4 +884,4 @@ extern void start_thread(struct pt_regs extern int get_tsc_mode(unsigned long adr); extern int set_tsc_mode(unsigned int val); -#endif +#endif /* _ASM_X86_PROCESSOR_H */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/smp.h 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/smp.h 2011-02-01 14:39:24.000000000 +0100 @@ -1,5 +1,5 @@ -#ifndef _ASM_X86_SMP_H_ -#define _ASM_X86_SMP_H_ +#ifndef _ASM_X86_SMP_H +#define _ASM_X86_SMP_H #ifndef __ASSEMBLY__ #include #include @@ -34,7 +34,12 @@ extern cpumask_t cpu_initialized; DECLARE_PER_CPU(cpumask_t, cpu_sibling_map); DECLARE_PER_CPU(cpumask_t, cpu_core_map); DECLARE_PER_CPU(u16, cpu_llc_id); +#endif +#ifdef CONFIG_X86_32 +DECLARE_PER_CPU(int, cpu_number); +#endif +#ifndef CONFIG_XEN DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid); DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); #endif @@ -52,12 +57,16 @@ extern struct { struct smp_ops { void (*smp_prepare_boot_cpu)(void); void (*smp_prepare_cpus)(unsigned max_cpus); - int (*cpu_up)(unsigned cpu); void (*smp_cpus_done)(unsigned max_cpus); void (*smp_send_stop)(void); void (*smp_send_reschedule)(int cpu); + int (*cpu_up)(unsigned cpu); + int (*cpu_disable)(void); + void (*cpu_die)(unsigned int cpu); + void (*play_dead)(void); + void (*send_call_func_ipi)(cpumask_t mask); void (*send_call_func_single_ipi)(int cpu); }; @@ -92,6 +101,21 @@ static inline int __cpu_up(unsigned int return smp_ops.cpu_up(cpu); } +static inline int __cpu_disable(void) +{ + return smp_ops.cpu_disable(); +} + +static inline void __cpu_die(unsigned int cpu) +{ + smp_ops.cpu_die(cpu); +} + +static inline void play_dead(void) +{ + smp_ops.play_dead(); +} + static inline void smp_send_reschedule(int cpu) { smp_ops.smp_send_reschedule(cpu); @@ -107,13 +131,20 @@ static inline void arch_send_call_functi smp_ops.send_call_func_ipi(mask); } +void cpu_disable_common(void); void native_smp_prepare_boot_cpu(void); void native_smp_prepare_cpus(unsigned int max_cpus); void native_smp_cpus_done(unsigned int max_cpus); int native_cpu_up(unsigned int cpunum); +int native_cpu_disable(void); +void native_cpu_die(unsigned int cpu); +void native_play_dead(void); +void play_dead_common(void); #else /* CONFIG_XEN */ +extern int __cpu_disable(void); +extern void __cpu_die(unsigned int cpu); void xen_smp_send_stop(void); void xen_smp_send_reschedule(int cpu); void xen_send_call_func_ipi(cpumask_t mask); @@ -124,10 +155,11 @@ void xen_send_call_func_single_ipi(int c #define arch_send_call_function_single_ipi xen_send_call_func_single_ipi #define arch_send_call_function_ipi xen_send_call_func_ipi +void play_dead(void); + #endif /* CONFIG_XEN */ -extern int __cpu_disable(void); -extern void __cpu_die(unsigned int cpu); +extern void prefill_possible_map(void); void smp_store_cpu_info(int id); #define cpu_physical_id(cpu) (cpu) @@ -137,15 +169,11 @@ static inline int num_booting_cpus(void) { return cpus_weight(cpu_callout_map); } -#endif /* CONFIG_SMP */ - -#if defined(CONFIG_SMP) && (defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_XEN)) -extern void prefill_possible_map(void); #else static inline void prefill_possible_map(void) { } -#endif +#endif /* CONFIG_SMP */ extern unsigned disabled_cpus __cpuinitdata; @@ -155,7 +183,6 @@ extern unsigned disabled_cpus __cpuinitd * from the initial startup. We map APIC_BASE very early in page_setup(), * so this is correct in the x86 case. */ -DECLARE_PER_CPU(int, cpu_number); #define raw_smp_processor_id() (x86_read_percpu(cpu_number)) #define safe_smp_processor_id() smp_processor_id() @@ -178,30 +205,33 @@ DECLARE_PER_CPU(int, cpu_number); #ifdef CONFIG_X86_LOCAL_APIC +#ifndef CONFIG_X86_64 static inline int logical_smp_processor_id(void) { /* we don't want to mark this access volatile - bad code generation */ return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR)); } -#ifndef CONFIG_X86_64 +#include static inline unsigned int read_apic_id(void) { - return *(u32 *)(APIC_BASE + APIC_ID); + unsigned int reg; + + reg = *(u32 *)(APIC_BASE + APIC_ID); + + return GET_APIC_ID(reg); } -#else -extern unsigned int read_apic_id(void); #endif -# ifdef APIC_DEFINITION +# if defined(APIC_DEFINITION) || defined(CONFIG_X86_64) extern int hard_smp_processor_id(void); # else -# include +#include static inline int hard_smp_processor_id(void) { /* we don't want to mark this access volatile - bad code generation */ - return GET_APIC_ID(read_apic_id()); + return read_apic_id(); } # endif /* APIC_DEFINITION */ @@ -213,9 +243,11 @@ static inline int hard_smp_processor_id( #endif /* CONFIG_X86_LOCAL_APIC */ -#ifdef CONFIG_HOTPLUG_CPU -extern void cpu_uninit(void); +#ifdef CONFIG_X86_HAS_BOOT_CPU_ID +extern unsigned char boot_cpu_id; +#else +#define boot_cpu_id 0 #endif #endif /* __ASSEMBLY__ */ -#endif +#endif /* _ASM_X86_SMP_H */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/spinlock.h 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/spinlock.h 2011-02-01 14:39:24.000000000 +0100 @@ -1,5 +1,5 @@ -#ifndef _X86_SPINLOCK_H_ -#define _X86_SPINLOCK_H_ +#ifndef _ASM_X86_SPINLOCK_H +#define _ASM_X86_SPINLOCK_H #include #include @@ -453,4 +453,4 @@ static inline void __raw_write_unlock(ra #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() -#endif +#endif /* _ASM_X86_SPINLOCK_H */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/spinlock_types.h 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/spinlock_types.h 2011-02-01 14:39:24.000000000 +0100 @@ -1,5 +1,5 @@ -#ifndef __ASM_SPINLOCK_TYPES_H -#define __ASM_SPINLOCK_TYPES_H +#ifndef _ASM_X86_SPINLOCK_TYPES_H +#define _ASM_X86_SPINLOCK_TYPES_H #ifndef __LINUX_SPINLOCK_TYPES_H # error "please don't include this file directly" @@ -38,4 +38,4 @@ typedef struct { #define __RAW_RW_LOCK_UNLOCKED { RW_LOCK_BIAS } -#endif +#endif /* _ASM_X86_SPINLOCK_TYPES_H */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/system.h 2011-03-03 15:58:55.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/system.h 2011-03-03 16:01:23.000000000 +0100 @@ -1,5 +1,5 @@ -#ifndef _ASM_X86_SYSTEM_H_ -#define _ASM_X86_SYSTEM_H_ +#ifndef _ASM_X86_SYSTEM_H +#define _ASM_X86_SYSTEM_H #include #include @@ -65,7 +65,10 @@ do { \ \ /* regparm parameters for __switch_to(): */ \ [prev] "a" (prev), \ - [next] "d" (next)); \ + [next] "d" (next) \ + \ + : /* reloaded segment registers */ \ + "memory"); \ } while (0) #ifndef CONFIG_XEN @@ -405,4 +408,4 @@ static inline void rdtsc_barrier(void) alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); } -#endif +#endif /* _ASM_X86_SYSTEM_H */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/system_64.h 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/system_64.h 2011-02-01 14:39:24.000000000 +0100 @@ -1,5 +1,5 @@ -#ifndef __ASM_SYSTEM_H -#define __ASM_SYSTEM_H +#ifndef _ASM_X86_SYSTEM_64_H +#define _ASM_X86_SYSTEM_64_H #include #include @@ -17,4 +17,4 @@ static inline void write_cr8(unsigned lo #include -#endif +#endif /* _ASM_X86_SYSTEM_64_H */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/tlbflush.h 2011-01-31 18:07:35.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/tlbflush.h 2011-02-01 14:39:24.000000000 +0100 @@ -63,6 +63,10 @@ static inline void flush_tlb_range(struc __flush_tlb(); } +static inline void reset_lazy_tlbstate(void) +{ +} + #else /* SMP */ #include @@ -92,6 +96,12 @@ struct tlb_state { char __cacheline_padding[L1_CACHE_BYTES-8]; }; DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate); + +void reset_lazy_tlbstate(void); +#else +static inline void reset_lazy_tlbstate(void) +{ +} #endif #endif /* SMP */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/vga.h 2011-01-31 18:07:35.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/vga.h 2011-02-01 14:39:24.000000000 +0100 @@ -4,8 +4,8 @@ * (c) 1998 Martin Mares */ -#ifndef _LINUX_ASM_VGA_H_ -#define _LINUX_ASM_VGA_H_ +#ifndef _ASM_X86_VGA_H +#define _ASM_X86_VGA_H /* * On the PC, we can just recalculate addresses and then @@ -17,4 +17,4 @@ #define vga_readb(x) (*(x)) #define vga_writeb(x, y) (*(y) = (x)) -#endif +#endif /* _ASM_X86_VGA_H */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/xor.h 2011-01-31 17:56:27.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/xor.h 2011-02-01 14:39:24.000000000 +0100 @@ -1,5 +1,5 @@ #ifdef CONFIG_X86_32 -# include "../../xor_32.h" +# include "../../asm/xor_32.h" #else # include "xor_64.h" #endif --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/xor_64.h 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/xor_64.h 2011-02-01 14:39:24.000000000 +0100 @@ -1,5 +1,5 @@ -#ifndef ASM_X86__XOR_64_H -#define ASM_X86__XOR_64_H +#ifndef _ASM_X86_XOR_64_H +#define _ASM_X86_XOR_64_H /* * x86-64 changes / gcc fixes from Andi Kleen. @@ -334,4 +334,4 @@ do { \ deals with a load to a line that is being prefetched. */ #define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse) -#endif /* ASM_X86__XOR_64_H */ +#endif /* _ASM_X86_XOR_64_H */ --- head-2011-03-17.orig/arch/x86/kernel/Makefile 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/Makefile 2011-02-01 14:39:24.000000000 +0100 @@ -129,7 +129,7 @@ ifeq ($(CONFIG_X86_64),y) time_64-$(CONFIG_XEN) += time_32.o endif -disabled-obj-$(CONFIG_XEN) := %_uv.o crash.o early-quirks.o hpet.o i8253.o \ - i8259.o irqinit_$(BITS).o pci-swiotlb_64.o reboot.o smpboot.o \ - tlb_$(BITS).o tsc.o tsc_sync.o vsmp_64.o +disabled-obj-$(CONFIG_XEN) := %_uv.o crash.o early-quirks.o genx2apic_%.o \ + hpet.o i8253.o i8259.o irqinit_$(BITS).o pci-swiotlb_64.o reboot.o \ + smpboot.o tlb_$(BITS).o tsc.o tsc_sync.o uv_%.o vsmp_64.o disabled-obj-$(CONFIG_XEN_UNPRIVILEGED_GUEST) += probe_roms_32.o --- head-2011-03-17.orig/arch/x86/kernel/acpi/sleep-xen.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/acpi/sleep-xen.c 2011-02-01 14:39:24.000000000 +0100 @@ -10,6 +10,7 @@ #include #include #include +#include #include "realmode/wakeup.h" #include "sleep.h" @@ -22,7 +23,7 @@ unsigned long acpi_realmode_flags; static unsigned long acpi_realmode; #if defined(CONFIG_SMP) && defined(CONFIG_64BIT) -static char temp_stack[10240]; +static char temp_stack[4096]; #endif #endif @@ -100,7 +101,9 @@ int acpi_save_state_mem(void) #else /* CONFIG_64BIT */ header->trampoline_segment = setup_trampoline() >> 4; #ifdef CONFIG_SMP - stack_start.sp = temp_stack + 4096; + stack_start.sp = temp_stack + sizeof(temp_stack); + early_gdt_descr.address = + (unsigned long)get_cpu_gdt_table(smp_processor_id()); #endif initial_code = (unsigned long)wakeup_long64; saved_magic = 0x123456789abcdef0; --- head-2011-03-17.orig/arch/x86/kernel/apic/apic-xen.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/apic/apic-xen.c 2011-02-24 15:49:32.000000000 +0100 @@ -1,60 +1,13 @@ /* - * Local APIC handling, local APIC timers - * - * (c) 1999, 2000 Ingo Molnar - * - * Fixes - * Maciej W. Rozycki : Bits for genuine 82489DX APICs; - * thanks to Eric Gilmore - * and Rolf G. Tews - * for testing these extensively. - * Maciej W. Rozycki : Various updates and fixes. - * Mikael Pettersson : Power Management for UP-APIC. - * Pavel Machek and - * Mikael Pettersson : PM converted to driver model. + * Local APIC handling stubs */ #include - -#include -#include -#include #include -#include -#include -#include -#include -#include -#include -#include -#include #include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "io_ports.h" - -#ifndef CONFIG_XEN -/* - * cpu_mask that denotes the CPUs that needs timer interrupt coming in as - * IPIs in place of local APIC timers - */ -static cpumask_t timer_bcast_ipi; -#endif - -/* - * Knob to control our willingness to enable the local APIC. - */ +#include +#include /* * Debug level, exported for io_apic.c @@ -64,21 +17,29 @@ unsigned int apic_verbosity; /* Have we found an MP table */ int smp_found_config; -#ifndef CONFIG_XEN -static int modern_apic(void) +static int __init apic_set_verbosity(char *arg) { - /* AMD systems use old APIC versions, so check the CPU */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 >= 0xf) - return 1; - return lapic_get_version() >= 0x14; -} -#endif /* !CONFIG_XEN */ + if (!arg) { +#ifdef CONFIG_X86_64 + skip_ioapic_setup = 0; + return 0; +#endif + return -EINVAL; + } -int get_physical_broadcast(void) -{ - return 0xff; + if (strcmp("debug", arg) == 0) + apic_verbosity = APIC_DEBUG; + else if (strcmp("verbose", arg) == 0) + apic_verbosity = APIC_VERBOSE; + else { + printk(KERN_WARNING "APIC Verbosity level %s not recognised" + " use apic=verbose or apic=debug\n", arg); + return -EINVAL; + } + + return 0; } +early_param("apic", apic_set_verbosity); int setup_profiling_timer(unsigned int multiplier) { @@ -93,9 +54,12 @@ int setup_profiling_timer(unsigned int m int __init APIC_init_uniprocessor(void) { #ifdef CONFIG_X86_IO_APIC - if (smp_found_config) - if (!skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); + if (smp_found_config && !skip_ioapic_setup && nr_ioapics) + setup_IO_APIC(); +# ifdef CONFIG_X86_64 + else + nr_ioapics = 0; +# endif #endif return 0; --- head-2011-03-17.orig/arch/x86/kernel/cpu/topology.c 2011-03-17 14:35:46.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/cpu/topology.c 2011-02-01 14:39:24.000000000 +0100 @@ -28,7 +28,7 @@ */ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) { -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) unsigned int eax, ebx, ecx, edx, sub_index; unsigned int ht_mask_width, core_plus_mask_width; unsigned int core_select_mask, core_level_siblings; --- head-2011-03-17.orig/arch/x86/kernel/cpu/common-xen.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/cpu/common-xen.c 2011-03-17 14:40:32.000000000 +0100 @@ -1,33 +1,73 @@ #include +#include +#include #include +#include +#include +#include +#include +#include #include #include -#include #include -#include -#include #include #include #include +#include #include #include #include #include #include +#include +#include #ifdef CONFIG_X86_LOCAL_APIC #include #include #include -#else +#include +#elif defined(CONFIG_X86_64_XEN) +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + #ifdef CONFIG_XEN +#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_LOCAL_APIC) #define phys_pkg_id(a,b) a #endif -#endif #include +#include +#endif #include "cpu.h" +static struct cpu_dev *this_cpu __cpuinitdata; + +#ifdef CONFIG_X86_64 +/* We need valid kernel segments for data and code in long mode too + * IRET will check the segment types kkeil 2000/10/28 + * Also sysret mandates a special GDT layout + */ +/* The TLS descriptors are currently at a different place compared to i386. + Hopefully nobody expects them at a fixed place (Wine?) */ DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { + [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, + [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, + [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, + [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, + [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, + [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, +} }; +#else +DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, @@ -63,17 +103,171 @@ DEFINE_PER_CPU(struct gdt_page, gdt_page #endif [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } }, } }; +#endif EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); -__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; - +#ifdef CONFIG_X86_32 static int cachesize_override __cpuinitdata = -1; + +static int __init cachesize_setup(char *str) +{ + get_option(&str, &cachesize_override); + return 1; +} +__setup("cachesize=", cachesize_setup); + +static int __init x86_fxsr_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_FXSR); + setup_clear_cpu_cap(X86_FEATURE_XMM); + return 1; +} +__setup("nofxsr", x86_fxsr_setup); + +static int __init x86_sep_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_SEP); + return 1; +} +__setup("nosep", x86_sep_setup); +#endif + +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN) +/* Standard macro to see if a specific flag is changeable */ +static inline int flag_is_changeable_p(u32 flag) +{ + u32 f1, f2; + + /* + * Cyrix and IDT cpus allow disabling of CPUID + * so the code below may return different results + * when it is executed before and after enabling + * the CPUID. Add "volatile" to not allow gcc to + * optimize the subsequent calls to this function. + */ + asm volatile ("pushfl\n\t" + "pushfl\n\t" + "popl %0\n\t" + "movl %0,%1\n\t" + "xorl %2,%0\n\t" + "pushl %0\n\t" + "popfl\n\t" + "pushfl\n\t" + "popl %0\n\t" + "popfl\n\t" + : "=&r" (f1), "=&r" (f2) + : "ir" (flag)); + + return ((f1^f2) & flag) != 0; +} + +/* Probe for the CPUID instruction */ +static int __cpuinit have_cpuid_p(void) +{ + return flag_is_changeable_p(X86_EFLAGS_ID); +} + static int disable_x86_serial_nr __cpuinitdata = 1; -struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; +static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) { + /* Disable processor serial number */ + unsigned long lo, hi; + rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi); + lo |= 0x200000; + wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi); + printk(KERN_NOTICE "CPU serial number disabled.\n"); + clear_cpu_cap(c, X86_FEATURE_PN); + + /* Disabling the serial number may affect the cpuid level */ + c->cpuid_level = cpuid_eax(0); + } +} + +static int __init x86_serial_nr_setup(char *s) +{ + disable_x86_serial_nr = 0; + return 1; +} +__setup("serialnumber", x86_serial_nr_setup); +#else +static inline int flag_is_changeable_p(u32 flag) +{ + return 1; +} +/* Probe for the CPUID instruction */ +static inline int have_cpuid_p(void) +{ + return 1; +} +static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) +{ +} +#endif + +/* + * Naming convention should be: [()] + * This table only is used unless init_() below doesn't set it; + * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used + * + */ + +/* Look up CPU names by table lookup. */ +static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) +{ + struct cpu_model_info *info; + + if (c->x86_model >= 16) + return NULL; /* Range check */ + + if (!this_cpu) + return NULL; + + info = this_cpu->c_models; + + while (info && info->family) { + if (info->family == c->x86) + return info->model_names[c->x86_model]; + info++; + } + return NULL; /* Not found */ +} + +__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; + +/* Current gdt points %fs at the "master" per-cpu area: after this, + * it's on the real one. */ +void switch_to_new_gdt(void) +{ + struct desc_ptr gdt_descr; + unsigned long va, frames[16]; + int f; + + gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); + gdt_descr.size = GDT_SIZE - 1; + + for (va = gdt_descr.address, f = 0; + va < gdt_descr.address + gdt_descr.size; + va += PAGE_SIZE, f++) { + frames[f] = virt_to_mfn(va); + make_lowmem_page_readonly( + (void *)va, XENFEAT_writable_descriptor_tables); + } + if (HYPERVISOR_set_gdt(frames, (gdt_descr.size + 1) / 8)) + BUG(); +#ifdef CONFIG_X86_32 + asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory"); +#endif +} + +static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; static void __cpuinit default_init(struct cpuinfo_x86 *c) { +#ifdef CONFIG_X86_64 + display_cacheinfo(c); +#else /* Not much we can do here... */ /* Check if at least it has cpuid */ if (c->cpuid_level == -1) { @@ -83,28 +277,22 @@ static void __cpuinit default_init(struc else if (c->x86 == 3) strcpy(c->x86_model_id, "386"); } +#endif } static struct cpu_dev __cpuinitdata default_cpu = { .c_init = default_init, .c_vendor = "Unknown", + .c_x86_vendor = X86_VENDOR_UNKNOWN, }; -static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; - -static int __init cachesize_setup(char *str) -{ - get_option(&str, &cachesize_override); - return 1; -} -__setup("cachesize=", cachesize_setup); -int __cpuinit get_model_name(struct cpuinfo_x86 *c) +static void __cpuinit get_model_name(struct cpuinfo_x86 *c) { unsigned int *v; char *p, *q; - if (cpuid_eax(0x80000000) < 0x80000004) - return 0; + if (c->extended_cpuid_level < 0x80000004) + return; v = (unsigned int *) c->x86_model_id; cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); @@ -123,30 +311,34 @@ int __cpuinit get_model_name(struct cpui while (q <= &c->x86_model_id[48]) *q++ = '\0'; /* Zero-pad the rest */ } - - return 1; } - void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) { - unsigned int n, dummy, ecx, edx, l2size; + unsigned int n, dummy, ebx, ecx, edx, l2size; - n = cpuid_eax(0x80000000); + n = c->extended_cpuid_level; if (n >= 0x80000005) { - cpuid(0x80000005, &dummy, &dummy, &ecx, &edx); + cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", - edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); - c->x86_cache_size = (ecx>>24)+(edx>>24); + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); + c->x86_cache_size = (ecx>>24) + (edx>>24); +#ifdef CONFIG_X86_64 + /* On K8 L1 TLB is inclusive, so don't count it */ + c->x86_tlbsize = 0; +#endif } if (n < 0x80000006) /* Some chips just has a large L1. */ return; - ecx = cpuid_ecx(0x80000006); + cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); l2size = ecx >> 16; +#ifdef CONFIG_X86_64 + c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); +#else /* do processor-specific cache resizing */ if (this_cpu->c_size_cache) l2size = this_cpu->c_size_cache(c, l2size); @@ -157,116 +349,106 @@ void __cpuinit display_cacheinfo(struct if (l2size == 0) return; /* Again, no L2 cache is possible */ +#endif c->x86_cache_size = l2size; printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", - l2size, ecx & 0xFF); + l2size, ecx & 0xFF); } -/* - * Naming convention should be: [()] - * This table only is used unless init_() below doesn't set it; - * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used - * - */ - -/* Look up CPU names by table lookup. */ -static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) +void __cpuinit detect_ht(struct cpuinfo_x86 *c) { - struct cpu_model_info *info; +#ifdef CONFIG_X86_HT + u32 eax, ebx, ecx, edx; + int index_msb, core_bits; - if (c->x86_model >= 16) - return NULL; /* Range check */ + if (!cpu_has(c, X86_FEATURE_HT)) + return; - if (!this_cpu) - return NULL; + if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) + goto out; - info = this_cpu->c_models; + if (cpu_has(c, X86_FEATURE_XTOPOLOGY)) + return; - while (info && info->family) { - if (info->family == c->x86) - return info->model_names[c->x86_model]; - info++; + cpuid(1, &eax, &ebx, &ecx, &edx); + + smp_num_siblings = (ebx & 0xff0000) >> 16; + + if (smp_num_siblings == 1) { + printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); + } else if (smp_num_siblings > 1) { + + if (smp_num_siblings > NR_CPUS) { + printk(KERN_WARNING "CPU: Unsupported number of siblings %d", + smp_num_siblings); + smp_num_siblings = 1; + return; + } + + index_msb = get_count_order(smp_num_siblings); +#ifdef CONFIG_X86_64 + c->phys_proc_id = phys_pkg_id(index_msb); +#else + c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb); +#endif + + smp_num_siblings = smp_num_siblings / c->x86_max_cores; + + index_msb = get_count_order(smp_num_siblings); + + core_bits = get_count_order(c->x86_max_cores); + +#ifdef CONFIG_X86_64 + c->cpu_core_id = phys_pkg_id(index_msb) & + ((1 << core_bits) - 1); +#else + c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) & + ((1 << core_bits) - 1); +#endif } - return NULL; /* Not found */ -} +out: + if ((c->x86_max_cores * smp_num_siblings) > 1) { + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", + c->phys_proc_id); + printk(KERN_INFO "CPU: Processor Core ID: %d\n", + c->cpu_core_id); + } +#endif +} -static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early) +static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) { char *v = c->x86_vendor_id; int i; static int printed; for (i = 0; i < X86_VENDOR_NUM; i++) { - if (cpu_devs[i]) { - if (!strcmp(v, cpu_devs[i]->c_ident[0]) || - (cpu_devs[i]->c_ident[1] && - !strcmp(v, cpu_devs[i]->c_ident[1]))) { - c->x86_vendor = i; - if (!early) - this_cpu = cpu_devs[i]; - return; - } + if (!cpu_devs[i]) + break; + + if (!strcmp(v, cpu_devs[i]->c_ident[0]) || + (cpu_devs[i]->c_ident[1] && + !strcmp(v, cpu_devs[i]->c_ident[1]))) { + this_cpu = cpu_devs[i]; + c->x86_vendor = this_cpu->c_x86_vendor; + return; } } + if (!printed) { printed++; - printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n"); + printk(KERN_ERR "CPU: vendor_id '%s' unknown, using generic init.\n", v); printk(KERN_ERR "CPU: Your system may be unstable.\n"); } + c->x86_vendor = X86_VENDOR_UNKNOWN; this_cpu = &default_cpu; } - -static int __init x86_fxsr_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_FXSR); - setup_clear_cpu_cap(X86_FEATURE_XMM); - return 1; -} -__setup("nofxsr", x86_fxsr_setup); - - -static int __init x86_sep_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_SEP); - return 1; -} -__setup("nosep", x86_sep_setup); - - -/* Standard macro to see if a specific flag is changeable */ -static inline int flag_is_changeable_p(u32 flag) -{ - u32 f1, f2; - - asm("pushfl\n\t" - "pushfl\n\t" - "popl %0\n\t" - "movl %0,%1\n\t" - "xorl %2,%0\n\t" - "pushl %0\n\t" - "popfl\n\t" - "pushfl\n\t" - "popl %0\n\t" - "popfl\n\t" - : "=&r" (f1), "=&r" (f2) - : "ir" (flag)); - - return ((f1^f2) & flag) != 0; -} - - -/* Probe for the CPUID instruction */ -static int __cpuinit have_cpuid_p(void) -{ - return flag_is_changeable_p(X86_EFLAGS_ID); -} - -void __init cpu_detect(struct cpuinfo_x86 *c) +void __cpuinit cpu_detect(struct cpuinfo_x86 *c) { /* Get vendor name */ cpuid(0x00000000, (unsigned int *)&c->cpuid_level, @@ -275,48 +457,85 @@ void __init cpu_detect(struct cpuinfo_x8 (unsigned int *)&c->x86_vendor_id[4]); c->x86 = 4; + /* Intel-defined flags: level 0x00000001 */ if (c->cpuid_level >= 0x00000001) { u32 junk, tfms, cap0, misc; cpuid(0x00000001, &tfms, &misc, &junk, &cap0); - c->x86 = (tfms >> 8) & 15; - c->x86_model = (tfms >> 4) & 15; + c->x86 = (tfms >> 8) & 0xf; + c->x86_model = (tfms >> 4) & 0xf; + c->x86_mask = tfms & 0xf; if (c->x86 == 0xf) c->x86 += (tfms >> 20) & 0xff; if (c->x86 >= 0x6) - c->x86_model += ((tfms >> 16) & 0xF) << 4; - c->x86_mask = tfms & 15; + c->x86_model += ((tfms >> 16) & 0xf) << 4; if (cap0 & (1<<19)) { - c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8; c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; + c->x86_cache_alignment = c->x86_clflush_size; } } } -static void __cpuinit early_get_cap(struct cpuinfo_x86 *c) + +static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) { u32 tfms, xlvl; - unsigned int ebx; + u32 ebx; - memset(&c->x86_capability, 0, sizeof c->x86_capability); - if (have_cpuid_p()) { - /* Intel-defined flags: level 0x00000001 */ - if (c->cpuid_level >= 0x00000001) { - u32 capability, excap; - cpuid(0x00000001, &tfms, &ebx, &excap, &capability); - c->x86_capability[0] = capability; - c->x86_capability[4] = excap; - } + /* Intel-defined flags: level 0x00000001 */ + if (c->cpuid_level >= 0x00000001) { + u32 capability, excap; + cpuid(0x00000001, &tfms, &ebx, &excap, &capability); + c->x86_capability[0] = capability; + c->x86_capability[4] = excap; + } - /* AMD-defined flags: level 0x80000001 */ - xlvl = cpuid_eax(0x80000000); - if ((xlvl & 0xffff0000) == 0x80000000) { - if (xlvl >= 0x80000001) { - c->x86_capability[1] = cpuid_edx(0x80000001); - c->x86_capability[6] = cpuid_ecx(0x80000001); - } + /* AMD-defined flags: level 0x80000001 */ + xlvl = cpuid_eax(0x80000000); + c->extended_cpuid_level = xlvl; + if ((xlvl & 0xffff0000) == 0x80000000) { + if (xlvl >= 0x80000001) { + c->x86_capability[1] = cpuid_edx(0x80000001); + c->x86_capability[6] = cpuid_ecx(0x80000001); } + } + +#ifdef CONFIG_X86_64 + if (c->extended_cpuid_level >= 0x80000008) { + u32 eax = cpuid_eax(0x80000008); + c->x86_virt_bits = (eax >> 8) & 0xff; + c->x86_phys_bits = eax & 0xff; } +#endif + + if (c->extended_cpuid_level >= 0x80000007) + c->x86_power = cpuid_edx(0x80000007); + +} + +static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_32 + int i; + /* + * First of all, decide if this is a 486 or higher + * It's a 486 if we can modify the AC flag + */ + if (flag_is_changeable_p(X86_EFLAGS_AC)) + c->x86 = 4; + else + c->x86 = 3; + + for (i = 0; i < X86_VENDOR_NUM; i++) + if (cpu_devs[i] && cpu_devs[i]->c_identify) { + c->x86_vendor_id[0] = 0; + cpu_devs[i]->c_identify(c); + if (c->x86_vendor_id[0]) { + get_cpu_vendor(c); + break; + } + } +#endif } /* @@ -328,25 +547,65 @@ static void __cpuinit early_get_cap(stru * WARNING: this function is only called on the BP. Don't add code here * that is supposed to run on all CPUs. */ -static void __init early_cpu_detect(void) +static void __init early_identify_cpu(struct cpuinfo_x86 *c) { - struct cpuinfo_x86 *c = &boot_cpu_data; - - c->x86_cache_alignment = 32; +#ifdef CONFIG_X86_64 + c->x86_clflush_size = 64; +#else c->x86_clflush_size = 32; +#endif + c->x86_cache_alignment = c->x86_clflush_size; + + memset(&c->x86_capability, 0, sizeof c->x86_capability); + c->extended_cpuid_level = 0; + + if (!have_cpuid_p()) + identify_cpu_without_cpuid(c); + /* cyrix could have cpuid enabled via c_identify()*/ if (!have_cpuid_p()) return; cpu_detect(c); - get_cpu_vendor(c, 1); + get_cpu_vendor(c); + + get_cpu_cap(c); + + if (this_cpu->c_early_init) + this_cpu->c_early_init(c); + + validate_pat_support(c); - early_get_cap(c); +#ifdef CONFIG_SMP + c->cpu_index = boot_cpu_id; +#endif +} + +void __init early_cpu_init(void) +{ + struct cpu_dev **cdev; + int count = 0; + + printk("KERNEL supported cpus:\n"); + for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { + struct cpu_dev *cpudev = *cdev; + unsigned int j; + + if (count >= X86_VENDOR_NUM) + break; + cpu_devs[count] = cpudev; + count++; + + for (j = 0; j < 2; j++) { + if (!cpudev->c_ident[j]) + continue; + printk(" %s %s\n", cpudev->c_vendor, + cpudev->c_ident[j]); + } + } - if (c->x86_vendor != X86_VENDOR_UNKNOWN && - cpu_devs[c->x86_vendor]->c_early_init) - cpu_devs[c->x86_vendor]->c_early_init(c); + early_identify_cpu(&boot_cpu_data); } /* @@ -364,88 +623,43 @@ static void __cpuinit detect_nopl(struct static void __cpuinit generic_identify(struct cpuinfo_x86 *c) { - u32 tfms, xlvl; - unsigned int ebx; + c->extended_cpuid_level = 0; - if (have_cpuid_p()) { - /* Get vendor name */ - cpuid(0x00000000, (unsigned int *)&c->cpuid_level, - (unsigned int *)&c->x86_vendor_id[0], - (unsigned int *)&c->x86_vendor_id[8], - (unsigned int *)&c->x86_vendor_id[4]); - - get_cpu_vendor(c, 0); - /* Initialize the standard set of capabilities */ - /* Note that the vendor-specific code below might override */ - /* Intel-defined flags: level 0x00000001 */ - if (c->cpuid_level >= 0x00000001) { - u32 capability, excap; - cpuid(0x00000001, &tfms, &ebx, &excap, &capability); - c->x86_capability[0] = capability; - c->x86_capability[4] = excap; - c->x86 = (tfms >> 8) & 15; - c->x86_model = (tfms >> 4) & 15; - if (c->x86 == 0xf) - c->x86 += (tfms >> 20) & 0xff; - if (c->x86 >= 0x6) - c->x86_model += ((tfms >> 16) & 0xF) << 4; - c->x86_mask = tfms & 15; -#ifndef CONFIG_XEN - c->initial_apicid = (ebx >> 24) & 0xFF; -#ifdef CONFIG_X86_HT - c->apicid = phys_pkg_id(c->initial_apicid, 0); - c->phys_proc_id = c->initial_apicid; -#else - c->apicid = c->initial_apicid; -#endif -#endif - if (test_cpu_cap(c, X86_FEATURE_CLFLSH)) - c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8; - } else { - /* Have CPUID level 0 only - unheard of */ - c->x86 = 4; - } + if (!have_cpuid_p()) + identify_cpu_without_cpuid(c); - /* AMD-defined flags: level 0x80000001 */ - xlvl = cpuid_eax(0x80000000); - if ((xlvl & 0xffff0000) == 0x80000000) { - if (xlvl >= 0x80000001) { - c->x86_capability[1] = cpuid_edx(0x80000001); - c->x86_capability[6] = cpuid_ecx(0x80000001); - } - if (xlvl >= 0x80000004) - get_model_name(c); /* Default name */ - } + /* cyrix could have cpuid enabled via c_identify()*/ + if (!have_cpuid_p()) + return; - init_scattered_cpuid_features(c); - detect_nopl(c); - } -} + cpu_detect(c); -static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) -{ - if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) { - /* Disable processor serial number */ - unsigned long lo, hi; - rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi); - lo |= 0x200000; - wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi); - printk(KERN_NOTICE "CPU serial number disabled.\n"); - clear_cpu_cap(c, X86_FEATURE_PN); + get_cpu_vendor(c); - /* Disabling the serial number may affect the cpuid level */ - c->cpuid_level = cpuid_eax(0); - } -} + get_cpu_cap(c); -static int __init x86_serial_nr_setup(char *s) -{ - disable_x86_serial_nr = 0; - return 1; -} -__setup("serialnumber", x86_serial_nr_setup); +#ifndef CONFIG_XEN + if (c->cpuid_level >= 0x00000001) { + c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; +#ifdef CONFIG_X86_32 +# ifdef CONFIG_X86_HT + c->apicid = phys_pkg_id(c->initial_apicid, 0); +# else + c->apicid = c->initial_apicid; +# endif +#endif + +#ifdef CONFIG_X86_HT + c->phys_proc_id = c->initial_apicid; +#endif + } +#endif + get_model_name(c); /* Default name */ + init_scattered_cpuid_features(c); + detect_nopl(c); +} /* * This does the hard work of actually picking apart the CPU stuff... @@ -457,34 +671,33 @@ static void __cpuinit identify_cpu(struc c->loops_per_jiffy = loops_per_jiffy; c->x86_cache_size = -1; c->x86_vendor = X86_VENDOR_UNKNOWN; - c->cpuid_level = -1; /* CPUID not detected */ c->x86_model = c->x86_mask = 0; /* So far unknown... */ c->x86_vendor_id[0] = '\0'; /* Unset */ c->x86_model_id[0] = '\0'; /* Unset */ #ifndef CONFIG_XEN c->x86_max_cores = 1; + c->x86_coreid_bits = 0; #endif +#ifdef CONFIG_X86_64 + c->x86_clflush_size = 64; +#else + c->cpuid_level = -1; /* CPUID not detected */ c->x86_clflush_size = 32; +#endif + c->x86_cache_alignment = c->x86_clflush_size; memset(&c->x86_capability, 0, sizeof c->x86_capability); if (boot_cpu_has(X86_FEATURE_SYSCALL32)) set_cpu_cap(c, X86_FEATURE_SYSCALL32); - if (!have_cpuid_p()) { - /* - * First of all, decide if this is a 486 or higher - * It's a 486 if we can modify the AC flag - */ - if (flag_is_changeable_p(X86_EFLAGS_AC)) - c->x86 = 4; - else - c->x86 = 3; - } - generic_identify(c); if (this_cpu->c_identify) this_cpu->c_identify(c); +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) + c->apicid = phys_pkg_id(0); +#endif + /* * Vendor-specific initialization. In this section we * canonicalize the feature flags, meaning if there are @@ -518,6 +731,10 @@ static void __cpuinit identify_cpu(struc c->x86, c->x86_model); } +#ifdef CONFIG_X86_64 + detect_ht(c); +#endif + /* * On SMP, boot_cpu_data holds the common feature set between * all CPUs; so make sure that we indicate which features are @@ -526,7 +743,7 @@ static void __cpuinit identify_cpu(struc */ if (c != &boot_cpu_data) { /* AND the already accumulated flags with these */ - for (i = 0 ; i < NCAPINTS ; i++) + for (i = 0; i < NCAPINTS; i++) boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; } @@ -534,72 +751,91 @@ static void __cpuinit identify_cpu(struc for (i = 0; i < NCAPINTS; i++) c->x86_capability[i] &= ~cleared_cpu_caps[i]; +#ifdef CONFIG_X86_MCE /* Init Machine Check Exception if available. */ mcheck_init(c); +#endif select_idle_routine(c); + +#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) + numa_add_cpu(smp_processor_id()); +#endif +} + +#ifdef CONFIG_X86_64 +static void vgetcpu_set_mode(void) +{ + if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) + vgetcpu_mode = VGETCPU_RDTSCP; + else + vgetcpu_mode = VGETCPU_LSL; } +#endif void __init identify_boot_cpu(void) { identify_cpu(&boot_cpu_data); +#ifdef CONFIG_X86_32 sysenter_setup(); enable_sep_cpu(); +#else + vgetcpu_set_mode(); +#endif } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) { BUG_ON(c == &boot_cpu_data); identify_cpu(c); +#ifdef CONFIG_X86_32 enable_sep_cpu(); +#endif mtrr_ap_init(); } -#ifdef CONFIG_X86_HT -void __cpuinit detect_ht(struct cpuinfo_x86 *c) -{ - u32 eax, ebx, ecx, edx; - int index_msb, core_bits; - - cpuid(1, &eax, &ebx, &ecx, &edx); - - if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY)) - return; +struct msr_range { + unsigned min; + unsigned max; +}; - smp_num_siblings = (ebx & 0xff0000) >> 16; +static struct msr_range msr_range_array[] __cpuinitdata = { + { 0x00000000, 0x00000418}, + { 0xc0000000, 0xc000040b}, + { 0xc0010000, 0xc0010142}, + { 0xc0011000, 0xc001103b}, +}; - if (smp_num_siblings == 1) { - printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); - } else if (smp_num_siblings > 1) { +static void __cpuinit print_cpu_msr(void) +{ + unsigned index; + u64 val; + int i; + unsigned index_min, index_max; - if (smp_num_siblings > NR_CPUS) { - printk(KERN_WARNING "CPU: Unsupported number of the " - "siblings %d", smp_num_siblings); - smp_num_siblings = 1; - return; + for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) { + index_min = msr_range_array[i].min; + index_max = msr_range_array[i].max; + for (index = index_min; index < index_max; index++) { + if (rdmsrl_amd_safe(index, &val)) + continue; + printk(KERN_INFO " MSR%08x: %016llx\n", index, val); } + } +} - index_msb = get_count_order(smp_num_siblings); - c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb); - - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", - c->phys_proc_id); - - smp_num_siblings = smp_num_siblings / c->x86_max_cores; - - index_msb = get_count_order(smp_num_siblings) ; - - core_bits = get_count_order(c->x86_max_cores); +static int show_msr __cpuinitdata; +static __init int setup_show_msr(char *arg) +{ + int num; - c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) & - ((1 << core_bits) - 1); + get_option(&arg, &num); - if (c->x86_max_cores > 1) - printk(KERN_INFO "CPU: Processor Core ID: %d\n", - c->cpu_core_id); - } + if (num > 0) + show_msr = num; + return 1; } -#endif +__setup("show_msr=", setup_show_msr); static __init int setup_noclflush(char *arg) { @@ -617,18 +853,26 @@ void __cpuinit print_cpu_info(struct cpu else if (c->cpuid_level >= 0) vendor = c->x86_vendor_id; - if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor))) - printk("%s ", vendor); + if (vendor && !strstr(c->x86_model_id, vendor)) + printk(KERN_CONT "%s ", vendor); - if (!c->x86_model_id[0]) - printk("%d86", c->x86); + if (c->x86_model_id[0]) + printk(KERN_CONT "%s", c->x86_model_id); else - printk("%s", c->x86_model_id); + printk(KERN_CONT "%d86", c->x86); if (c->x86_mask || c->cpuid_level >= 0) - printk(" stepping %02x\n", c->x86_mask); + printk(KERN_CONT " stepping %02x\n", c->x86_mask); else - printk("\n"); + printk(KERN_CONT "\n"); + +#ifdef CONFIG_SMP + if (c->cpu_index < show_msr) + print_cpu_msr(); +#else + if (show_msr) + print_cpu_msr(); +#endif } static __init int setup_disablecpuid(char *arg) @@ -644,19 +888,124 @@ __setup("clearcpuid=", setup_disablecpui cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; -void __init early_cpu_init(void) +#ifdef CONFIG_X86_64 +struct x8664_pda **_cpu_pda __read_mostly; +EXPORT_SYMBOL(_cpu_pda); + +#ifndef CONFIG_X86_NO_IDT +struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; +#endif + +char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; + +static void __ref switch_pt(int cpu) +{ +#ifdef CONFIG_XEN + if (cpu == 0) + xen_init_pt(); + xen_pt_switch(__pa_symbol(init_level4_pgt)); + xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt))); +#endif +} + +void __cpuinit pda_init(int cpu) +{ + struct x8664_pda *pda = cpu_pda(cpu); + + /* Setup up data that may be needed in __get_free_pages early */ + loadsegment(fs, 0); + loadsegment(gs, 0); +#ifndef CONFIG_XEN + /* Memory clobbers used to order PDA accessed */ + mb(); + wrmsrl(MSR_GS_BASE, pda); + mb(); +#else + if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL, + (unsigned long)pda)) + BUG(); +#endif + + pda->cpunumber = cpu; + pda->irqcount = -1; + pda->kernelstack = (unsigned long)stack_thread_info() - + PDA_STACKOFFSET + THREAD_SIZE; + pda->active_mm = &init_mm; + pda->mmu_state = 0; + + if (cpu == 0) { + /* others are initialized in smpboot.c */ + pda->pcurrent = &init_task; + pda->irqstackptr = boot_cpu_stack; + pda->irqstackptr += IRQSTACKSIZE - 64; + } else { + if (!pda->irqstackptr) { + pda->irqstackptr = (char *) + __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); + if (!pda->irqstackptr) + panic("cannot allocate irqstack for cpu %d", + cpu); + pda->irqstackptr += IRQSTACKSIZE - 64; + } + + if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) + pda->nodenumber = cpu_to_node(cpu); + } + + switch_pt(cpu); +} + +#ifndef CONFIG_X86_NO_TSS +char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + + DEBUG_STKSZ] __page_aligned_bss; +#endif + +extern asmlinkage void ignore_sysret(void); + +void __cpuinit syscall_init(void) { - struct cpu_vendor_dev *cvdev; +#ifndef CONFIG_XEN + /* + * LSTAR and STAR live in a bit strange symbiosis. + * They both write to the same internal register. STAR allows to + * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. + */ + wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); + wrmsrl(MSR_LSTAR, system_call); + wrmsrl(MSR_CSTAR, ignore_sysret); +#endif - for (cvdev = __x86cpuvendor_start ; - cvdev < __x86cpuvendor_end ; - cvdev++) - cpu_devs[cvdev->vendor] = cvdev->cpu_dev; +#ifdef CONFIG_IA32_EMULATION + syscall32_cpu_init(); +#elif defined(CONFIG_XEN) + static const struct callback_register __cpuinitconst cstar = { + .type = CALLBACKTYPE_syscall32, + .address = (unsigned long)ignore_sysret + }; - early_cpu_detect(); - validate_pat_support(&boot_cpu_data); + if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar)) + printk(KERN_WARNING "Unable to register CSTAR callback\n"); +#endif + +#ifndef CONFIG_XEN + /* Flags to clear on syscall */ + wrmsrl(MSR_SYSCALL_MASK, + X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL); +#endif } +unsigned long kernel_eflags; + +#ifndef CONFIG_X86_NO_TSS +/* + * Copies of the original ist values from the tss are only accessed during + * debugging, no special alignment required. + */ +DEFINE_PER_CPU(struct orig_ist, orig_ist); +#endif + +#else + /* Make sure %fs is initialized properly in idle threads */ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) { @@ -664,36 +1013,154 @@ struct pt_regs * __cpuinit idle_regs(str regs->fs = __KERNEL_PERCPU; return regs; } +#endif -/* Current gdt points %fs at the "master" per-cpu area: after this, - * it's on the real one. */ -void switch_to_new_gdt(void) +/* + * cpu_init() initializes state that is per-CPU. Some data is already + * initialized (naturally) in the bootstrap process, such as the GDT + * and IDT. We reload them nevertheless, this function acts as a + * 'CPU state barrier', nothing should get across. + * A lot of state is already set up in PDA init for 64 bit + */ +#ifdef CONFIG_X86_64 +void __cpuinit cpu_init(void) { - struct desc_ptr gdt_descr; - unsigned long va, frames[16]; - int f; + int cpu = stack_smp_processor_id(); +#ifndef CONFIG_X86_NO_TSS + struct tss_struct *t = &per_cpu(init_tss, cpu); + struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu); + unsigned long v; + char *estacks = NULL; + int i; +#endif + struct task_struct *me; - gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); - gdt_descr.size = GDT_SIZE - 1; + /* CPU 0 is initialised in head64.c */ + if (cpu != 0) + pda_init(cpu); +#ifndef CONFIG_X86_NO_TSS + else + estacks = boot_exception_stacks; +#endif - for (va = gdt_descr.address, f = 0; - va < gdt_descr.address + gdt_descr.size; - va += PAGE_SIZE, f++) { - frames[f] = virt_to_mfn(va); - make_lowmem_page_readonly( - (void *)va, XENFEAT_writable_descriptor_tables); + me = current; + + if (cpu_test_and_set(cpu, cpu_initialized)) + panic("CPU#%d already initialized!\n", cpu); + + printk(KERN_INFO "Initializing CPU#%d\n", cpu); + + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + + /* + * Initialize the per-CPU GDT with the boot GDT, + * and set up the GDT descriptor: + */ + + switch_to_new_gdt(); +#ifndef CONFIG_X86_NO_IDT + load_idt((const struct desc_ptr *)&idt_descr); +#endif + + memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); + syscall_init(); + + wrmsrl(MSR_FS_BASE, 0); + wrmsrl(MSR_KERNEL_GS_BASE, 0); + barrier(); + + check_efer(); +#ifndef CONFIG_XEN + if (cpu != 0 && x2apic) + enable_x2apic(); +#endif + +#ifndef CONFIG_X86_NO_TSS + /* + * set up and load the per-CPU TSS + */ + if (!orig_ist->ist[0]) { + static const unsigned int order[N_EXCEPTION_STACKS] = { + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, + [DEBUG_STACK - 1] = DEBUG_STACK_ORDER + }; + for (v = 0; v < N_EXCEPTION_STACKS; v++) { + if (cpu) { + estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); + if (!estacks) + panic("Cannot allocate exception " + "stack %ld %d\n", v, cpu); + } + estacks += PAGE_SIZE << order[v]; + orig_ist->ist[v] = t->x86_tss.ist[v] = + (unsigned long)estacks; + } } - if (HYPERVISOR_set_gdt(frames, (gdt_descr.size + 1) / 8)) + + t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); + /* + * <= is required because the CPU will access up to + * 8 bits beyond the end of the IO permission bitmap. + */ + for (i = 0; i <= IO_BITMAP_LONGS; i++) + t->io_bitmap[i] = ~0UL; +#endif + + atomic_inc(&init_mm.mm_count); + me->active_mm = &init_mm; + if (me->mm) BUG(); - asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory"); + enter_lazy_tlb(&init_mm, me); + + load_sp0(t, ¤t->thread); +#ifndef CONFIG_X86_NO_TSS + set_tss_desc(cpu, t); + load_TR_desc(); +#endif + load_LDT(&init_mm.context); + +#ifdef CONFIG_KGDB + /* + * If the kgdb is connected no debug regs should be altered. This + * is only applicable when KGDB and a KGDB I/O module are built + * into the kernel and you are using early debugging with + * kgdbwait. KGDB will control the kernel HW breakpoint registers. + */ + if (kgdb_connected && arch_kgdb_ops.correct_hw_break) + arch_kgdb_ops.correct_hw_break(); + else { +#endif + /* + * Clear all 6 debug registers: + */ + + set_debugreg(0UL, 0); + set_debugreg(0UL, 1); + set_debugreg(0UL, 2); + set_debugreg(0UL, 3); + set_debugreg(0UL, 6); + set_debugreg(0UL, 7); +#ifdef CONFIG_KGDB + /* If the kgdb is connected no debug regs should be altered. */ + } +#endif + + fpu_init(); + +#ifndef CONFIG_XEN + raw_local_save_flags(kernel_eflags); +#else + asm ("pushfq; popq %0" : "=rm" (kernel_eflags)); + if (raw_irqs_disabled()) + kernel_eflags &= ~X86_EFLAGS_IF; +#endif + + if (is_uv_system()) + uv_cpu_init(); } -/* - * cpu_init() initializes state that is per-CPU. Some data is already - * initialized (naturally) in the bootstrap process, such as the GDT - * and IDT. We reload them nevertheless, this function acts as a - * 'CPU state barrier', nothing should get across. - */ +#else + void __cpuinit cpu_init(void) { int cpu = smp_processor_id(); @@ -747,19 +1214,21 @@ void __cpuinit cpu_init(void) /* * Force FPU initialization: */ - current_thread_info()->status = 0; + if (cpu_has_xsave) + current_thread_info()->status = TS_XSAVE; + else + current_thread_info()->status = 0; clear_used_math(); mxcsr_feature_mask_init(); -} -#if defined(CONFIG_HOTPLUG_CPU) && !defined(CONFIG_XEN) -void __cpuinit cpu_uninit(void) -{ - int cpu = raw_smp_processor_id(); - cpu_clear(cpu, cpu_initialized); + /* + * Boot processor to setup the FP and extended state context info. + */ + if (smp_processor_id() == boot_cpu_id) + init_thread_xstate(); - /* lazy TLB state */ - per_cpu(cpu_tlbstate, cpu).state = 0; - per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; + xsave_init(); } + + #endif --- head-2011-03-17.orig/arch/x86/kernel/cpu/common_64-xen.c 2011-02-01 14:38:38.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,777 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef CONFIG_X86_LOCAL_APIC -#include -#include -#include -#elif defined(CONFIG_XEN) -#include -#endif -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "cpu.h" - -/* We need valid kernel segments for data and code in long mode too - * IRET will check the segment types kkeil 2000/10/28 - * Also sysret mandates a special GDT layout - */ -/* The TLS descriptors are currently at a different place compared to i386. - Hopefully nobody expects them at a fixed place (Wine?) */ -DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { - [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, - [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, - [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, - [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, - [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, - [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, -} }; -EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); - -__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; - -/* Current gdt points %fs at the "master" per-cpu area: after this, - * it's on the real one. */ -void switch_to_new_gdt(void) -{ -#ifndef CONFIG_XEN - struct desc_ptr gdt_descr; - - gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); - gdt_descr.size = GDT_SIZE - 1; - load_gdt(&gdt_descr); -#else - void *va, *gdt_addr = get_cpu_gdt_table(smp_processor_id()); - unsigned long frames[16]; - unsigned int f = 0; - - for (va = gdt_addr; va < gdt_addr + GDT_SIZE; va += PAGE_SIZE) { - frames[f++] = virt_to_mfn(va); - make_page_readonly(va, XENFEAT_writable_descriptor_tables); - } - if (HYPERVISOR_set_gdt(frames, GDT_SIZE / sizeof(struct desc_struct))) - BUG(); -#endif -} - -struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; - -static void __cpuinit default_init(struct cpuinfo_x86 *c) -{ - display_cacheinfo(c); -} - -static struct cpu_dev __cpuinitdata default_cpu = { - .c_init = default_init, - .c_vendor = "Unknown", -}; -static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; - -int __cpuinit get_model_name(struct cpuinfo_x86 *c) -{ - unsigned int *v; - - if (c->extended_cpuid_level < 0x80000004) - return 0; - - v = (unsigned int *) c->x86_model_id; - cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); - cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); - cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); - c->x86_model_id[48] = 0; - return 1; -} - - -void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) -{ - unsigned int n, dummy, ebx, ecx, edx; - - n = c->extended_cpuid_level; - - if (n >= 0x80000005) { - cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); - printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), " - "D cache %dK (%d bytes/line)\n", - edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); - c->x86_cache_size = (ecx>>24) + (edx>>24); - /* On K8 L1 TLB is inclusive, so don't count it */ - c->x86_tlbsize = 0; - } - - if (n >= 0x80000006) { - cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); - ecx = cpuid_ecx(0x80000006); - c->x86_cache_size = ecx >> 16; - c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); - - printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", - c->x86_cache_size, ecx & 0xFF); - } -} - -void __cpuinit detect_ht(struct cpuinfo_x86 *c) -{ -#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) - u32 eax, ebx, ecx, edx; - int index_msb, core_bits; - - cpuid(1, &eax, &ebx, &ecx, &edx); - - - if (!cpu_has(c, X86_FEATURE_HT)) - return; - if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) - goto out; - - smp_num_siblings = (ebx & 0xff0000) >> 16; - - if (smp_num_siblings == 1) { - printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); - } else if (smp_num_siblings > 1) { - - if (smp_num_siblings > NR_CPUS) { - printk(KERN_WARNING "CPU: Unsupported number of " - "siblings %d", smp_num_siblings); - smp_num_siblings = 1; - return; - } - - index_msb = get_count_order(smp_num_siblings); - c->phys_proc_id = phys_pkg_id(index_msb); - - smp_num_siblings = smp_num_siblings / c->x86_max_cores; - - index_msb = get_count_order(smp_num_siblings); - - core_bits = get_count_order(c->x86_max_cores); - - c->cpu_core_id = phys_pkg_id(index_msb) & - ((1 << core_bits) - 1); - } -out: - if ((c->x86_max_cores * smp_num_siblings) > 1) { - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", - c->phys_proc_id); - printk(KERN_INFO "CPU: Processor Core ID: %d\n", - c->cpu_core_id); - } - -#endif -} - -static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) -{ - char *v = c->x86_vendor_id; - int i; - static int printed; - - for (i = 0; i < X86_VENDOR_NUM; i++) { - if (cpu_devs[i]) { - if (!strcmp(v, cpu_devs[i]->c_ident[0]) || - (cpu_devs[i]->c_ident[1] && - !strcmp(v, cpu_devs[i]->c_ident[1]))) { - c->x86_vendor = i; - this_cpu = cpu_devs[i]; - return; - } - } - } - if (!printed) { - printed++; - printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n"); - printk(KERN_ERR "CPU: Your system may be unstable.\n"); - } - c->x86_vendor = X86_VENDOR_UNKNOWN; -} - -static void __init early_cpu_support_print(void) -{ - int i,j; - struct cpu_dev *cpu_devx; - - printk("KERNEL supported cpus:\n"); - for (i = 0; i < X86_VENDOR_NUM; i++) { - cpu_devx = cpu_devs[i]; - if (!cpu_devx) - continue; - for (j = 0; j < 2; j++) { - if (!cpu_devx->c_ident[j]) - continue; - printk(" %s %s\n", cpu_devx->c_vendor, - cpu_devx->c_ident[j]); - } - } -} - -/* - * The NOPL instruction is supposed to exist on all CPUs with - * family >= 6, unfortunately, that's not true in practice because - * of early VIA chips and (more importantly) broken virtualizers that - * are not easy to detect. Hence, probe for it based on first - * principles. - * - * Note: no 64-bit chip is known to lack these, but put the code here - * for consistency with 32 bits, and to make it utterly trivial to - * diagnose the problem should it ever surface. - */ -static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) -{ - const u32 nopl_signature = 0x888c53b1; /* Random number */ - u32 has_nopl = nopl_signature; - - clear_cpu_cap(c, X86_FEATURE_NOPL); - if (c->x86 >= 6) { - asm volatile("\n" - "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */ - "2:\n" - " .section .fixup,\"ax\"\n" - "3: xor %0,%0\n" - " jmp 2b\n" - " .previous\n" - _ASM_EXTABLE(1b,3b) - : "+a" (has_nopl)); - - if (has_nopl == nopl_signature) - set_cpu_cap(c, X86_FEATURE_NOPL); - } -} - -static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c); - -void __init early_cpu_init(void) -{ - struct cpu_vendor_dev *cvdev; - - for (cvdev = __x86cpuvendor_start ; - cvdev < __x86cpuvendor_end ; - cvdev++) - cpu_devs[cvdev->vendor] = cvdev->cpu_dev; - early_cpu_support_print(); - early_identify_cpu(&boot_cpu_data); -} - -/* Do some early cpuid on the boot CPU to get some parameter that are - needed before check_bugs. Everything advanced is in identify_cpu - below. */ -static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) -{ - u32 tfms, xlvl; - - c->loops_per_jiffy = loops_per_jiffy; - c->x86_cache_size = -1; - c->x86_vendor = X86_VENDOR_UNKNOWN; - c->x86_model = c->x86_mask = 0; /* So far unknown... */ - c->x86_vendor_id[0] = '\0'; /* Unset */ - c->x86_model_id[0] = '\0'; /* Unset */ - c->x86_clflush_size = 64; - c->x86_cache_alignment = c->x86_clflush_size; -#ifndef CONFIG_XEN - c->x86_max_cores = 1; - c->x86_coreid_bits = 0; -#endif - c->extended_cpuid_level = 0; - memset(&c->x86_capability, 0, sizeof c->x86_capability); - - /* Get vendor name */ - cpuid(0x00000000, (unsigned int *)&c->cpuid_level, - (unsigned int *)&c->x86_vendor_id[0], - (unsigned int *)&c->x86_vendor_id[8], - (unsigned int *)&c->x86_vendor_id[4]); - - get_cpu_vendor(c); - - /* Initialize the standard set of capabilities */ - /* Note that the vendor-specific code below might override */ - - /* Intel-defined flags: level 0x00000001 */ - if (c->cpuid_level >= 0x00000001) { - __u32 misc; - cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4], - &c->x86_capability[0]); - c->x86 = (tfms >> 8) & 0xf; - c->x86_model = (tfms >> 4) & 0xf; - c->x86_mask = tfms & 0xf; - if (c->x86 == 0xf) - c->x86 += (tfms >> 20) & 0xff; - if (c->x86 >= 0x6) - c->x86_model += ((tfms >> 16) & 0xF) << 4; - if (test_cpu_cap(c, X86_FEATURE_CLFLSH)) - c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; - } else { - /* Have CPUID level 0 only - unheard of */ - c->x86 = 4; - } - -#ifndef CONFIG_XEN - c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff; -#ifdef CONFIG_SMP - c->phys_proc_id = c->initial_apicid; -#endif -#endif - /* AMD-defined flags: level 0x80000001 */ - xlvl = cpuid_eax(0x80000000); - c->extended_cpuid_level = xlvl; - if ((xlvl & 0xffff0000) == 0x80000000) { - if (xlvl >= 0x80000001) { - c->x86_capability[1] = cpuid_edx(0x80000001); - c->x86_capability[6] = cpuid_ecx(0x80000001); - } - if (xlvl >= 0x80000004) - get_model_name(c); /* Default name */ - } - - /* Transmeta-defined flags: level 0x80860001 */ - xlvl = cpuid_eax(0x80860000); - if ((xlvl & 0xffff0000) == 0x80860000) { - /* Don't set x86_cpuid_level here for now to not confuse. */ - if (xlvl >= 0x80860001) - c->x86_capability[2] = cpuid_edx(0x80860001); - } - - if (c->extended_cpuid_level >= 0x80000007) - c->x86_power = cpuid_edx(0x80000007); - - if (c->extended_cpuid_level >= 0x80000008) { - u32 eax = cpuid_eax(0x80000008); - - c->x86_virt_bits = (eax >> 8) & 0xff; - c->x86_phys_bits = eax & 0xff; - } - - detect_nopl(c); - - if (c->x86_vendor != X86_VENDOR_UNKNOWN && - cpu_devs[c->x86_vendor]->c_early_init) - cpu_devs[c->x86_vendor]->c_early_init(c); - - validate_pat_support(c); -} - -/* - * This does the hard work of actually picking apart the CPU stuff... - */ -static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) -{ - int i; - - early_identify_cpu(c); - - init_scattered_cpuid_features(c); - -#ifndef CONFIG_XEN - c->apicid = phys_pkg_id(0); -#endif - - /* - * Vendor-specific initialization. In this section we - * canonicalize the feature flags, meaning if there are - * features a certain CPU supports which CPUID doesn't - * tell us, CPUID claiming incorrect flags, or other bugs, - * we handle them here. - * - * At the end of this section, c->x86_capability better - * indicate the features this CPU genuinely supports! - */ - if (this_cpu->c_init) - this_cpu->c_init(c); - - detect_ht(c); - - /* - * On SMP, boot_cpu_data holds the common feature set between - * all CPUs; so make sure that we indicate which features are - * common between the CPUs. The first time this routine gets - * executed, c == &boot_cpu_data. - */ - if (c != &boot_cpu_data) { - /* AND the already accumulated flags with these */ - for (i = 0; i < NCAPINTS; i++) - boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; - } - - /* Clear all flags overriden by options */ - for (i = 0; i < NCAPINTS; i++) - c->x86_capability[i] &= ~cleared_cpu_caps[i]; - -#ifdef CONFIG_X86_MCE - mcheck_init(c); -#endif - select_idle_routine(c); - -#ifdef CONFIG_NUMA - numa_add_cpu(smp_processor_id()); -#endif - -} - -void __cpuinit identify_boot_cpu(void) -{ - identify_cpu(&boot_cpu_data); -} - -void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) -{ - BUG_ON(c == &boot_cpu_data); - identify_cpu(c); - mtrr_ap_init(); -} - -static __init int setup_noclflush(char *arg) -{ - setup_clear_cpu_cap(X86_FEATURE_CLFLSH); - return 1; -} -__setup("noclflush", setup_noclflush); - -void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) -{ - if (c->x86_model_id[0]) - printk(KERN_CONT "%s", c->x86_model_id); - - if (c->x86_mask || c->cpuid_level >= 0) - printk(KERN_CONT " stepping %02x\n", c->x86_mask); - else - printk(KERN_CONT "\n"); -} - -static __init int setup_disablecpuid(char *arg) -{ - int bit; - if (get_option(&arg, &bit) && bit < NCAPINTS*32) - setup_clear_cpu_cap(bit); - else - return 0; - return 1; -} -__setup("clearcpuid=", setup_disablecpuid); - -cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; - -struct x8664_pda **_cpu_pda __read_mostly; -EXPORT_SYMBOL(_cpu_pda); - -#ifndef CONFIG_X86_NO_IDT -struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; -#endif - -char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; - -unsigned long __supported_pte_mask __read_mostly = ~0UL; -EXPORT_SYMBOL_GPL(__supported_pte_mask); - -static int do_not_nx __cpuinitdata; - -/* noexec=on|off -Control non executable mappings for 64bit processes. - -on Enable(default) -off Disable -*/ -static int __init nonx_setup(char *str) -{ - if (!str) - return -EINVAL; - if (!strncmp(str, "on", 2)) { - __supported_pte_mask |= _PAGE_NX; - do_not_nx = 0; - } else if (!strncmp(str, "off", 3)) { - do_not_nx = 1; - __supported_pte_mask &= ~_PAGE_NX; - } - return 0; -} -early_param("noexec", nonx_setup); - -int force_personality32; - -/* noexec32=on|off -Control non executable heap for 32bit processes. -To control the stack too use noexec=off - -on PROT_READ does not imply PROT_EXEC for 32bit processes (default) -off PROT_READ implies PROT_EXEC -*/ -static int __init nonx32_setup(char *str) -{ - if (!strcmp(str, "on")) - force_personality32 &= ~READ_IMPLIES_EXEC; - else if (!strcmp(str, "off")) - force_personality32 |= READ_IMPLIES_EXEC; - return 1; -} -__setup("noexec32=", nonx32_setup); - -static void __init_refok switch_pt(int cpu) -{ -#ifdef CONFIG_XEN - if (cpu == 0) - xen_init_pt(); - xen_pt_switch(__pa_symbol(init_level4_pgt)); - xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt))); -#endif -} - -void pda_init(int cpu) -{ - struct x8664_pda *pda = cpu_pda(cpu); - - /* Setup up data that may be needed in __get_free_pages early */ - loadsegment(fs, 0); - loadsegment(gs, 0); -#ifndef CONFIG_XEN - /* Memory clobbers used to order PDA accessed */ - mb(); - wrmsrl(MSR_GS_BASE, pda); - mb(); -#else - if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL, - (unsigned long)pda)) - BUG(); -#endif - - pda->cpunumber = cpu; - pda->irqcount = -1; - pda->kernelstack = (unsigned long)stack_thread_info() - - PDA_STACKOFFSET + THREAD_SIZE; - pda->active_mm = &init_mm; - pda->mmu_state = 0; - - if (cpu == 0) { - /* others are initialized in smpboot.c */ - pda->pcurrent = &init_task; - pda->irqstackptr = boot_cpu_stack; - pda->irqstackptr += IRQSTACKSIZE - 64; - } else { - if (!pda->irqstackptr) { - pda->irqstackptr = (char *) - __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); - if (!pda->irqstackptr) - panic("cannot allocate irqstack for cpu %d", - cpu); - pda->irqstackptr += IRQSTACKSIZE - 64; - } - - if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) - pda->nodenumber = cpu_to_node(cpu); - } - - switch_pt(cpu); -} - -#ifndef CONFIG_X86_NO_TSS -char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + - DEBUG_STKSZ] __page_aligned_bss; -#endif - -extern asmlinkage void ignore_sysret(void); - -void __cpuinit syscall_init(void) -{ -#ifndef CONFIG_XEN - /* - * LSTAR and STAR live in a bit strange symbiosis. - * They both write to the same internal register. STAR allows to - * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. - */ - wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); - wrmsrl(MSR_LSTAR, system_call); - wrmsrl(MSR_CSTAR, ignore_sysret); - - /* Flags to clear on syscall */ - wrmsrl(MSR_SYSCALL_MASK, - X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL); -#endif -#ifdef CONFIG_IA32_EMULATION - syscall32_cpu_init(); -#else - static const struct callback_register __cpuinitconst cstar = { - .type = CALLBACKTYPE_syscall32, - .address = (unsigned long)ignore_sysret - }; - - if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar)) - printk(KERN_WARNING "Unable to register CSTAR callback\n"); -#endif -} - -void __cpuinit check_efer(void) -{ - unsigned long efer; - - rdmsrl(MSR_EFER, efer); - if (!(efer & EFER_NX) || do_not_nx) - __supported_pte_mask &= ~_PAGE_NX; -} - -unsigned long kernel_eflags; - -#ifndef CONFIG_X86_NO_TSS -/* - * Copies of the original ist values from the tss are only accessed during - * debugging, no special alignment required. - */ -DEFINE_PER_CPU(struct orig_ist, orig_ist); -#endif - -/* - * cpu_init() initializes state that is per-CPU. Some data is already - * initialized (naturally) in the bootstrap process, such as the GDT - * and IDT. We reload them nevertheless, this function acts as a - * 'CPU state barrier', nothing should get across. - * A lot of state is already set up in PDA init. - */ -void __cpuinit cpu_init(void) -{ - int cpu = stack_smp_processor_id(); -#ifndef CONFIG_X86_NO_TSS - struct tss_struct *t = &per_cpu(init_tss, cpu); - struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu); - unsigned long v; - char *estacks = NULL; - int i; -#endif - struct task_struct *me; - - /* CPU 0 is initialised in head64.c */ - if (cpu != 0) - pda_init(cpu); -#ifndef CONFIG_X86_NO_TSS - else - estacks = boot_exception_stacks; -#endif - - me = current; - - if (cpu_test_and_set(cpu, cpu_initialized)) - panic("CPU#%d already initialized!\n", cpu); - - printk(KERN_INFO "Initializing CPU#%d\n", cpu); - - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); - - /* - * Initialize the per-CPU GDT with the boot GDT, - * and set up the GDT descriptor: - */ - - switch_to_new_gdt(); -#ifndef CONFIG_X86_NO_IDT - load_idt((const struct desc_ptr *)&idt_descr); -#endif - - memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); - syscall_init(); - - wrmsrl(MSR_FS_BASE, 0); - wrmsrl(MSR_KERNEL_GS_BASE, 0); - barrier(); - - check_efer(); - -#ifndef CONFIG_X86_NO_TSS - /* - * set up and load the per-CPU TSS - */ - if (!orig_ist->ist[0]) { - static const unsigned int order[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, - [DEBUG_STACK - 1] = DEBUG_STACK_ORDER - }; - for (v = 0; v < N_EXCEPTION_STACKS; v++) { - if (cpu) { - estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); - if (!estacks) - panic("Cannot allocate exception " - "stack %ld %d\n", v, cpu); - } - estacks += PAGE_SIZE << order[v]; - orig_ist->ist[v] = t->x86_tss.ist[v] = - (unsigned long)estacks; - } - } - - t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); - /* - * <= is required because the CPU will access up to - * 8 bits beyond the end of the IO permission bitmap. - */ - for (i = 0; i <= IO_BITMAP_LONGS; i++) - t->io_bitmap[i] = ~0UL; -#endif - - atomic_inc(&init_mm.mm_count); - me->active_mm = &init_mm; - if (me->mm) - BUG(); - enter_lazy_tlb(&init_mm, me); - - load_sp0(t, ¤t->thread); -#ifndef CONFIG_X86_NO_TSS - set_tss_desc(cpu, t); - load_TR_desc(); -#endif - load_LDT(&init_mm.context); - -#ifdef CONFIG_KGDB - /* - * If the kgdb is connected no debug regs should be altered. This - * is only applicable when KGDB and a KGDB I/O module are built - * into the kernel and you are using early debugging with - * kgdbwait. KGDB will control the kernel HW breakpoint registers. - */ - if (kgdb_connected && arch_kgdb_ops.correct_hw_break) - arch_kgdb_ops.correct_hw_break(); - else { -#endif - /* - * Clear all 6 debug registers: - */ - - set_debugreg(0UL, 0); - set_debugreg(0UL, 1); - set_debugreg(0UL, 2); - set_debugreg(0UL, 3); - set_debugreg(0UL, 6); - set_debugreg(0UL, 7); -#ifdef CONFIG_KGDB - /* If the kgdb is connected no debug regs should be altered. */ - } -#endif - - fpu_init(); - - asm ("pushfq; popq %0" : "=rm" (kernel_eflags)); - if (raw_irqs_disabled()) - kernel_eflags &= ~X86_EFLAGS_IF; - - if (is_uv_system()) - uv_cpu_init(); -} --- head-2011-03-17.orig/arch/x86/kernel/dumpstack_64.c 2011-03-17 14:35:46.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/dumpstack_64.c 2011-02-01 14:39:24.000000000 +0100 @@ -21,6 +21,7 @@ #define N_EXCEPTION_STACKS_END \ (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2) +#ifndef CONFIG_X86_NO_TSS static char x86_stack_ids[][8] = { [ DEBUG_STACK-1 ] = "#DB", [ NMI_STACK-1 ] = "NMI", @@ -32,10 +33,12 @@ static char x86_stack_ids[][8] = { N_EXCEPTION_STACKS_END ] = "#DB[?]" #endif }; +#endif static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, unsigned *usedp, char **idp) { +#ifndef CONFIG_X86_NO_TSS unsigned k; /* @@ -95,6 +98,7 @@ static unsigned long *in_exception_stack } #endif } +#endif /* CONFIG_X86_NO_TSS */ return NULL; } --- head-2011-03-17.orig/arch/x86/kernel/e820-xen.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/e820-xen.c 2011-02-01 14:39:24.000000000 +0100 @@ -167,6 +167,9 @@ static void __init _e820_print_map(const case E820_NVS: printk(KERN_CONT "(ACPI NVS)\n"); break; + case E820_UNUSABLE: + printk("(unusable)\n"); + break; default: printk(KERN_CONT "type %u\n", e820->map[i].type); break; @@ -1399,6 +1402,7 @@ static inline const char *e820_type_to_s case E820_RAM: return "System RAM"; case E820_ACPI: return "ACPI Tables"; case E820_NVS: return "ACPI Non-volatile Storage"; + case E820_UNUSABLE: return "Unusable memory"; default: return "reserved"; } } @@ -1410,6 +1414,7 @@ static inline const char *e820_type_to_s /* * Mark e820 reserved areas as busy for the resource manager. */ +static struct resource __initdata *e820_res; void __init e820_reserve_resources(void) { int i; @@ -1417,20 +1422,28 @@ void __init e820_reserve_resources(void) u64 end; res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); + e820_res = res; for (i = 0; i < e820.nr_map; i++) { end = e820.map[i].addr + e820.map[i].size - 1; -#ifndef CONFIG_RESOURCES_64BIT - if (end > 0x100000000ULL) { + if (end != (resource_size_t)end) { res++; continue; } -#endif res->name = e820_type_to_string(e820.map[i].type); res->start = e820.map[i].addr; res->end = end; - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; - insert_resource(&iomem_resource, res); + res->flags = IORESOURCE_MEM; + + /* + * don't register the region that could be conflicted with + * pci device BAR resource and insert them later in + * pcibios_resource_survey() + */ + if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) { + res->flags |= IORESOURCE_BUSY; + insert_resource(&iomem_resource, res); + } res++; } @@ -1442,6 +1455,19 @@ void __init e820_reserve_resources(void) } } +void __init e820_reserve_resources_late(void) +{ + int i; + struct resource *res; + + res = e820_res; + for (i = 0; i < e820.nr_map; i++) { + if (!res->parent && res->end) + insert_resource_expand_to_fit(&iomem_resource, res); + res++; + } +} + #undef e820 #ifndef CONFIG_XEN --- head-2011-03-17.orig/arch/x86/kernel/early_printk-xen.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/early_printk-xen.c 2011-02-01 14:39:24.000000000 +0100 @@ -3,10 +3,18 @@ #include #include #include +#include +#include +#include +#include #include #include #include #include +#include +#include +#include +#include #ifndef CONFIG_XEN /* Simple VGA output */ @@ -78,6 +86,7 @@ static int early_serial_base = 0x3f8; / static int early_serial_putc(unsigned char ch) { unsigned timeout = 0xffff; + while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) cpu_relax(); outb(ch, early_serial_base + TXR); @@ -111,7 +120,7 @@ static __init void early_serial_init(cha if (!strncmp(s, "0x", 2)) { early_serial_base = simple_strtoul(s, &e, 16); } else { - static int bases[] = { 0x3f8, 0x2f8 }; + static const int __initconst bases[] = { 0x3f8, 0x2f8 }; if (!strncmp(s, "ttyS", 4)) s += 4; @@ -180,6 +189,721 @@ static struct console early_serial_conso .index = -1, }; +#ifdef CONFIG_EARLY_PRINTK_DBGP + +static struct ehci_caps __iomem *ehci_caps; +static struct ehci_regs __iomem *ehci_regs; +static struct ehci_dbg_port __iomem *ehci_debug; +static unsigned int dbgp_endpoint_out; + +struct ehci_dev { + u32 bus; + u32 slot; + u32 func; +}; + +static struct ehci_dev ehci_dev; + +#define USB_DEBUG_DEVNUM 127 + +#define DBGP_DATA_TOGGLE 0x8800 + +static inline u32 dbgp_pid_update(u32 x, u32 tok) +{ + return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff); +} + +static inline u32 dbgp_len_update(u32 x, u32 len) +{ + return (x & ~0x0f) | (len & 0x0f); +} + +/* + * USB Packet IDs (PIDs) + */ + +/* token */ +#define USB_PID_OUT 0xe1 +#define USB_PID_IN 0x69 +#define USB_PID_SOF 0xa5 +#define USB_PID_SETUP 0x2d +/* handshake */ +#define USB_PID_ACK 0xd2 +#define USB_PID_NAK 0x5a +#define USB_PID_STALL 0x1e +#define USB_PID_NYET 0x96 +/* data */ +#define USB_PID_DATA0 0xc3 +#define USB_PID_DATA1 0x4b +#define USB_PID_DATA2 0x87 +#define USB_PID_MDATA 0x0f +/* Special */ +#define USB_PID_PREAMBLE 0x3c +#define USB_PID_ERR 0x3c +#define USB_PID_SPLIT 0x78 +#define USB_PID_PING 0xb4 +#define USB_PID_UNDEF_0 0xf0 + +#define USB_PID_DATA_TOGGLE 0x88 +#define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE) + +#define PCI_CAP_ID_EHCI_DEBUG 0xa + +#define HUB_ROOT_RESET_TIME 50 /* times are in msec */ +#define HUB_SHORT_RESET_TIME 10 +#define HUB_LONG_RESET_TIME 200 +#define HUB_RESET_TIMEOUT 500 + +#define DBGP_MAX_PACKET 8 + +static int dbgp_wait_until_complete(void) +{ + u32 ctrl; + int loop = 0x100000; + + do { + ctrl = readl(&ehci_debug->control); + /* Stop when the transaction is finished */ + if (ctrl & DBGP_DONE) + break; + } while (--loop > 0); + + if (!loop) + return -1; + + /* + * Now that we have observed the completed transaction, + * clear the done bit. + */ + writel(ctrl | DBGP_DONE, &ehci_debug->control); + return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl); +} + +static void dbgp_mdelay(int ms) +{ + int i; + + while (ms--) { + for (i = 0; i < 1000; i++) + outb(0x1, 0x80); + } +} + +static void dbgp_breath(void) +{ + /* Sleep to give the debug port a chance to breathe */ +} + +static int dbgp_wait_until_done(unsigned ctrl) +{ + u32 pids, lpid; + int ret; + int loop = 3; + +retry: + writel(ctrl | DBGP_GO, &ehci_debug->control); + ret = dbgp_wait_until_complete(); + pids = readl(&ehci_debug->pids); + lpid = DBGP_PID_GET(pids); + + if (ret < 0) + return ret; + + /* + * If the port is getting full or it has dropped data + * start pacing ourselves, not necessary but it's friendly. + */ + if ((lpid == USB_PID_NAK) || (lpid == USB_PID_NYET)) + dbgp_breath(); + + /* If I get a NACK reissue the transmission */ + if (lpid == USB_PID_NAK) { + if (--loop > 0) + goto retry; + } + + return ret; +} + +static void dbgp_set_data(const void *buf, int size) +{ + const unsigned char *bytes = buf; + u32 lo, hi; + int i; + + lo = hi = 0; + for (i = 0; i < 4 && i < size; i++) + lo |= bytes[i] << (8*i); + for (; i < 8 && i < size; i++) + hi |= bytes[i] << (8*(i - 4)); + writel(lo, &ehci_debug->data03); + writel(hi, &ehci_debug->data47); +} + +static void dbgp_get_data(void *buf, int size) +{ + unsigned char *bytes = buf; + u32 lo, hi; + int i; + + lo = readl(&ehci_debug->data03); + hi = readl(&ehci_debug->data47); + for (i = 0; i < 4 && i < size; i++) + bytes[i] = (lo >> (8*i)) & 0xff; + for (; i < 8 && i < size; i++) + bytes[i] = (hi >> (8*(i - 4))) & 0xff; +} + +static int dbgp_bulk_write(unsigned devnum, unsigned endpoint, + const char *bytes, int size) +{ + u32 pids, addr, ctrl; + int ret; + + if (size > DBGP_MAX_PACKET) + return -1; + + addr = DBGP_EPADDR(devnum, endpoint); + + pids = readl(&ehci_debug->pids); + pids = dbgp_pid_update(pids, USB_PID_OUT); + + ctrl = readl(&ehci_debug->control); + ctrl = dbgp_len_update(ctrl, size); + ctrl |= DBGP_OUT; + ctrl |= DBGP_GO; + + dbgp_set_data(bytes, size); + writel(addr, &ehci_debug->address); + writel(pids, &ehci_debug->pids); + + ret = dbgp_wait_until_done(ctrl); + if (ret < 0) + return ret; + + return ret; +} + +static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data, + int size) +{ + u32 pids, addr, ctrl; + int ret; + + if (size > DBGP_MAX_PACKET) + return -1; + + addr = DBGP_EPADDR(devnum, endpoint); + + pids = readl(&ehci_debug->pids); + pids = dbgp_pid_update(pids, USB_PID_IN); + + ctrl = readl(&ehci_debug->control); + ctrl = dbgp_len_update(ctrl, size); + ctrl &= ~DBGP_OUT; + ctrl |= DBGP_GO; + + writel(addr, &ehci_debug->address); + writel(pids, &ehci_debug->pids); + ret = dbgp_wait_until_done(ctrl); + if (ret < 0) + return ret; + + if (size > ret) + size = ret; + dbgp_get_data(data, size); + return ret; +} + +static int dbgp_control_msg(unsigned devnum, int requesttype, int request, + int value, int index, void *data, int size) +{ + u32 pids, addr, ctrl; + struct usb_ctrlrequest req; + int read; + int ret; + + read = (requesttype & USB_DIR_IN) != 0; + if (size > (read ? DBGP_MAX_PACKET:0)) + return -1; + + /* Compute the control message */ + req.bRequestType = requesttype; + req.bRequest = request; + req.wValue = cpu_to_le16(value); + req.wIndex = cpu_to_le16(index); + req.wLength = cpu_to_le16(size); + + pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP); + addr = DBGP_EPADDR(devnum, 0); + + ctrl = readl(&ehci_debug->control); + ctrl = dbgp_len_update(ctrl, sizeof(req)); + ctrl |= DBGP_OUT; + ctrl |= DBGP_GO; + + /* Send the setup message */ + dbgp_set_data(&req, sizeof(req)); + writel(addr, &ehci_debug->address); + writel(pids, &ehci_debug->pids); + ret = dbgp_wait_until_done(ctrl); + if (ret < 0) + return ret; + + /* Read the result */ + return dbgp_bulk_read(devnum, 0, data, size); +} + + +/* Find a PCI capability */ +static u32 __init find_cap(u32 num, u32 slot, u32 func, int cap) +{ + u8 pos; + int bytes; + + if (!(read_pci_config_16(num, slot, func, PCI_STATUS) & + PCI_STATUS_CAP_LIST)) + return 0; + + pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST); + for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { + u8 id; + + pos &= ~3; + id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID); + if (id == 0xff) + break; + if (id == cap) + return pos; + + pos = read_pci_config_byte(num, slot, func, + pos+PCI_CAP_LIST_NEXT); + } + return 0; +} + +static u32 __init __find_dbgp(u32 bus, u32 slot, u32 func) +{ + u32 class; + + class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION); + if ((class >> 8) != PCI_CLASS_SERIAL_USB_EHCI) + return 0; + + return find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG); +} + +static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc) +{ + u32 bus, slot, func; + + for (bus = 0; bus < 256; bus++) { + for (slot = 0; slot < 32; slot++) { + for (func = 0; func < 8; func++) { + unsigned cap; + + cap = __find_dbgp(bus, slot, func); + + if (!cap) + continue; + if (ehci_num-- != 0) + continue; + *rbus = bus; + *rslot = slot; + *rfunc = func; + return cap; + } + } + } + return 0; +} + +static int ehci_reset_port(int port) +{ + u32 portsc; + u32 delay_time, delay; + int loop; + + /* Reset the usb debug port */ + portsc = readl(&ehci_regs->port_status[port - 1]); + portsc &= ~PORT_PE; + portsc |= PORT_RESET; + writel(portsc, &ehci_regs->port_status[port - 1]); + + delay = HUB_ROOT_RESET_TIME; + for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT; + delay_time += delay) { + dbgp_mdelay(delay); + + portsc = readl(&ehci_regs->port_status[port - 1]); + if (portsc & PORT_RESET) { + /* force reset to complete */ + loop = 2; + writel(portsc & ~(PORT_RWC_BITS | PORT_RESET), + &ehci_regs->port_status[port - 1]); + do { + portsc = readl(&ehci_regs->port_status[port-1]); + } while ((portsc & PORT_RESET) && (--loop > 0)); + } + + /* Device went away? */ + if (!(portsc & PORT_CONNECT)) + return -ENOTCONN; + + /* bomb out completely if something weird happend */ + if ((portsc & PORT_CSC)) + return -EINVAL; + + /* If we've finished resetting, then break out of the loop */ + if (!(portsc & PORT_RESET) && (portsc & PORT_PE)) + return 0; + } + return -EBUSY; +} + +static int ehci_wait_for_port(int port) +{ + u32 status; + int ret, reps; + + for (reps = 0; reps < 3; reps++) { + dbgp_mdelay(100); + status = readl(&ehci_regs->status); + if (status & STS_PCD) { + ret = ehci_reset_port(port); + if (ret == 0) + return 0; + } + } + return -ENOTCONN; +} + +#ifdef DBGP_DEBUG +# define dbgp_printk early_printk +#else +static inline void dbgp_printk(const char *fmt, ...) { } +#endif + +typedef void (*set_debug_port_t)(int port); + +static void default_set_debug_port(int port) +{ +} + +static set_debug_port_t set_debug_port = default_set_debug_port; + +static void nvidia_set_debug_port(int port) +{ + u32 dword; + dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, + 0x74); + dword &= ~(0x0f<<12); + dword |= ((port & 0x0f)<<12); + write_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 0x74, + dword); + dbgp_printk("set debug port to %d\n", port); +} + +static void __init detect_set_debug_port(void) +{ + u32 vendorid; + + vendorid = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, + 0x00); + + if ((vendorid & 0xffff) == 0x10de) { + dbgp_printk("using nvidia set_debug_port\n"); + set_debug_port = nvidia_set_debug_port; + } +} + +static int __init ehci_setup(void) +{ + struct usb_debug_descriptor dbgp_desc; + u32 cmd, ctrl, status, portsc, hcs_params; + u32 debug_port, new_debug_port = 0, n_ports; + u32 devnum; + int ret, i; + int loop; + int port_map_tried; + int playtimes = 3; + +try_next_time: + port_map_tried = 0; + +try_next_port: + + hcs_params = readl(&ehci_caps->hcs_params); + debug_port = HCS_DEBUG_PORT(hcs_params); + n_ports = HCS_N_PORTS(hcs_params); + + dbgp_printk("debug_port: %d\n", debug_port); + dbgp_printk("n_ports: %d\n", n_ports); + + for (i = 1; i <= n_ports; i++) { + portsc = readl(&ehci_regs->port_status[i-1]); + dbgp_printk("portstatus%d: %08x\n", i, portsc); + } + + if (port_map_tried && (new_debug_port != debug_port)) { + if (--playtimes) { + set_debug_port(new_debug_port); + goto try_next_time; + } + return -1; + } + + loop = 10; + /* Reset the EHCI controller */ + cmd = readl(&ehci_regs->command); + cmd |= CMD_RESET; + writel(cmd, &ehci_regs->command); + do { + cmd = readl(&ehci_regs->command); + } while ((cmd & CMD_RESET) && (--loop > 0)); + + if (!loop) { + dbgp_printk("can not reset ehci\n"); + return -1; + } + dbgp_printk("ehci reset done\n"); + + /* Claim ownership, but do not enable yet */ + ctrl = readl(&ehci_debug->control); + ctrl |= DBGP_OWNER; + ctrl &= ~(DBGP_ENABLED | DBGP_INUSE); + writel(ctrl, &ehci_debug->control); + + /* Start the ehci running */ + cmd = readl(&ehci_regs->command); + cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET); + cmd |= CMD_RUN; + writel(cmd, &ehci_regs->command); + + /* Ensure everything is routed to the EHCI */ + writel(FLAG_CF, &ehci_regs->configured_flag); + + /* Wait until the controller is no longer halted */ + loop = 10; + do { + status = readl(&ehci_regs->status); + } while ((status & STS_HALT) && (--loop > 0)); + + if (!loop) { + dbgp_printk("ehci can be started\n"); + return -1; + } + dbgp_printk("ehci started\n"); + + /* Wait for a device to show up in the debug port */ + ret = ehci_wait_for_port(debug_port); + if (ret < 0) { + dbgp_printk("No device found in debug port\n"); + goto next_debug_port; + } + dbgp_printk("ehci wait for port done\n"); + + /* Enable the debug port */ + ctrl = readl(&ehci_debug->control); + ctrl |= DBGP_CLAIM; + writel(ctrl, &ehci_debug->control); + ctrl = readl(&ehci_debug->control); + if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) { + dbgp_printk("No device in debug port\n"); + writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control); + goto err; + } + dbgp_printk("debug ported enabled\n"); + + /* Completely transfer the debug device to the debug controller */ + portsc = readl(&ehci_regs->port_status[debug_port - 1]); + portsc &= ~PORT_PE; + writel(portsc, &ehci_regs->port_status[debug_port - 1]); + + dbgp_mdelay(100); + + /* Find the debug device and make it device number 127 */ + for (devnum = 0; devnum <= 127; devnum++) { + ret = dbgp_control_msg(devnum, + USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE, + USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0, + &dbgp_desc, sizeof(dbgp_desc)); + if (ret > 0) + break; + } + if (devnum > 127) { + dbgp_printk("Could not find attached debug device\n"); + goto err; + } + if (ret < 0) { + dbgp_printk("Attached device is not a debug device\n"); + goto err; + } + dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint; + + /* Move the device to 127 if it isn't already there */ + if (devnum != USB_DEBUG_DEVNUM) { + ret = dbgp_control_msg(devnum, + USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE, + USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0); + if (ret < 0) { + dbgp_printk("Could not move attached device to %d\n", + USB_DEBUG_DEVNUM); + goto err; + } + devnum = USB_DEBUG_DEVNUM; + dbgp_printk("debug device renamed to 127\n"); + } + + /* Enable the debug interface */ + ret = dbgp_control_msg(USB_DEBUG_DEVNUM, + USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE, + USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0); + if (ret < 0) { + dbgp_printk(" Could not enable the debug device\n"); + goto err; + } + dbgp_printk("debug interface enabled\n"); + + /* Perform a small write to get the even/odd data state in sync + */ + ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1); + if (ret < 0) { + dbgp_printk("dbgp_bulk_write failed: %d\n", ret); + goto err; + } + dbgp_printk("small write doned\n"); + + return 0; +err: + /* Things didn't work so remove my claim */ + ctrl = readl(&ehci_debug->control); + ctrl &= ~(DBGP_CLAIM | DBGP_OUT); + writel(ctrl, &ehci_debug->control); + return -1; + +next_debug_port: + port_map_tried |= (1<<(debug_port - 1)); + new_debug_port = ((debug_port-1+1)%n_ports) + 1; + if (port_map_tried != ((1<> 29) & 0x7; + bar = (bar * 4) + 0xc; + offset = (debug_port >> 16) & 0xfff; + dbgp_printk("bar: %02x offset: %03x\n", bar, offset); + if (bar != PCI_BASE_ADDRESS_0) { + dbgp_printk("only debug ports on bar 1 handled.\n"); + + return -1; + } + + bar_val = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0); + dbgp_printk("bar_val: %02x offset: %03x\n", bar_val, offset); + if (bar_val & ~PCI_BASE_ADDRESS_MEM_MASK) { + dbgp_printk("only simple 32bit mmio bars supported\n"); + + return -1; + } + + /* double check if the mem space is enabled */ + byte = read_pci_config_byte(bus, slot, func, 0x04); + if (!(byte & 0x2)) { + byte |= 0x02; + write_pci_config_byte(bus, slot, func, 0x04, byte); + dbgp_printk("mmio for ehci enabled\n"); + } + + /* + * FIXME I don't have the bar size so just guess PAGE_SIZE is more + * than enough. 1K is the biggest I have seen. + */ + set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK); + ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE); + ehci_bar += bar_val & ~PAGE_MASK; + dbgp_printk("ehci_bar: %p\n", ehci_bar); + + ehci_caps = ehci_bar; + ehci_regs = ehci_bar + HC_LENGTH(readl(&ehci_caps->hc_capbase)); + ehci_debug = ehci_bar + offset; + ehci_dev.bus = bus; + ehci_dev.slot = slot; + ehci_dev.func = func; + + detect_set_debug_port(); + + ret = ehci_setup(); + if (ret < 0) { + dbgp_printk("ehci_setup failed\n"); + ehci_debug = NULL; + + return -1; + } + + return 0; +} + +static void early_dbgp_write(struct console *con, const char *str, u32 n) +{ + int chunk, ret; + + if (!ehci_debug) + return; + while (n > 0) { + chunk = n; + if (chunk > DBGP_MAX_PACKET) + chunk = DBGP_MAX_PACKET; + ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, + dbgp_endpoint_out, str, chunk); + str += chunk; + n -= chunk; + } +} + +static struct console early_dbgp_console = { + .name = "earlydbg", + .write = early_dbgp_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; +#endif + /* Console interface to a host file on AMD's SimNow! */ static int simnow_fd; @@ -194,6 +918,7 @@ enum { static noinline long simnow(long cmd, long a, long b, long c) { long ret; + asm volatile("cpuid" : "=a" (ret) : "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2)); @@ -203,6 +928,7 @@ static noinline long simnow(long cmd, lo static void __init simnow_init(char *str) { char *fn = "klog"; + if (*str == '=') fn = ++str; /* error ignored */ @@ -223,7 +949,7 @@ static struct console simnow_console = { /* Direct interface for emergencies */ static struct console *early_console = &early_vga_console; -static int early_console_initialized; +static int __initdata early_console_initialized; asmlinkage void early_printk(const char *fmt, ...) { @@ -237,10 +963,11 @@ asmlinkage void early_printk(const char va_end(ap); } -static int __initdata keep_early; static int __init setup_early_printk(char *buf) { + int keep_early; + if (!buf) return 0; @@ -248,8 +975,7 @@ static int __init setup_early_printk(cha return 0; early_console_initialized = 1; - if (strstr(buf, "keep")) - keep_early = 1; + keep_early = (strstr(buf, "keep") != NULL); if (!strncmp(buf, "serial", 6)) { early_serial_init(buf + 6); @@ -269,6 +995,17 @@ static int __init setup_early_printk(cha simnow_init(buf + 6); early_console = &simnow_console; keep_early = 1; +#ifdef CONFIG_EARLY_PRINTK_DBGP + } else if (!strncmp(buf, "dbgp", 4)) { + if (early_dbgp_init(buf+4) < 0) + return 0; + early_console = &early_dbgp_console; + /* + * usb subsys will reset ehci controller, so don't keep + * that early console + */ + keep_early = 0; +#endif #ifdef CONFIG_XEN } else if (!strncmp(buf, "xen", 3)) { early_console = &xenboot_console; @@ -282,4 +1019,5 @@ static int __init setup_early_printk(cha register_console(early_console); return 0; } + early_param("earlyprintk", setup_early_printk); --- head-2011-03-17.orig/arch/x86/kernel/entry_32-xen.S 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/entry_32-xen.S 2011-02-01 14:39:24.000000000 +0100 @@ -700,7 +700,7 @@ ENTRY(interrupt) ENTRY(irq_entries_start) RING0_INT_FRAME vector=0 -.rept NR_IRQS +.rept NR_VECTORS ALIGN .if vector CFI_ADJUST_CFA_OFFSET -4 @@ -805,6 +805,7 @@ error_code: movl $(__USER_DS), %ecx movl %ecx, %ds movl %ecx, %es + TRACE_IRQS_OFF movl %esp,%eax # pt_regs pointer call *%edi jmp ret_from_exception @@ -974,22 +975,9 @@ ENTRY(device_not_available) RING0_INT_FRAME pushl $-1 # mark this as an int CFI_ADJUST_CFA_OFFSET 4 - SAVE_ALL -#ifndef CONFIG_XEN - GET_CR0_INTO_EAX - testl $0x4, %eax # EM (math emulation bit) - je device_available_emulate - pushl $0 # temporary storage for ORIG_EIP + pushl $do_device_not_available CFI_ADJUST_CFA_OFFSET 4 - call math_emulate - addl $4, %esp - CFI_ADJUST_CFA_OFFSET -4 - jmp ret_from_exception -device_available_emulate: -#endif - preempt_stop(CLBR_ANY) - call math_state_restore - jmp ret_from_exception + jmp error_code CFI_ENDPROC END(device_not_available) @@ -1034,6 +1022,7 @@ debug_stack_correct: pushl $-1 # mark this as an int CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL + TRACE_IRQS_OFF xorl %edx,%edx # error code 0 movl %esp,%eax # pt_regs pointer call do_debug @@ -1079,6 +1068,7 @@ nmi_stack_correct: pushl %eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL + TRACE_IRQS_OFF xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_nmi @@ -1119,6 +1109,7 @@ nmi_espfix_stack: pushl %eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL + TRACE_IRQS_OFF FIXUP_ESPFIX_STACK # %eax == %esp xorl %edx,%edx # zero error code call do_nmi @@ -1162,6 +1153,7 @@ KPROBE_ENTRY(int3) pushl $-1 # mark this as an int CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL + TRACE_IRQS_OFF xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_int3 @@ -1303,24 +1295,10 @@ ENTRY(kernel_thread_helper) CFI_ENDPROC ENDPROC(kernel_thread_helper) -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE ENTRY(mcount) - pushl %eax - pushl %ecx - pushl %edx - movl 0xc(%esp), %eax - subl $MCOUNT_INSN_SIZE, %eax - -.globl mcount_call -mcount_call: - call ftrace_stub - - popl %edx - popl %ecx - popl %eax - ret END(mcount) @@ -1372,7 +1350,7 @@ trace: jmp ftrace_stub END(mcount) #endif /* CONFIG_DYNAMIC_FTRACE */ -#endif /* CONFIG_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ #include --- head-2011-03-17.orig/arch/x86/kernel/entry_64-xen.S 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/entry_64-xen.S 2011-02-01 14:39:24.000000000 +0100 @@ -66,35 +66,9 @@ .code64 -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE ENTRY(mcount) - - subq $0x38, %rsp - movq %rax, (%rsp) - movq %rcx, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 24(%rsp) - movq %rdi, 32(%rsp) - movq %r8, 40(%rsp) - movq %r9, 48(%rsp) - - movq 0x38(%rsp), %rdi - subq $MCOUNT_INSN_SIZE, %rdi - -.globl mcount_call -mcount_call: - call ftrace_stub - - movq 48(%rsp), %r9 - movq 40(%rsp), %r8 - movq 32(%rsp), %rdi - movq 24(%rsp), %rsi - movq 16(%rsp), %rdx - movq 8(%rsp), %rcx - movq (%rsp), %rax - addq $0x38, %rsp - retq END(mcount) @@ -169,7 +143,7 @@ trace: jmp ftrace_stub END(mcount) #endif /* CONFIG_DYNAMIC_FTRACE */ -#endif /* CONFIG_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args @@ -297,9 +271,9 @@ NMI_MASK = 0x80000000 ENTRY(ret_from_fork) CFI_DEFAULT_STACK push kernel_eflags(%rip) - CFI_ADJUST_CFA_OFFSET 4 + CFI_ADJUST_CFA_OFFSET 8 popf # reset kernel eflags - CFI_ADJUST_CFA_OFFSET -4 + CFI_ADJUST_CFA_OFFSET -8 call schedule_tail GET_THREAD_INFO(%rcx) testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx) @@ -863,6 +837,9 @@ END(spurious_interrupt) .if \ist movq %gs:pda_data_offset, %rbp .endif + .if \irqtrace + TRACE_IRQS_OFF + .endif movq %rsp,%rdi movq ORIG_RAX(%rsp),%rsi movq $-1,ORIG_RAX(%rsp) @@ -1271,7 +1248,7 @@ ENTRY(simd_coprocessor_error) END(simd_coprocessor_error) ENTRY(device_not_available) - zeroentry math_state_restore + zeroentry do_device_not_available END(device_not_available) /* runs on exception stack */ @@ -1370,9 +1347,11 @@ ENTRY(divide_error) zeroentry do_divide_error END(divide_error) +#ifndef CONFIG_XEN ENTRY(spurious_interrupt_bug) zeroentry do_spurious_interrupt_bug END(spurious_interrupt_bug) +#endif #ifdef CONFIG_X86_MCE /* runs on exception stack */ --- head-2011-03-17.orig/arch/x86/kernel/fixup.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/fixup.c 2011-02-01 14:39:24.000000000 +0100 @@ -37,7 +37,7 @@ #define DP(_f, _args...) pr_alert(" " _f "\n" , ## _args ) -void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) +dotraplinkage void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) { static unsigned long printed = 0; char info[100]; --- head-2011-03-17.orig/arch/x86/kernel/head-xen.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/head-xen.c 2011-02-01 14:39:24.000000000 +0100 @@ -36,6 +36,7 @@ void __init reserve_ebda_region(void) /* start of EBDA area */ ebda_addr = get_bios_ebda(); + printk(KERN_INFO "BIOS EBDA/lowmem at: %08x/%08x\n", ebda_addr, lowmem); /* Fixup: bios puts an EBDA in the top 64K segment */ /* of conventional memory, but does not adjust lowmem. */ --- head-2011-03-17.orig/arch/x86/kernel/head64-xen.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/head64-xen.c 2011-02-01 14:39:24.000000000 +0100 @@ -151,12 +151,11 @@ void __init x86_64_start_kernel(char * r load_idt((const struct desc_ptr *)&idt_descr); #endif - early_printk("Kernel alive\n"); + if (console_loglevel == 10) + early_printk("Kernel alive\n"); x86_64_init_pda(); - early_printk("Kernel really alive\n"); - x86_64_start_reservations(real_mode_data); } --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ head-2011-03-17/arch/x86/kernel/apic/io_apic-xen.c 2011-02-01 14:39:24.000000000 +0100 @@ -0,0 +1,3937 @@ +/* + * Intel IO-APIC support for multi-Pentium hosts. + * + * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo + * + * Many thanks to Stig Venaas for trying out countless experimental + * patches and reporting/debugging problems patiently! + * + * (c) 1999, Multiple IO-APIC support, developed by + * Ken-ichi Yaku and + * Hidemi Kishimoto , + * further tested and cleaned up by Zach Brown + * and Ingo Molnar + * + * Fixes + * Maciej W. Rozycki : Bits for genuine 82489DX APICs; + * thanks to Eric Gilmore + * and Rolf G. Tews + * for testing these extensively + * Paul Diefenbaugh : Added full ACPI support + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* time_after() */ +#ifdef CONFIG_ACPI +#include +#endif +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#ifdef CONFIG_XEN +#include +#include +#include + +/* Fake i8259 */ +#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq))) +#define disable_8259A_irq(_irq) ((void)0) +#define i8259A_irq_pending(_irq) (0) + +unsigned long io_apic_irqs; +#endif /* CONFIG_XEN */ + +#define __apicdebuginit(type) static type __init + +/* + * Is the SiS APIC rmw bug present ? + * -1 = don't know, 0 = no, 1 = yes + */ +int sis_apic_bug = -1; + +static DEFINE_SPINLOCK(ioapic_lock); +#ifndef CONFIG_XEN +static DEFINE_SPINLOCK(vector_lock); +#endif + +/* + * # of IRQ routing registers + */ +int nr_ioapic_registers[MAX_IO_APICS]; + +/* I/O APIC entries */ +struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; +int nr_ioapics; + +/* MP IRQ source entries */ +struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; + +/* # of MP IRQ source entries */ +int mp_irq_entries; + +#if defined (CONFIG_MCA) || defined (CONFIG_EISA) +int mp_bus_id_to_type[MAX_MP_BUSSES]; +#endif + +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); + +int skip_ioapic_setup; + +static int __init parse_noapic(char *str) +{ + /* disable IO-APIC */ + disable_ioapic_setup(); + return 0; +} +early_param("noapic", parse_noapic); + +struct irq_pin_list; +struct irq_cfg { +#ifndef CONFIG_XEN + unsigned int irq; + struct irq_pin_list *irq_2_pin; + cpumask_t domain; + cpumask_t old_domain; + unsigned move_cleanup_count; +#endif + u8 vector; +#ifndef CONFIG_XEN + u8 move_in_progress : 1; +#endif +}; + +/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ +static struct irq_cfg irq_cfgx[NR_IRQS] = { + [0] = { .irq = 0 }, + [1] = { .irq = 1 }, + [2] = { .irq = 2 }, + [3] = { .irq = 3 }, + [4] = { .irq = 4 }, + [5] = { .irq = 5 }, + [6] = { .irq = 6 }, + [7] = { .irq = 7 }, + [8] = { .irq = 8 }, + [9] = { .irq = 9 }, + [10] = { .irq = 10 }, + [11] = { .irq = 11 }, + [12] = { .irq = 12 }, + [13] = { .irq = 13 }, + [14] = { .irq = 14 }, + [15] = { .irq = 15 }, +}; + +#define for_each_irq_cfg(irq, cfg) \ + for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++) + +static struct irq_cfg *irq_cfg(unsigned int irq) +{ + return irq < nr_irqs ? irq_cfgx + irq : NULL; +} + +static struct irq_cfg *irq_cfg_alloc(unsigned int irq) +{ + return irq_cfg(irq); +} + +#ifdef CONFIG_XEN +#define irq_2_pin_init() +#define add_pin_to_irq(irq, apic, pin) +#else +/* + * Rough estimation of how many shared IRQs there are, can be changed + * anytime. + */ +#define MAX_PLUS_SHARED_IRQS NR_IRQS +#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) + +/* + * This is performance-critical, we want to do it O(1) + * + * the indexing order of this array favors 1:1 mappings + * between pins and IRQs. + */ + +struct irq_pin_list { + int apic, pin; + struct irq_pin_list *next; +}; + +static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE]; +static struct irq_pin_list *irq_2_pin_ptr; + +static void __init irq_2_pin_init(void) +{ + struct irq_pin_list *pin = irq_2_pin_head; + int i; + + for (i = 1; i < PIN_MAP_SIZE; i++) + pin[i-1].next = &pin[i]; + + irq_2_pin_ptr = &pin[0]; +} + +static struct irq_pin_list *get_one_free_irq_2_pin(void) +{ + struct irq_pin_list *pin = irq_2_pin_ptr; + + if (!pin) + panic("can not get more irq_2_pin\n"); + + irq_2_pin_ptr = pin->next; + pin->next = NULL; + return pin; +} + +struct io_apic { + unsigned int index; + unsigned int unused[3]; + unsigned int data; +}; + +static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) +{ + return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) + + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK); +} +#endif + +static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) +{ +#ifndef CONFIG_XEN + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); + return readl(&io_apic->data); +#else + struct physdev_apic apic_op; + int ret; + + apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr; + apic_op.reg = reg; + ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op); + if (ret) + return ret; + return apic_op.value; +#endif +} + +static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +{ +#ifndef CONFIG_XEN + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); + writel(value, &io_apic->data); +#else + struct physdev_apic apic_op; + + apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr; + apic_op.reg = reg; + apic_op.value = value; + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op)); +#endif +} + +#ifdef CONFIG_XEN +#define io_apic_modify io_apic_write +#else +/* + * Re-write a value: to be used for read-modify-write + * cycles where the read already set up the index register. + * + * Older SiS APIC requires we rewrite the index register + */ +static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + + if (sis_apic_bug) + writel(reg, &io_apic->index); + writel(value, &io_apic->data); +} + +static bool io_apic_level_ack_pending(unsigned int irq) +{ + struct irq_pin_list *entry; + unsigned long flags; + struct irq_cfg *cfg = irq_cfg(irq); + + spin_lock_irqsave(&ioapic_lock, flags); + entry = cfg->irq_2_pin; + for (;;) { + unsigned int reg; + int pin; + + if (!entry) + break; + pin = entry->pin; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + /* Is the remote IRR bit set? */ + if (reg & IO_APIC_REDIR_REMOTE_IRR) { + spin_unlock_irqrestore(&ioapic_lock, flags); + return true; + } + if (!entry->next) + break; + entry = entry->next; + } + spin_unlock_irqrestore(&ioapic_lock, flags); + + return false; +} +#endif /* CONFIG_XEN */ + +union entry_union { + struct { u32 w1, w2; }; + struct IO_APIC_route_entry entry; +}; + +#ifndef CONFIG_XEN +static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) +{ + union entry_union eu; + unsigned long flags; + spin_lock_irqsave(&ioapic_lock, flags); + eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); + eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + return eu.entry; +} +#endif + +/* + * When we write a new IO APIC routing entry, we need to write the high + * word first! If the mask bit in the low word is clear, we will enable + * the interrupt, and we need to make sure the entry is fully populated + * before that happens. + */ +static void +__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +{ + union entry_union eu; + eu.entry = e; + io_apic_write(apic, 0x11 + 2*pin, eu.w2); + io_apic_write(apic, 0x10 + 2*pin, eu.w1); +} + +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +{ + unsigned long flags; + spin_lock_irqsave(&ioapic_lock, flags); + __ioapic_write_entry(apic, pin, e); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +#ifndef CONFIG_XEN +/* + * When we mask an IO APIC routing entry, we need to write the low + * word first, in order to set the mask bit before we change the + * high bits! + */ +static void ioapic_mask_entry(int apic, int pin) +{ + unsigned long flags; + union entry_union eu = { .entry.mask = 1 }; + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x10 + 2*pin, eu.w1); + io_apic_write(apic, 0x11 + 2*pin, eu.w2); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +#ifdef CONFIG_SMP +static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) +{ + int apic, pin; + struct irq_cfg *cfg; + struct irq_pin_list *entry; + + cfg = irq_cfg(irq); + entry = cfg->irq_2_pin; + for (;;) { + unsigned int reg; + + if (!entry) + break; + + apic = entry->apic; + pin = entry->pin; +#ifdef CONFIG_INTR_REMAP + /* + * With interrupt-remapping, destination information comes + * from interrupt-remapping table entry. + */ + if (!irq_remapped(irq)) + io_apic_write(apic, 0x11 + pin*2, dest); +#else + io_apic_write(apic, 0x11 + pin*2, dest); +#endif + reg = io_apic_read(apic, 0x10 + pin*2); + reg &= ~IO_APIC_REDIR_VECTOR_MASK; + reg |= vector; + io_apic_modify(apic, 0x10 + pin*2, reg); + if (!entry->next) + break; + entry = entry->next; + } +} + +static int assign_irq_vector(int irq, cpumask_t mask); + +static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) +{ + struct irq_cfg *cfg; + unsigned long flags; + unsigned int dest; + cpumask_t tmp; + struct irq_desc *desc; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + cfg = irq_cfg(irq); + if (assign_irq_vector(irq, mask)) + return; + + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + /* + * Only the high 8 bits are valid. + */ + dest = SET_APIC_LOGICAL_ID(dest); + + desc = irq_to_desc(irq); + spin_lock_irqsave(&ioapic_lock, flags); + __target_IO_APIC_irq(irq, dest, cfg->vector); + desc->affinity = mask; + spin_unlock_irqrestore(&ioapic_lock, flags); +} +#endif /* CONFIG_SMP */ + +/* + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are + * shared ISA-space IRQs, so we have to support them. We are super + * fast in the common case, and fast for shared ISA-space IRQs. + */ +static void add_pin_to_irq(unsigned int irq, int apic, int pin) +{ + struct irq_cfg *cfg; + struct irq_pin_list *entry; + + /* first time to refer irq_cfg, so with new */ + cfg = irq_cfg_alloc(irq); + entry = cfg->irq_2_pin; + if (!entry) { + entry = get_one_free_irq_2_pin(); + cfg->irq_2_pin = entry; + entry->apic = apic; + entry->pin = pin; + return; + } + + while (entry->next) { + /* not again, please */ + if (entry->apic == apic && entry->pin == pin) + return; + + entry = entry->next; + } + + entry->next = get_one_free_irq_2_pin(); + entry = entry->next; + entry->apic = apic; + entry->pin = pin; +} + +/* + * Reroute an IRQ to a different pin. + */ +static void __init replace_pin_at_irq(unsigned int irq, + int oldapic, int oldpin, + int newapic, int newpin) +{ + struct irq_cfg *cfg = irq_cfg(irq); + struct irq_pin_list *entry = cfg->irq_2_pin; + int replaced = 0; + + while (entry) { + if (entry->apic == oldapic && entry->pin == oldpin) { + entry->apic = newapic; + entry->pin = newpin; + replaced = 1; + /* every one is different, right? */ + break; + } + entry = entry->next; + } + + /* why? call replace before add? */ + if (!replaced) + add_pin_to_irq(irq, newapic, newpin); +} + +static inline void io_apic_modify_irq(unsigned int irq, + int mask_and, int mask_or, + void (*final)(struct irq_pin_list *entry)) +{ + int pin; + struct irq_cfg *cfg; + struct irq_pin_list *entry; + + cfg = irq_cfg(irq); + for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { + unsigned int reg; + pin = entry->pin; + reg = io_apic_read(entry->apic, 0x10 + pin * 2); + reg &= mask_and; + reg |= mask_or; + io_apic_modify(entry->apic, 0x10 + pin * 2, reg); + if (final) + final(entry); + } +} + +static void __unmask_IO_APIC_irq(unsigned int irq) +{ + io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL); +} + +#ifdef CONFIG_X86_64 +void io_apic_sync(struct irq_pin_list *entry) +{ + /* + * Synchronize the IO-APIC and the CPU by doing + * a dummy read from the IO-APIC + */ + struct io_apic __iomem *io_apic; + io_apic = io_apic_base(entry->apic); + readl(&io_apic->data); +} + +static void __mask_IO_APIC_irq(unsigned int irq) +{ + io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); +} +#else /* CONFIG_X86_32 */ +static void __mask_IO_APIC_irq(unsigned int irq) +{ + io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL); +} + +static void __mask_and_edge_IO_APIC_irq(unsigned int irq) +{ + io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER, + IO_APIC_REDIR_MASKED, NULL); +} + +static void __unmask_and_level_IO_APIC_irq(unsigned int irq) +{ + io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, + IO_APIC_REDIR_LEVEL_TRIGGER, NULL); +} +#endif /* CONFIG_X86_32 */ + +static void mask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __mask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void unmask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __unmask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) +{ + struct IO_APIC_route_entry entry; + + /* Check delivery_mode to be sure we're not clearing an SMI pin */ + entry = ioapic_read_entry(apic, pin); + if (entry.delivery_mode == dest_SMI) + return; + /* + * Disable it in the IO-APIC irq-routing table: + */ + ioapic_mask_entry(apic, pin); +} + +static void clear_IO_APIC (void) +{ + int apic, pin; + + for (apic = 0; apic < nr_ioapics; apic++) + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) + clear_IO_APIC_pin(apic, pin); +} + +#if !defined(CONFIG_SMP) && defined(CONFIG_X86_32) +void send_IPI_self(int vector) +{ + unsigned int cfg; + + /* + * Wait for idle. + */ + apic_wait_icr_idle(); + cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL; + /* + * Send the IPI. The write to APIC_ICR fires this off. + */ + apic_write(APIC_ICR, cfg); +} +#endif /* !CONFIG_SMP && CONFIG_X86_32*/ +#endif /* !CONFIG_XEN */ + +#ifdef CONFIG_X86_32 +/* + * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to + * specific CPU-side IRQs. + */ + +#define MAX_PIRQS 8 +static int pirq_entries [MAX_PIRQS]; +static int pirqs_enabled; + +static int __init ioapic_pirq_setup(char *str) +{ + int i, max; + int ints[MAX_PIRQS+1]; + + get_options(str, ARRAY_SIZE(ints), ints); + + for (i = 0; i < MAX_PIRQS; i++) + pirq_entries[i] = -1; + + pirqs_enabled = 1; + apic_printk(APIC_VERBOSE, KERN_INFO + "PIRQ redirection, working around broken MP-BIOS.\n"); + max = MAX_PIRQS; + if (ints[0] < MAX_PIRQS) + max = ints[0]; + + for (i = 0; i < max; i++) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); + /* + * PIRQs are mapped upside down, usually. + */ + pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; + } + return 1; +} + +__setup("pirq=", ioapic_pirq_setup); +#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_INTR_REMAP +/* I/O APIC RTE contents at the OS boot up */ +static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS]; + +/* + * Saves and masks all the unmasked IO-APIC RTE's + */ +int save_mask_IO_APIC_setup(void) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + int apic, pin; + + /* + * The number of IO-APIC IRQ registers (== #pins): + */ + for (apic = 0; apic < nr_ioapics; apic++) { + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(apic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + nr_ioapic_registers[apic] = reg_01.bits.entries+1; + } + + for (apic = 0; apic < nr_ioapics; apic++) { + early_ioapic_entries[apic] = + kzalloc(sizeof(struct IO_APIC_route_entry) * + nr_ioapic_registers[apic], GFP_KERNEL); + if (!early_ioapic_entries[apic]) + goto nomem; + } + + for (apic = 0; apic < nr_ioapics; apic++) + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + struct IO_APIC_route_entry entry; + + entry = early_ioapic_entries[apic][pin] = + ioapic_read_entry(apic, pin); + if (!entry.mask) { + entry.mask = 1; + ioapic_write_entry(apic, pin, entry); + } + } + + return 0; + +nomem: + while (apic >= 0) + kfree(early_ioapic_entries[apic--]); + memset(early_ioapic_entries, 0, + ARRAY_SIZE(early_ioapic_entries)); + + return -ENOMEM; +} + +void restore_IO_APIC_setup(void) +{ + int apic, pin; + + for (apic = 0; apic < nr_ioapics; apic++) { + if (!early_ioapic_entries[apic]) + break; + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) + ioapic_write_entry(apic, pin, + early_ioapic_entries[apic][pin]); + kfree(early_ioapic_entries[apic]); + early_ioapic_entries[apic] = NULL; + } +} + +void reinit_intr_remapped_IO_APIC(int intr_remapping) +{ + /* + * for now plain restore of previous settings. + * TBD: In the case of OS enabling interrupt-remapping, + * IO-APIC RTE's need to be setup to point to interrupt-remapping + * table entries. for now, do a plain restore, and wait for + * the setup_IO_APIC_irqs() to do proper initialization. + */ + restore_IO_APIC_setup(); +} +#endif + +/* + * Find the IRQ entry number of a certain pin. + */ +static int find_irq_entry(int apic, int pin, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].mp_irqtype == type && + (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid || + mp_irqs[i].mp_dstapic == MP_APIC_ALL) && + mp_irqs[i].mp_dstirq == pin) + return i; + + return -1; +} + +#ifndef CONFIG_XEN +/* + * Find the pin to which IRQ[irq] (ISA) is connected + */ +static int __init find_isa_irq_pin(int irq, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].mp_srcbus; + + if (test_bit(lbus, mp_bus_not_pci) && + (mp_irqs[i].mp_irqtype == type) && + (mp_irqs[i].mp_srcbusirq == irq)) + + return mp_irqs[i].mp_dstirq; + } + return -1; +} + +static int __init find_isa_irq_apic(int irq, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].mp_srcbus; + + if (test_bit(lbus, mp_bus_not_pci) && + (mp_irqs[i].mp_irqtype == type) && + (mp_irqs[i].mp_srcbusirq == irq)) + break; + } + if (i < mp_irq_entries) { + int apic; + for(apic = 0; apic < nr_ioapics; apic++) { + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic) + return apic; + } + } + + return -1; +} +#endif + +/* + * Find a specific PCI IRQ entry. + * Not an __init, possibly needed by modules + */ +static int pin_2_irq(int idx, int apic, int pin); + +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) +{ + int apic, i, best_guess = -1; + + apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", + bus, slot, pin); + if (test_bit(bus, mp_bus_not_pci)) { + apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); + return -1; + } + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].mp_srcbus; + + for (apic = 0; apic < nr_ioapics; apic++) + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic || + mp_irqs[i].mp_dstapic == MP_APIC_ALL) + break; + + if (!test_bit(lbus, mp_bus_not_pci) && + !mp_irqs[i].mp_irqtype && + (bus == lbus) && + (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) { + int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq); + + if (!(apic || IO_APIC_IRQ(irq))) + continue; + + if (pin == (mp_irqs[i].mp_srcbusirq & 3)) + return irq; + /* + * Use the first all-but-pin matching entry as a + * best-guess fuzzy result for broken mptables. + */ + if (best_guess < 0) + best_guess = irq; + } + } + return best_guess; +} + +EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); + +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) +/* + * EISA Edge/Level control register, ELCR + */ +static int EISA_ELCR(unsigned int irq) +{ + if (irq < 16) { + unsigned int port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; + } + apic_printk(APIC_VERBOSE, KERN_INFO + "Broken MPtable reports ISA irq %d\n", irq); + return 0; +} + +#endif + +/* ISA interrupts are always polarity zero edge triggered, + * when listed as conforming in the MP table. */ + +#define default_ISA_trigger(idx) (0) +#define default_ISA_polarity(idx) (0) + +/* EISA interrupts are always polarity zero and can be edge or level + * trigger depending on the ELCR value. If an interrupt is listed as + * EISA conforming in the MP table, that means its trigger type must + * be read in from the ELCR */ + +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq)) +#define default_EISA_polarity(idx) default_ISA_polarity(idx) + +/* PCI interrupts are always polarity one level triggered, + * when listed as conforming in the MP table. */ + +#define default_PCI_trigger(idx) (1) +#define default_PCI_polarity(idx) (1) + +/* MCA interrupts are always polarity zero level triggered, + * when listed as conforming in the MP table. */ + +#define default_MCA_trigger(idx) (1) +#define default_MCA_polarity(idx) default_ISA_polarity(idx) + +static int MPBIOS_polarity(int idx) +{ + int bus = mp_irqs[idx].mp_srcbus; + int polarity; + + /* + * Determine IRQ line polarity (high active or low active): + */ + switch (mp_irqs[idx].mp_irqflag & 3) + { + case 0: /* conforms, ie. bus-type dependent polarity */ + if (test_bit(bus, mp_bus_not_pci)) + polarity = default_ISA_polarity(idx); + else + polarity = default_PCI_polarity(idx); + break; + case 1: /* high active */ + { + polarity = 0; + break; + } + case 2: /* reserved */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + case 3: /* low active */ + { + polarity = 1; + break; + } + default: /* invalid */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + } + return polarity; +} + +static int MPBIOS_trigger(int idx) +{ + int bus = mp_irqs[idx].mp_srcbus; + int trigger; + + /* + * Determine IRQ trigger mode (edge or level sensitive): + */ + switch ((mp_irqs[idx].mp_irqflag>>2) & 3) + { + case 0: /* conforms, ie. bus-type dependent */ + if (test_bit(bus, mp_bus_not_pci)) + trigger = default_ISA_trigger(idx); + else + trigger = default_PCI_trigger(idx); +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) + switch (mp_bus_id_to_type[bus]) { + case MP_BUS_ISA: /* ISA pin */ + { + /* set before the switch */ + break; + } + case MP_BUS_EISA: /* EISA pin */ + { + trigger = default_EISA_trigger(idx); + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + /* set before the switch */ + break; + } + case MP_BUS_MCA: /* MCA pin */ + { + trigger = default_MCA_trigger(idx); + break; + } + default: + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; + break; + } + } +#endif + break; + case 1: /* edge */ + { + trigger = 0; + break; + } + case 2: /* reserved */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; + break; + } + case 3: /* level */ + { + trigger = 1; + break; + } + default: /* invalid */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 0; + break; + } + } + return trigger; +} + +static inline int irq_polarity(int idx) +{ + return MPBIOS_polarity(idx); +} + +static inline int irq_trigger(int idx) +{ + return MPBIOS_trigger(idx); +} + +int (*ioapic_renumber_irq)(int ioapic, int irq); +static int pin_2_irq(int idx, int apic, int pin) +{ + int irq, i; + int bus = mp_irqs[idx].mp_srcbus; + + /* + * Debugging check, we are in big trouble if this message pops up! + */ + if (mp_irqs[idx].mp_dstirq != pin) + printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); + + if (test_bit(bus, mp_bus_not_pci)) { + irq = mp_irqs[idx].mp_srcbusirq; + } else { + /* + * PCI IRQs are mapped in order + */ + i = irq = 0; + while (i < apic) + irq += nr_ioapic_registers[i++]; + irq += pin; + /* + * For MPS mode, so far only needed by ES7000 platform + */ + if (ioapic_renumber_irq) + irq = ioapic_renumber_irq(apic, irq); + } + +#ifdef CONFIG_X86_32 + /* + * PCI IRQ command line redirection. Yes, limits are hardcoded. + */ + if ((pin >= 16) && (pin <= 23)) { + if (pirq_entries[pin-16] != -1) { + if (!pirq_entries[pin-16]) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + "disabling PIRQ%d\n", pin-16); + } else { + irq = pirq_entries[pin-16]; + apic_printk(APIC_VERBOSE, KERN_DEBUG + "using PIRQ%d -> IRQ %d\n", + pin-16, irq); + } + } + } +#endif + + return irq; +} + +#ifndef CONFIG_XEN +void lock_vector_lock(void) +{ + /* Used to the online set of cpus does not change + * during assign_irq_vector. + */ + spin_lock(&vector_lock); +} + +void unlock_vector_lock(void) +{ + spin_unlock(&vector_lock); +} +#endif + +static int assign_irq_vector(int irq, cpumask_t mask) +{ + struct physdev_irq irq_op; + struct irq_cfg *cfg; + + if (irq < PIRQ_BASE || irq - PIRQ_BASE >= NR_PIRQS) + return -EINVAL; + + cfg = irq_cfg(irq); + + if (cfg->vector) + return 0; + + irq_op.irq = irq; + if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) + return -ENOSPC; + + cfg->vector = irq_op.vector; + + return 0; +} + +#ifndef CONFIG_XEN +static void __clear_irq_vector(int irq) +{ + struct irq_cfg *cfg; + cpumask_t mask; + int cpu, vector; + + cfg = irq_cfg(irq); + BUG_ON(!cfg->vector); + + vector = cfg->vector; + cpus_and(mask, cfg->domain, cpu_online_map); + for_each_cpu_mask_nr(cpu, mask) + per_cpu(vector_irq, cpu)[vector] = -1; + + cfg->vector = 0; + cpus_clear(cfg->domain); + + if (likely(!cfg->move_in_progress)) + return; + cpus_and(mask, cfg->old_domain, cpu_online_map); + for_each_cpu_mask_nr(cpu, mask) { + for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; + vector++) { + if (per_cpu(vector_irq, cpu)[vector] != irq) + continue; + per_cpu(vector_irq, cpu)[vector] = -1; + break; + } + } + cfg->move_in_progress = 0; +} + +void __setup_vector_irq(int cpu) +{ + /* Initialize vector_irq on a new cpu */ + /* This function must be called with vector_lock held */ + int irq, vector; + struct irq_cfg *cfg; + + /* Mark the inuse vectors */ + for_each_irq_cfg(irq, cfg) { + if (!cpu_isset(cpu, cfg->domain)) + continue; + vector = cfg->vector; + per_cpu(vector_irq, cpu)[vector] = irq; + } + /* Mark the free vectors */ + for (vector = 0; vector < NR_VECTORS; ++vector) { + irq = per_cpu(vector_irq, cpu)[vector]; + if (irq < 0) + continue; + + cfg = irq_cfg(irq); + if (!cpu_isset(cpu, cfg->domain)) + per_cpu(vector_irq, cpu)[vector] = -1; + } +} + +static struct irq_chip ioapic_chip; +#ifdef CONFIG_INTR_REMAP +static struct irq_chip ir_ioapic_chip; +#endif + +#define IOAPIC_AUTO -1 +#define IOAPIC_EDGE 0 +#define IOAPIC_LEVEL 1 + +#ifdef CONFIG_X86_32 +static inline int IO_APIC_irq_trigger(int irq) +{ + int apic, idx, pin; + + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + idx = find_irq_entry(apic, pin, mp_INT); + if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) + return irq_trigger(idx); + } + } + /* + * nonexistent IRQs are edge default + */ + return 0; +} +#else +static inline int IO_APIC_irq_trigger(int irq) +{ + return 1; +} +#endif + +static void ioapic_register_intr(int irq, unsigned long trigger) +{ + struct irq_desc *desc; + + desc = irq_to_desc(irq); + + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) + desc->status |= IRQ_LEVEL; + else + desc->status &= ~IRQ_LEVEL; + +#ifdef CONFIG_INTR_REMAP + if (irq_remapped(irq)) { + desc->status |= IRQ_MOVE_PCNTXT; + if (trigger) + set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, + handle_fasteoi_irq, + "fasteoi"); + else + set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, + handle_edge_irq, "edge"); + return; + } +#endif + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) + set_irq_chip_and_handler_name(irq, &ioapic_chip, + handle_fasteoi_irq, + "fasteoi"); + else + set_irq_chip_and_handler_name(irq, &ioapic_chip, + handle_edge_irq, "edge"); +} +#else /* !CONFIG_XEN */ +#define __clear_irq_vector(irq) ((void)(irq)) +#define ioapic_register_intr(irq, trigger) evtchn_register_pirq(irq) +#endif + +static int setup_ioapic_entry(int apic, int irq, + struct IO_APIC_route_entry *entry, + unsigned int destination, int trigger, + int polarity, int vector) +{ + /* + * add it to the IO-APIC irq-routing table: + */ + memset(entry,0,sizeof(*entry)); + +#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) { + struct intel_iommu *iommu = map_ioapic_to_ir(apic); + struct irte irte; + struct IR_IO_APIC_route_entry *ir_entry = + (struct IR_IO_APIC_route_entry *) entry; + int index; + + if (!iommu) + panic("No mapping iommu for ioapic %d\n", apic); + + index = alloc_irte(iommu, irq, 1); + if (index < 0) + panic("Failed to allocate IRTE for ioapic %d\n", apic); + + memset(&irte, 0, sizeof(irte)); + + irte.present = 1; + irte.dst_mode = INT_DEST_MODE; + irte.trigger_mode = trigger; + irte.dlvry_mode = INT_DELIVERY_MODE; + irte.vector = vector; + irte.dest_id = IRTE_DEST(destination); + + modify_irte(irq, &irte); + + ir_entry->index2 = (index >> 15) & 0x1; + ir_entry->zero = 0; + ir_entry->format = 1; + ir_entry->index = (index & 0x7fff); + } else +#endif + { + entry->delivery_mode = INT_DELIVERY_MODE; + entry->dest_mode = INT_DEST_MODE; + entry->dest = destination; + } + + entry->mask = 0; /* enable IRQ */ + entry->trigger = trigger; + entry->polarity = polarity; + entry->vector = vector; + + /* Mask level triggered irqs. + * Use IRQ_DELAYED_DISABLE for edge triggered irqs. + */ + if (trigger) + entry->mask = 1; + return 0; +} + +static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, + int trigger, int polarity) +{ + struct irq_cfg *cfg; + struct IO_APIC_route_entry entry; + cpumask_t mask; + + if (!IO_APIC_IRQ(irq)) + return; + + cfg = irq_cfg(irq); + + mask = TARGET_CPUS; + if (assign_irq_vector(irq, mask)) + return; + +#ifndef CONFIG_XEN + cpus_and(mask, cfg->domain, mask); +#endif + + apic_printk(APIC_VERBOSE,KERN_DEBUG + "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " + "IRQ %d Mode:%i Active:%i)\n", + apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector, + irq, trigger, polarity); + + + if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry, + cpu_mask_to_apicid(mask), trigger, polarity, + cfg->vector)) { + printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", + mp_ioapics[apic].mp_apicid, pin); + __clear_irq_vector(irq); + return; + } + + ioapic_register_intr(irq, trigger); + if (irq < 16) + disable_8259A_irq(irq); + + ioapic_write_entry(apic, pin, entry); +} + +static void __init setup_IO_APIC_irqs(void) +{ + int apic, pin, idx, irq; + int notcon = 0; + + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); + + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + + idx = find_irq_entry(apic, pin, mp_INT); + if (idx == -1) { + if (!notcon) { + notcon = 1; + apic_printk(APIC_VERBOSE, + KERN_DEBUG " %d-%d", + mp_ioapics[apic].mp_apicid, + pin); + } else + apic_printk(APIC_VERBOSE, " %d-%d", + mp_ioapics[apic].mp_apicid, + pin); + continue; + } + if (notcon) { + apic_printk(APIC_VERBOSE, + " (apicid-pin) not connected\n"); + notcon = 0; + } + + irq = pin_2_irq(idx, apic, pin); +#if defined(CONFIG_XEN) + if (irq < PIRQ_BASE || irq >= PIRQ_BASE + NR_PIRQS) + continue; +#elif defined(CONFIG_X86_32) + if (multi_timer_check(apic, irq)) + continue; +#endif + add_pin_to_irq(irq, apic, pin); + + setup_IO_APIC_irq(apic, pin, irq, + irq_trigger(idx), irq_polarity(idx)); + } + } + + if (notcon) + apic_printk(APIC_VERBOSE, + " (apicid-pin) not connected\n"); +} + +#ifndef CONFIG_XEN +/* + * Set up the timer pin, possibly with the 8259A-master behind. + */ +static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, + int vector) +{ + struct IO_APIC_route_entry entry; + +#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) + return; +#endif + + memset(&entry, 0, sizeof(entry)); + + /* + * We use logical delivery to get the timer IRQ + * to the first CPU. + */ + entry.dest_mode = INT_DEST_MODE; + entry.mask = 1; /* mask IRQ now */ + entry.dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.delivery_mode = INT_DELIVERY_MODE; + entry.polarity = 0; + entry.trigger = 0; + entry.vector = vector; + + /* + * The timer IRQ doesn't have to know that behind the + * scene we may have a 8259A-master in AEOI mode ... + */ + set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); + + /* + * Add it to the IO-APIC irq-routing table: + */ + ioapic_write_entry(apic, pin, entry); +} + + +__apicdebuginit(void) print_IO_APIC(void) +{ + int apic, i; + union IO_APIC_reg_00 reg_00; + union IO_APIC_reg_01 reg_01; + union IO_APIC_reg_02 reg_02; + union IO_APIC_reg_03 reg_03; + unsigned long flags; + struct irq_cfg *cfg; + unsigned int irq; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); + for (i = 0; i < nr_ioapics; i++) + printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", + mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]); + + /* + * We are a bit conservative about what we expect. We have to + * know about every hardware change ASAP. + */ + printk(KERN_INFO "testing the IO APIC.......................\n"); + + for (apic = 0; apic < nr_ioapics; apic++) { + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + reg_01.raw = io_apic_read(apic, 1); + if (reg_01.bits.version >= 0x10) + reg_02.raw = io_apic_read(apic, 2); + if (reg_01.bits.version >= 0x20) + reg_03.raw = io_apic_read(apic, 3); + spin_unlock_irqrestore(&ioapic_lock, flags); + + printk("\n"); + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); + printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); + printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); + printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); + printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); + + printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); + printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); + + printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); + printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); + + /* + * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, + * but the value of reg_02 is read as the previous read register + * value, so ignore it if reg_02 == reg_01. + */ + if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { + printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); + printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); + } + + /* + * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 + * or reg_03, but the value of reg_0[23] is read as the previous read + * register value, so ignore it if reg_03 == reg_0[12]. + */ + if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && + reg_03.raw != reg_01.raw) { + printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); + printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); + } + + printk(KERN_DEBUG ".... IRQ redirection table:\n"); + + printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" + " Stat Dmod Deli Vect: \n"); + + for (i = 0; i <= reg_01.bits.entries; i++) { + struct IO_APIC_route_entry entry; + + entry = ioapic_read_entry(apic, i); + + printk(KERN_DEBUG " %02x %03X ", + i, + entry.dest + ); + + printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", + entry.mask, + entry.trigger, + entry.irr, + entry.polarity, + entry.delivery_status, + entry.dest_mode, + entry.delivery_mode, + entry.vector + ); + } + } + printk(KERN_DEBUG "IRQ to pin mappings:\n"); + for_each_irq_cfg(irq, cfg) { + struct irq_pin_list *entry = cfg->irq_2_pin; + if (!entry) + continue; + printk(KERN_DEBUG "IRQ%d ", irq); + for (;;) { + printk("-> %d:%d", entry->apic, entry->pin); + if (!entry->next) + break; + entry = entry->next; + } + printk("\n"); + } + + printk(KERN_INFO ".................................... done.\n"); + + return; +} + +__apicdebuginit(void) print_APIC_bitfield(int base) +{ + unsigned int v; + int i, j; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); + for (i = 0; i < 8; i++) { + v = apic_read(base + i*0x10); + for (j = 0; j < 32; j++) { + if (v & (1< 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + + v = apic_read(APIC_ESR); + printk(KERN_DEBUG "... APIC ESR: %08x\n", v); + } + + icr = apic_icr_read(); + printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr); + printk(KERN_DEBUG "... APIC ICR2: %08x\n", (u32)(icr >> 32)); + + v = apic_read(APIC_LVTT); + printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); + + if (maxlvt > 3) { /* PC is LVT#4. */ + v = apic_read(APIC_LVTPC); + printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); + } + v = apic_read(APIC_LVT0); + printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); + v = apic_read(APIC_LVT1); + printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); + + if (maxlvt > 2) { /* ERR is LVT#3. */ + v = apic_read(APIC_LVTERR); + printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); + } + + v = apic_read(APIC_TMICT); + printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); + v = apic_read(APIC_TMCCT); + printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); + v = apic_read(APIC_TDCR); + printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); + printk("\n"); +} + +__apicdebuginit(void) print_all_local_APICs(void) +{ + int cpu; + + preempt_disable(); + for_each_online_cpu(cpu) + smp_call_function_single(cpu, print_local_APIC, NULL, 1); + preempt_enable(); +} + +__apicdebuginit(void) print_PIC(void) +{ + unsigned int v; + unsigned long flags; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "\nprinting PIC contents\n"); + + spin_lock_irqsave(&i8259A_lock, flags); + + v = inb(0xa1) << 8 | inb(0x21); + printk(KERN_DEBUG "... PIC IMR: %04x\n", v); + + v = inb(0xa0) << 8 | inb(0x20); + printk(KERN_DEBUG "... PIC IRR: %04x\n", v); + + outb(0x0b,0xa0); + outb(0x0b,0x20); + v = inb(0xa0) << 8 | inb(0x20); + outb(0x0a,0xa0); + outb(0x0a,0x20); + + spin_unlock_irqrestore(&i8259A_lock, flags); + + printk(KERN_DEBUG "... PIC ISR: %04x\n", v); + + v = inb(0x4d1) << 8 | inb(0x4d0); + printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); +} + +__apicdebuginit(int) print_all_ICs(void) +{ + print_PIC(); + print_all_local_APICs(); + print_IO_APIC(); + + return 0; +} + +fs_initcall(print_all_ICs); + + +/* Where if anywhere is the i8259 connect in external int mode */ +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; +#endif /* !CONFIG_XEN */ + +void __init enable_IO_APIC(void) +{ + union IO_APIC_reg_01 reg_01; +#ifndef CONFIG_XEN + int i8259_apic, i8259_pin; +#endif + int apic; + unsigned long flags; + +#ifdef CONFIG_X86_32 + int i; + if (!pirqs_enabled) + for (i = 0; i < MAX_PIRQS; i++) + pirq_entries[i] = -1; +#endif + + /* + * The number of IO-APIC IRQ registers (== #pins): + */ + for (apic = 0; apic < nr_ioapics; apic++) { + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(apic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + nr_ioapic_registers[apic] = reg_01.bits.entries+1; + } +#ifndef CONFIG_XEN + for(apic = 0; apic < nr_ioapics; apic++) { + int pin; + /* See if any of the pins is in ExtINT mode */ + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + struct IO_APIC_route_entry entry; + entry = ioapic_read_entry(apic, pin); + + /* If the interrupt line is enabled and in ExtInt mode + * I have found the pin where the i8259 is connected. + */ + if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { + ioapic_i8259.apic = apic; + ioapic_i8259.pin = pin; + goto found_i8259; + } + } + } + found_i8259: + /* Look to see what if the MP table has reported the ExtINT */ + /* If we could not find the appropriate pin by looking at the ioapic + * the i8259 probably is not connected the ioapic but give the + * mptable a chance anyway. + */ + i8259_pin = find_isa_irq_pin(0, mp_ExtINT); + i8259_apic = find_isa_irq_apic(0, mp_ExtINT); + /* Trust the MP table if nothing is setup in the hardware */ + if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { + printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); + ioapic_i8259.pin = i8259_pin; + ioapic_i8259.apic = i8259_apic; + } + /* Complain if the MP table and the hardware disagree */ + if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && + (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) + { + printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); + } + + /* + * Do not trust the IO-APIC being empty at bootup + */ + clear_IO_APIC(); +#endif +} + +#ifndef CONFIG_XEN +/* + * Not an __init, needed by the reboot code + */ +void disable_IO_APIC(void) +{ + /* + * Clear the IO-APIC before rebooting: + */ + clear_IO_APIC(); + + /* + * If the i8259 is routed through an IOAPIC + * Put that IOAPIC in virtual wire mode + * so legacy interrupts can be delivered. + */ + if (ioapic_i8259.pin != -1) { + struct IO_APIC_route_entry entry; + + memset(&entry, 0, sizeof(entry)); + entry.mask = 0; /* Enabled */ + entry.trigger = 0; /* Edge */ + entry.irr = 0; + entry.polarity = 0; /* High */ + entry.delivery_status = 0; + entry.dest_mode = 0; /* Physical */ + entry.delivery_mode = dest_ExtINT; /* ExtInt */ + entry.vector = 0; + entry.dest = read_apic_id(); + + /* + * Add it to the IO-APIC irq-routing table: + */ + ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); + } + + disconnect_bsp_APIC(ioapic_i8259.pin != -1); +} + +#ifdef CONFIG_X86_32 +/* + * function to set the IO-APIC physical IDs based on the + * values stored in the MPC table. + * + * by Matt Domsch Tue Dec 21 12:25:05 CST 1999 + */ + +static void __init setup_ioapic_ids_from_mpc(void) +{ + union IO_APIC_reg_00 reg_00; + physid_mask_t phys_id_present_map; + int apic; + int i; + unsigned char old_id; + unsigned long flags; + + if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids()) + return; + + /* + * Don't check I/O APIC IDs for xAPIC systems. They have + * no meaning without the serial APIC bus. + */ + if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + return; + /* + * This is broken; anything with a real cpu count has to + * circumvent this idiocy regardless. + */ + phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); + + /* + * Set the IOAPIC ID to the value stored in the MPC table. + */ + for (apic = 0; apic < nr_ioapics; apic++) { + + /* Read the register 0 value */ + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + old_id = mp_ioapics[apic].mp_apicid; + + if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) { + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", + apic, mp_ioapics[apic].mp_apicid); + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", + reg_00.bits.ID); + mp_ioapics[apic].mp_apicid = reg_00.bits.ID; + } + + /* + * Sanity check, is the ID really free? Every APIC in a + * system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. + */ + if (check_apicid_used(phys_id_present_map, + mp_ioapics[apic].mp_apicid)) { + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", + apic, mp_ioapics[apic].mp_apicid); + for (i = 0; i < get_physical_broadcast(); i++) + if (!physid_isset(i, phys_id_present_map)) + break; + if (i >= get_physical_broadcast()) + panic("Max APIC ID exceeded!\n"); + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", + i); + physid_set(i, phys_id_present_map); + mp_ioapics[apic].mp_apicid = i; + } else { + physid_mask_t tmp; + tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid); + apic_printk(APIC_VERBOSE, "Setting %d in the " + "phys_id_present_map\n", + mp_ioapics[apic].mp_apicid); + physids_or(phys_id_present_map, phys_id_present_map, tmp); + } + + + /* + * We need to adjust the IRQ routing table + * if the ID changed. + */ + if (old_id != mp_ioapics[apic].mp_apicid) + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].mp_dstapic == old_id) + mp_irqs[i].mp_dstapic + = mp_ioapics[apic].mp_apicid; + + /* + * Read the right value from the MPC table and + * write it into the ID register. + */ + apic_printk(APIC_VERBOSE, KERN_INFO + "...changing IO-APIC physical APIC ID to %d ...", + mp_ioapics[apic].mp_apicid); + + reg_00.bits.ID = mp_ioapics[apic].mp_apicid; + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0, reg_00.raw); + spin_unlock_irqrestore(&ioapic_lock, flags); + + /* + * Sanity check + */ + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid) + printk("could not set ID!\n"); + else + apic_printk(APIC_VERBOSE, " ok.\n"); + } +} +#endif + +int no_timer_check __initdata; + +static int __init notimercheck(char *s) +{ + no_timer_check = 1; + return 1; +} +__setup("no_timer_check", notimercheck); + +/* + * There is a nasty bug in some older SMP boards, their mptable lies + * about the timer IRQ. We do the following to work around the situation: + * + * - timer IRQ defaults to IO-APIC IRQ + * - if this function detects that timer IRQs are defunct, then we fall + * back to ISA timer IRQs + */ +static int __init timer_irq_works(void) +{ + unsigned long t1 = jiffies; + unsigned long flags; + + if (no_timer_check) + return 1; + + local_save_flags(flags); + local_irq_enable(); + /* Let ten ticks pass... */ + mdelay((10 * 1000) / HZ); + local_irq_restore(flags); + + /* + * Expect a few ticks at least, to be sure some possible + * glue logic does not lock up after one or two first + * ticks in a non-ExtINT mode. Also the local APIC + * might have cached one ExtINT interrupt. Finally, at + * least one tick may be lost due to delays. + */ + + /* jiffies wrap? */ + if (time_after(jiffies, t1 + 4)) + return 1; + return 0; +} + +/* + * In the SMP+IOAPIC case it might happen that there are an unspecified + * number of pending IRQ events unhandled. These cases are very rare, + * so we 'resend' these IRQs via IPIs, to the same CPU. It's much + * better to do it this way as thus we do not have to be aware of + * 'pending' interrupts in the IRQ path, except at this point. + */ +/* + * Edge triggered needs to resend any interrupt + * that was delayed but this is now handled in the device + * independent code. + */ + +/* + * Starting up a edge-triggered IO-APIC interrupt is + * nasty - we need to make sure that we get the edge. + * If it is already asserted for some reason, we need + * return 1 to indicate that is was pending. + * + * This is not complete - we should be able to fake + * an edge even if it isn't on the 8259A... + */ + +static unsigned int startup_ioapic_irq(unsigned int irq) +{ + int was_pending = 0; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + if (irq < 16) { + disable_8259A_irq(irq); + if (i8259A_irq_pending(irq)) + was_pending = 1; + } + __unmask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return was_pending; +} + +#ifdef CONFIG_X86_64 +static int ioapic_retrigger_irq(unsigned int irq) +{ + + struct irq_cfg *cfg = irq_cfg(irq); + unsigned long flags; + + spin_lock_irqsave(&vector_lock, flags); + send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector); + spin_unlock_irqrestore(&vector_lock, flags); + + return 1; +} +#else +static int ioapic_retrigger_irq(unsigned int irq) +{ + send_IPI_self(irq_cfg(irq)->vector); + + return 1; +} +#endif + +/* + * Level and edge triggered IO-APIC interrupts need different handling, + * so we use two separate IRQ descriptors. Edge triggered IRQs can be + * handled with the level-triggered descriptor, but that one has slightly + * more overhead. Level-triggered interrupts cannot be handled with the + * edge-triggered handler, without risking IRQ storms and other ugly + * races. + */ + +#ifdef CONFIG_SMP + +#ifdef CONFIG_INTR_REMAP +static void ir_irq_migration(struct work_struct *work); + +static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration); + +/* + * Migrate the IO-APIC irq in the presence of intr-remapping. + * + * For edge triggered, irq migration is a simple atomic update(of vector + * and cpu destination) of IRTE and flush the hardware cache. + * + * For level triggered, we need to modify the io-apic RTE aswell with the update + * vector information, along with modifying IRTE with vector and destination. + * So irq migration for level triggered is little bit more complex compared to + * edge triggered migration. But the good news is, we use the same algorithm + * for level triggered migration as we have today, only difference being, + * we now initiate the irq migration from process context instead of the + * interrupt context. + * + * In future, when we do a directed EOI (combined with cpu EOI broadcast + * suppression) to the IO-APIC, level triggered irq migration will also be + * as simple as edge triggered migration and we can do the irq migration + * with a simple atomic update to IO-APIC RTE. + */ +static void migrate_ioapic_irq(int irq, cpumask_t mask) +{ + struct irq_cfg *cfg; + struct irq_desc *desc; + cpumask_t tmp, cleanup_mask; + struct irte irte; + int modify_ioapic_rte; + unsigned int dest; + unsigned long flags; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (get_irte(irq, &irte)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + desc = irq_to_desc(irq); + modify_ioapic_rte = desc->status & IRQ_LEVEL; + if (modify_ioapic_rte) { + spin_lock_irqsave(&ioapic_lock, flags); + __target_IO_APIC_irq(irq, dest, cfg->vector); + spin_unlock_irqrestore(&ioapic_lock, flags); + } + + irte.vector = cfg->vector; + irte.dest_id = IRTE_DEST(dest); + + /* + * Modified the IRTE and flushes the Interrupt entry cache. + */ + modify_irte(irq, &irte); + + if (cfg->move_in_progress) { + cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); + cfg->move_cleanup_count = cpus_weight(cleanup_mask); + send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + cfg->move_in_progress = 0; + } + + desc->affinity = mask; +} + +static int migrate_irq_remapped_level(int irq) +{ + int ret = -1; + struct irq_desc *desc = irq_to_desc(irq); + + mask_IO_APIC_irq(irq); + + if (io_apic_level_ack_pending(irq)) { + /* + * Interrupt in progress. Migrating irq now will change the + * vector information in the IO-APIC RTE and that will confuse + * the EOI broadcast performed by cpu. + * So, delay the irq migration to the next instance. + */ + schedule_delayed_work(&ir_migration_work, 1); + goto unmask; + } + + /* everthing is clear. we have right of way */ + migrate_ioapic_irq(irq, desc->pending_mask); + + ret = 0; + desc->status &= ~IRQ_MOVE_PENDING; + cpus_clear(desc->pending_mask); + +unmask: + unmask_IO_APIC_irq(irq); + return ret; +} + +static void ir_irq_migration(struct work_struct *work) +{ + unsigned int irq; + struct irq_desc *desc; + + for_each_irq_desc(irq, desc) { + if (desc->status & IRQ_MOVE_PENDING) { + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + if (!desc->chip->set_affinity || + !(desc->status & IRQ_MOVE_PENDING)) { + desc->status &= ~IRQ_MOVE_PENDING; + spin_unlock_irqrestore(&desc->lock, flags); + continue; + } + + desc->chip->set_affinity(irq, desc->pending_mask); + spin_unlock_irqrestore(&desc->lock, flags); + } + } +} + +/* + * Migrates the IRQ destination in the process context. + */ +static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (desc->status & IRQ_LEVEL) { + desc->status |= IRQ_MOVE_PENDING; + desc->pending_mask = mask; + migrate_irq_remapped_level(irq); + return; + } + + migrate_ioapic_irq(irq, mask); +} +#endif + +asmlinkage void smp_irq_move_cleanup_interrupt(void) +{ + unsigned vector, me; + ack_APIC_irq(); +#ifdef CONFIG_X86_64 + exit_idle(); +#endif + irq_enter(); + + me = smp_processor_id(); + for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { + unsigned int irq; + struct irq_desc *desc; + struct irq_cfg *cfg; + irq = __get_cpu_var(vector_irq)[vector]; + + desc = irq_to_desc(irq); + if (!desc) + continue; + + cfg = irq_cfg(irq); + spin_lock(&desc->lock); + if (!cfg->move_cleanup_count) + goto unlock; + + if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) + goto unlock; + + __get_cpu_var(vector_irq)[vector] = -1; + cfg->move_cleanup_count--; +unlock: + spin_unlock(&desc->lock); + } + + irq_exit(); +} + +static void irq_complete_move(unsigned int irq) +{ + struct irq_cfg *cfg = irq_cfg(irq); + unsigned vector, me; + + if (likely(!cfg->move_in_progress)) + return; + + vector = ~get_irq_regs()->orig_ax; + me = smp_processor_id(); + if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { + cpumask_t cleanup_mask; + + cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); + cfg->move_cleanup_count = cpus_weight(cleanup_mask); + send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + cfg->move_in_progress = 0; + } +} +#else +static inline void irq_complete_move(unsigned int irq) {} +#endif +#ifdef CONFIG_INTR_REMAP +static void ack_x2apic_level(unsigned int irq) +{ + ack_x2APIC_irq(); +} + +static void ack_x2apic_edge(unsigned int irq) +{ + ack_x2APIC_irq(); +} +#endif + +static void ack_apic_edge(unsigned int irq) +{ + irq_complete_move(irq); + move_native_irq(irq); + ack_APIC_irq(); +} + +atomic_t irq_mis_count; + +static void ack_apic_level(unsigned int irq) +{ +#ifdef CONFIG_X86_32 + unsigned long v; + int i; +#endif + int do_unmask_irq = 0; + + irq_complete_move(irq); +#ifdef CONFIG_GENERIC_PENDING_IRQ + /* If we are moving the irq we need to mask it */ + if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { + do_unmask_irq = 1; + mask_IO_APIC_irq(irq); + } +#endif + +#ifdef CONFIG_X86_32 + /* + * It appears there is an erratum which affects at least version 0x11 + * of I/O APIC (that's the 82093AA and cores integrated into various + * chipsets). Under certain conditions a level-triggered interrupt is + * erroneously delivered as edge-triggered one but the respective IRR + * bit gets set nevertheless. As a result the I/O unit expects an EOI + * message but it will never arrive and further interrupts are blocked + * from the source. The exact reason is so far unknown, but the + * phenomenon was observed when two consecutive interrupt requests + * from a given source get delivered to the same CPU and the source is + * temporarily disabled in between. + * + * A workaround is to simulate an EOI message manually. We achieve it + * by setting the trigger mode to edge and then to level when the edge + * trigger mode gets detected in the TMR of a local APIC for a + * level-triggered interrupt. We mask the source for the time of the + * operation to prevent an edge-triggered interrupt escaping meanwhile. + * The idea is from Manfred Spraul. --macro + */ + i = irq_cfg(irq)->vector; + + v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); +#endif + + /* + * We must acknowledge the irq before we move it or the acknowledge will + * not propagate properly. + */ + ack_APIC_irq(); + + /* Now we can move and renable the irq */ + if (unlikely(do_unmask_irq)) { + /* Only migrate the irq if the ack has been received. + * + * On rare occasions the broadcast level triggered ack gets + * delayed going to ioapics, and if we reprogram the + * vector while Remote IRR is still set the irq will never + * fire again. + * + * To prevent this scenario we read the Remote IRR bit + * of the ioapic. This has two effects. + * - On any sane system the read of the ioapic will + * flush writes (and acks) going to the ioapic from + * this cpu. + * - We get to see if the ACK has actually been delivered. + * + * Based on failed experiments of reprogramming the + * ioapic entry from outside of irq context starting + * with masking the ioapic entry and then polling until + * Remote IRR was clear before reprogramming the + * ioapic I don't trust the Remote IRR bit to be + * completey accurate. + * + * However there appears to be no other way to plug + * this race, so if the Remote IRR bit is not + * accurate and is causing problems then it is a hardware bug + * and you can go talk to the chipset vendor about it. + */ + if (!io_apic_level_ack_pending(irq)) + move_masked_irq(irq); + unmask_IO_APIC_irq(irq); + } + +#ifdef CONFIG_X86_32 + if (!(v & (1 << (i & 0x1f)))) { + atomic_inc(&irq_mis_count); + spin_lock(&ioapic_lock); + __mask_and_edge_IO_APIC_irq(irq); + __unmask_and_level_IO_APIC_irq(irq); + spin_unlock(&ioapic_lock); + } +#endif +} + +static struct irq_chip ioapic_chip __read_mostly = { + .name = "IO-APIC", + .startup = startup_ioapic_irq, + .mask = mask_IO_APIC_irq, + .unmask = unmask_IO_APIC_irq, + .ack = ack_apic_edge, + .eoi = ack_apic_level, +#ifdef CONFIG_SMP + .set_affinity = set_ioapic_affinity_irq, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +#ifdef CONFIG_INTR_REMAP +static struct irq_chip ir_ioapic_chip __read_mostly = { + .name = "IR-IO-APIC", + .startup = startup_ioapic_irq, + .mask = mask_IO_APIC_irq, + .unmask = unmask_IO_APIC_irq, + .ack = ack_x2apic_edge, + .eoi = ack_x2apic_level, +#ifdef CONFIG_SMP + .set_affinity = set_ir_ioapic_affinity_irq, +#endif + .retrigger = ioapic_retrigger_irq, +}; +#endif +#endif /* !CONFIG_XEN */ + +static inline void init_IO_APIC_traps(void) +{ + int irq; + struct irq_desc *desc; + struct irq_cfg *cfg; + + /* + * NOTE! The local APIC isn't very good at handling + * multiple interrupts at the same interrupt level. + * As the interrupt level is determined by taking the + * vector number and shifting that right by 4, we + * want to spread these out a bit so that they don't + * all fall in the same interrupt level. + * + * Also, we've got to be careful not to trash gate + * 0x80, because int 0x80 is hm, kind of importantish. ;) + */ + for_each_irq_cfg(irq, cfg) { +#ifdef CONFIG_XEN + if (irq < PIRQ_BASE || irq >= PIRQ_BASE + NR_PIRQS) + continue; +#endif + if (IO_APIC_IRQ(irq) && !cfg->vector) { + /* + * Hmm.. We don't have an entry for this, + * so default to an old-fashioned 8259 + * interrupt if we can.. + */ + if (irq < 16) + make_8259A_irq(irq); + else { + desc = irq_to_desc(irq); + /* Strange. Oh, well.. */ + desc->chip = &no_irq_chip; + } + } + } +} + +#ifndef CONFIG_XEN +/* + * The local APIC irq-chip implementation: + */ + +static void mask_lapic_irq(unsigned int irq) +{ + unsigned long v; + + v = apic_read(APIC_LVT0); + apic_write(APIC_LVT0, v | APIC_LVT_MASKED); +} + +static void unmask_lapic_irq(unsigned int irq) +{ + unsigned long v; + + v = apic_read(APIC_LVT0); + apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); +} + +static void ack_lapic_irq (unsigned int irq) +{ + ack_APIC_irq(); +} + +static struct irq_chip lapic_chip __read_mostly = { + .name = "local-APIC", + .mask = mask_lapic_irq, + .unmask = unmask_lapic_irq, + .ack = ack_lapic_irq, +}; + +static void lapic_register_intr(int irq) +{ + struct irq_desc *desc; + + desc = irq_to_desc(irq); + desc->status &= ~IRQ_LEVEL; + set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, + "edge"); +} + +static void __init setup_nmi(void) +{ + /* + * Dirty trick to enable the NMI watchdog ... + * We put the 8259A master into AEOI mode and + * unmask on all local APICs LVT0 as NMI. + * + * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') + * is from Maciej W. Rozycki - so we do not have to EOI from + * the NMI handler or the timer interrupt. + */ + apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); + + enable_NMI_through_LVT0(); + + apic_printk(APIC_VERBOSE, " done.\n"); +} + +/* + * This looks a bit hackish but it's about the only one way of sending + * a few INTA cycles to 8259As and any associated glue logic. ICR does + * not support the ExtINT mode, unfortunately. We need to send these + * cycles as some i82489DX-based boards have glue logic that keeps the + * 8259A interrupt line asserted until INTA. --macro + */ +static inline void __init unlock_ExtINT_logic(void) +{ + int apic, pin, i; + struct IO_APIC_route_entry entry0, entry1; + unsigned char save_control, save_freq_select; + + pin = find_isa_irq_pin(8, mp_INT); + if (pin == -1) { + WARN_ON_ONCE(1); + return; + } + apic = find_isa_irq_apic(8, mp_INT); + if (apic == -1) { + WARN_ON_ONCE(1); + return; + } + + entry0 = ioapic_read_entry(apic, pin); + clear_IO_APIC_pin(apic, pin); + + memset(&entry1, 0, sizeof(entry1)); + + entry1.dest_mode = 0; /* physical delivery */ + entry1.mask = 0; /* unmask IRQ now */ + entry1.dest = hard_smp_processor_id(); + entry1.delivery_mode = dest_ExtINT; + entry1.polarity = entry0.polarity; + entry1.trigger = 0; + entry1.vector = 0; + + ioapic_write_entry(apic, pin, entry1); + + save_control = CMOS_READ(RTC_CONTROL); + save_freq_select = CMOS_READ(RTC_FREQ_SELECT); + CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, + RTC_FREQ_SELECT); + CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); + + i = 100; + while (i-- > 0) { + mdelay(10); + if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) + i -= 10; + } + + CMOS_WRITE(save_control, RTC_CONTROL); + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); + clear_IO_APIC_pin(apic, pin); + + ioapic_write_entry(apic, pin, entry0); +} + +static int disable_timer_pin_1 __initdata; +/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ +static int __init disable_timer_pin_setup(char *arg) +{ + disable_timer_pin_1 = 1; + return 0; +} +early_param("disable_timer_pin_1", disable_timer_pin_setup); + +int timer_through_8259 __initdata; + +/* + * This code may look a bit paranoid, but it's supposed to cooperate with + * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ + * is so screwy. Thanks to Brian Perkins for testing/hacking this beast + * fanatically on his truly buggy board. + * + * FIXME: really need to revamp this for all platforms. + */ +static inline void __init check_timer(void) +{ + struct irq_cfg *cfg = irq_cfg(0); + int apic1, pin1, apic2, pin2; + unsigned long flags; + unsigned int ver; + int no_pin1 = 0; + + local_irq_save(flags); + + ver = apic_read(APIC_LVR); + ver = GET_APIC_VERSION(ver); + + /* + * get/set the timer IRQ vector: + */ + disable_8259A_irq(0); + assign_irq_vector(0, TARGET_CPUS); + + /* + * As IRQ0 is to be enabled in the 8259A, the virtual + * wire has to be disabled in the local APIC. Also + * timer interrupts need to be acknowledged manually in + * the 8259A for the i82489DX when using the NMI + * watchdog as that APIC treats NMIs as level-triggered. + * The AEOI mode will finish them in the 8259A + * automatically. + */ + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); + init_8259A(1); +#ifdef CONFIG_X86_32 + timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); +#endif + + pin1 = find_isa_irq_pin(0, mp_INT); + apic1 = find_isa_irq_apic(0, mp_INT); + pin2 = ioapic_i8259.pin; + apic2 = ioapic_i8259.apic; + + apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " + "apic1=%d pin1=%d apic2=%d pin2=%d\n", + cfg->vector, apic1, pin1, apic2, pin2); + + /* + * Some BIOS writers are clueless and report the ExtINTA + * I/O APIC input from the cascaded 8259A as the timer + * interrupt input. So just in case, if only one pin + * was found above, try it both directly and through the + * 8259A. + */ + if (pin1 == -1) { +#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) + panic("BIOS bug: timer not connected to IO-APIC"); +#endif + pin1 = pin2; + apic1 = apic2; + no_pin1 = 1; + } else if (pin2 == -1) { + pin2 = pin1; + apic2 = apic1; + } + + if (pin1 != -1) { + /* + * Ok, does IRQ0 through the IOAPIC work? + */ + if (no_pin1) { + add_pin_to_irq(0, apic1, pin1); + setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); + } + unmask_IO_APIC_irq(0); + if (timer_irq_works()) { + if (nmi_watchdog == NMI_IO_APIC) { + setup_nmi(); + enable_8259A_irq(0); + } + if (disable_timer_pin_1 > 0) + clear_IO_APIC_pin(0, pin1); + goto out; + } +#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) + panic("timer doesn't work through Interrupt-remapped IO-APIC"); +#endif + clear_IO_APIC_pin(apic1, pin1); + if (!no_pin1) + apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " + "8254 timer not connected to IO-APIC\n"); + + apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer " + "(IRQ0) through the 8259A ...\n"); + apic_printk(APIC_QUIET, KERN_INFO + "..... (found apic %d pin %d) ...\n", apic2, pin2); + /* + * legacy devices should be connected to IO APIC #0 + */ + replace_pin_at_irq(0, apic1, pin1, apic2, pin2); + setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); + unmask_IO_APIC_irq(0); + enable_8259A_irq(0); + if (timer_irq_works()) { + apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); + timer_through_8259 = 1; + if (nmi_watchdog == NMI_IO_APIC) { + disable_8259A_irq(0); + setup_nmi(); + enable_8259A_irq(0); + } + goto out; + } + /* + * Cleanup, just in case ... + */ + disable_8259A_irq(0); + clear_IO_APIC_pin(apic2, pin2); + apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); + } + + if (nmi_watchdog == NMI_IO_APIC) { + apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work " + "through the IO-APIC - disabling NMI Watchdog!\n"); + nmi_watchdog = NMI_NONE; + } +#ifdef CONFIG_X86_32 + timer_ack = 0; +#endif + + apic_printk(APIC_QUIET, KERN_INFO + "...trying to set up timer as Virtual Wire IRQ...\n"); + + lapic_register_intr(0); + apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ + enable_8259A_irq(0); + + if (timer_irq_works()) { + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); + goto out; + } + disable_8259A_irq(0); + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); + apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); + + apic_printk(APIC_QUIET, KERN_INFO + "...trying to set up timer as ExtINT IRQ...\n"); + + init_8259A(0); + make_8259A_irq(0); + apic_write(APIC_LVT0, APIC_DM_EXTINT); + + unlock_ExtINT_logic(); + + if (timer_irq_works()) { + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); + goto out; + } + apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); + panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " + "report. Then try booting with the 'noapic' option.\n"); +out: + local_irq_restore(flags); +} +#else +#define check_timer() ((void)0) +#endif + +/* + * Traditionally ISA IRQ2 is the cascade IRQ, and is not available + * to devices. However there may be an I/O APIC pin available for + * this interrupt regardless. The pin may be left unconnected, but + * typically it will be reused as an ExtINT cascade interrupt for + * the master 8259A. In the MPS case such a pin will normally be + * reported as an ExtINT interrupt in the MP table. With ACPI + * there is no provision for ExtINT interrupts, and in the absence + * of an override it would be treated as an ordinary ISA I/O APIC + * interrupt, that is edge-triggered and unmasked by default. We + * used to do this, but it caused problems on some systems because + * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using + * the same ExtINT cascade interrupt to drive the local APIC of the + * bootstrap processor. Therefore we refrain from routing IRQ2 to + * the I/O APIC in all cases now. No actual device should request + * it anyway. --macro + */ +#define PIC_IRQS (1 << PIC_CASCADE_IR) + +void __init setup_IO_APIC(void) +{ + +#if defined(CONFIG_X86_32) || defined(CONFIG_XEN) + enable_IO_APIC(); +#else + /* + * calling enable_IO_APIC() is moved to setup_local_APIC for BP + */ +#endif + + io_apic_irqs = ~PIC_IRQS; + + apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); + /* + * Set up IO-APIC IRQ routing. + */ +#ifndef CONFIG_XEN +#ifdef CONFIG_X86_32 + if (!acpi_ioapic) + setup_ioapic_ids_from_mpc(); +#endif + sync_Arb_IDs(); +#endif + setup_IO_APIC_irqs(); + init_IO_APIC_traps(); + check_timer(); +} + +/* + * Called after all the initialization is done. If we didnt find any + * APIC bugs then we can allow the modify fast path + */ + +static int __init io_apic_bug_finalize(void) +{ + if (sis_apic_bug == -1) + sis_apic_bug = 0; +#ifdef CONFIG_X86_XEN + if (is_initial_xendomain()) { + struct xen_platform_op op = { .cmd = XENPF_platform_quirk }; + op.u.platform_quirk.quirk_id = sis_apic_bug ? + QUIRK_IOAPIC_BAD_REGSEL : QUIRK_IOAPIC_GOOD_REGSEL; + VOID(HYPERVISOR_platform_op(&op)); + } +#endif + return 0; +} + +late_initcall(io_apic_bug_finalize); + +#ifndef CONFIG_XEN +struct sysfs_ioapic_data { + struct sys_device dev; + struct IO_APIC_route_entry entry[0]; +}; +static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; + +static int ioapic_suspend(struct sys_device *dev, pm_message_t state) +{ + struct IO_APIC_route_entry *entry; + struct sysfs_ioapic_data *data; + int i; + + data = container_of(dev, struct sysfs_ioapic_data, dev); + entry = data->entry; + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) + *entry = ioapic_read_entry(dev->id, i); + + return 0; +} + +static int ioapic_resume(struct sys_device *dev) +{ + struct IO_APIC_route_entry *entry; + struct sysfs_ioapic_data *data; + unsigned long flags; + union IO_APIC_reg_00 reg_00; + int i; + + data = container_of(dev, struct sysfs_ioapic_data, dev); + entry = data->entry; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(dev->id, 0); + if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) { + reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid; + io_apic_write(dev->id, 0, reg_00.raw); + } + spin_unlock_irqrestore(&ioapic_lock, flags); + for (i = 0; i < nr_ioapic_registers[dev->id]; i++) + ioapic_write_entry(dev->id, i, entry[i]); + + return 0; +} + +static struct sysdev_class ioapic_sysdev_class = { + .name = "ioapic", + .suspend = ioapic_suspend, + .resume = ioapic_resume, +}; + +static int __init ioapic_init_sysfs(void) +{ + struct sys_device * dev; + int i, size, error; + + error = sysdev_class_register(&ioapic_sysdev_class); + if (error) + return error; + + for (i = 0; i < nr_ioapics; i++ ) { + size = sizeof(struct sys_device) + nr_ioapic_registers[i] + * sizeof(struct IO_APIC_route_entry); + mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL); + if (!mp_ioapic_data[i]) { + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); + continue; + } + dev = &mp_ioapic_data[i]->dev; + dev->id = i; + dev->cls = &ioapic_sysdev_class; + error = sysdev_register(dev); + if (error) { + kfree(mp_ioapic_data[i]); + mp_ioapic_data[i] = NULL; + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); + continue; + } + } + + return 0; +} + +device_initcall(ioapic_init_sysfs); + +/* + * Dynamic irq allocate and deallocation + */ +unsigned int create_irq_nr(unsigned int irq_want) +{ + /* Allocate an unused irq */ + unsigned int irq; + unsigned int new; + unsigned long flags; + struct irq_cfg *cfg_new; + + irq_want = nr_irqs - 1; + + irq = 0; + spin_lock_irqsave(&vector_lock, flags); + for (new = irq_want; new > 0; new--) { + if (platform_legacy_irq(new)) + continue; + cfg_new = irq_cfg(new); + if (cfg_new && cfg_new->vector != 0) + continue; + /* check if need to create one */ + if (!cfg_new) + cfg_new = irq_cfg_alloc(new); + if (__assign_irq_vector(new, TARGET_CPUS) == 0) + irq = new; + break; + } + spin_unlock_irqrestore(&vector_lock, flags); + + if (irq > 0) { + dynamic_irq_init(irq); + } + return irq; +} + +int create_irq(void) +{ + int irq; + + irq = create_irq_nr(nr_irqs - 1); + + if (irq == 0) + irq = -1; + + return irq; +} + +void destroy_irq(unsigned int irq) +{ + unsigned long flags; + + dynamic_irq_cleanup(irq); + +#ifdef CONFIG_INTR_REMAP + free_irte(irq); +#endif + spin_lock_irqsave(&vector_lock, flags); + __clear_irq_vector(irq); + spin_unlock_irqrestore(&vector_lock, flags); +} +#endif /* !CONFIG_XEN */ + +/* + * MSI message composition + */ +#if defined(CONFIG_PCI_MSI) && !defined(CONFIG_XEN) +static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) +{ + struct irq_cfg *cfg; + int err; + unsigned dest; + cpumask_t tmp; + + tmp = TARGET_CPUS; + err = assign_irq_vector(irq, tmp); + if (err) + return err; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, tmp); + dest = cpu_mask_to_apicid(tmp); + +#ifdef CONFIG_INTR_REMAP + if (irq_remapped(irq)) { + struct irte irte; + int ir_index; + u16 sub_handle; + + ir_index = map_irq_to_irte_handle(irq, &sub_handle); + BUG_ON(ir_index == -1); + + memset (&irte, 0, sizeof(irte)); + + irte.present = 1; + irte.dst_mode = INT_DEST_MODE; + irte.trigger_mode = 0; /* edge */ + irte.dlvry_mode = INT_DELIVERY_MODE; + irte.vector = cfg->vector; + irte.dest_id = IRTE_DEST(dest); + + modify_irte(irq, &irte); + + msg->address_hi = MSI_ADDR_BASE_HI; + msg->data = sub_handle; + msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT | + MSI_ADDR_IR_SHV | + MSI_ADDR_IR_INDEX1(ir_index) | + MSI_ADDR_IR_INDEX2(ir_index); + } else +#endif + { + msg->address_hi = MSI_ADDR_BASE_HI; + msg->address_lo = + MSI_ADDR_BASE_LO | + ((INT_DEST_MODE == 0) ? + MSI_ADDR_DEST_MODE_PHYSICAL: + MSI_ADDR_DEST_MODE_LOGICAL) | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + MSI_ADDR_REDIRECTION_CPU: + MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_DEST_ID(dest); + + msg->data = + MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + MSI_DATA_DELIVERY_FIXED: + MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_VECTOR(cfg->vector); + } + return err; +} + +#ifdef CONFIG_SMP +static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) +{ + struct irq_cfg *cfg; + struct msi_msg msg; + unsigned int dest; + cpumask_t tmp; + struct irq_desc *desc; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + read_msi_msg(irq, &msg); + + msg.data &= ~MSI_DATA_VECTOR_MASK; + msg.data |= MSI_DATA_VECTOR(cfg->vector); + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg.address_lo |= MSI_ADDR_DEST_ID(dest); + + write_msi_msg(irq, &msg); + desc = irq_to_desc(irq); + desc->affinity = mask; +} + +#ifdef CONFIG_INTR_REMAP +/* + * Migrate the MSI irq to another cpumask. This migration is + * done in the process context using interrupt-remapping hardware. + */ +static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) +{ + struct irq_cfg *cfg; + unsigned int dest; + cpumask_t tmp, cleanup_mask; + struct irte irte; + struct irq_desc *desc; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (get_irte(irq, &irte)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + irte.vector = cfg->vector; + irte.dest_id = IRTE_DEST(dest); + + /* + * atomically update the IRTE with the new destination and vector. + */ + modify_irte(irq, &irte); + + /* + * After this point, all the interrupts will start arriving + * at the new destination. So, time to cleanup the previous + * vector allocation. + */ + if (cfg->move_in_progress) { + cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); + cfg->move_cleanup_count = cpus_weight(cleanup_mask); + send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + cfg->move_in_progress = 0; + } + + desc = irq_to_desc(irq); + desc->affinity = mask; +} +#endif +#endif /* CONFIG_SMP */ + +/* + * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, + * which implement the MSI or MSI-X Capability Structure. + */ +static struct irq_chip msi_chip = { + .name = "PCI-MSI", + .unmask = unmask_msi_irq, + .mask = mask_msi_irq, + .ack = ack_apic_edge, +#ifdef CONFIG_SMP + .set_affinity = set_msi_irq_affinity, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +#ifdef CONFIG_INTR_REMAP +static struct irq_chip msi_ir_chip = { + .name = "IR-PCI-MSI", + .unmask = unmask_msi_irq, + .mask = mask_msi_irq, + .ack = ack_x2apic_edge, +#ifdef CONFIG_SMP + .set_affinity = ir_set_msi_irq_affinity, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +/* + * Map the PCI dev to the corresponding remapping hardware unit + * and allocate 'nvec' consecutive interrupt-remapping table entries + * in it. + */ +static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) +{ + struct intel_iommu *iommu; + int index; + + iommu = map_dev_to_ir(dev); + if (!iommu) { + printk(KERN_ERR + "Unable to map PCI %s to iommu\n", pci_name(dev)); + return -ENOENT; + } + + index = alloc_irte(iommu, irq, nvec); + if (index < 0) { + printk(KERN_ERR + "Unable to allocate %d IRTE for PCI %s\n", nvec, + pci_name(dev)); + return -ENOSPC; + } + return index; +} +#endif + +static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) +{ + int ret; + struct msi_msg msg; + + ret = msi_compose_msg(dev, irq, &msg); + if (ret < 0) + return ret; + + set_irq_msi(irq, desc); + write_msi_msg(irq, &msg); + +#ifdef CONFIG_INTR_REMAP + if (irq_remapped(irq)) { + struct irq_desc *desc = irq_to_desc(irq); + /* + * irq migration in process context + */ + desc->status |= IRQ_MOVE_PCNTXT; + set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); + } else +#endif + set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); + + dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq); + + return 0; +} + +static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) +{ + unsigned int irq; + + irq = dev->bus->number; + irq <<= 8; + irq |= dev->devfn; + irq <<= 12; + + return irq; +} + +int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) +{ + unsigned int irq; + int ret; + unsigned int irq_want; + + irq_want = build_irq_for_pci_dev(dev) + 0x100; + + irq = create_irq_nr(irq_want); + if (irq == 0) + return -1; + +#ifdef CONFIG_INTR_REMAP + if (!intr_remapping_enabled) + goto no_ir; + + ret = msi_alloc_irte(dev, irq, 1); + if (ret < 0) + goto error; +no_ir: +#endif + ret = setup_msi_irq(dev, desc, irq); + if (ret < 0) { + destroy_irq(irq); + return ret; + } + return 0; + +#ifdef CONFIG_INTR_REMAP +error: + destroy_irq(irq); + return ret; +#endif +} + +int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +{ + unsigned int irq; + int ret, sub_handle; + struct msi_desc *desc; + unsigned int irq_want; + +#ifdef CONFIG_INTR_REMAP + struct intel_iommu *iommu = 0; + int index = 0; +#endif + + irq_want = build_irq_for_pci_dev(dev) + 0x100; + sub_handle = 0; + list_for_each_entry(desc, &dev->msi_list, list) { + irq = create_irq_nr(irq_want--); + if (irq == 0) + return -1; +#ifdef CONFIG_INTR_REMAP + if (!intr_remapping_enabled) + goto no_ir; + + if (!sub_handle) { + /* + * allocate the consecutive block of IRTE's + * for 'nvec' + */ + index = msi_alloc_irte(dev, irq, nvec); + if (index < 0) { + ret = index; + goto error; + } + } else { + iommu = map_dev_to_ir(dev); + if (!iommu) { + ret = -ENOENT; + goto error; + } + /* + * setup the mapping between the irq and the IRTE + * base index, the sub_handle pointing to the + * appropriate interrupt remap table entry. + */ + set_irte_irq(irq, iommu, index, sub_handle); + } +no_ir: +#endif + ret = setup_msi_irq(dev, desc, irq); + if (ret < 0) + goto error; + sub_handle++; + } + return 0; + +error: + destroy_irq(irq); + return ret; +} + +void arch_teardown_msi_irq(unsigned int irq) +{ + destroy_irq(irq); +} + +#ifdef CONFIG_DMAR +#ifdef CONFIG_SMP +static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) +{ + struct irq_cfg *cfg; + struct msi_msg msg; + unsigned int dest; + cpumask_t tmp; + struct irq_desc *desc; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + dmar_msi_read(irq, &msg); + + msg.data &= ~MSI_DATA_VECTOR_MASK; + msg.data |= MSI_DATA_VECTOR(cfg->vector); + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg.address_lo |= MSI_ADDR_DEST_ID(dest); + + dmar_msi_write(irq, &msg); + desc = irq_to_desc(irq); + desc->affinity = mask; +} +#endif /* CONFIG_SMP */ + +struct irq_chip dmar_msi_type = { + .name = "DMAR_MSI", + .unmask = dmar_msi_unmask, + .mask = dmar_msi_mask, + .ack = ack_apic_edge, +#ifdef CONFIG_SMP + .set_affinity = dmar_msi_set_affinity, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +int arch_setup_dmar_msi(unsigned int irq) +{ + int ret; + struct msi_msg msg; + + ret = msi_compose_msg(NULL, irq, &msg); + if (ret < 0) + return ret; + dmar_msi_write(irq, &msg); + set_irq_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, + "edge"); + return 0; +} +#endif + +#ifdef CONFIG_HPET_TIMER + +#ifdef CONFIG_SMP +static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask) +{ + struct irq_cfg *cfg; + struct irq_desc *desc; + struct msi_msg msg; + unsigned int dest; + cpumask_t tmp; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + hpet_msi_read(irq, &msg); + + msg.data &= ~MSI_DATA_VECTOR_MASK; + msg.data |= MSI_DATA_VECTOR(cfg->vector); + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg.address_lo |= MSI_ADDR_DEST_ID(dest); + + hpet_msi_write(irq, &msg); + desc = irq_to_desc(irq); + desc->affinity = mask; +} +#endif /* CONFIG_SMP */ + +struct irq_chip hpet_msi_type = { + .name = "HPET_MSI", + .unmask = hpet_msi_unmask, + .mask = hpet_msi_mask, + .ack = ack_apic_edge, +#ifdef CONFIG_SMP + .set_affinity = hpet_msi_set_affinity, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +int arch_setup_hpet_msi(unsigned int irq) +{ + int ret; + struct msi_msg msg; + + ret = msi_compose_msg(NULL, irq, &msg); + if (ret < 0) + return ret; + + hpet_msi_write(irq, &msg); + set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq, + "edge"); + + return 0; +} +#endif + +#endif /* CONFIG_PCI_MSI */ +/* + * Hypertransport interrupt support + */ +#ifdef CONFIG_HT_IRQ + +#ifdef CONFIG_SMP + +static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) +{ + struct ht_irq_msg msg; + fetch_ht_irq_msg(irq, &msg); + + msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK); + msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); + + msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest); + msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); + + write_ht_irq_msg(irq, &msg); +} + +static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) +{ + struct irq_cfg *cfg; + unsigned int dest; + cpumask_t tmp; + struct irq_desc *desc; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + target_ht_irq(irq, dest, cfg->vector); + desc = irq_to_desc(irq); + desc->affinity = mask; +} +#endif + +static struct irq_chip ht_irq_chip = { + .name = "PCI-HT", + .mask = mask_ht_irq, + .unmask = unmask_ht_irq, + .ack = ack_apic_edge, +#ifdef CONFIG_SMP + .set_affinity = set_ht_irq_affinity, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) +{ + struct irq_cfg *cfg; + int err; + cpumask_t tmp; + + tmp = TARGET_CPUS; + err = assign_irq_vector(irq, tmp); + if (!err) { + struct ht_irq_msg msg; + unsigned dest; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, tmp); + dest = cpu_mask_to_apicid(tmp); + + msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); + + msg.address_lo = + HT_IRQ_LOW_BASE | + HT_IRQ_LOW_DEST_ID(dest) | + HT_IRQ_LOW_VECTOR(cfg->vector) | + ((INT_DEST_MODE == 0) ? + HT_IRQ_LOW_DM_PHYSICAL : + HT_IRQ_LOW_DM_LOGICAL) | + HT_IRQ_LOW_RQEOI_EDGE | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + HT_IRQ_LOW_MT_FIXED : + HT_IRQ_LOW_MT_ARBITRATED) | + HT_IRQ_LOW_IRQ_MASKED; + + write_ht_irq_msg(irq, &msg); + + set_irq_chip_and_handler_name(irq, &ht_irq_chip, + handle_edge_irq, "edge"); + + dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); + } + return err; +} +#endif /* CONFIG_HT_IRQ */ + +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) +/* + * Re-target the irq to the specified CPU and enable the specified MMR located + * on the specified blade to allow the sending of MSIs to the specified CPU. + */ +int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, + unsigned long mmr_offset) +{ + const cpumask_t *eligible_cpu = get_cpu_mask(cpu); + struct irq_cfg *cfg; + int mmr_pnode; + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + unsigned long flags; + int err; + + err = assign_irq_vector(irq, *eligible_cpu); + if (err != 0) + return err; + + spin_lock_irqsave(&vector_lock, flags); + set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, + irq_name); + spin_unlock_irqrestore(&vector_lock, flags); + + cfg = irq_cfg(irq); + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); + + entry->vector = cfg->vector; + entry->delivery_mode = INT_DELIVERY_MODE; + entry->dest_mode = INT_DEST_MODE; + entry->polarity = 0; + entry->trigger = 0; + entry->mask = 0; + entry->dest = cpu_mask_to_apicid(*eligible_cpu); + + mmr_pnode = uv_blade_to_pnode(mmr_blade); + uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); + + return irq; +} + +/* + * Disable the specified MMR located on the specified blade so that MSIs are + * longer allowed to be sent. + */ +void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset) +{ + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + int mmr_pnode; + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); + + entry->mask = 1; + + mmr_pnode = uv_blade_to_pnode(mmr_blade); + uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); +} +#endif /* CONFIG_X86_64 */ + +int __init io_apic_get_redir_entries (int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return reg_01.bits.entries; +} + +int __init probe_nr_irqs(void) +{ + return NR_IRQS; +} + +/* -------------------------------------------------------------------------- + ACPI-based IOAPIC Configuration + -------------------------------------------------------------------------- */ + +#ifdef CONFIG_ACPI + +#ifdef CONFIG_X86_32 +#ifndef CONFIG_XEN +int __init io_apic_get_unique_id(int ioapic, int apic_id) +{ + union IO_APIC_reg_00 reg_00; + static physid_mask_t apic_id_map = PHYSID_MASK_NONE; + physid_mask_t tmp; + unsigned long flags; + int i = 0; + + /* + * The P4 platform supports up to 256 APIC IDs on two separate APIC + * buses (one for LAPICs, one for IOAPICs), where predecessors only + * supports up to 16 on one shared APIC bus. + * + * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full + * advantage of new APIC bus architecture. + */ + + if (physids_empty(apic_id_map)) + apic_id_map = ioapic_phys_id_map(phys_cpu_present_map); + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + if (apic_id >= get_physical_broadcast()) { + printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " + "%d\n", ioapic, apic_id, reg_00.bits.ID); + apic_id = reg_00.bits.ID; + } + + /* + * Every APIC in a system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. + */ + if (check_apicid_used(apic_id_map, apic_id)) { + + for (i = 0; i < get_physical_broadcast(); i++) { + if (!check_apicid_used(apic_id_map, i)) + break; + } + + if (i == get_physical_broadcast()) + panic("Max apic_id exceeded!\n"); + + printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " + "trying %d\n", ioapic, apic_id, i); + + apic_id = i; + } + + tmp = apicid_to_cpu_present(apic_id); + physids_or(apic_id_map, apic_id_map, tmp); + + if (reg_00.bits.ID != apic_id) { + reg_00.bits.ID = apic_id; + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(ioapic, 0, reg_00.raw); + reg_00.raw = io_apic_read(ioapic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + /* Sanity check */ + if (reg_00.bits.ID != apic_id) { + printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); + return -1; + } + } + + apic_printk(APIC_VERBOSE, KERN_INFO + "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); + + return apic_id; +} +#endif /* !CONFIG_XEN */ + +int __init io_apic_get_version(int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return reg_01.bits.version; +} +#endif + +int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) +{ +#ifdef CONFIG_XEN + if (irq < PIRQ_BASE || irq >= PIRQ_BASE + NR_PIRQS) { + apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ %d\n", + ioapic, irq); + return -EINVAL; + } +#endif + + if (!IO_APIC_IRQ(irq)) { + apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", + ioapic); + return -EINVAL; + } + + /* + * IRQs < 16 are already in the irq_2_pin[] map + */ + if (irq >= 16) + add_pin_to_irq(irq, ioapic, pin); + + setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity); + + return 0; +} + + +int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) +{ + int i; + + if (skip_ioapic_setup) + return -1; + + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].mp_irqtype == mp_INT && + mp_irqs[i].mp_srcbusirq == bus_irq) + break; + if (i >= mp_irq_entries) + return -1; + + *trigger = irq_trigger(i); + *polarity = irq_polarity(i); + return 0; +} + +#endif /* CONFIG_ACPI */ + +#ifndef CONFIG_XEN +/* + * This function currently is only a helper for the i386 smp boot process where + * we need to reprogram the ioredtbls to cater for the cpus which have come online + * so mask in all cases should simply be TARGET_CPUS + */ +#ifdef CONFIG_SMP +void __init setup_ioapic_dest(void) +{ + int pin, ioapic, irq, irq_entry; + struct irq_desc *desc; + struct irq_cfg *cfg; + cpumask_t mask; + + if (skip_ioapic_setup == 1) + return; + + for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { + for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { + irq_entry = find_irq_entry(ioapic, pin, mp_INT); + if (irq_entry == -1) + continue; + irq = pin_2_irq(irq_entry, ioapic, pin); + + /* setup_IO_APIC_irqs could fail to get vector for some device + * when you have too many devices, because at that time only boot + * cpu is online. + */ + cfg = irq_cfg(irq); + if (!cfg->vector) { + setup_IO_APIC_irq(ioapic, pin, irq, + irq_trigger(irq_entry), + irq_polarity(irq_entry)); + continue; + + } + + /* + * Honour affinities which have been set in early boot + */ + desc = irq_to_desc(irq); + if (desc->status & + (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) + mask = desc->affinity; + else + mask = TARGET_CPUS; + +#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) + set_ir_ioapic_affinity_irq(irq, mask); + else +#endif + set_ioapic_affinity_irq(irq, mask); + } + + } +} +#endif + +#define IOAPIC_RESOURCE_NAME_SIZE 11 + +static struct resource *ioapic_resources; + +static struct resource * __init ioapic_setup_resources(void) +{ + unsigned long n; + struct resource *res; + char *mem; + int i; + + if (nr_ioapics <= 0) + return NULL; + + n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); + n *= nr_ioapics; + + mem = alloc_bootmem(n); + res = (void *)mem; + + if (mem != NULL) { + mem += sizeof(struct resource) * nr_ioapics; + + for (i = 0; i < nr_ioapics; i++) { + res[i].name = mem; + res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; + sprintf(mem, "IOAPIC %u", i); + mem += IOAPIC_RESOURCE_NAME_SIZE; + } + } + + ioapic_resources = res; + + return res; +} + +void __init ioapic_init_mappings(void) +{ + unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; + struct resource *ioapic_res; + int i; + + irq_2_pin_init(); + ioapic_res = ioapic_setup_resources(); + for (i = 0; i < nr_ioapics; i++) { + if (smp_found_config) { + ioapic_phys = mp_ioapics[i].mp_apicaddr; +#ifdef CONFIG_X86_32 + if (!ioapic_phys) { + printk(KERN_ERR + "WARNING: bogus zero IO-APIC " + "address found in MPTABLE, " + "disabling IO/APIC support!\n"); + smp_found_config = 0; + skip_ioapic_setup = 1; + goto fake_ioapic_page; + } +#endif + } else { +#ifdef CONFIG_X86_32 +fake_ioapic_page: +#endif + ioapic_phys = (unsigned long) + alloc_bootmem_pages(PAGE_SIZE); + ioapic_phys = __pa(ioapic_phys); + } + set_fixmap_nocache(idx, ioapic_phys); + apic_printk(APIC_VERBOSE, + "mapped IOAPIC to %08lx (%08lx)\n", + __fix_to_virt(idx), ioapic_phys); + idx++; + + if (ioapic_res != NULL) { + ioapic_res->start = ioapic_phys; + ioapic_res->end = ioapic_phys + (4 * 1024) - 1; + ioapic_res++; + } + } +} + +static int __init ioapic_insert_resources(void) +{ + int i; + struct resource *r = ioapic_resources; + + if (!r) { + printk(KERN_ERR + "IO APIC resources could be not be allocated.\n"); + return -1; + } + + for (i = 0; i < nr_ioapics; i++) { + insert_resource(&iomem_resource, r); + r++; + } + + return 0; +} + +/* Insert the IO APIC resources after PCI initialization has occured to handle + * IO APICS that are mapped in on a BAR in PCI space. */ +late_initcall(ioapic_insert_resources); +#endif /* !CONFIG_XEN */ --- head-2011-03-17.orig/arch/x86/kernel/io_apic_32-xen.c 2011-02-01 14:38:38.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,2985 +0,0 @@ -/* - * Intel IO-APIC support for multi-Pentium hosts. - * - * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo - * - * Many thanks to Stig Venaas for trying out countless experimental - * patches and reporting/debugging problems patiently! - * - * (c) 1999, Multiple IO-APIC support, developed by - * Ken-ichi Yaku and - * Hidemi Kishimoto , - * further tested and cleaned up by Zach Brown - * and Ingo Molnar - * - * Fixes - * Maciej W. Rozycki : Bits for genuine 82489DX APICs; - * thanks to Eric Gilmore - * and Rolf G. Tews - * for testing these extensively - * Paul Diefenbaugh : Added full ACPI support - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include /* time_after() */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#ifdef CONFIG_XEN -#include -#include -#include - -/* Fake i8259 */ -#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq))) -#define disable_8259A_irq(_irq) ((void)0) -#define i8259A_irq_pending(_irq) (0) - -unsigned long io_apic_irqs; - -#define clear_IO_APIC() ((void)0) -#endif /* CONFIG_XEN */ - -int (*ioapic_renumber_irq)(int ioapic, int irq); -atomic_t irq_mis_count; - -#ifndef CONFIG_XEN -/* Where if anywhere is the i8259 connect in external int mode */ -static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; -#endif - -static DEFINE_SPINLOCK(ioapic_lock); -static DEFINE_SPINLOCK(vector_lock); - -int timer_through_8259 __initdata; - -/* - * Is the SiS APIC rmw bug present ? - * -1 = don't know, 0 = no, 1 = yes - */ -int sis_apic_bug = -1; - -/* - * # of IRQ routing registers - */ -int nr_ioapic_registers[MAX_IO_APICS]; - -/* I/O APIC entries */ -struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; -int nr_ioapics; - -/* MP IRQ source entries */ -struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; - -/* # of MP IRQ source entries */ -int mp_irq_entries; - -#if defined (CONFIG_MCA) || defined (CONFIG_EISA) -int mp_bus_id_to_type[MAX_MP_BUSSES]; -#endif - -DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); - -static int disable_timer_pin_1 __initdata; - -/* - * Rough estimation of how many shared IRQs there are, can - * be changed anytime. - */ -#define MAX_PLUS_SHARED_IRQS NR_IRQS -#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) - -/* - * This is performance-critical, we want to do it O(1) - * - * the indexing order of this array favors 1:1 mappings - * between pins and IRQs. - */ - -static struct irq_pin_list { - int apic, pin, next; -} irq_2_pin[PIN_MAP_SIZE]; - -#ifndef CONFIG_XEN -struct io_apic { - unsigned int index; - unsigned int unused[3]; - unsigned int data; -}; - -static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) -{ - return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) - + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK); -} -#endif - -static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) -{ -#ifndef CONFIG_XEN - struct io_apic __iomem *io_apic = io_apic_base(apic); - writel(reg, &io_apic->index); - return readl(&io_apic->data); -#else - struct physdev_apic apic_op; - int ret; - - apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr; - apic_op.reg = reg; - ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op); - if (ret) - return ret; - return apic_op.value; -#endif -} - -static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) -{ -#ifndef CONFIG_XEN - struct io_apic __iomem *io_apic = io_apic_base(apic); - writel(reg, &io_apic->index); - writel(value, &io_apic->data); -#else - struct physdev_apic apic_op; - - apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr; - apic_op.reg = reg; - apic_op.value = value; - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op)); -#endif -} - -#ifndef CONFIG_XEN -/* - * Re-write a value: to be used for read-modify-write - * cycles where the read already set up the index register. - * - * Older SiS APIC requires we rewrite the index register - */ -static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) -{ - volatile struct io_apic __iomem *io_apic = io_apic_base(apic); - if (sis_apic_bug) - writel(reg, &io_apic->index); - writel(value, &io_apic->data); -} -#else -#define io_apic_modify io_apic_write -#endif - -union entry_union { - struct { u32 w1, w2; }; - struct IO_APIC_route_entry entry; -}; - -#ifndef CONFIG_XEN -static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) -{ - union entry_union eu; - unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); - eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); - eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); - return eu.entry; -} -#endif - -/* - * When we write a new IO APIC routing entry, we need to write the high - * word first! If the mask bit in the low word is clear, we will enable - * the interrupt, and we need to make sure the entry is fully populated - * before that happens. - */ -static void -__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) -{ - union entry_union eu; - eu.entry = e; - io_apic_write(apic, 0x11 + 2*pin, eu.w2); - io_apic_write(apic, 0x10 + 2*pin, eu.w1); -} - -static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) -{ - unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); - __ioapic_write_entry(apic, pin, e); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -#ifndef CONFIG_XEN -/* - * When we mask an IO APIC routing entry, we need to write the low - * word first, in order to set the mask bit before we change the - * high bits! - */ -static void ioapic_mask_entry(int apic, int pin) -{ - unsigned long flags; - union entry_union eu = { .entry.mask = 1 }; - - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x10 + 2*pin, eu.w1); - io_apic_write(apic, 0x11 + 2*pin, eu.w2); - spin_unlock_irqrestore(&ioapic_lock, flags); -} -#endif - -/* - * The common case is 1:1 IRQ<->pin mappings. Sometimes there are - * shared ISA-space IRQs, so we have to support them. We are super - * fast in the common case, and fast for shared ISA-space IRQs. - */ -static void add_pin_to_irq(unsigned int irq, int apic, int pin) -{ - static int first_free_entry = NR_IRQS; - struct irq_pin_list *entry = irq_2_pin + irq; - - while (entry->next) - entry = irq_2_pin + entry->next; - - if (entry->pin != -1) { - entry->next = first_free_entry; - entry = irq_2_pin + entry->next; - if (++first_free_entry >= PIN_MAP_SIZE) - panic("io_apic.c: whoops"); - } - entry->apic = apic; - entry->pin = pin; -} - -#ifndef CONFIG_XEN -/* - * Reroute an IRQ to a different pin. - */ -static void __init replace_pin_at_irq(unsigned int irq, - int oldapic, int oldpin, - int newapic, int newpin) -{ - struct irq_pin_list *entry = irq_2_pin + irq; - - while (1) { - if (entry->apic == oldapic && entry->pin == oldpin) { - entry->apic = newapic; - entry->pin = newpin; - } - if (!entry->next) - break; - entry = irq_2_pin + entry->next; - } -} - -static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable) -{ - struct irq_pin_list *entry = irq_2_pin + irq; - unsigned int pin, reg; - - for (;;) { - pin = entry->pin; - if (pin == -1) - break; - reg = io_apic_read(entry->apic, 0x10 + pin*2); - reg &= ~disable; - reg |= enable; - io_apic_modify(entry->apic, 0x10 + pin*2, reg); - if (!entry->next) - break; - entry = irq_2_pin + entry->next; - } -} - -/* mask = 1 */ -static void __mask_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0); -} - -/* mask = 0 */ -static void __unmask_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED); -} - -/* mask = 1, trigger = 0 */ -static void __mask_and_edge_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, - IO_APIC_REDIR_LEVEL_TRIGGER); -} - -/* mask = 0, trigger = 1 */ -static void __unmask_and_level_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER, - IO_APIC_REDIR_MASKED); -} - -static void mask_IO_APIC_irq(unsigned int irq) -{ - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - __mask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -static void unmask_IO_APIC_irq(unsigned int irq) -{ - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - __unmask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) -{ - struct IO_APIC_route_entry entry; - - /* Check delivery_mode to be sure we're not clearing an SMI pin */ - entry = ioapic_read_entry(apic, pin); - if (entry.delivery_mode == dest_SMI) - return; - - /* - * Disable it in the IO-APIC irq-routing table: - */ - ioapic_mask_entry(apic, pin); -} - -static void clear_IO_APIC(void) -{ - int apic, pin; - - for (apic = 0; apic < nr_ioapics; apic++) - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) - clear_IO_APIC_pin(apic, pin); -} - -#ifdef CONFIG_SMP -static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) -{ - unsigned long flags; - int pin; - struct irq_pin_list *entry = irq_2_pin + irq; - unsigned int apicid_value; - cpumask_t tmp; - - cpus_and(tmp, cpumask, cpu_online_map); - if (cpus_empty(tmp)) - tmp = TARGET_CPUS; - - cpus_and(cpumask, tmp, CPU_MASK_ALL); - - apicid_value = cpu_mask_to_apicid(cpumask); - /* Prepare to do the io_apic_write */ - apicid_value = apicid_value << 24; - spin_lock_irqsave(&ioapic_lock, flags); - for (;;) { - pin = entry->pin; - if (pin == -1) - break; - io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value); - if (!entry->next) - break; - entry = irq_2_pin + entry->next; - } - irq_desc[irq].affinity = cpumask; - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -#if defined(CONFIG_IRQBALANCE) -# include /* kernel_thread() */ -# include /* kstat */ -# include /* kmalloc() */ -# include - -#define IRQBALANCE_CHECK_ARCH -999 -#define MAX_BALANCED_IRQ_INTERVAL (5*HZ) -#define MIN_BALANCED_IRQ_INTERVAL (HZ/2) -#define BALANCED_IRQ_MORE_DELTA (HZ/10) -#define BALANCED_IRQ_LESS_DELTA (HZ) - -static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH; -static int physical_balance __read_mostly; -static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL; - -static struct irq_cpu_info { - unsigned long *last_irq; - unsigned long *irq_delta; - unsigned long irq; -} irq_cpu_data[NR_CPUS]; - -#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq) -#define LAST_CPU_IRQ(cpu, irq) (irq_cpu_data[cpu].last_irq[irq]) -#define IRQ_DELTA(cpu, irq) (irq_cpu_data[cpu].irq_delta[irq]) - -#define IDLE_ENOUGH(cpu,now) \ - (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1)) - -#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask) - -#define CPU_TO_PACKAGEINDEX(i) (first_cpu(per_cpu(cpu_sibling_map, i))) - -static cpumask_t balance_irq_affinity[NR_IRQS] = { - [0 ... NR_IRQS-1] = CPU_MASK_ALL -}; - -void set_balance_irq_affinity(unsigned int irq, cpumask_t mask) -{ - balance_irq_affinity[irq] = mask; -} - -static unsigned long move(int curr_cpu, cpumask_t allowed_mask, - unsigned long now, int direction) -{ - int search_idle = 1; - int cpu = curr_cpu; - - goto inside; - - do { - if (unlikely(cpu == curr_cpu)) - search_idle = 0; -inside: - if (direction == 1) { - cpu++; - if (cpu >= NR_CPUS) - cpu = 0; - } else { - cpu--; - if (cpu == -1) - cpu = NR_CPUS-1; - } - } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) || - (search_idle && !IDLE_ENOUGH(cpu, now))); - - return cpu; -} - -static inline void balance_irq(int cpu, int irq) -{ - unsigned long now = jiffies; - cpumask_t allowed_mask; - unsigned int new_cpu; - - if (irqbalance_disabled) - return; - - cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]); - new_cpu = move(cpu, allowed_mask, now, 1); - if (cpu != new_cpu) - set_pending_irq(irq, cpumask_of_cpu(new_cpu)); -} - -static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) -{ - int i, j; - - for_each_online_cpu(i) { - for (j = 0; j < NR_IRQS; j++) { - if (!irq_desc[j].action) - continue; - /* Is it a significant load ? */ - if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) < - useful_load_threshold) - continue; - balance_irq(i, j); - } - } - balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, - balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); - return; -} - -static void do_irq_balance(void) -{ - int i, j; - unsigned long max_cpu_irq = 0, min_cpu_irq = (~0); - unsigned long move_this_load = 0; - int max_loaded = 0, min_loaded = 0; - int load; - unsigned long useful_load_threshold = balanced_irq_interval + 10; - int selected_irq; - int tmp_loaded, first_attempt = 1; - unsigned long tmp_cpu_irq; - unsigned long imbalance = 0; - cpumask_t allowed_mask, target_cpu_mask, tmp; - - for_each_possible_cpu(i) { - int package_index; - CPU_IRQ(i) = 0; - if (!cpu_online(i)) - continue; - package_index = CPU_TO_PACKAGEINDEX(i); - for (j = 0; j < NR_IRQS; j++) { - unsigned long value_now, delta; - /* Is this an active IRQ or balancing disabled ? */ - if (!irq_desc[j].action || irq_balancing_disabled(j)) - continue; - if (package_index == i) - IRQ_DELTA(package_index, j) = 0; - /* Determine the total count per processor per IRQ */ - value_now = (unsigned long) kstat_cpu(i).irqs[j]; - - /* Determine the activity per processor per IRQ */ - delta = value_now - LAST_CPU_IRQ(i, j); - - /* Update last_cpu_irq[][] for the next time */ - LAST_CPU_IRQ(i, j) = value_now; - - /* Ignore IRQs whose rate is less than the clock */ - if (delta < useful_load_threshold) - continue; - /* update the load for the processor or package total */ - IRQ_DELTA(package_index, j) += delta; - - /* Keep track of the higher numbered sibling as well */ - if (i != package_index) - CPU_IRQ(i) += delta; - /* - * We have sibling A and sibling B in the package - * - * cpu_irq[A] = load for cpu A + load for cpu B - * cpu_irq[B] = load for cpu B - */ - CPU_IRQ(package_index) += delta; - } - } - /* Find the least loaded processor package */ - for_each_online_cpu(i) { - if (i != CPU_TO_PACKAGEINDEX(i)) - continue; - if (min_cpu_irq > CPU_IRQ(i)) { - min_cpu_irq = CPU_IRQ(i); - min_loaded = i; - } - } - max_cpu_irq = ULONG_MAX; - -tryanothercpu: - /* - * Look for heaviest loaded processor. - * We may come back to get the next heaviest loaded processor. - * Skip processors with trivial loads. - */ - tmp_cpu_irq = 0; - tmp_loaded = -1; - for_each_online_cpu(i) { - if (i != CPU_TO_PACKAGEINDEX(i)) - continue; - if (max_cpu_irq <= CPU_IRQ(i)) - continue; - if (tmp_cpu_irq < CPU_IRQ(i)) { - tmp_cpu_irq = CPU_IRQ(i); - tmp_loaded = i; - } - } - - if (tmp_loaded == -1) { - /* - * In the case of small number of heavy interrupt sources, - * loading some of the cpus too much. We use Ingo's original - * approach to rotate them around. - */ - if (!first_attempt && imbalance >= useful_load_threshold) { - rotate_irqs_among_cpus(useful_load_threshold); - return; - } - goto not_worth_the_effort; - } - - first_attempt = 0; /* heaviest search */ - max_cpu_irq = tmp_cpu_irq; /* load */ - max_loaded = tmp_loaded; /* processor */ - imbalance = (max_cpu_irq - min_cpu_irq) / 2; - - /* - * if imbalance is less than approx 10% of max load, then - * observe diminishing returns action. - quit - */ - if (imbalance < (max_cpu_irq >> 3)) - goto not_worth_the_effort; - -tryanotherirq: - /* if we select an IRQ to move that can't go where we want, then - * see if there is another one to try. - */ - move_this_load = 0; - selected_irq = -1; - for (j = 0; j < NR_IRQS; j++) { - /* Is this an active IRQ? */ - if (!irq_desc[j].action) - continue; - if (imbalance <= IRQ_DELTA(max_loaded, j)) - continue; - /* Try to find the IRQ that is closest to the imbalance - * without going over. - */ - if (move_this_load < IRQ_DELTA(max_loaded, j)) { - move_this_load = IRQ_DELTA(max_loaded, j); - selected_irq = j; - } - } - if (selected_irq == -1) - goto tryanothercpu; - - imbalance = move_this_load; - - /* For physical_balance case, we accumulated both load - * values in the one of the siblings cpu_irq[], - * to use the same code for physical and logical processors - * as much as possible. - * - * NOTE: the cpu_irq[] array holds the sum of the load for - * sibling A and sibling B in the slot for the lowest numbered - * sibling (A), _AND_ the load for sibling B in the slot for - * the higher numbered sibling. - * - * We seek the least loaded sibling by making the comparison - * (A+B)/2 vs B - */ - load = CPU_IRQ(min_loaded) >> 1; - for_each_cpu_mask(j, per_cpu(cpu_sibling_map, min_loaded)) { - if (load > CPU_IRQ(j)) { - /* This won't change cpu_sibling_map[min_loaded] */ - load = CPU_IRQ(j); - min_loaded = j; - } - } - - cpus_and(allowed_mask, - cpu_online_map, - balance_irq_affinity[selected_irq]); - target_cpu_mask = cpumask_of_cpu(min_loaded); - cpus_and(tmp, target_cpu_mask, allowed_mask); - - if (!cpus_empty(tmp)) { - /* mark for change destination */ - set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded)); - - /* Since we made a change, come back sooner to - * check for more variation. - */ - balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, - balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); - return; - } - goto tryanotherirq; - -not_worth_the_effort: - /* - * if we did not find an IRQ to move, then adjust the time interval - * upward - */ - balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL, - balanced_irq_interval + BALANCED_IRQ_MORE_DELTA); - return; -} - -static int balanced_irq(void *unused) -{ - int i; - unsigned long prev_balance_time = jiffies; - long time_remaining = balanced_irq_interval; - - /* push everything to CPU 0 to give us a starting point. */ - for (i = 0 ; i < NR_IRQS ; i++) { - irq_desc[i].pending_mask = cpumask_of_cpu(0); - set_pending_irq(i, cpumask_of_cpu(0)); - } - - set_freezable(); - for ( ; ; ) { - time_remaining = schedule_timeout_interruptible(time_remaining); - try_to_freeze(); - if (time_after(jiffies, - prev_balance_time+balanced_irq_interval)) { - preempt_disable(); - do_irq_balance(); - prev_balance_time = jiffies; - time_remaining = balanced_irq_interval; - preempt_enable(); - } - } - return 0; -} - -static int __init balanced_irq_init(void) -{ - int i; - struct cpuinfo_x86 *c; - cpumask_t tmp; - - cpus_shift_right(tmp, cpu_online_map, 2); - c = &boot_cpu_data; - /* When not overwritten by the command line ask subarchitecture. */ - if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH) - irqbalance_disabled = NO_BALANCE_IRQ; - if (irqbalance_disabled) - return 0; - - /* disable irqbalance completely if there is only one processor online */ - if (num_online_cpus() < 2) { - irqbalance_disabled = 1; - return 0; - } - /* - * Enable physical balance only if more than 1 physical processor - * is present - */ - if (smp_num_siblings > 1 && !cpus_empty(tmp)) - physical_balance = 1; - - for_each_online_cpu(i) { - irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); - irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); - if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) { - printk(KERN_ERR "balanced_irq_init: out of memory"); - goto failed; - } - } - - printk(KERN_INFO "Starting balanced_irq\n"); - if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd"))) - return 0; - printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq"); -failed: - for_each_possible_cpu(i) { - kfree(irq_cpu_data[i].irq_delta); - irq_cpu_data[i].irq_delta = NULL; - kfree(irq_cpu_data[i].last_irq); - irq_cpu_data[i].last_irq = NULL; - } - return 0; -} - -int __devinit irqbalance_disable(char *str) -{ - irqbalance_disabled = 1; - return 1; -} - -__setup("noirqbalance", irqbalance_disable); - -late_initcall(balanced_irq_init); -#endif /* CONFIG_IRQBALANCE */ -#endif /* CONFIG_SMP */ -#endif - -#ifndef CONFIG_SMP -void send_IPI_self(int vector) -{ -#ifndef CONFIG_XEN - unsigned int cfg; - - /* - * Wait for idle. - */ - apic_wait_icr_idle(); - cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL; - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - apic_write(APIC_ICR, cfg); -#endif -} -#endif /* !CONFIG_SMP */ - - -/* - * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to - * specific CPU-side IRQs. - */ - -#define MAX_PIRQS 8 -static int pirq_entries [MAX_PIRQS]; -static int pirqs_enabled; -int skip_ioapic_setup; - -static int __init ioapic_pirq_setup(char *str) -{ - int i, max; - int ints[MAX_PIRQS+1]; - - get_options(str, ARRAY_SIZE(ints), ints); - - for (i = 0; i < MAX_PIRQS; i++) - pirq_entries[i] = -1; - - pirqs_enabled = 1; - apic_printk(APIC_VERBOSE, KERN_INFO - "PIRQ redirection, working around broken MP-BIOS.\n"); - max = MAX_PIRQS; - if (ints[0] < MAX_PIRQS) - max = ints[0]; - - for (i = 0; i < max; i++) { - apic_printk(APIC_VERBOSE, KERN_DEBUG - "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); - /* - * PIRQs are mapped upside down, usually. - */ - pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; - } - return 1; -} - -__setup("pirq=", ioapic_pirq_setup); - -/* - * Find the IRQ entry number of a certain pin. - */ -static int find_irq_entry(int apic, int pin, int type) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mp_irqtype == type && - (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid || - mp_irqs[i].mp_dstapic == MP_APIC_ALL) && - mp_irqs[i].mp_dstirq == pin) - return i; - - return -1; -} - -#ifndef CONFIG_XEN -/* - * Find the pin to which IRQ[irq] (ISA) is connected - */ -static int __init find_isa_irq_pin(int irq, int type) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mp_srcbus; - - if (test_bit(lbus, mp_bus_not_pci) && - (mp_irqs[i].mp_irqtype == type) && - (mp_irqs[i].mp_srcbusirq == irq)) - - return mp_irqs[i].mp_dstirq; - } - return -1; -} - -static int __init find_isa_irq_apic(int irq, int type) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mp_srcbus; - - if (test_bit(lbus, mp_bus_not_pci) && - (mp_irqs[i].mp_irqtype == type) && - (mp_irqs[i].mp_srcbusirq == irq)) - break; - } - if (i < mp_irq_entries) { - int apic; - for (apic = 0; apic < nr_ioapics; apic++) { - if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic) - return apic; - } - } - - return -1; -} -#endif - -/* - * Find a specific PCI IRQ entry. - * Not an __init, possibly needed by modules - */ -static int pin_2_irq(int idx, int apic, int pin); - -int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) -{ - int apic, i, best_guess = -1; - - apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, " - "slot:%d, pin:%d.\n", bus, slot, pin); - if (test_bit(bus, mp_bus_not_pci)) { - printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus); - return -1; - } - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mp_srcbus; - - for (apic = 0; apic < nr_ioapics; apic++) - if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic || - mp_irqs[i].mp_dstapic == MP_APIC_ALL) - break; - - if (!test_bit(lbus, mp_bus_not_pci) && - !mp_irqs[i].mp_irqtype && - (bus == lbus) && - (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) { - int irq = pin_2_irq(i, apic, mp_irqs[i].mp_dstirq); - - if (!(apic || IO_APIC_IRQ(irq))) - continue; - - if (pin == (mp_irqs[i].mp_srcbusirq & 3)) - return irq; - /* - * Use the first all-but-pin matching entry as a - * best-guess fuzzy result for broken mptables. - */ - if (best_guess < 0) - best_guess = irq; - } - } - return best_guess; -} -EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); - -/* - * This function currently is only a helper for the i386 smp boot process where - * we need to reprogram the ioredtbls to cater for the cpus which have come online - * so mask in all cases should simply be TARGET_CPUS - */ -#ifdef CONFIG_SMP -#ifndef CONFIG_XEN -void __init setup_ioapic_dest(void) -{ - int pin, ioapic, irq, irq_entry; - - if (skip_ioapic_setup == 1) - return; - - for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { - for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { - irq_entry = find_irq_entry(ioapic, pin, mp_INT); - if (irq_entry == -1) - continue; - irq = pin_2_irq(irq_entry, ioapic, pin); - set_ioapic_affinity_irq(irq, TARGET_CPUS); - } - - } -} -#endif /* !CONFIG_XEN */ -#endif - -#if defined(CONFIG_EISA) || defined(CONFIG_MCA) -/* - * EISA Edge/Level control register, ELCR - */ -static int EISA_ELCR(unsigned int irq) -{ - if (irq < 16) { - unsigned int port = 0x4d0 + (irq >> 3); - return (inb(port) >> (irq & 7)) & 1; - } - apic_printk(APIC_VERBOSE, KERN_INFO - "Broken MPtable reports ISA irq %d\n", irq); - return 0; -} -#endif - -/* ISA interrupts are always polarity zero edge triggered, - * when listed as conforming in the MP table. */ - -#define default_ISA_trigger(idx) (0) -#define default_ISA_polarity(idx) (0) - -/* EISA interrupts are always polarity zero and can be edge or level - * trigger depending on the ELCR value. If an interrupt is listed as - * EISA conforming in the MP table, that means its trigger type must - * be read in from the ELCR */ - -#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq)) -#define default_EISA_polarity(idx) default_ISA_polarity(idx) - -/* PCI interrupts are always polarity one level triggered, - * when listed as conforming in the MP table. */ - -#define default_PCI_trigger(idx) (1) -#define default_PCI_polarity(idx) (1) - -/* MCA interrupts are always polarity zero level triggered, - * when listed as conforming in the MP table. */ - -#define default_MCA_trigger(idx) (1) -#define default_MCA_polarity(idx) default_ISA_polarity(idx) - -static int MPBIOS_polarity(int idx) -{ - int bus = mp_irqs[idx].mp_srcbus; - int polarity; - - /* - * Determine IRQ line polarity (high active or low active): - */ - switch (mp_irqs[idx].mp_irqflag & 3) { - case 0: /* conforms, ie. bus-type dependent polarity */ - { - polarity = test_bit(bus, mp_bus_not_pci)? - default_ISA_polarity(idx): - default_PCI_polarity(idx); - break; - } - case 1: /* high active */ - { - polarity = 0; - break; - } - case 2: /* reserved */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - case 3: /* low active */ - { - polarity = 1; - break; - } - default: /* invalid */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - } - return polarity; -} - -static int MPBIOS_trigger(int idx) -{ - int bus = mp_irqs[idx].mp_srcbus; - int trigger; - - /* - * Determine IRQ trigger mode (edge or level sensitive): - */ - switch ((mp_irqs[idx].mp_irqflag>>2) & 3) { - case 0: /* conforms, ie. bus-type dependent */ - { - trigger = test_bit(bus, mp_bus_not_pci)? - default_ISA_trigger(idx): - default_PCI_trigger(idx); -#if defined(CONFIG_EISA) || defined(CONFIG_MCA) - switch (mp_bus_id_to_type[bus]) { - case MP_BUS_ISA: /* ISA pin */ - { - /* set before the switch */ - break; - } - case MP_BUS_EISA: /* EISA pin */ - { - trigger = default_EISA_trigger(idx); - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - /* set before the switch */ - break; - } - case MP_BUS_MCA: /* MCA pin */ - { - trigger = default_MCA_trigger(idx); - break; - } - default: - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 1; - break; - } - } -#endif - break; - } - case 1: /* edge */ - { - trigger = 0; - break; - } - case 2: /* reserved */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 1; - break; - } - case 3: /* level */ - { - trigger = 1; - break; - } - default: /* invalid */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 0; - break; - } - } - return trigger; -} - -static inline int irq_polarity(int idx) -{ - return MPBIOS_polarity(idx); -} - -static inline int irq_trigger(int idx) -{ - return MPBIOS_trigger(idx); -} - -static int pin_2_irq(int idx, int apic, int pin) -{ - int irq, i; - int bus = mp_irqs[idx].mp_srcbus; - - /* - * Debugging check, we are in big trouble if this message pops up! - */ - if (mp_irqs[idx].mp_dstirq != pin) - printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); - - if (test_bit(bus, mp_bus_not_pci)) - irq = mp_irqs[idx].mp_srcbusirq; - else { - /* - * PCI IRQs are mapped in order - */ - i = irq = 0; - while (i < apic) - irq += nr_ioapic_registers[i++]; - irq += pin; - - /* - * For MPS mode, so far only needed by ES7000 platform - */ - if (ioapic_renumber_irq) - irq = ioapic_renumber_irq(apic, irq); - } - - /* - * PCI IRQ command line redirection. Yes, limits are hardcoded. - */ - if ((pin >= 16) && (pin <= 23)) { - if (pirq_entries[pin-16] != -1) { - if (!pirq_entries[pin-16]) { - apic_printk(APIC_VERBOSE, KERN_DEBUG - "disabling PIRQ%d\n", pin-16); - } else { - irq = pirq_entries[pin-16]; - apic_printk(APIC_VERBOSE, KERN_DEBUG - "using PIRQ%d -> IRQ %d\n", - pin-16, irq); - } - } - } - return irq; -} - -static inline int IO_APIC_irq_trigger(int irq) -{ - int apic, idx, pin; - - for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - idx = find_irq_entry(apic, pin, mp_INT); - if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) - return irq_trigger(idx); - } - } - /* - * nonexistent IRQs are edge default - */ - return 0; -} - -/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ -static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; /* = { FIRST_DEVICE_VECTOR , 0 }; */ - -static int __assign_irq_vector(int irq) -{ - int vector; - struct physdev_irq irq_op; - - BUG_ON((unsigned)irq >= NR_IRQ_VECTORS); - - if (irq < PIRQ_BASE || irq - PIRQ_BASE >= NR_PIRQS) - return -EINVAL; - - if (irq_vector[irq] > 0) - return irq_vector[irq]; - - irq_op.irq = irq; - if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) - return -ENOSPC; - - vector = irq_op.vector; - irq_vector[irq] = vector; - - return vector; -} - -static int assign_irq_vector(int irq) -{ - unsigned long flags; - int vector; - - spin_lock_irqsave(&vector_lock, flags); - vector = __assign_irq_vector(irq); - spin_unlock_irqrestore(&vector_lock, flags); - - return vector; -} - -#ifndef CONFIG_XEN -static struct irq_chip ioapic_chip; - -#define IOAPIC_AUTO -1 -#define IOAPIC_EDGE 0 -#define IOAPIC_LEVEL 1 - -static void ioapic_register_intr(int irq, int vector, unsigned long trigger) -{ - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) { - irq_desc[irq].status |= IRQ_LEVEL; - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_fasteoi_irq, "fasteoi"); - } else { - irq_desc[irq].status &= ~IRQ_LEVEL; - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_edge_irq, "edge"); - } - set_intr_gate(vector, interrupt[irq]); -} -#else -#define ioapic_register_intr(irq, vector, trigger) evtchn_register_pirq(irq) -#endif - -static void __init setup_IO_APIC_irqs(void) -{ - struct IO_APIC_route_entry entry; - int apic, pin, idx, irq, first_notcon = 1, vector; - - apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); - - for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - - /* - * add it to the IO-APIC irq-routing table: - */ - memset(&entry, 0, sizeof(entry)); - - entry.delivery_mode = INT_DELIVERY_MODE; - entry.dest_mode = INT_DEST_MODE; - entry.mask = 0; /* enable IRQ */ - entry.dest.logical.logical_dest = - cpu_mask_to_apicid(TARGET_CPUS); - - idx = find_irq_entry(apic, pin, mp_INT); - if (idx == -1) { - if (first_notcon) { - apic_printk(APIC_VERBOSE, KERN_DEBUG - " IO-APIC (apicid-pin) %d-%d", - mp_ioapics[apic].mp_apicid, - pin); - first_notcon = 0; - } else - apic_printk(APIC_VERBOSE, ", %d-%d", - mp_ioapics[apic].mp_apicid, pin); - continue; - } - - if (!first_notcon) { - apic_printk(APIC_VERBOSE, " not connected.\n"); - first_notcon = 1; - } - - entry.trigger = irq_trigger(idx); - entry.polarity = irq_polarity(idx); - - if (irq_trigger(idx)) { - entry.trigger = 1; - entry.mask = 1; - } - - irq = pin_2_irq(idx, apic, pin); - /* - * skip adding the timer int on secondary nodes, which causes - * a small but painful rift in the time-space continuum - */ - if (multi_timer_check(apic, irq)) - continue; - else - add_pin_to_irq(irq, apic, pin); - - if (/*!apic &&*/ !IO_APIC_IRQ(irq)) - continue; - - if (IO_APIC_IRQ(irq)) { - vector = assign_irq_vector(irq); - entry.vector = vector; - ioapic_register_intr(irq, vector, IOAPIC_AUTO); - - if (!apic && (irq < 16)) - disable_8259A_irq(irq); - } - ioapic_write_entry(apic, pin, entry); - } - } - - if (!first_notcon) - apic_printk(APIC_VERBOSE, " not connected.\n"); -} - -#ifndef CONFIG_XEN -/* - * Set up the timer pin, possibly with the 8259A-master behind. - */ -static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, - int vector) -{ - struct IO_APIC_route_entry entry; - - memset(&entry, 0, sizeof(entry)); - - /* - * We use logical delivery to get the timer IRQ - * to the first CPU. - */ - entry.dest_mode = INT_DEST_MODE; - entry.mask = 1; /* mask IRQ now */ - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); - entry.delivery_mode = INT_DELIVERY_MODE; - entry.polarity = 0; - entry.trigger = 0; - entry.vector = vector; - - /* - * The timer IRQ doesn't have to know that behind the - * scene we may have a 8259A-master in AEOI mode ... - */ - ioapic_register_intr(0, vector, IOAPIC_EDGE); - - /* - * Add it to the IO-APIC irq-routing table: - */ - ioapic_write_entry(apic, pin, entry); -} - -void __init print_IO_APIC(void) -{ - int apic, i; - union IO_APIC_reg_00 reg_00; - union IO_APIC_reg_01 reg_01; - union IO_APIC_reg_02 reg_02; - union IO_APIC_reg_03 reg_03; - unsigned long flags; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); - for (i = 0; i < nr_ioapics; i++) - printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", - mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]); - - /* - * We are a bit conservative about what we expect. We have to - * know about every hardware change ASAP. - */ - printk(KERN_INFO "testing the IO APIC.......................\n"); - - for (apic = 0; apic < nr_ioapics; apic++) { - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - reg_01.raw = io_apic_read(apic, 1); - if (reg_01.bits.version >= 0x10) - reg_02.raw = io_apic_read(apic, 2); - if (reg_01.bits.version >= 0x20) - reg_03.raw = io_apic_read(apic, 3); - spin_unlock_irqrestore(&ioapic_lock, flags); - - printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); - printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); - printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); - printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); - printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); - - printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw); - printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); - - printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); - printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); - - /* - * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, - * but the value of reg_02 is read as the previous read register - * value, so ignore it if reg_02 == reg_01. - */ - if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { - printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); - printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); - } - - /* - * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 - * or reg_03, but the value of reg_0[23] is read as the previous read - * register value, so ignore it if reg_03 == reg_0[12]. - */ - if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && - reg_03.raw != reg_01.raw) { - printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); - printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); - } - - printk(KERN_DEBUG ".... IRQ redirection table:\n"); - - printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol" - " Stat Dest Deli Vect: \n"); - - for (i = 0; i <= reg_01.bits.entries; i++) { - struct IO_APIC_route_entry entry; - - entry = ioapic_read_entry(apic, i); - - printk(KERN_DEBUG " %02x %03X %02X ", - i, - entry.dest.logical.logical_dest, - entry.dest.physical.physical_dest - ); - - printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", - entry.mask, - entry.trigger, - entry.irr, - entry.polarity, - entry.delivery_status, - entry.dest_mode, - entry.delivery_mode, - entry.vector - ); - } - } - printk(KERN_DEBUG "IRQ to pin mappings:\n"); - for (i = 0; i < NR_IRQS; i++) { - struct irq_pin_list *entry = irq_2_pin + i; - if (entry->pin < 0) - continue; - printk(KERN_DEBUG "IRQ%d ", i); - for (;;) { - printk("-> %d:%d", entry->apic, entry->pin); - if (!entry->next) - break; - entry = irq_2_pin + entry->next; - } - printk("\n"); - } - - printk(KERN_INFO ".................................... done.\n"); - - return; -} - -static void print_APIC_bitfield(int base) -{ - unsigned int v; - int i, j; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); - for (i = 0; i < 8; i++) { - v = apic_read(base + i*0x10); - for (j = 0; j < 32; j++) { - if (v & (1< 3) /* Due to the Pentium erratum 3AP. */ - apic_write(APIC_ESR, 0); - v = apic_read(APIC_ESR); - printk(KERN_DEBUG "... APIC ESR: %08x\n", v); - } - - v = apic_read(APIC_ICR); - printk(KERN_DEBUG "... APIC ICR: %08x\n", v); - v = apic_read(APIC_ICR2); - printk(KERN_DEBUG "... APIC ICR2: %08x\n", v); - - v = apic_read(APIC_LVTT); - printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); - - if (maxlvt > 3) { /* PC is LVT#4. */ - v = apic_read(APIC_LVTPC); - printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); - } - v = apic_read(APIC_LVT0); - printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); - v = apic_read(APIC_LVT1); - printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); - - if (maxlvt > 2) { /* ERR is LVT#3. */ - v = apic_read(APIC_LVTERR); - printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); - } - - v = apic_read(APIC_TMICT); - printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); - v = apic_read(APIC_TMCCT); - printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); - v = apic_read(APIC_TDCR); - printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); - printk("\n"); -} - -void print_all_local_APICs(void) -{ - on_each_cpu(print_local_APIC, NULL, 1); -} - -void /*__init*/ print_PIC(void) -{ - unsigned int v; - unsigned long flags; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "\nprinting PIC contents\n"); - - spin_lock_irqsave(&i8259A_lock, flags); - - v = inb(0xa1) << 8 | inb(0x21); - printk(KERN_DEBUG "... PIC IMR: %04x\n", v); - - v = inb(0xa0) << 8 | inb(0x20); - printk(KERN_DEBUG "... PIC IRR: %04x\n", v); - - outb(0x0b, 0xa0); - outb(0x0b, 0x20); - v = inb(0xa0) << 8 | inb(0x20); - outb(0x0a, 0xa0); - outb(0x0a, 0x20); - - spin_unlock_irqrestore(&i8259A_lock, flags); - - printk(KERN_DEBUG "... PIC ISR: %04x\n", v); - - v = inb(0x4d1) << 8 | inb(0x4d0); - printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); -} -#else -void __init print_IO_APIC(void) {} -#endif /* !CONFIG_XEN */ - -static void __init enable_IO_APIC(void) -{ - union IO_APIC_reg_01 reg_01; -#ifndef CONFIG_XEN - int i8259_apic, i8259_pin; -#endif - int i, apic; - unsigned long flags; - - for (i = 0; i < PIN_MAP_SIZE; i++) { - irq_2_pin[i].pin = -1; - irq_2_pin[i].next = 0; - } - if (!pirqs_enabled) - for (i = 0; i < MAX_PIRQS; i++) - pirq_entries[i] = -1; - - /* - * The number of IO-APIC IRQ registers (== #pins): - */ - for (apic = 0; apic < nr_ioapics; apic++) { - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(apic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - nr_ioapic_registers[apic] = reg_01.bits.entries+1; - } -#ifndef CONFIG_XEN - for (apic = 0; apic < nr_ioapics; apic++) { - int pin; - /* See if any of the pins is in ExtINT mode */ - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - struct IO_APIC_route_entry entry; - entry = ioapic_read_entry(apic, pin); - - - /* If the interrupt line is enabled and in ExtInt mode - * I have found the pin where the i8259 is connected. - */ - if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { - ioapic_i8259.apic = apic; - ioapic_i8259.pin = pin; - goto found_i8259; - } - } - } - found_i8259: - /* Look to see what if the MP table has reported the ExtINT */ - /* If we could not find the appropriate pin by looking at the ioapic - * the i8259 probably is not connected the ioapic but give the - * mptable a chance anyway. - */ - i8259_pin = find_isa_irq_pin(0, mp_ExtINT); - i8259_apic = find_isa_irq_apic(0, mp_ExtINT); - /* Trust the MP table if nothing is setup in the hardware */ - if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { - printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); - ioapic_i8259.pin = i8259_pin; - ioapic_i8259.apic = i8259_apic; - } - /* Complain if the MP table and the hardware disagree */ - if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && - (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) - { - printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); - } -#endif - - /* - * Do not trust the IO-APIC being empty at bootup - */ - clear_IO_APIC(); -} - -/* - * Not an __init, needed by the reboot code - */ -void disable_IO_APIC(void) -{ - /* - * Clear the IO-APIC before rebooting: - */ - clear_IO_APIC(); - -#ifndef CONFIG_XEN - /* - * If the i8259 is routed through an IOAPIC - * Put that IOAPIC in virtual wire mode - * so legacy interrupts can be delivered. - */ - if (ioapic_i8259.pin != -1) { - struct IO_APIC_route_entry entry; - - memset(&entry, 0, sizeof(entry)); - entry.mask = 0; /* Enabled */ - entry.trigger = 0; /* Edge */ - entry.irr = 0; - entry.polarity = 0; /* High */ - entry.delivery_status = 0; - entry.dest_mode = 0; /* Physical */ - entry.delivery_mode = dest_ExtINT; /* ExtInt */ - entry.vector = 0; - entry.dest.physical.physical_dest = - GET_APIC_ID(read_apic_id()); - - /* - * Add it to the IO-APIC irq-routing table: - */ - ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); - } - disconnect_bsp_APIC(ioapic_i8259.pin != -1); -#endif -} - -/* - * function to set the IO-APIC physical IDs based on the - * values stored in the MPC table. - * - * by Matt Domsch Tue Dec 21 12:25:05 CST 1999 - */ - -#ifndef CONFIG_XEN -static void __init setup_ioapic_ids_from_mpc(void) -{ - union IO_APIC_reg_00 reg_00; - physid_mask_t phys_id_present_map; - int apic; - int i; - unsigned char old_id; - unsigned long flags; - -#ifdef CONFIG_X86_NUMAQ - if (found_numaq) - return; -#endif - - /* - * Don't check I/O APIC IDs for xAPIC systems. They have - * no meaning without the serial APIC bus. - */ - if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) - return; - /* - * This is broken; anything with a real cpu count has to - * circumvent this idiocy regardless. - */ - phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); - - /* - * Set the IOAPIC ID to the value stored in the MPC table. - */ - for (apic = 0; apic < nr_ioapics; apic++) { - - /* Read the register 0 value */ - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - old_id = mp_ioapics[apic].mp_apicid; - - if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) { - printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", - apic, mp_ioapics[apic].mp_apicid); - printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", - reg_00.bits.ID); - mp_ioapics[apic].mp_apicid = reg_00.bits.ID; - } - - /* - * Sanity check, is the ID really free? Every APIC in a - * system must have a unique ID or we get lots of nice - * 'stuck on smp_invalidate_needed IPI wait' messages. - */ - if (check_apicid_used(phys_id_present_map, - mp_ioapics[apic].mp_apicid)) { - printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", - apic, mp_ioapics[apic].mp_apicid); - for (i = 0; i < get_physical_broadcast(); i++) - if (!physid_isset(i, phys_id_present_map)) - break; - if (i >= get_physical_broadcast()) - panic("Max APIC ID exceeded!\n"); - printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", - i); - physid_set(i, phys_id_present_map); - mp_ioapics[apic].mp_apicid = i; - } else { - physid_mask_t tmp; - tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid); - apic_printk(APIC_VERBOSE, "Setting %d in the " - "phys_id_present_map\n", - mp_ioapics[apic].mp_apicid); - physids_or(phys_id_present_map, phys_id_present_map, tmp); - } - - - /* - * We need to adjust the IRQ routing table - * if the ID changed. - */ - if (old_id != mp_ioapics[apic].mp_apicid) - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mp_dstapic == old_id) - mp_irqs[i].mp_dstapic - = mp_ioapics[apic].mp_apicid; - - /* - * Read the right value from the MPC table and - * write it into the ID register. - */ - apic_printk(APIC_VERBOSE, KERN_INFO - "...changing IO-APIC physical APIC ID to %d ...", - mp_ioapics[apic].mp_apicid); - - reg_00.bits.ID = mp_ioapics[apic].mp_apicid; - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0, reg_00.raw); - spin_unlock_irqrestore(&ioapic_lock, flags); - - /* - * Sanity check - */ - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid) - printk("could not set ID!\n"); - else - apic_printk(APIC_VERBOSE, " ok.\n"); - } -} - -int no_timer_check __initdata; - -static int __init notimercheck(char *s) -{ - no_timer_check = 1; - return 1; -} -__setup("no_timer_check", notimercheck); - -/* - * There is a nasty bug in some older SMP boards, their mptable lies - * about the timer IRQ. We do the following to work around the situation: - * - * - timer IRQ defaults to IO-APIC IRQ - * - if this function detects that timer IRQs are defunct, then we fall - * back to ISA timer IRQs - */ -static int __init timer_irq_works(void) -{ - unsigned long t1 = jiffies; - unsigned long flags; - - if (no_timer_check) - return 1; - - local_save_flags(flags); - local_irq_enable(); - /* Let ten ticks pass... */ - mdelay((10 * 1000) / HZ); - local_irq_restore(flags); - - /* - * Expect a few ticks at least, to be sure some possible - * glue logic does not lock up after one or two first - * ticks in a non-ExtINT mode. Also the local APIC - * might have cached one ExtINT interrupt. Finally, at - * least one tick may be lost due to delays. - */ - if (time_after(jiffies, t1 + 4)) - return 1; - - return 0; -} - -/* - * In the SMP+IOAPIC case it might happen that there are an unspecified - * number of pending IRQ events unhandled. These cases are very rare, - * so we 'resend' these IRQs via IPIs, to the same CPU. It's much - * better to do it this way as thus we do not have to be aware of - * 'pending' interrupts in the IRQ path, except at this point. - */ -/* - * Edge triggered needs to resend any interrupt - * that was delayed but this is now handled in the device - * independent code. - */ - -/* - * Startup quirk: - * - * Starting up a edge-triggered IO-APIC interrupt is - * nasty - we need to make sure that we get the edge. - * If it is already asserted for some reason, we need - * return 1 to indicate that is was pending. - * - * This is not complete - we should be able to fake - * an edge even if it isn't on the 8259A... - * - * (We do this for level-triggered IRQs too - it cannot hurt.) - */ -static unsigned int startup_ioapic_irq(unsigned int irq) -{ - int was_pending = 0; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - if (irq < 16) { - disable_8259A_irq(irq); - if (i8259A_irq_pending(irq)) - was_pending = 1; - } - __unmask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return was_pending; -} - -static void ack_ioapic_irq(unsigned int irq) -{ - move_native_irq(irq); - ack_APIC_irq(); -} - -static void ack_ioapic_quirk_irq(unsigned int irq) -{ - unsigned long v; - int i; - - move_native_irq(irq); -/* - * It appears there is an erratum which affects at least version 0x11 - * of I/O APIC (that's the 82093AA and cores integrated into various - * chipsets). Under certain conditions a level-triggered interrupt is - * erroneously delivered as edge-triggered one but the respective IRR - * bit gets set nevertheless. As a result the I/O unit expects an EOI - * message but it will never arrive and further interrupts are blocked - * from the source. The exact reason is so far unknown, but the - * phenomenon was observed when two consecutive interrupt requests - * from a given source get delivered to the same CPU and the source is - * temporarily disabled in between. - * - * A workaround is to simulate an EOI message manually. We achieve it - * by setting the trigger mode to edge and then to level when the edge - * trigger mode gets detected in the TMR of a local APIC for a - * level-triggered interrupt. We mask the source for the time of the - * operation to prevent an edge-triggered interrupt escaping meanwhile. - * The idea is from Manfred Spraul. --macro - */ - i = irq_vector[irq]; - - v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); - - ack_APIC_irq(); - - if (!(v & (1 << (i & 0x1f)))) { - atomic_inc(&irq_mis_count); - spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(irq); - __unmask_and_level_IO_APIC_irq(irq); - spin_unlock(&ioapic_lock); - } -} - -static int ioapic_retrigger_irq(unsigned int irq) -{ - send_IPI_self(irq_vector[irq]); - - return 1; -} - -static struct irq_chip ioapic_chip __read_mostly = { - .name = "IO-APIC", - .startup = startup_ioapic_irq, - .mask = mask_IO_APIC_irq, - .unmask = unmask_IO_APIC_irq, - .ack = ack_ioapic_irq, - .eoi = ack_ioapic_quirk_irq, -#ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity_irq, -#endif - .retrigger = ioapic_retrigger_irq, -}; -#endif /* !CONFIG_XEN */ - -static inline void init_IO_APIC_traps(void) -{ - int irq; - - /* - * NOTE! The local APIC isn't very good at handling - * multiple interrupts at the same interrupt level. - * As the interrupt level is determined by taking the - * vector number and shifting that right by 4, we - * want to spread these out a bit so that they don't - * all fall in the same interrupt level. - * - * Also, we've got to be careful not to trash gate - * 0x80, because int 0x80 is hm, kind of importantish. ;) - */ - for (irq = 0; irq < NR_IRQS ; irq++) { - if (IO_APIC_IRQ(irq) && !irq_vector[irq]) { - /* - * Hmm.. We don't have an entry for this, - * so default to an old-fashioned 8259 - * interrupt if we can.. - */ - if (irq < 16) - make_8259A_irq(irq); -#ifndef CONFIG_XEN - else - /* Strange. Oh, well.. */ - irq_desc[irq].chip = &no_irq_chip; -#endif - } - } -} - -#ifndef CONFIG_XEN -/* - * The local APIC irq-chip implementation: - */ - -static void ack_lapic_irq(unsigned int irq) -{ - ack_APIC_irq(); -} - -static void mask_lapic_irq(unsigned int irq) -{ - unsigned long v; - - v = apic_read(APIC_LVT0); - apic_write(APIC_LVT0, v | APIC_LVT_MASKED); -} - -static void unmask_lapic_irq(unsigned int irq) -{ - unsigned long v; - - v = apic_read(APIC_LVT0); - apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); -} - -static struct irq_chip lapic_chip __read_mostly = { - .name = "local-APIC", - .mask = mask_lapic_irq, - .unmask = unmask_lapic_irq, - .ack = ack_lapic_irq, -}; - -static void lapic_register_intr(int irq, int vector) -{ - irq_desc[irq].status &= ~IRQ_LEVEL; - set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, - "edge"); - set_intr_gate(vector, interrupt[irq]); -} - -static void __init setup_nmi(void) -{ - /* - * Dirty trick to enable the NMI watchdog ... - * We put the 8259A master into AEOI mode and - * unmask on all local APICs LVT0 as NMI. - * - * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') - * is from Maciej W. Rozycki - so we do not have to EOI from - * the NMI handler or the timer interrupt. - */ - apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); - - enable_NMI_through_LVT0(); - - apic_printk(APIC_VERBOSE, " done.\n"); -} - -/* - * This looks a bit hackish but it's about the only one way of sending - * a few INTA cycles to 8259As and any associated glue logic. ICR does - * not support the ExtINT mode, unfortunately. We need to send these - * cycles as some i82489DX-based boards have glue logic that keeps the - * 8259A interrupt line asserted until INTA. --macro - */ -static inline void __init unlock_ExtINT_logic(void) -{ - int apic, pin, i; - struct IO_APIC_route_entry entry0, entry1; - unsigned char save_control, save_freq_select; - - pin = find_isa_irq_pin(8, mp_INT); - if (pin == -1) { - WARN_ON_ONCE(1); - return; - } - apic = find_isa_irq_apic(8, mp_INT); - if (apic == -1) { - WARN_ON_ONCE(1); - return; - } - - entry0 = ioapic_read_entry(apic, pin); - clear_IO_APIC_pin(apic, pin); - - memset(&entry1, 0, sizeof(entry1)); - - entry1.dest_mode = 0; /* physical delivery */ - entry1.mask = 0; /* unmask IRQ now */ - entry1.dest.physical.physical_dest = hard_smp_processor_id(); - entry1.delivery_mode = dest_ExtINT; - entry1.polarity = entry0.polarity; - entry1.trigger = 0; - entry1.vector = 0; - - ioapic_write_entry(apic, pin, entry1); - - save_control = CMOS_READ(RTC_CONTROL); - save_freq_select = CMOS_READ(RTC_FREQ_SELECT); - CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, - RTC_FREQ_SELECT); - CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); - - i = 100; - while (i-- > 0) { - mdelay(10); - if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) - i -= 10; - } - - CMOS_WRITE(save_control, RTC_CONTROL); - CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); - clear_IO_APIC_pin(apic, pin); - - ioapic_write_entry(apic, pin, entry0); -} - -/* - * This code may look a bit paranoid, but it's supposed to cooperate with - * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ - * is so screwy. Thanks to Brian Perkins for testing/hacking this beast - * fanatically on his truly buggy board. - */ -static inline void __init check_timer(void) -{ - int apic1, pin1, apic2, pin2; - int no_pin1 = 0; - int vector; - unsigned int ver; - unsigned long flags; - - local_irq_save(flags); - - ver = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(ver); - - /* - * get/set the timer IRQ vector: - */ - disable_8259A_irq(0); - vector = assign_irq_vector(0); - set_intr_gate(vector, interrupt[0]); - - /* - * As IRQ0 is to be enabled in the 8259A, the virtual - * wire has to be disabled in the local APIC. Also - * timer interrupts need to be acknowledged manually in - * the 8259A for the i82489DX when using the NMI - * watchdog as that APIC treats NMIs as level-triggered. - * The AEOI mode will finish them in the 8259A - * automatically. - */ - apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); - init_8259A(1); - timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); - - pin1 = find_isa_irq_pin(0, mp_INT); - apic1 = find_isa_irq_apic(0, mp_INT); - pin2 = ioapic_i8259.pin; - apic2 = ioapic_i8259.apic; - - apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " - "apic1=%d pin1=%d apic2=%d pin2=%d\n", - vector, apic1, pin1, apic2, pin2); - - /* - * Some BIOS writers are clueless and report the ExtINTA - * I/O APIC input from the cascaded 8259A as the timer - * interrupt input. So just in case, if only one pin - * was found above, try it both directly and through the - * 8259A. - */ - if (pin1 == -1) { - pin1 = pin2; - apic1 = apic2; - no_pin1 = 1; - } else if (pin2 == -1) { - pin2 = pin1; - apic2 = apic1; - } - - if (pin1 != -1) { - /* - * Ok, does IRQ0 through the IOAPIC work? - */ - if (no_pin1) { - add_pin_to_irq(0, apic1, pin1); - setup_timer_IRQ0_pin(apic1, pin1, vector); - } - unmask_IO_APIC_irq(0); - if (timer_irq_works()) { - if (nmi_watchdog == NMI_IO_APIC) { - setup_nmi(); - enable_8259A_irq(0); - } - if (disable_timer_pin_1 > 0) - clear_IO_APIC_pin(0, pin1); - goto out; - } - clear_IO_APIC_pin(apic1, pin1); - if (!no_pin1) - apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " - "8254 timer not connected to IO-APIC\n"); - - apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer " - "(IRQ0) through the 8259A ...\n"); - apic_printk(APIC_QUIET, KERN_INFO - "..... (found apic %d pin %d) ...\n", apic2, pin2); - /* - * legacy devices should be connected to IO APIC #0 - */ - replace_pin_at_irq(0, apic1, pin1, apic2, pin2); - setup_timer_IRQ0_pin(apic2, pin2, vector); - unmask_IO_APIC_irq(0); - enable_8259A_irq(0); - if (timer_irq_works()) { - apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); - timer_through_8259 = 1; - if (nmi_watchdog == NMI_IO_APIC) { - disable_8259A_irq(0); - setup_nmi(); - enable_8259A_irq(0); - } - goto out; - } - /* - * Cleanup, just in case ... - */ - disable_8259A_irq(0); - clear_IO_APIC_pin(apic2, pin2); - apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); - } - - if (nmi_watchdog == NMI_IO_APIC) { - apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work " - "through the IO-APIC - disabling NMI Watchdog!\n"); - nmi_watchdog = NMI_NONE; - } - timer_ack = 0; - - apic_printk(APIC_QUIET, KERN_INFO - "...trying to set up timer as Virtual Wire IRQ...\n"); - - lapic_register_intr(0, vector); - apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ - enable_8259A_irq(0); - - if (timer_irq_works()) { - apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); - goto out; - } - disable_8259A_irq(0); - apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); - apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); - - apic_printk(APIC_QUIET, KERN_INFO - "...trying to set up timer as ExtINT IRQ...\n"); - - init_8259A(0); - make_8259A_irq(0); - apic_write(APIC_LVT0, APIC_DM_EXTINT); - - unlock_ExtINT_logic(); - - if (timer_irq_works()) { - apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); - goto out; - } - apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); - panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " - "report. Then try booting with the 'noapic' option.\n"); -out: - local_irq_restore(flags); -} -#else -int timer_uses_ioapic_pin_0 = 0; -#define check_timer() ((void)0) -#endif - -/* - * Traditionally ISA IRQ2 is the cascade IRQ, and is not available - * to devices. However there may be an I/O APIC pin available for - * this interrupt regardless. The pin may be left unconnected, but - * typically it will be reused as an ExtINT cascade interrupt for - * the master 8259A. In the MPS case such a pin will normally be - * reported as an ExtINT interrupt in the MP table. With ACPI - * there is no provision for ExtINT interrupts, and in the absence - * of an override it would be treated as an ordinary ISA I/O APIC - * interrupt, that is edge-triggered and unmasked by default. We - * used to do this, but it caused problems on some systems because - * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using - * the same ExtINT cascade interrupt to drive the local APIC of the - * bootstrap processor. Therefore we refrain from routing IRQ2 to - * the I/O APIC in all cases now. No actual device should request - * it anyway. --macro - */ -#define PIC_IRQS (1 << PIC_CASCADE_IR) - -void __init setup_IO_APIC(void) -{ -#ifndef CONFIG_XEN - int i; - - /* Reserve all the system vectors. */ - for (i = first_system_vector; i < NR_VECTORS; i++) - set_bit(i, used_vectors); -#endif - - enable_IO_APIC(); - - io_apic_irqs = ~PIC_IRQS; - - printk("ENABLING IO-APIC IRQs\n"); - -#ifndef CONFIG_XEN - /* - * Set up IO-APIC IRQ routing. - */ - if (!acpi_ioapic) - setup_ioapic_ids_from_mpc(); - sync_Arb_IDs(); -#endif - setup_IO_APIC_irqs(); - init_IO_APIC_traps(); - check_timer(); - if (!acpi_ioapic) - print_IO_APIC(); -} - -/* - * Called after all the initialization is done. If we didnt find any - * APIC bugs then we can allow the modify fast path - */ - -static int __init io_apic_bug_finalize(void) -{ - if (sis_apic_bug == -1) - sis_apic_bug = 0; - if (is_initial_xendomain()) { - struct xen_platform_op op = { .cmd = XENPF_platform_quirk }; - op.u.platform_quirk.quirk_id = sis_apic_bug ? - QUIRK_IOAPIC_BAD_REGSEL : QUIRK_IOAPIC_GOOD_REGSEL; - VOID(HYPERVISOR_platform_op(&op)); - } - return 0; -} - -late_initcall(io_apic_bug_finalize); - -#ifndef CONFIG_XEN - -struct sysfs_ioapic_data { - struct sys_device dev; - struct IO_APIC_route_entry entry[0]; -}; -static struct sysfs_ioapic_data *mp_ioapic_data[MAX_IO_APICS]; - -static int ioapic_suspend(struct sys_device *dev, pm_message_t state) -{ - struct IO_APIC_route_entry *entry; - struct sysfs_ioapic_data *data; - int i; - - data = container_of(dev, struct sysfs_ioapic_data, dev); - entry = data->entry; - for (i = 0; i < nr_ioapic_registers[dev->id]; i++) - entry[i] = ioapic_read_entry(dev->id, i); - - return 0; -} - -static int ioapic_resume(struct sys_device *dev) -{ - struct IO_APIC_route_entry *entry; - struct sysfs_ioapic_data *data; - unsigned long flags; - union IO_APIC_reg_00 reg_00; - int i; - - data = container_of(dev, struct sysfs_ioapic_data, dev); - entry = data->entry; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(dev->id, 0); - if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) { - reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid; - io_apic_write(dev->id, 0, reg_00.raw); - } - spin_unlock_irqrestore(&ioapic_lock, flags); - for (i = 0; i < nr_ioapic_registers[dev->id]; i++) - ioapic_write_entry(dev->id, i, entry[i]); - - return 0; -} - -static struct sysdev_class ioapic_sysdev_class = { - .name = "ioapic", - .suspend = ioapic_suspend, - .resume = ioapic_resume, -}; - -static int __init ioapic_init_sysfs(void) -{ - struct sys_device *dev; - int i, size, error = 0; - - error = sysdev_class_register(&ioapic_sysdev_class); - if (error) - return error; - - for (i = 0; i < nr_ioapics; i++) { - size = sizeof(struct sys_device) + nr_ioapic_registers[i] - * sizeof(struct IO_APIC_route_entry); - mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL); - if (!mp_ioapic_data[i]) { - printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); - continue; - } - dev = &mp_ioapic_data[i]->dev; - dev->id = i; - dev->cls = &ioapic_sysdev_class; - error = sysdev_register(dev); - if (error) { - kfree(mp_ioapic_data[i]); - mp_ioapic_data[i] = NULL; - printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); - continue; - } - } - - return 0; -} - -device_initcall(ioapic_init_sysfs); - -/* - * Dynamic irq allocate and deallocation - */ -int create_irq(void) -{ - /* Allocate an unused irq */ - int irq, new, vector = 0; - unsigned long flags; - - irq = -ENOSPC; - spin_lock_irqsave(&vector_lock, flags); - for (new = (NR_IRQS - 1); new >= 0; new--) { - if (platform_legacy_irq(new)) - continue; - if (irq_vector[new] != 0) - continue; - vector = __assign_irq_vector(new); - if (likely(vector > 0)) - irq = new; - break; - } - spin_unlock_irqrestore(&vector_lock, flags); - - if (irq >= 0) { - set_intr_gate(vector, interrupt[irq]); - dynamic_irq_init(irq); - } - return irq; -} - -void destroy_irq(unsigned int irq) -{ - unsigned long flags; - - dynamic_irq_cleanup(irq); - - spin_lock_irqsave(&vector_lock, flags); - clear_bit(irq_vector[irq], used_vectors); - irq_vector[irq] = 0; - spin_unlock_irqrestore(&vector_lock, flags); -} - -#endif /* CONFIG_XEN */ - -/* - * MSI message composition - */ -#if defined(CONFIG_PCI_MSI) && !defined(CONFIG_XEN) -static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) -{ - int vector; - unsigned dest; - - vector = assign_irq_vector(irq); - if (vector >= 0) { - dest = cpu_mask_to_apicid(TARGET_CPUS); - - msg->address_hi = MSI_ADDR_BASE_HI; - msg->address_lo = - MSI_ADDR_BASE_LO | - ((INT_DEST_MODE == 0) ? -MSI_ADDR_DEST_MODE_PHYSICAL: - MSI_ADDR_DEST_MODE_LOGICAL) | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? - MSI_ADDR_REDIRECTION_CPU: - MSI_ADDR_REDIRECTION_LOWPRI) | - MSI_ADDR_DEST_ID(dest); - - msg->data = - MSI_DATA_TRIGGER_EDGE | - MSI_DATA_LEVEL_ASSERT | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? -MSI_DATA_DELIVERY_FIXED: - MSI_DATA_DELIVERY_LOWPRI) | - MSI_DATA_VECTOR(vector); - } - return vector; -} - -#ifdef CONFIG_SMP -static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) -{ - struct msi_msg msg; - unsigned int dest; - cpumask_t tmp; - int vector; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - tmp = TARGET_CPUS; - - vector = assign_irq_vector(irq); - if (vector < 0) - return; - - dest = cpu_mask_to_apicid(mask); - - read_msi_msg(irq, &msg); - - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); - - write_msi_msg(irq, &msg); - irq_desc[irq].affinity = mask; -} -#endif /* CONFIG_SMP */ - -/* - * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, - * which implement the MSI or MSI-X Capability Structure. - */ -static struct irq_chip msi_chip = { - .name = "PCI-MSI", - .unmask = unmask_msi_irq, - .mask = mask_msi_irq, - .ack = ack_ioapic_irq, -#ifdef CONFIG_SMP - .set_affinity = set_msi_irq_affinity, -#endif - .retrigger = ioapic_retrigger_irq, -}; - -int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) -{ - struct msi_msg msg; - int irq, ret; - irq = create_irq(); - if (irq < 0) - return irq; - - ret = msi_compose_msg(dev, irq, &msg); - if (ret < 0) { - destroy_irq(irq); - return ret; - } - - set_irq_msi(irq, desc); - write_msi_msg(irq, &msg); - - set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, - "edge"); - - return 0; -} - -void arch_teardown_msi_irq(unsigned int irq) -{ - destroy_irq(irq); -} - -#endif /* CONFIG_PCI_MSI */ - -/* - * Hypertransport interrupt support - */ -#ifdef CONFIG_HT_IRQ - -#ifdef CONFIG_SMP - -static void target_ht_irq(unsigned int irq, unsigned int dest) -{ - struct ht_irq_msg msg; - fetch_ht_irq_msg(irq, &msg); - - msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK); - msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); - - msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest); - msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); - - write_ht_irq_msg(irq, &msg); -} - -static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) -{ - unsigned int dest; - cpumask_t tmp; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - tmp = TARGET_CPUS; - - cpus_and(mask, tmp, CPU_MASK_ALL); - - dest = cpu_mask_to_apicid(mask); - - target_ht_irq(irq, dest); - irq_desc[irq].affinity = mask; -} -#endif - -static struct irq_chip ht_irq_chip = { - .name = "PCI-HT", - .mask = mask_ht_irq, - .unmask = unmask_ht_irq, - .ack = ack_ioapic_irq, -#ifdef CONFIG_SMP - .set_affinity = set_ht_irq_affinity, -#endif - .retrigger = ioapic_retrigger_irq, -}; - -int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) -{ - int vector; - - vector = assign_irq_vector(irq); - if (vector >= 0) { - struct ht_irq_msg msg; - unsigned dest; - cpumask_t tmp; - - cpus_clear(tmp); - cpu_set(vector >> 8, tmp); - dest = cpu_mask_to_apicid(tmp); - - msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); - - msg.address_lo = - HT_IRQ_LOW_BASE | - HT_IRQ_LOW_DEST_ID(dest) | - HT_IRQ_LOW_VECTOR(vector) | - ((INT_DEST_MODE == 0) ? - HT_IRQ_LOW_DM_PHYSICAL : - HT_IRQ_LOW_DM_LOGICAL) | - HT_IRQ_LOW_RQEOI_EDGE | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? - HT_IRQ_LOW_MT_FIXED : - HT_IRQ_LOW_MT_ARBITRATED) | - HT_IRQ_LOW_IRQ_MASKED; - - write_ht_irq_msg(irq, &msg); - - set_irq_chip_and_handler_name(irq, &ht_irq_chip, - handle_edge_irq, "edge"); - } - return vector; -} -#endif /* CONFIG_HT_IRQ */ - -/* -------------------------------------------------------------------------- - ACPI-based IOAPIC Configuration - -------------------------------------------------------------------------- */ - -#ifdef CONFIG_ACPI - -#ifndef CONFIG_XEN -int __init io_apic_get_unique_id(int ioapic, int apic_id) -{ - union IO_APIC_reg_00 reg_00; - static physid_mask_t apic_id_map = PHYSID_MASK_NONE; - physid_mask_t tmp; - unsigned long flags; - int i = 0; - - /* - * The P4 platform supports up to 256 APIC IDs on two separate APIC - * buses (one for LAPICs, one for IOAPICs), where predecessors only - * supports up to 16 on one shared APIC bus. - * - * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full - * advantage of new APIC bus architecture. - */ - - if (physids_empty(apic_id_map)) - apic_id_map = ioapic_phys_id_map(phys_cpu_present_map); - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - if (apic_id >= get_physical_broadcast()) { - printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " - "%d\n", ioapic, apic_id, reg_00.bits.ID); - apic_id = reg_00.bits.ID; - } - - /* - * Every APIC in a system must have a unique ID or we get lots of nice - * 'stuck on smp_invalidate_needed IPI wait' messages. - */ - if (check_apicid_used(apic_id_map, apic_id)) { - - for (i = 0; i < get_physical_broadcast(); i++) { - if (!check_apicid_used(apic_id_map, i)) - break; - } - - if (i == get_physical_broadcast()) - panic("Max apic_id exceeded!\n"); - - printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " - "trying %d\n", ioapic, apic_id, i); - - apic_id = i; - } - - tmp = apicid_to_cpu_present(apic_id); - physids_or(apic_id_map, apic_id_map, tmp); - - if (reg_00.bits.ID != apic_id) { - reg_00.bits.ID = apic_id; - - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic, 0, reg_00.raw); - reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - /* Sanity check */ - if (reg_00.bits.ID != apic_id) { - printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); - return -1; - } - } - - apic_printk(APIC_VERBOSE, KERN_INFO - "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); - - return apic_id; -} -#endif /* !CONFIG_XEN */ - - -int __init io_apic_get_version(int ioapic) -{ - union IO_APIC_reg_01 reg_01; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return reg_01.bits.version; -} - - -int __init io_apic_get_redir_entries(int ioapic) -{ - union IO_APIC_reg_01 reg_01; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return reg_01.bits.entries; -} - - -int io_apic_set_pci_routing(int ioapic, int pin, int irq, int edge_level, int active_high_low) -{ - struct IO_APIC_route_entry entry; - - if (!IO_APIC_IRQ(irq)) { - printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", - ioapic); - return -EINVAL; - } - - /* - * Generate a PCI IRQ routing entry and program the IOAPIC accordingly. - * Note that we mask (disable) IRQs now -- these get enabled when the - * corresponding device driver registers for this IRQ. - */ - - memset(&entry, 0, sizeof(entry)); - - entry.delivery_mode = INT_DELIVERY_MODE; - entry.dest_mode = INT_DEST_MODE; - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); - entry.trigger = edge_level; - entry.polarity = active_high_low; - entry.mask = 1; - - /* - * IRQs < 16 are already in the irq_2_pin[] map - */ - if (irq >= 16) - add_pin_to_irq(irq, ioapic, pin); - - entry.vector = assign_irq_vector(irq); - - apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry " - "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic, - mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq, - edge_level, active_high_low); - - ioapic_register_intr(irq, entry.vector, edge_level); - - if (!ioapic && (irq < 16)) - disable_8259A_irq(irq); - - ioapic_write_entry(ioapic, pin, entry); - - return 0; -} - -int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) -{ - int i; - - if (skip_ioapic_setup) - return -1; - - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mp_irqtype == mp_INT && - mp_irqs[i].mp_srcbusirq == bus_irq) - break; - if (i >= mp_irq_entries) - return -1; - - *trigger = irq_trigger(i); - *polarity = irq_polarity(i); - return 0; -} - -#endif /* CONFIG_ACPI */ - -static int __init parse_disable_timer_pin_1(char *arg) -{ - disable_timer_pin_1 = 1; - return 0; -} -early_param("disable_timer_pin_1", parse_disable_timer_pin_1); - -static int __init parse_enable_timer_pin_1(char *arg) -{ - disable_timer_pin_1 = -1; - return 0; -} -early_param("enable_timer_pin_1", parse_enable_timer_pin_1); - -static int __init parse_noapic(char *arg) -{ - /* disable IO-APIC */ - disable_ioapic_setup(); - return 0; -} -early_param("noapic", parse_noapic); - -#ifndef CONFIG_XEN -void __init ioapic_init_mappings(void) -{ - unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; - int i; - - for (i = 0; i < nr_ioapics; i++) { - if (smp_found_config) { - ioapic_phys = mp_ioapics[i].mp_apicaddr; - if (!ioapic_phys) { - printk(KERN_ERR - "WARNING: bogus zero IO-APIC " - "address found in MPTABLE, " - "disabling IO/APIC support!\n"); - smp_found_config = 0; - skip_ioapic_setup = 1; - goto fake_ioapic_page; - } - } else { -fake_ioapic_page: - ioapic_phys = (unsigned long) - alloc_bootmem_pages(PAGE_SIZE); - ioapic_phys = __pa(ioapic_phys); - } - set_fixmap_nocache(idx, ioapic_phys); - printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n", - __fix_to_virt(idx), ioapic_phys); - idx++; - } -} -#endif --- head-2011-03-17.orig/arch/x86/kernel/io_apic_64-xen.c 2011-02-01 14:38:38.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,2448 +0,0 @@ -/* - * Intel IO-APIC support for multi-Pentium hosts. - * - * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo - * - * Many thanks to Stig Venaas for trying out countless experimental - * patches and reporting/debugging problems patiently! - * - * (c) 1999, Multiple IO-APIC support, developed by - * Ken-ichi Yaku and - * Hidemi Kishimoto , - * further tested and cleaned up by Zach Brown - * and Ingo Molnar - * - * Fixes - * Maciej W. Rozycki : Bits for genuine 82489DX APICs; - * thanks to Eric Gilmore - * and Rolf G. Tews - * for testing these extensively - * Paul Diefenbaugh : Added full ACPI support - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef CONFIG_ACPI -#include -#endif -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -struct irq_cfg { -#ifndef CONFIG_XEN - cpumask_t domain; - cpumask_t old_domain; -#endif - unsigned move_cleanup_count; - u8 vector; - u8 move_in_progress : 1; -}; - -/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ -static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly; - -static int assign_irq_vector(int irq, cpumask_t mask); - -#ifndef CONFIG_XEN -int first_system_vector = 0xfe; - -char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; -#endif - -#define __apicdebuginit __init - -int sis_apic_bug; /* not actually supported, dummy for compile */ - -static int no_timer_check; - -static int disable_timer_pin_1 __initdata; - -#ifdef CONFIG_XEN -#include -#include -#include - -/* Fake i8259 */ -#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq))) -#define disable_8259A_irq(_irq) ((void)0) -#define i8259A_irq_pending(_irq) (0) - -unsigned long io_apic_irqs; - -#define clear_IO_APIC() ((void)0) -#else -int timer_through_8259 __initdata; - -/* Where if anywhere is the i8259 connect in external int mode */ -static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; -#endif - -static DEFINE_SPINLOCK(ioapic_lock); -static DEFINE_SPINLOCK(vector_lock); - -/* - * # of IRQ routing registers - */ -int nr_ioapic_registers[MAX_IO_APICS]; - -/* I/O APIC entries */ -struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; -int nr_ioapics; - -/* MP IRQ source entries */ -struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; - -/* # of MP IRQ source entries */ -int mp_irq_entries; - -DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); - -/* - * Rough estimation of how many shared IRQs there are, can - * be changed anytime. - */ -#define MAX_PLUS_SHARED_IRQS NR_IRQS -#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) - -/* - * This is performance-critical, we want to do it O(1) - * - * the indexing order of this array favors 1:1 mappings - * between pins and IRQs. - */ - -static struct irq_pin_list { - short apic, pin, next; -} irq_2_pin[PIN_MAP_SIZE]; - -#ifndef CONFIG_XEN -struct io_apic { - unsigned int index; - unsigned int unused[3]; - unsigned int data; -}; - -static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) -{ - return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) - + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK); -} -#endif - -static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) -{ -#ifndef CONFIG_XEN - struct io_apic __iomem *io_apic = io_apic_base(apic); - writel(reg, &io_apic->index); - return readl(&io_apic->data); -#else - struct physdev_apic apic_op; - int ret; - - apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr; - apic_op.reg = reg; - ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op); - if (ret) - return ret; - return apic_op.value; -#endif -} - -static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) -{ -#ifndef CONFIG_XEN - struct io_apic __iomem *io_apic = io_apic_base(apic); - writel(reg, &io_apic->index); - writel(value, &io_apic->data); -#else - struct physdev_apic apic_op; - - apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr; - apic_op.reg = reg; - apic_op.value = value; - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op)); -#endif -} - -#ifdef CONFIG_XEN -#define io_apic_modify io_apic_write -#else -/* - * Re-write a value: to be used for read-modify-write - * cycles where the read already set up the index register. - */ -static inline void io_apic_modify(unsigned int apic, unsigned int value) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - writel(value, &io_apic->data); -} - -static bool io_apic_level_ack_pending(unsigned int irq) -{ - struct irq_pin_list *entry; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - entry = irq_2_pin + irq; - for (;;) { - unsigned int reg; - int pin; - - pin = entry->pin; - if (pin == -1) - break; - reg = io_apic_read(entry->apic, 0x10 + pin*2); - /* Is the remote IRR bit set? */ - if (reg & IO_APIC_REDIR_REMOTE_IRR) { - spin_unlock_irqrestore(&ioapic_lock, flags); - return true; - } - if (!entry->next) - break; - entry = irq_2_pin + entry->next; - } - spin_unlock_irqrestore(&ioapic_lock, flags); - - return false; -} -#endif - -/* - * Synchronize the IO-APIC and the CPU by doing - * a dummy read from the IO-APIC - */ -static inline void io_apic_sync(unsigned int apic) -{ -#ifndef CONFIG_XEN - struct io_apic __iomem *io_apic = io_apic_base(apic); - readl(&io_apic->data); -#endif -} - -union entry_union { - struct { u32 w1, w2; }; - struct IO_APIC_route_entry entry; -}; - -#ifndef CONFIG_XEN -static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) -{ - union entry_union eu; - unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); - eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); - eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); - return eu.entry; -} -#endif - -/* - * When we write a new IO APIC routing entry, we need to write the high - * word first! If the mask bit in the low word is clear, we will enable - * the interrupt, and we need to make sure the entry is fully populated - * before that happens. - */ -static void -__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) -{ - union entry_union eu; - eu.entry = e; - io_apic_write(apic, 0x11 + 2*pin, eu.w2); - io_apic_write(apic, 0x10 + 2*pin, eu.w1); -} - -static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) -{ - unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); - __ioapic_write_entry(apic, pin, e); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -#ifndef CONFIG_XEN -/* - * When we mask an IO APIC routing entry, we need to write the low - * word first, in order to set the mask bit before we change the - * high bits! - */ -static void ioapic_mask_entry(int apic, int pin) -{ - unsigned long flags; - union entry_union eu = { .entry.mask = 1 }; - - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x10 + 2*pin, eu.w1); - io_apic_write(apic, 0x11 + 2*pin, eu.w2); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -#ifdef CONFIG_SMP -static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) -{ - int apic, pin; - struct irq_pin_list *entry = irq_2_pin + irq; - - BUG_ON(irq >= NR_IRQS); - for (;;) { - unsigned int reg; - apic = entry->apic; - pin = entry->pin; - if (pin == -1) - break; - io_apic_write(apic, 0x11 + pin*2, dest); - reg = io_apic_read(apic, 0x10 + pin*2); - reg &= ~IO_APIC_REDIR_VECTOR_MASK; - reg |= vector; - io_apic_modify(apic, reg); - if (!entry->next) - break; - entry = irq_2_pin + entry->next; - } -} - -static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) -{ - struct irq_cfg *cfg = irq_cfg + irq; - unsigned long flags; - unsigned int dest; - cpumask_t tmp; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - - if (assign_irq_vector(irq, mask)) - return; - - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - - /* - * Only the high 8 bits are valid. - */ - dest = SET_APIC_LOGICAL_ID(dest); - - spin_lock_irqsave(&ioapic_lock, flags); - __target_IO_APIC_irq(irq, dest, cfg->vector); - irq_desc[irq].affinity = mask; - spin_unlock_irqrestore(&ioapic_lock, flags); -} -#endif -#endif - -/* - * The common case is 1:1 IRQ<->pin mappings. Sometimes there are - * shared ISA-space IRQs, so we have to support them. We are super - * fast in the common case, and fast for shared ISA-space IRQs. - */ -static void add_pin_to_irq(unsigned int irq, int apic, int pin) -{ - static int first_free_entry = NR_IRQS; - struct irq_pin_list *entry = irq_2_pin + irq; - - BUG_ON(irq >= NR_IRQS); - while (entry->next) - entry = irq_2_pin + entry->next; - - if (entry->pin != -1) { - entry->next = first_free_entry; - entry = irq_2_pin + entry->next; - if (++first_free_entry >= PIN_MAP_SIZE) - panic("io_apic.c: ran out of irq_2_pin entries!"); - } - entry->apic = apic; - entry->pin = pin; -} - -#ifndef CONFIG_XEN -/* - * Reroute an IRQ to a different pin. - */ -static void __init replace_pin_at_irq(unsigned int irq, - int oldapic, int oldpin, - int newapic, int newpin) -{ - struct irq_pin_list *entry = irq_2_pin + irq; - - while (1) { - if (entry->apic == oldapic && entry->pin == oldpin) { - entry->apic = newapic; - entry->pin = newpin; - } - if (!entry->next) - break; - entry = irq_2_pin + entry->next; - } -} - -#define __DO_ACTION(R, ACTION, FINAL) \ - \ -{ \ - int pin; \ - struct irq_pin_list *entry = irq_2_pin + irq; \ - \ - BUG_ON(irq >= NR_IRQS); \ - for (;;) { \ - unsigned int reg; \ - pin = entry->pin; \ - if (pin == -1) \ - break; \ - reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ - reg ACTION; \ - io_apic_modify(entry->apic, reg); \ - FINAL; \ - if (!entry->next) \ - break; \ - entry = irq_2_pin + entry->next; \ - } \ -} - -#define DO_ACTION(name,R,ACTION, FINAL) \ - \ - static void name##_IO_APIC_irq (unsigned int irq) \ - __DO_ACTION(R, ACTION, FINAL) - -/* mask = 1 */ -DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic)) - -/* mask = 0 */ -DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, ) - -static void mask_IO_APIC_irq (unsigned int irq) -{ - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - __mask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -static void unmask_IO_APIC_irq (unsigned int irq) -{ - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - __unmask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) -{ - struct IO_APIC_route_entry entry; - - /* Check delivery_mode to be sure we're not clearing an SMI pin */ - entry = ioapic_read_entry(apic, pin); - if (entry.delivery_mode == dest_SMI) - return; - /* - * Disable it in the IO-APIC irq-routing table: - */ - ioapic_mask_entry(apic, pin); -} - -static void clear_IO_APIC (void) -{ - int apic, pin; - - for (apic = 0; apic < nr_ioapics; apic++) - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) - clear_IO_APIC_pin(apic, pin); -} - -#endif /* !CONFIG_XEN */ - -int skip_ioapic_setup; -int ioapic_force; - -static int __init parse_noapic(char *str) -{ - disable_ioapic_setup(); - return 0; -} -early_param("noapic", parse_noapic); - -/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ -static int __init disable_timer_pin_setup(char *arg) -{ - disable_timer_pin_1 = 1; - return 1; -} -__setup("disable_timer_pin_1", disable_timer_pin_setup); - - -/* - * Find the IRQ entry number of a certain pin. - */ -static int find_irq_entry(int apic, int pin, int type) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mp_irqtype == type && - (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid || - mp_irqs[i].mp_dstapic == MP_APIC_ALL) && - mp_irqs[i].mp_dstirq == pin) - return i; - - return -1; -} - -#ifndef CONFIG_XEN -/* - * Find the pin to which IRQ[irq] (ISA) is connected - */ -static int __init find_isa_irq_pin(int irq, int type) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mp_srcbus; - - if (test_bit(lbus, mp_bus_not_pci) && - (mp_irqs[i].mp_irqtype == type) && - (mp_irqs[i].mp_srcbusirq == irq)) - - return mp_irqs[i].mp_dstirq; - } - return -1; -} - -static int __init find_isa_irq_apic(int irq, int type) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mp_srcbus; - - if (test_bit(lbus, mp_bus_not_pci) && - (mp_irqs[i].mp_irqtype == type) && - (mp_irqs[i].mp_srcbusirq == irq)) - break; - } - if (i < mp_irq_entries) { - int apic; - for(apic = 0; apic < nr_ioapics; apic++) { - if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic) - return apic; - } - } - - return -1; -} -#endif - -/* - * Find a specific PCI IRQ entry. - * Not an __init, possibly needed by modules - */ -static int pin_2_irq(int idx, int apic, int pin); - -int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) -{ - int apic, i, best_guess = -1; - - apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", - bus, slot, pin); - if (test_bit(bus, mp_bus_not_pci)) { - apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); - return -1; - } - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mp_srcbus; - - for (apic = 0; apic < nr_ioapics; apic++) - if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic || - mp_irqs[i].mp_dstapic == MP_APIC_ALL) - break; - - if (!test_bit(lbus, mp_bus_not_pci) && - !mp_irqs[i].mp_irqtype && - (bus == lbus) && - (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) { - int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq); - - if (!(apic || IO_APIC_IRQ(irq))) - continue; - - if (pin == (mp_irqs[i].mp_srcbusirq & 3)) - return irq; - /* - * Use the first all-but-pin matching entry as a - * best-guess fuzzy result for broken mptables. - */ - if (best_guess < 0) - best_guess = irq; - } - } - BUG_ON(best_guess >= NR_IRQS); - return best_guess; -} - -/* ISA interrupts are always polarity zero edge triggered, - * when listed as conforming in the MP table. */ - -#define default_ISA_trigger(idx) (0) -#define default_ISA_polarity(idx) (0) - -/* PCI interrupts are always polarity one level triggered, - * when listed as conforming in the MP table. */ - -#define default_PCI_trigger(idx) (1) -#define default_PCI_polarity(idx) (1) - -static int MPBIOS_polarity(int idx) -{ - int bus = mp_irqs[idx].mp_srcbus; - int polarity; - - /* - * Determine IRQ line polarity (high active or low active): - */ - switch (mp_irqs[idx].mp_irqflag & 3) - { - case 0: /* conforms, ie. bus-type dependent polarity */ - if (test_bit(bus, mp_bus_not_pci)) - polarity = default_ISA_polarity(idx); - else - polarity = default_PCI_polarity(idx); - break; - case 1: /* high active */ - { - polarity = 0; - break; - } - case 2: /* reserved */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - case 3: /* low active */ - { - polarity = 1; - break; - } - default: /* invalid */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - } - return polarity; -} - -static int MPBIOS_trigger(int idx) -{ - int bus = mp_irqs[idx].mp_srcbus; - int trigger; - - /* - * Determine IRQ trigger mode (edge or level sensitive): - */ - switch ((mp_irqs[idx].mp_irqflag>>2) & 3) - { - case 0: /* conforms, ie. bus-type dependent */ - if (test_bit(bus, mp_bus_not_pci)) - trigger = default_ISA_trigger(idx); - else - trigger = default_PCI_trigger(idx); - break; - case 1: /* edge */ - { - trigger = 0; - break; - } - case 2: /* reserved */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 1; - break; - } - case 3: /* level */ - { - trigger = 1; - break; - } - default: /* invalid */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 0; - break; - } - } - return trigger; -} - -static inline int irq_polarity(int idx) -{ - return MPBIOS_polarity(idx); -} - -static inline int irq_trigger(int idx) -{ - return MPBIOS_trigger(idx); -} - -static int pin_2_irq(int idx, int apic, int pin) -{ - int irq, i; - int bus = mp_irqs[idx].mp_srcbus; - - /* - * Debugging check, we are in big trouble if this message pops up! - */ - if (mp_irqs[idx].mp_dstirq != pin) - printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); - - if (test_bit(bus, mp_bus_not_pci)) { - irq = mp_irqs[idx].mp_srcbusirq; - } else { - /* - * PCI IRQs are mapped in order - */ - i = irq = 0; - while (i < apic) - irq += nr_ioapic_registers[i++]; - irq += pin; - } - BUG_ON(irq >= NR_IRQS); - return irq; -} - -void lock_vector_lock(void) -{ - /* Used to the online set of cpus does not change - * during assign_irq_vector. - */ - spin_lock(&vector_lock); -} - -void unlock_vector_lock(void) -{ - spin_unlock(&vector_lock); -} - -static int __assign_irq_vector(int irq, cpumask_t mask) -{ - struct physdev_irq irq_op; - struct irq_cfg *cfg; - - BUG_ON((unsigned)irq >= NR_IRQS); - - if (irq < PIRQ_BASE || irq - PIRQ_BASE >= NR_PIRQS) - return -EINVAL; - - cfg = &irq_cfg[irq]; - - if ((cfg->move_in_progress) || cfg->move_cleanup_count) - return -EBUSY; - - if (cfg->vector) - return 0; - - irq_op.irq = irq; - if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) - return -ENOSPC; - - cfg->vector = irq_op.vector; - - return 0; -} - -static int assign_irq_vector(int irq, cpumask_t mask) -{ - int err; - unsigned long flags; - - spin_lock_irqsave(&vector_lock, flags); - err = __assign_irq_vector(irq, mask); - spin_unlock_irqrestore(&vector_lock, flags); - return err; -} - -#ifndef CONFIG_XEN -static void __clear_irq_vector(int irq) -{ - struct irq_cfg *cfg; - cpumask_t mask; - int cpu, vector; - - BUG_ON((unsigned)irq >= NR_IRQS); - cfg = &irq_cfg[irq]; - BUG_ON(!cfg->vector); - - vector = cfg->vector; - cpus_and(mask, cfg->domain, cpu_online_map); - for_each_cpu_mask_nr(cpu, mask) - per_cpu(vector_irq, cpu)[vector] = -1; - - cfg->vector = 0; - cpus_clear(cfg->domain); -} - -void __setup_vector_irq(int cpu) -{ - /* Initialize vector_irq on a new cpu */ - /* This function must be called with vector_lock held */ - int irq, vector; - - /* Mark the inuse vectors */ - for (irq = 0; irq < NR_IRQS; ++irq) { - if (!cpu_isset(cpu, irq_cfg[irq].domain)) - continue; - vector = irq_cfg[irq].vector; - per_cpu(vector_irq, cpu)[vector] = irq; - } - /* Mark the free vectors */ - for (vector = 0; vector < NR_VECTORS; ++vector) { - irq = per_cpu(vector_irq, cpu)[vector]; - if (irq < 0) - continue; - if (!cpu_isset(cpu, irq_cfg[irq].domain)) - per_cpu(vector_irq, cpu)[vector] = -1; - } -} - -static struct irq_chip ioapic_chip; - -static void ioapic_register_intr(int irq, unsigned long trigger) -{ - if (trigger) { - irq_desc[irq].status |= IRQ_LEVEL; - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_fasteoi_irq, "fasteoi"); - } else { - irq_desc[irq].status &= ~IRQ_LEVEL; - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_edge_irq, "edge"); - } -} -#else -#define ioapic_register_intr(irq, trigger) evtchn_register_pirq(irq) -#endif /* !CONFIG_XEN */ - -static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, - int trigger, int polarity) -{ - struct irq_cfg *cfg = irq_cfg + irq; - struct IO_APIC_route_entry entry; - cpumask_t mask; - - if (!IO_APIC_IRQ(irq)) - return; - - mask = TARGET_CPUS; - if (assign_irq_vector(irq, mask)) - return; - -#ifndef CONFIG_XEN - cpus_and(mask, cfg->domain, mask); -#endif - - apic_printk(APIC_VERBOSE,KERN_DEBUG - "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " - "IRQ %d Mode:%i Active:%i)\n", - apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector, - irq, trigger, polarity); - - /* - * add it to the IO-APIC irq-routing table: - */ - memset(&entry,0,sizeof(entry)); - - entry.delivery_mode = INT_DELIVERY_MODE; - entry.dest_mode = INT_DEST_MODE; - entry.dest = cpu_mask_to_apicid(mask); - entry.mask = 0; /* enable IRQ */ - entry.trigger = trigger; - entry.polarity = polarity; - entry.vector = cfg->vector; - - /* Mask level triggered irqs. - * Use IRQ_DELAYED_DISABLE for edge triggered irqs. - */ - if (trigger) - entry.mask = 1; - - ioapic_register_intr(irq, trigger); - if (irq < 16) - disable_8259A_irq(irq); - - ioapic_write_entry(apic, pin, entry); -} - -static void __init setup_IO_APIC_irqs(void) -{ - int apic, pin, idx, irq, first_notcon = 1; - - apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); - - for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - - idx = find_irq_entry(apic,pin,mp_INT); - if (idx == -1) { - if (first_notcon) { - apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin); - first_notcon = 0; - } else - apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin); - continue; - } - if (!first_notcon) { - apic_printk(APIC_VERBOSE, " not connected.\n"); - first_notcon = 1; - } - - irq = pin_2_irq(idx, apic, pin); - add_pin_to_irq(irq, apic, pin); - - setup_IO_APIC_irq(apic, pin, irq, - irq_trigger(idx), irq_polarity(idx)); - } - } - - if (!first_notcon) - apic_printk(APIC_VERBOSE, " not connected.\n"); -} - -#ifndef CONFIG_XEN -/* - * Set up the timer pin, possibly with the 8259A-master behind. - */ -static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, - int vector) -{ - struct IO_APIC_route_entry entry; - - memset(&entry, 0, sizeof(entry)); - - /* - * We use logical delivery to get the timer IRQ - * to the first CPU. - */ - entry.dest_mode = INT_DEST_MODE; - entry.mask = 1; /* mask IRQ now */ - entry.dest = cpu_mask_to_apicid(TARGET_CPUS); - entry.delivery_mode = INT_DELIVERY_MODE; - entry.polarity = 0; - entry.trigger = 0; - entry.vector = vector; - - /* - * The timer IRQ doesn't have to know that behind the - * scene we may have a 8259A-master in AEOI mode ... - */ - set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); - - /* - * Add it to the IO-APIC irq-routing table: - */ - ioapic_write_entry(apic, pin, entry); -} - -void __apicdebuginit print_IO_APIC(void) -{ - int apic, i; - union IO_APIC_reg_00 reg_00; - union IO_APIC_reg_01 reg_01; - union IO_APIC_reg_02 reg_02; - unsigned long flags; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); - for (i = 0; i < nr_ioapics; i++) - printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", - mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]); - - /* - * We are a bit conservative about what we expect. We have to - * know about every hardware change ASAP. - */ - printk(KERN_INFO "testing the IO APIC.......................\n"); - - for (apic = 0; apic < nr_ioapics; apic++) { - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - reg_01.raw = io_apic_read(apic, 1); - if (reg_01.bits.version >= 0x10) - reg_02.raw = io_apic_read(apic, 2); - spin_unlock_irqrestore(&ioapic_lock, flags); - - printk("\n"); - printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); - printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); - printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); - - printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); - printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); - - printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); - printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); - - if (reg_01.bits.version >= 0x10) { - printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); - printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); - } - - printk(KERN_DEBUG ".... IRQ redirection table:\n"); - - printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" - " Stat Dmod Deli Vect: \n"); - - for (i = 0; i <= reg_01.bits.entries; i++) { - struct IO_APIC_route_entry entry; - - entry = ioapic_read_entry(apic, i); - - printk(KERN_DEBUG " %02x %03X ", - i, - entry.dest - ); - - printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", - entry.mask, - entry.trigger, - entry.irr, - entry.polarity, - entry.delivery_status, - entry.dest_mode, - entry.delivery_mode, - entry.vector - ); - } - } - printk(KERN_DEBUG "IRQ to pin mappings:\n"); - for (i = 0; i < NR_IRQS; i++) { - struct irq_pin_list *entry = irq_2_pin + i; - if (entry->pin < 0) - continue; - printk(KERN_DEBUG "IRQ%d ", i); - for (;;) { - printk("-> %d:%d", entry->apic, entry->pin); - if (!entry->next) - break; - entry = irq_2_pin + entry->next; - } - printk("\n"); - } - - printk(KERN_INFO ".................................... done.\n"); - - return; -} - -static __apicdebuginit void print_APIC_bitfield (int base) -{ - unsigned int v; - int i, j; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); - for (i = 0; i < 8; i++) { - v = apic_read(base + i*0x10); - for (j = 0; j < 32; j++) { - if (v & (1< 3) { /* PC is LVT#4. */ - v = apic_read(APIC_LVTPC); - printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); - } - v = apic_read(APIC_LVT0); - printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); - v = apic_read(APIC_LVT1); - printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); - - if (maxlvt > 2) { /* ERR is LVT#3. */ - v = apic_read(APIC_LVTERR); - printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); - } - - v = apic_read(APIC_TMICT); - printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); - v = apic_read(APIC_TMCCT); - printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); - v = apic_read(APIC_TDCR); - printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); - printk("\n"); -} - -void print_all_local_APICs (void) -{ - on_each_cpu(print_local_APIC, NULL, 1); -} - -void __apicdebuginit print_PIC(void) -{ - unsigned int v; - unsigned long flags; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "\nprinting PIC contents\n"); - - spin_lock_irqsave(&i8259A_lock, flags); - - v = inb(0xa1) << 8 | inb(0x21); - printk(KERN_DEBUG "... PIC IMR: %04x\n", v); - - v = inb(0xa0) << 8 | inb(0x20); - printk(KERN_DEBUG "... PIC IRR: %04x\n", v); - - outb(0x0b,0xa0); - outb(0x0b,0x20); - v = inb(0xa0) << 8 | inb(0x20); - outb(0x0a,0xa0); - outb(0x0a,0x20); - - spin_unlock_irqrestore(&i8259A_lock, flags); - - printk(KERN_DEBUG "... PIC ISR: %04x\n", v); - - v = inb(0x4d1) << 8 | inb(0x4d0); - printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); -} -#else -void __apicdebuginit print_IO_APIC(void) {} -#endif /* !CONFIG_XEN */ - -void __init enable_IO_APIC(void) -{ - union IO_APIC_reg_01 reg_01; -#ifndef CONFIG_XEN - int i8259_apic, i8259_pin; -#endif - int i, apic; - unsigned long flags; - - for (i = 0; i < PIN_MAP_SIZE; i++) { - irq_2_pin[i].pin = -1; - irq_2_pin[i].next = 0; - } - - /* - * The number of IO-APIC IRQ registers (== #pins): - */ - for (apic = 0; apic < nr_ioapics; apic++) { - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(apic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - nr_ioapic_registers[apic] = reg_01.bits.entries+1; - } -#ifndef CONFIG_XEN - for(apic = 0; apic < nr_ioapics; apic++) { - int pin; - /* See if any of the pins is in ExtINT mode */ - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - struct IO_APIC_route_entry entry; - entry = ioapic_read_entry(apic, pin); - - /* If the interrupt line is enabled and in ExtInt mode - * I have found the pin where the i8259 is connected. - */ - if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { - ioapic_i8259.apic = apic; - ioapic_i8259.pin = pin; - goto found_i8259; - } - } - } - found_i8259: - /* Look to see what if the MP table has reported the ExtINT */ - i8259_pin = find_isa_irq_pin(0, mp_ExtINT); - i8259_apic = find_isa_irq_apic(0, mp_ExtINT); - /* Trust the MP table if nothing is setup in the hardware */ - if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { - printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); - ioapic_i8259.pin = i8259_pin; - ioapic_i8259.apic = i8259_apic; - } - /* Complain if the MP table and the hardware disagree */ - if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && - (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) - { - printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); - } -#endif - - /* - * Do not trust the IO-APIC being empty at bootup - */ - clear_IO_APIC(); -} - -/* - * Not an __init, needed by the reboot code - */ -void disable_IO_APIC(void) -{ - /* - * Clear the IO-APIC before rebooting: - */ - clear_IO_APIC(); - -#ifndef CONFIG_XEN - /* - * If the i8259 is routed through an IOAPIC - * Put that IOAPIC in virtual wire mode - * so legacy interrupts can be delivered. - */ - if (ioapic_i8259.pin != -1) { - struct IO_APIC_route_entry entry; - - memset(&entry, 0, sizeof(entry)); - entry.mask = 0; /* Enabled */ - entry.trigger = 0; /* Edge */ - entry.irr = 0; - entry.polarity = 0; /* High */ - entry.delivery_status = 0; - entry.dest_mode = 0; /* Physical */ - entry.delivery_mode = dest_ExtINT; /* ExtInt */ - entry.vector = 0; - entry.dest = GET_APIC_ID(read_apic_id()); - - /* - * Add it to the IO-APIC irq-routing table: - */ - ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); - } - - disconnect_bsp_APIC(ioapic_i8259.pin != -1); -#endif -} - -/* - * There is a nasty bug in some older SMP boards, their mptable lies - * about the timer IRQ. We do the following to work around the situation: - * - * - timer IRQ defaults to IO-APIC IRQ - * - if this function detects that timer IRQs are defunct, then we fall - * back to ISA timer IRQs - */ -#ifndef CONFIG_XEN -static int __init timer_irq_works(void) -{ - unsigned long t1 = jiffies; - unsigned long flags; - - local_save_flags(flags); - local_irq_enable(); - /* Let ten ticks pass... */ - mdelay((10 * 1000) / HZ); - local_irq_restore(flags); - - /* - * Expect a few ticks at least, to be sure some possible - * glue logic does not lock up after one or two first - * ticks in a non-ExtINT mode. Also the local APIC - * might have cached one ExtINT interrupt. Finally, at - * least one tick may be lost due to delays. - */ - - /* jiffies wrap? */ - if (time_after(jiffies, t1 + 4)) - return 1; - return 0; -} - -/* - * In the SMP+IOAPIC case it might happen that there are an unspecified - * number of pending IRQ events unhandled. These cases are very rare, - * so we 'resend' these IRQs via IPIs, to the same CPU. It's much - * better to do it this way as thus we do not have to be aware of - * 'pending' interrupts in the IRQ path, except at this point. - */ -/* - * Edge triggered needs to resend any interrupt - * that was delayed but this is now handled in the device - * independent code. - */ - -/* - * Starting up a edge-triggered IO-APIC interrupt is - * nasty - we need to make sure that we get the edge. - * If it is already asserted for some reason, we need - * return 1 to indicate that is was pending. - * - * This is not complete - we should be able to fake - * an edge even if it isn't on the 8259A... - */ - -static unsigned int startup_ioapic_irq(unsigned int irq) -{ - int was_pending = 0; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - if (irq < 16) { - disable_8259A_irq(irq); - if (i8259A_irq_pending(irq)) - was_pending = 1; - } - __unmask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return was_pending; -} - -static int ioapic_retrigger_irq(unsigned int irq) -{ - struct irq_cfg *cfg = &irq_cfg[irq]; - unsigned long flags; - - spin_lock_irqsave(&vector_lock, flags); - send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector); - spin_unlock_irqrestore(&vector_lock, flags); - - return 1; -} - -/* - * Level and edge triggered IO-APIC interrupts need different handling, - * so we use two separate IRQ descriptors. Edge triggered IRQs can be - * handled with the level-triggered descriptor, but that one has slightly - * more overhead. Level-triggered interrupts cannot be handled with the - * edge-triggered handler, without risking IRQ storms and other ugly - * races. - */ - -#ifdef CONFIG_SMP -asmlinkage void smp_irq_move_cleanup_interrupt(void) -{ - unsigned vector, me; - ack_APIC_irq(); - exit_idle(); - irq_enter(); - - me = smp_processor_id(); - for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { - unsigned int irq; - struct irq_desc *desc; - struct irq_cfg *cfg; - irq = __get_cpu_var(vector_irq)[vector]; - if (irq >= NR_IRQS) - continue; - - desc = irq_desc + irq; - cfg = irq_cfg + irq; - spin_lock(&desc->lock); - if (!cfg->move_cleanup_count) - goto unlock; - - if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) - goto unlock; - - __get_cpu_var(vector_irq)[vector] = -1; - cfg->move_cleanup_count--; -unlock: - spin_unlock(&desc->lock); - } - - irq_exit(); -} - -static void irq_complete_move(unsigned int irq) -{ - struct irq_cfg *cfg = irq_cfg + irq; - unsigned vector, me; - - if (likely(!cfg->move_in_progress)) - return; - - vector = ~get_irq_regs()->orig_ax; - me = smp_processor_id(); - if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { - cpumask_t cleanup_mask; - - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } -} -#else -static inline void irq_complete_move(unsigned int irq) {} -#endif - -static void ack_apic_edge(unsigned int irq) -{ - irq_complete_move(irq); - move_native_irq(irq); - ack_APIC_irq(); -} - -static void ack_apic_level(unsigned int irq) -{ - int do_unmask_irq = 0; - - irq_complete_move(irq); -#ifdef CONFIG_GENERIC_PENDING_IRQ - /* If we are moving the irq we need to mask it */ - if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) { - do_unmask_irq = 1; - mask_IO_APIC_irq(irq); - } -#endif - - /* - * We must acknowledge the irq before we move it or the acknowledge will - * not propagate properly. - */ - ack_APIC_irq(); - - /* Now we can move and renable the irq */ - if (unlikely(do_unmask_irq)) { - /* Only migrate the irq if the ack has been received. - * - * On rare occasions the broadcast level triggered ack gets - * delayed going to ioapics, and if we reprogram the - * vector while Remote IRR is still set the irq will never - * fire again. - * - * To prevent this scenario we read the Remote IRR bit - * of the ioapic. This has two effects. - * - On any sane system the read of the ioapic will - * flush writes (and acks) going to the ioapic from - * this cpu. - * - We get to see if the ACK has actually been delivered. - * - * Based on failed experiments of reprogramming the - * ioapic entry from outside of irq context starting - * with masking the ioapic entry and then polling until - * Remote IRR was clear before reprogramming the - * ioapic I don't trust the Remote IRR bit to be - * completey accurate. - * - * However there appears to be no other way to plug - * this race, so if the Remote IRR bit is not - * accurate and is causing problems then it is a hardware bug - * and you can go talk to the chipset vendor about it. - */ - if (!io_apic_level_ack_pending(irq)) - move_masked_irq(irq); - unmask_IO_APIC_irq(irq); - } -} - -static struct irq_chip ioapic_chip __read_mostly = { - .name = "IO-APIC", - .startup = startup_ioapic_irq, - .mask = mask_IO_APIC_irq, - .unmask = unmask_IO_APIC_irq, - .ack = ack_apic_edge, - .eoi = ack_apic_level, -#ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity_irq, -#endif - .retrigger = ioapic_retrigger_irq, -}; -#endif /* !CONFIG_XEN */ - -static inline void init_IO_APIC_traps(void) -{ - int irq; - - /* - * NOTE! The local APIC isn't very good at handling - * multiple interrupts at the same interrupt level. - * As the interrupt level is determined by taking the - * vector number and shifting that right by 4, we - * want to spread these out a bit so that they don't - * all fall in the same interrupt level. - * - * Also, we've got to be careful not to trash gate - * 0x80, because int 0x80 is hm, kind of importantish. ;) - */ - for (irq = 0; irq < NR_IRQS ; irq++) { - if (IO_APIC_IRQ(irq) && !irq_cfg[irq].vector) { - /* - * Hmm.. We don't have an entry for this, - * so default to an old-fashioned 8259 - * interrupt if we can.. - */ - if (irq < 16) - make_8259A_irq(irq); -#ifndef CONFIG_XEN - else - /* Strange. Oh, well.. */ - irq_desc[irq].chip = &no_irq_chip; -#endif - } - } -} - -#ifndef CONFIG_XEN -static void unmask_lapic_irq(unsigned int irq) -{ - unsigned long v; - - v = apic_read(APIC_LVT0); - apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); -} - -static void mask_lapic_irq(unsigned int irq) -{ - unsigned long v; - - v = apic_read(APIC_LVT0); - apic_write(APIC_LVT0, v | APIC_LVT_MASKED); -} - -static void ack_lapic_irq (unsigned int irq) -{ - ack_APIC_irq(); -} - -static struct irq_chip lapic_chip __read_mostly = { - .name = "local-APIC", - .mask = mask_lapic_irq, - .unmask = unmask_lapic_irq, - .ack = ack_lapic_irq, -}; - -static void lapic_register_intr(int irq) -{ - irq_desc[irq].status &= ~IRQ_LEVEL; - set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, - "edge"); -} - -static void __init setup_nmi(void) -{ - /* - * Dirty trick to enable the NMI watchdog ... - * We put the 8259A master into AEOI mode and - * unmask on all local APICs LVT0 as NMI. - * - * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') - * is from Maciej W. Rozycki - so we do not have to EOI from - * the NMI handler or the timer interrupt. - */ - printk(KERN_INFO "activating NMI Watchdog ..."); - - enable_NMI_through_LVT0(); - - printk(" done.\n"); -} - -/* - * This looks a bit hackish but it's about the only one way of sending - * a few INTA cycles to 8259As and any associated glue logic. ICR does - * not support the ExtINT mode, unfortunately. We need to send these - * cycles as some i82489DX-based boards have glue logic that keeps the - * 8259A interrupt line asserted until INTA. --macro - */ -static inline void __init unlock_ExtINT_logic(void) -{ - int apic, pin, i; - struct IO_APIC_route_entry entry0, entry1; - unsigned char save_control, save_freq_select; - - pin = find_isa_irq_pin(8, mp_INT); - apic = find_isa_irq_apic(8, mp_INT); - if (pin == -1) - return; - - entry0 = ioapic_read_entry(apic, pin); - - clear_IO_APIC_pin(apic, pin); - - memset(&entry1, 0, sizeof(entry1)); - - entry1.dest_mode = 0; /* physical delivery */ - entry1.mask = 0; /* unmask IRQ now */ - entry1.dest = hard_smp_processor_id(); - entry1.delivery_mode = dest_ExtINT; - entry1.polarity = entry0.polarity; - entry1.trigger = 0; - entry1.vector = 0; - - ioapic_write_entry(apic, pin, entry1); - - save_control = CMOS_READ(RTC_CONTROL); - save_freq_select = CMOS_READ(RTC_FREQ_SELECT); - CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, - RTC_FREQ_SELECT); - CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); - - i = 100; - while (i-- > 0) { - mdelay(10); - if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) - i -= 10; - } - - CMOS_WRITE(save_control, RTC_CONTROL); - CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); - clear_IO_APIC_pin(apic, pin); - - ioapic_write_entry(apic, pin, entry0); -} - -/* - * This code may look a bit paranoid, but it's supposed to cooperate with - * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ - * is so screwy. Thanks to Brian Perkins for testing/hacking this beast - * fanatically on his truly buggy board. - * - * FIXME: really need to revamp this for modern platforms only. - */ -static inline void __init check_timer(void) -{ - struct irq_cfg *cfg = irq_cfg + 0; - int apic1, pin1, apic2, pin2; - unsigned long flags; - int no_pin1 = 0; - - local_irq_save(flags); - - /* - * get/set the timer IRQ vector: - */ - disable_8259A_irq(0); - assign_irq_vector(0, TARGET_CPUS); - - /* - * As IRQ0 is to be enabled in the 8259A, the virtual - * wire has to be disabled in the local APIC. - */ - apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); - init_8259A(1); - - pin1 = find_isa_irq_pin(0, mp_INT); - apic1 = find_isa_irq_apic(0, mp_INT); - pin2 = ioapic_i8259.pin; - apic2 = ioapic_i8259.apic; - - apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " - "apic1=%d pin1=%d apic2=%d pin2=%d\n", - cfg->vector, apic1, pin1, apic2, pin2); - - /* - * Some BIOS writers are clueless and report the ExtINTA - * I/O APIC input from the cascaded 8259A as the timer - * interrupt input. So just in case, if only one pin - * was found above, try it both directly and through the - * 8259A. - */ - if (pin1 == -1) { - pin1 = pin2; - apic1 = apic2; - no_pin1 = 1; - } else if (pin2 == -1) { - pin2 = pin1; - apic2 = apic1; - } - - if (pin1 != -1) { - /* - * Ok, does IRQ0 through the IOAPIC work? - */ - if (no_pin1) { - add_pin_to_irq(0, apic1, pin1); - setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); - } - unmask_IO_APIC_irq(0); - if (!no_timer_check && timer_irq_works()) { - if (nmi_watchdog == NMI_IO_APIC) { - setup_nmi(); - enable_8259A_irq(0); - } - if (disable_timer_pin_1 > 0) - clear_IO_APIC_pin(0, pin1); - goto out; - } - clear_IO_APIC_pin(apic1, pin1); - if (!no_pin1) - apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " - "8254 timer not connected to IO-APIC\n"); - - apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer " - "(IRQ0) through the 8259A ...\n"); - apic_printk(APIC_QUIET, KERN_INFO - "..... (found apic %d pin %d) ...\n", apic2, pin2); - /* - * legacy devices should be connected to IO APIC #0 - */ - replace_pin_at_irq(0, apic1, pin1, apic2, pin2); - setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); - unmask_IO_APIC_irq(0); - enable_8259A_irq(0); - if (timer_irq_works()) { - apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); - timer_through_8259 = 1; - if (nmi_watchdog == NMI_IO_APIC) { - disable_8259A_irq(0); - setup_nmi(); - enable_8259A_irq(0); - } - goto out; - } - /* - * Cleanup, just in case ... - */ - disable_8259A_irq(0); - clear_IO_APIC_pin(apic2, pin2); - apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); - } - - if (nmi_watchdog == NMI_IO_APIC) { - apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work " - "through the IO-APIC - disabling NMI Watchdog!\n"); - nmi_watchdog = NMI_NONE; - } - - apic_printk(APIC_QUIET, KERN_INFO - "...trying to set up timer as Virtual Wire IRQ...\n"); - - lapic_register_intr(0); - apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ - enable_8259A_irq(0); - - if (timer_irq_works()) { - apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); - goto out; - } - disable_8259A_irq(0); - apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); - apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); - - apic_printk(APIC_QUIET, KERN_INFO - "...trying to set up timer as ExtINT IRQ...\n"); - - init_8259A(0); - make_8259A_irq(0); - apic_write(APIC_LVT0, APIC_DM_EXTINT); - - unlock_ExtINT_logic(); - - if (timer_irq_works()) { - apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); - goto out; - } - apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); - panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " - "report. Then try booting with the 'noapic' option.\n"); -out: - local_irq_restore(flags); -} -#else -#define check_timer() ((void)0) -int timer_uses_ioapic_pin_0 = 0; -#endif /* !CONFIG_XEN */ - -static int __init notimercheck(char *s) -{ - no_timer_check = 1; - return 1; -} -__setup("no_timer_check", notimercheck); - -/* - * - * Traditionally ISA IRQ2 is the cascade IRQ, and is not available - * to devices. However there may be an I/O APIC pin available for - * this interrupt regardless. The pin may be left unconnected, but - * typically it will be reused as an ExtINT cascade interrupt for - * the master 8259A. In the MPS case such a pin will normally be - * reported as an ExtINT interrupt in the MP table. With ACPI - * there is no provision for ExtINT interrupts, and in the absence - * of an override it would be treated as an ordinary ISA I/O APIC - * interrupt, that is edge-triggered and unmasked by default. We - * used to do this, but it caused problems on some systems because - * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using - * the same ExtINT cascade interrupt to drive the local APIC of the - * bootstrap processor. Therefore we refrain from routing IRQ2 to - * the I/O APIC in all cases now. No actual device should request - * it anyway. --macro - */ -#define PIC_IRQS (1<<2) - -void __init setup_IO_APIC(void) -{ - enable_IO_APIC(); - - io_apic_irqs = ~PIC_IRQS; - - apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); - -#ifndef CONFIG_XEN - sync_Arb_IDs(); -#endif /* !CONFIG_XEN */ - setup_IO_APIC_irqs(); - init_IO_APIC_traps(); - check_timer(); - if (!acpi_ioapic) - print_IO_APIC(); -} - -#ifndef CONFIG_XEN - -struct sysfs_ioapic_data { - struct sys_device dev; - struct IO_APIC_route_entry entry[0]; -}; -static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; - -static int ioapic_suspend(struct sys_device *dev, pm_message_t state) -{ - struct IO_APIC_route_entry *entry; - struct sysfs_ioapic_data *data; - int i; - - data = container_of(dev, struct sysfs_ioapic_data, dev); - entry = data->entry; - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) - *entry = ioapic_read_entry(dev->id, i); - - return 0; -} - -static int ioapic_resume(struct sys_device *dev) -{ - struct IO_APIC_route_entry *entry; - struct sysfs_ioapic_data *data; - unsigned long flags; - union IO_APIC_reg_00 reg_00; - int i; - - data = container_of(dev, struct sysfs_ioapic_data, dev); - entry = data->entry; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(dev->id, 0); - if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) { - reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid; - io_apic_write(dev->id, 0, reg_00.raw); - } - spin_unlock_irqrestore(&ioapic_lock, flags); - for (i = 0; i < nr_ioapic_registers[dev->id]; i++) - ioapic_write_entry(dev->id, i, entry[i]); - - return 0; -} - -static struct sysdev_class ioapic_sysdev_class = { - .name = "ioapic", - .suspend = ioapic_suspend, - .resume = ioapic_resume, -}; - -static int __init ioapic_init_sysfs(void) -{ - struct sys_device * dev; - int i, size, error; - - error = sysdev_class_register(&ioapic_sysdev_class); - if (error) - return error; - - for (i = 0; i < nr_ioapics; i++ ) { - size = sizeof(struct sys_device) + nr_ioapic_registers[i] - * sizeof(struct IO_APIC_route_entry); - mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL); - if (!mp_ioapic_data[i]) { - printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); - continue; - } - dev = &mp_ioapic_data[i]->dev; - dev->id = i; - dev->cls = &ioapic_sysdev_class; - error = sysdev_register(dev); - if (error) { - kfree(mp_ioapic_data[i]); - mp_ioapic_data[i] = NULL; - printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); - continue; - } - } - - return 0; -} - -device_initcall(ioapic_init_sysfs); - -/* - * Dynamic irq allocate and deallocation - */ -int create_irq(void) -{ - /* Allocate an unused irq */ - int irq; - int new; - unsigned long flags; - - irq = -ENOSPC; - spin_lock_irqsave(&vector_lock, flags); - for (new = (NR_IRQS - 1); new >= 0; new--) { - if (platform_legacy_irq(new)) - continue; - if (irq_cfg[new].vector != 0) - continue; - if (__assign_irq_vector(new, TARGET_CPUS) == 0) - irq = new; - break; - } - spin_unlock_irqrestore(&vector_lock, flags); - - if (irq >= 0) { - dynamic_irq_init(irq); - } - return irq; -} - -void destroy_irq(unsigned int irq) -{ - unsigned long flags; - - dynamic_irq_cleanup(irq); - - spin_lock_irqsave(&vector_lock, flags); - __clear_irq_vector(irq); - spin_unlock_irqrestore(&vector_lock, flags); -} - -#endif /* CONFIG_XEN */ - -/* - * MSI message composition - */ -#if defined(CONFIG_PCI_MSI) && !defined(CONFIG_XEN) -static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) -{ - struct irq_cfg *cfg = irq_cfg + irq; - int err; - unsigned dest; - cpumask_t tmp; - - tmp = TARGET_CPUS; - err = assign_irq_vector(irq, tmp); - if (!err) { - cpus_and(tmp, cfg->domain, tmp); - dest = cpu_mask_to_apicid(tmp); - - msg->address_hi = MSI_ADDR_BASE_HI; - msg->address_lo = - MSI_ADDR_BASE_LO | - ((INT_DEST_MODE == 0) ? - MSI_ADDR_DEST_MODE_PHYSICAL: - MSI_ADDR_DEST_MODE_LOGICAL) | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? - MSI_ADDR_REDIRECTION_CPU: - MSI_ADDR_REDIRECTION_LOWPRI) | - MSI_ADDR_DEST_ID(dest); - - msg->data = - MSI_DATA_TRIGGER_EDGE | - MSI_DATA_LEVEL_ASSERT | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? - MSI_DATA_DELIVERY_FIXED: - MSI_DATA_DELIVERY_LOWPRI) | - MSI_DATA_VECTOR(cfg->vector); - } - return err; -} - -#ifdef CONFIG_SMP -static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) -{ - struct irq_cfg *cfg = irq_cfg + irq; - struct msi_msg msg; - unsigned int dest; - cpumask_t tmp; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - - if (assign_irq_vector(irq, mask)) - return; - - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - - read_msi_msg(irq, &msg); - - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(cfg->vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); - - write_msi_msg(irq, &msg); - irq_desc[irq].affinity = mask; -} -#endif /* CONFIG_SMP */ - -/* - * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, - * which implement the MSI or MSI-X Capability Structure. - */ -static struct irq_chip msi_chip = { - .name = "PCI-MSI", - .unmask = unmask_msi_irq, - .mask = mask_msi_irq, - .ack = ack_apic_edge, -#ifdef CONFIG_SMP - .set_affinity = set_msi_irq_affinity, -#endif - .retrigger = ioapic_retrigger_irq, -}; - -int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) -{ - struct msi_msg msg; - int irq, ret; - irq = create_irq(); - if (irq < 0) - return irq; - - ret = msi_compose_msg(dev, irq, &msg); - if (ret < 0) { - destroy_irq(irq); - return ret; - } - - set_irq_msi(irq, desc); - write_msi_msg(irq, &msg); - - set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); - - return 0; -} - -void arch_teardown_msi_irq(unsigned int irq) -{ - destroy_irq(irq); -} - -#ifdef CONFIG_DMAR -#ifdef CONFIG_SMP -static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) -{ - struct irq_cfg *cfg = irq_cfg + irq; - struct msi_msg msg; - unsigned int dest; - cpumask_t tmp; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - - if (assign_irq_vector(irq, mask)) - return; - - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - - dmar_msi_read(irq, &msg); - - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(cfg->vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); - - dmar_msi_write(irq, &msg); - irq_desc[irq].affinity = mask; -} -#endif /* CONFIG_SMP */ - -struct irq_chip dmar_msi_type = { - .name = "DMAR_MSI", - .unmask = dmar_msi_unmask, - .mask = dmar_msi_mask, - .ack = ack_apic_edge, -#ifdef CONFIG_SMP - .set_affinity = dmar_msi_set_affinity, -#endif - .retrigger = ioapic_retrigger_irq, -}; - -int arch_setup_dmar_msi(unsigned int irq) -{ - int ret; - struct msi_msg msg; - - ret = msi_compose_msg(NULL, irq, &msg); - if (ret < 0) - return ret; - dmar_msi_write(irq, &msg); - set_irq_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, - "edge"); - return 0; -} -#endif - -#endif /* CONFIG_PCI_MSI */ -/* - * Hypertransport interrupt support - */ -#ifdef CONFIG_HT_IRQ - -#ifdef CONFIG_SMP - -static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) -{ - struct ht_irq_msg msg; - fetch_ht_irq_msg(irq, &msg); - - msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK); - msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); - - msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest); - msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); - - write_ht_irq_msg(irq, &msg); -} - -static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) -{ - struct irq_cfg *cfg = irq_cfg + irq; - unsigned int dest; - cpumask_t tmp; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - - if (assign_irq_vector(irq, mask)) - return; - - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - - target_ht_irq(irq, dest, cfg->vector); - irq_desc[irq].affinity = mask; -} -#endif - -static struct irq_chip ht_irq_chip = { - .name = "PCI-HT", - .mask = mask_ht_irq, - .unmask = unmask_ht_irq, - .ack = ack_apic_edge, -#ifdef CONFIG_SMP - .set_affinity = set_ht_irq_affinity, -#endif - .retrigger = ioapic_retrigger_irq, -}; - -int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) -{ - struct irq_cfg *cfg = irq_cfg + irq; - int err; - cpumask_t tmp; - - tmp = TARGET_CPUS; - err = assign_irq_vector(irq, tmp); - if (!err) { - struct ht_irq_msg msg; - unsigned dest; - - cpus_and(tmp, cfg->domain, tmp); - dest = cpu_mask_to_apicid(tmp); - - msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); - - msg.address_lo = - HT_IRQ_LOW_BASE | - HT_IRQ_LOW_DEST_ID(dest) | - HT_IRQ_LOW_VECTOR(cfg->vector) | - ((INT_DEST_MODE == 0) ? - HT_IRQ_LOW_DM_PHYSICAL : - HT_IRQ_LOW_DM_LOGICAL) | - HT_IRQ_LOW_RQEOI_EDGE | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? - HT_IRQ_LOW_MT_FIXED : - HT_IRQ_LOW_MT_ARBITRATED) | - HT_IRQ_LOW_IRQ_MASKED; - - write_ht_irq_msg(irq, &msg); - - set_irq_chip_and_handler_name(irq, &ht_irq_chip, - handle_edge_irq, "edge"); - } - return err; -} -#endif /* CONFIG_HT_IRQ */ - -/* -------------------------------------------------------------------------- - ACPI-based IOAPIC Configuration - -------------------------------------------------------------------------- */ - -#ifdef CONFIG_ACPI - -#define IO_APIC_MAX_ID 0xFE - -int __init io_apic_get_redir_entries (int ioapic) -{ - union IO_APIC_reg_01 reg_01; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return reg_01.bits.entries; -} - - -int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) -{ - if (!IO_APIC_IRQ(irq)) { - apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", - ioapic); - return -EINVAL; - } - - /* - * IRQs < 16 are already in the irq_2_pin[] map - */ - if (irq >= 16) - add_pin_to_irq(irq, ioapic, pin); - - setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity); - - return 0; -} - - -int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) -{ - int i; - - if (skip_ioapic_setup) - return -1; - - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mp_irqtype == mp_INT && - mp_irqs[i].mp_srcbusirq == bus_irq) - break; - if (i >= mp_irq_entries) - return -1; - - *trigger = irq_trigger(i); - *polarity = irq_polarity(i); - return 0; -} - -#endif /* CONFIG_ACPI */ - -#ifndef CONFIG_XEN -/* - * This function currently is only a helper for the i386 smp boot process where - * we need to reprogram the ioredtbls to cater for the cpus which have come online - * so mask in all cases should simply be TARGET_CPUS - */ -#ifdef CONFIG_SMP -void __init setup_ioapic_dest(void) -{ - int pin, ioapic, irq, irq_entry; - - if (skip_ioapic_setup == 1) - return; - - for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { - for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { - irq_entry = find_irq_entry(ioapic, pin, mp_INT); - if (irq_entry == -1) - continue; - irq = pin_2_irq(irq_entry, ioapic, pin); - - /* setup_IO_APIC_irqs could fail to get vector for some device - * when you have too many devices, because at that time only boot - * cpu is online. - */ - if (!irq_cfg[irq].vector) - setup_IO_APIC_irq(ioapic, pin, irq, - irq_trigger(irq_entry), - irq_polarity(irq_entry)); - else - set_ioapic_affinity_irq(irq, TARGET_CPUS); - } - - } -} -#endif - -#define IOAPIC_RESOURCE_NAME_SIZE 11 - -static struct resource *ioapic_resources; - -static struct resource * __init ioapic_setup_resources(void) -{ - unsigned long n; - struct resource *res; - char *mem; - int i; - - if (nr_ioapics <= 0) - return NULL; - - n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); - n *= nr_ioapics; - - mem = alloc_bootmem(n); - res = (void *)mem; - - if (mem != NULL) { - mem += sizeof(struct resource) * nr_ioapics; - - for (i = 0; i < nr_ioapics; i++) { - res[i].name = mem; - res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; - sprintf(mem, "IOAPIC %u", i); - mem += IOAPIC_RESOURCE_NAME_SIZE; - } - } - - ioapic_resources = res; - - return res; -} - -void __init ioapic_init_mappings(void) -{ - unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; - struct resource *ioapic_res; - int i; - - ioapic_res = ioapic_setup_resources(); - for (i = 0; i < nr_ioapics; i++) { - if (smp_found_config) { - ioapic_phys = mp_ioapics[i].mp_apicaddr; - } else { - ioapic_phys = (unsigned long) - alloc_bootmem_pages(PAGE_SIZE); - ioapic_phys = __pa(ioapic_phys); - } - set_fixmap_nocache(idx, ioapic_phys); - apic_printk(APIC_VERBOSE, - "mapped IOAPIC to %016lx (%016lx)\n", - __fix_to_virt(idx), ioapic_phys); - idx++; - - if (ioapic_res != NULL) { - ioapic_res->start = ioapic_phys; - ioapic_res->end = ioapic_phys + (4 * 1024) - 1; - ioapic_res++; - } - } -} - -static int __init ioapic_insert_resources(void) -{ - int i; - struct resource *r = ioapic_resources; - - if (!r) { - printk(KERN_ERR - "IO APIC resources could be not be allocated.\n"); - return -1; - } - - for (i = 0; i < nr_ioapics; i++) { - insert_resource(&iomem_resource, r); - r++; - } - - return 0; -} - -/* Insert the IO APIC resources after PCI initialization has occured to handle - * IO APICS that are mapped in on a BAR in PCI space. */ -late_initcall(ioapic_insert_resources); -#endif /* !CONFIG_XEN */ --- head-2011-03-17.orig/arch/x86/kernel/ioport-xen.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/ioport-xen.c 2011-02-01 14:39:24.000000000 +0100 @@ -14,6 +14,7 @@ #include #include #include +#include #include /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ --- head-2011-03-17.orig/arch/x86/kernel/apic/ipi-xen.c 2011-02-21 13:56:33.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/apic/ipi-xen.c 2011-02-21 13:56:51.000000000 +0100 @@ -57,7 +57,4 @@ void send_IPI_mask_sequence(cpumask_t ma send_IPI_mask_bitmask(mask, vector); } -/* must come after the send_IPI functions above for inlining */ -#include - #endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ head-2011-03-17/arch/x86/kernel/irq-xen.c 2011-02-01 14:39:24.000000000 +0100 @@ -0,0 +1,200 @@ +/* + * Common interrupt code for 32 and 64 bit + */ +#include +#include +#include +#include + +#include +#include +#include + +atomic_t irq_err_count; + +/* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themselves. + */ +void ack_bad_irq(unsigned int irq) +{ + printk(KERN_ERR "unexpected IRQ trap at irq %02x\n", irq); + +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN) + /* + * Currently unexpected vectors happen only on SMP and APIC. + * We _must_ ack these because every local APIC has only N + * irq slots per priority level, and a 'hanging, unacked' IRQ + * holds up an irq slot - in excessive cases (when multiple + * unexpected vectors occur) that might lock up the APIC + * completely. + * But only ack when the APIC is enabled -AK + */ + if (cpu_has_apic) + ack_APIC_irq(); +#endif +} + +#ifdef CONFIG_X86_32 +# define irq_stats(x) (&per_cpu(irq_stat, x)) +#else +# define irq_stats(x) cpu_pda(x) +#endif +/* + * /proc/interrupts printing: + */ +static int show_other_interrupts(struct seq_file *p) +{ + int j; + + seq_printf(p, "NMI: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->__nmi_count); + seq_printf(p, " Non-maskable interrupts\n"); +#ifdef CONFIG_X86_LOCAL_APIC + seq_printf(p, "LOC: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); + seq_printf(p, " Local timer interrupts\n"); +#endif +#ifdef CONFIG_SMP + seq_printf(p, "RES: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count); + seq_printf(p, " Rescheduling interrupts\n"); + seq_printf(p, "CAL: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_call_count); + seq_printf(p, " Function call interrupts\n"); +#ifndef CONFIG_XEN + seq_printf(p, "TLB: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); + seq_printf(p, " TLB shootdowns\n"); +#else + seq_printf(p, "LCK: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_lock_count); + seq_printf(p, " Spinlock wakeups\n"); +#endif +#endif +#ifdef CONFIG_X86_MCE + seq_printf(p, "TRM: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); + seq_printf(p, " Thermal event interrupts\n"); +# ifdef CONFIG_X86_64 + seq_printf(p, "THR: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); + seq_printf(p, " Threshold APIC interrupts\n"); +# endif +#endif +#ifdef CONFIG_X86_LOCAL_APIC + seq_printf(p, "SPU: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); + seq_printf(p, " Spurious interrupts\n"); +#endif + seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); +#if defined(CONFIG_X86_IO_APIC) + seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); +#endif + return 0; +} + +int show_interrupts(struct seq_file *p, void *v) +{ + unsigned long flags, any_count = 0; + int i = *(loff_t *) v, j; + struct irqaction *action; + struct irq_desc *desc; + + if (i > nr_irqs) + return 0; + + if (i == nr_irqs) + return show_other_interrupts(p); + + /* print header */ + if (i == 0) { + seq_printf(p, " "); + for_each_online_cpu(j) + seq_printf(p, "CPU%-8d", j); + seq_putc(p, '\n'); + } + + desc = irq_to_desc(i); + spin_lock_irqsave(&desc->lock, flags); +#ifndef CONFIG_SMP + any_count = kstat_irqs(i); +#else + for_each_online_cpu(j) + any_count |= kstat_irqs_cpu(i, j); +#endif + action = desc->action; + if (!action && !any_count) + goto out; + + seq_printf(p, "%3d: ", i); +#ifndef CONFIG_SMP + seq_printf(p, "%10u ", kstat_irqs(i)); +#else + for_each_online_cpu(j) + seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); +#endif + seq_printf(p, " %8s", desc->chip->name); + seq_printf(p, "-%-8s", desc->name); + + if (action) { + seq_printf(p, " %s", action->name); + while ((action = action->next) != NULL) + seq_printf(p, ", %s", action->name); + } + + seq_putc(p, '\n'); +out: + spin_unlock_irqrestore(&desc->lock, flags); + return 0; +} + +/* + * /proc/stat helpers + */ +u64 arch_irq_stat_cpu(unsigned int cpu) +{ + u64 sum = irq_stats(cpu)->__nmi_count; + +#ifdef CONFIG_X86_LOCAL_APIC + sum += irq_stats(cpu)->apic_timer_irqs; +#endif +#ifdef CONFIG_SMP + sum += irq_stats(cpu)->irq_resched_count; + sum += irq_stats(cpu)->irq_call_count; +#ifndef CONFIG_XEN + sum += irq_stats(cpu)->irq_tlb_count; +#else + sum += irq_stats(cpu)->irq_lock_count; +#endif +#endif +#ifdef CONFIG_X86_MCE + sum += irq_stats(cpu)->irq_thermal_count; +# ifdef CONFIG_X86_64 + sum += irq_stats(cpu)->irq_threshold_count; +#endif +#endif +#ifdef CONFIG_X86_LOCAL_APIC + sum += irq_stats(cpu)->irq_spurious_count; +#endif + return sum; +} + +u64 arch_irq_stat(void) +{ + u64 sum = atomic_read(&irq_err_count); + +#ifdef CONFIG_X86_IO_APIC + sum += atomic_read(&irq_mis_count); +#endif + return sum; +} --- head-2011-03-17.orig/arch/x86/kernel/ldt-xen.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/ldt-xen.c 2011-02-01 14:39:24.000000000 +0100 @@ -18,6 +18,7 @@ #include #include #include +#include #ifdef CONFIG_SMP static void flush_ldt(void *current_mm) --- head-2011-03-17.orig/arch/x86/kernel/microcode-xen.c 2011-02-01 14:38:38.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,214 +0,0 @@ -/* - * Intel CPU Microcode Update Driver for Linux - * - * Copyright (C) 2000-2006 Tigran Aivazian - * 2006 Shaohua Li - * - * This driver allows to upgrade microcode on Intel processors - * belonging to IA-32 family - PentiumPro, Pentium II, - * Pentium III, Xeon, Pentium 4, etc. - * - * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture - * Software Developer's Manual - * Order Number 253668 or free download from: - * - * http://developer.intel.com/design/pentium4/manuals/253668.htm - * - * For more information, go to http://www.urbanmyth.org/microcode - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -//#define DEBUG /* pr_debug */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver"); -MODULE_AUTHOR("Tigran Aivazian "); -MODULE_LICENSE("GPL"); - -static int verbose; -module_param(verbose, int, 0644); - -#define MICROCODE_VERSION "1.14a-xen" - -#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */ -#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */ -#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */ - -/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ -static DEFINE_MUTEX(microcode_mutex); - -#ifdef CONFIG_MICROCODE_OLD_INTERFACE -static int do_microcode_update (const void __user *ubuf, size_t len) -{ - int err; - void *kbuf; - - kbuf = vmalloc(len); - if (!kbuf) - return -ENOMEM; - - if (copy_from_user(kbuf, ubuf, len) == 0) { - struct xen_platform_op op; - - op.cmd = XENPF_microcode_update; - set_xen_guest_handle(op.u.microcode.data, kbuf); - op.u.microcode.length = len; - err = HYPERVISOR_platform_op(&op); - } else - err = -EFAULT; - - vfree(kbuf); - - return err; -} - -static int microcode_open (struct inode *unused1, struct file *unused2) -{ - cycle_kernel_lock(); - return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; -} - -static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos) -{ - ssize_t ret; - - if (len < MC_HEADER_SIZE) { - printk(KERN_ERR "microcode: not enough data\n"); - return -EINVAL; - } - - mutex_lock(µcode_mutex); - - ret = do_microcode_update(buf, len); - if (!ret) - ret = (ssize_t)len; - - mutex_unlock(µcode_mutex); - - return ret; -} - -static const struct file_operations microcode_fops = { - .owner = THIS_MODULE, - .write = microcode_write, - .open = microcode_open, -}; - -static struct miscdevice microcode_dev = { - .minor = MICROCODE_MINOR, - .name = "microcode", - .fops = µcode_fops, -}; - -static int __init microcode_dev_init (void) -{ - int error; - - error = misc_register(µcode_dev); - if (error) { - printk(KERN_ERR - "microcode: can't misc_register on minor=%d\n", - MICROCODE_MINOR); - return error; - } - - return 0; -} - -static void microcode_dev_exit (void) -{ - misc_deregister(µcode_dev); -} - -MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); -#else -#define microcode_dev_init() 0 -#define microcode_dev_exit() do { } while(0) -#endif - -/* fake device for request_firmware */ -static struct platform_device *microcode_pdev; - -static int request_microcode(void) -{ - char name[30]; - const struct cpuinfo_x86 *c = &boot_cpu_data; - const struct firmware *firmware; - int error; - struct xen_platform_op op; - - sprintf(name,"intel-ucode/%02x-%02x-%02x", - c->x86, c->x86_model, c->x86_mask); - error = request_firmware(&firmware, name, µcode_pdev->dev); - if (error) { - pr_debug("microcode: data file %s load failed\n", name); - return error; - } - - op.cmd = XENPF_microcode_update; - set_xen_guest_handle(op.u.microcode.data, firmware->data); - op.u.microcode.length = firmware->size; - error = HYPERVISOR_platform_op(&op); - - release_firmware(firmware); - - if (error) - pr_debug("ucode load failed\n"); - - return error; -} - -static int __init microcode_init (void) -{ - int error; - - printk(KERN_INFO - "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " \n"); - - error = microcode_dev_init(); - if (error) - return error; - microcode_pdev = platform_device_register_simple("microcode", -1, - NULL, 0); - if (IS_ERR(microcode_pdev)) { - microcode_dev_exit(); - return PTR_ERR(microcode_pdev); - } - - request_microcode(); - - return 0; -} - -static void __exit microcode_exit (void) -{ - microcode_dev_exit(); - platform_device_unregister(microcode_pdev); -} - -module_init(microcode_init) -module_exit(microcode_exit) --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ head-2011-03-17/arch/x86/kernel/microcode_core-xen.c 2011-02-01 14:39:24.000000000 +0100 @@ -0,0 +1,225 @@ +/* + * Intel CPU Microcode Update Driver for Linux + * + * Copyright (C) 2000-2006 Tigran Aivazian + * 2006 Shaohua Li + * + * This driver allows to upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture + * Software Developer's Manual + * Order Number 253668 or free download from: + * + * http://developer.intel.com/design/pentium4/manuals/253668.htm + * + * For more information, go to http://www.urbanmyth.org/microcode + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +MODULE_DESCRIPTION("Microcode Update Driver"); +MODULE_AUTHOR("Tigran Aivazian "); +MODULE_LICENSE("GPL"); + +static int verbose; +module_param(verbose, int, 0644); + +#define MICROCODE_VERSION "2.00-xen" + +/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ +static DEFINE_MUTEX(microcode_mutex); + +#ifdef CONFIG_MICROCODE_OLD_INTERFACE +static int do_microcode_update(const void __user *ubuf, size_t len) +{ + int err; + void *kbuf; + + kbuf = vmalloc(len); + if (!kbuf) + return -ENOMEM; + + if (copy_from_user(kbuf, ubuf, len) == 0) { + struct xen_platform_op op; + + op.cmd = XENPF_microcode_update; + set_xen_guest_handle(op.u.microcode.data, kbuf); + op.u.microcode.length = len; + err = HYPERVISOR_platform_op(&op); + } else + err = -EFAULT; + + vfree(kbuf); + + return err; +} + +static int microcode_open(struct inode *unused1, struct file *unused2) +{ + cycle_kernel_lock(); + return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; +} + +static ssize_t microcode_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + ssize_t ret; + + if ((len >> PAGE_SHIFT) > num_physpages) { + printk(KERN_ERR "microcode: too much data (max %ld pages)\n", + num_physpages); + return -EINVAL; + } + + mutex_lock(µcode_mutex); + + ret = do_microcode_update(buf, len); + if (!ret) + ret = (ssize_t)len; + + mutex_unlock(µcode_mutex); + + return ret; +} + +static const struct file_operations microcode_fops = { + .owner = THIS_MODULE, + .write = microcode_write, + .open = microcode_open, +}; + +static struct miscdevice microcode_dev = { + .minor = MICROCODE_MINOR, + .name = "microcode", + .fops = µcode_fops, +}; + +static int __init microcode_dev_init(void) +{ + int error; + + error = misc_register(µcode_dev); + if (error) { + printk(KERN_ERR + "microcode: can't misc_register on minor=%d\n", + MICROCODE_MINOR); + return error; + } + + return 0; +} + +static void microcode_dev_exit(void) +{ + misc_deregister(µcode_dev); +} + +MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); +#else +#define microcode_dev_init() 0 +#define microcode_dev_exit() do { } while (0) +#endif + +/* fake device for request_firmware */ +static struct platform_device *microcode_pdev; + +static int request_microcode(const char *name) +{ + const struct firmware *firmware; + int error; + struct xen_platform_op op; + + error = request_firmware(&firmware, name, µcode_pdev->dev); + if (error) { + pr_debug("microcode: data file %s load failed\n", name); + return error; + } + + op.cmd = XENPF_microcode_update; + set_xen_guest_handle(op.u.microcode.data, firmware->data); + op.u.microcode.length = firmware->size; + error = HYPERVISOR_platform_op(&op); + + release_firmware(firmware); + + if (error) + pr_debug("ucode load failed\n"); + + return error; +} + +static int __init microcode_init(void) +{ + const struct cpuinfo_x86 *c = &boot_cpu_data; + char buf[32]; + const char *fw_name = buf; + int error; + + if (c->x86_vendor == X86_VENDOR_INTEL) + sprintf(buf, "intel-ucode/%02x-%02x-%02x", + c->x86, c->x86_model, c->x86_mask); + else if (c->x86_vendor == X86_VENDOR_AMD) + fw_name = "amd-ucode/microcode_amd.bin"; + else { + printk(KERN_ERR "microcode: no support for this CPU vendor\n"); + return -ENODEV; + } + + error = microcode_dev_init(); + if (error) + return error; + microcode_pdev = platform_device_register_simple("microcode", -1, + NULL, 0); + if (IS_ERR(microcode_pdev)) { + microcode_dev_exit(); + return PTR_ERR(microcode_pdev); + } + + request_microcode(fw_name); + + printk(KERN_INFO + "Microcode Update Driver: v" MICROCODE_VERSION + " ," + " Peter Oruba\n"); + + return 0; +} + +static void __exit microcode_exit(void) +{ + microcode_dev_exit(); + platform_device_unregister(microcode_pdev); + + printk(KERN_INFO + "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); +} + +module_init(microcode_init); +module_exit(microcode_exit); --- head-2011-03-17.orig/arch/x86/kernel/mpparse-xen.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/mpparse-xen.c 2011-02-01 14:39:24.000000000 +0100 @@ -410,7 +410,9 @@ static int __init smp_read_mpc(struct mp generic_bigsmp_probe(); #endif +#ifdef CONFIG_X86_32 setup_apic_routing(); +#endif if (!num_processors) printk(KERN_ERR "MPTABLE: no processors registered!\n"); return num_processors; @@ -622,6 +624,9 @@ void __init get_smp_config(void) printk(KERN_INFO "Using ACPI for processor (LAPIC) " "configuration information\n"); + if (!mpf) + return; + printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) && !defined(CONFIG_XEN) --- head-2011-03-17.orig/arch/x86/kernel/pci-dma-xen.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/pci-dma-xen.c 2011-02-01 14:39:24.000000000 +0100 @@ -41,11 +41,12 @@ EXPORT_SYMBOL(bad_dma_address); /* Dummy device used for NULL arguments (normally ISA). Better would be probably a smaller DMA mask, but this is bug-to-bug compatible to older i386. */ -struct device fallback_dev = { +struct device x86_dma_fallback_dev = { .bus_id = "fallback device", .coherent_dma_mask = DMA_32BIT_MASK, - .dma_mask = &fallback_dev.coherent_dma_mask, + .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask, }; +EXPORT_SYMBOL(x86_dma_fallback_dev); int dma_set_mask(struct device *dev, u64 mask) { @@ -82,7 +83,7 @@ void __init dma32_reserve_bootmem(void) * using 512M as goal */ align = 64ULL<<20; - size = round_up(dma32_bootmem_size, align); + size = roundup(dma32_bootmem_size, align); dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, 512ULL<<20); if (dma32_bootmem_ptr) @@ -109,6 +110,8 @@ static void __init dma32_free_bootmem(vo #endif static struct dma_mapping_ops swiotlb_dma_ops = { + .alloc_coherent = dma_generic_alloc_coherent, + .free_coherent = dma_generic_free_coherent, .mapping_error = swiotlb_dma_mapping_error, .map_single = swiotlb_map_single_phys, .unmap_single = swiotlb_unmap_single, @@ -147,13 +150,77 @@ void __init pci_iommu_alloc(void) } #ifndef CONFIG_XEN -unsigned long iommu_num_pages(unsigned long addr, unsigned long len) +unsigned long iommu_nr_pages(unsigned long addr, unsigned long len) { unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE); return size >> PAGE_SHIFT; } -EXPORT_SYMBOL(iommu_num_pages); +EXPORT_SYMBOL(iommu_nr_pages); +#endif + +void *dma_generic_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_addr, gfp_t flag) +{ + unsigned long dma_mask; + struct page *page; +#ifndef CONFIG_XEN + dma_addr_t addr; +#else + void *memory; +#endif + unsigned int order = get_order(size); + + dma_mask = dma_alloc_coherent_mask(dev, flag); + +#ifndef CONFIG_XEN + flag |= __GFP_ZERO; +again: +#else + flag &= ~(__GFP_DMA | __GFP_DMA32); +#endif + page = alloc_pages_node(dev_to_node(dev), flag, order); + if (!page) + return NULL; + +#ifndef CONFIG_XEN + addr = page_to_phys(page); + if (!is_buffer_dma_capable(dma_mask, addr, size)) { + __free_pages(page, order); + + if (dma_mask < DMA_32BIT_MASK && !(flag & GFP_DMA)) { + flag = (flag & ~GFP_DMA32) | GFP_DMA; + goto again; + } + + return NULL; + } + + *dma_addr = addr; + return page_address(page); +#else + memory = page_address(page); + if (xen_create_contiguous_region((unsigned long)memory, order, + fls64(dma_mask))) { + __free_pages(page, order); + return NULL; + } + + *dma_addr = virt_to_bus(memory); + return memset(memory, 0, size); +#endif +} + +#ifdef CONFIG_XEN +void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_addr) +{ + unsigned int order = get_order(size); + unsigned long va = (unsigned long)vaddr; + + xen_destroy_contiguous_region(va, order); + free_pages(va, order); +} #endif /* @@ -291,164 +358,6 @@ int dma_supported(struct device *dev, u6 } EXPORT_SYMBOL(dma_supported); -/* Allocate DMA memory on node near device */ -static struct page * -dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order) -{ - int node; - - node = dev_to_node(dev); - - return alloc_pages_node(node, gfp, order); -} - -/* - * Allocate memory for a coherent mapping. - */ -void * -dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp) -{ -#ifndef CONFIG_XEN - struct dma_mapping_ops *ops = get_dma_ops(dev); -#endif - void *memory = NULL; - struct page *page; - unsigned long dma_mask = 0; - int noretry = 0; - unsigned int order = get_order(size); - - /* ignore region specifiers */ - gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); - - if (dma_alloc_from_coherent(dev, size, dma_handle, &memory)) - return memory; - - if (!dev) { - dev = &fallback_dev; - gfp |= GFP_DMA; - } - dma_mask = dev->coherent_dma_mask; - if (dma_mask == 0) - dma_mask = (gfp & GFP_DMA) ? DMA_24BIT_MASK : DMA_32BIT_MASK; - - /* Device not DMA able */ - if (dev->dma_mask == NULL) - return NULL; - -#ifdef CONFIG_XEN - gfp &= ~(__GFP_DMA | __GFP_DMA32); -#else - /* Don't invoke OOM killer or retry in lower 16MB DMA zone */ - if (gfp & __GFP_DMA) - noretry = 1; - -#ifdef CONFIG_X86_64 - /* Why <=? Even when the mask is smaller than 4GB it is often - larger than 16MB and in this case we have a chance of - finding fitting memory in the next higher zone first. If - not retry with true GFP_DMA. -AK */ - if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) { - gfp |= GFP_DMA32; -#endif - - again: -#endif - page = dma_alloc_pages(dev, - noretry ? gfp | __GFP_NORETRY : gfp, order); - if (page == NULL) - return NULL; - -#ifndef CONFIG_XEN - { - int high, mmu; - dma_addr_t bus = page_to_phys(page); - memory = page_address(page); - high = (bus + size) >= dma_mask; - mmu = high; - if (force_iommu && !(gfp & GFP_DMA)) - mmu = 1; - else if (high) { - free_pages((unsigned long)memory, order); - - /* Don't use the 16MB ZONE_DMA unless absolutely - needed. It's better to use remapping first. */ - if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) { - gfp = (gfp & ~GFP_DMA32) | GFP_DMA; - goto again; - } - - /* Let low level make its own zone decisions */ - gfp &= ~(GFP_DMA32|GFP_DMA); - - if (ops->alloc_coherent) - return ops->alloc_coherent(dev, size, - dma_handle, gfp); - return NULL; - } - - memset(memory, 0, size); - if (!mmu) { - *dma_handle = bus; - return memory; - } - } - - if (ops->alloc_coherent) { - free_pages((unsigned long)memory, order); - gfp &= ~(GFP_DMA|GFP_DMA32); - return ops->alloc_coherent(dev, size, dma_handle, gfp); - } - - if (ops->map_simple) { - *dma_handle = ops->map_simple(dev, virt_to_bus(memory), - size, - PCI_DMA_BIDIRECTIONAL); - if (*dma_handle != bad_dma_address) - return memory; - } -#else - memory = page_address(page); - if (xen_create_contiguous_region((unsigned long)memory, order, - fls64(dma_mask)) == 0) { - memset(memory, 0, size); - *dma_handle = virt_to_bus(memory); - return memory; - } -#endif - - if (panic_on_overflow) - panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n", - (unsigned long)size); - free_pages((unsigned long)memory, order); - return NULL; -} -EXPORT_SYMBOL(dma_alloc_coherent); - -/* - * Unmap coherent memory. - * The caller must ensure that the device has finished accessing the mapping. - */ -void dma_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t bus) -{ -#ifndef CONFIG_XEN - struct dma_mapping_ops *ops = get_dma_ops(dev); -#endif - - int order = get_order(size); - WARN_ON(irqs_disabled()); /* for portability */ - if (dma_release_from_coherent(dev, order, vaddr)) - return; -#ifndef CONFIG_XEN - if (ops->unmap_single) - ops->unmap_single(dev, bus, size, 0); -#endif - xen_destroy_contiguous_region((unsigned long)vaddr, order); - free_pages((unsigned long)vaddr, order); -} -EXPORT_SYMBOL(dma_free_coherent); - static int __init pci_iommu_init(void) { calgary_iommu_init(); --- head-2011-03-17.orig/arch/x86/kernel/pci-nommu-xen.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/pci-nommu-xen.c 2011-02-01 14:39:24.000000000 +0100 @@ -5,6 +5,7 @@ #include +#include #include #include #include @@ -36,7 +37,7 @@ gnttab_map_sg(struct device *hwdev, stru gnttab_dma_map_page(sg_page(sg)) + sg->offset; sg->dma_length = sg->length; IOMMU_BUG_ON(address_needs_mapping( - hwdev, sg->dma_address)); + hwdev, sg->dma_address, sg->length)); IOMMU_BUG_ON(range_straddles_page_boundary( page_to_pseudophys(sg_page(sg)) + sg->offset, sg->length)); @@ -67,7 +68,7 @@ gnttab_map_single(struct device *dev, ph dma = gnttab_dma_map_page(pfn_to_page(paddr >> PAGE_SHIFT)) + offset_in_page(paddr); IOMMU_BUG_ON(range_straddles_page_boundary(paddr, size)); - IOMMU_BUG_ON(address_needs_mapping(dev, dma)); + IOMMU_BUG_ON(address_needs_mapping(dev, dma, size)); return dma; } @@ -84,7 +85,9 @@ static int nommu_dma_supported(struct de return 1; } -static struct dma_mapping_ops nommu_dma_ops = { +struct dma_mapping_ops nommu_dma_ops = { + .alloc_coherent = dma_generic_alloc_coherent, + .free_coherent = dma_generic_free_coherent, .map_single = gnttab_map_single, .unmap_single = gnttab_unmap_single, .map_sg = gnttab_map_sg, --- head-2011-03-17.orig/arch/x86/kernel/process-xen.c 2011-03-03 15:59:49.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/process-xen.c 2011-03-03 16:00:33.000000000 +0100 @@ -151,7 +151,8 @@ static void mwait_idle(void) static void poll_idle(void) { local_irq_enable(); - cpu_relax(); + while (!need_resched()) + cpu_relax(); } #ifndef CONFIG_XEN --- head-2011-03-17.orig/arch/x86/kernel/process_32-xen.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/process_32-xen.c 2011-02-02 08:34:28.000000000 +0100 @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -51,8 +52,6 @@ #endif #include -#include -#include #include @@ -60,6 +59,8 @@ #include #include #include +#include +#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork"); @@ -78,42 +79,12 @@ unsigned long thread_saved_pc(struct tas return ((unsigned long *)tsk->thread.sp)[3]; } -#ifdef CONFIG_HOTPLUG_CPU -#ifndef CONFIG_XEN -#include - -static void cpu_exit_clear(void) -{ - int cpu = raw_smp_processor_id(); - - idle_task_exit(); - - cpu_uninit(); - irq_ctx_exit(cpu); - - cpu_clear(cpu, cpu_callout_map); - cpu_clear(cpu, cpu_callin_map); - - numa_remove_cpu(cpu); - c1e_remove_cpu(cpu); -} -#endif - -static inline void play_dead(void) -{ - idle_task_exit(); - local_irq_disable(); - cpu_clear(smp_processor_id(), cpu_initialized); - preempt_enable_no_resched(); - VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL)); - cpu_bringup(); -} -#else +#ifndef CONFIG_SMP static inline void play_dead(void) { BUG(); } -#endif /* CONFIG_HOTPLUG_CPU */ +#endif /* * The idle thread. There's no useful work to be @@ -155,12 +126,13 @@ void cpu_idle(void) } } -void __show_registers(struct pt_regs *regs, int all) +void __show_regs(struct pt_regs *regs, int all) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; unsigned long d0, d1, d2, d3, d6, d7; unsigned long sp; unsigned short ss, gs; + const char *board; if (user_mode_vm(regs)) { sp = regs->sp; @@ -173,11 +145,15 @@ void __show_registers(struct pt_regs *re } printk("\n"); - printk("Pid: %d, comm: %s %s (%s %.*s)\n", + + board = dmi_get_system_info(DMI_PRODUCT_NAME); + if (!board) + board = ""; + printk("Pid: %d, comm: %s %s (%s %.*s) %s\n", task_pid_nr(current), current->comm, print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); + init_utsname()->version, board); printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", (u16)regs->cs, regs->ip, regs->flags, @@ -216,7 +192,7 @@ void __show_registers(struct pt_regs *re void show_regs(struct pt_regs *regs) { - __show_registers(regs, 1); + __show_regs(regs, 1); show_trace(NULL, regs, ®s->sp, regs->bp); } @@ -269,6 +245,14 @@ void exit_thread(void) t->io_bitmap_ptr = NULL; clear_thread_flag(TIF_IO_BITMAP); } +#ifdef CONFIG_X86_DS + /* Free any DS contexts that have not been properly released. */ + if (unlikely(current->thread.ds_ctx)) { + /* we clear debugctl to make sure DS is not used. */ + update_debugctlmsr(0); + ds_free(current->thread.ds_ctx); + } +#endif /* CONFIG_X86_DS */ } void flush_thread(void) @@ -434,6 +418,35 @@ int set_tsc_mode(unsigned int val) return 0; } +#ifdef CONFIG_X86_DS +static int update_debugctl(struct thread_struct *prev, + struct thread_struct *next, unsigned long debugctl) +{ + unsigned long ds_prev = 0; + unsigned long ds_next = 0; + + if (prev->ds_ctx) + ds_prev = (unsigned long)prev->ds_ctx->ds; + if (next->ds_ctx) + ds_next = (unsigned long)next->ds_ctx->ds; + + if (ds_next != ds_prev) { + /* we clear debugctl to make sure DS + * is not in use when we change it */ + debugctl = 0; + update_debugctlmsr(0); + wrmsr(MSR_IA32_DS_AREA, ds_next, 0); + } + return debugctl; +} +#else +static int update_debugctl(struct thread_struct *prev, + struct thread_struct *next, unsigned long debugctl) +{ + return debugctl; +} +#endif /* CONFIG_X86_DS */ + static noinline void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) { @@ -443,14 +456,7 @@ __switch_to_xtra(struct task_struct *pre prev = &prev_p->thread; next = &next_p->thread; - debugctl = prev->debugctlmsr; - if (next->ds_area_msr != prev->ds_area_msr) { - /* we clear debugctl to make sure DS - * is not in use when we change it */ - debugctl = 0; - update_debugctlmsr(0); - wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0); - } + debugctl = update_debugctl(prev, next, prev->debugctlmsr); if (next->debugctlmsr != debugctl) update_debugctlmsr(next->debugctlmsr); @@ -474,13 +480,13 @@ __switch_to_xtra(struct task_struct *pre hard_enable_TSC(); } -#ifdef X86_BTS +#ifdef CONFIG_X86_PTRACE_BTS if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); -#endif +#endif /* CONFIG_X86_PTRACE_BTS */ } /* --- head-2011-03-17.orig/arch/x86/kernel/process_64-xen.c 2011-02-02 08:34:01.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/process_64-xen.c 2011-02-02 08:34:22.000000000 +0100 @@ -40,25 +40,23 @@ #include #include #include +#include +#include -#include #include #include -#include #include #include #include #include #include #include -#include #include #include #include #include #include - -#include +#include asmlinkage extern void ret_from_fork(void); @@ -70,6 +68,13 @@ void idle_notifier_register(struct notif { atomic_notifier_chain_register(&idle_notifier, n); } +EXPORT_SYMBOL_GPL(idle_notifier_register); + +void idle_notifier_unregister(struct notifier_block *n) +{ + atomic_notifier_chain_unregister(&idle_notifier, n); +} +EXPORT_SYMBOL_GPL(idle_notifier_unregister); void enter_idle(void) { @@ -93,25 +98,12 @@ void exit_idle(void) __exit_idle(); } -#ifdef CONFIG_HOTPLUG_CPU -static inline void play_dead(void) -{ - idle_task_exit(); -#ifndef CONFIG_XEN - c1e_remove_cpu(raw_smp_processor_id()); -#endif - local_irq_disable(); - cpu_clear(smp_processor_id(), cpu_initialized); - preempt_enable_no_resched(); - VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL)); - cpu_bringup(); -} -#else +#ifndef CONFIG_SMP static inline void play_dead(void) { BUG(); } -#endif /* CONFIG_HOTPLUG_CPU */ +#endif /* * The idle thread. There's no useful work to be @@ -156,63 +148,74 @@ void cpu_idle(void) } /* Prints also some state that isn't saved in the pt_regs */ -void __show_regs(struct pt_regs * regs) +void __show_regs(struct pt_regs *regs, int all) { - unsigned long fs, gs, shadowgs; + unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; unsigned long d0, d1, d2, d3, d6, d7; unsigned int fsindex, gsindex; unsigned int ds, cs, es; printk("\n"); print_modules(); - printk("Pid: %d, comm: %.20s %s %s %.*s\n", + printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s\n", current->pid, current->comm, print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); + printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); printk_address(regs->ip, 1); - printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp, - regs->flags); - printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", + printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, + regs->sp, regs->flags); + printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n", regs->ax, regs->bx, regs->cx); - printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", + printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n", regs->dx, regs->si, regs->di); - printk("RBP: %016lx R08: %016lx R09: %016lx\n", + printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n", regs->bp, regs->r8, regs->r9); - printk("R10: %016lx R11: %016lx R12: %016lx\n", - regs->r10, regs->r11, regs->r12); - printk("R13: %016lx R14: %016lx R15: %016lx\n", - regs->r13, regs->r14, regs->r15); - - asm("mov %%ds,%0" : "=r" (ds)); - asm("mov %%cs,%0" : "=r" (cs)); - asm("mov %%es,%0" : "=r" (es)); + printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n", + regs->r10, regs->r11, regs->r12); + printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n", + regs->r13, regs->r14, regs->r15); + + asm("movl %%ds,%0" : "=r" (ds)); + asm("movl %%cs,%0" : "=r" (cs)); + asm("movl %%es,%0" : "=r" (es)); asm("mov %%fs,%0" : "=r" (fsindex)); asm("mov %%gs,%0" : "=r" (gsindex)); rdmsrl(MSR_FS_BASE, fs); - rdmsrl(MSR_GS_BASE, gs); - rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); + rdmsrl(MSR_GS_BASE, gs); + rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); - printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", - fs,fsindex,gs,gsindex,shadowgs); - printk("CS: %04x DS: %04x ES: %04x\n", cs, ds, es); + if (!all) + return; + + cr0 = read_cr0(); + cr2 = read_cr2(); + cr3 = read_cr3(); + cr4 = read_cr4(); + + printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", + fs, fsindex, gs, gsindex, shadowgs); + printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, + es, cr0); + printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, + cr4); get_debugreg(d0, 0); get_debugreg(d1, 1); get_debugreg(d2, 2); - printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); + printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); get_debugreg(d3, 3); get_debugreg(d6, 6); get_debugreg(d7, 7); - printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); + printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); } void show_regs(struct pt_regs *regs) { - printk("CPU %d:", smp_processor_id()); - __show_regs(regs); + printk(KERN_INFO "CPU %d:", smp_processor_id()); + __show_regs(regs, 1); show_trace(NULL, regs, (void *)(regs + 1), regs->bp); } @@ -249,6 +252,14 @@ void exit_thread(void) #endif t->io_bitmap_max = 0; } +#ifdef CONFIG_X86_DS + /* Free any DS contexts that have not been properly released. */ + if (unlikely(t->ds_ctx)) { + /* we clear debugctl to make sure DS is not used. */ + update_debugctlmsr(0); + ds_free(t->ds_ctx); + } +#endif /* CONFIG_X86_DS */ } void xen_load_gs_index(unsigned gs) @@ -329,10 +340,10 @@ void prepare_to_copy(struct task_struct int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, unsigned long unused, - struct task_struct * p, struct pt_regs * regs) + struct task_struct *p, struct pt_regs *regs) { int err; - struct pt_regs * childregs; + struct pt_regs *childregs; struct task_struct *me = current; childregs = ((struct pt_regs *) @@ -376,10 +387,10 @@ int copy_thread(int nr, unsigned long cl if (test_thread_flag(TIF_IA32)) err = do_set_thread_area(p, -1, (struct user_desc __user *)childregs->si, 0); - else -#endif - err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); - if (err) + else +#endif + err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); + if (err) goto out; } p->thread.iopl = current->thread.iopl; @@ -486,13 +497,27 @@ static inline void __switch_to_xtra(stru next = &next_p->thread; debugctl = prev->debugctlmsr; - if (next->ds_area_msr != prev->ds_area_msr) { - /* we clear debugctl to make sure DS - * is not in use when we change it */ - debugctl = 0; - update_debugctlmsr(0); - wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr); + +#ifdef CONFIG_X86_DS + { + unsigned long ds_prev = 0, ds_next = 0; + + if (prev->ds_ctx) + ds_prev = (unsigned long)prev->ds_ctx->ds; + if (next->ds_ctx) + ds_next = (unsigned long)next->ds_ctx->ds; + + if (ds_next != ds_prev) { + /* + * We clear debugctl to make sure DS + * is not in use when we change it: + */ + debugctl = 0; + update_debugctlmsr(0); + wrmsrl(MSR_IA32_DS_AREA, ds_next); + } } +#endif /* CONFIG_X86_DS */ if (next->debugctlmsr != debugctl) update_debugctlmsr(next->debugctlmsr); @@ -516,13 +541,13 @@ static inline void __switch_to_xtra(stru hard_enable_TSC(); } -#ifdef X86_BTS +#ifdef CONFIG_X86_PTRACE_BTS if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); -#endif +#endif /* CONFIG_X86_PTRACE_BTS */ } /* @@ -554,7 +579,7 @@ __switch_to(struct task_struct *prev_p, multicall_entry_t _mcl[8], *mcl = _mcl; /* we're going to use this soon, after a few expensive things */ - if (next_p->fpu_counter>5) + if (next_p->fpu_counter > 5) prefetch(next->xstate); /* @@ -635,12 +660,12 @@ __switch_to(struct task_struct *prev_p, if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL))) BUG(); - /* + /* * Switch DS and ES. * This won't pick up thread selector changes, but I guess that is ok. */ if (unlikely(next->es)) - loadsegment(es, next->es); + loadsegment(es, next->es); if (unlikely(next->ds)) loadsegment(ds, next->ds); @@ -654,7 +679,7 @@ __switch_to(struct task_struct *prev_p, */ arch_leave_lazy_cpu_mode(); - /* + /* * Switch FS and GS. * * Segment register != 0 always requires a reload. Also @@ -673,10 +698,10 @@ __switch_to(struct task_struct *prev_p, if (next->gs) WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER, next->gs)); - /* + /* * Switch the PDA context. */ - write_pda(pcurrent, next_p); + write_pda(pcurrent, next_p); write_pda(kernelstack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); @@ -717,7 +742,7 @@ long sys_execve(char __user *name, char char __user * __user *envp, struct pt_regs *regs) { long error; - char * filename; + char *filename; filename = getname(name); error = PTR_ERR(filename); @@ -775,56 +800,56 @@ asmlinkage long sys_vfork(struct pt_regs unsigned long get_wchan(struct task_struct *p) { unsigned long stack; - u64 fp,ip; + u64 fp, ip; int count = 0; - if (!p || p == current || p->state==TASK_RUNNING) - return 0; + if (!p || p == current || p->state == TASK_RUNNING) + return 0; stack = (unsigned long)task_stack_page(p); - if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE) + if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE) return 0; fp = *(u64 *)(p->thread.sp); - do { + do { if (fp < (unsigned long)stack || - fp > (unsigned long)stack+THREAD_SIZE) - return 0; + fp >= (unsigned long)stack+THREAD_SIZE) + return 0; ip = *(u64 *)(fp+8); if (!in_sched_functions(ip)) return ip; - fp = *(u64 *)fp; - } while (count++ < 16); + fp = *(u64 *)fp; + } while (count++ < 16); return 0; } long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) -{ - int ret = 0; +{ + int ret = 0; int doit = task == current; int cpu; - switch (code) { + switch (code) { case ARCH_SET_GS: if (addr >= TASK_SIZE_OF(task)) - return -EPERM; + return -EPERM; cpu = get_cpu(); - /* handle small bases via the GDT because that's faster to + /* handle small bases via the GDT because that's faster to switch. */ - if (addr <= 0xffffffff) { - set_32bit_tls(task, GS_TLS, addr); - if (doit) { + if (addr <= 0xffffffff) { + set_32bit_tls(task, GS_TLS, addr); + if (doit) { load_TLS(&task->thread, cpu); - load_gs_index(GS_TLS_SEL); + load_gs_index(GS_TLS_SEL); } - task->thread.gsindex = GS_TLS_SEL; + task->thread.gsindex = GS_TLS_SEL; task->thread.gs = 0; - } else { + } else { task->thread.gsindex = 0; task->thread.gs = addr; if (doit) { load_gs_index(0); ret = HYPERVISOR_set_segment_base( SEGBASE_GS_USER, addr); - } + } } put_cpu(); break; @@ -879,8 +904,7 @@ long do_arch_prctl(struct task_struct *t rdmsrl(MSR_KERNEL_GS_BASE, base); else base = task->thread.gs; - } - else + } else base = task->thread.gs; ret = put_user(base, (unsigned long __user *)addr); break; --- head-2011-03-17.orig/arch/x86/kernel/setup-xen.c 2011-03-04 15:09:03.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/setup-xen.c 2011-03-03 16:22:12.000000000 +0100 @@ -261,6 +261,9 @@ unsigned long saved_video_mode; #define RAMDISK_LOAD_FLAG 0x4000 static char __initdata command_line[COMMAND_LINE_SIZE]; +#ifdef CONFIG_CMDLINE_BOOL +static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE; +#endif #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) struct edd edd; @@ -339,7 +342,7 @@ static void __init relocate_initrd(void) if (clen > MAX_MAP_CHUNK-slop) clen = MAX_MAP_CHUNK-slop; mapaddr = ramdisk_image & PAGE_MASK; - p = early_ioremap(mapaddr, clen+slop); + p = early_memremap(mapaddr, clen+slop); memcpy(q, p+slop, clen); early_iounmap(p, clen+slop); q += clen; @@ -430,7 +433,7 @@ static void __init parse_setup_data(void return; pa_data = boot_params.hdr.setup_data; while (pa_data) { - data = early_ioremap(pa_data, PAGE_SIZE); + data = early_memremap(pa_data, PAGE_SIZE); switch (data->type) { case SETUP_E820_EXT: parse_e820_ext(data, pa_data); @@ -455,7 +458,7 @@ static void __init e820_reserve_setup_da return; pa_data = boot_params.hdr.setup_data; while (pa_data) { - data = early_ioremap(pa_data, sizeof(*data)); + data = early_memremap(pa_data, sizeof(*data)); e820_update_range(pa_data, sizeof(*data)+data->len, E820_RAM, E820_RESERVED_KERN); found = 1; @@ -483,7 +486,7 @@ static void __init reserve_early_setup_d return; pa_data = boot_params.hdr.setup_data; while (pa_data) { - data = early_ioremap(pa_data, sizeof(*data)); + data = early_memremap(pa_data, sizeof(*data)); sprintf(buf, "setup data %x", data->type); reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); pa_data = data->next; @@ -625,7 +628,13 @@ static void __init reserve_standard_io_r } -#ifdef CONFIG_PROC_VMCORE +/* + * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by + * is_kdump_kernel() to determine if we are booting after a panic. Hence + * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. + */ + +#ifdef CONFIG_CRASH_DUMP /* elfcorehdr= specifies the location of elf core header * stored by the crashed kernel. This option will be passed * by kexec loader to the capture kernel. @@ -646,6 +655,190 @@ static struct x86_quirks default_x86_qui struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; /* + * Some BIOSes seem to corrupt the low 64k of memory during events + * like suspend/resume and unplugging an HDMI cable. Reserve all + * remaining free memory in that area and fill it with a distinct + * pattern. + */ +#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION +#define MAX_SCAN_AREAS 8 + +static int __read_mostly memory_corruption_check = -1; + +static unsigned __read_mostly corruption_check_size = 64*1024; +static unsigned __read_mostly corruption_check_period = 60; /* seconds */ + +static struct e820entry scan_areas[MAX_SCAN_AREAS]; +static int num_scan_areas; + + +static int set_corruption_check(char *arg) +{ + char *end; + + memory_corruption_check = simple_strtol(arg, &end, 10); + + return (*end == 0) ? 0 : -EINVAL; +} +early_param("memory_corruption_check", set_corruption_check); + +static int set_corruption_check_period(char *arg) +{ + char *end; + + corruption_check_period = simple_strtoul(arg, &end, 10); + + return (*end == 0) ? 0 : -EINVAL; +} +early_param("memory_corruption_check_period", set_corruption_check_period); + +static int set_corruption_check_size(char *arg) +{ + char *end; + unsigned size; + + size = memparse(arg, &end); + + if (*end == '\0') + corruption_check_size = size; + + return (size == corruption_check_size) ? 0 : -EINVAL; +} +early_param("memory_corruption_check_size", set_corruption_check_size); + + +static void __init setup_bios_corruption_check(void) +{ + u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */ + + if (memory_corruption_check == -1) { + memory_corruption_check = +#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK + 1 +#else + 0 +#endif + ; + } + + if (corruption_check_size == 0) + memory_corruption_check = 0; + + if (!memory_corruption_check) + return; + + corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); + + while(addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { + u64 size; + addr = find_e820_area_size(addr, &size, PAGE_SIZE); + + if (addr == 0) + break; + + if ((addr + size) > corruption_check_size) + size = corruption_check_size - addr; + + if (size == 0) + break; + + e820_update_range(addr, size, E820_RAM, E820_RESERVED); + scan_areas[num_scan_areas].addr = addr; + scan_areas[num_scan_areas].size = size; + num_scan_areas++; + + /* Assume we've already mapped this early memory */ + memset(__va(addr), 0, size); + + addr += size; + } + + printk(KERN_INFO "Scanning %d areas for low memory corruption\n", + num_scan_areas); + update_e820(); +} + +static struct timer_list periodic_check_timer; + +void check_for_bios_corruption(void) +{ + int i; + int corruption = 0; + + if (!memory_corruption_check) + return; + + for(i = 0; i < num_scan_areas; i++) { + unsigned long *addr = __va(scan_areas[i].addr); + unsigned long size = scan_areas[i].size; + + for(; size; addr++, size -= sizeof(unsigned long)) { + if (!*addr) + continue; + printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n", + addr, __pa(addr), *addr); + corruption = 1; + *addr = 0; + } + } + + WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n"); +} + +static void periodic_check_for_corruption(unsigned long data) +{ + check_for_bios_corruption(); + mod_timer(&periodic_check_timer, round_jiffies(jiffies + corruption_check_period*HZ)); +} + +void start_periodic_check_for_corruption(void) +{ + if (!memory_corruption_check || corruption_check_period == 0) + return; + + printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n", + corruption_check_period); + + init_timer(&periodic_check_timer); + periodic_check_timer.function = &periodic_check_for_corruption; + periodic_check_for_corruption(0); +} +#endif + +static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) +{ + printk(KERN_NOTICE + "%s detected: BIOS may corrupt low RAM, working it around.\n", + d->ident); + + e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + + return 0; +} + +/* List of systems that have known low memory corruption BIOS problems */ +static struct dmi_system_id __initdata bad_bios_dmi_table[] = { +#ifdef CONFIG_X86_RESERVE_LOW_64K + { + .callback = dmi_low_memory_corruption, + .ident = "AMI BIOS", + .matches = { + DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), + }, + }, + { + .callback = dmi_low_memory_corruption, + .ident = "Phoenix BIOS", + .matches = { + DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"), + }, + }, +#endif + {} +}; + +/* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures * for initialization. Note, the efi init code path is determined by the @@ -693,6 +886,9 @@ void __init setup_arch(char **cmdline_p) printk(KERN_INFO "Command line: %s\n", boot_command_line); #endif + /* VMI may relocate the fixmap; do this before touching ioremap area */ + vmi_init(); + early_cpu_init(); early_ioremap_init(); @@ -787,6 +983,19 @@ void __init setup_arch(char **cmdline_p) bss_resource.start = virt_to_phys(&__bss_start); bss_resource.end = virt_to_phys(&__bss_stop)-1; +#ifdef CONFIG_CMDLINE_BOOL +#ifdef CONFIG_CMDLINE_OVERRIDE + strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); +#else + if (builtin_cmdline[0]) { + /* append boot loader cmdline to builtin */ + strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE); + strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE); + strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); + } +#endif +#endif + strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; @@ -796,13 +1005,8 @@ void __init setup_arch(char **cmdline_p) check_efer(); #endif -#if defined(CONFIG_VMI) && defined(CONFIG_X86_32) - /* - * Must be before kernel pagetables are setup - * or fixmap area is touched. - */ - vmi_init(); -#endif + /* Must be before kernel pagetables are setup */ + vmi_activate(); /* after early param, so could get panic from serial */ reserve_early_setup_data(); @@ -821,10 +1025,15 @@ void __init setup_arch(char **cmdline_p) finish_e820_parsing(); + if (is_initial_xendomain()) { + dmi_scan_machine(); + + dmi_check_system(bad_bios_dmi_table); + #ifdef CONFIG_X86_32 - if (is_initial_xendomain()) probe_roms(); #endif + } #ifndef CONFIG_XEN /* after parse_early_param, so could debug it */ @@ -870,6 +1079,10 @@ void __init setup_arch(char **cmdline_p) num_physpages = max_pfn; max_mapnr = max_pfn; +#ifndef CONFIG_XEN + if (cpu_has_x2apic) + check_x2apic(); +#endif /* How many end-of-memory variables you have, grandma! */ /* need this before calling reserve_initrd */ @@ -881,6 +1094,10 @@ void __init setup_arch(char **cmdline_p) high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; #endif +#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION + setup_bios_corruption_check(); +#endif + /* max_pfn_mapped is updated here */ max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn< #include +#include #include #include @@ -415,14 +416,9 @@ unsigned long profile_pc(struct pt_regs unsigned long pc = instruction_pointer(regs); #if defined(CONFIG_SMP) || defined(__x86_64__) -# ifdef __i386__ - if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->cs) -# else - if (!user_mode(regs) -# endif - && in_lock_functions(pc)) { + if (!user_mode_vm(regs) && in_lock_functions(pc)) { # ifdef CONFIG_FRAME_POINTER - return ((unsigned long *)regs->bp)[1]; + return *(unsigned long *)(regs->bp + sizeof(long)); # else # ifdef __i386__ unsigned long *sp = (unsigned long *)®s->sp; @@ -577,6 +573,7 @@ irqreturn_t timer_interrupt(int irq, voi run_local_timers(); if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_mode_vm(get_irq_regs())); + printk_tick(); scheduler_tick(); run_posix_cpu_timers(current); profile_tick(CPU_PROFILING); @@ -806,7 +803,8 @@ static void stop_hz_timer(void) smp_mb(); /* Leave ourselves in tick mode if rcu or softirq or timer pending. */ - if (rcu_needs_cpu(cpu) || local_softirq_pending() || + if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || + local_softirq_pending() || (j = get_next_timer_interrupt(jiffies), time_before_eq(j, jiffies))) { cpu_clear(cpu, nohz_cpu_mask); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ head-2011-03-17/arch/x86/kernel/traps-xen.c 2011-02-01 14:39:24.000000000 +0100 @@ -0,0 +1,1022 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + * + * Pentium III FXSR, SSE support + * Gareth Hughes , May 2000 + */ + +/* + * Handle hardware traps and faults. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_EISA +#include +#include +#endif + +#ifdef CONFIG_MCA +#include +#endif + +#if defined(CONFIG_EDAC) +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef CONFIG_X86_64 +#include +#include +#include +#else +#include +#include +#include +#include +#include +#include + +#include "cpu/mcheck/mce.h" + +#ifndef CONFIG_XEN +DECLARE_BITMAP(used_vectors, NR_VECTORS); +EXPORT_SYMBOL_GPL(used_vectors); +#endif + +asmlinkage int system_call(void); + +/* Do we ignore FPU interrupts ? */ +char ignore_fpu_irq; + +#ifndef CONFIG_X86_NO_IDT +/* + * The IDT has to be page-aligned to simplify the Pentium + * F0 0F bug workaround.. We have a special link segment + * for this. + */ +gate_desc idt_table[256] + __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; +#endif +#endif + +static int ignore_nmis; + +static inline void conditional_sti(struct pt_regs *regs) +{ + if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); +} + +static inline void preempt_conditional_sti(struct pt_regs *regs) +{ + inc_preempt_count(); + if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); +} + +static inline void preempt_conditional_cli(struct pt_regs *regs) +{ + if (regs->flags & X86_EFLAGS_IF) + local_irq_disable(); + dec_preempt_count(); +} + +#ifdef CONFIG_X86_32 +static inline void +die_if_kernel(const char *str, struct pt_regs *regs, long err) +{ + if (!user_mode_vm(regs)) + die(str, regs, err); +} + +/* + * Perform the lazy TSS's I/O bitmap copy. If the TSS has an + * invalid offset set (the LAZY one) and the faulting thread has + * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS, + * we set the offset field correctly and return 1. + */ +static int lazy_iobitmap_copy(void) +{ +#ifndef CONFIG_XEN + struct thread_struct *thread; + struct tss_struct *tss; + int cpu; + + cpu = get_cpu(); + tss = &per_cpu(init_tss, cpu); + thread = ¤t->thread; + + if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && + thread->io_bitmap_ptr) { + memcpy(tss->io_bitmap, thread->io_bitmap_ptr, + thread->io_bitmap_max); + /* + * If the previously set map was extending to higher ports + * than the current one, pad extra space with 0xff (no access). + */ + if (thread->io_bitmap_max < tss->io_bitmap_max) { + memset((char *) tss->io_bitmap + + thread->io_bitmap_max, 0xff, + tss->io_bitmap_max - thread->io_bitmap_max); + } + tss->io_bitmap_max = thread->io_bitmap_max; + tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; + tss->io_bitmap_owner = thread; + put_cpu(); + + return 1; + } + put_cpu(); +#endif + + return 0; +} +#endif + +static void __kprobes +do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, + long error_code, siginfo_t *info) +{ + struct task_struct *tsk = current; + +#ifdef CONFIG_X86_32 + if (regs->flags & X86_VM_MASK) { + /* + * traps 0, 1, 3, 4, and 5 should be forwarded to vm86. + * On nmi (interrupt 2), do_trap should not be called. + */ + if (trapnr < 6) + goto vm86_trap; + goto trap_signal; + } +#endif + + if (!user_mode(regs)) + goto kernel_trap; + +#ifdef CONFIG_X86_32 +trap_signal: +#endif + /* + * We want error_code and trap_no set for userspace faults and + * kernelspace faults which result in die(), but not + * kernelspace faults which are fixed up. die() gives the + * process no chance to handle the signal and notice the + * kernel fault information, so that won't result in polluting + * the information about previously queued, but not yet + * delivered, faults. See also do_general_protection below. + */ + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; + +#ifdef CONFIG_X86_64 + if (show_unhandled_signals && unhandled_signal(tsk, signr) && + printk_ratelimit()) { + printk(KERN_INFO + "%s[%d] trap %s ip:%lx sp:%lx error:%lx", + tsk->comm, tsk->pid, str, + regs->ip, regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } +#endif + + if (info) + force_sig_info(signr, info, tsk); + else + force_sig(signr, tsk); + return; + +kernel_trap: + if (!fixup_exception(regs)) { + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; + die(str, regs, error_code); + } + return; + +#ifdef CONFIG_X86_32 +vm86_trap: + if (handle_vm86_trap((struct kernel_vm86_regs *) regs, + error_code, trapnr)) + goto trap_signal; + return; +#endif +} + +#define DO_ERROR(trapnr, signr, str, name) \ +dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ +{ \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ + conditional_sti(regs); \ + do_trap(trapnr, signr, str, regs, error_code, NULL); \ +} + +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ +dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ +{ \ + siginfo_t info; \ + info.si_signo = signr; \ + info.si_errno = 0; \ + info.si_code = sicode; \ + info.si_addr = (void __user *)siaddr; \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ + conditional_sti(regs); \ + do_trap(trapnr, signr, str, regs, error_code, &info); \ +} + +DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) +DO_ERROR(4, SIGSEGV, "overflow", overflow) +DO_ERROR(5, SIGSEGV, "bounds", bounds) +DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) +DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) +DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) +#ifdef CONFIG_X86_32 +DO_ERROR(12, SIGBUS, "stack segment", stack_segment) +#endif +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) + +#ifdef CONFIG_X86_64 +/* Runs on IST stack */ +dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) +{ + if (notify_die(DIE_TRAP, "stack segment", regs, error_code, + 12, SIGBUS) == NOTIFY_STOP) + return; + preempt_conditional_sti(regs); + do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); + preempt_conditional_cli(regs); +} + +dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) +{ + static const char str[] = "double fault"; + struct task_struct *tsk = current; + + /* Return not checked because double check cannot be ignored */ + notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV); + + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 8; + + /* This is always a kernel trap and never fixable (and thus must + never return). */ + for (;;) + die(str, regs, error_code); +} +#endif + +dotraplinkage void __kprobes +do_general_protection(struct pt_regs *regs, long error_code) +{ + struct task_struct *tsk; + + conditional_sti(regs); + +#ifdef CONFIG_X86_32 + if (lazy_iobitmap_copy()) { + /* restart the faulting instruction */ + return; + } + + if (regs->flags & X86_VM_MASK) + goto gp_in_vm86; +#endif + + tsk = current; + if (!user_mode(regs)) + goto gp_in_kernel; + + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 13; + + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) { + printk(KERN_INFO + "%s[%d] general protection ip:%lx sp:%lx error:%lx", + tsk->comm, task_pid_nr(tsk), + regs->ip, regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } + + force_sig(SIGSEGV, tsk); + return; + +#ifdef CONFIG_X86_32 +gp_in_vm86: + local_irq_enable(); + handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); + return; +#endif + +gp_in_kernel: + if (fixup_exception(regs)) + return; + + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 13; + if (notify_die(DIE_GPF, "general protection fault", regs, + error_code, 13, SIGSEGV) == NOTIFY_STOP) + return; + die("general protection fault", regs, error_code); +} + +static notrace __kprobes void +mem_parity_error(unsigned char reason, struct pt_regs *regs) +{ + printk(KERN_EMERG + "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", + reason, smp_processor_id()); + + printk(KERN_EMERG + "You have some hardware problem, likely on the PCI bus.\n"); + +#if defined(CONFIG_EDAC) + if (edac_handler_set()) { + edac_atomic_assert_error(); + return; + } +#endif + + if (panic_on_unrecovered_nmi) + panic("NMI: Not continuing"); + + printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); + + /* Clear and disable the memory parity error line. */ + clear_mem_error(reason); +} + +static notrace __kprobes void +io_check_error(unsigned char reason, struct pt_regs *regs) +{ + printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); + show_registers(regs); + + /* Re-enable the IOCK line, wait for a few seconds */ + clear_io_check_error(reason); +} + +static notrace __kprobes void +unknown_nmi_error(unsigned char reason, struct pt_regs *regs) +{ + if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == + NOTIFY_STOP) + return; +#ifdef CONFIG_MCA + /* + * Might actually be able to figure out what the guilty party + * is: + */ + if (MCA_bus) { + mca_handle_nmi(); + return; + } +#endif + printk(KERN_EMERG + "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", + reason, smp_processor_id()); + + printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); + if (panic_on_unrecovered_nmi) + panic("NMI: Not continuing"); + + printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); +} + +static notrace __kprobes void default_do_nmi(struct pt_regs *regs) +{ + unsigned char reason = 0; + int cpu; + + cpu = smp_processor_id(); + + /* Only the BSP gets external NMIs from the system. */ + if (!cpu) + reason = get_nmi_reason(); + + if (!(reason & 0xc0)) { + if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) + return; +#ifdef CONFIG_X86_LOCAL_APIC + /* + * Ok, so this is none of the documented NMI sources, + * so it must be the NMI watchdog. + */ + if (nmi_watchdog_tick(regs, reason)) + return; + if (!do_nmi_callback(regs, cpu)) + unknown_nmi_error(reason, regs); +#else + unknown_nmi_error(reason, regs); +#endif + + return; + } + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) + return; + + /* AK: following checks seem to be broken on modern chipsets. FIXME */ + if (reason & 0x80) + mem_parity_error(reason, regs); + if (reason & 0x40) + io_check_error(reason, regs); +#ifdef CONFIG_X86_32 + /* + * Reassert NMI in case it became active meanwhile + * as it's edge-triggered: + */ + reassert_nmi(); +#endif +} + +dotraplinkage notrace __kprobes void +do_nmi(struct pt_regs *regs, long error_code) +{ + nmi_enter(); + +#ifdef CONFIG_X86_32 + { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); } +#else + add_pda(__nmi_count, 1); +#endif + + if (!ignore_nmis) + default_do_nmi(regs); + + nmi_exit(); +} + +void stop_nmi(void) +{ + acpi_nmi_disable(); + ignore_nmis++; +} + +void restart_nmi(void) +{ + ignore_nmis--; + acpi_nmi_enable(); +} + +/* May run on IST stack. */ +dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) +{ +#ifdef CONFIG_KPROBES + if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) + == NOTIFY_STOP) + return; +#else + if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP) + == NOTIFY_STOP) + return; +#endif + + preempt_conditional_sti(regs); + do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); + preempt_conditional_cli(regs); +} + +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) +/* Help handler running on IST stack to switch back to user stack + for scheduling or signal handling. The actual stack switch is done in + entry.S */ +asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) +{ + struct pt_regs *regs = eregs; + /* Did already sync */ + if (eregs == (struct pt_regs *)eregs->sp) + ; + /* Exception from user space */ + else if (user_mode(eregs)) + regs = task_pt_regs(current); + /* Exception from kernel and interrupts are enabled. Move to + kernel process stack. */ + else if (eregs->flags & X86_EFLAGS_IF) + regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); + if (eregs != regs) + *regs = *eregs; + return regs; +} +#endif + +/* + * Our handling of the processor debug registers is non-trivial. + * We do not clear them on entry and exit from the kernel. Therefore + * it is possible to get a watchpoint trap here from inside the kernel. + * However, the code in ./ptrace.c has ensured that the user can + * only set watchpoints on userspace addresses. Therefore the in-kernel + * watchpoint trap can only occur in code which is reading/writing + * from user space. Such code must not hold kernel locks (since it + * can equally take a page fault), therefore it is safe to call + * force_sig_info even though that claims and releases locks. + * + * Code in ./signal.c ensures that the debug control register + * is restored before we deliver any signal, and therefore that + * user code runs with the correct debug control register even though + * we clear it here. + * + * Being careful here means that we don't have to be as careful in a + * lot of more complicated places (task switching can be a bit lazy + * about restoring all the debug state, and ptrace doesn't have to + * find every occurrence of the TF bit that could be saved away even + * by user code) + * + * May run on IST stack. + */ +dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) +{ + struct task_struct *tsk = current; + unsigned long condition; + int si_code; + + get_debugreg(condition, 6); + + /* + * The processor cleared BTF, so don't mark that we need it set. + */ + clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); + tsk->thread.debugctlmsr = 0; + + if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, + SIGTRAP) == NOTIFY_STOP) + return; + + /* It's safe to allow irq's after DR6 has been saved */ + preempt_conditional_sti(regs); + + /* Mask out spurious debug traps due to lazy DR7 setting */ + if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { + if (!tsk->thread.debugreg7) + goto clear_dr7; + } + +#ifdef CONFIG_X86_32 + if (regs->flags & X86_VM_MASK) + goto debug_vm86; +#endif + + /* Save debug status register where ptrace can see it */ + tsk->thread.debugreg6 = condition; + + /* + * Single-stepping through TF: make sure we ignore any events in + * kernel space (but re-enable TF when returning to user mode). + */ + if (condition & DR_STEP) { + if (!user_mode(regs)) + goto clear_TF_reenable; + } + + si_code = get_si_code(condition); + /* Ok, finally something we can handle */ + send_sigtrap(tsk, regs, error_code, si_code); + + /* + * Disable additional traps. They'll be re-enabled when + * the signal is delivered. + */ +clear_dr7: + set_debugreg(0, 7); + preempt_conditional_cli(regs); + return; + +#ifdef CONFIG_X86_32 +debug_vm86: + handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); + preempt_conditional_cli(regs); + return; +#endif + +clear_TF_reenable: + set_tsk_thread_flag(tsk, TIF_SINGLESTEP); + regs->flags &= ~X86_EFLAGS_TF; + preempt_conditional_cli(regs); + return; +} + +#ifdef CONFIG_X86_64 +static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) +{ + if (fixup_exception(regs)) + return 1; + + notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); + /* Illegal floating point operation in the kernel */ + current->thread.trap_no = trapnr; + die(str, regs, 0); + return 0; +} +#endif + +/* + * Note that we play around with the 'TS' bit in an attempt to get + * the correct behaviour even in the presence of the asynchronous + * IRQ13 behaviour + */ +void math_error(void __user *ip) +{ + struct task_struct *task; + siginfo_t info; + unsigned short cwd, swd; + + /* + * Save the info for the exception handler and clear the error. + */ + task = current; + save_init_fpu(task); + task->thread.trap_no = 16; + task->thread.error_code = 0; + info.si_signo = SIGFPE; + info.si_errno = 0; + info.si_code = __SI_FAULT; + info.si_addr = ip; + /* + * (~cwd & swd) will mask out exceptions that are not set to unmasked + * status. 0x3f is the exception bits in these regs, 0x200 is the + * C1 reg you need in case of a stack fault, 0x040 is the stack + * fault bit. We should only be taking one exception at a time, + * so if this combination doesn't produce any single exception, + * then we have a bad program that isn't synchronizing its FPU usage + * and it will suffer the consequences since we won't be able to + * fully reproduce the context of the exception + */ + cwd = get_fpu_cwd(task); + swd = get_fpu_swd(task); + switch (swd & ~cwd & 0x3f) { + case 0x000: /* No unmasked exception */ +#ifdef CONFIG_X86_32 + return; +#endif + default: /* Multiple exceptions */ + break; + case 0x001: /* Invalid Op */ + /* + * swd & 0x240 == 0x040: Stack Underflow + * swd & 0x240 == 0x240: Stack Overflow + * User must clear the SF bit (0x40) if set + */ + info.si_code = FPE_FLTINV; + break; + case 0x002: /* Denormalize */ + case 0x010: /* Underflow */ + info.si_code = FPE_FLTUND; + break; + case 0x004: /* Zero Divide */ + info.si_code = FPE_FLTDIV; + break; + case 0x008: /* Overflow */ + info.si_code = FPE_FLTOVF; + break; + case 0x020: /* Precision */ + info.si_code = FPE_FLTRES; + break; + } + force_sig_info(SIGFPE, &info, task); +} + +dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) +{ + conditional_sti(regs); + +#ifdef CONFIG_X86_32 + ignore_fpu_irq = 1; +#else + if (!user_mode(regs) && + kernel_math_error(regs, "kernel x87 math error", 16)) + return; +#endif + + math_error((void __user *)regs->ip); +} + +static void simd_math_error(void __user *ip) +{ + struct task_struct *task; + siginfo_t info; + unsigned short mxcsr; + + /* + * Save the info for the exception handler and clear the error. + */ + task = current; + save_init_fpu(task); + task->thread.trap_no = 19; + task->thread.error_code = 0; + info.si_signo = SIGFPE; + info.si_errno = 0; + info.si_code = __SI_FAULT; + info.si_addr = ip; + /* + * The SIMD FPU exceptions are handled a little differently, as there + * is only a single status/control register. Thus, to determine which + * unmasked exception was caught we must mask the exception mask bits + * at 0x1f80, and then use these to mask the exception bits at 0x3f. + */ + mxcsr = get_fpu_mxcsr(task); + switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { + case 0x000: + default: + break; + case 0x001: /* Invalid Op */ + info.si_code = FPE_FLTINV; + break; + case 0x002: /* Denormalize */ + case 0x010: /* Underflow */ + info.si_code = FPE_FLTUND; + break; + case 0x004: /* Zero Divide */ + info.si_code = FPE_FLTDIV; + break; + case 0x008: /* Overflow */ + info.si_code = FPE_FLTOVF; + break; + case 0x020: /* Precision */ + info.si_code = FPE_FLTRES; + break; + } + force_sig_info(SIGFPE, &info, task); +} + +dotraplinkage void +do_simd_coprocessor_error(struct pt_regs *regs, long error_code) +{ + conditional_sti(regs); + +#ifdef CONFIG_X86_32 + if (cpu_has_xmm) { + /* Handle SIMD FPU exceptions on PIII+ processors. */ + ignore_fpu_irq = 1; + simd_math_error((void __user *)regs->ip); + return; + } + /* + * Handle strange cache flush from user space exception + * in all other cases. This is undocumented behaviour. + */ + if (regs->flags & X86_VM_MASK) { + handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code); + return; + } + current->thread.trap_no = 19; + current->thread.error_code = error_code; + die_if_kernel("cache flush denied", regs, error_code); + force_sig(SIGSEGV, current); +#else + if (!user_mode(regs) && + kernel_math_error(regs, "kernel simd math error", 19)) + return; + simd_math_error((void __user *)regs->ip); +#endif +} + +#ifndef CONFIG_XEN +dotraplinkage void +do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) +{ + conditional_sti(regs); +#if 0 + /* No need to warn about this any longer. */ + printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); +#endif +} + +#ifdef CONFIG_X86_32 +unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) +{ + struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id()); + unsigned long base = (kesp - uesp) & -THREAD_SIZE; + unsigned long new_kesp = kesp - base; + unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; + __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS]; + + /* Set up base for espfix segment */ + desc &= 0x00f0ff0000000000ULL; + desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) | + ((((__u64)base) << 32) & 0xff00000000000000ULL) | + ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) | + (lim_pages & 0xffff); + *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc; + + return new_kesp; +} +#else +asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) +{ +} + +asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) +{ +} +#endif +#endif /* CONFIG_XEN */ + +/* + * 'math_state_restore()' saves the current math information in the + * old math state array, and gets the new ones from the current task + * + * Careful.. There are problems with IBM-designed IRQ13 behaviour. + * Don't touch unless you *really* know how it works. + * + * Must be called with kernel preemption disabled (in this case, + * local interrupts are disabled at the call-site in entry.S). + */ +asmlinkage void math_state_restore(void) +{ + struct thread_info *thread = current_thread_info(); + struct task_struct *tsk = thread->task; + + if (!tsk_used_math(tsk)) { + local_irq_enable(); + /* + * does a slab alloc which can sleep + */ + if (init_fpu(tsk)) { + /* + * ran out of memory! + */ + do_group_exit(SIGKILL); + return; + } + local_irq_disable(); + } + + /* NB. 'clts' is done for us by Xen during virtual trap. */ +#ifdef CONFIG_X86_32 + restore_fpu(tsk); +#else + /* + * Paranoid restore. send a SIGSEGV if we fail to restore the state. + */ + if (unlikely(restore_fpu_checking(tsk))) { + stts(); + force_sig(SIGSEGV, tsk); + return; + } +#endif + thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ + tsk->fpu_counter++; +} +EXPORT_SYMBOL_GPL(math_state_restore); + +#ifndef CONFIG_MATH_EMULATION +asmlinkage void math_emulate(long arg) +{ + printk(KERN_EMERG + "math-emulation not enabled and no coprocessor found.\n"); + printk(KERN_EMERG "killing %s.\n", current->comm); + force_sig(SIGFPE, current); + schedule(); +} +#endif /* CONFIG_MATH_EMULATION */ + +dotraplinkage void __kprobes +do_device_not_available(struct pt_regs *regs, long error) +{ +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN) + if (read_cr0() & X86_CR0_EM) { + conditional_sti(regs); + math_emulate(0); + } else { + math_state_restore(); /* interrupts still off */ + conditional_sti(regs); + } +#else + math_state_restore(); +#endif +} + +#ifdef CONFIG_X86_32 +dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) +{ + siginfo_t info; + local_irq_enable(); + + info.si_signo = SIGILL; + info.si_errno = 0; + info.si_code = ILL_BADSTK; + info.si_addr = 0; + if (notify_die(DIE_TRAP, "iret exception", + regs, error_code, 32, SIGILL) == NOTIFY_STOP) + return; + do_trap(32, SIGILL, "iret exception", regs, error_code, &info); +} +#endif + +/* + * NB. All these are "trap gates" (i.e. events_mask isn't set) except + * for those that specify |4 in the second field. + */ +static const trap_info_t __cpuinitconst trap_table[] = { +#ifdef CONFIG_X86_32 +#define X 0 +#else +#define X 4 +#endif + { 0, 0|X, __KERNEL_CS, (unsigned long)divide_error }, + { 1, 0|4, __KERNEL_CS, (unsigned long)debug }, + { 3, 3|4, __KERNEL_CS, (unsigned long)int3 }, + { 4, 3|X, __KERNEL_CS, (unsigned long)overflow }, + { 5, 0|X, __KERNEL_CS, (unsigned long)bounds }, + { 6, 0|X, __KERNEL_CS, (unsigned long)invalid_op }, + { 7, 0|4, __KERNEL_CS, (unsigned long)device_not_available }, + { 9, 0|X, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun }, + { 10, 0|X, __KERNEL_CS, (unsigned long)invalid_TSS }, + { 11, 0|X, __KERNEL_CS, (unsigned long)segment_not_present }, + { 12, 0|X, __KERNEL_CS, (unsigned long)stack_segment }, + { 13, 0|X, __KERNEL_CS, (unsigned long)general_protection }, + { 14, 0|4, __KERNEL_CS, (unsigned long)page_fault }, + { 16, 0|X, __KERNEL_CS, (unsigned long)coprocessor_error }, + { 17, 0|X, __KERNEL_CS, (unsigned long)alignment_check }, +#ifdef CONFIG_X86_MCE + { 18, 0|X, __KERNEL_CS, (unsigned long)machine_check }, +#endif + { 19, 0|X, __KERNEL_CS, (unsigned long)simd_coprocessor_error }, +#ifdef CONFIG_X86_32 + { 15, 0, __KERNEL_CS, (unsigned long)fixup_4gb_segment }, + { SYSCALL_VECTOR, 3, __KERNEL_CS, (unsigned long)system_call }, +#elif defined(CONFIG_IA32_EMULATION) + { IA32_SYSCALL_VECTOR, 3, __KERNEL_CS, (unsigned long)ia32_syscall }, +#endif + { 0, 0, 0, 0 } +}; + +void __init trap_init(void) +{ + int ret; + + ret = HYPERVISOR_set_trap_table(trap_table); + if (ret) + printk("HYPERVISOR_set_trap_table failed (%d)\n", ret); + +#ifdef CONFIG_X86_32 + if (cpu_has_fxsr) { + printk(KERN_INFO "Enabling fast FPU save and restore... "); + set_in_cr4(X86_CR4_OSFXSR); + printk("done.\n"); + } + if (cpu_has_xmm) { + printk(KERN_INFO + "Enabling unmasked SIMD FPU exception support... "); + set_in_cr4(X86_CR4_OSXMMEXCPT); + printk("done.\n"); + } + +#endif + /* + * Should be a barrier for any external CPU state: + */ + cpu_init(); +} + +void __cpuinit smp_trap_init(trap_info_t *trap_ctxt) +{ + const trap_info_t *t = trap_table; + + for (t = trap_table; t->address; t++) { + trap_ctxt[t->vector].flags = t->flags; + trap_ctxt[t->vector].cs = t->cs; + trap_ctxt[t->vector].address = t->address; + } +} --- head-2011-03-17.orig/arch/x86/kernel/traps_32-xen.c 2011-02-01 14:38:38.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,1222 +0,0 @@ -/* - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs - * - * Pentium III FXSR, SSE support - * Gareth Hughes , May 2000 - */ - -/* - * 'Traps.c' handles hardware traps and faults after we have saved some - * state in 'asm.s'. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef CONFIG_EISA -#include -#include -#endif - -#ifdef CONFIG_MCA -#include -#endif - -#if defined(CONFIG_EDAC) -#include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "mach_traps.h" - -#ifndef CONFIG_XEN -DECLARE_BITMAP(used_vectors, NR_VECTORS); -EXPORT_SYMBOL_GPL(used_vectors); -#endif - -asmlinkage int system_call(void); - -/* Do we ignore FPU interrupts ? */ -char ignore_fpu_irq; - -#ifndef CONFIG_X86_NO_IDT -/* - * The IDT has to be page-aligned to simplify the Pentium - * F0 0F bug workaround.. We have a special link segment - * for this. - */ -gate_desc idt_table[256] - __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; -#endif - -int panic_on_unrecovered_nmi; -int kstack_depth_to_print = 24; -static unsigned int code_bytes = 64; -static int ignore_nmis; -static int die_counter; - -void printk_address(unsigned long address, int reliable) -{ -#ifdef CONFIG_KALLSYMS - unsigned long offset = 0; - unsigned long symsize; - const char *symname; - char *modname; - char *delim = ":"; - char namebuf[KSYM_NAME_LEN]; - char reliab[4] = ""; - - symname = kallsyms_lookup(address, &symsize, &offset, - &modname, namebuf); - if (!symname) { - printk(" [<%08lx>]\n", address); - return; - } - if (!reliable) - strcpy(reliab, "? "); - - if (!modname) - modname = delim = ""; - printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n", - address, reliab, delim, modname, delim, symname, offset, symsize); -#else - printk(" [<%08lx>]\n", address); -#endif -} - -static inline int valid_stack_ptr(struct thread_info *tinfo, - void *p, unsigned int size) -{ - void *t = tinfo; - return p > t && p <= t + THREAD_SIZE - size; -} - -/* The form of the top of the frame on the stack */ -struct stack_frame { - struct stack_frame *next_frame; - unsigned long return_address; -}; - -static inline unsigned long -print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data) -{ - struct stack_frame *frame = (struct stack_frame *)bp; - - while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) { - unsigned long addr; - - addr = *stack; - if (__kernel_text_address(addr)) { - if ((unsigned long) stack == bp + 4) { - ops->address(data, addr, 1); - frame = frame->next_frame; - bp = (unsigned long) frame; - } else { - ops->address(data, addr, bp == 0); - } - } - stack++; - } - return bp; -} - -void dump_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data) -{ - if (!task) - task = current; - - if (!stack) { - unsigned long dummy; - stack = &dummy; - if (task != current) - stack = (unsigned long *)task->thread.sp; - } - -#ifdef CONFIG_FRAME_POINTER - if (!bp) { - if (task == current) { - /* Grab bp right from our regs */ - asm("movl %%ebp, %0" : "=r" (bp) :); - } else { - /* bp is the last reg pushed by switch_to */ - bp = *(unsigned long *) task->thread.sp; - } - } -#endif - - for (;;) { - struct thread_info *context; - - context = (struct thread_info *) - ((unsigned long)stack & (~(THREAD_SIZE - 1))); - bp = print_context_stack(context, stack, bp, ops, data); - /* - * Should be after the line below, but somewhere - * in early boot context comes out corrupted and we - * can't reference it: - */ - if (ops->stack(data, "IRQ") < 0) - break; - stack = (unsigned long *)context->previous_esp; - if (!stack) - break; - touch_nmi_watchdog(); - } -} -EXPORT_SYMBOL(dump_trace); - -static void -print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - printk(data); - print_symbol(msg, symbol); - printk("\n"); -} - -static void print_trace_warning(void *data, char *msg) -{ - printk("%s%s\n", (char *)data, msg); -} - -static int print_trace_stack(void *data, char *name) -{ - return 0; -} - -/* - * Print one address/symbol entries per line. - */ -static void print_trace_address(void *data, unsigned long addr, int reliable) -{ - printk("%s [<%08lx>] ", (char *)data, addr); - if (!reliable) - printk("? "); - print_symbol("%s\n", addr); - touch_nmi_watchdog(); -} - -static const struct stacktrace_ops print_trace_ops = { - .warning = print_trace_warning, - .warning_symbol = print_trace_warning_symbol, - .stack = print_trace_stack, - .address = print_trace_address, -}; - -static void -show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl) -{ - dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); - printk("%s =======================\n", log_lvl); -} - -void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp) -{ - show_trace_log_lvl(task, regs, stack, bp, ""); -} - -static void -show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) -{ - unsigned long *stack; - int i; - - if (sp == NULL) { - if (task) - sp = (unsigned long *)task->thread.sp; - else - sp = (unsigned long *)&sp; - } - - stack = sp; - for (i = 0; i < kstack_depth_to_print; i++) { - if (kstack_end(stack)) - break; - if (i && ((i % 8) == 0)) - printk("\n%s ", log_lvl); - printk("%08lx ", *stack++); - } - printk("\n%sCall Trace:\n", log_lvl); - - show_trace_log_lvl(task, regs, sp, bp, log_lvl); -} - -void show_stack(struct task_struct *task, unsigned long *sp) -{ - printk(" "); - show_stack_log_lvl(task, NULL, sp, 0, ""); -} - -/* - * The architecture-independent dump_stack generator - */ -void dump_stack(void) -{ - unsigned long bp = 0; - unsigned long stack; - -#ifdef CONFIG_FRAME_POINTER - if (!bp) - asm("movl %%ebp, %0" : "=r" (bp):); -#endif - - printk("Pid: %d, comm: %.20s %s %s %.*s\n", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - - show_trace(current, NULL, &stack, bp); -} - -EXPORT_SYMBOL(dump_stack); - -void show_registers(struct pt_regs *regs) -{ - int i; - - print_modules(); - __show_registers(regs, 0); - - printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", - TASK_COMM_LEN, current->comm, task_pid_nr(current), - current_thread_info(), current, task_thread_info(current)); - /* - * When in-kernel, we also print out the stack and code at the - * time of the fault.. - */ - if (!user_mode_vm(regs)) { - unsigned int code_prologue = code_bytes * 43 / 64; - unsigned int code_len = code_bytes; - unsigned char c; - u8 *ip; - - printk("\n" KERN_EMERG "Stack: "); - show_stack_log_lvl(NULL, regs, ®s->sp, 0, KERN_EMERG); - - printk(KERN_EMERG "Code: "); - - ip = (u8 *)regs->ip - code_prologue; - if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { - /* try starting at EIP */ - ip = (u8 *)regs->ip; - code_len = code_len - code_prologue + 1; - } - for (i = 0; i < code_len; i++, ip++) { - if (ip < (u8 *)PAGE_OFFSET || - probe_kernel_address(ip, c)) { - printk(" Bad EIP value."); - break; - } - if (ip == (u8 *)regs->ip) - printk("<%02x> ", c); - else - printk("%02x ", c); - } - } - printk("\n"); -} - -int is_valid_bugaddr(unsigned long ip) -{ - unsigned short ud2; - - if (ip < PAGE_OFFSET) - return 0; - if (probe_kernel_address((unsigned short *)ip, ud2)) - return 0; - - return ud2 == 0x0b0f; -} - -static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; -static int die_owner = -1; -static unsigned int die_nest_count; - -unsigned __kprobes long oops_begin(void) -{ - unsigned long flags; - - oops_enter(); - - if (die_owner != raw_smp_processor_id()) { - console_verbose(); - raw_local_irq_save(flags); - __raw_spin_lock(&die_lock); - die_owner = smp_processor_id(); - die_nest_count = 0; - bust_spinlocks(1); - } else { - raw_local_irq_save(flags); - } - die_nest_count++; - return flags; -} - -void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) -{ - bust_spinlocks(0); - die_owner = -1; - add_taint(TAINT_DIE); - __raw_spin_unlock(&die_lock); - raw_local_irq_restore(flags); - - if (!regs) - return; - - if (kexec_should_crash(current)) - crash_kexec(regs); - - if (in_interrupt()) - panic("Fatal exception in interrupt"); - - if (panic_on_oops) - panic("Fatal exception"); - - oops_exit(); - do_exit(signr); -} - -int __kprobes __die(const char *str, struct pt_regs *regs, long err) -{ - unsigned short ss; - unsigned long sp; - - printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP "); -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC"); -#endif - printk("\n"); - if (notify_die(DIE_OOPS, str, regs, err, - current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) - return 1; - - show_registers(regs); - /* Executive summary in case the oops scrolled away */ - sp = (unsigned long) (®s->sp); - savesegment(ss, ss); - if (user_mode(regs)) { - sp = regs->sp; - ss = regs->ss & 0xffff; - } - printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); - print_symbol("%s", regs->ip); - printk(" SS:ESP %04x:%08lx\n", ss, sp); - return 0; -} - -/* - * This is gone through when something in the kernel has done something bad - * and is about to be terminated: - */ -void die(const char *str, struct pt_regs *regs, long err) -{ - unsigned long flags = oops_begin(); - - if (die_nest_count < 3) { - report_bug(regs->ip, regs); - - if (__die(str, regs, err)) - regs = NULL; - } else { - printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); - } - - oops_end(flags, regs, SIGSEGV); -} - -static inline void -die_if_kernel(const char *str, struct pt_regs *regs, long err) -{ - if (!user_mode_vm(regs)) - die(str, regs, err); -} - -static void __kprobes -do_trap(int trapnr, int signr, char *str, int vm86, struct pt_regs *regs, - long error_code, siginfo_t *info) -{ - struct task_struct *tsk = current; - - if (regs->flags & X86_VM_MASK) { - if (vm86) - goto vm86_trap; - goto trap_signal; - } - - if (!user_mode(regs)) - goto kernel_trap; - -trap_signal: - /* - * We want error_code and trap_no set for userspace faults and - * kernelspace faults which result in die(), but not - * kernelspace faults which are fixed up. die() gives the - * process no chance to handle the signal and notice the - * kernel fault information, so that won't result in polluting - * the information about previously queued, but not yet - * delivered, faults. See also do_general_protection below. - */ - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; - - if (info) - force_sig_info(signr, info, tsk); - else - force_sig(signr, tsk); - return; - -kernel_trap: - if (!fixup_exception(regs)) { - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; - die(str, regs, error_code); - } - return; - -vm86_trap: - if (handle_vm86_trap((struct kernel_vm86_regs *) regs, - error_code, trapnr)) - goto trap_signal; - return; -} - -#define DO_ERROR(trapnr, signr, str, name) \ -void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - trace_hardirqs_fixup(); \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ -} - -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \ -void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - siginfo_t info; \ - if (irq) \ - local_irq_enable(); \ - info.si_signo = signr; \ - info.si_errno = 0; \ - info.si_code = sicode; \ - info.si_addr = (void __user *)siaddr; \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - do_trap(trapnr, signr, str, 0, regs, error_code, &info); \ -} - -#define DO_VM86_ERROR(trapnr, signr, str, name) \ -void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ -} - -#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ -void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - siginfo_t info; \ - info.si_signo = signr; \ - info.si_errno = 0; \ - info.si_code = sicode; \ - info.si_addr = (void __user *)siaddr; \ - trace_hardirqs_fixup(); \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ -} - -DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) -#ifndef CONFIG_KPROBES -DO_VM86_ERROR(3, SIGTRAP, "int3", int3) -#endif -DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow) -DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) -DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) -DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) -DO_ERROR(12, SIGBUS, "stack segment", stack_segment) -DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) -DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1) - -void __kprobes -do_general_protection(struct pt_regs *regs, long error_code) -{ - struct task_struct *tsk; - struct thread_struct *thread; - - thread = ¤t->thread; - - if (regs->flags & X86_VM_MASK) - goto gp_in_vm86; - - tsk = current; - if (!user_mode(regs)) - goto gp_in_kernel; - - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] general protection ip:%lx sp:%lx error:%lx", - tsk->comm, task_pid_nr(tsk), - regs->ip, regs->sp, error_code); - print_vma_addr(" in ", regs->ip); - printk("\n"); - } - - force_sig(SIGSEGV, tsk); - return; - -gp_in_vm86: - local_irq_enable(); - handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); - return; - -gp_in_kernel: - if (fixup_exception(regs)) - return; - - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - if (notify_die(DIE_GPF, "general protection fault", regs, - error_code, 13, SIGSEGV) == NOTIFY_STOP) - return; - die("general protection fault", regs, error_code); -} - -static notrace __kprobes void -mem_parity_error(unsigned char reason, struct pt_regs *regs) -{ - printk(KERN_EMERG - "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", - reason, smp_processor_id()); - - printk(KERN_EMERG - "You have some hardware problem, likely on the PCI bus.\n"); - -#if defined(CONFIG_EDAC) - if (edac_handler_set()) { - edac_atomic_assert_error(); - return; - } -#endif - - if (panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); - - printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); - - /* Clear and disable the memory parity error line. */ - clear_mem_error(reason); -} - -static notrace __kprobes void -io_check_error(unsigned char reason, struct pt_regs *regs) -{ - printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); - show_registers(regs); - - /* Re-enable the IOCK line, wait for a few seconds */ - clear_io_check_error(reason); -} - -static notrace __kprobes void -unknown_nmi_error(unsigned char reason, struct pt_regs *regs) -{ - if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) - return; -#ifdef CONFIG_MCA - /* - * Might actually be able to figure out what the guilty party - * is: - */ - if (MCA_bus) { - mca_handle_nmi(); - return; - } -#endif - printk(KERN_EMERG - "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", - reason, smp_processor_id()); - - printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); - if (panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); - - printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); -} - -static DEFINE_SPINLOCK(nmi_print_lock); - -void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) -{ - if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) - return; - - spin_lock(&nmi_print_lock); - /* - * We are in trouble anyway, lets at least try - * to get a message out: - */ - bust_spinlocks(1); - printk(KERN_EMERG "%s", str); - printk(" on CPU%d, ip %08lx, registers:\n", - smp_processor_id(), regs->ip); - show_registers(regs); - if (do_panic) - panic("Non maskable interrupt"); - console_silent(); - spin_unlock(&nmi_print_lock); - bust_spinlocks(0); - - /* - * If we are in kernel we are probably nested up pretty bad - * and might aswell get out now while we still can: - */ - if (!user_mode_vm(regs)) { - current->thread.trap_no = 2; - crash_kexec(regs); - } - - do_exit(SIGSEGV); -} - -static notrace __kprobes void default_do_nmi(struct pt_regs *regs) -{ - unsigned char reason = 0; - int cpu; - - cpu = smp_processor_id(); - - /* Only the BSP gets external NMIs from the system. */ - if (!cpu) - reason = get_nmi_reason(); - - if (!(reason & 0xc0)) { - if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) - return; -#ifdef CONFIG_X86_LOCAL_APIC - /* - * Ok, so this is none of the documented NMI sources, - * so it must be the NMI watchdog. - */ - if (nmi_watchdog_tick(regs, reason)) - return; - if (!do_nmi_callback(regs, cpu)) - unknown_nmi_error(reason, regs); -#else - unknown_nmi_error(reason, regs); -#endif - - return; - } - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) - return; - - /* AK: following checks seem to be broken on modern chipsets. FIXME */ - if (reason & 0x80) - mem_parity_error(reason, regs); - if (reason & 0x40) - io_check_error(reason, regs); - /* - * Reassert NMI in case it became active meanwhile - * as it's edge-triggered: - */ - reassert_nmi(); -} - -notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) -{ - int cpu; - - nmi_enter(); - - cpu = smp_processor_id(); - - ++nmi_count(cpu); - - if (!ignore_nmis) - default_do_nmi(regs); - - nmi_exit(); -} - -void stop_nmi(void) -{ - acpi_nmi_disable(); - ignore_nmis++; -} - -void restart_nmi(void) -{ - ignore_nmis--; - acpi_nmi_enable(); -} - -#ifdef CONFIG_KPROBES -void __kprobes do_int3(struct pt_regs *regs, long error_code) -{ - trace_hardirqs_fixup(); - - if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) - == NOTIFY_STOP) - return; - /* - * This is an interrupt gate, because kprobes wants interrupts - * disabled. Normal trap handlers don't. - */ - restore_interrupts(regs); - - do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL); -} -#endif - -/* - * Our handling of the processor debug registers is non-trivial. - * We do not clear them on entry and exit from the kernel. Therefore - * it is possible to get a watchpoint trap here from inside the kernel. - * However, the code in ./ptrace.c has ensured that the user can - * only set watchpoints on userspace addresses. Therefore the in-kernel - * watchpoint trap can only occur in code which is reading/writing - * from user space. Such code must not hold kernel locks (since it - * can equally take a page fault), therefore it is safe to call - * force_sig_info even though that claims and releases locks. - * - * Code in ./signal.c ensures that the debug control register - * is restored before we deliver any signal, and therefore that - * user code runs with the correct debug control register even though - * we clear it here. - * - * Being careful here means that we don't have to be as careful in a - * lot of more complicated places (task switching can be a bit lazy - * about restoring all the debug state, and ptrace doesn't have to - * find every occurrence of the TF bit that could be saved away even - * by user code) - */ -void __kprobes do_debug(struct pt_regs *regs, long error_code) -{ - struct task_struct *tsk = current; - unsigned int condition; - - trace_hardirqs_fixup(); - - get_debugreg(condition, 6); - - /* - * The processor cleared BTF, so don't mark that we need it set. - */ - clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); - tsk->thread.debugctlmsr = 0; - - if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, - SIGTRAP) == NOTIFY_STOP) - return; - /* It's safe to allow irq's after DR6 has been saved */ - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); - - /* Mask out spurious debug traps due to lazy DR7 setting */ - if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { - if (!tsk->thread.debugreg7) - goto clear_dr7; - } - - if (regs->flags & X86_VM_MASK) - goto debug_vm86; - - /* Save debug status register where ptrace can see it */ - tsk->thread.debugreg6 = condition; - - /* - * Single-stepping through TF: make sure we ignore any events in - * kernel space (but re-enable TF when returning to user mode). - */ - if (condition & DR_STEP) { - /* - * We already checked v86 mode above, so we can - * check for kernel mode by just checking the CPL - * of CS. - */ - if (!user_mode(regs)) - goto clear_TF_reenable; - } - - /* Ok, finally something we can handle */ - send_sigtrap(tsk, regs, error_code); - - /* - * Disable additional traps. They'll be re-enabled when - * the signal is delivered. - */ -clear_dr7: - set_debugreg(0, 7); - return; - -debug_vm86: - handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); - return; - -clear_TF_reenable: - set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - regs->flags &= ~X86_EFLAGS_TF; - return; -} - -/* - * Note that we play around with the 'TS' bit in an attempt to get - * the correct behaviour even in the presence of the asynchronous - * IRQ13 behaviour - */ -void math_error(void __user *ip) -{ - struct task_struct *task; - siginfo_t info; - unsigned short cwd, swd; - - /* - * Save the info for the exception handler and clear the error. - */ - task = current; - save_init_fpu(task); - task->thread.trap_no = 16; - task->thread.error_code = 0; - info.si_signo = SIGFPE; - info.si_errno = 0; - info.si_code = __SI_FAULT; - info.si_addr = ip; - /* - * (~cwd & swd) will mask out exceptions that are not set to unmasked - * status. 0x3f is the exception bits in these regs, 0x200 is the - * C1 reg you need in case of a stack fault, 0x040 is the stack - * fault bit. We should only be taking one exception at a time, - * so if this combination doesn't produce any single exception, - * then we have a bad program that isn't synchronizing its FPU usage - * and it will suffer the consequences since we won't be able to - * fully reproduce the context of the exception - */ - cwd = get_fpu_cwd(task); - swd = get_fpu_swd(task); - switch (swd & ~cwd & 0x3f) { - case 0x000: /* No unmasked exception */ - return; - default: /* Multiple exceptions */ - break; - case 0x001: /* Invalid Op */ - /* - * swd & 0x240 == 0x040: Stack Underflow - * swd & 0x240 == 0x240: Stack Overflow - * User must clear the SF bit (0x40) if set - */ - info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ - info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ - info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ - info.si_code = FPE_FLTRES; - break; - } - force_sig_info(SIGFPE, &info, task); -} - -void do_coprocessor_error(struct pt_regs *regs, long error_code) -{ - ignore_fpu_irq = 1; - math_error((void __user *)regs->ip); -} - -static void simd_math_error(void __user *ip) -{ - struct task_struct *task; - siginfo_t info; - unsigned short mxcsr; - - /* - * Save the info for the exception handler and clear the error. - */ - task = current; - save_init_fpu(task); - task->thread.trap_no = 19; - task->thread.error_code = 0; - info.si_signo = SIGFPE; - info.si_errno = 0; - info.si_code = __SI_FAULT; - info.si_addr = ip; - /* - * The SIMD FPU exceptions are handled a little differently, as there - * is only a single status/control register. Thus, to determine which - * unmasked exception was caught we must mask the exception mask bits - * at 0x1f80, and then use these to mask the exception bits at 0x3f. - */ - mxcsr = get_fpu_mxcsr(task); - switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { - case 0x000: - default: - break; - case 0x001: /* Invalid Op */ - info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ - info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ - info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ - info.si_code = FPE_FLTRES; - break; - } - force_sig_info(SIGFPE, &info, task); -} - -void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) -{ - if (cpu_has_xmm) { - /* Handle SIMD FPU exceptions on PIII+ processors. */ - ignore_fpu_irq = 1; - simd_math_error((void __user *)regs->ip); - return; - } - /* - * Handle strange cache flush from user space exception - * in all other cases. This is undocumented behaviour. - */ - if (regs->flags & X86_VM_MASK) { - handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code); - return; - } - current->thread.trap_no = 19; - current->thread.error_code = error_code; - die_if_kernel("cache flush denied", regs, error_code); - force_sig(SIGSEGV, current); -} - -#ifndef CONFIG_XEN -void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) -{ -#if 0 - /* No need to warn about this any longer. */ - printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); -#endif -} - -unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) -{ - struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id()); - unsigned long base = (kesp - uesp) & -THREAD_SIZE; - unsigned long new_kesp = kesp - base; - unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; - __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS]; - - /* Set up base for espfix segment */ - desc &= 0x00f0ff0000000000ULL; - desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) | - ((((__u64)base) << 32) & 0xff00000000000000ULL) | - ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) | - (lim_pages & 0xffff); - *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc; - - return new_kesp; -} -#endif - -/* - * 'math_state_restore()' saves the current math information in the - * old math state array, and gets the new ones from the current task - * - * Careful.. There are problems with IBM-designed IRQ13 behaviour. - * Don't touch unless you *really* know how it works. - * - * Must be called with kernel preemption disabled (in this case, - * local interrupts are disabled at the call-site in entry.S). - */ -asmlinkage void math_state_restore(void) -{ - struct thread_info *thread = current_thread_info(); - struct task_struct *tsk = thread->task; - - if (!tsk_used_math(tsk)) { - local_irq_enable(); - /* - * does a slab alloc which can sleep - */ - if (init_fpu(tsk)) { - /* - * ran out of memory! - */ - do_group_exit(SIGKILL); - return; - } - local_irq_disable(); - } - - /* NB. 'clts' is done for us by Xen during virtual trap. */ - restore_fpu(tsk); - thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ - tsk->fpu_counter++; -} -EXPORT_SYMBOL_GPL(math_state_restore); - -#ifndef CONFIG_MATH_EMULATION - -asmlinkage void math_emulate(long arg) -{ - printk(KERN_EMERG - "math-emulation not enabled and no coprocessor found.\n"); - printk(KERN_EMERG "killing %s.\n", current->comm); - force_sig(SIGFPE, current); - schedule(); -} - -#endif /* CONFIG_MATH_EMULATION */ - -/* - * NB. All these are "trap gates" (i.e. events_mask isn't set) except - * for those that specify |4 in the second field. - */ -static const trap_info_t __cpuinitconst trap_table[] = { - { 0, 0, __KERNEL_CS, (unsigned long)divide_error }, - { 1, 0|4, __KERNEL_CS, (unsigned long)debug }, - { 3, 3|4, __KERNEL_CS, (unsigned long)int3 }, - { 4, 3, __KERNEL_CS, (unsigned long)overflow }, - { 5, 0, __KERNEL_CS, (unsigned long)bounds }, - { 6, 0, __KERNEL_CS, (unsigned long)invalid_op }, - { 7, 0|4, __KERNEL_CS, (unsigned long)device_not_available }, - { 9, 0, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun }, - { 10, 0, __KERNEL_CS, (unsigned long)invalid_TSS }, - { 11, 0, __KERNEL_CS, (unsigned long)segment_not_present }, - { 12, 0, __KERNEL_CS, (unsigned long)stack_segment }, - { 13, 0, __KERNEL_CS, (unsigned long)general_protection }, - { 14, 0|4, __KERNEL_CS, (unsigned long)page_fault }, - { 15, 0, __KERNEL_CS, (unsigned long)fixup_4gb_segment }, - { 16, 0, __KERNEL_CS, (unsigned long)coprocessor_error }, - { 17, 0, __KERNEL_CS, (unsigned long)alignment_check }, -#ifdef CONFIG_X86_MCE - { 18, 0, __KERNEL_CS, (unsigned long)machine_check }, -#endif - { 19, 0, __KERNEL_CS, (unsigned long)simd_coprocessor_error }, - { SYSCALL_VECTOR, 3, __KERNEL_CS, (unsigned long)system_call }, - { 0, 0, 0, 0 } -}; - -void __init trap_init(void) -{ - int ret; - - ret = HYPERVISOR_set_trap_table(trap_table); - if (ret) - printk("HYPERVISOR_set_trap_table failed: error %d\n", ret); - - if (cpu_has_fxsr) { - printk(KERN_INFO "Enabling fast FPU save and restore... "); - set_in_cr4(X86_CR4_OSFXSR); - printk("done.\n"); - } - if (cpu_has_xmm) { - printk(KERN_INFO - "Enabling unmasked SIMD FPU exception support... "); - set_in_cr4(X86_CR4_OSXMMEXCPT); - printk("done.\n"); - } - - init_thread_xstate(); - /* - * Should be a barrier for any external CPU state: - */ - cpu_init(); -} - -void __cpuinit smp_trap_init(trap_info_t *trap_ctxt) -{ - const trap_info_t *t = trap_table; - - for (t = trap_table; t->address; t++) { - trap_ctxt[t->vector].flags = t->flags; - trap_ctxt[t->vector].cs = t->cs; - trap_ctxt[t->vector].address = t->address; - } -} - -static int __init kstack_setup(char *s) -{ - kstack_depth_to_print = simple_strtoul(s, NULL, 0); - - return 1; -} -__setup("kstack=", kstack_setup); - -static int __init code_bytes_setup(char *s) -{ - code_bytes = simple_strtoul(s, NULL, 0); - if (code_bytes > 8192) - code_bytes = 8192; - - return 1; -} -__setup("code_bytes=", code_bytes_setup); --- head-2011-03-17.orig/arch/x86/kernel/traps_64-xen.c 2011-02-01 14:38:38.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,1238 +0,0 @@ -/* - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs - * - * Pentium III FXSR, SSE support - * Gareth Hughes , May 2000 - */ - -/* - * 'Traps.c' handles hardware traps and faults after we have saved some - * state in 'entry.S'. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(CONFIG_EDAC) -#include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -int panic_on_unrecovered_nmi; -int kstack_depth_to_print = 12; -static unsigned int code_bytes = 64; -static int ignore_nmis; -static int die_counter; - -static inline void conditional_sti(struct pt_regs *regs) -{ - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); -} - -static inline void preempt_conditional_sti(struct pt_regs *regs) -{ - inc_preempt_count(); - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); -} - -static inline void preempt_conditional_cli(struct pt_regs *regs) -{ - if (regs->flags & X86_EFLAGS_IF) - local_irq_disable(); - /* Make sure to not schedule here because we could be running - on an exception stack. */ - dec_preempt_count(); -} - -void printk_address(unsigned long address, int reliable) -{ - printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address); -} - -static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, - unsigned *usedp, char **idp) -{ -#ifndef CONFIG_X86_NO_TSS - static char ids[][8] = { - [DEBUG_STACK - 1] = "#DB", - [NMI_STACK - 1] = "NMI", - [DOUBLEFAULT_STACK - 1] = "#DF", - [STACKFAULT_STACK - 1] = "#SS", - [MCE_STACK - 1] = "#MC", -#if DEBUG_STKSZ > EXCEPTION_STKSZ - [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" -#endif - }; - unsigned k; - - /* - * Iterate over all exception stacks, and figure out whether - * 'stack' is in one of them: - */ - for (k = 0; k < N_EXCEPTION_STACKS; k++) { - unsigned long end = per_cpu(orig_ist, cpu).ist[k]; - /* - * Is 'stack' above this exception frame's end? - * If yes then skip to the next frame. - */ - if (stack >= end) - continue; - /* - * Is 'stack' above this exception frame's start address? - * If yes then we found the right frame. - */ - if (stack >= end - EXCEPTION_STKSZ) { - /* - * Make sure we only iterate through an exception - * stack once. If it comes up for the second time - * then there's something wrong going on - just - * break out and return NULL: - */ - if (*usedp & (1U << k)) - break; - *usedp |= 1U << k; - *idp = ids[k]; - return (unsigned long *)end; - } - /* - * If this is a debug stack, and if it has a larger size than - * the usual exception stacks, then 'stack' might still - * be within the lower portion of the debug stack: - */ -#if DEBUG_STKSZ > EXCEPTION_STKSZ - if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) { - unsigned j = N_EXCEPTION_STACKS - 1; - - /* - * Black magic. A large debug stack is composed of - * multiple exception stack entries, which we - * iterate through now. Dont look: - */ - do { - ++j; - end -= EXCEPTION_STKSZ; - ids[j][4] = '1' + (j - N_EXCEPTION_STACKS); - } while (stack < end - EXCEPTION_STKSZ); - if (*usedp & (1U << j)) - break; - *usedp |= 1U << j; - *idp = ids[j]; - return (unsigned long *)end; - } -#endif - } -#endif - return NULL; -} - -/* - * x86-64 can have up to three kernel stacks: - * process stack - * interrupt stack - * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack - */ - -static inline int valid_stack_ptr(struct thread_info *tinfo, - void *p, unsigned int size, void *end) -{ - void *t = tinfo; - if (end) { - if (p < end && p >= (end-THREAD_SIZE)) - return 1; - else - return 0; - } - return p > t && p < t + THREAD_SIZE - size; -} - -/* The form of the top of the frame on the stack */ -struct stack_frame { - struct stack_frame *next_frame; - unsigned long return_address; -}; - -static inline unsigned long -print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - unsigned long *end) -{ - struct stack_frame *frame = (struct stack_frame *)bp; - - while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { - unsigned long addr; - - addr = *stack; - if (__kernel_text_address(addr)) { - if ((unsigned long) stack == bp + 8) { - ops->address(data, addr, 1); - frame = frame->next_frame; - bp = (unsigned long) frame; - } else { - ops->address(data, addr, bp == 0); - } - } - stack++; - } - return bp; -} - -void dump_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data) -{ - const unsigned cpu = get_cpu(); - unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr; - unsigned used = 0; - struct thread_info *tinfo; - - if (!task) - task = current; - - if (!stack) { - unsigned long dummy; - stack = &dummy; - if (task && task != current) - stack = (unsigned long *)task->thread.sp; - } - -#ifdef CONFIG_FRAME_POINTER - if (!bp) { - if (task == current) { - /* Grab bp right from our regs */ - asm("movq %%rbp, %0" : "=r" (bp) :); - } else { - /* bp is the last reg pushed by switch_to */ - bp = *(unsigned long *) task->thread.sp; - } - } -#endif - - /* - * Print function call entries in all stacks, starting at the - * current stack address. If the stacks consist of nested - * exceptions - */ - tinfo = task_thread_info(task); - for (;;) { - char *id; - unsigned long *estack_end; - estack_end = in_exception_stack(cpu, (unsigned long)stack, - &used, &id); - - if (estack_end) { - if (ops->stack(data, id) < 0) - break; - - bp = print_context_stack(tinfo, stack, bp, ops, - data, estack_end); - ops->stack(data, ""); - /* - * We link to the next stack via the - * second-to-last pointer (index -2 to end) in the - * exception stack: - */ - stack = (unsigned long *) estack_end[-2]; - continue; - } - if (irqstack_end) { - unsigned long *irqstack; - irqstack = irqstack_end - - (IRQSTACKSIZE - 64) / sizeof(*irqstack); - - if (stack >= irqstack && stack < irqstack_end) { - if (ops->stack(data, "IRQ") < 0) - break; - bp = print_context_stack(tinfo, stack, bp, - ops, data, irqstack_end); - /* - * We link to the next stack (which would be - * the process stack normally) the last - * pointer (index -1 to end) in the IRQ stack: - */ - stack = (unsigned long *) (irqstack_end[-1]); - irqstack_end = NULL; - ops->stack(data, "EOI"); - continue; - } - } - break; - } - - /* - * This handles the process stack: - */ - bp = print_context_stack(tinfo, stack, bp, ops, data, NULL); - put_cpu(); -} -EXPORT_SYMBOL(dump_trace); - -static void -print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - print_symbol(msg, symbol); - printk("\n"); -} - -static void print_trace_warning(void *data, char *msg) -{ - printk("%s\n", msg); -} - -static int print_trace_stack(void *data, char *name) -{ - printk(" <%s> ", name); - return 0; -} - -static void print_trace_address(void *data, unsigned long addr, int reliable) -{ - touch_nmi_watchdog(); - printk_address(addr, reliable); -} - -static const struct stacktrace_ops print_trace_ops = { - .warning = print_trace_warning, - .warning_symbol = print_trace_warning_symbol, - .stack = print_trace_stack, - .address = print_trace_address, -}; - -static void -show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl) -{ - printk("\nCall Trace:\n"); - dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); - printk("\n"); -} - -void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp) -{ - show_trace_log_lvl(task, regs, stack, bp, ""); -} - -static void -show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) -{ - unsigned long *stack; - int i; - const int cpu = smp_processor_id(); - unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr); - unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); - - // debugging aid: "show_stack(NULL, NULL);" prints the - // back trace for this cpu. - - if (sp == NULL) { - if (task) - sp = (unsigned long *)task->thread.sp; - else - sp = (unsigned long *)&sp; - } - - stack = sp; - for (i = 0; i < kstack_depth_to_print; i++) { - if (stack >= irqstack && stack <= irqstack_end) { - if (stack == irqstack_end) { - stack = (unsigned long *) (irqstack_end[-1]); - printk(" "); - } - } else { - if (((long) stack & (THREAD_SIZE-1)) == 0) - break; - } - if (i && ((i % 4) == 0)) - printk("\n"); - printk(" %016lx", *stack++); - touch_nmi_watchdog(); - } - show_trace_log_lvl(task, regs, sp, bp, log_lvl); -} - -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_log_lvl(task, NULL, sp, 0, ""); -} - -/* - * The architecture-independent dump_stack generator - */ -void dump_stack(void) -{ - unsigned long bp = 0; - unsigned long stack; - -#ifdef CONFIG_FRAME_POINTER - if (!bp) - asm("movq %%rbp, %0" : "=r" (bp):); -#endif - - printk("Pid: %d, comm: %.20s %s %s %.*s\n", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - show_trace(NULL, NULL, &stack, bp); -} - -EXPORT_SYMBOL(dump_stack); - -void show_registers(struct pt_regs *regs) -{ - int i; - unsigned long sp; - const int cpu = smp_processor_id(); - struct task_struct *cur = cpu_pda(cpu)->pcurrent; - - sp = regs->sp; - printk("CPU %d ", cpu); - __show_regs(regs); - printk("Process %s (pid: %d, threadinfo %p, task %p)\n", - cur->comm, cur->pid, task_thread_info(cur), cur); - - /* - * When in-kernel, we also print out the stack and code at the - * time of the fault.. - */ - if (!user_mode(regs)) { - unsigned int code_prologue = code_bytes * 43 / 64; - unsigned int code_len = code_bytes; - unsigned char c; - u8 *ip; - - printk("Stack: "); - show_stack_log_lvl(NULL, regs, (unsigned long *)sp, - regs->bp, ""); - printk("\n"); - - printk(KERN_EMERG "Code: "); - - ip = (u8 *)regs->ip - code_prologue; - if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { - /* try starting at RIP */ - ip = (u8 *)regs->ip; - code_len = code_len - code_prologue + 1; - } - for (i = 0; i < code_len; i++, ip++) { - if (ip < (u8 *)PAGE_OFFSET || - probe_kernel_address(ip, c)) { - printk(" Bad RIP value."); - break; - } - if (ip == (u8 *)regs->ip) - printk("<%02x> ", c); - else - printk("%02x ", c); - } - } - printk("\n"); -} - -int is_valid_bugaddr(unsigned long ip) -{ - unsigned short ud2; - - if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2))) - return 0; - - return ud2 == 0x0b0f; -} - -static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; -static int die_owner = -1; -static unsigned int die_nest_count; - -unsigned __kprobes long oops_begin(void) -{ - int cpu; - unsigned long flags; - - oops_enter(); - - /* racy, but better than risking deadlock. */ - raw_local_irq_save(flags); - cpu = smp_processor_id(); - if (!__raw_spin_trylock(&die_lock)) { - if (cpu == die_owner) - /* nested oops. should stop eventually */; - else - __raw_spin_lock(&die_lock); - } - die_nest_count++; - die_owner = cpu; - console_verbose(); - bust_spinlocks(1); - return flags; -} - -void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) -{ - die_owner = -1; - bust_spinlocks(0); - die_nest_count--; - if (!die_nest_count) - /* Nest count reaches zero, release the lock. */ - __raw_spin_unlock(&die_lock); - raw_local_irq_restore(flags); - if (!regs) { - oops_exit(); - return; - } - if (panic_on_oops) - panic("Fatal exception"); - oops_exit(); - do_exit(signr); -} - -int __kprobes __die(const char *str, struct pt_regs *regs, long err) -{ - printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP "); -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC"); -#endif - printk("\n"); - if (notify_die(DIE_OOPS, str, regs, err, - current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) - return 1; - - show_registers(regs); - add_taint(TAINT_DIE); - /* Executive summary in case the oops scrolled away */ - printk(KERN_ALERT "RIP "); - printk_address(regs->ip, 1); - printk(" RSP <%016lx>\n", regs->sp); - if (kexec_should_crash(current)) - crash_kexec(regs); - return 0; -} - -void die(const char *str, struct pt_regs *regs, long err) -{ - unsigned long flags = oops_begin(); - - if (!user_mode(regs)) - report_bug(regs->ip, regs); - - if (__die(str, regs, err)) - regs = NULL; - oops_end(flags, regs, SIGSEGV); -} - -#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_SYSCTL) -notrace __kprobes void -die_nmi(char *str, struct pt_regs *regs, int do_panic) -{ - unsigned long flags; - - if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) - return; - - flags = oops_begin(); - /* - * We are in trouble anyway, lets at least try - * to get a message out. - */ - printk(KERN_EMERG "%s", str); - printk(" on CPU%d, ip %08lx, registers:\n", - smp_processor_id(), regs->ip); - show_registers(regs); - if (kexec_should_crash(current)) - crash_kexec(regs); - if (do_panic || panic_on_oops) - panic("Non maskable interrupt"); - oops_end(flags, NULL, SIGBUS); - nmi_exit(); - local_irq_enable(); - do_exit(SIGBUS); -} -#endif - -static void __kprobes -do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, - long error_code, siginfo_t *info) -{ - struct task_struct *tsk = current; - - if (!user_mode(regs)) - goto kernel_trap; - - /* - * We want error_code and trap_no set for userspace faults and - * kernelspace faults which result in die(), but not - * kernelspace faults which are fixed up. die() gives the - * process no chance to handle the signal and notice the - * kernel fault information, so that won't result in polluting - * the information about previously queued, but not yet - * delivered, faults. See also do_general_protection below. - */ - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; - - if (show_unhandled_signals && unhandled_signal(tsk, signr) && - printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] trap %s ip:%lx sp:%lx error:%lx", - tsk->comm, tsk->pid, str, - regs->ip, regs->sp, error_code); - print_vma_addr(" in ", regs->ip); - printk("\n"); - } - - if (info) - force_sig_info(signr, info, tsk); - else - force_sig(signr, tsk); - return; - -kernel_trap: - if (!fixup_exception(regs)) { - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; - die(str, regs, error_code); - } - return; -} - -#define DO_ERROR(trapnr, signr, str, name) \ -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ -{ \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - conditional_sti(regs); \ - do_trap(trapnr, signr, str, regs, error_code, NULL); \ -} - -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ -{ \ - siginfo_t info; \ - info.si_signo = signr; \ - info.si_errno = 0; \ - info.si_code = sicode; \ - info.si_addr = (void __user *)siaddr; \ - trace_hardirqs_fixup(); \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - conditional_sti(regs); \ - do_trap(trapnr, signr, str, regs, error_code, &info); \ -} - -DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) -DO_ERROR(4, SIGSEGV, "overflow", overflow) -DO_ERROR(5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) -DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) -DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) -DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) - -/* Runs on IST stack */ -asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code) -{ - if (notify_die(DIE_TRAP, "stack segment", regs, error_code, - 12, SIGBUS) == NOTIFY_STOP) - return; - preempt_conditional_sti(regs); - do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); - preempt_conditional_cli(regs); -} - -asmlinkage void do_double_fault(struct pt_regs * regs, long error_code) -{ - static const char str[] = "double fault"; - struct task_struct *tsk = current; - - /* Return not checked because double check cannot be ignored */ - notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV); - - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 8; - - /* This is always a kernel trap and never fixable (and thus must - never return). */ - for (;;) - die(str, regs, error_code); -} - -asmlinkage void __kprobes -do_general_protection(struct pt_regs *regs, long error_code) -{ - struct task_struct *tsk; - - conditional_sti(regs); - - tsk = current; - if (!user_mode(regs)) - goto gp_in_kernel; - - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] general protection ip:%lx sp:%lx error:%lx", - tsk->comm, tsk->pid, - regs->ip, regs->sp, error_code); - print_vma_addr(" in ", regs->ip); - printk("\n"); - } - - force_sig(SIGSEGV, tsk); - return; - -gp_in_kernel: - if (fixup_exception(regs)) - return; - - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - if (notify_die(DIE_GPF, "general protection fault", regs, - error_code, 13, SIGSEGV) == NOTIFY_STOP) - return; - die("general protection fault", regs, error_code); -} - -static notrace __kprobes void -mem_parity_error(unsigned char reason, struct pt_regs *regs) -{ - printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", - reason); - printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); - -#if defined(CONFIG_EDAC) - if (edac_handler_set()) { - edac_atomic_assert_error(); - return; - } -#endif - - if (panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); - - printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); - - /* Clear and disable the memory parity error line. */ - clear_mem_error(reason); -} - -static notrace __kprobes void -io_check_error(unsigned char reason, struct pt_regs *regs) -{ - printk("NMI: IOCK error (debug interrupt?)\n"); - show_registers(regs); - - /* Re-enable the IOCK line, wait for a few seconds */ - clear_io_check_error(reason); -} - -static notrace __kprobes void -unknown_nmi_error(unsigned char reason, struct pt_regs * regs) -{ - if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) - return; - printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", - reason); - printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); - - if (panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); - - printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); -} - -/* Runs on IST stack. This code must keep interrupts off all the time. - Nested NMIs are prevented by the CPU. */ -asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs) -{ - unsigned char reason = 0; - int cpu; - - cpu = smp_processor_id(); - - /* Only the BSP gets external NMIs from the system. */ - if (!cpu) - reason = get_nmi_reason(); - - if (!(reason & 0xc0)) { - if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) - return; -#ifdef CONFIG_X86_LOCAL_APIC - /* - * Ok, so this is none of the documented NMI sources, - * so it must be the NMI watchdog. - */ - if (nmi_watchdog_tick(regs, reason)) - return; -#endif - if (!do_nmi_callback(regs, cpu)) - unknown_nmi_error(reason, regs); - - return; - } - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) - return; - - /* AK: following checks seem to be broken on modern chipsets. FIXME */ - if (reason & 0x80) - mem_parity_error(reason, regs); - if (reason & 0x40) - io_check_error(reason, regs); -} - -asmlinkage notrace __kprobes void -do_nmi(struct pt_regs *regs, long error_code) -{ - nmi_enter(); - - add_pda(__nmi_count, 1); - - if (!ignore_nmis) - default_do_nmi(regs); - - nmi_exit(); -} - -void stop_nmi(void) -{ - acpi_nmi_disable(); - ignore_nmis++; -} - -void restart_nmi(void) -{ - ignore_nmis--; - acpi_nmi_enable(); -} - -/* runs on IST stack. */ -asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code) -{ - trace_hardirqs_fixup(); - - if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) - == NOTIFY_STOP) - return; - - preempt_conditional_sti(regs); - do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); - preempt_conditional_cli(regs); -} - -/* Help handler running on IST stack to switch back to user stack - for scheduling or signal handling. The actual stack switch is done in - entry.S */ -asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) -{ - struct pt_regs *regs = eregs; - /* Did already sync */ - if (eregs == (struct pt_regs *)eregs->sp) - ; - /* Exception from user space */ - else if (user_mode(eregs)) - regs = task_pt_regs(current); - /* Exception from kernel and interrupts are enabled. Move to - kernel process stack. */ - else if (eregs->flags & X86_EFLAGS_IF) - regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); - if (eregs != regs) - *regs = *eregs; - return regs; -} - -/* runs on IST stack. */ -asmlinkage void __kprobes do_debug(struct pt_regs * regs, - unsigned long error_code) -{ - struct task_struct *tsk = current; - unsigned long condition; - siginfo_t info; - - trace_hardirqs_fixup(); - - get_debugreg(condition, 6); - - /* - * The processor cleared BTF, so don't mark that we need it set. - */ - clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); - tsk->thread.debugctlmsr = 0; - - if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, - SIGTRAP) == NOTIFY_STOP) - return; - - preempt_conditional_sti(regs); - - /* Mask out spurious debug traps due to lazy DR7 setting */ - if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { - if (!tsk->thread.debugreg7) - goto clear_dr7; - } - - tsk->thread.debugreg6 = condition; - - /* - * Single-stepping through TF: make sure we ignore any events in - * kernel space (but re-enable TF when returning to user mode). - */ - if (condition & DR_STEP) { - if (!user_mode(regs)) - goto clear_TF_reenable; - } - - /* Ok, finally something we can handle */ - tsk->thread.trap_no = 1; - tsk->thread.error_code = error_code; - info.si_signo = SIGTRAP; - info.si_errno = 0; - info.si_code = TRAP_BRKPT; - info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL; - force_sig_info(SIGTRAP, &info, tsk); - -clear_dr7: - set_debugreg(0, 7); - preempt_conditional_cli(regs); - return; - -clear_TF_reenable: - set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - regs->flags &= ~X86_EFLAGS_TF; - preempt_conditional_cli(regs); - return; -} - -static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) -{ - if (fixup_exception(regs)) - return 1; - - notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); - /* Illegal floating point operation in the kernel */ - current->thread.trap_no = trapnr; - die(str, regs, 0); - return 0; -} - -/* - * Note that we play around with the 'TS' bit in an attempt to get - * the correct behaviour even in the presence of the asynchronous - * IRQ13 behaviour - */ -asmlinkage void do_coprocessor_error(struct pt_regs *regs) -{ - void __user *ip = (void __user *)(regs->ip); - struct task_struct *task; - siginfo_t info; - unsigned short cwd, swd; - - conditional_sti(regs); - if (!user_mode(regs) && - kernel_math_error(regs, "kernel x87 math error", 16)) - return; - - /* - * Save the info for the exception handler and clear the error. - */ - task = current; - save_init_fpu(task); - task->thread.trap_no = 16; - task->thread.error_code = 0; - info.si_signo = SIGFPE; - info.si_errno = 0; - info.si_code = __SI_FAULT; - info.si_addr = ip; - /* - * (~cwd & swd) will mask out exceptions that are not set to unmasked - * status. 0x3f is the exception bits in these regs, 0x200 is the - * C1 reg you need in case of a stack fault, 0x040 is the stack - * fault bit. We should only be taking one exception at a time, - * so if this combination doesn't produce any single exception, - * then we have a bad program that isn't synchronizing its FPU usage - * and it will suffer the consequences since we won't be able to - * fully reproduce the context of the exception - */ - cwd = get_fpu_cwd(task); - swd = get_fpu_swd(task); - switch (swd & ~cwd & 0x3f) { - case 0x000: /* No unmasked exception */ - default: /* Multiple exceptions */ - break; - case 0x001: /* Invalid Op */ - /* - * swd & 0x240 == 0x040: Stack Underflow - * swd & 0x240 == 0x240: Stack Overflow - * User must clear the SF bit (0x40) if set - */ - info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ - info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ - info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ - info.si_code = FPE_FLTRES; - break; - } - force_sig_info(SIGFPE, &info, task); -} - -asmlinkage void bad_intr(void) -{ - printk("bad interrupt"); -} - -asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) -{ - void __user *ip = (void __user *)(regs->ip); - struct task_struct *task; - siginfo_t info; - unsigned short mxcsr; - - conditional_sti(regs); - if (!user_mode(regs) && - kernel_math_error(regs, "kernel simd math error", 19)) - return; - - /* - * Save the info for the exception handler and clear the error. - */ - task = current; - save_init_fpu(task); - task->thread.trap_no = 19; - task->thread.error_code = 0; - info.si_signo = SIGFPE; - info.si_errno = 0; - info.si_code = __SI_FAULT; - info.si_addr = ip; - /* - * The SIMD FPU exceptions are handled a little differently, as there - * is only a single status/control register. Thus, to determine which - * unmasked exception was caught we must mask the exception mask bits - * at 0x1f80, and then use these to mask the exception bits at 0x3f. - */ - mxcsr = get_fpu_mxcsr(task); - switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { - case 0x000: - default: - break; - case 0x001: /* Invalid Op */ - info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ - info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ - info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ - info.si_code = FPE_FLTRES; - break; - } - force_sig_info(SIGFPE, &info, task); -} - -asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs) -{ -} - -#if 0 -asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) -{ -} -#endif - -asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) -{ -} - -/* - * 'math_state_restore()' saves the current math information in the - * old math state array, and gets the new ones from the current task - * - * Careful.. There are problems with IBM-designed IRQ13 behaviour. - * Don't touch unless you *really* know how it works. - */ -asmlinkage void math_state_restore(void) -{ - struct task_struct *me = current; - - if (!used_math()) { - local_irq_enable(); - /* - * does a slab alloc which can sleep - */ - if (init_fpu(me)) { - /* - * ran out of memory! - */ - do_group_exit(SIGKILL); - return; - } - local_irq_disable(); - } - - /* clts(); */ /* 'clts' is done for us by Xen during virtual trap. */ - - /* - * Paranoid restore. send a SIGSEGV if we fail to restore the state. - */ - if (unlikely(restore_fpu_checking(&me->thread.xstate->fxsave))) { - stts(); - force_sig(SIGSEGV, me); - return; - } - task_thread_info(me)->status |= TS_USEDFPU; - me->fpu_counter++; -} -EXPORT_SYMBOL_GPL(math_state_restore); - - -/* - * NB. All these are "interrupt gates" (i.e. events_mask is set) because we - * specify |4 in the second field. - */ -static const trap_info_t __cpuinitconst trap_table[] = { - { 0, 0|4, __KERNEL_CS, (unsigned long)divide_error }, - { 1, 0|4, __KERNEL_CS, (unsigned long)debug }, - { 3, 3|4, __KERNEL_CS, (unsigned long)int3 }, - { 4, 3|4, __KERNEL_CS, (unsigned long)overflow }, - { 5, 0|4, __KERNEL_CS, (unsigned long)bounds }, - { 6, 0|4, __KERNEL_CS, (unsigned long)invalid_op }, - { 7, 0|4, __KERNEL_CS, (unsigned long)device_not_available }, - { 9, 0|4, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun}, - { 10, 0|4, __KERNEL_CS, (unsigned long)invalid_TSS }, - { 11, 0|4, __KERNEL_CS, (unsigned long)segment_not_present }, - { 12, 0|4, __KERNEL_CS, (unsigned long)stack_segment }, - { 13, 0|4, __KERNEL_CS, (unsigned long)general_protection }, - { 14, 0|4, __KERNEL_CS, (unsigned long)page_fault }, - { 15, 0|4, __KERNEL_CS, (unsigned long)spurious_interrupt_bug }, - { 16, 0|4, __KERNEL_CS, (unsigned long)coprocessor_error }, - { 17, 0|4, __KERNEL_CS, (unsigned long)alignment_check }, -#ifdef CONFIG_X86_MCE - { 18, 0|4, __KERNEL_CS, (unsigned long)machine_check }, -#endif - { 19, 0|4, __KERNEL_CS, (unsigned long)simd_coprocessor_error }, -#ifdef CONFIG_IA32_EMULATION - { IA32_SYSCALL_VECTOR, 3, __KERNEL_CS, (unsigned long)ia32_syscall}, -#endif - { 0, 0, 0, 0 } -}; - -void __init trap_init(void) -{ - int ret; - - ret = HYPERVISOR_set_trap_table(trap_table); - if (ret) - printk("HYPERVISOR_set_trap_table failed: error %d\n", ret); - /* - * initialize the per thread extended state: - */ - init_thread_xstate(); - /* - * Should be a barrier for any external CPU state: - */ - cpu_init(); -} - -void __cpuinit smp_trap_init(trap_info_t *trap_ctxt) -{ - const trap_info_t *t = trap_table; - - for (t = trap_table; t->address; t++) { - trap_ctxt[t->vector].flags = t->flags; - trap_ctxt[t->vector].cs = t->cs; - trap_ctxt[t->vector].address = t->address; - } -} - -static int __init oops_setup(char *s) -{ - if (!s) - return -EINVAL; - if (!strcmp(s, "panic")) - panic_on_oops = 1; - return 0; -} -early_param("oops", oops_setup); - -static int __init kstack_setup(char *s) -{ - if (!s) - return -EINVAL; - kstack_depth_to_print = simple_strtoul(s, NULL, 0); - return 0; -} -early_param("kstack", kstack_setup); - -static int __init code_bytes_setup(char *s) -{ - code_bytes = simple_strtoul(s, NULL, 0); - if (code_bytes > 8192) - code_bytes = 8192; - - return 1; -} -__setup("code_bytes=", code_bytes_setup); --- head-2011-03-17.orig/arch/x86/mm/dump_pagetables-xen.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/dump_pagetables-xen.c 2011-02-01 14:39:24.000000000 +0100 @@ -160,8 +160,8 @@ static void note_page(struct seq_file *m * we have now. "break" is either changing perms, levels or * address space marker. */ - prot = pgprot_val(new_prot) & ~(PTE_PFN_MASK); - cur = pgprot_val(st->current_prot) & ~(PTE_PFN_MASK); + prot = pgprot_val(new_prot) & PTE_FLAGS_MASK; + cur = pgprot_val(st->current_prot) & PTE_FLAGS_MASK; if (!st->level) { /* First entry */ --- head-2011-03-17.orig/arch/x86/mm/fault-xen.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/fault-xen.c 2011-02-01 14:39:24.000000000 +0100 @@ -35,6 +35,7 @@ #include #include #include +#include /* * Page fault error code bits @@ -370,8 +371,6 @@ static int is_errata100(struct pt_regs * return 0; } -void do_invalid_op(struct pt_regs *, unsigned long); - static int is_f00f_bug(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_F00F_BUG @@ -609,11 +608,6 @@ void __kprobes do_page_fault(struct pt_r unsigned long flags; #endif - /* - * We can fault from pretty much anywhere, with unknown IRQ state. - */ - trace_hardirqs_fixup(); - /* Set the "privileged fault" bit to something sane. */ if (user_mode_vm(regs)) error_code |= PF_USER; @@ -677,24 +671,23 @@ void __kprobes do_page_fault(struct pt_r } -#ifdef CONFIG_X86_32 - /* It's safe to allow irq's after cr2 has been saved and the vmalloc - fault has been handled. */ - if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK)) - local_irq_enable(); - /* - * If we're in an interrupt, have no user context or are running in an - * atomic region then we must not take the fault. + * It's safe to allow irq's after cr2 has been saved and the + * vmalloc fault has been handled. + * + * User-mode registers count as a user access even for any + * potential system fault or CPU buglet. */ - if (in_atomic() || !mm) - goto bad_area_nosemaphore; -#else /* CONFIG_X86_64 */ - if (likely(regs->flags & X86_EFLAGS_IF)) + if (user_mode_vm(regs)) { + local_irq_enable(); + error_code |= PF_USER; + } else if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); +#ifdef CONFIG_X86_64 if (unlikely(error_code & PF_RSVD)) pgtable_bad(address, regs, error_code); +#endif /* * If we're in an interrupt, have no user context or are running in an @@ -703,15 +696,9 @@ void __kprobes do_page_fault(struct pt_r if (unlikely(in_atomic() || !mm)) goto bad_area_nosemaphore; - /* - * User-mode registers count as a user access even for any - * potential system fault or CPU buglet. - */ - if (user_mode_vm(regs)) - error_code |= PF_USER; again: -#endif - /* When running in the kernel we expect faults to occur only to + /* + * When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the * kernel and should generate an OOPS. Unfortunately, in the case of an * erroneous fault occurring in a code path which already holds mmap_sem @@ -774,9 +761,6 @@ good_area: goto bad_area; } -#ifdef CONFIG_X86_32 -survive: -#endif /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo @@ -911,12 +895,11 @@ out_of_memory: up_read(&mm->mmap_sem); if (is_global_init(tsk)) { yield(); -#ifdef CONFIG_X86_32 - down_read(&mm->mmap_sem); - goto survive; -#else + /* + * Re-lookup the vma - in theory the vma tree might + * have changed: + */ goto again; -#endif } printk("VM: killing process %s\n", tsk->comm); @@ -949,14 +932,15 @@ LIST_HEAD(pgd_list); void vmalloc_sync_all(void) { -#ifdef CONFIG_X86_32 - unsigned long address = VMALLOC_START & PGDIR_MASK; + unsigned long address; +#ifdef CONFIG_X86_32 if (SHARED_KERNEL_PMD) return; - BUILD_BUG_ON(TASK_SIZE & ~PMD_MASK); - for (; address < hypervisor_virt_start; address += PMD_SIZE) { + for (address = VMALLOC_START & PMD_MASK; + address >= TASK_SIZE && address < FIXADDR_TOP; + address += PMD_SIZE) { unsigned long flags; struct page *page; @@ -974,10 +958,8 @@ void vmalloc_sync_all(void) spin_unlock_irqrestore(&pgd_lock, flags); } #else /* CONFIG_X86_64 */ - unsigned long start = VMALLOC_START & PGDIR_MASK; - unsigned long address; - - for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { + for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; + address += PGDIR_SIZE) { const pgd_t *pgd_ref = pgd_offset_k(address); unsigned long flags; struct page *page; --- head-2011-03-17.orig/arch/x86/mm/highmem_32-xen.c 2011-01-31 18:07:35.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/highmem_32-xen.c 2011-02-01 14:39:24.000000000 +0100 @@ -137,6 +137,7 @@ void *kmap_atomic_pfn(unsigned long pfn, return (void*) vaddr; } +EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */ struct page *kmap_atomic_to_page(void *ptr) { --- head-2011-03-17.orig/arch/x86/mm/init_32-xen.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/init_32-xen.c 2011-02-01 14:39:24.000000000 +0100 @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -51,6 +52,7 @@ #include #include #include +#include unsigned int __VMALLOC_RESERVE = 128 << 20; @@ -206,11 +208,32 @@ static void __init kernel_physical_mappi pgd_t *pgd; pmd_t *pmd; pte_t *pte; - unsigned pages_2m = 0, pages_4k = 0; + unsigned pages_2m, pages_4k; + int mapping_iter; - if (!cpu_has_pse) + /* + * First iteration will setup identity mapping using large/small pages + * based on use_pse, with other attributes same as set by + * the early code in head_32.S + * + * Second iteration will setup the appropriate attributes (NX, GLOBAL..) + * as desired for the kernel identity mapping. + * + * This two pass mechanism conforms to the TLB app note which says: + * + * "Software should not write to a paging-structure entry in a way + * that would change, for any linear address, both the page size + * and either the page frame or attributes." + */ + mapping_iter = 1; + + if (!cpu_has_pse) { use_pse = 0; + mapping_iter = 0; + } +repeat: + pages_2m = pages_4k = 0; pfn = start_pfn; pgd_idx = pgd_index((pfn<= xen_start_info->nr_pages || pte_present(*pte)) @@ -279,12 +317,34 @@ static void __init kernel_physical_mappi prot = PAGE_KERNEL_EXEC; pages_4k++; - set_pte(pte, pfn_pte(pfn, prot)); + if (mapping_iter == 1) + set_pte(pte, pfn_pte(pfn, init_prot)); + else + set_pte(pte, pfn_pte(pfn, prot)); } } } - update_page_count(PG_LEVEL_2M, pages_2m); - update_page_count(PG_LEVEL_4K, pages_4k); + if (mapping_iter <= 1) { + /* + * update direct mapping page count only in the first + * iteration. + */ + update_page_count(PG_LEVEL_2M, pages_2m); + update_page_count(PG_LEVEL_4K, pages_4k); + } + if (mapping_iter == 1) { + /* + * local global flush tlb, which will flush the previous + * mappings present in both small and large page TLB's. + */ + __flush_tlb_all(); + + /* + * Second iteration will set the actual desired PTE attributes. + */ + mapping_iter = 2; + goto repeat; + } } /* @@ -306,7 +366,6 @@ int devmem_is_allowed(unsigned long page return 0; } -#ifdef CONFIG_HIGHMEM pte_t *kmap_pte; pgprot_t kmap_prot; @@ -329,6 +388,7 @@ static void __init kmap_init(void) kmap_prot = PAGE_KERNEL; } +#ifdef CONFIG_HIGHMEM static void __init permanent_kmaps_init(pgd_t *pgd_base) { unsigned long vaddr; @@ -416,7 +476,6 @@ static void __init set_highmem_pages_ini #endif /* !CONFIG_NUMA */ #else -# define kmap_init() do { } while (0) # define permanent_kmaps_init(pgd_base) do { } while (0) # define set_highmem_pages_init() do { } while (0) #endif /* CONFIG_HIGHMEM */ @@ -775,7 +834,7 @@ static unsigned long __init extend_init_ return start_pfn; } -static void __init find_early_table_space(unsigned long end) +static void __init find_early_table_space(unsigned long end, int use_pse) { unsigned long puds, pmds, ptes, tables; @@ -785,7 +844,7 @@ static void __init find_early_table_spac pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; tables += PAGE_ALIGN(pmds * sizeof(pmd_t)); - if (cpu_has_pse) { + if (use_pse) { unsigned long extra; extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); @@ -818,12 +877,22 @@ unsigned long __init_refok init_memory_m pgd_t *pgd_base = swapper_pg_dir; unsigned long start_pfn, end_pfn; unsigned long big_page_start; +#ifdef CONFIG_DEBUG_PAGEALLOC + /* + * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. + * This will simplify cpa(), which otherwise needs to support splitting + * large pages into small in interrupt context, etc. + */ + int use_pse = 0; +#else + int use_pse = cpu_has_pse; +#endif /* * Find space for the kernel direct mapping tables. */ if (!after_init_bootmem) - find_early_table_space(end); + find_early_table_space(end, use_pse); #ifdef CONFIG_X86_PAE set_nx(); @@ -869,7 +938,7 @@ unsigned long __init_refok init_memory_m end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); if (start_pfn < end_pfn) kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, - cpu_has_pse); + use_pse); /* tail is not big page alignment ? */ start_pfn = end_pfn; @@ -954,6 +1023,8 @@ void __init mem_init(void) pci_iommu_alloc(); + start_periodic_check_for_corruption(); + #ifdef CONFIG_FLATMEM BUG_ON(!mem_map); #endif @@ -1038,7 +1109,6 @@ void __init mem_init(void) if (boot_cpu_data.wp_works_ok < 0) test_wp_bit(); - cpa_init(); save_pg_dir(); zap_low_mappings(); --- head-2011-03-17.orig/arch/x86/mm/init_64-xen.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/init_64-xen.c 2011-02-01 14:39:24.000000000 +0100 @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -157,6 +158,62 @@ static unsigned long __meminitdata table static unsigned long __meminitdata table_cur; static unsigned long __meminitdata table_top; +pteval_t __supported_pte_mask __read_mostly = ~0UL; +EXPORT_SYMBOL_GPL(__supported_pte_mask); + +static int do_not_nx __cpuinitdata; + +/* + * noexec=on|off + * Control non-executable mappings for 64-bit processes. + * + * on Enable (default) + * off Disable + */ +static int __init nonx_setup(char *str) +{ + if (!str) + return -EINVAL; + if (!strncmp(str, "on", 2)) { + __supported_pte_mask |= _PAGE_NX; + do_not_nx = 0; + } else if (!strncmp(str, "off", 3)) { + do_not_nx = 1; + __supported_pte_mask &= ~_PAGE_NX; + } + return 0; +} +early_param("noexec", nonx_setup); + +void __cpuinit check_efer(void) +{ + unsigned long efer; + + rdmsrl(MSR_EFER, efer); + if (!(efer & EFER_NX) || do_not_nx) + __supported_pte_mask &= ~_PAGE_NX; +} + +int force_personality32; + +/* + * noexec32=on|off + * Control non executable heap for 32bit processes. + * To control the stack too use noexec=off + * + * on PROT_READ does not imply PROT_EXEC for 32-bit processes (default) + * off PROT_READ implies PROT_EXEC + */ +static int __init nonx32_setup(char *str) +{ + if (!strcmp(str, "on")) + force_personality32 &= ~READ_IMPLIES_EXEC; + else if (!strcmp(str, "off")) + force_personality32 |= READ_IMPLIES_EXEC; + return 1; +} +__setup("noexec32=", nonx32_setup); + /* * NOTE: This function is marked __ref because it calls __init function * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. @@ -214,14 +271,6 @@ set_pte_vaddr_pud(pud_t *pud_page, unsig } pte = pte_offset_kernel(pmd, vaddr); - if (!pte_none(*pte) && __pte_val(new_pte) && -#ifdef CONFIG_ACPI - /* __acpi_map_table() fails to properly call clear_fixmap() */ - (vaddr < __fix_to_virt(FIX_ACPI_END) || - vaddr > __fix_to_virt(FIX_ACPI_BEGIN)) && -#endif - __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask)) - pte_ERROR(*pte); set_pte(pte, new_pte); /* @@ -306,7 +355,7 @@ void __init init_extra_mapping_uc(unsign void __init cleanup_highmap(void) { unsigned long vaddr = __START_KERNEL_map; - unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1; + unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1; pmd_t *pmd = level2_kernel_pgt; pmd_t *last_pmd = pmd + PTRS_PER_PMD; @@ -336,7 +385,7 @@ static __ref void *alloc_low_page(unsign if (pfn >= table_top) panic("alloc_low_page: ran out of memory"); - adr = early_ioremap(pfn_to_mfn(pfn) * PAGE_SIZE, PAGE_SIZE); + adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); memset(adr, 0, PAGE_SIZE); *phys = pfn * PAGE_SIZE; return adr; @@ -382,7 +431,8 @@ static inline int __meminit make_readonl } static unsigned long __meminit -phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end) +phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, + pgprot_t prot) { unsigned pages = 0; unsigned long last_map_addr = end; @@ -391,49 +441,58 @@ phys_pte_init(pte_t *pte_page, unsigned pte_t *pte = pte_page + pte_index(addr); for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) { - unsigned long pteval = addr | __PAGE_KERNEL; + unsigned long pteval = addr | pgprot_val(prot); if (addr >= end || (!after_bootmem && (addr >> PAGE_SHIFT) >= xen_start_info->nr_pages)) break; - if (__pte_val(*pte)) + /* + * We will re-use the existing mapping. + * Xen for example has some special requirements, like mapping + * pagetable pages as RO. So assume someone who pre-setup + * these mappings are more intelligent. + */ + if (__pte_val(*pte)) { + pages++; continue; + } if (make_readonly(addr)) pteval &= ~_PAGE_RW; if (0) printk(" pte=%p addr=%lx pte=%016lx\n", pte, addr, pteval); + pages++; if (!after_bootmem) *pte = __pte(pteval & __supported_pte_mask); else set_pte(pte, __pte(pteval & __supported_pte_mask)); last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE; - pages++; } + update_page_count(PG_LEVEL_4K, pages); return last_map_addr; } static unsigned long __meminit -phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end) +phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end, + pgprot_t prot) { pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd); BUG_ON(!max_pfn_mapped); - return phys_pte_init(pte, address, end); + return phys_pte_init(pte, address, end, prot); } static unsigned long __meminit phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, - unsigned long page_size_mask) + unsigned long page_size_mask, pgprot_t prot) { unsigned long pages = 0; unsigned long last_map_addr = end; - unsigned long start = address; int i = pmd_index(address); @@ -441,6 +500,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned unsigned long pte_phys; pmd_t *pmd = pmd_page + pmd_index(address); pte_t *pte; + pgprot_t new_prot = prot; if (address >= end) break; @@ -449,27 +509,42 @@ phys_pmd_init(pmd_t *pmd_page, unsigned if (!pmd_large(*pmd)) { spin_lock(&init_mm.page_table_lock); last_map_addr = phys_pte_update(pmd, address, - end); + end, prot); spin_unlock(&init_mm.page_table_lock); + continue; } - /* Count entries we're using from level2_ident_pgt */ - if (start == 0) + /* + * If we are ok with PG_LEVEL_2M mapping, then we will + * use the existing mapping, + * + * Otherwise, we will split the large page mapping but + * use the same existing protection bits except for + * large page, so that we don't violate Intel's TLB + * Application note (317080) which says, while changing + * the page sizes, new and old translations should + * not differ with respect to page frame and + * attributes. + */ + if (page_size_mask & (1 << PG_LEVEL_2M)) { pages++; - continue; + continue; + } + new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd)); } if (page_size_mask & (1<> PAGE_SHIFT, PAGE_KERNEL_LARGE)); + pfn_pte(address >> PAGE_SHIFT, + __pgprot(pgprot_val(prot) | _PAGE_PSE))); spin_unlock(&init_mm.page_table_lock); last_map_addr = (address & PMD_MASK) + PMD_SIZE; continue; } pte = alloc_low_page(&pte_phys); - last_map_addr = phys_pte_init(pte, address, end); + last_map_addr = phys_pte_init(pte, address, end, new_prot); unmap_low_page(pte); if (!after_bootmem) { @@ -490,13 +565,13 @@ phys_pmd_init(pmd_t *pmd_page, unsigned static unsigned long __meminit phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end, - unsigned long page_size_mask) + unsigned long page_size_mask, pgprot_t prot) { pmd_t *pmd = pmd_offset(pud, 0); unsigned long last_map_addr; BUG_ON(!max_pfn_mapped); - last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask); + last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot); __flush_tlb_all(); return last_map_addr; } @@ -513,15 +588,34 @@ phys_pud_init(pud_t *pud_page, unsigned unsigned long pmd_phys; pud_t *pud = pud_page + pud_index(addr); pmd_t *pmd; + pgprot_t prot = PAGE_KERNEL; if (addr >= end) break; if (__pud_val(*pud)) { - if (!pud_large(*pud)) + if (!pud_large(*pud)) { last_map_addr = phys_pmd_update(pud, addr, end, - page_size_mask); - continue; + page_size_mask, prot); + continue; + } + /* + * If we are ok with PG_LEVEL_1G mapping, then we will + * use the existing mapping. + * + * Otherwise, we will split the gbpage mapping but use + * the same existing protection bits except for large + * page, so that we don't violate Intel's TLB + * Application note (317080) which says, while changing + * the page sizes, new and old translations should + * not differ with respect to page frame and + * attributes. + */ + if (page_size_mask & (1 << PG_LEVEL_1G)) { + pages++; + continue; + } + prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud)); } if (page_size_mask & (1<> PAGE_SHIFT); @@ -825,11 +923,13 @@ unsigned long __init_refok init_memory_m unsigned long last_map_addr = 0; unsigned long page_size_mask = 0; unsigned long start_pfn, end_pfn; + unsigned long pos; struct map_range mr[NR_RANGE_MR]; int nr_range, i; + int use_pse, use_gbpages; - printk(KERN_INFO "init_memory_mapping\n"); + printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end); /* * Find space for the kernel direct mapping tables. @@ -841,9 +941,21 @@ unsigned long __init_refok init_memory_m if (!after_bootmem) init_gbpages(); - if (direct_gbpages) +#ifdef CONFIG_DEBUG_PAGEALLOC + /* + * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. + * This will simplify cpa(), which otherwise needs to support splitting + * large pages into small in interrupt context, etc. + */ + use_pse = use_gbpages = 0; +#else + use_pse = cpu_has_pse; + use_gbpages = direct_gbpages; +#endif + + if (use_gbpages) page_size_mask |= 1 << PG_LEVEL_1G; - if (cpu_has_pse) + if (use_pse) page_size_mask |= 1 << PG_LEVEL_2M; memset(mr, 0, sizeof(mr)); @@ -851,35 +963,50 @@ unsigned long __init_refok init_memory_m /* head if not big page alignment ?*/ start_pfn = start >> PAGE_SHIFT; - end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT) + pos = start_pfn << PAGE_SHIFT; + end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); + pos = end_pfn << PAGE_SHIFT; + } /* big page (2M) range*/ - start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT) + start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); - end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT) + end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); - if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT))) - end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)); - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, - page_size_mask & (1< ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT))) + end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)); + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & (1<>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) + << (PUD_SHIFT - PAGE_SHIFT); + end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & ((1<>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, - page_size_mask & (1<>PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); + end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & (1<>PAGE_SHIFT; end_pfn = end>>PAGE_SHIFT; nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); @@ -904,7 +1031,7 @@ unsigned long __init_refok init_memory_m (mr[i].page_size_mask & (1<> PAGE_SHIFT; int ret; - last_mapped_pfn = init_memory_mapping(start, start + size-1); + last_mapped_pfn = init_memory_mapping(start, start + size); if (last_mapped_pfn > max_pfn_mapped) max_pfn_mapped = last_mapped_pfn; ret = __add_pages(zone, start_pfn, nr_pages); - WARN_ON(1); + WARN_ON_ONCE(ret); return ret; } @@ -1062,8 +1189,11 @@ static struct kcore_list kcore_mem, kcor void __init mem_init(void) { long codesize, reservedpages, datasize, initsize; + unsigned long absent_pages; unsigned long pfn; + start_periodic_check_for_corruption(); + pci_iommu_alloc(); /* clear_bss() already clear the empty_zero_page */ @@ -1076,13 +1206,15 @@ void __init mem_init(void) #else totalram_pages = free_all_bootmem(); #endif + /* XEN: init pages outside initial allocation. */ for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) { ClearPageReserved(pfn_to_page(pfn)); init_page_count(pfn_to_page(pfn)); } - reservedpages = max_pfn - totalram_pages - - absent_pages_in_range(0, max_pfn); + + absent_pages = absent_pages_in_range(0, max_pfn); + reservedpages = max_pfn - totalram_pages - absent_pages; after_bootmem = 1; codesize = (unsigned long) &_etext - (unsigned long) &_text; @@ -1099,15 +1231,14 @@ void __init mem_init(void) VSYSCALL_END - VSYSCALL_START); printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " - "%ldk reserved, %ldk data, %ldk init)\n", + "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n", (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), max_pfn << (PAGE_SHIFT-10), codesize >> 10, + absent_pages << (PAGE_SHIFT-10), reservedpages << (PAGE_SHIFT-10), datasize >> 10, initsize >> 10); - - cpa_init(); } void free_init_pages(char *what, unsigned long begin, unsigned long end) --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ head-2011-03-17/arch/x86/mm/iomap_32-xen.c 2011-02-01 14:39:24.000000000 +0100 @@ -0,0 +1,61 @@ +/* + * Copyright © 2008 Ingo Molnar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. + */ + +#include +#include +#include + +/* Map 'mfn' using fixed map 'type' and protections 'prot' + */ +void * +iomap_atomic_prot_pfn(unsigned long mfn, enum km_type type, pgprot_t prot) +{ + enum fixed_addresses idx; + unsigned long vaddr; + + pagefault_disable(); + + idx = type + KM_TYPE_NR*smp_processor_id(); + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + pgprot_val(prot) |= _PAGE_IOMAP; + set_pte_at(&init_mm, vaddr, kmap_pte-idx, pfn_pte_ma(mfn, prot)); + /*arch_flush_lazy_mmu_mode()*/; + + return (void*) vaddr; +} +EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); + +void +iounmap_atomic(void *kvaddr, enum km_type type) +{ + unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; + enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); + + /* + * Force other mappings to Oops if they'll try to access this pte + * without first remap it. Keeping stale mappings around is a bad idea + * also, in case the page changes cacheability attributes or becomes + * a protected page in a hypervisor. + */ + if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) + kpte_clear_flush(kmap_pte-idx, vaddr); + + /*arch_flush_lazy_mmu_mode();*/ + pagefault_enable(); +} +EXPORT_SYMBOL_GPL(iounmap_atomic); --- head-2011-03-17.orig/arch/x86/mm/ioremap-xen.c 2011-02-07 15:40:39.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/ioremap-xen.c 2011-02-07 15:41:07.000000000 +0100 @@ -25,20 +25,51 @@ #ifdef CONFIG_X86_64 -#ifndef CONFIG_XEN +static inline int phys_addr_valid(unsigned long addr) +{ + return addr < (1UL << boot_cpu_data.x86_phys_bits); +} + +#define phys_base 0 + unsigned long __phys_addr(unsigned long x) { - if (x >= __START_KERNEL_map) - return x - __START_KERNEL_map + phys_base; - return x - PAGE_OFFSET; + if (x >= __START_KERNEL_map) { + x -= __START_KERNEL_map; + VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE); + x += phys_base; + } else { + VIRTUAL_BUG_ON(x < PAGE_OFFSET); + x -= PAGE_OFFSET; + VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM : + !phys_addr_valid(x)); + } + return x; } EXPORT_SYMBOL(__phys_addr); -#endif -static inline int phys_addr_valid(unsigned long addr) +bool __virt_addr_valid(unsigned long x) { - return addr < (1UL << boot_cpu_data.x86_phys_bits); + if (x >= __START_KERNEL_map) { + x -= __START_KERNEL_map; + if (x >= KERNEL_IMAGE_SIZE) + return false; + x += phys_base; + } else { + if (x < PAGE_OFFSET) + return false; + x -= PAGE_OFFSET; + if (system_state == SYSTEM_BOOTING ? + x > MAXMEM : !phys_addr_valid(x)) { + return false; + } + } + + return pfn_valid(x >> PAGE_SHIFT); } +EXPORT_SYMBOL(__virt_addr_valid); + +#undef phys_base #else @@ -47,6 +78,28 @@ static inline int phys_addr_valid(unsign return 1; } +#ifdef CONFIG_DEBUG_VIRTUAL +unsigned long __phys_addr(unsigned long x) +{ + /* VMALLOC_* aren't constants; not available at the boot time */ + VIRTUAL_BUG_ON(x < PAGE_OFFSET); + VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING && + is_vmalloc_addr((void *) x)); + return x - PAGE_OFFSET; +} +EXPORT_SYMBOL(__phys_addr); +#endif + +bool __virt_addr_valid(unsigned long x) +{ + if (x < PAGE_OFFSET) + return false; + if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x)) + return false; + return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); +} +EXPORT_SYMBOL(__virt_addr_valid); + #endif static int direct_remap_area_pte_fn(pte_t *pte, @@ -103,7 +156,7 @@ static int __direct_remap_pfn_range(stru * Fill in the machine address: PTE ptr is done later by * apply_to_page_range(). */ - pgprot_val(prot) |= _PAGE_IO; + pgprot_val(prot) |= _PAGE_IOMAP; v->val = __pte_val(pte_mkspecial(pfn_pte_ma(mfn, prot))); mfn++; @@ -221,6 +274,25 @@ int page_is_ram(unsigned long pagenr) return 0; } +int pagerange_is_ram(unsigned long start, unsigned long end) +{ + int ram_page = 0, not_rampage = 0; + unsigned long page_nr; + + for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT); + ++page_nr) { + if (page_is_ram(mfn_to_local_pfn(page_nr))) + ram_page = 1; + else + not_rampage = 1; + + if (ram_page == not_rampage) + return -1; + } + + return ram_page; +} + /* * Fix up the linear direct mapping of the kernel to avoid cache attribute * conflicts. @@ -308,6 +380,12 @@ static void __iomem *__ioremap_caller(re return (__force void __iomem *)isa_bus_to_virt((unsigned long)phys_addr); /* + * Check if the request spans more than any BAR in the iomem resource + * tree. + */ + WARN_ON(iomem_map_sanity_check(phys_addr, size)); + + /* * Don't allow anybody to remap normal RAM that we're using.. */ for (mfn = PFN_DOWN(phys_addr); mfn < PFN_UP(last_addr); mfn++) { @@ -362,16 +440,16 @@ static void __iomem *__ioremap_caller(re switch (prot_val) { case _PAGE_CACHE_UC: default: - prot = PAGE_KERNEL_NOCACHE; + prot = PAGE_KERNEL_IO_NOCACHE; break; case _PAGE_CACHE_UC_MINUS: - prot = PAGE_KERNEL_UC_MINUS; + prot = PAGE_KERNEL_IO_UC_MINUS; break; case _PAGE_CACHE_WC: - prot = PAGE_KERNEL_WC; + prot = PAGE_KERNEL_IO_WC; break; case _PAGE_CACHE_WB: - prot = PAGE_KERNEL; + prot = PAGE_KERNEL_IO; break; } @@ -471,7 +549,7 @@ static void __iomem *ioremap_default(res unsigned long size) { unsigned long flags; - void *ret; + void __iomem *ret; int err; /* @@ -483,11 +561,11 @@ static void __iomem *ioremap_default(res if (err < 0) return NULL; - ret = (void *) __ioremap_caller(phys_addr, size, flags, - __builtin_return_address(0)); + ret = __ioremap_caller(phys_addr, size, flags, + __builtin_return_address(0)); free_memtype(phys_addr, phys_addr + size); - return (void __iomem *)ret; + return ret; } #endif @@ -583,7 +661,7 @@ void unxlate_dev_mem_ptr(unsigned long p } #endif -int __initdata early_ioremap_debug; +static int __initdata early_ioremap_debug; static int __init early_ioremap_debug_setup(char *str) { @@ -702,12 +780,12 @@ static void __init __early_set_fixmap(en } static inline void __init early_set_fixmap(enum fixed_addresses idx, - unsigned long phys) + unsigned long phys, pgprot_t prot) { if (after_paging_init) - set_fixmap(idx, phys); + __set_fixmap(idx, phys, prot); else - __early_set_fixmap(idx, phys, PAGE_KERNEL); + __early_set_fixmap(idx, phys, prot); } static inline void __init early_clear_fixmap(enum fixed_addresses idx) @@ -718,16 +796,22 @@ static inline void __init early_clear_fi __early_set_fixmap(idx, 0, __pgprot(0)); } - -int __initdata early_ioremap_nested; - +static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata; +static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; static int __init check_early_ioremap_leak(void) { - if (!early_ioremap_nested) + int count = 0; + int i; + + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) + if (prev_map[i]) + count++; + + if (!count) return 0; WARN(1, KERN_WARNING "Debug warning: early ioremap leak of %d areas detected.\n", - early_ioremap_nested); + count); printk(KERN_WARNING "please boot with early_ioremap_debug and report the dmesg.\n"); @@ -735,18 +819,33 @@ static int __init check_early_ioremap_le } late_initcall(check_early_ioremap_leak); -void __init *early_ioremap(unsigned long phys_addr, unsigned long size) +static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot) { unsigned long offset, last_addr; - unsigned int nrpages, nesting; + unsigned int nrpages; enum fixed_addresses idx0, idx; + int i, slot; WARN_ON(system_state != SYSTEM_BOOTING); - nesting = early_ioremap_nested; + slot = -1; + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { + if (!prev_map[i]) { + slot = i; + break; + } + } + + if (slot < 0) { + printk(KERN_INFO "early_iomap(%08lx, %08lx) not found slot\n", + phys_addr, size); + WARN_ON(1); + return NULL; + } + if (early_ioremap_debug) { printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ", - phys_addr, size, nesting); + phys_addr, size, slot); dump_stack(); } @@ -757,17 +856,13 @@ void __init *early_ioremap(unsigned long return NULL; } - if (nesting >= FIX_BTMAPS_NESTING) { - WARN_ON(1); - return NULL; - } - early_ioremap_nested++; + prev_size[slot] = size; /* * Mappings have to be page-aligned */ offset = phys_addr & ~PAGE_MASK; phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr) - phys_addr; + size = PAGE_ALIGN(last_addr + 1) - phys_addr; /* * Mappings have to fit in the FIX_BTMAP area. @@ -781,10 +876,10 @@ void __init *early_ioremap(unsigned long /* * Ok, go for it.. */ - idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting; + idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; idx = idx0; while (nrpages > 0) { - early_set_fixmap(idx, phys_addr); + early_set_fixmap(idx, phys_addr, prot); phys_addr += PAGE_SIZE; --idx; --nrpages; @@ -792,24 +887,55 @@ void __init *early_ioremap(unsigned long if (early_ioremap_debug) printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0)); - return (void *) (offset + fix_to_virt(idx0)); + prev_map[slot] = (void __iomem *)(offset + fix_to_virt(idx0)); + return prev_map[slot]; } -void __init early_iounmap(void *addr, unsigned long size) +/* Remap an IO device */ +void __init __iomem *early_ioremap(unsigned long phys_addr, unsigned long size) +{ + return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO); +} + +/* Remap memory */ +void __init __iomem *early_memremap(unsigned long phys_addr, unsigned long size) +{ + return __early_ioremap(phys_to_machine(phys_addr), size, PAGE_KERNEL); +} + +void __init early_iounmap(void __iomem *addr, unsigned long size) { unsigned long virt_addr; unsigned long offset; unsigned int nrpages; enum fixed_addresses idx; - int nesting; + int i, slot; - nesting = --early_ioremap_nested; - if (WARN_ON(nesting < 0)) + slot = -1; + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { + if (prev_map[i] == addr) { + slot = i; + break; + } + } + + if (slot < 0) { + printk(KERN_INFO "early_iounmap(%p, %08lx) not found slot\n", + addr, size); + WARN_ON(1); + return; + } + + if (prev_size[slot] != size) { + printk(KERN_INFO "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n", + addr, size, slot, prev_size[slot]); + WARN_ON(1); return; + } if (early_ioremap_debug) { printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr, - size, nesting); + size, slot); dump_stack(); } @@ -821,12 +947,13 @@ void __init early_iounmap(void *addr, un offset = virt_addr & ~PAGE_MASK; nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; - idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting; + idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; while (nrpages > 0) { early_clear_fixmap(idx); --idx; --nrpages; } + prev_map[slot] = NULL; } void __this_fixmap_does_not_exist(void) --- head-2011-03-17.orig/arch/x86/mm/pageattr-xen.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/pageattr-xen.c 2011-02-01 14:39:24.000000000 +0100 @@ -25,15 +25,27 @@ * The current flushing context - we pass it instead of 5 arguments: */ struct cpa_data { - unsigned long vaddr; + unsigned long *vaddr; pgprot_t mask_set; pgprot_t mask_clr; int numpages; - int flushtlb; + int flags; unsigned long pfn; unsigned force_split : 1; + int curpage; }; +/* + * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings) + * using cpa_lock. So that we don't allow any other cpu, with stale large tlb + * entries change the page attribute in parallel to some other cpu + * splitting a large page entry along with changing the attribute. + */ +static DEFINE_SPINLOCK(cpa_lock); + +#define CPA_FLUSHTLB 1 +#define CPA_ARRAY 2 + #ifdef CONFIG_PROC_FS static unsigned long direct_pages_count[PG_LEVEL_NUM]; @@ -53,23 +65,22 @@ static void split_page_count(int level) direct_pages_count[level - 1] += PTRS_PER_PTE; } -int arch_report_meminfo(char *page) +void arch_report_meminfo(struct seq_file *m) { - int n = sprintf(page, "DirectMap4k: %8lu kB\n", + seq_printf(m, "DirectMap4k: %8lu kB\n", direct_pages_count[PG_LEVEL_4K] << 2); #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) - n += sprintf(page + n, "DirectMap2M: %8lu kB\n", + seq_printf(m, "DirectMap2M: %8lu kB\n", direct_pages_count[PG_LEVEL_2M] << 11); #else - n += sprintf(page + n, "DirectMap4M: %8lu kB\n", + seq_printf(m, "DirectMap4M: %8lu kB\n", direct_pages_count[PG_LEVEL_2M] << 12); #endif #ifdef CONFIG_X86_64 if (direct_gbpages) - n += sprintf(page + n, "DirectMap1G: %8lu kB\n", + seq_printf(m, "DirectMap1G: %8lu kB\n", direct_pages_count[PG_LEVEL_1G] << 20); #endif - return n; } #else static inline void split_page_count(int level) { } @@ -84,7 +95,7 @@ static inline unsigned long highmap_star static inline unsigned long highmap_end_pfn(void) { - return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT; + return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT; } #endif @@ -190,6 +201,41 @@ static void cpa_flush_range(unsigned lon } } +static void cpa_flush_array(unsigned long *start, int numpages, int cache) +{ + unsigned int i, level; + unsigned long *addr; + + BUG_ON(irqs_disabled()); + + on_each_cpu(__cpa_flush_range, NULL, 1); + + if (!cache) + return; + + /* 4M threshold */ + if (numpages >= 1024) { + if (boot_cpu_data.x86_model >= 4) + wbinvd(); + return; + } + /* + * We only need to flush on one CPU, + * clflush is a MESI-coherent instruction that + * will cause all other CPUs to flush the same + * cachelines: + */ + for (i = 0, addr = start; i < numpages; i++, addr++) { + pte_t *pte = lookup_address(*addr, &level); + + /* + * Only flush present addresses: + */ + if (pte && (__pte_val(*pte) & _PAGE_PRESENT)) + clflush_cache_range((void *) *addr, PAGE_SIZE); + } +} + /* * Certain areas of memory on x86 require very specific protection flags, * for example the BIOS area or kernel text. Callers don't always get this @@ -414,7 +460,7 @@ try_preserve_large_page(pte_t *kpte, uns */ new_pte = pfn_pte_ma(__pte_mfn(old_pte), canon_pgprot(new_prot)); __set_pmd_pte(kpte, address, level, new_pte); - cpa->flushtlb = 1; + cpa->flags |= CPA_FLUSHTLB; do_split = 0; } @@ -424,84 +470,6 @@ out_unlock: return do_split; } -static LIST_HEAD(page_pool); -static unsigned long pool_size, pool_pages, pool_low; -static unsigned long pool_used, pool_failed; - -static void cpa_fill_pool(struct page **ret) -{ - gfp_t gfp = GFP_KERNEL; - unsigned long flags; - struct page *p; - - /* - * Avoid recursion (on debug-pagealloc) and also signal - * our priority to get to these pagetables: - */ - if (current->flags & PF_MEMALLOC) - return; - current->flags |= PF_MEMALLOC; - - /* - * Allocate atomically from atomic contexts: - */ - if (in_atomic() || irqs_disabled() || debug_pagealloc) - gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN; - - while (pool_pages < pool_size || (ret && !*ret)) { - p = alloc_pages(gfp, 0); - if (!p) { - pool_failed++; - break; - } - /* - * If the call site needs a page right now, provide it: - */ - if (ret && !*ret) { - *ret = p; - continue; - } - spin_lock_irqsave(&pgd_lock, flags); - list_add(&p->lru, &page_pool); - pool_pages++; - spin_unlock_irqrestore(&pgd_lock, flags); - } - - current->flags &= ~PF_MEMALLOC; -} - -#define SHIFT_MB (20 - PAGE_SHIFT) -#define ROUND_MB_GB ((1 << 10) - 1) -#define SHIFT_MB_GB 10 -#define POOL_PAGES_PER_GB 16 - -void __init cpa_init(void) -{ - struct sysinfo si; - unsigned long gb; - - si_meminfo(&si); - /* - * Calculate the number of pool pages: - * - * Convert totalram (nr of pages) to MiB and round to the next - * GiB. Shift MiB to Gib and multiply the result by - * POOL_PAGES_PER_GB: - */ - if (debug_pagealloc) { - gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB; - pool_size = POOL_PAGES_PER_GB * gb; - } else { - pool_size = 1; - } - pool_low = pool_size; - - cpa_fill_pool(NULL); - printk(KERN_DEBUG - "CPA: page pool initialized %lu of %lu pages preallocated\n", - pool_pages, pool_size); -} - static int split_large_page(pte_t *kpte, unsigned long address) { unsigned long flags, mfn, mfninc = 1; @@ -510,28 +478,15 @@ static int split_large_page(pte_t *kpte, pgprot_t ref_prot; struct page *base; - /* - * Get a page from the pool. The pool list is protected by the - * pgd_lock, which we have to take anyway for the split - * operation: - */ - spin_lock_irqsave(&pgd_lock, flags); - if (list_empty(&page_pool)) { - spin_unlock_irqrestore(&pgd_lock, flags); - base = NULL; - cpa_fill_pool(&base); - if (!base) - return -ENOMEM; - spin_lock_irqsave(&pgd_lock, flags); - } else { - base = list_first_entry(&page_pool, struct page, lru); - list_del(&base->lru); - pool_pages--; - - if (pool_pages < pool_low) - pool_low = pool_pages; - } + if (!debug_pagealloc) + spin_unlock(&cpa_lock); + base = alloc_pages(GFP_KERNEL, 0); + if (!debug_pagealloc) + spin_lock(&cpa_lock); + if (!base) + return -ENOMEM; + spin_lock_irqsave(&pgd_lock, flags); /* * Check for races, another CPU might have split this page * up for us already: @@ -592,11 +547,8 @@ out_unlock: * If we dropped out via the lookup_address check under * pgd_lock then stick the page back into the pool: */ - if (base) { - list_add(&base->lru, &page_pool); - pool_pages++; - } else - pool_used++; + if (base) + __free_page(base); spin_unlock_irqrestore(&pgd_lock, flags); return 0; @@ -604,11 +556,16 @@ out_unlock: static int __change_page_attr(struct cpa_data *cpa, int primary) { - unsigned long address = cpa->vaddr; + unsigned long address; int do_split, err; unsigned int level; pte_t *kpte, old_pte; + if (cpa->flags & CPA_ARRAY) + address = cpa->vaddr[cpa->curpage]; + else + address = *cpa->vaddr; + repeat: kpte = lookup_address(address, &level); if (!kpte) @@ -620,7 +577,7 @@ repeat: return 0; WARN(1, KERN_WARNING "CPA: called for zero pte. " "vaddr = %lx cpa->vaddr = %lx\n", address, - cpa->vaddr); + *cpa->vaddr); return -EINVAL; } @@ -647,7 +604,7 @@ repeat: */ if (__pte_val(old_pte) != __pte_val(new_pte)) { set_pte_atomic(kpte, new_pte); - cpa->flushtlb = 1; + cpa->flags |= CPA_FLUSHTLB; } cpa->numpages = 1; return 0; @@ -671,7 +628,25 @@ repeat: */ err = split_large_page(kpte, address); if (!err) { - cpa->flushtlb = 1; + /* + * Do a global flush tlb after splitting the large page + * and before we do the actual change page attribute in the PTE. + * + * With out this, we violate the TLB application note, that says + * "The TLBs may contain both ordinary and large-page + * translations for a 4-KByte range of linear addresses. This + * may occur if software modifies the paging structures so that + * the page size used for the address range changes. If the two + * translations differ with respect to page frame or attributes + * (e.g., permissions), processor behavior is undefined and may + * be implementation-specific." + * + * We do this global tlb flush inside the cpa_lock, so that we + * don't allow any other cpu, with stale tlb entries change the + * page attribute in parallel, that also falls into the + * just split large page entry. + */ + flush_tlb_all(); goto repeat; } @@ -684,6 +659,7 @@ static int cpa_process_alias(struct cpa_ { struct cpa_data alias_cpa; int ret = 0; + unsigned long temp_cpa_vaddr, vaddr; if (cpa->pfn >= max_pfn_mapped) return 0; @@ -696,16 +672,24 @@ static int cpa_process_alias(struct cpa_ * No need to redo, when the primary call touched the direct * mapping already: */ - if (!(within(cpa->vaddr, PAGE_OFFSET, + if (cpa->flags & CPA_ARRAY) + vaddr = cpa->vaddr[cpa->curpage]; + else + vaddr = *cpa->vaddr; + + if (!(within(vaddr, PAGE_OFFSET, PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT)) #ifdef CONFIG_X86_64 - || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32), + || within(vaddr, PAGE_OFFSET + (1UL<<32), PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)) #endif )) { alias_cpa = *cpa; - alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); + temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); + alias_cpa.vaddr = &temp_cpa_vaddr; + alias_cpa.flags &= ~CPA_ARRAY; + ret = __change_page_attr_set_clr(&alias_cpa, 0); } @@ -717,7 +701,7 @@ static int cpa_process_alias(struct cpa_ * No need to redo, when the primary call touched the high * mapping already: */ - if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end)) + if (within(vaddr, (unsigned long) _text, (unsigned long) _end)) return 0; /* @@ -728,8 +712,9 @@ static int cpa_process_alias(struct cpa_ return 0; alias_cpa = *cpa; - alias_cpa.vaddr = - (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map; + temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map; + alias_cpa.vaddr = &temp_cpa_vaddr; + alias_cpa.flags &= ~CPA_ARRAY; /* * The high mapping range is imprecise, so ignore the return value. @@ -749,8 +734,15 @@ static int __change_page_attr_set_clr(st * preservation check. */ cpa->numpages = numpages; + /* for array changes, we can't use large page */ + if (cpa->flags & CPA_ARRAY) + cpa->numpages = 1; + if (!debug_pagealloc) + spin_lock(&cpa_lock); ret = __change_page_attr(cpa, checkalias); + if (!debug_pagealloc) + spin_unlock(&cpa_lock); if (ret) return ret; @@ -767,7 +759,11 @@ static int __change_page_attr_set_clr(st */ BUG_ON(cpa->numpages > numpages); numpages -= cpa->numpages; - cpa->vaddr += cpa->numpages * PAGE_SIZE; + if (cpa->flags & CPA_ARRAY) + cpa->curpage++; + else + *cpa->vaddr += cpa->numpages * PAGE_SIZE; + } return 0; } @@ -778,9 +774,9 @@ static inline int cache_attr(pgprot_t at (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD); } -static int change_page_attr_set_clr(unsigned long addr, int numpages, +static int change_page_attr_set_clr(unsigned long *addr, int numpages, pgprot_t mask_set, pgprot_t mask_clr, - int force_split) + int force_split, int array) { struct cpa_data cpa; int ret, cache, checkalias; @@ -795,21 +791,40 @@ static int change_page_attr_set_clr(unsi return 0; /* Ensure we are PAGE_SIZE aligned */ - if (addr & ~PAGE_MASK) { - addr &= PAGE_MASK; - /* - * People should not be passing in unaligned addresses: - */ - WARN_ON_ONCE(1); + if (!array) { + if (*addr & ~PAGE_MASK) { + *addr &= PAGE_MASK; + /* + * People should not be passing in unaligned addresses: + */ + WARN_ON_ONCE(1); + } + } else { + int i; + for (i = 0; i < numpages; i++) { + if (addr[i] & ~PAGE_MASK) { + addr[i] &= PAGE_MASK; + WARN_ON_ONCE(1); + } + } } + /* Must avoid aliasing mappings in the highmem code */ + kmap_flush_unused(); + + vm_unmap_aliases(); + cpa.vaddr = addr; cpa.numpages = numpages; cpa.mask_set = mask_set; cpa.mask_clr = mask_clr; - cpa.flushtlb = 0; + cpa.flags = 0; + cpa.curpage = 0; cpa.force_split = force_split; + if (array) + cpa.flags |= CPA_ARRAY; + /* No alias checking for _NX bit modifications */ checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; @@ -818,7 +833,7 @@ static int change_page_attr_set_clr(unsi /* * Check whether we really changed something: */ - if (!cpa.flushtlb) + if (!(cpa.flags & CPA_FLUSHTLB)) goto out; /* @@ -833,27 +848,30 @@ static int change_page_attr_set_clr(unsi * error case we fall back to cpa_flush_all (which uses * wbindv): */ - if (!ret && cpu_has_clflush) - cpa_flush_range(addr, numpages, cache); - else + if (!ret && cpu_has_clflush) { + if (cpa.flags & CPA_ARRAY) + cpa_flush_array(addr, numpages, cache); + else + cpa_flush_range(*addr, numpages, cache); + } else cpa_flush_all(cache); out: - cpa_fill_pool(NULL); - return ret; } -static inline int change_page_attr_set(unsigned long addr, int numpages, - pgprot_t mask) +static inline int change_page_attr_set(unsigned long *addr, int numpages, + pgprot_t mask, int array) { - return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0); + return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0, + array); } -static inline int change_page_attr_clear(unsigned long addr, int numpages, - pgprot_t mask) +static inline int change_page_attr_clear(unsigned long *addr, int numpages, + pgprot_t mask, int array) { - return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0); + return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0, + array); } #ifdef CONFIG_XEN @@ -906,8 +924,8 @@ int _set_memory_uc(unsigned long addr, i /* * for now UC MINUS. see comments in ioremap_nocache() */ - return change_page_attr_set(addr, numpages, - __pgprot(_PAGE_CACHE_UC_MINUS)); + return change_page_attr_set(&addr, numpages, + __pgprot(_PAGE_CACHE_UC_MINUS), 0); } int set_memory_uc(unsigned long addr, int numpages) @@ -923,10 +941,48 @@ int set_memory_uc(unsigned long addr, in } EXPORT_SYMBOL(set_memory_uc); +int set_memory_array_uc(unsigned long *addr, int addrinarray) +{ + unsigned long start; + unsigned long end; + int i; + /* + * for now UC MINUS. see comments in ioremap_nocache() + */ + for (i = 0; i < addrinarray; i++) { + start = __pa(addr[i]); + for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) { + if (end != __pa(addr[i + 1])) + break; + i++; + } + if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL)) + goto out; + } + + return change_page_attr_set(addr, addrinarray, + __pgprot(_PAGE_CACHE_UC_MINUS), 1); +out: + for (i = 0; i < addrinarray; i++) { + unsigned long tmp = __pa(addr[i]); + + if (tmp == start) + break; + for (end = tmp + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) { + if (end != __pa(addr[i + 1])) + break; + i++; + } + free_memtype(tmp, end); + } + return -EINVAL; +} +EXPORT_SYMBOL(set_memory_array_uc); + int _set_memory_wc(unsigned long addr, int numpages) { - return change_page_attr_set(addr, numpages, - __pgprot(_PAGE_CACHE_WC)); + return change_page_attr_set(&addr, numpages, + __pgprot(_PAGE_CACHE_WC), 0); } int set_memory_wc(unsigned long addr, int numpages) @@ -944,8 +1000,8 @@ EXPORT_SYMBOL(set_memory_wc); int _set_memory_wb(unsigned long addr, int numpages) { - return change_page_attr_clear(addr, numpages, - __pgprot(_PAGE_CACHE_MASK)); + return change_page_attr_clear(&addr, numpages, + __pgprot(_PAGE_CACHE_MASK), 0); } int set_memory_wb(unsigned long addr, int numpages) @@ -956,37 +1012,59 @@ int set_memory_wb(unsigned long addr, in } EXPORT_SYMBOL(set_memory_wb); +int set_memory_array_wb(unsigned long *addr, int addrinarray) +{ + int i; + + for (i = 0; i < addrinarray; i++) { + unsigned long start = __pa(addr[i]); + unsigned long end; + + for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) { + if (end != __pa(addr[i + 1])) + break; + i++; + } + free_memtype(start, end); + } + return change_page_attr_clear(addr, addrinarray, + __pgprot(_PAGE_CACHE_MASK), 1); +} +EXPORT_SYMBOL(set_memory_array_wb); + int set_memory_x(unsigned long addr, int numpages) { - return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX)); + return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0); } EXPORT_SYMBOL(set_memory_x); int set_memory_nx(unsigned long addr, int numpages) { - return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX)); + return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0); } EXPORT_SYMBOL(set_memory_nx); int set_memory_ro(unsigned long addr, int numpages) { - return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW)); + return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0); } +EXPORT_SYMBOL_GPL(set_memory_ro); int set_memory_rw(unsigned long addr, int numpages) { - return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW)); + return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0); } +EXPORT_SYMBOL_GPL(set_memory_rw); int set_memory_np(unsigned long addr, int numpages) { - return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT)); + return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0); } int set_memory_4k(unsigned long addr, int numpages) { - return change_page_attr_set_clr(addr, numpages, __pgprot(0), - __pgprot(0), 1); + return change_page_attr_set_clr(&addr, numpages, __pgprot(0), + __pgprot(0), 1, 0); } int set_pages_uc(struct page *page, int numpages) @@ -1039,22 +1117,38 @@ int set_pages_rw(struct page *page, int static int __set_pages_p(struct page *page, int numpages) { - struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), + unsigned long tempaddr = (unsigned long) page_address(page); + struct cpa_data cpa = { .vaddr = &tempaddr, .numpages = numpages, .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), - .mask_clr = __pgprot(0)}; + .mask_clr = __pgprot(0), + .flags = 0}; - return __change_page_attr_set_clr(&cpa, 1); + /* + * No alias checking needed for setting present flag. otherwise, + * we may need to break large pages for 64-bit kernel text + * mappings (this adds to complexity if we want to do this from + * atomic context especially). Let's keep it simple! + */ + return __change_page_attr_set_clr(&cpa, 0); } static int __set_pages_np(struct page *page, int numpages) { - struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), + unsigned long tempaddr = (unsigned long) page_address(page); + struct cpa_data cpa = { .vaddr = &tempaddr, .numpages = numpages, .mask_set = __pgprot(0), - .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)}; + .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), + .flags = 0}; - return __change_page_attr_set_clr(&cpa, 1); + /* + * No alias checking needed for setting not present flag. otherwise, + * we may need to break large pages for 64-bit kernel text + * mappings (this adds to complexity if we want to do this from + * atomic context especially). Let's keep it simple! + */ + return __change_page_attr_set_clr(&cpa, 0); } void kernel_map_pages(struct page *page, int numpages, int enable) @@ -1074,11 +1168,8 @@ void kernel_map_pages(struct page *page, /* * The return value is ignored as the calls cannot fail. - * Large pages are kept enabled at boot time, and are - * split up quickly with DEBUG_PAGEALLOC. If a splitup - * fails here (due to temporary memory shortage) no damage - * is done because we just keep the largepage intact up - * to the next attempt when it will likely be split up: + * Large pages for identity mappings are not used at boot time + * and hence no memory allocations during large page split. */ if (enable) __set_pages_p(page, numpages); @@ -1090,53 +1181,8 @@ void kernel_map_pages(struct page *page, * but that can deadlock->flush only current cpu: */ __flush_tlb_all(); - - /* - * Try to refill the page pool here. We can do this only after - * the tlb flush. - */ - cpa_fill_pool(NULL); -} - -#ifdef CONFIG_DEBUG_FS -static int dpa_show(struct seq_file *m, void *v) -{ - seq_puts(m, "DEBUG_PAGEALLOC\n"); - seq_printf(m, "pool_size : %lu\n", pool_size); - seq_printf(m, "pool_pages : %lu\n", pool_pages); - seq_printf(m, "pool_low : %lu\n", pool_low); - seq_printf(m, "pool_used : %lu\n", pool_used); - seq_printf(m, "pool_failed : %lu\n", pool_failed); - - return 0; } -static int dpa_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, dpa_show, NULL); -} - -static const struct file_operations dpa_fops = { - .open = dpa_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int __init debug_pagealloc_proc_init(void) -{ - struct dentry *de; - - de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL, - &dpa_fops); - if (!de) - return -ENOMEM; - - return 0; -} -__initcall(debug_pagealloc_proc_init); -#endif - #ifdef CONFIG_HIBERNATION bool kernel_page_present(struct page *page) --- head-2011-03-17.orig/arch/x86/mm/pat-xen.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/pat-xen.c 2011-02-01 14:39:24.000000000 +0100 @@ -7,24 +7,24 @@ * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen. */ -#include +#include +#include +#include #include #include +#include #include -#include -#include -#include -#include -#include +#include #include -#include +#include #include -#include -#include -#include #include +#include #include +#include +#include +#include #include #ifdef CONFIG_X86_PAT @@ -46,6 +46,7 @@ early_param("nopat", nopat); static int debug_enable; + static int __init pat_debug_setup(char *str) { debug_enable = 1; @@ -157,14 +158,23 @@ static char *cattr_name(unsigned long fl */ struct memtype { - u64 start; - u64 end; - unsigned long type; - struct list_head nd; + u64 start; + u64 end; + unsigned long type; + struct list_head nd; }; static LIST_HEAD(memtype_list); -static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ +static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ + +static inline u8 _mtrr_type_lookup(u64 start, u64 end) +{ + if (is_initial_xendomain()) + return mtrr_type_lookup(start, end); + return pagerange_is_ram(start, end) > 0 + ? MTRR_TYPE_WRCOMB : MTRR_TYPE_UNCACHABLE; +} +#define mtrr_type_lookup _mtrr_type_lookup /* * Does intersection of PAT memory type and MTRR memory type and returns @@ -192,8 +202,8 @@ static unsigned long pat_x_mtrr_type(u64 return req_type; } -static int chk_conflict(struct memtype *new, struct memtype *entry, - unsigned long *type) +static int +chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type) { if (new->type != entry->type) { if (type) { @@ -223,6 +233,72 @@ static struct memtype *cached_entry; static u64 cached_start; /* + * For RAM pages, mark the pages as non WB memory type using + * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or + * set_memory_wc() on a RAM page at a time before marking it as WB again. + * This is ok, because only one driver will be owning the page and + * doing set_memory_*() calls. + * + * For now, we use PageNonWB to track that the RAM page is being mapped + * as non WB. In future, we will have to use one more flag + * (or some other mechanism in page_struct) to distinguish between + * UC and WC mapping. + */ +static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type, + unsigned long *new_type) +{ + struct page *page; + unsigned long mfn, end_mfn; + + for (mfn = (start >> PAGE_SHIFT); mfn < (end >> PAGE_SHIFT); ++mfn) { + unsigned long pfn = mfn_to_local_pfn(mfn); + + BUG_ON(!pfn_valid(pfn)); + page = pfn_to_page(pfn); + if (page_mapped(page) || PageNonWB(page)) + goto out; + + SetPageNonWB(page); + } + return 0; + +out: + end_mfn = mfn; + for (mfn = (start >> PAGE_SHIFT); mfn < end_mfn; ++mfn) { + page = pfn_to_page(mfn_to_local_pfn(mfn)); + ClearPageNonWB(page); + } + + return -EINVAL; +} + +static int free_ram_pages_type(u64 start, u64 end) +{ + struct page *page; + unsigned long mfn, end_mfn; + + for (mfn = (start >> PAGE_SHIFT); mfn < (end >> PAGE_SHIFT); ++mfn) { + unsigned long pfn = mfn_to_local_pfn(mfn); + + BUG_ON(!pfn_valid(pfn)); + page = pfn_to_page(pfn); + if (page_mapped(page) || !PageNonWB(page)) + goto out; + + ClearPageNonWB(page); + } + return 0; + +out: + end_mfn = mfn; + for (mfn = (start >> PAGE_SHIFT); mfn < end_mfn; ++mfn) { + page = pfn_to_page(mfn_to_local_pfn(mfn)); + SetPageNonWB(page); + } + return -EINVAL; +} + +/* * req_type typically has one of the: * - _PAGE_CACHE_WB * - _PAGE_CACHE_WC @@ -238,14 +314,15 @@ static u64 cached_start; * it will return a negative return value. */ int reserve_memtype(u64 start, u64 end, unsigned long req_type, - unsigned long *new_type) + unsigned long *new_type) { struct memtype *new, *entry; unsigned long actual_type; struct list_head *where; + int is_range_ram; int err = 0; - BUG_ON(start >= end); /* end is exclusive */ + BUG_ON(start >= end); /* end is exclusive */ if (!pat_enabled) { /* This is identical to page table setting without PAT */ @@ -278,17 +355,24 @@ int reserve_memtype(u64 start, u64 end, actual_type = _PAGE_CACHE_WB; else actual_type = _PAGE_CACHE_UC_MINUS; - } else + } else { actual_type = pat_x_mtrr_type(start, end, req_type & _PAGE_CACHE_MASK); + } + + is_range_ram = pagerange_is_ram(start, end); + if (is_range_ram == 1) + return reserve_ram_pages_type(start, end, req_type, new_type); + else if (is_range_ram < 0) + return -EINVAL; new = kmalloc(sizeof(struct memtype), GFP_KERNEL); if (!new) return -ENOMEM; - new->start = start; - new->end = end; - new->type = actual_type; + new->start = start; + new->end = end; + new->type = actual_type; if (new_type) *new_type = actual_type; @@ -347,6 +431,7 @@ int reserve_memtype(u64 start, u64 end, start, end, cattr_name(new->type), cattr_name(req_type)); kfree(new); spin_unlock(&memtype_lock); + return err; } @@ -370,6 +455,7 @@ int free_memtype(u64 start, u64 end) { struct memtype *entry; int err = -EINVAL; + int is_range_ram; if (!pat_enabled) return 0; @@ -378,6 +464,12 @@ int free_memtype(u64 start, u64 end) if (is_ISA_range(start, end - 1)) return 0; + is_range_ram = pagerange_is_ram(start, end); + if (is_range_ram == 1) + return free_ram_pages_type(start, end); + else if (is_range_ram < 0) + return -EINVAL; + spin_lock(&memtype_lock); list_for_each_entry(entry, &memtype_list, nd) { if (entry->start == start && entry->end == end) { @@ -398,6 +490,7 @@ int free_memtype(u64 start, u64 end) } dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end); + return err; } @@ -415,12 +508,16 @@ static inline int range_is_allowed(unsig return 1; } #else +/* This check is needed to avoid cache aliasing when PAT is enabled */ static inline int range_is_allowed(unsigned long mfn, unsigned long size) { u64 from = ((u64)mfn) << PAGE_SHIFT; u64 to = from + size; u64 cursor = from; + if (!pat_enabled) + return 1; + while (cursor < to) { if (!devmem_is_allowed(mfn)) { printk(KERN_INFO @@ -504,9 +601,9 @@ int phys_mem_access_prot_allowed(struct void map_devmem(unsigned long mfn, unsigned long size, pgprot_t vma_prot) { + unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK); u64 addr = (u64)mfn << PAGE_SHIFT; unsigned long flags; - unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK); reserve_memtype(addr, addr + size, want_flags, &flags); if (flags != want_flags) { @@ -526,7 +623,7 @@ void unmap_devmem(unsigned long mfn, uns free_memtype(addr, addr + size); } -#if defined(CONFIG_DEBUG_FS) +#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT) /* get Nth element of the linked list */ static struct memtype *memtype_get_idx(loff_t pos) @@ -549,6 +646,7 @@ static struct memtype *memtype_get_idx(l } spin_unlock(&memtype_lock); kfree(print_entry); + return NULL; } @@ -579,6 +677,7 @@ static int memtype_seq_show(struct seq_f seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type), print_entry->start, print_entry->end); kfree(print_entry); + return 0; } @@ -610,4 +709,4 @@ static int __init pat_memtype_list_init( late_initcall(pat_memtype_list_init); -#endif /* CONFIG_DEBUG_FS */ +#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */ --- head-2011-03-17.orig/arch/x86/mm/pgtable-xen.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/pgtable-xen.c 2011-02-01 14:39:24.000000000 +0100 @@ -129,7 +129,7 @@ void __pud_free_tlb(struct mmu_gather *t static void _pin_lock(struct mm_struct *mm, int lock) { if (lock) spin_lock(&mm->page_table_lock); -#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS +#if USE_SPLIT_PTLOCKS /* While mm->page_table_lock protects us against insertions and * removals of higher level page table pages, it doesn't protect * against updates of pte-s. Such updates, however, require the @@ -408,10 +408,8 @@ static inline void pgd_list_del(pgd_t *p #define UNSHARED_PTRS_PER_PGD \ (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) -static void pgd_ctor(void *p) +static void pgd_ctor(pgd_t *pgd) { - pgd_t *pgd = p; - pgd_test_and_unpin(pgd); /* If the pgd points to a shared pagetable level (either the @@ -440,7 +438,7 @@ static void pgd_ctor(void *p) pgd_list_add(pgd); } -static void pgd_dtor(void *pgd) +static void pgd_dtor(pgd_t *pgd) { unsigned long flags; /* can be called from interrupt context */ --- head-2011-03-17.orig/arch/x86/mm/pgtable_32-xen.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/pgtable_32-xen.c 2011-02-01 14:39:24.000000000 +0100 @@ -122,7 +122,6 @@ void __init reserve_top_address(unsigned printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", (int)-reserve); __FIXADDR_TOP = -reserve - PAGE_SIZE; - __VMALLOC_RESERVE += reserve; } /* @@ -135,7 +134,8 @@ static int __init parse_vmalloc(char *ar if (!arg) return -EINVAL; - __VMALLOC_RESERVE = memparse(arg, &arg); + /* Add VMALLOC_OFFSET to the parsed value due to vm area guard hole*/ + __VMALLOC_RESERVE = memparse(arg, &arg) + VMALLOC_OFFSET; return 0; } early_param("vmalloc", parse_vmalloc); --- head-2011-03-17.orig/arch/x86/pci/irq-xen.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/pci/irq-xen.c 2011-02-01 14:39:24.000000000 +0100 @@ -499,7 +499,7 @@ static int pirq_amd756_get(struct pci_de if (pirq <= 4) irq = read_config_nybble(router, 0x56, pirq - 1); dev_info(&dev->dev, - "AMD756: dev [%04x/%04x], router PIRQ %d get IRQ %d\n", + "AMD756: dev [%04x:%04x], router PIRQ %d get IRQ %d\n", dev->vendor, dev->device, pirq, irq); return irq; } @@ -507,7 +507,7 @@ static int pirq_amd756_get(struct pci_de static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) { dev_info(&dev->dev, - "AMD756: dev [%04x/%04x], router PIRQ %d set IRQ %d\n", + "AMD756: dev [%04x:%04x], router PIRQ %d set IRQ %d\n", dev->vendor, dev->device, pirq, irq); if (pirq <= 4) write_config_nybble(router, 0x56, pirq - 1, irq); @@ -596,13 +596,20 @@ static __init int intel_router_probe(str case PCI_DEVICE_ID_INTEL_ICH10_1: case PCI_DEVICE_ID_INTEL_ICH10_2: case PCI_DEVICE_ID_INTEL_ICH10_3: - case PCI_DEVICE_ID_INTEL_PCH_0: - case PCI_DEVICE_ID_INTEL_PCH_1: r->name = "PIIX/ICH"; r->get = pirq_piix_get; r->set = pirq_piix_set; return 1; } + + if ((device >= PCI_DEVICE_ID_INTEL_PCH_LPC_MIN) && + (device <= PCI_DEVICE_ID_INTEL_PCH_LPC_MAX)) { + r->name = "PIIX/ICH"; + r->get = pirq_piix_get; + r->set = pirq_piix_set; + return 1; + } + return 0; } @@ -829,7 +836,7 @@ static void __init pirq_find_router(stru r->get = NULL; r->set = NULL; - DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n", + DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for [%04x:%04x]\n", rt->rtr_vendor, rt->rtr_device); pirq_router_dev = pci_get_bus_and_slot(rt->rtr_bus, rt->rtr_devfn); @@ -849,7 +856,7 @@ static void __init pirq_find_router(stru h->probe(r, pirq_router_dev, pirq_router_dev->device)) break; } - dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x/%04x]\n", + dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x:%04x]\n", pirq_router.name, pirq_router_dev->vendor, pirq_router_dev->device); @@ -1049,35 +1056,44 @@ static void __init pcibios_fixup_irqs(vo if (io_apic_assign_pci_irqs) { int irq; - if (pin) { - /* - * interrupt pins are numbered starting - * from 1 - */ - pin--; - irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, - PCI_SLOT(dev->devfn), pin); - /* - * Busses behind bridges are typically not listed in the MP-table. - * In this case we have to look up the IRQ based on the parent bus, - * parent slot, and pin number. The SMP code detects such bridged - * busses itself so we should get into this branch reliably. - */ - if (irq < 0 && dev->bus->parent) { /* go back to the bridge */ - struct pci_dev *bridge = dev->bus->self; + if (!pin) + continue; + + /* + * interrupt pins are numbered starting from 1 + */ + pin--; + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, + PCI_SLOT(dev->devfn), pin); + /* + * Busses behind bridges are typically not listed in the + * MP-table. In this case we have to look up the IRQ + * based on the parent bus, parent slot, and pin number. + * The SMP code detects such bridged busses itself so we + * should get into this branch reliably. + */ + if (irq < 0 && dev->bus->parent) { + /* go back to the bridge */ + struct pci_dev *bridge = dev->bus->self; + int bus; - pin = (pin + PCI_SLOT(dev->devfn)) % 4; - irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, - PCI_SLOT(bridge->devfn), pin); - if (irq >= 0) - dev_warn(&dev->dev, "using bridge %s INT %c to get IRQ %d\n", - pci_name(bridge), - 'A' + pin, irq); - } - if (irq >= 0) { - dev_info(&dev->dev, "PCI->APIC IRQ transform: INT %c -> IRQ %d\n", 'A' + pin, irq); - dev->irq = irq; - } + pin = (pin + PCI_SLOT(dev->devfn)) % 4; + bus = bridge->bus->number; + irq = IO_APIC_get_PCI_irq_vector(bus, + PCI_SLOT(bridge->devfn), pin); + if (irq >= 0) + dev_warn(&dev->dev, + "using bridge %s INT %c to " + "get IRQ %d\n", + pci_name(bridge), + 'A' + pin, irq); + } + if (irq >= 0) { + dev_info(&dev->dev, + "PCI->APIC IRQ transform: INT %c " + "-> IRQ %d\n", + 'A' + pin, irq); + dev->irq = irq; } } #endif --- head-2011-03-17.orig/arch/x86/xen/Kconfig 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/xen/Kconfig 2011-02-01 14:39:24.000000000 +0100 @@ -43,7 +43,7 @@ config XEN_SAVE_RESTORE config XEN_DEBUG_FS bool "Enable Xen debug and tuning parameters in debugfs" - depends on XEN && DEBUG_FS + depends on PARAVIRT_XEN && DEBUG_FS default n help Enable statistics output and various tuning options in debugfs. --- head-2011-03-17.orig/drivers/acpi/acpica/hwsleep.c 2011-01-31 17:01:49.000000000 +0100 +++ head-2011-03-17/drivers/acpi/acpica/hwsleep.c 2011-02-01 14:39:24.000000000 +0100 @@ -396,8 +396,7 @@ acpi_status asmlinkage acpi_enter_sleep_ err = acpi_notify_hypervisor_state(sleep_state, PM1Acontrol, PM1Bcontrol); if (err) { - ACPI_DEBUG_PRINT((ACPI_DB_ERROR, - "Hypervisor failure [%d]\n", err)); + printk(KERN_ERR "ACPI: Hypervisor failure [%d]\n", err); return_ACPI_STATUS(AE_ERROR); } #endif --- head-2011-03-17.orig/drivers/acpi/processor_core.c 2011-01-31 17:56:27.000000000 +0100 +++ head-2011-03-17/drivers/acpi/processor_core.c 2011-02-01 14:39:24.000000000 +0100 @@ -165,13 +165,20 @@ exit: int acpi_get_cpuid(acpi_handle handle, int type, u32 acpi_id) { - int i; + int i = 0; int apic_id = -1; + if (type < 0) { + if (!processor_cntl_external()) + return -1; + type = ~type; + i = 1; + } + apic_id = map_mat_entry(handle, type, acpi_id); if (apic_id == -1) apic_id = map_madt_entry(type, acpi_id); - if (apic_id == -1) + if (apic_id == -1 || i) return apic_id; #ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL --- head-2011-03-17.orig/drivers/acpi/processor_driver.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/drivers/acpi/processor_driver.c 2011-02-01 14:39:24.000000000 +0100 @@ -326,7 +326,8 @@ static int acpi_processor_get_info(struc if (pr->id == -1) { if (ACPI_FAILURE (acpi_processor_hotadd_init(pr->handle, &pr->id)) && - !processor_cntl_external()) { + get_cpu_id(pr->handle, ~device_declaration, + pr->acpi_id) < 0) { return -ENODEV; } } --- head-2011-03-17.orig/drivers/acpi/processor_extcntl.c 2011-01-31 17:56:27.000000000 +0100 +++ head-2011-03-17/drivers/acpi/processor_extcntl.c 2011-02-01 14:39:24.000000000 +0100 @@ -30,7 +30,6 @@ #include -#define ACPI_PROCESSOR_COMPONENT 0x01000000 #define ACPI_PROCESSOR_CLASS "processor" #define _COMPONENT ACPI_PROCESSOR_COMPONENT ACPI_MODULE_NAME("processor_extcntl") --- head-2011-03-17.orig/drivers/char/agp/generic.c 2011-03-17 14:35:46.000000000 +0100 +++ head-2011-03-17/drivers/char/agp/generic.c 2011-02-01 14:39:24.000000000 +0100 @@ -1217,7 +1217,7 @@ int agp_generic_alloc_pages(struct agp_b } #ifdef CONFIG_X86 - set_pages_array_uc(mem->pages, num_pages); + map_pages_into_agp(mem->pages, num_pages); #endif ret = 0; out: @@ -1250,7 +1250,7 @@ void agp_generic_destroy_pages(struct ag return; #ifdef CONFIG_X86 - set_pages_array_wb(mem->pages, mem->page_count); + unmap_pages_from_agp(mem->pages, mem->page_count); #endif for (i = 0; i < mem->page_count; i++) { --- head-2011-03-17.orig/drivers/firmware/dmi_scan.c 2011-03-17 14:35:46.000000000 +0100 +++ head-2011-03-17/drivers/firmware/dmi_scan.c 2011-02-17 10:11:37.000000000 +0100 @@ -482,6 +482,11 @@ static bool dmi_matches(const struct dmi { int i; +#ifdef CONFIG_XEN + if (!is_initial_xendomain()) + return false; +#endif + WARN(!dmi_initialized, KERN_ERR "dmi check: not initialized yet.\n"); for (i = 0; i < ARRAY_SIZE(dmi->matches); i++) { --- head-2011-03-17.orig/drivers/idle/Kconfig 2011-03-17 14:35:46.000000000 +0100 +++ head-2011-03-17/drivers/idle/Kconfig 2011-02-01 14:39:24.000000000 +0100 @@ -10,7 +10,7 @@ config INTEL_IDLE processors intel_idle does not support. menu "Memory power savings" -depends on X86_64 +depends on X86_64 && !XEN config I7300_IDLE_IOAT_CHANNEL bool --- head-2011-03-17.orig/drivers/pci/msi-xen.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/drivers/pci/msi-xen.c 2011-02-01 14:39:24.000000000 +0100 @@ -266,8 +266,16 @@ static int msi_map_vector(struct pci_dev * dev->irq in dom0 will be 'Xen pirq' if this device belongs to * to another domain, and will be 'Linux irq' if it belongs to dom0. */ - return ((domid != DOMID_SELF) ? - map_irq.pirq : evtchn_map_pirq(-1, map_irq.pirq)); + if (domid == DOMID_SELF) { + rc = evtchn_map_pirq(-1, map_irq.pirq); + dev_printk(KERN_DEBUG, &dev->dev, + "irq %d (%d) for MSI/MSI-X\n", + rc, map_irq.pirq); + return rc; + } + dev_printk(KERN_DEBUG, &dev->dev, "irq %d for dom%d MSI/MSI-X\n", + map_irq.pirq, domid); + return map_irq.pirq; } static void pci_intx_for_msi(struct pci_dev *dev, int enable) @@ -722,3 +730,24 @@ void pci_msi_init_pci_dev(struct pci_dev INIT_LIST_HEAD(&dev->msi_list); #endif } + +#ifdef CONFIG_ACPI +#include +#include +static void __devinit msi_acpi_init(void) +{ + if (acpi_pci_disabled) + return; + pci_osc_support_set(OSC_MSI_SUPPORT); + pcie_osc_support_set(OSC_MSI_SUPPORT); +} +#else +static inline void msi_acpi_init(void) { } +#endif /* CONFIG_ACPI */ + +void __devinit msi_init(void) +{ + if (!pci_msi_enable) + return; + msi_acpi_init(); +} --- head-2011-03-17.orig/drivers/pci/probe.c 2011-03-17 14:35:46.000000000 +0100 +++ head-2011-03-17/drivers/pci/probe.c 2011-02-01 14:39:24.000000000 +0100 @@ -1214,6 +1214,11 @@ static void pci_init_capabilities(struct /* Vital Product Data */ pci_vpd_pci22_init(dev); +#ifdef CONFIG_XEN + if (!is_initial_xendomain()) + return; +#endif + /* Alternative Routing-ID Forwarding */ pci_enable_ari(dev); --- head-2011-03-17.orig/drivers/xen/Kconfig 2011-01-31 18:07:35.000000000 +0100 +++ head-2011-03-17/drivers/xen/Kconfig 2011-02-02 15:36:33.000000000 +0100 @@ -344,9 +344,6 @@ config XEN_SMPBOOT def_bool y depends on SMP && !PPC_XEN -config XEN_XENCOMM - bool - config XEN_DEVMEM def_bool y @@ -452,4 +449,7 @@ config SWIOTLB_XEN depends on PCI select SWIOTLB +config XEN_XENCOMM + bool + endmenu --- head-2011-03-17.orig/drivers/xen/Makefile 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/drivers/xen/Makefile 2011-02-01 14:39:24.000000000 +0100 @@ -1,4 +1,5 @@ obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o features.o events.o manage.o +xen-hotplug-$(CONFIG_PARAVIRT_XEN) := cpu_hotplug.o xen-balloon-$(CONFIG_PARAVIRT_XEN) := balloon.o xen-balloon-$(CONFIG_XEN) := balloon/ @@ -11,6 +12,7 @@ obj-$(CONFIG_XEN) += char/ xen-backend-$(CONFIG_XEN_BACKEND) := util.o obj-$(CONFIG_XEN) += features.o $(xen-backend-y) $(xen-backend-m) +obj-$(CONFIG_HOTPLUG_CPU) += $(xen-hotplug-y) obj-$(CONFIG_XEN_XENCOMM) += xencomm.o obj-$(CONFIG_XEN_BALLOON) += $(xen-balloon-y) obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/ --- head-2011-03-17.orig/drivers/xen/blkback/vbd.c 2011-01-31 17:56:27.000000000 +0100 +++ head-2011-03-17/drivers/xen/blkback/vbd.c 2011-02-01 14:39:24.000000000 +0100 @@ -95,7 +95,8 @@ int vbd_create(blkif_t *blkif, blkif_vde void vbd_free(struct vbd *vbd) { if (vbd->bdev) - blkdev_put(vbd->bdev); + blkdev_put(vbd->bdev, + vbd->readonly ? FMODE_READ : FMODE_WRITE); vbd->bdev = NULL; } --- head-2011-03-17.orig/drivers/xen/blkfront/blkfront.c 2011-01-31 18:07:35.000000000 +0100 +++ head-2011-03-17/drivers/xen/blkfront/blkfront.c 2011-02-01 14:39:24.000000000 +0100 @@ -342,6 +342,7 @@ static void connect(struct blkfront_info return; pr_info("Setting capacity to %Lu\n", sectors); set_capacity(info->gd, sectors); + revalidate_disk(info->gd); /* fall through */ case BLKIF_STATE_SUSPENDED: @@ -500,9 +501,15 @@ static void blkif_restart_queue_callback schedule_work(&info->work); } +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) int blkif_open(struct inode *inode, struct file *filep) { - struct blkfront_info *info = inode->i_bdev->bd_disk->private_data; + struct block_device *bd = inode->i_bdev; +#else +int blkif_open(struct block_device *bd, fmode_t mode) +{ +#endif + struct blkfront_info *info = bd->bd_disk->private_data; if (!info->xbdev) return -ENODEV; @@ -511,9 +518,16 @@ int blkif_open(struct inode *inode, stru } +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) int blkif_release(struct inode *inode, struct file *filep) { - struct blkfront_info *info = inode->i_bdev->bd_disk->private_data; + struct gendisk *disk = inode->i_bdev->bd_disk; +#else +int blkif_release(struct gendisk *disk, fmode_t mode) +{ +#endif + struct blkfront_info *info = disk->private_data; + info->users--; if (info->users == 0) { /* Check whether we have been instructed to close. We will @@ -532,10 +546,17 @@ int blkif_release(struct inode *inode, s } +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) int blkif_ioctl(struct inode *inode, struct file *filep, unsigned command, unsigned long argument) { - struct blkfront_info *info = inode->i_bdev->bd_disk->private_data; + struct block_device *bd = inode->i_bdev; +#else +int blkif_ioctl(struct block_device *bd, fmode_t mode, + unsigned command, unsigned long argument) +{ +#endif + struct blkfront_info *info = bd->bd_disk->private_data; int i; DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n", @@ -544,7 +565,6 @@ int blkif_ioctl(struct inode *inode, str switch (command) { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16) case HDIO_GETGEO: { - struct block_device *bd = inode->i_bdev; struct hd_geometry geo; int ret; @@ -586,10 +606,14 @@ int blkif_ioctl(struct inode *inode, str #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23) return scsi_cmd_ioctl(filep, info->gd, command, (void __user *)argument); -#else +#elif LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) return scsi_cmd_ioctl(filep, info->rq, info->gd, command, (void __user *)argument); +#else + return scsi_cmd_ioctl(info->rq, info->gd, + mode, command, + (void __user *)argument); #endif } } --- head-2011-03-17.orig/drivers/xen/blkfront/block.h 2011-01-31 18:07:35.000000000 +0100 +++ head-2011-03-17/drivers/xen/blkfront/block.h 2011-02-01 14:39:24.000000000 +0100 @@ -123,10 +123,17 @@ struct blkfront_info extern spinlock_t blkif_io_lock; +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) extern int blkif_open(struct inode *inode, struct file *filep); extern int blkif_release(struct inode *inode, struct file *filep); extern int blkif_ioctl(struct inode *inode, struct file *filep, unsigned command, unsigned long argument); +#else +extern int blkif_open(struct block_device *bdev, fmode_t mode); +extern int blkif_release(struct gendisk *disk, fmode_t mode); +extern int blkif_ioctl(struct block_device *bdev, fmode_t mode, + unsigned command, unsigned long argument); +#endif extern int blkif_getgeo(struct block_device *, struct hd_geometry *); extern int blkif_check(dev_t dev); extern int blkif_revalidate(dev_t dev); --- head-2011-03-17.orig/drivers/xen/blktap2/device.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/drivers/xen/blktap2/device.c 2011-02-07 14:14:26.000000000 +0100 @@ -36,10 +36,10 @@ dev_to_blktap(struct blktap_device *dev) } static int -blktap_device_open(struct inode *inode, struct file *filep) +blktap_device_open(struct block_device *bd, fmode_t mode) { struct blktap *tap; - struct blktap_device *dev = inode->i_bdev->bd_disk->private_data; + struct blktap_device *dev = bd->bd_disk->private_data; if (!dev) return -ENOENT; @@ -55,9 +55,9 @@ blktap_device_open(struct inode *inode, } static int -blktap_device_release(struct inode *inode, struct file *filep) +blktap_device_release(struct gendisk *disk, fmode_t mode) { - struct blktap_device *dev = inode->i_bdev->bd_disk->private_data; + struct blktap_device *dev = disk->private_data; struct blktap *tap = dev_to_blktap(dev); dev->users--; @@ -85,18 +85,17 @@ blktap_device_getgeo(struct block_device } static int -blktap_device_ioctl(struct inode *inode, struct file *filep, +blktap_device_ioctl(struct block_device *bd, fmode_t mode, unsigned command, unsigned long argument) { int i; - DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n", - command, (long)argument, inode->i_rdev); + DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx\n", + command, (long)argument); switch (command) { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16) case HDIO_GETGEO: { - struct block_device *bd = inode->i_bdev; struct hd_geometry geo; int ret; @@ -762,7 +761,7 @@ blktap_device_close_bdev(struct blktap * dev = &tap->device; if (dev->bdev) - blkdev_put(dev->bdev); + blkdev_put(dev->bdev, FMODE_WRITE); dev->bdev = NULL; clear_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse); @@ -786,7 +785,7 @@ blktap_device_open_bdev(struct blktap *t if (!bdev->bd_disk) { BTERR("device %x:%x doesn't exist\n", MAJOR(pdev), MINOR(pdev)); - blkdev_put(dev->bdev); + blkdev_put(bdev, FMODE_WRITE); return -ENOENT; } --- head-2011-03-17.orig/drivers/xen/core/evtchn.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/drivers/xen/core/evtchn.c 2011-02-01 14:39:24.000000000 +0100 @@ -149,7 +149,7 @@ static void bind_evtchn_to_cpu(unsigned BUG_ON(!test_bit(chn, s->evtchn_mask)); if (irq != -1) - irq_desc[irq].affinity = cpumask_of_cpu(cpu); + irq_to_desc(irq)->affinity = cpumask_of_cpu(cpu); clear_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu_evtchn[chn]]); set_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu]); @@ -162,7 +162,7 @@ static void init_evtchn_cpu_bindings(voi /* By default all event channels notify CPU#0. */ for (i = 0; i < NR_IRQS; i++) - irq_desc[i].affinity = cpumask_of_cpu(0); + irq_to_desc(i)->affinity = cpumask_of_cpu(0); memset(cpu_evtchn, 0, sizeof(cpu_evtchn)); for_each_possible_cpu(i) @@ -747,7 +747,7 @@ static void ack_dynirq(unsigned int irq) static void end_dynirq(unsigned int irq) { - if (!(irq_desc[irq].status & IRQ_DISABLED)) { + if (!(irq_to_desc(irq)->status & IRQ_DISABLED)) { move_masked_irq(irq); unmask_dynirq(irq); } @@ -841,7 +841,7 @@ static void enable_pirq(unsigned int irq bind_pirq.pirq = evtchn_get_xen_pirq(irq); /* NB. We are happy to share unless we are probing. */ bind_pirq.flags = test_and_clear_bit(irq - PIRQ_BASE, probing_pirq) - || (irq_desc[irq].status & IRQ_AUTODETECT) + || (irq_to_desc(irq)->status & IRQ_AUTODETECT) ? 0 : BIND_PIRQ__WILL_SHARE; if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq) != 0) { if (bind_pirq.flags) @@ -900,11 +900,13 @@ static void unmask_pirq(unsigned int irq static void end_pirq(unsigned int irq) { - if ((irq_desc[irq].status & (IRQ_DISABLED|IRQ_PENDING)) == + const struct irq_desc *desc = irq_to_desc(irq); + + if ((desc->status & (IRQ_DISABLED|IRQ_PENDING)) == (IRQ_DISABLED|IRQ_PENDING)) shutdown_pirq(irq); else { - if (!(irq_desc[irq].status & IRQ_DISABLED)) + if (!(desc->status & IRQ_DISABLED)) move_masked_irq(irq); unmask_pirq(irq); } @@ -1051,7 +1053,7 @@ static void restore_cpu_ipis(unsigned in bind_evtchn_to_cpu(evtchn, cpu); /* Ready for use. */ - if (!(irq_desc[irq].status & IRQ_DISABLED)) + if (!(irq_to_desc(irq)->status & IRQ_DISABLED)) unmask_evtchn(evtchn); } } @@ -1187,7 +1189,7 @@ void __init xen_init_IRQ(void) for (i = DYNIRQ_BASE; i < (DYNIRQ_BASE + NR_DYNIRQS); i++) { irq_bindcount[i] = 0; - irq_desc[i].status |= IRQ_NOPROBE; + irq_to_desc(i)->status |= IRQ_NOPROBE; set_irq_chip_and_handler_name(i, &dynirq_chip, handle_fasteoi_irq, "fasteoi"); } --- head-2011-03-17.orig/drivers/xen/core/smpboot.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/drivers/xen/core/smpboot.c 2011-02-01 14:39:24.000000000 +0100 @@ -25,10 +25,6 @@ #include #include -extern irqreturn_t smp_reschedule_interrupt(int, void *); -extern irqreturn_t smp_call_function_interrupt(int, void *); -extern irqreturn_t smp_call_function_single_interrupt(int, void *); - extern int local_setup_timer(unsigned int cpu); extern void local_teardown_timer(unsigned int cpu); @@ -147,7 +143,7 @@ static void __cpuinit xen_smp_intr_exit( } #endif -void __cpuinit cpu_bringup(void) +static void __cpuinit cpu_bringup(void) { cpu_init(); identify_secondary_cpu(¤t_cpu_data); @@ -381,6 +377,20 @@ int __cpuinit __cpu_up(unsigned int cpu) return 0; } +void __ref play_dead(void) +{ + idle_task_exit(); + local_irq_disable(); + cpu_clear(smp_processor_id(), cpu_initialized); + preempt_enable_no_resched(); + VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL)); +#ifdef CONFIG_HOTPLUG_CPU + cpu_bringup(); +#else + BUG(); +#endif +} + void __init smp_cpus_done(unsigned int max_cpus) { } --- head-2011-03-17.orig/drivers/xen/netfront/netfront.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/drivers/xen/netfront/netfront.c 2011-02-01 14:39:24.000000000 +0100 @@ -952,7 +952,7 @@ static int network_start_xmit(struct sk_ return 0; } - frags += (offset + len + PAGE_SIZE - 1) / PAGE_SIZE; + frags += DIV_ROUND_UP(offset + len, PAGE_SIZE); if (unlikely(frags > MAX_SKB_FRAGS + 1)) { pr_alert("xennet: skb rides the rocket: %d frags\n", frags); dump_stack(); --- head-2011-03-17.orig/drivers/xen/scsifront/scsifront.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-17/drivers/xen/scsifront/scsifront.c 2011-02-08 10:04:41.000000000 +0100 @@ -352,7 +352,7 @@ static int scsifront_queuecommand(struct memset(ring_req->cmnd, 0, VSCSIIF_MAX_COMMAND_SIZE); ring_req->sc_data_direction = (uint8_t)sc->sc_data_direction; - ring_req->timeout_per_command = (sc->timeout_per_command / HZ); + ring_req->timeout_per_command = (sc->request->timeout / HZ); info->shadow[rqid].req_scsi_cmnd = (unsigned long)sc; info->shadow[rqid].sc_data_direction = sc->sc_data_direction; @@ -421,7 +421,7 @@ static int scsifront_dev_reset_handler(s memset(ring_req->cmnd, 0, VSCSIIF_MAX_COMMAND_SIZE); ring_req->sc_data_direction = (uint8_t)sc->sc_data_direction; - ring_req->timeout_per_command = (sc->timeout_per_command / HZ); + ring_req->timeout_per_command = (sc->request->timeout / HZ); ring_req->nr_segments = 0; scsifront_do_request(info); --- head-2011-03-17.orig/drivers/xen/xenbus/xenbus_probe.h 2011-01-31 17:49:31.000000000 +0100 +++ head-2011-03-17/drivers/xen/xenbus/xenbus_probe.h 2011-02-01 14:39:24.000000000 +0100 @@ -40,6 +40,11 @@ #define XEN_BUS_ID_SIZE BUS_ID_SIZE #endif +#ifdef CONFIG_PARAVIRT_XEN +#define is_running_on_xen() xen_domain() +#define is_initial_xendomain() xen_initial_domain() +#endif + #if defined(CONFIG_XEN_BACKEND) || defined(CONFIG_XEN_BACKEND_MODULE) extern void xenbus_backend_suspend(int (*fn)(struct device *, void *)); extern void xenbus_backend_resume(int (*fn)(struct device *, void *)); --- head-2011-03-17.orig/include/xen/cpu_hotplug.h 2011-01-31 17:56:27.000000000 +0100 +++ head-2011-03-17/include/xen/cpu_hotplug.h 2011-02-01 14:39:24.000000000 +0100 @@ -15,8 +15,6 @@ void init_xenbus_allowed_cpumask(void); int smp_suspend(void); void smp_resume(void); -void cpu_bringup(void); - #else /* !defined(CONFIG_HOTPLUG_CPU) */ #define cpu_up_check(cpu) (0) --- head-2011-03-17.orig/lib/swiotlb-xen.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/lib/swiotlb-xen.c 2011-02-01 14:39:24.000000000 +0100 @@ -57,7 +57,6 @@ enum dma_sync_target { int swiotlb_force; -static char *iotlb_virt_start; static unsigned long iotlb_nslabs; /* @@ -65,16 +64,7 @@ static unsigned long iotlb_nslabs; * swiotlb_sync_single_*, to see if the memory was in fact allocated by this * API. */ -static unsigned long iotlb_pfn_start, iotlb_pfn_end; - -/* Does the given dma address reside within the swiotlb aperture? */ -static inline int in_swiotlb_aperture(dma_addr_t dev_addr) -{ - unsigned long pfn = mfn_to_local_pfn(dev_addr >> PAGE_SHIFT); - return (pfn_valid(pfn) - && (pfn >= iotlb_pfn_start) - && (pfn < iotlb_pfn_end)); -} +static char *io_tlb_start, *io_tlb_end; /* * When the IOMMU overflows we return a fallback buffer. This sets the size. @@ -159,15 +149,15 @@ swiotlb_init_with_default_size(size_t de /* * Get IO TLB memory from the low pages */ - iotlb_virt_start = alloc_bootmem_pages(bytes); - if (!iotlb_virt_start) + io_tlb_start = alloc_bootmem_pages(bytes); + if (!io_tlb_start) panic("Cannot allocate SWIOTLB buffer!\n"); dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT; for (i = 0; i < iotlb_nslabs; i += IO_TLB_SEGSIZE) { do { rc = xen_create_contiguous_region( - (unsigned long)iotlb_virt_start + (i << IO_TLB_SHIFT), + (unsigned long)io_tlb_start + (i << IO_TLB_SHIFT), get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT), dma_bits); } while (rc && dma_bits++ < max_dma_bits); @@ -178,10 +168,10 @@ swiotlb_init_with_default_size(size_t de "some DMA memory (e.g., dom0_mem=-128M).\n"); iotlb_nslabs = i; i <<= IO_TLB_SHIFT; - free_bootmem(__pa(iotlb_virt_start + i), bytes - i); + free_bootmem(__pa(io_tlb_start + i), bytes - i); bytes = i; for (dma_bits = 0; i > 0; i -= IO_TLB_SEGSIZE << IO_TLB_SHIFT) { - unsigned int bits = fls64(virt_to_bus(iotlb_virt_start + i - 1)); + unsigned int bits = fls64(virt_to_bus(io_tlb_start + i - 1)); if (bits > dma_bits) dma_bits = bits; @@ -189,6 +179,7 @@ swiotlb_init_with_default_size(size_t de break; } } + io_tlb_end = io_tlb_start + bytes; /* * Allocate and initialize the free list array. This array is used @@ -217,15 +208,12 @@ swiotlb_init_with_default_size(size_t de if (rc) panic("No suitable physical memory available for SWIOTLB overflow buffer!\n"); - iotlb_pfn_start = __pa(iotlb_virt_start) >> PAGE_SHIFT; - iotlb_pfn_end = iotlb_pfn_start + (bytes >> PAGE_SHIFT); - printk(KERN_INFO "Software IO TLB enabled: \n" " Aperture: %lu megabytes\n" " Kernel range: %p - %p\n" " Address size: %u bits\n", bytes >> 20, - iotlb_virt_start, iotlb_virt_start + bytes, + io_tlb_start, io_tlb_end, dma_bits); } @@ -253,6 +241,18 @@ swiotlb_init(void) printk(KERN_INFO "Software IO TLB disabled\n"); } +static int is_swiotlb_buffer(dma_addr_t addr) +{ + unsigned long pfn = mfn_to_local_pfn(PFN_DOWN(addr)); + char *va = pfn_valid(pfn) ? __va(pfn << PAGE_SHIFT) : NULL; + +#ifdef CONFIG_HIGHMEM + if (pfn >= highstart_pfn) + return 0; +#endif + return va >= io_tlb_start && va < io_tlb_end; +} + /* * We use __copy_to_user_inatomic to transfer to the host buffer because the * buffer may be mapped read-only (e.g, in blkback driver) but lower-level @@ -362,7 +362,7 @@ map_single(struct device *hwdev, struct io_tlb_list[i] = 0; for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--) io_tlb_list[i] = ++count; - dma_addr = iotlb_virt_start + (index << IO_TLB_SHIFT); + dma_addr = io_tlb_start + (index << IO_TLB_SHIFT); /* * Update the indices to avoid searching in the next @@ -404,7 +404,7 @@ found: static struct phys_addr dma_addr_to_phys_addr(char *dma_addr) { - int index = (dma_addr - iotlb_virt_start) >> IO_TLB_SHIFT; + int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; struct phys_addr buffer = io_tlb_orig_addr[index]; buffer.offset += (long)dma_addr & ((1 << IO_TLB_SHIFT) - 1); buffer.page += buffer.offset >> PAGE_SHIFT; @@ -420,7 +420,7 @@ unmap_single(struct device *hwdev, char { unsigned long flags; int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; - int index = (dma_addr - iotlb_virt_start) >> IO_TLB_SHIFT; + int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; struct phys_addr buffer = dma_addr_to_phys_addr(dma_addr); /* @@ -527,7 +527,7 @@ _swiotlb_map_single(struct device *hwdev * buffering it. */ if (!range_straddles_page_boundary(paddr, size) && - !address_needs_mapping(hwdev, dev_addr)) + !address_needs_mapping(hwdev, dev_addr, size)) return dev_addr; /* @@ -578,9 +578,11 @@ void swiotlb_unmap_single_attrs(struct device *hwdev, dma_addr_t dev_addr, size_t size, int dir, struct dma_attrs *attrs) { + char *dma_addr = bus_to_virt(dev_addr); + BUG_ON(dir == DMA_NONE); - if (in_swiotlb_aperture(dev_addr)) - unmap_single(hwdev, bus_to_virt(dev_addr), size, dir); + if (is_swiotlb_buffer(dev_addr)) + unmap_single(hwdev, dma_addr, size, dir); else gnttab_dma_unmap_page(dev_addr); } @@ -606,9 +608,11 @@ static void swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, size_t size, int dir, int target) { + char *dma_addr = bus_to_virt(dev_addr); + BUG_ON(dir == DMA_NONE); - if (in_swiotlb_aperture(dev_addr)) - sync_single(hwdev, bus_to_virt(dev_addr), size, dir, target); + if (is_swiotlb_buffer(dev_addr)) + sync_single(hwdev, dma_addr, size, dir, target); } void @@ -633,10 +637,11 @@ swiotlb_sync_single_range(struct device unsigned long offset, size_t size, int dir, int target) { + char *dma_addr = bus_to_virt(dev_addr); + BUG_ON(dir == DMA_NONE); - if (in_swiotlb_aperture(dev_addr)) - sync_single(hwdev, bus_to_virt(dev_addr + offset), size, - dir, target); + if (is_swiotlb_buffer(dev_addr)) + sync_single(hwdev, dma_addr + offset, size, dir, target); } void @@ -690,7 +695,7 @@ swiotlb_map_sg_attrs(struct device *hwde if (range_straddles_page_boundary(page_to_pseudophys(sg_page(sg)) + sg->offset, sg->length) - || address_needs_mapping(hwdev, dev_addr)) { + || address_needs_mapping(hwdev, dev_addr, sg->length)) { gnttab_dma_unmap_page(dev_addr); buffer.page = sg_page(sg); buffer.offset = sg->offset; @@ -734,7 +739,7 @@ swiotlb_unmap_sg_attrs(struct device *hw BUG_ON(dir == DMA_NONE); for_each_sg(sgl, sg, nelems, i) { - if (in_swiotlb_aperture(sg->dma_address)) + if (sg->dma_address != sg_phys(sg)) unmap_single(hwdev, bus_to_virt(sg->dma_address), sg->dma_length, dir); else @@ -767,7 +772,7 @@ swiotlb_sync_sg(struct device *hwdev, st BUG_ON(dir == DMA_NONE); for_each_sg(sgl, sg, nelems, i) { - if (in_swiotlb_aperture(sg->dma_address)) + if (sg->dma_address != sg_phys(sg)) sync_single(hwdev, bus_to_virt(sg->dma_address), sg->dma_length, dir, target); } --- head-2011-03-17.orig/mm/vmalloc.c 2011-01-31 17:32:29.000000000 +0100 +++ head-2011-03-17/mm/vmalloc.c 2011-02-01 14:39:24.000000000 +0100 @@ -478,6 +478,8 @@ static void vmap_debug_free_range(unsign #ifdef CONFIG_DEBUG_PAGEALLOC vunmap_page_range(start, end); flush_tlb_kernel_range(start, end); +#elif defined(CONFIG_XEN) && defined(CONFIG_X86) + vunmap_page_range(start, end); #endif }