From: Linux Kernel Mailing List
Subject: Linux: 2.6.32
Patch-mainline: 2.6.32

 This patch contains the differences between 2.6.31 and 2.6.32.

Acked-by: Jeff Mahoney
Automatically created from "patches.kernel.org/patch-2.6.32" by xen-port-patches.py

--- head-2010-05-25.orig/arch/x86/ia32/ia32entry-xen.S	2010-03-24 15:25:21.000000000 +0100
+++ head-2010-05-25/arch/x86/ia32/ia32entry-xen.S	2010-03-24 15:32:27.000000000 +0100
@@ -20,18 +20,15 @@
 #define AUDIT_ARCH_I386	(EM_386|__AUDIT_ARCH_LE)
 #define __AUDIT_ARCH_LE	0x40000000
 
-#ifndef CONFIG_AUDITSYSCALL
-#define sysexit_audit	int_ret_from_sys_call
-#define sysretl_audit	int_ret_from_sys_call
-#endif
-
 #define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
 
 	.macro IA32_ARG_FIXUP noebp=0
 	movl	%edi,%r8d
 	.if \noebp
+	jmp	ia32_common
 	.else
 	movl	%ebp,%r9d
+ia32_common:
 	.endif
 	xchg	%ecx,%esi
 	movl	%ebx,%edi
@@ -39,12 +36,12 @@ .endm
 
 	/* clobbers %eax */
-	.macro  CLEAR_RREGS _r9=rax
+	.macro  CLEAR_RREGS offset=0, _r9=rax
 	xorl	%eax,%eax
-	movq	%rax,R11(%rsp)
-	movq	%rax,R10(%rsp)
-	movq	%\_r9,R9(%rsp)
-	movq	%rax,R8(%rsp)
+	movq	%rax,\offset+R11(%rsp)
+	movq	%rax,\offset+R10(%rsp)
+	movq	%\_r9,\offset+R9(%rsp)
+	movq	%rax,\offset+R8(%rsp)
 	.endm
 
 	/*
@@ -144,17 +141,7 @@ ENTRY(ia32_sysenter_target)
 	jnz sysenter_tracesys
 	cmpl	$(IA32_NR_syscalls-1),%eax
 	ja	ia32_badsys
-sysenter_do_call:
-	IA32_ARG_FIXUP
-sysenter_dispatch:
-	call	*ia32_sys_call_table(,%rax,8)
-	movq	%rax,RAX-ARGOFFSET(%rsp)
-	GET_THREAD_INFO(%r10)
-	DISABLE_INTERRUPTS(CLBR_NONE)
-	TRACE_IRQS_OFF
-	testl	$_TIF_ALLWORK_MASK,TI_flags(%r10)
-	jnz	sysexit_audit
-	jmp	int_ret_from_sys_call
+	jmp	ia32_do_call
 
 #ifdef CONFIG_AUDITSYSCALL
 	.macro auditsys_entry_common
@@ -175,31 +162,10 @@ sysenter_dispatch:
 	movl RDI-ARGOFFSET(%rsp),%r8d	/* reload 5th syscall arg */
 	.endm
 
-	.macro auditsys_exit exit,ebpsave=RBP
-	testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
-	jnz int_ret_from_sys_call
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_NONE)
-	movl %eax,%esi		/* second arg, syscall return value */
-	cmpl $0,%eax		/* is it < 0? */
-	setl %al		/* 1 if so, 0 if not */
-	movzbl %al,%edi		/* zero-extend that into %edi */
-	inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
-	call audit_syscall_exit
-	movl \ebpsave-ARGOFFSET(%rsp),%ebp /* reload user register value */
-	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
-	DISABLE_INTERRUPTS(CLBR_NONE)
-	TRACE_IRQS_OFF
-	jmp int_with_check
-	.endm
-
 sysenter_auditsys:
 	auditsys_entry_common
 	movl %ebp,%r9d		/* reload 6th syscall arg */
-	jmp sysenter_dispatch
-
-sysexit_audit:
-	auditsys_exit sysexit_from_sys_call
+	jmp ia32_dispatch
 #endif
 
 sysenter_tracesys:
@@ -216,7 +182,7 @@ sysenter_tracesys:
 	RESTORE_REST
 	cmpl	$(IA32_NR_syscalls-1),%eax
 	ja	int_ret_from_sys_call	/* sysenter_tracesys has set RAX(%rsp) */
-	jmp	sysenter_do_call
+	jmp	ia32_do_call
 	CFI_ENDPROC
 ENDPROC(ia32_sysenter_target)
 
@@ -272,24 +238,13 @@ ENTRY(ia32_cstar_target)
 	ja  ia32_badsys
 cstar_do_call:
 	IA32_ARG_FIXUP 1
-cstar_dispatch:
-	call *ia32_sys_call_table(,%rax,8)
-	movq %rax,RAX-ARGOFFSET(%rsp)
-	GET_THREAD_INFO(%r10)
-	DISABLE_INTERRUPTS(CLBR_NONE)
-	testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
-	jnz sysretl_audit
-	jmp int_ret_from_sys_call
 
 #ifdef CONFIG_AUDITSYSCALL
 cstar_auditsys:
 	movl %r9d,R9-ARGOFFSET(%rsp)	/* register to be clobbered by call */
 	auditsys_entry_common
 	movl R9-ARGOFFSET(%rsp),%r9d	/* reload 6th syscall arg */
-	jmp cstar_dispatch
-
-sysretl_audit:
-	auditsys_exit sysretl_from_sys_call, RCX /* user %ebp in RCX slot */
+	jmp ia32_dispatch
 #endif
 
 cstar_tracesys:
@@ -299,7 +254,7 @@ cstar_tracesys:
 #endif
 	xchgl %r9d,%ebp
 	SAVE_REST
-	CLEAR_RREGS r9
+	CLEAR_RREGS 0, r9
 	movq $-ENOSYS,RAX(%rsp)	/* ptrace can change this for a bad syscall */
 	movq %rsp,%rdi        /* &pt_regs -> arg1 */
 	call syscall_trace_enter
@@ -367,9 +322,11 @@ ENTRY(ia32_syscall)
 	ja ia32_badsys
 ia32_do_call:
 	IA32_ARG_FIXUP
+ia32_dispatch:
 	call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
 ia32_sysret:
 	movq %rax,RAX-ARGOFFSET(%rsp)
+	CLEAR_RREGS -ARGOFFSET
 	jmp int_ret_from_sys_call
 
 ia32_tracesys:
@@ -387,8 +344,8 @@ END(ia32_syscall)
 
 ia32_badsys:
 	movq $0,ORIG_RAX-ARGOFFSET(%rsp)
-	movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
-	jmp int_ret_from_sys_call
+	movq $-ENOSYS,%rax
+	jmp ia32_sysret
 
 quiet_ni_syscall:
 	movq $-ENOSYS,%rax
@@ -482,7 +439,7 @@ ia32_sys_call_table:
 	.quad sys_mkdir
 	.quad sys_rmdir		/* 40 */
 	.quad sys_dup
-	.quad sys32_pipe
+	.quad sys_pipe
 	.quad compat_sys_times
 	.quad quiet_ni_syscall	/* old prof syscall holder */
 	.quad sys_brk		/* 45 */
@@ -776,5 +733,5 @@ ia32_sys_call_table:
 	.quad compat_sys_preadv
 	.quad compat_sys_pwritev
 	.quad compat_sys_rt_tgsigqueueinfo	/* 335 */
-	.quad sys_perf_counter_open
+	.quad sys_perf_event_open
 ia32_syscall_end:
--- head-2010-05-25.orig/arch/x86/include/asm/time.h	2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/include/asm/time.h	2010-03-24 15:32:27.000000000 +0100
@@ -8,8 +8,9 @@ extern void hpet_time_init(void);
 extern void time_init(void);
 
 #ifdef CONFIG_XEN
+struct timespec;
 extern int xen_independent_wallclock(void);
-extern unsigned long xen_read_persistent_clock(void);
+extern void xen_read_persistent_clock(struct timespec *);
 extern int xen_update_persistent_clock(void);
 #endif
--- head-2010-05-25.orig/arch/x86/include/asm/uv/uv_hub.h	2010-05-25 09:12:08.000000000 +0200
+++ head-2010-05-25/arch/x86/include/asm/uv/uv_hub.h	2010-03-24 15:32:27.000000000 +0100
@@ -11,7 +11,7 @@
 #ifndef _ASM_X86_UV_UV_HUB_H
 #define _ASM_X86_UV_UV_HUB_H
 
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_X86_UV
 #include
 #include
 #include
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/agp.h	2010-03-24 15:25:21.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/agp.h	2010-03-24 15:32:27.000000000 +0100
@@ -28,10 +28,7 @@
  */
 #define flush_agp_cache() wbinvd()
 
-/* Convert a physical address to an address suitable for the GART. */
-#define phys_to_gart(x) phys_to_machine(x)
-#define gart_to_phys(x) machine_to_phys(x)
-#define page_to_gart(x) phys_to_gart(page_to_pseudophys(x))
+#define virt_to_gart virt_to_machine
 
 /* GATT allocation. Returns/accepts GATT kernel virtual address. */
 #define alloc_gatt_pages(order)	({ \
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/desc.h	2010-03-24 15:25:21.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/desc.h	2010-03-24 15:32:27.000000000 +0100
@@ -312,7 +312,14 @@ static inline void load_LDT(mm_context_t
 
 static inline unsigned long get_desc_base(const struct desc_struct *desc)
 {
-	return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24);
+	return (unsigned)(desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24));
+}
+
+static inline void set_desc_base(struct desc_struct *desc, unsigned long base)
+{
+	desc->base0 = base & 0xffff;
+	desc->base1 = (base >> 16) & 0xff;
+	desc->base2 = (base >> 24) & 0xff;
 }
 
 static inline unsigned long get_desc_limit(const struct desc_struct *desc)
@@ -320,6 +327,12 @@ static inline unsigned long get_desc_lim
 	return desc->limit0 | (desc->limit << 16);
 }
 
+static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit)
+{
+	desc->limit0 = limit & 0xffff;
+	desc->limit = (limit >> 16) & 0xf;
+}
+
 #ifndef CONFIG_X86_NO_IDT
 static inline void _set_gate(int gate, unsigned type, void *addr,
 			     unsigned dpl, unsigned ist, unsigned seg)
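The two setters added above are the write-side complement of get_desc_base()/get_desc_limit(): a 32-bit base is split 16/8/8 across base0/base1/base2, and a 20-bit limit 16/4 across limit0/limit. A standalone user-space sketch (not kernel code; the struct below only models the relevant desc_struct fields) that checks the packing round-trips:

	#include <assert.h>
	#include <stdint.h>

	/* abbreviated stand-in for struct desc_struct */
	struct desc_fields {
		uint16_t limit0;
		uint16_t base0;
		unsigned base1 : 8, type : 4, s : 1, dpl : 2, p : 1;
		unsigned limit : 4, avl : 1, l : 1, d : 1, g : 1, base2 : 8;
	};

	int main(void)
	{
		struct desc_fields d = { 0 };
		unsigned long base = 0x12345678, limit = 0xfffff;

		/* set_desc_base(): 16 + 8 + 8 bit split */
		d.base0 = base & 0xffff;
		d.base1 = (base >> 16) & 0xff;
		d.base2 = (base >> 24) & 0xff;
		/* set_desc_limit(): 16 + 4 bit split */
		d.limit0 = limit & 0xffff;
		d.limit  = (limit >> 16) & 0xf;

		/* get_desc_base()/get_desc_limit() recover both values */
		assert((unsigned long)(d.base0 | (d.base1 << 16) | (d.base2 << 24)) == base);
		assert((unsigned long)(d.limit0 | (d.limit << 16)) == limit);
		return 0;
	}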
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/dma-mapping.h	2010-03-24 15:14:47.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/dma-mapping.h	2010-03-24 15:32:27.000000000 +0100
@@ -1,11 +1,24 @@
 #ifndef _ASM_X86_DMA_MAPPING_H_
 
+#define phys_to_dma _phys_to_dma_
+#define dma_to_phys _dma_to_phys_
+
 #include_next
 
-void dma_generic_free_coherent(struct device *, size_t, void *, dma_addr_t);
+#undef phys_to_dma
+#undef dma_to_phys
+
+static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
+{
+	return phys_to_machine(paddr);
+}
 
-#define address_needs_mapping(hwdev, addr, size) \
-	!is_buffer_dma_capable(dma_get_mask(hwdev), addr, size)
+static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
+{
+	return machine_to_phys(daddr);
+}
+
+void dma_generic_free_coherent(struct device *, size_t, void *, dma_addr_t);
 
 extern int range_straddles_page_boundary(paddr_t p, size_t size);
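The hunk above uses a shadow-and-override idiom: phys_to_dma is #define'd to a throwaway name before #include_next pulls in the generic asm/dma-mapping.h, so the generic inlines land under the throwaway names and the Xen variants can then claim the real names. A minimal single-file sketch of the same mechanism (the "generic header" is inlined here for brevity; names are illustrative, not the kernel's):

	#include <stdio.h>

	#define phys_to_dma generic_phys_to_dma	/* shadow the name */
	/* --- stand-in for the #include_next'ed generic header --- */
	static inline unsigned long phys_to_dma(unsigned long paddr)
	{
		return paddr;			/* generic: identity mapping */
	}
	/* --- end of "generic header" --- */
	#undef phys_to_dma			/* free the real name again */

	static inline unsigned long phys_to_dma(unsigned long paddr)
	{
		return paddr + 0x1000;		/* Xen-like: translated address */
	}

	int main(void)
	{
		printf("generic=%#lx xen=%#lx\n",
		       generic_phys_to_dma(0x2000UL), phys_to_dma(0x2000UL));
		return 0;
	}

The point of the trick is that callers see only one phys_to_dma(), while the generic implementation still exists (renamed) in the same translation unit.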
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/fixmap.h	2010-03-24 15:25:21.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/fixmap.h	2010-03-24 15:32:27.000000000 +0100
@@ -139,6 +139,9 @@ enum fixed_addresses {
 #ifdef CONFIG_X86_32
 	FIX_WP_TEST,
 #endif
+#ifdef CONFIG_INTEL_TXT
+	FIX_TBOOT_BASE,
+#endif
 	__end_of_fixed_addresses
 };
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/hypervisor.h	2010-03-24 15:25:06.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/hypervisor.h	2010-03-24 15:32:27.000000000 +0100
@@ -70,6 +70,7 @@ extern start_info_t *xen_start_info;
 #endif
 
 #define init_hypervisor(c) ((void)((c)->x86_hyper_vendor = X86_HYPER_VENDOR_XEN))
+#define init_hypervisor_platform() init_hypervisor(&boot_cpu_data)
 
 struct vcpu_runstate_info *setup_runstate_area(unsigned int cpu);
 
@@ -351,6 +352,6 @@ MULTI_grant_table_op(multicall_entry_t *
 #endif
 
-#define uvm_multi(cpumask) ((unsigned long)cpus_addr(cpumask) | UVMF_MULTI)
+#define uvm_multi(cpumask) ((unsigned long)cpumask_bits(cpumask) | UVMF_MULTI)
 
 #endif /* __HYPERVISOR_H__ */
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/irqflags.h	2010-03-24 15:25:06.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/irqflags.h	2010-03-24 15:32:27.000000000 +0100
@@ -1,7 +1,7 @@
 #ifndef _X86_IRQFLAGS_H_
 #define _X86_IRQFLAGS_H_
 
-#include
+#include
 
 #ifndef __ASSEMBLY__
 /*
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/mmu_context.h	2010-03-24 15:25:06.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/mmu_context.h	2010-03-24 15:32:27.000000000 +0100
@@ -88,12 +88,12 @@ static inline void switch_mm(struct mm_s
 			     !PagePinned(virt_to_page(next->pgd)));
 
 		/* stop flush ipis for the previous mm */
-		cpu_clear(cpu, prev->cpu_vm_mask);
+		cpumask_clear_cpu(cpu, mm_cpumask(prev));
 #if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
 		percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
 		percpu_write(cpu_tlbstate.active_mm, next);
 #endif
-		cpu_set(cpu, next->cpu_vm_mask);
+		cpumask_set_cpu(cpu, mm_cpumask(next));
 
 		/* Re-load page tables: load_cr3(next->pgd) */
 		op->cmd = MMUEXT_NEW_BASEPTR;
@@ -125,7 +125,7 @@ static inline void switch_mm(struct mm_s
 		percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
 		BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);
 
-		if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
+		if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) {
 			/* We were in lazy tlb mode and leave_mm disabled
 			 * tlb flush IPI delivery. We must reload CR3
 			 * to make sure to use no freed page tables.
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/pci.h	2010-03-24 15:25:21.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/pci.h	2010-03-24 15:32:27.000000000 +0100
@@ -151,7 +151,11 @@ static inline int __pcibus_to_node(const
 static inline const struct cpumask *
 cpumask_of_pcibus(const struct pci_bus *bus)
 {
-	return cpumask_of_node(__pcibus_to_node(bus));
+	int node;
+
+	node = __pcibus_to_node(bus);
+	return (node == -1) ? cpu_online_mask :
+			      cpumask_of_node(node);
 }
 #endif
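The cpu_set()/cpu_clear() conversions above follow the 2.6.32 cpumask rework: helpers now take a pointer to the mask, and mm_cpumask() hands out that pointer instead of letting callers touch mm->cpu_vm_mask directly. A rough standalone model of the accessor pattern (stand-in types; the real ones live in linux/cpumask.h and linux/mm_types.h, and the sketch assumes 64-bit longs for brevity):

	#include <stdio.h>

	#define NR_CPUS 64

	struct cpumask { unsigned long bits[NR_CPUS / 64]; };
	struct mm { int id; struct cpumask cpu_vm_mask; };

	/* mm_cpumask(): hand back a pointer, so helpers never copy the mask */
	static inline struct cpumask *mm_cpumask(struct mm *mm)
	{
		return &mm->cpu_vm_mask;
	}

	static inline void cpumask_set_cpu(int cpu, struct cpumask *m)
	{
		m->bits[cpu / 64] |= 1UL << (cpu % 64);
	}

	static inline int cpumask_test_cpu(int cpu, const struct cpumask *m)
	{
		return (m->bits[cpu / 64] >> (cpu % 64)) & 1;
	}

	int main(void)
	{
		struct mm mm = { 0 };

		/* was: cpu_set(3, mm.cpu_vm_mask) */
		cpumask_set_cpu(3, mm_cpumask(&mm));
		printf("cpu3 set: %d\n", cpumask_test_cpu(3, mm_cpumask(&mm)));
		return 0;
	}

Keeping callers behind mm_cpumask() is what later allowed the mask to become variably sized (cpumask_var_t) without touching every user again.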
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/pgtable.h	2010-03-24 15:25:21.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/pgtable.h	2010-03-24 15:32:27.000000000 +0100
@@ -53,16 +53,6 @@ extern struct list_head pgd_list;
 #define pte_update(mm, addr, ptep)		do { } while (0)
 #define pte_update_defer(mm, addr, ptep)	do { } while (0)
 
-static inline void __init paravirt_pagetable_setup_start(pgd_t *base)
-{
-	xen_pagetable_setup_start(base);
-}
-
-static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
-{
-	xen_pagetable_setup_done(base);
-}
-
 #define pgd_val(x)	xen_pgd_val(x)
 #define __pgd(x)	xen_make_pgd(x)
 
@@ -134,6 +124,11 @@ static inline int pte_special(pte_t pte)
 
 #define pte_page(pte)	pfn_to_page(pte_pfn(pte))
 
+static inline unsigned long pmd_pfn(pmd_t pmd)
+{
+	return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT;
+}
+
 static inline int pmd_large(pmd_t pte)
 {
 	return (pmd_flags(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
@@ -363,7 +358,7 @@ static inline unsigned long pmd_page_vad
  * this macro returns the index of the entry in the pmd page which would
  * control the given virtual address
 */
-static inline unsigned pmd_index(unsigned long address)
+static inline unsigned long pmd_index(unsigned long address)
 {
 	return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1);
 }
@@ -383,7 +378,7 @@ static inline unsigned pmd_index(unsigne
 * this function returns the index of the entry in the pte page which would
 * control the given virtual address
 */
-static inline unsigned pte_index(unsigned long address)
+static inline unsigned long pte_index(unsigned long address)
 {
 	return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
 }
@@ -439,11 +434,6 @@ static inline pmd_t *pmd_offset(pud_t *p
 	return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address);
 }
 
-static inline unsigned long pmd_pfn(pmd_t pmd)
-{
-	return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT;
-}
-
 static inline int pud_large(pud_t pud)
 {
 	return (__pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) ==
@@ -479,7 +469,7 @@ static inline unsigned long pgd_page_vad
 #define pgd_page(pgd)	pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT)
 
 /* to find an entry in a page-table-directory. */
-static inline unsigned pud_index(unsigned long address)
+static inline unsigned long pud_index(unsigned long address)
 {
 	return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
 }
@@ -600,7 +590,7 @@ extern int ptep_clear_flush_young(struct
 		if (!pte_none(__res) &&					\
 		    ((vma)->vm_mm != current->mm ||			\
 		     HYPERVISOR_update_va_mapping(addr, __pte(0),	\
-			uvm_multi((vma)->vm_mm->cpu_vm_mask) |		\
+			uvm_multi(mm_cpumask((vma)->vm_mm)) |		\
 			UVMF_INVLPG))) {				\
 			__xen_pte_clear(__ptep);			\
 			flush_tlb_page(vma, addr);			\
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/pgtable_types.h	2010-03-24 15:25:21.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/pgtable_types.h	2010-03-24 15:32:27.000000000 +0100
@@ -334,6 +334,7 @@ static inline pteval_t pte_flags(pte_t p
 typedef struct page *pgtable_t;
 
 extern pteval_t __supported_pte_mask;
+extern void set_nx(void);
 extern int nx_enabled;
 
 #define pgprot_writecombine	pgprot_writecombine
@@ -354,14 +355,6 @@ int phys_mem_access_prot_allowed(struct
 /* Install a pte for a particular vaddr in kernel space. */
 void set_pte_vaddr(unsigned long vaddr, pte_t pte);
 
-#ifndef CONFIG_XEN
-extern void native_pagetable_setup_start(pgd_t *base);
-extern void native_pagetable_setup_done(pgd_t *base);
-#else
-static inline void xen_pagetable_setup_start(pgd_t *base) {}
-static inline void xen_pagetable_setup_done(pgd_t *base) {}
-#endif
-
 struct seq_file;
 extern void arch_report_meminfo(struct seq_file *m);
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/processor.h	2010-03-24 15:25:21.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/processor.h	2010-03-24 15:32:27.000000000 +0100
@@ -27,6 +27,7 @@ struct mm_struct;
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -411,7 +412,17 @@ extern unsigned long kernel_eflags;
 extern asmlinkage void ignore_sysret(void);
 #else	/* X86_64 */
 #ifdef CONFIG_CC_STACKPROTECTOR
-DECLARE_PER_CPU(unsigned long, stack_canary);
+/*
+ * Make sure stack canary segment base is cached-aligned:
+ *   "For Intel Atom processors, avoid non zero segment base address
+ *    that is not aligned to cache line boundary at all cost."
+ * (Optim Ref Manual Assembly/Compiler Coding Rule 15.)
+ */
+struct stack_canary {
+	char __pad[20];		/* canary at %gs:20 */
+	unsigned long canary;
+};
+DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
 #endif
 #endif	/* X86_64 */
@@ -647,13 +658,23 @@ static inline void cpu_relax(void)
 	rep_nop();
 }
 
-/* Stop speculative execution: */
+/* Stop speculative execution and prefetching of modified code. */
 static inline void sync_core(void)
 {
 	int tmp;
 
-	asm volatile("cpuid" : "=a" (tmp) : "0" (1)
-		     : "ebx", "ecx", "edx", "memory");
+#if defined(CONFIG_M386) || defined(CONFIG_M486)
+	if (boot_cpu_data.x86 < 5)
+		/* There is no speculative execution.
+		 * jmp is a barrier to prefetching. */
+		asm volatile("jmp 1f\n1:\n" ::: "memory");
+	else
+#endif
+		/* cpuid is a barrier to speculative execution.
+		 * Prefetched instructions are automatically
+		 * invalidated when modified. */
+		asm volatile("cpuid" : "=a" (tmp) : "0" (1)
+			     : "ebx", "ecx", "edx", "memory");
 }
 
 static inline void __monitor(const void *eax, unsigned long ecx,
@@ -944,4 +965,35 @@ extern void start_thread(struct pt_regs
 extern int get_tsc_mode(unsigned long adr);
 extern int set_tsc_mode(unsigned int val);
 
+extern int amd_get_nb_id(int cpu);
+
+struct aperfmperf {
+	u64 aperf, mperf;
+};
+
+static inline void get_aperfmperf(struct aperfmperf *am)
+{
+	WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_APERFMPERF));
+
+	rdmsrl(MSR_IA32_APERF, am->aperf);
+	rdmsrl(MSR_IA32_MPERF, am->mperf);
+}
+
+#define APERFMPERF_SHIFT 10
+
+static inline
+unsigned long calc_aperfmperf_ratio(struct aperfmperf *old,
+				    struct aperfmperf *new)
+{
+	u64 aperf = new->aperf - old->aperf;
+	u64 mperf = new->mperf - old->mperf;
+	unsigned long ratio = aperf;
+
+	mperf >>= APERFMPERF_SHIFT;
+	if (mperf)
+		ratio = div64_u64(aperf, mperf);
+
+	return ratio;
+}
+
 #endif /* _ASM_X86_PROCESSOR_H */
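calc_aperfmperf_ratio() returns a fixed-point ratio: right-shifting the MPERF delta by APERFMPERF_SHIFT before dividing is equivalent to scaling the APERF/MPERF quotient by 2^10, so 1024 means the CPU ran at its reference clock. A standalone check of the arithmetic (plain C; div64_u64() replaced by a native division):

	#include <stdio.h>
	#include <stdint.h>

	#define APERFMPERF_SHIFT 10

	static unsigned long calc_ratio(uint64_t aperf, uint64_t mperf)
	{
		unsigned long ratio = aperf;

		mperf >>= APERFMPERF_SHIFT;
		if (mperf)
			ratio = aperf / mperf;	/* div64_u64() in the kernel */
		return ratio;
	}

	int main(void)
	{
		/* CPU ran at half its reference clock: the APERF delta grew
		 * half as fast as the MPERF delta, so expect 0.5 * 1024. */
		printf("%lu\n", calc_ratio(1000000, 2000000));	/* prints 512 */
		return 0;
	}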
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/setup.h	2010-03-24 15:32:27.000000000 +0100
@@ -0,0 +1,8 @@
+#ifndef __ASSEMBLY__
+
+void xen_start_kernel(void);
+void xen_arch_setup(void);
+
+#endif
+
+#include_next
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/smp-processor-id.h	2010-03-24 15:32:27.000000000 +0100
@@ -0,0 +1,36 @@
+#ifndef _ASM_X86_SMP_PROCESSOR_ID_H
+#define _ASM_X86_SMP_PROCESSOR_ID_H
+
+#if defined(CONFIG_SMP) && !defined(__ASSEMBLY__)
+
+#include
+
+DECLARE_PER_CPU(int, cpu_number);
+
+/*
+ * This function is needed by all SMP systems. It must _always_ be valid
+ * from the initial startup. We map APIC_BASE very early in page_setup(),
+ * so this is correct in the x86 case.
+ */
+#define raw_smp_processor_id() percpu_read(cpu_number)
+#define safe_smp_processor_id() smp_processor_id()
+
+#ifdef CONFIG_X86_64_SMP
+#define stack_smp_processor_id()					\
+({									\
+	struct thread_info *ti;						\
+	__asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK));	\
+	ti->cpu;							\
+})
+#endif
+
+#ifdef CONFIG_DEBUG_PREEMPT
+extern unsigned int debug_smp_processor_id(void);
+# define smp_processor_id() debug_smp_processor_id()
+#else
+# define smp_processor_id() raw_smp_processor_id()
+#endif
+
+#endif /* SMP && !__ASSEMBLY__ */
+
+#endif /* _ASM_X86_SMP_PROCESSOR_ID_H */
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/smp.h	2010-03-24 15:25:21.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/smp.h	2010-03-24 15:32:27.000000000 +0100
@@ -121,7 +121,6 @@ static inline void arch_send_call_functi
 	smp_ops.send_call_func_single_ipi(cpu);
 }
 
-#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask
 static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
 {
 	smp_ops.send_call_func_ipi(mask);
@@ -167,27 +166,7 @@ static inline int num_booting_cpus(void)
 
 extern unsigned disabled_cpus __cpuinitdata;
 
-#ifdef CONFIG_X86_32_SMP
-/*
- * This function is needed by all SMP systems. It must _always_ be valid
- * from the initial startup. We map APIC_BASE very early in page_setup(),
- * so this is correct in the x86 case.
- */
-#define raw_smp_processor_id() (percpu_read(cpu_number))
-#define safe_smp_processor_id() smp_processor_id()
-
-#elif defined(CONFIG_X86_64_SMP)
-#define raw_smp_processor_id() (percpu_read(cpu_number))
-
-#define stack_smp_processor_id()					\
-({									\
-	struct thread_info *ti;						\
-	__asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK));	\
-	ti->cpu;							\
-})
-#define safe_smp_processor_id()	smp_processor_id()
-
-#endif
+#include
 
 #ifdef CONFIG_X86_LOCAL_APIC
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/system.h	2010-03-24 15:25:06.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/system.h	2010-03-24 15:32:27.000000000 +0100
@@ -30,7 +30,7 @@ void __switch_to_xtra(struct task_struct
 	"movl %P[task_canary](%[next]), %%ebx\n\t"			\
 	"movl %%ebx, "__percpu_arg([stack_canary])"\n\t"
 #define __switch_canary_oparam						\
-	, [stack_canary] "=m" (per_cpu_var(stack_canary))
+	, [stack_canary] "=m" (per_cpu_var(stack_canary.canary))
 #define __switch_canary_iparam						\
 	, [task_canary] "i" (offsetof(struct task_struct, stack_canary))
 #else	/* CC_STACKPROTECTOR */
@@ -149,33 +149,6 @@ do {									\
 #endif
 
 #ifdef __KERNEL__
-#define _set_base(addr, base) do { unsigned long __pr; \
-__asm__ __volatile__ ("movw %%dx,%1\n\t" \
-	"rorl $16,%%edx\n\t" \
-	"movb %%dl,%2\n\t" \
-	"movb %%dh,%3" \
-	:"=&d" (__pr) \
-	:"m" (*((addr)+2)), \
-	 "m" (*((addr)+4)), \
-	 "m" (*((addr)+7)), \
-	 "0" (base) \
-	); } while (0)
-
-#define _set_limit(addr, limit) do { unsigned long __lr; \
-__asm__ __volatile__ ("movw %%dx,%1\n\t" \
-	"rorl $16,%%edx\n\t" \
-	"movb %2,%%dh\n\t" \
-	"andb $0xf0,%%dh\n\t" \
-	"orb %%dh,%%dl\n\t" \
-	"movb %%dl,%2" \
-	:"=&d" (__lr) \
-	:"m" (*(addr)), \
-	 "m" (*((addr)+6)), \
-	 "0" (limit) \
-	); } while (0)
-
-#define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base))
-#define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1))
 
 extern void xen_load_gs_index(unsigned);
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/tlbflush.h	2010-03-24 15:25:21.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/tlbflush.h	2010-03-24 15:32:27.000000000 +0100
@@ -74,9 +74,9 @@ static inline void reset_lazy_tlbstate(v
 #define local_flush_tlb() __flush_tlb()
 
 #define flush_tlb_all xen_tlb_flush_all
-#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask)
-#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask)
-#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va)
+#define flush_tlb_current_task() xen_tlb_flush_mask(mm_cpumask(current->mm))
+#define flush_tlb_mm(mm) xen_tlb_flush_mask(mm_cpumask(mm))
+#define flush_tlb_page(vma, va) xen_invlpg_mask(mm_cpumask((vma)->vm_mm), va)
 #define flush_tlb() flush_tlb_current_task()
--- head-2010-05-25.orig/arch/x86/kernel/Makefile	2010-03-24 15:25:21.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/Makefile	2010-03-24 15:32:27.000000000 +0100
@@ -132,8 +132,6 @@ ifeq ($(CONFIG_X86_64),y)
 	obj-$(CONFIG_PCI_MMCONFIG)	+= mmconf-fam10h_64.o
 	obj-y				+= vsmp_64.o
-
-	time_64-$(CONFIG_XEN)		+= time_32.o
 endif
 
 disabled-obj-$(CONFIG_XEN) := %_uv.o crash.o early-quirks.o hpet.o i8253.o \
--- head-2010-05-25.orig/arch/x86/kernel/apic/io_apic-xen.c	2010-03-24 15:25:21.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/apic/io_apic-xen.c	2010-03-24 15:32:27.000000000 +0100
@@ -79,6 +79,8 @@ unsigned long io_apic_irqs;
 #endif /* CONFIG_XEN */
 
 #define __apicdebuginit(type) static type __init
+#define for_each_irq_pin(entry, head) \
+	for (entry = head; entry; entry = entry->next)
 
 /*
  *	Is the SiS APIC rmw bug present ?
@@ -100,12 +102,24 @@ int nr_ioapic_registers[MAX_IO_APICS];
 struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
 int nr_ioapics;
 
+/* IO APIC gsi routing info */
+struct mp_ioapic_gsi  mp_gsi_routing[MAX_IO_APICS];
+
 /* MP IRQ source entries */
 struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
 
 /* # of MP IRQ source entries */
 int mp_irq_entries;
 
+#ifndef CONFIG_XEN
+/* Number of legacy interrupts */
+static int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY;
+/* GSI interrupts */
+static int nr_irqs_gsi = NR_IRQS_LEGACY;
+#else
+#define nr_legacy_irqs NR_IRQS_LEGACY
+#endif
+
 #if defined (CONFIG_MCA) || defined (CONFIG_EISA)
 int mp_bus_id_to_type[MAX_MP_BUSSES];
 #endif
@@ -132,15 +146,6 @@ static int __init parse_noapic(char *str
 early_param("noapic", parse_noapic);
 
 #ifndef CONFIG_XEN
-struct irq_pin_list;
-
-/*
- * This is performance-critical, we want to do it O(1)
- *
- * the indexing order of this array favors 1:1 mappings
- * between pins and IRQs.
- */
-
 struct irq_pin_list {
 	int apic, pin;
 	struct irq_pin_list *next;
@@ -155,6 +160,11 @@ static struct irq_pin_list *get_one_free
 	return pin;
 }
 
+/*
+ * This is performance-critical, we want to do it O(1)
+ *
+ * Most irqs are mapped 1:1 with pins.
+ */
 struct irq_cfg {
 	struct irq_pin_list *irq_2_pin;
 	cpumask_var_t domain;
@@ -188,6 +198,12 @@ static struct irq_cfg irq_cfgx[NR_IRQS]
 	[15] = { .vector = IRQ15_VECTOR, },
 };
 
+void __init io_apic_disable_legacy(void)
+{
+	nr_legacy_irqs = 0;
+	nr_irqs_gsi = 0;
+}
+
 int __init arch_early_irq_init(void)
 {
 	struct irq_cfg *cfg;
@@ -205,7 +221,7 @@ int __init arch_early_irq_init(void)
 		desc->chip_data = &cfg[i];
 		zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
 		zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
-		if (i < NR_IRQS_LEGACY)
+		if (i < nr_legacy_irqs)
 			cpumask_setall(cfg[i].domain);
 	}
 
@@ -231,17 +247,14 @@ static struct irq_cfg *get_one_free_irq_
 
 	cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
 	if (cfg) {
-		if (!alloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) {
+		if (!zalloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) {
 			kfree(cfg);
 			cfg = NULL;
-		} else if (!alloc_cpumask_var_node(&cfg->old_domain,
+		} else if (!zalloc_cpumask_var_node(&cfg->old_domain,
 							  GFP_ATOMIC, node)) {
 			free_cpumask_var(cfg->domain);
 			kfree(cfg);
 			cfg = NULL;
-		} else {
-			cpumask_clear(cfg->domain);
-			cpumask_clear(cfg->old_domain);
 		}
 	}
 
@@ -455,13 +468,10 @@ static bool io_apic_level_ack_pending(st
 	unsigned long flags;
 
 	spin_lock_irqsave(&ioapic_lock, flags);
-	entry = cfg->irq_2_pin;
-	for (;;) {
+	for_each_irq_pin(entry, cfg->irq_2_pin) {
 		unsigned int reg;
 		int pin;
 
-		if (!entry)
-			break;
 		pin = entry->pin;
 		reg = io_apic_read(entry->apic, 0x10 + pin*2);
 		/* Is the remote IRR bit set? */
@@ -469,9 +479,6 @@ static bool io_apic_level_ack_pending(st
 			spin_unlock_irqrestore(&ioapic_lock, flags);
 			return true;
 		}
-		if (!entry->next)
-			break;
-		entry = entry->next;
 	}
 	spin_unlock_irqrestore(&ioapic_lock, flags);
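The new for_each_irq_pin() macro is a plain null-terminated list traversal, which is why each open-coded for(;;) walk above collapses to a one-line header. A standalone sketch of the pattern (mirrors the print_IO_APIC() output format; data is made up):

	#include <stdio.h>

	struct irq_pin_list {
		int apic, pin;
		struct irq_pin_list *next;
	};

	#define for_each_irq_pin(entry, head) \
		for (entry = head; entry; entry = entry->next)

	int main(void)
	{
		struct irq_pin_list c = { 2, 19, NULL }, b = { 1, 4, &c },
				    a = { 0, 9, &b };
		struct irq_pin_list *entry;

		for_each_irq_pin(entry, &a)
			printf("-> %d:%d", entry->apic, entry->pin);
		printf("\n");
		return 0;
	}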
@@ -543,72 +550,68 @@ static void ioapic_mask_entry(int apic,
 * shared ISA-space IRQs, so we have to support them. We are super
 * fast in the common case, and fast for shared ISA-space IRQs.
 */
-static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
+static int
+add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin)
 {
-	struct irq_pin_list *entry;
+	struct irq_pin_list **last, *entry;
 
-	entry = cfg->irq_2_pin;
-	if (!entry) {
-		entry = get_one_free_irq_2_pin(node);
-		if (!entry) {
-			printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
-					apic, pin);
-			return;
-		}
-		cfg->irq_2_pin = entry;
-		entry->apic = apic;
-		entry->pin = pin;
-		return;
-	}
-
-	while (entry->next) {
-		/* not again, please */
+	/* don't allow duplicates */
+	last = &cfg->irq_2_pin;
+	for_each_irq_pin(entry, cfg->irq_2_pin) {
 		if (entry->apic == apic && entry->pin == pin)
-			return;
-
-		entry = entry->next;
+			return 0;
+		last = &entry->next;
 	}
 
-	entry->next = get_one_free_irq_2_pin(node);
-	entry = entry->next;
+	entry = get_one_free_irq_2_pin(node);
+	if (!entry) {
+		printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n",
+				node, apic, pin);
+		return -ENOMEM;
+	}
 	entry->apic = apic;
 	entry->pin = pin;
+
+	*last = entry;
+	return 0;
+}
+
+static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
+{
+	if (add_pin_to_irq_node_nopanic(cfg, node, apic, pin))
+		panic("IO-APIC: failed to add irq-pin. Can not proceed\n");
 }
 
 /*
 * Reroute an IRQ to a different pin.
 */
 static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
-					   int oldapic, int oldpin,
-					   int newapic, int newpin)
+					int oldapic, int oldpin,
+					int newapic, int newpin)
 {
-	struct irq_pin_list *entry = cfg->irq_2_pin;
-	int replaced = 0;
+	struct irq_pin_list *entry;
 
-	while (entry) {
+	for_each_irq_pin(entry, cfg->irq_2_pin) {
 		if (entry->apic == oldapic && entry->pin == oldpin) {
 			entry->apic = newapic;
 			entry->pin = newpin;
-			replaced = 1; /* every one is different, right? */
-			break;
+			return;
 		}
-		entry = entry->next;
 	}
 
-	/* why? call replace before add? */
-	if (!replaced)
-		add_pin_to_irq_node(cfg, node, newapic, newpin);
+	/* old apic/pin didn't exist, so just add new ones */
+	add_pin_to_irq_node(cfg, node, newapic, newpin);
 }
 
-static inline void io_apic_modify_irq(struct irq_cfg *cfg,
-				      int mask_and, int mask_or,
-				      void (*final)(struct irq_pin_list *entry))
+static void io_apic_modify_irq(struct irq_cfg *cfg,
+			       int mask_and, int mask_or,
+			       void (*final)(struct irq_pin_list *entry))
 {
 	int pin;
 	struct irq_pin_list *entry;
 
-	for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) {
+	for_each_irq_pin(entry, cfg->irq_2_pin) {
 		unsigned int reg;
 		pin = entry->pin;
 		reg = io_apic_read(entry->apic, 0x10 + pin * 2);
@@ -625,7 +628,6 @@ static void __unmask_IO_APIC_irq(struct
 	io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
 }
 
-#ifdef CONFIG_X86_64
 static void io_apic_sync(struct irq_pin_list *entry)
 {
 	/*
@@ -641,11 +643,6 @@ static void __mask_IO_APIC_irq(struct ir
 {
 	io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
 }
-#else /* CONFIG_X86_32 */
-static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
-{
-	io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL);
-}
 
 static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
 {
@@ -658,7 +655,6 @@ static void __unmask_and_level_IO_APIC_i
 	io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
 			IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
 }
-#endif /* CONFIG_X86_32 */
 
 static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
 {
@@ -719,6 +715,7 @@ static void clear_IO_APIC (void)
 }
 #else
 #define add_pin_to_irq_node(cfg, node, apic, pin)
+#define add_pin_to_irq_node_nopanic(cfg, node, apic, pin) 0
 #endif /* CONFIG_XEN */
 
 #ifdef CONFIG_X86_32
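The rewritten add_pin_to_irq_node_nopanic() uses the classic pointer-to-pointer append: "last" tracks the address of the link to patch, so the empty-list and tail cases need no special-casing, and the duplicate scan and the append share one pass. A standalone sketch of the idiom (user-space stand-ins for the kernel allocator and error codes):

	#include <stdio.h>
	#include <stdlib.h>

	struct node { int apic, pin; struct node *next; };

	static int append_unique(struct node **head, int apic, int pin)
	{
		struct node **last = head, *entry;

		for (entry = *head; entry; entry = entry->next) {
			if (entry->apic == apic && entry->pin == pin)
				return 0;	/* don't allow duplicates */
			last = &entry->next;	/* remember the tail link */
		}

		entry = malloc(sizeof(*entry));
		if (!entry)
			return -1;		/* -ENOMEM in the kernel */
		entry->apic = apic;
		entry->pin = pin;
		entry->next = NULL;
		*last = entry;			/* patches head or ->next alike */
		return 0;
	}

	int main(void)
	{
		struct node *head = NULL, *e;

		append_unique(&head, 0, 9);
		append_unique(&head, 1, 4);
		append_unique(&head, 0, 9);	/* ignored: duplicate */
		for (e = head; e; e = e->next)
			printf("%d:%d ", e->apic, e->pin);
		printf("\n");
		return 0;
	}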
@@ -935,7 +932,7 @@ static int __init find_isa_irq_apic(int
 */
 static int EISA_ELCR(unsigned int irq)
 {
-	if (irq < NR_IRQS_LEGACY) {
+	if (irq < nr_legacy_irqs) {
 		unsigned int port = 0x4d0 + (irq >> 3);
 		return (inb(port) >> (irq & 7)) & 1;
 	}
@@ -1547,7 +1544,7 @@ static void setup_IO_APIC_irq(int apic_i
 	}
 
 	ioapic_register_intr(irq, desc, trigger);
-	if (irq < NR_IRQS_LEGACY)
+	if (irq < nr_legacy_irqs)
 		disable_8259A_irq(irq);
 
 	ioapic_write_entry(apic_id, pin, entry);
@@ -1775,12 +1772,8 @@ __apicdebuginit(void) print_IO_APIC(void
 		if (!entry)
 			continue;
 		printk(KERN_DEBUG "IRQ%d ", irq);
-		for (;;) {
+		for_each_irq_pin(entry, cfg->irq_2_pin)
 			printk("-> %d:%d", entry->apic, entry->pin);
-			if (!entry->next)
-				break;
-			entry = entry->next;
-		}
 		printk("\n");
 	}
 
@@ -1924,7 +1917,7 @@ __apicdebuginit(void) print_PIC(void)
 	unsigned int v;
 	unsigned long flags;
 
-	if (apic_verbosity == APIC_QUIET)
+	if (apic_verbosity == APIC_QUIET || !nr_legacy_irqs)
 		return;
 
 	printk(KERN_DEBUG "\nprinting PIC contents\n");
@@ -1956,7 +1949,7 @@ __apicdebuginit(int) print_all_ICs(void)
 	print_PIC();
 
 	/* don't print out if apic is not there */
-	if (!cpu_has_apic || disable_apic)
+	if (!cpu_has_apic && !apic_from_smp_config())
 		return 0;
 
 	print_all_local_APICs();
@@ -1990,6 +1983,10 @@ void __init enable_IO_APIC(void)
 		spin_unlock_irqrestore(&ioapic_lock, flags);
 		nr_ioapic_registers[apic] = reg_01.bits.entries+1;
 	}
+
+	if (!nr_legacy_irqs)
+		return;
+
 #ifndef CONFIG_XEN
 	for(apic = 0; apic < nr_ioapics; apic++) {
 		int pin;
@@ -2049,6 +2046,9 @@ void disable_IO_APIC(void)
 	 */
 	clear_IO_APIC();
 
+	if (!nr_legacy_irqs)
+		return;
+
 	/*
 	 * If the i8259 is routed through an IOAPIC
 	 * Put that IOAPIC in virtual wire mode
@@ -2082,7 +2082,7 @@ void disable_IO_APIC(void)
 	/*
 	 * Use virtual wire A mode when interrupt remapping is enabled.
 	 */
-	if (cpu_has_apic)
+	if (cpu_has_apic || apic_from_smp_config())
 		disconnect_bsp_APIC(!intr_remapping_enabled &&
 				ioapic_i8259.pin != -1);
 }
@@ -2095,7 +2095,7 @@ void disable_IO_APIC(void)
 * by Matt Domsch  Tue Dec 21 12:25:05 CST 1999
 */
-static void __init setup_ioapic_ids_from_mpc(void)
+void __init setup_ioapic_ids_from_mpc(void)
 {
 	union IO_APIC_reg_00 reg_00;
 	physid_mask_t phys_id_present_map;
@@ -2104,9 +2104,8 @@ static void __init setup_ioapic_ids_from
 	unsigned char old_id;
 	unsigned long flags;
 
-	if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids())
+	if (acpi_ioapic)
 		return;
-
 	/*
 	 * Don't check I/O APIC IDs for xAPIC systems. They have
 	 * no meaning without the serial APIC bus.
@@ -2280,7 +2279,7 @@ static unsigned int startup_ioapic_irq(u
 	struct irq_cfg *cfg;
 
 	spin_lock_irqsave(&ioapic_lock, flags);
-	if (irq < NR_IRQS_LEGACY) {
+	if (irq < nr_legacy_irqs) {
 		disable_8259A_irq(irq);
 		if (i8259A_irq_pending(irq))
 			was_pending = 1;
@@ -2292,7 +2291,6 @@ static unsigned int startup_ioapic_irq(u
 	return was_pending;
 }
 
-#ifdef CONFIG_X86_64
 static int ioapic_retrigger_irq(unsigned int irq)
 {
@@ -2305,14 +2303,6 @@ static int ioapic_retrigger_irq(unsigned
 
 	return 1;
 }
-#else
-static int ioapic_retrigger_irq(unsigned int irq)
-{
-	apic->send_IPI_self(irq_cfg(irq)->vector);
-
-	return 1;
-}
-#endif
 
 /*
 * Level and edge triggered IO-APIC interrupts need different handling,
@@ -2350,13 +2340,9 @@ static void __target_IO_APIC_irq(unsigne
 	struct irq_pin_list *entry;
 	u8 vector = cfg->vector;
 
-	entry = cfg->irq_2_pin;
-	for (;;) {
+	for_each_irq_pin(entry, cfg->irq_2_pin) {
 		unsigned int reg;
 
-		if (!entry)
-			break;
-
 		apic = entry->apic;
 		pin = entry->pin;
 		/*
@@ -2369,9 +2355,6 @@ static void __target_IO_APIC_irq(unsigne
 		reg &= ~IO_APIC_REDIR_VECTOR_MASK;
 		reg |= vector;
 		io_apic_modify(apic, 0x10 + pin*2, reg);
-		if (!entry->next)
-			break;
-		entry = entry->next;
 	}
 }
 
@@ -2596,11 +2579,8 @@ atomic_t irq_mis_count;
 static void ack_apic_level(unsigned int irq)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
-
-#ifdef CONFIG_X86_32
 	unsigned long v;
 	int i;
-#endif
 	struct irq_cfg *cfg;
 	int do_unmask_irq = 0;
 
@@ -2613,31 +2593,28 @@ static void ack_apic_level(unsigned int
 	}
 #endif
 
-#ifdef CONFIG_X86_32
 	/*
-	 * It appears there is an erratum which affects at least version 0x11
-	 * of I/O APIC (that's the 82093AA and cores integrated into various
-	 * chipsets). Under certain conditions a level-triggered interrupt is
-	 * erroneously delivered as edge-triggered one but the respective IRR
-	 * bit gets set nevertheless. As a result the I/O unit expects an EOI
-	 * message but it will never arrive and further interrupts are blocked
-	 * from the source. The exact reason is so far unknown, but the
-	 * phenomenon was observed when two consecutive interrupt requests
-	 * from a given source get delivered to the same CPU and the source is
-	 * temporarily disabled in between.
-	 *
-	 * A workaround is to simulate an EOI message manually. We achieve it
-	 * by setting the trigger mode to edge and then to level when the edge
-	 * trigger mode gets detected in the TMR of a local APIC for a
-	 * level-triggered interrupt. We mask the source for the time of the
-	 * operation to prevent an edge-triggered interrupt escaping meanwhile.
-	 * The idea is from Manfred Spraul. --macro
-	 */
+	 * It appears there is an erratum which affects at least version 0x11
+	 * of I/O APIC (that's the 82093AA and cores integrated into various
+	 * chipsets). Under certain conditions a level-triggered interrupt is
+	 * erroneously delivered as edge-triggered one but the respective IRR
+	 * bit gets set nevertheless. As a result the I/O unit expects an EOI
+	 * message but it will never arrive and further interrupts are blocked
+	 * from the source. The exact reason is so far unknown, but the
+	 * phenomenon was observed when two consecutive interrupt requests
+	 * from a given source get delivered to the same CPU and the source is
+	 * temporarily disabled in between.
+	 *
+	 * A workaround is to simulate an EOI message manually. We achieve it
+	 * by setting the trigger mode to edge and then to level when the edge
+	 * trigger mode gets detected in the TMR of a local APIC for a
+	 * level-triggered interrupt. We mask the source for the time of the
+	 * operation to prevent an edge-triggered interrupt escaping meanwhile.
+	 * The idea is from Manfred Spraul. --macro
+	 */
 	cfg = desc->chip_data;
 	i = cfg->vector;
-
 	v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
-#endif
 
 	/*
 	 * We must acknowledge the irq before we move it or the acknowledge will
@@ -2679,7 +2656,7 @@ static void ack_apic_level(unsigned int
 		unmask_IO_APIC_irq_desc(desc);
 	}
 
-#ifdef CONFIG_X86_32
+	/* Tail end of version 0x11 I/O APIC bug workaround */
 	if (!(v & (1 << (i & 0x1f)))) {
 		atomic_inc(&irq_mis_count);
 		spin_lock(&ioapic_lock);
@@ -2687,26 +2664,15 @@ static void ack_apic_level(unsigned int
 		__unmask_and_level_IO_APIC_irq(cfg);
 		spin_unlock(&ioapic_lock);
 	}
-#endif
 }
 
 #ifdef CONFIG_INTR_REMAP
 static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
 {
-	int apic, pin;
 	struct irq_pin_list *entry;
 
-	entry = cfg->irq_2_pin;
-	for (;;) {
-
-		if (!entry)
-			break;
-
-		apic = entry->apic;
-		pin = entry->pin;
-		io_apic_eoi(apic, pin);
-		entry = entry->next;
-	}
+	for_each_irq_pin(entry, cfg->irq_2_pin)
+		io_apic_eoi(entry->apic, entry->pin);
 }
 
 static void
@@ -2796,7 +2762,7 @@ static inline void init_IO_APIC_traps(vo
 		 * so default to an old-fashioned 8259
		 * interrupt if we can..
		 */
-		if (irq < NR_IRQS_LEGACY)
+		if (irq < nr_legacy_irqs)
 			make_8259A_irq(irq);
 		else
 			/* Strange. Oh, well.. */
@@ -3136,7 +3102,7 @@ out:
 * the I/O APIC in all cases now. No actual device should request
 * it anyway.  --macro
 */
-#define PIC_IRQS	(1 << PIC_CASCADE_IR)
+#define PIC_IRQS	(1UL << PIC_CASCADE_IR)
 
 void __init setup_IO_APIC(void)
 {
@@ -3148,23 +3114,21 @@ void __init setup_IO_APIC(void)
 	 * calling enable_IO_APIC() is moved to setup_local_APIC for BP
	 */
 #endif
-
-	io_apic_irqs = ~PIC_IRQS;
+	io_apic_irqs = nr_legacy_irqs ? ~PIC_IRQS : ~0UL;
 
 	apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
 	/*
 	 * Set up IO-APIC IRQ routing.
	 */
 #ifndef CONFIG_XEN
-#ifdef CONFIG_X86_32
-	if (!acpi_ioapic)
-		setup_ioapic_ids_from_mpc();
-#endif
+	x86_init.mpparse.setup_ioapic_ids();
+
 	sync_Arb_IDs();
 #endif
 	setup_IO_APIC_irqs();
 	init_IO_APIC_traps();
-	check_timer();
+	if (nr_legacy_irqs)
+		check_timer();
 }
 
 /*
@@ -3274,7 +3238,6 @@ static int __init ioapic_init_sysfs(void
 
 device_initcall(ioapic_init_sysfs);
 
-static int nr_irqs_gsi = NR_IRQS_LEGACY;
 /*
 * Dynamic irq allocate and deallocation
 */
@@ -3346,8 +3309,7 @@ void destroy_irq(unsigned int irq)
 	cfg = desc->chip_data;
 	dynamic_irq_cleanup(irq);
 	/* connect back irq_cfg */
-	if (desc)
-		desc->chip_data = cfg;
+	desc->chip_data = cfg;
 
 	free_irte(irq);
 	spin_lock_irqsave(&vector_lock, flags);
@@ -4025,9 +3987,13 @@ static int __io_apic_set_pci_routing(str
 	/*
 	 * IRQs < 16 are already in the irq_2_pin[] map
	 */
-	if (irq >= NR_IRQS_LEGACY) {
+	if (irq >= nr_legacy_irqs) {
 		cfg = desc->chip_data;
-		add_pin_to_irq_node(cfg, node, ioapic, pin);
+		if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) {
+			printk(KERN_INFO "can not add pin %d for irq %d\n",
+				pin, irq);
+			return 0;
+		}
 	}
 
 	setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity);
@@ -4056,11 +4022,28 @@ int io_apic_set_pci_routing(struct devic
 	return __io_apic_set_pci_routing(dev, irq, irq_attr);
 }
 
-/* --------------------------------------------------------------------------
-                          ACPI-based IOAPIC Configuration
-   -------------------------------------------------------------------------- */
+u8 __init io_apic_unique_id(u8 id)
+{
+#ifdef CONFIG_X86_32
+	if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
+	    !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+		return io_apic_get_unique_id(nr_ioapics, id);
+	else
+		return id;
+#else
+	int i;
+	DECLARE_BITMAP(used, 256);
 
-#ifdef CONFIG_ACPI
+	bitmap_zero(used, 256);
+	for (i = 0; i < nr_ioapics; i++) {
+		struct mpc_ioapic *ia = &mp_ioapics[i];
+		__set_bit(ia->apicid, used);
+	}
+	if (!test_bit(id, used))
+		return id;
+	return find_first_zero_bit(used, 256);
+#endif
+}
 
 #ifdef CONFIG_X86_32
 int __init io_apic_get_unique_id(int ioapic, int apic_id)
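The 64-bit branch of io_apic_unique_id() is a small first-fit allocator over the 8-bit APIC id space: mark every id already claimed in a 256-bit bitmap, keep the caller's id if it is free, otherwise take the first free one. A standalone model of that logic (hand-rolled bit helpers standing in for the kernel's bitmap API; note the real function returns find_first_zero_bit()'s result directly even when the map is full):

	#include <stdio.h>

	#define NBITS 256

	static unsigned char used[NBITS / 8];

	static void set_bit_(int b) { used[b / 8] |= 1 << (b % 8); }
	static int test_bit_(int b) { return (used[b / 8] >> (b % 8)) & 1; }

	static int unique_id(int id)
	{
		int i;

		if (!test_bit_(id))
			return id;		/* caller's id is free */
		for (i = 0; i < NBITS; i++)	/* find_first_zero_bit() */
			if (!test_bit_(i))
				return i;
		return -1;			/* all 256 ids taken */
	}

	int main(void)
	{
		set_bit_(0);	/* ids claimed by already-registered IO-APICs */
		set_bit_(1);
		printf("%d %d\n", unique_id(5), unique_id(1));	/* "5 2" */
		return 0;
	}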
@@ -4171,8 +4154,6 @@ int acpi_get_override_irq(int bus_irq, i
 	return 0;
 }
 
-#endif /* CONFIG_ACPI */
-
 #ifndef CONFIG_XEN
 /*
 * This function currently is only a helper for the i386 smp boot process where
@@ -4227,7 +4208,7 @@ void __init setup_ioapic_dest(void)
 
 static struct resource *ioapic_resources;
 
-static struct resource * __init ioapic_setup_resources(void)
+static struct resource * __init ioapic_setup_resources(int nr_ioapics)
 {
 	unsigned long n;
 	struct resource *res;
@@ -4243,15 +4224,13 @@ static struct resource * __init ioapic_s
 	mem = alloc_bootmem(n);
 	res = (void *)mem;
 
-	if (mem != NULL) {
-		mem += sizeof(struct resource) * nr_ioapics;
+	mem += sizeof(struct resource) * nr_ioapics;
 
-		for (i = 0; i < nr_ioapics; i++) {
-			res[i].name = mem;
-			res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
-			sprintf(mem, "IOAPIC %u", i);
-			mem += IOAPIC_RESOURCE_NAME_SIZE;
-		}
+	for (i = 0; i < nr_ioapics; i++) {
+		res[i].name = mem;
+		res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+		sprintf(mem, "IOAPIC %u", i);
+		mem += IOAPIC_RESOURCE_NAME_SIZE;
 	}
 
 	ioapic_resources = res;
@@ -4265,7 +4244,7 @@ void __init ioapic_init_mappings(void)
 	struct resource *ioapic_res;
 	int i;
 
-	ioapic_res = ioapic_setup_resources();
+	ioapic_res = ioapic_setup_resources(nr_ioapics);
 	for (i = 0; i < nr_ioapics; i++) {
 		if (smp_found_config) {
 			ioapic_phys = mp_ioapics[i].apicaddr;
@@ -4294,11 +4273,9 @@ fake_ioapic_page:
 				      __fix_to_virt(idx), ioapic_phys);
 		idx++;
 
-		if (ioapic_res != NULL) {
-			ioapic_res->start = ioapic_phys;
-			ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
-			ioapic_res++;
-		}
+		ioapic_res->start = ioapic_phys;
+		ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
+		ioapic_res++;
 	}
 }
 
@@ -4320,3 +4297,78 @@ void __init ioapic_insert_resources(void
 	}
 }
 #endif /* !CONFIG_XEN */
+
+int mp_find_ioapic(int gsi)
+{
+	int i = 0;
+
+	/* Find the IOAPIC that manages this GSI. */
+	for (i = 0; i < nr_ioapics; i++) {
+		if ((gsi >= mp_gsi_routing[i].gsi_base)
+		    && (gsi <= mp_gsi_routing[i].gsi_end))
+			return i;
+	}
+
+	printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
+	return -1;
+}
+
+int mp_find_ioapic_pin(int ioapic, int gsi)
+{
+	if (WARN_ON(ioapic == -1))
+		return -1;
+	if (WARN_ON(gsi > mp_gsi_routing[ioapic].gsi_end))
+		return -1;
+
+	return gsi - mp_gsi_routing[ioapic].gsi_base;
+}
+
+static int bad_ioapic(unsigned long address)
+{
+	if (nr_ioapics >= MAX_IO_APICS) {
+		printk(KERN_WARNING "WARNING: Max # of I/O APICs (%d) exceeded "
+		       "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics);
+		return 1;
+	}
+	if (!address) {
+		printk(KERN_WARNING "WARNING: Bogus (zero) I/O APIC address"
+		       " found in table, skipping!\n");
+		return 1;
+	}
+	return 0;
+}
+
+void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
+{
+	int idx = 0;
+
+	if (bad_ioapic(address))
+		return;
+
+	idx = nr_ioapics;
+
+	mp_ioapics[idx].type = MP_IOAPIC;
+	mp_ioapics[idx].flags = MPC_APIC_USABLE;
+	mp_ioapics[idx].apicaddr = address;
+
+#ifndef CONFIG_XEN
+	set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
+#endif
+	mp_ioapics[idx].apicid = io_apic_unique_id(id);
+	mp_ioapics[idx].apicver = io_apic_get_version(idx);
+
+	/*
+	 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
	 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
	 */
+	mp_gsi_routing[idx].gsi_base = gsi_base;
+	mp_gsi_routing[idx].gsi_end = gsi_base +
+	    io_apic_get_redir_entries(idx);
+
+	printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
+	       "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
	       mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr,
	       mp_gsi_routing[idx].gsi_base, mp_gsi_routing[idx].gsi_end);
+
+	nr_ioapics++;
+}
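The new mp_gsi_routing[] table makes GSI decoding a range scan: each IO-APIC owns a contiguous GSI window, and the pin is just the offset into it, which is what mp_find_ioapic() and mp_find_ioapic_pin() implement. A standalone sketch of the lookup (made-up two-APIC layout):

	#include <stdio.h>

	struct gsi_range { int gsi_base, gsi_end; };

	static struct gsi_range routing[] = { { 0, 23 }, { 24, 47 } };

	static int find_ioapic(int gsi)
	{
		unsigned int i;

		for (i = 0; i < sizeof(routing) / sizeof(routing[0]); i++)
			if (gsi >= routing[i].gsi_base &&
			    gsi <= routing[i].gsi_end)
				return i;
		return -1;
	}

	int main(void)
	{
		int gsi = 30, apic = find_ioapic(gsi);

		/* mp_find_ioapic_pin(): pin = offset into the APIC's window */
		printf("gsi %d -> ioapic %d pin %d\n", gsi, apic,
		       gsi - routing[apic].gsi_base);
		return 0;
	}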
--- head-2010-05-25.orig/arch/x86/kernel/cpu/Makefile	2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/cpu/Makefile	2010-03-24 15:32:27.000000000 +0100
@@ -34,7 +34,7 @@ obj-$(CONFIG_CPU_FREQ)	+= cpufreq/
 
 obj-$(CONFIG_X86_LOCAL_APIC)	+= perfctr-watchdog.o
 
-disabled-obj-$(CONFIG_XEN) := hypervisor.o vmware.o
+disabled-obj-$(CONFIG_XEN) := hypervisor.o sched.o vmware.o
 
 quiet_cmd_mkcapflags = MKCAP   $@
       cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
--- head-2010-05-25.orig/arch/x86/kernel/cpu/amd.c	2010-03-24 15:25:21.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/cpu/amd.c	2010-03-24 15:32:27.000000000 +0100
@@ -313,7 +313,7 @@ static void __cpuinit amd_detect_cmp(str
 int amd_get_nb_id(int cpu)
 {
 	int id = 0;
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
 	id = per_cpu(cpu_llc_id, cpu);
 #endif
 	return id;
@@ -469,8 +469,10 @@ static void __cpuinit init_amd(struct cp
 	if (c->x86 == 0x10 || c->x86 == 0x11)
 		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
 
+#ifndef CONFIG_XEN
 	/* get apicid instead of initial apic id from cpuid */
 	c->apicid = hard_smp_processor_id();
+#endif
 #else
 
 	/*
--- head-2010-05-25.orig/arch/x86/kernel/cpu/common-xen.c	2010-03-24 15:25:21.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/cpu/common-xen.c	2010-03-24 15:32:27.000000000 +0100
@@ -13,13 +13,13 @@
 #include
 #include
-#include
+#include
 #include
 #include
 #include
 #include
-#include
-#include
+#include
+#include
 #include
 #include
 #include
@@ -28,13 +28,12 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
 #include
 #include
 #include
-#include
 
 #ifdef CONFIG_X86_LOCAL_APIC
 #include
@@ -102,17 +101,17 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_p
 	 * TLS descriptors are currently at a different place compared to i386.
	 * Hopefully nobody expects them at a fixed place (Wine?)
	 */
-	[GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
-	[GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
-	[GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
-	[GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
-	[GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
-	[GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
+	[GDT_ENTRY_KERNEL32_CS]		= GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
+	[GDT_ENTRY_KERNEL_CS]		= GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
+	[GDT_ENTRY_KERNEL_DS]		= GDT_ENTRY_INIT(0xc093, 0, 0xfffff),
+	[GDT_ENTRY_DEFAULT_USER32_CS]	= GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff),
+	[GDT_ENTRY_DEFAULT_USER_DS]	= GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff),
+	[GDT_ENTRY_DEFAULT_USER_CS]	= GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff),
 #else
-	[GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
-	[GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
-	[GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
-	[GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
+	[GDT_ENTRY_KERNEL_CS]		= GDT_ENTRY_INIT(0xc09a, 0, 0xfffff),
+	[GDT_ENTRY_KERNEL_DS]		= GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
+	[GDT_ENTRY_DEFAULT_USER_CS]	= GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff),
+	[GDT_ENTRY_DEFAULT_USER_DS]	= GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff),
 #ifndef CONFIG_XEN
 	/*
	 * Segments used for calling PnP BIOS have byte granularity.
@@ -120,29 +119,29 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_p
 	 * the transfer segment sizes are set at run time.
	 */
 	/* 32-bit code */
-	[GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
+	[GDT_ENTRY_PNPBIOS_CS32]	= GDT_ENTRY_INIT(0x409a, 0, 0xffff),
 	/* 16-bit code */
-	[GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
+	[GDT_ENTRY_PNPBIOS_CS16]	= GDT_ENTRY_INIT(0x009a, 0, 0xffff),
 	/* 16-bit data */
-	[GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
+	[GDT_ENTRY_PNPBIOS_DS]		= GDT_ENTRY_INIT(0x0092, 0, 0xffff),
 	/* 16-bit data */
-	[GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
+	[GDT_ENTRY_PNPBIOS_TS1]		= GDT_ENTRY_INIT(0x0092, 0, 0),
 	/* 16-bit data */
-	[GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
+	[GDT_ENTRY_PNPBIOS_TS2]		= GDT_ENTRY_INIT(0x0092, 0, 0),
 	/*
	 * The APM segments have byte granularity and their bases
	 * are set at run time. All have 64k limits.
	 */
 	/* 32-bit code */
-	[GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
+	[GDT_ENTRY_APMBIOS_BASE]	= GDT_ENTRY_INIT(0x409a, 0, 0xffff),
 	/* 16-bit code */
-	[GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
+	[GDT_ENTRY_APMBIOS_BASE+1]	= GDT_ENTRY_INIT(0x009a, 0, 0xffff),
 	/* data */
-	[GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
+	[GDT_ENTRY_APMBIOS_BASE+2]	= GDT_ENTRY_INIT(0x4092, 0, 0xffff),
 
-	[GDT_ENTRY_ESPFIX_SS] = { { { 0x0000ffff, 0x00cf9200 } } },
+	[GDT_ENTRY_ESPFIX_SS]		= GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
 #endif
-	[GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
+	[GDT_ENTRY_PERCPU]		= GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
 	GDT_STACK_CANARY_INIT
 #endif
 } };
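GDT_ENTRY_INIT(flags, base, limit) replaces the old opaque two-word literals with named fields: flags carry G/D/L/AVL in bits 15-12 and type/S/DPL/P in bits 7-0, and the macro scatters base and limit into the descriptor's split fields. A standalone check (packing below mirrors the upstream macro as I understand it) that the first converted entry encodes the same words as the literal it replaced:

	#include <assert.h>
	#include <stdint.h>

	static void gdt_entry(uint32_t flags, uint32_t base, uint32_t limit,
			      uint32_t *a, uint32_t *b)
	{
		*a = (limit & 0xffff) | ((base & 0xffff) << 16);
		*b = ((base & 0xff0000) >> 16) | ((flags & 0xf0ff) << 8) |
		     (limit & 0xf0000) | (base & 0xff000000);
	}

	int main(void)
	{
		uint32_t a, b;

		/* GDT_ENTRY_KERNEL32_CS: was { 0x0000ffff, 0x00cf9b00 } */
		gdt_entry(0xc09b, 0, 0xfffff, &a, &b);
		assert(a == 0x0000ffff && b == 0x00cf9b00);
		return 0;
	}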
@@ -900,7 +899,7 @@ void __init identify_boot_cpu(void)
 #else
 	vgetcpu_set_mode();
 #endif
-	init_hw_perf_counters();
+	init_hw_perf_events();
 }
 
 void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
@@ -1013,7 +1012,7 @@ __setup("clearcpuid=", setup_disablecpui
 #ifdef CONFIG_X86_64
 #ifndef CONFIG_X86_NO_IDT
-struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
+struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
 #endif
 
 DEFINE_PER_CPU_FIRST(union irq_stack_union,
@@ -1027,13 +1026,21 @@ void xen_switch_pt(void)
 #endif
 }
 
-DEFINE_PER_CPU(char *, irq_stack_ptr) =
-	init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
+/*
+ * The following four percpu variables are hot. Align current_task to
+ * cacheline size such that all four fall in the same cacheline.
+ */
+DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
+	&init_task;
+EXPORT_PER_CPU_SYMBOL(current_task);
 
 DEFINE_PER_CPU(unsigned long, kernel_stack) =
 	(unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
 EXPORT_PER_CPU_SYMBOL(kernel_stack);
 
+DEFINE_PER_CPU(char *, irq_stack_ptr) =
+	init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
+
 DEFINE_PER_CPU(unsigned int, irq_count) = -1;
 
 #ifndef CONFIG_X86_NO_TSS
@@ -1049,8 +1056,7 @@ static const unsigned int exception_stac
 };
 
 static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
-	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ])
-	__aligned(PAGE_SIZE);
+	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
 #endif
 
 void __cpuinit syscall_init(void)
@@ -1097,8 +1103,11 @@ DEFINE_PER_CPU(struct orig_ist, orig_ist
 
 #else	/* CONFIG_X86_64 */
 
+DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
+EXPORT_PER_CPU_SYMBOL(current_task);
+
 #ifdef CONFIG_CC_STACKPROTECTOR
-DEFINE_PER_CPU(unsigned long, stack_canary);
+DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
 #endif
 
 /* Make sure %fs and %gs are initialized properly in idle threads */
--- head-2010-05-25.orig/arch/x86/kernel/cpu/mcheck/mce-inject.c	2010-05-25 09:12:08.000000000 +0200
+++ head-2010-05-25/arch/x86/kernel/cpu/mcheck/mce-inject.c	2010-04-15 10:10:43.000000000 +0200
@@ -144,7 +144,7 @@ static void raise_mce(struct mce *m)
 	if (context == MCJ_CTX_RANDOM)
 		return;
 
-#ifdef CONFIG_X86_LOCAL_APIC
+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
 	if (m->inject_flags & MCJ_NMI_BROADCAST) {
 		unsigned long start;
 		int cpu;
--- head-2010-05-25.orig/arch/x86/kernel/cpu/mtrr/main-xen.c	2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/cpu/mtrr/main-xen.c	2010-03-24 15:32:27.000000000 +0100
@@ -1,10 +1,9 @@
-#include
-#include
-#include
+#define DEBUG
+
+#include
 #include
-#include
-#include
 #include
+#include
 #include
 
 #include "mtrr.h"
@@ -58,7 +57,7 @@ static void __init init_table(void)
 		mtrr_usage_table[i] = 0;
 }
 
-int mtrr_add_page(unsigned long base, unsigned long size, 
+int mtrr_add_page(unsigned long base, unsigned long size,
 		  unsigned int type, bool increment)
 {
 	int error;
@@ -88,25 +87,23 @@ int mtrr_add_page(unsigned long base, un
 static int mtrr_check(unsigned long base, unsigned long size)
 {
 	if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
-		printk(KERN_WARNING
-			"mtrr: size and base must be multiples of 4 kiB\n");
-		printk(KERN_DEBUG
-			"mtrr: size: 0x%lx base: 0x%lx\n", size, base);
+		pr_warning("mtrr: size and base must be multiples of 4 kiB\n");
+		pr_debug("mtrr: size: 0x%lx base: 0x%lx\n", size, base);
 		dump_stack();
 		return -1;
 	}
 	return 0;
 }
 
-int
-mtrr_add(unsigned long base, unsigned long size, unsigned int type,
-	 bool increment)
+int mtrr_add(unsigned long base, unsigned long size, unsigned int type,
+	     bool increment)
 {
 	if (mtrr_check(base, size))
 		return -EINVAL;
 	return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
 			     increment);
 }
+EXPORT_SYMBOL(mtrr_add);
 
 int mtrr_del_page(int reg, unsigned long base, unsigned long size)
 {
@@ -128,13 +125,13 @@ int mtrr_del_page(int reg, unsigned long
 		}
 	}
 	if (reg < 0) {
-		printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base,
-		       size);
+		pr_debug("mtrr: no MTRR for %lx000,%lx000 found\n",
+			 base, size);
 		goto out;
 	}
 	}
 	if (mtrr_usage_table[reg] < 1) {
-		printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
+		pr_warning("mtrr: reg: %d has count=0\n", reg);
 		goto out;
 	}
 	if (--mtrr_usage_table[reg] < 1) {
@@ -153,15 +150,12 @@ int mtrr_del_page(int reg, unsigned long
 	return error;
 }
 
-int
-mtrr_del(int reg, unsigned long base, unsigned long size)
+int mtrr_del(int reg, unsigned long base, unsigned long size)
 {
 	if (mtrr_check(base, size))
 		return -EINVAL;
 	return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
 }
-
-EXPORT_SYMBOL(mtrr_add);
 EXPORT_SYMBOL(mtrr_del);
 
 /*
--- head-2010-05-25.orig/arch/x86/kernel/e820-xen.c	2010-03-24 15:25:21.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/e820-xen.c	2010-03-24 15:32:27.000000000 +0100
@@ -134,7 +134,7 @@ static void __init __e820_add_region(str
 {
 	int x = e820x->nr_map;
 
-	if (x == ARRAY_SIZE(e820x->map)) {
+	if (x >= ARRAY_SIZE(e820x->map)) {
 		printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
 		return;
 	}
@@ -1455,7 +1455,7 @@ void __init e820_reserve_resources(void)
 	struct resource *res;
 	u64 end;
 
-	res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
+	res = alloc_bootmem(sizeof(struct resource) * e820.nr_map);
 	e820_res = res;
 	for (i = 0; i < e820.nr_map; i++) {
 		end = e820.map[i].addr + e820.map[i].size - 1;
@@ -1502,8 +1502,8 @@ static unsigned long ram_alignment(resou
 	if (mb < 16)
 		return 1024*1024;
 
-	/* To 32MB for anything above that */
-	return 32*1024*1024;
+	/* To 64MB for anything above that */
+	return 64*1024*1024;
 }
 
 #define MAX_RESOURCE_SIZE ((resource_size_t)-1)
@@ -1543,59 +1543,8 @@ void __init e820_reserve_resources_late(
 
 #undef e820
 
-#ifndef CONFIG_XEN
 char *__init default_machine_specific_memory_setup(void)
 {
-	char *who = "BIOS-e820";
-	u32 new_nr;
-	/*
	 * Try to copy the BIOS-supplied E820-map.
-	 *
-	 * Otherwise fake a memory map; one section from 0k->640k,
-	 * the next section from 1mb->appropriate_mem_k
-	 */
-	new_nr = boot_params.e820_entries;
-	sanitize_e820_map(boot_params.e820_map,
-			ARRAY_SIZE(boot_params.e820_map),
-			&new_nr);
-	boot_params.e820_entries = new_nr;
-	if (append_e820_map(boot_params.e820_map, boot_params.e820_entries)
-	  < 0) {
-		u64 mem_size;
-
-		/* compare results from other methods and take the greater */
-		if (boot_params.alt_mem_k
-		    < boot_params.screen_info.ext_mem_k) {
-			mem_size = boot_params.screen_info.ext_mem_k;
-			who = "BIOS-88";
-		} else {
-			mem_size = boot_params.alt_mem_k;
-			who = "BIOS-e801";
-		}
-
-		e820.nr_map = 0;
-		e820_add_region(0, LOWMEMSIZE(), E820_RAM);
-		e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
-	}
-
-	/* In case someone cares... */
-	return who;
-}
-
-char *__init __attribute__((weak)) machine_specific_memory_setup(void)
-{
-	if (x86_quirks->arch_memory_setup) {
-		char *who = x86_quirks->arch_memory_setup();
-
-		if (who)
-			return who;
-	}
-	return default_machine_specific_memory_setup();
-}
-#endif
-
-static char * __init _memory_setup(void)
-{
 	int rc, nr_map;
 	struct xen_memory_map memmap;
 	static struct e820entry __initdata map[E820MAX];
@@ -1639,7 +1588,7 @@ void __init setup_memory_map(void)
 {
 	char *who;
 
-	who = _memory_setup();
+	who = x86_init.resources.memory_setup();
 
 #ifdef CONFIG_XEN
 	if (is_initial_xendomain()) {
 		printk(KERN_INFO "Xen-provided machine memory map:\n");
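With the old weak-function override gone, memory-map setup is routed through the 2.6.32 x86_init ops table: setup_memory_map() calls a function pointer whose default is default_machine_specific_memory_setup(), and a platform can swap in its own hook before setup runs. A rough standalone model of the dispatch (names and return strings are stand-ins, not the kernel's):

	#include <stdio.h>

	static char *default_memory_setup(void)
	{
		return "BIOS-e820";
	}

	struct x86_init_ops {
		struct { char *(*memory_setup)(void); } resources;
	};

	static struct x86_init_ops x86_init = {
		.resources = { .memory_setup = default_memory_setup },
	};

	static char *xen_memory_setup(void)
	{
		return "Xen-provided";	/* override installed at early boot */
	}

	int main(void)
	{
		printf("%s\n", x86_init.resources.memory_setup());
		x86_init.resources.memory_setup = xen_memory_setup;
		printf("%s\n", x86_init.resources.memory_setup());
		return 0;
	}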
--- head-2010-05-25.orig/arch/x86/kernel/early_printk-xen.c	2010-03-24 15:25:06.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/early_printk-xen.c	2010-03-24 15:32:27.000000000 +0100
@@ -178,7 +178,6 @@ static __init void early_serial_init(cha
 * mappings. Someone should fix this for domain 0. For now, use fake serial.
 */
 #define early_vga_console early_serial_console
-#define xenboot_console early_serial_console
 
 #endif
 
@@ -189,721 +188,6 @@ static struct console early_serial_conso
 	.index =	-1,
 };
 
-#ifdef CONFIG_EARLY_PRINTK_DBGP
-
-static struct ehci_caps __iomem *ehci_caps;
-static struct ehci_regs __iomem *ehci_regs;
-static struct ehci_dbg_port __iomem *ehci_debug;
-static unsigned int dbgp_endpoint_out;
-
-struct ehci_dev {
-	u32 bus;
-	u32 slot;
-	u32 func;
-};
-
-static struct ehci_dev ehci_dev;
-
-#define USB_DEBUG_DEVNUM 127
-
-#define DBGP_DATA_TOGGLE	0x8800
-
-static inline u32 dbgp_pid_update(u32 x, u32 tok)
-{
-	return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff);
-}
-
-static inline u32 dbgp_len_update(u32 x, u32 len)
-{
-	return (x & ~0x0f) | (len & 0x0f);
-}
-
-/*
- * USB Packet IDs (PIDs)
- */
-
-/* token */
-#define USB_PID_OUT		0xe1
-#define USB_PID_IN		0x69
-#define USB_PID_SOF		0xa5
-#define USB_PID_SETUP		0x2d
-/* handshake */
-#define USB_PID_ACK		0xd2
-#define USB_PID_NAK		0x5a
-#define USB_PID_STALL		0x1e
-#define USB_PID_NYET		0x96
-/* data */
-#define USB_PID_DATA0		0xc3
-#define USB_PID_DATA1		0x4b
-#define USB_PID_DATA2		0x87
-#define USB_PID_MDATA		0x0f
-/* Special */
-#define USB_PID_PREAMBLE	0x3c
-#define USB_PID_ERR		0x3c
-#define USB_PID_SPLIT		0x78
-#define USB_PID_PING		0xb4
-#define USB_PID_UNDEF_0		0xf0
-
-#define USB_PID_DATA_TOGGLE	0x88
-#define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE)
-
-#define PCI_CAP_ID_EHCI_DEBUG	0xa
-
-#define HUB_ROOT_RESET_TIME	50	/* times are in msec */
-#define HUB_SHORT_RESET_TIME	10
-#define HUB_LONG_RESET_TIME	200
-#define HUB_RESET_TIMEOUT	500
-
-#define DBGP_MAX_PACKET		8
-
-static int dbgp_wait_until_complete(void)
-{
-	u32 ctrl;
-	int loop = 0x100000;
-
-	do {
-		ctrl = readl(&ehci_debug->control);
-		/* Stop when the transaction is finished */
-		if (ctrl & DBGP_DONE)
-			break;
-	} while (--loop > 0);
-
-	if (!loop)
-		return -1;
-
-	/*
-	 * Now that we have observed the completed transaction,
-	 * clear the done bit.
-	 */
-	writel(ctrl | DBGP_DONE, &ehci_debug->control);
-	return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl);
-}
-
-static void __init dbgp_mdelay(int ms)
-{
-	int i;
-
-	while (ms--) {
-		for (i = 0; i < 1000; i++)
-			outb(0x1, 0x80);
-	}
-}
-
-static void dbgp_breath(void)
-{
-	/* Sleep to give the debug port a chance to breathe */
-}
-
-static int dbgp_wait_until_done(unsigned ctrl)
-{
-	u32 pids, lpid;
-	int ret;
-	int loop = 3;
-
-retry:
-	writel(ctrl | DBGP_GO, &ehci_debug->control);
-	ret = dbgp_wait_until_complete();
-	pids = readl(&ehci_debug->pids);
-	lpid = DBGP_PID_GET(pids);
-
-	if (ret < 0)
-		return ret;
-
-	/*
-	 * If the port is getting full or it has dropped data
-	 * start pacing ourselves, not necessary but it's friendly.
-	 */
- */ - if ((lpid == USB_PID_NAK) || (lpid == USB_PID_NYET)) - dbgp_breath(); - - /* If I get a NACK reissue the transmission */ - if (lpid == USB_PID_NAK) { - if (--loop > 0) - goto retry; - } - - return ret; -} - -static void dbgp_set_data(const void *buf, int size) -{ - const unsigned char *bytes = buf; - u32 lo, hi; - int i; - - lo = hi = 0; - for (i = 0; i < 4 && i < size; i++) - lo |= bytes[i] << (8*i); - for (; i < 8 && i < size; i++) - hi |= bytes[i] << (8*(i - 4)); - writel(lo, &ehci_debug->data03); - writel(hi, &ehci_debug->data47); -} - -static void __init dbgp_get_data(void *buf, int size) -{ - unsigned char *bytes = buf; - u32 lo, hi; - int i; - - lo = readl(&ehci_debug->data03); - hi = readl(&ehci_debug->data47); - for (i = 0; i < 4 && i < size; i++) - bytes[i] = (lo >> (8*i)) & 0xff; - for (; i < 8 && i < size; i++) - bytes[i] = (hi >> (8*(i - 4))) & 0xff; -} - -static int dbgp_bulk_write(unsigned devnum, unsigned endpoint, - const char *bytes, int size) -{ - u32 pids, addr, ctrl; - int ret; - - if (size > DBGP_MAX_PACKET) - return -1; - - addr = DBGP_EPADDR(devnum, endpoint); - - pids = readl(&ehci_debug->pids); - pids = dbgp_pid_update(pids, USB_PID_OUT); - - ctrl = readl(&ehci_debug->control); - ctrl = dbgp_len_update(ctrl, size); - ctrl |= DBGP_OUT; - ctrl |= DBGP_GO; - - dbgp_set_data(bytes, size); - writel(addr, &ehci_debug->address); - writel(pids, &ehci_debug->pids); - - ret = dbgp_wait_until_done(ctrl); - if (ret < 0) - return ret; - - return ret; -} - -static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data, - int size) -{ - u32 pids, addr, ctrl; - int ret; - - if (size > DBGP_MAX_PACKET) - return -1; - - addr = DBGP_EPADDR(devnum, endpoint); - - pids = readl(&ehci_debug->pids); - pids = dbgp_pid_update(pids, USB_PID_IN); - - ctrl = readl(&ehci_debug->control); - ctrl = dbgp_len_update(ctrl, size); - ctrl &= ~DBGP_OUT; - ctrl |= DBGP_GO; - - writel(addr, &ehci_debug->address); - writel(pids, &ehci_debug->pids); - ret = dbgp_wait_until_done(ctrl); - if (ret < 0) - return ret; - - if (size > ret) - size = ret; - dbgp_get_data(data, size); - return ret; -} - -static int __init dbgp_control_msg(unsigned devnum, int requesttype, - int request, int value, int index, void *data, int size) -{ - u32 pids, addr, ctrl; - struct usb_ctrlrequest req; - int read; - int ret; - - read = (requesttype & USB_DIR_IN) != 0; - if (size > (read ? 
DBGP_MAX_PACKET:0)) - return -1; - - /* Compute the control message */ - req.bRequestType = requesttype; - req.bRequest = request; - req.wValue = cpu_to_le16(value); - req.wIndex = cpu_to_le16(index); - req.wLength = cpu_to_le16(size); - - pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP); - addr = DBGP_EPADDR(devnum, 0); - - ctrl = readl(&ehci_debug->control); - ctrl = dbgp_len_update(ctrl, sizeof(req)); - ctrl |= DBGP_OUT; - ctrl |= DBGP_GO; - - /* Send the setup message */ - dbgp_set_data(&req, sizeof(req)); - writel(addr, &ehci_debug->address); - writel(pids, &ehci_debug->pids); - ret = dbgp_wait_until_done(ctrl); - if (ret < 0) - return ret; - - /* Read the result */ - return dbgp_bulk_read(devnum, 0, data, size); -} - - -/* Find a PCI capability */ -static u32 __init find_cap(u32 num, u32 slot, u32 func, int cap) -{ - u8 pos; - int bytes; - - if (!(read_pci_config_16(num, slot, func, PCI_STATUS) & - PCI_STATUS_CAP_LIST)) - return 0; - - pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST); - for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { - u8 id; - - pos &= ~3; - id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID); - if (id == 0xff) - break; - if (id == cap) - return pos; - - pos = read_pci_config_byte(num, slot, func, - pos+PCI_CAP_LIST_NEXT); - } - return 0; -} - -static u32 __init __find_dbgp(u32 bus, u32 slot, u32 func) -{ - u32 class; - - class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION); - if ((class >> 8) != PCI_CLASS_SERIAL_USB_EHCI) - return 0; - - return find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG); -} - -static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc) -{ - u32 bus, slot, func; - - for (bus = 0; bus < 256; bus++) { - for (slot = 0; slot < 32; slot++) { - for (func = 0; func < 8; func++) { - unsigned cap; - - cap = __find_dbgp(bus, slot, func); - - if (!cap) - continue; - if (ehci_num-- != 0) - continue; - *rbus = bus; - *rslot = slot; - *rfunc = func; - return cap; - } - } - } - return 0; -} - -static int __init ehci_reset_port(int port) -{ - u32 portsc; - u32 delay_time, delay; - int loop; - - /* Reset the usb debug port */ - portsc = readl(&ehci_regs->port_status[port - 1]); - portsc &= ~PORT_PE; - portsc |= PORT_RESET; - writel(portsc, &ehci_regs->port_status[port - 1]); - - delay = HUB_ROOT_RESET_TIME; - for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT; - delay_time += delay) { - dbgp_mdelay(delay); - - portsc = readl(&ehci_regs->port_status[port - 1]); - if (portsc & PORT_RESET) { - /* force reset to complete */ - loop = 2; - writel(portsc & ~(PORT_RWC_BITS | PORT_RESET), - &ehci_regs->port_status[port - 1]); - do { - portsc = readl(&ehci_regs->port_status[port-1]); - } while ((portsc & PORT_RESET) && (--loop > 0)); - } - - /* Device went away? */ - if (!(portsc & PORT_CONNECT)) - return -ENOTCONN; - - /* bomb out completely if something weird happend */ - if ((portsc & PORT_CSC)) - return -EINVAL; - - /* If we've finished resetting, then break out of the loop */ - if (!(portsc & PORT_RESET) && (portsc & PORT_PE)) - return 0; - } - return -EBUSY; -} - -static int __init ehci_wait_for_port(int port) -{ - u32 status; - int ret, reps; - - for (reps = 0; reps < 3; reps++) { - dbgp_mdelay(100); - status = readl(&ehci_regs->status); - if (status & STS_PCD) { - ret = ehci_reset_port(port); - if (ret == 0) - return 0; - } - } - return -ENOTCONN; -} - -#ifdef DBGP_DEBUG -# define dbgp_printk early_printk -#else -static inline void dbgp_printk(const char *fmt, ...) 
{ } -#endif - -typedef void (*set_debug_port_t)(int port); - -static void __init default_set_debug_port(int port) -{ -} - -static set_debug_port_t __initdata set_debug_port = default_set_debug_port; - -static void __init nvidia_set_debug_port(int port) -{ - u32 dword; - dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, - 0x74); - dword &= ~(0x0f<<12); - dword |= ((port & 0x0f)<<12); - write_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 0x74, - dword); - dbgp_printk("set debug port to %d\n", port); -} - -static void __init detect_set_debug_port(void) -{ - u32 vendorid; - - vendorid = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, - 0x00); - - if ((vendorid & 0xffff) == 0x10de) { - dbgp_printk("using nvidia set_debug_port\n"); - set_debug_port = nvidia_set_debug_port; - } -} - -static int __init ehci_setup(void) -{ - struct usb_debug_descriptor dbgp_desc; - u32 cmd, ctrl, status, portsc, hcs_params; - u32 debug_port, new_debug_port = 0, n_ports; - u32 devnum; - int ret, i; - int loop; - int port_map_tried; - int playtimes = 3; - -try_next_time: - port_map_tried = 0; - -try_next_port: - - hcs_params = readl(&ehci_caps->hcs_params); - debug_port = HCS_DEBUG_PORT(hcs_params); - n_ports = HCS_N_PORTS(hcs_params); - - dbgp_printk("debug_port: %d\n", debug_port); - dbgp_printk("n_ports: %d\n", n_ports); - - for (i = 1; i <= n_ports; i++) { - portsc = readl(&ehci_regs->port_status[i-1]); - dbgp_printk("portstatus%d: %08x\n", i, portsc); - } - - if (port_map_tried && (new_debug_port != debug_port)) { - if (--playtimes) { - set_debug_port(new_debug_port); - goto try_next_time; - } - return -1; - } - - loop = 10; - /* Reset the EHCI controller */ - cmd = readl(&ehci_regs->command); - cmd |= CMD_RESET; - writel(cmd, &ehci_regs->command); - do { - cmd = readl(&ehci_regs->command); - } while ((cmd & CMD_RESET) && (--loop > 0)); - - if (!loop) { - dbgp_printk("can not reset ehci\n"); - return -1; - } - dbgp_printk("ehci reset done\n"); - - /* Claim ownership, but do not enable yet */ - ctrl = readl(&ehci_debug->control); - ctrl |= DBGP_OWNER; - ctrl &= ~(DBGP_ENABLED | DBGP_INUSE); - writel(ctrl, &ehci_debug->control); - - /* Start the ehci running */ - cmd = readl(&ehci_regs->command); - cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET); - cmd |= CMD_RUN; - writel(cmd, &ehci_regs->command); - - /* Ensure everything is routed to the EHCI */ - writel(FLAG_CF, &ehci_regs->configured_flag); - - /* Wait until the controller is no longer halted */ - loop = 10; - do { - status = readl(&ehci_regs->status); - } while ((status & STS_HALT) && (--loop > 0)); - - if (!loop) { - dbgp_printk("ehci can be started\n"); - return -1; - } - dbgp_printk("ehci started\n"); - - /* Wait for a device to show up in the debug port */ - ret = ehci_wait_for_port(debug_port); - if (ret < 0) { - dbgp_printk("No device found in debug port\n"); - goto next_debug_port; - } - dbgp_printk("ehci wait for port done\n"); - - /* Enable the debug port */ - ctrl = readl(&ehci_debug->control); - ctrl |= DBGP_CLAIM; - writel(ctrl, &ehci_debug->control); - ctrl = readl(&ehci_debug->control); - if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) { - dbgp_printk("No device in debug port\n"); - writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control); - goto err; - } - dbgp_printk("debug ported enabled\n"); - - /* Completely transfer the debug device to the debug controller */ - portsc = readl(&ehci_regs->port_status[debug_port - 1]); - portsc &= ~PORT_PE; - writel(portsc, &ehci_regs->port_status[debug_port - 1]); - - 
dbgp_mdelay(100); - - /* Find the debug device and make it device number 127 */ - for (devnum = 0; devnum <= 127; devnum++) { - ret = dbgp_control_msg(devnum, - USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE, - USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0, - &dbgp_desc, sizeof(dbgp_desc)); - if (ret > 0) - break; - } - if (devnum > 127) { - dbgp_printk("Could not find attached debug device\n"); - goto err; - } - if (ret < 0) { - dbgp_printk("Attached device is not a debug device\n"); - goto err; - } - dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint; - - /* Move the device to 127 if it isn't already there */ - if (devnum != USB_DEBUG_DEVNUM) { - ret = dbgp_control_msg(devnum, - USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE, - USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0); - if (ret < 0) { - dbgp_printk("Could not move attached device to %d\n", - USB_DEBUG_DEVNUM); - goto err; - } - devnum = USB_DEBUG_DEVNUM; - dbgp_printk("debug device renamed to 127\n"); - } - - /* Enable the debug interface */ - ret = dbgp_control_msg(USB_DEBUG_DEVNUM, - USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE, - USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0); - if (ret < 0) { - dbgp_printk(" Could not enable the debug device\n"); - goto err; - } - dbgp_printk("debug interface enabled\n"); - - /* Perform a small write to get the even/odd data state in sync - */ - ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1); - if (ret < 0) { - dbgp_printk("dbgp_bulk_write failed: %d\n", ret); - goto err; - } - dbgp_printk("small write doned\n"); - - return 0; -err: - /* Things didn't work so remove my claim */ - ctrl = readl(&ehci_debug->control); - ctrl &= ~(DBGP_CLAIM | DBGP_OUT); - writel(ctrl, &ehci_debug->control); - return -1; - -next_debug_port: - port_map_tried |= (1<<(debug_port - 1)); - new_debug_port = ((debug_port-1+1)%n_ports) + 1; - if (port_map_tried != ((1<<n_ports) - 1)) { - set_debug_port(new_debug_port); - goto try_next_port; - } - if (--playtimes) { - set_debug_port(new_debug_port); - goto try_next_time; - } - - return -1; -} - -static int __init early_dbgp_init(char *s) -{ - u32 debug_port, bar, offset; - u32 bus, slot, func, cap; - void __iomem *ehci_bar; - u32 dbgp_num; - u32 bar_val; - char *e; - int ret; - u8 byte; - - if (!early_pci_allowed()) - return -1; - - dbgp_num = 0; - if (*s) - dbgp_num = simple_strtoul(s, &e, 10); - dbgp_printk("dbgp_num: %d\n", dbgp_num); - - cap = find_dbgp(dbgp_num, &bus, &slot, &func); - if (!cap) - return -1; - - dbgp_printk("Found EHCI debug port on %02x:%02x.%1x\n", bus, slot, - func); - - debug_port = read_pci_config(bus, slot, func, cap); - bar = (debug_port >> 29) & 0x7; - bar = (bar * 4) + 0xc; - offset = (debug_port >> 16) & 0xfff; - dbgp_printk("bar: %02x offset: %03x\n", bar, offset); - if (bar != PCI_BASE_ADDRESS_0) { - dbgp_printk("only debug ports on bar 1 handled.\n"); - - return -1; - } - - bar_val = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0); - dbgp_printk("bar_val: %02x offset: %03x\n", bar_val, offset); - if (bar_val & ~PCI_BASE_ADDRESS_MEM_MASK) { - dbgp_printk("only simple 32bit mmio bars supported\n"); - - return -1; - } - - /* double check if the mem space is enabled */ - byte = read_pci_config_byte(bus, slot, func, 0x04); - if (!(byte & 0x2)) { - byte |= 0x02; - write_pci_config_byte(bus, slot, func, 0x04, byte); - dbgp_printk("mmio for ehci enabled\n"); - } - - /* - * FIXME I don't have the bar size so just guess PAGE_SIZE is more - * than enough. 1K is the biggest I have seen.
- */ - set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK); - ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE); - ehci_bar += bar_val & ~PAGE_MASK; - dbgp_printk("ehci_bar: %p\n", ehci_bar); - - ehci_caps = ehci_bar; - ehci_regs = ehci_bar + HC_LENGTH(readl(&ehci_caps->hc_capbase)); - ehci_debug = ehci_bar + offset; - ehci_dev.bus = bus; - ehci_dev.slot = slot; - ehci_dev.func = func; - - detect_set_debug_port(); - - ret = ehci_setup(); - if (ret < 0) { - dbgp_printk("ehci_setup failed\n"); - ehci_debug = NULL; - - return -1; - } - - return 0; -} - -static void early_dbgp_write(struct console *con, const char *str, u32 n) -{ - int chunk, ret; - - if (!ehci_debug) - return; - while (n > 0) { - chunk = n; - if (chunk > DBGP_MAX_PACKET) - chunk = DBGP_MAX_PACKET; - ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, - dbgp_endpoint_out, str, chunk); - str += chunk; - n -= chunk; - } -} - -static struct console early_dbgp_console = { - .name = "earlydbg", - .write = early_dbgp_write, - .flags = CON_PRINTBUFFER, - .index = -1, -}; -#endif - /* Direct interface for emergencies */ static struct console *early_console = &early_vga_console; static int __initdata early_console_initialized; @@ -920,10 +204,24 @@ asmlinkage void early_printk(const char va_end(ap); } +static inline void early_console_register(struct console *con, int keep_early) +{ + if (early_console->index != -1) { + printk(KERN_CRIT "ERROR: earlyprintk= %s already used\n", + con->name); + return; + } + early_console = con; + if (keep_early) + early_console->flags &= ~CON_BOOT; + else + early_console->flags |= CON_BOOT; + register_console(early_console); +} static int __init setup_early_printk(char *buf) { - int keep_early; + int keep; if (!buf) return 0; @@ -932,44 +230,41 @@ static int __init setup_early_printk(cha return 0; early_console_initialized = 1; - keep_early = (strstr(buf, "keep") != NULL); + keep = (strstr(buf, "keep") != NULL); - if (!strncmp(buf, "serial", 6)) { - early_serial_init(buf + 6); - early_console = &early_serial_console; - } else if (!strncmp(buf, "ttyS", 4)) { - early_serial_init(buf); - early_console = &early_serial_console; - } else if (!strncmp(buf, "vga", 3)) { + while (*buf != '\0') { + if (!strncmp(buf, "serial", 6)) { + buf += 6; + early_serial_init(buf); + early_console_register(&early_serial_console, keep); + if (!strncmp(buf, ",ttyS", 5)) + buf += 5; + } + if (!strncmp(buf, "ttyS", 4)) { + early_serial_init(buf + 4); + early_console_register(&early_serial_console, keep); + } #ifndef CONFIG_XEN - && boot_params.screen_info.orig_video_isVGA == 1) { - max_xpos = boot_params.screen_info.orig_video_cols; - max_ypos = boot_params.screen_info.orig_video_lines; - current_ypos = boot_params.screen_info.orig_y; + if (!strncmp(buf, "vga", 3) && + boot_params.screen_info.orig_video_isVGA == 1) { + max_xpos = boot_params.screen_info.orig_video_cols; + max_ypos = boot_params.screen_info.orig_video_lines; + current_ypos = boot_params.screen_info.orig_y; +#else + if (!strncmp(buf, "vga", 3) || !strncmp(buf, "xen", 3)) { #endif - early_console = &early_vga_console; + early_console_register(&early_vga_console, keep); + } #ifdef CONFIG_EARLY_PRINTK_DBGP - } else if (!strncmp(buf, "dbgp", 4)) { - if (early_dbgp_init(buf+4) < 0) - return 0; - early_console = &early_dbgp_console; - /* - * usb subsys will reset ehci controller, so don't keep - * that early console - */ - keep_early = 0; + if (!strncmp(buf, "dbgp", 4) && !early_dbgp_init(buf + 4)) + early_console_register(&early_dbgp_console, keep); #endif -#ifdef 
CONFIG_XEN - } else if (!strncmp(buf, "xen", 3)) { - early_console = &xenboot_console; +#ifdef CONFIG_HVC_XEN + if (!strncmp(buf, "xen", 3)) + early_console_register(&xenboot_console, keep); #endif + buf++; } - - if (keep_early) - early_console->flags &= ~CON_BOOT; - else - early_console->flags |= CON_BOOT; - register_console(early_console); return 0; } --- head-2010-05-25.orig/arch/x86/kernel/entry_64-xen.S 2010-03-24 15:25:21.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/entry_64-xen.S 2010-03-24 15:32:27.000000000 +0100 @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -150,7 +151,7 @@ ENTRY(ftrace_graph_caller) END(ftrace_graph_caller) GLOBAL(return_to_handler) - subq $80, %rsp + subq $24, %rsp /* Save the return values */ movq %rax, (%rsp) @@ -159,10 +160,10 @@ GLOBAL(return_to_handler) call ftrace_return_to_handler - movq %rax, 72(%rsp) + movq %rax, 16(%rsp) movq 8(%rsp), %rdx movq (%rsp), %rax - addq $72, %rsp + addq $16, %rsp retq #endif @@ -546,20 +547,13 @@ sysret_signal: bt $TIF_SYSCALL_AUDIT,%edx jc sysret_audit #endif - /* edx: work flags (arg3) */ - leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 - xorl %esi,%esi # oldset -> arg2 - SAVE_REST - FIXUP_TOP_OF_STACK %r11 - call do_notify_resume - RESTORE_TOP_OF_STACK %r11 - RESTORE_REST - movl $_TIF_WORK_MASK,%edi - /* Use IRET because user could have changed frame. This - works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - jmp int_with_check + /* + * We have a signal, or exit tracing or single-step. + * These all wind up with the iret return path anyway, + * so just join that path right now. + */ + FIXUP_TOP_OF_STACK %r11, -ARGOFFSET + jmp int_check_syscall_exit_work badsys: movq $-ENOSYS,RAX-ARGOFFSET(%rsp) @@ -668,6 +662,7 @@ int_careful: int_very_careful: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) +int_check_syscall_exit_work: SAVE_REST /* Check for syscall exit trace */ testl $_TIF_WORK_SYSCALL_EXIT,%edx @@ -914,7 +909,7 @@ apicinterrupt ERROR_APIC_VECTOR \ apicinterrupt SPURIOUS_APIC_VECTOR \ spurious_interrupt smp_spurious_interrupt -#ifdef CONFIG_PERF_COUNTERS +#ifdef CONFIG_PERF_EVENTS apicinterrupt LOCAL_PENDING_VECTOR \ perf_pending_interrupt smp_perf_pending_interrupt #endif --- head-2010-05-25.orig/arch/x86/kernel/head-xen.c 2010-04-28 17:07:13.000000000 +0200 +++ head-2010-05-25/arch/x86/kernel/head-xen.c 2010-04-15 10:10:51.000000000 +0200 @@ -59,7 +59,6 @@ void __init reserve_ebda_region(void) #include #include #include -#include #include #include @@ -164,7 +163,7 @@ void __init xen_start_kernel(void) } -void __init machine_specific_arch_setup(void) +void __init xen_arch_setup(void) { int ret; static const struct callback_register __initconst event = { --- head-2010-05-25.orig/arch/x86/kernel/head32-xen.c 2010-03-24 15:25:06.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/head32-xen.c 2010-03-24 15:32:27.000000000 +0100 @@ -9,11 +9,26 @@ #include #include -#include #include #include -#include +#include #include +#include +#include +#include + +static void __init i386_default_early_setup(void) +{ + /* Initialize 32bit specific setup functions */ + if (is_initial_xendomain()) + x86_init.resources.probe_roms = probe_roms; + x86_init.resources.reserve_resources = i386_reserve_resources; +#ifndef CONFIG_XEN + x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; + + reserve_ebda_region(); +#endif +} void __init i386_start_kernel(void) { @@ -31,7 +46,16 @@ void __init i386_start_kernel(void) 
reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); } #endif - reserve_ebda_region(); + + /* Call the subarch specific early setup function */ + switch (boot_params.hdr.hardware_subarch) { + case X86_SUBARCH_MRST: + x86_mrst_early_setup(); + break; + default: + i386_default_early_setup(); + break; + } #else { int max_cmdline; @@ -42,6 +66,7 @@ void __init i386_start_kernel(void) boot_command_line[max_cmdline-1] = '\0'; } + i386_default_early_setup(); xen_start_kernel(); #endif --- head-2010-05-25.orig/arch/x86/kernel/head64-xen.c 2010-03-24 15:25:06.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/head64-xen.c 2010-03-24 15:32:27.000000000 +0100 @@ -20,15 +20,14 @@ #include #include #include -#include #include #include #include #include #include #include -#include #include +#include #ifndef CONFIG_XEN static void __init zap_identity_mappings(void) --- head-2010-05-25.orig/arch/x86/kernel/head_32-xen.S 2010-03-24 15:25:21.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/head_32-xen.S 2010-03-24 15:32:27.000000000 +0100 @@ -30,7 +30,7 @@ #define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability #define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id -.section .text.head,"ax",@progbits +__HEAD #define VIRT_ENTRY_OFFSET 0x0 .org VIRT_ENTRY_OFFSET ENTRY(startup_32) @@ -69,7 +69,6 @@ ENTRY(startup_32) */ movl $per_cpu__gdt_page,%eax movl $per_cpu__stack_canary,%ecx - subl $20, %ecx movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) shrl $16, %ecx movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) @@ -122,7 +121,7 @@ ENTRY(hypercall_page) /* * BSS section */ -.section ".bss.page_aligned","wa" +__PAGE_ALIGNED_BSS .align PAGE_SIZE_asm ENTRY(swapper_pg_fixmap) .fill 1024,4,0 --- head-2010-05-25.orig/arch/x86/kernel/head_64-xen.S 2010-03-24 15:25:21.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/head_64-xen.S 2010-03-24 15:32:27.000000000 +0100 @@ -23,7 +23,7 @@ #include #include - .section .text.head, "ax", @progbits + __HEAD .code64 .globl startup_64 startup_64: @@ -51,7 +51,7 @@ startup_64: #define NEXT_PAGE(name) \ .balign PAGE_SIZE; \ - phys_##name = . - .text.head; \ + phys_##name = . 
- .head.text; \ ENTRY(name) NEXT_PAGE(init_level4_pgt) @@ -104,7 +104,7 @@ NEXT_PAGE(hypercall_page) #undef NEXT_PAGE - .section .bss.page_aligned, "aw", @nobits + __PAGE_ALIGNED_BSS .align PAGE_SIZE ENTRY(empty_zero_page) .skip PAGE_SIZE --- head-2010-05-25.orig/arch/x86/kernel/irq-xen.c 2010-03-24 15:25:21.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/irq-xen.c 2010-03-24 15:32:27.000000000 +0100 @@ -67,10 +67,10 @@ static int show_other_interrupts(struct for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); seq_printf(p, " Spurious interrupts\n"); - seq_printf(p, "%*s: ", prec, "CNT"); + seq_printf(p, "%*s: ", prec, "PMI"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); - seq_printf(p, " Performance counter interrupts\n"); + seq_printf(p, " Performance monitoring interrupts\n"); seq_printf(p, "%*s: ", prec, "PND"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); @@ -112,7 +112,7 @@ static int show_other_interrupts(struct seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); seq_printf(p, " Threshold APIC interrupts\n"); #endif -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE seq_printf(p, "%*s: ", prec, "MCE"); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); @@ -212,7 +212,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) #ifdef CONFIG_X86_MCE_THRESHOLD sum += irq_stats(cpu)->irq_threshold_count; #endif -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE sum += per_cpu(mce_exception_count, cpu); sum += per_cpu(mce_poll_count, cpu); #endif --- head-2010-05-25.orig/arch/x86/kernel/ldt-xen.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/ldt-xen.c 2010-03-24 15:32:27.000000000 +0100 @@ -70,8 +70,8 @@ static int alloc_ldt(mm_context_t *pc, i XENFEAT_writable_descriptor_tables); load_LDT(pc); #ifdef CONFIG_SMP - if (!cpus_equal(current->mm->cpu_vm_mask, - cpumask_of_cpu(smp_processor_id()))) + if (!cpumask_equal(mm_cpumask(current->mm), + cpumask_of(smp_processor_id()))) smp_call_function(flush_ldt, current->mm, 1); preempt_enable(); #endif --- head-2010-05-25.orig/arch/x86/kernel/microcode_core-xen.c 2010-03-24 15:25:21.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/microcode_core-xen.c 2010-03-24 15:32:27.000000000 +0100 @@ -97,8 +97,8 @@ static ssize_t microcode_write(struct fi { ssize_t ret = -EINVAL; - if ((len >> PAGE_SHIFT) > num_physpages) { - pr_err("microcode: too much data (max %ld pages)\n", num_physpages); + if ((len >> PAGE_SHIFT) > totalram_pages) { + pr_err("microcode: too much data (max %ld pages)\n", totalram_pages); return ret; } @@ -121,7 +121,7 @@ static const struct file_operations micr static struct miscdevice microcode_dev = { .minor = MICROCODE_MINOR, .name = "microcode", - .devnode = "cpu/microcode", + .nodename = "cpu/microcode", .fops = &microcode_fops, }; --- head-2010-05-25.orig/arch/x86/kernel/mpparse-xen.c 2010-03-24 15:25:21.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/mpparse-xen.c 2010-03-24 15:32:27.000000000 +0100 @@ -51,6 +51,13 @@ static int __init mpf_checksum(unsigned return sum & 0xFF; } +#ifndef CONFIG_XEN +int __init default_mpc_apic_id(struct mpc_cpu *m) +{ + return m->apicid; +} +#endif + static void __init MP_processor_info(struct mpc_cpu *m) { #ifndef CONFIG_XEN @@ -62,10 +69,7 @@ static void __init MP_processor_info(str return; } - if (x86_quirks->mpc_apic_id) - apicid = x86_quirks->mpc_apic_id(m); - else - apicid = m->apicid; + apicid = x86_init.mpparse.mpc_apic_id(m); if (m->cpuflag &
CPU_BOOTPROCESSOR) { bootup_cpu = " (Bootup-CPU)"; @@ -80,16 +84,18 @@ static void __init MP_processor_info(str } #ifdef CONFIG_X86_IO_APIC -static void __init MP_bus_info(struct mpc_bus *m) +void __init default_mpc_oem_bus_info(struct mpc_bus *m, char *str) { - char str[7]; memcpy(str, m->bustype, 6); str[6] = 0; + apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str); +} - if (x86_quirks->mpc_oem_bus_info) - x86_quirks->mpc_oem_bus_info(m, str); - else - apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str); +static void __init MP_bus_info(struct mpc_bus *m) +{ + char str[7]; + + x86_init.mpparse.mpc_oem_bus_info(m, str); #if MAX_MP_BUSSES < 256 if (m->busid >= MAX_MP_BUSSES) { @@ -106,8 +112,8 @@ static void __init MP_bus_info(struct mp mp_bus_id_to_type[m->busid] = MP_BUS_ISA; #endif } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { - if (x86_quirks->mpc_oem_pci_bus) - x86_quirks->mpc_oem_pci_bus(m); + if (x86_init.mpparse.mpc_oem_pci_bus) + x86_init.mpparse.mpc_oem_pci_bus(m); clear_bit(m->busid, mp_bus_not_pci); #if defined(CONFIG_EISA) || defined(CONFIG_MCA) @@ -301,6 +307,8 @@ static void __init smp_dump_mptable(stru 1, mpc, mpc->length, 1); } +void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } + static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) { char str[16]; @@ -322,16 +330,13 @@ static int __init smp_read_mpc(struct mp if (early) return 1; - if (mpc->oemptr && x86_quirks->smp_read_mpc_oem) { - struct mpc_oemtable *oem_table = (void *)(long)mpc->oemptr; - x86_quirks->smp_read_mpc_oem(oem_table, mpc->oemsize); - } + if (mpc->oemptr) + x86_init.mpparse.smp_read_mpc_oem(mpc); /* * Now process the configuration blocks. */ - if (x86_quirks->mpc_record) - *x86_quirks->mpc_record = 0; + x86_init.mpparse.mpc_record(0); while (count < mpc->length) { switch (*mpt) { @@ -363,8 +368,7 @@ static int __init smp_read_mpc(struct mp count = mpc->length; break; } - if (x86_quirks->mpc_record) - (*x86_quirks->mpc_record)++; + x86_init.mpparse.mpc_record(1); } #ifdef CONFIG_X86_BIGSMP @@ -492,11 +496,11 @@ static void __init construct_ioapic_tabl MP_bus_info(&bus); } - ioapic.type = MP_IOAPIC; - ioapic.apicid = 2; - ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01; - ioapic.flags = MPC_APIC_USABLE; - ioapic.apicaddr = 0xFEC00000; + ioapic.type = MP_IOAPIC; + ioapic.apicid = 2; + ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01; + ioapic.flags = MPC_APIC_USABLE; + ioapic.apicaddr = IO_APIC_DEFAULT_PHYS_BASE; MP_ioapic_info(&ioapic); /* @@ -618,7 +622,7 @@ static int __init check_physptr(struct m /* * Scan the memory blocks for an SMP configuration block. 
*/ -static void __init __get_smp_config(unsigned int early) +void __init default_get_smp_config(unsigned int early) { struct mpf_intel *mpf = mpf_found; @@ -635,11 +639,6 @@ static void __init __get_smp_config(unsi if (acpi_lapic && acpi_ioapic) return; - if (x86_quirks->mach_get_smp_config) { - if (x86_quirks->mach_get_smp_config(early)) - return; - } - printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->specification); #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) && !defined(CONFIG_XEN) @@ -680,16 +679,6 @@ static void __init __get_smp_config(unsi */ } -void __init early_get_smp_config(void) -{ - __get_smp_config(1); -} - -void __init get_smp_config(void) -{ - __get_smp_config(0); -} - #ifndef CONFIG_XEN static void __init smp_reserve_bootmem(struct mpf_intel *mpf) { @@ -761,16 +750,12 @@ static int __init smp_scan_config(unsign return 0; } -static void __init __find_smp_config(unsigned int reserve) +void __init default_find_smp_config(unsigned int reserve) { #ifndef CONFIG_XEN unsigned int address; #endif - if (x86_quirks->mach_find_smp_config) { - if (x86_quirks->mach_find_smp_config(reserve)) - return; - } /* * FIXME: Linux assumes you have 640K of base ram.. * this continues the error... @@ -807,16 +792,6 @@ static void __init __find_smp_config(uns #endif } -void __init early_find_smp_config(void) -{ - __find_smp_config(0); -} - -void __init find_smp_config(void) -{ - __find_smp_config(1); -} - #ifdef CONFIG_X86_IO_APIC static u8 __initdata irq_used[MAX_IRQ_SOURCES]; --- head-2010-05-25.orig/arch/x86/kernel/pci-dma-xen.c 2010-03-24 15:25:21.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/pci-dma-xen.c 2010-03-24 15:32:27.000000000 +0100 @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -32,17 +33,22 @@ int no_iommu __read_mostly; /* Set this to 1 if there is a HW IOMMU in the system */ int iommu_detected __read_mostly = 0; -int iommu_pass_through; +/* + * This variable becomes 1 if iommu=pt is passed on the kernel command line. + * If this variable is 1, IOMMU implementations do no DMA translation for + * devices and allow every device to access to whole physical memory. This is + * useful if a user want to use an IOMMU only for KVM device assignment to + * guests and not for driver dma translation. + */ +int iommu_pass_through __read_mostly; dma_addr_t bad_dma_address __read_mostly = 0; EXPORT_SYMBOL(bad_dma_address); -/* Dummy device used for NULL arguments (normally ISA). Better would - be probably a smaller DMA mask, but this is bug-to-bug compatible - to older i386. */ +/* Dummy device used for NULL arguments (normally ISA). */ struct device x86_dma_fallback_dev = { .init_name = "fallback device", - .coherent_dma_mask = DMA_BIT_MASK(32), + .coherent_dma_mask = ISA_DMA_BIT_MASK, .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask, }; EXPORT_SYMBOL(x86_dma_fallback_dev); @@ -88,6 +94,11 @@ void __init dma32_reserve_bootmem(void) size = roundup(dma32_bootmem_size, align); dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, 512ULL<<20); + /* + * Kmemleak should not scan this block as it may not be mapped via the + * kernel direct mapping. 
+ */ + kmemleak_ignore(dma32_bootmem_ptr); if (dma32_bootmem_ptr) dma32_bootmem_size = size; else @@ -178,7 +189,7 @@ again: #ifndef CONFIG_XEN addr = page_to_phys(page); - if (!is_buffer_dma_capable(dma_mask, addr, size)) { + if (addr + size > dma_mask) { __free_pages(page, order); if (dma_mask < DMA_BIT_MASK(32) && !(flag & GFP_DMA)) { @@ -266,10 +277,8 @@ static __init int iommu_setup(char *p) if (!strncmp(p, "soft", 4)) swiotlb = 1; #endif - if (!strncmp(p, "pt", 2)) { + if (!strncmp(p, "pt", 2)) iommu_pass_through = 1; - return 1; - } gart_parse_options(p); @@ -381,7 +390,7 @@ void pci_iommu_shutdown(void) amd_iommu_shutdown(); } /* Must execute after PCI subsystem */ -fs_initcall(pci_iommu_init); +rootfs_initcall(pci_iommu_init); #ifdef CONFIG_PCI /* Many VIA bridges seem to corrupt data for DAC. Disable it here */ --- head-2010-05-25.orig/arch/x86/kernel/pci-nommu-xen.c 2010-03-24 15:25:06.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/pci-nommu-xen.c 2010-03-24 15:32:27.000000000 +0100 @@ -36,7 +36,7 @@ gnttab_map_sg(struct device *hwdev, stru sg->dma_address = gnttab_dma_map_page(sg_page(sg)) + sg->offset; sg->dma_length = sg->length; - IOMMU_BUG_ON(address_needs_mapping( + IOMMU_BUG_ON(!dma_capable( hwdev, sg->dma_address, sg->length)); IOMMU_BUG_ON(range_straddles_page_boundary( page_to_pseudophys(sg_page(sg)) + sg->offset, @@ -69,7 +69,7 @@ gnttab_map_page(struct device *dev, stru dma = gnttab_dma_map_page(page) + offset; IOMMU_BUG_ON(range_straddles_page_boundary(page_to_pseudophys(page) + offset, size)); - IOMMU_BUG_ON(address_needs_mapping(dev, dma, size)); + IOMMU_BUG_ON(!dma_capable(dev, dma, size)); return dma; } @@ -81,19 +81,36 @@ gnttab_unmap_page(struct device *dev, dm gnttab_dma_unmap_page(dma_addr); } +static void nommu_sync_single_for_device(struct device *dev, + dma_addr_t addr, size_t size, + enum dma_data_direction dir) +{ + flush_write_buffers(); +} + + +static void nommu_sync_sg_for_device(struct device *dev, + struct scatterlist *sg, int nelems, + enum dma_data_direction dir) +{ + flush_write_buffers(); +} + static int nommu_dma_supported(struct device *hwdev, u64 mask) { return 1; } struct dma_map_ops nommu_dma_ops = { - .alloc_coherent = dma_generic_alloc_coherent, - .free_coherent = dma_generic_free_coherent, - .map_page = gnttab_map_page, - .unmap_page = gnttab_unmap_page, - .map_sg = gnttab_map_sg, - .unmap_sg = gnttab_unmap_sg, - .dma_supported = nommu_dma_supported, + .alloc_coherent = dma_generic_alloc_coherent, + .free_coherent = dma_generic_free_coherent, + .map_page = gnttab_map_page, + .unmap_page = gnttab_unmap_page, + .map_sg = gnttab_map_sg, + .unmap_sg = gnttab_unmap_sg, + .sync_single_for_device = nommu_sync_single_for_device, + .sync_sg_for_device = nommu_sync_sg_for_device, + .dma_supported = nommu_dma_supported, }; void __init no_iommu_init(void) --- head-2010-05-25.orig/arch/x86/kernel/process-xen.c 2010-03-24 15:25:21.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/process-xen.c 2010-03-24 15:32:27.000000000 +0100 @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include @@ -26,9 +26,6 @@ EXPORT_SYMBOL(idle_nomwait); struct kmem_cache *task_xstate_cachep; -DEFINE_TRACE(power_start); -DEFINE_TRACE(power_end); - int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { *dst = *src; @@ -285,9 +282,7 @@ static inline int hlt_use_halt(void) */ void xen_idle(void) { - struct power_trace it; - - trace_power_start(&it, POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1); 
current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we * @@ -300,7 +295,6 @@ void xen_idle(void) else local_irq_enable(); current_thread_info()->status |= TS_POLLING; - trace_power_end(&it); } #ifdef CONFIG_APM_MODULE EXPORT_SYMBOL(default_idle); #endif @@ -354,9 +348,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { - struct power_trace it; - - trace_power_start(&it, POWER_CSTATE, (ax>>4)+1); + trace_power_start(POWER_CSTATE, (ax>>4)+1); if (!need_resched()) { if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)&current_thread_info()->flags); @@ -366,15 +358,13 @@ void mwait_idle_with_hints(unsigned long if (!need_resched()) __mwait(ax, cx); } - trace_power_end(&it); } /* Default MONITOR/MWAIT with no hints, used for default C1 state */ static void mwait_idle(void) { - struct power_trace it; if (!need_resched()) { - trace_power_start(&it, POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1); if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)&current_thread_info()->flags); @@ -384,7 +374,6 @@ static void mwait_idle(void) __sti_mwait(0, 0); else local_irq_enable(); - trace_power_end(&it); } else local_irq_enable(); } @@ -397,13 +386,11 @@ static void mwait_idle(void) */ static void poll_idle(void) { - struct power_trace it; - - trace_power_start(&it, POWER_CSTATE, 0); + trace_power_start(POWER_CSTATE, 0); local_irq_enable(); while (!need_resched()) cpu_relax(); - trace_power_end(&it); + trace_power_end(0); } #ifndef CONFIG_XEN @@ -556,10 +543,8 @@ void __init init_c1e_mask(void) { #ifndef CONFIG_XEN /* If we're using c1e_idle, we need to allocate c1e_mask. */ - if (pm_idle == c1e_idle) { - alloc_cpumask_var(&c1e_mask, GFP_KERNEL); - cpumask_clear(c1e_mask); - } + if (pm_idle == c1e_idle) + zalloc_cpumask_var(&c1e_mask, GFP_KERNEL); #endif } --- head-2010-05-25.orig/arch/x86/kernel/process_32-xen.c 2010-03-24 15:25:21.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/process_32-xen.c 2010-03-24 15:32:27.000000000 +0100 @@ -66,9 +66,6 @@ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork"); -DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; -EXPORT_PER_CPU_SYMBOL(current_task); - /* * Return saved PC of a blocked thread. */ @@ -360,6 +357,7 @@ __switch_to(struct task_struct *prev_p, #ifndef CONFIG_X86_NO_TSS struct tss_struct *tss = &per_cpu(init_tss, cpu); #endif + bool preload_fpu; #if CONFIG_XEN_COMPAT > 0x030002 struct physdev_set_iopl iopl_op; struct physdev_set_iobitmap iobmp_op; @@ -373,15 +371,24 @@ __switch_to(struct task_struct *prev_p, /* XEN NOTE: FS/GS saved in switch_mm(), not here. */ /* + * If the task has used fpu the last 5 timeslices, just do a full + * restore of the math state immediately to avoid the trap; the + * chances of needing FPU soon are obviously high now + */ + preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; + + /* * This is basically '__unlazy_fpu', except that we queue a * multicall to indicate FPU task switch, rather than * synchronously trapping to Xen.
*/ if (task_thread_info(prev_p)->status & TS_USEDFPU) { __save_init_fpu(prev_p); /* _not_ save_init_fpu() */ - mcl->op = __HYPERVISOR_fpu_taskswitch; - mcl->args[0] = 1; - mcl++; + if (!preload_fpu) { + mcl->op = __HYPERVISOR_fpu_taskswitch; + mcl->args[0] = 1; + mcl++; + } } #if 0 /* lazy fpu sanity check */ else BUG_ON(!(read_cr0() & 8)); @@ -427,6 +434,14 @@ __switch_to(struct task_struct *prev_p, mcl++; } + /* If we're going to preload the fpu context, make sure clts + is run while we're batching the cpu state updates. */ + if (preload_fpu) { + mcl->op = __HYPERVISOR_fpu_taskswitch; + mcl->args[0] = 0; + mcl++; + } + if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { set_xen_guest_handle(iobmp_op.bitmap, (char *)next->io_bitmap_ptr); @@ -451,7 +466,7 @@ __switch_to(struct task_struct *prev_p, BUG(); /* we're going to use this soon, after a few expensive things */ - if (next_p->fpu_counter > 5) + if (preload_fpu) prefetch(next->xstate); /* @@ -470,15 +485,8 @@ __switch_to(struct task_struct *prev_p, */ arch_end_context_switch(next_p); - /* If the task has used fpu the last 5 timeslices, just do a full - * restore of the math state immediately to avoid the trap; the - * chances of needing FPU soon are obviously high now - * - * tsk_used_math() checks prevent calling math_state_restore(), - * which can sleep in the case of !tsk_used_math() - */ - if (tsk_used_math(next_p) && next_p->fpu_counter > 5) - math_state_restore(); + if (preload_fpu) + __math_state_restore(); /* * Restore %gs if needed (which is common) --- head-2010-05-25.orig/arch/x86/kernel/process_64-xen.c 2010-03-24 15:25:21.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/process_64-xen.c 2010-03-24 15:32:27.000000000 +0100 @@ -64,9 +64,6 @@ asmlinkage extern void ret_from_fork(void); -DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; -EXPORT_PER_CPU_SYMBOL(current_task); - static DEFINE_PER_CPU(unsigned char, is_idle); unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; @@ -399,6 +396,7 @@ __switch_to(struct task_struct *prev_p, #ifndef CONFIG_X86_NO_TSS struct tss_struct *tss = &per_cpu(init_tss, cpu); #endif + bool preload_fpu; #if CONFIG_XEN_COMPAT > 0x030002 struct physdev_set_iopl iopl_op; struct physdev_set_iobitmap iobmp_op; @@ -409,8 +407,15 @@ __switch_to(struct task_struct *prev_p, #endif multicall_entry_t _mcl[8], *mcl = _mcl; + /* + * If the task has used fpu the last 5 timeslices, just do a full + * restore of the math state immediately to avoid the trap; the + * chances of needing FPU soon are obviously high now + */ + preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; + /* we're going to use this soon, after a few expensive things */ - if (next_p->fpu_counter > 5) + if (preload_fpu) prefetch(next->xstate); /* @@ -422,12 +427,21 @@ __switch_to(struct task_struct *prev_p, */ if (task_thread_info(prev_p)->status & TS_USEDFPU) { __save_init_fpu(prev_p); /* _not_ save_init_fpu() */ - mcl->op = __HYPERVISOR_fpu_taskswitch; - mcl->args[0] = 1; - mcl++; + if (!preload_fpu) { + mcl->op = __HYPERVISOR_fpu_taskswitch; + mcl->args[0] = 1; + mcl++; + } } else prev_p->fpu_counter = 0; + /* Make sure cpu is ready for new context */ + if (preload_fpu) { + mcl->op = __HYPERVISOR_fpu_taskswitch; + mcl->args[0] = 0; + mcl++; + } + /* * Reload sp0. * This is load_sp0(tss, next) with a multicall. 
@@ -545,15 +559,12 @@ __switch_to(struct task_struct *prev_p, task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) __switch_to_xtra(prev_p, next_p); - /* If the task has used fpu the last 5 timeslices, just do a full - * restore of the math state immediately to avoid the trap; the - * chances of needing FPU soon are obviously high now - * - * tsk_used_math() checks prevent calling math_state_restore(), - * which can sleep in the case of !tsk_used_math() + /* + * Preload the FPU context, now that we've determined that the + * task is likely to be using it. */ - if (tsk_used_math(next_p) && next_p->fpu_counter > 5) - math_state_restore(); + if (preload_fpu) + __math_state_restore(); return prev_p; } --- head-2010-05-25.orig/arch/x86/kernel/quirks-xen.c 2010-03-24 15:25:21.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/quirks-xen.c 2010-03-24 15:32:27.000000000 +0100 @@ -509,7 +509,7 @@ static void __init quirk_amd_nb_node(str pci_read_config_dword(nb_ht, 0x60, &val); set_dev_node(&dev->dev, val & 7); - pci_dev_put(dev); + pci_dev_put(nb_ht); } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, --- head-2010-05-25.orig/arch/x86/kernel/rtc.c 2010-03-24 15:10:37.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/rtc.c 2010-03-24 15:32:27.000000000 +0100 @@ -189,8 +189,10 @@ void read_persistent_clock(struct timesp unsigned long retval, flags; #ifdef CONFIG_XEN - if (!is_initial_xendomain()) - return xen_read_persistent_clock(); + if (!is_initial_xendomain()) { + xen_read_persistent_clock(ts); + return; + } #endif spin_lock_irqsave(&rtc_lock, flags); retval = x86_platform.get_wallclock(); --- head-2010-05-25.orig/arch/x86/kernel/setup-xen.c 2010-03-24 15:25:21.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/setup-xen.c 2010-03-24 15:32:27.000000000 +0100 @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -66,6 +67,7 @@ #include #include +#include #include