From: Linux Kernel Mailing List Subject: Linux: 2.6.31 Patch-mainline: 2.6.31 This patch contains the differences between 2.6.30 and 2.6.31. Acked-by: Jeff Mahoney Automatically created from "patches.kernel.org/patch-2.6.31" by xen-port-patches.py --- head-2011-03-17.orig/arch/x86/Kconfig 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/Kconfig 2011-02-01 14:50:44.000000000 +0100 @@ -21,7 +21,7 @@ config X86 select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE select HAVE_OPROFILE - select HAVE_PERF_EVENTS + select HAVE_PERF_EVENTS if !XEN select HAVE_IRQ_WORK select HAVE_IOREMAP_PROT select HAVE_KPROBES @@ -879,7 +879,7 @@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS config X86_MCE bool "Machine Check / overheating reporting" - depends on !X86_XEN && !XEN_UNPRIVILEGED_GUEST + depends on !XEN_UNPRIVILEGED_GUEST ---help--- Machine Check support allows the processor to notify the kernel if it detects a problem (e.g. overheating, data corruption). @@ -912,7 +912,7 @@ config X86_MCE_AMD config X86_ANCIENT_MCE bool "Support for old Pentium 5 / WinChip machine checks" - depends on X86_32 && X86_MCE + depends on X86_32 && X86_MCE && !XEN ---help--- Include support for machine check handling on old Pentium 5 or WinChip systems. These typically need to be enabled explicitely on the command @@ -1609,6 +1609,7 @@ config KEXEC_JUMP config PHYSICAL_START hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP || XEN) + default 0x100000 if XEN default "0x1000000" ---help--- This gives the physical address where the kernel is loaded. 
--- head-2011-03-17.orig/arch/x86/ia32/ia32entry-xen.S 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/ia32/ia32entry-xen.S 2011-02-01 14:50:44.000000000 +0100 @@ -770,9 +770,11 @@ ia32_sys_call_table: .quad compat_sys_signalfd4 .quad sys_eventfd2 .quad sys_epoll_create1 - .quad sys_dup3 /* 330 */ + .quad sys_dup3 /* 330 */ .quad sys_pipe2 .quad sys_inotify_init1 .quad compat_sys_preadv .quad compat_sys_pwritev + .quad compat_sys_rt_tgsigqueueinfo /* 335 */ + .quad sys_perf_counter_open ia32_syscall_end: --- head-2011-03-17.orig/arch/x86/include/asm/hw_irq.h 2011-02-01 14:42:26.000000000 +0100 +++ head-2011-03-17/arch/x86/include/asm/hw_irq.h 2011-02-01 14:50:44.000000000 +0100 @@ -142,6 +142,7 @@ extern asmlinkage void smp_invalidate_in extern irqreturn_t smp_reschedule_interrupt(int, void *); extern irqreturn_t smp_call_function_interrupt(int, void *); extern irqreturn_t smp_call_function_single_interrupt(int, void *); +extern irqreturn_t smp_reboot_interrupt(int, void *); #endif #endif --- head-2011-03-17.orig/arch/x86/include/asm/required-features.h 2011-03-17 14:35:44.000000000 +0100 +++ head-2011-03-17/arch/x86/include/asm/required-features.h 2011-02-01 14:50:44.000000000 +0100 @@ -48,7 +48,7 @@ #endif #ifdef CONFIG_X86_64 -#ifdef CONFIG_PARAVIRT +#if defined(CONFIG_PARAVIRT) || defined(CONFIG_XEN) /* Paravirtualized systems may not have PSE or PGE available */ #define NEED_PSE 0 #define NEED_PGE 0 --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/agp.h 2011-02-01 14:39:24.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/agp.h 2011-02-01 14:50:44.000000000 +0100 @@ -48,6 +48,7 @@ /* Convert a physical address to an address suitable for the GART. */ #define phys_to_gart(x) phys_to_machine(x) #define gart_to_phys(x) machine_to_phys(x) +#define page_to_gart(x) phys_to_gart(page_to_pseudophys(x)) /* GATT allocation. Returns/accepts GATT kernel virtual address. 
*/ #define alloc_gatt_pages(order) ({ \ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/desc.h 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/desc.h 2011-02-01 14:50:44.000000000 +0100 @@ -1,7 +1,6 @@ #ifndef _ASM_X86_DESC_H #define _ASM_X86_DESC_H -#ifndef __ASSEMBLY__ #include #include #include @@ -406,29 +405,4 @@ static inline void set_system_intr_gate_ } #endif -#else -/* - * GET_DESC_BASE reads the descriptor base of the specified segment. - * - * Args: - * idx - descriptor index - * gdt - GDT pointer - * base - 32bit register to which the base will be written - * lo_w - lo word of the "base" register - * lo_b - lo byte of the "base" register - * hi_b - hi byte of the low word of the "base" register - * - * Example: - * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah) - * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax. - */ -#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \ - movb idx * 8 + 4(gdt), lo_b; \ - movb idx * 8 + 7(gdt), hi_b; \ - shll $16, base; \ - movw idx * 8 + 2(gdt), lo_w; - - -#endif /* __ASSEMBLY__ */ - #endif /* _ASM_X86_DESC_H */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/fixmap.h 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/fixmap.h 2011-02-01 14:50:44.000000000 +0100 @@ -118,12 +118,9 @@ enum fixed_addresses { #ifdef CONFIG_PARAVIRT FIX_PARAVIRT_BOOTMAP, #endif - FIX_TEXT_POKE0, /* reserve 2 pages for text_poke() */ - FIX_TEXT_POKE1, + FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */ + FIX_TEXT_POKE0, /* first page is last, because allocation is backward */ __end_of_permanent_fixed_addresses, -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT - FIX_OHCI1394_BASE, -#endif /* * 256 temporary boot-time mappings, used by early_ioremap(), * before ioremap() is functional. 
@@ -136,6 +133,9 @@ enum fixed_addresses { FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 - (__end_of_permanent_fixed_addresses & 255), FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1, +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT + FIX_OHCI1394_BASE, +#endif #ifdef CONFIG_X86_32 FIX_WP_TEST, #endif --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/hypercall.h 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/hypercall.h 2011-02-01 14:50:44.000000000 +0100 @@ -265,7 +265,7 @@ HYPERVISOR_memory_op( unsigned int cmd, void *arg) { if (arch_use_lazy_mmu_mode()) - xen_multicall_flush(false); + xen_multicall_flush(); return _hypercall2(int, memory_op, cmd, arg); } @@ -336,7 +336,7 @@ HYPERVISOR_grant_table_op( int rc; if (arch_use_lazy_mmu_mode()) - xen_multicall_flush(false); + xen_multicall_flush(); #ifdef GNTTABOP_map_grant_ref if (cmd == GNTTABOP_map_grant_ref) #endif --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/hypervisor.h 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/hypervisor.h 2011-03-11 11:13:19.000000000 +0100 @@ -144,7 +144,7 @@ void scrub_pages(void *, unsigned int); DECLARE_PER_CPU(bool, xen_lazy_mmu); -void xen_multicall_flush(bool); +void xen_multicall_flush(void); int __must_check xen_multi_update_va_mapping(unsigned long va, pte_t, unsigned long flags); @@ -162,7 +162,7 @@ static inline void arch_enter_lazy_mmu_m static inline void arch_leave_lazy_mmu_mode(void) { percpu_write(xen_lazy_mmu, false); - xen_multicall_flush(false); + xen_multicall_flush(); } #define arch_use_lazy_mmu_mode() unlikely(percpu_read(xen_lazy_mmu)) @@ -176,13 +176,13 @@ static inline void arch_leave_lazy_mmu_m static inline void arch_flush_lazy_mmu_mode(void) { if (arch_use_lazy_mmu_mode()) - xen_multicall_flush(false); + xen_multicall_flush(); } #endif #else /* !CONFIG_XEN || MODULE */ -static inline void xen_multicall_flush(bool ignore) {} +static inline void 
xen_multicall_flush(void) {} #define arch_use_lazy_mmu_mode() false #define xen_multi_update_va_mapping(...) ({ BUG(); -ENOSYS; }) #define xen_multi_mmu_update(...) ({ BUG(); -ENOSYS; }) @@ -356,4 +356,9 @@ MULTI_grant_table_op(multicall_entry_t * #define uvm_multi(cpumask) ((unsigned long)cpus_addr(cpumask) | UVMF_MULTI) +#ifdef LINUX +/* drivers/staging/ use Windows-style types, including VOID */ +#undef VOID +#endif + #endif /* __HYPERVISOR_H__ */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/irq_vectors.h 2011-02-15 17:33:07.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/irq_vectors.h 2011-02-15 17:33:45.000000000 +0100 @@ -1,8 +1,11 @@ #ifndef _ASM_X86_IRQ_VECTORS_H #define _ASM_X86_IRQ_VECTORS_H +#define MCE_VECTOR 0x12 + #ifdef CONFIG_X86_32 # define SYSCALL_VECTOR 0x80 +# define IA32_SYSCALL_VECTOR 0x80 #else # define IA32_SYSCALL_VECTOR 0x80 #endif @@ -11,7 +14,8 @@ #define CALL_FUNCTION_VECTOR 1 #define NMI_VECTOR 0x02 #define CALL_FUNC_SINGLE_VECTOR 3 -#define NR_IPIS 4 +#define REBOOT_VECTOR 4 +#define NR_IPIS 5 /* * The maximum number of vectors supported by i386 processors --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/pci.h 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/pci.h 2011-02-01 14:50:44.000000000 +0100 @@ -97,7 +97,8 @@ extern void pci_iommu_alloc(void); #define PCI_DMA_BUS_IS_PHYS 0 -#if defined(CONFIG_X86_64) || defined(CONFIG_DMA_API_DEBUG) || defined(CONFIG_SWIOTLB) +#if defined(CONFIG_X86_64) || defined(CONFIG_DMAR) || defined(CONFIG_DMA_API_DEBUG) \ + || defined(CONFIG_SWIOTLB) #define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \ dma_addr_t ADDR_NAME; @@ -136,6 +137,7 @@ extern void pci_iommu_alloc(void); /* generic pci stuff */ #include +#define PCIBIOS_MAX_MEM_32 0xffffffff #ifdef CONFIG_NUMA /* Returns the node based on pci bus */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/pgalloc.h 2011-02-01 14:39:24.000000000 +0100 +++ 
head-2011-03-17/arch/x86/include/mach-xen/asm/pgalloc.h 2011-02-01 14:50:44.000000000 +0100 @@ -51,7 +51,13 @@ static inline void pte_free(struct mm_st __pte_free(pte); } -extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte); +extern void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte); + +static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte, + unsigned long address) +{ + ___pte_free_tlb(tlb, pte); +} static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) @@ -92,7 +98,13 @@ static inline void pmd_free(struct mm_st __pmd_free(virt_to_page(pmd)); } -extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); +extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); + +static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long adddress) +{ + ___pmd_free_tlb(tlb, pmd); +} #ifdef CONFIG_X86_PAE extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd); @@ -145,7 +157,14 @@ static inline void pud_free(struct mm_st __pmd_free(virt_to_page(pud)); } -extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud); +extern void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud); + +static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, + unsigned long address) +{ + ___pud_free_tlb(tlb, pud); +} + #endif /* PAGETABLE_LEVELS > 3 */ #endif /* PAGETABLE_LEVELS > 2 */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/pgtable.h 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/pgtable.h 2011-02-01 14:50:44.000000000 +0100 @@ -2,6 +2,7 @@ #define _ASM_X86_PGTABLE_H #include +#include #include @@ -78,6 +79,8 @@ static inline void __init paravirt_paget #define pte_val(x) xen_pte_val(x) #define __pte(x) xen_make_pte(x) +#define arch_end_context_switch(prev) do {} while(0) + /* * The following only work if pte_present() is true. * Undefined behaviour if not.. 
@@ -264,10 +267,17 @@ static inline pgprot_t pgprot_modify(pgp #define canon_pgprot(p) __pgprot(massage_pgprot(p)) -static inline int is_new_memtype_allowed(unsigned long flags, - unsigned long new_flags) +static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, + unsigned long flags, + unsigned long new_flags) { /* + * PAT type is always WB for ISA. So no need to check. + */ + if (is_ISA_range(paddr, paddr + size - 1)) + return 1; + + /* * Certain new memtypes are not allowed with certain * requested memtype: * - request is uncached, return cannot be write-back @@ -312,6 +322,11 @@ static inline int pte_present(pte_t a) return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); } +static inline int pte_hidden(pte_t pte) +{ + return pte_flags(pte) & _PAGE_HIDDEN; +} + static inline int pmd_present(pmd_t pmd) { #if CONFIG_XEN_COMPAT <= 0x030002 @@ -511,6 +526,8 @@ static inline int pgd_none(pgd_t pgd) #ifndef __ASSEMBLY__ +#define direct_gbpages 0 + /* local pte updates need not use xchg for locking */ static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res) { --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/pgtable_32.h 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/pgtable_32.h 2011-02-01 14:50:44.000000000 +0100 @@ -48,13 +48,17 @@ extern void set_pmd_pfn(unsigned long, u #endif #if defined(CONFIG_HIGHPTE) +#define __KM_PTE \ + (in_nmi() ? KM_NMI_PTE : \ + in_irq() ? 
KM_IRQ_PTE : \ + KM_PTE0) #define pte_offset_map(dir, address) \ - ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE0) + \ + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), __KM_PTE) + \ pte_index((address))) #define pte_offset_map_nested(dir, address) \ ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \ pte_index((address))) -#define pte_unmap(pte) kunmap_atomic((pte), KM_PTE0) +#define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE) #define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1) #else #define pte_offset_map(dir, address) \ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/pgtable_64.h 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/pgtable_64.h 2011-02-01 14:50:44.000000000 +0100 @@ -33,10 +33,6 @@ extern pgd_t init_level4_pgt[]; extern void paging_init(void); -#endif /* !__ASSEMBLY__ */ - -#ifndef __ASSEMBLY__ - #define pte_ERROR(e) \ printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", \ __FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e)) @@ -142,8 +138,6 @@ static inline int pgd_large(pgd_t pgd) { #define update_mmu_cache(vma, address, pte) do { } while (0) -#define direct_gbpages 0 - /* Encode and de-code a swap entry */ #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) @@ -178,10 +172,7 @@ extern void cleanup_highmap(void); /* fs/proc/kcore.c */ #define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK) -#define kc_offset_to_vaddr(o) \ - (((o) & (1UL << (__VIRTUAL_MASK_SHIFT - 1))) \ - ? 
((o) | ~__VIRTUAL_MASK) \ - : (o)) +#define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK) #define __HAVE_ARCH_PTE_SAME #endif /* !__ASSEMBLY__ */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/pgtable_64_types.h 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/pgtable_64_types.h 2011-02-01 14:50:44.000000000 +0100 @@ -51,11 +51,12 @@ typedef union { pteval_t pte; unsigned i #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE - 1)) +/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ #define MAX_PHYSMEM_BITS 43 #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) -#define VMALLOC_START _AC(0xffffc20000000000, UL) -#define VMALLOC_END _AC(0xffffe1ffffffffff, UL) -#define VMEMMAP_START _AC(0xffffe20000000000, UL) +#define VMALLOC_START _AC(0xffffc90000000000, UL) +#define VMALLOC_END _AC(0xffffe8ffffffffff, UL) +#define VMEMMAP_START _AC(0xffffea0000000000, UL) #define MODULES_VADDR _AC(0xffffffffa0000000, UL) #define MODULES_END _AC(0xffffffffff000000, UL) #define MODULES_LEN (MODULES_END - MODULES_VADDR) --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/pgtable_types.h 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/pgtable_types.h 2011-02-01 14:50:44.000000000 +0100 @@ -18,7 +18,7 @@ #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ #define _PAGE_BIT_UNUSED1 9 /* available for programmer */ #define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */ -#define _PAGE_BIT_UNUSED3 11 +#define _PAGE_BIT_HIDDEN 11 /* hidden by kmemcheck */ #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ #define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 #define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 @@ -41,13 +41,18 @@ #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) #define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) #define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) -#define _PAGE_UNUSED3 (_AT(pteval_t, 
1) << _PAGE_BIT_UNUSED3) #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) #define __HAVE_ARCH_PTE_SPECIAL +#ifdef CONFIG_KMEMCHECK +#define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) +#else +#define _PAGE_HIDDEN (_AT(pteval_t, 0)) +#endif + #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) #else @@ -330,7 +335,6 @@ typedef struct page *pgtable_t; extern pteval_t __supported_pte_mask; extern int nx_enabled; -extern void set_nx(void); #define pgprot_writecombine pgprot_writecombine extern pgprot_t pgprot_writecombine(pgprot_t prot); --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/processor.h 2011-03-03 16:45:53.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/processor.h 2011-03-03 16:46:07.000000000 +0100 @@ -146,7 +146,8 @@ struct cpuinfo_x86 { extern struct cpuinfo_x86 boot_cpu_data; extern struct cpuinfo_x86 new_cpu_data; -extern __u32 cleared_cpu_caps[NCAPINTS]; +extern __u32 cpu_caps_cleared[NCAPINTS]; +extern __u32 cpu_caps_set[NCAPINTS]; #ifdef CONFIG_SMP DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); @@ -427,9 +428,6 @@ DECLARE_PER_CPU(unsigned long, stack_can extern unsigned int xstate_size; extern void free_thread_xstate(struct task_struct *); extern struct kmem_cache *task_xstate_cachep; -extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); -extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); -extern unsigned short num_cache_leaves; struct thread_struct { /* Cached TLS descriptors: */ @@ -444,8 +442,12 @@ struct thread_struct { unsigned short fsindex; unsigned short gsindex; #endif +#ifdef CONFIG_X86_32 unsigned long ip; +#endif +#ifdef CONFIG_X86_64 unsigned long fs; +#endif unsigned long gs; /* Hardware debugging registers: */ 
unsigned long debugreg0; @@ -474,14 +476,8 @@ struct thread_struct { unsigned io_bitmap_max; /* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */ unsigned long debugctlmsr; -#ifdef CONFIG_X86_DS -/* Debug Store context; see include/asm-x86/ds.h; goes into MSR_IA32_DS_AREA */ + /* Debug Store context; see asm/ds.h */ struct ds_context *ds_ctx; -#endif /* CONFIG_X86_DS */ -#ifdef CONFIG_X86_PTRACE_BTS -/* the signal to send on a bts buffer overflow */ - unsigned int bts_ovfl_signal; -#endif /* CONFIG_X86_PTRACE_BTS */ }; static inline unsigned long xen_get_debugreg(int regno) @@ -751,6 +747,21 @@ static inline unsigned long get_debugctl return debugctlmsr; } +static inline unsigned long get_debugctlmsr_on_cpu(int cpu) +{ + u64 debugctlmsr = 0; + u32 val1, val2; + +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return 0; +#endif + rdmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR, &val1, &val2); + debugctlmsr = val1 | ((u64)val2 << 32); + + return debugctlmsr; +} + static inline void update_debugctlmsr(unsigned long debugctlmsr) { #ifndef CONFIG_X86_DEBUGCTLMSR @@ -760,6 +771,18 @@ static inline void update_debugctlmsr(un wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); } +static inline void update_debugctlmsr_on_cpu(int cpu, + unsigned long debugctlmsr) +{ +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return; +#endif + wrmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR, + (u32)((u64)debugctlmsr), + (u32)((u64)debugctlmsr >> 32)); +} + /* * from system description table in BIOS. 
Mostly for MCA use, but * others may find it useful: @@ -770,6 +793,7 @@ extern unsigned int BIOS_revision; /* Boot loader type from the setup header: */ extern int bootloader_type; +extern int bootloader_version; extern char ignore_fpu_irq; @@ -830,7 +854,6 @@ static inline void spin_lock_prefetch(co .vm86_info = NULL, \ .sysenter_cs = __KERNEL_CS, \ .io_bitmap_ptr = NULL, \ - .fs = __KERNEL_PERCPU, \ } /* --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/smp.h 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/smp.h 2011-02-01 14:50:44.000000000 +0100 @@ -198,7 +198,7 @@ extern unsigned disabled_cpus __cpuinitd static inline int logical_smp_processor_id(void) { /* we don't want to mark this access volatile - bad code generation */ - return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR)); + return GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); } #endif --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/spinlock.h 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/spinlock.h 2011-02-01 14:50:44.000000000 +0100 @@ -432,4 +432,8 @@ static inline void __raw_write_unlock(ra #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() +/* The {read|write|spin}_lock() on x86 are full memory barriers. 
*/ +static inline void smp_mb__after_lock(void) { } +#define ARCH_HAS_SMP_MB_AFTER_LOCK + #endif /* _ASM_X86_SPINLOCK_H */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/tlbflush.h 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/tlbflush.h 2011-02-01 14:50:44.000000000 +0100 @@ -111,6 +111,6 @@ static inline void flush_tlb_kernel_rang flush_tlb_all(); } -extern void zap_low_mappings(void); +extern void zap_low_mappings(bool early); #endif /* _ASM_X86_TLBFLUSH_H */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/xor.h 2011-02-01 14:39:24.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/xor.h 2011-02-01 14:50:44.000000000 +0100 @@ -1,4 +1,7 @@ -#ifdef CONFIG_X86_32 +#ifdef CONFIG_KMEMCHECK +/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */ +# include +#elif defined(CONFIG_X86_32) # include "../../asm/xor_32.h" #else # include "xor_64.h" --- head-2011-03-17.orig/arch/x86/kernel/Makefile 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/Makefile 2011-02-01 14:50:44.000000000 +0100 @@ -128,6 +128,6 @@ ifeq ($(CONFIG_X86_64),y) endif disabled-obj-$(CONFIG_XEN) := %_uv.o crash.o early-quirks.o hpet.o i8253.o \ - i8259.o irqinit_$(BITS).o pci-swiotlb.o reboot.o smpboot.o tsc.o \ - tsc_sync.o uv_%.o vsmp_64.o + i8259.o irqinit.o pci-swiotlb.o reboot.o smpboot.o tsc.o tsc_sync.o \ + uv_%.o vsmp_64.o disabled-obj-$(CONFIG_XEN_UNPRIVILEGED_GUEST) += probe_roms_32.o --- head-2011-03-17.orig/arch/x86/kernel/acpi/sleep-xen.c 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/acpi/sleep-xen.c 2011-02-01 14:50:44.000000000 +0100 @@ -107,7 +107,7 @@ int acpi_save_state_mem(void) initial_gs = per_cpu_offset(smp_processor_id()); #endif initial_code = (unsigned long)wakeup_long64; - saved_magic = 0x123456789abcdef0; + saved_magic = 0x123456789abcdef0L; #endif /* CONFIG_64BIT */ #endif --- head-2011-03-17.orig/arch/x86/kernel/apic/io_apic-xen.c 2011-02-01 
14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/apic/io_apic-xen.c 2011-02-01 14:50:44.000000000 +0100 @@ -51,6 +51,7 @@ #include #include #include +#include #include @@ -135,12 +136,9 @@ struct irq_pin_list { struct irq_pin_list *next; }; -static struct irq_pin_list *get_one_free_irq_2_pin(int cpu) +static struct irq_pin_list *get_one_free_irq_2_pin(int node) { struct irq_pin_list *pin; - int node; - - node = cpu_to_node(cpu); pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node); @@ -154,9 +152,6 @@ struct irq_cfg { unsigned move_cleanup_count; u8 vector; u8 move_in_progress : 1; -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - u8 move_desc_pending : 1; -#endif }; /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ @@ -188,16 +183,18 @@ int __init arch_early_irq_init(void) struct irq_cfg *cfg; struct irq_desc *desc; int count; + int node; int i; cfg = irq_cfgx; count = ARRAY_SIZE(irq_cfgx); + node= cpu_to_node(boot_cpu_id); for (i = 0; i < count; i++) { desc = irq_to_desc(i); desc->chip_data = &cfg[i]; - alloc_bootmem_cpumask_var(&cfg[i].domain); - alloc_bootmem_cpumask_var(&cfg[i].old_domain); + zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); + zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); if (i < NR_IRQS_LEGACY) cpumask_setall(cfg[i].domain); } @@ -218,12 +215,9 @@ static struct irq_cfg *irq_cfg(unsigned return cfg; } -static struct irq_cfg *get_one_free_irq_cfg(int cpu) +static struct irq_cfg *get_one_free_irq_cfg(int node) { struct irq_cfg *cfg; - int node; - - node = cpu_to_node(cpu); cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); if (cfg) { @@ -244,13 +238,13 @@ static struct irq_cfg *get_one_free_irq_ return cfg; } -int arch_init_chip_data(struct irq_desc *desc, int cpu) +int arch_init_chip_data(struct irq_desc *desc, int node) { struct irq_cfg *cfg; cfg = desc->chip_data; if (!cfg) { - desc->chip_data = get_one_free_irq_cfg(cpu); + desc->chip_data = get_one_free_irq_cfg(node); if (!desc->chip_data) { 
printk(KERN_ERR "can not alloc irq_cfg\n"); BUG_ON(1); @@ -260,10 +254,9 @@ int arch_init_chip_data(struct irq_desc return 0; } -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - +/* for move_irq_desc */ static void -init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) +init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node) { struct irq_pin_list *old_entry, *head, *tail, *entry; @@ -272,7 +265,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_ if (!old_entry) return; - entry = get_one_free_irq_2_pin(cpu); + entry = get_one_free_irq_2_pin(node); if (!entry) return; @@ -282,7 +275,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_ tail = entry; old_entry = old_entry->next; while (old_entry) { - entry = get_one_free_irq_2_pin(cpu); + entry = get_one_free_irq_2_pin(node); if (!entry) { entry = head; while (entry) { @@ -322,12 +315,12 @@ static void free_irq_2_pin(struct irq_cf } void arch_init_copy_chip_data(struct irq_desc *old_desc, - struct irq_desc *desc, int cpu) + struct irq_desc *desc, int node) { struct irq_cfg *cfg; struct irq_cfg *old_cfg; - cfg = get_one_free_irq_cfg(cpu); + cfg = get_one_free_irq_cfg(node); if (!cfg) return; @@ -338,7 +331,7 @@ void arch_init_copy_chip_data(struct irq memcpy(cfg, old_cfg, sizeof(struct irq_cfg)); - init_copy_irq_2_pin(old_cfg, cfg, cpu); + init_copy_irq_2_pin(old_cfg, cfg, node); } static void free_irq_cfg(struct irq_cfg *old_cfg) @@ -362,19 +355,7 @@ void arch_free_chip_data(struct irq_desc old_desc->chip_data = NULL; } } - -static void -set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) -{ - struct irq_cfg *cfg = desc->chip_data; - - if (!cfg->move_in_progress) { - /* it means that domain is not changed */ - if (!cpumask_intersects(desc->affinity, mask)) - cfg->move_desc_pending = 1; - } -} -#endif +/* end for move_irq_desc */ #else static struct irq_cfg *irq_cfg(unsigned int irq) @@ -384,13 +365,6 @@ static struct irq_cfg *irq_cfg(unsigned #endif -#ifndef 
CONFIG_NUMA_MIGRATE_IRQ_DESC -static inline void -set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) -{ -} -#endif - struct io_apic { unsigned int index; unsigned int unused[3]; @@ -522,7 +496,8 @@ static struct IO_APIC_route_entry ioapic static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { - union entry_union eu; + union entry_union eu = {{0, 0}}; + eu.entry = e; io_apic_write(apic, 0x11 + 2*pin, eu.w2); io_apic_write(apic, 0x10 + 2*pin, eu.w1); @@ -553,132 +528,18 @@ static void ioapic_mask_entry(int apic, spin_unlock_irqrestore(&ioapic_lock, flags); } -#ifdef CONFIG_SMP -static void send_cleanup_vector(struct irq_cfg *cfg) -{ - cpumask_var_t cleanup_mask; - - if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { - unsigned int i; - cfg->move_cleanup_count = 0; - for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) - cfg->move_cleanup_count++; - for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) - apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); - } else { - cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); - cfg->move_cleanup_count = cpumask_weight(cleanup_mask); - apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - free_cpumask_var(cleanup_mask); - } - cfg->move_in_progress = 0; -} - -static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) -{ - int apic, pin; - struct irq_pin_list *entry; - u8 vector = cfg->vector; - - entry = cfg->irq_2_pin; - for (;;) { - unsigned int reg; - - if (!entry) - break; - - apic = entry->apic; - pin = entry->pin; - /* - * With interrupt-remapping, destination information comes - * from interrupt-remapping table entry. 
- */ - if (!irq_remapped(irq)) - io_apic_write(apic, 0x11 + pin*2, dest); - reg = io_apic_read(apic, 0x10 + pin*2); - reg &= ~IO_APIC_REDIR_VECTOR_MASK; - reg |= vector; - io_apic_modify(apic, 0x10 + pin*2, reg); - if (!entry->next) - break; - entry = entry->next; - } -} - -static int -assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); - -/* - * Either sets desc->affinity to a valid value, and returns - * ->cpu_mask_to_apicid of that, or returns BAD_APICID and - * leaves desc->affinity untouched. - */ -static unsigned int -set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) -{ - struct irq_cfg *cfg; - unsigned int irq; - - if (!cpumask_intersects(mask, cpu_online_mask)) - return BAD_APICID; - - irq = desc->irq; - cfg = desc->chip_data; - if (assign_irq_vector(irq, cfg, mask)) - return BAD_APICID; - - /* check that before desc->addinity get updated */ - set_extra_move_desc(desc, mask); - - cpumask_copy(desc->affinity, mask); - - return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); -} - -static void -set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) -{ - struct irq_cfg *cfg; - unsigned long flags; - unsigned int dest; - unsigned int irq; - - irq = desc->irq; - cfg = desc->chip_data; - - spin_lock_irqsave(&ioapic_lock, flags); - dest = set_desc_affinity(desc, mask); - if (dest != BAD_APICID) { - /* Only the high 8 bits are valid. */ - dest = SET_APIC_LOGICAL_ID(dest); - __target_IO_APIC_irq(irq, dest, cfg); - } - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -static void -set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) -{ - struct irq_desc *desc; - - desc = irq_to_desc(irq); - - set_ioapic_affinity_irq_desc(desc, mask); -} -#endif /* CONFIG_SMP */ - /* * The common case is 1:1 IRQ<->pin mappings. Sometimes there are * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. 
*/ -static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) +static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) { struct irq_pin_list *entry; entry = cfg->irq_2_pin; if (!entry) { - entry = get_one_free_irq_2_pin(cpu); + entry = get_one_free_irq_2_pin(node); if (!entry) { printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n", apic, pin); @@ -698,7 +559,7 @@ static void add_pin_to_irq_cpu(struct ir entry = entry->next; } - entry->next = get_one_free_irq_2_pin(cpu); + entry->next = get_one_free_irq_2_pin(node); entry = entry->next; entry->apic = apic; entry->pin = pin; @@ -707,7 +568,7 @@ static void add_pin_to_irq_cpu(struct ir /* * Reroute an IRQ to a different pin. */ -static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu, +static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, int oldapic, int oldpin, int newapic, int newpin) { @@ -727,7 +588,7 @@ static void __init replace_pin_at_irq_cp /* why? call replace before add? */ if (!replaced) - add_pin_to_irq_cpu(cfg, cpu, newapic, newpin); + add_pin_to_irq_node(cfg, node, newapic, newpin); } static inline void io_apic_modify_irq(struct irq_cfg *cfg, @@ -847,7 +708,7 @@ static void clear_IO_APIC (void) clear_IO_APIC_pin(apic, pin); } #else -#define add_pin_to_irq_cpu(cfg, cpu, apic, pin) +#define add_pin_to_irq_node(cfg, node, apic, pin) #endif /* !CONFIG_XEN */ #ifdef CONFIG_X86_32 @@ -888,7 +749,7 @@ static int __init ioapic_pirq_setup(char __setup("pirq=", ioapic_pirq_setup); #endif /* CONFIG_X86_32 */ -#ifdef CONFIG_INTR_REMAP +#ifndef CONFIG_XEN struct IO_APIC_route_entry **alloc_ioapic_entries(void) { int apic; @@ -986,20 +847,6 @@ int restore_IO_APIC_setup(struct IO_APIC return 0; } -void reinit_intr_remapped_IO_APIC(int intr_remapping, - struct IO_APIC_route_entry **ioapic_entries) - -{ - /* - * for now plain restore of previous settings. 
- * TBD: In the case of OS enabling interrupt-remapping, - * IO-APIC RTE's need to be setup to point to interrupt-remapping - * table entries. for now, do a plain restore, and wait for - * the setup_IO_APIC_irqs() to do proper initialization. - */ - restore_IO_APIC_setup(ioapic_entries); -} - void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries) { int apic; @@ -1009,7 +856,7 @@ void free_ioapic_entries(struct IO_APIC_ kfree(ioapic_entries); } -#endif +#endif /* CONFIG_XEN */ /* * Find the IRQ entry number of a certain pin. @@ -1072,54 +919,6 @@ static int __init find_isa_irq_apic(int } #endif -/* - * Find a specific PCI IRQ entry. - * Not an __init, possibly needed by modules - */ -static int pin_2_irq(int idx, int apic, int pin); - -int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) -{ - int apic, i, best_guess = -1; - - apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", - bus, slot, pin); - if (test_bit(bus, mp_bus_not_pci)) { - apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); - return -1; - } - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].srcbus; - - for (apic = 0; apic < nr_ioapics; apic++) - if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic || - mp_irqs[i].dstapic == MP_APIC_ALL) - break; - - if (!test_bit(lbus, mp_bus_not_pci) && - !mp_irqs[i].irqtype && - (bus == lbus) && - (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) { - int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq); - - if (!(apic || IO_APIC_IRQ(irq))) - continue; - - if (pin == (mp_irqs[i].srcbusirq & 3)) - return irq; - /* - * Use the first all-but-pin matching entry as a - * best-guess fuzzy result for broken mptables. 
- */ - if (best_guess < 0) - best_guess = irq; - } - } - return best_guess; -} - -EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); - #if defined(CONFIG_EISA) || defined(CONFIG_MCA) /* * EISA Edge/Level control register, ELCR @@ -1338,6 +1137,64 @@ static int pin_2_irq(int idx, int apic, return irq; } +/* + * Find a specific PCI IRQ entry. + * Not an __init, possibly needed by modules + */ +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin, + struct io_apic_irq_attr *irq_attr) +{ + int apic, i, best_guess = -1; + + apic_printk(APIC_DEBUG, + "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", + bus, slot, pin); + if (test_bit(bus, mp_bus_not_pci)) { + apic_printk(APIC_VERBOSE, + "PCI BIOS passed nonexistent PCI bus %d!\n", bus); + return -1; + } + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].srcbus; + + for (apic = 0; apic < nr_ioapics; apic++) + if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic || + mp_irqs[i].dstapic == MP_APIC_ALL) + break; + + if (!test_bit(lbus, mp_bus_not_pci) && + !mp_irqs[i].irqtype && + (bus == lbus) && + (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) { + int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq); + + if (!(apic || IO_APIC_IRQ(irq))) + continue; + + if (pin == (mp_irqs[i].srcbusirq & 3)) { + set_io_apic_irq_attr(irq_attr, apic, + mp_irqs[i].dstirq, + irq_trigger(i), + irq_polarity(i)); + return irq; + } + /* + * Use the first all-but-pin matching entry as a + * best-guess fuzzy result for broken mptables. 
+ */ + if (best_guess < 0) { + set_io_apic_irq_attr(irq_attr, apic, + mp_irqs[i].dstirq, + irq_trigger(i), + irq_polarity(i)); + best_guess = irq; + } + } + } + return best_guess; +} +EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); + #ifndef CONFIG_XEN void lock_vector_lock(void) { @@ -1609,6 +1466,9 @@ int setup_ioapic_entry(int apic_id, int irte.vector = vector; irte.dest_id = IRTE_DEST(destination); + /* Set source-id of interrupt request */ + set_ioapic_sid(&irte, apic_id); + modify_irte(irq, &irte); ir_entry->index2 = (index >> 15) & 0x1; @@ -1684,63 +1544,75 @@ static void setup_IO_APIC_irq(int apic_i ioapic_write_entry(apic_id, pin, entry); } +static struct { + DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); +} mp_ioapic_routing[MAX_IO_APICS]; + static void __init setup_IO_APIC_irqs(void) { - int apic_id, pin, idx, irq; + int apic_id = 0, pin, idx, irq; int notcon = 0; struct irq_desc *desc; struct irq_cfg *cfg; - int cpu = boot_cpu_id; + int node = cpu_to_node(boot_cpu_id); apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); - for (apic_id = 0; apic_id < nr_ioapics; apic_id++) { - for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { +#ifdef CONFIG_ACPI + if (!acpi_disabled && acpi_ioapic) { + apic_id = mp_find_ioapic(0); + if (apic_id < 0) + apic_id = 0; + } +#endif - idx = find_irq_entry(apic_id, pin, mp_INT); - if (idx == -1) { - if (!notcon) { - notcon = 1; - apic_printk(APIC_VERBOSE, - KERN_DEBUG " %d-%d", - mp_ioapics[apic_id].apicid, pin); - } else - apic_printk(APIC_VERBOSE, " %d-%d", - mp_ioapics[apic_id].apicid, pin); - continue; - } - if (notcon) { + for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { + idx = find_irq_entry(apic_id, pin, mp_INT); + if (idx == -1) { + if (!notcon) { + notcon = 1; apic_printk(APIC_VERBOSE, - " (apicid-pin) not connected\n"); - notcon = 0; - } + KERN_DEBUG " %d-%d", + mp_ioapics[apic_id].apicid, pin); + } else + apic_printk(APIC_VERBOSE, " %d-%d", + mp_ioapics[apic_id].apicid, pin); + 
continue; + } + if (notcon) { + apic_printk(APIC_VERBOSE, + " (apicid-pin) not connected\n"); + notcon = 0; + } - irq = pin_2_irq(idx, apic_id, pin); + irq = pin_2_irq(idx, apic_id, pin); #ifdef CONFIG_XEN - if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs) - continue; + if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs) + continue; #else - /* - * Skip the timer IRQ if there's a quirk handler - * installed and if it returns 1: - */ - if (apic->multi_timer_check && - apic->multi_timer_check(apic_id, irq)) - continue; + /* + * Skip the timer IRQ if there's a quirk handler + * installed and if it returns 1: + */ + if (apic->multi_timer_check && + apic->multi_timer_check(apic_id, irq)) + continue; #endif - desc = irq_to_desc_alloc_cpu(irq, cpu); - if (!desc) { - printk(KERN_INFO "can not get irq_desc for %d\n", irq); - continue; - } - cfg = desc->chip_data; - add_pin_to_irq_cpu(cfg, cpu, apic_id, pin); - - setup_IO_APIC_irq(apic_id, pin, irq, desc, - irq_trigger(idx), irq_polarity(idx)); + desc = irq_to_desc_alloc_node(irq, node); + if (!desc) { + printk(KERN_INFO "can not get irq_desc for %d\n", irq); + continue; } + cfg = desc->chip_data; + add_pin_to_irq_node(cfg, node, apic_id, pin); + /* + * don't mark it in pin_programmed, so later acpi could + * set it correctly when irq < 16 + */ + setup_IO_APIC_irq(apic_id, pin, irq, desc, + irq_trigger(idx), irq_polarity(idx)); } if (notcon) @@ -1908,36 +1780,30 @@ __apicdebuginit(void) print_IO_APIC(void return; } -__apicdebuginit(void) print_APIC_bitfield(int base) +__apicdebuginit(void) print_APIC_field(int base) { - unsigned int v; - int i, j; + int i; if (apic_verbosity == APIC_QUIET) return; - printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); - for (i = 0; i < 8; i++) { - v = apic_read(base + i*0x10); - for (j = 0; j < 32; j++) { - if (v & (1< 3) /* Due to the Pentium erratum 3AP. */ @@ -2019,6 +1885,18 @@ __apicdebuginit(void) print_local_APIC(v printk(KERN_DEBUG "... 
APIC TMCCT: %08x\n", v); v = apic_read(APIC_TDCR); printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); + + if (boot_cpu_has(X86_FEATURE_EXTAPIC)) { + v = apic_read(APIC_EFEAT); + maxlvt = (v >> 16) & 0xff; + printk(KERN_DEBUG "... APIC EFEAT: %08x\n", v); + v = apic_read(APIC_ECTRL); + printk(KERN_DEBUG "... APIC ECTRL: %08x\n", v); + for (i = 0; i < maxlvt; i++) { + v = apic_read(APIC_EILVTn(i)); + printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v); + } + } printk("\n"); } @@ -2067,6 +1945,11 @@ __apicdebuginit(void) print_PIC(void) __apicdebuginit(int) print_all_ICs(void) { print_PIC(); + + /* don't print out if apic is not there */ + if (!cpu_has_apic || disable_apic) + return 0; + print_all_local_APICs(); print_IO_APIC(); @@ -2188,7 +2071,9 @@ void disable_IO_APIC(void) /* * Use virtual wire A mode when interrupt remapping is enabled. */ - disconnect_bsp_APIC(!intr_remapping_enabled && ioapic_i8259.pin != -1); + if (cpu_has_apic) + disconnect_bsp_APIC(!intr_remapping_enabled && + ioapic_i8259.pin != -1); } #ifdef CONFIG_X86_32 @@ -2427,7 +2312,119 @@ static int ioapic_retrigger_irq(unsigned * races. 
*/ -#ifdef CONFIG_SMP +#ifdef CONFIG_SMP +static void send_cleanup_vector(struct irq_cfg *cfg) +{ + cpumask_var_t cleanup_mask; + + if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { + unsigned int i; + cfg->move_cleanup_count = 0; + for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + cfg->move_cleanup_count++; + for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); + } else { + cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); + cfg->move_cleanup_count = cpumask_weight(cleanup_mask); + apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + free_cpumask_var(cleanup_mask); + } + cfg->move_in_progress = 0; +} + +static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) +{ + int apic, pin; + struct irq_pin_list *entry; + u8 vector = cfg->vector; + + entry = cfg->irq_2_pin; + for (;;) { + unsigned int reg; + + if (!entry) + break; + + apic = entry->apic; + pin = entry->pin; + /* + * With interrupt-remapping, destination information comes + * from interrupt-remapping table entry. + */ + if (!irq_remapped(irq)) + io_apic_write(apic, 0x11 + pin*2, dest); + reg = io_apic_read(apic, 0x10 + pin*2); + reg &= ~IO_APIC_REDIR_VECTOR_MASK; + reg |= vector; + io_apic_modify(apic, 0x10 + pin*2, reg); + if (!entry->next) + break; + entry = entry->next; + } +} + +static int +assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); + +/* + * Either sets desc->affinity to a valid value, and returns + * ->cpu_mask_to_apicid of that, or returns BAD_APICID and + * leaves desc->affinity untouched. 
+ */ +static unsigned int +set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) +{ + struct irq_cfg *cfg; + unsigned int irq; + + if (!cpumask_intersects(mask, cpu_online_mask)) + return BAD_APICID; + + irq = desc->irq; + cfg = desc->chip_data; + if (assign_irq_vector(irq, cfg, mask)) + return BAD_APICID; + + cpumask_copy(desc->affinity, mask); + + return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); +} + +static int +set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) +{ + struct irq_cfg *cfg; + unsigned long flags; + unsigned int dest; + unsigned int irq; + int ret = -1; + + irq = desc->irq; + cfg = desc->chip_data; + + spin_lock_irqsave(&ioapic_lock, flags); + dest = set_desc_affinity(desc, mask); + if (dest != BAD_APICID) { + /* Only the high 8 bits are valid. */ + dest = SET_APIC_LOGICAL_ID(dest); + __target_IO_APIC_irq(irq, dest, cfg); + ret = 0; + } + spin_unlock_irqrestore(&ioapic_lock, flags); + + return ret; +} + +static int +set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) +{ + struct irq_desc *desc; + + desc = irq_to_desc(irq); + + return set_ioapic_affinity_irq_desc(desc, mask); +} #ifdef CONFIG_INTR_REMAP @@ -2442,26 +2439,25 @@ static int ioapic_retrigger_irq(unsigned * Real vector that is used for interrupting cpu will be coming from * the interrupt-remapping table entry. 
*/ -static void +static int migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; struct irte irte; unsigned int dest; unsigned int irq; + int ret = -1; if (!cpumask_intersects(mask, cpu_online_mask)) - return; + return ret; irq = desc->irq; if (get_irte(irq, &irte)) - return; + return ret; cfg = desc->chip_data; if (assign_irq_vector(irq, cfg, mask)) - return; - - set_extra_move_desc(desc, mask); + return ret; dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); @@ -2477,27 +2473,30 @@ migrate_ioapic_irq_desc(struct irq_desc send_cleanup_vector(cfg); cpumask_copy(desc->affinity, mask); + + return 0; } /* * Migrates the IRQ destination in the process context. */ -static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, +static int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { - migrate_ioapic_irq_desc(desc, mask); + return migrate_ioapic_irq_desc(desc, mask); } -static void set_ir_ioapic_affinity_irq(unsigned int irq, +static int set_ir_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); - set_ir_ioapic_affinity_irq_desc(desc, mask); + return set_ir_ioapic_affinity_irq_desc(desc, mask); } #else -static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, +static inline int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { + return 0; } #endif @@ -2559,86 +2558,19 @@ static void irq_complete_move(struct irq struct irq_cfg *cfg = desc->chip_data; unsigned vector, me; - if (likely(!cfg->move_in_progress)) { -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - if (likely(!cfg->move_desc_pending)) - return; - - /* domain has not changed, but affinity did */ - me = smp_processor_id(); - if (cpumask_test_cpu(me, desc->affinity)) { - *descp = desc = move_irq_desc(desc, me); - /* get the new one */ - cfg = desc->chip_data; - cfg->move_desc_pending = 0; - } -#endif + if 
(likely(!cfg->move_in_progress)) return; - } vector = ~get_irq_regs()->orig_ax; me = smp_processor_id(); - if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) { -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - *descp = desc = move_irq_desc(desc, me); - /* get the new one */ - cfg = desc->chip_data; -#endif + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) send_cleanup_vector(cfg); - } } #else static inline void irq_complete_move(struct irq_desc **descp) {} #endif -static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) -{ - int apic, pin; - struct irq_pin_list *entry; - - entry = cfg->irq_2_pin; - for (;;) { - - if (!entry) - break; - - apic = entry->apic; - pin = entry->pin; - io_apic_eoi(apic, pin); - entry = entry->next; - } -} - -static void -eoi_ioapic_irq(struct irq_desc *desc) -{ - struct irq_cfg *cfg; - unsigned long flags; - unsigned int irq; - - irq = desc->irq; - cfg = desc->chip_data; - - spin_lock_irqsave(&ioapic_lock, flags); - __eoi_ioapic_irq(irq, cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -#ifdef CONFIG_X86_X2APIC -static void ack_x2apic_level(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - ack_x2APIC_irq(); - eoi_ioapic_irq(desc); -} - -static void ack_x2apic_edge(unsigned int irq) -{ - ack_x2APIC_irq(); -} -#endif - static void ack_apic_edge(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -2702,9 +2634,6 @@ static void ack_apic_level(unsigned int */ ack_APIC_irq(); - if (irq_remapped(irq)) - eoi_ioapic_irq(desc); - /* Now we can move and renable the irq */ if (unlikely(do_unmask_irq)) { /* Only migrate the irq if the ack has been received. 
@@ -2751,22 +2680,50 @@ static void ack_apic_level(unsigned int } #ifdef CONFIG_INTR_REMAP +static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) +{ + int apic, pin; + struct irq_pin_list *entry; + + entry = cfg->irq_2_pin; + for (;;) { + + if (!entry) + break; + + apic = entry->apic; + pin = entry->pin; + io_apic_eoi(apic, pin); + entry = entry->next; + } +} + +static void +eoi_ioapic_irq(struct irq_desc *desc) +{ + struct irq_cfg *cfg; + unsigned long flags; + unsigned int irq; + + irq = desc->irq; + cfg = desc->chip_data; + + spin_lock_irqsave(&ioapic_lock, flags); + __eoi_ioapic_irq(irq, cfg); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + static void ir_ack_apic_edge(unsigned int irq) { -#ifdef CONFIG_X86_X2APIC - if (x2apic_enabled()) - return ack_x2apic_edge(irq); -#endif - return ack_apic_edge(irq); + ack_APIC_irq(); } static void ir_ack_apic_level(unsigned int irq) { -#ifdef CONFIG_X86_X2APIC - if (x2apic_enabled()) - return ack_x2apic_level(irq); -#endif - return ack_apic_level(irq); + struct irq_desc *desc = irq_to_desc(irq); + + ack_APIC_irq(); + eoi_ioapic_irq(desc); } #endif /* CONFIG_INTR_REMAP */ @@ -2977,7 +2934,7 @@ static inline void __init check_timer(vo { struct irq_desc *desc = irq_to_desc(0); struct irq_cfg *cfg = desc->chip_data; - int cpu = boot_cpu_id; + int node = cpu_to_node(boot_cpu_id); int apic1, pin1, apic2, pin2; unsigned long flags; int no_pin1 = 0; @@ -3043,7 +3000,7 @@ static inline void __init check_timer(vo * Ok, does IRQ0 through the IOAPIC work? 
*/ if (no_pin1) { - add_pin_to_irq_cpu(cfg, cpu, apic1, pin1); + add_pin_to_irq_node(cfg, node, apic1, pin1); setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); } else { /* for edge trigger, setup_IO_APIC_irq already @@ -3080,7 +3037,7 @@ static inline void __init check_timer(vo /* * legacy devices should be connected to IO APIC #0 */ - replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2); + replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); enable_8259A_irq(0); if (timer_irq_works()) { @@ -3310,14 +3267,13 @@ static int nr_irqs_gsi = NR_IRQS_LEGACY; /* * Dynamic irq allocate and deallocation */ -unsigned int create_irq_nr(unsigned int irq_want) +unsigned int create_irq_nr(unsigned int irq_want, int node) { /* Allocate an unused irq */ unsigned int irq; unsigned int new; unsigned long flags; struct irq_cfg *cfg_new = NULL; - int cpu = boot_cpu_id; struct irq_desc *desc_new = NULL; irq = 0; @@ -3326,7 +3282,7 @@ unsigned int create_irq_nr(unsigned int spin_lock_irqsave(&vector_lock, flags); for (new = irq_want; new < nr_irqs; new++) { - desc_new = irq_to_desc_alloc_cpu(new, cpu); + desc_new = irq_to_desc_alloc_node(new, node); if (!desc_new) { printk(KERN_INFO "can not get irq_desc for %d\n", new); continue; @@ -3335,6 +3291,9 @@ unsigned int create_irq_nr(unsigned int if (cfg_new->vector != 0) continue; + + desc_new = move_irq_desc(desc_new, node); + if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) irq = new; break; @@ -3352,11 +3311,12 @@ unsigned int create_irq_nr(unsigned int int create_irq(void) { + int node = cpu_to_node(boot_cpu_id); unsigned int irq_want; int irq; irq_want = nr_irqs_gsi; - irq = create_irq_nr(irq_want); + irq = create_irq_nr(irq_want, node); if (irq == 0) irq = -1; @@ -3422,6 +3382,9 @@ static int msi_compose_msg(struct pci_de irte.vector = cfg->vector; irte.dest_id = IRTE_DEST(dest); + /* Set source-id of interrupt request */ + set_msi_sid(&irte, pdev); + 
modify_irte(irq, &irte); msg->address_hi = MSI_ADDR_BASE_HI; @@ -3459,7 +3422,7 @@ static int msi_compose_msg(struct pci_de } #ifdef CONFIG_SMP -static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) +static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; @@ -3468,7 +3431,7 @@ static void set_msi_irq_affinity(unsigne dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; cfg = desc->chip_data; @@ -3480,13 +3443,15 @@ static void set_msi_irq_affinity(unsigne msg.address_lo |= MSI_ADDR_DEST_ID(dest); write_msi_msg_desc(desc, &msg); + + return 0; } #ifdef CONFIG_INTR_REMAP /* * Migrate the MSI irq to another cpumask. This migration is * done in the process context using interrupt-remapping hardware. */ -static void +static int ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); @@ -3495,11 +3460,11 @@ ir_set_msi_irq_affinity(unsigned int irq struct irte irte; if (get_irte(irq, &irte)) - return; + return -1; dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; irte.vector = cfg->vector; irte.dest_id = IRTE_DEST(dest); @@ -3516,6 +3481,8 @@ ir_set_msi_irq_affinity(unsigned int irq */ if (cfg->move_in_progress) send_cleanup_vector(cfg); + + return 0; } #endif @@ -3611,15 +3578,17 @@ int arch_setup_msi_irqs(struct pci_dev * unsigned int irq_want; struct intel_iommu *iommu = NULL; int index = 0; + int node; /* x86 doesn't support multiple MSI yet */ if (type == PCI_CAP_ID_MSI && nvec > 1) return 1; + node = dev_to_node(&dev->dev); irq_want = nr_irqs_gsi; sub_handle = 0; list_for_each_entry(msidesc, &dev->msi_list, list) { - irq = create_irq_nr(irq_want); + irq = create_irq_nr(irq_want, node); if (irq == 0) return -1; irq_want = irq + 1; @@ -3669,7 +3638,7 @@ void arch_teardown_msi_irq(unsigned int #if defined (CONFIG_DMAR) || defined 
(CONFIG_INTR_REMAP) #ifdef CONFIG_SMP -static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) +static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; @@ -3678,7 +3647,7 @@ static void dmar_msi_set_affinity(unsign dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; cfg = desc->chip_data; @@ -3690,11 +3659,13 @@ static void dmar_msi_set_affinity(unsign msg.address_lo |= MSI_ADDR_DEST_ID(dest); dmar_msi_write(irq, &msg); + + return 0; } #endif /* CONFIG_SMP */ -struct irq_chip dmar_msi_type = { +static struct irq_chip dmar_msi_type = { .name = "DMAR_MSI", .unmask = dmar_msi_unmask, .mask = dmar_msi_mask, @@ -3723,7 +3694,7 @@ int arch_setup_dmar_msi(unsigned int irq #ifdef CONFIG_HPET_TIMER #ifdef CONFIG_SMP -static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) +static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; @@ -3732,7 +3703,7 @@ static void hpet_msi_set_affinity(unsign dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; cfg = desc->chip_data; @@ -3744,6 +3715,8 @@ static void hpet_msi_set_affinity(unsign msg.address_lo |= MSI_ADDR_DEST_ID(dest); hpet_msi_write(irq, &msg); + + return 0; } #endif /* CONFIG_SMP */ @@ -3800,7 +3773,7 @@ static void target_ht_irq(unsigned int i write_ht_irq_msg(irq, &msg); } -static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) +static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; @@ -3808,11 +3781,13 @@ static void set_ht_irq_affinity(unsigned dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; cfg = desc->chip_data; target_ht_irq(irq, dest, cfg->vector); + + return 0; } #endif @@ -3887,6 
+3862,8 @@ int arch_enable_uv_irq(char *irq_name, u unsigned long flags; int err; + BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); + cfg = irq_cfg(irq); err = assign_irq_vector(irq, cfg, eligible_cpu); @@ -3900,19 +3877,20 @@ int arch_enable_uv_irq(char *irq_name, u mmr_value = 0; entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); - - entry->vector = cfg->vector; - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->polarity = 0; - entry->trigger = 0; - entry->mask = 0; - entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); + entry->vector = cfg->vector; + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->polarity = 0; + entry->trigger = 0; + entry->mask = 0; + entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); mmr_pnode = uv_blade_to_pnode(mmr_blade); uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); + if (cfg->move_in_progress) + send_cleanup_vector(cfg); + return irq; } @@ -3926,10 +3904,10 @@ void arch_disable_uv_irq(int mmr_blade, struct uv_IO_APIC_route_entry *entry; int mmr_pnode; + BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); + mmr_value = 0; entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); - entry->mask = 1; mmr_pnode = uv_blade_to_pnode(mmr_blade); @@ -3995,14 +3973,85 @@ int __init arch_probe_nr_irqs(void) #endif #endif /* CONFIG_XEN */ +static int __io_apic_set_pci_routing(struct device *dev, int irq, + struct io_apic_irq_attr *irq_attr) +{ + struct irq_desc *desc; + struct irq_cfg *cfg; + int node; + int ioapic, pin; + int trigger, polarity; + + ioapic = irq_attr->ioapic; +#ifdef CONFIG_XEN + if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs) { + apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ %d\n", + ioapic, 
irq); + return -EINVAL; + } +#endif + if (!IO_APIC_IRQ(irq)) { + apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", + ioapic); + return -EINVAL; + } + + if (dev) + node = dev_to_node(dev); + else + node = cpu_to_node(boot_cpu_id); + + desc = irq_to_desc_alloc_node(irq, node); + if (!desc) { + printk(KERN_INFO "can not get irq_desc %d\n", irq); + return 0; + } + + pin = irq_attr->ioapic_pin; + trigger = irq_attr->trigger; + polarity = irq_attr->polarity; + + /* + * IRQs < 16 are already in the irq_2_pin[] map + */ + if (irq >= NR_IRQS_LEGACY) { + cfg = desc->chip_data; + add_pin_to_irq_node(cfg, node, ioapic, pin); + } + + setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity); + + return 0; +} + +int io_apic_set_pci_routing(struct device *dev, int irq, + struct io_apic_irq_attr *irq_attr) +{ + int ioapic, pin; + /* + * Avoid pin reprogramming. PRTs typically include entries + * with redundant pin->gsi mappings (but unique PCI devices); + * we only program the IOAPIC on the first. 
+ */ + ioapic = irq_attr->ioapic; + pin = irq_attr->ioapic_pin; + if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) { + pr_debug("Pin %d-%d already programmed\n", + mp_ioapics[ioapic].apicid, pin); + return 0; + } + set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed); + + return __io_apic_set_pci_routing(dev, irq, irq_attr); +} + /* -------------------------------------------------------------------------- ACPI-based IOAPIC Configuration -------------------------------------------------------------------------- */ #ifdef CONFIG_ACPI -#ifdef CONFIG_X86_32 -#ifndef CONFIG_XEN +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN) int __init io_apic_get_unique_id(int ioapic, int apic_id) { union IO_APIC_reg_00 reg_00; @@ -4076,7 +4125,7 @@ int __init io_apic_get_unique_id(int ioa return apic_id; } -#endif /* !CONFIG_XEN */ +#endif int __init io_apic_get_version(int ioapic) { @@ -4089,47 +4138,6 @@ int __init io_apic_get_version(int ioapi return reg_01.bits.version; } -#endif - -int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) -{ - struct irq_desc *desc; - struct irq_cfg *cfg; - int cpu = boot_cpu_id; - -#ifdef CONFIG_XEN - if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs) { - apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ %d\n", - ioapic, irq); - return -EINVAL; - } -#endif - - if (!IO_APIC_IRQ(irq)) { - apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", - ioapic); - return -EINVAL; - } - - desc = irq_to_desc_alloc_cpu(irq, cpu); - if (!desc) { - printk(KERN_INFO "can not get irq_desc %d\n", irq); - return 0; - } - - /* - * IRQs < 16 are already in the irq_2_pin[] map - */ - if (irq >= NR_IRQS_LEGACY) { - cfg = desc->chip_data; - add_pin_to_irq_cpu(cfg, cpu, ioapic, pin); - } - - setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity); - - return 0; -} - int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) { @@ -4161,51 +4169,44 @@ int 
acpi_get_override_irq(int bus_irq, i #ifdef CONFIG_SMP void __init setup_ioapic_dest(void) { - int pin, ioapic, irq, irq_entry; + int pin, ioapic = 0, irq, irq_entry; struct irq_desc *desc; - struct irq_cfg *cfg; const struct cpumask *mask; if (skip_ioapic_setup == 1) return; - for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { - for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { - irq_entry = find_irq_entry(ioapic, pin, mp_INT); - if (irq_entry == -1) - continue; - irq = pin_2_irq(irq_entry, ioapic, pin); - - /* setup_IO_APIC_irqs could fail to get vector for some device - * when you have too many devices, because at that time only boot - * cpu is online. - */ - desc = irq_to_desc(irq); - cfg = desc->chip_data; - if (!cfg->vector) { - setup_IO_APIC_irq(ioapic, pin, irq, desc, - irq_trigger(irq_entry), - irq_polarity(irq_entry)); - continue; +#ifdef CONFIG_ACPI + if (!acpi_disabled && acpi_ioapic) { + ioapic = mp_find_ioapic(0); + if (ioapic < 0) + ioapic = 0; + } +#endif - } + for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { + irq_entry = find_irq_entry(ioapic, pin, mp_INT); + if (irq_entry == -1) + continue; + irq = pin_2_irq(irq_entry, ioapic, pin); - /* - * Honour affinities which have been set in early boot - */ - if (desc->status & - (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) - mask = desc->affinity; - else - mask = apic->target_cpus(); + desc = irq_to_desc(irq); - if (intr_remapping_enabled) - set_ir_ioapic_affinity_irq_desc(desc, mask); - else - set_ioapic_affinity_irq_desc(desc, mask); - } + /* + * Honour affinities which have been set in early boot + */ + if (desc->status & + (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) + mask = desc->affinity; + else + mask = apic->target_cpus(); + if (intr_remapping_enabled) + set_ir_ioapic_affinity_irq_desc(desc, mask); + else + set_ioapic_affinity_irq_desc(desc, mask); } + } #endif @@ -4288,29 +4289,21 @@ fake_ioapic_page: } } -static int __init ioapic_insert_resources(void) +void __init 
ioapic_insert_resources(void) { int i; struct resource *r = ioapic_resources; if (!r) { - if (nr_ioapics > 0) { + if (nr_ioapics > 0) printk(KERN_ERR "IO APIC resources couldn't be allocated.\n"); - return -1; - } - return 0; + return; } for (i = 0; i < nr_ioapics; i++) { insert_resource(&iomem_resource, r); r++; } - - return 0; } - -/* Insert the IO APIC resources after PCI initialization has occured to handle - * IO APICS that are mapped in on a BAR in PCI space. */ -late_initcall(ioapic_insert_resources); #endif /* !CONFIG_XEN */ --- head-2011-03-17.orig/arch/x86/kernel/apic/probe_32-xen.c 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/apic/probe_32-xen.c 2011-02-01 14:50:44.000000000 +0100 @@ -20,23 +20,12 @@ #include #include -#include -#include -#include -#include -#include -#include -#include #include -#include #include -#include -#include #include #include #include -#include static int xen_phys_pkg_id(int cpuid_apic, int index_msb) { --- head-2011-03-17.orig/arch/x86/kernel/cpu/amd.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/cpu/amd.c 2011-02-01 14:50:44.000000000 +0100 @@ -415,7 +415,7 @@ static void __cpuinit early_init_amd(str (c->x86_model == 8 && c->x86_mask >= 8)) set_cpu_cap(c, X86_FEATURE_K6_MTRR); #endif -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) && !defined(CONFIG_XEN) /* check CPU config space for extended APIC ID */ if (cpu_has_apic && c->x86 >= 0xf) { unsigned int val; --- head-2011-03-17.orig/arch/x86/kernel/cpu/common-xen.c 2011-03-17 14:42:07.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/cpu/common-xen.c 2011-03-17 14:42:17.000000000 +0100 @@ -13,6 +13,7 @@ #include #include +#include #include #include #include @@ -66,7 +67,30 @@ void __init setup_cpu_local_masks(void) #endif } -static const struct cpu_dev *this_cpu __cpuinitdata; +static void __cpuinit default_init(struct cpuinfo_x86 *c) +{ +#ifdef 
CONFIG_X86_64 + display_cacheinfo(c); +#else + /* Not much we can do here... */ + /* Check if at least it has cpuid */ + if (c->cpuid_level == -1) { + /* No cpuid. It must be an ancient CPU */ + if (c->x86 == 4) + strcpy(c->x86_model_id, "486"); + else if (c->x86 == 3) + strcpy(c->x86_model_id, "386"); + } +#endif +} + +static const struct cpu_dev __cpuinitconst default_cpu = { + .c_init = default_init, + .c_vendor = "Unknown", + .c_x86_vendor = X86_VENDOR_UNKNOWN, +}; + +static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { #ifdef CONFIG_X86_64 @@ -116,7 +140,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_p /* data */ [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, - [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, + [GDT_ENTRY_ESPFIX_SS] = { { { 0x0000ffff, 0x00cf9200 } } }, #endif [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } }, GDT_STACK_CANARY_INIT @@ -312,7 +336,8 @@ static const char *__cpuinit table_looku return NULL; /* Not found */ } -__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; +__u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata; +__u32 cpu_caps_set[NCAPINTS] __cpuinitdata; void load_percpu_segment(int cpu) { @@ -361,29 +386,6 @@ void switch_to_new_gdt(int cpu) static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {}; -static void __cpuinit default_init(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_X86_64 - display_cacheinfo(c); -#else - /* Not much we can do here... */ - /* Check if at least it has cpuid */ - if (c->cpuid_level == -1) { - /* No cpuid. 
It must be an ancient CPU */ - if (c->x86 == 4) - strcpy(c->x86_model_id, "486"); - else if (c->x86 == 3) - strcpy(c->x86_model_id, "386"); - } -#endif -} - -static const struct cpu_dev __cpuinitconst default_cpu = { - .c_init = default_init, - .c_vendor = "Unknown", - .c_x86_vendor = X86_VENDOR_UNKNOWN, -}; - static void __cpuinit get_model_name(struct cpuinfo_x86 *c) { unsigned int *v; @@ -516,7 +518,6 @@ out: static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) { char *v = c->x86_vendor_id; - static int printed; int i; for (i = 0; i < X86_VENDOR_NUM; i++) { @@ -533,13 +534,9 @@ static void __cpuinit get_cpu_vendor(str } } - if (!printed) { - printed++; - printk(KERN_ERR - "CPU: vendor_id '%s' unknown, using generic init.\n", v); - - printk(KERN_ERR "CPU: Your system may be unstable.\n"); - } + printk_once(KERN_ERR + "CPU: vendor_id '%s' unknown, using generic init.\n" \ + "CPU: Your system may be unstable.\n", v); c->x86_vendor = X86_VENDOR_UNKNOWN; this_cpu = &default_cpu; @@ -805,6 +802,12 @@ static void __cpuinit identify_cpu(struc if (this_cpu->c_identify) this_cpu->c_identify(c); + /* Clear/Set all flags overriden by options, after probe */ + for (i = 0; i < NCAPINTS; i++) { + c->x86_capability[i] &= ~cpu_caps_cleared[i]; + c->x86_capability[i] |= cpu_caps_set[i]; + } + #if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); #endif @@ -850,6 +853,16 @@ static void __cpuinit identify_cpu(struc #endif init_hypervisor(c); + + /* + * Clear/Set all flags overriden by options, need do it + * before following smp all cpus cap AND. 
+ */ + for (i = 0; i < NCAPINTS; i++) { + c->x86_capability[i] &= ~cpu_caps_cleared[i]; + c->x86_capability[i] |= cpu_caps_set[i]; + } + /* * On SMP, boot_cpu_data holds the common feature set between * all CPUs; so make sure that we indicate which features are @@ -862,10 +875,6 @@ static void __cpuinit identify_cpu(struc boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; } - /* Clear all flags overriden by options */ - for (i = 0; i < NCAPINTS; i++) - c->x86_capability[i] &= ~cleared_cpu_caps[i]; - #ifdef CONFIG_X86_MCE /* Init Machine Check Exception if available. */ mcheck_init(c); @@ -898,6 +907,7 @@ void __init identify_boot_cpu(void) #else vgetcpu_set_mode(); #endif + init_hw_perf_counters(); } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) --- head-2011-03-17.orig/arch/x86/kernel/cpu/mcheck/Makefile 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/cpu/mcheck/Makefile 2011-02-01 14:50:44.000000000 +0100 @@ -11,5 +11,3 @@ obj-$(CONFIG_X86_MCE_INJECT) += mce-inje obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o obj-$(CONFIG_ACPI_APEI) += mce-apei.o - -disabled-obj-$(CONFIG_XEN) := therm_throt.o --- head-2011-03-17.orig/arch/x86/kernel/cpu/mcheck/mce.c 2011-01-31 14:53:50.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/cpu/mcheck/mce.c 2011-02-01 14:50:44.000000000 +0100 @@ -137,10 +137,12 @@ void mce_setup(struct mce *m) m->time = get_seconds(); m->cpuvendor = boot_cpu_data.x86_vendor; m->cpuid = cpuid_eax(1); +#ifndef CONFIG_XEN #ifdef CONFIG_SMP m->socketid = cpu_data(m->extcpu).phys_proc_id; #endif m->apicid = cpu_data(m->extcpu).initial_apicid; +#endif rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); } @@ -483,7 +485,9 @@ static inline void mce_get_rip(struct mc */ asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs) { +#ifndef CONFIG_XEN ack_APIC_irq(); +#endif exit_idle(); irq_enter(); mce_notify_irq(); @@ -506,7 +510,7 @@ static void mce_report_event(struct pt_r return; } -#ifdef CONFIG_X86_LOCAL_APIC +#if 
defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN) /* * Without APIC do not notify. The event will be picked * up eventually. @@ -2167,7 +2171,7 @@ static __init int mcheck_init_device(voi #ifdef CONFIG_X86_XEN_MCE if (is_initial_xendomain()) { /* Register vIRQ handler for MCE LOG processing */ - extern void bind_virq_for_mce(void); + extern int bind_virq_for_mce(void); printk(KERN_DEBUG "MCE: bind virq for DOM0 logging\n"); bind_virq_for_mce(); --- head-2011-03-17.orig/arch/x86/kernel/cpu/mcheck/mce_dom0.c 2011-01-31 17:56:27.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/cpu/mcheck/mce_dom0.c 2011-02-01 14:50:44.000000000 +0100 @@ -7,12 +7,17 @@ #include #include +static xen_mc_logical_cpu_t *g_physinfo; +static unsigned int ncpus; + static int convert_log(struct mc_info *mi) { struct mcinfo_common *mic = NULL; struct mcinfo_global *mc_global; struct mcinfo_bank *mc_bank; struct mce m; + unsigned int i; + bool found = false; x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL); if (mic == NULL) @@ -21,9 +26,21 @@ static int convert_log(struct mc_info *m return -1; } + mce_setup(&m); mc_global = (struct mcinfo_global*)mic; m.mcgstatus = mc_global->mc_gstatus; - m.cpu = mc_global->mc_coreid;/*for test*/ + m.apicid = mc_global->mc_apicid; + + for (i = 0; i < ncpus; i++) + if (g_physinfo[i].mc_apicid == m.apicid) { + found = true; + break; + } + WARN_ON_ONCE(!found); + m.socketid = mc_global->mc_socketid; + m.cpu = m.extcpu = g_physinfo[i].mc_cpunr; + m.cpuvendor = (__u8)g_physinfo[i].mc_vendor; + x86_mcinfo_lookup(mic, mi, MC_TYPE_BANK); do { @@ -36,7 +53,6 @@ static int convert_log(struct mc_info *m m.status = mc_bank->mc_status; m.addr = mc_bank->mc_addr; m.tsc = mc_bank->mc_tsc; - m.res1 = mc_bank->mc_ctrl2; m.bank = mc_bank->mc_bank; printk(KERN_DEBUG "[CPU%d, BANK%d, addr %llx, state %llx]\n", m.bank, m.cpu, m.addr, m.status); @@ -116,18 +132,55 @@ end: return IRQ_HANDLED; } -void bind_virq_for_mce(void) +int __init bind_virq_for_mce(void) { int ret; + xen_mc_t 
mc_op; + + g_mi = kmalloc(sizeof(*g_mi), GFP_KERNEL); + if (!g_mi) + return -ENOMEM; + + /* fetch physical CPU count */ + mc_op.cmd = XEN_MC_physcpuinfo; + mc_op.interface_version = XEN_MCA_INTERFACE_VERSION; + set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, NULL); + ret = HYPERVISOR_mca(&mc_op); + if (ret) { + pr_err("MCE: Failed to get physical CPU count\n"); + kfree(g_mi); + return ret; + } + + /* fetch CPU physical info for later reference */ + ncpus = mc_op.u.mc_physcpuinfo.ncpus; + g_physinfo = kmalloc(sizeof(*g_physinfo) * ncpus, GFP_KERNEL); + if (!g_physinfo) { + kfree(g_mi); + return -ENOMEM; + } + set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo); + ret = HYPERVISOR_mca(&mc_op); + if (ret) { + pr_err("MCE: Failed to get physical CPUs' info\n"); + kfree(g_mi); + kfree(g_physinfo); + return ret; + } ret = bind_virq_to_irqhandler(VIRQ_MCA, 0, mce_dom0_interrupt, 0, "mce", NULL); - g_mi = kmalloc(sizeof(struct mc_info), GFP_KERNEL); - if (ret < 0) - pr_err("MCE_DOM0_LOG: bind_virq for DOM0 failed\n"); + if (ret < 0) { + pr_err("MCE: Failed to bind vIRQ for Dom0\n"); + kfree(g_mi); + kfree(g_physinfo); + return ret; + } /* Log the machine checks left over from the previous reset. 
*/ mce_dom0_interrupt(VIRQ_MCA, NULL); + + return 0; } --- head-2011-03-17.orig/arch/x86/kernel/e820-xen.c 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/e820-xen.c 2011-02-01 14:50:44.000000000 +0100 @@ -659,7 +659,7 @@ __init int e820_search_gap(unsigned long */ __init void e820_setup_gap(void) { - unsigned long gapstart, gapsize, round; + unsigned long gapstart, gapsize; int found; gapstart = 0x10000000; @@ -668,24 +668,18 @@ __init void e820_setup_gap(void) #ifdef CONFIG_X86_64 if (!found) { - printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit " - "address range\n" - KERN_ERR "PCI: Unassigned devices with 32bit resource " - "registers may break!\n"); + printk(KERN_ERR + "PCI: Warning: Cannot find a gap in the 32bit address range\n" + "PCI: Unassigned devices with 32bit resource registers may break!\n"); found = e820_search_gap(&gapstart, &gapsize, MAX_GAP_END, 0); WARN_ON(!found); } #endif /* - * See how much we want to round up: start off with - * rounding to the next 1MB area. + * e820_reserve_resources_late protect stolen RAM already */ - round = 0x100000; - while ((gapsize >> 4) > round) - round += round; - /* Fun with two's complement */ - pci_mem_start = (gapstart + round) & -round; + pci_mem_start = gapstart; printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", @@ -1495,6 +1489,25 @@ void __init e820_reserve_resources(void) } } +/* How much should we pad RAM ending depending on where it is? 
*/ +static unsigned long ram_alignment(resource_size_t pos) +{ + unsigned long mb = pos >> 20; + + /* To 64kB in the first megabyte */ + if (!mb) + return 64*1024; + + /* To 1MB in the first 16MB */ + if (mb < 16) + return 1024*1024; + + /* To 32MB for anything above that */ + return 32*1024*1024; +} + +#define MAX_RESOURCE_SIZE ((resource_size_t)-1) + void __init e820_reserve_resources_late(void) { int i; @@ -1506,6 +1519,26 @@ void __init e820_reserve_resources_late( insert_resource_expand_to_fit(&iomem_resource, res); res++; } + + /* + * Try to bump up RAM regions to reasonable boundaries to + * avoid stolen RAM: + */ + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *entry = &e820.map[i]; + u64 start, end; + + if (entry->type != E820_RAM) + continue; + start = entry->addr + entry->size; + end = round_up(start, ram_alignment(start)) - 1; + if (end > MAX_RESOURCE_SIZE) + end = MAX_RESOURCE_SIZE; + if (start >= end) + continue; + reserve_region_with_split(&iomem_resource, start, end, + "RAM buffer"); + } } #undef e820 --- head-2011-03-17.orig/arch/x86/kernel/entry_32-xen.S 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/entry_32-xen.S 2011-02-01 14:50:44.000000000 +0100 @@ -48,7 +48,6 @@ #include #include #include -#include #include #include #include @@ -88,7 +87,7 @@ NMI_MASK = 0x80000000 #define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF #else #define preempt_stop(clobbers) -#define resume_kernel restore_nocheck +#define resume_kernel restore_all #endif .macro TRACE_IRQS_IRET @@ -376,7 +375,7 @@ END(ret_from_exception) ENTRY(resume_kernel) DISABLE_INTERRUPTS(CLBR_ANY) cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? - jnz restore_nocheck + jnz restore_all need_resched: movl TI_flags(%ebp), %ecx # need_resched set ? 
testb $_TIF_NEED_RESCHED, %cl @@ -569,6 +568,8 @@ syscall_exit: jne syscall_exit_work restore_all: + TRACE_IRQS_IRET +restore_all_notrace: #ifndef CONFIG_XEN movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS # Warning: PT_OLDSS(%esp) contains the wrong/random values if we @@ -594,8 +595,6 @@ restore_nocheck: CFI_REMEMBER_STATE jnz restore_all_enable_events # != 0 => enable event delivery #endif - TRACE_IRQS_IRET -restore_nocheck_notrace: RESTORE_REGS 4 # skip orig_eax/error_code CFI_ADJUST_CFA_OFFSET -4 irq_return: @@ -632,22 +631,34 @@ ldt_ss: jne restore_nocheck #endif - /* If returning to userspace with 16bit stack, - * try to fix the higher word of ESP, as the CPU - * won't restore it. - * This is an "official" bug of all the x86-compatible - * CPUs, which we can try to work around to make - * dosemu and wine happy. */ - movl PT_OLDESP(%esp), %eax - movl %esp, %edx - call patch_espfix_desc +/* + * Setup and switch to ESPFIX stack + * + * We're returning to userspace with a 16 bit stack. The CPU will not + * restore the high word of ESP for us on executing iret... This is an + * "official" bug of all the x86-compatible CPUs, which we can work + * around to make dosemu and wine happy. We do this by preloading the + * high word of ESP with the high word of the userspace ESP while + * compensating for the offset by changing to the ESPFIX segment with + * a base address that matches for the difference. 
+ */ + mov %esp, %edx /* load kernel esp */ + mov PT_OLDESP(%esp), %eax /* load userspace esp */ + mov %dx, %ax /* eax: new kernel esp */ + sub %eax, %edx /* offset (low word is 0) */ + PER_CPU(gdt_page, %ebx) + shr $16, %edx + mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */ + mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */ pushl $__ESPFIX_SS CFI_ADJUST_CFA_OFFSET 4 - pushl %eax + push %eax /* new kernel esp */ CFI_ADJUST_CFA_OFFSET 4 + /* Disable interrupts, but do not irqtrace this section: we + * will soon execute iret and the tracer was already set to + * the irqstate after the iret */ DISABLE_INTERRUPTS(CLBR_EAX) - TRACE_IRQS_OFF - lss (%esp), %esp + lss (%esp), %esp /* switch to espfix segment */ CFI_ADJUST_CFA_OFFSET -8 jmp restore_nocheck #else @@ -786,15 +797,24 @@ PTREGSCALL(vm86old) #ifndef CONFIG_XEN .macro FIXUP_ESPFIX_STACK - /* since we are on a wrong stack, we cant make it a C code :( */ +/* + * Switch back for ESPFIX stack to the normal zerobased stack + * + * We can't call C functions using the ESPFIX stack. This code reads + * the high word of the segment base from the GDT and swiches to the + * normal stack and adjusts ESP with the matching offset. 
+ */ + /* fixup the stack */ PER_CPU(gdt_page, %ebx) - GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah) - addl %esp, %eax + mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */ + mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */ + shl $16, %eax + addl %esp, %eax /* the adjusted stack pointer */ pushl $__KERNEL_DS CFI_ADJUST_CFA_OFFSET 4 pushl %eax CFI_ADJUST_CFA_OFFSET 4 - lss (%esp), %esp + lss (%esp), %esp /* switch to the normal stack segment */ CFI_ADJUST_CFA_OFFSET -8 .endm .macro UNWIND_ESPFIX_STACK @@ -1284,6 +1304,7 @@ ENTRY(ftrace_graph_caller) pushl %edx movl 0xc(%esp), %edx lea 0x4(%ebp), %eax + movl (%ebp), %ecx subl $MCOUNT_INSN_SIZE, %edx call prepare_ftrace_return popl %edx @@ -1298,6 +1319,7 @@ return_to_handler: pushl %eax pushl %ecx pushl %edx + movl %ebp, %eax call ftrace_return_to_handler movl %eax, 0xc(%esp) popl %edx @@ -1593,7 +1615,7 @@ nmi_stack_correct: xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_nmi - jmp restore_nocheck_notrace + jmp restore_all_notrace CFI_ENDPROC nmi_stack_fixup: --- head-2011-03-17.orig/arch/x86/kernel/entry_64.S 2011-02-16 16:02:30.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/entry_64.S 2011-02-16 16:02:54.000000000 +0100 @@ -1363,7 +1363,7 @@ apicinterrupt XEN_HVM_EVTCHN_CALLBACK \ paranoidzeroentry_ist debug do_debug DEBUG_STACK paranoidzeroentry_ist int3 do_int3 DEBUG_STACK paranoiderrorentry stack_segment do_stack_segment -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN zeroentry xen_debug do_debug zeroentry xen_int3 do_int3 errorentry xen_stack_segment do_stack_segment --- head-2011-03-17.orig/arch/x86/kernel/entry_64-xen.S 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/entry_64-xen.S 2011-02-01 14:50:44.000000000 +0100 @@ -139,6 +139,7 @@ ENTRY(ftrace_graph_caller) leaq 8(%rbp), %rdi movq 0x38(%rsp), %rsi + movq (%rbp), %rdx subq $MCOUNT_INSN_SIZE, %rsi call prepare_ftrace_return @@ -151,27 +152,15 @@ 
END(ftrace_graph_caller) GLOBAL(return_to_handler) subq $80, %rsp + /* Save the return values */ movq %rax, (%rsp) - movq %rcx, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 24(%rsp) - movq %rdi, 32(%rsp) - movq %r8, 40(%rsp) - movq %r9, 48(%rsp) - movq %r10, 56(%rsp) - movq %r11, 64(%rsp) + movq %rdx, 8(%rsp) + movq %rbp, %rdi call ftrace_return_to_handler movq %rax, 72(%rsp) - movq 64(%rsp), %r11 - movq 56(%rsp), %r10 - movq 48(%rsp), %r9 - movq 40(%rsp), %r8 - movq 32(%rsp), %rdi - movq 24(%rsp), %rsi - movq 16(%rsp), %rdx - movq 8(%rsp), %rcx + movq 8(%rsp), %rdx movq (%rsp), %rax addq $72, %rsp retq @@ -869,6 +858,8 @@ END(\sym) #ifdef CONFIG_SMP apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \ irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt +apicinterrupt REBOOT_VECTOR \ + reboot_interrupt smp_reboot_interrupt #endif #ifdef CONFIG_X86_UV @@ -900,10 +891,15 @@ apicinterrupt INVALIDATE_TLB_VECTOR_STAR #endif apicinterrupt THRESHOLD_APIC_VECTOR \ - threshold_interrupt mce_threshold_interrupt + threshold_interrupt smp_threshold_interrupt apicinterrupt THERMAL_APIC_VECTOR \ thermal_interrupt smp_thermal_interrupt +#ifdef CONFIG_X86_MCE +apicinterrupt MCE_SELF_VECTOR \ + mce_self_interrupt smp_mce_self_interrupt +#endif + #ifdef CONFIG_SMP apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ call_function_single_interrupt smp_call_function_single_interrupt @@ -917,6 +913,11 @@ apicinterrupt ERROR_APIC_VECTOR \ error_interrupt smp_error_interrupt apicinterrupt SPURIOUS_APIC_VECTOR \ spurious_interrupt smp_spurious_interrupt + +#ifdef CONFIG_PERF_COUNTERS +apicinterrupt LOCAL_PENDING_VECTOR \ + perf_pending_interrupt smp_perf_pending_interrupt +#endif #endif /* !CONFIG_XEN */ /* @@ -1219,7 +1220,7 @@ paranoiderrorentry stack_segment do_stac errorentry general_protection do_general_protection errorentry page_fault do_page_fault #ifdef CONFIG_X86_MCE -paranoidzeroentry machine_check do_machine_check +paranoidzeroentry machine_check *machine_check_vector(%rip) #endif #ifndef 
CONFIG_XEN --- head-2011-03-17.orig/arch/x86/kernel/head_32-xen.S 2011-03-03 16:23:08.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/head_32-xen.S 2011-03-03 16:23:25.000000000 +0100 @@ -118,12 +118,6 @@ ENTRY(hypercall_page) CFI_ENDPROC /* - * Real beginning of normal "text" segment - */ -ENTRY(stext) -ENTRY(_stext) - -/* * BSS section */ .section ".bss.page_aligned","wa" --- head-2011-03-17.orig/arch/x86/kernel/head_64-xen.S 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/head_64-xen.S 2011-02-01 14:50:44.000000000 +0100 @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include --- head-2011-03-17.orig/arch/x86/kernel/init_task.c 2011-03-17 14:35:44.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/init_task.c 2011-02-01 14:50:44.000000000 +0100 @@ -31,6 +31,7 @@ union thread_union init_thread_union __i struct task_struct init_task = INIT_TASK(init_task); EXPORT_SYMBOL(init_task); +#ifndef CONFIG_X86_NO_TSS /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, * no more per-task TSS's. The TSS size is kept cacheline-aligned @@ -39,4 +40,4 @@ EXPORT_SYMBOL(init_task); * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; - +#endif --- head-2011-03-17.orig/arch/x86/kernel/irq-xen.c 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/irq-xen.c 2011-02-01 14:50:44.000000000 +0100 @@ -12,6 +12,8 @@ #include #include #include +#include +#include atomic_t irq_err_count; @@ -26,9 +28,10 @@ void (*generic_interrupt_extension)(void */ void ack_bad_irq(unsigned int irq) { - printk(KERN_ERR "unexpected IRQ trap at irq %02x\n", irq); + if (printk_ratelimit()) + pr_err("unexpected IRQ trap at vector %02x\n", irq); -#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN) +#ifndef CONFIG_XEN /* * Currently unexpected vectors happen only on SMP and APIC. 
* We _must_ ack these because every local APIC has only N @@ -38,8 +41,7 @@ void ack_bad_irq(unsigned int irq) * completely. * But only ack when the APIC is enabled -AK */ - if (cpu_has_apic) - ack_APIC_irq(); + ack_APIC_irq(); #endif } @@ -65,6 +67,14 @@ static int show_other_interrupts(struct for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); seq_printf(p, " Spurious interrupts\n"); + seq_printf(p, "%*s: ", prec, "CNT"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); + seq_printf(p, " Performance counter interrupts\n"); + seq_printf(p, "%*s: ", prec, "PND"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); + seq_printf(p, " Performance pending work\n"); #endif #ifndef CONFIG_XEN if (generic_interrupt_extension) { @@ -95,17 +105,27 @@ static int show_other_interrupts(struct seq_printf(p, " Spinlock wakeups\n"); #endif #endif -#ifdef CONFIG_X86_MCE +#ifdef CONFIG_X86_THERMAL_VECTOR seq_printf(p, "%*s: ", prec, "TRM"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); seq_printf(p, " Thermal event interrupts\n"); -# ifdef CONFIG_X86_64 +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD seq_printf(p, "%*s: ", prec, "THR"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); seq_printf(p, " Threshold APIC interrupts\n"); -# endif +#endif +#ifdef CONFIG_X86_NEW_MCE + seq_printf(p, "%*s: ", prec, "MCE"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); + seq_printf(p, " Machine check exceptions\n"); + seq_printf(p, "%*s: ", prec, "MCP"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(mce_poll_count, j)); + seq_printf(p, " Machine check polls\n"); #endif seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) @@ -177,6 +197,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu) #ifdef CONFIG_X86_LOCAL_APIC sum += 
irq_stats(cpu)->apic_timer_irqs; sum += irq_stats(cpu)->irq_spurious_count; + sum += irq_stats(cpu)->apic_perf_irqs; + sum += irq_stats(cpu)->apic_pending_irqs; #endif #ifndef CONFIG_XEN if (generic_interrupt_extension) @@ -191,11 +213,15 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->irq_lock_count; #endif #endif -#ifdef CONFIG_X86_MCE +#ifdef CONFIG_X86_THERMAL_VECTOR sum += irq_stats(cpu)->irq_thermal_count; -# ifdef CONFIG_X86_64 +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD sum += irq_stats(cpu)->irq_threshold_count; #endif +#ifdef CONFIG_X86_NEW_MCE + sum += per_cpu(mce_exception_count, cpu); + sum += per_cpu(mce_poll_count, cpu); #endif return sum; } @@ -231,14 +257,11 @@ unsigned int __irq_entry do_IRQ(struct p irq = __get_cpu_var(vector_irq)[vector]; if (!handle_irq(irq, regs)) { -#ifdef CONFIG_X86_64 - if (!disable_apic) - ack_APIC_irq(); -#endif + ack_APIC_irq(); if (printk_ratelimit()) - printk(KERN_EMERG "%s: %d.%d No irq handler for vector (irq %d)\n", - __func__, smp_processor_id(), vector, irq); + pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n", + __func__, smp_processor_id(), vector, irq); } irq_exit(); --- head-2011-03-17.orig/arch/x86/kernel/microcode_core-xen.c 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/microcode_core-xen.c 2011-02-01 14:50:44.000000000 +0100 @@ -22,27 +22,21 @@ * 2 of the License, or (at your option) any later version. */ #include -#include #include -#include +#include #include -#include -#include -#include -#include #include #include #include -#include -#include -#include #include #include #include +#include +#include +#include #include #include -#include MODULE_DESCRIPTION("Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian "); @@ -53,7 +47,18 @@ module_param(verbose, int, 0644); #define MICROCODE_VERSION "2.00-xen" -/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ +/* + * Synchronization. 
+ * + * All non cpu-hotplug-callback call sites use: + * + * - microcode_mutex to synchronize with each other; + * - get/put_online_cpus() to synchronize with + * the cpu-hotplug-callback call sites. + * + * We guarantee that only a single cpu is being + * updated at any particular moment of time. + */ static DEFINE_MUTEX(microcode_mutex); #ifdef CONFIG_MICROCODE_OLD_INTERFACE @@ -90,18 +95,16 @@ static int microcode_open(struct inode * static ssize_t microcode_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { - ssize_t ret; + ssize_t ret = -EINVAL; if ((len >> PAGE_SHIFT) > num_physpages) { - printk(KERN_ERR "microcode: too much data (max %ld pages)\n", - num_physpages); - return -EINVAL; + pr_err("microcode: too much data (max %ld pages)\n", num_physpages); + return ret; } mutex_lock(µcode_mutex); - ret = do_microcode_update(buf, len); - if (!ret) + if (do_microcode_update(buf, len) == 0) ret = (ssize_t)len; mutex_unlock(µcode_mutex); @@ -110,15 +113,16 @@ static ssize_t microcode_write(struct fi } static const struct file_operations microcode_fops = { - .owner = THIS_MODULE, - .write = microcode_write, - .open = microcode_open, + .owner = THIS_MODULE, + .write = microcode_write, + .open = microcode_open, }; static struct miscdevice microcode_dev = { - .minor = MICROCODE_MINOR, - .name = "microcode", - .fops = µcode_fops, + .minor = MICROCODE_MINOR, + .name = "microcode", + .devnode = "cpu/microcode", + .fops = µcode_fops, }; static int __init microcode_dev_init(void) @@ -127,9 +131,7 @@ static int __init microcode_dev_init(voi error = misc_register(µcode_dev); if (error) { - printk(KERN_ERR - "microcode: can't misc_register on minor=%d\n", - MICROCODE_MINOR); + pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR); return error; } @@ -188,38 +190,35 @@ static int __init microcode_init(void) else if (c->x86_vendor == X86_VENDOR_AMD) fw_name = "amd-ucode/microcode_amd.bin"; else { - printk(KERN_ERR "microcode: no 
support for this CPU vendor\n"); + pr_err("microcode: no support for this CPU vendor\n"); return -ENODEV; } - error = microcode_dev_init(); - if (error) - return error; microcode_pdev = platform_device_register_simple("microcode", -1, NULL, 0); if (IS_ERR(microcode_pdev)) { - microcode_dev_exit(); return PTR_ERR(microcode_pdev); } + error = microcode_dev_init(); + if (error) + return error; + request_microcode(fw_name); - printk(KERN_INFO - "Microcode Update Driver: v" MICROCODE_VERSION + pr_info("Microcode Update Driver: v" MICROCODE_VERSION " ," " Peter Oruba\n"); return 0; } +module_init(microcode_init); static void __exit microcode_exit(void) { microcode_dev_exit(); platform_device_unregister(microcode_pdev); - printk(KERN_INFO - "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); + pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); } - -module_init(microcode_init); module_exit(microcode_exit); --- head-2011-03-17.orig/arch/x86/kernel/mpparse-xen.c 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/mpparse-xen.c 2011-02-01 14:50:44.000000000 +0100 @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -904,24 +905,17 @@ static inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {} #endif /* CONFIG_X86_IO_APIC */ -static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, - int count) +static int +check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) { - if (!mpc_new_phys) { - pr_info("No spare slots, try to append...take your risk, " - "new mpc_length %x\n", count); - } else { - if (count <= mpc_new_length) - pr_info("No spare slots, try to append..., " - "new mpc_length %x\n", count); - else { - pr_err("mpc_new_length %lx is too small\n", - mpc_new_length); - return -1; - } + int ret = 0; + + if (!mpc_new_phys || count <= mpc_new_length) { + WARN(1, "update_mptable: No spare slots (length: %x)\n", count); + return -1; } - 
return 0; + return ret; } static int __init replace_intsrc_all(struct mpc_table *mpc, @@ -980,7 +974,7 @@ static int __init replace_intsrc_all(st } else { struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; count += sizeof(struct mpc_intsrc); - if (!check_slot(mpc_new_phys, mpc_new_length, count)) + if (check_slot(mpc_new_phys, mpc_new_length, count) < 0) goto out; assign_to_mpc_intsrc(&mp_irqs[i], m); mpc->length = count; @@ -997,11 +991,14 @@ out: return 0; } -static int __initdata enable_update_mptable; +int enable_update_mptable; static int __init update_mptable_setup(char *str) { enable_update_mptable = 1; +#ifdef CONFIG_PCI + pci_routeirq = 1; +#endif return 0; } early_param("update_mptable", update_mptable_setup); @@ -1014,6 +1011,9 @@ static int __initdata alloc_mptable; static int __init parse_alloc_mptable_opt(char *p) { enable_update_mptable = 1; +#ifdef CONFIG_PCI + pci_routeirq = 1; +#endif alloc_mptable = 1; if (!p) return 0; --- head-2011-03-17.orig/arch/x86/kernel/pci-dma-xen.c 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/pci-dma-xen.c 2011-02-01 14:50:44.000000000 +0100 @@ -32,6 +32,8 @@ int no_iommu __read_mostly; /* Set this to 1 if there is a HW IOMMU in the system */ int iommu_detected __read_mostly = 0; +int iommu_pass_through; + dma_addr_t bad_dma_address __read_mostly = 0; EXPORT_SYMBOL(bad_dma_address); @@ -264,6 +266,10 @@ static __init int iommu_setup(char *p) if (!strncmp(p, "soft", 4)) swiotlb = 1; #endif + if (!strncmp(p, "pt", 2)) { + iommu_pass_through = 1; + return 1; + } gart_parse_options(p); @@ -371,6 +377,8 @@ static int __init pci_iommu_init(void) void pci_iommu_shutdown(void) { gart_iommu_shutdown(); + + amd_iommu_shutdown(); } /* Must execute after PCI subsystem */ fs_initcall(pci_iommu_init); --- head-2011-03-17.orig/arch/x86/kernel/process-xen.c 2011-03-03 16:06:40.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/process-xen.c 2011-03-03 16:07:25.000000000 +0100 @@ -8,12 +8,15 @@ #include #include 
#include +#include #include #include #include +#include #include #include #include +#include #include unsigned long idle_halt; @@ -46,6 +49,8 @@ void free_thread_xstate(struct task_stru kmem_cache_free(task_xstate_cachep, tsk->thread.xstate); tsk->thread.xstate = NULL; } + + WARN(tsk->thread.ds_ctx, "leaking DS context\n"); } void free_thread_info(struct thread_info *ti) @@ -59,7 +64,7 @@ void arch_task_cache_init(void) task_xstate_cachep = kmem_cache_create("task_xstate", xstate_size, __alignof__(union thread_xstate), - SLAB_PANIC, NULL); + SLAB_PANIC | SLAB_NOTRACK, NULL); } /* @@ -85,8 +90,6 @@ void exit_thread(void) t->io_bitmap_max = 0; kfree(bp); } - - ds_exit_thread(current); } void flush_thread(void) @@ -471,16 +474,12 @@ static void c1e_idle(void) if (!cpumask_test_cpu(cpu, c1e_mask)) { cpumask_set_cpu(cpu, c1e_mask); /* - * Force broadcast so ACPI can not interfere. Needs - * to run with interrupts enabled as it uses - * smp_function_call. + * Force broadcast so ACPI can not interfere. */ - local_irq_enable(); clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, &cpu); printk(KERN_INFO "Switch to broadcast mode on CPU%d\n", cpu); - local_irq_disable(); } clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); @@ -575,3 +574,16 @@ static int __init idle_setup(char *str) } early_param("idle", idle_setup); +unsigned long arch_align_stack(unsigned long sp) +{ + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) + sp -= get_random_int() % 8192; + return sp & ~0xf; +} + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + unsigned long range_end = mm->brk + 0x02000000; + return randomize_range(mm->brk, range_end, 0) ? : mm->brk; +} + --- head-2011-03-17.orig/arch/x86/kernel/process_32-xen.c 2011-02-02 08:37:24.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/process_32-xen.c 2011-02-02 08:37:43.000000000 +0100 @@ -9,8 +9,6 @@ * This file handles the architecture-dependent parts of process handling.. 
*/ -#include - #include #include #include @@ -33,7 +31,6 @@ #include #include #include -#include #include #include #include @@ -297,7 +294,8 @@ int copy_thread(unsigned long clone_flag p->thread.io_bitmap_max = 0; } - ds_copy_thread(p, current); + clear_tsk_thread_flag(p, TIF_DS_AREA_MSR); + p->thread.ds_ctx = NULL; clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); p->thread.debugctlmsr = 0; @@ -468,7 +466,7 @@ __switch_to(struct task_struct *prev_p, * done before math_state_restore, so the TS bit is up * to date. */ - arch_leave_lazy_cpu_mode(); + arch_end_context_switch(next_p); /* If the task has used fpu the last 5 timeslices, just do a full * restore of the math state immediately to avoid the trap; the @@ -558,15 +556,3 @@ unsigned long get_wchan(struct task_stru return 0; } -unsigned long arch_align_stack(unsigned long sp) -{ - if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() % 8192; - return sp & ~0xf; -} - -unsigned long arch_randomize_brk(struct mm_struct *mm) -{ - unsigned long range_end = mm->brk + 0x02000000; - return randomize_range(mm->brk, range_end, 0) ? : mm->brk; -} --- head-2011-03-17.orig/arch/x86/kernel/process_64-xen.c 2011-02-02 08:37:17.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/process_64-xen.c 2011-02-02 08:37:47.000000000 +0100 @@ -17,8 +17,6 @@ * This file handles the architecture-dependent parts of process handling.. */ -#include - #include #include #include @@ -35,7 +33,6 @@ #include #include #include -#include #include #include #include @@ -344,7 +341,8 @@ int copy_thread(unsigned long clone_flag } p->thread.iopl = current->thread.iopl; - ds_copy_thread(p, me); + clear_tsk_thread_flag(p, TIF_DS_AREA_MSR); + p->thread.ds_ctx = NULL; clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); p->thread.debugctlmsr = 0; @@ -506,7 +504,7 @@ __switch_to(struct task_struct *prev_p, * done before math_state_restore, so the TS bit is up * to date. 
*/ - arch_leave_lazy_cpu_mode(); + arch_end_context_switch(next_p); /* * Switch FS and GS. @@ -723,15 +721,3 @@ long sys_arch_prctl(int code, unsigned l return do_arch_prctl(current, code, addr); } -unsigned long arch_align_stack(unsigned long sp) -{ - if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() % 8192; - return sp & ~0xf; -} - -unsigned long arch_randomize_brk(struct mm_struct *mm) -{ - unsigned long range_end = mm->brk + 0x02000000; - return randomize_range(mm->brk, range_end, 0) ? : mm->brk; -} --- head-2011-03-17.orig/arch/x86/kernel/setup-xen.c 2011-03-03 16:22:49.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/setup-xen.c 2011-03-03 16:23:32.000000000 +0100 @@ -142,6 +142,14 @@ EXPORT_SYMBOL(xen_start_info); #define ARCH_SETUP #endif +/* + * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. + * The direct mapping extends to max_pfn_mapped, so that we can directly access + * apertures, ACPI and other tables without having to play with fixmaps. 
+ */ +unsigned long max_low_pfn_mapped; +unsigned long max_pfn_mapped; + RESERVE_BRK(dmi_alloc, 65536); unsigned int boot_cpu_id __read_mostly; @@ -247,8 +255,8 @@ unsigned long mmu_cr4_features; unsigned long mmu_cr4_features = X86_CR4_PAE; #endif -/* Boot loader ID as an integer, for the benefit of proc_dointvec */ -int bootloader_type; +/* Boot loader ID and version as integers, for the benefit of proc_dointvec */ +int bootloader_type, bootloader_version; /* * Setup options @@ -316,6 +324,20 @@ void * __init extend_brk(size_t size, si return ret; } +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) +static void __init init_gbpages(void) +{ + if (direct_gbpages && cpu_has_gbpages) + printk(KERN_INFO "Using GB pages for direct mapping\n"); + else + direct_gbpages = 0; +} +#else +static inline void init_gbpages(void) +{ +} +#endif + static void __init reserve_brk(void) { if (_brk_end > _brk_start) @@ -328,15 +350,13 @@ static void __init reserve_brk(void) #ifdef CONFIG_BLK_DEV_INITRD -#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN) - #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) static void __init relocate_initrd(void) { - +#ifndef CONFIG_XEN u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; - u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; + u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; u64 ramdisk_here; unsigned long slop, clen, mapaddr; char *p, *q; @@ -391,8 +411,14 @@ static void __init relocate_initrd(void) " %08llx - %08llx\n", ramdisk_image, ramdisk_image + ramdisk_size - 1, ramdisk_here, ramdisk_here + ramdisk_size - 1); -} +#else + printk(KERN_ERR "initrd extends beyond end of memory " + "(0x%08lx > 0x%08lx)\ndisabling initrd\n", + __pa(xen_start_info->mod_start) + xen_start_info->mod_len, + max_low_pfn_mapped << PAGE_SHIFT); + initrd_start = 0; #endif +} static void __init reserve_initrd(void) { @@ -400,7 +426,7 @@ static void __init reserve_initrd(void) u64 ramdisk_image = 
boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; u64 ramdisk_end = ramdisk_image + ramdisk_size; - u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; + u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; if (!boot_params.hdr.type_of_loader || !ramdisk_image || !ramdisk_size) @@ -409,7 +435,7 @@ static void __init reserve_initrd(void) unsigned long ramdisk_image = __pa(xen_start_info->mod_start); unsigned long ramdisk_size = xen_start_info->mod_len; unsigned long ramdisk_end = ramdisk_image + ramdisk_size; - unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT; + unsigned long end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; if (!xen_start_info->mod_start || !ramdisk_size) return; /* No initrd provided by bootloader */ @@ -442,14 +468,8 @@ static void __init reserve_initrd(void) return; } -#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN) relocate_initrd(); -#else - printk(KERN_ERR "initrd extends beyond end of memory " - "(0x%08lx > 0x%08lx)\ndisabling initrd\n", - ramdisk_end, end_of_lowmem); - initrd_start = 0; -#endif + free_early(ramdisk_image, ramdisk_end); } #else @@ -721,6 +741,19 @@ static struct dmi_system_id __initdata b DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"), }, }, + { + /* + * AMI BIOS with low memory corruption was found on Intel DG45ID board. + * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will + * match only DMI_BOARD_NAME and see if there are more bad products + * with this vendor. 
+ */ + .callback = dmi_low_memory_corruption, + .ident = "AMI BIOS", + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "DG45ID"), + }, + }, #endif {} }; @@ -788,6 +821,12 @@ void __init setup_arch(char **cmdline_p) #endif saved_video_mode = boot_params.hdr.vid_mode; bootloader_type = boot_params.hdr.type_of_loader; + if ((bootloader_type >> 4) == 0xe) { + bootloader_type &= 0xf; + bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4; + } + bootloader_version = bootloader_type & 0xf; + bootloader_version |= boot_params.hdr.ext_loader_ver << 4; #ifdef CONFIG_BLK_DEV_RAM rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; @@ -970,14 +1009,22 @@ void __init setup_arch(char **cmdline_p) max_low_pfn = max_pfn; high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; +#ifndef CONFIG_XEN + max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT; +#endif #endif #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION setup_bios_corruption_check(); #endif + printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n", + max_pfn_mapped<arch_pre_intr_init) { - if (x86_quirks->arch_pre_intr_init()) - return; - } - init_ISA_irqs(); -} - -/** * x86_quirk_intr_init - post gate setup interrupt initialisation * * Description: --- head-2011-03-17.orig/arch/x86/kernel/smp-xen.c 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/smp-xen.c 2011-02-01 14:50:44.000000000 +0100 @@ -135,11 +135,36 @@ void xen_send_call_func_ipi(const struct * this function calls the 'stop' function on all other CPUs in the system. */ +irqreturn_t smp_reboot_interrupt(int irq, void *dev_id) +{ + stop_this_cpu(NULL); + + return IRQ_HANDLED; +} + void xen_smp_send_stop(void) { unsigned long flags; + unsigned long wait; + + /* + * Use an own vector here because smp_call_function + * does lots of things not suitable in a panic situation. 
+ * On most systems we could also use an NMI here, + * but there are a few systems around where NMI + * is problematic so stay with a non-NMI for now + * (this implies we cannot stop CPUs spinning with irq off + * currently) + */ + if (num_online_cpus() > 1) { + xen_send_IPI_allbutself(REBOOT_VECTOR); + + /* Don't wait longer than a second */ + wait = USEC_PER_SEC; + while (num_online_cpus() > 1 && wait--) + udelay(1); + } - smp_call_function(stop_this_cpu, NULL, 0); local_irq_save(flags); disable_all_local_evtchn(); local_irq_restore(flags); --- head-2011-03-17.orig/arch/x86/kernel/traps-xen.c 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/traps-xen.c 2011-02-01 14:50:44.000000000 +0100 @@ -45,6 +45,7 @@ #include #endif +#include #include #include #include @@ -53,6 +54,7 @@ #include #include #include +#include #include @@ -64,8 +66,6 @@ #include #include -#include "cpu/mcheck/mce.h" - asmlinkage int system_call(void); /* Do we ignore FPU interrupts ? */ @@ -347,6 +347,9 @@ io_check_error(unsigned char reason, str printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); show_registers(regs); + if (panic_on_io_nmi) + panic("NMI IOCK error: Not continuing"); + /* Re-enable the IOCK line, wait for a few seconds */ clear_io_check_error(reason); } @@ -527,6 +530,10 @@ dotraplinkage void __kprobes do_debug(st get_debugreg(condition, 6); + /* Catch kmemcheck conditions first of all! */ + if (condition & DR_STEP && kmemcheck_trap(regs)) + return; + /* * The processor cleared BTF, so don't mark that we need it set. */ @@ -792,15 +799,15 @@ unsigned long patch_espfix_desc(unsigned return new_kesp; } -#else +#endif + asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) { } -asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) +asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) { } -#endif #endif /* CONFIG_XEN */ /* @@ -834,9 +841,6 @@ asmlinkage void math_state_restore(void) } /* NB. 
'clts' is done for us by Xen during virtual trap. */ -#ifdef CONFIG_X86_32 - restore_fpu(tsk); -#else /* * Paranoid restore. send a SIGSEGV if we fail to restore the state. */ @@ -845,7 +849,7 @@ asmlinkage void math_state_restore(void) force_sig(SIGSEGV, tsk); return; } -#endif + thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ tsk->fpu_counter++; } --- head-2011-03-17.orig/arch/x86/kernel/vsyscall_64-xen.c 2011-02-01 14:42:26.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/vsyscall_64-xen.c 2011-02-01 14:50:44.000000000 +0100 @@ -132,15 +132,7 @@ static __always_inline void do_vgettimeo return; } - /* - * Surround the RDTSC by barriers, to make sure it's not - * speculated to outside the seqlock critical section and - * does not cause time warps: - */ - rdtsc_barrier(); now = vread(); - rdtsc_barrier(); - base = __vsyscall_gtod_data.clock.cycle_last; mask = __vsyscall_gtod_data.clock.mask; mult = __vsyscall_gtod_data.clock.mult; --- head-2011-03-17.orig/arch/x86/mm/dump_pagetables-xen.c 2011-02-01 14:39:24.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/dump_pagetables-xen.c 2011-02-01 14:50:44.000000000 +0100 @@ -173,13 +173,14 @@ static void note_page(struct seq_file *m st->current_address >= st->marker[1].start_address) { const char *unit = units; unsigned long delta; + int width = sizeof(unsigned long) * 2; /* * Now print the actual finished series */ - seq_printf(m, "0x%p-0x%p ", - (void *)st->start_address, - (void *)st->current_address); + seq_printf(m, "0x%0*lx-0x%0*lx ", + width, st->start_address, + width, st->current_address); delta = (st->current_address - st->start_address) >> 10; while (!(delta & 1023) && unit[1]) { --- head-2011-03-17.orig/arch/x86/mm/fault-xen.c 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/fault-xen.c 2011-02-01 14:50:44.000000000 +0100 @@ -3,40 +3,18 @@ * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. 
* Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include +#include /* STACK_END_MAGIC */ +#include /* test_thread_flag(), ... */ +#include /* oops_begin/end, ... */ +#include /* search_exception_table */ +#include /* max_low_pfn */ +#include /* __kprobes, ... */ +#include /* kmmio_handler, ... */ +#include /* perf_swcounter_event */ + +#include /* dotraplinkage, ... */ +#include /* pgd_*(), ... */ +#include /* kmemcheck_*(), ... */ /* * Page fault error code bits: @@ -228,10 +206,7 @@ static inline pmd_t *vmalloc_sync_one(pg if (!pmd_present(*pmd_k)) return NULL; - if (!pmd_present(*pmd)) { - bool lazy = percpu_read(xen_lazy_mmu); - - percpu_write(xen_lazy_mmu, false); + if (!pmd_present(*pmd)) #if CONFIG_XEN_COMPAT > 0x030002 set_pmd(pmd, *pmd_k); #else @@ -241,10 +216,8 @@ static inline pmd_t *vmalloc_sync_one(pg */ set_pmd(pmd, __pmd(pmd_val(*pmd_k))); #endif - percpu_write(xen_lazy_mmu, lazy); - } else { + else BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); - } return pmd_k; } @@ -474,10 +447,11 @@ static noinline int vmalloc_fault(unsign } static const char errata93_warning[] = -KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" -KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" -KERN_ERR "******* Please consider a BIOS update.\n" -KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; +KERN_ERR +"******* Your BIOS seems to not contain a fix for K8 errata #93\n" +"******* Working around it, but it may cause SEGVs or burn power.\n" +"******* Please consider a BIOS update.\n" +"******* Disabling USB legacy in the BIOS may also help.\n"; /* * No vm86 mode in 64-bit mode: @@ -562,8 
+536,6 @@ bad: static int is_errata93(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_64 - static int once; - if (address != regs->ip) return 0; @@ -573,10 +545,7 @@ static int is_errata93(struct pt_regs *r address |= 0xffffffffUL << 32; if ((address >= (u64)_stext && address <= (u64)_etext) || (address >= MODULES_VADDR && address <= MODULES_END)) { - if (!once) { - printk(errata93_warning); - once = 1; - } + printk_once(errata93_warning); regs->ip = address; return 1; } @@ -749,7 +718,7 @@ show_signal_msg(struct pt_regs *regs, un if (!printk_ratelimit()) return; - printk(KERN_CONT "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", + printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx", task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, tsk->comm, task_pid_nr(tsk), address, (void *)regs->ip, (void *)regs->sp, error_code); @@ -1011,11 +980,17 @@ do_page_fault(struct pt_regs *regs, unsi tsk = current; mm = tsk->mm; - prefetchw(&mm->mmap_sem); - /* Get the faulting address: */ address = read_cr2(); + /* + * Detect and handle instructions that would cause a page fault for + * both a tracked kernel page and a userspace page. 
+ */ + if (kmemcheck_active(regs)) + kmemcheck_hide(regs); + prefetchw(&mm->mmap_sem); + if (unlikely(kmmio_fault(regs, address))) return; @@ -1044,9 +1019,13 @@ do_page_fault(struct pt_regs *regs, unsi return; } - if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && - vmalloc_fault(address) >= 0) - return; + if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) { + if (vmalloc_fault(address) >= 0) + return; + + if (kmemcheck_fault(regs, address, error_code)) + return; + } /* Can handle a stale RO->RW TLB: */ if (spurious_fault(error_code, address)) @@ -1085,6 +1064,8 @@ do_page_fault(struct pt_regs *regs, unsi if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); + perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + /* * If we're in an interrupt, have no user context or are running * in an atomic region then we must not take the fault: @@ -1171,17 +1152,22 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault: */ - fault = handle_mm_fault(mm, vma, address, write); + fault = handle_mm_fault(mm, vma, address, write ? 
FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { mm_fault_error(regs, error_code, address, fault); return; } - if (fault & VM_FAULT_MAJOR) + if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - else + perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + regs, address); + } else { tsk->min_flt++; + perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + regs, address); + } check_v8086_mode(regs, address, tsk); --- head-2011-03-17.orig/arch/x86/mm/highmem_32-xen.c 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/highmem_32-xen.c 2011-02-01 14:50:44.000000000 +0100 @@ -44,7 +44,6 @@ void *kmap_atomic_prot(struct page *page vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); BUG_ON(!pte_none(*(kmap_pte-idx))); set_pte_at(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot)); - /*arch_flush_lazy_mmu_mode();*/ return (void *)vaddr; } @@ -74,7 +73,6 @@ void kunmap_atomic(void *kvaddr, enum km #endif } - /*arch_flush_lazy_mmu_mode();*/ pagefault_enable(); } @@ -150,6 +148,7 @@ EXPORT_SYMBOL(kmap); EXPORT_SYMBOL(kunmap); EXPORT_SYMBOL(kmap_atomic); EXPORT_SYMBOL(kunmap_atomic); +EXPORT_SYMBOL(kmap_atomic_prot); #ifdef CONFIG_HIGHPTE EXPORT_SYMBOL(kmap_atomic_to_page); #endif --- head-2011-03-17.orig/arch/x86/mm/hypervisor.c 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/hypervisor.c 2011-02-01 14:50:44.000000000 +0100 @@ -116,8 +116,8 @@ static int _xen_multicall_flush(bool ret return 0; } -void xen_multicall_flush(bool force) { - if (force || use_lazy_mmu_mode()) +void xen_multicall_flush(void) { + if (use_lazy_mmu_mode()) _xen_multicall_flush(false); } --- head-2011-03-17.orig/arch/x86/mm/init-xen.c 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/init-xen.c 2011-02-01 14:50:44.000000000 +0100 @@ -1,3 +1,4 @@ +#include #include #include #include @@ -11,6 +12,10 @@ #include #include #include +#include +#include + +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); unsigned long __meminitdata 
e820_table_start; unsigned long __meminitdata e820_table_end; @@ -31,6 +36,69 @@ extern unsigned long extend_init_mapping extern void xen_finish_init_mapping(void); #endif +int nx_enabled; + +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) +static int disable_nx __cpuinitdata; + +/* + * noexec = on|off + * + * Control non-executable mappings for processes. + * + * on Enable + * off Disable + */ +static int __init noexec_setup(char *str) +{ + if (!str) + return -EINVAL; + if (!strncmp(str, "on", 2)) { + __supported_pte_mask |= _PAGE_NX; + disable_nx = 0; + } else if (!strncmp(str, "off", 3)) { + disable_nx = 1; + __supported_pte_mask &= ~_PAGE_NX; + } + return 0; +} +early_param("noexec", noexec_setup); +#endif + +#ifdef CONFIG_X86_PAE +static void __init set_nx(void) +{ + unsigned int v[4], l, h; + + if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { + cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); + + if ((v[3] & (1 << 20)) && !disable_nx) { + rdmsr(MSR_EFER, l, h); + l |= EFER_NX; + wrmsr(MSR_EFER, l, h); + nx_enabled = 1; + __supported_pte_mask |= _PAGE_NX; + } + } +} +#else +static inline void set_nx(void) +{ +} +#endif + +#ifdef CONFIG_X86_64 +void __cpuinit check_efer(void) +{ + unsigned long efer; + + rdmsrl(MSR_EFER, efer); + if (!(efer & EFER_NX) || disable_nx) + __supported_pte_mask &= ~_PAGE_NX; +} +#endif + static void __init find_early_table_space(unsigned long end, int use_pse, int use_gbpages) { @@ -127,20 +195,6 @@ static int __meminit save_mr(struct map_ return nr_range; } -#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) -static void __init init_gbpages(void) -{ - if (direct_gbpages && cpu_has_gbpages) - printk(KERN_INFO "Using GB pages for direct mapping\n"); - else - direct_gbpages = 0; -} -#else -static inline void init_gbpages(void) -{ -} -#endif - /* * Setup the direct mapping of the physical memory at PAGE_OFFSET. 
* This runs before bootmem is initialized and gets pages directly from @@ -160,10 +214,7 @@ unsigned long __init_refok init_memory_m printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end); - if (!after_bootmem) - init_gbpages(); - -#ifdef CONFIG_DEBUG_PAGEALLOC +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) /* * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. * This will simplify cpa(), which otherwise needs to support splitting @@ -175,12 +226,9 @@ unsigned long __init_refok init_memory_m use_gbpages = direct_gbpages; #endif -#ifdef CONFIG_X86_32 -#ifdef CONFIG_X86_PAE set_nx(); if (nx_enabled) printk(KERN_INFO "NX (Execute Disable) protection: active\n"); -#endif /* Enable PSE if available */ if (cpu_has_pse) @@ -191,7 +239,6 @@ unsigned long __init_refok init_memory_m set_in_cr4(X86_CR4_PGE); __supported_pte_mask |= _PAGE_GLOBAL; } -#endif if (use_gbpages) page_size_mask |= 1 << PG_LEVEL_1G; --- head-2011-03-17.orig/arch/x86/mm/init_32-xen.c 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/init_32-xen.c 2011-02-01 14:50:44.000000000 +0100 @@ -52,12 +52,9 @@ #include #include #include +#include #include -unsigned long max_low_pfn_mapped; -unsigned long max_pfn_mapped; - -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; static noinline int do_test_wp_bit(void); @@ -122,7 +119,7 @@ static pte_t * __init one_page_table_ini pte_t *page_table = NULL; if (after_bootmem) { -#ifdef CONFIG_DEBUG_PAGEALLOC +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); #endif if (!page_table) @@ -569,7 +566,7 @@ static inline void save_pg_dir(void) } #endif /* !CONFIG_ACPI_SLEEP */ -void zap_low_mappings(void) +void zap_low_mappings(bool early) { int i; @@ -586,64 +583,16 @@ void zap_low_mappings(void) set_pgd(swapper_pg_dir+i, __pgd(0)); #endif } - flush_tlb_all(); -} -int nx_enabled; + if (early) + 
__flush_tlb(); + else + flush_tlb_all(); +} pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL); EXPORT_SYMBOL_GPL(__supported_pte_mask); -#ifdef CONFIG_X86_PAE - -static int disable_nx __initdata; - -/* - * noexec = on|off - * - * Control non executable mappings. - * - * on Enable - * off Disable - */ -static int __init noexec_setup(char *str) -{ - if (!str || !strcmp(str, "on")) { - if (cpu_has_nx) { - __supported_pte_mask |= _PAGE_NX; - disable_nx = 0; - } - } else { - if (!strcmp(str, "off")) { - disable_nx = 1; - __supported_pte_mask &= ~_PAGE_NX; - } else { - return -EINVAL; - } - } - - return 0; -} -early_param("noexec", noexec_setup); - -void __init set_nx(void) -{ - unsigned int v[4], l, h; - - if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { - cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); - - if ((v[3] & (1 << 20)) && !disable_nx) { - rdmsr(MSR_EFER, l, h); - l |= EFER_NX; - wrmsr(MSR_EFER, l, h); - nx_enabled = 1; - __supported_pte_mask |= _PAGE_NX; - } - } -} -#endif - /* user-defined highmem size */ static unsigned int highmem_pages = -1; @@ -763,15 +712,15 @@ void __init initmem_init(unsigned long s highstart_pfn = highend_pfn = max_pfn; if (max_pfn > max_low_pfn) highstart_pfn = max_low_pfn; - memory_present(0, 0, highend_pfn); e820_register_active_regions(0, 0, highend_pfn); + sparse_memory_present_with_active_regions(0); printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); num_physpages = highend_pfn; high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; #else - memory_present(0, 0, max_low_pfn); e820_register_active_regions(0, 0, max_low_pfn); + sparse_memory_present_with_active_regions(0); num_physpages = max_low_pfn; high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; #endif @@ -1074,7 +1023,7 @@ void __init mem_init(void) test_wp_bit(); save_pg_dir(); - zap_low_mappings(); + zap_low_mappings(true); SetPagePinned(virt_to_page(init_mm.pgd)); } --- 
head-2011-03-17.orig/arch/x86/mm/init_64-xen.c 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/init_64-xen.c 2011-02-01 14:50:44.000000000 +0100 @@ -56,21 +56,11 @@ #include -/* - * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. - * The direct mapping extends to max_pfn_mapped, so that we can directly access - * apertures, ACPI and other tables without having to play with fixmaps. - */ -unsigned long max_low_pfn_mapped; -unsigned long max_pfn_mapped; - #if CONFIG_XEN_COMPAT <= 0x030002 unsigned int __kernel_page_user; EXPORT_SYMBOL(__kernel_page_user); #endif -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); - extern pmd_t level2_fixmap_pgt[PTRS_PER_PMD]; extern pte_t level1_fixmap_pgt[PTRS_PER_PTE]; @@ -151,39 +141,6 @@ early_param("gbpages", parse_direct_gbpa pteval_t __supported_pte_mask __read_mostly = ~0UL; EXPORT_SYMBOL_GPL(__supported_pte_mask); -static int disable_nx __cpuinitdata; - -/* - * noexec=on|off - * Control non-executable mappings for 64-bit processes. 
- * - * on Enable (default) - * off Disable - */ -static int __init nonx_setup(char *str) -{ - if (!str) - return -EINVAL; - if (!strncmp(str, "on", 2)) { - __supported_pte_mask |= _PAGE_NX; - disable_nx = 0; - } else if (!strncmp(str, "off", 3)) { - disable_nx = 1; - __supported_pte_mask &= ~_PAGE_NX; - } - return 0; -} -early_param("noexec", nonx_setup); - -void __cpuinit check_efer(void) -{ - unsigned long efer; - - rdmsrl(MSR_EFER, efer); - if (!(efer & EFER_NX) || disable_nx) - __supported_pte_mask &= ~_PAGE_NX; -} - int force_personality32; /* @@ -213,7 +170,7 @@ static __ref void *spp_getpage(void) void *ptr; if (after_bootmem) - ptr = (void *) get_zeroed_page(GFP_ATOMIC); + ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); else if (e820_table_end < e820_table_top) { ptr = __va(e820_table_end << PAGE_SHIFT); e820_table_end++; @@ -399,7 +356,7 @@ static __ref void *alloc_low_page(unsign void *adr; if (after_bootmem) { - adr = (void *)get_zeroed_page(GFP_ATOMIC); + adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); *phys = __pa(adr); return adr; @@ -804,7 +761,7 @@ void __init xen_finish_init_mapping(void e820_table_top = e820_table_end; } -unsigned long __init +unsigned long __meminit kernel_physical_mapping_init(unsigned long start, unsigned long end, unsigned long page_size_mask) @@ -873,6 +830,7 @@ void __init initmem_init(unsigned long s early_res_to_bootmem(0, end_pfn< #include #include +#include #include #include @@ -486,7 +487,7 @@ static int split_large_page(pte_t *kpte, if (!debug_pagealloc) spin_unlock(&cpa_lock); - base = alloc_pages(GFP_KERNEL, 0); + base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); if (!debug_pagealloc) spin_lock(&cpa_lock); if (!base) @@ -610,9 +611,12 @@ static int __change_page_attr(struct cpa unsigned int level; pte_t *kpte, old_pte; - if (cpa->flags & CPA_PAGES_ARRAY) - address = (unsigned long)page_address(cpa->pages[cpa->curpage]); - else if (cpa->flags & CPA_ARRAY) + if (cpa->flags & CPA_PAGES_ARRAY) { 
+ struct page *page = cpa->pages[cpa->curpage]; + if (unlikely(PageHighMem(page))) + return 0; + address = (unsigned long)page_address(page); + } else if (cpa->flags & CPA_ARRAY) address = cpa->vaddr[cpa->curpage]; else address = *cpa->vaddr; @@ -702,8 +706,9 @@ static int __change_page_attr_set_clr(st static int cpa_process_alias(struct cpa_data *cpa) { struct cpa_data alias_cpa; - int ret = 0; - unsigned long temp_cpa_vaddr, vaddr; + unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); + unsigned long vaddr, remapped; + int ret; if (cpa->pfn >= max_pfn_mapped) return 0; @@ -716,9 +721,12 @@ static int cpa_process_alias(struct cpa_ * No need to redo, when the primary call touched the direct * mapping already: */ - if (cpa->flags & CPA_PAGES_ARRAY) - vaddr = (unsigned long)page_address(cpa->pages[cpa->curpage]); - else if (cpa->flags & CPA_ARRAY) + if (cpa->flags & CPA_PAGES_ARRAY) { + struct page *page = cpa->pages[cpa->curpage]; + if (unlikely(PageHighMem(page))) + return 0; + vaddr = (unsigned long)page_address(page); + } else if (cpa->flags & CPA_ARRAY) vaddr = cpa->vaddr[cpa->curpage]; else vaddr = *cpa->vaddr; @@ -727,42 +735,55 @@ static int cpa_process_alias(struct cpa_ PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) { alias_cpa = *cpa; - temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); - alias_cpa.vaddr = &temp_cpa_vaddr; + alias_cpa.vaddr = &laddr; alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); - ret = __change_page_attr_set_clr(&alias_cpa, 0); + if (ret) + return ret; } #ifdef CONFIG_X86_64 - if (ret) - return ret; - /* - * No need to redo, when the primary call touched the high - * mapping already: - */ - if (within(vaddr, (unsigned long) _text, _brk_end)) - return 0; - /* - * If the physical address is inside the kernel map, we need + * If the primary call didn't touch the high mapping already + * and the physical address is inside the kernel map, we need * to touch the high mapped kernel as well: */ - if 
(!within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) - return 0; + if (!within(vaddr, (unsigned long)_text, _brk_end) && + within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) { + unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + + __START_KERNEL_map; + alias_cpa = *cpa; + alias_cpa.vaddr = &temp_cpa_vaddr; + alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); - alias_cpa = *cpa; - temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map; - alias_cpa.vaddr = &temp_cpa_vaddr; - alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); + /* + * The high mapping range is imprecise, so ignore the + * return value. + */ + __change_page_attr_set_clr(&alias_cpa, 0); + } +#endif /* - * The high mapping range is imprecise, so ignore the return value. - */ - __change_page_attr_set_clr(&alias_cpa, 0); -#endif - return ret; + * If the PMD page was partially used for per-cpu remapping, + * the recycled area needs to be split and modified. Because + * the area is always proper subset of a PMD page + * cpa->numpages is guaranteed to be 1 for these areas, so + * there's no need to loop over and check for further remaps. + */ + remapped = (unsigned long)pcpu_lpage_remapped((void *)laddr); + if (remapped) { + WARN_ON(cpa->numpages > 1); + alias_cpa = *cpa; + alias_cpa.vaddr = &remapped; + alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); + ret = __change_page_attr_set_clr(&alias_cpa, 0); + if (ret) + return ret; + } + + return 0; } static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) @@ -860,15 +881,6 @@ static int change_page_attr_set_clr(unsi vm_unmap_aliases(); - /* - * If we're called with lazy mmu updates enabled, the - * in-memory pte state may be stale. Flush pending updates to - * bring them up to date. 
- * - arch_flush_lazy_mmu_mode();*/ - if (arch_use_lazy_mmu_mode()) - xen_multicall_flush(true); - cpa.vaddr = addr; cpa.pages = pages; cpa.numpages = numpages; @@ -913,14 +925,6 @@ static int change_page_attr_set_clr(unsi } else cpa_flush_all(cache); - /* - * If we've been called with lazy mmu updates enabled, then - * make sure that everything gets flushed out before we - * return. - * - arch_flush_lazy_mmu_mode();*/ - WARN_ON_ONCE(arch_use_lazy_mmu_mode() && !irq_count()); - out: return ret; } @@ -1065,12 +1069,15 @@ EXPORT_SYMBOL(set_memory_array_uc); int _set_memory_wc(unsigned long addr, int numpages) { int ret; + unsigned long addr_copy = addr; + ret = change_page_attr_set(&addr, numpages, __pgprot(_PAGE_CACHE_UC_MINUS), 0); - if (!ret) { - ret = change_page_attr_set(&addr, numpages, - __pgprot(_PAGE_CACHE_WC), 0); + ret = change_page_attr_set_clr(&addr_copy, numpages, + __pgprot(_PAGE_CACHE_WC), + __pgprot(_PAGE_CACHE_MASK), + 0, 0, NULL); } return ret; } @@ -1187,7 +1194,9 @@ int set_pages_array_uc(struct page **pag int free_idx; for (i = 0; i < addrinarray; i++) { - start = (unsigned long)page_address(pages[i]); + if (PageHighMem(pages[i])) + continue; + start = page_to_pfn(pages[i]) << PAGE_SHIFT; end = start + PAGE_SIZE; if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL)) goto err_out; @@ -1200,7 +1209,9 @@ int set_pages_array_uc(struct page **pag err_out: free_idx = i; for (i = 0; i < free_idx; i++) { - start = (unsigned long)page_address(pages[i]); + if (PageHighMem(pages[i])) + continue; + start = page_to_pfn(pages[i]) << PAGE_SHIFT; end = start + PAGE_SIZE; free_memtype(start, end); } @@ -1229,7 +1240,9 @@ int set_pages_array_wb(struct page **pag return retval; for (i = 0; i < addrinarray; i++) { - start = (unsigned long)page_address(pages[i]); + if (PageHighMem(pages[i])) + continue; + start = page_to_pfn(pages[i]) << PAGE_SHIFT; end = start + PAGE_SIZE; free_memtype(start, end); } --- head-2011-03-17.orig/arch/x86/mm/pat-xen.c 2011-02-01 
14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/pat-xen.c 2011-02-01 14:50:44.000000000 +0100 @@ -639,7 +639,8 @@ static int reserve_pfn_range(u64 paddr, return ret; if (flags != want_flags) { - if (strict_prot || !is_new_memtype_allowed(want_flags, flags)) { + if (strict_prot || + !is_new_memtype_allowed(paddr, size, want_flags, flags)) { free_memtype(paddr, paddr + size); printk(KERN_ERR "%s:%d map pfn expected mapping type %s" " for %Lx-%Lx, got %s\n", --- head-2011-03-17.orig/arch/x86/mm/pgtable-xen.c 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/pgtable-xen.c 2011-02-01 14:50:44.000000000 +0100 @@ -8,9 +8,11 @@ #include #include +#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO + pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); + pte_t *pte = (pte_t *)__get_free_page(PGALLOC_GFP); if (pte) make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables); return pte; @@ -27,9 +29,9 @@ pgtable_t pte_alloc_one(struct mm_struct struct page *pte; #ifdef CONFIG_HIGHPTE - pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); + pte = alloc_pages(PGALLOC_GFP | __GFP_HIGHMEM, 0); #else - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); + pte = alloc_pages(PGALLOC_GFP, 0); #endif if (pte) { pgtable_page_ctor(pte); @@ -65,7 +67,7 @@ void __pte_free(pgtable_t pte) __free_page(pte); } -void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) +void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { pgtable_page_dtor(pte); paravirt_release_pte(page_to_pfn(pte)); @@ -83,7 +85,7 @@ pmd_t *pmd_alloc_one(struct mm_struct *m { struct page *pmd; - pmd = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); + pmd = alloc_pages(PGALLOC_GFP, 0); if (!pmd) return NULL; SetPageForeign(pmd, _pmd_free); @@ -107,14 +109,14 @@ void __pmd_free(pgtable_t pmd) __free_page(pmd); } -void 
__pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) +void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); tlb_remove_page(tlb, virt_to_page(pmd)); } #if PAGETABLE_LEVELS > 3 -void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) +void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); tlb_remove_page(tlb, virt_to_page(pud)); @@ -609,7 +611,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) pmd_t *pmds[PREALLOCATED_PMDS]; unsigned long flags; - pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER); + pgd = (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ORDER); if (pgd == NULL) goto out; --- head-2011-03-17.orig/arch/x86/pci/i386.c 2011-03-17 14:35:44.000000000 +0100 +++ head-2011-03-17/arch/x86/pci/i386.c 2011-02-01 14:50:44.000000000 +0100 @@ -239,12 +239,14 @@ void __init pcibios_resource_survey(void pcibios_allocate_resources(1); e820_reserve_resources_late(); +#ifndef CONFIG_XEN /* * Insert the IO APIC resources after PCI initialization has * occured to handle IO APICS that are mapped in on a BAR in * PCI space, but before trying to assign unassigned pci res. */ ioapic_insert_resources(); +#endif } /** --- head-2011-03-17.orig/arch/x86/pci/irq-xen.c 2011-02-01 14:42:26.000000000 +0100 +++ head-2011-03-17/arch/x86/pci/irq-xen.c 2011-02-01 14:50:44.000000000 +0100 @@ -895,6 +895,9 @@ static int pcibios_lookup_irq(struct pci return 0; } + if (io_apic_assign_pci_irqs) + return 0; + /* Find IRQ routing entry */ if (!pirq_table) @@ -1045,56 +1048,15 @@ static void __init pcibios_fixup_irqs(vo pirq_penalty[dev->irq]++; } + if (io_apic_assign_pci_irqs) + return; + dev = NULL; while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); if (!pin) continue; -#ifdef CONFIG_X86_IO_APIC - /* - * Recalculate IRQ numbers if we use the I/O APIC. 
- */ - if (io_apic_assign_pci_irqs) { - int irq; - - /* - * interrupt pins are numbered starting from 1 - */ - irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, - PCI_SLOT(dev->devfn), pin - 1); - /* - * Busses behind bridges are typically not listed in the - * MP-table. In this case we have to look up the IRQ - * based on the parent bus, parent slot, and pin number. - * The SMP code detects such bridged busses itself so we - * should get into this branch reliably. - */ - if (irq < 0 && dev->bus->parent) { - /* go back to the bridge */ - struct pci_dev *bridge = dev->bus->self; - int bus; - - pin = pci_swizzle_interrupt_pin(dev, pin); - bus = bridge->bus->number; - irq = IO_APIC_get_PCI_irq_vector(bus, - PCI_SLOT(bridge->devfn), pin - 1); - if (irq >= 0) - dev_warn(&dev->dev, - "using bridge %s INT %c to " - "get IRQ %d\n", - pci_name(bridge), - 'A' + pin - 1, irq); - } - if (irq >= 0) { - dev_info(&dev->dev, - "PCI->APIC IRQ transform: INT %c " - "-> IRQ %d\n", - 'A' + pin - 1, irq); - dev->irq = irq; - } - } -#endif /* * Still no IRQ? Try to lookup one... */ @@ -1189,6 +1151,19 @@ int __init pcibios_irq_init(void) pcibios_enable_irq = pirq_enable_irq; pcibios_fixup_irqs(); + + if (io_apic_assign_pci_irqs && pci_routeirq) { + struct pci_dev *dev = NULL; + /* + * PCI IRQ routing is set up by pci_enable_device(), but we + * also do it here in case there are still broken drivers that + * don't use pci_enable_device(). 
+ */ + printk(KERN_INFO "PCI: Routing PCI interrupts for all devices because \"pci=routeirq\" specified\n"); + for_each_pci_dev(dev) + pirq_enable_irq(dev); + } + return 0; } @@ -1219,16 +1194,23 @@ void pcibios_penalize_isa_irq(int irq, i static int pirq_enable_irq(struct pci_dev *dev) { u8 pin; - struct pci_dev *temp_dev; pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); - if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) { + if (pin && !pcibios_lookup_irq(dev, 1)) { char *msg = ""; + if (!io_apic_assign_pci_irqs && dev->irq) + return 0; + if (io_apic_assign_pci_irqs) { +#ifdef CONFIG_X86_IO_APIC + struct pci_dev *temp_dev; int irq; + struct io_apic_irq_attr irq_attr; - irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin - 1); + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, + PCI_SLOT(dev->devfn), + pin - 1, &irq_attr); /* * Busses behind bridges are typically not listed in the MP-table. * In this case we have to look up the IRQ based on the parent bus, @@ -1241,7 +1223,8 @@ static int pirq_enable_irq(struct pci_de pin = pci_swizzle_interrupt_pin(dev, pin); irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, - PCI_SLOT(bridge->devfn), pin - 1); + PCI_SLOT(bridge->devfn), + pin - 1, &irq_attr); if (irq >= 0) dev_warn(&dev->dev, "using bridge %s " "INT %c to get IRQ %d\n", @@ -1251,12 +1234,15 @@ static int pirq_enable_irq(struct pci_de } dev = temp_dev; if (irq >= 0) { + io_apic_set_pci_routing(&dev->dev, irq, + &irq_attr); + dev->irq = irq; dev_info(&dev->dev, "PCI->APIC IRQ transform: " "INT %c -> IRQ %d\n", 'A' + pin - 1, irq); - dev->irq = irq; return 0; } else msg = "; probably buggy MP table"; +#endif } else if (pci_probe & PCI_BIOS_IRQ_SCAN) msg = ""; else --- head-2011-03-17.orig/arch/x86/pci/pcifront.c 2011-02-01 14:42:26.000000000 +0100 +++ head-2011-03-17/arch/x86/pci/pcifront.c 2011-02-01 14:50:44.000000000 +0100 @@ -6,6 +6,7 @@ */ #include #include +#include #include #include #include @@ -15,6 +16,8 @@ static int 
pcifront_enable_irq(struct pc { u8 irq; pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &irq); + if (!irq_to_desc_alloc_node(irq, numa_node_id())) + return -ENOMEM; evtchn_register_pirq(irq); dev->irq = irq; --- head-2011-03-17.orig/arch/x86/vdso/vdso32-setup-xen.c 2011-02-01 14:42:26.000000000 +0100 +++ head-2011-03-17/arch/x86/vdso/vdso32-setup-xen.c 2011-02-01 14:50:44.000000000 +0100 @@ -377,6 +377,8 @@ int arch_setup_additional_pages(struct l } } + current->mm->context.vdso = (void *)addr; + if (compat_uses_vma || !compat) { /* * MAYWRITE to allow gdb to COW and set breakpoints @@ -397,11 +399,13 @@ int arch_setup_additional_pages(struct l goto up_fail; } - current->mm->context.vdso = (void *)addr; current_thread_info()->sysenter_return = VDSO32_SYMBOL(addr, SYSENTER_RETURN); up_fail: + if (ret) + current->mm->context.vdso = NULL; + up_write(&mm->mmap_sem); return ret; --- head-2011-03-17.orig/drivers/acpi/processor_driver.c 2011-02-01 14:39:24.000000000 +0100 +++ head-2011-03-17/drivers/acpi/processor_driver.c 2011-02-01 14:50:44.000000000 +0100 @@ -340,7 +340,14 @@ static int acpi_processor_get_info(struc * generated as the following format: * CPU+CPU ID. 
*/ - sprintf(acpi_device_bid(device), "CPU%X", pr->id); + if (pr->id != -1) + sprintf(acpi_device_bid(device), "CPU%X", pr->id); + else + snprintf(acpi_device_bid(device), + ARRAY_SIZE(acpi_device_bid(device)), + "#%0*X", + (int)ARRAY_SIZE(acpi_device_bid(device)) - 2, + pr->acpi_id); ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Processor [%d:%d]\n", pr->id, pr->acpi_id)); --- head-2011-03-17.orig/drivers/char/agp/intel-gtt.c 2011-03-11 10:53:08.000000000 +0100 +++ head-2011-03-17/drivers/char/agp/intel-gtt.c 2011-03-11 11:00:05.000000000 +0100 @@ -282,7 +282,11 @@ static struct agp_memory *alloc_agpphysm new->page_count = pg_count; new->num_scratch_pages = pg_count; new->type = AGP_PHYS_MEMORY; +#ifndef CONFIG_XEN new->physical = page_to_phys(new->pages[0]); +#else + new->physical = page_to_pseudophys(new->pages[0]); +#endif return new; } --- head-2011-03-17.orig/drivers/edac/Kconfig 2011-03-17 14:35:44.000000000 +0100 +++ head-2011-03-17/drivers/edac/Kconfig 2011-02-01 14:50:44.000000000 +0100 @@ -77,6 +77,7 @@ config EDAC_MCE config EDAC_AMD64 tristate "AMD64 (Opteron, Athlon64) K8, F10h" depends on EDAC_MM_EDAC && AMD_NB && X86_64 && EDAC_DECODE_MCE + depends on !XEN help Support for error detection and correction of DRAM ECC errors on the AMD64 families of memory controllers (K8 and F10h) --- head-2011-03-17.orig/drivers/gpu/drm/ttm/ttm_bo.c 2011-03-17 14:35:44.000000000 +0100 +++ head-2011-03-17/drivers/gpu/drm/ttm/ttm_bo.c 2011-02-01 14:50:44.000000000 +0100 @@ -1440,6 +1440,14 @@ int ttm_bo_global_init(struct drm_global ret = -ENOMEM; goto out_no_drp; } +#ifdef CONFIG_XEN + ret = xen_limit_pages_to_max_mfn(glob->dummy_read_page, 0, 32); + if (!ret) + clear_page(page_address(glob->dummy_read_page)); + else + printk(KERN_WARNING + "Error restricting dummy read page: %d\n", ret); +#endif INIT_LIST_HEAD(&glob->swap_lru); INIT_LIST_HEAD(&glob->device_list); --- head-2011-03-17.orig/drivers/gpu/drm/ttm/ttm_bo_vm.c 2011-03-17 14:35:44.000000000 +0100 +++ 
head-2011-03-17/drivers/gpu/drm/ttm/ttm_bo_vm.c 2011-03-02 11:54:22.000000000 +0100 @@ -169,7 +169,13 @@ static int ttm_bo_vm_fault(struct vm_are if (bo->mem.bus.is_iomem) { vma->vm_page_prot = ttm_io_prot(bo->mem.placement, vma->vm_page_prot); +#if defined(CONFIG_XEN) && defined(_PAGE_IOMAP) + pgprot_val(vma->vm_page_prot) |= _PAGE_IOMAP; +#endif } else { +#if defined(CONFIG_XEN) && defined(_PAGE_IOMAP) + pgprot_val(vma->vm_page_prot) &= ~_PAGE_IOMAP; +#endif ttm = bo->ttm; vma->vm_page_prot = (bo->mem.placement & TTM_PL_FLAG_CACHED) ? vm_get_page_prot(vma->vm_flags) : --- head-2011-03-17.orig/drivers/pci/msi-xen.c 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/drivers/pci/msi-xen.c 2011-02-01 14:50:44.000000000 +0100 @@ -54,22 +54,17 @@ int arch_msi_check_device(struct pci_dev } #endif -static void __msi_set_enable(struct pci_dev *dev, int pos, int enable) +static void msi_set_enable(struct pci_dev *dev, int pos, int enable) { u16 control; - if (pos) { - pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control); - control &= ~PCI_MSI_FLAGS_ENABLE; - if (enable) - control |= PCI_MSI_FLAGS_ENABLE; - pci_write_config_word(dev, pos + PCI_MSI_FLAGS, control); - } -} + BUG_ON(!pos); -static void msi_set_enable(struct pci_dev *dev, int enable) -{ - __msi_set_enable(dev, pci_find_capability(dev, PCI_CAP_ID_MSI), enable); + pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control); + control &= ~PCI_MSI_FLAGS_ENABLE; + if (enable) + control |= PCI_MSI_FLAGS_ENABLE; + pci_write_config_word(dev, pos + PCI_MSI_FLAGS, control); } static void msix_set_enable(struct pci_dev *dev, int enable) @@ -294,8 +289,11 @@ void pci_restore_msi_state(struct pci_de return; pci_intx_for_msi(dev, 0); - if (dev->msi_enabled) - msi_set_enable(dev, 0); + if (dev->msi_enabled) { + int pos = pci_find_capability(dev, PCI_CAP_ID_MSI); + + msi_set_enable(dev, pos, 0); + } if (dev->msix_enabled) msix_set_enable(dev, 0); @@ -322,9 +320,9 @@ static int msi_capability_init(struct pc int pos, pirq; 
u16 control; - msi_set_enable(dev, 0); /* Ensure msi is disabled as I set it up */ - pos = pci_find_capability(dev, PCI_CAP_ID_MSI); + msi_set_enable(dev, pos, 0); /* Disable MSI during set up */ + pci_read_config_word(dev, msi_control_reg(pos), &control); WARN_ON(nvec > 1); /* XXX */ @@ -334,7 +332,7 @@ static int msi_capability_init(struct pc /* Set MSI enabled bits */ pci_intx_for_msi(dev, 0); - msi_set_enable(dev, 1); + msi_set_enable(dev, pos, 1); dev->msi_enabled = 1; dev->irq = pirq; @@ -356,6 +354,7 @@ static int msix_capability_init(struct p { u64 table_base; int pirq, i, j, mapped, pos; + u16 control; struct msi_dev_list *msi_dev_entry = get_msi_dev_pirq_list(dev); struct msi_pirq_entry *pirq_entry; @@ -365,11 +364,24 @@ static int msix_capability_init(struct p msix_set_enable(dev, 0);/* Ensure msix is disabled as I set it up */ pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); + pci_read_config_word(dev, pos + PCI_MSIX_FLAGS, &control); + + /* Ensure MSI-X is disabled while it is set up */ + control &= ~PCI_MSIX_FLAGS_ENABLE; + pci_write_config_word(dev, pos + PCI_MSIX_FLAGS, control); + table_base = find_table_base(dev, pos); if (!table_base) return -ENODEV; - /* MSI-X Table Initialization */ + /* + * Some devices require MSI-X to be enabled before we can touch the + * MSI-X registers. We need to mask all the vectors to prevent + * interrupts coming in before they're fully set up. 
+ */ + control |= PCI_MSIX_FLAGS_MASKALL | PCI_MSIX_FLAGS_ENABLE; + pci_write_config_word(dev, pos + PCI_MSIX_FLAGS, control); + for (i = 0; i < nvec; i++) { mapped = 0; list_for_each_entry(pirq_entry, &msi_dev_entry->pirq_list_head, list) { @@ -406,10 +418,13 @@ static int msix_capability_init(struct p return avail; } + /* Set MSI-X enabled bits and unmask the function */ pci_intx_for_msi(dev, 0); - msix_set_enable(dev, 1); dev->msix_enabled = 1; + control &= ~PCI_MSIX_FLAGS_MASKALL; + pci_write_config_word(dev, pos + PCI_MSIX_FLAGS, control); + return 0; } @@ -531,7 +546,7 @@ EXPORT_SYMBOL(pci_enable_msi_block); extern void pci_frontend_disable_msi(struct pci_dev* dev); void pci_msi_shutdown(struct pci_dev *dev) { - int pirq; + int pirq, pos; struct msi_dev_list *msi_dev_entry = get_msi_dev_pirq_list(dev); if (!pci_msi_enable || !dev || !dev->msi_enabled) @@ -553,7 +568,8 @@ void pci_msi_shutdown(struct pci_dev *de msi_unmap_pirq(dev, pirq); /* Disable MSI mode */ - msi_set_enable(dev, 0); + pos = pci_find_capability(dev, PCI_CAP_ID_MSI); + msi_set_enable(dev, pos, 0); pci_intx_for_msi(dev, 1); dev->msi_enabled = 0; } @@ -593,8 +609,8 @@ int pci_msix_table_size(struct pci_dev * * indicates the successful configuration of MSI-X capability structure * with new allocated MSI-X irqs. A return of < 0 indicates a failure. * Or a return of > 0 indicates that driver request is exceeding the number - * of irqs available. Driver should use the returned value to re-send - * its request. + * of irqs or MSI-X vectors available. Driver should use the returned value to + * re-send its request. 
**/ extern int pci_frontend_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec); @@ -650,7 +666,7 @@ int pci_enable_msix(struct pci_dev* dev, nr_entries = pci_msix_table_size(dev); if (nvec > nr_entries) - return -EINVAL; + return nr_entries; /* Check for any invalid entries */ for (i = 0; i < nvec; i++) { --- head-2011-03-17.orig/drivers/staging/vt6655/ttype.h 2011-03-17 14:35:44.000000000 +0100 +++ head-2011-03-17/drivers/staging/vt6655/ttype.h 2010-06-22 15:50:05.000000000 +0200 @@ -30,6 +30,9 @@ #ifndef __TTYPE_H__ #define __TTYPE_H__ +#ifdef CONFIG_XEN +#include +#endif /******* Common definitions and typedefs ***********************************/ --- head-2011-03-17.orig/drivers/xen/Kconfig 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/drivers/xen/Kconfig 2011-02-02 15:37:23.000000000 +0100 @@ -371,7 +371,8 @@ config XEN_SCRUB_PAGES config XEN_DEV_EVTCHN tristate "Xen /dev/xen/evtchn device" - default y + depends on XEN || PARAVIRT_XEN + default PARAVIRT_XEN || XEN_PRIVILEGED_GUEST || m help The evtchn driver allows a userspace process to triger event channels and to receive notification of an event channel @@ -411,7 +412,7 @@ config XEN_COMPAT_XENFS config XEN_SYS_HYPERVISOR bool "Create xen entries under /sys/hypervisor" - depends on SYSFS + depends on PARAVIRT_XEN && SYSFS select SYS_HYPERVISOR default y help --- head-2011-03-17.orig/drivers/xen/Makefile 2011-02-24 14:09:54.000000000 +0100 +++ head-2011-03-17/drivers/xen/Makefile 2011-02-24 14:10:06.000000000 +0100 @@ -5,7 +5,6 @@ xen-balloon-$(CONFIG_PARAVIRT_XEN) := ba xen-balloon-$(CONFIG_XEN) := balloon/ obj-$(CONFIG_XEN) += core/ obj-$(CONFIG_XEN) += console/ -obj-$(CONFIG_XEN) += evtchn/ obj-y += xenbus/ obj-$(CONFIG_XEN) += char/ @@ -15,7 +14,9 @@ obj-$(CONFIG_XEN) += features.o $(xen- obj-$(CONFIG_HOTPLUG_CPU) += $(xen-hotplug-y) obj-$(CONFIG_XEN_XENCOMM) += xencomm.o obj-$(CONFIG_XEN_BALLOON) += $(xen-balloon-y) +obj-$(CONFIG_XEN_DEV_EVTCHN) += evtchn.o 
obj-$(CONFIG_XENFS) += xenfs/ +obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/ obj-$(CONFIG_XEN_BLKDEV_TAP) += blktap/ obj-$(CONFIG_XEN_BLKDEV_TAP2) += blktap2/ blktap2-new/ --- head-2011-03-17.orig/drivers/xen/balloon/balloon.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/drivers/xen/balloon/balloon.c 2011-02-01 14:50:44.000000000 +0100 @@ -321,7 +321,7 @@ static int increase_reservation(unsigned balloon_unlock(flags); #ifndef MODULE - setup_per_zone_pages_min(); + setup_per_zone_wmarks(); if (rc > 0) kswapd_run(0); if (need_zonelists_rebuild) --- head-2011-03-17.orig/drivers/xen/blkback/blkback.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-17/drivers/xen/blkback/blkback.c 2011-02-01 14:50:44.000000000 +0100 @@ -495,7 +495,7 @@ static void dispatch_rw_block_io(blkif_t for (i = 0; i < nseg; i++) { if (((int)preq.sector_number|(int)seg[i].nsec) & - ((bdev_hardsect_size(preq.bdev) >> 9) - 1)) { + ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) { DPRINTK("Misaligned I/O request from domain %d", blkif->domid); goto fail_put_bio; --- head-2011-03-17.orig/drivers/xen/blkback/vbd.c 2011-02-01 14:39:24.000000000 +0100 +++ head-2011-03-17/drivers/xen/blkback/vbd.c 2011-02-01 14:50:44.000000000 +0100 @@ -47,7 +47,7 @@ unsigned int vbd_info(struct vbd *vbd) unsigned long vbd_secsize(struct vbd *vbd) { - return bdev_hardsect_size(vbd->bdev); + return bdev_logical_block_size(vbd->bdev); } int vbd_create(blkif_t *blkif, blkif_vdev_t handle, unsigned major, --- head-2011-03-17.orig/drivers/xen/blkback/xenbus.c 2011-01-31 17:49:31.000000000 +0100 +++ head-2011-03-17/drivers/xen/blkback/xenbus.c 2011-02-01 14:50:44.000000000 +0100 @@ -108,7 +108,7 @@ static void update_blkif_status(blkif_t if (!get_device(_dev)) \ return ret; \ dev = to_xenbus_device(_dev); \ - if ((be = dev->dev.driver_data) != NULL) \ + if ((be = dev_get_drvdata(&dev->dev)) != NULL) \ ret = sprintf(buf, format, ##args); \ put_device(_dev); \ 
return ret; \ @@ -173,7 +173,7 @@ void xenvbd_sysfs_delif(struct xenbus_de static int blkback_remove(struct xenbus_device *dev) { - struct backend_info *be = dev->dev.driver_data; + struct backend_info *be = dev_get_drvdata(&dev->dev); DPRINTK(""); @@ -194,7 +194,7 @@ static int blkback_remove(struct xenbus_ } kfree(be); - dev->dev.driver_data = NULL; + dev_set_drvdata(&dev->dev, NULL); return 0; } @@ -229,7 +229,7 @@ static int blkback_probe(struct xenbus_d return -ENOMEM; } be->dev = dev; - dev->dev.driver_data = be; + dev_set_drvdata(&dev->dev, be); be->blkif = blkif_alloc(dev->otherend_id); if (IS_ERR(be->blkif)) { @@ -352,7 +352,7 @@ static void backend_changed(struct xenbu static void frontend_changed(struct xenbus_device *dev, enum xenbus_state frontend_state) { - struct backend_info *be = dev->dev.driver_data; + struct backend_info *be = dev_get_drvdata(&dev->dev); int err; DPRINTK("%s", xenbus_strstate(frontend_state)); --- head-2011-03-17.orig/drivers/xen/blkfront/blkfront.c 2011-02-01 14:39:24.000000000 +0100 +++ head-2011-03-17/drivers/xen/blkfront/blkfront.c 2011-02-01 14:50:44.000000000 +0100 @@ -119,12 +119,12 @@ static int blkfront_probe(struct xenbus_ /* Front end dir is a number, which is used as the id. 
*/ info->handle = simple_strtoul(strrchr(dev->nodename,'/')+1, NULL, 0); - dev->dev.driver_data = info; + dev_set_drvdata(&dev->dev, info); err = talk_to_backend(dev, info); if (err) { kfree(info); - dev->dev.driver_data = NULL; + dev_set_drvdata(&dev->dev, NULL); return err; } @@ -140,7 +140,7 @@ static int blkfront_probe(struct xenbus_ */ static int blkfront_resume(struct xenbus_device *dev) { - struct blkfront_info *info = dev->dev.driver_data; + struct blkfront_info *info = dev_get_drvdata(&dev->dev); int err; DPRINTK("blkfront_resume: %s\n", dev->nodename); @@ -265,7 +265,7 @@ fail: static void backend_changed(struct xenbus_device *dev, enum xenbus_state backend_state) { - struct blkfront_info *info = dev->dev.driver_data; + struct blkfront_info *info = dev_get_drvdata(&dev->dev); struct block_device *bd; DPRINTK("blkfront:backend_changed.\n"); @@ -433,7 +433,7 @@ static void blkfront_closing(struct blkf static int blkfront_remove(struct xenbus_device *dev) { - struct blkfront_info *info = dev->dev.driver_data; + struct blkfront_info *info = dev_get_drvdata(&dev->dev); DPRINTK("blkfront_remove: %s removed\n", dev->nodename); @@ -682,7 +682,7 @@ static int blkif_queue_request(struct re info->shadow[id].request = (unsigned long)req; ring_req->id = id; - ring_req->sector_number = (blkif_sector_t)req->sector; + ring_req->sector_number = (blkif_sector_t)blk_rq_pos(req); ring_req->handle = info->handle; ring_req->operation = rq_data_dir(req) ? @@ -738,25 +738,25 @@ void do_blkif_request(struct request_que queued = 0; - while ((req = elv_next_request(rq)) != NULL) { + while ((req = blk_peek_request(rq)) != NULL) { info = req->rq_disk->private_data; - if (!blk_fs_request(req)) { - end_request(req, 0); - continue; - } if (RING_FULL(&info->ring)) goto wait; - DPRINTK("do_blk_req %p: cmd %p, sec %llx, " - "(%u/%li) buffer:%p [%s]\n", - req, req->cmd, (long long)req->sector, - req->current_nr_sectors, - req->nr_sectors, req->buffer, - rq_data_dir(req) ? 
"write" : "read"); + blk_start_request(req); + if (!blk_fs_request(req)) { + __blk_end_request_all(req, -EIO); + continue; + } + + DPRINTK("do_blk_req %p: cmd %p, sec %llx, " + "(%u/%u) buffer:%p [%s]\n", + req, req->cmd, (long long)blk_rq_pos(req), + blk_rq_cur_sectors(req), blk_rq_sectors(req), + req->buffer, rq_data_dir(req) ? "write" : "read"); - blkdev_dequeue_request(req); if (blkif_queue_request(req)) { blk_requeue_request(rq, req); wait: @@ -822,8 +822,7 @@ static irqreturn_t blkif_int(int irq, vo DPRINTK("Bad return from blkdev data " "request: %x\n", bret->status); - ret = __blk_end_request(req, ret, blk_rq_bytes(req)); - BUG_ON(ret); + __blk_end_request_all(req, ret); break; default: BUG(); @@ -953,7 +952,7 @@ static int blkif_recover(struct blkfront int blkfront_is_ready(struct xenbus_device *dev) { - struct blkfront_info *info = dev->dev.driver_data; + struct blkfront_info *info = dev_get_drvdata(&dev->dev); return info->is_ready && info->xbdev; } --- head-2011-03-17.orig/drivers/xen/blkfront/vbd.c 2011-02-01 14:42:26.000000000 +0100 +++ head-2011-03-17/drivers/xen/blkfront/vbd.c 2011-02-01 14:50:44.000000000 +0100 @@ -310,7 +310,7 @@ xlvbd_init_blk_queue(struct gendisk *gd, #endif /* Hard sector size and max sectors impersonate the equiv. hardware. */ - blk_queue_hardsect_size(rq, sector_size); + blk_queue_logical_block_size(rq, sector_size); blk_queue_max_sectors(rq, 512); /* Each segment in a request is up to an aligned page in size. 
*/ @@ -499,7 +499,7 @@ static ssize_t show_media(struct device struct device_attribute *attr, char *buf) { struct xenbus_device *xendev = to_xenbus_device(dev); - struct blkfront_info *info = xendev->dev.driver_data; + struct blkfront_info *info = dev_get_drvdata(&xendev->dev); if (info->gd->flags & GENHD_FL_CD) return sprintf(buf, "cdrom\n"); --- head-2011-03-17.orig/drivers/xen/blktap/blktap.c 2011-02-17 10:11:18.000000000 +0100 +++ head-2011-03-17/drivers/xen/blktap/blktap.c 2011-02-17 10:16:17.000000000 +0100 @@ -279,6 +279,15 @@ static inline unsigned int OFFSET_TO_SEG } while(0) +static char *blktap_nodename(struct device *dev) +{ + return kasprintf(GFP_KERNEL, "xen/blktap%u", MINOR(dev->devt)); +} + +static struct device_type blktap_type = { + .nodename = blktap_nodename +}; + /****************************************************************** * BLKTAP VM OPS */ @@ -438,7 +447,6 @@ static const struct file_operations blkt static tap_blkif_t *get_next_free_dev(void) { - struct class *class; tap_blkif_t *info; int minor; @@ -501,9 +509,9 @@ found: wmb(); tapfds[minor] = info; - if ((class = get_xen_class()) != NULL) - device_create(class, NULL, MKDEV(blktap_major, minor), - NULL, "blktap%d", minor); + xen_class_device_create(&blktap_type, NULL, + MKDEV(blktap_major, minor), + NULL, "blktap%d", minor); } out: @@ -546,7 +554,8 @@ void signal_tapdisk(int idx) return; if (info->pid > 0) { - ptask = find_task_by_pid_ns(info->pid, info->pid_ns); + ptask = pid_task(find_pid_ns(info->pid, info->pid_ns), + PIDTYPE_PID); if (ptask) info->status = CLEANSHUTDOWN; } @@ -1700,7 +1709,6 @@ static void make_response(blkif_t *blkif static int __init blkif_init(void) { int i, ret; - struct class *class; if (!is_running_on_xen()) return -ENODEV; @@ -1736,7 +1744,7 @@ static int __init blkif_init(void) DPRINTK("Created misc_dev %d:0 [/dev/xen/blktap0]\n", ret); /* Make sure the xen class exists */ - if ((class = get_xen_class()) != NULL) { + if (get_xen_class()) { /* * This will 
allow udev to create the blktap ctrl device. * We only want to create blktap0 first. We don't want @@ -1744,8 +1752,9 @@ static int __init blkif_init(void) * We only create the device when a request of a new device is * made. */ - device_create(class, NULL, MKDEV(blktap_major, 0), NULL, - "blktap0"); + xen_class_device_create(&blktap_type, NULL, + MKDEV(blktap_major, 0), NULL, + "blktap0"); } else { /* this is bad, but not fatal */ WPRINTK("blktap: sysfs xen_class not created\n"); --- head-2011-03-17.orig/drivers/xen/blktap/xenbus.c 2011-01-31 17:56:27.000000000 +0100 +++ head-2011-03-17/drivers/xen/blktap/xenbus.c 2011-02-01 14:50:44.000000000 +0100 @@ -128,7 +128,7 @@ static int blktap_name(blkif_t *blkif, c if (!get_device(_dev)) \ return ret; \ dev = to_xenbus_device(_dev); \ - if ((be = dev->dev.driver_data) != NULL) \ + if ((be = dev_get_drvdata(&dev->dev)) != NULL) \ ret = sprintf(buf, format, ##args); \ put_device(_dev); \ return ret; \ @@ -158,7 +158,7 @@ static struct attribute_group tapstat_gr int xentap_sysfs_addif(struct xenbus_device *dev) { int err; - struct backend_info *be = dev->dev.driver_data; + struct backend_info *be = dev_get_drvdata(&dev->dev); err = sysfs_create_group(&dev->dev.kobj, &tapstat_group); if (!err) be->group_added = 1; @@ -167,14 +167,14 @@ int xentap_sysfs_addif(struct xenbus_dev void xentap_sysfs_delif(struct xenbus_device *dev) { - struct backend_info *be = dev->dev.driver_data; + struct backend_info *be = dev_get_drvdata(&dev->dev); sysfs_remove_group(&dev->dev.kobj, &tapstat_group); be->group_added = 0; } static int blktap_remove(struct xenbus_device *dev) { - struct backend_info *be = dev->dev.driver_data; + struct backend_info *be = dev_get_drvdata(&dev->dev); if (be->group_added) xentap_sysfs_delif(be->dev); @@ -192,7 +192,7 @@ static int blktap_remove(struct xenbus_d be->blkif = NULL; } kfree(be); - dev->dev.driver_data = NULL; + dev_set_drvdata(&dev->dev, NULL); return 0; } @@ -261,7 +261,7 @@ static int 
blktap_probe(struct xenbus_de } be->dev = dev; - dev->dev.driver_data = be; + dev_set_drvdata(&dev->dev, be); be->xenbus_id = get_id(dev->nodename); be->blkif = tap_alloc_blkif(dev->otherend_id); @@ -351,7 +351,7 @@ static void blkif_disconnect(blkif_t *bl static void tap_frontend_changed(struct xenbus_device *dev, enum xenbus_state frontend_state) { - struct backend_info *be = dev->dev.driver_data; + struct backend_info *be = dev_get_drvdata(&dev->dev); int err; DPRINTK("fe_changed(%s,%d)\n", dev->nodename, frontend_state); --- head-2011-03-17.orig/drivers/xen/blktap2/blktap.h 2011-01-31 18:07:35.000000000 +0100 +++ head-2011-03-17/drivers/xen/blktap2/blktap.h 2011-02-01 14:50:44.000000000 +0100 @@ -25,6 +25,8 @@ extern int blktap_debug_level; #define BTWARN(_f, _a...) BTPRINTK(0, KERN_WARNING, 0, _f, ##_a) #define BTERR(_f, _a...) BTPRINTK(0, KERN_ERR, 0, _f, ##_a) +#define BLKTAP2_DEV_DIR "xen/blktap-2/" + #define MAX_BLKTAP_DEVICE 256 #define BLKTAP_CONTROL 1 --- head-2011-03-17.orig/drivers/xen/blktap2/control.c 2011-01-31 17:56:27.000000000 +0100 +++ head-2011-03-17/drivers/xen/blktap2/control.c 2011-02-01 14:50:44.000000000 +0100 @@ -154,6 +154,7 @@ static const struct file_operations blkt static struct miscdevice blktap_misc = { .minor = MISC_DYNAMIC_MINOR, .name = "blktap-control", + .devnode = BLKTAP2_DEV_DIR "control", .fops = &blktap_control_file_operations, }; --- head-2011-03-17.orig/drivers/xen/blktap2/device.c 2011-02-07 14:14:26.000000000 +0100 +++ head-2011-03-17/drivers/xen/blktap2/device.c 2011-02-01 14:50:44.000000000 +0100 @@ -206,13 +206,6 @@ flush_tlb_kernel_page(unsigned long kvad #endif } -static void -blktap_device_end_dequeued_request(struct request *req, int ret) -{ - if (blk_end_request(req, ret, blk_rq_bytes(req))) - BUG(); -} - /* * tap->tap_sem held on entry */ @@ -378,7 +371,7 @@ blktap_device_fail_pending_requests(stru blktap_unmap(tap, request); req = (struct request *)(unsigned long)request->id; - 
blktap_device_end_dequeued_request(req, -ENODEV); + blk_end_request_all(req, -ENODEV); blktap_request_free(tap, request); } @@ -417,7 +410,7 @@ blktap_device_finish_request(struct blkt if (unlikely(res->status != BLKIF_RSP_OKAY)) BTERR("Bad return from device data " "request: %x\n", res->status); - blktap_device_end_dequeued_request(req, + blk_end_request_all(req, res->status == BLKIF_RSP_OKAY ? 0 : -EIO); break; default: @@ -647,7 +640,7 @@ blktap_device_process_request(struct blk ring = &tap->ring; usr_idx = request->usr_idx; blkif_req.id = usr_idx; - blkif_req.sector_number = (blkif_sector_t)req->sector; + blkif_req.sector_number = (blkif_sector_t)blk_rq_pos(req); blkif_req.handle = 0; blkif_req.operation = rq_data_dir(req) ? BLKIF_OP_WRITE : BLKIF_OP_READ; @@ -844,20 +837,22 @@ blktap_device_run_queue(struct blktap *t BTDBG("running queue for %d\n", tap->minor); - while ((req = elv_next_request(rq)) != NULL) { + while ((req = blk_peek_request(rq)) != NULL) { if (!blk_fs_request(req)) { - end_request(req, 0); + blk_start_request(req); + __blk_end_request_all(req, -EIO); continue; } if (blk_barrier_rq(req)) { - end_request(req, 0); + blk_start_request(req); + __blk_end_request_all(req, -EOPNOTSUPP); continue; } #ifdef ENABLE_PASSTHROUGH if (test_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse)) { - blkdev_dequeue_request(req); + blk_start_request(req); blktap_device_forward_request(tap, req); continue; } @@ -877,13 +872,13 @@ blktap_device_run_queue(struct blktap *t goto wait; } - BTDBG("req %p: dev %d cmd %p, sec 0x%llx, (0x%x/0x%lx) " + BTDBG("req %p: dev %d cmd %p, sec 0x%llx, (0x%x/0x%x) " "buffer:%p [%s], pending: %p\n", req, tap->minor, - req->cmd, (unsigned long long)req->sector, - req->current_nr_sectors, req->nr_sectors, req->buffer, + req->cmd, (unsigned long long)blk_rq_pos(req), + blk_rq_cur_sectors(req), blk_rq_sectors(req), req->buffer, rq_data_dir(req) ? 
"write" : "read", request); - blkdev_dequeue_request(req); + blk_start_request(req); spin_unlock_irq(&dev->lock); down_write(&tap->tap_sem); @@ -892,7 +887,7 @@ blktap_device_run_queue(struct blktap *t if (!err) queued++; else { - blktap_device_end_dequeued_request(req, err); + blk_end_request_all(req, err); blktap_request_free(tap, request); } @@ -932,11 +927,12 @@ blktap_device_do_request(struct request_ return; fail: - while ((req = elv_next_request(rq))) { + while ((req = blk_fetch_request(rq))) { BTERR("device closed: failing secs %llu - %llu\n", - (unsigned long long)req->sector, - (unsigned long long)req->sector + req->nr_sectors); - end_request(req, 0); + (unsigned long long)blk_rq_pos(req), + (unsigned long long)blk_rq_pos(req) + + blk_rq_cur_sectors(req)); + __blk_end_request_all(req, -EIO); } } @@ -991,7 +987,7 @@ blktap_device_configure(struct blktap *t set_capacity(dev->gd, tap->params.capacity); /* Hard sector size and max sectors impersonate the equiv. hardware. */ - blk_queue_hardsect_size(rq, tap->params.sector_size); + blk_queue_logical_block_size(rq, tap->params.sector_size); blk_queue_max_sectors(rq, 512); /* Each segment in a request is up to an aligned page in size. 
*/ @@ -1089,6 +1085,12 @@ blktap_device_destroy(struct blktap *tap return 0; } +static char *blktap_nodename(struct gendisk *gd) +{ + return kasprintf(GFP_KERNEL, BLKTAP2_DEV_DIR "tapdev%u", + gd->first_minor); +} + int blktap_device_create(struct blktap *tap) { @@ -1125,6 +1127,7 @@ blktap_device_create(struct blktap *tap) gd->major = blktap_device_major; gd->first_minor = minor; + gd->nodename = blktap_nodename; gd->fops = &blktap_device_file_operations; gd->private_data = dev; --- head-2011-03-17.orig/drivers/xen/blktap2/sysfs.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/drivers/xen/blktap2/sysfs.c 2011-02-01 14:50:44.000000000 +0100 @@ -436,6 +436,12 @@ blktap_sysfs_free(void) class_destroy(class); } +static char *blktap_nodename(struct device *dev) +{ + return kasprintf(GFP_KERNEL, BLKTAP2_DEV_DIR "blktap%u", + MINOR(dev->devt)); +} + int __init blktap_sysfs_init(void) { @@ -449,6 +455,8 @@ blktap_sysfs_init(void) if (IS_ERR(cls)) return PTR_ERR(cls); + cls->nodename = blktap_nodename; + err = class_create_file(cls, &class_attr_verbosity); if (!err) { err = class_create_file(cls, &class_attr_devices); --- head-2011-03-17.orig/drivers/xen/console/console.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/drivers/xen/console/console.c 2011-02-01 14:50:44.000000000 +0100 @@ -46,7 +46,6 @@ #include #include #include -#include #include #include #include @@ -236,7 +235,7 @@ static int __init xen_console_init(void) goto out; } - wbuf = alloc_bootmem(wbuf_size); + wbuf = kmalloc(wbuf_size, GFP_KERNEL); register_console(&kcons_info); @@ -632,8 +631,8 @@ static void xencons_close(struct tty_str tty->closing = 1; tty_wait_until_sent(tty, 0); tty_driver_flush_buffer(tty); - if (tty->ldisc.ops->flush_buffer != NULL) - tty->ldisc.ops->flush_buffer(tty); + if (tty->ldisc->ops->flush_buffer) + tty->ldisc->ops->flush_buffer(tty); tty->closing = 0; spin_lock_irqsave(&xencons_lock, flags); xencons_tty = NULL; --- head-2011-03-17.orig/drivers/xen/core/evtchn.c 
2011-02-09 13:57:45.000000000 +0100 +++ head-2011-03-17/drivers/xen/core/evtchn.c 2011-02-01 14:50:44.000000000 +0100 @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include @@ -138,6 +137,12 @@ static inline unsigned int type_from_irq return cfg ? cfg->info >> (32 - _IRQT_BITS) : IRQT_UNBOUND; } +unsigned int irq_from_evtchn(unsigned int port) +{ + return evtchn_to_irq[port]; +} +EXPORT_SYMBOL_GPL(irq_from_evtchn); + /* IRQ <-> VIRQ mapping. */ DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1}; @@ -328,6 +333,8 @@ asmlinkage void __irq_entry evtchn_do_up } do { + bool handled = false; + masked_l2 = l2 & ((~0UL) << l2i); if (masked_l2 == 0) break; @@ -338,13 +345,12 @@ asmlinkage void __irq_entry evtchn_do_up mask_evtchn(port); if ((irq = evtchn_to_irq[port]) != -1) { clear_evtchn(port); - if (!handle_irq(irq, regs) - && printk_ratelimit()) - pr_emerg("No handler for " - "irq %d (port %u)\n", - irq, port); - } else - evtchn_device_upcall(port); + handled = handle_irq(irq, regs); + } + if (!handled && printk_ratelimit()) + pr_emerg("No handler for irq %d" + " (port %u)\n", + irq, port); l2i = (l2i + 1) % BITS_PER_LONG; @@ -371,16 +377,26 @@ asmlinkage void __irq_entry evtchn_do_up set_irq_regs(old_regs); } -static int find_unbound_irq(unsigned int cpu, struct irq_chip *chip) +static int find_unbound_irq(unsigned int node, struct irq_chip *chip) { static int warned; int irq; for (irq = DYNIRQ_BASE; irq < nr_irqs; irq++) { - struct irq_desc *desc = irq_to_desc_alloc_cpu(irq, cpu); - struct irq_cfg *cfg = desc->chip_data; + struct irq_desc *desc; + struct irq_cfg *cfg; - if (!cfg->bindcount) { + desc = irq_to_desc(irq); + if (!desc) + desc = irq_to_desc_alloc_node(irq, node); + else if (desc->chip != &no_irq_chip && + desc->chip != &dynirq_chip) + continue; + if (!desc) + return -ENOMEM; + + cfg = desc->chip_data; + if (cfg && !cfg->bindcount) { desc->status |= IRQ_NOPROBE; set_irq_chip_and_handler_name(irq, chip, 
handle_fasteoi_irq, @@ -407,7 +423,7 @@ static int bind_caller_port_to_irq(unsig spin_lock(&irq_mapping_update_lock); if ((irq = evtchn_to_irq[caller_port]) == -1) { - if ((irq = find_unbound_irq(smp_processor_id(), &dynirq_chip)) < 0) + if ((irq = find_unbound_irq(numa_node_id(), &dynirq_chip)) < 0) goto out; evtchn_to_irq[caller_port] = irq; @@ -430,9 +446,8 @@ static int bind_local_port_to_irq(unsign BUG_ON(evtchn_to_irq[local_port] != -1); - if ((irq = find_unbound_irq(smp_processor_id(), &dynirq_chip)) < 0) { - struct evtchn_close close = { .port = local_port }; - if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close)) + if ((irq = find_unbound_irq(numa_node_id(), &dynirq_chip)) < 0) { + if (close_evtchn(local_port)) BUG(); goto out; } @@ -483,7 +498,8 @@ static int bind_virq_to_irq(unsigned int spin_lock(&irq_mapping_update_lock); if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) { - if ((irq = find_unbound_irq(cpu, &dynirq_chip)) < 0) + if ((irq = find_unbound_irq(cpu_to_node(cpu), + &dynirq_chip)) < 0) goto out; bind_virq.virq = virq; @@ -516,7 +532,8 @@ static int bind_ipi_to_irq(unsigned int spin_lock(&irq_mapping_update_lock); if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) { - if ((irq = find_unbound_irq(cpu, &dynirq_chip)) < 0) + if ((irq = find_unbound_irq(cpu_to_node(cpu), + &dynirq_chip)) < 0) goto out; bind_ipi.vcpu = cpu; @@ -542,16 +559,14 @@ static int bind_ipi_to_irq(unsigned int static void unbind_from_irq(unsigned int irq) { - struct evtchn_close close; unsigned int cpu; int evtchn = evtchn_from_irq(irq); spin_lock(&irq_mapping_update_lock); if (!--irq_cfg(irq)->bindcount && VALID_EVTCHN(evtchn)) { - close.port = evtchn; if ((type_from_irq(irq) != IRQT_CALLER_PORT) && - HYPERVISOR_event_channel_op(EVTCHNOP_close, &close)) + close_evtchn(evtchn)) BUG(); switch (type_from_irq(irq)) { @@ -732,9 +747,11 @@ static void rebind_irq_to_cpu(unsigned i rebind_evtchn_to_cpu(evtchn, tcpu); } -static void set_affinity_irq(unsigned int irq, const struct 
cpumask *dest) +static int set_affinity_irq(unsigned int irq, const struct cpumask *dest) { rebind_irq_to_cpu(irq, cpumask_first(dest)); + + return 0; } #endif @@ -908,7 +925,6 @@ static unsigned int startup_pirq(unsigne static void shutdown_pirq(unsigned int irq) { - struct evtchn_close close; int evtchn = evtchn_from_irq(irq); if (!VALID_EVTCHN(evtchn)) @@ -916,8 +932,7 @@ static void shutdown_pirq(unsigned int i mask_evtchn(evtchn); - close.port = evtchn; - if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) + if (close_evtchn(evtchn)) BUG(); bind_evtchn_to_cpu(evtchn, 0); @@ -1252,7 +1267,7 @@ int evtchn_map_pirq(int irq, int xen_pir if (irq < 0) { #ifdef CONFIG_SPARSE_IRQ spin_lock(&irq_mapping_update_lock); - irq = find_unbound_irq(smp_processor_id(), &pirq_chip); + irq = find_unbound_irq(numa_node_id(), &pirq_chip); if (irq >= 0) { struct irq_desc *desc; struct irq_cfg *cfg; @@ -1280,7 +1295,7 @@ int evtchn_map_pirq(int irq, int xen_pir if (identity_mapped_irq(irq)) continue; - desc = irq_to_desc_alloc_cpu(irq, smp_processor_id()); + desc = irq_to_desc_alloc_node(irq, numa_node_id()); cfg = desc->chip_data; if (!index_from_irq(irq)) { BUG_ON(type_from_irq(irq) != IRQT_UNBOUND); @@ -1340,8 +1355,9 @@ void __init xen_init_IRQ(void) #else i = nr_pirqs; #endif - pirq_needs_eoi = alloc_bootmem_pages(sizeof(unsigned long) - * BITS_TO_LONGS(ALIGN(i, PAGE_SIZE * 8))); + i = get_order(sizeof(unsigned long) * BITS_TO_LONGS(i)); + pirq_needs_eoi = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, i); + BUILD_BUG_ON(NR_PIRQS > PAGE_SIZE * 8); eoi_gmfn.gmfn = virt_to_machine(pirq_needs_eoi) >> PAGE_SHIFT; if (HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn, &eoi_gmfn) == 0) pirq_eoi_does_unmask = true; --- head-2011-03-17.orig/drivers/xen/core/smpboot.c 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/drivers/xen/core/smpboot.c 2011-02-01 14:50:44.000000000 +0100 @@ -40,9 +40,11 @@ EXPORT_PER_CPU_SYMBOL(cpu_info); static DEFINE_PER_CPU(int, resched_irq); 
static DEFINE_PER_CPU(int, callfunc_irq); static DEFINE_PER_CPU(int, call1func_irq); +static DEFINE_PER_CPU(int, reboot_irq); static char resched_name[NR_CPUS][15]; static char callfunc_name[NR_CPUS][15]; static char call1func_name[NR_CPUS][15]; +static char reboot_name[NR_CPUS][15]; void __init prefill_possible_map(void) { @@ -74,7 +76,7 @@ static int __cpuinit xen_smp_intr_init(u int rc; per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = - per_cpu(call1func_irq, cpu) = -1; + per_cpu(call1func_irq, cpu) = per_cpu(reboot_irq, cpu) = -1; sprintf(resched_name[cpu], "resched%u", cpu); rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR, @@ -109,6 +111,17 @@ static int __cpuinit xen_smp_intr_init(u goto fail; per_cpu(call1func_irq, cpu) = rc; + sprintf(reboot_name[cpu], "reboot%u", cpu); + rc = bind_ipi_to_irqhandler(REBOOT_VECTOR, + cpu, + smp_reboot_interrupt, + IRQF_DISABLED|IRQF_NOBALANCING, + reboot_name[cpu], + NULL); + if (rc < 0) + goto fail; + per_cpu(reboot_irq, cpu) = rc; + rc = xen_spinlock_init(cpu); if (rc < 0) goto fail; @@ -125,6 +138,8 @@ static int __cpuinit xen_smp_intr_init(u unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); if (per_cpu(call1func_irq, cpu) >= 0) unbind_from_irqhandler(per_cpu(call1func_irq, cpu), NULL); + if (per_cpu(reboot_irq, cpu) >= 0) + unbind_from_irqhandler(per_cpu(reboot_irq, cpu), NULL); xen_spinlock_cleanup(cpu); return rc; } @@ -138,6 +153,7 @@ static void __cpuinit xen_smp_intr_exit( unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); unbind_from_irqhandler(per_cpu(call1func_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(reboot_irq, cpu), NULL); xen_spinlock_cleanup(cpu); } #endif --- head-2011-03-17.orig/drivers/xen/evtchn.c 2011-03-17 14:35:44.000000000 +0100 +++ head-2011-03-17/drivers/xen/evtchn.c 2011-02-01 14:50:44.000000000 +0100 @@ -48,10 +48,17 @@ #include #include +#ifdef CONFIG_PARAVIRT_XEN #include #include #include #include +#else 
+#include +#include +#define xen_domain() is_running_on_xen() +#define bind_evtchn_to_irqhandler bind_caller_port_to_irqhandler +#endif struct per_user_data { struct mutex bind_mutex; /* serialize bind/unbind operations */ @@ -278,6 +285,9 @@ static void evtchn_unbind_from_user(stru int irq = irq_from_evtchn(port); unbind_from_irqhandler(irq, (void *)(unsigned long)port); +#ifdef CONFIG_XEN + WARN_ON(close_evtchn(port)); +#endif set_port_user(port, NULL); } @@ -450,7 +460,8 @@ static int evtchn_open(struct inode *ino if (u == NULL) return -ENOMEM; - u->name = kasprintf(GFP_KERNEL, "evtchn:%s", current->comm); + u->name = kasprintf(GFP_KERNEL, "evtchn:%s[%d]", + current->comm, current->pid); if (u->name == NULL) { kfree(u); return -ENOMEM; @@ -519,6 +530,7 @@ static const struct file_operations evtc static struct miscdevice evtchn_miscdev = { .minor = MISC_DYNAMIC_MINOR, .name = "xen/evtchn", + .devnode = "xen/evtchn", .fops = &evtchn_fops, }; static int __init evtchn_init(void) @@ -534,10 +546,10 @@ static int __init evtchn_init(void) spin_lock_init(&port_user_lock); - /* Create '/dev/misc/evtchn'. */ + /* Create '/dev/xen/evtchn'. 
*/ err = misc_register(&evtchn_miscdev); if (err != 0) { - printk(KERN_ALERT "Could not register /dev/misc/evtchn\n"); + pr_alert("Could not register /dev/xen/evtchn\n"); return err; } --- head-2011-03-17.orig/drivers/xen/fbfront/xenfb.c 2011-02-17 10:11:23.000000000 +0100 +++ head-2011-03-17/drivers/xen/fbfront/xenfb.c 2011-02-17 10:16:12.000000000 +0100 @@ -597,7 +597,7 @@ static int __devinit xenfb_probe(struct fb_size = XENFB_DEFAULT_FB_LEN; } - dev->dev.driver_data = info; + dev_set_drvdata(&dev->dev, info); info->xbdev = dev; info->irq = -1; info->x1 = info->y1 = INT_MAX; @@ -701,7 +701,7 @@ static int __devinit xenfb_probe(struct static int xenfb_resume(struct xenbus_device *dev) { - struct xenfb_info *info = dev->dev.driver_data; + struct xenfb_info *info = dev_get_drvdata(&dev->dev); xenfb_disconnect_backend(info); xenfb_init_shared_page(info, info->fb_info); @@ -710,7 +710,7 @@ static int xenfb_resume(struct xenbus_de static int xenfb_remove(struct xenbus_device *dev) { - struct xenfb_info *info = dev->dev.driver_data; + struct xenfb_info *info = dev_get_drvdata(&dev->dev); del_timer(&info->refresh); if (info->kthread) @@ -819,7 +819,7 @@ static void xenfb_disconnect_backend(str static void xenfb_backend_changed(struct xenbus_device *dev, enum xenbus_state backend_state) { - struct xenfb_info *info = dev->dev.driver_data; + struct xenfb_info *info = dev_get_drvdata(&dev->dev); int val; switch (backend_state) { --- head-2011-03-17.orig/drivers/xen/fbfront/xenkbd.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/drivers/xen/fbfront/xenkbd.c 2011-02-01 14:50:44.000000000 +0100 @@ -113,7 +113,7 @@ int __devinit xenkbd_probe(struct xenbus xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure"); return -ENOMEM; } - dev->dev.driver_data = info; + dev_set_drvdata(&dev->dev, info); info->xbdev = dev; snprintf(info->phys, sizeof(info->phys), "xenbus/%s", dev->nodename); @@ -186,7 +186,7 @@ int __devinit xenkbd_probe(struct xenbus static int 
xenkbd_resume(struct xenbus_device *dev) { - struct xenkbd_info *info = dev->dev.driver_data; + struct xenkbd_info *info = dev_get_drvdata(&dev->dev); xenkbd_disconnect_backend(info); info->page->in_cons = info->page->in_prod = 0; @@ -196,7 +196,7 @@ static int xenkbd_resume(struct xenbus_d static int xenkbd_remove(struct xenbus_device *dev) { - struct xenkbd_info *info = dev->dev.driver_data; + struct xenkbd_info *info = dev_get_drvdata(&dev->dev); xenkbd_disconnect_backend(info); input_unregister_device(info->kbd); @@ -262,7 +262,7 @@ static void xenkbd_disconnect_backend(st static void xenkbd_backend_changed(struct xenbus_device *dev, enum xenbus_state backend_state) { - struct xenkbd_info *info = dev->dev.driver_data; + struct xenkbd_info *info = dev_get_drvdata(&dev->dev); int ret, val; switch (backend_state) { --- head-2011-03-17.orig/drivers/xen/gntdev/gntdev.c 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/drivers/xen/gntdev/gntdev.c 2011-02-01 14:50:44.000000000 +0100 @@ -371,10 +371,18 @@ nomem_out: /* Interface functions. */ +static char *gntdev_nodename(struct device *dev) +{ + return kstrdup("xen/" GNTDEV_NAME, GFP_KERNEL); +} + +static struct device_type gntdev_type = { + .nodename = gntdev_nodename +}; + /* Initialises the driver. Called when the module is loaded. */ static int __init gntdev_init(void) { - struct class *class; struct device *device; if (!is_running_on_xen()) { @@ -393,14 +401,9 @@ static int __init gntdev_init(void) * device, and output the major number so that the device can be * created manually using mknod. 
*/ - if ((class = get_xen_class()) == NULL) { - pr_err("Error setting up xen_class\n"); - pr_err("gntdev created, major number = %d\n", gntdev_major); - return 0; - } - - device = device_create(class, NULL, MKDEV(gntdev_major, 0), - NULL, GNTDEV_NAME); + device = xen_class_device_create(&gntdev_type, NULL, + MKDEV(gntdev_major, 0), + NULL, GNTDEV_NAME); if (IS_ERR(device)) { pr_err("Error creating gntdev device in xen_class\n"); pr_err("gntdev created, major number = %d\n", gntdev_major); --- head-2011-03-17.orig/drivers/xen/netback/accel.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-03-17/drivers/xen/netback/accel.c 2011-02-01 14:50:44.000000000 +0100 @@ -103,7 +103,7 @@ static int netback_accelerator_probe_bac struct xenbus_device *xendev = to_xenbus_device(dev); if (!strcmp("vif", xendev->devicetype)) { - struct backend_info *be = xendev->dev.driver_data; + struct backend_info *be = dev_get_drvdata(&xendev->dev); if (match_accelerator(xendev, be, accelerator) && try_module_get(accelerator->hooks->owner)) { @@ -124,7 +124,7 @@ static int netback_accelerator_remove_ba (struct netback_accelerator *)arg; if (!strcmp("vif", xendev->devicetype)) { - struct backend_info *be = xendev->dev.driver_data; + struct backend_info *be = dev_get_drvdata(&xendev->dev); if (be->accelerator == accelerator) { be->accelerator->hooks->remove(xendev); --- head-2011-03-17.orig/drivers/xen/netback/loopback.c 2011-03-01 11:52:05.000000000 +0100 +++ head-2011-03-17/drivers/xen/netback/loopback.c 2011-02-01 14:50:44.000000000 +0100 @@ -139,8 +139,8 @@ static int loopback_start_xmit(struct sk return 0; } - dst_release(skb->dst); - skb->dst = NULL; + dst_release(skb_dst(skb)); + skb_dst_set(skb, NULL); skb_orphan(skb); --- head-2011-03-17.orig/drivers/xen/netback/xenbus.c 2011-01-31 17:56:27.000000000 +0100 +++ head-2011-03-17/drivers/xen/netback/xenbus.c 2011-02-01 14:50:44.000000000 +0100 @@ -38,7 +38,7 @@ static void netback_disconnect(struct de static int netback_remove(struct 
xenbus_device *dev) { - struct backend_info *be = dev->dev.driver_data; + struct backend_info *be = dev_get_drvdata(&dev->dev); netback_remove_accelerators(be, dev); @@ -49,7 +49,7 @@ static int netback_remove(struct xenbus_ static void netback_disconnect(struct device *xbdev_dev, int clear) { - struct backend_info *be = xbdev_dev->driver_data; + struct backend_info *be = dev_get_drvdata(xbdev_dev); if (be->netif) kobject_uevent(&xbdev_dev->kobj, KOBJ_OFFLINE); @@ -60,7 +60,7 @@ static void netback_disconnect(struct de be->netif = NULL; } if (clear) - xbdev_dev->driver_data = NULL; + dev_set_drvdata(xbdev_dev, NULL); up_write(&teardown_sem); } @@ -84,7 +84,7 @@ static int netback_probe(struct xenbus_d } be->dev = dev; - dev->dev.driver_data = be; + dev_set_drvdata(&dev->dev, be); sg = 1; if (netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB) @@ -181,7 +181,7 @@ static int netback_uevent(struct xenbus_ kfree(val); down_read(&teardown_sem); - be = xdev->dev.driver_data; + be = dev_get_drvdata(&xdev->dev); if (be && be->netif) add_uevent_var(env, "vif=%s", be->netif->dev->name); up_read(&teardown_sem); @@ -224,7 +224,7 @@ static void backend_create_netif(struct static void frontend_changed(struct xenbus_device *dev, enum xenbus_state frontend_state) { - struct backend_info *be = dev->dev.driver_data; + struct backend_info *be = dev_get_drvdata(&dev->dev); DPRINTK("%s", xenbus_strstate(frontend_state)); --- head-2011-03-17.orig/drivers/xen/netfront/netfront.c 2011-02-09 16:04:51.000000000 +0100 +++ head-2011-03-17/drivers/xen/netfront/netfront.c 2011-02-01 14:50:44.000000000 +0100 @@ -256,7 +256,7 @@ static int __devinit netfront_probe(stru } info = netdev_priv(netdev); - dev->dev.driver_data = info; + dev_set_drvdata(&dev->dev, info); err = register_netdev(info->netdev); if (err) { @@ -277,13 +277,13 @@ static int __devinit netfront_probe(stru fail: free_netdev(netdev); - dev->dev.driver_data = NULL; + dev_set_drvdata(&dev->dev, NULL); return err; } static int __devexit 
netfront_remove(struct xenbus_device *dev) { - struct netfront_info *info = dev->dev.driver_data; + struct netfront_info *info = dev_get_drvdata(&dev->dev); DPRINTK("%s\n", dev->nodename); @@ -305,14 +305,14 @@ static int __devexit netfront_remove(str static int netfront_suspend(struct xenbus_device *dev) { - struct netfront_info *info = dev->dev.driver_data; + struct netfront_info *info = dev_get_drvdata(&dev->dev); return netfront_accelerator_suspend(info, dev); } static int netfront_suspend_cancel(struct xenbus_device *dev) { - struct netfront_info *info = dev->dev.driver_data; + struct netfront_info *info = dev_get_drvdata(&dev->dev); return netfront_accelerator_suspend_cancel(info, dev); } @@ -325,7 +325,7 @@ static int netfront_suspend_cancel(struc */ static int netfront_resume(struct xenbus_device *dev) { - struct netfront_info *info = dev->dev.driver_data; + struct netfront_info *info = dev_get_drvdata(&dev->dev); DPRINTK("%s\n", dev->nodename); @@ -530,7 +530,7 @@ static int setup_device(struct xenbus_de static void backend_changed(struct xenbus_device *dev, enum xenbus_state backend_state) { - struct netfront_info *np = dev->dev.driver_data; + struct netfront_info *np = dev_get_drvdata(&dev->dev); struct net_device *netdev = np->netdev; DPRINTK("%s\n", xenbus_strstate(backend_state)); --- head-2011-03-17.orig/drivers/xen/pciback/xenbus.c 2011-01-31 17:56:27.000000000 +0100 +++ head-2011-03-17/drivers/xen/pciback/xenbus.c 2011-02-01 14:50:44.000000000 +0100 @@ -24,7 +24,7 @@ static struct pciback_device *alloc_pdev dev_dbg(&xdev->dev, "allocated pdev @ 0x%p\n", pdev); pdev->xdev = xdev; - xdev->dev.driver_data = pdev; + dev_set_drvdata(&xdev->dev, pdev); spin_lock_init(&pdev->dev_lock); @@ -74,7 +74,7 @@ static void free_pdev(struct pciback_dev pciback_release_devices(pdev); - pdev->xdev->dev.driver_data = NULL; + dev_set_drvdata(&pdev->xdev->dev, NULL); pdev->xdev = NULL; kfree(pdev); @@ -475,7 +475,7 @@ static int pciback_reconfigure(struct pc static 
void pciback_frontend_changed(struct xenbus_device *xdev, enum xenbus_state fe_state) { - struct pciback_device *pdev = xdev->dev.driver_data; + struct pciback_device *pdev = dev_get_drvdata(&xdev->dev); dev_dbg(&xdev->dev, "fe state changed %d\n", fe_state); @@ -668,7 +668,7 @@ static int pciback_xenbus_probe(struct x static int pciback_xenbus_remove(struct xenbus_device *dev) { - struct pciback_device *pdev = dev->dev.driver_data; + struct pciback_device *pdev = dev_get_drvdata(&dev->dev); if (pdev != NULL) free_pdev(pdev); --- head-2011-03-17.orig/drivers/xen/pcifront/pci_op.c 2011-01-31 17:56:27.000000000 +0100 +++ head-2011-03-17/drivers/xen/pcifront/pci_op.c 2011-02-01 14:50:44.000000000 +0100 @@ -416,7 +416,7 @@ void pci_frontend_disable_msi(struct pci #endif /* CONFIG_PCI_MSI */ /* Claim resources for the PCI frontend as-is, backend won't allow changes */ -static void pcifront_claim_resource(struct pci_dev *dev, void *data) +static int pcifront_claim_resource(struct pci_dev *dev, void *data) { struct pcifront_device *pdev = data; int i; @@ -431,6 +431,8 @@ static void pcifront_claim_resource(stru pci_claim_resource(dev, i); } } + + return 0; } int __devinit pcifront_scan_root(struct pcifront_device *pdev, --- head-2011-03-17.orig/drivers/xen/pcifront/xenbus.c 2011-01-31 17:32:29.000000000 +0100 +++ head-2011-03-17/drivers/xen/pcifront/xenbus.c 2011-02-01 14:50:44.000000000 +0100 @@ -34,7 +34,7 @@ static struct pcifront_device *alloc_pde /*Flag for registering PV AER handler*/ set_bit(_XEN_PCIB_AERHANDLER, (void*)&pdev->sh_info->flags); - xdev->dev.driver_data = pdev; + dev_set_drvdata(&xdev->dev, pdev); pdev->xdev = xdev; INIT_LIST_HEAD(&pdev->root_buses); @@ -75,7 +75,7 @@ static void free_pdev(struct pcifront_de else free_page((unsigned long)pdev->sh_info); - pdev->xdev->dev.driver_data = NULL; + dev_set_drvdata(&pdev->xdev->dev, NULL); kfree(pdev); } @@ -394,7 +394,7 @@ static int pcifront_detach_devices(struc static void __init_refok 
pcifront_backend_changed(struct xenbus_device *xdev, enum xenbus_state be_state) { - struct pcifront_device *pdev = xdev->dev.driver_data; + struct pcifront_device *pdev = dev_get_drvdata(&xdev->dev); switch (be_state) { case XenbusStateUnknown: @@ -446,8 +446,8 @@ static int pcifront_xenbus_probe(struct static int pcifront_xenbus_remove(struct xenbus_device *xdev) { - if (xdev->dev.driver_data) - free_pdev(xdev->dev.driver_data); + if (dev_get_drvdata(&xdev->dev)) + free_pdev(dev_get_drvdata(&xdev->dev)); return 0; } --- head-2011-03-17.orig/drivers/xen/scsiback/scsiback.c 2011-01-31 17:56:27.000000000 +0100 +++ head-2011-03-17/drivers/xen/scsiback/scsiback.c 2011-02-01 14:50:44.000000000 +0100 @@ -224,7 +224,7 @@ static void scsiback_cmd_done(struct req int errors; sense_buffer = req->sense; - resid = req->data_len; + resid = blk_rq_bytes(req); errors = req->errors; if (errors != 0) { @@ -331,21 +331,6 @@ fail_flush: return -ENOMEM; } -/* quoted scsi_lib.c/scsi_merge_bio */ -static int scsiback_merge_bio(struct request *rq, struct bio *bio) -{ - struct request_queue *q = rq->q; - - bio->bi_flags &= ~(1 << BIO_SEG_VALID); - if (rq_data_dir(rq) == WRITE) - bio->bi_rw |= (1 << BIO_RW); - - blk_queue_bounce(q, &bio); - - return blk_rq_append_bio(q, rq, bio); -} - - /* quoted scsi_lib.c/scsi_bi_endio */ static void scsiback_bi_endio(struct bio *bio, int error) { @@ -355,29 +340,28 @@ static void scsiback_bi_endio(struct bio /* quoted scsi_lib.c/scsi_req_map_sg . 
*/ -static int request_map_sg(struct request *rq, pending_req_t *pending_req, unsigned int count) +static struct bio *request_map_sg(pending_req_t *pending_req) { - struct request_queue *q = rq->q; - int nr_pages; - unsigned int nsegs = count; - unsigned int data_len = 0, len, bytes, off; + struct request_queue *q = pending_req->sdev->request_queue; + unsigned int nsegs = (unsigned int)pending_req->nr_segments; + unsigned int i, len, bytes, off, nr_pages, nr_vecs = 0; struct scatterlist *sg; struct page *page; - struct bio *bio = NULL; - int i, err, nr_vecs = 0; + struct bio *bio = NULL, *bio_first = NULL, *bio_last = NULL; + int err; for_each_sg (pending_req->sgl, sg, nsegs, i) { page = sg_page(sg); off = sg->offset; len = sg->length; - data_len += len; nr_pages = (len + off + PAGE_SIZE - 1) >> PAGE_SHIFT; while (len > 0) { bytes = min_t(unsigned int, len, PAGE_SIZE - off); if (!bio) { - nr_vecs = min_t(int, BIO_MAX_PAGES, nr_pages); + nr_vecs = min_t(unsigned int, BIO_MAX_PAGES, + nr_pages); nr_pages -= nr_vecs; bio = bio_alloc(GFP_KERNEL, nr_vecs); if (!bio) { @@ -385,6 +369,11 @@ static int request_map_sg(struct request goto free_bios; } bio->bi_end_io = scsiback_bi_endio; + if (bio_last) + bio_last->bi_next = bio; + else + bio_first = bio; + bio_last = bio; } if (bio_add_pc_page(q, bio, page, bytes, off) != @@ -395,11 +384,9 @@ static int request_map_sg(struct request } if (bio->bi_vcnt >= nr_vecs) { - err = scsiback_merge_bio(rq, bio); - if (err) { - bio_endio(bio, 0); - goto free_bios; - } + bio->bi_flags &= ~(1 << BIO_SEG_VALID); + if (pending_req->sc_data_direction == WRITE) + bio->bi_rw |= (1 << BIO_RW); bio = NULL; } @@ -409,21 +396,15 @@ static int request_map_sg(struct request } } - rq->buffer = rq->data = NULL; - rq->data_len = data_len; - - return 0; + return bio_first; free_bios: - while ((bio = rq->bio) != NULL) { - rq->bio = bio->bi_next; - /* - * call endio instead of bio_put incase it was bounced - */ - bio_endio(bio, 0); + while ((bio = 
bio_first) != NULL) { + bio_first = bio->bi_next; + bio_put(bio); } - return err; + return ERR_PTR(err); } @@ -431,7 +412,6 @@ void scsiback_cmd_exec(pending_req_t *pe { int cmd_len = (int)pending_req->cmd_len; int data_dir = (int)pending_req->sc_data_direction; - unsigned int nr_segments = (unsigned int)pending_req->nr_segments; unsigned int timeout; struct request *rq; int write; @@ -445,7 +425,30 @@ void scsiback_cmd_exec(pending_req_t *pe timeout = VSCSIIF_TIMEOUT; write = (data_dir == DMA_TO_DEVICE); - rq = blk_get_request(pending_req->sdev->request_queue, write, GFP_KERNEL); + if (pending_req->nr_segments) { + struct bio *bio = request_map_sg(pending_req); + + if (IS_ERR(bio)) { + pr_err("scsiback: SG Request Map Error\n"); + return; + } + + rq = blk_make_request(pending_req->sdev->request_queue, bio, + GFP_KERNEL); + if (IS_ERR(rq)) { + pr_err("scsiback: Make Request Error\n"); + return; + } + + rq->buffer = NULL; + } else { + rq = blk_get_request(pending_req->sdev->request_queue, write, + GFP_KERNEL); + if (unlikely(!rq)) { + pr_err("scsiback: Get Request Error\n"); + return; + } + } rq->cmd_type = REQ_TYPE_BLOCK_PC; rq->cmd_len = cmd_len; @@ -460,14 +463,6 @@ void scsiback_cmd_exec(pending_req_t *pe rq->timeout = timeout; rq->end_io_data = pending_req; - if (nr_segments) { - - if (request_map_sg(rq, pending_req, nr_segments)) { - pr_err("scsiback: SG Request Map Error\n"); - return; - } - } - scsiback_get(pending_req->info); blk_execute_rq_nowait(rq->q, NULL, rq, 1, scsiback_cmd_done); --- head-2011-03-17.orig/drivers/xen/scsiback/xenbus.c 2011-01-31 17:56:27.000000000 +0100 +++ head-2011-03-17/drivers/xen/scsiback/xenbus.c 2011-02-01 14:50:44.000000000 +0100 @@ -226,7 +226,7 @@ static void scsiback_do_lun_hotplug(stru static void scsiback_frontend_changed(struct xenbus_device *dev, enum xenbus_state frontend_state) { - struct backend_info *be = dev->dev.driver_data; + struct backend_info *be = dev_get_drvdata(&dev->dev); int err; switch (frontend_state) { 
@@ -283,7 +283,7 @@ static void scsiback_frontend_changed(st static int scsiback_remove(struct xenbus_device *dev) { - struct backend_info *be = dev->dev.driver_data; + struct backend_info *be = dev_get_drvdata(&dev->dev); if (be->info) { scsiback_disconnect(be->info); @@ -293,7 +293,7 @@ static int scsiback_remove(struct xenbus } kfree(be); - dev->dev.driver_data = NULL; + dev_set_drvdata(&dev->dev, NULL); return 0; } @@ -316,7 +316,7 @@ static int scsiback_probe(struct xenbus_ return -ENOMEM; } be->dev = dev; - dev->dev.driver_data = be; + dev_set_drvdata(&dev->dev, be); be->info = vscsibk_info_alloc(dev->otherend_id); if (IS_ERR(be->info)) { --- head-2011-03-17.orig/drivers/xen/scsifront/xenbus.c 2011-02-08 10:04:06.000000000 +0100 +++ head-2011-03-17/drivers/xen/scsifront/xenbus.c 2011-02-08 10:05:30.000000000 +0100 @@ -189,7 +189,7 @@ static int scsifront_probe(struct xenbus info->host = host; - dev->dev.driver_data = info; + dev_set_drvdata(&dev->dev, info); info->dev = dev; for (i = 0; i < VSCSIIF_MAX_REQS; i++) { @@ -243,7 +243,7 @@ free_sring: static int scsifront_remove(struct xenbus_device *dev) { - struct vscsifrnt_info *info = dev->dev.driver_data; + struct vscsifrnt_info *info = dev_get_drvdata(&dev->dev); DPRINTK("%s: %s removed\n",__FUNCTION__ ,dev->nodename); @@ -355,7 +355,7 @@ static void scsifront_do_lun_hotplug(str static void scsifront_backend_changed(struct xenbus_device *dev, enum xenbus_state backend_state) { - struct vscsifrnt_info *info = dev->dev.driver_data; + struct vscsifrnt_info *info = dev_get_drvdata(&dev->dev); DPRINTK("%p %u %u\n", dev, dev->state, backend_state); --- head-2011-03-17.orig/drivers/xen/sfc_netback/accel_xenbus.c 2011-01-31 17:29:16.000000000 +0100 +++ head-2011-03-17/drivers/xen/sfc_netback/accel_xenbus.c 2011-02-01 14:50:44.000000000 +0100 @@ -36,7 +36,7 @@ #define NODENAME_PATH_FMT "backend/vif/%d/%d" #define NETBACK_ACCEL_FROM_XENBUS_DEVICE(_dev) (struct netback_accel *) \ - ((struct backend_info 
*)(_dev)->dev.driver_data)->netback_accel_priv + ((struct backend_info *)dev_get_drvdata(&(_dev)->dev))->netback_accel_priv /* List of all the bends currently in existence. */ struct netback_accel *bend_list = NULL; @@ -615,7 +615,7 @@ int netback_accel_probe(struct xenbus_de mutex_lock(&bend->bend_mutex); /* ...and store it where we can get at it */ - binfo = (struct backend_info *) dev->dev.driver_data; + binfo = dev_get_drvdata(&dev->dev); binfo->netback_accel_priv = bend; /* And vice-versa */ bend->hdev_data = dev; @@ -729,7 +729,7 @@ int netback_accel_remove(struct xenbus_d struct netback_accel *bend; int frontend_state; - binfo = (struct backend_info *) dev->dev.driver_data; + binfo = dev_get_drvdata(&dev->dev); bend = (struct netback_accel *) binfo->netback_accel_priv; DPRINTK("%s: dev %p bend %p\n", __FUNCTION__, dev, bend); --- head-2011-03-17.orig/drivers/xen/sfc_netfront/accel_xenbus.c 2011-01-31 17:32:29.000000000 +0100 +++ head-2011-03-17/drivers/xen/sfc_netfront/accel_xenbus.c 2011-02-01 14:50:44.000000000 +0100 @@ -727,8 +727,7 @@ int netfront_accel_probe(struct net_devi int netfront_accel_remove(struct xenbus_device *dev) { - struct netfront_info *np = - (struct netfront_info *)dev->dev.driver_data; + struct netfront_info *np = dev_get_drvdata(&dev->dev); netfront_accel_vnic *vnic = (netfront_accel_vnic *)np->accel_priv; DPRINTK("%s %s\n", __FUNCTION__, dev->nodename); --- head-2011-03-17.orig/drivers/xen/sys-hypervisor.c 2011-03-17 14:35:44.000000000 +0100 +++ head-2011-03-17/drivers/xen/sys-hypervisor.c 2011-02-01 14:50:44.000000000 +0100 @@ -20,6 +20,8 @@ #include #include +#include "xenbus/xenbus_comms.h" + #define HYPERVISOR_ATTR_RO(_name) \ static struct hyp_sysfs_attr _name##_attr = __ATTR_RO(_name) @@ -118,9 +120,8 @@ static ssize_t uuid_show(struct hyp_sysf { char *vm, *val; int ret; - extern int xenstored_ready; - if (!xenstored_ready) + if (!is_xenstored_ready()) return -EBUSY; vm = xenbus_read(XBT_NIL, "vm", "", NULL); --- 
head-2011-03-17.orig/drivers/xen/tpmback/xenbus.c 2011-01-31 17:32:22.000000000 +0100 +++ head-2011-03-17/drivers/xen/tpmback/xenbus.c 2011-02-01 14:50:44.000000000 +0100 @@ -54,7 +54,7 @@ long int tpmback_get_instance(struct bac static int tpmback_remove(struct xenbus_device *dev) { - struct backend_info *be = dev->dev.driver_data; + struct backend_info *be = dev_get_drvdata(&dev->dev); if (!be) return 0; @@ -70,7 +70,7 @@ static int tpmback_remove(struct xenbus_ be->tpmif = NULL; } kfree(be); - dev->dev.driver_data = NULL; + dev_set_drvdata(&dev->dev, NULL); return 0; } @@ -89,7 +89,7 @@ static int tpmback_probe(struct xenbus_d be->is_instance_set = 0; be->dev = dev; - dev->dev.driver_data = be; + dev_set_drvdata(&dev->dev, be); err = xenbus_watch_path2(dev, dev->nodename, "instance", &be->backend_watch, @@ -139,7 +139,7 @@ static void backend_changed(struct xenbu static void frontend_changed(struct xenbus_device *dev, enum xenbus_state frontend_state) { - struct backend_info *be = dev->dev.driver_data; + struct backend_info *be = dev_get_drvdata(&dev->dev); int err; switch (frontend_state) { --- head-2011-03-17.orig/drivers/xen/usbback/usbback.h 2011-01-31 17:49:31.000000000 +0100 +++ head-2011-03-17/drivers/xen/usbback/usbback.h 2011-02-01 14:50:44.000000000 +0100 @@ -63,6 +63,12 @@ struct usbstub; +#ifndef BUS_ID_SIZE +#define USBBACK_BUS_ID_SIZE 20 +#else +#define USBBACK_BUS_ID_SIZE BUS_ID_SIZE +#endif + #define USB_DEV_ADDR_SIZE 128 typedef struct usbif_st { @@ -110,7 +116,7 @@ typedef struct usbif_st { struct vusb_port_id { struct list_head id_list; - char phys_bus[BUS_ID_SIZE]; + char phys_bus[USBBACK_BUS_ID_SIZE]; domid_t domid; unsigned int handle; int portnum; --- head-2011-03-17.orig/drivers/xen/usbback/usbstub.c 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/drivers/xen/usbback/usbstub.c 2011-02-01 14:50:44.000000000 +0100 @@ -56,7 +56,7 @@ struct vusb_port_id *find_portid_by_busi spin_lock_irqsave(&port_list_lock, flags); 
list_for_each_entry(portid, &port_list, id_list) { - if (!(strncmp(portid->phys_bus, busid, BUS_ID_SIZE))) { + if (!(strncmp(portid->phys_bus, busid, USBBACK_BUS_ID_SIZE))) { found = 1; break; } @@ -110,7 +110,7 @@ int portid_add(const char *busid, portid->handle = handle; portid->portnum = portnum; - strncpy(portid->phys_bus, busid, BUS_ID_SIZE); + strncpy(portid->phys_bus, busid, USBBACK_BUS_ID_SIZE); spin_lock_irqsave(&port_list_lock, flags); list_add(&portid->id_list, &port_list); @@ -228,7 +228,7 @@ static int usbstub_probe(struct usb_inte usbbk_hotplug_notify(usbif, portid->portnum, udev->speed); } else { /* maybe already called and connected by other intf */ - if (strncmp(stub->portid->phys_bus, busid, BUS_ID_SIZE)) + if (strncmp(stub->portid->phys_bus, busid, USBBACK_BUS_ID_SIZE)) goto out; /* invalid call */ } --- head-2011-03-17.orig/drivers/xen/usbback/xenbus.c 2011-01-31 17:56:27.000000000 +0100 +++ head-2011-03-17/drivers/xen/usbback/xenbus.c 2011-02-01 14:50:44.000000000 +0100 @@ -112,7 +112,7 @@ again: */ portid = find_portid(usbif->domid, usbif->handle, i); if (portid) { - if ((strncmp(portid->phys_bus, busid, BUS_ID_SIZE))) + if ((strncmp(portid->phys_bus, busid, USBBACK_BUS_ID_SIZE))) xenbus_dev_fatal(dev, err, "can't add port/%d, remove first", i); else @@ -142,7 +142,7 @@ abort: static int usbback_remove(struct xenbus_device *dev) { - usbif_t *usbif = dev->dev.driver_data; + usbif_t *usbif = dev_get_drvdata(&dev->dev); int i; if (usbif->backend_watch.node) { @@ -158,7 +158,7 @@ static int usbback_remove(struct xenbus_ usbif_disconnect(usbif); usbif_free(usbif);; } - dev->dev.driver_data = NULL; + dev_set_drvdata(&dev->dev, NULL); return 0; } @@ -182,7 +182,7 @@ static int usbback_probe(struct xenbus_d return -ENOMEM; } usbif->xbdev = dev; - dev->dev.driver_data = usbif; + dev_set_drvdata(&dev->dev, usbif); err = xenbus_scanf(XBT_NIL, dev->nodename, "num-ports", "%d", &num_ports); @@ -260,7 +260,7 @@ static int connect_rings(usbif_t *usbif) 
static void frontend_changed(struct xenbus_device *dev, enum xenbus_state frontend_state) { - usbif_t *usbif = dev->dev.driver_data; + usbif_t *usbif = dev_get_drvdata(&dev->dev); int err; switch (frontend_state) { --- head-2011-03-17.orig/drivers/xen/usbfront/xenbus.c 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/drivers/xen/usbfront/xenbus.c 2011-02-01 14:50:44.000000000 +0100 @@ -187,7 +187,7 @@ out: static int connect(struct xenbus_device *dev) { - struct usbfront_info *info = dev->dev.driver_data; + struct usbfront_info *info = dev_get_drvdata(&dev->dev); usbif_conn_request_t *req; int i, idx, err; @@ -299,7 +299,7 @@ static int usbfront_probe(struct xenbus_ } info = hcd_to_info(hcd); - dev->dev.driver_data = info; + dev_set_drvdata(&dev->dev, info); err = usb_add_hcd(hcd, 0, 0); if (err != 0) { @@ -314,13 +314,13 @@ static int usbfront_probe(struct xenbus_ fail: usb_put_hcd(hcd); - dev->dev.driver_data = NULL; + dev_set_drvdata(&dev->dev, NULL); return err; } static void usbfront_disconnect(struct xenbus_device *dev) { - struct usbfront_info *info = dev->dev.driver_data; + struct usbfront_info *info = dev_get_drvdata(&dev->dev); struct usb_hcd *hcd = info_to_hcd(info); usb_remove_hcd(hcd); @@ -364,7 +364,7 @@ static void backend_changed(struct xenbu static int usbfront_remove(struct xenbus_device *dev) { - struct usbfront_info *info = dev->dev.driver_data; + struct usbfront_info *info = dev_get_drvdata(&dev->dev); struct usb_hcd *hcd = info_to_hcd(info); destroy_rings(info); --- head-2011-03-17.orig/drivers/xen/util.c 2011-01-31 17:56:27.000000000 +0100 +++ head-2011-03-17/drivers/xen/util.c 2011-02-01 14:50:44.000000000 +0100 @@ -1,20 +1,74 @@ #include #include +#include +#include #include -struct class *get_xen_class(void) +static struct class *_get_xen_class(void) { static struct class *xen_class; + static DEFINE_MUTEX(xc_mutex); - if (xen_class) - return xen_class; - - xen_class = class_create(THIS_MODULE, "xen"); - if (IS_ERR(xen_class)) { + 
mutex_lock(&xc_mutex); + if (IS_ERR_OR_NULL(xen_class)) + xen_class = class_create(THIS_MODULE, "xen"); + mutex_unlock(&xc_mutex); + if (IS_ERR(xen_class)) pr_err("failed to create xen sysfs class\n"); - xen_class = NULL; - } return xen_class; } + +struct class *get_xen_class(void) +{ + struct class *class = _get_xen_class(); + + return !IS_ERR(class) ? class : NULL; +} EXPORT_SYMBOL_GPL(get_xen_class); + +static void xcdev_release(struct device *dev) +{ + kfree(dev); +} + +struct device *xen_class_device_create(struct device_type *type, + struct device *parent, + dev_t devt, void *drvdata, + const char *fmt, ...) +{ + struct device *dev; + int err; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (dev) { + va_list vargs; + + va_start(vargs, fmt); + err = kobject_set_name_vargs(&dev->kobj, fmt, vargs); + va_end(vargs); + } else + err = -ENOMEM; + + if (!err) { + dev->devt = devt; + dev->class = _get_xen_class(); + if (IS_ERR(dev->class)) + err = PTR_ERR(dev->class); + } + + if (!err) { + dev->type = type; + dev->parent = parent; + dev_set_drvdata(dev, drvdata); + dev->release = xcdev_release; + err = device_register(dev); + if (!err) + return dev; + put_device(dev); + } else + kfree(dev); + + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(xen_class_device_create); --- head-2011-03-17.orig/drivers/xen/xenbus/xenbus_probe.c 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/drivers/xen/xenbus/xenbus_probe.c 2011-02-01 14:50:44.000000000 +0100 @@ -92,6 +92,11 @@ static int xenbus_probe_frontend(const c static void xenbus_dev_shutdown(struct device *_dev); +#if !defined(CONFIG_XEN) && !defined(MODULE) +static int xenbus_dev_suspend(struct device *dev, pm_message_t state); +static int xenbus_dev_resume(struct device *dev); +#endif + /* If something in array of ids matches this device, return it. 
*/ static const struct xenbus_device_id * match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev) @@ -228,6 +233,10 @@ static struct xen_bus_type xenbus_fronte #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,29) .dev_attrs = xenbus_dev_attrs, #endif +#if !defined(CONFIG_XEN) && !defined(MODULE) + .suspend = xenbus_dev_suspend, + .resume = xenbus_dev_resume, +#endif }, #if defined(CONFIG_XEN) || defined(MODULE) .dev = { @@ -767,6 +776,9 @@ void xenbus_dev_changed(const char *node kfree(root); } +#if !defined(CONFIG_XEN) && !defined(MODULE) +EXPORT_SYMBOL_GPL(xenbus_dev_changed); +#endif static void frontend_changed(struct xenbus_watch *watch, const char **vec, unsigned int len) @@ -782,7 +794,11 @@ static struct xenbus_watch fe_watch = { .callback = frontend_changed, }; +#if !defined(CONFIG_XEN) && !defined(MODULE) +static int xenbus_dev_suspend(struct device *dev, pm_message_t state) +#else static int suspend_dev(struct device *dev, void *data) +#endif { int err = 0; struct xenbus_driver *drv; @@ -795,13 +811,18 @@ static int suspend_dev(struct device *de drv = to_xenbus_driver(dev->driver); xdev = container_of(dev, struct xenbus_device, dev); if (drv->suspend) +#if !defined(CONFIG_XEN) && !defined(MODULE) + err = drv->suspend(xdev, state); +#else err = drv->suspend(xdev); +#endif if (err) pr_warning("xenbus: suspend %s failed: %i\n", dev_name(dev), err); return 0; } +#if defined(CONFIG_XEN) || defined(MODULE) static int suspend_cancel_dev(struct device *dev, void *data) { int err = 0; @@ -821,8 +842,13 @@ static int suspend_cancel_dev(struct dev dev_name(dev), err); return 0; } +#endif +#if !defined(CONFIG_XEN) && !defined(MODULE) +static int xenbus_dev_resume(struct device *dev) +#else static int resume_dev(struct device *dev, void *data) +#endif { int err; struct xenbus_driver *drv; @@ -864,6 +890,7 @@ static int resume_dev(struct device *dev return 0; } +#if defined(CONFIG_XEN) || defined(MODULE) void xenbus_suspend(void) { DPRINTK(""); @@ 
-893,6 +920,7 @@ void xenbus_suspend_cancel(void) xenbus_backend_resume(suspend_cancel_dev); } EXPORT_SYMBOL_GPL(xenbus_suspend_cancel); +#endif /* A flag to determine if xenstored is 'ready' (i.e. has started) */ atomic_t xenbus_xsd_state = ATOMIC_INIT(XENBUS_XSD_UNCOMMITTED); @@ -995,13 +1023,6 @@ static int xsd_port_read(char *page, cha #endif #if defined(CONFIG_XEN_XENBUS_DEV) || defined(MODULE) -static int xb_free_port(evtchn_port_t port) -{ - struct evtchn_close close; - close.port = port; - return HYPERVISOR_event_channel_op(EVTCHNOP_close, &close); -} - int xenbus_conn(domid_t remote_dom, unsigned long *grant_ref, evtchn_port_t *local_port) { struct evtchn_alloc_unbound alloc_unbound; @@ -1015,7 +1036,7 @@ int xenbus_conn(domid_t remote_dom, unsi remove_xen_proc_entry("xsd_port"); #endif - rc = xb_free_port(xen_store_evtchn); + rc = close_evtchn(xen_store_evtchn); if (rc != 0) goto fail0; @@ -1041,7 +1062,7 @@ int xenbus_conn(domid_t remote_dom, unsi return 0; fail1: - rc2 = xb_free_port(xen_store_evtchn); + rc2 = close_evtchn(xen_store_evtchn); if (rc2 != 0) pr_warning("XENBUS: Error freeing xenstore event channel:" " %d\n", rc2); --- head-2011-03-17.orig/drivers/xen/xenbus/xenbus_xs.c 2011-02-01 14:42:26.000000000 +0100 +++ head-2011-03-17/drivers/xen/xenbus/xenbus_xs.c 2011-02-01 14:50:44.000000000 +0100 @@ -718,6 +718,10 @@ void xs_resume(void) struct xenbus_watch *watch; char token[sizeof(watch) * 2 + 1]; +#if !defined(CONFIG_XEN) && !defined(MODULE) + xb_init_comms(); +#endif + mutex_unlock(&xs_state.response_mutex); mutex_unlock(&xs_state.request_mutex); transaction_resume(); --- head-2011-03-17.orig/include/Kbuild 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/include/Kbuild 2011-02-01 14:50:44.000000000 +0100 @@ -8,6 +8,5 @@ header-y += mtd/ header-y += rdma/ header-y += video/ header-y += drm/ -header-y += xen/public/ header-y += xen/ header-y += scsi/ --- head-2011-03-17.orig/include/xen/Kbuild 2011-01-31 14:31:28.000000000 +0100 +++ 
head-2011-03-17/include/xen/Kbuild 2011-02-01 14:50:44.000000000 +0100 @@ -1,3 +1,2 @@ -header-y += evtchn.h header-y += privcmd.h header-y += public/ --- head-2011-03-17.orig/include/xen/driver_util.h 2011-01-31 17:49:31.000000000 +0100 +++ head-2011-03-17/include/xen/driver_util.h 2011-02-01 14:50:44.000000000 +0100 @@ -1,8 +1,14 @@ #ifndef __XEN_DRIVER_UTIL_H__ #define __XEN_DRIVER_UTIL_H__ +#include #include extern struct class *get_xen_class(void); +extern struct device *xen_class_device_create(struct device_type *, + struct device *parent, + dev_t devt, void *drvdata, + const char *fmt, ...) + __printf(5, 6); #endif /* __XEN_DRIVER_UTIL_H__ */ --- head-2011-03-17.orig/include/xen/evtchn.h 2011-02-01 14:42:26.000000000 +0100 +++ head-2011-03-17/include/xen/evtchn.h 2011-02-01 14:50:44.000000000 +0100 @@ -113,9 +113,6 @@ void irq_resume(void); /* Entry point for notifications into Linux subsystems. */ asmlinkage void evtchn_do_upcall(struct pt_regs *regs); -/* Entry point for notifications into the userland character device. */ -void evtchn_device_upcall(int port); - /* Mark a PIRQ as unavailable for dynamic allocation. */ void evtchn_register_pirq(int irq); /* Map a Xen-supplied PIRQ to a dynamically allocated one. */ @@ -126,6 +123,7 @@ int evtchn_get_xen_pirq(int irq); void mask_evtchn(int port); void disable_all_local_evtchn(void); void unmask_evtchn(int port); +unsigned int irq_from_evtchn(unsigned int port); #ifdef CONFIG_SMP void rebind_evtchn_to_cpu(int port, unsigned int cpu); @@ -163,6 +161,12 @@ static inline void notify_remote_via_evt VOID(HYPERVISOR_event_channel_op(EVTCHNOP_send, &send)); } +static inline int close_evtchn(int port) +{ + struct evtchn_close close = { .port = port }; + return HYPERVISOR_event_channel_op(EVTCHNOP_close, &close); +} + /* * Use these to access the event channel underlying the IRQ handle returned * by bind_*_to_irqhandler(). 
--- head-2011-03-17.orig/include/xen/xenbus.h 2011-02-02 16:58:42.000000000 +0100 +++ head-2011-03-17/include/xen/xenbus.h 2011-02-02 16:59:07.000000000 +0100 @@ -104,8 +104,12 @@ struct xenbus_driver { void (*otherend_changed)(struct xenbus_device *dev, enum xenbus_state backend_state); int (*remove)(struct xenbus_device *dev); +#if !defined(CONFIG_XEN) && !defined(HAVE_XEN_PLATFORM_COMPAT_H) + int (*suspend)(struct xenbus_device *dev, pm_message_t state); +#else int (*suspend)(struct xenbus_device *dev); int (*suspend_cancel)(struct xenbus_device *dev); +#endif int (*resume)(struct xenbus_device *dev); int (*uevent)(struct xenbus_device *, struct kobj_uevent_env *); struct device_driver driver; --- head-2011-03-17.orig/lib/swiotlb-xen.c 2011-02-01 14:44:12.000000000 +0100 +++ head-2011-03-17/lib/swiotlb-xen.c 2011-02-01 14:50:44.000000000 +0100 @@ -47,8 +47,8 @@ int swiotlb; int swiotlb_force; /* - * Used to do a quick range check in swiotlb_unmap_single and - * swiotlb_sync_single_*, to see if the memory was in fact allocated by this + * Used to do a quick range check in unmap_single and + * sync_single_*, to see if the memory was in fact allocated by this * API. 
*/ static char *io_tlb_start, *io_tlb_end; @@ -167,7 +167,7 @@ dma_addr_t swiotlb_phys_to_bus(struct de return phys_to_machine(paddr); } -phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr) +phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr) { return machine_to_phys(baddr); } @@ -178,9 +178,15 @@ static dma_addr_t swiotlb_virt_to_bus(st return swiotlb_phys_to_bus(hwdev, virt_to_phys(address)); } -static void *swiotlb_bus_to_virt(dma_addr_t address) +void * __weak swiotlb_bus_to_virt(struct device *hwdev, dma_addr_t address) { - return phys_to_virt(swiotlb_bus_to_phys(address)); + return phys_to_virt(swiotlb_bus_to_phys(hwdev, address)); +} + +int __weak swiotlb_arch_address_needs_mapping(struct device *hwdev, + dma_addr_t addr, size_t size) +{ + return !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size); } int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size) @@ -315,7 +321,7 @@ static void swiotlb_bounce(phys_addr_t p unsigned long flags; while (size) { - sz = min((size_t)(PAGE_SIZE - offset), size); + sz = min_t(size_t, PAGE_SIZE - offset, size); local_irq_save(flags); buffer = kmap_atomic(pfn_to_page(pfn), @@ -449,7 +455,7 @@ found: * dma_addr is the kernel virtual address of the bounce buffer to unmap. */ static void -unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir) +do_unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir) { unsigned long flags; int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; @@ -544,7 +550,7 @@ swiotlb_full(struct device *dev, size_t * PCI address to use is returned. * * Once the device is given the dma address, the device owns this memory until - * either swiotlb_unmap_single or swiotlb_dma_sync_single is performed. + * either swiotlb_unmap_page or swiotlb_dma_sync_single is performed. 
*/ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, @@ -558,7 +564,7 @@ dma_addr_t swiotlb_map_page(struct devic BUG_ON(dir == DMA_NONE); /* - * If the pointer passed in happens to be in the device's DMA window, + * If the address happens to be in the device's DMA window, * we can safely return the device addr and not worry about bounce * buffering it. */ @@ -583,23 +589,32 @@ EXPORT_SYMBOL_GPL(swiotlb_map_page); /* * Unmap a single streaming mode DMA translation. The dma_addr and size must - * match what was provided for in a previous swiotlb_map_single call. All + * match what was provided for in a previous swiotlb_map_page call. All * other usages are undefined. * * After this call, reads by the cpu to the buffer are guaranteed to see * whatever the device wrote there. */ +static void unmap_single(struct device *hwdev, dma_addr_t dev_addr, + size_t size, int dir) +{ + char *dma_addr = swiotlb_bus_to_virt(hwdev, dev_addr); + + BUG_ON(dir == DMA_NONE); + + if (is_swiotlb_buffer(dev_addr)) { + do_unmap_single(hwdev, dma_addr, size, dir); + return; + } + + gnttab_dma_unmap_page(dev_addr); +} + void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, size_t size, enum dma_data_direction dir, struct dma_attrs *attrs) { - char *dma_addr = swiotlb_bus_to_virt(dev_addr); - - BUG_ON(dir == DMA_NONE); - if (is_swiotlb_buffer(dev_addr)) - unmap_single(hwdev, dma_addr, size, dir); - else - gnttab_dma_unmap_page(dev_addr); + unmap_single(hwdev, dev_addr, size, dir); } EXPORT_SYMBOL_GPL(swiotlb_unmap_page); @@ -607,7 +622,7 @@ EXPORT_SYMBOL_GPL(swiotlb_unmap_page); * Make physical memory consistent for a single streaming mode DMA translation * after a transfer. 
* - * If you perform a swiotlb_map_single() but wish to interrogate the buffer + * If you perform a swiotlb_map_page() but wish to interrogate the buffer * using the cpu, yet do not wish to teardown the PCI dma mapping, you must * call this function before doing so. At the next point you give the PCI dma * address back to the card, you must first perform a @@ -617,9 +632,10 @@ static void swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, size_t size, int dir, int target) { - char *dma_addr = swiotlb_bus_to_virt(dev_addr); + char *dma_addr = swiotlb_bus_to_virt(hwdev, dev_addr); BUG_ON(dir == DMA_NONE); + if (is_swiotlb_buffer(dev_addr)) sync_single(hwdev, dma_addr, size, dir, target); } @@ -648,11 +664,7 @@ swiotlb_sync_single_range(struct device unsigned long offset, size_t size, int dir, int target) { - char *dma_addr = swiotlb_bus_to_virt(dev_addr); - - BUG_ON(dir == DMA_NONE); - if (is_swiotlb_buffer(dev_addr)) - sync_single(hwdev, dma_addr + offset, size, dir, target); + swiotlb_sync_single(hwdev, dev_addr + offset, size, dir, target); } void @@ -677,7 +689,7 @@ EXPORT_SYMBOL_GPL(swiotlb_sync_single_ra /* * Map a set of buffers described by scatterlist in streaming mode for DMA. - * This is the scatter-gather version of the above swiotlb_map_single + * This is the scatter-gather version of the above swiotlb_map_page * interface. Here the scatter gather list elements are each tagged with the * appropriate dma address and length. They are obtained via * sg_dma_{address,length}(SG). @@ -688,7 +700,7 @@ EXPORT_SYMBOL_GPL(swiotlb_sync_single_ra * The routine returns the number of addr/length pairs actually * used, at most nents. * - * Device ownership issues as mentioned above for swiotlb_map_single are the + * Device ownership issues as mentioned above for swiotlb_map_page are the * same here. */ int @@ -741,7 +753,7 @@ EXPORT_SYMBOL(swiotlb_map_sg); /* * Unmap a set of streaming mode DMA translations. 
Again, cpu read rules - * concerning calls here are the same as for swiotlb_unmap_single() above. + * concerning calls here are the same as for swiotlb_unmap_page() above. */ void swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, @@ -752,13 +764,9 @@ swiotlb_unmap_sg_attrs(struct device *hw BUG_ON(dir == DMA_NONE); - for_each_sg(sgl, sg, nelems, i) { - if (sg->dma_address != sg_phys(sg)) - unmap_single(hwdev, swiotlb_bus_to_virt(sg->dma_address), - sg->dma_length, dir); - else - gnttab_dma_unmap_page(sg->dma_address); - } + for_each_sg(sgl, sg, nelems, i) + unmap_single(hwdev, sg->dma_address, sg->dma_length, dir); + } EXPORT_SYMBOL(swiotlb_unmap_sg_attrs); @@ -784,13 +792,9 @@ swiotlb_sync_sg(struct device *hwdev, st struct scatterlist *sg; int i; - BUG_ON(dir == DMA_NONE); - - for_each_sg(sgl, sg, nelems, i) { - if (sg->dma_address != sg_phys(sg)) - sync_single(hwdev, swiotlb_bus_to_virt(sg->dma_address), + for_each_sg(sgl, sg, nelems, i) + swiotlb_sync_single(hwdev, sg->dma_address, sg->dma_length, dir, target); - } } void --- head-2011-03-17.orig/mm/init-mm.c 2011-03-17 14:35:44.000000000 +0100 +++ head-2011-03-17/mm/init-mm.c 2011-02-01 14:50:44.000000000 +0100 @@ -13,6 +13,10 @@ #define INIT_MM_CONTEXT(name) #endif +#ifdef CONFIG_X86_XEN +#define swapper_pg_dir ((pgd_t *)NULL) +#endif + struct mm_struct init_mm = { .mm_rb = RB_ROOT, .pgd = swapper_pg_dir, --- head-2011-03-17.orig/mm/memory.c 2011-01-31 18:01:51.000000000 +0100 +++ head-2011-03-17/mm/memory.c 2011-02-01 14:50:44.000000000 +0100 @@ -1522,7 +1522,7 @@ int __get_user_pages(struct task_struct vmas[i] = vma; i++; start += PAGE_SIZE; - len--; + nr_pages--; continue; } } --- head-2011-03-17.orig/mm/page_alloc.c 2011-02-08 10:05:20.000000000 +0100 +++ head-2011-03-17/mm/page_alloc.c 2011-02-01 14:50:44.000000000 +0100 @@ -649,6 +649,7 @@ static bool free_pages_prepare(struct pa #ifdef CONFIG_XEN if (PageForeign(page)) { + WARN_ON(wasMlocked); PageForeignDestructor(page, order); 
return; }