From: Linux Kernel Mailing List
Subject: Linux: 2.6.29
Patch-mainline: 2.6.29

This patch contains the differences between 2.6.28 and 2.6.29.

Acked-by: Jeff Mahoney
Automatically created from "patches.kernel.org/patch-2.6.29" by xen-port-patches.py

--- head-2010-04-29.orig/arch/x86/Kconfig	2010-03-24 15:14:47.000000000 +0100
+++ head-2010-04-29/arch/x86/Kconfig	2010-03-24 15:17:58.000000000 +0100
@@ -331,7 +331,6 @@ config X86_XEN
 	select X86_PAE
 	select X86_UP_APIC if !SMP && XEN_PRIVILEGED_GUEST
 	select X86_UP_IOAPIC if !SMP && XEN_PRIVILEGED_GUEST
-	select SWIOTLB
 	help
 	  Choose this option if you plan to run this kernel on top of the
 	  Xen Hypervisor.
@@ -369,7 +368,6 @@ config X86_64_XEN
 	bool "Enable Xen compatible kernel"
 	depends on X86_64
 	select XEN
-	select SWIOTLB
 	help
 	  This option will compile a kernel compatible with
 	  Xen hypervisor
@@ -819,7 +817,7 @@ config AMD_IOMMU_STATS

 # need this always selected by IOMMU for the VIA workaround
 config SWIOTLB
-	def_bool y if X86_64
+	def_bool y if X86_64 || XEN
 	---help---
 	  Support for software bounce buffers used on x86-64 systems which
 	  don't have a hardware IOMMU (e.g. the current generation
@@ -925,7 +923,7 @@ config X86_XEN_GENAPIC
 config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
 	bool "Reroute for broken boot IRQs"
 	default n
-	depends on X86_IO_APIC
+	depends on X86_IO_APIC && !XEN
 	---help---
 	  This option enables a workaround that fixes a source of
 	  spurious interrupts. This is recommended when threaded
--- head-2010-04-29.orig/arch/x86/Makefile	2010-03-24 15:14:47.000000000 +0100
+++ head-2010-04-29/arch/x86/Makefile	2010-03-24 15:17:58.000000000 +0100
@@ -156,8 +156,8 @@ BOOT_TARGETS = bzlilo bzdisk fdimage fdi
 PHONY += bzImage vmlinuz $(BOOT_TARGETS)

 ifdef CONFIG_XEN
-KBUILD_CPPFLAGS := -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION) \
-	-I$(srctree)/arch/x86/include/mach-xen $(KBUILD_CPPFLAGS)
+LINUXINCLUDE := -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION) \
+	-I$(srctree)/arch/x86/include/mach-xen $(LINUXINCLUDE)

 ifdef CONFIG_X86_64
 LDFLAGS_vmlinux := -e startup_64
--- head-2010-04-29.orig/arch/x86/ia32/ia32entry-xen.S	2010-03-24 15:14:47.000000000 +0100
+++ head-2010-04-29/arch/x86/ia32/ia32entry-xen.S	2010-03-24 15:17:58.000000000 +0100
@@ -363,9 +363,9 @@ ENTRY(ia32_syscall)
 	orl   $TS_COMPAT,TI_status(%r10)
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
 	jnz ia32_tracesys
-ia32_do_syscall:
 	cmpl $(IA32_NR_syscalls-1),%eax
-	ja  int_ret_from_sys_call	/* ia32_tracesys has set RAX(%rsp) */
+	ja ia32_badsys
+ia32_do_call:
 	IA32_ARG_FIXUP
 	call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
 ia32_sysret:
@@ -380,7 +380,9 @@ ia32_tracesys:
 	call syscall_trace_enter
 	LOAD_ARGS32 ARGOFFSET  /* reload args from stack in case ptrace changed it */
 	RESTORE_REST
-	jmp ia32_do_syscall
+	cmpl $(IA32_NR_syscalls-1),%eax
+	ja  int_ret_from_sys_call	/* ia32_tracesys has set RAX(%rsp) */
+	jmp ia32_do_call
 END(ia32_syscall)

 ia32_badsys:
--- head-2010-04-29.orig/arch/x86/include/asm/hw_irq.h	2010-03-24 15:14:47.000000000 +0100
+++ head-2010-04-29/arch/x86/include/asm/hw_irq.h	2010-03-24 15:17:58.000000000 +0100
@@ -136,7 +136,9 @@ extern irqreturn_t smp_call_function_sin
 #endif
 #endif

+#ifndef CONFIG_XEN
 extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void);
+#endif

 typedef int vector_irq_t[NR_VECTORS];
 DECLARE_PER_CPU(vector_irq_t, vector_irq);
--- head-2010-04-29.orig/arch/x86/include/asm/hypervisor.h	2010-04-29 09:29:49.000000000 +0200
+++ head-2010-04-29/arch/x86/include/asm/hypervisor.h	2010-03-24 15:17:58.000000000 +0100
@@
-24,3 +24,7 @@ extern void init_hypervisor(struct cpuin extern void init_hypervisor_platform(void); #endif + +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include_next +#endif --- head-2010-04-29.orig/arch/x86/include/asm/kexec.h 2010-03-24 15:12:46.000000000 +0100 +++ head-2010-04-29/arch/x86/include/asm/kexec.h 2010-03-24 15:17:58.000000000 +0100 @@ -12,13 +12,10 @@ /* * The hypervisor interface implicitly requires that all entries (except * for possibly the final one) are arranged in matching PA_/VA_ pairs. +# define VA_PGD 3 */ -# define PA_PMD_0 8 -# define VA_PMD_0 9 -# define PA_PMD_1 10 -# define VA_PMD_1 11 -# define PA_SWAP_PAGE 12 -# define PAGES_NR 13 +# define PA_SWAP_PAGE 4 +# define PAGES_NR 5 # endif /* CONFIG_XEN */ #else # define PA_CONTROL_PAGE 0 --- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/desc.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/include/mach-xen/asm/desc.h 2010-03-24 15:17:58.000000000 +0100 @@ -342,16 +342,14 @@ static inline void set_intr_gate(unsigne _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS); } -#define SYS_VECTOR_FREE 0 -#define SYS_VECTOR_ALLOCED 1 - extern int first_system_vector; -extern char system_vectors[]; +/* used_vectors is BITMAP for irq is not managed by percpu vector_irq */ +extern unsigned long used_vectors[]; static inline void alloc_system_vector(int vector) { - if (system_vectors[vector] == SYS_VECTOR_FREE) { - system_vectors[vector] = SYS_VECTOR_ALLOCED; + if (!test_bit(vector, used_vectors)) { + set_bit(vector, used_vectors); if (first_system_vector > vector) first_system_vector = vector; } else --- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/fixmap_64.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/include/mach-xen/asm/fixmap_64.h 2010-03-24 15:17:58.000000000 +0100 @@ -16,7 +16,6 @@ #include #include #include -#include #include /* @@ -52,11 +51,6 @@ enum fixed_addresses { FIX_ISAMAP_END, FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1, #endif -#ifdef CONFIG_EFI - FIX_EFI_IO_MAP_LAST_PAGE, - FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE - + MAX_EFI_IO_PAGES - 1, -#endif #ifdef CONFIG_PARAVIRT FIX_PARAVIRT_BOOTMAP, #else --- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/highmem.h 2010-03-24 17:05:09.000000000 +0100 +++ head-2010-04-29/arch/x86/include/mach-xen/asm/highmem.h 2010-03-24 17:05:16.000000000 +0100 @@ -79,6 +79,7 @@ static inline void clear_user_highpage(s clear_highpage(page); } #define __HAVE_ARCH_CLEAR_HIGHPAGE +#define clear_user_highpage clear_user_highpage #define __HAVE_ARCH_CLEAR_USER_HIGHPAGE void copy_highpage(struct page *to, struct page *from); --- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/hypervisor.h 2010-03-24 15:12:46.000000000 +0100 +++ head-2010-04-29/arch/x86/include/mach-xen/asm/hypervisor.h 2010-03-24 15:17:58.000000000 +0100 @@ -69,6 +69,8 @@ extern start_info_t *xen_start_info; #define is_initial_xendomain() 0 #endif +#define init_hypervisor(c) ((void)((c)->x86_hyper_vendor = X86_HYPER_VENDOR_XEN)) + struct vcpu_runstate_info *setup_runstate_area(unsigned int cpu); /* arch/xen/kernel/evtchn.c */ @@ -139,7 +141,7 @@ void scrub_pages(void *, unsigned int); DECLARE_PER_CPU(bool, xen_lazy_mmu); -int xen_multicall_flush(bool); +void xen_multicall_flush(bool); int __must_check xen_multi_update_va_mapping(unsigned long va, pte_t, unsigned long flags); --- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/io.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/include/mach-xen/asm/io.h 2010-03-24 
15:17:58.000000000 +0100 @@ -4,6 +4,7 @@ #define ARCH_HAS_IOREMAP_WC #include +#include #define build_mmio_read(name, size, type, reg, barrier) \ static inline type name(const volatile void __iomem *addr) \ @@ -45,21 +46,39 @@ build_mmio_write(__writel, "l", unsigned #define mmiowb() barrier() #ifdef CONFIG_X86_64 + build_mmio_read(readq, "q", unsigned long, "=r", :"memory") -build_mmio_read(__readq, "q", unsigned long, "=r", ) build_mmio_write(writeq, "q", unsigned long, "r", :"memory") -build_mmio_write(__writeq, "q", unsigned long, "r", ) -#define readq_relaxed(a) __readq(a) -#define __raw_readq __readq -#define __raw_writeq writeq - -/* Let people know we have them */ -#define readq readq -#define writeq writeq +#else + +static inline __u64 readq(const volatile void __iomem *addr) +{ + const volatile u32 __iomem *p = addr; + u32 low, high; + + low = readl(p); + high = readl(p + 1); + + return low + ((u64)high << 32); +} + +static inline void writeq(__u64 val, volatile void __iomem *addr) +{ + writel(val, addr); + writel(val >> 32, addr+4); +} + #endif -extern int iommu_bio_merge; +#define readq_relaxed(a) readq(a) + +#define __raw_readq(a) readq(a) +#define __raw_writeq(val, addr) writeq(val, addr) + +/* Let people know that we have them */ +#define readq readq +#define writeq writeq #define native_io_delay xen_io_delay @@ -120,7 +139,6 @@ extern void __iomem *ioremap_wc(unsigned * A boot-time mapping is currently limited to at most 16 pages. */ extern void early_ioremap_init(void); -extern void early_ioremap_clear(void); extern void early_ioremap_reset(void); extern void __iomem *early_ioremap(unsigned long offset, unsigned long size); extern void __iomem *early_memremap(unsigned long offset, unsigned long size); --- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/irq_vectors.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/include/mach-xen/asm/irq_vectors.h 2010-03-24 15:17:58.000000000 +0100 @@ -24,6 +24,8 @@ #define LAST_VM86_IRQ 15 #define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15) +#define NR_IRQS_LEGACY 16 + /* * The flat IRQ space is divided into two regions: * 1. A one-to-one mapping of real physical IRQs. 
This space is only used @@ -36,8 +38,10 @@ #define PIRQ_BASE 0 #if defined(NR_CPUS) && defined(MAX_IO_APICS) -# if NR_CPUS < MAX_IO_APICS +# if !defined(CONFIG_SPARSE_IRQ) && NR_CPUS < MAX_IO_APICS # define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS) +# elif defined(CONFIG_SPARSE_IRQ) && 8 * NR_CPUS > 32 * MAX_IO_APICS +# define NR_PIRQS (NR_VECTORS + 8 * NR_CPUS) # else # define NR_PIRQS (NR_VECTORS + 32 * MAX_IO_APICS) # endif --- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/mmu_context_32.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/include/mach-xen/asm/mmu_context_32.h 2010-03-24 15:17:58.000000000 +0100 @@ -3,10 +3,9 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { -#if 0 /* XEN: no lazy tlb */ - unsigned cpu = smp_processor_id(); - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) - per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_LAZY; +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */ + if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) + x86_write_percpu(cpu_tlbstate.state, TLBSTATE_LAZY); #endif } @@ -38,9 +37,9 @@ static inline void switch_mm(struct mm_s /* stop flush ipis for the previous mm */ cpu_clear(cpu, prev->cpu_vm_mask); -#if 0 /* XEN: no lazy tlb */ - per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK; - per_cpu(cpu_tlbstate, cpu).active_mm = next; +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */ + x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK); + x86_write_percpu(cpu_tlbstate.active_mm, next); #endif cpu_set(cpu, next->cpu_vm_mask); @@ -62,10 +61,10 @@ static inline void switch_mm(struct mm_s BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF)); } -#if 0 /* XEN: no lazy tlb */ +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */ else { - per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK; - BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next); + x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK); + BUG_ON(x86_read_percpu(cpu_tlbstate.active_mm) != next); if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { /* We were in lazy tlb mode and leave_mm disabled --- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/pci.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/include/mach-xen/asm/pci.h 2010-03-24 15:17:58.000000000 +0100 @@ -22,6 +22,8 @@ struct pci_sysdata { }; extern int pci_routeirq; +extern int noioapicquirk; +extern int noioapicreroute; /* scan a bus after allocating a pci_sysdata for it */ extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, @@ -88,6 +90,8 @@ static inline void pci_dma_burst_advice( static inline void early_quirks(void) { } #endif +extern void pci_iommu_alloc(void); + #endif /* __KERNEL__ */ #ifdef CONFIG_X86_32 @@ -104,9 +108,9 @@ static inline void early_quirks(void) { #ifdef CONFIG_NUMA /* Returns the node based on pci bus */ -static inline int __pcibus_to_node(struct pci_bus *bus) +static inline int __pcibus_to_node(const struct pci_bus *bus) { - struct pci_sysdata *sd = bus->sysdata; + const struct pci_sysdata *sd = bus->sysdata; return sd->node; } @@ -115,6 +119,12 @@ static inline cpumask_t __pcibus_to_cpum { return node_to_cpumask(__pcibus_to_node(bus)); } + +static inline const struct cpumask * +cpumask_of_pcibus(const struct pci_bus *bus) +{ + return cpumask_of_node(__pcibus_to_node(bus)); +} #endif #endif /* _ASM_X86_PCI_H */ --- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/pgtable.h 2010-03-24 15:14:47.000000000 +0100 +++ 
head-2010-04-29/arch/x86/include/mach-xen/asm/pgtable.h 2010-03-24 15:17:58.000000000 +0100 @@ -22,12 +22,10 @@ #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ /* If _PAGE_BIT_PRESENT is clear, we use these: */ - -/* set: nonlinear file mapping, saved PTE; unset:swap */ -#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY - -/* if the user mapped it with PROT_NONE; pte_present gives true */ +/* - if the user mapped it with PROT_NONE; pte_present gives true */ #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL +/* - set: nonlinear file mapping, saved PTE; unset:swap */ +#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY #define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT) #define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW) @@ -176,8 +174,19 @@ extern unsigned int __kernel_page_user; #define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */ #endif +/* + * Macro to mark a page protection value as UC- + */ +#define pgprot_noncached(prot) \ + ((boot_cpu_data.x86 > 3) \ + ? (__pgprot(pgprot_val(prot) | _PAGE_CACHE_UC_MINUS)) \ + : (prot)) + #ifndef __ASSEMBLY__ +#define pgprot_writecombine pgprot_writecombine +extern pgprot_t pgprot_writecombine(pgprot_t prot); + /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. @@ -309,41 +318,43 @@ static inline pte_t pte_mkspecial(pte_t extern pteval_t __supported_pte_mask; -static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) +/* + * Mask out unsupported bits in a present pgprot. Non-present pgprots + * can use those bits for other purposes, so leave them be. + */ +static inline pgprotval_t massage_pgprot(pgprot_t pgprot) { - pgprotval_t prot = pgprot_val(pgprot); + pgprotval_t protval = pgprot_val(pgprot); + + if (protval & _PAGE_PRESENT) + protval &= __supported_pte_mask; - if (prot & _PAGE_PRESENT) - prot &= __supported_pte_mask; - return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) | prot); + return protval; } -static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot) +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) { - pgprotval_t prot = pgprot_val(pgprot); + return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) | + massage_pgprot(pgprot)); +} - if (prot & _PAGE_PRESENT) - prot &= __supported_pte_mask; - return __pte_ma(((phys_addr_t)page_nr << PAGE_SHIFT) | prot); +static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot) +{ + return __pte_ma(((phys_addr_t)page_nr << PAGE_SHIFT) | + massage_pgprot(pgprot)); } static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) { - pgprotval_t prot = pgprot_val(pgprot); - - if (prot & _PAGE_PRESENT) - prot &= __supported_pte_mask; - return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) | prot); + return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) | + massage_pgprot(pgprot)); } static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { - pgprotval_t prot = pgprot_val(newprot); pteval_t val = pte_val(pte) & _PAGE_CHG_MASK; - if (prot & _PAGE_PRESENT) - prot &= __supported_pte_mask; - val |= prot & ~_PAGE_CHG_MASK; + val |= massage_pgprot(newprot) & ~_PAGE_CHG_MASK; return __pte(val); } @@ -359,11 +370,33 @@ static inline pgprot_t pgprot_modify(pgp #define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK) -#define canon_pgprot(p) __pgprot(pgprot_val(p) & _PAGE_PRESENT \ - ? 
pgprot_val(p) & __supported_pte_mask \ - : pgprot_val(p)) +#define canon_pgprot(p) __pgprot(massage_pgprot(p)) + +static inline int is_new_memtype_allowed(unsigned long flags, + unsigned long new_flags) +{ + /* + * Certain new memtypes are not allowed with certain + * requested memtype: + * - request is uncached, return cannot be write-back + * - request is write-combine, return cannot be write-back + */ + if ((flags == _PAGE_CACHE_UC_MINUS && + new_flags == _PAGE_CACHE_WB) || + (flags == _PAGE_CACHE_WC && + new_flags == _PAGE_CACHE_WB)) { + return 0; + } + + return 1; +} #ifndef __ASSEMBLY__ +#ifndef CONFIG_XEN +/* Indicate that x86 has its own track and untrack pfn vma functions */ +#define __HAVE_PFNMAP_TRACKING +#endif + #define __HAVE_PHYS_MEM_ACCESS_PROT struct file; pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, --- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/pgtable-3level.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/include/mach-xen/asm/pgtable-3level.h 2010-03-24 15:17:58.000000000 +0100 @@ -151,6 +151,7 @@ static inline int pte_none(pte_t pte) #define PTE_FILE_MAX_BITS 32 /* Encode and de-code a swap entry */ +#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5) #define __swp_type(x) (((x).val) & 0x1f) #define __swp_offset(x) ((x).val >> 5) #define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5}) --- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/pgtable_32.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/include/mach-xen/asm/pgtable_32.h 2010-03-24 15:17:58.000000000 +0100 @@ -107,15 +107,6 @@ extern unsigned long pg0[]; #endif /* - * Macro to mark a page protection value as "uncacheable". - * On processors which do not support it, this is a no-op. - */ -#define pgprot_noncached(prot) \ - ((boot_cpu_data.x86 > 3) \ - ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) \ - : (prot)) - -/* * Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. */ --- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/pgtable_64.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/include/mach-xen/asm/pgtable_64.h 2010-03-24 15:17:58.000000000 +0100 @@ -149,8 +149,8 @@ static inline void xen_pgd_clear(pgd_t * #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE - 1)) - -#define MAXMEM _AC(0x000004ffffffffff, UL) +#define MAX_PHYSMEM_BITS 43 +#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) #define VMALLOC_START _AC(0xffffc20000000000, UL) #define VMALLOC_END _AC(0xffffe1ffffffffff, UL) #define VMEMMAP_START _AC(0xffffe20000000000, UL) @@ -183,12 +183,6 @@ static inline int pmd_bad(pmd_t pmd) #define __pte_mfn(_pte) (((_pte).pte & PTE_PFN_MASK) >> PAGE_SHIFT) /* - * Macro to mark a page protection value as "uncacheable". - */ -#define pgprot_noncached(prot) \ - (__pgprot(pgprot_val((prot)) | _PAGE_PCD | _PAGE_PWT)) - -/* * Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. 
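
[Editor's note: the massage_pgprot() refactor above folds one repeated three-line pattern out of pfn_pte(), pfn_pte_ma(), pfn_pmd() and pte_modify(). The point of masking only *present* entries is that non-present PTEs reuse the hardware bits for swap/file encodings, which must pass through untouched. A minimal user-space model of the idea; the mask value and names below are illustrative, not the kernel's:]

    /* editor's sketch, not kernel code */
    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t pgprotval_t;

    #define _PAGE_PRESENT   (1ULL << 0)
    #define _PAGE_NX        (1ULL << 63)    /* pretend NX is unsupported */

    static const pgprotval_t supported_pte_mask = ~_PAGE_NX;

    static pgprotval_t massage_pgprot(pgprotval_t protval)
    {
            if (protval & _PAGE_PRESENT)
                    protval &= supported_pte_mask; /* drop unsupported bits */
            return protval;         /* non-present: swap bits pass through */
    }

    int main(void)
    {
            /* present mapping: NX stripped */
            printf("%llx\n", (unsigned long long)
                   massage_pgprot(_PAGE_PRESENT | _PAGE_NX));
            /* non-present (swap entry): bit 63 kept intact */
            printf("%llx\n", (unsigned long long)massage_pgprot(_PAGE_NX));
            return 0;
    }
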
*/ @@ -270,6 +264,8 @@ static inline int pud_large(pud_t pte) #define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1) #endif +#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) + #define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \ & ((1U << SWP_TYPE_BITS) - 1)) #define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT) --- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/processor.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/include/mach-xen/asm/processor.h 2010-03-24 15:17:58.000000000 +0100 @@ -111,6 +111,7 @@ struct cpuinfo_x86 { /* Index into per_cpu list: */ u16 cpu_index; #endif + unsigned int x86_hyper_vendor; } __attribute__((__aligned__(SMP_CACHE_BYTES))); #define X86_VENDOR_INTEL 0 @@ -124,6 +125,10 @@ struct cpuinfo_x86 { #define X86_VENDOR_UNKNOWN 0xff +#define X86_HYPER_VENDOR_NONE 0 +#define X86_HYPER_VENDOR_VMWARE 1 +#define X86_HYPER_VENDOR_XEN 'X' + /* * capabilities of CPUs */ @@ -354,7 +359,7 @@ struct i387_soft_struct { u8 no_update; u8 rm; u8 alimit; - struct info *info; + struct math_emu_info *info; u32 entry_eip; }; @@ -695,6 +700,19 @@ extern void switch_to_new_gdt(void); extern void cpu_init(void); extern void init_gdt(int cpu); +static inline unsigned long get_debugctlmsr(void) +{ + unsigned long debugctlmsr = 0; + +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return 0; +#endif + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); + + return debugctlmsr; +} + static inline void update_debugctlmsr(unsigned long debugctlmsr) { #ifndef CONFIG_X86_DEBUGCTLMSR --- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/smp.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/include/mach-xen/asm/smp.h 2010-03-24 15:17:58.000000000 +0100 @@ -18,9 +18,26 @@ #include #include +#ifdef CONFIG_X86_64 + +#define cpu_callin_mask cpu_possible_mask +#define cpu_callout_mask cpu_possible_mask +extern cpumask_var_t cpu_initialized_mask; +extern cpumask_var_t cpu_sibling_setup_mask; + +#else /* CONFIG_X86_32 */ + +#define cpu_callin_map cpu_possible_map #define cpu_callout_map cpu_possible_map extern cpumask_t cpu_initialized; -#define cpu_callin_map cpu_possible_map +extern cpumask_t cpu_sibling_setup_map; + +#define cpu_callin_mask ((struct cpumask *)&cpu_callin_map) +#define cpu_callout_mask ((struct cpumask *)&cpu_callout_map) +#define cpu_initialized_mask ((struct cpumask *)&cpu_initialized) +#define cpu_sibling_setup_mask ((struct cpumask *)&cpu_sibling_setup_map) + +#endif /* CONFIG_X86_32 */ extern void (*mtrr_hook)(void); extern void zap_low_mappings(void); @@ -29,7 +46,6 @@ extern int __cpuinit get_local_pda(int c extern int smp_num_siblings; extern unsigned int num_processors; -extern cpumask_t cpu_initialized; DECLARE_PER_CPU(cpumask_t, cpu_sibling_map); DECLARE_PER_CPU(cpumask_t, cpu_core_map); @@ -38,6 +54,16 @@ DECLARE_PER_CPU(u16, cpu_llc_id); DECLARE_PER_CPU(int, cpu_number); #endif +static inline struct cpumask *cpu_sibling_mask(int cpu) +{ + return &per_cpu(cpu_sibling_map, cpu); +} + +static inline struct cpumask *cpu_core_mask(int cpu) +{ + return &per_cpu(cpu_core_map, cpu); +} + DECLARE_PER_CPU(u16, x86_cpu_to_apicid); DECLARE_PER_CPU(u16, x86_bios_cpu_apicid); @@ -64,7 +90,7 @@ struct smp_ops { void (*cpu_die)(unsigned int cpu); void (*play_dead)(void); - void (*send_call_func_ipi)(cpumask_t mask); + void (*send_call_func_ipi)(const struct cpumask *mask); void (*send_call_func_single_ipi)(int cpu); }; @@ -125,7 +151,7 @@ static inline void arch_send_call_functi static inline void 
arch_send_call_function_ipi(cpumask_t mask) { - smp_ops.send_call_func_ipi(mask); + smp_ops.send_call_func_ipi(&mask); } void cpu_disable_common(void); @@ -144,13 +170,13 @@ extern int __cpu_disable(void); extern void __cpu_die(unsigned int cpu); void xen_smp_send_stop(void); void xen_smp_send_reschedule(int cpu); -void xen_send_call_func_ipi(cpumask_t mask); +void xen_send_call_func_ipi(const struct cpumask *mask); void xen_send_call_func_single_ipi(int cpu); #define smp_send_stop xen_smp_send_stop #define smp_send_reschedule xen_smp_send_reschedule #define arch_send_call_function_single_ipi xen_send_call_func_single_ipi -#define arch_send_call_function_ipi xen_send_call_func_ipi +#define arch_send_call_function_ipi(m) xen_send_call_func_ipi(&(m)) void play_dead(void); @@ -164,7 +190,7 @@ void smp_store_cpu_info(int id); /* We don't mark CPUs online until __cpu_up(), so we need another measure */ static inline int num_booting_cpus(void) { - return cpus_weight(cpu_callout_map); + return cpumask_weight(cpu_callout_mask); } #else static inline void prefill_possible_map(void) --- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/spinlock.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/include/mach-xen/asm/spinlock.h 2010-03-24 15:17:58.000000000 +0100 @@ -337,6 +337,7 @@ static inline int __raw_spin_is_contende { return __raw_spin(is_contended)(lock); } +#define __raw_spin_is_contended __raw_spin_is_contended static __always_inline void __raw_spin_lock(raw_spinlock_t *lock) { --- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/system.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/include/mach-xen/asm/system.h 2010-03-24 15:17:58.000000000 +0100 @@ -18,12 +18,12 @@ # define AT_VECTOR_SIZE_ARCH 1 #endif -#ifdef CONFIG_X86_32 - struct task_struct; /* one of the stranger aspects of C forward declarations */ struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *next); +#ifdef CONFIG_X86_32 + /* * Saving eflags is important. It switches not only IOPL between tasks, * it also protects other tasks from NT leaking through sysenter etc. @@ -298,6 +298,8 @@ extern void free_init_pages(char *what, void xen_idle(void); +void stop_this_cpu(void *dummy); + /* * Force strict CPU ordering. 
* And yes, this is required on UP too when we're talking --- head-2010-04-29.orig/arch/x86/kernel/acpi/sleep-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/acpi/sleep-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -163,6 +163,8 @@ static int __init acpi_sleep_setup(char #ifdef CONFIG_HIBERNATION if (strncmp(str, "s4_nohwsig", 10) == 0) acpi_no_s4_hw_signature(); + if (strncmp(str, "s4_nonvs", 8) == 0) + acpi_s4_no_nvs(); #endif if (strncmp(str, "old_ordering", 12) == 0) acpi_old_suspend_ordering(); --- head-2010-04-29.orig/arch/x86/kernel/apic/apic-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/apic/apic-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -32,7 +32,7 @@ static int __init apic_set_verbosity(cha else if (strcmp("verbose", arg) == 0) apic_verbosity = APIC_VERBOSE; else { - printk(KERN_WARNING "APIC Verbosity level %s not recognised" + pr_warning("APIC Verbosity level %s not recognised" " use apic=verbose or apic=debug\n", arg); return -EINVAL; } --- head-2010-04-29.orig/arch/x86/kernel/cpu/Makefile 2010-04-29 09:29:49.000000000 +0200 +++ head-2010-04-29/arch/x86/kernel/cpu/Makefile 2010-03-24 15:17:58.000000000 +0100 @@ -34,6 +34,8 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o +disabled-obj-$(CONFIG_XEN) := hypervisor.o vmware.o + quiet_cmd_mkcapflags = MKCAP $@ cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ --- head-2010-04-29.orig/arch/x86/kernel/cpu/common-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/cpu/common-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -38,17 +38,41 @@ #include #include #include +#include #ifdef CONFIG_XEN #if defined(CONFIG_X86_32) && !defined(CONFIG_X86_LOCAL_APIC) #define phys_pkg_id(a,b) a #endif -#include #include #endif #include "cpu.h" +#ifdef CONFIG_X86_64 + +/* all of these masks are initialized in setup_cpu_local_masks() */ +#ifndef CONFIG_XEN +cpumask_var_t cpu_callin_mask; +cpumask_var_t cpu_callout_mask; +#endif +cpumask_var_t cpu_initialized_mask; + +/* representing cpus for which sibling maps can be computed */ +cpumask_var_t cpu_sibling_setup_mask; + +#else /* CONFIG_X86_32 */ + +#ifndef CONFIG_XEN +cpumask_t cpu_callin_map; +cpumask_t cpu_callout_map; +#endif +cpumask_t cpu_initialized; +cpumask_t cpu_sibling_setup_map; + +#endif /* CONFIG_X86_32 */ + + static struct cpu_dev *this_cpu __cpuinitdata; #ifdef CONFIG_X86_64 @@ -377,7 +401,7 @@ void __cpuinit detect_ht(struct cpuinfo_ printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); } else if (smp_num_siblings > 1) { - if (smp_num_siblings > NR_CPUS) { + if (smp_num_siblings > nr_cpu_ids) { printk(KERN_WARNING "CPU: Unsupported number of siblings %d", smp_num_siblings); smp_num_siblings = 1; @@ -728,6 +752,7 @@ static void __cpuinit identify_cpu(struc detect_ht(c); #endif + init_hypervisor(c); /* * On SMP, boot_cpu_data holds the common feature set between * all CPUs; so make sure that we indicate which features are @@ -879,8 +904,6 @@ static __init int setup_disablecpuid(cha } __setup("clearcpuid=", setup_disablecpuid); -cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; - #ifdef CONFIG_X86_64 struct x8664_pda **_cpu_pda __read_mostly; EXPORT_SYMBOL(_cpu_pda); @@ -889,7 +912,7 @@ EXPORT_SYMBOL(_cpu_pda); struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; #endif -char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; +static char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; static void __ref switch_pt(int cpu) { 
@@ -949,8 +972,8 @@ void __cpuinit pda_init(int cpu) } #ifndef CONFIG_X86_NO_TSS -char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + - DEBUG_STKSZ] __page_aligned_bss; +static char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + + DEBUG_STKSZ] __page_aligned_bss; #endif extern asmlinkage void ignore_sysret(void); @@ -1038,7 +1061,7 @@ void __cpuinit cpu_init(void) me = current; - if (cpu_test_and_set(cpu, cpu_initialized)) + if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) panic("CPU#%d already initialized!\n", cpu); printk(KERN_INFO "Initializing CPU#%d\n", cpu); @@ -1163,7 +1186,7 @@ void __cpuinit cpu_init(void) #endif struct thread_struct *thread = &curr->thread; - if (cpu_test_and_set(cpu, cpu_initialized)) { + if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) { printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); for (;;) local_irq_enable(); } --- head-2010-04-29.orig/arch/x86/kernel/cpu/mtrr/main-xen.c 2010-03-24 15:12:36.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/cpu/mtrr/main-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -33,7 +33,7 @@ struct mtrr_ops generic_mtrr_ops = { struct mtrr_ops *mtrr_if = &generic_mtrr_ops; unsigned int num_var_ranges; -unsigned int mtrr_usage_table[MAX_VAR_RANGES]; +unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; static u64 tom2; --- head-2010-04-29.orig/arch/x86/kernel/e820-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/e820-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -719,6 +719,27 @@ void __init e820_mark_nosave_regions(uns } } #endif + +#ifdef CONFIG_HIBERNATION +/** + * Mark ACPI NVS memory region, so that we can save/restore it during + * hibernation and the subsequent resume. + */ +static int __init e820_mark_nvs_memory(void) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (ei->type == E820_NVS) + hibernate_nvs_register(ei->addr, ei->size); + } + + return 0; +} +core_initcall(e820_mark_nvs_memory); +#endif #endif /* @@ -734,22 +755,6 @@ struct early_res { static struct early_res early_res[MAX_EARLY_RES] __initdata = { #ifndef CONFIG_XEN { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */ -#if defined(CONFIG_X86_64) && defined(CONFIG_X86_TRAMPOLINE) - { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" }, -#endif -#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) - /* - * But first pinch a few for the stack/trampoline stuff - * FIXME: Don't need the extra page at 4K, but need to fix - * trampoline before removing it. (see the GDT stuff) - */ - { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE" }, - /* - * Has to be in very low memory so we can execute - * real-mode AP code. - */ - { TRAMPOLINE_BASE, TRAMPOLINE_BASE + PAGE_SIZE, "TRAMPOLINE" }, -#endif #endif {} }; --- head-2010-04-29.orig/arch/x86/kernel/early_printk-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/early_printk-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -904,49 +904,6 @@ static struct console early_dbgp_console }; #endif -/* Console interface to a host file on AMD's SimNow! 
*/ - -static int simnow_fd; - -enum { - MAGIC1 = 0xBACCD00A, - MAGIC2 = 0xCA110000, - XOPEN = 5, - XWRITE = 4, -}; - -static noinline long simnow(long cmd, long a, long b, long c) -{ - long ret; - - asm volatile("cpuid" : - "=a" (ret) : - "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2)); - return ret; -} - -static void __init simnow_init(char *str) -{ - char *fn = "klog"; - - if (*str == '=') - fn = ++str; - /* error ignored */ - simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644); -} - -static void simnow_write(struct console *con, const char *s, unsigned n) -{ - simnow(XWRITE, simnow_fd, (unsigned long)s, n); -} - -static struct console simnow_console = { - .name = "simnow", - .write = simnow_write, - .flags = CON_PRINTBUFFER, - .index = -1, -}; - /* Direct interface for emergencies */ static struct console *early_console = &early_vga_console; static int __initdata early_console_initialized; @@ -958,7 +915,7 @@ asmlinkage void early_printk(const char va_list ap; va_start(ap, fmt); - n = vscnprintf(buf, 512, fmt, ap); + n = vscnprintf(buf, sizeof(buf), fmt, ap); early_console->write(early_console, buf, n); va_end(ap); } @@ -991,10 +948,6 @@ static int __init setup_early_printk(cha current_ypos = boot_params.screen_info.orig_y; #endif early_console = &early_vga_console; - } else if (!strncmp(buf, "simnow", 6)) { - simnow_init(buf + 6); - early_console = &simnow_console; - keep_early = 1; #ifdef CONFIG_EARLY_PRINTK_DBGP } else if (!strncmp(buf, "dbgp", 4)) { if (early_dbgp_init(buf+4) < 0) --- head-2010-04-29.orig/arch/x86/kernel/entry_32-xen.S 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/entry_32-xen.S 2010-03-24 15:17:58.000000000 +0100 @@ -690,28 +690,37 @@ END(syscall_badsys) 27:; /* - * Build the entry stubs and pointer table with - * some assembler magic. + * Build the entry stubs and pointer table with some assembler magic. + * We pack 7 stubs into a single 32-byte chunk, which will fit in a + * single cache line on all modern x86 implementations. 
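
[Editor's note: the stub loop below pushes $(~vector+0x80) so that every immediate fits in a signed byte, i.e. a two-byte pushl; that size budget is what lets seven stubs share a 32-byte chunk. common_interrupt then undoes the bias with addl $-0x80,(%esp), recovering ~vector in the [-256,-1] range. A stand-alone check of that round-trip, an editorial sketch rather than part of the patch:]

    /* editor's sketch: verify the push/adjust encoding round-trip */
    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            for (int vector = 0; vector < 256; vector++) {
                    /* 127 - vector is in [-128,127], so it fits imm8 */
                    int32_t pushed = (int8_t)(~vector + 0x80);
                    /* what addl $-0x80,(%esp) leaves on the stack */
                    int32_t on_stack = pushed - 0x80;
                    assert(on_stack == ~vector);
                    assert(on_stack >= -256 && on_stack <= -1);
            }
            return 0;
    }
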
*/ -.section .rodata,"a" +.section .init.rodata,"a" ENTRY(interrupt) .text - + .p2align 5 + .p2align CONFIG_X86_L1_CACHE_SHIFT ENTRY(irq_entries_start) RING0_INT_FRAME -vector=0 -.rept NR_VECTORS - ALIGN - .if vector +vector=FIRST_EXTERNAL_VECTOR +.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7 + .balign 32 + .rept 7 + .if vector < NR_VECTORS + .if vector <> FIRST_EXTERNAL_VECTOR CFI_ADJUST_CFA_OFFSET -4 - .endif -1: pushl $~(vector) + .endif +1: pushl $(~vector+0x80) /* Note: always in signed byte range */ CFI_ADJUST_CFA_OFFSET 4 - jmp common_interrupt - .previous + .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 + jmp 2f + .endif + .previous .long 1b - .text + .text vector=vector+1 + .endif + .endr +2: jmp common_interrupt .endr END(irq_entries_start) @@ -723,8 +732,9 @@ END(interrupt) * the CPU automatically disables interrupts when executing an IRQ vector, * so IRQ-flags tracing has to follow that: */ - ALIGN + .p2align CONFIG_X86_L1_CACHE_SHIFT common_interrupt: + addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */ SAVE_ALL TRACE_IRQS_OFF movl %esp,%eax @@ -751,68 +761,7 @@ ENDPROC(name) #else #define UNWIND_ESPFIX_STACK -#endif - -KPROBE_ENTRY(page_fault) - RING0_EC_FRAME - pushl $do_page_fault - CFI_ADJUST_CFA_OFFSET 4 - ALIGN -error_code: - /* the function address is in %fs's slot on the stack */ - pushl %es - CFI_ADJUST_CFA_OFFSET 4 - /*CFI_REL_OFFSET es, 0*/ - pushl %ds - CFI_ADJUST_CFA_OFFSET 4 - /*CFI_REL_OFFSET ds, 0*/ - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET eax, 0 - pushl %ebp - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ebp, 0 - pushl %edi - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET edi, 0 - pushl %esi - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET esi, 0 - pushl %edx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET edx, 0 - pushl %ecx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ecx, 0 - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ebx, 0 - cld - pushl %fs - CFI_ADJUST_CFA_OFFSET 4 - /*CFI_REL_OFFSET fs, 0*/ - movl $(__KERNEL_PERCPU), %ecx - movl %ecx, %fs - UNWIND_ESPFIX_STACK - popl %ecx - CFI_ADJUST_CFA_OFFSET -4 - /*CFI_REGISTER es, ecx*/ - movl PT_FS(%esp), %edi # get the function address - movl PT_ORIG_EAX(%esp), %edx # get the error code - movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart - mov %ecx, PT_FS(%esp) - /*CFI_REL_OFFSET fs, ES*/ - movl $(__USER_DS), %ecx - movl %ecx, %ds - movl %ecx, %es - TRACE_IRQS_OFF - movl %esp,%eax # pt_regs pointer - call *%edi - jmp ret_from_exception - CFI_ENDPROC -KPROBE_END(page_fault) -#ifdef CONFIG_XEN # A note on the "critical region" in our callback handler. # We want to avoid stacking callback handlers due to events occurring # during handling of the last event. To do this, we keep events disabled @@ -981,158 +930,6 @@ ENTRY(device_not_available) CFI_ENDPROC END(device_not_available) -#ifndef CONFIG_XEN -/* - * Debug traps and NMI can happen at the one SYSENTER instruction - * that sets up the real kernel stack. Check here, since we can't - * allow the wrong stack to be used. - * - * "SYSENTER_stack_sp0+12" is because the NMI/debug handler will have - * already pushed 3 words if it hits on the sysenter instruction: - * eflags, cs and eip. - * - * We just load the right stack, and push the three (known) values - * by hand onto the new stack - while updating the return eip past - * the instruction that would have done it for sysenter. 
- */ -#define FIX_STACK(offset, ok, label) \ - cmpw $__KERNEL_CS,4(%esp); \ - jne ok; \ -label: \ - movl SYSENTER_stack_sp0+offset(%esp),%esp; \ - CFI_DEF_CFA esp, 0; \ - CFI_UNDEFINED eip; \ - pushfl; \ - CFI_ADJUST_CFA_OFFSET 4; \ - pushl $__KERNEL_CS; \ - CFI_ADJUST_CFA_OFFSET 4; \ - pushl $sysenter_past_esp; \ - CFI_ADJUST_CFA_OFFSET 4; \ - CFI_REL_OFFSET eip, 0 -#endif /* CONFIG_XEN */ - -KPROBE_ENTRY(debug) - RING0_INT_FRAME -#ifndef CONFIG_XEN - cmpl $ia32_sysenter_target,(%esp) - jne debug_stack_correct - FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) -debug_stack_correct: -#endif /* !CONFIG_XEN */ - pushl $-1 # mark this as an int - CFI_ADJUST_CFA_OFFSET 4 - SAVE_ALL - TRACE_IRQS_OFF - xorl %edx,%edx # error code 0 - movl %esp,%eax # pt_regs pointer - call do_debug - jmp ret_from_exception - CFI_ENDPROC -KPROBE_END(debug) - -#ifndef CONFIG_XEN -/* - * NMI is doubly nasty. It can happen _while_ we're handling - * a debug fault, and the debug fault hasn't yet been able to - * clear up the stack. So we first check whether we got an - * NMI on the sysenter entry path, but after that we need to - * check whether we got an NMI on the debug path where the debug - * fault happened on the sysenter path. - */ -KPROBE_ENTRY(nmi) - RING0_INT_FRAME - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 - movl %ss, %eax - cmpw $__ESPFIX_SS, %ax - popl %eax - CFI_ADJUST_CFA_OFFSET -4 - je nmi_espfix_stack - cmpl $ia32_sysenter_target,(%esp) - je nmi_stack_fixup - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 - movl %esp,%eax - /* Do not access memory above the end of our stack page, - * it might not exist. - */ - andl $(THREAD_SIZE-1),%eax - cmpl $(THREAD_SIZE-20),%eax - popl %eax - CFI_ADJUST_CFA_OFFSET -4 - jae nmi_stack_correct - cmpl $ia32_sysenter_target,12(%esp) - je nmi_debug_stack_check -nmi_stack_correct: - /* We have a RING0_INT_FRAME here */ - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 - SAVE_ALL - TRACE_IRQS_OFF - xorl %edx,%edx # zero error code - movl %esp,%eax # pt_regs pointer - call do_nmi - jmp restore_nocheck_notrace - CFI_ENDPROC - -nmi_stack_fixup: - RING0_INT_FRAME - FIX_STACK(12,nmi_stack_correct, 1) - jmp nmi_stack_correct - -nmi_debug_stack_check: - /* We have a RING0_INT_FRAME here */ - cmpw $__KERNEL_CS,16(%esp) - jne nmi_stack_correct - cmpl $debug,(%esp) - jb nmi_stack_correct - cmpl $debug_esp_fix_insn,(%esp) - ja nmi_stack_correct - FIX_STACK(24,nmi_stack_correct, 1) - jmp nmi_stack_correct - -nmi_espfix_stack: - /* We have a RING0_INT_FRAME here. 
- * - * create the pointer to lss back - */ - pushl %ss - CFI_ADJUST_CFA_OFFSET 4 - pushl %esp - CFI_ADJUST_CFA_OFFSET 4 - addw $4, (%esp) - /* copy the iret frame of 12 bytes */ - .rept 3 - pushl 16(%esp) - CFI_ADJUST_CFA_OFFSET 4 - .endr - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 - SAVE_ALL - TRACE_IRQS_OFF - FIXUP_ESPFIX_STACK # %eax == %esp - xorl %edx,%edx # zero error code - call do_nmi - RESTORE_REGS - lss 12+4(%esp), %esp # back to espfix stack - CFI_ADJUST_CFA_OFFSET -24 - jmp irq_return - CFI_ENDPROC -#else -KPROBE_ENTRY(nmi) - RING0_INT_FRAME - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 - SAVE_ALL - xorl %edx,%edx # zero error code - movl %esp,%eax # pt_regs pointer - call do_nmi - orl $NMI_MASK, PT_EFLAGS(%esp) - jmp restore_all - CFI_ENDPROC -#endif -KPROBE_END(nmi) - #ifdef CONFIG_PARAVIRT ENTRY(native_iret) iret @@ -1148,19 +945,6 @@ ENTRY(native_irq_enable_sysexit) END(native_irq_enable_sysexit) #endif -KPROBE_ENTRY(int3) - RING0_INT_FRAME - pushl $-1 # mark this as an int - CFI_ADJUST_CFA_OFFSET 4 - SAVE_ALL - TRACE_IRQS_OFF - xorl %edx,%edx # zero error code - movl %esp,%eax # pt_regs pointer - call do_int3 - jmp ret_from_exception - CFI_ENDPROC -KPROBE_END(int3) - ENTRY(overflow) RING0_INT_FRAME pushl $0 @@ -1225,14 +1009,6 @@ ENTRY(stack_segment) CFI_ENDPROC END(stack_segment) -KPROBE_ENTRY(general_protection) - RING0_EC_FRAME - pushl $do_general_protection - CFI_ADJUST_CFA_OFFSET 4 - jmp error_code - CFI_ENDPROC -KPROBE_END(general_protection) - ENTRY(alignment_check) RING0_EC_FRAME pushl $do_alignment_check @@ -1292,6 +1068,7 @@ ENTRY(kernel_thread_helper) push %eax CFI_ADJUST_CFA_OFFSET 4 call do_exit + ud2 # padding for call trace CFI_ENDPROC ENDPROC(kernel_thread_helper) @@ -1303,6 +1080,9 @@ ENTRY(mcount) END(mcount) ENTRY(ftrace_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + pushl %eax pushl %ecx pushl %edx @@ -1317,6 +1097,11 @@ ftrace_call: popl %edx popl %ecx popl %eax +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + jmp ftrace_stub +#endif .globl ftrace_stub ftrace_stub: @@ -1326,8 +1111,18 @@ END(ftrace_caller) #else /* ! 
CONFIG_DYNAMIC_FTRACE */ ENTRY(mcount) + cmpl $0, function_trace_stop + jne ftrace_stub + cmpl $ftrace_stub, ftrace_trace_function jnz trace +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpl $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller + + cmpl $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller +#endif .globl ftrace_stub ftrace_stub: ret @@ -1346,12 +1141,43 @@ trace: popl %edx popl %ecx popl %eax - jmp ftrace_stub END(mcount) #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* CONFIG_FUNCTION_TRACER */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %edx + lea 0x4(%ebp), %eax + subl $MCOUNT_INSN_SIZE, %edx + call prepare_ftrace_return + popl %edx + popl %ecx + popl %eax + ret +END(ftrace_graph_caller) + +.globl return_to_handler +return_to_handler: + pushl $0 + pushl %eax + pushl %ecx + pushl %edx + call ftrace_return_to_handler + movl %eax, 0xc(%esp) + popl %edx + popl %ecx + popl %eax + ret +#endif + #include # pv syscall call handler stub @@ -1485,3 +1311,238 @@ mask=0 #undef sys_fork #undef sys_clone #undef sys_vfork + +/* + * Some functions should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" + +ENTRY(page_fault) + RING0_EC_FRAME + pushl $do_page_fault + CFI_ADJUST_CFA_OFFSET 4 + ALIGN +error_code: + /* the function address is in %fs's slot on the stack */ + pushl %es + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET es, 0*/ + pushl %ds + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET ds, 0*/ + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET eax, 0 + pushl %ebp + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebp, 0 + pushl %edi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edi, 0 + pushl %esi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esi, 0 + pushl %edx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edx, 0 + pushl %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx, 0 + pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebx, 0 + cld + pushl %fs + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET fs, 0*/ + movl $(__KERNEL_PERCPU), %ecx + movl %ecx, %fs + UNWIND_ESPFIX_STACK + popl %ecx + CFI_ADJUST_CFA_OFFSET -4 + /*CFI_REGISTER es, ecx*/ + movl PT_FS(%esp), %edi # get the function address + movl PT_ORIG_EAX(%esp), %edx # get the error code + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart + mov %ecx, PT_FS(%esp) + /*CFI_REL_OFFSET fs, ES*/ + movl $(__USER_DS), %ecx + movl %ecx, %ds + movl %ecx, %es + TRACE_IRQS_OFF + movl %esp,%eax # pt_regs pointer + call *%edi + jmp ret_from_exception + CFI_ENDPROC +END(page_fault) + +#ifndef CONFIG_XEN +/* + * Debug traps and NMI can happen at the one SYSENTER instruction + * that sets up the real kernel stack. Check here, since we can't + * allow the wrong stack to be used. + * + * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have + * already pushed 3 words if it hits on the sysenter instruction: + * eflags, cs and eip. + * + * We just load the right stack, and push the three (known) values + * by hand onto the new stack - while updating the return eip past + * the instruction that would have done it for sysenter. 
+ */ +#define FIX_STACK(offset, ok, label) \ + cmpw $__KERNEL_CS,4(%esp); \ + jne ok; \ +label: \ + movl TSS_sysenter_sp0+offset(%esp),%esp; \ + CFI_DEF_CFA esp, 0; \ + CFI_UNDEFINED eip; \ + pushfl; \ + CFI_ADJUST_CFA_OFFSET 4; \ + pushl $__KERNEL_CS; \ + CFI_ADJUST_CFA_OFFSET 4; \ + pushl $sysenter_past_esp; \ + CFI_ADJUST_CFA_OFFSET 4; \ + CFI_REL_OFFSET eip, 0 +#endif /* CONFIG_XEN */ + +ENTRY(debug) + RING0_INT_FRAME +#ifndef CONFIG_XEN + cmpl $ia32_sysenter_target,(%esp) + jne debug_stack_correct + FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) +debug_stack_correct: +#endif /* !CONFIG_XEN */ + pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx,%edx # error code 0 + movl %esp,%eax # pt_regs pointer + call do_debug + jmp ret_from_exception + CFI_ENDPROC +END(debug) + +/* + * NMI is doubly nasty. It can happen _while_ we're handling + * a debug fault, and the debug fault hasn't yet been able to + * clear up the stack. So we first check whether we got an + * NMI on the sysenter entry path, but after that we need to + * check whether we got an NMI on the debug path where the debug + * fault happened on the sysenter path. + */ +ENTRY(nmi) + RING0_INT_FRAME + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 +#ifndef CONFIG_XEN + movl %ss, %eax + cmpw $__ESPFIX_SS, %ax + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + je nmi_espfix_stack + cmpl $ia32_sysenter_target,(%esp) + je nmi_stack_fixup + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + movl %esp,%eax + /* Do not access memory above the end of our stack page, + * it might not exist. + */ + andl $(THREAD_SIZE-1),%eax + cmpl $(THREAD_SIZE-20),%eax + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + jae nmi_stack_correct + cmpl $ia32_sysenter_target,12(%esp) + je nmi_debug_stack_check +nmi_stack_correct: + /* We have a RING0_INT_FRAME here */ + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_nmi + jmp restore_nocheck_notrace + CFI_ENDPROC + +nmi_stack_fixup: + RING0_INT_FRAME + FIX_STACK(12,nmi_stack_correct, 1) + jmp nmi_stack_correct + +nmi_debug_stack_check: + /* We have a RING0_INT_FRAME here */ + cmpw $__KERNEL_CS,16(%esp) + jne nmi_stack_correct + cmpl $debug,(%esp) + jb nmi_stack_correct + cmpl $debug_esp_fix_insn,(%esp) + ja nmi_stack_correct + FIX_STACK(24,nmi_stack_correct, 1) + jmp nmi_stack_correct + +nmi_espfix_stack: + /* We have a RING0_INT_FRAME here. 
+ * + * create the pointer to lss back + */ + pushl %ss + CFI_ADJUST_CFA_OFFSET 4 + pushl %esp + CFI_ADJUST_CFA_OFFSET 4 + addw $4, (%esp) + /* copy the iret frame of 12 bytes */ + .rept 3 + pushl 16(%esp) + CFI_ADJUST_CFA_OFFSET 4 + .endr + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + FIXUP_ESPFIX_STACK # %eax == %esp + xorl %edx,%edx # zero error code + call do_nmi + RESTORE_REGS + lss 12+4(%esp), %esp # back to espfix stack + CFI_ADJUST_CFA_OFFSET -24 + jmp irq_return +#else + SAVE_ALL + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_nmi + orl $NMI_MASK, PT_EFLAGS(%esp) + jmp restore_all +#endif + CFI_ENDPROC +END(nmi) + +ENTRY(int3) + RING0_INT_FRAME + pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_int3 + jmp ret_from_exception + CFI_ENDPROC +END(int3) + +ENTRY(general_protection) + RING0_EC_FRAME + pushl $do_general_protection + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(general_protection) + +/* + * End of kprobes section + */ + .popsection --- head-2010-04-29.orig/arch/x86/kernel/entry_64-xen.S 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/entry_64-xen.S 2010-03-24 15:17:58.000000000 +0100 @@ -14,15 +14,15 @@ * * NOTE: This code handles signal-recognition, which happens every time * after an interrupt and after each system call. - * - * Normal syscalls and interrupts don't save a full stack frame, this is + * + * Normal syscalls and interrupts don't save a full stack frame, this is * only done for syscall tracing, signals or fork/exec et.al. - * - * A note on terminology: - * - top of stack: Architecture defined interrupt frame from SS to RIP - * at the top of the kernel process stack. + * + * A note on terminology: + * - top of stack: Architecture defined interrupt frame from SS to RIP + * at the top of the kernel process stack. * - partial stack frame: partially saved registers upto R11. - * - full stack frame: Like partial stack frame, but all register saved. + * - full stack frame: Like partial stack frame, but all register saved. * * Some macro usage: * - CFI macros are used to generate dwarf2 unwind information for better @@ -65,7 +65,6 @@ #define __AUDIT_ARCH_LE 0x40000000 .code64 - #ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE ENTRY(mcount) @@ -73,16 +72,10 @@ ENTRY(mcount) END(mcount) ENTRY(ftrace_caller) + cmpl $0, function_trace_stop + jne ftrace_stub - /* taken from glibc */ - subq $0x38, %rsp - movq %rax, (%rsp) - movq %rcx, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 24(%rsp) - movq %rdi, 32(%rsp) - movq %r8, 40(%rsp) - movq %r9, 48(%rsp) + MCOUNT_SAVE_FRAME movq 0x38(%rsp), %rdi movq 8(%rbp), %rsi @@ -92,14 +85,13 @@ ENTRY(ftrace_caller) ftrace_call: call ftrace_stub - movq 48(%rsp), %r9 - movq 40(%rsp), %r8 - movq 32(%rsp), %rdi - movq 24(%rsp), %rsi - movq 16(%rsp), %rdx - movq 8(%rsp), %rcx - movq (%rsp), %rax - addq $0x38, %rsp + MCOUNT_RESTORE_FRAME + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + jmp ftrace_stub +#endif .globl ftrace_stub ftrace_stub: @@ -108,15 +100,63 @@ END(ftrace_caller) #else /* ! 
CONFIG_DYNAMIC_FTRACE */ ENTRY(mcount) + cmpl $0, function_trace_stop + jne ftrace_stub + cmpq $ftrace_stub, ftrace_trace_function jnz trace + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpq $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller + + cmpq $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller +#endif + .globl ftrace_stub ftrace_stub: retq trace: - /* taken from glibc */ - subq $0x38, %rsp + MCOUNT_SAVE_FRAME + + movq 0x38(%rsp), %rdi + movq 8(%rbp), %rsi + subq $MCOUNT_INSN_SIZE, %rdi + + call *ftrace_trace_function + + MCOUNT_RESTORE_FRAME + + jmp ftrace_stub +END(mcount) +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + + MCOUNT_SAVE_FRAME + + leaq 8(%rbp), %rdi + movq 0x38(%rsp), %rsi + subq $MCOUNT_INSN_SIZE, %rsi + + call prepare_ftrace_return + + MCOUNT_RESTORE_FRAME + + retq +END(ftrace_graph_caller) + + +.globl return_to_handler +return_to_handler: + subq $80, %rsp + movq %rax, (%rsp) movq %rcx, 8(%rsp) movq %rdx, 16(%rsp) @@ -124,13 +164,14 @@ trace: movq %rdi, 32(%rsp) movq %r8, 40(%rsp) movq %r9, 48(%rsp) + movq %r10, 56(%rsp) + movq %r11, 64(%rsp) - movq 0x38(%rsp), %rdi - movq 8(%rbp), %rsi - subq $MCOUNT_INSN_SIZE, %rdi - - call *ftrace_trace_function + call ftrace_return_to_handler + movq %rax, 72(%rsp) + movq 64(%rsp), %r11 + movq 56(%rsp), %r10 movq 48(%rsp), %r9 movq 40(%rsp), %r8 movq 32(%rsp), %rdi @@ -138,16 +179,14 @@ trace: movq 16(%rsp), %rdx movq 8(%rsp), %rcx movq (%rsp), %rax - addq $0x38, %rsp + addq $72, %rsp + retq +#endif - jmp ftrace_stub -END(mcount) -#endif /* CONFIG_DYNAMIC_FTRACE */ -#endif /* CONFIG_FUNCTION_TRACER */ #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args -#endif +#endif .macro TRACE_IRQS_IRETQ offset=ARGOFFSET @@ -162,20 +201,20 @@ END(mcount) NMI_MASK = 0x80000000 /* - * C code is not supposed to know about undefined top of stack. Every time - * a C function with an pt_regs argument is called from the SYSCALL based + * C code is not supposed to know about undefined top of stack. Every time + * a C function with an pt_regs argument is called from the SYSCALL based * fast path FIXUP_TOP_OF_STACK is needed. * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs * manipulation. 
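
[Editor's note: on the SYSCALL fast path the hardware never writes the CS and RCX slots of the partial frame, so C code handed a pt_regs would see stale values there. What the FIXUP_TOP_OF_STACK macro below stores, modeled in C; the field layout and selector value are illustrative, not authoritative:]

    /* editor's sketch, not kernel code */
    #define SKETCH_USER_CS 0x33UL   /* illustrative user code selector */

    struct pt_regs_sketch {
            unsigned long rcx;      /* SYSCALL left the return RIP here */
            unsigned long cs;       /* undefined on the fast path */
    };

    static void fixup_top_of_stack(struct pt_regs_sketch *regs)
    {
            regs->cs  = SKETCH_USER_CS; /* movq $__USER_CS,CS+\offset(%rsp) */
            regs->rcx = -1UL;           /* movq $-1,RCX+\offset(%rsp) */
    }
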
- */ - - /* %rsp:at FRAMEEND */ - .macro FIXUP_TOP_OF_STACK tmp - movq $__USER_CS,CS(%rsp) - movq $-1,RCX(%rsp) + */ + + /* %rsp:at FRAMEEND */ + .macro FIXUP_TOP_OF_STACK tmp offset=0 + movq $__USER_CS,CS+\offset(%rsp) + movq $-1,RCX+\offset(%rsp) .endm - .macro RESTORE_TOP_OF_STACK tmp,offset=0 + .macro RESTORE_TOP_OF_STACK tmp offset=0 .endm .macro FAKE_STACK_FRAME child_rip @@ -187,7 +226,7 @@ NMI_MASK = 0x80000000 pushq %rax /* rsp */ CFI_ADJUST_CFA_OFFSET 8 CFI_REL_OFFSET rsp,0 - pushq $(1<<9) /* eflags - interrupts on */ + pushq $X86_EFLAGS_IF /* eflags - interrupts on */ CFI_ADJUST_CFA_OFFSET 8 /*CFI_REL_OFFSET rflags,0*/ pushq $__KERNEL_CS /* cs */ @@ -205,36 +244,80 @@ NMI_MASK = 0x80000000 CFI_ADJUST_CFA_OFFSET -(6*8) .endm - .macro CFI_DEFAULT_STACK start=1,adj=0 +/* + * initial frame state for interrupts (and exceptions without error code) + */ + .macro EMPTY_FRAME start=1 offset=0 .if \start - CFI_STARTPROC simple + CFI_STARTPROC simple CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,SS+8 - \adj*ARGOFFSET + CFI_DEF_CFA rsp,8+\offset .else - CFI_DEF_CFA_OFFSET SS+8 - \adj*ARGOFFSET + CFI_DEF_CFA_OFFSET 8+\offset .endif - .if \adj == 0 - CFI_REL_OFFSET r15,R15 - CFI_REL_OFFSET r14,R14 - CFI_REL_OFFSET r13,R13 - CFI_REL_OFFSET r12,R12 - CFI_REL_OFFSET rbp,RBP - CFI_REL_OFFSET rbx,RBX + .endm + +/* + * initial frame state for syscall + */ + .macro BASIC_FRAME start=1 offset=0 + EMPTY_FRAME \start, SS+8+\offset-RIP + /*CFI_REL_OFFSET ss, SS+\offset-RIP*/ + CFI_REL_OFFSET rsp, RSP+\offset-RIP + /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/ + /*CFI_REL_OFFSET cs, CS+\offset-RIP*/ + CFI_REL_OFFSET rip, RIP+\offset-RIP + .endm + +/* + * initial frame state for interrupts (and exceptions without error code) + */ + .macro INTR_FRAME start=1 offset=0 + .if \start == 1 + BASIC_FRAME 1, \offset+2*8 + CFI_REL_OFFSET rcx, 0+\offset + CFI_REL_OFFSET r11, 8+\offset + .else + BASIC_FRAME \start, \offset .endif - CFI_REL_OFFSET r11,R11 - \adj*ARGOFFSET - CFI_REL_OFFSET r10,R10 - \adj*ARGOFFSET - CFI_REL_OFFSET r9,R9 - \adj*ARGOFFSET - CFI_REL_OFFSET r8,R8 - \adj*ARGOFFSET - CFI_REL_OFFSET rax,RAX - \adj*ARGOFFSET - CFI_REL_OFFSET rcx,RCX - \adj*ARGOFFSET - CFI_REL_OFFSET rdx,RDX - \adj*ARGOFFSET - CFI_REL_OFFSET rsi,RSI - \adj*ARGOFFSET - CFI_REL_OFFSET rdi,RDI - \adj*ARGOFFSET - CFI_REL_OFFSET rip,RIP - \adj*ARGOFFSET - /*CFI_REL_OFFSET cs,CS - \adj*ARGOFFSET*/ - /*CFI_REL_OFFSET rflags,EFLAGS - \adj*ARGOFFSET*/ - CFI_REL_OFFSET rsp,RSP - \adj*ARGOFFSET - /*CFI_REL_OFFSET ss,SS - \adj*ARGOFFSET*/ + .endm + +/* + * initial frame state for exceptions with error code (and interrupts + * with vector already pushed) + */ + .macro XCPT_FRAME start=1 offset=0 + INTR_FRAME \start, RIP+\offset-ORIG_RAX + /*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/ + .endm + +/* + * frame that enables calling into C. + */ + .macro PARTIAL_FRAME start=1 offset=0 + XCPT_FRAME 2*\start, ORIG_RAX+\offset-ARGOFFSET + CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET + CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET + CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET + CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET + CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET + CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET + CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET + CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET + CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET + .endm + +/* + * frame that enables passing a complete pt_regs to a C function. 
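
[Editor's note: the new frame macros nest the same way the register save area nests, each layer annotating the registers the previous layer left unsaved. One way to picture the layering; this is an editorial illustration with abridged, assumed field order, not the kernel's pt_regs definition:]

    /* EMPTY_FRAME: CFA only, nothing saved yet */
    struct frame_basic {    /* BASIC_FRAME: hardware iret frame */
            unsigned long rip, cs, rflags, rsp, ss;
    };
    struct frame_partial {  /* PARTIAL_FRAME: caller-clobbered regs */
            unsigned long r11, r10, r9, r8, rax, rcx, rdx, rsi, rdi;
            unsigned long orig_rax;
            struct frame_basic hw;
    };
    struct frame_default {  /* DEFAULT_FRAME: adds callee-saved regs */
            unsigned long r15, r14, r13, r12, rbp, rbx;
            struct frame_partial rest;
    };
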
+ */ + .macro DEFAULT_FRAME start=1 offset=0 + PARTIAL_FRAME \start, R11+\offset-R15 + CFI_REL_OFFSET rbx, RBX+\offset + CFI_REL_OFFSET rbp, RBP+\offset + CFI_REL_OFFSET r12, R12+\offset + CFI_REL_OFFSET r13, R13+\offset + CFI_REL_OFFSET r14, R14+\offset + CFI_REL_OFFSET r15, R15+\offset .endm /* @@ -264,70 +347,149 @@ NMI_MASK = 0x80000000 jmp hypercall_page + (__HYPERVISOR_iret * 32) .endm +#ifndef CONFIG_XEN +/* save partial stack frame */ +ENTRY(save_args) + XCPT_FRAME + cld + movq_cfi rdi, RDI+16-ARGOFFSET + movq_cfi rsi, RSI+16-ARGOFFSET + movq_cfi rdx, RDX+16-ARGOFFSET + movq_cfi rcx, RCX+16-ARGOFFSET + movq_cfi rax, RAX+16-ARGOFFSET + movq_cfi r8, R8+16-ARGOFFSET + movq_cfi r9, R9+16-ARGOFFSET + movq_cfi r10, R10+16-ARGOFFSET + movq_cfi r11, R11+16-ARGOFFSET + + leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */ + movq_cfi rbp, 8 /* push %rbp */ + leaq 8(%rsp), %rbp /* mov %rsp, %ebp */ + testl $3, CS(%rdi) + je 1f + SWAPGS + /* + * irqcount is used to check if a CPU is already on an interrupt stack + * or not. While this is essentially redundant with preempt_count it is + * a little cheaper to use a separate counter in the PDA (short of + * moving irq_enter into assembly, which would be too much work) + */ +1: incl %gs:pda_irqcount + jne 2f + popq_cfi %rax /* move return address... */ + mov %gs:pda_irqstackptr,%rsp + EMPTY_FRAME 0 + pushq_cfi %rbp /* backlink for unwinder */ + pushq_cfi %rax /* ... to the new stack */ + /* + * We entered an interrupt context - irqs are off: + */ +2: TRACE_IRQS_OFF + ret + CFI_ENDPROC +END(save_args) +#endif + +ENTRY(save_rest) + PARTIAL_FRAME 1 REST_SKIP+8 + movq 5*8+16(%rsp), %r11 /* save return address */ + movq_cfi rbx, RBX+16 + movq_cfi rbp, RBP+16 + movq_cfi r12, R12+16 + movq_cfi r13, R13+16 + movq_cfi r14, R14+16 + movq_cfi r15, R15+16 + movq %r11, 8(%rsp) /* return address */ + FIXUP_TOP_OF_STACK %r11, 16 + ret + CFI_ENDPROC +END(save_rest) + +#ifndef CONFIG_XEN +/* save complete stack frame */ +ENTRY(save_paranoid) + XCPT_FRAME 1 RDI+8 + cld + movq_cfi rdi, RDI+8 + movq_cfi rsi, RSI+8 + movq_cfi rdx, RDX+8 + movq_cfi rcx, RCX+8 + movq_cfi rax, RAX+8 + movq_cfi r8, R8+8 + movq_cfi r9, R9+8 + movq_cfi r10, R10+8 + movq_cfi r11, R11+8 + movq_cfi rbx, RBX+8 + movq_cfi rbp, RBP+8 + movq_cfi r12, R12+8 + movq_cfi r13, R13+8 + movq_cfi r14, R14+8 + movq_cfi r15, R15+8 + movl $1,%ebx + movl $MSR_GS_BASE,%ecx + rdmsr + testl %edx,%edx + js 1f /* negative -> in kernel */ + SWAPGS + xorl %ebx,%ebx +1: ret + CFI_ENDPROC +END(save_paranoid) +#endif + /* - * A newly forked process directly context switches into this. - */ -/* rdi: prev */ + * A newly forked process directly context switches into this address. + * + * rdi: prev task we switched from + */ ENTRY(ret_from_fork) - CFI_DEFAULT_STACK + DEFAULT_FRAME + push kernel_eflags(%rip) CFI_ADJUST_CFA_OFFSET 8 - popf # reset kernel eflags + popf # reset kernel eflags CFI_ADJUST_CFA_OFFSET -8 - call schedule_tail + + call schedule_tail # rdi: 'prev' task parameter + GET_THREAD_INFO(%rcx) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx) - jnz rff_trace -rff_action: + + CFI_REMEMBER_STATE RESTORE_REST - testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread? + + testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? 
je int_ret_from_sys_call - testl $_TIF_IA32,TI_flags(%rcx) + + testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET jnz int_ret_from_sys_call - RESTORE_TOP_OF_STACK %rdi,ARGOFFSET - jmp ret_from_sys_call -rff_trace: - movq %rsp,%rdi - call syscall_trace_leave - GET_THREAD_INFO(%rcx) - jmp rff_action + + RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET + jmp ret_from_sys_call # go to the SYSRET fastpath + + CFI_RESTORE_STATE CFI_ENDPROC END(ret_from_fork) /* - * initial frame state for interrupts and exceptions - */ - .macro _frame ref - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,SS+8-\ref - /*CFI_REL_OFFSET ss,SS-\ref*/ - CFI_REL_OFFSET rsp,RSP-\ref - /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/ - /*CFI_REL_OFFSET cs,CS-\ref*/ - CFI_REL_OFFSET rip,RIP-\ref - .endm - -/* * System call entry. Up to 6 arguments in registers are supported. * * SYSCALL does not save anything on the stack and does not change the * stack pointer. */ - + /* - * Register setup: + * Register setup: * rax system call number * rdi arg0 - * rcx return address for syscall/sysret, C arg3 + * rcx return address for syscall/sysret, C arg3 * rsi arg1 - * rdx arg2 + * rdx arg2 * r10 arg3 (--> moved to rcx for C) * r8 arg4 * r9 arg5 * r11 eflags for syscall/sysret, temporary for C - * r12-r15,rbp,rbx saved by C code, not touched. - * + * r12-r15,rbp,rbx saved by C code, not touched. + * * Interrupts are enabled on entry. * Only called from user space. * @@ -337,10 +499,10 @@ END(ret_from_fork) * When user can change the frames always force IRET. That is because * it deals with uncanonical addresses better. SYSRET has trouble * with them due to bugs in both AMD and Intel CPUs. - */ + */ ENTRY(system_call) - _frame (RIP-0x10) + INTR_FRAME start=2 offset=2*8 SAVE_ARGS -8,0 movq %rax,ORIG_RAX-ARGOFFSET(%rsp) GET_THREAD_INFO(%rcx) @@ -354,19 +516,19 @@ system_call_fastpath: movq %rax,RAX-ARGOFFSET(%rsp) /* * Syscall return path ending with SYSRET (fast path) - * Has incomplete stack frame and undefined top of stack. - */ + * Has incomplete stack frame and undefined top of stack. + */ ret_from_sys_call: movl $_TIF_ALLWORK_MASK,%edi /* edi: flagmask */ -sysret_check: +sysret_check: LOCKDEP_SYS_EXIT GET_THREAD_INFO(%rcx) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF movl TI_flags(%rcx),%edx andl %edi,%edx - jnz sysret_careful + jnz sysret_careful CFI_REMEMBER_STATE /* * sysretq will re-enable interrupts: @@ -378,7 +540,7 @@ sysret_check: CFI_RESTORE_STATE /* Handle reschedules */ - /* edx: work, edi: workmask */ + /* edx: work, edi: workmask */ sysret_careful: bt $TIF_NEED_RESCHED,%edx jnc sysret_signal @@ -391,7 +553,7 @@ sysret_careful: CFI_ADJUST_CFA_OFFSET -8 jmp sysret_check - /* Handle a signal */ + /* Handle a signal */ sysret_signal: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) @@ -400,17 +562,20 @@ sysret_signal: jc sysret_audit #endif /* edx: work flags (arg3) */ - leaq do_notify_resume(%rip),%rax leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 xorl %esi,%esi # oldset -> arg2 - call ptregscall_common + SAVE_REST + FIXUP_TOP_OF_STACK %r11 + call do_notify_resume + RESTORE_TOP_OF_STACK %r11 + RESTORE_REST movl $_TIF_WORK_MASK,%edi /* Use IRET because user could have changed frame. This works because ptregscall_common has called FIXUP_TOP_OF_STACK.
*/ DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check - + badsys: movq $-ENOSYS,RAX-ARGOFFSET(%rsp) jmp ret_from_sys_call @@ -449,7 +614,7 @@ sysret_audit: #endif /* CONFIG_AUDITSYSCALL */ /* Do syscall tracing */ -tracesys: +tracesys: #ifdef CONFIG_AUDITSYSCALL testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx) jz auditsys @@ -472,8 +637,8 @@ tracesys: call *sys_call_table(,%rax,8) movq %rax,RAX-ARGOFFSET(%rsp) /* Use IRET because user could have changed frame */ - -/* + +/* * Syscall return path ending with IRET. * Has correct top of stack, but partial stack frame. */ @@ -521,18 +686,18 @@ int_very_careful: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) SAVE_REST - /* Check for syscall exit trace */ + /* Check for syscall exit trace */ testl $_TIF_WORK_SYSCALL_EXIT,%edx jz int_signal pushq %rdi CFI_ADJUST_CFA_OFFSET 8 - leaq 8(%rsp),%rdi # &ptregs -> arg1 + leaq 8(%rsp),%rdi # &ptregs -> arg1 call syscall_trace_leave popq %rdi CFI_ADJUST_CFA_OFFSET -8 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi jmp int_restore_rest - + int_signal: testl $_TIF_DO_NOTIFY_MASK,%edx jz 1f @@ -547,22 +712,24 @@ int_restore_rest: jmp int_with_check CFI_ENDPROC END(system_call) - -/* + +/* * Certain special system calls that need to save a complete full stack frame. - */ - + */ .macro PTREGSCALL label,func,arg - .globl \label -\label: - leaq \func(%rip),%rax - leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ - jmp ptregscall_common +ENTRY(\label) + PARTIAL_FRAME 1 8 /* offset 8: return address */ + subq $REST_SKIP, %rsp + CFI_ADJUST_CFA_OFFSET REST_SKIP + call save_rest + DEFAULT_FRAME 0 8 /* offset 8: return address */ + leaq 8(%rsp), \arg /* pt_regs pointer */ + call \func + jmp ptregscall_common + CFI_ENDPROC END(\label) .endm - CFI_STARTPROC - PTREGSCALL stub_clone, sys_clone, %r8 PTREGSCALL stub_fork, sys_fork, %rdi PTREGSCALL stub_vfork, sys_vfork, %rdi @@ -570,25 +737,18 @@ END(\label) PTREGSCALL stub_iopl, sys_iopl, %rsi ENTRY(ptregscall_common) - popq %r11 - CFI_ADJUST_CFA_OFFSET -8 - CFI_REGISTER rip, r11 - SAVE_REST - movq %r11, %r15 - CFI_REGISTER rip, r15 - FIXUP_TOP_OF_STACK %r11 - call *%rax - RESTORE_TOP_OF_STACK %r11 - movq %r15, %r11 - CFI_REGISTER rip, r11 - RESTORE_REST - pushq %r11 - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rip, 0 - ret + DEFAULT_FRAME 1 8 /* offset 8: return address */ + RESTORE_TOP_OF_STACK %r11, 8 + movq_cfi_restore R15+8, r15 + movq_cfi_restore R14+8, r14 + movq_cfi_restore R13+8, r13 + movq_cfi_restore R12+8, r12 + movq_cfi_restore RBP+8, rbp + movq_cfi_restore RBX+8, rbx + ret $REST_SKIP /* pop extended registers */ CFI_ENDPROC END(ptregscall_common) - + ENTRY(stub_execve) CFI_STARTPROC popq %r11 @@ -604,11 +764,11 @@ ENTRY(stub_execve) jmp int_ret_from_sys_call CFI_ENDPROC END(stub_execve) - + /* * sigreturn is special because it needs to restore all registers on return. * This cannot be done with SYSRET, so use the IRET return path instead. - */ + */ ENTRY(stub_rt_sigreturn) CFI_STARTPROC addq $8, %rsp @@ -623,24 +783,12 @@ ENTRY(stub_rt_sigreturn) CFI_ENDPROC END(stub_rt_sigreturn) -/* initial frame state for interrupts (and exceptions without error code) */ -#define INTR_FRAME _frame (RIP-0x10); \ - CFI_REL_OFFSET rcx,0; \ - CFI_REL_OFFSET r11,8 - -/* initial frame state for exceptions with error code (and interrupts with - vector already pushed) */ -#define XCPT_FRAME _frame (RIP-0x18); \ - CFI_REL_OFFSET rcx,0; \ - CFI_REL_OFFSET r11,8 - -/* +/* * Interrupt exit. 
- * */ retint_with_reschedule: - CFI_DEFAULT_STACK adj=1 + PARTIAL_FRAME movl $_TIF_WORK_MASK,%edi retint_check: LOCKDEP_SYS_EXIT_IRQ @@ -669,20 +817,20 @@ retint_careful: pushq %rdi CFI_ADJUST_CFA_OFFSET 8 call schedule - popq %rdi + popq %rdi CFI_ADJUST_CFA_OFFSET -8 GET_THREAD_INFO(%rcx) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp retint_check - + retint_signal: testl $_TIF_DO_NOTIFY_MASK,%edx jz retint_restore_args TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) SAVE_REST - movq $-1,ORIG_RAX(%rsp) + movq $-1,ORIG_RAX(%rsp) xorl %esi,%esi # oldset movq %rsp,%rdi # &pt_regs call do_notify_resume @@ -704,324 +852,132 @@ ENTRY(retint_kernel) jnc retint_restore_args call preempt_schedule_irq jmp retint_kernel /* check again */ -#endif +#endif CFI_ENDPROC END(retint_check) - + #ifndef CONFIG_XEN /* * APIC interrupts. - */ - .macro apicinterrupt num,func + */ +.macro apicinterrupt num sym do_sym +ENTRY(\sym) INTR_FRAME pushq $~(\num) CFI_ADJUST_CFA_OFFSET 8 - interrupt \func + interrupt \do_sym jmp error_entry CFI_ENDPROC - .endm +END(\sym) +.endm -ENTRY(thermal_interrupt) - apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt -END(thermal_interrupt) - -ENTRY(threshold_interrupt) - apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt -END(threshold_interrupt) - -#ifdef CONFIG_SMP -ENTRY(reschedule_interrupt) - apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt -END(reschedule_interrupt) - - .macro INVALIDATE_ENTRY num -ENTRY(invalidate_interrupt\num) - apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt -END(invalidate_interrupt\num) - .endm +#ifdef CONFIG_SMP +apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \ + irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt +#endif + +apicinterrupt UV_BAU_MESSAGE \ + uv_bau_message_intr1 uv_bau_message_interrupt +apicinterrupt LOCAL_TIMER_VECTOR \ + apic_timer_interrupt smp_apic_timer_interrupt + +#ifdef CONFIG_SMP +apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ + invalidate_interrupt0 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+1 \ + invalidate_interrupt1 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+2 \ + invalidate_interrupt2 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+3 \ + invalidate_interrupt3 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+4 \ + invalidate_interrupt4 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+5 \ + invalidate_interrupt5 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+6 \ + invalidate_interrupt6 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \ + invalidate_interrupt7 smp_invalidate_interrupt +#endif - INVALIDATE_ENTRY 0 - INVALIDATE_ENTRY 1 - INVALIDATE_ENTRY 2 - INVALIDATE_ENTRY 3 - INVALIDATE_ENTRY 4 - INVALIDATE_ENTRY 5 - INVALIDATE_ENTRY 6 - INVALIDATE_ENTRY 7 - -ENTRY(call_function_interrupt) - apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt -END(call_function_interrupt) -ENTRY(call_function_single_interrupt) - apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt -END(call_function_single_interrupt) -ENTRY(irq_move_cleanup_interrupt) - apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt -END(irq_move_cleanup_interrupt) +apicinterrupt THRESHOLD_APIC_VECTOR \ + threshold_interrupt mce_threshold_interrupt +apicinterrupt THERMAL_APIC_VECTOR \ + thermal_interrupt smp_thermal_interrupt + +#ifdef CONFIG_SMP +apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ + call_function_single_interrupt 
smp_call_function_single_interrupt +apicinterrupt CALL_FUNCTION_VECTOR \ + call_function_interrupt smp_call_function_interrupt +apicinterrupt RESCHEDULE_VECTOR \ + reschedule_interrupt smp_reschedule_interrupt #endif -ENTRY(apic_timer_interrupt) - apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt -END(apic_timer_interrupt) - -ENTRY(uv_bau_message_intr1) - apicinterrupt 220,uv_bau_message_interrupt -END(uv_bau_message_intr1) - -ENTRY(error_interrupt) - apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt -END(error_interrupt) - -ENTRY(spurious_interrupt) - apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt -END(spurious_interrupt) +apicinterrupt ERROR_APIC_VECTOR \ + error_interrupt smp_error_interrupt +apicinterrupt SPURIOUS_APIC_VECTOR \ + spurious_interrupt smp_spurious_interrupt #endif /* !CONFIG_XEN */ - + /* * Exception entry points. - */ - .macro zeroentry sym + */ +.macro zeroentry sym do_sym +ENTRY(\sym) INTR_FRAME movq (%rsp),%rcx CFI_RESTORE rcx movq 8(%rsp),%r11 CFI_RESTORE r11 - addq $0x10,%rsp /* skip rcx and r11 */ - CFI_ADJUST_CFA_OFFSET -0x10 - pushq $0 /* push error code/oldrax */ - CFI_ADJUST_CFA_OFFSET 8 - pushq %rax /* push real oldrax to the rdi slot */ - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rax,0 - leaq \sym(%rip),%rax - jmp error_entry + movq $-1,8(%rsp) /* ORIG_RAX: no syscall to restart */ + subq $(15-1)*8,%rsp + CFI_ADJUST_CFA_OFFSET (15-1)*8 + call error_entry + DEFAULT_FRAME 0 + movq %rsp,%rdi /* pt_regs pointer */ + xorl %esi,%esi /* no error code */ + call \do_sym + jmp error_exit /* %ebx: no swapgs flag */ CFI_ENDPROC - .endm +END(\sym) +.endm + +.macro paranoidzeroentry sym do_sym + zeroentry \sym \do_sym +.endm + +.macro paranoidzeroentry_ist sym do_sym ist + zeroentry \sym \do_sym +.endm - .macro errorentry sym +.macro errorentry sym do_sym +ENTRY(\sym) XCPT_FRAME movq (%rsp),%rcx CFI_RESTORE rcx movq 8(%rsp),%r11 CFI_RESTORE r11 - addq $0x10,%rsp /* rsp points to the error code */ - CFI_ADJUST_CFA_OFFSET -0x10 - pushq %rax - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rax,0 - leaq \sym(%rip),%rax - jmp error_entry + subq $(15-2)*8,%rsp + CFI_ADJUST_CFA_OFFSET (15-2)*8 + call error_entry + DEFAULT_FRAME 0 + movq %rsp,%rdi /* pt_regs pointer */ + movq ORIG_RAX(%rsp),%rsi /* get error code */ + movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ + call \do_sym + jmp error_exit /* %ebx: no swapgs flag */ CFI_ENDPROC - .endm +END(\sym) +.endm -#if 0 /* not XEN */ /* error code is on the stack already */ - /* handle NMI like exceptions that can happen everywhere */ - .macro paranoidentry sym, ist=0, irqtrace=1 - movq (%rsp),%rcx - movq 8(%rsp),%r11 - addq $0x10,%rsp /* skip rcx and r11 */ - SAVE_ALL - cld -#if 0 /* not XEN */ - movl $1,%ebx - movl $MSR_GS_BASE,%ecx - rdmsr - testl %edx,%edx - js 1f - SWAPGS - xorl %ebx,%ebx -1: -#endif - .if \ist - movq %gs:pda_data_offset, %rbp - .endif - .if \irqtrace - TRACE_IRQS_OFF - .endif - movq %rsp,%rdi - movq ORIG_RAX(%rsp),%rsi - movq $-1,ORIG_RAX(%rsp) - .if \ist - subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) - .endif - call \sym - .if \ist - addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) - .endif - DISABLE_INTERRUPTS(CLBR_NONE) - .if \irqtrace - TRACE_IRQS_OFF - .endif - .endm - - /* - * "Paranoid" exit path from exception stack. - * Paranoid because this is used by NMIs and cannot take - * any kernel state for granted. 
- * We don't do kernel preemption checks here, because only - * NMI should be common and it does not enable IRQs and - * cannot get reschedule ticks. - * - * "trace" is 0 for the NMI handler only, because irq-tracing - * is fundamentally NMI-unsafe. (we cannot change the soft and - * hard flags at once, atomically) - */ - .macro paranoidexit trace=1 - /* ebx: no swapgs flag */ -paranoid_exit\trace: - testl %ebx,%ebx /* swapgs needed? */ - jnz paranoid_restore\trace - testl $3,CS(%rsp) - jnz paranoid_userspace\trace -paranoid_swapgs\trace: - .if \trace - TRACE_IRQS_IRETQ 0 - .endif - SWAPGS_UNSAFE_STACK -paranoid_restore\trace: - RESTORE_ALL 8 - jmp irq_return -paranoid_userspace\trace: - GET_THREAD_INFO(%rcx) - movl TI_flags(%rcx),%ebx - andl $_TIF_WORK_MASK,%ebx - jz paranoid_swapgs\trace - movq %rsp,%rdi /* &pt_regs */ - call sync_regs - movq %rax,%rsp /* switch stack for scheduling */ - testl $_TIF_NEED_RESCHED,%ebx - jnz paranoid_schedule\trace - movl %ebx,%edx /* arg3: thread flags */ - .if \trace - TRACE_IRQS_ON - .endif - ENABLE_INTERRUPTS(CLBR_NONE) - xorl %esi,%esi /* arg2: oldset */ - movq %rsp,%rdi /* arg1: &pt_regs */ - call do_notify_resume - DISABLE_INTERRUPTS(CLBR_NONE) - .if \trace - TRACE_IRQS_OFF - .endif - jmp paranoid_userspace\trace -paranoid_schedule\trace: - .if \trace - TRACE_IRQS_ON - .endif - ENABLE_INTERRUPTS(CLBR_ANY) - call schedule - DISABLE_INTERRUPTS(CLBR_ANY) - .if \trace - TRACE_IRQS_OFF - .endif - jmp paranoid_userspace\trace - CFI_ENDPROC - .endm -#endif +.macro paranoiderrorentry sym do_sym + errorentry \sym \do_sym +.endm /* - * Exception entry point. This expects an error code/orig_rax on the stack - * and the exception handler in %rax. - */ -KPROBE_ENTRY(error_entry) - _frame RDI - CFI_REL_OFFSET rax,0 - /* rdi slot contains rax, oldrax contains error code */ - cld - subq $14*8,%rsp - CFI_ADJUST_CFA_OFFSET (14*8) - movq %rsi,13*8(%rsp) - CFI_REL_OFFSET rsi,RSI - movq 14*8(%rsp),%rsi /* load rax from rdi slot */ - CFI_REGISTER rax,rsi - movq %rdx,12*8(%rsp) - CFI_REL_OFFSET rdx,RDX - movq %rcx,11*8(%rsp) - CFI_REL_OFFSET rcx,RCX - movq %rsi,10*8(%rsp) /* store rax */ - CFI_REL_OFFSET rax,RAX - movq %r8, 9*8(%rsp) - CFI_REL_OFFSET r8,R8 - movq %r9, 8*8(%rsp) - CFI_REL_OFFSET r9,R9 - movq %r10,7*8(%rsp) - CFI_REL_OFFSET r10,R10 - movq %r11,6*8(%rsp) - CFI_REL_OFFSET r11,R11 - movq %rbx,5*8(%rsp) - CFI_REL_OFFSET rbx,RBX - movq %rbp,4*8(%rsp) - CFI_REL_OFFSET rbp,RBP - movq %r12,3*8(%rsp) - CFI_REL_OFFSET r12,R12 - movq %r13,2*8(%rsp) - CFI_REL_OFFSET r13,R13 - movq %r14,1*8(%rsp) - CFI_REL_OFFSET r14,R14 - movq %r15,(%rsp) - CFI_REL_OFFSET r15,R15 -#if 0 - cmpl $__KERNEL_CS,CS(%rsp) - CFI_REMEMBER_STATE - je error_kernelspace -#endif -error_call_handler: - movq %rdi, RDI(%rsp) - CFI_REL_OFFSET rdi,RDI - movq %rsp,%rdi - movq ORIG_RAX(%rsp),%rsi # get error code - movq $-1,ORIG_RAX(%rsp) - call *%rax -error_exit: - RESTORE_REST - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - GET_THREAD_INFO(%rcx) - testb $3,CS-ARGOFFSET(%rsp) - jz retint_kernel - LOCKDEP_SYS_EXIT_IRQ - movl TI_flags(%rcx),%edx - movl $_TIF_WORK_MASK,%edi - andl %edi,%edx - jnz retint_careful - jmp retint_restore_args - -#if 0 - /* - * We need to re-write the logic here because we don't do iretq to - * to return to user mode. It's still possible that we get trap/fault - * in the kernel (when accessing buffers pointed to by system calls, - * for example). 
- * - */ - CFI_RESTORE_STATE -error_kernelspace: - incl %ebx - /* There are two places in the kernel that can potentially fault with - usergs. Handle them here. The exception handlers after - iret run with kernel gs again, so don't set the user space flag. - B stepping K8s sometimes report an truncated RIP for IRET - exceptions returning to compat mode. Check for these here too. */ - leaq irq_return(%rip),%rcx - cmpq %rcx,RIP(%rsp) - je error_swapgs - movl %ecx,%ecx /* zero extend */ - cmpq %rcx,RIP(%rsp) - je error_swapgs - cmpq $gs_change,RIP(%rsp) - je error_swapgs - jmp error_sti -#endif - CFI_ENDPROC -KPROBE_END(error_entry) - -ENTRY(hypervisor_callback) - zeroentry do_hypervisor_callback -END(hypervisor_callback) - -/* * Copied from arch/xen/i386/kernel/entry.S */ # A note on the "critical region" in our callback handler. @@ -1041,7 +997,7 @@ ENTRY(do_hypervisor_callback) # do_hyp # see the correct pointer to the pt_regs movq %rdi, %rsp # we don't return, adjust the stack frame CFI_ENDPROC - CFI_DEFAULT_STACK + DEFAULT_FRAME 11: incl %gs:pda_irqcount movq %rsp,%rbp CFI_DEF_CFA_REGISTER rbp @@ -1057,7 +1013,7 @@ END(do_hypervisor_callback) ALIGN restore_all_enable_events: - CFI_DEFAULT_STACK adj=1 + PARTIAL_FRAME TRACE_IRQS_ON __ENABLE_INTERRUPTS @@ -1093,9 +1049,7 @@ ecrit: /**** END OF CRITICAL REGION *** # We distinguish between categories by comparing each saved segment register # with its current contents: any discrepancy means we in category 1. ENTRY(failsafe_callback) - _frame (RIP-0x30) - CFI_REL_OFFSET rcx, 0 - CFI_REL_OFFSET r11, 8 + INTR_FRAME offset=4*8 movw %ds,%cx cmpw %cx,0x10(%rsp) CFI_REMEMBER_STATE @@ -1131,20 +1085,19 @@ ENTRY(failsafe_callback) SAVE_ALL jmp error_exit CFI_ENDPROC -#if 0 - .section __ex_table,"a" - .align 8 - .quad gs_change,bad_gs - .previous - .section .fixup,"ax" - /* running with kernelgs */ -bad_gs: -/* swapgs */ /* switch back to user gs */ - xorl %eax,%eax - movl %eax,%gs - jmp 2b - .previous -#endif + +zeroentry divide_error do_divide_error +zeroentry overflow do_overflow +zeroentry bounds do_bounds +zeroentry invalid_op do_invalid_op +zeroentry device_not_available do_device_not_available +zeroentry hypervisor_callback do_hypervisor_callback +zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun +errorentry invalid_TSS do_invalid_TSS +errorentry segment_not_present do_segment_not_present +zeroentry coprocessor_error do_coprocessor_error +errorentry alignment_check do_alignment_check +zeroentry simd_coprocessor_error do_simd_coprocessor_error /* * Create a kernel thread. @@ -1168,7 +1121,7 @@ ENTRY(kernel_thread) xorl %r8d,%r8d xorl %r9d,%r9d - + # clone now call do_fork movq %rax,RAX(%rsp) @@ -1179,15 +1132,15 @@ ENTRY(kernel_thread) * so internally to the x86_64 port you can rely on kernel_thread() * not to reschedule the child before returning, this avoids the need * of hacks for example to fork off the per-CPU idle tasks. - * [Hopefully no generic code relies on the reschedule -AK] + * [Hopefully no generic code relies on the reschedule -AK] */ RESTORE_ALL UNFAKE_STACK_FRAME ret CFI_ENDPROC -ENDPROC(kernel_thread) - -child_rip: +END(kernel_thread) + +ENTRY(child_rip) pushq $0 # fake return address CFI_STARTPROC /* @@ -1200,8 +1153,9 @@ child_rip: # exit mov %eax, %edi call do_exit + ud2 # padding for call trace CFI_ENDPROC -ENDPROC(child_rip) +END(child_rip) /* * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. 
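As background for the kernel_execve hunk that follows: a minimal illustrative sketch — not part of the patch, and iretq_frame is a made-up name — of the five-slot frame that IRETQ consumes. SYSRET restores only %rip (from %rcx) and %rflags (from %r11), taking %cs/%ss from the STAR MSR, which is why a freshly exec'd image, whose entry point, stack and flags all belong to the new program, has to leave the kernel through the IRET path:

/* Layout popped by IRETQ in long mode, lowest address (top of stack) first. */
struct iretq_frame {
	unsigned long rip;	/* entry point of the new program */
	unsigned long cs;	/* user code segment selector */
	unsigned long rflags;	/* flags for the new image */
	unsigned long rsp;	/* new user stack pointer */
	unsigned long ss;	/* user stack segment selector */
};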
@@ -1221,10 +1175,10 @@ ENDPROC(child_rip) ENTRY(kernel_execve) CFI_STARTPROC FAKE_STACK_FRAME $0 - SAVE_ALL + SAVE_ALL movq %rsp,%rcx call sys_execve - movq %rax, RAX(%rsp) + movq %rax, RAX(%rsp) RESTORE_REST testq %rax,%rax jne 1f @@ -1233,132 +1187,7 @@ ENTRY(kernel_execve) UNFAKE_STACK_FRAME ret CFI_ENDPROC -ENDPROC(kernel_execve) - -KPROBE_ENTRY(page_fault) - errorentry do_page_fault -KPROBE_END(page_fault) - -ENTRY(coprocessor_error) - zeroentry do_coprocessor_error -END(coprocessor_error) - -ENTRY(simd_coprocessor_error) - zeroentry do_simd_coprocessor_error -END(simd_coprocessor_error) - -ENTRY(device_not_available) - zeroentry do_device_not_available -END(device_not_available) - - /* runs on exception stack */ -KPROBE_ENTRY(debug) -/* INTR_FRAME - PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq $0 - CFI_ADJUST_CFA_OFFSET 8 */ - zeroentry do_debug -/* paranoidexit - CFI_ENDPROC */ -KPROBE_END(debug) - -KPROBE_ENTRY(nmi) - zeroentry do_nmi_callback -KPROBE_END(nmi) -do_nmi_callback: - CFI_STARTPROC - addq $8, %rsp - CFI_ENDPROC - CFI_DEFAULT_STACK - call do_nmi - orl $NMI_MASK,EFLAGS(%rsp) - RESTORE_REST - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - GET_THREAD_INFO(%rcx) - jmp retint_restore_args - CFI_ENDPROC -END(do_nmi_callback) - -KPROBE_ENTRY(int3) -/* INTR_FRAME - PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq $0 - CFI_ADJUST_CFA_OFFSET 8 */ - zeroentry do_int3 -/* jmp paranoid_exit1 - CFI_ENDPROC */ -KPROBE_END(int3) - -ENTRY(overflow) - zeroentry do_overflow -END(overflow) - -ENTRY(bounds) - zeroentry do_bounds -END(bounds) - -ENTRY(invalid_op) - zeroentry do_invalid_op -END(invalid_op) - -ENTRY(coprocessor_segment_overrun) - zeroentry do_coprocessor_segment_overrun -END(coprocessor_segment_overrun) - -#if 0 - /* runs on exception stack */ -ENTRY(double_fault) - XCPT_FRAME - PARAVIRT_ADJUST_EXCEPTION_FRAME - paranoidentry do_double_fault - jmp paranoid_exit1 - CFI_ENDPROC -END(double_fault) -#endif - -ENTRY(invalid_TSS) - errorentry do_invalid_TSS -END(invalid_TSS) - -ENTRY(segment_not_present) - errorentry do_segment_not_present -END(segment_not_present) - - /* runs on exception stack */ -ENTRY(stack_segment) -/* XCPT_FRAME - PARAVIRT_ADJUST_EXCEPTION_FRAME - paranoidentry do_stack_segment */ - errorentry do_stack_segment -/* jmp paranoid_exit1 - CFI_ENDPROC */ -END(stack_segment) - -KPROBE_ENTRY(general_protection) - errorentry do_general_protection -KPROBE_END(general_protection) - -ENTRY(alignment_check) - errorentry do_alignment_check -END(alignment_check) - -ENTRY(divide_error) - zeroentry do_divide_error -END(divide_error) - -#ifndef CONFIG_XEN -ENTRY(spurious_interrupt_bug) - zeroentry do_spurious_interrupt_bug -END(spurious_interrupt_bug) -#endif - -#ifdef CONFIG_X86_MCE - /* runs on exception stack */ -KPROBE_ENTRY(machine_check) - zeroentry do_machine_check -END(machine_check) -#endif +END(kernel_execve) /* Call softirq on interrupt stack. Interrupts are off. 
*/ ENTRY(call_softirq) @@ -1378,24 +1207,191 @@ ENTRY(call_softirq) decl %gs:pda_irqcount ret CFI_ENDPROC -ENDPROC(call_softirq) +END(call_softirq) + +/* + * Some functions should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" + +paranoidzeroentry_ist debug do_debug DEBUG_STACK +zeroentry nmi do_nmi_callback +paranoidzeroentry_ist int3 do_int3 DEBUG_STACK +paranoiderrorentry stack_segment do_stack_segment +errorentry general_protection do_general_protection +errorentry page_fault do_page_fault +#ifdef CONFIG_X86_MCE +paranoidzeroentry machine_check do_machine_check +#endif + +#ifndef CONFIG_XEN + /* + * "Paranoid" exit path from exception stack. + * Paranoid because this is used by NMIs and cannot take + * any kernel state for granted. + * We don't do kernel preemption checks here, because only + * NMI should be common and it does not enable IRQs and + * cannot get reschedule ticks. + * + * "trace" is 0 for the NMI handler only, because irq-tracing + * is fundamentally NMI-unsafe. (we cannot change the soft and + * hard flags at once, atomically) + */ + + /* ebx: no swapgs flag */ +ENTRY(paranoid_exit) + INTR_FRAME + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl %ebx,%ebx /* swapgs needed? */ + jnz paranoid_restore + testl $3,CS(%rsp) + jnz paranoid_userspace +paranoid_swapgs: + TRACE_IRQS_IRETQ 0 + SWAPGS_UNSAFE_STACK +paranoid_restore: + RESTORE_ALL 8 + jmp irq_return +paranoid_userspace: + GET_THREAD_INFO(%rcx) + movl TI_flags(%rcx),%ebx + andl $_TIF_WORK_MASK,%ebx + jz paranoid_swapgs + movq %rsp,%rdi /* &pt_regs */ + call sync_regs + movq %rax,%rsp /* switch stack for scheduling */ + testl $_TIF_NEED_RESCHED,%ebx + jnz paranoid_schedule + movl %ebx,%edx /* arg3: thread flags */ + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + xorl %esi,%esi /* arg2: oldset */ + movq %rsp,%rdi /* arg1: &pt_regs */ + call do_notify_resume + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + jmp paranoid_userspace +paranoid_schedule: + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) + call schedule + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + jmp paranoid_userspace + CFI_ENDPROC +END(paranoid_exit) +#endif + +/* + * Exception entry point. This expects an error code/orig_rax on the stack. + * returns in "no swapgs flag" in %ebx. + */ +ENTRY(error_entry) + XCPT_FRAME 2 + CFI_ADJUST_CFA_OFFSET 15*8 + /* oldrax contains error code */ + cld + movq_cfi rdi, RDI+8 + movq_cfi rsi, RSI+8 + movq_cfi rdx, RDX+8 + movq_cfi rcx, RCX+8 + movq_cfi rax, RAX+8 + movq_cfi r8, R8+8 + movq_cfi r9, R9+8 + movq_cfi r10, R10+8 + movq_cfi r11, R11+8 + movq_cfi rbx, RBX+8 + movq_cfi rbp, RBP+8 + movq_cfi r12, R12+8 + movq_cfi r13, R13+8 + movq_cfi r14, R14+8 + movq_cfi r15, R15+8 +#ifndef CONFIG_XEN + xorl %ebx,%ebx + testl $3,CS+8(%rsp) + je error_kernelspace +error_swapgs: + SWAPGS +error_sti: +#endif + TRACE_IRQS_OFF + ret + CFI_ENDPROC + +#ifndef CONFIG_XEN +/* + * There are two places in the kernel that can potentially fault with + * usergs. Handle them here. The exception handlers after iret run with + * kernel gs again, so don't set the user space flag. B stepping K8s + * sometimes report a truncated RIP for IRET exceptions returning to + * compat mode. Check for these here too.
+ */ +error_kernelspace: + incl %ebx + leaq irq_return(%rip),%rcx + cmpq %rcx,RIP+8(%rsp) + je error_swapgs + movl %ecx,%ecx /* zero extend */ + cmpq %rcx,RIP+8(%rsp) + je error_swapgs + cmpq $gs_change,RIP+8(%rsp) + je error_swapgs + jmp error_sti +#endif +END(error_entry) + + +ENTRY(error_exit) + DEFAULT_FRAME + RESTORE_REST + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + GET_THREAD_INFO(%rcx) + testb $3,CS-ARGOFFSET(%rsp) + jz retint_kernel + LOCKDEP_SYS_EXIT_IRQ + movl TI_flags(%rcx),%edx + movl $_TIF_WORK_MASK,%edi + andl %edi,%edx + jnz retint_careful + jmp retint_restore_args + CFI_ENDPROC +END(error_exit) + + +do_nmi_callback: + CFI_STARTPROC + addq $8, %rsp + CFI_ENDPROC + DEFAULT_FRAME + call do_nmi + orl $NMI_MASK,EFLAGS(%rsp) + RESTORE_REST + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + GET_THREAD_INFO(%rcx) + jmp retint_restore_args + CFI_ENDPROC +END(do_nmi_callback) + #ifndef CONFIG_IA32_EMULATION -KPROBE_ENTRY(ignore_sysret) - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,SS+8-RIP+16 -/* CFI_REL_OFFSET ss,SS-RIP+16 */ - CFI_REL_OFFSET rsp,RSP-RIP+16 -/* CFI_REL_OFFSET rflags,EFLAGS-RIP+16 */ -/* CFI_REL_OFFSET cs,CS-RIP+16 */ - CFI_REL_OFFSET rip,RIP-RIP+16 +ENTRY(ignore_sysret) + INTR_FRAME popq %rcx CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rcx popq %r11 CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE r11 mov $-ENOSYS,%eax HYPERVISOR_IRET 0 CFI_ENDPROC -ENDPROC(ignore_sysret) +END(ignore_sysret) #endif + +/* + * End of kprobes section + */ + .popsection --- head-2010-04-29.orig/arch/x86/kernel/head-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/head-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -36,7 +36,6 @@ void __init reserve_ebda_region(void) /* start of EBDA area */ ebda_addr = get_bios_ebda(); - printk(KERN_INFO "BIOS EBDA/lowmem at: %08x/%08x\n", ebda_addr, lowmem); /* Fixup: bios puts an EBDA in the top 64K segment */ /* of conventional memory, but does not adjust lowmem. */ --- head-2010-04-29.orig/arch/x86/kernel/head32-xen.c 2010-03-24 15:12:46.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/head32-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -12,9 +12,12 @@ #include #include #include +#include void __init i386_start_kernel(void) { + reserve_trampoline_memory(); + reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); #ifndef CONFIG_XEN --- head-2010-04-29.orig/arch/x86/kernel/head64-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/head64-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -31,9 +31,10 @@ #include #include #include +#include /* boot cpu pda */ -static struct x8664_pda _boot_cpu_pda __read_mostly; +static struct x8664_pda _boot_cpu_pda; #ifdef CONFIG_SMP /* @@ -163,6 +164,8 @@ void __init x86_64_start_reservations(ch { copy_bootdata(__va(real_mode_data)); + reserve_trampoline_memory(); + reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE), --- head-2010-04-29.orig/arch/x86/kernel/apic/io_apic-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/apic/io_apic-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -122,102 +122,276 @@ static int __init parse_noapic(char *str } early_param("noapic", parse_noapic); +#ifndef CONFIG_XEN struct irq_pin_list; + +/* + * This is performance-critical, we want to do it O(1) + * + * the indexing order of this array favors 1:1 mappings + * between pins and IRQs. 
+ */ + +struct irq_pin_list { + int apic, pin; + struct irq_pin_list *next; +}; + +static struct irq_pin_list *get_one_free_irq_2_pin(int cpu) +{ + struct irq_pin_list *pin; + int node; + + node = cpu_to_node(cpu); + + pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node); + + return pin; +} + struct irq_cfg { -#ifndef CONFIG_XEN - unsigned int irq; struct irq_pin_list *irq_2_pin; - cpumask_t domain; - cpumask_t old_domain; + cpumask_var_t domain; + cpumask_var_t old_domain; unsigned move_cleanup_count; -#endif u8 vector; -#ifndef CONFIG_XEN u8 move_in_progress : 1; +#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC + u8 move_desc_pending : 1; #endif }; /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ +#ifdef CONFIG_SPARSE_IRQ +static struct irq_cfg irq_cfgx[] = { +#else static struct irq_cfg irq_cfgx[NR_IRQS] = { - [0] = { .irq = 0 }, - [1] = { .irq = 1 }, - [2] = { .irq = 2 }, - [3] = { .irq = 3 }, - [4] = { .irq = 4 }, - [5] = { .irq = 5 }, - [6] = { .irq = 6 }, - [7] = { .irq = 7 }, - [8] = { .irq = 8 }, - [9] = { .irq = 9 }, - [10] = { .irq = 10 }, - [11] = { .irq = 11 }, - [12] = { .irq = 12 }, - [13] = { .irq = 13 }, - [14] = { .irq = 14 }, - [15] = { .irq = 15 }, +#endif + [0] = { .vector = IRQ0_VECTOR, }, + [1] = { .vector = IRQ1_VECTOR, }, + [2] = { .vector = IRQ2_VECTOR, }, + [3] = { .vector = IRQ3_VECTOR, }, + [4] = { .vector = IRQ4_VECTOR, }, + [5] = { .vector = IRQ5_VECTOR, }, + [6] = { .vector = IRQ6_VECTOR, }, + [7] = { .vector = IRQ7_VECTOR, }, + [8] = { .vector = IRQ8_VECTOR, }, + [9] = { .vector = IRQ9_VECTOR, }, + [10] = { .vector = IRQ10_VECTOR, }, + [11] = { .vector = IRQ11_VECTOR, }, + [12] = { .vector = IRQ12_VECTOR, }, + [13] = { .vector = IRQ13_VECTOR, }, + [14] = { .vector = IRQ14_VECTOR, }, + [15] = { .vector = IRQ15_VECTOR, }, }; -#define for_each_irq_cfg(irq, cfg) \ - for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++) +int __init arch_early_irq_init(void) +{ + struct irq_cfg *cfg; + struct irq_desc *desc; + int count; + int i; + + cfg = irq_cfgx; + count = ARRAY_SIZE(irq_cfgx); + for (i = 0; i < count; i++) { + desc = irq_to_desc(i); + desc->chip_data = &cfg[i]; + alloc_bootmem_cpumask_var(&cfg[i].domain); + alloc_bootmem_cpumask_var(&cfg[i].old_domain); + if (i < NR_IRQS_LEGACY) + cpumask_setall(cfg[i].domain); + } + + return 0; +} + +#ifdef CONFIG_SPARSE_IRQ static struct irq_cfg *irq_cfg(unsigned int irq) { - return irq < nr_irqs ? irq_cfgx + irq : NULL; + struct irq_cfg *cfg = NULL; + struct irq_desc *desc; + + desc = irq_to_desc(irq); + if (desc) + cfg = desc->chip_data; + + return cfg; } -static struct irq_cfg *irq_cfg_alloc(unsigned int irq) +static struct irq_cfg *get_one_free_irq_cfg(int cpu) { - return irq_cfg(irq); + struct irq_cfg *cfg; + int node; + + node = cpu_to_node(cpu); + + cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); + if (cfg) { + if (!alloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) { + kfree(cfg); + cfg = NULL; + } else if (!alloc_cpumask_var_node(&cfg->old_domain, + GFP_ATOMIC, node)) { + free_cpumask_var(cfg->domain); + kfree(cfg); + cfg = NULL; + } else { + cpumask_clear(cfg->domain); + cpumask_clear(cfg->old_domain); + } + } + + return cfg; } -#ifdef CONFIG_XEN -#define irq_2_pin_init() -#define add_pin_to_irq(irq, apic, pin) -#else -/* - * Rough estimation of how many shared IRQs there are, can be changed - * anytime. 
- */ -#define MAX_PLUS_SHARED_IRQS NR_IRQS -#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) +int arch_init_chip_data(struct irq_desc *desc, int cpu) +{ + struct irq_cfg *cfg; -/* - * This is performance-critical, we want to do it O(1) - * - * the indexing order of this array favors 1:1 mappings - * between pins and IRQs. - */ + cfg = desc->chip_data; + if (!cfg) { + desc->chip_data = get_one_free_irq_cfg(cpu); + if (!desc->chip_data) { + printk(KERN_ERR "can not alloc irq_cfg\n"); + BUG_ON(1); + } + } -struct irq_pin_list { - int apic, pin; - struct irq_pin_list *next; -}; + return 0; +} -static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE]; -static struct irq_pin_list *irq_2_pin_ptr; +#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC -static void __init irq_2_pin_init(void) +static void +init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) { - struct irq_pin_list *pin = irq_2_pin_head; - int i; + struct irq_pin_list *old_entry, *head, *tail, *entry; + + cfg->irq_2_pin = NULL; + old_entry = old_cfg->irq_2_pin; + if (!old_entry) + return; + + entry = get_one_free_irq_2_pin(cpu); + if (!entry) + return; + + entry->apic = old_entry->apic; + entry->pin = old_entry->pin; + head = entry; + tail = entry; + old_entry = old_entry->next; + while (old_entry) { + entry = get_one_free_irq_2_pin(cpu); + if (!entry) { + entry = head; + while (entry) { + head = entry->next; + kfree(entry); + entry = head; + } + /* still use the old one */ + return; + } + entry->apic = old_entry->apic; + entry->pin = old_entry->pin; + tail->next = entry; + tail = entry; + old_entry = old_entry->next; + } - for (i = 1; i < PIN_MAP_SIZE; i++) - pin[i-1].next = &pin[i]; + tail->next = NULL; + cfg->irq_2_pin = head; +} + +static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg) +{ + struct irq_pin_list *entry, *next; + + if (old_cfg->irq_2_pin == cfg->irq_2_pin) + return; + + entry = old_cfg->irq_2_pin; - irq_2_pin_ptr = &pin[0]; + while (entry) { + next = entry->next; + kfree(entry); + entry = next; + } + old_cfg->irq_2_pin = NULL; } -static struct irq_pin_list *get_one_free_irq_2_pin(void) +void arch_init_copy_chip_data(struct irq_desc *old_desc, + struct irq_desc *desc, int cpu) { - struct irq_pin_list *pin = irq_2_pin_ptr; + struct irq_cfg *cfg; + struct irq_cfg *old_cfg; - if (!pin) - panic("can not get more irq_2_pin\n"); + cfg = get_one_free_irq_cfg(cpu); - irq_2_pin_ptr = pin->next; - pin->next = NULL; - return pin; + if (!cfg) + return; + + desc->chip_data = cfg; + + old_cfg = old_desc->chip_data; + + memcpy(cfg, old_cfg, sizeof(struct irq_cfg)); + + init_copy_irq_2_pin(old_cfg, cfg, cpu); +} + +static void free_irq_cfg(struct irq_cfg *old_cfg) +{ + kfree(old_cfg); +} + +void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) +{ + struct irq_cfg *old_cfg, *cfg; + + old_cfg = old_desc->chip_data; + cfg = desc->chip_data; + + if (old_cfg == cfg) + return; + + if (old_cfg) { + free_irq_2_pin(old_cfg, cfg); + free_irq_cfg(old_cfg); + old_desc->chip_data = NULL; + } +} + +static void +set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) +{ + struct irq_cfg *cfg = desc->chip_data; + + if (!cfg->move_in_progress) { + /* it means that domain is not changed */ + if (!cpumask_intersects(&desc->affinity, mask)) + cfg->move_desc_pending = 1; + } +} +#endif + +#else +static struct irq_cfg *irq_cfg(unsigned int irq) +{ + return irq < nr_irqs ? 
irq_cfgx + irq : NULL; +} + +#endif + +#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC +static inline void +set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) +{ } +#endif struct io_apic { unsigned int index; @@ -230,7 +404,7 @@ static __attribute_const__ struct io_api return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK); } -#endif +#endif /* CONFIG_XEN */ static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) { @@ -285,11 +459,10 @@ static inline void io_apic_modify(unsign writel(value, &io_apic->data); } -static bool io_apic_level_ack_pending(unsigned int irq) +static bool io_apic_level_ack_pending(struct irq_cfg *cfg) { struct irq_pin_list *entry; unsigned long flags; - struct irq_cfg *cfg = irq_cfg(irq); spin_lock_irqsave(&ioapic_lock, flags); entry = cfg->irq_2_pin; @@ -375,13 +548,32 @@ static void ioapic_mask_entry(int apic, } #ifdef CONFIG_SMP -static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) +static void send_cleanup_vector(struct irq_cfg *cfg) +{ + cpumask_var_t cleanup_mask; + + if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { + unsigned int i; + cfg->move_cleanup_count = 0; + for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + cfg->move_cleanup_count++; + for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); + } else { + cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); + cfg->move_cleanup_count = cpumask_weight(cleanup_mask); + send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + free_cpumask_var(cleanup_mask); + } + cfg->move_in_progress = 0; +} + +static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) { int apic, pin; - struct irq_cfg *cfg; struct irq_pin_list *entry; + u8 vector = cfg->vector; - cfg = irq_cfg(irq); entry = cfg->irq_2_pin; for (;;) { unsigned int reg; @@ -411,36 +603,61 @@ static void __target_IO_APIC_irq(unsigne } } -static int assign_irq_vector(int irq, cpumask_t mask); +static int +assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); + +/* + * Either sets desc->affinity to a valid value, and returns cpu_mask_to_apicid + * of that, or returns BAD_APICID and leaves desc->affinity untouched. + */ +static unsigned int +set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) +{ + struct irq_cfg *cfg; + unsigned int irq; + + if (!cpumask_intersects(mask, cpu_online_mask)) + return BAD_APICID; + + irq = desc->irq; + cfg = desc->chip_data; + if (assign_irq_vector(irq, cfg, mask)) + return BAD_APICID; + + cpumask_and(&desc->affinity, cfg->domain, mask); + set_extra_move_desc(desc, mask); + return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask); +} -static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) +static void +set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; unsigned long flags; unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; + unsigned int irq; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; + irq = desc->irq; + cfg = desc->chip_data; - cfg = irq_cfg(irq); - if (assign_irq_vector(irq, mask)) - return; + spin_lock_irqsave(&ioapic_lock, flags); + dest = set_desc_affinity(desc, mask); + if (dest != BAD_APICID) { + /* Only the high 8 bits are valid. 
*/ + dest = SET_APIC_LOGICAL_ID(dest); + __target_IO_APIC_irq(irq, dest, cfg); + } + spin_unlock_irqrestore(&ioapic_lock, flags); +} - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - /* - * Only the high 8 bits are valid. - */ - dest = SET_APIC_LOGICAL_ID(dest); +static void +set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) +{ + struct irq_desc *desc; desc = irq_to_desc(irq); - spin_lock_irqsave(&ioapic_lock, flags); - __target_IO_APIC_irq(irq, dest, cfg->vector); - desc->affinity = mask; - spin_unlock_irqrestore(&ioapic_lock, flags); + + set_ioapic_affinity_irq_desc(desc, mask); } #endif /* CONFIG_SMP */ @@ -449,16 +666,18 @@ static void set_ioapic_affinity_irq(unsi * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. */ -static void add_pin_to_irq(unsigned int irq, int apic, int pin) +static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) { - struct irq_cfg *cfg; struct irq_pin_list *entry; - /* first time to refer irq_cfg, so with new */ - cfg = irq_cfg_alloc(irq); entry = cfg->irq_2_pin; if (!entry) { - entry = get_one_free_irq_2_pin(); + entry = get_one_free_irq_2_pin(cpu); + if (!entry) { + printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n", + apic, pin); + return; + } cfg->irq_2_pin = entry; entry->apic = apic; entry->pin = pin; @@ -473,7 +692,7 @@ static void add_pin_to_irq(unsigned int entry = entry->next; } - entry->next = get_one_free_irq_2_pin(); + entry->next = get_one_free_irq_2_pin(cpu); entry = entry->next; entry->apic = apic; entry->pin = pin; @@ -482,11 +701,10 @@ static void add_pin_to_irq(unsigned int /* * Reroute an IRQ to a different pin. */ -static void __init replace_pin_at_irq(unsigned int irq, +static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu, int oldapic, int oldpin, int newapic, int newpin) { - struct irq_cfg *cfg = irq_cfg(irq); struct irq_pin_list *entry = cfg->irq_2_pin; int replaced = 0; @@ -503,18 +721,16 @@ static void __init replace_pin_at_irq(un /* why? call replace before add? 
*/ if (!replaced) - add_pin_to_irq(irq, newapic, newpin); + add_pin_to_irq_cpu(cfg, cpu, newapic, newpin); } -static inline void io_apic_modify_irq(unsigned int irq, +static inline void io_apic_modify_irq(struct irq_cfg *cfg, int mask_and, int mask_or, void (*final)(struct irq_pin_list *entry)) { int pin; - struct irq_cfg *cfg; struct irq_pin_list *entry; - cfg = irq_cfg(irq); for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { unsigned int reg; pin = entry->pin; @@ -527,13 +743,13 @@ static inline void io_apic_modify_irq(un } } -static void __unmask_IO_APIC_irq(unsigned int irq) +static void __unmask_IO_APIC_irq(struct irq_cfg *cfg) { - io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL); + io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL); } #ifdef CONFIG_X86_64 -void io_apic_sync(struct irq_pin_list *entry) +static void io_apic_sync(struct irq_pin_list *entry) { /* * Synchronize the IO-APIC and the CPU by doing @@ -544,47 +760,64 @@ void io_apic_sync(struct irq_pin_list *e readl(&io_apic->data); } -static void __mask_IO_APIC_irq(unsigned int irq) +static void __mask_IO_APIC_irq(struct irq_cfg *cfg) { - io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); + io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); } #else /* CONFIG_X86_32 */ -static void __mask_IO_APIC_irq(unsigned int irq) +static void __mask_IO_APIC_irq(struct irq_cfg *cfg) { - io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL); + io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL); } -static void __mask_and_edge_IO_APIC_irq(unsigned int irq) +static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg) { - io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER, + io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER, IO_APIC_REDIR_MASKED, NULL); } -static void __unmask_and_level_IO_APIC_irq(unsigned int irq) +static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg) { - io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, + io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, IO_APIC_REDIR_LEVEL_TRIGGER, NULL); } #endif /* CONFIG_X86_32 */ -static void mask_IO_APIC_irq (unsigned int irq) +static void mask_IO_APIC_irq_desc(struct irq_desc *desc) { + struct irq_cfg *cfg = desc->chip_data; unsigned long flags; + BUG_ON(!cfg); + spin_lock_irqsave(&ioapic_lock, flags); - __mask_IO_APIC_irq(irq); + __mask_IO_APIC_irq(cfg); spin_unlock_irqrestore(&ioapic_lock, flags); } -static void unmask_IO_APIC_irq (unsigned int irq) +static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) { + struct irq_cfg *cfg = desc->chip_data; unsigned long flags; spin_lock_irqsave(&ioapic_lock, flags); - __unmask_IO_APIC_irq(irq); + __unmask_IO_APIC_irq(cfg); spin_unlock_irqrestore(&ioapic_lock, flags); } +static void mask_IO_APIC_irq(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + mask_IO_APIC_irq_desc(desc); +} +static void unmask_IO_APIC_irq(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + unmask_IO_APIC_irq_desc(desc); +} + static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) { struct IO_APIC_route_entry entry; @@ -624,6 +857,8 @@ void send_IPI_self(int vector) apic_write(APIC_ICR, cfg); } #endif /* !CONFIG_SMP && CONFIG_X86_32*/ +#else +#define add_pin_to_irq_cpu(cfg, cpu, apic, pin) #endif /* CONFIG_XEN */ #ifdef CONFIG_X86_32 @@ -864,7 +1099,7 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector */ static int EISA_ELCR(unsigned int irq) { - if (irq < 16) { + if (irq < NR_IRQS_LEGACY) { unsigned int port = 0x4d0 + (irq >> 3); return (inb(port) >> 
(irq & 7)) & 1; } @@ -1089,52 +1324,118 @@ void unlock_vector_lock(void) { spin_unlock(&vector_lock); } -#endif -static int assign_irq_vector(int irq, cpumask_t mask) +static int +__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) { - struct physdev_irq irq_op; - struct irq_cfg *cfg; - - if (irq < PIRQ_BASE || irq - PIRQ_BASE >= NR_PIRQS) - return -EINVAL; + /* + * NOTE! The local APIC isn't very good at handling + * multiple interrupts at the same interrupt level. + * As the interrupt level is determined by taking the + * vector number and shifting that right by 4, we + * want to spread these out a bit so that they don't + * all fall in the same interrupt level. + * + * Also, we've got to be careful not to trash gate + * 0x80, because int 0x80 is hm, kind of importantish. ;) + */ + static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; + unsigned int old_vector; + int cpu, err; + cpumask_var_t tmp_mask; + + if ((cfg->move_in_progress) || cfg->move_cleanup_count) + return -EBUSY; + + if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) + return -ENOMEM; + + old_vector = cfg->vector; + if (old_vector) { + cpumask_and(tmp_mask, mask, cpu_online_mask); + cpumask_and(tmp_mask, cfg->domain, tmp_mask); + if (!cpumask_empty(tmp_mask)) { + free_cpumask_var(tmp_mask); + return 0; + } + } - cfg = irq_cfg(irq); + /* Only try and allocate irqs on cpus that are present */ + err = -ENOSPC; + for_each_cpu_and(cpu, mask, cpu_online_mask) { + int new_cpu; + int vector, offset; + + vector_allocation_domain(cpu, tmp_mask); + + vector = current_vector; + offset = current_offset; +next: + vector += 8; + if (vector >= first_system_vector) { + /* If out of vectors on large boxen, must share them. */ + offset = (offset + 1) % 8; + vector = FIRST_DEVICE_VECTOR + offset; + } + if (unlikely(current_vector == vector)) + continue; - if (cfg->vector) - return 0; + if (test_bit(vector, used_vectors)) + goto next; - irq_op.irq = irq; - if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) - return -ENOSPC; +#ifdef CONFIG_KDB + if (vector == KDBENTER_VECTOR) + goto next; +#endif /* CONFIG_KDB */ + for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) + if (per_cpu(vector_irq, new_cpu)[vector] != -1) + goto next; + /* Found one! 
*/ + current_vector = vector; + current_offset = offset; + if (old_vector) { + cfg->move_in_progress = 1; + cpumask_copy(cfg->old_domain, cfg->domain); + } + for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) + per_cpu(vector_irq, new_cpu)[vector] = irq; + cfg->vector = vector; + cpumask_copy(cfg->domain, tmp_mask); + err = 0; + break; + } + free_cpumask_var(tmp_mask); + return err; +} - cfg->vector = irq_op.vector; +static int +assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) +{ + int err; + unsigned long flags; - return 0; + spin_lock_irqsave(&vector_lock, flags); + err = __assign_irq_vector(irq, cfg, mask); + spin_unlock_irqrestore(&vector_lock, flags); + return err; } -#ifndef CONFIG_XEN -static void __clear_irq_vector(int irq) +static void __clear_irq_vector(int irq, struct irq_cfg *cfg) { - struct irq_cfg *cfg; - cpumask_t mask; int cpu, vector; - cfg = irq_cfg(irq); BUG_ON(!cfg->vector); vector = cfg->vector; - cpus_and(mask, cfg->domain, cpu_online_map); - for_each_cpu_mask_nr(cpu, mask) + for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) per_cpu(vector_irq, cpu)[vector] = -1; cfg->vector = 0; - cpus_clear(cfg->domain); + cpumask_clear(cfg->domain); if (likely(!cfg->move_in_progress)) return; - cpus_and(mask, cfg->old_domain, cpu_online_map); - for_each_cpu_mask_nr(cpu, mask) { + for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { if (per_cpu(vector_irq, cpu)[vector] != irq) @@ -1152,10 +1453,12 @@ void __setup_vector_irq(int cpu) /* This function must be called with vector_lock held */ int irq, vector; struct irq_cfg *cfg; + struct irq_desc *desc; /* Mark the inuse vectors */ - for_each_irq_cfg(irq, cfg) { - if (!cpu_isset(cpu, cfg->domain)) + for_each_irq_desc(irq, desc) { + cfg = desc->chip_data; + if (!cpumask_test_cpu(cpu, cfg->domain)) continue; vector = cfg->vector; per_cpu(vector_irq, cpu)[vector] = irq; @@ -1167,7 +1470,7 @@ void __setup_vector_irq(int cpu) continue; cfg = irq_cfg(irq); - if (!cpu_isset(cpu, cfg->domain)) + if (!cpumask_test_cpu(cpu, cfg->domain)) per_cpu(vector_irq, cpu)[vector] = -1; } } @@ -1205,11 +1508,8 @@ static inline int IO_APIC_irq_trigger(in } #endif -static void ioapic_register_intr(int irq, unsigned long trigger) +static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger) { - struct irq_desc *desc; - - desc = irq_to_desc(irq); if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || trigger == IOAPIC_LEVEL) @@ -1240,8 +1540,8 @@ static void ioapic_register_intr(int irq handle_edge_irq, "edge"); } #else /* !CONFIG_XEN */ -#define __clear_irq_vector(irq) ((void)(irq)) -#define ioapic_register_intr(irq, trigger) evtchn_register_pirq(irq) +#define __clear_irq_vector(irq, cfg) ((void)0) +#define ioapic_register_intr(irq, desc, trigger) evtchn_register_pirq(irq) #endif static int setup_ioapic_entry(int apic, int irq, @@ -1305,24 +1605,25 @@ static int setup_ioapic_entry(int apic, return 0; } -static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, +static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_desc *desc, int trigger, int polarity) { struct irq_cfg *cfg; struct IO_APIC_route_entry entry; - cpumask_t mask; + unsigned int dest; if (!IO_APIC_IRQ(irq)) return; - cfg = irq_cfg(irq); + cfg = desc->chip_data; - mask = TARGET_CPUS; - if (assign_irq_vector(irq, mask)) + if (assign_irq_vector(irq, cfg, TARGET_CPUS)) return; #ifndef CONFIG_XEN - cpus_and(mask, cfg->domain, mask); + 
dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); +#else + dest = cpu_mask_to_apicid(TARGET_CPUS); #endif apic_printk(APIC_VERBOSE,KERN_DEBUG @@ -1333,16 +1634,15 @@ static void setup_IO_APIC_irq(int apic, if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry, - cpu_mask_to_apicid(mask), trigger, polarity, - cfg->vector)) { + dest, trigger, polarity, cfg->vector)) { printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", mp_ioapics[apic].mp_apicid, pin); - __clear_irq_vector(irq); + __clear_irq_vector(irq, cfg); return; } - ioapic_register_intr(irq, trigger); - if (irq < 16) + ioapic_register_intr(irq, desc, trigger); + if (irq < NR_IRQS_LEGACY) disable_8259A_irq(irq); ioapic_write_entry(apic, pin, entry); @@ -1352,6 +1652,9 @@ static void __init setup_IO_APIC_irqs(vo { int apic, pin, idx, irq; int notcon = 0; + struct irq_desc *desc; + struct irq_cfg *cfg; + int cpu = boot_cpu_id; apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); @@ -1386,9 +1689,15 @@ static void __init setup_IO_APIC_irqs(vo if (multi_timer_check(apic, irq)) continue; #endif - add_pin_to_irq(irq, apic, pin); + desc = irq_to_desc_alloc_cpu(irq, cpu); + if (!desc) { + printk(KERN_INFO "can not get irq_desc for %d\n", irq); + continue; + } + cfg = desc->chip_data; + add_pin_to_irq_cpu(cfg, cpu, apic, pin); - setup_IO_APIC_irq(apic, pin, irq, + setup_IO_APIC_irq(apic, pin, irq, desc, irq_trigger(idx), irq_polarity(idx)); } } @@ -1448,6 +1757,7 @@ __apicdebuginit(void) print_IO_APIC(void union IO_APIC_reg_03 reg_03; unsigned long flags; struct irq_cfg *cfg; + struct irq_desc *desc; unsigned int irq; if (apic_verbosity == APIC_QUIET) @@ -1537,8 +1847,11 @@ __apicdebuginit(void) print_IO_APIC(void } } printk(KERN_DEBUG "IRQ to pin mappings:\n"); - for_each_irq_cfg(irq, cfg) { - struct irq_pin_list *entry = cfg->irq_2_pin; + for_each_irq_desc(irq, desc) { + struct irq_pin_list *entry; + + cfg = desc->chip_data; + entry = cfg->irq_2_pin; if (!entry) continue; printk(KERN_DEBUG "IRQ%d ", irq); @@ -2030,14 +2343,16 @@ static unsigned int startup_ioapic_irq(u { int was_pending = 0; unsigned long flags; + struct irq_cfg *cfg; spin_lock_irqsave(&ioapic_lock, flags); - if (irq < 16) { + if (irq < NR_IRQS_LEGACY) { disable_8259A_irq(irq); if (i8259A_irq_pending(irq)) was_pending = 1; } - __unmask_IO_APIC_irq(irq); + cfg = irq_cfg(irq); + __unmask_IO_APIC_irq(cfg); spin_unlock_irqrestore(&ioapic_lock, flags); return was_pending; @@ -2051,7 +2366,7 @@ static int ioapic_retrigger_irq(unsigned unsigned long flags; spin_lock_irqsave(&vector_lock, flags); - send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector); + send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); spin_unlock_irqrestore(&vector_lock, flags); return 1; @@ -2100,35 +2415,35 @@ static DECLARE_DELAYED_WORK(ir_migration * as simple as edge triggered migration and we can do the irq migration * with a simple atomic update to IO-APIC RTE. 
*/ -static void migrate_ioapic_irq(int irq, cpumask_t mask) +static void +migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; - struct irq_desc *desc; - cpumask_t tmp, cleanup_mask; struct irte irte; int modify_ioapic_rte; unsigned int dest; unsigned long flags; + unsigned int irq; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) + if (!cpumask_intersects(mask, cpu_online_mask)) return; + irq = desc->irq; if (get_irte(irq, &irte)) return; - if (assign_irq_vector(irq, mask)) + cfg = desc->chip_data; + if (assign_irq_vector(irq, cfg, mask)) return; - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); + set_extra_move_desc(desc, mask); + + dest = cpu_mask_to_apicid_and(cfg->domain, mask); - desc = irq_to_desc(irq); modify_ioapic_rte = desc->status & IRQ_LEVEL; if (modify_ioapic_rte) { spin_lock_irqsave(&ioapic_lock, flags); - __target_IO_APIC_irq(irq, dest, cfg->vector); + __target_IO_APIC_irq(irq, dest, cfg); spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -2140,24 +2455,20 @@ static void migrate_ioapic_irq(int irq, */ modify_irte(irq, &irte); - if (cfg->move_in_progress) { - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } + if (cfg->move_in_progress) + send_cleanup_vector(cfg); - desc->affinity = mask; + cpumask_copy(&desc->affinity, mask); } -static int migrate_irq_remapped_level(int irq) +static int migrate_irq_remapped_level_desc(struct irq_desc *desc) { int ret = -1; - struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg = desc->chip_data; - mask_IO_APIC_irq(irq); + mask_IO_APIC_irq_desc(desc); - if (io_apic_level_ack_pending(irq)) { + if (io_apic_level_ack_pending(cfg)) { /* * Interrupt in progress. Migrating irq now will change the * vector information in the IO-APIC RTE and that will confuse @@ -2169,14 +2480,15 @@ static int migrate_irq_remapped_level(in } /* everything is clear. we have right of way */ - migrate_ioapic_irq(irq, desc->pending_mask); + migrate_ioapic_irq_desc(desc, &desc->pending_mask); ret = 0; desc->status &= ~IRQ_MOVE_PENDING; - cpus_clear(desc->pending_mask); + cpumask_clear(&desc->pending_mask); unmask: - unmask_IO_APIC_irq(irq); + unmask_IO_APIC_irq_desc(desc); + return ret; } @@ -2197,7 +2509,7 @@ static void ir_irq_migration(struct work continue; } - desc->chip->set_affinity(irq, desc->pending_mask); + desc->chip->set_affinity(irq, &desc->pending_mask); spin_unlock_irqrestore(&desc->lock, flags); } } @@ -2206,28 +2518,33 @@ static void ir_irq_migration(struct work /* * Migrates the IRQ destination in the process context.
*/ -static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) +static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, + const struct cpumask *mask) { - struct irq_desc *desc = irq_to_desc(irq); - if (desc->status & IRQ_LEVEL) { desc->status |= IRQ_MOVE_PENDING; - desc->pending_mask = mask; - migrate_irq_remapped_level(irq); + cpumask_copy(&desc->pending_mask, mask); + migrate_irq_remapped_level_desc(desc); return; } - migrate_ioapic_irq(irq, mask); + migrate_ioapic_irq_desc(desc, mask); +} +static void set_ir_ioapic_affinity_irq(unsigned int irq, + const struct cpumask *mask) +{ + struct irq_desc *desc = irq_to_desc(irq); + + set_ir_ioapic_affinity_irq_desc(desc, mask); } #endif asmlinkage void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; + ack_APIC_irq(); -#ifdef CONFIG_X86_64 exit_idle(); -#endif irq_enter(); me = smp_processor_id(); @@ -2237,6 +2554,9 @@ asmlinkage void smp_irq_move_cleanup_int struct irq_cfg *cfg; irq = __get_cpu_var(vector_irq)[vector]; + if (irq == -1) + continue; + desc = irq_to_desc(irq); if (!desc) continue; @@ -2246,7 +2566,7 @@ asmlinkage void smp_irq_move_cleanup_int if (!cfg->move_cleanup_count) goto unlock; - if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) goto unlock; __get_cpu_var(vector_irq)[vector] = -1; @@ -2258,28 +2578,45 @@ unlock: irq_exit(); } -static void irq_complete_move(unsigned int irq) +static void irq_complete_move(struct irq_desc **descp) { - struct irq_cfg *cfg = irq_cfg(irq); + struct irq_desc *desc = *descp; + struct irq_cfg *cfg = desc->chip_data; unsigned vector, me; - if (likely(!cfg->move_in_progress)) + if (likely(!cfg->move_in_progress)) { +#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC + if (likely(!cfg->move_desc_pending)) + return; + + /* domain has not changed, but affinity did */ + me = smp_processor_id(); + if (cpu_isset(me, desc->affinity)) { + *descp = desc = move_irq_desc(desc, me); + /* get the new one */ + cfg = desc->chip_data; + cfg->move_desc_pending = 0; + } +#endif return; + } vector = ~get_irq_regs()->orig_ax; me = smp_processor_id(); - if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { - cpumask_t cleanup_mask; - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) { +#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC + *descp = desc = move_irq_desc(desc, me); + /* get the new one */ + cfg = desc->chip_data; +#endif + send_cleanup_vector(cfg); } } #else -static inline void irq_complete_move(unsigned int irq) {} +static inline void irq_complete_move(struct irq_desc **descp) {} #endif + #ifdef CONFIG_INTR_REMAP static void ack_x2apic_level(unsigned int irq) { @@ -2290,11 +2627,14 @@ static void ack_x2apic_edge(unsigned int { ack_x2APIC_irq(); } + #endif static void ack_apic_edge(unsigned int irq) { - irq_complete_move(irq); + struct irq_desc *desc = irq_to_desc(irq); + + irq_complete_move(&desc); move_native_irq(irq); ack_APIC_irq(); } @@ -2303,18 +2643,21 @@ atomic_t irq_mis_count; static void ack_apic_level(unsigned int irq) { + struct irq_desc *desc = irq_to_desc(irq); + #ifdef CONFIG_X86_32 unsigned long v; int i; #endif + struct irq_cfg *cfg; int do_unmask_irq = 0; - irq_complete_move(irq); + irq_complete_move(&desc); #ifdef CONFIG_GENERIC_PENDING_IRQ /* If we are moving the irq we need to mask it */ - if 
(unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { + if (unlikely(desc->status & IRQ_MOVE_PENDING)) { do_unmask_irq = 1; - mask_IO_APIC_irq(irq); + mask_IO_APIC_irq_desc(desc); } #endif @@ -2338,7 +2681,8 @@ static void ack_apic_level(unsigned int * operation to prevent an edge-triggered interrupt escaping meanwhile. * The idea is from Manfred Spraul. --macro */ - i = irq_cfg(irq)->vector; + cfg = desc->chip_data; + i = cfg->vector; v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); #endif @@ -2377,17 +2721,18 @@ static void ack_apic_level(unsigned int * accurate and is causing problems then it is a hardware bug * and you can go talk to the chipset vendor about it. */ - if (!io_apic_level_ack_pending(irq)) + cfg = desc->chip_data; + if (!io_apic_level_ack_pending(cfg)) move_masked_irq(irq); - unmask_IO_APIC_irq(irq); + unmask_IO_APIC_irq_desc(desc); } #ifdef CONFIG_X86_32 if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(irq); - __unmask_and_level_IO_APIC_irq(irq); + __mask_and_edge_IO_APIC_irq(cfg); + __unmask_and_level_IO_APIC_irq(cfg); spin_unlock(&ioapic_lock); } #endif @@ -2439,24 +2784,23 @@ static inline void init_IO_APIC_traps(vo * Also, we've got to be careful not to trash gate * 0x80, because int 0x80 is hm, kind of importantish. ;) */ - for_each_irq_cfg(irq, cfg) { + for_each_irq_desc(irq, desc) { #ifdef CONFIG_XEN if (irq < PIRQ_BASE || irq >= PIRQ_BASE + NR_PIRQS) continue; #endif - if (IO_APIC_IRQ(irq) && !cfg->vector) { + cfg = desc->chip_data; + if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { /* * Hmm.. We don't have an entry for this, * so default to an old-fashioned 8259 * interrupt if we can.. */ - if (irq < 16) + if (irq < NR_IRQS_LEGACY) make_8259A_irq(irq); - else { - desc = irq_to_desc(irq); + else /* Strange. Oh, well.. */ desc->chip = &no_irq_chip; - } } } } @@ -2482,7 +2826,7 @@ static void unmask_lapic_irq(unsigned in apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); } -static void ack_lapic_irq (unsigned int irq) +static void ack_lapic_irq(unsigned int irq) { ack_APIC_irq(); } @@ -2494,11 +2838,8 @@ static struct irq_chip lapic_chip __read .ack = ack_lapic_irq, }; -static void lapic_register_intr(int irq) +static void lapic_register_intr(int irq, struct irq_desc *desc) { - struct irq_desc *desc; - - desc = irq_to_desc(irq); desc->status &= ~IRQ_LEVEL; set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, "edge"); @@ -2602,7 +2943,9 @@ int timer_through_8259 __initdata; */ static inline void __init check_timer(void) { - struct irq_cfg *cfg = irq_cfg(0); + struct irq_desc *desc = irq_to_desc(0); + struct irq_cfg *cfg = desc->chip_data; + int cpu = boot_cpu_id; int apic1, pin1, apic2, pin2; unsigned long flags; unsigned int ver; @@ -2617,7 +2960,7 @@ static inline void __init check_timer(vo * get/set the timer IRQ vector: */ disable_8259A_irq(0); - assign_irq_vector(0, TARGET_CPUS); + assign_irq_vector(0, cfg, TARGET_CPUS); /* * As IRQ0 is to be enabled in the 8259A, the virtual @@ -2668,10 +3011,10 @@ static inline void __init check_timer(vo * Ok, does IRQ0 through the IOAPIC work? 
*/ if (no_pin1) { - add_pin_to_irq(0, apic1, pin1); + add_pin_to_irq_cpu(cfg, cpu, apic1, pin1); setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); } - unmask_IO_APIC_irq(0); + unmask_IO_APIC_irq_desc(desc); if (timer_irq_works()) { if (nmi_watchdog == NMI_IO_APIC) { setup_nmi(); @@ -2697,9 +3040,9 @@ static inline void __init check_timer(vo /* * legacy devices should be connected to IO APIC #0 */ - replace_pin_at_irq(0, apic1, pin1, apic2, pin2); + replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2); setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); - unmask_IO_APIC_irq(0); + unmask_IO_APIC_irq_desc(desc); enable_8259A_irq(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); @@ -2731,7 +3074,7 @@ static inline void __init check_timer(vo apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...\n"); - lapic_register_intr(0); + lapic_register_intr(0, desc); apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ enable_8259A_irq(0); @@ -2930,22 +3273,26 @@ unsigned int create_irq_nr(unsigned int unsigned int irq; unsigned int new; unsigned long flags; - struct irq_cfg *cfg_new; - - irq_want = nr_irqs - 1; + struct irq_cfg *cfg_new = NULL; + int cpu = boot_cpu_id; + struct irq_desc *desc_new = NULL; irq = 0; spin_lock_irqsave(&vector_lock, flags); - for (new = irq_want; new > 0; new--) { + for (new = irq_want; new < NR_IRQS; new++) { if (platform_legacy_irq(new)) continue; - cfg_new = irq_cfg(new); - if (cfg_new && cfg_new->vector != 0) + + desc_new = irq_to_desc_alloc_cpu(new, cpu); + if (!desc_new) { + printk(KERN_INFO "can not get irq_desc for %d\n", new); + continue; + } + cfg_new = desc_new->chip_data; + + if (cfg_new->vector != 0) continue; - /* check if need to create one */ - if (!cfg_new) - cfg_new = irq_cfg_alloc(new); - if (__assign_irq_vector(new, TARGET_CPUS) == 0) + if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0) irq = new; break; } @@ -2953,15 +3300,21 @@ unsigned int create_irq_nr(unsigned int if (irq > 0) { dynamic_irq_init(irq); + /* restore it, in case dynamic_irq_init clear it */ + if (desc_new) + desc_new->chip_data = cfg_new; } return irq; } +static int nr_irqs_gsi = NR_IRQS_LEGACY; int create_irq(void) { + unsigned int irq_want; int irq; - irq = create_irq_nr(nr_irqs - 1); + irq_want = nr_irqs_gsi; + irq = create_irq_nr(irq_want); if (irq == 0) irq = -1; @@ -2972,14 +3325,22 @@ int create_irq(void) void destroy_irq(unsigned int irq) { unsigned long flags; + struct irq_cfg *cfg; + struct irq_desc *desc; + /* store it, in case dynamic_irq_cleanup clear it */ + desc = irq_to_desc(irq); + cfg = desc->chip_data; dynamic_irq_cleanup(irq); + /* connect back irq_cfg */ + if (desc) + desc->chip_data = cfg; #ifdef CONFIG_INTR_REMAP free_irte(irq); #endif spin_lock_irqsave(&vector_lock, flags); - __clear_irq_vector(irq); + __clear_irq_vector(irq, cfg); spin_unlock_irqrestore(&vector_lock, flags); } #endif /* CONFIG_XEN */ @@ -2993,16 +3354,13 @@ static int msi_compose_msg(struct pci_de struct irq_cfg *cfg; int err; unsigned dest; - cpumask_t tmp; - tmp = TARGET_CPUS; - err = assign_irq_vector(irq, tmp); + cfg = irq_cfg(irq); + err = assign_irq_vector(irq, cfg, TARGET_CPUS); if (err) return err; - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, tmp); - dest = cpu_mask_to_apicid(tmp); + dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); #ifdef CONFIG_INTR_REMAP if (irq_remapped(irq)) { @@ -3056,64 +3414,48 @@ static int msi_compose_msg(struct pci_de } #ifdef CONFIG_SMP -static void 
set_msi_irq_affinity(unsigned int irq, cpumask_t mask) +static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) { + struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; struct msi_msg msg; unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); + cfg = desc->chip_data; - read_msi_msg(irq, &msg); + read_msi_msg_desc(desc, &msg); msg.data &= ~MSI_DATA_VECTOR_MASK; msg.data |= MSI_DATA_VECTOR(cfg->vector); msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; msg.address_lo |= MSI_ADDR_DEST_ID(dest); - write_msi_msg(irq, &msg); - desc = irq_to_desc(irq); - desc->affinity = mask; + write_msi_msg_desc(desc, &msg); } - #ifdef CONFIG_INTR_REMAP /* * Migrate the MSI irq to another cpumask. This migration is * done in the process context using interrupt-remapping hardware. */ -static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) +static void +ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) { - struct irq_cfg *cfg; + struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg = desc->chip_data; unsigned int dest; - cpumask_t tmp, cleanup_mask; struct irte irte; - struct irq_desc *desc; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; if (get_irte(irq, &irte)) return; - if (assign_irq_vector(irq, mask)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - irte.vector = cfg->vector; irte.dest_id = IRTE_DEST(dest); @@ -3127,16 +3469,10 @@ static void ir_set_msi_irq_affinity(unsi * at the new destination. So, time to cleanup the previous * vector allocation. 
*/ - if (cfg->move_in_progress) { - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } - - desc = irq_to_desc(irq); - desc->affinity = mask; + if (cfg->move_in_progress) + send_cleanup_vector(cfg); } + #endif #endif /* CONFIG_SMP */ @@ -3195,7 +3531,7 @@ static int msi_alloc_irte(struct pci_dev } #endif -static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) +static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) { int ret; struct msi_msg msg; @@ -3204,7 +3540,7 @@ static int setup_msi_irq(struct pci_dev if (ret < 0) return ret; - set_irq_msi(irq, desc); + set_irq_msi(irq, msidesc); write_msi_msg(irq, &msg); #ifdef CONFIG_INTR_REMAP @@ -3224,26 +3560,13 @@ static int setup_msi_irq(struct pci_dev return 0; } -static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) -{ - unsigned int irq; - - irq = dev->bus->number; - irq <<= 8; - irq |= dev->devfn; - irq <<= 12; - - return irq; -} - -int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) +int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc) { unsigned int irq; int ret; unsigned int irq_want; - irq_want = build_irq_for_pci_dev(dev) + 0x100; - + irq_want = nr_irqs_gsi; irq = create_irq_nr(irq_want); if (irq == 0) return -1; @@ -3257,7 +3580,7 @@ int arch_setup_msi_irq(struct pci_dev *d goto error; no_ir: #endif - ret = setup_msi_irq(dev, desc, irq); + ret = setup_msi_irq(dev, msidesc, irq); if (ret < 0) { destroy_irq(irq); return ret; @@ -3275,7 +3598,7 @@ int arch_setup_msi_irqs(struct pci_dev * { unsigned int irq; int ret, sub_handle; - struct msi_desc *desc; + struct msi_desc *msidesc; unsigned int irq_want; #ifdef CONFIG_INTR_REMAP @@ -3283,10 +3606,11 @@ int arch_setup_msi_irqs(struct pci_dev * int index = 0; #endif - irq_want = build_irq_for_pci_dev(dev) + 0x100; + irq_want = nr_irqs_gsi; sub_handle = 0; - list_for_each_entry(desc, &dev->msi_list, list) { - irq = create_irq_nr(irq_want--); + list_for_each_entry(msidesc, &dev->msi_list, list) { + irq = create_irq_nr(irq_want); + irq_want++; if (irq == 0) return -1; #ifdef CONFIG_INTR_REMAP @@ -3318,7 +3642,7 @@ int arch_setup_msi_irqs(struct pci_dev * } no_ir: #endif - ret = setup_msi_irq(dev, desc, irq); + ret = setup_msi_irq(dev, msidesc, irq); if (ret < 0) goto error; sub_handle++; @@ -3337,24 +3661,18 @@ void arch_teardown_msi_irq(unsigned int #ifdef CONFIG_DMAR #ifdef CONFIG_SMP -static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) +static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { + struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; struct msi_msg msg; unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); + cfg = desc->chip_data; dmar_msi_read(irq, &msg); @@ -3364,9 +3682,8 @@ static void dmar_msi_set_affinity(unsign msg.address_lo |= MSI_ADDR_DEST_ID(dest); dmar_msi_write(irq, &msg); - desc = irq_to_desc(irq); - desc->affinity = mask; } + #endif /* CONFIG_SMP */ struct irq_chip dmar_msi_type = { @@ -3398,24 +3715,18 @@ int arch_setup_dmar_msi(unsigned int irq #ifdef CONFIG_HPET_TIMER #ifdef CONFIG_SMP -static void 
hpet_msi_set_affinity(unsigned int irq, cpumask_t mask) +static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { + struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; - struct irq_desc *desc; struct msi_msg msg; unsigned int dest; - cpumask_t tmp; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); + cfg = desc->chip_data; hpet_msi_read(irq, &msg); @@ -3425,9 +3736,8 @@ static void hpet_msi_set_affinity(unsign msg.address_lo |= MSI_ADDR_DEST_ID(dest); hpet_msi_write(irq, &msg); - desc = irq_to_desc(irq); - desc->affinity = mask; } + #endif /* CONFIG_SMP */ struct irq_chip hpet_msi_type = { @@ -3480,28 +3790,21 @@ static void target_ht_irq(unsigned int i write_ht_irq_msg(irq, &msg); } -static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) +static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) { + struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); + cfg = desc->chip_data; target_ht_irq(irq, dest, cfg->vector); - desc = irq_to_desc(irq); - desc->affinity = mask; } + #endif static struct irq_chip ht_irq_chip = { @@ -3519,17 +3822,14 @@ int arch_setup_ht_irq(unsigned int irq, { struct irq_cfg *cfg; int err; - cpumask_t tmp; - tmp = TARGET_CPUS; - err = assign_irq_vector(irq, tmp); + cfg = irq_cfg(irq); + err = assign_irq_vector(irq, cfg, TARGET_CPUS); if (!err) { struct ht_irq_msg msg; unsigned dest; - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, tmp); - dest = cpu_mask_to_apicid(tmp); + dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); @@ -3565,7 +3865,7 @@ int arch_setup_ht_irq(unsigned int irq, int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, unsigned long mmr_offset) { - const cpumask_t *eligible_cpu = get_cpu_mask(cpu); + const struct cpumask *eligible_cpu = cpumask_of(cpu); struct irq_cfg *cfg; int mmr_pnode; unsigned long mmr_value; @@ -3573,7 +3873,9 @@ int arch_enable_uv_irq(char *irq_name, u unsigned long flags; int err; - err = assign_irq_vector(irq, *eligible_cpu); + cfg = irq_cfg(irq); + + err = assign_irq_vector(irq, cfg, eligible_cpu); if (err != 0) return err; @@ -3582,8 +3884,6 @@ int arch_enable_uv_irq(char *irq_name, u irq_name); spin_unlock_irqrestore(&vector_lock, flags); - cfg = irq_cfg(irq); - mmr_value = 0; entry = (struct uv_IO_APIC_route_entry *)&mmr_value; BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); @@ -3594,7 +3894,7 @@ int arch_enable_uv_irq(char *irq_name, u entry->polarity = 0; entry->trigger = 0; entry->mask = 0; - entry->dest = cpu_mask_to_apicid(*eligible_cpu); + entry->dest = cpu_mask_to_apicid(eligible_cpu); mmr_pnode = uv_blade_to_pnode(mmr_blade); uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); @@ -3635,10 +3935,29 @@ int __init io_apic_get_redir_entries (in return reg_01.bits.entries; } -int __init probe_nr_irqs(void) +#ifndef CONFIG_XEN +void __init probe_nr_irqs_gsi(void) { - return NR_IRQS; + int 
nr = 0; + + nr = acpi_probe_gsi(); + if (nr > nr_irqs_gsi) { + nr_irqs_gsi = nr; + } else { + /* for acpi=off or acpi is not compiled in */ + int idx; + + nr = 0; + for (idx = 0; idx < nr_ioapics; idx++) + nr += io_apic_get_redir_entries(idx) + 1; + + if (nr > nr_irqs_gsi) + nr_irqs_gsi = nr; + } + + printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); } +#endif /* -------------------------------------------------------------------------- ACPI-based IOAPIC Configuration @@ -3738,6 +4057,10 @@ int __init io_apic_get_version(int ioapi int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) { + struct irq_desc *desc; + struct irq_cfg *cfg; + int cpu = boot_cpu_id; + #ifdef CONFIG_XEN if (irq < PIRQ_BASE || irq >= PIRQ_BASE + NR_PIRQS) { apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ %d\n", @@ -3752,13 +4075,21 @@ int io_apic_set_pci_routing (int ioapic, return -EINVAL; } + desc = irq_to_desc_alloc_cpu(irq, cpu); + if (!desc) { + printk(KERN_INFO "can not get irq_desc %d\n", irq); + return 0; + } + /* * IRQs < 16 are already in the irq_2_pin[] map */ - if (irq >= 16) - add_pin_to_irq(irq, ioapic, pin); + if (irq >= NR_IRQS_LEGACY) { + cfg = desc->chip_data; + add_pin_to_irq_cpu(cfg, cpu, ioapic, pin); + } - setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity); + setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity); return 0; } @@ -3797,7 +4128,7 @@ void __init setup_ioapic_dest(void) int pin, ioapic, irq, irq_entry; struct irq_desc *desc; struct irq_cfg *cfg; - cpumask_t mask; + const struct cpumask *mask; if (skip_ioapic_setup == 1) return; @@ -3813,9 +4144,10 @@ void __init setup_ioapic_dest(void) * when you have too many devices, because at that time only boot * cpu is online. */ - cfg = irq_cfg(irq); + desc = irq_to_desc(irq); + cfg = desc->chip_data; if (!cfg->vector) { - setup_IO_APIC_irq(ioapic, pin, irq, + setup_IO_APIC_irq(ioapic, pin, irq, desc, irq_trigger(irq_entry), irq_polarity(irq_entry)); continue; @@ -3825,19 +4157,18 @@ void __init setup_ioapic_dest(void) /* * Honour affinities which have been set in early boot */ - desc = irq_to_desc(irq); if (desc->status & (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) - mask = desc->affinity; + mask = &desc->affinity; else mask = TARGET_CPUS; #ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) - set_ir_ioapic_affinity_irq(irq, mask); + set_ir_ioapic_affinity_irq_desc(desc, mask); else #endif - set_ioapic_affinity_irq(irq, mask); + set_ioapic_affinity_irq_desc(desc, mask); } } @@ -3886,7 +4217,6 @@ void __init ioapic_init_mappings(void) struct resource *ioapic_res; int i; - irq_2_pin_init(); ioapic_res = ioapic_setup_resources(); for (i = 0; i < nr_ioapics; i++) { if (smp_found_config) { --- head-2010-04-29.orig/arch/x86/kernel/ioport-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/ioport-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -36,7 +36,7 @@ static void set_bitmap(unsigned long *bi */ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) { - struct thread_struct * t = &current->thread; + struct thread_struct *t = &current->thread; struct physdev_set_iobitmap set_iobitmap; if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) --- head-2010-04-29.orig/arch/x86/kernel/apic/ipi-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/apic/ipi-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -150,31 +150,28 @@ static inline void __send_IPI_dest_field /* * This is only used on smaller machines.
*/ -void send_IPI_mask_bitmask(cpumask_t cpumask, int vector) +void send_IPI_mask_bitmask(const struct cpumask *cpumask, int vector) { #ifndef CONFIG_XEN - unsigned long mask = cpus_addr(cpumask)[0]; + unsigned long mask = cpumask_bits(cpumask)[0]; #else - cpumask_t mask; unsigned int cpu; #endif unsigned long flags; local_irq_save(flags); #ifndef CONFIG_XEN - WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]); + WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]); __send_IPI_dest_field(mask, vector); #else - cpus_andnot(mask, cpumask, cpu_online_map); - WARN_ON(!cpus_empty(mask)); - for_each_online_cpu(cpu) - if (cpu_isset(cpu, cpumask)) - __send_IPI_one(cpu, vector); + WARN_ON(!cpumask_subset(cpumask, cpu_online_mask)); + for_each_cpu_and(cpu, cpumask, cpu_online_mask) + __send_IPI_one(cpu, vector); #endif local_irq_restore(flags); } -void send_IPI_mask_sequence(cpumask_t mask, int vector) +void send_IPI_mask_sequence(const struct cpumask *mask, int vector) { #ifndef CONFIG_XEN unsigned long flags; @@ -187,18 +184,37 @@ void send_IPI_mask_sequence(cpumask_t ma */ local_irq_save(flags); - for_each_possible_cpu(query_cpu) { - if (cpu_isset(query_cpu, mask)) { - __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), - vector); - } - } + for_each_cpu(query_cpu, mask) + __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), vector); local_irq_restore(flags); #else send_IPI_mask_bitmask(mask, vector); #endif } +void send_IPI_mask_allbutself(const struct cpumask *mask, int vector) +{ + unsigned long flags; + unsigned int query_cpu; + unsigned int this_cpu = smp_processor_id(); + + /* See Hack comment above */ + + local_irq_save(flags); +#ifndef CONFIG_XEN + for_each_cpu(query_cpu, mask) + if (query_cpu != this_cpu) + __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), + vector); +#else + WARN_ON(!cpumask_subset(mask, cpu_online_mask)); + for_each_cpu_and(query_cpu, mask, cpu_online_mask) + if (query_cpu != this_cpu) + __send_IPI_one(query_cpu, vector); +#endif + local_irq_restore(flags); +} + #ifndef CONFIG_XEN /* must come after the send_IPI functions above for inlining */ static int convert_apicid_to_cpu(int apic_id) --- head-2010-04-29.orig/arch/x86/kernel/irq-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/irq-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -5,10 +5,11 @@ #include #include #include +#include #include #include -#include +#include atomic_t irq_err_count; @@ -43,57 +44,57 @@ void ack_bad_irq(unsigned int irq) /* * /proc/interrupts printing: */ -static int show_other_interrupts(struct seq_file *p) +static int show_other_interrupts(struct seq_file *p, int prec) { int j; - seq_printf(p, "NMI: "); + seq_printf(p, "%*s: ", prec, "NMI"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->__nmi_count); seq_printf(p, " Non-maskable interrupts\n"); #ifdef CONFIG_X86_LOCAL_APIC - seq_printf(p, "LOC: "); + seq_printf(p, "%*s: ", prec, "LOC"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); seq_printf(p, " Local timer interrupts\n"); #endif #ifdef CONFIG_SMP - seq_printf(p, "RES: "); + seq_printf(p, "%*s: ", prec, "RES"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count); seq_printf(p, " Rescheduling interrupts\n"); - seq_printf(p, "CAL: "); + seq_printf(p, "%*s: ", prec, "CAL"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_call_count); seq_printf(p, " Function call interrupts\n"); #ifndef CONFIG_XEN - seq_printf(p, "TLB: "); + seq_printf(p, "%*s: ", prec, "TLB"); 
for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); seq_printf(p, " TLB shootdowns\n"); #endif #endif #ifdef CONFIG_X86_MCE - seq_printf(p, "TRM: "); + seq_printf(p, "%*s: ", prec, "TRM"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); seq_printf(p, " Thermal event interrupts\n"); # ifdef CONFIG_X86_64 - seq_printf(p, "THR: "); + seq_printf(p, "%*s: ", prec, "THR"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); seq_printf(p, " Threshold APIC interrupts\n"); # endif #endif #ifdef CONFIG_X86_LOCAL_APIC - seq_printf(p, "SPU: "); + seq_printf(p, "%*s: ", prec, "SPU"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); seq_printf(p, " Spurious interrupts\n"); #endif - seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); + seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) - seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); + seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); #endif return 0; } @@ -101,25 +102,31 @@ static int show_other_interrupts(struct int show_interrupts(struct seq_file *p, void *v) { unsigned long flags, any_count = 0; - int i = *(loff_t *) v, j; + int i = *(loff_t *) v, j, prec; struct irqaction *action; struct irq_desc *desc; if (i > nr_irqs) return 0; + for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec) + j *= 10; + if (i == nr_irqs) - return show_other_interrupts(p); + return show_other_interrupts(p, prec); /* print header */ if (i == 0) { - seq_printf(p, " "); + seq_printf(p, "%*s", prec + 8, ""); for_each_online_cpu(j) seq_printf(p, "CPU%-8d", j); seq_putc(p, '\n'); } desc = irq_to_desc(i); + if (!desc) + return 0; + spin_lock_irqsave(&desc->lock, flags); #ifndef CONFIG_SMP any_count = kstat_irqs(i); @@ -131,7 +138,7 @@ int show_interrupts(struct seq_file *p, if (!action && !any_count) goto out; - seq_printf(p, "%3d: ", i); + seq_printf(p, "%*d: ", prec, i); #ifndef CONFIG_SMP seq_printf(p, "%10u ", kstat_irqs(i)); #else --- head-2010-04-29.orig/arch/x86/kernel/ldt-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/ldt-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -12,8 +12,8 @@ #include #include #include +#include -#include #include #include #include --- head-2010-04-29.orig/arch/x86/kernel/machine_kexec_32.c 2010-03-24 15:12:46.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/machine_kexec_32.c 2010-03-24 15:17:58.000000000 +0100 @@ -123,13 +123,7 @@ void machine_kexec_setup_load_arg(xen_ke memcpy(control_page, relocate_kernel, PAGE_SIZE); xki->page_list[PA_CONTROL_PAGE] = __ma(control_page); - xki->page_list[PA_PGD] = __ma(kexec_pgd); -#ifdef CONFIG_X86_PAE - xki->page_list[PA_PMD_0] = __ma(kexec_pmd0); - xki->page_list[PA_PMD_1] = __ma(kexec_pmd1); -#endif - xki->page_list[PA_PTE_0] = __ma(kexec_pte0); - xki->page_list[PA_PTE_1] = __ma(kexec_pte1); + xki->page_list[PA_PGD] = __ma(image->arch.pgd); if (image->type == KEXEC_TYPE_DEFAULT) xki->page_list[PA_SWAP_PAGE] = page_to_phys(image->swap_page); --- head-2010-04-29.orig/arch/x86/kernel/mpparse-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/mpparse-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -2,7 +2,7 @@ * Intel Multiprocessor Specification 1.1 and 1.4 * compliant MP-table parsing routines. 
* - * (c) 1995 Alan Cox, Building #3 + * (c) 1995 Alan Cox, Building #3 * (c) 1998, 1999, 2000 Ingo Molnar * (c) 2008 Alexey Starikovskiy */ @@ -16,18 +16,18 @@ #include #include #include +#include -#include #include #include #include #include #include -#include #include #include #include #include +#include #include #ifdef CONFIG_X86_32 @@ -54,13 +54,13 @@ static int __init mpf_checksum(unsigned return sum & 0xFF; } -static void __init MP_processor_info(struct mpc_config_processor *m) +static void __init MP_processor_info(struct mpc_cpu *m) { #ifndef CONFIG_XEN int apicid; char *bootup_cpu = ""; - if (!(m->mpc_cpuflag & CPU_ENABLED)) { + if (!(m->cpuflag & CPU_ENABLED)) { disabled_cpus++; return; } @@ -68,57 +68,57 @@ static void __init MP_processor_info(str if (x86_quirks->mpc_apic_id) apicid = x86_quirks->mpc_apic_id(m); else - apicid = m->mpc_apicid; + apicid = m->apicid; - if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { + if (m->cpuflag & CPU_BOOTPROCESSOR) { bootup_cpu = " (Bootup-CPU)"; - boot_cpu_physical_apicid = m->mpc_apicid; + boot_cpu_physical_apicid = m->apicid; } - printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu); - generic_processor_info(apicid, m->mpc_apicver); + printk(KERN_INFO "Processor #%d%s\n", m->apicid, bootup_cpu); + generic_processor_info(apicid, m->apicver); #else /* CONFIG_XEN */ num_processors++; #endif } #ifdef CONFIG_X86_IO_APIC -static void __init MP_bus_info(struct mpc_config_bus *m) +static void __init MP_bus_info(struct mpc_bus *m) { char str[7]; - memcpy(str, m->mpc_bustype, 6); + memcpy(str, m->bustype, 6); str[6] = 0; if (x86_quirks->mpc_oem_bus_info) x86_quirks->mpc_oem_bus_info(m, str); else - apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->mpc_busid, str); + apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str); #if MAX_MP_BUSSES < 256 - if (m->mpc_busid >= MAX_MP_BUSSES) { + if (m->busid >= MAX_MP_BUSSES) { printk(KERN_WARNING "MP table busid value (%d) for bustype %s " " is too large, max. 
supported is %d\n", - m->mpc_busid, str, MAX_MP_BUSSES - 1); + m->busid, str, MAX_MP_BUSSES - 1); return; } #endif if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) { - set_bit(m->mpc_busid, mp_bus_not_pci); -#if defined(CONFIG_EISA) || defined (CONFIG_MCA) - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; + set_bit(m->busid, mp_bus_not_pci); +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) + mp_bus_id_to_type[m->busid] = MP_BUS_ISA; #endif } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { if (x86_quirks->mpc_oem_pci_bus) x86_quirks->mpc_oem_pci_bus(m); - clear_bit(m->mpc_busid, mp_bus_not_pci); -#if defined(CONFIG_EISA) || defined (CONFIG_MCA) - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; + clear_bit(m->busid, mp_bus_not_pci); +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) + mp_bus_id_to_type[m->busid] = MP_BUS_PCI; } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA; + mp_bus_id_to_type[m->busid] = MP_BUS_EISA; } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA) - 1) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; + mp_bus_id_to_type[m->busid] = MP_BUS_MCA; #endif } else printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); @@ -142,32 +142,31 @@ static int bad_ioapic(unsigned long addr return 0; } -static void __init MP_ioapic_info(struct mpc_config_ioapic *m) +static void __init MP_ioapic_info(struct mpc_ioapic *m) { - if (!(m->mpc_flags & MPC_APIC_USABLE)) + if (!(m->flags & MPC_APIC_USABLE)) return; printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n", - m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); + m->apicid, m->apicver, m->apicaddr); - if (bad_ioapic(m->mpc_apicaddr)) + if (bad_ioapic(m->apicaddr)) return; - mp_ioapics[nr_ioapics].mp_apicaddr = m->mpc_apicaddr; - mp_ioapics[nr_ioapics].mp_apicid = m->mpc_apicid; - mp_ioapics[nr_ioapics].mp_type = m->mpc_type; - mp_ioapics[nr_ioapics].mp_apicver = m->mpc_apicver; - mp_ioapics[nr_ioapics].mp_flags = m->mpc_flags; + mp_ioapics[nr_ioapics].mp_apicaddr = m->apicaddr; + mp_ioapics[nr_ioapics].mp_apicid = m->apicid; + mp_ioapics[nr_ioapics].mp_type = m->type; + mp_ioapics[nr_ioapics].mp_apicver = m->apicver; + mp_ioapics[nr_ioapics].mp_flags = m->flags; nr_ioapics++; } -static void print_MP_intsrc_info(struct mpc_config_intsrc *m) +static void print_MP_intsrc_info(struct mpc_intsrc *m) { apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," " IRQ %02x, APIC ID %x, APIC INT %02x\n", - m->mpc_irqtype, m->mpc_irqflag & 3, - (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, - m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); + m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus, + m->srcbusirq, m->dstapic, m->dstirq); } static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq) @@ -179,52 +178,52 @@ static void __init print_mp_irq_info(str mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq); } -static void __init assign_to_mp_irq(struct mpc_config_intsrc *m, +static void __init assign_to_mp_irq(struct mpc_intsrc *m, struct mp_config_intsrc *mp_irq) { - mp_irq->mp_dstapic = m->mpc_dstapic; - mp_irq->mp_type = m->mpc_type; - mp_irq->mp_irqtype = m->mpc_irqtype; - mp_irq->mp_irqflag = m->mpc_irqflag; - mp_irq->mp_srcbus = m->mpc_srcbus; - mp_irq->mp_srcbusirq = m->mpc_srcbusirq; - mp_irq->mp_dstirq = m->mpc_dstirq; + mp_irq->mp_dstapic = m->dstapic; + mp_irq->mp_type = m->type; + mp_irq->mp_irqtype = m->irqtype; + mp_irq->mp_irqflag = m->irqflag; + mp_irq->mp_srcbus = m->srcbus; + 
mp_irq->mp_srcbusirq = m->srcbusirq; + mp_irq->mp_dstirq = m->dstirq; } static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq, - struct mpc_config_intsrc *m) + struct mpc_intsrc *m) { - m->mpc_dstapic = mp_irq->mp_dstapic; - m->mpc_type = mp_irq->mp_type; - m->mpc_irqtype = mp_irq->mp_irqtype; - m->mpc_irqflag = mp_irq->mp_irqflag; - m->mpc_srcbus = mp_irq->mp_srcbus; - m->mpc_srcbusirq = mp_irq->mp_srcbusirq; - m->mpc_dstirq = mp_irq->mp_dstirq; + m->dstapic = mp_irq->mp_dstapic; + m->type = mp_irq->mp_type; + m->irqtype = mp_irq->mp_irqtype; + m->irqflag = mp_irq->mp_irqflag; + m->srcbus = mp_irq->mp_srcbus; + m->srcbusirq = mp_irq->mp_srcbusirq; + m->dstirq = mp_irq->mp_dstirq; } static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq, - struct mpc_config_intsrc *m) + struct mpc_intsrc *m) { - if (mp_irq->mp_dstapic != m->mpc_dstapic) + if (mp_irq->mp_dstapic != m->dstapic) return 1; - if (mp_irq->mp_type != m->mpc_type) + if (mp_irq->mp_type != m->type) return 2; - if (mp_irq->mp_irqtype != m->mpc_irqtype) + if (mp_irq->mp_irqtype != m->irqtype) return 3; - if (mp_irq->mp_irqflag != m->mpc_irqflag) + if (mp_irq->mp_irqflag != m->irqflag) return 4; - if (mp_irq->mp_srcbus != m->mpc_srcbus) + if (mp_irq->mp_srcbus != m->srcbus) return 5; - if (mp_irq->mp_srcbusirq != m->mpc_srcbusirq) + if (mp_irq->mp_srcbusirq != m->srcbusirq) return 6; - if (mp_irq->mp_dstirq != m->mpc_dstirq) + if (mp_irq->mp_dstirq != m->dstirq) return 7; return 0; } -static void __init MP_intsrc_info(struct mpc_config_intsrc *m) +static void __init MP_intsrc_info(struct mpc_intsrc *m) { int i; @@ -242,57 +241,55 @@ static void __init MP_intsrc_info(struct #endif -static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m) +static void __init MP_lintsrc_info(struct mpc_lintsrc *m) { apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x," " IRQ %02x, APIC ID %x, APIC LINT %02x\n", - m->mpc_irqtype, m->mpc_irqflag & 3, - (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid, - m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); + m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbusid, + m->srcbusirq, m->destapic, m->destapiclint); } /* * Read/parse the MPC */ -static int __init smp_check_mpc(struct mp_config_table *mpc, char *oem, - char *str) +static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str) { - if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) { + if (memcmp(mpc->signature, MPC_SIGNATURE, 4)) { printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n", - mpc->mpc_signature[0], mpc->mpc_signature[1], - mpc->mpc_signature[2], mpc->mpc_signature[3]); + mpc->signature[0], mpc->signature[1], + mpc->signature[2], mpc->signature[3]); return 0; } - if (mpf_checksum((unsigned char *)mpc, mpc->mpc_length)) { + if (mpf_checksum((unsigned char *)mpc, mpc->length)) { printk(KERN_ERR "MPTABLE: checksum error!\n"); return 0; } - if (mpc->mpc_spec != 0x01 && mpc->mpc_spec != 0x04) { + if (mpc->spec != 0x01 && mpc->spec != 0x04) { printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n", - mpc->mpc_spec); + mpc->spec); return 0; } - if (!mpc->mpc_lapic) { + if (!mpc->lapic) { printk(KERN_ERR "MPTABLE: null local APIC address!\n"); return 0; } - memcpy(oem, mpc->mpc_oem, 8); + memcpy(oem, mpc->oem, 8); oem[8] = 0; printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem); - memcpy(str, mpc->mpc_productid, 12); + memcpy(str, mpc->productid, 12); str[12] = 0; printk(KERN_INFO "MPTABLE: Product ID: %s\n", str); - printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", 
mpc->mpc_lapic); + printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->lapic); return 1; } -static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) +static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) { char str[16]; char oem[10]; @@ -317,14 +314,14 @@ static int __init smp_read_mpc(struct mp #endif /* save the local APIC address, it might be non-default */ if (!acpi_lapic) - mp_lapic_addr = mpc->mpc_lapic; + mp_lapic_addr = mpc->lapic; if (early) return 1; - if (mpc->mpc_oemptr && x86_quirks->smp_read_mpc_oem) { - struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)(unsigned long)mpc->mpc_oemptr; - x86_quirks->smp_read_mpc_oem(oem_table, mpc->mpc_oemsize); + if (mpc->oemptr && x86_quirks->smp_read_mpc_oem) { + struct mpc_oemtable *oem_table = (void *)(long)mpc->oemptr; + x86_quirks->smp_read_mpc_oem(oem_table, mpc->oemsize); } /* @@ -333,12 +330,11 @@ static int __init smp_read_mpc(struct mp if (x86_quirks->mpc_record) *x86_quirks->mpc_record = 0; - while (count < mpc->mpc_length) { + while (count < mpc->length) { switch (*mpt) { case MP_PROCESSOR: { - struct mpc_config_processor *m = - (struct mpc_config_processor *)mpt; + struct mpc_cpu *m = (struct mpc_cpu *)mpt; /* ACPI may have already provided this data */ if (!acpi_lapic) MP_processor_info(m); @@ -348,8 +344,7 @@ static int __init smp_read_mpc(struct mp } case MP_BUS: { - struct mpc_config_bus *m = - (struct mpc_config_bus *)mpt; + struct mpc_bus *m = (struct mpc_bus *)mpt; #ifdef CONFIG_X86_IO_APIC MP_bus_info(m); #endif @@ -360,30 +355,28 @@ static int __init smp_read_mpc(struct mp case MP_IOAPIC: { #ifdef CONFIG_X86_IO_APIC - struct mpc_config_ioapic *m = - (struct mpc_config_ioapic *)mpt; + struct mpc_ioapic *m = (struct mpc_ioapic *)mpt; MP_ioapic_info(m); #endif - mpt += sizeof(struct mpc_config_ioapic); - count += sizeof(struct mpc_config_ioapic); + mpt += sizeof(struct mpc_ioapic); + count += sizeof(struct mpc_ioapic); break; } case MP_INTSRC: { #ifdef CONFIG_X86_IO_APIC - struct mpc_config_intsrc *m = - (struct mpc_config_intsrc *)mpt; + struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; MP_intsrc_info(m); #endif - mpt += sizeof(struct mpc_config_intsrc); - count += sizeof(struct mpc_config_intsrc); + mpt += sizeof(struct mpc_intsrc); + count += sizeof(struct mpc_intsrc); break; } case MP_LINTSRC: { - struct mpc_config_lintsrc *m = - (struct mpc_config_lintsrc *)mpt; + struct mpc_lintsrc *m = + (struct mpc_lintsrc *)mpt; MP_lintsrc_info(m); mpt += sizeof(*m); count += sizeof(*m); @@ -394,8 +387,8 @@ static int __init smp_read_mpc(struct mp printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"); printk(KERN_ERR "type %x\n", *mpt); print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16, - 1, mpc, mpc->mpc_length, 1); - count = mpc->mpc_length; + 1, mpc, mpc->length, 1); + count = mpc->length; break; } if (x86_quirks->mpc_record) @@ -426,16 +419,16 @@ static int __init ELCR_trigger(unsigned static void __init construct_default_ioirq_mptable(int mpc_default_type) { - struct mpc_config_intsrc intsrc; + struct mpc_intsrc intsrc; int i; int ELCR_fallback = 0; - intsrc.mpc_type = MP_INTSRC; - intsrc.mpc_irqflag = 0; /* conforming */ - intsrc.mpc_srcbus = 0; - intsrc.mpc_dstapic = mp_ioapics[0].mp_apicid; + intsrc.type = MP_INTSRC; + intsrc.irqflag = 0; /* conforming */ + intsrc.srcbus = 0; + intsrc.dstapic = mp_ioapics[0].mp_apicid; - intsrc.mpc_irqtype = mp_INT; + intsrc.irqtype = mp_INT; /* * If true, we have an ISA/PCI system with no IRQ entries @@ -478,30 +471,30 @@ 
static void __init construct_default_ioi * irqflag field (level sensitive, active high polarity). */ if (ELCR_trigger(i)) - intsrc.mpc_irqflag = 13; + intsrc.irqflag = 13; else - intsrc.mpc_irqflag = 0; + intsrc.irqflag = 0; } - intsrc.mpc_srcbusirq = i; - intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ + intsrc.srcbusirq = i; + intsrc.dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ MP_intsrc_info(&intsrc); } - intsrc.mpc_irqtype = mp_ExtINT; - intsrc.mpc_srcbusirq = 0; - intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */ + intsrc.irqtype = mp_ExtINT; + intsrc.srcbusirq = 0; + intsrc.dstirq = 0; /* 8259A to INTIN0 */ MP_intsrc_info(&intsrc); } static void __init construct_ioapic_table(int mpc_default_type) { - struct mpc_config_ioapic ioapic; - struct mpc_config_bus bus; + struct mpc_ioapic ioapic; + struct mpc_bus bus; - bus.mpc_type = MP_BUS; - bus.mpc_busid = 0; + bus.type = MP_BUS; + bus.busid = 0; switch (mpc_default_type) { default: printk(KERN_ERR "???\nUnknown standard configuration %d\n", @@ -509,29 +502,29 @@ static void __init construct_ioapic_tabl /* fall through */ case 1: case 5: - memcpy(bus.mpc_bustype, "ISA ", 6); + memcpy(bus.bustype, "ISA ", 6); break; case 2: case 6: case 3: - memcpy(bus.mpc_bustype, "EISA ", 6); + memcpy(bus.bustype, "EISA ", 6); break; case 4: case 7: - memcpy(bus.mpc_bustype, "MCA ", 6); + memcpy(bus.bustype, "MCA ", 6); } MP_bus_info(&bus); if (mpc_default_type > 4) { - bus.mpc_busid = 1; - memcpy(bus.mpc_bustype, "PCI ", 6); + bus.busid = 1; + memcpy(bus.bustype, "PCI ", 6); MP_bus_info(&bus); } - ioapic.mpc_type = MP_IOAPIC; - ioapic.mpc_apicid = 2; - ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; - ioapic.mpc_flags = MPC_APIC_USABLE; - ioapic.mpc_apicaddr = 0xFEC00000; + ioapic.type = MP_IOAPIC; + ioapic.apicid = 2; + ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01; + ioapic.flags = MPC_APIC_USABLE; + ioapic.apicaddr = 0xFEC00000; MP_ioapic_info(&ioapic); /* @@ -545,8 +538,8 @@ static inline void __init construct_ioap static inline void __init construct_default_ISA_mptable(int mpc_default_type) { - struct mpc_config_processor processor; - struct mpc_config_lintsrc lintsrc; + struct mpc_cpu processor; + struct mpc_lintsrc lintsrc; int linttypes[2] = { mp_ExtINT, mp_NMI }; int i; @@ -558,30 +551,30 @@ static inline void __init construct_defa /* * 2 CPUs, numbered 0 & 1. */ - processor.mpc_type = MP_PROCESSOR; + processor.type = MP_PROCESSOR; /* Either an integrated APIC or a discrete 82489DX. */ - processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; - processor.mpc_cpuflag = CPU_ENABLED; - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | + processor.apicver = mpc_default_type > 4 ? 
0x10 : 0x01; + processor.cpuflag = CPU_ENABLED; + processor.cpufeature = (boot_cpu_data.x86 << 8) | (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; - processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; - processor.mpc_reserved[0] = 0; - processor.mpc_reserved[1] = 0; + processor.featureflag = boot_cpu_data.x86_capability[0]; + processor.reserved[0] = 0; + processor.reserved[1] = 0; for (i = 0; i < 2; i++) { - processor.mpc_apicid = i; + processor.apicid = i; MP_processor_info(&processor); } construct_ioapic_table(mpc_default_type); - lintsrc.mpc_type = MP_LINTSRC; - lintsrc.mpc_irqflag = 0; /* conforming */ - lintsrc.mpc_srcbusid = 0; - lintsrc.mpc_srcbusirq = 0; - lintsrc.mpc_destapic = MP_APIC_ALL; + lintsrc.type = MP_LINTSRC; + lintsrc.irqflag = 0; /* conforming */ + lintsrc.srcbusid = 0; + lintsrc.srcbusirq = 0; + lintsrc.destapic = MP_APIC_ALL; for (i = 0; i < 2; i++) { - lintsrc.mpc_irqtype = linttypes[i]; - lintsrc.mpc_destapiclint = i; + lintsrc.irqtype = linttypes[i]; + lintsrc.destapiclint = i; MP_lintsrc_info(&lintsrc); } } @@ -595,26 +588,23 @@ static void __init __get_smp_config(unsi { struct intel_mp_floating *mpf = mpf_found; - if (x86_quirks->mach_get_smp_config) { - if (x86_quirks->mach_get_smp_config(early)) - return; - } + if (!mpf) + return; + if (acpi_lapic && early) return; + /* - * ACPI supports both logical (e.g. Hyper-Threading) and physical - * processors, where MPS only supports physical. + * MPS doesn't support hyperthreading, aka only have + * thread 0 apic id in MPS table */ - if (acpi_lapic && acpi_ioapic) { - printk(KERN_INFO "Using ACPI (MADT) for SMP configuration " - "information\n"); + if (acpi_lapic && acpi_ioapic) return; - } else if (acpi_lapic) - printk(KERN_INFO "Using ACPI for processor (LAPIC) " - "configuration information\n"); - if (!mpf) - return; + if (x86_quirks->mach_get_smp_config) { + if (x86_quirks->mach_get_smp_config(early)) + return; + } printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); @@ -669,15 +659,15 @@ static void __init __get_smp_config(unsi * ISA defaults and hope it will work. */ if (!mp_irq_entries) { - struct mpc_config_bus bus; + struct mpc_bus bus; printk(KERN_ERR "BIOS bug, no explicit IRQ entries, " "using default mptable. 
" "(tell your hw vendor)\n"); - bus.mpc_type = MP_BUS; - bus.mpc_busid = 0; - memcpy(bus.mpc_bustype, "ISA ", 6); + bus.type = MP_BUS; + bus.busid = 0; + memcpy(bus.bustype, "ISA ", 6); MP_bus_info(&bus); construct_default_ioirq_mptable(0); @@ -823,14 +813,14 @@ void __init find_smp_config(void) #ifdef CONFIG_X86_IO_APIC static u8 __initdata irq_used[MAX_IRQ_SOURCES]; -static int __init get_MP_intsrc_index(struct mpc_config_intsrc *m) +static int __init get_MP_intsrc_index(struct mpc_intsrc *m) { int i; - if (m->mpc_irqtype != mp_INT) + if (m->irqtype != mp_INT) return 0; - if (m->mpc_irqflag != 0x0f) + if (m->irqflag != 0x0f) return 0; /* not legacy */ @@ -842,9 +832,9 @@ static int __init get_MP_intsrc_index(s if (mp_irqs[i].mp_irqflag != 0x0f) continue; - if (mp_irqs[i].mp_srcbus != m->mpc_srcbus) + if (mp_irqs[i].mp_srcbus != m->srcbus) continue; - if (mp_irqs[i].mp_srcbusirq != m->mpc_srcbusirq) + if (mp_irqs[i].mp_srcbusirq != m->srcbusirq) continue; if (irq_used[i]) { /* already claimed */ @@ -860,10 +850,10 @@ static int __init get_MP_intsrc_index(s #define SPARE_SLOT_NUM 20 -static struct mpc_config_intsrc __initdata *m_spare[SPARE_SLOT_NUM]; +static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM]; #endif -static int __init replace_intsrc_all(struct mp_config_table *mpc, +static int __init replace_intsrc_all(struct mpc_table *mpc, unsigned long mpc_new_phys, unsigned long mpc_new_length) { @@ -875,36 +865,33 @@ static int __init replace_intsrc_all(st int count = sizeof(*mpc); unsigned char *mpt = ((unsigned char *)mpc) + count; - printk(KERN_INFO "mpc_length %x\n", mpc->mpc_length); - while (count < mpc->mpc_length) { + printk(KERN_INFO "mpc_length %x\n", mpc->length); + while (count < mpc->length) { switch (*mpt) { case MP_PROCESSOR: { - struct mpc_config_processor *m = - (struct mpc_config_processor *)mpt; + struct mpc_cpu *m = (struct mpc_cpu *)mpt; mpt += sizeof(*m); count += sizeof(*m); break; } case MP_BUS: { - struct mpc_config_bus *m = - (struct mpc_config_bus *)mpt; + struct mpc_bus *m = (struct mpc_bus *)mpt; mpt += sizeof(*m); count += sizeof(*m); break; } case MP_IOAPIC: { - mpt += sizeof(struct mpc_config_ioapic); - count += sizeof(struct mpc_config_ioapic); + mpt += sizeof(struct mpc_ioapic); + count += sizeof(struct mpc_ioapic); break; } case MP_INTSRC: { #ifdef CONFIG_X86_IO_APIC - struct mpc_config_intsrc *m = - (struct mpc_config_intsrc *)mpt; + struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; apic_printk(APIC_VERBOSE, "OLD "); print_MP_intsrc_info(m); @@ -925,14 +912,14 @@ static int __init replace_intsrc_all(st nr_m_spare++; } #endif - mpt += sizeof(struct mpc_config_intsrc); - count += sizeof(struct mpc_config_intsrc); + mpt += sizeof(struct mpc_intsrc); + count += sizeof(struct mpc_intsrc); break; } case MP_LINTSRC: { - struct mpc_config_lintsrc *m = - (struct mpc_config_lintsrc *)mpt; + struct mpc_lintsrc *m = + (struct mpc_lintsrc *)mpt; mpt += sizeof(*m); count += sizeof(*m); break; @@ -942,7 +929,7 @@ static int __init replace_intsrc_all(st printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"); printk(KERN_ERR "type %x\n", *mpt); print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16, - 1, mpc, mpc->mpc_length, 1); + 1, mpc, mpc->length, 1); goto out; } } @@ -964,9 +951,8 @@ static int __init replace_intsrc_all(st assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]); m_spare[nr_m_spare] = NULL; } else { - struct mpc_config_intsrc *m = - (struct mpc_config_intsrc *)mpt; - count += sizeof(struct mpc_config_intsrc); + struct mpc_intsrc 
*m = (struct mpc_intsrc *)mpt; + count += sizeof(struct mpc_intsrc); if (!mpc_new_phys) { printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count); } else { @@ -978,17 +964,16 @@ static int __init replace_intsrc_all(st } } assign_to_mpc_intsrc(&mp_irqs[i], m); - mpc->mpc_length = count; - mpt += sizeof(struct mpc_config_intsrc); + mpc->length = count; + mpt += sizeof(struct mpc_intsrc); } print_mp_irq_info(&mp_irqs[i]); } #endif out: /* update checksum */ - mpc->mpc_checksum = 0; - mpc->mpc_checksum -= mpf_checksum((unsigned char *)mpc, - mpc->mpc_length); + mpc->checksum = 0; + mpc->checksum -= mpf_checksum((unsigned char *)mpc, mpc->length); return 0; } @@ -1034,8 +1019,7 @@ static int __init update_mp_table(void) char str[16]; char oem[10]; struct intel_mp_floating *mpf; - struct mp_config_table *mpc; - struct mp_config_table *mpc_new; + struct mpc_table *mpc, *mpc_new; if (!enable_update_mptable) return 0; @@ -1061,7 +1045,7 @@ static int __init update_mp_table(void) printk(KERN_INFO "mpf: %lx\n", (long)arbitrary_virt_to_machine(mpf)); printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr); - if (mpc_new_phys && mpc->mpc_length > mpc_new_length) { + if (mpc_new_phys && mpc->length > mpc_new_length) { mpc_new_phys = 0; printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n", mpc_new_length); @@ -1070,10 +1054,10 @@ static int __init update_mp_table(void) if (!mpc_new_phys) { unsigned char old, new; /* check if we can change the postion */ - mpc->mpc_checksum = 0; - old = mpf_checksum((unsigned char *)mpc, mpc->mpc_length); - mpc->mpc_checksum = 0xff; - new = mpf_checksum((unsigned char *)mpc, mpc->mpc_length); + mpc->checksum = 0; + old = mpf_checksum((unsigned char *)mpc, mpc->length); + mpc->checksum = 0xff; + new = mpf_checksum((unsigned char *)mpc, mpc->length); if (old == new) { printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n"); return 0; @@ -1085,7 +1069,7 @@ static int __init update_mp_table(void) mpc_new_bus = phys_to_machine(mpc_new_phys); mpf->mpf_physptr = mpc_new_bus; mpc_new = phys_to_virt(mpc_new_phys); - memcpy(mpc_new, mpc, mpc->mpc_length); + memcpy(mpc_new, mpc, mpc->length); mpc = mpc_new; /* check if we can modify that */ if (mpc_new_bus - mpf->mpf_physptr) { --- head-2010-04-29.orig/arch/x86/kernel/pci-dma-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/pci-dma-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -30,11 +31,6 @@ int no_iommu __read_mostly; /* Set this to 1 if there is a HW IOMMU in the system */ int iommu_detected __read_mostly = 0; -/* This tells the BIO block layer to assume merging. Default to off - because we cannot guarantee merging later. */ -int iommu_bio_merge __read_mostly = 0; -EXPORT_SYMBOL(iommu_bio_merge); - dma_addr_t bad_dma_address __read_mostly = 0; EXPORT_SYMBOL(bad_dma_address); @@ -42,7 +38,7 @@ EXPORT_SYMBOL(bad_dma_address); be probably a smaller DMA mask, but this is bug-to-bug compatible to older i386. 
*/ struct device x86_dma_fallback_dev = { - .bus_id = "fallback device", + .init_name = "fallback device", .coherent_dma_mask = DMA_32BIT_MASK, .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask, }; @@ -105,8 +101,6 @@ static void __init dma32_free_bootmem(vo dma32_bootmem_ptr = NULL; dma32_bootmem_size = 0; } -#else -#define dma32_free_bootmem() ((void)0) #endif static struct dma_mapping_ops swiotlb_dma_ops = { @@ -128,8 +122,11 @@ static struct dma_mapping_ops swiotlb_dm void __init pci_iommu_alloc(void) { +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); +#endif + /* * The order of these functions is important for * fall-back/fail-over reasons @@ -149,16 +146,6 @@ void __init pci_iommu_alloc(void) } } -#ifndef CONFIG_XEN -unsigned long iommu_nr_pages(unsigned long addr, unsigned long len) -{ - unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE); - - return size >> PAGE_SHIFT; -} -EXPORT_SYMBOL(iommu_nr_pages); -#endif - void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag) { @@ -246,7 +233,6 @@ static __init int iommu_setup(char *p) } if (!strncmp(p, "biomerge", 8)) { - iommu_bio_merge = 4096; iommu_merge = 1; force_iommu = 1; } @@ -385,8 +371,8 @@ fs_initcall(pci_iommu_init); static __devinit void via_no_dac(struct pci_dev *dev) { if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { - printk(KERN_INFO "PCI: VIA PCI bridge detected." - "Disabling DAC.\n"); + printk(KERN_INFO + "PCI: VIA PCI bridge detected. Disabling DAC.\n"); forbid_dac = 1; } } --- head-2010-04-29.orig/arch/x86/kernel/process-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/process-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -1,13 +1,17 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include +#include +#include unsigned long idle_halt; EXPORT_SYMBOL(idle_halt); @@ -99,6 +103,9 @@ static inline int hlt_use_halt(void) */ void xen_idle(void) { + struct power_trace it; + + trace_power_start(&it, POWER_CSTATE, 1); current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we @@ -111,11 +118,27 @@ void xen_idle(void) else local_irq_enable(); current_thread_info()->status |= TS_POLLING; + trace_power_end(&it); } #ifdef CONFIG_APM_MODULE EXPORT_SYMBOL(default_idle); #endif +void stop_this_cpu(void *dummy) +{ + local_irq_disable(); + /* + * Remove this CPU: + */ + cpu_clear(smp_processor_id(), cpu_online_map); + disable_all_local_evtchn(); + + for (;;) { + if (hlt_works(smp_processor_id())) + halt(); + } +} + static void do_nothing(void *unused) { } @@ -149,24 +172,37 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { + struct power_trace it; + + trace_power_start(&it, POWER_CSTATE, (ax>>4)+1); if (!need_resched()) { + if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) + clflush((void *)&current_thread_info()->flags); + + __monitor((void *)&current_thread_info()->flags, 0, 0); smp_mb(); if (!need_resched()) __mwait(ax, cx); } + trace_power_end(&it); } /* Default MONITOR/MWAIT with no hints, used for default C1 state */ static void mwait_idle(void) { + struct power_trace it; if (!need_resched()) { + trace_power_start(&it, POWER_CSTATE, 1); + if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) + clflush((void *)&current_thread_info()->flags); + + __monitor((void *)&current_thread_info()->flags, 0, 0);
smp_mb(); if (!need_resched()) __sti_mwait(0, 0); else local_irq_enable(); + trace_power_end(&it); } else local_irq_enable(); } @@ -179,9 +215,13 @@ static void mwait_idle(void) */ static void poll_idle(void) { + struct power_trace it; + + trace_power_start(&it, POWER_CSTATE, 0); local_irq_enable(); while (!need_resched()) cpu_relax(); + trace_power_end(&it); } #ifndef CONFIG_XEN @@ -267,7 +307,7 @@ static void c1e_idle(void) rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); if (lo & K8_INTP_C1E_ACTIVE_MASK) { c1e_detected = 1; - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) mark_tsc_unstable("TSC halt in AMD C1E"); printk(KERN_INFO "System has AMD C1E enabled\n"); set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E); --- head-2010-04-29.orig/arch/x86/kernel/process_32-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/process_32-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -38,11 +38,13 @@ #include #include #include +#include +#include +#include +#include -#include #include #include -#include #include #include #include @@ -59,10 +61,9 @@ #include #include -#include #include #include -#include +#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork"); @@ -108,9 +109,6 @@ void cpu_idle(void) check_pgt_cache(); rmb(); - if (rcu_pending(cpu)) - rcu_check_callbacks(cpu, 0); - if (cpu_is_offline(cpu)) play_dead(); @@ -208,7 +206,7 @@ extern void kernel_thread_helper(void); /* * Create a kernel thread */ -int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) +int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) { struct pt_regs regs; @@ -247,14 +245,8 @@ void exit_thread(void) t->io_bitmap_ptr = NULL; clear_thread_flag(TIF_IO_BITMAP); } -#ifdef CONFIG_X86_DS - /* Free any DS contexts that have not been properly released. */ - if (unlikely(current->thread.ds_ctx)) { - /* we clear debugctl to make sure DS is not used. */ - update_debugctlmsr(0); - ds_free(current->thread.ds_ctx); - } -#endif /* CONFIG_X86_DS */ + + ds_exit_thread(current); } void flush_thread(void) @@ -267,7 +259,7 @@ void flush_thread(void) tsk->thread.debugreg3 = 0; tsk->thread.debugreg6 = 0; tsk->thread.debugreg7 = 0; - memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); clear_tsk_thread_flag(tsk, TIF_DEBUG); /* * Forget coprocessor state.. 
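Two details in the idle-path hunks above are worth unpacking. First, both MWAIT sites gain the same erratum workaround: on CPUs flagged X86_FEATURE_CLFLUSH_MONITOR, the monitored cache line must be flushed before MONITOR is armed, or MWAIT can miss the store that should wake the CPU. Condensed into one sketch (kernel context assumed, names as in the hunk):

static void mwait_idle_sketch(unsigned long hints)
{
        if (need_resched())
                return;

        /* Erratum workaround: evict the line before arming MONITOR. */
        if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
                clflush((void *)&current_thread_info()->flags);

        __monitor((void *)&current_thread_info()->flags, 0, 0);
        smp_mb();       /* re-check only after MONITOR is armed */
        if (!need_resched())
                __mwait(hints, 0); /* wakes when the flags word is written */
}

Second, the one-line c1e_idle() change swaps X86_FEATURE_CONSTANT_TSC for X86_FEATURE_NONSTOP_TSC: a constant-rate TSC can still halt in C1E, so only a nonstop TSC may remain a clocksource. The detection around it is a single MSR read, roughly:

/* Sketch of the C1E probe in c1e_idle(); kernel context assumed. */
static int amd_c1e_active_sketch(void)
{
        u32 lo, hi;

        rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
        if (!(lo & K8_INTP_C1E_ACTIVE_MASK))
                return 0;

        /* Only a NONSTOP (not merely constant-rate) TSC keeps counting
         * in C1E, hence the feature-flag change above. */
        if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
                mark_tsc_unstable("TSC halt in AMD C1E");
        return 1;
}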
@@ -294,9 +286,9 @@ void prepare_to_copy(struct task_struct int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, unsigned long unused, - struct task_struct * p, struct pt_regs * regs) + struct task_struct *p, struct pt_regs *regs) { - struct pt_regs * childregs; + struct pt_regs *childregs; struct task_struct *tsk; int err; @@ -340,13 +332,19 @@ int copy_thread(int nr, unsigned long cl kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } + + ds_copy_thread(p, current); + + clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); + p->thread.debugctlmsr = 0; + return err; } void start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) { - __asm__("movl %0, %%gs" :: "r"(0)); + __asm__("movl %0, %%gs" : : "r"(0)); regs->fs = 0; set_fs(USER_DS); regs->ds = __USER_DS; @@ -420,47 +418,18 @@ int set_tsc_mode(unsigned int val) return 0; } -#ifdef CONFIG_X86_DS -static int update_debugctl(struct thread_struct *prev, - struct thread_struct *next, unsigned long debugctl) -{ - unsigned long ds_prev = 0; - unsigned long ds_next = 0; - - if (prev->ds_ctx) - ds_prev = (unsigned long)prev->ds_ctx->ds; - if (next->ds_ctx) - ds_next = (unsigned long)next->ds_ctx->ds; - - if (ds_next != ds_prev) { - /* we clear debugctl to make sure DS - * is not in use when we change it */ - debugctl = 0; - update_debugctlmsr(0); - wrmsr(MSR_IA32_DS_AREA, ds_next, 0); - } - return debugctl; -} -#else -static int update_debugctl(struct thread_struct *prev, - struct thread_struct *next, unsigned long debugctl) -{ - return debugctl; -} -#endif /* CONFIG_X86_DS */ - static noinline void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev, *next; - unsigned long debugctl; prev = &prev_p->thread; next = &next_p->thread; - debugctl = update_debugctl(prev, next, prev->debugctlmsr); - - if (next->debugctlmsr != debugctl) + if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || + test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) + ds_switch_to(prev_p, next_p); + else if (next->debugctlmsr != prev->debugctlmsr) update_debugctlmsr(next->debugctlmsr); if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { @@ -481,14 +450,6 @@ __switch_to_xtra(struct task_struct *pre else hard_enable_TSC(); } - -#ifdef CONFIG_X86_PTRACE_BTS - if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) - ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); - - if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) - ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); -#endif /* CONFIG_X86_PTRACE_BTS */ } /* @@ -518,7 +479,8 @@ __switch_to_xtra(struct task_struct *pre * the task-switch, and shows up in ret_from_fork in entry.S, * for example. 
*/ -struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) +__notrace_funcgraph struct task_struct * +__switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; @@ -698,7 +660,7 @@ asmlinkage int sys_vfork(struct pt_regs asmlinkage int sys_execve(struct pt_regs regs) { int error; - char * filename; + char *filename; filename = getname((char __user *) regs.bx); error = PTR_ERR(filename); --- head-2010-04-29.orig/arch/x86/kernel/process_64-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/process_64-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -42,6 +42,8 @@ #include #include #include +#include +#include #include #include @@ -59,6 +61,7 @@ #include #include #include +#include #include @@ -158,14 +161,18 @@ void __show_regs(struct pt_regs *regs, i unsigned long d0, d1, d2, d3, d6, d7; unsigned int fsindex, gsindex; unsigned int ds, cs, es; + const char *board; printk("\n"); print_modules(); - printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s\n", + board = dmi_get_system_info(DMI_PRODUCT_NAME); + if (!board) + board = ""; + printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n", current->pid, current->comm, print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); + init_utsname()->version, board); printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); printk_address(regs->ip, 1); printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, @@ -256,14 +263,8 @@ void exit_thread(void) #endif t->io_bitmap_max = 0; } -#ifdef CONFIG_X86_DS - /* Free any DS contexts that have not been properly released. */ - if (unlikely(t->ds_ctx)) { - /* we clear debugctl to make sure DS is not used. 
*/ - update_debugctlmsr(0); - ds_free(t->ds_ctx); - } -#endif /* CONFIG_X86_DS */ + + ds_exit_thread(current); } void xen_load_gs_index(unsigned gs) @@ -399,6 +400,11 @@ int copy_thread(int nr, unsigned long cl } p->thread.iopl = current->thread.iopl; + ds_copy_thread(p, me); + + clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); + p->thread.debugctlmsr = 0; + err = 0; out: if (err && p->thread.io_bitmap_ptr) { @@ -495,35 +501,14 @@ static inline void __switch_to_xtra(stru struct task_struct *next_p) { struct thread_struct *prev, *next; - unsigned long debugctl; prev = &prev_p->thread, next = &next_p->thread; - debugctl = prev->debugctlmsr; - -#ifdef CONFIG_X86_DS - { - unsigned long ds_prev = 0, ds_next = 0; - - if (prev->ds_ctx) - ds_prev = (unsigned long)prev->ds_ctx->ds; - if (next->ds_ctx) - ds_next = (unsigned long)next->ds_ctx->ds; - - if (ds_next != ds_prev) { - /* - * We clear debugctl to make sure DS - * is not in use when we change it: - */ - debugctl = 0; - update_debugctlmsr(0); - wrmsrl(MSR_IA32_DS_AREA, ds_next); - } - } -#endif /* CONFIG_X86_DS */ - - if (next->debugctlmsr != debugctl) + if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || + test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) + ds_switch_to(prev_p, next_p); + else if (next->debugctlmsr != prev->debugctlmsr) update_debugctlmsr(next->debugctlmsr); if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { @@ -544,14 +529,6 @@ static inline void __switch_to_xtra(stru else hard_enable_TSC(); } - -#ifdef CONFIG_X86_PTRACE_BTS - if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) - ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); - - if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) - ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); -#endif /* CONFIG_X86_PTRACE_BTS */ } /* @@ -562,8 +539,9 @@ static inline void __switch_to_xtra(stru * - could test fs/gs bitsliced * * Kprobes not supported here. Set the probe on schedule instead. + * Function graph tracer not supported too. */ -struct task_struct * +__notrace_funcgraph struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread; --- head-2010-04-29.orig/arch/x86/kernel/quirks-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/quirks-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -169,6 +169,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_I ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1, ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_4, + ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7, ich_force_enable_hpet); --- head-2010-04-29.orig/arch/x86/kernel/setup-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/setup-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -93,11 +93,13 @@ #include #include #include +#include #include #include #include #include +#include #include #include @@ -508,6 +510,7 @@ static void __init reserve_early_setup_d * @size: Size of the crashkernel memory to reserve. * Returns the base address on success, and -1ULL on failure. 
*/ +static unsigned long long __init find_and_reserve_crashkernel(unsigned long long size) { const unsigned long long alignment = 16<<20; /* 16M */ @@ -650,165 +653,32 @@ static int __init setup_elfcorehdr(char early_param("elfcorehdr", setup_elfcorehdr); #endif -static struct x86_quirks default_x86_quirks __initdata; - -struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; - -/* - * Some BIOSes seem to corrupt the low 64k of memory during events - * like suspend/resume and unplugging an HDMI cable. Reserve all - * remaining free memory in that area and fill it with a distinct - * pattern. - */ -#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION -#define MAX_SCAN_AREAS 8 - -static int __read_mostly memory_corruption_check = -1; - -static unsigned __read_mostly corruption_check_size = 64*1024; -static unsigned __read_mostly corruption_check_period = 60; /* seconds */ - -static struct e820entry scan_areas[MAX_SCAN_AREAS]; -static int num_scan_areas; - - -static int set_corruption_check(char *arg) -{ - char *end; - - memory_corruption_check = simple_strtol(arg, &end, 10); - - return (*end == 0) ? 0 : -EINVAL; -} -early_param("memory_corruption_check", set_corruption_check); - -static int set_corruption_check_period(char *arg) -{ - char *end; - - corruption_check_period = simple_strtoul(arg, &end, 10); - - return (*end == 0) ? 0 : -EINVAL; -} -early_param("memory_corruption_check_period", set_corruption_check_period); - -static int set_corruption_check_size(char *arg) +#ifndef CONFIG_XEN +static int __init default_update_genapic(void) { - char *end; - unsigned size; - - size = memparse(arg, &end); - - if (*end == '\0') - corruption_check_size = size; +#ifdef CONFIG_X86_SMP +# if defined(CONFIG_X86_GENERICARCH) || defined(CONFIG_X86_64) + genapic->wakeup_cpu = wakeup_secondary_cpu_via_init; +# endif +#endif - return (size == corruption_check_size) ? 
0 : -EINVAL; + return 0; } -early_param("memory_corruption_check_size", set_corruption_check_size); - - -static void __init setup_bios_corruption_check(void) -{ - u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */ - - if (memory_corruption_check == -1) { - memory_corruption_check = -#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK - 1 #else - 0 +#define default_update_genapic NULL #endif - ; - } - - if (corruption_check_size == 0) - memory_corruption_check = 0; - - if (!memory_corruption_check) - return; - - corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); - - while(addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { - u64 size; - addr = find_e820_area_size(addr, &size, PAGE_SIZE); - - if (addr == 0) - break; - - if ((addr + size) > corruption_check_size) - size = corruption_check_size - addr; - if (size == 0) - break; - - e820_update_range(addr, size, E820_RAM, E820_RESERVED); - scan_areas[num_scan_areas].addr = addr; - scan_areas[num_scan_areas].size = size; - num_scan_areas++; - - /* Assume we've already mapped this early memory */ - memset(__va(addr), 0, size); - - addr += size; - } - - printk(KERN_INFO "Scanning %d areas for low memory corruption\n", - num_scan_areas); - update_e820(); -} - -static struct timer_list periodic_check_timer; - -void check_for_bios_corruption(void) -{ - int i; - int corruption = 0; - - if (!memory_corruption_check) - return; - - for(i = 0; i < num_scan_areas; i++) { - unsigned long *addr = __va(scan_areas[i].addr); - unsigned long size = scan_areas[i].size; - - for(; size; addr++, size -= sizeof(unsigned long)) { - if (!*addr) - continue; - printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n", - addr, __pa(addr), *addr); - corruption = 1; - *addr = 0; - } - } - - WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n"); -} - -static void periodic_check_for_corruption(unsigned long data) -{ - check_for_bios_corruption(); - mod_timer(&periodic_check_timer, round_jiffies(jiffies + corruption_check_period*HZ)); -} - -void start_periodic_check_for_corruption(void) -{ - if (!memory_corruption_check || corruption_check_period == 0) - return; - - printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n", - corruption_check_period); +static struct x86_quirks default_x86_quirks __initdata = { + .update_genapic = default_update_genapic, +}; - init_timer(&periodic_check_timer); - periodic_check_timer.function = &periodic_check_for_corruption; - periodic_check_for_corruption(0); -} -#endif +struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; +#ifdef CONFIG_X86_RESERVE_LOW_64K static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) { printk(KERN_NOTICE - "%s detected: BIOS may corrupt low RAM, working it around.\n", + "%s detected: BIOS may corrupt low RAM, working around it.\n", d->ident); e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED); @@ -816,6 +686,7 @@ static int __init dmi_low_memory_corrupt return 0; } +#endif /* List of systems that have known low memory corruption BIOS problems */ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { @@ -1023,15 +894,25 @@ void __init setup_arch(char **cmdline_p) finish_e820_parsing(); + if (efi_enabled) + efi_init(); + if (is_initial_xendomain()) { dmi_scan_machine(); dmi_check_system(bad_bios_dmi_table); + } + + /* + * VMware detection requires dmi to be available, so this + * needs to be done after dmi_scan_machine, for the BP. 
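Most of what this hunk deletes is the low-memory corruption scanner, which moved out of setup-xen.c for 2.6.29; what remains is a default ->update_genapic hook in x86_quirks. The construct is the usual ops structure with overridable defaults; a self-contained sketch of the pattern (names here are illustrative, the real struct x86_quirks lives in asm/setup.h):

struct quirks_sketch {
        int (*update_genapic)(void);    /* NULL means "nothing to do" */
};

static int default_update_genapic_sketch(void)
{
        /* e.g. select the INIT/SIPI secondary-CPU wakeup path */
        return 0;
}

static struct quirks_sketch quirks = {
        .update_genapic = default_update_genapic_sketch,
};

static void apply_quirks(void)
{
        if (quirks.update_genapic)  /* platforms may override or clear it */
                quirks.update_genapic();
}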
+ */ + init_hypervisor(&boot_cpu_data); #ifdef CONFIG_X86_32 + if (is_initial_xendomain()) probe_roms(); #endif - } #ifndef CONFIG_XEN /* after parse_early_param, so could debug it */ @@ -1039,8 +920,6 @@ void __init setup_arch(char **cmdline_p) insert_resource(&iomem_resource, &data_resource); insert_resource(&iomem_resource, &bss_resource); - if (efi_enabled) - efi_init(); #ifdef CONFIG_X86_32 if (ppro_with_ram_bug()) { @@ -1295,7 +1174,7 @@ void __init setup_arch(char **cmdline_p) ioapic_init_mappings(); /* need to wait for io_apic is mapped */ - nr_irqs = probe_nr_irqs(); + probe_nr_irqs_gsi(); kvm_guest_init(); --- head-2010-04-29.orig/arch/x86/kernel/smp-xen.c 2010-03-24 15:12:46.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/smp-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -1,7 +1,7 @@ /* * Intel SMP support routines. * - * (c) 1995 Alan Cox, Building #3 + * (c) 1995 Alan Cox, Building #3 * (c) 1998-99, 2000 Ingo Molnar * (c) 2002,2003 Andi Kleen, SuSE Labs. * @@ -118,30 +118,17 @@ void xen_smp_send_reschedule(int cpu) WARN_ON(1); return; } - send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); + send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR); } void xen_send_call_func_single_ipi(int cpu) { - send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNC_SINGLE_VECTOR); + send_IPI_mask(cpumask_of(cpu), CALL_FUNC_SINGLE_VECTOR); } -void xen_send_call_func_ipi(cpumask_t mask) +void xen_send_call_func_ipi(const struct cpumask *mask) { - send_IPI_mask(mask, CALL_FUNCTION_VECTOR); -} - -static void stop_this_cpu(void *dummy) -{ - local_irq_disable(); - /* - * Remove this CPU: - */ - cpu_clear(smp_processor_id(), cpu_online_map); - disable_all_local_evtchn(); - if (hlt_works(smp_processor_id())) - for (;;) halt(); - for (;;); + send_IPI_mask_allbutself(mask, CALL_FUNCTION_VECTOR); } /* @@ -165,11 +152,7 @@ void xen_smp_send_stop(void) */ irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id) { -#ifdef CONFIG_X86_32 - __get_cpu_var(irq_stat).irq_resched_count++; -#else - add_pda(irq_resched_count, 1); -#endif + inc_irq_stat(irq_resched_count); return IRQ_HANDLED; } @@ -177,11 +160,7 @@ irqreturn_t smp_call_function_interrupt( { irq_enter(); generic_smp_call_function_interrupt(); -#ifdef CONFIG_X86_32 - __get_cpu_var(irq_stat).irq_call_count++; -#else - add_pda(irq_call_count, 1); -#endif + inc_irq_stat(irq_call_count); irq_exit(); return IRQ_HANDLED; @@ -191,11 +170,7 @@ irqreturn_t smp_call_function_single_int { irq_enter(); generic_smp_call_function_single_interrupt(); -#ifdef CONFIG_X86_32 - __get_cpu_var(irq_stat).irq_call_count++; -#else - add_pda(irq_call_count, 1); -#endif + inc_irq_stat(irq_call_count); irq_exit(); return IRQ_HANDLED; --- head-2010-04-29.orig/arch/x86/kernel/time-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/time-xen.c 2010-05-11 17:14:09.000000000 +0200 @@ -454,11 +454,7 @@ irqreturn_t timer_interrupt(int irq, voi struct vcpu_runstate_info runstate; /* Keep nmi watchdog up to date */ -#ifdef __i386__ - x86_add_percpu(irq_stat.irq0_irqs, 1); -#else - add_pda(irq0_irqs, 1); -#endif + inc_irq_stat(irq0_irqs); /* * Here we are in the timer irq handler. We just have irqs locally @@ -518,7 +514,6 @@ irqreturn_t timer_interrupt(int irq, voi /* * Account stolen ticks. - * HACK: Passing NULL to account_steal_time() * ensures that the ticks are accounted as stolen. 
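The stolen/blocked-time arithmetic just below deserves a gloss: the hypervisor reports runstate times in nanoseconds, the handler converts each delta to whole ticks with do_div(), hands those to the new 2.6.29 interfaces account_steal_ticks()/account_idle_ticks(), and leaves the sub-tick remainder in the per-CPU processed_* counters for the next interrupt. In sketch form (kernel context assumed):

/* ns -> ticks bookkeeping, as done for RUNSTATE_runnable below. */
static void account_stolen_sketch(u64 runstate_ns, u64 *processed_ns)
{
        u64 stolen = runstate_ns - *processed_ns;

        do_div(stolen, NS_PER_TICK);            /* stolen is now whole ticks */
        *processed_ns += stolen * NS_PER_TICK;  /* remainder carries over */
        account_steal_ticks(stolen);
}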
*/ stolen = runstate.time[RUNSTATE_runnable] @@ -531,12 +526,11 @@ irqreturn_t timer_interrupt(int irq, voi do_div(stolen, NS_PER_TICK); per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK; per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK; - account_steal_time(NULL, (cputime_t)stolen); + account_steal_ticks(stolen); } /* * Account blocked ticks. - * HACK: Passing idle_task to account_steal_time() * ensures that the ticks are accounted as idle/wait. */ blocked = runstate.time[RUNSTATE_blocked] @@ -548,18 +542,23 @@ irqreturn_t timer_interrupt(int irq, voi do_div(blocked, NS_PER_TICK); per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK; per_cpu(processed_system_time, cpu) += blocked * NS_PER_TICK; - account_steal_time(idle_task(cpu), (cputime_t)blocked); + account_idle_ticks(blocked); } /* Account user/system ticks. */ if (delta_cpu > 0) { + cputime_t ct; + do_div(delta_cpu, NS_PER_TICK); per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK; + ct = jiffies_to_cputime(delta_cpu); if (user_mode_vm(get_irq_regs())) - account_user_time(current, (cputime_t)delta_cpu); - else + account_user_time(current, ct, cputime_to_scaled(ct)); + else if (current != idle_task(cpu)) account_system_time(current, HARDIRQ_OFFSET, - (cputime_t)delta_cpu); + ct, cputime_to_scaled(ct)); + else + account_idle_ticks(delta_cpu); } /* Offlined for more than a few seconds? Avoid lockup warnings. */ @@ -788,7 +786,7 @@ static void stop_hz_timer(void) unsigned long j; int rc; - cpu_set(cpu, nohz_cpu_mask); + cpumask_set_cpu(cpu, nohz_cpu_mask); /* See matching smp_mb in rcu_start_batch in rcupdate.c. These mbs */ /* ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a */ @@ -804,7 +802,7 @@ static void stop_hz_timer(void) local_softirq_pending() || (j = get_next_timer_interrupt(jiffies), time_before_eq(j, jiffies))) { - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); j = jiffies + 1; } @@ -835,7 +833,7 @@ static void start_hz_timer(void) } #endif BUG_ON(rc); - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); } void xen_safe_halt(void) --- head-2010-04-29.orig/arch/x86/kernel/traps-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/traps-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -51,7 +50,6 @@ #include #include #include -#include #include #include #include @@ -65,18 +63,10 @@ #else #include #include -#include -#include -#include #include #include "cpu/mcheck/mce.h" -#ifndef CONFIG_XEN -DECLARE_BITMAP(used_vectors, NR_VECTORS); -EXPORT_SYMBOL_GPL(used_vectors); -#endif - asmlinkage int system_call(void); /* Do we ignore FPU interrupts ? */ @@ -93,6 +83,11 @@ gate_desc idt_table[256] #endif #endif +#ifndef CONFIG_XEN +DECLARE_BITMAP(used_vectors, NR_VECTORS); +EXPORT_SYMBOL_GPL(used_vectors); +#endif + static int ignore_nmis; static inline void conditional_sti(struct pt_regs *regs) @@ -108,6 +103,12 @@ static inline void preempt_conditional_s local_irq_enable(); } +static inline void conditional_cli(struct pt_regs *regs) +{ + if (regs->flags & X86_EFLAGS_IF) + local_irq_disable(); +} + static inline void preempt_conditional_cli(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) @@ -298,8 +299,10 @@ dotraplinkage void do_double_fault(struc tsk->thread.error_code = error_code; tsk->thread.trap_no = 8; - /* This is always a kernel trap and never fixable (and thus must - never return). 
*/ + /* + * This is always a kernel trap and never fixable (and thus must + * never return). + */ for (;;) die(str, regs, error_code); } @@ -476,11 +479,7 @@ do_nmi(struct pt_regs *regs, long error_ { nmi_enter(); -#ifdef CONFIG_X86_32 - { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); } -#else - add_pda(__nmi_count, 1); -#endif + inc_irq_stat(__nmi_count); if (!ignore_nmis) default_do_nmi(regs); @@ -519,9 +518,11 @@ dotraplinkage void __kprobes do_int3(str } #if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) -/* Help handler running on IST stack to switch back to user stack - for scheduling or signal handling. The actual stack switch is done in - entry.S */ +/* + * Help handler running on IST stack to switch back to user stack + * for scheduling or signal handling. The actual stack switch is done in + * entry.S + */ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) { struct pt_regs *regs = eregs; @@ -531,8 +532,10 @@ asmlinkage __kprobes struct pt_regs *syn /* Exception from user space */ else if (user_mode(eregs)) regs = task_pt_regs(current); - /* Exception from kernel and interrupts are enabled. Move to - kernel process stack. */ + /* + * Exception from kernel and interrupts are enabled. Move to + * kernel process stack. + */ else if (eregs->flags & X86_EFLAGS_IF) regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); if (eregs != regs) @@ -624,8 +627,10 @@ clear_dr7: #ifdef CONFIG_X86_32 debug_vm86: + /* reenable preemption: handle_vm86_trap() might sleep */ + dec_preempt_count(); handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); - preempt_conditional_cli(regs); + conditional_cli(regs); return; #endif @@ -659,7 +664,7 @@ void math_error(void __user *ip) { struct task_struct *task; siginfo_t info; - unsigned short cwd, swd; + unsigned short cwd, swd, err; /* * Save the info for the exception handler and clear the error. @@ -670,7 +675,6 @@ void math_error(void __user *ip) task->thread.error_code = 0; info.si_signo = SIGFPE; info.si_errno = 0; - info.si_code = __SI_FAULT; info.si_addr = ip; /* * (~cwd & swd) will mask out exceptions that are not set to unmasked @@ -684,34 +688,30 @@ void math_error(void __user *ip) */ cwd = get_fpu_cwd(task); swd = get_fpu_swd(task); - switch (swd & ~cwd & 0x3f) { - case 0x000: /* No unmasked exception */ -#ifdef CONFIG_X86_32 - return; -#endif - default: /* Multiple exceptions */ - break; - case 0x001: /* Invalid Op */ + + err = swd & ~cwd; + + if (err & 0x001) { /* Invalid op */ /* * swd & 0x240 == 0x040: Stack Underflow * swd & 0x240 == 0x240: Stack Overflow * User must clear the SF bit (0x40) if set */ info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ + } else if (err & 0x004) { /* Divide by Zero */ info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ + } else if (err & 0x008) { /* Overflow */ info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ + } else if (err & 0x012) { /* Denormal, Underflow */ + info.si_code = FPE_FLTUND; + } else if (err & 0x020) { /* Precision */ info.si_code = FPE_FLTRES; - break; + } else { + /* + * If we're using IRQ 13, or supposedly even some trap 16 + * implementations, it's possible we get a spurious trap... 
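The rewritten classifier in math_error() above is easier to audit once the masking rule is explicit: only x87 exceptions that are raised in the status word *and* unmasked in the control word get a si_code, tested in priority order, and anything else is treated as spurious. A runnable decoder of the same logic:

#include <stdio.h>

/* Mirrors the swd & ~cwd priority decode in math_error() above. */
static const char *fpu_si_code(unsigned short cwd, unsigned short swd)
{
        unsigned short err = swd & ~cwd;        /* raised *and* unmasked */

        if (err & 0x001) return "FPE_FLTINV";   /* invalid operation */
        if (err & 0x004) return "FPE_FLTDIV";   /* divide by zero */
        if (err & 0x008) return "FPE_FLTOVF";   /* overflow */
        if (err & 0x012) return "FPE_FLTUND";   /* denormal/underflow */
        if (err & 0x020) return "FPE_FLTRES";   /* precision */
        return "spurious";                      /* nothing unmasked pending */
}

int main(void)
{
        /* CWD 0x0338 leaves the zero-divide bit (0x004) unmasked: */
        printf("%s\n", fpu_si_code(0x0338, 0x0004));    /* FPE_FLTDIV */
        return 0;
}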
+ */ + return; /* Spurious trap, no error */ } force_sig_info(SIGFPE, &info, task); } @@ -901,7 +901,7 @@ asmlinkage void math_state_restore(void) EXPORT_SYMBOL_GPL(math_state_restore); #ifndef CONFIG_MATH_EMULATION -asmlinkage void math_emulate(long arg) +void math_emulate(struct math_emu_info *info) { printk(KERN_EMERG "math-emulation not enabled and no coprocessor found.\n"); @@ -911,16 +911,19 @@ asmlinkage void math_emulate(long arg) } #endif /* CONFIG_MATH_EMULATION */ -dotraplinkage void __kprobes -do_device_not_available(struct pt_regs *regs, long error) +dotraplinkage void __kprobes do_device_not_available(struct pt_regs regs) { #if defined(CONFIG_X86_32) && !defined(CONFIG_XEN) if (read_cr0() & X86_CR0_EM) { - conditional_sti(regs); - math_emulate(0); + struct math_emu_info info = { }; + + conditional_sti(&regs); + + info.regs = &regs; + math_emulate(&info); } else { math_state_restore(); /* interrupts still off */ - conditional_sti(regs); + conditional_sti(&regs); } #else math_state_restore(); --- head-2010-04-29.orig/arch/x86/kernel/vsyscall_64-xen.c 2010-03-24 15:12:46.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/vsyscall_64-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -17,6 +17,9 @@ * want per guest time just set the kernel.vsyscall64 sysctl to 0. */ +/* Disable profiling for userspace code: */ +#define DISABLE_BRANCH_PROFILING + #include #include #include @@ -128,7 +131,16 @@ static __always_inline void do_vgettimeo gettimeofday(tv,NULL); return; } + + /* + * Surround the RDTSC by barriers, to make sure it's not + * speculated to outside the seqlock critical section and + * does not cause time warps: + */ + rdtsc_barrier(); now = vread(); + rdtsc_barrier(); + base = __vsyscall_gtod_data.clock.cycle_last; mask = __vsyscall_gtod_data.clock.mask; mult = __vsyscall_gtod_data.clock.mult; --- head-2010-04-29.orig/arch/x86/mm/fault-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/mm/fault-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -53,7 +53,7 @@ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { -#ifdef CONFIG_MMIOTRACE_HOOKS +#ifdef CONFIG_MMIOTRACE if (unlikely(is_kmmio_active())) if (kmmio_handler(regs, addr) == 1) return -1; @@ -406,7 +406,7 @@ static void show_fault_oops(struct pt_re if (pte && pte_present(*pte) && !pte_exec(*pte)) printk(KERN_CRIT "kernel tried to execute " "NX-protected page - exploit attempt? " - "(uid: %d)\n", current->uid); + "(uid: %d)\n", current_uid()); } #endif @@ -426,6 +426,7 @@ static noinline void pgtable_bad(unsigne unsigned long error_code) { unsigned long flags = oops_begin(); + int sig = SIGKILL; struct task_struct *tsk; printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", @@ -436,8 +437,8 @@ static noinline void pgtable_bad(unsigne tsk->thread.trap_no = 14; tsk->thread.error_code = error_code; if (__die("Bad pagetable", regs, error_code)) - regs = NULL; - oops_end(flags, regs, SIGKILL); + sig = 0; + oops_end(flags, regs, sig); } #endif @@ -546,10 +547,7 @@ static int vmalloc_fault(unsigned long a happen within a race in page table update. In the later case just flush.
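The vmalloc_fault() hunk here replaces the cr3-based pgd lookup, which is unreliable under Xen because cr3 holds a machine frame, with a plain pgd_offset() on current->active_mm. The surrounding logic is the standard lazy vmalloc PGD sync; roughly (kernel context assumed):

static int vmalloc_fault_sketch(unsigned long address)
{
        pgd_t *pgd = pgd_offset(current->active_mm, address);
        pgd_t *pgd_ref = pgd_offset_k(address); /* init_mm's reference copy */

        if (pgd_none(*pgd_ref))
                return -1;              /* not a vmalloc address after all */
        if (pgd_none(*pgd))
                set_pgd(pgd, *pgd_ref); /* propagate the entry lazily */
        else
                BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
        return 0;
}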
*/ - /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/ - pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK); - pgd += pgd_index(address); + pgd = pgd_offset(current->active_mm, address); pgd_ref = pgd_offset_k(address); if (pgd_none(*pgd_ref)) return -1; @@ -606,6 +604,7 @@ void __kprobes do_page_fault(struct pt_r int fault; #ifdef CONFIG_X86_64 unsigned long flags; + int sig; #endif /* Set the "privileged fault" bit to something sane. */ @@ -623,8 +622,6 @@ void __kprobes do_page_fault(struct pt_r si_code = SEGV_MAPERR; - if (notify_page_fault(regs)) - return; if (unlikely(kmmio_fault(regs, address))) return; @@ -663,6 +660,9 @@ void __kprobes do_page_fault(struct pt_r if (spurious_fault(address, error_code)) return; + /* kprobes don't want to hook the spurious faults. */ + if (notify_page_fault(regs)) + return; /* * Don't take the mm semaphore here. If we fixup a prefetch * fault we could otherwise deadlock. @@ -670,6 +670,9 @@ void __kprobes do_page_fault(struct pt_r goto bad_area_nosemaphore; } + /* kprobes don't want to hook the spurious faults. */ + if (notify_page_fault(regs)) + return; /* * It's safe to allow irq's after cr2 has been saved and the @@ -696,7 +699,6 @@ void __kprobes do_page_fault(struct pt_r if (unlikely(in_atomic() || !mm)) goto bad_area_nosemaphore; -again: /* * When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the @@ -880,32 +882,22 @@ no_context: bust_spinlocks(0); do_exit(SIGKILL); #else + sig = SIGKILL; if (__die("Oops", regs, error_code)) - regs = NULL; + sig = 0; /* Executive summary in case the body of the oops scrolled away */ printk(KERN_EMERG "CR2: %016lx\n", address); - oops_end(flags, regs, SIGKILL); + oops_end(flags, regs, sig); #endif -/* - * We ran out of memory, or some other thing happened to us that made - * us unable to handle the page fault gracefully. - */ out_of_memory: + /* + * We ran out of memory, call the OOM killer, and return the userspace + * (which will retry the fault, or kill us if we got oom-killed). 
+ */ up_read(&mm->mmap_sem); - if (is_global_init(tsk)) { - yield(); - /* - * Re-lookup the vma - in theory the vma tree might - * have changed: - */ - goto again; - } - - printk("VM: killing process %s\n", tsk->comm); - if (error_code & PF_USER) - do_group_exit(SIGKILL); - goto no_context; + pagefault_out_of_memory(); + return; do_sigbus: up_read(&mm->mmap_sem); --- head-2010-04-29.orig/arch/x86/mm/hypervisor.c 2010-03-24 15:12:46.000000000 +0100 +++ head-2010-04-29/arch/x86/mm/hypervisor.c 2010-03-24 15:17:58.000000000 +0100 @@ -79,12 +79,12 @@ static void multicall_failed(const multi BUG(); } -int xen_multicall_flush(bool ret_last) { +static int _xen_multicall_flush(bool ret_last) { struct lazy_mmu *lazy = &__get_cpu_var(lazy_mmu); multicall_entry_t *mc = lazy->mc; unsigned int count = lazy->nr_mc; - if (!count || !use_lazy_mmu_mode()) + if (!count) return 0; lazy->nr_mc = 0; @@ -112,6 +112,11 @@ int xen_multicall_flush(bool ret_last) { return 0; } + +void xen_multicall_flush(bool force) { + if (force || use_lazy_mmu_mode()) + _xen_multicall_flush(false); +} EXPORT_SYMBOL(xen_multicall_flush); int xen_multi_update_va_mapping(unsigned long va, pte_t pte, @@ -130,7 +135,7 @@ int xen_multi_update_va_mapping(unsigned #endif if (unlikely(lazy->nr_mc == NR_MC)) - xen_multicall_flush(false); + _xen_multicall_flush(false); mc = lazy->mc + lazy->nr_mc++; mc->op = __HYPERVISOR_update_va_mapping; @@ -169,7 +174,7 @@ int xen_multi_mmu_update(mmu_update_t *s merge = lazy->nr_mc && !commit && mmu_may_merge(mc - 1, __HYPERVISOR_mmu_update, domid); if (unlikely(lazy->nr_mc == NR_MC) && !merge) { - xen_multicall_flush(false); + _xen_multicall_flush(false); mc = lazy->mc; commit = count > NR_MMU || success_count; } @@ -207,7 +212,7 @@ int xen_multi_mmu_update(mmu_update_t *s break; } - return commit ? xen_multicall_flush(true) : 0; + return commit ? _xen_multicall_flush(true) : 0; } int xen_multi_mmuext_op(struct mmuext_op *src, unsigned int count, @@ -291,7 +296,7 @@ int xen_multi_mmuext_op(struct mmuext_op merge = lazy->nr_mc && !commit && mmu_may_merge(mc - 1, __HYPERVISOR_mmuext_op, domid); if (unlikely(lazy->nr_mc == NR_MC) && !merge) { - xen_multicall_flush(false); + _xen_multicall_flush(false); mc = lazy->mc; commit = count > NR_MMUEXT || success_count; } @@ -338,7 +343,7 @@ int xen_multi_mmuext_op(struct mmuext_op break; } - return commit ? xen_multicall_flush(true) : 0; + return commit ? 
_xen_multicall_flush(true) : 0; } void xen_l1_entry_update(pte_t *ptr, pte_t val) --- head-2010-04-29.orig/arch/x86/mm/init_32-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/mm/init_32-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -71,7 +71,7 @@ static unsigned long __initdata table_to static int __initdata after_init_bootmem; -static __init void *alloc_low_page(unsigned long *phys) +static __init void *alloc_low_page(void) { unsigned long pfn = table_end++; void *adr; @@ -81,7 +81,6 @@ static __init void *alloc_low_page(unsig adr = __va(pfn * PAGE_SIZE); memset(adr, 0, PAGE_SIZE); - *phys = pfn * PAGE_SIZE; return adr; } @@ -96,17 +95,18 @@ static pmd_t * __init one_md_table_init( pmd_t *pmd_table; #ifdef CONFIG_X86_PAE - unsigned long phys; if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) { if (after_init_bootmem) pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); else - pmd_table = (pmd_t *)alloc_low_page(&phys); + pmd_table = (pmd_t *)alloc_low_page(); paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); pud = pud_offset(pgd, 0); BUG_ON(pmd_table != pmd_offset(pud, 0)); + + return pmd_table; } #endif pud = pud_offset(pgd, 0); @@ -135,10 +135,8 @@ static pte_t * __init one_page_table_ini if (!page_table) page_table = (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); - } else { - unsigned long phys; - page_table = (pte_t *)alloc_low_page(&phys); - } + } else + page_table = (pte_t *)alloc_low_page(); paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); make_lowmem_page_readonly(page_table, @@ -150,6 +148,51 @@ static pte_t * __init one_page_table_ini return pte_offset_kernel(pmd, 0); } +static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, + unsigned long vaddr, pte_t *lastpte) +{ +#ifdef CONFIG_HIGHMEM + /* + * Something (early fixmap) may already have put a pte + * page here, which causes the page table allocation + * to become nonlinear. Attempt to fix it, and if it + * is still nonlinear then we have to bug. 
+ */ + int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT; + int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT; + + if (pmd_idx_kmap_begin != pmd_idx_kmap_end + && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin + && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end + && ((__pa(pte) >> PAGE_SHIFT) < table_start + || (__pa(pte) >> PAGE_SHIFT) >= table_end)) { + pte_t *newpte; + int i; + + BUG_ON(after_init_bootmem); + newpte = alloc_low_page(); + for (i = 0; i < PTRS_PER_PTE; i++) + set_pte(newpte + i, pte[i]); + + paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT); + make_lowmem_page_readonly(newpte, + XENFEAT_writable_page_tables); + set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE)); + BUG_ON(newpte != pte_offset_kernel(pmd, 0)); + __flush_tlb_all(); + + paravirt_release_pte(__pa(pte) >> PAGE_SHIFT); + make_lowmem_page_writable(pte, + XENFEAT_writable_page_tables); + pte = newpte; + } + BUG_ON(vaddr < fix_to_virt(FIX_KMAP_BEGIN - 1) + && vaddr > fix_to_virt(FIX_KMAP_END) + && lastpte && lastpte + PTRS_PER_PTE != pte); +#endif + return pte; +} + /* * This function initializes a certain range of kernel virtual memory * with new bootmem page tables, everywhere page tables are missing in @@ -166,6 +209,7 @@ page_table_range_init(unsigned long star unsigned long vaddr; pgd_t *pgd; pmd_t *pmd; + pte_t *pte = NULL; vaddr = start; pgd_idx = pgd_index(vaddr); @@ -177,8 +221,10 @@ page_table_range_init(unsigned long star pmd = pmd + pmd_index(vaddr); for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { - if (vaddr < hypervisor_virt_start) - one_page_table_init(pmd); + if (vaddr >= hypervisor_virt_start) + break; + pte = page_table_kmap_check(one_page_table_init(pmd), + pmd, vaddr, pte); vaddr += PMD_SIZE; } @@ -361,6 +407,8 @@ int devmem_is_allowed(unsigned long page { if (pagenr <= 256) return 1; + if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) + return 0; if (mfn_to_local_pfn(pagenr) >= max_pfn) return 1; return 0; @@ -476,8 +524,12 @@ static void __init set_highmem_pages_ini #endif /* !CONFIG_NUMA */ #else -# define permanent_kmaps_init(pgd_base) do { } while (0) -# define set_highmem_pages_init() do { } while (0) +static inline void permanent_kmaps_init(pgd_t *pgd_base) +{ +} +static inline void set_highmem_pages_init(void) +{ +} #endif /* CONFIG_HIGHMEM */ pgd_t *swapper_pg_dir; @@ -509,7 +561,6 @@ static void __init early_ioremap_page_ta * Fixed mappings, only the page table structure has to be * created - mappings will be set by set_fixmap(): */ - early_ioremap_clear(); vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; page_table_range_init(vaddr, end, pgd_base); @@ -856,10 +907,7 @@ static void __init find_early_table_spac tables += PAGE_ALIGN(ptes * sizeof(pte_t)); /* for fixmap */ - tables += PAGE_SIZE - * ((((FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK) - - (__fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK)) - >> PMD_SHIFT); + tables += PAGE_ALIGN(__end_of_fixed_addresses * sizeof(pte_t)); table_start = extend_init_mapping(tables); @@ -1023,8 +1071,6 @@ void __init mem_init(void) pci_iommu_alloc(); - start_periodic_check_for_corruption(); - #ifdef CONFIG_FLATMEM BUG_ON(!mem_map); #endif @@ -1099,11 +1145,25 @@ void __init mem_init(void) (unsigned long)&_text, (unsigned long)&_etext, ((unsigned long)&_etext - (unsigned long)&_text) >> 10); + /* + * Check boundaries twice: Some fundamental inconsistencies can + * be detected at build time already. 
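The doubled-up checks added to mem_init() above rely on BUILD_BUG_ON() failing at compile time whenever its condition constant-folds, which is why __FIXADDR_TOP and high_memory are temporarily #defined to constants around them while the runtime BUG_ON()s keep covering the variable values. BUILD_BUG_ON itself is the negative-array-size trick:

/* Compile-time assert, as in linux/kernel.h. */
#define BUILD_BUG_ON_SKETCH(cond) ((void)sizeof(char[1 - 2 * !!(cond)]))

static inline void layout_checks_sketch(void)
{
        /* Compiles only when the constant-folded condition is false. */
        BUILD_BUG_ON_SKETCH(sizeof(long) < 4);
}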
+ */ +#define __FIXADDR_TOP (-PAGE_SIZE) +#ifdef CONFIG_HIGHMEM + BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); + BUILD_BUG_ON(VMALLOC_END > PKMAP_BASE); +#endif +#define high_memory (-128UL << 20) + BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END); +#undef high_memory +#undef __FIXADDR_TOP + #ifdef CONFIG_HIGHMEM BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); BUG_ON(VMALLOC_END > PKMAP_BASE); #endif - BUG_ON(VMALLOC_START > VMALLOC_END); + BUG_ON(VMALLOC_START >= VMALLOC_END); BUG_ON((unsigned long)high_memory > VMALLOC_START); if (boot_cpu_data.wp_works_ok < 0) @@ -1123,7 +1183,7 @@ int arch_add_memory(int nid, u64 start, unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - return __add_pages(zone, start_pfn, nr_pages); + return __add_pages(nid, zone, start_pfn, nr_pages); } #endif --- head-2010-04-29.orig/arch/x86/mm/init_64-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/mm/init_64-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -841,7 +841,7 @@ static void __init init_gbpages(void) #endif } -static unsigned long __init kernel_physical_mapping_init(unsigned long start, +static unsigned long __meminit kernel_physical_mapping_init(unsigned long start, unsigned long end, unsigned long page_size_mask) { @@ -966,6 +966,8 @@ unsigned long __init_refok init_memory_m pos = start_pfn << PAGE_SHIFT; end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); + if (end_pfn > (end >> PAGE_SHIFT)) + end_pfn = end >> PAGE_SHIFT; if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); pos = end_pfn << PAGE_SHIFT; @@ -1146,7 +1148,7 @@ int arch_add_memory(int nid, u64 start, if (last_mapped_pfn > max_pfn_mapped) max_pfn_mapped = last_mapped_pfn; - ret = __add_pages(zone, start_pfn, nr_pages); + ret = __add_pages(nid, zone, start_pfn, nr_pages); WARN_ON_ONCE(ret); return ret; @@ -1177,6 +1179,8 @@ int devmem_is_allowed(unsigned long page { if (pagenr <= 256) return 1; + if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) + return 0; if (mfn_to_local_pfn(pagenr) >= max_pfn) return 1; return 0; @@ -1192,8 +1196,6 @@ void __init mem_init(void) unsigned long absent_pages; unsigned long pfn; - start_periodic_check_for_corruption(); - pci_iommu_alloc(); /* clear_bss() already clear the empty_zero_page */ --- head-2010-04-29.orig/arch/x86/mm/iomap_32-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/mm/iomap_32-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -17,9 +17,21 @@ */ #include +#include #include #include +int is_io_mapping_possible(resource_size_t base, unsigned long size) +{ +#ifndef CONFIG_X86_PAE + /* There is no way to map greater than 1 << 32 address without PAE */ + if (base + size > 0x100000000ULL) + return 0; +#endif + return 1; +} +EXPORT_SYMBOL_GPL(is_io_mapping_possible); + /* Map 'mfn' using fixed map 'type' and protections 'prot' */ void * @@ -30,6 +42,15 @@ iomap_atomic_prot_pfn(unsigned long mfn, pagefault_disable(); + /* + * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS. + * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the + * MTRR is UC or WC. UC_MINUS gets the real intention, of the + * user, which is "WC if the MTRR is WC, UC if you can't do that." 
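The comment block above encodes the PAT fallback rule used by iomap_atomic_prot_pfn(): with PAT disabled, PAGE_KERNEL_WC degenerates to a PWT-only mapping, which ends up uncached whether the MTRR says UC or WC, so demoting to UC_MINUS preserves the caller's "WC if possible" intent. The whole fixup is one comparison (kernel context assumed):

/* Sketch of the protection fixup in iomap_atomic_prot_pfn(). */
static pgprot_t fixup_wc_prot_sketch(pgprot_t prot)
{
        if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC))
                return PAGE_KERNEL_UC_MINUS; /* WC if the MTRR is WC, else UC */
        return prot;
}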
+ */ + if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC)) + prot = PAGE_KERNEL_UC_MINUS; + idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); pgprot_val(prot) |= _PAGE_IOMAP; --- head-2010-04-29.orig/arch/x86/mm/ioremap-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/mm/ioremap-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -293,25 +293,6 @@ int page_is_ram(unsigned long pagenr) return 0; } -int pagerange_is_ram(unsigned long start, unsigned long end) -{ - int ram_page = 0, not_rampage = 0; - unsigned long page_nr; - - for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT); - ++page_nr) { - if (page_is_ram(mfn_to_local_pfn(page_nr))) - ram_page = 1; - else - not_rampage = 1; - - if (ram_page == not_rampage) - return -1; - } - - return ram_page; -} - /* * Fix up the linear direct mapping of the kernel to avoid cache attribute * conflicts. @@ -402,7 +383,8 @@ static void __iomem *__ioremap_caller(re * Check if the request spans more than any BAR in the iomem resource * tree. */ - WARN_ON(iomem_map_sanity_check(phys_addr, size)); + WARN_ONCE(iomem_map_sanity_check(phys_addr, size), + KERN_INFO "Info: mapping multiple BARs. Your kernel is fine."); /* * Don't allow anybody to remap normal RAM that we're using.. @@ -746,38 +728,10 @@ void __init early_ioremap_init(void) } } -#ifdef CONFIG_X86_32 -void __init early_ioremap_clear(void) -{ - pmd_t *pmd; - - if (early_ioremap_debug) - printk(KERN_INFO "early_ioremap_clear()\n"); - - pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); - pmd_clear(pmd); - make_lowmem_page_writable(bm_pte, XENFEAT_writable_page_tables); - /* paravirt_release_pte(__pa(bm_pte) >> PAGE_SHIFT); */ - __flush_tlb_all(); -} - void __init early_ioremap_reset(void) { - enum fixed_addresses idx; - unsigned long addr, phys; - pte_t *pte; - after_paging_init = 1; - for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) { - addr = fix_to_virt(idx); - pte = early_ioremap_pte(addr); - if (pte_present(*pte)) { - phys = __pte_val(*pte) & PAGE_MASK; - set_fixmap(idx, phys); - } - } } -#endif /* CONFIG_X86_32 */ static void __init __early_set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags) --- head-2010-04-29.orig/arch/x86/mm/pageattr-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/mm/pageattr-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -524,22 +524,28 @@ static int split_large_page(pte_t *kpte, set_pte(&pbase[i], pfn_pte_ma(mfn, ref_prot)); /* - * Install the new, split up pagetable. Important details here: + * Install the new, split up pagetable. * - * On Intel the NX bit of all levels must be cleared to make a - * page executable. See section 4.13.2 of Intel 64 and IA-32 - * Architectures Software Developer's Manual). - * - * Mark the entry present. The current mapping might be - * set to not present, which we preserved above. + * We use the standard kernel pagetable protections for the new + * pagetable protections, the actual ptes set above control the + * primary protection behavior: */ if (!xen_feature(XENFEAT_writable_page_tables) && HYPERVISOR_update_va_mapping((unsigned long)pbase, mk_pte(base, PAGE_KERNEL_RO), 0)) BUG(); - ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte))); - pgprot_val(ref_prot) |= _PAGE_PRESENT; - __set_pmd_pte(kpte, address, level, mk_pte(base, ref_prot)); + __set_pmd_pte(kpte, address, level, mk_pte(base, __pgprot(_KERNPG_TABLE))); + + /* + * Intel Atom errata AAH41 workaround. 
+ * + * The real fix should be in hw or in a microcode update, but + * we also probabilistically try to reduce the window of having + * a large TLB mixed with 4K TLBs while instruction fetches are + * going on. + */ + __flush_tlb_all(); + base = NULL; out_unlock: @@ -554,6 +560,36 @@ out_unlock: return 0; } +static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, + int primary) +{ + /* + * Ignore all non primary paths. + */ + if (!primary) + return 0; + + /* + * Ignore the NULL PTE for kernel identity mapping, as it is expected + * to have holes. + * Also set numpages to '1' indicating that we processed cpa req for + * one virtual address page and its pfn. TBD: numpages can be set based + * on the initial value and the level returned by lookup_address(). + */ + if (within(vaddr, PAGE_OFFSET, + PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) { + cpa->numpages = 1; + cpa->pfn = __pa(vaddr) >> PAGE_SHIFT; + return 0; + } else { + WARN(1, KERN_WARNING "CPA: called for zero pte. " + "vaddr = %lx cpa->vaddr = %lx\n", vaddr, + *cpa->vaddr); + + return -EFAULT; + } +} + static int __change_page_attr(struct cpa_data *cpa, int primary) { unsigned long address; @@ -565,21 +601,14 @@ static int __change_page_attr(struct cpa address = cpa->vaddr[cpa->curpage]; else address = *cpa->vaddr; - repeat: kpte = lookup_address(address, &level); if (!kpte) - return 0; + return __cpa_process_fault(cpa, address, primary); old_pte = *kpte; - if (!__pte_val(old_pte)) { - if (!primary) - return 0; - WARN(1, KERN_WARNING "CPA: called for zero pte. " - "vaddr = %lx cpa->vaddr = %lx\n", address, - *cpa->vaddr); - return -EINVAL; - } + if (!__pte_val(old_pte)) + return __cpa_process_fault(cpa, address, primary); if (level == PG_LEVEL_4K) { pte_t new_pte; @@ -678,12 +707,7 @@ static int cpa_process_alias(struct cpa_ vaddr = *cpa->vaddr; if (!(within(vaddr, PAGE_OFFSET, - PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT)) -#ifdef CONFIG_X86_64 - || within(vaddr, PAGE_OFFSET + (1UL<<32), - PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)) -#endif - )) { + PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) { alias_cpa = *cpa; temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); @@ -814,6 +838,15 @@ static int change_page_attr_set_clr(unsi vm_unmap_aliases(); + /* + * If we're called with lazy mmu updates enabled, the + * in-memory pte state may be stale. Flush pending updates to + * bring them up to date. + * + arch_flush_lazy_mmu_mode();*/ + if (arch_use_lazy_mmu_mode()) + xen_multicall_flush(true); + cpa.vaddr = addr; cpa.numpages = numpages; cpa.mask_set = mask_set; @@ -856,6 +889,14 @@ static int change_page_attr_set_clr(unsi } else cpa_flush_all(cache); + /* + * If we've been called with lazy mmu updates enabled, then + * make sure that everything gets flushed out before we + * return. 
+ * + arch_flush_lazy_mmu_mode();*/ + WARN_ON_ONCE(arch_use_lazy_mmu_mode() && !irq_count()); + out: return ret; } --- head-2010-04-29.orig/arch/x86/mm/pat-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/mm/pat-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -167,11 +168,12 @@ struct memtype { static LIST_HEAD(memtype_list); static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ +static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end); static inline u8 _mtrr_type_lookup(u64 start, u64 end) { if (is_initial_xendomain()) return mtrr_type_lookup(start, end); - return pagerange_is_ram(start, end) > 0 + return pat_pagerange_is_ram(start, end) > 0 ? MTRR_TYPE_WRCOMB : MTRR_TYPE_UNCACHABLE; } #define mtrr_type_lookup _mtrr_type_lookup @@ -232,6 +234,33 @@ chk_conflict(struct memtype *new, struct static struct memtype *cached_entry; static u64 cached_start; +static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end) +{ + int ram_page = 0, not_rampage = 0; + unsigned long page_nr; + + for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT); + ++page_nr) { + /* + * For legacy reasons, physical address range in the legacy ISA + * region is tracked as non-RAM. This will allow users of + * /dev/mem to map portions of legacy ISA region, even when + * some of those portions are listed(or not even listed) with + * different e820 types(RAM/reserved/..) + */ + if (page_nr >= (ISA_END_ADDRESS >> PAGE_SHIFT) && + page_is_ram(mfn_to_local_pfn(page_nr))) + ram_page = 1; + else + not_rampage = 1; + + if (ram_page == not_rampage) + return -1; + } + + return ram_page; +} + /* * For RAM pages, mark the pages as non WB memory type using * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or @@ -360,9 +389,13 @@ int reserve_memtype(u64 start, u64 end, req_type & _PAGE_CACHE_MASK); } - is_range_ram = pagerange_is_ram(start, end); + if (new_type) + *new_type = actual_type; + + is_range_ram = pat_pagerange_is_ram(start, end); if (is_range_ram == 1) - return reserve_ram_pages_type(start, end, req_type, new_type); + return reserve_ram_pages_type(start, end, req_type, + new_type); else if (is_range_ram < 0) return -EINVAL; @@ -374,9 +407,6 @@ int reserve_memtype(u64 start, u64 end, new->end = end; new->type = actual_type; - if (new_type) - *new_type = actual_type; - spin_lock(&memtype_lock); if (cached_entry && start >= cached_start) @@ -464,7 +494,7 @@ int free_memtype(u64 start, u64 end) if (is_ISA_range(start, end - 1)) return 0; - is_range_ram = pagerange_is_ram(start, end); + is_range_ram = pat_pagerange_is_ram(start, end); if (is_range_ram == 1) return free_ram_pages_type(start, end); else if (is_range_ram < 0) @@ -623,6 +653,254 @@ void unmap_devmem(unsigned long mfn, uns free_memtype(addr, addr + size); } +#ifndef CONFIG_XEN +/* + * Internal interface to reserve a range of physical memory with prot. + * Reserved non RAM regions only and after successful reserve_memtype, + * this func also keeps identity mapping (if any) in sync with this new prot. + */ +static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, + int strict_prot) +{ + int is_ram = 0; + int id_sz, ret; + unsigned long flags; + unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK); + + is_ram = pat_pagerange_is_ram(paddr, paddr + size); + + /* + * reserve_pfn_range() doesn't support RAM pages. 
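pat_pagerange_is_ram(), added above, is deliberately tri-state: 1 for an all-RAM range, 0 for no RAM at all (with the legacy ISA region always counted as non-RAM), and -1 for a mixed range, which the callers reject outright. Stripped of the Xen pfn translation, the control flow is:

/* Tri-state RAM scan, as in pat_pagerange_is_ram() above. */
static int range_is_ram_sketch(unsigned long start_pfn, unsigned long end_pfn)
{
        int ram = 0, not_ram = 0;
        unsigned long pfn;

        for (pfn = start_pfn; pfn < end_pfn; pfn++) {
                if (page_is_ram(pfn))
                        ram = 1;
                else
                        not_ram = 1;
                if (ram && not_ram)
                        return -1;      /* mixed range: caller must bail */
        }
        return ram;                     /* 1 = all RAM, 0 = none */
}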
+ */ + if (is_ram != 0) + return -EINVAL; + + ret = reserve_memtype(paddr, paddr + size, want_flags, &flags); + if (ret) + return ret; + + if (flags != want_flags) { + if (strict_prot || !is_new_memtype_allowed(want_flags, flags)) { + free_memtype(paddr, paddr + size); + printk(KERN_ERR "%s:%d map pfn expected mapping type %s" + " for %Lx-%Lx, got %s\n", + current->comm, current->pid, + cattr_name(want_flags), + (unsigned long long)paddr, + (unsigned long long)(paddr + size), + cattr_name(flags)); + return -EINVAL; + } + /* + * We allow returning different type than the one requested in + * non strict case. + */ + *vma_prot = __pgprot((pgprot_val(*vma_prot) & + (~_PAGE_CACHE_MASK)) | + flags); + } + + /* Need to keep identity mapping in sync */ + if (paddr >= __pa(high_memory)) + return 0; + + id_sz = (__pa(high_memory) < paddr + size) ? + __pa(high_memory) - paddr : + size; + + if (ioremap_change_attr((unsigned long)__va(paddr), id_sz, flags) < 0) { + free_memtype(paddr, paddr + size); + printk(KERN_ERR + "%s:%d reserve_pfn_range ioremap_change_attr failed %s " + "for %Lx-%Lx\n", + current->comm, current->pid, + cattr_name(flags), + (unsigned long long)paddr, + (unsigned long long)(paddr + size)); + return -EINVAL; + } + return 0; +} + +/* + * Internal interface to free a range of physical memory. + * Frees non RAM regions only. + */ +static void free_pfn_range(u64 paddr, unsigned long size) +{ + int is_ram; + + is_ram = pat_pagerange_is_ram(paddr, paddr + size); + if (is_ram == 0) + free_memtype(paddr, paddr + size); +} + +/* + * track_pfn_vma_copy is called when vma that is covering the pfnmap gets + * copied through copy_page_range(). + * + * If the vma has a linear pfn mapping for the entire range, we get the prot + * from pte and reserve the entire vma range with single reserve_pfn_range call. + * Otherwise, we reserve the entire vma range, by going through the PTEs page + * by page to get physical address and protection. + */ +int track_pfn_vma_copy(struct vm_area_struct *vma) +{ + int retval = 0; + unsigned long i, j; + resource_size_t paddr; + unsigned long prot; + unsigned long vma_start = vma->vm_start; + unsigned long vma_end = vma->vm_end; + unsigned long vma_size = vma_end - vma_start; + pgprot_t pgprot; + + if (!pat_enabled) + return 0; + + if (is_linear_pfn_mapping(vma)) { + /* + * reserve the whole chunk covered by vma. We need the + * starting address and protection from pte. + */ + if (follow_phys(vma, vma_start, 0, &prot, &paddr)) { + WARN_ON_ONCE(1); + return -EINVAL; + } + pgprot = __pgprot(prot); + return reserve_pfn_range(paddr, vma_size, &pgprot, 1); + } + + /* reserve entire vma page by page, using pfn and prot from pte */ + for (i = 0; i < vma_size; i += PAGE_SIZE) { + if (follow_phys(vma, vma_start + i, 0, &prot, &paddr)) + continue; + + pgprot = __pgprot(prot); + retval = reserve_pfn_range(paddr, PAGE_SIZE, &pgprot, 1); + if (retval) + goto cleanup_ret; + } + return 0; + +cleanup_ret: + /* Reserve error: Cleanup partial reservation and return error */ + for (j = 0; j < i; j += PAGE_SIZE) { + if (follow_phys(vma, vma_start + j, 0, &prot, &paddr)) + continue; + + free_pfn_range(paddr, PAGE_SIZE); + } + + return retval; +} + +/* + * track_pfn_vma_new is called when a _new_ pfn mapping is being established + * for physical range indicated by pfn and size. + * + * prot is passed in as a parameter for the new mapping. If the vma has a + * linear pfn mapping for the entire range reserve the entire vma range with + * single reserve_pfn_range call.
+ * Otherwise, we look at the pfn and size and reserve only the specified range + * page by page. + * + * Note that this function can be called with caller trying to map only a + * subrange/page inside the vma. + */ +int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, + unsigned long pfn, unsigned long size) +{ + int retval = 0; + unsigned long i, j; + resource_size_t base_paddr; + resource_size_t paddr; + unsigned long vma_start = vma->vm_start; + unsigned long vma_end = vma->vm_end; + unsigned long vma_size = vma_end - vma_start; + + if (!pat_enabled) + return 0; + + if (is_linear_pfn_mapping(vma)) { + /* reserve the whole chunk starting from vm_pgoff */ + paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; + return reserve_pfn_range(paddr, vma_size, prot, 0); + } + + /* reserve page by page using pfn and size */ + base_paddr = (resource_size_t)pfn << PAGE_SHIFT; + for (i = 0; i < size; i += PAGE_SIZE) { + paddr = base_paddr + i; + retval = reserve_pfn_range(paddr, PAGE_SIZE, prot, 0); + if (retval) + goto cleanup_ret; + } + return 0; + +cleanup_ret: + /* Reserve error: Cleanup partial reservation and return error */ + for (j = 0; j < i; j += PAGE_SIZE) { + paddr = base_paddr + j; + free_pfn_range(paddr, PAGE_SIZE); + } + + return retval; +} + +/* + * untrack_pfn_vma is called while unmapping a pfnmap for a region. + * untrack can be called for a specific region indicated by pfn and size or + * can be for the entire vma (in which case size can be zero). + */ +void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, + unsigned long size) +{ + unsigned long i; + resource_size_t paddr; + unsigned long prot; + unsigned long vma_start = vma->vm_start; + unsigned long vma_end = vma->vm_end; + unsigned long vma_size = vma_end - vma_start; + + if (!pat_enabled) + return; + + if (is_linear_pfn_mapping(vma)) { + /* free the whole chunk starting from vm_pgoff */ + paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; + free_pfn_range(paddr, vma_size); + return; + } + + if (size != 0 && size != vma_size) { + /* free page by page, using pfn and size */ + paddr = (resource_size_t)pfn << PAGE_SHIFT; + for (i = 0; i < size; i += PAGE_SIZE) { + paddr = paddr + i; + free_pfn_range(paddr, PAGE_SIZE); + } + } else { + /* free entire vma, page by page, using the pfn from pte */ + for (i = 0; i < vma_size; i += PAGE_SIZE) { + if (follow_phys(vma, vma_start + i, 0, &prot, &paddr)) + continue; + + free_pfn_range(paddr, PAGE_SIZE); + } + } +} +#endif /* CONFIG_XEN */ + +pgprot_t pgprot_writecombine(pgprot_t prot) +{ + if (pat_enabled) + return __pgprot(pgprot_val(prot) | _PAGE_CACHE_WC); + else + return pgprot_noncached(prot); +} +EXPORT_SYMBOL_GPL(pgprot_writecombine); + #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT) /* get Nth element of the linked list */ --- head-2010-04-29.orig/arch/x86/pci/irq-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/pci/irq-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -16,8 +16,7 @@ #include #include #include - -#include "pci.h" +#include #define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24)) #define PIRQ_VERSION 0x0100 @@ -540,7 +539,7 @@ static int pirq_bios_set(struct pci_dev { struct pci_dev *bridge; int pin = pci_get_interrupt_pin(dev, &bridge); - return pcibios_set_irq_routing(bridge, pin, irq); + return pcibios_set_irq_routing(bridge, pin - 1, irq); } #endif @@ -579,6 +578,7 @@ static __init int intel_router_probe(str case PCI_DEVICE_ID_INTEL_ICH7_1: case PCI_DEVICE_ID_INTEL_ICH7_30: case
PCI_DEVICE_ID_INTEL_ICH7_31: + case PCI_DEVICE_ID_INTEL_TGP_LPC: case PCI_DEVICE_ID_INTEL_ESB2_0: case PCI_DEVICE_ID_INTEL_ICH8_0: case PCI_DEVICE_ID_INTEL_ICH8_1: @@ -894,7 +894,6 @@ static int pcibios_lookup_irq(struct pci dev_dbg(&dev->dev, "no interrupt pin\n"); return 0; } - pin = pin - 1; /* Find IRQ routing entry */ @@ -904,17 +903,17 @@ static int pcibios_lookup_irq(struct pci info = pirq_get_info(dev); if (!info) { dev_dbg(&dev->dev, "PCI INT %c not found in routing table\n", - 'A' + pin); + 'A' + pin - 1); return 0; } - pirq = info->irq[pin].link; - mask = info->irq[pin].bitmap; + pirq = info->irq[pin - 1].link; + mask = info->irq[pin - 1].bitmap; if (!pirq) { - dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + pin); + dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + pin - 1); return 0; } dev_dbg(&dev->dev, "PCI INT %c -> PIRQ %02x, mask %04x, excl %04x", - 'A' + pin, pirq, mask, pirq_table->exclusive_irqs); + 'A' + pin - 1, pirq, mask, pirq_table->exclusive_irqs); mask &= pcibios_irq_mask; /* Work around broken HP Pavilion Notebooks which assign USB to @@ -956,7 +955,7 @@ static int pcibios_lookup_irq(struct pci newirq = i; } } - dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + pin, newirq); + dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + pin - 1, newirq); /* Check if it is hardcoded */ if ((pirq & 0xf0) == 0xf0) { @@ -984,18 +983,18 @@ static int pcibios_lookup_irq(struct pci return 0; } } - dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin, irq); + dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin - 1, irq); /* Update IRQ for all devices with the same pirq value */ while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) { pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin); if (!pin) continue; - pin--; + info = pirq_get_info(dev2); if (!info) continue; - if (info->irq[pin].link == pirq) { + if (info->irq[pin - 1].link == pirq) { /* * We refuse to override the dev->irq * information. Give a warning! @@ -1049,6 +1048,9 @@ static void __init pcibios_fixup_irqs(vo dev = NULL; while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); + if (!pin) + continue; + #ifdef CONFIG_X86_IO_APIC /* * Recalculate IRQ numbers if we use the I/O APIC. @@ -1056,15 +1058,11 @@ static void __init pcibios_fixup_irqs(vo if (io_apic_assign_pci_irqs) { int irq; - if (!pin) - continue; - /* * interrupt pins are numbered starting from 1 */ - pin--; irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, - PCI_SLOT(dev->devfn), pin); + PCI_SLOT(dev->devfn), pin - 1); /* * Busses behind bridges are typically not listed in the * MP-table. In this case we have to look up the IRQ @@ -1077,22 +1075,22 @@ static void __init pcibios_fixup_irqs(vo struct pci_dev *bridge = dev->bus->self; int bus; - pin = (pin + PCI_SLOT(dev->devfn)) % 4; + pin = pci_swizzle_interrupt_pin(dev, pin); bus = bridge->bus->number; irq = IO_APIC_get_PCI_irq_vector(bus, - PCI_SLOT(bridge->devfn), pin); + PCI_SLOT(bridge->devfn), pin - 1); if (irq >= 0) dev_warn(&dev->dev, "using bridge %s INT %c to " "get IRQ %d\n", pci_name(bridge), - 'A' + pin, irq); + 'A' + pin - 1, irq); } if (irq >= 0) { dev_info(&dev->dev, "PCI->APIC IRQ transform: INT %c " "-> IRQ %d\n", - 'A' + pin, irq); + 'A' + pin - 1, irq); dev->irq = irq; } } @@ -1100,7 +1098,7 @@ static void __init pcibios_fixup_irqs(vo /* * Still no IRQ? Try to lookup one... 
*/ - if (pin && !dev->irq) + if (!dev->irq) pcibios_lookup_irq(dev, 0); } } @@ -1227,12 +1225,10 @@ static int pirq_enable_irq(struct pci_de if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) { char *msg = ""; - pin--; /* interrupt pins are numbered starting from 1 */ - if (io_apic_assign_pci_irqs) { int irq; - irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin); + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin - 1); /* * Busses behind bridges are typically not listed in the MP-table. * In this case we have to look up the IRQ based on the parent bus, @@ -1243,20 +1239,20 @@ static int pirq_enable_irq(struct pci_de while (irq < 0 && dev->bus->parent) { /* go back to the bridge */ struct pci_dev *bridge = dev->bus->self; - pin = (pin + PCI_SLOT(dev->devfn)) % 4; + pin = pci_swizzle_interrupt_pin(dev, pin); irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, - PCI_SLOT(bridge->devfn), pin); + PCI_SLOT(bridge->devfn), pin - 1); if (irq >= 0) dev_warn(&dev->dev, "using bridge %s " "INT %c to get IRQ %d\n", - pci_name(bridge), 'A' + pin, + pci_name(bridge), 'A' + pin - 1, irq); dev = bridge; } dev = temp_dev; if (irq >= 0) { dev_info(&dev->dev, "PCI->APIC IRQ transform: " - "INT %c -> IRQ %d\n", 'A' + pin, irq); + "INT %c -> IRQ %d\n", 'A' + pin - 1, irq); dev->irq = irq; return 0; } else @@ -1275,7 +1271,7 @@ static int pirq_enable_irq(struct pci_de return 0; dev_warn(&dev->dev, "can't find IRQ for PCI INT %c%s\n", - 'A' + pin, msg); + 'A' + pin - 1, msg); } return 0; } --- head-2010-04-29.orig/arch/x86/pci/pcifront.c 2009-03-18 10:39:31.000000000 +0100 +++ head-2010-04-29/arch/x86/pci/pcifront.c 2010-03-24 15:17:58.000000000 +0100 @@ -8,8 +8,8 @@ #include #include #include +#include #include -#include "pci.h" static int pcifront_enable_irq(struct pci_dev *dev) { --- head-2010-04-29.orig/arch/x86/vdso/vdso32-setup-xen.c 2010-03-24 15:12:46.000000000 +0100 +++ head-2010-04-29/arch/x86/vdso/vdso32-setup-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -349,7 +349,7 @@ int __init sysenter_setup(void) } /* Setup a VMA at program startup for the vsyscall page */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { struct mm_struct *mm = current->mm; unsigned long addr; --- head-2010-04-29.orig/drivers/acpi/Kconfig 2010-03-24 14:36:44.000000000 +0100 +++ head-2010-04-29/drivers/acpi/Kconfig 2010-03-24 15:17:58.000000000 +0100 @@ -9,7 +9,7 @@ menuconfig ACPI depends on PCI depends on PM select PNP - select CPU_IDLE + select CPU_IDLE if !PROCESSOR_EXTERNAL_CONTROL default y help Advanced Configuration and Power Interface (ACPI) support for --- head-2010-04-29.orig/drivers/acpi/processor_extcntl.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/drivers/acpi/processor_extcntl.c 2010-03-24 15:17:58.000000000 +0100 @@ -230,3 +230,117 @@ err_out: kfree(perf); return ret; } + +/* + * Objects and functions removed in native 2.6.29, and thus moved here. + */ +#ifdef CONFIG_SMP +static void smp_callback(void *v) +{ + /* we already woke the CPU up, nothing more to do */ +} + +/* + * This function gets called when a part of the kernel has a new latency + * requirement. This means we need to get all processors out of their C-state, + * and then recalculate a new suitable C-state. Just do a cross-cpu IPI; that + * wakes them all right up. 
+ */ +static int acpi_processor_latency_notify(struct notifier_block *b, + unsigned long l, void *v) +{ + smp_call_function(smp_callback, NULL, 1); + return NOTIFY_OK; +} + +struct notifier_block acpi_processor_latency_notifier = { + .notifier_call = acpi_processor_latency_notify, +}; +#endif + +/* + * bm_history -- bit-mask with a bit per jiffy of bus-master activity + * 1000 HZ: 0xFFFFFFFF: 32 jiffies = 32ms + * 800 HZ: 0xFFFFFFFF: 32 jiffies = 40ms + * 100 HZ: 0x0000000F: 4 jiffies = 40ms + * reduce history for more aggressive entry into C3 + */ +static unsigned int bm_history __read_mostly = + (HZ >= 800 ? 0xFFFFFFFF : ((1U << (HZ / 25)) - 1)); +module_param(bm_history, uint, 0644); + +int acpi_processor_set_power_policy(struct acpi_processor *pr) +{ + unsigned int i; + unsigned int state_is_set = 0; + struct acpi_processor_cx *lower = NULL; + struct acpi_processor_cx *higher = NULL; + struct acpi_processor_cx *cx; + + + if (!pr) + return -EINVAL; + + /* + * This function sets the default Cx state policy (OS idle handler). + * Our scheme is to promote quickly to C2 but more conservatively + * to C3. We're favoring C2 for its characteristics of low latency + * (quick response), good power savings, and ability to allow bus + * mastering activity. Note that the Cx state policy is completely + * customizable and can be altered dynamically. + */ + + /* startup state */ + for (i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++) { + cx = &pr->power.states[i]; + if (!cx->valid) + continue; + + if (!state_is_set) + pr->power.state = cx; + state_is_set++; + break; + } + + if (!state_is_set) + return -ENODEV; + + /* demotion */ + for (i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++) { + cx = &pr->power.states[i]; + if (!cx->valid) + continue; + + if (lower) { + cx->demotion.state = lower; + cx->demotion.threshold.ticks = cx->latency_ticks; + cx->demotion.threshold.count = 1; + if (cx->type == ACPI_STATE_C3) + cx->demotion.threshold.bm = bm_history; + } + + lower = cx; + } + + /* promotion */ + for (i = (ACPI_PROCESSOR_MAX_POWER - 1); i > 0; i--) { + cx = &pr->power.states[i]; + if (!cx->valid) + continue; + + if (higher) { + cx->promotion.state = higher; + cx->promotion.threshold.ticks = cx->latency_ticks; + if (cx->type >= ACPI_STATE_C2) + cx->promotion.threshold.count = 4; + else + cx->promotion.threshold.count = 10; + if (higher->type == ACPI_STATE_C3) + cx->promotion.threshold.bm = bm_history; + } + + higher = cx; + } + + return 0; +} --- head-2010-04-29.orig/drivers/acpi/processor_idle.c 2010-04-15 09:55:39.000000000 +0200 +++ head-2010-04-29/drivers/acpi/processor_idle.c 2010-04-15 10:06:51.000000000 +0200 @@ -123,6 +123,7 @@ static struct dmi_system_id __cpuinitdat }; +#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL /* * Callers should disable interrupts before the call and enable * interrupts after return. 
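/*
 * Standalone sanity check (userspace C, not part of the patch) of the
 * bm_history default added to processor_extcntl.c above: below 800HZ the
 * mask keeps HZ/25 jiffies (40ms) of bus-master history, from 800HZ up
 * all 32 bits are kept (32ms at 1000HZ), matching the table in the
 * comment.
 */
#include <stdio.h>

static unsigned int bm_default(unsigned int hz)
{
	return hz >= 800 ? 0xFFFFFFFFu : ((1u << (hz / 25)) - 1);
}

int main(void)
{
	const unsigned int hz[] = { 100, 250, 1000 };

	for (int i = 0; i < 3; i++) {
		/* jiffies of history == number of set bits in the mask */
		int jiffies = __builtin_popcount(bm_default(hz[i]));

		printf("HZ=%-4u mask=%#010x -> %d jiffies = %dms\n",
		       hz[i], bm_default(hz[i]), jiffies,
		       jiffies * 1000 / (int)hz[i]);
	}
	return 0;
}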
@@ -141,6 +142,7 @@ static void acpi_safe_halt(void) } current_thread_info()->status |= TS_POLLING; } +#endif #ifdef ARCH_APICTIMER_STOPS_ON_C3 @@ -211,7 +213,7 @@ static void lapic_timer_state_broadcast( static void lapic_timer_check_state(int state, struct acpi_processor *pr, struct acpi_processor_cx *cstate) { } static void lapic_timer_propagate_broadcast(struct acpi_processor *pr) { } -static void lapic_timer_state_broadcast(struct acpi_processor *pr, +static inline void lapic_timer_state_broadcast(struct acpi_processor *pr, struct acpi_processor_cx *cx, int broadcast) { @@ -259,7 +261,8 @@ int acpi_processor_resume(struct acpi_de return 0; } -#if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86) +#if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86) \ + && !defined(CONFIG_PROCESSOR_EXTERNAL_CONTROL) static void tsc_check_state(int state) { switch (boot_cpu_data.x86_vendor) { @@ -600,7 +603,11 @@ static void acpi_processor_power_verify_ */ cx->valid = 1; +#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL cx->latency_ticks = cx->latency; +#else + cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency); +#endif /* * On older chipsets, BM_RLD needs to be set * in order for Bus Master activity to wake the @@ -633,7 +640,11 @@ static int acpi_processor_power_verify(s if (!cx->address) break; cx->valid = 1; +#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL cx->latency_ticks = cx->latency; /* Normalize latency */ +#else + cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency); +#endif break; case ACPI_STATE_C3: @@ -676,6 +687,20 @@ static int acpi_processor_get_power_info pr->power.count = acpi_processor_power_verify(pr); +#ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL + /* + * Set Default Policy + * ------------------ + * Now that we know which states are supported, set the default + * policy. Note that this policy can be changed dynamically + * (e.g. encourage deeper sleeps to conserve battery life when + * not on AC). + */ + result = acpi_processor_set_power_policy(pr); + if (result) + return result; +#endif + /* * if one state of type C2 or C3 is available, mark this * CPU as being "idle manageable" @@ -773,6 +798,7 @@ static const struct file_operations acpi }; #endif +#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL /** * acpi_idle_bm_check - checks if bus master activity was detected */ @@ -1142,6 +1168,13 @@ static int acpi_processor_setup_cpuidle( return 0; } +#else /* CONFIG_PROCESSOR_EXTERNAL_CONTROL */ +static inline int acpi_processor_setup_cpuidle(struct acpi_processor *pr) +{ + return 0; +} +#endif /* CONFIG_PROCESSOR_EXTERNAL_CONTROL */ + int acpi_processor_cst_has_changed(struct acpi_processor *pr) { int ret = 0; @@ -1208,6 +1241,10 @@ int __cpuinit acpi_processor_power_init( "ACPI: processor limited to max C-state %d\n", max_cstate); first_run++; +#if defined(CONFIG_PROCESSOR_EXTERNAL_CONTROL) && defined(CONFIG_SMP) + pm_qos_add_notifier(PM_QOS_CPU_DMA_LATENCY, + &acpi_processor_latency_notifier); +#endif } if (!pr) @@ -1267,5 +1304,12 @@ int acpi_processor_power_exit(struct acp acpi_device_dir(device)); #endif +#if defined(CONFIG_PROCESSOR_EXTERNAL_CONTROL) && defined(CONFIG_SMP) + /* Unregister the idle handler when processor #0 is removed. 
*/ + if (pr->id == 0) + pm_qos_remove_notifier(PM_QOS_CPU_DMA_LATENCY, + &acpi_processor_latency_notifier); +#endif + return 0; } --- head-2010-04-29.orig/drivers/gpu/drm/i915/i915_drv.c 2010-04-29 09:29:49.000000000 +0200 +++ head-2010-04-29/drivers/gpu/drm/i915/i915_drv.c 2010-04-29 09:52:19.000000000 +0200 @@ -533,7 +533,7 @@ static struct drm_driver driver = { .open = drm_open, .release = drm_release, .unlocked_ioctl = drm_ioctl, - .mmap = drm_gem_mmap, + .mmap = i915_gem_mmap, .poll = drm_poll, .fasync = drm_fasync, .read = drm_read, --- head-2010-04-29.orig/drivers/gpu/drm/i915/i915_drv.h 2010-04-29 09:29:49.000000000 +0200 +++ head-2010-04-29/drivers/gpu/drm/i915/i915_drv.h 2010-04-29 09:52:20.000000000 +0200 @@ -926,6 +926,11 @@ int i915_gem_idle(struct drm_device *dev uint32_t i915_add_request(struct drm_device *dev, struct drm_file *file_priv, uint32_t flush_domains); int i915_do_wait_request(struct drm_device *dev, uint32_t seqno, int interruptible); +#ifdef CONFIG_XEN +int i915_gem_mmap(struct file *filp, struct vm_area_struct *vma); +#else +#define i915_gem_mmap drm_gem_mmap +#endif int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf); int i915_gem_object_set_to_gtt_domain(struct drm_gem_object *obj, int write); --- head-2010-04-29.orig/drivers/gpu/drm/i915/i915_gem.c 2010-04-29 09:29:49.000000000 +0200 +++ head-2010-04-29/drivers/gpu/drm/i915/i915_gem.c 2010-04-15 10:06:44.000000000 +0200 @@ -1146,6 +1146,17 @@ i915_gem_mmap_ioctl(struct drm_device *d return 0; } +#ifdef CONFIG_XEN +int i915_gem_mmap(struct file *filp, struct vm_area_struct *vma) +{ + int ret = drm_gem_mmap(filp, vma); + + pgprot_val(vma->vm_page_prot) |= _PAGE_IOMAP; + + return ret; +} +#endif + /** * i915_gem_fault - fault a page into the GTT * vma: VMA in question --- head-2010-04-29.orig/drivers/oprofile/buffer_sync.c 2010-04-15 09:51:29.000000000 +0200 +++ head-2010-04-29/drivers/oprofile/buffer_sync.c 2010-04-15 10:06:28.000000000 +0200 @@ -537,7 +537,6 @@ void sync_buffer(int cpu) int cpu_mode = CPU_MODE_KERNEL; sync_buffer_state state = sb_buffer_start; unsigned int i; - int domain_switch = 0; unsigned long available; unsigned long flags; struct op_entry entry; @@ -562,15 +561,6 @@ void sync_buffer(int cpu) if (!sample) break; -#ifdef CONFIG_XEN - if (domain_switch) { - cpu_current_domain[cpu] = sample->eip; - add_domain_switch(sample->eip); - domain_switch = 0; - continue; - } -#endif - if (is_code(sample->eip)) { flags = sample->event; if (flags & TRACE_BEGIN) { @@ -596,8 +586,11 @@ void sync_buffer(int cpu) add_user_ctx_switch(new, cookie); } #ifdef CONFIG_XEN - if (flags & DOMAIN_SWITCH) - domain_switch = 1; + if ((flags & DOMAIN_SWITCH) + && op_cpu_buffer_get_data(&entry, &val)) { + cpu_current_domain[cpu] = val; + add_domain_switch(val); + } #endif if (op_cpu_buffer_get_size(&entry)) add_data(&entry, mm); --- head-2010-04-29.orig/drivers/oprofile/cpu_buffer.c 2010-03-24 15:12:36.000000000 +0100 +++ head-2010-04-29/drivers/oprofile/cpu_buffer.c 2010-03-24 15:17:58.000000000 +0100 @@ -444,34 +444,15 @@ void oprofile_add_pc(unsigned long pc, i #ifdef CONFIG_XEN /* - * This is basically log_sample(b, ESCAPE_CODE, cpu_mode, CPU_TRACE_BEGIN), + * This is basically log_sample(b, ESCAPE_CODE, 1, cpu_mode, CPU_TRACE_BEGIN), * as was previously accessible through oprofile_add_pc(). 
*/ void oprofile_add_mode(int cpu_mode) { struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer); - struct task_struct *task; - if (nr_available_slots(cpu_buf) < 3) { + if (op_add_code(cpu_buf, 1, cpu_mode, current)) cpu_buf->sample_lost_overflow++; - return; - } - - task = current; - - /* notice a switch from user->kernel or vice versa */ - if (cpu_buf->last_cpu_mode != cpu_mode) { - cpu_buf->last_cpu_mode = cpu_mode; - add_code(cpu_buf, cpu_mode); - } - - /* notice a task switch */ - if (cpu_buf->last_task != task) { - cpu_buf->last_task = task; - add_code(cpu_buf, (unsigned long)task); - } - - add_code(cpu_buf, CPU_TRACE_BEGIN); } #endif @@ -502,17 +483,18 @@ fail: #ifdef CONFIG_XEN int oprofile_add_domain_switch(int32_t domain_id) { - struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer); + struct op_entry entry; + struct op_sample *sample; - /* should have space for switching into and out of domain - (2 slots each) plus one sample and one cpu mode switch */ - if (((nr_available_slots(cpu_buf) < 6) && - (domain_id != COORDINATOR_DOMAIN)) || - (nr_available_slots(cpu_buf) < 2)) + sample = op_cpu_buffer_write_reserve(&entry, 1); + if (!sample) return 0; - add_code(cpu_buf, DOMAIN_SWITCH); - add_sample(cpu_buf, domain_id, 0); + sample->eip = ESCAPE_CODE; + sample->event = DOMAIN_SWITCH; + + op_cpu_buffer_add_data(&entry, domain_id); + op_cpu_buffer_write_commit(&entry); current_domain = domain_id; --- head-2010-04-29.orig/drivers/pci/msi-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/drivers/pci/msi-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -763,30 +763,21 @@ void pci_no_msi(void) pci_msi_enable = 0; } +/** + * pci_msi_enabled - is MSI enabled? + * + * Returns true if MSI has not been disabled by the command-line option + * pci=nomsi. 
+ **/ +int pci_msi_enabled(void) +{ + return pci_msi_enable; +} +EXPORT_SYMBOL(pci_msi_enabled); + void pci_msi_init_pci_dev(struct pci_dev *dev) { #ifndef CONFIG_XEN INIT_LIST_HEAD(&dev->msi_list); #endif } - -#ifdef CONFIG_ACPI -#include -#include -static void __devinit msi_acpi_init(void) -{ - if (acpi_pci_disabled) - return; - pci_osc_support_set(OSC_MSI_SUPPORT); - pcie_osc_support_set(OSC_MSI_SUPPORT); -} -#else -static inline void msi_acpi_init(void) { } -#endif /* CONFIG_ACPI */ - -void __devinit msi_init(void) -{ - if (!pci_msi_enable) - return; - msi_acpi_init(); -} --- head-2010-04-29.orig/drivers/xen/Kconfig 2010-03-24 15:12:36.000000000 +0100 +++ head-2010-04-29/drivers/xen/Kconfig 2010-03-24 15:18:46.000000000 +0100 @@ -388,6 +388,7 @@ config XEN_DEV_EVTCHN config XENFS tristate "Xen filesystem" + depends on PARAVIRT_XEN default y help The xen filesystem provides a way for domains to share --- head-2010-04-29.orig/drivers/xen/Makefile 2010-04-19 14:52:08.000000000 +0200 +++ head-2010-04-29/drivers/xen/Makefile 2010-04-19 14:52:22.000000000 +0200 @@ -13,6 +13,7 @@ obj-$(CONFIG_XEN) += features.o util.o obj-$(CONFIG_HOTPLUG_CPU) += $(xen-hotplug-y) obj-$(CONFIG_XEN_XENCOMM) += xencomm.o obj-$(CONFIG_XEN_BALLOON) += $(xen-balloon-y) +obj-$(CONFIG_XENFS) += xenfs/ obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/ obj-$(CONFIG_XEN_BLKDEV_TAP) += blktap/ obj-$(CONFIG_XEN_BLKDEV_TAP2) += blktap2/ --- head-2010-04-29.orig/drivers/xen/balloon/sysfs.c 2010-03-24 15:12:46.000000000 +0100 +++ head-2010-04-29/drivers/xen/balloon/sysfs.c 2010-03-24 15:17:58.000000000 +0100 @@ -67,7 +67,7 @@ static ssize_t store_target_kb(struct sy struct sysdev_attribute *attr, const char *buf, size_t count) { - char memstring[64], *endchar; + char *endchar; unsigned long long target_bytes; if (!capable(CAP_SYS_ADMIN)) @@ -75,11 +75,8 @@ static ssize_t store_target_kb(struct sy if (count <= 1) return -EBADMSG; /* runt */ - if (count > sizeof(memstring)) - return -EFBIG; /* too long */ - strcpy(memstring, buf); - target_bytes = memparse(memstring, &endchar); + target_bytes = simple_strtoull(buf, &endchar, 0) << 10; balloon_set_new_target(target_bytes >> PAGE_SHIFT); return count; @@ -88,8 +85,40 @@ static ssize_t store_target_kb(struct sy static SYSDEV_ATTR(target_kb, S_IRUGO | S_IWUSR, show_target_kb, store_target_kb); +static ssize_t show_target(struct sys_device *dev, + struct sysdev_attribute *attr, char *buf) +{ + return sprintf(buf, "%llu\n", + (unsigned long long)balloon_stats.target_pages + << PAGE_SHIFT); +} + +static ssize_t store_target(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, + size_t count) +{ + char *endchar; + unsigned long long target_bytes; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (count <= 1) + return -EBADMSG; /* runt */ + + target_bytes = memparse(buf, &endchar); + balloon_set_new_target(target_bytes >> PAGE_SHIFT); + + return count; +} + +static SYSDEV_ATTR(target, S_IRUGO | S_IWUSR, + show_target, store_target); + static struct sysdev_attribute *balloon_attrs[] = { &attr_target_kb, + &attr_target, }; static struct attribute *balloon_info_attrs[] = { --- head-2010-04-29.orig/drivers/xen/blkfront/vbd.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/drivers/xen/blkfront/vbd.c 2010-03-24 15:17:58.000000000 +0100 @@ -308,6 +308,10 @@ xlvbd_init_blk_queue(struct gendisk *gd, if (rq == NULL) return -1; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,29) + queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq); +#endif + /* Hard sector size and max 
sectors impersonate the equiv. hardware. */ blk_queue_hardsect_size(rq, sector_size); blk_queue_max_sectors(rq, 512); --- head-2010-04-29.orig/drivers/xen/core/cpu_hotplug.c 2009-04-07 13:58:48.000000000 +0200 +++ head-2010-04-29/drivers/xen/core/cpu_hotplug.c 2010-03-24 15:17:58.000000000 +0100 @@ -10,10 +10,10 @@ * Set of CPUs that remote admin software will allow us to bring online. * Notified to us via xenbus. */ -static cpumask_t xenbus_allowed_cpumask; +static cpumask_var_t xenbus_allowed_cpumask; /* Set of CPUs that local admin will allow us to bring online. */ -static cpumask_t local_allowed_cpumask = CPU_MASK_ALL; +static cpumask_var_t local_allowed_cpumask; static int local_cpu_hotplug_request(void) { @@ -40,10 +40,10 @@ static void vcpu_hotplug(unsigned int cp } if (strcmp(state, "online") == 0) { - cpu_set(cpu, xenbus_allowed_cpumask); + cpumask_set_cpu(cpu, xenbus_allowed_cpumask); (void)cpu_up(cpu); } else if (strcmp(state, "offline") == 0) { - cpu_clear(cpu, xenbus_allowed_cpumask); + cpumask_clear_cpu(cpu, xenbus_allowed_cpumask); (void)cpu_down(cpu); } else { printk(KERN_ERR "XENBUS: unknown state(%s) on CPU%d\n", @@ -75,7 +75,7 @@ static int smpboot_cpu_notify(struct not * as it's always executed from within a stopmachine kthread. */ if ((action == CPU_DOWN_PREPARE) && local_cpu_hotplug_request()) - cpu_clear(cpu, local_allowed_cpumask); + cpumask_clear_cpu(cpu, local_allowed_cpumask); return NOTIFY_OK; } @@ -156,21 +156,26 @@ int cpu_up_check(unsigned int cpu) int rc = 0; if (local_cpu_hotplug_request()) { - cpu_set(cpu, local_allowed_cpumask); - if (!cpu_isset(cpu, xenbus_allowed_cpumask)) { + cpumask_set_cpu(cpu, local_allowed_cpumask); + if (!cpumask_test_cpu(cpu, xenbus_allowed_cpumask)) { printk("%s: attempt to bring up CPU %u disallowed by " "remote admin.\n", __FUNCTION__, cpu); rc = -EBUSY; } - } else if (!cpu_isset(cpu, local_allowed_cpumask) || - !cpu_isset(cpu, xenbus_allowed_cpumask)) { + } else if (!cpumask_test_cpu(cpu, local_allowed_cpumask) || + !cpumask_test_cpu(cpu, xenbus_allowed_cpumask)) { rc = -EBUSY; } return rc; } -void init_xenbus_allowed_cpumask(void) +void __init init_xenbus_allowed_cpumask(void) { - xenbus_allowed_cpumask = cpu_present_map; + if (!alloc_cpumask_var(&xenbus_allowed_cpumask, GFP_KERNEL)) + BUG(); + cpumask_copy(xenbus_allowed_cpumask, cpu_present_mask); + if (!alloc_cpumask_var(&local_allowed_cpumask, GFP_KERNEL)) + BUG(); + cpumask_setall(local_allowed_cpumask); } --- head-2010-04-29.orig/drivers/xen/core/evtchn.c 2010-04-23 15:18:24.000000000 +0200 +++ head-2010-04-29/drivers/xen/core/evtchn.c 2010-04-23 15:19:25.000000000 +0200 @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -57,9 +58,6 @@ static DEFINE_SPINLOCK(irq_mapping_updat static int evtchn_to_irq[NR_EVENT_CHANNELS] = { [0 ... NR_EVENT_CHANNELS-1] = -1 }; -/* Packed IRQ information: binding type, sub-type index, and event channel. */ -static u32 irq_info[NR_IRQS]; - /* Binding types. */ enum { IRQT_UNBOUND, @@ -75,6 +73,30 @@ enum { #define _EVTCHN_BITS 12 #define _INDEX_BITS (32 - _IRQT_BITS - _EVTCHN_BITS) +/* Convenient shorthand for packed representation of an unbound IRQ. */ +#define IRQ_UNBOUND (IRQT_UNBOUND << (32 - _IRQT_BITS)) + +static struct irq_cfg _irq_cfg[] = { + [0 ... 
+#ifdef CONFIG_SPARSE_IRQ + BUILD_BUG_ON_ZERO(PIRQ_BASE) + NR_IRQS_LEGACY +#else + NR_IRQS +#endif + - 1].info = IRQ_UNBOUND +}; + +static inline struct irq_cfg *__pure irq_cfg(unsigned int irq) +{ +#ifdef CONFIG_SPARSE_IRQ + struct irq_desc *desc = irq_to_desc(irq); + + return desc ? desc->chip_data : NULL; +#else + return irq < NR_IRQS ? _irq_cfg + irq : NULL; +#endif +} + /* Constructor for packed IRQ information. */ static inline u32 mk_irq_info(u32 type, u32 index, u32 evtchn) { @@ -90,26 +112,30 @@ static inline u32 mk_irq_info(u32 type, return ((type << (32 - _IRQT_BITS)) | (index << _EVTCHN_BITS) | evtchn); } -/* Convenient shorthand for packed representation of an unbound IRQ. */ -#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0) - /* * Accessors for packed IRQ information. */ static inline unsigned int evtchn_from_irq(int irq) { - return irq_info[irq] & ((1U << _EVTCHN_BITS) - 1); + const struct irq_cfg *cfg = irq_cfg(irq); + + return cfg ? cfg->info & ((1U << _EVTCHN_BITS) - 1) : 0; } static inline unsigned int index_from_irq(int irq) { - return (irq_info[irq] >> _EVTCHN_BITS) & ((1U << _INDEX_BITS) - 1); + const struct irq_cfg *cfg = irq_cfg(irq); + + return cfg ? (cfg->info >> _EVTCHN_BITS) & ((1U << _INDEX_BITS) - 1) + : 0; } static inline unsigned int type_from_irq(int irq) { - return irq_info[irq] >> (32 - _IRQT_BITS); + const struct irq_cfg *cfg = irq_cfg(irq); + + return cfg ? cfg->info >> (32 - _IRQT_BITS) : IRQT_UNBOUND; } /* IRQ <-> VIRQ mapping. */ @@ -121,9 +147,6 @@ DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS #endif DEFINE_PER_CPU(int, ipi_to_irq[NR_IPIS]) = {[0 ... NR_IPIS-1] = -1}; -/* Reference counts for bindings to IRQs. */ -static int irq_bindcount[NR_IRQS]; - #ifdef CONFIG_SMP static u8 cpu_evtchn[NR_EVENT_CHANNELS]; @@ -157,8 +180,12 @@ static void init_evtchn_cpu_bindings(voi int i; /* By default all event channels notify CPU#0. */ - for (i = 0; i < NR_IRQS; i++) - irq_to_desc(i)->affinity = cpumask_of_cpu(0); + for (i = 0; i < nr_irqs; i++) { + struct irq_desc *desc = irq_to_desc(i); + + if (desc) + desc->affinity = cpumask_of_cpu(0); + } memset(cpu_evtchn, 0, sizeof(cpu_evtchn)); memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0])); @@ -232,7 +259,7 @@ static DEFINE_PER_CPU(unsigned int, curr static DEFINE_PER_CPU(unsigned int, current_l2i); /* NB. Interrupts are disabled on entry. 
*/ -asmlinkage void evtchn_do_upcall(struct pt_regs *regs) +asmlinkage void __irq_entry evtchn_do_upcall(struct pt_regs *regs) { unsigned long l1, l2; unsigned long masked_l1, masked_l2; @@ -320,14 +347,25 @@ asmlinkage void evtchn_do_upcall(struct irq_exit(); } -static int find_unbound_irq(void) +static struct irq_chip dynirq_chip; + +static int find_unbound_irq(unsigned int cpu) { static int warned; int irq; - for (irq = DYNIRQ_BASE; irq < (DYNIRQ_BASE + NR_DYNIRQS); irq++) - if (irq_bindcount[irq] == 0) + for (irq = DYNIRQ_BASE; irq < (DYNIRQ_BASE + NR_DYNIRQS); irq++) { + struct irq_desc *desc = irq_to_desc_alloc_cpu(irq, cpu); + struct irq_cfg *cfg = desc->chip_data; + + if (!cfg->bindcount) { + desc->status |= IRQ_NOPROBE; + set_irq_chip_and_handler_name(irq, &dynirq_chip, + handle_level_irq, + "level"); return irq; + } + } if (!warned) { warned = 1; @@ -345,14 +383,15 @@ static int bind_caller_port_to_irq(unsig spin_lock(&irq_mapping_update_lock); if ((irq = evtchn_to_irq[caller_port]) == -1) { - if ((irq = find_unbound_irq()) < 0) + if ((irq = find_unbound_irq(smp_processor_id())) < 0) goto out; evtchn_to_irq[caller_port] = irq; - irq_info[irq] = mk_irq_info(IRQT_CALLER_PORT, 0, caller_port); + irq_cfg(irq)->info = mk_irq_info(IRQT_CALLER_PORT, + 0, caller_port); } - irq_bindcount[irq]++; + irq_cfg(irq)->bindcount++; out: spin_unlock(&irq_mapping_update_lock); @@ -367,7 +406,7 @@ static int bind_local_port_to_irq(unsign BUG_ON(evtchn_to_irq[local_port] != -1); - if ((irq = find_unbound_irq()) < 0) { + if ((irq = find_unbound_irq(smp_processor_id())) < 0) { struct evtchn_close close = { .port = local_port }; if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close)) BUG(); @@ -375,8 +414,8 @@ static int bind_local_port_to_irq(unsign } evtchn_to_irq[local_port] = irq; - irq_info[irq] = mk_irq_info(IRQT_LOCAL_PORT, 0, local_port); - irq_bindcount[irq]++; + irq_cfg(irq)->info = mk_irq_info(IRQT_LOCAL_PORT, 0, local_port); + irq_cfg(irq)->bindcount++; out: spin_unlock(&irq_mapping_update_lock); @@ -420,7 +459,7 @@ static int bind_virq_to_irq(unsigned int spin_lock(&irq_mapping_update_lock); if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) { - if ((irq = find_unbound_irq()) < 0) + if ((irq = find_unbound_irq(cpu)) < 0) goto out; bind_virq.virq = virq; @@ -431,14 +470,14 @@ static int bind_virq_to_irq(unsigned int evtchn = bind_virq.port; evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn); + irq_cfg(irq)->info = mk_irq_info(IRQT_VIRQ, virq, evtchn); per_cpu(virq_to_irq, cpu)[virq] = irq; bind_evtchn_to_cpu(evtchn, cpu); } - irq_bindcount[irq]++; + irq_cfg(irq)->bindcount++; out: spin_unlock(&irq_mapping_update_lock); @@ -453,7 +492,7 @@ static int bind_ipi_to_irq(unsigned int spin_lock(&irq_mapping_update_lock); if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) { - if ((irq = find_unbound_irq()) < 0) + if ((irq = find_unbound_irq(cpu)) < 0) goto out; bind_ipi.vcpu = cpu; @@ -463,14 +502,14 @@ static int bind_ipi_to_irq(unsigned int evtchn = bind_ipi.port; evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn); + irq_cfg(irq)->info = mk_irq_info(IRQT_IPI, ipi, evtchn); per_cpu(ipi_to_irq, cpu)[ipi] = irq; bind_evtchn_to_cpu(evtchn, cpu); } - irq_bindcount[irq]++; + irq_cfg(irq)->bindcount++; out: spin_unlock(&irq_mapping_update_lock); @@ -485,7 +524,7 @@ static void unbind_from_irq(unsigned int spin_lock(&irq_mapping_update_lock); - if ((--irq_bindcount[irq] == 0) && VALID_EVTCHN(evtchn)) { + if (!--irq_cfg(irq)->bindcount && 
VALID_EVTCHN(evtchn)) { close.port = evtchn; if ((type_from_irq(irq) != IRQT_CALLER_PORT) && HYPERVISOR_event_channel_op(EVTCHNOP_close, &close)) @@ -508,11 +547,15 @@ static void unbind_from_irq(unsigned int bind_evtchn_to_cpu(evtchn, 0); evtchn_to_irq[evtchn] = -1; - irq_info[irq] = IRQ_UNBOUND; + irq_cfg(irq)->info = IRQ_UNBOUND; /* Zap stats across IRQ changes of use. */ for_each_possible_cpu(cpu) +#ifdef CONFIG_SPARSE_IRQ + irq_to_desc(irq)->kstat_irqs[cpu] = 0; +#else kstat_cpu(cpu).irqs[irq] = 0; +#endif } spin_unlock(&irq_mapping_update_lock); @@ -664,10 +707,9 @@ static void rebind_irq_to_cpu(unsigned i rebind_evtchn_to_cpu(evtchn, tcpu); } -static void set_affinity_irq(unsigned int irq, cpumask_t dest) +static void set_affinity_irq(unsigned int irq, const struct cpumask *dest) { - unsigned tcpu = first_cpu(dest); - rebind_irq_to_cpu(irq, tcpu); + rebind_irq_to_cpu(irq, cpumask_first(dest)); } #endif @@ -835,7 +877,7 @@ static void enable_pirq(unsigned int irq evtchn_to_irq[evtchn] = irq; bind_evtchn_to_cpu(evtchn, 0); - irq_info[irq] = mk_irq_info(IRQT_PIRQ, bind_pirq.pirq, evtchn); + irq_cfg(irq)->info = mk_irq_info(IRQT_PIRQ, bind_pirq.pirq, evtchn); out: pirq_unmask_and_notify(evtchn, irq); @@ -857,7 +899,7 @@ static void disable_pirq(unsigned int ir bind_evtchn_to_cpu(evtchn, 0); evtchn_to_irq[evtchn] = -1; - irq_info[irq] = mk_irq_info(IRQT_PIRQ, index_from_irq(irq), 0); + irq_cfg(irq)->info = mk_irq_info(IRQT_PIRQ, index_from_irq(irq), 0); } static unsigned int startup_pirq(unsigned int irq) @@ -1023,7 +1065,7 @@ static void restore_cpu_virqs(unsigned i if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) continue; - BUG_ON(irq_info[irq] != mk_irq_info(IRQT_VIRQ, virq, 0)); + BUG_ON(irq_cfg(irq)->info != mk_irq_info(IRQT_VIRQ, virq, 0)); /* Get a new binding from Xen. */ bind_virq.virq = virq; @@ -1035,7 +1077,7 @@ static void restore_cpu_virqs(unsigned i /* Record the new mapping. */ evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn); + irq_cfg(irq)->info = mk_irq_info(IRQT_VIRQ, virq, evtchn); bind_evtchn_to_cpu(evtchn, cpu); /* Ready for use. */ @@ -1052,7 +1094,7 @@ static void restore_cpu_ipis(unsigned in if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) continue; - BUG_ON(irq_info[irq] != mk_irq_info(IRQT_IPI, ipi, 0)); + BUG_ON(irq_cfg(irq)->info != mk_irq_info(IRQT_IPI, ipi, 0)); /* Get a new binding from Xen. */ bind_ipi.vcpu = cpu; @@ -1063,7 +1105,7 @@ static void restore_cpu_ipis(unsigned in /* Record the new mapping. */ evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn); + irq_cfg(irq)->info = mk_irq_info(IRQT_IPI, ipi, evtchn); bind_evtchn_to_cpu(evtchn, cpu); /* Ready for use. */ @@ -1075,6 +1117,7 @@ static void restore_cpu_ipis(unsigned in void irq_resume(void) { unsigned int cpu, irq, evtchn; + struct irq_cfg *cfg; init_evtchn_cpu_bindings(); @@ -1091,12 +1134,17 @@ void irq_resume(void) mask_evtchn(evtchn); /* Check that no PIRQs are still bound. */ - for (irq = PIRQ_BASE; irq < (PIRQ_BASE + NR_PIRQS); irq++) - BUG_ON(irq_info[irq] != IRQ_UNBOUND); + for (irq = PIRQ_BASE; irq < (PIRQ_BASE + NR_PIRQS); irq++) { + cfg = irq_cfg(irq); + BUG_ON(cfg && cfg->info != IRQ_UNBOUND); + } /* No IRQ <-> event-channel mappings. 
*/ - for (irq = 0; irq < NR_IRQS; irq++) - irq_info[irq] &= ~((1U << _EVTCHN_BITS) - 1); + for (irq = 0; irq < nr_irqs; irq++) { + cfg = irq_cfg(irq); + if (cfg) + cfg->info &= ~((1U << _EVTCHN_BITS) - 1); + } for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) evtchn_to_irq[evtchn] = -1; @@ -1108,10 +1156,56 @@ void irq_resume(void) } #endif +int __init arch_early_irq_init(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(_irq_cfg); i++) + irq_to_desc(i)->chip_data = _irq_cfg + i; + + return 0; +} + +#ifdef CONFIG_SPARSE_IRQ +int arch_init_chip_data(struct irq_desc *desc, int cpu) +{ + if (!desc->chip_data) { + /* By default all event channels notify CPU#0. */ + desc->affinity = cpumask_of_cpu(0); + + desc->chip_data = kzalloc(sizeof(struct irq_cfg), GFP_ATOMIC); + } + if (!desc->chip_data) { + printk(KERN_ERR "cannot alloc irq_cfg\n"); + BUG(); + } + + return 0; +} +#endif + #if defined(CONFIG_X86_IO_APIC) +int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) +{ + struct physdev_irq irq_op; + + if (irq < PIRQ_BASE || irq - PIRQ_BASE >= NR_PIRQS) + return -EINVAL; + + if (cfg->vector) + return 0; + + irq_op.irq = irq; + if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) + return -ENOSPC; + + cfg->vector = irq_op.vector; + + return 0; +} #define identity_mapped_irq(irq) (!IO_APIC_IRQ((irq) - PIRQ_BASE)) #elif defined(CONFIG_X86) -#define identity_mapped_irq(irq) (((irq) - PIRQ_BASE) < 16) +#define identity_mapped_irq(irq) (((irq) - PIRQ_BASE) < NR_IRQS_LEGACY) #else #define identity_mapped_irq(irq) (1) #endif @@ -1121,7 +1215,7 @@ void evtchn_register_pirq(int irq) BUG_ON(irq < PIRQ_BASE || irq - PIRQ_BASE >= NR_PIRQS); if (identity_mapped_irq(irq) || type_from_irq(irq) != IRQT_UNBOUND) return; - irq_info[irq] = mk_irq_info(IRQT_PIRQ, irq, 0); + irq_cfg(irq)->info = mk_irq_info(IRQT_PIRQ, irq, 0); set_irq_chip_and_handler_name(irq, &pirq_chip, handle_level_irq, "level"); } @@ -1134,12 +1228,17 @@ int evtchn_map_pirq(int irq, int xen_pir irq = PIRQ_BASE + NR_PIRQS - 1; spin_lock(&irq_alloc_lock); do { + struct irq_desc *desc; + struct irq_cfg *cfg; + if (identity_mapped_irq(irq)) continue; + desc = irq_to_desc_alloc_cpu(irq, smp_processor_id()); + cfg = desc->chip_data; if (!index_from_irq(irq)) { BUG_ON(type_from_irq(irq) != IRQT_UNBOUND); - irq_info[irq] = mk_irq_info(IRQT_PIRQ, - xen_pirq, 0); + cfg->info = mk_irq_info(IRQT_PIRQ, + xen_pirq, 0); break; } } while (--irq >= PIRQ_BASE); @@ -1158,7 +1257,7 @@ int evtchn_map_pirq(int irq, int xen_pir * then causes a warning in dynamic_irq_cleanup(). */ set_irq_chip_and_handler(irq, NULL, NULL); - irq_info[irq] = IRQ_UNBOUND; + irq_cfg(irq)->info = IRQ_UNBOUND; return 0; } else if (type_from_irq(irq) != IRQT_PIRQ || index_from_irq(irq) != xen_pirq) { @@ -1195,23 +1294,17 @@ void __init xen_init_IRQ(void) for (i = 0; i < NR_EVENT_CHANNELS; i++) mask_evtchn(i); - /* No IRQ -> event-channel mappings. */ - for (i = 0; i < NR_IRQS; i++) - irq_info[i] = IRQ_UNBOUND; - - /* Dynamic IRQ space is currently unbound. Zero the refcnts. */ +#ifndef CONFIG_SPARSE_IRQ for (i = DYNIRQ_BASE; i < (DYNIRQ_BASE + NR_DYNIRQS); i++) { - irq_bindcount[i] = 0; - irq_to_desc(i)->status |= IRQ_NOPROBE; set_irq_chip_and_handler_name(i, &dynirq_chip, handle_level_irq, "level"); } - /* Phys IRQ space is statically bound (1:1 mapping). Nail refcnts. 
*/ for (i = PIRQ_BASE; i < (PIRQ_BASE + NR_PIRQS); i++) { - irq_bindcount[i] = 1; - +#else + for (i = PIRQ_BASE; i < (PIRQ_BASE + NR_IRQS_LEGACY); i++) { +#endif if (!identity_mapped_irq(i)) continue; --- head-2010-04-29.orig/drivers/xen/core/machine_reboot.c 2010-03-24 15:12:46.000000000 +0100 +++ head-2010-04-29/drivers/xen/core/machine_reboot.c 2010-03-24 15:17:58.000000000 +0100 @@ -19,6 +19,9 @@ #include #if defined(__i386__) || defined(__x86_64__) +#include +/* TBD: Dom0 should propagate the determined value to Xen. */ +bool port_cf9_safe = false; /* * Power off function, if any @@ -84,7 +87,7 @@ static void post_suspend(int suspend_can pfn_to_mfn(xen_start_info->console.domU.mfn); } else { #ifdef CONFIG_SMP - cpu_initialized_map = cpu_online_map; + cpumask_copy(vcpu_initialized_mask, cpu_online_mask); #endif for_each_possible_cpu(i) setup_runstate_area(i); @@ -222,6 +225,12 @@ int __xen_suspend(int fast_suspend, void if (num_possible_cpus() == 1) fast_suspend = 0; + if (fast_suspend) { + err = stop_machine_create(); + if (err) + return err; + } + suspend.fast_suspend = fast_suspend; suspend.resume_notifier = resume_notifier; @@ -248,6 +257,8 @@ int __xen_suspend(int fast_suspend, void if (!fast_suspend) smp_resume(); + else + stop_machine_destroy(); return 0; } --- head-2010-04-29.orig/drivers/xen/core/smpboot.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/drivers/xen/core/smpboot.c 2010-03-24 15:17:58.000000000 +0100 @@ -36,11 +36,7 @@ extern void smp_trap_init(trap_info_t *) /* Number of siblings per CPU package */ int smp_num_siblings = 1; -cpumask_t cpu_online_map; -EXPORT_SYMBOL(cpu_online_map); -cpumask_t cpu_possible_map; -EXPORT_SYMBOL(cpu_possible_map); -cpumask_t cpu_initialized_map; +cpumask_var_t vcpu_initialized_mask; DEFINE_PER_CPU(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); @@ -76,10 +72,14 @@ void __init prefill_possible_map(void) #endif rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); if (rc >= 0) { - cpu_set(i, cpu_possible_map); + set_cpu_possible(i, true); nr_cpu_ids = i + 1; } } + total_cpus = num_possible_cpus(); + for (; HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL) >= 0; ++i) + if (i != smp_processor_id()) + ++total_cpus; } static inline void @@ -203,7 +203,7 @@ static void __cpuinit cpu_initialize_con struct task_struct *idle = idle_task(cpu); - if (cpu_test_and_set(cpu, cpu_initialized_map)) + if (cpumask_test_and_set_cpu(cpu, vcpu_initialized_mask)) return; spin_lock(&ctxt_lock); @@ -284,13 +284,15 @@ void __init smp_prepare_cpus(unsigned in if (xen_smp_intr_init(0)) BUG(); - cpu_initialized_map = cpumask_of_cpu(0); + if (!alloc_cpumask_var(&vcpu_initialized_mask, GFP_KERNEL)) + BUG(); + cpumask_copy(vcpu_initialized_mask, cpumask_of(0)); /* Restrict the possible_map according to max_cpus. 
*/ while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) { - for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--) + for (cpu = nr_cpu_ids-1; !cpumask_test_cpu(cpu, cpu_possible_mask); cpu--) continue; - cpu_clear(cpu, cpu_possible_map); + set_cpu_possible(cpu, false); } for_each_possible_cpu (cpu) { @@ -328,10 +330,8 @@ void __init smp_prepare_cpus(unsigned in #ifdef CONFIG_HOTPLUG_CPU if (is_initial_xendomain()) - cpu_set(cpu, cpu_present_map); -#else - cpu_set(cpu, cpu_present_map); #endif + set_cpu_present(cpu, true); } init_xenbus_allowed_cpumask(); @@ -364,14 +364,17 @@ void __init smp_prepare_boot_cpu(void) */ static int __init initialize_cpu_present_map(void) { - cpu_present_map = cpu_possible_map; + unsigned int cpu; + + for_each_possible_cpu(cpu) + set_cpu_present(cpu, true); + return 0; } core_initcall(initialize_cpu_present_map); int __cpuexit __cpu_disable(void) { - cpumask_t map = cpu_online_map; unsigned int cpu = smp_processor_id(); if (cpu == 0) @@ -379,9 +382,8 @@ int __cpuexit __cpu_disable(void) remove_siblinginfo(cpu); - cpu_clear(cpu, map); - fixup_irqs(map); - cpu_clear(cpu, cpu_online_map); + set_cpu_online(cpu, false); + fixup_irqs(); return 0; } @@ -424,7 +426,7 @@ int __cpuinit __cpu_up(unsigned int cpu) return rc; } - cpu_set(cpu, cpu_online_map); + set_cpu_online(cpu, true); rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); BUG_ON(rc); @@ -436,7 +438,7 @@ void __ref play_dead(void) { idle_task_exit(); local_irq_disable(); - cpu_clear(smp_processor_id(), cpu_initialized); + cpumask_clear_cpu(smp_processor_id(), cpu_initialized_mask); preempt_enable_no_resched(); VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL)); #ifdef CONFIG_HOTPLUG_CPU --- head-2010-04-29.orig/drivers/xen/core/spinlock.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/drivers/xen/core/spinlock.c 2010-03-24 15:17:58.000000000 +0100 @@ -173,7 +173,8 @@ bool xen_spin_wait(raw_spinlock_t *lock, current_vcpu_info()->evtchn_upcall_mask = upcall_mask; rc = !xen_test_irq_pending(irq); - kstat_this_cpu.irqs[irq] += !rc; + if (!rc) + kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); } while (spinning.prev || rc); /* --- head-2010-04-29.orig/drivers/xen/netback/interface.c 2010-03-24 15:09:08.000000000 +0100 +++ head-2010-04-29/drivers/xen/netback/interface.c 2010-03-24 15:17:58.000000000 +0100 @@ -176,6 +176,14 @@ static struct ethtool_ops network_ethtoo .get_strings = netbk_get_strings, }; +static const struct net_device_ops netif_be_netdev_ops = { + .ndo_open = net_open, + .ndo_stop = net_close, + .ndo_start_xmit = netif_be_start_xmit, + .ndo_change_mtu = netbk_change_mtu, + .ndo_get_stats = netif_be_get_stats, +}; + netif_t *netif_alloc(struct device *parent, domid_t domid, unsigned int handle) { int err = 0; @@ -210,11 +218,7 @@ netif_t *netif_alloc(struct device *pare init_timer(&netif->tx_queue_timeout); - dev->hard_start_xmit = netif_be_start_xmit; - dev->get_stats = netif_be_get_stats; - dev->open = net_open; - dev->stop = net_close; - dev->change_mtu = netbk_change_mtu; + dev->netdev_ops = &netif_be_netdev_ops; dev->features = NETIF_F_IP_CSUM; SET_ETHTOOL_OPS(dev, &network_ethtool_ops); --- head-2010-04-29.orig/drivers/xen/netback/loopback.c 2010-03-24 15:10:29.000000000 +0100 +++ head-2010-04-29/drivers/xen/netback/loopback.c 2010-03-24 15:17:58.000000000 +0100 @@ -201,19 +201,21 @@ static void loopback_set_multicast_list( { } +static const struct net_device_ops loopback_netdev_ops = { + .ndo_open = loopback_open, + .ndo_stop = loopback_close, + 
.ndo_start_xmit = loopback_start_xmit, + .ndo_set_multicast_list = loopback_set_multicast_list, + .ndo_change_mtu = NULL, /* allow arbitrary mtu */ + .ndo_get_stats = loopback_get_stats, +}; + static void loopback_construct(struct net_device *dev, struct net_device *lo) { struct net_private *np = netdev_priv(dev); np->loopback_dev = lo; - - dev->open = loopback_open; - dev->stop = loopback_close; - dev->hard_start_xmit = loopback_start_xmit; - dev->get_stats = loopback_get_stats; - dev->set_multicast_list = loopback_set_multicast_list; - dev->change_mtu = NULL; /* allow arbitrary mtu */ - + dev->netdev_ops = &loopback_netdev_ops; dev->tx_queue_len = 0; dev->features = (NETIF_F_HIGHDMA | --- head-2010-04-29.orig/drivers/xen/netback/netback.c 2010-03-24 15:10:29.000000000 +0100 +++ head-2010-04-29/drivers/xen/netback/netback.c 2010-03-24 15:17:58.000000000 +0100 @@ -354,7 +354,7 @@ static void xen_network_done_notify(void static struct net_device *eth0_dev = NULL; if (unlikely(eth0_dev == NULL)) eth0_dev = __dev_get_by_name(&init_net, "eth0"); - netif_rx_schedule(eth0_dev, ???); + netif_rx_schedule(???); } /* * Add following to poll() function in NAPI driver (Tigon3 is example): --- head-2010-04-29.orig/drivers/xen/netfront/netfront.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/drivers/xen/netfront/netfront.c 2010-03-24 15:17:58.000000000 +0100 @@ -635,7 +635,7 @@ static int network_open(struct net_devic if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx)){ netfront_accelerator_call_stop_napi_irq(np, dev); - netif_rx_schedule(dev, &np->napi); + netif_rx_schedule(&np->napi); } } spin_unlock_bh(&np->rx_lock); @@ -707,7 +707,7 @@ static void rx_refill_timeout(unsigned l netfront_accelerator_call_stop_napi_irq(np, dev); - netif_rx_schedule(dev, &np->napi); + netif_rx_schedule(&np->napi); } static void network_alloc_rx_buffers(struct net_device *dev) @@ -1064,8 +1064,7 @@ static irqreturn_t netif_int(int irq, vo if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx)) { netfront_accelerator_call_stop_napi_irq(np, dev); - netif_rx_schedule(dev, &np->napi); - dev->last_rx = jiffies; + netif_rx_schedule(&np->napi); } } @@ -1481,7 +1480,6 @@ err: /* Pass it up. */ netif_receive_skb(skb); - dev->last_rx = jiffies; } /* If we get a callback with very few responses, reduce fill target. 
*/ @@ -1523,7 +1521,7 @@ err: } if (!more_to_do && !accel_more_to_do) - __netif_rx_complete(dev, napi); + __netif_rx_complete(napi); local_irq_restore(flags); } @@ -2024,6 +2022,18 @@ static void network_set_multicast_list(s { } +static const struct net_device_ops xennet_netdev_ops = { + .ndo_uninit = netif_uninit, + .ndo_open = network_open, + .ndo_stop = network_close, + .ndo_start_xmit = network_start_xmit, + .ndo_set_multicast_list = network_set_multicast_list, + .ndo_set_mac_address = xennet_set_mac_address, + .ndo_validate_addr = eth_validate_addr, + .ndo_change_mtu = xennet_change_mtu, + .ndo_get_stats = network_get_stats, +}; + static struct net_device * __devinit create_netdev(struct xenbus_device *dev) { int i, err = 0; @@ -2080,15 +2090,8 @@ static struct net_device * __devinit cre goto exit_free_tx; } - netdev->open = network_open; - netdev->hard_start_xmit = network_start_xmit; - netdev->stop = network_close; - netdev->get_stats = network_get_stats; + netdev->netdev_ops = &xennet_netdev_ops; netif_napi_add(netdev, &np->napi, netif_poll, 64); - netdev->set_multicast_list = network_set_multicast_list; - netdev->uninit = netif_uninit; - netdev->set_mac_address = xennet_set_mac_address; - netdev->change_mtu = xennet_change_mtu; netdev->features = NETIF_F_IP_CSUM; SET_ETHTOOL_OPS(netdev, &network_ethtool_ops); @@ -2119,7 +2122,7 @@ inetdev_notify(struct notifier_block *th struct net_device *dev = ifa->ifa_dev->dev; /* UP event and is it one of our devices? */ - if (event == NETDEV_UP && dev->open == network_open) + if (event == NETDEV_UP && dev->netdev_ops->ndo_open == network_open) send_fake_arp(dev); return NOTIFY_DONE; --- head-2010-04-29.orig/drivers/xen/sfc_netfront/accel_msg.c 2010-03-24 15:10:29.000000000 +0100 +++ head-2010-04-29/drivers/xen/sfc_netfront/accel_msg.c 2010-03-24 15:17:58.000000000 +0100 @@ -47,7 +47,7 @@ static void vnic_start_interrupts(netfro netfront_accel_disable_net_interrupts(vnic); vnic->irq_enabled = 0; NETFRONT_ACCEL_STATS_OP(vnic->stats.poll_schedule_count++); - netif_rx_schedule(vnic->net_dev, &np->napi); + netif_rx_schedule(&np->napi); } else { /* * Nothing yet, make sure we get interrupts through @@ -532,7 +532,7 @@ irqreturn_t netfront_accel_net_channel_i vnic->stats.event_count_since_irq; vnic->stats.event_count_since_irq = 0; #endif - netif_rx_schedule(net_dev, &np->napi); + netif_rx_schedule(&np->napi); } else { spin_unlock_irqrestore(&vnic->irq_enabled_lock, flags); --- head-2010-04-29.orig/drivers/xen/xenbus/xenbus_client.c 2010-03-24 15:12:46.000000000 +0100 +++ head-2010-04-29/drivers/xen/xenbus/xenbus_client.c 2010-03-24 15:17:58.000000000 +0100 @@ -170,7 +170,6 @@ EXPORT_SYMBOL_GPL(xenbus_watch_pathfmt); /** * xenbus_switch_state * @dev: xenbus device - * @xbt: transaction handle * @state: new state * * Advertise in the store a change of the given driver to the given new_state. @@ -304,7 +303,7 @@ EXPORT_SYMBOL_GPL(xenbus_dev_error); * @fmt: error message format * * Equivalent to xenbus_dev_error(dev, err, fmt, args), followed by - * xenbus_switch_state(dev, NULL, XenbusStateClosing) to schedule an orderly + * xenbus_switch_state(dev, XenbusStateClosing) to schedule an orderly * closedown of this driver and its peer. 
*/ void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, --- head-2010-04-29.orig/drivers/xen/xenbus/xenbus_probe.c 2010-03-24 15:12:46.000000000 +0100 +++ head-2010-04-29/drivers/xen/xenbus/xenbus_probe.c 2010-03-24 15:17:58.000000000 +0100 @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -73,6 +74,10 @@ #endif int xen_store_evtchn; +#if !defined(CONFIG_XEN) && !defined(MODULE) +EXPORT_SYMBOL(xen_store_evtchn); +#endif + struct xenstore_domain_interface *xen_store_interface; static unsigned long xen_store_mfn; @@ -197,6 +202,12 @@ static int xenbus_uevent_frontend(struct } #endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,29) +static struct device_attribute xenbus_dev_attrs[] = { + __ATTR_NULL +}; +#endif + /* Bus type for frontend drivers. */ static struct xen_bus_type xenbus_frontend = { .root = "device", @@ -205,13 +216,16 @@ static struct xen_bus_type xenbus_fronte .probe = xenbus_probe_frontend, .error = -ENODEV, .bus = { - .name = "xen", - .match = xenbus_match, + .name = "xen", + .match = xenbus_match, #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) - .probe = xenbus_dev_probe, - .remove = xenbus_dev_remove, - .shutdown = xenbus_dev_shutdown, - .uevent = xenbus_uevent_frontend, + .probe = xenbus_dev_probe, + .remove = xenbus_dev_remove, + .shutdown = xenbus_dev_shutdown, + .uevent = xenbus_uevent_frontend, +#endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,29) + .dev_attrs = xenbus_dev_attrs, #endif }, #if defined(CONFIG_XEN) || defined(MODULE) @@ -584,7 +598,17 @@ int xenbus_probe_node(struct xen_bus_typ xendev->dev.bus = &bus->bus; xendev->dev.release = xenbus_dev_release; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26) + { + char devname[XEN_BUS_ID_SIZE]; + + err = bus->get_bus_id(devname, xendev->nodename); + if (!err) + dev_set_name(&xendev->dev, devname); + } +#else err = bus->get_bus_id(xendev->dev.bus_id, xendev->nodename); +#endif if (err) goto fail; @@ -770,7 +794,7 @@ static int suspend_dev(struct device *de err = drv->suspend(xdev); if (err) printk(KERN_WARNING - "xenbus: suspend %s failed: %i\n", dev->bus_id, err); + "xenbus: suspend %s failed: %i\n", dev_name(dev), err); return 0; } @@ -791,7 +815,7 @@ static int suspend_cancel_dev(struct dev if (err) printk(KERN_WARNING "xenbus: suspend_cancel %s failed: %i\n", - dev->bus_id, err); + dev_name(dev), err); return 0; } @@ -813,7 +837,7 @@ static int resume_dev(struct device *dev if (err) { printk(KERN_WARNING "xenbus: resume (talk_to_otherend) %s failed: %i\n", - dev->bus_id, err); + dev_name(dev), err); return err; } @@ -824,7 +848,7 @@ static int resume_dev(struct device *dev if (err) { printk(KERN_WARNING "xenbus: resume %s failed: %i\n", - dev->bus_id, err); + dev_name(dev), err); return err; } } @@ -833,7 +857,7 @@ static int resume_dev(struct device *dev if (err) { printk(KERN_WARNING "xenbus_probe: resume (watch_otherend) %s failed: " - "%d.\n", dev->bus_id, err); + "%d.\n", dev_name(dev), err); return err; } @@ -1145,6 +1169,14 @@ static int __devinit xenbus_probe_init(v if (!is_initial_xendomain()) xenbus_probe(NULL); +#if defined(CONFIG_XEN_COMPAT_XENFS) && !defined(MODULE) + /* + * Create xenfs mountpoint in /proc for compatibility with + * utilities that expect to find "xenbus" under "/proc/xen". 
+ */ + proc_mkdir("xen", NULL); +#endif + return 0; err: --- head-2010-04-29.orig/drivers/xen/xenbus/xenbus_probe.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/drivers/xen/xenbus/xenbus_probe.h 2010-03-24 15:17:58.000000000 +0100 @@ -45,6 +45,10 @@ #define is_initial_xendomain() xen_initial_domain() #endif +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) +#define dev_name(dev) ((dev)->bus_id) +#endif + #if defined(CONFIG_XEN_BACKEND) || defined(CONFIG_XEN_BACKEND_MODULE) extern void xenbus_backend_suspend(int (*fn)(struct device *, void *)); extern void xenbus_backend_resume(int (*fn)(struct device *, void *)); --- head-2010-04-29.orig/drivers/xen/xenbus/xenbus_probe_backend.c 2010-03-24 15:10:29.000000000 +0100 +++ head-2010-04-29/drivers/xen/xenbus/xenbus_probe_backend.c 2010-03-24 15:17:58.000000000 +0100 @@ -36,6 +36,7 @@ __FUNCTION__, __LINE__, ##args) #include +#include #include #include #include @@ -108,6 +109,10 @@ static int backend_bus_id(char bus_id[BU return 0; } +static struct device_attribute xenbus_backend_attrs[] = { + __ATTR_NULL +}; + static struct xen_bus_type xenbus_backend = { .root = "backend", .levels = 3, /* backend/type// */ @@ -115,12 +120,13 @@ static struct xen_bus_type xenbus_backen .probe = xenbus_probe_backend, .error = -ENODEV, .bus = { - .name = "xen-backend", - .match = xenbus_match, - .probe = xenbus_dev_probe, - .remove = xenbus_dev_remove, -// .shutdown = xenbus_dev_shutdown, - .uevent = xenbus_uevent_backend, + .name = "xen-backend", + .match = xenbus_match, + .probe = xenbus_dev_probe, + .remove = xenbus_dev_remove, +// .shutdown = xenbus_dev_shutdown, + .uevent = xenbus_uevent_backend, + .dev_attrs = xenbus_backend_attrs, }, .dev = { .bus_id = "xen-backend", --- head-2010-04-29.orig/drivers/xen/xenbus/xenbus_xs.c 2010-03-24 15:09:22.000000000 +0100 +++ head-2010-04-29/drivers/xen/xenbus/xenbus_xs.c 2010-03-24 15:17:58.000000000 +0100 @@ -227,6 +227,9 @@ void *xenbus_dev_request_and_reply(struc return ret; } +#if !defined(CONFIG_XEN) && !defined(MODULE) +EXPORT_SYMBOL(xenbus_dev_request_and_reply); +#endif /* Send message to xs, get kmalloc'ed reply. ERR_PTR() on error. 
*/ static void *xs_talkv(struct xenbus_transaction t, --- head-2010-04-29.orig/drivers/xen/xenoprof/xenoprofile.c 2010-03-24 15:10:37.000000000 +0100 +++ head-2010-04-29/drivers/xen/xenoprof/xenoprofile.c 2010-03-24 15:17:58.000000000 +0100 @@ -50,7 +50,7 @@ static int xenoprof_enabled = 0; static int xenoprof_is_primary = 0; static int active_defined; -extern unsigned long backtrace_depth; +extern unsigned long oprofile_backtrace_depth; /* Number of buffers in shared area (one per VCPU) */ static int nbuf; @@ -339,11 +339,11 @@ static int xenoprof_setup(void) active_defined = 1; } - if (backtrace_depth > 0) { + if (oprofile_backtrace_depth > 0) { ret = HYPERVISOR_xenoprof_op(XENOPROF_set_backtrace, - &backtrace_depth); + &oprofile_backtrace_depth); if (ret) - backtrace_depth = 0; + oprofile_backtrace_depth = 0; } ret = HYPERVISOR_xenoprof_op(XENOPROF_reserve_counters, NULL); --- head-2010-04-29.orig/include/acpi/processor.h 2010-03-24 14:53:41.000000000 +0100 +++ head-2010-04-29/include/acpi/processor.h 2010-03-24 15:17:58.000000000 +0100 @@ -451,6 +451,13 @@ extern int processor_extcntl_prepare(str extern int acpi_processor_get_performance_info(struct acpi_processor *pr); extern int acpi_processor_get_psd(struct acpi_processor *pr); void arch_acpi_processor_init_extcntl(const struct processor_extcntl_ops **); + +/* + * Declarations for objects and functions removed in native 2.6.29, and + * thus moved to drivers/acpi/processor_extcntl.c. + */ +extern struct notifier_block acpi_processor_latency_notifier; +int acpi_processor_set_power_policy(struct acpi_processor *); #else static inline int processor_cntl_external(void) {return 0;} static inline int processor_pm_external(void) {return 0;} --- head-2010-04-29.orig/include/xen/cpu_hotplug.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/include/xen/cpu_hotplug.h 2010-03-24 15:17:58.000000000 +0100 @@ -5,7 +5,7 @@ #include #if defined(CONFIG_X86) && defined(CONFIG_SMP) -extern cpumask_t cpu_initialized_map; +extern cpumask_var_t vcpu_initialized_mask; #endif #if defined(CONFIG_HOTPLUG_CPU) --- head-2010-04-29.orig/include/xen/evtchn.h 2010-03-24 15:10:37.000000000 +0100 +++ head-2010-04-29/include/xen/evtchn.h 2010-03-24 15:17:58.000000000 +0100 @@ -48,6 +48,18 @@ * LOW-LEVEL DEFINITIONS */ +struct irq_cfg { + u32 info; + union { + int bindcount; /* for dynamic IRQs */ +#ifdef CONFIG_X86_IO_APIC + u8 vector; /* for physical IRQs */ +#endif + }; +}; + +int assign_irq_vector(int irq, struct irq_cfg *, const struct cpumask *); + /* * Dynamically bind an event source to an IRQ-like callback handler. * On some platforms this may not be implemented via the Linux IRQ subsystem. 
--- head-2010-04-29.orig/include/xen/xenbus.h 2010-03-24 15:10:29.000000000 +0100 +++ head-2010-04-29/include/xen/xenbus.h 2010-03-24 15:17:58.000000000 +0100 @@ -322,7 +322,9 @@ void xenbus_dev_error(struct xenbus_devi void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...); +#if defined(CONFIG_XEN) || defined(MODULE) int xenbus_dev_init(void); +#endif const char *xenbus_strstate(enum xenbus_state state); int xenbus_dev_is_online(struct xenbus_device *dev); --- head-2010-04-29.orig/lib/swiotlb-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/lib/swiotlb-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -8,6 +8,7 @@ * Copyright (C) 2000, 2003 Hewlett-Packard Co * David Mosberger-Tang * Copyright (C) 2005 Keir Fraser + * 08/12/11 beckyb Add highmem support */ #include @@ -16,6 +17,8 @@ #include #include #include +#include +#include #include #include #include @@ -30,27 +33,11 @@ #include #include -int swiotlb; - #define OFFSET(val,align) ((unsigned long)((val) & ( (align) - 1))) -/* - * Maximum allowable number of contiguous slabs to map, - * must be a power of 2. What is the appropriate value ? - * The complexity of {map,unmap}_single is linearly dependent on this value. - */ -#define IO_TLB_SEGSIZE 128 - -/* - * log of the size of each IO TLB slab. The number of slabs is command line - * controllable. - */ -#define IO_TLB_SHIFT 11 - +int swiotlb; int swiotlb_force; -static unsigned long iotlb_nslabs; - /* * Used to do a quick range check in swiotlb_unmap_single and * swiotlb_sync_single_*, to see if the memory was in fact allocated by this @@ -59,6 +46,12 @@ static unsigned long iotlb_nslabs; static char *io_tlb_start, *io_tlb_end; /* + * The number of IO TLB blocks (in groups of 64) between io_tlb_start and + * io_tlb_end. This is command line adjustable via setup_io_tlb_npages. + */ +static unsigned long io_tlb_nslabs; + +/* * When the IOMMU overflows we return a fallback buffer. This sets the size. */ static unsigned long io_tlb_overflow = 32*1024; @@ -76,10 +69,7 @@ static unsigned int io_tlb_index; * We need to save away the original address corresponding to a mapped entry * for the sync operations. */ -static struct phys_addr { - struct page *page; - unsigned int offset; -} *io_tlb_orig_addr; +static phys_addr_t *io_tlb_orig_addr; /* * Protect the above data structures in the map and unmap calls @@ -101,9 +91,9 @@ setup_io_tlb_npages(char *str) { /* Unlike ia64, the size is aperture in megabytes, not 'slabs'! */ if (isdigit(*str)) { - iotlb_nslabs = simple_strtoul(str, &str, 0) << + io_tlb_nslabs = simple_strtoul(str, &str, 0) << (20 - IO_TLB_SHIFT); - iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE); + io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); } if (*str == ',') ++str; @@ -121,35 +111,17 @@ setup_io_tlb_npages(char *str) __setup("swiotlb=", setup_io_tlb_npages); /* make io_tlb_overflow tunable too? */ -/* - * Statically reserve bounce buffer space and initialize bounce buffer data - * structures for the software IO TLB used to implement the PCI DMA API.
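/*
 * Worked example of the setup_io_tlb_npages() arithmetic above, as a
 * standalone userspace sketch with the kernel constants inlined: each
 * slab is 1 << IO_TLB_SHIFT = 2 KiB, so the megabyte count from the
 * command line is shifted by (20 - IO_TLB_SHIFT) and rounded up to a
 * whole IO_TLB_SEGSIZE.
 */
#include <stdio.h>

#define IO_TLB_SHIFT	11
#define IO_TLB_SEGSIZE	128
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((unsigned long)(a) - 1))

int main(void)
{
	unsigned long megabytes = 64;	/* as in swiotlb=64 */
	unsigned long nslabs = megabytes << (20 - IO_TLB_SHIFT);

	nslabs = ALIGN_UP(nslabs, IO_TLB_SEGSIZE);
	printf("%lu MB -> %lu slabs (%lu bytes)\n",
	       megabytes, nslabs, nslabs << IO_TLB_SHIFT);	/* 64 MB -> 32768 slabs */
	return 0;
}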
- */ -void __init -swiotlb_init_with_default_size(size_t default_size) +void *__init swiotlb_alloc_boot(size_t size, unsigned long nslabs) { - unsigned long i, bytes; + void *start = alloc_bootmem_pages(size); + unsigned int i; int rc; - if (!iotlb_nslabs) { - iotlb_nslabs = (default_size >> IO_TLB_SHIFT); - iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE); - } - - bytes = iotlb_nslabs * (1UL << IO_TLB_SHIFT); - - /* - * Get IO TLB memory from the low pages - */ - io_tlb_start = alloc_bootmem_pages(bytes); - if (!io_tlb_start) - panic("Cannot allocate SWIOTLB buffer!\n"); - dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT; - for (i = 0; i < iotlb_nslabs; i += IO_TLB_SEGSIZE) { + for (i = 0; i < nslabs; i += IO_TLB_SEGSIZE) { do { rc = xen_create_contiguous_region( - (unsigned long)io_tlb_start + (i << IO_TLB_SHIFT), + (unsigned long)start + (i << IO_TLB_SHIFT), get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT), dma_bits); } while (rc && dma_bits++ < max_dma_bits); @@ -158,12 +130,12 @@ swiotlb_init_with_default_size(size_t de panic("No suitable physical memory available for SWIOTLB buffer!\n" "Use dom0_mem Xen boot parameter to reserve\n" "some DMA memory (e.g., dom0_mem=-128M).\n"); - iotlb_nslabs = i; + io_tlb_nslabs = i; i <<= IO_TLB_SHIFT; - free_bootmem(__pa(io_tlb_start + i), bytes - i); - bytes = i; + free_bootmem(__pa(start + i), size - i); + size = i; for (dma_bits = 0; i > 0; i -= IO_TLB_SEGSIZE << IO_TLB_SHIFT) { - unsigned int bits = fls64(virt_to_bus(io_tlb_start + i - 1)); + unsigned int bits = fls64(virt_to_bus(start + i - 1)); if (bits > dma_bits) dma_bits = bits; @@ -171,18 +143,88 @@ swiotlb_init_with_default_size(size_t de break; } } + + return start; +} + +#ifndef CONFIG_XEN +void * __weak swiotlb_alloc(unsigned order, unsigned long nslabs) +{ + return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order); +} +#endif + +dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr) +{ + return phys_to_machine(paddr); +} + +phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr) +{ + return machine_to_phys(baddr); +} + +static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev, + volatile void *address) +{ + return swiotlb_phys_to_bus(hwdev, virt_to_phys(address)); +} + +static void *swiotlb_bus_to_virt(dma_addr_t address) +{ + return phys_to_virt(swiotlb_bus_to_phys(address)); +} + +int __weak swiotlb_arch_range_needs_mapping(void *ptr, size_t size) +{ + return 0; +} + +static void swiotlb_print_info(unsigned long bytes) +{ + printk(KERN_INFO "Software IO TLB enabled: \n" + " Aperture: %lu megabytes\n" + " Address size: %u bits\n" + " Kernel range: %p - %p\n", + bytes >> 20, dma_bits, + io_tlb_start, io_tlb_end); +} + +/* + * Statically reserve bounce buffer space and initialize bounce buffer data + * structures for the software IO TLB used to implement the PCI DMA API. + */ +void __init +swiotlb_init_with_default_size(size_t default_size) +{ + unsigned long i, bytes; + int rc; + + if (!io_tlb_nslabs) { + io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); + io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); + } + + bytes = io_tlb_nslabs << IO_TLB_SHIFT; + + /* + * Get IO TLB memory from the low pages + */ + io_tlb_start = swiotlb_alloc_boot(bytes, io_tlb_nslabs); + if (!io_tlb_start) + panic("Cannot allocate SWIOTLB buffer!\n"); + bytes = io_tlb_nslabs << IO_TLB_SHIFT; io_tlb_end = io_tlb_start + bytes; /* * Allocate and initialize the free list array. This array is used * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE. 
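/*
 * Why swiotlb_phys_to_bus()/swiotlb_bus_to_phys() exist: a Xen guest's
 * pseudo-physical addresses differ from the machine addresses that
 * devices actually DMA to, with the p2m table deciding the mapping.
 * Standalone illustration; the p2m table below is entirely made up
 * (the real one is maintained by the hypervisor and tools, and is
 * rarely an identity map).
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12

static const uint64_t p2m[4] = { 0x2a7, 0x09c, 0x113, 0x3f0 };	/* fake pfn -> mfn */

static uint64_t example_phys_to_machine(uint64_t paddr)
{
	uint64_t mfn = p2m[paddr >> PAGE_SHIFT];
	uint64_t offset = paddr & ((1ULL << PAGE_SHIFT) - 1);

	return (mfn << PAGE_SHIFT) | offset;
}

int main(void)
{
	uint64_t paddr = (2ULL << PAGE_SHIFT) + 0x80;

	/* pfn 2 maps to mfn 0x113, so 0x2080 becomes 0x113080 */
	printf("pseudo-physical 0x%llx -> machine 0x%llx\n",
	       (unsigned long long)paddr,
	       (unsigned long long)example_phys_to_machine(paddr));
	return 0;
}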
*/ - io_tlb_list = alloc_bootmem(iotlb_nslabs * sizeof(int)); - for (i = 0; i < iotlb_nslabs; i++) + io_tlb_list = alloc_bootmem(io_tlb_nslabs * sizeof(int)); + for (i = 0; i < io_tlb_nslabs; i++) io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); io_tlb_index = 0; - io_tlb_orig_addr = alloc_bootmem( - iotlb_nslabs * sizeof(*io_tlb_orig_addr)); + io_tlb_orig_addr = alloc_bootmem(io_tlb_nslabs * sizeof(phys_addr_t)); /* * Get the overflow emergency buffer @@ -200,13 +242,7 @@ swiotlb_init_with_default_size(size_t de if (rc) panic("No suitable physical memory available for SWIOTLB overflow buffer!\n"); - printk(KERN_INFO "Software IO TLB enabled: \n" - " Aperture: %lu megabytes\n" - " Kernel range: %p - %p\n" - " Address size: %u bits\n", - bytes >> 20, - io_tlb_start, io_tlb_end, - dma_bits); + swiotlb_print_info(bytes); } void __init @@ -233,6 +269,11 @@ swiotlb_init(void) printk(KERN_INFO "Software IO TLB disabled\n"); } +static inline int range_needs_mapping(phys_addr_t pa, size_t size) +{ + return range_straddles_page_boundary(pa, size); +} + static int is_swiotlb_buffer(dma_addr_t addr) { unsigned long pfn = mfn_to_local_pfn(PFN_DOWN(addr)); @@ -246,46 +287,50 @@ static int is_swiotlb_buffer(dma_addr_t } /* + * Bounce: copy the swiotlb buffer back to the original dma location + * * We use __copy_to_user_inatomic to transfer to the host buffer because the * buffer may be mapped read-only (e.g, in blkback driver) but lower-level * drivers map the buffer for DMA_BIDIRECTIONAL access. This causes an * unnecessary copy from the aperture to the host buffer, and a page fault. */ -static void -__sync_single(struct phys_addr buffer, char *dma_addr, size_t size, int dir) +static void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size, + enum dma_data_direction dir) { - if (PageHighMem(buffer.page)) { - size_t len, bytes; - char *dev, *host, *kmp; - len = size; - while (len != 0) { - unsigned long flags; - - if (((bytes = len) + buffer.offset) > PAGE_SIZE) - bytes = PAGE_SIZE - buffer.offset; - local_irq_save(flags); /* protects KM_BOUNCE_READ */ - kmp = kmap_atomic(buffer.page, KM_BOUNCE_READ); - dev = dma_addr + size - len; - host = kmp + buffer.offset; - if (dir == DMA_FROM_DEVICE) { - if (__copy_to_user_inatomic(host, dev, bytes)) - /* inaccessible */; - } else - memcpy(dev, host, bytes); - kunmap_atomic(kmp, KM_BOUNCE_READ); + unsigned long pfn = PFN_DOWN(phys); + + if (PageHighMem(pfn_to_page(pfn))) { + /* The buffer does not have a mapping. 
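/*
 * Standalone sketch of the page-boundary test behind the new
 * range_needs_mapping() above: under Xen, pseudo-physically contiguous
 * pages need not be machine-contiguous, so any buffer crossing a page
 * boundary is bounced. The tree's real range_straddles_page_boundary()
 * additionally checks whether the spanned machine frames happen to be
 * contiguous; this sketch shows only the boundary test.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define PAGE_SHIFT	12

static bool crosses_page_boundary(uint64_t paddr, size_t size)
{
	return (paddr >> PAGE_SHIFT) != ((paddr + size - 1) >> PAGE_SHIFT);
}

int main(void)
{
	/* 0xff0 + 0x20 spills into the next page; 0x100 + 0x20 does not */
	return crosses_page_boundary(0xff0, 0x20) &&
	       !crosses_page_boundary(0x100, 0x20) ? 0 : 1;
}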
Map it in and copy */ + unsigned int offset = phys & ~PAGE_MASK; + char *buffer; + unsigned int sz = 0; + unsigned long flags; + + while (size) { + sz = min((size_t)(PAGE_SIZE - offset), size); + + local_irq_save(flags); + buffer = kmap_atomic(pfn_to_page(pfn), + KM_BOUNCE_READ); + if (dir == DMA_TO_DEVICE) + memcpy(dma_addr, buffer + offset, sz); + else if (__copy_to_user_inatomic(buffer + offset, + dma_addr, sz)) + /* inaccessible */; + kunmap_atomic(buffer, KM_BOUNCE_READ); local_irq_restore(flags); - len -= bytes; - buffer.page++; - buffer.offset = 0; + + size -= sz; + pfn++; + dma_addr += sz; + offset = 0; } } else { - char *host = (char *)phys_to_virt( - page_to_pseudophys(buffer.page)) + buffer.offset; - if (dir == DMA_FROM_DEVICE) { - if (__copy_to_user_inatomic(host, dma_addr, size)) - /* inaccessible */; - } else if (dir == DMA_TO_DEVICE) - memcpy(dma_addr, host, size); + if (dir == DMA_TO_DEVICE) + memcpy(dma_addr, phys_to_virt(phys), size); + else if (__copy_to_user_inatomic(phys_to_virt(phys), + dma_addr, size)) + /* inaccessible */; } } @@ -293,12 +338,11 @@ __sync_single(struct phys_addr buffer, c * Allocates bounce buffer and returns its kernel virtual address. */ static void * -map_single(struct device *hwdev, struct phys_addr buffer, size_t size, int dir) +map_single(struct device *hwdev, phys_addr_t phys, size_t size, int dir) { unsigned long flags; char *dma_addr; unsigned int nslots, stride, index, wrap; - struct phys_addr slot_buf; int i; unsigned long mask; unsigned long offset_slots; @@ -306,6 +350,10 @@ map_single(struct device *hwdev, struct mask = dma_get_seg_boundary(hwdev); offset_slots = -IO_TLB_SEGSIZE; + + /* + * Carefully handle integer overflow which can occur when mask == ~0UL. + */ max_slots = mask + 1 ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); @@ -328,7 +376,7 @@ map_single(struct device *hwdev, struct */ spin_lock_irqsave(&io_tlb_lock, flags); index = ALIGN(io_tlb_index, stride); - if (index >= iotlb_nslabs) + if (index >= io_tlb_nslabs) index = 0; wrap = index; @@ -336,7 +384,7 @@ map_single(struct device *hwdev, struct while (iommu_is_span_boundary(index, nslots, offset_slots, max_slots)) { index += stride; - if (index >= iotlb_nslabs) + if (index >= io_tlb_nslabs) index = 0; if (index == wrap) goto not_found; @@ -360,13 +408,13 @@ map_single(struct device *hwdev, struct * Update the indices to avoid searching in the next * round. */ - io_tlb_index = ((index + nslots) < iotlb_nslabs + io_tlb_index = ((index + nslots) < io_tlb_nslabs ? (index + nslots) : 0); goto found; } index += stride; - if (index >= iotlb_nslabs) + if (index >= io_tlb_nslabs) index = 0; } while (index != wrap); @@ -381,29 +429,14 @@ found: * This is needed when we sync the memory. Then we sync the buffer if * needed. 
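/*
 * Worked example of the integer-overflow comment added to map_single()
 * above: for a device mask of ~0UL, mask + 1 wraps to zero and cannot
 * be fed to the boundary arithmetic, so a fallback slot count is used
 * instead. Standalone sketch with the kernel constants inlined.
 */
#include <stdio.h>

#define IO_TLB_SHIFT	11
#define BITS_PER_LONG	((int)(8 * sizeof(long)))
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((unsigned long)(a) - 1))

static unsigned long max_slots_for(unsigned long mask)
{
	return mask + 1
		? ALIGN_UP(mask + 1, 1UL << IO_TLB_SHIFT) >> IO_TLB_SHIFT
		: 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
}

int main(void)
{
	printf("mask 0x00ffffff -> %lu slots\n", max_slots_for(0x00ffffffUL));	/* 8192 */
	printf("mask ~0UL       -> %lu slots\n", max_slots_for(~0UL));	/* fallback */
	return 0;
}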
*/ - slot_buf = buffer; - for (i = 0; i < nslots; i++) { - slot_buf.page += slot_buf.offset >> PAGE_SHIFT; - slot_buf.offset &= PAGE_SIZE - 1; - io_tlb_orig_addr[index+i] = slot_buf; - slot_buf.offset += 1 << IO_TLB_SHIFT; - } - if ((dir == DMA_TO_DEVICE) || (dir == DMA_BIDIRECTIONAL)) - __sync_single(buffer, dma_addr, size, DMA_TO_DEVICE); + for (i = 0; i < nslots; i++) + io_tlb_orig_addr[index+i] = phys + (i << IO_TLB_SHIFT); + if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) + swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE); return dma_addr; } -static struct phys_addr dma_addr_to_phys_addr(char *dma_addr) -{ - int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; - struct phys_addr buffer = io_tlb_orig_addr[index]; - buffer.offset += (long)dma_addr & ((1 << IO_TLB_SHIFT) - 1); - buffer.page += buffer.offset >> PAGE_SHIFT; - buffer.offset &= PAGE_SIZE - 1; - return buffer; -} - /* * dma_addr is the kernel virtual address of the bounce buffer to unmap. */ @@ -413,13 +446,13 @@ unmap_single(struct device *hwdev, char unsigned long flags; int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; - struct phys_addr buffer = dma_addr_to_phys_addr(dma_addr); + phys_addr_t phys = io_tlb_orig_addr[index]; /* * First, sync the memory before unmapping the entry */ - if ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL)) - __sync_single(buffer, dma_addr, size, DMA_FROM_DEVICE); + if (phys && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))) + swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE); /* * Return the buffer to the free list by setting the corresponding @@ -453,9 +486,13 @@ unmap_single(struct device *hwdev, char static void sync_single(struct device *hwdev, char *dma_addr, size_t size, int dir) { - struct phys_addr buffer = dma_addr_to_phys_addr(dma_addr); + int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; + phys_addr_t phys = io_tlb_orig_addr[index]; + + phys += ((unsigned long)dma_addr & ((1 << IO_TLB_SHIFT) - 1)); + BUG_ON((dir != DMA_FROM_DEVICE) && (dir != DMA_TO_DEVICE)); - __sync_single(buffer, dma_addr, size, dir); + swiotlb_bounce(phys, dma_addr, size, dir); } static void @@ -469,7 +506,7 @@ swiotlb_full(struct device *dev, size_t * the damage, or panic when the transfer is too big. */ printk(KERN_ERR "PCI-DMA: Out of SW-IOMMU space for %zu bytes at " - "device %s\n", size, dev ? dev->bus_id : "?"); + "device %s\n", size, dev ? dev_name(dev) : "?"); if (size > io_tlb_overflow && do_panic) { if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) @@ -494,7 +531,6 @@ _swiotlb_map_single(struct device *hwdev dma_addr_t dev_addr = gnttab_dma_map_page(page) + offset_in_page(paddr); void *map; - struct phys_addr buffer; BUG_ON(dir == DMA_NONE); @@ -503,23 +539,21 @@ _swiotlb_map_single(struct device *hwdev * we can safely return the device addr and not worry about bounce * buffering it. */ - if (!range_straddles_page_boundary(paddr, size) && - !address_needs_mapping(hwdev, dev_addr, size)) + if (!address_needs_mapping(hwdev, dev_addr, size) && + !range_needs_mapping(paddr, size)) return dev_addr; /* * Oh well, have to allocate and map a bounce buffer. 
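/*
 * Standalone sketch of the per-slot bookkeeping introduced above:
 * map_single() records one phys_addr_t per 2 KiB slot, and the sync
 * path rebuilds the original address from the slot index plus the
 * intra-slab offset. Buffer size and addresses are illustrative.
 */
#include <stdint.h>
#include <stdio.h>

#define IO_TLB_SHIFT	11
#define NSLABS		8

static uint64_t io_tlb_orig_addr[NSLABS];
static char io_tlb[NSLABS << IO_TLB_SHIFT];

int main(void)
{
	uint64_t phys = 0x12345800;	/* made-up original address */
	int index = 2, nslots = 3, i;

	/* as in map_single(): one entry per occupied slot */
	for (i = 0; i < nslots; i++)
		io_tlb_orig_addr[index + i] = phys + ((uint64_t)i << IO_TLB_SHIFT);

	/* as in sync_single(): recover the original address of a bounce byte */
	char *dma_addr = io_tlb + (index << IO_TLB_SHIFT) + 0x123;
	int slot = (int)((dma_addr - io_tlb) >> IO_TLB_SHIFT);
	uint64_t recovered = io_tlb_orig_addr[slot] +
			     ((dma_addr - io_tlb) & ((1 << IO_TLB_SHIFT) - 1));

	printf("recovered 0x%llx, expected 0x%llx\n",
	       (unsigned long long)recovered,
	       (unsigned long long)(phys + 0x123));
	return 0;
}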
*/ gnttab_dma_unmap_page(dev_addr); - buffer.page = page; - buffer.offset = offset_in_page(paddr); - map = map_single(hwdev, buffer, size, dir); + map = map_single(hwdev, paddr, size, dir); if (!map) { swiotlb_full(hwdev, size, dir, 1); map = io_tlb_overflow_buffer; } - dev_addr = virt_to_bus(map); + dev_addr = swiotlb_virt_to_bus(hwdev, map); return dev_addr; } @@ -536,6 +570,7 @@ swiotlb_map_single(struct device *hwdev, { return _swiotlb_map_single(hwdev, virt_to_phys(ptr), size, dir, NULL); } +EXPORT_SYMBOL(swiotlb_map_single); dma_addr_t swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size, int dir) @@ -555,7 +590,7 @@ void swiotlb_unmap_single_attrs(struct device *hwdev, dma_addr_t dev_addr, size_t size, int dir, struct dma_attrs *attrs) { - char *dma_addr = bus_to_virt(dev_addr); + char *dma_addr = swiotlb_bus_to_virt(dev_addr); BUG_ON(dir == DMA_NONE); if (is_swiotlb_buffer(dev_addr)) @@ -571,6 +606,8 @@ swiotlb_unmap_single(struct device *hwde { return swiotlb_unmap_single_attrs(hwdev, dev_addr, size, dir, NULL); } +EXPORT_SYMBOL(swiotlb_unmap_single); + /* * Make physical memory consistent for a single streaming mode DMA translation * after a transfer. @@ -585,48 +622,50 @@ void swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr, size_t size, int dir) { - char *dma_addr = bus_to_virt(dev_addr); + char *dma_addr = swiotlb_bus_to_virt(dev_addr); BUG_ON(dir == DMA_NONE); if (is_swiotlb_buffer(dev_addr)) sync_single(hwdev, dma_addr, size, dir); } +EXPORT_SYMBOL(swiotlb_sync_single_for_cpu); void swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr, size_t size, int dir) { - char *dma_addr = bus_to_virt(dev_addr); + char *dma_addr = swiotlb_bus_to_virt(dev_addr); BUG_ON(dir == DMA_NONE); if (is_swiotlb_buffer(dev_addr)) sync_single(hwdev, dma_addr, size, dir); } +EXPORT_SYMBOL(swiotlb_sync_single_for_device); void swiotlb_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dev_addr, unsigned long offset, size_t size, int dir) { - char *dma_addr = bus_to_virt(dev_addr); + char *dma_addr = swiotlb_bus_to_virt(dev_addr); BUG_ON(dir == DMA_NONE); if (is_swiotlb_buffer(dev_addr)) sync_single(hwdev, dma_addr + offset, size, dir); } +EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_cpu); void swiotlb_sync_single_range_for_device(struct device *hwdev, dma_addr_t dev_addr, unsigned long offset, size_t size, int dir) { - char *dma_addr = bus_to_virt(dev_addr); + char *dma_addr = swiotlb_bus_to_virt(dev_addr); BUG_ON(dir == DMA_NONE); if (is_swiotlb_buffer(dev_addr)) sync_single(hwdev, dma_addr + offset, size, dir); } +EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_device); -void swiotlb_unmap_sg_attrs(struct device *, struct scatterlist *, int, int, - struct dma_attrs *); /* * Map a set of buffers described by scatterlist in streaming mode for DMA. 
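/*
 * Caller-side pattern implied by the overflow-buffer fallback above:
 * mapping failures are reported through swiotlb_dma_mapping_error(),
 * not by a null return. Sketch only; example_map and its arguments
 * are illustrative, while the swiotlb_* signatures are the ones
 * defined in this file.
 */
#include <linux/dma-mapping.h>
#include <linux/errno.h>

static int example_map(struct device *hwdev, void *buf, size_t len)
{
	dma_addr_t handle = swiotlb_map_single(hwdev, buf, len, DMA_TO_DEVICE);

	if (swiotlb_dma_mapping_error(hwdev, handle))
		return -ENOMEM;	/* got the overflow buffer, not a usable mapping */
	/* ... program the device with 'handle' ... */
	swiotlb_unmap_single(hwdev, handle, len, DMA_TO_DEVICE);
	return 0;
}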
* This is the scatter-gather version of the above swiotlb_map_single @@ -648,23 +687,23 @@ swiotlb_map_sg_attrs(struct device *hwde int dir, struct dma_attrs *attrs) { struct scatterlist *sg; - struct phys_addr buffer; - dma_addr_t dev_addr; - char *map; int i; BUG_ON(dir == DMA_NONE); for_each_sg(sgl, sg, nelems, i) { - dev_addr = gnttab_dma_map_page(sg_page(sg)) + sg->offset; + dma_addr_t dev_addr = gnttab_dma_map_page(sg_page(sg)) + + sg->offset; + phys_addr_t paddr = page_to_pseudophys(sg_page(sg)) + + sg->offset; - if (range_straddles_page_boundary(page_to_pseudophys(sg_page(sg)) - + sg->offset, sg->length) + if (range_needs_mapping(paddr, sg->length) || address_needs_mapping(hwdev, dev_addr, sg->length)) { + void *map; + gnttab_dma_unmap_page(dev_addr); - buffer.page = sg_page(sg); - buffer.offset = sg->offset; - map = map_single(hwdev, buffer, sg->length, dir); + map = map_single(hwdev, paddr, + sg->length, dir); if (!map) { /* Don't panic here, we expect map_sg users to do proper error handling. */ @@ -674,7 +713,7 @@ swiotlb_map_sg_attrs(struct device *hwde sgl[0].dma_length = 0; return 0; } - sg->dma_address = virt_to_bus(map); + sg->dma_address = swiotlb_virt_to_bus(hwdev, map); } else sg->dma_address = dev_addr; sg->dma_length = sg->length; @@ -689,6 +728,7 @@ swiotlb_map_sg(struct device *hwdev, str { return swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL); } +EXPORT_SYMBOL(swiotlb_map_sg); /* * Unmap a set of streaming mode DMA translations. Again, cpu read rules @@ -705,7 +745,7 @@ swiotlb_unmap_sg_attrs(struct device *hw for_each_sg(sgl, sg, nelems, i) { if (sg->dma_address != sg_phys(sg)) - unmap_single(hwdev, bus_to_virt(sg->dma_address), + unmap_single(hwdev, swiotlb_bus_to_virt(sg->dma_address), sg->dma_length, dir); else gnttab_dma_unmap_page(sg->dma_address); @@ -719,6 +759,7 @@ swiotlb_unmap_sg(struct device *hwdev, s { return swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL); } +EXPORT_SYMBOL(swiotlb_unmap_sg); /* * Make physical memory consistent for a set of streaming mode DMA translations @@ -738,10 +779,11 @@ swiotlb_sync_sg_for_cpu(struct device *h for_each_sg(sgl, sg, nelems, i) { if (sg->dma_address != sg_phys(sg)) - sync_single(hwdev, bus_to_virt(sg->dma_address), + sync_single(hwdev, swiotlb_bus_to_virt(sg->dma_address), sg->dma_length, dir); } } +EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu); void swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sgl, @@ -754,16 +796,18 @@ swiotlb_sync_sg_for_device(struct device for_each_sg(sgl, sg, nelems, i) { if (sg->dma_address != sg_phys(sg)) - sync_single(hwdev, bus_to_virt(sg->dma_address), + sync_single(hwdev, swiotlb_bus_to_virt(sg->dma_address), sg->dma_length, dir); } } +EXPORT_SYMBOL(swiotlb_sync_sg_for_device); int swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr) { - return (dma_addr == virt_to_bus(io_tlb_overflow_buffer)); + return (dma_addr == swiotlb_virt_to_bus(hwdev, io_tlb_overflow_buffer)); } +EXPORT_SYMBOL(swiotlb_dma_mapping_error); /* * Return whether the given PCI device DMA address mask can be supported @@ -776,14 +820,4 @@ swiotlb_dma_supported (struct device *hw { return (mask >= ((1UL << dma_bits) - 1)); } - -EXPORT_SYMBOL(swiotlb_map_single); -EXPORT_SYMBOL(swiotlb_unmap_single); -EXPORT_SYMBOL(swiotlb_map_sg); -EXPORT_SYMBOL(swiotlb_unmap_sg); -EXPORT_SYMBOL(swiotlb_sync_single_for_cpu); -EXPORT_SYMBOL(swiotlb_sync_single_for_device); -EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu); -EXPORT_SYMBOL(swiotlb_sync_sg_for_device); 
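/*
 * Matching caller pattern for the scatter-gather path above: on
 * failure swiotlb_map_sg_attrs() zeroes sgl[0].dma_length and returns
 * 0, so callers must test the return value rather than poke at
 * individual entries. Sketch only; example_map_sg is illustrative.
 */
#include <linux/dma-mapping.h>
#include <linux/errno.h>
#include <linux/scatterlist.h>

static int example_map_sg(struct device *hwdev,
			  struct scatterlist *sgl, int nelems)
{
	int mapped = swiotlb_map_sg(hwdev, sgl, nelems, DMA_TO_DEVICE);

	if (!mapped)
		return -ENOMEM;	/* nothing was mapped, nothing to unmap */
	/* ... hand sgl[0..mapped-1] dma_address/dma_length to the device ... */
	swiotlb_unmap_sg(hwdev, sgl, mapped, DMA_TO_DEVICE);
	return 0;
}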
-EXPORT_SYMBOL(swiotlb_dma_mapping_error); EXPORT_SYMBOL(swiotlb_dma_supported);