From: Linux Kernel Mailing List
Subject: Linux: 2.6.37
Patch-mainline: 2.6.37

This patch contains the differences between 2.6.36 and 2.6.37.

Acked-by: Jeff Mahoney
Automatically created from "patches.kernel.org/patch-2.6.37" by xen-port-patches.py

--- head-2011-03-17.orig/arch/x86/Kconfig	2011-02-17 13:43:12.000000000 +0100
+++ head-2011-03-17/arch/x86/Kconfig	2011-02-01 15:09:47.000000000 +0100
@@ -1782,7 +1782,6 @@ config USE_PERCPU_NUMA_NODE_ID
 	depends on NUMA
 
 menu "Power management and ACPI options"
-	depends on !XEN_UNPRIVILEGED_GUEST
 
 config ARCH_HIBERNATION_HEADER
 	def_bool y
@@ -1790,6 +1789,8 @@ config ARCH_HIBERNATION_HEADER
 
 source "kernel/power/Kconfig"
 
+if !XEN_UNPRIVILEGED_GUEST
+
 source "drivers/acpi/Kconfig"
 
 source "drivers/sfi/Kconfig"
@@ -1925,6 +1926,8 @@ source "drivers/cpuidle/Kconfig"
 
 source "drivers/idle/Kconfig"
 
+endif # !XEN_UNPRIVILEGED_GUEST
+
 endmenu
 
@@ -2005,7 +2008,7 @@ config PCI_OLPC
 
 config PCI_XEN
 	def_bool y
-	depends on PCI && XEN
+	depends on PCI && PARAVIRT_XEN
 	select SWIOTLB_XEN
 
 config PCI_DOMAINS
@@ -2030,21 +2033,6 @@ config PCI_CNB20LE_QUIRK
 
 	  You should say N unless you know you need this.
 
-config XEN_PCIDEV_FRONTEND
-	def_bool y
-	prompt "Xen PCI Frontend" if X86_64
-	depends on PCI && XEN && (PCI_GOXEN_FE || PCI_GOANY || X86_64)
-	select HOTPLUG
-	help
-	  The PCI device frontend driver allows the kernel to import arbitrary
-	  PCI devices from a PCI backend to support PCI driver domains.
-
-config XEN_PCIDEV_FE_DEBUG
-	bool "Xen PCI Frontend Debugging"
-	depends on XEN_PCIDEV_FRONTEND
-	help
-	  Enables some debug statements within the PCI Frontend.
-
 config DMAR
 	bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
 	depends on PCI_MSI && ACPI && !XEN && EXPERIMENTAL
--- head-2011-03-17.orig/arch/x86/include/asm/hw_irq.h	2011-02-01 14:55:46.000000000 +0100
+++ head-2011-03-17/arch/x86/include/asm/hw_irq.h	2011-02-01 15:09:47.000000000 +0100
@@ -78,6 +78,7 @@ static inline void set_io_apic_irq_attr(
 	irq_attr->polarity = polarity;
 }
 
+#ifndef CONFIG_XEN
 struct irq_2_iommu {
 	struct intel_iommu *iommu;
 	u16 irte_index;
@@ -85,7 +86,6 @@ struct irq_2_iommu {
 	u8 irte_mask;
 };
 
-#ifndef CONFIG_XEN
 /*
  * This is performance-critical, we want to do it O(1)
  *
@@ -147,6 +147,7 @@ extern irqreturn_t smp_reschedule_interr
 extern irqreturn_t smp_call_function_interrupt(int, void *);
 extern irqreturn_t smp_call_function_single_interrupt(int, void *);
 extern irqreturn_t smp_reboot_interrupt(int, void *);
+extern irqreturn_t smp_irq_work_interrupt(int, void *);
 #endif
 
 #endif
--- head-2011-03-17.orig/arch/x86/include/asm/io.h	2011-03-17 14:35:43.000000000 +0100
+++ head-2011-03-17/arch/x86/include/asm/io.h	2011-02-01 15:09:47.000000000 +0100
@@ -353,7 +353,7 @@ extern void early_iounmap(void __iomem *
 extern void fixup_early_ioremap(void);
 extern bool is_early_ioremap_ptep(pte_t *ptep);
 
-#ifdef CONFIG_XEN
+#ifdef CONFIG_PARAVIRT_XEN
 struct bio_vec;
 
 extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
@@ -362,7 +362,7 @@ extern bool xen_biovec_phys_mergeable(co
 #define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
 	(__BIOVEC_PHYS_MERGEABLE(vec1, vec2) && \
 	 (!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2)))
-#endif /* CONFIG_XEN */
+#endif /* CONFIG_PARAVIRT_XEN */
 
 #define IO_SPACE_LIMIT 0xffff
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/fixmap.h	2011-02-01 15:03:03.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/fixmap.h	2011-02-01 15:09:47.000000000 +0100
@@ -217,5 +217,20 @@ static inline unsigned long virt_to_fix(
 	BUG_ON(vaddr >=
FIXADDR_TOP || vaddr < FIXADDR_START); return __virt_to_fix(vaddr); } + +/* Return an pointer with offset calculated */ +static __always_inline unsigned long +__set_fixmap_offset(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) +{ + __set_fixmap(idx, phys, flags); + return fix_to_virt(idx) + (phys & (PAGE_SIZE - 1)); +} + +#define set_fixmap_offset(idx, phys) \ + __set_fixmap_offset(idx, phys, PAGE_KERNEL) + +#define set_fixmap_offset_nocache(idx, phys) \ + __set_fixmap_offset(idx, phys, PAGE_KERNEL_NOCACHE) + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_FIXMAP_H */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/highmem.h 2011-02-01 15:04:27.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/highmem.h 2011-02-01 15:09:47.000000000 +0100 @@ -58,15 +58,16 @@ extern void kunmap_high(struct page *pag void *kmap(struct page *page); void kunmap(struct page *page); -void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); -void *kmap_atomic(struct page *page, enum km_type type); -void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type); -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); -void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); + +void *kmap_atomic_prot(struct page *page, pgprot_t prot); +void *__kmap_atomic(struct page *page); +void __kunmap_atomic(void *kvaddr); +void *kmap_atomic_pfn(unsigned long pfn); +void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); struct page *kmap_atomic_to_page(void *ptr); -#define kmap_atomic_pte(page, type) \ - kmap_atomic_prot(page, type, \ +#define kmap_atomic_pte(page) \ + kmap_atomic_prot(page, \ PagePinned(page) ? PAGE_KERNEL_RO : kmap_prot) #define flush_cache_kmaps() do { } while (0) --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/io.h 2011-02-01 15:03:03.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/io.h 2011-02-01 15:09:47.000000000 +0100 @@ -212,6 +212,7 @@ static inline void __iomem *ioremap(reso extern void iounmap(volatile void __iomem *addr); +extern void set_iounmap_nonlazy(void); #ifdef __KERNEL__ @@ -353,6 +354,7 @@ extern void __iomem *early_memremap(reso unsigned long size); extern void early_iounmap(void __iomem *addr, unsigned long size); extern void fixup_early_ioremap(void); +extern bool is_early_ioremap_ptep(pte_t *ptep); #define IO_SPACE_LIMIT 0xffff --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/irq_vectors.h 2011-02-15 17:49:16.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/irq_vectors.h 2011-02-15 17:50:13.000000000 +0100 @@ -13,7 +13,12 @@ #define NMI_VECTOR 0x02 #define CALL_FUNC_SINGLE_VECTOR 3 #define REBOOT_VECTOR 4 +#ifdef CONFIG_IRQ_WORK +#define IRQ_WORK_VECTOR 5 +#define NR_IPIS 6 +#else #define NR_IPIS 5 +#endif /* * The maximum number of vectors supported by i386 processors --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/irqflags.h 2011-02-01 14:54:13.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/irqflags.h 2011-02-01 15:09:47.000000000 +0100 @@ -47,19 +47,19 @@ void xen_safe_halt(void); void xen_halt(void); -#define __raw_local_save_flags() xen_save_fl() +#define arch_local_save_flags() xen_save_fl() -#define raw_local_irq_restore(flags) xen_restore_fl(flags) +#define arch_local_irq_restore(flags) xen_restore_fl(flags) -#define raw_local_irq_disable() xen_irq_disable() +#define arch_local_irq_disable() xen_irq_disable() -#define raw_local_irq_enable() xen_irq_enable() +#define arch_local_irq_enable() xen_irq_enable() /* * 
Used in the idle loop; sti takes one instruction cycle * to complete: */ -static inline void raw_safe_halt(void) +static inline void arch_safe_halt(void) { xen_safe_halt(); } @@ -76,11 +76,11 @@ static inline void halt(void) /* * For spinlocks, etc: */ -#define __raw_local_irq_save() \ +#define arch_local_irq_save() \ ({ \ - unsigned long flags = __raw_local_save_flags(); \ + unsigned long flags = arch_local_save_flags(); \ \ - raw_local_irq_disable(); \ + arch_local_irq_disable(); \ \ flags; \ }) @@ -140,22 +140,16 @@ sysexit_ecrit: /**** END OF SYSEXIT CRIT #endif /* __ASSEMBLY__ */ #ifndef __ASSEMBLY__ -#define raw_local_save_flags(flags) \ - do { (flags) = __raw_local_save_flags(); } while (0) - -#define raw_local_irq_save(flags) \ - do { (flags) = __raw_local_irq_save(); } while (0) - -static inline int raw_irqs_disabled_flags(unsigned long flags) +static inline int arch_irqs_disabled_flags(unsigned long flags) { return (flags != 0); } -#define raw_irqs_disabled() \ +#define arch_irqs_disabled() \ ({ \ - unsigned long flags = __raw_local_save_flags(); \ + unsigned long flags = arch_local_save_flags(); \ \ - raw_irqs_disabled_flags(flags); \ + arch_irqs_disabled_flags(flags); \ }) #else --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/pci.h 2011-02-01 15:04:27.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/pci.h 2011-02-01 15:09:47.000000000 +0100 @@ -7,6 +7,7 @@ #include #include #include +#include #ifdef __KERNEL__ @@ -100,9 +101,36 @@ static inline void early_quirks(void) { extern void pci_iommu_alloc(void); -/* MSI arch hooks */ -#define arch_setup_msi_irqs arch_setup_msi_irqs -#define arch_teardown_msi_irqs arch_teardown_msi_irqs +#if defined(CONFIG_PCI_MSI) && !defined(CONFIG_XEN) +/* MSI arch specific hooks */ +static inline int x86_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +{ + return x86_msi.setup_msi_irqs(dev, nvec, type); +} + +static inline void x86_teardown_msi_irqs(struct pci_dev *dev) +{ + x86_msi.teardown_msi_irqs(dev); +} + +static inline void x86_teardown_msi_irq(unsigned int irq) +{ + x86_msi.teardown_msi_irq(irq); +} +#define arch_setup_msi_irqs x86_setup_msi_irqs +#define arch_teardown_msi_irqs x86_teardown_msi_irqs +#define arch_teardown_msi_irq x86_teardown_msi_irq +/* implemented in arch/x86/kernel/apic/io_apic. 
*/ +int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); +void native_teardown_msi_irq(unsigned int irq); +/* default to the implementation in drivers/lib/msi.c */ +#define HAVE_DEFAULT_MSI_TEARDOWN_IRQS +void default_teardown_msi_irqs(struct pci_dev *dev); +#else +#define native_setup_msi_irqs NULL +#define native_teardown_msi_irq NULL +#define default_teardown_msi_irqs NULL +#endif #define PCI_DMA_BUS_IS_PHYS 0 --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/pgtable.h 2011-02-01 14:55:46.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/pgtable.h 2011-02-01 15:09:47.000000000 +0100 @@ -28,6 +28,8 @@ extern unsigned long empty_zero_page[PAG extern spinlock_t pgd_lock; extern struct list_head pgd_list; +extern struct mm_struct *pgd_page_get_mm(struct page *page); + #define set_pte(ptep, pte) xen_set_pte(ptep, pte) #define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte) @@ -637,6 +639,8 @@ static inline void ptep_set_wrprotect(st set_pte_at(mm, addr, ptep, pte_wrprotect(pte)); } +#define flush_tlb_fix_spurious_fault(vma, address) + /* * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); * --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/pgtable_32.h 2011-02-01 15:04:27.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/pgtable_32.h 2011-02-01 15:09:47.000000000 +0100 @@ -25,7 +25,7 @@ struct vm_area_struct; extern pgd_t *swapper_pg_dir; -extern pgd_t trampoline_pg_dir[1024]; +extern pgd_t initial_page_table[1024]; static inline void pgtable_cache_init(void) { } static inline void check_pgt_cache(void) { } @@ -48,24 +48,14 @@ extern void set_pmd_pfn(unsigned long, u #endif #if defined(CONFIG_HIGHPTE) -#define __KM_PTE \ - (in_nmi() ? KM_NMI_PTE : \ - in_irq() ? KM_IRQ_PTE : \ - KM_PTE0) #define pte_offset_map(dir, address) \ - ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), __KM_PTE) + \ + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir))) + \ pte_index((address))) -#define pte_offset_map_nested(dir, address) \ - ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \ - pte_index((address))) -#define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE) -#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1) +#define pte_unmap(pte) kunmap_atomic((pte)) #else #define pte_offset_map(dir, address) \ ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address))) -#define pte_offset_map_nested(dir, address) pte_offset_map((dir), (address)) #define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) #endif /* Clear a kernel PTE and flush it from the TLB */ --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/pgtable_64.h 2011-02-01 15:04:27.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/pgtable_64.h 2011-02-01 15:09:47.000000000 +0100 @@ -109,6 +109,8 @@ static inline void xen_pgd_clear(pgd_t * #define __pte_mfn(_pte) (((_pte).pte & PTE_PFN_MASK) >> PAGE_SHIFT) +extern void sync_global_pgds(unsigned long start, unsigned long end); + /* * Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. @@ -132,9 +134,7 @@ static inline int pgd_large(pgd_t pgd) { /* x86-64 always has all page tables mapped. 
*/ #define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) -#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address)) #define pte_unmap(pte) ((void)(pte))/* NOP */ -#define pte_unmap_nested(pte) ((void)(pte)) /* NOP */ #define update_mmu_cache(vma, address, ptep) do { } while (0) --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/processor.h 2011-03-03 16:47:27.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/processor.h 2011-03-03 16:47:48.000000000 +0100 @@ -120,6 +120,8 @@ struct cpuinfo_x86 { u16 phys_proc_id; /* Core id: */ u16 cpu_core_id; + /* Compute unit id */ + u8 compute_unit_id; #endif #ifdef CONFIG_SMP /* Index into per_cpu list: */ @@ -556,7 +558,7 @@ extern unsigned long mmu_cr4_features; static inline void set_in_cr4(unsigned long mask) { - unsigned cr4; + unsigned long cr4; mmu_cr4_features |= mask; cr4 = read_cr4(); @@ -566,7 +568,7 @@ static inline void set_in_cr4(unsigned l static inline void clear_in_cr4(unsigned long mask) { - unsigned cr4; + unsigned long cr4; mmu_cr4_features &= ~mask; cr4 = read_cr4(); @@ -718,31 +720,6 @@ extern unsigned long idle_halt; extern unsigned long idle_nomwait; extern bool c1e_detected; -#ifndef CONFIG_XEN -/* - * on systems with caches, caches must be flashed as the absolute - * last instruction before going into a suspended halt. Otherwise, - * dirty data can linger in the cache and become stale on resume, - * leading to strange errors. - * - * perform a variety of operations to guarantee that the compiler - * will not reorder instructions. wbinvd itself is serializing - * so the processor will not reorder. - * - * Systems without cache can just go into halt. - */ -static inline void wbinvd_halt(void) -{ - mb(); - /* check for clflush to determine if wbinvd is legal */ - if (cpu_has_clflush) - asm volatile("cli; wbinvd; 1: hlt; jmp 1b" : : : "memory"); - else - while (1) - halt(); -} -#endif - extern void enable_sep_cpu(void); extern int sysenter_setup(void); --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/smp.h 2011-03-03 16:10:16.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/smp.h 2011-03-03 16:12:15.000000000 +0100 @@ -57,7 +57,7 @@ struct smp_ops { void (*smp_prepare_cpus)(unsigned max_cpus); void (*smp_cpus_done)(unsigned max_cpus); - void (*smp_send_stop)(void); + void (*stop_other_cpus)(int wait); void (*smp_send_reschedule)(int cpu); int (*cpu_up)(unsigned cpu); @@ -76,7 +76,12 @@ extern struct smp_ops smp_ops; static inline void smp_send_stop(void) { - smp_ops.smp_send_stop(); + smp_ops.stop_other_cpus(0); +} + +static inline void stop_other_cpus(void) +{ + smp_ops.stop_other_cpus(1); } static inline void smp_prepare_boot_cpu(void) @@ -148,12 +153,16 @@ void smp_store_cpu_info(int id); extern int __cpu_disable(void); extern void __cpu_die(unsigned int cpu); -void xen_smp_send_stop(void); +void xen_stop_other_cpus(int wait); void xen_smp_send_reschedule(int cpu); void xen_send_call_func_ipi(const struct cpumask *mask); void xen_send_call_func_single_ipi(int cpu); -#define smp_send_stop xen_smp_send_stop +static inline void smp_send_stop(void) +{ + xen_stop_other_cpus(0); +} + #define smp_send_reschedule xen_smp_send_reschedule #define arch_send_call_function_single_ipi xen_send_call_func_single_ipi #define arch_send_call_function_ipi_mask xen_send_call_func_ipi --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/spinlock.h 2011-02-01 14:55:46.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/spinlock.h 2011-02-01 
15:09:47.000000000 +0100 @@ -200,16 +200,16 @@ static inline int __ticket_spin_is_conte static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock) { unsigned int token, count; - unsigned int flags = __raw_local_irq_save(); + unsigned int flags = arch_local_irq_save(); bool free; __ticket_spin_lock_preamble; if (likely(free)) { - raw_local_irq_restore(flags); + arch_local_irq_restore(flags); return; } token = xen_spin_adjust(lock, token); - raw_local_irq_restore(flags); + arch_local_irq_restore(flags); do { count = 1 << 10; __ticket_spin_lock_body; --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/swiotlb.h 2011-02-01 14:55:46.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/swiotlb.h 2011-02-01 15:09:47.000000000 +0100 @@ -1,6 +1,4 @@ #include_next -#define pci_swiotlb_detect() 1 - dma_addr_t swiotlb_map_single_phys(struct device *, phys_addr_t, size_t size, int dir); --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/tlbflush.h 2011-02-01 14:54:13.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/tlbflush.h 2011-02-01 15:09:47.000000000 +0100 @@ -111,6 +111,4 @@ static inline void flush_tlb_kernel_rang flush_tlb_all(); } -extern void zap_low_mappings(bool early); - #endif /* _ASM_X86_TLBFLUSH_H */ --- head-2011-03-17.orig/arch/x86/kernel/Makefile 2011-02-01 14:54:13.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/Makefile 2011-02-01 15:09:47.000000000 +0100 @@ -125,7 +125,6 @@ ifeq ($(CONFIG_X86_64),y) obj-y += vsmp_64.o endif -disabled-obj-$(CONFIG_XEN) := %_uv.o crash.o early-quirks.o hpet.o i8253.o \ - i8259.o irqinit.o pci-swiotlb.o reboot.o smpboot.o tsc.o tsc_sync.o \ - uv_%.o vsmp_64.o +disabled-obj-$(CONFIG_XEN) := crash.o early-quirks.o hpet.o i8253.o i8259.o \ + irqinit.o pci-swiotlb.o reboot.o smpboot.o tsc.o tsc_sync.o vsmp_64.o disabled-obj-$(CONFIG_XEN_UNPRIVILEGED_GUEST) += probe_roms_32.o --- head-2011-03-17.orig/arch/x86/kernel/acpi/sleep-xen.c 2011-02-01 15:04:27.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/acpi/sleep-xen.c 2011-02-01 15:09:47.000000000 +0100 @@ -7,11 +7,16 @@ #include #include +#include #include #include #include #include +#ifdef CONFIG_X86_32 +#include +#endif + #include "realmode/wakeup.h" #include "sleep.h" @@ -93,7 +98,7 @@ int acpi_save_state_mem(void) #ifndef CONFIG_64BIT header->pmode_entry = (u32)&wakeup_pmode_return; - header->pmode_cr3 = (u32)(swsusp_pg_dir - __PAGE_OFFSET); + header->pmode_cr3 = (u32)__pa(&initial_page_table); saved_magic = 0x12345678; #else /* CONFIG_64BIT */ header->trampoline_segment = setup_trampoline() >> 4; @@ -130,7 +135,7 @@ void acpi_restore_state_mem(void) void __init acpi_reserve_wakeup_memory(void) { #ifndef CONFIG_ACPI_PV_SLEEP - unsigned long mem; + phys_addr_t mem; if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) { printk(KERN_ERR @@ -138,15 +143,15 @@ void __init acpi_reserve_wakeup_memory(v return; } - mem = find_e820_area(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE); + mem = memblock_find_in_range(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE); - if (mem == -1L) { + if (mem == MEMBLOCK_ERROR) { printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); return; } acpi_realmode = (unsigned long) phys_to_virt(mem); acpi_wakeup_address = mem; - reserve_early(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP"); + memblock_x86_reserve_range(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP"); #endif } --- head-2011-03-17.orig/arch/x86/kernel/apic/io_apic-xen.c 2011-02-01 15:04:27.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/apic/io_apic-xen.c 2011-02-01 15:09:47.000000000 +0100 
@@ -144,13 +144,9 @@ struct irq_pin_list { struct irq_pin_list *next; }; -static struct irq_pin_list *get_one_free_irq_2_pin(int node) +static struct irq_pin_list *alloc_irq_pin_list(int node) { - struct irq_pin_list *pin; - - pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node); - - return pin; + return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node); } /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ @@ -163,10 +159,7 @@ static struct irq_cfg irq_cfgx[NR_IRQS]; int __init arch_early_irq_init(void) { struct irq_cfg *cfg; - struct irq_desc *desc; - int count; - int node; - int i; + int count, node, i; if (!legacy_pic->nr_legacy_irqs) { nr_irqs_gsi = 0; @@ -175,13 +168,15 @@ int __init arch_early_irq_init(void) cfg = irq_cfgx; count = ARRAY_SIZE(irq_cfgx); - node= cpu_to_node(boot_cpu_id); + node = cpu_to_node(0); + + /* Make sure the legacy interrupts are marked in the bitmap */ + irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs); for (i = 0; i < count; i++) { - desc = irq_to_desc(i); - desc->chip_data = &cfg[i]; - zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); - zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); + set_irq_chip_data(i, &cfg[i]); + zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node); + zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node); /* * For legacy IRQ's, start with assigning irq0 to irq15 to * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0. @@ -196,170 +191,88 @@ int __init arch_early_irq_init(void) } #ifdef CONFIG_SPARSE_IRQ -struct irq_cfg *irq_cfg(unsigned int irq) +static struct irq_cfg *irq_cfg(unsigned int irq) { - struct irq_cfg *cfg = NULL; - struct irq_desc *desc; - - desc = irq_to_desc(irq); - if (desc) - cfg = desc->chip_data; - - return cfg; + return get_irq_chip_data(irq); } -static struct irq_cfg *get_one_free_irq_cfg(int node) +static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node) { struct irq_cfg *cfg; - cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); - if (cfg) { - if (!zalloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) { - kfree(cfg); - cfg = NULL; - } else if (!zalloc_cpumask_var_node(&cfg->old_domain, - GFP_ATOMIC, node)) { - free_cpumask_var(cfg->domain); - kfree(cfg); - cfg = NULL; - } - } - + cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node); + if (!cfg) + return NULL; + if (!zalloc_cpumask_var_node(&cfg->domain, GFP_KERNEL, node)) + goto out_cfg; + if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node)) + goto out_domain; return cfg; +out_domain: + free_cpumask_var(cfg->domain); +out_cfg: + kfree(cfg); + return NULL; } -int arch_init_chip_data(struct irq_desc *desc, int node) -{ - struct irq_cfg *cfg; - - cfg = desc->chip_data; - if (!cfg) { - desc->chip_data = get_one_free_irq_cfg(node); - if (!desc->chip_data) { - printk(KERN_ERR "can not alloc irq_cfg\n"); - BUG_ON(1); - } - } - - return 0; -} - -/* for move_irq_desc */ -static void -init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node) +static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) { - struct irq_pin_list *old_entry, *head, *tail, *entry; - - cfg->irq_2_pin = NULL; - old_entry = old_cfg->irq_2_pin; - if (!old_entry) - return; - - entry = get_one_free_irq_2_pin(node); - if (!entry) + if (!cfg) return; + set_irq_chip_data(at, NULL); + free_cpumask_var(cfg->domain); + free_cpumask_var(cfg->old_domain); + kfree(cfg); +} - entry->apic = old_entry->apic; - entry->pin = old_entry->pin; - head = entry; - tail = entry; - old_entry = old_entry->next; - while (old_entry) { - 
entry = get_one_free_irq_2_pin(node); - if (!entry) { - entry = head; - while (entry) { - head = entry->next; - kfree(entry); - entry = head; - } - /* still use the old one */ - return; - } - entry->apic = old_entry->apic; - entry->pin = old_entry->pin; - tail->next = entry; - tail = entry; - old_entry = old_entry->next; - } +#else - tail->next = NULL; - cfg->irq_2_pin = head; +struct irq_cfg *irq_cfg(unsigned int irq) +{ + return irq < nr_irqs ? irq_cfgx + irq : NULL; } -static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg) +static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node) { - struct irq_pin_list *entry, *next; - - if (old_cfg->irq_2_pin == cfg->irq_2_pin) - return; + return irq_cfgx + irq; +} - entry = old_cfg->irq_2_pin; +static inline void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) { } - while (entry) { - next = entry->next; - kfree(entry); - entry = next; - } - old_cfg->irq_2_pin = NULL; -} +#endif -void arch_init_copy_chip_data(struct irq_desc *old_desc, - struct irq_desc *desc, int node) +static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node) { + int res = irq_alloc_desc_at(at, node); struct irq_cfg *cfg; - struct irq_cfg *old_cfg; - - cfg = get_one_free_irq_cfg(node); - - if (!cfg) - return; - - desc->chip_data = cfg; - old_cfg = old_desc->chip_data; - - cfg->vector = old_cfg->vector; - cfg->move_in_progress = old_cfg->move_in_progress; - cpumask_copy(cfg->domain, old_cfg->domain); - cpumask_copy(cfg->old_domain, old_cfg->old_domain); - - init_copy_irq_2_pin(old_cfg, cfg, node); -} + if (res < 0) { + if (res != -EEXIST) + return NULL; + cfg = get_irq_chip_data(at); + if (cfg) + return cfg; + } -static void free_irq_cfg(struct irq_cfg *cfg) -{ - free_cpumask_var(cfg->domain); - free_cpumask_var(cfg->old_domain); - kfree(cfg); + cfg = alloc_irq_cfg(at, node); + if (cfg) + set_irq_chip_data(at, cfg); + else + irq_free_desc(at); + return cfg; } -void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) +static int alloc_irq_from(unsigned int from, int node) { - struct irq_cfg *old_cfg, *cfg; - - old_cfg = old_desc->chip_data; - cfg = desc->chip_data; - - if (old_cfg == cfg) - return; - - if (old_cfg) { - free_irq_2_pin(old_cfg, cfg); - free_irq_cfg(old_cfg); - old_desc->chip_data = NULL; - } + return irq_alloc_desc_from(from, node); } -/* end for move_irq_desc */ -#else -struct irq_cfg *irq_cfg(unsigned int irq) +static void free_irq_at(unsigned int at, struct irq_cfg *cfg) { - return irq < nr_irqs ? irq_cfgx + irq : NULL; + free_irq_cfg(at, cfg); + irq_free_desc(at); } -#endif - struct io_apic { unsigned int index; unsigned int unused[3]; @@ -492,7 +405,7 @@ __ioapic_write_entry(int apic, int pin, io_apic_write(apic, 0x10 + 2*pin, eu.w1); } -void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); @@ -523,7 +436,7 @@ static void ioapic_mask_entry(int apic, * fast in the common case, and fast for shared ISA-space IRQs. 
*/ static int -add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin) +__add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) { struct irq_pin_list **last, *entry; @@ -535,7 +448,7 @@ add_pin_to_irq_node_nopanic(struct irq_c last = &entry->next; } - entry = get_one_free_irq_2_pin(node); + entry = alloc_irq_pin_list(node); if (!entry) { printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n", node, apic, pin); @@ -550,7 +463,7 @@ add_pin_to_irq_node_nopanic(struct irq_c static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) { - if (add_pin_to_irq_node_nopanic(cfg, node, apic, pin)) + if (__add_pin_to_irq_node(cfg, node, apic, pin)) panic("IO-APIC: failed to add irq-pin. Can not proceed\n"); } @@ -613,11 +526,6 @@ static void __unmask_and_level_IO_APIC_i IO_APIC_REDIR_LEVEL_TRIGGER, NULL); } -static void __unmask_IO_APIC_irq(struct irq_cfg *cfg) -{ - io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL); -} - static void io_apic_sync(struct irq_pin_list *entry) { /* @@ -629,44 +537,37 @@ static void io_apic_sync(struct irq_pin_ readl(&io_apic->data); } -static void __mask_IO_APIC_irq(struct irq_cfg *cfg) +static void mask_ioapic(struct irq_cfg *cfg) { + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); } -static void mask_IO_APIC_irq_desc(struct irq_desc *desc) +static void mask_ioapic_irq(struct irq_data *data) { - struct irq_cfg *cfg = desc->chip_data; - unsigned long flags; - - BUG_ON(!cfg); + mask_ioapic(data->chip_data); +} - raw_spin_lock_irqsave(&ioapic_lock, flags); - __mask_IO_APIC_irq(cfg); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); +static void __unmask_ioapic(struct irq_cfg *cfg) +{ + io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL); } -static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) +static void unmask_ioapic(struct irq_cfg *cfg) { - struct irq_cfg *cfg = desc->chip_data; unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - __unmask_IO_APIC_irq(cfg); + __unmask_ioapic(cfg); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } -static void mask_IO_APIC_irq(unsigned int irq) +static void unmask_ioapic_irq(struct irq_data *data) { - struct irq_desc *desc = irq_to_desc(irq); - - mask_IO_APIC_irq_desc(desc); -} -static void unmask_IO_APIC_irq(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - - unmask_IO_APIC_irq_desc(desc); + unmask_ioapic(data->chip_data); } static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) @@ -693,7 +594,7 @@ static void clear_IO_APIC (void) } #else #define add_pin_to_irq_node(cfg, node, apic, pin) -#define add_pin_to_irq_node_nopanic(cfg, node, apic, pin) 0 +#define __add_pin_to_irq_node(cfg, node, apic, pin) 0 #endif /* !CONFIG_XEN */ #ifdef CONFIG_X86_32 @@ -741,14 +642,14 @@ struct IO_APIC_route_entry **alloc_ioapi struct IO_APIC_route_entry **ioapic_entries; ioapic_entries = kzalloc(sizeof(*ioapic_entries) * nr_ioapics, - GFP_ATOMIC); + GFP_KERNEL); if (!ioapic_entries) return 0; for (apic = 0; apic < nr_ioapics; apic++) { ioapic_entries[apic] = kzalloc(sizeof(struct IO_APIC_route_entry) * - nr_ioapic_registers[apic], GFP_ATOMIC); + nr_ioapic_registers[apic], GFP_KERNEL); if (!ioapic_entries[apic]) goto nomem; } @@ -1314,7 +1215,6 @@ void __setup_vector_irq(int cpu) /* Initialize vector_irq on a new cpu */ int irq, vector; struct irq_cfg *cfg; - struct irq_desc *desc; /* * vector_lock 
will make sure that we don't run into irq vector @@ -1323,9 +1223,10 @@ void __setup_vector_irq(int cpu) */ raw_spin_lock(&vector_lock); /* Mark the inuse vectors */ - for_each_irq_desc(irq, desc) { - cfg = desc->chip_data; - + for_each_active_irq(irq) { + cfg = get_irq_chip_data(irq); + if (!cfg) + continue; /* * If it is a legacy IRQ handled by the legacy PIC, this cpu * will be part of the irq_cfg's domain. @@ -1382,17 +1283,17 @@ static inline int IO_APIC_irq_trigger(in } #endif -static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger) +static void ioapic_register_intr(unsigned int irq, unsigned long trigger) { if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || trigger == IOAPIC_LEVEL) - desc->status |= IRQ_LEVEL; + irq_set_status_flags(irq, IRQ_LEVEL); else - desc->status &= ~IRQ_LEVEL; + irq_clear_status_flags(irq, IRQ_LEVEL); - if (irq_remapped(irq)) { - desc->status |= IRQ_MOVE_PCNTXT; + if (irq_remapped(get_irq_chip_data(irq))) { + irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); if (trigger) set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, handle_fasteoi_irq, @@ -1414,13 +1315,13 @@ static void ioapic_register_intr(int irq } #else /* !CONFIG_XEN */ #define __clear_irq_vector(irq, cfg) ((void)0) -#define ioapic_register_intr(irq, desc, trigger) evtchn_register_pirq(irq) +#define ioapic_register_intr(irq, trigger) evtchn_register_pirq(irq) #endif -int setup_ioapic_entry(int apic_id, int irq, - struct IO_APIC_route_entry *entry, - unsigned int destination, int trigger, - int polarity, int vector, int pin) +static int setup_ioapic_entry(int apic_id, int irq, + struct IO_APIC_route_entry *entry, + unsigned int destination, int trigger, + int polarity, int vector, int pin) { /* * add it to the IO-APIC irq-routing table: @@ -1442,21 +1343,7 @@ int setup_ioapic_entry(int apic_id, int if (index < 0) panic("Failed to allocate IRTE for ioapic %d\n", apic_id); - memset(&irte, 0, sizeof(irte)); - - irte.present = 1; - irte.dst_mode = apic->irq_dest_mode; - /* - * Trigger mode in the IRTE will always be edge, and the - * actual level or edge trigger will be setup in the IO-APIC - * RTE. This will help simplify level triggered irq migration. - * For more details, see the comments above explainig IO-APIC - * irq migration in the presence of interrupt-remapping. 
- */ - irte.trigger_mode = 0; - irte.dlvry_mode = apic->irq_delivery_mode; - irte.vector = vector; - irte.dest_id = IRTE_DEST(destination); + prepare_irte(&irte, vector, destination); /* Set source-id of interrupt request */ set_ioapic_sid(&irte, apic_id); @@ -1493,18 +1380,14 @@ int setup_ioapic_entry(int apic_id, int return 0; } -static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq_desc *desc, - int trigger, int polarity) +static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq, + struct irq_cfg *cfg, int trigger, int polarity) { - struct irq_cfg *cfg; struct IO_APIC_route_entry entry; unsigned int dest; if (!IO_APIC_IRQ(irq)) return; - - cfg = desc->chip_data; - #ifndef CONFIG_XEN /* * For legacy irqs, cfg->domain starts with cpu 0 for legacy @@ -1539,10 +1422,10 @@ static void setup_IO_APIC_irq(int apic_i return; } - ioapic_register_intr(irq, desc, trigger); + ioapic_register_intr(irq, trigger); #ifndef CONFIG_XEN if (irq < legacy_pic->nr_legacy_irqs) - legacy_pic->chip->mask(irq); + legacy_pic->mask(irq); #endif ioapic_write_entry(apic_id, pin, entry); @@ -1554,11 +1437,9 @@ static struct { static void __init setup_IO_APIC_irqs(void) { - int apic_id, pin, idx, irq; - int notcon = 0; - struct irq_desc *desc; + int apic_id, pin, idx, irq, notcon = 0; + int node = cpu_to_node(0); struct irq_cfg *cfg; - int node = cpu_to_node(boot_cpu_id); apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); @@ -1600,19 +1481,17 @@ static void __init setup_IO_APIC_irqs(vo continue; #endif - desc = irq_to_desc_alloc_node(irq, node); - if (!desc) { - printk(KERN_INFO "can not get irq_desc for %d\n", irq); + cfg = alloc_irq_and_cfg_at(irq, node); + if (!cfg) continue; - } - cfg = desc->chip_data; + add_pin_to_irq_node(cfg, node, apic_id, pin); /* * don't mark it in pin_programmed, so later acpi could * set it correctly when irq < 16 */ - setup_IO_APIC_irq(apic_id, pin, irq, desc, - irq_trigger(idx), irq_polarity(idx)); + setup_ioapic_irq(apic_id, pin, irq, cfg, irq_trigger(idx), + irq_polarity(idx)); } if (notcon) @@ -1627,9 +1506,7 @@ static void __init setup_IO_APIC_irqs(vo */ void setup_IO_APIC_irq_extra(u32 gsi) { - int apic_id = 0, pin, idx, irq; - int node = cpu_to_node(boot_cpu_id); - struct irq_desc *desc; + int apic_id = 0, pin, idx, irq, node = cpu_to_node(0); struct irq_cfg *cfg; /* @@ -1649,18 +1526,15 @@ void setup_IO_APIC_irq_extra(u32 gsi) if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs) return; #endif -#ifdef CONFIG_SPARSE_IRQ - desc = irq_to_desc(irq); - if (desc) + + /* Only handle the non legacy irqs on secondary ioapics */ + if (apic_id == 0 || irq < NR_IRQS_LEGACY) return; -#endif - desc = irq_to_desc_alloc_node(irq, node); - if (!desc) { - printk(KERN_INFO "can not get irq_desc for %d\n", irq); + + cfg = alloc_irq_and_cfg_at(irq, node); + if (!cfg) return; - } - cfg = desc->chip_data; add_pin_to_irq_node(cfg, node, apic_id, pin); if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) { @@ -1670,7 +1544,7 @@ void setup_IO_APIC_irq_extra(u32 gsi) } set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed); - setup_IO_APIC_irq(apic_id, pin, irq, desc, + setup_ioapic_irq(apic_id, pin, irq, cfg, irq_trigger(idx), irq_polarity(idx)); } @@ -1722,7 +1596,6 @@ __apicdebuginit(void) print_IO_APIC(void union IO_APIC_reg_03 reg_03; unsigned long flags; struct irq_cfg *cfg; - struct irq_desc *desc; unsigned int irq; printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); @@ -1809,10 +1682,10 @@ __apicdebuginit(void) 
print_IO_APIC(void } } printk(KERN_DEBUG "IRQ to pin mappings:\n"); - for_each_irq_desc(irq, desc) { + for_each_active_irq(irq) { struct irq_pin_list *entry; - cfg = desc->chip_data; + cfg = get_irq_chip_data(irq); if (!cfg) continue; entry = cfg->irq_2_pin; @@ -2319,29 +2192,26 @@ static int __init timer_irq_works(void) * an edge even if it isn't on the 8259A... */ -static unsigned int startup_ioapic_irq(unsigned int irq) +static unsigned int startup_ioapic_irq(struct irq_data *data) { - int was_pending = 0; + int was_pending = 0, irq = data->irq; unsigned long flags; - struct irq_cfg *cfg; raw_spin_lock_irqsave(&ioapic_lock, flags); if (irq < legacy_pic->nr_legacy_irqs) { - legacy_pic->chip->mask(irq); + legacy_pic->mask(irq); if (legacy_pic->irq_pending(irq)) was_pending = 1; } - cfg = irq_cfg(irq); - __unmask_IO_APIC_irq(cfg); + __unmask_ioapic(data->chip_data); raw_spin_unlock_irqrestore(&ioapic_lock, flags); return was_pending; } -static int ioapic_retrigger_irq(unsigned int irq) +static int ioapic_retrigger_irq(struct irq_data *data) { - - struct irq_cfg *cfg = irq_cfg(irq); + struct irq_cfg *cfg = data->chip_data; unsigned long flags; raw_spin_lock_irqsave(&vector_lock, flags); @@ -2392,7 +2262,7 @@ static void __target_IO_APIC_irq(unsigne * With interrupt-remapping, destination information comes * from interrupt-remapping table entry. */ - if (!irq_remapped(irq)) + if (!irq_remapped(cfg)) io_apic_write(apic, 0x11 + pin*2, dest); reg = io_apic_read(apic, 0x10 + pin*2); reg &= ~IO_APIC_REDIR_VECTOR_MASK; @@ -2402,65 +2272,46 @@ static void __target_IO_APIC_irq(unsigne } /* - * Either sets desc->affinity to a valid value, and returns + * Either sets data->affinity to a valid value, and returns * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and - * leaves desc->affinity untouched. + * leaves data->affinity untouched. */ -unsigned int -set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask, - unsigned int *dest_id) +int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, + unsigned int *dest_id) { - struct irq_cfg *cfg; - unsigned int irq; + struct irq_cfg *cfg = data->chip_data; if (!cpumask_intersects(mask, cpu_online_mask)) return -1; - irq = desc->irq; - cfg = desc->chip_data; - if (assign_irq_vector(irq, cfg, mask)) + if (assign_irq_vector(data->irq, data->chip_data, mask)) return -1; - cpumask_copy(desc->affinity, mask); + cpumask_copy(data->affinity, mask); - *dest_id = apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); + *dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain); return 0; } static int -set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) +ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) { - struct irq_cfg *cfg; + unsigned int dest, irq = data->irq; unsigned long flags; - unsigned int dest; - unsigned int irq; - int ret = -1; - - irq = desc->irq; - cfg = desc->chip_data; + int ret; raw_spin_lock_irqsave(&ioapic_lock, flags); - ret = set_desc_affinity(desc, mask, &dest); + ret = __ioapic_set_affinity(data, mask, &dest); if (!ret) { /* Only the high 8 bits are valid. 
*/ dest = SET_APIC_LOGICAL_ID(dest); - __target_IO_APIC_irq(irq, dest, cfg); + __target_IO_APIC_irq(irq, dest, data->chip_data); } raw_spin_unlock_irqrestore(&ioapic_lock, flags); - return ret; } -static int -set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) -{ - struct irq_desc *desc; - - desc = irq_to_desc(irq); - - return set_ioapic_affinity_irq_desc(desc, mask); -} - #ifdef CONFIG_INTR_REMAP /* @@ -2475,24 +2326,21 @@ set_ioapic_affinity_irq(unsigned int irq * the interrupt-remapping table entry. */ static int -migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) +ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) { - struct irq_cfg *cfg; + struct irq_cfg *cfg = data->chip_data; + unsigned int dest, irq = data->irq; struct irte irte; - unsigned int dest; - unsigned int irq; - int ret = -1; if (!cpumask_intersects(mask, cpu_online_mask)) - return ret; + return -EINVAL; - irq = desc->irq; if (get_irte(irq, &irte)) - return ret; + return -EBUSY; - cfg = desc->chip_data; if (assign_irq_vector(irq, cfg, mask)) - return ret; + return -EBUSY; dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); @@ -2507,29 +2355,14 @@ migrate_ioapic_irq_desc(struct irq_desc if (cfg->move_in_progress) send_cleanup_vector(cfg); - cpumask_copy(desc->affinity, mask); - + cpumask_copy(data->affinity, mask); return 0; } -/* - * Migrates the IRQ destination in the process context. - */ -static int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, - const struct cpumask *mask) -{ - return migrate_ioapic_irq_desc(desc, mask); -} -static int set_ir_ioapic_affinity_irq(unsigned int irq, - const struct cpumask *mask) -{ - struct irq_desc *desc = irq_to_desc(irq); - - return set_ir_ioapic_affinity_irq_desc(desc, mask); -} #else -static inline int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, - const struct cpumask *mask) +static inline int +ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) { return 0; } @@ -2591,10 +2424,8 @@ unlock: irq_exit(); } -static void __irq_complete_move(struct irq_desc **descp, unsigned vector) +static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector) { - struct irq_desc *desc = *descp; - struct irq_cfg *cfg = desc->chip_data; unsigned me; if (likely(!cfg->move_in_progress)) @@ -2606,31 +2437,28 @@ static void __irq_complete_move(struct i send_cleanup_vector(cfg); } -static void irq_complete_move(struct irq_desc **descp) +static void irq_complete_move(struct irq_cfg *cfg) { - __irq_complete_move(descp, ~get_irq_regs()->orig_ax); + __irq_complete_move(cfg, ~get_irq_regs()->orig_ax); } void irq_force_complete_move(int irq) { - struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg = desc->chip_data; + struct irq_cfg *cfg = get_irq_chip_data(irq); if (!cfg) return; - __irq_complete_move(&desc, cfg->vector); + __irq_complete_move(cfg, cfg->vector); } #else -static inline void irq_complete_move(struct irq_desc **descp) {} +static inline void irq_complete_move(struct irq_cfg *cfg) { } #endif -static void ack_apic_edge(unsigned int irq) +static void ack_apic_edge(struct irq_data *data) { - struct irq_desc *desc = irq_to_desc(irq); - - irq_complete_move(&desc); - move_native_irq(irq); + irq_complete_move(data->chip_data); + move_native_irq(data->irq); ack_APIC_irq(); } @@ -2652,10 +2480,12 @@ atomic_t irq_mis_count; * Otherwise, we simulate the EOI message manually by changing the trigger * mode to edge and then back to level, with RTE being masked 
during this. */ -static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) +static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) { struct irq_pin_list *entry; + unsigned long flags; + raw_spin_lock_irqsave(&ioapic_lock, flags); for_each_irq_pin(entry, cfg->irq_2_pin) { if (mp_ioapics[entry->apic].apicver >= 0x20) { /* @@ -2664,7 +2494,7 @@ static void __eoi_ioapic_irq(unsigned in * intr-remapping table entry. Hence for the io-apic * EOI we use the pin number. */ - if (irq_remapped(irq)) + if (irq_remapped(cfg)) io_apic_eoi(entry->apic, entry->pin); else io_apic_eoi(entry->apic, cfg->vector); @@ -2673,36 +2503,21 @@ static void __eoi_ioapic_irq(unsigned in __unmask_and_level_IO_APIC_irq(entry); } } -} - -static void eoi_ioapic_irq(struct irq_desc *desc) -{ - struct irq_cfg *cfg; - unsigned long flags; - unsigned int irq; - - irq = desc->irq; - cfg = desc->chip_data; - - raw_spin_lock_irqsave(&ioapic_lock, flags); - __eoi_ioapic_irq(irq, cfg); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } -static void ack_apic_level(unsigned int irq) +static void ack_apic_level(struct irq_data *data) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg = data->chip_data; + int i, do_unmask_irq = 0, irq = data->irq; unsigned long v; - int i; - struct irq_cfg *cfg; - int do_unmask_irq = 0; - irq_complete_move(&desc); + irq_complete_move(cfg); #ifdef CONFIG_GENERIC_PENDING_IRQ /* If we are moving the irq we need to mask it */ - if (unlikely(desc->status & IRQ_MOVE_PENDING)) { + if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { do_unmask_irq = 1; - mask_IO_APIC_irq_desc(desc); + mask_ioapic(cfg); } #endif @@ -2738,7 +2553,6 @@ static void ack_apic_level(unsigned int * we use the above logic (mask+edge followed by unmask+level) from * Manfred Spraul to clear the remote IRR. */ - cfg = desc->chip_data; i = cfg->vector; v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); @@ -2758,7 +2572,7 @@ static void ack_apic_level(unsigned int if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); - eoi_ioapic_irq(desc); + eoi_ioapic_irq(irq, cfg); } /* Now we can move and renable the irq */ @@ -2789,62 +2603,58 @@ static void ack_apic_level(unsigned int * accurate and is causing problems then it is a hardware bug * and you can go talk to the chipset vendor about it. 
*/ - cfg = desc->chip_data; if (!io_apic_level_ack_pending(cfg)) move_masked_irq(irq); - unmask_IO_APIC_irq_desc(desc); + unmask_ioapic(cfg); } } #ifdef CONFIG_INTR_REMAP -static void ir_ack_apic_edge(unsigned int irq) +static void ir_ack_apic_edge(struct irq_data *data) { ack_APIC_irq(); } -static void ir_ack_apic_level(unsigned int irq) +static void ir_ack_apic_level(struct irq_data *data) { - struct irq_desc *desc = irq_to_desc(irq); - ack_APIC_irq(); - eoi_ioapic_irq(desc); + eoi_ioapic_irq(data->irq, data->chip_data); } #endif /* CONFIG_INTR_REMAP */ static struct irq_chip ioapic_chip __read_mostly = { - .name = "IO-APIC", - .startup = startup_ioapic_irq, - .mask = mask_IO_APIC_irq, - .unmask = unmask_IO_APIC_irq, - .ack = ack_apic_edge, - .eoi = ack_apic_level, + .name = "IO-APIC", + .irq_startup = startup_ioapic_irq, + .irq_mask = mask_ioapic_irq, + .irq_unmask = unmask_ioapic_irq, + .irq_ack = ack_apic_edge, + .irq_eoi = ack_apic_level, #ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity_irq, + .irq_set_affinity = ioapic_set_affinity, #endif - .retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; static struct irq_chip ir_ioapic_chip __read_mostly = { - .name = "IR-IO-APIC", - .startup = startup_ioapic_irq, - .mask = mask_IO_APIC_irq, - .unmask = unmask_IO_APIC_irq, + .name = "IR-IO-APIC", + .irq_startup = startup_ioapic_irq, + .irq_mask = mask_ioapic_irq, + .irq_unmask = unmask_ioapic_irq, #ifdef CONFIG_INTR_REMAP - .ack = ir_ack_apic_edge, - .eoi = ir_ack_apic_level, + .irq_ack = ir_ack_apic_edge, + .irq_eoi = ir_ack_apic_level, #ifdef CONFIG_SMP - .set_affinity = set_ir_ioapic_affinity_irq, + .irq_set_affinity = ir_ioapic_set_affinity, #endif #endif - .retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; #endif /* !CONFIG_XEN */ static inline void init_IO_APIC_traps(void) { - int irq; - struct irq_desc *desc; struct irq_cfg *cfg; + unsigned int irq; /* * NOTE! The local APIC isn't very good at handling @@ -2857,12 +2667,12 @@ static inline void init_IO_APIC_traps(vo * Also, we've got to be careful not to trash gate * 0x80, because int 0x80 is hm, kind of importantish. ;) */ - for_each_irq_desc(irq, desc) { + for_each_active_irq(irq) { #ifdef CONFIG_XEN if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs) continue; #endif - cfg = desc->chip_data; + cfg = get_irq_chip_data(irq); if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { /* * Hmm.. We don't have an entry for this, @@ -2873,7 +2683,7 @@ static inline void init_IO_APIC_traps(vo legacy_pic->make_irq(irq); else /* Strange. Oh, well.. 
*/ - desc->chip = &no_irq_chip; + set_irq_chip(irq, &no_irq_chip); } } } @@ -2883,7 +2693,7 @@ static inline void init_IO_APIC_traps(vo * The local APIC irq-chip implementation: */ -static void mask_lapic_irq(unsigned int irq) +static void mask_lapic_irq(struct irq_data *data) { unsigned long v; @@ -2891,7 +2701,7 @@ static void mask_lapic_irq(unsigned int apic_write(APIC_LVT0, v | APIC_LVT_MASKED); } -static void unmask_lapic_irq(unsigned int irq) +static void unmask_lapic_irq(struct irq_data *data) { unsigned long v; @@ -2899,21 +2709,21 @@ static void unmask_lapic_irq(unsigned in apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); } -static void ack_lapic_irq(unsigned int irq) +static void ack_lapic_irq(struct irq_data *data) { ack_APIC_irq(); } static struct irq_chip lapic_chip __read_mostly = { .name = "local-APIC", - .mask = mask_lapic_irq, - .unmask = unmask_lapic_irq, - .ack = ack_lapic_irq, + .irq_mask = mask_lapic_irq, + .irq_unmask = unmask_lapic_irq, + .irq_ack = ack_lapic_irq, }; -static void lapic_register_intr(int irq, struct irq_desc *desc) +static void lapic_register_intr(int irq) { - desc->status &= ~IRQ_LEVEL; + irq_clear_status_flags(irq, IRQ_LEVEL); set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, "edge"); } @@ -3016,9 +2826,8 @@ int timer_through_8259 __initdata; */ static inline void __init check_timer(void) { - struct irq_desc *desc = irq_to_desc(0); - struct irq_cfg *cfg = desc->chip_data; - int node = cpu_to_node(boot_cpu_id); + struct irq_cfg *cfg = get_irq_chip_data(0); + int node = cpu_to_node(0); int apic1, pin1, apic2, pin2; unsigned long flags; int no_pin1 = 0; @@ -3028,7 +2837,7 @@ static inline void __init check_timer(vo /* * get/set the timer IRQ vector: */ - legacy_pic->chip->mask(0); + legacy_pic->mask(0); assign_irq_vector(0, cfg, apic->target_cpus()); /* @@ -3087,7 +2896,7 @@ static inline void __init check_timer(vo add_pin_to_irq_node(cfg, node, apic1, pin1); setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); } else { - /* for edge trigger, setup_IO_APIC_irq already + /* for edge trigger, setup_ioapic_irq already * leave it unmasked. * so only need to unmask if it is level-trigger * do we really have level trigger timer? @@ -3095,12 +2904,12 @@ static inline void __init check_timer(vo int idx; idx = find_irq_entry(apic1, pin1, mp_INT); if (idx != -1 && irq_trigger(idx)) - unmask_IO_APIC_irq_desc(desc); + unmask_ioapic(cfg); } if (timer_irq_works()) { if (nmi_watchdog == NMI_IO_APIC) { setup_nmi(); - legacy_pic->chip->unmask(0); + legacy_pic->unmask(0); } if (disable_timer_pin_1 > 0) clear_IO_APIC_pin(0, pin1); @@ -3123,14 +2932,14 @@ static inline void __init check_timer(vo */ replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); - legacy_pic->chip->unmask(0); + legacy_pic->unmask(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); timer_through_8259 = 1; if (nmi_watchdog == NMI_IO_APIC) { - legacy_pic->chip->mask(0); + legacy_pic->mask(0); setup_nmi(); - legacy_pic->chip->unmask(0); + legacy_pic->unmask(0); } goto out; } @@ -3138,7 +2947,7 @@ static inline void __init check_timer(vo * Cleanup, just in case ... */ local_irq_disable(); - legacy_pic->chip->mask(0); + legacy_pic->mask(0); clear_IO_APIC_pin(apic2, pin2); apic_printk(APIC_QUIET, KERN_INFO "....... 
failed.\n"); } @@ -3155,16 +2964,16 @@ static inline void __init check_timer(vo apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...\n"); - lapic_register_intr(0, desc); + lapic_register_intr(0); apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ - legacy_pic->chip->unmask(0); + legacy_pic->unmask(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } local_irq_disable(); - legacy_pic->chip->mask(0); + legacy_pic->mask(0); apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); @@ -3344,49 +3153,42 @@ device_initcall(ioapic_init_sysfs); /* * Dynamic irq allocate and deallocation */ -unsigned int create_irq_nr(unsigned int irq_want, int node) +unsigned int create_irq_nr(unsigned int from, int node) { - /* Allocate an unused irq */ - unsigned int irq; - unsigned int new; + struct irq_cfg *cfg; unsigned long flags; - struct irq_cfg *cfg_new = NULL; - struct irq_desc *desc_new = NULL; - - irq = 0; - if (irq_want < nr_irqs_gsi) - irq_want = nr_irqs_gsi; - - raw_spin_lock_irqsave(&vector_lock, flags); - for (new = irq_want; new < nr_irqs; new++) { - desc_new = irq_to_desc_alloc_node(new, node); - if (!desc_new) { - printk(KERN_INFO "can not get irq_desc for %d\n", new); - continue; - } - cfg_new = desc_new->chip_data; - - if (cfg_new->vector != 0) - continue; + unsigned int ret = 0; + int irq; - desc_new = move_irq_desc(desc_new, node); - cfg_new = desc_new->chip_data; + if (from < nr_irqs_gsi) + from = nr_irqs_gsi; - if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) - irq = new; - break; + irq = alloc_irq_from(from, node); + if (irq < 0) + return 0; + cfg = alloc_irq_cfg(irq, node); + if (!cfg) { + free_irq_at(irq, NULL); + return 0; } - raw_spin_unlock_irqrestore(&vector_lock, flags); - if (irq > 0) - dynamic_irq_init_keep_chip_data(irq); + raw_spin_lock_irqsave(&vector_lock, flags); + if (!__assign_irq_vector(irq, cfg, apic->target_cpus())) + ret = irq; + raw_spin_unlock_irqrestore(&vector_lock, flags); - return irq; + if (ret) { + set_irq_chip_data(irq, cfg); + irq_clear_status_flags(irq, IRQ_NOREQUEST); + } else { + free_irq_at(irq, cfg); + } + return ret; } int create_irq(void) { - int node = cpu_to_node(boot_cpu_id); + int node = cpu_to_node(0); unsigned int irq_want; int irq; @@ -3401,14 +3203,17 @@ int create_irq(void) void destroy_irq(unsigned int irq) { + struct irq_cfg *cfg = get_irq_chip_data(irq); unsigned long flags; - dynamic_irq_cleanup_keep_chip_data(irq); + irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE); - free_irte(irq); + if (irq_remapped(cfg)) + free_irte(irq); raw_spin_lock_irqsave(&vector_lock, flags); - __clear_irq_vector(irq, get_irq_chip_data(irq)); + __clear_irq_vector(irq, cfg); raw_spin_unlock_irqrestore(&vector_lock, flags); + free_irq_at(irq, cfg); } #endif /* !CONFIG_XEN */ @@ -3433,7 +3238,7 @@ static int msi_compose_msg(struct pci_de dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); - if (irq_remapped(irq)) { + if (irq_remapped(get_irq_chip_data(irq))) { struct irte irte; int ir_index; u16 sub_handle; @@ -3441,14 +3246,7 @@ static int msi_compose_msg(struct pci_de ir_index = map_irq_to_irte_handle(irq, &sub_handle); BUG_ON(ir_index == -1); - memset (&irte, 0, sizeof(irte)); - - irte.present = 1; - irte.dst_mode = apic->irq_dest_mode; - irte.trigger_mode = 0; /* edge */ - irte.dlvry_mode = apic->irq_delivery_mode; - irte.vector = cfg->vector; - irte.dest_id = 
IRTE_DEST(dest); + prepare_irte(&irte, cfg->vector, dest); /* Set source-id of interrupt request */ if (pdev) @@ -3493,26 +3291,24 @@ static int msi_compose_msg(struct pci_de } #ifdef CONFIG_SMP -static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) +static int +msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { - struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg; + struct irq_cfg *cfg = data->chip_data; struct msi_msg msg; unsigned int dest; - if (set_desc_affinity(desc, mask, &dest)) + if (__ioapic_set_affinity(data, mask, &dest)) return -1; - cfg = desc->chip_data; - - get_cached_msi_msg_desc(desc, &msg); + __get_cached_msi_msg(data->msi_desc, &msg); msg.data &= ~MSI_DATA_VECTOR_MASK; msg.data |= MSI_DATA_VECTOR(cfg->vector); msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; msg.address_lo |= MSI_ADDR_DEST_ID(dest); - write_msi_msg_desc(desc, &msg); + __write_msi_msg(data->msi_desc, &msg); return 0; } @@ -3522,17 +3318,17 @@ static int set_msi_irq_affinity(unsigned * done in the process context using interrupt-remapping hardware. */ static int -ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) +ir_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) { - struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg = desc->chip_data; - unsigned int dest; + struct irq_cfg *cfg = data->chip_data; + unsigned int dest, irq = data->irq; struct irte irte; if (get_irte(irq, &irte)) return -1; - if (set_desc_affinity(desc, mask, &dest)) + if (__ioapic_set_affinity(data, mask, &dest)) return -1; irte.vector = cfg->vector; @@ -3562,27 +3358,27 @@ ir_set_msi_irq_affinity(unsigned int irq * which implement the MSI or MSI-X Capability Structure. */ static struct irq_chip msi_chip = { - .name = "PCI-MSI", - .unmask = unmask_msi_irq, - .mask = mask_msi_irq, - .ack = ack_apic_edge, + .name = "PCI-MSI", + .irq_unmask = unmask_msi_irq, + .irq_mask = mask_msi_irq, + .irq_ack = ack_apic_edge, #ifdef CONFIG_SMP - .set_affinity = set_msi_irq_affinity, + .irq_set_affinity = msi_set_affinity, #endif - .retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; static struct irq_chip msi_ir_chip = { - .name = "IR-PCI-MSI", - .unmask = unmask_msi_irq, - .mask = mask_msi_irq, + .name = "IR-PCI-MSI", + .irq_unmask = unmask_msi_irq, + .irq_mask = mask_msi_irq, #ifdef CONFIG_INTR_REMAP - .ack = ir_ack_apic_edge, + .irq_ack = ir_ack_apic_edge, #ifdef CONFIG_SMP - .set_affinity = ir_set_msi_irq_affinity, + .irq_set_affinity = ir_msi_set_affinity, #endif #endif - .retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; /* @@ -3614,8 +3410,8 @@ static int msi_alloc_irte(struct pci_dev static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) { - int ret; struct msi_msg msg; + int ret; ret = msi_compose_msg(dev, irq, &msg, -1); if (ret < 0) @@ -3624,12 +3420,8 @@ static int setup_msi_irq(struct pci_dev set_irq_msi(irq, msidesc); write_msi_msg(irq, &msg); - if (irq_remapped(irq)) { - struct irq_desc *desc = irq_to_desc(irq); - /* - * irq migration in process context - */ - desc->status |= IRQ_MOVE_PCNTXT; + if (irq_remapped(get_irq_chip_data(irq))) { + irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); } else set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); @@ -3639,15 +3431,12 @@ static int setup_msi_irq(struct pci_dev return 0; } -int 
arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { - unsigned int irq; - int ret, sub_handle; + int node, ret, sub_handle, index = 0; + unsigned int irq, irq_want; struct msi_desc *msidesc; - unsigned int irq_want; struct intel_iommu *iommu = NULL; - int index = 0; - int node; /* x86 doesn't support multiple MSI yet */ if (type == PCI_CAP_ID_MSI && nvec > 1) @@ -3700,31 +3489,31 @@ error: return ret; } -void arch_teardown_msi_irq(unsigned int irq) +void native_teardown_msi_irq(unsigned int irq) { destroy_irq(irq); } #if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP) #ifdef CONFIG_SMP -static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) +static int +dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) { - struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg; + struct irq_cfg *cfg = data->chip_data; + unsigned int dest, irq = data->irq; struct msi_msg msg; - unsigned int dest; - if (set_desc_affinity(desc, mask, &dest)) + if (__ioapic_set_affinity(data, mask, &dest)) return -1; - cfg = desc->chip_data; - dmar_msi_read(irq, &msg); msg.data &= ~MSI_DATA_VECTOR_MASK; msg.data |= MSI_DATA_VECTOR(cfg->vector); msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; msg.address_lo |= MSI_ADDR_DEST_ID(dest); + msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest); dmar_msi_write(irq, &msg); @@ -3734,14 +3523,14 @@ static int dmar_msi_set_affinity(unsigne #endif /* CONFIG_SMP */ static struct irq_chip dmar_msi_type = { - .name = "DMAR_MSI", - .unmask = dmar_msi_unmask, - .mask = dmar_msi_mask, - .ack = ack_apic_edge, + .name = "DMAR_MSI", + .irq_unmask = dmar_msi_unmask, + .irq_mask = dmar_msi_mask, + .irq_ack = ack_apic_edge, #ifdef CONFIG_SMP - .set_affinity = dmar_msi_set_affinity, + .irq_set_affinity = dmar_msi_set_affinity, #endif - .retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; int arch_setup_dmar_msi(unsigned int irq) @@ -3762,26 +3551,24 @@ int arch_setup_dmar_msi(unsigned int irq #ifdef CONFIG_HPET_TIMER #ifdef CONFIG_SMP -static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) +static int hpet_msi_set_affinity(struct irq_data *data, + const struct cpumask *mask, bool force) { - struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg; + struct irq_cfg *cfg = data->chip_data; struct msi_msg msg; unsigned int dest; - if (set_desc_affinity(desc, mask, &dest)) + if (__ioapic_set_affinity(data, mask, &dest)) return -1; - cfg = desc->chip_data; - - hpet_msi_read(irq, &msg); + hpet_msi_read(data->handler_data, &msg); msg.data &= ~MSI_DATA_VECTOR_MASK; msg.data |= MSI_DATA_VECTOR(cfg->vector); msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; msg.address_lo |= MSI_ADDR_DEST_ID(dest); - hpet_msi_write(irq, &msg); + hpet_msi_write(data->handler_data, &msg); return 0; } @@ -3789,34 +3576,33 @@ static int hpet_msi_set_affinity(unsigne #endif /* CONFIG_SMP */ static struct irq_chip ir_hpet_msi_type = { - .name = "IR-HPET_MSI", - .unmask = hpet_msi_unmask, - .mask = hpet_msi_mask, + .name = "IR-HPET_MSI", + .irq_unmask = hpet_msi_unmask, + .irq_mask = hpet_msi_mask, #ifdef CONFIG_INTR_REMAP - .ack = ir_ack_apic_edge, + .irq_ack = ir_ack_apic_edge, #ifdef CONFIG_SMP - .set_affinity = ir_set_msi_irq_affinity, + .irq_set_affinity = ir_msi_set_affinity, #endif #endif - .retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; static struct irq_chip hpet_msi_type = { .name = 
"HPET_MSI", - .unmask = hpet_msi_unmask, - .mask = hpet_msi_mask, - .ack = ack_apic_edge, + .irq_unmask = hpet_msi_unmask, + .irq_mask = hpet_msi_mask, + .irq_ack = ack_apic_edge, #ifdef CONFIG_SMP - .set_affinity = hpet_msi_set_affinity, + .irq_set_affinity = hpet_msi_set_affinity, #endif - .retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; int arch_setup_hpet_msi(unsigned int irq, unsigned int id) { - int ret; struct msi_msg msg; - struct irq_desc *desc = irq_to_desc(irq); + int ret; if (intr_remapping_enabled) { struct intel_iommu *iommu = map_hpet_to_ir(id); @@ -3834,9 +3620,9 @@ int arch_setup_hpet_msi(unsigned int irq if (ret < 0) return ret; - hpet_msi_write(irq, &msg); - desc->status |= IRQ_MOVE_PCNTXT; - if (irq_remapped(irq)) + hpet_msi_write(get_irq_data(irq), &msg); + irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); + if (irq_remapped(get_irq_chip_data(irq))) set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type, handle_edge_irq, "edge"); else @@ -3869,33 +3655,30 @@ static void target_ht_irq(unsigned int i write_ht_irq_msg(irq, &msg); } -static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) +static int +ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { - struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg; + struct irq_cfg *cfg = data->chip_data; unsigned int dest; - if (set_desc_affinity(desc, mask, &dest)) + if (__ioapic_set_affinity(data, mask, &dest)) return -1; - cfg = desc->chip_data; - - target_ht_irq(irq, dest, cfg->vector); - + target_ht_irq(data->irq, dest, cfg->vector); return 0; } #endif static struct irq_chip ht_irq_chip = { - .name = "PCI-HT", - .mask = mask_ht_irq, - .unmask = unmask_ht_irq, - .ack = ack_apic_edge, + .name = "PCI-HT", + .irq_mask = mask_ht_irq, + .irq_unmask = unmask_ht_irq, + .irq_ack = ack_apic_edge, #ifdef CONFIG_SMP - .set_affinity = set_ht_irq_affinity, + .irq_set_affinity = ht_set_affinity, #endif - .retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) @@ -3969,6 +3752,11 @@ void __init probe_nr_irqs_gsi(void) printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); } +int get_nr_irqs_gsi(void) +{ + return nr_irqs_gsi; +} + #ifdef CONFIG_SPARSE_IRQ int __init arch_probe_nr_irqs(void) { @@ -3987,7 +3775,7 @@ int __init arch_probe_nr_irqs(void) if (nr < nr_irqs) nr_irqs = nr; - return 0; + return NR_IRQS_LEGACY; } #endif #endif /* CONFIG_XEN */ @@ -3995,7 +3783,6 @@ int __init arch_probe_nr_irqs(void) static int __io_apic_set_pci_routing(struct device *dev, int irq, struct io_apic_irq_attr *irq_attr) { - struct irq_desc *desc; struct irq_cfg *cfg; int node; int ioapic, pin; @@ -4018,13 +3805,11 @@ static int __io_apic_set_pci_routing(str if (dev) node = dev_to_node(dev); else - node = cpu_to_node(boot_cpu_id); + node = cpu_to_node(0); - desc = irq_to_desc_alloc_node(irq, node); - if (!desc) { - printk(KERN_INFO "can not get irq_desc %d\n", irq); + cfg = alloc_irq_and_cfg_at(irq, node); + if (!cfg) return 0; - } pin = irq_attr->ioapic_pin; trigger = irq_attr->trigger; @@ -4034,15 +3819,14 @@ static int __io_apic_set_pci_routing(str * IRQs < 16 are already in the irq_2_pin[] map */ if (irq >= legacy_pic->nr_legacy_irqs) { - cfg = desc->chip_data; - if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) { + if (__add_pin_to_irq_node(cfg, node, ioapic, pin)) { printk(KERN_INFO "can not add pin %d for irq %d\n", pin, irq); return 0; } } - setup_IO_APIC_irq(ioapic, pin, irq, 
desc, trigger, polarity); + setup_ioapic_irq(ioapic, pin, irq, cfg, trigger, polarity); return 0; } @@ -4238,14 +4022,14 @@ void __init setup_ioapic_dest(void) */ if (desc->status & (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) - mask = desc->affinity; + mask = desc->irq_data.affinity; else mask = apic->target_cpus(); if (intr_remapping_enabled) - set_ir_ioapic_affinity_irq_desc(desc, mask); + ir_ioapic_set_affinity(&desc->irq_data, mask, false); else - set_ioapic_affinity_irq_desc(desc, mask); + ioapic_set_affinity(&desc->irq_data, mask, false); } } @@ -4433,20 +4217,19 @@ void __init mp_register_ioapic(int id, u void __init pre_init_apic_IRQ0(void) { struct irq_cfg *cfg; - struct irq_desc *desc; printk(KERN_INFO "Early APIC setup for system timer0\n"); #ifndef CONFIG_SMP phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); #endif - desc = irq_to_desc_alloc_node(0, 0); + /* Make sure the irq descriptor is set up */ + cfg = alloc_irq_and_cfg_at(0, 0); setup_local_APIC(); - cfg = irq_cfg(0); add_pin_to_irq_node(cfg, 0, 0, 0); set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); - setup_IO_APIC_irq(0, 0, 0, desc, 0, 0); + setup_ioapic_irq(0, 0, 0, cfg, 0, 0); } #endif --- head-2011-03-17.orig/arch/x86/kernel/cpu/common-xen.c 2011-03-17 14:43:00.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/cpu/common-xen.c 2011-03-17 14:43:08.000000000 +0100 @@ -696,7 +696,7 @@ static void __init early_identify_cpu(st this_cpu->c_early_init(c); #ifdef CONFIG_SMP - c->cpu_index = boot_cpu_id; + c->cpu_index = 0; #endif filter_cpuid_features(c, false); } @@ -735,16 +735,21 @@ void __init early_cpu_init(void) } /* - * The NOPL instruction is supposed to exist on all CPUs with - * family >= 6; unfortunately, that's not true in practice because - * of early VIA chips and (more importantly) broken virtualizers that - * are not easy to detect. In the latter case it doesn't even *fail* - * reliably, so probing for it doesn't even work. Disable it completely + * The NOPL instruction is supposed to exist on all CPUs of family >= 6; + * unfortunately, that's not true in practice because of early VIA + * chips and (more importantly) broken virtualizers that are not easy + * to detect. In the latter case it doesn't even *fail* reliably, so + * probing for it doesn't even work. Disable it completely on 32-bit * unless we can find a reliable way to detect all the broken cases. + * Enable it explicitly on 64-bit for non-constant inputs of cpu_has(). */ static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) { +#ifdef CONFIG_X86_32 clear_cpu_cap(c, X86_FEATURE_NOPL); +#else + set_cpu_cap(c, X86_FEATURE_NOPL); +#endif } static void __cpuinit generic_identify(struct cpuinfo_x86 *c) @@ -1355,13 +1360,6 @@ void __cpuinit cpu_init(void) clear_all_debug_regs(); dbg_restore_debug_regs(); - /* - * Force FPU initialization: - */ - current_thread_info()->status = 0; - clear_used_math(); - mxcsr_feature_mask_init(); - fpu_init(); xsave_init(); } --- head-2011-03-17.orig/arch/x86/kernel/e820-xen.c 2011-02-01 15:03:10.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/e820-xen.c 2011-02-01 15:09:47.000000000 +0100 @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -786,73 +787,7 @@ core_initcall(e820_mark_nvs_memory); #endif /* - * Find a free area with specified alignment in a specific range. 
- */ -u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - u64 addr; - u64 ei_start, ei_last; - - if (ei->type != E820_RAM) - continue; - - ei_last = ei->addr + ei->size; - ei_start = ei->addr; - addr = find_early_area(ei_start, ei_last, start, end, - size, align); - - if (addr != -1ULL) - return addr; - } - return -1ULL; -} - -u64 __init find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align) -{ - return find_e820_area(start, end, size, align); -} - -u64 __init get_max_mapped(void) -{ - u64 end = max_pfn_mapped; - - end <<= PAGE_SHIFT; - - return end; -} -/* - * Find next free range after *start - */ -u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - u64 addr; - u64 ei_start, ei_last; - - if (ei->type != E820_RAM) - continue; - - ei_last = ei->addr + ei->size; - ei_start = ei->addr; - addr = find_early_area_size(ei_start, ei_last, start, - sizep, align); - - if (addr != -1ULL) - return addr; - } - - return -1ULL; -} - -/* - * pre allocated 4k and reserved it in e820 + * pre allocated 4k and reserved it in memblock and e820_saved */ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) { @@ -869,8 +804,8 @@ u64 __init early_reserve_e820(u64 startt } #endif for (start = startt; ; start += size) { - start = find_e820_area_size(start, &size, align); - if (!(start + 1)) + start = memblock_x86_find_in_range_size(start, &size, align); + if (start == MEMBLOCK_ERROR) return 0; if (size >= sizet) break; @@ -924,10 +859,9 @@ u64 __init early_reserve_e820(u64 startt return 0; } #endif - e820_update_range(addr, sizet, E820_RAM, E820_RESERVED); + memblock_x86_reserve_range(addr, addr + sizet, "new next"); e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED); - printk(KERN_INFO "update e820 for early_reserve_e820\n"); - update_e820(); + printk(KERN_INFO "update e820_saved for early_reserve_e820\n"); update_e820_saved(); return addr; @@ -989,83 +923,6 @@ unsigned long __init e820_end_of_low_ram { return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM); } -/* - * Finds an active region in the address range from start_pfn to last_pfn and - * returns its range in ei_startpfn and ei_endpfn for the e820 entry. 
- */ -int __init e820_find_active_region(const struct e820entry *ei, - unsigned long start_pfn, - unsigned long last_pfn, - unsigned long *ei_startpfn, - unsigned long *ei_endpfn) -{ - u64 align = PAGE_SIZE; - -#ifdef CONFIG_XEN - if (last_pfn > xen_start_info->nr_pages) - last_pfn = xen_start_info->nr_pages; -#endif - - *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT; - *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT; - - /* Skip map entries smaller than a page */ - if (*ei_startpfn >= *ei_endpfn) - return 0; - - /* Skip if map is outside the node */ - if (ei->type != E820_RAM || *ei_endpfn <= start_pfn || - *ei_startpfn >= last_pfn) - return 0; - - /* Check for overlaps */ - if (*ei_startpfn < start_pfn) - *ei_startpfn = start_pfn; - if (*ei_endpfn > last_pfn) - *ei_endpfn = last_pfn; - - return 1; -} - -/* Walk the e820 map and register active regions within a node */ -void __init e820_register_active_regions(int nid, unsigned long start_pfn, - unsigned long last_pfn) -{ - unsigned long ei_startpfn; - unsigned long ei_endpfn; - int i; - - for (i = 0; i < e820.nr_map; i++) - if (e820_find_active_region(&e820.map[i], - start_pfn, last_pfn, - &ei_startpfn, &ei_endpfn)) - add_active_range(nid, ei_startpfn, ei_endpfn); -#ifdef CONFIG_XEN - BUG_ON(nid); - add_active_range(nid, last_pfn, last_pfn); -#endif -} - -/* - * Find the hole size (in bytes) in the memory range. - * @start: starting address of the memory range to scan - * @end: ending address of the memory range to scan - */ -u64 __init e820_hole_size(u64 start, u64 end) -{ - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long last_pfn = end >> PAGE_SHIFT; - unsigned long ei_startpfn, ei_endpfn, ram = 0; - int i; - - for (i = 0; i < e820.nr_map; i++) { - if (e820_find_active_region(&e820.map[i], - start_pfn, last_pfn, - &ei_startpfn, &ei_endpfn)) - ram += ei_endpfn - ei_startpfn; - } - return end - start - ((u64)ram << PAGE_SHIFT); -} static void early_panic(char *msg) { @@ -1344,3 +1201,48 @@ void __init setup_memory_map(void) printk(KERN_INFO "Xen-provided physical RAM map:\n"); _e820_print_map(&e820, who); } + +void __init memblock_x86_fill(void) +{ + int i; + u64 end; + + /* + * EFI may have more than 128 entries + * We are safe to enable resizing, beause memblock_x86_fill() + * is rather later for x86 + */ + memblock_can_resize = 1; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + end = ei->addr + ei->size; + if (end != (resource_size_t)end) + continue; + + if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN) + continue; + + memblock_add(ei->addr, ei->size); + } + + memblock_analyze(); + memblock_dump_all(); +} + +void __init memblock_find_dma_reserve(void) +{ +#ifdef CONFIG_X86_64 + u64 free_size_pfn; + u64 mem_size_pfn; + /* + * need to find out used area below MAX_DMA_PFN + * need to use memblock to get free size in [0, MAX_DMA_PFN] + * at first, and assume boot_mem will not take below MAX_DMA_PFN + */ + mem_size_pfn = memblock_x86_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT; + free_size_pfn = memblock_x86_free_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT; + set_dma_reserve(mem_size_pfn - free_size_pfn); +#endif +} --- head-2011-03-17.orig/arch/x86/kernel/early_printk-xen.c 2011-02-01 15:03:10.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/early_printk-xen.c 2011-02-01 15:09:47.000000000 +0100 @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -271,6 +272,18 @@ static int __init 
setup_early_printk(cha if (!strncmp(buf, "xen", 3)) early_console_register(&xenboot_console, keep); #endif +#ifdef CONFIG_X86_MRST_EARLY_PRINTK + if (!strncmp(buf, "mrst", 4)) { + mrst_early_console_init(); + early_console_register(&early_mrst_console, keep); + } + + if (!strncmp(buf, "hsu", 3)) { + hsu_early_console_init(); + early_console_register(&early_hsu_console, keep); + } + +#endif buf++; } return 0; --- head-2011-03-17.orig/arch/x86/kernel/entry_32-xen.S 2011-02-01 15:04:27.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/entry_32-xen.S 2011-02-01 15:09:47.000000000 +0100 @@ -119,8 +119,7 @@ NMI_MASK = 0x80000000 /* unfortunately push/pop can't be no-op */ .macro PUSH_GS - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 .endm .macro POP_GS pop=0 addl $(4 + \pop), %esp @@ -144,14 +143,12 @@ NMI_MASK = 0x80000000 #else /* CONFIG_X86_32_LAZY_GS */ .macro PUSH_GS - pushl %gs - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %gs /*CFI_REL_OFFSET gs, 0*/ .endm .macro POP_GS pop=0 -98: popl %gs - CFI_ADJUST_CFA_OFFSET -4 +98: popl_cfi %gs /*CFI_RESTORE gs*/ .if \pop <> 0 add $\pop, %esp @@ -199,35 +196,25 @@ NMI_MASK = 0x80000000 .macro SAVE_ALL cld PUSH_GS - pushl %fs - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %fs /*CFI_REL_OFFSET fs, 0;*/ - pushl %es - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %es /*CFI_REL_OFFSET es, 0;*/ - pushl %ds - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ds /*CFI_REL_OFFSET ds, 0;*/ - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax CFI_REL_OFFSET eax, 0 - pushl %ebp - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebp CFI_REL_OFFSET ebp, 0 - pushl %edi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %edi CFI_REL_OFFSET edi, 0 - pushl %esi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %esi CFI_REL_OFFSET esi, 0 - pushl %edx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %edx CFI_REL_OFFSET edx, 0 - pushl %ecx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ecx CFI_REL_OFFSET ecx, 0 - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebx CFI_REL_OFFSET ebx, 0 movl $(__USER_DS), %edx movl %edx, %ds @@ -238,39 +225,29 @@ NMI_MASK = 0x80000000 .endm .macro RESTORE_INT_REGS - popl %ebx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ebx CFI_RESTORE ebx - popl %ecx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ecx CFI_RESTORE ecx - popl %edx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %edx CFI_RESTORE edx - popl %esi - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %esi CFI_RESTORE esi - popl %edi - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %edi CFI_RESTORE edi - popl %ebp - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ebp CFI_RESTORE ebp - popl %eax - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %eax CFI_RESTORE eax .endm .macro RESTORE_REGS pop=0 RESTORE_INT_REGS -1: popl %ds - CFI_ADJUST_CFA_OFFSET -4 +1: popl_cfi %ds /*CFI_RESTORE ds;*/ -2: popl %es - CFI_ADJUST_CFA_OFFSET -4 +2: popl_cfi %es /*CFI_RESTORE es;*/ -3: popl %fs - CFI_ADJUST_CFA_OFFSET -4 +3: popl_cfi %fs /*CFI_RESTORE fs;*/ POP_GS \pop .pushsection .fixup, "ax" @@ -324,16 +301,12 @@ NMI_MASK = 0x80000000 ENTRY(ret_from_fork) CFI_STARTPROC - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax call schedule_tail GET_THREAD_INFO(%ebp) - popl %eax - CFI_ADJUST_CFA_OFFSET -4 - pushl $0x0202 # Reset kernel eflags - CFI_ADJUST_CFA_OFFSET 4 - popfl - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %eax + pushl_cfi $0x0202 # Reset kernel eflags + popfl_cfi jmp syscall_exit CFI_ENDPROC END(ret_from_fork) @@ -413,29 +386,23 @@ sysenter_past_esp: * enough kernel state to call TRACE_IRQS_OFF can be called - but * we immediately enable interrupts at that point anyway. 
*/ - pushl $(__USER_DS) - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $__USER_DS /*CFI_REL_OFFSET ss, 0*/ - pushl %ebp - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebp CFI_REL_OFFSET esp, 0 - pushfl + pushfl_cfi orl $X86_EFLAGS_IF, (%esp) - CFI_ADJUST_CFA_OFFSET 4 - pushl $(__USER_CS) - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $__USER_CS /*CFI_REL_OFFSET cs, 0*/ /* * Push current_thread_info()->sysenter_return to the stack. * A tiny bit of offset fixup is necessary - 4*4 means the 4 words * pushed above; +8 corresponds to copy_thread's esp0 setting. */ - pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi ((TI_sysenter_return)-THREAD_SIZE_asm+8+4*4)(%esp) CFI_REL_OFFSET eip, 0 - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax SAVE_ALL ENABLE_INTERRUPTS(CLBR_NONE) @@ -490,8 +457,7 @@ sysenter_audit: movl %eax,%edx /* 2nd arg: syscall number */ movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ call audit_syscall_entry - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebx movl PT_EAX(%esp),%eax /* reload syscall number */ jmp sysenter_do_call @@ -535,8 +501,7 @@ ENTRY(ia32pv_sysenter_target) addl $4,%esp CFI_ADJUST_CFA_OFFSET -4 /* +5*4 is SS:ESP,EFLAGS,CS:EIP. +8 is esp0 setting. */ - pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) /* * Load the potential sixth argument from user stack. * Careful about security. @@ -559,8 +524,7 @@ ENDPROC(ia32pv_sysenter_target) # system call handler stub ENTRY(system_call) RING0_INT_FRAME # can't unwind into user space anyway - pushl %eax # save orig_eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax # save orig_eax SAVE_ALL GET_THREAD_INFO(%ebp) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) @@ -609,7 +573,6 @@ restore_nocheck: jnz restore_all_enable_events # != 0 => enable event delivery #endif RESTORE_REGS 4 # skip orig_eax/error_code - CFI_ADJUST_CFA_OFFSET -4 irq_return: INTERRUPT_RETURN .section .fixup,"ax" @@ -663,10 +626,8 @@ ldt_ss: shr $16, %edx mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ - pushl $__ESPFIX_SS - CFI_ADJUST_CFA_OFFSET 4 - push %eax /* new kernel esp */ - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $__ESPFIX_SS + pushl_cfi %eax /* new kernel esp */ /* Disable interrupts, but do not irqtrace this section: we * will soon execute iret and the tracer was already set to * the irqstate after the iret */ @@ -735,11 +696,9 @@ work_notifysig: # deal with pending s ALIGN work_notifysig_v86: - pushl %ecx # save ti_flags for do_notify_resume - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ecx # save ti_flags for do_notify_resume call save_v86_state # %eax contains pt_regs pointer - popl %ecx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ecx movl %eax, %esp #else movl %esp, %eax @@ -819,14 +778,18 @@ ptregs_##name: \ #define PTREGSCALL3(name) \ ALIGN; \ ptregs_##name: \ + CFI_STARTPROC; \ leal 4(%esp),%eax; \ - pushl %eax; \ + pushl_cfi %eax; \ movl PT_EDX(%eax),%ecx; \ movl PT_ECX(%eax),%edx; \ movl PT_EBX(%eax),%eax; \ call sys_##name; \ addl $4,%esp; \ - ret + CFI_ADJUST_CFA_OFFSET -4; \ + ret; \ + CFI_ENDPROC; \ +ENDPROC(ptregs_##name) PTREGSCALL1(iopl) PTREGSCALL0(fork) @@ -841,15 +804,19 @@ PTREGSCALL1(vm86old) /* Clone is an oddball. 
The 4th arg is in %edi */ ALIGN; ptregs_clone: + CFI_STARTPROC leal 4(%esp),%eax - pushl %eax - pushl PT_EDI(%eax) + pushl_cfi %eax + pushl_cfi PT_EDI(%eax) movl PT_EDX(%eax),%ecx movl PT_ECX(%eax),%edx movl PT_EBX(%eax),%eax call sys_clone addl $8,%esp + CFI_ADJUST_CFA_OFFSET -8 ret + CFI_ENDPROC +ENDPROC(ptregs_clone) #ifndef CONFIG_XEN .macro FIXUP_ESPFIX_STACK @@ -865,10 +832,8 @@ ptregs_clone: mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ shl $16, %eax addl %esp, %eax /* the adjusted stack pointer */ - pushl $__KERNEL_DS - CFI_ADJUST_CFA_OFFSET 4 - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $__KERNEL_DS + pushl_cfi %eax lss (%esp), %esp /* switch to the normal stack segment */ CFI_ADJUST_CFA_OFFSET -8 .endm @@ -905,8 +870,7 @@ vector=FIRST_EXTERNAL_VECTOR .if vector <> FIRST_EXTERNAL_VECTOR CFI_ADJUST_CFA_OFFSET -4 .endif -1: pushl $(~vector+0x80) /* Note: always in signed byte range */ - CFI_ADJUST_CFA_OFFSET 4 +1: pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */ .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 jmp 2f .endif @@ -946,8 +910,7 @@ ENDPROC(common_interrupt) #define BUILD_INTERRUPT3(name, nr, fn) \ ENTRY(name) \ RING0_INT_FRAME; \ - pushl $~(nr); \ - CFI_ADJUST_CFA_OFFSET 4; \ + pushl_cfi $~(nr); \ SAVE_ALL; \ TRACE_IRQS_OFF \ movl %esp,%eax; \ @@ -984,8 +947,7 @@ ENDPROC(name) # so we can simply throw away the new one. ENTRY(hypervisor_callback) RING0_INT_FRAME - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax SAVE_ALL movl PT_CS(%esp),%ecx movl PT_EIP(%esp),%eax @@ -1005,8 +967,7 @@ ENTRY(hypervisor_callback) addl $PT_OLDESP,%esp # Remove eflags...ebx from stack frame. #endif .Ldo_upcall: - push %esp - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %esp call evtchn_do_upcall add $4,%esp CFI_ADJUST_CFA_OFFSET -4 @@ -1111,21 +1072,18 @@ ENTRY(failsafe_callback) ENTRY(coprocessor_error) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_coprocessor_error - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi $do_coprocessor_error jmp error_code CFI_ENDPROC END(coprocessor_error) ENTRY(simd_coprocessor_error) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 #ifdef CONFIG_X86_INVD_BUG /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ -661: pushl $do_general_protection +661: pushl_cfi $do_general_protection 662: .section .altinstructions,"a" .balign 4 @@ -1140,19 +1098,16 @@ ENTRY(simd_coprocessor_error) 664: .previous #else - pushl $do_simd_coprocessor_error + pushl_cfi $do_simd_coprocessor_error #endif - CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC END(simd_coprocessor_error) ENTRY(device_not_available) RING0_INT_FRAME - pushl $-1 # mark this as an int - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_device_not_available - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $-1 # mark this as an int + pushl_cfi $do_device_not_available jmp error_code CFI_ENDPROC END(device_not_available) @@ -1174,82 +1129,68 @@ END(native_irq_enable_sysexit) ENTRY(overflow) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_overflow - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi $do_overflow jmp error_code CFI_ENDPROC END(overflow) ENTRY(bounds) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_bounds - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi $do_bounds jmp error_code CFI_ENDPROC END(bounds) ENTRY(invalid_op) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_invalid_op - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi $do_invalid_op jmp error_code CFI_ENDPROC END(invalid_op) 
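The entry_32-xen.S hunks above and below replace each explicit push/pop instruction plus its separate CFI_ADJUST_CFA_OFFSET directive with the combined pushl_cfi/popl_cfi (and pushfl_cfi/popfl_cfi) macros. As a rough illustrative sketch, not quoted from the dwarf2 header that carries the in-tree definitions, such a macro pair simply bundles the stack operation with its CFA adjustment:

	.macro pushl_cfi reg
	pushl \reg			/* push the operand ... */
	CFI_ADJUST_CFA_OFFSET 4		/* ... and record the 4-byte CFA change */
	.endm

	.macro popl_cfi reg
	popl \reg			/* pop the operand ... */
	CFI_ADJUST_CFA_OFFSET -4	/* ... and undo the CFA change */
	.endm

Keeping the stack operation and its unwind annotation in a single macro means the two cannot drift apart as entry code is edited, so the conversion is purely mechanical and leaves the generated code unchanged.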
ENTRY(coprocessor_segment_overrun) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_coprocessor_segment_overrun - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi $do_coprocessor_segment_overrun jmp error_code CFI_ENDPROC END(coprocessor_segment_overrun) ENTRY(invalid_TSS) RING0_EC_FRAME - pushl $do_invalid_TSS - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $do_invalid_TSS jmp error_code CFI_ENDPROC END(invalid_TSS) ENTRY(segment_not_present) RING0_EC_FRAME - pushl $do_segment_not_present - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $do_segment_not_present jmp error_code CFI_ENDPROC END(segment_not_present) ENTRY(stack_segment) RING0_EC_FRAME - pushl $do_stack_segment - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $do_stack_segment jmp error_code CFI_ENDPROC END(stack_segment) ENTRY(alignment_check) RING0_EC_FRAME - pushl $do_alignment_check - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $do_alignment_check jmp error_code CFI_ENDPROC END(alignment_check) ENTRY(divide_error) RING0_INT_FRAME - pushl $0 # no error code - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_divide_error - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 # no error code + pushl_cfi $do_divide_error jmp error_code CFI_ENDPROC END(divide_error) @@ -1257,10 +1198,8 @@ END(divide_error) #ifdef CONFIG_X86_MCE ENTRY(machine_check) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl machine_check_vector - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi machine_check_vector jmp error_code CFI_ENDPROC END(machine_check) @@ -1269,18 +1208,15 @@ END(machine_check) #ifndef CONFIG_XEN ENTRY(spurious_interrupt_bug) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_spurious_interrupt_bug - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi $do_spurious_interrupt_bug jmp error_code CFI_ENDPROC #endif /* !CONFIG_XEN */ ENTRY(fixup_4gb_segment) RING0_EC_FRAME - pushl $do_fixup_4gb_segment - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $do_fixup_4gb_segment jmp error_code CFI_ENDPROC END(spurious_interrupt_bug) @@ -1413,8 +1349,7 @@ ENTRY(ia32pv_cstar_target) movl %ebp,%ecx movl $__USER_CS,4(%esp) movl 12(%esp),%ebp - pushl %eax # save orig_eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax # save orig_eax /* * Load the potential sixth argument from user stack. * Careful about security. 
@@ -1545,40 +1480,29 @@ mask=0 ENTRY(page_fault) RING0_EC_FRAME - pushl $do_page_fault - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $do_page_fault ALIGN error_code: /* the function address is in %gs's slot on the stack */ - pushl %fs - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %fs /*CFI_REL_OFFSET fs, 0*/ - pushl %es - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %es /*CFI_REL_OFFSET es, 0*/ - pushl %ds - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ds /*CFI_REL_OFFSET ds, 0*/ - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax CFI_REL_OFFSET eax, 0 - pushl %ebp - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebp CFI_REL_OFFSET ebp, 0 - pushl %edi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %edi CFI_REL_OFFSET edi, 0 - pushl %esi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %esi CFI_REL_OFFSET esi, 0 - pushl %edx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %edx CFI_REL_OFFSET edx, 0 - pushl %ecx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ecx CFI_REL_OFFSET ecx, 0 - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebx CFI_REL_OFFSET ebx, 0 cld movl $(__KERNEL_PERCPU), %ecx @@ -1621,12 +1545,9 @@ END(page_fault) movl TSS_sysenter_sp0 + \offset(%esp), %esp CFI_DEF_CFA esp, 0 CFI_UNDEFINED eip - pushfl - CFI_ADJUST_CFA_OFFSET 4 - pushl $__KERNEL_CS - CFI_ADJUST_CFA_OFFSET 4 - pushl $sysenter_past_esp - CFI_ADJUST_CFA_OFFSET 4 + pushfl_cfi + pushl_cfi $__KERNEL_CS + pushl_cfi $sysenter_past_esp CFI_REL_OFFSET eip, 0 .endm #endif /* CONFIG_XEN */ @@ -1639,8 +1560,7 @@ ENTRY(debug) FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn debug_stack_correct: #endif /* !CONFIG_XEN */ - pushl $-1 # mark this as an int - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $-1 # mark this as an int SAVE_ALL TRACE_IRQS_OFF xorl %edx,%edx # error code 0 @@ -1660,33 +1580,28 @@ END(debug) */ ENTRY(nmi) RING0_INT_FRAME - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax #ifndef CONFIG_XEN movl %ss, %eax cmpw $__ESPFIX_SS, %ax - popl %eax - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %eax je nmi_espfix_stack cmpl $ia32_sysenter_target,(%esp) je nmi_stack_fixup - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax movl %esp,%eax /* Do not access memory above the end of our stack page, * it might not exist. 
*/ andl $(THREAD_SIZE-1),%eax cmpl $(THREAD_SIZE-20),%eax - popl %eax - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %eax jae nmi_stack_correct cmpl $ia32_sysenter_target,12(%esp) je nmi_debug_stack_check nmi_stack_correct: /* We have a RING0_INT_FRAME here */ - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax SAVE_ALL xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer @@ -1715,18 +1630,14 @@ nmi_espfix_stack: * * create the pointer to lss back */ - pushl %ss - CFI_ADJUST_CFA_OFFSET 4 - pushl %esp - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ss + pushl_cfi %esp addl $4, (%esp) /* copy the iret frame of 12 bytes */ .rept 3 - pushl 16(%esp) - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi 16(%esp) .endr - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax SAVE_ALL FIXUP_ESPFIX_STACK # %eax == %esp xorl %edx,%edx # zero error code @@ -1748,8 +1659,7 @@ END(nmi) ENTRY(int3) RING0_INT_FRAME - pushl $-1 # mark this as an int - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $-1 # mark this as an int SAVE_ALL TRACE_IRQS_OFF xorl %edx,%edx # zero error code @@ -1761,8 +1671,7 @@ END(int3) ENTRY(general_protection) RING0_EC_FRAME - pushl $do_general_protection - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $do_general_protection jmp error_code CFI_ENDPROC END(general_protection) --- head-2011-03-17.orig/arch/x86/kernel/entry_64-xen.S 2011-02-01 15:04:27.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/entry_64-xen.S 2011-02-01 15:09:47.000000000 +0100 @@ -204,23 +204,17 @@ NMI_MASK = 0x80000000 .macro FAKE_STACK_FRAME child_rip /* push in order ss, rsp, eflags, cs, rip */ xorl %eax, %eax - pushq $__KERNEL_DS /* ss */ - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi $__KERNEL_DS /* ss */ /*CFI_REL_OFFSET ss,0*/ - pushq %rax /* rsp */ - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rax /* rsp */ CFI_REL_OFFSET rsp,0 - pushq $X86_EFLAGS_IF /* eflags - interrupts on */ - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi $X86_EFLAGS_IF /* eflags - interrupts on */ /*CFI_REL_OFFSET rflags,0*/ - pushq $__KERNEL_CS /* cs */ - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi $__KERNEL_CS /* cs */ /*CFI_REL_OFFSET cs,0*/ - pushq \child_rip /* rip */ - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi \child_rip /* rip */ CFI_REL_OFFSET rip,0 - pushq %rax /* orig rax */ - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rax /* orig rax */ .endm .macro UNFAKE_STACK_FRAME @@ -333,6 +327,7 @@ NMI_MASK = 0x80000000 #ifndef CONFIG_XEN /* save partial stack frame */ + .pushsection .kprobes.text, "ax" ENTRY(save_args) XCPT_FRAME cld @@ -372,6 +367,7 @@ ENTRY(save_args) ret CFI_ENDPROC END(save_args) + .popsection #endif ENTRY(save_rest) @@ -433,10 +429,8 @@ ENTRY(ret_from_fork) LOCK ; btr $TIF_FORK,TI_flags(%r8) - push kernel_eflags(%rip) - CFI_ADJUST_CFA_OFFSET 8 - popf # reset kernel eflags - CFI_ADJUST_CFA_OFFSET -8 + pushq_cfi kernel_eflags(%rip) + popfq_cfi # reset kernel eflags call schedule_tail # rdi: 'prev' task parameter @@ -532,11 +526,9 @@ sysret_careful: jnc sysret_signal TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - pushq %rdi - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rdi call schedule - popq %rdi - CFI_ADJUST_CFA_OFFSET -8 + popq_cfi %rdi jmp sysret_check /* Handle a signal */ @@ -649,11 +641,9 @@ int_careful: jnc int_very_careful TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - pushq %rdi - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rdi call schedule - popq %rdi - CFI_ADJUST_CFA_OFFSET -8 + popq_cfi %rdi DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check @@ -667,12 +657,10 @@ int_check_syscall_exit_work: /* Check for syscall exit trace */ testl $_TIF_WORK_SYSCALL_EXIT,%edx jz int_signal - 
pushq %rdi - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rdi leaq 8(%rsp),%rdi # &ptregs -> arg1 call syscall_trace_leave - popq %rdi - CFI_ADJUST_CFA_OFFSET -8 + popq_cfi %rdi andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi jmp int_restore_rest @@ -729,9 +717,8 @@ END(ptregscall_common) ENTRY(stub_execve) CFI_STARTPROC - popq %r11 - CFI_ADJUST_CFA_OFFSET -8 - CFI_REGISTER rip, r11 + addq $8, %rsp + PARTIAL_FRAME 0 SAVE_REST FIXUP_TOP_OF_STACK %r11 movq %rsp, %rcx @@ -750,7 +737,7 @@ END(stub_execve) ENTRY(stub_rt_sigreturn) CFI_STARTPROC addq $8, %rsp - CFI_ADJUST_CFA_OFFSET -8 + PARTIAL_FRAME 0 SAVE_REST movq %rsp,%rdi FIXUP_TOP_OF_STACK %r11 @@ -792,11 +779,9 @@ retint_careful: jnc retint_signal TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - pushq %rdi - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rdi call schedule - popq %rdi - CFI_ADJUST_CFA_OFFSET -8 + popq_cfi %rdi GET_THREAD_INFO(%rcx) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF @@ -842,8 +827,7 @@ END(retint_check) .macro apicinterrupt num sym do_sym ENTRY(\sym) INTR_FRAME - pushq $~(\num) - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi $~(\num) interrupt \do_sym jmp error_entry CFI_ENDPROC @@ -867,22 +851,10 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \ x86_platform_ipi smp_x86_platform_ipi #ifdef CONFIG_SMP -apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ - invalidate_interrupt0 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+1 \ - invalidate_interrupt1 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+2 \ - invalidate_interrupt2 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+3 \ - invalidate_interrupt3 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+4 \ - invalidate_interrupt4 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+5 \ - invalidate_interrupt5 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+6 \ - invalidate_interrupt6 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \ - invalidate_interrupt7 smp_invalidate_interrupt +.irpc idx, "01234567" +apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \ + invalidate_interrupt\idx smp_invalidate_interrupt +.endr #endif apicinterrupt THRESHOLD_APIC_VECTOR \ @@ -909,9 +881,9 @@ apicinterrupt ERROR_APIC_VECTOR \ apicinterrupt SPURIOUS_APIC_VECTOR \ spurious_interrupt smp_spurious_interrupt -#ifdef CONFIG_PERF_EVENTS -apicinterrupt LOCAL_PENDING_VECTOR \ - perf_pending_interrupt smp_perf_pending_interrupt +#ifdef CONFIG_IRQ_WORK +apicinterrupt IRQ_WORK_VECTOR \ + irq_work_interrupt smp_irq_work_interrupt #endif #endif /* !CONFIG_XEN */ @@ -926,8 +898,8 @@ ENTRY(\sym) movq 8(%rsp),%r11 CFI_RESTORE r11 movq $-1,8(%rsp) /* ORIG_RAX: no syscall to restart */ - subq $(15-1)*8,%rsp - CFI_ADJUST_CFA_OFFSET (15-1)*8 + subq $ORIG_RAX-R15-1*8,%rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15-1*8 call error_entry DEFAULT_FRAME 0 movq %rsp,%rdi /* pt_regs pointer */ @@ -953,8 +925,8 @@ ENTRY(\sym) CFI_RESTORE rcx movq 8(%rsp),%r11 CFI_RESTORE r11 - subq $(15-2)*8,%rsp - CFI_ADJUST_CFA_OFFSET (15-2)*8 + subq $ORIG_RAX-R15-2*8,%rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15-2*8 call error_entry DEFAULT_FRAME 0 movq %rsp,%rdi /* pt_regs pointer */ @@ -1074,8 +1046,7 @@ ENTRY(failsafe_callback) CFI_RESTORE r11 addq $0x30,%rsp CFI_ADJUST_CFA_OFFSET -0x30 - pushq $0 - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi $0 SAVE_ALL jmp error_exit CFI_ENDPROC @@ -1143,8 +1114,7 @@ END(kernel_execve) /* Call softirq on interrupt stack. Interrupts are off. 
*/ ENTRY(call_softirq) CFI_STARTPROC - push %rbp - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rbp CFI_REL_OFFSET rbp,0 mov %rsp,%rbp CFI_DEF_CFA_REGISTER rbp @@ -1153,6 +1123,7 @@ ENTRY(call_softirq) push %rbp # backlink for old unwinder call __do_softirq leaveq + CFI_RESTORE rbp CFI_DEF_CFA_REGISTER rsp CFI_ADJUST_CFA_OFFSET -8 decl PER_CPU_VAR(irq_count) @@ -1191,7 +1162,7 @@ paranoidzeroentry machine_check *machine /* ebx: no swapgs flag */ ENTRY(paranoid_exit) - INTR_FRAME + DEFAULT_FRAME DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testl %ebx,%ebx /* swapgs needed? */ @@ -1271,7 +1242,6 @@ error_sti: #endif TRACE_IRQS_OFF ret - CFI_ENDPROC #ifndef CONFIG_XEN /* @@ -1298,6 +1268,7 @@ bstep_iret: movq %rcx,RIP+8(%rsp) jmp error_swapgs #endif + CFI_ENDPROC END(error_entry) @@ -1338,11 +1309,9 @@ END(do_nmi_callback) #ifndef CONFIG_IA32_EMULATION ENTRY(ignore_sysret) INTR_FRAME - popq %rcx - CFI_ADJUST_CFA_OFFSET -8 + popq_cfi %rcx CFI_RESTORE rcx - popq %r11 - CFI_ADJUST_CFA_OFFSET -8 + popq_cfi %r11 CFI_RESTORE r11 mov $-ENOSYS,%eax HYPERVISOR_IRET 0 --- head-2011-03-17.orig/arch/x86/kernel/head-xen.c 2011-02-01 14:55:46.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/head-xen.c 2011-02-01 15:09:47.000000000 +0100 @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -53,7 +54,7 @@ void __init reserve_ebda_region(void) lowmem = 0x9f000; /* reserve all memory between lowmem and the 1MB mark */ - reserve_early_overlap_ok(lowmem, 0x100000, "BIOS reserved"); + memblock_x86_reserve_range(lowmem, 0x100000, "* BIOS reserved"); } #else /* CONFIG_XEN */ #include @@ -103,10 +104,12 @@ void __init xen_start_kernel(void) WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables)); - reserve_early(ALIGN(__pa_symbol(&_end), PAGE_SIZE), - __pa(xen_start_info->pt_base) - + (xen_start_info->nr_pt_frames << PAGE_SHIFT), - "Xen provided"); + memblock_init(); + memblock_x86_reserve_range(ALIGN(__pa_symbol(&_end), PAGE_SIZE), + __pa(xen_start_info->pt_base) + + (xen_start_info->nr_pt_frames + << PAGE_SHIFT), + "Xen provided"); #ifdef CONFIG_X86_32 { --- head-2011-03-17.orig/arch/x86/kernel/head32-xen.c 2011-02-01 15:03:03.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/head32-xen.c 2011-02-01 15:09:47.000000000 +0100 @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -17,6 +18,7 @@ #include #include #include +#include static void __init i386_default_early_setup(void) { @@ -49,17 +51,18 @@ void __init i386_start_kernel(void) BUG_ON(pte_index(hypervisor_virt_start)); #endif + memblock_init(); + #ifdef CONFIG_X86_TRAMPOLINE /* * But first pinch a few for the stack/trampoline stuff * FIXME: Don't need the extra page at 4K, but need to fix * trampoline before removing it. 
(see the GDT stuff) */ - reserve_early_overlap_ok(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, - "EX TRAMPOLINE"); + memblock_x86_reserve_range(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE"); #endif - reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); + memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); #ifndef CONFIG_XEN #ifdef CONFIG_BLK_DEV_INITRD @@ -69,7 +72,7 @@ void __init i386_start_kernel(void) u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); - reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); + memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK"); } #endif --- head-2011-03-17.orig/arch/x86/kernel/head64-xen.c 2011-02-01 14:55:46.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/head64-xen.c 2011-02-01 15:09:47.000000000 +0100 @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -119,7 +120,9 @@ void __init x86_64_start_reservations(ch { copy_bootdata(__va(real_mode_data)); - reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); + memblock_init(); + + memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); /* * At this point everything still needed from the boot loader --- head-2011-03-17.orig/arch/x86/kernel/irq-xen.c 2011-02-01 14:55:46.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/irq-xen.c 2011-02-01 15:09:47.000000000 +0100 @@ -71,10 +71,10 @@ static int show_other_interrupts(struct for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); seq_printf(p, " Performance monitoring interrupts\n"); - seq_printf(p, "%*s: ", prec, "PND"); + seq_printf(p, "%*s: ", prec, "IWI"); for_each_online_cpu(j) - seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); - seq_printf(p, " Performance pending work\n"); + seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs); + seq_printf(p, " IRQ work interrupts\n"); #endif #ifndef CONFIG_XEN if (x86_platform_ipi_callback) { @@ -172,7 +172,7 @@ int show_interrupts(struct seq_file *p, seq_printf(p, "%*d: ", prec, i); for_each_online_cpu(j) seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); - seq_printf(p, " %8s", desc->chip->name); + seq_printf(p, " %8s", desc->irq_data.chip->name); seq_printf(p, "-%-8s", desc->name); if (action) { @@ -198,7 +198,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->apic_timer_irqs; sum += irq_stats(cpu)->irq_spurious_count; sum += irq_stats(cpu)->apic_perf_irqs; - sum += irq_stats(cpu)->apic_pending_irqs; + sum += irq_stats(cpu)->apic_irq_work_irqs; #endif #ifndef CONFIG_XEN if (x86_platform_ipi_callback) @@ -302,6 +302,7 @@ void fixup_irqs(void) unsigned int irq; static int warned; struct irq_desc *desc; + struct irq_data *data; static DECLARE_BITMAP(irqs_used, NR_IRQS); for_each_irq_desc(irq, desc) { @@ -317,7 +318,8 @@ void fixup_irqs(void) /* interrupt's are disabled at this point */ raw_spin_lock(&desc->lock); - affinity = desc->affinity; + data = &desc->irq_data; + affinity = data->affinity; if (!irq_has_action(irq) || cpumask_subset(affinity, cpu_online_mask)) { raw_spin_unlock(&desc->lock); @@ -332,16 +334,16 @@ void fixup_irqs(void) affinity = cpu_all_mask; } - if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->mask) - desc->chip->mask(irq); + if (!(desc->status & IRQ_MOVE_PCNTXT) && data->chip->irq_mask) + data->chip->irq_mask(data); - if (desc->chip->set_affinity) - 
desc->chip->set_affinity(irq, affinity); - else if (desc->chip != &no_irq_chip && !(warned++)) + if (data->chip->irq_set_affinity) + data->chip->irq_set_affinity(data, affinity, true); + else if (data->chip != &no_irq_chip && !(warned++)) set_affinity = 0; - if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask) - desc->chip->unmask(irq); + if (!(desc->status & IRQ_MOVE_PCNTXT) && data->chip->irq_unmask) + data->chip->irq_unmask(data); raw_spin_unlock(&desc->lock); @@ -367,9 +369,10 @@ void fixup_irqs(void) continue; if (xen_test_irq_pending(irq)) { + data = irq_get_irq_data(irq); raw_spin_lock(&desc->lock); - if (desc->chip->retrigger) - desc->chip->retrigger(irq); + if (data->chip->irq_retrigger) + data->chip->irq_retrigger(data); raw_spin_unlock(&desc->lock); } } --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ head-2011-03-17/arch/x86/kernel/irq_work-xen.c 2011-02-03 11:19:35.000000000 +0100 @@ -0,0 +1,23 @@ +/* + * x86/Xen specific code for irq_work + */ + +#include +#include +#include +#include + +#ifdef CONFIG_SMP +irqreturn_t smp_irq_work_interrupt(int irq, void *dev_id) +{ + inc_irq_stat(apic_irq_work_irqs); + irq_work_run(); + + return IRQ_HANDLED; +} + +void arch_irq_work_raise(void) +{ + xen_send_IPI_self(IRQ_WORK_VECTOR); +} +#endif --- head-2011-03-17.orig/arch/x86/kernel/microcode_core-xen.c 2011-02-01 15:03:10.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/microcode_core-xen.c 2011-02-01 15:09:47.000000000 +0100 @@ -12,7 +12,7 @@ * Software Developer's Manual * Order Number 253668 or free download from: * - * http://developer.intel.com/design/pentium4/manuals/253668.htm + * http://developer.intel.com/Assets/PDF/manual/253668.pdf * * For more information, go to http://www.urbanmyth.org/microcode * @@ -117,6 +117,7 @@ static const struct file_operations micr .owner = THIS_MODULE, .write = microcode_write, .open = microcode_open, + .llseek = no_llseek, }; static struct miscdevice microcode_dev = { --- head-2011-03-17.orig/arch/x86/kernel/mpparse-xen.c 2011-02-01 15:04:27.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/mpparse-xen.c 2011-02-01 15:09:47.000000000 +0100 @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -686,7 +687,7 @@ static void __init smp_reserve_memory(st { unsigned long size = get_mpc_size(mpf->physptr); - reserve_early_overlap_ok(mpf->physptr, mpf->physptr+size, "MP-table mpc"); + memblock_x86_reserve_range(mpf->physptr, mpf->physptr+size, "* MP-table mpc"); } #endif @@ -719,7 +720,7 @@ static int __init smp_scan_config(unsign mpf, (u64)virt_to_phys(mpf)); mem = virt_to_phys(mpf); - reserve_early_overlap_ok(mem, mem + sizeof(*mpf), "MP-table mpf"); + memblock_x86_reserve_range(mem, mem + sizeof(*mpf), "* MP-table mpf"); if (mpf->physptr) smp_reserve_memory(mpf); #else --- head-2011-03-17.orig/arch/x86/kernel/pci-dma-xen.c 2011-02-01 15:04:27.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/pci-dma-xen.c 2011-02-01 15:09:47.000000000 +0100 @@ -11,8 +11,8 @@ #include #include #include -#include #include +#include static int forbid_dac __read_mostly; @@ -44,6 +44,8 @@ int iommu_detected __read_mostly = 0; */ int iommu_pass_through __read_mostly; +extern struct iommu_table_entry __iommu_table[], __iommu_table_end[]; + /* Dummy device used for NULL arguments (normally ISA). 
*/ struct device x86_dma_fallback_dev = { .init_name = "fallback device", @@ -142,7 +144,10 @@ static struct dma_map_ops swiotlb_dma_op .dma_supported = swiotlb_dma_supported }; -#define pci_xen_swiotlb_detect() 1 +static int __init pci_xen_swiotlb_detect(void) +{ + return 1; +} static void __init pci_xen_swiotlb_init(void) { @@ -153,26 +158,28 @@ static void __init pci_xen_swiotlb_init( } } +IOMMU_INIT_FINISH(pci_xen_swiotlb_detect, NULL, pci_xen_swiotlb_init, NULL); + void __init pci_iommu_alloc(void) { + struct iommu_table_entry *p; + /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); - if (pci_xen_swiotlb_detect() || pci_swiotlb_detect()) - goto out; - - gart_iommu_hole_init(); - - detect_calgary(); + sort_iommu_table(__iommu_table, __iommu_table_end); + check_iommu_entries(__iommu_table, __iommu_table_end); - detect_intel_iommu(); - - /* needs to be called after gart_iommu_hole_init */ - amd_iommu_detect(); -out: - pci_xen_swiotlb_init(); + for (p = __iommu_table; p < __iommu_table_end; p++) { + if (p && p->detect && p->detect() > 0) { + p->flags |= IOMMU_DETECTED; + if (p->early_init) + p->early_init(); + if (p->flags & IOMMU_FINISH_IF_DETECTED) + break; + } + } } - void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag) { @@ -375,6 +382,7 @@ EXPORT_SYMBOL(dma_supported); static int __init pci_iommu_init(void) { + struct iommu_table_entry *p; dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES); #ifdef CONFIG_PCI @@ -382,14 +390,10 @@ static int __init pci_iommu_init(void) #endif x86_init.iommu.iommu_init(); -#ifndef CONFIG_XEN - if (swiotlb || xen_swiotlb) { - printk(KERN_INFO "PCI-DMA: " - "Using software bounce buffering for IO (SWIOTLB)\n"); - swiotlb_print_info(); - } else - swiotlb_free(); -#endif + for (p = __iommu_table; p < __iommu_table_end; p++) { + if (p && (p->flags & IOMMU_DETECTED) && p->late_init) + p->late_init(); + } return 0; } --- head-2011-03-17.orig/arch/x86/kernel/setup-xen.c 2011-03-03 16:25:01.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/setup-xen.c 2011-03-03 16:25:11.000000000 +0100 @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -83,7 +84,6 @@ #include #include #include -#include #include #include #include @@ -107,11 +107,12 @@ #include #include #include -#include +#include #ifdef CONFIG_X86_64 #include #endif #include +#include #ifdef CONFIG_XEN #include @@ -155,7 +156,6 @@ unsigned long max_pfn_mapped; RESERVE_BRK(dmi_alloc, 65536); #endif -unsigned int boot_cpu_id __read_mostly; static __initdata unsigned long _brk_start = (unsigned long)__brk_base; unsigned long _brk_end = (unsigned long)__brk_base; @@ -337,7 +337,7 @@ static inline void init_gbpages(void) static void __init reserve_brk(void) { if (_brk_end > _brk_start) - reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK"); + memblock_x86_reserve_range(__pa(_brk_start), __pa(_brk_end), "BRK"); /* Mark brk area as locked down and no longer taking any new allocations */ @@ -360,17 +360,16 @@ static void __init relocate_initrd(void) char *p, *q; /* We need to move the initrd down into lowmem */ - ramdisk_here = find_e820_area(0, end_of_lowmem, area_size, + ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size, PAGE_SIZE); - if (ramdisk_here == -1ULL) + if (ramdisk_here == MEMBLOCK_ERROR) panic("Cannot find place for new RAMDISK of size %lld\n", ramdisk_size); /* Note: this includes all the lowmem currently occupied by the initrd, we rely on that fact to keep the data intact. 
*/ - reserve_early(ramdisk_here, ramdisk_here + area_size, - "NEW RAMDISK"); + memblock_x86_reserve_range(ramdisk_here, ramdisk_here + area_size, "NEW RAMDISK"); initrd_start = ramdisk_here + PAGE_OFFSET; initrd_end = initrd_start + ramdisk_size; printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n", @@ -443,7 +442,7 @@ static void __init reserve_initrd(void) initrd_start = 0; if (ramdisk_size >= (end_of_lowmem>>1)) { - free_early(ramdisk_image, ramdisk_end); + memblock_x86_free_range(ramdisk_image, ramdisk_end); printk(KERN_ERR "initrd too large to handle, " "disabling initrd\n"); return; @@ -469,7 +468,7 @@ static void __init reserve_initrd(void) relocate_initrd(); - free_early(ramdisk_image, ramdisk_end); + memblock_x86_free_range(ramdisk_image, ramdisk_end); } #else static void __init reserve_initrd(void) @@ -529,7 +528,7 @@ static void __init e820_reserve_setup_da #endif } -static void __init reserve_early_setup_data(void) +static void __init memblock_x86_reserve_range_setup_data(void) { #ifndef CONFIG_XEN struct setup_data *data; @@ -542,7 +541,7 @@ static void __init reserve_early_setup_d while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); sprintf(buf, "setup data %x", data->type); - reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); + memblock_x86_reserve_range(pa_data, pa_data+sizeof(*data)+data->len, buf); pa_data = data->next; early_iounmap(data, sizeof(*data)); } @@ -565,6 +564,18 @@ static inline unsigned long long get_tot return total << PAGE_SHIFT; } +/* + * Keep the crash kernel below this limit. On 32 bits earlier kernels + * would limit the kernel to the low 512 MiB due to mapping restrictions. + * On 64 bits, kexec-tools currently limits us to 896 MiB; increase this + * limit once kexec-tools are fixed. 
+ */ +#ifdef CONFIG_X86_32 +# define CRASH_KERNEL_ADDR_MAX (512 << 20) +#else +# define CRASH_KERNEL_ADDR_MAX (896 << 20) +#endif + static void __init reserve_crashkernel(void) { unsigned long long total_mem; @@ -582,23 +593,27 @@ static void __init reserve_crashkernel(v if (crash_base <= 0) { const unsigned long long alignment = 16<<20; /* 16M */ - crash_base = find_e820_area(alignment, ULONG_MAX, crash_size, - alignment); - if (crash_base == -1ULL) { + /* + * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX + */ + crash_base = memblock_find_in_range(alignment, + CRASH_KERNEL_ADDR_MAX, crash_size, alignment); + + if (crash_base == MEMBLOCK_ERROR) { pr_info("crashkernel reservation failed - No suitable area found.\n"); return; } } else { unsigned long long start; - start = find_e820_area(crash_base, ULONG_MAX, crash_size, - 1<<20); + start = memblock_find_in_range(crash_base, + crash_base + crash_size, crash_size, 1<<20); if (start != crash_base) { pr_info("crashkernel reservation failed - memory is in use.\n"); return; } } - reserve_early(crash_base, crash_base + crash_size, "CRASH KERNEL"); + memblock_x86_reserve_range(crash_base, crash_base + crash_size, "CRASH KERNEL"); printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " "for crashkernel (System RAM: %ldMB)\n", @@ -684,93 +699,27 @@ static __init void reserve_ibft_region(v #ifndef CONFIG_XEN if (size) - reserve_early_overlap_ok(addr, addr + size, "ibft"); + memblock_x86_reserve_range(addr, addr + size, "* ibft"); #endif } -#ifdef CONFIG_X86_RESERVE_LOW_64K -static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) -{ - printk(KERN_NOTICE - "%s detected: BIOS may corrupt low RAM, working around it.\n", - d->ident); - - e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED); - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - - return 0; -} -#endif - -/* List of systems that have known low memory corruption BIOS problems */ -static struct dmi_system_id __initdata bad_bios_dmi_table[] = { -#ifdef CONFIG_X86_RESERVE_LOW_64K - { - .callback = dmi_low_memory_corruption, - .ident = "AMI BIOS", - .matches = { - DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), - }, - }, - { - .callback = dmi_low_memory_corruption, - .ident = "Phoenix BIOS", - .matches = { - DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"), - }, - }, - { - .callback = dmi_low_memory_corruption, - .ident = "Phoenix/MSC BIOS", - .matches = { - DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"), - }, - }, - /* - * AMI BIOS with low memory corruption was found on Intel DG45ID and - * DG45FC boards. - * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will - * match only DMI_BOARD_NAME and see if there is more bad products - * with this vendor. - */ - { - .callback = dmi_low_memory_corruption, - .ident = "AMI BIOS", - .matches = { - DMI_MATCH(DMI_BOARD_NAME, "DG45ID"), - }, - }, - { - .callback = dmi_low_memory_corruption, - .ident = "AMI BIOS", - .matches = { - DMI_MATCH(DMI_BOARD_NAME, "DG45FC"), - }, - }, - /* - * The Dell Inspiron Mini 1012 has DMI_BIOS_VENDOR = "Dell Inc.", so - * match on the product name. 
- */ - { - .callback = dmi_low_memory_corruption, - .ident = "Phoenix BIOS", - .matches = { - DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 1012"), - }, - }, -#endif - {} -}; - #ifndef CONFIG_XEN +static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; + static void __init trim_bios_range(void) { /* * A special case is the first 4Kb of memory; * This is a BIOS owned area, not kernel ram, but generally * not listed as such in the E820 table. + * + * This typically reserves additional memory (64KiB by default) + * since some BIOSes are known to corrupt low memory. See the + * Kconfig help text for X86_RESERVE_LOW. */ - e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED); + e820_update_range(0, ALIGN(reserve_low, PAGE_SIZE), + E820_RAM, E820_RESERVED); + /* * special case: Some BIOSen report the PC BIOS * area (640->1Mb) as ram even though it is not. @@ -779,8 +728,39 @@ static void __init trim_bios_range(void) e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1); sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); } + +static int __init parse_reservelow(char *p) +{ + unsigned long long size; + + if (!p) + return -EINVAL; + + size = memparse(p, &p); + + if (size < 4096) + size = 4096; + + if (size > 640*1024) + size = 640*1024; + + reserve_low = size; + + return 0; +} + +early_param("reservelow", parse_reservelow); #endif +static u64 __init get_max_mapped(void) +{ + u64 end = max_pfn_mapped; + + end <<= PAGE_SHIFT; + + return end; +} + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -798,6 +778,7 @@ void __init setup_arch(char **cmdline_p) { int acpi = 0; int k8 = 0; + unsigned long flags; #ifdef CONFIG_XEN unsigned int i; unsigned long p2m_pages; @@ -820,14 +801,27 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); visws_early_detect(); + +#ifndef CONFIG_XEN + /* + * copy kernel address range established so far and switch + * to the proper swapper page table + */ + clone_pgd_range(swapper_pg_dir + KERNEL_PGD_BOUNDARY, + initial_page_table + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); + + load_cr3(swapper_pg_dir); + __flush_tlb_all(); +#endif #else printk(KERN_INFO "Command line: %s\n", boot_command_line); #endif - /* VMI may relocate the fixmap; do this before touching ioremap area */ - vmi_init(); - - /* OFW also may relocate the fixmap */ + /* + * If we have OLPC OFW, we might end up relocating the fixmap due to + * reserve_top(), so do this before touching the ioremap area. 
+ */ olpc_ofw_detect(); early_trap_init(); @@ -873,7 +867,7 @@ void __init setup_arch(char **cmdline_p) #endif 4)) { efi_enabled = 1; - efi_reserve_early(); + efi_memblock_x86_reserve_range(); } #endif #else /* CONFIG_XEN */ @@ -901,6 +895,7 @@ void __init setup_arch(char **cmdline_p) x86_init.oem.arch_setup(); + iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; setup_memory_map(); parse_setup_data(); /* update the e820_saved too */ @@ -953,11 +948,8 @@ void __init setup_arch(char **cmdline_p) x86_report_nx(); - /* Must be before kernel pagetables are setup */ - vmi_activate(); - /* after early param, so could get panic from serial */ - reserve_early_setup_data(); + memblock_x86_reserve_range_setup_data(); if (acpi_mps_check()) { #if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN) @@ -976,12 +968,9 @@ void __init setup_arch(char **cmdline_p) if (efi_enabled) efi_init(); - if (is_initial_xendomain()) { + if (is_initial_xendomain()) dmi_scan_machine(); - dmi_check_system(bad_bios_dmi_table); - } - /* * VMware detection requires dmi to be available, so this * needs to be done after dmi_scan_machine, for the BP. @@ -1016,8 +1005,6 @@ void __init setup_arch(char **cmdline_p) */ max_pfn = e820_end_of_ram_pfn(); - /* preallocate 4k for mptable mpc */ - early_reserve_e820_mpc_new(); /* update e820 for memory not covered by WB MTRRs */ mtrr_bp_init(); #ifndef CONFIG_XEN @@ -1044,20 +1031,8 @@ void __init setup_arch(char **cmdline_p) max_low_pfn = max_pfn; high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; -#ifndef CONFIG_XEN - max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT; -#endif #endif -#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION - setup_bios_corruption_check(); -#endif - - printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n", - max_pfn_mapped< 1) { xen_send_IPI_allbutself(REBOOT_VECTOR); - /* Don't wait longer than a second */ - wait = USEC_PER_SEC; - while (num_online_cpus() > 1 && wait--) + /* + * Don't wait longer than a second if the caller + * didn't ask us to wait. 
+ */ + timeout = USEC_PER_SEC; + while (num_online_cpus() > 1 && (wait || timeout--)) udelay(1); } --- head-2011-03-17.orig/arch/x86/kernel/traps-xen.c 2011-02-01 15:04:27.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/traps-xen.c 2011-02-01 15:09:47.000000000 +0100 @@ -568,6 +568,7 @@ dotraplinkage void __kprobes do_debug(st if (regs->flags & X86_VM_MASK) { handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); + preempt_conditional_cli(regs); return; } @@ -770,21 +771,10 @@ asmlinkage void math_state_restore(void) } EXPORT_SYMBOL_GPL(math_state_restore); -#ifndef CONFIG_MATH_EMULATION -void math_emulate(struct math_emu_info *info) -{ - printk(KERN_EMERG - "math-emulation not enabled and no coprocessor found.\n"); - printk(KERN_EMERG "killing %s.\n", current->comm); - force_sig(SIGFPE, current); - schedule(); -} -#endif /* CONFIG_MATH_EMULATION */ - dotraplinkage void __kprobes do_device_not_available(struct pt_regs *regs, long error_code) { -#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN) +#ifdef CONFIG_MATH_EMULATION if (read_cr0() & X86_CR0_EM) { struct math_emu_info info = { }; @@ -792,12 +782,12 @@ do_device_not_available(struct pt_regs * info.regs = regs; math_emulate(&info); - } else { - math_state_restore(); /* interrupts still off */ - conditional_sti(regs); + return; } -#else - math_state_restore(); +#endif + math_state_restore(); /* interrupts still off */ +#ifdef CONFIG_X86_32 + conditional_sti(regs); #endif } @@ -880,20 +870,6 @@ void __init trap_init(void) if (ret) printk("HYPERVISOR_set_trap_table failed (%d)\n", ret); -#ifdef CONFIG_X86_32 - if (cpu_has_fxsr) { - printk(KERN_INFO "Enabling fast FPU save and restore... "); - set_in_cr4(X86_CR4_OSFXSR); - printk("done.\n"); - } - if (cpu_has_xmm) { - printk(KERN_INFO - "Enabling unmasked SIMD FPU exception support... "); - set_in_cr4(X86_CR4_OSXMMEXCPT); - printk("done.\n"); - } - -#endif /* * Should be a barrier for any external CPU state: */ --- head-2011-03-17.orig/arch/x86/mm/fault-xen.c 2011-02-01 15:04:27.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/fault-xen.c 2011-02-01 15:09:47.000000000 +0100 @@ -11,6 +11,7 @@ #include /* __kprobes, ... */ #include /* kmmio_handler, ... */ #include /* perf_sw_event */ +#include /* hstate_index_to_shift */ #include /* dotraplinkage, ... */ #include /* pgd_*(), ... */ @@ -160,15 +161,20 @@ is_prefetch(struct pt_regs *regs, unsign static void force_sig_info_fault(int si_signo, int si_code, unsigned long address, - struct task_struct *tsk) + struct task_struct *tsk, int fault) { + unsigned lsb = 0; siginfo_t info; info.si_signo = si_signo; info.si_errno = 0; info.si_code = si_code; info.si_addr = (void __user *)address; - info.si_addr_lsb = si_code == BUS_MCEERR_AR ? 
PAGE_SHIFT : 0; + if (fault & VM_FAULT_HWPOISON_LARGE) + lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); + if (fault & VM_FAULT_HWPOISON) + lsb = PAGE_SHIFT; + info.si_addr_lsb = lsb; force_sig_info(si_signo, &info, tsk); } @@ -176,9 +182,6 @@ force_sig_info_fault(int si_signo, int s DEFINE_SPINLOCK(pgd_lock); LIST_HEAD(pgd_list); -#define pgd_page_table(what, pg) \ - spin_##what(&((struct mm_struct *)(pg)->private)->page_table_lock) - #ifdef CONFIG_X86_32 static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) { @@ -240,13 +243,16 @@ void vmalloc_sync_all(void) spin_lock_irqsave(&pgd_lock, flags); list_for_each_entry(page, &pgd_list, lru) { - pmd_t *pmd; + spinlock_t *pgt_lock; + pmd_t *ret; + + pgt_lock = &pgd_page_get_mm(page)->page_table_lock; - pgd_page_table(lock, page); - pmd = vmalloc_sync_one(page_address(page), address); - pgd_page_table(unlock, page); + spin_lock(pgt_lock); + ret = vmalloc_sync_one(page_address(page), address); + spin_unlock(pgt_lock); - if (!pmd) + if (!ret) break; } spin_unlock_irqrestore(&pgd_lock, flags); @@ -268,6 +274,8 @@ static noinline __kprobes int vmalloc_fa if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; + WARN_ON_ONCE(in_nmi()); + /* * Synchronize this task's top level page-table * with the 'reference' page table. @@ -343,31 +351,7 @@ out: void vmalloc_sync_all(void) { - unsigned long address; - - for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; - address += PGDIR_SIZE) { - - const pgd_t *pgd_ref = pgd_offset_k(address); - unsigned long flags; - struct page *page; - - if (pgd_none(*pgd_ref)) - continue; - - spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { - pgd_t *pgd; - pgd = (pgd_t *)page_address(page) + pgd_index(address); - pgd_page_table(lock, page); - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - pgd_page_table(unlock, page); - } - spin_unlock_irqrestore(&pgd_lock, flags); - } + sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END); } /* @@ -388,6 +372,8 @@ static noinline __kprobes int vmalloc_fa if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; + WARN_ON_ONCE(in_nmi()); + /* * Copy kernel mappings over when needed. This can also * happen within a race in page table update. 
In the later @@ -750,7 +736,7 @@ __bad_area_nosemaphore(struct pt_regs *r tsk->thread.error_code = error_code | (address >= TASK_SIZE); tsk->thread.trap_no = 14; - force_sig_info_fault(SIGSEGV, si_code, address, tsk); + force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0); return; } @@ -835,14 +821,14 @@ do_sigbus(struct pt_regs *regs, unsigned tsk->thread.trap_no = 14; #ifdef CONFIG_MEMORY_FAILURE - if (fault & VM_FAULT_HWPOISON) { + if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { printk(KERN_ERR "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", tsk->comm, tsk->pid, address); code = BUS_MCEERR_AR; } #endif - force_sig_info_fault(SIGBUS, code, address, tsk); + force_sig_info_fault(SIGBUS, code, address, tsk, fault); } static noinline void @@ -852,7 +838,8 @@ mm_fault_error(struct pt_regs *regs, uns if (fault & VM_FAULT_OOM) { out_of_memory(regs, error_code, address); } else { - if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON)) + if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| + VM_FAULT_HWPOISON_LARGE)) do_sigbus(regs, error_code, address, fault); else BUG(); @@ -913,8 +900,14 @@ spurious_fault(unsigned long error_code, if (pmd_large(*pmd)) return spurious_fault_check(error_code, (pte_t *) pmd); + /* + * Note: don't use pte_present() here, since it returns true + * if the _PAGE_PROTNONE bit is set. However, this aliases the + * _PAGE_GLOBAL bit, which for kernel pages give false positives + * when CONFIG_DEBUG_PAGEALLOC is used. + */ pte = pte_offset_kernel(pmd, address); - if (!pte_present(*pte)) + if (!(pte_flags(*pte) & _PAGE_PRESENT)) return 0; ret = spurious_fault_check(error_code, pte); @@ -934,9 +927,9 @@ spurious_fault(unsigned long error_code, int show_unhandled_signals = 1; static inline int -access_error(unsigned long error_code, int write, struct vm_area_struct *vma) +access_error(unsigned long error_code, struct vm_area_struct *vma) { - if (write) { + if (error_code & PF_WRITE) { /* write, present and write, not present: */ if (unlikely(!(vma->vm_flags & VM_WRITE))) return 1; @@ -971,8 +964,10 @@ do_page_fault(struct pt_regs *regs, unsi struct task_struct *tsk; unsigned long address; struct mm_struct *mm; - int write; int fault; + int write = error_code & PF_WRITE; + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | + (write ? FAULT_FLAG_WRITE : 0); /* Set the "privileged fault" bit to something sane. */ if (user_mode_vm(regs)) @@ -1100,6 +1095,7 @@ do_page_fault(struct pt_regs *regs, unsi bad_area_nosemaphore(regs, error_code, address); return; } +retry: down_read(&mm->mmap_sem); } else { /* @@ -1143,9 +1139,7 @@ do_page_fault(struct pt_regs *regs, unsi * we can handle it.. */ good_area: - write = error_code & PF_WRITE; - - if (unlikely(access_error(error_code, write, vma))) { + if (unlikely(access_error(error_code, vma))) { bad_area_access_error(regs, error_code, address); return; } @@ -1155,21 +1149,34 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault: */ - fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); + fault = handle_mm_fault(mm, vma, address, flags); if (unlikely(fault & VM_FAULT_ERROR)) { mm_fault_error(regs, error_code, address, fault); return; } - if (fault & VM_FAULT_MAJOR) { - tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, - regs, address); - } else { - tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, - regs, address); + /* + * Major/minor page fault accounting is only done on the + * initial attempt. 
If we go through a retry, it is extremely + * likely that the page will be found in page cache at that point. + */ + if (flags & FAULT_FLAG_ALLOW_RETRY) { + if (fault & VM_FAULT_MAJOR) { + tsk->maj_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + regs, address); + } else { + tsk->min_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + regs, address); + } + if (fault & VM_FAULT_RETRY) { + /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk + * of starvation. */ + flags &= ~FAULT_FLAG_ALLOW_RETRY; + goto retry; + } } check_v8086_mode(regs, address, tsk); --- head-2011-03-17.orig/arch/x86/mm/highmem_32-xen.c 2011-02-01 15:04:27.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/highmem_32-xen.c 2011-02-01 15:09:47.000000000 +0100 @@ -9,6 +9,7 @@ void *kmap(struct page *page) return page_address(page); return kmap_high(page); } +EXPORT_SYMBOL(kmap); void kunmap(struct page *page) { @@ -18,6 +19,7 @@ void kunmap(struct page *page) return; kunmap_high(page); } +EXPORT_SYMBOL(kunmap); /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because @@ -27,10 +29,10 @@ void kunmap(struct page *page) * However when holding an atomic kmap it is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. */ -void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) +void *kmap_atomic_prot(struct page *page, pgprot_t prot) { - enum fixed_addresses idx; unsigned long vaddr; + int idx, type; /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ pagefault_disable(); @@ -38,8 +40,7 @@ void *kmap_atomic_prot(struct page *page if (!PageHighMem(page)) return page_address(page); - debug_kmap_atomic(type); - + type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); BUG_ON(!pte_none(*(kmap_pte-idx))); @@ -47,44 +48,57 @@ void *kmap_atomic_prot(struct page *page return (void *)vaddr; } +EXPORT_SYMBOL(kmap_atomic_prot); -void *kmap_atomic(struct page *page, enum km_type type) +void *__kmap_atomic(struct page *page) +{ + return kmap_atomic_prot(page, kmap_prot); +} +EXPORT_SYMBOL(__kmap_atomic); + +/* + * This is the same as kmap_atomic() but can map memory that doesn't + * have a struct page associated with it. + */ +void *kmap_atomic_pfn(unsigned long pfn) { - return kmap_atomic_prot(page, type, kmap_prot); + return kmap_atomic_prot_pfn(pfn, kmap_prot); } +EXPORT_SYMBOL_GPL(kmap_atomic_pfn); -void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) +void __kunmap_atomic(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); - /* - * Force other mappings to Oops if they'll try to access this pte - * without first remap it. Keeping stale mappings around is a bad idea - * also, in case the page changes cacheability attributes or becomes - * a protected page in a hypervisor. - */ - if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) + if (vaddr >= __fix_to_virt(FIX_KMAP_END) && + vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { + int idx, type; + + type = kmap_atomic_idx(); + idx = type + KM_TYPE_NR * smp_processor_id(); + +#ifdef CONFIG_DEBUG_HIGHMEM + WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); +#endif + /* + * Force other mappings to Oops if they'll try to access this + * pte without first remap it. Keeping stale mappings around + * is a bad idea also, in case the page changes cacheability + * attributes or becomes a protected page in a hypervisor. 
+ */ kpte_clear_flush(kmap_pte-idx, vaddr); - else { + kmap_atomic_idx_pop(); + } #ifdef CONFIG_DEBUG_HIGHMEM + else { BUG_ON(vaddr < PAGE_OFFSET); BUG_ON(vaddr >= (unsigned long)high_memory); -#endif } +#endif pagefault_enable(); } - -/* - * This is the same as kmap_atomic() but can map memory that doesn't - * have a struct page associated with it. - */ -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) -{ - return kmap_atomic_prot_pfn(pfn, type, kmap_prot); -} -EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */ +EXPORT_SYMBOL(__kunmap_atomic); struct page *kmap_atomic_to_page(void *ptr) { @@ -98,6 +112,7 @@ struct page *kmap_atomic_to_page(void *p pte = kmap_pte - (idx - FIX_KMAP_BEGIN); return pte_page(*pte); } +EXPORT_SYMBOL(kmap_atomic_to_page); void clear_highpage(struct page *page) { @@ -117,6 +132,7 @@ void clear_highpage(struct page *page) clear_page(kaddr); kunmap_atomic(kaddr, KM_USER0); } +EXPORT_SYMBOL(clear_highpage); void copy_highpage(struct page *to, struct page *from) { @@ -143,14 +159,6 @@ void copy_highpage(struct page *to, stru kunmap_atomic(vfrom, KM_USER0); kunmap_atomic(vto, KM_USER1); } - -EXPORT_SYMBOL(kmap); -EXPORT_SYMBOL(kunmap); -EXPORT_SYMBOL(kmap_atomic); -EXPORT_SYMBOL(kunmap_atomic_notypecheck); -EXPORT_SYMBOL(kmap_atomic_prot); -EXPORT_SYMBOL(kmap_atomic_to_page); -EXPORT_SYMBOL(clear_highpage); EXPORT_SYMBOL(copy_highpage); void __init set_highmem_pages_init(void) --- head-2011-03-17.orig/arch/x86/mm/init-xen.c 2011-02-01 15:03:03.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/init-xen.c 2011-02-01 15:09:47.000000000 +0100 @@ -2,6 +2,7 @@ #include #include #include +#include #include #include @@ -340,7 +341,7 @@ unsigned long __init_refok init_memory_m __flush_tlb_all(); if (!after_bootmem && e820_table_top > e820_table_start) - reserve_early(e820_table_start << PAGE_SHIFT, + memblock_x86_reserve_range(e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT, "PGTABLE"); if (!after_bootmem) --- head-2011-03-17.orig/arch/x86/mm/init_32-xen.c 2011-02-01 15:03:03.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/init_32-xen.c 2011-02-01 15:09:47.000000000 +0100 @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -70,7 +71,7 @@ static __init void *alloc_low_page(void) panic("alloc_low_page: ran out of memory"); adr = __va(pfn * PAGE_SIZE); - memset(adr, 0, PAGE_SIZE); + clear_page(adr); return adr; } @@ -458,49 +459,28 @@ static void __init add_one_highpage_init totalhigh_pages++; } -struct add_highpages_data { - unsigned long start_pfn; - unsigned long end_pfn; -}; - -static int __init add_highpages_work_fn(unsigned long start_pfn, - unsigned long end_pfn, void *datax) -{ - int node_pfn; - struct page *page; - unsigned long final_start_pfn, final_end_pfn; - struct add_highpages_data *data; - - data = (struct add_highpages_data *)datax; - - final_start_pfn = max(start_pfn, data->start_pfn); - final_end_pfn = min(end_pfn, data->end_pfn); - if (final_start_pfn >= final_end_pfn) - return 0; - - for (node_pfn = final_start_pfn; node_pfn < final_end_pfn; - node_pfn++) { - if (!pfn_valid(node_pfn)) - continue; - page = pfn_to_page(node_pfn); - add_one_highpage_init(page); - } - - return 0; - -} - -void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn, - unsigned long end_pfn) +void __init add_highpages_with_active_regions(int nid, + unsigned long start_pfn, unsigned long end_pfn) { - struct add_highpages_data data; + struct range *range; + int nr_range; + int i; 
- data.start_pfn = start_pfn; - data.end_pfn = end_pfn; + nr_range = __get_free_all_memory_range(&range, nid, start_pfn, end_pfn); - work_with_active_regions(nid, add_highpages_work_fn, &data); + for (i = 0; i < nr_range; i++) { + struct page *page; + int node_pfn; + + for (node_pfn = range[i].start; node_pfn < range[i].end; + node_pfn++) { + if (!pfn_valid(node_pfn)) + continue; + page = pfn_to_page(node_pfn); + add_one_highpage_init(page); + } + } } - #else static inline void permanent_kmaps_init(pgd_t *pgd_base) { @@ -550,48 +530,6 @@ static void __init pagetable_init(void) permanent_kmaps_init(pgd_base); } -#if defined(CONFIG_ACPI_SLEEP) && !defined(CONFIG_XEN) -/* - * ACPI suspend needs this for resume, because things like the intel-agp - * driver might have split up a kernel 4MB mapping. - */ -char swsusp_pg_dir[PAGE_SIZE] - __attribute__ ((aligned(PAGE_SIZE))); - -static inline void save_pg_dir(void) -{ - memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE); -} -#else /* !CONFIG_ACPI_SLEEP */ -static inline void save_pg_dir(void) -{ -} -#endif /* !CONFIG_ACPI_SLEEP */ - -void zap_low_mappings(bool early) -{ - int i; - - /* - * Zap initial low-memory mappings. - * - * Note that "pgd_clear()" doesn't do it for - * us, because pgd_clear() is a no-op on i386. - */ - for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) { -#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN) - set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); -#else - set_pgd(swapper_pg_dir+i, __pgd(0)); -#endif - } - - if (early) - __flush_tlb(); - else - flush_tlb_all(); -} - pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL); EXPORT_SYMBOL_GPL(__supported_pte_mask); @@ -714,14 +652,14 @@ void __init initmem_init(unsigned long s highstart_pfn = highend_pfn = max_pfn; if (max_pfn > max_low_pfn) highstart_pfn = max_low_pfn; - e820_register_active_regions(0, 0, highend_pfn); + memblock_x86_register_active_regions(0, 0, highend_pfn); sparse_memory_present_with_active_regions(0); printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); num_physpages = highend_pfn; high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; #else - e820_register_active_regions(0, 0, max_low_pfn); + memblock_x86_register_active_regions(0, 0, max_low_pfn); sparse_memory_present_with_active_regions(0); num_physpages = max_low_pfn; high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; @@ -752,75 +690,18 @@ static void __init zone_sizes_init(void) free_area_init_nodes(max_zone_pfns); } -#ifndef CONFIG_NO_BOOTMEM -static unsigned long __init setup_node_bootmem(int nodeid, - unsigned long start_pfn, - unsigned long end_pfn, - unsigned long bootmap) -{ - unsigned long bootmap_size; - - /* don't touch min_low_pfn */ - bootmap_size = init_bootmem_node(NODE_DATA(nodeid), - bootmap >> PAGE_SHIFT, - start_pfn, end_pfn); - printk(KERN_INFO " node %d low ram: %08lx - %08lx\n", - nodeid, start_pfn<nr_pages); - - /* - * Initialize the boot-time allocator (with low memory only): - */ - bootmap_size = bootmem_bootmap_pages(end_xen_pfn)<nr_pages)< xen_start_info->nr_pages) - reserve_early(xen_start_info->nr_pages << PAGE_SHIFT, - max_low_pfn << PAGE_SHIFT, "BALLOON"); + memblock_x86_reserve_range(xen_start_info->nr_pages << PAGE_SHIFT, + max_low_pfn << PAGE_SHIFT, "BALLOON"); #endif printk(KERN_INFO " mapped low ram: 0 - %08lx\n", max_pfn_mapped< end_xen_pfn) - continue; - if (end_pfn > end_xen_pfn) - end_pfn = end_xen_pfn; -#else - start_pfn = 0; - end_pfn = end_xen_pfn; -#endif - bootmap = 
setup_node_bootmem(nodeid, start_pfn, end_pfn, - bootmap); - } -#endif - after_bootmem = 1; } @@ -870,8 +751,8 @@ unsigned long __init extend_init_mapping } if (start_pfn > start) - reserve_early(start << PAGE_SHIFT, - start_pfn << PAGE_SHIFT, "INITMAP"); + memblock_x86_reserve_range(start << PAGE_SHIFT, + start_pfn << PAGE_SHIFT, "INITMAP"); return start_pfn; } @@ -1026,9 +907,6 @@ void __init mem_init(void) if (boot_cpu_data.wp_works_ok < 0) test_wp_bit(); - save_pg_dir(); - zap_low_mappings(true); - SetPagePinned(virt_to_page(init_mm.pgd)); } @@ -1139,8 +1017,3 @@ void mark_rodata_ro(void) } #endif -int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, - int flags) -{ - return reserve_bootmem(phys, len, flags); -} --- head-2011-03-17.orig/arch/x86/mm/init_64-xen.c 2011-02-01 15:04:27.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/init_64-xen.c 2011-02-01 15:09:47.000000000 +0100 @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -54,7 +55,6 @@ #include #include #include -#include #include @@ -164,6 +164,43 @@ static int __init nonx32_setup(char *str __setup("noexec32=", nonx32_setup); /* + * When memory was added/removed make sure all the processes MM have + * suitable PGD entries in the local PGD level page. + */ +void sync_global_pgds(unsigned long start, unsigned long end) +{ + unsigned long address; + + for (address = start; address <= end; address += PGDIR_SIZE) { + const pgd_t *pgd_ref = pgd_offset_k(address); + unsigned long flags; + struct page *page; + + if (pgd_none(*pgd_ref)) + continue; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + spinlock_t *pgt_lock; + + pgd = (pgd_t *)page_address(page) + pgd_index(address); + pgt_lock = &pgd_page_get_mm(page)->page_table_lock; + spin_lock(pgt_lock); + + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) + != pgd_page_vaddr(*pgd_ref)); + + spin_unlock(pgt_lock); + } + spin_unlock_irqrestore(&pgd_lock, flags); + } +} + +/* * NOTE: This function is marked __ref because it calls __init function * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. 
*/ @@ -370,7 +407,7 @@ static __ref void *alloc_low_page(unsign panic("alloc_low_page: ran out of memory"); adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); - memset(adr, 0, PAGE_SIZE); + clear_page(adr); *phys = pfn * PAGE_SIZE; return adr; } @@ -772,11 +809,13 @@ kernel_physical_mapping_init(unsigned lo unsigned long end, unsigned long page_size_mask) { - + bool pgd_changed = false; unsigned long next, last_map_addr = end; + unsigned long addr; start = (unsigned long)__va(start); end = (unsigned long)__va(end); + addr = start; for (; start < end; start = next) { pgd_t *pgd = pgd_offset_k(start); @@ -808,9 +847,13 @@ kernel_physical_mapping_init(unsigned lo spin_lock(&init_mm.page_table_lock); pgd_populate(&init_mm, pgd, __va(pud_phys)); spin_unlock(&init_mm.page_table_lock); + pgd_changed = true; } } + if (pgd_changed) + sync_global_pgds(addr, end); + return last_map_addr; } @@ -818,31 +861,11 @@ kernel_physical_mapping_init(unsigned lo void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, int acpi, int k8) { -#ifndef CONFIG_NO_BOOTMEM - unsigned long bootmap_size, bootmap; - - e820_register_active_regions(0, start_pfn, end_pfn); -#ifdef CONFIG_XEN - if (end_pfn > xen_start_info->nr_pages) - end_pfn = xen_start_info->nr_pages; -#endif - bootmap_size = bootmem_bootmap_pages(end_pfn)<> PAGE_SHIFT, - 0, end_pfn); - free_bootmem_with_active_regions(0, end_pfn); -#else - e820_register_active_regions(0, start_pfn, end_pfn); + memblock_x86_register_active_regions(0, start_pfn, end_pfn); #ifdef CONFIG_XEN if (end_pfn > xen_start_info->nr_pages) - reserve_early(xen_start_info->nr_pages << PAGE_SHIFT, - end_pfn << PAGE_SHIFT, "BALLOON"); -#endif + memblock_x86_reserve_range(xen_start_info->nr_pages << PAGE_SHIFT, + end_pfn << PAGE_SHIFT, "BALLOON"); #endif } #endif @@ -1062,54 +1085,6 @@ void mark_rodata_ro(void) #endif -int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, - int flags) -{ -#ifdef CONFIG_NUMA - int nid, next_nid; - int ret; -#endif - unsigned long pfn = phys >> PAGE_SHIFT; - - if (pfn >= max_pfn) { - /* - * This can happen with kdump kernels when accessing - * firmware tables: - */ - if (pfn < max_pfn_mapped) - return -EFAULT; - - printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n", - phys, len); - return -EFAULT; - } - - /* Should check here against the e820 map to avoid double free */ -#ifdef CONFIG_NUMA - nid = phys_to_nid(phys); - next_nid = phys_to_nid(phys + len - 1); - if (nid == next_nid) - ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags); - else - ret = reserve_bootmem(phys, len, flags); - - if (ret != 0) - return ret; - -#else - reserve_bootmem(phys, len, flags); -#endif - -#ifndef CONFIG_XEN - if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { - dma_reserve += len / PAGE_SIZE; - set_dma_reserve(dma_reserve); - } -#endif - - return 0; -} - int kern_addr_valid(unsigned long addr) { unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; @@ -1281,6 +1256,7 @@ vmemmap_populate(struct page *start_page } } + sync_global_pgds((unsigned long)start_page, end); return 0; } --- head-2011-03-17.orig/arch/x86/mm/iomap_32-xen.c 2011-02-01 15:04:27.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/iomap_32-xen.c 2011-02-01 15:09:47.000000000 +0100 @@ -49,21 +49,20 @@ int iomap_create_wc(resource_size_t base } EXPORT_SYMBOL_GPL(iomap_create_wc); -void -iomap_free(resource_size_t base, unsigned long size) +void iomap_free(resource_size_t base, unsigned long size) { io_free_memtype(base, base + size); } EXPORT_SYMBOL_GPL(iomap_free); 
-void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) +void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) { - enum fixed_addresses idx; unsigned long vaddr; + int idx, type; pagefault_disable(); - debug_kmap_atomic(type); + type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR * smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); set_pte_at(&init_mm, vaddr, kmap_pte - idx, pfn_pte(pfn, prot)); @@ -73,10 +72,10 @@ void *kmap_atomic_prot_pfn(unsigned long } /* - * Map 'mfn' using fixed map 'type' and protections 'prot' + * Map 'mfn' using protections 'prot' */ void __iomem * -iomap_atomic_prot_pfn(unsigned long mfn, enum km_type type, pgprot_t prot) +iomap_atomic_prot_pfn(unsigned long mfn, pgprot_t prot) { /* * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS. @@ -88,24 +87,34 @@ iomap_atomic_prot_pfn(unsigned long mfn, prot = PAGE_KERNEL_UC_MINUS; pgprot_val(prot) |= _PAGE_IOMAP; - return (void __force __iomem *) kmap_atomic_prot_pfn(mfn, type, prot); + return (void __force __iomem *) kmap_atomic_prot_pfn(mfn, prot); } EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); void -iounmap_atomic(void __iomem *kvaddr, enum km_type type) +iounmap_atomic(void __iomem *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); - /* - * Force other mappings to Oops if they'll try to access this pte - * without first remap it. Keeping stale mappings around is a bad idea - * also, in case the page changes cacheability attributes or becomes - * a protected page in a hypervisor. - */ - if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) + if (vaddr >= __fix_to_virt(FIX_KMAP_END) && + vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { + int idx, type; + + type = kmap_atomic_idx(); + idx = type + KM_TYPE_NR * smp_processor_id(); + +#ifdef CONFIG_DEBUG_HIGHMEM + WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); +#endif + /* + * Force other mappings to Oops if they'll try to access this + * pte without first remap it. Keeping stale mappings around + * is a bad idea also, in case the page changes cacheability + * attributes or becomes a protected page in a hypervisor. 
+ */ kpte_clear_flush(kmap_pte-idx, vaddr); + kmap_atomic_idx_pop(); + } pagefault_enable(); } --- head-2011-03-17.orig/arch/x86/mm/ioremap-xen.c 2011-02-07 15:42:02.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/ioremap-xen.c 2011-02-07 15:42:09.000000000 +0100 @@ -532,6 +532,11 @@ static inline pte_t * __init early_iorem return &bm_pte[pte_index(addr)]; } +bool __init is_early_ioremap_ptep(pte_t *ptep) +{ + return ptep >= &bm_pte[0] && ptep < &bm_pte[PAGE_SIZE/sizeof(pte_t)]; +} + static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; void __init early_ioremap_init(void) --- head-2011-03-17.orig/arch/x86/mm/memblock.c 2011-03-17 14:35:43.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/memblock.c 2011-02-01 15:09:47.000000000 +0100 @@ -293,6 +293,11 @@ static int __init memblock_x86_find_acti { u64 align = PAGE_SIZE; +#ifdef CONFIG_XEN + if (last_pfn > xen_start_info->nr_pages) + last_pfn = xen_start_info->nr_pages; +#endif + *ei_startpfn = round_up(ei->base, align) >> PAGE_SHIFT; *ei_endpfn = round_down(ei->base + ei->size, align) >> PAGE_SHIFT; @@ -325,6 +330,11 @@ void __init memblock_x86_register_active if (memblock_x86_find_active_region(r, start_pfn, last_pfn, &ei_startpfn, &ei_endpfn)) add_active_range(nid, ei_startpfn, ei_endpfn); + +#ifdef CONFIG_XEN + BUG_ON(nid); + add_active_range(nid, last_pfn, last_pfn); +#endif } /* --- head-2011-03-17.orig/arch/x86/mm/pgtable-xen.c 2011-02-01 15:03:03.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/pgtable-xen.c 2011-02-01 15:09:47.000000000 +0100 @@ -428,7 +428,19 @@ static inline void pgd_list_del(pgd_t *p #define UNSHARED_PTRS_PER_PGD \ (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) -static void pgd_ctor(pgd_t *pgd) + +static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) +{ + BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm)); + virt_to_page(pgd)->index = (pgoff_t)mm; +} + +struct mm_struct *pgd_page_get_mm(struct page *page) +{ + return (struct mm_struct *)page->index; +} + +static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) { pgd_test_and_unpin(pgd); @@ -441,10 +453,6 @@ static void pgd_ctor(pgd_t *pgd) clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, swapper_pg_dir + KERNEL_PGD_BOUNDARY, KERNEL_PGD_PTRS); - paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT, - __pa(swapper_pg_dir) >> PAGE_SHIFT, - KERNEL_PGD_BOUNDARY, - KERNEL_PGD_PTRS); } #ifdef CONFIG_X86_64 @@ -454,8 +462,10 @@ static void pgd_ctor(pgd_t *pgd) #endif /* list required to sync kernel mapping updates */ - if (!SHARED_KERNEL_PMD) + if (!SHARED_KERNEL_PMD) { + pgd_set_mm(pgd, mm); pgd_list_add(pgd); + } } static void pgd_dtor(pgd_t *pgd) @@ -662,12 +672,9 @@ pgd_t *pgd_alloc(struct mm_struct *mm) } #endif - pgd_ctor(pgd); + pgd_ctor(mm, pgd); pgd_prepopulate_pmd(mm, pgd, pmds); - /* Store a back link for vmalloc_sync_all(). 
*/ - set_page_private(virt_to_page(pgd), (unsigned long)mm); - spin_unlock_irqrestore(&pgd_lock, flags); return pgd; --- head-2011-03-17.orig/arch/x86/pci/irq-xen.c 2011-02-01 15:04:27.000000000 +0100 +++ head-2011-03-17/arch/x86/pci/irq-xen.c 2011-02-01 15:09:47.000000000 +0100 @@ -590,27 +590,28 @@ static __init int intel_router_probe(str case PCI_DEVICE_ID_INTEL_ICH9_3: case PCI_DEVICE_ID_INTEL_ICH9_4: case PCI_DEVICE_ID_INTEL_ICH9_5: - case PCI_DEVICE_ID_INTEL_TOLAPAI_0: + case PCI_DEVICE_ID_INTEL_EP80579_0: case PCI_DEVICE_ID_INTEL_ICH10_0: case PCI_DEVICE_ID_INTEL_ICH10_1: case PCI_DEVICE_ID_INTEL_ICH10_2: case PCI_DEVICE_ID_INTEL_ICH10_3: + case PCI_DEVICE_ID_INTEL_PATSBURG_LPC: r->name = "PIIX/ICH"; r->get = pirq_piix_get; r->set = pirq_piix_set; return 1; } - if ((device >= PCI_DEVICE_ID_INTEL_PCH_LPC_MIN) && - (device <= PCI_DEVICE_ID_INTEL_PCH_LPC_MAX)) { + if ((device >= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MIN) && + (device <= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MAX)) { r->name = "PIIX/ICH"; r->get = pirq_piix_get; r->set = pirq_piix_set; return 1; } - if ((device >= PCI_DEVICE_ID_INTEL_CPT_LPC_MIN) && - (device <= PCI_DEVICE_ID_INTEL_CPT_LPC_MAX)) { + if ((device >= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN) && + (device <= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX)) { r->name = "PIIX/ICH"; r->get = pirq_piix_get; r->set = pirq_piix_set; --- head-2011-03-17.orig/arch/x86/pci/pcifront.c 2011-02-01 14:50:44.000000000 +0100 +++ head-2011-03-17/arch/x86/pci/pcifront.c 2011-02-01 15:09:47.000000000 +0100 @@ -16,7 +16,7 @@ static int pcifront_enable_irq(struct pc { u8 irq; pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &irq); - if (!irq_to_desc_alloc_node(irq, numa_node_id())) + if (!alloc_irq_and_cfg_at(irq, numa_node_id())) return -ENOMEM; evtchn_register_pirq(irq); dev->irq = irq; --- head-2011-03-17.orig/arch/x86/xen/Kconfig 2011-02-01 15:04:27.000000000 +0100 +++ head-2011-03-17/arch/x86/xen/Kconfig 2011-02-01 15:09:47.000000000 +0100 @@ -15,13 +15,16 @@ config PARAVIRT_XEN config XEN_DOM0 def_bool y - depends on XEN && PCI_XEN && SWIOTLB_XEN + depends on PARAVIRT_XEN && PCI_XEN && SWIOTLB_XEN depends on X86_LOCAL_APIC && X86_IO_APIC && ACPI && PCI # Dummy symbol since people have come to rely on the PRIVILEGED_GUEST # name in tools. -config XEN_PRIVILEGED_GUEST - def_bool XEN_DOM0 +# This doesn't work together with our identical symbol in drivers/xen/Kconfig +# (produces a recursive dependency), and renaming it is pointless given that +# it's meant as a compatibility thing. +#config XEN_PRIVILEGED_GUEST +# def_bool XEN_DOM0 config XEN_PVHVM def_bool y --- head-2011-03-17.orig/drivers/pci/Kconfig 2011-02-01 14:55:46.000000000 +0100 +++ head-2011-03-17/drivers/pci/Kconfig 2011-02-01 15:09:47.000000000 +0100 @@ -61,9 +61,9 @@ config PCI_STUB When in doubt, say N. -config XEN_PCIDEV_FRONTEND +config PARAVIRT_XEN_PCIDEV_FRONTEND tristate "Xen PCI Frontend" - depends on PCI && X86 && XEN + depends on PCI && X86 && PARAVIRT_XEN select HOTPLUG select PCI_XEN select XEN_XENBUS_FRONTEND @@ -72,9 +72,18 @@ config XEN_PCIDEV_FRONTEND The PCI device frontend driver allows the kernel to import arbitrary PCI devices from a PCI backend to support PCI driver domains. +config XEN_PCIDEV_FRONTEND + def_bool y + prompt "Xen PCI Frontend" if X86_64 + depends on PCI && XEN && (PCI_GOXEN_FE || PCI_GOANY || X86_64) + select HOTPLUG + help + The PCI device frontend driver allows the kernel to import arbitrary + PCI devices from a PCI backend to support PCI driver domains. 
+ config XEN_PCIDEV_FE_DEBUG bool "Xen PCI Frontend debugging" - depends on XEN_PCIDEV_FRONTEND && PCI_DEBUG + depends on XEN_PCIDEV_FRONTEND || (PARAVIRT_XEN_PCIDEV_FRONTEND && PCI_DEBUG) help Say Y here if you want the Xen PCI frontend to produce a bunch of debug messages to the system log. Select this if you are having a --- head-2011-03-17.orig/drivers/pci/Makefile 2011-01-31 14:32:40.000000000 +0100 +++ head-2011-03-17/drivers/pci/Makefile 2011-02-01 15:09:47.000000000 +0100 @@ -71,6 +71,6 @@ obj-$(CONFIG_PCI_SYSCALL) += syscall.o obj-$(CONFIG_PCI_STUB) += pci-stub.o -obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += xen-pcifront.o +obj-$(CONFIG_PARAVIRT_XEN_PCIDEV_FRONTEND) += xen-pcifront.o ccflags-$(CONFIG_PCI_DEBUG) := -DDEBUG --- head-2011-03-17.orig/drivers/pci/xen-pcifront.c 2011-03-17 14:35:43.000000000 +0100 +++ head-2011-03-17/drivers/pci/xen-pcifront.c 2011-02-01 15:09:47.000000000 +0100 @@ -1118,7 +1118,6 @@ static const struct xenbus_device_id xen static struct xenbus_driver xenbus_pcifront_driver = { .name = "pcifront", - .owner = THIS_MODULE, .ids = xenpci_ids, .probe = pcifront_xenbus_probe, .remove = pcifront_xenbus_remove, --- head-2011-03-17.orig/drivers/xen/Kconfig 2011-02-02 15:37:53.000000000 +0100 +++ head-2011-03-17/drivers/xen/Kconfig 2011-02-01 15:09:47.000000000 +0100 @@ -20,10 +20,6 @@ config XEN_PRIVILEGED_GUEST config XEN_UNPRIVILEGED_GUEST def_bool !XEN_PRIVILEGED_GUEST select PM - select PM_SLEEP - select PM_SLEEP_SMP if SMP - select PM_RUNTIME if PCI - select PM_OPS if PCI select SUSPEND config XEN_PRIVCMD --- head-2011-03-17.orig/drivers/xen/Makefile 2011-02-24 15:05:06.000000000 +0100 +++ head-2011-03-17/drivers/xen/Makefile 2011-02-24 15:17:40.000000000 +0100 @@ -1,6 +1,8 @@ obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o features.o events.o manage.o +xen-biomerge-$(CONFIG_PARAVIRT_XEN) := biomerge.o xen-hotplug-$(CONFIG_PARAVIRT_XEN) := cpu_hotplug.o xen-balloon-$(CONFIG_PARAVIRT_XEN) := balloon.o +xen-evtchn-name-$(CONFIG_PARAVIRT_XEN) := xen-evtchn xen-balloon-$(CONFIG_XEN) := balloon/ obj-$(CONFIG_XEN) += core/ @@ -9,6 +11,7 @@ obj-y += xenbus/ obj-$(CONFIG_XEN) += char/ xen-backend-$(CONFIG_XEN_BACKEND) := util.o +xen-evtchn-name-$(CONFIG_XEN) := evtchn nostackp := $(call cc-option, -fno-stack-protector) ifeq ($(CONFIG_PARAVIRT_XEN),y) @@ -16,14 +19,19 @@ CFLAGS_features.o := $(nostackp) endif obj-$(CONFIG_XEN) += features.o $(xen-backend-y) $(xen-backend-m) +obj-$(CONFIG_BLOCK) += $(xen-biomerge-y) obj-$(CONFIG_HOTPLUG_CPU) += $(xen-hotplug-y) obj-$(CONFIG_XEN_XENCOMM) += xencomm.o obj-$(CONFIG_XEN_BALLOON) += $(xen-balloon-y) -obj-$(CONFIG_XEN_DEV_EVTCHN) += evtchn.o +obj-$(CONFIG_XEN_DEV_EVTCHN) += $(xen-evtchn-name-y).o obj-$(CONFIG_XENFS) += xenfs/ obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o obj-$(CONFIG_XEN_PLATFORM_PCI) += platform-pci.o obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o +obj-$(CONFIG_XEN_DOM0) += pci.o + +xen-evtchn-y := evtchn.o + obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/ obj-$(CONFIG_XEN_BLKDEV_TAP) += blktap/ obj-$(CONFIG_XEN_BLKDEV_TAP2) += blktap2/ blktap2-new/ --- head-2011-03-17.orig/drivers/xen/balloon/balloon.c 2011-02-01 15:03:10.000000000 +0100 +++ head-2011-03-17/drivers/xen/balloon/balloon.c 2011-02-01 15:09:47.000000000 +0100 @@ -39,7 +39,6 @@ #include #include #include -#include #include #include #include --- head-2011-03-17.orig/drivers/xen/blkback/blkback.c 2011-02-01 14:50:44.000000000 +0100 +++ head-2011-03-17/drivers/xen/blkback/blkback.c 2011-02-01 15:09:47.000000000 +0100 @@ -406,7 +406,7 @@ static void 
dispatch_rw_block_io(blkif_t operation = WRITE; break; case BLKIF_OP_WRITE_BARRIER: - operation = WRITE_BARRIER; + operation = WRITE_FLUSH_FUA; break; default: operation = 0; /* make gcc happy */ @@ -415,7 +415,7 @@ static void dispatch_rw_block_io(blkif_t /* Check that number of segments is sane. */ nseg = req->nr_segments; - if (unlikely(nseg == 0 && operation != WRITE_BARRIER) || + if (unlikely(nseg == 0 && req->operation != BLKIF_OP_WRITE_BARRIER) || unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { DPRINTK("Bad number of segments in request (%d)\n", nseg); goto fail_response; @@ -525,7 +525,7 @@ static void dispatch_rw_block_io(blkif_t } if (!bio) { - BUG_ON(operation != WRITE_BARRIER); + BUG_ON(!(operation & (REQ_FLUSH|REQ_FUA))); bio = bio_alloc(GFP_KERNEL, 0); if (unlikely(bio == NULL)) goto fail_put_bio; @@ -540,7 +540,7 @@ static void dispatch_rw_block_io(blkif_t if (operation == READ) blkif->st_rd_sect += preq.nr_sects; - else if (operation == WRITE || operation == WRITE_BARRIER) + else blkif->st_wr_sect += preq.nr_sects; return; --- head-2011-03-17.orig/drivers/xen/blkfront/blkfront.c 2011-02-01 15:04:27.000000000 +0100 +++ head-2011-03-17/drivers/xen/blkfront/blkfront.c 2011-02-01 15:09:47.000000000 +0100 @@ -369,20 +369,23 @@ static void connect(struct blkfront_info /* * If there's no "feature-barrier" defined, then it means * we're dealing with a very old backend which writes - * synchronously; draining will do what needs to get done. + * synchronously; nothing to do. * - * If there are barriers, then we can do full queued writes - * with tagged barriers. - * - * If barriers are not supported, then there's no much we can - * do, so just set ordering to NONE. + * If there are barriers, then we use flush. */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,37) + if (!err && barrier) + info->feature_flush = REQ_FLUSH | REQ_FUA; + else + info->feature_flush = 0; +#else if (err) - info->feature_barrier = QUEUE_ORDERED_DRAIN; + info->feature_flush = QUEUE_ORDERED_DRAIN; else if (barrier) - info->feature_barrier = QUEUE_ORDERED_TAG; + info->feature_flush = QUEUE_ORDERED_TAG; else - info->feature_barrier = QUEUE_ORDERED_NONE; + info->feature_flush = QUEUE_ORDERED_NONE; +#endif err = xlvbd_add(sectors, info->vdevice, binfo, sector_size, info); if (err) { @@ -477,7 +480,7 @@ static inline void ADD_ID_TO_FREELIST( struct blkfront_info *info, unsigned long id) { info->shadow[id].req.id = info->shadow_free; - info->shadow[id].request = 0; + info->shadow[id].request = NULL; info->shadow_free = id; } @@ -658,14 +661,11 @@ int blkif_getgeo(struct block_device *bd /* - * blkif_queue_request + * Generate a Xen blkfront IO request from a blk layer request. Reads + * and writes are handled as expected. Since we lack a loose flush + * request, we map flushes into a full ordered barrier. * - * request block io - * - * id: for guest use only. - * operation: BLKIF_OP_{READ,WRITE,PROBE} - * buffer: buffer to read/write into. this should be a - * virtual address in the guest os. + * @req: a request struct */ static int blkif_queue_request(struct request *req) { @@ -694,7 +694,7 @@ static int blkif_queue_request(struct re /* Fill out a communications ring structure. 
*/ ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); id = GET_ID_FROM_FREELIST(info); - info->shadow[id].request = (unsigned long)req; + info->shadow[id].request = req; ring_req->id = id; ring_req->sector_number = (blkif_sector_t)blk_rq_pos(req); @@ -702,7 +702,11 @@ static int blkif_queue_request(struct re ring_req->operation = rq_data_dir(req) ? BLKIF_OP_WRITE : BLKIF_OP_READ; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,37) + if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) +#else if (req->cmd_flags & REQ_HARDBARRIER) +#endif ring_req->operation = BLKIF_OP_WRITE_BARRIER; ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg); @@ -813,7 +817,7 @@ static irqreturn_t blkif_int(int irq, vo bret = RING_GET_RESPONSE(&info->ring, i); id = bret->id; - req = (struct request *)info->shadow[id].request; + req = info->shadow[id].request; blkif_completion(&info->shadow[id]); @@ -827,8 +831,23 @@ static irqreturn_t blkif_int(int irq, vo " write barrier op failed\n", info->gd->disk_name); ret = -EOPNOTSUPP; - info->feature_barrier = QUEUE_ORDERED_NONE; - xlvbd_barrier(info); + } + if (unlikely(bret->status == BLKIF_RSP_ERROR && + info->shadow[id].req.nr_segments == 0)) { + pr_warning("blkfront: %s:" + " empty write barrier op failed\n", + info->gd->disk_name); + ret = -EOPNOTSUPP; + } + if (unlikely(ret)) { + if (ret == -EOPNOTSUPP) + ret = 0; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,37) + info->feature_flush = 0; +#else + info->feature_flush = QUEUE_ORDERED_NONE; +#endif + xlvbd_flush(info); } /* fall through */ case BLKIF_OP_READ: @@ -919,7 +938,7 @@ static int blkif_recover(struct blkfront /* Stage 3: Find pending requests and requeue them. */ for (i = 0; i < BLK_RING_SIZE; i++) { /* Not in use? */ - if (copy[i].request == 0) + if (!copy[i].request) continue; /* Grab a request slot and copy shadow state into it. */ @@ -937,8 +956,7 @@ static int blkif_recover(struct blkfront req->seg[j].gref, info->xbdev->otherend_id, pfn_to_mfn(info->shadow[req->id].frame[j]), - rq_data_dir((struct request *) - info->shadow[req->id].request) ? + rq_data_dir(info->shadow[req->id].request) ? 
GTF_readonly : 0); info->shadow[req->id].req = *req; --- head-2011-03-17.orig/drivers/xen/blkfront/block.h 2011-02-01 14:39:24.000000000 +0100 +++ head-2011-03-17/drivers/xen/blkfront/block.h 2011-02-01 15:09:47.000000000 +0100 @@ -83,7 +83,7 @@ struct xlbd_major_info struct blk_shadow { blkif_request_t req; - unsigned long request; + struct request *request; unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST]; }; @@ -111,7 +111,7 @@ struct blkfront_info struct gnttab_free_callback callback; struct blk_shadow shadow[BLK_RING_SIZE]; unsigned long shadow_free; - int feature_barrier; + int feature_flush; int is_ready; /** @@ -146,7 +146,7 @@ extern void do_blkif_request (struct req int xlvbd_add(blkif_sector_t capacity, int device, u16 vdisk_info, u16 sector_size, struct blkfront_info *info); void xlvbd_del(struct blkfront_info *info); -int xlvbd_barrier(struct blkfront_info *info); +void xlvbd_flush(struct blkfront_info *info); #ifdef CONFIG_SYSFS int xlvbd_sysfs_addif(struct blkfront_info *info); --- head-2011-03-17.orig/drivers/xen/blkfront/vbd.c 2011-02-01 15:04:27.000000000 +0100 +++ head-2011-03-17/drivers/xen/blkfront/vbd.c 2011-02-01 15:09:47.000000000 +0100 @@ -422,7 +422,7 @@ xlvbd_add(blkif_sector_t capacity, int v info->rq = gd->queue; info->gd = gd; - xlvbd_barrier(info); + xlvbd_flush(info); if (vdisk_info & VDISK_READONLY) set_disk_ro(gd, 1); @@ -468,36 +468,35 @@ xlvbd_del(struct blkfront_info *info) info->rq = NULL; } -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) -int -xlvbd_barrier(struct blkfront_info *info) +void +xlvbd_flush(struct blkfront_info *info) { +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,37) + blk_queue_flush(info->rq, info->feature_flush); + pr_info("blkfront: %s: barriers %s\n", + info->gd->disk_name, + info->feature_flush ? 
"enabled" : "disabled"); +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) int err; const char *barrier; - switch (info->feature_barrier) { + switch (info->feature_flush) { case QUEUE_ORDERED_DRAIN: barrier = "enabled (drain)"; break; case QUEUE_ORDERED_TAG: barrier = "enabled (tag)"; break; case QUEUE_ORDERED_NONE: barrier = "disabled"; break; default: return -EINVAL; } - err = blk_queue_ordered(info->rq, info->feature_barrier); + err = blk_queue_ordered(info->rq, info->feature_flush); if (err) return err; pr_info("blkfront: %s: barriers %s\n", info->gd->disk_name, barrier); - return 0; -} #else -int -xlvbd_barrier(struct blkfront_info *info) -{ - if (info->feature_barrier) + if (info->feature_flush) pr_info("blkfront: %s: barriers disabled\n", info->gd->disk_name); - return -ENOSYS; -} #endif +} #ifdef CONFIG_SYSFS static ssize_t show_media(struct device *dev, --- head-2011-03-17.orig/drivers/xen/blktap/blktap.c 2011-02-17 10:19:19.000000000 +0100 +++ head-2011-03-17/drivers/xen/blktap/blktap.c 2011-02-17 10:19:26.000000000 +0100 @@ -441,6 +441,7 @@ static const struct file_operations blkt .unlocked_ioctl = blktap_ioctl, .open = blktap_open, .release = blktap_release, + .llseek = no_llseek, .mmap = blktap_mmap, }; @@ -573,6 +574,8 @@ static int blktap_open(struct inode *ino tap_blkif_t *info; int i; + nonseekable_open(inode, filp); + /* ctrl device, treat differently */ if (!idx) return 0; --- head-2011-03-17.orig/drivers/xen/blktap2/device.c 2011-02-01 15:04:27.000000000 +0100 +++ head-2011-03-17/drivers/xen/blktap2/device.c 2011-02-01 15:09:47.000000000 +0100 @@ -844,7 +844,7 @@ blktap_device_run_queue(struct blktap *t continue; } - if (req->cmd_flags & REQ_HARDBARRIER) { + if (req->cmd_flags & (REQ_FLUSH|REQ_FUA)) { blk_start_request(req); __blk_end_request_all(req, -EOPNOTSUPP); continue; --- head-2011-03-17.orig/drivers/xen/blktap2-new/device.c 2011-02-24 16:23:08.000000000 +0100 +++ head-2011-03-17/drivers/xen/blktap2-new/device.c 2011-02-24 16:31:17.000000000 +0100 @@ -302,9 +302,6 @@ blktap_device_configure(struct blktap *t /* Make sure buffer addresses are sector-aligned. */ blk_queue_dma_alignment(rq, 511); - /* We are reordering, but cacheless. */ - blk_queue_ordered(rq, QUEUE_ORDERED_DRAIN); - spin_unlock_irq(&dev->lock); } --- head-2011-03-17.orig/drivers/xen/core/evtchn.c 2011-02-09 12:45:24.000000000 +0100 +++ head-2011-03-17/drivers/xen/core/evtchn.c 2011-02-03 11:12:32.000000000 +0100 @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -89,14 +90,17 @@ static struct irq_cfg _irq_cfg[] = { static inline struct irq_cfg *__pure irq_cfg(unsigned int irq) { #ifdef CONFIG_SPARSE_IRQ - struct irq_desc *desc = irq_to_desc(irq); - - return desc ? desc->chip_data : NULL; + return get_irq_chip_data(irq); #else return irq < NR_IRQS ? _irq_cfg + irq : NULL; #endif } +static inline struct irq_cfg *__pure irq_data_cfg(struct irq_data *data) +{ + return irq_data_get_irq_chip_data(data); +} + /* Constructor for packed IRQ information. */ static inline u32 mk_irq_info(u32 type, u32 index, u32 evtchn) { @@ -116,26 +120,47 @@ static inline u32 mk_irq_info(u32 type, * Accessors for packed IRQ information. */ +static inline unsigned int evtchn_from_irq_cfg(const struct irq_cfg *cfg) +{ + return cfg->info & ((1U << _EVTCHN_BITS) - 1); +} + +static inline unsigned int evtchn_from_irq_data(struct irq_data *data) +{ + const struct irq_cfg *cfg = irq_data_cfg(data); + + return cfg ? 
evtchn_from_irq_cfg(cfg) : 0; +} + static inline unsigned int evtchn_from_irq(int irq) { - const struct irq_cfg *cfg = irq_cfg(irq); + struct irq_data *data = irq_get_irq_data(irq); - return cfg ? cfg->info & ((1U << _EVTCHN_BITS) - 1) : 0; + return data ? evtchn_from_irq_data(data) : 0; +} + +static inline unsigned int index_from_irq_cfg(const struct irq_cfg *cfg) +{ + return (cfg->info >> _EVTCHN_BITS) & ((1U << _INDEX_BITS) - 1); } static inline unsigned int index_from_irq(int irq) { const struct irq_cfg *cfg = irq_cfg(irq); - return cfg ? (cfg->info >> _EVTCHN_BITS) & ((1U << _INDEX_BITS) - 1) - : 0; + return cfg ? index_from_irq_cfg(cfg) : 0; +} + +static inline unsigned int type_from_irq_cfg(const struct irq_cfg *cfg) +{ + return cfg->info >> (32 - _IRQT_BITS); } static inline unsigned int type_from_irq(int irq) { const struct irq_cfg *cfg = irq_cfg(irq); - return cfg ? cfg->info >> (32 - _IRQT_BITS) : IRQT_UNBOUND; + return cfg ? type_from_irq_cfg(cfg) : IRQT_UNBOUND; } unsigned int irq_from_evtchn(unsigned int port) @@ -180,7 +205,7 @@ static void bind_evtchn_to_cpu(unsigned BUG_ON(!test_bit(chn, s->evtchn_mask)); if (irq != -1) - cpumask_copy(irq_to_desc(irq)->affinity, cpumask_of(cpu)); + cpumask_copy(irq_get_irq_data(irq)->affinity, cpumask_of(cpu)); clear_bit(chn, per_cpu(cpu_evtchn_mask, cpu_evtchn[chn])); set_bit(chn, per_cpu(cpu_evtchn_mask, cpu)); @@ -193,10 +218,10 @@ static void init_evtchn_cpu_bindings(voi /* By default all event channels notify CPU#0. */ for (i = 0; i < nr_irqs; i++) { - struct irq_desc *desc = irq_to_desc(i); + struct irq_data *data = irq_get_irq_data(i); - if (desc) - cpumask_copy(desc->affinity, cpumask_of(0)); + if (data) + cpumask_copy(data->affinity, cpumask_of(0)); } memset(cpu_evtchn, 0, sizeof(cpu_evtchn)); @@ -378,26 +403,24 @@ asmlinkage void __irq_entry evtchn_do_up set_irq_regs(old_regs); } -static int find_unbound_irq(unsigned int node, struct irq_chip *chip) +static int find_unbound_irq(unsigned int node, struct irq_cfg **pcfg, + struct irq_chip *chip) { static int warned; int irq; for (irq = DYNIRQ_BASE; irq < nr_irqs; irq++) { - struct irq_desc *desc; - struct irq_cfg *cfg; + struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node); + struct irq_desc *desc = irq_to_desc(irq); - desc = irq_to_desc(irq); - if (!desc) - desc = irq_to_desc_alloc_node(irq, node); - else if (desc->chip != &no_irq_chip && - desc->chip != &dynirq_chip) - continue; - if (!desc) + if (unlikely(!cfg)) return -ENOMEM; + if (desc->irq_data.chip != &no_irq_chip && + desc->irq_data.chip != chip) + continue; - cfg = desc->chip_data; - if (cfg && !cfg->bindcount) { + if (!cfg->bindcount) { + *pcfg = cfg; desc->status |= IRQ_NOPROBE; set_irq_chip_and_handler_name(irq, chip, handle_fasteoi_irq, @@ -419,20 +442,22 @@ static struct irq_chip dynirq_chip; static int bind_caller_port_to_irq(unsigned int caller_port) { + struct irq_cfg *cfg; int irq; spin_lock(&irq_mapping_update_lock); if ((irq = evtchn_to_irq[caller_port]) == -1) { - if ((irq = find_unbound_irq(numa_node_id(), &dynirq_chip)) < 0) + if ((irq = find_unbound_irq(numa_node_id(), &cfg, + &dynirq_chip)) < 0) goto out; evtchn_to_irq[caller_port] = irq; - irq_cfg(irq)->info = mk_irq_info(IRQT_CALLER_PORT, - 0, caller_port); - } + cfg->info = mk_irq_info(IRQT_CALLER_PORT, 0, caller_port); + } else + cfg = irq_cfg(irq); - irq_cfg(irq)->bindcount++; + cfg->bindcount++; out: spin_unlock(&irq_mapping_update_lock); @@ -441,21 +466,22 @@ static int bind_caller_port_to_irq(unsig static int bind_local_port_to_irq(unsigned int 
local_port) { + struct irq_cfg *cfg; int irq; spin_lock(&irq_mapping_update_lock); BUG_ON(evtchn_to_irq[local_port] != -1); - if ((irq = find_unbound_irq(numa_node_id(), &dynirq_chip)) < 0) { + if ((irq = find_unbound_irq(numa_node_id(), &cfg, &dynirq_chip)) < 0) { if (close_evtchn(local_port)) BUG(); goto out; } evtchn_to_irq[local_port] = irq; - irq_cfg(irq)->info = mk_irq_info(IRQT_LOCAL_PORT, 0, local_port); - irq_cfg(irq)->bindcount++; + cfg->info = mk_irq_info(IRQT_LOCAL_PORT, 0, local_port); + cfg->bindcount++; out: spin_unlock(&irq_mapping_update_lock); @@ -494,12 +520,13 @@ static int bind_interdomain_evtchn_to_ir static int bind_virq_to_irq(unsigned int virq, unsigned int cpu) { struct evtchn_bind_virq bind_virq; + struct irq_cfg *cfg; int evtchn, irq; spin_lock(&irq_mapping_update_lock); if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) { - if ((irq = find_unbound_irq(cpu_to_node(cpu), + if ((irq = find_unbound_irq(cpu_to_node(cpu), &cfg, &dynirq_chip)) < 0) goto out; @@ -511,14 +538,15 @@ static int bind_virq_to_irq(unsigned int evtchn = bind_virq.port; evtchn_to_irq[evtchn] = irq; - irq_cfg(irq)->info = mk_irq_info(IRQT_VIRQ, virq, evtchn); + cfg->info = mk_irq_info(IRQT_VIRQ, virq, evtchn); per_cpu(virq_to_irq, cpu)[virq] = irq; bind_evtchn_to_cpu(evtchn, cpu); - } + } else + cfg = irq_cfg(irq); - irq_cfg(irq)->bindcount++; + cfg->bindcount++; out: spin_unlock(&irq_mapping_update_lock); @@ -528,12 +556,13 @@ static int bind_virq_to_irq(unsigned int static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) { struct evtchn_bind_ipi bind_ipi; + struct irq_cfg *cfg; int evtchn, irq; spin_lock(&irq_mapping_update_lock); if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) { - if ((irq = find_unbound_irq(cpu_to_node(cpu), + if ((irq = find_unbound_irq(cpu_to_node(cpu), &cfg, &dynirq_chip)) < 0) goto out; @@ -544,14 +573,15 @@ static int bind_ipi_to_irq(unsigned int evtchn = bind_ipi.port; evtchn_to_irq[evtchn] = irq; - irq_cfg(irq)->info = mk_irq_info(IRQT_IPI, ipi, evtchn); + cfg->info = mk_irq_info(IRQT_IPI, ipi, evtchn); per_cpu(ipi_to_irq, cpu)[ipi] = irq; bind_evtchn_to_cpu(evtchn, cpu); - } + } else + cfg = irq_cfg(irq); - irq_cfg(irq)->bindcount++; + cfg->bindcount++; out: spin_unlock(&irq_mapping_update_lock); @@ -561,23 +591,24 @@ static int bind_ipi_to_irq(unsigned int static void unbind_from_irq(unsigned int irq) { unsigned int cpu; - int evtchn = evtchn_from_irq(irq); + struct irq_cfg *cfg = irq_cfg(irq); + int evtchn = evtchn_from_irq_cfg(cfg); spin_lock(&irq_mapping_update_lock); - if (!--irq_cfg(irq)->bindcount && VALID_EVTCHN(evtchn)) { - if ((type_from_irq(irq) != IRQT_CALLER_PORT) && + if (!--cfg->bindcount && VALID_EVTCHN(evtchn)) { + if ((type_from_irq_cfg(cfg) != IRQT_CALLER_PORT) && close_evtchn(evtchn)) BUG(); - switch (type_from_irq(irq)) { + switch (type_from_irq_cfg(cfg)) { case IRQT_VIRQ: per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) - [index_from_irq(irq)] = -1; + [index_from_irq_cfg(cfg)] = -1; break; case IRQT_IPI: per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn)) - [index_from_irq(irq)] = -1; + [index_from_irq_cfg(cfg)] = -1; break; default: break; @@ -587,7 +618,7 @@ static void unbind_from_irq(unsigned int bind_evtchn_to_cpu(evtchn, 0); evtchn_to_irq[evtchn] = -1; - irq_cfg(irq)->info = IRQ_UNBOUND; + cfg->info = IRQ_UNBOUND; /* Zap stats across IRQ changes of use. 
*/ for_each_possible_cpu(cpu) @@ -740,25 +771,26 @@ void rebind_evtchn_to_cpu(int port, unsi unmask_evtchn(port); } -static void rebind_irq_to_cpu(unsigned int irq, unsigned int tcpu) +static void rebind_irq_to_cpu(struct irq_data *data, unsigned int tcpu) { - int evtchn = evtchn_from_irq(irq); + int evtchn = evtchn_from_irq_data(data); if (VALID_EVTCHN(evtchn)) rebind_evtchn_to_cpu(evtchn, tcpu); } -static int set_affinity_irq(unsigned int irq, const struct cpumask *dest) +static int set_affinity_irq(struct irq_data *data, + const struct cpumask *dest, bool force) { - rebind_irq_to_cpu(irq, cpumask_first(dest)); + rebind_irq_to_cpu(data, cpumask_first(dest)); return 0; } #endif -int resend_irq_on_evtchn(unsigned int irq) +int resend_irq_on_evtchn(struct irq_data *data) { - int masked, evtchn = evtchn_from_irq(irq); + int masked, evtchn = evtchn_from_irq_data(data); if (!VALID_EVTCHN(evtchn)) return 1; @@ -775,52 +807,51 @@ int resend_irq_on_evtchn(unsigned int ir * Interface to generic handling in irq.c */ -static void unmask_dynirq(unsigned int irq) +static void unmask_dynirq(struct irq_data *data) { - int evtchn = evtchn_from_irq(irq); + int evtchn = evtchn_from_irq_data(data); if (VALID_EVTCHN(evtchn)) unmask_evtchn(evtchn); } -static void mask_dynirq(unsigned int irq) +static void mask_dynirq(struct irq_data *data) { - int evtchn = evtchn_from_irq(irq); + int evtchn = evtchn_from_irq_data(data); if (VALID_EVTCHN(evtchn)) mask_evtchn(evtchn); } -static unsigned int startup_dynirq(unsigned int irq) +static unsigned int startup_dynirq(struct irq_data *data) { - unmask_dynirq(irq); + unmask_dynirq(data); return 0; } #define shutdown_dynirq mask_dynirq -static void end_dynirq(unsigned int irq) +static void end_dynirq(struct irq_data *data) { - if (!(irq_to_desc(irq)->status & IRQ_DISABLED)) { - move_masked_irq(irq); - unmask_dynirq(irq); + if (!(irq_to_desc(data->irq)->status & IRQ_DISABLED)) { + move_masked_irq(data->irq); + unmask_dynirq(data); } } static struct irq_chip dynirq_chip = { - .name = "Dynamic", - .startup = startup_dynirq, - .shutdown = shutdown_dynirq, - .enable = unmask_dynirq, - .disable = mask_dynirq, - .mask = mask_dynirq, - .unmask = unmask_dynirq, - .end = end_dynirq, - .eoi = end_dynirq, + .name = "Dynamic", + .irq_startup = startup_dynirq, + .irq_shutdown = shutdown_dynirq, + .irq_enable = unmask_dynirq, + .irq_disable = mask_dynirq, + .irq_mask = mask_dynirq, + .irq_unmask = unmask_dynirq, + .irq_eoi = end_dynirq, #ifdef CONFIG_SMP - .set_affinity = set_affinity_irq, + .irq_set_affinity = set_affinity_irq, #endif - .retrigger = resend_irq_on_evtchn, + .irq_retrigger = resend_irq_on_evtchn, }; /* Bitmap indicating which PIRQs require Xen to be notified on unmask. 
*/ @@ -873,18 +904,20 @@ static inline void pirq_query_unmask(int set_bit(irq - PIRQ_BASE, pirq_needs_eoi); } -static int set_type_pirq(unsigned int irq, unsigned int type) +static int set_type_pirq(struct irq_data *data, unsigned int type) { if (type != IRQ_TYPE_PROBE) return -EINVAL; - set_bit(irq - PIRQ_BASE, probing_pirq); + set_bit(data->irq - PIRQ_BASE, probing_pirq); return 0; } -static void enable_pirq(unsigned int irq) +static void enable_pirq(struct irq_data *data) { struct evtchn_bind_pirq bind_pirq; - int evtchn = evtchn_from_irq(irq); + unsigned int irq = data->irq; + struct irq_cfg *cfg = irq_data_cfg(data); + int evtchn = evtchn_from_irq_cfg(cfg); unsigned int pirq = irq - PIRQ_BASE; if (VALID_EVTCHN(evtchn)) { @@ -910,7 +943,7 @@ static void enable_pirq(unsigned int irq evtchn_to_irq[evtchn] = irq; bind_evtchn_to_cpu(evtchn, 0); - irq_cfg(irq)->info = mk_irq_info(IRQT_PIRQ, bind_pirq.pirq, evtchn); + cfg->info = mk_irq_info(IRQT_PIRQ, bind_pirq.pirq, evtchn); out: pirq_unmask_and_notify(evtchn, irq); @@ -918,15 +951,16 @@ static void enable_pirq(unsigned int irq #define disable_pirq mask_pirq -static unsigned int startup_pirq(unsigned int irq) +static unsigned int startup_pirq(struct irq_data *data) { - enable_pirq(irq); + enable_pirq(data); return 0; } -static void shutdown_pirq(unsigned int irq) +static void shutdown_pirq(struct irq_data *data) { - int evtchn = evtchn_from_irq(irq); + struct irq_cfg *cfg = irq_data_cfg(data); + int evtchn = evtchn_from_irq_cfg(cfg); if (!VALID_EVTCHN(evtchn)) return; @@ -938,48 +972,47 @@ static void shutdown_pirq(unsigned int i bind_evtchn_to_cpu(evtchn, 0); evtchn_to_irq[evtchn] = -1; - irq_cfg(irq)->info = mk_irq_info(IRQT_PIRQ, index_from_irq(irq), 0); + cfg->info = mk_irq_info(IRQT_PIRQ, index_from_irq_cfg(cfg), 0); } -static void unmask_pirq(unsigned int irq) +static void unmask_pirq(struct irq_data *data) { - int evtchn = evtchn_from_irq(irq); + int evtchn = evtchn_from_irq_data(data); if (VALID_EVTCHN(evtchn)) - pirq_unmask_and_notify(evtchn, irq); + pirq_unmask_and_notify(evtchn, data->irq); } #define mask_pirq mask_dynirq -static void end_pirq(unsigned int irq) +static void end_pirq(struct irq_data *data) { - const struct irq_desc *desc = irq_to_desc(irq); + const struct irq_desc *desc = irq_to_desc(data->irq); if ((desc->status & (IRQ_DISABLED|IRQ_PENDING)) == (IRQ_DISABLED|IRQ_PENDING)) - shutdown_pirq(irq); + shutdown_pirq(data); else { if (!(desc->status & IRQ_DISABLED)) - move_masked_irq(irq); - unmask_pirq(irq); + move_masked_irq(data->irq); + unmask_pirq(data); } } static struct irq_chip pirq_chip = { - .name = "Phys", - .startup = startup_pirq, - .shutdown = shutdown_pirq, - .enable = enable_pirq, - .disable = disable_pirq, - .mask = mask_pirq, - .unmask = unmask_pirq, - .end = end_pirq, - .eoi = end_pirq, - .set_type = set_type_pirq, + .name = "Phys", + .irq_startup = startup_pirq, + .irq_shutdown = shutdown_pirq, + .irq_enable = enable_pirq, + .irq_disable = disable_pirq, + .irq_mask = mask_pirq, + .irq_unmask = unmask_pirq, + .irq_eoi = end_pirq, + .irq_set_type = set_type_pirq, #ifdef CONFIG_SMP - .set_affinity = set_affinity_irq, + .irq_set_affinity = set_affinity_irq, #endif - .retrigger = resend_irq_on_evtchn, + .irq_retrigger = resend_irq_on_evtchn, }; int irq_ignore_unhandled(unsigned int irq) @@ -1169,28 +1202,39 @@ int __init arch_early_irq_init(void) unsigned int i; for (i = 0; i < ARRAY_SIZE(_irq_cfg); i++) - irq_to_desc(i)->chip_data = _irq_cfg + i; + set_irq_chip_data(i, _irq_cfg + i); return 0; } -#ifdef 
CONFIG_SPARSE_IRQ -int arch_init_chip_data(struct irq_desc *desc, int cpu) +struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node) { - if (!desc->chip_data) { - /* By default all event channels notify CPU#0. */ - cpumask_copy(desc->affinity, cpumask_of(0)); + int res = irq_alloc_desc_at(at, node); + struct irq_cfg *cfg = NULL; - desc->chip_data = kzalloc(sizeof(struct irq_cfg), GFP_ATOMIC); - } - if (!desc->chip_data) { - pr_emerg("cannot alloc irq_cfg\n"); - BUG(); + if (res < 0) { + if (res != -EEXIST) + return NULL; + cfg = get_irq_chip_data(at); + if (cfg) + return cfg; } - return 0; -} +#ifdef CONFIG_SPARSE_IRQ + /* By default all event channels notify CPU#0. */ + cpumask_copy(irq_get_irq_data(at)->affinity, cpumask_of(0)); + + cfg = kzalloc(sizeof(*cfg), GFP_KERNEL); + if (cfg) + set_irq_chip_data(at, cfg); + else + irq_free_desc(at); + + return cfg; +#else + return irq_cfg(at); #endif +} #ifdef CONFIG_SPARSE_IRQ int nr_pirqs = NR_PIRQS; @@ -1223,7 +1267,7 @@ int __init arch_probe_nr_irqs(void) printk(KERN_DEBUG "nr_pirqs: %d\n", nr_pirqs); - return 0; + return ARRAY_SIZE(_irq_cfg); } #endif @@ -1255,10 +1299,12 @@ int assign_irq_vector(int irq, struct ir void evtchn_register_pirq(int irq) { + struct irq_cfg *cfg = irq_cfg(irq); + BUG_ON(irq < PIRQ_BASE || irq - PIRQ_BASE >= nr_pirqs); - if (identity_mapped_irq(irq) || type_from_irq(irq) != IRQT_UNBOUND) + if (identity_mapped_irq(irq) || type_from_irq_cfg(cfg) != IRQT_UNBOUND) return; - irq_cfg(irq)->info = mk_irq_info(IRQT_PIRQ, irq, 0); + cfg->info = mk_irq_info(IRQT_PIRQ, irq, 0); set_irq_chip_and_handler_name(irq, &pirq_chip, handle_fasteoi_irq, "fasteoi"); } @@ -1267,15 +1313,12 @@ int evtchn_map_pirq(int irq, int xen_pir { if (irq < 0) { #ifdef CONFIG_SPARSE_IRQ + struct irq_cfg *cfg; + spin_lock(&irq_mapping_update_lock); - irq = find_unbound_irq(numa_node_id(), &pirq_chip); + irq = find_unbound_irq(numa_node_id(), &cfg, &pirq_chip); if (irq >= 0) { - struct irq_desc *desc; - struct irq_cfg *cfg; - - desc = irq_to_desc_alloc_node(irq, numa_node_id()); - cfg = desc->chip_data; - BUG_ON(type_from_irq(irq) != IRQT_UNBOUND); + BUG_ON(type_from_irq_cfg(cfg) != IRQT_UNBOUND); cfg->bindcount++; cfg->info = mk_irq_info(IRQT_PIRQ, xen_pirq, 0); } @@ -1291,15 +1334,17 @@ int evtchn_map_pirq(int irq, int xen_pir irq = PIRQ_BASE + nr_pirqs - 1; spin_lock(&irq_alloc_lock); do { - struct irq_desc *desc; struct irq_cfg *cfg; if (identity_mapped_irq(irq)) continue; - desc = irq_to_desc_alloc_node(irq, numa_node_id()); - cfg = desc->chip_data; - if (!index_from_irq(irq)) { - BUG_ON(type_from_irq(irq) != IRQT_UNBOUND); + cfg = alloc_irq_and_cfg_at(irq, numa_node_id()); + if (unlikely(!cfg)) { + spin_unlock(&irq_alloc_lock); + return -ENOMEM; + } + if (!index_from_irq_cfg(cfg)) { + BUG_ON(type_from_irq_cfg(cfg) != IRQT_UNBOUND); cfg->info = mk_irq_info(IRQT_PIRQ, xen_pirq, 0); break; @@ -1312,7 +1357,9 @@ int evtchn_map_pirq(int irq, int xen_pir handle_fasteoi_irq, "fasteoi"); #endif } else if (!xen_pirq) { - if (unlikely(type_from_irq(irq) != IRQT_PIRQ)) + struct irq_cfg *cfg = irq_cfg(irq); + + if (!cfg || unlikely(type_from_irq_cfg(cfg) != IRQT_PIRQ)) return -EINVAL; /* * dynamic_irq_cleanup(irq) would seem to be the correct thing @@ -1321,9 +1368,9 @@ int evtchn_map_pirq(int irq, int xen_pir * then causes a warning in dynamic_irq_cleanup(). 
*/ set_irq_chip_and_handler(irq, NULL, NULL); - irq_cfg(irq)->info = IRQ_UNBOUND; + cfg->info = IRQ_UNBOUND; #ifdef CONFIG_SPARSE_IRQ - irq_cfg(irq)->bindcount--; + cfg->bindcount--; #endif return 0; } else if (type_from_irq(irq) != IRQT_PIRQ @@ -1338,10 +1385,12 @@ int evtchn_map_pirq(int irq, int xen_pir int evtchn_get_xen_pirq(int irq) { + struct irq_cfg *cfg = irq_cfg(irq); + if (identity_mapped_irq(irq)) return irq; - BUG_ON(type_from_irq(irq) != IRQT_PIRQ); - return index_from_irq(irq); + BUG_ON(type_from_irq_cfg(cfg) != IRQT_PIRQ); + return index_from_irq_cfg(cfg); } void __init xen_init_IRQ(void) --- head-2011-03-17.orig/drivers/xen/core/smpboot.c 2011-02-01 14:50:44.000000000 +0100 +++ head-2011-03-17/drivers/xen/core/smpboot.c 2011-03-03 16:11:42.000000000 +0100 @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -34,7 +33,7 @@ extern void smp_trap_init(trap_info_t *) cpumask_var_t vcpu_initialized_mask; -DEFINE_PER_CPU(struct cpuinfo_x86, cpu_info); +DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); static DEFINE_PER_CPU(int, resched_irq); @@ -46,6 +45,11 @@ static char callfunc_name[NR_CPUS][15]; static char call1func_name[NR_CPUS][15]; static char reboot_name[NR_CPUS][15]; +#ifdef CONFIG_IRQ_WORK +static DEFINE_PER_CPU(int, irq_work_irq); +static char irq_work_name[NR_CPUS][15]; +#endif + void __init prefill_possible_map(void) { int i, rc; @@ -76,6 +80,9 @@ static int __cpuinit xen_smp_intr_init(u int rc; per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = +#ifdef CONFIG_IRQ_WORK + per_cpu(irq_work_irq, cpu) = +#endif per_cpu(call1func_irq, cpu) = per_cpu(reboot_irq, cpu) = -1; sprintf(resched_name[cpu], "resched%u", cpu); @@ -122,6 +129,19 @@ static int __cpuinit xen_smp_intr_init(u goto fail; per_cpu(reboot_irq, cpu) = rc; +#ifdef CONFIG_IRQ_WORK + sprintf(irq_work_name[cpu], "irqwork%u", cpu); + rc = bind_ipi_to_irqhandler(IRQ_WORK_VECTOR, + cpu, + smp_irq_work_interrupt, + IRQF_DISABLED|IRQF_NOBALANCING, + irq_work_name[cpu], + NULL); + if (rc < 0) + goto fail; + per_cpu(irq_work_irq, cpu) = rc; +#endif + rc = xen_spinlock_init(cpu); if (rc < 0) goto fail; @@ -140,6 +160,10 @@ static int __cpuinit xen_smp_intr_init(u unbind_from_irqhandler(per_cpu(call1func_irq, cpu), NULL); if (per_cpu(reboot_irq, cpu) >= 0) unbind_from_irqhandler(per_cpu(reboot_irq, cpu), NULL); +#ifdef CONFIG_IRQ_WORK + if (per_cpu(irq_work_irq, cpu) >= 0) + unbind_from_irqhandler(per_cpu(irq_work_irq, cpu), NULL); +#endif xen_spinlock_cleanup(cpu); return rc; } @@ -154,6 +178,9 @@ static void __cpuinit xen_smp_intr_exit( unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); unbind_from_irqhandler(per_cpu(call1func_irq, cpu), NULL); unbind_from_irqhandler(per_cpu(reboot_irq, cpu), NULL); +#ifdef CONFIG_IRQ_WORK + unbind_from_irqhandler(per_cpu(irq_work_irq, cpu), NULL); +#endif xen_spinlock_cleanup(cpu); } #endif --- head-2011-03-17.orig/drivers/xen/core/spinlock.c 2011-03-15 16:33:52.000000000 +0100 +++ head-2011-03-17/drivers/xen/core/spinlock.c 2011-03-03 16:11:45.000000000 +0100 @@ -21,7 +21,7 @@ struct spinning { struct spinning *prev; }; static DEFINE_PER_CPU(struct spinning *, _spinning); -static DEFINE_PER_CPU(evtchn_port_t, poll_evtchn); +static DEFINE_PER_CPU_READ_MOSTLY(evtchn_port_t, poll_evtchn); /* * Protect removal of objects: Addition can be done lockless, and even * removal itself doesn't need protection - what needs to be prevented is @@ -153,7 +153,7 @@ bool xen_spin_wait(arch_spinlock_t *lock */ 
arch_spinlock_t *lock = other->lock; - raw_local_irq_disable(); + arch_local_irq_disable(); while (lock->cur == other->ticket) { unsigned int token; bool kick, free; @@ -175,7 +175,7 @@ bool xen_spin_wait(arch_spinlock_t *lock } /* - * No need to use raw_local_irq_restore() here, as the + * No need to use arch_local_irq_restore() here, as the * intended event processing will happen with the poll * call. */ @@ -200,7 +200,7 @@ bool xen_spin_wait(arch_spinlock_t *lock /* announce we're done */ other = spinning.prev; percpu_write(_spinning, other); - raw_local_irq_disable(); + arch_local_irq_disable(); rm_idx = percpu_read(rm_seq.idx); smp_wmb(); percpu_write(rm_seq.idx, rm_idx + 1); @@ -229,7 +229,7 @@ bool xen_spin_wait(arch_spinlock_t *lock rm_idx &= 1; while (percpu_read(rm_seq.ctr[rm_idx].counter)) cpu_relax(); - raw_local_irq_restore(upcall_mask); + arch_local_irq_restore(upcall_mask); *ptok = lock->cur | (spinning.ticket << TICKET_SHIFT); return rc; @@ -256,7 +256,7 @@ void xen_spin_kick(arch_spinlock_t *lock return; } - flags = __raw_local_irq_save(); + flags = arch_local_irq_save(); for (;;) { unsigned int rm_idx = per_cpu(rm_seq.idx, cpu); @@ -281,7 +281,7 @@ void xen_spin_kick(arch_spinlock_t *lock } atomic_dec(rm_ctr); - raw_local_irq_restore(flags); + arch_local_irq_restore(flags); if (unlikely(spinning)) { notify_remote_via_evtchn(per_cpu(poll_evtchn, cpu)); --- head-2011-03-17.orig/drivers/xen/evtchn.c 2011-02-01 15:03:10.000000000 +0100 +++ head-2011-03-17/drivers/xen/evtchn.c 2011-02-01 15:09:47.000000000 +0100 @@ -528,7 +528,11 @@ static const struct file_operations evtc static struct miscdevice evtchn_miscdev = { .minor = MISC_DYNAMIC_MINOR, +#ifdef CONFIG_PARAVIRT_XEN .name = "xen/evtchn", +#else + .name = "evtchn", +#endif .nodename = "xen/evtchn", .fops = &evtchn_fops, }; --- head-2011-03-17.orig/drivers/xen/gntdev/gntdev.c 2011-02-01 15:03:10.000000000 +0100 +++ head-2011-03-17/drivers/xen/gntdev/gntdev.c 2011-02-01 15:09:47.000000000 +0100 @@ -145,6 +145,7 @@ static long gntdev_ioctl(struct file *fl static const struct file_operations gntdev_fops = { .owner = THIS_MODULE, .open = gntdev_open, + .llseek = no_llseek, .release = gntdev_release, .mmap = gntdev_mmap, .unlocked_ioctl = gntdev_ioctl @@ -430,6 +431,8 @@ static int gntdev_open(struct inode *ino { gntdev_file_private_data_t *private_data; + nonseekable_open(inode, flip); + try_module_get(THIS_MODULE); /* Allocate space for the per-instance private data. */ --- head-2011-03-17.orig/drivers/xen/privcmd/privcmd.c 2011-01-31 18:07:35.000000000 +0100 +++ head-2011-03-17/drivers/xen/privcmd/privcmd.c 2011-02-01 15:09:47.000000000 +0100 @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -416,7 +415,8 @@ static int privcmd_mmap(struct file * fi if (xen_feature(XENFEAT_auto_translated_physmap)) return -ENOSYS; - /* DONTCOPY is essential for Xen as copy_page_range is broken. 
*/ + /* DONTCOPY is essential for Xen because copy_page_range doesn't know + * how to recreate these mappings */ vma->vm_flags |= VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTCOPY; vma->vm_ops = &privcmd_vm_ops; vma->vm_private_data = NULL; @@ -426,6 +426,8 @@ static int privcmd_mmap(struct file * fi #endif static const struct file_operations privcmd_file_ops = { + .open = nonseekable_open, + .llseek = no_llseek, .unlocked_ioctl = privcmd_ioctl, .mmap = privcmd_mmap, }; --- head-2011-03-17.orig/drivers/xen/scsifront/scsifront.c 2011-02-08 10:04:41.000000000 +0100 +++ head-2011-03-17/drivers/xen/scsifront/scsifront.c 2011-02-08 10:08:14.000000000 +0100 @@ -315,11 +315,12 @@ big_to_sg: return ref_cnt; } -static int scsifront_queuecommand(struct scsi_cmnd *sc, - void (*done)(struct scsi_cmnd *)) +static int scsifront_queuecommand(struct Scsi_Host *shost, + struct scsi_cmnd *sc) { - struct vscsifrnt_info *info = shost_priv(sc->device->host); + struct vscsifrnt_info *info = shost_priv(shost); vscsiif_request_t *ring_req; + unsigned long flags; int ref_cnt; uint16_t rqid; @@ -328,11 +329,12 @@ static int scsifront_queuecommand(struct sc->cmnd[0],sc->cmnd[1],sc->cmnd[2],sc->cmnd[3],sc->cmnd[4], sc->cmnd[5],sc->cmnd[6],sc->cmnd[7],sc->cmnd[8],sc->cmnd[9]); */ + spin_lock_irqsave(shost->host_lock, flags); if (RING_FULL(&info->ring)) { - goto out_host_busy; + spin_unlock_irqrestore(shost->host_lock, flags); + return SCSI_MLQUEUE_HOST_BUSY; } - sc->scsi_done = done; sc->result = 0; ring_req = scsifront_pre_request(info); @@ -361,27 +363,21 @@ static int scsifront_queuecommand(struct ref_cnt = map_data_for_request(info, sc, ring_req, rqid); if (ref_cnt < 0) { add_id_to_freelist(info, rqid); + spin_unlock_irqrestore(shost->host_lock, flags); if (ref_cnt == (-ENOMEM)) - goto out_host_busy; - else { - sc->result = (DID_ERROR << 16); - goto out_fail_command; - } + return SCSI_MLQUEUE_HOST_BUSY; + sc->result = (DID_ERROR << 16); + sc->scsi_done(sc); + return 0; } ring_req->nr_segments = (uint8_t)ref_cnt; info->shadow[rqid].nr_segments = ref_cnt; scsifront_do_request(info); + spin_unlock_irqrestore(shost->host_lock, flags); return 0; - -out_host_busy: - return SCSI_MLQUEUE_HOST_BUSY; - -out_fail_command: - done(sc); - return 0; } --- head-2011-03-17.orig/drivers/xen/sfc_netback/ci/tools/platform/linux_kernel.h 2008-02-20 09:32:49.000000000 +0100 +++ head-2011-03-17/drivers/xen/sfc_netback/ci/tools/platform/linux_kernel.h 2011-02-01 15:09:47.000000000 +0100 @@ -54,7 +54,6 @@ #include #include #include -#include #include #include #include --- head-2011-03-17.orig/drivers/xen/xenbus/xenbus_dev.c 2011-02-01 15:03:03.000000000 +0100 +++ head-2011-03-17/drivers/xen/xenbus/xenbus_dev.c 2011-02-01 15:09:47.000000000 +0100 @@ -454,6 +454,7 @@ static const struct file_operations xenb .write = xenbus_dev_write, .open = xenbus_dev_open, .release = xenbus_dev_release, + .llseek = no_llseek, .poll = xenbus_dev_poll, #ifdef HAVE_UNLOCKED_IOCTL .unlocked_ioctl = xenbus_dev_ioctl --- head-2011-03-17.orig/drivers/xen/xenbus/xenbus_probe.c 2011-02-01 15:04:27.000000000 +0100 +++ head-2011-03-17/drivers/xen/xenbus/xenbus_probe.c 2011-02-01 15:09:47.000000000 +0100 @@ -82,10 +82,13 @@ int xen_store_evtchn; #if !defined(CONFIG_XEN) && !defined(MODULE) -EXPORT_SYMBOL(xen_store_evtchn); +EXPORT_SYMBOL_GPL(xen_store_evtchn); #endif struct xenstore_domain_interface *xen_store_interface; +#if !defined(CONFIG_XEN) && !defined(MODULE) +EXPORT_SYMBOL_GPL(xen_store_interface); +#endif static unsigned long xen_store_mfn; @@ -1102,9 +1105,7 @@ 
int __devinit xenbus_init(void) #endif { int err = 0; -#if defined(CONFIG_XEN) || defined(MODULE) unsigned long page = 0; -#endif DPRINTK(""); @@ -1122,7 +1123,6 @@ int __devinit xenbus_init(void) * Domain0 doesn't have a store_evtchn or store_mfn yet. */ if (is_initial_xendomain()) { -#if defined(CONFIG_XEN) || defined(MODULE) struct evtchn_alloc_unbound alloc_unbound; /* Allocate Xenstore page */ @@ -1161,9 +1161,6 @@ int __devinit xenbus_init(void) if (xsd_port_intf) xsd_port_intf->read_proc = xsd_port_read; #endif -#else - /* dom0 not yet supported */ -#endif xen_store_interface = mfn_to_virt(xen_store_mfn); } else { #if !defined(CONFIG_XEN) && !defined(MODULE) @@ -1249,10 +1246,8 @@ int __devinit xenbus_init(void) * registered. */ -#if defined(CONFIG_XEN) || defined(MODULE) if (page != 0) free_page(page); -#endif return err; } --- head-2011-03-17.orig/include/xen/Kbuild 2011-02-01 14:50:44.000000000 +0100 +++ head-2011-03-17/include/xen/Kbuild 2011-02-01 15:09:47.000000000 +0100 @@ -1,2 +1 @@ -header-y += privcmd.h header-y += public/ --- head-2011-03-17.orig/include/xen/evtchn.h 2011-02-01 14:55:46.000000000 +0100 +++ head-2011-03-17/include/xen/evtchn.h 2011-02-01 15:09:47.000000000 +0100 @@ -58,6 +58,7 @@ struct irq_cfg { #endif }; }; +struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node); #endif /* --- head-2011-03-17.orig/include/xen/interface/memory.h 2011-02-01 15:04:27.000000000 +0100 +++ head-2011-03-17/include/xen/interface/memory.h 2011-02-01 15:09:47.000000000 +0100 @@ -198,6 +198,7 @@ struct xen_machphys_mapping { xen_ulong_t v_start, v_end; /* Start and end virtual addresses. */ xen_ulong_t max_mfn; /* Maximum MFN that can be looked up. */ }; +DEFINE_GUEST_HANDLE_STRUCT(xen_machphys_mapping); typedef struct xen_machphys_mapping xen_machphys_mapping_t; DEFINE_XEN_GUEST_HANDLE(xen_machphys_mapping_t); @@ -252,6 +253,7 @@ struct xen_memory_map { */ XEN_GUEST_HANDLE(void) buffer; }; +DEFINE_GUEST_HANDLE_STRUCT(xen_memory_map); typedef struct xen_memory_map xen_memory_map_t; DEFINE_XEN_GUEST_HANDLE(xen_memory_map_t); --- head-2011-03-17.orig/include/xen/privcmd.h 2011-03-17 14:35:43.000000000 +0100 +++ head-2011-03-17/include/xen/privcmd.h 2011-02-01 15:09:47.000000000 +0100 @@ -1,77 +1,3 @@ -/****************************************************************************** - * privcmd.h - * - * Interface to /proc/xen/privcmd. - * - * Copyright (c) 2003-2005, K A Fraser - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation; or, when distributed - * separately from the Linux kernel or incorporated into other - * software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#ifndef __LINUX_PUBLIC_PRIVCMD_H__ -#define __LINUX_PUBLIC_PRIVCMD_H__ - -#include -#include - -typedef unsigned long xen_pfn_t; - -struct privcmd_hypercall { - __u64 op; - __u64 arg[5]; -}; - -struct privcmd_mmap_entry { - __u64 va; - __u64 mfn; - __u64 npages; -}; - -struct privcmd_mmap { - int num; - domid_t dom; /* target domain */ - struct privcmd_mmap_entry __user *entry; -}; - -struct privcmd_mmapbatch { - int num; /* number of pages to populate */ - domid_t dom; /* target domain */ - __u64 addr; /* virtual address */ - xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */ -}; - -/* - * @cmd: IOCTL_PRIVCMD_HYPERCALL - * @arg: &privcmd_hypercall_t - * Return: Value returned from execution of the specified hypercall. - */ -#define IOCTL_PRIVCMD_HYPERCALL \ - _IOC(_IOC_NONE, 'P', 0, sizeof(struct privcmd_hypercall)) -#define IOCTL_PRIVCMD_MMAP \ - _IOC(_IOC_NONE, 'P', 2, sizeof(struct privcmd_mmap)) -#define IOCTL_PRIVCMD_MMAPBATCH \ - _IOC(_IOC_NONE, 'P', 3, sizeof(struct privcmd_mmapbatch)) - -#endif /* __LINUX_PUBLIC_PRIVCMD_H__ */ +#if defined(CONFIG_PARAVIRT_XEN) || !defined(__KERNEL__) +#include "public/privcmd.h" +#endif --- head-2011-03-17.orig/include/xen/public/privcmd.h 2011-02-01 14:38:38.000000000 +0100 +++ head-2011-03-17/include/xen/public/privcmd.h 2011-02-01 15:09:47.000000000 +0100 @@ -34,6 +34,7 @@ #define __LINUX_PUBLIC_PRIVCMD_H__ #include +#include typedef struct privcmd_hypercall { --- head-2011-03-17.orig/kernel/power/Kconfig 2011-03-17 14:35:43.000000000 +0100 +++ head-2011-03-17/kernel/power/Kconfig 2011-02-01 15:09:47.000000000 +0100 @@ -65,7 +65,7 @@ config PM_TRACE config PM_TRACE_RTC bool "Suspend/resume event tracing" depends on CAN_PM_TRACE - depends on X86 + depends on X86 && !XEN_UNPRIVILEGED_GUEST select PM_TRACE default n ---help--- @@ -111,7 +111,7 @@ config SUSPEND config PM_TEST_SUSPEND bool "Test suspend/resume and wakealarm during bootup" - depends on SUSPEND && PM_DEBUG && RTC_CLASS=y + depends on SUSPEND && PM_DEBUG && RTC_CLASS=y && !XEN_UNPRIVILEGED_GUEST ---help--- This option will let you suspend your machine during bootup, and make it wake up a few seconds later using an RTC wakeup alarm. --- head-2011-03-17.orig/lib/swiotlb-xen.c 2011-02-01 15:04:27.000000000 +0100 +++ head-2011-03-17/lib/swiotlb-xen.c 2011-02-01 15:09:47.000000000 +0100 @@ -58,7 +58,7 @@ static unsigned long io_tlb_nslabs; */ static unsigned long io_tlb_overflow = 32*1024; -void *io_tlb_overflow_buffer; +static void *io_tlb_overflow_buffer; /* * This is a free list describing the number of free entries available from @@ -174,16 +174,16 @@ void __init swiotlb_init_with_tbl(char * * Allocate and initialize the free list array. This array is used * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE. 
*/ - io_tlb_list = alloc_bootmem(io_tlb_nslabs * sizeof(int)); + io_tlb_list = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); for (i = 0; i < io_tlb_nslabs; i++) io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); io_tlb_index = 0; - io_tlb_orig_addr = alloc_bootmem(io_tlb_nslabs * sizeof(phys_addr_t)); + io_tlb_orig_addr = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); /* * Get the overflow emergency buffer */ - io_tlb_overflow_buffer = alloc_bootmem(io_tlb_overflow); + io_tlb_overflow_buffer = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_overflow)); if (!io_tlb_overflow_buffer) panic("Cannot allocate SWIOTLB overflow buffer!\n"); @@ -218,7 +218,7 @@ swiotlb_init_with_default_size(size_t de /* * Get IO TLB memory from the low pages */ - io_tlb_start = alloc_bootmem_pages(bytes); + io_tlb_start = alloc_bootmem_pages(PAGE_ALIGN(bytes)); if (!io_tlb_start) panic("Cannot allocate SWIOTLB buffer"); --- head-2011-03-17.orig/mm/vmalloc.c 2011-02-01 14:39:24.000000000 +0100 +++ head-2011-03-17/mm/vmalloc.c 2011-02-01 15:09:47.000000000 +0100 @@ -478,8 +478,6 @@ static void vmap_debug_free_range(unsign #ifdef CONFIG_DEBUG_PAGEALLOC vunmap_page_range(start, end); flush_tlb_kernel_range(start, end); -#elif defined(CONFIG_XEN) && defined(CONFIG_X86) - vunmap_page_range(start, end); #endif }
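
A note for readers adapting other out-of-tree drivers to 2.6.37: the bulk of the event-channel rework above is the genirq conversion, in which struct irq_chip callbacks receive a struct irq_data pointer instead of a bare IRQ number and are registered through the .irq_* members, with per-IRQ state reached through the chip data attached to that irq_data (as the patch does via irq_data_cfg()). The following is an illustrative sketch only, not part of the patch; all example_* identifiers are placeholders.

#include <linux/irq.h>

struct example_cfg {
	bool masked;
	unsigned int cpu;
};

static void example_mask(struct irq_data *data)
{
	/* per-IRQ state is attached to the irq_data as chip data */
	struct example_cfg *cfg = data->chip_data;

	cfg->masked = true;		/* real hardware masking would go here */
}

static void example_unmask(struct irq_data *data)
{
	struct example_cfg *cfg = data->chip_data;

	cfg->masked = false;		/* real hardware unmasking would go here */
}

static unsigned int example_startup(struct irq_data *data)
{
	example_unmask(data);
	return 0;
}

#ifdef CONFIG_SMP
static int example_set_affinity(struct irq_data *data,
				const struct cpumask *dest, bool force)
{
	struct example_cfg *cfg = data->chip_data;

	/* note the additional 'force' argument in the irq_data based callback */
	cfg->cpu = cpumask_first(dest);	/* hardware rebinding would go here */
	return 0;
}
#endif

static struct irq_chip example_chip = {
	.name			= "example",
	.irq_startup		= example_startup,
	.irq_mask		= example_mask,
	.irq_unmask		= example_unmask,
#ifdef CONFIG_SMP
	.irq_set_affinity	= example_set_affinity,
#endif
};

/* Wiring an IRQ to the chip is unchanged apart from the .irq_* members. */
static void example_setup_irq(unsigned int irq, struct example_cfg *cfg)
{
	set_irq_chip_data(irq, cfg);
	set_irq_chip_and_handler_name(irq, &example_chip,
				      handle_fasteoi_irq, "fasteoi");
}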
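
In the same vein, alloc_irq_and_cfg_at() above replaces the removed arch_init_chip_data() hook: with 2.6.37 a descriptor is requested explicitly through irq_alloc_desc_at() and the per-IRQ data is then attached as chip data. A minimal sketch of that pattern, again with placeholder example_* names and not taken from the patch:

#include <linux/irq.h>
#include <linux/slab.h>

struct example_irq_state {
	unsigned int info;
};

static struct example_irq_state *example_state_at(unsigned int at, int node)
{
	struct example_irq_state *st;
	int res = irq_alloc_desc_at(at, node);

	if (res < 0 && res != -EEXIST)
		return NULL;			/* no descriptor available */

	st = get_irq_chip_data(at);
	if (st)
		return st;			/* already initialized earlier */

	st = kzalloc(sizeof(*st), GFP_KERNEL);
	if (st)
		set_irq_chip_data(at, st);
	else if (res >= 0)
		irq_free_desc(at);		/* undo the allocation made above */

	return st;
}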
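
The scsifront change follows the 2.6.37 SCSI midlayer switch to lock-less queuecommand: the handler now takes the Scsi_Host directly, is entered without the host lock held (drivers that still need it take it themselves), and failed commands are completed through sc->scsi_done() rather than a 'done' callback argument; legacy drivers can instead keep the old prototype behind the DEF_SCSI_QCMD() wrapper. A sketch of the converted shape, with example_* standing in for driver specifics:

#include <linux/spinlock.h>
#include <scsi/scsi.h>
#include <scsi/scsi_cmnd.h>
#include <scsi/scsi_host.h>

struct example_host_info {
	bool ring_full;			/* placeholder for real queue state */
};

static int example_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *sc)
{
	struct example_host_info *info = shost_priv(shost);
	unsigned long flags;

	/* the midlayer no longer holds host_lock across queuecommand */
	spin_lock_irqsave(shost->host_lock, flags);

	if (info->ring_full) {
		spin_unlock_irqrestore(shost->host_lock, flags);
		return SCSI_MLQUEUE_HOST_BUSY;
	}

	/* ... map and submit the request here ... */

	spin_unlock_irqrestore(shost->host_lock, flags);
	return 0;
}

/* Completion no longer goes through a 'done' argument. */
static void example_fail_command(struct scsi_cmnd *sc)
{
	sc->result = DID_ERROR << 16;
	sc->scsi_done(sc);
}

The remaining changes in this part of the patch are mechanical one-for-one substitutions: raw_local_irq_*() becomes arch_local_irq_*() in the spinlock code, the character devices gain nonseekable_open()/no_llseek, and the SWIOTLB bootmem allocations move to page-aligned alloc_bootmem_pages().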