From: Linux Kernel Mailing List
Subject: Linux 2.6.30
Patch-mainline: 2.6.30

This patch contains the differences between 2.6.29 and 2.6.30.

Acked-by: Jeff Mahoney
Automatically created from "patches.kernel.org/patch-2.6.30" by xen-port-patches.py

--- head-2010-05-25.orig/arch/ia64/include/asm/xen/hypervisor.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-05-25/arch/ia64/include/asm/xen/hypervisor.h 2010-03-24 15:25:06.000000000 +0100 @@ -34,13 +34,13 @@ #define _ASM_IA64_XEN_HYPERVISOR_H #include +#ifdef CONFIG_PARAVIRT_XEN #include #include /* to compile feature.c */ #include /* to comiple xen-netfront.c */ #include #include -#ifdef CONFIG_PARAVIRT_XEN extern struct shared_info *HYPERVISOR_shared_info; extern struct start_info *xen_start_info; --- head-2010-05-25.orig/arch/ia64/kernel/vmlinux.lds.S 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/arch/ia64/kernel/vmlinux.lds.S 2010-03-24 15:25:06.000000000 +0100 @@ -182,7 +182,7 @@ SECTIONS __start_gate_section = .; *(.data.gate) __stop_gate_section = .; -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN . = ALIGN(PAGE_SIZE); __xen_start_gate_section = .; *(.data.gate.xen) --- head-2010-05-25.orig/arch/x86/Kconfig 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/Kconfig 2010-03-24 15:25:06.000000000 +0100 @@ -49,8 +49,8 @@ config X86 select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_DMA_API_DEBUG select HAVE_KERNEL_GZIP - select HAVE_KERNEL_BZIP2 - select HAVE_KERNEL_LZMA + select HAVE_KERNEL_BZIP2 if !XEN + select HAVE_KERNEL_LZMA if !XEN select HAVE_KERNEL_LZO select HAVE_HW_BREAKPOINT select PERF_EVENTS @@ -337,11 +337,11 @@ config X86_XEN config X86_BIGSMP bool "Support for big SMP systems with more than 8 CPUs" - depends on X86_32 && SMP + depends on X86_32 && SMP && !XEN ---help--- This option is needed for the systems that have more than 8 CPUs -if X86_32 +if X86_32 && !XEN config X86_EXTENDED_PLATFORM bool "Support for extended (non-PC) x86 platforms" default y @@ -371,7 +371,7 @@ config X86_64_XEN help This option will compile a kernel compatible with Xen hypervisor -if X86_64 +if X86_64 && !XEN config X86_EXTENDED_PLATFORM bool "Support for extended (non-PC) x86 platforms" default y @@ -842,7 +842,7 @@ config MAXSMP config NR_CPUS int "Maximum number of CPUs" if SMP && !MAXSMP - range 2 8 if SMP && X86_32 && !X86_BIGSMP + range 2 8 if SMP && X86_32 && !X86_BIGSMP && !X86_XEN range 2 512 if SMP && !MAXSMP default "1" if !SMP default "4096" if MAXSMP @@ -916,10 +916,6 @@ config X86_VISWS_APIC def_bool y depends on X86_32 && X86_VISWS -config X86_XEN_GENAPIC - def_bool y - depends on X86_64_XEN - config X86_REROUTE_FOR_BROKEN_BOOT_IRQS bool "Reroute for broken boot IRQs" default n --- head-2010-05-25.orig/arch/x86/Makefile 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/Makefile 2010-03-24 15:25:06.000000000 +0100 @@ -111,10 +111,6 @@ endif # prevent gcc from generating any FP code by mistake KBUILD_CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) -# Xen subarch support -mflags-$(CONFIG_XEN) := -Iarch/x86/include/mach-xen -mcore-$(CONFIG_XEN) := arch/x86/mach-xen/ - KBUILD_CFLAGS += $(mflags-y) KBUILD_AFLAGS += $(mflags-y) @@ -187,10 +183,10 @@ endif $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE) $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@ +endif $(BOOT_TARGETS): vmlinux $(Q)$(MAKE) $(build)=$(boot) $@ -endif PHONY += install install: --- head-2010-05-25.orig/arch/x86/boot/Makefile 2010-03-24
15:01:37.000000000 +0100 +++ head-2010-05-25/arch/x86/boot/Makefile 2010-03-24 15:25:06.000000000 +0100 @@ -204,6 +204,12 @@ $(obj)/vmlinux-stripped: OBJCOPYFLAGS := $(obj)/vmlinux-stripped: vmlinux FORCE $(call if_changed,objcopy) +ifndef CONFIG_XEN +bzImage := bzImage +else +bzImage := vmlinuz +endif + install: - sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/bzImage \ + sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/$(bzImage) \ System.map "$(INSTALL_PATH)" --- head-2010-05-25.orig/arch/x86/ia32/ia32entry-xen.S 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/ia32/ia32entry-xen.S 2010-03-24 15:25:06.000000000 +0100 @@ -502,7 +502,7 @@ ia32_sys_call_table: .quad sys32_olduname .quad sys_umask /* 60 */ .quad sys_chroot - .quad sys32_ustat + .quad compat_sys_ustat .quad sys_dup2 .quad sys_getppid .quad sys_getpgrp /* 65 */ @@ -773,4 +773,6 @@ ia32_sys_call_table: .quad sys_dup3 /* 330 */ .quad sys_pipe2 .quad sys_inotify_init1 + .quad compat_sys_preadv + .quad compat_sys_pwritev ia32_syscall_end: --- head-2010-05-25.orig/arch/x86/include/asm/kexec.h 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/include/asm/kexec.h 2010-03-24 15:25:06.000000000 +0100 @@ -21,8 +21,14 @@ # define PA_CONTROL_PAGE 0 # define VA_CONTROL_PAGE 1 # define PA_TABLE_PAGE 2 +# ifndef CONFIG_XEN # define PA_SWAP_PAGE 3 # define PAGES_NR 4 +# else /* CONFIG_XEN, see comment above +# define VA_TABLE_PAGE 3 */ +# define PA_SWAP_PAGE 4 +# define PAGES_NR 5 +# endif /* CONFIG_XEN */ #endif # define KEXEC_CONTROL_CODE_MAX_SIZE 2048 --- head-2010-05-25.orig/arch/x86/include/asm/page_64_types.h 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/arch/x86/include/asm/page_64_types.h 2010-03-24 15:25:06.000000000 +0100 @@ -69,7 +69,15 @@ extern void init_extra_mapping_wb(unsign #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_FLATMEM +/* + * While max_pfn is not exported, max_mapnr never gets initialized for non-Xen + * other than for hotplugged memory. 
+ */ +#ifndef CONFIG_XEN #define pfn_valid(pfn) ((pfn) < max_pfn) +#else +#define pfn_valid(pfn) ((pfn) < max_mapnr) +#endif #endif #endif /* _ASM_X86_PAGE_64_DEFS_H */ --- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/desc.h 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/include/mach-xen/asm/desc.h 2010-03-24 15:25:06.000000000 +0100 @@ -39,7 +39,7 @@ extern gate_desc idt_table[]; struct gdt_page { struct desc_struct gdt[GDT_ENTRIES]; } __attribute__((aligned(PAGE_SIZE))); -DECLARE_PER_CPU(struct gdt_page, gdt_page); +DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page); static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) { @@ -91,7 +91,6 @@ static inline int desc_empty(const void #define store_gdt(dtr) native_store_gdt(dtr) #define store_idt(dtr) native_store_idt(dtr) #define store_tr(tr) (tr = native_store_tr()) -#define store_ldt(ldt) asm("sldt %0":"=m" (ldt)) #define load_TLS(t, cpu) native_load_tls(t, cpu) #define set_ldt native_set_ldt @@ -111,6 +110,8 @@ static inline void paravirt_free_ldt(str { } +#define store_ldt(ldt) asm("sldt %0" : "=m"(ldt)) + static inline void native_write_idt_entry(gate_desc *idt, int entry, const gate_desc *gate) { @@ -251,6 +252,8 @@ static inline void native_load_tls(struc gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]; } #else +#include + #define load_TLS(t, cpu) xen_load_tls(t, cpu) #define set_ldt xen_set_ldt @@ -265,8 +268,9 @@ static inline void xen_load_tls(struct t struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN; for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) - if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]), - *(u64 *)&t->tls_array[i])) + if (HYPERVISOR_update_descriptor( + arbitrary_virt_to_machine(&gdt[i]), + *(u64 *)&t->tls_array[i])) BUG(); } #endif --- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/fixmap.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-05-25/arch/x86/include/mach-xen/asm/fixmap.h 2010-03-24 15:25:06.000000000 +0100 @@ -1,11 +1,154 @@ +/* + * fixmap.h: compile-time virtual memory allocation + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 1998 Ingo Molnar + * + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + * x86_32 and x86_64 integration by Gustavo F. Padovan, February 2009 + */ + #ifndef _ASM_X86_FIXMAP_H #define _ASM_X86_FIXMAP_H +#ifndef __ASSEMBLY__ +#include +#include +#include +#include +#ifdef CONFIG_X86_32 +#include +#include +#else +#include +#endif + +/* + * We can't declare FIXADDR_TOP as variable for x86_64 because vsyscall + * uses fixmaps that relies on FIXADDR_TOP for proper address calculation. + * Because of this, FIXADDR_TOP x86 integration was left as later work. + */ +#ifdef CONFIG_X86_32 +/* used by vmalloc.c, vsyscall.lds.S. + * + * Leave one empty page between vmalloc'ed areas and + * the start of the fixmap. + */ +extern unsigned long __FIXADDR_TOP; +#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP) + +#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO) +#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1) +#else +#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE) + +/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */ +#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL) +#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE) +#endif + + +/* + * Here we define all the compile-time 'special' virtual + * addresses. 
The point is to have a constant address at + * compile time, but to set the physical address only + * in the boot process. + * for x86_32: We allocate these special addresses + * from the end of virtual memory (0xfffff000) backwards. + * Also this lets us do fail-safe vmalloc(), we + * can guarantee that these special addresses and + * vmalloc()-ed addresses never overlap. + * + * These 'compile-time allocated' memory buffers are + * fixed-size 4k pages (or larger if used with an increment + * higher than 1). Use set_fixmap(idx,phys) to associate + * physical memory with fixmap indices. + * + * TLB entries of such buffers will not be flushed across + * task switches. + */ +enum fixed_addresses { #ifdef CONFIG_X86_32 -# include "fixmap_32.h" + FIX_HOLE, + FIX_VDSO, #else -# include "fixmap_64.h" + VSYSCALL_LAST_PAGE, + VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE + + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1, + VSYSCALL_HPET, +#endif + FIX_DBGP_BASE, + FIX_EARLYCON_MEM_BASE, +#ifdef CONFIG_X86_LOCAL_APIC + FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ #endif +#ifndef CONFIG_XEN +#ifdef CONFIG_X86_IO_APIC + FIX_IO_APIC_BASE_0, + FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, +#endif +#else + FIX_SHARED_INFO, +#define NR_FIX_ISAMAPS 256 + FIX_ISAMAP_END, + FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1, +#endif +#ifdef CONFIG_X86_VISWS_APIC + FIX_CO_CPU, /* Cobalt timer */ + FIX_CO_APIC, /* Cobalt APIC Redirection Table */ + FIX_LI_PCIA, /* Lithium PCI Bridge A */ + FIX_LI_PCIB, /* Lithium PCI Bridge B */ +#endif +#ifdef CONFIG_X86_F00F_BUG + FIX_F00F_IDT, /* Virtual mapping for IDT */ +#endif +#ifdef CONFIG_X86_CYCLONE_TIMER + FIX_CYCLONE_TIMER, /*cyclone timer register*/ +#endif +#ifdef CONFIG_X86_32 + FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ + FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, +#ifdef CONFIG_PCI_MMCONFIG + FIX_PCIE_MCFG, +#endif +#endif +#ifdef CONFIG_PARAVIRT + FIX_PARAVIRT_BOOTMAP, +#endif + FIX_TEXT_POKE0, /* reserve 2 pages for text_poke() */ + FIX_TEXT_POKE1, + __end_of_permanent_fixed_addresses, +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT + FIX_OHCI1394_BASE, +#endif + /* + * 256 temporary boot-time mappings, used by early_ioremap(), + * before ioremap() is functional. 
+ * + * We round it up to the next 256 pages boundary so that we + * can have a single pgd entry and a single pte table: + */ +#define NR_FIX_BTMAPS 64 +#define FIX_BTMAPS_SLOTS 4 + FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 - + (__end_of_permanent_fixed_addresses & 255), + FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1, +#ifdef CONFIG_X86_32 + FIX_WP_TEST, +#endif + __end_of_fixed_addresses +}; + + +extern void reserve_top_address(unsigned long reserve); + +#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) +#define FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) +#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) +#define FIXADDR_BOOT_START (FIXADDR_TOP - FIXADDR_BOOT_SIZE) extern int fixmaps_set; @@ -13,10 +156,10 @@ extern pte_t *kmap_pte; extern pgprot_t kmap_prot; extern pte_t *pkmap_page_table; -void xen_set_fixmap(enum fixed_addresses, maddr_t, pgprot_t); +void xen_set_fixmap(enum fixed_addresses, phys_addr_t, pgprot_t); static inline void __set_fixmap(enum fixed_addresses idx, - maddr_t phys, pgprot_t flags) + phys_addr_t phys, pgprot_t flags) { xen_set_fixmap(idx, phys, flags); } @@ -65,4 +208,5 @@ static inline unsigned long virt_to_fix( BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START); return __virt_to_fix(vaddr); } +#endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_FIXMAP_H */ --- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/fixmap_32.h 2010-03-24 15:14:47.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,125 +0,0 @@ -/* - * fixmap.h: compile-time virtual memory allocation - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 1998 Ingo Molnar - * - * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 - */ - -#ifndef _ASM_X86_FIXMAP_32_H -#define _ASM_X86_FIXMAP_32_H - -/* used by vmalloc.c, vsyscall.lds.S. - * - * Leave one empty page between vmalloc'ed areas and - * the start of the fixmap. - */ -extern unsigned long __FIXADDR_TOP; -#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO) -#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1) - -#ifndef __ASSEMBLY__ -#include -#include -#include -#include -#include -#include - -/* - * Here we define all the compile-time 'special' virtual - * addresses. The point is to have a constant address at - * compile time, but to set the physical address only - * in the boot process. We allocate these special addresses - * from the end of virtual memory (0xfffff000) backwards. - * Also this lets us do fail-safe vmalloc(), we - * can guarantee that these special addresses and - * vmalloc()-ed addresses never overlap. - * - * these 'compile-time allocated' memory buffers are - * fixed-size 4k pages. (or larger if used with an increment - * highger than 1) use fixmap_set(idx,phys) to associate - * physical memory with fixmap indices. - * - * TLB entries of such buffers will not be flushed across - * task switches. 
- */ -enum fixed_addresses { - FIX_HOLE, - FIX_VDSO, - FIX_DBGP_BASE, - FIX_EARLYCON_MEM_BASE, -#ifdef CONFIG_X86_LOCAL_APIC - FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ -#endif -#ifndef CONFIG_XEN -#ifdef CONFIG_X86_IO_APIC - FIX_IO_APIC_BASE_0, - FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1, -#endif -#else - FIX_SHARED_INFO, -#define NR_FIX_ISAMAPS 256 - FIX_ISAMAP_END, - FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1, -#endif -#ifdef CONFIG_X86_VISWS_APIC - FIX_CO_CPU, /* Cobalt timer */ - FIX_CO_APIC, /* Cobalt APIC Redirection Table */ - FIX_LI_PCIA, /* Lithium PCI Bridge A */ - FIX_LI_PCIB, /* Lithium PCI Bridge B */ -#endif -#ifdef CONFIG_X86_F00F_BUG - FIX_F00F_IDT, /* Virtual mapping for IDT */ -#endif -#ifdef CONFIG_X86_CYCLONE_TIMER - FIX_CYCLONE_TIMER, /*cyclone timer register*/ -#endif - FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ - FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, -#ifdef CONFIG_PCI_MMCONFIG - FIX_PCIE_MCFG, -#endif -#ifdef CONFIG_PARAVIRT - FIX_PARAVIRT_BOOTMAP, -#endif - __end_of_permanent_fixed_addresses, - /* - * 256 temporary boot-time mappings, used by early_ioremap(), - * before ioremap() is functional. - * - * We round it up to the next 256 pages boundary so that we - * can have a single pgd entry and a single pte table: - */ -#define NR_FIX_BTMAPS 64 -#define FIX_BTMAPS_SLOTS 4 - FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 - - (__end_of_permanent_fixed_addresses & 255), - FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1, - FIX_WP_TEST, -#ifdef CONFIG_ACPI - FIX_ACPI_BEGIN, - FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1, -#endif -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT - FIX_OHCI1394_BASE, -#endif - __end_of_fixed_addresses -}; - -extern void reserve_top_address(unsigned long reserve); - - -#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP) - -#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) -#define __FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) -#define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE) -#define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE) - -#endif /* !__ASSEMBLY__ */ -#endif /* _ASM_X86_FIXMAP_32_H */ --- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/fixmap_64.h 2010-03-24 15:17:58.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,90 +0,0 @@ -/* - * fixmap.h: compile-time virtual memory allocation - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 1998 Ingo Molnar - */ - -#ifndef _ASM_X86_FIXMAP_64_H -#define _ASM_X86_FIXMAP_64_H - -#include -#include -#include -#include -#include -#include - -/* - * Here we define all the compile-time 'special' virtual - * addresses. The point is to have a constant address at - * compile time, but to set the physical address only - * in the boot process. - * - * These 'compile-time allocated' memory buffers are - * fixed-size 4k pages (or larger if used with an increment - * higher than 1). Use set_fixmap(idx,phys) to associate - * physical memory with fixmap indices. - * - * TLB entries of such buffers will not be flushed across - * task switches. 
- */ - -enum fixed_addresses { - VSYSCALL_LAST_PAGE, - VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE - + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1, - VSYSCALL_HPET, - FIX_DBGP_BASE, - FIX_EARLYCON_MEM_BASE, -#ifdef CONFIG_X86_LOCAL_APIC - FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ -#endif -#ifndef CONFIG_XEN - FIX_IO_APIC_BASE_0, - FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, -#else -#define NR_FIX_ISAMAPS 256 - FIX_ISAMAP_END, - FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1, -#endif -#ifdef CONFIG_PARAVIRT - FIX_PARAVIRT_BOOTMAP, -#else - FIX_SHARED_INFO, -#endif - __end_of_permanent_fixed_addresses, -#ifdef CONFIG_ACPI - FIX_ACPI_BEGIN, - FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1, -#endif -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT - FIX_OHCI1394_BASE, -#endif - /* - * 256 temporary boot-time mappings, used by early_ioremap(), - * before ioremap() is functional. - * - * We round it up to the next 256 pages boundary so that we - * can have a single pgd entry and a single pte table: - */ -#define NR_FIX_BTMAPS 64 -#define FIX_BTMAPS_SLOTS 4 - FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 - - (__end_of_permanent_fixed_addresses & 255), - FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1, - __end_of_fixed_addresses -}; - -#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE) -#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) -#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) - -/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */ -#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL) -#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE) - -#endif /* _ASM_X86_FIXMAP_64_H */ --- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/highmem.h 2010-03-24 17:05:16.000000000 +0100 +++ head-2010-05-25/arch/x86/include/mach-xen/asm/highmem.h 2010-03-24 17:05:22.000000000 +0100 @@ -62,6 +62,7 @@ void *kmap_atomic_prot(struct page *page void *kmap_atomic(struct page *page, enum km_type type); void kunmap_atomic(void *kvaddr, enum km_type type); void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); +void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); struct page *kmap_atomic_to_page(void *ptr); #define kmap_atomic_pte(page, type) \ --- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/hypervisor.h 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/include/mach-xen/asm/hypervisor.h 2010-03-24 15:25:06.000000000 +0100 @@ -46,7 +46,7 @@ #include #include #include -#include +#include extern shared_info_t *HYPERVISOR_shared_info; @@ -153,20 +153,16 @@ int __must_check xen_multi_mmuext_op(str #define __HAVE_ARCH_ENTER_LAZY_MMU_MODE static inline void arch_enter_lazy_mmu_mode(void) { - __get_cpu_var(xen_lazy_mmu) = true; + percpu_write(xen_lazy_mmu, true); } static inline void arch_leave_lazy_mmu_mode(void) { - __get_cpu_var(xen_lazy_mmu) = false; + percpu_write(xen_lazy_mmu, false); xen_multicall_flush(false); } -#if defined(CONFIG_X86_32) -#define arch_use_lazy_mmu_mode() unlikely(x86_read_percpu(xen_lazy_mmu)) -#elif !defined(arch_use_lazy_mmu_mode) -#define arch_use_lazy_mmu_mode() unlikely(__get_cpu_var(xen_lazy_mmu)) -#endif +#define arch_use_lazy_mmu_mode() unlikely(percpu_read(xen_lazy_mmu)) #if 0 /* All uses are in places potentially called asynchronously, but * asynchronous code should rather not make use of lazy mode at all. 
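[Editorial note on the hypervisor.h hunk above: it switches the lazy-MMU flag from __get_cpu_var() to the percpu_read()/percpu_write() accessors. The point of that flag is to batch page-table hypercalls while lazy mode is active and flush them in one multicall on arch_leave_lazy_mmu_mode(). Below is a minimal standalone C model of that batching pattern; it is not kernel code, and the names mmu_op, multicall_flush and set_pte_op are illustrative only.]

/*
 * Standalone model of lazy-MMU batching: while the lazy flag is set,
 * PTE updates are queued and emitted as one batch on leave.
 */
#include <stdbool.h>
#include <stdio.h>

#define QUEUE_MAX 8

struct mmu_op { unsigned long va, pte; };

static bool lazy_mmu;                 /* models the per-CPU xen_lazy_mmu flag */
static struct mmu_op queue[QUEUE_MAX];
static unsigned int queued;

static void multicall_flush(void)     /* models xen_multicall_flush() */
{
	for (unsigned int i = 0; i < queued; i++)
		printf("hypercall: set pte at %#lx to %#lx\n",
		       queue[i].va, queue[i].pte);
	queued = 0;
}

static void set_pte_op(unsigned long va, unsigned long pte)
{
	if (lazy_mmu) {               /* batch instead of trapping per update */
		if (queued == QUEUE_MAX)
			multicall_flush();
		queue[queued++] = (struct mmu_op){ va, pte };
	} else {
		printf("hypercall: set pte at %#lx to %#lx\n", va, pte);
	}
}

static void enter_lazy_mmu(void) { lazy_mmu = true; }  /* percpu_write(..., true) */
static void leave_lazy_mmu(void) { lazy_mmu = false; multicall_flush(); }

int main(void)
{
	enter_lazy_mmu();
	for (unsigned long va = 0x1000; va <= 0x3000; va += 0x1000)
		set_pte_op(va, va | 0x7);  /* three updates, one flush */
	leave_lazy_mmu();
	return 0;
}

The design point the hunk serves: each unbatched PTE write under Xen costs a separate trap into the hypervisor, so the per-CPU flag must be as cheap as possible to test on every update path, which is exactly what the single-instruction percpu_read()/percpu_write() accessors provide.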
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/io.h 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/include/mach-xen/asm/io.h 2010-03-24 15:25:06.000000000 +0100 @@ -5,6 +5,10 @@ #include #include +#include +#ifdef __KERNEL__ +#include +#endif #define build_mmio_read(name, size, type, reg, barrier) \ static inline type name(const volatile void __iomem *addr) \ @@ -82,6 +86,101 @@ static inline void writeq(__u64 val, vol #define native_io_delay xen_io_delay +/** + * virt_to_phys - map virtual addresses to physical + * @address: address to remap + * + * The returned physical address is the physical (CPU) mapping for + * the memory address given. It is only valid to use this function on + * addresses directly mapped or allocated via kmalloc. + * + * This function does not give bus mappings for DMA transfers. In + * almost all conceivable cases a device driver should not be using + * this function + */ + +static inline phys_addr_t virt_to_phys(volatile void *address) +{ + return __pa(address); +} + +/** + * phys_to_virt - map physical address to virtual + * @address: address to remap + * + * The returned virtual address is a current CPU mapping for + * the memory address given. It is only valid to use this function on + * addresses that have a kernel mapping + * + * This function does not handle bus mappings for DMA transfers. In + * almost all conceivable cases a device driver should not be using + * this function + */ + +static inline void *phys_to_virt(phys_addr_t address) +{ + return __va(address); +} + +/* + * Change "struct page" to physical address. + */ +#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) +#undef page_to_phys +#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page))) +#define page_to_bus(page) (phys_to_machine(page_to_pseudophys(page))) + +/* + * ISA I/O bus memory addresses are 1:1 with the physical address. + * However, we truncate the address to unsigned int to avoid undesirable + * promitions in legacy drivers. + */ +#define isa_virt_to_bus(_x) ({ \ + unsigned long _va_ = (unsigned long)(_x); \ + _va_ - fix_to_virt(FIX_ISAMAP_BEGIN) < (NR_FIX_ISAMAPS << PAGE_SHIFT) \ + ? _va_ - fix_to_virt(FIX_ISAMAP_BEGIN) \ + : ({ BUG(); (unsigned long)virt_to_bus(_va_); }); }) +#define isa_bus_to_virt(_x) ((void *)fix_to_virt(FIX_ISAMAP_BEGIN) + (_x)) + +/* + * However PCI ones are not necessarily 1:1 and therefore these interfaces + * are forbidden in portable PCI drivers. + * + * Allow them on x86 for legacy drivers, though. + */ +#define virt_to_bus(_x) phys_to_machine(__pa(_x)) +#define bus_to_virt(_x) __va(machine_to_phys(_x)) + +/** + * ioremap - map bus memory into CPU space + * @offset: bus address of the memory + * @size: size of the resource to map + * + * ioremap performs a platform specific sequence of operations to + * make bus memory CPU accessible via the readb/readw/readl/writeb/ + * writew/writel functions and the other mmio helpers. The returned + * address is not guaranteed to be usable directly as a virtual + * address. + * + * If the area you are trying to map is a PCI BAR you should have a + * look at pci_iomap(). 
+ */ +extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); +extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); +extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, + unsigned long prot_val); + +/* + * The default ioremap() behavior is non-cached: + */ +static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) +{ + return ioremap_nocache(offset, size); +} + +extern void iounmap(volatile void __iomem *addr); + + #ifdef CONFIG_X86_32 # include "../../asm/io_32.h" #else @@ -93,11 +192,6 @@ static inline void writeq(__u64 val, vol /* We will be supplying our own /dev/mem implementation */ #define ARCH_HAS_DEV_MEM -#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) -#undef page_to_phys -#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page))) -#define page_to_bus(page) (phys_to_machine(page_to_pseudophys(page))) - #define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \ (unsigned long)(bv)->bv_offset) @@ -106,23 +200,7 @@ static inline void writeq(__u64 val, vol && bvec_to_pseudophys(vec1) + (vec1)->bv_len \ == bvec_to_pseudophys(vec2)) -#undef virt_to_bus -#undef bus_to_virt -#define virt_to_bus(_x) phys_to_machine(__pa(_x)) -#define bus_to_virt(_x) __va(machine_to_phys(_x)) - -#include - #undef __ISA_IO_base -#undef isa_virt_to_bus -#undef isa_page_to_bus -#undef isa_bus_to_virt -#define isa_virt_to_bus(_x) ({ \ - unsigned long _va_ = (unsigned long)(_x); \ - _va_ - fix_to_virt(FIX_ISAMAP_BEGIN) < (NR_FIX_ISAMAPS << PAGE_SHIFT) \ - ? _va_ - fix_to_virt(FIX_ISAMAP_BEGIN) \ - : ({ BUG(); (unsigned long)virt_to_bus(_va_); }); }) -#define isa_bus_to_virt(_x) ((void *)fix_to_virt(FIX_ISAMAP_BEGIN) + (_x)) #endif @@ -131,7 +209,7 @@ extern void unxlate_dev_mem_ptr(unsigned extern int ioremap_check_change_attr(unsigned long mfn, unsigned long size, unsigned long prot_val); -extern void __iomem *ioremap_wc(unsigned long offset, unsigned long size); +extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size); /* * early_ioremap() and early_iounmap() are for temporary early boot-time @@ -140,10 +218,12 @@ extern void __iomem *ioremap_wc(unsigned */ extern void early_ioremap_init(void); extern void early_ioremap_reset(void); -extern void __iomem *early_ioremap(unsigned long offset, unsigned long size); -extern void __iomem *early_memremap(unsigned long offset, unsigned long size); +extern void __iomem *early_ioremap(resource_size_t phys_addr, + unsigned long size); +extern void __iomem *early_memremap(resource_size_t phys_addr, + unsigned long size); extern void early_iounmap(void __iomem *addr, unsigned long size); -extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys); +#define IO_SPACE_LIMIT 0xffff #endif /* _ASM_X86_IO_H */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ head-2010-05-25/arch/x86/include/mach-xen/asm/ipi.h 2010-03-24 15:25:06.000000000 +0100 @@ -0,0 +1,13 @@ +#ifndef _ASM_X86_IPI_H +#define _ASM_X86_IPI_H + +#include +#include + +void xen_send_IPI_mask(const struct cpumask *, int vector); +void xen_send_IPI_mask_allbutself(const struct cpumask *, int vector); +void xen_send_IPI_allbutself(int vector); +void xen_send_IPI_all(int vector); +void xen_send_IPI_self(int vector); + +#endif /* _ASM_X86_IPI_H */ --- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/irqflags.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-05-25/arch/x86/include/mach-xen/asm/irqflags.h 2010-03-24 
15:25:06.000000000 +0100 @@ -94,7 +94,7 @@ static inline void halt(void) #ifdef CONFIG_X86_64 # define __REG_si %rsi -# define __CPU_num %gs:pda_cpunumber +# define __CPU_num PER_CPU_VAR(cpu_number) #else # define __REG_si %esi # define __CPU_num TI_cpu(%ebp) @@ -130,6 +130,7 @@ sysexit_ecrit: /**** END OF SYSEXIT CRIT mov $__KERNEL_PERCPU, %ecx ; \ push %esp ; \ mov %ecx, %fs ; \ + SET_KERNEL_GS %ecx ; \ call evtchn_do_upcall ; \ add $4,%esp ; \ jmp ret_from_intr --- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/irq_vectors.h 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/include/mach-xen/asm/irq_vectors.h 2010-03-24 15:25:06.000000000 +0100 @@ -2,29 +2,46 @@ #define _ASM_X86_IRQ_VECTORS_H #ifdef CONFIG_X86_32 -# define SYSCALL_VECTOR 0x80 +# define SYSCALL_VECTOR 0x80 #else -# define IA32_SYSCALL_VECTOR 0x80 +# define IA32_SYSCALL_VECTOR 0x80 #endif -#define RESCHEDULE_VECTOR 0 -#define CALL_FUNCTION_VECTOR 1 -#define CALL_FUNC_SINGLE_VECTOR 2 -#define SPIN_UNLOCK_VECTOR 3 -#define NR_IPIS 4 +#define RESCHEDULE_VECTOR 0 +#define CALL_FUNCTION_VECTOR 1 +#define CALL_FUNC_SINGLE_VECTOR 2 +#define SPIN_UNLOCK_VECTOR 3 +#define NR_IPIS 4 /* * The maximum number of vectors supported by i386 processors * is limited to 256. For processors other than i386, NR_VECTORS * should be changed accordingly. */ -#define NR_VECTORS 256 +#define NR_VECTORS 256 -#define FIRST_VM86_IRQ 3 -#define LAST_VM86_IRQ 15 -#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15) +#define FIRST_VM86_IRQ 3 +#define LAST_VM86_IRQ 15 -#define NR_IRQS_LEGACY 16 +#ifndef __ASSEMBLY__ +static inline int invalid_vm86_irq(int irq) +{ + return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ; +} +#endif + +/* + * Size the maximum number of interrupts. + * + * If the irq_desc[] array has a sparse layout, we can size things + * generously - it scales up linearly with the maximum number of CPUs, + * and the maximum number of IO-APICs, whichever is higher. + * + * In other cases we size more conservatively, to not create too large + * static arrays. + */ + +#define NR_IRQS_LEGACY 16 /* * The flat IRQ space is divided into two regions: @@ -35,21 +52,41 @@ * 3. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These * are bound using the provided bind/unbind functions. */ +#define PIRQ_BASE 0 -#define PIRQ_BASE 0 -#if defined(NR_CPUS) && defined(MAX_IO_APICS) -# if !defined(CONFIG_SPARSE_IRQ) && NR_CPUS < MAX_IO_APICS -# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS) -# elif defined(CONFIG_SPARSE_IRQ) && 8 * NR_CPUS > 32 * MAX_IO_APICS -# define NR_PIRQS (NR_VECTORS + 8 * NR_CPUS) +#define CPU_VECTOR_LIMIT ( 8 * NR_CPUS ) +#define IO_APIC_VECTOR_LIMIT ( 32 * MAX_IO_APICS ) + +#ifdef CONFIG_X86_IO_APIC +# if !defined(NR_CPUS) || !defined(MAX_IO_APICS) +/* nothing */ +# elif defined(CONFIG_SPARSE_IRQ) +# define NR_PIRQS \ + (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? 
\ + (NR_VECTORS + CPU_VECTOR_LIMIT) : \ + (NR_VECTORS + IO_APIC_VECTOR_LIMIT)) +# elif NR_CPUS < MAX_IO_APICS +# define NR_PIRQS (NR_VECTORS + 4*CPU_VECTOR_LIMIT) # else -# define NR_PIRQS (NR_VECTORS + 32 * MAX_IO_APICS) +# define NR_PIRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT) # endif +#elif defined(CONFIG_XEN_PCIDEV_FRONTEND) +# define NR_PIRQS (NR_VECTORS + CPU_VECTOR_LIMIT) +#else /* !CONFIG_X86_IO_APIC: */ +# define NR_PIRQS NR_IRQS_LEGACY +#endif + +#ifndef __ASSEMBLY__ +#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SPARSE_IRQ) +extern int nr_pirqs; +#else +# define nr_pirqs NR_PIRQS +#endif #endif -#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS) -#define NR_DYNIRQS 256 +#define DYNIRQ_BASE (PIRQ_BASE + nr_pirqs) +#define NR_DYNIRQS 256 -#define NR_IRQS (NR_PIRQS + NR_DYNIRQS) +#define NR_IRQS (NR_PIRQS + NR_DYNIRQS) #endif /* _ASM_X86_IRQ_VECTORS_H */ --- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/mmu_context.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-05-25/arch/x86/include/mach-xen/asm/mmu_context.h 2010-03-24 15:25:06.000000000 +0100 @@ -26,11 +26,117 @@ static inline void xen_activate_mm(struc int init_new_context(struct task_struct *tsk, struct mm_struct *mm); void destroy_context(struct mm_struct *mm); + +static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) +{ +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */ + if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) + percpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); +#endif +} + +#define prepare_arch_switch(next) __prepare_arch_switch() + +static inline void __prepare_arch_switch(void) +{ #ifdef CONFIG_X86_32 -# include "mmu_context_32.h" + /* + * Save away %gs. No need to save %fs, as it was saved on the + * stack on entry. No need to save %es and %ds, as those are + * always kernel segments while inside the kernel. + */ + lazy_save_gs(current->thread.gs); + lazy_load_gs(__KERNEL_STACK_CANARY); #else -# include "mmu_context_64.h" + /* + * Save away %es, %ds, %fs and %gs. Must happen before reload + * of cr3/ldt (i.e., not in __switch_to). 
+ */ + __asm__ __volatile__ ( + "mov %%es,%0 ; mov %%ds,%1 ; mov %%fs,%2 ; mov %%gs,%3" + : "=m" (current->thread.es), + "=m" (current->thread.ds), + "=m" (current->thread.fsindex), + "=m" (current->thread.gsindex) ); + + if (current->thread.ds) + __asm__ __volatile__ ( "movl %0,%%ds" : : "r" (0) ); + + if (current->thread.es) + __asm__ __volatile__ ( "movl %0,%%es" : : "r" (0) ); + + if (current->thread.fsindex) { + __asm__ __volatile__ ( "movl %0,%%fs" : : "r" (0) ); + current->thread.fs = 0; + } + + if (current->thread.gsindex) { + load_gs_index(0); + current->thread.gs = 0; + } +#endif +} + +static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + unsigned cpu = smp_processor_id(); + struct mmuext_op _op[2 + (sizeof(long) > 4)], *op = _op; + + if (likely(prev != next)) { + BUG_ON(!xen_feature(XENFEAT_writable_page_tables) && + !PagePinned(virt_to_page(next->pgd))); + + /* stop flush ipis for the previous mm */ + cpu_clear(cpu, prev->cpu_vm_mask); +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */ + percpu_write(cpu_tlbstate.state, TLBSTATE_OK); + percpu_write(cpu_tlbstate.active_mm, next); #endif + cpu_set(cpu, next->cpu_vm_mask); + + /* Re-load page tables: load_cr3(next->pgd) */ + op->cmd = MMUEXT_NEW_BASEPTR; + op->arg1.mfn = virt_to_mfn(next->pgd); + op++; + + /* xen_new_user_pt(__pa(__user_pgd(next->pgd))) */ +#ifdef CONFIG_X86_64 + op->cmd = MMUEXT_NEW_USER_BASEPTR; + op->arg1.mfn = virt_to_mfn(__user_pgd(next->pgd)); + op++; +#endif + + /* + * load the LDT, if the LDT is different: + */ + if (unlikely(prev->context.ldt != next->context.ldt)) { + /* load_LDT_nolock(&next->context) */ + op->cmd = MMUEXT_SET_LDT; + op->arg1.linear_addr = (unsigned long)next->context.ldt; + op->arg2.nr_ents = next->context.size; + op++; + } + + BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF)); + } +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */ + else { + percpu_write(cpu_tlbstate.state, TLBSTATE_OK); + BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next); + + if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { + /* We were in lazy tlb mode and leave_mm disabled + * tlb flush IPI delivery. We must reload CR3 + * to make sure to use no freed page tables. + */ + load_cr3(next->pgd); + xen_new_user_pt(__pa(__user_pgd(next->pgd))); + load_LDT_nolock(&next->context); + } + } +#endif +} #define activate_mm(prev, next) \ do { \ @@ -38,5 +144,17 @@ do { \ switch_mm((prev), (next), NULL); \ } while (0); +#ifdef CONFIG_X86_32 +#define deactivate_mm(tsk, mm) \ +do { \ + lazy_load_gs(0); \ +} while (0) +#else +#define deactivate_mm(tsk, mm) \ +do { \ + load_gs_index(0); \ + loadsegment(fs, 0); \ +} while (0) +#endif #endif /* _ASM_X86_MMU_CONTEXT_H */ --- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/mmu_context_32.h 2010-03-24 15:17:58.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,83 +0,0 @@ -#ifndef _ASM_X86_MMU_CONTEXT_32_H -#define _ASM_X86_MMU_CONTEXT_32_H - -static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) -{ -#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */ - if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) - x86_write_percpu(cpu_tlbstate.state, TLBSTATE_LAZY); -#endif -} - -#define prepare_arch_switch(next) __prepare_arch_switch() - -static inline void __prepare_arch_switch(void) -{ - /* - * Save away %gs. No need to save %fs, as it was saved on the - * stack on entry. 
No need to save %es and %ds, as those are - * always kernel segments while inside the kernel. - */ - asm volatile ( "mov %%gs,%0" - : "=m" (current->thread.gs)); - asm volatile ( "movl %0,%%gs" - : : "r" (0) ); -} - -static inline void switch_mm(struct mm_struct *prev, - struct mm_struct *next, - struct task_struct *tsk) -{ - int cpu = smp_processor_id(); - struct mmuext_op _op[2], *op = _op; - - if (likely(prev != next)) { - BUG_ON(!xen_feature(XENFEAT_writable_page_tables) && - !PagePinned(virt_to_page(next->pgd))); - - /* stop flush ipis for the previous mm */ - cpu_clear(cpu, prev->cpu_vm_mask); -#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */ - x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK); - x86_write_percpu(cpu_tlbstate.active_mm, next); -#endif - cpu_set(cpu, next->cpu_vm_mask); - - /* Re-load page tables: load_cr3(next->pgd) */ - op->cmd = MMUEXT_NEW_BASEPTR; - op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT); - op++; - - /* - * load the LDT, if the LDT is different: - */ - if (unlikely(prev->context.ldt != next->context.ldt)) { - /* load_LDT_nolock(&next->context, cpu) */ - op->cmd = MMUEXT_SET_LDT; - op->arg1.linear_addr = (unsigned long)next->context.ldt; - op->arg2.nr_ents = next->context.size; - op++; - } - - BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF)); - } -#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */ - else { - x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK); - BUG_ON(x86_read_percpu(cpu_tlbstate.active_mm) != next); - - if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { - /* We were in lazy tlb mode and leave_mm disabled - * tlb flush IPI delivery. We must reload %cr3. - */ - load_cr3(next->pgd); - load_LDT_nolock(&next->context); - } - } -#endif -} - -#define deactivate_mm(tsk, mm) \ - asm("movl %0,%%gs": :"r" (0)); - -#endif /* _ASM_X86_MMU_CONTEXT_32_H */ --- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/mmu_context_64.h 2010-03-24 15:14:47.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,106 +0,0 @@ -#ifndef _ASM_X86_MMU_CONTEXT_64_H -#define _ASM_X86_MMU_CONTEXT_64_H - -static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) -{ -#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) - if (read_pda(mmu_state) == TLBSTATE_OK) - write_pda(mmu_state, TLBSTATE_LAZY); -#endif -} - -#define prepare_arch_switch(next) __prepare_arch_switch() - -static inline void __prepare_arch_switch(void) -{ - /* - * Save away %es, %ds, %fs and %gs. Must happen before reload - * of cr3/ldt (i.e., not in __switch_to). 
- */ - __asm__ __volatile__ ( - "mov %%es,%0 ; mov %%ds,%1 ; mov %%fs,%2 ; mov %%gs,%3" - : "=m" (current->thread.es), - "=m" (current->thread.ds), - "=m" (current->thread.fsindex), - "=m" (current->thread.gsindex) ); - - if (current->thread.ds) - __asm__ __volatile__ ( "movl %0,%%ds" : : "r" (0) ); - - if (current->thread.es) - __asm__ __volatile__ ( "movl %0,%%es" : : "r" (0) ); - - if (current->thread.fsindex) { - __asm__ __volatile__ ( "movl %0,%%fs" : : "r" (0) ); - current->thread.fs = 0; - } - - if (current->thread.gsindex) { - load_gs_index(0); - current->thread.gs = 0; - } -} - -static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, - struct task_struct *tsk) -{ - unsigned cpu = smp_processor_id(); - struct mmuext_op _op[3], *op = _op; - - if (likely(prev != next)) { - BUG_ON(!xen_feature(XENFEAT_writable_page_tables) && - !PagePinned(virt_to_page(next->pgd))); - - /* stop flush ipis for the previous mm */ - cpu_clear(cpu, prev->cpu_vm_mask); -#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) - write_pda(mmu_state, TLBSTATE_OK); - write_pda(active_mm, next); -#endif - cpu_set(cpu, next->cpu_vm_mask); - - /* load_cr3(next->pgd) */ - op->cmd = MMUEXT_NEW_BASEPTR; - op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT); - op++; - - /* xen_new_user_pt(__pa(__user_pgd(next->pgd))) */ - op->cmd = MMUEXT_NEW_USER_BASEPTR; - op->arg1.mfn = pfn_to_mfn(__pa(__user_pgd(next->pgd)) >> PAGE_SHIFT); - op++; - - if (unlikely(next->context.ldt != prev->context.ldt)) { - /* load_LDT_nolock(&next->context) */ - op->cmd = MMUEXT_SET_LDT; - op->arg1.linear_addr = (unsigned long)next->context.ldt; - op->arg2.nr_ents = next->context.size; - op++; - } - - BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF)); - } -#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) - else { - write_pda(mmu_state, TLBSTATE_OK); - if (read_pda(active_mm) != next) - BUG(); - if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { - /* We were in lazy tlb mode and leave_mm disabled - * tlb flush IPI delivery. We must reload CR3 - * to make sure to use no freed page tables. 
- */ - load_cr3(next->pgd); - xen_new_user_pt(__pa(__user_pgd(next->pgd))); - load_LDT_nolock(&next->context); - } - } -#endif -} - -#define deactivate_mm(tsk, mm) \ -do { \ - load_gs_index(0); \ - asm volatile("movl %0,%%fs"::"r"(0)); \ -} while (0) - -#endif /* _ASM_X86_MMU_CONTEXT_64_H */ --- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/pci.h 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/include/mach-xen/asm/pci.h 2010-03-24 15:25:06.000000000 +0100 @@ -41,7 +41,6 @@ static inline int pci_proc_domain(struct return pci_domain_nr(bus); } -extern void pci_iommu_alloc(void); /* Can be used to override the logic in pci_scan_bus for skipping already-configured bus numbers - to be used for buggy BIOSes @@ -92,12 +91,44 @@ static inline void early_quirks(void) { extern void pci_iommu_alloc(void); -#endif /* __KERNEL__ */ +/* MSI arch hooks */ +#define arch_setup_msi_irqs arch_setup_msi_irqs +#define arch_teardown_msi_irqs arch_teardown_msi_irqs + +#define PCI_DMA_BUS_IS_PHYS 0 + +#if defined(CONFIG_X86_64) || defined(CONFIG_DMA_API_DEBUG) || defined(CONFIG_SWIOTLB) + +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \ + dma_addr_t ADDR_NAME; +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \ + __u32 LEN_NAME; +#define pci_unmap_addr(PTR, ADDR_NAME) \ + ((PTR)->ADDR_NAME) +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \ + (((PTR)->ADDR_NAME) = (VAL)) +#define pci_unmap_len(PTR, LEN_NAME) \ + ((PTR)->LEN_NAME) +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \ + (((PTR)->LEN_NAME) = (VAL)) -#ifdef CONFIG_X86_32 -# include "pci_32.h" #else -# include "../../asm/pci_64.h" + +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME[0]; +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) unsigned LEN_NAME[0]; +#define pci_unmap_addr(PTR, ADDR_NAME) sizeof((PTR)->ADDR_NAME) +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \ + do { break; } while (pci_unmap_addr(PTR, ADDR_NAME)) +#define pci_unmap_len(PTR, LEN_NAME) sizeof((PTR)->LEN_NAME) +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \ + do { break; } while (pci_unmap_len(PTR, LEN_NAME)) + +#endif + +#endif /* __KERNEL__ */ + +#ifdef CONFIG_X86_64 +#include "../../asm/pci_64.h" #endif /* implement the pci_ DMA API in terms of the generic device dma_ one */ @@ -115,11 +146,6 @@ static inline int __pcibus_to_node(const return sd->node; } -static inline cpumask_t __pcibus_to_cpumask(struct pci_bus *bus) -{ - return node_to_cpumask(__pcibus_to_node(bus)); -} - static inline const struct cpumask * cpumask_of_pcibus(const struct pci_bus *bus) { --- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/pgtable.h 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/include/mach-xen/asm/pgtable.h 2010-03-24 15:25:06.000000000 +0100 @@ -1,178 +1,9 @@ #ifndef _ASM_X86_PGTABLE_H #define _ASM_X86_PGTABLE_H -#define FIRST_USER_ADDRESS 0 +#include -#define _PAGE_BIT_PRESENT 0 /* is present */ -#define _PAGE_BIT_RW 1 /* writeable */ -#define _PAGE_BIT_USER 2 /* userspace addressable */ -#define _PAGE_BIT_PWT 3 /* page write through */ -#define _PAGE_BIT_PCD 4 /* page cache disabled */ -#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */ -#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */ -#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ -#define _PAGE_BIT_PAT 7 /* on 4KB pages */ -#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ -#define _PAGE_BIT_UNUSED1 9 /* available for programmer */ -#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */ -#define _PAGE_BIT_UNUSED3 11 -#define _PAGE_BIT_PAT_LARGE 12 /* On 
2MB or 1GB pages */ -#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 -#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 -#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ - -/* If _PAGE_BIT_PRESENT is clear, we use these: */ -/* - if the user mapped it with PROT_NONE; pte_present gives true */ -#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL -/* - set: nonlinear file mapping, saved PTE; unset:swap */ -#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY - -#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT) -#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW) -#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER) -#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT) -#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD) -#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) -#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) -#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) -#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) -#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) -#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) -#define _PAGE_UNUSED3 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED3) -#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) -#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) -#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) -#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) -#define __HAVE_ARCH_PTE_SPECIAL - -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) -#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) -#else -#define _PAGE_NX (_AT(pteval_t, 0)) -#endif - -#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) -#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) - -#ifndef __ASSEMBLY__ -#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002 -extern unsigned int __kernel_page_user; -#else -#define __kernel_page_user 0 -#endif -#endif - -#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ - _PAGE_ACCESSED | _PAGE_DIRTY) -#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ - _PAGE_DIRTY | __kernel_page_user) - -/* Set of bits not changed in pte_modify */ -#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_CACHE_MASK | _PAGE_IOMAP | \ - _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY) - -/* - * PAT settings are part of the hypervisor interface, which sets the - * MSR to 0x050100070406 (i.e. WB, WT, UC-, UC, WC, WP [, UC, UC]). 
- */ -#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT | _PAGE_PAT) -#define _PAGE_CACHE_WB (0) -#define _PAGE_CACHE_WT (_PAGE_PWT) -#define _PAGE_CACHE_WC (_PAGE_PAT) -#define _PAGE_CACHE_WP (_PAGE_PAT | _PAGE_PWT) -#define _PAGE_CACHE_UC_MINUS (_PAGE_PCD) -#define _PAGE_CACHE_UC (_PAGE_PCD | _PAGE_PWT) - -#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) -#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ - _PAGE_ACCESSED | _PAGE_NX) - -#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \ - _PAGE_USER | _PAGE_ACCESSED) -#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ - _PAGE_ACCESSED | _PAGE_NX) -#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ - _PAGE_ACCESSED) -#define PAGE_COPY PAGE_COPY_NOEXEC -#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \ - _PAGE_ACCESSED | _PAGE_NX) -#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ - _PAGE_ACCESSED) - -#define __PAGE_KERNEL_EXEC \ - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user) -#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX) - -#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW) -#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW) -#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT) -#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC) -#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT) -#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD) -#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) -#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT) -#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) -#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE) -#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) - -#define __PAGE_KERNEL_IO (__PAGE_KERNEL | _PAGE_IOMAP) -#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE | _PAGE_IOMAP) -#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS | _PAGE_IOMAP) -#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC | _PAGE_IOMAP) - -#define PAGE_KERNEL __pgprot(__PAGE_KERNEL) -#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) -#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) -#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) -#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC) -#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) -#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS) -#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE) -#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) -#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE) -#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) -#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) -#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE) - -#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) -#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) -#define PAGE_KERNEL_IO_UC_MINUS __pgprot(__PAGE_KERNEL_IO_UC_MINUS) -#define PAGE_KERNEL_IO_WC __pgprot(__PAGE_KERNEL_IO_WC) - -/* xwr */ -#define __P000 PAGE_NONE -#define __P001 PAGE_READONLY -#define __P010 PAGE_COPY -#define __P011 PAGE_COPY -#define __P100 PAGE_READONLY_EXEC -#define __P101 PAGE_READONLY_EXEC -#define __P110 PAGE_COPY_EXEC -#define __P111 PAGE_COPY_EXEC - -#define __S000 PAGE_NONE -#define __S001 PAGE_READONLY -#define __S010 PAGE_SHARED -#define __S011 PAGE_SHARED -#define __S100 
PAGE_READONLY_EXEC -#define __S101 PAGE_READONLY_EXEC -#define __S110 PAGE_SHARED_EXEC -#define __S111 PAGE_SHARED_EXEC - -/* - * early identity mapping pte attrib macros. - */ -#ifdef CONFIG_X86_64 -#define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC -#else -/* - * For PDE_IDENT_ATTR include USER bit. As the PDE and PTE protection - * bits are combined, this will alow user to access the high address mapped - * VDSO in the presence of CONFIG_COMPAT_VDSO - */ -#define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */ -#define PDE_IDENT_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */ -#define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */ -#endif +#include /* * Macro to mark a page protection value as UC- @@ -184,9 +15,6 @@ extern unsigned int __kernel_page_user; #ifndef __ASSEMBLY__ -#define pgprot_writecombine pgprot_writecombine -extern pgprot_t pgprot_writecombine(pgprot_t prot); - /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. @@ -197,6 +25,59 @@ extern unsigned long empty_zero_page[PAG extern spinlock_t pgd_lock; extern struct list_head pgd_list; +#define set_pte(ptep, pte) xen_set_pte(ptep, pte) +#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte) + +#define set_pte_atomic(ptep, pte) \ + xen_set_pte_atomic(ptep, pte) + +#define set_pmd(pmdp, pmd) xen_set_pmd(pmdp, pmd) + +#ifndef __PAGETABLE_PUD_FOLDED +#define set_pgd(pgdp, pgd) xen_set_pgd(pgdp, pgd) +#define pgd_clear(pgd) xen_pgd_clear(pgd) +#endif + +#ifndef set_pud +# define set_pud(pudp, pud) xen_set_pud(pudp, pud) +#endif + +#ifndef __PAGETABLE_PMD_FOLDED +#define pud_clear(pud) xen_pud_clear(pud) +#endif + +#define pte_clear(mm, addr, ptep) xen_pte_clear(mm, addr, ptep) +#define pmd_clear(pmd) xen_pmd_clear(pmd) + +#define pte_update(mm, addr, ptep) do { } while (0) +#define pte_update_defer(mm, addr, ptep) do { } while (0) + +static inline void __init paravirt_pagetable_setup_start(pgd_t *base) +{ + xen_pagetable_setup_start(base); +} + +static inline void __init paravirt_pagetable_setup_done(pgd_t *base) +{ + xen_pagetable_setup_done(base); +} + +#define pgd_val(x) xen_pgd_val(x) +#define __pgd(x) xen_make_pgd(x) + +#ifndef __PAGETABLE_PUD_FOLDED +#define pud_val(x) xen_pud_val(x) +#define __pud(x) xen_make_pud(x) +#endif + +#ifndef __PAGETABLE_PMD_FOLDED +#define pmd_val(x) xen_pmd_val(x) +#define __pmd(x) xen_make_pmd(x) +#endif + +#define pte_val(x) xen_pte_val(x) +#define __pte(x) xen_make_pte(x) + /* * The following only work if pte_present() is true. * Undefined behaviour if not.. 
@@ -252,53 +133,67 @@ static inline int pte_special(pte_t pte) static inline int pmd_large(pmd_t pte) { - return (__pmd_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) == + return (pmd_flags(pte) & (_PAGE_PSE | _PAGE_PRESENT)) == (_PAGE_PSE | _PAGE_PRESENT); } +static inline pte_t pte_set_flags(pte_t pte, pteval_t set) +{ + pteval_t v = __pte_val(pte); + + return __pte_ma(v | set); +} + +static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear) +{ + pteval_t v = __pte_val(pte); + + return __pte_ma(v & ~clear); +} + static inline pte_t pte_mkclean(pte_t pte) { - return __pte_ma(__pte_val(pte) & ~_PAGE_DIRTY); + return pte_clear_flags(pte, _PAGE_DIRTY); } static inline pte_t pte_mkold(pte_t pte) { - return __pte_ma(__pte_val(pte) & ~_PAGE_ACCESSED); + return pte_clear_flags(pte, _PAGE_ACCESSED); } static inline pte_t pte_wrprotect(pte_t pte) { - return __pte_ma(__pte_val(pte) & ~_PAGE_RW); + return pte_clear_flags(pte, _PAGE_RW); } static inline pte_t pte_mkexec(pte_t pte) { - return __pte_ma(__pte_val(pte) & ~_PAGE_NX); + return pte_clear_flags(pte, _PAGE_NX); } static inline pte_t pte_mkdirty(pte_t pte) { - return __pte_ma(__pte_val(pte) | _PAGE_DIRTY); + return pte_set_flags(pte, _PAGE_DIRTY); } static inline pte_t pte_mkyoung(pte_t pte) { - return __pte_ma(__pte_val(pte) | _PAGE_ACCESSED); + return pte_set_flags(pte, _PAGE_ACCESSED); } static inline pte_t pte_mkwrite(pte_t pte) { - return __pte_ma(__pte_val(pte) | _PAGE_RW); + return pte_set_flags(pte, _PAGE_RW); } static inline pte_t pte_mkhuge(pte_t pte) { - return __pte_ma(__pte_val(pte) | _PAGE_PSE); + return pte_set_flags(pte, _PAGE_PSE); } static inline pte_t pte_clrhuge(pte_t pte) { - return __pte_ma(__pte_val(pte) & ~_PAGE_PSE); + return pte_clear_flags(pte, _PAGE_PSE); } static inline pte_t pte_mkglobal(pte_t pte) @@ -313,11 +208,9 @@ static inline pte_t pte_clrglobal(pte_t static inline pte_t pte_mkspecial(pte_t pte) { - return __pte_ma(__pte_val(pte) | _PAGE_SPECIAL); + return pte_set_flags(pte, _PAGE_SPECIAL); } -extern pteval_t __supported_pte_mask; - /* * Mask out unsupported bits in a present pgprot. Non-present pgprots * can use those bits for other purposes, so leave them be. @@ -391,68 +284,208 @@ static inline int is_new_memtype_allowed return 1; } -#ifndef __ASSEMBLY__ -#ifndef CONFIG_XEN -/* Indicate that x86 has its own track and untrack pfn vma functions */ -#define __HAVE_PFNMAP_TRACKING -#endif +pmd_t *populate_extra_pmd(unsigned long vaddr); +pte_t *populate_extra_pte(unsigned long vaddr); +#endif /* __ASSEMBLY__ */ -#define __HAVE_PHYS_MEM_ACCESS_PROT -struct file; -pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, - unsigned long size, pgprot_t vma_prot); -int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, - unsigned long size, pgprot_t *vma_prot); +#ifdef CONFIG_X86_32 +# include "pgtable_32.h" +#else +# include "pgtable_64.h" #endif -/* Install a pte for a particular vaddr in kernel space. 
*/ -void set_pte_vaddr(unsigned long vaddr, pte_t pte); +#ifndef __ASSEMBLY__ +#include -#ifndef CONFIG_XEN -extern void native_pagetable_setup_start(pgd_t *base); -extern void native_pagetable_setup_done(pgd_t *base); +static inline int pte_none(pte_t pte) +{ + return !pte.pte; +} + +#define __HAVE_ARCH_PTE_SAME +static inline int pte_same(pte_t a, pte_t b) +{ + return a.pte == b.pte; +} + +static inline int pte_present(pte_t a) +{ + return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); +} + +static inline int pmd_present(pmd_t pmd) +{ +#if CONFIG_XEN_COMPAT <= 0x030002 +/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t. + can temporarily clear it. */ + return __pmd_val(pmd) != 0; #else -static inline void xen_pagetable_setup_start(pgd_t *base) {} -static inline void xen_pagetable_setup_done(pgd_t *base) {} + return pmd_flags(pmd) & _PAGE_PRESENT; #endif +} -struct seq_file; -extern void arch_report_meminfo(struct seq_file *m); +static inline int pmd_none(pmd_t pmd) +{ + /* Only check low word on 32-bit platforms, since it might be + out of sync with upper half. */ + return (unsigned long)__pmd_val(pmd) == 0; +} -#define set_pte(ptep, pte) xen_set_pte(ptep, pte) -#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte) +static inline unsigned long pmd_page_vaddr(pmd_t pmd) +{ + return (unsigned long)__va(pmd_val(pmd) & PTE_PFN_MASK); +} -#define set_pte_atomic(ptep, pte) \ - xen_set_pte_atomic(ptep, pte) +/* + * Currently stuck as a macro due to indirect forward reference to + * linux/mmzone.h's __section_mem_map_addr() definition: + */ +#define pmd_page(pmd) pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT) -#define set_pmd(pmdp, pmd) xen_set_pmd(pmdp, pmd) +/* + * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] + * + * this macro returns the index of the entry in the pmd page which would + * control the given virtual address + */ +static inline unsigned pmd_index(unsigned long address) +{ + return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); +} -#ifndef __PAGETABLE_PUD_FOLDED -#define set_pgd(pgdp, pgd) xen_set_pgd(pgdp, pgd) -#define pgd_clear(pgd) xen_pgd_clear(pgd) -#endif +/* + * Conversion functions: convert a page and protection to a page entry, + * and a page entry and page directory to the page they refer to. 
+ * + * (Currently stuck as a macro because of indirect forward reference + * to linux/mm.h:page_to_nid()) + */ +#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) -#ifndef set_pud -# define set_pud(pudp, pud) xen_set_pud(pudp, pud) -#endif +/* + * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE] + * + * this function returns the index of the entry in the pte page which would + * control the given virtual address + */ +static inline unsigned pte_index(unsigned long address) +{ + return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); +} -#ifndef __PAGETABLE_PMD_FOLDED -#define pud_clear(pud) xen_pud_clear(pud) +static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) +{ + return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address); +} + +static inline int pmd_bad(pmd_t pmd) +{ +#if CONFIG_XEN_COMPAT <= 0x030002 + return (pmd_flags(pmd) & ~_PAGE_USER & ~_PAGE_PRESENT) + != (_KERNPG_TABLE & ~_PAGE_PRESENT); +#else + return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; #endif +} -#define pte_clear(mm, addr, ptep) xen_pte_clear(mm, addr, ptep) -#define pmd_clear(pmd) xen_pmd_clear(pmd) +static inline unsigned long pages_to_mb(unsigned long npg) +{ + return npg >> (20 - PAGE_SHIFT); +} -#define pte_update(mm, addr, ptep) do { } while (0) -#define pte_update_defer(mm, addr, ptep) do { } while (0) +#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ + direct_remap_pfn_range(vma, vaddr, pfn, size, prot, DOMID_IO) -#endif /* __ASSEMBLY__ */ +#if PAGETABLE_LEVELS > 2 +static inline int pud_none(pud_t pud) +{ + return __pud_val(pud) == 0; +} -#ifdef CONFIG_X86_32 -# include "pgtable_32.h" +static inline int pud_present(pud_t pud) +{ + return pud_flags(pud) & _PAGE_PRESENT; +} + +static inline unsigned long pud_page_vaddr(pud_t pud) +{ + return (unsigned long)__va((unsigned long)pud_val(pud) & PTE_PFN_MASK); +} + +/* + * Currently stuck as a macro due to indirect forward reference to + * linux/mmzone.h's __section_mem_map_addr() definition: + */ +#define pud_page(pud) pfn_to_page(pud_val(pud) >> PAGE_SHIFT) + +/* Find an entry in the second-level page table.. */ +static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) +{ + return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address); +} + +static inline unsigned long pmd_pfn(pmd_t pmd) +{ + return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT; +} + +static inline int pud_large(pud_t pud) +{ + return (__pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) == + (_PAGE_PSE | _PAGE_PRESENT); +} + +static inline int pud_bad(pud_t pud) +{ + return (pud_flags(pud) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0; +} #else -# include "pgtable_64.h" -#endif +static inline int pud_large(pud_t pud) +{ + return 0; +} +#endif /* PAGETABLE_LEVELS > 2 */ + +#if PAGETABLE_LEVELS > 3 +static inline int pgd_present(pgd_t pgd) +{ + return pgd_flags(pgd) & _PAGE_PRESENT; +} + +static inline unsigned long pgd_page_vaddr(pgd_t pgd) +{ + return (unsigned long)__va((unsigned long)pgd_val(pgd) & PTE_PFN_MASK); +} + +/* + * Currently stuck as a macro due to indirect forward reference to + * linux/mmzone.h's __section_mem_map_addr() definition: + */ +#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) + +/* to find an entry in a page-table-directory. 
*/ +static inline unsigned pud_index(unsigned long address) +{ + return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); +} + +static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address) +{ + return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(address); +} + +static inline int pgd_bad(pgd_t pgd) +{ + return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; +} + +static inline int pgd_none(pgd_t pgd) +{ + return !__pgd_val(pgd); +} +#endif /* PAGETABLE_LEVELS > 3 */ + +#endif /* __ASSEMBLY__ */ /* * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD] @@ -479,28 +512,6 @@ extern void arch_report_meminfo(struct s #ifndef __ASSEMBLY__ -enum { - PG_LEVEL_NONE, - PG_LEVEL_4K, - PG_LEVEL_2M, - PG_LEVEL_1G, - PG_LEVEL_NUM -}; - -#ifdef CONFIG_PROC_FS -extern void update_page_count(int level, unsigned long pages); -#else -static inline void update_page_count(int level, unsigned long pages) { } -#endif - -/* - * Helper function that returns the kernel pagetable entry controlling - * the virtual address 'address'. NULL means no pagetable entry present. - * NOTE: the return type is pte_t but if the pmd is PSE then we return it - * as a pte too. - */ -extern pte_t *lookup_address(unsigned long address, unsigned int *level); - /* local pte updates need not use xchg for locking */ static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res) { @@ -633,15 +644,18 @@ static inline void clone_pgd_range(pgd_t memcpy(dst, src, count * sizeof(pgd_t)); } -#define arbitrary_virt_to_machine(va) \ +#define arbitrary_virt_to_mfn(va) \ ({ \ unsigned int __lvl; \ pte_t *__ptep = lookup_address((unsigned long)(va), &__lvl); \ BUG_ON(!__ptep || __lvl != PG_LEVEL_4K || !pte_present(*__ptep));\ - (((maddr_t)pte_mfn(*__ptep) << PAGE_SHIFT) \ - | ((unsigned long)(va) & (PAGE_SIZE - 1))); \ + pte_mfn(*__ptep); \ }) +#define arbitrary_virt_to_machine(va) \ + (((maddr_t)arbitrary_virt_to_mfn(va) << PAGE_SHIFT) \ + | ((unsigned long)(va) & (PAGE_SIZE - 1))) + #ifdef CONFIG_HIGHPTE #include struct page *kmap_atomic_to_page(void *); --- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/pgtable-3level.h 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/include/mach-xen/asm/pgtable-3level.h 2010-03-24 15:25:06.000000000 +0100 @@ -20,21 +20,6 @@ __FILE__, __LINE__, &(e), __pgd_val(e), \ (pgd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT) -static inline int pud_none(pud_t pud) -{ - return __pud_val(pud) == 0; - -} -static inline int pud_bad(pud_t pud) -{ - return (__pud_val(pud) & ~(PTE_PFN_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0; -} - -static inline int pud_present(pud_t pud) -{ - return __pud_val(pud) & _PAGE_PRESENT; -} - /* Rules for using set_pte: the pte being assigned *must* be * either not present or in a state where the hardware will * not attempt to update the pte. In places where this is @@ -102,15 +87,6 @@ static inline void pud_clear(pud_t *pudp xen_tlb_flush(); } -#define pud_page(pud) pfn_to_page(pud_val(pud) >> PAGE_SHIFT) - -#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_PFN_MASK)) - - -/* Find an entry in the second-level page table.. 
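The index and offset helpers consolidated above are plain shift-and-mask arithmetic over the constants that pgtable_64_types.h defines later in this patch (PGDIR_SHIFT 39, PUD_SHIFT 30, PMD_SHIFT 21, 512 entries per level). A self-contained sketch of how one virtual address decomposes, with the index values fixed by construction:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT  12
#define PMD_SHIFT   21
#define PUD_SHIFT   30
#define PGDIR_SHIFT 39
#define PTRS_MASK   511   /* 512 entries per level */

int main(void)
{
    /* Build an address whose indices are 3/5/7/9 and offset 0x123. */
    uint64_t addr = (3ULL << PGDIR_SHIFT) | (5ULL << PUD_SHIFT) |
                    (7ULL << PMD_SHIFT) | (9ULL << PAGE_SHIFT) | 0x123;

    printf("pgd_index=%llu pud_index=%llu pmd_index=%llu pte_index=%llu off=%#llx\n",
           (unsigned long long)((addr >> PGDIR_SHIFT) & PTRS_MASK),
           (unsigned long long)((addr >> PUD_SHIFT) & PTRS_MASK),
           (unsigned long long)((addr >> PMD_SHIFT) & PTRS_MASK),
           (unsigned long long)((addr >> PAGE_SHIFT) & PTRS_MASK),
           (unsigned long long)(addr & 0xfff));
    return 0;
}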
*/ -#define pmd_offset(pud, address) ((pmd_t *)pud_page_vaddr(*(pud)) + \ - pmd_index(address)) - #ifdef CONFIG_SMP static inline pte_t xen_ptep_get_and_clear(pte_t *ptep, pte_t res) { @@ -127,17 +103,6 @@ static inline pte_t xen_ptep_get_and_cle #define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte) #endif -#define __HAVE_ARCH_PTE_SAME -static inline int pte_same(pte_t a, pte_t b) -{ - return a.pte_low == b.pte_low && a.pte_high == b.pte_high; -} - -static inline int pte_none(pte_t pte) -{ - return !(pte.pte_low | pte.pte_high); -} - #define __pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) | \ ((_pte).pte_high << (32-PAGE_SHIFT))) --- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/pgtable-3level-defs.h 2010-03-24 15:14:47.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,24 +0,0 @@ -#ifndef _ASM_X86_PGTABLE_3LEVEL_DEFS_H -#define _ASM_X86_PGTABLE_3LEVEL_DEFS_H - -#define SHARED_KERNEL_PMD 0 - -/* - * PGDIR_SHIFT determines what a top-level page table entry can map - */ -#define PGDIR_SHIFT 30 -#define PTRS_PER_PGD 4 - -/* - * PMD_SHIFT determines the size of the area a middle-level - * page table can map - */ -#define PMD_SHIFT 21 -#define PTRS_PER_PMD 512 - -/* - * entries per page directory level - */ -#define PTRS_PER_PTE 512 - -#endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ head-2010-05-25/arch/x86/include/mach-xen/asm/pgtable-3level_types.h 2010-03-24 15:25:06.000000000 +0100 @@ -0,0 +1,44 @@ +#ifndef _ASM_X86_PGTABLE_3LEVEL_DEFS_H +#define _ASM_X86_PGTABLE_3LEVEL_DEFS_H + +#ifndef __ASSEMBLY__ +#include + +typedef u64 pteval_t; +typedef u64 pmdval_t; +typedef u64 pudval_t; +typedef u64 pgdval_t; +typedef u64 pgprotval_t; + +typedef union { + struct { + unsigned long pte_low, pte_high; + }; + pteval_t pte; +} pte_t; +#endif /* !__ASSEMBLY__ */ + +#define SHARED_KERNEL_PMD 0 + +#define PAGETABLE_LEVELS 3 + +/* + * PGDIR_SHIFT determines what a top-level page table entry can map + */ +#define PGDIR_SHIFT 30 +#define PTRS_PER_PGD 4 + +/* + * PMD_SHIFT determines the size of the area a middle-level + * page table can map + */ +#define PMD_SHIFT 21 +#define PTRS_PER_PMD 512 + +/* + * entries per page directory level + */ +#define PTRS_PER_PTE 512 + + +#endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */ --- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/pgtable_32.h 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/include/mach-xen/asm/pgtable_32.h 2010-03-24 15:25:06.000000000 +0100 @@ -1,6 +1,8 @@ #ifndef _ASM_X86_PGTABLE_32_H #define _ASM_X86_PGTABLE_32_H +#include + /* * The Linux memory management assumes a three-level page table setup. On * the i386, we use that, but "fold" the mid level into the top-level page @@ -31,47 +33,6 @@ void paging_init(void); extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t); -/* - * The Linux x86 paging architecture is 'compile-time dual-mode', it - * implements both the traditional 2-level x86 page tables and the - * newer 3-level PAE-mode page tables. - */ -#ifdef CONFIG_X86_PAE -# include -# define PMD_SIZE (1UL << PMD_SHIFT) -# define PMD_MASK (~(PMD_SIZE - 1)) -#else -# include -#endif - -#define PGDIR_SIZE (1UL << PGDIR_SHIFT) -#define PGDIR_MASK (~(PGDIR_SIZE - 1)) - -/* Just any arbitrary offset to the start of the vmalloc VM area: the - * current 8MB value just means that there will be a 8MB "hole" after the - * physical memory until the kernel virtual memory starts. 
That means that - * any out-of-bounds memory accesses will hopefully be caught. - * The vmalloc() routines leaves a hole of 4kB between each vmalloced - * area for the same reason. ;) - */ -#define VMALLOC_OFFSET (8 * 1024 * 1024) -#define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET) -#ifdef CONFIG_X86_PAE -#define LAST_PKMAP 512 -#else -#define LAST_PKMAP 1024 -#endif - -#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE * (LAST_PKMAP + 1)) \ - & PMD_MASK) - -#ifdef CONFIG_HIGHMEM -# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE) -#else -# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE) -#endif - -#define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE) /* * Define this if things work differently on an i386 and an i486: @@ -80,66 +41,12 @@ extern void set_pmd_pfn(unsigned long, u */ #undef TEST_ACCESS_OK -/* The boot page tables (all created as a single array) */ -extern unsigned long pg0[]; - -#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE)) - -/* To avoid harmful races, pmd_none(x) should check only the lower when PAE */ -#define pmd_none(x) (!(unsigned long)__pmd_val(x)) -#if CONFIG_XEN_COMPAT <= 0x030002 -/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t. - can temporarily clear it. */ -#define pmd_present(x) (__pmd_val(x)) -#define pmd_bad(x) ((__pmd_val(x) & (PTE_FLAGS_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT)) -#else -#define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT) -#define pmd_bad(x) ((__pmd_val(x) & (PTE_FLAGS_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) -#endif - - -#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) - #ifdef CONFIG_X86_PAE # include #else # include #endif -/* - * Conversion functions: convert a page and protection to a page entry, - * and a page entry and page directory to the page they refer to. 
- */ -#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) - - -static inline int pud_large(pud_t pud) { return 0; } - -/* - * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] - * - * this macro returns the index of the entry in the pmd page which would - * control the given virtual address - */ -#define pmd_index(address) \ - (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)) - -/* - * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE] - * - * this macro returns the index of the entry in the pte page which would - * control the given virtual address - */ -#define pte_index(address) \ - (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) -#define pte_offset_kernel(dir, address) \ - ((pte_t *)pmd_page_vaddr(*(dir)) + pte_index((address))) - -#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT)) - -#define pmd_page_vaddr(pmd) \ - ((unsigned long)__va(pmd_val((pmd)) & PTE_PFN_MASK)) - #if defined(CONFIG_HIGHPTE) #define pte_offset_map(dir, address) \ ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE0) + \ @@ -185,7 +92,4 @@ void make_lowmem_page_writable(void *va, #define kern_addr_valid(kaddr) (0) #endif -#define io_remap_pfn_range(vma, from, pfn, size, prot) \ - direct_remap_pfn_range(vma, from, pfn, size, prot, DOMID_IO) - #endif /* _ASM_X86_PGTABLE_32_H */ --- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/pgtable_64.h 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/include/mach-xen/asm/pgtable_64.h 2010-03-24 15:25:06.000000000 +0100 @@ -2,6 +2,8 @@ #define _ASM_X86_PGTABLE_64_H #include +#include + #ifndef __ASSEMBLY__ /* @@ -12,12 +14,12 @@ #include #include #include -#include #ifdef CONFIG_XEN extern pud_t level3_user_pgt[512]; extern void xen_init_pt(void); +extern void xen_switch_pt(void); #endif extern pud_t level3_kernel_pgt[512]; @@ -33,39 +35,13 @@ extern void paging_init(void); #endif /* !__ASSEMBLY__ */ -#define SHARED_KERNEL_PMD 0 - -/* - * PGDIR_SHIFT determines what a top-level page table entry can map - */ -#define PGDIR_SHIFT 39 -#define PTRS_PER_PGD 512 - -/* - * 3rd level page - */ -#define PUD_SHIFT 30 -#define PTRS_PER_PUD 512 - -/* - * PMD_SHIFT determines the size of the area a middle-level - * page table can map - */ -#define PMD_SHIFT 21 -#define PTRS_PER_PMD 512 - -/* - * entries per page directory level - */ -#define PTRS_PER_PTE 512 - #ifndef __ASSEMBLY__ #define pte_ERROR(e) \ printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", \ __FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e)) #define pmd_ERROR(e) \ - printk("%s:%d: bad pmd %p(%016lx pfn %010Lx).\n", \ + printk("%s:%d: bad pmd %p(%016lx pfn %010lx).\n", \ __FILE__, __LINE__, &(e), __pmd_val(e), pmd_pfn(e)) #define pud_ERROR(e) \ printk("%s:%d: bad pud %p(%016lx pfn %010Lx).\n", \ @@ -76,9 +52,6 @@ extern void paging_init(void); __FILE__, __LINE__, &(e), __pgd_val(e), \ (pgd_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT) -#define pgd_none(x) (!__pgd_val(x)) -#define pud_none(x) (!__pud_val(x)) - struct mm_struct; void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte); @@ -138,48 +111,6 @@ static inline void xen_pgd_clear(pgd_t * xen_set_pgd(__user_pgd(pgd), xen_make_pgd(0)); } -#define pte_same(a, b) ((a).pte == (b).pte) - -#endif /* !__ASSEMBLY__ */ - -#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT) -#define PMD_MASK (~(PMD_SIZE - 1)) -#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT) -#define PUD_MASK (~(PUD_SIZE - 1)) -#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) -#define PGDIR_MASK (~(PGDIR_SIZE - 1)) - -#define 
MAX_PHYSMEM_BITS 43 -#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) -#define VMALLOC_START _AC(0xffffc20000000000, UL) -#define VMALLOC_END _AC(0xffffe1ffffffffff, UL) -#define VMEMMAP_START _AC(0xffffe20000000000, UL) -#define MODULES_VADDR _AC(0xffffffffa0000000, UL) -#define MODULES_END _AC(0xffffffffff000000, UL) -#define MODULES_LEN (MODULES_END - MODULES_VADDR) - -#ifndef __ASSEMBLY__ - -static inline int pgd_bad(pgd_t pgd) -{ - return (__pgd_val(pgd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE; -} - -static inline int pud_bad(pud_t pud) -{ - return (__pud_val(pud) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE; -} - -static inline int pmd_bad(pmd_t pmd) -{ - return (__pmd_val(pmd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE; -} - -#define pte_none(x) (!(x).pte) -#define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE)) - -#define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */ - #define __pte_mfn(_pte) (((_pte).pte & PTE_PFN_MASK) >> PAGE_SHIFT) /* @@ -190,47 +121,12 @@ static inline int pmd_bad(pmd_t pmd) /* * Level 4 access. */ -#define pgd_page_vaddr(pgd) \ - ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_PFN_MASK)) -#define pgd_page(pgd) (pfn_to_page(pgd_val((pgd)) >> PAGE_SHIFT)) -#define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT) static inline int pgd_large(pgd_t pgd) { return 0; } #define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE) /* PUD - Level3 access */ -/* to find an entry in a page-table-directory. */ -#define pud_page_vaddr(pud) \ - ((unsigned long)__va(pud_val((pud)) & PHYSICAL_PAGE_MASK)) -#define pud_page(pud) (pfn_to_page(pud_val((pud)) >> PAGE_SHIFT)) -#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)) -#define pud_offset(pgd, address) \ - ((pud_t *)pgd_page_vaddr(*(pgd)) + pud_index((address))) -#define pud_present(pud) (__pud_val(pud) & _PAGE_PRESENT) - -static inline int pud_large(pud_t pte) -{ - return (__pud_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) == - (_PAGE_PSE | _PAGE_PRESENT); -} /* PMD - Level 2 access */ -#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_PFN_MASK)) -#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT)) - -#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)) -#define pmd_offset(dir, address) ((pmd_t *)pud_page_vaddr(*(dir)) + \ - pmd_index(address)) -#define pmd_none(x) (!__pmd_val(x)) -#if CONFIG_XEN_COMPAT <= 0x030002 -/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t. - can temporarily clear it. */ -#define pmd_present(x) (__pmd_val(x)) -#else -#define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT) -#endif -#define pfn_pmd(nr, prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val((prot)))) -#define pmd_pfn(x) ((pmd_val((x)) & __PHYSICAL_MASK) >> PAGE_SHIFT) - #define pte_to_pgoff(pte) ((__pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT) #define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | \ _PAGE_FILE }) @@ -238,13 +134,6 @@ static inline int pud_large(pud_t pte) /* PTE - Level 1 access. */ -/* page, protection -> pte */ -#define mk_pte(page, pgprot) pfn_pte(page_to_pfn((page)), (pgprot)) - -#define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) -#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \ - pte_index((address))) - /* x86-64 always has all page tables mapped. 
*/ #define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) #define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address)) @@ -278,9 +167,6 @@ static inline int pud_large(pud_t pte) extern int kern_addr_valid(unsigned long addr); extern void cleanup_highmap(void); -#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ - direct_remap_pfn_range(vma, vaddr, pfn, size, prot, DOMID_IO) - #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ head-2010-05-25/arch/x86/include/mach-xen/asm/pgtable_64_types.h 2010-03-24 15:25:06.000000000 +0100 @@ -0,0 +1,63 @@ +#ifndef _ASM_X86_PGTABLE_64_DEFS_H +#define _ASM_X86_PGTABLE_64_DEFS_H + +#ifndef __ASSEMBLY__ +#include + +/* + * These are used to make use of C type-checking.. + */ +typedef unsigned long pteval_t; +typedef unsigned long pmdval_t; +typedef unsigned long pudval_t; +typedef unsigned long pgdval_t; +typedef unsigned long pgprotval_t; + +typedef union { pteval_t pte; unsigned int pte_low; } pte_t; + +#endif /* !__ASSEMBLY__ */ + +#define SHARED_KERNEL_PMD 0 +#define PAGETABLE_LEVELS 4 + +/* + * PGDIR_SHIFT determines what a top-level page table entry can map + */ +#define PGDIR_SHIFT 39 +#define PTRS_PER_PGD 512 + +/* + * 3rd level page + */ +#define PUD_SHIFT 30 +#define PTRS_PER_PUD 512 + +/* + * PMD_SHIFT determines the size of the area a middle-level + * page table can map + */ +#define PMD_SHIFT 21 +#define PTRS_PER_PMD 512 + +/* + * entries per page directory level + */ +#define PTRS_PER_PTE 512 + +#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT) +#define PMD_MASK (~(PMD_SIZE - 1)) +#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT) +#define PUD_MASK (~(PUD_SIZE - 1)) +#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) +#define PGDIR_MASK (~(PGDIR_SIZE - 1)) + +#define MAX_PHYSMEM_BITS 43 +#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) +#define VMALLOC_START _AC(0xffffc20000000000, UL) +#define VMALLOC_END _AC(0xffffe1ffffffffff, UL) +#define VMEMMAP_START _AC(0xffffe20000000000, UL) +#define MODULES_VADDR _AC(0xffffffffa0000000, UL) +#define MODULES_END _AC(0xffffffffff000000, UL) +#define MODULES_LEN (MODULES_END - MODULES_VADDR) + +#endif /* _ASM_X86_PGTABLE_64_DEFS_H */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ head-2010-05-25/arch/x86/include/mach-xen/asm/pgtable_types.h 2010-03-24 15:25:06.000000000 +0100 @@ -0,0 +1,388 @@ +#ifndef _ASM_X86_PGTABLE_DEFS_H +#define _ASM_X86_PGTABLE_DEFS_H + +#include +#include + +#define FIRST_USER_ADDRESS 0 + +#define _PAGE_BIT_PRESENT 0 /* is present */ +#define _PAGE_BIT_RW 1 /* writeable */ +#define _PAGE_BIT_USER 2 /* userspace addressable */ +#define _PAGE_BIT_PWT 3 /* page write through */ +#define _PAGE_BIT_PCD 4 /* page cache disabled */ +#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */ +#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */ +#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ +#define _PAGE_BIT_PAT 7 /* on 4KB pages */ +#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ +#define _PAGE_BIT_UNUSED1 9 /* available for programmer */ +#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */ +#define _PAGE_BIT_UNUSED3 11 +#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ +#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 +#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 +#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ + +/* If _PAGE_BIT_PRESENT is clear, we use these: */ +/* - if the user mapped it with PROT_NONE; 
pte_present gives true */ +#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL +/* - set: nonlinear file mapping, saved PTE; unset:swap */ +#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY + +#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT) +#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW) +#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER) +#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT) +#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD) +#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) +#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) +#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) +#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) +#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) +#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) +#define _PAGE_UNUSED3 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED3) +#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) +#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) +#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) +#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) +#define __HAVE_ARCH_PTE_SPECIAL + +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) +#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) +#else +#define _PAGE_NX (_AT(pteval_t, 0)) +#endif + +#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) +#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) + +#ifndef __ASSEMBLY__ +#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002 +extern unsigned int __kernel_page_user; +#else +#define __kernel_page_user 0 +#endif +#endif + +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_DIRTY) +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ + _PAGE_DIRTY | __kernel_page_user) + +/* Set of bits not changed in pte_modify */ +#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_CACHE_MASK | _PAGE_IOMAP | \ + _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY) + +/* + * PAT settings are part of the hypervisor interface, which sets the + * MSR to 0x050100070406 (i.e. WB, WT, UC-, UC, WC, WP [, UC, UC]). 
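In concrete terms, the PAT comment above means _PAGE_PWT, _PAGE_PCD and _PAGE_PAT form a 3-bit index into the PAT MSR, and the hypervisor's value 0x050100070406 maps indices 0 through 5 to WB, WT, UC-, UC, WC and WP. A small sketch of the decoding (bit positions taken from this header, everything else illustrative):

#include <stdio.h>

#define _PAGE_PWT (1u << 3)
#define _PAGE_PCD (1u << 4)
#define _PAGE_PAT (1u << 7)

/* Memory type for each PAT index under the Xen PAT MSR layout. */
static const char *pat_type[8] = {
    "WB", "WT", "UC-", "UC", "WC", "WP", "UC", "UC"
};

/* PAT selects bit 2 of the index, PCD bit 1, PWT bit 0. */
static unsigned int pat_index(unsigned int flags)
{
    return ((flags & _PAGE_PAT) ? 4 : 0) |
           ((flags & _PAGE_PCD) ? 2 : 0) |
           ((flags & _PAGE_PWT) ? 1 : 0);
}

int main(void)
{
    /* _PAGE_CACHE_WC (= _PAGE_PAT) -> index 4 -> WC. */
    printf("WC encoding -> index %u -> %s\n",
           pat_index(_PAGE_PAT), pat_type[pat_index(_PAGE_PAT)]);
    /* _PAGE_CACHE_UC_MINUS (= _PAGE_PCD) -> index 2 -> UC-. */
    printf("UC- encoding -> index %u -> %s\n",
           pat_index(_PAGE_PCD), pat_type[pat_index(_PAGE_PCD)]);
    return 0;
}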
+ */ +#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT | _PAGE_PAT) +#define _PAGE_CACHE_WB (0) +#define _PAGE_CACHE_WT (_PAGE_PWT) +#define _PAGE_CACHE_WC (_PAGE_PAT) +#define _PAGE_CACHE_WP (_PAGE_PAT | _PAGE_PWT) +#define _PAGE_CACHE_UC_MINUS (_PAGE_PCD) +#define _PAGE_CACHE_UC (_PAGE_PCD | _PAGE_PWT) + +#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) +#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_NX) + +#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \ + _PAGE_USER | _PAGE_ACCESSED) +#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_NX) +#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ + _PAGE_ACCESSED) +#define PAGE_COPY PAGE_COPY_NOEXEC +#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_NX) +#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ + _PAGE_ACCESSED) + +#define __PAGE_KERNEL_EXEC \ + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user) +#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX) + +#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW) +#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW) +#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT) +#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC) +#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT) +#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD) +#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) +#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT) +#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) +#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE) +#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) + +#define __PAGE_KERNEL_IO (__PAGE_KERNEL | _PAGE_IOMAP) +#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE | _PAGE_IOMAP) +#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS | _PAGE_IOMAP) +#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC | _PAGE_IOMAP) + +#define PAGE_KERNEL __pgprot(__PAGE_KERNEL) +#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) +#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) +#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) +#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC) +#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) +#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS) +#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE) +#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) +#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE) +#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) +#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) +#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE) + +#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) +#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) +#define PAGE_KERNEL_IO_UC_MINUS __pgprot(__PAGE_KERNEL_IO_UC_MINUS) +#define PAGE_KERNEL_IO_WC __pgprot(__PAGE_KERNEL_IO_WC) + +/* xwr */ +#define __P000 PAGE_NONE +#define __P001 PAGE_READONLY +#define __P010 PAGE_COPY +#define __P011 PAGE_COPY +#define __P100 PAGE_READONLY_EXEC +#define __P101 PAGE_READONLY_EXEC +#define __P110 PAGE_COPY_EXEC +#define __P111 PAGE_COPY_EXEC + +#define __S000 PAGE_NONE +#define __S001 PAGE_READONLY +#define __S010 PAGE_SHARED +#define __S011 PAGE_SHARED +#define __S100 
PAGE_READONLY_EXEC +#define __S101 PAGE_READONLY_EXEC +#define __S110 PAGE_SHARED_EXEC +#define __S111 PAGE_SHARED_EXEC + +/* + * early identity mapping pte attrib macros. + */ +#ifdef CONFIG_X86_64 +#define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC +#else +/* + * For PDE_IDENT_ATTR include USER bit. As the PDE and PTE protection + * bits are combined, this will allow the user to access the high address mapped + * VDSO in the presence of CONFIG_COMPAT_VDSO + */ +#define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */ +#define PDE_IDENT_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */ +#define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */ +#endif + +#ifdef CONFIG_X86_32 +# include +#else +# include "pgtable_64_types.h" +#endif + +#ifndef __ASSEMBLY__ + +#include + +/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */ +#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK) + +/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */ +#define PTE_FLAGS_MASK (~PTE_PFN_MASK) + +typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; + +#include + +typedef struct { pgdval_t pgd; } pgd_t; + +#define __pgd_ma(x) ((pgd_t) { (x) } ) +static inline pgd_t xen_make_pgd(pgdval_t val) +{ + if (val & _PAGE_PRESENT) + val = pte_phys_to_machine(val); + return (pgd_t) { val }; +} + +#define __pgd_val(x) ((x).pgd) +static inline pgdval_t xen_pgd_val(pgd_t pgd) +{ + pgdval_t ret = __pgd_val(pgd); +#if PAGETABLE_LEVELS == 2 && CONFIG_XEN_COMPAT <= 0x030002 + if (ret) + ret = machine_to_phys(ret) | _PAGE_PRESENT; +#else + if (ret & _PAGE_PRESENT) + ret = pte_machine_to_phys(ret); +#endif + return ret; +} + +static inline pgdval_t pgd_flags(pgd_t pgd) +{ + return __pgd_val(pgd) & PTE_FLAGS_MASK; +} + +#if PAGETABLE_LEVELS > 3 +typedef struct { pudval_t pud; } pud_t; + +#define __pud_ma(x) ((pud_t) { (x) } ) +static inline pud_t xen_make_pud(pudval_t val) +{ + if (val & _PAGE_PRESENT) + val = pte_phys_to_machine(val); + return (pud_t) { val }; +} + +#define __pud_val(x) ((x).pud) +static inline pudval_t xen_pud_val(pud_t pud) +{ + pudval_t ret = __pud_val(pud); + if (ret & _PAGE_PRESENT) + ret = pte_machine_to_phys(ret); + return ret; +} +#else +#include + +#define __pud_val(x) __pgd_val((x).pgd) +static inline pudval_t xen_pud_val(pud_t pud) +{ + return xen_pgd_val(pud.pgd); +} +#endif + +#if PAGETABLE_LEVELS > 2 +typedef struct { pmdval_t pmd; } pmd_t; + +#define __pmd_ma(x) ((pmd_t) { (x) } ) +static inline pmd_t xen_make_pmd(pmdval_t val) +{ + if (val & _PAGE_PRESENT) + val = pte_phys_to_machine(val); + return (pmd_t) { val }; +} + +#define __pmd_val(x) ((x).pmd) +static inline pmdval_t xen_pmd_val(pmd_t pmd) +{ + pmdval_t ret = __pmd_val(pmd); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (ret) + ret = pte_machine_to_phys(ret) | _PAGE_PRESENT; +#else + if (ret & _PAGE_PRESENT) + ret = pte_machine_to_phys(ret); +#endif + return ret; +} +#else +#include + +#define __pmd_ma(x) ((pmd_t) { .pud.pgd = __pgd_ma(x) } ) +#define __pmd_val(x) __pgd_val((x).pud.pgd) +static inline pmdval_t xen_pmd_val(pmd_t pmd) +{ + return xen_pgd_val(pmd.pud.pgd); +} +#endif + +static inline pudval_t pud_flags(pud_t pud) +{ + return __pud_val(pud) & PTE_FLAGS_MASK; +} + +static inline pmdval_t pmd_flags(pmd_t pmd) +{ + return __pmd_val(pmd) & PTE_FLAGS_MASK; +} + +#define __pte_ma(x) ((pte_t) { .pte = (x) } ) +static inline pte_t xen_make_pte(pteval_t val) +{ + if ((val & (_PAGE_PRESENT|_PAGE_IOMAP)) == _PAGE_PRESENT) + val = pte_phys_to_machine(val); + return (pte_t) { .pte = val }; +} + +#define __pte_val(x) 
((x).pte) +static inline pteval_t xen_pte_val(pte_t pte) +{ + pteval_t ret = __pte_val(pte); + if ((pte.pte_low & (_PAGE_PRESENT|_PAGE_IOMAP)) == _PAGE_PRESENT) + ret = pte_machine_to_phys(ret); + return ret; +} + +static inline pteval_t pte_flags(pte_t pte) +{ + return __pte_val(pte) & PTE_FLAGS_MASK; +} + +#define pgprot_val(x) ((x).pgprot) +#define __pgprot(x) ((pgprot_t) { (x) } ) + + +typedef struct page *pgtable_t; + +extern pteval_t __supported_pte_mask; +extern int nx_enabled; +extern void set_nx(void); + +#define pgprot_writecombine pgprot_writecombine +extern pgprot_t pgprot_writecombine(pgprot_t prot); + +#ifndef CONFIG_XEN +/* Indicate that x86 has its own track and untrack pfn vma functions */ +#define __HAVE_PFNMAP_TRACKING +#endif + +#define __HAVE_PHYS_MEM_ACCESS_PROT +struct file; +pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t vma_prot); +int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t *vma_prot); + +/* Install a pte for a particular vaddr in kernel space. */ +void set_pte_vaddr(unsigned long vaddr, pte_t pte); + +#ifndef CONFIG_XEN +extern void native_pagetable_setup_start(pgd_t *base); +extern void native_pagetable_setup_done(pgd_t *base); +#else +static inline void xen_pagetable_setup_start(pgd_t *base) {} +static inline void xen_pagetable_setup_done(pgd_t *base) {} +#endif + +struct seq_file; +extern void arch_report_meminfo(struct seq_file *m); + +enum { + PG_LEVEL_NONE, + PG_LEVEL_4K, + PG_LEVEL_2M, + PG_LEVEL_1G, + PG_LEVEL_NUM +}; + +#ifdef CONFIG_PROC_FS +extern void update_page_count(int level, unsigned long pages); +#else +static inline void update_page_count(int level, unsigned long pages) { } +#endif + +/* + * Helper function that returns the kernel pagetable entry controlling + * the virtual address 'address'. NULL means no pagetable entry present. + * NOTE: the return type is pte_t but if the pmd is PSE then we return it + * as a pte too. 
+ */ +extern pte_t *lookup_address(unsigned long address, unsigned int *level); + +#endif /* !__ASSEMBLY__ */ + +#endif /* _ASM_X86_PGTABLE_DEFS_H */ --- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/processor.h 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/include/mach-xen/asm/processor.h 2010-03-24 15:25:06.000000000 +0100 @@ -16,6 +16,7 @@ struct mm_struct; #include #include #include +#include #include #include #include @@ -74,10 +75,10 @@ struct cpuinfo_x86 { char pad0; #else /* Number of 4K pages in DTLB/ITLB combined(in pages): */ - int x86_tlbsize; + int x86_tlbsize; +#endif __u8 x86_virt_bits; __u8 x86_phys_bits; -#endif /* CPUID returned core id bits: */ __u8 x86_coreid_bits; /* Max extended CPUID function supported: */ @@ -92,9 +93,9 @@ struct cpuinfo_x86 { int x86_cache_alignment; /* In bytes */ int x86_power; unsigned long loops_per_jiffy; -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* cpus sharing the last level cache: */ - cpumask_t llc_shared_map; + cpumask_var_t llc_shared_map; #endif /* cpuid returned max cores value: */ u16 x86_max_cores; @@ -138,7 +139,7 @@ extern struct cpuinfo_x86 new_cpu_data; extern __u32 cleared_cpu_caps[NCAPINTS]; #ifdef CONFIG_SMP -DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info); +DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); #define cpu_data(cpu) per_cpu(cpu_info, cpu) #define current_cpu_data __get_cpu_var(cpu_info) #else @@ -251,7 +252,6 @@ struct x86_hw_tss { #define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long)) #define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap) #define INVALID_IO_BITMAP_OFFSET 0x8000 -#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000 #ifndef CONFIG_X86_NO_TSS struct tss_struct { @@ -267,11 +267,6 @@ struct tss_struct { * be within the limit. */ unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; - /* - * Cache the current maximum and the last task that used the bitmap: - */ - unsigned long io_bitmap_max; - struct thread_struct *io_bitmap_owner; /* * .. and then another 0x100 bytes for the emergency kernel stack: @@ -280,7 +275,7 @@ struct tss_struct { } ____cacheline_aligned; -DECLARE_PER_CPU(struct tss_struct, init_tss); +DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss); /* * Save the original ist values for checking stack pointers during debugging @@ -363,6 +358,11 @@ struct i387_soft_struct { u32 entry_eip; }; +struct ymmh_struct { + /* 16 * 16 bytes for each YMMH-reg = 256 bytes */ + u32 ymmh_space[64]; +}; + struct xsave_hdr_struct { u64 xstate_bv; u64 reserved1[2]; @@ -372,6 +372,7 @@ struct xsave_hdr_struct { struct xsave_struct { struct i387_fxsave_struct i387; struct xsave_hdr_struct xsave_hdr; + struct ymmh_struct ymmh; /* new processor state extensions will go here */ } __attribute__ ((packed, aligned (64))); @@ -382,11 +383,37 @@ union thread_xstate { struct xsave_struct xsave; }; -#if defined(CONFIG_X86_64) && !defined(CONFIG_X86_NO_TSS) +#ifdef CONFIG_X86_64 +#ifndef CONFIG_X86_NO_TSS DECLARE_PER_CPU(struct orig_ist, orig_ist); #endif -extern void print_cpu_info(struct cpuinfo_x86 *); +union irq_stack_union { + char irq_stack[IRQ_STACK_SIZE]; + /* + * GCC hardcodes the stack canary as %gs:40. Since the + * irq_stack is the object at %gs:0, we reserve the bottom + * 48 bytes of the irq stack for the canary. 
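The irq_stack_union comment above is the key constraint: GCC's stack protector reads the canary from a fixed %gs:40, so the canary member must land at byte offset 40 of the object mapped at %gs:0. A compile-time model of that layout (userspace sketch; the IRQ_STACK_SIZE value is an assumption here):

#include <stddef.h>
#include <stdio.h>

#define IRQ_STACK_SIZE 16384   /* assumed for the sketch */

union irq_stack_union {
    char irq_stack[IRQ_STACK_SIZE];
    /* Overlay of the stack bottom: 40 pad bytes, then the canary. */
    struct {
        char gs_base[40];
        unsigned long stack_canary;
    };
};

_Static_assert(offsetof(union irq_stack_union, stack_canary) == 40,
               "canary must sit at offset 40, i.e. %gs:40");

int main(void)
{
    printf("stack_canary offset: %zu\n",
           offsetof(union irq_stack_union, stack_canary));
    return 0;
}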
+ */ + struct { + char gs_base[40]; + unsigned long stack_canary; + }; +}; + +DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union); +DECLARE_INIT_PER_CPU(irq_stack_union); + +DECLARE_PER_CPU(char *, irq_stack_ptr); +DECLARE_PER_CPU(unsigned int, irq_count); +extern unsigned long kernel_eflags; +extern asmlinkage void ignore_sysret(void); +#else /* X86_64 */ +#ifdef CONFIG_CC_STACKPROTECTOR +DECLARE_PER_CPU(unsigned long, stack_canary); +#endif +#endif /* X86_64 */ + extern unsigned int xstate_size; extern void free_thread_xstate(struct task_struct *); extern struct kmem_cache *task_xstate_cachep; @@ -659,6 +686,7 @@ static inline void __sti_mwait(unsigned extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx); extern void select_idle_routine(const struct cpuinfo_x86 *c); +extern void init_c1e_mask(void); extern unsigned long boot_option_idle_override; extern unsigned long idle_halt; @@ -696,9 +724,9 @@ extern int sysenter_setup(void); extern struct desc_ptr early_gdt_descr; extern void cpu_set_gdt(int); -extern void switch_to_new_gdt(void); +extern void switch_to_new_gdt(int); +extern void load_percpu_segment(int); extern void cpu_init(void); -extern void init_gdt(int cpu); static inline unsigned long get_debugctlmsr(void) { @@ -783,6 +811,7 @@ static inline void spin_lock_prefetch(co * User space process size: 3GB (default). */ #define TASK_SIZE PAGE_OFFSET +#define TASK_SIZE_MAX TASK_SIZE #define STACK_TOP TASK_SIZE #define STACK_TOP_MAX STACK_TOP @@ -840,7 +869,7 @@ extern unsigned long thread_saved_pc(str /* * User space process size. 47bits minus one guard page. */ -#define TASK_SIZE64 ((1UL << 47) - PAGE_SIZE) +#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE) /* This decides where the kernel will search for a free chunk of vm * space during mmap's. @@ -849,12 +878,12 @@ extern unsigned long thread_saved_pc(str 0xc0000000 : 0xFFFFe000) #define TASK_SIZE (test_thread_flag(TIF_IA32) ? \ - IA32_PAGE_OFFSET : TASK_SIZE64) + IA32_PAGE_OFFSET : TASK_SIZE_MAX) #define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? 
\ - IA32_PAGE_OFFSET : TASK_SIZE64) + IA32_PAGE_OFFSET : TASK_SIZE_MAX) #define STACK_TOP TASK_SIZE -#define STACK_TOP_MAX TASK_SIZE64 +#define STACK_TOP_MAX TASK_SIZE_MAX #define INIT_THREAD { \ .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ --- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/smp.h 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/include/mach-xen/asm/smp.h 2010-03-24 15:25:06.000000000 +0100 @@ -15,53 +15,25 @@ # include # endif #endif -#include #include - -#ifdef CONFIG_X86_64 - -#define cpu_callin_mask cpu_possible_mask -#define cpu_callout_mask cpu_possible_mask -extern cpumask_var_t cpu_initialized_mask; -extern cpumask_var_t cpu_sibling_setup_mask; - -#else /* CONFIG_X86_32 */ - -#define cpu_callin_map cpu_possible_map -#define cpu_callout_map cpu_possible_map -extern cpumask_t cpu_initialized; -extern cpumask_t cpu_sibling_setup_map; - -#define cpu_callin_mask ((struct cpumask *)&cpu_callin_map) -#define cpu_callout_mask ((struct cpumask *)&cpu_callout_map) -#define cpu_initialized_mask ((struct cpumask *)&cpu_initialized) -#define cpu_sibling_setup_mask ((struct cpumask *)&cpu_sibling_setup_map) - -#endif /* CONFIG_X86_32 */ - -extern void (*mtrr_hook)(void); -extern void zap_low_mappings(void); - -extern int __cpuinit get_local_pda(int cpu); +#include extern int smp_num_siblings; extern unsigned int num_processors; -DECLARE_PER_CPU(cpumask_t, cpu_sibling_map); -DECLARE_PER_CPU(cpumask_t, cpu_core_map); +DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map); +DECLARE_PER_CPU(cpumask_var_t, cpu_core_map); DECLARE_PER_CPU(u16, cpu_llc_id); -#ifdef CONFIG_X86_32 DECLARE_PER_CPU(int, cpu_number); -#endif static inline struct cpumask *cpu_sibling_mask(int cpu) { - return &per_cpu(cpu_sibling_map, cpu); + return per_cpu(cpu_sibling_map, cpu); } static inline struct cpumask *cpu_core_mask(int cpu) { - return &per_cpu(cpu_core_map, cpu); + return per_cpu(cpu_core_map, cpu); } DECLARE_PER_CPU(u16, x86_cpu_to_apicid); @@ -149,9 +121,10 @@ static inline void arch_send_call_functi smp_ops.send_call_func_single_ipi(cpu); } -static inline void arch_send_call_function_ipi(cpumask_t mask) +#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask +static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask) { - smp_ops.send_call_func_ipi(&mask); + smp_ops.send_call_func_ipi(mask); } void cpu_disable_common(void); @@ -176,14 +149,12 @@ void xen_send_call_func_single_ipi(int c #define smp_send_stop xen_smp_send_stop #define smp_send_reschedule xen_smp_send_reschedule #define arch_send_call_function_single_ipi xen_send_call_func_single_ipi -#define arch_send_call_function_ipi(m) xen_send_call_func_ipi(&(m)) +#define arch_send_call_function_ipi_mask xen_send_call_func_ipi void play_dead(void); #endif /* CONFIG_XEN */ -extern void prefill_possible_map(void); - void smp_store_cpu_info(int id); #define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) @@ -192,10 +163,6 @@ static inline int num_booting_cpus(void) { return cpumask_weight(cpu_callout_mask); } -#else -static inline void prefill_possible_map(void) -{ -} #endif /* CONFIG_SMP */ extern unsigned disabled_cpus __cpuinitdata; @@ -206,11 +173,11 @@ extern unsigned disabled_cpus __cpuinitd * from the initial startup. We map APIC_BASE very early in page_setup(), * so this is correct in the x86 case. 
*/ -#define raw_smp_processor_id() (x86_read_percpu(cpu_number)) +#define raw_smp_processor_id() (percpu_read(cpu_number)) #define safe_smp_processor_id() smp_processor_id() #elif defined(CONFIG_X86_64_SMP) -#define raw_smp_processor_id() read_pda(cpunumber) +#define raw_smp_processor_id() (percpu_read(cpu_number)) #define stack_smp_processor_id() \ ({ \ @@ -220,10 +187,6 @@ extern unsigned disabled_cpus __cpuinitd }) #define safe_smp_processor_id() smp_processor_id() -#else /* !CONFIG_X86_32_SMP && !CONFIG_X86_64_SMP */ -#define cpu_physical_id(cpu) boot_cpu_physical_apicid -#define safe_smp_processor_id() 0 -#define stack_smp_processor_id() 0 #endif #ifdef CONFIG_X86_LOCAL_APIC @@ -235,28 +198,9 @@ static inline int logical_smp_processor_ return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR)); } -#include -static inline unsigned int read_apic_id(void) -{ - unsigned int reg; - - reg = *(u32 *)(APIC_BASE + APIC_ID); - - return GET_APIC_ID(reg); -} #endif - -# if defined(APIC_DEFINITION) || defined(CONFIG_X86_64) extern int hard_smp_processor_id(void); -# else -#include -static inline int hard_smp_processor_id(void) -{ - /* we don't want to mark this access volatile - bad code generation */ - return read_apic_id(); -} -# endif /* APIC_DEFINITION */ #else /* CONFIG_X86_LOCAL_APIC */ @@ -266,11 +210,5 @@ static inline int hard_smp_processor_id( #endif /* CONFIG_X86_LOCAL_APIC */ -#ifdef CONFIG_X86_HAS_BOOT_CPU_ID -extern unsigned char boot_cpu_id; -#else -#define boot_cpu_id 0 -#endif - #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_SMP_H */ --- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/spinlock.h 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/include/mach-xen/asm/spinlock.h 2010-03-24 15:25:06.000000000 +0100 @@ -255,40 +255,18 @@ static __always_inline void __ticket_spi static inline int xen_spinlock_init(unsigned int cpu) { return 0; } static inline void xen_spinlock_cleanup(unsigned int cpu) {} -/* - * Define virtualization-friendly old-style lock byte lock, for use in - * pv_lock_ops if desired. - * - * This differs from the pre-2.6.24 spinlock by always using xchgb - * rather than decb to take the lock; this allows it to use a - * zero-initialized lock structure. It also maintains a 1-byte - * contention counter, so that we can implement - * __byte_spin_is_contended. 
- */ -struct __byte_spinlock { - s8 lock; -#if NR_CPUS < 256 - s8 spinners; -#else -#error NR_CPUS >= 256 support not implemented -#endif -}; - static inline int __byte_spin_is_locked(raw_spinlock_t *lock) { - struct __byte_spinlock *bl = (struct __byte_spinlock *)lock; - return bl->lock != 0; + return lock->lock != 0; } static inline int __byte_spin_is_contended(raw_spinlock_t *lock) { - struct __byte_spinlock *bl = (struct __byte_spinlock *)lock; - return bl->spinners != 0; + return lock->spinners != 0; } static inline void __byte_spin_lock(raw_spinlock_t *lock) { - struct __byte_spinlock *bl = (struct __byte_spinlock *)lock; s8 val = 1; asm("1: xchgb %1, %0\n" @@ -301,27 +279,25 @@ static inline void __byte_spin_lock(raw_ " " LOCK_PREFIX "decb %2\n" " jmp 1b\n" "3:" - : "+m" (bl->lock), "+q" (val), "+m" (bl->spinners): : "memory"); + : "+m" (lock->lock), "+q" (val), "+m" (lock->spinners): : "memory"); } #define __byte_spin_lock_flags(lock, flags) __byte_spin_lock(lock) static inline int __byte_spin_trylock(raw_spinlock_t *lock) { - struct __byte_spinlock *bl = (struct __byte_spinlock *)lock; u8 old = 1; asm("xchgb %1,%0" - : "+m" (bl->lock), "+q" (old) : : "memory"); + : "+m" (lock->lock), "+q" (old) : : "memory"); return old == 0; } static inline void __byte_spin_unlock(raw_spinlock_t *lock) { - struct __byte_spinlock *bl = (struct __byte_spinlock *)lock; smp_wmb(); - bl->lock = 0; + lock->lock = 0; } #define __raw_spin(n) __byte_spin_##n @@ -422,8 +398,7 @@ static inline int __raw_read_trylock(raw { atomic_t *count = (atomic_t *)lock; - atomic_dec(count); - if (atomic_read(count) >= 0) + if (atomic_dec_return(count) >= 0) return 1; atomic_inc(count); return 0; @@ -450,6 +425,9 @@ static inline void __raw_write_unlock(ra : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory"); } +#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) +#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) + #define _raw_spin_relax(lock) cpu_relax() #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() --- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/spinlock_types.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-05-25/arch/x86/include/mach-xen/asm/spinlock_types.h 2010-03-24 15:25:06.000000000 +0100 @@ -26,6 +26,20 @@ typedef union { # define TICKET_SHIFT 16 u16 cur, seq; #endif +#else +/* + * This differs from the pre-2.6.24 spinlock by always using xchgb + * rather than decb to take the lock; this allows it to use a + * zero-initialized lock structure. It also maintains a 1-byte + * contention counter, so that we can implement + * __byte_spin_is_contended. 
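The byte lock that this relocated comment describes is small enough to model in full. The kernel version uses an xchgb/decb asm sequence on the raw_spinlock_t fields; a hedged userspace equivalent with compiler atomics shows why a zero-initialized structure works and what the spinners counter records:

#include <stdint.h>
#include <stdio.h>

/* Zero-initialized byte lock plus 1-byte contention counter. */
struct byte_spinlock {
    uint8_t lock;
    uint8_t spinners;   /* valid while NR_CPUS < 256 */
};

static void byte_spin_lock(struct byte_spinlock *bl)
{
    /* xchg returns the previous value; 0 means the lock was free. */
    while (__atomic_exchange_n(&bl->lock, 1, __ATOMIC_ACQUIRE)) {
        __atomic_fetch_add(&bl->spinners, 1, __ATOMIC_RELAXED);
        while (__atomic_load_n(&bl->lock, __ATOMIC_RELAXED))
            ;   /* spin read-only instead of rewriting the line */
        __atomic_fetch_sub(&bl->spinners, 1, __ATOMIC_RELAXED);
    }
}

static int byte_spin_trylock(struct byte_spinlock *bl)
{
    return __atomic_exchange_n(&bl->lock, 1, __ATOMIC_ACQUIRE) == 0;
}

static void byte_spin_unlock(struct byte_spinlock *bl)
{
    __atomic_store_n(&bl->lock, 0, __ATOMIC_RELEASE);
}

int main(void)
{
    struct byte_spinlock bl = { 0, 0 };

    byte_spin_lock(&bl);
    printf("trylock while held: %d\n", byte_spin_trylock(&bl)); /* 0 */
    byte_spin_unlock(&bl);
    printf("trylock when free:  %d\n", byte_spin_trylock(&bl)); /* 1 */
    return 0;
}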
+ */ + u8 lock; +#if CONFIG_NR_CPUS < 256 + u8 spinners; +#else +# error NR_CPUS >= 256 not implemented +#endif #endif }; } raw_spinlock_t; --- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/system.h 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/include/mach-xen/asm/system.h 2010-03-24 15:25:06.000000000 +0100 @@ -21,9 +21,24 @@ struct task_struct; /* one of the stranger aspects of C forward declarations */ struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *next); +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p); #ifdef CONFIG_X86_32 +#ifdef CONFIG_CC_STACKPROTECTOR +#define __switch_canary \ + "movl %P[task_canary](%[next]), %%ebx\n\t" \ + "movl %%ebx, "__percpu_arg([stack_canary])"\n\t" +#define __switch_canary_oparam \ + , [stack_canary] "=m" (per_cpu_var(stack_canary)) +#define __switch_canary_iparam \ + , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) +#else /* CC_STACKPROTECTOR */ +#define __switch_canary +#define __switch_canary_oparam +#define __switch_canary_iparam +#endif /* CC_STACKPROTECTOR */ + /* * Saving eflags is important. It switches not only IOPL between tasks, * it also protects other tasks from NT leaking through sysenter etc. @@ -45,6 +60,7 @@ do { \ "movl %[next_sp],%%esp\n\t" /* restore ESP */ \ "movl $1f,%[prev_ip]\n\t" /* save EIP */ \ "pushl %[next_ip]\n\t" /* restore EIP */ \ + __switch_canary \ "jmp __switch_to\n" /* regparm call */ \ "1:\t" \ "popl %%ebp\n\t" /* restore EBP */ \ @@ -59,6 +75,8 @@ do { \ "=b" (ebx), "=c" (ecx), "=d" (edx), \ "=S" (esi), "=D" (edi) \ \ + __switch_canary_oparam \ + \ /* input parameters: */ \ : [next_sp] "m" (next->thread.sp), \ [next_ip] "m" (next->thread.ip), \ @@ -67,6 +85,8 @@ do { \ [prev] "a" (prev), \ [next] "d" (next) \ \ + __switch_canary_iparam \ + \ : /* reloaded segment registers */ \ "memory"); \ } while (0) @@ -87,27 +107,44 @@ do { \ , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \ "r12", "r13", "r14", "r15" +#ifdef CONFIG_CC_STACKPROTECTOR +#define __switch_canary \ + "movq %P[task_canary](%%rsi),%%r8\n\t" \ + "movq %%r8,"__percpu_arg([gs_canary])"\n\t" +#define __switch_canary_oparam \ + , [gs_canary] "=m" (per_cpu_var(irq_stack_union.stack_canary)) +#define __switch_canary_iparam \ + , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) +#else /* CC_STACKPROTECTOR */ +#define __switch_canary +#define __switch_canary_oparam +#define __switch_canary_iparam +#endif /* CC_STACKPROTECTOR */ + /* Save restore flags to clear handle leaking NT */ #define switch_to(prev, next, last) \ - asm volatile(SAVE_CONTEXT \ + asm volatile(SAVE_CONTEXT \ "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ "call __switch_to\n\t" \ ".globl thread_return\n" \ "thread_return:\n\t" \ - "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \ + "movq "__percpu_arg([current_task])",%%rsi\n\t" \ + __switch_canary \ "movq %P[thread_info](%%rsi),%%r8\n\t" \ - LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \ "movq %%rax,%%rdi\n\t" \ - "jc ret_from_fork\n\t" \ + "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \ + "jnz ret_from_fork\n\t" \ RESTORE_CONTEXT \ : "=a" (last) \ + __switch_canary_oparam \ : [next] "S" (next), [prev] "D" (prev), \ [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \ [ti_flags] "i" (offsetof(struct thread_info, flags)), \ - [tif_fork] "i" (TIF_FORK), \ + [_tif_fork] "i" (_TIF_FORK), \ [thread_info] "i" (offsetof(struct task_struct, stack)), \ - 
[pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \ + [current_task] "m" (per_cpu_var(current_task)) \ + __switch_canary_iparam \ : "memory", "cc" __EXTRA_CLOBBER) #endif @@ -166,6 +203,25 @@ extern void xen_load_gs_index(unsigned); #define savesegment(seg, value) \ asm("mov %%" #seg ",%0":"=r" (value) : : "memory") +/* + * x86_32 user gs accessors. + */ +#ifdef CONFIG_X86_32 +#ifdef CONFIG_X86_32_LAZY_GS +#define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;}) +#define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) +#define task_user_gs(tsk) ((tsk)->thread.gs) +#define lazy_save_gs(v) savesegment(gs, (v)) +#define lazy_load_gs(v) loadsegment(gs, (v)) +#else /* X86_32_LAZY_GS */ +#define get_user_gs(regs) (u16)((regs)->gs) +#define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0) +#define task_user_gs(tsk) (task_pt_regs(tsk)->gs) +#define lazy_save_gs(v) do { } while (0) +#define lazy_load_gs(v) do { } while (0) +#endif /* X86_32_LAZY_GS */ +#endif /* X86_32 */ + static inline unsigned long get_limit(unsigned long segment) { unsigned long __limit; --- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/tlbflush.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-05-25/arch/x86/include/mach-xen/asm/tlbflush.h 2010-03-24 15:25:06.000000000 +0100 @@ -86,21 +86,20 @@ static inline void flush_tlb_range(struc flush_tlb_mm(vma->vm_mm); } +#ifndef CONFIG_XEN #define TLBSTATE_OK 1 #define TLBSTATE_LAZY 2 -#ifdef CONFIG_X86_32 struct tlb_state { struct mm_struct *active_mm; int state; - char __cacheline_padding[L1_CACHE_BYTES-8]; }; -DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate); +DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); -void reset_lazy_tlbstate(void); -#else static inline void reset_lazy_tlbstate(void) { + percpu_write(cpu_tlbstate.state, 0); + percpu_write(cpu_tlbstate.active_mm, &init_mm); } #endif @@ -112,4 +111,6 @@ static inline void flush_tlb_kernel_rang flush_tlb_all(); } +extern void zap_low_mappings(void); + #endif /* _ASM_X86_TLBFLUSH_H */ --- head-2010-05-25.orig/arch/x86/kernel/Makefile 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/Makefile 2010-03-24 15:25:06.000000000 +0100 @@ -122,7 +122,6 @@ obj-$(CONFIG_X86_XEN) += fixup.o ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) - obj-$(CONFIG_X86_XEN_GENAPIC) += genapic_64.o genapic_xen_64.o obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o obj-$(CONFIG_AUDIT) += audit_64.o @@ -134,11 +133,10 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o obj-y += vsmp_64.o - obj-$(CONFIG_XEN) += nmi.o time_64-$(CONFIG_XEN) += time_32.o endif -disabled-obj-$(CONFIG_XEN) := %_uv.o crash.o early-quirks.o genx2apic_%.o \ - hpet.o i8253.o i8259.o irqinit_$(BITS).o pci-swiotlb_64.o reboot.o \ - smpboot.o tlb_$(BITS).o tsc.o tsc_sync.o uv_%.o vsmp_64.o +disabled-obj-$(CONFIG_XEN) := %_uv.o crash.o early-quirks.o hpet.o i8253.o \ + i8259.o irqinit_$(BITS).o pci-swiotlb.o reboot.o smpboot.o tsc.o \ + tsc_sync.o uv_%.o vsmp_64.o disabled-obj-$(CONFIG_XEN_UNPRIVILEGED_GUEST) += probe_roms_32.o --- head-2010-05-25.orig/arch/x86/kernel/acpi/boot.c 2010-04-15 10:05:36.000000000 +0200 +++ head-2010-05-25/arch/x86/kernel/acpi/boot.c 2010-04-15 10:07:05.000000000 +0200 @@ -115,11 +115,6 @@ char *__init __acpi_map_table(unsigned l if (!phys || !size) return NULL; -#ifdef CONFIG_XEN - if (phys + size <= (NR_FIX_ISAMAPS << PAGE_SHIFT)) - return isa_bus_to_virt(phys); -#endif - 
return early_ioremap(phys, size); } void __init __acpi_unmap_table(char *map, unsigned long size) @@ -151,8 +146,10 @@ static int __init acpi_parse_madt(struct madt->address); } +#ifndef CONFIG_XEN default_acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id); +#endif return 0; } --- head-2010-05-25.orig/arch/x86/kernel/acpi/sleep-xen.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/acpi/sleep-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -104,6 +104,7 @@ int acpi_save_state_mem(void) stack_start.sp = temp_stack + sizeof(temp_stack); early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(smp_processor_id()); + initial_gs = per_cpu_offset(smp_processor_id()); #endif initial_code = (unsigned long)wakeup_long64; saved_magic = 0x123456789abcdef0; --- head-2010-05-25.orig/arch/x86/kernel/apic/Makefile 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/arch/x86/kernel/apic/Makefile 2010-03-24 15:25:06.000000000 +0100 @@ -17,3 +17,10 @@ obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o obj-$(CONFIG_X86_NUMAQ) += numaq_32.o obj-$(CONFIG_X86_ES7000) += es7000_32.o obj-$(CONFIG_X86_SUMMIT) += summit_32.o + +obj-$(CONFIG_XEN) += nmi.o + +probe_64-$(CONFIG_XEN) := probe_32.o + +disabled-obj-$(CONFIG_XEN) := apic_flat_$(BITS).o +disabled-obj-$(filter-out $(CONFIG_SMP),$(CONFIG_XEN)) += ipi.o --- head-2010-05-25.orig/arch/x86/kernel/apic/apic-xen.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/apic/apic-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -4,11 +4,20 @@ #include #include +#include #include #include #include +unsigned int num_processors; + +/* + * Map cpu index to physical APIC ID + */ +DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID; +EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid); + /* * Debug level, exported for io_apic.c */ --- head-2010-05-25.orig/arch/x86/kernel/apic/io_apic-xen.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/apic/io_apic-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -1,7 +1,7 @@ /* * Intel IO-APIC support for multi-Pentium hosts. * - * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo + * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo * * Many thanks to Stig Venaas for trying out countless experimental * patches and reporting/debugging problems patiently! 
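The apic-xen.c hunk above replaces the old PDA scheme with a per-CPU u16 map from Linux CPU number to physical APIC ID, pre-initialized to the BAD_APICID sentinel until a CPU is enumerated. A plain-array model of the idiom (an array stands in for DEFINE_PER_CPU here; the range initializer is the same GCC extension the pirq_entries change below relies on):

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS    8
#define BAD_APICID 0xFFFFu   /* u16 sentinel, as in the patch */

/* Stand-in for DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID. */
static uint16_t x86_cpu_to_apicid[NR_CPUS] = {
    [0 ... NR_CPUS - 1] = BAD_APICID   /* GCC range initializer */
};

int main(void)
{
    x86_cpu_to_apicid[0] = 0;   /* boot CPU records its APIC ID */

    for (int cpu = 0; cpu < 2; cpu++) {
        if (x86_cpu_to_apicid[cpu] == BAD_APICID)
            printf("cpu %d: not enumerated yet\n", cpu);
        else
            printf("cpu %d: apicid %u\n", cpu, x86_cpu_to_apicid[cpu]);
    }
    return 0;
}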
@@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -61,9 +62,7 @@ #include #include -#include -#include -#include +#include #ifdef CONFIG_XEN #include @@ -97,11 +96,11 @@ static DEFINE_SPINLOCK(vector_lock); int nr_ioapic_registers[MAX_IO_APICS]; /* I/O APIC entries */ -struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; +struct mpc_ioapic mp_ioapics[MAX_IO_APICS]; int nr_ioapics; /* MP IRQ source entries */ -struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; +struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; /* # of MP IRQ source entries */ int mp_irq_entries; @@ -114,10 +113,19 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BU int skip_ioapic_setup; +static void __init _arch_disable_smp_support(void) +{ +#ifdef CONFIG_PCI + noioapicquirk = 1; + noioapicreroute = -1; +#endif + skip_ioapic_setup = 1; +} + static int __init parse_noapic(char *str) { /* disable IO-APIC */ - disable_ioapic_setup(); + _arch_disable_smp_support(); return 0; } early_param("noapic", parse_noapic); @@ -372,7 +380,7 @@ set_extra_move_desc(struct irq_desc *des if (!cfg->move_in_progress) { /* it means that domain is not changed */ - if (!cpumask_intersects(&desc->affinity, mask)) + if (!cpumask_intersects(desc->affinity, mask)) cfg->move_desc_pending = 1; } } @@ -397,12 +405,20 @@ struct io_apic { unsigned int index; unsigned int unused[3]; unsigned int data; + unsigned int unused2[11]; + unsigned int eoi; }; static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) { return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) - + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK); + + (mp_ioapics[idx].apicaddr & ~PAGE_MASK); +} + +static inline void io_apic_eoi(unsigned int apic, unsigned int vector) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(vector, &io_apic->eoi); } #endif /* CONFIG_XEN */ @@ -416,7 +432,7 @@ static inline unsigned int io_apic_read( struct physdev_apic apic_op; int ret; - apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr; + apic_op.apic_physbase = mp_ioapics[apic].apicaddr; apic_op.reg = reg; ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op); if (ret) @@ -434,7 +450,7 @@ static inline void io_apic_write(unsigne #else struct physdev_apic apic_op; - apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr; + apic_op.apic_physbase = mp_ioapics[apic].apicaddr; apic_op.reg = reg; apic_op.value = value; WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op)); @@ -522,7 +538,7 @@ __ioapic_write_entry(int apic, int pin, io_apic_write(apic, 0x10 + 2*pin, eu.w1); } -static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { unsigned long flags; spin_lock_irqsave(&ioapic_lock, flags); @@ -558,11 +574,11 @@ static void send_cleanup_vector(struct i for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) cfg->move_cleanup_count++; for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) - send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); + apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); } else { cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); cfg->move_cleanup_count = cpumask_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); free_cpumask_var(cleanup_mask); } cfg->move_in_progress = 0; @@ -583,16 +599,12 @@ static void __target_IO_APIC_irq(unsigne apic = entry->apic; pin = entry->pin; -#ifdef CONFIG_INTR_REMAP /* * With 
interrupt-remapping, destination information comes * from interrupt-remapping table entry. */ if (!irq_remapped(irq)) io_apic_write(apic, 0x11 + pin*2, dest); -#else - io_apic_write(apic, 0x11 + pin*2, dest); -#endif reg = io_apic_read(apic, 0x10 + pin*2); reg &= ~IO_APIC_REDIR_VECTOR_MASK; reg |= vector; @@ -607,8 +619,9 @@ static int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); /* - * Either sets desc->affinity to a valid value, and returns cpu_mask_to_apicid - * of that, or returns BAD_APICID and leaves desc->affinity untouched. + * Either sets desc->affinity to a valid value, and returns + * ->cpu_mask_to_apicid of that, or returns BAD_APICID and + * leaves desc->affinity untouched. */ static unsigned int set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) @@ -624,9 +637,12 @@ set_desc_affinity(struct irq_desc *desc, if (assign_irq_vector(irq, cfg, mask)) return BAD_APICID; - cpumask_and(&desc->affinity, cfg->domain, mask); + /* check that before desc->affinity gets updated */ set_extra_move_desc(desc, mask); - return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask); + + cpumask_copy(desc->affinity, mask); + + return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); } static void @@ -840,23 +856,6 @@ static void clear_IO_APIC (void) for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) clear_IO_APIC_pin(apic, pin); } - -#if !defined(CONFIG_SMP) && defined(CONFIG_X86_32) -void send_IPI_self(int vector) -{ - unsigned int cfg; - - /* - * Wait for idle. - */ - apic_wait_icr_idle(); - cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL; - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - apic_write(APIC_ICR, cfg); -} -#endif /* !CONFIG_SMP && CONFIG_X86_32*/ #else #define add_pin_to_irq_cpu(cfg, cpu, apic, pin) #endif /* CONFIG_XEN */ @@ -868,8 +867,9 @@ void send_IPI_self(int vector) */ #define MAX_PIRQS 8 -static int pirq_entries [MAX_PIRQS]; -static int pirqs_enabled; +static int pirq_entries[MAX_PIRQS] = { + [0 ...
MAX_PIRQS - 1] = -1 +}; static int __init ioapic_pirq_setup(char *str) { @@ -878,10 +878,6 @@ static int __init ioapic_pirq_setup(char get_options(str, ARRAY_SIZE(ints), ints); - for (i = 0; i < MAX_PIRQS; i++) - pirq_entries[i] = -1; - - pirqs_enabled = 1; apic_printk(APIC_VERBOSE, KERN_INFO "PIRQ redirection, working around broken MP-BIOS.\n"); max = MAX_PIRQS; @@ -903,75 +899,106 @@ __setup("pirq=", ioapic_pirq_setup); #endif /* CONFIG_X86_32 */ #ifdef CONFIG_INTR_REMAP -/* I/O APIC RTE contents at the OS boot up */ -static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS]; +struct IO_APIC_route_entry **alloc_ioapic_entries(void) +{ + int apic; + struct IO_APIC_route_entry **ioapic_entries; + + ioapic_entries = kzalloc(sizeof(*ioapic_entries) * nr_ioapics, + GFP_ATOMIC); + if (!ioapic_entries) + return 0; + + for (apic = 0; apic < nr_ioapics; apic++) { + ioapic_entries[apic] = + kzalloc(sizeof(struct IO_APIC_route_entry) * + nr_ioapic_registers[apic], GFP_ATOMIC); + if (!ioapic_entries[apic]) + goto nomem; + } + + return ioapic_entries; + +nomem: + while (--apic >= 0) + kfree(ioapic_entries[apic]); + kfree(ioapic_entries); + + return 0; +} /* - * Saves and masks all the unmasked IO-APIC RTE's + * Saves all the IO-APIC RTE's */ -int save_mask_IO_APIC_setup(void) +int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) { - union IO_APIC_reg_01 reg_01; - unsigned long flags; int apic, pin; - /* - * The number of IO-APIC IRQ registers (== #pins): - */ + if (!ioapic_entries) + return -ENOMEM; + for (apic = 0; apic < nr_ioapics; apic++) { - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(apic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - nr_ioapic_registers[apic] = reg_01.bits.entries+1; + if (!ioapic_entries[apic]) + return -ENOMEM; + + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) + ioapic_entries[apic][pin] = + ioapic_read_entry(apic, pin); } + return 0; +} + +/* + * Mask all IO APIC entries. + */ +void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) +{ + int apic, pin; + + if (!ioapic_entries) + return; + for (apic = 0; apic < nr_ioapics; apic++) { - early_ioapic_entries[apic] = - kzalloc(sizeof(struct IO_APIC_route_entry) * - nr_ioapic_registers[apic], GFP_KERNEL); - if (!early_ioapic_entries[apic]) - goto nomem; - } + if (!ioapic_entries[apic]) + break; - for (apic = 0; apic < nr_ioapics; apic++) for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { struct IO_APIC_route_entry entry; - entry = early_ioapic_entries[apic][pin] = - ioapic_read_entry(apic, pin); + entry = ioapic_entries[apic][pin]; if (!entry.mask) { entry.mask = 1; ioapic_write_entry(apic, pin, entry); } } - - return 0; - -nomem: - while (apic >= 0) - kfree(early_ioapic_entries[apic--]); - memset(early_ioapic_entries, 0, - ARRAY_SIZE(early_ioapic_entries)); - - return -ENOMEM; + } } -void restore_IO_APIC_setup(void) +/* + * Restore IO APIC entries which were saved in ioapic_entries.
+ */ +int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) { int apic, pin; + if (!ioapic_entries) + return -ENOMEM; + for (apic = 0; apic < nr_ioapics; apic++) { - if (!early_ioapic_entries[apic]) - break; + if (!ioapic_entries[apic]) + return -ENOMEM; + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) ioapic_write_entry(apic, pin, - early_ioapic_entries[apic][pin]); - kfree(early_ioapic_entries[apic]); - early_ioapic_entries[apic] = NULL; + ioapic_entries[apic][pin]); } + return 0; } -void reinit_intr_remapped_IO_APIC(int intr_remapping) +void reinit_intr_remapped_IO_APIC(int intr_remapping, + struct IO_APIC_route_entry **ioapic_entries) + { /* * for now plain restore of previous settings. @@ -980,7 +1007,17 @@ void reinit_intr_remapped_IO_APIC(int in * table entries. for now, do a plain restore, and wait for * the setup_IO_APIC_irqs() to do proper initialization. */ - restore_IO_APIC_setup(); + restore_IO_APIC_setup(ioapic_entries); +} + +void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries) +{ + int apic; + + for (apic = 0; apic < nr_ioapics; apic++) + kfree(ioapic_entries[apic]); + + kfree(ioapic_entries); } #endif @@ -992,10 +1029,10 @@ static int find_irq_entry(int apic, int int i; for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mp_irqtype == type && - (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid || - mp_irqs[i].mp_dstapic == MP_APIC_ALL) && - mp_irqs[i].mp_dstirq == pin) + if (mp_irqs[i].irqtype == type && + (mp_irqs[i].dstapic == mp_ioapics[apic].apicid || + mp_irqs[i].dstapic == MP_APIC_ALL) && + mp_irqs[i].dstirq == pin) return i; return -1; @@ -1010,13 +1047,13 @@ static int __init find_isa_irq_pin(int i int i; for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mp_srcbus; + int lbus = mp_irqs[i].srcbus; if (test_bit(lbus, mp_bus_not_pci) && - (mp_irqs[i].mp_irqtype == type) && - (mp_irqs[i].mp_srcbusirq == irq)) + (mp_irqs[i].irqtype == type) && + (mp_irqs[i].srcbusirq == irq)) - return mp_irqs[i].mp_dstirq; + return mp_irqs[i].dstirq; } return -1; } @@ -1026,17 +1063,17 @@ static int __init find_isa_irq_apic(int int i; for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mp_srcbus; + int lbus = mp_irqs[i].srcbus; if (test_bit(lbus, mp_bus_not_pci) && - (mp_irqs[i].mp_irqtype == type) && - (mp_irqs[i].mp_srcbusirq == irq)) + (mp_irqs[i].irqtype == type) && + (mp_irqs[i].srcbusirq == irq)) break; } if (i < mp_irq_entries) { int apic; for(apic = 0; apic < nr_ioapics; apic++) { - if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic) + if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic) return apic; } } @@ -1062,23 +1099,23 @@ int IO_APIC_get_PCI_irq_vector(int bus, return -1; } for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mp_srcbus; + int lbus = mp_irqs[i].srcbus; for (apic = 0; apic < nr_ioapics; apic++) - if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic || - mp_irqs[i].mp_dstapic == MP_APIC_ALL) + if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic || + mp_irqs[i].dstapic == MP_APIC_ALL) break; if (!test_bit(lbus, mp_bus_not_pci) && - !mp_irqs[i].mp_irqtype && + !mp_irqs[i].irqtype && (bus == lbus) && - (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) { - int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq); + (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) { + int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq); if (!(apic || IO_APIC_IRQ(irq))) continue; - if (pin == (mp_irqs[i].mp_srcbusirq & 3)) + if (pin == (mp_irqs[i].srcbusirq & 3)) return irq; /* * Use the first all-but-pin 
matching entry as a @@ -1121,7 +1158,7 @@ static int EISA_ELCR(unsigned int irq) * EISA conforming in the MP table, that means its trigger type must * be read in from the ELCR */ -#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq)) +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq)) #define default_EISA_polarity(idx) default_ISA_polarity(idx) /* PCI interrupts are always polarity one level triggered, @@ -1138,13 +1175,13 @@ static int EISA_ELCR(unsigned int irq) static int MPBIOS_polarity(int idx) { - int bus = mp_irqs[idx].mp_srcbus; + int bus = mp_irqs[idx].srcbus; int polarity; /* * Determine IRQ line polarity (high active or low active): */ - switch (mp_irqs[idx].mp_irqflag & 3) + switch (mp_irqs[idx].irqflag & 3) { case 0: /* conforms, ie. bus-type dependent polarity */ if (test_bit(bus, mp_bus_not_pci)) @@ -1180,13 +1217,13 @@ static int MPBIOS_polarity(int idx) static int MPBIOS_trigger(int idx) { - int bus = mp_irqs[idx].mp_srcbus; + int bus = mp_irqs[idx].srcbus; int trigger; /* * Determine IRQ trigger mode (edge or level sensitive): */ - switch ((mp_irqs[idx].mp_irqflag>>2) & 3) + switch ((mp_irqs[idx].irqflag>>2) & 3) { case 0: /* conforms, ie. bus-type dependent */ if (test_bit(bus, mp_bus_not_pci)) @@ -1264,16 +1301,16 @@ int (*ioapic_renumber_irq)(int ioapic, i static int pin_2_irq(int idx, int apic, int pin) { int irq, i; - int bus = mp_irqs[idx].mp_srcbus; + int bus = mp_irqs[idx].srcbus; /* * Debugging check, we are in big trouble if this message pops up! */ - if (mp_irqs[idx].mp_dstirq != pin) + if (mp_irqs[idx].dstirq != pin) printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); if (test_bit(bus, mp_bus_not_pci)) { - irq = mp_irqs[idx].mp_srcbusirq; + irq = mp_irqs[idx].srcbusirq; } else { /* * PCI IRQs are mapped in order @@ -1366,7 +1403,7 @@ __assign_irq_vector(int irq, struct irq_ int new_cpu; int vector, offset; - vector_allocation_domain(cpu, tmp_mask); + apic->vector_allocation_domain(cpu, tmp_mask); vector = current_vector; offset = current_offset; @@ -1476,9 +1513,7 @@ void __setup_vector_irq(int cpu) } static struct irq_chip ioapic_chip; -#ifdef CONFIG_INTR_REMAP static struct irq_chip ir_ioapic_chip; -#endif #define IOAPIC_AUTO -1 #define IOAPIC_EDGE 0 @@ -1517,7 +1552,6 @@ static void ioapic_register_intr(int irq else desc->status &= ~IRQ_LEVEL; -#ifdef CONFIG_INTR_REMAP if (irq_remapped(irq)) { desc->status |= IRQ_MOVE_PCNTXT; if (trigger) @@ -1529,7 +1563,7 @@ static void ioapic_register_intr(int irq handle_edge_irq, "edge"); return; } -#endif + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || trigger == IOAPIC_LEVEL) set_irq_chip_and_handler_name(irq, &ioapic_chip, @@ -1544,37 +1578,44 @@ static void ioapic_register_intr(int irq #define ioapic_register_intr(irq, desc, trigger) evtchn_register_pirq(irq) #endif -static int setup_ioapic_entry(int apic, int irq, - struct IO_APIC_route_entry *entry, - unsigned int destination, int trigger, - int polarity, int vector) +int setup_ioapic_entry(int apic_id, int irq, + struct IO_APIC_route_entry *entry, + unsigned int destination, int trigger, + int polarity, int vector, int pin) { /* * add it to the IO-APIC irq-routing table: */ memset(entry,0,sizeof(*entry)); -#ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) { - struct intel_iommu *iommu = map_ioapic_to_ir(apic); +#ifndef CONFIG_XEN + struct intel_iommu *iommu = map_ioapic_to_ir(apic_id); struct irte irte; struct IR_IO_APIC_route_entry *ir_entry = (struct IR_IO_APIC_route_entry *) entry; int index; if 
(!iommu) - panic("No mapping iommu for ioapic %d\n", apic); + panic("No mapping iommu for ioapic %d\n", apic_id); index = alloc_irte(iommu, irq, 1); if (index < 0) - panic("Failed to allocate IRTE for ioapic %d\n", apic); + panic("Failed to allocate IRTE for ioapic %d\n", apic_id); memset(&irte, 0, sizeof(irte)); irte.present = 1; - irte.dst_mode = INT_DEST_MODE; - irte.trigger_mode = trigger; - irte.dlvry_mode = INT_DELIVERY_MODE; + irte.dst_mode = apic->irq_dest_mode; + /* + * Trigger mode in the IRTE will always be edge, and the + * actual level or edge trigger will be setup in the IO-APIC + * RTE. This will help simplify level triggered irq migration. + * For more details, see the comments above explaining IO-APIC + * irq migration in the presence of interrupt-remapping. + */ + irte.trigger_mode = 0; + irte.dlvry_mode = apic->irq_delivery_mode; irte.vector = vector; irte.dest_id = IRTE_DEST(destination); @@ -1584,18 +1625,22 @@ static int setup_ioapic_entry(int apic, ir_entry->zero = 0; ir_entry->format = 1; ir_entry->index = (index & 0x7fff); - } else + /* + * IO-APIC RTE will be configured with virtual vector. + * irq handler will do the explicit EOI to the io-apic. + */ + ir_entry->vector = pin; #endif - { - entry->delivery_mode = INT_DELIVERY_MODE; - entry->dest_mode = INT_DEST_MODE; + } else { + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; entry->dest = destination; + entry->vector = vector; } entry->mask = 0; /* enable IRQ */ entry->trigger = trigger; entry->polarity = polarity; - entry->vector = vector; /* Mask level triggered irqs. * Use IRQ_DELAYED_DISABLE for edge triggered irqs. @@ -1605,7 +1650,7 @@ static int setup_ioapic_entry(int apic, return 0; } -static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_desc *desc, +static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq_desc *desc, int trigger, int polarity) { struct irq_cfg *cfg; @@ -1617,26 +1662,26 @@ static void setup_IO_APIC_irq(int apic, cfg = desc->chip_data; - if (assign_irq_vector(irq, cfg, TARGET_CPUS)) + if (assign_irq_vector(irq, cfg, apic->target_cpus())) return; #ifndef CONFIG_XEN - dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); + dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); #else - dest = cpu_mask_to_apicid(TARGET_CPUS); + dest = 0; /* meaningless */ #endif apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " "IRQ %d Mode:%i Active:%i)\n", - apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector, + apic_id, mp_ioapics[apic_id].apicid, pin, cfg->vector, irq, trigger, polarity); - if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry, - dest, trigger, polarity, cfg->vector)) { + if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry, + dest, trigger, polarity, cfg->vector, pin)) { printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", - mp_ioapics[apic].mp_apicid, pin); + mp_ioapics[apic_id].apicid, pin); __clear_irq_vector(irq, cfg); return; } @@ -1645,12 +1690,12 @@ static void setup_IO_APIC_irq(int apic, if (irq < NR_IRQS_LEGACY) disable_8259A_irq(irq); - ioapic_write_entry(apic, pin, entry); + ioapic_write_entry(apic_id, pin, entry); } static void __init setup_IO_APIC_irqs(void) { - int apic, pin, idx, irq; + int apic_id, pin, idx, irq; int notcon = 0; struct irq_desc *desc; struct irq_cfg *cfg; @@ -1658,21 +1703,19 @@ static void __init setup_IO_APIC_irqs(vo apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); - for
(apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + for (apic_id = 0; apic_id < nr_ioapics; apic_id++) { + for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { - idx = find_irq_entry(apic, pin, mp_INT); + idx = find_irq_entry(apic_id, pin, mp_INT); if (idx == -1) { if (!notcon) { notcon = 1; apic_printk(APIC_VERBOSE, KERN_DEBUG " %d-%d", - mp_ioapics[apic].mp_apicid, - pin); + mp_ioapics[apic_id].apicid, pin); } else apic_printk(APIC_VERBOSE, " %d-%d", - mp_ioapics[apic].mp_apicid, - pin); + mp_ioapics[apic_id].apicid, pin); continue; } if (notcon) { @@ -1681,23 +1724,30 @@ static void __init setup_IO_APIC_irqs(vo notcon = 0; } - irq = pin_2_irq(idx, apic, pin); -#if defined(CONFIG_XEN) - if (irq < PIRQ_BASE || irq >= PIRQ_BASE + NR_PIRQS) + irq = pin_2_irq(idx, apic_id, pin); + +#ifdef CONFIG_XEN + if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs) continue; -#elif defined(CONFIG_X86_32) - if (multi_timer_check(apic, irq)) +#else + /* + * Skip the timer IRQ if there's a quirk handler + * installed and if it returns 1: + */ + if (apic->multi_timer_check && + apic->multi_timer_check(apic_id, irq)) continue; #endif + desc = irq_to_desc_alloc_cpu(irq, cpu); if (!desc) { printk(KERN_INFO "can not get irq_desc for %d\n", irq); continue; } cfg = desc->chip_data; - add_pin_to_irq_cpu(cfg, cpu, apic, pin); + add_pin_to_irq_cpu(cfg, cpu, apic_id, pin); - setup_IO_APIC_irq(apic, pin, irq, desc, + setup_IO_APIC_irq(apic_id, pin, irq, desc, irq_trigger(idx), irq_polarity(idx)); } } @@ -1711,15 +1761,13 @@ static void __init setup_IO_APIC_irqs(vo /* * Set up the timer pin, possibly with the 8259A-master behind. */ -static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, +static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin, int vector) { struct IO_APIC_route_entry entry; -#ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) return; -#endif memset(&entry, 0, sizeof(entry)); @@ -1727,10 +1775,10 @@ static void __init setup_timer_IRQ0_pin( * We use logical delivery to get the timer IRQ * to the first CPU. */ - entry.dest_mode = INT_DEST_MODE; - entry.mask = 1; /* mask IRQ now */ - entry.dest = cpu_mask_to_apicid(TARGET_CPUS); - entry.delivery_mode = INT_DELIVERY_MODE; + entry.dest_mode = apic->irq_dest_mode; + entry.mask = 0; /* don't mask IRQ for edge */ + entry.dest = apic->cpu_mask_to_apicid(apic->target_cpus()); + entry.delivery_mode = apic->irq_delivery_mode; entry.polarity = 0; entry.trigger = 0; entry.vector = vector; @@ -1744,7 +1792,7 @@ static void __init setup_timer_IRQ0_pin( /* * Add it to the IO-APIC irq-routing table: */ - ioapic_write_entry(apic, pin, entry); + ioapic_write_entry(apic_id, pin, entry); } @@ -1766,7 +1814,7 @@ __apicdebuginit(void) print_IO_APIC(void printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); for (i = 0; i < nr_ioapics; i++) printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", - mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]); + mp_ioapics[i].apicid, nr_ioapic_registers[i]); /* * We are a bit conservative about what we expect. We have to @@ -1786,7 +1834,7 @@ __apicdebuginit(void) print_IO_APIC(void spin_unlock_irqrestore(&ioapic_lock, flags); printk("\n"); - printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid); printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); printk(KERN_DEBUG "....... 
: physical APIC id: %02X\n", reg_00.bits.ID); printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); @@ -2050,13 +2098,6 @@ void __init enable_IO_APIC(void) int apic; unsigned long flags; -#ifdef CONFIG_X86_32 - int i; - if (!pirqs_enabled) - for (i = 0; i < MAX_PIRQS; i++) - pirq_entries[i] = -1; -#endif - /* * The number of IO-APIC IRQ registers (== #pins): */ @@ -2129,8 +2170,13 @@ void disable_IO_APIC(void) * If the i8259 is routed through an IOAPIC * Put that IOAPIC in virtual wire mode * so legacy interrupts can be delivered. + * + * With interrupt-remapping, for now we will use virtual wire A mode, + * as virtual wire B is a little complex (need to configure both + * IOAPIC RTE as well as interrupt-remapping table entry). + * As this gets called during crash dump, keep this simple for now. */ - if (ioapic_i8259.pin != -1) { + if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) { struct IO_APIC_route_entry entry; memset(&entry, 0, sizeof(entry)); @@ -2150,7 +2196,10 @@ void disable_IO_APIC(void) ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); } - disconnect_bsp_APIC(ioapic_i8259.pin != -1); + /* + * Use virtual wire A mode when interrupt remapping is enabled. + */ + disconnect_bsp_APIC(!intr_remapping_enabled && ioapic_i8259.pin != -1); } #ifdef CONFIG_X86_32 @@ -2165,7 +2214,7 @@ static void __init setup_ioapic_ids_from { union IO_APIC_reg_00 reg_00; physid_mask_t phys_id_present_map; - int apic; + int apic_id; int i; unsigned char old_id; unsigned long flags; @@ -2184,26 +2233,26 @@ static void __init setup_ioapic_ids_from * This is broken; anything with a real cpu count has to * circumvent this idiocy regardless. */ - phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); + phys_id_present_map = apic->ioapic_phys_id_map(phys_cpu_present_map); /* * Set the IOAPIC ID to the value stored in the MPC table. */ - for (apic = 0; apic < nr_ioapics; apic++) { + for (apic_id = 0; apic_id < nr_ioapics; apic_id++) { /* Read the register 0 value */ spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); + reg_00.raw = io_apic_read(apic_id, 0); spin_unlock_irqrestore(&ioapic_lock, flags); - old_id = mp_ioapics[apic].mp_apicid; - if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) { + old_id = mp_ioapics[apic_id].apicid; + + if (mp_ioapics[apic_id].apicid >= get_physical_broadcast()) { printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", - apic, mp_ioapics[apic].mp_apicid); + apic_id, mp_ioapics[apic_id].apicid); printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", reg_00.bits.ID); - mp_ioapics[apic].mp_apicid = reg_00.bits.ID; + mp_ioapics[apic_id].apicid = reg_00.bits.ID; } /* @@ -2211,10 +2260,10 @@ static void __init setup_ioapic_ids_from * system must have a unique ID or we get lots of nice * 'stuck on smp_invalidate_needed IPI wait' messages. */ - if (check_apicid_used(phys_id_present_map, - mp_ioapics[apic].mp_apicid)) { + if (apic->check_apicid_used(phys_id_present_map, + mp_ioapics[apic_id].apicid)) { printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", - apic, mp_ioapics[apic].mp_apicid); + apic_id, mp_ioapics[apic_id].apicid); for (i = 0; i < get_physical_broadcast(); i++) if (!physid_isset(i, phys_id_present_map)) break; @@ -2223,13 +2272,13 @@ static void __init setup_ioapic_ids_from printk(KERN_ERR "... fixing up to %d.
(tell your hw vendor)\n", i); physid_set(i, phys_id_present_map); - mp_ioapics[apic].mp_apicid = i; + mp_ioapics[apic_id].apicid = i; } else { physid_mask_t tmp; - tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid); + tmp = apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid); apic_printk(APIC_VERBOSE, "Setting %d in the " "phys_id_present_map\n", - mp_ioapics[apic].mp_apicid); + mp_ioapics[apic_id].apicid); physids_or(phys_id_present_map, phys_id_present_map, tmp); } @@ -2238,11 +2287,11 @@ static void __init setup_ioapic_ids_from * We need to adjust the IRQ routing table * if the ID changed. */ - if (old_id != mp_ioapics[apic].mp_apicid) + if (old_id != mp_ioapics[apic_id].apicid) for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mp_dstapic == old_id) - mp_irqs[i].mp_dstapic - = mp_ioapics[apic].mp_apicid; + if (mp_irqs[i].dstapic == old_id) + mp_irqs[i].dstapic + = mp_ioapics[apic_id].apicid; /* * Read the right value from the MPC table and @@ -2250,20 +2299,20 @@ static void __init setup_ioapic_ids_from */ apic_printk(APIC_VERBOSE, KERN_INFO "...changing IO-APIC physical APIC ID to %d ...", - mp_ioapics[apic].mp_apicid); + mp_ioapics[apic_id].apicid); - reg_00.bits.ID = mp_ioapics[apic].mp_apicid; + reg_00.bits.ID = mp_ioapics[apic_id].apicid; spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0, reg_00.raw); + io_apic_write(apic_id, 0, reg_00.raw); spin_unlock_irqrestore(&ioapic_lock, flags); /* * Sanity check */ spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); + reg_00.raw = io_apic_read(apic_id, 0); spin_unlock_irqrestore(&ioapic_lock, flags); - if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid) + if (reg_00.bits.ID != mp_ioapics[apic_id].apicid) printk("could not set ID!\n"); else apic_printk(APIC_VERBOSE, " ok.\n"); @@ -2366,7 +2415,7 @@ static int ioapic_retrigger_irq(unsigned unsigned long flags; spin_lock_irqsave(&vector_lock, flags); - send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); + apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); spin_unlock_irqrestore(&vector_lock, flags); return 1; @@ -2374,7 +2423,7 @@ static int ioapic_retrigger_irq(unsigned #else static int ioapic_retrigger_irq(unsigned int irq) { - send_IPI_self(irq_cfg(irq)->vector); + apic->send_IPI_self(irq_cfg(irq)->vector); return 1; } @@ -2392,37 +2441,24 @@ static int ioapic_retrigger_irq(unsigned #ifdef CONFIG_SMP #ifdef CONFIG_INTR_REMAP -static void ir_irq_migration(struct work_struct *work); - -static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration); /* * Migrate the IO-APIC irq in the presence of intr-remapping. * - * For edge triggered, irq migration is a simple atomic update(of vector - * and cpu destination) of IRTE and flush the hardware cache. - * - * For level triggered, we need to modify the io-apic RTE aswell with the update - * vector information, along with modifying IRTE with vector and destination. - * So irq migration for level triggered is little bit more complex compared to - * edge triggered migration. But the good news is, we use the same algorithm - * for level triggered migration as we have today, only difference being, - * we now initiate the irq migration from process context instead of the - * interrupt context. + * For both level and edge triggered, irq migration is a simple atomic + * update(of vector and cpu destination) of IRTE and flush the hardware cache. 
* - * In future, when we do a directed EOI (combined with cpu EOI broadcast - * suppression) to the IO-APIC, level triggered irq migration will also be - * as simple as edge triggered migration and we can do the irq migration - * with a simple atomic update to IO-APIC RTE. + * For level triggered, we eliminate the io-apic RTE modification (with the + * updated vector information), by using a virtual vector (io-apic pin number). + * The real vector that is used for interrupting the cpu comes from + * the interrupt-remapping table entry. */ static void migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; struct irte irte; - int modify_ioapic_rte; unsigned int dest; - unsigned long flags; unsigned int irq; if (!cpumask_intersects(mask, cpu_online_mask)) @@ -2438,14 +2474,7 @@ migrate_ioapic_irq_desc(struct irq_desc set_extra_move_desc(desc, mask); - dest = cpu_mask_to_apicid_and(cfg->domain, mask); - - modify_ioapic_rte = desc->status & IRQ_LEVEL; - if (modify_ioapic_rte) { - spin_lock_irqsave(&ioapic_lock, flags); - __target_IO_APIC_irq(irq, dest, cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); - } + dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); irte.vector = cfg->vector; irte.dest_id = IRTE_DEST(dest); @@ -2458,61 +2487,7 @@ migrate_ioapic_irq_desc(struct irq_desc if (cfg->move_in_progress) send_cleanup_vector(cfg); - cpumask_copy(&desc->affinity, mask); -} - -static int migrate_irq_remapped_level_desc(struct irq_desc *desc) -{ - int ret = -1; - struct irq_cfg *cfg = desc->chip_data; - - mask_IO_APIC_irq_desc(desc); - - if (io_apic_level_ack_pending(cfg)) { - /* - * Interrupt in progress. Migrating irq now will change the - * vector information in the IO-APIC RTE and that will confuse - * the EOI broadcast performed by cpu. - * So, delay the irq migration to the next instance. - */ - schedule_delayed_work(&ir_migration_work, 1); - goto unmask; - } - - /* everthing is clear.
we have right of way */ - migrate_ioapic_irq_desc(desc, &desc->pending_mask); - - ret = 0; - desc->status &= ~IRQ_MOVE_PENDING; - cpumask_clear(&desc->pending_mask); - -unmask: - unmask_IO_APIC_irq_desc(desc); - - return ret; -} - -static void ir_irq_migration(struct work_struct *work) -{ - unsigned int irq; - struct irq_desc *desc; - - for_each_irq_desc(irq, desc) { - if (desc->status & IRQ_MOVE_PENDING) { - unsigned long flags; - - spin_lock_irqsave(&desc->lock, flags); - if (!desc->chip->set_affinity || - !(desc->status & IRQ_MOVE_PENDING)) { - desc->status &= ~IRQ_MOVE_PENDING; - spin_unlock_irqrestore(&desc->lock, flags); - continue; - } - - desc->chip->set_affinity(irq, &desc->pending_mask); - spin_unlock_irqrestore(&desc->lock, flags); - } - } + cpumask_copy(desc->affinity, mask); } /* @@ -2521,13 +2496,6 @@ static void ir_irq_migration(struct work static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { - if (desc->status & IRQ_LEVEL) { - desc->status |= IRQ_MOVE_PENDING; - cpumask_copy(&desc->pending_mask, mask); - migrate_irq_remapped_level_desc(desc); - return; - } - migrate_ioapic_irq_desc(desc, mask); } static void set_ir_ioapic_affinity_irq(unsigned int irq, @@ -2537,6 +2505,11 @@ static void set_ir_ioapic_affinity_irq(u set_ir_ioapic_affinity_irq_desc(desc, mask); } +#else +static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, + const struct cpumask *mask) +{ +} #endif asmlinkage void smp_irq_move_cleanup_interrupt(void) @@ -2550,6 +2523,7 @@ asmlinkage void smp_irq_move_cleanup_int me = smp_processor_id(); for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { unsigned int irq; + unsigned int irr; struct irq_desc *desc; struct irq_cfg *cfg; irq = __get_cpu_var(vector_irq)[vector]; @@ -2569,6 +2543,18 @@ asmlinkage void smp_irq_move_cleanup_int if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) goto unlock; + irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); + /* + * Check if the vector that needs to be cleaned up is + * registered at the cpu's IRR. If so, then this is not + * the best time to clean it up. Let's clean it up in the + * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR + * to myself.
+ */ + if (irr & (1 << (vector % 32))) { + apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); + goto unlock; + } __get_cpu_var(vector_irq)[vector] = -1; cfg->move_cleanup_count--; unlock: @@ -2591,7 +2577,7 @@ static void irq_complete_move(struct irq /* domain has not changed, but affinity did */ me = smp_processor_id(); - if (cpu_isset(me, desc->affinity)) { + if (cpumask_test_cpu(me, desc->affinity)) { *descp = desc = move_irq_desc(desc, me); /* get the new one */ cfg = desc->chip_data; @@ -2617,17 +2603,51 @@ static void irq_complete_move(struct irq static inline void irq_complete_move(struct irq_desc **descp) {} #endif -#ifdef CONFIG_INTR_REMAP +static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) +{ + int apic, pin; + struct irq_pin_list *entry; + + entry = cfg->irq_2_pin; + for (;;) { + + if (!entry) + break; + + apic = entry->apic; + pin = entry->pin; + io_apic_eoi(apic, pin); + entry = entry->next; + } +} + +static void +eoi_ioapic_irq(struct irq_desc *desc) +{ + struct irq_cfg *cfg; + unsigned long flags; + unsigned int irq; + + irq = desc->irq; + cfg = desc->chip_data; + + spin_lock_irqsave(&ioapic_lock, flags); + __eoi_ioapic_irq(irq, cfg); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +#ifdef CONFIG_X86_X2APIC static void ack_x2apic_level(unsigned int irq) { + struct irq_desc *desc = irq_to_desc(irq); ack_x2APIC_irq(); + eoi_ioapic_irq(desc); } static void ack_x2apic_edge(unsigned int irq) { ack_x2APIC_irq(); } - #endif static void ack_apic_edge(unsigned int irq) @@ -2693,6 +2713,9 @@ static void ack_apic_level(unsigned int */ ack_APIC_irq(); + if (irq_remapped(irq)) + eoi_ioapic_irq(desc); + /* Now we can move and renable the irq */ if (unlikely(do_unmask_irq)) { /* Only migrate the irq if the ack has been received. @@ -2738,6 +2761,26 @@ static void ack_apic_level(unsigned int #endif } +#ifdef CONFIG_INTR_REMAP +static void ir_ack_apic_edge(unsigned int irq) +{ +#ifdef CONFIG_X86_X2APIC + if (x2apic_enabled()) + return ack_x2apic_edge(irq); +#endif + return ack_apic_edge(irq); +} + +static void ir_ack_apic_level(unsigned int irq) +{ +#ifdef CONFIG_X86_X2APIC + if (x2apic_enabled()) + return ack_x2apic_level(irq); +#endif + return ack_apic_level(irq); +} +#endif /* CONFIG_INTR_REMAP */ + static struct irq_chip ioapic_chip __read_mostly = { .name = "IO-APIC", .startup = startup_ioapic_irq, @@ -2751,20 +2794,20 @@ static struct irq_chip ioapic_chip __rea .retrigger = ioapic_retrigger_irq, }; -#ifdef CONFIG_INTR_REMAP static struct irq_chip ir_ioapic_chip __read_mostly = { .name = "IR-IO-APIC", .startup = startup_ioapic_irq, .mask = mask_IO_APIC_irq, .unmask = unmask_IO_APIC_irq, - .ack = ack_x2apic_edge, - .eoi = ack_x2apic_level, +#ifdef CONFIG_INTR_REMAP + .ack = ir_ack_apic_edge, + .eoi = ir_ack_apic_level, #ifdef CONFIG_SMP .set_affinity = set_ir_ioapic_affinity_irq, #endif +#endif .retrigger = ioapic_retrigger_irq, }; -#endif #endif /* CONFIG_XEN */ static inline void init_IO_APIC_traps(void) @@ -2786,7 +2829,7 @@ static inline void init_IO_APIC_traps(vo */ for_each_irq_desc(irq, desc) { #ifdef CONFIG_XEN - if (irq < PIRQ_BASE || irq >= PIRQ_BASE + NR_PIRQS) + if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs) continue; #endif cfg = desc->chip_data; @@ -2948,19 +2991,15 @@ static inline void __init check_timer(vo int cpu = boot_cpu_id; int apic1, pin1, apic2, pin2; unsigned long flags; - unsigned int ver; int no_pin1 = 0; local_irq_save(flags); - ver = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(ver); - /* * get/set the timer IRQ vector: */ 
disable_8259A_irq(0); - assign_irq_vector(0, cfg, TARGET_CPUS); + assign_irq_vector(0, cfg, apic->target_cpus()); /* * As IRQ0 is to be enabled in the 8259A, the virtual @@ -2974,7 +3013,13 @@ static inline void __init check_timer(vo apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); init_8259A(1); #ifdef CONFIG_X86_32 - timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); + { + unsigned int ver; + + ver = apic_read(APIC_LVR); + ver = GET_APIC_VERSION(ver); + timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); + } #endif pin1 = find_isa_irq_pin(0, mp_INT); @@ -2994,10 +3039,8 @@ static inline void __init check_timer(vo * 8259A. */ if (pin1 == -1) { -#ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) panic("BIOS bug: timer not connected to IO-APIC"); -#endif pin1 = pin2; apic1 = apic2; no_pin1 = 1; @@ -3013,8 +3056,17 @@ static inline void __init check_timer(vo if (no_pin1) { add_pin_to_irq_cpu(cfg, cpu, apic1, pin1); setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); + } else { + /* for edge trigger, setup_IO_APIC_irq already + * leaves it unmasked, + * so we only need to unmask it if it is level-triggered. + * do we really have a level-triggered timer? + */ + int idx; + idx = find_irq_entry(apic1, pin1, mp_INT); + if (idx != -1 && irq_trigger(idx)) + unmask_IO_APIC_irq_desc(desc); } - unmask_IO_APIC_irq_desc(desc); if (timer_irq_works()) { if (nmi_watchdog == NMI_IO_APIC) { setup_nmi(); @@ -3024,10 +3076,9 @@ static inline void __init check_timer(vo clear_IO_APIC_pin(0, pin1); goto out; } -#ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) panic("timer doesn't work through Interrupt-remapped IO-APIC"); -#endif + local_irq_disable(); clear_IO_APIC_pin(apic1, pin1); if (!no_pin1) apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " @@ -3042,7 +3093,6 @@ static inline void __init check_timer(vo */ replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2); setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); - unmask_IO_APIC_irq_desc(desc); enable_8259A_irq(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); @@ -3057,6 +3107,7 @@ static inline void __init check_timer(vo /* * Cleanup, just in case ... */ + local_irq_disable(); disable_8259A_irq(0); clear_IO_APIC_pin(apic2, pin2); apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); @@ -3082,6 +3133,7 @@ static inline void __init check_timer(vo apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } + local_irq_disable(); disable_8259A_irq(0); apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); @@ -3099,6 +3151,7 @@ static inline void __init check_timer(vo apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } + local_irq_disable(); apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a
Then try booting with the 'noapic' option.\n"); @@ -3131,7 +3184,7 @@ out: void __init setup_IO_APIC(void) { -#if defined(CONFIG_X86_32) || defined(CONFIG_XEN) +#ifdef CONFIG_XEN enable_IO_APIC(); #else /* @@ -3213,8 +3266,8 @@ static int ioapic_resume(struct sys_devi spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(dev->id, 0); - if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) { - reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid; + if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) { + reg_00.bits.ID = mp_ioapics[dev->id].apicid; io_apic_write(dev->id, 0, reg_00.raw); } spin_unlock_irqrestore(&ioapic_lock, flags); @@ -3264,6 +3317,7 @@ static int __init ioapic_init_sysfs(void device_initcall(ioapic_init_sysfs); +static int nr_irqs_gsi = NR_IRQS_LEGACY; /* * Dynamic irq allocate and deallocation */ @@ -3278,11 +3332,11 @@ unsigned int create_irq_nr(unsigned int struct irq_desc *desc_new = NULL; irq = 0; - spin_lock_irqsave(&vector_lock, flags); - for (new = irq_want; new < NR_IRQS; new++) { - if (platform_legacy_irq(new)) - continue; + if (irq_want < nr_irqs_gsi) + irq_want = nr_irqs_gsi; + spin_lock_irqsave(&vector_lock, flags); + for (new = irq_want; new < nr_irqs; new++) { desc_new = irq_to_desc_alloc_cpu(new, cpu); if (!desc_new) { printk(KERN_INFO "can not get irq_desc for %d\n", new); @@ -3292,7 +3346,7 @@ unsigned int create_irq_nr(unsigned int if (cfg_new->vector != 0) continue; - if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0) + if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) irq = new; break; } @@ -3307,7 +3361,6 @@ unsigned int create_irq_nr(unsigned int return irq; } -static int nr_irqs_gsi = NR_IRQS_LEGACY; int create_irq(void) { unsigned int irq_want; @@ -3336,9 +3389,7 @@ void destroy_irq(unsigned int irq) if (desc) desc->chip_data = cfg; -#ifdef CONFIG_INTR_REMAP free_irte(irq); -#endif spin_lock_irqsave(&vector_lock, flags); __clear_irq_vector(irq, cfg); spin_unlock_irqrestore(&vector_lock, flags); @@ -3355,14 +3406,16 @@ static int msi_compose_msg(struct pci_de int err; unsigned dest; + if (disable_apic) + return -ENXIO; + cfg = irq_cfg(irq); - err = assign_irq_vector(irq, cfg, TARGET_CPUS); + err = assign_irq_vector(irq, cfg, apic->target_cpus()); if (err) return err; - dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); + dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); -#ifdef CONFIG_INTR_REMAP if (irq_remapped(irq)) { struct irte irte; int ir_index; @@ -3374,9 +3427,9 @@ static int msi_compose_msg(struct pci_de memset (&irte, 0, sizeof(irte)); irte.present = 1; - irte.dst_mode = INT_DEST_MODE; + irte.dst_mode = apic->irq_dest_mode; irte.trigger_mode = 0; /* edge */ - irte.dlvry_mode = INT_DELIVERY_MODE; + irte.dlvry_mode = apic->irq_delivery_mode; irte.vector = cfg->vector; irte.dest_id = IRTE_DEST(dest); @@ -3388,16 +3441,19 @@ static int msi_compose_msg(struct pci_de MSI_ADDR_IR_SHV | MSI_ADDR_IR_INDEX1(ir_index) | MSI_ADDR_IR_INDEX2(ir_index); - } else -#endif - { - msg->address_hi = MSI_ADDR_BASE_HI; + } else { + if (x2apic_enabled()) + msg->address_hi = MSI_ADDR_BASE_HI | + MSI_ADDR_EXT_DEST_ID(dest); + else + msg->address_hi = MSI_ADDR_BASE_HI; + msg->address_lo = MSI_ADDR_BASE_LO | - ((INT_DEST_MODE == 0) ? + ((apic->irq_dest_mode == 0) ? MSI_ADDR_DEST_MODE_PHYSICAL: MSI_ADDR_DEST_MODE_LOGICAL) | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? + ((apic->irq_delivery_mode != dest_LowestPrio) ? 
MSI_ADDR_REDIRECTION_CPU: MSI_ADDR_REDIRECTION_LOWPRI) | MSI_ADDR_DEST_ID(dest); @@ -3405,7 +3461,7 @@ static int msi_compose_msg(struct pci_de msg->data = MSI_DATA_TRIGGER_EDGE | MSI_DATA_LEVEL_ASSERT | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? + ((apic->irq_delivery_mode != dest_LowestPrio) ? MSI_DATA_DELIVERY_FIXED: MSI_DATA_DELIVERY_LOWPRI) | MSI_DATA_VECTOR(cfg->vector); @@ -3491,15 +3547,16 @@ static struct irq_chip msi_chip = { .retrigger = ioapic_retrigger_irq, }; -#ifdef CONFIG_INTR_REMAP static struct irq_chip msi_ir_chip = { .name = "IR-PCI-MSI", .unmask = unmask_msi_irq, .mask = mask_msi_irq, - .ack = ack_x2apic_edge, +#ifdef CONFIG_INTR_REMAP + .ack = ir_ack_apic_edge, #ifdef CONFIG_SMP .set_affinity = ir_set_msi_irq_affinity, #endif +#endif .retrigger = ioapic_retrigger_irq, }; @@ -3529,7 +3586,6 @@ static int msi_alloc_irte(struct pci_dev } return index; } -#endif static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) { @@ -3543,7 +3599,6 @@ static int setup_msi_irq(struct pci_dev set_irq_msi(irq, msidesc); write_msi_msg(irq, &msg); -#ifdef CONFIG_INTR_REMAP if (irq_remapped(irq)) { struct irq_desc *desc = irq_to_desc(irq); /* @@ -3552,7 +3607,6 @@ static int setup_msi_irq(struct pci_dev desc->status |= IRQ_MOVE_PCNTXT; set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); } else -#endif set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq); @@ -3560,60 +3614,26 @@ static int setup_msi_irq(struct pci_dev return 0; } -int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc) -{ - unsigned int irq; - int ret; - unsigned int irq_want; - - irq_want = nr_irqs_gsi; - irq = create_irq_nr(irq_want); - if (irq == 0) - return -1; - -#ifdef CONFIG_INTR_REMAP - if (!intr_remapping_enabled) - goto no_ir; - - ret = msi_alloc_irte(dev, irq, 1); - if (ret < 0) - goto error; -no_ir: -#endif - ret = setup_msi_irq(dev, msidesc, irq); - if (ret < 0) { - destroy_irq(irq); - return ret; - } - return 0; - -#ifdef CONFIG_INTR_REMAP -error: - destroy_irq(irq); - return ret; -#endif -} - int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { unsigned int irq; int ret, sub_handle; struct msi_desc *msidesc; unsigned int irq_want; - -#ifdef CONFIG_INTR_REMAP - struct intel_iommu *iommu = 0; + struct intel_iommu *iommu = NULL; int index = 0; -#endif + + /* x86 doesn't support multiple MSI yet */ + if (type == PCI_CAP_ID_MSI && nvec > 1) + return 1; irq_want = nr_irqs_gsi; sub_handle = 0; list_for_each_entry(msidesc, &dev->msi_list, list) { irq = create_irq_nr(irq_want); - irq_want++; if (irq == 0) return -1; -#ifdef CONFIG_INTR_REMAP + irq_want = irq + 1; if (!intr_remapping_enabled) goto no_ir; @@ -3641,7 +3661,6 @@ int arch_setup_msi_irqs(struct pci_dev * set_irte_irq(irq, iommu, index, sub_handle); } no_ir: -#endif ret = setup_msi_irq(dev, msidesc, irq); if (ret < 0) goto error; @@ -3659,7 +3678,7 @@ void arch_teardown_msi_irq(unsigned int destroy_irq(irq); } -#ifdef CONFIG_DMAR +#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP) #ifdef CONFIG_SMP static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { @@ -3740,7 +3759,7 @@ static void hpet_msi_set_affinity(unsign #endif /* CONFIG_SMP */ -struct irq_chip hpet_msi_type = { +static struct irq_chip hpet_msi_type = { .name = "HPET_MSI", .unmask = hpet_msi_unmask, .mask = hpet_msi_mask, @@ -3755,12 +3774,14 @@ int arch_setup_hpet_msi(unsigned int irq { int ret; struct 
msi_msg msg; + struct irq_desc *desc = irq_to_desc(irq); ret = msi_compose_msg(NULL, irq, &msg); if (ret < 0) return ret; hpet_msi_write(irq, &msg); + desc->status |= IRQ_MOVE_PCNTXT; set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq, "edge"); @@ -3823,13 +3844,17 @@ int arch_setup_ht_irq(unsigned int irq, struct irq_cfg *cfg; int err; + if (disable_apic) + return -ENXIO; + cfg = irq_cfg(irq); - err = assign_irq_vector(irq, cfg, TARGET_CPUS); + err = assign_irq_vector(irq, cfg, apic->target_cpus()); if (!err) { struct ht_irq_msg msg; unsigned dest; - dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); + dest = apic->cpu_mask_to_apicid_and(cfg->domain, + apic->target_cpus()); msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); @@ -3837,11 +3862,11 @@ int arch_setup_ht_irq(unsigned int irq, HT_IRQ_LOW_BASE | HT_IRQ_LOW_DEST_ID(dest) | HT_IRQ_LOW_VECTOR(cfg->vector) | - ((INT_DEST_MODE == 0) ? + ((apic->irq_dest_mode == 0) ? HT_IRQ_LOW_DM_PHYSICAL : HT_IRQ_LOW_DM_LOGICAL) | HT_IRQ_LOW_RQEOI_EDGE | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? + ((apic->irq_delivery_mode != dest_LowestPrio) ? HT_IRQ_LOW_MT_FIXED : HT_IRQ_LOW_MT_ARBITRATED) | HT_IRQ_LOW_IRQ_MASKED; @@ -3857,7 +3882,7 @@ int arch_setup_ht_irq(unsigned int irq, } #endif /* CONFIG_HT_IRQ */ -#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) +#ifdef CONFIG_X86_UV /* * Re-target the irq to the specified CPU and enable the specified MMR located * on the specified blade to allow the sending of MSIs to the specified CPU. @@ -3889,12 +3914,12 @@ int arch_enable_uv_irq(char *irq_name, u BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); entry->vector = cfg->vector; - entry->delivery_mode = INT_DELIVERY_MODE; - entry->dest_mode = INT_DEST_MODE; + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; entry->polarity = 0; entry->trigger = 0; entry->mask = 0; - entry->dest = cpu_mask_to_apicid(eligible_cpu); + entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); mmr_pnode = uv_blade_to_pnode(mmr_blade); uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); @@ -3957,7 +3982,29 @@ void __init probe_nr_irqs_gsi(void) printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); } + +#ifdef CONFIG_SPARSE_IRQ +int __init arch_probe_nr_irqs(void) +{ + int nr; + + if (nr_irqs > (NR_VECTORS * nr_cpu_ids)) + nr_irqs = NR_VECTORS * nr_cpu_ids; + + nr = nr_irqs_gsi + 8 * nr_cpu_ids; +#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ) + /* + * for MSI and HT dyn irq + */ + nr += nr_irqs_gsi * 16; #endif + if (nr < nr_irqs) + nr_irqs = nr; + + return 0; +} +#endif +#endif /* CONFIG_XEN */ /* -------------------------------------------------------------------------- ACPI-based IOAPIC Configuration @@ -3985,7 +4032,7 @@ int __init io_apic_get_unique_id(int ioa */ if (physids_empty(apic_id_map)) - apic_id_map = ioapic_phys_id_map(phys_cpu_present_map); + apic_id_map = apic->ioapic_phys_id_map(phys_cpu_present_map); spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(ioapic, 0); @@ -4001,10 +4048,10 @@ int __init io_apic_get_unique_id(int ioa * Every APIC in a system must have a unique ID or we get lots of nice * 'stuck on smp_invalidate_needed IPI wait' messages. 
*/ - if (check_apicid_used(apic_id_map, apic_id)) { + if (apic->check_apicid_used(apic_id_map, apic_id)) { for (i = 0; i < get_physical_broadcast(); i++) { - if (!check_apicid_used(apic_id_map, i)) + if (!apic->check_apicid_used(apic_id_map, i)) break; } @@ -4017,7 +4064,7 @@ int __init io_apic_get_unique_id(int ioa apic_id = i; } - tmp = apicid_to_cpu_present(apic_id); + tmp = apic->apicid_to_cpu_present(apic_id); physids_or(apic_id_map, apic_id_map, tmp); if (reg_00.bits.ID != apic_id) { @@ -4062,7 +4109,7 @@ int io_apic_set_pci_routing (int ioapic, int cpu = boot_cpu_id; #ifdef CONFIG_XEN - if (irq < PIRQ_BASE || irq >= PIRQ_BASE + NR_PIRQS) { + if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs) { apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ %d\n", ioapic, irq); return -EINVAL; @@ -4103,8 +4150,8 @@ int acpi_get_override_irq(int bus_irq, i return -1; for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mp_irqtype == mp_INT && - mp_irqs[i].mp_srcbusirq == bus_irq) + if (mp_irqs[i].irqtype == mp_INT && + mp_irqs[i].srcbusirq == bus_irq) break; if (i >= mp_irq_entries) return -1; @@ -4120,7 +4167,7 @@ int acpi_get_override_irq(int bus_irq, i /* * This function currently is only a helper for the i386 smp boot process where * we need to reprogram the ioredtbls to cater for the cpus which have come online - * so mask in all cases should simply be TARGET_CPUS + * so mask in all cases should simply be apic->target_cpus() */ #ifdef CONFIG_SMP void __init setup_ioapic_dest(void) @@ -4159,15 +4206,13 @@ void __init setup_ioapic_dest(void) */ if (desc->status & (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) - mask = &desc->affinity; + mask = desc->affinity; else - mask = TARGET_CPUS; + mask = apic->target_cpus(); -#ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) set_ir_ioapic_affinity_irq_desc(desc, mask); else -#endif set_ioapic_affinity_irq_desc(desc, mask); } @@ -4220,7 +4265,7 @@ void __init ioapic_init_mappings(void) ioapic_res = ioapic_setup_resources(); for (i = 0; i < nr_ioapics; i++) { if (smp_found_config) { - ioapic_phys = mp_ioapics[i].mp_apicaddr; + ioapic_phys = mp_ioapics[i].apicaddr; #ifdef CONFIG_X86_32 if (!ioapic_phys) { printk(KERN_ERR @@ -4260,9 +4305,12 @@ static int __init ioapic_insert_resource struct resource *r = ioapic_resources; if (!r) { - printk(KERN_ERR - "IO APIC resources could be not be allocated.\n"); - return -1; + if (nr_ioapics > 0) { + printk(KERN_ERR + "IO APIC resources couldn't be allocated.\n"); + return -1; + } + return 0; } for (i = 0; i < nr_ioapics; i++) { --- head-2010-05-25.orig/arch/x86/kernel/apic/ipi-xen.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/apic/ipi-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -17,38 +17,8 @@ #include #include #include +#include -#ifdef CONFIG_X86_32 -#ifndef CONFIG_XEN -#include -#include - -/* - * the following functions deal with sending IPIs between CPUs. - * - * We use 'broadcast', CPU->CPU IPIs and self-IPIs too. 
- */ - -static inline int __prepare_ICR(unsigned int shortcut, int vector) -{ - unsigned int icr = shortcut | APIC_DEST_LOGICAL; - - switch (vector) { - default: - icr |= APIC_DM_FIXED | vector; - break; - case NMI_VECTOR: - icr |= APIC_DM_NMI; - break; - } - return icr; -} - -static inline int __prepare_ICR2(unsigned int mask) -{ - return SET_APIC_DEST_FIELD(mask); -} -#else #include DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]); @@ -59,36 +29,10 @@ static inline void __send_IPI_one(unsign BUG_ON(irq < 0); notify_remote_via_irq(irq); } -#endif -void __send_IPI_shortcut(unsigned int shortcut, int vector) +static void __send_IPI_shortcut(unsigned int shortcut, int vector) { -#ifndef CONFIG_XEN - /* - * Subtle. In the case of the 'never do double writes' workaround - * we have to lock out interrupts to be safe. As we don't care - * of the value read we use an atomic rmw access to avoid costly - * cli/sti. Otherwise we use an even cheaper single atomic write - * to the APIC. - */ - unsigned int cfg; - - /* - * Wait for idle. - */ - apic_wait_icr_idle(); - - /* - * No need to touch the target chip field - */ - cfg = __prepare_ICR(shortcut, vector); - - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - apic_write(APIC_ICR, cfg); -#else - int cpu; + unsigned int cpu; switch (shortcut) { case APIC_DEST_SELF: @@ -99,149 +43,53 @@ void __send_IPI_shortcut(unsigned int sh if (cpu != smp_processor_id()) __send_IPI_one(cpu, vector); break; + case APIC_DEST_ALLINC: + for_each_online_cpu(cpu) + __send_IPI_one(cpu, vector); + break; default: printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut, vector); break; } -#endif } -void send_IPI_self(int vector) +void xen_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { - __send_IPI_shortcut(APIC_DEST_SELF, vector); -} - -#ifndef CONFIG_XEN -/* - * This is used to send an IPI with no shorthand notation (the destination is - * specified in bits 56 to 63 of the ICR). - */ -static inline void __send_IPI_dest_field(unsigned long mask, int vector) -{ - unsigned long cfg; - - /* - * Wait for idle. - */ - if (unlikely(vector == NMI_VECTOR)) - safe_apic_wait_icr_idle(); - else - apic_wait_icr_idle(); - - /* - * prepare target chip field - */ - cfg = __prepare_ICR2(mask); - apic_write(APIC_ICR2, cfg); - - /* - * program the ICR - */ - cfg = __prepare_ICR(0, vector); - - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - apic_write(APIC_ICR, cfg); -} -#endif - -/* - * This is only used on smaller machines. - */ -void send_IPI_mask_bitmask(const struct cpumask *cpumask, int vector) -{ -#ifndef CONFIG_XEN - unsigned long mask = cpumask_bits(cpumask)[0]; -#else unsigned int cpu; -#endif unsigned long flags; local_irq_save(flags); -#ifndef CONFIG_XEN - WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]); - __send_IPI_dest_field(mask, vector); -#else WARN_ON(!cpumask_subset(cpumask, cpu_online_mask)); for_each_cpu_and(cpu, cpumask, cpu_online_mask) - __send_IPI_one(cpu, vector); -#endif + if (cpu != smp_processor_id()) + __send_IPI_one(cpu, vector); local_irq_restore(flags); } -void send_IPI_mask_sequence(const struct cpumask *mask, int vector) +void xen_send_IPI_mask(const struct cpumask *cpumask, int vector) { -#ifndef CONFIG_XEN + unsigned int cpu; unsigned long flags; - unsigned int query_cpu; - - /* - * Hack. The clustered APIC addressing mode doesn't allow us to send - * to an arbitrary mask, so I do a unicasts to each CPU instead. 
This - * should be modified to do 1 message per cluster ID - mbligh - */ local_irq_save(flags); - for_each_cpu(query_cpu, mask) - __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), vector); + WARN_ON(!cpumask_subset(cpumask, cpu_online_mask)); + for_each_cpu_and(cpu, cpumask, cpu_online_mask) + __send_IPI_one(cpu, vector); local_irq_restore(flags); -#else - send_IPI_mask_bitmask(mask, vector); -#endif } -void send_IPI_mask_allbutself(const struct cpumask *mask, int vector) +void xen_send_IPI_allbutself(int vector) { - unsigned long flags; - unsigned int query_cpu; - unsigned int this_cpu = smp_processor_id(); - - /* See Hack comment above */ - - local_irq_save(flags); -#ifndef CONFIG_XEN - for_each_cpu(query_cpu, mask) - if (query_cpu != this_cpu) - __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), - vector); -#else - WARN_ON(!cpumask_subset(mask, cpu_online_mask)); - for_each_cpu_and(query_cpu, mask, cpu_online_mask) - if (query_cpu != this_cpu) - __send_IPI_one(query_cpu, vector); -#endif - local_irq_restore(flags); + __send_IPI_shortcut(APIC_DEST_ALLBUT, vector); } -#ifndef CONFIG_XEN -/* must come after the send_IPI functions above for inlining */ -static int convert_apicid_to_cpu(int apic_id) -{ - int i; - - for_each_possible_cpu(i) { - if (per_cpu(x86_cpu_to_apicid, i) == apic_id) - return i; - } - return -1; +void xen_send_IPI_all(int vector) +{ + __send_IPI_shortcut(APIC_DEST_ALLINC, vector); } -int safe_smp_processor_id(void) +void xen_send_IPI_self(int vector) { - int apicid, cpuid; - - if (!boot_cpu_has(X86_FEATURE_APIC)) - return 0; - - apicid = hard_smp_processor_id(); - if (apicid == BAD_APICID) - return 0; - - cpuid = convert_apicid_to_cpu(apicid); - - return cpuid >= 0 ? cpuid : 0; + __send_IPI_shortcut(APIC_DEST_SELF, vector); } -#endif -#endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ head-2010-05-25/arch/x86/kernel/apic/probe_32-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -0,0 +1,69 @@ +/* + * Default generic APIC driver. This handles up to 8 CPUs. + * + * Copyright 2003 Andi Kleen, SuSE Labs. + * Subject to the GNU Public License, v.2 + * + * Generic x86 APIC driver probe layer. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +static int xen_phys_pkg_id(int cpuid_apic, int index_msb) +{ + return cpuid_apic; +} + +static struct apic apic_xen = { + + .name = "default", + + .irq_delivery_mode = dest_LowestPrio, + /* logical delivery broadcast to all CPUs: */ + .irq_dest_mode = 1, + + .target_cpus = default_target_cpus, + + .phys_pkg_id = xen_phys_pkg_id, + .mps_oem_check = NULL, + +#ifdef CONFIG_SMP + .send_IPI_mask = xen_send_IPI_mask, + .send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself, + .send_IPI_allbutself = xen_send_IPI_allbutself, + .send_IPI_all = xen_send_IPI_all, + .send_IPI_self = xen_send_IPI_self, +#endif +}; + +struct apic *apic = &apic_xen; +EXPORT_SYMBOL_GPL(apic); --- head-2010-05-25.orig/arch/x86/kernel/asm-offsets_32.c 2010-03-24 15:10:37.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/asm-offsets_32.c 2010-03-24 15:25:06.000000000 +0100 @@ -115,6 +115,11 @@ void foo(void) OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); +#ifdef CONFIG_XEN + BLANK(); + OFFSET(XEN_START_mfn_list, start_info, mfn_list); +#endif + #ifdef CONFIG_PARAVIRT BLANK(); OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); --- head-2010-05-25.orig/arch/x86/kernel/cpu/common-xen.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/cpu/common-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -1,101 +1,94 @@ -#include -#include -#include -#include #include +#include #include +#include #include -#include -#include +#include +#include #include +#include +#include +#include #include -#include -#include -#include -#include -#include +#include + +#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include +#include +#include +#include #include +#include #include -#include -#include #include + #ifdef CONFIG_X86_LOCAL_APIC -#include -#include -#include -#include -#elif defined(CONFIG_X86_64_XEN) -#include +#include #endif -#include -#include -#include -#include -#include -#include -#include -#include -#include - #ifdef CONFIG_XEN -#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_LOCAL_APIC) -#define phys_pkg_id(a,b) a -#endif #include #endif #include "cpu.h" -#ifdef CONFIG_X86_64 - /* all of these masks are initialized in setup_cpu_local_masks() */ +cpumask_var_t cpu_initialized_mask; #ifndef CONFIG_XEN -cpumask_var_t cpu_callin_mask; cpumask_var_t cpu_callout_mask; +cpumask_var_t cpu_callin_mask; #endif -cpumask_var_t cpu_initialized_mask; /* representing cpus for which sibling maps can be computed */ cpumask_var_t cpu_sibling_setup_mask; -#else /* CONFIG_X86_32 */ - +/* correctly size the local cpu masks */ +void __init setup_cpu_local_masks(void) +{ + alloc_bootmem_cpumask_var(&cpu_initialized_mask); #ifndef CONFIG_XEN -cpumask_t cpu_callin_map; -cpumask_t cpu_callout_map; + alloc_bootmem_cpumask_var(&cpu_callin_mask); + alloc_bootmem_cpumask_var(&cpu_callout_mask); #endif -cpumask_t cpu_initialized; -cpumask_t cpu_sibling_setup_map; - -#endif /* CONFIG_X86_32 */ - + alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); +} -static struct cpu_dev *this_cpu __cpuinitdata; +static const struct cpu_dev *this_cpu __cpuinitdata; +DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { #ifdef CONFIG_X86_64 -/* We 
need valid kernel segments for data and code in long mode too - * IRET will check the segment types kkeil 2000/10/28 - * Also sysret mandates a special GDT layout - */ -/* The TLS descriptors are currently at a different place compared to i386. - Hopefully nobody expects them at a fixed place (Wine?) */ -DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { - [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, - [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, - [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, - [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, - [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, - [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, -} }; + /* + * We need valid kernel segments for data and code in long mode too + * IRET will check the segment types kkeil 2000/10/28 + * Also sysret mandates a special GDT layout + * + * TLS descriptors are currently at a different place compared to i386. + * Hopefully nobody expects them at a fixed place (Wine?) + */ + [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, + [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, + [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, + [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, + [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, + [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, #else -DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { - [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, - [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, - [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, - [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } }, + [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, + [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, + [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, + [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } }, #ifndef CONFIG_XEN /* * Segments used for calling PnP BIOS have byte granularity. @@ -103,33 +96,41 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_p * the transfer segment sizes are set at run time. */ /* 32-bit code */ - [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } }, + [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } }, /* 16-bit code */ - [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } }, + [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } }, /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } }, + [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } }, /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } }, + [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } }, /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } }, + [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } }, /* * The APM segments have byte granularity and their bases * are set at run time. All have 64k limits. 
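Each GDT slot in the table here is spelled as two raw 32-bit words. For reference, this is how such a pair decodes under the architectural descriptor layout; a stand-alone sketch, not kernel code (the two sample values are the 32-bit kernel code and data segments from the table):

    #include <stdio.h>
    #include <stdint.h>

    /* Split an 8-byte segment descriptor, given as low/high words,
     * into base, limit, access byte and granularity flag. */
    static void decode_desc(uint32_t lo, uint32_t hi)
    {
        unsigned base   = (lo >> 16) | ((hi & 0xff) << 16) | (hi & 0xff000000);
        unsigned limit  = (lo & 0xffff) | (hi & 0x000f0000);
        unsigned access = (hi >> 8) & 0xff;
        unsigned gran   = (hi >> 23) & 1;

        printf("base=%#x limit=%#x%s access=%#x\n",
               base, limit, gran ? " (4K pages)" : "", access);
    }

    int main(void)
    {
        decode_desc(0x0000ffff, 0x00cf9a00);  /* flat ring-0 code, 4GB */
        decode_desc(0x0000ffff, 0x00cf9200);  /* flat ring-0 data, 4GB */
        return 0;
    }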
*/ /* 32-bit code */ - [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } }, + [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } }, /* 16-bit code */ - [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } }, + [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } }, /* data */ - [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, + [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, - [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, + [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, #endif - [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } }, -} }; + [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } }, + GDT_STACK_CANARY_INIT #endif +} }; EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); +static int __init x86_xsave_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_XSAVE); + return 1; +} +__setup("noxsave", x86_xsave_setup); + #ifdef CONFIG_X86_32 static int cachesize_override __cpuinitdata = -1; static int disable_x86_serial_nr __cpuinitdata = 1; @@ -168,16 +169,17 @@ static inline int flag_is_changeable_p(u * the CPUID. Add "volatile" to not allow gcc to * optimize the subsequent calls to this function. */ - asm volatile ("pushfl\n\t" - "pushfl\n\t" - "popl %0\n\t" - "movl %0,%1\n\t" - "xorl %2,%0\n\t" - "pushl %0\n\t" - "popfl\n\t" - "pushfl\n\t" - "popl %0\n\t" - "popfl\n\t" + asm volatile ("pushfl \n\t" + "pushfl \n\t" + "popl %0 \n\t" + "movl %0, %1 \n\t" + "xorl %2, %0 \n\t" + "pushl %0 \n\t" + "popfl \n\t" + "pushfl \n\t" + "popl %0 \n\t" + "popfl \n\t" + : "=&r" (f1), "=&r" (f2) : "ir" (flag)); @@ -192,18 +194,22 @@ static int __cpuinit have_cpuid_p(void) static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) { - if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) { - /* Disable processor serial number */ - unsigned long lo, hi; - rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi); - lo |= 0x200000; - wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi); - printk(KERN_NOTICE "CPU serial number disabled.\n"); - clear_cpu_cap(c, X86_FEATURE_PN); + unsigned long lo, hi; - /* Disabling the serial number may affect the cpuid level */ - c->cpuid_level = cpuid_eax(0); - } + if (!cpu_has(c, X86_FEATURE_PN) || !disable_x86_serial_nr) + return; + + /* Disable processor serial number: */ + + rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi); + lo |= 0x200000; + wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi); + + printk(KERN_NOTICE "CPU serial number disabled.\n"); + clear_cpu_cap(c, X86_FEATURE_PN); + + /* Disabling the serial number may affect the cpuid level */ + c->cpuid_level = cpuid_eax(0); } static int __init x86_serial_nr_setup(char *s) @@ -228,16 +234,64 @@ static inline void squash_the_stupid_ser #endif /* + * Some CPU features depend on higher CPUID levels, which may not always + * be available due to CPUID level capping or broken virtualization + * software. Add those features to this table to auto-disable them. 
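The cpuid_dependent_features[] table that follows pairs a feature bit with the lowest CPUID leaf that must exist for the bit to be trustworthy; filter_cpuid_features() clears the bit when the CPU (or a capping hypervisor) does not report that leaf. The same probe done from user space via GCC's cpuid.h, with leaf numbers taken from the table (the helper name is ours):

    #include <stdio.h>
    #include <cpuid.h>

    /* Is the given CPUID leaf within the CPU's advertised maximum?
     * Extended leaves (0x80000000+) check the extended maximum. */
    static int leaf_available(unsigned int leaf)
    {
        return __get_cpuid_max(leaf & 0x80000000, NULL) >= leaf;
    }

    int main(void)
    {
        printf("leaf 0x5 (MWAIT) %s\n",
               leaf_available(0x5) ? "present" : "missing");
        printf("leaf 0xd (XSAVE) %s\n",
               leaf_available(0xd) ? "present" : "missing");
        return 0;
    }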
+ */ +struct cpuid_dependent_feature { + u32 feature; + u32 level; +}; + +static const struct cpuid_dependent_feature __cpuinitconst +cpuid_dependent_features[] = { + { X86_FEATURE_MWAIT, 0x00000005 }, + { X86_FEATURE_DCA, 0x00000009 }, + { X86_FEATURE_XSAVE, 0x0000000d }, + { 0, 0 } +}; + +static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) +{ + const struct cpuid_dependent_feature *df; + + for (df = cpuid_dependent_features; df->feature; df++) { + + if (!cpu_has(c, df->feature)) + continue; + /* + * Note: cpuid_level is set to -1 if unavailable, but + * extended_extended_level is set to 0 if unavailable + * and the legitimate extended levels are all negative + * when signed; hence the weird messing around with + * signs here... + */ + if (!((s32)df->level < 0 ? + (u32)df->level > (u32)c->extended_cpuid_level : + (s32)df->level > (s32)c->cpuid_level)) + continue; + + clear_cpu_cap(c, df->feature); + if (!warn) + continue; + + printk(KERN_WARNING + "CPU: CPU feature %s disabled, no CPUID level 0x%x\n", + x86_cap_flags[df->feature], df->level); + } +} + +/* * Naming convention should be: <Name> [(<Codename>)] * This table only is used unless init_<vendor>() below doesn't set it; - * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used - * + * in particular, if CPUID levels 0x80000002..4 are supported, this + * isn't used */ /* Look up CPU names by table lookup. */ -static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) +static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c) { - struct cpu_model_info *info; + const struct cpu_model_info *info; if (c->x86_model >= 16) return NULL; /* Range check */ @@ -257,32 +311,52 @@ static char __cpuinit *table_lookup_mode __u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; -/* Current gdt points %fs at the "master" per-cpu area: after this, - * it's on the real one. */ -void switch_to_new_gdt(void) +void load_percpu_segment(int cpu) +{ +#ifdef CONFIG_X86_32 + loadsegment(fs, __KERNEL_PERCPU); +#else + loadsegment(gs, 0); +#ifndef CONFIG_XEN + wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu)); +#else + if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL, + (unsigned long)per_cpu(irq_stack_union.gs_base, cpu))) + BUG(); +#endif +#endif + load_stack_canary_segment(); +} + +/* + * Current gdt points %fs at the "master" per-cpu area: after this, + * it's on the real one.
+ */ +void switch_to_new_gdt(int cpu) { struct desc_ptr gdt_descr; unsigned long va, frames[16]; int f; - gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); + gdt_descr.address = (long)get_cpu_gdt_table(cpu); gdt_descr.size = GDT_SIZE - 1; for (va = gdt_descr.address, f = 0; va < gdt_descr.address + gdt_descr.size; va += PAGE_SIZE, f++) { - frames[f] = virt_to_mfn(va); - make_lowmem_page_readonly( - (void *)va, XENFEAT_writable_descriptor_tables); + frames[f] = arbitrary_virt_to_mfn(va); + make_page_readonly((void *)va, + XENFEAT_writable_descriptor_tables); } if (HYPERVISOR_set_gdt(frames, (gdt_descr.size + 1) / 8)) BUG(); -#ifdef CONFIG_X86_32 - asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory"); -#endif + + /* Reload the per-cpu base */ + + load_percpu_segment(cpu); } -static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; +static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {}; static void __cpuinit default_init(struct cpuinfo_x86 *c) { @@ -301,7 +375,7 @@ static void __cpuinit default_init(struc #endif } -static struct cpu_dev __cpuinitdata default_cpu = { +static const struct cpu_dev __cpuinitconst default_cpu = { .c_init = default_init, .c_vendor = "Unknown", .c_x86_vendor = X86_VENDOR_UNKNOWN, @@ -315,22 +389,24 @@ static void __cpuinit get_model_name(str if (c->extended_cpuid_level < 0x80000004) return; - v = (unsigned int *) c->x86_model_id; + v = (unsigned int *)c->x86_model_id; cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); c->x86_model_id[48] = 0; - /* Intel chips right-justify this string for some dumb reason; - undo that brain damage */ + /* + * Intel chips right-justify this string for some dumb reason; + * undo that brain damage: + */ p = q = &c->x86_model_id[0]; while (*p == ' ') - p++; + p++; if (p != q) { - while (*p) - *q++ = *p++; - while (q <= &c->x86_model_id[48]) - *q++ = '\0'; /* Zero-pad the rest */ + while (*p) + *q++ = *p++; + while (q <= &c->x86_model_id[48]) + *q++ = '\0'; /* Zero-pad the rest */ } } @@ -399,36 +475,30 @@ void __cpuinit detect_ht(struct cpuinfo_ if (smp_num_siblings == 1) { printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); - } else if (smp_num_siblings > 1) { + goto out; + } - if (smp_num_siblings > nr_cpu_ids) { - printk(KERN_WARNING "CPU: Unsupported number of siblings %d", - smp_num_siblings); - smp_num_siblings = 1; - return; - } + if (smp_num_siblings <= 1) + goto out; - index_msb = get_count_order(smp_num_siblings); -#ifdef CONFIG_X86_64 - c->phys_proc_id = phys_pkg_id(index_msb); -#else - c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb); -#endif + if (smp_num_siblings > nr_cpu_ids) { + pr_warning("CPU: Unsupported number of siblings %d", + smp_num_siblings); + smp_num_siblings = 1; + return; + } - smp_num_siblings = smp_num_siblings / c->x86_max_cores; + index_msb = get_count_order(smp_num_siblings); + c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb); - index_msb = get_count_order(smp_num_siblings); + smp_num_siblings = smp_num_siblings / c->x86_max_cores; - core_bits = get_count_order(c->x86_max_cores); + index_msb = get_count_order(smp_num_siblings); -#ifdef CONFIG_X86_64 - c->cpu_core_id = phys_pkg_id(index_msb) & - ((1 << core_bits) - 1); -#else - c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) & - ((1 << core_bits) - 1); -#endif - } + core_bits = get_count_order(c->x86_max_cores); + + c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) & + ((1 
<< core_bits) - 1); out: if ((c->x86_max_cores * smp_num_siblings) > 1) { @@ -443,8 +513,8 @@ out: static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) { char *v = c->x86_vendor_id; - int i; static int printed; + int i; for (i = 0; i < X86_VENDOR_NUM; i++) { if (!cpu_devs[i]) @@ -453,6 +523,7 @@ static void __cpuinit get_cpu_vendor(str if (!strcmp(v, cpu_devs[i]->c_ident[0]) || (cpu_devs[i]->c_ident[1] && !strcmp(v, cpu_devs[i]->c_ident[1]))) { + this_cpu = cpu_devs[i]; c->x86_vendor = this_cpu->c_x86_vendor; return; @@ -461,7 +532,9 @@ static void __cpuinit get_cpu_vendor(str if (!printed) { printed++; - printk(KERN_ERR "CPU: vendor_id '%s' unknown, using generic init.\n", v); + printk(KERN_ERR + "CPU: vendor_id '%s' unknown, using generic init.\n", v); + printk(KERN_ERR "CPU: Your system may be unstable.\n"); } @@ -481,14 +554,17 @@ void __cpuinit cpu_detect(struct cpuinfo /* Intel-defined flags: level 0x00000001 */ if (c->cpuid_level >= 0x00000001) { u32 junk, tfms, cap0, misc; + cpuid(0x00000001, &tfms, &misc, &junk, &cap0); c->x86 = (tfms >> 8) & 0xf; c->x86_model = (tfms >> 4) & 0xf; c->x86_mask = tfms & 0xf; + if (c->x86 == 0xf) c->x86 += (tfms >> 20) & 0xff; if (c->x86 >= 0x6) c->x86_model += ((tfms >> 16) & 0xf) << 4; + if (cap0 & (1<<19)) { c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; c->x86_cache_alignment = c->x86_clflush_size; @@ -504,6 +580,7 @@ static void __cpuinit get_cpu_cap(struct /* Intel-defined flags: level 0x00000001 */ if (c->cpuid_level >= 0x00000001) { u32 capability, excap; + cpuid(0x00000001, &tfms, &ebx, &excap, &capability); c->x86_capability[0] = capability; c->x86_capability[4] = excap; @@ -512,6 +589,7 @@ static void __cpuinit get_cpu_cap(struct /* AMD-defined flags: level 0x80000001 */ xlvl = cpuid_eax(0x80000000); c->extended_cpuid_level = xlvl; + if ((xlvl & 0xffff0000) == 0x80000000) { if (xlvl >= 0x80000001) { c->x86_capability[1] = cpuid_edx(0x80000001); @@ -519,13 +597,15 @@ static void __cpuinit get_cpu_cap(struct } } -#ifdef CONFIG_X86_64 if (c->extended_cpuid_level >= 0x80000008) { u32 eax = cpuid_eax(0x80000008); c->x86_virt_bits = (eax >> 8) & 0xff; c->x86_phys_bits = eax & 0xff; } +#ifdef CONFIG_X86_32 + else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36)) + c->x86_phys_bits = 36; #endif if (c->extended_cpuid_level >= 0x80000007) @@ -572,8 +652,12 @@ static void __init early_identify_cpu(st { #ifdef CONFIG_X86_64 c->x86_clflush_size = 64; + c->x86_phys_bits = 36; + c->x86_virt_bits = 48; #else c->x86_clflush_size = 32; + c->x86_phys_bits = 32; + c->x86_virt_bits = 32; #endif c->x86_cache_alignment = c->x86_clflush_size; @@ -596,21 +680,20 @@ static void __init early_identify_cpu(st if (this_cpu->c_early_init) this_cpu->c_early_init(c); - validate_pat_support(c); - #ifdef CONFIG_SMP c->cpu_index = boot_cpu_id; #endif + filter_cpuid_features(c, false); } void __init early_cpu_init(void) { - struct cpu_dev **cdev; + const struct cpu_dev *const *cdev; int count = 0; - printk("KERNEL supported cpus:\n"); + printk(KERN_INFO "KERNEL supported cpus:\n"); for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { - struct cpu_dev *cpudev = *cdev; + const struct cpu_dev *cpudev = *cdev; unsigned int j; if (count >= X86_VENDOR_NUM) @@ -621,7 +704,7 @@ void __init early_cpu_init(void) for (j = 0; j < 2; j++) { if (!cpudev->c_ident[j]) continue; - printk(" %s %s\n", cpudev->c_vendor, + printk(KERN_INFO " %s %s\n", cpudev->c_vendor, cpudev->c_ident[j]); } } @@ -663,7 +746,7 @@ static void __cpuinit generic_identify(s 
c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; #if defined(CONFIG_X86_32) && !defined(CONFIG_XEN) # ifdef CONFIG_X86_HT - c->apicid = phys_pkg_id(c->initial_apicid, 0); + c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); # else c->apicid = c->initial_apicid; # endif @@ -697,9 +780,13 @@ static void __cpuinit identify_cpu(struc c->x86_coreid_bits = 0; #ifdef CONFIG_X86_64 c->x86_clflush_size = 64; + c->x86_phys_bits = 36; + c->x86_virt_bits = 48; #else c->cpuid_level = -1; /* CPUID not detected */ c->x86_clflush_size = 32; + c->x86_phys_bits = 32; + c->x86_virt_bits = 32; #endif c->x86_cache_alignment = c->x86_clflush_size; memset(&c->x86_capability, 0, sizeof c->x86_capability); @@ -712,7 +799,7 @@ static void __cpuinit identify_cpu(struc this_cpu->c_identify(c); #if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) - c->apicid = phys_pkg_id(0); + c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); #endif /* @@ -732,13 +819,16 @@ static void __cpuinit identify_cpu(struc squash_the_stupid_serial_number(c); /* - * The vendor-specific functions might have changed features. Now - * we do "generic changes." + * The vendor-specific functions might have changed features. + * Now we do "generic changes." */ + /* Filter out anything that depends on CPUID levels we don't have */ + filter_cpuid_features(c, true); + /* If the model name is still unset, do table lookup. */ if (!c->x86_model_id[0]) { - char *p; + const char *p; p = table_lookup_model(c); if (p) strcpy(c->x86_model_id, p); @@ -794,6 +884,7 @@ static void vgetcpu_set_mode(void) void __init identify_boot_cpu(void) { identify_cpu(&boot_cpu_data); + init_c1e_mask(); #ifdef CONFIG_X86_32 sysenter_setup(); enable_sep_cpu(); @@ -813,11 +904,11 @@ void __cpuinit identify_secondary_cpu(st } struct msr_range { - unsigned min; - unsigned max; + unsigned min; + unsigned max; }; -static struct msr_range msr_range_array[] __cpuinitdata = { +static const struct msr_range msr_range_array[] __cpuinitconst = { { 0x00000000, 0x00000418}, { 0xc0000000, 0xc000040b}, { 0xc0010000, 0xc0010142}, @@ -826,14 +917,15 @@ static struct msr_range msr_range_array[ static void __cpuinit print_cpu_msr(void) { + unsigned index_min, index_max; unsigned index; u64 val; int i; - unsigned index_min, index_max; for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) { index_min = msr_range_array[i].min; index_max = msr_range_array[i].max; + for (index = index_min; index < index_max; index++) { if (rdmsrl_amd_safe(index, &val)) continue; @@ -843,6 +935,7 @@ static void __cpuinit print_cpu_msr(void } static int show_msr __cpuinitdata; + static __init int setup_show_msr(char *arg) { int num; @@ -864,12 +957,14 @@ __setup("noclflush", setup_noclflush); void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) { - char *vendor = NULL; + const char *vendor = NULL; - if (c->x86_vendor < X86_VENDOR_NUM) + if (c->x86_vendor < X86_VENDOR_NUM) { vendor = this_cpu->c_vendor; - else if (c->cpuid_level >= 0) - vendor = c->x86_vendor_id; + } else { + if (c->cpuid_level >= 0) + vendor = c->x86_vendor_id; + } if (vendor && !strstr(c->x86_model_id, vendor)) printk(KERN_CONT "%s ", vendor); @@ -896,87 +991,57 @@ void __cpuinit print_cpu_info(struct cpu static __init int setup_disablecpuid(char *arg) { int bit; + if (get_option(&arg, &bit) && bit < NCAPINTS*32) setup_clear_cpu_cap(bit); else return 0; + return 1; } __setup("clearcpuid=", setup_disablecpuid); #ifdef CONFIG_X86_64 -struct x8664_pda **_cpu_pda __read_mostly; -EXPORT_SYMBOL(_cpu_pda); - #ifndef CONFIG_X86_NO_IDT struct desc_ptr idt_descr = 
{ 256 * 16 - 1, (unsigned long) idt_table }; #endif -static char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; +DEFINE_PER_CPU_FIRST(union irq_stack_union, + irq_stack_union) __aligned(PAGE_SIZE); -static void __ref switch_pt(int cpu) +void xen_switch_pt(void) { #ifdef CONFIG_XEN - if (cpu == 0) - xen_init_pt(); xen_pt_switch(__pa_symbol(init_level4_pgt)); xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt))); #endif } -void __cpuinit pda_init(int cpu) -{ - struct x8664_pda *pda = cpu_pda(cpu); +DEFINE_PER_CPU(char *, irq_stack_ptr) = + init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; - /* Setup up data that may be needed in __get_free_pages early */ - loadsegment(fs, 0); - loadsegment(gs, 0); -#ifndef CONFIG_XEN - /* Memory clobbers used to order PDA accessed */ - mb(); - wrmsrl(MSR_GS_BASE, pda); - mb(); -#else - if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL, - (unsigned long)pda)) - BUG(); -#endif - - pda->cpunumber = cpu; - pda->irqcount = -1; - pda->kernelstack = (unsigned long)stack_thread_info() - - PDA_STACKOFFSET + THREAD_SIZE; - pda->active_mm = &init_mm; - pda->mmu_state = 0; - - if (cpu == 0) { - /* others are initialized in smpboot.c */ - pda->pcurrent = &init_task; - pda->irqstackptr = boot_cpu_stack; - pda->irqstackptr += IRQSTACKSIZE - 64; - } else { - if (!pda->irqstackptr) { - pda->irqstackptr = (char *) - __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); - if (!pda->irqstackptr) - panic("cannot allocate irqstack for cpu %d", - cpu); - pda->irqstackptr += IRQSTACKSIZE - 64; - } +DEFINE_PER_CPU(unsigned long, kernel_stack) = + (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; +EXPORT_PER_CPU_SYMBOL(kernel_stack); - if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) - pda->nodenumber = cpu_to_node(cpu); - } - - switch_pt(cpu); -} +DEFINE_PER_CPU(unsigned int, irq_count) = -1; #ifndef CONFIG_X86_NO_TSS -static char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + - DEBUG_STKSZ] __page_aligned_bss; -#endif +/* + * Special IST stacks which the CPU switches to when it calls + * an IST-marked descriptor entry. Up to 7 stacks (hardware + * limit), all of them are 4K, except the debug stack which + * is 8K. + */ +static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, + [DEBUG_STACK - 1] = DEBUG_STKSZ +}; -extern asmlinkage void ignore_sysret(void); +static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks + [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]) + __aligned(PAGE_SIZE); +#endif void __cpuinit syscall_init(void) { @@ -1020,16 +1085,38 @@ unsigned long kernel_eflags; DEFINE_PER_CPU(struct orig_ist, orig_ist); #endif -#else +#else /* CONFIG_X86_64 */ -/* Make sure %fs is initialized properly in idle threads */ +#ifdef CONFIG_CC_STACKPROTECTOR +DEFINE_PER_CPU(unsigned long, stack_canary); +#endif + +/* Make sure %fs and %gs are initialized properly in idle threads */ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) { memset(regs, 0, sizeof(struct pt_regs)); regs->fs = __KERNEL_PERCPU; + regs->gs = __KERNEL_STACK_CANARY; + return regs; } -#endif +#endif /* CONFIG_X86_64 */ + +/* + * Clear all 6 debug registers: + */ +static void clear_all_debug_regs(void) +{ + int i; + + for (i = 0; i < 8; i++) { + /* Ignore db4, db5 */ + if ((i == 4) || (i == 5)) + continue; + + set_debugreg(0, i); + } +} /* * cpu_init() initializes state that is per-CPU. 
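The per-cpu exception_stacks block above replaces the old boot-time __get_free_pages() allocation: cpu_init() now just walks one statically sized block, bumping a cursor by each stack's size and recording the resulting top as the IST pointer. The carving arithmetic in isolation (the usual 4K stacks with an 8K debug stack; constants inlined here for illustration):

    #include <stdio.h>

    #define N_EXCEPTION_STACKS 5
    #define DEBUG_STACK        4    /* 1-based slot of the big stack */

    int main(void)
    {
        static const unsigned int sizes[N_EXCEPTION_STACKS] = {
            4096, 4096, 4096, 8192, 4096  /* [DEBUG_STACK - 1] = 8K */
        };
        unsigned long top = 0;  /* cursor into the per-cpu block */

        for (int v = 0; v < N_EXCEPTION_STACKS; v++) {
            top += sizes[v];
            printf("IST slot %d: stack top at offset %#lx\n", v + 1, top);
        }
        return 0;
    }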
Some data is already @@ -1039,24 +1126,31 @@ struct pt_regs * __cpuinit idle_regs(str * A lot of state is already set up in PDA init for 64 bit */ #ifdef CONFIG_X86_64 + void __cpuinit cpu_init(void) { - int cpu = stack_smp_processor_id(); #ifndef CONFIG_X86_NO_TSS - struct tss_struct *t = &per_cpu(init_tss, cpu); - struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu); + struct orig_ist *orig_ist; + struct tss_struct *t; unsigned long v; - char *estacks = NULL; int i; #endif struct task_struct *me; + int cpu; + cpu = stack_smp_processor_id(); /* CPU 0 is initialised in head64.c */ if (cpu != 0) - pda_init(cpu); + xen_switch_pt(); #ifndef CONFIG_X86_NO_TSS - else - estacks = boot_exception_stacks; + t = &per_cpu(init_tss, cpu); + orig_ist = &per_cpu(orig_ist, cpu); +#endif + +#ifdef CONFIG_NUMA + if (cpu != 0 && percpu_read(node_number) == 0 && + cpu_to_node(cpu) != NUMA_NO_NODE) + percpu_write(node_number, cpu_to_node(cpu)); #endif me = current; @@ -1073,7 +1167,9 @@ void __cpuinit cpu_init(void) * and set up the GDT descriptor: */ - switch_to_new_gdt(); + switch_to_new_gdt(cpu); + loadsegment(fs, 0); + #ifndef CONFIG_X86_NO_IDT load_idt((const struct desc_ptr *)&idt_descr); #endif @@ -1086,8 +1182,8 @@ void __cpuinit cpu_init(void) barrier(); check_efer(); -#ifndef CONFIG_XEN - if (cpu != 0 && x2apic) +#ifdef CONFIG_X86_LOCAL_APIC + if (cpu != 0) enable_x2apic(); #endif @@ -1096,24 +1192,17 @@ void __cpuinit cpu_init(void) * set up and load the per-CPU TSS */ if (!orig_ist->ist[0]) { - static const unsigned int order[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, - [DEBUG_STACK - 1] = DEBUG_STACK_ORDER - }; + char *estacks = per_cpu(exception_stacks, cpu); + for (v = 0; v < N_EXCEPTION_STACKS; v++) { - if (cpu) { - estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); - if (!estacks) - panic("Cannot allocate exception " - "stack %ld %d\n", v, cpu); - } - estacks += PAGE_SIZE << order[v]; + estacks += exception_stack_sizes[v]; orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; } } t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); + /* * <= is required because the CPU will access up to * 8 bits beyond the end of the IO permission bitmap. @@ -1124,8 +1213,7 @@ void __cpuinit cpu_init(void) atomic_inc(&init_mm.mm_count); me->active_mm = &init_mm; - if (me->mm) - BUG(); + BUG_ON(me->mm); enter_lazy_tlb(&init_mm, me); load_sp0(t, &current->thread); @@ -1144,22 +1232,9 @@ void __cpuinit cpu_init(void) */ if (kgdb_connected && arch_kgdb_ops.correct_hw_break) arch_kgdb_ops.correct_hw_break(); - else { -#endif - /* - * Clear all 6 debug registers: - */ - - set_debugreg(0UL, 0); - set_debugreg(0UL, 1); - set_debugreg(0UL, 2); - set_debugreg(0UL, 3); - set_debugreg(0UL, 6); - set_debugreg(0UL, 7); -#ifdef CONFIG_KGDB - /* If the kgdb is connected no debug regs should be altered.
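clear_all_debug_regs() folds the six explicit set_debugreg() calls into one bounded loop. DR4 and DR5 are skipped because they are aliases of DR6/DR7 (or reserved, when CR4.DE is set), so they are never written directly. The shape of that loop with a stub in place of the kernel's set_debugreg():

    #include <stdio.h>

    /* Stub standing in for the kernel's set_debugreg() macro. */
    static void set_debugreg(unsigned long val, int reg)
    {
        printf("DR%d <- %#lx\n", reg, val);
    }

    int main(void)
    {
        for (int i = 0; i < 8; i++) {
            if (i == 4 || i == 5)   /* aliases of DR6/DR7, skip */
                continue;
            set_debugreg(0, i);
        }
        return 0;
    }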
*/ - } + else #endif + clear_all_debug_regs(); fpu_init(); @@ -1171,8 +1246,10 @@ void __cpuinit cpu_init(void) kernel_eflags &= ~X86_EFLAGS_IF; #endif +#ifdef CONFIG_X86_LOCAL_APIC if (is_uv_system()) uv_cpu_init(); +#endif } #else @@ -1188,7 +1265,8 @@ void __cpuinit cpu_init(void) if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) { printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); - for (;;) local_irq_enable(); + for (;;) + local_irq_enable(); } printk(KERN_INFO "Initializing CPU#%d\n", cpu); @@ -1196,36 +1274,30 @@ void __cpuinit cpu_init(void) if (cpu_has_vme || cpu_has_de) clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); - switch_to_new_gdt(); + switch_to_new_gdt(cpu); /* * Set up and load the per-CPU TSS and LDT */ atomic_inc(&init_mm.mm_count); curr->active_mm = &init_mm; - if (curr->mm) - BUG(); + BUG_ON(curr->mm); enter_lazy_tlb(&init_mm, curr); load_sp0(t, thread); load_LDT(&init_mm.context); +#ifndef CONFIG_X86_NO_TSS + t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); +#endif + #ifdef CONFIG_DOUBLEFAULT /* Set up doublefault TSS pointer in the GDT */ __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); #endif - /* Clear %gs. */ - asm volatile ("mov %0, %%gs" : : "r" (0)); - - /* Clear all 6 debug registers: */ - set_debugreg(0, 0); - set_debugreg(0, 1); - set_debugreg(0, 2); - set_debugreg(0, 3); - set_debugreg(0, 6); - set_debugreg(0, 7); + clear_all_debug_regs(); /* * Force FPU initialization: @@ -1245,6 +1317,4 @@ void __cpuinit cpu_init(void) xsave_init(); } - - #endif --- head-2010-05-25.orig/arch/x86/kernel/cpu/intel.c 2010-05-25 09:12:08.000000000 +0200 +++ head-2010-05-25/arch/x86/kernel/cpu/intel.c 2010-05-25 09:24:45.000000000 +0200 @@ -91,8 +91,10 @@ static void __cpuinit early_init_intel(s if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); +#ifndef CONFIG_XEN if (!check_tsc_unstable()) sched_clock_stable = 1; +#endif } /* --- head-2010-05-25.orig/arch/x86/kernel/e820-xen.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/e820-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -129,19 +129,50 @@ int __init e820_all_mapped(u64 start, u6 /* * Add a memory region to the kernel e820 map. */ -void __init e820_add_region(u64 start, u64 size, int type) +static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, + int type) { - int x = e820.nr_map; + int x = e820x->nr_map; - if (x == ARRAY_SIZE(e820.map)) { + if (x == ARRAY_SIZE(e820x->map)) { printk(KERN_ERR "Ooops! 
Too many entries in the memory map!\n"); return; } - e820.map[x].addr = start; - e820.map[x].size = size; - e820.map[x].type = type; - e820.nr_map++; + e820x->map[x].addr = start; + e820x->map[x].size = size; + e820x->map[x].type = type; + e820x->nr_map++; +} + +void __init e820_add_region(u64 start, u64 size, int type) +{ + __e820_add_region(&e820, start, size, type); +} + +static void __init e820_print_type(u32 type) +{ + switch (type) { + case E820_RAM: + case E820_RESERVED_KERN: + printk(KERN_CONT "(usable)"); + break; + case E820_RESERVED: + printk(KERN_CONT "(reserved)"); + break; + case E820_ACPI: + printk(KERN_CONT "(ACPI data)"); + break; + case E820_NVS: + printk(KERN_CONT "(ACPI NVS)"); + break; + case E820_UNUSABLE: + printk(KERN_CONT "(unusable)"); + break; + default: + printk(KERN_CONT "type %u", type); + break; + } } static void __init _e820_print_map(const struct e820map *e820, const char *who) @@ -153,27 +184,8 @@ static void __init _e820_print_map(const (unsigned long long) e820->map[i].addr, (unsigned long long) (e820->map[i].addr + e820->map[i].size)); - switch (e820->map[i].type) { - case E820_RAM: - case E820_RESERVED_KERN: - printk(KERN_CONT "(usable)\n"); - break; - case E820_RESERVED: - printk(KERN_CONT "(reserved)\n"); - break; - case E820_ACPI: - printk(KERN_CONT "(ACPI data)\n"); - break; - case E820_NVS: - printk(KERN_CONT "(ACPI NVS)\n"); - break; - case E820_UNUSABLE: - printk("(unusable)\n"); - break; - default: - printk(KERN_CONT "type %u\n", e820->map[i].type); - break; - } + e820_print_type(e820->map[i].type); + printk(KERN_CONT "\n"); } } @@ -240,7 +252,7 @@ static void __init _e820_print_map(const */ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, - int *pnr_map) + u32 *pnr_map) { struct change_member { struct e820entry *pbios; /* pointer to original bios entry */ @@ -444,11 +456,12 @@ static int __init append_e820_map(struct return __append_e820_map(biosmap, nr_map); } -static u64 __init e820_update_range_map(struct e820map *e820x, u64 start, +static u64 __init __e820_update_range(struct e820map *e820x, u64 start, u64 size, unsigned old_type, unsigned new_type) { - unsigned int i, x; + u64 end; + unsigned int i; u64 real_updated_size = 0; BUG_ON(old_type == new_type); @@ -456,40 +469,59 @@ static u64 __init e820_update_range_map( if (size > (ULLONG_MAX - start)) size = ULLONG_MAX - start; + end = start + size; + printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ", + (unsigned long long) start, + (unsigned long long) end); + e820_print_type(old_type); + printk(KERN_CONT " ==> "); + e820_print_type(new_type); + printk(KERN_CONT "\n"); + for (i = 0; i < e820x->nr_map; i++) { struct e820entry *ei = &e820x->map[i]; u64 final_start, final_end; + u64 ei_end; + if (ei->type != old_type) continue; - /* totally covered? */ - if (ei->addr >= start && - (ei->addr + ei->size) <= (start + size)) { + + ei_end = ei->addr + ei->size; + /* totally covered by new range? */ + if (ei->addr >= start && ei_end <= end) { ei->type = new_type; real_updated_size += ei->size; continue; } + + /* new range is totally covered? 
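__e820_update_range() now distinguishes three overlap cases between an existing map entry and the range being retyped: entry fully inside the new range (retype in place), entry fully covering it (split into kept head, retyped middle, re-added tail), and partial overlap (trim at the boundary). A stand-alone classifier over half-open ranges showing the case analysis (types and names here are ours, not the kernel's):

    #include <stdio.h>

    struct range { unsigned long start, end; };  /* [start, end) */

    static const char *classify(struct range entry, struct range upd)
    {
        if (upd.start <= entry.start && entry.end <= upd.end)
            return "entry fully covered: retype in place";
        if (entry.start < upd.start && upd.end < entry.end)
            return "entry splits: head kept, middle retyped, tail re-added";
        if (upd.start < entry.end && entry.start < upd.end)
            return "partial overlap: trim at the boundary";
        return "no overlap";
    }

    int main(void)
    {
        struct range entry = { 0x1000, 0x9000 };
        struct range upd   = { 0x2000, 0x3000 };

        printf("%s\n", classify(entry, upd));
        return 0;
    }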
*/ + if (ei->addr < start && ei_end > end) { + __e820_add_region(e820x, start, size, new_type); + __e820_add_region(e820x, end, ei_end - end, ei->type); + ei->size = start - ei->addr; + real_updated_size += size; + continue; + } + /* partially covered */ final_start = max(start, ei->addr); - final_end = min(start + size, ei->addr + ei->size); + final_end = min(end, ei_end); if (final_start >= final_end) continue; - x = e820x->nr_map; - if (x == ARRAY_SIZE(e820x->map)) { - printk(KERN_ERR "Too many memory map entries!\n"); - break; - } - e820x->map[x].addr = final_start; - e820x->map[x].size = final_end - final_start; - e820x->map[x].type = new_type; - e820x->nr_map++; + __e820_add_region(e820x, final_start, final_end - final_start, + new_type); real_updated_size += final_end - final_start; + /* + * left range could be head or tail, so need to update + * size at first. + */ + ei->size -= final_end - final_start; if (ei->addr < final_start) continue; ei->addr = final_end; - ei->size -= final_end - final_start; } return real_updated_size; } @@ -497,7 +529,7 @@ static u64 __init e820_update_range_map( u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, unsigned new_type) { - return e820_update_range_map(&e820, start, size, old_type, new_type); + return __e820_update_range(&e820, start, size, old_type, new_type); } static u64 __init e820_update_range_saved(u64 start, u64 size, @@ -505,11 +537,11 @@ static u64 __init e820_update_range_save { #ifdef CONFIG_XEN if (is_initial_xendomain()) - return e820_update_range_map(&machine_e820, - phys_to_machine(start), size, - old_type, new_type); + return __e820_update_range(&machine_e820, + phys_to_machine(start), size, + old_type, new_type); #endif - return e820_update_range_map(&e820_saved, start, size, old_type, + return __e820_update_range(&e820_saved, start, size, old_type, new_type); } @@ -553,7 +585,7 @@ u64 __init e820_remove_range(u64 start, void __init update_e820(void) { - int nr_map; + u32 nr_map; nr_map = e820.nr_map; if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map)) @@ -564,7 +596,7 @@ void __init update_e820(void) } static void __init update_e820_saved(void) { - int nr_map; + u32 nr_map; nr_map = e820_saved.nr_map; if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map)) @@ -916,6 +948,9 @@ void __init reserve_early_overlap_ok(u64 */ void __init reserve_early(u64 start, u64 end, char *name) { + if (start >= end) + return; + drop_overlaps_that_are_ok(start, end); __reserve_early(start, end, name, 0); } @@ -1389,7 +1424,7 @@ early_param("memmap", parse_memmap_opt); void __init finish_e820_parsing(void) { if (userdef) { - int nr = e820.nr_map; + u32 nr = e820.nr_map; if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0) early_panic("Invalid user supplied memory map"); @@ -1479,7 +1514,7 @@ void __init e820_reserve_resources_late( char *__init default_machine_specific_memory_setup(void) { char *who = "BIOS-e820"; - int new_nr; + u32 new_nr; /* * Try to copy the BIOS-supplied E820-map. * --- head-2010-05-25.orig/arch/x86/kernel/early_printk-xen.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/early_printk-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -12,8 +12,8 @@ #include #include #include -#include #include +#include #include #ifndef CONFIG_XEN @@ -279,7 +279,7 @@ static int dbgp_wait_until_complete(void return (ctrl & DBGP_ERROR) ? 
-DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl); } -static void dbgp_mdelay(int ms) +static void __init dbgp_mdelay(int ms) { int i; @@ -340,7 +340,7 @@ static void dbgp_set_data(const void *bu writel(hi, &ehci_debug->data47); } -static void dbgp_get_data(void *buf, int size) +static void __init dbgp_get_data(void *buf, int size) { unsigned char *bytes = buf; u32 lo, hi; @@ -384,7 +384,7 @@ static int dbgp_bulk_write(unsigned devn return ret; } -static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data, +static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data, int size) { u32 pids, addr, ctrl; @@ -415,8 +415,8 @@ static int dbgp_bulk_read(unsigned devnu return ret; } -static int dbgp_control_msg(unsigned devnum, int requesttype, int request, - int value, int index, void *data, int size) +static int __init dbgp_control_msg(unsigned devnum, int requesttype, + int request, int value, int index, void *data, int size) { u32 pids, addr, ctrl; struct usb_ctrlrequest req; @@ -518,7 +518,7 @@ static u32 __init find_dbgp(int ehci_num return 0; } -static int ehci_reset_port(int port) +static int __init ehci_reset_port(int port) { u32 portsc; u32 delay_time, delay; @@ -561,7 +561,7 @@ static int ehci_reset_port(int port) return -EBUSY; } -static int ehci_wait_for_port(int port) +static int __init ehci_wait_for_port(int port) { u32 status; int ret, reps; @@ -586,13 +586,13 @@ static inline void dbgp_printk(const cha typedef void (*set_debug_port_t)(int port); -static void default_set_debug_port(int port) +static void __init default_set_debug_port(int port) { } -static set_debug_port_t set_debug_port = default_set_debug_port; +static set_debug_port_t __initdata set_debug_port = default_set_debug_port; -static void nvidia_set_debug_port(int port) +static void __init nvidia_set_debug_port(int port) { u32 dword; dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, --- head-2010-05-25.orig/arch/x86/kernel/entry_32-xen.S 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/entry_32-xen.S 2010-03-24 15:25:06.000000000 +0100 @@ -30,12 +30,13 @@ * 1C(%esp) - %ds * 20(%esp) - %es * 24(%esp) - %fs - * 28(%esp) - orig_eax - * 2C(%esp) - %eip - * 30(%esp) - %cs - * 34(%esp) - %eflags - * 38(%esp) - %oldesp - * 3C(%esp) - %oldss + * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS + * 2C(%esp) - orig_eax + * 30(%esp) - %eip + * 34(%esp) - %cs + * 38(%esp) - %eflags + * 3C(%esp) - %oldesp + * 40(%esp) - %oldss * * "current" is in register %ebx during any slow entries. 
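The updated offset comment above reflects one extra saved slot: %gs now lives between %fs and orig_eax, pushing everything after it down by 4 bytes. The layout as a plain C mirror, each field one 4-byte slot so that offsetof() reproduces the commented offsets (the struct name is ours):

    #include <stdio.h>
    #include <stddef.h>

    struct pt_regs32 {
        unsigned int bx, cx, dx, si, di, bp, ax;
        unsigned int ds, es, fs, gs;   /* gs: the newly saved slot */
        unsigned int orig_ax;
        unsigned int ip, cs, flags, sp, ss;
    };

    int main(void)
    {
        printf("gs at %#zx, orig_eax at %#zx, eip at %#zx\n",
               offsetof(struct pt_regs32, gs),      /* 0x28 */
               offsetof(struct pt_regs32, orig_ax), /* 0x2c */
               offsetof(struct pt_regs32, ip));     /* 0x30 */
        return 0;
    }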
*/ @@ -46,7 +47,7 @@ #include #include #include -#include +#include #include #include #include @@ -105,121 +106,221 @@ NMI_MASK = 0x80000000 #define resume_userspace_sig resume_userspace #endif -#define SAVE_ALL \ - cld; \ - pushl %fs; \ - CFI_ADJUST_CFA_OFFSET 4;\ - /*CFI_REL_OFFSET fs, 0;*/\ - pushl %es; \ - CFI_ADJUST_CFA_OFFSET 4;\ - /*CFI_REL_OFFSET es, 0;*/\ - pushl %ds; \ - CFI_ADJUST_CFA_OFFSET 4;\ - /*CFI_REL_OFFSET ds, 0;*/\ - pushl %eax; \ - CFI_ADJUST_CFA_OFFSET 4;\ - CFI_REL_OFFSET eax, 0;\ - pushl %ebp; \ - CFI_ADJUST_CFA_OFFSET 4;\ - CFI_REL_OFFSET ebp, 0;\ - pushl %edi; \ - CFI_ADJUST_CFA_OFFSET 4;\ - CFI_REL_OFFSET edi, 0;\ - pushl %esi; \ - CFI_ADJUST_CFA_OFFSET 4;\ - CFI_REL_OFFSET esi, 0;\ - pushl %edx; \ - CFI_ADJUST_CFA_OFFSET 4;\ - CFI_REL_OFFSET edx, 0;\ - pushl %ecx; \ - CFI_ADJUST_CFA_OFFSET 4;\ - CFI_REL_OFFSET ecx, 0;\ - pushl %ebx; \ - CFI_ADJUST_CFA_OFFSET 4;\ - CFI_REL_OFFSET ebx, 0;\ - movl $(__USER_DS), %edx; \ - movl %edx, %ds; \ - movl %edx, %es; \ - movl $(__KERNEL_PERCPU), %edx; \ +/* + * User gs save/restore + * + * %gs is used for userland TLS and kernel only uses it for stack + * canary which is required to be at %gs:20 by gcc. Read the comment + * at the top of stackprotector.h for more info. + * + * Local labels 98 and 99 are used. + */ +#ifdef CONFIG_X86_32_LAZY_GS + + /* unfortunately push/pop can't be no-op */ +.macro PUSH_GS + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 +.endm +.macro POP_GS pop=0 + addl $(4 + \pop), %esp + CFI_ADJUST_CFA_OFFSET -(4 + \pop) +.endm +.macro POP_GS_EX +.endm + + /* all the rest are no-op */ +.macro PTGS_TO_GS +.endm +.macro PTGS_TO_GS_EX +.endm +.macro GS_TO_REG reg +.endm +.macro REG_TO_PTGS reg +.endm +.macro SET_KERNEL_GS reg +.endm + +#else /* CONFIG_X86_32_LAZY_GS */ + +.macro PUSH_GS + pushl %gs + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET gs, 0*/ +.endm + +.macro POP_GS pop=0 +98: popl %gs + CFI_ADJUST_CFA_OFFSET -4 + /*CFI_RESTORE gs*/ + .if \pop <> 0 + add $\pop, %esp + CFI_ADJUST_CFA_OFFSET -\pop + .endif +.endm +.macro POP_GS_EX +.pushsection .fixup, "ax" +99: movl $0, (%esp) + jmp 98b +.section __ex_table, "a" + .align 4 + .long 98b, 99b +.popsection +.endm + +.macro PTGS_TO_GS +98: mov PT_GS(%esp), %gs +.endm +.macro PTGS_TO_GS_EX +.pushsection .fixup, "ax" +99: movl $0, PT_GS(%esp) + jmp 98b +.section __ex_table, "a" + .align 4 + .long 98b, 99b +.popsection +.endm + +.macro GS_TO_REG reg + movl %gs, \reg + /*CFI_REGISTER gs, \reg*/ +.endm +.macro REG_TO_PTGS reg + movl \reg, PT_GS(%esp) + /*CFI_REL_OFFSET gs, PT_GS*/ +.endm +.macro SET_KERNEL_GS reg + movl $(__KERNEL_STACK_CANARY), \reg + movl \reg, %gs +.endm + +#endif /* CONFIG_X86_32_LAZY_GS */ + +.macro SAVE_ALL + cld + PUSH_GS + pushl %fs + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET fs, 0;*/ + pushl %es + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET es, 0;*/ + pushl %ds + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET ds, 0;*/ + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET eax, 0 + pushl %ebp + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebp, 0 + pushl %edi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edi, 0 + pushl %esi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esi, 0 + pushl %edx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edx, 0 + pushl %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx, 0 + pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebx, 0 + movl $(__USER_DS), %edx + movl %edx, %ds + movl %edx, %es + movl $(__KERNEL_PERCPU), %edx movl %edx, %fs + SET_KERNEL_GS %edx +.endm -#define RESTORE_INT_REGS \ - popl %ebx; \ - CFI_ADJUST_CFA_OFFSET 
-4;\ - CFI_RESTORE ebx;\ - popl %ecx; \ - CFI_ADJUST_CFA_OFFSET -4;\ - CFI_RESTORE ecx;\ - popl %edx; \ - CFI_ADJUST_CFA_OFFSET -4;\ - CFI_RESTORE edx;\ - popl %esi; \ - CFI_ADJUST_CFA_OFFSET -4;\ - CFI_RESTORE esi;\ - popl %edi; \ - CFI_ADJUST_CFA_OFFSET -4;\ - CFI_RESTORE edi;\ - popl %ebp; \ - CFI_ADJUST_CFA_OFFSET -4;\ - CFI_RESTORE ebp;\ - popl %eax; \ - CFI_ADJUST_CFA_OFFSET -4;\ +.macro RESTORE_INT_REGS + popl %ebx + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE ebx + popl %ecx + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE ecx + popl %edx + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE edx + popl %esi + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE esi + popl %edi + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE edi + popl %ebp + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE ebp + popl %eax + CFI_ADJUST_CFA_OFFSET -4 CFI_RESTORE eax +.endm -#define RESTORE_REGS \ - RESTORE_INT_REGS; \ -1: popl %ds; \ - CFI_ADJUST_CFA_OFFSET -4;\ - /*CFI_RESTORE ds;*/\ -2: popl %es; \ - CFI_ADJUST_CFA_OFFSET -4;\ - /*CFI_RESTORE es;*/\ -3: popl %fs; \ - CFI_ADJUST_CFA_OFFSET -4;\ - /*CFI_RESTORE fs;*/\ -.pushsection .fixup,"ax"; \ -4: movl $0,(%esp); \ - jmp 1b; \ -5: movl $0,(%esp); \ - jmp 2b; \ -6: movl $0,(%esp); \ - jmp 3b; \ -.section __ex_table,"a";\ - .align 4; \ - .long 1b,4b; \ - .long 2b,5b; \ - .long 3b,6b; \ +.macro RESTORE_REGS pop=0 + RESTORE_INT_REGS +1: popl %ds + CFI_ADJUST_CFA_OFFSET -4 + /*CFI_RESTORE ds;*/ +2: popl %es + CFI_ADJUST_CFA_OFFSET -4 + /*CFI_RESTORE es;*/ +3: popl %fs + CFI_ADJUST_CFA_OFFSET -4 + /*CFI_RESTORE fs;*/ + POP_GS \pop +.pushsection .fixup, "ax" +4: movl $0, (%esp) + jmp 1b +5: movl $0, (%esp) + jmp 2b +6: movl $0, (%esp) + jmp 3b +.section __ex_table, "a" + .align 4 + .long 1b, 4b + .long 2b, 5b + .long 3b, 6b .popsection + POP_GS_EX +.endm -#define RING0_INT_FRAME \ - CFI_STARTPROC simple;\ - CFI_SIGNAL_FRAME;\ - CFI_DEF_CFA esp, 3*4;\ - /*CFI_OFFSET cs, -2*4;*/\ +.macro RING0_INT_FRAME + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA esp, 3*4 + /*CFI_OFFSET cs, -2*4;*/ CFI_OFFSET eip, -3*4 +.endm -#define RING0_EC_FRAME \ - CFI_STARTPROC simple;\ - CFI_SIGNAL_FRAME;\ - CFI_DEF_CFA esp, 4*4;\ - /*CFI_OFFSET cs, -2*4;*/\ +.macro RING0_EC_FRAME + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA esp, 4*4 + /*CFI_OFFSET cs, -2*4;*/ CFI_OFFSET eip, -3*4 +.endm -#define RING0_PTREGS_FRAME \ - CFI_STARTPROC simple;\ - CFI_SIGNAL_FRAME;\ - CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\ - /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\ - CFI_OFFSET eip, PT_EIP-PT_OLDESP;\ - /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\ - /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\ - CFI_OFFSET eax, PT_EAX-PT_OLDESP;\ - CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\ - CFI_OFFSET edi, PT_EDI-PT_OLDESP;\ - CFI_OFFSET esi, PT_ESI-PT_OLDESP;\ - CFI_OFFSET edx, PT_EDX-PT_OLDESP;\ - CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\ +.macro RING0_PTREGS_FRAME + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA esp, PT_OLDESP-PT_EBX + /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/ + CFI_OFFSET eip, PT_EIP-PT_OLDESP + /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/ + /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/ + CFI_OFFSET eax, PT_EAX-PT_OLDESP + CFI_OFFSET ebp, PT_EBP-PT_OLDESP + CFI_OFFSET edi, PT_EDI-PT_OLDESP + CFI_OFFSET esi, PT_ESI-PT_OLDESP + CFI_OFFSET edx, PT_EDX-PT_OLDESP + CFI_OFFSET ecx, PT_ECX-PT_OLDESP CFI_OFFSET ebx, PT_EBX-PT_OLDESP +.endm ENTRY(ret_from_fork) CFI_STARTPROC @@ -344,7 +445,8 @@ sysenter_past_esp: .previous GET_THREAD_INFO(%ebp) - testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) + + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) jnz sysenter_audit sysenter_do_call: cmpl 
$(nr_syscalls), %eax @@ -355,7 +457,7 @@ sysenter_do_call: DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx - testw $_TIF_ALLWORK_MASK, %cx + testl $_TIF_ALLWORK_MASK, %ecx jne sysexit_audit sysenter_exit: /* if something modifies registers it must also disable sysexit */ @@ -364,11 +466,12 @@ sysenter_exit: xorl %ebp,%ebp TRACE_IRQS_ON 1: mov PT_FS(%esp), %fs + PTGS_TO_GS ENABLE_INTERRUPTS_SYSEXIT #ifdef CONFIG_AUDITSYSCALL sysenter_audit: - testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) jnz syscall_trace_entry addl $4,%esp CFI_ADJUST_CFA_OFFSET -4 @@ -385,7 +488,7 @@ sysenter_audit: jmp sysenter_do_call sysexit_audit: - testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx jne syscall_exit_work TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_ANY) @@ -398,7 +501,7 @@ sysexit_audit: DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx - testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx jne syscall_exit_work movl PT_EAX(%esp),%eax /* reload syscall return value */ jmp sysenter_exit @@ -412,6 +515,7 @@ sysexit_audit: .align 4 .long 1b,2b .popsection + PTGS_TO_GS_EX ENDPROC(ia32_sysenter_target) # pv sysenter call handler stub @@ -447,7 +551,7 @@ ENTRY(system_call) CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL GET_THREAD_INFO(%ebp) - testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) jnz syscall_trace_entry cmpl $(nr_syscalls), %eax jae syscall_badsys @@ -461,7 +565,7 @@ syscall_exit: # between sampling and the iret TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx - testw $_TIF_ALLWORK_MASK, %cx # current->work + testl $_TIF_ALLWORK_MASK, %ecx # current->work jne syscall_exit_work restore_all: @@ -492,8 +596,7 @@ restore_nocheck: #endif TRACE_IRQS_IRET restore_nocheck_notrace: - RESTORE_REGS - addl $4, %esp # skip orig_eax/error_code + RESTORE_REGS 4 # skip orig_eax/error_code CFI_ADJUST_CFA_OFFSET -4 irq_return: INTERRUPT_RETURN @@ -555,9 +658,7 @@ restore_all_enable_events: scrit: /**** START OF CRITICAL REGION ****/ __TEST_PENDING jnz 14f # process more events if necessary... - RESTORE_REGS - addl $4, %esp - CFI_ADJUST_CFA_OFFSET -4 + RESTORE_REGS 4 1: INTERRUPT_RETURN .section __ex_table,"a" .align 4 @@ -571,9 +672,7 @@ ecrit: /**** END OF CRITICAL REGION *** CFI_RESTORE_STATE hypervisor_iret: andl $~NMI_MASK, PT_EFLAGS(%esp) - RESTORE_REGS - addl $4, %esp - CFI_ADJUST_CFA_OFFSET -4 + RESTORE_REGS 4 jmp hypercall_page + (__HYPERVISOR_iret * 32) #endif CFI_ENDPROC @@ -641,7 +740,7 @@ END(syscall_trace_entry) # perform syscall exit tracing ALIGN syscall_exit_work: - testb $_TIF_WORK_SYSCALL_EXIT, %cl + testl $_TIF_WORK_SYSCALL_EXIT, %ecx jz work_pending TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call @@ -665,29 +764,51 @@ syscall_badsys: END(syscall_badsys) CFI_ENDPROC +/* + * System calls that need a pt_regs pointer. 
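Each PTREGSCALL stub below loads the address of the saved register frame (4(%esp), just past the return address) into %eax and tail-jumps to the handler, so the C function receives struct pt_regs * as its first argument instead of unpacked values. A rough user-space model of that calling convention (all names illustrative):

    #include <stdio.h>

    struct frame { unsigned int ax, bx, ip; };  /* stand-in pt_regs */

    /* What a sys_clone()-style handler does: pull its arguments
     * straight out of the saved frame, not from the C argument list. */
    static int handler(struct frame *regs)
    {
        return (int)regs->bx;
    }

    int main(void)
    {
        struct frame f = { .ax = 120, .bx = 0x11, .ip = 0x1000 };

        printf("handler saw bx=%#x\n", handler(&f));
        return 0;
    }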
+ */ +#define PTREGSCALL(name) \ + ALIGN; \ +ptregs_##name: \ + leal 4(%esp),%eax; \ + jmp sys_##name; + +PTREGSCALL(iopl) +PTREGSCALL(fork) +PTREGSCALL(clone) +PTREGSCALL(vfork) +PTREGSCALL(execve) +PTREGSCALL(sigaltstack) +PTREGSCALL(sigreturn) +PTREGSCALL(rt_sigreturn) +PTREGSCALL(vm86) +PTREGSCALL(vm86old) + #ifndef CONFIG_XEN -#define FIXUP_ESPFIX_STACK \ - /* since we are on a wrong stack, we cant make it a C code :( */ \ - PER_CPU(gdt_page, %ebx); \ - GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \ - addl %esp, %eax; \ - pushl $__KERNEL_DS; \ - CFI_ADJUST_CFA_OFFSET 4; \ - pushl %eax; \ - CFI_ADJUST_CFA_OFFSET 4; \ - lss (%esp), %esp; \ - CFI_ADJUST_CFA_OFFSET -8; -#define UNWIND_ESPFIX_STACK \ - movl %ss, %eax; \ - /* see if on espfix stack */ \ - cmpw $__ESPFIX_SS, %ax; \ - jne 27f; \ - movl $__KERNEL_DS, %eax; \ - movl %eax, %ds; \ - movl %eax, %es; \ - /* switch to normal stack */ \ - FIXUP_ESPFIX_STACK; \ -27:; +.macro FIXUP_ESPFIX_STACK + /* since we are on a wrong stack, we cant make it a C code :( */ + PER_CPU(gdt_page, %ebx) + GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah) + addl %esp, %eax + pushl $__KERNEL_DS + CFI_ADJUST_CFA_OFFSET 4 + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + lss (%esp), %esp + CFI_ADJUST_CFA_OFFSET -8 +.endm +.macro UNWIND_ESPFIX_STACK + movl %ss, %eax + /* see if on espfix stack */ + cmpw $__ESPFIX_SS, %ax + jne 27f + movl $__KERNEL_DS, %eax + movl %eax, %ds + movl %eax, %es + /* switch to normal stack */ + FIXUP_ESPFIX_STACK +27: +.endm /* * Build the entry stubs and pointer table with some assembler magic. @@ -743,7 +864,7 @@ common_interrupt: ENDPROC(common_interrupt) CFI_ENDPROC -#define BUILD_INTERRUPT(name, nr) \ +#define BUILD_INTERRUPT3(name, nr, fn) \ ENTRY(name) \ RING0_INT_FRAME; \ pushl $~(nr); \ @@ -751,13 +872,15 @@ ENTRY(name) \ SAVE_ALL; \ TRACE_IRQS_OFF \ movl %esp,%eax; \ - call smp_##name; \ + call fn; \ jmp ret_from_intr; \ CFI_ENDPROC; \ ENDPROC(name) +#define BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(name, nr, smp_##name) + /* The include is where all of the SMP etc. 
interrupts come from */ -#include "entry_arch.h" +#include #else #define UNWIND_ESPFIX_STACK @@ -844,8 +967,13 @@ critical_fixup_table: .byte 7 # pop %ds .byte 8 # pop %es .byte 9,9 # pop %fs - .byte 10,10,10 # add $4,%esp - .byte 11 # iret +#ifndef CONFIG_X86_32_LAZY_GS + .byte 10,10 # pop %gs + .byte 11,11,11 # add $4,%esp +#else + .byte 10,10,10 # add $8,%esp +#endif + .byte 12 # iret .byte -1,-1,-1,-1 # movb $1,1(%esi) = __DISABLE_INTERRUPTS .previous @@ -1203,7 +1331,7 @@ ENTRY(ia32pv_cstar_target) .previous SAVE_ALL GET_THREAD_INFO(%ebp) - testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) jnz cstar_trace_entry cmpl $nr_syscalls,%eax jae cstar_badsys @@ -1323,7 +1451,10 @@ ENTRY(page_fault) CFI_ADJUST_CFA_OFFSET 4 ALIGN error_code: - /* the function address is in %fs's slot on the stack */ + /* the function address is in %gs's slot on the stack */ + pushl %fs + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET fs, 0*/ pushl %es CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET es, 0*/ @@ -1352,20 +1483,15 @@ error_code: CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET ebx, 0 cld - pushl %fs - CFI_ADJUST_CFA_OFFSET 4 - /*CFI_REL_OFFSET fs, 0*/ movl $(__KERNEL_PERCPU), %ecx movl %ecx, %fs UNWIND_ESPFIX_STACK - popl %ecx - CFI_ADJUST_CFA_OFFSET -4 - /*CFI_REGISTER es, ecx*/ - movl PT_FS(%esp), %edi # get the function address + GS_TO_REG %ecx + movl PT_GS(%esp), %edi # get the function address movl PT_ORIG_EAX(%esp), %edx # get the error code movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart - mov %ecx, PT_FS(%esp) - /*CFI_REL_OFFSET fs, ES*/ + REG_TO_PTGS %ecx + SET_KERNEL_GS %ecx movl $(__USER_DS), %ecx movl %ecx, %ds movl %ecx, %es @@ -1390,20 +1516,21 @@ END(page_fault) * by hand onto the new stack - while updating the return eip past * the instruction that would have done it for sysenter. 
*/ -#define FIX_STACK(offset, ok, label) \ - cmpw $__KERNEL_CS,4(%esp); \ - jne ok; \ -label: \ - movl TSS_sysenter_sp0+offset(%esp),%esp; \ - CFI_DEF_CFA esp, 0; \ - CFI_UNDEFINED eip; \ - pushfl; \ - CFI_ADJUST_CFA_OFFSET 4; \ - pushl $__KERNEL_CS; \ - CFI_ADJUST_CFA_OFFSET 4; \ - pushl $sysenter_past_esp; \ - CFI_ADJUST_CFA_OFFSET 4; \ +.macro FIX_STACK offset ok label + cmpw $__KERNEL_CS, 4(%esp) + jne \ok +\label: + movl TSS_sysenter_sp0 + \offset(%esp), %esp + CFI_DEF_CFA esp, 0 + CFI_UNDEFINED eip + pushfl + CFI_ADJUST_CFA_OFFSET 4 + pushl $__KERNEL_CS + CFI_ADJUST_CFA_OFFSET 4 + pushl $sysenter_past_esp + CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET eip, 0 +.endm #endif /* CONFIG_XEN */ ENTRY(debug) @@ -1411,7 +1538,7 @@ ENTRY(debug) #ifndef CONFIG_XEN cmpl $ia32_sysenter_target,(%esp) jne debug_stack_correct - FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) + FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn debug_stack_correct: #endif /* !CONFIG_XEN */ pushl $-1 # mark this as an int @@ -1471,7 +1598,7 @@ nmi_stack_correct: nmi_stack_fixup: RING0_INT_FRAME - FIX_STACK(12,nmi_stack_correct, 1) + FIX_STACK 12, nmi_stack_correct, 1 jmp nmi_stack_correct nmi_debug_stack_check: @@ -1482,7 +1609,7 @@ nmi_debug_stack_check: jb nmi_stack_correct cmpl $debug_esp_fix_insn,(%esp) ja nmi_stack_correct - FIX_STACK(24,nmi_stack_correct, 1) + FIX_STACK 24, nmi_stack_correct, 1 jmp nmi_stack_correct nmi_espfix_stack: @@ -1494,7 +1621,7 @@ nmi_espfix_stack: CFI_ADJUST_CFA_OFFSET 4 pushl %esp CFI_ADJUST_CFA_OFFSET 4 - addw $4, (%esp) + addl $4, (%esp) /* copy the iret frame of 12 bytes */ .rept 3 pushl 16(%esp) --- head-2010-05-25.orig/arch/x86/kernel/entry_64-xen.S 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/entry_64-xen.S 2010-03-24 15:25:06.000000000 +0100 @@ -51,10 +51,10 @@ #include #include #include -#include +#include #include #include -#include +#include #include #include @@ -81,20 +81,17 @@ ENTRY(ftrace_caller) movq 8(%rbp), %rsi subq $MCOUNT_INSN_SIZE, %rdi -.globl ftrace_call -ftrace_call: +GLOBAL(ftrace_call) call ftrace_stub MCOUNT_RESTORE_FRAME #ifdef CONFIG_FUNCTION_GRAPH_TRACER -.globl ftrace_graph_call -ftrace_graph_call: +GLOBAL(ftrace_graph_call) jmp ftrace_stub #endif -.globl ftrace_stub -ftrace_stub: +GLOBAL(ftrace_stub) retq END(ftrace_caller) @@ -114,8 +111,7 @@ ENTRY(mcount) jnz ftrace_graph_caller #endif -.globl ftrace_stub -ftrace_stub: +GLOBAL(ftrace_stub) retq trace: @@ -152,9 +148,7 @@ ENTRY(ftrace_graph_caller) retq END(ftrace_graph_caller) - -.globl return_to_handler -return_to_handler: +GLOBAL(return_to_handler) subq $80, %rsp movq %rax, (%rsp) @@ -369,15 +363,15 @@ ENTRY(save_args) je 1f SWAPGS /* - * irqcount is used to check if a CPU is already on an interrupt stack + * irq_count is used to check if a CPU is already on an interrupt stack * or not. While this is essentially redundant with preempt_count it is * a little cheaper to use a separate counter in the PDA (short of * moving irq_enter into assembly, which would be too much work) */ -1: incl %gs:pda_irqcount +1: incl PER_CPU_VAR(irq_count) jne 2f popq_cfi %rax /* move return address... */ - mov %gs:pda_irqstackptr,%rsp + mov PER_CPU_VAR(irq_stack_ptr),%rsp EMPTY_FRAME 0 pushq_cfi %rbp /* backlink for unwinder */ pushq_cfi %rax /* ... 
to the new stack */ @@ -407,6 +401,7 @@ END(save_rest) #ifndef CONFIG_XEN /* save complete stack frame */ + .pushsection .kprobes.text, "ax" ENTRY(save_paranoid) XCPT_FRAME 1 RDI+8 cld @@ -435,6 +430,7 @@ ENTRY(save_paranoid) 1: ret CFI_ENDPROC END(save_paranoid) + .popsection #endif /* @@ -445,6 +441,8 @@ END(save_paranoid) ENTRY(ret_from_fork) DEFAULT_FRAME + LOCK ; btr $TIF_FORK,TI_flags(%r8) + push kernel_eflags(%rip) CFI_ADJUST_CFA_OFFSET 8 popf # reset kernel eflags @@ -454,7 +452,6 @@ ENTRY(ret_from_fork) GET_THREAD_INFO(%rcx) - CFI_REMEMBER_STATE RESTORE_REST testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? @@ -466,7 +463,6 @@ ENTRY(ret_from_fork) RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET jmp ret_from_sys_call # go to the SYSRET fastpath - CFI_RESTORE_STATE CFI_ENDPROC END(ret_from_fork) @@ -642,9 +638,7 @@ tracesys: * Syscall return path ending with IRET. * Has correct top of stack, but partial stack frame. */ - .globl int_ret_from_sys_call - .globl int_with_check -int_ret_from_sys_call: +GLOBAL(int_ret_from_sys_call) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testb $3,CS-ARGOFFSET(%rsp) @@ -655,7 +649,7 @@ int_ret_from_sys_call: 1: movl $_TIF_ALLWORK_MASK,%edi /* edi: mask to check */ -int_with_check: +GLOBAL(int_with_check) LOCKDEP_SYS_EXIT_IRQ GET_THREAD_INFO(%rcx) movl TI_flags(%rcx),%edx @@ -877,10 +871,14 @@ apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \ irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt #endif +#ifdef CONFIG_X86_UV apicinterrupt UV_BAU_MESSAGE \ uv_bau_message_intr1 uv_bau_message_interrupt +#endif apicinterrupt LOCAL_TIMER_VECTOR \ apic_timer_interrupt smp_apic_timer_interrupt +apicinterrupt GENERIC_INTERRUPT_VECTOR \ + generic_interrupt smp_generic_interrupt #ifdef CONFIG_SMP apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ @@ -998,15 +996,15 @@ ENTRY(do_hypervisor_callback) # do_hyp movq %rdi, %rsp # we don't return, adjust the stack frame CFI_ENDPROC DEFAULT_FRAME -11: incl %gs:pda_irqcount +11: incl PER_CPU_VAR(irq_count) movq %rsp,%rbp CFI_DEF_CFA_REGISTER rbp - cmovzq %gs:pda_irqstackptr,%rsp + cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp pushq %rbp # backlink for old unwinder call evtchn_do_upcall popq %rsp CFI_DEF_CFA_REGISTER rsp - decl %gs:pda_irqcount + decl PER_CPU_VAR(irq_count) jmp error_exit CFI_ENDPROC END(do_hypervisor_callback) @@ -1197,14 +1195,14 @@ ENTRY(call_softirq) CFI_REL_OFFSET rbp,0 mov %rsp,%rbp CFI_DEF_CFA_REGISTER rbp - incl %gs:pda_irqcount - cmove %gs:pda_irqstackptr,%rsp + incl PER_CPU_VAR(irq_count) + cmove PER_CPU_VAR(irq_stack_ptr),%rsp push %rbp # backlink for old unwinder call __do_softirq leaveq CFI_DEF_CFA_REGISTER rsp CFI_ADJUST_CFA_OFFSET -8 - decl %gs:pda_irqcount + decl PER_CPU_VAR(irq_count) ret CFI_ENDPROC END(call_softirq) @@ -1250,7 +1248,10 @@ ENTRY(paranoid_exit) paranoid_swapgs: TRACE_IRQS_IRETQ 0 SWAPGS_UNSAFE_STACK + RESTORE_ALL 8 + jmp irq_return paranoid_restore: + TRACE_IRQS_IRETQ 0 RESTORE_ALL 8 jmp irq_return paranoid_userspace: --- head-2010-05-25.orig/arch/x86/kernel/head-xen.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/head-xen.c 2010-04-28 17:07:13.000000000 +0200 @@ -2,6 +2,7 @@ #include #include +#ifndef CONFIG_XEN #include #define BIOS_LOWMEM_KILOBYTES 0x413 @@ -18,7 +19,6 @@ */ void __init reserve_ebda_region(void) { -#ifndef CONFIG_XEN unsigned int lowmem, ebda_addr; /* To determine the position of the EBDA and the */ @@ -53,5 +53,174 @@ void __init reserve_ebda_region(void) /* reserve all memory between lowmem and the 1MB mark */ reserve_early_overlap_ok(lowmem, 
0x100000, "BIOS reserved"); +} +#else /* CONFIG_XEN */ +#include +#include +#include +#include +#include +#include +#include + +extern void hypervisor_callback(void); +extern void failsafe_callback(void); +extern void nmi(void); + +#ifdef CONFIG_X86_64 +#include +#define CALLBACK_ADDR(fn) ((unsigned long)(fn)) +#else +#define CALLBACK_ADDR(fn) { __KERNEL_CS, (unsigned long)(fn) } +#endif + +unsigned long *__read_mostly machine_to_phys_mapping = + (void *)MACH2PHYS_VIRT_START; +EXPORT_SYMBOL(machine_to_phys_mapping); +unsigned int __read_mostly machine_to_phys_order; +EXPORT_SYMBOL(machine_to_phys_order); + +void __init xen_start_kernel(void) +{ + unsigned int i; + struct xen_machphys_mapping mapping; + unsigned long machine_to_phys_nr_ents; +#ifdef CONFIG_X86_32 + struct xen_platform_parameters pp; + extern pte_t swapper_pg_fixmap[PTRS_PER_PTE]; + unsigned long addr; +#endif + + xen_setup_features(); + + if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { + machine_to_phys_mapping = (unsigned long *)mapping.v_start; + machine_to_phys_nr_ents = mapping.max_mfn + 1; + } else + machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES; + while ((1UL << machine_to_phys_order) < machine_to_phys_nr_ents ) + machine_to_phys_order++; + + if (!xen_feature(XENFEAT_auto_translated_physmap)) + phys_to_machine_mapping = + (unsigned long *)xen_start_info->mfn_list; + + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable, + VMASST_TYPE_writable_pagetables)); + + reserve_early(ALIGN(__pa_symbol(&_end), PAGE_SIZE), + __pa(xen_start_info->pt_base) + + (xen_start_info->nr_pt_frames << PAGE_SHIFT), + "Xen provided"); + +#ifdef CONFIG_X86_32 + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable, + VMASST_TYPE_4gb_segments)); + + init_mm.pgd = swapper_pg_dir = (pgd_t *)xen_start_info->pt_base; + + if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) { + hypervisor_virt_start = pp.virt_start; + reserve_top_address(0UL - pp.virt_start); + } + + BUG_ON(pte_index(hypervisor_virt_start)); + + /* Do an early initialization of the fixmap area */ + make_lowmem_page_readonly(swapper_pg_fixmap, XENFEAT_writable_page_tables); + addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE); + set_pmd(pmd_offset(pud_offset(swapper_pg_dir + pgd_index(addr), + addr), + addr), + __pmd(__pa_symbol(swapper_pg_fixmap) | _PAGE_TABLE)); +#else + check_efer(); + xen_init_pt(); +#endif + +#define __FIXADDR_TOP (-PAGE_SIZE) +#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)) +#define FIX_BUG_ON(fix) BUILD_BUG_ON(pmd_index(__fix_to_virt(FIX_##fix)) \ + != pmd_index(__fix_to_virt(FIX_EARLYCON_MEM_BASE))) + FIX_BUG_ON(SHARED_INFO); + FIX_BUG_ON(ISAMAP_BEGIN); + FIX_BUG_ON(ISAMAP_END); +#undef pmd_index +#undef __FIXADDR_TOP + + /* Switch to the real shared_info page, and clear the dummy page. */ + set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info); + HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO); + clear_page(empty_zero_page); + + /* Set up mapping of lowest 1MB of physical memory. 
*/ + for (i = 0; i < NR_FIX_ISAMAPS; i++) + if (is_initial_xendomain()) + set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE); + else + __set_fixmap(FIX_ISAMAP_BEGIN - i, + virt_to_machine(empty_zero_page), + PAGE_KERNEL_RO); + +} + +void __init machine_specific_arch_setup(void) +{ + int ret; + static const struct callback_register __initconst event = { + .type = CALLBACKTYPE_event, + .address = CALLBACK_ADDR(hypervisor_callback) + }; + static const struct callback_register __initconst failsafe = { + .type = CALLBACKTYPE_failsafe, + .address = CALLBACK_ADDR(failsafe_callback) + }; +#ifdef CONFIG_X86_64 + static const struct callback_register __initconst syscall = { + .type = CALLBACKTYPE_syscall, + .address = CALLBACK_ADDR(system_call) + }; +#endif +#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_X86_32) + static const struct callback_register __initconst nmi_cb = { + .type = CALLBACKTYPE_nmi, + .address = CALLBACK_ADDR(nmi) + }; +#endif + + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event); + if (ret == 0) + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe); +#ifdef CONFIG_X86_64 + if (ret == 0) + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall); +#endif +#if CONFIG_XEN_COMPAT <= 0x030002 +#ifdef CONFIG_X86_32 + if (ret == -ENOSYS) + ret = HYPERVISOR_set_callbacks( + event.address.cs, event.address.eip, + failsafe.address.cs, failsafe.address.eip); +#else + ret = HYPERVISOR_set_callbacks( + event.address, + failsafe.address, + syscall.address); +#endif +#endif + BUG_ON(ret); + +#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_X86_32) + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (ret == -ENOSYS) { + static struct xennmi_callback __initdata cb = { + .handler_address = (unsigned long)nmi + }; + + HYPERVISOR_nmi_op(XENNMI_register_callback, &cb); + } +#endif #endif } +#endif /* CONFIG_XEN */ --- head-2010-05-25.orig/arch/x86/kernel/head32-xen.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/head32-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -18,7 +19,7 @@ void __init i386_start_kernel(void) { reserve_trampoline_memory(); - reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); + reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); #ifndef CONFIG_XEN #ifdef CONFIG_BLK_DEV_INITRD @@ -30,14 +31,8 @@ void __init i386_start_kernel(void) reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); } #endif - reserve_early(init_pg_tables_start, init_pg_tables_end, - "INIT_PG_TABLE"); + reserve_ebda_region(); #else - reserve_early(ALIGN(__pa_symbol(&_end), PAGE_SIZE), - __pa(xen_start_info->pt_base) - + (xen_start_info->nr_pt_frames << PAGE_SHIFT), - "Xen provided"); - { int max_cmdline; @@ -46,9 +41,9 @@ void __init i386_start_kernel(void) memcpy(boot_command_line, xen_start_info->cmd_line, max_cmdline); boot_command_line[max_cmdline-1] = '\0'; } -#endif - reserve_ebda_region(); + xen_start_kernel(); +#endif /* * At this point everything still needed from the boot loader --- head-2010-05-25.orig/arch/x86/kernel/head64-xen.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/head64-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -7,9 +7,6 @@ * Modified for Xen. */ -/* PDA is not ready to be used until the end of x86_64_start_kernel(). 
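The two translation tables initialized in xen_start_kernel() above are the backing store for the usual Xen frame-number helpers. As a minimal sketch (not part of the patch; the real accessors live in the Xen maddr headers), the two lookup directions are:

    /*
     * Sketch only: how the tables populated above are consumed.
     * Assumes XENFEAT_auto_translated_physmap is not active.
     */
    static inline unsigned long sketch_mfn_to_pfn(unsigned long mfn)
    {
    	/* machine_to_phys_mapping[] is indexed by machine frame number */
    	return machine_to_phys_mapping[mfn];
    }

    static inline unsigned long sketch_pfn_to_mfn(unsigned long pfn)
    {
    	/* phys_to_machine_mapping[] was taken from xen_start_info->mfn_list */
    	return phys_to_machine_mapping[pfn];
    }
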
*/ -#define arch_use_lazy_mmu_mode() false - #include #include #include @@ -18,12 +15,12 @@ #include #include #include -#include #include #include #include #include +#include #include #include #include @@ -33,27 +30,6 @@ #include #include -/* boot cpu pda */ -static struct x8664_pda _boot_cpu_pda; - -#ifdef CONFIG_SMP -/* - * We install an empty cpu_pda pointer table to indicate to early users - * (numa_set_node) that the cpu_pda pointer table for cpus other than - * the boot cpu is not yet setup. - */ -static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata; -#else -static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly; -#endif - -void __init x86_64_init_pda(void) -{ - _cpu_pda = __cpu_pda; - cpu_pda(0) = &_boot_cpu_pda; - pda_init(0); -} - #ifndef CONFIG_XEN static void __init zap_identity_mappings(void) { @@ -92,16 +68,9 @@ static void __init copy_bootdata(char *r } #include -unsigned long *machine_to_phys_mapping; -EXPORT_SYMBOL(machine_to_phys_mapping); -unsigned int machine_to_phys_order; -EXPORT_SYMBOL(machine_to_phys_order); void __init x86_64_start_kernel(char * real_mode_data) { - struct xen_machphys_mapping mapping; - unsigned long machine_to_phys_nr_ents; - /* * Build-time sanity checks on the kernel image and module * area mappings. (these are purely build-time and produce no code) @@ -116,21 +85,8 @@ void __init x86_64_start_kernel(char * r (__START_KERNEL & PGDIR_MASK))); BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); - xen_setup_features(); - xen_start_info = (struct start_info *)real_mode_data; - if (!xen_feature(XENFEAT_auto_translated_physmap)) - phys_to_machine_mapping = - (unsigned long *)xen_start_info->mfn_list; - - machine_to_phys_mapping = (unsigned long *)MACH2PHYS_VIRT_START; - machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES; - if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { - machine_to_phys_mapping = (unsigned long *)mapping.v_start; - machine_to_phys_nr_ents = mapping.max_mfn + 1; - } - while ((1UL << machine_to_phys_order) < machine_to_phys_nr_ents ) - machine_to_phys_order++; + xen_start_kernel(); #ifndef CONFIG_XEN /* clear bss before set_intr_gate with early_idt_handler */ @@ -155,7 +111,7 @@ void __init x86_64_start_kernel(char * r if (console_loglevel == 10) early_printk("Kernel alive\n"); - x86_64_init_pda(); + xen_switch_pt(); x86_64_start_reservations(real_mode_data); } @@ -166,12 +122,7 @@ void __init x86_64_start_reservations(ch reserve_trampoline_memory(); - reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); - - reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE), - __pa(xen_start_info->pt_base) - + (xen_start_info->nr_pt_frames << PAGE_SHIFT), - "Xen provided"); + reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); /* * At this point everything still needed from the boot loader --- head-2010-05-25.orig/arch/x86/kernel/head_32-xen.S 2010-03-24 15:12:36.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/head_32-xen.S 2010-03-24 15:25:06.000000000 +0100 @@ -6,12 +6,14 @@ #include #include #include -#include +#include +#include #include #include #include #include #include +#include #include #include @@ -38,9 +40,6 @@ ENTRY(startup_32) /* Set up the stack pointer */ movl $(init_thread_union+THREAD_SIZE),%esp - movl %ss,%eax - movl %eax,%fs # gets reset once there's real percpu - /* get vendor info */ xorl %eax,%eax # call CPUID with 0 -> return vendor ID XEN_CPUID @@ -63,7 +62,49 @@ ENTRY(startup_32) movb $1,X86_HARD_MATH - xorl %eax,%eax # Clear GS +#ifdef 
CONFIG_CC_STACKPROTECTOR + /* + * The linker can't handle this by relocation. Manually set + * base address in stack canary segment descriptor. + */ + movl $per_cpu__gdt_page,%eax + movl $per_cpu__stack_canary,%ecx + subl $20, %ecx + movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) + shrl $16, %ecx + movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) + movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax) +#endif + + # %esi still points to start_info, and no registers + # need to be preserved. + + movl XEN_START_mfn_list(%esi), %ebx + movl $(per_cpu__gdt_page - __PAGE_OFFSET), %eax + shrl $PAGE_SHIFT, %eax + movl (%ebx,%eax,4), %ecx + pushl %ecx # frame number for set_gdt below + + xorl %esi, %esi + xorl %edx, %edx + shldl $PAGE_SHIFT, %ecx, %edx + shll $PAGE_SHIFT, %ecx + orl $_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_DIRTY, %ecx + movl $per_cpu__gdt_page, %ebx + movl $__HYPERVISOR_update_va_mapping, %eax + int $0x82 + + movl $(PAGE_SIZE_asm / 8), %ecx + movl %esp, %ebx + movl $__HYPERVISOR_set_gdt, %eax + int $0x82 + + popl %ecx + + movl $(__KERNEL_PERCPU), %eax + movl %eax,%fs # set this cpu's percpu + + movl $(__KERNEL_STACK_CANARY),%eax movl %eax,%gs cld # gcc2 wants the direction flag cleared at all times --- head-2010-05-25.orig/arch/x86/kernel/head_64-xen.S 2010-03-24 15:12:46.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/head_64-xen.S 2010-03-24 15:25:06.000000000 +0100 @@ -21,6 +21,7 @@ #include #include #include +#include #include .section .text.head, "ax", @progbits @@ -32,11 +33,23 @@ startup_64: /* rsi is pointer to startup info structure. pass it to C */ movq %rsi,%rdi + + /* Set up %gs. + * + * The base of %gs always points to the bottom of the irqstack + * union. If the stack protector canary is enabled, it is + * located at %gs:40. Note that, on SMP, the boot cpu uses + * init data section till per cpu areas are set up. + */ + movl $MSR_GS_BASE,%ecx + movq $INIT_PER_CPU_VAR(irq_stack_union),%rax + movq %rax,%rdx + shrq $32,%rdx + wrmsr + pushq $0 # fake return address jmp x86_64_start_kernel -.balign PAGE_SIZE - #define NEXT_PAGE(name) \ .balign PAGE_SIZE; \ phys_##name = . - .text.head; \ --- head-2010-05-25.orig/arch/x86/kernel/ioport-xen.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/ioport-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -91,9 +91,8 @@ static int do_iopl(unsigned int level, s } #ifdef CONFIG_X86_32 -asmlinkage long sys_iopl(unsigned long regsp) +long sys_iopl(struct pt_regs *regs) { - struct pt_regs *regs = (struct pt_regs *)®sp; unsigned int level = regs->bx; #else asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) --- head-2010-05-25.orig/arch/x86/kernel/irq-xen.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/irq-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -6,13 +6,20 @@ #include #include #include +#include #include #include #include +#include atomic_t irq_err_count; +#ifndef CONFIG_XEN +/* Function pointer for generic interrupt vector handling */ +void (*generic_interrupt_extension)(void) = NULL; +#endif + /* * 'what should we do if we get a hw irq event on an illegal vector'. * each architecture has to answer this themselves. 
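The generic_interrupt_extension pointer added above is a single-consumer hook: platform code that needs the new vector assigns its handler once, and smp_generic_interrupt() calls through it. A hypothetical consumer (names invented for illustration) would wire it up roughly like this:

    /* Sketch: hypothetical consumer of the generic interrupt vector. */
    static void example_platform_events(void)
    {
    	/* drain the platform-specific event source here */
    }

    static int __init example_platform_init(void)
    {
    	if (generic_interrupt_extension)
    		return -EBUSY;	/* only a single consumer is supported */
    	generic_interrupt_extension = example_platform_events;
    	return 0;
    }
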
@@ -36,11 +43,7 @@ void ack_bad_irq(unsigned int irq) #endif } -#ifdef CONFIG_X86_32 -# define irq_stats(x) (&per_cpu(irq_stat, x)) -#else -# define irq_stats(x) cpu_pda(x) -#endif +#define irq_stats(x) (&per_cpu(irq_stat, x)) /* * /proc/interrupts printing: */ @@ -57,6 +60,19 @@ static int show_other_interrupts(struct for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); seq_printf(p, " Local timer interrupts\n"); + + seq_printf(p, "%*s: ", prec, "SPU"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); + seq_printf(p, " Spurious interrupts\n"); +#endif +#ifndef CONFIG_XEN + if (generic_interrupt_extension) { + seq_printf(p, "%*s: ", prec, "PLT"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->generic_irqs); + seq_printf(p, " Platform interrupts\n"); + } #endif #ifdef CONFIG_SMP seq_printf(p, "%*s: ", prec, "RES"); @@ -86,12 +102,6 @@ static int show_other_interrupts(struct seq_printf(p, " Threshold APIC interrupts\n"); # endif #endif -#ifdef CONFIG_X86_LOCAL_APIC - seq_printf(p, "%*s: ", prec, "SPU"); - for_each_online_cpu(j) - seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); - seq_printf(p, " Spurious interrupts\n"); -#endif seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); @@ -128,23 +138,15 @@ int show_interrupts(struct seq_file *p, return 0; spin_lock_irqsave(&desc->lock, flags); -#ifndef CONFIG_SMP - any_count = kstat_irqs(i); -#else for_each_online_cpu(j) any_count |= kstat_irqs_cpu(i, j); -#endif action = desc->action; if (!action && !any_count) goto out; seq_printf(p, "%*d: ", prec, i); -#ifndef CONFIG_SMP - seq_printf(p, "%10u ", kstat_irqs(i)); -#else for_each_online_cpu(j) seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); -#endif seq_printf(p, " %8s", desc->chip->name); seq_printf(p, "-%-8s", desc->name); @@ -169,6 +171,11 @@ u64 arch_irq_stat_cpu(unsigned int cpu) #ifdef CONFIG_X86_LOCAL_APIC sum += irq_stats(cpu)->apic_timer_irqs; + sum += irq_stats(cpu)->irq_spurious_count; +#endif +#ifndef CONFIG_XEN + if (generic_interrupt_extension) + sum += irq_stats(cpu)->generic_irqs; #endif #ifdef CONFIG_SMP sum += irq_stats(cpu)->irq_resched_count; @@ -183,9 +190,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->irq_threshold_count; #endif #endif -#ifdef CONFIG_X86_LOCAL_APIC - sum += irq_stats(cpu)->irq_spurious_count; -#endif return sum; } @@ -198,3 +202,64 @@ u64 arch_irq_stat(void) #endif return sum; } + + +#ifndef CONFIG_XEN +/* + * do_IRQ handles all normal device IRQ's (the special + * SMP cross-CPU interrupts have their own specific + * handlers). + */ +unsigned int __irq_entry do_IRQ(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + /* high bit used in ret_from_ code */ + unsigned vector = ~regs->orig_ax; + unsigned irq; + + exit_idle(); + irq_enter(); + + irq = __get_cpu_var(vector_irq)[vector]; + + if (!handle_irq(irq, regs)) { +#ifdef CONFIG_X86_64 + if (!disable_apic) + ack_APIC_irq(); +#endif + + if (printk_ratelimit()) + printk(KERN_EMERG "%s: %d.%d No irq handler for vector (irq %d)\n", + __func__, smp_processor_id(), vector, irq); + } + + irq_exit(); + + set_irq_regs(old_regs); + return 1; +} + +/* + * Handler for GENERIC_INTERRUPT_VECTOR. 
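The `~regs->orig_ax` in do_IRQ() above relies on the convention that the interrupt entry stubs push the one's complement of the vector number: the stored value is always negative, so an interrupt frame can never be confused with a syscall frame, whose orig_ax holds a non-negative syscall number. In miniature (illustrative program, not kernel code):

    #include <stdio.h>

    int main(void)
    {
    	unsigned int vector = 0x31;	/* example hardware vector */
    	long orig_ax = ~(long)vector;	/* what the stub pushes: always < 0 */
    	unsigned int decoded = ~orig_ax;	/* what do_IRQ() recovers */

    	printf("stored %ld, decoded 0x%x\n", orig_ax, decoded);
    	return 0;
    }
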
+ */ +void smp_generic_interrupt(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + ack_APIC_irq(); + + exit_idle(); + + irq_enter(); + + inc_irq_stat(generic_irqs); + + if (generic_interrupt_extension) + generic_interrupt_extension(); + + irq_exit(); + + set_irq_regs(old_regs); +} +#endif --- head-2010-05-25.orig/arch/x86/kernel/machine_kexec_64.c 2010-04-15 10:03:05.000000000 +0200 +++ head-2010-05-25/arch/x86/kernel/machine_kexec_64.c 2010-04-15 10:07:08.000000000 +0200 @@ -92,13 +92,8 @@ void machine_kexec_setup_load_arg(xen_ke xki->page_list[PA_CONTROL_PAGE] = __ma(control_page); xki->page_list[PA_TABLE_PAGE] = __ma(table_page); - xki->page_list[PA_PGD] = __ma(kexec_pgd); - xki->page_list[PA_PUD_0] = __ma(kexec_pud0); - xki->page_list[PA_PUD_1] = __ma(kexec_pud1); - xki->page_list[PA_PMD_0] = __ma(kexec_pmd0); - xki->page_list[PA_PMD_1] = __ma(kexec_pmd1); - xki->page_list[PA_PTE_0] = __ma(kexec_pte0); - xki->page_list[PA_PTE_1] = __ma(kexec_pte1); + if (image->type == KEXEC_TYPE_DEFAULT) + xki->page_list[PA_SWAP_PAGE] = page_to_phys(image->swap_page); } int __init machine_kexec_setup_resources(struct resource *hypervisor, @@ -161,7 +156,7 @@ static int init_one_level2_page(struct k } pmd = pmd_offset(pud, addr); if (!pmd_present(*pmd)) - set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); + x_set_pmd(pmd, x__pmd(addr | X__PAGE_KERNEL_LARGE_EXEC)); result = 0; out: return result; --- head-2010-05-25.orig/arch/x86/kernel/microcode_core-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/microcode_core-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -21,28 +21,28 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ +#include #include -#include -#include -#include +#include +#include #include +#include #include -#include -#include +#include #include -#include -#include -#include -#include +#include +#include #include +#include +#include +#include #include -#include -#include +#include +#include -#include -#include -#include #include +#include +#include MODULE_DESCRIPTION("Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian "); @@ -51,7 +51,7 @@ MODULE_LICENSE("GPL"); static int verbose; module_param(verbose, int, 0644); -#define MICROCODE_VERSION "2.00-xen" +#define MICROCODE_VERSION "2.00-xen" /* no concurrent ->write()s are allowed on /dev/cpu/microcode */ static DEFINE_MUTEX(microcode_mutex); @@ -143,12 +143,12 @@ static void microcode_dev_exit(void) MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); #else -#define microcode_dev_init() 0 -#define microcode_dev_exit() do { } while (0) +#define microcode_dev_init() 0 +#define microcode_dev_exit() do { } while (0) #endif /* fake device for request_firmware */ -static struct platform_device *microcode_pdev; +static struct platform_device *microcode_pdev; static int request_microcode(const char *name) { --- head-2010-05-25.orig/arch/x86/kernel/mpparse-xen.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/mpparse-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -3,7 +3,7 @@ * compliant MP-table parsing routines. 
* * (c) 1995 Alan Cox, Building #3 - * (c) 1998, 1999, 2000 Ingo Molnar + * (c) 1998, 1999, 2000, 2009 Ingo Molnar * (c) 2008 Alexey Starikovskiy */ @@ -29,11 +29,7 @@ #include #include -#include -#ifdef CONFIG_X86_32 -#include -#include -#endif +#include static void *_bus_to_virt(unsigned long ma) { @@ -123,9 +119,6 @@ static void __init MP_bus_info(struct mp } else printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); } -#endif - -#ifdef CONFIG_X86_IO_APIC static int bad_ioapic(unsigned long address) { @@ -153,11 +146,11 @@ static void __init MP_ioapic_info(struct if (bad_ioapic(m->apicaddr)) return; - mp_ioapics[nr_ioapics].mp_apicaddr = m->apicaddr; - mp_ioapics[nr_ioapics].mp_apicid = m->apicid; - mp_ioapics[nr_ioapics].mp_type = m->type; - mp_ioapics[nr_ioapics].mp_apicver = m->apicver; - mp_ioapics[nr_ioapics].mp_flags = m->flags; + mp_ioapics[nr_ioapics].apicaddr = m->apicaddr; + mp_ioapics[nr_ioapics].apicid = m->apicid; + mp_ioapics[nr_ioapics].type = m->type; + mp_ioapics[nr_ioapics].apicver = m->apicver; + mp_ioapics[nr_ioapics].flags = m->flags; nr_ioapics++; } @@ -169,55 +162,55 @@ static void print_MP_intsrc_info(struct m->srcbusirq, m->dstapic, m->dstirq); } -static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq) +static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq) { apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," " IRQ %02x, APIC ID %x, APIC INT %02x\n", - mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3, - (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus, - mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq); + mp_irq->irqtype, mp_irq->irqflag & 3, + (mp_irq->irqflag >> 2) & 3, mp_irq->srcbus, + mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq); } static void __init assign_to_mp_irq(struct mpc_intsrc *m, - struct mp_config_intsrc *mp_irq) + struct mpc_intsrc *mp_irq) { - mp_irq->mp_dstapic = m->dstapic; - mp_irq->mp_type = m->type; - mp_irq->mp_irqtype = m->irqtype; - mp_irq->mp_irqflag = m->irqflag; - mp_irq->mp_srcbus = m->srcbus; - mp_irq->mp_srcbusirq = m->srcbusirq; - mp_irq->mp_dstirq = m->dstirq; + mp_irq->dstapic = m->dstapic; + mp_irq->type = m->type; + mp_irq->irqtype = m->irqtype; + mp_irq->irqflag = m->irqflag; + mp_irq->srcbus = m->srcbus; + mp_irq->srcbusirq = m->srcbusirq; + mp_irq->dstirq = m->dstirq; } -static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq, +static void __init assign_to_mpc_intsrc(struct mpc_intsrc *mp_irq, struct mpc_intsrc *m) { - m->dstapic = mp_irq->mp_dstapic; - m->type = mp_irq->mp_type; - m->irqtype = mp_irq->mp_irqtype; - m->irqflag = mp_irq->mp_irqflag; - m->srcbus = mp_irq->mp_srcbus; - m->srcbusirq = mp_irq->mp_srcbusirq; - m->dstirq = mp_irq->mp_dstirq; + m->dstapic = mp_irq->dstapic; + m->type = mp_irq->type; + m->irqtype = mp_irq->irqtype; + m->irqflag = mp_irq->irqflag; + m->srcbus = mp_irq->srcbus; + m->srcbusirq = mp_irq->srcbusirq; + m->dstirq = mp_irq->dstirq; } -static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq, +static int __init mp_irq_mpc_intsrc_cmp(struct mpc_intsrc *mp_irq, struct mpc_intsrc *m) { - if (mp_irq->mp_dstapic != m->dstapic) + if (mp_irq->dstapic != m->dstapic) return 1; - if (mp_irq->mp_type != m->type) + if (mp_irq->type != m->type) return 2; - if (mp_irq->mp_irqtype != m->irqtype) + if (mp_irq->irqtype != m->irqtype) return 3; - if (mp_irq->mp_irqflag != m->irqflag) + if (mp_irq->irqflag != m->irqflag) return 4; - if (mp_irq->mp_srcbus != m->srcbus) + if (mp_irq->srcbus != m->srcbus) return 5; - if 
(mp_irq->mp_srcbusirq != m->srcbusirq) + if (mp_irq->srcbusirq != m->srcbusirq) return 6; - if (mp_irq->mp_dstirq != m->dstirq) + if (mp_irq->dstirq != m->dstirq) return 7; return 0; @@ -238,8 +231,12 @@ static void __init MP_intsrc_info(struct if (++mp_irq_entries == MAX_IRQ_SOURCES) panic("Max # of irq sources exceeded!!\n"); } +#else /* CONFIG_X86_IO_APIC */ +static inline void __init MP_bus_info(struct mpc_bus *m) {} +static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {} +static inline void __init MP_intsrc_info(struct mpc_intsrc *m) {} +#endif /* CONFIG_X86_IO_APIC */ -#endif static void __init MP_lintsrc_info(struct mpc_lintsrc *m) { @@ -289,6 +286,20 @@ static int __init smp_check_mpc(struct m return 1; } +static void skip_entry(unsigned char **ptr, int *count, int size) +{ + *ptr += size; + *count += size; +} + +static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt) +{ + printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n" + "type %x\n", *mpt); + print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16, + 1, mpc, mpc->length, 1); +} + static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) { char str[16]; @@ -300,17 +311,8 @@ static int __init smp_read_mpc(struct mp if (!smp_check_mpc(mpc, oem, str)) return 0; -#ifdef CONFIG_X86_32 - /* - * need to make sure summit and es7000's mps_oem_check is safe to be - * called early via genericarch 's mps_oem_check - */ - if (early) { -#ifdef CONFIG_X86_NUMAQ - numaq_mps_oem_check(mpc, oem, str); -#endif - } else - mps_oem_check(mpc, oem, str); +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN) + generic_mps_oem_check(mpc, oem, str); #endif /* save the local APIC address, it might be non-default */ if (!acpi_lapic) @@ -333,61 +335,30 @@ static int __init smp_read_mpc(struct mp while (count < mpc->length) { switch (*mpt) { case MP_PROCESSOR: - { - struct mpc_cpu *m = (struct mpc_cpu *)mpt; - /* ACPI may have already provided this data */ - if (!acpi_lapic) - MP_processor_info(m); - mpt += sizeof(*m); - count += sizeof(*m); - break; - } + /* ACPI may have already provided this data */ + if (!acpi_lapic) + MP_processor_info((struct mpc_cpu *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_cpu)); + break; case MP_BUS: - { - struct mpc_bus *m = (struct mpc_bus *)mpt; -#ifdef CONFIG_X86_IO_APIC - MP_bus_info(m); -#endif - mpt += sizeof(*m); - count += sizeof(*m); - break; - } + MP_bus_info((struct mpc_bus *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_bus)); + break; case MP_IOAPIC: - { -#ifdef CONFIG_X86_IO_APIC - struct mpc_ioapic *m = (struct mpc_ioapic *)mpt; - MP_ioapic_info(m); -#endif - mpt += sizeof(struct mpc_ioapic); - count += sizeof(struct mpc_ioapic); - break; - } + MP_ioapic_info((struct mpc_ioapic *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_ioapic)); + break; case MP_INTSRC: - { -#ifdef CONFIG_X86_IO_APIC - struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; - - MP_intsrc_info(m); -#endif - mpt += sizeof(struct mpc_intsrc); - count += sizeof(struct mpc_intsrc); - break; - } + MP_intsrc_info((struct mpc_intsrc *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_intsrc)); + break; case MP_LINTSRC: - { - struct mpc_lintsrc *m = - (struct mpc_lintsrc *)mpt; - MP_lintsrc_info(m); - mpt += sizeof(*m); - count += sizeof(*m); - break; - } + MP_lintsrc_info((struct mpc_lintsrc *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc)); + break; default: /* wrong mptable */ - printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"); - printk(KERN_ERR 
"type %x\n", *mpt); - print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16, - 1, mpc, mpc->length, 1); + smp_dump_mptable(mpc, mpt); count = mpc->length; break; } @@ -395,13 +366,13 @@ static int __init smp_read_mpc(struct mp (*x86_quirks->mpc_record)++; } -#ifdef CONFIG_X86_GENERICARCH - generic_bigsmp_probe(); +#ifdef CONFIG_X86_BIGSMP + generic_bigsmp_probe(); #endif -#ifdef CONFIG_X86_32 - setup_apic_routing(); -#endif + if (apic->setup_apic_routing) + apic->setup_apic_routing(); + if (!num_processors) printk(KERN_ERR "MPTABLE: no processors registered!\n"); return num_processors; @@ -426,7 +397,7 @@ static void __init construct_default_ioi intsrc.type = MP_INTSRC; intsrc.irqflag = 0; /* conforming */ intsrc.srcbus = 0; - intsrc.dstapic = mp_ioapics[0].mp_apicid; + intsrc.dstapic = mp_ioapics[0].apicid; intsrc.irqtype = mp_INT; @@ -579,14 +550,76 @@ static inline void __init construct_defa } } -static struct intel_mp_floating *mpf_found; +static struct mpf_intel *mpf_found; + +static unsigned long __init get_mpc_size(unsigned long physptr) +{ + struct mpc_table *mpc; + unsigned long size; + + mpc = early_ioremap(physptr, PAGE_SIZE); + size = mpc->length; + early_iounmap(mpc, PAGE_SIZE); + apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size); + + return size; +} + +static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) +{ + struct mpc_table *mpc; + unsigned long size; + + size = get_mpc_size(mpf->physptr); + mpc = early_ioremap(mpf->physptr, size); + /* + * Read the physical hardware table. Anything here will + * override the defaults. + */ + if (!smp_read_mpc(mpc, early)) { +#ifdef CONFIG_X86_LOCAL_APIC + smp_found_config = 0; +#endif + printk(KERN_ERR "BIOS bug, MP table errors detected!...\n" + "... disabling SMP support. (tell your hw vendor)\n"); + early_iounmap(mpc, size); + return -1; + } + early_iounmap(mpc, size); + + if (early) + return -1; + +#ifdef CONFIG_X86_IO_APIC + /* + * If there are no explicit MP IRQ entries, then we are + * broken. We set up most of the low 16 IO-APIC pins to + * ISA defaults and hope it will work. + */ + if (!mp_irq_entries) { + struct mpc_bus bus; + + printk(KERN_ERR "BIOS bug, no explicit IRQ entries, " + "using default mptable. (tell your hw vendor)\n"); + + bus.type = MP_BUS; + bus.busid = 0; + memcpy(bus.bustype, "ISA ", 6); + MP_bus_info(&bus); + + construct_default_ioirq_mptable(0); + } +#endif + + return 0; +} /* * Scan the memory blocks for an SMP configuration block. */ static void __init __get_smp_config(unsigned int early) { - struct intel_mp_floating *mpf = mpf_found; + struct mpf_intel *mpf = mpf_found; if (!mpf) return; @@ -607,9 +640,9 @@ static void __init __get_smp_config(unsi } printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", - mpf->mpf_specification); + mpf->specification); #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) && !defined(CONFIG_XEN) - if (mpf->mpf_feature2 & (1 << 7)) { + if (mpf->feature2 & (1 << 7)) { printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); pic_mode = 1; } else { @@ -620,7 +653,7 @@ static void __init __get_smp_config(unsi /* * Now see if we need to read further. 
*/ - if (mpf->mpf_feature1 != 0) { + if (mpf->feature1 != 0) { if (early) { /* * local APIC has default address @@ -630,49 +663,12 @@ static void __init __get_smp_config(unsi } printk(KERN_INFO "Default MP configuration #%d\n", - mpf->mpf_feature1); - construct_default_ISA_mptable(mpf->mpf_feature1); - - } else if (mpf->mpf_physptr) { - - /* - * Read the physical hardware table. Anything here will - * override the defaults. - */ - if (!smp_read_mpc(_bus_to_virt(mpf->mpf_physptr), early)) { -#ifdef CONFIG_X86_LOCAL_APIC - smp_found_config = 0; -#endif - printk(KERN_ERR - "BIOS bug, MP table errors detected!...\n"); - printk(KERN_ERR "... disabling SMP support. " - "(tell your hw vendor)\n"); - return; - } + mpf->feature1); + construct_default_ISA_mptable(mpf->feature1); - if (early) + } else if (mpf->physptr) { + if (check_physptr(mpf, early)) return; -#ifdef CONFIG_X86_IO_APIC - /* - * If there are no explicit MP IRQ entries, then we are - * broken. We set up most of the low 16 IO-APIC pins to - * ISA defaults and hope it will work. - */ - if (!mp_irq_entries) { - struct mpc_bus bus; - - printk(KERN_ERR "BIOS bug, no explicit IRQ entries, " - "using default mptable. " - "(tell your hw vendor)\n"); - - bus.type = MP_BUS; - bus.busid = 0; - memcpy(bus.bustype, "ISA ", 6); - MP_bus_info(&bus); - - construct_default_ioirq_mptable(0); - } -#endif } else BUG(); @@ -693,58 +689,68 @@ void __init get_smp_config(void) __get_smp_config(0); } +#ifndef CONFIG_XEN +static void __init smp_reserve_bootmem(struct mpf_intel *mpf) +{ + unsigned long size = get_mpc_size(mpf->physptr); +#ifdef CONFIG_X86_32 + /* + * We cannot access to MPC table to compute table size yet, + * as only few megabytes from the bottom is mapped now. + * PC-9800's MPC table places on the very last of physical + * memory; so that simply reserving PAGE_SIZE from mpf->physptr + * yields BUG() in reserve_bootmem. + * also need to make sure physptr is below than max_low_pfn + * we don't need reserve the area above max_low_pfn + */ + unsigned long end = max_low_pfn * PAGE_SIZE; + + if (mpf->physptr < end) { + if (mpf->physptr + size > end) + size = end - mpf->physptr; + reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT); + } +#else + reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT); +#endif +} +#endif + static int __init smp_scan_config(unsigned long base, unsigned long length, unsigned reserve) { unsigned int *bp = _bus_to_virt(base); - struct intel_mp_floating *mpf; + struct mpf_intel *mpf; apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n", bp, length); BUILD_BUG_ON(sizeof(*mpf) != 16); while (length > 0) { - mpf = (struct intel_mp_floating *)bp; + mpf = (struct mpf_intel *)bp; if ((*bp == SMP_MAGIC_IDENT) && - (mpf->mpf_length == 1) && + (mpf->length == 1) && !mpf_checksum((unsigned char *)bp, 16) && - ((mpf->mpf_specification == 1) - || (mpf->mpf_specification == 4))) { + ((mpf->specification == 1) + || (mpf->specification == 4))) { #ifdef CONFIG_X86_LOCAL_APIC smp_found_config = 1; #endif mpf_found = mpf; #ifndef CONFIG_XEN - printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n", - mpf, virt_to_phys(mpf)); + printk(KERN_INFO "found SMP MP-table at [%p] %llx\n", + mpf, (u64)virt_to_phys(mpf)); if (!reserve) return 1; - reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE, - BOOTMEM_DEFAULT); - if (mpf->mpf_physptr) { - unsigned long size = PAGE_SIZE; -#ifdef CONFIG_X86_32 - /* - * We cannot access to MPC table to compute - * table size yet, as only few megabytes from - * the bottom is mapped now. 
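smp_scan_config() accepts a candidate floating pointer only when mpf_checksum() over its 16 bytes comes out to zero; the helper is the classic byte-sum check, equivalent to:

    /* Sketch of the byte-sum check used to validate the floating pointer. */
    static int sketch_mpf_checksum(unsigned char *mp, int len)
    {
    	int sum = 0;

    	while (len--)
    		sum += *mp++;
    	return sum & 0xFF;	/* zero means the structure checksums OK */
    }
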
- * PC-9800's MPC table places on the very last - * of physical memory; so that simply reserving - * PAGE_SIZE from mpg->mpf_physptr yields BUG() - * in reserve_bootmem. - */ - unsigned long end = max_low_pfn * PAGE_SIZE; - if (mpf->mpf_physptr + size > end) - size = end - mpf->mpf_physptr; -#endif - reserve_bootmem_generic(mpf->mpf_physptr, size, + reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf), BOOTMEM_DEFAULT); - } + if (mpf->physptr) + smp_reserve_bootmem(mpf); #else printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n", - mpf, ((void *)bp - _bus_to_virt(base)) + base); + mpf, ((void *)bp - _bus_to_virt(base)) + base); #endif return 1; } @@ -826,15 +832,15 @@ static int __init get_MP_intsrc_index(s /* not legacy */ for (i = 0; i < mp_irq_entries; i++) { - if (mp_irqs[i].mp_irqtype != mp_INT) + if (mp_irqs[i].irqtype != mp_INT) continue; - if (mp_irqs[i].mp_irqflag != 0x0f) + if (mp_irqs[i].irqflag != 0x0f) continue; - if (mp_irqs[i].mp_srcbus != m->srcbus) + if (mp_irqs[i].srcbus != m->srcbus) continue; - if (mp_irqs[i].mp_srcbusirq != m->srcbusirq) + if (mp_irqs[i].srcbusirq != m->srcbusirq) continue; if (irq_used[i]) { /* already claimed */ @@ -851,7 +857,58 @@ static int __init get_MP_intsrc_index(s #define SPARE_SLOT_NUM 20 static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM]; -#endif + +static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) +{ + int i; + + apic_printk(APIC_VERBOSE, "OLD "); + print_MP_intsrc_info(m); + + i = get_MP_intsrc_index(m); + if (i > 0) { + assign_to_mpc_intsrc(&mp_irqs[i], m); + apic_printk(APIC_VERBOSE, "NEW "); + print_mp_irq_info(&mp_irqs[i]); + return; + } + if (!i) { + /* legacy, do nothing */ + return; + } + if (*nr_m_spare < SPARE_SLOT_NUM) { + /* + * not found (-1), or duplicated (-2) are invalid entries, + * we need to use the slot later + */ + m_spare[*nr_m_spare] = m; + *nr_m_spare += 1; + } +} +#else /* CONFIG_X86_IO_APIC */ +static +inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {} +#endif /* CONFIG_X86_IO_APIC */ + +static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, + int count) +{ + if (!mpc_new_phys) { + pr_info("No spare slots, try to append...take your risk, " + "new mpc_length %x\n", count); + } else { + if (count <= mpc_new_length) + pr_info("No spare slots, try to append..., " + "new mpc_length %x\n", count); + else { + pr_err("mpc_new_length %lx is too small\n", + mpc_new_length); + return -1; + } + } + + return 0; +} static int __init replace_intsrc_all(struct mpc_table *mpc, unsigned long mpc_new_phys, @@ -859,77 +916,33 @@ static int __init replace_intsrc_all(st { #ifdef CONFIG_X86_IO_APIC int i; - int nr_m_spare = 0; #endif - int count = sizeof(*mpc); + int nr_m_spare = 0; unsigned char *mpt = ((unsigned char *)mpc) + count; printk(KERN_INFO "mpc_length %x\n", mpc->length); while (count < mpc->length) { switch (*mpt) { case MP_PROCESSOR: - { - struct mpc_cpu *m = (struct mpc_cpu *)mpt; - mpt += sizeof(*m); - count += sizeof(*m); - break; - } + skip_entry(&mpt, &count, sizeof(struct mpc_cpu)); + break; case MP_BUS: - { - struct mpc_bus *m = (struct mpc_bus *)mpt; - mpt += sizeof(*m); - count += sizeof(*m); - break; - } + skip_entry(&mpt, &count, sizeof(struct mpc_bus)); + break; case MP_IOAPIC: - { - mpt += sizeof(struct mpc_ioapic); - count += sizeof(struct mpc_ioapic); - break; - } + skip_entry(&mpt, &count, sizeof(struct mpc_ioapic)); + break; case MP_INTSRC: - { -#ifdef CONFIG_X86_IO_APIC - struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; - 
- apic_printk(APIC_VERBOSE, "OLD "); - print_MP_intsrc_info(m); - i = get_MP_intsrc_index(m); - if (i > 0) { - assign_to_mpc_intsrc(&mp_irqs[i], m); - apic_printk(APIC_VERBOSE, "NEW "); - print_mp_irq_info(&mp_irqs[i]); - } else if (!i) { - /* legacy, do nothing */ - } else if (nr_m_spare < SPARE_SLOT_NUM) { - /* - * not found (-1), or duplicated (-2) - * are invalid entries, - * we need to use the slot later - */ - m_spare[nr_m_spare] = m; - nr_m_spare++; - } -#endif - mpt += sizeof(struct mpc_intsrc); - count += sizeof(struct mpc_intsrc); - break; - } + check_irq_src((struct mpc_intsrc *)mpt, &nr_m_spare); + skip_entry(&mpt, &count, sizeof(struct mpc_intsrc)); + break; case MP_LINTSRC: - { - struct mpc_lintsrc *m = - (struct mpc_lintsrc *)mpt; - mpt += sizeof(*m); - count += sizeof(*m); - break; - } + skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc)); + break; default: /* wrong mptable */ - printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"); - printk(KERN_ERR "type %x\n", *mpt); - print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16, - 1, mpc, mpc->length, 1); + smp_dump_mptable(mpc, mpt); goto out; } } @@ -939,10 +952,10 @@ static int __init replace_intsrc_all(st if (irq_used[i]) continue; - if (mp_irqs[i].mp_irqtype != mp_INT) + if (mp_irqs[i].irqtype != mp_INT) continue; - if (mp_irqs[i].mp_irqflag != 0x0f) + if (mp_irqs[i].irqflag != 0x0f) continue; if (nr_m_spare > 0) { @@ -953,16 +966,8 @@ static int __init replace_intsrc_all(st } else { struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; count += sizeof(struct mpc_intsrc); - if (!mpc_new_phys) { - printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count); - } else { - if (count <= mpc_new_length) - printk(KERN_INFO "No spare slots, try to append..., new mpc_length %x\n", count); - else { - printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length); - goto out; - } - } + if (!check_slot(mpc_new_phys, mpc_new_length, count)) + goto out; assign_to_mpc_intsrc(&mp_irqs[i], m); mpc->length = count; mpt += sizeof(struct mpc_intsrc); @@ -1018,7 +1023,7 @@ static int __init update_mp_table(void) { char str[16]; char oem[10]; - struct intel_mp_floating *mpf; + struct mpf_intel *mpf; struct mpc_table *mpc, *mpc_new; if (!enable_update_mptable) @@ -1031,19 +1036,19 @@ static int __init update_mp_table(void) /* * Now see if we need to go further. 
*/ - if (mpf->mpf_feature1 != 0) + if (mpf->feature1 != 0) return 0; - if (!mpf->mpf_physptr) + if (!mpf->physptr) return 0; - mpc = _bus_to_virt(mpf->mpf_physptr); + mpc = _bus_to_virt(mpf->physptr); if (!smp_check_mpc(mpc, oem, str)) return 0; - printk(KERN_INFO "mpf: %lx\n", (long)arbitrary_virt_to_machine(mpf)); - printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr); + printk(KERN_INFO "mpf: %llx\n", (u64)arbitrary_virt_to_machine(mpf)); + printk(KERN_INFO "physptr: %x\n", mpf->physptr); if (mpc_new_phys && mpc->length > mpc_new_length) { mpc_new_phys = 0; @@ -1067,23 +1072,23 @@ static int __init update_mp_table(void) maddr_t mpc_new_bus; mpc_new_bus = phys_to_machine(mpc_new_phys); - mpf->mpf_physptr = mpc_new_bus; + mpf->physptr = mpc_new_bus; mpc_new = phys_to_virt(mpc_new_phys); memcpy(mpc_new, mpc, mpc->length); mpc = mpc_new; /* check if we can modify that */ - if (mpc_new_bus - mpf->mpf_physptr) { - struct intel_mp_floating *mpf_new; + if (mpc_new_bus - mpf->physptr) { + struct mpf_intel *mpf_new; /* steal 16 bytes from [0, 1k) */ printk(KERN_INFO "mpf new: %x\n", 0x400 - 16); mpf_new = isa_bus_to_virt(0x400 - 16); memcpy(mpf_new, mpf, 16); mpf = mpf_new; - mpf->mpf_physptr = mpc_new_bus; + mpf->physptr = mpc_new_bus; } - mpf->mpf_checksum = 0; - mpf->mpf_checksum -= mpf_checksum((unsigned char *)mpf, 16); - printk(KERN_INFO "mpf_physptr new: %x\n", mpf->mpf_physptr); + mpf->checksum = 0; + mpf->checksum -= mpf_checksum((unsigned char *)mpf, 16); + printk(KERN_INFO "physptr new: %x\n", mpf->physptr); } /* --- head-2010-05-25.orig/arch/x86/kernel/pci-dma-xen.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/pci-dma-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -12,7 +13,7 @@ static int forbid_dac __read_mostly; -struct dma_mapping_ops *dma_ops; +struct dma_map_ops *dma_ops; EXPORT_SYMBOL(dma_ops); static int iommu_sac_force __read_mostly; @@ -39,11 +40,14 @@ EXPORT_SYMBOL(bad_dma_address); to older i386. 
*/ struct device x86_dma_fallback_dev = { .init_name = "fallback device", - .coherent_dma_mask = DMA_32BIT_MASK, + .coherent_dma_mask = DMA_BIT_MASK(32), .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask, }; EXPORT_SYMBOL(x86_dma_fallback_dev); +/* Number of entries preallocated for DMA-API debugging */ +#define PREALLOC_DMA_DEBUG_ENTRIES 32768 + int dma_set_mask(struct device *dev, u64 mask) { if (!dev->dma_mask || !dma_supported(dev, mask)) @@ -103,20 +107,20 @@ static void __init dma32_free_bootmem(vo } #endif -static struct dma_mapping_ops swiotlb_dma_ops = { +static struct dma_map_ops swiotlb_dma_ops = { .alloc_coherent = dma_generic_alloc_coherent, .free_coherent = dma_generic_free_coherent, .mapping_error = swiotlb_dma_mapping_error, - .map_single = swiotlb_map_single_phys, - .unmap_single = swiotlb_unmap_single, + .map_page = swiotlb_map_page, + .unmap_page = swiotlb_unmap_page, .sync_single_for_cpu = swiotlb_sync_single_for_cpu, .sync_single_for_device = swiotlb_sync_single_for_device, .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu, .sync_single_range_for_device = swiotlb_sync_single_range_for_device, .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, .sync_sg_for_device = swiotlb_sync_sg_for_device, - .map_sg = swiotlb_map_sg, - .unmap_sg = swiotlb_unmap_sg, + .map_sg = swiotlb_map_sg_attrs, + .unmap_sg = swiotlb_unmap_sg_attrs, .dma_supported = swiotlb_dma_supported }; @@ -175,7 +179,7 @@ again: if (!is_buffer_dma_capable(dma_mask, addr, size)) { __free_pages(page, order); - if (dma_mask < DMA_32BIT_MASK && !(flag & GFP_DMA)) { + if (dma_mask < DMA_BIT_MASK(32) && !(flag & GFP_DMA)) { flag = (flag & ~GFP_DMA32) | GFP_DMA; goto again; } @@ -305,7 +309,7 @@ int range_straddles_page_boundary(paddr_ int dma_supported(struct device *dev, u64 mask) { - struct dma_mapping_ops *ops = get_dma_ops(dev); + struct dma_map_ops *ops = get_dma_ops(dev); #ifdef CONFIG_PCI if (mask > 0xffffffff && forbid_dac > 0) { @@ -320,7 +324,7 @@ int dma_supported(struct device *dev, u6 /* Copied from i386. Doesn't make much sense, because it will only work for pci_alloc_coherent. The caller just has to use GFP_DMA in this case. */ - if (mask < DMA_24BIT_MASK) + if (mask < DMA_BIT_MASK(24)) return 0; /* Tell the device to use SAC when IOMMU force is on. This @@ -335,7 +339,7 @@ int dma_supported(struct device *dev, u6 SAC for these. Assume all masks <= 40 bits are of this type. Normally this doesn't make any difference, but gives more gentle handling of IOMMU overflow. */ - if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) { + if (iommu_sac_force && (mask >= DMA_BIT_MASK(40))) { dev_info(dev, "Force SAC with mask %Lx\n", mask); return 0; } @@ -346,6 +350,12 @@ EXPORT_SYMBOL(dma_supported); static int __init pci_iommu_init(void) { + dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES); + +#ifdef CONFIG_PCI + dma_debug_add_bus(&pci_bus_type); +#endif + calgary_iommu_init(); intel_iommu_init(); @@ -371,8 +381,7 @@ fs_initcall(pci_iommu_init); static __devinit void via_no_dac(struct pci_dev *dev) { if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { - printk(KERN_INFO - "PCI: VIA PCI bridge detected. 
Disabling DAC.\n"); + dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n"); forbid_dac = 1; } } --- head-2010-05-25.orig/arch/x86/kernel/pci-nommu-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/pci-nommu-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -24,7 +24,7 @@ do { \ static int gnttab_map_sg(struct device *hwdev, struct scatterlist *sgl, int nents, - int direction) + enum dma_data_direction dir, struct dma_attrs *attrs) { unsigned int i; struct scatterlist *sg; @@ -48,7 +48,7 @@ gnttab_map_sg(struct device *hwdev, stru static void gnttab_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nents, - int direction) + enum dma_data_direction dir, struct dma_attrs *attrs) { unsigned int i; struct scatterlist *sg; @@ -58,24 +58,25 @@ gnttab_unmap_sg(struct device *hwdev, st } static dma_addr_t -gnttab_map_single(struct device *dev, phys_addr_t paddr, size_t size, - int direction) +gnttab_map_page(struct device *dev, struct page *page, unsigned long offset, + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) { dma_addr_t dma; WARN_ON(size == 0); - dma = gnttab_dma_map_page(pfn_to_page(paddr >> PAGE_SHIFT)) + - offset_in_page(paddr); - IOMMU_BUG_ON(range_straddles_page_boundary(paddr, size)); + dma = gnttab_dma_map_page(page) + offset; + IOMMU_BUG_ON(range_straddles_page_boundary(page_to_pseudophys(page) + + offset, size)); IOMMU_BUG_ON(address_needs_mapping(dev, dma, size)); return dma; } static void -gnttab_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, - int direction) +gnttab_unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, + enum dma_data_direction dir, struct dma_attrs *attrs) { gnttab_dma_unmap_page(dma_addr); } @@ -85,14 +86,14 @@ static int nommu_dma_supported(struct de return 1; } -struct dma_mapping_ops nommu_dma_ops = { - .alloc_coherent = dma_generic_alloc_coherent, - .free_coherent = dma_generic_free_coherent, - .map_single = gnttab_map_single, - .unmap_single = gnttab_unmap_single, - .map_sg = gnttab_map_sg, - .unmap_sg = gnttab_unmap_sg, - .dma_supported = nommu_dma_supported, +struct dma_map_ops nommu_dma_ops = { + .alloc_coherent = dma_generic_alloc_coherent, + .free_coherent = dma_generic_free_coherent, + .map_page = gnttab_map_page, + .unmap_page = gnttab_unmap_page, + .map_sg = gnttab_map_sg, + .unmap_sg = gnttab_unmap_sg, + .dma_supported = nommu_dma_supported, }; void __init no_iommu_init(void) --- head-2010-05-25.orig/arch/x86/kernel/process-xen.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/process-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -1,16 +1,19 @@ #include #include #include -#include #include +#include #include #include #include #include #include -#include +#include #include #include +#include +#include +#include #include unsigned long idle_halt; @@ -20,6 +23,9 @@ EXPORT_SYMBOL(idle_nomwait); struct kmem_cache *task_xstate_cachep; +DEFINE_TRACE(power_start); +DEFINE_TRACE(power_end); + int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { *dst = *src; @@ -57,6 +63,179 @@ void arch_task_cache_init(void) } /* + * Free current thread data structures etc.. 
+ */ +void exit_thread(void) +{ + struct task_struct *me = current; + struct thread_struct *t = &me->thread; + unsigned long *bp = t->io_bitmap_ptr; + + if (bp) { + struct physdev_set_iobitmap set_iobitmap; + + t->io_bitmap_ptr = NULL; + clear_thread_flag(TIF_IO_BITMAP); + /* + * Careful, clear this in the TSS too: + */ + memset(&set_iobitmap, 0, sizeof(set_iobitmap)); + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, + &set_iobitmap)); + t->io_bitmap_max = 0; + kfree(bp); + } + + ds_exit_thread(current); +} + +void flush_thread(void) +{ + struct task_struct *tsk = current; + +#ifdef CONFIG_X86_64 + if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) { + clear_tsk_thread_flag(tsk, TIF_ABI_PENDING); + if (test_tsk_thread_flag(tsk, TIF_IA32)) { + clear_tsk_thread_flag(tsk, TIF_IA32); + } else { + set_tsk_thread_flag(tsk, TIF_IA32); + current_thread_info()->status |= TS_COMPAT; + } + } +#endif + + clear_tsk_thread_flag(tsk, TIF_DEBUG); + + tsk->thread.debugreg0 = 0; + tsk->thread.debugreg1 = 0; + tsk->thread.debugreg2 = 0; + tsk->thread.debugreg3 = 0; + tsk->thread.debugreg6 = 0; + tsk->thread.debugreg7 = 0; + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); + /* + * Forget coprocessor state.. + */ + tsk->fpu_counter = 0; + clear_fpu(tsk); + clear_used_math(); +} + +static void hard_disable_TSC(void) +{ + write_cr4(read_cr4() | X86_CR4_TSD); +} + +void disable_TSC(void) +{ + preempt_disable(); + if (!test_and_set_thread_flag(TIF_NOTSC)) + /* + * Must flip the CPU state synchronously with + * TIF_NOTSC in the current running context. + */ + hard_disable_TSC(); + preempt_enable(); +} + +static void hard_enable_TSC(void) +{ + write_cr4(read_cr4() & ~X86_CR4_TSD); +} + +static void enable_TSC(void) +{ + preempt_disable(); + if (test_and_clear_thread_flag(TIF_NOTSC)) + /* + * Must flip the CPU state synchronously with + * TIF_NOTSC in the current running context. + */ + hard_enable_TSC(); + preempt_enable(); +} + +int get_tsc_mode(unsigned long adr) +{ + unsigned int val; + + if (test_thread_flag(TIF_NOTSC)) + val = PR_TSC_SIGSEGV; + else + val = PR_TSC_ENABLE; + + return put_user(val, (unsigned int __user *)adr); +} + +int set_tsc_mode(unsigned int val) +{ + if (val == PR_TSC_SIGSEGV) + disable_TSC(); + else if (val == PR_TSC_ENABLE) + enable_TSC(); + else + return -EINVAL; + + return 0; +} + +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) +{ + struct thread_struct *prev, *next; + + prev = &prev_p->thread; + next = &next_p->thread; + + if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || + test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) + ds_switch_to(prev_p, next_p); + else if (next->debugctlmsr != prev->debugctlmsr) + update_debugctlmsr(next->debugctlmsr); + + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { + set_debugreg(next->debugreg0, 0); + set_debugreg(next->debugreg1, 1); + set_debugreg(next->debugreg2, 2); + set_debugreg(next->debugreg3, 3); + /* no 4 and 5 */ + set_debugreg(next->debugreg6, 6); + set_debugreg(next->debugreg7, 7); + } + + if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ + test_tsk_thread_flag(next_p, TIF_NOTSC)) { + /* prev and next are different */ + if (test_tsk_thread_flag(next_p, TIF_NOTSC)) + hard_disable_TSC(); + else + hard_enable_TSC(); + } +} + +int sys_fork(struct pt_regs *regs) +{ + return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL); +} + +/* + * This is trivial, and on the face of it looks like it + * could equally well be done in user mode. + * + * Not so, for quite unobvious reasons - register pressure. 
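The get_tsc_mode()/set_tsc_mode() pair consolidated here backs the PR_GET_TSC/PR_SET_TSC prctls. From userspace the mechanism is exercised roughly as follows (illustrative program, not part of the patch):

    #include <stdio.h>
    #include <sys/prctl.h>
    #include <linux/prctl.h>	/* PR_SET_TSC, PR_TSC_SIGSEGV, ... */

    int main(void)
    {
    	int mode;

    	/* deliver SIGSEGV on RDTSC instead of permitting it */
    	if (prctl(PR_SET_TSC, PR_TSC_SIGSEGV, 0, 0, 0))
    		perror("PR_SET_TSC");

    	if (prctl(PR_GET_TSC, &mode, 0, 0, 0) == 0)
    		printf("tsc mode: %d\n", mode);
    	return 0;
    }
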
+ * In user mode vfork() cannot have a stack frame, and if + * done by calling the "clone()" system call directly, you + * do not have enough call-clobbered registers to hold all + * the information you need. + */ +int sys_vfork(struct pt_regs *regs) +{ + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0, + NULL, NULL); +} + + +/* * Idle related variables and functions */ unsigned long boot_option_idle_override = 0; @@ -130,7 +309,7 @@ void stop_this_cpu(void *dummy) /* * Remove this CPU: */ - cpu_clear(smp_processor_id(), cpu_online_map); + set_cpu_online(smp_processor_id(), false); disable_all_local_evtchn(); for (;;) { @@ -283,12 +462,13 @@ static int __cpuinit check_c1e_idle(cons return 1; } -static cpumask_t c1e_mask = CPU_MASK_NONE; +static cpumask_var_t c1e_mask; static int c1e_detected; void c1e_remove_cpu(int cpu) { - cpu_clear(cpu, c1e_mask); + if (c1e_mask != NULL) + cpumask_clear_cpu(cpu, c1e_mask); } /* @@ -317,8 +497,8 @@ static void c1e_idle(void) if (c1e_detected) { int cpu = smp_processor_id(); - if (!cpu_isset(cpu, c1e_mask)) { - cpu_set(cpu, c1e_mask); + if (!cpumask_test_cpu(cpu, c1e_mask)) { + cpumask_set_cpu(cpu, c1e_mask); /* * Force broadcast so ACPI can not interfere. Needs * to run with interrupts enabled as it uses @@ -350,7 +530,7 @@ static void c1e_idle(void) void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) { #ifndef CONFIG_XEN -#ifdef CONFIG_X86_SMP +#ifdef CONFIG_SMP if (pm_idle == poll_idle && smp_num_siblings > 1) { printk(KERN_WARNING "WARNING: polling idle and HT enabled," " performance may degrade.\n"); @@ -373,6 +553,17 @@ void __cpuinit select_idle_routine(const #endif } +void __init init_c1e_mask(void) +{ +#ifndef CONFIG_XEN + /* If we're using c1e_idle, we need to allocate c1e_mask. */ + if (pm_idle == c1e_idle) { + alloc_cpumask_var(&c1e_mask, GFP_KERNEL); + cpumask_clear(c1e_mask); + } +#endif +} + static int __init idle_setup(char *str) { if (!str) --- head-2010-05-25.orig/arch/x86/kernel/process_32-xen.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/process_32-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -11,6 +11,7 @@ #include +#include #include #include #include @@ -71,9 +72,6 @@ asmlinkage void cstar_ret_from_fork(void DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); -DEFINE_PER_CPU(int, cpu_number); -EXPORT_PER_CPU_SYMBOL(cpu_number); - /* * Return saved PC of a blocked thread. */ @@ -99,6 +97,15 @@ void cpu_idle(void) { int cpu = smp_processor_id(); + /* + * If we're the non-boot CPU, nothing set the stack canary up + * for us. CPU0 already has it initialized but no harm in + * doing it again. This is a good place for updating it, as + * we wont ever return from this function (so the invalid + * canaries already on the stack wont ever trigger). 
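boot_init_stack_canary(), now called at the top of cpu_idle(), seeds the value that GCC's -fstack-protector instrumentation compares against on function exit. Conceptually it does something like the following (a loose sketch, not the real body; the actual code also publishes the value at the %gs-relative slot the compiler probes, e.g. offset 40 within irq_stack_union on 64-bit):

    /* Conceptual sketch of boot_init_stack_canary() (not the real body). */
    static void sketch_init_stack_canary(void)
    {
    	u64 canary, tsc;

    	get_random_bytes(&canary, sizeof(canary));
    	tsc = native_read_tsc();
    	canary += tsc + (tsc << 32UL);	/* fold in a little extra entropy */

    	current->stack_canary = canary;
    	/* real code: also store it where -fstack-protector reads it */
    }
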
+ */ + boot_init_stack_canary(); + current_thread_info()->status |= TS_POLLING; /* endless idle loop with no priority at all */ @@ -113,7 +120,6 @@ void cpu_idle(void) play_dead(); local_irq_disable(); - __get_cpu_var(irq_stat).idle_timestamp = jiffies; /* Don't trace irqs off for idle */ stop_critical_timings(); xen_idle(); @@ -137,7 +143,7 @@ void __show_regs(struct pt_regs *regs, i if (user_mode_vm(regs)) { sp = regs->sp; ss = regs->ss & 0xffff; - savesegment(gs, gs); + gs = get_user_gs(regs); } else { sp = (unsigned long) (®s->sp); savesegment(ss, ss); @@ -218,6 +224,7 @@ int kernel_thread(int (*fn)(void *), voi regs.ds = __USER_DS; regs.es = __USER_DS; regs.fs = __KERNEL_PERCPU; + regs.gs = __KERNEL_STACK_CANARY; regs.orig_ax = -1; regs.ip = (unsigned long) kernel_thread_helper; regs.cs = __KERNEL_CS | get_kernel_rpl(); @@ -228,47 +235,6 @@ int kernel_thread(int (*fn)(void *), voi } EXPORT_SYMBOL(kernel_thread); -/* - * Free current thread data structures etc.. - */ -void exit_thread(void) -{ - /* The process may have allocated an io port bitmap... nuke it. */ - if (unlikely(test_thread_flag(TIF_IO_BITMAP))) { - struct task_struct *tsk = current; - struct thread_struct *t = &tsk->thread; - struct physdev_set_iobitmap set_iobitmap; - memset(&set_iobitmap, 0, sizeof(set_iobitmap)); - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, - &set_iobitmap)); - kfree(t->io_bitmap_ptr); - t->io_bitmap_ptr = NULL; - clear_thread_flag(TIF_IO_BITMAP); - } - - ds_exit_thread(current); -} - -void flush_thread(void) -{ - struct task_struct *tsk = current; - - tsk->thread.debugreg0 = 0; - tsk->thread.debugreg1 = 0; - tsk->thread.debugreg2 = 0; - tsk->thread.debugreg3 = 0; - tsk->thread.debugreg6 = 0; - tsk->thread.debugreg7 = 0; - memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); - clear_tsk_thread_flag(tsk, TIF_DEBUG); - /* - * Forget coprocessor state.. - */ - tsk->fpu_counter = 0; - clear_fpu(tsk); - clear_used_math(); -} - void release_thread(struct task_struct *dead_task) { BUG_ON(dead_task->mm); @@ -284,7 +250,7 @@ void prepare_to_copy(struct task_struct unlazy_fpu(tsk); } -int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, +int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { @@ -302,7 +268,7 @@ int copy_thread(int nr, unsigned long cl p->thread.ip = (unsigned long) ret_from_fork; - savesegment(gs, p->thread.gs); + task_user_gs(p) = get_user_gs(regs); tsk = current; if (test_tsk_thread_flag(tsk, TIF_CSTAR)) @@ -344,7 +310,7 @@ int copy_thread(int nr, unsigned long cl void start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) { - __asm__("movl %0, %%gs" : : "r"(0)); + set_user_gs(regs, 0); regs->fs = 0; set_fs(USER_DS); regs->ds = __USER_DS; @@ -360,98 +326,6 @@ start_thread(struct pt_regs *regs, unsig } EXPORT_SYMBOL_GPL(start_thread); -static void hard_disable_TSC(void) -{ - write_cr4(read_cr4() | X86_CR4_TSD); -} - -void disable_TSC(void) -{ - preempt_disable(); - if (!test_and_set_thread_flag(TIF_NOTSC)) - /* - * Must flip the CPU state synchronously with - * TIF_NOTSC in the current running context. - */ - hard_disable_TSC(); - preempt_enable(); -} - -static void hard_enable_TSC(void) -{ - write_cr4(read_cr4() & ~X86_CR4_TSD); -} - -static void enable_TSC(void) -{ - preempt_disable(); - if (test_and_clear_thread_flag(TIF_NOTSC)) - /* - * Must flip the CPU state synchronously with - * TIF_NOTSC in the current running context. 
- */ - hard_enable_TSC(); - preempt_enable(); -} - -int get_tsc_mode(unsigned long adr) -{ - unsigned int val; - - if (test_thread_flag(TIF_NOTSC)) - val = PR_TSC_SIGSEGV; - else - val = PR_TSC_ENABLE; - - return put_user(val, (unsigned int __user *)adr); -} - -int set_tsc_mode(unsigned int val) -{ - if (val == PR_TSC_SIGSEGV) - disable_TSC(); - else if (val == PR_TSC_ENABLE) - enable_TSC(); - else - return -EINVAL; - - return 0; -} - -static noinline void -__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) -{ - struct thread_struct *prev, *next; - - prev = &prev_p->thread; - next = &next_p->thread; - - if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || - test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) - ds_switch_to(prev_p, next_p); - else if (next->debugctlmsr != prev->debugctlmsr) - update_debugctlmsr(next->debugctlmsr); - - if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { - set_debugreg(next->debugreg0, 0); - set_debugreg(next->debugreg1, 1); - set_debugreg(next->debugreg2, 2); - set_debugreg(next->debugreg3, 3); - /* no 4 and 5 */ - set_debugreg(next->debugreg6, 6); - set_debugreg(next->debugreg7, 7); - } - - if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ - test_tsk_thread_flag(next_p, TIF_NOTSC)) { - /* prev and next are different */ - if (test_tsk_thread_flag(next_p, TIF_NOTSC)) - hard_disable_TSC(); - else - hard_enable_TSC(); - } -} - /* * switch_to(x,yn) should switch tasks from x to y. * @@ -532,7 +406,7 @@ __switch_to(struct task_struct *prev_p, if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \ next->tls_array[i].b != prev->tls_array[i].b)) { \ mcl->op = __HYPERVISOR_update_descriptor; \ - *(u64 *)&mcl->args[0] = virt_to_machine( \ + *(u64 *)&mcl->args[0] = arbitrary_virt_to_machine( \ &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\ *(u64 *)&mcl->args[2] = *(u64 *)&next->tls_array[i]; \ mcl++; \ @@ -612,64 +486,44 @@ __switch_to(struct task_struct *prev_p, * Restore %gs if needed (which is common) */ if (prev->gs | next->gs) - loadsegment(gs, next->gs); + lazy_load_gs(next->gs); - x86_write_percpu(current_task, next_p); + percpu_write(current_task, next_p); return prev_p; } -asmlinkage int sys_fork(struct pt_regs regs) -{ - return do_fork(SIGCHLD, regs.sp, ®s, 0, NULL, NULL); -} - -asmlinkage int sys_clone(struct pt_regs regs) +int sys_clone(struct pt_regs *regs) { unsigned long clone_flags; unsigned long newsp; int __user *parent_tidptr, *child_tidptr; - clone_flags = regs.bx; - newsp = regs.cx; - parent_tidptr = (int __user *)regs.dx; - child_tidptr = (int __user *)regs.di; + clone_flags = regs->bx; + newsp = regs->cx; + parent_tidptr = (int __user *)regs->dx; + child_tidptr = (int __user *)regs->di; if (!newsp) - newsp = regs.sp; - return do_fork(clone_flags, newsp, ®s, 0, parent_tidptr, child_tidptr); -} - -/* - * This is trivial, and on the face of it looks like it - * could equally well be done in user mode. - * - * Not so, for quite unobvious reasons - register pressure. - * In user mode vfork() cannot have a stack frame, and if - * done by calling the "clone()" system call directly, you - * do not have enough call-clobbered registers to hold all - * the information you need. - */ -asmlinkage int sys_vfork(struct pt_regs regs) -{ - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.sp, ®s, 0, NULL, NULL); + newsp = regs->sp; + return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr); } /* * sys_execve() executes a new program. 
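
sys_clone() and sys_vfork() now take a struct pt_regs pointer instead of the whole register frame by value. The register-pressure comment that travels with sys_vfork() is observable from user space: CLONE_VM makes the child borrow the parent's memory and CLONE_VFORK suspends the parent until the child is gone. Illustration only, not part of the patch; POSIX allows a vfork() child to do nothing except _exit() or exec, so the write to a global below is a Linux-specific way to make the sharing visible:

#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

static volatile int note;

int main(void)
{
    pid_t pid = vfork();    /* do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, ...) */

    if (pid == 0) {
        note = 42;          /* CLONE_VM: this lands in the parent's memory */
        _exit(0);           /* CLONE_VFORK: only now may the parent resume */
    }

    /* Unreachable before the child has exited. */
    printf("child ran first, note=%d\n", note);
    wait(NULL);
    return 0;
}
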
*/ -asmlinkage int sys_execve(struct pt_regs regs) +int sys_execve(struct pt_regs *regs) { int error; char *filename; - filename = getname((char __user *) regs.bx); + filename = getname((char __user *) regs->bx); error = PTR_ERR(filename); if (IS_ERR(filename)) goto out; error = do_execve(filename, - (char __user * __user *) regs.cx, - (char __user * __user *) regs.dx, - ®s); + (char __user * __user *) regs->cx, + (char __user * __user *) regs->dx, + regs); if (error == 0) { /* Make sure we don't return using sysenter.. */ set_thread_flag(TIF_IRET); --- head-2010-05-25.orig/arch/x86/kernel/process_64-xen.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/process_64-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -19,6 +19,7 @@ #include +#include #include #include #include @@ -50,7 +51,6 @@ #include #include #include -#include #include #include #include @@ -67,6 +67,11 @@ asmlinkage extern void ret_from_fork(void); +DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); + +static DEFINE_PER_CPU(unsigned char, is_idle); + unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; static ATOMIC_NOTIFIER_HEAD(idle_notifier); @@ -85,13 +90,13 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregist void enter_idle(void) { - write_pda(isidle, 1); + percpu_write(is_idle, 1); atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); } static void __exit_idle(void) { - if (test_and_clear_bit_pda(0, isidle) == 0) + if (x86_test_and_clear_bit_percpu(0, is_idle) == 0) return; atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); } @@ -121,6 +126,16 @@ static inline void play_dead(void) void cpu_idle(void) { current_thread_info()->status |= TS_POLLING; + + /* + * If we're the non-boot CPU, nothing set the stack canary up + * for us. CPU0 already has it initialized but no harm in + * doing it again. This is a good place for updating it, as + * we wont ever return from this function (so the invalid + * canaries already on the stack wont ever trigger). + */ + boot_init_stack_canary(); + /* endless idle loop with no priority at all */ while (1) { tick_nohz_stop_sched_tick(1); @@ -230,78 +245,11 @@ void show_regs(struct pt_regs *regs) show_trace(NULL, regs, (void *)(regs + 1), regs->bp); } -/* - * Free current thread data structures etc.. 
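
The process_64-xen.c hunks above are part of the 2.6.30 switch from the x86-64 PDA to ordinary per-CPU variables: write_pda(isidle, 1) becomes percpu_write(is_idle, 1), with the variable still addressed through the %gs segment base. The closest user-space relative is compiler-level thread-local storage, which x86 reaches through a segment register in the same way; a runnable analogue with one instance per thread rather than per CPU (names are illustrative; build with gcc -pthread):

#include <pthread.h>
#include <stdio.h>

/* One instance per thread, addressed off a segment register in user
 * space, the same trick percpu_write()/percpu_read() use with %gs
 * inside the kernel. */
static __thread int is_idle;

static void *worker(void *arg)
{
    is_idle = (int)(long)arg;    /* analogous to percpu_write(is_idle, 1) */
    printf("thread %ld: is_idle=%d at %p\n",
           (long)arg, is_idle, (void *)&is_idle);
    return NULL;
}

int main(void)
{
    pthread_t a, b;

    pthread_create(&a, NULL, worker, (void *)1L);
    pthread_create(&b, NULL, worker, (void *)2L);
    pthread_join(a, NULL);
    pthread_join(b, NULL);
    return 0;
}
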
- */ -void exit_thread(void) -{ - struct task_struct *me = current; - struct thread_struct *t = &me->thread; - - if (me->thread.io_bitmap_ptr) { -#ifndef CONFIG_X86_NO_TSS - struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); -#endif -#ifdef CONFIG_XEN - struct physdev_set_iobitmap iobmp_op; - memset(&iobmp_op, 0, sizeof(iobmp_op)); -#endif - - kfree(t->io_bitmap_ptr); - t->io_bitmap_ptr = NULL; - clear_thread_flag(TIF_IO_BITMAP); - /* - * Careful, clear this in the TSS too: - */ -#ifndef CONFIG_X86_NO_TSS - memset(tss->io_bitmap, 0xff, t->io_bitmap_max); - put_cpu(); -#endif -#ifdef CONFIG_XEN - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, - &iobmp_op)); -#endif - t->io_bitmap_max = 0; - } - - ds_exit_thread(current); -} - void xen_load_gs_index(unsigned gs) { WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs)); } -void flush_thread(void) -{ - struct task_struct *tsk = current; - - if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) { - clear_tsk_thread_flag(tsk, TIF_ABI_PENDING); - if (test_tsk_thread_flag(tsk, TIF_IA32)) { - clear_tsk_thread_flag(tsk, TIF_IA32); - } else { - set_tsk_thread_flag(tsk, TIF_IA32); - current_thread_info()->status |= TS_COMPAT; - } - } - clear_tsk_thread_flag(tsk, TIF_DEBUG); - - tsk->thread.debugreg0 = 0; - tsk->thread.debugreg1 = 0; - tsk->thread.debugreg2 = 0; - tsk->thread.debugreg3 = 0; - tsk->thread.debugreg6 = 0; - tsk->thread.debugreg7 = 0; - memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); - /* - * Forget coprocessor state.. - */ - tsk->fpu_counter = 0; - clear_fpu(tsk); - clear_used_math(); -} - void release_thread(struct task_struct *dead_task) { if (dead_task->mm) { @@ -343,7 +291,7 @@ void prepare_to_copy(struct task_struct unlazy_fpu(tsk); } -int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, +int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { @@ -434,103 +382,6 @@ start_thread(struct pt_regs *regs, unsig } EXPORT_SYMBOL_GPL(start_thread); -static void hard_disable_TSC(void) -{ - write_cr4(read_cr4() | X86_CR4_TSD); -} - -void disable_TSC(void) -{ - preempt_disable(); - if (!test_and_set_thread_flag(TIF_NOTSC)) - /* - * Must flip the CPU state synchronously with - * TIF_NOTSC in the current running context. - */ - hard_disable_TSC(); - preempt_enable(); -} - -static void hard_enable_TSC(void) -{ - write_cr4(read_cr4() & ~X86_CR4_TSD); -} - -static void enable_TSC(void) -{ - preempt_disable(); - if (test_and_clear_thread_flag(TIF_NOTSC)) - /* - * Must flip the CPU state synchronously with - * TIF_NOTSC in the current running context. 
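
The exit_thread() body removed above has one real job: freeing the per-thread I/O port bitmap, which on this tree is also pushed to the hypervisor via the PHYSDEVOP_set_iobitmap call visible in the deleted code. That state is created from user space with ioperm(2); a sketch, assuming root (CAP_SYS_RAWIO) on x86, with port 0x80 chosen only because writes to the POST debug port are conventionally harmless:

#include <stdio.h>
#include <sys/io.h>    /* ioperm(), outb(): x86 glibc only */

int main(void)
{
    /* This is what allocates thread->io_bitmap_ptr, the structure the
     * deleted exit_thread() above was responsible for freeing. */
    if (ioperm(0x80, 1, 1) != 0) {
        perror("ioperm");
        return 1;
    }
    outb(0x42, 0x80);    /* harmless write to the debug port */
    puts("port 0x80 written via the per-thread I/O bitmap");
    return ioperm(0x80, 1, 0);    /* drop the permission again */
}
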
- */ - hard_enable_TSC(); - preempt_enable(); -} - -int get_tsc_mode(unsigned long adr) -{ - unsigned int val; - - if (test_thread_flag(TIF_NOTSC)) - val = PR_TSC_SIGSEGV; - else - val = PR_TSC_ENABLE; - - return put_user(val, (unsigned int __user *)adr); -} - -int set_tsc_mode(unsigned int val) -{ - if (val == PR_TSC_SIGSEGV) - disable_TSC(); - else if (val == PR_TSC_ENABLE) - enable_TSC(); - else - return -EINVAL; - - return 0; -} - -/* - * This special macro can be used to load a debugging register - */ -#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r) - -static inline void __switch_to_xtra(struct task_struct *prev_p, - struct task_struct *next_p) -{ - struct thread_struct *prev, *next; - - prev = &prev_p->thread, - next = &next_p->thread; - - if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || - test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) - ds_switch_to(prev_p, next_p); - else if (next->debugctlmsr != prev->debugctlmsr) - update_debugctlmsr(next->debugctlmsr); - - if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { - loaddebug(next, 0); - loaddebug(next, 1); - loaddebug(next, 2); - loaddebug(next, 3); - /* no 4 and 5 */ - loaddebug(next, 6); - loaddebug(next, 7); - } - - if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ - test_tsk_thread_flag(next_p, TIF_NOTSC)) { - /* prev and next are different */ - if (test_tsk_thread_flag(next_p, TIF_NOTSC)) - hard_disable_TSC(); - else - hard_enable_TSC(); - } -} - /* * switch_to(x,y) should switch tasks from x to y. * @@ -596,7 +447,7 @@ __switch_to(struct task_struct *prev_p, if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \ next->tls_array[i].b != prev->tls_array[i].b)) { \ mcl->op = __HYPERVISOR_update_descriptor; \ - mcl->args[0] = virt_to_machine( \ + mcl->args[0] = arbitrary_virt_to_machine( \ &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\ mcl->args[1] = *(u64 *)&next->tls_array[i]; \ mcl++; \ @@ -683,19 +534,11 @@ __switch_to(struct task_struct *prev_p, /* * Switch the PDA context. */ - write_pda(pcurrent, next_p); - write_pda(kernelstack, - (unsigned long)task_stack_page(next_p) + - THREAD_SIZE - PDA_STACKOFFSET); -#ifdef CONFIG_CC_STACKPROTECTOR - write_pda(stack_canary, next_p->stack_canary); + percpu_write(current_task, next_p); - /* - * Build time only check to make sure the stack_canary is at - * offset 40 in the pda; this is a gcc ABI requirement - */ - BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40); -#endif + percpu_write(kernel_stack, + (unsigned long)task_stack_page(next_p) + + THREAD_SIZE - KERNEL_STACK_OFFSET); /* * Now maybe reload the debug registers @@ -749,11 +592,6 @@ void set_personality_64bit(void) current->personality &= ~READ_IMPLIES_EXEC; } -asmlinkage long sys_fork(struct pt_regs *regs) -{ - return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL); -} - asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) @@ -763,22 +601,6 @@ sys_clone(unsigned long clone_flags, uns return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); } -/* - * This is trivial, and on the face of it looks like it - * could equally well be done in user mode. - * - * Not so, for quite unobvious reasons - register pressure. - * In user mode vfork() cannot have a stack frame, and if - * done by calling the "clone()" system call directly, you - * do not have enough call-clobbered registers to hold all - * the information you need. 
- */ -asmlinkage long sys_vfork(struct pt_regs *regs) -{ - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0, - NULL, NULL); -} - unsigned long get_wchan(struct task_struct *p) { unsigned long stack; --- head-2010-05-25.orig/arch/x86/kernel/quirks-xen.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/quirks-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -75,8 +75,7 @@ static void ich_force_hpet_resume(void) if (!force_hpet_address) return; - if (rcba_base == NULL) - BUG(); + BUG_ON(rcba_base == NULL); /* read the Function Disable register, dword mode only */ val = readl(rcba_base + 0x3404); @@ -173,7 +172,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_I ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7, ich_force_enable_hpet); - +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x3a16, /* ICH10 */ + ich_force_enable_hpet); static struct pci_dev *cached_dev; @@ -262,8 +262,6 @@ static void old_ich_force_enable_hpet_us { if (hpet_force_user) old_ich_force_enable_hpet(dev); - else - hpet_print_force_info(); } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB_1, --- head-2010-05-25.orig/arch/x86/kernel/setup-xen.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/setup-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -74,14 +74,15 @@ #include #include #include -#include #include +#include +#include #include #include #include #include #include -#include +#include #include #include #include @@ -89,7 +90,7 @@ #include #include -#include +#include #include #include #include @@ -97,7 +98,6 @@ #include #include -#include #include #include @@ -118,9 +118,6 @@ #include #include -shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page; -EXPORT_SYMBOL(HYPERVISOR_shared_info); - static int xen_panic_event(struct notifier_block *, unsigned long, void *); static struct notifier_block xen_panic_block = { xen_panic_event, NULL, 0 /* try to go last */ @@ -145,7 +142,26 @@ EXPORT_SYMBOL(xen_start_info); #define ARCH_SETUP #endif +RESERVE_BRK(dmi_alloc, 65536); + +unsigned int boot_cpu_id __read_mostly; + +static __initdata unsigned long _brk_start = (unsigned long)__brk_base; +unsigned long _brk_end = (unsigned long)__brk_base; + #ifndef CONFIG_XEN +#ifdef CONFIG_X86_64 +int default_cpu_present_to_apicid(int mps_cpu) +{ + return __default_cpu_present_to_apicid(mps_cpu); +} + +int default_check_phys_apicid_present(int boot_cpu_physical_apicid) +{ + return __default_check_phys_apicid_present(boot_cpu_physical_apicid); +} +#endif + #ifndef CONFIG_DEBUG_BOOT_PARAMS struct boot_params __initdata boot_params; #else @@ -179,14 +195,6 @@ static struct resource bss_resource = { #ifdef CONFIG_X86_32 -#ifndef CONFIG_XEN -/* This value is set up by the early boot code to point to the value - immediately after the boot time page tables. It contains a *physical* - address, and must not be in the .bss segment! 
*/ -unsigned long init_pg_tables_start __initdata = ~0UL; -unsigned long init_pg_tables_end __initdata = ~0UL; -#endif - static struct resource video_ram_resource = { .name = "Video RAM area", .start = 0xa0000, @@ -226,7 +234,9 @@ struct ist_info ist_info; #endif #else -struct cpuinfo_x86 boot_cpu_data __read_mostly; +struct cpuinfo_x86 boot_cpu_data __read_mostly = { + .x86_phys_bits = MAX_PHYSMEM_BITS, +}; EXPORT_SYMBOL(boot_cpu_data); #endif @@ -241,12 +251,6 @@ unsigned long mmu_cr4_features = X86_CR4 int bootloader_type; /* - * Early DMI memory - */ -int dmi_alloc_index; -char dmi_alloc_data[DMI_MAX_DATA]; - -/* * Setup options */ struct screen_info screen_info; @@ -293,6 +297,35 @@ static inline void copy_edd(void) } #endif +void * __init extend_brk(size_t size, size_t align) +{ + size_t mask = align - 1; + void *ret; + + BUG_ON(_brk_start == 0); + BUG_ON(align & mask); + + _brk_end = (_brk_end + mask) & ~mask; + BUG_ON((char *)(_brk_end + size) > __brk_limit); + + ret = (void *)_brk_end; + _brk_end += size; + + memset(ret, 0, size); + + return ret; +} + +static void __init reserve_brk(void) +{ + if (_brk_end > _brk_start) + reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK"); + + /* Mark brk area as locked down and no longer taking any + new allocations */ + _brk_start = 0; +} + #ifdef CONFIG_BLK_DEV_INITRD #if defined(CONFIG_X86_32) && !defined(CONFIG_XEN) @@ -653,24 +686,7 @@ static int __init setup_elfcorehdr(char early_param("elfcorehdr", setup_elfcorehdr); #endif -#ifndef CONFIG_XEN -static int __init default_update_genapic(void) -{ -#ifdef CONFIG_X86_SMP -# if defined(CONFIG_X86_GENERICARCH) || defined(CONFIG_X86_64) - genapic->wakeup_cpu = wakeup_secondary_cpu_via_init; -# endif -#endif - - return 0; -} -#else -#define default_update_genapic NULL -#endif - -static struct x86_quirks default_x86_quirks __initdata = { - .update_genapic = default_update_genapic, -}; +static struct x86_quirks default_x86_quirks __initdata; struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; @@ -738,19 +754,11 @@ void __init setup_arch(char **cmdline_p) /* Register a call for panic conditions. */ atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block); - - WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable, - VMASST_TYPE_writable_pagetables)); -#ifdef CONFIG_X86_32 - WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable, - VMASST_TYPE_4gb_segments)); -#endif #endif /* CONFIG_XEN */ #ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); visws_early_detect(); - pre_setup_arch_hook(); #else printk(KERN_INFO "Command line: %s\n", boot_command_line); #endif @@ -834,16 +842,7 @@ void __init setup_arch(char **cmdline_p) init_mm.start_code = (unsigned long) _text; init_mm.end_code = (unsigned long) _etext; init_mm.end_data = (unsigned long) _edata; -#ifdef CONFIG_X86_32 -#ifndef CONFIG_XEN - init_mm.brk = init_pg_tables_end + PAGE_OFFSET; -#else - init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) + - xen_start_info->nr_pt_frames) << PAGE_SHIFT; -#endif -#else - init_mm.brk = (unsigned long) &_end; -#endif + init_mm.brk = _brk_end; code_resource.start = virt_to_phys(_text); code_resource.end = virt_to_phys(_etext)-1; @@ -956,9 +955,8 @@ void __init setup_arch(char **cmdline_p) num_physpages = max_pfn; max_mapnr = max_pfn; -#ifndef CONFIG_XEN - if (cpu_has_x2apic) - check_x2apic(); +#ifdef CONFIG_X86_LOCAL_APIC + check_x2apic(); #endif /* How many end-of-memory variables you have, grandma! 
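
extend_brk() introduced above is a boot-time bump allocator over the region set aside by RESERVE_BRK(), and reserve_brk() later seals it. The arithmetic is the usual power-of-two round-up. The same logic as a self-contained user-space sketch (the names, array size, and 4096-byte alignment are illustrative):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Stand-ins for __brk_base.._brk_end; the alignment attribute makes
 * offset alignment equal absolute alignment, as the page-aligned brk
 * section does in the kernel. */
static unsigned char brk_area[65536] __attribute__((aligned(4096)));
static uintptr_t brk_end;

static void *extend_brk_demo(size_t size, size_t align)
{
    uintptr_t mask = align - 1;
    void *ret;

    assert((align & mask) == 0);           /* power of two, as the BUG_ON checks */

    brk_end = (brk_end + mask) & ~mask;    /* round up, exactly as extend_brk() */
    assert(brk_end + size <= sizeof(brk_area));

    ret = brk_area + brk_end;
    brk_end += size;
    memset(ret, 0, size);                  /* extend_brk() also returns zeroed memory */
    return ret;
}

int main(void)
{
    void *a = extend_brk_demo(100, 16);
    void *b = extend_brk_demo(100, 64);

    printf("a=%p b=%p (b is 64-byte aligned)\n", a, b);
    return 0;
}
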
*/ @@ -975,6 +973,8 @@ void __init setup_arch(char **cmdline_p) setup_bios_corruption_check(); #endif + reserve_brk(); + /* max_pfn_mapped is updated here */ max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<arch_pre_intr_init) { + if (x86_quirks->arch_pre_intr_init()) + return; + } + init_ISA_irqs(); +} + +/** + * x86_quirk_intr_init - post gate setup interrupt initialisation + * + * Description: + * Fill in any interrupts that may have been left out by the general + * init_IRQ() routine. interrupts having to do with the machine rather + * than the devices on the I/O bus (like APIC interrupts in intel MP + * systems) are started here. + **/ +void __init x86_quirk_intr_init(void) +{ + if (x86_quirks->arch_intr_init) { + if (x86_quirks->arch_intr_init()) + return; + } +} + +/** + * x86_quirk_trap_init - initialise system specific traps + * + * Description: + * Called as the final act of trap_init(). Used in VISWS to initialise + * the various board specific APIC traps. + **/ +void __init x86_quirk_trap_init(void) +{ + if (x86_quirks->arch_trap_init) { + if (x86_quirks->arch_trap_init()) + return; + } +} + +static struct irqaction irq0 = { + .handler = timer_interrupt, + .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, + .name = "timer" +}; + +/** + * x86_quirk_pre_time_init - do any specific initialisations before. + * + **/ +void __init x86_quirk_pre_time_init(void) +{ + if (x86_quirks->arch_pre_time_init) + x86_quirks->arch_pre_time_init(); +} + +/** + * x86_quirk_time_init - do any specific initialisations for the system timer. + * + * Description: + * Must plug the system timer interrupt source at HZ into the IRQ listed + * in irq_vectors.h:TIMER_IRQ + **/ +void __init x86_quirk_time_init(void) +{ + if (x86_quirks->arch_time_init) { + /* + * A nonzero return code does not mean failure, it means + * that the architecture quirk does not want any + * generic (timer) setup to be performed after this: + */ + if (x86_quirks->arch_time_init()) + return; + } + + irq0.mask = cpumask_of_cpu(0); + setup_irq(0, &irq0); +} +#endif /* CONFIG_X86_32 */ + #ifdef CONFIG_XEN static int xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr) --- head-2010-05-25.orig/arch/x86/kernel/setup_percpu.c 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/arch/x86/kernel/setup_percpu.c 2010-03-24 15:25:06.000000000 +0100 @@ -231,7 +231,7 @@ void __init setup_per_cpu_areas(void) * are zeroed indicating that the static arrays are * gone. */ -#ifdef CONFIG_X86_LOCAL_APIC +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN) per_cpu(x86_cpu_to_apicid, cpu) = early_per_cpu_map(x86_cpu_to_apicid, cpu); per_cpu(x86_bios_cpu_apicid, cpu) = @@ -255,7 +255,7 @@ void __init setup_per_cpu_areas(void) } /* indicate the early static arrays will soon be gone */ -#ifdef CONFIG_X86_LOCAL_APIC +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN) early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; #endif --- head-2010-05-25.orig/arch/x86/kernel/smp-xen.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/smp-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -2,7 +2,7 @@ * Intel SMP support routines. * * (c) 1995 Alan Cox, Building #3 - * (c) 1998-99, 2000 Ingo Molnar + * (c) 1998-99, 2000, 2009 Ingo Molnar * (c) 2002,2003 Andi Kleen, SuSE Labs. 
* * i386 and x86_64 integration by Glauber Costa @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include /* * Some notes on x86 processor bugs affecting SMP operation: @@ -118,17 +118,17 @@ void xen_smp_send_reschedule(int cpu) WARN_ON(1); return; } - send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR); + xen_send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR); } void xen_send_call_func_single_ipi(int cpu) { - send_IPI_mask(cpumask_of(cpu), CALL_FUNC_SINGLE_VECTOR); + xen_send_IPI_mask(cpumask_of(cpu), CALL_FUNC_SINGLE_VECTOR); } void xen_send_call_func_ipi(const struct cpumask *mask) { - send_IPI_mask_allbutself(mask, CALL_FUNCTION_VECTOR); + xen_send_IPI_mask_allbutself(mask, CALL_FUNCTION_VECTOR); } /* --- head-2010-05-25.orig/arch/x86/kernel/time-xen.c 2010-05-11 17:14:09.000000000 +0200 +++ head-2010-05-25/arch/x86/kernel/time-xen.c 2010-05-12 09:02:08.000000000 +0200 @@ -554,7 +554,8 @@ irqreturn_t timer_interrupt(int irq, voi ct = jiffies_to_cputime(delta_cpu); if (user_mode_vm(get_irq_regs())) account_user_time(current, ct, cputime_to_scaled(ct)); - else if (current != idle_task(cpu)) + else if (current != idle_task(cpu) + || irq_count() != HARDIRQ_OFFSET) account_system_time(current, HARDIRQ_OFFSET, ct, cputime_to_scaled(ct)); else @@ -587,7 +588,7 @@ EXPORT_SYMBOL_GPL(mark_tsc_unstable); static cycle_t cs_last; -static cycle_t xen_clocksource_read(void) +static cycle_t xen_clocksource_read(struct clocksource *cs) { #ifdef CONFIG_SMP cycle_t last = get64(&cs_last); --- head-2010-05-25.orig/arch/x86/kernel/traps-xen.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/traps-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -54,15 +54,14 @@ #include #include -#include +#include #ifdef CONFIG_X86_64 #include #include -#include #else #include -#include +#include #include #include "cpu/mcheck/mce.h" @@ -123,49 +122,6 @@ die_if_kernel(const char *str, struct pt if (!user_mode_vm(regs)) die(str, regs, err); } - -/* - * Perform the lazy TSS's I/O bitmap copy. If the TSS has an - * invalid offset set (the LAZY one) and the faulting thread has - * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS, - * we set the offset field correctly and return 1. - */ -static int lazy_iobitmap_copy(void) -{ -#ifndef CONFIG_XEN - struct thread_struct *thread; - struct tss_struct *tss; - int cpu; - - cpu = get_cpu(); - tss = &per_cpu(init_tss, cpu); - thread = ¤t->thread; - - if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && - thread->io_bitmap_ptr) { - memcpy(tss->io_bitmap, thread->io_bitmap_ptr, - thread->io_bitmap_max); - /* - * If the previously set map was extending to higher ports - * than the current one, pad extra space with 0xff (no access). 
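
A few hunks up, xen_clocksource_read() gains a struct clocksource * parameter: from 2.6.30 on, the clocksource core passes the instance to its own ->read() callback, so one function can serve several clocksources and reach per-instance state. A user-space rendering of that callback shape (all names hypothetical, clock_gettime() standing in for a hardware counter; older glibc needs -lrt):

#include <stdint.h>
#include <stdio.h>
#include <time.h>

struct demo_clocksource {
    const char *name;
    uint64_t (*read)(struct demo_clocksource *cs);    /* new shape: cs passed in */
};

static uint64_t monotonic_read(struct demo_clocksource *cs)
{
    struct timespec ts;

    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
}

int main(void)
{
    struct demo_clocksource cs = { "demo", monotonic_read };

    printf("%s: %llu ns\n", cs.name, (unsigned long long)cs.read(&cs));
    return 0;
}
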
- */ - if (thread->io_bitmap_max < tss->io_bitmap_max) { - memset((char *) tss->io_bitmap + - thread->io_bitmap_max, 0xff, - tss->io_bitmap_max - thread->io_bitmap_max); - } - tss->io_bitmap_max = thread->io_bitmap_max; - tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; - tss->io_bitmap_owner = thread; - put_cpu(); - - return 1; - } - put_cpu(); -#endif - - return 0; -} #endif static void __kprobes @@ -316,11 +272,6 @@ do_general_protection(struct pt_regs *re conditional_sti(regs); #ifdef CONFIG_X86_32 - if (lazy_iobitmap_copy()) { - /* restart the faulting instruction */ - return; - } - if (regs->flags & X86_VM_MASK) goto gp_in_vm86; #endif @@ -911,19 +862,20 @@ void math_emulate(struct math_emu_info * } #endif /* CONFIG_MATH_EMULATION */ -dotraplinkage void __kprobes do_device_not_available(struct pt_regs regs) +dotraplinkage void __kprobes +do_device_not_available(struct pt_regs *regs, long error_code) { #if defined(CONFIG_X86_32) && !defined(CONFIG_XEN) if (read_cr0() & X86_CR0_EM) { struct math_emu_info info = { }; - conditional_sti(®s); + conditional_sti(regs); - info.regs = ®s; + info.regs = regs; math_emulate(&info); } else { math_state_restore(); /* interrupts still off */ - conditional_sti(®s); + conditional_sti(regs); } #else math_state_restore(); @@ -939,7 +891,7 @@ dotraplinkage void do_iret_error(struct info.si_signo = SIGILL; info.si_errno = 0; info.si_code = ILL_BADSTK; - info.si_addr = 0; + info.si_addr = NULL; if (notify_die(DIE_TRAP, "iret exception", regs, error_code, 32, SIGILL) == NOTIFY_STOP) return; --- head-2010-05-25.orig/arch/x86/kernel/vmlinux.lds.S 2010-03-24 15:08:58.000000000 +0100 +++ head-2010-05-25/arch/x86/kernel/vmlinux.lds.S 2010-03-24 15:25:06.000000000 +0100 @@ -16,8 +16,10 @@ #ifdef CONFIG_X86_32 #define LOAD_OFFSET __PAGE_OFFSET -#else +#elif !defined(CONFIG_XEN) || CONFIG_XEN_COMPAT > 0x030002 #define LOAD_OFFSET __START_KERNEL_map +#else +#define LOAD_OFFSET 0 #endif #include --- head-2010-05-25.orig/arch/x86/mach-xen/Makefile 2007-06-12 13:12:48.000000000 +0200 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,5 +0,0 @@ -# -# Makefile for the linux kernel. -# - -obj-y := setup.o --- head-2010-05-25.orig/arch/x86/mach-xen/setup.c 2010-03-24 15:12:46.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,190 +0,0 @@ -/* - * Machine specific setup for generic - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#ifdef CONFIG_X86_32 - -#ifdef CONFIG_HOTPLUG_CPU -#define DEFAULT_SEND_IPI (1) -#else -#define DEFAULT_SEND_IPI (0) -#endif - -int no_broadcast=DEFAULT_SEND_IPI; - -static __init int no_ipi_broadcast(char *str) -{ - get_option(&str, &no_broadcast); - printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" : - "IPI Broadcast"); - return 1; -} - -__setup("no_ipi_broadcast", no_ipi_broadcast); - -static int __init print_ipi_mode(void) -{ - printk ("Using IPI %s mode\n", no_broadcast ? 
"No-Shortcut" : - "Shortcut"); - return 0; -} - -late_initcall(print_ipi_mode); - -unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START; -EXPORT_SYMBOL(machine_to_phys_mapping); -unsigned int machine_to_phys_order; -EXPORT_SYMBOL(machine_to_phys_order); - -void __init pre_setup_arch_hook(void) -{ - struct xen_machphys_mapping mapping; - unsigned long machine_to_phys_nr_ents; - struct xen_platform_parameters pp; - - init_mm.pgd = swapper_pg_dir = (pgd_t *)xen_start_info->pt_base; - - xen_setup_features(); - - if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) { - hypervisor_virt_start = pp.virt_start; - reserve_top_address(0UL - pp.virt_start); - } - - if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { - machine_to_phys_mapping = (unsigned long *)mapping.v_start; - machine_to_phys_nr_ents = mapping.max_mfn + 1; - } else - machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES; - machine_to_phys_order = fls(machine_to_phys_nr_ents - 1); - - if (!xen_feature(XENFEAT_auto_translated_physmap)) - phys_to_machine_mapping = - (unsigned long *)xen_start_info->mfn_list; -} - -#endif /* CONFIG_X86_32 */ - -extern void hypervisor_callback(void); -extern void failsafe_callback(void); -extern void nmi(void); - -#ifdef CONFIG_X86_64 -#include -#define CALLBACK_ADDR(fn) ((unsigned long)(fn)) -#else -#define CALLBACK_ADDR(fn) { __KERNEL_CS, (unsigned long)(fn) } -#endif - -void __init machine_specific_arch_setup(void) -{ - int ret; - static struct callback_register __initdata event = { - .type = CALLBACKTYPE_event, - .address = CALLBACK_ADDR(hypervisor_callback) - }; - static struct callback_register __initdata failsafe = { - .type = CALLBACKTYPE_failsafe, - .address = CALLBACK_ADDR(failsafe_callback) - }; -#ifdef CONFIG_X86_64 - static struct callback_register __initdata syscall = { - .type = CALLBACKTYPE_syscall, - .address = CALLBACK_ADDR(system_call) - }; -#endif -#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_X86_32) - static struct callback_register __initdata nmi_cb = { - .type = CALLBACKTYPE_nmi, - .address = CALLBACK_ADDR(nmi) - }; -#endif - - ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event); - if (ret == 0) - ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe); -#ifdef CONFIG_X86_64 - if (ret == 0) - ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall); -#endif -#if CONFIG_XEN_COMPAT <= 0x030002 -#ifdef CONFIG_X86_32 - if (ret == -ENOSYS) - ret = HYPERVISOR_set_callbacks( - event.address.cs, event.address.eip, - failsafe.address.cs, failsafe.address.eip); -#else - ret = HYPERVISOR_set_callbacks( - event.address, - failsafe.address, - syscall.address); -#endif -#endif - BUG_ON(ret); - -#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_X86_32) - ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb); -#if CONFIG_XEN_COMPAT <= 0x030002 - if (ret == -ENOSYS) { - static struct xennmi_callback __initdata cb = { - .handler_address = (unsigned long)nmi - }; - - HYPERVISOR_nmi_op(XENNMI_register_callback, &cb); - } -#endif -#endif - -#ifdef CONFIG_X86_32 - /* Do an early initialization of the fixmap area */ - { - extern pte_t swapper_pg_fixmap[PTRS_PER_PTE]; - unsigned long addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE); - pud_t *pud = pud_offset(swapper_pg_dir + pgd_index(addr), addr); - pmd_t *pmd = pmd_offset(pud, addr); - unsigned int i; - - make_lowmem_page_readonly(swapper_pg_fixmap, XENFEAT_writable_page_tables); - set_pmd(pmd, __pmd(__pa_symbol(swapper_pg_fixmap) | _PAGE_TABLE)); - -#define __FIXADDR_TOP (-PAGE_SIZE) 
-#define FIX_BUG_ON(fix) BUILD_BUG_ON(pmd_index(__fix_to_virt(FIX_##fix)) \ - != pmd_index(__fix_to_virt(FIX_EARLYCON_MEM_BASE))) - FIX_BUG_ON(SHARED_INFO); - FIX_BUG_ON(ISAMAP_BEGIN); - FIX_BUG_ON(ISAMAP_END); -#undef __FIXADDR_TOP - BUG_ON(pte_index(hypervisor_virt_start)); - - /* Switch to the real shared_info page, and clear the - * dummy page. */ - set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info); - HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO); - memset(empty_zero_page, 0, sizeof(empty_zero_page)); - - /* Setup mapping of lower 1st MB */ - for (i = 0; i < NR_FIX_ISAMAPS; i++) - if (is_initial_xendomain()) - set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE); - else - __set_fixmap(FIX_ISAMAP_BEGIN - i, - virt_to_machine(empty_zero_page), - PAGE_KERNEL_RO); - } -#endif -} --- head-2010-05-25.orig/arch/x86/mm/Makefile 2010-03-24 15:01:37.000000000 +0100 +++ head-2010-05-25/arch/x86/mm/Makefile 2010-03-24 15:25:06.000000000 +0100 @@ -26,5 +26,6 @@ obj-$(CONFIG_K8_NUMA) += k8topology_64. obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o obj-$(CONFIG_XEN) += hypervisor.o +disabled-obj-$(CONFIG_XEN) := tlb.o obj-$(CONFIG_MEMTEST) += memtest.o --- head-2010-05-25.orig/arch/x86/mm/fault-xen.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/mm/fault-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -1,73 +1,79 @@ /* * Copyright (C) 1995 Linus Torvalds - * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. + * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. + * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include -#include -#include -#include /* For unblank_screen() */ +#include +#include #include #include -#include /* for max_low_pfn */ -#include -#include #include #include +#include +#include +#include +#include +#include +#include +#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include -#include -#include -#include -#include -#include #include +#include +#include +#include #include -#include #include +#include /* - * Page fault error code bits - * bit 0 == 0 means no page found, 1 means protection fault - * bit 1 == 0 means read, 1 means write - * bit 2 == 0 means kernel, 1 means user-mode - * bit 3 == 1 means use of reserved bit detected - * bit 4 == 1 means fault was an instruction fetch - */ -#define PF_PROT (1<<0) -#define PF_WRITE (1<<1) -#define PF_USER (1<<2) -#define PF_RSVD (1<<3) -#define PF_INSTR (1<<4) + * Page fault error code bits: + * + * bit 0 == 0: no page found 1: protection fault + * bit 1 == 0: read access 1: write access + * bit 2 == 0: kernel-mode access 1: user-mode access + * bit 3 == 1: use of reserved bit detected + * bit 4 == 1: fault was an instruction fetch + */ +enum x86_pf_error_code { + PF_PROT = 1 << 0, + PF_WRITE = 1 << 1, + PF_USER = 1 << 2, + PF_RSVD = 1 << 3, + PF_INSTR = 1 << 4, +}; + +/* + * Returns 0 if mmiotrace is disabled, or if the fault is not + * handled by mmiotrace: + */ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { -#ifdef CONFIG_MMIOTRACE if (unlikely(is_kmmio_active())) if (kmmio_handler(regs, addr) == 1) return -1; -#endif return 0; } static inline int notify_page_fault(struct pt_regs *regs) { -#ifdef CONFIG_KPROBES int ret = 0; /* kprobe_running() needs smp_processor_id() */ - if (!user_mode_vm(regs)) { + if (kprobes_built_in() && !user_mode_vm(regs)) { preempt_disable(); if (kprobe_running() && 
kprobe_fault_handler(regs, 14)) ret = 1; @@ -75,29 +81,76 @@ static inline int notify_page_fault(stru } return ret; -#else - return 0; -#endif } /* - * X86_32 - * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. - * Check that here and ignore it. - * - * X86_64 - * Sometimes the CPU reports invalid exceptions on prefetch. - * Check that here and ignore it. + * Prefetch quirks: + * + * 32-bit mode: * - * Opcode checker based on code by Richard Brunner + * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. + * Check that here and ignore it. + * + * 64-bit mode: + * + * Sometimes the CPU reports invalid exceptions on prefetch. + * Check that here and ignore it. + * + * Opcode checker based on code by Richard Brunner. */ -static int is_prefetch(struct pt_regs *regs, unsigned long addr, - unsigned long error_code) +static inline int +check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, + unsigned char opcode, int *prefetch) { + unsigned char instr_hi = opcode & 0xf0; + unsigned char instr_lo = opcode & 0x0f; + + switch (instr_hi) { + case 0x20: + case 0x30: + /* + * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. + * In X86_64 long mode, the CPU will signal invalid + * opcode if some of these prefixes are present so + * X86_64 will never get here anyway + */ + return ((instr_lo & 7) == 0x6); +#ifdef CONFIG_X86_64 + case 0x40: + /* + * In AMD64 long mode 0x40..0x4F are valid REX prefixes + * Need to figure out under what instruction mode the + * instruction was issued. Could check the LDT for lm, + * but for now it's good enough to assume that long + * mode only uses well known segments or kernel. + */ + return (!user_mode(regs)) || (regs->cs == __USER_CS); +#endif + case 0x60: + /* 0x64 thru 0x67 are valid prefixes in all modes. */ + return (instr_lo & 0xC) == 0x4; + case 0xF0: + /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ + return !instr_lo || (instr_lo>>1) == 1; + case 0x00: + /* Prefetch instruction is 0x0F0D or 0x0F18 */ + if (probe_kernel_address(instr, opcode)) + return 0; + + *prefetch = (instr_lo == 0xF) && + (opcode == 0x0D || opcode == 0x18); + return 0; + default: + return 0; + } +} + +static int +is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) +{ + unsigned char *max_instr; unsigned char *instr; - int scan_more = 1; int prefetch = 0; - unsigned char *max_instr; /* * If it was a exec (instruction fetch) fault on NX page, then @@ -106,99 +159,174 @@ static int is_prefetch(struct pt_regs *r if (error_code & PF_INSTR) return 0; - instr = (unsigned char *)convert_ip_to_linear(current, regs); + instr = (void *)convert_ip_to_linear(current, regs); max_instr = instr + 15; if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) return 0; - while (scan_more && instr < max_instr) { + while (instr < max_instr) { unsigned char opcode; - unsigned char instr_hi; - unsigned char instr_lo; if (probe_kernel_address(instr, opcode)) break; - instr_hi = opcode & 0xf0; - instr_lo = opcode & 0x0f; instr++; - switch (instr_hi) { - case 0x20: - case 0x30: - /* - * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. - * In X86_64 long mode, the CPU will signal invalid - * opcode if some of these prefixes are present so - * X86_64 will never get here anyway - */ - scan_more = ((instr_lo & 7) == 0x6); - break; -#ifdef CONFIG_X86_64 - case 0x40: - /* - * In AMD64 long mode 0x40..0x4F are valid REX prefixes - * Need to figure out under what instruction mode the - * instruction was issued. 
Could check the LDT for lm, - * but for now it's good enough to assume that long - * mode only uses well known segments or kernel. - */ - scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS); + if (!check_prefetch_opcode(regs, instr, opcode, &prefetch)) break; + } + return prefetch; +} + +static void +force_sig_info_fault(int si_signo, int si_code, unsigned long address, + struct task_struct *tsk) +{ + siginfo_t info; + + info.si_signo = si_signo; + info.si_errno = 0; + info.si_code = si_code; + info.si_addr = (void __user *)address; + + force_sig_info(si_signo, &info, tsk); +} + +DEFINE_SPINLOCK(pgd_lock); +LIST_HEAD(pgd_list); + +#ifdef CONFIG_X86_32 +static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) +{ + unsigned index = pgd_index(address); + pgd_t *pgd_k; + pud_t *pud, *pud_k; + pmd_t *pmd, *pmd_k; + + pgd += index; + pgd_k = init_mm.pgd + index; + + if (!pgd_present(*pgd_k)) + return NULL; + + /* + * set_pgd(pgd, *pgd_k); here would be useless on PAE + * and redundant with the set_pmd() on non-PAE. As would + * set_pud. + */ + pud = pud_offset(pgd, address); + pud_k = pud_offset(pgd_k, address); + if (!pud_present(*pud_k)) + return NULL; + + pmd = pmd_offset(pud, address); + pmd_k = pmd_offset(pud_k, address); + if (!pmd_present(*pmd_k)) + return NULL; + + if (!pmd_present(*pmd)) { + bool lazy = percpu_read(xen_lazy_mmu); + + percpu_write(xen_lazy_mmu, false); +#if CONFIG_XEN_COMPAT > 0x030002 + set_pmd(pmd, *pmd_k); +#else + /* + * When running on older Xen we must launder *pmd_k through + * pmd_val() to ensure that _PAGE_PRESENT is correctly set. + */ + set_pmd(pmd, __pmd(pmd_val(*pmd_k))); #endif - case 0x60: - /* 0x64 thru 0x67 are valid prefixes in all modes. */ - scan_more = (instr_lo & 0xC) == 0x4; - break; - case 0xF0: - /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ - scan_more = !instr_lo || (instr_lo>>1) == 1; - break; - case 0x00: - /* Prefetch instruction is 0x0F0D or 0x0F18 */ - scan_more = 0; + percpu_write(xen_lazy_mmu, lazy); + } else { + BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); + } + + return pmd_k; +} + +void vmalloc_sync_all(void) +{ + unsigned long address; + + if (SHARED_KERNEL_PMD) + return; + + for (address = VMALLOC_START & PMD_MASK; + address >= TASK_SIZE && address < FIXADDR_TOP; + address += PMD_SIZE) { + + unsigned long flags; + struct page *page; - if (probe_kernel_address(instr, opcode)) + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + if (!vmalloc_sync_one(page_address(page), address)) break; - prefetch = (instr_lo == 0xF) && - (opcode == 0x0D || opcode == 0x18); - break; - default: - scan_more = 0; - break; } + spin_unlock_irqrestore(&pgd_lock, flags); } - return prefetch; } -static void force_sig_info_fault(int si_signo, int si_code, - unsigned long address, struct task_struct *tsk) +/* + * 32-bit: + * + * Handle a fault on the vmalloc or module mapping area + */ +static noinline int vmalloc_fault(unsigned long address) { - siginfo_t info; + unsigned long pgd_paddr; + pmd_t *pmd_k; + pte_t *pte_k; - info.si_signo = si_signo; - info.si_errno = 0; - info.si_code = si_code; - info.si_addr = (void __user *)address; - force_sig_info(si_signo, &info, tsk); + /* Make sure we are in vmalloc area: */ + if (!(address >= VMALLOC_START && address < VMALLOC_END)) + return -1; + + /* + * Synchronize this task's top level page-table + * with the 'reference' page table. + * + * Do _not_ use "current" here. We might be inside + * an interrupt in the middle of a task switch.. 
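
The check_prefetch_opcode() refactoring above preserves the old loop's final test: once prefix bytes are skipped, a faulting AMD prefetch instruction encodes as 0x0F 0x0D (3DNow! PREFETCH) or 0x0F 0x18 (the SSE PREFETCHh family). That recognition step in isolation, as a runnable sketch with the prefix handling omitted (names illustrative):

#include <stdio.h>

/* Core test from check_prefetch_opcode(): a two-byte opcode of
 * 0x0F 0x0D or 0x0F 0x18 is a prefetch, whose faults the page-fault
 * handler deliberately ignores. */
static int is_prefetch_insn(const unsigned char *insn)
{
    return insn[0] == 0x0F && (insn[1] == 0x0D || insn[1] == 0x18);
}

int main(void)
{
    const unsigned char prefetchnta[] = { 0x0F, 0x18, 0x00 };  /* prefetchnta (%eax) */
    const unsigned char mov[]         = { 0x89, 0xD8 };        /* mov %ebx,%eax */

    printf("0f 18: %s\n", is_prefetch_insn(prefetchnta) ? "prefetch" : "other");
    printf("89 d8: %s\n", is_prefetch_insn(mov)         ? "prefetch" : "other");
    return 0;
}
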
+ */ + pgd_paddr = read_cr3(); + pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); + if (!pmd_k) + return -1; + + pte_k = pte_offset_kernel(pmd_k, address); + if (!pte_present(*pte_k)) + return -1; + + return 0; } -#ifdef CONFIG_X86_64 -static int bad_address(void *p) +/* + * Did it hit the DOS screen memory VA from vm86 mode? + */ +static inline void +check_v8086_mode(struct pt_regs *regs, unsigned long address, + struct task_struct *tsk) { - unsigned long dummy; - return probe_kernel_address((unsigned long *)p, dummy); + unsigned long bit; + + if (!v8086_mode(regs)) + return; + + bit = (address - 0xA0000) >> PAGE_SHIFT; + if (bit < 32) + tsk->thread.screen_bitmap |= 1 << bit; } -#endif static void dump_pagetable(unsigned long address) { -#ifdef CONFIG_X86_32 __typeof__(pte_val(__pte(0))) page; page = read_cr3(); page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; + #ifdef CONFIG_X86_PAE printk("*pdpt = %016Lx ", page); if ((page & _PAGE_PRESENT) @@ -206,7 +334,7 @@ static void dump_pagetable(unsigned long page = mfn_to_pfn(page >> PAGE_SHIFT); page <<= PAGE_SHIFT; page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT) - & (PTRS_PER_PMD - 1)]; + & (PTRS_PER_PMD - 1)]; printk(KERN_CONT "*pde = %016Lx ", page); page &= ~_PAGE_NX; } @@ -218,20 +346,146 @@ static void dump_pagetable(unsigned long * We must not directly access the pte in the highpte * case if the page table is located in highmem. * And let's rather not kmap-atomic the pte, just in case - * it's allocated already. + * it's allocated already: */ if ((page & _PAGE_PRESENT) && mfn_to_local_pfn(page >> PAGE_SHIFT) < max_low_pfn && !(page & _PAGE_PSE)) { + page = mfn_to_pfn(page >> PAGE_SHIFT); page <<= PAGE_SHIFT; page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT) - & (PTRS_PER_PTE - 1)]; + & (PTRS_PER_PTE - 1)]; printk(KERN_CONT "*pte = %0*Lx ", sizeof(page)*2, (u64)page); } printk(KERN_CONT "\n"); -#else /* CONFIG_X86_64 */ +} + +#else /* CONFIG_X86_64: */ + +void vmalloc_sync_all(void) +{ + unsigned long address; + + for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; + address += PGDIR_SIZE) { + + const pgd_t *pgd_ref = pgd_offset_k(address); + unsigned long flags; + struct page *page; + + if (pgd_none(*pgd_ref)) + continue; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + } + spin_unlock_irqrestore(&pgd_lock, flags); + } +} + +/* + * 64-bit: + * + * Handle a fault on the vmalloc area + * + * This assumes no large pages in there. + */ +static noinline int vmalloc_fault(unsigned long address) +{ + pgd_t *pgd, *pgd_ref; + pud_t *pud, *pud_ref; + pmd_t *pmd, *pmd_ref; + pte_t *pte, *pte_ref; + + /* Make sure we are in vmalloc area: */ + if (!(address >= VMALLOC_START && address < VMALLOC_END)) + return -1; + + /* + * Copy kernel mappings over when needed. This can also + * happen within a race in page table update. 
In the later + * case just flush: + */ + pgd = pgd_offset(current->active_mm, address); + pgd_ref = pgd_offset_k(address); + if (pgd_none(*pgd_ref)) + return -1; + + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + + /* + * Below here mismatches are bugs because these lower tables + * are shared: + */ + + pud = pud_offset(pgd, address); + pud_ref = pud_offset(pgd_ref, address); + if (pud_none(*pud_ref)) + return -1; + + if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) + BUG(); + + pmd = pmd_offset(pud, address); + pmd_ref = pmd_offset(pud_ref, address); + if (pmd_none(*pmd_ref)) + return -1; + + if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) + BUG(); + + pte_ref = pte_offset_kernel(pmd_ref, address); + if (!pte_present(*pte_ref)) + return -1; + + pte = pte_offset_kernel(pmd, address); + + /* + * Don't use pte_page here, because the mappings can point + * outside mem_map, and the NUMA hash lookup cannot handle + * that: + */ + if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) + BUG(); + + return 0; +} + +static const char errata93_warning[] = +KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" +KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" +KERN_ERR "******* Please consider a BIOS update.\n" +KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; + +/* + * No vm86 mode in 64-bit mode: + */ +static inline void +check_v8086_mode(struct pt_regs *regs, unsigned long address, + struct task_struct *tsk) +{ +} + +static int bad_address(void *p) +{ + unsigned long dummy; + + return probe_kernel_address((unsigned long *)p, dummy); +} + +static void dump_pagetable(unsigned long address) +{ pgd_t *pgd; pud_t *pud; pmd_t *pmd; @@ -240,113 +494,77 @@ static void dump_pagetable(unsigned long pgd = (pgd_t *)read_cr3(); pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); + pgd += pgd_index(address); - if (bad_address(pgd)) goto bad; + if (bad_address(pgd)) + goto bad; + printk("PGD %lx ", pgd_val(*pgd)); - if (!pgd_present(*pgd)) goto ret; + + if (!pgd_present(*pgd)) + goto out; pud = pud_offset(pgd, address); - if (bad_address(pud)) goto bad; + if (bad_address(pud)) + goto bad; + printk(KERN_CONT "PUD %lx ", pud_val(*pud)); if (!pud_present(*pud) || pud_large(*pud)) - goto ret; + goto out; pmd = pmd_offset(pud, address); - if (bad_address(pmd)) goto bad; + if (bad_address(pmd)) + goto bad; + printk(KERN_CONT "PMD %lx ", pmd_val(*pmd)); - if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret; + if (!pmd_present(*pmd) || pmd_large(*pmd)) + goto out; pte = pte_offset_kernel(pmd, address); - if (bad_address(pte)) goto bad; + if (bad_address(pte)) + goto bad; + printk(KERN_CONT "PTE %lx", pte_val(*pte)); -ret: +out: printk(KERN_CONT "\n"); return; bad: printk("BAD\n"); -#endif -} - -#ifdef CONFIG_X86_32 -static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) -{ - unsigned index = pgd_index(address); - pgd_t *pgd_k; - pud_t *pud, *pud_k; - pmd_t *pmd, *pmd_k; - - pgd += index; - pgd_k = init_mm.pgd + index; - - if (!pgd_present(*pgd_k)) - return NULL; - - /* - * set_pgd(pgd, *pgd_k); here would be useless on PAE - * and redundant with the set_pmd() on non-PAE. As would - * set_pud. 
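
Both versions of this code walk pgd/pud/pmd/pte by hand. User space can inspect the end result of exactly such a walk through /proc/self/pagemap, which holds one 64-bit entry per virtual page: bit 63 says whether the page is present, and the low 55 bits carry the page frame number (reading PFNs needs root on current kernels). A runnable sketch:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
    long page = sysconf(_SC_PAGESIZE);
    char *p = malloc(page);
    uint64_t entry;
    int fd;

    p[0] = 1;    /* touch the page so a pte exists */

    fd = open("/proc/self/pagemap", O_RDONLY);
    if (fd < 0) { perror("open"); return 1; }

    /* One 8-byte entry per virtual page, indexed by VA / page size. */
    if (pread(fd, &entry, sizeof(entry),
              ((uintptr_t)p / page) * sizeof(entry)) != sizeof(entry)) {
        perror("pread");
        return 1;
    }

    printf("va %p: present=%d pfn=%#llx\n", (void *)p,
           (int)(entry >> 63) & 1,
           (unsigned long long)(entry & ((1ull << 55) - 1)));
    close(fd);
    return 0;
}
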
- */ - - pud = pud_offset(pgd, address); - pud_k = pud_offset(pgd_k, address); - if (!pud_present(*pud_k)) - return NULL; - - pmd = pmd_offset(pud, address); - pmd_k = pmd_offset(pud_k, address); - if (!pmd_present(*pmd_k)) - return NULL; - if (!pmd_present(*pmd)) { - bool lazy = x86_read_percpu(xen_lazy_mmu); - - x86_write_percpu(xen_lazy_mmu, false); -#if CONFIG_XEN_COMPAT > 0x030002 - set_pmd(pmd, *pmd_k); -#else - /* - * When running on older Xen we must launder *pmd_k through - * pmd_val() to ensure that _PAGE_PRESENT is correctly set. - */ - set_pmd(pmd, __pmd(pmd_val(*pmd_k))); -#endif - x86_write_percpu(xen_lazy_mmu, lazy); - } else - BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); - return pmd_k; } -#endif -#ifdef CONFIG_X86_64 -static const char errata93_warning[] = -KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" -KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" -KERN_ERR "******* Please consider a BIOS update.\n" -KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; -#endif +#endif /* CONFIG_X86_64 */ -/* Workaround for K8 erratum #93 & buggy BIOS. - BIOS SMM functions are required to use a specific workaround - to avoid corruption of the 64bit RIP register on C stepping K8. - A lot of BIOS that didn't get tested properly miss this. - The OS sees this as a page fault with the upper 32bits of RIP cleared. - Try to work around it here. - Note we only handle faults in kernel here. - Does nothing for X86_32 +/* + * Workaround for K8 erratum #93 & buggy BIOS. + * + * BIOS SMM functions are required to use a specific workaround + * to avoid corruption of the 64bit RIP register on C stepping K8. + * + * A lot of BIOS that didn't get tested properly miss this. + * + * The OS sees this as a page fault with the upper 32bits of RIP cleared. + * Try to work around it here. + * + * Note we only handle faults in kernel here. + * Does nothing on 32-bit. */ static int is_errata93(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_64 - static int warned; + static int once; + if (address != regs->ip) return 0; + if ((address >> 32) != 0) return 0; + address |= 0xffffffffUL << 32; if ((address >= (u64)_stext && address <= (u64)_etext) || (address >= MODULES_VADDR && address <= MODULES_END)) { - if (!warned) { + if (!once) { printk(errata93_warning); - warned = 1; + once = 1; } regs->ip = address; return 1; @@ -356,16 +574,17 @@ static int is_errata93(struct pt_regs *r } /* - * Work around K8 erratum #100 K8 in compat mode occasionally jumps to illegal - * addresses >4GB. We catch this in the page fault handler because these - * addresses are not reachable. Just detect this case and return. Any code + * Work around K8 erratum #100 K8 in compat mode occasionally jumps + * to illegal addresses >4GB. + * + * We catch this in the page fault handler because these addresses + * are not reachable. Just detect this case and return. Any code * segment in LDT is compatibility mode. */ static int is_errata100(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_64 - if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && - (address >> 32)) + if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32)) return 1; #endif return 0; @@ -375,8 +594,9 @@ static int is_f00f_bug(struct pt_regs *r { #ifdef CONFIG_X86_F00F_BUG unsigned long nr; + /* - * Pentium F0 0F C7 C8 bug workaround. 
+ * Pentium F0 0F C7 C8 bug workaround: */ if (boot_cpu_data.f00f_bug) { nr = (address - idt_descr.address) >> 3; @@ -390,62 +610,277 @@ static int is_f00f_bug(struct pt_regs *r return 0; } -static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, - unsigned long address) +static const char nx_warning[] = KERN_CRIT +"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n"; + +static void +show_fault_oops(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { -#ifdef CONFIG_X86_32 if (!oops_may_print()) return; -#endif -#ifdef CONFIG_X86_PAE if (error_code & PF_INSTR) { unsigned int level; + pte_t *pte = lookup_address(address, &level); if (pte && pte_present(*pte) && !pte_exec(*pte)) - printk(KERN_CRIT "kernel tried to execute " - "NX-protected page - exploit attempt? " - "(uid: %d)\n", current_uid()); + printk(nx_warning, current_uid()); } -#endif printk(KERN_ALERT "BUG: unable to handle kernel "); if (address < PAGE_SIZE) printk(KERN_CONT "NULL pointer dereference"); else printk(KERN_CONT "paging request"); + printk(KERN_CONT " at %p\n", (void *) address); printk(KERN_ALERT "IP:"); printk_address(regs->ip, 1); + dump_pagetable(address); } -#ifdef CONFIG_X86_64 -static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, - unsigned long error_code) +static noinline void +pgtable_bad(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { - unsigned long flags = oops_begin(); - int sig = SIGKILL; struct task_struct *tsk; + unsigned long flags; + int sig; + + flags = oops_begin(); + tsk = current; + sig = SIGKILL; printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", - current->comm, address); + tsk->comm, address); dump_pagetable(address); - tsk = current; - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; + + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; + if (__die("Bad pagetable", regs, error_code)) sig = 0; + oops_end(flags, regs, sig); } -#endif + +static noinline void +no_context(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ + struct task_struct *tsk = current; + unsigned long *stackend; + unsigned long flags; + int sig; + + /* Are we prepared to handle this kernel fault? */ + if (fixup_exception(regs)) + return; + + /* + * 32-bit: + * + * Valid to do another page fault here, because if this fault + * had been triggered by is_prefetch fixup_exception would have + * handled it. + * + * 64-bit: + * + * Hall of shame of CPU/BIOS bugs. + */ + if (is_prefetch(regs, error_code, address)) + return; + + if (is_errata93(regs, address)) + return; + + /* + * Oops. The kernel tried to access some bad page. 
We'll have to + * terminate things with extreme prejudice: + */ + flags = oops_begin(); + + show_fault_oops(regs, error_code, address); + + stackend = end_of_stack(tsk); + if (*stackend != STACK_END_MAGIC) + printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); + + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; + + sig = SIGKILL; + if (__die("Oops", regs, error_code)) + sig = 0; + + /* Executive summary in case the body of the oops scrolled away */ + printk(KERN_EMERG "CR2: %016lx\n", address); + + oops_end(flags, regs, sig); +} + +/* + * Print out info about fatal segfaults, if the show_unhandled_signals + * sysctl is set: + */ +static inline void +show_signal_msg(struct pt_regs *regs, unsigned long error_code, + unsigned long address, struct task_struct *tsk) +{ + if (!unhandled_signal(tsk, SIGSEGV)) + return; + + if (!printk_ratelimit()) + return; + + printk(KERN_CONT "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", + task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, task_pid_nr(tsk), address, + (void *)regs->ip, (void *)regs->sp, error_code); + + print_vma_addr(KERN_CONT " in ", regs->ip); + + printk(KERN_CONT "\n"); +} + +static void +__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, + unsigned long address, int si_code) +{ + struct task_struct *tsk = current; + + /* User mode accesses just cause a SIGSEGV */ + if (error_code & PF_USER) { + /* + * It's possible to have interrupts off here: + */ + local_irq_enable(); + + /* + * Valid to do another page fault here because this one came + * from user space: + */ + if (is_prefetch(regs, error_code, address)) + return; + + if (is_errata100(regs, address)) + return; + + if (unlikely(show_unhandled_signals)) + show_signal_msg(regs, error_code, address, tsk); + + /* Kernel addresses are always protection faults: */ + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code | (address >= TASK_SIZE); + tsk->thread.trap_no = 14; + + force_sig_info_fault(SIGSEGV, si_code, address, tsk); + + return; + } + + if (is_f00f_bug(regs, address)) + return; + + no_context(regs, error_code, address); +} + +static noinline void +bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ + __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR); +} + +static void +__bad_area(struct pt_regs *regs, unsigned long error_code, + unsigned long address, int si_code) +{ + struct mm_struct *mm = current->mm; + + /* + * Something tried to access memory that isn't in our memory map.. + * Fix it, but check if it's kernel or user first.. 
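
What force_sig_info_fault() fills in here is exactly what user space sees through SA_SIGINFO: the faulting address in si_addr, and the bad_area()/bad_area_access_error() split as SEGV_MAPERR versus SEGV_ACCERR in si_code. A runnable demonstration of both codes (the printf() inside the handler is not async-signal-safe; a demo shortcut):

#define _GNU_SOURCE
#include <setjmp.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

static sigjmp_buf env;

static void on_segv(int sig, siginfo_t *si, void *uc)
{
    printf("SIGSEGV at %p, si_code=%d (%s)\n", si->si_addr, si->si_code,
           si->si_code == SEGV_ACCERR ? "SEGV_ACCERR: vma forbids access" :
           si->si_code == SEGV_MAPERR ? "SEGV_MAPERR: no mapping" : "?");
    siglongjmp(env, 1);
}

int main(void)
{
    struct sigaction sa;
    char *noaccess = mmap(NULL, 4096, PROT_NONE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    if (noaccess == MAP_FAILED) { perror("mmap"); return 1; }

    memset(&sa, 0, sizeof(sa));
    sa.sa_sigaction = on_segv;
    sa.sa_flags = SA_SIGINFO;
    sigaction(SIGSEGV, &sa, NULL);

    if (!sigsetjmp(env, 1))
        *(volatile char *)noaccess = 1;    /* mapped, but PROT_NONE: ACCERR */
    if (!sigsetjmp(env, 1))
        *(volatile char *)16 = 1;          /* unmapped address: MAPERR */
    return 0;
}
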
+ */ + up_read(&mm->mmap_sem); + + __bad_area_nosemaphore(regs, error_code, address, si_code); +} + +static noinline void +bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address) +{ + __bad_area(regs, error_code, address, SEGV_MAPERR); +} + +static noinline void +bad_area_access_error(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ + __bad_area(regs, error_code, address, SEGV_ACCERR); +} + +/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */ +static void +out_of_memory(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ + /* + * We ran out of memory, call the OOM killer, and return the userspace + * (which will retry the fault, or kill us if we got oom-killed): + */ + up_read(&current->mm->mmap_sem); + + pagefault_out_of_memory(); +} + +static void +do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address) +{ + struct task_struct *tsk = current; + struct mm_struct *mm = tsk->mm; + + up_read(&mm->mmap_sem); + + /* Kernel mode? Handle exceptions or die: */ + if (!(error_code & PF_USER)) + no_context(regs, error_code, address); + + /* User-space => ok to do another page fault: */ + if (is_prefetch(regs, error_code, address)) + return; + + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 14; + + force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); +} + +static noinline void +mm_fault_error(struct pt_regs *regs, unsigned long error_code, + unsigned long address, unsigned int fault) +{ + if (fault & VM_FAULT_OOM) { + out_of_memory(regs, error_code, address); + } else { + if (fault & VM_FAULT_SIGBUS) + do_sigbus(regs, error_code, address); + else + BUG(); + } +} static int spurious_fault_check(unsigned long error_code, pte_t *pte) { if ((error_code & PF_WRITE) && !pte_write(*pte)) return 0; + if ((error_code & PF_INSTR) && !pte_exec(*pte)) return 0; @@ -453,21 +888,25 @@ static int spurious_fault_check(unsigned } /* - * Handle a spurious fault caused by a stale TLB entry. This allows - * us to lazily refresh the TLB when increasing the permissions of a - * kernel page (RO -> RW or NX -> X). Doing it eagerly is very - * expensive since that implies doing a full cross-processor TLB - * flush, even if no stale TLB entries exist on other processors. + * Handle a spurious fault caused by a stale TLB entry. + * + * This allows us to lazily refresh the TLB when increasing the + * permissions of a kernel page (RO -> RW or NX -> X). Doing it + * eagerly is very expensive since that implies doing a full + * cross-processor TLB flush, even if no stale TLB entries exist + * on other processors. + * * There are no security implications to leaving a stale TLB when * increasing the permissions on a page. */ -static int spurious_fault(unsigned long address, - unsigned long error_code) +static noinline int +spurious_fault(unsigned long error_code, unsigned long address) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; + int ret; /* Reserved-bit violation or user access to kernel space? */ if (error_code & (PF_USER | PF_RSVD)) @@ -495,117 +934,62 @@ static int spurious_fault(unsigned long if (!pte_present(*pte)) return 0; - return spurious_fault_check(error_code, pte); -} - -/* - * X86_32 - * Handle a fault on the vmalloc or module mapping area - * - * X86_64 - * Handle a fault on the vmalloc area - * - * This assumes no large pages in there.
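
Note how __bad_area_nosemaphore() threads si_code through to force_sig_info_fault(): bad_area() reports SEGV_MAPERR (no vma at the address) and bad_area_access_error() reports SEGV_ACCERR (vma present, access not permitted). A minimal user-space illustration of that distinction (example program, not from the patch; assumes a Linux/glibc environment):

	#include <signal.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <unistd.h>

	static void segv_handler(int sig, siginfo_t *si, void *ctx)
	{
		/* si_code carries the MAPERR/ACCERR distinction set above */
		const char *msg = (si->si_code == SEGV_ACCERR) ?
			"SEGV_ACCERR: mapped, but access not permitted\n" :
			"SEGV_MAPERR: address not mapped\n";
		write(STDOUT_FILENO, msg, strlen(msg));
		_exit(0);
	}

	int main(void)
	{
		struct sigaction sa;
		volatile char *p;

		memset(&sa, 0, sizeof(sa));
		sa.sa_sigaction = segv_handler;
		sa.sa_flags = SA_SIGINFO;
		sigaction(SIGSEGV, &sa, NULL);

		/* vma exists but forbids all access -> bad_area_access_error() */
		p = mmap(NULL, 4096, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (p == MAP_FAILED)
			return 1;
		*p = 1;		/* faults; handler reports SEGV_ACCERR */
		return 0;
	}

Touching an address with no mapping at all would instead take the bad_area() path and report SEGV_MAPERR.
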
- */ -static int vmalloc_fault(unsigned long address) -{ -#ifdef CONFIG_X86_32 - unsigned long pgd_paddr; - pmd_t *pmd_k; - pte_t *pte_k; - - /* Make sure we are in vmalloc area */ - if (!(address >= VMALLOC_START && address < VMALLOC_END)) - return -1; + ret = spurious_fault_check(error_code, pte); + if (!ret) + return 0; /* - * Synchronize this task's top level page-table - * with the 'reference' page table. - * - * Do _not_ use "current" here. We might be inside - * an interrupt in the middle of a task switch.. + * Make sure we have permissions in PMD. + * If not, then there's a bug in the page tables: */ - pgd_paddr = read_cr3(); - pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); - if (!pmd_k) - return -1; - pte_k = pte_offset_kernel(pmd_k, address); - if (!pte_present(*pte_k)) - return -1; - return 0; -#else - pgd_t *pgd, *pgd_ref; - pud_t *pud, *pud_ref; - pmd_t *pmd, *pmd_ref; - pte_t *pte, *pte_ref; + ret = spurious_fault_check(error_code, (pte_t *) pmd); + WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); - /* Make sure we are in vmalloc area */ - if (!(address >= VMALLOC_START && address < VMALLOC_END)) - return -1; + return ret; +} - /* Copy kernel mappings over when needed. This can also - happen within a race in page table update. In the later - case just flush. */ +int show_unhandled_signals = 1; - pgd = pgd_offset(current->active_mm, address); - pgd_ref = pgd_offset_k(address); - if (pgd_none(*pgd_ref)) - return -1; - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); +static inline int +access_error(unsigned long error_code, int write, struct vm_area_struct *vma) +{ + if (write) { + /* write, present and write, not present: */ + if (unlikely(!(vma->vm_flags & VM_WRITE))) + return 1; + return 0; + } - /* Below here mismatches are bugs because these lower tables - are shared */ + /* read, present: */ + if (unlikely(error_code & PF_PROT)) + return 1; + + /* read, not present: */ + if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) + return 1; - pud = pud_offset(pgd, address); - pud_ref = pud_offset(pgd_ref, address); - if (pud_none(*pud_ref)) - return -1; - if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) - BUG(); - pmd = pmd_offset(pud, address); - pmd_ref = pmd_offset(pud_ref, address); - if (pmd_none(*pmd_ref)) - return -1; - if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) - BUG(); - pte_ref = pte_offset_kernel(pmd_ref, address); - if (!pte_present(*pte_ref)) - return -1; - pte = pte_offset_kernel(pmd, address); - /* Don't use pte_page here, because the mappings can point - outside mem_map, and the NUMA hash lookup cannot handle - that. */ - if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) - BUG(); return 0; -#endif } -int show_unhandled_signals = 1; +static int fault_in_kernel_space(unsigned long address) +{ + return address >= TASK_SIZE_MAX; +} /* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate * routines. 
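
To make the lazy-TLB case that spurious_fault() absorbs concrete, the sequence looks roughly like this (a timing sketch, not code from this patch):

	/*
	 *   CPU0                               CPU1
	 *   ----                               ----
	 *   read kernel page P (PTE: RO)
	 *     -> TLB caches P as read-only
	 *                                      upgrade P: RO -> RW in the PTE,
	 *                                      deliberately no cross-CPU flush
	 *   write to P
	 *     -> #PF from the stale RO TLB entry
	 *   spurious_fault(): the PTE now
	 *   allows the write -> return
	 *   silently; the access is retried
	 */
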
*/ -#ifdef CONFIG_X86_64 -asmlinkage -#endif -void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) +dotraplinkage void __kprobes +do_page_fault(struct pt_regs *regs, unsigned long error_code) { - struct task_struct *tsk; - struct mm_struct *mm; struct vm_area_struct *vma; + struct task_struct *tsk; unsigned long address; - int write, si_code; + struct mm_struct *mm; + int write; int fault; -#ifdef CONFIG_X86_64 - unsigned long flags; - int sig; -#endif /* Set the "privileged fault" bit to something sane. */ if (user_mode_vm(regs)) @@ -615,13 +999,12 @@ void __kprobes do_page_fault(struct pt_r tsk = current; mm = tsk->mm; + prefetchw(&mm->mmap_sem); - /* get the address */ + /* Get the faulting address: */ address = read_cr2(); - si_code = SEGV_MAPERR; - if (unlikely(kmmio_fault(regs, address))) return; @@ -638,328 +1021,158 @@ void __kprobes do_page_fault(struct pt_r * (error_code & 4) == 0, and that the fault was not a * protection error (error_code & 9) == 0. */ -#ifdef CONFIG_X86_32 - if (unlikely(address >= TASK_SIZE)) { -#else - if (unlikely(address >= TASK_SIZE64)) { -#endif + if (unlikely(fault_in_kernel_space(address))) { /* Faults in hypervisor area can never be patched up. */ #if defined(CONFIG_X86_XEN) - if (address >= hypervisor_virt_start) - goto bad_area_nosemaphore; + if (address >= hypervisor_virt_start) { #elif defined(CONFIG_X86_64_XEN) if (address >= HYPERVISOR_VIRT_START - && address < HYPERVISOR_VIRT_END) - goto bad_area_nosemaphore; + && address < HYPERVISOR_VIRT_END) { #endif + bad_area_nosemaphore(regs, error_code, address); + return; + } + if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && vmalloc_fault(address) >= 0) return; - /* Can handle a stale RO->RW TLB */ - if (spurious_fault(address, error_code)) + /* Can handle a stale RO->RW TLB: */ + if (spurious_fault(error_code, address)) return; - /* kprobes don't want to hook the spurious faults. */ + /* kprobes don't want to hook the spurious faults: */ if (notify_page_fault(regs)) return; /* * Don't take the mm semaphore here. If we fixup a prefetch - * fault we could otherwise deadlock. + * fault we could otherwise deadlock: */ - goto bad_area_nosemaphore; - } + bad_area_nosemaphore(regs, error_code, address); - /* kprobes don't want to hook the spurious faults. */ - if (notify_page_fault(regs)) return; + } + /* kprobes don't want to hook the spurious faults: */ + if (unlikely(notify_page_fault(regs))) + return; /* * It's safe to allow irq's after cr2 has been saved and the * vmalloc fault has been handled. * * User-mode registers count as a user access even for any - * potential system fault or CPU buglet. + * potential system fault or CPU buglet: */ if (user_mode_vm(regs)) { local_irq_enable(); error_code |= PF_USER; - } else if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); + } else { + if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); + } -#ifdef CONFIG_X86_64 if (unlikely(error_code & PF_RSVD)) - pgtable_bad(address, regs, error_code); -#endif + pgtable_bad(regs, error_code, address); /* - * If we're in an interrupt, have no user context or are running in an - * atomic region then we must not take the fault. 
+ * If we're in an interrupt, have no user context or are running + * in an atomic region then we must not take the fault: */ - if (unlikely(in_atomic() || !mm)) - goto bad_area_nosemaphore; + if (unlikely(in_atomic() || !mm)) { + bad_area_nosemaphore(regs, error_code, address); + return; + } /* * When running in the kernel we expect faults to occur only to - * addresses in user space. All other faults represent errors in the - * kernel and should generate an OOPS. Unfortunately, in the case of an - * erroneous fault occurring in a code path which already holds mmap_sem - * we will deadlock attempting to validate the fault against the - * address space. Luckily the kernel only validly references user - * space from well defined areas of code, which are listed in the - * exceptions table. + * addresses in user space. All other faults represent errors in + * the kernel and should generate an OOPS. Unfortunately, in the + * case of an erroneous fault occurring in a code path which already + * holds mmap_sem we will deadlock attempting to validate the fault + * against the address space. Luckily the kernel only validly + * references user space from well defined areas of code, which are + * listed in the exceptions table. * * As the vast majority of faults will be valid we will only perform - * the source reference check when there is a possibility of a deadlock. - * Attempt to lock the address space, if we cannot we then validate the - * source. If this is invalid we can skip the address space check, - * thus avoiding the deadlock. + * the source reference check when there is a possibility of a + * deadlock. Attempt to lock the address space, if we cannot we then + * validate the source. If this is invalid we can skip the address + * space check, thus avoiding the deadlock: */ - if (!down_read_trylock(&mm->mmap_sem)) { + if (unlikely(!down_read_trylock(&mm->mmap_sem))) { if ((error_code & PF_USER) == 0 && - !search_exception_tables(regs->ip)) - goto bad_area_nosemaphore; + !search_exception_tables(regs->ip)) { + bad_area_nosemaphore(regs, error_code, address); + return; + } down_read(&mm->mmap_sem); + } else { + /* + * The above down_read_trylock() might have succeeded in + * which case we'll have missed the might_sleep() from + * down_read(): + */ + might_sleep(); } vma = find_vma(mm, address); - if (!vma) - goto bad_area; - if (vma->vm_start <= address) + if (unlikely(!vma)) { + bad_area(regs, error_code, address); + return; + } + if (likely(vma->vm_start <= address)) goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; + if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { + bad_area(regs, error_code, address); + return; + } if (error_code & PF_USER) { /* * Accessing the stack below %sp is always a bug. * The large cushion allows instructions like enter - * and pusha to work. ("enter $65535,$31" pushes + * and pusha to work. ("enter $65535, $31" pushes * 32 pointers and then decrements %sp by 65535.) */ - if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp) - goto bad_area; + if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { + bad_area(regs, error_code, address); + return; + } } - if (expand_stack(vma, address)) - goto bad_area; -/* - * Ok, we have a good vm_area for this memory access, so - * we can handle it.. - */ + if (unlikely(expand_stack(vma, address))) { + bad_area(regs, error_code, address); + return; + } + + /* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. 
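
The stack cushion in the PF_USER branch above works out as follows (just the arithmetic spelled out):

	/*
	 * "enter $65535, $31" pushes 32 frame pointers and then subtracts
	 * 65535 from %sp before the first faulting store, so accesses up to
	 *   65536 + 32 * sizeof(unsigned long)
	 * bytes below %sp (65792 on 64-bit, 65664 on 32-bit) must still be
	 * treated as legitimate; anything deeper is flagged as a bug.
	 */
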
+ */ good_area: - si_code = SEGV_ACCERR; - write = 0; - switch (error_code & (PF_PROT|PF_WRITE)) { - default: /* 3: write, present */ - /* fall through */ - case PF_WRITE: /* write, not present */ - if (!(vma->vm_flags & VM_WRITE)) - goto bad_area; - write++; - break; - case PF_PROT: /* read, present */ - goto bad_area; - case 0: /* read, not present */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) - goto bad_area; + write = error_code & PF_WRITE; + + if (unlikely(access_error(error_code, write, vma))) { + bad_area_access_error(regs, error_code, address); + return; } /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo - * the fault. + * the fault: */ fault = handle_mm_fault(mm, vma, address, write); + if (unlikely(fault & VM_FAULT_ERROR)) { - if (fault & VM_FAULT_OOM) - goto out_of_memory; - else if (fault & VM_FAULT_SIGBUS) - goto do_sigbus; - BUG(); + mm_fault_error(regs, error_code, address, fault); + return; } + if (fault & VM_FAULT_MAJOR) tsk->maj_flt++; else tsk->min_flt++; -#ifdef CONFIG_X86_32 - /* - * Did it hit the DOS screen memory VA from vm86 mode? - */ - if (v8086_mode(regs)) { - unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; - if (bit < 32) - tsk->thread.screen_bitmap |= 1 << bit; - } -#endif - up_read(&mm->mmap_sem); - return; - -/* - * Something tried to access memory that isn't in our memory map.. - * Fix it, but check if it's kernel or user first.. - */ -bad_area: - up_read(&mm->mmap_sem); - -bad_area_nosemaphore: - /* User mode accesses just cause a SIGSEGV */ - if (error_code & PF_USER) { - /* - * It's possible to have interrupts off here. - */ - local_irq_enable(); - - /* - * Valid to do another page fault here because this one came - * from user space. - */ - if (is_prefetch(regs, address, error_code)) - return; - - if (is_errata100(regs, address)) - return; - - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) { - printk( - "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", - task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, - tsk->comm, task_pid_nr(tsk), address, - (void *) regs->ip, (void *) regs->sp, error_code); - print_vma_addr(" in ", regs->ip); - printk("\n"); - } - - tsk->thread.cr2 = address; - /* Kernel addresses are always protection faults */ - tsk->thread.error_code = error_code | (address >= TASK_SIZE); - tsk->thread.trap_no = 14; - force_sig_info_fault(SIGSEGV, si_code, address, tsk); - return; - } - - if (is_f00f_bug(regs, address)) - return; - -no_context: - /* Are we prepared to handle this kernel fault? */ - if (fixup_exception(regs)) - return; - - /* - * X86_32 - * Valid to do another page fault here, because if this fault - * had been triggered by is_prefetch fixup_exception would have - * handled it. - * - * X86_64 - * Hall of shame of CPU/BIOS bugs. - */ - if (is_prefetch(regs, address, error_code)) - return; - - if (is_errata93(regs, address)) - return; - -/* - * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. 
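
check_v8086_mode(), called just above, is the factored-out form of the vm86 block being deleted below; based on those '-' lines it is expected to read essentially like this (kernel-context sketch, its definition is not in this hunk):

	static inline void
	check_v8086_mode(struct pt_regs *regs, unsigned long address,
			 struct task_struct *tsk)
	{
	#ifdef CONFIG_X86_32
		unsigned long bit;

		/* Did the fault hit the DOS screen memory VA from vm86 mode? */
		if (!v8086_mode(regs))
			return;

		bit = (address - 0xA0000) >> PAGE_SHIFT;
		if (bit < 32)
			tsk->thread.screen_bitmap |= 1 << bit;
	#endif
	}
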
- */ -#ifdef CONFIG_X86_32 - bust_spinlocks(1); -#else - flags = oops_begin(); -#endif - - show_fault_oops(regs, error_code, address); - - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; - -#ifdef CONFIG_X86_32 - die("Oops", regs, error_code); - bust_spinlocks(0); - do_exit(SIGKILL); -#else - sig = SIGKILL; - if (__die("Oops", regs, error_code)) - sig = 0; - /* Executive summary in case the body of the oops scrolled away */ - printk(KERN_EMERG "CR2: %016lx\n", address); - oops_end(flags, regs, sig); -#endif - -out_of_memory: - /* - * We ran out of memory, call the OOM killer, and return the userspace - * (which will retry the fault, or kill us if we got oom-killed). - */ - up_read(&mm->mmap_sem); - pagefault_out_of_memory(); - return; + check_v8086_mode(regs, address, tsk); -do_sigbus: up_read(&mm->mmap_sem); - - /* Kernel mode? Handle exceptions or die */ - if (!(error_code & PF_USER)) - goto no_context; -#ifdef CONFIG_X86_32 - /* User space => ok to do another page fault */ - if (is_prefetch(regs, address, error_code)) - return; -#endif - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 14; - force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); -} - -DEFINE_SPINLOCK(pgd_lock); -LIST_HEAD(pgd_list); - -void vmalloc_sync_all(void) -{ - unsigned long address; - -#ifdef CONFIG_X86_32 - if (SHARED_KERNEL_PMD) - return; - - for (address = VMALLOC_START & PMD_MASK; - address >= TASK_SIZE && address < FIXADDR_TOP; - address += PMD_SIZE) { - unsigned long flags; - struct page *page; - - spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { - if (!vmalloc_sync_one(page_address(page), - address)) - break; - } - spin_unlock_irqrestore(&pgd_lock, flags); - } -#else /* CONFIG_X86_64 */ - for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; - address += PGDIR_SIZE) { - const pgd_t *pgd_ref = pgd_offset_k(address); - unsigned long flags; - struct page *page; - - if (pgd_none(*pgd_ref)) - continue; - spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { - pgd_t *pgd; - pgd = (pgd_t *)page_address(page) + pgd_index(address); - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - } - spin_unlock_irqrestore(&pgd_lock, flags); - } -#endif } --- head-2010-05-25.orig/arch/x86/mm/highmem_32-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-05-25/arch/x86/mm/highmem_32-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -1,5 +1,6 @@ #include <linux/highmem.h> #include <linux/module.h> +#include <linux/swap.h> /* for totalram_pages */ void *kmap(struct page *page) { @@ -18,49 +19,6 @@ void kunmap(struct page *page) kunmap_high(page); } -static void debug_kmap_atomic_prot(enum km_type type) -{ -#ifdef CONFIG_DEBUG_HIGHMEM - static unsigned warn_count = 10; - - if (unlikely(warn_count == 0)) - return; - - if (unlikely(in_interrupt())) { - if (in_irq()) { - if (type != KM_IRQ0 && type != KM_IRQ1 && - type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ && - type != KM_BOUNCE_READ) { - WARN_ON(1); - warn_count--; - } - } else if (!irqs_disabled()) { /* softirq */ - if (type != KM_IRQ0 && type != KM_IRQ1 && - type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 && - type != KM_SKB_SUNRPC_DATA && - type != KM_SKB_DATA_SOFTIRQ && - type != KM_BOUNCE_READ) { - WARN_ON(1); - warn_count--; - } - } - } - - if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ || - type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) { - if (!irqs_disabled()) { - WARN_ON(1); -
warn_count--; - } - } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) { - if (irq_count() == 0 && !irqs_disabled()) { - WARN_ON(1); - warn_count--; - } - } -#endif -} - /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because * no global lock is needed and because the kmap code must perform a global TLB @@ -80,7 +38,7 @@ void *kmap_atomic_prot(struct page *page if (!PageHighMem(page)) return page_address(page); - debug_kmap_atomic_prot(type); + debug_kmap_atomic(type); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); @@ -120,22 +78,13 @@ void kunmap_atomic(void *kvaddr, enum km pagefault_enable(); } -/* This is the same as kmap_atomic() but can map memory that doesn't +/* + * This is the same as kmap_atomic() but can map memory that doesn't * have a struct page associated with it. */ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) { - enum fixed_addresses idx; - unsigned long vaddr; - - pagefault_disable(); - - idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot)); - /*arch_flush_lazy_mmu_mode();*/ - - return (void*) vaddr; + return kmap_atomic_prot_pfn(pfn, type, kmap_prot); } EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */ @@ -206,3 +155,35 @@ EXPORT_SYMBOL(kmap_atomic_to_page); #endif EXPORT_SYMBOL(clear_highpage); EXPORT_SYMBOL(copy_highpage); + +void __init set_highmem_pages_init(void) +{ + struct zone *zone; + int nid; + + for_each_zone(zone) { + unsigned long zone_start_pfn, zone_end_pfn; + + if (!is_highmem(zone)) + continue; + + zone_start_pfn = zone->zone_start_pfn; + zone_end_pfn = zone_start_pfn + zone->spanned_pages; + + nid = zone_to_nid(zone); + printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n", + zone->name, nid, zone_start_pfn, zone_end_pfn); + + add_highpages_with_active_regions(nid, zone_start_pfn, + zone_end_pfn); + + /* XEN: init high-mem pages outside initial allocation. 
*/ + if (zone_start_pfn < xen_start_info->nr_pages) + zone_start_pfn = xen_start_info->nr_pages; + for (; zone_start_pfn < zone_end_pfn; zone_start_pfn++) { + ClearPageReserved(pfn_to_page(zone_start_pfn)); + init_page_count(pfn_to_page(zone_start_pfn)); + } + } + totalram_pages += totalhigh_pages; +} --- head-2010-05-25.orig/arch/x86/mm/hypervisor.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/mm/hypervisor.c 2010-03-24 15:25:06.000000000 +0100 @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -47,6 +48,9 @@ EXPORT_SYMBOL(hypercall_page); +shared_info_t *__read_mostly HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page; +EXPORT_SYMBOL(HYPERVISOR_shared_info); + #define NR_MC BITS_PER_LONG #define NR_MMU BITS_PER_LONG #define NR_MMUEXT (BITS_PER_LONG / 4) @@ -538,7 +542,7 @@ int xen_create_contiguous_region( unsigned int level; if (vstart < __START_KERNEL_map - || vstart + (PAGE_SIZE << order) > (unsigned long)_end) + || vstart + (PAGE_SIZE << order) > _brk_end) return -EINVAL; ptep = lookup_address((unsigned long)__va(__pa(vstart)), &level); @@ -953,6 +957,6 @@ int write_ldt_entry(struct desc_struct * int write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc, int type) { - maddr_t mach_gp = virt_to_machine(gdt + entry); + maddr_t mach_gp = arbitrary_virt_to_machine(gdt + entry); return HYPERVISOR_update_descriptor(mach_gp, *(const u64*)desc); } --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ head-2010-05-25/arch/x86/mm/init-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -0,0 +1,459 @@ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned long __meminitdata e820_table_start; +unsigned long __meminitdata e820_table_end; +unsigned long __meminitdata e820_table_top; + +int after_bootmem; + +#if !defined(CONFIG_XEN) +int direct_gbpages +#ifdef CONFIG_DIRECT_GBPAGES + = 1 +#endif +; +#elif defined(CONFIG_X86_32) +#define direct_gbpages 0 +extern unsigned long extend_init_mapping(unsigned long tables_space); +#else +extern void xen_finish_init_mapping(void); +#endif + +static void __init find_early_table_space(unsigned long end, int use_pse, + int use_gbpages) +{ + unsigned long puds, pmds, ptes, tables; + + puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; + tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); + + if (use_gbpages) { + unsigned long extra; + + extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); + pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; + } else + pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; + + tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); + + if (use_pse) { + unsigned long extra; + + extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); +#ifdef CONFIG_X86_32 + extra += PMD_SIZE; +#endif + ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; + } else + ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; + + tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); + +#ifdef CONFIG_X86_32 + /* for fixmap */ + tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); +#endif + + /* + * RED-PEN putting page tables only on node 0 could + * cause a hotspot and fill up ZONE_DMA. The page tables + * need roughly 0.5KB per GB. 
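
As a rough sanity check on find_early_table_space(), the same sizing arithmetic as a stand-alone sketch, for mapping 4 GiB with 2 MiB pages and 8-byte entries (illustrative user-space code, use_pse=1 and use_gbpages=0; the real function additionally reserves PTE space for any unaligned head/tail and, on 32-bit, for the fixmap):

	#include <stdio.h>

	int main(void)
	{
		unsigned long long end  = 4ULL << 30;	/* map [0, 4 GiB) */
		unsigned long long puds = (end + (1ULL << 30) - 1) >> 30; /* 1 GiB per pud entry */
		unsigned long long pmds = (end + (1ULL << 21) - 1) >> 21; /* 2 MiB per pmd entry */

		/* 8 bytes per entry, rounded up to whole 4 KiB pages */
		unsigned long long tables = ((puds * 8 + 4095) & ~4095ULL)
					  + ((pmds * 8 + 4095) & ~4095ULL);

		printf("%llu KiB of early page tables\n", tables >> 10); /* prints 20 */
		return 0;
	}
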
+ */ +#ifdef CONFIG_X86_32 + e820_table_start = extend_init_mapping(tables); + e820_table_end = e820_table_start; +#else /* CONFIG_X86_64 */ + if (!e820_table_top) { + e820_table_start = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) + + xen_start_info->nr_pt_frames; + e820_table_end = e820_table_start; + } else { + /* + * [table_start, table_top) gets passed to reserve_early(), + * so we must not use table_end here, despite continuing + * to allocate from there. table_end possibly being below + * table_start is otoh not a problem. + */ + e820_table_start = e820_table_top; + } +#endif + if (e820_table_start == -1UL) + panic("Cannot find space for the kernel page tables"); + + e820_table_top = e820_table_start + (tables >> PAGE_SHIFT); + + printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", + end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT); +} + +struct map_range { + unsigned long start; + unsigned long end; + unsigned page_size_mask; +}; + +#ifdef CONFIG_X86_32 +#define NR_RANGE_MR 3 +#else /* CONFIG_X86_64 */ +#define NR_RANGE_MR 5 +#endif + +static int __meminit save_mr(struct map_range *mr, int nr_range, + unsigned long start_pfn, unsigned long end_pfn, + unsigned long page_size_mask) +{ + if (start_pfn < end_pfn) { + if (nr_range >= NR_RANGE_MR) + panic("run out of range for init_memory_mapping\n"); + mr[nr_range].start = start_pfn<> PAGE_SHIFT; + pos = start_pfn << PAGE_SHIFT; +#ifdef CONFIG_X86_32 + /* + * Don't use a large page for the first 2/4MB of memory + * because there are often fixed size MTRRs in there + * and overlapping MTRRs into large pages can cause + * slowdowns. + */ + if (pos == 0) + end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT); + else + end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); +#else /* CONFIG_X86_64 */ + end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); +#endif + if (end_pfn > (end >> PAGE_SHIFT)) + end_pfn = end >> PAGE_SHIFT; + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); + pos = end_pfn << PAGE_SHIFT; + } + + /* big page (2M) range */ + start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); +#ifdef CONFIG_X86_32 + end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); +#else /* CONFIG_X86_64 */ + end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) + << (PUD_SHIFT - PAGE_SHIFT); + if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT))) + end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)); +#endif + + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & (1<>PUD_SHIFT) + << (PUD_SHIFT - PAGE_SHIFT); + end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & + ((1<>PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); + end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & (1<>PAGE_SHIFT; + end_pfn = end>>PAGE_SHIFT; + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); + + /* try to merge same page size and continuous */ + for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { + unsigned long old_start; + if (mr[i].end != mr[i+1].start || + mr[i].page_size_mask != mr[i+1].page_size_mask) + continue; + /* move it */ + old_start = mr[i].start; + memmove(&mr[i], &mr[i+1], + (nr_range - 1 - i) * sizeof(struct map_range)); + mr[i--].start = old_start; + 
nr_range--; + } + + for (i = 0; i < nr_range; i++) + printk(KERN_DEBUG " %010lx - %010lx page %s\n", + mr[i].start, mr[i].end, + (mr[i].page_size_mask & (1<> PAGE_SHIFT) \ + << PAGE_SHIFT) + __START_KERNEL_map)) + + if (!start) { + unsigned long addr, va = __START_KERNEL_map; + unsigned long *page = (unsigned long *)init_level4_pgt; + + /* Kill mapping of memory below _text. */ + while (va < (unsigned long)&_text) { + if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0)) + BUG(); + va += PAGE_SIZE; + } + + /* Blow away any spurious initial mappings. */ + va = __START_KERNEL_map + (e820_table_start << PAGE_SHIFT); + + addr = page[pgd_index(va)]; + page = addr_to_page(addr); + addr = page[pud_index(va)]; + page = addr_to_page(addr); + while (pmd_index(va) | pte_index(va)) { + if (pmd_none(*(pmd_t *)&page[pmd_index(va)])) + break; + if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0)) + BUG(); + va += PAGE_SIZE; + } + } + + for (i = 0; i < nr_range; i++) + ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, + mr[i].page_size_mask); +#undef addr_to_page +#endif + +#ifdef CONFIG_X86_32 + early_ioremap_page_table_range_init(); +#endif + +#ifdef CONFIG_X86_64 + BUG_ON(e820_table_end > e820_table_top); + if (!start) + xen_finish_init_mapping(); + else +#endif + if (e820_table_end < e820_table_top) + /* Disable the 'table_end' allocator. */ + e820_table_top = e820_table_end; + + __flush_tlb_all(); + + if (!after_bootmem && e820_table_top > e820_table_start) + reserve_early(e820_table_start << PAGE_SHIFT, + e820_table_top << PAGE_SHIFT, "PGTABLE"); + + if (!after_bootmem) + early_memtest(start, end); + + return ret >> PAGE_SHIFT; +} + + +/* + * devmem_is_allowed() checks to see if /dev/mem access to a certain address + * is valid. The argument is a physical page number. + * + * + * On x86, access has to be given to the first megabyte of ram because that area + * contains bios code and data regions used by X and dosemu and similar apps. + * Access has to be given to non-kernel-ram areas as well, these contain the PCI + * mmio resources as well as potential bios/acpi data regions. + */ +int devmem_is_allowed(unsigned long pagenr) +{ + if (pagenr <= 256) + return 1; + if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) + return 0; + if (mfn_to_local_pfn(pagenr) >= max_pfn) + return 1; + return 0; +} + +void free_init_pages(char *what, unsigned long begin, unsigned long end) +{ + unsigned long addr = begin; + + if (addr >= end) + return; + + /* + * If debugging page accesses then do not free this memory but + * mark them not present - any buggy init-section access will + * create a kernel page fault: + */ +#ifdef CONFIG_DEBUG_PAGEALLOC + printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", + begin, PAGE_ALIGN(end)); + set_memory_np(begin, (end - begin) >> PAGE_SHIFT); +#else + /* + * We just marked the kernel text read only above, now that + * we are going to free part of that, we need to make that + * writeable first. + */ + set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); + + printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); + + for (; addr < end; addr += PAGE_SIZE) { + ClearPageReserved(virt_to_page(addr)); + init_page_count(virt_to_page(addr)); + memset((void *)(addr & ~(PAGE_SIZE-1)), + POISON_FREE_INITMEM, PAGE_SIZE); +#ifdef CONFIG_X86_64 + if (addr >= __START_KERNEL_map) { + /* make_readonly() reports all kernel addresses. 
*/ + if (HYPERVISOR_update_va_mapping((unsigned long)__va(__pa(addr)), + pfn_pte(__pa(addr) >> PAGE_SHIFT, + PAGE_KERNEL), + 0)) + BUG(); + if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) + BUG(); + } +#endif + free_page(addr); + totalram_pages++; + } +#endif +} + +void free_initmem(void) +{ + free_init_pages("unused kernel memory", + (unsigned long)(&__init_begin), + (unsigned long)(&__init_end)); +} + +#ifdef CONFIG_BLK_DEV_INITRD +void free_initrd_mem(unsigned long start, unsigned long end) +{ + free_init_pages("initrd memory", start, end); +} +#endif --- head-2010-05-25.orig/arch/x86/mm/init_32-xen.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/mm/init_32-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -52,9 +52,7 @@ #include #include #include -#include - -unsigned int __VMALLOC_RESERVE = 128 << 20; +#include unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; @@ -64,19 +62,14 @@ unsigned long highstart_pfn, highend_pfn static noinline int do_test_wp_bit(void); - -static unsigned long __initdata table_start; -static unsigned long __initdata table_end; -static unsigned long __initdata table_top; - -static int __initdata after_init_bootmem; +bool __read_mostly __vmalloc_start_set = false; static __init void *alloc_low_page(void) { - unsigned long pfn = table_end++; + unsigned long pfn = e820_table_end++; void *adr; - if (pfn >= table_top) + if (pfn >= e820_table_top) panic("alloc_low_page: ran out of memory"); adr = __va(pfn * PAGE_SIZE); @@ -96,7 +89,7 @@ static pmd_t * __init one_md_table_init( #ifdef CONFIG_X86_PAE if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) { - if (after_init_bootmem) + if (after_bootmem) pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); else pmd_table = (pmd_t *)alloc_low_page(); @@ -128,7 +121,7 @@ static pte_t * __init one_page_table_ini #endif pte_t *page_table = NULL; - if (after_init_bootmem) { + if (after_bootmem) { #ifdef CONFIG_DEBUG_PAGEALLOC page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); #endif @@ -148,6 +141,23 @@ static pte_t * __init one_page_table_ini return pte_offset_kernel(pmd, 0); } +pmd_t * __init populate_extra_pmd(unsigned long vaddr) +{ + int pgd_idx = pgd_index(vaddr); + int pmd_idx = pmd_index(vaddr); + + return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx; +} + +pte_t * __init populate_extra_pte(unsigned long vaddr) +{ + int pte_idx = pte_index(vaddr); + pmd_t *pmd; + + pmd = populate_extra_pmd(vaddr); + return one_page_table_init(pmd) + pte_idx; +} + static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, unsigned long vaddr, pte_t *lastpte) { @@ -164,12 +174,12 @@ static pte_t *__init page_table_kmap_che if (pmd_idx_kmap_begin != pmd_idx_kmap_end && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end - && ((__pa(pte) >> PAGE_SHIFT) < table_start - || (__pa(pte) >> PAGE_SHIFT) >= table_end)) { + && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start + || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) { pte_t *newpte; int i; - BUG_ON(after_init_bootmem); + BUG_ON(after_bootmem); newpte = alloc_low_page(); for (i = 0; i < PTRS_PER_PTE; i++) set_pte(newpte + i, pte[i]); @@ -244,11 +254,14 @@ static inline int is_kernel_text(unsigne * of max_low_pfn pages, by creating page tables starting from address * PAGE_OFFSET: */ -static void __init kernel_physical_mapping_init(pgd_t *pgd_base, - unsigned long start_pfn, - unsigned long end_pfn, - int use_pse) +unsigned long __init +kernel_physical_mapping_init(unsigned long start, + unsigned long end, + unsigned 
long page_size_mask) { + int use_pse = page_size_mask == (1<> PAGE_SHIFT; + end_pfn = end >> PAGE_SHIFT; + /* * First iteration will setup identity mapping using large/small pages * based on use_pse, with other attributes same as set by @@ -391,26 +407,6 @@ repeat: mapping_iter = 2; goto repeat; } -} - -/* - * devmem_is_allowed() checks to see if /dev/mem access to a certain address - * is valid. The argument is a physical page number. - * - * - * On x86, access has to be given to the first megabyte of ram because that area - * contains bios code and data regions used by X and dosemu and similar apps. - * Access has to be given to non-kernel-ram areas as well, these contain the PCI - * mmio resources as well as potential bios/acpi data regions. - */ -int devmem_is_allowed(unsigned long pagenr) -{ - if (pagenr <= 256) - return 1; - if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) - return 0; - if (mfn_to_local_pfn(pagenr) >= max_pfn) - return 1; return 0; } @@ -506,30 +502,10 @@ void __init add_highpages_with_active_re work_with_active_regions(nid, add_highpages_work_fn, &data); } -#ifndef CONFIG_NUMA -static void __init set_highmem_pages_init(void) -{ - int pfn; - - add_highpages_with_active_regions(0, highstart_pfn, highend_pfn); - - /* XEN: init high-mem pages outside initial allocation. */ - for (pfn = xen_start_info->nr_pages; pfn < highend_pfn; pfn++) { - ClearPageReserved(pfn_to_page(pfn)); - init_page_count(pfn_to_page(pfn)); - } - - totalram_pages += totalhigh_pages; -} -#endif /* !CONFIG_NUMA */ - #else static inline void permanent_kmaps_init(pgd_t *pgd_base) { } -static inline void set_highmem_pages_init(void) -{ -} #endif /* CONFIG_HIGHMEM */ pgd_t *swapper_pg_dir; @@ -553,8 +529,9 @@ pgd_t *swapper_pg_dir; * be partially populated, and so it avoids stomping on any existing * mappings. 
*/ -static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base) +void __init early_ioremap_page_table_range_init(void) { + pgd_t *pgd_base = swapper_pg_dir; unsigned long vaddr, end; /* @@ -649,7 +626,7 @@ static int __init noexec_setup(char *str } early_param("noexec", noexec_setup); -static void __init set_nx(void) +void __init set_nx(void) { unsigned int v[4], l, h; @@ -685,75 +662,97 @@ static int __init parse_highmem(char *ar } early_param("highmem", parse_highmem); +#define MSG_HIGHMEM_TOO_BIG \ + "highmem size (%luMB) is bigger than pages available (%luMB)!\n" + +#define MSG_LOWMEM_TOO_SMALL \ + "highmem size (%luMB) results in <64MB lowmem, ignoring it!\n" /* - * Determine low and high memory ranges: + * All of RAM fits into lowmem - but if user wants highmem + * artificially via the highmem=x boot parameter then create + * it: */ -void __init find_low_pfn_range(void) +void __init lowmem_pfn_init(void) { - /* it could update max_pfn */ - /* max_low_pfn is 0, we already have early_res support */ - max_low_pfn = max_pfn; - if (max_low_pfn > MAXMEM_PFN) { - if (highmem_pages == -1) - highmem_pages = max_pfn - MAXMEM_PFN; - if (highmem_pages + MAXMEM_PFN < max_pfn) - max_pfn = MAXMEM_PFN + highmem_pages; - if (highmem_pages + MAXMEM_PFN > max_pfn) { - printk(KERN_WARNING "only %luMB highmem pages " - "available, ignoring highmem size of %uMB.\n", - pages_to_mb(max_pfn - MAXMEM_PFN), + + if (highmem_pages == -1) + highmem_pages = 0; +#ifdef CONFIG_HIGHMEM + if (highmem_pages >= max_pfn) { + printk(KERN_ERR MSG_HIGHMEM_TOO_BIG, + pages_to_mb(highmem_pages), pages_to_mb(max_pfn)); + highmem_pages = 0; + } + if (highmem_pages) { + if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE) { + printk(KERN_ERR MSG_LOWMEM_TOO_SMALL, pages_to_mb(highmem_pages)); highmem_pages = 0; } - max_low_pfn = MAXMEM_PFN; + max_low_pfn -= highmem_pages; + } +#else + if (highmem_pages) + printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n"); +#endif +} + +#define MSG_HIGHMEM_TOO_SMALL \ + "only %luMB highmem pages available, ignoring highmem size of %luMB!\n" + +#define MSG_HIGHMEM_TRIMMED \ + "Warning: only 4GB will be used. 
Use a HIGHMEM64G enabled kernel!\n" +/* + * We have more RAM than fits into lowmem - we try to put it into + * highmem, also taking the highmem=x boot parameter into account: + */ +void __init highmem_pfn_init(void) +{ + max_low_pfn = MAXMEM_PFN; + + if (highmem_pages == -1) + highmem_pages = max_pfn - MAXMEM_PFN; + + if (highmem_pages + MAXMEM_PFN < max_pfn) + max_pfn = MAXMEM_PFN + highmem_pages; + + if (highmem_pages + MAXMEM_PFN > max_pfn) { + printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL, + pages_to_mb(max_pfn - MAXMEM_PFN), + pages_to_mb(highmem_pages)); + highmem_pages = 0; + } #ifndef CONFIG_HIGHMEM - /* Maximum memory usable is what is directly addressable */ - printk(KERN_WARNING "Warning only %ldMB will be used.\n", - MAXMEM>>20); - if (max_pfn > MAX_NONPAE_PFN) - printk(KERN_WARNING - "Use a HIGHMEM64G enabled kernel.\n"); - else - printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); - max_pfn = MAXMEM_PFN; + /* Maximum memory usable is what is directly addressable */ + printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20); + if (max_pfn > MAX_NONPAE_PFN) + printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n"); + else + printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); + max_pfn = MAXMEM_PFN; #else /* !CONFIG_HIGHMEM */ #ifndef CONFIG_HIGHMEM64G - if (max_pfn > MAX_NONPAE_PFN) { - max_pfn = MAX_NONPAE_PFN; - printk(KERN_WARNING "Warning only 4GB will be used." - "Use a HIGHMEM64G enabled kernel.\n"); - } + if (max_pfn > MAX_NONPAE_PFN) { + max_pfn = MAX_NONPAE_PFN; + printk(KERN_WARNING MSG_HIGHMEM_TRIMMED); + } #endif /* !CONFIG_HIGHMEM64G */ #endif /* !CONFIG_HIGHMEM */ - } else { - if (highmem_pages == -1) - highmem_pages = 0; -#ifdef CONFIG_HIGHMEM - if (highmem_pages >= max_pfn) { - printk(KERN_ERR "highmem size specified (%uMB) is " - "bigger than pages available (%luMB)!.\n", - pages_to_mb(highmem_pages), - pages_to_mb(max_pfn)); - highmem_pages = 0; - } - if (highmem_pages) { - if (max_low_pfn - highmem_pages < - 64*1024*1024/PAGE_SIZE){ - printk(KERN_ERR "highmem size %uMB results in " - "smaller than 64MB lowmem, ignoring it.\n" - , pages_to_mb(highmem_pages)); - highmem_pages = 0; - } - max_low_pfn -= highmem_pages; - } -#else - if (highmem_pages) - printk(KERN_ERR "ignoring highmem size on non-highmem" - " kernel!\n"); -#endif - } +} + +/* + * Determine low and high memory ranges: + */ +void __init find_low_pfn_range(void) +{ + /* it could update max_pfn */ + + if (max_pfn <= MAXMEM_PFN) + lowmem_pfn_init(); + else + highmem_pfn_init(); } #ifndef CONFIG_NEED_MULTIPLE_NODES @@ -779,6 +778,8 @@ void __init initmem_init(unsigned long s #ifdef CONFIG_FLATMEM max_mapnr = num_physpages; #endif + __vmalloc_start_set = true; + printk(KERN_NOTICE "%ldMB LOWMEM available.\n", pages_to_mb(max_low_pfn)); @@ -800,40 +801,70 @@ static void __init zone_sizes_init(void) free_area_init_nodes(max_zone_pfns); } +static unsigned long __init setup_node_bootmem(int nodeid, + unsigned long start_pfn, + unsigned long end_pfn, + unsigned long bootmap) +{ + unsigned long bootmap_size; + + /* don't touch min_low_pfn */ + bootmap_size = init_bootmem_node(NODE_DATA(nodeid), + bootmap >> PAGE_SHIFT, + start_pfn, end_pfn); + printk(KERN_INFO " node %d low ram: %08lx - %08lx\n", + nodeid, start_pfn<nr_pages); + unsigned long end_xen_pfn = min(max_low_pfn, xen_start_info->nr_pages); /* * Initialize the boot-time allocator (with low memory only): */ - bootmap_size = bootmem_bootmap_pages(end_pfn)<nr_pages)<nr_pages)<> PAGE_SHIFT, - min_low_pfn, end_pfn); printk(KERN_INFO " 
mapped low ram: 0 - %08lx\n", max_pfn_mapped< end_xen_pfn) + continue; + if (end_pfn > end_xen_pfn) + end_pfn = end_xen_pfn; +#else + start_pfn = 0; + end_pfn = end_xen_pfn; +#endif + bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn, + bootmap); + } + + after_bootmem = 1; } -static unsigned long __init extend_init_mapping(unsigned long tables_space) +unsigned long __init extend_init_mapping(unsigned long tables_space) { unsigned long start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) + xen_start_info->nr_pt_frames; @@ -885,133 +916,6 @@ static unsigned long __init extend_init_ return start_pfn; } -static void __init find_early_table_space(unsigned long end, int use_pse) -{ - unsigned long puds, pmds, ptes, tables; - - puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; - tables = PAGE_ALIGN(puds * sizeof(pud_t)); - - pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; - tables += PAGE_ALIGN(pmds * sizeof(pmd_t)); - - if (use_pse) { - unsigned long extra; - - extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); - extra += PMD_SIZE; - ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; - } else - ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; - - tables += PAGE_ALIGN(ptes * sizeof(pte_t)); - - /* for fixmap */ - tables += PAGE_ALIGN(__end_of_fixed_addresses * sizeof(pte_t)); - - table_start = extend_init_mapping(tables); - - table_end = table_start; - table_top = table_start + (tables>>PAGE_SHIFT); - - printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", - end, table_start << PAGE_SHIFT, - (table_start << PAGE_SHIFT) + tables); -} - -unsigned long __init_refok init_memory_mapping(unsigned long start, - unsigned long end) -{ - pgd_t *pgd_base = swapper_pg_dir; - unsigned long start_pfn, end_pfn; - unsigned long big_page_start; -#ifdef CONFIG_DEBUG_PAGEALLOC - /* - * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. - * This will simplify cpa(), which otherwise needs to support splitting - * large pages into small in interrupt context, etc. - */ - int use_pse = 0; -#else - int use_pse = cpu_has_pse; -#endif - - /* - * Find space for the kernel direct mapping tables. - */ - if (!after_init_bootmem) - find_early_table_space(end, use_pse); - -#ifdef CONFIG_X86_PAE - set_nx(); - if (nx_enabled) - printk(KERN_INFO "NX (Execute Disable) protection: active\n"); -#endif - - /* Enable PSE if available */ - if (cpu_has_pse) - set_in_cr4(X86_CR4_PSE); - - /* Enable PGE if available */ - if (cpu_has_pge) { - set_in_cr4(X86_CR4_PGE); - __supported_pte_mask |= _PAGE_GLOBAL; - } - - /* - * Don't use a large page for the first 2/4MB of memory - * because there are often fixed size MTRRs in there - * and overlapping MTRRs into large pages can cause - * slowdowns. - */ - big_page_start = PMD_SIZE; - - if (start < big_page_start) { - start_pfn = start >> PAGE_SHIFT; - end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT); - } else { - /* head is not big page alignment ? */ - start_pfn = start >> PAGE_SHIFT; - end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); - } - if (start_pfn < end_pfn) - kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0); - - /* big page range */ - start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); - if (start_pfn < (big_page_start >> PAGE_SHIFT)) - start_pfn = big_page_start >> PAGE_SHIFT; - end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); - if (start_pfn < end_pfn) - kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, - use_pse); - - /* tail is not big page alignment ? 
*/ - start_pfn = end_pfn; - if (start_pfn > (big_page_start>>PAGE_SHIFT)) { - end_pfn = end >> PAGE_SHIFT; - if (start_pfn < end_pfn) - kernel_physical_mapping_init(pgd_base, start_pfn, - end_pfn, 0); - } - - early_ioremap_page_table_range_init(pgd_base); - - __flush_tlb_all(); - - if (!after_init_bootmem) - reserve_early(table_start << PAGE_SHIFT, - table_end << PAGE_SHIFT, "PGTABLE"); - - if (!after_init_bootmem) - early_memtest(start, end); - - return end >> PAGE_SHIFT; -} - - /* * paging_init() sets up the page tables - note that the first 8MB are * already mapped by head.S. @@ -1215,17 +1119,47 @@ static noinline int do_test_wp_bit(void) const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); +static int kernel_set_to_readonly; + +void set_kernel_text_rw(void) +{ + unsigned long start = PFN_ALIGN(_text); + unsigned long size = PFN_ALIGN(_etext) - start; + + if (!kernel_set_to_readonly) + return; + + pr_debug("Set kernel text: %lx - %lx for read write\n", + start, start+size); + + set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT); +} + +void set_kernel_text_ro(void) +{ + unsigned long start = PFN_ALIGN(_text); + unsigned long size = PFN_ALIGN(_etext) - start; + + if (!kernel_set_to_readonly) + return; + + pr_debug("Set kernel text: %lx - %lx for read only\n", + start, start+size); + + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); +} + void mark_rodata_ro(void) { unsigned long start = PFN_ALIGN(_text); unsigned long size = PFN_ALIGN(_etext) - start; -#ifndef CONFIG_DYNAMIC_FTRACE - /* Dynamic tracing modifies the kernel text section */ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); printk(KERN_INFO "Write protecting the kernel text: %luk\n", size >> 10); + kernel_set_to_readonly = 1; + #ifdef CONFIG_CPA_DEBUG printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n", start, start+size); @@ -1234,7 +1168,6 @@ void mark_rodata_ro(void) printk(KERN_INFO "Testing CPA: write protecting again\n"); set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); #endif -#endif /* CONFIG_DYNAMIC_FTRACE */ start += size; size = (unsigned long)__end_rodata - start; @@ -1253,52 +1186,6 @@ void mark_rodata_ro(void) } #endif -void free_init_pages(char *what, unsigned long begin, unsigned long end) -{ -#ifdef CONFIG_DEBUG_PAGEALLOC - /* - * If debugging page accesses then do not free this memory but - * mark them not present - any buggy init-section access will - * create a kernel page fault: - */ - printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", - begin, PAGE_ALIGN(end)); - set_memory_np(begin, (end - begin) >> PAGE_SHIFT); -#else - unsigned long addr; - - /* - * We just marked the kernel text read only above, now that - * we are going to free part of that, we need to make that - * writeable first. 
- */ - set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); - - for (addr = begin; addr < end; addr += PAGE_SIZE) { - ClearPageReserved(virt_to_page(addr)); - init_page_count(virt_to_page(addr)); - memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); - free_page(addr); - totalram_pages++; - } - printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); -#endif -} - -void free_initmem(void) -{ - free_init_pages("unused kernel memory", - (unsigned long)(&__init_begin), - (unsigned long)(&__init_end)); -} - -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_init_pages("initrd memory", start, end); -} -#endif - int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, int flags) { --- head-2010-05-25.orig/arch/x86/mm/init_64-xen.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/mm/init_64-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -51,6 +51,8 @@ #include #include #include +#include +#include #include @@ -67,8 +69,6 @@ unsigned int __kernel_page_user; EXPORT_SYMBOL(__kernel_page_user); #endif -int after_bootmem; - DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); extern pmd_t level2_fixmap_pgt[PTRS_PER_PMD]; @@ -127,12 +127,6 @@ void __meminit early_make_page_readonly( } #ifndef CONFIG_XEN -int direct_gbpages -#ifdef CONFIG_DIRECT_GBPAGES - = 1 -#endif -; - static int __init parse_direct_gbpages_off(char *arg) { direct_gbpages = 0; @@ -154,14 +148,10 @@ early_param("gbpages", parse_direct_gbpa * around without checking the pgd every time. */ -static unsigned long __meminitdata table_start; -static unsigned long __meminitdata table_cur; -static unsigned long __meminitdata table_top; - pteval_t __supported_pte_mask __read_mostly = ~0UL; EXPORT_SYMBOL_GPL(__supported_pte_mask); -static int do_not_nx __cpuinitdata; +static int disable_nx __cpuinitdata; /* * noexec=on|off @@ -176,9 +166,9 @@ static int __init nonx_setup(char *str) return -EINVAL; if (!strncmp(str, "on", 2)) { __supported_pte_mask |= _PAGE_NX; - do_not_nx = 0; + disable_nx = 0; } else if (!strncmp(str, "off", 3)) { - do_not_nx = 1; + disable_nx = 1; __supported_pte_mask &= ~_PAGE_NX; } return 0; @@ -190,7 +180,7 @@ void __cpuinit check_efer(void) unsigned long efer; rdmsrl(MSR_EFER, efer); - if (!(efer & EFER_NX) || do_not_nx) + if (!(efer & EFER_NX) || disable_nx) __supported_pte_mask &= ~_PAGE_NX; } @@ -224,9 +214,9 @@ static __ref void *spp_getpage(void) if (after_bootmem) ptr = (void *) get_zeroed_page(GFP_ATOMIC); - else if (table_cur < table_top) { - ptr = __va(table_cur << PAGE_SHIFT); - table_cur++; + else if (e820_table_end < e820_table_top) { + ptr = __va(e820_table_end << PAGE_SHIFT); + e820_table_end++; memset(ptr, 0, PAGE_SIZE); } else ptr = alloc_bootmem_pages(PAGE_SIZE); @@ -241,36 +231,54 @@ static __ref void *spp_getpage(void) return ptr; } -void -set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) +static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr) { - pud_t *pud; - pmd_t *pmd; - pte_t *pte; + if (pgd_none(*pgd)) { + pud_t *pud = (pud_t *)spp_getpage(); + make_page_readonly(pud, XENFEAT_writable_page_tables); + pgd_populate(&init_mm, pgd, pud); + if (pud != pud_offset(pgd, 0)) + printk(KERN_ERR "PAGETABLE BUG #00! 
%p <-> %p\n", + pud, pud_offset(pgd, 0)); + } + return pud_offset(pgd, vaddr); +} - pud = pud_page + pud_index(vaddr); +static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr) +{ if (pud_none(*pud)) { - pmd = (pmd_t *) spp_getpage(); + pmd_t *pmd = (pmd_t *) spp_getpage(); make_page_readonly(pmd, XENFEAT_writable_page_tables); pud_populate(&init_mm, pud, pmd); - if (pmd != pmd_offset(pud, 0)) { + if (pmd != pmd_offset(pud, 0)) printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", - pmd, pmd_offset(pud, 0)); - return; - } + pmd, pmd_offset(pud, 0)); } - pmd = pmd_offset(pud, vaddr); + return pmd_offset(pud, vaddr); +} + +static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr) +{ if (pmd_none(*pmd)) { - pte = (pte_t *) spp_getpage(); + pte_t *pte = (pte_t *) spp_getpage(); make_page_readonly(pte, XENFEAT_writable_page_tables); pmd_populate_kernel(&init_mm, pmd, pte); - if (pte != pte_offset_kernel(pmd, 0)) { + if (pte != pte_offset_kernel(pmd, 0)) printk(KERN_ERR "PAGETABLE BUG #02!\n"); - return; - } } + return pte_offset_kernel(pmd, vaddr); +} + +void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) +{ + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pud = pud_page + pud_index(vaddr); + pmd = fill_pmd(pud, vaddr); + pte = fill_pte(pmd, vaddr); - pte = pte_offset_kernel(pmd, vaddr); set_pte(pte, new_pte); /* @@ -280,8 +288,7 @@ set_pte_vaddr_pud(pud_t *pud_page, unsig __flush_tlb_one(vaddr); } -void -set_pte_vaddr(unsigned long vaddr, pte_t pteval) +void set_pte_vaddr(unsigned long vaddr, pte_t pteval) { pgd_t *pgd; pud_t *pud_page; @@ -298,6 +305,24 @@ set_pte_vaddr(unsigned long vaddr, pte_t set_pte_vaddr_pud(pud_page, vaddr, pteval); } +pmd_t * __init populate_extra_pmd(unsigned long vaddr) +{ + pgd_t *pgd; + pud_t *pud; + + pgd = pgd_offset_k(vaddr); + pud = fill_pud(pgd, vaddr); + return fill_pmd(pud, vaddr); +} + +pte_t * __init populate_extra_pte(unsigned long vaddr) +{ + pmd_t *pmd; + + pmd = populate_extra_pmd(vaddr); + return fill_pte(pmd, vaddr); +} + #ifndef CONFIG_XEN /* * Create large page table mappings for a range of physical addresses. @@ -380,9 +405,9 @@ static __ref void *alloc_low_page(unsign return adr; } - BUG_ON(!table_cur); - pfn = table_cur++; - if (pfn >= table_top) + BUG_ON(!e820_table_end); + pfn = e820_table_end++; + if (pfn >= e820_table_top) panic("alloc_low_page: ran out of memory"); adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); @@ -407,13 +432,13 @@ static inline int __meminit make_readonl /* Make new page tables read-only on the first pass. */ if (!xen_feature(XENFEAT_writable_page_tables) && !max_pfn_mapped - && (paddr >= (table_start << PAGE_SHIFT)) - && (paddr < (table_top << PAGE_SHIFT))) + && (paddr >= (e820_table_start << PAGE_SHIFT)) + && (paddr < (e820_table_top << PAGE_SHIFT))) readonly = 1; /* Make old page tables read-only. */ if (!xen_feature(XENFEAT_writable_page_tables) && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map)) - && (paddr < (table_cur << PAGE_SHIFT))) + && (paddr < (e820_table_end << PAGE_SHIFT))) readonly = 1; /* @@ -422,7 +447,7 @@ static inline int __meminit make_readonl * mappings. Exclude the vsyscall area here, allowing alternative * instruction patching to work. 
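
With the fill_*() helpers above, populate_extra_pte() becomes a plain pgd->pud->pmd->pte walk that allocates intermediate tables on demand (and, in this Xen tree, makes them read-only first). A caller wanting to pre-instantiate a kernel pte would do something like this (illustrative usage only; vaddr and pfn are hypothetical):

	pte_t *pte = populate_extra_pte(vaddr);	/* allocates pud/pmd/pte as needed */
	set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
	__flush_tlb_one(vaddr);
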
*/ - if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa_symbol(&_end)) + if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa(_brk_end)) && !(paddr >= __pa_symbol(&__vsyscall_0) && paddr < __pa_symbol(&__vsyscall_0) + PAGE_SIZE)) readonly = 1; @@ -747,43 +772,9 @@ void __init xen_init_pt(void) } } -static void __init find_early_table_space(unsigned long end, int use_pse, - int use_gbpages) -{ - unsigned long puds, pmds, ptes, tables; - - puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; - tables = round_up(puds * sizeof(pud_t), PAGE_SIZE); - pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; - tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE); - - ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; - tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE); - - if (!table_top) { - table_start = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) + - xen_start_info->nr_pt_frames; - table_cur = table_start; - } else { - /* - * [table_start, table_top) gets passed to reserve_early(), - * so we must not use table_cur here, despite continuing - * to allocate from there. table_cur possibly being below - * table_start is otoh not a problem. - */ - table_start = table_top; - } - __flush_tlb_all(); - - table_top = table_cur + (tables >> PAGE_SHIFT); - - printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", - end, table_cur << PAGE_SHIFT, table_top << PAGE_SHIFT); -} - -static void __init xen_finish_init_mapping(void) +void __init xen_finish_init_mapping(void) { - unsigned long i, start, end; + unsigned long start, end; /* Re-vector virtual addresses pointing into the initial mapping to the just-established permanent ones. */ @@ -801,49 +792,22 @@ static void __init xen_finish_init_mappi __va(__pa(xen_start_info->mod_start)); /* Destroy the Xen-created mappings beyond the kernel image. */ - start = PAGE_ALIGN((unsigned long)_end); - end = __START_KERNEL_map + (table_start << PAGE_SHIFT); + start = PAGE_ALIGN(_brk_end); + end = __START_KERNEL_map + (e820_table_start << PAGE_SHIFT); for (; start < end; start += PAGE_SIZE) if (HYPERVISOR_update_va_mapping(start, __pte_ma(0), 0)) BUG(); - /* Allocate pte's for initial fixmaps from 'table_cur' allocator. */ - start = table_top; - WARN(table_cur != start, "start=%lx cur=%lx top=%lx\n", - table_start, table_cur, start); - table_top = ~0UL; - - /* Switch to the real shared_info page, and clear the dummy page. */ - set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info); - HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO); - memset(empty_zero_page, 0, sizeof(empty_zero_page)); - - /* Set up mapping of lowest 1MB of physical memory. 
*/ - for (i = 0; i < NR_FIX_ISAMAPS; i++) - if (is_initial_xendomain()) - set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE); - else - __set_fixmap(FIX_ISAMAP_BEGIN - i, - virt_to_mfn(empty_zero_page) - << PAGE_SHIFT, - PAGE_KERNEL_RO); - - table_top = max(table_cur, start); + WARN(e820_table_end != e820_table_top, "start=%lx cur=%lx top=%lx\n", + e820_table_start, e820_table_end, e820_table_top); + if (e820_table_end > e820_table_top) + e820_table_top = e820_table_end; } -static void __init init_gbpages(void) -{ -#ifndef CONFIG_XEN - if (direct_gbpages && cpu_has_gbpages) - printk(KERN_INFO "Using GB pages for direct mapping\n"); - else - direct_gbpages = 0; -#endif -} - -static unsigned long __meminit kernel_physical_mapping_init(unsigned long start, - unsigned long end, - unsigned long page_size_mask) +unsigned long __init +kernel_physical_mapping_init(unsigned long start, + unsigned long end, + unsigned long page_size_mask) { unsigned long next, last_map_addr = end; @@ -887,207 +851,6 @@ static unsigned long __meminit kernel_ph return last_map_addr; } -struct map_range { - unsigned long start; - unsigned long end; - unsigned page_size_mask; -}; - -#define NR_RANGE_MR 5 - -static int save_mr(struct map_range *mr, int nr_range, - unsigned long start_pfn, unsigned long end_pfn, - unsigned long page_size_mask) -{ - - if (start_pfn < end_pfn) { - if (nr_range >= NR_RANGE_MR) - panic("run out of range for init_memory_mapping\n"); - mr[nr_range].start = start_pfn<> PAGE_SHIFT; - pos = start_pfn << PAGE_SHIFT; - end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); - if (end_pfn > (end >> PAGE_SHIFT)) - end_pfn = end >> PAGE_SHIFT; - if (start_pfn < end_pfn) { - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); - pos = end_pfn << PAGE_SHIFT; - } - - /* big page (2M) range*/ - start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); - end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) - << (PUD_SHIFT - PAGE_SHIFT); - if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT))) - end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)); - if (start_pfn < end_pfn) { - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, - page_size_mask & (1<>PUD_SHIFT) - << (PUD_SHIFT - PAGE_SHIFT); - end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); - if (start_pfn < end_pfn) { - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, - page_size_mask & - ((1<>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); - end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); - if (start_pfn < end_pfn) { - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, - page_size_mask & (1<>PAGE_SHIFT; - end_pfn = end>>PAGE_SHIFT; - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); - - /* try to merge same page size and continuous */ - for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { - unsigned long old_start; - if (mr[i].end != mr[i+1].start || - mr[i].page_size_mask != mr[i+1].page_size_mask) - continue; - /* move it */ - old_start = mr[i].start; - memmove(&mr[i], &mr[i+1], - (nr_range - 1 - i) * sizeof (struct map_range)); - mr[i--].start = old_start; - nr_range--; - } - - for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG " %010lx - %010lx page %s\n", - mr[i].start, mr[i].end, - (mr[i].page_size_mask & (1< table_top); - if (!start) - xen_finish_init_mapping(); - else if (table_cur < table_top) - /* Disable the 'table_cur' allocator. 
*/ - table_top = table_cur; - - __flush_tlb_all(); - - if (!after_bootmem && table_top > table_start) - reserve_early(table_start << PAGE_SHIFT, - table_top << PAGE_SHIFT, "PGTABLE"); - - printk(KERN_INFO "last_map_addr: %lx end: %lx\n", - last_map_addr, end); - - if (!after_bootmem) - early_memtest(start, end); - - return last_map_addr >> PAGE_SHIFT; -} - #ifndef CONFIG_NUMA void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) { @@ -1165,28 +928,6 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to #endif /* CONFIG_MEMORY_HOTPLUG */ -/* - * devmem_is_allowed() checks to see if /dev/mem access to a certain address - * is valid. The argument is a physical page number. - * - * - * On x86, access has to be given to the first megabyte of ram because that area - * contains bios code and data regions used by X and dosemu and similar apps. - * Access has to be given to non-kernel-ram areas as well, these contain the PCI - * mmio resources as well as potential bios/acpi data regions. - */ -int devmem_is_allowed(unsigned long pagenr) -{ - if (pagenr <= 256) - return 1; - if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) - return 0; - if (mfn_to_local_pfn(pagenr) >= max_pfn) - return 1; - return 0; -} - - static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, kcore_vsyscall; @@ -1243,56 +984,39 @@ void __init mem_init(void) initsize >> 10); } -void free_init_pages(char *what, unsigned long begin, unsigned long end) +#ifdef CONFIG_DEBUG_RODATA +const int rodata_test_data = 0xC3; +EXPORT_SYMBOL_GPL(rodata_test_data); + +static int kernel_set_to_readonly; + +void set_kernel_text_rw(void) { - unsigned long addr = begin; + unsigned long start = PFN_ALIGN(_stext); + unsigned long end = PFN_ALIGN(__start_rodata); - if (addr >= end) + if (!kernel_set_to_readonly) return; - /* - * If debugging page accesses then do not free this memory but - * mark them not present - any buggy init-section access will - * create a kernel page fault: - */ -#ifdef CONFIG_DEBUG_PAGEALLOC - printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", - begin, PAGE_ALIGN(end)); - set_memory_np(begin, (end - begin) >> PAGE_SHIFT); -#else - printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); + pr_debug("Set kernel text: %lx - %lx for read write\n", + start, end); - for (; addr < end; addr += PAGE_SIZE) { - ClearPageReserved(virt_to_page(addr)); - init_page_count(virt_to_page(addr)); - memset((void *)(addr & ~(PAGE_SIZE-1)), - POISON_FREE_INITMEM, PAGE_SIZE); - if (addr >= __START_KERNEL_map) { - /* make_readonly() reports all kernel addresses. 
*/ - if (HYPERVISOR_update_va_mapping((unsigned long)__va(__pa(addr)), - pfn_pte(__pa(addr) >> PAGE_SHIFT, - PAGE_KERNEL), - 0)) - BUG(); - if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) - BUG(); - } - free_page(addr); - totalram_pages++; - } -#endif + set_memory_rw(start, (end - start) >> PAGE_SHIFT); } -void free_initmem(void) +void set_kernel_text_ro(void) { - free_init_pages("unused kernel memory", - (unsigned long)(&__init_begin), - (unsigned long)(&__init_end)); -} + unsigned long start = PFN_ALIGN(_stext); + unsigned long end = PFN_ALIGN(__start_rodata); -#ifdef CONFIG_DEBUG_RODATA -const int rodata_test_data = 0xC3; -EXPORT_SYMBOL_GPL(rodata_test_data); + if (!kernel_set_to_readonly) + return; + + pr_debug("Set kernel text: %lx - %lx for read only\n", + start, end); + + set_memory_ro(start, (end - start) >> PAGE_SHIFT); +} void mark_rodata_ro(void) { @@ -1300,15 +1024,12 @@ void mark_rodata_ro(void) unsigned long rodata_start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; -#ifdef CONFIG_DYNAMIC_FTRACE - /* Dynamic tracing modifies the kernel text section */ - start = rodata_start; -#endif - printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); set_memory_ro(start, (end - start) >> PAGE_SHIFT); + kernel_set_to_readonly = 1; + /* * The rodata section (but not the kernel text!) should also be * not-executable. @@ -1328,13 +1049,6 @@ void mark_rodata_ro(void) #endif -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_init_pages("initrd memory", start, end); -} -#endif - int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, int flags) { --- head-2010-05-25.orig/arch/x86/mm/iomap_32-xen.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/mm/iomap_32-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -20,10 +20,11 @@ #include #include #include +#include int is_io_mapping_possible(resource_size_t base, unsigned long size) { -#ifndef CONFIG_X86_PAE +#if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT) /* There is no way to map greater than 1 << 32 address without PAE */ if (base + size > 0x100000000ULL) return 0; @@ -32,16 +33,28 @@ int is_io_mapping_possible(resource_size } EXPORT_SYMBOL_GPL(is_io_mapping_possible); -/* Map 'mfn' using fixed map 'type' and protections 'prot' - */ -void * -iomap_atomic_prot_pfn(unsigned long mfn, enum km_type type, pgprot_t prot) +void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) { enum fixed_addresses idx; unsigned long vaddr; pagefault_disable(); + debug_kmap_atomic(type); + idx = type + KM_TYPE_NR * smp_processor_id(); + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + set_pte_at(&init_mm, vaddr, kmap_pte - idx, pfn_pte(pfn, prot)); + /*arch_flush_lazy_mmu_mode();*/ + + return (void *)vaddr; +} + +/* + * Map 'mfn' using fixed map 'type' and protections 'prot' + */ +void * +iomap_atomic_prot_pfn(unsigned long mfn, enum km_type type, pgprot_t prot) +{ /* * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS. 
* PAGE_KERNEL_WC maps to PWT, which translates to uncached if the @@ -51,13 +64,8 @@ iomap_atomic_prot_pfn(unsigned long mfn, if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC)) prot = PAGE_KERNEL_UC_MINUS; - idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); pgprot_val(prot) |= _PAGE_IOMAP; - set_pte_at(&init_mm, vaddr, kmap_pte-idx, pfn_pte_ma(mfn, prot)); - /*arch_flush_lazy_mmu_mode()*/; - - return (void*) vaddr; + return kmap_atomic_prot_pfn(mfn, type, prot); } EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); --- head-2010-05-25.orig/arch/x86/mm/ioremap-xen.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/mm/ioremap-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -23,13 +23,17 @@ #include #include -#ifdef CONFIG_X86_64 - -static inline int phys_addr_valid(unsigned long addr) +static inline int phys_addr_valid(resource_size_t addr) { - return addr < (1UL << boot_cpu_data.x86_phys_bits); +#ifdef CONFIG_PHYS_ADDR_T_64BIT + return !(addr >> boot_cpu_data.x86_phys_bits); +#else + return 1; +#endif } +#ifdef CONFIG_X86_64 + #define phys_base 0 unsigned long __phys_addr(unsigned long x) @@ -41,8 +45,7 @@ unsigned long __phys_addr(unsigned long } else { VIRTUAL_BUG_ON(x < PAGE_OFFSET); x -= PAGE_OFFSET; - VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM : - !phys_addr_valid(x)); + VIRTUAL_BUG_ON(!phys_addr_valid(x)); } return x; } @@ -59,10 +62,8 @@ bool __virt_addr_valid(unsigned long x) if (x < PAGE_OFFSET) return false; x -= PAGE_OFFSET; - if (system_state == SYSTEM_BOOTING ? - x > MAXMEM : !phys_addr_valid(x)) { + if (!phys_addr_valid(x)) return false; - } } return pfn_valid(x >> PAGE_SHIFT); @@ -73,18 +74,12 @@ EXPORT_SYMBOL(__virt_addr_valid); #else -static inline int phys_addr_valid(unsigned long addr) -{ - return 1; -} - #ifdef CONFIG_DEBUG_VIRTUAL unsigned long __phys_addr(unsigned long x) { - /* VMALLOC_* aren't constants; not available at the boot time */ + /* VMALLOC_* aren't constants */ VIRTUAL_BUG_ON(x < PAGE_OFFSET); - VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING && - is_vmalloc_addr((void *) x)); + VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x)); return x - PAGE_OFFSET; } EXPORT_SYMBOL(__phys_addr); @@ -94,7 +89,9 @@ bool __virt_addr_valid(unsigned long x) { if (x < PAGE_OFFSET) return false; - if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x)) + if (__vmalloc_start_set && is_vmalloc_addr((void *) x)) + return false; + if (x >= FIXADDR_START) return false; return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); } @@ -462,16 +459,17 @@ static void __iomem *__ioremap_caller(re return NULL; area->phys_addr = phys_addr; vaddr = (unsigned long) area->addr; - if (__direct_remap_pfn_range(&init_mm, vaddr, PFN_DOWN(phys_addr), - size, prot, domid)) { + + if (kernel_map_sync_memtype(phys_addr, size, prot_val)) { free_memtype(phys_addr, phys_addr + size); free_vm_area(area); return NULL; } - if (ioremap_change_attr(vaddr, size, prot_val) < 0) { + if (__direct_remap_pfn_range(&init_mm, vaddr, PFN_DOWN(phys_addr), + size, prot, domid)) { free_memtype(phys_addr, phys_addr + size); - vunmap(area->addr); + free_vm_area(area); return NULL; } @@ -528,7 +526,7 @@ EXPORT_SYMBOL(ioremap_nocache); * * Must be freed with iounmap. 
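As a usage note: a minimal sketch of how a caller might use the ioremap_wc() interface documented here; the function name and parameters below are hypothetical placeholders, not part of this patch:

#include <linux/io.h>

/* sketch: map a device aperture write-combined; falls back to an
 * uncached mapping internally when PAT is disabled */
static void __iomem *example_map_wc(resource_size_t base, unsigned long len)
{
	void __iomem *regs = ioremap_wc(base, len);

	/* the mapping must later be released with iounmap(regs) */
	return regs;
}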
*/ -void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size) +void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size) { if (pat_enabled) return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC, @@ -558,7 +556,8 @@ static void __iomem *ioremap_default(res * - UC_MINUS for non-WB-able memory with no other conflicting mappings * - Inherit from conflicting mappings otherwise */ - err = reserve_memtype(phys_addr, phys_addr + size, -1, &flags); + err = reserve_memtype(phys_addr, phys_addr + size, + _PAGE_CACHE_WB, &flags); if (err < 0) return NULL; @@ -697,13 +696,19 @@ static inline pte_t * __init early_iorem return &bm_pte[pte_index(addr)]; } +static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; + void __init early_ioremap_init(void) { pmd_t *pmd; + int i; if (early_ioremap_debug) printk(KERN_INFO "early_ioremap_init()\n"); + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) + slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i); + pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); memset(bm_pte, 0, sizeof(bm_pte)); make_lowmem_page_readonly(bm_pte, XENFEAT_writable_page_tables); @@ -734,7 +739,7 @@ void __init early_ioremap_reset(void) } static void __init __early_set_fixmap(enum fixed_addresses idx, - unsigned long phys, pgprot_t flags) + phys_addr_t phys, pgprot_t flags) { unsigned long addr = __fix_to_virt(idx); pte_t *pte; @@ -753,7 +758,7 @@ static void __init __early_set_fixmap(en } static inline void __init early_set_fixmap(enum fixed_addresses idx, - unsigned long phys, pgprot_t prot) + phys_addr_t phys, pgprot_t prot) { if (after_paging_init) __set_fixmap(idx, phys, prot); @@ -771,6 +776,7 @@ static inline void __init early_clear_fi static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata; static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; + static int __init check_early_ioremap_leak(void) { int count = 0; @@ -792,9 +798,11 @@ } late_initcall(check_early_ioremap_leak); -static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot) +static void __init __iomem * +__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) { - unsigned long offset, last_addr; + unsigned long offset; + resource_size_t last_addr; unsigned int nrpages; enum fixed_addresses idx0, idx; int i, slot; @@ -810,15 +818,15 @@ } if (slot < 0) { - printk(KERN_INFO "early_iomap(%08lx, %08lx) not found slot\n", - phys_addr, size); + printk(KERN_INFO "early_iomap(%08llx, %08lx) not found slot\n", + (u64)phys_addr, size); WARN_ON(1); return NULL; } if (early_ioremap_debug) { - printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ", - phys_addr, size, slot); + printk(KERN_INFO "early_ioremap(%08llx, %08lx) [%d] => ", + (u64)phys_addr, size, slot); dump_stack(); } @@ -858,20 +866,28 @@ --nrpages; } if (early_ioremap_debug) - printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0)); + printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]); - prev_map[slot] = (void __iomem *)(offset + fix_to_virt(idx0)); + prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]); return prev_map[slot]; } /* Remap an IO device */ -void __init __iomem *early_ioremap(unsigned long phys_addr, unsigned long size) +void __init __iomem * +early_ioremap(resource_size_t phys_addr, unsigned long size) { + /* + * Don't remap the low PCI/ISA area, it's always mapped.
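For orientation, the boot-time slots managed above are typically used in a map/inspect/unmap pattern; a minimal sketch, with a hypothetical caller and physical address:

#include <linux/io.h>

static void __init example_peek(resource_size_t phys)
{
	void __iomem *p = early_ioremap(phys, PAGE_SIZE);

	if (p) {
		u32 word = readl(p);	/* inspect the first word */

		(void)word;
		early_iounmap(p, PAGE_SIZE);	/* free the BTMAP slot */
	}
}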
+ */ + if (is_initial_xendomain() && is_ISA_range(phys_addr, phys_addr + size - 1)) + return (__force void __iomem *)isa_bus_to_virt((unsigned long)phys_addr); + return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO); } /* Remap memory */ -void __init __iomem *early_memremap(unsigned long phys_addr, unsigned long size) +void __init __iomem * +early_memremap(resource_size_t phys_addr, unsigned long size) { return __early_ioremap(phys_to_machine(phys_addr), size, PAGE_KERNEL); } @@ -884,6 +900,15 @@ void __init early_iounmap(void __iomem * enum fixed_addresses idx; int i, slot; + /* + * early_ioremap special-cases the PCI/ISA range by not instantiating a + * vm_area and by simply returning an address into the kernel mapping + * of ISA space. So handle that here. + */ + if ((unsigned long)addr >= fix_to_virt(FIX_ISAMAP_BEGIN) + && (unsigned long)addr < fix_to_virt(FIX_ISAMAP_END - 1)) + return; + slot = -1; for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { if (prev_map[i] == addr) { @@ -928,8 +953,3 @@ void __init early_iounmap(void __iomem * } prev_map[slot] = NULL; } - -void __this_fixmap_does_not_exist(void) -{ - WARN_ON(1); -} --- head-2010-05-25.orig/arch/x86/mm/pageattr-xen.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/mm/pageattr-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -33,6 +34,7 @@ struct cpa_data { unsigned long pfn; unsigned force_split : 1; int curpage; + struct page **pages; }; /* @@ -45,6 +47,7 @@ static DEFINE_SPINLOCK(cpa_lock); #define CPA_FLUSHTLB 1 #define CPA_ARRAY 2 +#define CPA_PAGES_ARRAY 4 #ifdef CONFIG_PROC_FS static unsigned long direct_pages_count[PG_LEVEL_NUM]; @@ -95,7 +98,7 @@ static inline unsigned long highmap_star static inline unsigned long highmap_end_pfn(void) { - return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT; + return __pa(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT; } #endif @@ -150,7 +153,7 @@ static void __cpa_flush_all(void *arg) */ __flush_tlb_all(); - if (cache && boot_cpu_data.x86_model >= 4) + if (cache && boot_cpu_data.x86 >= 4) wbinvd(); } @@ -201,38 +204,41 @@ static void cpa_flush_range(unsigned lon } } -static void cpa_flush_array(unsigned long *start, int numpages, int cache) +static void cpa_flush_array(unsigned long *start, int numpages, int cache, + int in_flags, struct page **pages) { unsigned int i, level; - unsigned long *addr; + unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */ BUG_ON(irqs_disabled()); - on_each_cpu(__cpa_flush_range, NULL, 1); + on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1); - if (!cache) + if (!cache || do_wbinvd) return; - /* 4M threshold */ - if (numpages >= 1024) { - if (boot_cpu_data.x86_model >= 4) - wbinvd(); - return; - } /* * We only need to flush on one CPU, * clflush is a MESI-coherent instruction that * will cause all other CPUs to flush the same * cachelines: */ - for (i = 0, addr = start; i < numpages; i++, addr++) { - pte_t *pte = lookup_address(*addr, &level); + for (i = 0; i < numpages; i++) { + unsigned long addr; + pte_t *pte; + + if (in_flags & CPA_PAGES_ARRAY) + addr = (unsigned long)page_address(pages[i]); + else + addr = start[i]; + + pte = lookup_address(addr, &level); /* * Only flush present addresses: */ if (pte && (__pte_val(*pte) & _PAGE_PRESENT)) - clflush_cache_range((void *) *addr, PAGE_SIZE); + clflush_cache_range((void *)addr, PAGE_SIZE); } } @@ -498,6 +504,13 @@ static int split_large_page(pte_t *kpte, pbase = (pte_t *)page_address(base); 
paravirt_alloc_pte(&init_mm, page_to_pfn(base)); ref_prot = pte_pgprot(pte_clrhuge(*kpte)); + /* + * If we ever want to utilize the PAT bit, we need to + * update this function to make sure it's converted from + * bit 12 to bit 7 when we cross from the 2MB level to + * the 4K level: + */ + WARN_ON_ONCE(pgprot_val(ref_prot) & _PAGE_PAT_LARGE); #ifdef CONFIG_X86_64 if (level == PG_LEVEL_1G) { @@ -597,7 +610,9 @@ static int __change_page_attr(struct cpa unsigned int level; pte_t *kpte, old_pte; - if (cpa->flags & CPA_ARRAY) + if (cpa->flags & CPA_PAGES_ARRAY) + address = (unsigned long)page_address(cpa->pages[cpa->curpage]); + else if (cpa->flags & CPA_ARRAY) address = cpa->vaddr[cpa->curpage]; else address = *cpa->vaddr; @@ -701,7 +716,9 @@ static int cpa_process_alias(struct cpa_ * No need to redo, when the primary call touched the direct * mapping already: */ - if (cpa->flags & CPA_ARRAY) + if (cpa->flags & CPA_PAGES_ARRAY) + vaddr = (unsigned long)page_address(cpa->pages[cpa->curpage]); + else if (cpa->flags & CPA_ARRAY) vaddr = cpa->vaddr[cpa->curpage]; else vaddr = *cpa->vaddr; @@ -712,7 +729,7 @@ static int cpa_process_alias(struct cpa_ alias_cpa = *cpa; temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); alias_cpa.vaddr = &temp_cpa_vaddr; - alias_cpa.flags &= ~CPA_ARRAY; + alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); ret = __change_page_attr_set_clr(&alias_cpa, 0); @@ -725,7 +742,7 @@ static int cpa_process_alias(struct cpa_ * No need to redo, when the primary call touched the high * mapping already: */ - if (within(vaddr, (unsigned long) _text, (unsigned long) _end)) + if (within(vaddr, (unsigned long) _text, _brk_end)) return 0; /* @@ -738,7 +755,7 @@ static int cpa_process_alias(struct cpa_ alias_cpa = *cpa; temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map; alias_cpa.vaddr = &temp_cpa_vaddr; - alias_cpa.flags &= ~CPA_ARRAY; + alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); /* * The high mapping range is imprecise, so ignore the return value. @@ -759,7 +776,7 @@ static int __change_page_attr_set_clr(st */ cpa->numpages = numpages; /* for array changes, we can't use large page */ - if (cpa->flags & CPA_ARRAY) + if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY)) cpa->numpages = 1; if (!debug_pagealloc) @@ -783,7 +800,7 @@ static int __change_page_attr_set_clr(st */ BUG_ON(cpa->numpages > numpages); numpages -= cpa->numpages; - if (cpa->flags & CPA_ARRAY) + if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) cpa->curpage++; else *cpa->vaddr += cpa->numpages * PAGE_SIZE; @@ -800,7 +817,8 @@ static inline int cache_attr(pgprot_t at static int change_page_attr_set_clr(unsigned long *addr, int numpages, pgprot_t mask_set, pgprot_t mask_clr, - int force_split, int array) + int force_split, int in_flag, + struct page **pages) { struct cpa_data cpa; int ret, cache, checkalias; @@ -815,15 +833,7 @@ static int change_page_attr_set_clr(unsi return 0; /* Ensure we are PAGE_SIZE aligned */ - if (!array) { - if (*addr & ~PAGE_MASK) { - *addr &= PAGE_MASK; - /* - * People should not be passing in unaligned addresses: - */ - WARN_ON_ONCE(1); - } - } else { + if (in_flag & CPA_ARRAY) { int i; for (i = 0; i < numpages; i++) { if (addr[i] & ~PAGE_MASK) { @@ -831,6 +841,18 @@ static int change_page_attr_set_clr(unsi WARN_ON_ONCE(1); } } + } else if (!(in_flag & CPA_PAGES_ARRAY)) { + /* + * in_flag of CPA_PAGES_ARRAY implies it is aligned. 
+ * No need to check in that case + */ + if (*addr & ~PAGE_MASK) { + *addr &= PAGE_MASK; + /* + * People should not be passing in unaligned addresses: + */ + WARN_ON_ONCE(1); + } } /* Must avoid aliasing mappings in the highmem code */ @@ -848,6 +870,7 @@ static int change_page_attr_set_clr(unsi xen_multicall_flush(true); cpa.vaddr = addr; + cpa.pages = pages; cpa.numpages = numpages; cpa.mask_set = mask_set; cpa.mask_clr = mask_clr; @@ -855,8 +878,8 @@ cpa.curpage = 0; cpa.force_split = force_split; - if (array) - cpa.flags |= CPA_ARRAY; + if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY)) + cpa.flags |= in_flag; /* No alias checking for _NX bit modifications */ checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; @@ -882,9 +905,10 @@ * wbindv): */ if (!ret && cpu_has_clflush) { - if (cpa.flags & CPA_ARRAY) - cpa_flush_array(addr, numpages, cache); - else + if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { + cpa_flush_array(addr, numpages, cache, + cpa.flags, pages); + } else cpa_flush_range(*addr, numpages, cache); } else cpa_flush_all(cache); @@ -905,14 +929,28 @@ static inline int change_page_attr_set(u pgprot_t mask, int array) { return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0, - array); + (array ? CPA_ARRAY : 0), NULL); } static inline int change_page_attr_clear(unsigned long *addr, int numpages, pgprot_t mask, int array) { return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0, - array); + (array ? CPA_ARRAY : 0), NULL); +} + +static inline int cpa_set_pages_array(struct page **pages, int numpages, + pgprot_t mask) +{ + return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0, + CPA_PAGES_ARRAY, pages); +} + +static inline int cpa_clear_pages_array(struct page **pages, int numpages, + pgprot_t mask) +{ + return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0, + CPA_PAGES_ARRAY, pages); } #ifdef CONFIG_XEN @@ -971,71 +1009,94 @@ int _set_memory_uc(unsigned long addr, i int set_memory_uc(unsigned long addr, int numpages) { + int ret; + /* * for now UC MINUS. see comments in ioremap_nocache() */ - if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, - _PAGE_CACHE_UC_MINUS, NULL)) - return -EINVAL; + ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, + _PAGE_CACHE_UC_MINUS, NULL); + if (ret) + goto out_err; + + ret = _set_memory_uc(addr, numpages); + if (ret) + goto out_free; - return _set_memory_uc(addr, numpages); + return 0; + +out_free: + free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); +out_err: + return ret; } EXPORT_SYMBOL(set_memory_uc); int set_memory_array_uc(unsigned long *addr, int addrinarray) { - unsigned long start; - unsigned long end; - int i; + int i, j; + int ret; + /* * for now UC MINUS.
see comments in ioremap_nocache() */ for (i = 0; i < addrinarray; i++) { - start = __pa(addr[i]); - for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) { - if (end != __pa(addr[i + 1])) - break; - i++; - } - if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL)) - goto out; + ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE, + _PAGE_CACHE_UC_MINUS, NULL); + if (ret) + goto out_free; } - return change_page_attr_set(addr, addrinarray, + ret = change_page_attr_set(addr, addrinarray, __pgprot(_PAGE_CACHE_UC_MINUS), 1); -out: - for (i = 0; i < addrinarray; i++) { - unsigned long tmp = __pa(addr[i]); + if (ret) + goto out_free; - if (tmp == start) - break; - for (end = tmp + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) { - if (end != __pa(addr[i + 1])) - break; - i++; - } - free_memtype(tmp, end); - } - return -EINVAL; + return 0; + +out_free: + for (j = 0; j < i; j++) + free_memtype(__pa(addr[j]), __pa(addr[j]) + PAGE_SIZE); + + return ret; } EXPORT_SYMBOL(set_memory_array_uc); int _set_memory_wc(unsigned long addr, int numpages) { - return change_page_attr_set(&addr, numpages, + int ret; + ret = change_page_attr_set(&addr, numpages, + __pgprot(_PAGE_CACHE_UC_MINUS), 0); + + if (!ret) { + ret = change_page_attr_set(&addr, numpages, __pgprot(_PAGE_CACHE_WC), 0); + } + return ret; } int set_memory_wc(unsigned long addr, int numpages) { + int ret; + if (!pat_enabled) return set_memory_uc(addr, numpages); - if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, - _PAGE_CACHE_WC, NULL)) - return -EINVAL; + ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, + _PAGE_CACHE_WC, NULL); + if (ret) + goto out_err; - return _set_memory_wc(addr, numpages); + ret = _set_memory_wc(addr, numpages); + if (ret) + goto out_free; + + return 0; + +out_free: + free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); +out_err: + return ret; } EXPORT_SYMBOL(set_memory_wc); @@ -1047,29 +1108,31 @@ int _set_memory_wb(unsigned long addr, i int set_memory_wb(unsigned long addr, int numpages) { - free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); + int ret; - return _set_memory_wb(addr, numpages); + ret = _set_memory_wb(addr, numpages); + if (ret) + return ret; + + free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); + return 0; } EXPORT_SYMBOL(set_memory_wb); int set_memory_array_wb(unsigned long *addr, int addrinarray) { int i; + int ret; - for (i = 0; i < addrinarray; i++) { - unsigned long start = __pa(addr[i]); - unsigned long end; - - for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) { - if (end != __pa(addr[i + 1])) - break; - i++; - } - free_memtype(start, end); - } - return change_page_attr_clear(addr, addrinarray, + ret = change_page_attr_clear(addr, addrinarray, __pgprot(_PAGE_CACHE_MASK), 1); + if (ret) + return ret; + + for (i = 0; i < addrinarray; i++) + free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE); + + return 0; } EXPORT_SYMBOL(set_memory_array_wb); @@ -1105,7 +1168,7 @@ int set_memory_np(unsigned long addr, in int set_memory_4k(unsigned long addr, int numpages) { return change_page_attr_set_clr(&addr, numpages, __pgprot(0), - __pgprot(0), 1, 0); + __pgprot(0), 1, 0, NULL); } int set_pages_uc(struct page *page, int numpages) @@ -1116,6 +1179,35 @@ int set_pages_uc(struct page *page, int } EXPORT_SYMBOL(set_pages_uc); +int set_pages_array_uc(struct page **pages, int addrinarray) +{ + unsigned long start; + unsigned long end; + int i; + int free_idx; + + for (i = 0; i < addrinarray; i++) { 
+ start = (unsigned long)page_address(pages[i]); + end = start + PAGE_SIZE; + if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL)) + goto err_out; + } + + if (cpa_set_pages_array(pages, addrinarray, + __pgprot(_PAGE_CACHE_UC_MINUS)) == 0) { + return 0; /* Success */ + } +err_out: + free_idx = i; + for (i = 0; i < free_idx; i++) { + start = (unsigned long)page_address(pages[i]); + end = start + PAGE_SIZE; + free_memtype(start, end); + } + return -EINVAL; +} +EXPORT_SYMBOL(set_pages_array_uc); + int set_pages_wb(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); @@ -1124,6 +1216,28 @@ int set_pages_wb(struct page *page, int } EXPORT_SYMBOL(set_pages_wb); +int set_pages_array_wb(struct page **pages, int addrinarray) +{ + int retval; + unsigned long start; + unsigned long end; + int i; + + retval = cpa_clear_pages_array(pages, addrinarray, + __pgprot(_PAGE_CACHE_MASK)); + if (retval) + return retval; + + for (i = 0; i < addrinarray; i++) { + start = (unsigned long)page_address(pages[i]); + end = start + PAGE_SIZE; + free_memtype(start, end); + } + + return 0; +} +EXPORT_SYMBOL(set_pages_array_wb); + int set_pages_x(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); --- head-2010-05-25.orig/arch/x86/mm/pat-xen.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/arch/x86/mm/pat-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -31,7 +31,7 @@ #ifdef CONFIG_X86_PAT int __read_mostly pat_enabled = 1; -void __cpuinit pat_disable(char *reason) +static inline void pat_disable(const char *reason) { pat_enabled = 0; printk(KERN_INFO "%s\n", reason); @@ -43,6 +43,11 @@ static int __init nopat(char *str) return 0; } early_param("nopat", nopat); +#else +static inline void pat_disable(const char *reason) +{ + (void)reason; +} #endif @@ -79,16 +84,20 @@ void pat_init(void) if (!pat_enabled) return; - /* Paranoia check. */ - if (!cpu_has_pat && boot_pat_state) { - /* - * If this happens we are on a secondary CPU, but - * switched to PAT on the boot CPU. We have no way to - * undo PAT. - */ - printk(KERN_ERR "PAT enabled, " - "but not supported by secondary CPU\n"); - BUG(); + if (!cpu_has_pat) { + if (!boot_pat_state) { + pat_disable("PAT not supported by CPU."); + return; + } else { + /* + * If this happens we are on a secondary CPU, but + * switched to PAT on the boot CPU. We have no way to + * undo PAT. + */ + printk(KERN_ERR "PAT enabled, " + "but not supported by secondary CPU\n"); + BUG(); + } } #ifndef CONFIG_XEN @@ -195,10 +204,10 @@ static unsigned long pat_x_mtrr_type(u64 u8 mtrr_type; mtrr_type = mtrr_type_lookup(start, end); - if (mtrr_type == MTRR_TYPE_UNCACHABLE) - return _PAGE_CACHE_UC; - if (mtrr_type == MTRR_TYPE_WRCOMB) - return _PAGE_CACHE_WC; + if (mtrr_type != MTRR_TYPE_WRBACK) + return _PAGE_CACHE_UC_MINUS; + + return _PAGE_CACHE_WB; } return req_type; @@ -371,23 +380,13 @@ int reserve_memtype(u64 start, u64 end, return 0; } - if (req_type == -1) { - /* - * Call mtrr_lookup to get the type hint. This is an - * optimization for /dev/mem mmap'ers into WB memory (BIOS - * tools and ACPI tools). Use WB request for WB memory and use - * UC_MINUS otherwise. - */ - u8 mtrr_type = mtrr_type_lookup(start, end); - - if (mtrr_type == MTRR_TYPE_WRBACK) - actual_type = _PAGE_CACHE_WB; - else - actual_type = _PAGE_CACHE_UC_MINUS; - } else { - actual_type = pat_x_mtrr_type(start, end, - req_type & _PAGE_CACHE_MASK); - } + /* + * Call mtrr_lookup to get the type hint. 
This is an + * optimization for /dev/mem mmap'ers into WB memory (BIOS + * tools and ACPI tools). Use WB request for WB memory and use + * UC_MINUS otherwise. + */ + actual_type = pat_x_mtrr_type(start, end, req_type & _PAGE_CACHE_MASK); if (new_type) *new_type = actual_type; @@ -565,9 +564,7 @@ static inline int range_is_allowed(unsig int phys_mem_access_prot_allowed(struct file *file, unsigned long mfn, unsigned long size, pgprot_t *vma_prot) { - u64 addr = (u64)mfn << PAGE_SHIFT; - unsigned long flags = -1; - int retval; + unsigned long flags = _PAGE_CACHE_WB; if (!range_is_allowed(mfn, size)) return 0; @@ -597,60 +594,21 @@ int phys_mem_access_prot_allowed(struct #endif #endif - /* - * With O_SYNC, we can only take UC_MINUS mapping. Fail if we cannot. - * - * Without O_SYNC, we want to get - * - WB for WB-able memory and no other conflicting mappings - * - UC_MINUS for non-WB-able memory with no other conflicting mappings - * - Inherit from conflicting mappings otherwise - */ - if (flags != -1) { - retval = reserve_memtype(addr, addr + size, flags, NULL); - } else { - retval = reserve_memtype(addr, addr + size, -1, &flags); - } - - if (retval < 0) - return 0; - - if (ioremap_check_change_attr(mfn, size, flags) < 0) { - free_memtype(addr, addr + size); - printk(KERN_INFO - "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n", - current->comm, current->pid, - cattr_name(flags), - addr, addr + size); - return 0; - } - *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) | flags); return 1; } -void map_devmem(unsigned long mfn, unsigned long size, pgprot_t vma_prot) -{ - unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK); - u64 addr = (u64)mfn << PAGE_SHIFT; - unsigned long flags; - - reserve_memtype(addr, addr + size, want_flags, &flags); - if (flags != want_flags) { - printk(KERN_INFO - "%s:%d /dev/mem expected mapping type %s for %Lx-%Lx, got %s\n", - current->comm, current->pid, - cattr_name(want_flags), - addr, (unsigned long long)(addr + size), - cattr_name(flags)); - } -} - -void unmap_devmem(unsigned long mfn, unsigned long size, pgprot_t vma_prot) +/* + * Change the memory type for the physical address range in kernel identity + * mapping space if that range is part of the identity map. + */ +int kernel_map_sync_memtype(u64 ma, unsigned long size, unsigned long flags) { - u64 addr = (u64)mfn << PAGE_SHIFT; + if (!pat_enabled) + return 0; - free_memtype(addr, addr + size); + return ioremap_check_change_attr(ma >> PAGE_SHIFT, size, flags); } #ifndef CONFIG_XEN @@ -663,17 +621,18 @@ static int reserve_pfn_range(u64 paddr, int strict_prot) { int is_ram = 0; - int id_sz, ret; - unsigned long flags; + int ret; unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK); + unsigned long flags = want_flags; is_ram = pat_pagerange_is_ram(paddr, paddr + size); /* - * reserve_pfn_range() doesn't support RAM pages. + * reserve_pfn_range() doesn't support RAM pages. Maintain the current + * behavior with RAM pages by returning success. */ if (is_ram != 0) - return -EINVAL; + return 0; ret = reserve_memtype(paddr, paddr + size, want_flags, &flags); if (ret) @@ -700,23 +659,8 @@ static int reserve_pfn_range(u64 paddr, flags); } - /* Need to keep identity mapping in sync */ - if (paddr >= __pa(high_memory)) - return 0; - - id_sz = (__pa(high_memory) < paddr + size) ?
- __pa(high_memory) - paddr : - size; - - if (ioremap_change_attr((unsigned long)__va(paddr), id_sz, flags) < 0) { + if (kernel_map_sync_memtype(paddr, size, flags) < 0) { free_memtype(paddr, paddr + size); - printk(KERN_ERR - "%s:%d reserve_pfn_range ioremap_change_attr failed %s " - "for %Lx-%Lx\n", - current->comm, current->pid, - cattr_name(flags), - (unsigned long long)paddr, - (unsigned long long)(paddr + size)); return -EINVAL; } return 0; @@ -741,29 +685,28 @@ static void free_pfn_range(u64 paddr, un * * If the vma has a linear pfn mapping for the entire range, we get the prot * from pte and reserve the entire vma range with single reserve_pfn_range call. - * Otherwise, we reserve the entire vma range, by going through the PTEs page - * by page to get physical address and protection. */ int track_pfn_vma_copy(struct vm_area_struct *vma) { - int retval = 0; - unsigned long i, j; resource_size_t paddr; unsigned long prot; - unsigned long vma_start = vma->vm_start; - unsigned long vma_end = vma->vm_end; - unsigned long vma_size = vma_end - vma_start; + unsigned long vma_size = vma->vm_end - vma->vm_start; pgprot_t pgprot; if (!pat_enabled) return 0; + /* + * For now, only handle remap_pfn_range() vmas where + * is_linear_pfn_mapping() == TRUE. Handling of + * vm_insert_pfn() is TBD. + */ if (is_linear_pfn_mapping(vma)) { /* * reserve the whole chunk covered by vma. We need the * starting address and protection from pte. */ - if (follow_phys(vma, vma_start, 0, &prot, &paddr)) { + if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) { WARN_ON_ONCE(1); return -EINVAL; } @@ -771,28 +714,7 @@ int track_pfn_vma_copy(struct vm_area_st return reserve_pfn_range(paddr, vma_size, &pgprot, 1); } - /* reserve entire vma page by page, using pfn and prot from pte */ - for (i = 0; i < vma_size; i += PAGE_SIZE) { - if (follow_phys(vma, vma_start + i, 0, &prot, &paddr)) - continue; - - pgprot = __pgprot(prot); - retval = reserve_pfn_range(paddr, PAGE_SIZE, &pgprot, 1); - if (retval) - goto cleanup_ret; - } return 0; - -cleanup_ret: - /* Reserve error: Cleanup partial reservation and return error */ - for (j = 0; j < i; j += PAGE_SIZE) { - if (follow_phys(vma, vma_start + j, 0, &prot, &paddr)) - continue; - - free_pfn_range(paddr, PAGE_SIZE); - } - - return retval; } /* @@ -802,50 +724,28 @@ cleanup_ret: * prot is passed in as a parameter for the new mapping. If the vma has a * linear pfn mapping for the entire range reserve the entire vma range with * single reserve_pfn_range call. - * Otherwise, we look at the pfn and size and reserve only the specified range - * page by page. - * - * Note that this function can be called with caller trying to map only a - * subrange/page inside the vma. */ int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long size) { - int retval = 0; - unsigned long i, j; - resource_size_t base_paddr; resource_size_t paddr; - unsigned long vma_start = vma->vm_start; - unsigned long vma_end = vma->vm_end; - unsigned long vma_size = vma_end - vma_start; + unsigned long vma_size = vma->vm_end - vma->vm_start; if (!pat_enabled) return 0; + /* + * For now, only handle remap_pfn_range() vmas where + * is_linear_pfn_mapping() == TRUE. Handling of + * vm_insert_pfn() is TBD.
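For context, the is_linear_pfn_mapping() case singled out here is what a typical driver mmap() method built on remap_pfn_range() produces; a minimal sketch (example_mmap is a hypothetical name, not from this patch):

#include <linux/fs.h>
#include <linux/mm.h>

static int example_mmap(struct file *filp, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;

	/* vma->vm_pgoff holds the starting PFN, so the tracking code can
	 * recover paddr = vm_pgoff << PAGE_SHIFT for the whole VMA */
	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
			       size, vma->vm_page_prot);
}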
+ */ if (is_linear_pfn_mapping(vma)) { /* reserve the whole chunk starting from vm_pgoff */ paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; return reserve_pfn_range(paddr, vma_size, prot, 0); } - /* reserve page by page using pfn and size */ - base_paddr = (resource_size_t)pfn << PAGE_SHIFT; - for (i = 0; i < size; i += PAGE_SIZE) { - paddr = base_paddr + i; - retval = reserve_pfn_range(paddr, PAGE_SIZE, prot, 0); - if (retval) - goto cleanup_ret; - } return 0; - -cleanup_ret: - /* Reserve error: Cleanup partial reservation and return error */ - for (j = 0; j < i; j += PAGE_SIZE) { - paddr = base_paddr + j; - free_pfn_range(paddr, PAGE_SIZE); - } - - return retval; } /* @@ -856,39 +756,23 @@ cleanup_ret: void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, unsigned long size) { - unsigned long i; resource_size_t paddr; - unsigned long prot; - unsigned long vma_start = vma->vm_start; - unsigned long vma_end = vma->vm_end; - unsigned long vma_size = vma_end - vma_start; + unsigned long vma_size = vma->vm_end - vma->vm_start; if (!pat_enabled) return; + /* + * For now, only handle remap_pfn_range() vmas where + * is_linear_pfn_mapping() == TRUE. Handling of + * vm_insert_pfn() is TBD. + */ if (is_linear_pfn_mapping(vma)) { /* free the whole chunk starting from vm_pgoff */ paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; free_pfn_range(paddr, vma_size); return; } - - if (size != 0 && size != vma_size) { - /* free page by page, using pfn and size */ - paddr = (resource_size_t)pfn << PAGE_SHIFT; - for (i = 0; i < size; i += PAGE_SIZE) { - paddr = paddr + i; - free_pfn_range(paddr, PAGE_SIZE); - } - } else { - /* free entire vma, page by page, using the pfn from pte */ - for (i = 0; i < vma_size; i += PAGE_SIZE) { - if (follow_phys(vma, vma_start + i, 0, &prot, &paddr)) - continue; - - free_pfn_range(paddr, PAGE_SIZE); - } - } } #endif /* CONFIG_XEN */ --- head-2010-05-25.orig/arch/x86/mm/pgtable-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-05-25/arch/x86/mm/pgtable-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -122,10 +122,6 @@ void __pud_free_tlb(struct mmu_gather *t #endif /* PAGETABLE_LEVELS > 3 */ #endif /* PAGETABLE_LEVELS > 2 */ -#ifndef CONFIG_X86_64 -#define TASK_SIZE64 TASK_SIZE -#endif - static void _pin_lock(struct mm_struct *mm, int lock) { if (lock) spin_lock(&mm->page_table_lock); @@ -149,7 +145,7 @@ static void _pin_lock(struct mm_struct * pgd_t *pgd = mm->pgd; unsigned g; - for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) { + for (g = 0; g <= ((TASK_SIZE_MAX-1) / PGDIR_SIZE); g++, pgd++) { pud_t *pud; unsigned u; @@ -230,10 +226,10 @@ static void pgd_walk(pgd_t *pgd_base, pg * Cannot iterate up to USER_PTRS_PER_PGD on x86-64 as these pagetables * may not be the 'current' task's pagetables (e.g., current may be * 32-bit, but the pagetables may be for a 64-bit task). - * Subtracting 1 from TASK_SIZE64 means the loop limit is correct - * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE. + * Subtracting 1 from TASK_SIZE_MAX means the loop limit is correct + * regardless of whether TASK_SIZE_MAX is a multiple of PGDIR_SIZE. 
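To make the loop bound concrete, with the usual 2.6.30-era x86-64 constants (an illustration, not part of the patch): PGDIR_SIZE is 1UL << 39 (512 GiB) and TASK_SIZE_MAX is (1UL << 47) - PAGE_SIZE, so (TASK_SIZE_MAX - 1) / PGDIR_SIZE evaluates to 255 and the walk visits PGD slots 0 through 255, i.e. exactly the 256 user-space entries, whether or not TASK_SIZE_MAX is a multiple of PGDIR_SIZE.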
*/ - for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) { + for (g = 0, seq = 0; g <= ((TASK_SIZE_MAX-1) / PGDIR_SIZE); g++, pgd++) { if (pgd_none(*pgd)) continue; pud = pud_offset(pgd, 0); @@ -736,9 +732,26 @@ int ptep_clear_flush_young(struct vm_are return young; } +/** + * reserve_top_address - reserves a hole in the top of kernel address space + * @reserve - size of hole to reserve + * + * Can be used to relocate the fixmap area and poke a hole in the top + * of kernel address space to make room for a hypervisor. + */ +void __init reserve_top_address(unsigned long reserve) +{ +#ifdef CONFIG_X86_32 + BUG_ON(fixmaps_set > 0); + printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", + (int)-reserve); + __FIXADDR_TOP = -reserve - PAGE_SIZE; +#endif +} + int fixmaps_set; -void xen_set_fixmap(enum fixed_addresses idx, maddr_t phys, pgprot_t flags) +void xen_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) { unsigned long address = __fix_to_virt(idx); pte_t pte; @@ -757,6 +770,8 @@ void xen_set_fixmap(enum fixed_addresses set_pte_vaddr_pud(level3_user_pgt, address, pte); break; case FIX_EARLYCON_MEM_BASE: + case FIX_SHARED_INFO: + case FIX_ISAMAP_END ... FIX_ISAMAP_BEGIN: xen_l1_entry_update(level1_fixmap_pgt + pte_index(address), pfn_pte_ma(phys >> PAGE_SHIFT, flags)); fixmaps_set++; --- head-2010-05-25.orig/arch/x86/mm/pgtable_32-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-05-25/arch/x86/mm/pgtable_32-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -25,6 +25,8 @@ #include #include +unsigned int __VMALLOC_RESERVE = 128 << 20; + /* * Associate a virtual page frame with a given physical page frame * and protection flags for that frame. @@ -54,7 +56,7 @@ void set_pte_vaddr(unsigned long vaddr, } pte = pte_offset_kernel(pmd, vaddr); if (pte_val(pteval)) - set_pte_present(&init_mm, vaddr, pte, pteval); + set_pte_at(&init_mm, vaddr, pte, pteval); else pte_clear(&init_mm, vaddr, pte); @@ -109,21 +111,6 @@ unsigned long hypervisor_virt_start = HY unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - PAGE_SIZE); EXPORT_SYMBOL(__FIXADDR_TOP); -/** - * reserve_top_address - reserves a hole in the top of kernel address space - * @reserve - size of hole to reserve - * - * Can be used to relocate the fixmap area and poke a hole in the top - * of kernel address space to make room for a hypervisor. - */ -void __init reserve_top_address(unsigned long reserve) -{ - BUG_ON(fixmaps_set > 0); - printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", - (int)-reserve); - __FIXADDR_TOP = -reserve - PAGE_SIZE; -} - /* * vmalloc=size forces the vmalloc area to be exactly 'size' * bytes. 
This can be used to increase (or decrease) the --- head-2010-05-25.orig/drivers/acpi/Makefile 2010-03-24 14:53:41.000000000 +0100 +++ head-2010-05-25/drivers/acpi/Makefile 2010-03-24 15:25:06.000000000 +0100 @@ -64,8 +64,6 @@ obj-$(CONFIG_ACPI_POWER_METER) += power_ processor-y := processor_driver.o processor_throttling.o processor-y += processor_idle.o processor_thermal.o processor-$(CONFIG_CPU_FREQ) += processor_perflib.o -ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL -processor-objs += processor_perflib.o processor_extcntl.o -endif +processor-$(CONFIG_PROCESSOR_EXTERNAL_CONTROL) += processor_perflib.o processor_extcntl.o obj-$(CONFIG_ACPI_PROCESSOR_AGGREGATOR) += acpi_pad.o --- head-2010-05-25.orig/drivers/acpi/acpica/hwsleep.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-05-25/drivers/acpi/acpica/hwsleep.c 2010-03-24 15:25:06.000000000 +0100 @@ -394,7 +394,7 @@ acpi_status asmlinkage acpi_enter_sleep_ #else /* PV ACPI just needs to check the hypercall return value */ err = acpi_notify_hypervisor_state(sleep_state, - PM1Acontrol, PM1Bcontrol); + pm1a_control, pm1b_control); if (err) { printk(KERN_ERR "ACPI: Hypervisor failure [%d]\n", err); return_ACPI_STATUS(AE_ERROR); --- head-2010-05-25.orig/drivers/acpi/processor_idle.c 2010-04-15 10:06:51.000000000 +0200 +++ head-2010-05-25/drivers/acpi/processor_idle.c 2010-04-15 10:06:59.000000000 +0200 @@ -606,7 +606,7 @@ static void acpi_processor_power_verify_ #ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL cx->latency_ticks = cx->latency; #else - cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency); + cx->latency_ticks = us_to_pm_timer_ticks(cx->latency); #endif /* * On older chipsets, BM_RLD needs to be set @@ -643,7 +643,7 @@ static int acpi_processor_power_verify(s #ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL cx->latency_ticks = cx->latency; /* Normalize latency */ #else - cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency); + cx->latency_ticks = us_to_pm_timer_ticks(cx->latency); #endif break; --- head-2010-05-25.orig/drivers/oprofile/oprofile_files.c 2010-03-24 15:02:17.000000000 +0100 +++ head-2010-05-25/drivers/oprofile/oprofile_files.c 2010-03-24 15:25:06.000000000 +0100 @@ -172,6 +172,7 @@ static const struct file_operations dump }; #ifdef CONFIG_XEN +#include #define TMPBUFSIZE 512 --- head-2010-05-25.orig/drivers/pci/msi-xen.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/drivers/pci/msi-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -47,47 +47,50 @@ struct msi_pirq_entry { /* Arch hooks */ -int __attribute__ ((weak)) -arch_msi_check_device(struct pci_dev *dev, int nvec, int type) -{ - return 0; -} - -#ifndef CONFIG_XEN -int __attribute__ ((weak)) -arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *entry) +#ifndef arch_msi_check_device +int arch_msi_check_device(struct pci_dev *dev, int nvec, int type) { return 0; } +#endif -int __attribute__ ((weak)) -arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +#ifndef arch_setup_msi_irqs +int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { struct msi_desc *entry; int ret; + /* + * If an architecture wants to support multiple MSI, it needs to + * override arch_setup_msi_irqs() + */ + if (type == PCI_CAP_ID_MSI && nvec > 1) + return 1; + list_for_each_entry(entry, &dev->msi_list, list) { ret = arch_setup_msi_irq(dev, entry); - if (ret) + if (ret < 0) return ret; + if (ret > 0) + return -ENOSPC; } return 0; } +#endif -void __attribute__ ((weak)) arch_teardown_msi_irq(unsigned int irq) -{ - return; -} - -void __attribute__ ((weak)) -arch_teardown_msi_irqs(struct
pci_dev *dev) +#ifndef arch_teardown_msi_irqs +void arch_teardown_msi_irqs(struct pci_dev *dev) { struct msi_desc *entry; list_for_each_entry(entry, &dev->msi_list, list) { - if (entry->irq != 0) - arch_teardown_msi_irq(entry->irq); + int i, nvec; + if (entry->irq == 0) + continue; + nvec = 1 << entry->msi_attrib.multiple; + for (i = 0; i < nvec; i++) + arch_teardown_msi_irq(entry->irq + i); } } #endif @@ -347,13 +350,15 @@ EXPORT_SYMBOL_GPL(pci_restore_msi_state) /** * msi_capability_init - configure device's MSI capability structure * @dev: pointer to the pci_dev data structure of MSI device function + * @nvec: number of interrupts to allocate * - * Setup the MSI capability structure of device function with a single - * MSI irq, regardless of device function is capable of handling - * multiple messages. A return of zero indicates the successful setup - * of an entry zero with the new MSI irq or non-zero for otherwise. - **/ -static int msi_capability_init(struct pci_dev *dev) + * Setup the MSI capability structure of the device with the requested + * number of interrupts. A return value of zero indicates the successful + * setup of an entry with the new MSI irq. A negative return value indicates + * an error, and a positive return value indicates the number of interrupts + * which could have been allocated. + */ +static int msi_capability_init(struct pci_dev *dev, int nvec) { int pos, pirq; u16 control; @@ -363,6 +368,7 @@ static int msi_capability_init(struct pc pos = pci_find_capability(dev, PCI_CAP_ID_MSI); pci_read_config_word(dev, msi_control_reg(pos), &control); + WARN_ON(nvec > 1); /* XXX */ pirq = msi_map_vector(dev, 0, 0); if (pirq < 0) return -EBUSY; @@ -496,22 +502,34 @@ static int pci_msi_check_device(struct p } /** - * pci_enable_msi - configure device's MSI capability structure - * @dev: pointer to the pci_dev data structure of MSI device function + * pci_enable_msi_block - configure device's MSI capability structure + * @dev: device to configure + * @nvec: number of interrupts to configure * - * Setup the MSI capability structure of device function with - * a single MSI irq upon its software driver call to request for - * MSI mode enabled on its hardware device function. A return of zero - * indicates the successful setup of an entry zero with the new MSI - * vector or non-zero for otherwise. - **/ + * Allocate IRQs for a device with the MSI capability. + * This function returns a negative errno if an error occurs. If it + * is unable to allocate the number of interrupts requested, it returns + * the number of interrupts it might be able to allocate. If it successfully + * allocates at least the number of interrupts requested, it returns 0 and + * updates the @dev's irq member to the lowest new interrupt number; the + * other interrupt numbers allocated to this device are consecutive. 
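The return convention spelled out above implies the usual negotiation loop on the caller's side; a minimal sketch (the driver function and the initial vector count are hypothetical):

#include <linux/pci.h>

static int example_enable_msi(struct pci_dev *pdev)
{
	int rc, nvec = 4;	/* first ask for four vectors */

	while ((rc = pci_enable_msi_block(pdev, nvec)) > 0)
		nvec = rc;	/* retry with the count the device can do */

	return rc;	/* 0 on success, negative errno on failure */
}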
+ */ extern int pci_frontend_enable_msi(struct pci_dev *dev); -int pci_enable_msi(struct pci_dev* dev) +int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec) { - int temp, status; + int temp, status, pos, maxvec; + u16 msgctl; struct msi_dev_list *msi_dev_entry = get_msi_dev_pirq_list(dev); - status = pci_msi_check_device(dev, 1, PCI_CAP_ID_MSI); + pos = pci_find_capability(dev, PCI_CAP_ID_MSI); + if (!pos) + return -EINVAL; + pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl); + maxvec = 1 << ((msgctl & PCI_MSI_FLAGS_QMASK) >> 1); + if (nvec > maxvec) + return maxvec; + + status = pci_msi_check_device(dev, nvec, PCI_CAP_ID_MSI); if (status) return status; @@ -521,6 +539,7 @@ int pci_enable_msi(struct pci_dev* dev) int ret; temp = dev->irq; + WARN_ON(nvec > 1); /* XXX */ ret = pci_frontend_enable_msi(dev); if (ret) return ret; @@ -535,23 +554,23 @@ int pci_enable_msi(struct pci_dev* dev) temp = dev->irq; - /* Check whether driver already requested for MSI-X irqs */ + /* Check whether driver already requested MSI-X irqs */ if (dev->msix_enabled) { dev_info(&dev->dev, "can't enable MSI " "(MSI-X already enabled)\n"); return -EINVAL; } - status = msi_capability_init(dev); + status = msi_capability_init(dev, nvec); if ( !status ) msi_dev_entry->default_irq = temp; return status; } -EXPORT_SYMBOL(pci_enable_msi); +EXPORT_SYMBOL(pci_enable_msi_block); extern void pci_frontend_disable_msi(struct pci_dev* dev); -void pci_msi_shutdown(struct pci_dev* dev) +void pci_msi_shutdown(struct pci_dev *dev) { int pirq; struct msi_dev_list *msi_dev_entry = get_msi_dev_pirq_list(dev); @@ -579,6 +598,7 @@ void pci_msi_shutdown(struct pci_dev* de pci_intx_for_msi(dev, 1); dev->msi_enabled = 0; } + void pci_disable_msi(struct pci_dev* dev) { pci_msi_shutdown(dev); @@ -586,6 +606,23 @@ void pci_disable_msi(struct pci_dev* dev EXPORT_SYMBOL(pci_disable_msi); /** + * pci_msix_table_size - return the number of device's MSI-X table entries + * @dev: pointer to the pci_dev data structure of MSI-X device function + */ +int pci_msix_table_size(struct pci_dev *dev) +{ + int pos; + u16 control; + + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); + if (!pos) + return 0; + + pci_read_config_word(dev, msi_control_reg(pos), &control); + return multi_msix_capable(control); +} + +/** * pci_enable_msix - configure device's MSI-X capability structure * @dev: pointer to the pci_dev data structure of MSI-X device function * @entries: pointer to an array of MSI-X entries @@ -604,9 +641,8 @@ extern int pci_frontend_enable_msix(stru struct msix_entry *entries, int nvec); int pci_enable_msix(struct pci_dev* dev, struct msix_entry *entries, int nvec) { - int status, pos, nr_entries; + int status, nr_entries; int i, j, temp; - u16 control; struct msi_dev_list *msi_dev_entry = get_msi_dev_pirq_list(dev); if (!entries) @@ -653,9 +689,7 @@ int pci_enable_msix(struct pci_dev* dev, if (status) return status; - pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); - pci_read_config_word(dev, msi_control_reg(pos), &control); - nr_entries = multi_msix_capable(control); + nr_entries = pci_msix_table_size(dev); if (nvec > nr_entries) return -EINVAL; --- head-2010-05-25.orig/drivers/xen/Kconfig 2010-03-24 15:18:46.000000000 +0100 +++ head-2010-05-25/drivers/xen/Kconfig 2010-03-24 15:25:06.000000000 +0100 @@ -14,7 +14,6 @@ menu "XEN" config XEN_PRIVILEGED_GUEST bool "Privileged Guest (domain 0)" - select PCI_REASSIGN if PCI help Support for privileged operation (domain 0) @@ -333,10 +332,6 @@ endmenu config HAVE_IRQ_IGNORE_UNHANDLED 
def_bool y -config GENERIC_HARDIRQS_NO__DO_IRQ - def_bool y - depends on X86 - config NO_IDLE_HZ def_bool y --- head-2010-05-25.orig/drivers/xen/char/mem.c 2010-03-24 15:12:46.000000000 +0100 +++ head-2010-05-25/drivers/xen/char/mem.c 2010-03-24 15:25:06.000000000 +0100 @@ -158,21 +158,7 @@ static ssize_t write_mem(struct file * f } #ifndef ARCH_HAS_DEV_MEM_MMAP_MEM -static void mmap_mem_open(struct vm_area_struct *vma) -{ - map_devmem(vma->vm_pgoff, vma->vm_end - vma->vm_start, - vma->vm_page_prot); -} - -static void mmap_mem_close(struct vm_area_struct *vma) -{ - unmap_devmem(vma->vm_pgoff, vma->vm_end - vma->vm_start, - vma->vm_page_prot); -} - static struct vm_operations_struct mmap_mem_ops = { - .open = mmap_mem_open, - .close = mmap_mem_close, #ifdef CONFIG_HAVE_IOREMAP_PROT .access = generic_access_phys #endif --- head-2010-05-25.orig/drivers/xen/core/Makefile 2010-04-19 14:50:32.000000000 +0200 +++ head-2010-05-25/drivers/xen/core/Makefile 2010-04-19 14:52:49.000000000 +0200 @@ -10,5 +10,5 @@ obj-$(CONFIG_SYS_HYPERVISOR) += hypervis obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o obj-$(CONFIG_XEN_SMPBOOT) += smpboot.o -obj-$(CONFIG_X86_SMP) += spinlock.o +obj-$(CONFIG_SMP) += spinlock.o obj-$(CONFIG_KEXEC) += machine_kexec.o --- head-2010-05-25.orig/drivers/xen/core/evtchn.c 2010-04-23 15:19:25.000000000 +0200 +++ head-2010-05-25/drivers/xen/core/evtchn.c 2010-04-23 15:19:37.000000000 +0200 @@ -150,13 +150,15 @@ DEFINE_PER_CPU(int, ipi_to_irq[NR_IPIS]) #ifdef CONFIG_SMP static u8 cpu_evtchn[NR_EVENT_CHANNELS]; -static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG]; +static DEFINE_PER_CPU(unsigned long[BITS_TO_LONGS(NR_EVENT_CHANNELS)], + cpu_evtchn_mask); -static inline unsigned long active_evtchns(unsigned int cpu, shared_info_t *sh, - unsigned int idx) +static inline unsigned long active_evtchns(unsigned int idx) { + shared_info_t *sh = HYPERVISOR_shared_info; + return (sh->evtchn_pending[idx] & - cpu_evtchn_mask[cpu][idx] & + percpu_read(cpu_evtchn_mask[idx]) & ~sh->evtchn_mask[idx]); } @@ -168,10 +170,10 @@ static void bind_evtchn_to_cpu(unsigned BUG_ON(!test_bit(chn, s->evtchn_mask)); if (irq != -1) - irq_to_desc(irq)->affinity = cpumask_of_cpu(cpu); + cpumask_copy(irq_to_desc(irq)->affinity, cpumask_of(cpu)); - clear_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu_evtchn[chn]]); - set_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu]); + clear_bit(chn, per_cpu(cpu_evtchn_mask, cpu_evtchn[chn])); + set_bit(chn, per_cpu(cpu_evtchn_mask, cpu)); cpu_evtchn[chn] = cpu; } @@ -184,11 +186,11 @@ static void init_evtchn_cpu_bindings(voi struct irq_desc *desc = irq_to_desc(i); if (desc) - desc->affinity = cpumask_of_cpu(0); + cpumask_copy(desc->affinity, cpumask_of(0)); } memset(cpu_evtchn, 0, sizeof(cpu_evtchn)); - memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0])); + memset(per_cpu(cpu_evtchn_mask, 0), ~0, sizeof(per_cpu(cpu_evtchn_mask, 0))); } static inline unsigned int cpu_from_evtchn(unsigned int evtchn) @@ -198,9 +200,10 @@ static inline unsigned int cpu_from_evtc #else -static inline unsigned long active_evtchns(unsigned int cpu, shared_info_t *sh, - unsigned int idx) +static inline unsigned long active_evtchns(unsigned int idx) { + shared_info_t *sh = HYPERVISOR_shared_info; + return (sh->evtchn_pending[idx] & ~sh->evtchn_mask[idx]); } @@ -219,25 +222,15 @@ static inline unsigned int cpu_from_evtc #endif -/* Upcall to generic IRQ layer. 
*/ #ifdef CONFIG_X86 -extern unsigned int do_IRQ(struct pt_regs *regs); void __init xen_init_IRQ(void); void __init init_IRQ(void) { irq_ctx_init(0); xen_init_IRQ(); } -#if defined (__i386__) -static inline void exit_idle(void) {} -#elif defined (__x86_64__) #include #endif -#define do_IRQ(irq, regs) do { \ - (regs)->orig_ax = ~(irq); \ - do_IRQ((regs)); \ -} while (0) -#endif /* Xen will never allocate port zero for any purpose. */ #define VALID_EVTCHN(chn) ((chn) != 0) @@ -261,13 +254,12 @@ static DEFINE_PER_CPU(unsigned int, curr /* NB. Interrupts are disabled on entry. */ asmlinkage void __irq_entry evtchn_do_upcall(struct pt_regs *regs) { + struct pt_regs *old_regs = set_irq_regs(regs); unsigned long l1, l2; unsigned long masked_l1, masked_l2; unsigned int l1i, l2i, start_l1i, start_l2i, port, count, i; int irq; - unsigned int cpu = smp_processor_id(); - shared_info_t *s = HYPERVISOR_shared_info; - vcpu_info_t *vcpu_info = &s->vcpu_info[cpu]; + vcpu_info_t *vcpu_info = current_vcpu_info(); exit_idle(); irq_enter(); @@ -277,7 +269,8 @@ asmlinkage void __irq_entry evtchn_do_up vcpu_info->evtchn_upcall_pending = 0; /* Nested invocations bail immediately. */ - if (unlikely(per_cpu(upcall_count, cpu)++)) + percpu_add(upcall_count, 1); + if (unlikely(percpu_read(upcall_count) != 1)) break; #ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */ @@ -286,8 +279,8 @@ asmlinkage void __irq_entry evtchn_do_up #endif l1 = xchg(&vcpu_info->evtchn_pending_sel, 0); - start_l1i = l1i = per_cpu(current_l1i, cpu); - start_l2i = per_cpu(current_l2i, cpu); + start_l1i = l1i = percpu_read(current_l1i); + start_l2i = percpu_read(current_l2i); for (i = 0; l1 != 0; i++) { masked_l1 = l1 & ((~0UL) << l1i); @@ -298,7 +291,7 @@ asmlinkage void __irq_entry evtchn_do_up } l1i = __ffs(masked_l1); - l2 = active_evtchns(cpu, s, l1i); + l2 = active_evtchns(l1i); l2i = 0; /* usually scan entire word from start */ if (l1i == start_l1i) { /* We scan the starting word in two parts. */ @@ -318,17 +311,18 @@ asmlinkage void __irq_entry evtchn_do_up /* process port */ port = (l1i * BITS_PER_LONG) + l2i; - if ((irq = evtchn_to_irq[port]) != -1) - do_IRQ(irq, regs); - else + if (unlikely((irq = evtchn_to_irq[port]) == -1)) evtchn_device_upcall(port); + else if (!handle_irq(irq, regs) && printk_ratelimit()) + printk(KERN_EMERG "%s(%d): No handler for irq %d\n", + __func__, smp_processor_id(), irq); l2i = (l2i + 1) % BITS_PER_LONG; /* Next caller starts at last processed + 1 */ - per_cpu(current_l1i, cpu) = - l2i ? l1i : (l1i + 1) % BITS_PER_LONG; - per_cpu(current_l2i, cpu) = l2i; + percpu_write(current_l1i, + l2i ? l1i : (l1i + 1) % BITS_PER_LONG); + percpu_write(current_l2i, l2i); } while (l2i != 0); @@ -340,11 +334,12 @@ asmlinkage void __irq_entry evtchn_do_up } /* If there were nested callbacks then we have more to do. */ - count = per_cpu(upcall_count, cpu); - per_cpu(upcall_count, cpu) = 0; + count = percpu_read(upcall_count); + percpu_write(upcall_count, 0); } while (unlikely(count != 1)); irq_exit(); + set_irq_regs(old_regs); } static struct irq_chip dynirq_chip; @@ -551,7 +546,7 @@ static void unbind_from_irq(unsigned int /* Zap stats across IRQ changes of use. 
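The evtchn_do_upcall() changes above adopt the entry sequence the x86 IRQ code settled on in this release: snapshot the old irq_regs pointer with set_irq_regs(), bracket dispatch with irq_enter()/irq_exit(), and feed each interrupt to handle_irq() instead of a locally redefined do_IRQ(). Stripped of the two-level pending-bit scan, the skeleton is roughly as follows; example_next_pending_irq() is a hypothetical stand-in for that scan, and handle_irq() is the x86 helper the patch itself calls:

#include <linux/kernel.h>
#include <linux/hardirq.h>
#include <asm/irq.h>		/* handle_irq(), x86 */
#include <asm/irq_regs.h>	/* set_irq_regs() */

static int example_next_pending_irq(void)
{
	return -1;	/* hypothetical: scan pending bits, -1 when drained */
}

void example_do_upcall(struct pt_regs *regs)
{
	struct pt_regs *old_regs = set_irq_regs(regs);
	int irq;

	irq_enter();
	while ((irq = example_next_pending_irq()) >= 0) {
		/* handle_irq() returns false if no descriptor/handler exists */
		if (!handle_irq(irq, regs) && printk_ratelimit())
			printk(KERN_EMERG "no handler for irq %d\n", irq);
	}
	irq_exit();
	set_irq_regs(old_regs);
}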
*/ for_each_possible_cpu(cpu) -#ifdef CONFIG_SPARSE_IRQ +#ifdef CONFIG_GENERIC_HARDIRQS irq_to_desc(irq)->kstat_irqs[cpu] = 0; #else kstat_cpu(cpu).irqs[irq] = 0; @@ -669,7 +664,8 @@ int bind_ipi_to_irqhandler( if (irq < 0) return irq; - retval = request_irq(irq, handler, irqflags, devname, dev_id); + retval = request_irq(irq, handler, irqflags | IRQF_NO_SUSPEND, + devname, dev_id); if (retval != 0) { unbind_from_irq(irq); return retval; @@ -1134,7 +1130,7 @@ void irq_resume(void) mask_evtchn(evtchn); /* Check that no PIRQs are still bound. */ - for (irq = PIRQ_BASE; irq < (PIRQ_BASE + NR_PIRQS); irq++) { + for (irq = PIRQ_BASE; irq < (PIRQ_BASE + nr_pirqs); irq++) { cfg = irq_cfg(irq); BUG_ON(cfg && cfg->info != IRQ_UNBOUND); } @@ -1171,7 +1167,7 @@ int arch_init_chip_data(struct irq_desc { if (!desc->chip_data) { /* By default all event channels notify CPU#0. */ - desc->affinity = cpumask_of_cpu(0); + cpumask_copy(desc->affinity, cpumask_of(0)); desc->chip_data = kzalloc(sizeof(struct irq_cfg), GFP_ATOMIC); } @@ -1185,11 +1181,44 @@ int arch_init_chip_data(struct irq_desc #endif #if defined(CONFIG_X86_IO_APIC) +#ifdef CONFIG_SPARSE_IRQ +int nr_pirqs = NR_PIRQS; +EXPORT_SYMBOL_GPL(nr_pirqs); + +int __init arch_probe_nr_irqs(void) +{ + int nr_irqs_gsi, nr = acpi_probe_gsi(); + + if (nr <= NR_IRQS_LEGACY) { + /* for acpi=off or acpi not compiled in */ + int idx; + + for (nr = idx = 0; idx < nr_ioapics; idx++) + nr += io_apic_get_redir_entries(idx) + 1; + } + nr_irqs_gsi = max(nr, NR_IRQS_LEGACY); + + nr = nr_irqs_gsi + 8 * nr_cpu_ids; +#ifdef CONFIG_PCI_MSI + nr += nr_irqs_gsi * 16; +#endif + if (nr_pirqs > nr) { + nr_pirqs = nr; + nr_irqs = nr + NR_DYNIRQS; + } + + printk(KERN_DEBUG "nr_irqs_gsi=%d nr_pirqs=%d\n", + nr_irqs_gsi, nr_pirqs); + + return 0; +} +#endif + int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) { struct physdev_irq irq_op; - if (irq < PIRQ_BASE || irq - PIRQ_BASE >= NR_PIRQS) + if (irq < PIRQ_BASE || irq - PIRQ_BASE >= nr_pirqs) return -EINVAL; if (cfg->vector) @@ -1212,7 +1241,7 @@ int assign_irq_vector(int irq, struct ir void evtchn_register_pirq(int irq) { - BUG_ON(irq < PIRQ_BASE || irq - PIRQ_BASE >= NR_PIRQS); + BUG_ON(irq < PIRQ_BASE || irq - PIRQ_BASE >= nr_pirqs); if (identity_mapped_irq(irq) || type_from_irq(irq) != IRQT_UNBOUND) return; irq_cfg(irq)->info = mk_irq_info(IRQT_PIRQ, irq, 0); @@ -1225,7 +1254,7 @@ int evtchn_map_pirq(int irq, int xen_pir if (irq < 0) { static DEFINE_SPINLOCK(irq_alloc_lock); - irq = PIRQ_BASE + NR_PIRQS - 1; + irq = PIRQ_BASE + nr_pirqs - 1; spin_lock(&irq_alloc_lock); do { struct irq_desc *desc; @@ -1285,7 +1314,7 @@ void __init xen_init_IRQ(void) init_evtchn_cpu_bindings(); pirq_needs_eoi = alloc_bootmem_pages(sizeof(unsigned long) - * BITS_TO_LONGS(ALIGN(NR_PIRQS, PAGE_SIZE * 8))); + * BITS_TO_LONGS(ALIGN(nr_pirqs, PAGE_SIZE * 8))); eoi_gmfn.gmfn = virt_to_machine(pirq_needs_eoi) >> PAGE_SHIFT; if (HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn, &eoi_gmfn) == 0) pirq_eoi_does_unmask = true; @@ -1301,7 +1330,7 @@ void __init xen_init_IRQ(void) handle_level_irq, "level"); } - for (i = PIRQ_BASE; i < (PIRQ_BASE + NR_PIRQS); i++) { + for (i = PIRQ_BASE; i < (PIRQ_BASE + nr_pirqs); i++) { #else for (i = PIRQ_BASE; i < (PIRQ_BASE + NR_IRQS_LEGACY); i++) { #endif --- head-2010-05-25.orig/drivers/xen/core/smpboot.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/drivers/xen/core/smpboot.c 2010-03-24 15:25:06.000000000 +0100 @@ -18,7 +18,6 @@ #include #include #include -#include #include #include 
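Note that bind_ipi_to_irqhandler() above now ORs in IRQF_NO_SUSPEND, the flag this patch introduces in include/linux/interrupt.h further down, so that IPI event channels keep firing while __disable_irq() parks ordinary interrupts during suspend. From a handler's point of view the usage is just the following (handler and IRQ names are illustrative, not from the patch):

#include <linux/interrupt.h>

static irqreturn_t example_ipi_handler(int irq, void *dev_id)
{
	/* acknowledge and process the cross-CPU notification */
	return IRQ_HANDLED;
}

static int example_bind_ipi(unsigned int irq)
{
	/* IRQF_NO_SUSPEND: leave this IRQ enabled across suspend_device_irqs() */
	return request_irq(irq, example_ipi_handler,
			   IRQF_DISABLED | IRQF_NO_SUSPEND,
			   "example-ipi", NULL);
}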
#include @@ -54,8 +53,8 @@ static char call1func_name[NR_CPUS][15]; #define set_cpu_to_apicid(cpu, apicid) #endif -DEFINE_PER_CPU(cpumask_t, cpu_sibling_map); -DEFINE_PER_CPU(cpumask_t, cpu_core_map); +DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map); +DEFINE_PER_CPU(cpumask_var_t, cpu_core_map); void __init prefill_possible_map(void) { @@ -88,8 +87,8 @@ set_cpu_sibling_map(unsigned int cpu) cpu_data(cpu).phys_proc_id = cpu; cpu_data(cpu).cpu_core_id = 0; - per_cpu(cpu_sibling_map, cpu) = cpumask_of_cpu(cpu); - per_cpu(cpu_core_map, cpu) = cpumask_of_cpu(cpu); + cpumask_copy(cpu_sibling_mask(cpu), cpumask_of(cpu)); + cpumask_copy(cpu_core_mask(cpu), cpumask_of(cpu)); cpu_data(cpu).booted_cores = 1; } @@ -100,8 +99,8 @@ remove_siblinginfo(unsigned int cpu) cpu_data(cpu).phys_proc_id = BAD_APICID; cpu_data(cpu).cpu_core_id = BAD_APICID; - cpus_clear(per_cpu(cpu_sibling_map, cpu)); - cpus_clear(per_cpu(cpu_core_map, cpu)); + cpumask_clear(cpu_sibling_mask(cpu)); + cpumask_clear(cpu_core_mask(cpu)); cpu_data(cpu).booted_cores = 0; } @@ -224,7 +223,7 @@ static void __cpuinit cpu_initialize_con smp_trap_init(ctxt.trap_ctxt); ctxt.ldt_ents = 0; - ctxt.gdt_frames[0] = virt_to_mfn(get_cpu_gdt_table(cpu)); + ctxt.gdt_frames[0] = arbitrary_virt_to_mfn(get_cpu_gdt_table(cpu)); ctxt.gdt_ents = GDT_SIZE / 8; ctxt.user_regs.cs = __KERNEL_CS; @@ -242,12 +241,13 @@ static void __cpuinit cpu_initialize_con ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir)); ctxt.user_regs.fs = __KERNEL_PERCPU; + ctxt.user_regs.gs = __KERNEL_STACK_CANARY; #else /* __x86_64__ */ ctxt.syscall_callback_eip = (unsigned long)system_call; ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt)); - ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu)); + ctxt.gs_base_kernel = per_cpu_offset(cpu); #endif if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt)) @@ -275,8 +275,10 @@ void __init smp_prepare_cpus(unsigned in current_thread_info()->cpu = 0; for_each_possible_cpu (cpu) { - cpus_clear(per_cpu(cpu_sibling_map, cpu)); - cpus_clear(per_cpu(cpu_core_map, cpu)); + alloc_cpumask_var(&per_cpu(cpu_sibling_map, cpu), GFP_KERNEL); + alloc_cpumask_var(&per_cpu(cpu_core_map, cpu), GFP_KERNEL); + cpumask_clear(cpu_sibling_mask(cpu)); + cpumask_clear(cpu_core_mask(cpu)); } set_cpu_sibling_map(0); @@ -303,9 +305,6 @@ void __init smp_prepare_cpus(unsigned in if (IS_ERR(idle)) panic("failed fork for CPU %d", cpu); -#ifdef __i386__ - init_gdt(cpu); -#endif gdt_addr = get_cpu_gdt_table(cpu); make_page_readonly(gdt_addr, XENFEAT_writable_descriptor_tables); @@ -319,12 +318,12 @@ void __init smp_prepare_cpus(unsigned in set_cpu_to_apicid(cpu, apicid); #ifdef __x86_64__ - cpu_pda(cpu)->pcurrent = idle; - cpu_pda(cpu)->cpunumber = cpu; clear_tsk_thread_flag(idle, TIF_FORK); -#else - per_cpu(current_task, cpu) = idle; + per_cpu(kernel_stack, cpu) = + (unsigned long)task_stack_page(idle) - + KERNEL_STACK_OFFSET + THREAD_SIZE; #endif + per_cpu(current_task, cpu) = idle; irq_ctx_init(cpu); @@ -348,10 +347,7 @@ void __init smp_prepare_cpus(unsigned in void __init smp_prepare_boot_cpu(void) { -#ifdef __i386__ - init_gdt(smp_processor_id()); -#endif - switch_to_new_gdt(); + switch_to_new_gdt(smp_processor_id()); prefill_possible_map(); } --- head-2010-05-25.orig/drivers/xen/core/spinlock.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/drivers/xen/core/spinlock.c 2010-03-24 15:25:06.000000000 +0100 @@ -78,13 +78,13 @@ static unsigned int spin_adjust(struct s unsigned int xen_spin_adjust(const raw_spinlock_t *lock, unsigned int 
token) { - return spin_adjust(x86_read_percpu(spinning), lock, token); + return spin_adjust(percpu_read(spinning), lock, token); } bool xen_spin_wait(raw_spinlock_t *lock, unsigned int *ptok, unsigned int flags) { - int irq = x86_read_percpu(spinlock_irq); + int irq = percpu_read(spinlock_irq); bool rc; typeof(vcpu_info(0)->evtchn_upcall_mask) upcall_mask; raw_rwlock_t *rm_lock; @@ -97,9 +97,9 @@ bool xen_spin_wait(raw_spinlock_t *lock, /* announce we're spinning */ spinning.ticket = *ptok >> TICKET_SHIFT; spinning.lock = lock; - spinning.prev = x86_read_percpu(spinning); + spinning.prev = percpu_read(spinning); smp_wmb(); - x86_write_percpu(spinning, &spinning); + percpu_write(spinning, &spinning); upcall_mask = current_vcpu_info()->evtchn_upcall_mask; do { @@ -184,7 +184,7 @@ bool xen_spin_wait(raw_spinlock_t *lock, /* announce we're done */ other = spinning.prev; - x86_write_percpu(spinning, other); + percpu_write(spinning, other); rm_lock = &__get_cpu_var(spinning_rm_lock); raw_local_irq_disable(); __raw_write_lock(rm_lock); --- head-2010-05-25.orig/drivers/xen/netback/interface.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/drivers/xen/netback/interface.c 2010-03-24 15:25:06.000000000 +0100 @@ -121,7 +121,7 @@ static void netbk_get_drvinfo(struct net struct ethtool_drvinfo *info) { strcpy(info->driver, "netbk"); - strcpy(info->bus_info, dev->dev.parent->bus_id); + strcpy(info->bus_info, dev_name(dev->dev.parent)); } static const struct netif_stat { --- head-2010-05-25.orig/drivers/xen/netback/netback.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/drivers/xen/netback/netback.c 2010-03-24 15:25:06.000000000 +0100 @@ -333,7 +333,7 @@ int netif_be_start_xmit(struct sk_buff * */ netif->tx_queue_timeout.data = (unsigned long)netif; netif->tx_queue_timeout.function = tx_queue_callback; - __mod_timer(&netif->tx_queue_timeout, jiffies + HZ/2); + mod_timer(&netif->tx_queue_timeout, jiffies + HZ/2); } } @@ -354,7 +354,7 @@ static void xen_network_done_notify(void static struct net_device *eth0_dev = NULL; if (unlikely(eth0_dev == NULL)) eth0_dev = __dev_get_by_name(&init_net, "eth0"); - netif_rx_schedule(???); + napi_schedule(???); } /* * Add following to poll() function in NAPI driver (Tigon3 is example): @@ -1308,8 +1308,7 @@ static void net_tx_action(unsigned long (unsigned long)netif; netif->credit_timeout.function = tx_credit_callback; - __mod_timer(&netif->credit_timeout, - next_credit); + mod_timer(&netif->credit_timeout, next_credit); netif_put(netif); continue; } --- head-2010-05-25.orig/drivers/xen/netfront/netfront.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/drivers/xen/netfront/netfront.c 2010-03-24 15:25:06.000000000 +0100 @@ -102,7 +102,7 @@ static const int MODPARM_rx_flip = 0; static inline void dev_disable_gso_features(struct net_device *dev) { /* Turn off all GSO bits except ROBUST. 
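The netback/netfront hunks in this region are mostly mechanical renames to the NAPI API 2.6.30 mandates: netif_rx_schedule() becomes napi_schedule() and __netif_rx_complete() becomes __napi_complete(). For context, the canonical pairing in a NAPI driver looks like the sketch below; struct example_priv and the hardware helpers are illustrative stand-ins, not code from this patch:

#include <linux/interrupt.h>
#include <linux/netdevice.h>

struct example_priv {
	struct napi_struct napi;
	/* ... device state ... */
};

/* hypothetical hardware helpers */
static void example_irqs_off(struct example_priv *p) { }
static void example_irqs_on(struct example_priv *p) { }
static int example_rx(struct example_priv *p, int budget) { return 0; }

static irqreturn_t example_isr(int irq, void *dev_id)
{
	struct example_priv *p = dev_id;

	/* napi_schedule_prep() guarantees we schedule only once */
	if (napi_schedule_prep(&p->napi)) {
		example_irqs_off(p);
		__napi_schedule(&p->napi);
	}
	return IRQ_HANDLED;
}

static int example_poll(struct napi_struct *napi, int budget)
{
	struct example_priv *p = container_of(napi, struct example_priv, napi);
	int done = example_rx(p, budget);

	if (done < budget) {
		/* ring drained: leave polling mode, re-enable interrupts */
		napi_complete(napi);
		example_irqs_on(p);
	}
	return done;
}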
*/ - dev->features &= (1 << NETIF_F_GSO_SHIFT) - 1; + dev->features &= ~NETIF_F_GSO_MASK; dev->features |= NETIF_F_GSO_ROBUST; } #elif defined(NETIF_F_TSO) @@ -635,7 +635,7 @@ static int network_open(struct net_devic if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx)){ netfront_accelerator_call_stop_napi_irq(np, dev); - netif_rx_schedule(&np->napi); + napi_schedule(&np->napi); } } spin_unlock_bh(&np->rx_lock); @@ -707,7 +707,7 @@ static void rx_refill_timeout(unsigned l netfront_accelerator_call_stop_napi_irq(np, dev); - netif_rx_schedule(&np->napi); + napi_schedule(&np->napi); } static void network_alloc_rx_buffers(struct net_device *dev) @@ -1064,7 +1064,7 @@ static irqreturn_t netif_int(int irq, vo if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx)) { netfront_accelerator_call_stop_napi_irq(np, dev); - netif_rx_schedule(&np->napi); + napi_schedule(&np->napi); } } @@ -1521,7 +1521,7 @@ err: } if (!more_to_do && !accel_more_to_do) - __netif_rx_complete(napi); + __napi_complete(napi); local_irq_restore(flags); } --- head-2010-05-25.orig/drivers/xen/sfc_netfront/accel_msg.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/drivers/xen/sfc_netfront/accel_msg.c 2010-03-24 15:25:06.000000000 +0100 @@ -47,7 +47,7 @@ static void vnic_start_interrupts(netfro netfront_accel_disable_net_interrupts(vnic); vnic->irq_enabled = 0; NETFRONT_ACCEL_STATS_OP(vnic->stats.poll_schedule_count++); - netif_rx_schedule(&np->napi); + napi_schedule(&np->napi); } else { /* * Nothing yet, make sure we get interrupts through @@ -532,7 +532,7 @@ irqreturn_t netfront_accel_net_channel_i vnic->stats.event_count_since_irq; vnic->stats.event_count_since_irq = 0; #endif - netif_rx_schedule(&np->napi); + napi_schedule(&np->napi); } else { spin_unlock_irqrestore(&vnic->irq_enabled_lock, flags); --- head-2010-05-25.orig/drivers/xen/usbback/usbstub.c 2010-03-24 15:06:12.000000000 +0100 +++ head-2010-05-25/drivers/xen/usbback/usbstub.c 2010-03-24 15:25:06.000000000 +0100 @@ -188,7 +188,7 @@ static int usbstub_probe(struct usb_inte const struct usb_device_id *id) { struct usb_device *udev = interface_to_usbdev(intf); - char *busid = intf->dev.parent->bus_id; + const char *busid = dev_name(intf->dev.parent); struct vusb_port_id *portid = NULL; struct usbstub *stub = NULL; usbif_t *usbif = NULL; --- head-2010-05-25.orig/drivers/xen/usbfront/usbfront-dbg.c 2010-03-24 15:10:37.000000000 +0100 +++ head-2010-05-25/drivers/xen/usbfront/usbfront-dbg.c 2010-03-24 15:25:06.000000000 +0100 @@ -64,7 +64,7 @@ static ssize_t show_statistics(struct de "%s\n" "xenhcd, hcd state %d\n", hcd->self.controller->bus->name, - hcd->self.controller->bus_id, + dev_name(hcd->self.controller), hcd->product_desc, hcd->state); size -= temp; --- head-2010-05-25.orig/drivers/xen/usbfront/xenbus.c 2010-04-15 09:53:49.000000000 +0200 +++ head-2010-05-25/drivers/xen/usbfront/xenbus.c 2010-03-24 15:25:06.000000000 +0100 @@ -252,10 +252,10 @@ static struct usb_hcd *create_hcd(struct } switch (usb_ver) { case USB_VER_USB11: - hcd = usb_create_hcd(&xen_usb11_hc_driver, &dev->dev, dev->dev.bus_id); + hcd = usb_create_hcd(&xen_usb11_hc_driver, &dev->dev, dev_name(&dev->dev)); break; case USB_VER_USB20: - hcd = usb_create_hcd(&xen_usb20_hc_driver, &dev->dev, dev->dev.bus_id); + hcd = usb_create_hcd(&xen_usb20_hc_driver, &dev->dev, dev_name(&dev->dev)); break; default: xenbus_dev_fatal(dev, err, "invalid usb-ver"); --- head-2010-05-25.orig/drivers/xen/xenbus/xenbus_probe.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/drivers/xen/xenbus/xenbus_probe.c 2010-03-24 
15:25:06.000000000 +0100 @@ -230,7 +230,7 @@ static struct xen_bus_type xenbus_fronte }, #if defined(CONFIG_XEN) || defined(MODULE) .dev = { - .bus_id = "xen", + .init_name = "xen", }, #endif }; --- head-2010-05-25.orig/drivers/xen/xenbus/xenbus_probe_backend.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/drivers/xen/xenbus/xenbus_probe_backend.c 2010-03-24 15:25:06.000000000 +0100 @@ -129,7 +129,7 @@ static struct xen_bus_type xenbus_backen .dev_attrs = xenbus_backend_attrs, }, .dev = { - .bus_id = "xen-backend", + .init_name = "xen-backend", }, }; --- head-2010-05-25.orig/include/linux/interrupt.h 2010-03-24 14:53:41.000000000 +0100 +++ head-2010-05-25/include/linux/interrupt.h 2010-03-24 15:25:06.000000000 +0100 @@ -52,6 +52,7 @@ * IRQF_ONESHOT - Interrupt is not reenabled after the hardirq handler finished. * Used by threaded interrupts which need to keep the * irq line disabled until the threaded handler has been run. + * IRQF_NO_SUSPEND - Prevent this interrupt from being disabled during suspend. */ #define IRQF_DISABLED 0x00000020 #define IRQF_SAMPLE_RANDOM 0x00000040 @@ -62,6 +63,7 @@ #define IRQF_NOBALANCING 0x00000800 #define IRQF_IRQPOLL 0x00001000 #define IRQF_ONESHOT 0x00002000 +#define IRQF_NO_SUSPEND 0x00008000 /* * Bits used by threaded handlers: --- head-2010-05-25.orig/kernel/irq/manage.c 2010-05-25 09:12:09.000000000 +0200 +++ head-2010-05-25/kernel/irq/manage.c 2010-03-24 15:25:06.000000000 +0100 @@ -200,7 +200,8 @@ static inline int setup_affinity(unsigne void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) { if (suspend) { - if (!desc->action || (desc->action->flags & IRQF_TIMER)) + if (!desc->action || + (desc->action->flags & (IRQF_TIMER | IRQF_NO_SUSPEND))) return; desc->status |= IRQ_SUSPENDED; } --- head-2010-05-25.orig/lib/swiotlb-xen.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-25/lib/swiotlb-xen.c 2010-03-24 15:25:06.000000000 +0100 @@ -175,7 +175,7 @@ static void *swiotlb_bus_to_virt(dma_add return phys_to_virt(swiotlb_bus_to_phys(address)); } -int __weak swiotlb_arch_range_needs_mapping(void *ptr, size_t size) +int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size) { return 0; } @@ -523,13 +523,13 @@ swiotlb_full(struct device *dev, size_t * Once the device is given the dma address, the device owns this memory until * either swiotlb_unmap_single or swiotlb_dma_sync_single is performed. */ -static dma_addr_t -_swiotlb_map_single(struct device *hwdev, phys_addr_t paddr, size_t size, - int dir, struct dma_attrs *attrs) -{ - struct page *page = pfn_to_page(paddr >> PAGE_SHIFT); - dma_addr_t dev_addr = gnttab_dma_map_page(page) + - offset_in_page(paddr); +dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + phys_addr_t phys = page_to_pseudophys(page) + offset; + dma_addr_t dev_addr = gnttab_dma_map_page(page) + offset; void *map; BUG_ON(dir == DMA_NONE); @@ -539,44 +539,24 @@ _swiotlb_map_single(struct device *hwdev * we can safely return the device addr and not worry about bounce * buffering it. */ - if (!address_needs_mapping(hwdev, dev_addr, size) && - !range_needs_mapping(paddr, size)) + if (!address_needs_mapping(dev, dev_addr, size) && + !range_needs_mapping(phys, size)) return dev_addr; /* * Oh well, have to allocate and map a bounce buffer. 
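The swiotlb-xen.c rework beginning here folds the swiotlb_map_single{,_attrs,_phys}() family into swiotlb_map_page()/swiotlb_unmap_page(), matching the struct dma_map_ops interface generalized in 2.6.30, as the following hunks show. Assuming that 2.6.30 field layout, the converted entry points slot into an ops table roughly like this (a sketch only; example_swiotlb_ops is illustrative and the real registration lives in the arch dma-mapping code):

#include <linux/dma-mapping.h>
#include <linux/swiotlb.h>

static struct dma_map_ops example_swiotlb_ops = {
	.map_page		= swiotlb_map_page,
	.unmap_page		= swiotlb_unmap_page,
	.map_sg			= swiotlb_map_sg_attrs,
	.unmap_sg		= swiotlb_unmap_sg_attrs,
	.sync_single_for_cpu	= swiotlb_sync_single_for_cpu,
	.sync_single_for_device	= swiotlb_sync_single_for_device,
	.sync_sg_for_cpu	= swiotlb_sync_sg_for_cpu,
	.sync_sg_for_device	= swiotlb_sync_sg_for_device,
};

This is also why the int dir parameters below all become enum dma_data_direction: the ops table's prototypes require it.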
*/ gnttab_dma_unmap_page(dev_addr); - map = map_single(hwdev, paddr, size, dir); + map = map_single(dev, phys, size, dir); if (!map) { - swiotlb_full(hwdev, size, dir, 1); + swiotlb_full(dev, size, dir, 1); map = io_tlb_overflow_buffer; } - dev_addr = swiotlb_virt_to_bus(hwdev, map); + dev_addr = swiotlb_virt_to_bus(dev, map); return dev_addr; } - -dma_addr_t -swiotlb_map_single_attrs(struct device *hwdev, void *ptr, size_t size, - int dir, struct dma_attrs *attrs) -{ - return _swiotlb_map_single(hwdev, virt_to_phys(ptr), size, dir, attrs); -} -EXPORT_SYMBOL(swiotlb_map_single_attrs); - -dma_addr_t -swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir) -{ - return _swiotlb_map_single(hwdev, virt_to_phys(ptr), size, dir, NULL); -} -EXPORT_SYMBOL(swiotlb_map_single); - -dma_addr_t -swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size, int dir) -{ - return _swiotlb_map_single(hwdev, paddr, size, dir, NULL); -} +EXPORT_SYMBOL_GPL(swiotlb_map_page); /* * Unmap a single streaming mode DMA translation. The dma_addr and size must @@ -586,9 +566,9 @@ swiotlb_map_single_phys(struct device *h * After this call, reads by the cpu to the buffer are guaranteed to see * whatever the device wrote there. */ -void -swiotlb_unmap_single_attrs(struct device *hwdev, dma_addr_t dev_addr, - size_t size, int dir, struct dma_attrs *attrs) +void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) { char *dma_addr = swiotlb_bus_to_virt(dev_addr); @@ -598,15 +578,7 @@ swiotlb_unmap_single_attrs(struct device else gnttab_dma_unmap_page(dev_addr); } -EXPORT_SYMBOL(swiotlb_unmap_single_attrs); - -void -swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size, - int dir) -{ - return swiotlb_unmap_single_attrs(hwdev, dev_addr, size, dir, NULL); -} -EXPORT_SYMBOL(swiotlb_unmap_single); +EXPORT_SYMBOL_GPL(swiotlb_unmap_page); /* * Make physical memory consistent for a single streaming mode DMA translation @@ -620,7 +592,7 @@ EXPORT_SYMBOL(swiotlb_unmap_single); */ void swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr, - size_t size, int dir) + size_t size, enum dma_data_direction dir) { char *dma_addr = swiotlb_bus_to_virt(dev_addr); @@ -632,7 +604,7 @@ EXPORT_SYMBOL(swiotlb_sync_single_for_cp void swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr, - size_t size, int dir) + size_t size, enum dma_data_direction dir) { char *dma_addr = swiotlb_bus_to_virt(dev_addr); @@ -644,7 +616,8 @@ EXPORT_SYMBOL(swiotlb_sync_single_for_de void swiotlb_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dev_addr, - unsigned long offset, size_t size, int dir) + unsigned long offset, size_t size, + enum dma_data_direction dir) { char *dma_addr = swiotlb_bus_to_virt(dev_addr); @@ -656,7 +629,8 @@ EXPORT_SYMBOL_GPL(swiotlb_sync_single_ra void swiotlb_sync_single_range_for_device(struct device *hwdev, dma_addr_t dev_addr, - unsigned long offset, size_t size, int dir) + unsigned long offset, size_t size, + enum dma_data_direction dir) { char *dma_addr = swiotlb_bus_to_virt(dev_addr); @@ -684,7 +658,7 @@ EXPORT_SYMBOL_GPL(swiotlb_sync_single_ra */ int swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, - int dir, struct dma_attrs *attrs) + enum dma_data_direction dir, struct dma_attrs *attrs) { struct scatterlist *sg; int i; @@ -736,7 +710,7 @@ EXPORT_SYMBOL(swiotlb_map_sg); */ void swiotlb_unmap_sg_attrs(struct device *hwdev, 
 					struct scatterlist *sgl,
-			int nelems, int dir, struct dma_attrs *attrs)
+			int nelems, enum dma_data_direction dir, struct dma_attrs *attrs)
 {
 	struct scatterlist *sg;
 	int i;
@@ -770,7 +744,7 @@ EXPORT_SYMBOL(swiotlb_unmap_sg);
  */
 void
 swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sgl,
-			int nelems, int dir)
+			int nelems, enum dma_data_direction dir)
 {
 	struct scatterlist *sg;
 	int i;
@@ -787,7 +761,7 @@ EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu);
 
 void
 swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sgl,
-			   int nelems, int dir)
+			   int nelems, enum dma_data_direction dir)
 {
 	struct scatterlist *sg;
 	int i;
--- head-2010-05-25.orig/mm/page_alloc.c	2010-03-24 15:06:12.000000000 +0100
+++ head-2010-05-25/mm/page_alloc.c	2010-03-24 15:25:06.000000000 +0100
@@ -4685,11 +4685,9 @@ static void __setup_per_zone_wmarks(void
 	}
 
 #ifdef CONFIG_XEN
-	for_each_zone(zone) {
+	for_each_populated_zone(zone) {
 		unsigned int cpu;
 
-		if (!populated_zone(zone))
-			continue;
 		for_each_online_cpu(cpu) {
 			unsigned long high;
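Finally, the page_alloc.c hunk is a straight conversion to for_each_populated_zone(), new in this kernel, which folds the populated_zone() test into the iterator itself. Anywhere the old open-coded pattern appears it reduces the same way (example_present_pages() is an illustrative helper, not from the patch):

#include <linux/mmzone.h>

static unsigned long example_present_pages(void)
{
	struct zone *zone;
	unsigned long total = 0;

	/* skips every zone for which populated_zone(zone) is false */
	for_each_populated_zone(zone)
		total += zone->present_pages;

	return total;
}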