qubes-linux-kernel/patches.xen/xen3-patch-2.6.30


From: Linux Kernel Mailing List <linux-kernel@vger.kernel.org>
Subject: Linux 2.6.30
Patch-mainline: 2.6.30

This patch contains the differences between 2.6.29 and 2.6.30.

Acked-by: Jeff Mahoney <jeffm@suse.com>
Automatically created from "patches.kernel.org/patch-2.6.30" by xen-port-patches.py
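Like the rest of the patches.xen series, the hunks below carry a one-component path prefix (head-2011-03-17/), so the patch is meant to be applied with -p1 from the top of a kernel tree that already has the earlier patches in the series in place; a typical invocation (the /path/to/ location is a placeholder) is:

    patch -p1 < /path/to/patches.xen/xen3-patch-2.6.30

or quilt push when the whole series is driven from a series file.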
--- head-2011-03-17.orig/arch/ia64/include/asm/xen/hypervisor.h 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/ia64/include/asm/xen/hypervisor.h 2011-02-01 14:44:12.000000000 +0100
@@ -34,13 +34,13 @@
#define _ASM_IA64_XEN_HYPERVISOR_H
#include <linux/err.h>
+#ifdef CONFIG_PARAVIRT_XEN
#include <xen/interface/xen.h>
#include <xen/interface/version.h> /* to compile feature.c */
#include <xen/features.h> /* to comiple xen-netfront.c */
#include <xen/xen.h>
#include <asm/xen/hypercall.h>
-#ifdef CONFIG_PARAVIRT_XEN
extern struct shared_info *HYPERVISOR_shared_info;
extern struct start_info *xen_start_info;
--- head-2011-03-17.orig/arch/ia64/kernel/vmlinux.lds.S 2011-03-17 14:35:45.000000000 +0100
+++ head-2011-03-17/arch/ia64/kernel/vmlinux.lds.S 2011-02-01 14:44:12.000000000 +0100
@@ -183,7 +183,7 @@ SECTIONS {
__start_gate_section = .;
*(.data..gate)
__stop_gate_section = .;
-#ifdef CONFIG_XEN
+#ifdef CONFIG_PARAVIRT_XEN
. = ALIGN(PAGE_SIZE);
__xen_start_gate_section = .;
*(.data..gate.xen)
--- head-2011-03-17.orig/arch/x86/Kconfig 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/Kconfig 2011-02-01 14:44:12.000000000 +0100
@@ -49,8 +49,8 @@ config X86
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_DMA_API_DEBUG
select HAVE_KERNEL_GZIP
- select HAVE_KERNEL_BZIP2
- select HAVE_KERNEL_LZMA
+ select HAVE_KERNEL_BZIP2 if !XEN
+ select HAVE_KERNEL_LZMA if !XEN
select HAVE_KERNEL_XZ
select HAVE_KERNEL_LZO
select HAVE_HW_BREAKPOINT
@@ -317,11 +317,11 @@ config X86_XEN
config X86_BIGSMP
bool "Support for big SMP systems with more than 8 CPUs"
- depends on X86_32 && SMP
+ depends on X86_32 && SMP && !XEN
---help---
This option is needed for the systems that have more than 8 CPUs
-if X86_32
+if X86_32 && !XEN
config X86_EXTENDED_PLATFORM
bool "Support for extended (non-PC) x86 platforms"
default y
@@ -351,7 +351,7 @@ config X86_64_XEN
help
This option will compile a kernel compatible with Xen hypervisor
-if X86_64
+if X86_64 && !XEN
config X86_EXTENDED_PLATFORM
bool "Support for extended (non-PC) x86 platforms"
default y
@@ -769,7 +769,7 @@ config MAXSMP
config NR_CPUS
int "Maximum number of CPUs" if SMP && !MAXSMP
- range 2 8 if SMP && X86_32 && !X86_BIGSMP
+ range 2 8 if SMP && X86_32 && !X86_BIGSMP && !X86_XEN
range 2 512 if SMP && !MAXSMP
default "1" if !SMP
default "4096" if MAXSMP
@@ -854,10 +854,6 @@ config X86_VISWS_APIC
def_bool y
depends on X86_32 && X86_VISWS
-config X86_XEN_GENAPIC
- def_bool y
- depends on X86_64_XEN
-
config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
bool "Reroute for broken boot IRQs"
depends on X86_IO_APIC && !XEN
--- head-2011-03-17.orig/arch/x86/Makefile 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/Makefile 2011-02-01 14:44:12.000000000 +0100
@@ -116,10 +116,6 @@ endif
# prevent gcc from generating any FP code by mistake
KBUILD_CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
-# Xen subarch support
-mflags-$(CONFIG_XEN) := -Iarch/x86/include/mach-xen
-mcore-$(CONFIG_XEN) := arch/x86/mach-xen/
-
KBUILD_CFLAGS += $(mflags-y)
KBUILD_AFLAGS += $(mflags-y)
@@ -189,10 +185,10 @@ endif
$(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
$(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
$(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@
+endif
$(BOOT_TARGETS): vmlinux
$(Q)$(MAKE) $(build)=$(boot) $@
-endif
PHONY += install
install:
--- head-2011-03-17.orig/arch/x86/boot/Makefile 2011-01-31 14:53:50.000000000 +0100
+++ head-2011-03-17/arch/x86/boot/Makefile 2011-02-01 14:44:12.000000000 +0100
@@ -204,6 +204,12 @@ $(obj)/vmlinux-stripped: OBJCOPYFLAGS :=
$(obj)/vmlinux-stripped: vmlinux FORCE
$(call if_changed,objcopy)
+ifndef CONFIG_XEN
+bzImage := bzImage
+else
+bzImage := vmlinuz
+endif
+
install:
- sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/bzImage \
+ sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/$(bzImage) \
System.map "$(INSTALL_PATH)"
--- head-2011-03-17.orig/arch/x86/ia32/ia32entry-xen.S 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/ia32/ia32entry-xen.S 2011-02-01 14:44:12.000000000 +0100
@@ -502,7 +502,7 @@ ia32_sys_call_table:
.quad sys32_olduname
.quad sys_umask /* 60 */
.quad sys_chroot
- .quad sys32_ustat
+ .quad compat_sys_ustat
.quad sys_dup2
.quad sys_getppid
.quad sys_getpgrp /* 65 */
@@ -773,4 +773,6 @@ ia32_sys_call_table:
.quad sys_dup3 /* 330 */
.quad sys_pipe2
.quad sys_inotify_init1
+ .quad compat_sys_preadv
+ .quad compat_sys_pwritev
ia32_syscall_end:
--- head-2011-03-17.orig/arch/x86/include/asm/kexec.h 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/include/asm/kexec.h 2011-02-01 14:44:12.000000000 +0100
@@ -21,8 +21,14 @@
# define PA_CONTROL_PAGE 0
# define VA_CONTROL_PAGE 1
# define PA_TABLE_PAGE 2
+# ifndef CONFIG_XEN
# define PA_SWAP_PAGE 3
# define PAGES_NR 4
+# else /* CONFIG_XEN, see comment above
+# define VA_TABLE_PAGE 3 */
+# define PA_SWAP_PAGE 4
+# define PAGES_NR 5
+# endif /* CONFIG_XEN */
#endif
# define KEXEC_CONTROL_CODE_MAX_SIZE 2048
--- head-2011-03-17.orig/arch/x86/include/asm/page_64_types.h 2011-03-17 14:35:45.000000000 +0100
+++ head-2011-03-17/arch/x86/include/asm/page_64_types.h 2011-02-01 14:44:12.000000000 +0100
@@ -69,7 +69,15 @@ extern void init_extra_mapping_wb(unsign
#endif /* !__ASSEMBLY__ */
#ifdef CONFIG_FLATMEM
+/*
+ * While max_pfn is not exported, max_mapnr never gets initialized for non-Xen
+ * other than for hotplugged memory.
+ */
+#ifndef CONFIG_XEN
#define pfn_valid(pfn) ((pfn) < max_pfn)
+#else
+#define pfn_valid(pfn) ((pfn) < max_mapnr)
+#endif
#endif
#endif /* _ASM_X86_PAGE_64_DEFS_H */
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/desc.h 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/desc.h 2011-02-01 14:44:12.000000000 +0100
@@ -39,7 +39,7 @@ extern gate_desc idt_table[];
struct gdt_page {
struct desc_struct gdt[GDT_ENTRIES];
} __attribute__((aligned(PAGE_SIZE)));
-DECLARE_PER_CPU(struct gdt_page, gdt_page);
+DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
{
@@ -91,7 +91,6 @@ static inline int desc_empty(const void
#define store_gdt(dtr) native_store_gdt(dtr)
#define store_idt(dtr) native_store_idt(dtr)
#define store_tr(tr) (tr = native_store_tr())
-#define store_ldt(ldt) asm("sldt %0":"=m" (ldt))
#define load_TLS(t, cpu) native_load_tls(t, cpu)
#define set_ldt native_set_ldt
@@ -111,6 +110,8 @@ static inline void paravirt_free_ldt(str
{
}
+#define store_ldt(ldt) asm("sldt %0" : "=m"(ldt))
+
static inline void native_write_idt_entry(gate_desc *idt, int entry,
const gate_desc *gate)
{
@@ -251,6 +252,8 @@ static inline void native_load_tls(struc
gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
}
#else
+#include <asm/pgtable.h>
+
#define load_TLS(t, cpu) xen_load_tls(t, cpu)
#define set_ldt xen_set_ldt
@@ -265,8 +268,9 @@ static inline void xen_load_tls(struct t
struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN;
for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
- if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
- *(u64 *)&t->tls_array[i]))
+ if (HYPERVISOR_update_descriptor(
+ arbitrary_virt_to_machine(&gdt[i]),
+ *(u64 *)&t->tls_array[i]))
BUG();
}
#endif
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/fixmap.h 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/fixmap.h 2011-02-01 14:44:12.000000000 +0100
@@ -1,11 +1,154 @@
+/*
+ * fixmap.h: compile-time virtual memory allocation
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1998 Ingo Molnar
+ *
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ * x86_32 and x86_64 integration by Gustavo F. Padovan, February 2009
+ */
+
#ifndef _ASM_X86_FIXMAP_H
#define _ASM_X86_FIXMAP_H
+#ifndef __ASSEMBLY__
+#include <linux/kernel.h>
+#include <asm/acpi.h>
+#include <asm/apicdef.h>
+#include <asm/page.h>
+#ifdef CONFIG_X86_32
+#include <linux/threads.h>
+#include <asm/kmap_types.h>
+#else
+#include <asm/vsyscall.h>
+#endif
+
+/*
+ * We can't declare FIXADDR_TOP as variable for x86_64 because vsyscall
+ * uses fixmaps that relies on FIXADDR_TOP for proper address calculation.
+ * Because of this, FIXADDR_TOP x86 integration was left as later work.
+ */
+#ifdef CONFIG_X86_32
+/* used by vmalloc.c, vsyscall.lds.S.
+ *
+ * Leave one empty page between vmalloc'ed areas and
+ * the start of the fixmap.
+ */
+extern unsigned long __FIXADDR_TOP;
+#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
+
+#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO)
+#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1)
+#else
+#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
+
+/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */
+#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
+#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
+#endif
+
+
+/*
+ * Here we define all the compile-time 'special' virtual
+ * addresses. The point is to have a constant address at
+ * compile time, but to set the physical address only
+ * in the boot process.
+ * for x86_32: We allocate these special addresses
+ * from the end of virtual memory (0xfffff000) backwards.
+ * Also this lets us do fail-safe vmalloc(), we
+ * can guarantee that these special addresses and
+ * vmalloc()-ed addresses never overlap.
+ *
+ * These 'compile-time allocated' memory buffers are
+ * fixed-size 4k pages (or larger if used with an increment
+ * higher than 1). Use set_fixmap(idx,phys) to associate
+ * physical memory with fixmap indices.
+ *
+ * TLB entries of such buffers will not be flushed across
+ * task switches.
+ */
+enum fixed_addresses {
#ifdef CONFIG_X86_32
-# include "fixmap_32.h"
+ FIX_HOLE,
+ FIX_VDSO,
#else
-# include "fixmap_64.h"
+ VSYSCALL_LAST_PAGE,
+ VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
+ + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
+ VSYSCALL_HPET,
+#endif
+ FIX_DBGP_BASE,
+ FIX_EARLYCON_MEM_BASE,
+#ifdef CONFIG_X86_LOCAL_APIC
+ FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
#endif
+#ifndef CONFIG_XEN
+#ifdef CONFIG_X86_IO_APIC
+ FIX_IO_APIC_BASE_0,
+ FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
+#endif
+#else
+ FIX_SHARED_INFO,
+#define NR_FIX_ISAMAPS 256
+ FIX_ISAMAP_END,
+ FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
+#endif
+#ifdef CONFIG_X86_VISWS_APIC
+ FIX_CO_CPU, /* Cobalt timer */
+ FIX_CO_APIC, /* Cobalt APIC Redirection Table */
+ FIX_LI_PCIA, /* Lithium PCI Bridge A */
+ FIX_LI_PCIB, /* Lithium PCI Bridge B */
+#endif
+#ifdef CONFIG_X86_F00F_BUG
+ FIX_F00F_IDT, /* Virtual mapping for IDT */
+#endif
+#ifdef CONFIG_X86_CYCLONE_TIMER
+ FIX_CYCLONE_TIMER, /*cyclone timer register*/
+#endif
+#ifdef CONFIG_X86_32
+ FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
+ FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
+#ifdef CONFIG_PCI_MMCONFIG
+ FIX_PCIE_MCFG,
+#endif
+#endif
+#ifdef CONFIG_PARAVIRT
+ FIX_PARAVIRT_BOOTMAP,
+#endif
+ FIX_TEXT_POKE0, /* reserve 2 pages for text_poke() */
+ FIX_TEXT_POKE1,
+ __end_of_permanent_fixed_addresses,
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+ FIX_OHCI1394_BASE,
+#endif
+ /*
+ * 256 temporary boot-time mappings, used by early_ioremap(),
+ * before ioremap() is functional.
+ *
+ * We round it up to the next 256 pages boundary so that we
+ * can have a single pgd entry and a single pte table:
+ */
+#define NR_FIX_BTMAPS 64
+#define FIX_BTMAPS_SLOTS 4
+ FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
+ (__end_of_permanent_fixed_addresses & 255),
+ FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
+#ifdef CONFIG_X86_32
+ FIX_WP_TEST,
+#endif
+ __end_of_fixed_addresses
+};
+
+
+extern void reserve_top_address(unsigned long reserve);
+
+#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
+#define FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
+#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
+#define FIXADDR_BOOT_START (FIXADDR_TOP - FIXADDR_BOOT_SIZE)
extern int fixmaps_set;
@@ -13,10 +156,10 @@ extern pte_t *kmap_pte;
extern pgprot_t kmap_prot;
extern pte_t *pkmap_page_table;
-void xen_set_fixmap(enum fixed_addresses, maddr_t, pgprot_t);
+void xen_set_fixmap(enum fixed_addresses, phys_addr_t, pgprot_t);
static inline void __set_fixmap(enum fixed_addresses idx,
- maddr_t phys, pgprot_t flags)
+ phys_addr_t phys, pgprot_t flags)
{
xen_set_fixmap(idx, phys, flags);
}
@@ -65,4 +208,5 @@ static inline unsigned long virt_to_fix(
BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
return __virt_to_fix(vaddr);
}
+#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_FIXMAP_H */
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/fixmap_32.h 2011-02-01 14:39:24.000000000 +0100
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,125 +0,0 @@
-/*
- * fixmap.h: compile-time virtual memory allocation
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License. See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Copyright (C) 1998 Ingo Molnar
- *
- * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
- */
-
-#ifndef _ASM_X86_FIXMAP_32_H
-#define _ASM_X86_FIXMAP_32_H
-
-/* used by vmalloc.c, vsyscall.lds.S.
- *
- * Leave one empty page between vmalloc'ed areas and
- * the start of the fixmap.
- */
-extern unsigned long __FIXADDR_TOP;
-#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO)
-#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1)
-
-#ifndef __ASSEMBLY__
-#include <linux/kernel.h>
-#include <asm/acpi.h>
-#include <asm/apicdef.h>
-#include <asm/page.h>
-#include <linux/threads.h>
-#include <asm/kmap_types.h>
-
-/*
- * Here we define all the compile-time 'special' virtual
- * addresses. The point is to have a constant address at
- * compile time, but to set the physical address only
- * in the boot process. We allocate these special addresses
- * from the end of virtual memory (0xfffff000) backwards.
- * Also this lets us do fail-safe vmalloc(), we
- * can guarantee that these special addresses and
- * vmalloc()-ed addresses never overlap.
- *
- * these 'compile-time allocated' memory buffers are
- * fixed-size 4k pages. (or larger if used with an increment
- * highger than 1) use fixmap_set(idx,phys) to associate
- * physical memory with fixmap indices.
- *
- * TLB entries of such buffers will not be flushed across
- * task switches.
- */
-enum fixed_addresses {
- FIX_HOLE,
- FIX_VDSO,
- FIX_DBGP_BASE,
- FIX_EARLYCON_MEM_BASE,
-#ifdef CONFIG_X86_LOCAL_APIC
- FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
-#endif
-#ifndef CONFIG_XEN
-#ifdef CONFIG_X86_IO_APIC
- FIX_IO_APIC_BASE_0,
- FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
-#endif
-#else
- FIX_SHARED_INFO,
-#define NR_FIX_ISAMAPS 256
- FIX_ISAMAP_END,
- FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
-#endif
-#ifdef CONFIG_X86_VISWS_APIC
- FIX_CO_CPU, /* Cobalt timer */
- FIX_CO_APIC, /* Cobalt APIC Redirection Table */
- FIX_LI_PCIA, /* Lithium PCI Bridge A */
- FIX_LI_PCIB, /* Lithium PCI Bridge B */
-#endif
-#ifdef CONFIG_X86_F00F_BUG
- FIX_F00F_IDT, /* Virtual mapping for IDT */
-#endif
-#ifdef CONFIG_X86_CYCLONE_TIMER
- FIX_CYCLONE_TIMER, /*cyclone timer register*/
-#endif
- FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
- FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
-#ifdef CONFIG_PCI_MMCONFIG
- FIX_PCIE_MCFG,
-#endif
-#ifdef CONFIG_PARAVIRT
- FIX_PARAVIRT_BOOTMAP,
-#endif
- __end_of_permanent_fixed_addresses,
- /*
- * 256 temporary boot-time mappings, used by early_ioremap(),
- * before ioremap() is functional.
- *
- * We round it up to the next 256 pages boundary so that we
- * can have a single pgd entry and a single pte table:
- */
-#define NR_FIX_BTMAPS 64
-#define FIX_BTMAPS_SLOTS 4
- FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
- (__end_of_permanent_fixed_addresses & 255),
- FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
- FIX_WP_TEST,
-#ifdef CONFIG_ACPI
- FIX_ACPI_BEGIN,
- FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
-#endif
-#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
- FIX_OHCI1394_BASE,
-#endif
- __end_of_fixed_addresses
-};
-
-extern void reserve_top_address(unsigned long reserve);
-
-
-#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
-
-#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
-#define __FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
-#define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE)
-#define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE)
-
-#endif /* !__ASSEMBLY__ */
-#endif /* _ASM_X86_FIXMAP_32_H */
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/fixmap_64.h 2011-02-01 14:42:26.000000000 +0100
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,90 +0,0 @@
-/*
- * fixmap.h: compile-time virtual memory allocation
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License. See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Copyright (C) 1998 Ingo Molnar
- */
-
-#ifndef _ASM_X86_FIXMAP_64_H
-#define _ASM_X86_FIXMAP_64_H
-
-#include <linux/kernel.h>
-#include <asm/acpi.h>
-#include <asm/apicdef.h>
-#include <asm/page.h>
-#include <asm/vsyscall.h>
-#include <asm/acpi.h>
-
-/*
- * Here we define all the compile-time 'special' virtual
- * addresses. The point is to have a constant address at
- * compile time, but to set the physical address only
- * in the boot process.
- *
- * These 'compile-time allocated' memory buffers are
- * fixed-size 4k pages (or larger if used with an increment
- * higher than 1). Use set_fixmap(idx,phys) to associate
- * physical memory with fixmap indices.
- *
- * TLB entries of such buffers will not be flushed across
- * task switches.
- */
-
-enum fixed_addresses {
- VSYSCALL_LAST_PAGE,
- VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
- + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
- VSYSCALL_HPET,
- FIX_DBGP_BASE,
- FIX_EARLYCON_MEM_BASE,
-#ifdef CONFIG_X86_LOCAL_APIC
- FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
-#endif
-#ifndef CONFIG_XEN
- FIX_IO_APIC_BASE_0,
- FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
-#else
-#define NR_FIX_ISAMAPS 256
- FIX_ISAMAP_END,
- FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
-#endif
-#ifdef CONFIG_PARAVIRT
- FIX_PARAVIRT_BOOTMAP,
-#else
- FIX_SHARED_INFO,
-#endif
- __end_of_permanent_fixed_addresses,
-#ifdef CONFIG_ACPI
- FIX_ACPI_BEGIN,
- FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
-#endif
-#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
- FIX_OHCI1394_BASE,
-#endif
- /*
- * 256 temporary boot-time mappings, used by early_ioremap(),
- * before ioremap() is functional.
- *
- * We round it up to the next 256 pages boundary so that we
- * can have a single pgd entry and a single pte table:
- */
-#define NR_FIX_BTMAPS 64
-#define FIX_BTMAPS_SLOTS 4
- FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
- (__end_of_permanent_fixed_addresses & 255),
- FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
- __end_of_fixed_addresses
-};
-
-#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
-#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
-#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
-
-/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */
-#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
-#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
-
-#endif /* _ASM_X86_FIXMAP_64_H */
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/highmem.h 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/highmem.h 2011-02-01 14:44:12.000000000 +0100
@@ -62,6 +62,7 @@ void *kmap_atomic_prot(struct page *page
void *kmap_atomic(struct page *page, enum km_type type);
void kunmap_atomic(void *kvaddr, enum km_type type);
void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
+void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
struct page *kmap_atomic_to_page(void *ptr);
#define kmap_atomic_pte(page, type) \
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/hypervisor.h 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/hypervisor.h 2011-02-01 14:44:12.000000000 +0100
@@ -43,10 +43,11 @@
#include <xen/interface/sched.h>
#include <xen/interface/nmi.h>
#include <xen/interface/tmem.h>
+#include <xen/interface/vcpu.h>
#include <xen/interface/arch-x86/xen-mca.h>
#include <asm/percpu.h>
#include <asm/ptrace.h>
-#include <asm/page.h>
+#include <asm/pgtable_types.h>
extern shared_info_t *HYPERVISOR_shared_info;
@@ -71,7 +72,9 @@ extern start_info_t *xen_start_info;
#define init_hypervisor(c) ((void)((c)->x86_hyper_vendor = X86_HYPER_VENDOR_XEN))
+DECLARE_PER_CPU(struct vcpu_runstate_info, runstate);
struct vcpu_runstate_info *setup_runstate_area(unsigned int cpu);
+#define vcpu_running(cpu) (per_cpu(runstate.state, cpu) == RUNSTATE_running)
/* arch/xen/kernel/evtchn.c */
/* Force a proper event-channel callback from Xen. */
@@ -153,20 +156,16 @@ int __must_check xen_multi_mmuext_op(str
#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
static inline void arch_enter_lazy_mmu_mode(void)
{
- __get_cpu_var(xen_lazy_mmu) = true;
+ percpu_write(xen_lazy_mmu, true);
}
static inline void arch_leave_lazy_mmu_mode(void)
{
- __get_cpu_var(xen_lazy_mmu) = false;
+ percpu_write(xen_lazy_mmu, false);
xen_multicall_flush(false);
}
-#if defined(CONFIG_X86_32)
-#define arch_use_lazy_mmu_mode() unlikely(x86_read_percpu(xen_lazy_mmu))
-#elif !defined(arch_use_lazy_mmu_mode)
-#define arch_use_lazy_mmu_mode() unlikely(__get_cpu_var(xen_lazy_mmu))
-#endif
+#define arch_use_lazy_mmu_mode() unlikely(percpu_read(xen_lazy_mmu))
#if 0 /* All uses are in places potentially called asynchronously, but
* asynchronous code should rather not make use of lazy mode at all.
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/io.h 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/io.h 2011-02-01 14:44:12.000000000 +0100
@@ -5,6 +5,10 @@
#include <linux/compiler.h>
#include <asm-generic/int-ll64.h>
+#include <asm/page.h>
+#ifdef __KERNEL__
+#include <asm/fixmap.h>
+#endif
#define build_mmio_read(name, size, type, reg, barrier) \
static inline type name(const volatile void __iomem *addr) \
@@ -82,6 +86,101 @@ static inline void writeq(__u64 val, vol
#define native_io_delay xen_io_delay
+/**
+ * virt_to_phys - map virtual addresses to physical
+ * @address: address to remap
+ *
+ * The returned physical address is the physical (CPU) mapping for
+ * the memory address given. It is only valid to use this function on
+ * addresses directly mapped or allocated via kmalloc.
+ *
+ * This function does not give bus mappings for DMA transfers. In
+ * almost all conceivable cases a device driver should not be using
+ * this function
+ */
+
+static inline phys_addr_t virt_to_phys(volatile void *address)
+{
+ return __pa(address);
+}
+
+/**
+ * phys_to_virt - map physical address to virtual
+ * @address: address to remap
+ *
+ * The returned virtual address is a current CPU mapping for
+ * the memory address given. It is only valid to use this function on
+ * addresses that have a kernel mapping
+ *
+ * This function does not handle bus mappings for DMA transfers. In
+ * almost all conceivable cases a device driver should not be using
+ * this function
+ */
+
+static inline void *phys_to_virt(phys_addr_t address)
+{
+ return __va(address);
+}
+
+/*
+ * Change "struct page" to physical address.
+ */
+#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
+#undef page_to_phys
+#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page)))
+#define page_to_bus(page) (phys_to_machine(page_to_pseudophys(page)))
+
+/*
+ * ISA I/O bus memory addresses are 1:1 with the physical address.
+ * However, we truncate the address to unsigned int to avoid undesirable
+ * promotions in legacy drivers.
+ */
+#define isa_virt_to_bus(_x) ({ \
+ unsigned long _va_ = (unsigned long)(_x); \
+ _va_ - fix_to_virt(FIX_ISAMAP_BEGIN) < (NR_FIX_ISAMAPS << PAGE_SHIFT) \
+ ? _va_ - fix_to_virt(FIX_ISAMAP_BEGIN) \
+ : ({ BUG(); (unsigned long)virt_to_bus(_va_); }); })
+#define isa_bus_to_virt(_x) ((void *)fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
+
+/*
+ * However PCI ones are not necessarily 1:1 and therefore these interfaces
+ * are forbidden in portable PCI drivers.
+ *
+ * Allow them on x86 for legacy drivers, though.
+ */
+#define virt_to_bus(_x) phys_to_machine(__pa(_x))
+#define bus_to_virt(_x) __va(machine_to_phys(_x))
+
+/**
+ * ioremap - map bus memory into CPU space
+ * @offset: bus address of the memory
+ * @size: size of the resource to map
+ *
+ * ioremap performs a platform specific sequence of operations to
+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
+ * writew/writel functions and the other mmio helpers. The returned
+ * address is not guaranteed to be usable directly as a virtual
+ * address.
+ *
+ * If the area you are trying to map is a PCI BAR you should have a
+ * look at pci_iomap().
+ */
+extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
+extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
+extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
+ unsigned long prot_val);
+
+/*
+ * The default ioremap() behavior is non-cached:
+ */
+static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
+{
+ return ioremap_nocache(offset, size);
+}
+
+extern void iounmap(volatile void __iomem *addr);
+
+
#ifdef CONFIG_X86_32
# include "../../asm/io_32.h"
#else
@@ -93,11 +192,6 @@ static inline void writeq(__u64 val, vol
/* We will be supplying our own /dev/mem implementation */
#define ARCH_HAS_DEV_MEM
-#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
-#undef page_to_phys
-#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page)))
-#define page_to_bus(page) (phys_to_machine(page_to_pseudophys(page)))
-
#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \
(unsigned long)(bv)->bv_offset)
@@ -106,23 +200,7 @@ static inline void writeq(__u64 val, vol
&& bvec_to_pseudophys(vec1) + (vec1)->bv_len \
== bvec_to_pseudophys(vec2))
-#undef virt_to_bus
-#undef bus_to_virt
-#define virt_to_bus(_x) phys_to_machine(__pa(_x))
-#define bus_to_virt(_x) __va(machine_to_phys(_x))
-
-#include <asm/fixmap.h>
-
#undef __ISA_IO_base
-#undef isa_virt_to_bus
-#undef isa_page_to_bus
-#undef isa_bus_to_virt
-#define isa_virt_to_bus(_x) ({ \
- unsigned long _va_ = (unsigned long)(_x); \
- _va_ - fix_to_virt(FIX_ISAMAP_BEGIN) < (NR_FIX_ISAMAPS << PAGE_SHIFT) \
- ? _va_ - fix_to_virt(FIX_ISAMAP_BEGIN) \
- : ({ BUG(); (unsigned long)virt_to_bus(_va_); }); })
-#define isa_bus_to_virt(_x) ((void *)fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
#endif
@@ -131,7 +209,7 @@ extern void unxlate_dev_mem_ptr(unsigned
extern int ioremap_check_change_attr(unsigned long mfn, unsigned long size,
unsigned long prot_val);
-extern void __iomem *ioremap_wc(unsigned long offset, unsigned long size);
+extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size);
/*
* early_ioremap() and early_iounmap() are for temporary early boot-time
@@ -140,10 +218,12 @@ extern void __iomem *ioremap_wc(unsigned
*/
extern void early_ioremap_init(void);
extern void early_ioremap_reset(void);
-extern void __iomem *early_ioremap(unsigned long offset, unsigned long size);
-extern void __iomem *early_memremap(unsigned long offset, unsigned long size);
+extern void __iomem *early_ioremap(resource_size_t phys_addr,
+ unsigned long size);
+extern void __iomem *early_memremap(resource_size_t phys_addr,
+ unsigned long size);
extern void early_iounmap(void __iomem *addr, unsigned long size);
-extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
+#define IO_SPACE_LIMIT 0xffff
#endif /* _ASM_X86_IO_H */
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/ipi.h 2011-02-01 14:44:12.000000000 +0100
@@ -0,0 +1,13 @@
+#ifndef _ASM_X86_IPI_H
+#define _ASM_X86_IPI_H
+
+#include <asm/hw_irq.h>
+#include <asm/smp.h>
+
+void xen_send_IPI_mask(const struct cpumask *, int vector);
+void xen_send_IPI_mask_allbutself(const struct cpumask *, int vector);
+void xen_send_IPI_allbutself(int vector);
+void xen_send_IPI_all(int vector);
+void xen_send_IPI_self(int vector);
+
+#endif /* _ASM_X86_IPI_H */
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/irqflags.h 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/irqflags.h 2011-02-01 14:44:12.000000000 +0100
@@ -94,7 +94,7 @@ static inline void halt(void)
#ifdef CONFIG_X86_64
# define __REG_si %rsi
-# define __CPU_num %gs:pda_cpunumber
+# define __CPU_num PER_CPU_VAR(cpu_number)
#else
# define __REG_si %esi
# define __CPU_num TI_cpu(%ebp)
@@ -130,6 +130,7 @@ sysexit_ecrit: /**** END OF SYSEXIT CRIT
mov $__KERNEL_PERCPU, %ecx ; \
push %esp ; \
mov %ecx, %fs ; \
+ SET_KERNEL_GS %ecx ; \
call evtchn_do_upcall ; \
add $4,%esp ; \
jmp ret_from_intr
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/irq_vectors.h 2011-02-15 17:32:00.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/irq_vectors.h 2011-02-15 17:33:07.000000000 +0100
@@ -2,29 +2,46 @@
#define _ASM_X86_IRQ_VECTORS_H
#ifdef CONFIG_X86_32
-# define SYSCALL_VECTOR 0x80
+# define SYSCALL_VECTOR 0x80
#else
-# define IA32_SYSCALL_VECTOR 0x80
+# define IA32_SYSCALL_VECTOR 0x80
#endif
-#define RESCHEDULE_VECTOR 0
-#define CALL_FUNCTION_VECTOR 1
-#define NMI_VECTOR 0x02
-#define CALL_FUNC_SINGLE_VECTOR 3
-#define NR_IPIS 4
+#define RESCHEDULE_VECTOR 0
+#define CALL_FUNCTION_VECTOR 1
+#define NMI_VECTOR 0x02
+#define CALL_FUNC_SINGLE_VECTOR 3
+#define NR_IPIS 4
/*
* The maximum number of vectors supported by i386 processors
* is limited to 256. For processors other than i386, NR_VECTORS
* should be changed accordingly.
*/
-#define NR_VECTORS 256
+#define NR_VECTORS 256
-#define FIRST_VM86_IRQ 3
-#define LAST_VM86_IRQ 15
-#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
+#define FIRST_VM86_IRQ 3
+#define LAST_VM86_IRQ 15
-#define NR_IRQS_LEGACY 16
+#ifndef __ASSEMBLY__
+static inline int invalid_vm86_irq(int irq)
+{
+ return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ;
+}
+#endif
+
+/*
+ * Size the maximum number of interrupts.
+ *
+ * If the irq_desc[] array has a sparse layout, we can size things
+ * generously - it scales up linearly with the maximum number of CPUs,
+ * and the maximum number of IO-APICs, whichever is higher.
+ *
+ * In other cases we size more conservatively, to not create too large
+ * static arrays.
+ */
+
+#define NR_IRQS_LEGACY 16
/*
* The flat IRQ space is divided into two regions:
@@ -35,21 +52,41 @@
* 3. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
* are bound using the provided bind/unbind functions.
*/
+#define PIRQ_BASE 0
+
+#define CPU_VECTOR_LIMIT ( 8 * NR_CPUS )
+#define IO_APIC_VECTOR_LIMIT ( 32 * MAX_IO_APICS )
-#define PIRQ_BASE 0
-#if defined(NR_CPUS) && defined(MAX_IO_APICS)
-# if !defined(CONFIG_SPARSE_IRQ) && NR_CPUS < MAX_IO_APICS
-# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
-# elif defined(CONFIG_SPARSE_IRQ) && 8 * NR_CPUS > 32 * MAX_IO_APICS
-# define NR_PIRQS (NR_VECTORS + 8 * NR_CPUS)
+#if defined(CONFIG_X86_IO_APIC)
+# ifdef CONFIG_SPARSE_IRQ
+# define NR_PIRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT)
# else
-# define NR_PIRQS (NR_VECTORS + 32 * MAX_IO_APICS)
+# define NR_PIRQS \
+ (CPU_VECTOR_LIMIT < IO_APIC_VECTOR_LIMIT ? \
+ (NR_VECTORS + CPU_VECTOR_LIMIT) : \
+ (NR_VECTORS + IO_APIC_VECTOR_LIMIT))
# endif
+#elif defined(CONFIG_XEN_PCIDEV_FRONTEND)
+# define NR_PIRQS (NR_VECTORS + CPU_VECTOR_LIMIT)
+#else /* !CONFIG_X86_IO_APIC: */
+# define NR_PIRQS NR_IRQS_LEGACY
#endif
-#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS)
-#define NR_DYNIRQS 256
+#ifndef __ASSEMBLY__
+#ifdef CONFIG_SPARSE_IRQ
+extern int nr_pirqs;
+#else
+# define nr_pirqs NR_PIRQS
+#endif
+#endif
+
+#define DYNIRQ_BASE (PIRQ_BASE + nr_pirqs)
+#ifdef CONFIG_SPARSE_IRQ
+#define NR_DYNIRQS CPU_VECTOR_LIMIT
+#else
+#define NR_DYNIRQS 256
+#endif
-#define NR_IRQS (NR_PIRQS + NR_DYNIRQS)
+#define NR_IRQS (NR_PIRQS + NR_DYNIRQS)
#endif /* _ASM_X86_IRQ_VECTORS_H */
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/mmu_context.h 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/mmu_context.h 2011-02-01 14:44:12.000000000 +0100
@@ -26,11 +26,117 @@ static inline void xen_activate_mm(struc
int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
void destroy_context(struct mm_struct *mm);
+
+static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+{
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
+ if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
+ percpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
+#endif
+}
+
+#define prepare_arch_switch(next) __prepare_arch_switch()
+
+static inline void __prepare_arch_switch(void)
+{
#ifdef CONFIG_X86_32
-# include "mmu_context_32.h"
+ /*
+ * Save away %gs. No need to save %fs, as it was saved on the
+ * stack on entry. No need to save %es and %ds, as those are
+ * always kernel segments while inside the kernel.
+ */
+ lazy_save_gs(current->thread.gs);
+ lazy_load_gs(__KERNEL_STACK_CANARY);
#else
-# include "mmu_context_64.h"
+ /*
+ * Save away %es, %ds, %fs and %gs. Must happen before reload
+ * of cr3/ldt (i.e., not in __switch_to).
+ */
+ __asm__ __volatile__ (
+ "mov %%es,%0 ; mov %%ds,%1 ; mov %%fs,%2 ; mov %%gs,%3"
+ : "=m" (current->thread.es),
+ "=m" (current->thread.ds),
+ "=m" (current->thread.fsindex),
+ "=m" (current->thread.gsindex) );
+
+ if (current->thread.ds)
+ __asm__ __volatile__ ( "movl %0,%%ds" : : "r" (0) );
+
+ if (current->thread.es)
+ __asm__ __volatile__ ( "movl %0,%%es" : : "r" (0) );
+
+ if (current->thread.fsindex) {
+ __asm__ __volatile__ ( "movl %0,%%fs" : : "r" (0) );
+ current->thread.fs = 0;
+ }
+
+ if (current->thread.gsindex) {
+ load_gs_index(0);
+ current->thread.gs = 0;
+ }
+#endif
+}
+
+static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
+{
+ unsigned cpu = smp_processor_id();
+ struct mmuext_op _op[2 + (sizeof(long) > 4)], *op = _op;
+
+ if (likely(prev != next)) {
+ BUG_ON(!xen_feature(XENFEAT_writable_page_tables) &&
+ !PagePinned(virt_to_page(next->pgd)));
+
+ /* stop flush ipis for the previous mm */
+ cpu_clear(cpu, prev->cpu_vm_mask);
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
+ percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+ percpu_write(cpu_tlbstate.active_mm, next);
#endif
+ cpu_set(cpu, next->cpu_vm_mask);
+
+ /* Re-load page tables: load_cr3(next->pgd) */
+ op->cmd = MMUEXT_NEW_BASEPTR;
+ op->arg1.mfn = virt_to_mfn(next->pgd);
+ op++;
+
+ /* xen_new_user_pt(__pa(__user_pgd(next->pgd))) */
+#ifdef CONFIG_X86_64
+ op->cmd = MMUEXT_NEW_USER_BASEPTR;
+ op->arg1.mfn = virt_to_mfn(__user_pgd(next->pgd));
+ op++;
+#endif
+
+ /*
+ * load the LDT, if the LDT is different:
+ */
+ if (unlikely(prev->context.ldt != next->context.ldt)) {
+ /* load_LDT_nolock(&next->context) */
+ op->cmd = MMUEXT_SET_LDT;
+ op->arg1.linear_addr = (unsigned long)next->context.ldt;
+ op->arg2.nr_ents = next->context.size;
+ op++;
+ }
+
+ BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
+ }
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
+ else {
+ percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+ BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);
+
+ if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
+ /* We were in lazy tlb mode and leave_mm disabled
+ * tlb flush IPI delivery. We must reload CR3
+ * to make sure to use no freed page tables.
+ */
+ load_cr3(next->pgd);
+ xen_new_user_pt(__pa(__user_pgd(next->pgd)));
+ load_LDT_nolock(&next->context);
+ }
+ }
+#endif
+}
#define activate_mm(prev, next) \
do { \
@@ -38,5 +144,17 @@ do { \
switch_mm((prev), (next), NULL); \
} while (0);
+#ifdef CONFIG_X86_32
+#define deactivate_mm(tsk, mm) \
+do { \
+ lazy_load_gs(0); \
+} while (0)
+#else
+#define deactivate_mm(tsk, mm) \
+do { \
+ load_gs_index(0); \
+ loadsegment(fs, 0); \
+} while (0)
+#endif
#endif /* _ASM_X86_MMU_CONTEXT_H */
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/mmu_context_32.h 2011-02-01 14:42:26.000000000 +0100
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,83 +0,0 @@
-#ifndef _ASM_X86_MMU_CONTEXT_32_H
-#define _ASM_X86_MMU_CONTEXT_32_H
-
-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
-#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
- if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK)
- x86_write_percpu(cpu_tlbstate.state, TLBSTATE_LAZY);
-#endif
-}
-
-#define prepare_arch_switch(next) __prepare_arch_switch()
-
-static inline void __prepare_arch_switch(void)
-{
- /*
- * Save away %gs. No need to save %fs, as it was saved on the
- * stack on entry. No need to save %es and %ds, as those are
- * always kernel segments while inside the kernel.
- */
- asm volatile ( "mov %%gs,%0"
- : "=m" (current->thread.gs));
- asm volatile ( "movl %0,%%gs"
- : : "r" (0) );
-}
-
-static inline void switch_mm(struct mm_struct *prev,
- struct mm_struct *next,
- struct task_struct *tsk)
-{
- int cpu = smp_processor_id();
- struct mmuext_op _op[2], *op = _op;
-
- if (likely(prev != next)) {
- BUG_ON(!xen_feature(XENFEAT_writable_page_tables) &&
- !PagePinned(virt_to_page(next->pgd)));
-
- /* stop flush ipis for the previous mm */
- cpu_clear(cpu, prev->cpu_vm_mask);
-#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
- x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
- x86_write_percpu(cpu_tlbstate.active_mm, next);
-#endif
- cpu_set(cpu, next->cpu_vm_mask);
-
- /* Re-load page tables: load_cr3(next->pgd) */
- op->cmd = MMUEXT_NEW_BASEPTR;
- op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT);
- op++;
-
- /*
- * load the LDT, if the LDT is different:
- */
- if (unlikely(prev->context.ldt != next->context.ldt)) {
- /* load_LDT_nolock(&next->context, cpu) */
- op->cmd = MMUEXT_SET_LDT;
- op->arg1.linear_addr = (unsigned long)next->context.ldt;
- op->arg2.nr_ents = next->context.size;
- op++;
- }
-
- BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
- }
-#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
- else {
- x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
- BUG_ON(x86_read_percpu(cpu_tlbstate.active_mm) != next);
-
- if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
- /* We were in lazy tlb mode and leave_mm disabled
- * tlb flush IPI delivery. We must reload %cr3.
- */
- load_cr3(next->pgd);
- load_LDT_nolock(&next->context);
- }
- }
-#endif
-}
-
-#define deactivate_mm(tsk, mm) \
- asm("movl %0,%%gs": :"r" (0));
-
-#endif /* _ASM_X86_MMU_CONTEXT_32_H */
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/mmu_context_64.h 2011-02-01 14:39:24.000000000 +0100
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,106 +0,0 @@
-#ifndef _ASM_X86_MMU_CONTEXT_64_H
-#define _ASM_X86_MMU_CONTEXT_64_H
-
-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
-#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
- if (read_pda(mmu_state) == TLBSTATE_OK)
- write_pda(mmu_state, TLBSTATE_LAZY);
-#endif
-}
-
-#define prepare_arch_switch(next) __prepare_arch_switch()
-
-static inline void __prepare_arch_switch(void)
-{
- /*
- * Save away %es, %ds, %fs and %gs. Must happen before reload
- * of cr3/ldt (i.e., not in __switch_to).
- */
- __asm__ __volatile__ (
- "mov %%es,%0 ; mov %%ds,%1 ; mov %%fs,%2 ; mov %%gs,%3"
- : "=m" (current->thread.es),
- "=m" (current->thread.ds),
- "=m" (current->thread.fsindex),
- "=m" (current->thread.gsindex) );
-
- if (current->thread.ds)
- __asm__ __volatile__ ( "movl %0,%%ds" : : "r" (0) );
-
- if (current->thread.es)
- __asm__ __volatile__ ( "movl %0,%%es" : : "r" (0) );
-
- if (current->thread.fsindex) {
- __asm__ __volatile__ ( "movl %0,%%fs" : : "r" (0) );
- current->thread.fs = 0;
- }
-
- if (current->thread.gsindex) {
- load_gs_index(0);
- current->thread.gs = 0;
- }
-}
-
-static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
- struct task_struct *tsk)
-{
- unsigned cpu = smp_processor_id();
- struct mmuext_op _op[3], *op = _op;
-
- if (likely(prev != next)) {
- BUG_ON(!xen_feature(XENFEAT_writable_page_tables) &&
- !PagePinned(virt_to_page(next->pgd)));
-
- /* stop flush ipis for the previous mm */
- cpu_clear(cpu, prev->cpu_vm_mask);
-#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
- write_pda(mmu_state, TLBSTATE_OK);
- write_pda(active_mm, next);
-#endif
- cpu_set(cpu, next->cpu_vm_mask);
-
- /* load_cr3(next->pgd) */
- op->cmd = MMUEXT_NEW_BASEPTR;
- op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT);
- op++;
-
- /* xen_new_user_pt(__pa(__user_pgd(next->pgd))) */
- op->cmd = MMUEXT_NEW_USER_BASEPTR;
- op->arg1.mfn = pfn_to_mfn(__pa(__user_pgd(next->pgd)) >> PAGE_SHIFT);
- op++;
-
- if (unlikely(next->context.ldt != prev->context.ldt)) {
- /* load_LDT_nolock(&next->context) */
- op->cmd = MMUEXT_SET_LDT;
- op->arg1.linear_addr = (unsigned long)next->context.ldt;
- op->arg2.nr_ents = next->context.size;
- op++;
- }
-
- BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
- }
-#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
- else {
- write_pda(mmu_state, TLBSTATE_OK);
- if (read_pda(active_mm) != next)
- BUG();
- if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
- /* We were in lazy tlb mode and leave_mm disabled
- * tlb flush IPI delivery. We must reload CR3
- * to make sure to use no freed page tables.
- */
- load_cr3(next->pgd);
- xen_new_user_pt(__pa(__user_pgd(next->pgd)));
- load_LDT_nolock(&next->context);
- }
- }
-#endif
-}
-
-#define deactivate_mm(tsk, mm) \
-do { \
- load_gs_index(0); \
- asm volatile("movl %0,%%fs"::"r"(0)); \
-} while (0)
-
-#endif /* _ASM_X86_MMU_CONTEXT_64_H */
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/mutex.h 2011-02-01 14:44:12.000000000 +0100
@@ -0,0 +1,3 @@
+#define arch_cpu_is_running(cpu) vcpu_running(cpu)
+
+#include_next <asm/mutex.h>
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/pci.h 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/pci.h 2011-02-01 14:44:12.000000000 +0100
@@ -41,7 +41,6 @@ static inline int pci_proc_domain(struct
return pci_domain_nr(bus);
}
-extern void pci_iommu_alloc(void);
/* Can be used to override the logic in pci_scan_bus for skipping
already-configured bus numbers - to be used for buggy BIOSes
@@ -92,12 +91,44 @@ static inline void early_quirks(void) {
extern void pci_iommu_alloc(void);
-#endif /* __KERNEL__ */
+/* MSI arch hooks */
+#define arch_setup_msi_irqs arch_setup_msi_irqs
+#define arch_teardown_msi_irqs arch_teardown_msi_irqs
+
+#define PCI_DMA_BUS_IS_PHYS 0
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_DMA_API_DEBUG) || defined(CONFIG_SWIOTLB)
+
+#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
+ dma_addr_t ADDR_NAME;
+#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
+ __u32 LEN_NAME;
+#define pci_unmap_addr(PTR, ADDR_NAME) \
+ ((PTR)->ADDR_NAME)
+#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
+ (((PTR)->ADDR_NAME) = (VAL))
+#define pci_unmap_len(PTR, LEN_NAME) \
+ ((PTR)->LEN_NAME)
+#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
+ (((PTR)->LEN_NAME) = (VAL))
-#ifdef CONFIG_X86_32
-# include "pci_32.h"
#else
-# include "../../asm/pci_64.h"
+
+#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME[0];
+#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) unsigned LEN_NAME[0];
+#define pci_unmap_addr(PTR, ADDR_NAME) sizeof((PTR)->ADDR_NAME)
+#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
+ do { break; } while (pci_unmap_addr(PTR, ADDR_NAME))
+#define pci_unmap_len(PTR, LEN_NAME) sizeof((PTR)->LEN_NAME)
+#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
+ do { break; } while (pci_unmap_len(PTR, LEN_NAME))
+
+#endif
+
+#endif /* __KERNEL__ */
+
+#ifdef CONFIG_X86_64
+#include "../../asm/pci_64.h"
#endif
/* implement the pci_ DMA API in terms of the generic device dma_ one */
@@ -115,11 +146,6 @@ static inline int __pcibus_to_node(const
return sd->node;
}
-static inline cpumask_t __pcibus_to_cpumask(struct pci_bus *bus)
-{
- return node_to_cpumask(__pcibus_to_node(bus));
-}
-
static inline const struct cpumask *
cpumask_of_pcibus(const struct pci_bus *bus)
{
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/pgtable.h 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/pgtable.h 2011-02-01 14:44:12.000000000 +0100
@@ -1,178 +1,9 @@
#ifndef _ASM_X86_PGTABLE_H
#define _ASM_X86_PGTABLE_H
-#define FIRST_USER_ADDRESS 0
+#include <asm/page.h>
-#define _PAGE_BIT_PRESENT 0 /* is present */
-#define _PAGE_BIT_RW 1 /* writeable */
-#define _PAGE_BIT_USER 2 /* userspace addressable */
-#define _PAGE_BIT_PWT 3 /* page write through */
-#define _PAGE_BIT_PCD 4 /* page cache disabled */
-#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */
-#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */
-#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
-#define _PAGE_BIT_PAT 7 /* on 4KB pages */
-#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
-#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
-#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */
-#define _PAGE_BIT_UNUSED3 11
-#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
-#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
-#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
-#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
-
-/* If _PAGE_BIT_PRESENT is clear, we use these: */
-/* - if the user mapped it with PROT_NONE; pte_present gives true */
-#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
-/* - set: nonlinear file mapping, saved PTE; unset:swap */
-#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY
-
-#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
-#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW)
-#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER)
-#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
-#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
-#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
-#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
-#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
-#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
-#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
-#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
-#define _PAGE_UNUSED3 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED3)
-#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
-#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
-#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
-#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
-#define __HAVE_ARCH_PTE_SPECIAL
-
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
-#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
-#else
-#define _PAGE_NX (_AT(pteval_t, 0))
-#endif
-
-#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
-#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
-
-#ifndef __ASSEMBLY__
-#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002
-extern unsigned int __kernel_page_user;
-#else
-#define __kernel_page_user 0
-#endif
-#endif
-
-#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
- _PAGE_ACCESSED | _PAGE_DIRTY)
-#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
- _PAGE_DIRTY | __kernel_page_user)
-
-/* Set of bits not changed in pte_modify */
-#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_CACHE_MASK | _PAGE_IOMAP | \
- _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
-
-/*
- * PAT settings are part of the hypervisor interface, which sets the
- * MSR to 0x050100070406 (i.e. WB, WT, UC-, UC, WC, WP [, UC, UC]).
- */
-#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT | _PAGE_PAT)
-#define _PAGE_CACHE_WB (0)
-#define _PAGE_CACHE_WT (_PAGE_PWT)
-#define _PAGE_CACHE_WC (_PAGE_PAT)
-#define _PAGE_CACHE_WP (_PAGE_PAT | _PAGE_PWT)
-#define _PAGE_CACHE_UC_MINUS (_PAGE_PCD)
-#define _PAGE_CACHE_UC (_PAGE_PCD | _PAGE_PWT)
-
-#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
-#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
- _PAGE_ACCESSED | _PAGE_NX)
-
-#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \
- _PAGE_USER | _PAGE_ACCESSED)
-#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
- _PAGE_ACCESSED | _PAGE_NX)
-#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
- _PAGE_ACCESSED)
-#define PAGE_COPY PAGE_COPY_NOEXEC
-#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \
- _PAGE_ACCESSED | _PAGE_NX)
-#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
- _PAGE_ACCESSED)
-
-#define __PAGE_KERNEL_EXEC \
- (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
-#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
-
-#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
-#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
-#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT)
-#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC)
-#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
-#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
-#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
-#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
-#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
-#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
-#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
-
-#define __PAGE_KERNEL_IO (__PAGE_KERNEL | _PAGE_IOMAP)
-#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE | _PAGE_IOMAP)
-#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS | _PAGE_IOMAP)
-#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC | _PAGE_IOMAP)
-
-#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
-#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
-#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
-#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
-#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC)
-#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
-#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS)
-#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE)
-#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
-#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE)
-#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
-#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
-#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE)
-
-#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO)
-#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE)
-#define PAGE_KERNEL_IO_UC_MINUS __pgprot(__PAGE_KERNEL_IO_UC_MINUS)
-#define PAGE_KERNEL_IO_WC __pgprot(__PAGE_KERNEL_IO_WC)
-
-/* xwr */
-#define __P000 PAGE_NONE
-#define __P001 PAGE_READONLY
-#define __P010 PAGE_COPY
-#define __P011 PAGE_COPY
-#define __P100 PAGE_READONLY_EXEC
-#define __P101 PAGE_READONLY_EXEC
-#define __P110 PAGE_COPY_EXEC
-#define __P111 PAGE_COPY_EXEC
-
-#define __S000 PAGE_NONE
-#define __S001 PAGE_READONLY
-#define __S010 PAGE_SHARED
-#define __S011 PAGE_SHARED
-#define __S100 PAGE_READONLY_EXEC
-#define __S101 PAGE_READONLY_EXEC
-#define __S110 PAGE_SHARED_EXEC
-#define __S111 PAGE_SHARED_EXEC
-
-/*
- * early identity mapping pte attrib macros.
- */
-#ifdef CONFIG_X86_64
-#define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
-#else
-/*
- * For PDE_IDENT_ATTR include USER bit. As the PDE and PTE protection
- * bits are combined, this will alow user to access the high address mapped
- * VDSO in the presence of CONFIG_COMPAT_VDSO
- */
-#define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */
-#define PDE_IDENT_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */
-#define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */
-#endif
+#include <asm/pgtable_types.h>
/*
* Macro to mark a page protection value as UC-
@@ -184,9 +15,6 @@ extern unsigned int __kernel_page_user;
#ifndef __ASSEMBLY__
-#define pgprot_writecombine pgprot_writecombine
-extern pgprot_t pgprot_writecombine(pgprot_t prot);
-
/*
* ZERO_PAGE is a global shared page that is always zero: used
* for zero-mapped memory areas etc..
@@ -197,6 +25,59 @@ extern unsigned long empty_zero_page[PAG
extern spinlock_t pgd_lock;
extern struct list_head pgd_list;
+#define set_pte(ptep, pte) xen_set_pte(ptep, pte)
+#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
+
+#define set_pte_atomic(ptep, pte) \
+ xen_set_pte_atomic(ptep, pte)
+
+#define set_pmd(pmdp, pmd) xen_set_pmd(pmdp, pmd)
+
+#ifndef __PAGETABLE_PUD_FOLDED
+#define set_pgd(pgdp, pgd) xen_set_pgd(pgdp, pgd)
+#define pgd_clear(pgd) xen_pgd_clear(pgd)
+#endif
+
+#ifndef set_pud
+# define set_pud(pudp, pud) xen_set_pud(pudp, pud)
+#endif
+
+#ifndef __PAGETABLE_PMD_FOLDED
+#define pud_clear(pud) xen_pud_clear(pud)
+#endif
+
+#define pte_clear(mm, addr, ptep) xen_pte_clear(mm, addr, ptep)
+#define pmd_clear(pmd) xen_pmd_clear(pmd)
+
+#define pte_update(mm, addr, ptep) do { } while (0)
+#define pte_update_defer(mm, addr, ptep) do { } while (0)
+
+static inline void __init paravirt_pagetable_setup_start(pgd_t *base)
+{
+ xen_pagetable_setup_start(base);
+}
+
+static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
+{
+ xen_pagetable_setup_done(base);
+}
+
+#define pgd_val(x) xen_pgd_val(x)
+#define __pgd(x) xen_make_pgd(x)
+
+#ifndef __PAGETABLE_PUD_FOLDED
+#define pud_val(x) xen_pud_val(x)
+#define __pud(x) xen_make_pud(x)
+#endif
+
+#ifndef __PAGETABLE_PMD_FOLDED
+#define pmd_val(x) xen_pmd_val(x)
+#define __pmd(x) xen_make_pmd(x)
+#endif
+
+#define pte_val(x) xen_pte_val(x)
+#define __pte(x) xen_make_pte(x)
+
/*
* The following only work if pte_present() is true.
* Undefined behaviour if not..
@@ -252,53 +133,67 @@ static inline int pte_special(pte_t pte)
static inline int pmd_large(pmd_t pte)
{
- return (__pmd_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
+ return (pmd_flags(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
(_PAGE_PSE | _PAGE_PRESENT);
}
+static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
+{
+ pteval_t v = __pte_val(pte);
+
+ return __pte_ma(v | set);
+}
+
+static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
+{
+ pteval_t v = __pte_val(pte);
+
+ return __pte_ma(v & ~clear);
+}
+
static inline pte_t pte_mkclean(pte_t pte)
{
- return __pte_ma(__pte_val(pte) & ~_PAGE_DIRTY);
+ return pte_clear_flags(pte, _PAGE_DIRTY);
}
static inline pte_t pte_mkold(pte_t pte)
{
- return __pte_ma(__pte_val(pte) & ~_PAGE_ACCESSED);
+ return pte_clear_flags(pte, _PAGE_ACCESSED);
}
static inline pte_t pte_wrprotect(pte_t pte)
{
- return __pte_ma(__pte_val(pte) & ~_PAGE_RW);
+ return pte_clear_flags(pte, _PAGE_RW);
}
static inline pte_t pte_mkexec(pte_t pte)
{
- return __pte_ma(__pte_val(pte) & ~_PAGE_NX);
+ return pte_clear_flags(pte, _PAGE_NX);
}
static inline pte_t pte_mkdirty(pte_t pte)
{
- return __pte_ma(__pte_val(pte) | _PAGE_DIRTY);
+ return pte_set_flags(pte, _PAGE_DIRTY);
}
static inline pte_t pte_mkyoung(pte_t pte)
{
- return __pte_ma(__pte_val(pte) | _PAGE_ACCESSED);
+ return pte_set_flags(pte, _PAGE_ACCESSED);
}
static inline pte_t pte_mkwrite(pte_t pte)
{
- return __pte_ma(__pte_val(pte) | _PAGE_RW);
+ return pte_set_flags(pte, _PAGE_RW);
}
static inline pte_t pte_mkhuge(pte_t pte)
{
- return __pte_ma(__pte_val(pte) | _PAGE_PSE);
+ return pte_set_flags(pte, _PAGE_PSE);
}
static inline pte_t pte_clrhuge(pte_t pte)
{
- return __pte_ma(__pte_val(pte) & ~_PAGE_PSE);
+ return pte_clear_flags(pte, _PAGE_PSE);
}
static inline pte_t pte_mkglobal(pte_t pte)
@@ -313,11 +208,9 @@ static inline pte_t pte_clrglobal(pte_t
static inline pte_t pte_mkspecial(pte_t pte)
{
- return __pte_ma(__pte_val(pte) | _PAGE_SPECIAL);
+ return pte_set_flags(pte, _PAGE_SPECIAL);
}
-extern pteval_t __supported_pte_mask;
-
/*
* Mask out unsupported bits in a present pgprot. Non-present pgprots
* can use those bits for other purposes, so leave them be.
@@ -390,68 +283,208 @@ static inline int is_new_memtype_allowed
return 1;
}
-#ifndef __ASSEMBLY__
-#ifndef CONFIG_XEN
-/* Indicate that x86 has its own track and untrack pfn vma functions */
-#define __HAVE_PFNMAP_TRACKING
-#endif
+pmd_t *populate_extra_pmd(unsigned long vaddr);
+pte_t *populate_extra_pte(unsigned long vaddr);
+#endif /* __ASSEMBLY__ */
-#define __HAVE_PHYS_MEM_ACCESS_PROT
-struct file;
-pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
- unsigned long size, pgprot_t vma_prot);
-int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
- unsigned long size, pgprot_t *vma_prot);
+#ifdef CONFIG_X86_32
+# include "pgtable_32.h"
+#else
+# include "pgtable_64.h"
#endif
-/* Install a pte for a particular vaddr in kernel space. */
-void set_pte_vaddr(unsigned long vaddr, pte_t pte);
+#ifndef __ASSEMBLY__
+#include <linux/mm_types.h>
-#ifndef CONFIG_XEN
-extern void native_pagetable_setup_start(pgd_t *base);
-extern void native_pagetable_setup_done(pgd_t *base);
+static inline int pte_none(pte_t pte)
+{
+ return !pte.pte;
+}
+
+#define __HAVE_ARCH_PTE_SAME
+static inline int pte_same(pte_t a, pte_t b)
+{
+ return a.pte == b.pte;
+}
+
+static inline int pte_present(pte_t a)
+{
+ return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
+}
+
+static inline int pmd_present(pmd_t pmd)
+{
+#if CONFIG_XEN_COMPAT <= 0x030002
+/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
+ can temporarily clear it. */
+ return __pmd_val(pmd) != 0;
#else
-static inline void xen_pagetable_setup_start(pgd_t *base) {}
-static inline void xen_pagetable_setup_done(pgd_t *base) {}
+ return pmd_flags(pmd) & _PAGE_PRESENT;
#endif
+}
-struct seq_file;
-extern void arch_report_meminfo(struct seq_file *m);
+static inline int pmd_none(pmd_t pmd)
+{
+ /* Only check low word on 32-bit platforms, since it might be
+ out of sync with upper half. */
+ return (unsigned long)__pmd_val(pmd) == 0;
+}
-#define set_pte(ptep, pte) xen_set_pte(ptep, pte)
-#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
+static inline unsigned long pmd_page_vaddr(pmd_t pmd)
+{
+ return (unsigned long)__va(pmd_val(pmd) & PTE_PFN_MASK);
+}
-#define set_pte_atomic(ptep, pte) \
- xen_set_pte_atomic(ptep, pte)
+/*
+ * Currently stuck as a macro due to indirect forward reference to
+ * linux/mmzone.h's __section_mem_map_addr() definition:
+ */
+#define pmd_page(pmd) pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)
-#define set_pmd(pmdp, pmd) xen_set_pmd(pmdp, pmd)
+/*
+ * the pmd page can be thought of as an array like this: pmd_t[PTRS_PER_PMD]
+ *
+ * this function returns the index of the entry in the pmd page which would
+ * control the given virtual address
+ */
+static inline unsigned pmd_index(unsigned long address)
+{
+ return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1);
+}
-#ifndef __PAGETABLE_PUD_FOLDED
-#define set_pgd(pgdp, pgd) xen_set_pgd(pgdp, pgd)
-#define pgd_clear(pgd) xen_pgd_clear(pgd)
-#endif
+/*
+ * Conversion functions: convert a page and protection to a page entry,
+ * and a page entry and page directory to the page they refer to.
+ *
+ * (Currently stuck as a macro because of indirect forward reference
+ * to linux/mm.h:page_to_nid())
+ */
+#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
-#ifndef set_pud
-# define set_pud(pudp, pud) xen_set_pud(pudp, pud)
-#endif
+/*
+ * the pte page can be thought of as an array like this: pte_t[PTRS_PER_PTE]
+ *
+ * this function returns the index of the entry in the pte page which would
+ * control the given virtual address
+ */
+static inline unsigned pte_index(unsigned long address)
+{
+ return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
+}
-#ifndef __PAGETABLE_PMD_FOLDED
-#define pud_clear(pud) xen_pud_clear(pud)
+static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
+{
+ return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address);
+}
+
+static inline int pmd_bad(pmd_t pmd)
+{
+#if CONFIG_XEN_COMPAT <= 0x030002
+ return (pmd_flags(pmd) & ~_PAGE_USER & ~_PAGE_PRESENT)
+ != (_KERNPG_TABLE & ~_PAGE_PRESENT);
+#else
+ return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
#endif
+}
-#define pte_clear(mm, addr, ptep) xen_pte_clear(mm, addr, ptep)
-#define pmd_clear(pmd) xen_pmd_clear(pmd)
+static inline unsigned long pages_to_mb(unsigned long npg)
+{
+ return npg >> (20 - PAGE_SHIFT);
+}
-#define pte_update(mm, addr, ptep) do { } while (0)
-#define pte_update_defer(mm, addr, ptep) do { } while (0)
+#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
+ direct_remap_pfn_range(vma, vaddr, pfn, size, prot, DOMID_IO)
-#endif /* __ASSEMBLY__ */
+#if PAGETABLE_LEVELS > 2
+static inline int pud_none(pud_t pud)
+{
+ return __pud_val(pud) == 0;
+}
-#ifdef CONFIG_X86_32
-# include "pgtable_32.h"
+static inline int pud_present(pud_t pud)
+{
+ return pud_flags(pud) & _PAGE_PRESENT;
+}
+
+static inline unsigned long pud_page_vaddr(pud_t pud)
+{
+ return (unsigned long)__va((unsigned long)pud_val(pud) & PTE_PFN_MASK);
+}
+
+/*
+ * Currently stuck as a macro due to indirect forward reference to
+ * linux/mmzone.h's __section_mem_map_addr() definition:
+ */
+#define pud_page(pud) pfn_to_page(pud_val(pud) >> PAGE_SHIFT)
+
+/* Find an entry in the second-level page table.. */
+static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
+{
+ return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address);
+}
+
+static inline unsigned long pmd_pfn(pmd_t pmd)
+{
+ return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT;
+}
+
+static inline int pud_large(pud_t pud)
+{
+ return (__pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) ==
+ (_PAGE_PSE | _PAGE_PRESENT);
+}
+
+static inline int pud_bad(pud_t pud)
+{
+ return (pud_flags(pud) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0;
+}
#else
-# include "pgtable_64.h"
-#endif
+static inline int pud_large(pud_t pud)
+{
+ return 0;
+}
+#endif /* PAGETABLE_LEVELS > 2 */
+
+#if PAGETABLE_LEVELS > 3
+static inline int pgd_present(pgd_t pgd)
+{
+ return pgd_flags(pgd) & _PAGE_PRESENT;
+}
+
+static inline unsigned long pgd_page_vaddr(pgd_t pgd)
+{
+ return (unsigned long)__va((unsigned long)pgd_val(pgd) & PTE_PFN_MASK);
+}
+
+/*
+ * Currently stuck as a macro due to indirect forward reference to
+ * linux/mmzone.h's __section_mem_map_addr() definition:
+ */
+#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT)
+
+/* to find an entry in a page-table-directory. */
+static inline unsigned pud_index(unsigned long address)
+{
+ return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
+}
+
+static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
+{
+ return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(address);
+}
+
+static inline int pgd_bad(pgd_t pgd)
+{
+ return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
+}
+
+static inline int pgd_none(pgd_t pgd)
+{
+ return !__pgd_val(pgd);
+}
+#endif /* PAGETABLE_LEVELS > 3 */
+
+#endif /* __ASSEMBLY__ */
/*
* the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
@@ -478,28 +511,6 @@ extern void arch_report_meminfo(struct s
#ifndef __ASSEMBLY__
-enum {
- PG_LEVEL_NONE,
- PG_LEVEL_4K,
- PG_LEVEL_2M,
- PG_LEVEL_1G,
- PG_LEVEL_NUM
-};
-
-#ifdef CONFIG_PROC_FS
-extern void update_page_count(int level, unsigned long pages);
-#else
-static inline void update_page_count(int level, unsigned long pages) { }
-#endif
-
-/*
- * Helper function that returns the kernel pagetable entry controlling
- * the virtual address 'address'. NULL means no pagetable entry present.
- * NOTE: the return type is pte_t but if the pmd is PSE then we return it
- * as a pte too.
- */
-extern pte_t *lookup_address(unsigned long address, unsigned int *level);
-
/* local pte updates need not use xchg for locking */
static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res)
{
@@ -632,15 +643,18 @@ static inline void clone_pgd_range(pgd_t
memcpy(dst, src, count * sizeof(pgd_t));
}
-#define arbitrary_virt_to_machine(va) \
+#define arbitrary_virt_to_mfn(va) \
({ \
unsigned int __lvl; \
pte_t *__ptep = lookup_address((unsigned long)(va), &__lvl); \
BUG_ON(!__ptep || __lvl != PG_LEVEL_4K || !pte_present(*__ptep));\
- (((maddr_t)pte_mfn(*__ptep) << PAGE_SHIFT) \
- | ((unsigned long)(va) & (PAGE_SIZE - 1))); \
+ pte_mfn(*__ptep); \
})
+#define arbitrary_virt_to_machine(va) \
+ (((maddr_t)arbitrary_virt_to_mfn(va) << PAGE_SHIFT) \
+ | ((unsigned long)(va) & (PAGE_SIZE - 1)))
+
#ifdef CONFIG_HIGHPTE
#include <asm/io.h>
struct page *kmap_atomic_to_page(void *);
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/pgtable-3level.h 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/pgtable-3level.h 2011-02-01 14:44:12.000000000 +0100
@@ -20,21 +20,6 @@
__FILE__, __LINE__, &(e), __pgd_val(e), \
(pgd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT)
-static inline int pud_none(pud_t pud)
-{
- return __pud_val(pud) == 0;
-
-}
-static inline int pud_bad(pud_t pud)
-{
- return (__pud_val(pud) & ~(PTE_PFN_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
-}
-
-static inline int pud_present(pud_t pud)
-{
- return __pud_val(pud) & _PAGE_PRESENT;
-}
-
/* Rules for using set_pte: the pte being assigned *must* be
* either not present or in a state where the hardware will
* not attempt to update the pte. In places where this is
@@ -102,15 +87,6 @@ static inline void pud_clear(pud_t *pudp
xen_tlb_flush();
}
-#define pud_page(pud) pfn_to_page(pud_val(pud) >> PAGE_SHIFT)
-
-#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_PFN_MASK))
-
-
-/* Find an entry in the second-level page table.. */
-#define pmd_offset(pud, address) ((pmd_t *)pud_page_vaddr(*(pud)) + \
- pmd_index(address))
-
#ifdef CONFIG_SMP
static inline pte_t xen_ptep_get_and_clear(pte_t *ptep, pte_t res)
{
@@ -127,17 +103,6 @@ static inline pte_t xen_ptep_get_and_cle
#define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte)
#endif
-#define __HAVE_ARCH_PTE_SAME
-static inline int pte_same(pte_t a, pte_t b)
-{
- return a.pte_low == b.pte_low && a.pte_high == b.pte_high;
-}
-
-static inline int pte_none(pte_t pte)
-{
- return !(pte.pte_low | pte.pte_high);
-}
-
#define __pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) | \
((_pte).pte_high << (32-PAGE_SHIFT)))
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/pgtable-3level-defs.h 2011-02-01 14:39:24.000000000 +0100
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,24 +0,0 @@
-#ifndef _ASM_X86_PGTABLE_3LEVEL_DEFS_H
-#define _ASM_X86_PGTABLE_3LEVEL_DEFS_H
-
-#define SHARED_KERNEL_PMD 0
-
-/*
- * PGDIR_SHIFT determines what a top-level page table entry can map
- */
-#define PGDIR_SHIFT 30
-#define PTRS_PER_PGD 4
-
-/*
- * PMD_SHIFT determines the size of the area a middle-level
- * page table can map
- */
-#define PMD_SHIFT 21
-#define PTRS_PER_PMD 512
-
-/*
- * entries per page directory level
- */
-#define PTRS_PER_PTE 512
-
-#endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/pgtable-3level_types.h 2011-02-01 14:44:12.000000000 +0100
@@ -0,0 +1,44 @@
+#ifndef _ASM_X86_PGTABLE_3LEVEL_DEFS_H
+#define _ASM_X86_PGTABLE_3LEVEL_DEFS_H
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+
+typedef u64 pteval_t;
+typedef u64 pmdval_t;
+typedef u64 pudval_t;
+typedef u64 pgdval_t;
+typedef u64 pgprotval_t;
+
+typedef union {
+ struct {
+ unsigned long pte_low, pte_high;
+ };
+ pteval_t pte;
+} pte_t;
+#endif /* !__ASSEMBLY__ */
+
+#define SHARED_KERNEL_PMD 0
+
+#define PAGETABLE_LEVELS 3
+
+/*
+ * PGDIR_SHIFT determines what a top-level page table entry can map
+ */
+#define PGDIR_SHIFT 30
+#define PTRS_PER_PGD 4
+
+/*
+ * PMD_SHIFT determines the size of the area a middle-level
+ * page table can map
+ */
+#define PMD_SHIFT 21
+#define PTRS_PER_PMD 512
+
+/*
+ * entries per page directory level
+ */
+#define PTRS_PER_PTE 512
+
+
+#endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/pgtable_32.h 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/pgtable_32.h 2011-02-01 14:44:12.000000000 +0100
@@ -1,6 +1,8 @@
#ifndef _ASM_X86_PGTABLE_32_H
#define _ASM_X86_PGTABLE_32_H
+#include <asm/pgtable_32_types.h>
+
/*
* The Linux memory management assumes a three-level page table setup. On
* the i386, we use that, but "fold" the mid level into the top-level page
@@ -31,47 +33,6 @@ void paging_init(void);
extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
-/*
- * The Linux x86 paging architecture is 'compile-time dual-mode', it
- * implements both the traditional 2-level x86 page tables and the
- * newer 3-level PAE-mode page tables.
- */
-#ifdef CONFIG_X86_PAE
-# include <asm/pgtable-3level-defs.h>
-# define PMD_SIZE (1UL << PMD_SHIFT)
-# define PMD_MASK (~(PMD_SIZE - 1))
-#else
-# include <asm/pgtable-2level-defs.h>
-#endif
-
-#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
-#define PGDIR_MASK (~(PGDIR_SIZE - 1))
-
-/* Just any arbitrary offset to the start of the vmalloc VM area: the
- * current 8MB value just means that there will be a 8MB "hole" after the
- * physical memory until the kernel virtual memory starts. That means that
- * any out-of-bounds memory accesses will hopefully be caught.
- * The vmalloc() routines leaves a hole of 4kB between each vmalloced
- * area for the same reason. ;)
- */
-#define VMALLOC_OFFSET (8 * 1024 * 1024)
-#define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET)
-#ifdef CONFIG_X86_PAE
-#define LAST_PKMAP 512
-#else
-#define LAST_PKMAP 1024
-#endif
-
-#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE * (LAST_PKMAP + 1)) \
- & PMD_MASK)
-
-#ifdef CONFIG_HIGHMEM
-# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE)
-#else
-# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE)
-#endif
-
-#define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE)
/*
* Define this if things work differently on an i386 and an i486:
@@ -80,66 +41,12 @@ extern void set_pmd_pfn(unsigned long, u
*/
#undef TEST_ACCESS_OK
-/* The boot page tables (all created as a single array) */
-extern unsigned long pg0[];
-
-#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
-
-/* To avoid harmful races, pmd_none(x) should check only the lower when PAE */
-#define pmd_none(x) (!(unsigned long)__pmd_val(x))
-#if CONFIG_XEN_COMPAT <= 0x030002
-/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
- can temporarily clear it. */
-#define pmd_present(x) (__pmd_val(x))
-#define pmd_bad(x) ((__pmd_val(x) & (PTE_FLAGS_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
-#else
-#define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
-#define pmd_bad(x) ((__pmd_val(x) & (PTE_FLAGS_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
-#endif
-
-
-#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
-
#ifdef CONFIG_X86_PAE
# include <asm/pgtable-3level.h>
#else
# include <asm/pgtable-2level.h>
#endif
-/*
- * Conversion functions: convert a page and protection to a page entry,
- * and a page entry and page directory to the page they refer to.
- */
-#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
-
-
-static inline int pud_large(pud_t pud) { return 0; }
-
-/*
- * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
- *
- * this macro returns the index of the entry in the pmd page which would
- * control the given virtual address
- */
-#define pmd_index(address) \
- (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
-
-/*
- * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE]
- *
- * this macro returns the index of the entry in the pte page which would
- * control the given virtual address
- */
-#define pte_index(address) \
- (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
-#define pte_offset_kernel(dir, address) \
- ((pte_t *)pmd_page_vaddr(*(dir)) + pte_index((address)))
-
-#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
-
-#define pmd_page_vaddr(pmd) \
- ((unsigned long)__va(pmd_val((pmd)) & PTE_PFN_MASK))
-
#if defined(CONFIG_HIGHPTE)
#define pte_offset_map(dir, address) \
((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE0) + \
@@ -185,7 +92,4 @@ void make_lowmem_page_writable(void *va,
#define kern_addr_valid(kaddr) (0)
#endif
-#define io_remap_pfn_range(vma, from, pfn, size, prot) \
- direct_remap_pfn_range(vma, from, pfn, size, prot, DOMID_IO)
-
#endif /* _ASM_X86_PGTABLE_32_H */
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/pgtable_64.h 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/pgtable_64.h 2011-02-01 14:44:12.000000000 +0100
@@ -2,6 +2,8 @@
#define _ASM_X86_PGTABLE_64_H
#include <linux/const.h>
+#include <asm/pgtable_64_types.h>
+
#ifndef __ASSEMBLY__
/*
@@ -12,12 +14,12 @@
#include <linux/bitops.h>
#include <linux/threads.h>
#include <linux/sched.h>
-#include <asm/pda.h>
#ifdef CONFIG_XEN
extern pud_t level3_user_pgt[512];
extern void xen_init_pt(void);
+extern void xen_switch_pt(void);
#endif
extern pud_t level3_kernel_pgt[512];
@@ -33,39 +35,13 @@ extern void paging_init(void);
#endif /* !__ASSEMBLY__ */
-#define SHARED_KERNEL_PMD 0
-
-/*
- * PGDIR_SHIFT determines what a top-level page table entry can map
- */
-#define PGDIR_SHIFT 39
-#define PTRS_PER_PGD 512
-
-/*
- * 3rd level page
- */
-#define PUD_SHIFT 30
-#define PTRS_PER_PUD 512
-
-/*
- * PMD_SHIFT determines the size of the area a middle-level
- * page table can map
- */
-#define PMD_SHIFT 21
-#define PTRS_PER_PMD 512
-
-/*
- * entries per page directory level
- */
-#define PTRS_PER_PTE 512
-
#ifndef __ASSEMBLY__
#define pte_ERROR(e) \
printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", \
__FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e))
#define pmd_ERROR(e) \
- printk("%s:%d: bad pmd %p(%016lx pfn %010Lx).\n", \
+ printk("%s:%d: bad pmd %p(%016lx pfn %010lx).\n", \
__FILE__, __LINE__, &(e), __pmd_val(e), pmd_pfn(e))
#define pud_ERROR(e) \
printk("%s:%d: bad pud %p(%016lx pfn %010Lx).\n", \
@@ -76,9 +52,6 @@ extern void paging_init(void);
__FILE__, __LINE__, &(e), __pgd_val(e), \
(pgd_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
-#define pgd_none(x) (!__pgd_val(x))
-#define pud_none(x) (!__pud_val(x))
-
struct mm_struct;
void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
@@ -138,48 +111,6 @@ static inline void xen_pgd_clear(pgd_t *
xen_set_pgd(__user_pgd(pgd), xen_make_pgd(0));
}
-#define pte_same(a, b) ((a).pte == (b).pte)
-
-#endif /* !__ASSEMBLY__ */
-
-#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT)
-#define PMD_MASK (~(PMD_SIZE - 1))
-#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT)
-#define PUD_MASK (~(PUD_SIZE - 1))
-#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
-#define PGDIR_MASK (~(PGDIR_SIZE - 1))
-
-#define MAX_PHYSMEM_BITS 43
-#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
-#define VMALLOC_START _AC(0xffffc20000000000, UL)
-#define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
-#define VMEMMAP_START _AC(0xffffe20000000000, UL)
-#define MODULES_VADDR _AC(0xffffffffa0000000, UL)
-#define MODULES_END _AC(0xffffffffff000000, UL)
-#define MODULES_LEN (MODULES_END - MODULES_VADDR)
-
-#ifndef __ASSEMBLY__
-
-static inline int pgd_bad(pgd_t pgd)
-{
- return (__pgd_val(pgd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
-}
-
-static inline int pud_bad(pud_t pud)
-{
- return (__pud_val(pud) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
-}
-
-static inline int pmd_bad(pmd_t pmd)
-{
- return (__pmd_val(pmd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
-}
-
-#define pte_none(x) (!(x).pte)
-#define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE))
-
-#define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */
-
#define __pte_mfn(_pte) (((_pte).pte & PTE_PFN_MASK) >> PAGE_SHIFT)
/*
@@ -190,47 +121,12 @@ static inline int pmd_bad(pmd_t pmd)
/*
* Level 4 access.
*/
-#define pgd_page_vaddr(pgd) \
- ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_PFN_MASK))
-#define pgd_page(pgd) (pfn_to_page(pgd_val((pgd)) >> PAGE_SHIFT))
-#define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT)
static inline int pgd_large(pgd_t pgd) { return 0; }
#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
/* PUD - Level3 access */
-/* to find an entry in a page-table-directory. */
-#define pud_page_vaddr(pud) \
- ((unsigned long)__va(pud_val((pud)) & PHYSICAL_PAGE_MASK))
-#define pud_page(pud) (pfn_to_page(pud_val((pud)) >> PAGE_SHIFT))
-#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
-#define pud_offset(pgd, address) \
- ((pud_t *)pgd_page_vaddr(*(pgd)) + pud_index((address)))
-#define pud_present(pud) (__pud_val(pud) & _PAGE_PRESENT)
-
-static inline int pud_large(pud_t pte)
-{
- return (__pud_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
- (_PAGE_PSE | _PAGE_PRESENT);
-}
/* PMD - Level 2 access */
-#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_PFN_MASK))
-#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
-
-#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
-#define pmd_offset(dir, address) ((pmd_t *)pud_page_vaddr(*(dir)) + \
- pmd_index(address))
-#define pmd_none(x) (!__pmd_val(x))
-#if CONFIG_XEN_COMPAT <= 0x030002
-/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
- can temporarily clear it. */
-#define pmd_present(x) (__pmd_val(x))
-#else
-#define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
-#endif
-#define pfn_pmd(nr, prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val((prot))))
-#define pmd_pfn(x) ((pmd_val((x)) & __PHYSICAL_MASK) >> PAGE_SHIFT)
-
#define pte_to_pgoff(pte) ((__pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | \
_PAGE_FILE })
@@ -238,13 +134,6 @@ static inline int pud_large(pud_t pte)
/* PTE - Level 1 access. */
-/* page, protection -> pte */
-#define mk_pte(page, pgprot) pfn_pte(page_to_pfn((page)), (pgprot))
-
-#define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
-#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \
- pte_index((address)))
-
/* x86-64 always has all page tables mapped. */
#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address))
@@ -278,9 +167,6 @@ static inline int pud_large(pud_t pte)
extern int kern_addr_valid(unsigned long addr);
extern void cleanup_highmap(void);
-#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
- direct_remap_pfn_range(vma, vaddr, pfn, size, prot, DOMID_IO)
-
#define HAVE_ARCH_UNMAPPED_AREA
#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/pgtable_64_types.h 2011-02-01 14:44:12.000000000 +0100
@@ -0,0 +1,63 @@
+#ifndef _ASM_X86_PGTABLE_64_DEFS_H
+#define _ASM_X86_PGTABLE_64_DEFS_H
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+
+/*
+ * These are used to make use of C type-checking..
+ */
+typedef unsigned long pteval_t;
+typedef unsigned long pmdval_t;
+typedef unsigned long pudval_t;
+typedef unsigned long pgdval_t;
+typedef unsigned long pgprotval_t;
+
+typedef union { pteval_t pte; unsigned int pte_low; } pte_t;
+
+#endif /* !__ASSEMBLY__ */
+
+#define SHARED_KERNEL_PMD 0
+#define PAGETABLE_LEVELS 4
+
+/*
+ * PGDIR_SHIFT determines what a top-level page table entry can map
+ */
+#define PGDIR_SHIFT 39
+#define PTRS_PER_PGD 512
+
+/*
+ * 3rd level page
+ */
+#define PUD_SHIFT 30
+#define PTRS_PER_PUD 512
+
+/*
+ * PMD_SHIFT determines the size of the area a middle-level
+ * page table can map
+ */
+#define PMD_SHIFT 21
+#define PTRS_PER_PMD 512
+
+/*
+ * entries per page directory level
+ */
+#define PTRS_PER_PTE 512
+
+#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT)
+#define PMD_MASK (~(PMD_SIZE - 1))
+#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT)
+#define PUD_MASK (~(PUD_SIZE - 1))
+#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
+#define PGDIR_MASK (~(PGDIR_SIZE - 1))
+
+#define MAX_PHYSMEM_BITS 43
+#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
+#define VMALLOC_START _AC(0xffffc20000000000, UL)
+#define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
+#define VMEMMAP_START _AC(0xffffe20000000000, UL)
+#define MODULES_VADDR _AC(0xffffffffa0000000, UL)
+#define MODULES_END _AC(0xffffffffff000000, UL)
+#define MODULES_LEN (MODULES_END - MODULES_VADDR)
+
+#endif /* _ASM_X86_PGTABLE_64_DEFS_H */
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/pgtable_types.h 2011-02-01 14:44:12.000000000 +0100
@@ -0,0 +1,388 @@
+#ifndef _ASM_X86_PGTABLE_DEFS_H
+#define _ASM_X86_PGTABLE_DEFS_H
+
+#include <linux/const.h>
+#include <asm/page_types.h>
+
+#define FIRST_USER_ADDRESS 0
+
+#define _PAGE_BIT_PRESENT 0 /* is present */
+#define _PAGE_BIT_RW 1 /* writeable */
+#define _PAGE_BIT_USER 2 /* userspace addressable */
+#define _PAGE_BIT_PWT 3 /* page write through */
+#define _PAGE_BIT_PCD 4 /* page cache disabled */
+#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */
+#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */
+#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
+#define _PAGE_BIT_PAT 7 /* on 4KB pages */
+#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
+#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
+#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */
+#define _PAGE_BIT_UNUSED3 11
+#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
+#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
+#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
+#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
+
+/* If _PAGE_BIT_PRESENT is clear, we use these: */
+/* - if the user mapped it with PROT_NONE; pte_present gives true */
+#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
+/* - set: nonlinear file mapping, saved PTE; unset:swap */
+#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY
+
+#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
+#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW)
+#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER)
+#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
+#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
+#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
+#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
+#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
+#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
+#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
+#define _PAGE_UNUSED3 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED3)
+#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
+#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
+#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
+#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
+#define __HAVE_ARCH_PTE_SPECIAL
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
+#else
+#define _PAGE_NX (_AT(pteval_t, 0))
+#endif
+
+#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
+#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+
+#ifndef __ASSEMBLY__
+#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002
+extern unsigned int __kernel_page_user;
+#else
+#define __kernel_page_user 0
+#endif
+#endif
+
+#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_DIRTY)
+#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
+ _PAGE_DIRTY | __kernel_page_user)
+
+/* Set of bits not changed in pte_modify */
+#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_CACHE_MASK | _PAGE_IOMAP | \
+ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
+
+/*
+ * PAT settings are part of the hypervisor interface, which sets the
+ * MSR to 0x050100070406 (i.e. WB, WT, UC-, UC, WC, WP [, UC, UC]).
+ */
+#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT | _PAGE_PAT)
+#define _PAGE_CACHE_WB (0)
+#define _PAGE_CACHE_WT (_PAGE_PWT)
+#define _PAGE_CACHE_WC (_PAGE_PAT)
+#define _PAGE_CACHE_WP (_PAGE_PAT | _PAGE_PWT)
+#define _PAGE_CACHE_UC_MINUS (_PAGE_PCD)
+#define _PAGE_CACHE_UC (_PAGE_PCD | _PAGE_PWT)
+
+#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
+#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_NX)
+
+#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \
+ _PAGE_USER | _PAGE_ACCESSED)
+#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_NX)
+#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
+ _PAGE_ACCESSED)
+#define PAGE_COPY PAGE_COPY_NOEXEC
+#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_NX)
+#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
+ _PAGE_ACCESSED)
+
+#define __PAGE_KERNEL_EXEC \
+ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
+#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
+
+#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
+#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
+#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC)
+#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
+#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
+#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
+#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
+#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
+
+#define __PAGE_KERNEL_IO (__PAGE_KERNEL | _PAGE_IOMAP)
+#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE | _PAGE_IOMAP)
+#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS | _PAGE_IOMAP)
+#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC | _PAGE_IOMAP)
+
+#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
+#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
+#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
+#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
+#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC)
+#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
+#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS)
+#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE)
+#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
+#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE)
+#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
+#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
+#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE)
+
+#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO)
+#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE)
+#define PAGE_KERNEL_IO_UC_MINUS __pgprot(__PAGE_KERNEL_IO_UC_MINUS)
+#define PAGE_KERNEL_IO_WC __pgprot(__PAGE_KERNEL_IO_WC)
+
+/* xwr */
+#define __P000 PAGE_NONE
+#define __P001 PAGE_READONLY
+#define __P010 PAGE_COPY
+#define __P011 PAGE_COPY
+#define __P100 PAGE_READONLY_EXEC
+#define __P101 PAGE_READONLY_EXEC
+#define __P110 PAGE_COPY_EXEC
+#define __P111 PAGE_COPY_EXEC
+
+#define __S000 PAGE_NONE
+#define __S001 PAGE_READONLY
+#define __S010 PAGE_SHARED
+#define __S011 PAGE_SHARED
+#define __S100 PAGE_READONLY_EXEC
+#define __S101 PAGE_READONLY_EXEC
+#define __S110 PAGE_SHARED_EXEC
+#define __S111 PAGE_SHARED_EXEC
+
+/*
+ * early identity mapping pte attrib macros.
+ */
+#ifdef CONFIG_X86_64
+#define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
+#else
+/*
+ * For PDE_IDENT_ATTR include USER bit. As the PDE and PTE protection
+ * bits are combined, this will allow user to access the high address mapped
+ * VDSO in the presence of CONFIG_COMPAT_VDSO
+ */
+#define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */
+#define PDE_IDENT_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */
+#define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */
+#endif
+
+#ifdef CONFIG_X86_32
+# include <asm/pgtable_32_types.h>
+#else
+# include "pgtable_64_types.h"
+#endif
+
+#ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+
+/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
+#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
+
+/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */
+#define PTE_FLAGS_MASK (~PTE_PFN_MASK)
+
+typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;
+
+#include <asm/maddr.h>
+
+typedef struct { pgdval_t pgd; } pgd_t;
+
+#define __pgd_ma(x) ((pgd_t) { (x) } )
+static inline pgd_t xen_make_pgd(pgdval_t val)
+{
+ if (likely(val & _PAGE_PRESENT))
+ val = pte_phys_to_machine(val);
+ return (pgd_t) { val };
+}
+
+#define __pgd_val(x) ((x).pgd)
+static inline pgdval_t xen_pgd_val(pgd_t pgd)
+{
+ pgdval_t ret = __pgd_val(pgd);
+#if PAGETABLE_LEVELS == 2 && CONFIG_XEN_COMPAT <= 0x030002
+ if (likely(ret))
+ ret = machine_to_phys(ret) | _PAGE_PRESENT;
+#else
+ if (likely(ret & _PAGE_PRESENT))
+ ret = pte_machine_to_phys(ret);
+#endif
+ return ret;
+}
+
+static inline pgdval_t pgd_flags(pgd_t pgd)
+{
+ return __pgd_val(pgd) & PTE_FLAGS_MASK;
+}
+
+#if PAGETABLE_LEVELS > 3
+typedef struct { pudval_t pud; } pud_t;
+
+#define __pud_ma(x) ((pud_t) { (x) } )
+static inline pud_t xen_make_pud(pudval_t val)
+{
+ if (likely(val & _PAGE_PRESENT))
+ val = pte_phys_to_machine(val);
+ return (pud_t) { val };
+}
+
+#define __pud_val(x) ((x).pud)
+static inline pudval_t xen_pud_val(pud_t pud)
+{
+ pudval_t ret = __pud_val(pud);
+ if (likely(ret & _PAGE_PRESENT))
+ ret = pte_machine_to_phys(ret);
+ return ret;
+}
+#else
+#include <asm-generic/pgtable-nopud.h>
+
+#define __pud_val(x) __pgd_val((x).pgd)
+static inline pudval_t xen_pud_val(pud_t pud)
+{
+ return xen_pgd_val(pud.pgd);
+}
+#endif
+
+#if PAGETABLE_LEVELS > 2
+typedef struct { pmdval_t pmd; } pmd_t;
+
+#define __pmd_ma(x) ((pmd_t) { (x) } )
+static inline pmd_t xen_make_pmd(pmdval_t val)
+{
+ if (likely(val & _PAGE_PRESENT))
+ val = pte_phys_to_machine(val);
+ return (pmd_t) { val };
+}
+
+#define __pmd_val(x) ((x).pmd)
+static inline pmdval_t xen_pmd_val(pmd_t pmd)
+{
+ pmdval_t ret = __pmd_val(pmd);
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (likely(ret))
+ ret = pte_machine_to_phys(ret) | _PAGE_PRESENT;
+#else
+ if (likely(ret & _PAGE_PRESENT))
+ ret = pte_machine_to_phys(ret);
+#endif
+ return ret;
+}
+#else
+#include <asm-generic/pgtable-nopmd.h>
+
+#define __pmd_ma(x) ((pmd_t) { .pud.pgd = __pgd_ma(x) } )
+#define __pmd_val(x) __pgd_val((x).pud.pgd)
+static inline pmdval_t xen_pmd_val(pmd_t pmd)
+{
+ return xen_pgd_val(pmd.pud.pgd);
+}
+#endif
+
+static inline pudval_t pud_flags(pud_t pud)
+{
+ return __pud_val(pud) & PTE_FLAGS_MASK;
+}
+
+static inline pmdval_t pmd_flags(pmd_t pmd)
+{
+ return __pmd_val(pmd) & PTE_FLAGS_MASK;
+}
+
+#define __pte_ma(x) ((pte_t) { .pte = (x) } )
+static inline pte_t xen_make_pte(pteval_t val)
+{
+ if (likely((val & (_PAGE_PRESENT|_PAGE_IOMAP)) == _PAGE_PRESENT))
+ val = pte_phys_to_machine(val);
+ return (pte_t) { .pte = val };
+}
+
+#define __pte_val(x) ((x).pte)
+static inline pteval_t xen_pte_val(pte_t pte)
+{
+ pteval_t ret = __pte_val(pte);
+ if (likely((pte.pte_low & (_PAGE_PRESENT|_PAGE_IOMAP)) == _PAGE_PRESENT))
+ ret = pte_machine_to_phys(ret);
+ return ret;
+}
+
+static inline pteval_t pte_flags(pte_t pte)
+{
+ return __pte_val(pte) & PTE_FLAGS_MASK;
+}
+
+#define pgprot_val(x) ((x).pgprot)
+#define __pgprot(x) ((pgprot_t) { (x) } )
+
+
+typedef struct page *pgtable_t;
+
+extern pteval_t __supported_pte_mask;
+extern int nx_enabled;
+extern void set_nx(void);
+
+#define pgprot_writecombine pgprot_writecombine
+extern pgprot_t pgprot_writecombine(pgprot_t prot);
+
+#ifndef CONFIG_XEN
+/* Indicate that x86 has its own track and untrack pfn vma functions */
+#define __HAVE_PFNMAP_TRACKING
+#endif
+
+#define __HAVE_PHYS_MEM_ACCESS_PROT
+struct file;
+pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
+ unsigned long size, pgprot_t vma_prot);
+int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
+ unsigned long size, pgprot_t *vma_prot);
+
+/* Install a pte for a particular vaddr in kernel space. */
+void set_pte_vaddr(unsigned long vaddr, pte_t pte);
+
+#ifndef CONFIG_XEN
+extern void native_pagetable_setup_start(pgd_t *base);
+extern void native_pagetable_setup_done(pgd_t *base);
+#else
+static inline void xen_pagetable_setup_start(pgd_t *base) {}
+static inline void xen_pagetable_setup_done(pgd_t *base) {}
+#endif
+
+struct seq_file;
+extern void arch_report_meminfo(struct seq_file *m);
+
+enum {
+ PG_LEVEL_NONE,
+ PG_LEVEL_4K,
+ PG_LEVEL_2M,
+ PG_LEVEL_1G,
+ PG_LEVEL_NUM
+};
+
+#ifdef CONFIG_PROC_FS
+extern void update_page_count(int level, unsigned long pages);
+#else
+static inline void update_page_count(int level, unsigned long pages) { }
+#endif
+
+/*
+ * Helper function that returns the kernel pagetable entry controlling
+ * the virtual address 'address'. NULL means no pagetable entry present.
+ * NOTE: the return type is pte_t but if the pmd is PSE then we return it
+ * as a pte too.
+ */
+extern pte_t *lookup_address(unsigned long address, unsigned int *level);
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* _ASM_X86_PGTABLE_DEFS_H */
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/processor.h 2011-03-03 16:45:38.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/processor.h 2011-03-03 16:45:53.000000000 +0100
@@ -16,6 +16,7 @@ struct mm_struct;
#include <asm/cpufeature.h>
#include <asm/system.h>
#include <asm/page.h>
+#include <asm/pgtable_types.h>
#include <asm/percpu.h>
#include <asm/msr.h>
#include <asm/desc_defs.h>
@@ -78,10 +79,10 @@ struct cpuinfo_x86 {
#endif
#else
/* Number of 4K pages in DTLB/ITLB combined(in pages): */
- int x86_tlbsize;
+ int x86_tlbsize;
+#endif
__u8 x86_virt_bits;
__u8 x86_phys_bits;
-#endif
#ifndef CONFIG_XEN
/* CPUID returned core id bits: */
__u8 x86_coreid_bits;
@@ -101,7 +102,7 @@ struct cpuinfo_x86 {
#ifndef CONFIG_XEN
#ifdef CONFIG_SMP
/* cpus sharing the last level cache: */
- cpumask_t llc_shared_map;
+ cpumask_var_t llc_shared_map;
#endif
/* cpuid returned max cores value: */
u16 x86_max_cores;
@@ -148,7 +149,7 @@ extern struct cpuinfo_x86 new_cpu_data;
extern __u32 cleared_cpu_caps[NCAPINTS];
#ifdef CONFIG_SMP
-DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
+DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
#define cpu_data(cpu) per_cpu(cpu_info, cpu)
#define current_cpu_data __get_cpu_var(cpu_info)
#else
@@ -261,7 +262,6 @@ struct x86_hw_tss {
#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
#define INVALID_IO_BITMAP_OFFSET 0x8000
-#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
#ifndef CONFIG_X86_NO_TSS
struct tss_struct {
@@ -277,11 +277,6 @@ struct tss_struct {
* be within the limit.
*/
unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
- /*
- * Cache the current maximum and the last task that used the bitmap:
- */
- unsigned long io_bitmap_max;
- struct thread_struct *io_bitmap_owner;
/*
* .. and then another 0x100 bytes for the emergency kernel stack:
@@ -290,7 +285,7 @@ struct tss_struct {
} ____cacheline_aligned;
-DECLARE_PER_CPU(struct tss_struct, init_tss);
+DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss);
/*
* Save the original ist values for checking stack pointers during debugging
@@ -373,6 +368,11 @@ struct i387_soft_struct {
u32 entry_eip;
};
+struct ymmh_struct {
+ /* 16 * 16 bytes for each YMMH-reg = 256 bytes */
+ u32 ymmh_space[64];
+};
+
struct xsave_hdr_struct {
u64 xstate_bv;
u64 reserved1[2];
@@ -382,6 +382,7 @@ struct xsave_hdr_struct {
struct xsave_struct {
struct i387_fxsave_struct i387;
struct xsave_hdr_struct xsave_hdr;
+ struct ymmh_struct ymmh;
/* new processor state extensions will go here */
} __attribute__ ((packed, aligned (64)));
@@ -392,11 +393,37 @@ union thread_xstate {
struct xsave_struct xsave;
};
-#if defined(CONFIG_X86_64) && !defined(CONFIG_X86_NO_TSS)
+#ifdef CONFIG_X86_64
+#ifndef CONFIG_X86_NO_TSS
DECLARE_PER_CPU(struct orig_ist, orig_ist);
#endif
-extern void print_cpu_info(struct cpuinfo_x86 *);
+union irq_stack_union {
+ char irq_stack[IRQ_STACK_SIZE];
+ /*
+ * GCC hardcodes the stack canary as %gs:40. Since the
+ * irq_stack is the object at %gs:0, we reserve the bottom
+ * 48 bytes of the irq stack for the canary.
+ */
+ struct {
+ char gs_base[40];
+ unsigned long stack_canary;
+ };
+};
+
+DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union);
+DECLARE_INIT_PER_CPU(irq_stack_union);
+
+DECLARE_PER_CPU(char *, irq_stack_ptr);
+DECLARE_PER_CPU(unsigned int, irq_count);
+extern unsigned long kernel_eflags;
+extern asmlinkage void ignore_sysret(void);
+#else /* X86_64 */
+#ifdef CONFIG_CC_STACKPROTECTOR
+DECLARE_PER_CPU(unsigned long, stack_canary);
+#endif
+#endif /* X86_64 */
+
extern unsigned int xstate_size;
extern void free_thread_xstate(struct task_struct *);
extern struct kmem_cache *task_xstate_cachep;
@@ -669,6 +696,7 @@ static inline void __sti_mwait(unsigned
extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
extern void select_idle_routine(const struct cpuinfo_x86 *c);
+extern void init_c1e_mask(void);
extern unsigned long boot_option_idle_override;
extern unsigned long idle_halt;
@@ -706,9 +734,9 @@ extern int sysenter_setup(void);
extern struct desc_ptr early_gdt_descr;
extern void cpu_set_gdt(int);
-extern void switch_to_new_gdt(void);
+extern void switch_to_new_gdt(int);
+extern void load_percpu_segment(int);
extern void cpu_init(void);
-extern void init_gdt(int cpu);
static inline unsigned long get_debugctlmsr(void)
{
@@ -793,6 +821,7 @@ static inline void spin_lock_prefetch(co
* User space process size: 3GB (default).
*/
#define TASK_SIZE PAGE_OFFSET
+#define TASK_SIZE_MAX TASK_SIZE
#define STACK_TOP TASK_SIZE
#define STACK_TOP_MAX STACK_TOP
@@ -850,7 +879,7 @@ extern unsigned long thread_saved_pc(str
/*
* User space process size. 47bits minus one guard page.
*/
-#define TASK_SIZE64 ((1UL << 47) - PAGE_SIZE)
+#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE)
/* This decides where the kernel will search for a free chunk of vm
* space during mmap's.
@@ -859,12 +888,12 @@ extern unsigned long thread_saved_pc(str
0xc0000000 : 0xFFFFe000)
#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \
- IA32_PAGE_OFFSET : TASK_SIZE64)
+ IA32_PAGE_OFFSET : TASK_SIZE_MAX)
#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \
- IA32_PAGE_OFFSET : TASK_SIZE64)
+ IA32_PAGE_OFFSET : TASK_SIZE_MAX)
#define STACK_TOP TASK_SIZE
-#define STACK_TOP_MAX TASK_SIZE64
+#define STACK_TOP_MAX TASK_SIZE_MAX
#define INIT_THREAD { \
.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/smp.h 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/smp.h 2011-02-01 14:44:12.000000000 +0100
@@ -15,34 +15,8 @@
# include <asm/io_apic.h>
# endif
#endif
-#include <asm/pda.h>
#include <asm/thread_info.h>
-
-#ifdef CONFIG_X86_64
-
-#define cpu_callin_mask cpu_possible_mask
-#define cpu_callout_mask cpu_possible_mask
-extern cpumask_var_t cpu_initialized_mask;
-extern cpumask_var_t cpu_sibling_setup_mask;
-
-#else /* CONFIG_X86_32 */
-
-#define cpu_callin_map cpu_possible_map
-#define cpu_callout_map cpu_possible_map
-extern cpumask_t cpu_initialized;
-extern cpumask_t cpu_sibling_setup_map;
-
-#define cpu_callin_mask ((struct cpumask *)&cpu_callin_map)
-#define cpu_callout_mask ((struct cpumask *)&cpu_callout_map)
-#define cpu_initialized_mask ((struct cpumask *)&cpu_initialized)
-#define cpu_sibling_setup_mask ((struct cpumask *)&cpu_sibling_setup_map)
-
-#endif /* CONFIG_X86_32 */
-
-extern void (*mtrr_hook)(void);
-extern void zap_low_mappings(void);
-
-extern int __cpuinit get_local_pda(int cpu);
+#include <asm/cpumask.h>
extern unsigned int num_processors;
@@ -51,9 +25,7 @@ DECLARE_PER_CPU(cpumask_t, cpu_sibling_m
DECLARE_PER_CPU(cpumask_t, cpu_core_map);
DECLARE_PER_CPU(u16, cpu_llc_id);
#endif
-#ifdef CONFIG_X86_32
DECLARE_PER_CPU(int, cpu_number);
-#endif
static inline const struct cpumask *cpu_sibling_mask(int cpu)
{
@@ -152,9 +124,10 @@ static inline void arch_send_call_functi
smp_ops.send_call_func_single_ipi(cpu);
}
-static inline void arch_send_call_function_ipi(cpumask_t mask)
+#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask
+static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
{
- smp_ops.send_call_func_ipi(&mask);
+ smp_ops.send_call_func_ipi(mask);
}
void cpu_disable_common(void);
@@ -167,6 +140,9 @@ void native_cpu_die(unsigned int cpu);
void native_play_dead(void);
void play_dead_common(void);
+void smp_store_cpu_info(int id);
+#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
+
#else /* CONFIG_XEN */
extern int __cpu_disable(void);
@@ -179,26 +155,17 @@ void xen_send_call_func_single_ipi(int c
#define smp_send_stop xen_smp_send_stop
#define smp_send_reschedule xen_smp_send_reschedule
#define arch_send_call_function_single_ipi xen_send_call_func_single_ipi
-#define arch_send_call_function_ipi(m) xen_send_call_func_ipi(&(m))
+#define arch_send_call_function_ipi_mask xen_send_call_func_ipi
void play_dead(void);
#endif /* CONFIG_XEN */
-extern void prefill_possible_map(void);
-
-void smp_store_cpu_info(int id);
-#define cpu_physical_id(cpu) (cpu)
-
/* We don't mark CPUs online until __cpu_up(), so we need another measure */
static inline int num_booting_cpus(void)
{
return cpumask_weight(cpu_callout_mask);
}
-#else
-static inline void prefill_possible_map(void)
-{
-}
#endif /* CONFIG_SMP */
extern unsigned disabled_cpus __cpuinitdata;
@@ -209,11 +176,11 @@ extern unsigned disabled_cpus __cpuinitd
* from the initial startup. We map APIC_BASE very early in page_setup(),
* so this is correct in the x86 case.
*/
-#define raw_smp_processor_id() (x86_read_percpu(cpu_number))
+#define raw_smp_processor_id() (percpu_read(cpu_number))
#define safe_smp_processor_id() smp_processor_id()
#elif defined(CONFIG_X86_64_SMP)
-#define raw_smp_processor_id() read_pda(cpunumber)
+#define raw_smp_processor_id() (percpu_read(cpu_number))
#define stack_smp_processor_id() \
({ \
@@ -223,10 +190,6 @@ extern unsigned disabled_cpus __cpuinitd
})
#define safe_smp_processor_id() smp_processor_id()
-#else /* !CONFIG_X86_32_SMP && !CONFIG_X86_64_SMP */
-#define cpu_physical_id(cpu) 0
-#define safe_smp_processor_id() 0
-#define stack_smp_processor_id() 0
#endif
#ifdef CONFIG_X86_LOCAL_APIC
@@ -238,28 +201,9 @@ static inline int logical_smp_processor_
return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
}
-#include <mach_apicdef.h>
-static inline unsigned int read_apic_id(void)
-{
- unsigned int reg;
-
- reg = *(u32 *)(APIC_BASE + APIC_ID);
-
- return GET_APIC_ID(reg);
-}
#endif
-
-# if defined(APIC_DEFINITION) || defined(CONFIG_X86_64)
extern int hard_smp_processor_id(void);
-# else
-#include <mach_apicdef.h>
-static inline int hard_smp_processor_id(void)
-{
- /* we don't want to mark this access volatile - bad code generation */
- return read_apic_id();
-}
-# endif /* APIC_DEFINITION */
#else /* CONFIG_X86_LOCAL_APIC */
@@ -269,11 +213,5 @@ static inline int hard_smp_processor_id(
#endif /* CONFIG_X86_LOCAL_APIC */
-#ifdef CONFIG_X86_HAS_BOOT_CPU_ID
-extern unsigned char boot_cpu_id;
-#else
-#define boot_cpu_id 0
-#endif
-
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_SMP_H */
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/spinlock.h 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/spinlock.h 2011-02-01 14:44:12.000000000 +0100
@@ -255,40 +255,18 @@ static __always_inline void __ticket_spi
static inline int xen_spinlock_init(unsigned int cpu) { return 0; }
static inline void xen_spinlock_cleanup(unsigned int cpu) {}
-/*
- * Define virtualization-friendly old-style lock byte lock, for use in
- * pv_lock_ops if desired.
- *
- * This differs from the pre-2.6.24 spinlock by always using xchgb
- * rather than decb to take the lock; this allows it to use a
- * zero-initialized lock structure. It also maintains a 1-byte
- * contention counter, so that we can implement
- * __byte_spin_is_contended.
- */
-struct __byte_spinlock {
- s8 lock;
-#if NR_CPUS < 256
- s8 spinners;
-#else
-#error NR_CPUS >= 256 support not implemented
-#endif
-};
-
static inline int __byte_spin_is_locked(raw_spinlock_t *lock)
{
- struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
- return bl->lock != 0;
+ return lock->lock != 0;
}
static inline int __byte_spin_is_contended(raw_spinlock_t *lock)
{
- struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
- return bl->spinners != 0;
+ return lock->spinners != 0;
}
static inline void __byte_spin_lock(raw_spinlock_t *lock)
{
- struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
s8 val = 1;
asm("1: xchgb %1, %0\n"
@@ -301,27 +279,25 @@ static inline void __byte_spin_lock(raw_
" " LOCK_PREFIX "decb %2\n"
" jmp 1b\n"
"3:"
- : "+m" (bl->lock), "+q" (val), "+m" (bl->spinners): : "memory");
+ : "+m" (lock->lock), "+q" (val), "+m" (lock->spinners): : "memory");
}
#define __byte_spin_lock_flags(lock, flags) __byte_spin_lock(lock)
static inline int __byte_spin_trylock(raw_spinlock_t *lock)
{
- struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
u8 old = 1;
asm("xchgb %1,%0"
- : "+m" (bl->lock), "+q" (old) : : "memory");
+ : "+m" (lock->lock), "+q" (old) : : "memory");
return old == 0;
}
static inline void __byte_spin_unlock(raw_spinlock_t *lock)
{
- struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
smp_wmb();
- bl->lock = 0;
+ lock->lock = 0;
}
#define __raw_spin(n) __byte_spin_##n
@@ -422,8 +398,7 @@ static inline int __raw_read_trylock(raw
{
atomic_t *count = (atomic_t *)lock;
- atomic_dec(count);
- if (atomic_read(count) >= 0)
+ if (atomic_dec_return(count) >= 0)
return 1;
atomic_inc(count);
return 0;
@@ -450,6 +425,9 @@ static inline void __raw_write_unlock(ra
: "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory");
}
+#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock)
+#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock)
+
#define _raw_spin_relax(lock) cpu_relax()
#define _raw_read_relax(lock) cpu_relax()
#define _raw_write_relax(lock) cpu_relax()
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/spinlock_types.h 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/spinlock_types.h 2011-02-01 14:44:12.000000000 +0100
@@ -26,6 +26,20 @@ typedef union {
# define TICKET_SHIFT 16
u16 cur, seq;
#endif
+#else
+/*
+ * This differs from the pre-2.6.24 spinlock by always using xchgb
+ * rather than decb to take the lock; this allows it to use a
+ * zero-initialized lock structure. It also maintains a 1-byte
+ * contention counter, so that we can implement
+ * __byte_spin_is_contended.
+ */
+ u8 lock;
+#if CONFIG_NR_CPUS < 256
+ u8 spinners;
+#else
+# error NR_CPUS >= 256 not implemented
+#endif
#endif
};
} raw_spinlock_t;
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/system.h 2011-03-03 16:05:49.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/system.h 2011-03-03 16:06:47.000000000 +0100
@@ -21,9 +21,24 @@
struct task_struct; /* one of the stranger aspects of C forward declarations */
struct task_struct *__switch_to(struct task_struct *prev,
struct task_struct *next);
+void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p);
#ifdef CONFIG_X86_32
+#ifdef CONFIG_CC_STACKPROTECTOR
+#define __switch_canary \
+ "movl %P[task_canary](%[next]), %%ebx\n\t" \
+ "movl %%ebx, "__percpu_arg([stack_canary])"\n\t"
+#define __switch_canary_oparam \
+ , [stack_canary] "=m" (per_cpu_var(stack_canary))
+#define __switch_canary_iparam \
+ , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
+#else /* CC_STACKPROTECTOR */
+#define __switch_canary
+#define __switch_canary_oparam
+#define __switch_canary_iparam
+#endif /* CC_STACKPROTECTOR */
+
/*
* Saving eflags is important. It switches not only IOPL between tasks,
* it also protects other tasks from NT leaking through sysenter etc.
@@ -45,6 +60,7 @@ do { \
"movl %[next_sp],%%esp\n\t" /* restore ESP */ \
"movl $1f,%[prev_ip]\n\t" /* save EIP */ \
"pushl %[next_ip]\n\t" /* restore EIP */ \
+ __switch_canary \
"jmp __switch_to\n" /* regparm call */ \
"1:\t" \
"popl %%ebp\n\t" /* restore EBP */ \
@@ -59,6 +75,8 @@ do { \
"=b" (ebx), "=c" (ecx), "=d" (edx), \
"=S" (esi), "=D" (edi) \
\
+ __switch_canary_oparam \
+ \
/* input parameters: */ \
: [next_sp] "m" (next->thread.sp), \
[next_ip] "m" (next->thread.ip), \
@@ -67,6 +85,8 @@ do { \
[prev] "a" (prev), \
[next] "d" (next) \
\
+ __switch_canary_iparam \
+ \
: /* reloaded segment registers */ \
"memory"); \
} while (0)
@@ -89,27 +109,44 @@ do { \
, "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
"r12", "r13", "r14", "r15"
+#ifdef CONFIG_CC_STACKPROTECTOR
+#define __switch_canary \
+ "movq %P[task_canary](%%rsi),%%r8\n\t" \
+ "movq %%r8,"__percpu_arg([gs_canary])"\n\t"
+#define __switch_canary_oparam \
+ , [gs_canary] "=m" (per_cpu_var(irq_stack_union.stack_canary))
+#define __switch_canary_iparam \
+ , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
+#else /* CC_STACKPROTECTOR */
+#define __switch_canary
+#define __switch_canary_oparam
+#define __switch_canary_iparam
+#endif /* CC_STACKPROTECTOR */
+
/* Save restore flags to clear handle leaking NT */
#define switch_to(prev, next, last) \
- asm volatile(SAVE_CONTEXT \
+ asm volatile(SAVE_CONTEXT \
"movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
"movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
"call __switch_to\n\t" \
".globl thread_return\n" \
"thread_return:\n\t" \
- "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \
+ "movq "__percpu_arg([current_task])",%%rsi\n\t" \
+ __switch_canary \
"movq %P[thread_info](%%rsi),%%r8\n\t" \
- LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \
"movq %%rax,%%rdi\n\t" \
- "jc ret_from_fork\n\t" \
+ "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \
+ "jnz ret_from_fork\n\t" \
RESTORE_CONTEXT \
: "=a" (last) \
+ __switch_canary_oparam \
: [next] "S" (next), [prev] "D" (prev), \
[threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \
[ti_flags] "i" (offsetof(struct thread_info, flags)), \
- [tif_fork] "i" (TIF_FORK), \
+ [_tif_fork] "i" (_TIF_FORK), \
[thread_info] "i" (offsetof(struct task_struct, stack)), \
- [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
+ [current_task] "m" (per_cpu_var(current_task)) \
+ __switch_canary_iparam \
: "memory", "cc" __EXTRA_CLOBBER)
#endif
@@ -168,6 +205,25 @@ extern void xen_load_gs_index(unsigned);
#define savesegment(seg, value) \
asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
+/*
+ * x86_32 user gs accessors.
+ */
+#ifdef CONFIG_X86_32
+#ifdef CONFIG_X86_32_LAZY_GS
+#define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;})
+#define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v))
+#define task_user_gs(tsk) ((tsk)->thread.gs)
+#define lazy_save_gs(v) savesegment(gs, (v))
+#define lazy_load_gs(v) loadsegment(gs, (v))
+#else /* X86_32_LAZY_GS */
+#define get_user_gs(regs) (u16)((regs)->gs)
+#define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0)
+#define task_user_gs(tsk) (task_pt_regs(tsk)->gs)
+#define lazy_save_gs(v) do { } while (0)
+#define lazy_load_gs(v) do { } while (0)
+#endif /* X86_32_LAZY_GS */
+#endif /* X86_32 */
+
static inline unsigned long get_limit(unsigned long segment)
{
unsigned long __limit;
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/tlbflush.h 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/tlbflush.h 2011-02-01 14:44:12.000000000 +0100
@@ -86,21 +86,20 @@ static inline void flush_tlb_range(struc
flush_tlb_mm(vma->vm_mm);
}
+#ifndef CONFIG_XEN
#define TLBSTATE_OK 1
#define TLBSTATE_LAZY 2
-#ifdef CONFIG_X86_32
struct tlb_state {
struct mm_struct *active_mm;
int state;
- char __cacheline_padding[L1_CACHE_BYTES-8];
};
-DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
+DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
-void reset_lazy_tlbstate(void);
-#else
static inline void reset_lazy_tlbstate(void)
{
+ percpu_write(cpu_tlbstate.state, 0);
+ percpu_write(cpu_tlbstate.active_mm, &init_mm);
}
#endif
@@ -112,4 +111,6 @@ static inline void flush_tlb_kernel_rang
flush_tlb_all();
}
+extern void zap_low_mappings(void);
+
#endif /* _ASM_X86_TLBFLUSH_H */
--- head-2011-03-17.orig/arch/x86/kernel/Makefile 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/Makefile 2011-02-01 14:44:12.000000000 +0100
@@ -115,7 +115,6 @@ obj-$(CONFIG_X86_XEN) += fixup.o
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
- obj-$(CONFIG_X86_XEN_GENAPIC) += genapic_64.o genapic_xen_64.o
obj-$(CONFIG_AUDIT) += audit_64.o
obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
@@ -125,11 +124,10 @@ ifeq ($(CONFIG_X86_64),y)
obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
obj-y += vsmp_64.o
- obj-$(CONFIG_XEN) += nmi.o
time_64-$(CONFIG_XEN) += time_32.o
endif
-disabled-obj-$(CONFIG_XEN) := %_uv.o crash.o early-quirks.o genx2apic_%.o \
- hpet.o i8253.o i8259.o irqinit_$(BITS).o pci-swiotlb_64.o reboot.o \
- smpboot.o tlb_$(BITS).o tsc.o tsc_sync.o uv_%.o vsmp_64.o
+disabled-obj-$(CONFIG_XEN) := %_uv.o crash.o early-quirks.o hpet.o i8253.o \
+ i8259.o irqinit_$(BITS).o pci-swiotlb.o reboot.o smpboot.o tsc.o \
+ tsc_sync.o uv_%.o vsmp_64.o
disabled-obj-$(CONFIG_XEN_UNPRIVILEGED_GUEST) += probe_roms_32.o
--- head-2011-03-17.orig/arch/x86/kernel/acpi/boot.c 2011-03-11 10:59:30.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/acpi/boot.c 2011-03-11 10:59:49.000000000 +0100
@@ -162,11 +162,6 @@ char *__init __acpi_map_table(unsigned l
if (!phys || !size)
return NULL;
-#ifdef CONFIG_XEN
- if (phys + size <= (NR_FIX_ISAMAPS << PAGE_SHIFT))
- return isa_bus_to_virt(phys);
-#endif
-
return early_ioremap(phys, size);
}
void __init __acpi_unmap_table(char *map, unsigned long size)
@@ -198,8 +193,10 @@ static int __init acpi_parse_madt(struct
madt->address);
}
+#ifndef CONFIG_XEN
default_acpi_madt_oem_check(madt->header.oem_id,
madt->header.oem_table_id);
+#endif
return 0;
}
--- head-2011-03-17.orig/arch/x86/kernel/acpi/sleep-xen.c 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/acpi/sleep-xen.c 2011-02-01 14:44:12.000000000 +0100
@@ -104,6 +104,7 @@ int acpi_save_state_mem(void)
stack_start.sp = temp_stack + sizeof(temp_stack);
early_gdt_descr.address =
(unsigned long)get_cpu_gdt_table(smp_processor_id());
+ initial_gs = per_cpu_offset(smp_processor_id());
#endif
initial_code = (unsigned long)wakeup_long64;
saved_magic = 0x123456789abcdef0;
--- head-2011-03-17.orig/arch/x86/kernel/apic/Makefile 2011-03-17 14:35:45.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/apic/Makefile 2011-02-01 14:44:12.000000000 +0100
@@ -19,3 +19,9 @@ obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o
obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
obj-$(CONFIG_X86_ES7000) += es7000_32.o
obj-$(CONFIG_X86_SUMMIT) += summit_32.o
+
+obj-$(CONFIG_XEN) += nmi.o
+
+probe_64-$(CONFIG_XEN) := probe_32.o
+
+disabled-obj-$(CONFIG_XEN) := apic_flat_$(BITS).o
--- head-2011-03-17.orig/arch/x86/kernel/apic/apic-xen.c 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/apic/apic-xen.c 2011-02-01 14:44:12.000000000 +0100
@@ -9,6 +9,8 @@
#include <asm/proto.h>
#include <asm/apic.h>
+unsigned int num_processors;
+
/*
* Debug level, exported for io_apic.c
*/
--- head-2011-03-17.orig/arch/x86/kernel/apic/io_apic-xen.c 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/apic/io_apic-xen.c 2011-02-01 14:44:12.000000000 +0100
@@ -1,7 +1,7 @@
/*
* Intel IO-APIC support for multi-Pentium hosts.
*
- * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
+ * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
*
* Many thanks to Stig Venaas for trying out countless experimental
* patches and reporting/debugging problems patiently!
@@ -42,6 +42,7 @@
#include <asm/idle.h>
#include <asm/io.h>
#include <asm/smp.h>
+#include <asm/cpu.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/acpi.h>
@@ -51,9 +52,7 @@
#include <asm/nmi.h>
#include <asm/setup.h>
-#include <mach_ipi.h>
-#include <mach_apic.h>
-#include <mach_apicdef.h>
+#include <asm/apic.h>
#ifdef CONFIG_XEN
#include <xen/interface/xen.h>
@@ -87,11 +86,11 @@ static DEFINE_SPINLOCK(vector_lock);
int nr_ioapic_registers[MAX_IO_APICS];
/* I/O APIC entries */
-struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
+struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
int nr_ioapics;
/* MP IRQ source entries */
-struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
/* # of MP IRQ source entries */
int mp_irq_entries;
@@ -104,10 +103,19 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BU
int skip_ioapic_setup;
+static void __init _arch_disable_smp_support(void)
+{
+#ifdef CONFIG_PCI
+ noioapicquirk = 1;
+ noioapicreroute = -1;
+#endif
+ skip_ioapic_setup = 1;
+}
+
static int __init parse_noapic(char *str)
{
/* disable IO-APIC */
- disable_ioapic_setup();
+ _arch_disable_smp_support();
return 0;
}
early_param("noapic", parse_noapic);
@@ -362,7 +370,7 @@ set_extra_move_desc(struct irq_desc *des
if (!cfg->move_in_progress) {
/* it means that domain is not changed */
- if (!cpumask_intersects(&desc->affinity, mask))
+ if (!cpumask_intersects(desc->affinity, mask))
cfg->move_desc_pending = 1;
}
}
@@ -387,12 +395,20 @@ struct io_apic {
unsigned int index;
unsigned int unused[3];
unsigned int data;
+ unsigned int unused2[11];
+ unsigned int eoi;
};
static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
{
return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
- + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
+ + (mp_ioapics[idx].apicaddr & ~PAGE_MASK);
+}
+
+static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
+{
+ struct io_apic __iomem *io_apic = io_apic_base(apic);
+ writel(vector, &io_apic->eoi);
}
#endif /* !CONFIG_XEN */
@@ -406,7 +422,7 @@ static inline unsigned int io_apic_read(
struct physdev_apic apic_op;
int ret;
- apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
+ apic_op.apic_physbase = mp_ioapics[apic].apicaddr;
apic_op.reg = reg;
ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
if (ret)
@@ -424,7 +440,7 @@ static inline void io_apic_write(unsigne
#else
struct physdev_apic apic_op;
- apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
+ apic_op.apic_physbase = mp_ioapics[apic].apicaddr;
apic_op.reg = reg;
apic_op.value = value;
WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
@@ -512,7 +528,7 @@ __ioapic_write_entry(int apic, int pin,
io_apic_write(apic, 0x10 + 2*pin, eu.w1);
}
-static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
{
unsigned long flags;
spin_lock_irqsave(&ioapic_lock, flags);
@@ -548,11 +564,11 @@ static void send_cleanup_vector(struct i
for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
cfg->move_cleanup_count++;
for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
- send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
+ apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
} else {
cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
- send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+ apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
free_cpumask_var(cleanup_mask);
}
cfg->move_in_progress = 0;
@@ -573,16 +589,12 @@ static void __target_IO_APIC_irq(unsigne
apic = entry->apic;
pin = entry->pin;
-#ifdef CONFIG_INTR_REMAP
/*
* With interrupt-remapping, destination information comes
* from interrupt-remapping table entry.
*/
if (!irq_remapped(irq))
io_apic_write(apic, 0x11 + pin*2, dest);
-#else
- io_apic_write(apic, 0x11 + pin*2, dest);
-#endif
reg = io_apic_read(apic, 0x10 + pin*2);
reg &= ~IO_APIC_REDIR_VECTOR_MASK;
reg |= vector;
@@ -597,8 +609,9 @@ static int
assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
/*
- * Either sets desc->affinity to a valid value, and returns cpu_mask_to_apicid
- * of that, or returns BAD_APICID and leaves desc->affinity untouched.
+ * Either sets desc->affinity to a valid value, and returns
+ * ->cpu_mask_to_apicid of that, or returns BAD_APICID and
+ * leaves desc->affinity untouched.
*/
static unsigned int
set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
@@ -614,9 +627,12 @@ set_desc_affinity(struct irq_desc *desc,
if (assign_irq_vector(irq, cfg, mask))
return BAD_APICID;
- cpumask_and(&desc->affinity, cfg->domain, mask);
+	/* check that before desc->affinity gets updated */
set_extra_move_desc(desc, mask);
- return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask);
+
+ cpumask_copy(desc->affinity, mask);
+
+ return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
}
static void
@@ -830,23 +846,6 @@ static void clear_IO_APIC (void)
for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
clear_IO_APIC_pin(apic, pin);
}
-
-#if !defined(CONFIG_SMP) && defined(CONFIG_X86_32)
-void send_IPI_self(int vector)
-{
- unsigned int cfg;
-
- /*
- * Wait for idle.
- */
- apic_wait_icr_idle();
- cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
- /*
- * Send the IPI. The write to APIC_ICR fires this off.
- */
- apic_write(APIC_ICR, cfg);
-}
-#endif /* !CONFIG_SMP && CONFIG_X86_32*/
#else
#define add_pin_to_irq_cpu(cfg, cpu, apic, pin)
#endif /* !CONFIG_XEN */
@@ -858,8 +857,9 @@ void send_IPI_self(int vector)
*/
#define MAX_PIRQS 8
-static int pirq_entries [MAX_PIRQS];
-static int pirqs_enabled;
+static int pirq_entries[MAX_PIRQS] = {
+ [0 ... MAX_PIRQS - 1] = -1
+};
static int __init ioapic_pirq_setup(char *str)
{
@@ -868,10 +868,6 @@ static int __init ioapic_pirq_setup(char
get_options(str, ARRAY_SIZE(ints), ints);
- for (i = 0; i < MAX_PIRQS; i++)
- pirq_entries[i] = -1;
-
- pirqs_enabled = 1;
apic_printk(APIC_VERBOSE, KERN_INFO
"PIRQ redirection, working around broken MP-BIOS.\n");
max = MAX_PIRQS;
@@ -893,75 +889,106 @@ __setup("pirq=", ioapic_pirq_setup);
#endif /* CONFIG_X86_32 */
#ifdef CONFIG_INTR_REMAP
-/* I/O APIC RTE contents at the OS boot up */
-static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS];
+struct IO_APIC_route_entry **alloc_ioapic_entries(void)
+{
+ int apic;
+ struct IO_APIC_route_entry **ioapic_entries;
+
+ ioapic_entries = kzalloc(sizeof(*ioapic_entries) * nr_ioapics,
+ GFP_ATOMIC);
+ if (!ioapic_entries)
+ return 0;
+
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ ioapic_entries[apic] =
+ kzalloc(sizeof(struct IO_APIC_route_entry) *
+ nr_ioapic_registers[apic], GFP_ATOMIC);
+ if (!ioapic_entries[apic])
+ goto nomem;
+ }
+
+ return ioapic_entries;
+
+nomem:
+ while (--apic >= 0)
+ kfree(ioapic_entries[apic]);
+ kfree(ioapic_entries);
+
+ return 0;
+}
/*
- * Saves and masks all the unmasked IO-APIC RTE's
+ * Saves all the IO-APIC RTE's
*/
-int save_mask_IO_APIC_setup(void)
+int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
{
- union IO_APIC_reg_01 reg_01;
- unsigned long flags;
int apic, pin;
- /*
- * The number of IO-APIC IRQ registers (== #pins):
- */
+ if (!ioapic_entries)
+ return -ENOMEM;
+
for (apic = 0; apic < nr_ioapics; apic++) {
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_01.raw = io_apic_read(apic, 1);
- spin_unlock_irqrestore(&ioapic_lock, flags);
- nr_ioapic_registers[apic] = reg_01.bits.entries+1;
+ if (!ioapic_entries[apic])
+ return -ENOMEM;
+
+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
+ ioapic_entries[apic][pin] =
+ ioapic_read_entry(apic, pin);
}
+ return 0;
+}
+
+/*
+ * Mask all IO APIC entries.
+ */
+void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
+{
+ int apic, pin;
+
+ if (!ioapic_entries)
+ return;
+
for (apic = 0; apic < nr_ioapics; apic++) {
- early_ioapic_entries[apic] =
- kzalloc(sizeof(struct IO_APIC_route_entry) *
- nr_ioapic_registers[apic], GFP_KERNEL);
- if (!early_ioapic_entries[apic])
- goto nomem;
- }
+ if (!ioapic_entries[apic])
+ break;
- for (apic = 0; apic < nr_ioapics; apic++)
for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
struct IO_APIC_route_entry entry;
- entry = early_ioapic_entries[apic][pin] =
- ioapic_read_entry(apic, pin);
+ entry = ioapic_entries[apic][pin];
if (!entry.mask) {
entry.mask = 1;
ioapic_write_entry(apic, pin, entry);
}
}
-
- return 0;
-
-nomem:
- while (apic >= 0)
- kfree(early_ioapic_entries[apic--]);
- memset(early_ioapic_entries, 0,
- ARRAY_SIZE(early_ioapic_entries));
-
- return -ENOMEM;
+ }
}
-void restore_IO_APIC_setup(void)
+/*
+ * Restore IO APIC entries which were saved in ioapic_entries.
+ */
+int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
{
int apic, pin;
+ if (!ioapic_entries)
+ return -ENOMEM;
+
for (apic = 0; apic < nr_ioapics; apic++) {
- if (!early_ioapic_entries[apic])
- break;
+ if (!ioapic_entries[apic])
+ return -ENOMEM;
+
for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
ioapic_write_entry(apic, pin,
- early_ioapic_entries[apic][pin]);
- kfree(early_ioapic_entries[apic]);
- early_ioapic_entries[apic] = NULL;
+ ioapic_entries[apic][pin]);
}
+ return 0;
}
-void reinit_intr_remapped_IO_APIC(int intr_remapping)
+void reinit_intr_remapped_IO_APIC(int intr_remapping,
+ struct IO_APIC_route_entry **ioapic_entries)
+
{
/*
* for now plain restore of previous settings.
@@ -970,7 +997,17 @@ void reinit_intr_remapped_IO_APIC(int in
* table entries. for now, do a plain restore, and wait for
* the setup_IO_APIC_irqs() to do proper initialization.
*/
- restore_IO_APIC_setup();
+ restore_IO_APIC_setup(ioapic_entries);
+}
+
+void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
+{
+ int apic;
+
+ for (apic = 0; apic < nr_ioapics; apic++)
+ kfree(ioapic_entries[apic]);
+
+ kfree(ioapic_entries);
}
#endif
@@ -982,10 +1019,10 @@ static int find_irq_entry(int apic, int
int i;
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mp_irqtype == type &&
- (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
- mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
- mp_irqs[i].mp_dstirq == pin)
+ if (mp_irqs[i].irqtype == type &&
+ (mp_irqs[i].dstapic == mp_ioapics[apic].apicid ||
+ mp_irqs[i].dstapic == MP_APIC_ALL) &&
+ mp_irqs[i].dstirq == pin)
return i;
return -1;
@@ -1000,13 +1037,13 @@ static int __init find_isa_irq_pin(int i
int i;
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mp_srcbus;
+ int lbus = mp_irqs[i].srcbus;
if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].mp_irqtype == type) &&
- (mp_irqs[i].mp_srcbusirq == irq))
+ (mp_irqs[i].irqtype == type) &&
+ (mp_irqs[i].srcbusirq == irq))
- return mp_irqs[i].mp_dstirq;
+ return mp_irqs[i].dstirq;
}
return -1;
}
@@ -1016,17 +1053,17 @@ static int __init find_isa_irq_apic(int
int i;
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mp_srcbus;
+ int lbus = mp_irqs[i].srcbus;
if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].mp_irqtype == type) &&
- (mp_irqs[i].mp_srcbusirq == irq))
+ (mp_irqs[i].irqtype == type) &&
+ (mp_irqs[i].srcbusirq == irq))
break;
}
if (i < mp_irq_entries) {
int apic;
for(apic = 0; apic < nr_ioapics; apic++) {
- if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
+ if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic)
return apic;
}
}
@@ -1052,23 +1089,23 @@ int IO_APIC_get_PCI_irq_vector(int bus,
return -1;
}
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mp_srcbus;
+ int lbus = mp_irqs[i].srcbus;
for (apic = 0; apic < nr_ioapics; apic++)
- if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
- mp_irqs[i].mp_dstapic == MP_APIC_ALL)
+ if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
+ mp_irqs[i].dstapic == MP_APIC_ALL)
break;
if (!test_bit(lbus, mp_bus_not_pci) &&
- !mp_irqs[i].mp_irqtype &&
+ !mp_irqs[i].irqtype &&
(bus == lbus) &&
- (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
- int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq);
+ (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
+ int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
if (!(apic || IO_APIC_IRQ(irq)))
continue;
- if (pin == (mp_irqs[i].mp_srcbusirq & 3))
+ if (pin == (mp_irqs[i].srcbusirq & 3))
return irq;
/*
* Use the first all-but-pin matching entry as a
@@ -1111,7 +1148,7 @@ static int EISA_ELCR(unsigned int irq)
* EISA conforming in the MP table, that means its trigger type must
* be read in from the ELCR */
-#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
+#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq))
#define default_EISA_polarity(idx) default_ISA_polarity(idx)
/* PCI interrupts are always polarity one level triggered,
@@ -1128,13 +1165,13 @@ static int EISA_ELCR(unsigned int irq)
static int MPBIOS_polarity(int idx)
{
- int bus = mp_irqs[idx].mp_srcbus;
+ int bus = mp_irqs[idx].srcbus;
int polarity;
/*
* Determine IRQ line polarity (high active or low active):
*/
- switch (mp_irqs[idx].mp_irqflag & 3)
+ switch (mp_irqs[idx].irqflag & 3)
{
case 0: /* conforms, ie. bus-type dependent polarity */
if (test_bit(bus, mp_bus_not_pci))
@@ -1170,13 +1207,13 @@ static int MPBIOS_polarity(int idx)
static int MPBIOS_trigger(int idx)
{
- int bus = mp_irqs[idx].mp_srcbus;
+ int bus = mp_irqs[idx].srcbus;
int trigger;
/*
* Determine IRQ trigger mode (edge or level sensitive):
*/
- switch ((mp_irqs[idx].mp_irqflag>>2) & 3)
+ switch ((mp_irqs[idx].irqflag>>2) & 3)
{
case 0: /* conforms, ie. bus-type dependent */
if (test_bit(bus, mp_bus_not_pci))
@@ -1254,16 +1291,16 @@ int (*ioapic_renumber_irq)(int ioapic, i
static int pin_2_irq(int idx, int apic, int pin)
{
int irq, i;
- int bus = mp_irqs[idx].mp_srcbus;
+ int bus = mp_irqs[idx].srcbus;
/*
* Debugging check, we are in big trouble if this message pops up!
*/
- if (mp_irqs[idx].mp_dstirq != pin)
+ if (mp_irqs[idx].dstirq != pin)
printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
if (test_bit(bus, mp_bus_not_pci)) {
- irq = mp_irqs[idx].mp_srcbusirq;
+ irq = mp_irqs[idx].srcbusirq;
} else {
/*
* PCI IRQs are mapped in order
@@ -1356,7 +1393,7 @@ __assign_irq_vector(int irq, struct irq_
int new_cpu;
int vector, offset;
- vector_allocation_domain(cpu, tmp_mask);
+ apic->vector_allocation_domain(cpu, tmp_mask);
vector = current_vector;
offset = current_offset;
@@ -1466,9 +1503,7 @@ void __setup_vector_irq(int cpu)
}
static struct irq_chip ioapic_chip;
-#ifdef CONFIG_INTR_REMAP
static struct irq_chip ir_ioapic_chip;
-#endif
#define IOAPIC_AUTO -1
#define IOAPIC_EDGE 0
@@ -1507,7 +1542,6 @@ static void ioapic_register_intr(int irq
else
desc->status &= ~IRQ_LEVEL;
-#ifdef CONFIG_INTR_REMAP
if (irq_remapped(irq)) {
desc->status |= IRQ_MOVE_PCNTXT;
if (trigger)
@@ -1519,7 +1553,7 @@ static void ioapic_register_intr(int irq
handle_edge_irq, "edge");
return;
}
-#endif
+
if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
trigger == IOAPIC_LEVEL)
set_irq_chip_and_handler_name(irq, &ioapic_chip,
@@ -1534,37 +1568,44 @@ static void ioapic_register_intr(int irq
#define ioapic_register_intr(irq, desc, trigger) evtchn_register_pirq(irq)
#endif
-static int setup_ioapic_entry(int apic, int irq,
- struct IO_APIC_route_entry *entry,
- unsigned int destination, int trigger,
- int polarity, int vector)
+int setup_ioapic_entry(int apic_id, int irq,
+ struct IO_APIC_route_entry *entry,
+ unsigned int destination, int trigger,
+ int polarity, int vector, int pin)
{
/*
* add it to the IO-APIC irq-routing table:
*/
memset(entry,0,sizeof(*entry));
-#ifdef CONFIG_INTR_REMAP
+#ifndef CONFIG_XEN
if (intr_remapping_enabled) {
- struct intel_iommu *iommu = map_ioapic_to_ir(apic);
+ struct intel_iommu *iommu = map_ioapic_to_ir(apic_id);
struct irte irte;
struct IR_IO_APIC_route_entry *ir_entry =
(struct IR_IO_APIC_route_entry *) entry;
int index;
if (!iommu)
- panic("No mapping iommu for ioapic %d\n", apic);
+ panic("No mapping iommu for ioapic %d\n", apic_id);
index = alloc_irte(iommu, irq, 1);
if (index < 0)
- panic("Failed to allocate IRTE for ioapic %d\n", apic);
+ panic("Failed to allocate IRTE for ioapic %d\n", apic_id);
memset(&irte, 0, sizeof(irte));
irte.present = 1;
- irte.dst_mode = INT_DEST_MODE;
- irte.trigger_mode = trigger;
- irte.dlvry_mode = INT_DELIVERY_MODE;
+ irte.dst_mode = apic->irq_dest_mode;
+ /*
+ * Trigger mode in the IRTE will always be edge, and the
+ * actual level or edge trigger will be setup in the IO-APIC
+ * RTE. This will help simplify level triggered irq migration.
+	 * For more details, see the comments above explaining IO-APIC
+ * irq migration in the presence of interrupt-remapping.
+ */
+ irte.trigger_mode = 0;
+ irte.dlvry_mode = apic->irq_delivery_mode;
irte.vector = vector;
irte.dest_id = IRTE_DEST(destination);
@@ -1574,18 +1615,23 @@ static int setup_ioapic_entry(int apic,
ir_entry->zero = 0;
ir_entry->format = 1;
ir_entry->index = (index & 0x7fff);
+ /*
+ * IO-APIC RTE will be configured with virtual vector.
+ * irq handler will do the explicit EOI to the io-apic.
+ */
+ ir_entry->vector = pin;
} else
#endif
{
- entry->delivery_mode = INT_DELIVERY_MODE;
- entry->dest_mode = INT_DEST_MODE;
+ entry->delivery_mode = apic->irq_delivery_mode;
+ entry->dest_mode = apic->irq_dest_mode;
entry->dest = destination;
+ entry->vector = vector;
}
entry->mask = 0; /* enable IRQ */
entry->trigger = trigger;
entry->polarity = polarity;
- entry->vector = vector;
/* Mask level triggered irqs.
* Use IRQ_DELAYED_DISABLE for edge triggered irqs.
@@ -1595,7 +1641,7 @@ static int setup_ioapic_entry(int apic,
return 0;
}
-static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_desc *desc,
+static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq_desc *desc,
int trigger, int polarity)
{
struct irq_cfg *cfg;
@@ -1607,26 +1653,26 @@ static void setup_IO_APIC_irq(int apic,
cfg = desc->chip_data;
- if (assign_irq_vector(irq, cfg, TARGET_CPUS))
+ if (assign_irq_vector(irq, cfg, apic->target_cpus()))
return;
#ifndef CONFIG_XEN
- dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
+ dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
#else
- dest = cpu_mask_to_apicid(TARGET_CPUS);
+ dest = 0; /* meaningless */
#endif
apic_printk(APIC_VERBOSE,KERN_DEBUG
"IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
"IRQ %d Mode:%i Active:%i)\n",
- apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
+ apic_id, mp_ioapics[apic_id].apicid, pin, cfg->vector,
irq, trigger, polarity);
- if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry,
- dest, trigger, polarity, cfg->vector)) {
+ if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry,
+ dest, trigger, polarity, cfg->vector, pin)) {
printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
- mp_ioapics[apic].mp_apicid, pin);
+ mp_ioapics[apic_id].apicid, pin);
__clear_irq_vector(irq, cfg);
return;
}
@@ -1635,12 +1681,12 @@ static void setup_IO_APIC_irq(int apic,
if (irq < NR_IRQS_LEGACY)
disable_8259A_irq(irq);
- ioapic_write_entry(apic, pin, entry);
+ ioapic_write_entry(apic_id, pin, entry);
}
static void __init setup_IO_APIC_irqs(void)
{
- int apic, pin, idx, irq;
+ int apic_id, pin, idx, irq;
int notcon = 0;
struct irq_desc *desc;
struct irq_cfg *cfg;
@@ -1648,21 +1694,19 @@ static void __init setup_IO_APIC_irqs(vo
apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
- for (apic = 0; apic < nr_ioapics; apic++) {
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+ for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
+ for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
- idx = find_irq_entry(apic, pin, mp_INT);
+ idx = find_irq_entry(apic_id, pin, mp_INT);
if (idx == -1) {
if (!notcon) {
notcon = 1;
apic_printk(APIC_VERBOSE,
KERN_DEBUG " %d-%d",
- mp_ioapics[apic].mp_apicid,
- pin);
+ mp_ioapics[apic_id].apicid, pin);
} else
apic_printk(APIC_VERBOSE, " %d-%d",
- mp_ioapics[apic].mp_apicid,
- pin);
+ mp_ioapics[apic_id].apicid, pin);
continue;
}
if (notcon) {
@@ -1671,23 +1715,30 @@ static void __init setup_IO_APIC_irqs(vo
notcon = 0;
}
- irq = pin_2_irq(idx, apic, pin);
-#if defined(CONFIG_XEN)
- if (irq < PIRQ_BASE || irq >= PIRQ_BASE + NR_PIRQS)
+ irq = pin_2_irq(idx, apic_id, pin);
+
+#ifdef CONFIG_XEN
+ if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs)
continue;
-#elif defined(CONFIG_X86_32)
- if (multi_timer_check(apic, irq))
+#else
+ /*
+ * Skip the timer IRQ if there's a quirk handler
+ * installed and if it returns 1:
+ */
+ if (apic->multi_timer_check &&
+ apic->multi_timer_check(apic_id, irq))
continue;
#endif
+
desc = irq_to_desc_alloc_cpu(irq, cpu);
if (!desc) {
printk(KERN_INFO "can not get irq_desc for %d\n", irq);
continue;
}
cfg = desc->chip_data;
- add_pin_to_irq_cpu(cfg, cpu, apic, pin);
+ add_pin_to_irq_cpu(cfg, cpu, apic_id, pin);
- setup_IO_APIC_irq(apic, pin, irq, desc,
+ setup_IO_APIC_irq(apic_id, pin, irq, desc,
irq_trigger(idx), irq_polarity(idx));
}
}
@@ -1701,15 +1752,13 @@ static void __init setup_IO_APIC_irqs(vo
/*
* Set up the timer pin, possibly with the 8259A-master behind.
*/
-static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
+static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
int vector)
{
struct IO_APIC_route_entry entry;
-#ifdef CONFIG_INTR_REMAP
if (intr_remapping_enabled)
return;
-#endif
memset(&entry, 0, sizeof(entry));
@@ -1717,10 +1766,10 @@ static void __init setup_timer_IRQ0_pin(
* We use logical delivery to get the timer IRQ
* to the first CPU.
*/
- entry.dest_mode = INT_DEST_MODE;
- entry.mask = 1; /* mask IRQ now */
- entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
- entry.delivery_mode = INT_DELIVERY_MODE;
+ entry.dest_mode = apic->irq_dest_mode;
+ entry.mask = 0; /* don't mask IRQ for edge */
+ entry.dest = apic->cpu_mask_to_apicid(apic->target_cpus());
+ entry.delivery_mode = apic->irq_delivery_mode;
entry.polarity = 0;
entry.trigger = 0;
entry.vector = vector;
@@ -1734,7 +1783,7 @@ static void __init setup_timer_IRQ0_pin(
/*
* Add it to the IO-APIC irq-routing table:
*/
- ioapic_write_entry(apic, pin, entry);
+ ioapic_write_entry(apic_id, pin, entry);
}
@@ -1756,7 +1805,7 @@ __apicdebuginit(void) print_IO_APIC(void
printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
for (i = 0; i < nr_ioapics; i++)
printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
- mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
+ mp_ioapics[i].apicid, nr_ioapic_registers[i]);
/*
* We are a bit conservative about what we expect. We have to
@@ -1776,7 +1825,7 @@ __apicdebuginit(void) print_IO_APIC(void
spin_unlock_irqrestore(&ioapic_lock, flags);
printk("\n");
- printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
+ printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid);
printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
@@ -2040,13 +2089,6 @@ void __init enable_IO_APIC(void)
int apic;
unsigned long flags;
-#ifdef CONFIG_X86_32
- int i;
- if (!pirqs_enabled)
- for (i = 0; i < MAX_PIRQS; i++)
- pirq_entries[i] = -1;
-#endif
-
/*
* The number of IO-APIC IRQ registers (== #pins):
*/
@@ -2117,8 +2159,13 @@ void disable_IO_APIC(void)
* If the i8259 is routed through an IOAPIC
* Put that IOAPIC in virtual wire mode
* so legacy interrupts can be delivered.
+ *
+ * With interrupt-remapping, for now we will use virtual wire A mode,
+	 * as virtual wire B is a little complex (we need to configure both
+	 * the IOAPIC RTE as well as the interrupt-remapping table entry).
+ * As this gets called during crash dump, keep this simple for now.
*/
- if (ioapic_i8259.pin != -1) {
+ if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) {
struct IO_APIC_route_entry entry;
memset(&entry, 0, sizeof(entry));
@@ -2138,7 +2185,10 @@ void disable_IO_APIC(void)
ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
}
- disconnect_bsp_APIC(ioapic_i8259.pin != -1);
+ /*
+ * Use virtual wire A mode when interrupt remapping is enabled.
+ */
+ disconnect_bsp_APIC(!intr_remapping_enabled && ioapic_i8259.pin != -1);
}
#ifdef CONFIG_X86_32
@@ -2153,7 +2203,7 @@ static void __init setup_ioapic_ids_from
{
union IO_APIC_reg_00 reg_00;
physid_mask_t phys_id_present_map;
- int apic;
+ int apic_id;
int i;
unsigned char old_id;
unsigned long flags;
@@ -2172,26 +2222,26 @@ static void __init setup_ioapic_ids_from
* This is broken; anything with a real cpu count has to
* circumvent this idiocy regardless.
*/
- phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
+ phys_id_present_map = apic->ioapic_phys_id_map(phys_cpu_present_map);
/*
* Set the IOAPIC ID to the value stored in the MPC table.
*/
- for (apic = 0; apic < nr_ioapics; apic++) {
+ for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
/* Read the register 0 value */
spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(apic, 0);
+ reg_00.raw = io_apic_read(apic_id, 0);
spin_unlock_irqrestore(&ioapic_lock, flags);
- old_id = mp_ioapics[apic].mp_apicid;
+ old_id = mp_ioapics[apic_id].apicid;
- if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
+ if (mp_ioapics[apic_id].apicid >= get_physical_broadcast()) {
printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
- apic, mp_ioapics[apic].mp_apicid);
+ apic_id, mp_ioapics[apic_id].apicid);
printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
reg_00.bits.ID);
- mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
+ mp_ioapics[apic_id].apicid = reg_00.bits.ID;
}
/*
@@ -2199,10 +2249,10 @@ static void __init setup_ioapic_ids_from
* system must have a unique ID or we get lots of nice
* 'stuck on smp_invalidate_needed IPI wait' messages.
*/
- if (check_apicid_used(phys_id_present_map,
- mp_ioapics[apic].mp_apicid)) {
+ if (apic->check_apicid_used(phys_id_present_map,
+ mp_ioapics[apic_id].apicid)) {
printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
- apic, mp_ioapics[apic].mp_apicid);
+ apic_id, mp_ioapics[apic_id].apicid);
for (i = 0; i < get_physical_broadcast(); i++)
if (!physid_isset(i, phys_id_present_map))
break;
@@ -2211,13 +2261,13 @@ static void __init setup_ioapic_ids_from
printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
i);
physid_set(i, phys_id_present_map);
- mp_ioapics[apic].mp_apicid = i;
+ mp_ioapics[apic_id].apicid = i;
} else {
physid_mask_t tmp;
- tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
+ tmp = apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid);
apic_printk(APIC_VERBOSE, "Setting %d in the "
"phys_id_present_map\n",
- mp_ioapics[apic].mp_apicid);
+ mp_ioapics[apic_id].apicid);
physids_or(phys_id_present_map, phys_id_present_map, tmp);
}
@@ -2226,11 +2276,11 @@ static void __init setup_ioapic_ids_from
* We need to adjust the IRQ routing table
* if the ID changed.
*/
- if (old_id != mp_ioapics[apic].mp_apicid)
+ if (old_id != mp_ioapics[apic_id].apicid)
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mp_dstapic == old_id)
- mp_irqs[i].mp_dstapic
- = mp_ioapics[apic].mp_apicid;
+ if (mp_irqs[i].dstapic == old_id)
+ mp_irqs[i].dstapic
+ = mp_ioapics[apic_id].apicid;
/*
* Read the right value from the MPC table and
@@ -2238,20 +2288,20 @@ static void __init setup_ioapic_ids_from
*/
apic_printk(APIC_VERBOSE, KERN_INFO
"...changing IO-APIC physical APIC ID to %d ...",
- mp_ioapics[apic].mp_apicid);
+ mp_ioapics[apic_id].apicid);
- reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
+ reg_00.bits.ID = mp_ioapics[apic_id].apicid;
spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0, reg_00.raw);
+ io_apic_write(apic_id, 0, reg_00.raw);
spin_unlock_irqrestore(&ioapic_lock, flags);
/*
* Sanity check
*/
spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(apic, 0);
+ reg_00.raw = io_apic_read(apic_id, 0);
spin_unlock_irqrestore(&ioapic_lock, flags);
- if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
+ if (reg_00.bits.ID != mp_ioapics[apic_id].apicid)
printk("could not set ID!\n");
else
apic_printk(APIC_VERBOSE, " ok.\n");
@@ -2354,7 +2404,7 @@ static int ioapic_retrigger_irq(unsigned
unsigned long flags;
spin_lock_irqsave(&vector_lock, flags);
- send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
+ apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
spin_unlock_irqrestore(&vector_lock, flags);
return 1;
@@ -2362,7 +2412,7 @@ static int ioapic_retrigger_irq(unsigned
#else
static int ioapic_retrigger_irq(unsigned int irq)
{
- send_IPI_self(irq_cfg(irq)->vector);
+ apic->send_IPI_self(irq_cfg(irq)->vector);
return 1;
}
@@ -2380,37 +2430,24 @@ static int ioapic_retrigger_irq(unsigned
#ifdef CONFIG_SMP
#ifdef CONFIG_INTR_REMAP
-static void ir_irq_migration(struct work_struct *work);
-
-static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
/*
* Migrate the IO-APIC irq in the presence of intr-remapping.
*
- * For edge triggered, irq migration is a simple atomic update(of vector
- * and cpu destination) of IRTE and flush the hardware cache.
- *
- * For level triggered, we need to modify the io-apic RTE aswell with the update
- * vector information, along with modifying IRTE with vector and destination.
- * So irq migration for level triggered is little bit more complex compared to
- * edge triggered migration. But the good news is, we use the same algorithm
- * for level triggered migration as we have today, only difference being,
- * we now initiate the irq migration from process context instead of the
- * interrupt context.
+ * For both level and edge triggered, irq migration is a simple atomic
+ * update (of vector and cpu destination) of the IRTE and a flush of the hardware cache.
*
- * In future, when we do a directed EOI (combined with cpu EOI broadcast
- * suppression) to the IO-APIC, level triggered irq migration will also be
- * as simple as edge triggered migration and we can do the irq migration
- * with a simple atomic update to IO-APIC RTE.
+ * For level triggered, we eliminate the io-apic RTE modification (with the
+ * updated vector information), by using a virtual vector (io-apic pin number).
+ * The real vector that is used for interrupting the cpu comes from
+ * the interrupt-remapping table entry.
*/
static void
migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
{
struct irq_cfg *cfg;
struct irte irte;
- int modify_ioapic_rte;
unsigned int dest;
- unsigned long flags;
unsigned int irq;
if (!cpumask_intersects(mask, cpu_online_mask))
@@ -2426,14 +2463,7 @@ migrate_ioapic_irq_desc(struct irq_desc
set_extra_move_desc(desc, mask);
- dest = cpu_mask_to_apicid_and(cfg->domain, mask);
-
- modify_ioapic_rte = desc->status & IRQ_LEVEL;
- if (modify_ioapic_rte) {
- spin_lock_irqsave(&ioapic_lock, flags);
- __target_IO_APIC_irq(irq, dest, cfg);
- spin_unlock_irqrestore(&ioapic_lock, flags);
- }
+ dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
irte.vector = cfg->vector;
irte.dest_id = IRTE_DEST(dest);
@@ -2446,61 +2476,7 @@ migrate_ioapic_irq_desc(struct irq_desc
if (cfg->move_in_progress)
send_cleanup_vector(cfg);
- cpumask_copy(&desc->affinity, mask);
-}
-
-static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
-{
- int ret = -1;
- struct irq_cfg *cfg = desc->chip_data;
-
- mask_IO_APIC_irq_desc(desc);
-
- if (io_apic_level_ack_pending(cfg)) {
- /*
- * Interrupt in progress. Migrating irq now will change the
- * vector information in the IO-APIC RTE and that will confuse
- * the EOI broadcast performed by cpu.
- * So, delay the irq migration to the next instance.
- */
- schedule_delayed_work(&ir_migration_work, 1);
- goto unmask;
- }
-
- /* everthing is clear. we have right of way */
- migrate_ioapic_irq_desc(desc, &desc->pending_mask);
-
- ret = 0;
- desc->status &= ~IRQ_MOVE_PENDING;
- cpumask_clear(&desc->pending_mask);
-
-unmask:
- unmask_IO_APIC_irq_desc(desc);
-
- return ret;
-}
-
-static void ir_irq_migration(struct work_struct *work)
-{
- unsigned int irq;
- struct irq_desc *desc;
-
- for_each_irq_desc(irq, desc) {
- if (desc->status & IRQ_MOVE_PENDING) {
- unsigned long flags;
-
- spin_lock_irqsave(&desc->lock, flags);
- if (!desc->chip->set_affinity ||
- !(desc->status & IRQ_MOVE_PENDING)) {
- desc->status &= ~IRQ_MOVE_PENDING;
- spin_unlock_irqrestore(&desc->lock, flags);
- continue;
- }
-
- desc->chip->set_affinity(irq, &desc->pending_mask);
- spin_unlock_irqrestore(&desc->lock, flags);
- }
- }
+ cpumask_copy(desc->affinity, mask);
}
/*
@@ -2509,13 +2485,6 @@ static void ir_irq_migration(struct work
static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
const struct cpumask *mask)
{
- if (desc->status & IRQ_LEVEL) {
- desc->status |= IRQ_MOVE_PENDING;
- cpumask_copy(&desc->pending_mask, mask);
- migrate_irq_remapped_level_desc(desc);
- return;
- }
-
migrate_ioapic_irq_desc(desc, mask);
}
static void set_ir_ioapic_affinity_irq(unsigned int irq,
@@ -2525,6 +2494,11 @@ static void set_ir_ioapic_affinity_irq(u
set_ir_ioapic_affinity_irq_desc(desc, mask);
}
+#else
+static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
+ const struct cpumask *mask)
+{
+}
#endif
asmlinkage void smp_irq_move_cleanup_interrupt(void)
@@ -2538,6 +2512,7 @@ asmlinkage void smp_irq_move_cleanup_int
me = smp_processor_id();
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
unsigned int irq;
+ unsigned int irr;
struct irq_desc *desc;
struct irq_cfg *cfg;
irq = __get_cpu_var(vector_irq)[vector];
@@ -2557,6 +2532,18 @@ asmlinkage void smp_irq_move_cleanup_int
if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
goto unlock;
+ irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
+ /*
+		 * Check if the vector that needs to be cleaned up is
+		 * registered at the cpu's IRR. If so, then this is not
+		 * the best time to clean it up. Let's clean it up in the
+ * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR
+ * to myself.
+ */
+ if (irr & (1 << (vector % 32))) {
+ apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
+ goto unlock;
+ }
__get_cpu_var(vector_irq)[vector] = -1;
cfg->move_cleanup_count--;
unlock:
@@ -2579,7 +2566,7 @@ static void irq_complete_move(struct irq
/* domain has not changed, but affinity did */
me = smp_processor_id();
- if (cpu_isset(me, desc->affinity)) {
+ if (cpumask_test_cpu(me, desc->affinity)) {
*descp = desc = move_irq_desc(desc, me);
/* get the new one */
cfg = desc->chip_data;
@@ -2605,17 +2592,51 @@ static void irq_complete_move(struct irq
static inline void irq_complete_move(struct irq_desc **descp) {}
#endif
-#ifdef CONFIG_INTR_REMAP
+static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+{
+ int apic, pin;
+ struct irq_pin_list *entry;
+
+ entry = cfg->irq_2_pin;
+ for (;;) {
+
+ if (!entry)
+ break;
+
+ apic = entry->apic;
+ pin = entry->pin;
+ io_apic_eoi(apic, pin);
+ entry = entry->next;
+ }
+}
+
+static void
+eoi_ioapic_irq(struct irq_desc *desc)
+{
+ struct irq_cfg *cfg;
+ unsigned long flags;
+ unsigned int irq;
+
+ irq = desc->irq;
+ cfg = desc->chip_data;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ __eoi_ioapic_irq(irq, cfg);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+#ifdef CONFIG_X86_X2APIC
static void ack_x2apic_level(unsigned int irq)
{
+ struct irq_desc *desc = irq_to_desc(irq);
ack_x2APIC_irq();
+ eoi_ioapic_irq(desc);
}
static void ack_x2apic_edge(unsigned int irq)
{
ack_x2APIC_irq();
}
-
#endif
static void ack_apic_edge(unsigned int irq)
@@ -2681,6 +2702,9 @@ static void ack_apic_level(unsigned int
*/
ack_APIC_irq();
+ if (irq_remapped(irq))
+ eoi_ioapic_irq(desc);
+
/* Now we can move and renable the irq */
if (unlikely(do_unmask_irq)) {
/* Only migrate the irq if the ack has been received.
@@ -2726,6 +2750,26 @@ static void ack_apic_level(unsigned int
#endif
}
+#ifdef CONFIG_INTR_REMAP
+static void ir_ack_apic_edge(unsigned int irq)
+{
+#ifdef CONFIG_X86_X2APIC
+ if (x2apic_enabled())
+ return ack_x2apic_edge(irq);
+#endif
+ return ack_apic_edge(irq);
+}
+
+static void ir_ack_apic_level(unsigned int irq)
+{
+#ifdef CONFIG_X86_X2APIC
+ if (x2apic_enabled())
+ return ack_x2apic_level(irq);
+#endif
+ return ack_apic_level(irq);
+}
+#endif /* CONFIG_INTR_REMAP */
+
static struct irq_chip ioapic_chip __read_mostly = {
.name = "IO-APIC",
.startup = startup_ioapic_irq,
@@ -2739,20 +2783,20 @@ static struct irq_chip ioapic_chip __rea
.retrigger = ioapic_retrigger_irq,
};
-#ifdef CONFIG_INTR_REMAP
static struct irq_chip ir_ioapic_chip __read_mostly = {
.name = "IR-IO-APIC",
.startup = startup_ioapic_irq,
.mask = mask_IO_APIC_irq,
.unmask = unmask_IO_APIC_irq,
- .ack = ack_x2apic_edge,
- .eoi = ack_x2apic_level,
+#ifdef CONFIG_INTR_REMAP
+ .ack = ir_ack_apic_edge,
+ .eoi = ir_ack_apic_level,
#ifdef CONFIG_SMP
.set_affinity = set_ir_ioapic_affinity_irq,
#endif
+#endif
.retrigger = ioapic_retrigger_irq,
};
-#endif
#endif /* !CONFIG_XEN */
static inline void init_IO_APIC_traps(void)
@@ -2774,7 +2818,7 @@ static inline void init_IO_APIC_traps(vo
*/
for_each_irq_desc(irq, desc) {
#ifdef CONFIG_XEN
- if (irq < PIRQ_BASE || irq >= PIRQ_BASE + NR_PIRQS)
+ if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs)
continue;
#endif
cfg = desc->chip_data;
@@ -2936,19 +2980,15 @@ static inline void __init check_timer(vo
int cpu = boot_cpu_id;
int apic1, pin1, apic2, pin2;
unsigned long flags;
- unsigned int ver;
int no_pin1 = 0;
local_irq_save(flags);
- ver = apic_read(APIC_LVR);
- ver = GET_APIC_VERSION(ver);
-
/*
* get/set the timer IRQ vector:
*/
disable_8259A_irq(0);
- assign_irq_vector(0, cfg, TARGET_CPUS);
+ assign_irq_vector(0, cfg, apic->target_cpus());
/*
* As IRQ0 is to be enabled in the 8259A, the virtual
@@ -2962,7 +3002,13 @@ static inline void __init check_timer(vo
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
init_8259A(1);
#ifdef CONFIG_X86_32
- timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
+ {
+ unsigned int ver;
+
+ ver = apic_read(APIC_LVR);
+ ver = GET_APIC_VERSION(ver);
+ timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
+ }
#endif
pin1 = find_isa_irq_pin(0, mp_INT);
@@ -2982,10 +3028,8 @@ static inline void __init check_timer(vo
* 8259A.
*/
if (pin1 == -1) {
-#ifdef CONFIG_INTR_REMAP
if (intr_remapping_enabled)
panic("BIOS bug: timer not connected to IO-APIC");
-#endif
pin1 = pin2;
apic1 = apic2;
no_pin1 = 1;
@@ -3001,8 +3045,17 @@ static inline void __init check_timer(vo
if (no_pin1) {
add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
+ } else {
+ /* for edge trigger, setup_IO_APIC_irq already
+ * leave it unmasked.
+ * so only need to unmask if it is level-trigger
+ * do we really have level trigger timer?
+ */
+ int idx;
+ idx = find_irq_entry(apic1, pin1, mp_INT);
+ if (idx != -1 && irq_trigger(idx))
+ unmask_IO_APIC_irq_desc(desc);
}
- unmask_IO_APIC_irq_desc(desc);
if (timer_irq_works()) {
if (nmi_watchdog == NMI_IO_APIC) {
setup_nmi();
@@ -3012,10 +3065,9 @@ static inline void __init check_timer(vo
clear_IO_APIC_pin(0, pin1);
goto out;
}
-#ifdef CONFIG_INTR_REMAP
if (intr_remapping_enabled)
panic("timer doesn't work through Interrupt-remapped IO-APIC");
-#endif
+ local_irq_disable();
clear_IO_APIC_pin(apic1, pin1);
if (!no_pin1)
apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
@@ -3030,7 +3082,6 @@ static inline void __init check_timer(vo
*/
replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
- unmask_IO_APIC_irq_desc(desc);
enable_8259A_irq(0);
if (timer_irq_works()) {
apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
@@ -3045,6 +3096,7 @@ static inline void __init check_timer(vo
/*
* Cleanup, just in case ...
*/
+ local_irq_disable();
disable_8259A_irq(0);
clear_IO_APIC_pin(apic2, pin2);
apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
@@ -3070,6 +3122,7 @@ static inline void __init check_timer(vo
apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
goto out;
}
+ local_irq_disable();
disable_8259A_irq(0);
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
@@ -3087,6 +3140,7 @@ static inline void __init check_timer(vo
apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
goto out;
}
+ local_irq_disable();
apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
"report. Then try booting with the 'noapic' option.\n");
@@ -3119,7 +3173,7 @@ out:
void __init setup_IO_APIC(void)
{
-#if defined(CONFIG_X86_32) || defined(CONFIG_XEN)
+#ifdef CONFIG_XEN
enable_IO_APIC();
#else
/*
@@ -3201,8 +3255,8 @@ static int ioapic_resume(struct sys_devi
spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(dev->id, 0);
- if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
- reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
+ if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) {
+ reg_00.bits.ID = mp_ioapics[dev->id].apicid;
io_apic_write(dev->id, 0, reg_00.raw);
}
spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -3252,6 +3306,7 @@ static int __init ioapic_init_sysfs(void
device_initcall(ioapic_init_sysfs);
+static int nr_irqs_gsi = NR_IRQS_LEGACY;
/*
* Dynamic irq allocate and deallocation
*/
@@ -3266,11 +3321,11 @@ unsigned int create_irq_nr(unsigned int
struct irq_desc *desc_new = NULL;
irq = 0;
- spin_lock_irqsave(&vector_lock, flags);
- for (new = irq_want; new < NR_IRQS; new++) {
- if (platform_legacy_irq(new))
- continue;
+ if (irq_want < nr_irqs_gsi)
+ irq_want = nr_irqs_gsi;
+ spin_lock_irqsave(&vector_lock, flags);
+ for (new = irq_want; new < nr_irqs; new++) {
desc_new = irq_to_desc_alloc_cpu(new, cpu);
if (!desc_new) {
printk(KERN_INFO "can not get irq_desc for %d\n", new);
@@ -3280,7 +3335,7 @@ unsigned int create_irq_nr(unsigned int
if (cfg_new->vector != 0)
continue;
- if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0)
+ if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
irq = new;
break;
}
@@ -3295,7 +3350,6 @@ unsigned int create_irq_nr(unsigned int
return irq;
}
-static int nr_irqs_gsi = NR_IRQS_LEGACY;
int create_irq(void)
{
unsigned int irq_want;
@@ -3324,9 +3378,7 @@ void destroy_irq(unsigned int irq)
if (desc)
desc->chip_data = cfg;
-#ifdef CONFIG_INTR_REMAP
free_irte(irq);
-#endif
spin_lock_irqsave(&vector_lock, flags);
__clear_irq_vector(irq, cfg);
spin_unlock_irqrestore(&vector_lock, flags);
@@ -3343,14 +3395,16 @@ static int msi_compose_msg(struct pci_de
int err;
unsigned dest;
+ if (disable_apic)
+ return -ENXIO;
+
cfg = irq_cfg(irq);
- err = assign_irq_vector(irq, cfg, TARGET_CPUS);
+ err = assign_irq_vector(irq, cfg, apic->target_cpus());
if (err)
return err;
- dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
+ dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
-#ifdef CONFIG_INTR_REMAP
if (irq_remapped(irq)) {
struct irte irte;
int ir_index;
@@ -3362,9 +3416,9 @@ static int msi_compose_msg(struct pci_de
memset (&irte, 0, sizeof(irte));
irte.present = 1;
- irte.dst_mode = INT_DEST_MODE;
+ irte.dst_mode = apic->irq_dest_mode;
irte.trigger_mode = 0; /* edge */
- irte.dlvry_mode = INT_DELIVERY_MODE;
+ irte.dlvry_mode = apic->irq_delivery_mode;
irte.vector = cfg->vector;
irte.dest_id = IRTE_DEST(dest);
@@ -3376,16 +3430,19 @@ static int msi_compose_msg(struct pci_de
MSI_ADDR_IR_SHV |
MSI_ADDR_IR_INDEX1(ir_index) |
MSI_ADDR_IR_INDEX2(ir_index);
- } else
-#endif
- {
- msg->address_hi = MSI_ADDR_BASE_HI;
+ } else {
+ if (x2apic_enabled())
+ msg->address_hi = MSI_ADDR_BASE_HI |
+ MSI_ADDR_EXT_DEST_ID(dest);
+ else
+ msg->address_hi = MSI_ADDR_BASE_HI;
+
msg->address_lo =
MSI_ADDR_BASE_LO |
- ((INT_DEST_MODE == 0) ?
+ ((apic->irq_dest_mode == 0) ?
MSI_ADDR_DEST_MODE_PHYSICAL:
MSI_ADDR_DEST_MODE_LOGICAL) |
- ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+ ((apic->irq_delivery_mode != dest_LowestPrio) ?
MSI_ADDR_REDIRECTION_CPU:
MSI_ADDR_REDIRECTION_LOWPRI) |
MSI_ADDR_DEST_ID(dest);
@@ -3393,7 +3450,7 @@ static int msi_compose_msg(struct pci_de
msg->data =
MSI_DATA_TRIGGER_EDGE |
MSI_DATA_LEVEL_ASSERT |
- ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+ ((apic->irq_delivery_mode != dest_LowestPrio) ?
MSI_DATA_DELIVERY_FIXED:
MSI_DATA_DELIVERY_LOWPRI) |
MSI_DATA_VECTOR(cfg->vector);
@@ -3479,15 +3536,16 @@ static struct irq_chip msi_chip = {
.retrigger = ioapic_retrigger_irq,
};
-#ifdef CONFIG_INTR_REMAP
static struct irq_chip msi_ir_chip = {
.name = "IR-PCI-MSI",
.unmask = unmask_msi_irq,
.mask = mask_msi_irq,
- .ack = ack_x2apic_edge,
+#ifdef CONFIG_INTR_REMAP
+ .ack = ir_ack_apic_edge,
#ifdef CONFIG_SMP
.set_affinity = ir_set_msi_irq_affinity,
#endif
+#endif
.retrigger = ioapic_retrigger_irq,
};
@@ -3517,7 +3575,6 @@ static int msi_alloc_irte(struct pci_dev
}
return index;
}
-#endif
static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
{
@@ -3531,7 +3588,6 @@ static int setup_msi_irq(struct pci_dev
set_irq_msi(irq, msidesc);
write_msi_msg(irq, &msg);
-#ifdef CONFIG_INTR_REMAP
if (irq_remapped(irq)) {
struct irq_desc *desc = irq_to_desc(irq);
/*
@@ -3540,7 +3596,6 @@ static int setup_msi_irq(struct pci_dev
desc->status |= IRQ_MOVE_PCNTXT;
set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
} else
-#endif
set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
@@ -3548,60 +3603,26 @@ static int setup_msi_irq(struct pci_dev
return 0;
}
-int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc)
-{
- unsigned int irq;
- int ret;
- unsigned int irq_want;
-
- irq_want = nr_irqs_gsi;
- irq = create_irq_nr(irq_want);
- if (irq == 0)
- return -1;
-
-#ifdef CONFIG_INTR_REMAP
- if (!intr_remapping_enabled)
- goto no_ir;
-
- ret = msi_alloc_irte(dev, irq, 1);
- if (ret < 0)
- goto error;
-no_ir:
-#endif
- ret = setup_msi_irq(dev, msidesc, irq);
- if (ret < 0) {
- destroy_irq(irq);
- return ret;
- }
- return 0;
-
-#ifdef CONFIG_INTR_REMAP
-error:
- destroy_irq(irq);
- return ret;
-#endif
-}
-
int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
{
unsigned int irq;
int ret, sub_handle;
struct msi_desc *msidesc;
unsigned int irq_want;
-
-#ifdef CONFIG_INTR_REMAP
- struct intel_iommu *iommu = 0;
+ struct intel_iommu *iommu = NULL;
int index = 0;
-#endif
+
+ /* x86 doesn't support multiple MSI yet */
+ if (type == PCI_CAP_ID_MSI && nvec > 1)
+ return 1;
irq_want = nr_irqs_gsi;
sub_handle = 0;
list_for_each_entry(msidesc, &dev->msi_list, list) {
irq = create_irq_nr(irq_want);
- irq_want++;
if (irq == 0)
return -1;
-#ifdef CONFIG_INTR_REMAP
+ irq_want = irq + 1;
if (!intr_remapping_enabled)
goto no_ir;
@@ -3629,7 +3650,6 @@ int arch_setup_msi_irqs(struct pci_dev *
set_irte_irq(irq, iommu, index, sub_handle);
}
no_ir:
-#endif
ret = setup_msi_irq(dev, msidesc, irq);
if (ret < 0)
goto error;
@@ -3647,7 +3667,7 @@ void arch_teardown_msi_irq(unsigned int
destroy_irq(irq);
}
-#ifdef CONFIG_DMAR
+#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
#ifdef CONFIG_SMP
static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
{
@@ -3728,7 +3748,7 @@ static void hpet_msi_set_affinity(unsign
#endif /* CONFIG_SMP */
-struct irq_chip hpet_msi_type = {
+static struct irq_chip hpet_msi_type = {
.name = "HPET_MSI",
.unmask = hpet_msi_unmask,
.mask = hpet_msi_mask,
@@ -3743,12 +3763,14 @@ int arch_setup_hpet_msi(unsigned int irq
{
int ret;
struct msi_msg msg;
+ struct irq_desc *desc = irq_to_desc(irq);
ret = msi_compose_msg(NULL, irq, &msg);
if (ret < 0)
return ret;
hpet_msi_write(irq, &msg);
+ desc->status |= IRQ_MOVE_PCNTXT;
set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq,
"edge");
@@ -3811,13 +3833,17 @@ int arch_setup_ht_irq(unsigned int irq,
struct irq_cfg *cfg;
int err;
+ if (disable_apic)
+ return -ENXIO;
+
cfg = irq_cfg(irq);
- err = assign_irq_vector(irq, cfg, TARGET_CPUS);
+ err = assign_irq_vector(irq, cfg, apic->target_cpus());
if (!err) {
struct ht_irq_msg msg;
unsigned dest;
- dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
+ dest = apic->cpu_mask_to_apicid_and(cfg->domain,
+ apic->target_cpus());
msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
@@ -3825,11 +3851,11 @@ int arch_setup_ht_irq(unsigned int irq,
HT_IRQ_LOW_BASE |
HT_IRQ_LOW_DEST_ID(dest) |
HT_IRQ_LOW_VECTOR(cfg->vector) |
- ((INT_DEST_MODE == 0) ?
+ ((apic->irq_dest_mode == 0) ?
HT_IRQ_LOW_DM_PHYSICAL :
HT_IRQ_LOW_DM_LOGICAL) |
HT_IRQ_LOW_RQEOI_EDGE |
- ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+ ((apic->irq_delivery_mode != dest_LowestPrio) ?
HT_IRQ_LOW_MT_FIXED :
HT_IRQ_LOW_MT_ARBITRATED) |
HT_IRQ_LOW_IRQ_MASKED;
@@ -3845,7 +3871,7 @@ int arch_setup_ht_irq(unsigned int irq,
}
#endif /* CONFIG_HT_IRQ */
-#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
+#ifdef CONFIG_X86_UV
/*
* Re-target the irq to the specified CPU and enable the specified MMR located
* on the specified blade to allow the sending of MSIs to the specified CPU.
@@ -3877,12 +3903,12 @@ int arch_enable_uv_irq(char *irq_name, u
BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
entry->vector = cfg->vector;
- entry->delivery_mode = INT_DELIVERY_MODE;
- entry->dest_mode = INT_DEST_MODE;
+ entry->delivery_mode = apic->irq_delivery_mode;
+ entry->dest_mode = apic->irq_dest_mode;
entry->polarity = 0;
entry->trigger = 0;
entry->mask = 0;
- entry->dest = cpu_mask_to_apicid(eligible_cpu);
+ entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
mmr_pnode = uv_blade_to_pnode(mmr_blade);
uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
@@ -3945,7 +3971,29 @@ void __init probe_nr_irqs_gsi(void)
printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
}
+
+#ifdef CONFIG_SPARSE_IRQ
+int __init arch_probe_nr_irqs(void)
+{
+ int nr;
+
+ if (nr_irqs > (NR_VECTORS * nr_cpu_ids))
+ nr_irqs = NR_VECTORS * nr_cpu_ids;
+
+ nr = nr_irqs_gsi + 8 * nr_cpu_ids;
+#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ)
+ /*
+ * for MSI and HT dyn irq
+ */
+ nr += nr_irqs_gsi * 16;
+#endif
+ if (nr < nr_irqs)
+ nr_irqs = nr;
+
+ return 0;
+}
#endif
+#endif /* CONFIG_XEN */
/* --------------------------------------------------------------------------
ACPI-based IOAPIC Configuration
@@ -3973,7 +4021,7 @@ int __init io_apic_get_unique_id(int ioa
*/
if (physids_empty(apic_id_map))
- apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
+ apic_id_map = apic->ioapic_phys_id_map(phys_cpu_present_map);
spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(ioapic, 0);
@@ -3989,10 +4037,10 @@ int __init io_apic_get_unique_id(int ioa
* Every APIC in a system must have a unique ID or we get lots of nice
* 'stuck on smp_invalidate_needed IPI wait' messages.
*/
- if (check_apicid_used(apic_id_map, apic_id)) {
+ if (apic->check_apicid_used(apic_id_map, apic_id)) {
for (i = 0; i < get_physical_broadcast(); i++) {
- if (!check_apicid_used(apic_id_map, i))
+ if (!apic->check_apicid_used(apic_id_map, i))
break;
}
@@ -4005,7 +4053,7 @@ int __init io_apic_get_unique_id(int ioa
apic_id = i;
}
- tmp = apicid_to_cpu_present(apic_id);
+ tmp = apic->apicid_to_cpu_present(apic_id);
physids_or(apic_id_map, apic_id_map, tmp);
if (reg_00.bits.ID != apic_id) {
@@ -4050,7 +4098,7 @@ int io_apic_set_pci_routing (int ioapic,
int cpu = boot_cpu_id;
#ifdef CONFIG_XEN
- if (irq < PIRQ_BASE || irq >= PIRQ_BASE + NR_PIRQS) {
+ if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs) {
apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ %d\n",
ioapic, irq);
return -EINVAL;
@@ -4091,8 +4139,8 @@ int acpi_get_override_irq(int bus_irq, i
return -1;
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mp_irqtype == mp_INT &&
- mp_irqs[i].mp_srcbusirq == bus_irq)
+ if (mp_irqs[i].irqtype == mp_INT &&
+ mp_irqs[i].srcbusirq == bus_irq)
break;
if (i >= mp_irq_entries)
return -1;
@@ -4108,7 +4156,7 @@ int acpi_get_override_irq(int bus_irq, i
/*
* This function currently is only a helper for the i386 smp boot process where
* we need to reprogram the ioredtbls to cater for the cpus which have come online
- * so mask in all cases should simply be TARGET_CPUS
+ * so mask in all cases should simply be apic->target_cpus()
*/
#ifdef CONFIG_SMP
void __init setup_ioapic_dest(void)
@@ -4147,15 +4195,13 @@ void __init setup_ioapic_dest(void)
*/
if (desc->status &
(IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
- mask = &desc->affinity;
+ mask = desc->affinity;
else
- mask = TARGET_CPUS;
+ mask = apic->target_cpus();
-#ifdef CONFIG_INTR_REMAP
if (intr_remapping_enabled)
set_ir_ioapic_affinity_irq_desc(desc, mask);
else
-#endif
set_ioapic_affinity_irq_desc(desc, mask);
}
@@ -4208,7 +4254,7 @@ void __init ioapic_init_mappings(void)
ioapic_res = ioapic_setup_resources();
for (i = 0; i < nr_ioapics; i++) {
if (smp_found_config) {
- ioapic_phys = mp_ioapics[i].mp_apicaddr;
+ ioapic_phys = mp_ioapics[i].apicaddr;
#ifdef CONFIG_X86_32
if (!ioapic_phys) {
printk(KERN_ERR
@@ -4248,9 +4294,12 @@ static int __init ioapic_insert_resource
struct resource *r = ioapic_resources;
if (!r) {
- printk(KERN_ERR
- "IO APIC resources could be not be allocated.\n");
- return -1;
+ if (nr_ioapics > 0) {
+ printk(KERN_ERR
+ "IO APIC resources couldn't be allocated.\n");
+ return -1;
+ }
+ return 0;
}
for (i = 0; i < nr_ioapics; i++) {
--- head-2011-03-17.orig/arch/x86/kernel/apic/ipi-xen.c 2011-02-21 13:56:59.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/apic/ipi-xen.c 2011-02-21 13:57:27.000000000 +0100
@@ -2,8 +2,8 @@
#include <linux/interrupt.h>
#include <asm/smp.h>
+#include <asm/ipi.h>
-#ifdef CONFIG_X86_32
#include <xen/evtchn.h>
DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
@@ -15,32 +15,17 @@ static inline void __send_IPI_one(unsign
notify_remote_via_irq(irq);
}
-void __send_IPI_shortcut(unsigned int shortcut, int vector)
+void xen_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)
{
- int cpu;
+ unsigned int cpu, this_cpu = smp_processor_id();
- switch (shortcut) {
- case APIC_DEST_SELF:
- __send_IPI_one(smp_processor_id(), vector);
- break;
- case APIC_DEST_ALLBUT:
- for_each_online_cpu(cpu)
- if (cpu != smp_processor_id())
- __send_IPI_one(cpu, vector);
- break;
- default:
- printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
- vector);
- break;
- }
-}
-
-void send_IPI_self(int vector)
-{
- __send_IPI_shortcut(APIC_DEST_SELF, vector);
+ WARN_ON(!cpumask_subset(cpumask, cpu_online_mask));
+ for_each_cpu_and(cpu, cpumask, cpu_online_mask)
+ if (cpu != this_cpu)
+ __send_IPI_one(cpu, vector);
}
-void send_IPI_mask_bitmask(const struct cpumask *cpumask, int vector)
+void xen_send_IPI_mask(const struct cpumask *cpumask, int vector)
{
unsigned int cpu;
@@ -49,20 +34,17 @@ void send_IPI_mask_bitmask(const struct
__send_IPI_one(cpu, vector);
}
-void send_IPI_mask_sequence(const struct cpumask *mask, int vector)
+void xen_send_IPI_allbutself(int vector)
{
- send_IPI_mask_bitmask(mask, vector);
+ xen_send_IPI_mask_allbutself(cpu_online_mask, vector);
}
-void send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+void xen_send_IPI_all(int vector)
{
- unsigned int query_cpu;
- unsigned int this_cpu = smp_processor_id();
-
- WARN_ON(!cpumask_subset(mask, cpu_online_mask));
- for_each_cpu_and(query_cpu, mask, cpu_online_mask)
- if (query_cpu != this_cpu)
- __send_IPI_one(query_cpu, vector);
+ xen_send_IPI_mask(cpu_online_mask, vector);
}
-#endif
+void xen_send_IPI_self(int vector)
+{
+ __send_IPI_one(smp_processor_id(), vector);
+}
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ head-2011-03-17/arch/x86/kernel/apic/probe_32-xen.c 2011-02-01 14:44:12.000000000 +0100
@@ -0,0 +1,69 @@
+/*
+ * Default generic APIC driver. This handles up to 8 CPUs.
+ *
+ * Copyright 2003 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License, v.2
+ *
+ * Generic x86 APIC driver probe layer.
+ */
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <asm/fixmap.h>
+#include <asm/mpspec.h>
+#include <asm/apicdef.h>
+#include <asm/apic.h>
+#include <asm/setup.h>
+
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <asm/mpspec.h>
+#include <asm/fixmap.h>
+#include <asm/apicdef.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <asm/ipi.h>
+
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <asm/acpi.h>
+#include <asm/e820.h>
+#include <asm/setup.h>
+
+static int xen_phys_pkg_id(int cpuid_apic, int index_msb)
+{
+ return cpuid_apic;
+}
+
+static struct apic apic_xen = {
+
+ .name = "default",
+
+ .irq_delivery_mode = dest_LowestPrio,
+ /* logical delivery broadcast to all CPUs: */
+ .irq_dest_mode = 1,
+
+ .target_cpus = default_target_cpus,
+
+ .phys_pkg_id = xen_phys_pkg_id,
+ .mps_oem_check = NULL,
+
+#ifdef CONFIG_SMP
+ .send_IPI_mask = xen_send_IPI_mask,
+ .send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = xen_send_IPI_allbutself,
+ .send_IPI_all = xen_send_IPI_all,
+ .send_IPI_self = xen_send_IPI_self,
+#endif
+};
+
+struct apic *apic = &apic_xen;
+EXPORT_SYMBOL_GPL(apic);
--- head-2011-03-17.orig/arch/x86/kernel/asm-offsets_32.c 2011-01-31 18:01:51.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/asm-offsets_32.c 2011-02-01 14:44:12.000000000 +0100
@@ -112,6 +112,11 @@ void foo(void)
OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
+#ifdef CONFIG_XEN
+ BLANK();
+ OFFSET(XEN_START_mfn_list, start_info, mfn_list);
+#endif
+
#ifdef CONFIG_PARAVIRT
BLANK();
OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
--- head-2011-03-17.orig/arch/x86/kernel/cpu/common-xen.c 2011-03-17 14:41:35.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/cpu/common-xen.c 2011-03-17 14:42:07.000000000 +0100
@@ -1,105 +1,94 @@
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/string.h>
#include <linux/bootmem.h>
+#include <linux/linkage.h>
#include <linux/bitops.h>
+#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/kgdb.h>
-#include <linux/topology.h>
+#include <linux/percpu.h>
+#include <linux/string.h>
#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/kgdb.h>
#include <linux/smp.h>
-#include <linux/percpu.h>
-#include <asm/i387.h>
-#include <asm/msr.h>
-#include <asm/io.h>
-#include <asm/linkage.h>
+#include <linux/io.h>
+
+#include <asm/stackprotector.h>
#include <asm/mmu_context.h>
+#include <asm/hypervisor.h>
+#include <asm/processor.h>
+#include <asm/sections.h>
+#include <asm/topology.h>
+#include <asm/cpumask.h>
+#include <asm/pgtable.h>
+#include <asm/atomic.h>
+#include <asm/proto.h>
+#include <asm/setup.h>
+#include <asm/apic.h>
+#include <asm/desc.h>
+#include <asm/i387.h>
#include <asm/mtrr.h>
+#include <asm/numa.h>
+#include <asm/asm.h>
+#include <asm/cpu.h>
#include <asm/mce.h>
+#include <asm/msr.h>
#include <asm/pat.h>
-#include <asm/asm.h>
-#include <asm/numa.h>
#include <asm/smp.h>
+
#ifdef CONFIG_X86_LOCAL_APIC
-#include <asm/mpspec.h>
-#include <asm/apic.h>
-#include <mach_apic.h>
-#include <asm/genapic.h>
-#elif defined(CONFIG_X86_64_XEN)
-#include <mach_apic.h>
+#include <asm/uv/uv.h>
#endif
-#include <asm/pda.h>
-#include <asm/pgtable.h>
-#include <asm/processor.h>
-#include <asm/desc.h>
-#include <asm/atomic.h>
-#include <asm/proto.h>
-#include <asm/sections.h>
-#include <asm/setup.h>
-#include <asm/hypervisor.h>
-
#ifdef CONFIG_XEN
-#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_LOCAL_APIC)
-#define phys_pkg_id(a,b) a
-#endif
#include <xen/interface/callback.h>
#endif
#include "cpu.h"
-#ifdef CONFIG_X86_64
-
/* all of these masks are initialized in setup_cpu_local_masks() */
+cpumask_var_t cpu_initialized_mask;
#ifndef CONFIG_XEN
-cpumask_var_t cpu_callin_mask;
cpumask_var_t cpu_callout_mask;
-#endif
-cpumask_var_t cpu_initialized_mask;
+cpumask_var_t cpu_callin_mask;
-#ifndef CONFIG_XEN
/* representing cpus for which sibling maps can be computed */
cpumask_var_t cpu_sibling_setup_mask;
#endif
-#else /* CONFIG_X86_32 */
-
-#ifndef CONFIG_XEN
-cpumask_t cpu_callin_map;
-cpumask_t cpu_callout_map;
-#endif
-cpumask_t cpu_initialized;
+/* correctly size the local cpu masks */
+void __init setup_cpu_local_masks(void)
+{
+ alloc_bootmem_cpumask_var(&cpu_initialized_mask);
#ifndef CONFIG_XEN
-cpumask_t cpu_sibling_setup_map;
+ alloc_bootmem_cpumask_var(&cpu_callin_mask);
+ alloc_bootmem_cpumask_var(&cpu_callout_mask);
+ alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
#endif
+}
-#endif /* CONFIG_X86_32 */
-
-
-static struct cpu_dev *this_cpu __cpuinitdata;
+static const struct cpu_dev *this_cpu __cpuinitdata;
+DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
#ifdef CONFIG_X86_64
-/* We need valid kernel segments for data and code in long mode too
- * IRET will check the segment types kkeil 2000/10/28
- * Also sysret mandates a special GDT layout
- */
-/* The TLS descriptors are currently at a different place compared to i386.
- Hopefully nobody expects them at a fixed place (Wine?) */
-DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
- [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
- [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
- [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
- [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
- [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
- [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
-} };
+ /*
+ * We need valid kernel segments for data and code in long mode too
+ * IRET will check the segment types kkeil 2000/10/28
+ * Also sysret mandates a special GDT layout
+ *
+ * TLS descriptors are currently at a different place compared to i386.
+ * Hopefully nobody expects them at a fixed place (Wine?)
+ */
+ [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
+ [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
+ [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
+ [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
+ [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
+ [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
#else
-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
- [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
- [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
- [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
- [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
+ [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
+ [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
+ [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
+ [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
#ifndef CONFIG_XEN
/*
* Segments used for calling PnP BIOS have byte granularity.
@@ -107,33 +96,41 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_p
* the transfer segment sizes are set at run time.
*/
/* 32-bit code */
- [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
+ [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
/* 16-bit code */
- [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
+ [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
/* 16-bit data */
- [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
+ [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
/* 16-bit data */
- [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
+ [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
/* 16-bit data */
- [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
+ [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
/*
* The APM segments have byte granularity and their bases
* are set at run time. All have 64k limits.
*/
/* 32-bit code */
- [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
+ [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
/* 16-bit code */
- [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
+ [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
/* data */
- [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
+ [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
- [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
+ [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
#endif
- [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
-} };
+ [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
+ GDT_STACK_CANARY_INIT
#endif
+} };
EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
+static int __init x86_xsave_setup(char *s)
+{
+ setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+ return 1;
+}
+__setup("noxsave", x86_xsave_setup);
+
#ifdef CONFIG_X86_32
static int cachesize_override __cpuinitdata = -1;
@@ -173,16 +170,17 @@ static inline int flag_is_changeable_p(u
* the CPUID. Add "volatile" to not allow gcc to
* optimize the subsequent calls to this function.
*/
- asm volatile ("pushfl\n\t"
- "pushfl\n\t"
- "popl %0\n\t"
- "movl %0,%1\n\t"
- "xorl %2,%0\n\t"
- "pushl %0\n\t"
- "popfl\n\t"
- "pushfl\n\t"
- "popl %0\n\t"
- "popfl\n\t"
+ asm volatile ("pushfl \n\t"
+ "pushfl \n\t"
+ "popl %0 \n\t"
+ "movl %0, %1 \n\t"
+ "xorl %2, %0 \n\t"
+ "pushl %0 \n\t"
+ "popfl \n\t"
+ "pushfl \n\t"
+ "popl %0 \n\t"
+ "popfl \n\t"
+
: "=&r" (f1), "=&r" (f2)
: "ir" (flag));
@@ -199,18 +197,22 @@ static int disable_x86_serial_nr __cpuin
static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
{
- if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) {
- /* Disable processor serial number */
- unsigned long lo, hi;
- rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
- lo |= 0x200000;
- wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
- printk(KERN_NOTICE "CPU serial number disabled.\n");
- clear_cpu_cap(c, X86_FEATURE_PN);
+ unsigned long lo, hi;
- /* Disabling the serial number may affect the cpuid level */
- c->cpuid_level = cpuid_eax(0);
- }
+ if (!cpu_has(c, X86_FEATURE_PN) || !disable_x86_serial_nr)
+ return;
+
+ /* Disable processor serial number: */
+
+ rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
+ lo |= 0x200000;
+ wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
+
+ printk(KERN_NOTICE "CPU serial number disabled.\n");
+ clear_cpu_cap(c, X86_FEATURE_PN);
+
+ /* Disabling the serial number may affect the cpuid level */
+ c->cpuid_level = cpuid_eax(0);
}
static int __init x86_serial_nr_setup(char *s)
@@ -235,16 +237,64 @@ static inline void squash_the_stupid_ser
#endif
/*
+ * Some CPU features depend on higher CPUID levels, which may not always
+ * be available due to CPUID level capping or broken virtualization
+ * software. Add those features to this table to auto-disable them.
+ */
+struct cpuid_dependent_feature {
+ u32 feature;
+ u32 level;
+};
+
+static const struct cpuid_dependent_feature __cpuinitconst
+cpuid_dependent_features[] = {
+ { X86_FEATURE_MWAIT, 0x00000005 },
+ { X86_FEATURE_DCA, 0x00000009 },
+ { X86_FEATURE_XSAVE, 0x0000000d },
+ { 0, 0 }
+};
+
+static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
+{
+ const struct cpuid_dependent_feature *df;
+
+ for (df = cpuid_dependent_features; df->feature; df++) {
+
+ if (!cpu_has(c, df->feature))
+ continue;
+ /*
+ * Note: cpuid_level is set to -1 if unavailable, but
+ * extended_extended_level is set to 0 if unavailable
+ * and the legitimate extended levels are all negative
+ * when signed; hence the weird messing around with
+ * signs here...
+ */
+ if (!((s32)df->level < 0 ?
+ (u32)df->level > (u32)c->extended_cpuid_level :
+ (s32)df->level > (s32)c->cpuid_level))
+ continue;
+
+ clear_cpu_cap(c, df->feature);
+ if (!warn)
+ continue;
+
+ printk(KERN_WARNING
+ "CPU: CPU feature %s disabled, no CPUID level 0x%x\n",
+ x86_cap_flags[df->feature], df->level);
+ }
+}
+
+/*
* Naming convention should be: <Name> [(<Codename>)]
* This table only is used unless init_<vendor>() below doesn't set it;
- * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
- *
+ * in particular, if CPUID levels 0x80000002..4 are supported, this
+ * isn't used
*/
/* Look up CPU names by table lookup. */
-static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
+static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
{
- struct cpu_model_info *info;
+ const struct cpu_model_info *info;
if (c->x86_model >= 16)
return NULL; /* Range check */
@@ -264,32 +314,52 @@ static char __cpuinit *table_lookup_mode
__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
-/* Current gdt points %fs at the "master" per-cpu area: after this,
- * it's on the real one. */
-void switch_to_new_gdt(void)
+void load_percpu_segment(int cpu)
+{
+#ifdef CONFIG_X86_32
+ loadsegment(fs, __KERNEL_PERCPU);
+#else
+ loadsegment(gs, 0);
+#ifndef CONFIG_XEN
+ wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu));
+#else
+ if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
+ (unsigned long)per_cpu(irq_stack_union.gs_base, cpu)))
+ BUG();
+#endif
+#endif
+ load_stack_canary_segment();
+}
+
+/*
+ * Current gdt points %fs at the "master" per-cpu area: after this,
+ * it's on the real one.
+ */
+void switch_to_new_gdt(int cpu)
{
struct desc_ptr gdt_descr;
unsigned long va, frames[16];
int f;
- gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
+ gdt_descr.address = (long)get_cpu_gdt_table(cpu);
gdt_descr.size = GDT_SIZE - 1;
for (va = gdt_descr.address, f = 0;
va < gdt_descr.address + gdt_descr.size;
va += PAGE_SIZE, f++) {
- frames[f] = virt_to_mfn(va);
- make_lowmem_page_readonly(
- (void *)va, XENFEAT_writable_descriptor_tables);
+ frames[f] = arbitrary_virt_to_mfn(va);
+ make_page_readonly((void *)va,
+ XENFEAT_writable_descriptor_tables);
}
if (HYPERVISOR_set_gdt(frames, (gdt_descr.size + 1) / 8))
BUG();
-#ifdef CONFIG_X86_32
- asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
-#endif
+
+ /* Reload the per-cpu base */
+
+ load_percpu_segment(cpu);
}
-static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
+static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {};
static void __cpuinit default_init(struct cpuinfo_x86 *c)
{
@@ -308,7 +378,7 @@ static void __cpuinit default_init(struc
#endif
}
-static struct cpu_dev __cpuinitdata default_cpu = {
+static const struct cpu_dev __cpuinitconst default_cpu = {
.c_init = default_init,
.c_vendor = "Unknown",
.c_x86_vendor = X86_VENDOR_UNKNOWN,
@@ -322,22 +392,24 @@ static void __cpuinit get_model_name(str
if (c->extended_cpuid_level < 0x80000004)
return;
- v = (unsigned int *) c->x86_model_id;
+ v = (unsigned int *)c->x86_model_id;
cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
c->x86_model_id[48] = 0;
- /* Intel chips right-justify this string for some dumb reason;
- undo that brain damage */
+ /*
+ * Intel chips right-justify this string for some dumb reason;
+ * undo that brain damage:
+ */
p = q = &c->x86_model_id[0];
while (*p == ' ')
- p++;
+ p++;
if (p != q) {
- while (*p)
- *q++ = *p++;
- while (q <= &c->x86_model_id[48])
- *q++ = '\0'; /* Zero-pad the rest */
+ while (*p)
+ *q++ = *p++;
+ while (q <= &c->x86_model_id[48])
+ *q++ = '\0'; /* Zero-pad the rest */
}
}
@@ -406,36 +478,30 @@ void __cpuinit detect_ht(struct cpuinfo_
if (smp_num_siblings == 1) {
printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
- } else if (smp_num_siblings > 1) {
+ goto out;
+ }
- if (smp_num_siblings > nr_cpu_ids) {
- printk(KERN_WARNING "CPU: Unsupported number of siblings %d",
- smp_num_siblings);
- smp_num_siblings = 1;
- return;
- }
+ if (smp_num_siblings <= 1)
+ goto out;
- index_msb = get_count_order(smp_num_siblings);
-#ifdef CONFIG_X86_64
- c->phys_proc_id = phys_pkg_id(index_msb);
-#else
- c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb);
-#endif
+ if (smp_num_siblings > nr_cpu_ids) {
+ pr_warning("CPU: Unsupported number of siblings %d",
+ smp_num_siblings);
+ smp_num_siblings = 1;
+ return;
+ }
- smp_num_siblings = smp_num_siblings / c->x86_max_cores;
+ index_msb = get_count_order(smp_num_siblings);
+ c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
- index_msb = get_count_order(smp_num_siblings);
+ smp_num_siblings = smp_num_siblings / c->x86_max_cores;
- core_bits = get_count_order(c->x86_max_cores);
+ index_msb = get_count_order(smp_num_siblings);
-#ifdef CONFIG_X86_64
- c->cpu_core_id = phys_pkg_id(index_msb) &
- ((1 << core_bits) - 1);
-#else
- c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) &
- ((1 << core_bits) - 1);
-#endif
- }
+ core_bits = get_count_order(c->x86_max_cores);
+
+ c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
+ ((1 << core_bits) - 1);
out:
if ((c->x86_max_cores * smp_num_siblings) > 1) {
@@ -450,8 +516,8 @@ out:
static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
{
char *v = c->x86_vendor_id;
- int i;
static int printed;
+ int i;
for (i = 0; i < X86_VENDOR_NUM; i++) {
if (!cpu_devs[i])
@@ -460,6 +526,7 @@ static void __cpuinit get_cpu_vendor(str
if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
(cpu_devs[i]->c_ident[1] &&
!strcmp(v, cpu_devs[i]->c_ident[1]))) {
+
this_cpu = cpu_devs[i];
c->x86_vendor = this_cpu->c_x86_vendor;
return;
@@ -468,7 +535,9 @@ static void __cpuinit get_cpu_vendor(str
if (!printed) {
printed++;
- printk(KERN_ERR "CPU: vendor_id '%s' unknown, using generic init.\n", v);
+ printk(KERN_ERR
+ "CPU: vendor_id '%s' unknown, using generic init.\n", v);
+
printk(KERN_ERR "CPU: Your system may be unstable.\n");
}
@@ -488,14 +557,17 @@ void __cpuinit cpu_detect(struct cpuinfo
/* Intel-defined flags: level 0x00000001 */
if (c->cpuid_level >= 0x00000001) {
u32 junk, tfms, cap0, misc;
+
cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
c->x86 = (tfms >> 8) & 0xf;
c->x86_model = (tfms >> 4) & 0xf;
c->x86_mask = tfms & 0xf;
+
if (c->x86 == 0xf)
c->x86 += (tfms >> 20) & 0xff;
if (c->x86 >= 0x6)
c->x86_model += ((tfms >> 16) & 0xf) << 4;
+
if (cap0 & (1<<19)) {
c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
c->x86_cache_alignment = c->x86_clflush_size;
@@ -511,6 +583,7 @@ static void __cpuinit get_cpu_cap(struct
/* Intel-defined flags: level 0x00000001 */
if (c->cpuid_level >= 0x00000001) {
u32 capability, excap;
+
cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
c->x86_capability[0] = capability;
c->x86_capability[4] = excap;
@@ -519,6 +592,7 @@ static void __cpuinit get_cpu_cap(struct
/* AMD-defined flags: level 0x80000001 */
xlvl = cpuid_eax(0x80000000);
c->extended_cpuid_level = xlvl;
+
if ((xlvl & 0xffff0000) == 0x80000000) {
if (xlvl >= 0x80000001) {
c->x86_capability[1] = cpuid_edx(0x80000001);
@@ -526,13 +600,15 @@ static void __cpuinit get_cpu_cap(struct
}
}
-#ifdef CONFIG_X86_64
if (c->extended_cpuid_level >= 0x80000008) {
u32 eax = cpuid_eax(0x80000008);
c->x86_virt_bits = (eax >> 8) & 0xff;
c->x86_phys_bits = eax & 0xff;
}
+#ifdef CONFIG_X86_32
+ else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
+ c->x86_phys_bits = 36;
#endif
if (c->extended_cpuid_level >= 0x80000007)
@@ -579,8 +655,12 @@ static void __init early_identify_cpu(st
{
#ifdef CONFIG_X86_64
c->x86_clflush_size = 64;
+ c->x86_phys_bits = 36;
+ c->x86_virt_bits = 48;
#else
c->x86_clflush_size = 32;
+ c->x86_phys_bits = 32;
+ c->x86_virt_bits = 32;
#endif
c->x86_cache_alignment = c->x86_clflush_size;
@@ -603,21 +683,20 @@ static void __init early_identify_cpu(st
if (this_cpu->c_early_init)
this_cpu->c_early_init(c);
- validate_pat_support(c);
-
#ifdef CONFIG_SMP
c->cpu_index = boot_cpu_id;
#endif
+ filter_cpuid_features(c, false);
}
void __init early_cpu_init(void)
{
- struct cpu_dev **cdev;
+ const struct cpu_dev *const *cdev;
int count = 0;
- printk("KERNEL supported cpus:\n");
+ printk(KERN_INFO "KERNEL supported cpus:\n");
for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
- struct cpu_dev *cpudev = *cdev;
+ const struct cpu_dev *cpudev = *cdev;
unsigned int j;
if (count >= X86_VENDOR_NUM)
@@ -628,7 +707,7 @@ void __init early_cpu_init(void)
for (j = 0; j < 2; j++) {
if (!cpudev->c_ident[j])
continue;
- printk(" %s %s\n", cpudev->c_vendor,
+ printk(KERN_INFO " %s %s\n", cpudev->c_vendor,
cpudev->c_ident[j]);
}
}
@@ -671,7 +750,7 @@ static void __cpuinit generic_identify(s
c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
#ifdef CONFIG_X86_32
# ifdef CONFIG_X86_HT
- c->apicid = phys_pkg_id(c->initial_apicid, 0);
+ c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
# else
c->apicid = c->initial_apicid;
# endif
@@ -708,9 +787,13 @@ static void __cpuinit identify_cpu(struc
#endif
#ifdef CONFIG_X86_64
c->x86_clflush_size = 64;
+ c->x86_phys_bits = 36;
+ c->x86_virt_bits = 48;
#else
c->cpuid_level = -1; /* CPUID not detected */
c->x86_clflush_size = 32;
+ c->x86_phys_bits = 32;
+ c->x86_virt_bits = 32;
#endif
c->x86_cache_alignment = c->x86_clflush_size;
memset(&c->x86_capability, 0, sizeof c->x86_capability);
@@ -723,7 +806,7 @@ static void __cpuinit identify_cpu(struc
this_cpu->c_identify(c);
#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
- c->apicid = phys_pkg_id(0);
+ c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
#endif
/*
@@ -743,13 +826,16 @@ static void __cpuinit identify_cpu(struc
squash_the_stupid_serial_number(c);
/*
- * The vendor-specific functions might have changed features. Now
- * we do "generic changes."
+ * The vendor-specific functions might have changed features.
+ * Now we do "generic changes."
*/
+ /* Filter out anything that depends on CPUID levels we don't have */
+ filter_cpuid_features(c, true);
+
/* If the model name is still unset, do table lookup. */
if (!c->x86_model_id[0]) {
- char *p;
+ const char *p;
p = table_lookup_model(c);
if (p)
strcpy(c->x86_model_id, p);
@@ -805,6 +891,7 @@ static void vgetcpu_set_mode(void)
void __init identify_boot_cpu(void)
{
identify_cpu(&boot_cpu_data);
+ init_c1e_mask();
#ifdef CONFIG_X86_32
sysenter_setup();
enable_sep_cpu();
@@ -824,11 +911,11 @@ void __cpuinit identify_secondary_cpu(st
}
struct msr_range {
- unsigned min;
- unsigned max;
+ unsigned min;
+ unsigned max;
};
-static struct msr_range msr_range_array[] __cpuinitdata = {
+static const struct msr_range msr_range_array[] __cpuinitconst = {
{ 0x00000000, 0x00000418},
{ 0xc0000000, 0xc000040b},
{ 0xc0010000, 0xc0010142},
@@ -837,14 +924,15 @@ static struct msr_range msr_range_array[
static void __cpuinit print_cpu_msr(void)
{
+ unsigned index_min, index_max;
unsigned index;
u64 val;
int i;
- unsigned index_min, index_max;
for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
index_min = msr_range_array[i].min;
index_max = msr_range_array[i].max;
+
for (index = index_min; index < index_max; index++) {
if (rdmsrl_amd_safe(index, &val))
continue;
@@ -854,6 +942,7 @@ static void __cpuinit print_cpu_msr(void
}
static int show_msr __cpuinitdata;
+
static __init int setup_show_msr(char *arg)
{
int num;
@@ -875,12 +964,14 @@ __setup("noclflush", setup_noclflush);
void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
{
- char *vendor = NULL;
+ const char *vendor = NULL;
- if (c->x86_vendor < X86_VENDOR_NUM)
+ if (c->x86_vendor < X86_VENDOR_NUM) {
vendor = this_cpu->c_vendor;
- else if (c->cpuid_level >= 0)
- vendor = c->x86_vendor_id;
+ } else {
+ if (c->cpuid_level >= 0)
+ vendor = c->x86_vendor_id;
+ }
if (vendor && !strstr(c->x86_model_id, vendor))
printk(KERN_CONT "%s ", vendor);
@@ -907,87 +998,57 @@ void __cpuinit print_cpu_info(struct cpu
static __init int setup_disablecpuid(char *arg)
{
int bit;
+
if (get_option(&arg, &bit) && bit < NCAPINTS*32)
setup_clear_cpu_cap(bit);
else
return 0;
+
return 1;
}
__setup("clearcpuid=", setup_disablecpuid);
#ifdef CONFIG_X86_64
-struct x8664_pda **_cpu_pda __read_mostly;
-EXPORT_SYMBOL(_cpu_pda);
-
#ifndef CONFIG_X86_NO_IDT
struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
#endif
-static char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
+DEFINE_PER_CPU_FIRST(union irq_stack_union,
+ irq_stack_union) __aligned(PAGE_SIZE);
-static void __ref switch_pt(int cpu)
+void xen_switch_pt(void)
{
#ifdef CONFIG_XEN
- if (cpu == 0)
- xen_init_pt();
xen_pt_switch(__pa_symbol(init_level4_pgt));
xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt)));
#endif
}
-void __cpuinit pda_init(int cpu)
-{
- struct x8664_pda *pda = cpu_pda(cpu);
+DEFINE_PER_CPU(char *, irq_stack_ptr) =
+ init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
- /* Setup up data that may be needed in __get_free_pages early */
- loadsegment(fs, 0);
- loadsegment(gs, 0);
-#ifndef CONFIG_XEN
- /* Memory clobbers used to order PDA accessed */
- mb();
- wrmsrl(MSR_GS_BASE, pda);
- mb();
-#else
- if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
- (unsigned long)pda))
- BUG();
-#endif
-
- pda->cpunumber = cpu;
- pda->irqcount = -1;
- pda->kernelstack = (unsigned long)stack_thread_info() -
- PDA_STACKOFFSET + THREAD_SIZE;
- pda->active_mm = &init_mm;
- pda->mmu_state = 0;
-
- if (cpu == 0) {
- /* others are initialized in smpboot.c */
- pda->pcurrent = &init_task;
- pda->irqstackptr = boot_cpu_stack;
- pda->irqstackptr += IRQSTACKSIZE - 64;
- } else {
- if (!pda->irqstackptr) {
- pda->irqstackptr = (char *)
- __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
- if (!pda->irqstackptr)
- panic("cannot allocate irqstack for cpu %d",
- cpu);
- pda->irqstackptr += IRQSTACKSIZE - 64;
- }
+DEFINE_PER_CPU(unsigned long, kernel_stack) =
+ (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
+EXPORT_PER_CPU_SYMBOL(kernel_stack);
- if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
- pda->nodenumber = cpu_to_node(cpu);
- }
-
- switch_pt(cpu);
-}
+DEFINE_PER_CPU(unsigned int, irq_count) = -1;
#ifndef CONFIG_X86_NO_TSS
-static char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
- DEBUG_STKSZ] __page_aligned_bss;
-#endif
+/*
+ * Special IST stacks which the CPU switches to when it calls
+ * an IST-marked descriptor entry. Up to 7 stacks (hardware
+ * limit), all of them are 4K, except the debug stack which
+ * is 8K.
+ */
+static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
+ [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
+ [DEBUG_STACK - 1] = DEBUG_STKSZ
+};
-extern asmlinkage void ignore_sysret(void);
+static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+ [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ])
+ __aligned(PAGE_SIZE);
+#endif
void __cpuinit syscall_init(void)
{
@@ -1031,16 +1092,38 @@ unsigned long kernel_eflags;
DEFINE_PER_CPU(struct orig_ist, orig_ist);
#endif
-#else
+#else /* CONFIG_X86_64 */
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+DEFINE_PER_CPU(unsigned long, stack_canary);
+#endif
-/* Make sure %fs is initialized properly in idle threads */
+/* Make sure %fs and %gs are initialized properly in idle threads */
struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
{
memset(regs, 0, sizeof(struct pt_regs));
regs->fs = __KERNEL_PERCPU;
+ regs->gs = __KERNEL_STACK_CANARY;
+
return regs;
}
-#endif
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Clear all 6 debug registers:
+ */
+static void clear_all_debug_regs(void)
+{
+ int i;
+
+ for (i = 0; i < 8; i++) {
+ /* Ignore db4, db5 */
+ if ((i == 4) || (i == 5))
+ continue;
+
+ set_debugreg(0, i);
+ }
+}
/*
* cpu_init() initializes state that is per-CPU. Some data is already
@@ -1050,24 +1133,31 @@ struct pt_regs * __cpuinit idle_regs(str
* A lot of state is already set up in PDA init for 64 bit
*/
#ifdef CONFIG_X86_64
+
void __cpuinit cpu_init(void)
{
- int cpu = stack_smp_processor_id();
#ifndef CONFIG_X86_NO_TSS
- struct tss_struct *t = &per_cpu(init_tss, cpu);
- struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
+ struct orig_ist *orig_ist;
+ struct tss_struct *t;
unsigned long v;
- char *estacks = NULL;
int i;
#endif
struct task_struct *me;
+ int cpu;
+ cpu = stack_smp_processor_id();
/* CPU 0 is initialised in head64.c */
if (cpu != 0)
- pda_init(cpu);
+ xen_switch_pt();
#ifndef CONFIG_X86_NO_TSS
- else
- estacks = boot_exception_stacks;
+ t = &per_cpu(init_tss, cpu);
+ orig_ist = &per_cpu(orig_ist, cpu);
+#endif
+
+#ifdef CONFIG_NUMA
+ if (cpu != 0 && percpu_read(node_number) == 0 &&
+ cpu_to_node(cpu) != NUMA_NO_NODE)
+ percpu_write(node_number, cpu_to_node(cpu));
#endif
me = current;
@@ -1084,7 +1174,9 @@ void __cpuinit cpu_init(void)
* and set up the GDT descriptor:
*/
- switch_to_new_gdt();
+ switch_to_new_gdt(cpu);
+ loadsegment(fs, 0);
+
#ifndef CONFIG_X86_NO_IDT
load_idt((const struct desc_ptr *)&idt_descr);
#endif
@@ -1097,8 +1189,8 @@ void __cpuinit cpu_init(void)
barrier();
check_efer();
-#ifndef CONFIG_XEN
- if (cpu != 0 && x2apic)
+#ifdef CONFIG_X86_LOCAL_APIC
+ if (cpu != 0)
enable_x2apic();
#endif
@@ -1107,24 +1199,17 @@ void __cpuinit cpu_init(void)
* set up and load the per-CPU TSS
*/
if (!orig_ist->ist[0]) {
- static const unsigned int order[N_EXCEPTION_STACKS] = {
- [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
- [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
- };
+ char *estacks = per_cpu(exception_stacks, cpu);
+
for (v = 0; v < N_EXCEPTION_STACKS; v++) {
- if (cpu) {
- estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
- if (!estacks)
- panic("Cannot allocate exception "
- "stack %ld %d\n", v, cpu);
- }
- estacks += PAGE_SIZE << order[v];
+ estacks += exception_stack_sizes[v];
orig_ist->ist[v] = t->x86_tss.ist[v] =
(unsigned long)estacks;
}
}
t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+
/*
* <= is required because the CPU will access up to
* 8 bits beyond the end of the IO permission bitmap.
@@ -1135,8 +1220,7 @@ void __cpuinit cpu_init(void)
atomic_inc(&init_mm.mm_count);
me->active_mm = &init_mm;
- if (me->mm)
- BUG();
+ BUG_ON(me->mm);
enter_lazy_tlb(&init_mm, me);
load_sp0(t, &current->thread);
@@ -1155,22 +1239,9 @@ void __cpuinit cpu_init(void)
*/
if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
arch_kgdb_ops.correct_hw_break();
- else {
-#endif
- /*
- * Clear all 6 debug registers:
- */
-
- set_debugreg(0UL, 0);
- set_debugreg(0UL, 1);
- set_debugreg(0UL, 2);
- set_debugreg(0UL, 3);
- set_debugreg(0UL, 6);
- set_debugreg(0UL, 7);
-#ifdef CONFIG_KGDB
- /* If the kgdb is connected no debug regs should be altered. */
- }
+ else
#endif
+ clear_all_debug_regs();
fpu_init();
@@ -1182,8 +1253,10 @@ void __cpuinit cpu_init(void)
kernel_eflags &= ~X86_EFLAGS_IF;
#endif
+#ifdef CONFIG_X86_LOCAL_APIC
if (is_uv_system())
uv_cpu_init();
+#endif
}
#else
@@ -1199,7 +1272,8 @@ void __cpuinit cpu_init(void)
if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) {
printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
- for (;;) local_irq_enable();
+ for (;;)
+ local_irq_enable();
}
printk(KERN_INFO "Initializing CPU#%d\n", cpu);
@@ -1207,36 +1281,30 @@ void __cpuinit cpu_init(void)
if (cpu_has_vme || cpu_has_de)
clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
- switch_to_new_gdt();
+ switch_to_new_gdt(cpu);
/*
* Set up and load the per-CPU TSS and LDT
*/
atomic_inc(&init_mm.mm_count);
curr->active_mm = &init_mm;
- if (curr->mm)
- BUG();
+ BUG_ON(curr->mm);
enter_lazy_tlb(&init_mm, curr);
load_sp0(t, thread);
load_LDT(&init_mm.context);
+#ifndef CONFIG_X86_NO_TSS
+ t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+#endif
+
#ifdef CONFIG_DOUBLEFAULT
/* Set up doublefault TSS pointer in the GDT */
__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
#endif
- /* Clear %gs. */
- asm volatile ("mov %0, %%gs" : : "r" (0));
-
- /* Clear all 6 debug registers: */
- set_debugreg(0, 0);
- set_debugreg(0, 1);
- set_debugreg(0, 2);
- set_debugreg(0, 3);
- set_debugreg(0, 6);
- set_debugreg(0, 7);
+ clear_all_debug_regs();
/*
* Force FPU initialization:
@@ -1256,6 +1324,4 @@ void __cpuinit cpu_init(void)
xsave_init();
}
-
-
#endif
--- head-2011-03-17.orig/arch/x86/kernel/cpu/intel.c 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/cpu/intel.c 2011-02-01 14:44:12.000000000 +0100
@@ -96,8 +96,10 @@ static void __cpuinit early_init_intel(s
if (c->x86_power & (1 << 8)) {
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+#ifndef CONFIG_XEN
if (!check_tsc_unstable())
sched_clock_stable = 1;
+#endif
}
/*
@@ -232,9 +234,13 @@ static void __cpuinit intel_workarounds(
rdmsr(MSR_IA32_MISC_ENABLE, lo, hi);
if ((lo & MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE) == 0) {
printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n");
+#ifndef CONFIG_XEN
printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n");
lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE;
wrmsr(MSR_IA32_MISC_ENABLE, lo, hi);
+#else
+ pr_warning("CPU: Hypervisor update needed\n");
+#endif
}
}
--- head-2011-03-17.orig/arch/x86/kernel/e820-xen.c 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/e820-xen.c 2011-02-01 14:44:12.000000000 +0100
@@ -129,19 +129,50 @@ int __init e820_all_mapped(u64 start, u6
/*
* Add a memory region to the kernel e820 map.
*/
-void __init e820_add_region(u64 start, u64 size, int type)
+static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
+ int type)
{
- int x = e820.nr_map;
+ int x = e820x->nr_map;
- if (x == ARRAY_SIZE(e820.map)) {
+ if (x == ARRAY_SIZE(e820x->map)) {
printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
return;
}
- e820.map[x].addr = start;
- e820.map[x].size = size;
- e820.map[x].type = type;
- e820.nr_map++;
+ e820x->map[x].addr = start;
+ e820x->map[x].size = size;
+ e820x->map[x].type = type;
+ e820x->nr_map++;
+}
+
+void __init e820_add_region(u64 start, u64 size, int type)
+{
+ __e820_add_region(&e820, start, size, type);
+}
+
+static void __init e820_print_type(u32 type)
+{
+ switch (type) {
+ case E820_RAM:
+ case E820_RESERVED_KERN:
+ printk(KERN_CONT "(usable)");
+ break;
+ case E820_RESERVED:
+ printk(KERN_CONT "(reserved)");
+ break;
+ case E820_ACPI:
+ printk(KERN_CONT "(ACPI data)");
+ break;
+ case E820_NVS:
+ printk(KERN_CONT "(ACPI NVS)");
+ break;
+ case E820_UNUSABLE:
+ printk(KERN_CONT "(unusable)");
+ break;
+ default:
+ printk(KERN_CONT "type %u", type);
+ break;
+ }
}
static void __init _e820_print_map(const struct e820map *e820, const char *who)
@@ -153,27 +184,8 @@ static void __init _e820_print_map(const
(unsigned long long) e820->map[i].addr,
(unsigned long long)
(e820->map[i].addr + e820->map[i].size));
- switch (e820->map[i].type) {
- case E820_RAM:
- case E820_RESERVED_KERN:
- printk(KERN_CONT "(usable)\n");
- break;
- case E820_RESERVED:
- printk(KERN_CONT "(reserved)\n");
- break;
- case E820_ACPI:
- printk(KERN_CONT "(ACPI data)\n");
- break;
- case E820_NVS:
- printk(KERN_CONT "(ACPI NVS)\n");
- break;
- case E820_UNUSABLE:
- printk("(unusable)\n");
- break;
- default:
- printk(KERN_CONT "type %u\n", e820->map[i].type);
- break;
- }
+ e820_print_type(e820->map[i].type);
+ printk(KERN_CONT "\n");
}
}
@@ -240,7 +252,7 @@ static void __init _e820_print_map(const
*/
int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
- int *pnr_map)
+ u32 *pnr_map)
{
struct change_member {
struct e820entry *pbios; /* pointer to original bios entry */
@@ -444,11 +456,12 @@ static int __init append_e820_map(struct
return __append_e820_map(biosmap, nr_map);
}
-static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
+static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
u64 size, unsigned old_type,
unsigned new_type)
{
- unsigned int i, x;
+ u64 end;
+ unsigned int i;
u64 real_updated_size = 0;
BUG_ON(old_type == new_type);
@@ -456,40 +469,59 @@ static u64 __init e820_update_range_map(
if (size > (ULLONG_MAX - start))
size = ULLONG_MAX - start;
+ end = start + size;
+ printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ",
+ (unsigned long long) start,
+ (unsigned long long) end);
+ e820_print_type(old_type);
+ printk(KERN_CONT " ==> ");
+ e820_print_type(new_type);
+ printk(KERN_CONT "\n");
+
for (i = 0; i < e820x->nr_map; i++) {
struct e820entry *ei = &e820x->map[i];
u64 final_start, final_end;
+ u64 ei_end;
+
if (ei->type != old_type)
continue;
- /* totally covered? */
- if (ei->addr >= start &&
- (ei->addr + ei->size) <= (start + size)) {
+
+ ei_end = ei->addr + ei->size;
+ /* totally covered by new range? */
+ if (ei->addr >= start && ei_end <= end) {
ei->type = new_type;
real_updated_size += ei->size;
continue;
}
+
+ /* new range is totally covered? */
+ if (ei->addr < start && ei_end > end) {
+ __e820_add_region(e820x, start, size, new_type);
+ __e820_add_region(e820x, end, ei_end - end, ei->type);
+ ei->size = start - ei->addr;
+ real_updated_size += size;
+ continue;
+ }
+
/* partially covered */
final_start = max(start, ei->addr);
- final_end = min(start + size, ei->addr + ei->size);
+ final_end = min(end, ei_end);
if (final_start >= final_end)
continue;
- x = e820x->nr_map;
- if (x == ARRAY_SIZE(e820x->map)) {
- printk(KERN_ERR "Too many memory map entries!\n");
- break;
- }
- e820x->map[x].addr = final_start;
- e820x->map[x].size = final_end - final_start;
- e820x->map[x].type = new_type;
- e820x->nr_map++;
+ __e820_add_region(e820x, final_start, final_end - final_start,
+ new_type);
real_updated_size += final_end - final_start;
+ /*
+ * left range could be head or tail, so need to update
+ * size at first.
+ */
+ ei->size -= final_end - final_start;
if (ei->addr < final_start)
continue;
ei->addr = final_end;
- ei->size -= final_end - final_start;
}
return real_updated_size;
}
@@ -497,7 +529,7 @@ static u64 __init e820_update_range_map(
u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
unsigned new_type)
{
- return e820_update_range_map(&e820, start, size, old_type, new_type);
+ return __e820_update_range(&e820, start, size, old_type, new_type);
}
static u64 __init e820_update_range_saved(u64 start, u64 size,
@@ -505,11 +537,11 @@ static u64 __init e820_update_range_save
{
#ifdef CONFIG_XEN
if (is_initial_xendomain())
- return e820_update_range_map(&machine_e820,
- phys_to_machine(start), size,
- old_type, new_type);
+ return __e820_update_range(&machine_e820,
+ phys_to_machine(start), size,
+ old_type, new_type);
#endif
- return e820_update_range_map(&e820_saved, start, size, old_type,
+ return __e820_update_range(&e820_saved, start, size, old_type,
new_type);
}
@@ -553,7 +585,7 @@ u64 __init e820_remove_range(u64 start,
void __init update_e820(void)
{
- int nr_map;
+ u32 nr_map;
nr_map = e820.nr_map;
if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
@@ -564,7 +596,7 @@ void __init update_e820(void)
}
static void __init update_e820_saved(void)
{
- int nr_map;
+ u32 nr_map;
nr_map = e820_saved.nr_map;
if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map))
@@ -916,6 +948,9 @@ void __init reserve_early_overlap_ok(u64
*/
void __init reserve_early(u64 start, u64 end, char *name)
{
+ if (start >= end)
+ return;
+
drop_overlaps_that_are_ok(start, end);
__reserve_early(start, end, name, 0);
}
@@ -1389,7 +1424,7 @@ early_param("memmap", parse_memmap_opt);
void __init finish_e820_parsing(void)
{
if (userdef) {
- int nr = e820.nr_map;
+ u32 nr = e820.nr_map;
if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
early_panic("Invalid user supplied memory map");
@@ -1479,7 +1514,7 @@ void __init e820_reserve_resources_late(
char *__init default_machine_specific_memory_setup(void)
{
char *who = "BIOS-e820";
- int new_nr;
+ u32 new_nr;
/*
* Try to copy the BIOS-supplied E820-map.
*
--- head-2011-03-17.orig/arch/x86/kernel/early_printk-xen.c 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/early_printk-xen.c 2011-02-01 14:44:12.000000000 +0100
@@ -12,8 +12,8 @@
#include <asm/fcntl.h>
#include <asm/setup.h>
#include <asm/pci-direct.h>
-#include <asm/pgtable.h>
#include <asm/fixmap.h>
+#include <asm/pgtable.h>
#include <linux/usb/ehci_def.h>
#ifndef CONFIG_XEN
@@ -279,7 +279,7 @@ static int dbgp_wait_until_complete(void
return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl);
}
-static void dbgp_mdelay(int ms)
+static void __init dbgp_mdelay(int ms)
{
int i;
@@ -340,7 +340,7 @@ static void dbgp_set_data(const void *bu
writel(hi, &ehci_debug->data47);
}
-static void dbgp_get_data(void *buf, int size)
+static void __init dbgp_get_data(void *buf, int size)
{
unsigned char *bytes = buf;
u32 lo, hi;
@@ -384,7 +384,7 @@ static int dbgp_bulk_write(unsigned devn
return ret;
}
-static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
+static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
int size)
{
u32 pids, addr, ctrl;
@@ -415,8 +415,8 @@ static int dbgp_bulk_read(unsigned devnu
return ret;
}
-static int dbgp_control_msg(unsigned devnum, int requesttype, int request,
- int value, int index, void *data, int size)
+static int __init dbgp_control_msg(unsigned devnum, int requesttype,
+ int request, int value, int index, void *data, int size)
{
u32 pids, addr, ctrl;
struct usb_ctrlrequest req;
@@ -518,7 +518,7 @@ static u32 __init find_dbgp(int ehci_num
return 0;
}
-static int ehci_reset_port(int port)
+static int __init ehci_reset_port(int port)
{
u32 portsc;
u32 delay_time, delay;
@@ -561,7 +561,7 @@ static int ehci_reset_port(int port)
return -EBUSY;
}
-static int ehci_wait_for_port(int port)
+static int __init ehci_wait_for_port(int port)
{
u32 status;
int ret, reps;
@@ -586,13 +586,13 @@ static inline void dbgp_printk(const cha
typedef void (*set_debug_port_t)(int port);
-static void default_set_debug_port(int port)
+static void __init default_set_debug_port(int port)
{
}
-static set_debug_port_t set_debug_port = default_set_debug_port;
+static set_debug_port_t __initdata set_debug_port = default_set_debug_port;
-static void nvidia_set_debug_port(int port)
+static void __init nvidia_set_debug_port(int port)
{
u32 dword;
dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
--- head-2011-03-17.orig/arch/x86/kernel/entry_32-xen.S 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/entry_32-xen.S 2011-02-01 14:44:12.000000000 +0100
@@ -30,12 +30,13 @@
* 1C(%esp) - %ds
* 20(%esp) - %es
* 24(%esp) - %fs
- * 28(%esp) - orig_eax
- * 2C(%esp) - %eip
- * 30(%esp) - %cs
- * 34(%esp) - %eflags
- * 38(%esp) - %oldesp
- * 3C(%esp) - %oldss
+ * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS
+ * 2C(%esp) - orig_eax
+ * 30(%esp) - %eip
+ * 34(%esp) - %cs
+ * 38(%esp) - %eflags
+ * 3C(%esp) - %oldesp
+ * 40(%esp) - %oldss
*
* "current" is in register %ebx during any slow entries.
*/
@@ -46,7 +47,7 @@
#include <asm/errno.h>
#include <asm/segment.h>
#include <asm/smp.h>
-#include <asm/page.h>
+#include <asm/page_types.h>
#include <asm/desc.h>
#include <asm/percpu.h>
#include <asm/dwarf2.h>
@@ -105,121 +106,221 @@ NMI_MASK = 0x80000000
#define resume_userspace_sig resume_userspace
#endif
-#define SAVE_ALL \
- cld; \
- pushl %fs; \
- CFI_ADJUST_CFA_OFFSET 4;\
- /*CFI_REL_OFFSET fs, 0;*/\
- pushl %es; \
- CFI_ADJUST_CFA_OFFSET 4;\
- /*CFI_REL_OFFSET es, 0;*/\
- pushl %ds; \
- CFI_ADJUST_CFA_OFFSET 4;\
- /*CFI_REL_OFFSET ds, 0;*/\
- pushl %eax; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET eax, 0;\
- pushl %ebp; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET ebp, 0;\
- pushl %edi; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET edi, 0;\
- pushl %esi; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET esi, 0;\
- pushl %edx; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET edx, 0;\
- pushl %ecx; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET ecx, 0;\
- pushl %ebx; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET ebx, 0;\
- movl $(__USER_DS), %edx; \
- movl %edx, %ds; \
- movl %edx, %es; \
- movl $(__KERNEL_PERCPU), %edx; \
+/*
+ * User gs save/restore
+ *
+ * %gs is used for userland TLS and kernel only uses it for stack
+ * canary which is required to be at %gs:20 by gcc. Read the comment
+ * at the top of stackprotector.h for more info.
+ *
+ * Local labels 98 and 99 are used.
+ */
+#ifdef CONFIG_X86_32_LAZY_GS
+
+ /* unfortunately push/pop can't be no-op */
+.macro PUSH_GS
+ pushl $0
+ CFI_ADJUST_CFA_OFFSET 4
+.endm
+.macro POP_GS pop=0
+ addl $(4 + \pop), %esp
+ CFI_ADJUST_CFA_OFFSET -(4 + \pop)
+.endm
+.macro POP_GS_EX
+.endm
+
+ /* all the rest are no-op */
+.macro PTGS_TO_GS
+.endm
+.macro PTGS_TO_GS_EX
+.endm
+.macro GS_TO_REG reg
+.endm
+.macro REG_TO_PTGS reg
+.endm
+.macro SET_KERNEL_GS reg
+.endm
+
+#else /* CONFIG_X86_32_LAZY_GS */
+
+.macro PUSH_GS
+ pushl %gs
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET gs, 0*/
+.endm
+
+.macro POP_GS pop=0
+98: popl %gs
+ CFI_ADJUST_CFA_OFFSET -4
+ /*CFI_RESTORE gs*/
+ .if \pop <> 0
+ add $\pop, %esp
+ CFI_ADJUST_CFA_OFFSET -\pop
+ .endif
+.endm
+.macro POP_GS_EX
+.pushsection .fixup, "ax"
+99: movl $0, (%esp)
+ jmp 98b
+.section __ex_table, "a"
+ .align 4
+ .long 98b, 99b
+.popsection
+.endm
+
+.macro PTGS_TO_GS
+98: mov PT_GS(%esp), %gs
+.endm
+.macro PTGS_TO_GS_EX
+.pushsection .fixup, "ax"
+99: movl $0, PT_GS(%esp)
+ jmp 98b
+.section __ex_table, "a"
+ .align 4
+ .long 98b, 99b
+.popsection
+.endm
+
+.macro GS_TO_REG reg
+ movl %gs, \reg
+ /*CFI_REGISTER gs, \reg*/
+.endm
+.macro REG_TO_PTGS reg
+ movl \reg, PT_GS(%esp)
+ /*CFI_REL_OFFSET gs, PT_GS*/
+.endm
+.macro SET_KERNEL_GS reg
+ movl $(__KERNEL_STACK_CANARY), \reg
+ movl \reg, %gs
+.endm
+
+#endif /* CONFIG_X86_32_LAZY_GS */
+
+.macro SAVE_ALL
+ cld
+ PUSH_GS
+ pushl %fs
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET fs, 0;*/
+ pushl %es
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET es, 0;*/
+ pushl %ds
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET ds, 0;*/
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET eax, 0
+ pushl %ebp
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebp, 0
+ pushl %edi
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edi, 0
+ pushl %esi
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET esi, 0
+ pushl %edx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edx, 0
+ pushl %ecx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ecx, 0
+ pushl %ebx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebx, 0
+ movl $(__USER_DS), %edx
+ movl %edx, %ds
+ movl %edx, %es
+ movl $(__KERNEL_PERCPU), %edx
movl %edx, %fs
+ SET_KERNEL_GS %edx
+.endm
-#define RESTORE_INT_REGS \
- popl %ebx; \
- CFI_ADJUST_CFA_OFFSET -4;\
- CFI_RESTORE ebx;\
- popl %ecx; \
- CFI_ADJUST_CFA_OFFSET -4;\
- CFI_RESTORE ecx;\
- popl %edx; \
- CFI_ADJUST_CFA_OFFSET -4;\
- CFI_RESTORE edx;\
- popl %esi; \
- CFI_ADJUST_CFA_OFFSET -4;\
- CFI_RESTORE esi;\
- popl %edi; \
- CFI_ADJUST_CFA_OFFSET -4;\
- CFI_RESTORE edi;\
- popl %ebp; \
- CFI_ADJUST_CFA_OFFSET -4;\
- CFI_RESTORE ebp;\
- popl %eax; \
- CFI_ADJUST_CFA_OFFSET -4;\
+.macro RESTORE_INT_REGS
+ popl %ebx
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE ebx
+ popl %ecx
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE ecx
+ popl %edx
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE edx
+ popl %esi
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE esi
+ popl %edi
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE edi
+ popl %ebp
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE ebp
+ popl %eax
+ CFI_ADJUST_CFA_OFFSET -4
CFI_RESTORE eax
+.endm
-#define RESTORE_REGS \
- RESTORE_INT_REGS; \
-1: popl %ds; \
- CFI_ADJUST_CFA_OFFSET -4;\
- /*CFI_RESTORE ds;*/\
-2: popl %es; \
- CFI_ADJUST_CFA_OFFSET -4;\
- /*CFI_RESTORE es;*/\
-3: popl %fs; \
- CFI_ADJUST_CFA_OFFSET -4;\
- /*CFI_RESTORE fs;*/\
-.pushsection .fixup,"ax"; \
-4: movl $0,(%esp); \
- jmp 1b; \
-5: movl $0,(%esp); \
- jmp 2b; \
-6: movl $0,(%esp); \
- jmp 3b; \
-.section __ex_table,"a";\
- .align 4; \
- .long 1b,4b; \
- .long 2b,5b; \
- .long 3b,6b; \
+.macro RESTORE_REGS pop=0
+ RESTORE_INT_REGS
+1: popl %ds
+ CFI_ADJUST_CFA_OFFSET -4
+ /*CFI_RESTORE ds;*/
+2: popl %es
+ CFI_ADJUST_CFA_OFFSET -4
+ /*CFI_RESTORE es;*/
+3: popl %fs
+ CFI_ADJUST_CFA_OFFSET -4
+ /*CFI_RESTORE fs;*/
+ POP_GS \pop
+.pushsection .fixup, "ax"
+4: movl $0, (%esp)
+ jmp 1b
+5: movl $0, (%esp)
+ jmp 2b
+6: movl $0, (%esp)
+ jmp 3b
+.section __ex_table, "a"
+ .align 4
+ .long 1b, 4b
+ .long 2b, 5b
+ .long 3b, 6b
.popsection
+ POP_GS_EX
+.endm
-#define RING0_INT_FRAME \
- CFI_STARTPROC simple;\
- CFI_SIGNAL_FRAME;\
- CFI_DEF_CFA esp, 3*4;\
- /*CFI_OFFSET cs, -2*4;*/\
+.macro RING0_INT_FRAME
+ CFI_STARTPROC simple
+ CFI_SIGNAL_FRAME
+ CFI_DEF_CFA esp, 3*4
+ /*CFI_OFFSET cs, -2*4;*/
CFI_OFFSET eip, -3*4
+.endm
-#define RING0_EC_FRAME \
- CFI_STARTPROC simple;\
- CFI_SIGNAL_FRAME;\
- CFI_DEF_CFA esp, 4*4;\
- /*CFI_OFFSET cs, -2*4;*/\
+.macro RING0_EC_FRAME
+ CFI_STARTPROC simple
+ CFI_SIGNAL_FRAME
+ CFI_DEF_CFA esp, 4*4
+ /*CFI_OFFSET cs, -2*4;*/
CFI_OFFSET eip, -3*4
+.endm
-#define RING0_PTREGS_FRAME \
- CFI_STARTPROC simple;\
- CFI_SIGNAL_FRAME;\
- CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\
- /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\
- CFI_OFFSET eip, PT_EIP-PT_OLDESP;\
- /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\
- /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\
- CFI_OFFSET eax, PT_EAX-PT_OLDESP;\
- CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\
- CFI_OFFSET edi, PT_EDI-PT_OLDESP;\
- CFI_OFFSET esi, PT_ESI-PT_OLDESP;\
- CFI_OFFSET edx, PT_EDX-PT_OLDESP;\
- CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\
+.macro RING0_PTREGS_FRAME
+ CFI_STARTPROC simple
+ CFI_SIGNAL_FRAME
+ CFI_DEF_CFA esp, PT_OLDESP-PT_EBX
+ /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/
+ CFI_OFFSET eip, PT_EIP-PT_OLDESP
+ /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/
+ /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/
+ CFI_OFFSET eax, PT_EAX-PT_OLDESP
+ CFI_OFFSET ebp, PT_EBP-PT_OLDESP
+ CFI_OFFSET edi, PT_EDI-PT_OLDESP
+ CFI_OFFSET esi, PT_ESI-PT_OLDESP
+ CFI_OFFSET edx, PT_EDX-PT_OLDESP
+ CFI_OFFSET ecx, PT_ECX-PT_OLDESP
CFI_OFFSET ebx, PT_EBX-PT_OLDESP
+.endm
ENTRY(ret_from_fork)
CFI_STARTPROC
@@ -344,7 +445,8 @@ sysenter_past_esp:
.previous
GET_THREAD_INFO(%ebp)
- testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
+
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
jnz sysenter_audit
sysenter_do_call:
cmpl $(nr_syscalls), %eax
@@ -355,7 +457,7 @@ sysenter_do_call:
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
- testw $_TIF_ALLWORK_MASK, %cx
+ testl $_TIF_ALLWORK_MASK, %ecx
jne sysexit_audit
sysenter_exit:
/* if something modifies registers it must also disable sysexit */
@@ -364,11 +466,12 @@ sysenter_exit:
xorl %ebp,%ebp
TRACE_IRQS_ON
1: mov PT_FS(%esp), %fs
+ PTGS_TO_GS
ENABLE_INTERRUPTS_SYSEXIT
#ifdef CONFIG_AUDITSYSCALL
sysenter_audit:
- testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
jnz syscall_trace_entry
addl $4,%esp
CFI_ADJUST_CFA_OFFSET -4
@@ -385,7 +488,7 @@ sysenter_audit:
jmp sysenter_do_call
sysexit_audit:
- testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
+ testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
jne syscall_exit_work
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_ANY)
@@ -398,7 +501,7 @@ sysexit_audit:
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
- testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
+ testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
jne syscall_exit_work
movl PT_EAX(%esp),%eax /* reload syscall return value */
jmp sysenter_exit
@@ -412,6 +515,7 @@ sysexit_audit:
.align 4
.long 1b,2b
.popsection
+ PTGS_TO_GS_EX
ENDPROC(ia32_sysenter_target)
# pv sysenter call handler stub
@@ -447,7 +551,7 @@ ENTRY(system_call)
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
GET_THREAD_INFO(%ebp)
- testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
jnz syscall_trace_entry
cmpl $(nr_syscalls), %eax
jae syscall_badsys
@@ -461,7 +565,7 @@ syscall_exit:
# between sampling and the iret
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
- testw $_TIF_ALLWORK_MASK, %cx # current->work
+ testl $_TIF_ALLWORK_MASK, %ecx # current->work
jne syscall_exit_work
restore_all:
@@ -492,8 +596,7 @@ restore_nocheck:
#endif
TRACE_IRQS_IRET
restore_nocheck_notrace:
- RESTORE_REGS
- addl $4, %esp # skip orig_eax/error_code
+ RESTORE_REGS 4 # skip orig_eax/error_code
CFI_ADJUST_CFA_OFFSET -4
irq_return:
INTERRUPT_RETURN
@@ -555,9 +658,7 @@ restore_all_enable_events:
scrit: /**** START OF CRITICAL REGION ****/
__TEST_PENDING
jnz 14f # process more events if necessary...
- RESTORE_REGS
- addl $4, %esp
- CFI_ADJUST_CFA_OFFSET -4
+ RESTORE_REGS 4
1: INTERRUPT_RETURN
.section __ex_table,"a"
.align 4
@@ -571,9 +672,7 @@ ecrit: /**** END OF CRITICAL REGION ***
CFI_RESTORE_STATE
hypervisor_iret:
andl $~NMI_MASK, PT_EFLAGS(%esp)
- RESTORE_REGS
- addl $4, %esp
- CFI_ADJUST_CFA_OFFSET -4
+ RESTORE_REGS 4
jmp hypercall_page + (__HYPERVISOR_iret * 32)
#endif
CFI_ENDPROC
@@ -641,7 +740,7 @@ END(syscall_trace_entry)
# perform syscall exit tracing
ALIGN
syscall_exit_work:
- testb $_TIF_WORK_SYSCALL_EXIT, %cl
+ testl $_TIF_WORK_SYSCALL_EXIT, %ecx
jz work_pending
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
@@ -665,29 +764,51 @@ syscall_badsys:
END(syscall_badsys)
CFI_ENDPROC
+/*
+ * System calls that need a pt_regs pointer.
+ */
+#define PTREGSCALL(name) \
+ ALIGN; \
+ptregs_##name: \
+ leal 4(%esp),%eax; \
+ jmp sys_##name;
+
+PTREGSCALL(iopl)
+PTREGSCALL(fork)
+PTREGSCALL(clone)
+PTREGSCALL(vfork)
+PTREGSCALL(execve)
+PTREGSCALL(sigaltstack)
+PTREGSCALL(sigreturn)
+PTREGSCALL(rt_sigreturn)
+PTREGSCALL(vm86)
+PTREGSCALL(vm86old)
+
#ifndef CONFIG_XEN
-#define FIXUP_ESPFIX_STACK \
- /* since we are on a wrong stack, we cant make it a C code :( */ \
- PER_CPU(gdt_page, %ebx); \
- GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
- addl %esp, %eax; \
- pushl $__KERNEL_DS; \
- CFI_ADJUST_CFA_OFFSET 4; \
- pushl %eax; \
- CFI_ADJUST_CFA_OFFSET 4; \
- lss (%esp), %esp; \
- CFI_ADJUST_CFA_OFFSET -8;
-#define UNWIND_ESPFIX_STACK \
- movl %ss, %eax; \
- /* see if on espfix stack */ \
- cmpw $__ESPFIX_SS, %ax; \
- jne 27f; \
- movl $__KERNEL_DS, %eax; \
- movl %eax, %ds; \
- movl %eax, %es; \
- /* switch to normal stack */ \
- FIXUP_ESPFIX_STACK; \
-27:;
+.macro FIXUP_ESPFIX_STACK
+ /* since we are on a wrong stack, we cant make it a C code :( */
+ PER_CPU(gdt_page, %ebx)
+ GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
+ addl %esp, %eax
+ pushl $__KERNEL_DS
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ lss (%esp), %esp
+ CFI_ADJUST_CFA_OFFSET -8
+.endm
+.macro UNWIND_ESPFIX_STACK
+ movl %ss, %eax
+ /* see if on espfix stack */
+ cmpw $__ESPFIX_SS, %ax
+ jne 27f
+ movl $__KERNEL_DS, %eax
+ movl %eax, %ds
+ movl %eax, %es
+ /* switch to normal stack */
+ FIXUP_ESPFIX_STACK
+27:
+.endm
/*
* Build the entry stubs and pointer table with some assembler magic.
@@ -743,7 +864,7 @@ common_interrupt:
ENDPROC(common_interrupt)
CFI_ENDPROC
-#define BUILD_INTERRUPT(name, nr) \
+#define BUILD_INTERRUPT3(name, nr, fn) \
ENTRY(name) \
RING0_INT_FRAME; \
pushl $~(nr); \
@@ -751,13 +872,15 @@ ENTRY(name) \
SAVE_ALL; \
TRACE_IRQS_OFF \
movl %esp,%eax; \
- call smp_##name; \
+ call fn; \
jmp ret_from_intr; \
CFI_ENDPROC; \
ENDPROC(name)
+#define BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(name, nr, smp_##name)
+
/* The include is where all of the SMP etc. interrupts come from */
-#include "entry_arch.h"
+#include <asm/entry_arch.h>
#else
#define UNWIND_ESPFIX_STACK
@@ -844,8 +967,13 @@ critical_fixup_table:
.byte 7 # pop %ds
.byte 8 # pop %es
.byte 9,9 # pop %fs
- .byte 10,10,10 # add $4,%esp
- .byte 11 # iret
+#ifndef CONFIG_X86_32_LAZY_GS
+ .byte 10,10 # pop %gs
+ .byte 11,11,11 # add $4,%esp
+#else
+ .byte 10,10,10 # add $8,%esp
+#endif
+ .byte 12 # iret
.byte -1,-1,-1,-1 # movb $1,1(%esi) = __DISABLE_INTERRUPTS
.previous
@@ -1203,7 +1331,7 @@ ENTRY(ia32pv_cstar_target)
.previous
SAVE_ALL
GET_THREAD_INFO(%ebp)
- testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
jnz cstar_trace_entry
cmpl $nr_syscalls,%eax
jae cstar_badsys
@@ -1323,7 +1451,10 @@ ENTRY(page_fault)
CFI_ADJUST_CFA_OFFSET 4
ALIGN
error_code:
- /* the function address is in %fs's slot on the stack */
+ /* the function address is in %gs's slot on the stack */
+ pushl %fs
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET fs, 0*/
pushl %es
CFI_ADJUST_CFA_OFFSET 4
/*CFI_REL_OFFSET es, 0*/
@@ -1352,20 +1483,15 @@ error_code:
CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET ebx, 0
cld
- pushl %fs
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET fs, 0*/
movl $(__KERNEL_PERCPU), %ecx
movl %ecx, %fs
UNWIND_ESPFIX_STACK
- popl %ecx
- CFI_ADJUST_CFA_OFFSET -4
- /*CFI_REGISTER es, ecx*/
- movl PT_FS(%esp), %edi # get the function address
+ GS_TO_REG %ecx
+ movl PT_GS(%esp), %edi # get the function address
movl PT_ORIG_EAX(%esp), %edx # get the error code
movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
- mov %ecx, PT_FS(%esp)
- /*CFI_REL_OFFSET fs, ES*/
+ REG_TO_PTGS %ecx
+ SET_KERNEL_GS %ecx
movl $(__USER_DS), %ecx
movl %ecx, %ds
movl %ecx, %es
@@ -1390,20 +1516,21 @@ END(page_fault)
* by hand onto the new stack - while updating the return eip past
* the instruction that would have done it for sysenter.
*/
-#define FIX_STACK(offset, ok, label) \
- cmpw $__KERNEL_CS,4(%esp); \
- jne ok; \
-label: \
- movl TSS_sysenter_sp0+offset(%esp),%esp; \
- CFI_DEF_CFA esp, 0; \
- CFI_UNDEFINED eip; \
- pushfl; \
- CFI_ADJUST_CFA_OFFSET 4; \
- pushl $__KERNEL_CS; \
- CFI_ADJUST_CFA_OFFSET 4; \
- pushl $sysenter_past_esp; \
- CFI_ADJUST_CFA_OFFSET 4; \
+.macro FIX_STACK offset ok label
+ cmpw $__KERNEL_CS, 4(%esp)
+ jne \ok
+\label:
+ movl TSS_sysenter_sp0 + \offset(%esp), %esp
+ CFI_DEF_CFA esp, 0
+ CFI_UNDEFINED eip
+ pushfl
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl $__KERNEL_CS
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl $sysenter_past_esp
+ CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET eip, 0
+.endm
#endif /* CONFIG_XEN */
ENTRY(debug)
@@ -1411,7 +1538,7 @@ ENTRY(debug)
#ifndef CONFIG_XEN
cmpl $ia32_sysenter_target,(%esp)
jne debug_stack_correct
- FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
+ FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
debug_stack_correct:
#endif /* !CONFIG_XEN */
pushl $-1 # mark this as an int
@@ -1471,7 +1598,7 @@ nmi_stack_correct:
nmi_stack_fixup:
RING0_INT_FRAME
- FIX_STACK(12,nmi_stack_correct, 1)
+ FIX_STACK 12, nmi_stack_correct, 1
jmp nmi_stack_correct
nmi_debug_stack_check:
@@ -1482,7 +1609,7 @@ nmi_debug_stack_check:
jb nmi_stack_correct
cmpl $debug_esp_fix_insn,(%esp)
ja nmi_stack_correct
- FIX_STACK(24,nmi_stack_correct, 1)
+ FIX_STACK 24, nmi_stack_correct, 1
jmp nmi_stack_correct
nmi_espfix_stack:
@@ -1494,7 +1621,7 @@ nmi_espfix_stack:
CFI_ADJUST_CFA_OFFSET 4
pushl %esp
CFI_ADJUST_CFA_OFFSET 4
- addw $4, (%esp)
+ addl $4, (%esp)
/* copy the iret frame of 12 bytes */
.rept 3
pushl 16(%esp)
--- head-2011-03-17.orig/arch/x86/kernel/entry_64-xen.S 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/entry_64-xen.S 2011-02-01 14:44:12.000000000 +0100
@@ -51,10 +51,10 @@
#include <asm/unistd.h>
#include <asm/thread_info.h>
#include <asm/hw_irq.h>
-#include <asm/page.h>
+#include <asm/page_types.h>
#include <asm/irqflags.h>
#include <asm/ftrace.h>
-#include <asm/errno.h>
+#include <asm/percpu.h>
#include <xen/interface/xen.h>
#include <xen/interface/features.h>
@@ -81,20 +81,17 @@ ENTRY(ftrace_caller)
movq 8(%rbp), %rsi
subq $MCOUNT_INSN_SIZE, %rdi
-.globl ftrace_call
-ftrace_call:
+GLOBAL(ftrace_call)
call ftrace_stub
MCOUNT_RESTORE_FRAME
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-.globl ftrace_graph_call
-ftrace_graph_call:
+GLOBAL(ftrace_graph_call)
jmp ftrace_stub
#endif
-.globl ftrace_stub
-ftrace_stub:
+GLOBAL(ftrace_stub)
retq
END(ftrace_caller)
@@ -114,8 +111,7 @@ ENTRY(mcount)
jnz ftrace_graph_caller
#endif
-.globl ftrace_stub
-ftrace_stub:
+GLOBAL(ftrace_stub)
retq
trace:
@@ -152,9 +148,7 @@ ENTRY(ftrace_graph_caller)
retq
END(ftrace_graph_caller)
-
-.globl return_to_handler
-return_to_handler:
+GLOBAL(return_to_handler)
subq $80, %rsp
movq %rax, (%rsp)
@@ -369,15 +363,15 @@ ENTRY(save_args)
je 1f
SWAPGS
/*
- * irqcount is used to check if a CPU is already on an interrupt stack
+ * irq_count is used to check if a CPU is already on an interrupt stack
* or not. While this is essentially redundant with preempt_count it is
* a little cheaper to use a separate counter in the PDA (short of
* moving irq_enter into assembly, which would be too much work)
*/
-1: incl %gs:pda_irqcount
+1: incl PER_CPU_VAR(irq_count)
jne 2f
popq_cfi %rax /* move return address... */
- mov %gs:pda_irqstackptr,%rsp
+ mov PER_CPU_VAR(irq_stack_ptr),%rsp
EMPTY_FRAME 0
pushq_cfi %rbp /* backlink for unwinder */
pushq_cfi %rax /* ... to the new stack */
@@ -407,6 +401,7 @@ END(save_rest)
#ifndef CONFIG_XEN
/* save complete stack frame */
+ .pushsection .kprobes.text, "ax"
ENTRY(save_paranoid)
XCPT_FRAME 1 RDI+8
cld
@@ -435,6 +430,7 @@ ENTRY(save_paranoid)
1: ret
CFI_ENDPROC
END(save_paranoid)
+ .popsection
#endif
/*
@@ -445,6 +441,8 @@ END(save_paranoid)
ENTRY(ret_from_fork)
DEFAULT_FRAME
+ LOCK ; btr $TIF_FORK,TI_flags(%r8)
+
push kernel_eflags(%rip)
CFI_ADJUST_CFA_OFFSET 8
popf # reset kernel eflags
@@ -454,7 +452,6 @@ ENTRY(ret_from_fork)
GET_THREAD_INFO(%rcx)
- CFI_REMEMBER_STATE
RESTORE_REST
testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
@@ -466,7 +463,6 @@ ENTRY(ret_from_fork)
RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
jmp ret_from_sys_call # go to the SYSRET fastpath
- CFI_RESTORE_STATE
CFI_ENDPROC
END(ret_from_fork)
@@ -642,9 +638,7 @@ tracesys:
* Syscall return path ending with IRET.
* Has correct top of stack, but partial stack frame.
*/
- .globl int_ret_from_sys_call
- .globl int_with_check
-int_ret_from_sys_call:
+GLOBAL(int_ret_from_sys_call)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
testb $3,CS-ARGOFFSET(%rsp)
@@ -655,7 +649,7 @@ int_ret_from_sys_call:
1:
movl $_TIF_ALLWORK_MASK,%edi
/* edi: mask to check */
-int_with_check:
+GLOBAL(int_with_check)
LOCKDEP_SYS_EXIT_IRQ
GET_THREAD_INFO(%rcx)
movl TI_flags(%rcx),%edx
@@ -877,10 +871,14 @@ apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
#endif
+#ifdef CONFIG_X86_UV
apicinterrupt UV_BAU_MESSAGE \
uv_bau_message_intr1 uv_bau_message_interrupt
+#endif
apicinterrupt LOCAL_TIMER_VECTOR \
apic_timer_interrupt smp_apic_timer_interrupt
+apicinterrupt GENERIC_INTERRUPT_VECTOR \
+ generic_interrupt smp_generic_interrupt
#ifdef CONFIG_SMP
apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
@@ -998,15 +996,15 @@ ENTRY(do_hypervisor_callback) # do_hyp
movq %rdi, %rsp # we don't return, adjust the stack frame
CFI_ENDPROC
DEFAULT_FRAME
-11: incl %gs:pda_irqcount
+11: incl PER_CPU_VAR(irq_count)
movq %rsp,%rbp
CFI_DEF_CFA_REGISTER rbp
- cmovzq %gs:pda_irqstackptr,%rsp
+ cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
pushq %rbp # backlink for old unwinder
call evtchn_do_upcall
popq %rsp
CFI_DEF_CFA_REGISTER rsp
- decl %gs:pda_irqcount
+ decl PER_CPU_VAR(irq_count)
jmp error_exit
CFI_ENDPROC
END(do_hypervisor_callback)
@@ -1197,14 +1195,14 @@ ENTRY(call_softirq)
CFI_REL_OFFSET rbp,0
mov %rsp,%rbp
CFI_DEF_CFA_REGISTER rbp
- incl %gs:pda_irqcount
- cmove %gs:pda_irqstackptr,%rsp
+ incl PER_CPU_VAR(irq_count)
+ cmove PER_CPU_VAR(irq_stack_ptr),%rsp
push %rbp # backlink for old unwinder
call __do_softirq
leaveq
CFI_DEF_CFA_REGISTER rsp
CFI_ADJUST_CFA_OFFSET -8
- decl %gs:pda_irqcount
+ decl PER_CPU_VAR(irq_count)
ret
CFI_ENDPROC
END(call_softirq)
@@ -1250,7 +1248,10 @@ ENTRY(paranoid_exit)
paranoid_swapgs:
TRACE_IRQS_IRETQ 0
SWAPGS_UNSAFE_STACK
+ RESTORE_ALL 8
+ jmp irq_return
paranoid_restore:
+ TRACE_IRQS_IRETQ 0
RESTORE_ALL 8
jmp irq_return
paranoid_userspace:
--- head-2011-03-17.orig/arch/x86/kernel/head-xen.c 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/head-xen.c 2011-02-01 14:44:12.000000000 +0100
@@ -2,6 +2,7 @@
#include <linux/init.h>
#include <asm/setup.h>
+#ifndef CONFIG_XEN
#include <asm/bios_ebda.h>
#define BIOS_LOWMEM_KILOBYTES 0x413
@@ -18,7 +19,6 @@
*/
void __init reserve_ebda_region(void)
{
-#ifndef CONFIG_XEN
unsigned int lowmem, ebda_addr;
/* To determine the position of the EBDA and the */
@@ -53,5 +53,158 @@ void __init reserve_ebda_region(void)
/* reserve all memory between lowmem and the 1MB mark */
reserve_early_overlap_ok(lowmem, 0x100000, "BIOS reserved");
+}
+#else /* CONFIG_XEN */
+#include <linux/module.h>
+#include <asm/fixmap.h>
+#include <asm/pgtable.h>
+#include <asm/sections.h>
+#include <asm/setup_arch.h>
+#include <xen/interface/callback.h>
+#include <xen/interface/memory.h>
+
+extern void hypervisor_callback(void);
+extern void failsafe_callback(void);
+extern void nmi(void);
+
+#ifdef CONFIG_X86_64
+#include <asm/proto.h>
+#define CALLBACK_ADDR(fn) ((unsigned long)(fn))
+#else
+#define CALLBACK_ADDR(fn) { __KERNEL_CS, (unsigned long)(fn) }
+#endif
+
+unsigned long *__read_mostly machine_to_phys_mapping =
+ (void *)MACH2PHYS_VIRT_START;
+EXPORT_SYMBOL(machine_to_phys_mapping);
+unsigned int __read_mostly machine_to_phys_order;
+EXPORT_SYMBOL(machine_to_phys_order);
+
+void __init xen_start_kernel(void)
+{
+ unsigned int i;
+ struct xen_machphys_mapping mapping;
+ unsigned long machine_to_phys_nr_ents;
+
+ xen_setup_features();
+
+ if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
+ machine_to_phys_mapping = (unsigned long *)mapping.v_start;
+ machine_to_phys_nr_ents = mapping.max_mfn + 1;
+ } else
+ machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
+ while ((1UL << machine_to_phys_order) < machine_to_phys_nr_ents )
+ machine_to_phys_order++;
+
+ if (!xen_feature(XENFEAT_auto_translated_physmap))
+ phys_to_machine_mapping =
+ (unsigned long *)xen_start_info->mfn_list;
+
+ WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
+ VMASST_TYPE_writable_pagetables));
+
+ reserve_early(ALIGN(__pa_symbol(&_end), PAGE_SIZE),
+ __pa(xen_start_info->pt_base)
+ + (xen_start_info->nr_pt_frames << PAGE_SHIFT),
+ "Xen provided");
+
+#ifdef CONFIG_X86_32
+{
+ extern pte_t swapper_pg_fixmap[PTRS_PER_PTE];
+ unsigned long addr;
+
+ /* Do an early initialization of the fixmap area */
+ make_lowmem_page_readonly(swapper_pg_fixmap, XENFEAT_writable_page_tables);
+ addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
+ set_pmd(pmd_offset(pud_offset(swapper_pg_dir + pgd_index(addr),
+ addr),
+ addr),
+ __pmd(__pa_symbol(swapper_pg_fixmap) | _PAGE_TABLE));
+}
+#else
+ check_efer();
+ xen_init_pt();
+#endif
+
+#define __FIXADDR_TOP (-PAGE_SIZE)
+#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
+#define FIX_BUG_ON(fix) BUILD_BUG_ON(pmd_index(__fix_to_virt(FIX_##fix)) \
+ != pmd_index(__fix_to_virt(FIX_EARLYCON_MEM_BASE)))
+ FIX_BUG_ON(SHARED_INFO);
+ FIX_BUG_ON(ISAMAP_BEGIN);
+ FIX_BUG_ON(ISAMAP_END);
+#undef pmd_index
+#undef __FIXADDR_TOP
+
+ /* Switch to the real shared_info page, and clear the dummy page. */
+ set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
+ HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
+ clear_page(empty_zero_page);
+
+ /* Set up mapping of lowest 1MB of physical memory. */
+ for (i = 0; i < NR_FIX_ISAMAPS; i++)
+ if (is_initial_xendomain())
+ set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
+ else
+ __set_fixmap(FIX_ISAMAP_BEGIN - i,
+ virt_to_machine(empty_zero_page),
+ PAGE_KERNEL_RO);
+
+}
+
+void __init machine_specific_arch_setup(void)
+{
+ int ret;
+ static const struct callback_register __initconst event = {
+ .type = CALLBACKTYPE_event,
+ .address = CALLBACK_ADDR(hypervisor_callback)
+ };
+ static const struct callback_register __initconst failsafe = {
+ .type = CALLBACKTYPE_failsafe,
+ .address = CALLBACK_ADDR(failsafe_callback)
+ };
+#ifdef CONFIG_X86_64
+ static const struct callback_register __initconst syscall = {
+ .type = CALLBACKTYPE_syscall,
+ .address = CALLBACK_ADDR(system_call)
+ };
+#endif
+ static const struct callback_register __initconst nmi_cb = {
+ .type = CALLBACKTYPE_nmi,
+ .address = CALLBACK_ADDR(nmi)
+ };
+
+ ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
+ if (ret == 0)
+ ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
+#ifdef CONFIG_X86_64
+ if (ret == 0)
+ ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
+#endif
+#if CONFIG_XEN_COMPAT <= 0x030002
+#ifdef CONFIG_X86_32
+ if (ret == -ENOSYS)
+ ret = HYPERVISOR_set_callbacks(
+ event.address.cs, event.address.eip,
+ failsafe.address.cs, failsafe.address.eip);
+#else
+ ret = HYPERVISOR_set_callbacks(
+ event.address,
+ failsafe.address,
+ syscall.address);
+#endif
+#endif
+ BUG_ON(ret);
+
+ ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (ret == -ENOSYS) {
+ static struct xennmi_callback __initdata cb = {
+ .handler_address = (unsigned long)nmi
+ };
+
+ HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
+ }
#endif
}
+#endif /* CONFIG_XEN */
--- head-2011-03-17.orig/arch/x86/kernel/head32-xen.c 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/head32-xen.c 2011-02-01 14:44:12.000000000 +0100
@@ -9,6 +9,7 @@
#include <linux/start_kernel.h>
#include <asm/setup.h>
+#include <asm/setup_arch.h>
#include <asm/sections.h>
#include <asm/e820.h>
#include <asm/bios_ebda.h>
@@ -16,9 +17,25 @@
void __init i386_start_kernel(void)
{
+#ifdef CONFIG_XEN
+ struct xen_platform_parameters pp;
+
+ WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
+ VMASST_TYPE_4gb_segments));
+
+ init_mm.pgd = swapper_pg_dir = (pgd_t *)xen_start_info->pt_base;
+
+ if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) {
+ hypervisor_virt_start = pp.virt_start;
+ reserve_top_address(0UL - pp.virt_start);
+ }
+
+ BUG_ON(pte_index(hypervisor_virt_start));
+#endif
+
reserve_trampoline_memory();
- reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
+ reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
#ifndef CONFIG_XEN
#ifdef CONFIG_BLK_DEV_INITRD
@@ -30,14 +47,8 @@ void __init i386_start_kernel(void)
reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
}
#endif
- reserve_early(init_pg_tables_start, init_pg_tables_end,
- "INIT_PG_TABLE");
+ reserve_ebda_region();
#else
- reserve_early(ALIGN(__pa_symbol(&_end), PAGE_SIZE),
- __pa(xen_start_info->pt_base)
- + (xen_start_info->nr_pt_frames << PAGE_SHIFT),
- "Xen provided");
-
{
int max_cmdline;
@@ -46,9 +57,9 @@ void __init i386_start_kernel(void)
memcpy(boot_command_line, xen_start_info->cmd_line, max_cmdline);
boot_command_line[max_cmdline-1] = '\0';
}
-#endif
- reserve_ebda_region();
+ xen_start_kernel();
+#endif
/*
* At this point everything still needed from the boot loader
--- head-2011-03-17.orig/arch/x86/kernel/head64-xen.c 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/head64-xen.c 2011-02-01 14:44:12.000000000 +0100
@@ -7,9 +7,6 @@
* Modified for Xen.
*/
-/* PDA is not ready to be used until the end of x86_64_start_kernel(). */
-#define arch_use_lazy_mmu_mode() false
-
#include <linux/init.h>
#include <linux/linkage.h>
#include <linux/types.h>
@@ -18,12 +15,12 @@
#include <linux/percpu.h>
#include <linux/start_kernel.h>
#include <linux/io.h>
-#include <linux/module.h>
#include <asm/processor.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/setup.h>
+#include <asm/setup_arch.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
@@ -33,27 +30,6 @@
#include <asm/bios_ebda.h>
#include <asm/trampoline.h>
-/* boot cpu pda */
-static struct x8664_pda _boot_cpu_pda;
-
-#ifdef CONFIG_SMP
-/*
- * We install an empty cpu_pda pointer table to indicate to early users
- * (numa_set_node) that the cpu_pda pointer table for cpus other than
- * the boot cpu is not yet setup.
- */
-static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
-#else
-static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
-#endif
-
-void __init x86_64_init_pda(void)
-{
- _cpu_pda = __cpu_pda;
- cpu_pda(0) = &_boot_cpu_pda;
- pda_init(0);
-}
-
#ifndef CONFIG_XEN
static void __init zap_identity_mappings(void)
{
@@ -92,16 +68,9 @@ static void __init copy_bootdata(char *r
}
#include <xen/interface/memory.h>
-unsigned long *machine_to_phys_mapping;
-EXPORT_SYMBOL(machine_to_phys_mapping);
-unsigned int machine_to_phys_order;
-EXPORT_SYMBOL(machine_to_phys_order);
void __init x86_64_start_kernel(char * real_mode_data)
{
- struct xen_machphys_mapping mapping;
- unsigned long machine_to_phys_nr_ents;
-
/*
* Build-time sanity checks on the kernel image and module
* area mappings. (these are purely build-time and produce no code)
@@ -116,21 +85,8 @@ void __init x86_64_start_kernel(char * r
(__START_KERNEL & PGDIR_MASK)));
BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
- xen_setup_features();
-
xen_start_info = (struct start_info *)real_mode_data;
- if (!xen_feature(XENFEAT_auto_translated_physmap))
- phys_to_machine_mapping =
- (unsigned long *)xen_start_info->mfn_list;
-
- machine_to_phys_mapping = (unsigned long *)MACH2PHYS_VIRT_START;
- machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
- if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
- machine_to_phys_mapping = (unsigned long *)mapping.v_start;
- machine_to_phys_nr_ents = mapping.max_mfn + 1;
- }
- while ((1UL << machine_to_phys_order) < machine_to_phys_nr_ents )
- machine_to_phys_order++;
+ xen_start_kernel();
#ifndef CONFIG_XEN
/* clear bss before set_intr_gate with early_idt_handler */
@@ -155,7 +111,7 @@ void __init x86_64_start_kernel(char * r
if (console_loglevel == 10)
early_printk("Kernel alive\n");
- x86_64_init_pda();
+ xen_switch_pt();
x86_64_start_reservations(real_mode_data);
}
@@ -166,12 +122,7 @@ void __init x86_64_start_reservations(ch
reserve_trampoline_memory();
- reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
-
- reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE),
- __pa(xen_start_info->pt_base)
- + (xen_start_info->nr_pt_frames << PAGE_SHIFT),
- "Xen provided");
+ reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
/*
* At this point everything still needed from the boot loader
--- head-2011-03-17.orig/arch/x86/kernel/head_32-xen.S 2011-03-03 16:19:21.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/head_32-xen.S 2011-03-03 16:23:08.000000000 +0100
@@ -6,12 +6,14 @@
#include <linux/init.h>
#include <linux/linkage.h>
#include <asm/segment.h>
-#include <asm/page.h>
+#include <asm/page_types.h>
+#include <asm/pgtable_types.h>
#include <asm/cache.h>
#include <asm/thread_info.h>
#include <asm/asm-offsets.h>
#include <asm/boot.h>
#include <asm/dwarf2.h>
+#include <asm/percpu.h>
#include <xen/interface/xen.h>
#include <xen/interface/elfnote.h>
@@ -38,9 +40,6 @@ ENTRY(startup_32)
/* Set up the stack pointer */
movl $(init_thread_union+THREAD_SIZE),%esp
- movl %ss,%eax
- movl %eax,%fs # gets reset once there's real percpu
-
/* get vendor info */
xorl %eax,%eax # call CPUID with 0 -> return vendor ID
XEN_CPUID
@@ -61,7 +60,49 @@ ENTRY(startup_32)
movb %cl,X86_MASK
movl %edx,X86_CAPABILITY
- xorl %eax,%eax # Clear GS
+#ifdef CONFIG_CC_STACKPROTECTOR
+ /*
+ * The linker can't handle this by relocation. Manually set
+ * base address in stack canary segment descriptor.
+ */
+ movl $per_cpu__gdt_page,%eax
+ movl $per_cpu__stack_canary,%ecx
+ subl $20, %ecx
+ movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
+ shrl $16, %ecx
+ movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
+ movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax)
+#endif
+
+ # %esi still points to start_info, and no registers
+ # need to be preserved.
+
+ movl XEN_START_mfn_list(%esi), %ebx
+ movl $(per_cpu__gdt_page - __PAGE_OFFSET), %eax
+ shrl $PAGE_SHIFT, %eax
+ movl (%ebx,%eax,4), %ecx
+ pushl %ecx # frame number for set_gdt below
+
+ xorl %esi, %esi
+ xorl %edx, %edx
+ shldl $PAGE_SHIFT, %ecx, %edx
+ shll $PAGE_SHIFT, %ecx
+ orl $_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_DIRTY, %ecx
+ movl $per_cpu__gdt_page, %ebx
+ movl $__HYPERVISOR_update_va_mapping, %eax
+ int $0x82
+
+ movl $(PAGE_SIZE_asm / 8), %ecx
+ movl %esp, %ebx
+ movl $__HYPERVISOR_set_gdt, %eax
+ int $0x82
+
+ popl %ecx
+
+ movl $(__KERNEL_PERCPU), %eax
+ movl %eax,%fs # set this cpu's percpu
+
+ movl $(__KERNEL_STACK_CANARY),%eax
movl %eax,%gs
cld # gcc2 wants the direction flag cleared at all times
--- head-2011-03-17.orig/arch/x86/kernel/head_64-xen.S 2011-02-01 14:38:38.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/head_64-xen.S 2011-02-01 14:44:12.000000000 +0100
@@ -21,6 +21,7 @@
#include <asm/msr.h>
#include <asm/cache.h>
#include <asm/dwarf2.h>
+#include <asm/percpu.h>
#include <xen/interface/elfnote.h>
.section .text.head, "ax", @progbits
@@ -32,11 +33,23 @@ startup_64:
/* rsi is pointer to startup info structure.
pass it to C */
movq %rsi,%rdi
+
+ /* Set up %gs.
+ *
+ * The base of %gs always points to the bottom of the irqstack
+ * union. If the stack protector canary is enabled, it is
+ * located at %gs:40. Note that, on SMP, the boot cpu uses
+ * init data section till per cpu areas are set up.
+ */
+ movl $MSR_GS_BASE,%ecx
+ movq $INIT_PER_CPU_VAR(irq_stack_union),%rax
+ movq %rax,%rdx
+ shrq $32,%rdx
+ wrmsr
+
pushq $0 # fake return address
jmp x86_64_start_kernel
-.balign PAGE_SIZE
-
#define NEXT_PAGE(name) \
.balign PAGE_SIZE; \
phys_##name = . - .text.head; \
--- head-2011-03-17.orig/arch/x86/kernel/ioport-xen.c 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/ioport-xen.c 2011-02-01 14:44:12.000000000 +0100
@@ -91,9 +91,8 @@ static int do_iopl(unsigned int level, s
}
#ifdef CONFIG_X86_32
-asmlinkage long sys_iopl(unsigned long regsp)
+long sys_iopl(struct pt_regs *regs)
{
- struct pt_regs *regs = (struct pt_regs *)&regsp;
unsigned int level = regs->bx;
#else
asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
--- head-2011-03-17.orig/arch/x86/kernel/irq-xen.c 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/irq-xen.c 2011-02-01 14:44:12.000000000 +0100
@@ -6,13 +6,20 @@
#include <linux/kernel_stat.h>
#include <linux/seq_file.h>
#include <linux/smp.h>
+#include <linux/ftrace.h>
#include <asm/apic.h>
#include <asm/io_apic.h>
#include <asm/irq.h>
+#include <asm/idle.h>
atomic_t irq_err_count;
+#ifndef CONFIG_XEN
+/* Function pointer for generic interrupt vector handling */
+void (*generic_interrupt_extension)(void) = NULL;
+#endif
+
/*
* 'what should we do if we get a hw irq event on an illegal vector'.
* each architecture has to answer this themselves.
@@ -36,11 +43,7 @@ void ack_bad_irq(unsigned int irq)
#endif
}
-#ifdef CONFIG_X86_32
-# define irq_stats(x) (&per_cpu(irq_stat, x))
-#else
-# define irq_stats(x) cpu_pda(x)
-#endif
+#define irq_stats(x) (&per_cpu(irq_stat, x))
/*
* /proc/interrupts printing:
*/
@@ -57,6 +60,19 @@ static int show_other_interrupts(struct
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
seq_printf(p, " Local timer interrupts\n");
+
+ seq_printf(p, "%*s: ", prec, "SPU");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
+ seq_printf(p, " Spurious interrupts\n");
+#endif
+#ifndef CONFIG_XEN
+ if (generic_interrupt_extension) {
+ seq_printf(p, "%*s: ", prec, "PLT");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->generic_irqs);
+ seq_printf(p, " Platform interrupts\n");
+ }
#endif
#ifdef CONFIG_SMP
seq_printf(p, "%*s: ", prec, "RES");
@@ -91,12 +107,6 @@ static int show_other_interrupts(struct
seq_printf(p, " Threshold APIC interrupts\n");
# endif
#endif
-#ifdef CONFIG_X86_LOCAL_APIC
- seq_printf(p, "%*s: ", prec, "SPU");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
- seq_printf(p, " Spurious interrupts\n");
-#endif
seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
#if defined(CONFIG_X86_IO_APIC)
seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
@@ -133,23 +143,15 @@ int show_interrupts(struct seq_file *p,
return 0;
spin_lock_irqsave(&desc->lock, flags);
-#ifndef CONFIG_SMP
- any_count = kstat_irqs(i);
-#else
for_each_online_cpu(j)
any_count |= kstat_irqs_cpu(i, j);
-#endif
action = desc->action;
if (!action && !any_count)
goto out;
seq_printf(p, "%*d: ", prec, i);
-#ifndef CONFIG_SMP
- seq_printf(p, "%10u ", kstat_irqs(i));
-#else
for_each_online_cpu(j)
seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
-#endif
seq_printf(p, " %8s", desc->chip->name);
seq_printf(p, "-%-8s", desc->name);
@@ -174,6 +176,11 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
#ifdef CONFIG_X86_LOCAL_APIC
sum += irq_stats(cpu)->apic_timer_irqs;
+ sum += irq_stats(cpu)->irq_spurious_count;
+#endif
+#ifndef CONFIG_XEN
+ if (generic_interrupt_extension)
+ sum += irq_stats(cpu)->generic_irqs;
#endif
#ifdef CONFIG_SMP
sum += irq_stats(cpu)->irq_resched_count;
@@ -190,9 +197,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
sum += irq_stats(cpu)->irq_threshold_count;
#endif
#endif
-#ifdef CONFIG_X86_LOCAL_APIC
- sum += irq_stats(cpu)->irq_spurious_count;
-#endif
return sum;
}
@@ -205,3 +209,64 @@ u64 arch_irq_stat(void)
#endif
return sum;
}
+
+
+#ifndef CONFIG_XEN
+/*
+ * do_IRQ handles all normal device IRQ's (the special
+ * SMP cross-CPU interrupts have their own specific
+ * handlers).
+ */
+unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ /* high bit used in ret_from_ code */
+ unsigned vector = ~regs->orig_ax;
+ unsigned irq;
+
+ exit_idle();
+ irq_enter();
+
+ irq = __get_cpu_var(vector_irq)[vector];
+
+ if (!handle_irq(irq, regs)) {
+#ifdef CONFIG_X86_64
+ if (!disable_apic)
+ ack_APIC_irq();
+#endif
+
+ if (printk_ratelimit())
+ printk(KERN_EMERG "%s: %d.%d No irq handler for vector (irq %d)\n",
+ __func__, smp_processor_id(), vector, irq);
+ }
+
+ irq_exit();
+
+ set_irq_regs(old_regs);
+ return 1;
+}
+
+/*
+ * Handler for GENERIC_INTERRUPT_VECTOR.
+ */
+void smp_generic_interrupt(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ ack_APIC_irq();
+
+ exit_idle();
+
+ irq_enter();
+
+ inc_irq_stat(generic_irqs);
+
+ if (generic_interrupt_extension)
+ generic_interrupt_extension();
+
+ irq_exit();
+
+ set_irq_regs(old_regs);
+}
+#endif
--- head-2011-03-17.orig/arch/x86/kernel/machine_kexec_64.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/machine_kexec_64.c 2011-02-01 14:44:12.000000000 +0100
@@ -92,13 +92,8 @@ void machine_kexec_setup_load_arg(xen_ke
xki->page_list[PA_CONTROL_PAGE] = __ma(control_page);
xki->page_list[PA_TABLE_PAGE] = __ma(table_page);
- xki->page_list[PA_PGD] = __ma(kexec_pgd);
- xki->page_list[PA_PUD_0] = __ma(kexec_pud0);
- xki->page_list[PA_PUD_1] = __ma(kexec_pud1);
- xki->page_list[PA_PMD_0] = __ma(kexec_pmd0);
- xki->page_list[PA_PMD_1] = __ma(kexec_pmd1);
- xki->page_list[PA_PTE_0] = __ma(kexec_pte0);
- xki->page_list[PA_PTE_1] = __ma(kexec_pte1);
+ if (image->type == KEXEC_TYPE_DEFAULT)
+ xki->page_list[PA_SWAP_PAGE] = page_to_phys(image->swap_page);
}
int __init machine_kexec_setup_resources(struct resource *hypervisor,
@@ -161,7 +156,7 @@ static int init_one_level2_page(struct k
}
pmd = pmd_offset(pud, addr);
if (!pmd_present(*pmd))
- set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
+ x_set_pmd(pmd, x__pmd(addr | X__PAGE_KERNEL_LARGE_EXEC));
result = 0;
out:
return result;
--- head-2011-03-17.orig/arch/x86/kernel/microcode_core-xen.c 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/microcode_core-xen.c 2011-02-01 14:44:12.000000000 +0100
@@ -21,28 +21,28 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
+#include <linux/platform_device.h>
#include <linux/capability.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/sched.h>
+#include <linux/miscdevice.h>
+#include <linux/firmware.h>
#include <linux/smp_lock.h>
+#include <linux/spinlock.h>
#include <linux/cpumask.h>
-#include <linux/module.h>
-#include <linux/slab.h>
+#include <linux/uaccess.h>
#include <linux/vmalloc.h>
-#include <linux/miscdevice.h>
-#include <linux/spinlock.h>
-#include <linux/mm.h>
-#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/slab.h>
#include <linux/cpu.h>
-#include <linux/firmware.h>
-#include <linux/platform_device.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
-#include <asm/msr.h>
-#include <asm/uaccess.h>
-#include <asm/processor.h>
#include <asm/microcode.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
MODULE_DESCRIPTION("Microcode Update Driver");
MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
@@ -51,7 +51,7 @@ MODULE_LICENSE("GPL");
static int verbose;
module_param(verbose, int, 0644);
-#define MICROCODE_VERSION "2.00-xen"
+#define MICROCODE_VERSION "2.00-xen"
/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
static DEFINE_MUTEX(microcode_mutex);
@@ -143,12 +143,12 @@ static void microcode_dev_exit(void)
MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
#else
-#define microcode_dev_init() 0
-#define microcode_dev_exit() do { } while (0)
+#define microcode_dev_init() 0
+#define microcode_dev_exit() do { } while (0)
#endif
/* fake device for request_firmware */
-static struct platform_device *microcode_pdev;
+static struct platform_device *microcode_pdev;
static int request_microcode(const char *name)
{
--- head-2011-03-17.orig/arch/x86/kernel/mpparse-xen.c 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/mpparse-xen.c 2011-02-01 14:44:12.000000000 +0100
@@ -3,7 +3,7 @@
* compliant MP-table parsing routines.
*
* (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
- * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
+ * (c) 1998, 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
* (c) 2008 Alexey Starikovskiy <astarikovskiy@suse.de>
*/
@@ -29,11 +29,7 @@
#include <asm/setup.h>
#include <asm/smp.h>
-#include <mach_apic.h>
-#ifdef CONFIG_X86_32
-#include <mach_apicdef.h>
-#include <mach_mpparse.h>
-#endif
+#include <asm/apic.h>
static void *_bus_to_virt(unsigned long ma)
{
@@ -123,9 +119,6 @@ static void __init MP_bus_info(struct mp
} else
printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
}
-#endif
-
-#ifdef CONFIG_X86_IO_APIC
static int bad_ioapic(unsigned long address)
{
@@ -153,11 +146,11 @@ static void __init MP_ioapic_info(struct
if (bad_ioapic(m->apicaddr))
return;
- mp_ioapics[nr_ioapics].mp_apicaddr = m->apicaddr;
- mp_ioapics[nr_ioapics].mp_apicid = m->apicid;
- mp_ioapics[nr_ioapics].mp_type = m->type;
- mp_ioapics[nr_ioapics].mp_apicver = m->apicver;
- mp_ioapics[nr_ioapics].mp_flags = m->flags;
+ mp_ioapics[nr_ioapics].apicaddr = m->apicaddr;
+ mp_ioapics[nr_ioapics].apicid = m->apicid;
+ mp_ioapics[nr_ioapics].type = m->type;
+ mp_ioapics[nr_ioapics].apicver = m->apicver;
+ mp_ioapics[nr_ioapics].flags = m->flags;
nr_ioapics++;
}
@@ -169,55 +162,55 @@ static void print_MP_intsrc_info(struct
m->srcbusirq, m->dstapic, m->dstirq);
}
-static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq)
+static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq)
{
apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
" IRQ %02x, APIC ID %x, APIC INT %02x\n",
- mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3,
- (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus,
- mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq);
+ mp_irq->irqtype, mp_irq->irqflag & 3,
+ (mp_irq->irqflag >> 2) & 3, mp_irq->srcbus,
+ mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq);
}
static void __init assign_to_mp_irq(struct mpc_intsrc *m,
- struct mp_config_intsrc *mp_irq)
+ struct mpc_intsrc *mp_irq)
{
- mp_irq->mp_dstapic = m->dstapic;
- mp_irq->mp_type = m->type;
- mp_irq->mp_irqtype = m->irqtype;
- mp_irq->mp_irqflag = m->irqflag;
- mp_irq->mp_srcbus = m->srcbus;
- mp_irq->mp_srcbusirq = m->srcbusirq;
- mp_irq->mp_dstirq = m->dstirq;
+ mp_irq->dstapic = m->dstapic;
+ mp_irq->type = m->type;
+ mp_irq->irqtype = m->irqtype;
+ mp_irq->irqflag = m->irqflag;
+ mp_irq->srcbus = m->srcbus;
+ mp_irq->srcbusirq = m->srcbusirq;
+ mp_irq->dstirq = m->dstirq;
}
-static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq,
+static void __init assign_to_mpc_intsrc(struct mpc_intsrc *mp_irq,
struct mpc_intsrc *m)
{
- m->dstapic = mp_irq->mp_dstapic;
- m->type = mp_irq->mp_type;
- m->irqtype = mp_irq->mp_irqtype;
- m->irqflag = mp_irq->mp_irqflag;
- m->srcbus = mp_irq->mp_srcbus;
- m->srcbusirq = mp_irq->mp_srcbusirq;
- m->dstirq = mp_irq->mp_dstirq;
+ m->dstapic = mp_irq->dstapic;
+ m->type = mp_irq->type;
+ m->irqtype = mp_irq->irqtype;
+ m->irqflag = mp_irq->irqflag;
+ m->srcbus = mp_irq->srcbus;
+ m->srcbusirq = mp_irq->srcbusirq;
+ m->dstirq = mp_irq->dstirq;
}
-static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq,
+static int __init mp_irq_mpc_intsrc_cmp(struct mpc_intsrc *mp_irq,
struct mpc_intsrc *m)
{
- if (mp_irq->mp_dstapic != m->dstapic)
+ if (mp_irq->dstapic != m->dstapic)
return 1;
- if (mp_irq->mp_type != m->type)
+ if (mp_irq->type != m->type)
return 2;
- if (mp_irq->mp_irqtype != m->irqtype)
+ if (mp_irq->irqtype != m->irqtype)
return 3;
- if (mp_irq->mp_irqflag != m->irqflag)
+ if (mp_irq->irqflag != m->irqflag)
return 4;
- if (mp_irq->mp_srcbus != m->srcbus)
+ if (mp_irq->srcbus != m->srcbus)
return 5;
- if (mp_irq->mp_srcbusirq != m->srcbusirq)
+ if (mp_irq->srcbusirq != m->srcbusirq)
return 6;
- if (mp_irq->mp_dstirq != m->dstirq)
+ if (mp_irq->dstirq != m->dstirq)
return 7;
return 0;
@@ -238,8 +231,12 @@ static void __init MP_intsrc_info(struct
if (++mp_irq_entries == MAX_IRQ_SOURCES)
panic("Max # of irq sources exceeded!!\n");
}
+#else /* CONFIG_X86_IO_APIC */
+static inline void __init MP_bus_info(struct mpc_bus *m) {}
+static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {}
+static inline void __init MP_intsrc_info(struct mpc_intsrc *m) {}
+#endif /* CONFIG_X86_IO_APIC */
-#endif
static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
{
@@ -291,6 +288,20 @@ static int __init smp_check_mpc(struct m
return 1;
}
+static void skip_entry(unsigned char **ptr, int *count, int size)
+{
+ *ptr += size;
+ *count += size;
+}
+
+static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
+{
+ printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"
+ "type %x\n", *mpt);
+ print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
+ 1, mpc, mpc->length, 1);
+}
+
static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
{
char str[16];
@@ -302,19 +313,10 @@ static int __init smp_read_mpc(struct mp
if (!smp_check_mpc(mpc, oem, str))
return 0;
+#ifndef CONFIG_XEN
#ifdef CONFIG_X86_32
- /*
- * need to make sure summit and es7000's mps_oem_check is safe to be
- * called early via genericarch 's mps_oem_check
- */
- if (early) {
-#ifdef CONFIG_X86_NUMAQ
- numaq_mps_oem_check(mpc, oem, str);
-#endif
- } else
- mps_oem_check(mpc, oem, str);
+ generic_mps_oem_check(mpc, oem, str);
#endif
-#ifndef CONFIG_XEN
/* save the local APIC address, it might be non-default */
if (!acpi_lapic)
mp_lapic_addr = mpc->lapic;
@@ -337,61 +339,30 @@ static int __init smp_read_mpc(struct mp
while (count < mpc->length) {
switch (*mpt) {
case MP_PROCESSOR:
- {
- struct mpc_cpu *m = (struct mpc_cpu *)mpt;
- /* ACPI may have already provided this data */
- if (!acpi_lapic)
- MP_processor_info(m);
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
+ /* ACPI may have already provided this data */
+ if (!acpi_lapic)
+ MP_processor_info((struct mpc_cpu *)mpt);
+ skip_entry(&mpt, &count, sizeof(struct mpc_cpu));
+ break;
case MP_BUS:
- {
- struct mpc_bus *m = (struct mpc_bus *)mpt;
-#ifdef CONFIG_X86_IO_APIC
- MP_bus_info(m);
-#endif
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
+ MP_bus_info((struct mpc_bus *)mpt);
+ skip_entry(&mpt, &count, sizeof(struct mpc_bus));
+ break;
case MP_IOAPIC:
- {
-#ifdef CONFIG_X86_IO_APIC
- struct mpc_ioapic *m = (struct mpc_ioapic *)mpt;
- MP_ioapic_info(m);
-#endif
- mpt += sizeof(struct mpc_ioapic);
- count += sizeof(struct mpc_ioapic);
- break;
- }
+ MP_ioapic_info((struct mpc_ioapic *)mpt);
+ skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
+ break;
case MP_INTSRC:
- {
-#ifdef CONFIG_X86_IO_APIC
- struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
-
- MP_intsrc_info(m);
-#endif
- mpt += sizeof(struct mpc_intsrc);
- count += sizeof(struct mpc_intsrc);
- break;
- }
+ MP_intsrc_info((struct mpc_intsrc *)mpt);
+ skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
+ break;
case MP_LINTSRC:
- {
- struct mpc_lintsrc *m =
- (struct mpc_lintsrc *)mpt;
- MP_lintsrc_info(m);
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
+ MP_lintsrc_info((struct mpc_lintsrc *)mpt);
+ skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc));
+ break;
default:
/* wrong mptable */
- printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
- printk(KERN_ERR "type %x\n", *mpt);
- print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
- 1, mpc, mpc->length, 1);
+ smp_dump_mptable(mpc, mpt);
count = mpc->length;
break;
}
@@ -399,13 +370,13 @@ static int __init smp_read_mpc(struct mp
(*x86_quirks->mpc_record)++;
}
-#ifdef CONFIG_X86_GENERICARCH
- generic_bigsmp_probe();
+#ifdef CONFIG_X86_BIGSMP
+ generic_bigsmp_probe();
#endif
-#ifdef CONFIG_X86_32
- setup_apic_routing();
-#endif
+ if (apic->setup_apic_routing)
+ apic->setup_apic_routing();
+
if (!num_processors)
printk(KERN_ERR "MPTABLE: no processors registered!\n");
return num_processors;
@@ -430,7 +401,7 @@ static void __init construct_default_ioi
intsrc.type = MP_INTSRC;
intsrc.irqflag = 0; /* conforming */
intsrc.srcbus = 0;
- intsrc.dstapic = mp_ioapics[0].mp_apicid;
+ intsrc.dstapic = mp_ioapics[0].apicid;
intsrc.irqtype = mp_INT;
@@ -585,7 +556,69 @@ static inline void __init construct_defa
}
}
-static struct intel_mp_floating *mpf_found;
+static struct mpf_intel *mpf_found;
+
+static unsigned long __init get_mpc_size(unsigned long physptr)
+{
+ struct mpc_table *mpc;
+ unsigned long size;
+
+ mpc = early_ioremap(physptr, PAGE_SIZE);
+ size = mpc->length;
+ early_iounmap(mpc, PAGE_SIZE);
+ apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size);
+
+ return size;
+}
+
+static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
+{
+ struct mpc_table *mpc;
+ unsigned long size;
+
+ size = get_mpc_size(mpf->physptr);
+ mpc = early_ioremap(mpf->physptr, size);
+ /*
+ * Read the physical hardware table. Anything here will
+ * override the defaults.
+ */
+ if (!smp_read_mpc(mpc, early)) {
+#ifdef CONFIG_X86_LOCAL_APIC
+ smp_found_config = 0;
+#endif
+ printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"
+ "... disabling SMP support. (tell your hw vendor)\n");
+ early_iounmap(mpc, size);
+ return -1;
+ }
+ early_iounmap(mpc, size);
+
+ if (early)
+ return -1;
+
+#ifdef CONFIG_X86_IO_APIC
+ /*
+ * If there are no explicit MP IRQ entries, then we are
+ * broken. We set up most of the low 16 IO-APIC pins to
+ * ISA defaults and hope it will work.
+ */
+ if (!mp_irq_entries) {
+ struct mpc_bus bus;
+
+ printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
+ "using default mptable. (tell your hw vendor)\n");
+
+ bus.type = MP_BUS;
+ bus.busid = 0;
+ memcpy(bus.bustype, "ISA ", 6);
+ MP_bus_info(&bus);
+
+ construct_default_ioirq_mptable(0);
+ }
+#endif
+
+ return 0;
+}
/*
* Scan the memory blocks for an SMP configuration block.
@@ -597,7 +630,7 @@ void __init get_smp_config(void)
#define early 0
#endif
{
- struct intel_mp_floating *mpf = mpf_found;
+ struct mpf_intel *mpf = mpf_found;
if (!mpf)
return;
@@ -618,9 +651,9 @@ void __init get_smp_config(void)
}
printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
- mpf->mpf_specification);
+ mpf->specification);
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
- if (mpf->mpf_feature2 & (1 << 7)) {
+ if (mpf->feature2 & (1 << 7)) {
printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
pic_mode = 1;
} else {
@@ -631,7 +664,7 @@ void __init get_smp_config(void)
/*
* Now see if we need to read further.
*/
- if (mpf->mpf_feature1 != 0) {
+ if (mpf->feature1 != 0) {
#ifndef CONFIG_XEN
if (early) {
/*
@@ -643,49 +676,12 @@ void __init get_smp_config(void)
#endif
printk(KERN_INFO "Default MP configuration #%d\n",
- mpf->mpf_feature1);
- construct_default_ISA_mptable(mpf->mpf_feature1);
-
- } else if (mpf->mpf_physptr) {
+ mpf->feature1);
+ construct_default_ISA_mptable(mpf->feature1);
- /*
- * Read the physical hardware table. Anything here will
- * override the defaults.
- */
- if (!smp_read_mpc(_bus_to_virt(mpf->mpf_physptr), early)) {
-#ifdef CONFIG_X86_LOCAL_APIC
- smp_found_config = 0;
-#endif
- printk(KERN_ERR
- "BIOS bug, MP table errors detected!...\n");
- printk(KERN_ERR "... disabling SMP support. "
- "(tell your hw vendor)\n");
+ } else if (mpf->physptr) {
+ if (check_physptr(mpf, early))
return;
- }
-
- if (early)
- return;
-#ifdef CONFIG_X86_IO_APIC
- /*
- * If there are no explicit MP IRQ entries, then we are
- * broken. We set up most of the low 16 IO-APIC pins to
- * ISA defaults and hope it will work.
- */
- if (!mp_irq_entries) {
- struct mpc_bus bus;
-
- printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
- "using default mptable. "
- "(tell your hw vendor)\n");
-
- bus.type = MP_BUS;
- bus.busid = 0;
- memcpy(bus.bustype, "ISA ", 6);
- MP_bus_info(&bus);
-
- construct_default_ioirq_mptable(0);
- }
-#endif
} else
BUG();
@@ -707,60 +703,68 @@ void __init get_smp_config(void)
{
__get_smp_config(0);
}
+
+static void __init smp_reserve_bootmem(struct mpf_intel *mpf)
+{
+ unsigned long size = get_mpc_size(mpf->physptr);
+#ifdef CONFIG_X86_32
+ /*
+ * We cannot access to MPC table to compute table size yet,
+ * as only few megabytes from the bottom is mapped now.
+ * PC-9800's MPC table places on the very last of physical
+ * memory; so that simply reserving PAGE_SIZE from mpf->physptr
+ * yields BUG() in reserve_bootmem.
+ * also need to make sure physptr is below than max_low_pfn
+ * we don't need reserve the area above max_low_pfn
+ */
+ unsigned long end = max_low_pfn * PAGE_SIZE;
+
+ if (mpf->physptr < end) {
+ if (mpf->physptr + size > end)
+ size = end - mpf->physptr;
+ reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
+ }
+#else
+ reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
+#endif
+}
#endif
static int __init smp_scan_config(unsigned long base, unsigned long length,
unsigned reserve)
{
unsigned int *bp = _bus_to_virt(base);
- struct intel_mp_floating *mpf;
+ struct mpf_intel *mpf;
apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
bp, length);
BUILD_BUG_ON(sizeof(*mpf) != 16);
while (length > 0) {
- mpf = (struct intel_mp_floating *)bp;
+ mpf = (struct mpf_intel *)bp;
if ((*bp == SMP_MAGIC_IDENT) &&
- (mpf->mpf_length == 1) &&
+ (mpf->length == 1) &&
!mpf_checksum((unsigned char *)bp, 16) &&
- ((mpf->mpf_specification == 1)
- || (mpf->mpf_specification == 4))) {
+ ((mpf->specification == 1)
+ || (mpf->specification == 4))) {
#ifdef CONFIG_X86_LOCAL_APIC
smp_found_config = 1;
#endif
mpf_found = mpf;
#ifndef CONFIG_XEN
- printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
- mpf, virt_to_phys(mpf));
+ printk(KERN_INFO "found SMP MP-table at [%p] %llx\n",
+ mpf, (u64)virt_to_phys(mpf));
if (!reserve)
return 1;
- reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
- BOOTMEM_DEFAULT);
- if (mpf->mpf_physptr) {
- unsigned long size = PAGE_SIZE;
-#ifdef CONFIG_X86_32
- /*
- * We cannot access to MPC table to compute
- * table size yet, as only few megabytes from
- * the bottom is mapped now.
- * PC-9800's MPC table places on the very last
- * of physical memory; so that simply reserving
- * PAGE_SIZE from mpg->mpf_physptr yields BUG()
- * in reserve_bootmem.
- */
- unsigned long end = max_low_pfn * PAGE_SIZE;
- if (mpf->mpf_physptr + size > end)
- size = end - mpf->mpf_physptr;
-#endif
- reserve_bootmem_generic(mpf->mpf_physptr, size,
+ reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf),
BOOTMEM_DEFAULT);
- }
+ if (mpf->physptr)
+ smp_reserve_bootmem(mpf);
#else
printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
- mpf, ((void *)bp - _bus_to_virt(base)) + base);
+ mpf, ((void *)bp - _bus_to_virt(base)) + base);
#endif
return 1;
}
@@ -842,15 +846,15 @@ static int __init get_MP_intsrc_index(s
/* not legacy */
for (i = 0; i < mp_irq_entries; i++) {
- if (mp_irqs[i].mp_irqtype != mp_INT)
+ if (mp_irqs[i].irqtype != mp_INT)
continue;
- if (mp_irqs[i].mp_irqflag != 0x0f)
+ if (mp_irqs[i].irqflag != 0x0f)
continue;
- if (mp_irqs[i].mp_srcbus != m->srcbus)
+ if (mp_irqs[i].srcbus != m->srcbus)
continue;
- if (mp_irqs[i].mp_srcbusirq != m->srcbusirq)
+ if (mp_irqs[i].srcbusirq != m->srcbusirq)
continue;
if (irq_used[i]) {
/* already claimed */
@@ -867,7 +871,58 @@ static int __init get_MP_intsrc_index(s
#define SPARE_SLOT_NUM 20
static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
-#endif
+
+static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
+{
+ int i;
+
+ apic_printk(APIC_VERBOSE, "OLD ");
+ print_MP_intsrc_info(m);
+
+ i = get_MP_intsrc_index(m);
+ if (i > 0) {
+ assign_to_mpc_intsrc(&mp_irqs[i], m);
+ apic_printk(APIC_VERBOSE, "NEW ");
+ print_mp_irq_info(&mp_irqs[i]);
+ return;
+ }
+ if (!i) {
+ /* legacy, do nothing */
+ return;
+ }
+ if (*nr_m_spare < SPARE_SLOT_NUM) {
+ /*
+ * not found (-1), or duplicated (-2) are invalid entries,
+ * we need to use the slot later
+ */
+ m_spare[*nr_m_spare] = m;
+ *nr_m_spare += 1;
+ }
+}
+#else /* CONFIG_X86_IO_APIC */
+static
+inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
+#endif /* CONFIG_X86_IO_APIC */
+
+static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length,
+ int count)
+{
+ if (!mpc_new_phys) {
+ pr_info("No spare slots, try to append...take your risk, "
+ "new mpc_length %x\n", count);
+ } else {
+ if (count <= mpc_new_length)
+ pr_info("No spare slots, try to append..., "
+ "new mpc_length %x\n", count);
+ else {
+ pr_err("mpc_new_length %lx is too small\n",
+ mpc_new_length);
+ return -1;
+ }
+ }
+
+ return 0;
+}
static int __init replace_intsrc_all(struct mpc_table *mpc,
unsigned long mpc_new_phys,
@@ -875,77 +930,33 @@ static int __init replace_intsrc_all(st
{
#ifdef CONFIG_X86_IO_APIC
int i;
- int nr_m_spare = 0;
#endif
-
int count = sizeof(*mpc);
+ int nr_m_spare = 0;
unsigned char *mpt = ((unsigned char *)mpc) + count;
printk(KERN_INFO "mpc_length %x\n", mpc->length);
while (count < mpc->length) {
switch (*mpt) {
case MP_PROCESSOR:
- {
- struct mpc_cpu *m = (struct mpc_cpu *)mpt;
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
+ skip_entry(&mpt, &count, sizeof(struct mpc_cpu));
+ break;
case MP_BUS:
- {
- struct mpc_bus *m = (struct mpc_bus *)mpt;
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
+ skip_entry(&mpt, &count, sizeof(struct mpc_bus));
+ break;
case MP_IOAPIC:
- {
- mpt += sizeof(struct mpc_ioapic);
- count += sizeof(struct mpc_ioapic);
- break;
- }
+ skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
+ break;
case MP_INTSRC:
- {
-#ifdef CONFIG_X86_IO_APIC
- struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
-
- apic_printk(APIC_VERBOSE, "OLD ");
- print_MP_intsrc_info(m);
- i = get_MP_intsrc_index(m);
- if (i > 0) {
- assign_to_mpc_intsrc(&mp_irqs[i], m);
- apic_printk(APIC_VERBOSE, "NEW ");
- print_mp_irq_info(&mp_irqs[i]);
- } else if (!i) {
- /* legacy, do nothing */
- } else if (nr_m_spare < SPARE_SLOT_NUM) {
- /*
- * not found (-1), or duplicated (-2)
- * are invalid entries,
- * we need to use the slot later
- */
- m_spare[nr_m_spare] = m;
- nr_m_spare++;
- }
-#endif
- mpt += sizeof(struct mpc_intsrc);
- count += sizeof(struct mpc_intsrc);
- break;
- }
+ check_irq_src((struct mpc_intsrc *)mpt, &nr_m_spare);
+ skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
+ break;
case MP_LINTSRC:
- {
- struct mpc_lintsrc *m =
- (struct mpc_lintsrc *)mpt;
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
+ skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc));
+ break;
default:
/* wrong mptable */
- printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
- printk(KERN_ERR "type %x\n", *mpt);
- print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
- 1, mpc, mpc->length, 1);
+ smp_dump_mptable(mpc, mpt);
goto out;
}
}
@@ -955,10 +966,10 @@ static int __init replace_intsrc_all(st
if (irq_used[i])
continue;
- if (mp_irqs[i].mp_irqtype != mp_INT)
+ if (mp_irqs[i].irqtype != mp_INT)
continue;
- if (mp_irqs[i].mp_irqflag != 0x0f)
+ if (mp_irqs[i].irqflag != 0x0f)
continue;
if (nr_m_spare > 0) {
@@ -969,16 +980,8 @@ static int __init replace_intsrc_all(st
} else {
struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
count += sizeof(struct mpc_intsrc);
- if (!mpc_new_phys) {
- printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count);
- } else {
- if (count <= mpc_new_length)
- printk(KERN_INFO "No spare slots, try to append..., new mpc_length %x\n", count);
- else {
- printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length);
- goto out;
- }
- }
+ if (!check_slot(mpc_new_phys, mpc_new_length, count))
+ goto out;
assign_to_mpc_intsrc(&mp_irqs[i], m);
mpc->length = count;
mpt += sizeof(struct mpc_intsrc);
@@ -1034,7 +1037,7 @@ static int __init update_mp_table(void)
{
char str[16];
char oem[10];
- struct intel_mp_floating *mpf;
+ struct mpf_intel *mpf;
struct mpc_table *mpc, *mpc_new;
if (!enable_update_mptable)
@@ -1047,19 +1050,19 @@ static int __init update_mp_table(void)
/*
* Now see if we need to go further.
*/
- if (mpf->mpf_feature1 != 0)
+ if (mpf->feature1 != 0)
return 0;
- if (!mpf->mpf_physptr)
+ if (!mpf->physptr)
return 0;
- mpc = _bus_to_virt(mpf->mpf_physptr);
+ mpc = _bus_to_virt(mpf->physptr);
if (!smp_check_mpc(mpc, oem, str))
return 0;
- printk(KERN_INFO "mpf: %lx\n", (long)arbitrary_virt_to_machine(mpf));
- printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr);
+ printk(KERN_INFO "mpf: %llx\n", (u64)arbitrary_virt_to_machine(mpf));
+ printk(KERN_INFO "physptr: %x\n", mpf->physptr);
if (mpc_new_phys && mpc->length > mpc_new_length) {
mpc_new_phys = 0;
@@ -1083,23 +1086,23 @@ static int __init update_mp_table(void)
maddr_t mpc_new_bus;
mpc_new_bus = phys_to_machine(mpc_new_phys);
- mpf->mpf_physptr = mpc_new_bus;
+ mpf->physptr = mpc_new_bus;
mpc_new = phys_to_virt(mpc_new_phys);
memcpy(mpc_new, mpc, mpc->length);
mpc = mpc_new;
/* check if we can modify that */
- if (mpc_new_bus - mpf->mpf_physptr) {
- struct intel_mp_floating *mpf_new;
+ if (mpc_new_bus - mpf->physptr) {
+ struct mpf_intel *mpf_new;
/* steal 16 bytes from [0, 1k) */
printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
mpf_new = isa_bus_to_virt(0x400 - 16);
memcpy(mpf_new, mpf, 16);
mpf = mpf_new;
- mpf->mpf_physptr = mpc_new_bus;
+ mpf->physptr = mpc_new_bus;
}
- mpf->mpf_checksum = 0;
- mpf->mpf_checksum -= mpf_checksum((unsigned char *)mpf, 16);
- printk(KERN_INFO "mpf_physptr new: %x\n", mpf->mpf_physptr);
+ mpf->checksum = 0;
+ mpf->checksum -= mpf_checksum((unsigned char *)mpf, 16);
+ printk(KERN_INFO "physptr new: %x\n", mpf->physptr);
}
/*
--- head-2011-03-17.orig/arch/x86/kernel/pci-dma-xen.c 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/pci-dma-xen.c 2011-02-01 14:44:12.000000000 +0100
@@ -1,4 +1,5 @@
#include <linux/dma-mapping.h>
+#include <linux/dma-debug.h>
#include <linux/dmar.h>
#include <linux/bootmem.h>
#include <linux/pci.h>
@@ -12,7 +13,7 @@
static int forbid_dac __read_mostly;
-struct dma_mapping_ops *dma_ops;
+struct dma_map_ops *dma_ops;
EXPORT_SYMBOL(dma_ops);
static int iommu_sac_force __read_mostly;
@@ -39,11 +40,14 @@ EXPORT_SYMBOL(bad_dma_address);
to older i386. */
struct device x86_dma_fallback_dev = {
.init_name = "fallback device",
- .coherent_dma_mask = DMA_32BIT_MASK,
+ .coherent_dma_mask = DMA_BIT_MASK(32),
.dma_mask = &x86_dma_fallback_dev.coherent_dma_mask,
};
EXPORT_SYMBOL(x86_dma_fallback_dev);
+/* Number of entries preallocated for DMA-API debugging */
+#define PREALLOC_DMA_DEBUG_ENTRIES 32768
+
int dma_set_mask(struct device *dev, u64 mask)
{
if (!dev->dma_mask || !dma_supported(dev, mask))
@@ -103,20 +107,20 @@ static void __init dma32_free_bootmem(vo
}
#endif
-static struct dma_mapping_ops swiotlb_dma_ops = {
+static struct dma_map_ops swiotlb_dma_ops = {
.alloc_coherent = dma_generic_alloc_coherent,
.free_coherent = dma_generic_free_coherent,
.mapping_error = swiotlb_dma_mapping_error,
- .map_single = swiotlb_map_single_phys,
- .unmap_single = swiotlb_unmap_single,
+ .map_page = swiotlb_map_page,
+ .unmap_page = swiotlb_unmap_page,
.sync_single_for_cpu = swiotlb_sync_single_for_cpu,
.sync_single_for_device = swiotlb_sync_single_for_device,
.sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
.sync_single_range_for_device = swiotlb_sync_single_range_for_device,
.sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
.sync_sg_for_device = swiotlb_sync_sg_for_device,
- .map_sg = swiotlb_map_sg,
- .unmap_sg = swiotlb_unmap_sg,
+ .map_sg = swiotlb_map_sg_attrs,
+ .unmap_sg = swiotlb_unmap_sg_attrs,
.dma_supported = swiotlb_dma_supported
};
@@ -175,7 +179,7 @@ again:
if (!is_buffer_dma_capable(dma_mask, addr, size)) {
__free_pages(page, order);
- if (dma_mask < DMA_32BIT_MASK && !(flag & GFP_DMA)) {
+ if (dma_mask < DMA_BIT_MASK(32) && !(flag & GFP_DMA)) {
flag = (flag & ~GFP_DMA32) | GFP_DMA;
goto again;
}
@@ -305,7 +309,7 @@ int range_straddles_page_boundary(paddr_
int dma_supported(struct device *dev, u64 mask)
{
- struct dma_mapping_ops *ops = get_dma_ops(dev);
+ struct dma_map_ops *ops = get_dma_ops(dev);
#ifdef CONFIG_PCI
if (mask > 0xffffffff && forbid_dac > 0) {
@@ -320,7 +324,7 @@ int dma_supported(struct device *dev, u6
/* Copied from i386. Doesn't make much sense, because it will
only work for pci_alloc_coherent.
The caller just has to use GFP_DMA in this case. */
- if (mask < DMA_24BIT_MASK)
+ if (mask < DMA_BIT_MASK(24))
return 0;
/* Tell the device to use SAC when IOMMU force is on. This
@@ -335,7 +339,7 @@ int dma_supported(struct device *dev, u6
SAC for these. Assume all masks <= 40 bits are of this
type. Normally this doesn't make any difference, but gives
more gentle handling of IOMMU overflow. */
- if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
+ if (iommu_sac_force && (mask >= DMA_BIT_MASK(40))) {
dev_info(dev, "Force SAC with mask %Lx\n", mask);
return 0;
}
@@ -346,6 +350,12 @@ EXPORT_SYMBOL(dma_supported);
static int __init pci_iommu_init(void)
{
+ dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
+
+#ifdef CONFIG_PCI
+ dma_debug_add_bus(&pci_bus_type);
+#endif
+
calgary_iommu_init();
intel_iommu_init();
@@ -371,8 +381,7 @@ fs_initcall(pci_iommu_init);
static __devinit void via_no_dac(struct pci_dev *dev)
{
if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
- printk(KERN_INFO
- "PCI: VIA PCI bridge detected. Disabling DAC.\n");
+ dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n");
forbid_dac = 1;
}
}
--- head-2011-03-17.orig/arch/x86/kernel/pci-nommu-xen.c 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/pci-nommu-xen.c 2011-02-01 14:44:12.000000000 +0100
@@ -24,7 +24,7 @@ do { \
static int
gnttab_map_sg(struct device *hwdev, struct scatterlist *sgl, int nents,
- int direction)
+ enum dma_data_direction dir, struct dma_attrs *attrs)
{
unsigned int i;
struct scatterlist *sg;
@@ -48,7 +48,7 @@ gnttab_map_sg(struct device *hwdev, stru
static void
gnttab_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nents,
- int direction)
+ enum dma_data_direction dir, struct dma_attrs *attrs)
{
unsigned int i;
struct scatterlist *sg;
@@ -58,24 +58,25 @@ gnttab_unmap_sg(struct device *hwdev, st
}
static dma_addr_t
-gnttab_map_single(struct device *dev, phys_addr_t paddr, size_t size,
- int direction)
+gnttab_map_page(struct device *dev, struct page *page, unsigned long offset,
+ size_t size, enum dma_data_direction dir,
+ struct dma_attrs *attrs)
{
dma_addr_t dma;
WARN_ON(size == 0);
- dma = gnttab_dma_map_page(pfn_to_page(paddr >> PAGE_SHIFT)) +
- offset_in_page(paddr);
- IOMMU_BUG_ON(range_straddles_page_boundary(paddr, size));
+ dma = gnttab_dma_map_page(page) + offset;
+ IOMMU_BUG_ON(range_straddles_page_boundary(page_to_pseudophys(page) +
+ offset, size));
IOMMU_BUG_ON(address_needs_mapping(dev, dma, size));
return dma;
}
static void
-gnttab_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
- int direction)
+gnttab_unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
+ enum dma_data_direction dir, struct dma_attrs *attrs)
{
gnttab_dma_unmap_page(dma_addr);
}
@@ -85,14 +86,14 @@ static int nommu_dma_supported(struct de
return 1;
}
-struct dma_mapping_ops nommu_dma_ops = {
- .alloc_coherent = dma_generic_alloc_coherent,
- .free_coherent = dma_generic_free_coherent,
- .map_single = gnttab_map_single,
- .unmap_single = gnttab_unmap_single,
- .map_sg = gnttab_map_sg,
- .unmap_sg = gnttab_unmap_sg,
- .dma_supported = nommu_dma_supported,
+struct dma_map_ops nommu_dma_ops = {
+ .alloc_coherent = dma_generic_alloc_coherent,
+ .free_coherent = dma_generic_free_coherent,
+ .map_page = gnttab_map_page,
+ .unmap_page = gnttab_unmap_page,
+ .map_sg = gnttab_map_sg,
+ .unmap_sg = gnttab_unmap_sg,
+ .dma_supported = nommu_dma_supported,
};
void __init no_iommu_init(void)
--- head-2011-03-17.orig/arch/x86/kernel/process-xen.c 2011-03-03 16:05:57.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/process-xen.c 2011-03-03 16:06:40.000000000 +0100
@@ -1,16 +1,19 @@
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
-#include <asm/idle.h>
#include <linux/smp.h>
+#include <linux/prctl.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/pm.h>
#include <linux/clockchips.h>
-#include <linux/ftrace.h>
+#include <trace/power.h>
#include <asm/system.h>
#include <asm/apic.h>
+#include <asm/idle.h>
+#include <asm/uaccess.h>
+#include <asm/i387.h>
#include <xen/evtchn.h>
unsigned long idle_halt;
@@ -20,6 +23,9 @@ EXPORT_SYMBOL(idle_nomwait);
struct kmem_cache *task_xstate_cachep;
+DEFINE_TRACE(power_start);
+DEFINE_TRACE(power_end);
+
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
*dst = *src;
@@ -57,6 +63,179 @@ void arch_task_cache_init(void)
}
/*
+ * Free current thread data structures etc..
+ */
+void exit_thread(void)
+{
+ struct task_struct *me = current;
+ struct thread_struct *t = &me->thread;
+ unsigned long *bp = t->io_bitmap_ptr;
+
+ if (bp) {
+ struct physdev_set_iobitmap set_iobitmap;
+
+ t->io_bitmap_ptr = NULL;
+ clear_thread_flag(TIF_IO_BITMAP);
+ /*
+ * Careful, clear this in the TSS too:
+ */
+ memset(&set_iobitmap, 0, sizeof(set_iobitmap));
+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
+ &set_iobitmap));
+ t->io_bitmap_max = 0;
+ kfree(bp);
+ }
+
+ ds_exit_thread(current);
+}
+
+void flush_thread(void)
+{
+ struct task_struct *tsk = current;
+
+#ifdef CONFIG_X86_64
+ if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
+ clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
+ if (test_tsk_thread_flag(tsk, TIF_IA32)) {
+ clear_tsk_thread_flag(tsk, TIF_IA32);
+ } else {
+ set_tsk_thread_flag(tsk, TIF_IA32);
+ current_thread_info()->status |= TS_COMPAT;
+ }
+ }
+#endif
+
+ clear_tsk_thread_flag(tsk, TIF_DEBUG);
+
+ tsk->thread.debugreg0 = 0;
+ tsk->thread.debugreg1 = 0;
+ tsk->thread.debugreg2 = 0;
+ tsk->thread.debugreg3 = 0;
+ tsk->thread.debugreg6 = 0;
+ tsk->thread.debugreg7 = 0;
+ memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
+ /*
+ * Forget coprocessor state..
+ */
+ tsk->fpu_counter = 0;
+ clear_fpu(tsk);
+ clear_used_math();
+}
+
+static void hard_disable_TSC(void)
+{
+ write_cr4(read_cr4() | X86_CR4_TSD);
+}
+
+void disable_TSC(void)
+{
+ preempt_disable();
+ if (!test_and_set_thread_flag(TIF_NOTSC))
+ /*
+ * Must flip the CPU state synchronously with
+ * TIF_NOTSC in the current running context.
+ */
+ hard_disable_TSC();
+ preempt_enable();
+}
+
+static void hard_enable_TSC(void)
+{
+ write_cr4(read_cr4() & ~X86_CR4_TSD);
+}
+
+static void enable_TSC(void)
+{
+ preempt_disable();
+ if (test_and_clear_thread_flag(TIF_NOTSC))
+ /*
+ * Must flip the CPU state synchronously with
+ * TIF_NOTSC in the current running context.
+ */
+ hard_enable_TSC();
+ preempt_enable();
+}
+
+int get_tsc_mode(unsigned long adr)
+{
+ unsigned int val;
+
+ if (test_thread_flag(TIF_NOTSC))
+ val = PR_TSC_SIGSEGV;
+ else
+ val = PR_TSC_ENABLE;
+
+ return put_user(val, (unsigned int __user *)adr);
+}
+
+int set_tsc_mode(unsigned int val)
+{
+ if (val == PR_TSC_SIGSEGV)
+ disable_TSC();
+ else if (val == PR_TSC_ENABLE)
+ enable_TSC();
+ else
+ return -EINVAL;
+
+ return 0;
+}
+
+void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
+{
+ struct thread_struct *prev, *next;
+
+ prev = &prev_p->thread;
+ next = &next_p->thread;
+
+ if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
+ test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
+ ds_switch_to(prev_p, next_p);
+ else if (next->debugctlmsr != prev->debugctlmsr)
+ update_debugctlmsr(next->debugctlmsr);
+
+ if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
+ set_debugreg(next->debugreg0, 0);
+ set_debugreg(next->debugreg1, 1);
+ set_debugreg(next->debugreg2, 2);
+ set_debugreg(next->debugreg3, 3);
+ /* no 4 and 5 */
+ set_debugreg(next->debugreg6, 6);
+ set_debugreg(next->debugreg7, 7);
+ }
+
+ if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
+ test_tsk_thread_flag(next_p, TIF_NOTSC)) {
+ /* prev and next are different */
+ if (test_tsk_thread_flag(next_p, TIF_NOTSC))
+ hard_disable_TSC();
+ else
+ hard_enable_TSC();
+ }
+}
+
+int sys_fork(struct pt_regs *regs)
+{
+ return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
+}
+
+/*
+ * This is trivial, and on the face of it looks like it
+ * could equally well be done in user mode.
+ *
+ * Not so, for quite unobvious reasons - register pressure.
+ * In user mode vfork() cannot have a stack frame, and if
+ * done by calling the "clone()" system call directly, you
+ * do not have enough call-clobbered registers to hold all
+ * the information you need.
+ */
+int sys_vfork(struct pt_regs *regs)
+{
+ return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
+ NULL, NULL);
+}
+
+
+/*
* Idle related variables and functions
*/
unsigned long boot_option_idle_override = 0;
@@ -101,7 +280,7 @@ void stop_this_cpu(void *dummy)
/*
* Remove this CPU:
*/
- cpu_clear(smp_processor_id(), cpu_online_map);
+ set_cpu_online(smp_processor_id(), false);
disable_all_local_evtchn();
for (;;) {
@@ -254,12 +433,13 @@ static int __cpuinit check_c1e_idle(cons
return 1;
}
-static cpumask_t c1e_mask = CPU_MASK_NONE;
+static cpumask_var_t c1e_mask;
static int c1e_detected;
void c1e_remove_cpu(int cpu)
{
- cpu_clear(cpu, c1e_mask);
+ if (c1e_mask != NULL)
+ cpumask_clear_cpu(cpu, c1e_mask);
}
/*
@@ -288,8 +468,8 @@ static void c1e_idle(void)
if (c1e_detected) {
int cpu = smp_processor_id();
- if (!cpu_isset(cpu, c1e_mask)) {
- cpu_set(cpu, c1e_mask);
+ if (!cpumask_test_cpu(cpu, c1e_mask)) {
+ cpumask_set_cpu(cpu, c1e_mask);
/*
* Force broadcast so ACPI can not interfere. Needs
* to run with interrupts enabled as it uses
@@ -321,7 +501,7 @@ static void c1e_idle(void)
void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifndef CONFIG_XEN
-#ifdef CONFIG_X86_SMP
+#ifdef CONFIG_SMP
if (pm_idle == poll_idle && smp_num_siblings > 1) {
printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
" performance may degrade.\n");
@@ -344,6 +524,17 @@ void __cpuinit select_idle_routine(const
#endif
}
+void __init init_c1e_mask(void)
+{
+#ifndef CONFIG_XEN
+ /* If we're using c1e_idle, we need to allocate c1e_mask. */
+ if (pm_idle == c1e_idle) {
+ alloc_cpumask_var(&c1e_mask, GFP_KERNEL);
+ cpumask_clear(c1e_mask);
+ }
+#endif
+}
+
static int __init idle_setup(char *str)
{
if (!str)
--- head-2011-03-17.orig/arch/x86/kernel/process_32-xen.c 2011-02-02 08:36:38.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/process_32-xen.c 2011-02-02 08:37:24.000000000 +0100
@@ -11,6 +11,7 @@
#include <stdarg.h>
+#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
@@ -69,9 +70,6 @@ asmlinkage void cstar_ret_from_fork(void
DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
EXPORT_PER_CPU_SYMBOL(current_task);
-DEFINE_PER_CPU(int, cpu_number);
-EXPORT_PER_CPU_SYMBOL(cpu_number);
-
/*
* Return saved PC of a blocked thread.
*/
@@ -97,6 +95,15 @@ void cpu_idle(void)
{
int cpu = smp_processor_id();
+ /*
+ * If we're the non-boot CPU, nothing set the stack canary up
+ * for us. CPU0 already has it initialized but no harm in
+ * doing it again. This is a good place for updating it, as
+ * we wont ever return from this function (so the invalid
+ * canaries already on the stack wont ever trigger).
+ */
+ boot_init_stack_canary();
+
current_thread_info()->status |= TS_POLLING;
/* endless idle loop with no priority at all */
@@ -111,7 +118,6 @@ void cpu_idle(void)
play_dead();
local_irq_disable();
- __get_cpu_var(irq_stat).idle_timestamp = jiffies;
/* Don't trace irqs off for idle */
stop_critical_timings();
xen_idle();
@@ -135,7 +141,7 @@ void __show_regs(struct pt_regs *regs, i
if (user_mode_vm(regs)) {
sp = regs->sp;
ss = regs->ss & 0xffff;
- savesegment(gs, gs);
+ gs = get_user_gs(regs);
} else {
sp = (unsigned long) (&regs->sp);
savesegment(ss, ss);
@@ -216,6 +222,7 @@ int kernel_thread(int (*fn)(void *), voi
regs.ds = __USER_DS;
regs.es = __USER_DS;
regs.fs = __KERNEL_PERCPU;
+ regs.gs = __KERNEL_STACK_CANARY;
regs.orig_ax = -1;
regs.ip = (unsigned long) kernel_thread_helper;
regs.cs = __KERNEL_CS | get_kernel_rpl();
@@ -226,47 +233,6 @@ int kernel_thread(int (*fn)(void *), voi
}
EXPORT_SYMBOL(kernel_thread);
-/*
- * Free current thread data structures etc..
- */
-void exit_thread(void)
-{
- /* The process may have allocated an io port bitmap... nuke it. */
- if (unlikely(test_thread_flag(TIF_IO_BITMAP))) {
- struct task_struct *tsk = current;
- struct thread_struct *t = &tsk->thread;
- struct physdev_set_iobitmap set_iobitmap;
- memset(&set_iobitmap, 0, sizeof(set_iobitmap));
- WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
- &set_iobitmap));
- kfree(t->io_bitmap_ptr);
- t->io_bitmap_ptr = NULL;
- clear_thread_flag(TIF_IO_BITMAP);
- }
-
- ds_exit_thread(current);
-}
-
-void flush_thread(void)
-{
- struct task_struct *tsk = current;
-
- tsk->thread.debugreg0 = 0;
- tsk->thread.debugreg1 = 0;
- tsk->thread.debugreg2 = 0;
- tsk->thread.debugreg3 = 0;
- tsk->thread.debugreg6 = 0;
- tsk->thread.debugreg7 = 0;
- memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
- clear_tsk_thread_flag(tsk, TIF_DEBUG);
- /*
- * Forget coprocessor state..
- */
- tsk->fpu_counter = 0;
- clear_fpu(tsk);
- clear_used_math();
-}
-
void release_thread(struct task_struct *dead_task)
{
BUG_ON(dead_task->mm);
@@ -282,7 +248,7 @@ void prepare_to_copy(struct task_struct
unlazy_fpu(tsk);
}
-int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
+int copy_thread(unsigned long clone_flags, unsigned long sp,
unsigned long unused,
struct task_struct *p, struct pt_regs *regs)
{
@@ -300,7 +266,7 @@ int copy_thread(int nr, unsigned long cl
p->thread.ip = (unsigned long) ret_from_fork;
- savesegment(gs, p->thread.gs);
+ task_user_gs(p) = get_user_gs(regs);
tsk = current;
if (test_tsk_thread_flag(tsk, TIF_CSTAR))
@@ -342,7 +308,7 @@ int copy_thread(int nr, unsigned long cl
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
- __asm__("movl %0, %%gs" : : "r"(0));
+ set_user_gs(regs, 0);
regs->fs = 0;
set_fs(USER_DS);
regs->ds = __USER_DS;
@@ -358,98 +324,6 @@ start_thread(struct pt_regs *regs, unsig
}
EXPORT_SYMBOL_GPL(start_thread);
-static void hard_disable_TSC(void)
-{
- write_cr4(read_cr4() | X86_CR4_TSD);
-}
-
-void disable_TSC(void)
-{
- preempt_disable();
- if (!test_and_set_thread_flag(TIF_NOTSC))
- /*
- * Must flip the CPU state synchronously with
- * TIF_NOTSC in the current running context.
- */
- hard_disable_TSC();
- preempt_enable();
-}
-
-static void hard_enable_TSC(void)
-{
- write_cr4(read_cr4() & ~X86_CR4_TSD);
-}
-
-static void enable_TSC(void)
-{
- preempt_disable();
- if (test_and_clear_thread_flag(TIF_NOTSC))
- /*
- * Must flip the CPU state synchronously with
- * TIF_NOTSC in the current running context.
- */
- hard_enable_TSC();
- preempt_enable();
-}
-
-int get_tsc_mode(unsigned long adr)
-{
- unsigned int val;
-
- if (test_thread_flag(TIF_NOTSC))
- val = PR_TSC_SIGSEGV;
- else
- val = PR_TSC_ENABLE;
-
- return put_user(val, (unsigned int __user *)adr);
-}
-
-int set_tsc_mode(unsigned int val)
-{
- if (val == PR_TSC_SIGSEGV)
- disable_TSC();
- else if (val == PR_TSC_ENABLE)
- enable_TSC();
- else
- return -EINVAL;
-
- return 0;
-}
-
-static noinline void
-__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
-{
- struct thread_struct *prev, *next;
-
- prev = &prev_p->thread;
- next = &next_p->thread;
-
- if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
- test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
- ds_switch_to(prev_p, next_p);
- else if (next->debugctlmsr != prev->debugctlmsr)
- update_debugctlmsr(next->debugctlmsr);
-
- if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
- set_debugreg(next->debugreg0, 0);
- set_debugreg(next->debugreg1, 1);
- set_debugreg(next->debugreg2, 2);
- set_debugreg(next->debugreg3, 3);
- /* no 4 and 5 */
- set_debugreg(next->debugreg6, 6);
- set_debugreg(next->debugreg7, 7);
- }
-
- if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
- test_tsk_thread_flag(next_p, TIF_NOTSC)) {
- /* prev and next are different */
- if (test_tsk_thread_flag(next_p, TIF_NOTSC))
- hard_disable_TSC();
- else
- hard_enable_TSC();
- }
-}
-
/*
* switch_to(x,yn) should switch tasks from x to y.
*
@@ -530,7 +404,7 @@ __switch_to(struct task_struct *prev_p,
if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \
next->tls_array[i].b != prev->tls_array[i].b)) { \
mcl->op = __HYPERVISOR_update_descriptor; \
- *(u64 *)&mcl->args[0] = virt_to_machine( \
+ *(u64 *)&mcl->args[0] = arbitrary_virt_to_machine( \
&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\
*(u64 *)&mcl->args[2] = *(u64 *)&next->tls_array[i]; \
mcl++; \
@@ -610,64 +484,44 @@ __switch_to(struct task_struct *prev_p,
* Restore %gs if needed (which is common)
*/
if (prev->gs | next->gs)
- loadsegment(gs, next->gs);
+ lazy_load_gs(next->gs);
- x86_write_percpu(current_task, next_p);
+ percpu_write(current_task, next_p);
return prev_p;
}
-asmlinkage int sys_fork(struct pt_regs regs)
-{
- return do_fork(SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
-}
-
-asmlinkage int sys_clone(struct pt_regs regs)
+int sys_clone(struct pt_regs *regs)
{
unsigned long clone_flags;
unsigned long newsp;
int __user *parent_tidptr, *child_tidptr;
- clone_flags = regs.bx;
- newsp = regs.cx;
- parent_tidptr = (int __user *)regs.dx;
- child_tidptr = (int __user *)regs.di;
+ clone_flags = regs->bx;
+ newsp = regs->cx;
+ parent_tidptr = (int __user *)regs->dx;
+ child_tidptr = (int __user *)regs->di;
if (!newsp)
- newsp = regs.sp;
- return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
-}
-
-/*
- * This is trivial, and on the face of it looks like it
- * could equally well be done in user mode.
- *
- * Not so, for quite unobvious reasons - register pressure.
- * In user mode vfork() cannot have a stack frame, and if
- * done by calling the "clone()" system call directly, you
- * do not have enough call-clobbered registers to hold all
- * the information you need.
- */
-asmlinkage int sys_vfork(struct pt_regs regs)
-{
- return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
+ newsp = regs->sp;
+ return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr);
}
/*
* sys_execve() executes a new program.
*/
-asmlinkage int sys_execve(struct pt_regs regs)
+int sys_execve(struct pt_regs *regs)
{
int error;
char *filename;
- filename = getname((char __user *) regs.bx);
+ filename = getname((char __user *) regs->bx);
error = PTR_ERR(filename);
if (IS_ERR(filename))
goto out;
error = do_execve(filename,
- (char __user * __user *) regs.cx,
- (char __user * __user *) regs.dx,
- &regs);
+ (char __user * __user *) regs->cx,
+ (char __user * __user *) regs->dx,
+ regs);
if (error == 0) {
/* Make sure we don't return using sysenter.. */
set_thread_flag(TIF_IRET);
--- head-2011-03-17.orig/arch/x86/kernel/process_64-xen.c 2011-02-02 08:36:43.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/process_64-xen.c 2011-02-02 08:37:17.000000000 +0100
@@ -19,6 +19,7 @@
#include <stdarg.h>
+#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
@@ -50,7 +51,6 @@
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
-#include <asm/pda.h>
#include <asm/prctl.h>
#include <xen/interface/physdev.h>
#include <asm/desc.h>
@@ -63,6 +63,11 @@
asmlinkage extern void ret_from_fork(void);
+DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
+EXPORT_PER_CPU_SYMBOL(current_task);
+
+static DEFINE_PER_CPU(unsigned char, is_idle);
+
unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
static ATOMIC_NOTIFIER_HEAD(idle_notifier);
@@ -81,13 +86,13 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregist
void enter_idle(void)
{
- write_pda(isidle, 1);
+ percpu_write(is_idle, 1);
atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}
static void __exit_idle(void)
{
- if (test_and_clear_bit_pda(0, isidle) == 0)
+ if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
return;
atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}
@@ -117,6 +122,16 @@ static inline void play_dead(void)
void cpu_idle(void)
{
current_thread_info()->status |= TS_POLLING;
+
+ /*
+ * If we're the non-boot CPU, nothing set the stack canary up
+ * for us. CPU0 already has it initialized but no harm in
+ * doing it again. This is a good place for updating it, as
+ * we wont ever return from this function (so the invalid
+ * canaries already on the stack wont ever trigger).
+ */
+ boot_init_stack_canary();
+
/* endless idle loop with no priority at all */
while (1) {
tick_nohz_stop_sched_tick(1);
@@ -226,78 +241,11 @@ void show_regs(struct pt_regs *regs)
show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}
-/*
- * Free current thread data structures etc..
- */
-void exit_thread(void)
-{
- struct task_struct *me = current;
- struct thread_struct *t = &me->thread;
-
- if (me->thread.io_bitmap_ptr) {
-#ifndef CONFIG_X86_NO_TSS
- struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
-#endif
-#ifdef CONFIG_XEN
- struct physdev_set_iobitmap iobmp_op;
- memset(&iobmp_op, 0, sizeof(iobmp_op));
-#endif
-
- kfree(t->io_bitmap_ptr);
- t->io_bitmap_ptr = NULL;
- clear_thread_flag(TIF_IO_BITMAP);
- /*
- * Careful, clear this in the TSS too:
- */
-#ifndef CONFIG_X86_NO_TSS
- memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
- put_cpu();
-#endif
-#ifdef CONFIG_XEN
- WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
- &iobmp_op));
-#endif
- t->io_bitmap_max = 0;
- }
-
- ds_exit_thread(current);
-}
-
void xen_load_gs_index(unsigned gs)
{
WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs));
}
-void flush_thread(void)
-{
- struct task_struct *tsk = current;
-
- if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
- clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
- if (test_tsk_thread_flag(tsk, TIF_IA32)) {
- clear_tsk_thread_flag(tsk, TIF_IA32);
- } else {
- set_tsk_thread_flag(tsk, TIF_IA32);
- current_thread_info()->status |= TS_COMPAT;
- }
- }
- clear_tsk_thread_flag(tsk, TIF_DEBUG);
-
- tsk->thread.debugreg0 = 0;
- tsk->thread.debugreg1 = 0;
- tsk->thread.debugreg2 = 0;
- tsk->thread.debugreg3 = 0;
- tsk->thread.debugreg6 = 0;
- tsk->thread.debugreg7 = 0;
- memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
- /*
- * Forget coprocessor state..
- */
- tsk->fpu_counter = 0;
- clear_fpu(tsk);
- clear_used_math();
-}
-
void release_thread(struct task_struct *dead_task)
{
if (dead_task->mm) {
@@ -339,7 +287,7 @@ void prepare_to_copy(struct task_struct
unlazy_fpu(tsk);
}
-int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
+int copy_thread(unsigned long clone_flags, unsigned long sp,
unsigned long unused,
struct task_struct *p, struct pt_regs *regs)
{
@@ -430,103 +378,6 @@ start_thread(struct pt_regs *regs, unsig
}
EXPORT_SYMBOL_GPL(start_thread);
-static void hard_disable_TSC(void)
-{
- write_cr4(read_cr4() | X86_CR4_TSD);
-}
-
-void disable_TSC(void)
-{
- preempt_disable();
- if (!test_and_set_thread_flag(TIF_NOTSC))
- /*
- * Must flip the CPU state synchronously with
- * TIF_NOTSC in the current running context.
- */
- hard_disable_TSC();
- preempt_enable();
-}
-
-static void hard_enable_TSC(void)
-{
- write_cr4(read_cr4() & ~X86_CR4_TSD);
-}
-
-static void enable_TSC(void)
-{
- preempt_disable();
- if (test_and_clear_thread_flag(TIF_NOTSC))
- /*
- * Must flip the CPU state synchronously with
- * TIF_NOTSC in the current running context.
- */
- hard_enable_TSC();
- preempt_enable();
-}
-
-int get_tsc_mode(unsigned long adr)
-{
- unsigned int val;
-
- if (test_thread_flag(TIF_NOTSC))
- val = PR_TSC_SIGSEGV;
- else
- val = PR_TSC_ENABLE;
-
- return put_user(val, (unsigned int __user *)adr);
-}
-
-int set_tsc_mode(unsigned int val)
-{
- if (val == PR_TSC_SIGSEGV)
- disable_TSC();
- else if (val == PR_TSC_ENABLE)
- enable_TSC();
- else
- return -EINVAL;
-
- return 0;
-}
-
-/*
- * This special macro can be used to load a debugging register
- */
-#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
-
-static inline void __switch_to_xtra(struct task_struct *prev_p,
- struct task_struct *next_p)
-{
- struct thread_struct *prev, *next;
-
- prev = &prev_p->thread,
- next = &next_p->thread;
-
- if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
- test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
- ds_switch_to(prev_p, next_p);
- else if (next->debugctlmsr != prev->debugctlmsr)
- update_debugctlmsr(next->debugctlmsr);
-
- if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
- loaddebug(next, 0);
- loaddebug(next, 1);
- loaddebug(next, 2);
- loaddebug(next, 3);
- /* no 4 and 5 */
- loaddebug(next, 6);
- loaddebug(next, 7);
- }
-
- if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
- test_tsk_thread_flag(next_p, TIF_NOTSC)) {
- /* prev and next are different */
- if (test_tsk_thread_flag(next_p, TIF_NOTSC))
- hard_disable_TSC();
- else
- hard_enable_TSC();
- }
-}
-
/*
* switch_to(x,y) should switch tasks from x to y.
*
@@ -592,7 +443,7 @@ __switch_to(struct task_struct *prev_p,
if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \
next->tls_array[i].b != prev->tls_array[i].b)) { \
mcl->op = __HYPERVISOR_update_descriptor; \
- mcl->args[0] = virt_to_machine( \
+ mcl->args[0] = arbitrary_virt_to_machine( \
&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\
mcl->args[1] = *(u64 *)&next->tls_array[i]; \
mcl++; \
@@ -679,19 +530,11 @@ __switch_to(struct task_struct *prev_p,
/*
* Switch the PDA context.
*/
- write_pda(pcurrent, next_p);
- write_pda(kernelstack,
- (unsigned long)task_stack_page(next_p) +
- THREAD_SIZE - PDA_STACKOFFSET);
-#ifdef CONFIG_CC_STACKPROTECTOR
- write_pda(stack_canary, next_p->stack_canary);
+ percpu_write(current_task, next_p);
- /*
- * Build time only check to make sure the stack_canary is at
- * offset 40 in the pda; this is a gcc ABI requirement
- */
- BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
-#endif
+ percpu_write(kernel_stack,
+ (unsigned long)task_stack_page(next_p) +
+ THREAD_SIZE - KERNEL_STACK_OFFSET);
/*
* Now maybe reload the debug registers
@@ -745,11 +588,6 @@ void set_personality_64bit(void)
current->personality &= ~READ_IMPLIES_EXEC;
}
-asmlinkage long sys_fork(struct pt_regs *regs)
-{
- return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
-}
-
asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
@@ -759,22 +597,6 @@ sys_clone(unsigned long clone_flags, uns
return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
-/*
- * This is trivial, and on the face of it looks like it
- * could equally well be done in user mode.
- *
- * Not so, for quite unobvious reasons - register pressure.
- * In user mode vfork() cannot have a stack frame, and if
- * done by calling the "clone()" system call directly, you
- * do not have enough call-clobbered registers to hold all
- * the information you need.
- */
-asmlinkage long sys_vfork(struct pt_regs *regs)
-{
- return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
- NULL, NULL);
-}
-
unsigned long get_wchan(struct task_struct *p)
{
unsigned long stack;
--- head-2011-03-17.orig/arch/x86/kernel/setup-xen.c 2011-03-03 16:22:27.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/setup-xen.c 2011-03-03 16:22:49.000000000 +0100
@@ -74,14 +74,15 @@
#include <asm/e820.h>
#include <asm/mpspec.h>
#include <asm/setup.h>
-#include <asm/arch_hooks.h>
#include <asm/efi.h>
+#include <asm/timer.h>
+#include <asm/i8259.h>
#include <asm/sections.h>
#include <asm/dmi.h>
#include <asm/io_apic.h>
#include <asm/ist.h>
#include <asm/vmi.h>
-#include <setup_arch.h>
+#include <asm/setup_arch.h>
#include <asm/bios_ebda.h>
#include <asm/cacheflush.h>
#include <asm/processor.h>
@@ -89,7 +90,7 @@
#include <asm/system.h>
#include <asm/vsyscall.h>
-#include <asm/smp.h>
+#include <asm/cpu.h>
#include <asm/desc.h>
#include <asm/dma.h>
#include <asm/iommu.h>
@@ -97,7 +98,6 @@
#include <asm/mmu_context.h>
#include <asm/proto.h>
-#include <mach_apic.h>
#include <asm/paravirt.h>
#include <asm/hypervisor.h>
@@ -118,9 +118,6 @@
#include <xen/firmware.h>
#include <xen/xencons.h>
-shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
-EXPORT_SYMBOL(HYPERVISOR_shared_info);
-
static int xen_panic_event(struct notifier_block *, unsigned long, void *);
static struct notifier_block xen_panic_block = {
xen_panic_event, NULL, 0 /* try to go last */
@@ -145,7 +142,26 @@ EXPORT_SYMBOL(xen_start_info);
#define ARCH_SETUP
#endif
+RESERVE_BRK(dmi_alloc, 65536);
+
+unsigned int boot_cpu_id __read_mostly;
+
+static __initdata unsigned long _brk_start = (unsigned long)__brk_base;
+unsigned long _brk_end = (unsigned long)__brk_base;
+
#ifndef CONFIG_XEN
+#ifdef CONFIG_X86_64
+int default_cpu_present_to_apicid(int mps_cpu)
+{
+ return __default_cpu_present_to_apicid(mps_cpu);
+}
+
+int default_check_phys_apicid_present(int boot_cpu_physical_apicid)
+{
+ return __default_check_phys_apicid_present(boot_cpu_physical_apicid);
+}
+#endif
+
#ifndef CONFIG_DEBUG_BOOT_PARAMS
struct boot_params __initdata boot_params;
#else
@@ -179,14 +195,6 @@ static struct resource bss_resource = {
#ifdef CONFIG_X86_32
-#ifndef CONFIG_XEN
-/* This value is set up by the early boot code to point to the value
- immediately after the boot time page tables. It contains a *physical*
- address, and must not be in the .bss segment! */
-unsigned long init_pg_tables_start __initdata = ~0UL;
-unsigned long init_pg_tables_end __initdata = ~0UL;
-#endif
-
static struct resource video_ram_resource = {
.name = "Video RAM area",
.start = 0xa0000,
@@ -226,7 +234,9 @@ struct ist_info ist_info;
#endif
#else
-struct cpuinfo_x86 boot_cpu_data __read_mostly;
+struct cpuinfo_x86 boot_cpu_data __read_mostly = {
+ .x86_phys_bits = MAX_PHYSMEM_BITS,
+};
EXPORT_SYMBOL(boot_cpu_data);
#endif
@@ -241,12 +251,6 @@ unsigned long mmu_cr4_features = X86_CR4
int bootloader_type;
/*
- * Early DMI memory
- */
-int dmi_alloc_index;
-char dmi_alloc_data[DMI_MAX_DATA];
-
-/*
* Setup options
*/
struct screen_info screen_info;
@@ -293,6 +297,35 @@ static inline void copy_edd(void)
}
#endif
+void * __init extend_brk(size_t size, size_t align)
+{
+ size_t mask = align - 1;
+ void *ret;
+
+ BUG_ON(_brk_start == 0);
+ BUG_ON(align & mask);
+
+ _brk_end = (_brk_end + mask) & ~mask;
+ BUG_ON((char *)(_brk_end + size) > __brk_limit);
+
+ ret = (void *)_brk_end;
+ _brk_end += size;
+
+ memset(ret, 0, size);
+
+ return ret;
+}
+
+static void __init reserve_brk(void)
+{
+ if (_brk_end > _brk_start)
+ reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK");
+
+ /* Mark brk area as locked down and no longer taking any
+ new allocations */
+ _brk_start = 0;
+}
+
#ifdef CONFIG_BLK_DEV_INITRD
#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
@@ -653,24 +686,7 @@ static int __init setup_elfcorehdr(char
early_param("elfcorehdr", setup_elfcorehdr);
#endif
-#ifndef CONFIG_XEN
-static int __init default_update_genapic(void)
-{
-#ifdef CONFIG_X86_SMP
-# if defined(CONFIG_X86_GENERICARCH) || defined(CONFIG_X86_64)
- genapic->wakeup_cpu = wakeup_secondary_cpu_via_init;
-# endif
-#endif
-
- return 0;
-}
-#else
-#define default_update_genapic NULL
-#endif
-
-static struct x86_quirks default_x86_quirks __initdata = {
- .update_genapic = default_update_genapic,
-};
+static struct x86_quirks default_x86_quirks __initdata;
struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
@@ -739,12 +755,6 @@ void __init setup_arch(char **cmdline_p)
/* Register a call for panic conditions. */
atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
- WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
- VMASST_TYPE_writable_pagetables));
-#ifdef CONFIG_X86_32
- WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
- VMASST_TYPE_4gb_segments));
-#endif
set_iopl.iopl = 1;
WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
#endif /* CONFIG_XEN */
@@ -752,7 +762,6 @@ void __init setup_arch(char **cmdline_p)
#ifdef CONFIG_X86_32
memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
visws_early_detect();
- pre_setup_arch_hook();
#else
printk(KERN_INFO "Command line: %s\n", boot_command_line);
#endif
@@ -836,16 +845,7 @@ void __init setup_arch(char **cmdline_p)
init_mm.start_code = (unsigned long) _text;
init_mm.end_code = (unsigned long) _etext;
init_mm.end_data = (unsigned long) _edata;
-#ifdef CONFIG_X86_32
-#ifndef CONFIG_XEN
- init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
-#else
- init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
- xen_start_info->nr_pt_frames) << PAGE_SHIFT;
-#endif
-#else
- init_mm.brk = (unsigned long) &_end;
-#endif
+ init_mm.brk = _brk_end;
code_resource.start = virt_to_phys(_text);
code_resource.end = virt_to_phys(_etext)-1;
@@ -958,9 +958,8 @@ void __init setup_arch(char **cmdline_p)
num_physpages = max_pfn;
max_mapnr = max_pfn;
-#ifndef CONFIG_XEN
- if (cpu_has_x2apic)
- check_x2apic();
+#ifdef CONFIG_X86_LOCAL_APIC
+ check_x2apic();
#endif
/* How many end-of-memory variables you have, grandma! */
@@ -977,6 +976,8 @@ void __init setup_arch(char **cmdline_p)
setup_bios_corruption_check();
#endif
+ reserve_brk();
+
/* max_pfn_mapped is updated here */
max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
max_pfn_mapped = max_low_pfn_mapped;
@@ -1001,7 +1002,7 @@ void __init setup_arch(char **cmdline_p)
reserve_initrd();
-#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
+#ifndef CONFIG_XEN
vsmp_init();
#endif
@@ -1036,12 +1037,11 @@ void __init setup_arch(char **cmdline_p)
*/
acpi_reserve_bootmem();
#endif
-#ifdef CONFIG_X86_FIND_SMP_CONFIG
/*
* Find and reserve possible boot-time SMP configuration:
*/
find_smp_config();
-#endif
+
reserve_crashkernel();
#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
@@ -1142,13 +1142,9 @@ void __init setup_arch(char **cmdline_p)
for (i = 0; i < MAX_DMA_CHANNELS; ++i)
if (i != 4 && request_dma(i, "xen") != 0)
BUG();
-#endif /* CONFIG_XEN */
-
-#ifdef CONFIG_X86_GENERICARCH
+#else /* CONFIG_XEN */
generic_apic_probe();
-#endif
-#ifndef CONFIG_XEN
early_quirks();
#endif
@@ -1220,6 +1216,98 @@ void __init setup_arch(char **cmdline_p)
#endif /* CONFIG_XEN */
}
+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
+
+/**
+ * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
+ *
+ * Description:
+ * Perform any necessary interrupt initialisation prior to setting up
+ * the "ordinary" interrupt call gates. For legacy reasons, the ISA
+ * interrupts should be initialised here if the machine emulates a PC
+ * in any way.
+ **/
+void __init x86_quirk_pre_intr_init(void)
+{
+ if (x86_quirks->arch_pre_intr_init) {
+ if (x86_quirks->arch_pre_intr_init())
+ return;
+ }
+ init_ISA_irqs();
+}
+
+/**
+ * x86_quirk_intr_init - post gate setup interrupt initialisation
+ *
+ * Description:
+ * Fill in any interrupts that may have been left out by the general
+ * init_IRQ() routine. interrupts having to do with the machine rather
+ * than the devices on the I/O bus (like APIC interrupts in intel MP
+ * systems) are started here.
+ **/
+void __init x86_quirk_intr_init(void)
+{
+ if (x86_quirks->arch_intr_init) {
+ if (x86_quirks->arch_intr_init())
+ return;
+ }
+}
+
+/**
+ * x86_quirk_trap_init - initialise system specific traps
+ *
+ * Description:
+ * Called as the final act of trap_init(). Used in VISWS to initialise
+ * the various board specific APIC traps.
+ **/
+void __init x86_quirk_trap_init(void)
+{
+ if (x86_quirks->arch_trap_init) {
+ if (x86_quirks->arch_trap_init())
+ return;
+ }
+}
+
+static struct irqaction irq0 = {
+ .handler = timer_interrupt,
+ .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
+ .name = "timer"
+};
+
+/**
+ * x86_quirk_pre_time_init - do any specific initialisations before.
+ *
+ **/
+void __init x86_quirk_pre_time_init(void)
+{
+ if (x86_quirks->arch_pre_time_init)
+ x86_quirks->arch_pre_time_init();
+}
+
+/**
+ * x86_quirk_time_init - do any specific initialisations for the system timer.
+ *
+ * Description:
+ * Must plug the system timer interrupt source at HZ into the IRQ listed
+ * in irq_vectors.h:TIMER_IRQ
+ **/
+void __init x86_quirk_time_init(void)
+{
+ if (x86_quirks->arch_time_init) {
+ /*
+ * A nonzero return code does not mean failure, it means
+ * that the architecture quirk does not want any
+ * generic (timer) setup to be performed after this:
+ */
+ if (x86_quirks->arch_time_init())
+ return;
+ }
+
+ irq0.mask = cpumask_of_cpu(0);
+ setup_irq(0, &irq0);
+}
+#endif /* CONFIG_X86_32 */
+
#ifdef CONFIG_XEN
static int
xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
--- head-2011-03-17.orig/arch/x86/kernel/setup_percpu.c 2011-03-17 14:35:45.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/setup_percpu.c 2011-02-01 14:44:12.000000000 +0100
@@ -219,7 +219,7 @@ void __init setup_per_cpu_areas(void)
* are zeroed indicating that the static arrays are
* gone.
*/
-#ifdef CONFIG_X86_LOCAL_APIC
+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
per_cpu(x86_cpu_to_apicid, cpu) =
early_per_cpu_map(x86_cpu_to_apicid, cpu);
per_cpu(x86_bios_cpu_apicid, cpu) =
@@ -252,7 +252,7 @@ void __init setup_per_cpu_areas(void)
}
/* indicate the early static arrays will soon be gone */
-#ifdef CONFIG_X86_LOCAL_APIC
+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
#endif
--- head-2011-03-17.orig/arch/x86/kernel/smp-xen.c 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/smp-xen.c 2011-02-01 14:44:12.000000000 +0100
@@ -2,7 +2,7 @@
* Intel SMP support routines.
*
* (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
- * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
+ * (c) 1998-99, 2000, 2009 Ingo Molnar <mingo@redhat.com>
* (c) 2002,2003 Andi Kleen, SuSE Labs.
*
* i386 and x86_64 integration by Glauber Costa <gcosta@redhat.com>
@@ -26,7 +26,7 @@
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
-#include <mach_ipi.h>
+#include <asm/ipi.h>
#include <xen/evtchn.h>
/*
* Some notes on x86 processor bugs affecting SMP operation:
@@ -118,17 +118,17 @@ void xen_smp_send_reschedule(int cpu)
WARN_ON(1);
return;
}
- send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
+ xen_send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
}
void xen_send_call_func_single_ipi(int cpu)
{
- send_IPI_mask(cpumask_of(cpu), CALL_FUNC_SINGLE_VECTOR);
+ xen_send_IPI_mask(cpumask_of(cpu), CALL_FUNC_SINGLE_VECTOR);
}
void xen_send_call_func_ipi(const struct cpumask *mask)
{
- send_IPI_mask_allbutself(mask, CALL_FUNCTION_VECTOR);
+ xen_send_IPI_mask_allbutself(mask, CALL_FUNCTION_VECTOR);
}
/*
--- head-2011-03-17.orig/arch/x86/kernel/time-xen.c 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/time-xen.c 2011-02-01 14:44:12.000000000 +0100
@@ -83,7 +83,7 @@ static DEFINE_PER_CPU(u64, processed_sto
static DEFINE_PER_CPU(u64, processed_blocked_time);
/* Current runstate of each CPU (updated automatically by the hypervisor). */
-static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
+DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
/* Must be signed, as it's compared with s64 quantities which can be -ve. */
#define NS_PER_TICK (1000000000LL/HZ)
@@ -557,7 +557,8 @@ irqreturn_t timer_interrupt(int irq, voi
ct = jiffies_to_cputime(delta_cpu);
if (user_mode_vm(get_irq_regs()))
account_user_time(current, ct, cputime_to_scaled(ct));
- else if (current != idle_task(cpu))
+ else if (current != idle_task(cpu)
+ || irq_count() != HARDIRQ_OFFSET)
account_system_time(current, HARDIRQ_OFFSET,
ct, cputime_to_scaled(ct));
else
@@ -590,7 +591,7 @@ EXPORT_SYMBOL_GPL(mark_tsc_unstable);
static cycle_t cs_last;
-static cycle_t xen_clocksource_read(void)
+static cycle_t xen_clocksource_read(struct clocksource *cs)
{
#ifdef CONFIG_SMP
cycle_t last = get64(&cs_last);
--- head-2011-03-17.orig/arch/x86/kernel/traps-xen.c 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/traps-xen.c 2011-02-01 14:44:12.000000000 +0100
@@ -54,15 +54,14 @@
#include <asm/desc.h>
#include <asm/i387.h>
-#include <mach_traps.h>
+#include <asm/mach_traps.h>
#ifdef CONFIG_X86_64
#include <asm/pgalloc.h>
#include <asm/proto.h>
-#include <asm/pda.h>
#else
#include <asm/processor-flags.h>
-#include <asm/arch_hooks.h>
+#include <asm/setup.h>
#include <asm/traps.h>
#include "cpu/mcheck/mce.h"
@@ -123,49 +122,6 @@ die_if_kernel(const char *str, struct pt
if (!user_mode_vm(regs))
die(str, regs, err);
}
-
-/*
- * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
- * invalid offset set (the LAZY one) and the faulting thread has
- * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS,
- * we set the offset field correctly and return 1.
- */
-static int lazy_iobitmap_copy(void)
-{
-#ifndef CONFIG_XEN
- struct thread_struct *thread;
- struct tss_struct *tss;
- int cpu;
-
- cpu = get_cpu();
- tss = &per_cpu(init_tss, cpu);
- thread = &current->thread;
-
- if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
- thread->io_bitmap_ptr) {
- memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
- thread->io_bitmap_max);
- /*
- * If the previously set map was extending to higher ports
- * than the current one, pad extra space with 0xff (no access).
- */
- if (thread->io_bitmap_max < tss->io_bitmap_max) {
- memset((char *) tss->io_bitmap +
- thread->io_bitmap_max, 0xff,
- tss->io_bitmap_max - thread->io_bitmap_max);
- }
- tss->io_bitmap_max = thread->io_bitmap_max;
- tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
- tss->io_bitmap_owner = thread;
- put_cpu();
-
- return 1;
- }
- put_cpu();
-#endif
-
- return 0;
-}
#endif
static void __kprobes
@@ -316,11 +272,6 @@ do_general_protection(struct pt_regs *re
conditional_sti(regs);
#ifdef CONFIG_X86_32
- if (lazy_iobitmap_copy()) {
- /* restart the faulting instruction */
- return;
- }
-
if (regs->flags & X86_VM_MASK)
goto gp_in_vm86;
#endif
@@ -911,19 +862,20 @@ void math_emulate(struct math_emu_info *
}
#endif /* CONFIG_MATH_EMULATION */
-dotraplinkage void __kprobes do_device_not_available(struct pt_regs regs)
+dotraplinkage void __kprobes
+do_device_not_available(struct pt_regs *regs, long error_code)
{
#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
if (read_cr0() & X86_CR0_EM) {
struct math_emu_info info = { };
- conditional_sti(&regs);
+ conditional_sti(regs);
- info.regs = &regs;
+ info.regs = regs;
math_emulate(&info);
} else {
math_state_restore(); /* interrupts still off */
- conditional_sti(&regs);
+ conditional_sti(regs);
}
#else
math_state_restore();
@@ -939,7 +891,7 @@ dotraplinkage void do_iret_error(struct
info.si_signo = SIGILL;
info.si_errno = 0;
info.si_code = ILL_BADSTK;
- info.si_addr = 0;
+ info.si_addr = NULL;
if (notify_die(DIE_TRAP, "iret exception",
regs, error_code, 32, SIGILL) == NOTIFY_STOP)
return;
--- head-2011-03-17.orig/arch/x86/kernel/vmlinux.lds.S 2011-01-31 17:32:16.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/vmlinux.lds.S 2011-02-01 14:44:12.000000000 +0100
@@ -16,8 +16,10 @@
#ifdef CONFIG_X86_32
#define LOAD_OFFSET __PAGE_OFFSET
-#else
+#elif !defined(CONFIG_XEN) || CONFIG_XEN_COMPAT > 0x030002
#define LOAD_OFFSET __START_KERNEL_map
+#else
+#define LOAD_OFFSET 0
#endif
#include <asm-generic/vmlinux.lds.h>
--- head-2011-03-17.orig/arch/x86/mach-xen/Makefile 2007-06-12 13:12:48.000000000 +0200
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,5 +0,0 @@
-#
-# Makefile for the linux kernel.
-#
-
-obj-y := setup.o
--- head-2011-03-17.orig/arch/x86/mach-xen/setup.c 2011-02-03 14:23:14.000000000 +0100
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,186 +0,0 @@
-/*
- * Machine specific setup for generic
- */
-
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <asm/acpi.h>
-#include <asm/arch_hooks.h>
-#include <asm/e820.h>
-#include <asm/setup.h>
-#include <asm/fixmap.h>
-#include <asm/pgtable.h>
-
-#include <xen/interface/callback.h>
-#include <xen/interface/memory.h>
-
-#ifdef CONFIG_X86_32
-
-#ifdef CONFIG_HOTPLUG_CPU
-#define DEFAULT_SEND_IPI (1)
-#else
-#define DEFAULT_SEND_IPI (0)
-#endif
-
-int no_broadcast=DEFAULT_SEND_IPI;
-
-static __init int no_ipi_broadcast(char *str)
-{
- get_option(&str, &no_broadcast);
- printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" :
- "IPI Broadcast");
- return 1;
-}
-
-__setup("no_ipi_broadcast", no_ipi_broadcast);
-
-static int __init print_ipi_mode(void)
-{
- printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" :
- "Shortcut");
- return 0;
-}
-
-late_initcall(print_ipi_mode);
-
-unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
-EXPORT_SYMBOL(machine_to_phys_mapping);
-unsigned int machine_to_phys_order;
-EXPORT_SYMBOL(machine_to_phys_order);
-
-void __init pre_setup_arch_hook(void)
-{
- struct xen_machphys_mapping mapping;
- unsigned long machine_to_phys_nr_ents;
- struct xen_platform_parameters pp;
-
- init_mm.pgd = swapper_pg_dir = (pgd_t *)xen_start_info->pt_base;
-
- xen_setup_features();
-
- if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) {
- hypervisor_virt_start = pp.virt_start;
- reserve_top_address(0UL - pp.virt_start);
- }
-
- if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
- machine_to_phys_mapping = (unsigned long *)mapping.v_start;
- machine_to_phys_nr_ents = mapping.max_mfn + 1;
- } else
- machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
- machine_to_phys_order = fls(machine_to_phys_nr_ents - 1);
-
- if (!xen_feature(XENFEAT_auto_translated_physmap))
- phys_to_machine_mapping =
- (unsigned long *)xen_start_info->mfn_list;
-}
-
-#endif /* CONFIG_X86_32 */
-
-extern void hypervisor_callback(void);
-extern void failsafe_callback(void);
-extern void nmi(void);
-
-#ifdef CONFIG_X86_64
-#include <asm/proto.h>
-#define CALLBACK_ADDR(fn) ((unsigned long)(fn))
-#else
-#define CALLBACK_ADDR(fn) { __KERNEL_CS, (unsigned long)(fn) }
-#endif
-
-void __init machine_specific_arch_setup(void)
-{
- int ret;
- static struct callback_register __initdata event = {
- .type = CALLBACKTYPE_event,
- .address = CALLBACK_ADDR(hypervisor_callback)
- };
- static struct callback_register __initdata failsafe = {
- .type = CALLBACKTYPE_failsafe,
- .address = CALLBACK_ADDR(failsafe_callback)
- };
-#ifdef CONFIG_X86_64
- static struct callback_register __initdata syscall = {
- .type = CALLBACKTYPE_syscall,
- .address = CALLBACK_ADDR(system_call)
- };
-#endif
- static struct callback_register __initdata nmi_cb = {
- .type = CALLBACKTYPE_nmi,
- .address = CALLBACK_ADDR(nmi)
- };
-
- ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
- if (ret == 0)
- ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
-#ifdef CONFIG_X86_64
- if (ret == 0)
- ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
-#endif
-#if CONFIG_XEN_COMPAT <= 0x030002
-#ifdef CONFIG_X86_32
- if (ret == -ENOSYS)
- ret = HYPERVISOR_set_callbacks(
- event.address.cs, event.address.eip,
- failsafe.address.cs, failsafe.address.eip);
-#else
- ret = HYPERVISOR_set_callbacks(
- event.address,
- failsafe.address,
- syscall.address);
-#endif
-#endif
- BUG_ON(ret);
-
- ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
-#if CONFIG_XEN_COMPAT <= 0x030002
- if (ret == -ENOSYS) {
- static struct xennmi_callback __initdata cb = {
- .handler_address = (unsigned long)nmi
- };
-
- HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
- }
-#endif
-
-#ifdef CONFIG_X86_32
- /* Do an early initialization of the fixmap area */
- {
- extern pte_t swapper_pg_fixmap[PTRS_PER_PTE];
- unsigned long addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
- pud_t *pud = pud_offset(swapper_pg_dir + pgd_index(addr), addr);
- pmd_t *pmd = pmd_offset(pud, addr);
- unsigned int i;
-
- make_lowmem_page_readonly(swapper_pg_fixmap, XENFEAT_writable_page_tables);
- set_pmd(pmd, __pmd(__pa_symbol(swapper_pg_fixmap) | _PAGE_TABLE));
-
-#define __FIXADDR_TOP (-PAGE_SIZE)
-#define FIX_BUG_ON(fix) BUILD_BUG_ON(pmd_index(__fix_to_virt(FIX_##fix)) \
- != pmd_index(__fix_to_virt(FIX_EARLYCON_MEM_BASE)))
- FIX_BUG_ON(SHARED_INFO);
- FIX_BUG_ON(ISAMAP_BEGIN);
- FIX_BUG_ON(ISAMAP_END);
-#undef __FIXADDR_TOP
- BUG_ON(pte_index(hypervisor_virt_start));
-
- /* Switch to the real shared_info page, and clear the
- * dummy page. */
- set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
- HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
- memset(empty_zero_page, 0, sizeof(empty_zero_page));
-
- /* Setup mapping of lower 1st MB */
- for (i = 0; i < NR_FIX_ISAMAPS; i++)
- if (is_initial_xendomain())
- set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
- else
- __set_fixmap(FIX_ISAMAP_BEGIN - i,
- virt_to_machine(empty_zero_page),
- PAGE_KERNEL_RO);
- }
-#endif
-}
--- head-2011-03-17.orig/arch/x86/mm/Makefile 2011-02-01 14:38:38.000000000 +0100
+++ head-2011-03-17/arch/x86/mm/Makefile 2011-02-01 14:44:12.000000000 +0100
@@ -27,7 +27,7 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology_6
obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
obj-$(CONFIG_XEN) += hypervisor.o
-disabled-obj-$(CONFIG_XEN) := gup.o
+disabled-obj-$(CONFIG_XEN) := gup.o tlb.o
obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
--- head-2011-03-17.orig/arch/x86/mm/fault-xen.c 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/mm/fault-xen.c 2011-02-01 14:44:12.000000000 +0100
@@ -1,73 +1,79 @@
/*
* Copyright (C) 1995 Linus Torvalds
- * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
+ * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
+ * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
*/
-
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/ptrace.h>
-#include <linux/mmiotrace.h>
-#include <linux/mman.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
#include <linux/interrupt.h>
-#include <linux/init.h>
-#include <linux/tty.h>
-#include <linux/vt_kern.h> /* For unblank_screen() */
+#include <linux/mmiotrace.h>
+#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/highmem.h>
-#include <linux/bootmem.h> /* for max_low_pfn */
-#include <linux/vmalloc.h>
-#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/vt_kern.h>
+#include <linux/signal.h>
+#include <linux/kernel.h>
+#include <linux/ptrace.h>
+#include <linux/string.h>
+#include <linux/module.h>
#include <linux/kdebug.h>
+#include <linux/errno.h>
+#include <linux/magic.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/mman.h>
+#include <linux/tty.h>
+#include <linux/smp.h>
+#include <linux/mm.h>
+
+#include <asm-generic/sections.h>
-#include <asm/system.h>
-#include <asm/desc.h>
-#include <asm/segment.h>
-#include <asm/pgalloc.h>
-#include <asm/smp.h>
#include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
+#include <asm/segment.h>
+#include <asm/system.h>
#include <asm/proto.h>
-#include <asm-generic/sections.h>
#include <asm/traps.h>
+#include <asm/desc.h>
/*
- * Page fault error code bits
- * bit 0 == 0 means no page found, 1 means protection fault
- * bit 1 == 0 means read, 1 means write
- * bit 2 == 0 means kernel, 1 means user-mode
- * bit 3 == 1 means use of reserved bit detected
- * bit 4 == 1 means fault was an instruction fetch
- */
-#define PF_PROT (1<<0)
-#define PF_WRITE (1<<1)
-#define PF_USER (1<<2)
-#define PF_RSVD (1<<3)
-#define PF_INSTR (1<<4)
+ * Page fault error code bits:
+ *
+ * bit 0 == 0: no page found 1: protection fault
+ * bit 1 == 0: read access 1: write access
+ * bit 2 == 0: kernel-mode access 1: user-mode access
+ * bit 3 == 1: use of reserved bit detected
+ * bit 4 == 1: fault was an instruction fetch
+ */
+enum x86_pf_error_code {
+ PF_PROT = 1 << 0,
+ PF_WRITE = 1 << 1,
+ PF_USER = 1 << 2,
+ PF_RSVD = 1 << 3,
+ PF_INSTR = 1 << 4,
+};
+
+/*
+ * Returns 0 if mmiotrace is disabled, or if the fault is not
+ * handled by mmiotrace:
+ */
static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
-#ifdef CONFIG_MMIOTRACE
if (unlikely(is_kmmio_active()))
if (kmmio_handler(regs, addr) == 1)
return -1;
-#endif
return 0;
}
static inline int notify_page_fault(struct pt_regs *regs)
{
-#ifdef CONFIG_KPROBES
int ret = 0;
/* kprobe_running() needs smp_processor_id() */
- if (!user_mode_vm(regs)) {
+ if (kprobes_built_in() && !user_mode_vm(regs)) {
preempt_disable();
if (kprobe_running() && kprobe_fault_handler(regs, 14))
ret = 1;
@@ -75,29 +81,76 @@ static inline int notify_page_fault(stru
}
return ret;
-#else
- return 0;
-#endif
}
/*
- * X86_32
- * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
- * Check that here and ignore it.
- *
- * X86_64
- * Sometimes the CPU reports invalid exceptions on prefetch.
- * Check that here and ignore it.
+ * Prefetch quirks:
+ *
+ * 32-bit mode:
*
- * Opcode checker based on code by Richard Brunner
+ * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
+ * Check that here and ignore it.
+ *
+ * 64-bit mode:
+ *
+ * Sometimes the CPU reports invalid exceptions on prefetch.
+ * Check that here and ignore it.
+ *
+ * Opcode checker based on code by Richard Brunner.
*/
-static int is_prefetch(struct pt_regs *regs, unsigned long addr,
- unsigned long error_code)
+static inline int
+check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
+ unsigned char opcode, int *prefetch)
+{
+ unsigned char instr_hi = opcode & 0xf0;
+ unsigned char instr_lo = opcode & 0x0f;
+
+ switch (instr_hi) {
+ case 0x20:
+ case 0x30:
+ /*
+ * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
+ * In X86_64 long mode, the CPU will signal invalid
+ * opcode if some of these prefixes are present so
+ * X86_64 will never get here anyway
+ */
+ return ((instr_lo & 7) == 0x6);
+#ifdef CONFIG_X86_64
+ case 0x40:
+ /*
+ * In AMD64 long mode 0x40..0x4F are valid REX prefixes
+ * Need to figure out under what instruction mode the
+ * instruction was issued. Could check the LDT for lm,
+ * but for now it's good enough to assume that long
+ * mode only uses well known segments or kernel.
+ */
+ return (!user_mode(regs)) || (regs->cs == __USER_CS);
+#endif
+ case 0x60:
+ /* 0x64 thru 0x67 are valid prefixes in all modes. */
+ return (instr_lo & 0xC) == 0x4;
+ case 0xF0:
+ /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
+ return !instr_lo || (instr_lo>>1) == 1;
+ case 0x00:
+ /* Prefetch instruction is 0x0F0D or 0x0F18 */
+ if (probe_kernel_address(instr, opcode))
+ return 0;
+
+ *prefetch = (instr_lo == 0xF) &&
+ (opcode == 0x0D || opcode == 0x18);
+ return 0;
+ default:
+ return 0;
+ }
+}
+
+static int
+is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
{
+ unsigned char *max_instr;
unsigned char *instr;
- int scan_more = 1;
int prefetch = 0;
- unsigned char *max_instr;
/*
* If it was a exec (instruction fetch) fault on NX page, then
@@ -106,166 +159,45 @@ static int is_prefetch(struct pt_regs *r
if (error_code & PF_INSTR)
return 0;
- instr = (unsigned char *)convert_ip_to_linear(current, regs);
+ instr = (void *)convert_ip_to_linear(current, regs);
max_instr = instr + 15;
if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
return 0;
- while (scan_more && instr < max_instr) {
+ while (instr < max_instr) {
unsigned char opcode;
- unsigned char instr_hi;
- unsigned char instr_lo;
if (probe_kernel_address(instr, opcode))
break;
- instr_hi = opcode & 0xf0;
- instr_lo = opcode & 0x0f;
instr++;
- switch (instr_hi) {
- case 0x20:
- case 0x30:
- /*
- * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
- * In X86_64 long mode, the CPU will signal invalid
- * opcode if some of these prefixes are present so
- * X86_64 will never get here anyway
- */
- scan_more = ((instr_lo & 7) == 0x6);
- break;
-#ifdef CONFIG_X86_64
- case 0x40:
- /*
- * In AMD64 long mode 0x40..0x4F are valid REX prefixes
- * Need to figure out under what instruction mode the
- * instruction was issued. Could check the LDT for lm,
- * but for now it's good enough to assume that long
- * mode only uses well known segments or kernel.
- */
- scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
- break;
-#endif
- case 0x60:
- /* 0x64 thru 0x67 are valid prefixes in all modes. */
- scan_more = (instr_lo & 0xC) == 0x4;
- break;
- case 0xF0:
- /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
- scan_more = !instr_lo || (instr_lo>>1) == 1;
- break;
- case 0x00:
- /* Prefetch instruction is 0x0F0D or 0x0F18 */
- scan_more = 0;
-
- if (probe_kernel_address(instr, opcode))
- break;
- prefetch = (instr_lo == 0xF) &&
- (opcode == 0x0D || opcode == 0x18);
+ if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
break;
- default:
- scan_more = 0;
- break;
- }
}
return prefetch;
}
-static void force_sig_info_fault(int si_signo, int si_code,
- unsigned long address, struct task_struct *tsk)
+static void
+force_sig_info_fault(int si_signo, int si_code, unsigned long address,
+ struct task_struct *tsk)
{
siginfo_t info;
- info.si_signo = si_signo;
- info.si_errno = 0;
- info.si_code = si_code;
- info.si_addr = (void __user *)address;
- force_sig_info(si_signo, &info, tsk);
-}
+ info.si_signo = si_signo;
+ info.si_errno = 0;
+ info.si_code = si_code;
+ info.si_addr = (void __user *)address;
-#ifdef CONFIG_X86_64
-static int bad_address(void *p)
-{
- unsigned long dummy;
- return probe_kernel_address((unsigned long *)p, dummy);
+ force_sig_info(si_signo, &info, tsk);
}
-#endif
-
-static void dump_pagetable(unsigned long address)
-{
-#ifdef CONFIG_X86_32
- __typeof__(pte_val(__pte(0))) page;
-
- page = read_cr3();
- page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
-#ifdef CONFIG_X86_PAE
- printk("*pdpt = %016Lx ", page);
- if ((page & _PAGE_PRESENT)
- && mfn_to_local_pfn(page >> PAGE_SHIFT) < max_low_pfn) {
- page = mfn_to_pfn(page >> PAGE_SHIFT);
- page <<= PAGE_SHIFT;
- page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
- & (PTRS_PER_PMD - 1)];
- printk(KERN_CONT "*pde = %016Lx ", page);
- page &= ~_PAGE_NX;
- }
-#else
- printk("*pde = %08lx ", page);
-#endif
-
- /*
- * We must not directly access the pte in the highpte
- * case if the page table is located in highmem.
- * And let's rather not kmap-atomic the pte, just in case
- * it's allocated already.
- */
- if ((page & _PAGE_PRESENT)
- && mfn_to_local_pfn(page >> PAGE_SHIFT) < max_low_pfn
- && !(page & _PAGE_PSE)) {
- page = mfn_to_pfn(page >> PAGE_SHIFT);
- page <<= PAGE_SHIFT;
- page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
- & (PTRS_PER_PTE - 1)];
- printk(KERN_CONT "*pte = %0*Lx ", sizeof(page)*2, (u64)page);
- }
-
- printk(KERN_CONT "\n");
-#else /* CONFIG_X86_64 */
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
- pgd = (pgd_t *)read_cr3();
-
- pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
- pgd += pgd_index(address);
- if (bad_address(pgd)) goto bad;
- printk("PGD %lx ", pgd_val(*pgd));
- if (!pgd_present(*pgd)) goto ret;
-
- pud = pud_offset(pgd, address);
- if (bad_address(pud)) goto bad;
- printk(KERN_CONT "PUD %lx ", pud_val(*pud));
- if (!pud_present(*pud) || pud_large(*pud))
- goto ret;
- pmd = pmd_offset(pud, address);
- if (bad_address(pmd)) goto bad;
- printk(KERN_CONT "PMD %lx ", pmd_val(*pmd));
- if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
+DEFINE_SPINLOCK(pgd_lock);
+LIST_HEAD(pgd_list);
- pte = pte_offset_kernel(pmd, address);
- if (bad_address(pte)) goto bad;
- printk(KERN_CONT "PTE %lx", pte_val(*pte));
-ret:
- printk(KERN_CONT "\n");
- return;
-bad:
- printk("BAD\n");
-#endif
-}
+#define pgd_page_table(what, pg) \
+ spin_##what(&((struct mm_struct *)(pg)->private)->page_table_lock)
#ifdef CONFIG_X86_32
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
@@ -286,7 +218,6 @@ static inline pmd_t *vmalloc_sync_one(pg
* and redundant with the set_pmd() on non-PAE. As would
* set_pud.
*/
-
pud = pud_offset(pgd, address);
pud_k = pud_offset(pgd_k, address);
if (!pud_present(*pud_k))
@@ -296,10 +227,11 @@ static inline pmd_t *vmalloc_sync_one(pg
pmd_k = pmd_offset(pud_k, address);
if (!pmd_present(*pmd_k))
return NULL;
+
if (!pmd_present(*pmd)) {
- bool lazy = x86_read_percpu(xen_lazy_mmu);
+ bool lazy = percpu_read(xen_lazy_mmu);
- x86_write_percpu(xen_lazy_mmu, false);
+ percpu_write(xen_lazy_mmu, false);
#if CONFIG_XEN_COMPAT > 0x030002
set_pmd(pmd, *pmd_k);
#else
@@ -309,143 +241,657 @@ static inline pmd_t *vmalloc_sync_one(pg
*/
set_pmd(pmd, __pmd(pmd_val(*pmd_k)));
#endif
- x86_write_percpu(xen_lazy_mmu, lazy);
- } else
+ percpu_write(xen_lazy_mmu, lazy);
+ } else {
BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
+ }
+
return pmd_k;
}
-#endif
-
-#ifdef CONFIG_X86_64
-static const char errata93_warning[] =
-KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
-KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
-KERN_ERR "******* Please consider a BIOS update.\n"
-KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
-#endif
-/* Workaround for K8 erratum #93 & buggy BIOS.
- BIOS SMM functions are required to use a specific workaround
- to avoid corruption of the 64bit RIP register on C stepping K8.
- A lot of BIOS that didn't get tested properly miss this.
- The OS sees this as a page fault with the upper 32bits of RIP cleared.
- Try to work around it here.
- Note we only handle faults in kernel here.
- Does nothing for X86_32
- */
-static int is_errata93(struct pt_regs *regs, unsigned long address)
+void vmalloc_sync_all(void)
{
-#ifdef CONFIG_X86_64
- static int warned;
- if (address != regs->ip)
- return 0;
- if ((address >> 32) != 0)
- return 0;
- address |= 0xffffffffUL << 32;
- if ((address >= (u64)_stext && address <= (u64)_etext) ||
- (address >= MODULES_VADDR && address <= MODULES_END)) {
- if (!warned) {
- printk(errata93_warning);
- warned = 1;
+ unsigned long address;
+
+ if (SHARED_KERNEL_PMD)
+ return;
+
+ for (address = VMALLOC_START & PMD_MASK;
+ address >= TASK_SIZE && address < FIXADDR_TOP;
+ address += PMD_SIZE) {
+
+ unsigned long flags;
+ struct page *page;
+
+ spin_lock_irqsave(&pgd_lock, flags);
+ list_for_each_entry(page, &pgd_list, lru) {
+ pmd_t *pmd;
+
+ pgd_page_table(lock, page);
+ pmd = vmalloc_sync_one(page_address(page), address);
+ pgd_page_table(unlock, page);
+
+ if (!pmd)
+ break;
}
- regs->ip = address;
- return 1;
+ spin_unlock_irqrestore(&pgd_lock, flags);
}
-#endif
- return 0;
}
/*
- * Work around K8 erratum #100 K8 in compat mode occasionally jumps to illegal
- * addresses >4GB. We catch this in the page fault handler because these
- * addresses are not reachable. Just detect this case and return. Any code
- * segment in LDT is compatibility mode.
+ * 32-bit:
+ *
+ * Handle a fault on the vmalloc or module mapping area
*/
-static int is_errata100(struct pt_regs *regs, unsigned long address)
+static noinline int vmalloc_fault(unsigned long address)
{
-#ifdef CONFIG_X86_64
- if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
- (address >> 32))
- return 1;
-#endif
- return 0;
-}
+ unsigned long pgd_paddr;
+ pmd_t *pmd_k;
+ pte_t *pte_k;
+
+ /* Make sure we are in vmalloc area: */
+ if (!(address >= VMALLOC_START && address < VMALLOC_END))
+ return -1;
-static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
-{
-#ifdef CONFIG_X86_F00F_BUG
- unsigned long nr;
/*
- * Pentium F0 0F C7 C8 bug workaround.
+ * Synchronize this task's top level page-table
+ * with the 'reference' page table.
+ *
+ * Do _not_ use "current" here. We might be inside
+ * an interrupt in the middle of a task switch..
*/
- if (boot_cpu_data.f00f_bug) {
- nr = (address - idt_descr.address) >> 3;
+ pgd_paddr = read_cr3();
+ pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
+ if (!pmd_k)
+ return -1;
+
+ pte_k = pte_offset_kernel(pmd_k, address);
+ if (!pte_present(*pte_k))
+ return -1;
- if (nr == 6) {
- do_invalid_op(regs, 0);
- return 1;
- }
- }
-#endif
return 0;
}
-static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
- unsigned long address)
+/*
+ * Did it hit the DOS screen memory VA from vm86 mode?
+ */
+static inline void
+check_v8086_mode(struct pt_regs *regs, unsigned long address,
+ struct task_struct *tsk)
{
-#ifdef CONFIG_X86_32
- if (!oops_may_print())
+ unsigned long bit;
+
+ if (!v8086_mode(regs))
return;
-#endif
-#ifdef CONFIG_X86_PAE
- if (error_code & PF_INSTR) {
- unsigned int level;
+ bit = (address - 0xA0000) >> PAGE_SHIFT;
+ if (bit < 32)
+ tsk->thread.screen_bitmap |= 1 << bit;
+}
+
+static void dump_pagetable(unsigned long address)
+{
+ __typeof__(pte_val(__pte(0))) page;
+
+ page = read_cr3();
+ page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
+
+#ifdef CONFIG_X86_PAE
+ printk("*pdpt = %016Lx ", page);
+ if ((page & _PAGE_PRESENT)
+ && mfn_to_local_pfn(page >> PAGE_SHIFT) < max_low_pfn) {
+ page = mfn_to_pfn(page >> PAGE_SHIFT);
+ page <<= PAGE_SHIFT;
+ page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
+ & (PTRS_PER_PMD - 1)];
+ printk(KERN_CONT "*pde = %016Lx ", page);
+ page &= ~_PAGE_NX;
+ }
+#else
+ printk("*pde = %08lx ", page);
+#endif
+
+ /*
+ * We must not directly access the pte in the highpte
+ * case if the page table is located in highmem.
+ * And let's rather not kmap-atomic the pte, just in case
+ * it's allocated already:
+ */
+ if ((page & _PAGE_PRESENT)
+ && mfn_to_local_pfn(page >> PAGE_SHIFT) < max_low_pfn
+ && !(page & _PAGE_PSE)) {
+
+ page = mfn_to_pfn(page >> PAGE_SHIFT);
+ page <<= PAGE_SHIFT;
+ page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
+ & (PTRS_PER_PTE - 1)];
+ printk(KERN_CONT "*pte = %0*Lx ", sizeof(page)*2, (u64)page);
+ }
+
+ printk(KERN_CONT "\n");
+}
+
+#else /* CONFIG_X86_64: */
+
+void vmalloc_sync_all(void)
+{
+ unsigned long address;
+
+ for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
+ address += PGDIR_SIZE) {
+
+ const pgd_t *pgd_ref = pgd_offset_k(address);
+ unsigned long flags;
+ struct page *page;
+
+ if (pgd_none(*pgd_ref))
+ continue;
+
+ spin_lock_irqsave(&pgd_lock, flags);
+ list_for_each_entry(page, &pgd_list, lru) {
+ pgd_t *pgd;
+ pgd = (pgd_t *)page_address(page) + pgd_index(address);
+ pgd_page_table(lock, page);
+ if (pgd_none(*pgd))
+ set_pgd(pgd, *pgd_ref);
+ else
+ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+ pgd_page_table(unlock, page);
+ }
+ spin_unlock_irqrestore(&pgd_lock, flags);
+ }
+}
+
+/*
+ * 64-bit:
+ *
+ * Handle a fault on the vmalloc area
+ *
+ * This assumes no large pages in there.
+ */
+static noinline int vmalloc_fault(unsigned long address)
+{
+ pgd_t *pgd, *pgd_ref;
+ pud_t *pud, *pud_ref;
+ pmd_t *pmd, *pmd_ref;
+ pte_t *pte, *pte_ref;
+
+ /* Make sure we are in vmalloc area: */
+ if (!(address >= VMALLOC_START && address < VMALLOC_END))
+ return -1;
+
+ /*
+ * Copy kernel mappings over when needed. This can also
+ * happen within a race in page table update. In the later
+ * case just flush:
+ */
+ pgd = pgd_offset(current->active_mm, address);
+ pgd_ref = pgd_offset_k(address);
+ if (pgd_none(*pgd_ref))
+ return -1;
+
+ if (pgd_none(*pgd))
+ set_pgd(pgd, *pgd_ref);
+ else
+ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+
+ /*
+ * Below here mismatches are bugs because these lower tables
+ * are shared:
+ */
+
+ pud = pud_offset(pgd, address);
+ pud_ref = pud_offset(pgd_ref, address);
+ if (pud_none(*pud_ref))
+ return -1;
+
+ if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
+ BUG();
+
+ pmd = pmd_offset(pud, address);
+ pmd_ref = pmd_offset(pud_ref, address);
+ if (pmd_none(*pmd_ref))
+ return -1;
+
+ if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
+ BUG();
+
+ pte_ref = pte_offset_kernel(pmd_ref, address);
+ if (!pte_present(*pte_ref))
+ return -1;
+
+ pte = pte_offset_kernel(pmd, address);
+
+ /*
+ * Don't use pte_page here, because the mappings can point
+ * outside mem_map, and the NUMA hash lookup cannot handle
+ * that:
+ */
+ if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
+ BUG();
+
+ return 0;
+}
+
+static const char errata93_warning[] =
+KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
+KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
+KERN_ERR "******* Please consider a BIOS update.\n"
+KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
+
+/*
+ * No vm86 mode in 64-bit mode:
+ */
+static inline void
+check_v8086_mode(struct pt_regs *regs, unsigned long address,
+ struct task_struct *tsk)
+{
+}
+
+static int bad_address(void *p)
+{
+ unsigned long dummy;
+
+ return probe_kernel_address((unsigned long *)p, dummy);
+}
+
+static void dump_pagetable(unsigned long address)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = (pgd_t *)read_cr3();
+
+ pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
+
+ pgd += pgd_index(address);
+ if (bad_address(pgd))
+ goto bad;
+
+ printk("PGD %lx ", pgd_val(*pgd));
+
+ if (!pgd_present(*pgd))
+ goto out;
+
+ pud = pud_offset(pgd, address);
+ if (bad_address(pud))
+ goto bad;
+
+ printk(KERN_CONT "PUD %lx ", pud_val(*pud));
+ if (!pud_present(*pud) || pud_large(*pud))
+ goto out;
+
+ pmd = pmd_offset(pud, address);
+ if (bad_address(pmd))
+ goto bad;
+
+ printk(KERN_CONT "PMD %lx ", pmd_val(*pmd));
+ if (!pmd_present(*pmd) || pmd_large(*pmd))
+ goto out;
+
+ pte = pte_offset_kernel(pmd, address);
+ if (bad_address(pte))
+ goto bad;
+
+ printk(KERN_CONT "PTE %lx", pte_val(*pte));
+out:
+ printk(KERN_CONT "\n");
+ return;
+bad:
+ printk("BAD\n");
+}
+
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Workaround for K8 erratum #93 & buggy BIOS.
+ *
+ * BIOS SMM functions are required to use a specific workaround
+ * to avoid corruption of the 64bit RIP register on C stepping K8.
+ *
+ * A lot of BIOS that didn't get tested properly miss this.
+ *
+ * The OS sees this as a page fault with the upper 32bits of RIP cleared.
+ * Try to work around it here.
+ *
+ * Note we only handle faults in kernel here.
+ * Does nothing on 32-bit.
+ */
+static int is_errata93(struct pt_regs *regs, unsigned long address)
+{
+#ifdef CONFIG_X86_64
+ static int once;
+
+ if (address != regs->ip)
+ return 0;
+
+ if ((address >> 32) != 0)
+ return 0;
+
+ address |= 0xffffffffUL << 32;
+ if ((address >= (u64)_stext && address <= (u64)_etext) ||
+ (address >= MODULES_VADDR && address <= MODULES_END)) {
+ if (!once) {
+ printk(errata93_warning);
+ once = 1;
+ }
+ regs->ip = address;
+ return 1;
+ }
+#endif
+ return 0;
+}
+
+/*
+ * Work around K8 erratum #100 K8 in compat mode occasionally jumps
+ * to illegal addresses >4GB.
+ *
+ * We catch this in the page fault handler because these addresses
+ * are not reachable. Just detect this case and return. Any code
+ * segment in LDT is compatibility mode.
+ */
+static int is_errata100(struct pt_regs *regs, unsigned long address)
+{
+#ifdef CONFIG_X86_64
+ if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
+ return 1;
+#endif
+ return 0;
+}
+
+static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
+{
+#ifdef CONFIG_X86_F00F_BUG
+ unsigned long nr;
+
+ /*
+ * Pentium F0 0F C7 C8 bug workaround:
+ */
+ if (boot_cpu_data.f00f_bug) {
+ nr = (address - idt_descr.address) >> 3;
+
+ if (nr == 6) {
+ do_invalid_op(regs, 0);
+ return 1;
+ }
+ }
+#endif
+ return 0;
+}
+
+static const char nx_warning[] = KERN_CRIT
+"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";
+
+static void
+show_fault_oops(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
+{
+ if (!oops_may_print())
+ return;
+
+ if (error_code & PF_INSTR) {
+ unsigned int level;
+
pte_t *pte = lookup_address(address, &level);
if (pte && pte_present(*pte) && !pte_exec(*pte))
- printk(KERN_CRIT "kernel tried to execute "
- "NX-protected page - exploit attempt? "
- "(uid: %d)\n", current_uid());
+ printk(nx_warning, current_uid());
}
-#endif
printk(KERN_ALERT "BUG: unable to handle kernel ");
if (address < PAGE_SIZE)
printk(KERN_CONT "NULL pointer dereference");
else
printk(KERN_CONT "paging request");
+
printk(KERN_CONT " at %p\n", (void *) address);
printk(KERN_ALERT "IP:");
printk_address(regs->ip, 1);
+
dump_pagetable(address);
}
-#ifdef CONFIG_X86_64
-static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
- unsigned long error_code)
+static noinline void
+pgtable_bad(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
{
- unsigned long flags = oops_begin();
- int sig = SIGKILL;
struct task_struct *tsk;
+ unsigned long flags;
+ int sig;
+
+ flags = oops_begin();
+ tsk = current;
+ sig = SIGKILL;
printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
- current->comm, address);
+ tsk->comm, address);
dump_pagetable(address);
- tsk = current;
- tsk->thread.cr2 = address;
- tsk->thread.trap_no = 14;
- tsk->thread.error_code = error_code;
+
+ tsk->thread.cr2 = address;
+ tsk->thread.trap_no = 14;
+ tsk->thread.error_code = error_code;
+
if (__die("Bad pagetable", regs, error_code))
sig = 0;
+
oops_end(flags, regs, sig);
}
-#endif
+
+static noinline void
+no_context(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
+{
+ struct task_struct *tsk = current;
+ unsigned long *stackend;
+ unsigned long flags;
+ int sig;
+
+ /* Are we prepared to handle this kernel fault? */
+ if (fixup_exception(regs))
+ return;
+
+ /*
+ * 32-bit:
+ *
+ * Valid to do another page fault here, because if this fault
+ * had been triggered by is_prefetch fixup_exception would have
+ * handled it.
+ *
+ * 64-bit:
+ *
+ * Hall of shame of CPU/BIOS bugs.
+ */
+ if (is_prefetch(regs, error_code, address))
+ return;
+
+ if (is_errata93(regs, address))
+ return;
+
+ /*
+ * Oops. The kernel tried to access some bad page. We'll have to
+ * terminate things with extreme prejudice:
+ */
+ flags = oops_begin();
+
+ show_fault_oops(regs, error_code, address);
+
+ stackend = end_of_stack(tsk);
+ if (*stackend != STACK_END_MAGIC)
+ printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
+
+ tsk->thread.cr2 = address;
+ tsk->thread.trap_no = 14;
+ tsk->thread.error_code = error_code;
+
+ sig = SIGKILL;
+ if (__die("Oops", regs, error_code))
+ sig = 0;
+
+ /* Executive summary in case the body of the oops scrolled away */
+ printk(KERN_EMERG "CR2: %016lx\n", address);
+
+ oops_end(flags, regs, sig);
+}
+
+/*
+ * Print out info about fatal segfaults, if the show_unhandled_signals
+ * sysctl is set:
+ */
+static inline void
+show_signal_msg(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, struct task_struct *tsk)
+{
+ if (!unhandled_signal(tsk, SIGSEGV))
+ return;
+
+ if (!printk_ratelimit())
+ return;
+
+ printk(KERN_CONT "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
+ task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
+ tsk->comm, task_pid_nr(tsk), address,
+ (void *)regs->ip, (void *)regs->sp, error_code);
+
+ print_vma_addr(KERN_CONT " in ", regs->ip);
+
+ printk(KERN_CONT "\n");
+}
+
+static void
+__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, int si_code)
+{
+ struct task_struct *tsk = current;
+
+ /* User mode accesses just cause a SIGSEGV */
+ if (error_code & PF_USER) {
+ /*
+ * It's possible to have interrupts off here:
+ */
+ local_irq_enable();
+
+ /*
+ * Valid to do another page fault here because this one came
+ * from user space:
+ */
+ if (is_prefetch(regs, error_code, address))
+ return;
+
+ if (is_errata100(regs, address))
+ return;
+
+ if (unlikely(show_unhandled_signals))
+ show_signal_msg(regs, error_code, address, tsk);
+
+ /* Kernel addresses are always protection faults: */
+ tsk->thread.cr2 = address;
+ tsk->thread.error_code = error_code | (address >= TASK_SIZE);
+ tsk->thread.trap_no = 14;
+
+ force_sig_info_fault(SIGSEGV, si_code, address, tsk);
+
+ return;
+ }
+
+ if (is_f00f_bug(regs, address))
+ return;
+
+ no_context(regs, error_code, address);
+}
+
+static noinline void
+bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
+{
+ __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
+}
+
+static void
+__bad_area(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, int si_code)
+{
+ struct mm_struct *mm = current->mm;
+
+ /*
+ * Something tried to access memory that isn't in our memory map..
+ * Fix it, but check if it's kernel or user first..
+ */
+ up_read(&mm->mmap_sem);
+
+ __bad_area_nosemaphore(regs, error_code, address, si_code);
+}
+
+static noinline void
+bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+{
+ __bad_area(regs, error_code, address, SEGV_MAPERR);
+}
+
+static noinline void
+bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
+{
+ __bad_area(regs, error_code, address, SEGV_ACCERR);
+}
+
+/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
+static void
+out_of_memory(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
+{
+ /*
+ * We ran out of memory, call the OOM killer, and return the userspace
+ * (which will retry the fault, or kill us if we got oom-killed):
+ */
+ up_read(&current->mm->mmap_sem);
+
+ pagefault_out_of_memory();
+}
+
+static void
+do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+{
+ struct task_struct *tsk = current;
+ struct mm_struct *mm = tsk->mm;
+
+ up_read(&mm->mmap_sem);
+
+ /* Kernel mode? Handle exceptions or die: */
+ if (!(error_code & PF_USER))
+ no_context(regs, error_code, address);
+
+ /* User-space => ok to do another page fault: */
+ if (is_prefetch(regs, error_code, address))
+ return;
+
+ tsk->thread.cr2 = address;
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = 14;
+
+ force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
+}
+
+static noinline void
+mm_fault_error(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, unsigned int fault)
+{
+ if (fault & VM_FAULT_OOM) {
+ out_of_memory(regs, error_code, address);
+ } else {
+ if (fault & VM_FAULT_SIGBUS)
+ do_sigbus(regs, error_code, address);
+ else
+ BUG();
+ }
+}
static int spurious_fault_check(unsigned long error_code, pte_t *pte)
{
if ((error_code & PF_WRITE) && !pte_write(*pte))
return 0;
+
if ((error_code & PF_INSTR) && !pte_exec(*pte))
return 0;
@@ -453,21 +899,25 @@ static int spurious_fault_check(unsigned
}
/*
- * Handle a spurious fault caused by a stale TLB entry. This allows
- * us to lazily refresh the TLB when increasing the permissions of a
- * kernel page (RO -> RW or NX -> X). Doing it eagerly is very
- * expensive since that implies doing a full cross-processor TLB
- * flush, even if no stale TLB entries exist on other processors.
+ * Handle a spurious fault caused by a stale TLB entry.
+ *
+ * This allows us to lazily refresh the TLB when increasing the
+ * permissions of a kernel page (RO -> RW or NX -> X). Doing it
+ * eagerly is very expensive since that implies doing a full
+ * cross-processor TLB flush, even if no stale TLB entries exist
+ * on other processors.
+ *
* There are no security implications to leaving a stale TLB when
* increasing the permissions on a page.
*/
-static int spurious_fault(unsigned long address,
- unsigned long error_code)
+static noinline int
+spurious_fault(unsigned long error_code, unsigned long address)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
+ int ret;
/* Reserved-bit violation or user access to kernel space? */
if (error_code & (PF_USER | PF_RSVD))
@@ -486,126 +936,71 @@ static int spurious_fault(unsigned long
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
- return 0;
-
- if (pmd_large(*pmd))
- return spurious_fault_check(error_code, (pte_t *) pmd);
-
- pte = pte_offset_kernel(pmd, address);
- if (!pte_present(*pte))
- return 0;
-
- return spurious_fault_check(error_code, pte);
-}
-
-/*
- * X86_32
- * Handle a fault on the vmalloc or module mapping area
- *
- * X86_64
- * Handle a fault on the vmalloc area
- *
- * This assumes no large pages in there.
- */
-static int vmalloc_fault(unsigned long address)
-{
-#ifdef CONFIG_X86_32
- unsigned long pgd_paddr;
- pmd_t *pmd_k;
- pte_t *pte_k;
+ return 0;
- /* Make sure we are in vmalloc area */
- if (!(address >= VMALLOC_START && address < VMALLOC_END))
- return -1;
+ if (pmd_large(*pmd))
+ return spurious_fault_check(error_code, (pte_t *) pmd);
+
+ pte = pte_offset_kernel(pmd, address);
+ if (!pte_present(*pte))
+ return 0;
+
+ ret = spurious_fault_check(error_code, pte);
+ if (!ret)
+ return 0;
/*
- * Synchronize this task's top level page-table
- * with the 'reference' page table.
- *
- * Do _not_ use "current" here. We might be inside
- * an interrupt in the middle of a task switch..
+ * Make sure we have permissions in PMD.
+ * If not, then there's a bug in the page tables:
*/
- pgd_paddr = read_cr3();
- pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
- if (!pmd_k)
- return -1;
- pte_k = pte_offset_kernel(pmd_k, address);
- if (!pte_present(*pte_k))
- return -1;
- return 0;
-#else
- pgd_t *pgd, *pgd_ref;
- pud_t *pud, *pud_ref;
- pmd_t *pmd, *pmd_ref;
- pte_t *pte, *pte_ref;
+ ret = spurious_fault_check(error_code, (pte_t *) pmd);
+ WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
- /* Make sure we are in vmalloc area */
- if (!(address >= VMALLOC_START && address < VMALLOC_END))
- return -1;
+ return ret;
+}
- /* Copy kernel mappings over when needed. This can also
- happen within a race in page table update. In the later
- case just flush. */
+int show_unhandled_signals = 1;
- pgd = pgd_offset(current->active_mm, address);
- pgd_ref = pgd_offset_k(address);
- if (pgd_none(*pgd_ref))
- return -1;
- if (pgd_none(*pgd))
- set_pgd(pgd, *pgd_ref);
- else
- BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+static inline int
+access_error(unsigned long error_code, int write, struct vm_area_struct *vma)
+{
+ if (write) {
+ /* write, present and write, not present: */
+ if (unlikely(!(vma->vm_flags & VM_WRITE)))
+ return 1;
+ return 0;
+ }
- /* Below here mismatches are bugs because these lower tables
- are shared */
+ /* read, present: */
+ if (unlikely(error_code & PF_PROT))
+ return 1;
+
+ /* read, not present: */
+ if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
+ return 1;
- pud = pud_offset(pgd, address);
- pud_ref = pud_offset(pgd_ref, address);
- if (pud_none(*pud_ref))
- return -1;
- if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
- BUG();
- pmd = pmd_offset(pud, address);
- pmd_ref = pmd_offset(pud_ref, address);
- if (pmd_none(*pmd_ref))
- return -1;
- if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
- BUG();
- pte_ref = pte_offset_kernel(pmd_ref, address);
- if (!pte_present(*pte_ref))
- return -1;
- pte = pte_offset_kernel(pmd, address);
- /* Don't use pte_page here, because the mappings can point
- outside mem_map, and the NUMA hash lookup cannot handle
- that. */
- if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
- BUG();
return 0;
-#endif
}
-int show_unhandled_signals = 1;
+static int fault_in_kernel_space(unsigned long address)
+{
+ return address >= TASK_SIZE_MAX;
+}
/*
* This routine handles page faults. It determines the address,
* and the problem, and then passes it off to one of the appropriate
* routines.
*/
-#ifdef CONFIG_X86_64
-asmlinkage
-#endif
-void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
+dotraplinkage void __kprobes
+do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
- struct task_struct *tsk;
- struct mm_struct *mm;
struct vm_area_struct *vma;
+ struct task_struct *tsk;
unsigned long address;
- int write, si_code;
+ struct mm_struct *mm;
+ int write;
int fault;
-#ifdef CONFIG_X86_64
- unsigned long flags;
- int sig;
-#endif
/* Set the "privileged fault" bit to something sane. */
if (user_mode_vm(regs))
@@ -615,13 +1010,12 @@ void __kprobes do_page_fault(struct pt_r
tsk = current;
mm = tsk->mm;
+
prefetchw(&mm->mmap_sem);
- /* get the address */
+ /* Get the faulting address: */
address = read_cr2();
- si_code = SEGV_MAPERR;
-
if (unlikely(kmmio_fault(regs, address)))
return;
@@ -638,338 +1032,158 @@ void __kprobes do_page_fault(struct pt_r
* (error_code & 4) == 0, and that the fault was not a
* protection error (error_code & 9) == 0.
*/
-#ifdef CONFIG_X86_32
- if (unlikely(address >= TASK_SIZE)) {
-#else
- if (unlikely(address >= TASK_SIZE64)) {
-#endif
+ if (unlikely(fault_in_kernel_space(address))) {
/* Faults in hypervisor area can never be patched up. */
#if defined(CONFIG_X86_XEN)
- if (address >= hypervisor_virt_start)
- goto bad_area_nosemaphore;
+ if (address >= hypervisor_virt_start) {
#elif defined(CONFIG_X86_64_XEN)
if (address >= HYPERVISOR_VIRT_START
- && address < HYPERVISOR_VIRT_END)
- goto bad_area_nosemaphore;
+ && address < HYPERVISOR_VIRT_END) {
#endif
+ bad_area_nosemaphore(regs, error_code, address);
+ return;
+ }
+
if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
vmalloc_fault(address) >= 0)
return;
- /* Can handle a stale RO->RW TLB */
- if (spurious_fault(address, error_code))
+ /* Can handle a stale RO->RW TLB: */
+ if (spurious_fault(error_code, address))
return;
- /* kprobes don't want to hook the spurious faults. */
+ /* kprobes don't want to hook the spurious faults: */
if (notify_page_fault(regs))
return;
/*
* Don't take the mm semaphore here. If we fixup a prefetch
- * fault we could otherwise deadlock.
+ * fault we could otherwise deadlock:
*/
- goto bad_area_nosemaphore;
- }
+ bad_area_nosemaphore(regs, error_code, address);
- /* kprobes don't want to hook the spurious faults. */
- if (notify_page_fault(regs))
return;
+ }
+ /* kprobes don't want to hook the spurious faults: */
+ if (unlikely(notify_page_fault(regs)))
+ return;
/*
* It's safe to allow irq's after cr2 has been saved and the
* vmalloc fault has been handled.
*
* User-mode registers count as a user access even for any
- * potential system fault or CPU buglet.
+ * potential system fault or CPU buglet:
*/
if (user_mode_vm(regs)) {
local_irq_enable();
error_code |= PF_USER;
- } else if (regs->flags & X86_EFLAGS_IF)
- local_irq_enable();
+ } else {
+ if (regs->flags & X86_EFLAGS_IF)
+ local_irq_enable();
+ }
-#ifdef CONFIG_X86_64
if (unlikely(error_code & PF_RSVD))
- pgtable_bad(address, regs, error_code);
-#endif
+ pgtable_bad(regs, error_code, address);
/*
- * If we're in an interrupt, have no user context or are running in an
- * atomic region then we must not take the fault.
+ * If we're in an interrupt, have no user context or are running
+ * in an atomic region then we must not take the fault:
*/
- if (unlikely(in_atomic() || !mm))
- goto bad_area_nosemaphore;
+ if (unlikely(in_atomic() || !mm)) {
+ bad_area_nosemaphore(regs, error_code, address);
+ return;
+ }
/*
* When running in the kernel we expect faults to occur only to
- * addresses in user space. All other faults represent errors in the
- * kernel and should generate an OOPS. Unfortunately, in the case of an
- * erroneous fault occurring in a code path which already holds mmap_sem
- * we will deadlock attempting to validate the fault against the
- * address space. Luckily the kernel only validly references user
- * space from well defined areas of code, which are listed in the
- * exceptions table.
+ * addresses in user space. All other faults represent errors in
+ * the kernel and should generate an OOPS. Unfortunately, in the
+ * case of an erroneous fault occurring in a code path which already
+ * holds mmap_sem we will deadlock attempting to validate the fault
+ * against the address space. Luckily the kernel only validly
+ * references user space from well defined areas of code, which are
+ * listed in the exceptions table.
*
* As the vast majority of faults will be valid we will only perform
- * the source reference check when there is a possibility of a deadlock.
- * Attempt to lock the address space, if we cannot we then validate the
- * source. If this is invalid we can skip the address space check,
- * thus avoiding the deadlock.
+ * the source reference check when there is a possibility of a
+ * deadlock. Attempt to lock the address space, if we cannot we then
+ * validate the source. If this is invalid we can skip the address
+ * space check, thus avoiding the deadlock:
*/
- if (!down_read_trylock(&mm->mmap_sem)) {
+ if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
if ((error_code & PF_USER) == 0 &&
- !search_exception_tables(regs->ip))
- goto bad_area_nosemaphore;
+ !search_exception_tables(regs->ip)) {
+ bad_area_nosemaphore(regs, error_code, address);
+ return;
+ }
down_read(&mm->mmap_sem);
+ } else {
+ /*
+ * The above down_read_trylock() might have succeeded in
+ * which case we'll have missed the might_sleep() from
+ * down_read():
+ */
+ might_sleep();
}
vma = find_vma(mm, address);
- if (!vma)
- goto bad_area;
- if (vma->vm_start <= address)
+ if (unlikely(!vma)) {
+ bad_area(regs, error_code, address);
+ return;
+ }
+ if (likely(vma->vm_start <= address))
goto good_area;
- if (!(vma->vm_flags & VM_GROWSDOWN))
- goto bad_area;
+ if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+ bad_area(regs, error_code, address);
+ return;
+ }
if (error_code & PF_USER) {
/*
* Accessing the stack below %sp is always a bug.
* The large cushion allows instructions like enter
- * and pusha to work. ("enter $65535,$31" pushes
+ * and pusha to work. ("enter $65535, $31" pushes
* 32 pointers and then decrements %sp by 65535.)
*/
- if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
- goto bad_area;
+ if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
+ bad_area(regs, error_code, address);
+ return;
+ }
}
- if (expand_stack(vma, address))
- goto bad_area;
-/*
- * Ok, we have a good vm_area for this memory access, so
- * we can handle it..
- */
+ if (unlikely(expand_stack(vma, address))) {
+ bad_area(regs, error_code, address);
+ return;
+ }
+
+ /*
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
good_area:
- si_code = SEGV_ACCERR;
- write = 0;
- switch (error_code & (PF_PROT|PF_WRITE)) {
- default: /* 3: write, present */
- /* fall through */
- case PF_WRITE: /* write, not present */
- if (!(vma->vm_flags & VM_WRITE))
- goto bad_area;
- write++;
- break;
- case PF_PROT: /* read, present */
- goto bad_area;
- case 0: /* read, not present */
- if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
- goto bad_area;
+ write = error_code & PF_WRITE;
+
+ if (unlikely(access_error(error_code, write, vma))) {
+ bad_area_access_error(regs, error_code, address);
+ return;
}
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
- * the fault.
+ * the fault:
*/
fault = handle_mm_fault(mm, vma, address, write);
+
if (unlikely(fault & VM_FAULT_ERROR)) {
- if (fault & VM_FAULT_OOM)
- goto out_of_memory;
- else if (fault & VM_FAULT_SIGBUS)
- goto do_sigbus;
- BUG();
+ mm_fault_error(regs, error_code, address, fault);
+ return;
}
+
if (fault & VM_FAULT_MAJOR)
tsk->maj_flt++;
else
tsk->min_flt++;
-#ifdef CONFIG_X86_32
- /*
- * Did it hit the DOS screen memory VA from vm86 mode?
- */
- if (v8086_mode(regs)) {
- unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
- if (bit < 32)
- tsk->thread.screen_bitmap |= 1 << bit;
- }
-#endif
- up_read(&mm->mmap_sem);
- return;
-
-/*
- * Something tried to access memory that isn't in our memory map..
- * Fix it, but check if it's kernel or user first..
- */
-bad_area:
- up_read(&mm->mmap_sem);
-
-bad_area_nosemaphore:
- /* User mode accesses just cause a SIGSEGV */
- if (error_code & PF_USER) {
- /*
- * It's possible to have interrupts off here.
- */
- local_irq_enable();
-
- /*
- * Valid to do another page fault here because this one came
- * from user space.
- */
- if (is_prefetch(regs, address, error_code))
- return;
-
- if (is_errata100(regs, address))
- return;
-
- if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
- printk_ratelimit()) {
- printk(
- "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
- task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
- tsk->comm, task_pid_nr(tsk), address,
- (void *) regs->ip, (void *) regs->sp, error_code);
- print_vma_addr(" in ", regs->ip);
- printk("\n");
- }
-
- tsk->thread.cr2 = address;
- /* Kernel addresses are always protection faults */
- tsk->thread.error_code = error_code | (address >= TASK_SIZE);
- tsk->thread.trap_no = 14;
- force_sig_info_fault(SIGSEGV, si_code, address, tsk);
- return;
- }
-
- if (is_f00f_bug(regs, address))
- return;
-
-no_context:
- /* Are we prepared to handle this kernel fault? */
- if (fixup_exception(regs))
- return;
-
- /*
- * X86_32
- * Valid to do another page fault here, because if this fault
- * had been triggered by is_prefetch fixup_exception would have
- * handled it.
- *
- * X86_64
- * Hall of shame of CPU/BIOS bugs.
- */
- if (is_prefetch(regs, address, error_code))
- return;
-
- if (is_errata93(regs, address))
- return;
-
-/*
- * Oops. The kernel tried to access some bad page. We'll have to
- * terminate things with extreme prejudice.
- */
-#ifdef CONFIG_X86_32
- bust_spinlocks(1);
-#else
- flags = oops_begin();
-#endif
-
- show_fault_oops(regs, error_code, address);
-
- tsk->thread.cr2 = address;
- tsk->thread.trap_no = 14;
- tsk->thread.error_code = error_code;
-
-#ifdef CONFIG_X86_32
- die("Oops", regs, error_code);
- bust_spinlocks(0);
- do_exit(SIGKILL);
-#else
- sig = SIGKILL;
- if (__die("Oops", regs, error_code))
- sig = 0;
- /* Executive summary in case the body of the oops scrolled away */
- printk(KERN_EMERG "CR2: %016lx\n", address);
- oops_end(flags, regs, sig);
-#endif
-
-out_of_memory:
- /*
- * We ran out of memory, call the OOM killer, and return the userspace
- * (which will retry the fault, or kill us if we got oom-killed).
- */
- up_read(&mm->mmap_sem);
- pagefault_out_of_memory();
- return;
+ check_v8086_mode(regs, address, tsk);
-do_sigbus:
up_read(&mm->mmap_sem);
-
- /* Kernel mode? Handle exceptions or die */
- if (!(error_code & PF_USER))
- goto no_context;
-#ifdef CONFIG_X86_32
- /* User space => ok to do another page fault */
- if (is_prefetch(regs, address, error_code))
- return;
-#endif
- tsk->thread.cr2 = address;
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = 14;
- force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
-}
-
-DEFINE_SPINLOCK(pgd_lock);
-LIST_HEAD(pgd_list);
-
-#define pgd_page_table(what, pg) \
- spin_##what(&((struct mm_struct *)(pg)->private)->page_table_lock)
-
-void vmalloc_sync_all(void)
-{
- unsigned long address;
-
-#ifdef CONFIG_X86_32
- if (SHARED_KERNEL_PMD)
- return;
-
- for (address = VMALLOC_START & PMD_MASK;
- address >= TASK_SIZE && address < FIXADDR_TOP;
- address += PMD_SIZE) {
- unsigned long flags;
- struct page *page;
-
- spin_lock_irqsave(&pgd_lock, flags);
- list_for_each_entry(page, &pgd_list, lru) {
- pmd_t *pmd;
-
- pgd_page_table(lock, page);
- pmd = vmalloc_sync_one(page_address(page), address);
- pgd_page_table(unlock, page);
-
- if (!pmd)
- break;
- }
- spin_unlock_irqrestore(&pgd_lock, flags);
- }
-#else /* CONFIG_X86_64 */
- for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
- address += PGDIR_SIZE) {
- const pgd_t *pgd_ref = pgd_offset_k(address);
- unsigned long flags;
- struct page *page;
-
- if (pgd_none(*pgd_ref))
- continue;
- spin_lock_irqsave(&pgd_lock, flags);
- list_for_each_entry(page, &pgd_list, lru) {
- pgd_t *pgd;
- pgd = (pgd_t *)page_address(page) + pgd_index(address);
- pgd_page_table(lock, page);
- if (pgd_none(*pgd))
- set_pgd(pgd, *pgd_ref);
- else
- BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
- pgd_page_table(unlock, page);
- }
- spin_unlock_irqrestore(&pgd_lock, flags);
- }
-#endif
}
--- head-2011-03-17.orig/arch/x86/mm/highmem_32-xen.c 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/mm/highmem_32-xen.c 2011-02-01 14:44:12.000000000 +0100
@@ -1,5 +1,6 @@
#include <linux/highmem.h>
#include <linux/module.h>
+#include <linux/swap.h> /* for totalram_pages */
void *kmap(struct page *page)
{
@@ -18,49 +19,6 @@ void kunmap(struct page *page)
kunmap_high(page);
}
-static void debug_kmap_atomic_prot(enum km_type type)
-{
-#ifdef CONFIG_DEBUG_HIGHMEM
- static unsigned warn_count = 10;
-
- if (unlikely(warn_count == 0))
- return;
-
- if (unlikely(in_interrupt())) {
- if (in_irq()) {
- if (type != KM_IRQ0 && type != KM_IRQ1 &&
- type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
- type != KM_BOUNCE_READ) {
- WARN_ON(1);
- warn_count--;
- }
- } else if (!irqs_disabled()) { /* softirq */
- if (type != KM_IRQ0 && type != KM_IRQ1 &&
- type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
- type != KM_SKB_SUNRPC_DATA &&
- type != KM_SKB_DATA_SOFTIRQ &&
- type != KM_BOUNCE_READ) {
- WARN_ON(1);
- warn_count--;
- }
- }
- }
-
- if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
- type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) {
- if (!irqs_disabled()) {
- WARN_ON(1);
- warn_count--;
- }
- } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
- if (irq_count() == 0 && !irqs_disabled()) {
- WARN_ON(1);
- warn_count--;
- }
- }
-#endif
-}
-
/*
* kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
* no global lock is needed and because the kmap code must perform a global TLB
@@ -80,7 +38,7 @@ void *kmap_atomic_prot(struct page *page
if (!PageHighMem(page))
return page_address(page);
- debug_kmap_atomic_prot(type);
+ debug_kmap_atomic(type);
idx = type + KM_TYPE_NR*smp_processor_id();
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
@@ -120,22 +78,13 @@ void kunmap_atomic(void *kvaddr, enum km
pagefault_enable();
}
-/* This is the same as kmap_atomic() but can map memory that doesn't
+/*
+ * This is the same as kmap_atomic() but can map memory that doesn't
* have a struct page associated with it.
*/
void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
{
- enum fixed_addresses idx;
- unsigned long vaddr;
-
- pagefault_disable();
-
- idx = type + KM_TYPE_NR*smp_processor_id();
- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
- set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
- /*arch_flush_lazy_mmu_mode();*/
-
- return (void*) vaddr;
+ return kmap_atomic_prot_pfn(pfn, type, kmap_prot);
}
EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */
@@ -206,3 +155,35 @@ EXPORT_SYMBOL(kmap_atomic_to_page);
#endif
EXPORT_SYMBOL(clear_highpage);
EXPORT_SYMBOL(copy_highpage);
+
+void __init set_highmem_pages_init(void)
+{
+ struct zone *zone;
+ int nid;
+
+ for_each_zone(zone) {
+ unsigned long zone_start_pfn, zone_end_pfn;
+
+ if (!is_highmem(zone))
+ continue;
+
+ zone_start_pfn = zone->zone_start_pfn;
+ zone_end_pfn = zone_start_pfn + zone->spanned_pages;
+
+ nid = zone_to_nid(zone);
+ printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
+ zone->name, nid, zone_start_pfn, zone_end_pfn);
+
+ add_highpages_with_active_regions(nid, zone_start_pfn,
+ zone_end_pfn);
+
+ /* XEN: init high-mem pages outside initial allocation. */
+ if (zone_start_pfn < xen_start_info->nr_pages)
+ zone_start_pfn = xen_start_info->nr_pages;
+ for (; zone_start_pfn < zone_end_pfn; zone_start_pfn++) {
+ ClearPageReserved(pfn_to_page(zone_start_pfn));
+ init_page_count(pfn_to_page(zone_start_pfn));
+ }
+ }
+ totalram_pages += totalhigh_pages;
+}
--- head-2011-03-17.orig/arch/x86/mm/hypervisor.c 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/mm/hypervisor.c 2011-02-01 14:44:12.000000000 +0100
@@ -36,6 +36,7 @@
#include <linux/vmalloc.h>
#include <asm/page.h>
#include <asm/pgtable.h>
+#include <asm/setup.h>
#include <asm/hypervisor.h>
#include <xen/balloon.h>
#include <xen/features.h>
@@ -47,6 +48,9 @@
EXPORT_SYMBOL(hypercall_page);
+shared_info_t *__read_mostly HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
+EXPORT_SYMBOL(HYPERVISOR_shared_info);
+
#define NR_MC BITS_PER_LONG
#define NR_MMU BITS_PER_LONG
#define NR_MMUEXT (BITS_PER_LONG / 4)
@@ -536,7 +540,7 @@ int xen_create_contiguous_region(
unsigned int level;
if (vstart < __START_KERNEL_map
- || vstart + (PAGE_SIZE << order) > (unsigned long)_end)
+ || vstart + (PAGE_SIZE << order) > _brk_end)
return -EINVAL;
ptep = lookup_address((unsigned long)__va(__pa(vstart)),
&level);
@@ -951,6 +955,6 @@ int write_ldt_entry(struct desc_struct *
int write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc,
int type)
{
- maddr_t mach_gp = virt_to_machine(gdt + entry);
+ maddr_t mach_gp = arbitrary_virt_to_machine(gdt + entry);
return HYPERVISOR_update_descriptor(mach_gp, *(const u64*)desc);
}
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ head-2011-03-17/arch/x86/mm/init-xen.c 2011-02-01 14:44:12.000000000 +0100
@@ -0,0 +1,459 @@
+#include <linux/ioport.h>
+#include <linux/swap.h>
+#include <linux/bootmem.h>
+
+#include <asm/cacheflush.h>
+#include <asm/e820.h>
+#include <asm/init.h>
+#include <asm/page.h>
+#include <asm/page_types.h>
+#include <asm/sections.h>
+#include <asm/setup.h>
+#include <asm/system.h>
+#include <asm/tlbflush.h>
+
+unsigned long __meminitdata e820_table_start;
+unsigned long __meminitdata e820_table_end;
+unsigned long __meminitdata e820_table_top;
+
+int after_bootmem;
+
+#if !defined(CONFIG_XEN)
+int direct_gbpages
+#ifdef CONFIG_DIRECT_GBPAGES
+ = 1
+#endif
+;
+#elif defined(CONFIG_X86_32)
+#define direct_gbpages 0
+extern unsigned long extend_init_mapping(unsigned long tables_space);
+#else
+extern void xen_finish_init_mapping(void);
+#endif
+
+static void __init find_early_table_space(unsigned long end, int use_pse,
+ int use_gbpages)
+{
+ unsigned long puds, pmds, ptes, tables;
+
+ puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
+ tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
+
+ if (use_gbpages) {
+ unsigned long extra;
+
+ extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
+ pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
+ } else
+ pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
+
+ tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
+
+ if (use_pse) {
+ unsigned long extra;
+
+ extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
+#ifdef CONFIG_X86_32
+ extra += PMD_SIZE;
+#endif
+ ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ } else
+ ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+ tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
+
+#ifdef CONFIG_X86_32
+ /* for fixmap */
+ tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
+#endif
+
+ /*
+ * RED-PEN putting page tables only on node 0 could
+ * cause a hotspot and fill up ZONE_DMA. The page tables
+ * need roughly 0.5KB per GB.
+ */
+#ifdef CONFIG_X86_32
+ e820_table_start = extend_init_mapping(tables);
+ e820_table_end = e820_table_start;
+#else /* CONFIG_X86_64 */
+ if (!e820_table_top) {
+ e820_table_start = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
+ xen_start_info->nr_pt_frames;
+ e820_table_end = e820_table_start;
+ } else {
+ /*
+ * [table_start, table_top) gets passed to reserve_early(),
+ * so we must not use table_end here, despite continuing
+ * to allocate from there. table_end possibly being below
+ * table_start is otoh not a problem.
+ */
+ e820_table_start = e820_table_top;
+ }
+#endif
+ if (e820_table_start == -1UL)
+ panic("Cannot find space for the kernel page tables");
+
+ e820_table_top = e820_table_start + (tables >> PAGE_SHIFT);
+
+ printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
+ end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT);
+}
+
+struct map_range {
+ unsigned long start;
+ unsigned long end;
+ unsigned page_size_mask;
+};
+
+#ifdef CONFIG_X86_32
+#define NR_RANGE_MR 3
+#else /* CONFIG_X86_64 */
+#define NR_RANGE_MR 5
+#endif
+
+static int __meminit save_mr(struct map_range *mr, int nr_range,
+ unsigned long start_pfn, unsigned long end_pfn,
+ unsigned long page_size_mask)
+{
+ if (start_pfn < end_pfn) {
+ if (nr_range >= NR_RANGE_MR)
+ panic("run out of range for init_memory_mapping\n");
+ mr[nr_range].start = start_pfn<<PAGE_SHIFT;
+ mr[nr_range].end = end_pfn<<PAGE_SHIFT;
+ mr[nr_range].page_size_mask = page_size_mask;
+ nr_range++;
+ }
+
+ return nr_range;
+}
+
+#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
+static void __init init_gbpages(void)
+{
+ if (direct_gbpages && cpu_has_gbpages)
+ printk(KERN_INFO "Using GB pages for direct mapping\n");
+ else
+ direct_gbpages = 0;
+}
+#else
+static inline void init_gbpages(void)
+{
+}
+#endif
+
+/*
+ * Setup the direct mapping of the physical memory at PAGE_OFFSET.
+ * This runs before bootmem is initialized and gets pages directly from
+ * the physical memory. To access them they are temporarily mapped.
+ */
+unsigned long __init_refok init_memory_mapping(unsigned long start,
+ unsigned long end)
+{
+ unsigned long page_size_mask = 0;
+ unsigned long start_pfn, end_pfn;
+ unsigned long ret = 0;
+ unsigned long pos;
+
+ struct map_range mr[NR_RANGE_MR];
+ int nr_range, i;
+ int use_pse, use_gbpages;
+
+ printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
+
+ if (!after_bootmem)
+ init_gbpages();
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ /*
+ * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+ * This will simplify cpa(), which otherwise needs to support splitting
+ * large pages into small in interrupt context, etc.
+ */
+ use_pse = use_gbpages = 0;
+#else
+ use_pse = cpu_has_pse;
+ use_gbpages = direct_gbpages;
+#endif
+
+#ifdef CONFIG_X86_32
+#ifdef CONFIG_X86_PAE
+ set_nx();
+ if (nx_enabled)
+ printk(KERN_INFO "NX (Execute Disable) protection: active\n");
+#endif
+
+ /* Enable PSE if available */
+ if (cpu_has_pse)
+ set_in_cr4(X86_CR4_PSE);
+
+ /* Enable PGE if available */
+ if (cpu_has_pge) {
+ set_in_cr4(X86_CR4_PGE);
+ __supported_pte_mask |= _PAGE_GLOBAL;
+ }
+#endif
+
+ if (use_gbpages)
+ page_size_mask |= 1 << PG_LEVEL_1G;
+ if (use_pse)
+ page_size_mask |= 1 << PG_LEVEL_2M;
+
+ memset(mr, 0, sizeof(mr));
+ nr_range = 0;
+
+ /* head if not big page alignment ? */
+ start_pfn = start >> PAGE_SHIFT;
+ pos = start_pfn << PAGE_SHIFT;
+#ifdef CONFIG_X86_32
+ /*
+ * Don't use a large page for the first 2/4MB of memory
+ * because there are often fixed size MTRRs in there
+ * and overlapping MTRRs into large pages can cause
+ * slowdowns.
+ */
+ if (pos == 0)
+ end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
+ else
+ end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+#else /* CONFIG_X86_64 */
+ end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+#endif
+ if (end_pfn > (end >> PAGE_SHIFT))
+ end_pfn = end >> PAGE_SHIFT;
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+ pos = end_pfn << PAGE_SHIFT;
+ }
+
+ /* big page (2M) range */
+ start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+#ifdef CONFIG_X86_32
+ end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+#else /* CONFIG_X86_64 */
+ end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
+ << (PUD_SHIFT - PAGE_SHIFT);
+ if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
+ end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
+#endif
+
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask & (1<<PG_LEVEL_2M));
+ pos = end_pfn << PAGE_SHIFT;
+ }
+
+#ifdef CONFIG_X86_64
+ /* big page (1G) range */
+ start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
+ << (PUD_SHIFT - PAGE_SHIFT);
+ end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask &
+ ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
+ pos = end_pfn << PAGE_SHIFT;
+ }
+
+ /* tail is not big page (1G) alignment */
+ start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+ end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask & (1<<PG_LEVEL_2M));
+ pos = end_pfn << PAGE_SHIFT;
+ }
+#endif
+
+ /* tail is not big page (2M) alignment */
+ start_pfn = pos>>PAGE_SHIFT;
+ end_pfn = end>>PAGE_SHIFT;
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+
+ /* try to merge same page size and continuous */
+ for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
+ unsigned long old_start;
+ if (mr[i].end != mr[i+1].start ||
+ mr[i].page_size_mask != mr[i+1].page_size_mask)
+ continue;
+ /* move it */
+ old_start = mr[i].start;
+ memmove(&mr[i], &mr[i+1],
+ (nr_range - 1 - i) * sizeof(struct map_range));
+ mr[i--].start = old_start;
+ nr_range--;
+ }
+
+ for (i = 0; i < nr_range; i++)
+ printk(KERN_DEBUG " %010lx - %010lx page %s\n",
+ mr[i].start, mr[i].end,
+ (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
+ (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
+
+ /*
+ * Find space for the kernel direct mapping tables.
+ *
+ * Later we should allocate these tables in the local node of the
+ * memory mapped. Unfortunately this is done currently before the
+ * nodes are discovered.
+ */
+ if (!after_bootmem)
+ find_early_table_space(end, use_pse, use_gbpages);
+
+#ifdef CONFIG_X86_32
+ for (i = 0; i < nr_range; i++)
+ kernel_physical_mapping_init(mr[i].start, mr[i].end,
+ mr[i].page_size_mask);
+ ret = end;
+#else /* CONFIG_X86_64 */
+#define addr_to_page(addr) \
+ ((unsigned long *) \
+ ((mfn_to_pfn(((addr) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT) \
+ << PAGE_SHIFT) + __START_KERNEL_map))
+
+ if (!start) {
+ unsigned long addr, va = __START_KERNEL_map;
+ unsigned long *page = (unsigned long *)init_level4_pgt;
+
+ /* Kill mapping of memory below _text. */
+ while (va < (unsigned long)&_text) {
+ if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
+ BUG();
+ va += PAGE_SIZE;
+ }
+
+ /* Blow away any spurious initial mappings. */
+ va = __START_KERNEL_map + (e820_table_start << PAGE_SHIFT);
+
+ addr = page[pgd_index(va)];
+ page = addr_to_page(addr);
+ addr = page[pud_index(va)];
+ page = addr_to_page(addr);
+ while (pmd_index(va) | pte_index(va)) {
+ if (pmd_none(*(pmd_t *)&page[pmd_index(va)]))
+ break;
+ if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
+ BUG();
+ va += PAGE_SIZE;
+ }
+ }
+
+ for (i = 0; i < nr_range; i++)
+ ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
+ mr[i].page_size_mask);
+#undef addr_to_page
+#endif
+
+#ifdef CONFIG_X86_32
+ early_ioremap_page_table_range_init();
+#endif
+
+#ifdef CONFIG_X86_64
+ BUG_ON(e820_table_end > e820_table_top);
+ if (!start)
+ xen_finish_init_mapping();
+ else
+#endif
+ if (e820_table_end < e820_table_top)
+ /* Disable the 'table_end' allocator. */
+ e820_table_top = e820_table_end;
+
+ __flush_tlb_all();
+
+ if (!after_bootmem && e820_table_top > e820_table_start)
+ reserve_early(e820_table_start << PAGE_SHIFT,
+ e820_table_top << PAGE_SHIFT, "PGTABLE");
+
+ if (!after_bootmem)
+ early_memtest(start, end);
+
+ return ret >> PAGE_SHIFT;
+}
+
+
+/*
+ * devmem_is_allowed() checks to see if /dev/mem access to a certain address
+ * is valid. The argument is a physical page number.
+ *
+ *
+ * On x86, access has to be given to the first megabyte of ram because that area
+ * contains bios code and data regions used by X and dosemu and similar apps.
+ * Access has to be given to non-kernel-ram areas as well, these contain the PCI
+ * mmio resources as well as potential bios/acpi data regions.
+ */
+int devmem_is_allowed(unsigned long pagenr)
+{
+ if (pagenr <= 256)
+ return 1;
+ if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
+ return 0;
+ if (mfn_to_local_pfn(pagenr) >= max_pfn)
+ return 1;
+ return 0;
+}
+
+void free_init_pages(char *what, unsigned long begin, unsigned long end)
+{
+ unsigned long addr = begin;
+
+ if (addr >= end)
+ return;
+
+ /*
+ * If debugging page accesses then do not free this memory but
+ * mark them not present - any buggy init-section access will
+ * create a kernel page fault:
+ */
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
+ begin, PAGE_ALIGN(end));
+ set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
+#else
+ /*
+ * We just marked the kernel text read only above, now that
+ * we are going to free part of that, we need to make that
+ * writeable first.
+ */
+ set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
+
+ printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
+
+ for (; addr < end; addr += PAGE_SIZE) {
+ ClearPageReserved(virt_to_page(addr));
+ init_page_count(virt_to_page(addr));
+ memset((void *)(addr & ~(PAGE_SIZE-1)),
+ POISON_FREE_INITMEM, PAGE_SIZE);
+#ifdef CONFIG_X86_64
+ if (addr >= __START_KERNEL_map) {
+ /* make_readonly() reports all kernel addresses. */
+ if (HYPERVISOR_update_va_mapping((unsigned long)__va(__pa(addr)),
+ pfn_pte(__pa(addr) >> PAGE_SHIFT,
+ PAGE_KERNEL),
+ 0))
+ BUG();
+ if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
+ BUG();
+ }
+#endif
+ free_page(addr);
+ totalram_pages++;
+ }
+#endif
+}
+
+void free_initmem(void)
+{
+ free_init_pages("unused kernel memory",
+ (unsigned long)(&__init_begin),
+ (unsigned long)(&__init_end));
+}
+
+#ifdef CONFIG_BLK_DEV_INITRD
+void free_initrd_mem(unsigned long start, unsigned long end)
+{
+ free_init_pages("initrd memory", start, end);
+}
+#endif
--- head-2011-03-17.orig/arch/x86/mm/init_32-xen.c 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/mm/init_32-xen.c 2011-02-01 14:44:12.000000000 +0100
@@ -52,9 +52,7 @@
#include <asm/swiotlb.h>
#include <asm/setup.h>
#include <asm/cacheflush.h>
-#include <asm/smp.h>
-
-unsigned int __VMALLOC_RESERVE = 128 << 20;
+#include <asm/init.h>
unsigned long max_low_pfn_mapped;
unsigned long max_pfn_mapped;
@@ -64,19 +62,14 @@ unsigned long highstart_pfn, highend_pfn
static noinline int do_test_wp_bit(void);
-
-static unsigned long __initdata table_start;
-static unsigned long __initdata table_end;
-static unsigned long __initdata table_top;
-
-static int __initdata after_init_bootmem;
+bool __read_mostly __vmalloc_start_set = false;
static __init void *alloc_low_page(void)
{
- unsigned long pfn = table_end++;
+ unsigned long pfn = e820_table_end++;
void *adr;
- if (pfn >= table_top)
+ if (pfn >= e820_table_top)
panic("alloc_low_page: ran out of memory");
adr = __va(pfn * PAGE_SIZE);
@@ -96,7 +89,7 @@ static pmd_t * __init one_md_table_init(
#ifdef CONFIG_X86_PAE
if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
- if (after_init_bootmem)
+ if (after_bootmem)
pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
else
pmd_table = (pmd_t *)alloc_low_page();
@@ -128,7 +121,7 @@ static pte_t * __init one_page_table_ini
#endif
pte_t *page_table = NULL;
- if (after_init_bootmem) {
+ if (after_bootmem) {
#ifdef CONFIG_DEBUG_PAGEALLOC
page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
#endif
@@ -148,6 +141,23 @@ static pte_t * __init one_page_table_ini
return pte_offset_kernel(pmd, 0);
}
+pmd_t * __init populate_extra_pmd(unsigned long vaddr)
+{
+ int pgd_idx = pgd_index(vaddr);
+ int pmd_idx = pmd_index(vaddr);
+
+ return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx;
+}
+
+pte_t * __init populate_extra_pte(unsigned long vaddr)
+{
+ int pte_idx = pte_index(vaddr);
+ pmd_t *pmd;
+
+ pmd = populate_extra_pmd(vaddr);
+ return one_page_table_init(pmd) + pte_idx;
+}
+
static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
unsigned long vaddr, pte_t *lastpte)
{
@@ -164,12 +174,12 @@ static pte_t *__init page_table_kmap_che
if (pmd_idx_kmap_begin != pmd_idx_kmap_end
&& (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
&& (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
- && ((__pa(pte) >> PAGE_SHIFT) < table_start
- || (__pa(pte) >> PAGE_SHIFT) >= table_end)) {
+ && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start
+ || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) {
pte_t *newpte;
int i;
- BUG_ON(after_init_bootmem);
+ BUG_ON(after_bootmem);
newpte = alloc_low_page();
for (i = 0; i < PTRS_PER_PTE; i++)
set_pte(newpte + i, pte[i]);
@@ -244,11 +254,14 @@ static inline int is_kernel_text(unsigne
* of max_low_pfn pages, by creating page tables starting from address
* PAGE_OFFSET:
*/
-static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
- unsigned long start_pfn,
- unsigned long end_pfn,
- int use_pse)
+unsigned long __init
+kernel_physical_mapping_init(unsigned long start,
+ unsigned long end,
+ unsigned long page_size_mask)
{
+ int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
+ unsigned long start_pfn, end_pfn;
+ pgd_t *pgd_base = swapper_pg_dir;
int pgd_idx, pmd_idx, pte_ofs;
unsigned long pfn;
pgd_t *pgd;
@@ -257,6 +270,9 @@ static void __init kernel_physical_mappi
unsigned pages_2m, pages_4k;
int mapping_iter;
+ start_pfn = start >> PAGE_SHIFT;
+ end_pfn = end >> PAGE_SHIFT;
+
/*
* First iteration will setup identity mapping using large/small pages
* based on use_pse, with other attributes same as set by
@@ -391,26 +407,6 @@ repeat:
mapping_iter = 2;
goto repeat;
}
-}
-
-/*
- * devmem_is_allowed() checks to see if /dev/mem access to a certain address
- * is valid. The argument is a physical page number.
- *
- *
- * On x86, access has to be given to the first megabyte of ram because that area
- * contains bios code and data regions used by X and dosemu and similar apps.
- * Access has to be given to non-kernel-ram areas as well, these contain the PCI
- * mmio resources as well as potential bios/acpi data regions.
- */
-int devmem_is_allowed(unsigned long pagenr)
-{
- if (pagenr <= 256)
- return 1;
- if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
- return 0;
- if (mfn_to_local_pfn(pagenr) >= max_pfn)
- return 1;
return 0;
}
@@ -506,30 +502,10 @@ void __init add_highpages_with_active_re
work_with_active_regions(nid, add_highpages_work_fn, &data);
}
-#ifndef CONFIG_NUMA
-static void __init set_highmem_pages_init(void)
-{
- int pfn;
-
- add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
-
- /* XEN: init high-mem pages outside initial allocation. */
- for (pfn = xen_start_info->nr_pages; pfn < highend_pfn; pfn++) {
- ClearPageReserved(pfn_to_page(pfn));
- init_page_count(pfn_to_page(pfn));
- }
-
- totalram_pages += totalhigh_pages;
-}
-#endif /* !CONFIG_NUMA */
-
#else
static inline void permanent_kmaps_init(pgd_t *pgd_base)
{
}
-static inline void set_highmem_pages_init(void)
-{
-}
#endif /* CONFIG_HIGHMEM */
pgd_t *swapper_pg_dir;
@@ -553,8 +529,9 @@ pgd_t *swapper_pg_dir;
* be partially populated, and so it avoids stomping on any existing
* mappings.
*/
-static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
+void __init early_ioremap_page_table_range_init(void)
{
+ pgd_t *pgd_base = swapper_pg_dir;
unsigned long vaddr, end;
/*
@@ -649,7 +626,7 @@ static int __init noexec_setup(char *str
}
early_param("noexec", noexec_setup);
-static void __init set_nx(void)
+void __init set_nx(void)
{
unsigned int v[4], l, h;
@@ -685,75 +662,97 @@ static int __init parse_highmem(char *ar
}
early_param("highmem", parse_highmem);
+#define MSG_HIGHMEM_TOO_BIG \
+ "highmem size (%luMB) is bigger than pages available (%luMB)!\n"
+
+#define MSG_LOWMEM_TOO_SMALL \
+ "highmem size (%luMB) results in <64MB lowmem, ignoring it!\n"
/*
- * Determine low and high memory ranges:
+ * All of RAM fits into lowmem - but if user wants highmem
+ * artificially via the highmem=x boot parameter then create
+ * it:
*/
-void __init find_low_pfn_range(void)
+void __init lowmem_pfn_init(void)
{
- /* it could update max_pfn */
-
/* max_low_pfn is 0, we already have early_res support */
-
max_low_pfn = max_pfn;
- if (max_low_pfn > MAXMEM_PFN) {
- if (highmem_pages == -1)
- highmem_pages = max_pfn - MAXMEM_PFN;
- if (highmem_pages + MAXMEM_PFN < max_pfn)
- max_pfn = MAXMEM_PFN + highmem_pages;
- if (highmem_pages + MAXMEM_PFN > max_pfn) {
- printk(KERN_WARNING "only %luMB highmem pages "
- "available, ignoring highmem size of %uMB.\n",
- pages_to_mb(max_pfn - MAXMEM_PFN),
+
+ if (highmem_pages == -1)
+ highmem_pages = 0;
+#ifdef CONFIG_HIGHMEM
+ if (highmem_pages >= max_pfn) {
+ printk(KERN_ERR MSG_HIGHMEM_TOO_BIG,
+ pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
+ highmem_pages = 0;
+ }
+ if (highmem_pages) {
+ if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE) {
+ printk(KERN_ERR MSG_LOWMEM_TOO_SMALL,
pages_to_mb(highmem_pages));
highmem_pages = 0;
}
- max_low_pfn = MAXMEM_PFN;
+ max_low_pfn -= highmem_pages;
+ }
+#else
+ if (highmem_pages)
+ printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
+#endif
+}
+
+#define MSG_HIGHMEM_TOO_SMALL \
+ "only %luMB highmem pages available, ignoring highmem size of %luMB!\n"
+
+#define MSG_HIGHMEM_TRIMMED \
+ "Warning: only 4GB will be used. Use a HIGHMEM64G enabled kernel!\n"
+/*
+ * We have more RAM than fits into lowmem - we try to put it into
+ * highmem, also taking the highmem=x boot parameter into account:
+ */
+void __init highmem_pfn_init(void)
+{
+ max_low_pfn = MAXMEM_PFN;
+
+ if (highmem_pages == -1)
+ highmem_pages = max_pfn - MAXMEM_PFN;
+
+ if (highmem_pages + MAXMEM_PFN < max_pfn)
+ max_pfn = MAXMEM_PFN + highmem_pages;
+
+ if (highmem_pages + MAXMEM_PFN > max_pfn) {
+ printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL,
+ pages_to_mb(max_pfn - MAXMEM_PFN),
+ pages_to_mb(highmem_pages));
+ highmem_pages = 0;
+ }
#ifndef CONFIG_HIGHMEM
- /* Maximum memory usable is what is directly addressable */
- printk(KERN_WARNING "Warning only %ldMB will be used.\n",
- MAXMEM>>20);
- if (max_pfn > MAX_NONPAE_PFN)
- printk(KERN_WARNING
- "Use a HIGHMEM64G enabled kernel.\n");
- else
- printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
- max_pfn = MAXMEM_PFN;
+ /* Maximum memory usable is what is directly addressable */
+ printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20);
+ if (max_pfn > MAX_NONPAE_PFN)
+ printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
+ else
+ printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
+ max_pfn = MAXMEM_PFN;
#else /* !CONFIG_HIGHMEM */
#ifndef CONFIG_HIGHMEM64G
- if (max_pfn > MAX_NONPAE_PFN) {
- max_pfn = MAX_NONPAE_PFN;
- printk(KERN_WARNING "Warning only 4GB will be used."
- "Use a HIGHMEM64G enabled kernel.\n");
- }
+ if (max_pfn > MAX_NONPAE_PFN) {
+ max_pfn = MAX_NONPAE_PFN;
+ printk(KERN_WARNING MSG_HIGHMEM_TRIMMED);
+ }
#endif /* !CONFIG_HIGHMEM64G */
#endif /* !CONFIG_HIGHMEM */
- } else {
- if (highmem_pages == -1)
- highmem_pages = 0;
-#ifdef CONFIG_HIGHMEM
- if (highmem_pages >= max_pfn) {
- printk(KERN_ERR "highmem size specified (%uMB) is "
- "bigger than pages available (%luMB)!.\n",
- pages_to_mb(highmem_pages),
- pages_to_mb(max_pfn));
- highmem_pages = 0;
- }
- if (highmem_pages) {
- if (max_low_pfn - highmem_pages <
- 64*1024*1024/PAGE_SIZE){
- printk(KERN_ERR "highmem size %uMB results in "
- "smaller than 64MB lowmem, ignoring it.\n"
- , pages_to_mb(highmem_pages));
- highmem_pages = 0;
- }
- max_low_pfn -= highmem_pages;
- }
-#else
- if (highmem_pages)
- printk(KERN_ERR "ignoring highmem size on non-highmem"
- " kernel!\n");
-#endif
- }
+}
+
+/*
+ * Determine low and high memory ranges:
+ */
+void __init find_low_pfn_range(void)
+{
+ /* it could update max_pfn */
+
+ if (max_pfn <= MAXMEM_PFN)
+ lowmem_pfn_init();
+ else
+ highmem_pfn_init();
}
#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -779,6 +778,8 @@ void __init initmem_init(unsigned long s
#ifdef CONFIG_FLATMEM
max_mapnr = num_physpages;
#endif
+ __vmalloc_start_set = true;
+
printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
pages_to_mb(max_low_pfn));
@@ -800,40 +801,70 @@ static void __init zone_sizes_init(void)
free_area_init_nodes(max_zone_pfns);
}
+static unsigned long __init setup_node_bootmem(int nodeid,
+ unsigned long start_pfn,
+ unsigned long end_pfn,
+ unsigned long bootmap)
+{
+ unsigned long bootmap_size;
+
+ /* don't touch min_low_pfn */
+ bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
+ bootmap >> PAGE_SHIFT,
+ start_pfn, end_pfn);
+ printk(KERN_INFO " node %d low ram: %08lx - %08lx\n",
+ nodeid, start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
+ printk(KERN_INFO " node %d bootmap %08lx - %08lx\n",
+ nodeid, bootmap, bootmap + bootmap_size);
+ free_bootmem_with_active_regions(nodeid, end_pfn);
+ early_res_to_bootmem(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
+
+ return bootmap + bootmap_size;
+}
+
void __init setup_bootmem_allocator(void)
{
- int i;
+ int nodeid;
unsigned long bootmap_size, bootmap;
- unsigned long end_pfn = min(max_low_pfn, xen_start_info->nr_pages);
+ unsigned long end_xen_pfn = min(max_low_pfn, xen_start_info->nr_pages);
/*
* Initialize the boot-time allocator (with low memory only):
*/
- bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
- bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
- min(max_pfn_mapped, xen_start_info->nr_pages)<<PAGE_SHIFT,
+ bootmap_size = bootmem_bootmap_pages(end_xen_pfn)<<PAGE_SHIFT;
+ bootmap = find_e820_area(0, min(max_pfn_mapped,
+ xen_start_info->nr_pages)<<PAGE_SHIFT,
bootmap_size, PAGE_SIZE);
if (bootmap == -1L)
panic("Cannot find bootmem map of size %ld\n", bootmap_size);
reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
- /* don't touch min_low_pfn */
- bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
- min_low_pfn, end_pfn);
printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
max_pfn_mapped<<PAGE_SHIFT);
- printk(KERN_INFO " low ram: %08lx - %08lx\n",
- min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
- printk(KERN_INFO " bootmap %08lx - %08lx\n",
- bootmap, bootmap + bootmap_size);
- for_each_online_node(i)
- free_bootmem_with_active_regions(i, end_pfn);
- early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
+ printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
- after_init_bootmem = 1;
+ for_each_online_node(nodeid) {
+ unsigned long start_pfn, end_pfn;
+
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+ start_pfn = node_start_pfn[nodeid];
+ end_pfn = node_end_pfn[nodeid];
+ if (start_pfn > end_xen_pfn)
+ continue;
+ if (end_pfn > end_xen_pfn)
+ end_pfn = end_xen_pfn;
+#else
+ start_pfn = 0;
+ end_pfn = end_xen_pfn;
+#endif
+ bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn,
+ bootmap);
+ }
+
+ after_bootmem = 1;
}
-static unsigned long __init extend_init_mapping(unsigned long tables_space)
+unsigned long __init extend_init_mapping(unsigned long tables_space)
{
unsigned long start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT)
+ xen_start_info->nr_pt_frames;
@@ -885,133 +916,6 @@ static unsigned long __init extend_init_
return start_pfn;
}
-static void __init find_early_table_space(unsigned long end, int use_pse)
-{
- unsigned long puds, pmds, ptes, tables;
-
- puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
- tables = PAGE_ALIGN(puds * sizeof(pud_t));
-
- pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
- tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
-
- if (use_pse) {
- unsigned long extra;
-
- extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
- extra += PMD_SIZE;
- ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
- } else
- ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
-
- tables += PAGE_ALIGN(ptes * sizeof(pte_t));
-
- /* for fixmap */
- tables += PAGE_ALIGN(__end_of_fixed_addresses * sizeof(pte_t));
-
- table_start = extend_init_mapping(tables);
-
- table_end = table_start;
- table_top = table_start + (tables>>PAGE_SHIFT);
-
- printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
- end, table_start << PAGE_SHIFT,
- (table_start << PAGE_SHIFT) + tables);
-}
-
-unsigned long __init_refok init_memory_mapping(unsigned long start,
- unsigned long end)
-{
- pgd_t *pgd_base = swapper_pg_dir;
- unsigned long start_pfn, end_pfn;
- unsigned long big_page_start;
-#ifdef CONFIG_DEBUG_PAGEALLOC
- /*
- * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
- * This will simplify cpa(), which otherwise needs to support splitting
- * large pages into small in interrupt context, etc.
- */
- int use_pse = 0;
-#else
- int use_pse = cpu_has_pse;
-#endif
-
- /*
- * Find space for the kernel direct mapping tables.
- */
- if (!after_init_bootmem)
- find_early_table_space(end, use_pse);
-
-#ifdef CONFIG_X86_PAE
- set_nx();
- if (nx_enabled)
- printk(KERN_INFO "NX (Execute Disable) protection: active\n");
-#endif
-
- /* Enable PSE if available */
- if (cpu_has_pse)
- set_in_cr4(X86_CR4_PSE);
-
- /* Enable PGE if available */
- if (cpu_has_pge) {
- set_in_cr4(X86_CR4_PGE);
- __supported_pte_mask |= _PAGE_GLOBAL;
- }
-
- /*
- * Don't use a large page for the first 2/4MB of memory
- * because there are often fixed size MTRRs in there
- * and overlapping MTRRs into large pages can cause
- * slowdowns.
- */
- big_page_start = PMD_SIZE;
-
- if (start < big_page_start) {
- start_pfn = start >> PAGE_SHIFT;
- end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
- } else {
- /* head is not big page alignment ? */
- start_pfn = start >> PAGE_SHIFT;
- end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
- }
- if (start_pfn < end_pfn)
- kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
-
- /* big page range */
- start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
- if (start_pfn < (big_page_start >> PAGE_SHIFT))
- start_pfn = big_page_start >> PAGE_SHIFT;
- end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
- if (start_pfn < end_pfn)
- kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
- use_pse);
-
- /* tail is not big page alignment ? */
- start_pfn = end_pfn;
- if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
- end_pfn = end >> PAGE_SHIFT;
- if (start_pfn < end_pfn)
- kernel_physical_mapping_init(pgd_base, start_pfn,
- end_pfn, 0);
- }
-
- early_ioremap_page_table_range_init(pgd_base);
-
- __flush_tlb_all();
-
- if (!after_init_bootmem)
- reserve_early(table_start << PAGE_SHIFT,
- table_end << PAGE_SHIFT, "PGTABLE");
-
- if (!after_init_bootmem)
- early_memtest(start, end);
-
- return end >> PAGE_SHIFT;
-}
-
-
/*
* paging_init() sets up the page tables - note that the first 8MB are
* already mapped by head.S.
@@ -1215,17 +1119,47 @@ static noinline int do_test_wp_bit(void)
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);
+static int kernel_set_to_readonly;
+
+void set_kernel_text_rw(void)
+{
+ unsigned long start = PFN_ALIGN(_text);
+ unsigned long size = PFN_ALIGN(_etext) - start;
+
+ if (!kernel_set_to_readonly)
+ return;
+
+ pr_debug("Set kernel text: %lx - %lx for read write\n",
+ start, start+size);
+
+ set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
+}
+
+void set_kernel_text_ro(void)
+{
+ unsigned long start = PFN_ALIGN(_text);
+ unsigned long size = PFN_ALIGN(_etext) - start;
+
+ if (!kernel_set_to_readonly)
+ return;
+
+ pr_debug("Set kernel text: %lx - %lx for read only\n",
+ start, start+size);
+
+ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+}
+
void mark_rodata_ro(void)
{
unsigned long start = PFN_ALIGN(_text);
unsigned long size = PFN_ALIGN(_etext) - start;
-#ifndef CONFIG_DYNAMIC_FTRACE
- /* Dynamic tracing modifies the kernel text section */
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
printk(KERN_INFO "Write protecting the kernel text: %luk\n",
size >> 10);
+ kernel_set_to_readonly = 1;
+
#ifdef CONFIG_CPA_DEBUG
printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
start, start+size);
@@ -1234,7 +1168,6 @@ void mark_rodata_ro(void)
printk(KERN_INFO "Testing CPA: write protecting again\n");
set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
#endif
-#endif /* CONFIG_DYNAMIC_FTRACE */
start += size;
size = (unsigned long)__end_rodata - start;
@@ -1253,52 +1186,6 @@ void mark_rodata_ro(void)
}
#endif
-void free_init_pages(char *what, unsigned long begin, unsigned long end)
-{
-#ifdef CONFIG_DEBUG_PAGEALLOC
- /*
- * If debugging page accesses then do not free this memory but
- * mark them not present - any buggy init-section access will
- * create a kernel page fault:
- */
- printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
- begin, PAGE_ALIGN(end));
- set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
-#else
- unsigned long addr;
-
- /*
- * We just marked the kernel text read only above, now that
- * we are going to free part of that, we need to make that
- * writeable first.
- */
- set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
-
- for (addr = begin; addr < end; addr += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(addr));
- init_page_count(virt_to_page(addr));
- memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
- free_page(addr);
- totalram_pages++;
- }
- printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
-#endif
-}
-
-void free_initmem(void)
-{
- free_init_pages("unused kernel memory",
- (unsigned long)(&__init_begin),
- (unsigned long)(&__init_end));
-}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_init_pages("initrd memory", start, end);
-}
-#endif
-
int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
int flags)
{
--- head-2011-03-17.orig/arch/x86/mm/init_64-xen.c 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/mm/init_64-xen.c 2011-02-01 14:44:12.000000000 +0100
@@ -51,6 +51,8 @@
#include <asm/kdebug.h>
#include <asm/numa.h>
#include <asm/cacheflush.h>
+#include <asm/init.h>
+#include <asm/setup.h>
#include <xen/features.h>
@@ -67,8 +69,6 @@ unsigned int __kernel_page_user;
EXPORT_SYMBOL(__kernel_page_user);
#endif
-int after_bootmem;
-
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
extern pmd_t level2_fixmap_pgt[PTRS_PER_PMD];
@@ -127,12 +127,6 @@ void __meminit early_make_page_readonly(
}
#ifndef CONFIG_XEN
-int direct_gbpages
-#ifdef CONFIG_DIRECT_GBPAGES
- = 1
-#endif
-;
-
static int __init parse_direct_gbpages_off(char *arg)
{
direct_gbpages = 0;
@@ -154,14 +148,10 @@ early_param("gbpages", parse_direct_gbpa
* around without checking the pgd every time.
*/
-static unsigned long __meminitdata table_start;
-static unsigned long __meminitdata table_cur;
-static unsigned long __meminitdata table_top;
-
pteval_t __supported_pte_mask __read_mostly = ~0UL;
EXPORT_SYMBOL_GPL(__supported_pte_mask);
-static int do_not_nx __cpuinitdata;
+static int disable_nx __cpuinitdata;
/*
* noexec=on|off
@@ -176,9 +166,9 @@ static int __init nonx_setup(char *str)
return -EINVAL;
if (!strncmp(str, "on", 2)) {
__supported_pte_mask |= _PAGE_NX;
- do_not_nx = 0;
+ disable_nx = 0;
} else if (!strncmp(str, "off", 3)) {
- do_not_nx = 1;
+ disable_nx = 1;
__supported_pte_mask &= ~_PAGE_NX;
}
return 0;
@@ -190,7 +180,7 @@ void __cpuinit check_efer(void)
unsigned long efer;
rdmsrl(MSR_EFER, efer);
- if (!(efer & EFER_NX) || do_not_nx)
+ if (!(efer & EFER_NX) || disable_nx)
__supported_pte_mask &= ~_PAGE_NX;
}
@@ -224,9 +214,9 @@ static __ref void *spp_getpage(void)
if (after_bootmem)
ptr = (void *) get_zeroed_page(GFP_ATOMIC);
- else if (table_cur < table_top) {
- ptr = __va(table_cur << PAGE_SHIFT);
- table_cur++;
+ else if (e820_table_end < e820_table_top) {
+ ptr = __va(e820_table_end << PAGE_SHIFT);
+ e820_table_end++;
memset(ptr, 0, PAGE_SIZE);
} else
ptr = alloc_bootmem_pages(PAGE_SIZE);
@@ -241,36 +231,54 @@ static __ref void *spp_getpage(void)
return ptr;
}
-void
-set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
+static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
{
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
+ if (pgd_none(*pgd)) {
+ pud_t *pud = (pud_t *)spp_getpage();
+ make_page_readonly(pud, XENFEAT_writable_page_tables);
+ pgd_populate(&init_mm, pgd, pud);
+ if (pud != pud_offset(pgd, 0))
+ printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
+ pud, pud_offset(pgd, 0));
+ }
+ return pud_offset(pgd, vaddr);
+}
- pud = pud_page + pud_index(vaddr);
+static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
+{
if (pud_none(*pud)) {
- pmd = (pmd_t *) spp_getpage();
+ pmd_t *pmd = (pmd_t *) spp_getpage();
make_page_readonly(pmd, XENFEAT_writable_page_tables);
pud_populate(&init_mm, pud, pmd);
- if (pmd != pmd_offset(pud, 0)) {
+ if (pmd != pmd_offset(pud, 0))
printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
- pmd, pmd_offset(pud, 0));
- return;
- }
+ pmd, pmd_offset(pud, 0));
}
- pmd = pmd_offset(pud, vaddr);
+ return pmd_offset(pud, vaddr);
+}
+
+static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
+{
if (pmd_none(*pmd)) {
- pte = (pte_t *) spp_getpage();
+ pte_t *pte = (pte_t *) spp_getpage();
make_page_readonly(pte, XENFEAT_writable_page_tables);
pmd_populate_kernel(&init_mm, pmd, pte);
- if (pte != pte_offset_kernel(pmd, 0)) {
+ if (pte != pte_offset_kernel(pmd, 0))
printk(KERN_ERR "PAGETABLE BUG #02!\n");
- return;
- }
}
+ return pte_offset_kernel(pmd, vaddr);
+}
+
+void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
+{
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pud = pud_page + pud_index(vaddr);
+ pmd = fill_pmd(pud, vaddr);
+ pte = fill_pte(pmd, vaddr);
- pte = pte_offset_kernel(pmd, vaddr);
set_pte(pte, new_pte);
/*
@@ -280,8 +288,7 @@ set_pte_vaddr_pud(pud_t *pud_page, unsig
__flush_tlb_one(vaddr);
}
-void
-set_pte_vaddr(unsigned long vaddr, pte_t pteval)
+void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
pgd_t *pgd;
pud_t *pud_page;
@@ -298,6 +305,24 @@ set_pte_vaddr(unsigned long vaddr, pte_t
set_pte_vaddr_pud(pud_page, vaddr, pteval);
}
+pmd_t * __init populate_extra_pmd(unsigned long vaddr)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+
+ pgd = pgd_offset_k(vaddr);
+ pud = fill_pud(pgd, vaddr);
+ return fill_pmd(pud, vaddr);
+}
+
+pte_t * __init populate_extra_pte(unsigned long vaddr)
+{
+ pmd_t *pmd;
+
+ pmd = populate_extra_pmd(vaddr);
+ return fill_pte(pmd, vaddr);
+}
+
#ifndef CONFIG_XEN
/*
* Create large page table mappings for a range of physical addresses.
@@ -380,9 +405,9 @@ static __ref void *alloc_low_page(unsign
return adr;
}
- BUG_ON(!table_cur);
- pfn = table_cur++;
- if (pfn >= table_top)
+ BUG_ON(!e820_table_end);
+ pfn = e820_table_end++;
+ if (pfn >= e820_table_top)
panic("alloc_low_page: ran out of memory");
adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
@@ -407,13 +432,13 @@ static inline int __meminit make_readonl
/* Make new page tables read-only on the first pass. */
if (!xen_feature(XENFEAT_writable_page_tables)
&& !max_pfn_mapped
- && (paddr >= (table_start << PAGE_SHIFT))
- && (paddr < (table_top << PAGE_SHIFT)))
+ && (paddr >= (e820_table_start << PAGE_SHIFT))
+ && (paddr < (e820_table_top << PAGE_SHIFT)))
readonly = 1;
/* Make old page tables read-only. */
if (!xen_feature(XENFEAT_writable_page_tables)
&& (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
- && (paddr < (table_cur << PAGE_SHIFT)))
+ && (paddr < (e820_table_end << PAGE_SHIFT)))
readonly = 1;
/*
@@ -422,7 +447,7 @@ static inline int __meminit make_readonl
* mappings. Exclude the vsyscall area here, allowing alternative
* instruction patching to work.
*/
- if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa_symbol(&_end))
+ if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa(_brk_end))
&& !(paddr >= __pa_symbol(&__vsyscall_0)
&& paddr < __pa_symbol(&__vsyscall_0) + PAGE_SIZE))
readonly = 1;
@@ -747,43 +772,9 @@ void __init xen_init_pt(void)
}
}
-static void __init find_early_table_space(unsigned long end, int use_pse,
- int use_gbpages)
-{
- unsigned long puds, pmds, ptes, tables;
-
- puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
- tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
- pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
- tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
-
- ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
- tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE);
-
- if (!table_top) {
- table_start = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
- xen_start_info->nr_pt_frames;
- table_cur = table_start;
- } else {
- /*
- * [table_start, table_top) gets passed to reserve_early(),
- * so we must not use table_cur here, despite continuing
- * to allocate from there. table_cur possibly being below
- * table_start is otoh not a problem.
- */
- table_start = table_top;
- }
- __flush_tlb_all();
-
- table_top = table_cur + (tables >> PAGE_SHIFT);
-
- printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
- end, table_cur << PAGE_SHIFT, table_top << PAGE_SHIFT);
-}
-
-static void __init xen_finish_init_mapping(void)
+void __init xen_finish_init_mapping(void)
{
- unsigned long i, start, end;
+ unsigned long start, end;
/* Re-vector virtual addresses pointing into the initial
mapping to the just-established permanent ones. */
@@ -801,49 +792,22 @@ static void __init xen_finish_init_mappi
__va(__pa(xen_start_info->mod_start));
/* Destroy the Xen-created mappings beyond the kernel image. */
- start = PAGE_ALIGN((unsigned long)_end);
- end = __START_KERNEL_map + (table_start << PAGE_SHIFT);
+ start = PAGE_ALIGN(_brk_end);
+ end = __START_KERNEL_map + (e820_table_start << PAGE_SHIFT);
for (; start < end; start += PAGE_SIZE)
if (HYPERVISOR_update_va_mapping(start, __pte_ma(0), 0))
BUG();
- /* Allocate pte's for initial fixmaps from 'table_cur' allocator. */
- start = table_top;
- WARN(table_cur != start, "start=%lx cur=%lx top=%lx\n",
- table_start, table_cur, start);
- table_top = ~0UL;
-
- /* Switch to the real shared_info page, and clear the dummy page. */
- set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
- HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
- memset(empty_zero_page, 0, sizeof(empty_zero_page));
-
- /* Set up mapping of lowest 1MB of physical memory. */
- for (i = 0; i < NR_FIX_ISAMAPS; i++)
- if (is_initial_xendomain())
- set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
- else
- __set_fixmap(FIX_ISAMAP_BEGIN - i,
- virt_to_mfn(empty_zero_page)
- << PAGE_SHIFT,
- PAGE_KERNEL_RO);
-
- table_top = max(table_cur, start);
+ WARN(e820_table_end != e820_table_top, "start=%lx cur=%lx top=%lx\n",
+ e820_table_start, e820_table_end, e820_table_top);
+ if (e820_table_end > e820_table_top)
+ e820_table_top = e820_table_end;
}
-static void __init init_gbpages(void)
-{
-#ifndef CONFIG_XEN
- if (direct_gbpages && cpu_has_gbpages)
- printk(KERN_INFO "Using GB pages for direct mapping\n");
- else
- direct_gbpages = 0;
-#endif
-}
-
-static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
- unsigned long end,
- unsigned long page_size_mask)
+unsigned long __init
+kernel_physical_mapping_init(unsigned long start,
+ unsigned long end,
+ unsigned long page_size_mask)
{
unsigned long next, last_map_addr = end;
@@ -887,207 +851,6 @@ static unsigned long __meminit kernel_ph
return last_map_addr;
}
-struct map_range {
- unsigned long start;
- unsigned long end;
- unsigned page_size_mask;
-};
-
-#define NR_RANGE_MR 5
-
-static int save_mr(struct map_range *mr, int nr_range,
- unsigned long start_pfn, unsigned long end_pfn,
- unsigned long page_size_mask)
-{
-
- if (start_pfn < end_pfn) {
- if (nr_range >= NR_RANGE_MR)
- panic("run out of range for init_memory_mapping\n");
- mr[nr_range].start = start_pfn<<PAGE_SHIFT;
- mr[nr_range].end = end_pfn<<PAGE_SHIFT;
- mr[nr_range].page_size_mask = page_size_mask;
- nr_range++;
- }
-
- return nr_range;
-}
-
-/*
- * Setup the direct mapping of the physical memory at PAGE_OFFSET.
- * This runs before bootmem is initialized and gets pages directly from
- * the physical memory. To access them they are temporarily mapped.
- */
-unsigned long __init_refok init_memory_mapping(unsigned long start,
- unsigned long end)
-{
- unsigned long last_map_addr = 0;
- unsigned long page_size_mask = 0;
- unsigned long start_pfn, end_pfn;
- unsigned long pos;
-
- struct map_range mr[NR_RANGE_MR];
- int nr_range, i;
- int use_pse, use_gbpages;
-
- printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
-
- /*
- * Find space for the kernel direct mapping tables.
- *
- * Later we should allocate these tables in the local node of the
- * memory mapped. Unfortunately this is done currently before the
- * nodes are discovered.
- */
- if (!after_bootmem)
- init_gbpages();
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
- /*
- * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
- * This will simplify cpa(), which otherwise needs to support splitting
- * large pages into small in interrupt context, etc.
- */
- use_pse = use_gbpages = 0;
-#else
- use_pse = cpu_has_pse;
- use_gbpages = direct_gbpages;
-#endif
-
- if (use_gbpages)
- page_size_mask |= 1 << PG_LEVEL_1G;
- if (use_pse)
- page_size_mask |= 1 << PG_LEVEL_2M;
-
- memset(mr, 0, sizeof(mr));
- nr_range = 0;
-
- /* head if not big page alignment ?*/
- start_pfn = start >> PAGE_SHIFT;
- pos = start_pfn << PAGE_SHIFT;
- end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
- if (end_pfn > (end >> PAGE_SHIFT))
- end_pfn = end >> PAGE_SHIFT;
- if (start_pfn < end_pfn) {
- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
- pos = end_pfn << PAGE_SHIFT;
- }
-
- /* big page (2M) range*/
- start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
- end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
- << (PUD_SHIFT - PAGE_SHIFT);
- if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
- end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
- if (start_pfn < end_pfn) {
- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
- page_size_mask & (1<<PG_LEVEL_2M));
- pos = end_pfn << PAGE_SHIFT;
- }
-
- /* big page (1G) range */
- start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
- << (PUD_SHIFT - PAGE_SHIFT);
- end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
- if (start_pfn < end_pfn) {
- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
- page_size_mask &
- ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
- pos = end_pfn << PAGE_SHIFT;
- }
-
- /* tail is not big page (1G) alignment */
- start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
- end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
- if (start_pfn < end_pfn) {
- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
- page_size_mask & (1<<PG_LEVEL_2M));
- pos = end_pfn << PAGE_SHIFT;
- }
-
- /* tail is not big page (2M) alignment */
- start_pfn = pos>>PAGE_SHIFT;
- end_pfn = end>>PAGE_SHIFT;
- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
-
- /* try to merge same page size and continuous */
- for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
- unsigned long old_start;
- if (mr[i].end != mr[i+1].start ||
- mr[i].page_size_mask != mr[i+1].page_size_mask)
- continue;
- /* move it */
- old_start = mr[i].start;
- memmove(&mr[i], &mr[i+1],
- (nr_range - 1 - i) * sizeof (struct map_range));
- mr[i--].start = old_start;
- nr_range--;
- }
-
- for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG " %010lx - %010lx page %s\n",
- mr[i].start, mr[i].end,
- (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
- (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
-
- if (!after_bootmem)
- find_early_table_space(end, use_pse, use_gbpages);
-
- if (!start) {
- unsigned long addr, va = __START_KERNEL_map;
- unsigned long *page = (unsigned long *)init_level4_pgt;
-
- /* Kill mapping of memory below _text. */
- while (va < (unsigned long)&_text) {
- if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
- BUG();
- va += PAGE_SIZE;
- }
-
- /* Blow away any spurious initial mappings. */
- va = __START_KERNEL_map + (table_start << PAGE_SHIFT);
- addr = page[pgd_index(va)];
- addr_to_page(addr, page);
- addr = page[pud_index(va)];
- addr_to_page(addr, page);
- while (pmd_index(va) | pte_index(va)) {
- if (pmd_none(*(pmd_t *)&page[pmd_index(va)]))
- break;
- if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
- BUG();
- va += PAGE_SIZE;
- }
- }
-
- for (i = 0; i < nr_range; i++)
- last_map_addr = kernel_physical_mapping_init(
- mr[i].start, mr[i].end,
- mr[i].page_size_mask);
-
- BUG_ON(table_cur > table_top);
- if (!start)
- xen_finish_init_mapping();
- else if (table_cur < table_top)
- /* Disable the 'table_cur' allocator. */
- table_top = table_cur;
-
- __flush_tlb_all();
-
- if (!after_bootmem && table_top > table_start)
- reserve_early(table_start << PAGE_SHIFT,
- table_top << PAGE_SHIFT, "PGTABLE");
-
- printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
- last_map_addr, end);
-
- if (!after_bootmem)
- early_memtest(start, end);
-
- return last_map_addr >> PAGE_SHIFT;
-}
-
#ifndef CONFIG_NUMA
void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
@@ -1165,28 +928,6 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to
#endif /* CONFIG_MEMORY_HOTPLUG */
-/*
- * devmem_is_allowed() checks to see if /dev/mem access to a certain address
- * is valid. The argument is a physical page number.
- *
- *
- * On x86, access has to be given to the first megabyte of ram because that area
- * contains bios code and data regions used by X and dosemu and similar apps.
- * Access has to be given to non-kernel-ram areas as well, these contain the PCI
- * mmio resources as well as potential bios/acpi data regions.
- */
-int devmem_is_allowed(unsigned long pagenr)
-{
- if (pagenr <= 256)
- return 1;
- if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
- return 0;
- if (mfn_to_local_pfn(pagenr) >= max_pfn)
- return 1;
- return 0;
-}
-
-
static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
kcore_modules, kcore_vsyscall;
@@ -1243,56 +984,39 @@ void __init mem_init(void)
initsize >> 10);
}
-void free_init_pages(char *what, unsigned long begin, unsigned long end)
+#ifdef CONFIG_DEBUG_RODATA
+const int rodata_test_data = 0xC3;
+EXPORT_SYMBOL_GPL(rodata_test_data);
+
+static int kernel_set_to_readonly;
+
+void set_kernel_text_rw(void)
{
- unsigned long addr = begin;
+ unsigned long start = PFN_ALIGN(_stext);
+ unsigned long end = PFN_ALIGN(__start_rodata);
- if (addr >= end)
+ if (!kernel_set_to_readonly)
return;
- /*
- * If debugging page accesses then do not free this memory but
- * mark them not present - any buggy init-section access will
- * create a kernel page fault:
- */
-#ifdef CONFIG_DEBUG_PAGEALLOC
- printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
- begin, PAGE_ALIGN(end));
- set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
-#else
- printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
+ pr_debug("Set kernel text: %lx - %lx for read write\n",
+ start, end);
- for (; addr < end; addr += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(addr));
- init_page_count(virt_to_page(addr));
- memset((void *)(addr & ~(PAGE_SIZE-1)),
- POISON_FREE_INITMEM, PAGE_SIZE);
- if (addr >= __START_KERNEL_map) {
- /* make_readonly() reports all kernel addresses. */
- if (HYPERVISOR_update_va_mapping((unsigned long)__va(__pa(addr)),
- pfn_pte(__pa(addr) >> PAGE_SHIFT,
- PAGE_KERNEL),
- 0))
- BUG();
- if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
- BUG();
- }
- free_page(addr);
- totalram_pages++;
- }
-#endif
+ set_memory_rw(start, (end - start) >> PAGE_SHIFT);
}
-void free_initmem(void)
+void set_kernel_text_ro(void)
{
- free_init_pages("unused kernel memory",
- (unsigned long)(&__init_begin),
- (unsigned long)(&__init_end));
-}
+ unsigned long start = PFN_ALIGN(_stext);
+ unsigned long end = PFN_ALIGN(__start_rodata);
-#ifdef CONFIG_DEBUG_RODATA
-const int rodata_test_data = 0xC3;
-EXPORT_SYMBOL_GPL(rodata_test_data);
+ if (!kernel_set_to_readonly)
+ return;
+
+ pr_debug("Set kernel text: %lx - %lx for read only\n",
+ start, end);
+
+ set_memory_ro(start, (end - start) >> PAGE_SHIFT);
+}
void mark_rodata_ro(void)
{
@@ -1300,15 +1024,12 @@ void mark_rodata_ro(void)
unsigned long rodata_start =
((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
-#ifdef CONFIG_DYNAMIC_FTRACE
- /* Dynamic tracing modifies the kernel text section */
- start = rodata_start;
-#endif
-
printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
(end - start) >> 10);
set_memory_ro(start, (end - start) >> PAGE_SHIFT);
+ kernel_set_to_readonly = 1;
+
/*
* The rodata section (but not the kernel text!) should also be
* not-executable.
@@ -1328,13 +1049,6 @@ void mark_rodata_ro(void)
#endif
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_init_pages("initrd memory", start, end);
-}
-#endif
-
int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
int flags)
{
--- head-2011-03-17.orig/arch/x86/mm/iomap_32-xen.c 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/mm/iomap_32-xen.c 2011-02-01 14:44:12.000000000 +0100
@@ -20,10 +20,11 @@
#include <asm/pat.h>
#include <linux/bitops.h>
#include <linux/module.h>
+#include <linux/highmem.h>
int is_io_mapping_possible(resource_size_t base, unsigned long size)
{
-#ifndef CONFIG_X86_PAE
+#if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT)
/* There is no way to map greater than 1 << 32 address without PAE */
if (base + size > 0x100000000ULL)
return 0;
@@ -32,16 +33,28 @@ int is_io_mapping_possible(resource_size
}
EXPORT_SYMBOL_GPL(is_io_mapping_possible);
-/* Map 'mfn' using fixed map 'type' and protections 'prot'
- */
-void *
-iomap_atomic_prot_pfn(unsigned long mfn, enum km_type type, pgprot_t prot)
+void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
{
enum fixed_addresses idx;
unsigned long vaddr;
pagefault_disable();
+ debug_kmap_atomic(type);
+ idx = type + KM_TYPE_NR * smp_processor_id();
+ vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+ set_pte_at(&init_mm, vaddr, kmap_pte - idx, pfn_pte(pfn, prot));
+ /*arch_flush_lazy_mmu_mode();*/
+
+ return (void *)vaddr;
+}
+
+/*
+ * Map 'mfn' using fixed map 'type' and protections 'prot'
+ */
+void *
+iomap_atomic_prot_pfn(unsigned long mfn, enum km_type type, pgprot_t prot)
+{
/*
* For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS.
* PAGE_KERNEL_WC maps to PWT, which translates to uncached if the
@@ -51,13 +64,8 @@ iomap_atomic_prot_pfn(unsigned long mfn,
if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC))
prot = PAGE_KERNEL_UC_MINUS;
- idx = type + KM_TYPE_NR*smp_processor_id();
- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
pgprot_val(prot) |= _PAGE_IOMAP;
- set_pte_at(&init_mm, vaddr, kmap_pte-idx, pfn_pte_ma(mfn, prot));
- /*arch_flush_lazy_mmu_mode()*/;
-
- return (void*) vaddr;
+ return kmap_atomic_prot_pfn(mfn, type, prot);
}
EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn);
--- head-2011-03-17.orig/arch/x86/mm/ioremap-xen.c 2011-02-07 15:41:20.000000000 +0100
+++ head-2011-03-17/arch/x86/mm/ioremap-xen.c 2011-02-07 15:41:25.000000000 +0100
@@ -23,13 +23,17 @@
#include <asm/pgalloc.h>
#include <asm/pat.h>
-#ifdef CONFIG_X86_64
-
-static inline int phys_addr_valid(unsigned long addr)
+static inline int phys_addr_valid(resource_size_t addr)
{
- return addr < (1UL << boot_cpu_data.x86_phys_bits);
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+ return !(addr >> boot_cpu_data.x86_phys_bits);
+#else
+ return 1;
+#endif
}
+#ifdef CONFIG_X86_64
+
#define phys_base 0
unsigned long __phys_addr(unsigned long x)
@@ -41,8 +45,7 @@ unsigned long __phys_addr(unsigned long
} else {
VIRTUAL_BUG_ON(x < PAGE_OFFSET);
x -= PAGE_OFFSET;
- VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM :
- !phys_addr_valid(x));
+ VIRTUAL_BUG_ON(!phys_addr_valid(x));
}
return x;
}
@@ -59,10 +62,8 @@ bool __virt_addr_valid(unsigned long x)
if (x < PAGE_OFFSET)
return false;
x -= PAGE_OFFSET;
- if (system_state == SYSTEM_BOOTING ?
- x > MAXMEM : !phys_addr_valid(x)) {
+ if (!phys_addr_valid(x))
return false;
- }
}
return pfn_valid(x >> PAGE_SHIFT);
@@ -73,18 +74,12 @@ EXPORT_SYMBOL(__virt_addr_valid);
#else
-static inline int phys_addr_valid(unsigned long addr)
-{
- return 1;
-}
-
#ifdef CONFIG_DEBUG_VIRTUAL
unsigned long __phys_addr(unsigned long x)
{
- /* VMALLOC_* aren't constants; not available at the boot time */
+ /* VMALLOC_* aren't constants */
VIRTUAL_BUG_ON(x < PAGE_OFFSET);
- VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING &&
- is_vmalloc_addr((void *) x));
+ VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
return x - PAGE_OFFSET;
}
EXPORT_SYMBOL(__phys_addr);
@@ -94,7 +89,9 @@ bool __virt_addr_valid(unsigned long x)
{
if (x < PAGE_OFFSET)
return false;
- if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x))
+ if (__vmalloc_start_set && is_vmalloc_addr((void *) x))
+ return false;
+ if (x >= FIXADDR_START)
return false;
return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
}
@@ -443,16 +440,17 @@ static void __iomem *__ioremap_caller(re
return NULL;
area->phys_addr = phys_addr;
vaddr = (unsigned long) area->addr;
- if (__direct_remap_pfn_range(&init_mm, vaddr, PFN_DOWN(phys_addr),
- size, prot, domid)) {
+
+ if (kernel_map_sync_memtype(phys_addr, size, prot_val)) {
free_memtype(phys_addr, phys_addr + size);
free_vm_area(area);
return NULL;
}
- if (ioremap_change_attr(vaddr, size, prot_val) < 0) {
+ if (__direct_remap_pfn_range(&init_mm, vaddr, PFN_DOWN(phys_addr),
+ size, prot, domid)) {
free_memtype(phys_addr, phys_addr + size);
- vunmap(area->addr);
+ free_vm_area(area);
return NULL;
}
@@ -509,7 +507,7 @@ EXPORT_SYMBOL(ioremap_nocache);
*
* Must be freed with iounmap.
*/
-void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
+void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
{
if (pat_enabled)
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
@@ -539,7 +537,8 @@ static void __iomem *ioremap_default(res
* - UC_MINUS for non-WB-able memory with no other conflicting mappings
* - Inherit from confliting mappings otherwise
*/
- err = reserve_memtype(phys_addr, phys_addr + size, -1, &flags);
+ err = reserve_memtype(phys_addr, phys_addr + size,
+ _PAGE_CACHE_WB, &flags);
if (err < 0)
return NULL;
@@ -678,13 +677,19 @@ static inline pte_t * __init early_iorem
return &bm_pte[pte_index(addr)];
}
+static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
+
void __init early_ioremap_init(void)
{
pmd_t *pmd;
+ int i;
if (early_ioremap_debug)
printk(KERN_INFO "early_ioremap_init()\n");
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+ slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
+
pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
memset(bm_pte, 0, sizeof(bm_pte));
make_lowmem_page_readonly(bm_pte, XENFEAT_writable_page_tables);
@@ -715,7 +720,7 @@ void __init early_ioremap_reset(void)
}
static void __init __early_set_fixmap(enum fixed_addresses idx,
- unsigned long phys, pgprot_t flags)
+ phys_addr_t phys, pgprot_t flags)
{
unsigned long addr = __fix_to_virt(idx);
pte_t *pte;
@@ -734,7 +739,7 @@ static void __init __early_set_fixmap(en
}
static inline void __init early_set_fixmap(enum fixed_addresses idx,
- unsigned long phys, pgprot_t prot)
+ phys_addr_t phys, pgprot_t prot)
{
if (after_paging_init)
__set_fixmap(idx, phys, prot);
@@ -752,6 +757,7 @@ static inline void __init early_clear_fi
static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
+
static int __init check_early_ioremap_leak(void)
{
int count = 0;
@@ -773,9 +779,11 @@ static int __init check_early_ioremap_le
}
late_initcall(check_early_ioremap_leak);
-static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot)
+static void __init __iomem *
+__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
{
- unsigned long offset, last_addr;
+ unsigned long offset;
+ resource_size_t last_addr;
unsigned int nrpages;
enum fixed_addresses idx0, idx;
int i, slot;
@@ -791,15 +799,15 @@ static void __init __iomem *__early_iore
}
if (slot < 0) {
- printk(KERN_INFO "early_iomap(%08lx, %08lx) not found slot\n",
- phys_addr, size);
+ printk(KERN_INFO "early_iomap(%08llx, %08lx) not found slot\n",
+ (u64)phys_addr, size);
WARN_ON(1);
return NULL;
}
if (early_ioremap_debug) {
- printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ",
- phys_addr, size, slot);
+ printk(KERN_INFO "early_ioremap(%08llx, %08lx) [%d] => ",
+ (u64)phys_addr, size, slot);
dump_stack();
}
@@ -839,20 +847,28 @@ static void __init __iomem *__early_iore
--nrpages;
}
if (early_ioremap_debug)
- printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0));
+ printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]);
- prev_map[slot] = (void __iomem *)(offset + fix_to_virt(idx0));
+ prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
return prev_map[slot];
}
/* Remap an IO device */
-void __init __iomem *early_ioremap(unsigned long phys_addr, unsigned long size)
+void __init __iomem *
+early_ioremap(resource_size_t phys_addr, unsigned long size)
{
+ /*
+ * Don't remap the low PCI/ISA area, it's always mapped.
+ */
+ if (is_initial_xendomain() && is_ISA_range(phys_addr, phys_addr + size - 1))
+ return (__force void __iomem *)isa_bus_to_virt((unsigned long)phys_addr);
+
return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO);
}
/* Remap memory */
-void __init __iomem *early_memremap(unsigned long phys_addr, unsigned long size)
+void __init __iomem *
+early_memremap(resource_size_t phys_addr, unsigned long size)
{
return __early_ioremap(phys_to_machine(phys_addr), size, PAGE_KERNEL);
}
@@ -865,6 +881,15 @@ void __init early_iounmap(void __iomem *
enum fixed_addresses idx;
int i, slot;
+ /*
+ * early_ioremap special-cases the PCI/ISA range by not instantiating a
+ * vm_area and by simply returning an address into the kernel mapping
+ * of ISA space. So handle that here.
+ */
+ if ((unsigned long)addr >= fix_to_virt(FIX_ISAMAP_BEGIN)
+ && (unsigned long)addr < fix_to_virt(FIX_ISAMAP_END - 1))
+ return;
+
slot = -1;
for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
if (prev_map[i] == addr) {
@@ -909,8 +934,3 @@ void __init early_iounmap(void __iomem *
}
prev_map[slot] = NULL;
}
-
-void __this_fixmap_does_not_exist(void)
-{
- WARN_ON(1);
-}
--- head-2011-03-17.orig/arch/x86/mm/pageattr-xen.c 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/mm/pageattr-xen.c 2011-02-01 14:44:12.000000000 +0100
@@ -16,6 +16,7 @@
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
+#include <asm/setup.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/proto.h>
@@ -33,6 +34,7 @@ struct cpa_data {
unsigned long pfn;
unsigned force_split : 1;
int curpage;
+ struct page **pages;
};
/*
@@ -45,6 +47,7 @@ static DEFINE_SPINLOCK(cpa_lock);
#define CPA_FLUSHTLB 1
#define CPA_ARRAY 2
+#define CPA_PAGES_ARRAY 4
#ifdef CONFIG_PROC_FS
static unsigned long direct_pages_count[PG_LEVEL_NUM];
@@ -95,7 +98,7 @@ static inline unsigned long highmap_star
static inline unsigned long highmap_end_pfn(void)
{
- return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
+ return __pa(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
}
#endif
@@ -150,7 +153,7 @@ static void __cpa_flush_all(void *arg)
*/
__flush_tlb_all();
- if (cache && boot_cpu_data.x86_model >= 4)
+ if (cache && boot_cpu_data.x86 >= 4)
wbinvd();
}
@@ -201,38 +204,41 @@ static void cpa_flush_range(unsigned lon
}
}
-static void cpa_flush_array(unsigned long *start, int numpages, int cache)
+static void cpa_flush_array(unsigned long *start, int numpages, int cache,
+ int in_flags, struct page **pages)
{
unsigned int i, level;
- unsigned long *addr;
+ unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
BUG_ON(irqs_disabled());
- on_each_cpu(__cpa_flush_range, NULL, 1);
+ on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1);
- if (!cache)
+ if (!cache || do_wbinvd)
return;
- /* 4M threshold */
- if (numpages >= 1024) {
- if (boot_cpu_data.x86_model >= 4)
- wbinvd();
- return;
- }
/*
* We only need to flush on one CPU,
* clflush is a MESI-coherent instruction that
* will cause all other CPUs to flush the same
* cachelines:
*/
- for (i = 0, addr = start; i < numpages; i++, addr++) {
- pte_t *pte = lookup_address(*addr, &level);
+ for (i = 0; i < numpages; i++) {
+ unsigned long addr;
+ pte_t *pte;
+
+ if (in_flags & CPA_PAGES_ARRAY)
+ addr = (unsigned long)page_address(pages[i]);
+ else
+ addr = start[i];
+
+ pte = lookup_address(addr, &level);
/*
* Only flush present addresses:
*/
if (pte && (__pte_val(*pte) & _PAGE_PRESENT))
- clflush_cache_range((void *) *addr, PAGE_SIZE);
+ clflush_cache_range((void *)addr, PAGE_SIZE);
}
}
@@ -498,6 +504,13 @@ static int split_large_page(pte_t *kpte,
pbase = (pte_t *)page_address(base);
paravirt_alloc_pte(&init_mm, page_to_pfn(base));
ref_prot = pte_pgprot(pte_clrhuge(*kpte));
+ /*
+ * If we ever want to utilize the PAT bit, we need to
+ * update this function to make sure it's converted from
+ * bit 12 to bit 7 when we cross from the 2MB level to
+ * the 4K level:
+ */
+ WARN_ON_ONCE(pgprot_val(ref_prot) & _PAGE_PAT_LARGE);
#ifdef CONFIG_X86_64
if (level == PG_LEVEL_1G) {
@@ -597,7 +610,9 @@ static int __change_page_attr(struct cpa
unsigned int level;
pte_t *kpte, old_pte;
- if (cpa->flags & CPA_ARRAY)
+ if (cpa->flags & CPA_PAGES_ARRAY)
+ address = (unsigned long)page_address(cpa->pages[cpa->curpage]);
+ else if (cpa->flags & CPA_ARRAY)
address = cpa->vaddr[cpa->curpage];
else
address = *cpa->vaddr;
@@ -701,7 +716,9 @@ static int cpa_process_alias(struct cpa_
* No need to redo, when the primary call touched the direct
* mapping already:
*/
- if (cpa->flags & CPA_ARRAY)
+ if (cpa->flags & CPA_PAGES_ARRAY)
+ vaddr = (unsigned long)page_address(cpa->pages[cpa->curpage]);
+ else if (cpa->flags & CPA_ARRAY)
vaddr = cpa->vaddr[cpa->curpage];
else
vaddr = *cpa->vaddr;
@@ -712,7 +729,7 @@ static int cpa_process_alias(struct cpa_
alias_cpa = *cpa;
temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
alias_cpa.vaddr = &temp_cpa_vaddr;
- alias_cpa.flags &= ~CPA_ARRAY;
+ alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
ret = __change_page_attr_set_clr(&alias_cpa, 0);
@@ -725,7 +742,7 @@ static int cpa_process_alias(struct cpa_
* No need to redo, when the primary call touched the high
* mapping already:
*/
- if (within(vaddr, (unsigned long) _text, (unsigned long) _end))
+ if (within(vaddr, (unsigned long) _text, _brk_end))
return 0;
/*
@@ -738,7 +755,7 @@ static int cpa_process_alias(struct cpa_
alias_cpa = *cpa;
temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map;
alias_cpa.vaddr = &temp_cpa_vaddr;
- alias_cpa.flags &= ~CPA_ARRAY;
+ alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
/*
* The high mapping range is imprecise, so ignore the return value.
@@ -759,7 +776,7 @@ static int __change_page_attr_set_clr(st
*/
cpa->numpages = numpages;
/* for array changes, we can't use large page */
- if (cpa->flags & CPA_ARRAY)
+ if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
cpa->numpages = 1;
if (!debug_pagealloc)
@@ -783,7 +800,7 @@ static int __change_page_attr_set_clr(st
*/
BUG_ON(cpa->numpages > numpages);
numpages -= cpa->numpages;
- if (cpa->flags & CPA_ARRAY)
+ if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
cpa->curpage++;
else
*cpa->vaddr += cpa->numpages * PAGE_SIZE;
@@ -800,7 +817,8 @@ static inline int cache_attr(pgprot_t at
static int change_page_attr_set_clr(unsigned long *addr, int numpages,
pgprot_t mask_set, pgprot_t mask_clr,
- int force_split, int array)
+ int force_split, int in_flag,
+ struct page **pages)
{
struct cpa_data cpa;
int ret, cache, checkalias;
@@ -815,15 +833,7 @@ static int change_page_attr_set_clr(unsi
return 0;
/* Ensure we are PAGE_SIZE aligned */
- if (!array) {
- if (*addr & ~PAGE_MASK) {
- *addr &= PAGE_MASK;
- /*
- * People should not be passing in unaligned addresses:
- */
- WARN_ON_ONCE(1);
- }
- } else {
+ if (in_flag & CPA_ARRAY) {
int i;
for (i = 0; i < numpages; i++) {
if (addr[i] & ~PAGE_MASK) {
@@ -831,6 +841,18 @@ static int change_page_attr_set_clr(unsi
WARN_ON_ONCE(1);
}
}
+ } else if (!(in_flag & CPA_PAGES_ARRAY)) {
+ /*
+ * in_flag of CPA_PAGES_ARRAY implies it is aligned.
+		 * No need to check in that case.
+ */
+ if (*addr & ~PAGE_MASK) {
+ *addr &= PAGE_MASK;
+ /*
+ * People should not be passing in unaligned addresses:
+ */
+ WARN_ON_ONCE(1);
+ }
}
/* Must avoid aliasing mappings in the highmem code */
@@ -848,6 +870,7 @@ static int change_page_attr_set_clr(unsi
xen_multicall_flush(true);
cpa.vaddr = addr;
+ cpa.pages = pages;
cpa.numpages = numpages;
cpa.mask_set = mask_set;
cpa.mask_clr = mask_clr;
@@ -855,8 +878,8 @@ static int change_page_attr_set_clr(unsi
cpa.curpage = 0;
cpa.force_split = force_split;
- if (array)
- cpa.flags |= CPA_ARRAY;
+ if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY))
+ cpa.flags |= in_flag;
/* No alias checking for _NX bit modifications */
checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
@@ -882,9 +905,10 @@ static int change_page_attr_set_clr(unsi
* wbindv):
*/
if (!ret && cpu_has_clflush) {
- if (cpa.flags & CPA_ARRAY)
- cpa_flush_array(addr, numpages, cache);
- else
+ if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
+ cpa_flush_array(addr, numpages, cache,
+ cpa.flags, pages);
+ } else
cpa_flush_range(*addr, numpages, cache);
} else
cpa_flush_all(cache);
@@ -905,14 +929,28 @@ static inline int change_page_attr_set(u
pgprot_t mask, int array)
{
return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
- array);
+ (array ? CPA_ARRAY : 0), NULL);
}
static inline int change_page_attr_clear(unsigned long *addr, int numpages,
pgprot_t mask, int array)
{
return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
- array);
+ (array ? CPA_ARRAY : 0), NULL);
+}
+
+static inline int cpa_set_pages_array(struct page **pages, int numpages,
+ pgprot_t mask)
+{
+ return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0,
+ CPA_PAGES_ARRAY, pages);
+}
+
+static inline int cpa_clear_pages_array(struct page **pages, int numpages,
+ pgprot_t mask)
+{
+ return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0,
+ CPA_PAGES_ARRAY, pages);
}
#ifdef CONFIG_XEN
@@ -971,71 +1009,94 @@ int _set_memory_uc(unsigned long addr, i
int set_memory_uc(unsigned long addr, int numpages)
{
+ int ret;
+
/*
* for now UC MINUS. see comments in ioremap_nocache()
*/
- if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
- _PAGE_CACHE_UC_MINUS, NULL))
- return -EINVAL;
+ ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
+ _PAGE_CACHE_UC_MINUS, NULL);
+ if (ret)
+ goto out_err;
+
+ ret = _set_memory_uc(addr, numpages);
+ if (ret)
+ goto out_free;
- return _set_memory_uc(addr, numpages);
+ return 0;
+
+out_free:
+ free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+out_err:
+ return ret;
}
EXPORT_SYMBOL(set_memory_uc);
int set_memory_array_uc(unsigned long *addr, int addrinarray)
{
- unsigned long start;
- unsigned long end;
- int i;
+ int i, j;
+ int ret;
+
/*
* for now UC MINUS. see comments in ioremap_nocache()
*/
for (i = 0; i < addrinarray; i++) {
- start = __pa(addr[i]);
- for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
- if (end != __pa(addr[i + 1]))
- break;
- i++;
- }
- if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
- goto out;
+ ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE,
+ _PAGE_CACHE_UC_MINUS, NULL);
+ if (ret)
+ goto out_free;
}
- return change_page_attr_set(addr, addrinarray,
+ ret = change_page_attr_set(addr, addrinarray,
__pgprot(_PAGE_CACHE_UC_MINUS), 1);
-out:
- for (i = 0; i < addrinarray; i++) {
- unsigned long tmp = __pa(addr[i]);
+ if (ret)
+ goto out_free;
- if (tmp == start)
- break;
- for (end = tmp + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
- if (end != __pa(addr[i + 1]))
- break;
- i++;
- }
- free_memtype(tmp, end);
- }
- return -EINVAL;
+ return 0;
+
+out_free:
+ for (j = 0; j < i; j++)
+ free_memtype(__pa(addr[j]), __pa(addr[j]) + PAGE_SIZE);
+
+ return ret;
}
EXPORT_SYMBOL(set_memory_array_uc);
int _set_memory_wc(unsigned long addr, int numpages)
{
- return change_page_attr_set(&addr, numpages,
+ int ret;
+ ret = change_page_attr_set(&addr, numpages,
+ __pgprot(_PAGE_CACHE_UC_MINUS), 0);
+
+ if (!ret) {
+ ret = change_page_attr_set(&addr, numpages,
__pgprot(_PAGE_CACHE_WC), 0);
+ }
+ return ret;
}
int set_memory_wc(unsigned long addr, int numpages)
{
+ int ret;
+
if (!pat_enabled)
return set_memory_uc(addr, numpages);
- if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
- _PAGE_CACHE_WC, NULL))
- return -EINVAL;
+ ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
+ _PAGE_CACHE_WC, NULL);
+ if (ret)
+ goto out_err;
- return _set_memory_wc(addr, numpages);
+ ret = _set_memory_wc(addr, numpages);
+ if (ret)
+ goto out_free;
+
+ return 0;
+
+out_free:
+ free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+out_err:
+ return ret;
}
EXPORT_SYMBOL(set_memory_wc);
@@ -1047,29 +1108,31 @@ int _set_memory_wb(unsigned long addr, i
int set_memory_wb(unsigned long addr, int numpages)
{
- free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+ int ret;
- return _set_memory_wb(addr, numpages);
+ ret = _set_memory_wb(addr, numpages);
+ if (ret)
+ return ret;
+
+ free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+ return 0;
}
EXPORT_SYMBOL(set_memory_wb);
int set_memory_array_wb(unsigned long *addr, int addrinarray)
{
int i;
+ int ret;
- for (i = 0; i < addrinarray; i++) {
- unsigned long start = __pa(addr[i]);
- unsigned long end;
-
- for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
- if (end != __pa(addr[i + 1]))
- break;
- i++;
- }
- free_memtype(start, end);
- }
- return change_page_attr_clear(addr, addrinarray,
+ ret = change_page_attr_clear(addr, addrinarray,
__pgprot(_PAGE_CACHE_MASK), 1);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < addrinarray; i++)
+ free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE);
+
+ return 0;
}
EXPORT_SYMBOL(set_memory_array_wb);
@@ -1105,7 +1168,7 @@ int set_memory_np(unsigned long addr, in
int set_memory_4k(unsigned long addr, int numpages)
{
return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
- __pgprot(0), 1, 0);
+ __pgprot(0), 1, 0, NULL);
}
int set_pages_uc(struct page *page, int numpages)
@@ -1116,6 +1179,35 @@ int set_pages_uc(struct page *page, int
}
EXPORT_SYMBOL(set_pages_uc);
+int set_pages_array_uc(struct page **pages, int addrinarray)
+{
+ unsigned long start;
+ unsigned long end;
+ int i;
+ int free_idx;
+
+ for (i = 0; i < addrinarray; i++) {
+ start = (unsigned long)page_address(pages[i]);
+ end = start + PAGE_SIZE;
+ if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
+ goto err_out;
+ }
+
+ if (cpa_set_pages_array(pages, addrinarray,
+ __pgprot(_PAGE_CACHE_UC_MINUS)) == 0) {
+ return 0; /* Success */
+ }
+err_out:
+ free_idx = i;
+ for (i = 0; i < free_idx; i++) {
+ start = (unsigned long)page_address(pages[i]);
+ end = start + PAGE_SIZE;
+ free_memtype(start, end);
+ }
+ return -EINVAL;
+}
+EXPORT_SYMBOL(set_pages_array_uc);
+
int set_pages_wb(struct page *page, int numpages)
{
unsigned long addr = (unsigned long)page_address(page);
@@ -1124,6 +1216,28 @@ int set_pages_wb(struct page *page, int
}
EXPORT_SYMBOL(set_pages_wb);
+int set_pages_array_wb(struct page **pages, int addrinarray)
+{
+ int retval;
+ unsigned long start;
+ unsigned long end;
+ int i;
+
+ retval = cpa_clear_pages_array(pages, addrinarray,
+ __pgprot(_PAGE_CACHE_MASK));
+ if (retval)
+ return retval;
+
+ for (i = 0; i < addrinarray; i++) {
+ start = (unsigned long)page_address(pages[i]);
+ end = start + PAGE_SIZE;
+ free_memtype(start, end);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(set_pages_array_wb);
+
int set_pages_x(struct page *page, int numpages)
{
unsigned long addr = (unsigned long)page_address(page);
--- head-2011-03-17.orig/arch/x86/mm/pat-xen.c 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/arch/x86/mm/pat-xen.c 2011-02-01 14:44:12.000000000 +0100
@@ -31,7 +31,7 @@
#ifdef CONFIG_X86_PAT
int __read_mostly pat_enabled = 1;
-void __cpuinit pat_disable(char *reason)
+static inline void pat_disable(const char *reason)
{
pat_enabled = 0;
printk(KERN_INFO "%s\n", reason);
@@ -43,6 +43,11 @@ static int __init nopat(char *str)
return 0;
}
early_param("nopat", nopat);
+#else
+static inline void pat_disable(const char *reason)
+{
+ (void)reason;
+}
#endif
@@ -79,16 +84,20 @@ void pat_init(void)
if (!pat_enabled)
return;
- /* Paranoia check. */
- if (!cpu_has_pat && boot_pat_state) {
- /*
- * If this happens we are on a secondary CPU, but
- * switched to PAT on the boot CPU. We have no way to
- * undo PAT.
- */
- printk(KERN_ERR "PAT enabled, "
- "but not supported by secondary CPU\n");
- BUG();
+ if (!cpu_has_pat) {
+ if (!boot_pat_state) {
+ pat_disable("PAT not supported by CPU.");
+ return;
+ } else {
+ /*
+ * If this happens we are on a secondary CPU, but
+ * switched to PAT on the boot CPU. We have no way to
+ * undo PAT.
+ */
+ printk(KERN_ERR "PAT enabled, "
+ "but not supported by secondary CPU\n");
+ BUG();
+ }
}
#ifndef CONFIG_XEN
@@ -195,10 +204,10 @@ static unsigned long pat_x_mtrr_type(u64
u8 mtrr_type;
mtrr_type = mtrr_type_lookup(start, end);
- if (mtrr_type == MTRR_TYPE_UNCACHABLE)
- return _PAGE_CACHE_UC;
- if (mtrr_type == MTRR_TYPE_WRCOMB)
- return _PAGE_CACHE_WC;
+ if (mtrr_type != MTRR_TYPE_WRBACK)
+ return _PAGE_CACHE_UC_MINUS;
+
+ return _PAGE_CACHE_WB;
}
return req_type;
@@ -371,23 +380,13 @@ int reserve_memtype(u64 start, u64 end,
return 0;
}
- if (req_type == -1) {
- /*
- * Call mtrr_lookup to get the type hint. This is an
- * optimization for /dev/mem mmap'ers into WB memory (BIOS
- * tools and ACPI tools). Use WB request for WB memory and use
- * UC_MINUS otherwise.
- */
- u8 mtrr_type = mtrr_type_lookup(start, end);
-
- if (mtrr_type == MTRR_TYPE_WRBACK)
- actual_type = _PAGE_CACHE_WB;
- else
- actual_type = _PAGE_CACHE_UC_MINUS;
- } else {
- actual_type = pat_x_mtrr_type(start, end,
- req_type & _PAGE_CACHE_MASK);
- }
+ /*
+ * Call mtrr_lookup to get the type hint. This is an
+ * optimization for /dev/mem mmap'ers into WB memory (BIOS
+ * tools and ACPI tools). Use WB request for WB memory and use
+ * UC_MINUS otherwise.
+ */
+ actual_type = pat_x_mtrr_type(start, end, req_type & _PAGE_CACHE_MASK);
if (new_type)
*new_type = actual_type;
@@ -565,9 +564,7 @@ static inline int range_is_allowed(unsig
int phys_mem_access_prot_allowed(struct file *file, unsigned long mfn,
unsigned long size, pgprot_t *vma_prot)
{
- u64 addr = (u64)mfn << PAGE_SHIFT;
- unsigned long flags = -1;
- int retval;
+ unsigned long flags = _PAGE_CACHE_WB;
if (!range_is_allowed(mfn, size))
return 0;
@@ -597,60 +594,21 @@ int phys_mem_access_prot_allowed(struct
#endif
#endif
- /*
- * With O_SYNC, we can only take UC_MINUS mapping. Fail if we cannot.
- *
- * Without O_SYNC, we want to get
- * - WB for WB-able memory and no other conflicting mappings
- * - UC_MINUS for non-WB-able memory with no other conflicting mappings
- * - Inherit from confliting mappings otherwise
- */
- if (flags != -1) {
- retval = reserve_memtype(addr, addr + size, flags, NULL);
- } else {
- retval = reserve_memtype(addr, addr + size, -1, &flags);
- }
-
- if (retval < 0)
- return 0;
-
- if (ioremap_check_change_attr(mfn, size, flags) < 0) {
- free_memtype(addr, addr + size);
- printk(KERN_INFO
- "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n",
- current->comm, current->pid,
- cattr_name(flags),
- addr, addr + size);
- return 0;
- }
-
*vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
flags);
return 1;
}
-void map_devmem(unsigned long mfn, unsigned long size, pgprot_t vma_prot)
-{
- unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
- u64 addr = (u64)mfn << PAGE_SHIFT;
- unsigned long flags;
-
- reserve_memtype(addr, addr + size, want_flags, &flags);
- if (flags != want_flags) {
- printk(KERN_INFO
- "%s:%d /dev/mem expected mapping type %s for %Lx-%Lx, got %s\n",
- current->comm, current->pid,
- cattr_name(want_flags),
- addr, (unsigned long long)(addr + size),
- cattr_name(flags));
- }
-}
-
-void unmap_devmem(unsigned long mfn, unsigned long size, pgprot_t vma_prot)
+/*
+ * Change the memory type for the physical address range in kernel identity
+ * mapping space if that range is a part of identity map.
+ */
+int kernel_map_sync_memtype(u64 ma, unsigned long size, unsigned long flags)
{
- u64 addr = (u64)mfn << PAGE_SHIFT;
+ if (!pat_enabled)
+ return 0;
- free_memtype(addr, addr + size);
+ return ioremap_check_change_attr(ma >> PAGE_SHIFT, size, flags);
}
#ifndef CONFIG_XEN
@@ -663,17 +621,18 @@ static int reserve_pfn_range(u64 paddr,
int strict_prot)
{
int is_ram = 0;
- int id_sz, ret;
- unsigned long flags;
+ int ret;
unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK);
+ unsigned long flags = want_flags;
is_ram = pat_pagerange_is_ram(paddr, paddr + size);
/*
- * reserve_pfn_range() doesn't support RAM pages.
+ * reserve_pfn_range() doesn't support RAM pages. Maintain the current
+ * behavior with RAM pages by returning success.
*/
if (is_ram != 0)
- return -EINVAL;
+ return 0;
ret = reserve_memtype(paddr, paddr + size, want_flags, &flags);
if (ret)
@@ -700,23 +659,8 @@ static int reserve_pfn_range(u64 paddr,
flags);
}
- /* Need to keep identity mapping in sync */
- if (paddr >= __pa(high_memory))
- return 0;
-
- id_sz = (__pa(high_memory) < paddr + size) ?
- __pa(high_memory) - paddr :
- size;
-
- if (ioremap_change_attr((unsigned long)__va(paddr), id_sz, flags) < 0) {
+ if (kernel_map_sync_memtype(paddr, size, flags) < 0) {
free_memtype(paddr, paddr + size);
- printk(KERN_ERR
- "%s:%d reserve_pfn_range ioremap_change_attr failed %s "
- "for %Lx-%Lx\n",
- current->comm, current->pid,
- cattr_name(flags),
- (unsigned long long)paddr,
- (unsigned long long)(paddr + size));
return -EINVAL;
}
return 0;
@@ -741,29 +685,28 @@ static void free_pfn_range(u64 paddr, un
*
* If the vma has a linear pfn mapping for the entire range, we get the prot
* from pte and reserve the entire vma range with single reserve_pfn_range call.
- * Otherwise, we reserve the entire vma range, my ging through the PTEs page
- * by page to get physical address and protection.
*/
int track_pfn_vma_copy(struct vm_area_struct *vma)
{
- int retval = 0;
- unsigned long i, j;
resource_size_t paddr;
unsigned long prot;
- unsigned long vma_start = vma->vm_start;
- unsigned long vma_end = vma->vm_end;
- unsigned long vma_size = vma_end - vma_start;
+ unsigned long vma_size = vma->vm_end - vma->vm_start;
pgprot_t pgprot;
if (!pat_enabled)
return 0;
+ /*
+ * For now, only handle remap_pfn_range() vmas where
+ * is_linear_pfn_mapping() == TRUE. Handling of
+ * vm_insert_pfn() is TBD.
+ */
if (is_linear_pfn_mapping(vma)) {
/*
* reserve the whole chunk covered by vma. We need the
* starting address and protection from pte.
*/
- if (follow_phys(vma, vma_start, 0, &prot, &paddr)) {
+ if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
WARN_ON_ONCE(1);
return -EINVAL;
}
@@ -771,28 +714,7 @@ int track_pfn_vma_copy(struct vm_area_st
return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
}
- /* reserve entire vma page by page, using pfn and prot from pte */
- for (i = 0; i < vma_size; i += PAGE_SIZE) {
- if (follow_phys(vma, vma_start + i, 0, &prot, &paddr))
- continue;
-
- pgprot = __pgprot(prot);
- retval = reserve_pfn_range(paddr, PAGE_SIZE, &pgprot, 1);
- if (retval)
- goto cleanup_ret;
- }
return 0;
-
-cleanup_ret:
- /* Reserve error: Cleanup partial reservation and return error */
- for (j = 0; j < i; j += PAGE_SIZE) {
- if (follow_phys(vma, vma_start + j, 0, &prot, &paddr))
- continue;
-
- free_pfn_range(paddr, PAGE_SIZE);
- }
-
- return retval;
}
/*
@@ -802,50 +724,28 @@ cleanup_ret:
* prot is passed in as a parameter for the new mapping. If the vma has a
* linear pfn mapping for the entire range reserve the entire vma range with
* single reserve_pfn_range call.
- * Otherwise, we look t the pfn and size and reserve only the specified range
- * page by page.
- *
- * Note that this function can be called with caller trying to map only a
- * subrange/page inside the vma.
*/
int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot,
unsigned long pfn, unsigned long size)
{
- int retval = 0;
- unsigned long i, j;
- resource_size_t base_paddr;
resource_size_t paddr;
- unsigned long vma_start = vma->vm_start;
- unsigned long vma_end = vma->vm_end;
- unsigned long vma_size = vma_end - vma_start;
+ unsigned long vma_size = vma->vm_end - vma->vm_start;
if (!pat_enabled)
return 0;
+ /*
+ * For now, only handle remap_pfn_range() vmas where
+ * is_linear_pfn_mapping() == TRUE. Handling of
+ * vm_insert_pfn() is TBD.
+ */
if (is_linear_pfn_mapping(vma)) {
/* reserve the whole chunk starting from vm_pgoff */
paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
return reserve_pfn_range(paddr, vma_size, prot, 0);
}
- /* reserve page by page using pfn and size */
- base_paddr = (resource_size_t)pfn << PAGE_SHIFT;
- for (i = 0; i < size; i += PAGE_SIZE) {
- paddr = base_paddr + i;
- retval = reserve_pfn_range(paddr, PAGE_SIZE, prot, 0);
- if (retval)
- goto cleanup_ret;
- }
return 0;
-
-cleanup_ret:
- /* Reserve error: Cleanup partial reservation and return error */
- for (j = 0; j < i; j += PAGE_SIZE) {
- paddr = base_paddr + j;
- free_pfn_range(paddr, PAGE_SIZE);
- }
-
- return retval;
}
/*
@@ -856,39 +756,23 @@ cleanup_ret:
void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
unsigned long size)
{
- unsigned long i;
resource_size_t paddr;
- unsigned long prot;
- unsigned long vma_start = vma->vm_start;
- unsigned long vma_end = vma->vm_end;
- unsigned long vma_size = vma_end - vma_start;
+ unsigned long vma_size = vma->vm_end - vma->vm_start;
if (!pat_enabled)
return;
+ /*
+ * For now, only handle remap_pfn_range() vmas where
+ * is_linear_pfn_mapping() == TRUE. Handling of
+ * vm_insert_pfn() is TBD.
+ */
if (is_linear_pfn_mapping(vma)) {
/* free the whole chunk starting from vm_pgoff */
paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
free_pfn_range(paddr, vma_size);
return;
}
-
- if (size != 0 && size != vma_size) {
- /* free page by page, using pfn and size */
- paddr = (resource_size_t)pfn << PAGE_SHIFT;
- for (i = 0; i < size; i += PAGE_SIZE) {
- paddr = paddr + i;
- free_pfn_range(paddr, PAGE_SIZE);
- }
- } else {
- /* free entire vma, page by page, using the pfn from pte */
- for (i = 0; i < vma_size; i += PAGE_SIZE) {
- if (follow_phys(vma, vma_start + i, 0, &prot, &paddr))
- continue;
-
- free_pfn_range(paddr, PAGE_SIZE);
- }
- }
}
#endif /* CONFIG_XEN */
--- head-2011-03-17.orig/arch/x86/mm/pgtable-xen.c 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/mm/pgtable-xen.c 2011-02-01 14:44:12.000000000 +0100
@@ -122,10 +122,6 @@ void __pud_free_tlb(struct mmu_gather *t
#endif /* PAGETABLE_LEVELS > 3 */
#endif /* PAGETABLE_LEVELS > 2 */
-#ifndef CONFIG_X86_64
-#define TASK_SIZE64 TASK_SIZE
-#endif
-
static void _pin_lock(struct mm_struct *mm, int lock) {
if (lock)
spin_lock(&mm->page_table_lock);
@@ -149,7 +145,7 @@ static void _pin_lock(struct mm_struct *
pgd_t *pgd = mm->pgd;
unsigned g;
- for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
+ for (g = 0; g <= ((TASK_SIZE_MAX-1) / PGDIR_SIZE); g++, pgd++) {
pud_t *pud;
unsigned u;
@@ -230,10 +226,10 @@ static void pgd_walk(pgd_t *pgd_base, pg
* Cannot iterate up to USER_PTRS_PER_PGD on x86-64 as these pagetables
* may not be the 'current' task's pagetables (e.g., current may be
* 32-bit, but the pagetables may be for a 64-bit task).
- * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
- * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
+ * Subtracting 1 from TASK_SIZE_MAX means the loop limit is correct
+ * regardless of whether TASK_SIZE_MAX is a multiple of PGDIR_SIZE.
*/
- for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
+ for (g = 0, seq = 0; g <= ((TASK_SIZE_MAX-1) / PGDIR_SIZE); g++, pgd++) {
if (pgd_none(*pgd))
continue;
pud = pud_offset(pgd, 0);
@@ -739,9 +735,26 @@ int ptep_clear_flush_young(struct vm_are
return young;
}
+/**
+ * reserve_top_address - reserves a hole in the top of kernel address space
+ * @reserve - size of hole to reserve
+ *
+ * Can be used to relocate the fixmap area and poke a hole in the top
+ * of kernel address space to make room for a hypervisor.
+ */
+void __init reserve_top_address(unsigned long reserve)
+{
+#ifdef CONFIG_X86_32
+ BUG_ON(fixmaps_set > 0);
+ printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
+ (int)-reserve);
+ __FIXADDR_TOP = -reserve - PAGE_SIZE;
+#endif
+}
+
int fixmaps_set;
-void xen_set_fixmap(enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
+void xen_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
{
unsigned long address = __fix_to_virt(idx);
pte_t pte;
@@ -760,6 +773,8 @@ void xen_set_fixmap(enum fixed_addresses
set_pte_vaddr_pud(level3_user_pgt, address, pte);
break;
case FIX_EARLYCON_MEM_BASE:
+ case FIX_SHARED_INFO:
+ case FIX_ISAMAP_END ... FIX_ISAMAP_BEGIN:
xen_l1_entry_update(level1_fixmap_pgt + pte_index(address),
pfn_pte_ma(phys >> PAGE_SHIFT, flags));
fixmaps_set++;
--- head-2011-03-17.orig/arch/x86/mm/pgtable_32-xen.c 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/mm/pgtable_32-xen.c 2011-02-01 14:44:12.000000000 +0100
@@ -25,6 +25,8 @@
#include <xen/features.h>
#include <asm/hypervisor.h>
+unsigned int __VMALLOC_RESERVE = 128 << 20;
+
/*
* Associate a virtual page frame with a given physical page frame
* and protection flags for that frame.
@@ -54,7 +56,7 @@ void set_pte_vaddr(unsigned long vaddr,
}
pte = pte_offset_kernel(pmd, vaddr);
if (pte_val(pteval))
- set_pte_present(&init_mm, vaddr, pte, pteval);
+ set_pte_at(&init_mm, vaddr, pte, pteval);
else
pte_clear(&init_mm, vaddr, pte);
@@ -109,21 +111,6 @@ unsigned long hypervisor_virt_start = HY
unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - PAGE_SIZE);
EXPORT_SYMBOL(__FIXADDR_TOP);
-/**
- * reserve_top_address - reserves a hole in the top of kernel address space
- * @reserve - size of hole to reserve
- *
- * Can be used to relocate the fixmap area and poke a hole in the top
- * of kernel address space to make room for a hypervisor.
- */
-void __init reserve_top_address(unsigned long reserve)
-{
- BUG_ON(fixmaps_set > 0);
- printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
- (int)-reserve);
- __FIXADDR_TOP = -reserve - PAGE_SIZE;
-}
-
/*
* vmalloc=size forces the vmalloc area to be exactly 'size'
* bytes. This can be used to increase (or decrease) the
--- head-2011-03-17.orig/drivers/acpi/Makefile 2011-01-31 14:53:38.000000000 +0100
+++ head-2011-03-17/drivers/acpi/Makefile 2011-02-01 14:44:12.000000000 +0100
@@ -67,9 +67,7 @@ obj-$(CONFIG_ACPI_EC_DEBUGFS) += ec_sys.
processor-y := processor_driver.o processor_throttling.o
processor-y += processor_idle.o processor_thermal.o
processor-$(CONFIG_CPU_FREQ) += processor_perflib.o
-ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL
-processor-objs += processor_perflib.o processor_extcntl.o
-endif
+processor-$(CONFIG_PROCESSOR_EXTERNAL_CONTROL) += processor_perflib.o processor_extcntl.o
obj-$(CONFIG_ACPI_PROCESSOR_AGGREGATOR) += acpi_pad.o
obj-$(CONFIG_ACPI_IPMI) += acpi_ipmi.o
--- head-2011-03-17.orig/drivers/acpi/acpica/hwsleep.c 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/drivers/acpi/acpica/hwsleep.c 2011-02-01 14:44:12.000000000 +0100
@@ -394,7 +394,7 @@ acpi_status asmlinkage acpi_enter_sleep_
#else
/* PV ACPI just need check hypercall return value */
err = acpi_notify_hypervisor_state(sleep_state,
- PM1Acontrol, PM1Bcontrol);
+ pm1a_control, pm1b_control);
if (err) {
printk(KERN_ERR "ACPI: Hypervisor failure [%d]\n", err);
return_ACPI_STATUS(AE_ERROR);
--- head-2011-03-17.orig/drivers/oprofile/oprofile_files.c 2011-01-31 17:01:49.000000000 +0100
+++ head-2011-03-17/drivers/oprofile/oprofile_files.c 2011-02-01 14:44:12.000000000 +0100
@@ -181,6 +181,7 @@ static const struct file_operations dump
};
#ifdef CONFIG_XEN
+#include <linux/slab.h>
#define TMPBUFSIZE 512
--- head-2011-03-17.orig/drivers/pci/msi-xen.c 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/drivers/pci/msi-xen.c 2011-02-01 14:44:12.000000000 +0100
@@ -47,11 +47,12 @@ struct msi_pirq_entry {
/* Arch hooks */
-int __attribute__ ((weak))
-arch_msi_check_device(struct pci_dev *dev, int nvec, int type)
+#ifndef arch_msi_check_device
+int arch_msi_check_device(struct pci_dev *dev, int nvec, int type)
{
return 0;
}
+#endif
static void __msi_set_enable(struct pci_dev *dev, int pos, int enable)
{
@@ -308,13 +309,15 @@ EXPORT_SYMBOL_GPL(pci_restore_msi_state)
/**
* msi_capability_init - configure device's MSI capability structure
* @dev: pointer to the pci_dev data structure of MSI device function
+ * @nvec: number of interrupts to allocate
*
- * Setup the MSI capability structure of device function with a single
- * MSI irq, regardless of device function is capable of handling
- * multiple messages. A return of zero indicates the successful setup
- * of an entry zero with the new MSI irq or non-zero for otherwise.
- **/
-static int msi_capability_init(struct pci_dev *dev)
+ * Setup the MSI capability structure of the device with the requested
+ * number of interrupts. A return value of zero indicates the successful
+ * setup of an entry with the new MSI irq. A negative return value indicates
+ * an error, and a positive return value indicates the number of interrupts
+ * which could have been allocated.
+ */
+static int msi_capability_init(struct pci_dev *dev, int nvec)
{
int pos, pirq;
u16 control;
@@ -324,6 +327,7 @@ static int msi_capability_init(struct pc
pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
pci_read_config_word(dev, msi_control_reg(pos), &control);
+ WARN_ON(nvec > 1); /* XXX */
pirq = msi_map_vector(dev, 0, 0);
if (pirq < 0)
return -EBUSY;
@@ -457,22 +461,34 @@ static int pci_msi_check_device(struct p
}
/**
- * pci_enable_msi - configure device's MSI capability structure
- * @dev: pointer to the pci_dev data structure of MSI device function
+ * pci_enable_msi_block - configure device's MSI capability structure
+ * @dev: device to configure
+ * @nvec: number of interrupts to configure
*
- * Setup the MSI capability structure of device function with
- * a single MSI irq upon its software driver call to request for
- * MSI mode enabled on its hardware device function. A return of zero
- * indicates the successful setup of an entry zero with the new MSI
- * vector or non-zero for otherwise.
- **/
+ * Allocate IRQs for a device with the MSI capability.
+ * This function returns a negative errno if an error occurs. If it
+ * is unable to allocate the number of interrupts requested, it returns
+ * the number of interrupts it might be able to allocate. If it successfully
+ * allocates at least the number of interrupts requested, it returns 0 and
+ * updates the @dev's irq member to the lowest new interrupt number; the
+ * other interrupt numbers allocated to this device are consecutive.
+ */
extern int pci_frontend_enable_msi(struct pci_dev *dev);
-int pci_enable_msi(struct pci_dev* dev)
+int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec)
{
- int temp, status;
+ int temp, status, pos, maxvec;
+ u16 msgctl;
struct msi_dev_list *msi_dev_entry = get_msi_dev_pirq_list(dev);
- status = pci_msi_check_device(dev, 1, PCI_CAP_ID_MSI);
+ pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
+ if (!pos)
+ return -EINVAL;
+ pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl);
+ maxvec = 1 << ((msgctl & PCI_MSI_FLAGS_QMASK) >> 1);
+ if (nvec > maxvec)
+ return maxvec;
+
+ status = pci_msi_check_device(dev, nvec, PCI_CAP_ID_MSI);
if (status)
return status;
@@ -482,6 +498,7 @@ int pci_enable_msi(struct pci_dev* dev)
int ret;
temp = dev->irq;
+ WARN_ON(nvec > 1); /* XXX */
ret = pci_frontend_enable_msi(dev);
if (ret)
return ret;
@@ -496,23 +513,23 @@ int pci_enable_msi(struct pci_dev* dev)
temp = dev->irq;
- /* Check whether driver already requested for MSI-X irqs */
+ /* Check whether driver already requested MSI-X irqs */
if (dev->msix_enabled) {
dev_info(&dev->dev, "can't enable MSI "
"(MSI-X already enabled)\n");
return -EINVAL;
}
- status = msi_capability_init(dev);
+ status = msi_capability_init(dev, nvec);
if ( !status )
msi_dev_entry->default_irq = temp;
return status;
}
-EXPORT_SYMBOL(pci_enable_msi);
+EXPORT_SYMBOL(pci_enable_msi_block);
extern void pci_frontend_disable_msi(struct pci_dev* dev);
-void pci_msi_shutdown(struct pci_dev* dev)
+void pci_msi_shutdown(struct pci_dev *dev)
{
int pirq;
struct msi_dev_list *msi_dev_entry = get_msi_dev_pirq_list(dev);
@@ -540,6 +557,7 @@ void pci_msi_shutdown(struct pci_dev* de
pci_intx_for_msi(dev, 1);
dev->msi_enabled = 0;
}
+
void pci_disable_msi(struct pci_dev* dev)
{
pci_msi_shutdown(dev);
@@ -547,6 +565,23 @@ void pci_disable_msi(struct pci_dev* dev
EXPORT_SYMBOL(pci_disable_msi);
/**
+ * pci_msix_table_size - return the number of device's MSI-X table entries
+ * @dev: pointer to the pci_dev data structure of MSI-X device function
+ */
+int pci_msix_table_size(struct pci_dev *dev)
+{
+ int pos;
+ u16 control;
+
+ pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
+ if (!pos)
+ return 0;
+
+ pci_read_config_word(dev, msi_control_reg(pos), &control);
+ return multi_msix_capable(control);
+}
+
+/**
* pci_enable_msix - configure device's MSI-X capability structure
* @dev: pointer to the pci_dev data structure of MSI-X device function
* @entries: pointer to an array of MSI-X entries
@@ -565,9 +600,8 @@ extern int pci_frontend_enable_msix(stru
struct msix_entry *entries, int nvec);
int pci_enable_msix(struct pci_dev* dev, struct msix_entry *entries, int nvec)
{
- int status, pos, nr_entries;
+ int status, nr_entries;
int i, j, temp;
- u16 control;
struct msi_dev_list *msi_dev_entry = get_msi_dev_pirq_list(dev);
if (!entries)
@@ -614,9 +648,7 @@ int pci_enable_msix(struct pci_dev* dev,
if (status)
return status;
- pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
- pci_read_config_word(dev, msi_control_reg(pos), &control);
- nr_entries = multi_msix_capable(control);
+ nr_entries = pci_msix_table_size(dev);
if (nvec > nr_entries)
return -EINVAL;
--- head-2011-03-17.orig/drivers/xen/Kconfig 2011-02-02 15:37:07.000000000 +0100
+++ head-2011-03-17/drivers/xen/Kconfig 2011-02-01 14:44:12.000000000 +0100
@@ -14,7 +14,6 @@ menu "XEN"
config XEN_PRIVILEGED_GUEST
bool "Privileged Guest (domain 0)"
- select PCI_REASSIGN if PCI
help
Support for privileged operation (domain 0)
@@ -333,10 +332,6 @@ endmenu
config HAVE_IRQ_IGNORE_UNHANDLED
def_bool y
-config GENERIC_HARDIRQS_NO__DO_IRQ
- def_bool y
- depends on X86
-
config NO_IDLE_HZ
def_bool y
--- head-2011-03-17.orig/drivers/xen/char/mem.c 2011-02-01 14:38:38.000000000 +0100
+++ head-2011-03-17/drivers/xen/char/mem.c 2011-02-01 14:44:12.000000000 +0100
@@ -158,21 +158,7 @@ static ssize_t write_mem(struct file * f
}
#ifndef ARCH_HAS_DEV_MEM_MMAP_MEM
-static void mmap_mem_open(struct vm_area_struct *vma)
-{
- map_devmem(vma->vm_pgoff, vma->vm_end - vma->vm_start,
- vma->vm_page_prot);
-}
-
-static void mmap_mem_close(struct vm_area_struct *vma)
-{
- unmap_devmem(vma->vm_pgoff, vma->vm_end - vma->vm_start,
- vma->vm_page_prot);
-}
-
static struct vm_operations_struct mmap_mem_ops = {
- .open = mmap_mem_open,
- .close = mmap_mem_close,
#ifdef CONFIG_HAVE_IOREMAP_PROT
.access = generic_access_phys
#endif
--- head-2011-03-17.orig/drivers/xen/core/Makefile 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-17/drivers/xen/core/Makefile 2011-02-01 14:44:12.000000000 +0100
@@ -10,5 +10,5 @@ obj-$(CONFIG_SYS_HYPERVISOR) += hypervis
obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o
obj-$(CONFIG_XEN_SMPBOOT) += smpboot.o
-obj-$(CONFIG_X86_SMP) += spinlock.o
+obj-$(CONFIG_SMP) += spinlock.o
obj-$(CONFIG_KEXEC) += machine_kexec.o
--- head-2011-03-17.orig/drivers/xen/core/evtchn.c 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/drivers/xen/core/evtchn.c 2011-02-09 13:57:45.000000000 +0100
@@ -154,13 +154,15 @@ static u8 cpu_evtchn[NR_EVENT_CHANNELS];
#else
static u16 cpu_evtchn[NR_EVENT_CHANNELS];
#endif
-static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
+static DEFINE_PER_CPU(unsigned long[BITS_TO_LONGS(NR_EVENT_CHANNELS)],
+ cpu_evtchn_mask);
-static inline unsigned long active_evtchns(unsigned int cpu, shared_info_t *sh,
- unsigned int idx)
+static inline unsigned long active_evtchns(unsigned int idx)
{
+ shared_info_t *sh = HYPERVISOR_shared_info;
+
return (sh->evtchn_pending[idx] &
- cpu_evtchn_mask[cpu][idx] &
+ percpu_read(cpu_evtchn_mask[idx]) &
~sh->evtchn_mask[idx]);
}
@@ -172,10 +174,10 @@ static void bind_evtchn_to_cpu(unsigned
BUG_ON(!test_bit(chn, s->evtchn_mask));
if (irq != -1)
- irq_to_desc(irq)->affinity = cpumask_of_cpu(cpu);
+ cpumask_copy(irq_to_desc(irq)->affinity, cpumask_of(cpu));
- clear_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu_evtchn[chn]]);
- set_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu]);
+ clear_bit(chn, per_cpu(cpu_evtchn_mask, cpu_evtchn[chn]));
+ set_bit(chn, per_cpu(cpu_evtchn_mask, cpu));
cpu_evtchn[chn] = cpu;
}
@@ -188,14 +190,13 @@ static void init_evtchn_cpu_bindings(voi
struct irq_desc *desc = irq_to_desc(i);
if (desc)
- desc->affinity = cpumask_of_cpu(0);
+ cpumask_copy(desc->affinity, cpumask_of(0));
}
memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
for_each_possible_cpu(i)
- memset(cpu_evtchn_mask[i],
- (i == 0) ? ~0 : 0,
- sizeof(cpu_evtchn_mask[i]));
+ memset(per_cpu(cpu_evtchn_mask, i), -!i,
+ sizeof(per_cpu(cpu_evtchn_mask, i)));
}
static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
@@ -205,9 +206,10 @@ static inline unsigned int cpu_from_evtc
#else
-static inline unsigned long active_evtchns(unsigned int cpu, shared_info_t *sh,
- unsigned int idx)
+static inline unsigned long active_evtchns(unsigned int idx)
{
+ shared_info_t *sh = HYPERVISOR_shared_info;
+
return (sh->evtchn_pending[idx] & ~sh->evtchn_mask[idx]);
}
@@ -226,25 +228,15 @@ static inline unsigned int cpu_from_evtc
#endif
-/* Upcall to generic IRQ layer. */
#ifdef CONFIG_X86
-extern unsigned int do_IRQ(struct pt_regs *regs);
void __init xen_init_IRQ(void);
void __init init_IRQ(void)
{
irq_ctx_init(0);
xen_init_IRQ();
}
-#if defined (__i386__)
-static inline void exit_idle(void) {}
-#elif defined (__x86_64__)
#include <asm/idle.h>
#endif
-#define do_IRQ(irq, regs) do { \
- (regs)->orig_ax = ~(irq); \
- do_IRQ((regs)); \
-} while (0)
-#endif
/* Xen will never allocate port zero for any purpose. */
#define VALID_EVTCHN(chn) ((chn) != 0)
@@ -268,13 +260,12 @@ static DEFINE_PER_CPU(unsigned int, curr
/* NB. Interrupts are disabled on entry. */
asmlinkage void __irq_entry evtchn_do_upcall(struct pt_regs *regs)
{
+ struct pt_regs *old_regs = set_irq_regs(regs);
unsigned long l1, l2;
unsigned long masked_l1, masked_l2;
unsigned int l1i, l2i, start_l1i, start_l2i, port, count, i;
int irq;
- unsigned int cpu = smp_processor_id();
- shared_info_t *s = HYPERVISOR_shared_info;
- vcpu_info_t *vcpu_info = &s->vcpu_info[cpu];
+ vcpu_info_t *vcpu_info = current_vcpu_info();
exit_idle();
irq_enter();
@@ -284,7 +275,8 @@ asmlinkage void __irq_entry evtchn_do_up
vcpu_info->evtchn_upcall_pending = 0;
/* Nested invocations bail immediately. */
- if (unlikely(per_cpu(upcall_count, cpu)++))
+ percpu_add(upcall_count, 1);
+ if (unlikely(percpu_read(upcall_count) != 1))
break;
#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
@@ -297,18 +289,22 @@ asmlinkage void __irq_entry evtchn_do_up
* hardirq handlers see an up-to-date system time even if we
* have just woken from a long idle period.
*/
- if ((irq = __get_cpu_var(virq_to_irq)[VIRQ_TIMER]) != -1) {
+ if ((irq = percpu_read(virq_to_irq[VIRQ_TIMER])) != -1) {
port = evtchn_from_irq(irq);
l1i = port / BITS_PER_LONG;
l2i = port % BITS_PER_LONG;
- if (active_evtchns(cpu, s, l1i) & (1ul<<l2i))
- do_IRQ(irq, regs);
+ if (active_evtchns(l1i) & (1ul<<l2i)) {
+ mask_evtchn(port);
+ clear_evtchn(port);
+ if (!handle_irq(irq, regs))
+ BUG();
+ }
}
l1 = xchg(&vcpu_info->evtchn_pending_sel, 0);
- start_l1i = l1i = per_cpu(current_l1i, cpu);
- start_l2i = per_cpu(current_l2i, cpu);
+ start_l1i = l1i = percpu_read(current_l1i);
+ start_l2i = percpu_read(current_l2i);
for (i = 0; l1 != 0; i++) {
masked_l1 = l1 & ((~0UL) << l1i);
@@ -319,7 +315,7 @@ asmlinkage void __irq_entry evtchn_do_up
}
l1i = __ffs(masked_l1);
- l2 = active_evtchns(cpu, s, l1i);
+ l2 = active_evtchns(l1i);
l2i = 0; /* usually scan entire word from start */
if (l1i == start_l1i) {
/* We scan the starting word in two parts. */
@@ -339,17 +335,23 @@ asmlinkage void __irq_entry evtchn_do_up
/* process port */
port = (l1i * BITS_PER_LONG) + l2i;
- if ((irq = evtchn_to_irq[port]) != -1)
- do_IRQ(irq, regs);
- else
+ mask_evtchn(port);
+ if ((irq = evtchn_to_irq[port]) != -1) {
+ clear_evtchn(port);
+ if (!handle_irq(irq, regs)
+ && printk_ratelimit())
+ pr_emerg("No handler for "
+ "irq %d (port %u)\n",
+ irq, port);
+ } else
evtchn_device_upcall(port);
l2i = (l2i + 1) % BITS_PER_LONG;
/* Next caller starts at last processed + 1 */
- per_cpu(current_l1i, cpu) =
- l2i ? l1i : (l1i + 1) % BITS_PER_LONG;
- per_cpu(current_l2i, cpu) = l2i;
+ percpu_write(current_l1i,
+ l2i ? l1i : (l1i + 1) % BITS_PER_LONG);
+ percpu_write(current_l2i, l2i);
} while (l2i != 0);
@@ -361,27 +363,26 @@ asmlinkage void __irq_entry evtchn_do_up
}
/* If there were nested callbacks then we have more to do. */
- count = per_cpu(upcall_count, cpu);
- per_cpu(upcall_count, cpu) = 0;
+ count = percpu_read(upcall_count);
+ percpu_write(upcall_count, 0);
} while (unlikely(count != 1));
irq_exit();
+ set_irq_regs(old_regs);
}
-static struct irq_chip dynirq_chip;
-
-static int find_unbound_irq(unsigned int cpu)
+static int find_unbound_irq(unsigned int cpu, struct irq_chip *chip)
{
static int warned;
int irq;
- for (irq = DYNIRQ_BASE; irq < (DYNIRQ_BASE + NR_DYNIRQS); irq++) {
+ for (irq = DYNIRQ_BASE; irq < nr_irqs; irq++) {
struct irq_desc *desc = irq_to_desc_alloc_cpu(irq, cpu);
struct irq_cfg *cfg = desc->chip_data;
if (!cfg->bindcount) {
desc->status |= IRQ_NOPROBE;
- set_irq_chip_and_handler_name(irq, &dynirq_chip,
+ set_irq_chip_and_handler_name(irq, chip,
handle_fasteoi_irq,
"fasteoi");
return irq;
@@ -397,6 +398,8 @@ static int find_unbound_irq(unsigned int
return -ENOSPC;
}
+static struct irq_chip dynirq_chip;
+
static int bind_caller_port_to_irq(unsigned int caller_port)
{
int irq;
@@ -404,7 +407,7 @@ static int bind_caller_port_to_irq(unsig
spin_lock(&irq_mapping_update_lock);
if ((irq = evtchn_to_irq[caller_port]) == -1) {
- if ((irq = find_unbound_irq(smp_processor_id())) < 0)
+ if ((irq = find_unbound_irq(smp_processor_id(), &dynirq_chip)) < 0)
goto out;
evtchn_to_irq[caller_port] = irq;
@@ -427,7 +430,7 @@ static int bind_local_port_to_irq(unsign
BUG_ON(evtchn_to_irq[local_port] != -1);
- if ((irq = find_unbound_irq(smp_processor_id())) < 0) {
+ if ((irq = find_unbound_irq(smp_processor_id(), &dynirq_chip)) < 0) {
struct evtchn_close close = { .port = local_port };
if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close))
BUG();
@@ -480,7 +483,7 @@ static int bind_virq_to_irq(unsigned int
spin_lock(&irq_mapping_update_lock);
if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) {
- if ((irq = find_unbound_irq(cpu)) < 0)
+ if ((irq = find_unbound_irq(cpu, &dynirq_chip)) < 0)
goto out;
bind_virq.virq = virq;
@@ -513,7 +516,7 @@ static int bind_ipi_to_irq(unsigned int
spin_lock(&irq_mapping_update_lock);
if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) {
- if ((irq = find_unbound_irq(cpu)) < 0)
+ if ((irq = find_unbound_irq(cpu, &dynirq_chip)) < 0)
goto out;
bind_ipi.vcpu = cpu;
@@ -572,7 +575,7 @@ static void unbind_from_irq(unsigned int
/* Zap stats across IRQ changes of use. */
for_each_possible_cpu(cpu)
-#ifdef CONFIG_SPARSE_IRQ
+#ifdef CONFIG_GENERIC_HARDIRQS
irq_to_desc(irq)->kstat_irqs[cpu] = 0;
#else
kstat_cpu(cpu).irqs[irq] = 0;
@@ -690,7 +693,8 @@ int bind_ipi_to_irqhandler(
if (irq < 0)
return irq;
- retval = request_irq(irq, handler, irqflags, devname, dev_id);
+ retval = request_irq(irq, handler, irqflags | IRQF_NO_SUSPEND,
+ devname, dev_id);
if (retval != 0) {
unbind_from_irq(irq);
return retval;
@@ -777,16 +781,6 @@ static unsigned int startup_dynirq(unsig
#define shutdown_dynirq mask_dynirq
-static void ack_dynirq(unsigned int irq)
-{
- int evtchn = evtchn_from_irq(irq);
-
- if (VALID_EVTCHN(evtchn)) {
- mask_evtchn(evtchn);
- clear_evtchn(evtchn);
- }
-}
-
static void end_dynirq(unsigned int irq)
{
if (!(irq_to_desc(irq)->status & IRQ_DISABLED)) {
@@ -803,7 +797,6 @@ static struct irq_chip dynirq_chip = {
.disable = mask_dynirq,
.mask = mask_dynirq,
.unmask = unmask_dynirq,
- .ack = ack_dynirq,
.end = end_dynirq,
.eoi = end_dynirq,
#ifdef CONFIG_SMP
@@ -874,15 +867,18 @@ static void enable_pirq(unsigned int irq
{
struct evtchn_bind_pirq bind_pirq;
int evtchn = evtchn_from_irq(irq);
+ unsigned int pirq = irq - PIRQ_BASE;
if (VALID_EVTCHN(evtchn)) {
- clear_bit(irq - PIRQ_BASE, probing_pirq);
+ if (pirq < nr_pirqs)
+ clear_bit(pirq, probing_pirq);
goto out;
}
bind_pirq.pirq = evtchn_get_xen_pirq(irq);
/* NB. We are happy to share unless we are probing. */
- bind_pirq.flags = test_and_clear_bit(irq - PIRQ_BASE, probing_pirq)
+ bind_pirq.flags = (pirq < nr_pirqs
+ && test_and_clear_bit(pirq, probing_pirq))
|| (irq_to_desc(irq)->status & IRQ_AUTODETECT)
? 0 : BIND_PIRQ__WILL_SHARE;
if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq) != 0) {
@@ -938,7 +934,6 @@ static void unmask_pirq(unsigned int irq
}
#define mask_pirq mask_dynirq
-#define ack_pirq ack_dynirq
static void end_pirq(unsigned int irq)
{
@@ -962,7 +957,6 @@ static struct irq_chip pirq_chip = {
.disable = disable_pirq,
.mask = mask_pirq,
.unmask = unmask_pirq,
- .ack = ack_pirq,
.end = end_pirq,
.eoi = end_pirq,
.set_type = set_type_pirq,
@@ -1103,7 +1097,6 @@ static void restore_cpu_ipis(unsigned in
void irq_resume(void)
{
unsigned int cpu, irq, evtchn;
- struct irq_cfg *cfg;
init_evtchn_cpu_bindings();
@@ -1119,17 +1112,22 @@ void irq_resume(void)
for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
mask_evtchn(evtchn);
- /* Check that no PIRQs are still bound. */
- for (irq = PIRQ_BASE; irq < (PIRQ_BASE + NR_PIRQS); irq++) {
- cfg = irq_cfg(irq);
- BUG_ON(cfg && cfg->info != IRQ_UNBOUND);
- }
-
/* No IRQ <-> event-channel mappings. */
for (irq = 0; irq < nr_irqs; irq++) {
- cfg = irq_cfg(irq);
- if (cfg)
- cfg->info &= ~((1U << _EVTCHN_BITS) - 1);
+ struct irq_cfg *cfg = irq_cfg(irq);
+
+ if (!cfg)
+ continue;
+
+ /* Check that no PIRQs are still bound. */
+#ifdef CONFIG_SPARSE_IRQ
+ if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs)
+ BUG_ON(type_from_irq_cfg(cfg) == IRQT_PIRQ);
+ else
+#endif
+ BUG_ON(cfg->info != IRQ_UNBOUND);
+
+ cfg->info &= ~((1U << _EVTCHN_BITS) - 1);
}
for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
evtchn_to_irq[evtchn] = -1;
@@ -1157,7 +1155,7 @@ int arch_init_chip_data(struct irq_desc
{
if (!desc->chip_data) {
/* By default all event channels notify CPU#0. */
- desc->affinity = cpumask_of_cpu(0);
+ cpumask_copy(desc->affinity, cpumask_of(0));
desc->chip_data = kzalloc(sizeof(struct irq_cfg), GFP_ATOMIC);
}
@@ -1170,12 +1168,55 @@ int arch_init_chip_data(struct irq_desc
}
#endif
+#ifdef CONFIG_SPARSE_IRQ
+int nr_pirqs = NR_PIRQS;
+EXPORT_SYMBOL_GPL(nr_pirqs);
+
+int __init arch_probe_nr_irqs(void)
+{
+ int nr = 256, nr_irqs_gsi;
+
+ if (is_initial_xendomain()) {
+ nr_irqs_gsi = acpi_probe_gsi();
+#ifdef CONFIG_X86_IO_APIC
+ if (nr_irqs_gsi <= NR_IRQS_LEGACY) {
+ /* for acpi=off or acpi not compiled in */
+ int idx;
+
+ for (nr_irqs_gsi = idx = 0; idx < nr_ioapics; idx++)
+ nr_irqs_gsi += io_apic_get_redir_entries(idx) + 1;
+ }
+#endif
+ if (nr_irqs_gsi < NR_IRQS_LEGACY)
+ nr_irqs_gsi = NR_IRQS_LEGACY;
+#ifdef CONFIG_PCI_MSI
+ nr += max(nr_irqs_gsi * 16, nr_cpu_ids * 8);
+#endif
+ } else {
+ nr_irqs_gsi = NR_VECTORS;
+#ifdef CONFIG_PCI_MSI
+ nr += max(NR_IRQS_LEGACY * 16, nr_cpu_ids * 8);
+#endif
+ }
+
+ if (nr_pirqs > nr_irqs_gsi)
+ nr_pirqs = nr_irqs_gsi;
+ if (nr > min_t(int, NR_DYNIRQS, NR_EVENT_CHANNELS))
+ nr = min_t(int, NR_DYNIRQS, NR_EVENT_CHANNELS);
+ nr_irqs = min_t(int, nr_pirqs + nr, PAGE_SIZE * 8);
+
+ printk(KERN_DEBUG "nr_pirqs: %d\n", nr_pirqs);
+
+ return 0;
+}
+#endif
+
#if defined(CONFIG_X86_IO_APIC)
int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
{
struct physdev_irq irq_op;
- if (irq < PIRQ_BASE || irq - PIRQ_BASE >= NR_PIRQS)
+ if (irq < PIRQ_BASE || irq - PIRQ_BASE >= nr_pirqs)
return -EINVAL;
if (cfg->vector)
@@ -1198,7 +1239,7 @@ int assign_irq_vector(int irq, struct ir
void evtchn_register_pirq(int irq)
{
- BUG_ON(irq < PIRQ_BASE || irq - PIRQ_BASE >= NR_PIRQS);
+ BUG_ON(irq < PIRQ_BASE || irq - PIRQ_BASE >= nr_pirqs);
if (identity_mapped_irq(irq) || type_from_irq(irq) != IRQT_UNBOUND)
return;
irq_cfg(irq)->info = mk_irq_info(IRQT_PIRQ, irq, 0);
@@ -1209,9 +1250,29 @@ void evtchn_register_pirq(int irq)
int evtchn_map_pirq(int irq, int xen_pirq)
{
if (irq < 0) {
+#ifdef CONFIG_SPARSE_IRQ
+ spin_lock(&irq_mapping_update_lock);
+ irq = find_unbound_irq(smp_processor_id(), &pirq_chip);
+ if (irq >= 0) {
+ struct irq_desc *desc;
+ struct irq_cfg *cfg;
+
+ desc = irq_to_desc_alloc_node(irq, numa_node_id());
+ cfg = desc->chip_data;
+ BUG_ON(type_from_irq(irq) != IRQT_UNBOUND);
+ cfg->bindcount++;
+ cfg->info = mk_irq_info(IRQT_PIRQ, xen_pirq, 0);
+ }
+ spin_unlock(&irq_mapping_update_lock);
+ if (irq < 0)
+ return irq;
+ } else if (irq >= PIRQ_BASE && irq < PIRQ_BASE + nr_pirqs) {
+ WARN_ONCE(1, "Non-MSI IRQ#%d (Xen %d)\n", irq, xen_pirq);
+ return -EINVAL;
+#else
static DEFINE_SPINLOCK(irq_alloc_lock);
- irq = PIRQ_BASE + NR_PIRQS - 1;
+ irq = PIRQ_BASE + nr_pirqs - 1;
spin_lock(&irq_alloc_lock);
do {
struct irq_desc *desc;
@@ -1233,6 +1294,7 @@ int evtchn_map_pirq(int irq, int xen_pir
return -ENOSPC;
set_irq_chip_and_handler_name(irq, &pirq_chip,
handle_fasteoi_irq, "fasteoi");
+#endif
} else if (!xen_pirq) {
if (unlikely(type_from_irq(irq) != IRQT_PIRQ))
return -EINVAL;
@@ -1244,6 +1306,9 @@ int evtchn_map_pirq(int irq, int xen_pir
*/
set_irq_chip_and_handler(irq, NULL, NULL);
irq_cfg(irq)->info = IRQ_UNBOUND;
+#ifdef CONFIG_SPARSE_IRQ
+ irq_cfg(irq)->bindcount--;
+#endif
return 0;
} else if (type_from_irq(irq) != IRQT_PIRQ
|| index_from_irq(irq) != xen_pirq) {
@@ -1270,8 +1335,13 @@ void __init xen_init_IRQ(void)
init_evtchn_cpu_bindings();
+#ifdef CONFIG_SPARSE_IRQ
+ i = nr_irqs;
+#else
+ i = nr_pirqs;
+#endif
pirq_needs_eoi = alloc_bootmem_pages(sizeof(unsigned long)
- * BITS_TO_LONGS(ALIGN(NR_PIRQS, PAGE_SIZE * 8)));
+ * BITS_TO_LONGS(ALIGN(i, PAGE_SIZE * 8)));
eoi_gmfn.gmfn = virt_to_machine(pirq_needs_eoi) >> PAGE_SHIFT;
if (HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn, &eoi_gmfn) == 0)
pirq_eoi_does_unmask = true;
@@ -1287,7 +1357,7 @@ void __init xen_init_IRQ(void)
handle_fasteoi_irq, "fasteoi");
}
- for (i = PIRQ_BASE; i < (PIRQ_BASE + NR_PIRQS); i++) {
+ for (i = PIRQ_BASE; i < (PIRQ_BASE + nr_pirqs); i++) {
#else
for (i = PIRQ_BASE; i < (PIRQ_BASE + NR_IRQS_LEGACY); i++) {
#endif
--- head-2011-03-17.orig/drivers/xen/core/smpboot.c 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/drivers/xen/core/smpboot.c 2011-02-01 14:44:12.000000000 +0100
@@ -18,7 +18,6 @@
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <asm/desc.h>
-#include <asm/arch_hooks.h>
#include <asm/pgalloc.h>
#include <xen/evtchn.h>
#include <xen/interface/vcpu.h>
@@ -188,7 +187,7 @@ static void __cpuinit cpu_initialize_con
smp_trap_init(ctxt.trap_ctxt);
ctxt.ldt_ents = 0;
- ctxt.gdt_frames[0] = virt_to_mfn(get_cpu_gdt_table(cpu));
+ ctxt.gdt_frames[0] = arbitrary_virt_to_mfn(get_cpu_gdt_table(cpu));
ctxt.gdt_ents = GDT_SIZE / 8;
ctxt.user_regs.cs = __KERNEL_CS;
@@ -206,12 +205,13 @@ static void __cpuinit cpu_initialize_con
ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
ctxt.user_regs.fs = __KERNEL_PERCPU;
+ ctxt.user_regs.gs = __KERNEL_STACK_CANARY;
#else /* __x86_64__ */
ctxt.syscall_callback_eip = (unsigned long)system_call;
ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));
- ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu));
+ ctxt.gs_base_kernel = per_cpu_offset(cpu);
#endif
if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt))
@@ -256,9 +256,6 @@ void __init smp_prepare_cpus(unsigned in
if (IS_ERR(idle))
panic("failed fork for CPU %d", cpu);
-#ifdef __i386__
- init_gdt(cpu);
-#endif
gdt_addr = get_cpu_gdt_table(cpu);
make_page_readonly(gdt_addr, XENFEAT_writable_descriptor_tables);
@@ -269,12 +266,12 @@ void __init smp_prepare_cpus(unsigned in
cpu_data(cpu).cpu_index = cpu;
#ifdef __x86_64__
- cpu_pda(cpu)->pcurrent = idle;
- cpu_pda(cpu)->cpunumber = cpu;
clear_tsk_thread_flag(idle, TIF_FORK);
-#else
- per_cpu(current_task, cpu) = idle;
+ per_cpu(kernel_stack, cpu) =
+ (unsigned long)task_stack_page(idle) -
+ KERNEL_STACK_OFFSET + THREAD_SIZE;
#endif
+ per_cpu(current_task, cpu) = idle;
irq_ctx_init(cpu);
@@ -298,10 +295,7 @@ void __init smp_prepare_cpus(unsigned in
void __init smp_prepare_boot_cpu(void)
{
-#ifdef __i386__
- init_gdt(smp_processor_id());
-#endif
- switch_to_new_gdt();
+ switch_to_new_gdt(smp_processor_id());
prefill_possible_map();
}
--- head-2011-03-17.orig/drivers/xen/core/spinlock.c 2011-03-15 16:51:35.000000000 +0100
+++ head-2011-03-17/drivers/xen/core/spinlock.c 2011-03-15 16:43:45.000000000 +0100
@@ -92,7 +92,7 @@ static unsigned int spin_adjust(struct s
unsigned int xen_spin_adjust(const raw_spinlock_t *lock, unsigned int token)
{
- return spin_adjust(x86_read_percpu(spinning), lock, token);
+ return spin_adjust(percpu_read(spinning), lock, token);
}
bool xen_spin_wait(raw_spinlock_t *lock, unsigned int *ptok,
@@ -105,21 +105,21 @@ bool xen_spin_wait(raw_spinlock_t *lock,
/* If kicker interrupt not initialized yet, just spin. */
if (unlikely(!cpu_online(raw_smp_processor_id()))
- || unlikely(!x86_read_percpu(poll_evtchn)))
+ || unlikely(!percpu_read(poll_evtchn)))
return false;
/* announce we're spinning */
spinning.ticket = *ptok >> TICKET_SHIFT;
spinning.lock = lock;
- spinning.prev = x86_read_percpu(spinning);
+ spinning.prev = percpu_read(spinning);
smp_wmb();
- x86_write_percpu(spinning, &spinning);
+ percpu_write(spinning, &spinning);
upcall_mask = current_vcpu_info()->evtchn_upcall_mask;
do {
bool nested = false;
- clear_evtchn(x86_read_percpu(poll_evtchn));
+ clear_evtchn(percpu_read(poll_evtchn));
/*
* Check again to make sure it didn't become free while
@@ -132,7 +132,7 @@ bool xen_spin_wait(raw_spinlock_t *lock,
* without rechecking the lock.
*/
if (spinning.prev)
- set_evtchn(x86_read_percpu(poll_evtchn));
+ set_evtchn(percpu_read(poll_evtchn));
rc = true;
break;
}
@@ -187,7 +187,7 @@ bool xen_spin_wait(raw_spinlock_t *lock,
current_vcpu_info()->evtchn_upcall_mask = upcall_mask;
- rc = !test_evtchn(x86_read_percpu(poll_evtchn));
+ rc = !test_evtchn(percpu_read(poll_evtchn));
if (!rc)
inc_irq_stat(irq_lock_count);
} while (spinning.prev || rc);
@@ -199,11 +199,11 @@ bool xen_spin_wait(raw_spinlock_t *lock,
/* announce we're done */
other = spinning.prev;
- x86_write_percpu(spinning, other);
+ percpu_write(spinning, other);
raw_local_irq_disable();
- rm_idx = x86_read_percpu(rm_seq.idx);
+ rm_idx = percpu_read(rm_seq.idx);
smp_wmb();
- x86_write_percpu(rm_seq.idx, rm_idx + 1);
+ percpu_write(rm_seq.idx, rm_idx + 1);
mb();
/*
@@ -227,7 +227,7 @@ bool xen_spin_wait(raw_spinlock_t *lock,
}
rm_idx &= 1;
- while (x86_read_percpu(rm_seq.ctr[rm_idx].counter))
+ while (percpu_read(rm_seq.ctr[rm_idx].counter))
cpu_relax();
raw_local_irq_restore(upcall_mask);
*ptok = lock->cur | (spinning.ticket << TICKET_SHIFT);
--- head-2011-03-17.orig/drivers/xen/netback/interface.c 2011-02-17 10:15:18.000000000 +0100
+++ head-2011-03-17/drivers/xen/netback/interface.c 2011-02-17 10:16:00.000000000 +0100
@@ -162,7 +162,7 @@ static void netbk_get_drvinfo(struct net
struct ethtool_drvinfo *info)
{
strcpy(info->driver, "netbk");
- strcpy(info->bus_info, dev->dev.parent->bus_id);
+ strcpy(info->bus_info, dev_name(dev->dev.parent));
}
static const struct netif_stat {
--- head-2011-03-17.orig/drivers/xen/netback/netback.c 2011-03-01 11:52:09.000000000 +0100
+++ head-2011-03-17/drivers/xen/netback/netback.c 2011-02-01 14:44:12.000000000 +0100
@@ -342,7 +342,7 @@ int netif_be_start_xmit(struct sk_buff *
*/
netif->tx_queue_timeout.data = (unsigned long)netif;
netif->tx_queue_timeout.function = tx_queue_callback;
- __mod_timer(&netif->tx_queue_timeout, jiffies + HZ/2);
+ mod_timer(&netif->tx_queue_timeout, jiffies + HZ/2);
}
}
@@ -363,7 +363,7 @@ static void xen_network_done_notify(void
static struct net_device *eth0_dev = NULL;
if (unlikely(eth0_dev == NULL))
eth0_dev = __dev_get_by_name(&init_net, "eth0");
- netif_rx_schedule(???);
+ napi_schedule(???);
}
/*
* Add following to poll() function in NAPI driver (Tigon3 is example):
@@ -1305,8 +1305,7 @@ static void net_tx_action(unsigned long
(unsigned long)netif;
netif->credit_timeout.function =
tx_credit_callback;
- __mod_timer(&netif->credit_timeout,
- next_credit);
+ mod_timer(&netif->credit_timeout, next_credit);
netif_put(netif);
continue;
}
--- head-2011-03-17.orig/drivers/xen/netfront/netfront.c 2011-02-09 16:04:26.000000000 +0100
+++ head-2011-03-17/drivers/xen/netfront/netfront.c 2011-02-09 16:04:51.000000000 +0100
@@ -103,7 +103,7 @@ static const int MODPARM_rx_flip = 0;
static inline void dev_disable_gso_features(struct net_device *dev)
{
/* Turn off all GSO bits except ROBUST. */
- dev->features &= (1 << NETIF_F_GSO_SHIFT) - 1;
+ dev->features &= ~NETIF_F_GSO_MASK;
dev->features |= NETIF_F_GSO_ROBUST;
}
#elif defined(NETIF_F_TSO)
@@ -632,7 +632,7 @@ static int network_open(struct net_devic
if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx)){
netfront_accelerator_call_stop_napi_irq(np, dev);
- netif_rx_schedule(&np->napi);
+ napi_schedule(&np->napi);
}
}
spin_unlock_bh(&np->rx_lock);
@@ -703,7 +703,7 @@ static void rx_refill_timeout(unsigned l
netfront_accelerator_call_stop_napi_irq(np, dev);
- netif_rx_schedule(&np->napi);
+ napi_schedule(&np->napi);
}
static void network_alloc_rx_buffers(struct net_device *dev)
@@ -1057,7 +1057,7 @@ static irqreturn_t netif_int(int irq, vo
if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx)) {
netfront_accelerator_call_stop_napi_irq(np, dev);
- netif_rx_schedule(&np->napi);
+ napi_schedule(&np->napi);
}
}
@@ -1514,7 +1514,7 @@ err:
}
if (!more_to_do && !accel_more_to_do)
- __netif_rx_complete(napi);
+ __napi_complete(napi);
local_irq_restore(flags);
}
@@ -1803,7 +1803,7 @@ static void netfront_get_drvinfo(struct
struct ethtool_drvinfo *info)
{
strcpy(info->driver, "netfront");
- strcpy(info->bus_info, dev->dev.parent->bus_id);
+ strcpy(info->bus_info, dev_name(dev->dev.parent));
}
static int network_connect(struct net_device *dev)
--- head-2011-03-17.orig/drivers/xen/pciback/conf_space_header.c 2011-01-31 17:56:27.000000000 +0100
+++ head-2011-03-17/drivers/xen/pciback/conf_space_header.c 2011-02-10 12:59:56.000000000 +0100
@@ -24,7 +24,7 @@ static int command_read(struct pci_dev *
int ret;
ret = pciback_read_config_word(dev, offset, value, data);
- if (!atomic_read(&dev->enable_cnt))
+ if (!pci_is_enabled(dev))
return ret;
for (i = 0; i < PCI_ROM_RESOURCE; i++) {
@@ -41,14 +41,14 @@ static int command_write(struct pci_dev
{
int err;
- if (!atomic_read(&dev->enable_cnt) && is_enable_cmd(value)) {
+ if (!pci_is_enabled(dev) && is_enable_cmd(value)) {
if (unlikely(verbose_request))
printk(KERN_DEBUG "pciback: %s: enable\n",
pci_name(dev));
err = pci_enable_device(dev);
if (err)
return err;
- } else if (atomic_read(&dev->enable_cnt) && !is_enable_cmd(value)) {
+ } else if (pci_is_enabled(dev) && !is_enable_cmd(value)) {
if (unlikely(verbose_request))
printk(KERN_DEBUG "pciback: %s: disable\n",
pci_name(dev));
--- head-2011-03-17.orig/drivers/xen/sfc_netfront/accel_msg.c 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/drivers/xen/sfc_netfront/accel_msg.c 2011-02-01 14:44:12.000000000 +0100
@@ -47,7 +47,7 @@ static void vnic_start_interrupts(netfro
netfront_accel_disable_net_interrupts(vnic);
vnic->irq_enabled = 0;
NETFRONT_ACCEL_STATS_OP(vnic->stats.poll_schedule_count++);
- netif_rx_schedule(&np->napi);
+ napi_schedule(&np->napi);
} else {
/*
* Nothing yet, make sure we get interrupts through
@@ -532,7 +532,7 @@ irqreturn_t netfront_accel_net_channel_i
vnic->stats.event_count_since_irq;
vnic->stats.event_count_since_irq = 0;
#endif
- netif_rx_schedule(&np->napi);
+ napi_schedule(&np->napi);
}
else {
spin_unlock_irqrestore(&vnic->irq_enabled_lock, flags);
--- head-2011-03-17.orig/drivers/xen/usbback/usbstub.c 2011-03-11 10:55:46.000000000 +0100
+++ head-2011-03-17/drivers/xen/usbback/usbstub.c 2011-02-01 14:44:12.000000000 +0100
@@ -188,7 +188,7 @@ static int usbstub_probe(struct usb_inte
const struct usb_device_id *id)
{
struct usb_device *udev = interface_to_usbdev(intf);
- char *busid = intf->dev.parent->bus_id;
+ const char *busid = dev_name(intf->dev.parent);
struct vusb_port_id *portid = NULL;
struct usbstub *stub = NULL;
usbif_t *usbif = NULL;
--- head-2011-03-17.orig/drivers/xen/usbfront/usbfront-dbg.c 2011-01-31 18:01:51.000000000 +0100
+++ head-2011-03-17/drivers/xen/usbfront/usbfront-dbg.c 2011-02-01 14:44:12.000000000 +0100
@@ -64,7 +64,7 @@ static ssize_t show_statistics(struct de
"%s\n"
"xenhcd, hcd state %d\n",
hcd->self.controller->bus->name,
- hcd->self.controller->bus_id,
+ dev_name(hcd->self.controller),
hcd->product_desc,
hcd->state);
size -= temp;
--- head-2011-03-17.orig/drivers/xen/usbfront/xenbus.c 2011-01-31 17:56:27.000000000 +0100
+++ head-2011-03-17/drivers/xen/usbfront/xenbus.c 2011-02-01 14:44:12.000000000 +0100
@@ -252,10 +252,10 @@ static struct usb_hcd *create_hcd(struct
}
switch (usb_ver) {
case USB_VER_USB11:
- hcd = usb_create_hcd(&xen_usb11_hc_driver, &dev->dev, dev->dev.bus_id);
+ hcd = usb_create_hcd(&xen_usb11_hc_driver, &dev->dev, dev_name(&dev->dev));
break;
case USB_VER_USB20:
- hcd = usb_create_hcd(&xen_usb20_hc_driver, &dev->dev, dev->dev.bus_id);
+ hcd = usb_create_hcd(&xen_usb20_hc_driver, &dev->dev, dev_name(&dev->dev));
break;
default:
xenbus_dev_fatal(dev, err, "invalid usb-ver");
--- head-2011-03-17.orig/drivers/xen/xenbus/xenbus_probe.c 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/drivers/xen/xenbus/xenbus_probe.c 2011-02-01 14:44:12.000000000 +0100
@@ -231,7 +231,7 @@ static struct xen_bus_type xenbus_fronte
},
#if defined(CONFIG_XEN) || defined(MODULE)
.dev = {
- .bus_id = "xen",
+ .init_name = "xen",
},
#endif
};
--- head-2011-03-17.orig/drivers/xen/xenbus/xenbus_probe_backend.c 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/drivers/xen/xenbus/xenbus_probe_backend.c 2011-02-01 14:44:12.000000000 +0100
@@ -129,7 +129,7 @@ static struct xen_bus_type xenbus_backen
.dev_attrs = xenbus_backend_attrs,
},
.dev = {
- .bus_id = "xen-backend",
+ .init_name = "xen-backend",
},
};
--- head-2011-03-17.orig/kernel/sched.c 2011-03-17 14:35:45.000000000 +0100
+++ head-2011-03-17/kernel/sched.c 2011-02-01 14:44:12.000000000 +0100
@@ -4020,6 +4020,12 @@ need_resched_nonpreemptible:
EXPORT_SYMBOL(schedule);
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+#include <asm/mutex.h>
+
+#ifndef arch_cpu_is_running
+#define arch_cpu_is_running(cpu) true
+#endif
+
/*
* Look out! "owner" is an entirely speculative pointer
* access and not reliable.
@@ -4078,7 +4084,8 @@ int mutex_spin_on_owner(struct mutex *lo
/*
* Is that owner really running on that cpu?
*/
- if (task_thread_info(rq->curr) != owner || need_resched())
+ if (task_thread_info(rq->curr) != owner || need_resched()
+ || !arch_cpu_is_running(cpu))
return 0;
arch_mutex_cpu_relax();
--- head-2011-03-17.orig/lib/swiotlb-xen.c 2011-02-01 14:42:26.000000000 +0100
+++ head-2011-03-17/lib/swiotlb-xen.c 2011-02-01 14:44:12.000000000 +0100
@@ -183,7 +183,7 @@ static void *swiotlb_bus_to_virt(dma_add
return phys_to_virt(swiotlb_bus_to_phys(address));
}
-int __weak swiotlb_arch_range_needs_mapping(void *ptr, size_t size)
+int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size)
{
return 0;
}
@@ -546,13 +546,13 @@ swiotlb_full(struct device *dev, size_t
* Once the device is given the dma address, the device owns this memory until
* either swiotlb_unmap_single or swiotlb_dma_sync_single is performed.
*/
-static dma_addr_t
-_swiotlb_map_single(struct device *hwdev, phys_addr_t paddr, size_t size,
- int dir, struct dma_attrs *attrs)
-{
- struct page *page = pfn_to_page(paddr >> PAGE_SHIFT);
- dma_addr_t dev_addr = gnttab_dma_map_page(page) +
- offset_in_page(paddr);
+dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size,
+ enum dma_data_direction dir,
+ struct dma_attrs *attrs)
+{
+ phys_addr_t phys = page_to_pseudophys(page) + offset;
+ dma_addr_t dev_addr = gnttab_dma_map_page(page) + offset;
void *map;
BUG_ON(dir == DMA_NONE);
@@ -562,44 +562,24 @@ _swiotlb_map_single(struct device *hwdev
* we can safely return the device addr and not worry about bounce
* buffering it.
*/
- if (!address_needs_mapping(hwdev, dev_addr, size) &&
- !range_needs_mapping(paddr, size))
+ if (!address_needs_mapping(dev, dev_addr, size) &&
+ !range_needs_mapping(phys, size))
return dev_addr;
/*
* Oh well, have to allocate and map a bounce buffer.
*/
gnttab_dma_unmap_page(dev_addr);
- map = map_single(hwdev, paddr, size, dir);
+ map = map_single(dev, phys, size, dir);
if (!map) {
- swiotlb_full(hwdev, size, dir, 1);
+ swiotlb_full(dev, size, dir, 1);
map = io_tlb_overflow_buffer;
}
- dev_addr = swiotlb_virt_to_bus(hwdev, map);
+ dev_addr = swiotlb_virt_to_bus(dev, map);
return dev_addr;
}
-
-dma_addr_t
-swiotlb_map_single_attrs(struct device *hwdev, void *ptr, size_t size,
- int dir, struct dma_attrs *attrs)
-{
- return _swiotlb_map_single(hwdev, virt_to_phys(ptr), size, dir, attrs);
-}
-EXPORT_SYMBOL(swiotlb_map_single_attrs);
-
-dma_addr_t
-swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir)
-{
- return _swiotlb_map_single(hwdev, virt_to_phys(ptr), size, dir, NULL);
-}
-EXPORT_SYMBOL(swiotlb_map_single);
-
-dma_addr_t
-swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size, int dir)
-{
- return _swiotlb_map_single(hwdev, paddr, size, dir, NULL);
-}
+EXPORT_SYMBOL_GPL(swiotlb_map_page);
/*
* Unmap a single streaming mode DMA translation. The dma_addr and size must
@@ -609,9 +589,9 @@ swiotlb_map_single_phys(struct device *h
* After this call, reads by the cpu to the buffer are guaranteed to see
* whatever the device wrote there.
*/
-void
-swiotlb_unmap_single_attrs(struct device *hwdev, dma_addr_t dev_addr,
- size_t size, int dir, struct dma_attrs *attrs)
+void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
+ size_t size, enum dma_data_direction dir,
+ struct dma_attrs *attrs)
{
char *dma_addr = swiotlb_bus_to_virt(dev_addr);
@@ -621,15 +601,7 @@ swiotlb_unmap_single_attrs(struct device
else
gnttab_dma_unmap_page(dev_addr);
}
-EXPORT_SYMBOL(swiotlb_unmap_single_attrs);
-
-void
-swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size,
- int dir)
-{
- return swiotlb_unmap_single_attrs(hwdev, dev_addr, size, dir, NULL);
-}
-EXPORT_SYMBOL(swiotlb_unmap_single);
+EXPORT_SYMBOL_GPL(swiotlb_unmap_page);
/*
* Make physical memory consistent for a single streaming mode DMA translation
@@ -654,7 +626,7 @@ swiotlb_sync_single(struct device *hwdev
void
swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
- size_t size, int dir)
+ size_t size, enum dma_data_direction dir)
{
swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU);
}
@@ -662,7 +634,7 @@ EXPORT_SYMBOL(swiotlb_sync_single_for_cp
void
swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
- size_t size, int dir)
+ size_t size, enum dma_data_direction dir)
{
swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE);
}
@@ -685,7 +657,8 @@ swiotlb_sync_single_range(struct device
void
swiotlb_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
- unsigned long offset, size_t size, int dir)
+ unsigned long offset, size_t size,
+ enum dma_data_direction dir)
{
swiotlb_sync_single_range(hwdev, dev_addr, offset, size, dir,
SYNC_FOR_CPU);
@@ -694,7 +667,8 @@ EXPORT_SYMBOL_GPL(swiotlb_sync_single_ra
void
swiotlb_sync_single_range_for_device(struct device *hwdev, dma_addr_t dev_addr,
- unsigned long offset, size_t size, int dir)
+ unsigned long offset, size_t size,
+ enum dma_data_direction dir)
{
swiotlb_sync_single_range(hwdev, dev_addr, offset, size, dir,
SYNC_FOR_DEVICE);
@@ -719,7 +693,7 @@ EXPORT_SYMBOL_GPL(swiotlb_sync_single_ra
*/
int
swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
- int dir, struct dma_attrs *attrs)
+ enum dma_data_direction dir, struct dma_attrs *attrs)
{
struct scatterlist *sg;
int i;
@@ -771,7 +745,7 @@ EXPORT_SYMBOL(swiotlb_map_sg);
*/
void
swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
- int nelems, int dir, struct dma_attrs *attrs)
+ int nelems, enum dma_data_direction dir, struct dma_attrs *attrs)
{
struct scatterlist *sg;
int i;
@@ -821,7 +795,7 @@ swiotlb_sync_sg(struct device *hwdev, st
void
swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
- int nelems, int dir)
+ int nelems, enum dma_data_direction dir)
{
swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU);
}
@@ -829,7 +803,7 @@ EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu);
void
swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
- int nelems, int dir)
+ int nelems, enum dma_data_direction dir)
{
swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE);
}
--- head-2011-03-17.orig/mm/page_alloc.c 2011-02-08 10:03:14.000000000 +0100
+++ head-2011-03-17/mm/page_alloc.c 2011-02-08 10:05:20.000000000 +0100
@@ -5005,11 +5005,9 @@ static void __setup_per_zone_wmarks(void
}
#ifdef CONFIG_XEN
- for_each_zone(zone) {
+ for_each_populated_zone(zone) {
unsigned int cpu;
- if (!populated_zone(zone))
- continue;
for_each_online_cpu(cpu) {
unsigned long high;