qubes-linux-kernel/patches.xen/xen3-patch-2.6.30

From: Linux Kernel Mailing List <linux-kernel@vger.kernel.org>
Subject: Linux 2.6.30
Patch-mainline: 2.6.30
This patch contains the differences between 2.6.29 and 2.6.30.
Acked-by: Jeff Mahoney <jeffm@suse.com>
Automatically created from "patches.kernel.org/patch-2.6.30" by xen-port-patches.py
--- head-2010-05-25.orig/arch/ia64/include/asm/xen/hypervisor.h 2010-03-24 15:14:47.000000000 +0100
+++ head-2010-05-25/arch/ia64/include/asm/xen/hypervisor.h 2010-03-24 15:25:06.000000000 +0100
@@ -34,13 +34,13 @@
#define _ASM_IA64_XEN_HYPERVISOR_H
#include <linux/err.h>
+#ifdef CONFIG_PARAVIRT_XEN
#include <xen/interface/xen.h>
#include <xen/interface/version.h> /* to compile feature.c */
#include <xen/features.h> /* to compile xen-netfront.c */
#include <xen/xen.h>
#include <asm/xen/hypercall.h>
-#ifdef CONFIG_PARAVIRT_XEN
extern struct shared_info *HYPERVISOR_shared_info;
extern struct start_info *xen_start_info;
--- head-2010-05-25.orig/arch/ia64/kernel/vmlinux.lds.S 2010-05-25 09:12:09.000000000 +0200
+++ head-2010-05-25/arch/ia64/kernel/vmlinux.lds.S 2010-03-24 15:25:06.000000000 +0100
@@ -182,7 +182,7 @@ SECTIONS
__start_gate_section = .;
*(.data.gate)
__stop_gate_section = .;
-#ifdef CONFIG_XEN
+#ifdef CONFIG_PARAVIRT_XEN
. = ALIGN(PAGE_SIZE);
__xen_start_gate_section = .;
*(.data.gate.xen)
--- head-2010-05-25.orig/arch/x86/Kconfig 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/Kconfig 2010-03-24 15:25:06.000000000 +0100
@@ -49,8 +49,8 @@ config X86
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_DMA_API_DEBUG
select HAVE_KERNEL_GZIP
- select HAVE_KERNEL_BZIP2
- select HAVE_KERNEL_LZMA
+ select HAVE_KERNEL_BZIP2 if !XEN
+ select HAVE_KERNEL_LZMA if !XEN
select HAVE_KERNEL_LZO
select HAVE_HW_BREAKPOINT
select PERF_EVENTS
@@ -337,11 +337,11 @@ config X86_XEN
config X86_BIGSMP
bool "Support for big SMP systems with more than 8 CPUs"
- depends on X86_32 && SMP
+ depends on X86_32 && SMP && !XEN
---help---
This option is needed for the systems that have more than 8 CPUs
-if X86_32
+if X86_32 && !XEN
config X86_EXTENDED_PLATFORM
bool "Support for extended (non-PC) x86 platforms"
default y
@@ -371,7 +371,7 @@ config X86_64_XEN
help
This option will compile a kernel compatible with Xen hypervisor
-if X86_64
+if X86_64 && !XEN
config X86_EXTENDED_PLATFORM
bool "Support for extended (non-PC) x86 platforms"
default y
@@ -842,7 +842,7 @@ config MAXSMP
config NR_CPUS
int "Maximum number of CPUs" if SMP && !MAXSMP
- range 2 8 if SMP && X86_32 && !X86_BIGSMP
+ range 2 8 if SMP && X86_32 && !X86_BIGSMP && !X86_XEN
range 2 512 if SMP && !MAXSMP
default "1" if !SMP
default "4096" if MAXSMP
@@ -916,10 +916,6 @@ config X86_VISWS_APIC
def_bool y
depends on X86_32 && X86_VISWS
-config X86_XEN_GENAPIC
- def_bool y
- depends on X86_64_XEN
-
config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
bool "Reroute for broken boot IRQs"
default n
--- head-2010-05-25.orig/arch/x86/Makefile 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/Makefile 2010-03-24 15:25:06.000000000 +0100
@@ -111,10 +111,6 @@ endif
# prevent gcc from generating any FP code by mistake
KBUILD_CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
-# Xen subarch support
-mflags-$(CONFIG_XEN) := -Iarch/x86/include/mach-xen
-mcore-$(CONFIG_XEN) := arch/x86/mach-xen/
-
KBUILD_CFLAGS += $(mflags-y)
KBUILD_AFLAGS += $(mflags-y)
@@ -187,10 +183,10 @@ endif
$(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
$(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
$(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@
+endif
$(BOOT_TARGETS): vmlinux
$(Q)$(MAKE) $(build)=$(boot) $@
-endif
PHONY += install
install:
--- head-2010-05-25.orig/arch/x86/boot/Makefile 2010-03-24 15:01:37.000000000 +0100
+++ head-2010-05-25/arch/x86/boot/Makefile 2010-03-24 15:25:06.000000000 +0100
@@ -204,6 +204,12 @@ $(obj)/vmlinux-stripped: OBJCOPYFLAGS :=
$(obj)/vmlinux-stripped: vmlinux FORCE
$(call if_changed,objcopy)
+ifndef CONFIG_XEN
+bzImage := bzImage
+else
+bzImage := vmlinuz
+endif
+
install:
- sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/bzImage \
+ sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/$(bzImage) \
System.map "$(INSTALL_PATH)"
--- head-2010-05-25.orig/arch/x86/ia32/ia32entry-xen.S 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/ia32/ia32entry-xen.S 2010-03-24 15:25:06.000000000 +0100
@@ -502,7 +502,7 @@ ia32_sys_call_table:
.quad sys32_olduname
.quad sys_umask /* 60 */
.quad sys_chroot
- .quad sys32_ustat
+ .quad compat_sys_ustat
.quad sys_dup2
.quad sys_getppid
.quad sys_getpgrp /* 65 */
@@ -773,4 +773,6 @@ ia32_sys_call_table:
.quad sys_dup3 /* 330 */
.quad sys_pipe2
.quad sys_inotify_init1
+ .quad compat_sys_preadv
+ .quad compat_sys_pwritev
ia32_syscall_end:
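
The two entries appended above wire the 32-bit preadv/pwritev syscalls to their compat wrappers, and sys32_ustat is replaced by the generic compat_sys_ustat. Compat wrappers conventionally receive a 64-bit file offset from 32-bit userspace as two 32-bit halves; a minimal runnable sketch of that reassembly (argument names are illustrative, not the kernel's exact signature):

#include <stdint.h>
#include <stdio.h>

/* Sketch: how a compat wrapper rebuilds a 64-bit offset that a 32-bit
 * caller passed as two registers (low half first, then high half). */
static int64_t compat_combine_offset(uint32_t pos_low, uint32_t pos_high)
{
        return ((int64_t)pos_high << 32) | pos_low;
}

int main(void)
{
        /* offset 0x100000200 split the way a 32-bit caller would pass it */
        printf("%lld\n", (long long)compat_combine_offset(0x200, 0x1));
        return 0;
}
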
--- head-2010-05-25.orig/arch/x86/include/asm/kexec.h 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/include/asm/kexec.h 2010-03-24 15:25:06.000000000 +0100
@@ -21,8 +21,14 @@
# define PA_CONTROL_PAGE 0
# define VA_CONTROL_PAGE 1
# define PA_TABLE_PAGE 2
+# ifndef CONFIG_XEN
# define PA_SWAP_PAGE 3
# define PAGES_NR 4
+# else /* CONFIG_XEN, see comment above
+# define VA_TABLE_PAGE 3 */
+# define PA_SWAP_PAGE 4
+# define PAGES_NR 5
+# endif /* CONFIG_XEN */
#endif
# define KEXEC_CONTROL_CODE_MAX_SIZE 2048
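
These constants index the page_list[] array that x86 machine_kexec() passes to the relocation trampoline. The Xen branch keeps slot 3 reserved (the commented-out VA_TABLE_PAGE), so PA_SWAP_PAGE shifts to 4 and PAGES_NR grows to 5. A runnable toy model of the layout (enumerator names beyond the hunk are illustrative):

#include <stdio.h>

/* Toy model of the slot layout; values mirror the Xen branch above. */
enum { PA_CONTROL_PAGE, VA_CONTROL_PAGE, PA_TABLE_PAGE,
       VA_TABLE_PAGE_RESERVED, PA_SWAP_PAGE, PAGES_NR };

int main(void)
{
        unsigned long page_list[PAGES_NR] = { 0 };  /* filled by machine_kexec() in the real code */

        page_list[PA_SWAP_PAGE] = 0x1000;           /* illustrative value */
        printf("swap slot %d of %d, value %#lx\n",
               PA_SWAP_PAGE, PAGES_NR, page_list[PA_SWAP_PAGE]);
        return 0;
}
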
--- head-2010-05-25.orig/arch/x86/include/asm/page_64_types.h 2010-05-25 09:12:09.000000000 +0200
+++ head-2010-05-25/arch/x86/include/asm/page_64_types.h 2010-03-24 15:25:06.000000000 +0100
@@ -69,7 +69,15 @@ extern void init_extra_mapping_wb(unsign
#endif /* !__ASSEMBLY__ */
#ifdef CONFIG_FLATMEM
+/*
+ * While max_pfn is not exported, max_mapnr never gets initialized for non-Xen
+ * other than for hotplugged memory.
+ */
+#ifndef CONFIG_XEN
#define pfn_valid(pfn) ((pfn) < max_pfn)
+#else
+#define pfn_valid(pfn) ((pfn) < max_mapnr)
+#endif
#endif
#endif /* _ASM_X86_PAGE_64_DEFS_H */
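
With FLATMEM, pfn_valid() is a plain upper-bound check; this hunk swaps the bound from max_pfn to max_mapnr under Xen for the reason given in the comment. A short sketch of the standard caller-side pattern this macro supports (kernel context assumed):

#include <linux/mm.h>

/* Sketch: gate pfn_to_page() on pfn_valid(); with FLATMEM that is exactly
 * the bound check defined above (max_mapnr on Xen, max_pfn otherwise). */
static struct page *page_for_pfn(unsigned long pfn)
{
        return pfn_valid(pfn) ? pfn_to_page(pfn) : NULL;
}
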
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/desc.h 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/desc.h 2010-03-24 15:25:06.000000000 +0100
@@ -39,7 +39,7 @@ extern gate_desc idt_table[];
struct gdt_page {
struct desc_struct gdt[GDT_ENTRIES];
} __attribute__((aligned(PAGE_SIZE)));
-DECLARE_PER_CPU(struct gdt_page, gdt_page);
+DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
{
@@ -91,7 +91,6 @@ static inline int desc_empty(const void
#define store_gdt(dtr) native_store_gdt(dtr)
#define store_idt(dtr) native_store_idt(dtr)
#define store_tr(tr) (tr = native_store_tr())
-#define store_ldt(ldt) asm("sldt %0":"=m" (ldt))
#define load_TLS(t, cpu) native_load_tls(t, cpu)
#define set_ldt native_set_ldt
@@ -111,6 +110,8 @@ static inline void paravirt_free_ldt(str
{
}
+#define store_ldt(ldt) asm("sldt %0" : "=m"(ldt))
+
static inline void native_write_idt_entry(gate_desc *idt, int entry,
const gate_desc *gate)
{
@@ -251,6 +252,8 @@ static inline void native_load_tls(struc
gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
}
#else
+#include <asm/pgtable.h>
+
#define load_TLS(t, cpu) xen_load_tls(t, cpu)
#define set_ldt xen_set_ldt
@@ -265,8 +268,9 @@ static inline void xen_load_tls(struct t
struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN;
for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
- if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
- *(u64 *)&t->tls_array[i]))
+ if (HYPERVISOR_update_descriptor(
+ arbitrary_virt_to_machine(&gdt[i]),
+ *(u64 *)&t->tls_array[i]))
BUG();
}
#endif
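
Two related changes meet in this file: gdt_page moves to DECLARE_PER_CPU_PAGE_ALIGNED (matching the 2.6.30 percpu rework), and the TLS update switches from virt_to_machine() to arbitrary_virt_to_machine(). The distinction, as I read the Xen helpers: virt_to_machine() is __pa()-based and only valid inside the direct 1:1 mapping, while arbitrary_virt_to_machine() walks the page tables and so also covers per-CPU addresses. A hedged sketch (kernel context; maddr helpers assumed):

#include <asm/desc.h>
#include <asm/maddr.h>          /* maddr_t, *_to_machine helpers (assumed) */

/* Sketch: prefer the page-table-walking translation for anything that may
 * live outside the direct 1:1 mapping, e.g. the per-CPU GDT above. */
static inline maddr_t desc_machine_addr(struct desc_struct *d)
{
        /* virt_to_machine(d) would assume __pa() linearity; the per-CPU
         * area gives no such guarantee, hence the walk. */
        return arbitrary_virt_to_machine(d);
}
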
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/fixmap.h 2010-03-24 15:14:47.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/fixmap.h 2010-03-24 15:25:06.000000000 +0100
@@ -1,11 +1,154 @@
+/*
+ * fixmap.h: compile-time virtual memory allocation
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1998 Ingo Molnar
+ *
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ * x86_32 and x86_64 integration by Gustavo F. Padovan, February 2009
+ */
+
#ifndef _ASM_X86_FIXMAP_H
#define _ASM_X86_FIXMAP_H
+#ifndef __ASSEMBLY__
+#include <linux/kernel.h>
+#include <asm/acpi.h>
+#include <asm/apicdef.h>
+#include <asm/page.h>
+#ifdef CONFIG_X86_32
+#include <linux/threads.h>
+#include <asm/kmap_types.h>
+#else
+#include <asm/vsyscall.h>
+#endif
+
+/*
+ * We can't declare FIXADDR_TOP as variable for x86_64 because vsyscall
+ * uses fixmaps that rely on FIXADDR_TOP for proper address calculation.
+ * Because of this, FIXADDR_TOP x86 integration was left as later work.
+ */
+#ifdef CONFIG_X86_32
+/* used by vmalloc.c, vsyscall.lds.S.
+ *
+ * Leave one empty page between vmalloc'ed areas and
+ * the start of the fixmap.
+ */
+extern unsigned long __FIXADDR_TOP;
+#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
+
+#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO)
+#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1)
+#else
+#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
+
+/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */
+#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
+#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
+#endif
+
+
+/*
+ * Here we define all the compile-time 'special' virtual
+ * addresses. The point is to have a constant address at
+ * compile time, but to set the physical address only
+ * in the boot process.
+ * for x86_32: We allocate these special addresses
+ * from the end of virtual memory (0xfffff000) backwards.
+ * Also this lets us do fail-safe vmalloc(), we
+ * can guarantee that these special addresses and
+ * vmalloc()-ed addresses never overlap.
+ *
+ * These 'compile-time allocated' memory buffers are
+ * fixed-size 4k pages (or larger if used with an increment
+ * higher than 1). Use set_fixmap(idx,phys) to associate
+ * physical memory with fixmap indices.
+ *
+ * TLB entries of such buffers will not be flushed across
+ * task switches.
+ */
+enum fixed_addresses {
#ifdef CONFIG_X86_32
-# include "fixmap_32.h"
+ FIX_HOLE,
+ FIX_VDSO,
#else
-# include "fixmap_64.h"
+ VSYSCALL_LAST_PAGE,
+ VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
+ + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
+ VSYSCALL_HPET,
+#endif
+ FIX_DBGP_BASE,
+ FIX_EARLYCON_MEM_BASE,
+#ifdef CONFIG_X86_LOCAL_APIC
+ FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
#endif
+#ifndef CONFIG_XEN
+#ifdef CONFIG_X86_IO_APIC
+ FIX_IO_APIC_BASE_0,
+ FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
+#endif
+#else
+ FIX_SHARED_INFO,
+#define NR_FIX_ISAMAPS 256
+ FIX_ISAMAP_END,
+ FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
+#endif
+#ifdef CONFIG_X86_VISWS_APIC
+ FIX_CO_CPU, /* Cobalt timer */
+ FIX_CO_APIC, /* Cobalt APIC Redirection Table */
+ FIX_LI_PCIA, /* Lithium PCI Bridge A */
+ FIX_LI_PCIB, /* Lithium PCI Bridge B */
+#endif
+#ifdef CONFIG_X86_F00F_BUG
+ FIX_F00F_IDT, /* Virtual mapping for IDT */
+#endif
+#ifdef CONFIG_X86_CYCLONE_TIMER
+ FIX_CYCLONE_TIMER, /*cyclone timer register*/
+#endif
+#ifdef CONFIG_X86_32
+ FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
+ FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
+#ifdef CONFIG_PCI_MMCONFIG
+ FIX_PCIE_MCFG,
+#endif
+#endif
+#ifdef CONFIG_PARAVIRT
+ FIX_PARAVIRT_BOOTMAP,
+#endif
+ FIX_TEXT_POKE0, /* reserve 2 pages for text_poke() */
+ FIX_TEXT_POKE1,
+ __end_of_permanent_fixed_addresses,
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+ FIX_OHCI1394_BASE,
+#endif
+ /*
+ * 256 temporary boot-time mappings, used by early_ioremap(),
+ * before ioremap() is functional.
+ *
+ * We round it up to the next 256 pages boundary so that we
+ * can have a single pgd entry and a single pte table:
+ */
+#define NR_FIX_BTMAPS 64
+#define FIX_BTMAPS_SLOTS 4
+ FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
+ (__end_of_permanent_fixed_addresses & 255),
+ FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
+#ifdef CONFIG_X86_32
+ FIX_WP_TEST,
+#endif
+ __end_of_fixed_addresses
+};
+
+
+extern void reserve_top_address(unsigned long reserve);
+
+#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
+#define FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
+#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
+#define FIXADDR_BOOT_START (FIXADDR_TOP - FIXADDR_BOOT_SIZE)
extern int fixmaps_set;
@@ -13,10 +156,10 @@ extern pte_t *kmap_pte;
extern pgprot_t kmap_prot;
extern pte_t *pkmap_page_table;
-void xen_set_fixmap(enum fixed_addresses, maddr_t, pgprot_t);
+void xen_set_fixmap(enum fixed_addresses, phys_addr_t, pgprot_t);
static inline void __set_fixmap(enum fixed_addresses idx,
- maddr_t phys, pgprot_t flags)
+ phys_addr_t phys, pgprot_t flags)
{
xen_set_fixmap(idx, phys, flags);
}
@@ -65,4 +208,5 @@ static inline unsigned long virt_to_fix(
BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
return __virt_to_fix(vaddr);
}
+#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_FIXMAP_H */
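
The unified enum above hands out fixed virtual slots top-down from FIXADDR_TOP; each index maps to a constant address via the standard __fix_to_virt() formula, which is not part of this hunk and is assumed here. A runnable model of the arithmetic (slot order and the FIXADDR_TOP value are illustrative):

#include <stdio.h>

#define PAGE_SHIFT  12
#define FIXADDR_TOP 0xfffff000UL        /* sample x86_32 value */

/* standard formula: slot N sits N pages below FIXADDR_TOP */
#define __fix_to_virt(x) (FIXADDR_TOP - ((unsigned long)(x) << PAGE_SHIFT))

enum fixed_addresses { FIX_HOLE, FIX_VDSO, FIX_SHARED_INFO, NR_SLOTS };

int main(void)
{
        printf("FIX_VDSO        -> %#lx\n", __fix_to_virt(FIX_VDSO));
        printf("FIX_SHARED_INFO -> %#lx\n", __fix_to_virt(FIX_SHARED_INFO));
        return 0;
}
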
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/fixmap_32.h 2010-03-24 15:14:47.000000000 +0100
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,125 +0,0 @@
-/*
- * fixmap.h: compile-time virtual memory allocation
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License. See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Copyright (C) 1998 Ingo Molnar
- *
- * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
- */
-
-#ifndef _ASM_X86_FIXMAP_32_H
-#define _ASM_X86_FIXMAP_32_H
-
-/* used by vmalloc.c, vsyscall.lds.S.
- *
- * Leave one empty page between vmalloc'ed areas and
- * the start of the fixmap.
- */
-extern unsigned long __FIXADDR_TOP;
-#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO)
-#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1)
-
-#ifndef __ASSEMBLY__
-#include <linux/kernel.h>
-#include <asm/acpi.h>
-#include <asm/apicdef.h>
-#include <asm/page.h>
-#include <linux/threads.h>
-#include <asm/kmap_types.h>
-
-/*
- * Here we define all the compile-time 'special' virtual
- * addresses. The point is to have a constant address at
- * compile time, but to set the physical address only
- * in the boot process. We allocate these special addresses
- * from the end of virtual memory (0xfffff000) backwards.
- * Also this lets us do fail-safe vmalloc(), we
- * can guarantee that these special addresses and
- * vmalloc()-ed addresses never overlap.
- *
- * these 'compile-time allocated' memory buffers are
- * fixed-size 4k pages. (or larger if used with an increment
- * highger than 1) use fixmap_set(idx,phys) to associate
- * physical memory with fixmap indices.
- *
- * TLB entries of such buffers will not be flushed across
- * task switches.
- */
-enum fixed_addresses {
- FIX_HOLE,
- FIX_VDSO,
- FIX_DBGP_BASE,
- FIX_EARLYCON_MEM_BASE,
-#ifdef CONFIG_X86_LOCAL_APIC
- FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
-#endif
-#ifndef CONFIG_XEN
-#ifdef CONFIG_X86_IO_APIC
- FIX_IO_APIC_BASE_0,
- FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
-#endif
-#else
- FIX_SHARED_INFO,
-#define NR_FIX_ISAMAPS 256
- FIX_ISAMAP_END,
- FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
-#endif
-#ifdef CONFIG_X86_VISWS_APIC
- FIX_CO_CPU, /* Cobalt timer */
- FIX_CO_APIC, /* Cobalt APIC Redirection Table */
- FIX_LI_PCIA, /* Lithium PCI Bridge A */
- FIX_LI_PCIB, /* Lithium PCI Bridge B */
-#endif
-#ifdef CONFIG_X86_F00F_BUG
- FIX_F00F_IDT, /* Virtual mapping for IDT */
-#endif
-#ifdef CONFIG_X86_CYCLONE_TIMER
- FIX_CYCLONE_TIMER, /*cyclone timer register*/
-#endif
- FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
- FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
-#ifdef CONFIG_PCI_MMCONFIG
- FIX_PCIE_MCFG,
-#endif
-#ifdef CONFIG_PARAVIRT
- FIX_PARAVIRT_BOOTMAP,
-#endif
- __end_of_permanent_fixed_addresses,
- /*
- * 256 temporary boot-time mappings, used by early_ioremap(),
- * before ioremap() is functional.
- *
- * We round it up to the next 256 pages boundary so that we
- * can have a single pgd entry and a single pte table:
- */
-#define NR_FIX_BTMAPS 64
-#define FIX_BTMAPS_SLOTS 4
- FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
- (__end_of_permanent_fixed_addresses & 255),
- FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
- FIX_WP_TEST,
-#ifdef CONFIG_ACPI
- FIX_ACPI_BEGIN,
- FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
-#endif
-#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
- FIX_OHCI1394_BASE,
-#endif
- __end_of_fixed_addresses
-};
-
-extern void reserve_top_address(unsigned long reserve);
-
-
-#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
-
-#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
-#define __FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
-#define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE)
-#define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE)
-
-#endif /* !__ASSEMBLY__ */
-#endif /* _ASM_X86_FIXMAP_32_H */
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/fixmap_64.h 2010-03-24 15:17:58.000000000 +0100
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,90 +0,0 @@
-/*
- * fixmap.h: compile-time virtual memory allocation
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License. See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Copyright (C) 1998 Ingo Molnar
- */
-
-#ifndef _ASM_X86_FIXMAP_64_H
-#define _ASM_X86_FIXMAP_64_H
-
-#include <linux/kernel.h>
-#include <asm/acpi.h>
-#include <asm/apicdef.h>
-#include <asm/page.h>
-#include <asm/vsyscall.h>
-#include <asm/acpi.h>
-
-/*
- * Here we define all the compile-time 'special' virtual
- * addresses. The point is to have a constant address at
- * compile time, but to set the physical address only
- * in the boot process.
- *
- * These 'compile-time allocated' memory buffers are
- * fixed-size 4k pages (or larger if used with an increment
- * higher than 1). Use set_fixmap(idx,phys) to associate
- * physical memory with fixmap indices.
- *
- * TLB entries of such buffers will not be flushed across
- * task switches.
- */
-
-enum fixed_addresses {
- VSYSCALL_LAST_PAGE,
- VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
- + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
- VSYSCALL_HPET,
- FIX_DBGP_BASE,
- FIX_EARLYCON_MEM_BASE,
-#ifdef CONFIG_X86_LOCAL_APIC
- FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
-#endif
-#ifndef CONFIG_XEN
- FIX_IO_APIC_BASE_0,
- FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
-#else
-#define NR_FIX_ISAMAPS 256
- FIX_ISAMAP_END,
- FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
-#endif
-#ifdef CONFIG_PARAVIRT
- FIX_PARAVIRT_BOOTMAP,
-#else
- FIX_SHARED_INFO,
-#endif
- __end_of_permanent_fixed_addresses,
-#ifdef CONFIG_ACPI
- FIX_ACPI_BEGIN,
- FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
-#endif
-#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
- FIX_OHCI1394_BASE,
-#endif
- /*
- * 256 temporary boot-time mappings, used by early_ioremap(),
- * before ioremap() is functional.
- *
- * We round it up to the next 256 pages boundary so that we
- * can have a single pgd entry and a single pte table:
- */
-#define NR_FIX_BTMAPS 64
-#define FIX_BTMAPS_SLOTS 4
- FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
- (__end_of_permanent_fixed_addresses & 255),
- FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
- __end_of_fixed_addresses
-};
-
-#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
-#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
-#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
-
-/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */
-#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
-#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
-
-#endif /* _ASM_X86_FIXMAP_64_H */
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/highmem.h 2010-03-24 17:05:16.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/highmem.h 2010-03-24 17:05:22.000000000 +0100
@@ -62,6 +62,7 @@ void *kmap_atomic_prot(struct page *page
void *kmap_atomic(struct page *page, enum km_type type);
void kunmap_atomic(void *kvaddr, enum km_type type);
void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
+void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
struct page *kmap_atomic_to_page(void *ptr);
#define kmap_atomic_pte(page, type) \
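
kmap_atomic_prot_pfn() is the new entry point here: a temporary atomic mapping of a bare pfn with caller-chosen protection. A hedged usage sketch (kernel context; KM_USER0 and PAGE_KERNEL_NOCACHE assumed available in this tree):

#include <linux/highmem.h>

/* Sketch: briefly map a raw pfn uncached, write one byte, unmap. */
static void poke_pfn_uncached(unsigned long pfn, unsigned int off, u8 val)
{
        u8 *va = kmap_atomic_prot_pfn(pfn, KM_USER0, PAGE_KERNEL_NOCACHE);

        va[off] = val;
        kunmap_atomic(va, KM_USER0);
}
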
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/hypervisor.h 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/hypervisor.h 2010-03-24 15:25:06.000000000 +0100
@@ -46,7 +46,7 @@
#include <xen/interface/arch-x86/xen-mca.h>
#include <asm/percpu.h>
#include <asm/ptrace.h>
-#include <asm/page.h>
+#include <asm/pgtable_types.h>
extern shared_info_t *HYPERVISOR_shared_info;
@@ -153,20 +153,16 @@ int __must_check xen_multi_mmuext_op(str
#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
static inline void arch_enter_lazy_mmu_mode(void)
{
- __get_cpu_var(xen_lazy_mmu) = true;
+ percpu_write(xen_lazy_mmu, true);
}
static inline void arch_leave_lazy_mmu_mode(void)
{
- __get_cpu_var(xen_lazy_mmu) = false;
+ percpu_write(xen_lazy_mmu, false);
xen_multicall_flush(false);
}
-#if defined(CONFIG_X86_32)
-#define arch_use_lazy_mmu_mode() unlikely(x86_read_percpu(xen_lazy_mmu))
-#elif !defined(arch_use_lazy_mmu_mode)
-#define arch_use_lazy_mmu_mode() unlikely(__get_cpu_var(xen_lazy_mmu))
-#endif
+#define arch_use_lazy_mmu_mode() unlikely(percpu_read(xen_lazy_mmu))
#if 0 /* All uses are in places potentially called asynchronously, but
* asynchronous code should rather not make use of lazy mode at all.
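
The per-CPU xen_lazy_mmu flag simply arms a batching mode: while it is set, MMU updates are queued as multicalls, and arch_leave_lazy_mmu_mode() drains the queue via xen_multicall_flush(). A sketch of the intended usage pattern, assuming set_pte_at() routes through the multicall queue while lazy mode is armed (which is what the flush on exit implies):

#include <linux/mm.h>
#include <asm/hypervisor.h>

/* Sketch: batch a run of PTE updates into one hypercall flush. */
static void set_ptes_batched(struct mm_struct *mm, unsigned long addr,
                             pte_t *ptep, pte_t pte, int n)
{
        int i;

        arch_enter_lazy_mmu_mode();
        for (i = 0; i < n; i++, addr += PAGE_SIZE)
                set_pte_at(mm, addr, ptep + i, pte);
        arch_leave_lazy_mmu_mode();     /* xen_multicall_flush() runs here */
}
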
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/io.h 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/io.h 2010-03-24 15:25:06.000000000 +0100
@@ -5,6 +5,10 @@
#include <linux/compiler.h>
#include <asm-generic/int-ll64.h>
+#include <asm/page.h>
+#ifdef __KERNEL__
+#include <asm/fixmap.h>
+#endif
#define build_mmio_read(name, size, type, reg, barrier) \
static inline type name(const volatile void __iomem *addr) \
@@ -82,6 +86,101 @@ static inline void writeq(__u64 val, vol
#define native_io_delay xen_io_delay
+/**
+ * virt_to_phys - map virtual addresses to physical
+ * @address: address to remap
+ *
+ * The returned physical address is the physical (CPU) mapping for
+ * the memory address given. It is only valid to use this function on
+ * addresses directly mapped or allocated via kmalloc.
+ *
+ * This function does not give bus mappings for DMA transfers. In
+ * almost all conceivable cases a device driver should not be using
+ * this function
+ */
+
+static inline phys_addr_t virt_to_phys(volatile void *address)
+{
+ return __pa(address);
+}
+
+/**
+ * phys_to_virt - map physical address to virtual
+ * @address: address to remap
+ *
+ * The returned virtual address is a current CPU mapping for
+ * the memory address given. It is only valid to use this function on
+ * addresses that have a kernel mapping
+ *
+ * This function does not handle bus mappings for DMA transfers. In
+ * almost all conceivable cases a device driver should not be using
+ * this function
+ */
+
+static inline void *phys_to_virt(phys_addr_t address)
+{
+ return __va(address);
+}
+
+/*
+ * Change "struct page" to physical address.
+ */
+#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
+#undef page_to_phys
+#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page)))
+#define page_to_bus(page) (phys_to_machine(page_to_pseudophys(page)))
+
+/*
+ * ISA I/O bus memory addresses are 1:1 with the physical address.
+ * However, we truncate the address to unsigned int to avoid undesirable
+ * promotions in legacy drivers.
+ */
+#define isa_virt_to_bus(_x) ({ \
+ unsigned long _va_ = (unsigned long)(_x); \
+ _va_ - fix_to_virt(FIX_ISAMAP_BEGIN) < (NR_FIX_ISAMAPS << PAGE_SHIFT) \
+ ? _va_ - fix_to_virt(FIX_ISAMAP_BEGIN) \
+ : ({ BUG(); (unsigned long)virt_to_bus(_va_); }); })
+#define isa_bus_to_virt(_x) ((void *)fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
+
+/*
+ * However PCI ones are not necessarily 1:1 and therefore these interfaces
+ * are forbidden in portable PCI drivers.
+ *
+ * Allow them on x86 for legacy drivers, though.
+ */
+#define virt_to_bus(_x) phys_to_machine(__pa(_x))
+#define bus_to_virt(_x) __va(machine_to_phys(_x))
+
+/**
+ * ioremap - map bus memory into CPU space
+ * @offset: bus address of the memory
+ * @size: size of the resource to map
+ *
+ * ioremap performs a platform specific sequence of operations to
+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
+ * writew/writel functions and the other mmio helpers. The returned
+ * address is not guaranteed to be usable directly as a virtual
+ * address.
+ *
+ * If the area you are trying to map is a PCI BAR you should have a
+ * look at pci_iomap().
+ */
+extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
+extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
+extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
+ unsigned long prot_val);
+
+/*
+ * The default ioremap() behavior is non-cached:
+ */
+static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
+{
+ return ioremap_nocache(offset, size);
+}
+
+extern void iounmap(volatile void __iomem *addr);
+
+
#ifdef CONFIG_X86_32
# include "../../asm/io_32.h"
#else
@@ -93,11 +192,6 @@ static inline void writeq(__u64 val, vol
/* We will be supplying our own /dev/mem implementation */
#define ARCH_HAS_DEV_MEM
-#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
-#undef page_to_phys
-#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page)))
-#define page_to_bus(page) (phys_to_machine(page_to_pseudophys(page)))
-
#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \
(unsigned long)(bv)->bv_offset)
@@ -106,23 +200,7 @@ static inline void writeq(__u64 val, vol
&& bvec_to_pseudophys(vec1) + (vec1)->bv_len \
== bvec_to_pseudophys(vec2))
-#undef virt_to_bus
-#undef bus_to_virt
-#define virt_to_bus(_x) phys_to_machine(__pa(_x))
-#define bus_to_virt(_x) __va(machine_to_phys(_x))
-
-#include <asm/fixmap.h>
-
#undef __ISA_IO_base
-#undef isa_virt_to_bus
-#undef isa_page_to_bus
-#undef isa_bus_to_virt
-#define isa_virt_to_bus(_x) ({ \
- unsigned long _va_ = (unsigned long)(_x); \
- _va_ - fix_to_virt(FIX_ISAMAP_BEGIN) < (NR_FIX_ISAMAPS << PAGE_SHIFT) \
- ? _va_ - fix_to_virt(FIX_ISAMAP_BEGIN) \
- : ({ BUG(); (unsigned long)virt_to_bus(_va_); }); })
-#define isa_bus_to_virt(_x) ((void *)fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
#endif
@@ -131,7 +209,7 @@ extern void unxlate_dev_mem_ptr(unsigned
extern int ioremap_check_change_attr(unsigned long mfn, unsigned long size,
unsigned long prot_val);
-extern void __iomem *ioremap_wc(unsigned long offset, unsigned long size);
+extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size);
/*
* early_ioremap() and early_iounmap() are for temporary early boot-time
@@ -140,10 +218,12 @@ extern void __iomem *ioremap_wc(unsigned
*/
extern void early_ioremap_init(void);
extern void early_ioremap_reset(void);
-extern void __iomem *early_ioremap(unsigned long offset, unsigned long size);
-extern void __iomem *early_memremap(unsigned long offset, unsigned long size);
+extern void __iomem *early_ioremap(resource_size_t phys_addr,
+ unsigned long size);
+extern void __iomem *early_memremap(resource_size_t phys_addr,
+ unsigned long size);
extern void early_iounmap(void __iomem *addr, unsigned long size);
-extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
+#define IO_SPACE_LIMIT 0xffff
#endif /* _ASM_X86_IO_H */
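
Worth spelling out for this file: under Xen, "physical" in page_to_pseudophys() means the guest's pseudo-physical space, while virt_to_bus()/bus_to_virt() cross into real machine addresses via phys_to_machine()/machine_to_phys(). A runnable toy model of the two address spaces (the real kernel uses the hypervisor-maintained p2m/m2p tables, not a local array):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_MASK  ((1UL << PAGE_SHIFT) - 1)

/* toy p2m table for a 4-frame guest: pfn -> mfn */
static const unsigned long p2m[4] = { 7, 3, 9, 1 };

static unsigned long phys_to_machine(unsigned long paddr)
{
        unsigned long pfn = paddr >> PAGE_SHIFT;
        return (p2m[pfn] << PAGE_SHIFT) | (paddr & PAGE_MASK);
}

int main(void)
{
        unsigned long paddr = (2UL << PAGE_SHIFT) | 0x80;   /* pfn 2, offset 0x80 */
        printf("pseudo-phys %#lx -> machine %#lx\n", paddr, phys_to_machine(paddr));
        return 0;
}
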
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/ipi.h 2010-03-24 15:25:06.000000000 +0100
@@ -0,0 +1,13 @@
+#ifndef _ASM_X86_IPI_H
+#define _ASM_X86_IPI_H
+
+#include <asm/hw_irq.h>
+#include <asm/smp.h>
+
+void xen_send_IPI_mask(const struct cpumask *, int vector);
+void xen_send_IPI_mask_allbutself(const struct cpumask *, int vector);
+void xen_send_IPI_allbutself(int vector);
+void xen_send_IPI_all(int vector);
+void xen_send_IPI_self(int vector);
+
+#endif /* _ASM_X86_IPI_H */
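
This new header exposes the Xen IPI senders that replace the deleted X86_XEN_GENAPIC machinery. A hedged usage sketch (kernel context; RESCHEDULE_VECTOR comes from the mach-xen irq_vectors.h below):

#include <linux/cpumask.h>
#include <asm/ipi.h>            /* the header added above */

/* Sketch: poke one remote CPU with the Xen reschedule event. */
static void kick_cpu(int cpu)
{
        xen_send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
}
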
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/irqflags.h 2010-03-24 15:14:47.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/irqflags.h 2010-03-24 15:25:06.000000000 +0100
@@ -94,7 +94,7 @@ static inline void halt(void)
#ifdef CONFIG_X86_64
# define __REG_si %rsi
-# define __CPU_num %gs:pda_cpunumber
+# define __CPU_num PER_CPU_VAR(cpu_number)
#else
# define __REG_si %esi
# define __CPU_num TI_cpu(%ebp)
@@ -130,6 +130,7 @@ sysexit_ecrit: /**** END OF SYSEXIT CRIT
mov $__KERNEL_PERCPU, %ecx ; \
push %esp ; \
mov %ecx, %fs ; \
+ SET_KERNEL_GS %ecx ; \
call evtchn_do_upcall ; \
add $4,%esp ; \
jmp ret_from_intr
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/irq_vectors.h 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/irq_vectors.h 2010-03-24 15:25:06.000000000 +0100
@@ -2,29 +2,46 @@
#define _ASM_X86_IRQ_VECTORS_H
#ifdef CONFIG_X86_32
-# define SYSCALL_VECTOR 0x80
+# define SYSCALL_VECTOR 0x80
#else
-# define IA32_SYSCALL_VECTOR 0x80
+# define IA32_SYSCALL_VECTOR 0x80
#endif
-#define RESCHEDULE_VECTOR 0
-#define CALL_FUNCTION_VECTOR 1
-#define CALL_FUNC_SINGLE_VECTOR 2
-#define SPIN_UNLOCK_VECTOR 3
-#define NR_IPIS 4
+#define RESCHEDULE_VECTOR 0
+#define CALL_FUNCTION_VECTOR 1
+#define CALL_FUNC_SINGLE_VECTOR 2
+#define SPIN_UNLOCK_VECTOR 3
+#define NR_IPIS 4
/*
* The maximum number of vectors supported by i386 processors
* is limited to 256. For processors other than i386, NR_VECTORS
* should be changed accordingly.
*/
-#define NR_VECTORS 256
+#define NR_VECTORS 256
-#define FIRST_VM86_IRQ 3
-#define LAST_VM86_IRQ 15
-#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
+#define FIRST_VM86_IRQ 3
+#define LAST_VM86_IRQ 15
-#define NR_IRQS_LEGACY 16
+#ifndef __ASSEMBLY__
+static inline int invalid_vm86_irq(int irq)
+{
+ return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ;
+}
+#endif
+
+/*
+ * Size the maximum number of interrupts.
+ *
+ * If the irq_desc[] array has a sparse layout, we can size things
+ * generously - it scales up linearly with the maximum number of CPUs,
+ * and the maximum number of IO-APICs, whichever is higher.
+ *
+ * In other cases we size more conservatively, to not create too large
+ * static arrays.
+ */
+
+#define NR_IRQS_LEGACY 16
/*
* The flat IRQ space is divided into two regions:
@@ -35,21 +52,41 @@
* 3. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
* are bound using the provided bind/unbind functions.
*/
+#define PIRQ_BASE 0
-#define PIRQ_BASE 0
-#if defined(NR_CPUS) && defined(MAX_IO_APICS)
-# if !defined(CONFIG_SPARSE_IRQ) && NR_CPUS < MAX_IO_APICS
-# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
-# elif defined(CONFIG_SPARSE_IRQ) && 8 * NR_CPUS > 32 * MAX_IO_APICS
-# define NR_PIRQS (NR_VECTORS + 8 * NR_CPUS)
+#define CPU_VECTOR_LIMIT ( 8 * NR_CPUS )
+#define IO_APIC_VECTOR_LIMIT ( 32 * MAX_IO_APICS )
+
+#ifdef CONFIG_X86_IO_APIC
+# if !defined(NR_CPUS) || !defined(MAX_IO_APICS)
+/* nothing */
+# elif defined(CONFIG_SPARSE_IRQ)
+# define NR_PIRQS \
+ (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? \
+ (NR_VECTORS + CPU_VECTOR_LIMIT) : \
+ (NR_VECTORS + IO_APIC_VECTOR_LIMIT))
+# elif NR_CPUS < MAX_IO_APICS
+# define NR_PIRQS (NR_VECTORS + 4*CPU_VECTOR_LIMIT)
# else
-# define NR_PIRQS (NR_VECTORS + 32 * MAX_IO_APICS)
+# define NR_PIRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT)
# endif
+#elif defined(CONFIG_XEN_PCIDEV_FRONTEND)
+# define NR_PIRQS (NR_VECTORS + CPU_VECTOR_LIMIT)
+#else /* !CONFIG_X86_IO_APIC: */
+# define NR_PIRQS NR_IRQS_LEGACY
+#endif
+
+#ifndef __ASSEMBLY__
+#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SPARSE_IRQ)
+extern int nr_pirqs;
+#else
+# define nr_pirqs NR_PIRQS
+#endif
#endif
-#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS)
-#define NR_DYNIRQS 256
+#define DYNIRQ_BASE (PIRQ_BASE + nr_pirqs)
+#define NR_DYNIRQS 256
-#define NR_IRQS (NR_PIRQS + NR_DYNIRQS)
+#define NR_IRQS (NR_PIRQS + NR_DYNIRQS)
#endif /* _ASM_X86_IRQ_VECTORS_H */
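
The rewritten sizing picks the larger of a per-CPU and a per-IO-APIC vector budget when CONFIG_SPARSE_IRQ is set. A runnable check of the arithmetic for one sample configuration (NR_CPUS=64 and MAX_IO_APICS=64 are assumptions, not values from this patch):

#include <stdio.h>

int main(void)
{
        const int NR_VECTORS = 256, NR_CPUS = 64, MAX_IO_APICS = 64;
        const int CPU_VECTOR_LIMIT = 8 * NR_CPUS;            /* 512  */
        const int IO_APIC_VECTOR_LIMIT = 32 * MAX_IO_APICS;  /* 2048 */

        /* CONFIG_SPARSE_IRQ case from the hunk above */
        int nr_pirqs = NR_VECTORS + (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT
                                     ? CPU_VECTOR_LIMIT : IO_APIC_VECTOR_LIMIT);

        printf("NR_PIRQS = %d, NR_IRQS = %d\n",
               nr_pirqs, nr_pirqs + 256 /* NR_DYNIRQS */);
        return 0;
}
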
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/mmu_context.h 2010-03-24 15:14:47.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/mmu_context.h 2010-03-24 15:25:06.000000000 +0100
@@ -26,11 +26,117 @@ static inline void xen_activate_mm(struc
int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
void destroy_context(struct mm_struct *mm);
+
+static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+{
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
+ if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
+ percpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
+#endif
+}
+
+#define prepare_arch_switch(next) __prepare_arch_switch()
+
+static inline void __prepare_arch_switch(void)
+{
#ifdef CONFIG_X86_32
-# include "mmu_context_32.h"
+ /*
+ * Save away %gs. No need to save %fs, as it was saved on the
+ * stack on entry. No need to save %es and %ds, as those are
+ * always kernel segments while inside the kernel.
+ */
+ lazy_save_gs(current->thread.gs);
+ lazy_load_gs(__KERNEL_STACK_CANARY);
#else
-# include "mmu_context_64.h"
+ /*
+ * Save away %es, %ds, %fs and %gs. Must happen before reload
+ * of cr3/ldt (i.e., not in __switch_to).
+ */
+ __asm__ __volatile__ (
+ "mov %%es,%0 ; mov %%ds,%1 ; mov %%fs,%2 ; mov %%gs,%3"
+ : "=m" (current->thread.es),
+ "=m" (current->thread.ds),
+ "=m" (current->thread.fsindex),
+ "=m" (current->thread.gsindex) );
+
+ if (current->thread.ds)
+ __asm__ __volatile__ ( "movl %0,%%ds" : : "r" (0) );
+
+ if (current->thread.es)
+ __asm__ __volatile__ ( "movl %0,%%es" : : "r" (0) );
+
+ if (current->thread.fsindex) {
+ __asm__ __volatile__ ( "movl %0,%%fs" : : "r" (0) );
+ current->thread.fs = 0;
+ }
+
+ if (current->thread.gsindex) {
+ load_gs_index(0);
+ current->thread.gs = 0;
+ }
+#endif
+}
+
+static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
+{
+ unsigned cpu = smp_processor_id();
+ struct mmuext_op _op[2 + (sizeof(long) > 4)], *op = _op;
+
+ if (likely(prev != next)) {
+ BUG_ON(!xen_feature(XENFEAT_writable_page_tables) &&
+ !PagePinned(virt_to_page(next->pgd)));
+
+ /* stop flush ipis for the previous mm */
+ cpu_clear(cpu, prev->cpu_vm_mask);
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
+ percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+ percpu_write(cpu_tlbstate.active_mm, next);
#endif
+ cpu_set(cpu, next->cpu_vm_mask);
+
+ /* Re-load page tables: load_cr3(next->pgd) */
+ op->cmd = MMUEXT_NEW_BASEPTR;
+ op->arg1.mfn = virt_to_mfn(next->pgd);
+ op++;
+
+ /* xen_new_user_pt(__pa(__user_pgd(next->pgd))) */
+#ifdef CONFIG_X86_64
+ op->cmd = MMUEXT_NEW_USER_BASEPTR;
+ op->arg1.mfn = virt_to_mfn(__user_pgd(next->pgd));
+ op++;
+#endif
+
+ /*
+ * load the LDT, if the LDT is different:
+ */
+ if (unlikely(prev->context.ldt != next->context.ldt)) {
+ /* load_LDT_nolock(&next->context) */
+ op->cmd = MMUEXT_SET_LDT;
+ op->arg1.linear_addr = (unsigned long)next->context.ldt;
+ op->arg2.nr_ents = next->context.size;
+ op++;
+ }
+
+ BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
+ }
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
+ else {
+ percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+ BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);
+
+ if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
+ /* We were in lazy tlb mode and leave_mm disabled
+ * tlb flush IPI delivery. We must reload CR3
+ * to make sure to use no freed page tables.
+ */
+ load_cr3(next->pgd);
+ xen_new_user_pt(__pa(__user_pgd(next->pgd)));
+ load_LDT_nolock(&next->context);
+ }
+ }
+#endif
+}
#define activate_mm(prev, next) \
do { \
@@ -38,5 +144,17 @@ do { \
switch_mm((prev), (next), NULL); \
} while (0);
+#ifdef CONFIG_X86_32
+#define deactivate_mm(tsk, mm) \
+do { \
+ lazy_load_gs(0); \
+} while (0)
+#else
+#define deactivate_mm(tsk, mm) \
+do { \
+ load_gs_index(0); \
+ loadsegment(fs, 0); \
+} while (0)
+#endif
#endif /* _ASM_X86_MMU_CONTEXT_H */
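
The consolidated switch_mm() shows the batching idiom this port leans on: collect up to three mmuext ops (new base pointer, new user base pointer on x86_64, LDT switch) in a stack array and issue them with a single HYPERVISOR_mmuext_op() call. A hedged standalone sketch of the same idiom (op codes assumed from xen/interface/xen.h):

#include <linux/kernel.h>
#include <xen/interface/xen.h>
#include <asm/hypervisor.h>

/* Sketch: switch the LDT and flush the local TLB in one hypercall. */
static void set_ldt_and_flush(void *ldt, unsigned int nr_ents)
{
        struct mmuext_op op[2];

        op[0].cmd = MMUEXT_SET_LDT;
        op[0].arg1.linear_addr = (unsigned long)ldt;
        op[0].arg2.nr_ents = nr_ents;
        op[1].cmd = MMUEXT_TLB_FLUSH_LOCAL;

        BUG_ON(HYPERVISOR_mmuext_op(op, 2, NULL, DOMID_SELF));
}
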
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/mmu_context_32.h 2010-03-24 15:17:58.000000000 +0100
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,83 +0,0 @@
-#ifndef _ASM_X86_MMU_CONTEXT_32_H
-#define _ASM_X86_MMU_CONTEXT_32_H
-
-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
-#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
- if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK)
- x86_write_percpu(cpu_tlbstate.state, TLBSTATE_LAZY);
-#endif
-}
-
-#define prepare_arch_switch(next) __prepare_arch_switch()
-
-static inline void __prepare_arch_switch(void)
-{
- /*
- * Save away %gs. No need to save %fs, as it was saved on the
- * stack on entry. No need to save %es and %ds, as those are
- * always kernel segments while inside the kernel.
- */
- asm volatile ( "mov %%gs,%0"
- : "=m" (current->thread.gs));
- asm volatile ( "movl %0,%%gs"
- : : "r" (0) );
-}
-
-static inline void switch_mm(struct mm_struct *prev,
- struct mm_struct *next,
- struct task_struct *tsk)
-{
- int cpu = smp_processor_id();
- struct mmuext_op _op[2], *op = _op;
-
- if (likely(prev != next)) {
- BUG_ON(!xen_feature(XENFEAT_writable_page_tables) &&
- !PagePinned(virt_to_page(next->pgd)));
-
- /* stop flush ipis for the previous mm */
- cpu_clear(cpu, prev->cpu_vm_mask);
-#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
- x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
- x86_write_percpu(cpu_tlbstate.active_mm, next);
-#endif
- cpu_set(cpu, next->cpu_vm_mask);
-
- /* Re-load page tables: load_cr3(next->pgd) */
- op->cmd = MMUEXT_NEW_BASEPTR;
- op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT);
- op++;
-
- /*
- * load the LDT, if the LDT is different:
- */
- if (unlikely(prev->context.ldt != next->context.ldt)) {
- /* load_LDT_nolock(&next->context, cpu) */
- op->cmd = MMUEXT_SET_LDT;
- op->arg1.linear_addr = (unsigned long)next->context.ldt;
- op->arg2.nr_ents = next->context.size;
- op++;
- }
-
- BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
- }
-#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
- else {
- x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
- BUG_ON(x86_read_percpu(cpu_tlbstate.active_mm) != next);
-
- if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
- /* We were in lazy tlb mode and leave_mm disabled
- * tlb flush IPI delivery. We must reload %cr3.
- */
- load_cr3(next->pgd);
- load_LDT_nolock(&next->context);
- }
- }
-#endif
-}
-
-#define deactivate_mm(tsk, mm) \
- asm("movl %0,%%gs": :"r" (0));
-
-#endif /* _ASM_X86_MMU_CONTEXT_32_H */
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/mmu_context_64.h 2010-03-24 15:14:47.000000000 +0100
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,106 +0,0 @@
-#ifndef _ASM_X86_MMU_CONTEXT_64_H
-#define _ASM_X86_MMU_CONTEXT_64_H
-
-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
-#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
- if (read_pda(mmu_state) == TLBSTATE_OK)
- write_pda(mmu_state, TLBSTATE_LAZY);
-#endif
-}
-
-#define prepare_arch_switch(next) __prepare_arch_switch()
-
-static inline void __prepare_arch_switch(void)
-{
- /*
- * Save away %es, %ds, %fs and %gs. Must happen before reload
- * of cr3/ldt (i.e., not in __switch_to).
- */
- __asm__ __volatile__ (
- "mov %%es,%0 ; mov %%ds,%1 ; mov %%fs,%2 ; mov %%gs,%3"
- : "=m" (current->thread.es),
- "=m" (current->thread.ds),
- "=m" (current->thread.fsindex),
- "=m" (current->thread.gsindex) );
-
- if (current->thread.ds)
- __asm__ __volatile__ ( "movl %0,%%ds" : : "r" (0) );
-
- if (current->thread.es)
- __asm__ __volatile__ ( "movl %0,%%es" : : "r" (0) );
-
- if (current->thread.fsindex) {
- __asm__ __volatile__ ( "movl %0,%%fs" : : "r" (0) );
- current->thread.fs = 0;
- }
-
- if (current->thread.gsindex) {
- load_gs_index(0);
- current->thread.gs = 0;
- }
-}
-
-static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
- struct task_struct *tsk)
-{
- unsigned cpu = smp_processor_id();
- struct mmuext_op _op[3], *op = _op;
-
- if (likely(prev != next)) {
- BUG_ON(!xen_feature(XENFEAT_writable_page_tables) &&
- !PagePinned(virt_to_page(next->pgd)));
-
- /* stop flush ipis for the previous mm */
- cpu_clear(cpu, prev->cpu_vm_mask);
-#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
- write_pda(mmu_state, TLBSTATE_OK);
- write_pda(active_mm, next);
-#endif
- cpu_set(cpu, next->cpu_vm_mask);
-
- /* load_cr3(next->pgd) */
- op->cmd = MMUEXT_NEW_BASEPTR;
- op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT);
- op++;
-
- /* xen_new_user_pt(__pa(__user_pgd(next->pgd))) */
- op->cmd = MMUEXT_NEW_USER_BASEPTR;
- op->arg1.mfn = pfn_to_mfn(__pa(__user_pgd(next->pgd)) >> PAGE_SHIFT);
- op++;
-
- if (unlikely(next->context.ldt != prev->context.ldt)) {
- /* load_LDT_nolock(&next->context) */
- op->cmd = MMUEXT_SET_LDT;
- op->arg1.linear_addr = (unsigned long)next->context.ldt;
- op->arg2.nr_ents = next->context.size;
- op++;
- }
-
- BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
- }
-#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
- else {
- write_pda(mmu_state, TLBSTATE_OK);
- if (read_pda(active_mm) != next)
- BUG();
- if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
- /* We were in lazy tlb mode and leave_mm disabled
- * tlb flush IPI delivery. We must reload CR3
- * to make sure to use no freed page tables.
- */
- load_cr3(next->pgd);
- xen_new_user_pt(__pa(__user_pgd(next->pgd)));
- load_LDT_nolock(&next->context);
- }
- }
-#endif
-}
-
-#define deactivate_mm(tsk, mm) \
-do { \
- load_gs_index(0); \
- asm volatile("movl %0,%%fs"::"r"(0)); \
-} while (0)
-
-#endif /* _ASM_X86_MMU_CONTEXT_64_H */
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/pci.h 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/pci.h 2010-03-24 15:25:06.000000000 +0100
@@ -41,7 +41,6 @@ static inline int pci_proc_domain(struct
return pci_domain_nr(bus);
}
-extern void pci_iommu_alloc(void);
/* Can be used to override the logic in pci_scan_bus for skipping
already-configured bus numbers - to be used for buggy BIOSes
@@ -92,12 +91,44 @@ static inline void early_quirks(void) {
extern void pci_iommu_alloc(void);
-#endif /* __KERNEL__ */
+/* MSI arch hooks */
+#define arch_setup_msi_irqs arch_setup_msi_irqs
+#define arch_teardown_msi_irqs arch_teardown_msi_irqs
+
+#define PCI_DMA_BUS_IS_PHYS 0
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_DMA_API_DEBUG) || defined(CONFIG_SWIOTLB)
+
+#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
+ dma_addr_t ADDR_NAME;
+#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
+ __u32 LEN_NAME;
+#define pci_unmap_addr(PTR, ADDR_NAME) \
+ ((PTR)->ADDR_NAME)
+#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
+ (((PTR)->ADDR_NAME) = (VAL))
+#define pci_unmap_len(PTR, LEN_NAME) \
+ ((PTR)->LEN_NAME)
+#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
+ (((PTR)->LEN_NAME) = (VAL))
-#ifdef CONFIG_X86_32
-# include "pci_32.h"
#else
-# include "../../asm/pci_64.h"
+
+#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME[0];
+#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) unsigned LEN_NAME[0];
+#define pci_unmap_addr(PTR, ADDR_NAME) sizeof((PTR)->ADDR_NAME)
+#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
+ do { break; } while (pci_unmap_addr(PTR, ADDR_NAME))
+#define pci_unmap_len(PTR, LEN_NAME) sizeof((PTR)->LEN_NAME)
+#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
+ do { break; } while (pci_unmap_len(PTR, LEN_NAME))
+
+#endif
+
+#endif /* __KERNEL__ */
+
+#ifdef CONFIG_X86_64
+#include "../../asm/pci_64.h"
#endif
/* implement the pci_ DMA API in terms of the generic device dma_ one */
@@ -115,11 +146,6 @@ static inline int __pcibus_to_node(const
return sd->node;
}
-static inline cpumask_t __pcibus_to_cpumask(struct pci_bus *bus)
-{
- return node_to_cpumask(__pcibus_to_node(bus));
-}
-
static inline const struct cpumask *
cpumask_of_pcibus(const struct pci_bus *bus)
{
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/pgtable.h 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/pgtable.h 2010-03-24 15:25:06.000000000 +0100
@@ -1,178 +1,9 @@
#ifndef _ASM_X86_PGTABLE_H
#define _ASM_X86_PGTABLE_H
-#define FIRST_USER_ADDRESS 0
+#include <asm/page.h>
-#define _PAGE_BIT_PRESENT 0 /* is present */
-#define _PAGE_BIT_RW 1 /* writeable */
-#define _PAGE_BIT_USER 2 /* userspace addressable */
-#define _PAGE_BIT_PWT 3 /* page write through */
-#define _PAGE_BIT_PCD 4 /* page cache disabled */
-#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */
-#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */
-#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
-#define _PAGE_BIT_PAT 7 /* on 4KB pages */
-#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
-#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
-#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */
-#define _PAGE_BIT_UNUSED3 11
-#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
-#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
-#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
-#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
-
-/* If _PAGE_BIT_PRESENT is clear, we use these: */
-/* - if the user mapped it with PROT_NONE; pte_present gives true */
-#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
-/* - set: nonlinear file mapping, saved PTE; unset:swap */
-#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY
-
-#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
-#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW)
-#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER)
-#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
-#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
-#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
-#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
-#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
-#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
-#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
-#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
-#define _PAGE_UNUSED3 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED3)
-#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
-#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
-#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
-#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
-#define __HAVE_ARCH_PTE_SPECIAL
-
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
-#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
-#else
-#define _PAGE_NX (_AT(pteval_t, 0))
-#endif
-
-#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
-#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
-
-#ifndef __ASSEMBLY__
-#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002
-extern unsigned int __kernel_page_user;
-#else
-#define __kernel_page_user 0
-#endif
-#endif
-
-#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
- _PAGE_ACCESSED | _PAGE_DIRTY)
-#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
- _PAGE_DIRTY | __kernel_page_user)
-
-/* Set of bits not changed in pte_modify */
-#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_CACHE_MASK | _PAGE_IOMAP | \
- _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
-
-/*
- * PAT settings are part of the hypervisor interface, which sets the
- * MSR to 0x050100070406 (i.e. WB, WT, UC-, UC, WC, WP [, UC, UC]).
- */
-#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT | _PAGE_PAT)
-#define _PAGE_CACHE_WB (0)
-#define _PAGE_CACHE_WT (_PAGE_PWT)
-#define _PAGE_CACHE_WC (_PAGE_PAT)
-#define _PAGE_CACHE_WP (_PAGE_PAT | _PAGE_PWT)
-#define _PAGE_CACHE_UC_MINUS (_PAGE_PCD)
-#define _PAGE_CACHE_UC (_PAGE_PCD | _PAGE_PWT)
-
-#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
-#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
- _PAGE_ACCESSED | _PAGE_NX)
-
-#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \
- _PAGE_USER | _PAGE_ACCESSED)
-#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
- _PAGE_ACCESSED | _PAGE_NX)
-#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
- _PAGE_ACCESSED)
-#define PAGE_COPY PAGE_COPY_NOEXEC
-#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \
- _PAGE_ACCESSED | _PAGE_NX)
-#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
- _PAGE_ACCESSED)
-
-#define __PAGE_KERNEL_EXEC \
- (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
-#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
-
-#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
-#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
-#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT)
-#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC)
-#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
-#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
-#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
-#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
-#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
-#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
-#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
-
-#define __PAGE_KERNEL_IO (__PAGE_KERNEL | _PAGE_IOMAP)
-#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE | _PAGE_IOMAP)
-#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS | _PAGE_IOMAP)
-#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC | _PAGE_IOMAP)
-
-#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
-#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
-#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
-#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
-#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC)
-#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
-#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS)
-#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE)
-#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
-#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE)
-#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
-#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
-#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE)
-
-#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO)
-#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE)
-#define PAGE_KERNEL_IO_UC_MINUS __pgprot(__PAGE_KERNEL_IO_UC_MINUS)
-#define PAGE_KERNEL_IO_WC __pgprot(__PAGE_KERNEL_IO_WC)
-
-/* xwr */
-#define __P000 PAGE_NONE
-#define __P001 PAGE_READONLY
-#define __P010 PAGE_COPY
-#define __P011 PAGE_COPY
-#define __P100 PAGE_READONLY_EXEC
-#define __P101 PAGE_READONLY_EXEC
-#define __P110 PAGE_COPY_EXEC
-#define __P111 PAGE_COPY_EXEC
-
-#define __S000 PAGE_NONE
-#define __S001 PAGE_READONLY
-#define __S010 PAGE_SHARED
-#define __S011 PAGE_SHARED
-#define __S100 PAGE_READONLY_EXEC
-#define __S101 PAGE_READONLY_EXEC
-#define __S110 PAGE_SHARED_EXEC
-#define __S111 PAGE_SHARED_EXEC
-
-/*
- * early identity mapping pte attrib macros.
- */
-#ifdef CONFIG_X86_64
-#define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
-#else
-/*
- * For PDE_IDENT_ATTR include USER bit. As the PDE and PTE protection
- * bits are combined, this will alow user to access the high address mapped
- * VDSO in the presence of CONFIG_COMPAT_VDSO
- */
-#define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */
-#define PDE_IDENT_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */
-#define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */
-#endif
+#include <asm/pgtable_types.h>
/*
* Macro to mark a page protection value as UC-
@@ -184,9 +15,6 @@ extern unsigned int __kernel_page_user;
#ifndef __ASSEMBLY__
-#define pgprot_writecombine pgprot_writecombine
-extern pgprot_t pgprot_writecombine(pgprot_t prot);
-
/*
* ZERO_PAGE is a global shared page that is always zero: used
* for zero-mapped memory areas etc..
@@ -197,6 +25,59 @@ extern unsigned long empty_zero_page[PAG
extern spinlock_t pgd_lock;
extern struct list_head pgd_list;
+#define set_pte(ptep, pte) xen_set_pte(ptep, pte)
+#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
+
+#define set_pte_atomic(ptep, pte) \
+ xen_set_pte_atomic(ptep, pte)
+
+#define set_pmd(pmdp, pmd) xen_set_pmd(pmdp, pmd)
+
+#ifndef __PAGETABLE_PUD_FOLDED
+#define set_pgd(pgdp, pgd) xen_set_pgd(pgdp, pgd)
+#define pgd_clear(pgd) xen_pgd_clear(pgd)
+#endif
+
+#ifndef set_pud
+# define set_pud(pudp, pud) xen_set_pud(pudp, pud)
+#endif
+
+#ifndef __PAGETABLE_PMD_FOLDED
+#define pud_clear(pud) xen_pud_clear(pud)
+#endif
+
+#define pte_clear(mm, addr, ptep) xen_pte_clear(mm, addr, ptep)
+#define pmd_clear(pmd) xen_pmd_clear(pmd)
+
+#define pte_update(mm, addr, ptep) do { } while (0)
+#define pte_update_defer(mm, addr, ptep) do { } while (0)
+
+static inline void __init paravirt_pagetable_setup_start(pgd_t *base)
+{
+ xen_pagetable_setup_start(base);
+}
+
+static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
+{
+ xen_pagetable_setup_done(base);
+}
+
+#define pgd_val(x) xen_pgd_val(x)
+#define __pgd(x) xen_make_pgd(x)
+
+#ifndef __PAGETABLE_PUD_FOLDED
+#define pud_val(x) xen_pud_val(x)
+#define __pud(x) xen_make_pud(x)
+#endif
+
+#ifndef __PAGETABLE_PMD_FOLDED
+#define pmd_val(x) xen_pmd_val(x)
+#define __pmd(x) xen_make_pmd(x)
+#endif
+
+#define pte_val(x) xen_pte_val(x)
+#define __pte(x) xen_make_pte(x)
+
/*
* The following only work if pte_present() is true.
* Undefined behaviour if not..
@@ -252,53 +133,67 @@ static inline int pte_special(pte_t pte)
static inline int pmd_large(pmd_t pte)
{
- return (__pmd_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
+ return (pmd_flags(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
(_PAGE_PSE | _PAGE_PRESENT);
}
+static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
+{
+ pteval_t v = __pte_val(pte);
+
+ return __pte_ma(v | set);
+}
+
+static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
+{
+ pteval_t v = __pte_val(pte);
+
+ return __pte_ma(v & ~clear);
+}
+
static inline pte_t pte_mkclean(pte_t pte)
{
- return __pte_ma(__pte_val(pte) & ~_PAGE_DIRTY);
+ return pte_clear_flags(pte, _PAGE_DIRTY);
}
static inline pte_t pte_mkold(pte_t pte)
{
- return __pte_ma(__pte_val(pte) & ~_PAGE_ACCESSED);
+ return pte_clear_flags(pte, _PAGE_ACCESSED);
}
static inline pte_t pte_wrprotect(pte_t pte)
{
- return __pte_ma(__pte_val(pte) & ~_PAGE_RW);
+ return pte_clear_flags(pte, _PAGE_RW);
}
static inline pte_t pte_mkexec(pte_t pte)
{
- return __pte_ma(__pte_val(pte) & ~_PAGE_NX);
+ return pte_clear_flags(pte, _PAGE_NX);
}
static inline pte_t pte_mkdirty(pte_t pte)
{
- return __pte_ma(__pte_val(pte) | _PAGE_DIRTY);
+ return pte_set_flags(pte, _PAGE_DIRTY);
}
static inline pte_t pte_mkyoung(pte_t pte)
{
- return __pte_ma(__pte_val(pte) | _PAGE_ACCESSED);
+ return pte_set_flags(pte, _PAGE_ACCESSED);
}
static inline pte_t pte_mkwrite(pte_t pte)
{
- return __pte_ma(__pte_val(pte) | _PAGE_RW);
+ return pte_set_flags(pte, _PAGE_RW);
}
static inline pte_t pte_mkhuge(pte_t pte)
{
- return __pte_ma(__pte_val(pte) | _PAGE_PSE);
+ return pte_set_flags(pte, _PAGE_PSE);
}
static inline pte_t pte_clrhuge(pte_t pte)
{
- return __pte_ma(__pte_val(pte) & ~_PAGE_PSE);
+ return pte_clear_flags(pte, _PAGE_PSE);
}
static inline pte_t pte_mkglobal(pte_t pte)
@@ -313,11 +208,9 @@ static inline pte_t pte_clrglobal(pte_t
static inline pte_t pte_mkspecial(pte_t pte)
{
- return __pte_ma(__pte_val(pte) | _PAGE_SPECIAL);
+ return pte_set_flags(pte, _PAGE_SPECIAL);
}
-extern pteval_t __supported_pte_mask;
-
/*
* Mask out unsupported bits in a present pgprot. Non-present pgprots
* can use those bits for other purposes, so leave them be.
@@ -391,68 +284,208 @@ static inline int is_new_memtype_allowed
return 1;
}
-#ifndef __ASSEMBLY__
-#ifndef CONFIG_XEN
-/* Indicate that x86 has its own track and untrack pfn vma functions */
-#define __HAVE_PFNMAP_TRACKING
-#endif
+pmd_t *populate_extra_pmd(unsigned long vaddr);
+pte_t *populate_extra_pte(unsigned long vaddr);
+#endif /* __ASSEMBLY__ */
-#define __HAVE_PHYS_MEM_ACCESS_PROT
-struct file;
-pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
- unsigned long size, pgprot_t vma_prot);
-int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
- unsigned long size, pgprot_t *vma_prot);
+#ifdef CONFIG_X86_32
+# include "pgtable_32.h"
+#else
+# include "pgtable_64.h"
#endif
-/* Install a pte for a particular vaddr in kernel space. */
-void set_pte_vaddr(unsigned long vaddr, pte_t pte);
+#ifndef __ASSEMBLY__
+#include <linux/mm_types.h>
-#ifndef CONFIG_XEN
-extern void native_pagetable_setup_start(pgd_t *base);
-extern void native_pagetable_setup_done(pgd_t *base);
+static inline int pte_none(pte_t pte)
+{
+ return !pte.pte;
+}
+
+#define __HAVE_ARCH_PTE_SAME
+static inline int pte_same(pte_t a, pte_t b)
+{
+ return a.pte == b.pte;
+}
+
+static inline int pte_present(pte_t a)
+{
+ return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
+}
+
+static inline int pmd_present(pmd_t pmd)
+{
+#if CONFIG_XEN_COMPAT <= 0x030002
+/* pmd_present doesn't just test the _PAGE_PRESENT bit since writable
+   page tables (wr.p.t.) can temporarily clear it. */
+ return __pmd_val(pmd) != 0;
#else
-static inline void xen_pagetable_setup_start(pgd_t *base) {}
-static inline void xen_pagetable_setup_done(pgd_t *base) {}
+ return pmd_flags(pmd) & _PAGE_PRESENT;
#endif
+}
-struct seq_file;
-extern void arch_report_meminfo(struct seq_file *m);
+static inline int pmd_none(pmd_t pmd)
+{
+ /* Only check low word on 32-bit platforms, since it might be
+ out of sync with upper half. */
+ return (unsigned long)__pmd_val(pmd) == 0;
+}
-#define set_pte(ptep, pte) xen_set_pte(ptep, pte)
-#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
+static inline unsigned long pmd_page_vaddr(pmd_t pmd)
+{
+ return (unsigned long)__va(pmd_val(pmd) & PTE_PFN_MASK);
+}
-#define set_pte_atomic(ptep, pte) \
- xen_set_pte_atomic(ptep, pte)
+/*
+ * Currently stuck as a macro due to indirect forward reference to
+ * linux/mmzone.h's __section_mem_map_addr() definition:
+ */
+#define pmd_page(pmd) pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)
-#define set_pmd(pmdp, pmd) xen_set_pmd(pmdp, pmd)
+/*
+ * the pmd page can be thought of as an array like this: pmd_t[PTRS_PER_PMD]
+ *
+ * this macro returns the index of the entry in the pmd page which would
+ * control the given virtual address
+ */
+static inline unsigned pmd_index(unsigned long address)
+{
+ return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1);
+}
-#ifndef __PAGETABLE_PUD_FOLDED
-#define set_pgd(pgdp, pgd) xen_set_pgd(pgdp, pgd)
-#define pgd_clear(pgd) xen_pgd_clear(pgd)
-#endif
+/*
+ * Conversion functions: convert a page and protection to a page entry,
+ * and a page entry and page directory to the page they refer to.
+ *
+ * (Currently stuck as a macro because of indirect forward reference
+ * to linux/mm.h:page_to_nid())
+ */
+#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
-#ifndef set_pud
-# define set_pud(pudp, pud) xen_set_pud(pudp, pud)
-#endif
+/*
+ * the pte page can be thought of as an array like this: pte_t[PTRS_PER_PTE]
+ *
+ * this function returns the index of the entry in the pte page which would
+ * control the given virtual address
+ */
+static inline unsigned pte_index(unsigned long address)
+{
+ return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
+}
-#ifndef __PAGETABLE_PMD_FOLDED
-#define pud_clear(pud) xen_pud_clear(pud)
+static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
+{
+ return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address);
+}
+
+static inline int pmd_bad(pmd_t pmd)
+{
+#if CONFIG_XEN_COMPAT <= 0x030002
+ return (pmd_flags(pmd) & ~_PAGE_USER & ~_PAGE_PRESENT)
+ != (_KERNPG_TABLE & ~_PAGE_PRESENT);
+#else
+ return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
#endif
+}
-#define pte_clear(mm, addr, ptep) xen_pte_clear(mm, addr, ptep)
-#define pmd_clear(pmd) xen_pmd_clear(pmd)
+static inline unsigned long pages_to_mb(unsigned long npg)
+{
+ return npg >> (20 - PAGE_SHIFT);
+}
-#define pte_update(mm, addr, ptep) do { } while (0)
-#define pte_update_defer(mm, addr, ptep) do { } while (0)
+#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
+ direct_remap_pfn_range(vma, vaddr, pfn, size, prot, DOMID_IO)
-#endif /* __ASSEMBLY__ */
+#if PAGETABLE_LEVELS > 2
+static inline int pud_none(pud_t pud)
+{
+ return __pud_val(pud) == 0;
+}
-#ifdef CONFIG_X86_32
-# include "pgtable_32.h"
+static inline int pud_present(pud_t pud)
+{
+ return pud_flags(pud) & _PAGE_PRESENT;
+}
+
+static inline unsigned long pud_page_vaddr(pud_t pud)
+{
+ return (unsigned long)__va((unsigned long)pud_val(pud) & PTE_PFN_MASK);
+}
+
+/*
+ * Currently stuck as a macro due to indirect forward reference to
+ * linux/mmzone.h's __section_mem_map_addr() definition:
+ */
+#define pud_page(pud) pfn_to_page(pud_val(pud) >> PAGE_SHIFT)
+
+/* Find an entry in the second-level page table.. */
+static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
+{
+ return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address);
+}
+
+static inline unsigned long pmd_pfn(pmd_t pmd)
+{
+ return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT;
+}
+
+static inline int pud_large(pud_t pud)
+{
+ return (__pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) ==
+ (_PAGE_PSE | _PAGE_PRESENT);
+}
+
+static inline int pud_bad(pud_t pud)
+{
+ return (pud_flags(pud) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0;
+}
#else
-# include "pgtable_64.h"
-#endif
+static inline int pud_large(pud_t pud)
+{
+ return 0;
+}
+#endif /* PAGETABLE_LEVELS > 2 */
+
+#if PAGETABLE_LEVELS > 3
+static inline int pgd_present(pgd_t pgd)
+{
+ return pgd_flags(pgd) & _PAGE_PRESENT;
+}
+
+static inline unsigned long pgd_page_vaddr(pgd_t pgd)
+{
+ return (unsigned long)__va((unsigned long)pgd_val(pgd) & PTE_PFN_MASK);
+}
+
+/*
+ * Currently stuck as a macro due to indirect forward reference to
+ * linux/mmzone.h's __section_mem_map_addr() definition:
+ */
+#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT)
+
+/* to find an entry in a page-table-directory. */
+static inline unsigned pud_index(unsigned long address)
+{
+ return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
+}
+
+static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
+{
+ return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(address);
+}
+
+static inline int pgd_bad(pgd_t pgd)
+{
+ return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
+}
+
+static inline int pgd_none(pgd_t pgd)
+{
+ return !__pgd_val(pgd);
+}
+#endif /* PAGETABLE_LEVELS > 3 */
+
+#endif /* __ASSEMBLY__ */
/*
 * the pgd page can be thought of as an array like this: pgd_t[PTRS_PER_PGD]
@@ -479,28 +512,6 @@ extern void arch_report_meminfo(struct s
#ifndef __ASSEMBLY__
-enum {
- PG_LEVEL_NONE,
- PG_LEVEL_4K,
- PG_LEVEL_2M,
- PG_LEVEL_1G,
- PG_LEVEL_NUM
-};
-
-#ifdef CONFIG_PROC_FS
-extern void update_page_count(int level, unsigned long pages);
-#else
-static inline void update_page_count(int level, unsigned long pages) { }
-#endif
-
-/*
- * Helper function that returns the kernel pagetable entry controlling
- * the virtual address 'address'. NULL means no pagetable entry present.
- * NOTE: the return type is pte_t but if the pmd is PSE then we return it
- * as a pte too.
- */
-extern pte_t *lookup_address(unsigned long address, unsigned int *level);
-
/* local pte updates need not use xchg for locking */
static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res)
{
@@ -633,15 +644,18 @@ static inline void clone_pgd_range(pgd_t
memcpy(dst, src, count * sizeof(pgd_t));
}
-#define arbitrary_virt_to_machine(va) \
+#define arbitrary_virt_to_mfn(va) \
({ \
unsigned int __lvl; \
pte_t *__ptep = lookup_address((unsigned long)(va), &__lvl); \
BUG_ON(!__ptep || __lvl != PG_LEVEL_4K || !pte_present(*__ptep));\
- (((maddr_t)pte_mfn(*__ptep) << PAGE_SHIFT) \
- | ((unsigned long)(va) & (PAGE_SIZE - 1))); \
+ pte_mfn(*__ptep); \
})
+#define arbitrary_virt_to_machine(va) \
+ (((maddr_t)arbitrary_virt_to_mfn(va) << PAGE_SHIFT) \
+ | ((unsigned long)(va) & (PAGE_SIZE - 1)))
+
#ifdef CONFIG_HIGHPTE
#include <asm/io.h>
struct page *kmap_atomic_to_page(void *);
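
The hunk above folds a dozen open-coded __pte_ma(__pte_val(pte) | flag)
conversions into the two new helpers pte_set_flags()/pte_clear_flags(). A
minimal user-space sketch of the same pattern (toy pteval_t and flag values
chosen for illustration, and ignoring the Xen machine-address wrapping that
__pte_ma adds in the real code):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t pteval_t;
typedef struct { pteval_t pte; } pte_t;

/* Toy flag values; the kernel derives these from the _PAGE_BIT_*
 * definitions seen later in pgtable_types.h. */
#define _PAGE_RW    (1ULL << 1)
#define _PAGE_DIRTY (1ULL << 6)

static pte_t pte_set_flags(pte_t pte, pteval_t set)
{
        return (pte_t){ pte.pte | set };
}

static pte_t pte_clear_flags(pte_t pte, pteval_t clear)
{
        return (pte_t){ pte.pte & ~clear };
}

int main(void)
{
        pte_t p = { 0x1000 | _PAGE_RW };
        p = pte_set_flags(p, _PAGE_DIRTY);  /* what pte_mkdirty() does   */
        p = pte_clear_flags(p, _PAGE_RW);   /* what pte_wrprotect() does */
        printf("%#llx\n", (unsigned long long)p.pte);  /* prints 0x1040 */
        return 0;
}
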
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/pgtable-3level.h 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/pgtable-3level.h 2010-03-24 15:25:06.000000000 +0100
@@ -20,21 +20,6 @@
__FILE__, __LINE__, &(e), __pgd_val(e), \
(pgd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT)
-static inline int pud_none(pud_t pud)
-{
- return __pud_val(pud) == 0;
-
-}
-static inline int pud_bad(pud_t pud)
-{
- return (__pud_val(pud) & ~(PTE_PFN_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
-}
-
-static inline int pud_present(pud_t pud)
-{
- return __pud_val(pud) & _PAGE_PRESENT;
-}
-
/* Rules for using set_pte: the pte being assigned *must* be
* either not present or in a state where the hardware will
* not attempt to update the pte. In places where this is
@@ -102,15 +87,6 @@ static inline void pud_clear(pud_t *pudp
xen_tlb_flush();
}
-#define pud_page(pud) pfn_to_page(pud_val(pud) >> PAGE_SHIFT)
-
-#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_PFN_MASK))
-
-
-/* Find an entry in the second-level page table.. */
-#define pmd_offset(pud, address) ((pmd_t *)pud_page_vaddr(*(pud)) + \
- pmd_index(address))
-
#ifdef CONFIG_SMP
static inline pte_t xen_ptep_get_and_clear(pte_t *ptep, pte_t res)
{
@@ -127,17 +103,6 @@ static inline pte_t xen_ptep_get_and_cle
#define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte)
#endif
-#define __HAVE_ARCH_PTE_SAME
-static inline int pte_same(pte_t a, pte_t b)
-{
- return a.pte_low == b.pte_low && a.pte_high == b.pte_high;
-}
-
-static inline int pte_none(pte_t pte)
-{
- return !(pte.pte_low | pte.pte_high);
-}
-
#define __pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) | \
((_pte).pte_high << (32-PAGE_SHIFT)))
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/pgtable-3level-defs.h 2010-03-24 15:14:47.000000000 +0100
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,24 +0,0 @@
-#ifndef _ASM_X86_PGTABLE_3LEVEL_DEFS_H
-#define _ASM_X86_PGTABLE_3LEVEL_DEFS_H
-
-#define SHARED_KERNEL_PMD 0
-
-/*
- * PGDIR_SHIFT determines what a top-level page table entry can map
- */
-#define PGDIR_SHIFT 30
-#define PTRS_PER_PGD 4
-
-/*
- * PMD_SHIFT determines the size of the area a middle-level
- * page table can map
- */
-#define PMD_SHIFT 21
-#define PTRS_PER_PMD 512
-
-/*
- * entries per page directory level
- */
-#define PTRS_PER_PTE 512
-
-#endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/pgtable-3level_types.h 2010-03-24 15:25:06.000000000 +0100
@@ -0,0 +1,44 @@
+#ifndef _ASM_X86_PGTABLE_3LEVEL_DEFS_H
+#define _ASM_X86_PGTABLE_3LEVEL_DEFS_H
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+
+typedef u64 pteval_t;
+typedef u64 pmdval_t;
+typedef u64 pudval_t;
+typedef u64 pgdval_t;
+typedef u64 pgprotval_t;
+
+typedef union {
+ struct {
+ unsigned long pte_low, pte_high;
+ };
+ pteval_t pte;
+} pte_t;
+#endif /* !__ASSEMBLY__ */
+
+#define SHARED_KERNEL_PMD 0
+
+#define PAGETABLE_LEVELS 3
+
+/*
+ * PGDIR_SHIFT determines what a top-level page table entry can map
+ */
+#define PGDIR_SHIFT 30
+#define PTRS_PER_PGD 4
+
+/*
+ * PMD_SHIFT determines the size of the area a middle-level
+ * page table can map
+ */
+#define PMD_SHIFT 21
+#define PTRS_PER_PMD 512
+
+/*
+ * entries per page directory level
+ */
+#define PTRS_PER_PTE 512
+
+#endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */
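
The constants in this new header pin down the PAE geometry: 4 top-level
entries of 1 GiB each, 512 pmds of 2 MiB each, 512 ptes of 4 KiB each. A
quick standalone check of how a 32-bit virtual address splits under these
shifts (the address is arbitrary; PAGE_SHIFT assumed 12 as elsewhere on
x86):

#include <stdio.h>

#define PGDIR_SHIFT  30
#define PTRS_PER_PGD 4
#define PMD_SHIFT    21
#define PTRS_PER_PMD 512
#define PAGE_SHIFT   12
#define PTRS_PER_PTE 512

int main(void)
{
        unsigned long va = 0xc0100000UL;

        printf("pgd %lu pmd %lu pte %lu\n",
               (va >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1),   /* 3   */
               (va >> PMD_SHIFT)   & (PTRS_PER_PMD - 1),   /* 0   */
               (va >> PAGE_SHIFT)  & (PTRS_PER_PTE - 1));  /* 256 */
        return 0;
}
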
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/pgtable_32.h 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/pgtable_32.h 2010-03-24 15:25:06.000000000 +0100
@@ -1,6 +1,8 @@
#ifndef _ASM_X86_PGTABLE_32_H
#define _ASM_X86_PGTABLE_32_H
+#include <asm/pgtable_32_types.h>
+
/*
* The Linux memory management assumes a three-level page table setup. On
* the i386, we use that, but "fold" the mid level into the top-level page
@@ -31,47 +33,6 @@ void paging_init(void);
extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
-/*
- * The Linux x86 paging architecture is 'compile-time dual-mode', it
- * implements both the traditional 2-level x86 page tables and the
- * newer 3-level PAE-mode page tables.
- */
-#ifdef CONFIG_X86_PAE
-# include <asm/pgtable-3level-defs.h>
-# define PMD_SIZE (1UL << PMD_SHIFT)
-# define PMD_MASK (~(PMD_SIZE - 1))
-#else
-# include <asm/pgtable-2level-defs.h>
-#endif
-
-#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
-#define PGDIR_MASK (~(PGDIR_SIZE - 1))
-
-/* Just any arbitrary offset to the start of the vmalloc VM area: the
- * current 8MB value just means that there will be a 8MB "hole" after the
- * physical memory until the kernel virtual memory starts. That means that
- * any out-of-bounds memory accesses will hopefully be caught.
- * The vmalloc() routines leaves a hole of 4kB between each vmalloced
- * area for the same reason. ;)
- */
-#define VMALLOC_OFFSET (8 * 1024 * 1024)
-#define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET)
-#ifdef CONFIG_X86_PAE
-#define LAST_PKMAP 512
-#else
-#define LAST_PKMAP 1024
-#endif
-
-#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE * (LAST_PKMAP + 1)) \
- & PMD_MASK)
-
-#ifdef CONFIG_HIGHMEM
-# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE)
-#else
-# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE)
-#endif
-
-#define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE)
/*
* Define this if things work differently on an i386 and an i486:
@@ -80,66 +41,12 @@ extern void set_pmd_pfn(unsigned long, u
*/
#undef TEST_ACCESS_OK
-/* The boot page tables (all created as a single array) */
-extern unsigned long pg0[];
-
-#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
-
-/* To avoid harmful races, pmd_none(x) should check only the lower when PAE */
-#define pmd_none(x) (!(unsigned long)__pmd_val(x))
-#if CONFIG_XEN_COMPAT <= 0x030002
-/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
- can temporarily clear it. */
-#define pmd_present(x) (__pmd_val(x))
-#define pmd_bad(x) ((__pmd_val(x) & (PTE_FLAGS_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
-#else
-#define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
-#define pmd_bad(x) ((__pmd_val(x) & (PTE_FLAGS_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
-#endif
-
-
-#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
-
#ifdef CONFIG_X86_PAE
# include <asm/pgtable-3level.h>
#else
# include <asm/pgtable-2level.h>
#endif
-/*
- * Conversion functions: convert a page and protection to a page entry,
- * and a page entry and page directory to the page they refer to.
- */
-#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
-
-
-static inline int pud_large(pud_t pud) { return 0; }
-
-/*
- * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
- *
- * this macro returns the index of the entry in the pmd page which would
- * control the given virtual address
- */
-#define pmd_index(address) \
- (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
-
-/*
- * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE]
- *
- * this macro returns the index of the entry in the pte page which would
- * control the given virtual address
- */
-#define pte_index(address) \
- (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
-#define pte_offset_kernel(dir, address) \
- ((pte_t *)pmd_page_vaddr(*(dir)) + pte_index((address)))
-
-#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
-
-#define pmd_page_vaddr(pmd) \
- ((unsigned long)__va(pmd_val((pmd)) & PTE_PFN_MASK))
-
#if defined(CONFIG_HIGHPTE)
#define pte_offset_map(dir, address) \
((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE0) + \
@@ -185,7 +92,4 @@ void make_lowmem_page_writable(void *va,
#define kern_addr_valid(kaddr) (0)
#endif
-#define io_remap_pfn_range(vma, from, pfn, size, prot) \
- direct_remap_pfn_range(vma, from, pfn, size, prot, DOMID_IO)
-
#endif /* _ASM_X86_PGTABLE_32_H */
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/pgtable_64.h 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/pgtable_64.h 2010-03-24 15:25:06.000000000 +0100
@@ -2,6 +2,8 @@
#define _ASM_X86_PGTABLE_64_H
#include <linux/const.h>
+#include <asm/pgtable_64_types.h>
+
#ifndef __ASSEMBLY__
/*
@@ -12,12 +14,12 @@
#include <linux/bitops.h>
#include <linux/threads.h>
#include <linux/sched.h>
-#include <asm/pda.h>
#ifdef CONFIG_XEN
extern pud_t level3_user_pgt[512];
extern void xen_init_pt(void);
+extern void xen_switch_pt(void);
#endif
extern pud_t level3_kernel_pgt[512];
@@ -33,39 +35,13 @@ extern void paging_init(void);
#endif /* !__ASSEMBLY__ */
-#define SHARED_KERNEL_PMD 0
-
-/*
- * PGDIR_SHIFT determines what a top-level page table entry can map
- */
-#define PGDIR_SHIFT 39
-#define PTRS_PER_PGD 512
-
-/*
- * 3rd level page
- */
-#define PUD_SHIFT 30
-#define PTRS_PER_PUD 512
-
-/*
- * PMD_SHIFT determines the size of the area a middle-level
- * page table can map
- */
-#define PMD_SHIFT 21
-#define PTRS_PER_PMD 512
-
-/*
- * entries per page directory level
- */
-#define PTRS_PER_PTE 512
-
#ifndef __ASSEMBLY__
#define pte_ERROR(e) \
printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", \
__FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e))
#define pmd_ERROR(e) \
- printk("%s:%d: bad pmd %p(%016lx pfn %010Lx).\n", \
+ printk("%s:%d: bad pmd %p(%016lx pfn %010lx).\n", \
__FILE__, __LINE__, &(e), __pmd_val(e), pmd_pfn(e))
#define pud_ERROR(e) \
printk("%s:%d: bad pud %p(%016lx pfn %010Lx).\n", \
@@ -76,9 +52,6 @@ extern void paging_init(void);
__FILE__, __LINE__, &(e), __pgd_val(e), \
(pgd_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
-#define pgd_none(x) (!__pgd_val(x))
-#define pud_none(x) (!__pud_val(x))
-
struct mm_struct;
void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
@@ -138,48 +111,6 @@ static inline void xen_pgd_clear(pgd_t *
xen_set_pgd(__user_pgd(pgd), xen_make_pgd(0));
}
-#define pte_same(a, b) ((a).pte == (b).pte)
-
-#endif /* !__ASSEMBLY__ */
-
-#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT)
-#define PMD_MASK (~(PMD_SIZE - 1))
-#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT)
-#define PUD_MASK (~(PUD_SIZE - 1))
-#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
-#define PGDIR_MASK (~(PGDIR_SIZE - 1))
-
-#define MAX_PHYSMEM_BITS 43
-#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
-#define VMALLOC_START _AC(0xffffc20000000000, UL)
-#define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
-#define VMEMMAP_START _AC(0xffffe20000000000, UL)
-#define MODULES_VADDR _AC(0xffffffffa0000000, UL)
-#define MODULES_END _AC(0xffffffffff000000, UL)
-#define MODULES_LEN (MODULES_END - MODULES_VADDR)
-
-#ifndef __ASSEMBLY__
-
-static inline int pgd_bad(pgd_t pgd)
-{
- return (__pgd_val(pgd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
-}
-
-static inline int pud_bad(pud_t pud)
-{
- return (__pud_val(pud) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
-}
-
-static inline int pmd_bad(pmd_t pmd)
-{
- return (__pmd_val(pmd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
-}
-
-#define pte_none(x) (!(x).pte)
-#define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE))
-
-#define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */
-
#define __pte_mfn(_pte) (((_pte).pte & PTE_PFN_MASK) >> PAGE_SHIFT)
/*
@@ -190,47 +121,12 @@ static inline int pmd_bad(pmd_t pmd)
/*
* Level 4 access.
*/
-#define pgd_page_vaddr(pgd) \
- ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_PFN_MASK))
-#define pgd_page(pgd) (pfn_to_page(pgd_val((pgd)) >> PAGE_SHIFT))
-#define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT)
static inline int pgd_large(pgd_t pgd) { return 0; }
#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
/* PUD - Level3 access */
-/* to find an entry in a page-table-directory. */
-#define pud_page_vaddr(pud) \
- ((unsigned long)__va(pud_val((pud)) & PHYSICAL_PAGE_MASK))
-#define pud_page(pud) (pfn_to_page(pud_val((pud)) >> PAGE_SHIFT))
-#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
-#define pud_offset(pgd, address) \
- ((pud_t *)pgd_page_vaddr(*(pgd)) + pud_index((address)))
-#define pud_present(pud) (__pud_val(pud) & _PAGE_PRESENT)
-
-static inline int pud_large(pud_t pte)
-{
- return (__pud_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
- (_PAGE_PSE | _PAGE_PRESENT);
-}
/* PMD - Level 2 access */
-#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_PFN_MASK))
-#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
-
-#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
-#define pmd_offset(dir, address) ((pmd_t *)pud_page_vaddr(*(dir)) + \
- pmd_index(address))
-#define pmd_none(x) (!__pmd_val(x))
-#if CONFIG_XEN_COMPAT <= 0x030002
-/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
- can temporarily clear it. */
-#define pmd_present(x) (__pmd_val(x))
-#else
-#define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
-#endif
-#define pfn_pmd(nr, prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val((prot))))
-#define pmd_pfn(x) ((pmd_val((x)) & __PHYSICAL_MASK) >> PAGE_SHIFT)
-
#define pte_to_pgoff(pte) ((__pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | \
_PAGE_FILE })
@@ -238,13 +134,6 @@ static inline int pud_large(pud_t pte)
/* PTE - Level 1 access. */
-/* page, protection -> pte */
-#define mk_pte(page, pgprot) pfn_pte(page_to_pfn((page)), (pgprot))
-
-#define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
-#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \
- pte_index((address)))
-
/* x86-64 always has all page tables mapped. */
#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address))
@@ -278,9 +167,6 @@ static inline int pud_large(pud_t pte)
extern int kern_addr_valid(unsigned long addr);
extern void cleanup_highmap(void);
-#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
- direct_remap_pfn_range(vma, vaddr, pfn, size, prot, DOMID_IO)
-
#define HAVE_ARCH_UNMAPPED_AREA
#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/pgtable_64_types.h 2010-03-24 15:25:06.000000000 +0100
@@ -0,0 +1,63 @@
+#ifndef _ASM_X86_PGTABLE_64_DEFS_H
+#define _ASM_X86_PGTABLE_64_DEFS_H
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+
+/*
+ * These are used to make use of C type-checking.
+ */
+typedef unsigned long pteval_t;
+typedef unsigned long pmdval_t;
+typedef unsigned long pudval_t;
+typedef unsigned long pgdval_t;
+typedef unsigned long pgprotval_t;
+
+typedef union { pteval_t pte; unsigned int pte_low; } pte_t;
+
+#endif /* !__ASSEMBLY__ */
+
+#define SHARED_KERNEL_PMD 0
+#define PAGETABLE_LEVELS 4
+
+/*
+ * PGDIR_SHIFT determines what a top-level page table entry can map
+ */
+#define PGDIR_SHIFT 39
+#define PTRS_PER_PGD 512
+
+/*
+ * 3rd level page
+ */
+#define PUD_SHIFT 30
+#define PTRS_PER_PUD 512
+
+/*
+ * PMD_SHIFT determines the size of the area a middle-level
+ * page table can map
+ */
+#define PMD_SHIFT 21
+#define PTRS_PER_PMD 512
+
+/*
+ * entries per page directory level
+ */
+#define PTRS_PER_PTE 512
+
+#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT)
+#define PMD_MASK (~(PMD_SIZE - 1))
+#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT)
+#define PUD_MASK (~(PUD_SIZE - 1))
+#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
+#define PGDIR_MASK (~(PGDIR_SIZE - 1))
+
+#define MAX_PHYSMEM_BITS 43
+#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
+#define VMALLOC_START _AC(0xffffc20000000000, UL)
+#define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
+#define VMEMMAP_START _AC(0xffffe20000000000, UL)
+#define MODULES_VADDR _AC(0xffffffffa0000000, UL)
+#define MODULES_END _AC(0xffffffffff000000, UL)
+#define MODULES_LEN (MODULES_END - MODULES_VADDR)
+
+#endif /* _ASM_X86_PGTABLE_64_DEFS_H */
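
The 64-bit variant of the same header fixes PAGETABLE_LEVELS at 4, with a
9-bit index at every level. As a sanity check on the shifts just defined,
the reach of a single entry at each level works out as below (plain C,
outside the kernel; PAGE_SHIFT assumed 12):

#include <stdio.h>

#define PAGE_SHIFT  12
#define PMD_SHIFT   21
#define PUD_SHIFT   30
#define PGDIR_SHIFT 39

int main(void)
{
        printf("pte maps %llu KiB\n", (1ULL << PAGE_SHIFT)  >> 10);  /* 4   */
        printf("pmd maps %llu MiB\n", (1ULL << PMD_SHIFT)   >> 20);  /* 2   */
        printf("pud maps %llu GiB\n", (1ULL << PUD_SHIFT)   >> 30);  /* 1   */
        printf("pgd maps %llu GiB\n", (1ULL << PGDIR_SHIFT) >> 30);  /* 512 */
        return 0;
}
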
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/pgtable_types.h 2010-03-24 15:25:06.000000000 +0100
@@ -0,0 +1,388 @@
+#ifndef _ASM_X86_PGTABLE_DEFS_H
+#define _ASM_X86_PGTABLE_DEFS_H
+
+#include <linux/const.h>
+#include <asm/page_types.h>
+
+#define FIRST_USER_ADDRESS 0
+
+#define _PAGE_BIT_PRESENT 0 /* is present */
+#define _PAGE_BIT_RW 1 /* writeable */
+#define _PAGE_BIT_USER 2 /* userspace addressable */
+#define _PAGE_BIT_PWT 3 /* page write through */
+#define _PAGE_BIT_PCD 4 /* page cache disabled */
+#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */
+#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */
+#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
+#define _PAGE_BIT_PAT 7 /* on 4KB pages */
+#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
+#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
+#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */
+#define _PAGE_BIT_UNUSED3 11
+#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
+#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
+#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
+#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
+
+/* If _PAGE_BIT_PRESENT is clear, we use these: */
+/* - if the user mapped it with PROT_NONE; pte_present gives true */
+#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
+/* - set: nonlinear file mapping, saved PTE; unset: swap */
+#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY
+
+#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
+#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW)
+#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER)
+#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
+#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
+#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
+#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
+#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
+#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
+#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
+#define _PAGE_UNUSED3 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED3)
+#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
+#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
+#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
+#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
+#define __HAVE_ARCH_PTE_SPECIAL
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
+#else
+#define _PAGE_NX (_AT(pteval_t, 0))
+#endif
+
+#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
+#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+
+#ifndef __ASSEMBLY__
+#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002
+extern unsigned int __kernel_page_user;
+#else
+#define __kernel_page_user 0
+#endif
+#endif
+
+#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_DIRTY)
+#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
+ _PAGE_DIRTY | __kernel_page_user)
+
+/* Set of bits not changed in pte_modify */
+#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_CACHE_MASK | _PAGE_IOMAP | \
+ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
+
+/*
+ * PAT settings are part of the hypervisor interface, which sets the
+ * MSR to 0x050100070406 (i.e. WB, WT, UC-, UC, WC, WP [, UC, UC]).
+ */
+#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT | _PAGE_PAT)
+#define _PAGE_CACHE_WB (0)
+#define _PAGE_CACHE_WT (_PAGE_PWT)
+#define _PAGE_CACHE_WC (_PAGE_PAT)
+#define _PAGE_CACHE_WP (_PAGE_PAT | _PAGE_PWT)
+#define _PAGE_CACHE_UC_MINUS (_PAGE_PCD)
+#define _PAGE_CACHE_UC (_PAGE_PCD | _PAGE_PWT)
+
+#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
+#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_NX)
+
+#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \
+ _PAGE_USER | _PAGE_ACCESSED)
+#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_NX)
+#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
+ _PAGE_ACCESSED)
+#define PAGE_COPY PAGE_COPY_NOEXEC
+#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_NX)
+#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
+ _PAGE_ACCESSED)
+
+#define __PAGE_KERNEL_EXEC \
+ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
+#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
+
+#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
+#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
+#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC)
+#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
+#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
+#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
+#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
+#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
+
+#define __PAGE_KERNEL_IO (__PAGE_KERNEL | _PAGE_IOMAP)
+#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE | _PAGE_IOMAP)
+#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS | _PAGE_IOMAP)
+#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC | _PAGE_IOMAP)
+
+#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
+#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
+#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
+#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
+#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC)
+#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
+#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS)
+#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE)
+#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
+#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE)
+#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
+#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
+#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE)
+
+#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO)
+#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE)
+#define PAGE_KERNEL_IO_UC_MINUS __pgprot(__PAGE_KERNEL_IO_UC_MINUS)
+#define PAGE_KERNEL_IO_WC __pgprot(__PAGE_KERNEL_IO_WC)
+
+/* xwr */
+#define __P000 PAGE_NONE
+#define __P001 PAGE_READONLY
+#define __P010 PAGE_COPY
+#define __P011 PAGE_COPY
+#define __P100 PAGE_READONLY_EXEC
+#define __P101 PAGE_READONLY_EXEC
+#define __P110 PAGE_COPY_EXEC
+#define __P111 PAGE_COPY_EXEC
+
+#define __S000 PAGE_NONE
+#define __S001 PAGE_READONLY
+#define __S010 PAGE_SHARED
+#define __S011 PAGE_SHARED
+#define __S100 PAGE_READONLY_EXEC
+#define __S101 PAGE_READONLY_EXEC
+#define __S110 PAGE_SHARED_EXEC
+#define __S111 PAGE_SHARED_EXEC
+
+/*
+ * early identity mapping pte attrib macros.
+ */
+#ifdef CONFIG_X86_64
+#define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
+#else
+/*
+ * For PDE_IDENT_ATTR include USER bit. As the PDE and PTE protection
+ * bits are combined, this will allow the user to access the high address mapped
+ * VDSO in the presence of CONFIG_COMPAT_VDSO
+ */
+#define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */
+#define PDE_IDENT_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */
+#define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */
+#endif
+
+#ifdef CONFIG_X86_32
+# include <asm/pgtable_32_types.h>
+#else
+# include "pgtable_64_types.h"
+#endif
+
+#ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+
+/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
+#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
+
+/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */
+#define PTE_FLAGS_MASK (~PTE_PFN_MASK)
+
+typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;
+
+#include <asm/maddr.h>
+
+typedef struct { pgdval_t pgd; } pgd_t;
+
+#define __pgd_ma(x) ((pgd_t) { (x) } )
+static inline pgd_t xen_make_pgd(pgdval_t val)
+{
+ if (val & _PAGE_PRESENT)
+ val = pte_phys_to_machine(val);
+ return (pgd_t) { val };
+}
+
+#define __pgd_val(x) ((x).pgd)
+static inline pgdval_t xen_pgd_val(pgd_t pgd)
+{
+ pgdval_t ret = __pgd_val(pgd);
+#if PAGETABLE_LEVELS == 2 && CONFIG_XEN_COMPAT <= 0x030002
+ if (ret)
+ ret = machine_to_phys(ret) | _PAGE_PRESENT;
+#else
+ if (ret & _PAGE_PRESENT)
+ ret = pte_machine_to_phys(ret);
+#endif
+ return ret;
+}
+
+static inline pgdval_t pgd_flags(pgd_t pgd)
+{
+ return __pgd_val(pgd) & PTE_FLAGS_MASK;
+}
+
+#if PAGETABLE_LEVELS > 3
+typedef struct { pudval_t pud; } pud_t;
+
+#define __pud_ma(x) ((pud_t) { (x) } )
+static inline pud_t xen_make_pud(pudval_t val)
+{
+ if (val & _PAGE_PRESENT)
+ val = pte_phys_to_machine(val);
+ return (pud_t) { val };
+}
+
+#define __pud_val(x) ((x).pud)
+static inline pudval_t xen_pud_val(pud_t pud)
+{
+ pudval_t ret = __pud_val(pud);
+ if (ret & _PAGE_PRESENT)
+ ret = pte_machine_to_phys(ret);
+ return ret;
+}
+#else
+#include <asm-generic/pgtable-nopud.h>
+
+#define __pud_val(x) __pgd_val((x).pgd)
+static inline pudval_t xen_pud_val(pud_t pud)
+{
+ return xen_pgd_val(pud.pgd);
+}
+#endif
+
+#if PAGETABLE_LEVELS > 2
+typedef struct { pmdval_t pmd; } pmd_t;
+
+#define __pmd_ma(x) ((pmd_t) { (x) } )
+static inline pmd_t xen_make_pmd(pmdval_t val)
+{
+ if (val & _PAGE_PRESENT)
+ val = pte_phys_to_machine(val);
+ return (pmd_t) { val };
+}
+
+#define __pmd_val(x) ((x).pmd)
+static inline pmdval_t xen_pmd_val(pmd_t pmd)
+{
+ pmdval_t ret = __pmd_val(pmd);
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (ret)
+ ret = pte_machine_to_phys(ret) | _PAGE_PRESENT;
+#else
+ if (ret & _PAGE_PRESENT)
+ ret = pte_machine_to_phys(ret);
+#endif
+ return ret;
+}
+#else
+#include <asm-generic/pgtable-nopmd.h>
+
+#define __pmd_ma(x) ((pmd_t) { .pud.pgd = __pgd_ma(x) } )
+#define __pmd_val(x) __pgd_val((x).pud.pgd)
+static inline pmdval_t xen_pmd_val(pmd_t pmd)
+{
+ return xen_pgd_val(pmd.pud.pgd);
+}
+#endif
+
+static inline pudval_t pud_flags(pud_t pud)
+{
+ return __pud_val(pud) & PTE_FLAGS_MASK;
+}
+
+static inline pmdval_t pmd_flags(pmd_t pmd)
+{
+ return __pmd_val(pmd) & PTE_FLAGS_MASK;
+}
+
+#define __pte_ma(x) ((pte_t) { .pte = (x) } )
+static inline pte_t xen_make_pte(pteval_t val)
+{
+ if ((val & (_PAGE_PRESENT|_PAGE_IOMAP)) == _PAGE_PRESENT)
+ val = pte_phys_to_machine(val);
+ return (pte_t) { .pte = val };
+}
+
+#define __pte_val(x) ((x).pte)
+static inline pteval_t xen_pte_val(pte_t pte)
+{
+ pteval_t ret = __pte_val(pte);
+ if ((pte.pte_low & (_PAGE_PRESENT|_PAGE_IOMAP)) == _PAGE_PRESENT)
+ ret = pte_machine_to_phys(ret);
+ return ret;
+}
+
+static inline pteval_t pte_flags(pte_t pte)
+{
+ return __pte_val(pte) & PTE_FLAGS_MASK;
+}
+
+#define pgprot_val(x) ((x).pgprot)
+#define __pgprot(x) ((pgprot_t) { (x) } )
+
+typedef struct page *pgtable_t;
+
+extern pteval_t __supported_pte_mask;
+extern int nx_enabled;
+extern void set_nx(void);
+
+#define pgprot_writecombine pgprot_writecombine
+extern pgprot_t pgprot_writecombine(pgprot_t prot);
+
+#ifndef CONFIG_XEN
+/* Indicate that x86 has its own track and untrack pfn vma functions */
+#define __HAVE_PFNMAP_TRACKING
+#endif
+
+#define __HAVE_PHYS_MEM_ACCESS_PROT
+struct file;
+pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
+ unsigned long size, pgprot_t vma_prot);
+int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
+ unsigned long size, pgprot_t *vma_prot);
+
+/* Install a pte for a particular vaddr in kernel space. */
+void set_pte_vaddr(unsigned long vaddr, pte_t pte);
+
+#ifndef CONFIG_XEN
+extern void native_pagetable_setup_start(pgd_t *base);
+extern void native_pagetable_setup_done(pgd_t *base);
+#else
+static inline void xen_pagetable_setup_start(pgd_t *base) {}
+static inline void xen_pagetable_setup_done(pgd_t *base) {}
+#endif
+
+struct seq_file;
+extern void arch_report_meminfo(struct seq_file *m);
+
+enum {
+ PG_LEVEL_NONE,
+ PG_LEVEL_4K,
+ PG_LEVEL_2M,
+ PG_LEVEL_1G,
+ PG_LEVEL_NUM
+};
+
+#ifdef CONFIG_PROC_FS
+extern void update_page_count(int level, unsigned long pages);
+#else
+static inline void update_page_count(int level, unsigned long pages) { }
+#endif
+
+/*
+ * Helper function that returns the kernel pagetable entry controlling
+ * the virtual address 'address'. NULL means no pagetable entry present.
+ * NOTE: the return type is pte_t but if the pmd is PSE then we return it
+ * as a pte too.
+ */
+extern pte_t *lookup_address(unsigned long address, unsigned int *level);
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* _ASM_X86_PGTABLE_DEFS_H */
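
The xen_make_*/xen_*_val pairs above translate the frame number between
Xen's pseudo-physical and machine spaces, but only for present entries
(and, for ptes, only when _PAGE_IOMAP is clear). A toy model of that
round trip, with hypothetical four-frame p2m/m2p tables standing in for
the hypervisor-maintained ones:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t pteval_t;

#define PAGE_SHIFT    12
#define _PAGE_PRESENT 0x1ULL

/* Hypothetical translation tables for a 4-frame guest. */
static const uint64_t p2m[4] = { 7, 2, 5, 0 };
static const uint64_t m2p[8] = { 3, 0, 1, 0, 0, 2, 0, 0 };

static pteval_t phys_to_machine(pteval_t v)
{
        return (p2m[v >> PAGE_SHIFT] << PAGE_SHIFT) | (v & 0xfff);
}

static pteval_t machine_to_phys(pteval_t v)
{
        return (m2p[v >> PAGE_SHIFT] << PAGE_SHIFT) | (v & 0xfff);
}

static pteval_t make_pte(pteval_t val)      /* cf. xen_make_pte() */
{
        return (val & _PAGE_PRESENT) ? phys_to_machine(val) : val;
}

static pteval_t read_pte_val(pteval_t pte)  /* cf. xen_pte_val() */
{
        return (pte & _PAGE_PRESENT) ? machine_to_phys(pte) : pte;
}

int main(void)
{
        pteval_t pte = make_pte((2ULL << PAGE_SHIFT) | _PAGE_PRESENT);
        printf("machine %#llx  phys %#llx\n",
               (unsigned long long)pte,
               (unsigned long long)read_pte_val(pte));  /* 0x5001  0x2001 */
        return 0;
}
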
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/processor.h 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/processor.h 2010-03-24 15:25:06.000000000 +0100
@@ -16,6 +16,7 @@ struct mm_struct;
#include <asm/cpufeature.h>
#include <asm/system.h>
#include <asm/page.h>
+#include <asm/pgtable_types.h>
#include <asm/percpu.h>
#include <asm/msr.h>
#include <asm/desc_defs.h>
@@ -74,10 +75,10 @@ struct cpuinfo_x86 {
char pad0;
#else
/* Number of 4K pages in DTLB/ITLB combined(in pages): */
- int x86_tlbsize;
+ int x86_tlbsize;
+#endif
__u8 x86_virt_bits;
__u8 x86_phys_bits;
-#endif
/* CPUID returned core id bits: */
__u8 x86_coreid_bits;
/* Max extended CPUID function supported: */
@@ -92,9 +93,9 @@ struct cpuinfo_x86 {
int x86_cache_alignment; /* In bytes */
int x86_power;
unsigned long loops_per_jiffy;
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
/* cpus sharing the last level cache: */
- cpumask_t llc_shared_map;
+ cpumask_var_t llc_shared_map;
#endif
/* cpuid returned max cores value: */
u16 x86_max_cores;
@@ -138,7 +139,7 @@ extern struct cpuinfo_x86 new_cpu_data;
extern __u32 cleared_cpu_caps[NCAPINTS];
#ifdef CONFIG_SMP
-DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
+DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
#define cpu_data(cpu) per_cpu(cpu_info, cpu)
#define current_cpu_data __get_cpu_var(cpu_info)
#else
@@ -251,7 +252,6 @@ struct x86_hw_tss {
#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
#define INVALID_IO_BITMAP_OFFSET 0x8000
-#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
#ifndef CONFIG_X86_NO_TSS
struct tss_struct {
@@ -267,11 +267,6 @@ struct tss_struct {
* be within the limit.
*/
unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
- /*
- * Cache the current maximum and the last task that used the bitmap:
- */
- unsigned long io_bitmap_max;
- struct thread_struct *io_bitmap_owner;
/*
* .. and then another 0x100 bytes for the emergency kernel stack:
@@ -280,7 +275,7 @@ struct tss_struct {
} ____cacheline_aligned;
-DECLARE_PER_CPU(struct tss_struct, init_tss);
+DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss);
/*
* Save the original ist values for checking stack pointers during debugging
@@ -363,6 +358,11 @@ struct i387_soft_struct {
u32 entry_eip;
};
+struct ymmh_struct {
+ /* 16 * 16 bytes for each YMMH-reg = 256 bytes */
+ u32 ymmh_space[64];
+};
+
struct xsave_hdr_struct {
u64 xstate_bv;
u64 reserved1[2];
@@ -372,6 +372,7 @@ struct xsave_hdr_struct {
struct xsave_struct {
struct i387_fxsave_struct i387;
struct xsave_hdr_struct xsave_hdr;
+ struct ymmh_struct ymmh;
/* new processor state extensions will go here */
} __attribute__ ((packed, aligned (64)));
@@ -382,11 +383,37 @@ union thread_xstate {
struct xsave_struct xsave;
};
-#if defined(CONFIG_X86_64) && !defined(CONFIG_X86_NO_TSS)
+#ifdef CONFIG_X86_64
+#ifndef CONFIG_X86_NO_TSS
DECLARE_PER_CPU(struct orig_ist, orig_ist);
#endif
-extern void print_cpu_info(struct cpuinfo_x86 *);
+union irq_stack_union {
+ char irq_stack[IRQ_STACK_SIZE];
+ /*
+ * GCC hardcodes the stack canary as %gs:40. Since the
+ * irq_stack is the object at %gs:0, we reserve the bottom
+ * 48 bytes of the irq stack for the canary.
+ */
+ struct {
+ char gs_base[40];
+ unsigned long stack_canary;
+ };
+};
+
+DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union);
+DECLARE_INIT_PER_CPU(irq_stack_union);
+
+DECLARE_PER_CPU(char *, irq_stack_ptr);
+DECLARE_PER_CPU(unsigned int, irq_count);
+extern unsigned long kernel_eflags;
+extern asmlinkage void ignore_sysret(void);
+#else /* X86_64 */
+#ifdef CONFIG_CC_STACKPROTECTOR
+DECLARE_PER_CPU(unsigned long, stack_canary);
+#endif
+#endif /* X86_64 */
+
extern unsigned int xstate_size;
extern void free_thread_xstate(struct task_struct *);
extern struct kmem_cache *task_xstate_cachep;
@@ -659,6 +686,7 @@ static inline void __sti_mwait(unsigned
extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
extern void select_idle_routine(const struct cpuinfo_x86 *c);
+extern void init_c1e_mask(void);
extern unsigned long boot_option_idle_override;
extern unsigned long idle_halt;
@@ -696,9 +724,9 @@ extern int sysenter_setup(void);
extern struct desc_ptr early_gdt_descr;
extern void cpu_set_gdt(int);
-extern void switch_to_new_gdt(void);
+extern void switch_to_new_gdt(int);
+extern void load_percpu_segment(int);
extern void cpu_init(void);
-extern void init_gdt(int cpu);
static inline unsigned long get_debugctlmsr(void)
{
@@ -783,6 +811,7 @@ static inline void spin_lock_prefetch(co
* User space process size: 3GB (default).
*/
#define TASK_SIZE PAGE_OFFSET
+#define TASK_SIZE_MAX TASK_SIZE
#define STACK_TOP TASK_SIZE
#define STACK_TOP_MAX STACK_TOP
@@ -840,7 +869,7 @@ extern unsigned long thread_saved_pc(str
/*
* User space process size. 47bits minus one guard page.
*/
-#define TASK_SIZE64 ((1UL << 47) - PAGE_SIZE)
+#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE)
/* This decides where the kernel will search for a free chunk of vm
* space during mmap's.
@@ -849,12 +878,12 @@ extern unsigned long thread_saved_pc(str
0xc0000000 : 0xFFFFe000)
#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \
- IA32_PAGE_OFFSET : TASK_SIZE64)
+ IA32_PAGE_OFFSET : TASK_SIZE_MAX)
#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \
- IA32_PAGE_OFFSET : TASK_SIZE64)
+ IA32_PAGE_OFFSET : TASK_SIZE_MAX)
#define STACK_TOP TASK_SIZE
-#define STACK_TOP_MAX TASK_SIZE64
+#define STACK_TOP_MAX TASK_SIZE_MAX
#define INIT_THREAD { \
.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
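
The comment inside irq_stack_union above is load-bearing: gcc's stack
protector reads the canary from a fixed %gs:40, so placing the union at
the base of the per-cpu area only works if stack_canary really sits 40
bytes in. A standalone layout check (IRQ_STACK_SIZE value assumed for the
sketch; compile with -std=c11 for the anonymous struct and
_Static_assert):

#include <stddef.h>

#define IRQ_STACK_SIZE 16384  /* assumed; the kernel defines its own */

union irq_stack_union {
        char irq_stack[IRQ_STACK_SIZE];
        /* The bottom 48 bytes of the irq stack are reserved so the
         * canary lands at the offset gcc hardcodes. */
        struct {
                char gs_base[40];
                unsigned long stack_canary;
        };
};

_Static_assert(offsetof(union irq_stack_union, stack_canary) == 40,
               "stack canary is not at %gs:40");

int main(void) { return 0; }
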
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/smp.h 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/smp.h 2010-03-24 15:25:06.000000000 +0100
@@ -15,53 +15,25 @@
# include <asm/io_apic.h>
# endif
#endif
-#include <asm/pda.h>
#include <asm/thread_info.h>
-
-#ifdef CONFIG_X86_64
-
-#define cpu_callin_mask cpu_possible_mask
-#define cpu_callout_mask cpu_possible_mask
-extern cpumask_var_t cpu_initialized_mask;
-extern cpumask_var_t cpu_sibling_setup_mask;
-
-#else /* CONFIG_X86_32 */
-
-#define cpu_callin_map cpu_possible_map
-#define cpu_callout_map cpu_possible_map
-extern cpumask_t cpu_initialized;
-extern cpumask_t cpu_sibling_setup_map;
-
-#define cpu_callin_mask ((struct cpumask *)&cpu_callin_map)
-#define cpu_callout_mask ((struct cpumask *)&cpu_callout_map)
-#define cpu_initialized_mask ((struct cpumask *)&cpu_initialized)
-#define cpu_sibling_setup_mask ((struct cpumask *)&cpu_sibling_setup_map)
-
-#endif /* CONFIG_X86_32 */
-
-extern void (*mtrr_hook)(void);
-extern void zap_low_mappings(void);
-
-extern int __cpuinit get_local_pda(int cpu);
+#include <asm/cpumask.h>
extern int smp_num_siblings;
extern unsigned int num_processors;
-DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
-DECLARE_PER_CPU(cpumask_t, cpu_core_map);
+DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map);
+DECLARE_PER_CPU(cpumask_var_t, cpu_core_map);
DECLARE_PER_CPU(u16, cpu_llc_id);
-#ifdef CONFIG_X86_32
DECLARE_PER_CPU(int, cpu_number);
-#endif
static inline struct cpumask *cpu_sibling_mask(int cpu)
{
- return &per_cpu(cpu_sibling_map, cpu);
+ return per_cpu(cpu_sibling_map, cpu);
}
static inline struct cpumask *cpu_core_mask(int cpu)
{
- return &per_cpu(cpu_core_map, cpu);
+ return per_cpu(cpu_core_map, cpu);
}
DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
@@ -149,9 +121,10 @@ static inline void arch_send_call_functi
smp_ops.send_call_func_single_ipi(cpu);
}
-static inline void arch_send_call_function_ipi(cpumask_t mask)
+#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask
+static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
{
- smp_ops.send_call_func_ipi(&mask);
+ smp_ops.send_call_func_ipi(mask);
}
void cpu_disable_common(void);
@@ -176,14 +149,12 @@ void xen_send_call_func_single_ipi(int c
#define smp_send_stop xen_smp_send_stop
#define smp_send_reschedule xen_smp_send_reschedule
#define arch_send_call_function_single_ipi xen_send_call_func_single_ipi
-#define arch_send_call_function_ipi(m) xen_send_call_func_ipi(&(m))
+#define arch_send_call_function_ipi_mask xen_send_call_func_ipi
void play_dead(void);
#endif /* CONFIG_XEN */
-extern void prefill_possible_map(void);
-
void smp_store_cpu_info(int id);
#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
@@ -192,10 +163,6 @@ static inline int num_booting_cpus(void)
{
return cpumask_weight(cpu_callout_mask);
}
-#else
-static inline void prefill_possible_map(void)
-{
-}
#endif /* CONFIG_SMP */
extern unsigned disabled_cpus __cpuinitdata;
@@ -206,11 +173,11 @@ extern unsigned disabled_cpus __cpuinitd
* from the initial startup. We map APIC_BASE very early in page_setup(),
* so this is correct in the x86 case.
*/
-#define raw_smp_processor_id() (x86_read_percpu(cpu_number))
+#define raw_smp_processor_id() (percpu_read(cpu_number))
#define safe_smp_processor_id() smp_processor_id()
#elif defined(CONFIG_X86_64_SMP)
-#define raw_smp_processor_id() read_pda(cpunumber)
+#define raw_smp_processor_id() (percpu_read(cpu_number))
#define stack_smp_processor_id() \
({ \
@@ -220,10 +187,6 @@ extern unsigned disabled_cpus __cpuinitd
})
#define safe_smp_processor_id() smp_processor_id()
-#else /* !CONFIG_X86_32_SMP && !CONFIG_X86_64_SMP */
-#define cpu_physical_id(cpu) boot_cpu_physical_apicid
-#define safe_smp_processor_id() 0
-#define stack_smp_processor_id() 0
#endif
#ifdef CONFIG_X86_LOCAL_APIC
@@ -235,28 +198,9 @@ static inline int logical_smp_processor_
return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
}
-#include <mach_apicdef.h>
-static inline unsigned int read_apic_id(void)
-{
- unsigned int reg;
-
- reg = *(u32 *)(APIC_BASE + APIC_ID);
-
- return GET_APIC_ID(reg);
-}
#endif
-
-# if defined(APIC_DEFINITION) || defined(CONFIG_X86_64)
extern int hard_smp_processor_id(void);
-# else
-#include <mach_apicdef.h>
-static inline int hard_smp_processor_id(void)
-{
- /* we don't want to mark this access volatile - bad code generation */
- return read_apic_id();
-}
-# endif /* APIC_DEFINITION */
#else /* CONFIG_X86_LOCAL_APIC */
@@ -266,11 +210,5 @@ static inline int hard_smp_processor_id(
#endif /* CONFIG_X86_LOCAL_APIC */
-#ifdef CONFIG_X86_HAS_BOOT_CPU_ID
-extern unsigned char boot_cpu_id;
-#else
-#define boot_cpu_id 0
-#endif
-
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_SMP_H */
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/spinlock.h 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/spinlock.h 2010-03-24 15:25:06.000000000 +0100
@@ -255,40 +255,18 @@ static __always_inline void __ticket_spi
static inline int xen_spinlock_init(unsigned int cpu) { return 0; }
static inline void xen_spinlock_cleanup(unsigned int cpu) {}
-/*
- * Define virtualization-friendly old-style lock byte lock, for use in
- * pv_lock_ops if desired.
- *
- * This differs from the pre-2.6.24 spinlock by always using xchgb
- * rather than decb to take the lock; this allows it to use a
- * zero-initialized lock structure. It also maintains a 1-byte
- * contention counter, so that we can implement
- * __byte_spin_is_contended.
- */
-struct __byte_spinlock {
- s8 lock;
-#if NR_CPUS < 256
- s8 spinners;
-#else
-#error NR_CPUS >= 256 support not implemented
-#endif
-};
-
static inline int __byte_spin_is_locked(raw_spinlock_t *lock)
{
- struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
- return bl->lock != 0;
+ return lock->lock != 0;
}
static inline int __byte_spin_is_contended(raw_spinlock_t *lock)
{
- struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
- return bl->spinners != 0;
+ return lock->spinners != 0;
}
static inline void __byte_spin_lock(raw_spinlock_t *lock)
{
- struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
s8 val = 1;
asm("1: xchgb %1, %0\n"
@@ -301,27 +279,25 @@ static inline void __byte_spin_lock(raw_
" " LOCK_PREFIX "decb %2\n"
" jmp 1b\n"
"3:"
- : "+m" (bl->lock), "+q" (val), "+m" (bl->spinners): : "memory");
+ : "+m" (lock->lock), "+q" (val), "+m" (lock->spinners): : "memory");
}
#define __byte_spin_lock_flags(lock, flags) __byte_spin_lock(lock)
static inline int __byte_spin_trylock(raw_spinlock_t *lock)
{
- struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
u8 old = 1;
asm("xchgb %1,%0"
- : "+m" (bl->lock), "+q" (old) : : "memory");
+ : "+m" (lock->lock), "+q" (old) : : "memory");
return old == 0;
}
static inline void __byte_spin_unlock(raw_spinlock_t *lock)
{
- struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
smp_wmb();
- bl->lock = 0;
+ lock->lock = 0;
}
#define __raw_spin(n) __byte_spin_##n
@@ -422,8 +398,7 @@ static inline int __raw_read_trylock(raw
{
atomic_t *count = (atomic_t *)lock;
- atomic_dec(count);
- if (atomic_read(count) >= 0)
+ if (atomic_dec_return(count) >= 0)
return 1;
atomic_inc(count);
return 0;
@@ -450,6 +425,9 @@ static inline void __raw_write_unlock(ra
: "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory");
}
+#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock)
+#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock)
+
#define _raw_spin_relax(lock) cpu_relax()
#define _raw_read_relax(lock) cpu_relax()
#define _raw_write_relax(lock) cpu_relax()
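
The byte lock kept above acquires with xchgb and maintains a waiter count
in a second byte so __byte_spin_is_contended() stays cheap; the only
change in this hunk is that the two bytes now live directly in
raw_spinlock_t (see spinlock_types.h below) instead of behind a casted
shadow struct. A hedged user-space rendering of the same algorithm using
gcc's __atomic builtins (plain busy-wait where the kernel version polls
via the hypervisor):

#include <stdio.h>

struct byte_lock {
        unsigned char lock;       /* 0 = free, 1 = held */
        unsigned char spinners;   /* contention counter */
};

static void byte_spin_lock(struct byte_lock *l)
{
        /* xchg returns the old value: 0 means we took the lock. */
        while (__atomic_exchange_n(&l->lock, 1, __ATOMIC_ACQUIRE)) {
                __atomic_fetch_add(&l->spinners, 1, __ATOMIC_RELAXED);
                while (__atomic_load_n(&l->lock, __ATOMIC_RELAXED))
                        ;  /* the kernel version yields to Xen here */
                __atomic_fetch_sub(&l->spinners, 1, __ATOMIC_RELAXED);
        }
}

static int byte_spin_trylock(struct byte_lock *l)
{
        return __atomic_exchange_n(&l->lock, 1, __ATOMIC_ACQUIRE) == 0;
}

static void byte_spin_unlock(struct byte_lock *l)
{
        __atomic_store_n(&l->lock, 0, __ATOMIC_RELEASE);
}

int main(void)
{
        struct byte_lock l = { 0, 0 };
        byte_spin_lock(&l);
        printf("trylock while held: %d\n", byte_spin_trylock(&l));  /* 0 */
        byte_spin_unlock(&l);
        printf("trylock when free: %d\n", byte_spin_trylock(&l));   /* 1 */
        return 0;
}
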
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/spinlock_types.h 2010-03-24 15:14:47.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/spinlock_types.h 2010-03-24 15:25:06.000000000 +0100
@@ -26,6 +26,20 @@ typedef union {
# define TICKET_SHIFT 16
u16 cur, seq;
#endif
+#else
+/*
+ * This differs from the pre-2.6.24 spinlock by always using xchgb
+ * rather than decb to take the lock; this allows it to use a
+ * zero-initialized lock structure. It also maintains a 1-byte
+ * contention counter, so that we can implement
+ * __byte_spin_is_contended.
+ */
+ u8 lock;
+#if CONFIG_NR_CPUS < 256
+ u8 spinners;
+#else
+# error NR_CPUS >= 256 not implemented
+#endif
#endif
};
} raw_spinlock_t;
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/system.h 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/system.h 2010-03-24 15:25:06.000000000 +0100
@@ -21,9 +21,24 @@
struct task_struct; /* one of the stranger aspects of C forward declarations */
struct task_struct *__switch_to(struct task_struct *prev,
struct task_struct *next);
+void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p);
#ifdef CONFIG_X86_32
+#ifdef CONFIG_CC_STACKPROTECTOR
+#define __switch_canary \
+ "movl %P[task_canary](%[next]), %%ebx\n\t" \
+ "movl %%ebx, "__percpu_arg([stack_canary])"\n\t"
+#define __switch_canary_oparam \
+ , [stack_canary] "=m" (per_cpu_var(stack_canary))
+#define __switch_canary_iparam \
+ , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
+#else /* CC_STACKPROTECTOR */
+#define __switch_canary
+#define __switch_canary_oparam
+#define __switch_canary_iparam
+#endif /* CC_STACKPROTECTOR */
+
/*
* Saving eflags is important. It switches not only IOPL between tasks,
* it also protects other tasks from NT leaking through sysenter etc.
@@ -45,6 +60,7 @@ do { \
"movl %[next_sp],%%esp\n\t" /* restore ESP */ \
"movl $1f,%[prev_ip]\n\t" /* save EIP */ \
"pushl %[next_ip]\n\t" /* restore EIP */ \
+ __switch_canary \
"jmp __switch_to\n" /* regparm call */ \
"1:\t" \
"popl %%ebp\n\t" /* restore EBP */ \
@@ -59,6 +75,8 @@ do { \
"=b" (ebx), "=c" (ecx), "=d" (edx), \
"=S" (esi), "=D" (edi) \
\
+ __switch_canary_oparam \
+ \
/* input parameters: */ \
: [next_sp] "m" (next->thread.sp), \
[next_ip] "m" (next->thread.ip), \
@@ -67,6 +85,8 @@ do { \
[prev] "a" (prev), \
[next] "d" (next) \
\
+ __switch_canary_iparam \
+ \
: /* reloaded segment registers */ \
"memory"); \
} while (0)
@@ -87,27 +107,44 @@ do { \
, "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
"r12", "r13", "r14", "r15"
+#ifdef CONFIG_CC_STACKPROTECTOR
+#define __switch_canary \
+ "movq %P[task_canary](%%rsi),%%r8\n\t" \
+ "movq %%r8,"__percpu_arg([gs_canary])"\n\t"
+#define __switch_canary_oparam \
+ , [gs_canary] "=m" (per_cpu_var(irq_stack_union.stack_canary))
+#define __switch_canary_iparam \
+ , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
+#else /* CC_STACKPROTECTOR */
+#define __switch_canary
+#define __switch_canary_oparam
+#define __switch_canary_iparam
+#endif /* CC_STACKPROTECTOR */
+
/* Save restore flags to clear handle leaking NT */
#define switch_to(prev, next, last) \
- asm volatile(SAVE_CONTEXT \
+ asm volatile(SAVE_CONTEXT \
"movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
"movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
"call __switch_to\n\t" \
".globl thread_return\n" \
"thread_return:\n\t" \
- "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \
+ "movq "__percpu_arg([current_task])",%%rsi\n\t" \
+ __switch_canary \
"movq %P[thread_info](%%rsi),%%r8\n\t" \
- LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \
"movq %%rax,%%rdi\n\t" \
- "jc ret_from_fork\n\t" \
+ "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \
+ "jnz ret_from_fork\n\t" \
RESTORE_CONTEXT \
: "=a" (last) \
+ __switch_canary_oparam \
: [next] "S" (next), [prev] "D" (prev), \
[threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \
[ti_flags] "i" (offsetof(struct thread_info, flags)), \
- [tif_fork] "i" (TIF_FORK), \
+ [_tif_fork] "i" (_TIF_FORK), \
[thread_info] "i" (offsetof(struct task_struct, stack)), \
- [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
+ [current_task] "m" (per_cpu_var(current_task)) \
+ __switch_canary_iparam \
: "memory", "cc" __EXTRA_CLOBBER)
#endif
@@ -166,6 +203,25 @@ extern void xen_load_gs_index(unsigned);
#define savesegment(seg, value) \
asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
+/*
+ * x86_32 user gs accessors.
+ */
+#ifdef CONFIG_X86_32
+#ifdef CONFIG_X86_32_LAZY_GS
+#define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;})
+#define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v))
+#define task_user_gs(tsk) ((tsk)->thread.gs)
+#define lazy_save_gs(v) savesegment(gs, (v))
+#define lazy_load_gs(v) loadsegment(gs, (v))
+#else /* X86_32_LAZY_GS */
+#define get_user_gs(regs) (u16)((regs)->gs)
+#define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0)
+#define task_user_gs(tsk) (task_pt_regs(tsk)->gs)
+#define lazy_save_gs(v) do { } while (0)
+#define lazy_load_gs(v) do { } while (0)
+#endif /* X86_32_LAZY_GS */
+#endif /* X86_32 */
+
static inline unsigned long get_limit(unsigned long segment)
{
unsigned long __limit;
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/tlbflush.h 2010-03-24 15:14:47.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/tlbflush.h 2010-03-24 15:25:06.000000000 +0100
@@ -86,21 +86,20 @@ static inline void flush_tlb_range(struc
flush_tlb_mm(vma->vm_mm);
}
+#ifndef CONFIG_XEN
#define TLBSTATE_OK 1
#define TLBSTATE_LAZY 2
-#ifdef CONFIG_X86_32
struct tlb_state {
struct mm_struct *active_mm;
int state;
- char __cacheline_padding[L1_CACHE_BYTES-8];
};
-DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
+DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
-void reset_lazy_tlbstate(void);
-#else
static inline void reset_lazy_tlbstate(void)
{
+ percpu_write(cpu_tlbstate.state, 0);
+ percpu_write(cpu_tlbstate.active_mm, &init_mm);
}
#endif
@@ -112,4 +111,6 @@ static inline void flush_tlb_kernel_rang
flush_tlb_all();
}
+extern void zap_low_mappings(void);
+
#endif /* _ASM_X86_TLBFLUSH_H */
--- head-2010-05-25.orig/arch/x86/kernel/Makefile 2010-03-24 15:14:47.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/Makefile 2010-03-24 15:25:06.000000000 +0100
@@ -122,7 +122,6 @@ obj-$(CONFIG_X86_XEN) += fixup.o
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
- obj-$(CONFIG_X86_XEN_GENAPIC) += genapic_64.o genapic_xen_64.o
obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o
obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
obj-$(CONFIG_AUDIT) += audit_64.o
@@ -134,11 +133,10 @@ ifeq ($(CONFIG_X86_64),y)
obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
obj-y += vsmp_64.o
- obj-$(CONFIG_XEN) += nmi.o
time_64-$(CONFIG_XEN) += time_32.o
endif
-disabled-obj-$(CONFIG_XEN) := %_uv.o crash.o early-quirks.o genx2apic_%.o \
- hpet.o i8253.o i8259.o irqinit_$(BITS).o pci-swiotlb_64.o reboot.o \
- smpboot.o tlb_$(BITS).o tsc.o tsc_sync.o uv_%.o vsmp_64.o
+disabled-obj-$(CONFIG_XEN) := %_uv.o crash.o early-quirks.o hpet.o i8253.o \
+ i8259.o irqinit_$(BITS).o pci-swiotlb.o reboot.o smpboot.o tsc.o \
+ tsc_sync.o uv_%.o vsmp_64.o
disabled-obj-$(CONFIG_XEN_UNPRIVILEGED_GUEST) += probe_roms_32.o
--- head-2010-05-25.orig/arch/x86/kernel/acpi/boot.c 2010-04-15 10:05:36.000000000 +0200
+++ head-2010-05-25/arch/x86/kernel/acpi/boot.c 2010-04-15 10:07:05.000000000 +0200
@@ -115,11 +115,6 @@ char *__init __acpi_map_table(unsigned l
if (!phys || !size)
return NULL;
-#ifdef CONFIG_XEN
- if (phys + size <= (NR_FIX_ISAMAPS << PAGE_SHIFT))
- return isa_bus_to_virt(phys);
-#endif
-
return early_ioremap(phys, size);
}
void __init __acpi_unmap_table(char *map, unsigned long size)
@@ -151,8 +146,10 @@ static int __init acpi_parse_madt(struct
madt->address);
}
+#ifndef CONFIG_XEN
default_acpi_madt_oem_check(madt->header.oem_id,
madt->header.oem_table_id);
+#endif
return 0;
}
--- head-2010-05-25.orig/arch/x86/kernel/acpi/sleep-xen.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/acpi/sleep-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -104,6 +104,7 @@ int acpi_save_state_mem(void)
stack_start.sp = temp_stack + sizeof(temp_stack);
early_gdt_descr.address =
(unsigned long)get_cpu_gdt_table(smp_processor_id());
+ initial_gs = per_cpu_offset(smp_processor_id());
#endif
initial_code = (unsigned long)wakeup_long64;
saved_magic = 0x123456789abcdef0;
--- head-2010-05-25.orig/arch/x86/kernel/apic/Makefile 2010-05-25 09:12:09.000000000 +0200
+++ head-2010-05-25/arch/x86/kernel/apic/Makefile 2010-03-24 15:25:06.000000000 +0100
@@ -17,3 +17,10 @@ obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o
obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
obj-$(CONFIG_X86_ES7000) += es7000_32.o
obj-$(CONFIG_X86_SUMMIT) += summit_32.o
+
+obj-$(CONFIG_XEN) += nmi.o
+
+probe_64-$(CONFIG_XEN) := probe_32.o
+
+disabled-obj-$(CONFIG_XEN) := apic_flat_$(BITS).o
+disabled-obj-$(filter-out $(CONFIG_SMP),$(CONFIG_XEN)) += ipi.o
--- head-2010-05-25.orig/arch/x86/kernel/apic/apic-xen.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/apic/apic-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -4,11 +4,20 @@
#include <linux/init.h>
#include <linux/interrupt.h>
+#include <linux/module.h>
#include <asm/smp.h>
#include <asm/proto.h>
#include <asm/apic.h>
+unsigned int num_processors;
+
+/*
+ * Map cpu index to physical APIC ID
+ */
+DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
+EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
+
/*
* Debug level, exported for io_apic.c
*/
--- head-2010-05-25.orig/arch/x86/kernel/apic/io_apic-xen.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/apic/io_apic-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -1,7 +1,7 @@
/*
* Intel IO-APIC support for multi-Pentium hosts.
*
- * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
+ * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
*
* Many thanks to Stig Venaas for trying out countless experimental
* patches and reporting/debugging problems patiently!
@@ -46,6 +46,7 @@
#include <asm/idle.h>
#include <asm/io.h>
#include <asm/smp.h>
+#include <asm/cpu.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/acpi.h>
@@ -61,9 +62,7 @@
#include <asm/uv/uv_hub.h>
#include <asm/uv/uv_irq.h>
-#include <mach_ipi.h>
-#include <mach_apic.h>
-#include <mach_apicdef.h>
+#include <asm/apic.h>
#ifdef CONFIG_XEN
#include <xen/interface/xen.h>
@@ -97,11 +96,11 @@ static DEFINE_SPINLOCK(vector_lock);
int nr_ioapic_registers[MAX_IO_APICS];
/* I/O APIC entries */
-struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
+struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
int nr_ioapics;
/* MP IRQ source entries */
-struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
/* # of MP IRQ source entries */
int mp_irq_entries;
@@ -114,10 +113,19 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BU
int skip_ioapic_setup;
+static void __init _arch_disable_smp_support(void)
+{
+#ifdef CONFIG_PCI
+ noioapicquirk = 1;
+ noioapicreroute = -1;
+#endif
+ skip_ioapic_setup = 1;
+}
+
static int __init parse_noapic(char *str)
{
/* disable IO-APIC */
- disable_ioapic_setup();
+ _arch_disable_smp_support();
return 0;
}
early_param("noapic", parse_noapic);
@@ -372,7 +380,7 @@ set_extra_move_desc(struct irq_desc *des
if (!cfg->move_in_progress) {
/* it means that domain is not changed */
- if (!cpumask_intersects(&desc->affinity, mask))
+ if (!cpumask_intersects(desc->affinity, mask))
cfg->move_desc_pending = 1;
}
}
@@ -397,12 +405,20 @@ struct io_apic {
unsigned int index;
unsigned int unused[3];
unsigned int data;
+ unsigned int unused2[11];
+ unsigned int eoi;
};
static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
{
return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
- + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
+ + (mp_ioapics[idx].apicaddr & ~PAGE_MASK);
+}
+
+static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
+{
+ struct io_apic __iomem *io_apic = io_apic_base(apic);
+ writel(vector, &io_apic->eoi);
}
#endif /* CONFIG_XEN */
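
The two fields added to struct io_apic place the EOI register at byte offset 0x40: index at 0x00, data at 0x10, and eleven pad words in between. A compile-time assertion along these lines would document what io_apic_eoi() relies on (a sketch, not part of the patch):

	/* sketch: assert the memory-mapped IO-APIC register layout */
	static inline void check_io_apic_layout(void)
	{
		BUILD_BUG_ON(offsetof(struct io_apic, data) != 0x10);
		BUILD_BUG_ON(offsetof(struct io_apic, eoi)  != 0x40);
	}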
@@ -416,7 +432,7 @@ static inline unsigned int io_apic_read(
struct physdev_apic apic_op;
int ret;
- apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
+ apic_op.apic_physbase = mp_ioapics[apic].apicaddr;
apic_op.reg = reg;
ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
if (ret)
@@ -434,7 +450,7 @@ static inline void io_apic_write(unsigne
#else
struct physdev_apic apic_op;
- apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
+ apic_op.apic_physbase = mp_ioapics[apic].apicaddr;
apic_op.reg = reg;
apic_op.value = value;
WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
@@ -522,7 +538,7 @@ __ioapic_write_entry(int apic, int pin,
io_apic_write(apic, 0x10 + 2*pin, eu.w1);
}
-static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
{
unsigned long flags;
spin_lock_irqsave(&ioapic_lock, flags);
@@ -558,11 +574,11 @@ static void send_cleanup_vector(struct i
for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
cfg->move_cleanup_count++;
for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
- send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
+ apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
} else {
cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
- send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+ apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
free_cpumask_var(cleanup_mask);
}
cfg->move_in_progress = 0;
@@ -583,16 +599,12 @@ static void __target_IO_APIC_irq(unsigne
apic = entry->apic;
pin = entry->pin;
-#ifdef CONFIG_INTR_REMAP
/*
* With interrupt-remapping, destination information comes
* from interrupt-remapping table entry.
*/
if (!irq_remapped(irq))
io_apic_write(apic, 0x11 + pin*2, dest);
-#else
- io_apic_write(apic, 0x11 + pin*2, dest);
-#endif
reg = io_apic_read(apic, 0x10 + pin*2);
reg &= ~IO_APIC_REDIR_VECTOR_MASK;
reg |= vector;
@@ -607,8 +619,9 @@ static int
assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
/*
- * Either sets desc->affinity to a valid value, and returns cpu_mask_to_apicid
- * of that, or returns BAD_APICID and leaves desc->affinity untouched.
+ * Either sets desc->affinity to a valid value, and returns
+ * ->cpu_mask_to_apicid of that, or returns BAD_APICID and
+ * leaves desc->affinity untouched.
*/
static unsigned int
set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
@@ -624,9 +637,12 @@ set_desc_affinity(struct irq_desc *desc,
if (assign_irq_vector(irq, cfg, mask))
return BAD_APICID;
- cpumask_and(&desc->affinity, cfg->domain, mask);
+ /* do this before desc->affinity gets updated */
set_extra_move_desc(desc, mask);
- return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask);
+
+ cpumask_copy(desc->affinity, mask);
+
+ return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
}
static void
@@ -840,23 +856,6 @@ static void clear_IO_APIC (void)
for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
clear_IO_APIC_pin(apic, pin);
}
-
-#if !defined(CONFIG_SMP) && defined(CONFIG_X86_32)
-void send_IPI_self(int vector)
-{
- unsigned int cfg;
-
- /*
- * Wait for idle.
- */
- apic_wait_icr_idle();
- cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
- /*
- * Send the IPI. The write to APIC_ICR fires this off.
- */
- apic_write(APIC_ICR, cfg);
-}
-#endif /* !CONFIG_SMP && CONFIG_X86_32*/
#else
#define add_pin_to_irq_cpu(cfg, cpu, apic, pin)
#endif /* CONFIG_XEN */
@@ -868,8 +867,9 @@ void send_IPI_self(int vector)
*/
#define MAX_PIRQS 8
-static int pirq_entries [MAX_PIRQS];
-static int pirqs_enabled;
+static int pirq_entries[MAX_PIRQS] = {
+ [0 ... MAX_PIRQS - 1] = -1
+};
static int __init ioapic_pirq_setup(char *str)
{
@@ -878,10 +878,6 @@ static int __init ioapic_pirq_setup(char
get_options(str, ARRAY_SIZE(ints), ints);
- for (i = 0; i < MAX_PIRQS; i++)
- pirq_entries[i] = -1;
-
- pirqs_enabled = 1;
apic_printk(APIC_VERBOSE, KERN_INFO
"PIRQ redirection, working around broken MP-BIOS.\n");
max = MAX_PIRQS;
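
The new initializer uses GCC's range-designator extension, which fills every element in the span at compile time; a stand-alone illustration:

	/* both declarations below are equivalent */
	int a[4] = { [0 ... 3] = -1 };
	int b[4] = { -1, -1, -1, -1 };

Because the table is valid from the start, the pirqs_enabled flag and the runtime initialization loop removed in the following hunks become unnecessary.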
@@ -903,75 +899,106 @@ __setup("pirq=", ioapic_pirq_setup);
#endif /* CONFIG_X86_32 */
#ifdef CONFIG_INTR_REMAP
-/* I/O APIC RTE contents at the OS boot up */
-static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS];
+struct IO_APIC_route_entry **alloc_ioapic_entries(void)
+{
+ int apic;
+ struct IO_APIC_route_entry **ioapic_entries;
+
+ ioapic_entries = kzalloc(sizeof(*ioapic_entries) * nr_ioapics,
+ GFP_ATOMIC);
+ if (!ioapic_entries)
+ return NULL;
+
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ ioapic_entries[apic] =
+ kzalloc(sizeof(struct IO_APIC_route_entry) *
+ nr_ioapic_registers[apic], GFP_ATOMIC);
+ if (!ioapic_entries[apic])
+ goto nomem;
+ }
+
+ return ioapic_entries;
+
+nomem:
+ while (--apic >= 0)
+ kfree(ioapic_entries[apic]);
+ kfree(ioapic_entries);
+
+ return NULL;
+}
/*
- * Saves and masks all the unmasked IO-APIC RTE's
+ * Saves all the IO-APIC RTE's
*/
-int save_mask_IO_APIC_setup(void)
+int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
{
- union IO_APIC_reg_01 reg_01;
- unsigned long flags;
int apic, pin;
- /*
- * The number of IO-APIC IRQ registers (== #pins):
- */
+ if (!ioapic_entries)
+ return -ENOMEM;
+
for (apic = 0; apic < nr_ioapics; apic++) {
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_01.raw = io_apic_read(apic, 1);
- spin_unlock_irqrestore(&ioapic_lock, flags);
- nr_ioapic_registers[apic] = reg_01.bits.entries+1;
+ if (!ioapic_entries[apic])
+ return -ENOMEM;
+
+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
+ ioapic_entries[apic][pin] =
+ ioapic_read_entry(apic, pin);
}
+ return 0;
+}
+
+/*
+ * Mask all IO APIC entries.
+ */
+void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
+{
+ int apic, pin;
+
+ if (!ioapic_entries)
+ return;
+
for (apic = 0; apic < nr_ioapics; apic++) {
- early_ioapic_entries[apic] =
- kzalloc(sizeof(struct IO_APIC_route_entry) *
- nr_ioapic_registers[apic], GFP_KERNEL);
- if (!early_ioapic_entries[apic])
- goto nomem;
- }
+ if (!ioapic_entries[apic])
+ break;
- for (apic = 0; apic < nr_ioapics; apic++)
for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
struct IO_APIC_route_entry entry;
- entry = early_ioapic_entries[apic][pin] =
- ioapic_read_entry(apic, pin);
+ entry = ioapic_entries[apic][pin];
if (!entry.mask) {
entry.mask = 1;
ioapic_write_entry(apic, pin, entry);
}
}
-
- return 0;
-
-nomem:
- while (apic >= 0)
- kfree(early_ioapic_entries[apic--]);
- memset(early_ioapic_entries, 0,
- ARRAY_SIZE(early_ioapic_entries));
-
- return -ENOMEM;
+ }
}
-void restore_IO_APIC_setup(void)
+/*
+ * Restore IO APIC entries which were saved in ioapic_entries.
+ */
+int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
{
int apic, pin;
+ if (!ioapic_entries)
+ return -ENOMEM;
+
for (apic = 0; apic < nr_ioapics; apic++) {
- if (!early_ioapic_entries[apic])
- break;
+ if (!ioapic_entries[apic])
+ return -ENOMEM;
+
for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
ioapic_write_entry(apic, pin,
- early_ioapic_entries[apic][pin]);
- kfree(early_ioapic_entries[apic]);
- early_ioapic_entries[apic] = NULL;
+ ioapic_entries[apic][pin]);
}
+ return 0;
}
-void reinit_intr_remapped_IO_APIC(int intr_remapping)
+void reinit_intr_remapped_IO_APIC(int intr_remapping,
+ struct IO_APIC_route_entry **ioapic_entries)
+
{
/*
* for now plain restore of previous settings.
@@ -980,7 +1007,17 @@ void reinit_intr_remapped_IO_APIC(int in
* table entries. for now, do a plain restore, and wait for
* the setup_IO_APIC_irqs() to do proper initialization.
*/
- restore_IO_APIC_setup();
+ restore_IO_APIC_setup(ioapic_entries);
+}
+
+void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
+{
+ int apic;
+
+ for (apic = 0; apic < nr_ioapics; apic++)
+ kfree(ioapic_entries[apic]);
+
+ kfree(ioapic_entries);
}
#endif
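
Taken together, the reworked helpers form an allocate/save/mask/restore/free sequence with caller-owned storage instead of the old file-scope array. A usage sketch, assuming the caller is the code that toggles interrupt remapping:

	/* sketch: round-trip through the reworked save/restore API */
	struct IO_APIC_route_entry **entries = alloc_ioapic_entries();

	if (entries && save_IO_APIC_setup(entries) == 0) {
		mask_IO_APIC_setup(entries);	/* quiesce every RTE */
		/* ... enable or disable interrupt remapping here ... */
		restore_IO_APIC_setup(entries);
	}
	if (entries)
		free_ioapic_entries(entries);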
@@ -992,10 +1029,10 @@ static int find_irq_entry(int apic, int
int i;
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mp_irqtype == type &&
- (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
- mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
- mp_irqs[i].mp_dstirq == pin)
+ if (mp_irqs[i].irqtype == type &&
+ (mp_irqs[i].dstapic == mp_ioapics[apic].apicid ||
+ mp_irqs[i].dstapic == MP_APIC_ALL) &&
+ mp_irqs[i].dstirq == pin)
return i;
return -1;
@@ -1010,13 +1047,13 @@ static int __init find_isa_irq_pin(int i
int i;
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mp_srcbus;
+ int lbus = mp_irqs[i].srcbus;
if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].mp_irqtype == type) &&
- (mp_irqs[i].mp_srcbusirq == irq))
+ (mp_irqs[i].irqtype == type) &&
+ (mp_irqs[i].srcbusirq == irq))
- return mp_irqs[i].mp_dstirq;
+ return mp_irqs[i].dstirq;
}
return -1;
}
@@ -1026,17 +1063,17 @@ static int __init find_isa_irq_apic(int
int i;
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mp_srcbus;
+ int lbus = mp_irqs[i].srcbus;
if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].mp_irqtype == type) &&
- (mp_irqs[i].mp_srcbusirq == irq))
+ (mp_irqs[i].irqtype == type) &&
+ (mp_irqs[i].srcbusirq == irq))
break;
}
if (i < mp_irq_entries) {
int apic;
for(apic = 0; apic < nr_ioapics; apic++) {
- if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
+ if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic)
return apic;
}
}
@@ -1062,23 +1099,23 @@ int IO_APIC_get_PCI_irq_vector(int bus,
return -1;
}
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mp_srcbus;
+ int lbus = mp_irqs[i].srcbus;
for (apic = 0; apic < nr_ioapics; apic++)
- if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
- mp_irqs[i].mp_dstapic == MP_APIC_ALL)
+ if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
+ mp_irqs[i].dstapic == MP_APIC_ALL)
break;
if (!test_bit(lbus, mp_bus_not_pci) &&
- !mp_irqs[i].mp_irqtype &&
+ !mp_irqs[i].irqtype &&
(bus == lbus) &&
- (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
- int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq);
+ (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
+ int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
if (!(apic || IO_APIC_IRQ(irq)))
continue;
- if (pin == (mp_irqs[i].mp_srcbusirq & 3))
+ if (pin == (mp_irqs[i].srcbusirq & 3))
return irq;
/*
* Use the first all-but-pin matching entry as a
@@ -1121,7 +1158,7 @@ static int EISA_ELCR(unsigned int irq)
* EISA conforming in the MP table, that means its trigger type must
* be read in from the ELCR */
-#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
+#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq))
#define default_EISA_polarity(idx) default_ISA_polarity(idx)
/* PCI interrupts are always polarity one level triggered,
@@ -1138,13 +1175,13 @@ static int EISA_ELCR(unsigned int irq)
static int MPBIOS_polarity(int idx)
{
- int bus = mp_irqs[idx].mp_srcbus;
+ int bus = mp_irqs[idx].srcbus;
int polarity;
/*
* Determine IRQ line polarity (high active or low active):
*/
- switch (mp_irqs[idx].mp_irqflag & 3)
+ switch (mp_irqs[idx].irqflag & 3)
{
case 0: /* conforms, ie. bus-type dependent polarity */
if (test_bit(bus, mp_bus_not_pci))
@@ -1180,13 +1217,13 @@ static int MPBIOS_polarity(int idx)
static int MPBIOS_trigger(int idx)
{
- int bus = mp_irqs[idx].mp_srcbus;
+ int bus = mp_irqs[idx].srcbus;
int trigger;
/*
* Determine IRQ trigger mode (edge or level sensitive):
*/
- switch ((mp_irqs[idx].mp_irqflag>>2) & 3)
+ switch ((mp_irqs[idx].irqflag>>2) & 3)
{
case 0: /* conforms, ie. bus-type dependent */
if (test_bit(bus, mp_bus_not_pci))
@@ -1264,16 +1301,16 @@ int (*ioapic_renumber_irq)(int ioapic, i
static int pin_2_irq(int idx, int apic, int pin)
{
int irq, i;
- int bus = mp_irqs[idx].mp_srcbus;
+ int bus = mp_irqs[idx].srcbus;
/*
* Debugging check, we are in big trouble if this message pops up!
*/
- if (mp_irqs[idx].mp_dstirq != pin)
+ if (mp_irqs[idx].dstirq != pin)
printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
if (test_bit(bus, mp_bus_not_pci)) {
- irq = mp_irqs[idx].mp_srcbusirq;
+ irq = mp_irqs[idx].srcbusirq;
} else {
/*
* PCI IRQs are mapped in order
@@ -1366,7 +1403,7 @@ __assign_irq_vector(int irq, struct irq_
int new_cpu;
int vector, offset;
- vector_allocation_domain(cpu, tmp_mask);
+ apic->vector_allocation_domain(cpu, tmp_mask);
vector = current_vector;
offset = current_offset;
@@ -1476,9 +1513,7 @@ void __setup_vector_irq(int cpu)
}
static struct irq_chip ioapic_chip;
-#ifdef CONFIG_INTR_REMAP
static struct irq_chip ir_ioapic_chip;
-#endif
#define IOAPIC_AUTO -1
#define IOAPIC_EDGE 0
@@ -1517,7 +1552,6 @@ static void ioapic_register_intr(int irq
else
desc->status &= ~IRQ_LEVEL;
-#ifdef CONFIG_INTR_REMAP
if (irq_remapped(irq)) {
desc->status |= IRQ_MOVE_PCNTXT;
if (trigger)
@@ -1529,7 +1563,7 @@ static void ioapic_register_intr(int irq
handle_edge_irq, "edge");
return;
}
-#endif
+
if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
trigger == IOAPIC_LEVEL)
set_irq_chip_and_handler_name(irq, &ioapic_chip,
@@ -1544,37 +1578,44 @@ static void ioapic_register_intr(int irq
#define ioapic_register_intr(irq, desc, trigger) evtchn_register_pirq(irq)
#endif
-static int setup_ioapic_entry(int apic, int irq,
- struct IO_APIC_route_entry *entry,
- unsigned int destination, int trigger,
- int polarity, int vector)
+int setup_ioapic_entry(int apic_id, int irq,
+ struct IO_APIC_route_entry *entry,
+ unsigned int destination, int trigger,
+ int polarity, int vector, int pin)
{
/*
* add it to the IO-APIC irq-routing table:
*/
memset(entry,0,sizeof(*entry));
-#ifdef CONFIG_INTR_REMAP
if (intr_remapping_enabled) {
- struct intel_iommu *iommu = map_ioapic_to_ir(apic);
+#ifndef CONFIG_XEN
+ struct intel_iommu *iommu = map_ioapic_to_ir(apic_id);
struct irte irte;
struct IR_IO_APIC_route_entry *ir_entry =
(struct IR_IO_APIC_route_entry *) entry;
int index;
if (!iommu)
- panic("No mapping iommu for ioapic %d\n", apic);
+ panic("No mapping iommu for ioapic %d\n", apic_id);
index = alloc_irte(iommu, irq, 1);
if (index < 0)
- panic("Failed to allocate IRTE for ioapic %d\n", apic);
+ panic("Failed to allocate IRTE for ioapic %d\n", apic_id);
memset(&irte, 0, sizeof(irte));
irte.present = 1;
- irte.dst_mode = INT_DEST_MODE;
- irte.trigger_mode = trigger;
- irte.dlvry_mode = INT_DELIVERY_MODE;
+ irte.dst_mode = apic->irq_dest_mode;
+ /*
+ * Trigger mode in the IRTE will always be edge, and the
+ * actual level or edge trigger will be set up in the IO-APIC
+ * RTE. This will help simplify level triggered irq migration.
+ * For more details, see the comments above explaining IO-APIC
+ * irq migration in the presence of interrupt-remapping.
+ */
+ irte.trigger_mode = 0;
+ irte.dlvry_mode = apic->irq_delivery_mode;
irte.vector = vector;
irte.dest_id = IRTE_DEST(destination);
@@ -1584,18 +1625,22 @@ static int setup_ioapic_entry(int apic,
ir_entry->zero = 0;
ir_entry->format = 1;
ir_entry->index = (index & 0x7fff);
- } else
+ /*
+ * IO-APIC RTE will be configured with virtual vector.
+ * irq handler will do the explicit EOI to the io-apic.
+ */
+ ir_entry->vector = pin;
#endif
- {
- entry->delivery_mode = INT_DELIVERY_MODE;
- entry->dest_mode = INT_DEST_MODE;
+ } else {
+ entry->delivery_mode = apic->irq_delivery_mode;
+ entry->dest_mode = apic->irq_dest_mode;
entry->dest = destination;
+ entry->vector = vector;
}
entry->mask = 0; /* enable IRQ */
entry->trigger = trigger;
entry->polarity = polarity;
- entry->vector = vector;
/* Mask level triggered irqs.
* Use IRQ_DELAYED_DISABLE for edge triggered irqs.
@@ -1605,7 +1650,7 @@ static int setup_ioapic_entry(int apic,
return 0;
}
-static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_desc *desc,
+static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq_desc *desc,
int trigger, int polarity)
{
struct irq_cfg *cfg;
@@ -1617,26 +1662,26 @@ static void setup_IO_APIC_irq(int apic,
cfg = desc->chip_data;
- if (assign_irq_vector(irq, cfg, TARGET_CPUS))
+ if (assign_irq_vector(irq, cfg, apic->target_cpus()))
return;
#ifndef CONFIG_XEN
- dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
+ dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
#else
- dest = cpu_mask_to_apicid(TARGET_CPUS);
+ dest = 0; /* meaningless */
#endif
apic_printk(APIC_VERBOSE,KERN_DEBUG
"IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
"IRQ %d Mode:%i Active:%i)\n",
- apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
+ apic_id, mp_ioapics[apic_id].apicid, pin, cfg->vector,
irq, trigger, polarity);
- if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry,
- dest, trigger, polarity, cfg->vector)) {
+ if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry,
+ dest, trigger, polarity, cfg->vector, pin)) {
printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
- mp_ioapics[apic].mp_apicid, pin);
+ mp_ioapics[apic_id].apicid, pin);
__clear_irq_vector(irq, cfg);
return;
}
@@ -1645,12 +1690,12 @@ static void setup_IO_APIC_irq(int apic,
if (irq < NR_IRQS_LEGACY)
disable_8259A_irq(irq);
- ioapic_write_entry(apic, pin, entry);
+ ioapic_write_entry(apic_id, pin, entry);
}
static void __init setup_IO_APIC_irqs(void)
{
- int apic, pin, idx, irq;
+ int apic_id, pin, idx, irq;
int notcon = 0;
struct irq_desc *desc;
struct irq_cfg *cfg;
@@ -1658,21 +1703,19 @@ static void __init setup_IO_APIC_irqs(vo
apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
- for (apic = 0; apic < nr_ioapics; apic++) {
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+ for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
+ for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
- idx = find_irq_entry(apic, pin, mp_INT);
+ idx = find_irq_entry(apic_id, pin, mp_INT);
if (idx == -1) {
if (!notcon) {
notcon = 1;
apic_printk(APIC_VERBOSE,
KERN_DEBUG " %d-%d",
- mp_ioapics[apic].mp_apicid,
- pin);
+ mp_ioapics[apic_id].apicid, pin);
} else
apic_printk(APIC_VERBOSE, " %d-%d",
- mp_ioapics[apic].mp_apicid,
- pin);
+ mp_ioapics[apic_id].apicid, pin);
continue;
}
if (notcon) {
@@ -1681,23 +1724,30 @@ static void __init setup_IO_APIC_irqs(vo
notcon = 0;
}
- irq = pin_2_irq(idx, apic, pin);
-#if defined(CONFIG_XEN)
- if (irq < PIRQ_BASE || irq >= PIRQ_BASE + NR_PIRQS)
+ irq = pin_2_irq(idx, apic_id, pin);
+
+#ifdef CONFIG_XEN
+ if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs)
continue;
-#elif defined(CONFIG_X86_32)
- if (multi_timer_check(apic, irq))
+#else
+ /*
+ * Skip the timer IRQ if there's a quirk handler
+ * installed and if it returns 1:
+ */
+ if (apic->multi_timer_check &&
+ apic->multi_timer_check(apic_id, irq))
continue;
#endif
+
desc = irq_to_desc_alloc_cpu(irq, cpu);
if (!desc) {
printk(KERN_INFO "can not get irq_desc for %d\n", irq);
continue;
}
cfg = desc->chip_data;
- add_pin_to_irq_cpu(cfg, cpu, apic, pin);
+ add_pin_to_irq_cpu(cfg, cpu, apic_id, pin);
- setup_IO_APIC_irq(apic, pin, irq, desc,
+ setup_IO_APIC_irq(apic_id, pin, irq, desc,
irq_trigger(idx), irq_polarity(idx));
}
}
@@ -1711,15 +1761,13 @@ static void __init setup_IO_APIC_irqs(vo
/*
* Set up the timer pin, possibly with the 8259A-master behind.
*/
-static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
+static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
int vector)
{
struct IO_APIC_route_entry entry;
-#ifdef CONFIG_INTR_REMAP
if (intr_remapping_enabled)
return;
-#endif
memset(&entry, 0, sizeof(entry));
@@ -1727,10 +1775,10 @@ static void __init setup_timer_IRQ0_pin(
* We use logical delivery to get the timer IRQ
* to the first CPU.
*/
- entry.dest_mode = INT_DEST_MODE;
- entry.mask = 1; /* mask IRQ now */
- entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
- entry.delivery_mode = INT_DELIVERY_MODE;
+ entry.dest_mode = apic->irq_dest_mode;
+ entry.mask = 0; /* don't mask IRQ for edge */
+ entry.dest = apic->cpu_mask_to_apicid(apic->target_cpus());
+ entry.delivery_mode = apic->irq_delivery_mode;
entry.polarity = 0;
entry.trigger = 0;
entry.vector = vector;
@@ -1744,7 +1792,7 @@ static void __init setup_timer_IRQ0_pin(
/*
* Add it to the IO-APIC irq-routing table:
*/
- ioapic_write_entry(apic, pin, entry);
+ ioapic_write_entry(apic_id, pin, entry);
}
@@ -1766,7 +1814,7 @@ __apicdebuginit(void) print_IO_APIC(void
printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
for (i = 0; i < nr_ioapics; i++)
printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
- mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
+ mp_ioapics[i].apicid, nr_ioapic_registers[i]);
/*
* We are a bit conservative about what we expect. We have to
@@ -1786,7 +1834,7 @@ __apicdebuginit(void) print_IO_APIC(void
spin_unlock_irqrestore(&ioapic_lock, flags);
printk("\n");
- printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
+ printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid);
printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
@@ -2050,13 +2098,6 @@ void __init enable_IO_APIC(void)
int apic;
unsigned long flags;
-#ifdef CONFIG_X86_32
- int i;
- if (!pirqs_enabled)
- for (i = 0; i < MAX_PIRQS; i++)
- pirq_entries[i] = -1;
-#endif
-
/*
* The number of IO-APIC IRQ registers (== #pins):
*/
@@ -2129,8 +2170,13 @@ void disable_IO_APIC(void)
* If the i8259 is routed through an IOAPIC
* Put that IOAPIC in virtual wire mode
* so legacy interrupts can be delivered.
+ *
+ * With interrupt-remapping, for now we will use virtual wire A mode,
+ * as virtual wire B is a little more complex (we would need to configure
+ * both the IOAPIC RTE as well as the interrupt-remapping table entry).
+ * As this gets called during crash dump, keep this simple for now.
*/
- if (ioapic_i8259.pin != -1) {
+ if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) {
struct IO_APIC_route_entry entry;
memset(&entry, 0, sizeof(entry));
@@ -2150,7 +2196,10 @@ void disable_IO_APIC(void)
ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
}
- disconnect_bsp_APIC(ioapic_i8259.pin != -1);
+ /*
+ * Use virtual wire A mode when interrupt remapping is enabled.
+ */
+ disconnect_bsp_APIC(!intr_remapping_enabled && ioapic_i8259.pin != -1);
}
#ifdef CONFIG_X86_32
@@ -2165,7 +2214,7 @@ static void __init setup_ioapic_ids_from
{
union IO_APIC_reg_00 reg_00;
physid_mask_t phys_id_present_map;
- int apic;
+ int apic_id;
int i;
unsigned char old_id;
unsigned long flags;
@@ -2184,26 +2233,26 @@ static void __init setup_ioapic_ids_from
* This is broken; anything with a real cpu count has to
* circumvent this idiocy regardless.
*/
- phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
+ phys_id_present_map = apic->ioapic_phys_id_map(phys_cpu_present_map);
/*
* Set the IOAPIC ID to the value stored in the MPC table.
*/
- for (apic = 0; apic < nr_ioapics; apic++) {
+ for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
/* Read the register 0 value */
spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(apic, 0);
+ reg_00.raw = io_apic_read(apic_id, 0);
spin_unlock_irqrestore(&ioapic_lock, flags);
- old_id = mp_ioapics[apic].mp_apicid;
+ old_id = mp_ioapics[apic_id].apicid;
- if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
+ if (mp_ioapics[apic_id].apicid >= get_physical_broadcast()) {
printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
- apic, mp_ioapics[apic].mp_apicid);
+ apic_id, mp_ioapics[apic_id].apicid);
printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
reg_00.bits.ID);
- mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
+ mp_ioapics[apic_id].apicid = reg_00.bits.ID;
}
/*
@@ -2211,10 +2260,10 @@ static void __init setup_ioapic_ids_from
* system must have a unique ID or we get lots of nice
* 'stuck on smp_invalidate_needed IPI wait' messages.
*/
- if (check_apicid_used(phys_id_present_map,
- mp_ioapics[apic].mp_apicid)) {
+ if (apic->check_apicid_used(phys_id_present_map,
+ mp_ioapics[apic_id].apicid)) {
printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
- apic, mp_ioapics[apic].mp_apicid);
+ apic_id, mp_ioapics[apic_id].apicid);
for (i = 0; i < get_physical_broadcast(); i++)
if (!physid_isset(i, phys_id_present_map))
break;
@@ -2223,13 +2272,13 @@ static void __init setup_ioapic_ids_from
printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
i);
physid_set(i, phys_id_present_map);
- mp_ioapics[apic].mp_apicid = i;
+ mp_ioapics[apic_id].apicid = i;
} else {
physid_mask_t tmp;
- tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
+ tmp = apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid);
apic_printk(APIC_VERBOSE, "Setting %d in the "
"phys_id_present_map\n",
- mp_ioapics[apic].mp_apicid);
+ mp_ioapics[apic_id].apicid);
physids_or(phys_id_present_map, phys_id_present_map, tmp);
}
@@ -2238,11 +2287,11 @@ static void __init setup_ioapic_ids_from
* We need to adjust the IRQ routing table
* if the ID changed.
*/
- if (old_id != mp_ioapics[apic].mp_apicid)
+ if (old_id != mp_ioapics[apic_id].apicid)
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mp_dstapic == old_id)
- mp_irqs[i].mp_dstapic
- = mp_ioapics[apic].mp_apicid;
+ if (mp_irqs[i].dstapic == old_id)
+ mp_irqs[i].dstapic
+ = mp_ioapics[apic_id].apicid;
/*
* Read the right value from the MPC table and
@@ -2250,20 +2299,20 @@ static void __init setup_ioapic_ids_from
*/
apic_printk(APIC_VERBOSE, KERN_INFO
"...changing IO-APIC physical APIC ID to %d ...",
- mp_ioapics[apic].mp_apicid);
+ mp_ioapics[apic_id].apicid);
- reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
+ reg_00.bits.ID = mp_ioapics[apic_id].apicid;
spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0, reg_00.raw);
+ io_apic_write(apic_id, 0, reg_00.raw);
spin_unlock_irqrestore(&ioapic_lock, flags);
/*
* Sanity check
*/
spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(apic, 0);
+ reg_00.raw = io_apic_read(apic_id, 0);
spin_unlock_irqrestore(&ioapic_lock, flags);
- if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
+ if (reg_00.bits.ID != mp_ioapics[apic_id].apicid)
printk("could not set ID!\n");
else
apic_printk(APIC_VERBOSE, " ok.\n");
@@ -2366,7 +2415,7 @@ static int ioapic_retrigger_irq(unsigned
unsigned long flags;
spin_lock_irqsave(&vector_lock, flags);
- send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
+ apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
spin_unlock_irqrestore(&vector_lock, flags);
return 1;
@@ -2374,7 +2423,7 @@ static int ioapic_retrigger_irq(unsigned
#else
static int ioapic_retrigger_irq(unsigned int irq)
{
- send_IPI_self(irq_cfg(irq)->vector);
+ apic->send_IPI_self(irq_cfg(irq)->vector);
return 1;
}
@@ -2392,37 +2441,24 @@ static int ioapic_retrigger_irq(unsigned
#ifdef CONFIG_SMP
#ifdef CONFIG_INTR_REMAP
-static void ir_irq_migration(struct work_struct *work);
-
-static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
/*
* Migrate the IO-APIC irq in the presence of intr-remapping.
*
- * For edge triggered, irq migration is a simple atomic update(of vector
- * and cpu destination) of IRTE and flush the hardware cache.
- *
- * For level triggered, we need to modify the io-apic RTE aswell with the update
- * vector information, along with modifying IRTE with vector and destination.
- * So irq migration for level triggered is little bit more complex compared to
- * edge triggered migration. But the good news is, we use the same algorithm
- * for level triggered migration as we have today, only difference being,
- * we now initiate the irq migration from process context instead of the
- * interrupt context.
+ * For both level and edge triggered, irq migration is a simple atomic
+ * update (of vector and cpu destination) of the IRTE and a flush of the hardware cache.
*
- * In future, when we do a directed EOI (combined with cpu EOI broadcast
- * suppression) to the IO-APIC, level triggered irq migration will also be
- * as simple as edge triggered migration and we can do the irq migration
- * with a simple atomic update to IO-APIC RTE.
+ * For level triggered, we eliminate the io-apic RTE modification (with the
+ * updated vector information), by using a virtual vector (io-apic pin number).
+ * The real vector that is used to interrupt the cpu comes from
+ * the interrupt-remapping table entry.
*/
static void
migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
{
struct irq_cfg *cfg;
struct irte irte;
- int modify_ioapic_rte;
unsigned int dest;
- unsigned long flags;
unsigned int irq;
if (!cpumask_intersects(mask, cpu_online_mask))
@@ -2438,14 +2474,7 @@ migrate_ioapic_irq_desc(struct irq_desc
set_extra_move_desc(desc, mask);
- dest = cpu_mask_to_apicid_and(cfg->domain, mask);
-
- modify_ioapic_rte = desc->status & IRQ_LEVEL;
- if (modify_ioapic_rte) {
- spin_lock_irqsave(&ioapic_lock, flags);
- __target_IO_APIC_irq(irq, dest, cfg);
- spin_unlock_irqrestore(&ioapic_lock, flags);
- }
+ dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
irte.vector = cfg->vector;
irte.dest_id = IRTE_DEST(dest);
@@ -2458,61 +2487,7 @@ migrate_ioapic_irq_desc(struct irq_desc
if (cfg->move_in_progress)
send_cleanup_vector(cfg);
- cpumask_copy(&desc->affinity, mask);
-}
-
-static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
-{
- int ret = -1;
- struct irq_cfg *cfg = desc->chip_data;
-
- mask_IO_APIC_irq_desc(desc);
-
- if (io_apic_level_ack_pending(cfg)) {
- /*
- * Interrupt in progress. Migrating irq now will change the
- * vector information in the IO-APIC RTE and that will confuse
- * the EOI broadcast performed by cpu.
- * So, delay the irq migration to the next instance.
- */
- schedule_delayed_work(&ir_migration_work, 1);
- goto unmask;
- }
-
- /* everthing is clear. we have right of way */
- migrate_ioapic_irq_desc(desc, &desc->pending_mask);
-
- ret = 0;
- desc->status &= ~IRQ_MOVE_PENDING;
- cpumask_clear(&desc->pending_mask);
-
-unmask:
- unmask_IO_APIC_irq_desc(desc);
-
- return ret;
-}
-
-static void ir_irq_migration(struct work_struct *work)
-{
- unsigned int irq;
- struct irq_desc *desc;
-
- for_each_irq_desc(irq, desc) {
- if (desc->status & IRQ_MOVE_PENDING) {
- unsigned long flags;
-
- spin_lock_irqsave(&desc->lock, flags);
- if (!desc->chip->set_affinity ||
- !(desc->status & IRQ_MOVE_PENDING)) {
- desc->status &= ~IRQ_MOVE_PENDING;
- spin_unlock_irqrestore(&desc->lock, flags);
- continue;
- }
-
- desc->chip->set_affinity(irq, &desc->pending_mask);
- spin_unlock_irqrestore(&desc->lock, flags);
- }
- }
+ cpumask_copy(desc->affinity, mask);
}
/*
@@ -2521,13 +2496,6 @@ static void ir_irq_migration(struct work
static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
const struct cpumask *mask)
{
- if (desc->status & IRQ_LEVEL) {
- desc->status |= IRQ_MOVE_PENDING;
- cpumask_copy(&desc->pending_mask, mask);
- migrate_irq_remapped_level_desc(desc);
- return;
- }
-
migrate_ioapic_irq_desc(desc, mask);
}
static void set_ir_ioapic_affinity_irq(unsigned int irq,
@@ -2537,6 +2505,11 @@ static void set_ir_ioapic_affinity_irq(u
set_ir_ioapic_affinity_irq_desc(desc, mask);
}
+#else
+static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
+ const struct cpumask *mask)
+{
+}
#endif
asmlinkage void smp_irq_move_cleanup_interrupt(void)
@@ -2550,6 +2523,7 @@ asmlinkage void smp_irq_move_cleanup_int
me = smp_processor_id();
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
unsigned int irq;
+ unsigned int irr;
struct irq_desc *desc;
struct irq_cfg *cfg;
irq = __get_cpu_var(vector_irq)[vector];
@@ -2569,6 +2543,18 @@ asmlinkage void smp_irq_move_cleanup_int
if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
goto unlock;
+ irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
+ /*
+ * Check if the vector that needs to be cleaned up is
+ * registered in the cpu's IRR. If so, then this is not
+ * the best time to clean it up. Let's clean it up in the
+ * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR
+ * to myself.
+ */
+ if (irr & (1 << (vector % 32))) {
+ apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
+ goto unlock;
+ }
__get_cpu_var(vector_irq)[vector] = -1;
cfg->move_cleanup_count--;
unlock:
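
The IRR test added above decomposes the vector into a register index and a bit position: the local APIC exposes IRR as eight 32-bit registers spaced 0x10 apart. The same computation in isolation:

	/* sketch: locate a vector's pending bit in the local APIC IRR */
	unsigned int reg = APIC_IRR + (vector / 32) * 0x10; /* which word */
	unsigned int bit = 1u << (vector % 32);             /* which bit  */

	if (apic_read(reg) & bit)
		/* still pending on this cpu: retry via a self-IPI */
		apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);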
@@ -2591,7 +2577,7 @@ static void irq_complete_move(struct irq
/* domain has not changed, but affinity did */
me = smp_processor_id();
- if (cpu_isset(me, desc->affinity)) {
+ if (cpumask_test_cpu(me, desc->affinity)) {
*descp = desc = move_irq_desc(desc, me);
/* get the new one */
cfg = desc->chip_data;
@@ -2617,17 +2603,51 @@ static void irq_complete_move(struct irq
static inline void irq_complete_move(struct irq_desc **descp) {}
#endif
-#ifdef CONFIG_INTR_REMAP
+static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+{
+ int apic, pin;
+ struct irq_pin_list *entry;
+
+ entry = cfg->irq_2_pin;
+ for (;;) {
+
+ if (!entry)
+ break;
+
+ apic = entry->apic;
+ pin = entry->pin;
+ io_apic_eoi(apic, pin);
+ entry = entry->next;
+ }
+}
+
+static void
+eoi_ioapic_irq(struct irq_desc *desc)
+{
+ struct irq_cfg *cfg;
+ unsigned long flags;
+ unsigned int irq;
+
+ irq = desc->irq;
+ cfg = desc->chip_data;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ __eoi_ioapic_irq(irq, cfg);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+#ifdef CONFIG_X86_X2APIC
static void ack_x2apic_level(unsigned int irq)
{
+ struct irq_desc *desc = irq_to_desc(irq);
ack_x2APIC_irq();
+ eoi_ioapic_irq(desc);
}
static void ack_x2apic_edge(unsigned int irq)
{
ack_x2APIC_irq();
}
-
#endif
static void ack_apic_edge(unsigned int irq)
@@ -2693,6 +2713,9 @@ static void ack_apic_level(unsigned int
*/
ack_APIC_irq();
+ if (irq_remapped(irq))
+ eoi_ioapic_irq(desc);
+
/* Now we can move and renable the irq */
if (unlikely(do_unmask_irq)) {
/* Only migrate the irq if the ack has been received.
@@ -2738,6 +2761,26 @@ static void ack_apic_level(unsigned int
#endif
}
+#ifdef CONFIG_INTR_REMAP
+static void ir_ack_apic_edge(unsigned int irq)
+{
+#ifdef CONFIG_X86_X2APIC
+ if (x2apic_enabled())
+ return ack_x2apic_edge(irq);
+#endif
+ return ack_apic_edge(irq);
+}
+
+static void ir_ack_apic_level(unsigned int irq)
+{
+#ifdef CONFIG_X86_X2APIC
+ if (x2apic_enabled())
+ return ack_x2apic_level(irq);
+#endif
+ return ack_apic_level(irq);
+}
+#endif /* CONFIG_INTR_REMAP */
+
static struct irq_chip ioapic_chip __read_mostly = {
.name = "IO-APIC",
.startup = startup_ioapic_irq,
@@ -2751,20 +2794,20 @@ static struct irq_chip ioapic_chip __rea
.retrigger = ioapic_retrigger_irq,
};
-#ifdef CONFIG_INTR_REMAP
static struct irq_chip ir_ioapic_chip __read_mostly = {
.name = "IR-IO-APIC",
.startup = startup_ioapic_irq,
.mask = mask_IO_APIC_irq,
.unmask = unmask_IO_APIC_irq,
- .ack = ack_x2apic_edge,
- .eoi = ack_x2apic_level,
+#ifdef CONFIG_INTR_REMAP
+ .ack = ir_ack_apic_edge,
+ .eoi = ir_ack_apic_level,
#ifdef CONFIG_SMP
.set_affinity = set_ir_ioapic_affinity_irq,
#endif
+#endif
.retrigger = ioapic_retrigger_irq,
};
-#endif
#endif /* CONFIG_XEN */
static inline void init_IO_APIC_traps(void)
@@ -2786,7 +2829,7 @@ static inline void init_IO_APIC_traps(vo
*/
for_each_irq_desc(irq, desc) {
#ifdef CONFIG_XEN
- if (irq < PIRQ_BASE || irq >= PIRQ_BASE + NR_PIRQS)
+ if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs)
continue;
#endif
cfg = desc->chip_data;
@@ -2948,19 +2991,15 @@ static inline void __init check_timer(vo
int cpu = boot_cpu_id;
int apic1, pin1, apic2, pin2;
unsigned long flags;
- unsigned int ver;
int no_pin1 = 0;
local_irq_save(flags);
- ver = apic_read(APIC_LVR);
- ver = GET_APIC_VERSION(ver);
-
/*
* get/set the timer IRQ vector:
*/
disable_8259A_irq(0);
- assign_irq_vector(0, cfg, TARGET_CPUS);
+ assign_irq_vector(0, cfg, apic->target_cpus());
/*
* As IRQ0 is to be enabled in the 8259A, the virtual
@@ -2974,7 +3013,13 @@ static inline void __init check_timer(vo
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
init_8259A(1);
#ifdef CONFIG_X86_32
- timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
+ {
+ unsigned int ver;
+
+ ver = apic_read(APIC_LVR);
+ ver = GET_APIC_VERSION(ver);
+ timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
+ }
#endif
pin1 = find_isa_irq_pin(0, mp_INT);
@@ -2994,10 +3039,8 @@ static inline void __init check_timer(vo
* 8259A.
*/
if (pin1 == -1) {
-#ifdef CONFIG_INTR_REMAP
if (intr_remapping_enabled)
panic("BIOS bug: timer not connected to IO-APIC");
-#endif
pin1 = pin2;
apic1 = apic2;
no_pin1 = 1;
@@ -3013,8 +3056,17 @@ static inline void __init check_timer(vo
if (no_pin1) {
add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
+ } else {
+ /* for edge trigger, setup_IO_APIC_irq already
+ * leaves it unmasked, so we only need to unmask
+ * it if it is level-triggered.
+ * do we really have a level-triggered timer?
+ */
+ int idx;
+ idx = find_irq_entry(apic1, pin1, mp_INT);
+ if (idx != -1 && irq_trigger(idx))
+ unmask_IO_APIC_irq_desc(desc);
}
- unmask_IO_APIC_irq_desc(desc);
if (timer_irq_works()) {
if (nmi_watchdog == NMI_IO_APIC) {
setup_nmi();
@@ -3024,10 +3076,9 @@ static inline void __init check_timer(vo
clear_IO_APIC_pin(0, pin1);
goto out;
}
-#ifdef CONFIG_INTR_REMAP
if (intr_remapping_enabled)
panic("timer doesn't work through Interrupt-remapped IO-APIC");
-#endif
+ local_irq_disable();
clear_IO_APIC_pin(apic1, pin1);
if (!no_pin1)
apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
@@ -3042,7 +3093,6 @@ static inline void __init check_timer(vo
*/
replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
- unmask_IO_APIC_irq_desc(desc);
enable_8259A_irq(0);
if (timer_irq_works()) {
apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
@@ -3057,6 +3107,7 @@ static inline void __init check_timer(vo
/*
* Cleanup, just in case ...
*/
+ local_irq_disable();
disable_8259A_irq(0);
clear_IO_APIC_pin(apic2, pin2);
apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
@@ -3082,6 +3133,7 @@ static inline void __init check_timer(vo
apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
goto out;
}
+ local_irq_disable();
disable_8259A_irq(0);
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
@@ -3099,6 +3151,7 @@ static inline void __init check_timer(vo
apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
goto out;
}
+ local_irq_disable();
apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
"report. Then try booting with the 'noapic' option.\n");
@@ -3131,7 +3184,7 @@ out:
void __init setup_IO_APIC(void)
{
-#if defined(CONFIG_X86_32) || defined(CONFIG_XEN)
+#ifdef CONFIG_XEN
enable_IO_APIC();
#else
/*
@@ -3213,8 +3266,8 @@ static int ioapic_resume(struct sys_devi
spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(dev->id, 0);
- if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
- reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
+ if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) {
+ reg_00.bits.ID = mp_ioapics[dev->id].apicid;
io_apic_write(dev->id, 0, reg_00.raw);
}
spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -3264,6 +3317,7 @@ static int __init ioapic_init_sysfs(void
device_initcall(ioapic_init_sysfs);
+static int nr_irqs_gsi = NR_IRQS_LEGACY;
/*
* Dynamic irq allocate and deallocation
*/
@@ -3278,11 +3332,11 @@ unsigned int create_irq_nr(unsigned int
struct irq_desc *desc_new = NULL;
irq = 0;
- spin_lock_irqsave(&vector_lock, flags);
- for (new = irq_want; new < NR_IRQS; new++) {
- if (platform_legacy_irq(new))
- continue;
+ if (irq_want < nr_irqs_gsi)
+ irq_want = nr_irqs_gsi;
+ spin_lock_irqsave(&vector_lock, flags);
+ for (new = irq_want; new < nr_irqs; new++) {
desc_new = irq_to_desc_alloc_cpu(new, cpu);
if (!desc_new) {
printk(KERN_INFO "can not get irq_desc for %d\n", new);
@@ -3292,7 +3346,7 @@ unsigned int create_irq_nr(unsigned int
if (cfg_new->vector != 0)
continue;
- if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0)
+ if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
irq = new;
break;
}
@@ -3307,7 +3361,6 @@ unsigned int create_irq_nr(unsigned int
return irq;
}
-static int nr_irqs_gsi = NR_IRQS_LEGACY;
int create_irq(void)
{
unsigned int irq_want;
@@ -3336,9 +3389,7 @@ void destroy_irq(unsigned int irq)
if (desc)
desc->chip_data = cfg;
-#ifdef CONFIG_INTR_REMAP
free_irte(irq);
-#endif
spin_lock_irqsave(&vector_lock, flags);
__clear_irq_vector(irq, cfg);
spin_unlock_irqrestore(&vector_lock, flags);
@@ -3355,14 +3406,16 @@ static int msi_compose_msg(struct pci_de
int err;
unsigned dest;
+ if (disable_apic)
+ return -ENXIO;
+
cfg = irq_cfg(irq);
- err = assign_irq_vector(irq, cfg, TARGET_CPUS);
+ err = assign_irq_vector(irq, cfg, apic->target_cpus());
if (err)
return err;
- dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
+ dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
-#ifdef CONFIG_INTR_REMAP
if (irq_remapped(irq)) {
struct irte irte;
int ir_index;
@@ -3374,9 +3427,9 @@ static int msi_compose_msg(struct pci_de
memset (&irte, 0, sizeof(irte));
irte.present = 1;
- irte.dst_mode = INT_DEST_MODE;
+ irte.dst_mode = apic->irq_dest_mode;
irte.trigger_mode = 0; /* edge */
- irte.dlvry_mode = INT_DELIVERY_MODE;
+ irte.dlvry_mode = apic->irq_delivery_mode;
irte.vector = cfg->vector;
irte.dest_id = IRTE_DEST(dest);
@@ -3388,16 +3441,19 @@ static int msi_compose_msg(struct pci_de
MSI_ADDR_IR_SHV |
MSI_ADDR_IR_INDEX1(ir_index) |
MSI_ADDR_IR_INDEX2(ir_index);
- } else
-#endif
- {
- msg->address_hi = MSI_ADDR_BASE_HI;
+ } else {
+ if (x2apic_enabled())
+ msg->address_hi = MSI_ADDR_BASE_HI |
+ MSI_ADDR_EXT_DEST_ID(dest);
+ else
+ msg->address_hi = MSI_ADDR_BASE_HI;
+
msg->address_lo =
MSI_ADDR_BASE_LO |
- ((INT_DEST_MODE == 0) ?
+ ((apic->irq_dest_mode == 0) ?
MSI_ADDR_DEST_MODE_PHYSICAL:
MSI_ADDR_DEST_MODE_LOGICAL) |
- ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+ ((apic->irq_delivery_mode != dest_LowestPrio) ?
MSI_ADDR_REDIRECTION_CPU:
MSI_ADDR_REDIRECTION_LOWPRI) |
MSI_ADDR_DEST_ID(dest);
@@ -3405,7 +3461,7 @@ static int msi_compose_msg(struct pci_de
msg->data =
MSI_DATA_TRIGGER_EDGE |
MSI_DATA_LEVEL_ASSERT |
- ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+ ((apic->irq_delivery_mode != dest_LowestPrio) ?
MSI_DATA_DELIVERY_FIXED:
MSI_DATA_DELIVERY_LOWPRI) |
MSI_DATA_VECTOR(cfg->vector);
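
For the non-remapped branch, the fields OR-ed together above follow the standard x86 MSI encoding; a hedged summary, with bit positions quoted from that convention rather than from this patch:

	/* sketch of the composed message:
	 *   address: 0xFEExxxxx base
	 *            | destination id   (bits 19:12)
	 *            | dest mode        (bit 2: 0 physical, 1 logical)
	 *            | redirection hint (bit 3: 0 fixed, 1 lowest prio)
	 *   data:    edge trigger, level assert
	 *            | delivery mode    (bits 10:8)
	 *            | vector           (bits 7:0)
	 * With x2apic enabled, the high address word additionally
	 * carries the extended destination id, as shown above. */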
@@ -3491,15 +3547,16 @@ static struct irq_chip msi_chip = {
.retrigger = ioapic_retrigger_irq,
};
-#ifdef CONFIG_INTR_REMAP
static struct irq_chip msi_ir_chip = {
.name = "IR-PCI-MSI",
.unmask = unmask_msi_irq,
.mask = mask_msi_irq,
- .ack = ack_x2apic_edge,
+#ifdef CONFIG_INTR_REMAP
+ .ack = ir_ack_apic_edge,
#ifdef CONFIG_SMP
.set_affinity = ir_set_msi_irq_affinity,
#endif
+#endif
.retrigger = ioapic_retrigger_irq,
};
@@ -3529,7 +3586,6 @@ static int msi_alloc_irte(struct pci_dev
}
return index;
}
-#endif
static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
{
@@ -3543,7 +3599,6 @@ static int setup_msi_irq(struct pci_dev
set_irq_msi(irq, msidesc);
write_msi_msg(irq, &msg);
-#ifdef CONFIG_INTR_REMAP
if (irq_remapped(irq)) {
struct irq_desc *desc = irq_to_desc(irq);
/*
@@ -3552,7 +3607,6 @@ static int setup_msi_irq(struct pci_dev
desc->status |= IRQ_MOVE_PCNTXT;
set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
} else
-#endif
set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
@@ -3560,60 +3614,26 @@ static int setup_msi_irq(struct pci_dev
return 0;
}
-int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc)
-{
- unsigned int irq;
- int ret;
- unsigned int irq_want;
-
- irq_want = nr_irqs_gsi;
- irq = create_irq_nr(irq_want);
- if (irq == 0)
- return -1;
-
-#ifdef CONFIG_INTR_REMAP
- if (!intr_remapping_enabled)
- goto no_ir;
-
- ret = msi_alloc_irte(dev, irq, 1);
- if (ret < 0)
- goto error;
-no_ir:
-#endif
- ret = setup_msi_irq(dev, msidesc, irq);
- if (ret < 0) {
- destroy_irq(irq);
- return ret;
- }
- return 0;
-
-#ifdef CONFIG_INTR_REMAP
-error:
- destroy_irq(irq);
- return ret;
-#endif
-}
-
int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
{
unsigned int irq;
int ret, sub_handle;
struct msi_desc *msidesc;
unsigned int irq_want;
-
-#ifdef CONFIG_INTR_REMAP
- struct intel_iommu *iommu = 0;
+ struct intel_iommu *iommu = NULL;
int index = 0;
-#endif
+
+ /* x86 doesn't support multiple MSI yet */
+ if (type == PCI_CAP_ID_MSI && nvec > 1)
+ return 1;
irq_want = nr_irqs_gsi;
sub_handle = 0;
list_for_each_entry(msidesc, &dev->msi_list, list) {
irq = create_irq_nr(irq_want);
- irq_want++;
if (irq == 0)
return -1;
-#ifdef CONFIG_INTR_REMAP
+ irq_want = irq + 1;
if (!intr_remapping_enabled)
goto no_ir;
@@ -3641,7 +3661,6 @@ int arch_setup_msi_irqs(struct pci_dev *
set_irte_irq(irq, iommu, index, sub_handle);
}
no_ir:
-#endif
ret = setup_msi_irq(dev, msidesc, irq);
if (ret < 0)
goto error;
@@ -3659,7 +3678,7 @@ void arch_teardown_msi_irq(unsigned int
destroy_irq(irq);
}
-#ifdef CONFIG_DMAR
+#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
#ifdef CONFIG_SMP
static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
{
@@ -3740,7 +3759,7 @@ static void hpet_msi_set_affinity(unsign
#endif /* CONFIG_SMP */
-struct irq_chip hpet_msi_type = {
+static struct irq_chip hpet_msi_type = {
.name = "HPET_MSI",
.unmask = hpet_msi_unmask,
.mask = hpet_msi_mask,
@@ -3755,12 +3774,14 @@ int arch_setup_hpet_msi(unsigned int irq
{
int ret;
struct msi_msg msg;
+ struct irq_desc *desc = irq_to_desc(irq);
ret = msi_compose_msg(NULL, irq, &msg);
if (ret < 0)
return ret;
hpet_msi_write(irq, &msg);
+ desc->status |= IRQ_MOVE_PCNTXT;
set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq,
"edge");
@@ -3823,13 +3844,17 @@ int arch_setup_ht_irq(unsigned int irq,
struct irq_cfg *cfg;
int err;
+ if (disable_apic)
+ return -ENXIO;
+
cfg = irq_cfg(irq);
- err = assign_irq_vector(irq, cfg, TARGET_CPUS);
+ err = assign_irq_vector(irq, cfg, apic->target_cpus());
if (!err) {
struct ht_irq_msg msg;
unsigned dest;
- dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
+ dest = apic->cpu_mask_to_apicid_and(cfg->domain,
+ apic->target_cpus());
msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
@@ -3837,11 +3862,11 @@ int arch_setup_ht_irq(unsigned int irq,
HT_IRQ_LOW_BASE |
HT_IRQ_LOW_DEST_ID(dest) |
HT_IRQ_LOW_VECTOR(cfg->vector) |
- ((INT_DEST_MODE == 0) ?
+ ((apic->irq_dest_mode == 0) ?
HT_IRQ_LOW_DM_PHYSICAL :
HT_IRQ_LOW_DM_LOGICAL) |
HT_IRQ_LOW_RQEOI_EDGE |
- ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+ ((apic->irq_delivery_mode != dest_LowestPrio) ?
HT_IRQ_LOW_MT_FIXED :
HT_IRQ_LOW_MT_ARBITRATED) |
HT_IRQ_LOW_IRQ_MASKED;
@@ -3857,7 +3882,7 @@ int arch_setup_ht_irq(unsigned int irq,
}
#endif /* CONFIG_HT_IRQ */
-#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
+#ifdef CONFIG_X86_UV
/*
* Re-target the irq to the specified CPU and enable the specified MMR located
* on the specified blade to allow the sending of MSIs to the specified CPU.
@@ -3889,12 +3914,12 @@ int arch_enable_uv_irq(char *irq_name, u
BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
entry->vector = cfg->vector;
- entry->delivery_mode = INT_DELIVERY_MODE;
- entry->dest_mode = INT_DEST_MODE;
+ entry->delivery_mode = apic->irq_delivery_mode;
+ entry->dest_mode = apic->irq_dest_mode;
entry->polarity = 0;
entry->trigger = 0;
entry->mask = 0;
- entry->dest = cpu_mask_to_apicid(eligible_cpu);
+ entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
mmr_pnode = uv_blade_to_pnode(mmr_blade);
uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
@@ -3957,7 +3982,29 @@ void __init probe_nr_irqs_gsi(void)
printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
}
+
+#ifdef CONFIG_SPARSE_IRQ
+int __init arch_probe_nr_irqs(void)
+{
+ int nr;
+
+ if (nr_irqs > (NR_VECTORS * nr_cpu_ids))
+ nr_irqs = NR_VECTORS * nr_cpu_ids;
+
+ nr = nr_irqs_gsi + 8 * nr_cpu_ids;
+#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ)
+ /*
+ * for MSI and HT dyn irq
+ */
+ nr += nr_irqs_gsi * 16;
#endif
+ if (nr < nr_irqs)
+ nr_irqs = nr;
+
+ return 0;
+}
+#endif
+#endif /* CONFIG_XEN */
/* --------------------------------------------------------------------------
ACPI-based IOAPIC Configuration
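
The sizing heuristic above first caps nr_irqs at NR_VECTORS * nr_cpu_ids, then derives a budget from the GSI count. A worked example with assumed inputs (nr_cpu_ids = 4, nr_irqs_gsi = 24, MSI/HT enabled):

	/* worked example, inputs are assumptions:
	 *   nr  = nr_irqs_gsi + 8 * nr_cpu_ids  =  24 + 32  =  56
	 *   nr += nr_irqs_gsi * 16              =  56 + 384 = 440
	 * nr_irqs is then lowered to 440 if it was larger; the
	 * function never raises it. */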
@@ -3985,7 +4032,7 @@ int __init io_apic_get_unique_id(int ioa
*/
if (physids_empty(apic_id_map))
- apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
+ apic_id_map = apic->ioapic_phys_id_map(phys_cpu_present_map);
spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(ioapic, 0);
@@ -4001,10 +4048,10 @@ int __init io_apic_get_unique_id(int ioa
* Every APIC in a system must have a unique ID or we get lots of nice
* 'stuck on smp_invalidate_needed IPI wait' messages.
*/
- if (check_apicid_used(apic_id_map, apic_id)) {
+ if (apic->check_apicid_used(apic_id_map, apic_id)) {
for (i = 0; i < get_physical_broadcast(); i++) {
- if (!check_apicid_used(apic_id_map, i))
+ if (!apic->check_apicid_used(apic_id_map, i))
break;
}
@@ -4017,7 +4064,7 @@ int __init io_apic_get_unique_id(int ioa
apic_id = i;
}
- tmp = apicid_to_cpu_present(apic_id);
+ tmp = apic->apicid_to_cpu_present(apic_id);
physids_or(apic_id_map, apic_id_map, tmp);
if (reg_00.bits.ID != apic_id) {
@@ -4062,7 +4109,7 @@ int io_apic_set_pci_routing (int ioapic,
int cpu = boot_cpu_id;
#ifdef CONFIG_XEN
- if (irq < PIRQ_BASE || irq >= PIRQ_BASE + NR_PIRQS) {
+ if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs) {
apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ %d\n",
ioapic, irq);
return -EINVAL;
@@ -4103,8 +4150,8 @@ int acpi_get_override_irq(int bus_irq, i
return -1;
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mp_irqtype == mp_INT &&
- mp_irqs[i].mp_srcbusirq == bus_irq)
+ if (mp_irqs[i].irqtype == mp_INT &&
+ mp_irqs[i].srcbusirq == bus_irq)
break;
if (i >= mp_irq_entries)
return -1;
@@ -4120,7 +4167,7 @@ int acpi_get_override_irq(int bus_irq, i
/*
* This function currently is only a helper for the i386 smp boot process where
* we need to reprogram the ioredtbls to cater for the cpus which have come online
- * so mask in all cases should simply be TARGET_CPUS
+ * so the mask in all cases should simply be apic->target_cpus()
*/
#ifdef CONFIG_SMP
void __init setup_ioapic_dest(void)
@@ -4159,15 +4206,13 @@ void __init setup_ioapic_dest(void)
*/
if (desc->status &
(IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
- mask = &desc->affinity;
+ mask = desc->affinity;
else
- mask = TARGET_CPUS;
+ mask = apic->target_cpus();
-#ifdef CONFIG_INTR_REMAP
if (intr_remapping_enabled)
set_ir_ioapic_affinity_irq_desc(desc, mask);
else
-#endif
set_ioapic_affinity_irq_desc(desc, mask);
}
@@ -4220,7 +4265,7 @@ void __init ioapic_init_mappings(void)
ioapic_res = ioapic_setup_resources();
for (i = 0; i < nr_ioapics; i++) {
if (smp_found_config) {
- ioapic_phys = mp_ioapics[i].mp_apicaddr;
+ ioapic_phys = mp_ioapics[i].apicaddr;
#ifdef CONFIG_X86_32
if (!ioapic_phys) {
printk(KERN_ERR
@@ -4260,9 +4305,12 @@ static int __init ioapic_insert_resource
struct resource *r = ioapic_resources;
if (!r) {
- printk(KERN_ERR
- "IO APIC resources could be not be allocated.\n");
- return -1;
+ if (nr_ioapics > 0) {
+ printk(KERN_ERR
+ "IO APIC resources couldn't be allocated.\n");
+ return -1;
+ }
+ return 0;
}
for (i = 0; i < nr_ioapics; i++) {
--- head-2010-05-25.orig/arch/x86/kernel/apic/ipi-xen.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/apic/ipi-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -17,38 +17,8 @@
#include <asm/mmu_context.h>
#include <asm/apic.h>
#include <asm/proto.h>
+#include <asm/ipi.h>
-#ifdef CONFIG_X86_32
-#ifndef CONFIG_XEN
-#include <mach_apic.h>
-#include <mach_ipi.h>
-
-/*
- * the following functions deal with sending IPIs between CPUs.
- *
- * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
- */
-
-static inline int __prepare_ICR(unsigned int shortcut, int vector)
-{
- unsigned int icr = shortcut | APIC_DEST_LOGICAL;
-
- switch (vector) {
- default:
- icr |= APIC_DM_FIXED | vector;
- break;
- case NMI_VECTOR:
- icr |= APIC_DM_NMI;
- break;
- }
- return icr;
-}
-
-static inline int __prepare_ICR2(unsigned int mask)
-{
- return SET_APIC_DEST_FIELD(mask);
-}
-#else
#include <xen/evtchn.h>
DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
@@ -59,36 +29,10 @@ static inline void __send_IPI_one(unsign
BUG_ON(irq < 0);
notify_remote_via_irq(irq);
}
-#endif
-void __send_IPI_shortcut(unsigned int shortcut, int vector)
+static void __send_IPI_shortcut(unsigned int shortcut, int vector)
{
-#ifndef CONFIG_XEN
- /*
- * Subtle. In the case of the 'never do double writes' workaround
- * we have to lock out interrupts to be safe. As we don't care
- * of the value read we use an atomic rmw access to avoid costly
- * cli/sti. Otherwise we use an even cheaper single atomic write
- * to the APIC.
- */
- unsigned int cfg;
-
- /*
- * Wait for idle.
- */
- apic_wait_icr_idle();
-
- /*
- * No need to touch the target chip field
- */
- cfg = __prepare_ICR(shortcut, vector);
-
- /*
- * Send the IPI. The write to APIC_ICR fires this off.
- */
- apic_write(APIC_ICR, cfg);
-#else
- int cpu;
+ unsigned int cpu;
switch (shortcut) {
case APIC_DEST_SELF:
@@ -99,149 +43,53 @@ void __send_IPI_shortcut(unsigned int sh
if (cpu != smp_processor_id())
__send_IPI_one(cpu, vector);
break;
+ case APIC_DEST_ALLINC:
+ for_each_online_cpu(cpu)
+ __send_IPI_one(cpu, vector);
+ break;
default:
printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
vector);
break;
}
-#endif
}
-void send_IPI_self(int vector)
+void xen_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)
{
- __send_IPI_shortcut(APIC_DEST_SELF, vector);
-}
-
-#ifndef CONFIG_XEN
-/*
- * This is used to send an IPI with no shorthand notation (the destination is
- * specified in bits 56 to 63 of the ICR).
- */
-static inline void __send_IPI_dest_field(unsigned long mask, int vector)
-{
- unsigned long cfg;
-
- /*
- * Wait for idle.
- */
- if (unlikely(vector == NMI_VECTOR))
- safe_apic_wait_icr_idle();
- else
- apic_wait_icr_idle();
-
- /*
- * prepare target chip field
- */
- cfg = __prepare_ICR2(mask);
- apic_write(APIC_ICR2, cfg);
-
- /*
- * program the ICR
- */
- cfg = __prepare_ICR(0, vector);
-
- /*
- * Send the IPI. The write to APIC_ICR fires this off.
- */
- apic_write(APIC_ICR, cfg);
-}
-#endif
-
-/*
- * This is only used on smaller machines.
- */
-void send_IPI_mask_bitmask(const struct cpumask *cpumask, int vector)
-{
-#ifndef CONFIG_XEN
- unsigned long mask = cpumask_bits(cpumask)[0];
-#else
unsigned int cpu;
-#endif
unsigned long flags;
local_irq_save(flags);
-#ifndef CONFIG_XEN
- WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]);
- __send_IPI_dest_field(mask, vector);
-#else
WARN_ON(!cpumask_subset(cpumask, cpu_online_mask));
for_each_cpu_and(cpu, cpumask, cpu_online_mask)
- __send_IPI_one(cpu, vector);
-#endif
+ if (cpu != smp_processor_id())
+ __send_IPI_one(cpu, vector);
local_irq_restore(flags);
}
-void send_IPI_mask_sequence(const struct cpumask *mask, int vector)
+void xen_send_IPI_mask(const struct cpumask *cpumask, int vector)
{
-#ifndef CONFIG_XEN
+ unsigned int cpu;
unsigned long flags;
- unsigned int query_cpu;
-
- /*
- * Hack. The clustered APIC addressing mode doesn't allow us to send
- * to an arbitrary mask, so I do a unicasts to each CPU instead. This
- * should be modified to do 1 message per cluster ID - mbligh
- */
local_irq_save(flags);
- for_each_cpu(query_cpu, mask)
- __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), vector);
+ WARN_ON(!cpumask_subset(cpumask, cpu_online_mask));
+ for_each_cpu_and(cpu, cpumask, cpu_online_mask)
+ __send_IPI_one(cpu, vector);
local_irq_restore(flags);
-#else
- send_IPI_mask_bitmask(mask, vector);
-#endif
}
-void send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+void xen_send_IPI_allbutself(int vector)
{
- unsigned long flags;
- unsigned int query_cpu;
- unsigned int this_cpu = smp_processor_id();
-
- /* See Hack comment above */
-
- local_irq_save(flags);
-#ifndef CONFIG_XEN
- for_each_cpu(query_cpu, mask)
- if (query_cpu != this_cpu)
- __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu),
- vector);
-#else
- WARN_ON(!cpumask_subset(mask, cpu_online_mask));
- for_each_cpu_and(query_cpu, mask, cpu_online_mask)
- if (query_cpu != this_cpu)
- __send_IPI_one(query_cpu, vector);
-#endif
- local_irq_restore(flags);
+ __send_IPI_shortcut(APIC_DEST_ALLBUT, vector);
}
-#ifndef CONFIG_XEN
-/* must come after the send_IPI functions above for inlining */
-static int convert_apicid_to_cpu(int apic_id)
-{
- int i;
-
- for_each_possible_cpu(i) {
- if (per_cpu(x86_cpu_to_apicid, i) == apic_id)
- return i;
- }
- return -1;
+void xen_send_IPI_all(int vector)
+{
+ __send_IPI_shortcut(APIC_DEST_ALLINC, vector);
}
-int safe_smp_processor_id(void)
+void xen_send_IPI_self(int vector)
{
- int apicid, cpuid;
-
- if (!boot_cpu_has(X86_FEATURE_APIC))
- return 0;
-
- apicid = hard_smp_processor_id();
- if (apicid == BAD_APICID)
- return 0;
-
- cpuid = convert_apicid_to_cpu(apicid);
-
- return cpuid >= 0 ? cpuid : 0;
+ __send_IPI_shortcut(APIC_DEST_SELF, vector);
}
-#endif
-#endif
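
The rewritten ipi-xen.c above keeps only the Xen transport: rather than
programming the local APIC ICR, every IPI is delivered by signalling the
event channel bound to the (cpu, vector) pair, via notify_remote_via_irq().
A standalone sketch of that lookup-and-notify scheme follows; the table and
notify() below are stand-ins for per_cpu(ipi_to_irq) and the hypercall, and
the sizes are made up:

#include <assert.h>
#include <stdio.h>

#define NR_CPUS	4
#define NR_IPIS	8

static int ipi_to_irq[NR_CPUS][NR_IPIS];	/* filled at CPU bring-up */

static void notify(int irq)
{
	printf("kick irq %d\n", irq);		/* would be a hypercall */
}

static void send_ipi_one(int cpu, int vector)
{
	int irq = ipi_to_irq[cpu][vector];

	assert(irq >= 0);			/* mirrors the BUG_ON() above */
	notify(irq);
}

int main(void)
{
	int cpu, vec;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		for (vec = 0; vec < NR_IPIS; vec++)
			ipi_to_irq[cpu][vec] = 100 + cpu * NR_IPIS + vec;

	for (cpu = 0; cpu < NR_CPUS; cpu++)	/* APIC_DEST_ALLINC */
		send_ipi_one(cpu, 2);
	return 0;
}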
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ head-2010-05-25/arch/x86/kernel/apic/probe_32-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -0,0 +1,57 @@
+/*
+ * Default generic APIC driver. This handles up to 8 CPUs.
+ *
+ * Copyright 2003 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License, v.2
+ *
+ * Generic x86 APIC driver probe layer.
+ */
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <asm/fixmap.h>
+#include <asm/mpspec.h>
+#include <asm/apicdef.h>
+#include <asm/apic.h>
+#include <asm/setup.h>
+
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <asm/acpi.h>
+#include <asm/e820.h>
+#include <asm/ipi.h>
+
+static int xen_phys_pkg_id(int cpuid_apic, int index_msb)
+{
+ return cpuid_apic;
+}
+
+static struct apic apic_xen = {
+
+ .name = "default",
+
+ .irq_delivery_mode = dest_LowestPrio,
+ /* logical delivery broadcast to all CPUs: */
+ .irq_dest_mode = 1,
+
+ .target_cpus = default_target_cpus,
+
+ .phys_pkg_id = xen_phys_pkg_id,
+ .mps_oem_check = NULL,
+
+#ifdef CONFIG_SMP
+ .send_IPI_mask = xen_send_IPI_mask,
+ .send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = xen_send_IPI_allbutself,
+ .send_IPI_all = xen_send_IPI_all,
+ .send_IPI_self = xen_send_IPI_self,
+#endif
+};
+
+struct apic *apic = &apic_xen;
+EXPORT_SYMBOL_GPL(apic);
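
probe_32-xen.c pins the global driver pointer to one designated-initializer
instance; hooks the Xen port does not implement are simply left out and so
stay NULL (and the IPI hooks only exist under CONFIG_SMP). A tiny sketch --
hypothetical struct, not the kernel's -- of why that is safe for callers
that test optional hooks before use:

#include <stdio.h>

struct driver {
	const char *name;
	void (*send_self)(int vector);
	void (*optional_hook)(void);	/* not set below => NULL */
};

static void self_send(int vector)
{
	printf("self IPI, vector %d\n", vector);
}

static struct driver drv = {
	.name		= "default",
	.send_self	= self_send,
};

int main(void)
{
	drv.send_self(2);
	if (drv.optional_hook)		/* callers check optional hooks */
		drv.optional_hook();
	else
		printf("%s: optional_hook not implemented\n", drv.name);
	return 0;
}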
--- head-2010-05-25.orig/arch/x86/kernel/asm-offsets_32.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/asm-offsets_32.c 2010-03-24 15:25:06.000000000 +0100
@@ -115,6 +115,11 @@ void foo(void)
OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
+#ifdef CONFIG_XEN
+ BLANK();
+ OFFSET(XEN_START_mfn_list, start_info, mfn_list);
+#endif
+
#ifdef CONFIG_PARAVIRT
BLANK();
OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
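
The asm-offsets hunk exports the byte offset of start_info->mfn_list to
assembly as XEN_START_mfn_list; OFFSET() boils down to offsetof() on the C
struct, scraped from the compiler's asm output at build time. A plain-C
illustration of the mechanism -- the struct below is a stand-in, not the
real Xen start_info layout:

#include <stddef.h>
#include <stdio.h>

struct start_info_model {		/* hypothetical layout */
	char magic[32];
	unsigned long nr_pages;
	unsigned long mfn_list;
};

int main(void)
{
	/* asm-offsets would emit this as a #define for .S files */
	printf("#define XEN_START_mfn_list %zu\n",
	       offsetof(struct start_info_model, mfn_list));
	return 0;
}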
--- head-2010-05-25.orig/arch/x86/kernel/cpu/common-xen.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/cpu/common-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -1,101 +1,94 @@
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/string.h>
#include <linux/bootmem.h>
+#include <linux/linkage.h>
#include <linux/bitops.h>
+#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/kgdb.h>
-#include <linux/topology.h>
+#include <linux/percpu.h>
+#include <linux/string.h>
#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/kgdb.h>
#include <linux/smp.h>
-#include <linux/percpu.h>
-#include <asm/i387.h>
-#include <asm/msr.h>
-#include <asm/io.h>
-#include <asm/linkage.h>
+#include <linux/io.h>
+
+#include <asm/stackprotector.h>
#include <asm/mmu_context.h>
+#include <asm/hypervisor.h>
+#include <asm/processor.h>
+#include <asm/sections.h>
+#include <asm/topology.h>
+#include <asm/cpumask.h>
+#include <asm/pgtable.h>
+#include <asm/atomic.h>
+#include <asm/proto.h>
+#include <asm/setup.h>
+#include <asm/apic.h>
+#include <asm/desc.h>
+#include <asm/i387.h>
#include <asm/mtrr.h>
+#include <asm/numa.h>
+#include <asm/asm.h>
+#include <asm/cpu.h>
#include <asm/mce.h>
+#include <asm/msr.h>
#include <asm/pat.h>
-#include <asm/asm.h>
-#include <asm/numa.h>
#include <asm/smp.h>
+
#ifdef CONFIG_X86_LOCAL_APIC
-#include <asm/mpspec.h>
-#include <asm/apic.h>
-#include <mach_apic.h>
-#include <asm/genapic.h>
-#elif defined(CONFIG_X86_64_XEN)
-#include <mach_apic.h>
+#include <asm/uv/uv.h>
#endif
-#include <asm/pda.h>
-#include <asm/pgtable.h>
-#include <asm/processor.h>
-#include <asm/desc.h>
-#include <asm/atomic.h>
-#include <asm/proto.h>
-#include <asm/sections.h>
-#include <asm/setup.h>
-#include <asm/hypervisor.h>
-
#ifdef CONFIG_XEN
-#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_LOCAL_APIC)
-#define phys_pkg_id(a,b) a
-#endif
#include <xen/interface/callback.h>
#endif
#include "cpu.h"
-#ifdef CONFIG_X86_64
-
/* all of these masks are initialized in setup_cpu_local_masks() */
+cpumask_var_t cpu_initialized_mask;
#ifndef CONFIG_XEN
-cpumask_var_t cpu_callin_mask;
cpumask_var_t cpu_callout_mask;
+cpumask_var_t cpu_callin_mask;
#endif
-cpumask_var_t cpu_initialized_mask;
/* representing cpus for which sibling maps can be computed */
cpumask_var_t cpu_sibling_setup_mask;
-#else /* CONFIG_X86_32 */
-
+/* correctly size the local cpu masks */
+void __init setup_cpu_local_masks(void)
+{
+ alloc_bootmem_cpumask_var(&cpu_initialized_mask);
#ifndef CONFIG_XEN
-cpumask_t cpu_callin_map;
-cpumask_t cpu_callout_map;
+ alloc_bootmem_cpumask_var(&cpu_callin_mask);
+ alloc_bootmem_cpumask_var(&cpu_callout_mask);
#endif
-cpumask_t cpu_initialized;
-cpumask_t cpu_sibling_setup_map;
-
-#endif /* CONFIG_X86_32 */
-
+ alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
+}
-static struct cpu_dev *this_cpu __cpuinitdata;
+static const struct cpu_dev *this_cpu __cpuinitdata;
+DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
#ifdef CONFIG_X86_64
-/* We need valid kernel segments for data and code in long mode too
- * IRET will check the segment types kkeil 2000/10/28
- * Also sysret mandates a special GDT layout
- */
-/* The TLS descriptors are currently at a different place compared to i386.
- Hopefully nobody expects them at a fixed place (Wine?) */
-DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
- [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
- [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
- [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
- [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
- [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
- [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
-} };
+ /*
+ * We need valid kernel segments for data and code in long mode too
+ * IRET will check the segment types kkeil 2000/10/28
+ * Also sysret mandates a special GDT layout
+ *
+ * TLS descriptors are currently at a different place compared to i386.
+ * Hopefully nobody expects them at a fixed place (Wine?)
+ */
+ [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
+ [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
+ [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
+ [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
+ [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
+ [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
#else
-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
- [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
- [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
- [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
- [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
+ [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
+ [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
+ [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
+ [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
#ifndef CONFIG_XEN
/*
* Segments used for calling PnP BIOS have byte granularity.
@@ -103,33 +96,41 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_p
* the transfer segment sizes are set at run time.
*/
/* 32-bit code */
- [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
+ [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
/* 16-bit code */
- [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
+ [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
/* 16-bit data */
- [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
+ [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
/* 16-bit data */
- [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
+ [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
/* 16-bit data */
- [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
+ [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
/*
* The APM segments have byte granularity and their bases
* are set at run time. All have 64k limits.
*/
/* 32-bit code */
- [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
+ [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
/* 16-bit code */
- [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
+ [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
/* data */
- [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
+ [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
- [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
+ [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
#endif
- [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
-} };
+ [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
+ GDT_STACK_CANARY_INIT
#endif
+} };
EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
+static int __init x86_xsave_setup(char *s)
+{
+ setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+ return 1;
+}
+__setup("noxsave", x86_xsave_setup);
+
#ifdef CONFIG_X86_32
static int cachesize_override __cpuinitdata = -1;
static int disable_x86_serial_nr __cpuinitdata = 1;
@@ -168,16 +169,17 @@ static inline int flag_is_changeable_p(u
* the CPUID. Add "volatile" to not allow gcc to
* optimize the subsequent calls to this function.
*/
- asm volatile ("pushfl\n\t"
- "pushfl\n\t"
- "popl %0\n\t"
- "movl %0,%1\n\t"
- "xorl %2,%0\n\t"
- "pushl %0\n\t"
- "popfl\n\t"
- "pushfl\n\t"
- "popl %0\n\t"
- "popfl\n\t"
+ asm volatile ("pushfl \n\t"
+ "pushfl \n\t"
+ "popl %0 \n\t"
+ "movl %0, %1 \n\t"
+ "xorl %2, %0 \n\t"
+ "pushl %0 \n\t"
+ "popfl \n\t"
+ "pushfl \n\t"
+ "popl %0 \n\t"
+ "popfl \n\t"
+
: "=&r" (f1), "=&r" (f2)
: "ir" (flag));
@@ -192,18 +194,22 @@ static int __cpuinit have_cpuid_p(void)
static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
{
- if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) {
- /* Disable processor serial number */
- unsigned long lo, hi;
- rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
- lo |= 0x200000;
- wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
- printk(KERN_NOTICE "CPU serial number disabled.\n");
- clear_cpu_cap(c, X86_FEATURE_PN);
+ unsigned long lo, hi;
- /* Disabling the serial number may affect the cpuid level */
- c->cpuid_level = cpuid_eax(0);
- }
+ if (!cpu_has(c, X86_FEATURE_PN) || !disable_x86_serial_nr)
+ return;
+
+ /* Disable processor serial number: */
+
+ rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
+ lo |= 0x200000;
+ wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
+
+ printk(KERN_NOTICE "CPU serial number disabled.\n");
+ clear_cpu_cap(c, X86_FEATURE_PN);
+
+ /* Disabling the serial number may affect the cpuid level */
+ c->cpuid_level = cpuid_eax(0);
}
static int __init x86_serial_nr_setup(char *s)
@@ -228,16 +234,64 @@ static inline void squash_the_stupid_ser
#endif
/*
+ * Some CPU features depend on higher CPUID levels, which may not always
+ * be available due to CPUID level capping or broken virtualization
+ * software. Add those features to this table to auto-disable them.
+ */
+struct cpuid_dependent_feature {
+ u32 feature;
+ u32 level;
+};
+
+static const struct cpuid_dependent_feature __cpuinitconst
+cpuid_dependent_features[] = {
+ { X86_FEATURE_MWAIT, 0x00000005 },
+ { X86_FEATURE_DCA, 0x00000009 },
+ { X86_FEATURE_XSAVE, 0x0000000d },
+ { 0, 0 }
+};
+
+static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
+{
+ const struct cpuid_dependent_feature *df;
+
+ for (df = cpuid_dependent_features; df->feature; df++) {
+
+ if (!cpu_has(c, df->feature))
+ continue;
+ /*
+ * Note: cpuid_level is set to -1 if unavailable, but
+	 * extended_cpuid_level is set to 0 if unavailable
+ * and the legitimate extended levels are all negative
+ * when signed; hence the weird messing around with
+ * signs here...
+ */
+ if (!((s32)df->level < 0 ?
+ (u32)df->level > (u32)c->extended_cpuid_level :
+ (s32)df->level > (s32)c->cpuid_level))
+ continue;
+
+ clear_cpu_cap(c, df->feature);
+ if (!warn)
+ continue;
+
+ printk(KERN_WARNING
+ "CPU: CPU feature %s disabled, no CPUID level 0x%x\n",
+ x86_cap_flags[df->feature], df->level);
+ }
+}
+
+/*
* Naming convention should be: <Name> [(<Codename>)]
* This table only is used unless init_<vendor>() below doesn't set it;
- * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
- *
+ * in particular, if CPUID levels 0x80000002..4 are supported, this
+ * isn't used
*/
/* Look up CPU names by table lookup. */
-static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
+static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
{
- struct cpu_model_info *info;
+ const struct cpu_model_info *info;
if (c->x86_model >= 16)
return NULL; /* Range check */
@@ -257,32 +311,52 @@ static char __cpuinit *table_lookup_mode
__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
-/* Current gdt points %fs at the "master" per-cpu area: after this,
- * it's on the real one. */
-void switch_to_new_gdt(void)
+void load_percpu_segment(int cpu)
+{
+#ifdef CONFIG_X86_32
+ loadsegment(fs, __KERNEL_PERCPU);
+#else
+ loadsegment(gs, 0);
+#ifndef CONFIG_XEN
+ wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu));
+#else
+ if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
+ (unsigned long)per_cpu(irq_stack_union.gs_base, cpu)))
+ BUG();
+#endif
+#endif
+ load_stack_canary_segment();
+}
+
+/*
+ * Current gdt points %fs at the "master" per-cpu area: after this,
+ * it's on the real one.
+ */
+void switch_to_new_gdt(int cpu)
{
struct desc_ptr gdt_descr;
unsigned long va, frames[16];
int f;
- gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
+ gdt_descr.address = (long)get_cpu_gdt_table(cpu);
gdt_descr.size = GDT_SIZE - 1;
for (va = gdt_descr.address, f = 0;
va < gdt_descr.address + gdt_descr.size;
va += PAGE_SIZE, f++) {
- frames[f] = virt_to_mfn(va);
- make_lowmem_page_readonly(
- (void *)va, XENFEAT_writable_descriptor_tables);
+ frames[f] = arbitrary_virt_to_mfn(va);
+ make_page_readonly((void *)va,
+ XENFEAT_writable_descriptor_tables);
}
if (HYPERVISOR_set_gdt(frames, (gdt_descr.size + 1) / 8))
BUG();
-#ifdef CONFIG_X86_32
- asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
-#endif
+
+ /* Reload the per-cpu base */
+
+ load_percpu_segment(cpu);
}
-static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
+static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {};
static void __cpuinit default_init(struct cpuinfo_x86 *c)
{
@@ -301,7 +375,7 @@ static void __cpuinit default_init(struc
#endif
}
-static struct cpu_dev __cpuinitdata default_cpu = {
+static const struct cpu_dev __cpuinitconst default_cpu = {
.c_init = default_init,
.c_vendor = "Unknown",
.c_x86_vendor = X86_VENDOR_UNKNOWN,
@@ -315,22 +389,24 @@ static void __cpuinit get_model_name(str
if (c->extended_cpuid_level < 0x80000004)
return;
- v = (unsigned int *) c->x86_model_id;
+ v = (unsigned int *)c->x86_model_id;
cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
c->x86_model_id[48] = 0;
- /* Intel chips right-justify this string for some dumb reason;
- undo that brain damage */
+ /*
+ * Intel chips right-justify this string for some dumb reason;
+ * undo that brain damage:
+ */
p = q = &c->x86_model_id[0];
while (*p == ' ')
- p++;
+ p++;
if (p != q) {
- while (*p)
- *q++ = *p++;
- while (q <= &c->x86_model_id[48])
- *q++ = '\0'; /* Zero-pad the rest */
+ while (*p)
+ *q++ = *p++;
+ while (q <= &c->x86_model_id[48])
+ *q++ = '\0'; /* Zero-pad the rest */
}
}
@@ -399,36 +475,30 @@ void __cpuinit detect_ht(struct cpuinfo_
if (smp_num_siblings == 1) {
printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
- } else if (smp_num_siblings > 1) {
+ goto out;
+ }
- if (smp_num_siblings > nr_cpu_ids) {
- printk(KERN_WARNING "CPU: Unsupported number of siblings %d",
- smp_num_siblings);
- smp_num_siblings = 1;
- return;
- }
+ if (smp_num_siblings <= 1)
+ goto out;
- index_msb = get_count_order(smp_num_siblings);
-#ifdef CONFIG_X86_64
- c->phys_proc_id = phys_pkg_id(index_msb);
-#else
- c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb);
-#endif
+ if (smp_num_siblings > nr_cpu_ids) {
+ pr_warning("CPU: Unsupported number of siblings %d",
+ smp_num_siblings);
+ smp_num_siblings = 1;
+ return;
+ }
- smp_num_siblings = smp_num_siblings / c->x86_max_cores;
+ index_msb = get_count_order(smp_num_siblings);
+ c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
- index_msb = get_count_order(smp_num_siblings);
+ smp_num_siblings = smp_num_siblings / c->x86_max_cores;
- core_bits = get_count_order(c->x86_max_cores);
+ index_msb = get_count_order(smp_num_siblings);
-#ifdef CONFIG_X86_64
- c->cpu_core_id = phys_pkg_id(index_msb) &
- ((1 << core_bits) - 1);
-#else
- c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) &
- ((1 << core_bits) - 1);
-#endif
- }
+ core_bits = get_count_order(c->x86_max_cores);
+
+ c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
+ ((1 << core_bits) - 1);
out:
if ((c->x86_max_cores * smp_num_siblings) > 1) {
@@ -443,8 +513,8 @@ out:
static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
{
char *v = c->x86_vendor_id;
- int i;
static int printed;
+ int i;
for (i = 0; i < X86_VENDOR_NUM; i++) {
if (!cpu_devs[i])
@@ -453,6 +523,7 @@ static void __cpuinit get_cpu_vendor(str
if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
(cpu_devs[i]->c_ident[1] &&
!strcmp(v, cpu_devs[i]->c_ident[1]))) {
+
this_cpu = cpu_devs[i];
c->x86_vendor = this_cpu->c_x86_vendor;
return;
@@ -461,7 +532,9 @@ static void __cpuinit get_cpu_vendor(str
if (!printed) {
printed++;
- printk(KERN_ERR "CPU: vendor_id '%s' unknown, using generic init.\n", v);
+ printk(KERN_ERR
+ "CPU: vendor_id '%s' unknown, using generic init.\n", v);
+
printk(KERN_ERR "CPU: Your system may be unstable.\n");
}
@@ -481,14 +554,17 @@ void __cpuinit cpu_detect(struct cpuinfo
/* Intel-defined flags: level 0x00000001 */
if (c->cpuid_level >= 0x00000001) {
u32 junk, tfms, cap0, misc;
+
cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
c->x86 = (tfms >> 8) & 0xf;
c->x86_model = (tfms >> 4) & 0xf;
c->x86_mask = tfms & 0xf;
+
if (c->x86 == 0xf)
c->x86 += (tfms >> 20) & 0xff;
if (c->x86 >= 0x6)
c->x86_model += ((tfms >> 16) & 0xf) << 4;
+
if (cap0 & (1<<19)) {
c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
c->x86_cache_alignment = c->x86_clflush_size;
@@ -504,6 +580,7 @@ static void __cpuinit get_cpu_cap(struct
/* Intel-defined flags: level 0x00000001 */
if (c->cpuid_level >= 0x00000001) {
u32 capability, excap;
+
cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
c->x86_capability[0] = capability;
c->x86_capability[4] = excap;
@@ -512,6 +589,7 @@ static void __cpuinit get_cpu_cap(struct
/* AMD-defined flags: level 0x80000001 */
xlvl = cpuid_eax(0x80000000);
c->extended_cpuid_level = xlvl;
+
if ((xlvl & 0xffff0000) == 0x80000000) {
if (xlvl >= 0x80000001) {
c->x86_capability[1] = cpuid_edx(0x80000001);
@@ -519,13 +597,15 @@ static void __cpuinit get_cpu_cap(struct
}
}
-#ifdef CONFIG_X86_64
if (c->extended_cpuid_level >= 0x80000008) {
u32 eax = cpuid_eax(0x80000008);
c->x86_virt_bits = (eax >> 8) & 0xff;
c->x86_phys_bits = eax & 0xff;
}
+#ifdef CONFIG_X86_32
+ else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
+ c->x86_phys_bits = 36;
#endif
if (c->extended_cpuid_level >= 0x80000007)
@@ -572,8 +652,12 @@ static void __init early_identify_cpu(st
{
#ifdef CONFIG_X86_64
c->x86_clflush_size = 64;
+ c->x86_phys_bits = 36;
+ c->x86_virt_bits = 48;
#else
c->x86_clflush_size = 32;
+ c->x86_phys_bits = 32;
+ c->x86_virt_bits = 32;
#endif
c->x86_cache_alignment = c->x86_clflush_size;
@@ -596,21 +680,20 @@ static void __init early_identify_cpu(st
if (this_cpu->c_early_init)
this_cpu->c_early_init(c);
- validate_pat_support(c);
-
#ifdef CONFIG_SMP
c->cpu_index = boot_cpu_id;
#endif
+ filter_cpuid_features(c, false);
}
void __init early_cpu_init(void)
{
- struct cpu_dev **cdev;
+ const struct cpu_dev *const *cdev;
int count = 0;
- printk("KERNEL supported cpus:\n");
+ printk(KERN_INFO "KERNEL supported cpus:\n");
for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
- struct cpu_dev *cpudev = *cdev;
+ const struct cpu_dev *cpudev = *cdev;
unsigned int j;
if (count >= X86_VENDOR_NUM)
@@ -621,7 +704,7 @@ void __init early_cpu_init(void)
for (j = 0; j < 2; j++) {
if (!cpudev->c_ident[j])
continue;
- printk(" %s %s\n", cpudev->c_vendor,
+ printk(KERN_INFO " %s %s\n", cpudev->c_vendor,
cpudev->c_ident[j]);
}
}
@@ -663,7 +746,7 @@ static void __cpuinit generic_identify(s
c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
# ifdef CONFIG_X86_HT
- c->apicid = phys_pkg_id(c->initial_apicid, 0);
+ c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
# else
c->apicid = c->initial_apicid;
# endif
@@ -697,9 +780,13 @@ static void __cpuinit identify_cpu(struc
c->x86_coreid_bits = 0;
#ifdef CONFIG_X86_64
c->x86_clflush_size = 64;
+ c->x86_phys_bits = 36;
+ c->x86_virt_bits = 48;
#else
c->cpuid_level = -1; /* CPUID not detected */
c->x86_clflush_size = 32;
+ c->x86_phys_bits = 32;
+ c->x86_virt_bits = 32;
#endif
c->x86_cache_alignment = c->x86_clflush_size;
memset(&c->x86_capability, 0, sizeof c->x86_capability);
@@ -712,7 +799,7 @@ static void __cpuinit identify_cpu(struc
this_cpu->c_identify(c);
#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
- c->apicid = phys_pkg_id(0);
+ c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
#endif
/*
@@ -732,13 +819,16 @@ static void __cpuinit identify_cpu(struc
squash_the_stupid_serial_number(c);
/*
- * The vendor-specific functions might have changed features. Now
- * we do "generic changes."
+ * The vendor-specific functions might have changed features.
+ * Now we do "generic changes."
*/
+ /* Filter out anything that depends on CPUID levels we don't have */
+ filter_cpuid_features(c, true);
+
/* If the model name is still unset, do table lookup. */
if (!c->x86_model_id[0]) {
- char *p;
+ const char *p;
p = table_lookup_model(c);
if (p)
strcpy(c->x86_model_id, p);
@@ -794,6 +884,7 @@ static void vgetcpu_set_mode(void)
void __init identify_boot_cpu(void)
{
identify_cpu(&boot_cpu_data);
+ init_c1e_mask();
#ifdef CONFIG_X86_32
sysenter_setup();
enable_sep_cpu();
@@ -813,11 +904,11 @@ void __cpuinit identify_secondary_cpu(st
}
struct msr_range {
- unsigned min;
- unsigned max;
+ unsigned min;
+ unsigned max;
};
-static struct msr_range msr_range_array[] __cpuinitdata = {
+static const struct msr_range msr_range_array[] __cpuinitconst = {
{ 0x00000000, 0x00000418},
{ 0xc0000000, 0xc000040b},
{ 0xc0010000, 0xc0010142},
@@ -826,14 +917,15 @@ static struct msr_range msr_range_array[
static void __cpuinit print_cpu_msr(void)
{
+ unsigned index_min, index_max;
unsigned index;
u64 val;
int i;
- unsigned index_min, index_max;
for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
index_min = msr_range_array[i].min;
index_max = msr_range_array[i].max;
+
for (index = index_min; index < index_max; index++) {
if (rdmsrl_amd_safe(index, &val))
continue;
@@ -843,6 +935,7 @@ static void __cpuinit print_cpu_msr(void
}
static int show_msr __cpuinitdata;
+
static __init int setup_show_msr(char *arg)
{
int num;
@@ -864,12 +957,14 @@ __setup("noclflush", setup_noclflush);
void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
{
- char *vendor = NULL;
+ const char *vendor = NULL;
- if (c->x86_vendor < X86_VENDOR_NUM)
+ if (c->x86_vendor < X86_VENDOR_NUM) {
vendor = this_cpu->c_vendor;
- else if (c->cpuid_level >= 0)
- vendor = c->x86_vendor_id;
+ } else {
+ if (c->cpuid_level >= 0)
+ vendor = c->x86_vendor_id;
+ }
if (vendor && !strstr(c->x86_model_id, vendor))
printk(KERN_CONT "%s ", vendor);
@@ -896,87 +991,57 @@ void __cpuinit print_cpu_info(struct cpu
static __init int setup_disablecpuid(char *arg)
{
int bit;
+
if (get_option(&arg, &bit) && bit < NCAPINTS*32)
setup_clear_cpu_cap(bit);
else
return 0;
+
return 1;
}
__setup("clearcpuid=", setup_disablecpuid);
#ifdef CONFIG_X86_64
-struct x8664_pda **_cpu_pda __read_mostly;
-EXPORT_SYMBOL(_cpu_pda);
-
#ifndef CONFIG_X86_NO_IDT
struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
#endif
-static char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
+DEFINE_PER_CPU_FIRST(union irq_stack_union,
+ irq_stack_union) __aligned(PAGE_SIZE);
-static void __ref switch_pt(int cpu)
+void xen_switch_pt(void)
{
#ifdef CONFIG_XEN
- if (cpu == 0)
- xen_init_pt();
xen_pt_switch(__pa_symbol(init_level4_pgt));
xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt)));
#endif
}
-void __cpuinit pda_init(int cpu)
-{
- struct x8664_pda *pda = cpu_pda(cpu);
+DEFINE_PER_CPU(char *, irq_stack_ptr) =
+ init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
- /* Setup up data that may be needed in __get_free_pages early */
- loadsegment(fs, 0);
- loadsegment(gs, 0);
-#ifndef CONFIG_XEN
- /* Memory clobbers used to order PDA accessed */
- mb();
- wrmsrl(MSR_GS_BASE, pda);
- mb();
-#else
- if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
- (unsigned long)pda))
- BUG();
-#endif
-
- pda->cpunumber = cpu;
- pda->irqcount = -1;
- pda->kernelstack = (unsigned long)stack_thread_info() -
- PDA_STACKOFFSET + THREAD_SIZE;
- pda->active_mm = &init_mm;
- pda->mmu_state = 0;
-
- if (cpu == 0) {
- /* others are initialized in smpboot.c */
- pda->pcurrent = &init_task;
- pda->irqstackptr = boot_cpu_stack;
- pda->irqstackptr += IRQSTACKSIZE - 64;
- } else {
- if (!pda->irqstackptr) {
- pda->irqstackptr = (char *)
- __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
- if (!pda->irqstackptr)
- panic("cannot allocate irqstack for cpu %d",
- cpu);
- pda->irqstackptr += IRQSTACKSIZE - 64;
- }
+DEFINE_PER_CPU(unsigned long, kernel_stack) =
+ (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
+EXPORT_PER_CPU_SYMBOL(kernel_stack);
- if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
- pda->nodenumber = cpu_to_node(cpu);
- }
-
- switch_pt(cpu);
-}
+DEFINE_PER_CPU(unsigned int, irq_count) = -1;
#ifndef CONFIG_X86_NO_TSS
-static char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
- DEBUG_STKSZ] __page_aligned_bss;
-#endif
+/*
+ * Special IST stacks which the CPU switches to when it calls
+ * an IST-marked descriptor entry. Up to 7 stacks (hardware
+ * limit), all of them are 4K, except the debug stack which
+ * is 8K.
+ */
+static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
+ [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
+ [DEBUG_STACK - 1] = DEBUG_STKSZ
+};
-extern asmlinkage void ignore_sysret(void);
+static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+ [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ])
+ __aligned(PAGE_SIZE);
+#endif
void __cpuinit syscall_init(void)
{
@@ -1020,16 +1085,38 @@ unsigned long kernel_eflags;
DEFINE_PER_CPU(struct orig_ist, orig_ist);
#endif
-#else
+#else /* CONFIG_X86_64 */
-/* Make sure %fs is initialized properly in idle threads */
+#ifdef CONFIG_CC_STACKPROTECTOR
+DEFINE_PER_CPU(unsigned long, stack_canary);
+#endif
+
+/* Make sure %fs and %gs are initialized properly in idle threads */
struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
{
memset(regs, 0, sizeof(struct pt_regs));
regs->fs = __KERNEL_PERCPU;
+ regs->gs = __KERNEL_STACK_CANARY;
+
return regs;
}
-#endif
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Clear all 6 debug registers:
+ */
+static void clear_all_debug_regs(void)
+{
+ int i;
+
+ for (i = 0; i < 8; i++) {
+ /* Ignore db4, db5 */
+ if ((i == 4) || (i == 5))
+ continue;
+
+ set_debugreg(0, i);
+ }
+}
/*
* cpu_init() initializes state that is per-CPU. Some data is already
@@ -1039,24 +1126,31 @@ struct pt_regs * __cpuinit idle_regs(str
* A lot of state is already set up in PDA init for 64 bit
*/
#ifdef CONFIG_X86_64
+
void __cpuinit cpu_init(void)
{
- int cpu = stack_smp_processor_id();
#ifndef CONFIG_X86_NO_TSS
- struct tss_struct *t = &per_cpu(init_tss, cpu);
- struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
+ struct orig_ist *orig_ist;
+ struct tss_struct *t;
unsigned long v;
- char *estacks = NULL;
int i;
#endif
struct task_struct *me;
+ int cpu;
+ cpu = stack_smp_processor_id();
/* CPU 0 is initialised in head64.c */
if (cpu != 0)
- pda_init(cpu);
+ xen_switch_pt();
#ifndef CONFIG_X86_NO_TSS
- else
- estacks = boot_exception_stacks;
+ t = &per_cpu(init_tss, cpu);
+ orig_ist = &per_cpu(orig_ist, cpu);
+#endif
+
+#ifdef CONFIG_NUMA
+ if (cpu != 0 && percpu_read(node_number) == 0 &&
+ cpu_to_node(cpu) != NUMA_NO_NODE)
+ percpu_write(node_number, cpu_to_node(cpu));
#endif
me = current;
@@ -1073,7 +1167,9 @@ void __cpuinit cpu_init(void)
* and set up the GDT descriptor:
*/
- switch_to_new_gdt();
+ switch_to_new_gdt(cpu);
+ loadsegment(fs, 0);
+
#ifndef CONFIG_X86_NO_IDT
load_idt((const struct desc_ptr *)&idt_descr);
#endif
@@ -1086,8 +1182,8 @@ void __cpuinit cpu_init(void)
barrier();
check_efer();
-#ifndef CONFIG_XEN
- if (cpu != 0 && x2apic)
+#ifdef CONFIG_X86_LOCAL_APIC
+ if (cpu != 0)
enable_x2apic();
#endif
@@ -1096,24 +1192,17 @@ void __cpuinit cpu_init(void)
* set up and load the per-CPU TSS
*/
if (!orig_ist->ist[0]) {
- static const unsigned int order[N_EXCEPTION_STACKS] = {
- [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
- [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
- };
+ char *estacks = per_cpu(exception_stacks, cpu);
+
for (v = 0; v < N_EXCEPTION_STACKS; v++) {
- if (cpu) {
- estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
- if (!estacks)
- panic("Cannot allocate exception "
- "stack %ld %d\n", v, cpu);
- }
- estacks += PAGE_SIZE << order[v];
+ estacks += exception_stack_sizes[v];
orig_ist->ist[v] = t->x86_tss.ist[v] =
(unsigned long)estacks;
}
}
t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+
/*
* <= is required because the CPU will access up to
* 8 bits beyond the end of the IO permission bitmap.
@@ -1124,8 +1213,7 @@ void __cpuinit cpu_init(void)
atomic_inc(&init_mm.mm_count);
me->active_mm = &init_mm;
- if (me->mm)
- BUG();
+ BUG_ON(me->mm);
enter_lazy_tlb(&init_mm, me);
load_sp0(t, &current->thread);
@@ -1144,22 +1232,9 @@ void __cpuinit cpu_init(void)
*/
if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
arch_kgdb_ops.correct_hw_break();
- else {
-#endif
- /*
- * Clear all 6 debug registers:
- */
-
- set_debugreg(0UL, 0);
- set_debugreg(0UL, 1);
- set_debugreg(0UL, 2);
- set_debugreg(0UL, 3);
- set_debugreg(0UL, 6);
- set_debugreg(0UL, 7);
-#ifdef CONFIG_KGDB
- /* If the kgdb is connected no debug regs should be altered. */
- }
+ else
#endif
+ clear_all_debug_regs();
fpu_init();
@@ -1171,8 +1246,10 @@ void __cpuinit cpu_init(void)
kernel_eflags &= ~X86_EFLAGS_IF;
#endif
+#ifdef CONFIG_X86_LOCAL_APIC
if (is_uv_system())
uv_cpu_init();
+#endif
}
#else
@@ -1188,7 +1265,8 @@ void __cpuinit cpu_init(void)
if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) {
printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
- for (;;) local_irq_enable();
+ for (;;)
+ local_irq_enable();
}
printk(KERN_INFO "Initializing CPU#%d\n", cpu);
@@ -1196,36 +1274,30 @@ void __cpuinit cpu_init(void)
if (cpu_has_vme || cpu_has_de)
clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
- switch_to_new_gdt();
+ switch_to_new_gdt(cpu);
/*
* Set up and load the per-CPU TSS and LDT
*/
atomic_inc(&init_mm.mm_count);
curr->active_mm = &init_mm;
- if (curr->mm)
- BUG();
+ BUG_ON(curr->mm);
enter_lazy_tlb(&init_mm, curr);
load_sp0(t, thread);
load_LDT(&init_mm.context);
+#ifndef CONFIG_X86_NO_TSS
+ t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+#endif
+
#ifdef CONFIG_DOUBLEFAULT
/* Set up doublefault TSS pointer in the GDT */
__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
#endif
- /* Clear %gs. */
- asm volatile ("mov %0, %%gs" : : "r" (0));
-
- /* Clear all 6 debug registers: */
- set_debugreg(0, 0);
- set_debugreg(0, 1);
- set_debugreg(0, 2);
- set_debugreg(0, 3);
- set_debugreg(0, 6);
- set_debugreg(0, 7);
+ clear_all_debug_regs();
/*
* Force FPU initialization:
@@ -1245,6 +1317,4 @@ void __cpuinit cpu_init(void)
xsave_init();
}
-
-
#endif
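
Of the common-xen.c changes above, filter_cpuid_features() is the subtle
one: extended CPUID leaves (0x8000000x) are negative when viewed as s32, so
they are compared unsigned against extended_cpuid_level, while basic leaves
are compared signed against cpuid_level (which is -1 when CPUID is absent).
A standalone demo of just that level test; the sample values are made up:

#include <stdint.h>
#include <stdio.h>

static int level_ok(uint32_t level, int32_t cpuid_level,
		    uint32_t extended_cpuid_level)
{
	if ((int32_t)level < 0)		/* extended leaf, e.g. 0x80000008 */
		return level <= extended_cpuid_level;
	return (int32_t)level <= cpuid_level;	/* basic leaf */
}

int main(void)
{
	/* pretend the CPU reports basic level 0xa, extended 0x80000008 */
	printf("xsave (needs 0xd): %d\n", level_ok(0xd, 0xa, 0x80000008));
	printf("mwait (needs 0x5): %d\n", level_ok(0x5, 0xa, 0x80000008));
	printf("ext 0x80000008:    %d\n",
	       level_ok(0x80000008, 0xa, 0x80000008));
	return 0;
}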
--- head-2010-05-25.orig/arch/x86/kernel/cpu/intel.c 2010-05-25 09:12:08.000000000 +0200
+++ head-2010-05-25/arch/x86/kernel/cpu/intel.c 2010-05-25 09:24:45.000000000 +0200
@@ -91,8 +91,10 @@ static void __cpuinit early_init_intel(s
if (c->x86_power & (1 << 8)) {
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+#ifndef CONFIG_XEN
if (!check_tsc_unstable())
sched_clock_stable = 1;
+#endif
}
/*
--- head-2010-05-25.orig/arch/x86/kernel/e820-xen.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/e820-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -129,19 +129,50 @@ int __init e820_all_mapped(u64 start, u6
/*
* Add a memory region to the kernel e820 map.
*/
-void __init e820_add_region(u64 start, u64 size, int type)
+static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
+ int type)
{
- int x = e820.nr_map;
+ int x = e820x->nr_map;
- if (x == ARRAY_SIZE(e820.map)) {
+ if (x == ARRAY_SIZE(e820x->map)) {
printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
return;
}
- e820.map[x].addr = start;
- e820.map[x].size = size;
- e820.map[x].type = type;
- e820.nr_map++;
+ e820x->map[x].addr = start;
+ e820x->map[x].size = size;
+ e820x->map[x].type = type;
+ e820x->nr_map++;
+}
+
+void __init e820_add_region(u64 start, u64 size, int type)
+{
+ __e820_add_region(&e820, start, size, type);
+}
+
+static void __init e820_print_type(u32 type)
+{
+ switch (type) {
+ case E820_RAM:
+ case E820_RESERVED_KERN:
+ printk(KERN_CONT "(usable)");
+ break;
+ case E820_RESERVED:
+ printk(KERN_CONT "(reserved)");
+ break;
+ case E820_ACPI:
+ printk(KERN_CONT "(ACPI data)");
+ break;
+ case E820_NVS:
+ printk(KERN_CONT "(ACPI NVS)");
+ break;
+ case E820_UNUSABLE:
+ printk(KERN_CONT "(unusable)");
+ break;
+ default:
+ printk(KERN_CONT "type %u", type);
+ break;
+ }
}
static void __init _e820_print_map(const struct e820map *e820, const char *who)
@@ -153,27 +184,8 @@ static void __init _e820_print_map(const
(unsigned long long) e820->map[i].addr,
(unsigned long long)
(e820->map[i].addr + e820->map[i].size));
- switch (e820->map[i].type) {
- case E820_RAM:
- case E820_RESERVED_KERN:
- printk(KERN_CONT "(usable)\n");
- break;
- case E820_RESERVED:
- printk(KERN_CONT "(reserved)\n");
- break;
- case E820_ACPI:
- printk(KERN_CONT "(ACPI data)\n");
- break;
- case E820_NVS:
- printk(KERN_CONT "(ACPI NVS)\n");
- break;
- case E820_UNUSABLE:
- printk("(unusable)\n");
- break;
- default:
- printk(KERN_CONT "type %u\n", e820->map[i].type);
- break;
- }
+ e820_print_type(e820->map[i].type);
+ printk(KERN_CONT "\n");
}
}
@@ -240,7 +252,7 @@ static void __init _e820_print_map(const
*/
int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
- int *pnr_map)
+ u32 *pnr_map)
{
struct change_member {
struct e820entry *pbios; /* pointer to original bios entry */
@@ -444,11 +456,12 @@ static int __init append_e820_map(struct
return __append_e820_map(biosmap, nr_map);
}
-static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
+static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
u64 size, unsigned old_type,
unsigned new_type)
{
- unsigned int i, x;
+ u64 end;
+ unsigned int i;
u64 real_updated_size = 0;
BUG_ON(old_type == new_type);
@@ -456,40 +469,59 @@ static u64 __init e820_update_range_map(
if (size > (ULLONG_MAX - start))
size = ULLONG_MAX - start;
+ end = start + size;
+ printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ",
+ (unsigned long long) start,
+ (unsigned long long) end);
+ e820_print_type(old_type);
+ printk(KERN_CONT " ==> ");
+ e820_print_type(new_type);
+ printk(KERN_CONT "\n");
+
for (i = 0; i < e820x->nr_map; i++) {
struct e820entry *ei = &e820x->map[i];
u64 final_start, final_end;
+ u64 ei_end;
+
if (ei->type != old_type)
continue;
- /* totally covered? */
- if (ei->addr >= start &&
- (ei->addr + ei->size) <= (start + size)) {
+
+ ei_end = ei->addr + ei->size;
+ /* totally covered by new range? */
+ if (ei->addr >= start && ei_end <= end) {
ei->type = new_type;
real_updated_size += ei->size;
continue;
}
+
+ /* new range is totally covered? */
+ if (ei->addr < start && ei_end > end) {
+ __e820_add_region(e820x, start, size, new_type);
+ __e820_add_region(e820x, end, ei_end - end, ei->type);
+ ei->size = start - ei->addr;
+ real_updated_size += size;
+ continue;
+ }
+
/* partially covered */
final_start = max(start, ei->addr);
- final_end = min(start + size, ei->addr + ei->size);
+ final_end = min(end, ei_end);
if (final_start >= final_end)
continue;
- x = e820x->nr_map;
- if (x == ARRAY_SIZE(e820x->map)) {
- printk(KERN_ERR "Too many memory map entries!\n");
- break;
- }
- e820x->map[x].addr = final_start;
- e820x->map[x].size = final_end - final_start;
- e820x->map[x].type = new_type;
- e820x->nr_map++;
+ __e820_add_region(e820x, final_start, final_end - final_start,
+ new_type);
real_updated_size += final_end - final_start;
+ /*
+	 * the remaining part of the range could be the head or the
+	 * tail, so update its size first.
+ */
+ ei->size -= final_end - final_start;
if (ei->addr < final_start)
continue;
ei->addr = final_end;
- ei->size -= final_end - final_start;
}
return real_updated_size;
}
@@ -497,7 +529,7 @@ static u64 __init e820_update_range_map(
u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
unsigned new_type)
{
- return e820_update_range_map(&e820, start, size, old_type, new_type);
+ return __e820_update_range(&e820, start, size, old_type, new_type);
}
static u64 __init e820_update_range_saved(u64 start, u64 size,
@@ -505,11 +537,11 @@ static u64 __init e820_update_range_save
{
#ifdef CONFIG_XEN
if (is_initial_xendomain())
- return e820_update_range_map(&machine_e820,
- phys_to_machine(start), size,
- old_type, new_type);
+ return __e820_update_range(&machine_e820,
+ phys_to_machine(start), size,
+ old_type, new_type);
#endif
- return e820_update_range_map(&e820_saved, start, size, old_type,
+ return __e820_update_range(&e820_saved, start, size, old_type,
new_type);
}
@@ -553,7 +585,7 @@ u64 __init e820_remove_range(u64 start,
void __init update_e820(void)
{
- int nr_map;
+ u32 nr_map;
nr_map = e820.nr_map;
if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
@@ -564,7 +596,7 @@ void __init update_e820(void)
}
static void __init update_e820_saved(void)
{
- int nr_map;
+ u32 nr_map;
nr_map = e820_saved.nr_map;
if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map))
@@ -916,6 +948,9 @@ void __init reserve_early_overlap_ok(u64
*/
void __init reserve_early(u64 start, u64 end, char *name)
{
+ if (start >= end)
+ return;
+
drop_overlaps_that_are_ok(start, end);
__reserve_early(start, end, name, 0);
}
@@ -1389,7 +1424,7 @@ early_param("memmap", parse_memmap_opt);
void __init finish_e820_parsing(void)
{
if (userdef) {
- int nr = e820.nr_map;
+ u32 nr = e820.nr_map;
if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
early_panic("Invalid user supplied memory map");
@@ -1479,7 +1514,7 @@ void __init e820_reserve_resources_late(
char *__init default_machine_specific_memory_setup(void)
{
char *who = "BIOS-e820";
- int new_nr;
+ u32 new_nr;
/*
* Try to copy the BIOS-supplied E820-map.
*
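
The __e820_update_range() rework above adds the case the old code missed:
when the updated range lies strictly inside one existing entry, that entry
is split in three -- the head keeps the old type, the middle is added with
the new type, and the tail is re-added with the old type. A standalone
model of that split with a simplified region table (addresses invented):

#include <stdio.h>

struct region {
	unsigned long long addr, size;
	int type;
};

static struct region map[8];
static int nr_map;

static void add_region(unsigned long long addr, unsigned long long size,
		       int type)
{
	map[nr_map].addr = addr;
	map[nr_map].size = size;
	map[nr_map].type = type;
	nr_map++;
}

int main(void)
{
	unsigned long long start = 0x2000, end = 0x3000;
	int i;

	add_region(0x1000, 0x4000, 1);	/* one entry 0x1000-0x5000, type 1 */

	/* same order as the patch: add middle and tail, then shrink head */
	add_region(start, end - start, 2);
	add_region(end, map[0].addr + map[0].size - end, map[0].type);
	map[0].size = start - map[0].addr;

	for (i = 0; i < nr_map; i++)
		printf("%#llx-%#llx type %d\n", map[i].addr,
		       map[i].addr + map[i].size, map[i].type);
	return 0;
}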
--- head-2010-05-25.orig/arch/x86/kernel/early_printk-xen.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/early_printk-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -12,8 +12,8 @@
#include <asm/fcntl.h>
#include <asm/setup.h>
#include <asm/pci-direct.h>
-#include <asm/pgtable.h>
#include <asm/fixmap.h>
+#include <asm/pgtable.h>
#include <linux/usb/ehci_def.h>
#ifndef CONFIG_XEN
@@ -279,7 +279,7 @@ static int dbgp_wait_until_complete(void
return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl);
}
-static void dbgp_mdelay(int ms)
+static void __init dbgp_mdelay(int ms)
{
int i;
@@ -340,7 +340,7 @@ static void dbgp_set_data(const void *bu
writel(hi, &ehci_debug->data47);
}
-static void dbgp_get_data(void *buf, int size)
+static void __init dbgp_get_data(void *buf, int size)
{
unsigned char *bytes = buf;
u32 lo, hi;
@@ -384,7 +384,7 @@ static int dbgp_bulk_write(unsigned devn
return ret;
}
-static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
+static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
int size)
{
u32 pids, addr, ctrl;
@@ -415,8 +415,8 @@ static int dbgp_bulk_read(unsigned devnu
return ret;
}
-static int dbgp_control_msg(unsigned devnum, int requesttype, int request,
- int value, int index, void *data, int size)
+static int __init dbgp_control_msg(unsigned devnum, int requesttype,
+ int request, int value, int index, void *data, int size)
{
u32 pids, addr, ctrl;
struct usb_ctrlrequest req;
@@ -518,7 +518,7 @@ static u32 __init find_dbgp(int ehci_num
return 0;
}
-static int ehci_reset_port(int port)
+static int __init ehci_reset_port(int port)
{
u32 portsc;
u32 delay_time, delay;
@@ -561,7 +561,7 @@ static int ehci_reset_port(int port)
return -EBUSY;
}
-static int ehci_wait_for_port(int port)
+static int __init ehci_wait_for_port(int port)
{
u32 status;
int ret, reps;
@@ -586,13 +586,13 @@ static inline void dbgp_printk(const cha
typedef void (*set_debug_port_t)(int port);
-static void default_set_debug_port(int port)
+static void __init default_set_debug_port(int port)
{
}
-static set_debug_port_t set_debug_port = default_set_debug_port;
+static set_debug_port_t __initdata set_debug_port = default_set_debug_port;
-static void nvidia_set_debug_port(int port)
+static void __init nvidia_set_debug_port(int port)
{
u32 dword;
dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
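
The early_printk hunks only add __init/__initdata annotations, moving the
EHCI debug-port probe code into the init sections so its memory can be
reclaimed once boot finishes. A minimal userspace model of how the markers
work (GCC/Clang section attributes on ELF targets; the program itself is
just an illustration, not kernel code):

#include <stdio.h>

#define __init     __attribute__((__section__(".init.text")))
#define __initdata __attribute__((__section__(".init.data")))

static int __initdata boot_only_value = 42;

static void __init boot_only_setup(void)
{
	printf("setup ran, value %d\n", boot_only_value);
}

int main(void)
{
	/* in the kernel, .init.* is freed after boot; calling an __init
	 * function later would fault, which is why the patch audits the
	 * annotations so carefully */
	boot_only_setup();
	return 0;
}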
--- head-2010-05-25.orig/arch/x86/kernel/entry_32-xen.S 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/entry_32-xen.S 2010-03-24 15:25:06.000000000 +0100
@@ -30,12 +30,13 @@
* 1C(%esp) - %ds
* 20(%esp) - %es
* 24(%esp) - %fs
- * 28(%esp) - orig_eax
- * 2C(%esp) - %eip
- * 30(%esp) - %cs
- * 34(%esp) - %eflags
- * 38(%esp) - %oldesp
- * 3C(%esp) - %oldss
+ * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS
+ * 2C(%esp) - orig_eax
+ * 30(%esp) - %eip
+ * 34(%esp) - %cs
+ * 38(%esp) - %eflags
+ * 3C(%esp) - %oldesp
+ * 40(%esp) - %oldss
*
* "current" is in register %ebx during any slow entries.
*/
@@ -46,7 +47,7 @@
#include <asm/errno.h>
#include <asm/segment.h>
#include <asm/smp.h>
-#include <asm/page.h>
+#include <asm/page_types.h>
#include <asm/desc.h>
#include <asm/percpu.h>
#include <asm/dwarf2.h>
@@ -105,121 +106,221 @@ NMI_MASK = 0x80000000
#define resume_userspace_sig resume_userspace
#endif
-#define SAVE_ALL \
- cld; \
- pushl %fs; \
- CFI_ADJUST_CFA_OFFSET 4;\
- /*CFI_REL_OFFSET fs, 0;*/\
- pushl %es; \
- CFI_ADJUST_CFA_OFFSET 4;\
- /*CFI_REL_OFFSET es, 0;*/\
- pushl %ds; \
- CFI_ADJUST_CFA_OFFSET 4;\
- /*CFI_REL_OFFSET ds, 0;*/\
- pushl %eax; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET eax, 0;\
- pushl %ebp; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET ebp, 0;\
- pushl %edi; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET edi, 0;\
- pushl %esi; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET esi, 0;\
- pushl %edx; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET edx, 0;\
- pushl %ecx; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET ecx, 0;\
- pushl %ebx; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET ebx, 0;\
- movl $(__USER_DS), %edx; \
- movl %edx, %ds; \
- movl %edx, %es; \
- movl $(__KERNEL_PERCPU), %edx; \
+/*
+ * User gs save/restore
+ *
+ * %gs is used for userland TLS; the kernel only uses it for the stack
+ * canary, which gcc requires to sit at %gs:20. Read the comment
+ * at the top of stackprotector.h for more info.
+ *
+ * Local labels 98 and 99 are used.
+ */
+#ifdef CONFIG_X86_32_LAZY_GS
+
+	/* unfortunately push/pop can't be made no-ops */
+.macro PUSH_GS
+ pushl $0
+ CFI_ADJUST_CFA_OFFSET 4
+.endm
+.macro POP_GS pop=0
+ addl $(4 + \pop), %esp
+ CFI_ADJUST_CFA_OFFSET -(4 + \pop)
+.endm
+.macro POP_GS_EX
+.endm
+
+ /* all the rest are no-op */
+.macro PTGS_TO_GS
+.endm
+.macro PTGS_TO_GS_EX
+.endm
+.macro GS_TO_REG reg
+.endm
+.macro REG_TO_PTGS reg
+.endm
+.macro SET_KERNEL_GS reg
+.endm
+
+#else /* CONFIG_X86_32_LAZY_GS */
+
+.macro PUSH_GS
+ pushl %gs
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET gs, 0*/
+.endm
+
+.macro POP_GS pop=0
+98: popl %gs
+ CFI_ADJUST_CFA_OFFSET -4
+ /*CFI_RESTORE gs*/
+ .if \pop <> 0
+ add $\pop, %esp
+ CFI_ADJUST_CFA_OFFSET -\pop
+ .endif
+.endm
+.macro POP_GS_EX
+.pushsection .fixup, "ax"
+99: movl $0, (%esp)
+ jmp 98b
+.section __ex_table, "a"
+ .align 4
+ .long 98b, 99b
+.popsection
+.endm
+
+.macro PTGS_TO_GS
+98: mov PT_GS(%esp), %gs
+.endm
+.macro PTGS_TO_GS_EX
+.pushsection .fixup, "ax"
+99: movl $0, PT_GS(%esp)
+ jmp 98b
+.section __ex_table, "a"
+ .align 4
+ .long 98b, 99b
+.popsection
+.endm
+
+.macro GS_TO_REG reg
+ movl %gs, \reg
+ /*CFI_REGISTER gs, \reg*/
+.endm
+.macro REG_TO_PTGS reg
+ movl \reg, PT_GS(%esp)
+ /*CFI_REL_OFFSET gs, PT_GS*/
+.endm
+.macro SET_KERNEL_GS reg
+ movl $(__KERNEL_STACK_CANARY), \reg
+ movl \reg, %gs
+.endm
+
+#endif /* CONFIG_X86_32_LAZY_GS */
+
+.macro SAVE_ALL
+ cld
+ PUSH_GS
+ pushl %fs
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET fs, 0;*/
+ pushl %es
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET es, 0;*/
+ pushl %ds
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET ds, 0;*/
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET eax, 0
+ pushl %ebp
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebp, 0
+ pushl %edi
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edi, 0
+ pushl %esi
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET esi, 0
+ pushl %edx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edx, 0
+ pushl %ecx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ecx, 0
+ pushl %ebx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebx, 0
+ movl $(__USER_DS), %edx
+ movl %edx, %ds
+ movl %edx, %es
+ movl $(__KERNEL_PERCPU), %edx
movl %edx, %fs
+ SET_KERNEL_GS %edx
+.endm
-#define RESTORE_INT_REGS \
- popl %ebx; \
- CFI_ADJUST_CFA_OFFSET -4;\
- CFI_RESTORE ebx;\
- popl %ecx; \
- CFI_ADJUST_CFA_OFFSET -4;\
- CFI_RESTORE ecx;\
- popl %edx; \
- CFI_ADJUST_CFA_OFFSET -4;\
- CFI_RESTORE edx;\
- popl %esi; \
- CFI_ADJUST_CFA_OFFSET -4;\
- CFI_RESTORE esi;\
- popl %edi; \
- CFI_ADJUST_CFA_OFFSET -4;\
- CFI_RESTORE edi;\
- popl %ebp; \
- CFI_ADJUST_CFA_OFFSET -4;\
- CFI_RESTORE ebp;\
- popl %eax; \
- CFI_ADJUST_CFA_OFFSET -4;\
+.macro RESTORE_INT_REGS
+ popl %ebx
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE ebx
+ popl %ecx
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE ecx
+ popl %edx
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE edx
+ popl %esi
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE esi
+ popl %edi
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE edi
+ popl %ebp
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE ebp
+ popl %eax
+ CFI_ADJUST_CFA_OFFSET -4
CFI_RESTORE eax
+.endm
-#define RESTORE_REGS \
- RESTORE_INT_REGS; \
-1: popl %ds; \
- CFI_ADJUST_CFA_OFFSET -4;\
- /*CFI_RESTORE ds;*/\
-2: popl %es; \
- CFI_ADJUST_CFA_OFFSET -4;\
- /*CFI_RESTORE es;*/\
-3: popl %fs; \
- CFI_ADJUST_CFA_OFFSET -4;\
- /*CFI_RESTORE fs;*/\
-.pushsection .fixup,"ax"; \
-4: movl $0,(%esp); \
- jmp 1b; \
-5: movl $0,(%esp); \
- jmp 2b; \
-6: movl $0,(%esp); \
- jmp 3b; \
-.section __ex_table,"a";\
- .align 4; \
- .long 1b,4b; \
- .long 2b,5b; \
- .long 3b,6b; \
+.macro RESTORE_REGS pop=0
+ RESTORE_INT_REGS
+1: popl %ds
+ CFI_ADJUST_CFA_OFFSET -4
+ /*CFI_RESTORE ds;*/
+2: popl %es
+ CFI_ADJUST_CFA_OFFSET -4
+ /*CFI_RESTORE es;*/
+3: popl %fs
+ CFI_ADJUST_CFA_OFFSET -4
+ /*CFI_RESTORE fs;*/
+ POP_GS \pop
+.pushsection .fixup, "ax"
+4: movl $0, (%esp)
+ jmp 1b
+5: movl $0, (%esp)
+ jmp 2b
+6: movl $0, (%esp)
+ jmp 3b
+.section __ex_table, "a"
+ .align 4
+ .long 1b, 4b
+ .long 2b, 5b
+ .long 3b, 6b
.popsection
+ POP_GS_EX
+.endm
-#define RING0_INT_FRAME \
- CFI_STARTPROC simple;\
- CFI_SIGNAL_FRAME;\
- CFI_DEF_CFA esp, 3*4;\
- /*CFI_OFFSET cs, -2*4;*/\
+.macro RING0_INT_FRAME
+ CFI_STARTPROC simple
+ CFI_SIGNAL_FRAME
+ CFI_DEF_CFA esp, 3*4
+ /*CFI_OFFSET cs, -2*4;*/
CFI_OFFSET eip, -3*4
+.endm
-#define RING0_EC_FRAME \
- CFI_STARTPROC simple;\
- CFI_SIGNAL_FRAME;\
- CFI_DEF_CFA esp, 4*4;\
- /*CFI_OFFSET cs, -2*4;*/\
+.macro RING0_EC_FRAME
+ CFI_STARTPROC simple
+ CFI_SIGNAL_FRAME
+ CFI_DEF_CFA esp, 4*4
+ /*CFI_OFFSET cs, -2*4;*/
CFI_OFFSET eip, -3*4
+.endm
-#define RING0_PTREGS_FRAME \
- CFI_STARTPROC simple;\
- CFI_SIGNAL_FRAME;\
- CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\
- /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\
- CFI_OFFSET eip, PT_EIP-PT_OLDESP;\
- /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\
- /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\
- CFI_OFFSET eax, PT_EAX-PT_OLDESP;\
- CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\
- CFI_OFFSET edi, PT_EDI-PT_OLDESP;\
- CFI_OFFSET esi, PT_ESI-PT_OLDESP;\
- CFI_OFFSET edx, PT_EDX-PT_OLDESP;\
- CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\
+.macro RING0_PTREGS_FRAME
+ CFI_STARTPROC simple
+ CFI_SIGNAL_FRAME
+ CFI_DEF_CFA esp, PT_OLDESP-PT_EBX
+ /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/
+ CFI_OFFSET eip, PT_EIP-PT_OLDESP
+ /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/
+ /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/
+ CFI_OFFSET eax, PT_EAX-PT_OLDESP
+ CFI_OFFSET ebp, PT_EBP-PT_OLDESP
+ CFI_OFFSET edi, PT_EDI-PT_OLDESP
+ CFI_OFFSET esi, PT_ESI-PT_OLDESP
+ CFI_OFFSET edx, PT_EDX-PT_OLDESP
+ CFI_OFFSET ecx, PT_ECX-PT_OLDESP
CFI_OFFSET ebx, PT_EBX-PT_OLDESP
+.endm
ENTRY(ret_from_fork)
CFI_STARTPROC
@@ -344,7 +445,8 @@ sysenter_past_esp:
.previous
GET_THREAD_INFO(%ebp)
- testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
+
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
jnz sysenter_audit
sysenter_do_call:
cmpl $(nr_syscalls), %eax
@@ -355,7 +457,7 @@ sysenter_do_call:
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
- testw $_TIF_ALLWORK_MASK, %cx
+ testl $_TIF_ALLWORK_MASK, %ecx
jne sysexit_audit
sysenter_exit:
/* if something modifies registers it must also disable sysexit */
@@ -364,11 +466,12 @@ sysenter_exit:
xorl %ebp,%ebp
TRACE_IRQS_ON
1: mov PT_FS(%esp), %fs
+ PTGS_TO_GS
ENABLE_INTERRUPTS_SYSEXIT
#ifdef CONFIG_AUDITSYSCALL
sysenter_audit:
- testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
jnz syscall_trace_entry
addl $4,%esp
CFI_ADJUST_CFA_OFFSET -4
@@ -385,7 +488,7 @@ sysenter_audit:
jmp sysenter_do_call
sysexit_audit:
- testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
+ testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
jne syscall_exit_work
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_ANY)
@@ -398,7 +501,7 @@ sysexit_audit:
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
- testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
+ testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
jne syscall_exit_work
movl PT_EAX(%esp),%eax /* reload syscall return value */
jmp sysenter_exit
@@ -412,6 +515,7 @@ sysexit_audit:
.align 4
.long 1b,2b
.popsection
+ PTGS_TO_GS_EX
ENDPROC(ia32_sysenter_target)
# pv sysenter call handler stub
@@ -447,7 +551,7 @@ ENTRY(system_call)
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
GET_THREAD_INFO(%ebp)
- testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
jnz syscall_trace_entry
cmpl $(nr_syscalls), %eax
jae syscall_badsys
@@ -461,7 +565,7 @@ syscall_exit:
# between sampling and the iret
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
- testw $_TIF_ALLWORK_MASK, %cx # current->work
+ testl $_TIF_ALLWORK_MASK, %ecx # current->work
jne syscall_exit_work
restore_all:
@@ -492,8 +596,7 @@ restore_nocheck:
#endif
TRACE_IRQS_IRET
restore_nocheck_notrace:
- RESTORE_REGS
- addl $4, %esp # skip orig_eax/error_code
+ RESTORE_REGS 4 # skip orig_eax/error_code
CFI_ADJUST_CFA_OFFSET -4
irq_return:
INTERRUPT_RETURN
@@ -555,9 +658,7 @@ restore_all_enable_events:
scrit: /**** START OF CRITICAL REGION ****/
__TEST_PENDING
jnz 14f # process more events if necessary...
- RESTORE_REGS
- addl $4, %esp
- CFI_ADJUST_CFA_OFFSET -4
+ RESTORE_REGS 4
1: INTERRUPT_RETURN
.section __ex_table,"a"
.align 4
@@ -571,9 +672,7 @@ ecrit: /**** END OF CRITICAL REGION ***
CFI_RESTORE_STATE
hypervisor_iret:
andl $~NMI_MASK, PT_EFLAGS(%esp)
- RESTORE_REGS
- addl $4, %esp
- CFI_ADJUST_CFA_OFFSET -4
+ RESTORE_REGS 4
jmp hypercall_page + (__HYPERVISOR_iret * 32)
#endif
CFI_ENDPROC
@@ -641,7 +740,7 @@ END(syscall_trace_entry)
# perform syscall exit tracing
ALIGN
syscall_exit_work:
- testb $_TIF_WORK_SYSCALL_EXIT, %cl
+ testl $_TIF_WORK_SYSCALL_EXIT, %ecx
jz work_pending
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
@@ -665,29 +764,51 @@ syscall_badsys:
END(syscall_badsys)
CFI_ENDPROC
+/*
+ * System calls that need a pt_regs pointer.
+ */
+#define PTREGSCALL(name) \
+ ALIGN; \
+ptregs_##name: \
+ leal 4(%esp),%eax; \
+ jmp sys_##name;
+
+PTREGSCALL(iopl)
+PTREGSCALL(fork)
+PTREGSCALL(clone)
+PTREGSCALL(vfork)
+PTREGSCALL(execve)
+PTREGSCALL(sigaltstack)
+PTREGSCALL(sigreturn)
+PTREGSCALL(rt_sigreturn)
+PTREGSCALL(vm86)
+PTREGSCALL(vm86old)
+
#ifndef CONFIG_XEN
-#define FIXUP_ESPFIX_STACK \
- /* since we are on a wrong stack, we cant make it a C code :( */ \
- PER_CPU(gdt_page, %ebx); \
- GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
- addl %esp, %eax; \
- pushl $__KERNEL_DS; \
- CFI_ADJUST_CFA_OFFSET 4; \
- pushl %eax; \
- CFI_ADJUST_CFA_OFFSET 4; \
- lss (%esp), %esp; \
- CFI_ADJUST_CFA_OFFSET -8;
-#define UNWIND_ESPFIX_STACK \
- movl %ss, %eax; \
- /* see if on espfix stack */ \
- cmpw $__ESPFIX_SS, %ax; \
- jne 27f; \
- movl $__KERNEL_DS, %eax; \
- movl %eax, %ds; \
- movl %eax, %es; \
- /* switch to normal stack */ \
- FIXUP_ESPFIX_STACK; \
-27:;
+.macro FIXUP_ESPFIX_STACK
+	/* since we are on the wrong stack, we can't make this C code :( */
+ PER_CPU(gdt_page, %ebx)
+ GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
+ addl %esp, %eax
+ pushl $__KERNEL_DS
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ lss (%esp), %esp
+ CFI_ADJUST_CFA_OFFSET -8
+.endm
+.macro UNWIND_ESPFIX_STACK
+ movl %ss, %eax
+ /* see if on espfix stack */
+ cmpw $__ESPFIX_SS, %ax
+ jne 27f
+ movl $__KERNEL_DS, %eax
+ movl %eax, %ds
+ movl %eax, %es
+ /* switch to normal stack */
+ FIXUP_ESPFIX_STACK
+27:
+.endm
/*
* Build the entry stubs and pointer table with some assembler magic.
@@ -743,7 +864,7 @@ common_interrupt:
ENDPROC(common_interrupt)
CFI_ENDPROC
-#define BUILD_INTERRUPT(name, nr) \
+#define BUILD_INTERRUPT3(name, nr, fn) \
ENTRY(name) \
RING0_INT_FRAME; \
pushl $~(nr); \
@@ -751,13 +872,15 @@ ENTRY(name) \
SAVE_ALL; \
TRACE_IRQS_OFF \
movl %esp,%eax; \
- call smp_##name; \
+ call fn; \
jmp ret_from_intr; \
CFI_ENDPROC; \
ENDPROC(name)
+#define BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(name, nr, smp_##name)
+
/* The include is where all of the SMP etc. interrupts come from */
-#include "entry_arch.h"
+#include <asm/entry_arch.h>
#else
#define UNWIND_ESPFIX_STACK
@@ -844,8 +967,13 @@ critical_fixup_table:
.byte 7 # pop %ds
.byte 8 # pop %es
.byte 9,9 # pop %fs
- .byte 10,10,10 # add $4,%esp
- .byte 11 # iret
+#ifndef CONFIG_X86_32_LAZY_GS
+ .byte 10,10 # pop %gs
+ .byte 11,11,11 # add $4,%esp
+#else
+ .byte 10,10,10 # add $8,%esp
+#endif
+ .byte 12 # iret
.byte -1,-1,-1,-1 # movb $1,1(%esi) = __DISABLE_INTERRUPTS
.previous
@@ -1203,7 +1331,7 @@ ENTRY(ia32pv_cstar_target)
.previous
SAVE_ALL
GET_THREAD_INFO(%ebp)
- testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
jnz cstar_trace_entry
cmpl $nr_syscalls,%eax
jae cstar_badsys
@@ -1323,7 +1451,10 @@ ENTRY(page_fault)
CFI_ADJUST_CFA_OFFSET 4
ALIGN
error_code:
- /* the function address is in %fs's slot on the stack */
+ /* the function address is in %gs's slot on the stack */
+ pushl %fs
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET fs, 0*/
pushl %es
CFI_ADJUST_CFA_OFFSET 4
/*CFI_REL_OFFSET es, 0*/
@@ -1352,20 +1483,15 @@ error_code:
CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET ebx, 0
cld
- pushl %fs
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET fs, 0*/
movl $(__KERNEL_PERCPU), %ecx
movl %ecx, %fs
UNWIND_ESPFIX_STACK
- popl %ecx
- CFI_ADJUST_CFA_OFFSET -4
- /*CFI_REGISTER es, ecx*/
- movl PT_FS(%esp), %edi # get the function address
+ GS_TO_REG %ecx
+ movl PT_GS(%esp), %edi # get the function address
movl PT_ORIG_EAX(%esp), %edx # get the error code
movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
- mov %ecx, PT_FS(%esp)
- /*CFI_REL_OFFSET fs, ES*/
+ REG_TO_PTGS %ecx
+ SET_KERNEL_GS %ecx
movl $(__USER_DS), %ecx
movl %ecx, %ds
movl %ecx, %es
@@ -1390,20 +1516,21 @@ END(page_fault)
* by hand onto the new stack - while updating the return eip past
* the instruction that would have done it for sysenter.
*/
-#define FIX_STACK(offset, ok, label) \
- cmpw $__KERNEL_CS,4(%esp); \
- jne ok; \
-label: \
- movl TSS_sysenter_sp0+offset(%esp),%esp; \
- CFI_DEF_CFA esp, 0; \
- CFI_UNDEFINED eip; \
- pushfl; \
- CFI_ADJUST_CFA_OFFSET 4; \
- pushl $__KERNEL_CS; \
- CFI_ADJUST_CFA_OFFSET 4; \
- pushl $sysenter_past_esp; \
- CFI_ADJUST_CFA_OFFSET 4; \
+.macro FIX_STACK offset ok label
+ cmpw $__KERNEL_CS, 4(%esp)
+ jne \ok
+\label:
+ movl TSS_sysenter_sp0 + \offset(%esp), %esp
+ CFI_DEF_CFA esp, 0
+ CFI_UNDEFINED eip
+ pushfl
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl $__KERNEL_CS
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl $sysenter_past_esp
+ CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET eip, 0
+.endm
#endif /* CONFIG_XEN */
ENTRY(debug)
@@ -1411,7 +1538,7 @@ ENTRY(debug)
#ifndef CONFIG_XEN
cmpl $ia32_sysenter_target,(%esp)
jne debug_stack_correct
- FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
+ FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
debug_stack_correct:
#endif /* !CONFIG_XEN */
pushl $-1 # mark this as an int
@@ -1471,7 +1598,7 @@ nmi_stack_correct:
nmi_stack_fixup:
RING0_INT_FRAME
- FIX_STACK(12,nmi_stack_correct, 1)
+ FIX_STACK 12, nmi_stack_correct, 1
jmp nmi_stack_correct
nmi_debug_stack_check:
@@ -1482,7 +1609,7 @@ nmi_debug_stack_check:
jb nmi_stack_correct
cmpl $debug_esp_fix_insn,(%esp)
ja nmi_stack_correct
- FIX_STACK(24,nmi_stack_correct, 1)
+ FIX_STACK 24, nmi_stack_correct, 1
jmp nmi_stack_correct
nmi_espfix_stack:
@@ -1494,7 +1621,7 @@ nmi_espfix_stack:
CFI_ADJUST_CFA_OFFSET 4
pushl %esp
CFI_ADJUST_CFA_OFFSET 4
- addw $4, (%esp)
+ addl $4, (%esp)
/* copy the iret frame of 12 bytes */
.rept 3
pushl 16(%esp)
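Across the 32-bit entry-code hunks above, every testw $mask,%cx becomes testl $mask,%ecx. The widening (mirroring the mainline 2.6.30 change this patch tracks) matters because the thread-info work masks no longer fit in the low 16 bits, so a 16-bit test silently ignores any flag above bit 15. A standalone C sketch of the failure mode, using a hypothetical flag bit:

    /* Why testw/%cx had to become testl/%ecx: a 16-bit test sees only
     * bits 0-15 of the thread-info flags.  Userspace sketch; the flag
     * bit below is hypothetical. */
    #include <stdio.h>
    #include <stdint.h>

    #define TIF_HIGH_WORK (1u << 27)   /* hypothetical work flag above bit 15 */

    int main(void)
    {
        uint32_t ti_flags = TIF_HIGH_WORK;

        /* what "testw $mask, %cx" effectively computes */
        int seen16 = ((uint16_t)ti_flags & (uint16_t)TIF_HIGH_WORK) != 0;
        /* what "testl $mask, %ecx" computes */
        int seen32 = (ti_flags & TIF_HIGH_WORK) != 0;

        printf("16-bit test sees the flag: %d\n", seen16); /* 0 - missed */
        printf("32-bit test sees the flag: %d\n", seen32); /* 1 - caught */
        return 0;
    }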
--- head-2010-05-25.orig/arch/x86/kernel/entry_64-xen.S 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/entry_64-xen.S 2010-03-24 15:25:06.000000000 +0100
@@ -51,10 +51,10 @@
#include <asm/unistd.h>
#include <asm/thread_info.h>
#include <asm/hw_irq.h>
-#include <asm/page.h>
+#include <asm/page_types.h>
#include <asm/irqflags.h>
#include <asm/ftrace.h>
-#include <asm/errno.h>
+#include <asm/percpu.h>
#include <xen/interface/xen.h>
#include <xen/interface/features.h>
@@ -81,20 +81,17 @@ ENTRY(ftrace_caller)
movq 8(%rbp), %rsi
subq $MCOUNT_INSN_SIZE, %rdi
-.globl ftrace_call
-ftrace_call:
+GLOBAL(ftrace_call)
call ftrace_stub
MCOUNT_RESTORE_FRAME
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-.globl ftrace_graph_call
-ftrace_graph_call:
+GLOBAL(ftrace_graph_call)
jmp ftrace_stub
#endif
-.globl ftrace_stub
-ftrace_stub:
+GLOBAL(ftrace_stub)
retq
END(ftrace_caller)
@@ -114,8 +111,7 @@ ENTRY(mcount)
jnz ftrace_graph_caller
#endif
-.globl ftrace_stub
-ftrace_stub:
+GLOBAL(ftrace_stub)
retq
trace:
@@ -152,9 +148,7 @@ ENTRY(ftrace_graph_caller)
retq
END(ftrace_graph_caller)
-
-.globl return_to_handler
-return_to_handler:
+GLOBAL(return_to_handler)
subq $80, %rsp
movq %rax, (%rsp)
@@ -369,15 +363,15 @@ ENTRY(save_args)
je 1f
SWAPGS
/*
- * irqcount is used to check if a CPU is already on an interrupt stack
+ * irq_count is used to check if a CPU is already on an interrupt stack
* or not. While this is essentially redundant with preempt_count it is
* a little cheaper to use a separate counter in the PDA (short of
* moving irq_enter into assembly, which would be too much work)
*/
-1: incl %gs:pda_irqcount
+1: incl PER_CPU_VAR(irq_count)
jne 2f
popq_cfi %rax /* move return address... */
- mov %gs:pda_irqstackptr,%rsp
+ mov PER_CPU_VAR(irq_stack_ptr),%rsp
EMPTY_FRAME 0
pushq_cfi %rbp /* backlink for unwinder */
pushq_cfi %rax /* ... to the new stack */
@@ -407,6 +401,7 @@ END(save_rest)
#ifndef CONFIG_XEN
/* save complete stack frame */
+ .pushsection .kprobes.text, "ax"
ENTRY(save_paranoid)
XCPT_FRAME 1 RDI+8
cld
@@ -435,6 +430,7 @@ ENTRY(save_paranoid)
1: ret
CFI_ENDPROC
END(save_paranoid)
+ .popsection
#endif
/*
@@ -445,6 +441,8 @@ END(save_paranoid)
ENTRY(ret_from_fork)
DEFAULT_FRAME
+ LOCK ; btr $TIF_FORK,TI_flags(%r8)
+
push kernel_eflags(%rip)
CFI_ADJUST_CFA_OFFSET 8
popf # reset kernel eflags
@@ -454,7 +452,6 @@ ENTRY(ret_from_fork)
GET_THREAD_INFO(%rcx)
- CFI_REMEMBER_STATE
RESTORE_REST
testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
@@ -466,7 +463,6 @@ ENTRY(ret_from_fork)
RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
jmp ret_from_sys_call # go to the SYSRET fastpath
- CFI_RESTORE_STATE
CFI_ENDPROC
END(ret_from_fork)
@@ -642,9 +638,7 @@ tracesys:
* Syscall return path ending with IRET.
* Has correct top of stack, but partial stack frame.
*/
- .globl int_ret_from_sys_call
- .globl int_with_check
-int_ret_from_sys_call:
+GLOBAL(int_ret_from_sys_call)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
testb $3,CS-ARGOFFSET(%rsp)
@@ -655,7 +649,7 @@ int_ret_from_sys_call:
1:
movl $_TIF_ALLWORK_MASK,%edi
/* edi: mask to check */
-int_with_check:
+GLOBAL(int_with_check)
LOCKDEP_SYS_EXIT_IRQ
GET_THREAD_INFO(%rcx)
movl TI_flags(%rcx),%edx
@@ -877,10 +871,14 @@ apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
#endif
+#ifdef CONFIG_X86_UV
apicinterrupt UV_BAU_MESSAGE \
uv_bau_message_intr1 uv_bau_message_interrupt
+#endif
apicinterrupt LOCAL_TIMER_VECTOR \
apic_timer_interrupt smp_apic_timer_interrupt
+apicinterrupt GENERIC_INTERRUPT_VECTOR \
+ generic_interrupt smp_generic_interrupt
#ifdef CONFIG_SMP
apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
@@ -998,15 +996,15 @@ ENTRY(do_hypervisor_callback) # do_hyp
movq %rdi, %rsp # we don't return, adjust the stack frame
CFI_ENDPROC
DEFAULT_FRAME
-11: incl %gs:pda_irqcount
+11: incl PER_CPU_VAR(irq_count)
movq %rsp,%rbp
CFI_DEF_CFA_REGISTER rbp
- cmovzq %gs:pda_irqstackptr,%rsp
+ cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
pushq %rbp # backlink for old unwinder
call evtchn_do_upcall
popq %rsp
CFI_DEF_CFA_REGISTER rsp
- decl %gs:pda_irqcount
+ decl PER_CPU_VAR(irq_count)
jmp error_exit
CFI_ENDPROC
END(do_hypervisor_callback)
@@ -1197,14 +1195,14 @@ ENTRY(call_softirq)
CFI_REL_OFFSET rbp,0
mov %rsp,%rbp
CFI_DEF_CFA_REGISTER rbp
- incl %gs:pda_irqcount
- cmove %gs:pda_irqstackptr,%rsp
+ incl PER_CPU_VAR(irq_count)
+ cmove PER_CPU_VAR(irq_stack_ptr),%rsp
push %rbp # backlink for old unwinder
call __do_softirq
leaveq
CFI_DEF_CFA_REGISTER rsp
CFI_ADJUST_CFA_OFFSET -8
- decl %gs:pda_irqcount
+ decl PER_CPU_VAR(irq_count)
ret
CFI_ENDPROC
END(call_softirq)
@@ -1250,7 +1248,10 @@ ENTRY(paranoid_exit)
paranoid_swapgs:
TRACE_IRQS_IRETQ 0
SWAPGS_UNSAFE_STACK
+ RESTORE_ALL 8
+ jmp irq_return
paranoid_restore:
+ TRACE_IRQS_IRETQ 0
RESTORE_ALL 8
jmp irq_return
paranoid_userspace:
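The entry_64 hunks above swap the old PDA fields (%gs:pda_irqcount, %gs:pda_irqstackptr) for plain per-cpu variables accessed through PER_CPU_VAR(), following the mainline removal of struct x8664_pda. A userspace emulation of the before/after data layout, with illustrative names standing in for the kernel's per-cpu machinery:

    /* PDA field vs. flat per-cpu variable: same counter, different
     * addressing.  NR_CPUS, the array and the macro are stand-ins. */
    #include <stdio.h>

    #define NR_CPUS 4

    struct pda { int irqcount; };              /* old: one struct per CPU */
    static struct pda cpu_pda[NR_CPUS];

    static int irq_count[NR_CPUS];             /* new: per-cpu variable */
    #define PER_CPU(var, cpu) ((var)[(cpu)])

    int main(void)
    {
        int cpu = 1;

        cpu_pda[cpu].irqcount++;               /* was: incl %gs:pda_irqcount */
        PER_CPU(irq_count, cpu)++;             /* now: incl PER_CPU_VAR(irq_count) */

        printf("pda=%d percpu=%d\n",
               cpu_pda[cpu].irqcount, PER_CPU(irq_count, cpu));
        return 0;
    }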
--- head-2010-05-25.orig/arch/x86/kernel/head-xen.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/head-xen.c 2010-04-28 17:07:13.000000000 +0200
@@ -2,6 +2,7 @@
#include <linux/init.h>
#include <asm/setup.h>
+#ifndef CONFIG_XEN
#include <asm/bios_ebda.h>
#define BIOS_LOWMEM_KILOBYTES 0x413
@@ -18,7 +19,6 @@
*/
void __init reserve_ebda_region(void)
{
-#ifndef CONFIG_XEN
unsigned int lowmem, ebda_addr;
/* To determine the position of the EBDA and the */
@@ -53,5 +53,174 @@ void __init reserve_ebda_region(void)
/* reserve all memory between lowmem and the 1MB mark */
reserve_early_overlap_ok(lowmem, 0x100000, "BIOS reserved");
+}
+#else /* CONFIG_XEN */
+#include <linux/module.h>
+#include <asm/fixmap.h>
+#include <asm/pgtable.h>
+#include <asm/sections.h>
+#include <asm/setup_arch.h>
+#include <xen/interface/callback.h>
+#include <xen/interface/memory.h>
+
+extern void hypervisor_callback(void);
+extern void failsafe_callback(void);
+extern void nmi(void);
+
+#ifdef CONFIG_X86_64
+#include <asm/proto.h>
+#define CALLBACK_ADDR(fn) ((unsigned long)(fn))
+#else
+#define CALLBACK_ADDR(fn) { __KERNEL_CS, (unsigned long)(fn) }
+#endif
+
+unsigned long *__read_mostly machine_to_phys_mapping =
+ (void *)MACH2PHYS_VIRT_START;
+EXPORT_SYMBOL(machine_to_phys_mapping);
+unsigned int __read_mostly machine_to_phys_order;
+EXPORT_SYMBOL(machine_to_phys_order);
+
+void __init xen_start_kernel(void)
+{
+ unsigned int i;
+ struct xen_machphys_mapping mapping;
+ unsigned long machine_to_phys_nr_ents;
+#ifdef CONFIG_X86_32
+ struct xen_platform_parameters pp;
+ extern pte_t swapper_pg_fixmap[PTRS_PER_PTE];
+ unsigned long addr;
+#endif
+
+ xen_setup_features();
+
+ if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
+ machine_to_phys_mapping = (unsigned long *)mapping.v_start;
+ machine_to_phys_nr_ents = mapping.max_mfn + 1;
+ } else
+ machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
+ while ((1UL << machine_to_phys_order) < machine_to_phys_nr_ents )
+ machine_to_phys_order++;
+
+ if (!xen_feature(XENFEAT_auto_translated_physmap))
+ phys_to_machine_mapping =
+ (unsigned long *)xen_start_info->mfn_list;
+
+ WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
+ VMASST_TYPE_writable_pagetables));
+
+ reserve_early(ALIGN(__pa_symbol(&_end), PAGE_SIZE),
+ __pa(xen_start_info->pt_base)
+ + (xen_start_info->nr_pt_frames << PAGE_SHIFT),
+ "Xen provided");
+
+#ifdef CONFIG_X86_32
+ WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
+ VMASST_TYPE_4gb_segments));
+
+ init_mm.pgd = swapper_pg_dir = (pgd_t *)xen_start_info->pt_base;
+
+ if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) {
+ hypervisor_virt_start = pp.virt_start;
+ reserve_top_address(0UL - pp.virt_start);
+ }
+
+ BUG_ON(pte_index(hypervisor_virt_start));
+
+ /* Do an early initialization of the fixmap area */
+ make_lowmem_page_readonly(swapper_pg_fixmap, XENFEAT_writable_page_tables);
+ addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
+ set_pmd(pmd_offset(pud_offset(swapper_pg_dir + pgd_index(addr),
+ addr),
+ addr),
+ __pmd(__pa_symbol(swapper_pg_fixmap) | _PAGE_TABLE));
+#else
+ check_efer();
+ xen_init_pt();
+#endif
+
+#define __FIXADDR_TOP (-PAGE_SIZE)
+#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
+#define FIX_BUG_ON(fix) BUILD_BUG_ON(pmd_index(__fix_to_virt(FIX_##fix)) \
+ != pmd_index(__fix_to_virt(FIX_EARLYCON_MEM_BASE)))
+ FIX_BUG_ON(SHARED_INFO);
+ FIX_BUG_ON(ISAMAP_BEGIN);
+ FIX_BUG_ON(ISAMAP_END);
+#undef pmd_index
+#undef __FIXADDR_TOP
+
+ /* Switch to the real shared_info page, and clear the dummy page. */
+ set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
+ HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
+ clear_page(empty_zero_page);
+
+ /* Set up mapping of lowest 1MB of physical memory. */
+ for (i = 0; i < NR_FIX_ISAMAPS; i++)
+ if (is_initial_xendomain())
+ set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
+ else
+ __set_fixmap(FIX_ISAMAP_BEGIN - i,
+ virt_to_machine(empty_zero_page),
+ PAGE_KERNEL_RO);
+
+}
+
+void __init machine_specific_arch_setup(void)
+{
+ int ret;
+ static const struct callback_register __initconst event = {
+ .type = CALLBACKTYPE_event,
+ .address = CALLBACK_ADDR(hypervisor_callback)
+ };
+ static const struct callback_register __initconst failsafe = {
+ .type = CALLBACKTYPE_failsafe,
+ .address = CALLBACK_ADDR(failsafe_callback)
+ };
+#ifdef CONFIG_X86_64
+ static const struct callback_register __initconst syscall = {
+ .type = CALLBACKTYPE_syscall,
+ .address = CALLBACK_ADDR(system_call)
+ };
+#endif
+#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_X86_32)
+ static const struct callback_register __initconst nmi_cb = {
+ .type = CALLBACKTYPE_nmi,
+ .address = CALLBACK_ADDR(nmi)
+ };
+#endif
+
+ ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
+ if (ret == 0)
+ ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
+#ifdef CONFIG_X86_64
+ if (ret == 0)
+ ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
+#endif
+#if CONFIG_XEN_COMPAT <= 0x030002
+#ifdef CONFIG_X86_32
+ if (ret == -ENOSYS)
+ ret = HYPERVISOR_set_callbacks(
+ event.address.cs, event.address.eip,
+ failsafe.address.cs, failsafe.address.eip);
+#else
+ ret = HYPERVISOR_set_callbacks(
+ event.address,
+ failsafe.address,
+ syscall.address);
+#endif
+#endif
+ BUG_ON(ret);
+
+#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_X86_32)
+ ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (ret == -ENOSYS) {
+ static struct xennmi_callback __initdata cb = {
+ .handler_address = (unsigned long)nmi
+ };
+
+ HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
+ }
+#endif
#endif
}
+#endif /* CONFIG_XEN */
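The new xen_start_kernel() above sizes machine_to_phys_order with a one-line loop: the smallest order such that 1UL << order covers machine_to_phys_nr_ents, i.e. a ceiling log2. The same loop, checked standalone against a few values:

    /* Ceiling-log2 via the exact loop used for machine_to_phys_order. */
    #include <stdio.h>

    static unsigned int m2p_order(unsigned long nr_ents)
    {
        unsigned int order = 0;

        while ((1UL << order) < nr_ents)
            order++;
        return order;
    }

    int main(void)
    {
        unsigned long tests[] = { 1, 2, 3, 1024, 1025 };
        unsigned int i;

        for (i = 0; i < sizeof(tests) / sizeof(tests[0]); i++)
            printf("nr_ents=%lu -> order=%u\n", tests[i], m2p_order(tests[i]));
        return 0;   /* expect orders 0, 1, 2, 10, 11 */
    }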
--- head-2010-05-25.orig/arch/x86/kernel/head32-xen.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/head32-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -9,6 +9,7 @@
#include <linux/start_kernel.h>
#include <asm/setup.h>
+#include <asm/setup_arch.h>
#include <asm/sections.h>
#include <asm/e820.h>
#include <asm/bios_ebda.h>
@@ -18,7 +19,7 @@ void __init i386_start_kernel(void)
{
reserve_trampoline_memory();
- reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
+ reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
#ifndef CONFIG_XEN
#ifdef CONFIG_BLK_DEV_INITRD
@@ -30,14 +31,8 @@ void __init i386_start_kernel(void)
reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
}
#endif
- reserve_early(init_pg_tables_start, init_pg_tables_end,
- "INIT_PG_TABLE");
+ reserve_ebda_region();
#else
- reserve_early(ALIGN(__pa_symbol(&_end), PAGE_SIZE),
- __pa(xen_start_info->pt_base)
- + (xen_start_info->nr_pt_frames << PAGE_SHIFT),
- "Xen provided");
-
{
int max_cmdline;
@@ -46,9 +41,9 @@ void __init i386_start_kernel(void)
memcpy(boot_command_line, xen_start_info->cmd_line, max_cmdline);
boot_command_line[max_cmdline-1] = '\0';
}
-#endif
- reserve_ebda_region();
+ xen_start_kernel();
+#endif
/*
* At this point everything still needed from the boot loader
--- head-2010-05-25.orig/arch/x86/kernel/head64-xen.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/head64-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -7,9 +7,6 @@
* Modified for Xen.
*/
-/* PDA is not ready to be used until the end of x86_64_start_kernel(). */
-#define arch_use_lazy_mmu_mode() false
-
#include <linux/init.h>
#include <linux/linkage.h>
#include <linux/types.h>
@@ -18,12 +15,12 @@
#include <linux/percpu.h>
#include <linux/start_kernel.h>
#include <linux/io.h>
-#include <linux/module.h>
#include <asm/processor.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/setup.h>
+#include <asm/setup_arch.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
@@ -33,27 +30,6 @@
#include <asm/bios_ebda.h>
#include <asm/trampoline.h>
-/* boot cpu pda */
-static struct x8664_pda _boot_cpu_pda;
-
-#ifdef CONFIG_SMP
-/*
- * We install an empty cpu_pda pointer table to indicate to early users
- * (numa_set_node) that the cpu_pda pointer table for cpus other than
- * the boot cpu is not yet setup.
- */
-static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
-#else
-static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
-#endif
-
-void __init x86_64_init_pda(void)
-{
- _cpu_pda = __cpu_pda;
- cpu_pda(0) = &_boot_cpu_pda;
- pda_init(0);
-}
-
#ifndef CONFIG_XEN
static void __init zap_identity_mappings(void)
{
@@ -92,16 +68,9 @@ static void __init copy_bootdata(char *r
}
#include <xen/interface/memory.h>
-unsigned long *machine_to_phys_mapping;
-EXPORT_SYMBOL(machine_to_phys_mapping);
-unsigned int machine_to_phys_order;
-EXPORT_SYMBOL(machine_to_phys_order);
void __init x86_64_start_kernel(char * real_mode_data)
{
- struct xen_machphys_mapping mapping;
- unsigned long machine_to_phys_nr_ents;
-
/*
* Build-time sanity checks on the kernel image and module
* area mappings. (these are purely build-time and produce no code)
@@ -116,21 +85,8 @@ void __init x86_64_start_kernel(char * r
(__START_KERNEL & PGDIR_MASK)));
BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
- xen_setup_features();
-
xen_start_info = (struct start_info *)real_mode_data;
- if (!xen_feature(XENFEAT_auto_translated_physmap))
- phys_to_machine_mapping =
- (unsigned long *)xen_start_info->mfn_list;
-
- machine_to_phys_mapping = (unsigned long *)MACH2PHYS_VIRT_START;
- machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
- if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
- machine_to_phys_mapping = (unsigned long *)mapping.v_start;
- machine_to_phys_nr_ents = mapping.max_mfn + 1;
- }
- while ((1UL << machine_to_phys_order) < machine_to_phys_nr_ents )
- machine_to_phys_order++;
+ xen_start_kernel();
#ifndef CONFIG_XEN
/* clear bss before set_intr_gate with early_idt_handler */
@@ -155,7 +111,7 @@ void __init x86_64_start_kernel(char * r
if (console_loglevel == 10)
early_printk("Kernel alive\n");
- x86_64_init_pda();
+ xen_switch_pt();
x86_64_start_reservations(real_mode_data);
}
@@ -166,12 +122,7 @@ void __init x86_64_start_reservations(ch
reserve_trampoline_memory();
- reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
-
- reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE),
- __pa(xen_start_info->pt_base)
- + (xen_start_info->nr_pt_frames << PAGE_SHIFT),
- "Xen provided");
+ reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
/*
* At this point everything still needed from the boot loader
--- head-2010-05-25.orig/arch/x86/kernel/head_32-xen.S 2010-03-24 15:12:36.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/head_32-xen.S 2010-03-24 15:25:06.000000000 +0100
@@ -6,12 +6,14 @@
#include <linux/init.h>
#include <linux/linkage.h>
#include <asm/segment.h>
-#include <asm/page.h>
+#include <asm/page_types.h>
+#include <asm/pgtable_types.h>
#include <asm/cache.h>
#include <asm/thread_info.h>
#include <asm/asm-offsets.h>
#include <asm/boot.h>
#include <asm/dwarf2.h>
+#include <asm/percpu.h>
#include <xen/interface/xen.h>
#include <xen/interface/elfnote.h>
@@ -38,9 +40,6 @@ ENTRY(startup_32)
/* Set up the stack pointer */
movl $(init_thread_union+THREAD_SIZE),%esp
- movl %ss,%eax
- movl %eax,%fs # gets reset once there's real percpu
-
/* get vendor info */
xorl %eax,%eax # call CPUID with 0 -> return vendor ID
XEN_CPUID
@@ -63,7 +62,49 @@ ENTRY(startup_32)
movb $1,X86_HARD_MATH
- xorl %eax,%eax # Clear GS
+#ifdef CONFIG_CC_STACKPROTECTOR
+ /*
+ * The linker can't handle this by relocation. Manually set
+ * base address in stack canary segment descriptor.
+ */
+ movl $per_cpu__gdt_page,%eax
+ movl $per_cpu__stack_canary,%ecx
+ subl $20, %ecx
+ movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
+ shrl $16, %ecx
+ movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
+ movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax)
+#endif
+
+ # %esi still points to start_info, and no registers
+ # need to be preserved.
+
+ movl XEN_START_mfn_list(%esi), %ebx
+ movl $(per_cpu__gdt_page - __PAGE_OFFSET), %eax
+ shrl $PAGE_SHIFT, %eax
+ movl (%ebx,%eax,4), %ecx
+ pushl %ecx # frame number for set_gdt below
+
+ xorl %esi, %esi
+ xorl %edx, %edx
+ shldl $PAGE_SHIFT, %ecx, %edx
+ shll $PAGE_SHIFT, %ecx
+ orl $_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_DIRTY, %ecx
+ movl $per_cpu__gdt_page, %ebx
+ movl $__HYPERVISOR_update_va_mapping, %eax
+ int $0x82
+
+ movl $(PAGE_SIZE_asm / 8), %ecx
+ movl %esp, %ebx
+ movl $__HYPERVISOR_set_gdt, %eax
+ int $0x82
+
+ popl %ecx
+
+ movl $(__KERNEL_PERCPU), %eax
+ movl %eax,%fs # set this cpu's percpu
+
+ movl $(__KERNEL_STACK_CANARY),%eax
movl %eax,%gs
cld # gcc2 wants the direction flag cleared at all times
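The CONFIG_CC_STACKPROTECTOR block above cannot lean on the linker for this relocation, so it patches the canary segment descriptor by hand: the subl $20 biases the base so the canary lands at %gs:20, and the 32-bit base is scattered across descriptor bytes 2-3, 4 and 7 as the x86 GDT layout dictates. A standalone sketch of that encoding, in plain C with no kernel types:

    /* Split a 32-bit base into GDT descriptor bytes and read it back.
     * Mirrors the movw/movb sequence above: bits 0-15 at bytes 2-3,
     * bits 16-23 at byte 4, bits 24-31 at byte 7. */
    #include <stdio.h>
    #include <stdint.h>

    static void set_desc_base(uint8_t desc[8], uint32_t base)
    {
        desc[2] = base & 0xff;              /* movw %cx, +2 (low half) */
        desc[3] = (base >> 8) & 0xff;
        desc[4] = (base >> 16) & 0xff;      /* movb %cl, +4 */
        desc[7] = (base >> 24) & 0xff;      /* movb %ch, +7 */
    }

    static uint32_t get_desc_base(const uint8_t desc[8])
    {
        return desc[2] | desc[3] << 8 | desc[4] << 16 |
               (uint32_t)desc[7] << 24;
    }

    int main(void)
    {
        uint8_t desc[8] = { 0 };

        set_desc_base(desc, 0x12345678);
        printf("base read back: %#x\n", get_desc_base(desc));  /* 0x12345678 */
        return 0;
    }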
--- head-2010-05-25.orig/arch/x86/kernel/head_64-xen.S 2010-03-24 15:12:46.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/head_64-xen.S 2010-03-24 15:25:06.000000000 +0100
@@ -21,6 +21,7 @@
#include <asm/msr.h>
#include <asm/cache.h>
#include <asm/dwarf2.h>
+#include <asm/percpu.h>
#include <xen/interface/elfnote.h>
.section .text.head, "ax", @progbits
@@ -32,11 +33,23 @@ startup_64:
/* rsi is pointer to startup info structure.
pass it to C */
movq %rsi,%rdi
+
+ /* Set up %gs.
+ *
+ * The base of %gs always points to the bottom of the irqstack
+ * union. If the stack protector canary is enabled, it is
+ * located at %gs:40. Note that, on SMP, the boot cpu uses
+	 * the init data section until the per-cpu areas are set up.
+ */
+ movl $MSR_GS_BASE,%ecx
+ movq $INIT_PER_CPU_VAR(irq_stack_union),%rax
+ movq %rax,%rdx
+ shrq $32,%rdx
+ wrmsr
+
pushq $0 # fake return address
jmp x86_64_start_kernel
-.balign PAGE_SIZE
-
#define NEXT_PAGE(name) \
.balign PAGE_SIZE; \
phys_##name = . - .text.head; \
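Loading MSR_GS_BASE above has to split the 64-bit address across %edx:%eax, which the startup code does by copying the value to %rdx and shifting right by 32 before wrmsr. The same split in C, with a made-up address:

    /* wrmsr takes its 64-bit payload as %edx (high) : %eax (low). */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t gs_base = 0xffffffff81234567ULL;   /* made-up address */
        uint32_t eax = (uint32_t)gs_base;           /* low 32 bits  */
        uint32_t edx = gs_base >> 32;               /* high 32 bits */

        printf("eax=%#x edx=%#x\n", eax, edx);
        printf("recombined=%#llx\n", ((unsigned long long)edx << 32) | eax);
        return 0;
    }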
--- head-2010-05-25.orig/arch/x86/kernel/ioport-xen.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/ioport-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -91,9 +91,8 @@ static int do_iopl(unsigned int level, s
}
#ifdef CONFIG_X86_32
-asmlinkage long sys_iopl(unsigned long regsp)
+long sys_iopl(struct pt_regs *regs)
{
- struct pt_regs *regs = (struct pt_regs *)&regsp;
unsigned int level = regs->bx;
#else
asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
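This pairs with the PTREGSCALL stubs added to entry_32-xen.S above: the stub loads the address of the saved register frame into %eax and jumps straight to sys_iopl, so the C handler can take a struct pt_regs * directly instead of reconstructing it from the address of a dummy first argument. A minimal userspace analogue of the two conventions, with a one-field pt_regs for illustration only:

    #include <stdio.h>

    struct pt_regs { unsigned long bx; };

    /* old convention: the argument *is* the first saved register and
     * the handler casts its own stack slot back into a frame pointer -
     * fragile, and the reason for the change */
    static long old_sys_iopl(unsigned long regsp)
    {
        struct pt_regs *regs = (struct pt_regs *)&regsp;
        return (long)regs->bx;
    }

    /* new convention: the stub passes the frame pointer explicitly */
    static long new_sys_iopl(struct pt_regs *regs)
    {
        return (long)regs->bx;
    }

    int main(void)
    {
        struct pt_regs frame = { .bx = 3 };

        printf("old: %ld\n", old_sys_iopl(frame.bx));
        printf("new: %ld\n", new_sys_iopl(&frame));
        return 0;
    }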
--- head-2010-05-25.orig/arch/x86/kernel/irq-xen.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/irq-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -6,13 +6,20 @@
#include <linux/kernel_stat.h>
#include <linux/seq_file.h>
#include <linux/smp.h>
+#include <linux/ftrace.h>
#include <asm/apic.h>
#include <asm/io_apic.h>
#include <asm/irq.h>
+#include <asm/idle.h>
atomic_t irq_err_count;
+#ifndef CONFIG_XEN
+/* Function pointer for generic interrupt vector handling */
+void (*generic_interrupt_extension)(void) = NULL;
+#endif
+
/*
* 'what should we do if we get a hw irq event on an illegal vector'.
* each architecture has to answer this themselves.
@@ -36,11 +43,7 @@ void ack_bad_irq(unsigned int irq)
#endif
}
-#ifdef CONFIG_X86_32
-# define irq_stats(x) (&per_cpu(irq_stat, x))
-#else
-# define irq_stats(x) cpu_pda(x)
-#endif
+#define irq_stats(x) (&per_cpu(irq_stat, x))
/*
* /proc/interrupts printing:
*/
@@ -57,6 +60,19 @@ static int show_other_interrupts(struct
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
seq_printf(p, " Local timer interrupts\n");
+
+ seq_printf(p, "%*s: ", prec, "SPU");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
+ seq_printf(p, " Spurious interrupts\n");
+#endif
+#ifndef CONFIG_XEN
+ if (generic_interrupt_extension) {
+ seq_printf(p, "%*s: ", prec, "PLT");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->generic_irqs);
+ seq_printf(p, " Platform interrupts\n");
+ }
#endif
#ifdef CONFIG_SMP
seq_printf(p, "%*s: ", prec, "RES");
@@ -86,12 +102,6 @@ static int show_other_interrupts(struct
seq_printf(p, " Threshold APIC interrupts\n");
# endif
#endif
-#ifdef CONFIG_X86_LOCAL_APIC
- seq_printf(p, "%*s: ", prec, "SPU");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
- seq_printf(p, " Spurious interrupts\n");
-#endif
seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
#if defined(CONFIG_X86_IO_APIC)
seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
@@ -128,23 +138,15 @@ int show_interrupts(struct seq_file *p,
return 0;
spin_lock_irqsave(&desc->lock, flags);
-#ifndef CONFIG_SMP
- any_count = kstat_irqs(i);
-#else
for_each_online_cpu(j)
any_count |= kstat_irqs_cpu(i, j);
-#endif
action = desc->action;
if (!action && !any_count)
goto out;
seq_printf(p, "%*d: ", prec, i);
-#ifndef CONFIG_SMP
- seq_printf(p, "%10u ", kstat_irqs(i));
-#else
for_each_online_cpu(j)
seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
-#endif
seq_printf(p, " %8s", desc->chip->name);
seq_printf(p, "-%-8s", desc->name);
@@ -169,6 +171,11 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
#ifdef CONFIG_X86_LOCAL_APIC
sum += irq_stats(cpu)->apic_timer_irqs;
+ sum += irq_stats(cpu)->irq_spurious_count;
+#endif
+#ifndef CONFIG_XEN
+ if (generic_interrupt_extension)
+ sum += irq_stats(cpu)->generic_irqs;
#endif
#ifdef CONFIG_SMP
sum += irq_stats(cpu)->irq_resched_count;
@@ -183,9 +190,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
sum += irq_stats(cpu)->irq_threshold_count;
#endif
#endif
-#ifdef CONFIG_X86_LOCAL_APIC
- sum += irq_stats(cpu)->irq_spurious_count;
-#endif
return sum;
}
@@ -198,3 +202,64 @@ u64 arch_irq_stat(void)
#endif
return sum;
}
+
+
+#ifndef CONFIG_XEN
+/*
+ * do_IRQ handles all normal device IRQ's (the special
+ * SMP cross-CPU interrupts have their own specific
+ * handlers).
+ */
+unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ /* high bit used in ret_from_ code */
+ unsigned vector = ~regs->orig_ax;
+ unsigned irq;
+
+ exit_idle();
+ irq_enter();
+
+ irq = __get_cpu_var(vector_irq)[vector];
+
+ if (!handle_irq(irq, regs)) {
+#ifdef CONFIG_X86_64
+ if (!disable_apic)
+ ack_APIC_irq();
+#endif
+
+ if (printk_ratelimit())
+ printk(KERN_EMERG "%s: %d.%d No irq handler for vector (irq %d)\n",
+ __func__, smp_processor_id(), vector, irq);
+ }
+
+ irq_exit();
+
+ set_irq_regs(old_regs);
+ return 1;
+}
+
+/*
+ * Handler for GENERIC_INTERRUPT_VECTOR.
+ */
+void smp_generic_interrupt(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ ack_APIC_irq();
+
+ exit_idle();
+
+ irq_enter();
+
+ inc_irq_stat(generic_irqs);
+
+ if (generic_interrupt_extension)
+ generic_interrupt_extension();
+
+ irq_exit();
+
+ set_irq_regs(old_regs);
+}
+#endif
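smp_generic_interrupt() above dispatches through the new generic_interrupt_extension pointer whenever a platform driver has installed one. The hook pattern in isolation; the hook name mirrors the patch, the consumer side is hypothetical:

    #include <stdio.h>

    /* NULL until a platform driver registers its handler */
    void (*generic_interrupt_extension)(void) = NULL;

    static void handle_generic_interrupt(void)
    {
        /* the real code also bumps irq_stats via inc_irq_stat() */
        if (generic_interrupt_extension)
            generic_interrupt_extension();
    }

    static void my_platform_handler(void)      /* hypothetical consumer */
    {
        printf("platform interrupt handled\n");
    }

    int main(void)
    {
        handle_generic_interrupt();             /* no hook: quiet no-op */
        generic_interrupt_extension = my_platform_handler;
        handle_generic_interrupt();             /* now dispatches */
        return 0;
    }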
--- head-2010-05-25.orig/arch/x86/kernel/machine_kexec_64.c 2010-04-15 10:03:05.000000000 +0200
+++ head-2010-05-25/arch/x86/kernel/machine_kexec_64.c 2010-04-15 10:07:08.000000000 +0200
@@ -92,13 +92,8 @@ void machine_kexec_setup_load_arg(xen_ke
xki->page_list[PA_CONTROL_PAGE] = __ma(control_page);
xki->page_list[PA_TABLE_PAGE] = __ma(table_page);
- xki->page_list[PA_PGD] = __ma(kexec_pgd);
- xki->page_list[PA_PUD_0] = __ma(kexec_pud0);
- xki->page_list[PA_PUD_1] = __ma(kexec_pud1);
- xki->page_list[PA_PMD_0] = __ma(kexec_pmd0);
- xki->page_list[PA_PMD_1] = __ma(kexec_pmd1);
- xki->page_list[PA_PTE_0] = __ma(kexec_pte0);
- xki->page_list[PA_PTE_1] = __ma(kexec_pte1);
+ if (image->type == KEXEC_TYPE_DEFAULT)
+ xki->page_list[PA_SWAP_PAGE] = page_to_phys(image->swap_page);
}
int __init machine_kexec_setup_resources(struct resource *hypervisor,
@@ -161,7 +156,7 @@ static int init_one_level2_page(struct k
}
pmd = pmd_offset(pud, addr);
if (!pmd_present(*pmd))
- set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
+ x_set_pmd(pmd, x__pmd(addr | X__PAGE_KERNEL_LARGE_EXEC));
result = 0;
out:
return result;
--- head-2010-05-25.orig/arch/x86/kernel/microcode_core-xen.c 2010-03-24 15:14:47.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/microcode_core-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -21,28 +21,28 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
+#include <linux/platform_device.h>
#include <linux/capability.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/sched.h>
+#include <linux/miscdevice.h>
+#include <linux/firmware.h>
#include <linux/smp_lock.h>
+#include <linux/spinlock.h>
#include <linux/cpumask.h>
-#include <linux/module.h>
-#include <linux/slab.h>
+#include <linux/uaccess.h>
#include <linux/vmalloc.h>
-#include <linux/miscdevice.h>
-#include <linux/spinlock.h>
-#include <linux/mm.h>
-#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/slab.h>
#include <linux/cpu.h>
-#include <linux/firmware.h>
-#include <linux/platform_device.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
-#include <asm/msr.h>
-#include <asm/uaccess.h>
-#include <asm/processor.h>
#include <asm/microcode.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
MODULE_DESCRIPTION("Microcode Update Driver");
MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
@@ -51,7 +51,7 @@ MODULE_LICENSE("GPL");
static int verbose;
module_param(verbose, int, 0644);
-#define MICROCODE_VERSION "2.00-xen"
+#define MICROCODE_VERSION "2.00-xen"
/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
static DEFINE_MUTEX(microcode_mutex);
@@ -143,12 +143,12 @@ static void microcode_dev_exit(void)
MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
#else
-#define microcode_dev_init() 0
-#define microcode_dev_exit() do { } while (0)
+#define microcode_dev_init() 0
+#define microcode_dev_exit() do { } while (0)
#endif
/* fake device for request_firmware */
-static struct platform_device *microcode_pdev;
+static struct platform_device *microcode_pdev;
static int request_microcode(const char *name)
{
--- head-2010-05-25.orig/arch/x86/kernel/mpparse-xen.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/mpparse-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -3,7 +3,7 @@
* compliant MP-table parsing routines.
*
* (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
- * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
+ * (c) 1998, 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
* (c) 2008 Alexey Starikovskiy <astarikovskiy@suse.de>
*/
@@ -29,11 +29,7 @@
#include <asm/setup.h>
#include <asm/smp.h>
-#include <mach_apic.h>
-#ifdef CONFIG_X86_32
-#include <mach_apicdef.h>
-#include <mach_mpparse.h>
-#endif
+#include <asm/apic.h>
static void *_bus_to_virt(unsigned long ma)
{
@@ -123,9 +119,6 @@ static void __init MP_bus_info(struct mp
} else
printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
}
-#endif
-
-#ifdef CONFIG_X86_IO_APIC
static int bad_ioapic(unsigned long address)
{
@@ -153,11 +146,11 @@ static void __init MP_ioapic_info(struct
if (bad_ioapic(m->apicaddr))
return;
- mp_ioapics[nr_ioapics].mp_apicaddr = m->apicaddr;
- mp_ioapics[nr_ioapics].mp_apicid = m->apicid;
- mp_ioapics[nr_ioapics].mp_type = m->type;
- mp_ioapics[nr_ioapics].mp_apicver = m->apicver;
- mp_ioapics[nr_ioapics].mp_flags = m->flags;
+ mp_ioapics[nr_ioapics].apicaddr = m->apicaddr;
+ mp_ioapics[nr_ioapics].apicid = m->apicid;
+ mp_ioapics[nr_ioapics].type = m->type;
+ mp_ioapics[nr_ioapics].apicver = m->apicver;
+ mp_ioapics[nr_ioapics].flags = m->flags;
nr_ioapics++;
}
@@ -169,55 +162,55 @@ static void print_MP_intsrc_info(struct
m->srcbusirq, m->dstapic, m->dstirq);
}
-static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq)
+static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq)
{
apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
" IRQ %02x, APIC ID %x, APIC INT %02x\n",
- mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3,
- (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus,
- mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq);
+ mp_irq->irqtype, mp_irq->irqflag & 3,
+ (mp_irq->irqflag >> 2) & 3, mp_irq->srcbus,
+ mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq);
}
static void __init assign_to_mp_irq(struct mpc_intsrc *m,
- struct mp_config_intsrc *mp_irq)
+ struct mpc_intsrc *mp_irq)
{
- mp_irq->mp_dstapic = m->dstapic;
- mp_irq->mp_type = m->type;
- mp_irq->mp_irqtype = m->irqtype;
- mp_irq->mp_irqflag = m->irqflag;
- mp_irq->mp_srcbus = m->srcbus;
- mp_irq->mp_srcbusirq = m->srcbusirq;
- mp_irq->mp_dstirq = m->dstirq;
+ mp_irq->dstapic = m->dstapic;
+ mp_irq->type = m->type;
+ mp_irq->irqtype = m->irqtype;
+ mp_irq->irqflag = m->irqflag;
+ mp_irq->srcbus = m->srcbus;
+ mp_irq->srcbusirq = m->srcbusirq;
+ mp_irq->dstirq = m->dstirq;
}
-static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq,
+static void __init assign_to_mpc_intsrc(struct mpc_intsrc *mp_irq,
struct mpc_intsrc *m)
{
- m->dstapic = mp_irq->mp_dstapic;
- m->type = mp_irq->mp_type;
- m->irqtype = mp_irq->mp_irqtype;
- m->irqflag = mp_irq->mp_irqflag;
- m->srcbus = mp_irq->mp_srcbus;
- m->srcbusirq = mp_irq->mp_srcbusirq;
- m->dstirq = mp_irq->mp_dstirq;
+ m->dstapic = mp_irq->dstapic;
+ m->type = mp_irq->type;
+ m->irqtype = mp_irq->irqtype;
+ m->irqflag = mp_irq->irqflag;
+ m->srcbus = mp_irq->srcbus;
+ m->srcbusirq = mp_irq->srcbusirq;
+ m->dstirq = mp_irq->dstirq;
}
-static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq,
+static int __init mp_irq_mpc_intsrc_cmp(struct mpc_intsrc *mp_irq,
struct mpc_intsrc *m)
{
- if (mp_irq->mp_dstapic != m->dstapic)
+ if (mp_irq->dstapic != m->dstapic)
return 1;
- if (mp_irq->mp_type != m->type)
+ if (mp_irq->type != m->type)
return 2;
- if (mp_irq->mp_irqtype != m->irqtype)
+ if (mp_irq->irqtype != m->irqtype)
return 3;
- if (mp_irq->mp_irqflag != m->irqflag)
+ if (mp_irq->irqflag != m->irqflag)
return 4;
- if (mp_irq->mp_srcbus != m->srcbus)
+ if (mp_irq->srcbus != m->srcbus)
return 5;
- if (mp_irq->mp_srcbusirq != m->srcbusirq)
+ if (mp_irq->srcbusirq != m->srcbusirq)
return 6;
- if (mp_irq->mp_dstirq != m->dstirq)
+ if (mp_irq->dstirq != m->dstirq)
return 7;
return 0;
@@ -238,8 +231,12 @@ static void __init MP_intsrc_info(struct
if (++mp_irq_entries == MAX_IRQ_SOURCES)
panic("Max # of irq sources exceeded!!\n");
}
+#else /* CONFIG_X86_IO_APIC */
+static inline void __init MP_bus_info(struct mpc_bus *m) {}
+static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {}
+static inline void __init MP_intsrc_info(struct mpc_intsrc *m) {}
+#endif /* CONFIG_X86_IO_APIC */
-#endif
static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
{
@@ -289,6 +286,20 @@ static int __init smp_check_mpc(struct m
return 1;
}
+static void skip_entry(unsigned char **ptr, int *count, int size)
+{
+ *ptr += size;
+ *count += size;
+}
+
+static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
+{
+ printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"
+ "type %x\n", *mpt);
+ print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
+ 1, mpc, mpc->length, 1);
+}
+
static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
{
char str[16];
@@ -300,17 +311,8 @@ static int __init smp_read_mpc(struct mp
if (!smp_check_mpc(mpc, oem, str))
return 0;
-#ifdef CONFIG_X86_32
- /*
- * need to make sure summit and es7000's mps_oem_check is safe to be
- * called early via genericarch 's mps_oem_check
- */
- if (early) {
-#ifdef CONFIG_X86_NUMAQ
- numaq_mps_oem_check(mpc, oem, str);
-#endif
- } else
- mps_oem_check(mpc, oem, str);
+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
+ generic_mps_oem_check(mpc, oem, str);
#endif
/* save the local APIC address, it might be non-default */
if (!acpi_lapic)
@@ -333,61 +335,30 @@ static int __init smp_read_mpc(struct mp
while (count < mpc->length) {
switch (*mpt) {
case MP_PROCESSOR:
- {
- struct mpc_cpu *m = (struct mpc_cpu *)mpt;
- /* ACPI may have already provided this data */
- if (!acpi_lapic)
- MP_processor_info(m);
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
+ /* ACPI may have already provided this data */
+ if (!acpi_lapic)
+ MP_processor_info((struct mpc_cpu *)mpt);
+ skip_entry(&mpt, &count, sizeof(struct mpc_cpu));
+ break;
case MP_BUS:
- {
- struct mpc_bus *m = (struct mpc_bus *)mpt;
-#ifdef CONFIG_X86_IO_APIC
- MP_bus_info(m);
-#endif
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
+ MP_bus_info((struct mpc_bus *)mpt);
+ skip_entry(&mpt, &count, sizeof(struct mpc_bus));
+ break;
case MP_IOAPIC:
- {
-#ifdef CONFIG_X86_IO_APIC
- struct mpc_ioapic *m = (struct mpc_ioapic *)mpt;
- MP_ioapic_info(m);
-#endif
- mpt += sizeof(struct mpc_ioapic);
- count += sizeof(struct mpc_ioapic);
- break;
- }
+ MP_ioapic_info((struct mpc_ioapic *)mpt);
+ skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
+ break;
case MP_INTSRC:
- {
-#ifdef CONFIG_X86_IO_APIC
- struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
-
- MP_intsrc_info(m);
-#endif
- mpt += sizeof(struct mpc_intsrc);
- count += sizeof(struct mpc_intsrc);
- break;
- }
+ MP_intsrc_info((struct mpc_intsrc *)mpt);
+ skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
+ break;
case MP_LINTSRC:
- {
- struct mpc_lintsrc *m =
- (struct mpc_lintsrc *)mpt;
- MP_lintsrc_info(m);
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
+ MP_lintsrc_info((struct mpc_lintsrc *)mpt);
+ skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc));
+ break;
default:
/* wrong mptable */
- printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
- printk(KERN_ERR "type %x\n", *mpt);
- print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
- 1, mpc, mpc->length, 1);
+ smp_dump_mptable(mpc, mpt);
count = mpc->length;
break;
}
@@ -395,13 +366,13 @@ static int __init smp_read_mpc(struct mp
(*x86_quirks->mpc_record)++;
}
-#ifdef CONFIG_X86_GENERICARCH
- generic_bigsmp_probe();
+#ifdef CONFIG_X86_BIGSMP
+ generic_bigsmp_probe();
#endif
-#ifdef CONFIG_X86_32
- setup_apic_routing();
-#endif
+ if (apic->setup_apic_routing)
+ apic->setup_apic_routing();
+
if (!num_processors)
printk(KERN_ERR "MPTABLE: no processors registered!\n");
return num_processors;
@@ -426,7 +397,7 @@ static void __init construct_default_ioi
intsrc.type = MP_INTSRC;
intsrc.irqflag = 0; /* conforming */
intsrc.srcbus = 0;
- intsrc.dstapic = mp_ioapics[0].mp_apicid;
+ intsrc.dstapic = mp_ioapics[0].apicid;
intsrc.irqtype = mp_INT;
@@ -579,14 +550,76 @@ static inline void __init construct_defa
}
}
-static struct intel_mp_floating *mpf_found;
+static struct mpf_intel *mpf_found;
+
+static unsigned long __init get_mpc_size(unsigned long physptr)
+{
+ struct mpc_table *mpc;
+ unsigned long size;
+
+ mpc = early_ioremap(physptr, PAGE_SIZE);
+ size = mpc->length;
+ early_iounmap(mpc, PAGE_SIZE);
+ apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size);
+
+ return size;
+}
+
+static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
+{
+ struct mpc_table *mpc;
+ unsigned long size;
+
+ size = get_mpc_size(mpf->physptr);
+ mpc = early_ioremap(mpf->physptr, size);
+ /*
+ * Read the physical hardware table. Anything here will
+ * override the defaults.
+ */
+ if (!smp_read_mpc(mpc, early)) {
+#ifdef CONFIG_X86_LOCAL_APIC
+ smp_found_config = 0;
+#endif
+ printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"
+ "... disabling SMP support. (tell your hw vendor)\n");
+ early_iounmap(mpc, size);
+ return -1;
+ }
+ early_iounmap(mpc, size);
+
+ if (early)
+ return -1;
+
+#ifdef CONFIG_X86_IO_APIC
+ /*
+ * If there are no explicit MP IRQ entries, then we are
+ * broken. We set up most of the low 16 IO-APIC pins to
+ * ISA defaults and hope it will work.
+ */
+ if (!mp_irq_entries) {
+ struct mpc_bus bus;
+
+ printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
+ "using default mptable. (tell your hw vendor)\n");
+
+ bus.type = MP_BUS;
+ bus.busid = 0;
+ memcpy(bus.bustype, "ISA ", 6);
+ MP_bus_info(&bus);
+
+ construct_default_ioirq_mptable(0);
+ }
+#endif
+
+ return 0;
+}
/*
* Scan the memory blocks for an SMP configuration block.
*/
static void __init __get_smp_config(unsigned int early)
{
- struct intel_mp_floating *mpf = mpf_found;
+ struct mpf_intel *mpf = mpf_found;
if (!mpf)
return;
@@ -607,9 +640,9 @@ static void __init __get_smp_config(unsi
}
printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
- mpf->mpf_specification);
+ mpf->specification);
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
- if (mpf->mpf_feature2 & (1 << 7)) {
+ if (mpf->feature2 & (1 << 7)) {
printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
pic_mode = 1;
} else {
@@ -620,7 +653,7 @@ static void __init __get_smp_config(unsi
/*
* Now see if we need to read further.
*/
- if (mpf->mpf_feature1 != 0) {
+ if (mpf->feature1 != 0) {
if (early) {
/*
* local APIC has default address
@@ -630,49 +663,12 @@ static void __init __get_smp_config(unsi
}
printk(KERN_INFO "Default MP configuration #%d\n",
- mpf->mpf_feature1);
- construct_default_ISA_mptable(mpf->mpf_feature1);
-
- } else if (mpf->mpf_physptr) {
-
- /*
- * Read the physical hardware table. Anything here will
- * override the defaults.
- */
- if (!smp_read_mpc(_bus_to_virt(mpf->mpf_physptr), early)) {
-#ifdef CONFIG_X86_LOCAL_APIC
- smp_found_config = 0;
-#endif
- printk(KERN_ERR
- "BIOS bug, MP table errors detected!...\n");
- printk(KERN_ERR "... disabling SMP support. "
- "(tell your hw vendor)\n");
- return;
- }
+ mpf->feature1);
+ construct_default_ISA_mptable(mpf->feature1);
- if (early)
+ } else if (mpf->physptr) {
+ if (check_physptr(mpf, early))
return;
-#ifdef CONFIG_X86_IO_APIC
- /*
- * If there are no explicit MP IRQ entries, then we are
- * broken. We set up most of the low 16 IO-APIC pins to
- * ISA defaults and hope it will work.
- */
- if (!mp_irq_entries) {
- struct mpc_bus bus;
-
- printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
- "using default mptable. "
- "(tell your hw vendor)\n");
-
- bus.type = MP_BUS;
- bus.busid = 0;
- memcpy(bus.bustype, "ISA ", 6);
- MP_bus_info(&bus);
-
- construct_default_ioirq_mptable(0);
- }
-#endif
} else
BUG();
@@ -693,58 +689,68 @@ void __init get_smp_config(void)
__get_smp_config(0);
}
+#ifndef CONFIG_XEN
+static void __init smp_reserve_bootmem(struct mpf_intel *mpf)
+{
+ unsigned long size = get_mpc_size(mpf->physptr);
+#ifdef CONFIG_X86_32
+ /*
+	 * We cannot access the MPC table to compute its size yet,
+	 * as only a few megabytes from the bottom are mapped now.
+	 * PC-9800's MPC table sits at the very end of physical
+	 * memory, so simply reserving PAGE_SIZE from mpf->physptr
+	 * would trigger a BUG() in reserve_bootmem.
+	 * We also need to make sure physptr is below max_low_pfn;
+	 * we don't need to reserve the area above max_low_pfn.
+ */
+ unsigned long end = max_low_pfn * PAGE_SIZE;
+
+ if (mpf->physptr < end) {
+ if (mpf->physptr + size > end)
+ size = end - mpf->physptr;
+ reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
+ }
+#else
+ reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
+#endif
+}
+#endif
+
static int __init smp_scan_config(unsigned long base, unsigned long length,
unsigned reserve)
{
unsigned int *bp = _bus_to_virt(base);
- struct intel_mp_floating *mpf;
+ struct mpf_intel *mpf;
apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
bp, length);
BUILD_BUG_ON(sizeof(*mpf) != 16);
while (length > 0) {
- mpf = (struct intel_mp_floating *)bp;
+ mpf = (struct mpf_intel *)bp;
if ((*bp == SMP_MAGIC_IDENT) &&
- (mpf->mpf_length == 1) &&
+ (mpf->length == 1) &&
!mpf_checksum((unsigned char *)bp, 16) &&
- ((mpf->mpf_specification == 1)
- || (mpf->mpf_specification == 4))) {
+ ((mpf->specification == 1)
+ || (mpf->specification == 4))) {
#ifdef CONFIG_X86_LOCAL_APIC
smp_found_config = 1;
#endif
mpf_found = mpf;
#ifndef CONFIG_XEN
- printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
- mpf, virt_to_phys(mpf));
+ printk(KERN_INFO "found SMP MP-table at [%p] %llx\n",
+ mpf, (u64)virt_to_phys(mpf));
if (!reserve)
return 1;
- reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
- BOOTMEM_DEFAULT);
- if (mpf->mpf_physptr) {
- unsigned long size = PAGE_SIZE;
-#ifdef CONFIG_X86_32
- /*
- * We cannot access to MPC table to compute
- * table size yet, as only few megabytes from
- * the bottom is mapped now.
- * PC-9800's MPC table places on the very last
- * of physical memory; so that simply reserving
- * PAGE_SIZE from mpg->mpf_physptr yields BUG()
- * in reserve_bootmem.
- */
- unsigned long end = max_low_pfn * PAGE_SIZE;
- if (mpf->mpf_physptr + size > end)
- size = end - mpf->mpf_physptr;
-#endif
- reserve_bootmem_generic(mpf->mpf_physptr, size,
+ reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf),
BOOTMEM_DEFAULT);
- }
+ if (mpf->physptr)
+ smp_reserve_bootmem(mpf);
#else
printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
- mpf, ((void *)bp - _bus_to_virt(base)) + base);
+ mpf, ((void *)bp - _bus_to_virt(base)) + base);
#endif
return 1;
}
@@ -826,15 +832,15 @@ static int __init get_MP_intsrc_index(s
/* not legacy */
for (i = 0; i < mp_irq_entries; i++) {
- if (mp_irqs[i].mp_irqtype != mp_INT)
+ if (mp_irqs[i].irqtype != mp_INT)
continue;
- if (mp_irqs[i].mp_irqflag != 0x0f)
+ if (mp_irqs[i].irqflag != 0x0f)
continue;
- if (mp_irqs[i].mp_srcbus != m->srcbus)
+ if (mp_irqs[i].srcbus != m->srcbus)
continue;
- if (mp_irqs[i].mp_srcbusirq != m->srcbusirq)
+ if (mp_irqs[i].srcbusirq != m->srcbusirq)
continue;
if (irq_used[i]) {
/* already claimed */
@@ -851,7 +857,58 @@ static int __init get_MP_intsrc_index(s
#define SPARE_SLOT_NUM 20
static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
-#endif
+
+static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
+{
+ int i;
+
+ apic_printk(APIC_VERBOSE, "OLD ");
+ print_MP_intsrc_info(m);
+
+ i = get_MP_intsrc_index(m);
+ if (i > 0) {
+ assign_to_mpc_intsrc(&mp_irqs[i], m);
+ apic_printk(APIC_VERBOSE, "NEW ");
+ print_mp_irq_info(&mp_irqs[i]);
+ return;
+ }
+ if (!i) {
+ /* legacy, do nothing */
+ return;
+ }
+ if (*nr_m_spare < SPARE_SLOT_NUM) {
+ /*
+ * not found (-1), or duplicated (-2) are invalid entries,
+ * we need to use the slot later
+ */
+ m_spare[*nr_m_spare] = m;
+ *nr_m_spare += 1;
+ }
+}
+#else /* CONFIG_X86_IO_APIC */
+static
+inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
+#endif /* CONFIG_X86_IO_APIC */
+
+static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length,
+ int count)
+{
+ if (!mpc_new_phys) {
+ pr_info("No spare slots, try to append...take your risk, "
+ "new mpc_length %x\n", count);
+ } else {
+ if (count <= mpc_new_length)
+ pr_info("No spare slots, try to append..., "
+ "new mpc_length %x\n", count);
+ else {
+ pr_err("mpc_new_length %lx is too small\n",
+ mpc_new_length);
+ return -1;
+ }
+ }
+
+ return 0;
+}
static int __init replace_intsrc_all(struct mpc_table *mpc,
unsigned long mpc_new_phys,
@@ -859,77 +916,33 @@ static int __init replace_intsrc_all(st
{
#ifdef CONFIG_X86_IO_APIC
int i;
- int nr_m_spare = 0;
#endif
-
int count = sizeof(*mpc);
+ int nr_m_spare = 0;
unsigned char *mpt = ((unsigned char *)mpc) + count;
printk(KERN_INFO "mpc_length %x\n", mpc->length);
while (count < mpc->length) {
switch (*mpt) {
case MP_PROCESSOR:
- {
- struct mpc_cpu *m = (struct mpc_cpu *)mpt;
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
+ skip_entry(&mpt, &count, sizeof(struct mpc_cpu));
+ break;
case MP_BUS:
- {
- struct mpc_bus *m = (struct mpc_bus *)mpt;
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
+ skip_entry(&mpt, &count, sizeof(struct mpc_bus));
+ break;
case MP_IOAPIC:
- {
- mpt += sizeof(struct mpc_ioapic);
- count += sizeof(struct mpc_ioapic);
- break;
- }
+ skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
+ break;
case MP_INTSRC:
- {
-#ifdef CONFIG_X86_IO_APIC
- struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
-
- apic_printk(APIC_VERBOSE, "OLD ");
- print_MP_intsrc_info(m);
- i = get_MP_intsrc_index(m);
- if (i > 0) {
- assign_to_mpc_intsrc(&mp_irqs[i], m);
- apic_printk(APIC_VERBOSE, "NEW ");
- print_mp_irq_info(&mp_irqs[i]);
- } else if (!i) {
- /* legacy, do nothing */
- } else if (nr_m_spare < SPARE_SLOT_NUM) {
- /*
- * not found (-1), or duplicated (-2)
- * are invalid entries,
- * we need to use the slot later
- */
- m_spare[nr_m_spare] = m;
- nr_m_spare++;
- }
-#endif
- mpt += sizeof(struct mpc_intsrc);
- count += sizeof(struct mpc_intsrc);
- break;
- }
+ check_irq_src((struct mpc_intsrc *)mpt, &nr_m_spare);
+ skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
+ break;
case MP_LINTSRC:
- {
- struct mpc_lintsrc *m =
- (struct mpc_lintsrc *)mpt;
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
+ skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc));
+ break;
default:
/* wrong mptable */
- printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
- printk(KERN_ERR "type %x\n", *mpt);
- print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
- 1, mpc, mpc->length, 1);
+ smp_dump_mptable(mpc, mpt);
goto out;
}
}
@@ -939,10 +952,10 @@ static int __init replace_intsrc_all(st
if (irq_used[i])
continue;
- if (mp_irqs[i].mp_irqtype != mp_INT)
+ if (mp_irqs[i].irqtype != mp_INT)
continue;
- if (mp_irqs[i].mp_irqflag != 0x0f)
+ if (mp_irqs[i].irqflag != 0x0f)
continue;
if (nr_m_spare > 0) {
@@ -953,16 +966,8 @@ static int __init replace_intsrc_all(st
} else {
struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
count += sizeof(struct mpc_intsrc);
- if (!mpc_new_phys) {
- printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count);
- } else {
- if (count <= mpc_new_length)
- printk(KERN_INFO "No spare slots, try to append..., new mpc_length %x\n", count);
- else {
- printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length);
- goto out;
- }
- }
+ if (!check_slot(mpc_new_phys, mpc_new_length, count))
+ goto out;
assign_to_mpc_intsrc(&mp_irqs[i], m);
mpc->length = count;
mpt += sizeof(struct mpc_intsrc);
@@ -1018,7 +1023,7 @@ static int __init update_mp_table(void)
{
char str[16];
char oem[10];
- struct intel_mp_floating *mpf;
+ struct mpf_intel *mpf;
struct mpc_table *mpc, *mpc_new;
if (!enable_update_mptable)
@@ -1031,19 +1036,19 @@ static int __init update_mp_table(void)
/*
* Now see if we need to go further.
*/
- if (mpf->mpf_feature1 != 0)
+ if (mpf->feature1 != 0)
return 0;
- if (!mpf->mpf_physptr)
+ if (!mpf->physptr)
return 0;
- mpc = _bus_to_virt(mpf->mpf_physptr);
+ mpc = _bus_to_virt(mpf->physptr);
if (!smp_check_mpc(mpc, oem, str))
return 0;
- printk(KERN_INFO "mpf: %lx\n", (long)arbitrary_virt_to_machine(mpf));
- printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr);
+ printk(KERN_INFO "mpf: %llx\n", (u64)arbitrary_virt_to_machine(mpf));
+ printk(KERN_INFO "physptr: %x\n", mpf->physptr);
if (mpc_new_phys && mpc->length > mpc_new_length) {
mpc_new_phys = 0;
@@ -1067,23 +1072,23 @@ static int __init update_mp_table(void)
maddr_t mpc_new_bus;
mpc_new_bus = phys_to_machine(mpc_new_phys);
- mpf->mpf_physptr = mpc_new_bus;
+ mpf->physptr = mpc_new_bus;
mpc_new = phys_to_virt(mpc_new_phys);
memcpy(mpc_new, mpc, mpc->length);
mpc = mpc_new;
/* check if we can modify that */
- if (mpc_new_bus - mpf->mpf_physptr) {
- struct intel_mp_floating *mpf_new;
+ if (mpc_new_bus - mpf->physptr) {
+ struct mpf_intel *mpf_new;
/* steal 16 bytes from [0, 1k) */
printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
mpf_new = isa_bus_to_virt(0x400 - 16);
memcpy(mpf_new, mpf, 16);
mpf = mpf_new;
- mpf->mpf_physptr = mpc_new_bus;
+ mpf->physptr = mpc_new_bus;
}
- mpf->mpf_checksum = 0;
- mpf->mpf_checksum -= mpf_checksum((unsigned char *)mpf, 16);
- printk(KERN_INFO "mpf_physptr new: %x\n", mpf->mpf_physptr);
+ mpf->checksum = 0;
+ mpf->checksum -= mpf_checksum((unsigned char *)mpf, 16);
+ printk(KERN_INFO "physptr new: %x\n", mpf->physptr);
}
/*
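The mpparse rework above replaces the per-case pointer arithmetic with a small skip_entry() helper while walking the MP table's variable-length records. A self-contained sketch of that walk over a fake two-record table:

    #include <stdio.h>

    struct rec_a { unsigned char type; char payload[3]; };  /* 4 bytes */
    struct rec_b { unsigned char type; char payload[7]; };  /* 8 bytes */

    /* identical to the helper introduced by the patch */
    static void skip_entry(unsigned char **ptr, int *count, int size)
    {
        *ptr += size;
        *count += size;
    }

    int main(void)
    {
        unsigned char table[sizeof(struct rec_a) + sizeof(struct rec_b)] = { 0 };
        unsigned char *mpt = table;
        int count = 0, length = sizeof(table);

        table[sizeof(struct rec_a)] = 1;        /* second record: type 1 */

        while (count < length) {
            switch (*mpt) {
            case 0:
                printf("rec_a at offset %d\n", count);
                skip_entry(&mpt, &count, sizeof(struct rec_a));
                break;
            case 1:
                printf("rec_b at offset %d\n", count);
                skip_entry(&mpt, &count, sizeof(struct rec_b));
                break;
            default:
                return 1;                       /* corrupt table */
            }
        }
        return 0;
    }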
--- head-2010-05-25.orig/arch/x86/kernel/pci-dma-xen.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/pci-dma-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -1,4 +1,5 @@
#include <linux/dma-mapping.h>
+#include <linux/dma-debug.h>
#include <linux/dmar.h>
#include <linux/bootmem.h>
#include <linux/pci.h>
@@ -12,7 +13,7 @@
static int forbid_dac __read_mostly;
-struct dma_mapping_ops *dma_ops;
+struct dma_map_ops *dma_ops;
EXPORT_SYMBOL(dma_ops);
static int iommu_sac_force __read_mostly;
@@ -39,11 +40,14 @@ EXPORT_SYMBOL(bad_dma_address);
to older i386. */
struct device x86_dma_fallback_dev = {
.init_name = "fallback device",
- .coherent_dma_mask = DMA_32BIT_MASK,
+ .coherent_dma_mask = DMA_BIT_MASK(32),
.dma_mask = &x86_dma_fallback_dev.coherent_dma_mask,
};
EXPORT_SYMBOL(x86_dma_fallback_dev);
+/* Number of entries preallocated for DMA-API debugging */
+#define PREALLOC_DMA_DEBUG_ENTRIES 32768
+
int dma_set_mask(struct device *dev, u64 mask)
{
if (!dev->dma_mask || !dma_supported(dev, mask))
@@ -103,20 +107,20 @@ static void __init dma32_free_bootmem(vo
}
#endif
-static struct dma_mapping_ops swiotlb_dma_ops = {
+static struct dma_map_ops swiotlb_dma_ops = {
.alloc_coherent = dma_generic_alloc_coherent,
.free_coherent = dma_generic_free_coherent,
.mapping_error = swiotlb_dma_mapping_error,
- .map_single = swiotlb_map_single_phys,
- .unmap_single = swiotlb_unmap_single,
+ .map_page = swiotlb_map_page,
+ .unmap_page = swiotlb_unmap_page,
.sync_single_for_cpu = swiotlb_sync_single_for_cpu,
.sync_single_for_device = swiotlb_sync_single_for_device,
.sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
.sync_single_range_for_device = swiotlb_sync_single_range_for_device,
.sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
.sync_sg_for_device = swiotlb_sync_sg_for_device,
- .map_sg = swiotlb_map_sg,
- .unmap_sg = swiotlb_unmap_sg,
+ .map_sg = swiotlb_map_sg_attrs,
+ .unmap_sg = swiotlb_unmap_sg_attrs,
.dma_supported = swiotlb_dma_supported
};
@@ -175,7 +179,7 @@ again:
if (!is_buffer_dma_capable(dma_mask, addr, size)) {
__free_pages(page, order);
- if (dma_mask < DMA_32BIT_MASK && !(flag & GFP_DMA)) {
+ if (dma_mask < DMA_BIT_MASK(32) && !(flag & GFP_DMA)) {
flag = (flag & ~GFP_DMA32) | GFP_DMA;
goto again;
}
@@ -305,7 +309,7 @@ int range_straddles_page_boundary(paddr_
int dma_supported(struct device *dev, u64 mask)
{
- struct dma_mapping_ops *ops = get_dma_ops(dev);
+ struct dma_map_ops *ops = get_dma_ops(dev);
#ifdef CONFIG_PCI
if (mask > 0xffffffff && forbid_dac > 0) {
@@ -320,7 +324,7 @@ int dma_supported(struct device *dev, u6
/* Copied from i386. Doesn't make much sense, because it will
only work for pci_alloc_coherent.
The caller just has to use GFP_DMA in this case. */
- if (mask < DMA_24BIT_MASK)
+ if (mask < DMA_BIT_MASK(24))
return 0;
/* Tell the device to use SAC when IOMMU force is on. This
@@ -335,7 +339,7 @@ int dma_supported(struct device *dev, u6
SAC for these. Assume all masks <= 40 bits are of this
type. Normally this doesn't make any difference, but gives
more gentle handling of IOMMU overflow. */
- if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
+ if (iommu_sac_force && (mask >= DMA_BIT_MASK(40))) {
dev_info(dev, "Force SAC with mask %Lx\n", mask);
return 0;
}
@@ -346,6 +350,12 @@ EXPORT_SYMBOL(dma_supported);
static int __init pci_iommu_init(void)
{
+ dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
+
+#ifdef CONFIG_PCI
+ dma_debug_add_bus(&pci_bus_type);
+#endif
+
calgary_iommu_init();
intel_iommu_init();
@@ -371,8 +381,7 @@ fs_initcall(pci_iommu_init);
static __devinit void via_no_dac(struct pci_dev *dev)
{
if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
- printk(KERN_INFO
- "PCI: VIA PCI bridge detected. Disabling DAC.\n");
+ dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n");
forbid_dac = 1;
}
}
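Aside (illustrative, not part of the patch): the hunks above follow the 2.6.30 switch from struct dma_mapping_ops to the generic struct dma_map_ops, whose per-buffer hooks take a (page, offset) pair plus explicit direction and attribute arguments, replacing map_single/unmap_single. A minimal backend skeleton with the signatures visible in this patch; my_map_page, my_unmap_page, and my_dma_ops are made-up names, and the identity mapping exists only for the sketch:

#include <linux/dma-mapping.h>
#include <asm/io.h>

/* Hypothetical backend shaped like the converted ops tables above. */
static dma_addr_t my_map_page(struct device *dev, struct page *page,
			      unsigned long offset, size_t size,
			      enum dma_data_direction dir,
			      struct dma_attrs *attrs)
{
	return page_to_phys(page) + offset;	/* identity mapping, sketch only */
}

static void my_unmap_page(struct device *dev, dma_addr_t addr, size_t size,
			  enum dma_data_direction dir, struct dma_attrs *attrs)
{
	/* nothing to undo for an identity mapping */
}

static struct dma_map_ops my_dma_ops = {
	.map_page	= my_map_page,
	.unmap_page	= my_unmap_page,
};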
--- head-2010-05-25.orig/arch/x86/kernel/pci-nommu-xen.c 2010-03-24 15:14:47.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/pci-nommu-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -24,7 +24,7 @@ do { \
static int
gnttab_map_sg(struct device *hwdev, struct scatterlist *sgl, int nents,
- int direction)
+ enum dma_data_direction dir, struct dma_attrs *attrs)
{
unsigned int i;
struct scatterlist *sg;
@@ -48,7 +48,7 @@ gnttab_map_sg(struct device *hwdev, stru
static void
gnttab_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nents,
- int direction)
+ enum dma_data_direction dir, struct dma_attrs *attrs)
{
unsigned int i;
struct scatterlist *sg;
@@ -58,24 +58,25 @@ gnttab_unmap_sg(struct device *hwdev, st
}
static dma_addr_t
-gnttab_map_single(struct device *dev, phys_addr_t paddr, size_t size,
- int direction)
+gnttab_map_page(struct device *dev, struct page *page, unsigned long offset,
+ size_t size, enum dma_data_direction dir,
+ struct dma_attrs *attrs)
{
dma_addr_t dma;
WARN_ON(size == 0);
- dma = gnttab_dma_map_page(pfn_to_page(paddr >> PAGE_SHIFT)) +
- offset_in_page(paddr);
- IOMMU_BUG_ON(range_straddles_page_boundary(paddr, size));
+ dma = gnttab_dma_map_page(page) + offset;
+ IOMMU_BUG_ON(range_straddles_page_boundary(page_to_pseudophys(page) +
+ offset, size));
IOMMU_BUG_ON(address_needs_mapping(dev, dma, size));
return dma;
}
static void
-gnttab_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
- int direction)
+gnttab_unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
+ enum dma_data_direction dir, struct dma_attrs *attrs)
{
gnttab_dma_unmap_page(dma_addr);
}
@@ -85,14 +86,14 @@ static int nommu_dma_supported(struct de
return 1;
}
-struct dma_mapping_ops nommu_dma_ops = {
- .alloc_coherent = dma_generic_alloc_coherent,
- .free_coherent = dma_generic_free_coherent,
- .map_single = gnttab_map_single,
- .unmap_single = gnttab_unmap_single,
- .map_sg = gnttab_map_sg,
- .unmap_sg = gnttab_unmap_sg,
- .dma_supported = nommu_dma_supported,
+struct dma_map_ops nommu_dma_ops = {
+ .alloc_coherent = dma_generic_alloc_coherent,
+ .free_coherent = dma_generic_free_coherent,
+ .map_page = gnttab_map_page,
+ .unmap_page = gnttab_unmap_page,
+ .map_sg = gnttab_map_sg,
+ .unmap_sg = gnttab_unmap_sg,
+ .dma_supported = nommu_dma_supported,
};
void __init no_iommu_init(void)
--- head-2010-05-25.orig/arch/x86/kernel/process-xen.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/process-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -1,16 +1,19 @@
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
-#include <asm/idle.h>
#include <linux/smp.h>
+#include <linux/prctl.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/pm.h>
#include <linux/clockchips.h>
-#include <linux/ftrace.h>
+#include <trace/power.h>
#include <asm/system.h>
#include <asm/apic.h>
+#include <asm/idle.h>
+#include <asm/uaccess.h>
+#include <asm/i387.h>
#include <xen/evtchn.h>
unsigned long idle_halt;
@@ -20,6 +23,9 @@ EXPORT_SYMBOL(idle_nomwait);
struct kmem_cache *task_xstate_cachep;
+DEFINE_TRACE(power_start);
+DEFINE_TRACE(power_end);
+
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
*dst = *src;
@@ -57,6 +63,179 @@ void arch_task_cache_init(void)
}
/*
+ * Free current thread data structures etc..
+ */
+void exit_thread(void)
+{
+ struct task_struct *me = current;
+ struct thread_struct *t = &me->thread;
+ unsigned long *bp = t->io_bitmap_ptr;
+
+ if (bp) {
+ struct physdev_set_iobitmap set_iobitmap;
+
+ t->io_bitmap_ptr = NULL;
+ clear_thread_flag(TIF_IO_BITMAP);
+ /*
+ * Careful, clear this in the TSS too:
+ */
+ memset(&set_iobitmap, 0, sizeof(set_iobitmap));
+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
+ &set_iobitmap));
+ t->io_bitmap_max = 0;
+ kfree(bp);
+ }
+
+ ds_exit_thread(current);
+}
+
+void flush_thread(void)
+{
+ struct task_struct *tsk = current;
+
+#ifdef CONFIG_X86_64
+ if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
+ clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
+ if (test_tsk_thread_flag(tsk, TIF_IA32)) {
+ clear_tsk_thread_flag(tsk, TIF_IA32);
+ } else {
+ set_tsk_thread_flag(tsk, TIF_IA32);
+ current_thread_info()->status |= TS_COMPAT;
+ }
+ }
+#endif
+
+ clear_tsk_thread_flag(tsk, TIF_DEBUG);
+
+ tsk->thread.debugreg0 = 0;
+ tsk->thread.debugreg1 = 0;
+ tsk->thread.debugreg2 = 0;
+ tsk->thread.debugreg3 = 0;
+ tsk->thread.debugreg6 = 0;
+ tsk->thread.debugreg7 = 0;
+ memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
+ /*
+ * Forget coprocessor state..
+ */
+ tsk->fpu_counter = 0;
+ clear_fpu(tsk);
+ clear_used_math();
+}
+
+static void hard_disable_TSC(void)
+{
+ write_cr4(read_cr4() | X86_CR4_TSD);
+}
+
+void disable_TSC(void)
+{
+ preempt_disable();
+ if (!test_and_set_thread_flag(TIF_NOTSC))
+ /*
+ * Must flip the CPU state synchronously with
+ * TIF_NOTSC in the current running context.
+ */
+ hard_disable_TSC();
+ preempt_enable();
+}
+
+static void hard_enable_TSC(void)
+{
+ write_cr4(read_cr4() & ~X86_CR4_TSD);
+}
+
+static void enable_TSC(void)
+{
+ preempt_disable();
+ if (test_and_clear_thread_flag(TIF_NOTSC))
+ /*
+ * Must flip the CPU state synchronously with
+ * TIF_NOTSC in the current running context.
+ */
+ hard_enable_TSC();
+ preempt_enable();
+}
+
+int get_tsc_mode(unsigned long adr)
+{
+ unsigned int val;
+
+ if (test_thread_flag(TIF_NOTSC))
+ val = PR_TSC_SIGSEGV;
+ else
+ val = PR_TSC_ENABLE;
+
+ return put_user(val, (unsigned int __user *)adr);
+}
+
+int set_tsc_mode(unsigned int val)
+{
+ if (val == PR_TSC_SIGSEGV)
+ disable_TSC();
+ else if (val == PR_TSC_ENABLE)
+ enable_TSC();
+ else
+ return -EINVAL;
+
+ return 0;
+}
+
+void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
+{
+ struct thread_struct *prev, *next;
+
+ prev = &prev_p->thread;
+ next = &next_p->thread;
+
+ if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
+ test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
+ ds_switch_to(prev_p, next_p);
+ else if (next->debugctlmsr != prev->debugctlmsr)
+ update_debugctlmsr(next->debugctlmsr);
+
+ if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
+ set_debugreg(next->debugreg0, 0);
+ set_debugreg(next->debugreg1, 1);
+ set_debugreg(next->debugreg2, 2);
+ set_debugreg(next->debugreg3, 3);
+ /* no 4 and 5 */
+ set_debugreg(next->debugreg6, 6);
+ set_debugreg(next->debugreg7, 7);
+ }
+
+ if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
+ test_tsk_thread_flag(next_p, TIF_NOTSC)) {
+ /* prev and next are different */
+ if (test_tsk_thread_flag(next_p, TIF_NOTSC))
+ hard_disable_TSC();
+ else
+ hard_enable_TSC();
+ }
+}
+
+int sys_fork(struct pt_regs *regs)
+{
+ return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
+}
+
+/*
+ * This is trivial, and on the face of it looks like it
+ * could equally well be done in user mode.
+ *
+ * Not so, for quite unobvious reasons - register pressure.
+ * In user mode vfork() cannot have a stack frame, and if
+ * done by calling the "clone()" system call directly, you
+ * do not have enough call-clobbered registers to hold all
+ * the information you need.
+ */
+int sys_vfork(struct pt_regs *regs)
+{
+ return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
+ NULL, NULL);
+}
+
+
+/*
* Idle related variables and functions
*/
unsigned long boot_option_idle_override = 0;
@@ -130,7 +309,7 @@ void stop_this_cpu(void *dummy)
/*
* Remove this CPU:
*/
- cpu_clear(smp_processor_id(), cpu_online_map);
+ set_cpu_online(smp_processor_id(), false);
disable_all_local_evtchn();
for (;;) {
@@ -283,12 +462,13 @@ static int __cpuinit check_c1e_idle(cons
return 1;
}
-static cpumask_t c1e_mask = CPU_MASK_NONE;
+static cpumask_var_t c1e_mask;
static int c1e_detected;
void c1e_remove_cpu(int cpu)
{
- cpu_clear(cpu, c1e_mask);
+ if (c1e_mask != NULL)
+ cpumask_clear_cpu(cpu, c1e_mask);
}
/*
@@ -317,8 +497,8 @@ static void c1e_idle(void)
if (c1e_detected) {
int cpu = smp_processor_id();
- if (!cpu_isset(cpu, c1e_mask)) {
- cpu_set(cpu, c1e_mask);
+ if (!cpumask_test_cpu(cpu, c1e_mask)) {
+ cpumask_set_cpu(cpu, c1e_mask);
/*
* Force broadcast so ACPI can not interfere. Needs
* to run with interrupts enabled as it uses
@@ -350,7 +530,7 @@ static void c1e_idle(void)
void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifndef CONFIG_XEN
-#ifdef CONFIG_X86_SMP
+#ifdef CONFIG_SMP
if (pm_idle == poll_idle && smp_num_siblings > 1) {
printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
" performance may degrade.\n");
@@ -373,6 +553,17 @@ void __cpuinit select_idle_routine(const
#endif
}
+void __init init_c1e_mask(void)
+{
+#ifndef CONFIG_XEN
+ /* If we're using c1e_idle, we need to allocate c1e_mask. */
+ if (pm_idle == c1e_idle) {
+ alloc_cpumask_var(&c1e_mask, GFP_KERNEL);
+ cpumask_clear(c1e_mask);
+ }
+#endif
+}
+
static int __init idle_setup(char *str)
{
if (!str)
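Aside (illustrative, not part of the patch): the get_tsc_mode()/set_tsc_mode() helpers gathered above implement the PR_GET_TSC/PR_SET_TSC prctls, with PR_TSC_ENABLE and PR_TSC_SIGSEGV as the two modes visible in the code. A user-space sketch of that interface, error handling elided:

#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
	int mode;

	prctl(PR_GET_TSC, &mode);		/* PR_TSC_ENABLE or PR_TSC_SIGSEGV */
	printf("tsc mode: %d\n", mode);
	prctl(PR_SET_TSC, PR_TSC_SIGSEGV);	/* rdtsc in this task now raises SIGSEGV */
	return 0;
}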
--- head-2010-05-25.orig/arch/x86/kernel/process_32-xen.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/process_32-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -11,6 +11,7 @@
#include <stdarg.h>
+#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
@@ -71,9 +72,6 @@ asmlinkage void cstar_ret_from_fork(void
DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
EXPORT_PER_CPU_SYMBOL(current_task);
-DEFINE_PER_CPU(int, cpu_number);
-EXPORT_PER_CPU_SYMBOL(cpu_number);
-
/*
* Return saved PC of a blocked thread.
*/
@@ -99,6 +97,15 @@ void cpu_idle(void)
{
int cpu = smp_processor_id();
+ /*
+ * If we're the non-boot CPU, nothing set the stack canary up
+ * for us. CPU0 already has it initialized but no harm in
+ * doing it again. This is a good place for updating it, as
+ * we won't ever return from this function (so the invalid
+ * canaries already on the stack won't ever trigger).
+ */
+ boot_init_stack_canary();
+
current_thread_info()->status |= TS_POLLING;
/* endless idle loop with no priority at all */
@@ -113,7 +120,6 @@ void cpu_idle(void)
play_dead();
local_irq_disable();
- __get_cpu_var(irq_stat).idle_timestamp = jiffies;
/* Don't trace irqs off for idle */
stop_critical_timings();
xen_idle();
@@ -137,7 +143,7 @@ void __show_regs(struct pt_regs *regs, i
if (user_mode_vm(regs)) {
sp = regs->sp;
ss = regs->ss & 0xffff;
- savesegment(gs, gs);
+ gs = get_user_gs(regs);
} else {
sp = (unsigned long) (&regs->sp);
savesegment(ss, ss);
@@ -218,6 +224,7 @@ int kernel_thread(int (*fn)(void *), voi
regs.ds = __USER_DS;
regs.es = __USER_DS;
regs.fs = __KERNEL_PERCPU;
+ regs.gs = __KERNEL_STACK_CANARY;
regs.orig_ax = -1;
regs.ip = (unsigned long) kernel_thread_helper;
regs.cs = __KERNEL_CS | get_kernel_rpl();
@@ -228,47 +235,6 @@ int kernel_thread(int (*fn)(void *), voi
}
EXPORT_SYMBOL(kernel_thread);
-/*
- * Free current thread data structures etc..
- */
-void exit_thread(void)
-{
- /* The process may have allocated an io port bitmap... nuke it. */
- if (unlikely(test_thread_flag(TIF_IO_BITMAP))) {
- struct task_struct *tsk = current;
- struct thread_struct *t = &tsk->thread;
- struct physdev_set_iobitmap set_iobitmap;
- memset(&set_iobitmap, 0, sizeof(set_iobitmap));
- WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
- &set_iobitmap));
- kfree(t->io_bitmap_ptr);
- t->io_bitmap_ptr = NULL;
- clear_thread_flag(TIF_IO_BITMAP);
- }
-
- ds_exit_thread(current);
-}
-
-void flush_thread(void)
-{
- struct task_struct *tsk = current;
-
- tsk->thread.debugreg0 = 0;
- tsk->thread.debugreg1 = 0;
- tsk->thread.debugreg2 = 0;
- tsk->thread.debugreg3 = 0;
- tsk->thread.debugreg6 = 0;
- tsk->thread.debugreg7 = 0;
- memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
- clear_tsk_thread_flag(tsk, TIF_DEBUG);
- /*
- * Forget coprocessor state..
- */
- tsk->fpu_counter = 0;
- clear_fpu(tsk);
- clear_used_math();
-}
-
void release_thread(struct task_struct *dead_task)
{
BUG_ON(dead_task->mm);
@@ -284,7 +250,7 @@ void prepare_to_copy(struct task_struct
unlazy_fpu(tsk);
}
-int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
+int copy_thread(unsigned long clone_flags, unsigned long sp,
unsigned long unused,
struct task_struct *p, struct pt_regs *regs)
{
@@ -302,7 +268,7 @@ int copy_thread(int nr, unsigned long cl
p->thread.ip = (unsigned long) ret_from_fork;
- savesegment(gs, p->thread.gs);
+ task_user_gs(p) = get_user_gs(regs);
tsk = current;
if (test_tsk_thread_flag(tsk, TIF_CSTAR))
@@ -344,7 +310,7 @@ int copy_thread(int nr, unsigned long cl
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
- __asm__("movl %0, %%gs" : : "r"(0));
+ set_user_gs(regs, 0);
regs->fs = 0;
set_fs(USER_DS);
regs->ds = __USER_DS;
@@ -360,98 +326,6 @@ start_thread(struct pt_regs *regs, unsig
}
EXPORT_SYMBOL_GPL(start_thread);
-static void hard_disable_TSC(void)
-{
- write_cr4(read_cr4() | X86_CR4_TSD);
-}
-
-void disable_TSC(void)
-{
- preempt_disable();
- if (!test_and_set_thread_flag(TIF_NOTSC))
- /*
- * Must flip the CPU state synchronously with
- * TIF_NOTSC in the current running context.
- */
- hard_disable_TSC();
- preempt_enable();
-}
-
-static void hard_enable_TSC(void)
-{
- write_cr4(read_cr4() & ~X86_CR4_TSD);
-}
-
-static void enable_TSC(void)
-{
- preempt_disable();
- if (test_and_clear_thread_flag(TIF_NOTSC))
- /*
- * Must flip the CPU state synchronously with
- * TIF_NOTSC in the current running context.
- */
- hard_enable_TSC();
- preempt_enable();
-}
-
-int get_tsc_mode(unsigned long adr)
-{
- unsigned int val;
-
- if (test_thread_flag(TIF_NOTSC))
- val = PR_TSC_SIGSEGV;
- else
- val = PR_TSC_ENABLE;
-
- return put_user(val, (unsigned int __user *)adr);
-}
-
-int set_tsc_mode(unsigned int val)
-{
- if (val == PR_TSC_SIGSEGV)
- disable_TSC();
- else if (val == PR_TSC_ENABLE)
- enable_TSC();
- else
- return -EINVAL;
-
- return 0;
-}
-
-static noinline void
-__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
-{
- struct thread_struct *prev, *next;
-
- prev = &prev_p->thread;
- next = &next_p->thread;
-
- if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
- test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
- ds_switch_to(prev_p, next_p);
- else if (next->debugctlmsr != prev->debugctlmsr)
- update_debugctlmsr(next->debugctlmsr);
-
- if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
- set_debugreg(next->debugreg0, 0);
- set_debugreg(next->debugreg1, 1);
- set_debugreg(next->debugreg2, 2);
- set_debugreg(next->debugreg3, 3);
- /* no 4 and 5 */
- set_debugreg(next->debugreg6, 6);
- set_debugreg(next->debugreg7, 7);
- }
-
- if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
- test_tsk_thread_flag(next_p, TIF_NOTSC)) {
- /* prev and next are different */
- if (test_tsk_thread_flag(next_p, TIF_NOTSC))
- hard_disable_TSC();
- else
- hard_enable_TSC();
- }
-}
-
/*
* switch_to(x,y) should switch tasks from x to y.
*
@@ -532,7 +406,7 @@ __switch_to(struct task_struct *prev_p,
if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \
next->tls_array[i].b != prev->tls_array[i].b)) { \
mcl->op = __HYPERVISOR_update_descriptor; \
- *(u64 *)&mcl->args[0] = virt_to_machine( \
+ *(u64 *)&mcl->args[0] = arbitrary_virt_to_machine( \
&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\
*(u64 *)&mcl->args[2] = *(u64 *)&next->tls_array[i]; \
mcl++; \
@@ -612,64 +486,44 @@ __switch_to(struct task_struct *prev_p,
* Restore %gs if needed (which is common)
*/
if (prev->gs | next->gs)
- loadsegment(gs, next->gs);
+ lazy_load_gs(next->gs);
- x86_write_percpu(current_task, next_p);
+ percpu_write(current_task, next_p);
return prev_p;
}
-asmlinkage int sys_fork(struct pt_regs regs)
-{
- return do_fork(SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
-}
-
-asmlinkage int sys_clone(struct pt_regs regs)
+int sys_clone(struct pt_regs *regs)
{
unsigned long clone_flags;
unsigned long newsp;
int __user *parent_tidptr, *child_tidptr;
- clone_flags = regs.bx;
- newsp = regs.cx;
- parent_tidptr = (int __user *)regs.dx;
- child_tidptr = (int __user *)regs.di;
+ clone_flags = regs->bx;
+ newsp = regs->cx;
+ parent_tidptr = (int __user *)regs->dx;
+ child_tidptr = (int __user *)regs->di;
if (!newsp)
- newsp = regs.sp;
- return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
-}
-
-/*
- * This is trivial, and on the face of it looks like it
- * could equally well be done in user mode.
- *
- * Not so, for quite unobvious reasons - register pressure.
- * In user mode vfork() cannot have a stack frame, and if
- * done by calling the "clone()" system call directly, you
- * do not have enough call-clobbered registers to hold all
- * the information you need.
- */
-asmlinkage int sys_vfork(struct pt_regs regs)
-{
- return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
+ newsp = regs->sp;
+ return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr);
}
/*
* sys_execve() executes a new program.
*/
-asmlinkage int sys_execve(struct pt_regs regs)
+int sys_execve(struct pt_regs *regs)
{
int error;
char *filename;
- filename = getname((char __user *) regs.bx);
+ filename = getname((char __user *) regs->bx);
error = PTR_ERR(filename);
if (IS_ERR(filename))
goto out;
error = do_execve(filename,
- (char __user * __user *) regs.cx,
- (char __user * __user *) regs.dx,
- &regs);
+ (char __user * __user *) regs->cx,
+ (char __user * __user *) regs->dx,
+ regs);
if (error == 0) {
/* Make sure we don't return using sysenter.. */
set_thread_flag(TIF_IRET);
--- head-2010-05-25.orig/arch/x86/kernel/process_64-xen.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/process_64-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -19,6 +19,7 @@
#include <stdarg.h>
+#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
@@ -50,7 +51,6 @@
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
-#include <asm/pda.h>
#include <asm/prctl.h>
#include <xen/interface/platform.h>
#include <xen/interface/physdev.h>
@@ -67,6 +67,11 @@
asmlinkage extern void ret_from_fork(void);
+DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
+EXPORT_PER_CPU_SYMBOL(current_task);
+
+static DEFINE_PER_CPU(unsigned char, is_idle);
+
unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
static ATOMIC_NOTIFIER_HEAD(idle_notifier);
@@ -85,13 +90,13 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregist
void enter_idle(void)
{
- write_pda(isidle, 1);
+ percpu_write(is_idle, 1);
atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}
static void __exit_idle(void)
{
- if (test_and_clear_bit_pda(0, isidle) == 0)
+ if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
return;
atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}
@@ -121,6 +126,16 @@ static inline void play_dead(void)
void cpu_idle(void)
{
current_thread_info()->status |= TS_POLLING;
+
+ /*
+ * If we're the non-boot CPU, nothing set the stack canary up
+ * for us. CPU0 already has it initialized but no harm in
+ * doing it again. This is a good place for updating it, as
+ * we won't ever return from this function (so the invalid
+ * canaries already on the stack won't ever trigger).
+ */
+ boot_init_stack_canary();
+
/* endless idle loop with no priority at all */
while (1) {
tick_nohz_stop_sched_tick(1);
@@ -230,78 +245,11 @@ void show_regs(struct pt_regs *regs)
show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}
-/*
- * Free current thread data structures etc..
- */
-void exit_thread(void)
-{
- struct task_struct *me = current;
- struct thread_struct *t = &me->thread;
-
- if (me->thread.io_bitmap_ptr) {
-#ifndef CONFIG_X86_NO_TSS
- struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
-#endif
-#ifdef CONFIG_XEN
- struct physdev_set_iobitmap iobmp_op;
- memset(&iobmp_op, 0, sizeof(iobmp_op));
-#endif
-
- kfree(t->io_bitmap_ptr);
- t->io_bitmap_ptr = NULL;
- clear_thread_flag(TIF_IO_BITMAP);
- /*
- * Careful, clear this in the TSS too:
- */
-#ifndef CONFIG_X86_NO_TSS
- memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
- put_cpu();
-#endif
-#ifdef CONFIG_XEN
- WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
- &iobmp_op));
-#endif
- t->io_bitmap_max = 0;
- }
-
- ds_exit_thread(current);
-}
-
void xen_load_gs_index(unsigned gs)
{
WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs));
}
-void flush_thread(void)
-{
- struct task_struct *tsk = current;
-
- if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
- clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
- if (test_tsk_thread_flag(tsk, TIF_IA32)) {
- clear_tsk_thread_flag(tsk, TIF_IA32);
- } else {
- set_tsk_thread_flag(tsk, TIF_IA32);
- current_thread_info()->status |= TS_COMPAT;
- }
- }
- clear_tsk_thread_flag(tsk, TIF_DEBUG);
-
- tsk->thread.debugreg0 = 0;
- tsk->thread.debugreg1 = 0;
- tsk->thread.debugreg2 = 0;
- tsk->thread.debugreg3 = 0;
- tsk->thread.debugreg6 = 0;
- tsk->thread.debugreg7 = 0;
- memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
- /*
- * Forget coprocessor state..
- */
- tsk->fpu_counter = 0;
- clear_fpu(tsk);
- clear_used_math();
-}
-
void release_thread(struct task_struct *dead_task)
{
if (dead_task->mm) {
@@ -343,7 +291,7 @@ void prepare_to_copy(struct task_struct
unlazy_fpu(tsk);
}
-int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
+int copy_thread(unsigned long clone_flags, unsigned long sp,
unsigned long unused,
struct task_struct *p, struct pt_regs *regs)
{
@@ -434,103 +382,6 @@ start_thread(struct pt_regs *regs, unsig
}
EXPORT_SYMBOL_GPL(start_thread);
-static void hard_disable_TSC(void)
-{
- write_cr4(read_cr4() | X86_CR4_TSD);
-}
-
-void disable_TSC(void)
-{
- preempt_disable();
- if (!test_and_set_thread_flag(TIF_NOTSC))
- /*
- * Must flip the CPU state synchronously with
- * TIF_NOTSC in the current running context.
- */
- hard_disable_TSC();
- preempt_enable();
-}
-
-static void hard_enable_TSC(void)
-{
- write_cr4(read_cr4() & ~X86_CR4_TSD);
-}
-
-static void enable_TSC(void)
-{
- preempt_disable();
- if (test_and_clear_thread_flag(TIF_NOTSC))
- /*
- * Must flip the CPU state synchronously with
- * TIF_NOTSC in the current running context.
- */
- hard_enable_TSC();
- preempt_enable();
-}
-
-int get_tsc_mode(unsigned long adr)
-{
- unsigned int val;
-
- if (test_thread_flag(TIF_NOTSC))
- val = PR_TSC_SIGSEGV;
- else
- val = PR_TSC_ENABLE;
-
- return put_user(val, (unsigned int __user *)adr);
-}
-
-int set_tsc_mode(unsigned int val)
-{
- if (val == PR_TSC_SIGSEGV)
- disable_TSC();
- else if (val == PR_TSC_ENABLE)
- enable_TSC();
- else
- return -EINVAL;
-
- return 0;
-}
-
-/*
- * This special macro can be used to load a debugging register
- */
-#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
-
-static inline void __switch_to_xtra(struct task_struct *prev_p,
- struct task_struct *next_p)
-{
- struct thread_struct *prev, *next;
-
- prev = &prev_p->thread,
- next = &next_p->thread;
-
- if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
- test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
- ds_switch_to(prev_p, next_p);
- else if (next->debugctlmsr != prev->debugctlmsr)
- update_debugctlmsr(next->debugctlmsr);
-
- if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
- loaddebug(next, 0);
- loaddebug(next, 1);
- loaddebug(next, 2);
- loaddebug(next, 3);
- /* no 4 and 5 */
- loaddebug(next, 6);
- loaddebug(next, 7);
- }
-
- if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
- test_tsk_thread_flag(next_p, TIF_NOTSC)) {
- /* prev and next are different */
- if (test_tsk_thread_flag(next_p, TIF_NOTSC))
- hard_disable_TSC();
- else
- hard_enable_TSC();
- }
-}
-
/*
* switch_to(x,y) should switch tasks from x to y.
*
@@ -596,7 +447,7 @@ __switch_to(struct task_struct *prev_p,
if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \
next->tls_array[i].b != prev->tls_array[i].b)) { \
mcl->op = __HYPERVISOR_update_descriptor; \
- mcl->args[0] = virt_to_machine( \
+ mcl->args[0] = arbitrary_virt_to_machine( \
&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\
mcl->args[1] = *(u64 *)&next->tls_array[i]; \
mcl++; \
@@ -683,19 +534,11 @@ __switch_to(struct task_struct *prev_p,
/*
* Switch the PDA context.
*/
- write_pda(pcurrent, next_p);
- write_pda(kernelstack,
- (unsigned long)task_stack_page(next_p) +
- THREAD_SIZE - PDA_STACKOFFSET);
-#ifdef CONFIG_CC_STACKPROTECTOR
- write_pda(stack_canary, next_p->stack_canary);
+ percpu_write(current_task, next_p);
- /*
- * Build time only check to make sure the stack_canary is at
- * offset 40 in the pda; this is a gcc ABI requirement
- */
- BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
-#endif
+ percpu_write(kernel_stack,
+ (unsigned long)task_stack_page(next_p) +
+ THREAD_SIZE - KERNEL_STACK_OFFSET);
/*
* Now maybe reload the debug registers
@@ -749,11 +592,6 @@ void set_personality_64bit(void)
current->personality &= ~READ_IMPLIES_EXEC;
}
-asmlinkage long sys_fork(struct pt_regs *regs)
-{
- return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
-}
-
asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
@@ -763,22 +601,6 @@ sys_clone(unsigned long clone_flags, uns
return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
-/*
- * This is trivial, and on the face of it looks like it
- * could equally well be done in user mode.
- *
- * Not so, for quite unobvious reasons - register pressure.
- * In user mode vfork() cannot have a stack frame, and if
- * done by calling the "clone()" system call directly, you
- * do not have enough call-clobbered registers to hold all
- * the information you need.
- */
-asmlinkage long sys_vfork(struct pt_regs *regs)
-{
- return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
- NULL, NULL);
-}
-
unsigned long get_wchan(struct task_struct *p)
{
unsigned long stack;
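Aside (illustrative, not part of the patch): the removals above follow 2.6.30's elimination of the x86-64 PDA, moving fields such as pcurrent and kernelstack onto ordinary per-cpu variables, so write_pda(field, v) becomes percpu_write(var, v). A sketch of the idiom; demo_task and the two helpers are made-up names standing in for the current_task variable the patch defines:

#include <linux/percpu.h>
#include <linux/sched.h>

static DEFINE_PER_CPU(struct task_struct *, demo_task);

static void set_task_on_this_cpu(struct task_struct *next)
{
	percpu_write(demo_task, next);	/* was: write_pda(pcurrent, next) */
}

static struct task_struct *task_on_this_cpu(void)
{
	return percpu_read(demo_task);
}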
--- head-2010-05-25.orig/arch/x86/kernel/quirks-xen.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/quirks-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -75,8 +75,7 @@ static void ich_force_hpet_resume(void)
if (!force_hpet_address)
return;
- if (rcba_base == NULL)
- BUG();
+ BUG_ON(rcba_base == NULL);
/* read the Function Disable register, dword mode only */
val = readl(rcba_base + 0x3404);
@@ -173,7 +172,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_I
ich_force_enable_hpet);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7,
ich_force_enable_hpet);
-
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x3a16, /* ICH10 */
+ ich_force_enable_hpet);
static struct pci_dev *cached_dev;
@@ -262,8 +262,6 @@ static void old_ich_force_enable_hpet_us
{
if (hpet_force_user)
old_ich_force_enable_hpet(dev);
- else
- hpet_print_force_info();
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB_1,
--- head-2010-05-25.orig/arch/x86/kernel/setup-xen.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/setup-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -74,14 +74,15 @@
#include <asm/e820.h>
#include <asm/mpspec.h>
#include <asm/setup.h>
-#include <asm/arch_hooks.h>
#include <asm/efi.h>
+#include <asm/timer.h>
+#include <asm/i8259.h>
#include <asm/sections.h>
#include <asm/dmi.h>
#include <asm/io_apic.h>
#include <asm/ist.h>
#include <asm/vmi.h>
-#include <setup_arch.h>
+#include <asm/setup_arch.h>
#include <asm/bios_ebda.h>
#include <asm/cacheflush.h>
#include <asm/processor.h>
@@ -89,7 +90,7 @@
#include <asm/system.h>
#include <asm/vsyscall.h>
-#include <asm/smp.h>
+#include <asm/cpu.h>
#include <asm/desc.h>
#include <asm/dma.h>
#include <asm/iommu.h>
@@ -97,7 +98,6 @@
#include <asm/mmu_context.h>
#include <asm/proto.h>
-#include <mach_apic.h>
#include <asm/paravirt.h>
#include <asm/hypervisor.h>
@@ -118,9 +118,6 @@
#include <xen/firmware.h>
#include <xen/xencons.h>
-shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
-EXPORT_SYMBOL(HYPERVISOR_shared_info);
-
static int xen_panic_event(struct notifier_block *, unsigned long, void *);
static struct notifier_block xen_panic_block = {
xen_panic_event, NULL, 0 /* try to go last */
@@ -145,7 +142,26 @@ EXPORT_SYMBOL(xen_start_info);
#define ARCH_SETUP
#endif
+RESERVE_BRK(dmi_alloc, 65536);
+
+unsigned int boot_cpu_id __read_mostly;
+
+static __initdata unsigned long _brk_start = (unsigned long)__brk_base;
+unsigned long _brk_end = (unsigned long)__brk_base;
+
#ifndef CONFIG_XEN
+#ifdef CONFIG_X86_64
+int default_cpu_present_to_apicid(int mps_cpu)
+{
+ return __default_cpu_present_to_apicid(mps_cpu);
+}
+
+int default_check_phys_apicid_present(int boot_cpu_physical_apicid)
+{
+ return __default_check_phys_apicid_present(boot_cpu_physical_apicid);
+}
+#endif
+
#ifndef CONFIG_DEBUG_BOOT_PARAMS
struct boot_params __initdata boot_params;
#else
@@ -179,14 +195,6 @@ static struct resource bss_resource = {
#ifdef CONFIG_X86_32
-#ifndef CONFIG_XEN
-/* This value is set up by the early boot code to point to the value
- immediately after the boot time page tables. It contains a *physical*
- address, and must not be in the .bss segment! */
-unsigned long init_pg_tables_start __initdata = ~0UL;
-unsigned long init_pg_tables_end __initdata = ~0UL;
-#endif
-
static struct resource video_ram_resource = {
.name = "Video RAM area",
.start = 0xa0000,
@@ -226,7 +234,9 @@ struct ist_info ist_info;
#endif
#else
-struct cpuinfo_x86 boot_cpu_data __read_mostly;
+struct cpuinfo_x86 boot_cpu_data __read_mostly = {
+ .x86_phys_bits = MAX_PHYSMEM_BITS,
+};
EXPORT_SYMBOL(boot_cpu_data);
#endif
@@ -241,12 +251,6 @@ unsigned long mmu_cr4_features = X86_CR4
int bootloader_type;
/*
- * Early DMI memory
- */
-int dmi_alloc_index;
-char dmi_alloc_data[DMI_MAX_DATA];
-
-/*
* Setup options
*/
struct screen_info screen_info;
@@ -293,6 +297,35 @@ static inline void copy_edd(void)
}
#endif
+void * __init extend_brk(size_t size, size_t align)
+{
+ size_t mask = align - 1;
+ void *ret;
+
+ BUG_ON(_brk_start == 0);
+ BUG_ON(align & mask);
+
+ _brk_end = (_brk_end + mask) & ~mask;
+ BUG_ON((char *)(_brk_end + size) > __brk_limit);
+
+ ret = (void *)_brk_end;
+ _brk_end += size;
+
+ memset(ret, 0, size);
+
+ return ret;
+}
+
+static void __init reserve_brk(void)
+{
+ if (_brk_end > _brk_start)
+ reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK");
+
+ /* Mark brk area as locked down and no longer taking any
+ new allocations */
+ _brk_start = 0;
+}
+
#ifdef CONFIG_BLK_DEV_INITRD
#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
@@ -653,24 +686,7 @@ static int __init setup_elfcorehdr(char
early_param("elfcorehdr", setup_elfcorehdr);
#endif
-#ifndef CONFIG_XEN
-static int __init default_update_genapic(void)
-{
-#ifdef CONFIG_X86_SMP
-# if defined(CONFIG_X86_GENERICARCH) || defined(CONFIG_X86_64)
- genapic->wakeup_cpu = wakeup_secondary_cpu_via_init;
-# endif
-#endif
-
- return 0;
-}
-#else
-#define default_update_genapic NULL
-#endif
-
-static struct x86_quirks default_x86_quirks __initdata = {
- .update_genapic = default_update_genapic,
-};
+static struct x86_quirks default_x86_quirks __initdata;
struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
@@ -738,19 +754,11 @@ void __init setup_arch(char **cmdline_p)
/* Register a call for panic conditions. */
atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
-
- WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
- VMASST_TYPE_writable_pagetables));
-#ifdef CONFIG_X86_32
- WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
- VMASST_TYPE_4gb_segments));
-#endif
#endif /* CONFIG_XEN */
#ifdef CONFIG_X86_32
memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
visws_early_detect();
- pre_setup_arch_hook();
#else
printk(KERN_INFO "Command line: %s\n", boot_command_line);
#endif
@@ -834,16 +842,7 @@ void __init setup_arch(char **cmdline_p)
init_mm.start_code = (unsigned long) _text;
init_mm.end_code = (unsigned long) _etext;
init_mm.end_data = (unsigned long) _edata;
-#ifdef CONFIG_X86_32
-#ifndef CONFIG_XEN
- init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
-#else
- init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
- xen_start_info->nr_pt_frames) << PAGE_SHIFT;
-#endif
-#else
- init_mm.brk = (unsigned long) &_end;
-#endif
+ init_mm.brk = _brk_end;
code_resource.start = virt_to_phys(_text);
code_resource.end = virt_to_phys(_etext)-1;
@@ -956,9 +955,8 @@ void __init setup_arch(char **cmdline_p)
num_physpages = max_pfn;
max_mapnr = max_pfn;
-#ifndef CONFIG_XEN
- if (cpu_has_x2apic)
- check_x2apic();
+#ifdef CONFIG_X86_LOCAL_APIC
+ check_x2apic();
#endif
/* How many end-of-memory variables you have, grandma! */
@@ -975,6 +973,8 @@ void __init setup_arch(char **cmdline_p)
setup_bios_corruption_check();
#endif
+ reserve_brk();
+
/* max_pfn_mapped is updated here */
max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
max_pfn_mapped = max_low_pfn_mapped;
@@ -999,7 +999,7 @@ void __init setup_arch(char **cmdline_p)
reserve_initrd();
-#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
+#ifndef CONFIG_XEN
vsmp_init();
#endif
@@ -1034,12 +1034,11 @@ void __init setup_arch(char **cmdline_p)
*/
acpi_reserve_bootmem();
#endif
-#ifdef CONFIG_X86_FIND_SMP_CONFIG
/*
* Find and reserve possible boot-time SMP configuration:
*/
find_smp_config();
-#endif
+
reserve_crashkernel();
#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
@@ -1140,13 +1139,9 @@ void __init setup_arch(char **cmdline_p)
for (i = 0; i < MAX_DMA_CHANNELS; ++i)
if (i != 4 && request_dma(i, "xen") != 0)
BUG();
-#endif /* CONFIG_XEN */
-
-#ifdef CONFIG_X86_GENERICARCH
+#else /* CONFIG_XEN */
generic_apic_probe();
-#endif
-#ifndef CONFIG_XEN
early_quirks();
#endif
@@ -1221,6 +1216,98 @@ void __init setup_arch(char **cmdline_p)
#endif /* CONFIG_XEN */
}
+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
+
+/**
+ * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
+ *
+ * Description:
+ * Perform any necessary interrupt initialisation prior to setting up
+ * the "ordinary" interrupt call gates. For legacy reasons, the ISA
+ * interrupts should be initialised here if the machine emulates a PC
+ * in any way.
+ **/
+void __init x86_quirk_pre_intr_init(void)
+{
+ if (x86_quirks->arch_pre_intr_init) {
+ if (x86_quirks->arch_pre_intr_init())
+ return;
+ }
+ init_ISA_irqs();
+}
+
+/**
+ * x86_quirk_intr_init - post gate setup interrupt initialisation
+ *
+ * Description:
+ * Fill in any interrupts that may have been left out by the general
+ * init_IRQ() routine. interrupts having to do with the machine rather
+ * than the devices on the I/O bus (like APIC interrupts in intel MP
+ * systems) are started here.
+ **/
+void __init x86_quirk_intr_init(void)
+{
+ if (x86_quirks->arch_intr_init) {
+ if (x86_quirks->arch_intr_init())
+ return;
+ }
+}
+
+/**
+ * x86_quirk_trap_init - initialise system specific traps
+ *
+ * Description:
+ * Called as the final act of trap_init(). Used in VISWS to initialise
+ * the various board specific APIC traps.
+ **/
+void __init x86_quirk_trap_init(void)
+{
+ if (x86_quirks->arch_trap_init) {
+ if (x86_quirks->arch_trap_init())
+ return;
+ }
+}
+
+static struct irqaction irq0 = {
+ .handler = timer_interrupt,
+ .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
+ .name = "timer"
+};
+
+/**
+ * x86_quirk_pre_time_init - do any specific initialisations before.
+ *
+ **/
+void __init x86_quirk_pre_time_init(void)
+{
+ if (x86_quirks->arch_pre_time_init)
+ x86_quirks->arch_pre_time_init();
+}
+
+/**
+ * x86_quirk_time_init - do any specific initialisations for the system timer.
+ *
+ * Description:
+ * Must plug the system timer interrupt source at HZ into the IRQ listed
+ * in irq_vectors.h:TIMER_IRQ
+ **/
+void __init x86_quirk_time_init(void)
+{
+ if (x86_quirks->arch_time_init) {
+ /*
+ * A nonzero return code does not mean failure, it means
+ * that the architecture quirk does not want any
+ * generic (timer) setup to be performed after this:
+ */
+ if (x86_quirks->arch_time_init())
+ return;
+ }
+
+ irq0.mask = cpumask_of_cpu(0);
+ setup_irq(0, &irq0);
+}
+#endif /* CONFIG_X86_32 */
+
#ifdef CONFIG_XEN
static int
xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
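Aside (illustrative, not part of the patch): extend_brk() above hands out zeroed early-boot memory from the region between __brk_base and __brk_limit, and RESERVE_BRK() (as with the dmi_alloc reservation earlier in this file) grows that region at build time; reserve_brk() later locks it down. A usage sketch with a made-up reservation name, my_early_table:

#include <asm/setup.h>

/* Reserve 4 KiB of brk space for a hypothetical early table. */
RESERVE_BRK(my_early_table, 4096);

static void __init alloc_my_early_table(void)
{
	/* Zeroed memory at the requested alignment; BUGs once the brk is
	 * exhausted or after reserve_brk() has already run. */
	void *tbl = extend_brk(4096, PAGE_SIZE);

	/* ... populate tbl during early setup_arch() ... */
}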
--- head-2010-05-25.orig/arch/x86/kernel/setup_percpu.c 2010-05-25 09:12:09.000000000 +0200
+++ head-2010-05-25/arch/x86/kernel/setup_percpu.c 2010-03-24 15:25:06.000000000 +0100
@@ -231,7 +231,7 @@ void __init setup_per_cpu_areas(void)
* are zeroed indicating that the static arrays are
* gone.
*/
-#ifdef CONFIG_X86_LOCAL_APIC
+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
per_cpu(x86_cpu_to_apicid, cpu) =
early_per_cpu_map(x86_cpu_to_apicid, cpu);
per_cpu(x86_bios_cpu_apicid, cpu) =
@@ -255,7 +255,7 @@ void __init setup_per_cpu_areas(void)
}
/* indicate the early static arrays will soon be gone */
-#ifdef CONFIG_X86_LOCAL_APIC
+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
#endif
--- head-2010-05-25.orig/arch/x86/kernel/smp-xen.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/smp-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -2,7 +2,7 @@
* Intel SMP support routines.
*
* (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
- * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
+ * (c) 1998-99, 2000, 2009 Ingo Molnar <mingo@redhat.com>
* (c) 2002,2003 Andi Kleen, SuSE Labs.
*
* i386 and x86_64 integration by Glauber Costa <gcosta@redhat.com>
@@ -26,7 +26,7 @@
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
-#include <mach_ipi.h>
+#include <asm/ipi.h>
#include <xen/evtchn.h>
/*
* Some notes on x86 processor bugs affecting SMP operation:
@@ -118,17 +118,17 @@ void xen_smp_send_reschedule(int cpu)
WARN_ON(1);
return;
}
- send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
+ xen_send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
}
void xen_send_call_func_single_ipi(int cpu)
{
- send_IPI_mask(cpumask_of(cpu), CALL_FUNC_SINGLE_VECTOR);
+ xen_send_IPI_mask(cpumask_of(cpu), CALL_FUNC_SINGLE_VECTOR);
}
void xen_send_call_func_ipi(const struct cpumask *mask)
{
- send_IPI_mask_allbutself(mask, CALL_FUNCTION_VECTOR);
+ xen_send_IPI_mask_allbutself(mask, CALL_FUNCTION_VECTOR);
}
/*
--- head-2010-05-25.orig/arch/x86/kernel/time-xen.c 2010-05-11 17:14:09.000000000 +0200
+++ head-2010-05-25/arch/x86/kernel/time-xen.c 2010-05-12 09:02:08.000000000 +0200
@@ -554,7 +554,8 @@ irqreturn_t timer_interrupt(int irq, voi
ct = jiffies_to_cputime(delta_cpu);
if (user_mode_vm(get_irq_regs()))
account_user_time(current, ct, cputime_to_scaled(ct));
- else if (current != idle_task(cpu))
+ else if (current != idle_task(cpu)
+ || irq_count() != HARDIRQ_OFFSET)
account_system_time(current, HARDIRQ_OFFSET,
ct, cputime_to_scaled(ct));
else
@@ -587,7 +588,7 @@ EXPORT_SYMBOL_GPL(mark_tsc_unstable);
static cycle_t cs_last;
-static cycle_t xen_clocksource_read(void)
+static cycle_t xen_clocksource_read(struct clocksource *cs)
{
#ifdef CONFIG_SMP
cycle_t last = get64(&cs_last);
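Aside (illustrative, not part of the patch): the signature change above tracks the 2.6.30 clocksource API, where the ->read hook now receives the struct clocksource it belongs to. A minimal registration sketch under that API; demo_read and demo_clocksource are made-up names and the rating is arbitrary:

#include <linux/clocksource.h>

static cycle_t demo_read(struct clocksource *cs)
{
	return 0;	/* a real implementation samples its hardware counter */
}

static struct clocksource demo_clocksource = {
	.name	= "demo",
	.rating	= 100,
	.read	= demo_read,	/* note the new struct clocksource * argument */
	.mask	= CLOCKSOURCE_MASK(64),
};

/* Registered during boot with clocksource_register(&demo_clocksource). */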
--- head-2010-05-25.orig/arch/x86/kernel/traps-xen.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/traps-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -54,15 +54,14 @@
#include <asm/desc.h>
#include <asm/i387.h>
-#include <mach_traps.h>
+#include <asm/mach_traps.h>
#ifdef CONFIG_X86_64
#include <asm/pgalloc.h>
#include <asm/proto.h>
-#include <asm/pda.h>
#else
#include <asm/processor-flags.h>
-#include <asm/arch_hooks.h>
+#include <asm/setup.h>
#include <asm/traps.h>
#include "cpu/mcheck/mce.h"
@@ -123,49 +122,6 @@ die_if_kernel(const char *str, struct pt
if (!user_mode_vm(regs))
die(str, regs, err);
}
-
-/*
- * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
- * invalid offset set (the LAZY one) and the faulting thread has
- * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS,
- * we set the offset field correctly and return 1.
- */
-static int lazy_iobitmap_copy(void)
-{
-#ifndef CONFIG_XEN
- struct thread_struct *thread;
- struct tss_struct *tss;
- int cpu;
-
- cpu = get_cpu();
- tss = &per_cpu(init_tss, cpu);
- thread = &current->thread;
-
- if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
- thread->io_bitmap_ptr) {
- memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
- thread->io_bitmap_max);
- /*
- * If the previously set map was extending to higher ports
- * than the current one, pad extra space with 0xff (no access).
- */
- if (thread->io_bitmap_max < tss->io_bitmap_max) {
- memset((char *) tss->io_bitmap +
- thread->io_bitmap_max, 0xff,
- tss->io_bitmap_max - thread->io_bitmap_max);
- }
- tss->io_bitmap_max = thread->io_bitmap_max;
- tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
- tss->io_bitmap_owner = thread;
- put_cpu();
-
- return 1;
- }
- put_cpu();
-#endif
-
- return 0;
-}
#endif
static void __kprobes
@@ -316,11 +272,6 @@ do_general_protection(struct pt_regs *re
conditional_sti(regs);
#ifdef CONFIG_X86_32
- if (lazy_iobitmap_copy()) {
- /* restart the faulting instruction */
- return;
- }
-
if (regs->flags & X86_VM_MASK)
goto gp_in_vm86;
#endif
@@ -911,19 +862,20 @@ void math_emulate(struct math_emu_info *
}
#endif /* CONFIG_MATH_EMULATION */
-dotraplinkage void __kprobes do_device_not_available(struct pt_regs regs)
+dotraplinkage void __kprobes
+do_device_not_available(struct pt_regs *regs, long error_code)
{
#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
if (read_cr0() & X86_CR0_EM) {
struct math_emu_info info = { };
- conditional_sti(&regs);
+ conditional_sti(regs);
- info.regs = &regs;
+ info.regs = regs;
math_emulate(&info);
} else {
math_state_restore(); /* interrupts still off */
- conditional_sti(&regs);
+ conditional_sti(regs);
}
#else
math_state_restore();
@@ -939,7 +891,7 @@ dotraplinkage void do_iret_error(struct
info.si_signo = SIGILL;
info.si_errno = 0;
info.si_code = ILL_BADSTK;
- info.si_addr = 0;
+ info.si_addr = NULL;
if (notify_die(DIE_TRAP, "iret exception",
regs, error_code, 32, SIGILL) == NOTIFY_STOP)
return;
--- head-2010-05-25.orig/arch/x86/kernel/vmlinux.lds.S 2010-03-24 15:08:58.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/vmlinux.lds.S 2010-03-24 15:25:06.000000000 +0100
@@ -16,8 +16,10 @@
#ifdef CONFIG_X86_32
#define LOAD_OFFSET __PAGE_OFFSET
-#else
+#elif !defined(CONFIG_XEN) || CONFIG_XEN_COMPAT > 0x030002
#define LOAD_OFFSET __START_KERNEL_map
+#else
+#define LOAD_OFFSET 0
#endif
#include <asm-generic/vmlinux.lds.h>
--- head-2010-05-25.orig/arch/x86/mach-xen/Makefile 2007-06-12 13:12:48.000000000 +0200
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,5 +0,0 @@
-#
-# Makefile for the linux kernel.
-#
-
-obj-y := setup.o
--- head-2010-05-25.orig/arch/x86/mach-xen/setup.c 2010-03-24 15:12:46.000000000 +0100
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,190 +0,0 @@
-/*
- * Machine specific setup for generic
- */
-
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <asm/acpi.h>
-#include <asm/arch_hooks.h>
-#include <asm/e820.h>
-#include <asm/setup.h>
-#include <asm/fixmap.h>
-#include <asm/pgtable.h>
-
-#include <xen/interface/callback.h>
-#include <xen/interface/memory.h>
-
-#ifdef CONFIG_X86_32
-
-#ifdef CONFIG_HOTPLUG_CPU
-#define DEFAULT_SEND_IPI (1)
-#else
-#define DEFAULT_SEND_IPI (0)
-#endif
-
-int no_broadcast=DEFAULT_SEND_IPI;
-
-static __init int no_ipi_broadcast(char *str)
-{
- get_option(&str, &no_broadcast);
- printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" :
- "IPI Broadcast");
- return 1;
-}
-
-__setup("no_ipi_broadcast", no_ipi_broadcast);
-
-static int __init print_ipi_mode(void)
-{
- printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" :
- "Shortcut");
- return 0;
-}
-
-late_initcall(print_ipi_mode);
-
-unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
-EXPORT_SYMBOL(machine_to_phys_mapping);
-unsigned int machine_to_phys_order;
-EXPORT_SYMBOL(machine_to_phys_order);
-
-void __init pre_setup_arch_hook(void)
-{
- struct xen_machphys_mapping mapping;
- unsigned long machine_to_phys_nr_ents;
- struct xen_platform_parameters pp;
-
- init_mm.pgd = swapper_pg_dir = (pgd_t *)xen_start_info->pt_base;
-
- xen_setup_features();
-
- if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) {
- hypervisor_virt_start = pp.virt_start;
- reserve_top_address(0UL - pp.virt_start);
- }
-
- if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
- machine_to_phys_mapping = (unsigned long *)mapping.v_start;
- machine_to_phys_nr_ents = mapping.max_mfn + 1;
- } else
- machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
- machine_to_phys_order = fls(machine_to_phys_nr_ents - 1);
-
- if (!xen_feature(XENFEAT_auto_translated_physmap))
- phys_to_machine_mapping =
- (unsigned long *)xen_start_info->mfn_list;
-}
-
-#endif /* CONFIG_X86_32 */
-
-extern void hypervisor_callback(void);
-extern void failsafe_callback(void);
-extern void nmi(void);
-
-#ifdef CONFIG_X86_64
-#include <asm/proto.h>
-#define CALLBACK_ADDR(fn) ((unsigned long)(fn))
-#else
-#define CALLBACK_ADDR(fn) { __KERNEL_CS, (unsigned long)(fn) }
-#endif
-
-void __init machine_specific_arch_setup(void)
-{
- int ret;
- static struct callback_register __initdata event = {
- .type = CALLBACKTYPE_event,
- .address = CALLBACK_ADDR(hypervisor_callback)
- };
- static struct callback_register __initdata failsafe = {
- .type = CALLBACKTYPE_failsafe,
- .address = CALLBACK_ADDR(failsafe_callback)
- };
-#ifdef CONFIG_X86_64
- static struct callback_register __initdata syscall = {
- .type = CALLBACKTYPE_syscall,
- .address = CALLBACK_ADDR(system_call)
- };
-#endif
-#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_X86_32)
- static struct callback_register __initdata nmi_cb = {
- .type = CALLBACKTYPE_nmi,
- .address = CALLBACK_ADDR(nmi)
- };
-#endif
-
- ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
- if (ret == 0)
- ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
-#ifdef CONFIG_X86_64
- if (ret == 0)
- ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
-#endif
-#if CONFIG_XEN_COMPAT <= 0x030002
-#ifdef CONFIG_X86_32
- if (ret == -ENOSYS)
- ret = HYPERVISOR_set_callbacks(
- event.address.cs, event.address.eip,
- failsafe.address.cs, failsafe.address.eip);
-#else
- ret = HYPERVISOR_set_callbacks(
- event.address,
- failsafe.address,
- syscall.address);
-#endif
-#endif
- BUG_ON(ret);
-
-#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_X86_32)
- ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
-#if CONFIG_XEN_COMPAT <= 0x030002
- if (ret == -ENOSYS) {
- static struct xennmi_callback __initdata cb = {
- .handler_address = (unsigned long)nmi
- };
-
- HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
- }
-#endif
-#endif
-
-#ifdef CONFIG_X86_32
- /* Do an early initialization of the fixmap area */
- {
- extern pte_t swapper_pg_fixmap[PTRS_PER_PTE];
- unsigned long addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
- pud_t *pud = pud_offset(swapper_pg_dir + pgd_index(addr), addr);
- pmd_t *pmd = pmd_offset(pud, addr);
- unsigned int i;
-
- make_lowmem_page_readonly(swapper_pg_fixmap, XENFEAT_writable_page_tables);
- set_pmd(pmd, __pmd(__pa_symbol(swapper_pg_fixmap) | _PAGE_TABLE));
-
-#define __FIXADDR_TOP (-PAGE_SIZE)
-#define FIX_BUG_ON(fix) BUILD_BUG_ON(pmd_index(__fix_to_virt(FIX_##fix)) \
- != pmd_index(__fix_to_virt(FIX_EARLYCON_MEM_BASE)))
- FIX_BUG_ON(SHARED_INFO);
- FIX_BUG_ON(ISAMAP_BEGIN);
- FIX_BUG_ON(ISAMAP_END);
-#undef __FIXADDR_TOP
- BUG_ON(pte_index(hypervisor_virt_start));
-
- /* Switch to the real shared_info page, and clear the
- * dummy page. */
- set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
- HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
- memset(empty_zero_page, 0, sizeof(empty_zero_page));
-
- /* Setup mapping of lower 1st MB */
- for (i = 0; i < NR_FIX_ISAMAPS; i++)
- if (is_initial_xendomain())
- set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
- else
- __set_fixmap(FIX_ISAMAP_BEGIN - i,
- virt_to_machine(empty_zero_page),
- PAGE_KERNEL_RO);
- }
-#endif
-}
--- head-2010-05-25.orig/arch/x86/mm/Makefile 2010-03-24 15:01:37.000000000 +0100
+++ head-2010-05-25/arch/x86/mm/Makefile 2010-03-24 15:25:06.000000000 +0100
@@ -26,5 +26,6 @@ obj-$(CONFIG_K8_NUMA) += k8topology_64.
obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
obj-$(CONFIG_XEN) += hypervisor.o
+disabled-obj-$(CONFIG_XEN) := tlb.o
obj-$(CONFIG_MEMTEST) += memtest.o
--- head-2010-05-25.orig/arch/x86/mm/fault-xen.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/mm/fault-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -1,73 +1,79 @@
/*
* Copyright (C) 1995 Linus Torvalds
- * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
+ * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
+ * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
*/
-
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/ptrace.h>
-#include <linux/mmiotrace.h>
-#include <linux/mman.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
#include <linux/interrupt.h>
-#include <linux/init.h>
-#include <linux/tty.h>
-#include <linux/vt_kern.h> /* For unblank_screen() */
+#include <linux/mmiotrace.h>
+#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/highmem.h>
-#include <linux/bootmem.h> /* for max_low_pfn */
-#include <linux/vmalloc.h>
-#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/vt_kern.h>
+#include <linux/signal.h>
+#include <linux/kernel.h>
+#include <linux/ptrace.h>
+#include <linux/string.h>
+#include <linux/module.h>
#include <linux/kdebug.h>
+#include <linux/errno.h>
+#include <linux/magic.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/mman.h>
+#include <linux/tty.h>
+#include <linux/smp.h>
+#include <linux/mm.h>
+
+#include <asm-generic/sections.h>
-#include <asm/system.h>
-#include <asm/desc.h>
-#include <asm/segment.h>
-#include <asm/pgalloc.h>
-#include <asm/smp.h>
#include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
+#include <asm/segment.h>
+#include <asm/system.h>
#include <asm/proto.h>
-#include <asm-generic/sections.h>
#include <asm/traps.h>
+#include <asm/desc.h>
/*
- * Page fault error code bits
- * bit 0 == 0 means no page found, 1 means protection fault
- * bit 1 == 0 means read, 1 means write
- * bit 2 == 0 means kernel, 1 means user-mode
- * bit 3 == 1 means use of reserved bit detected
- * bit 4 == 1 means fault was an instruction fetch
- */
-#define PF_PROT (1<<0)
-#define PF_WRITE (1<<1)
-#define PF_USER (1<<2)
-#define PF_RSVD (1<<3)
-#define PF_INSTR (1<<4)
+ * Page fault error code bits:
+ *
+ * bit 0 == 0: no page found 1: protection fault
+ * bit 1 == 0: read access 1: write access
+ * bit 2 == 0: kernel-mode access 1: user-mode access
+ * bit 3 == 1: use of reserved bit detected
+ * bit 4 == 1: fault was an instruction fetch
+ */
+enum x86_pf_error_code {
+ PF_PROT = 1 << 0,
+ PF_WRITE = 1 << 1,
+ PF_USER = 1 << 2,
+ PF_RSVD = 1 << 3,
+ PF_INSTR = 1 << 4,
+};
+
+/*
+ * Returns 0 if mmiotrace is disabled, or if the fault is not
+ * handled by mmiotrace:
+ */
static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
-#ifdef CONFIG_MMIOTRACE
if (unlikely(is_kmmio_active()))
if (kmmio_handler(regs, addr) == 1)
return -1;
-#endif
return 0;
}
static inline int notify_page_fault(struct pt_regs *regs)
{
-#ifdef CONFIG_KPROBES
int ret = 0;
/* kprobe_running() needs smp_processor_id() */
- if (!user_mode_vm(regs)) {
+ if (kprobes_built_in() && !user_mode_vm(regs)) {
preempt_disable();
if (kprobe_running() && kprobe_fault_handler(regs, 14))
ret = 1;
@@ -75,29 +81,76 @@ static inline int notify_page_fault(stru
}
return ret;
-#else
- return 0;
-#endif
}
/*
- * X86_32
- * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
- * Check that here and ignore it.
- *
- * X86_64
- * Sometimes the CPU reports invalid exceptions on prefetch.
- * Check that here and ignore it.
+ * Prefetch quirks:
+ *
+ * 32-bit mode:
*
- * Opcode checker based on code by Richard Brunner
+ * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
+ * Check that here and ignore it.
+ *
+ * 64-bit mode:
+ *
+ * Sometimes the CPU reports invalid exceptions on prefetch.
+ * Check that here and ignore it.
+ *
+ * Opcode checker based on code by Richard Brunner.
*/
-static int is_prefetch(struct pt_regs *regs, unsigned long addr,
- unsigned long error_code)
+static inline int
+check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
+ unsigned char opcode, int *prefetch)
{
+ unsigned char instr_hi = opcode & 0xf0;
+ unsigned char instr_lo = opcode & 0x0f;
+
+ switch (instr_hi) {
+ case 0x20:
+ case 0x30:
+ /*
+ * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
+ * In X86_64 long mode, the CPU will signal invalid
+ * opcode if some of these prefixes are present so
+ * X86_64 will never get here anyway
+ */
+ return ((instr_lo & 7) == 0x6);
+#ifdef CONFIG_X86_64
+ case 0x40:
+ /*
+ * In AMD64 long mode 0x40..0x4F are valid REX prefixes
+ * Need to figure out under what instruction mode the
+ * instruction was issued. Could check the LDT for lm,
+ * but for now it's good enough to assume that long
+ * mode only uses well known segments or kernel.
+ */
+ return (!user_mode(regs)) || (regs->cs == __USER_CS);
+#endif
+ case 0x60:
+ /* 0x64 thru 0x67 are valid prefixes in all modes. */
+ return (instr_lo & 0xC) == 0x4;
+ case 0xF0:
+ /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
+ return !instr_lo || (instr_lo>>1) == 1;
+ case 0x00:
+ /* Prefetch instruction is 0x0F0D or 0x0F18 */
+ if (probe_kernel_address(instr, opcode))
+ return 0;
+
+ *prefetch = (instr_lo == 0xF) &&
+ (opcode == 0x0D || opcode == 0x18);
+ return 0;
+ default:
+ return 0;
+ }
+}
+
+static int
+is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
+{
+ unsigned char *max_instr;
unsigned char *instr;
- int scan_more = 1;
int prefetch = 0;
- unsigned char *max_instr;
/*
* If it was a exec (instruction fetch) fault on NX page, then
@@ -106,99 +159,174 @@ static int is_prefetch(struct pt_regs *r
if (error_code & PF_INSTR)
return 0;
- instr = (unsigned char *)convert_ip_to_linear(current, regs);
+ instr = (void *)convert_ip_to_linear(current, regs);
max_instr = instr + 15;
if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
return 0;
- while (scan_more && instr < max_instr) {
+ while (instr < max_instr) {
unsigned char opcode;
- unsigned char instr_hi;
- unsigned char instr_lo;
if (probe_kernel_address(instr, opcode))
break;
- instr_hi = opcode & 0xf0;
- instr_lo = opcode & 0x0f;
instr++;
- switch (instr_hi) {
- case 0x20:
- case 0x30:
- /*
- * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
- * In X86_64 long mode, the CPU will signal invalid
- * opcode if some of these prefixes are present so
- * X86_64 will never get here anyway
- */
- scan_more = ((instr_lo & 7) == 0x6);
- break;
-#ifdef CONFIG_X86_64
- case 0x40:
- /*
- * In AMD64 long mode 0x40..0x4F are valid REX prefixes
- * Need to figure out under what instruction mode the
- * instruction was issued. Could check the LDT for lm,
- * but for now it's good enough to assume that long
- * mode only uses well known segments or kernel.
- */
- scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
+ if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
break;
+ }
+ return prefetch;
+}
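+
+/*
+ * Editor's note: a stand-alone user-space sketch of the scan above,
+ * for illustration only (all names below are hypothetical, not part
+ * of this patch). It mirrors the 0x26/0x2E/0x36/0x3E segment
+ * overrides, the 0x64..0x67 and 0xF0/0xF2/0xF3 prefixes, and the
+ * two-byte prefetch opcodes 0x0F 0x0D / 0x0F 0x18.
+ */
+#include <stdio.h>
+
+static int is_prefetch_demo(const unsigned char *instr, int len)
+{
+	int i;
+
+	for (i = 0; i < len; i++) {
+		unsigned char op = instr[i];
+		unsigned char hi = op & 0xf0, lo = op & 0x0f;
+
+		if ((hi == 0x20 || hi == 0x30) && (lo & 7) == 0x6)
+			continue;		/* segment override prefix */
+		if (hi == 0x60 && (lo & 0xC) == 0x4)
+			continue;		/* 0x64..0x67 prefixes */
+		if (hi == 0xF0 && (!lo || (lo >> 1) == 1))
+			continue;		/* lock/rep prefixes */
+		if (op == 0x0F && i + 1 < len)	/* two-byte opcode */
+			return instr[i + 1] == 0x0D || instr[i + 1] == 0x18;
+		return 0;			/* ordinary instruction */
+	}
+	return 0;
+}
+
+int main(void)
+{
+	/* prefetchnta %es:(%esi): ES override, then 0F 18 /0 */
+	unsigned char insn[] = { 0x26, 0x0F, 0x18, 0x06 };
+
+	printf("%d\n", is_prefetch_demo(insn, sizeof insn));	/* 1 */
+	return 0;
+}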
+
+static void
+force_sig_info_fault(int si_signo, int si_code, unsigned long address,
+ struct task_struct *tsk)
+{
+ siginfo_t info;
+
+ info.si_signo = si_signo;
+ info.si_errno = 0;
+ info.si_code = si_code;
+ info.si_addr = (void __user *)address;
+
+ force_sig_info(si_signo, &info, tsk);
+}
+
+DEFINE_SPINLOCK(pgd_lock);
+LIST_HEAD(pgd_list);
+
+#ifdef CONFIG_X86_32
+static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
+{
+ unsigned index = pgd_index(address);
+ pgd_t *pgd_k;
+ pud_t *pud, *pud_k;
+ pmd_t *pmd, *pmd_k;
+
+ pgd += index;
+ pgd_k = init_mm.pgd + index;
+
+ if (!pgd_present(*pgd_k))
+ return NULL;
+
+ /*
+ * set_pgd(pgd, *pgd_k); here would be useless on PAE
+ * and redundant with the set_pmd() on non-PAE. As would
+ * set_pud.
+ */
+ pud = pud_offset(pgd, address);
+ pud_k = pud_offset(pgd_k, address);
+ if (!pud_present(*pud_k))
+ return NULL;
+
+ pmd = pmd_offset(pud, address);
+ pmd_k = pmd_offset(pud_k, address);
+ if (!pmd_present(*pmd_k))
+ return NULL;
+
+ if (!pmd_present(*pmd)) {
+ bool lazy = percpu_read(xen_lazy_mmu);
+
+ percpu_write(xen_lazy_mmu, false);
+#if CONFIG_XEN_COMPAT > 0x030002
+ set_pmd(pmd, *pmd_k);
+#else
+ /*
+ * When running on older Xen we must launder *pmd_k through
+ * pmd_val() to ensure that _PAGE_PRESENT is correctly set.
+ */
+ set_pmd(pmd, __pmd(pmd_val(*pmd_k)));
#endif
- case 0x60:
- /* 0x64 thru 0x67 are valid prefixes in all modes. */
- scan_more = (instr_lo & 0xC) == 0x4;
- break;
- case 0xF0:
- /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
- scan_more = !instr_lo || (instr_lo>>1) == 1;
- break;
- case 0x00:
- /* Prefetch instruction is 0x0F0D or 0x0F18 */
- scan_more = 0;
+ percpu_write(xen_lazy_mmu, lazy);
+ } else {
+ BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
+ }
+
+ return pmd_k;
+}
+
+void vmalloc_sync_all(void)
+{
+ unsigned long address;
+
+ if (SHARED_KERNEL_PMD)
+ return;
+
+ for (address = VMALLOC_START & PMD_MASK;
+ address >= TASK_SIZE && address < FIXADDR_TOP;
+ address += PMD_SIZE) {
+
+ unsigned long flags;
+ struct page *page;
- if (probe_kernel_address(instr, opcode))
+ spin_lock_irqsave(&pgd_lock, flags);
+ list_for_each_entry(page, &pgd_list, lru) {
+ if (!vmalloc_sync_one(page_address(page), address))
break;
- prefetch = (instr_lo == 0xF) &&
- (opcode == 0x0D || opcode == 0x18);
- break;
- default:
- scan_more = 0;
- break;
}
+ spin_unlock_irqrestore(&pgd_lock, flags);
}
- return prefetch;
}
-static void force_sig_info_fault(int si_signo, int si_code,
- unsigned long address, struct task_struct *tsk)
+/*
+ * 32-bit:
+ *
+ * Handle a fault on the vmalloc or module mapping area
+ */
+static noinline int vmalloc_fault(unsigned long address)
{
- siginfo_t info;
+ unsigned long pgd_paddr;
+ pmd_t *pmd_k;
+ pte_t *pte_k;
- info.si_signo = si_signo;
- info.si_errno = 0;
- info.si_code = si_code;
- info.si_addr = (void __user *)address;
- force_sig_info(si_signo, &info, tsk);
+ /* Make sure we are in vmalloc area: */
+ if (!(address >= VMALLOC_START && address < VMALLOC_END))
+ return -1;
+
+ /*
+ * Synchronize this task's top level page-table
+ * with the 'reference' page table.
+ *
+ * Do _not_ use "current" here. We might be inside
+ * an interrupt in the middle of a task switch..
+ */
+ pgd_paddr = read_cr3();
+ pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
+ if (!pmd_k)
+ return -1;
+
+ pte_k = pte_offset_kernel(pmd_k, address);
+ if (!pte_present(*pte_k))
+ return -1;
+
+ return 0;
}
-#ifdef CONFIG_X86_64
-static int bad_address(void *p)
+/*
+ * Did it hit the DOS screen memory VA from vm86 mode?
+ */
+static inline void
+check_v8086_mode(struct pt_regs *regs, unsigned long address,
+ struct task_struct *tsk)
{
- unsigned long dummy;
- return probe_kernel_address((unsigned long *)p, dummy);
+ unsigned long bit;
+
+ if (!v8086_mode(regs))
+ return;
+
+ bit = (address - 0xA0000) >> PAGE_SHIFT;
+ if (bit < 32)
+ tsk->thread.screen_bitmap |= 1 << bit;
}
-#endif
static void dump_pagetable(unsigned long address)
{
-#ifdef CONFIG_X86_32
__typeof__(pte_val(__pte(0))) page;
page = read_cr3();
page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
+
#ifdef CONFIG_X86_PAE
printk("*pdpt = %016Lx ", page);
if ((page & _PAGE_PRESENT)
@@ -206,7 +334,7 @@ static void dump_pagetable(unsigned long
page = mfn_to_pfn(page >> PAGE_SHIFT);
page <<= PAGE_SHIFT;
page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
- & (PTRS_PER_PMD - 1)];
+ & (PTRS_PER_PMD - 1)];
printk(KERN_CONT "*pde = %016Lx ", page);
page &= ~_PAGE_NX;
}
@@ -218,20 +346,146 @@ static void dump_pagetable(unsigned long
* We must not directly access the pte in the highpte
* case if the page table is located in highmem.
* And let's rather not kmap-atomic the pte, just in case
- * it's allocated already.
+ * it's allocated already:
*/
if ((page & _PAGE_PRESENT)
&& mfn_to_local_pfn(page >> PAGE_SHIFT) < max_low_pfn
&& !(page & _PAGE_PSE)) {
+
page = mfn_to_pfn(page >> PAGE_SHIFT);
page <<= PAGE_SHIFT;
page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
- & (PTRS_PER_PTE - 1)];
+ & (PTRS_PER_PTE - 1)];
printk(KERN_CONT "*pte = %0*Lx ", sizeof(page)*2, (u64)page);
}
printk(KERN_CONT "\n");
-#else /* CONFIG_X86_64 */
+}
+
+#else /* CONFIG_X86_64: */
+
+void vmalloc_sync_all(void)
+{
+ unsigned long address;
+
+ for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
+ address += PGDIR_SIZE) {
+
+ const pgd_t *pgd_ref = pgd_offset_k(address);
+ unsigned long flags;
+ struct page *page;
+
+ if (pgd_none(*pgd_ref))
+ continue;
+
+ spin_lock_irqsave(&pgd_lock, flags);
+ list_for_each_entry(page, &pgd_list, lru) {
+ pgd_t *pgd;
+ pgd = (pgd_t *)page_address(page) + pgd_index(address);
+ if (pgd_none(*pgd))
+ set_pgd(pgd, *pgd_ref);
+ else
+ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+ }
+ spin_unlock_irqrestore(&pgd_lock, flags);
+ }
+}
+
+/*
+ * 64-bit:
+ *
+ * Handle a fault on the vmalloc area
+ *
+ * This assumes no large pages in there.
+ */
+static noinline int vmalloc_fault(unsigned long address)
+{
+ pgd_t *pgd, *pgd_ref;
+ pud_t *pud, *pud_ref;
+ pmd_t *pmd, *pmd_ref;
+ pte_t *pte, *pte_ref;
+
+ /* Make sure we are in vmalloc area: */
+ if (!(address >= VMALLOC_START && address < VMALLOC_END))
+ return -1;
+
+ /*
+ * Copy kernel mappings over when needed. This can also
+	 * happen within a race in page table update. In the latter
+ * case just flush:
+ */
+ pgd = pgd_offset(current->active_mm, address);
+ pgd_ref = pgd_offset_k(address);
+ if (pgd_none(*pgd_ref))
+ return -1;
+
+ if (pgd_none(*pgd))
+ set_pgd(pgd, *pgd_ref);
+ else
+ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+
+ /*
+ * Below here mismatches are bugs because these lower tables
+ * are shared:
+ */
+
+ pud = pud_offset(pgd, address);
+ pud_ref = pud_offset(pgd_ref, address);
+ if (pud_none(*pud_ref))
+ return -1;
+
+ if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
+ BUG();
+
+ pmd = pmd_offset(pud, address);
+ pmd_ref = pmd_offset(pud_ref, address);
+ if (pmd_none(*pmd_ref))
+ return -1;
+
+ if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
+ BUG();
+
+ pte_ref = pte_offset_kernel(pmd_ref, address);
+ if (!pte_present(*pte_ref))
+ return -1;
+
+ pte = pte_offset_kernel(pmd, address);
+
+ /*
+ * Don't use pte_page here, because the mappings can point
+ * outside mem_map, and the NUMA hash lookup cannot handle
+ * that:
+ */
+ if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
+ BUG();
+
+ return 0;
+}
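+
+/*
+ * Editor's sketch (user space, hypothetical toy types, illustration
+ * only): the lazy-copy idea above in miniature. A task's top-level
+ * table may lack kernel vmalloc entries; on a fault the missing slot
+ * is copied from the reference table, and a populated slot that
+ * disagrees with the reference is a bug.
+ */
+#include <assert.h>
+#include <stddef.h>
+
+struct toy_pgd { void *slot[512]; };
+
+static int toy_vmalloc_fault(struct toy_pgd *pgd,
+			     const struct toy_pgd *ref, size_t idx)
+{
+	if (!ref->slot[idx])
+		return -1;			 /* not mapped at all */
+	if (!pgd->slot[idx])
+		pgd->slot[idx] = ref->slot[idx]; /* copy kernel mapping */
+	else
+		assert(pgd->slot[idx] == ref->slot[idx]);
+	return 0;
+}
+
+int main(void)
+{
+	static struct toy_pgd ref, task;
+	static int dummy;
+
+	ref.slot[7] = &dummy;
+	return toy_vmalloc_fault(&task, &ref, 7);	/* copies slot 7 */
+}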
+
+static const char errata93_warning[] =
+KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
+KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
+KERN_ERR "******* Please consider a BIOS update.\n"
+KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
+
+/*
+ * No vm86 mode in 64-bit mode:
+ */
+static inline void
+check_v8086_mode(struct pt_regs *regs, unsigned long address,
+ struct task_struct *tsk)
+{
+}
+
+static int bad_address(void *p)
+{
+ unsigned long dummy;
+
+ return probe_kernel_address((unsigned long *)p, dummy);
+}
+
+static void dump_pagetable(unsigned long address)
+{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
@@ -240,113 +494,77 @@ static void dump_pagetable(unsigned long
pgd = (pgd_t *)read_cr3();
pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
+
pgd += pgd_index(address);
- if (bad_address(pgd)) goto bad;
+ if (bad_address(pgd))
+ goto bad;
+
printk("PGD %lx ", pgd_val(*pgd));
- if (!pgd_present(*pgd)) goto ret;
+
+ if (!pgd_present(*pgd))
+ goto out;
pud = pud_offset(pgd, address);
- if (bad_address(pud)) goto bad;
+ if (bad_address(pud))
+ goto bad;
+
printk(KERN_CONT "PUD %lx ", pud_val(*pud));
if (!pud_present(*pud) || pud_large(*pud))
- goto ret;
+ goto out;
pmd = pmd_offset(pud, address);
- if (bad_address(pmd)) goto bad;
+ if (bad_address(pmd))
+ goto bad;
+
printk(KERN_CONT "PMD %lx ", pmd_val(*pmd));
- if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
+ if (!pmd_present(*pmd) || pmd_large(*pmd))
+ goto out;
pte = pte_offset_kernel(pmd, address);
- if (bad_address(pte)) goto bad;
+ if (bad_address(pte))
+ goto bad;
+
printk(KERN_CONT "PTE %lx", pte_val(*pte));
-ret:
+out:
printk(KERN_CONT "\n");
return;
bad:
printk("BAD\n");
-#endif
-}
-
-#ifdef CONFIG_X86_32
-static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
-{
- unsigned index = pgd_index(address);
- pgd_t *pgd_k;
- pud_t *pud, *pud_k;
- pmd_t *pmd, *pmd_k;
-
- pgd += index;
- pgd_k = init_mm.pgd + index;
-
- if (!pgd_present(*pgd_k))
- return NULL;
-
- /*
- * set_pgd(pgd, *pgd_k); here would be useless on PAE
- * and redundant with the set_pmd() on non-PAE. As would
- * set_pud.
- */
-
- pud = pud_offset(pgd, address);
- pud_k = pud_offset(pgd_k, address);
- if (!pud_present(*pud_k))
- return NULL;
-
- pmd = pmd_offset(pud, address);
- pmd_k = pmd_offset(pud_k, address);
- if (!pmd_present(*pmd_k))
- return NULL;
- if (!pmd_present(*pmd)) {
- bool lazy = x86_read_percpu(xen_lazy_mmu);
-
- x86_write_percpu(xen_lazy_mmu, false);
-#if CONFIG_XEN_COMPAT > 0x030002
- set_pmd(pmd, *pmd_k);
-#else
- /*
- * When running on older Xen we must launder *pmd_k through
- * pmd_val() to ensure that _PAGE_PRESENT is correctly set.
- */
- set_pmd(pmd, __pmd(pmd_val(*pmd_k)));
-#endif
- x86_write_percpu(xen_lazy_mmu, lazy);
- } else
- BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
- return pmd_k;
}
-#endif
-#ifdef CONFIG_X86_64
-static const char errata93_warning[] =
-KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
-KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
-KERN_ERR "******* Please consider a BIOS update.\n"
-KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
-#endif
+#endif /* CONFIG_X86_64 */
-/* Workaround for K8 erratum #93 & buggy BIOS.
- BIOS SMM functions are required to use a specific workaround
- to avoid corruption of the 64bit RIP register on C stepping K8.
- A lot of BIOS that didn't get tested properly miss this.
- The OS sees this as a page fault with the upper 32bits of RIP cleared.
- Try to work around it here.
- Note we only handle faults in kernel here.
- Does nothing for X86_32
+/*
+ * Workaround for K8 erratum #93 & buggy BIOS.
+ *
+ * BIOS SMM functions are required to use a specific workaround
+ * to avoid corruption of the 64bit RIP register on C stepping K8.
+ *
+ * A lot of BIOS that didn't get tested properly miss this.
+ *
+ * The OS sees this as a page fault with the upper 32bits of RIP cleared.
+ * Try to work around it here.
+ *
+ * Note we only handle faults in kernel here.
+ * Does nothing on 32-bit.
*/
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
- static int warned;
+ static int once;
+
if (address != regs->ip)
return 0;
+
if ((address >> 32) != 0)
return 0;
+
address |= 0xffffffffUL << 32;
if ((address >= (u64)_stext && address <= (u64)_etext) ||
(address >= MODULES_VADDR && address <= MODULES_END)) {
- if (!warned) {
+ if (!once) {
printk(errata93_warning);
- warned = 1;
+ once = 1;
}
regs->ip = address;
return 1;
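+
+/*
+ * Editor's note: a stand-alone illustration of the RIP repair above
+ * (the value is made up; assumes a 64-bit long, as on x86-64). The
+ * erratum clears the upper 32 bits of a kernel RIP; OR-ing in
+ * 0xffffffff << 32 restores a canonical kernel-text address before
+ * retrying.
+ */
+#include <stdio.h>
+
+int main(void)
+{
+	unsigned long rip = 0x005012f0UL;	/* truncated by the erratum */
+	unsigned long fixed = rip | (0xffffffffUL << 32);
+
+	printf("%#lx -> %#lx\n", rip, fixed);	/* -> 0xffffffff005012f0 */
+	return 0;
+}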
@@ -356,16 +574,17 @@ static int is_errata93(struct pt_regs *r
}
/*
- * Work around K8 erratum #100 K8 in compat mode occasionally jumps to illegal
- * addresses >4GB. We catch this in the page fault handler because these
- * addresses are not reachable. Just detect this case and return. Any code
+ * Work around K8 erratum #100: K8 in compat mode occasionally jumps
+ * to illegal addresses >4GB.
+ *
+ * We catch this in the page fault handler because these addresses
+ * are not reachable. Just detect this case and return. Any code
* segment in LDT is compatibility mode.
*/
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
- if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
- (address >> 32))
+ if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
return 1;
#endif
return 0;
@@ -375,8 +594,9 @@ static int is_f00f_bug(struct pt_regs *r
{
#ifdef CONFIG_X86_F00F_BUG
unsigned long nr;
+
/*
- * Pentium F0 0F C7 C8 bug workaround.
+ * Pentium F0 0F C7 C8 bug workaround:
*/
if (boot_cpu_data.f00f_bug) {
nr = (address - idt_descr.address) >> 3;
@@ -390,62 +610,277 @@ static int is_f00f_bug(struct pt_regs *r
return 0;
}
-static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
- unsigned long address)
+static const char nx_warning[] = KERN_CRIT
+"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";
+
+static void
+show_fault_oops(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
{
-#ifdef CONFIG_X86_32
if (!oops_may_print())
return;
-#endif
-#ifdef CONFIG_X86_PAE
if (error_code & PF_INSTR) {
unsigned int level;
+
pte_t *pte = lookup_address(address, &level);
if (pte && pte_present(*pte) && !pte_exec(*pte))
- printk(KERN_CRIT "kernel tried to execute "
- "NX-protected page - exploit attempt? "
- "(uid: %d)\n", current_uid());
+ printk(nx_warning, current_uid());
}
-#endif
printk(KERN_ALERT "BUG: unable to handle kernel ");
if (address < PAGE_SIZE)
printk(KERN_CONT "NULL pointer dereference");
else
printk(KERN_CONT "paging request");
+
printk(KERN_CONT " at %p\n", (void *) address);
printk(KERN_ALERT "IP:");
printk_address(regs->ip, 1);
+
dump_pagetable(address);
}
-#ifdef CONFIG_X86_64
-static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
- unsigned long error_code)
+static noinline void
+pgtable_bad(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
{
- unsigned long flags = oops_begin();
- int sig = SIGKILL;
struct task_struct *tsk;
+ unsigned long flags;
+ int sig;
+
+ flags = oops_begin();
+ tsk = current;
+ sig = SIGKILL;
printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
- current->comm, address);
+ tsk->comm, address);
dump_pagetable(address);
- tsk = current;
- tsk->thread.cr2 = address;
- tsk->thread.trap_no = 14;
- tsk->thread.error_code = error_code;
+
+ tsk->thread.cr2 = address;
+ tsk->thread.trap_no = 14;
+ tsk->thread.error_code = error_code;
+
if (__die("Bad pagetable", regs, error_code))
sig = 0;
+
oops_end(flags, regs, sig);
}
-#endif
+
+static noinline void
+no_context(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
+{
+ struct task_struct *tsk = current;
+ unsigned long *stackend;
+ unsigned long flags;
+ int sig;
+
+ /* Are we prepared to handle this kernel fault? */
+ if (fixup_exception(regs))
+ return;
+
+ /*
+ * 32-bit:
+ *
+ * Valid to do another page fault here, because if this fault
+ * had been triggered by is_prefetch fixup_exception would have
+ * handled it.
+ *
+ * 64-bit:
+ *
+ * Hall of shame of CPU/BIOS bugs.
+ */
+ if (is_prefetch(regs, error_code, address))
+ return;
+
+ if (is_errata93(regs, address))
+ return;
+
+ /*
+ * Oops. The kernel tried to access some bad page. We'll have to
+ * terminate things with extreme prejudice:
+ */
+ flags = oops_begin();
+
+ show_fault_oops(regs, error_code, address);
+
+ stackend = end_of_stack(tsk);
+ if (*stackend != STACK_END_MAGIC)
+ printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
+
+ tsk->thread.cr2 = address;
+ tsk->thread.trap_no = 14;
+ tsk->thread.error_code = error_code;
+
+ sig = SIGKILL;
+ if (__die("Oops", regs, error_code))
+ sig = 0;
+
+ /* Executive summary in case the body of the oops scrolled away */
+ printk(KERN_EMERG "CR2: %016lx\n", address);
+
+ oops_end(flags, regs, sig);
+}
+
+/*
+ * Print out info about fatal segfaults, if the show_unhandled_signals
+ * sysctl is set:
+ */
+static inline void
+show_signal_msg(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, struct task_struct *tsk)
+{
+ if (!unhandled_signal(tsk, SIGSEGV))
+ return;
+
+ if (!printk_ratelimit())
+ return;
+
+ printk(KERN_CONT "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
+ task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
+ tsk->comm, task_pid_nr(tsk), address,
+ (void *)regs->ip, (void *)regs->sp, error_code);
+
+ print_vma_addr(KERN_CONT " in ", regs->ip);
+
+ printk(KERN_CONT "\n");
+}
+
+static void
+__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, int si_code)
+{
+ struct task_struct *tsk = current;
+
+ /* User mode accesses just cause a SIGSEGV */
+ if (error_code & PF_USER) {
+ /*
+ * It's possible to have interrupts off here:
+ */
+ local_irq_enable();
+
+ /*
+ * Valid to do another page fault here because this one came
+ * from user space:
+ */
+ if (is_prefetch(regs, error_code, address))
+ return;
+
+ if (is_errata100(regs, address))
+ return;
+
+ if (unlikely(show_unhandled_signals))
+ show_signal_msg(regs, error_code, address, tsk);
+
+ /* Kernel addresses are always protection faults: */
+ tsk->thread.cr2 = address;
+ tsk->thread.error_code = error_code | (address >= TASK_SIZE);
+ tsk->thread.trap_no = 14;
+
+ force_sig_info_fault(SIGSEGV, si_code, address, tsk);
+
+ return;
+ }
+
+ if (is_f00f_bug(regs, address))
+ return;
+
+ no_context(regs, error_code, address);
+}
+
+static noinline void
+bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
+{
+ __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
+}
+
+static void
+__bad_area(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, int si_code)
+{
+ struct mm_struct *mm = current->mm;
+
+ /*
+ * Something tried to access memory that isn't in our memory map..
+ * Fix it, but check if it's kernel or user first..
+ */
+ up_read(&mm->mmap_sem);
+
+ __bad_area_nosemaphore(regs, error_code, address, si_code);
+}
+
+static noinline void
+bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+{
+ __bad_area(regs, error_code, address, SEGV_MAPERR);
+}
+
+static noinline void
+bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
+{
+ __bad_area(regs, error_code, address, SEGV_ACCERR);
+}
+
+/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
+static void
+out_of_memory(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
+{
+ /*
+	 * We ran out of memory, call the OOM killer, and return to userspace
+ * (which will retry the fault, or kill us if we got oom-killed):
+ */
+ up_read(&current->mm->mmap_sem);
+
+ pagefault_out_of_memory();
+}
+
+static void
+do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+{
+ struct task_struct *tsk = current;
+ struct mm_struct *mm = tsk->mm;
+
+ up_read(&mm->mmap_sem);
+
+ /* Kernel mode? Handle exceptions or die: */
+ if (!(error_code & PF_USER))
+ no_context(regs, error_code, address);
+
+ /* User-space => ok to do another page fault: */
+ if (is_prefetch(regs, error_code, address))
+ return;
+
+ tsk->thread.cr2 = address;
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = 14;
+
+ force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
+}
+
+static noinline void
+mm_fault_error(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, unsigned int fault)
+{
+ if (fault & VM_FAULT_OOM) {
+ out_of_memory(regs, error_code, address);
+ } else {
+ if (fault & VM_FAULT_SIGBUS)
+ do_sigbus(regs, error_code, address);
+ else
+ BUG();
+ }
+}
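+
+/*
+ * Editor's note: mm_fault_error() is now the single exit for a failed
+ * handle_mm_fault(). VM_FAULT_OOM ends up in out_of_memory(), which
+ * drops mmap_sem and defers to pagefault_out_of_memory();
+ * VM_FAULT_SIGBUS raises SIGBUS (or dies via no_context() for
+ * kernel-mode faults); any other error code here is a kernel bug.
+ */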
static int spurious_fault_check(unsigned long error_code, pte_t *pte)
{
if ((error_code & PF_WRITE) && !pte_write(*pte))
return 0;
+
if ((error_code & PF_INSTR) && !pte_exec(*pte))
return 0;
@@ -453,21 +888,25 @@ static int spurious_fault_check(unsigned
}
/*
- * Handle a spurious fault caused by a stale TLB entry. This allows
- * us to lazily refresh the TLB when increasing the permissions of a
- * kernel page (RO -> RW or NX -> X). Doing it eagerly is very
- * expensive since that implies doing a full cross-processor TLB
- * flush, even if no stale TLB entries exist on other processors.
+ * Handle a spurious fault caused by a stale TLB entry.
+ *
+ * This allows us to lazily refresh the TLB when increasing the
+ * permissions of a kernel page (RO -> RW or NX -> X). Doing it
+ * eagerly is very expensive since that implies doing a full
+ * cross-processor TLB flush, even if no stale TLB entries exist
+ * on other processors.
+ *
* There are no security implications to leaving a stale TLB when
* increasing the permissions on a page.
*/
-static int spurious_fault(unsigned long address,
- unsigned long error_code)
+static noinline int
+spurious_fault(unsigned long error_code, unsigned long address)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
+ int ret;
/* Reserved-bit violation or user access to kernel space? */
if (error_code & (PF_USER | PF_RSVD))
@@ -495,117 +934,62 @@ static int spurious_fault(unsigned long
if (!pte_present(*pte))
return 0;
- return spurious_fault_check(error_code, pte);
-}
-
-/*
- * X86_32
- * Handle a fault on the vmalloc or module mapping area
- *
- * X86_64
- * Handle a fault on the vmalloc area
- *
- * This assumes no large pages in there.
- */
-static int vmalloc_fault(unsigned long address)
-{
-#ifdef CONFIG_X86_32
- unsigned long pgd_paddr;
- pmd_t *pmd_k;
- pte_t *pte_k;
-
- /* Make sure we are in vmalloc area */
- if (!(address >= VMALLOC_START && address < VMALLOC_END))
- return -1;
+ ret = spurious_fault_check(error_code, pte);
+ if (!ret)
+ return 0;
/*
- * Synchronize this task's top level page-table
- * with the 'reference' page table.
- *
- * Do _not_ use "current" here. We might be inside
- * an interrupt in the middle of a task switch..
+ * Make sure we have permissions in PMD.
+ * If not, then there's a bug in the page tables:
*/
- pgd_paddr = read_cr3();
- pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
- if (!pmd_k)
- return -1;
- pte_k = pte_offset_kernel(pmd_k, address);
- if (!pte_present(*pte_k))
- return -1;
- return 0;
-#else
- pgd_t *pgd, *pgd_ref;
- pud_t *pud, *pud_ref;
- pmd_t *pmd, *pmd_ref;
- pte_t *pte, *pte_ref;
+ ret = spurious_fault_check(error_code, (pte_t *) pmd);
+ WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
- /* Make sure we are in vmalloc area */
- if (!(address >= VMALLOC_START && address < VMALLOC_END))
- return -1;
+ return ret;
+}
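+
+/*
+ * Editor's sketch (hypothetical flag values, illustration only): a
+ * fault is "spurious" -- a stale-TLB leftover needing no global flush
+ * -- only when the current PTE already grants everything the faulting
+ * access needed.
+ */
+#define TOY_PF_WRITE 0x2
+#define TOY_PF_INSTR 0x10
+#define TOY_PTE_W    0x1
+#define TOY_PTE_X    0x2
+
+static int toy_spurious(unsigned long error_code, unsigned pte)
+{
+	if ((error_code & TOY_PF_WRITE) && !(pte & TOY_PTE_W))
+		return 0;	/* write still forbidden: real fault */
+	if ((error_code & TOY_PF_INSTR) && !(pte & TOY_PTE_X))
+		return 0;	/* exec still forbidden: real fault */
+	return 1;		/* stale TLB entry: just retry */
+}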
- /* Copy kernel mappings over when needed. This can also
- happen within a race in page table update. In the later
- case just flush. */
+int show_unhandled_signals = 1;
- pgd = pgd_offset(current->active_mm, address);
- pgd_ref = pgd_offset_k(address);
- if (pgd_none(*pgd_ref))
- return -1;
- if (pgd_none(*pgd))
- set_pgd(pgd, *pgd_ref);
- else
- BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+static inline int
+access_error(unsigned long error_code, int write, struct vm_area_struct *vma)
+{
+ if (write) {
+ /* write, present and write, not present: */
+ if (unlikely(!(vma->vm_flags & VM_WRITE)))
+ return 1;
+ return 0;
+ }
- /* Below here mismatches are bugs because these lower tables
- are shared */
+ /* read, present: */
+ if (unlikely(error_code & PF_PROT))
+ return 1;
+
+ /* read, not present: */
+ if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
+ return 1;
- pud = pud_offset(pgd, address);
- pud_ref = pud_offset(pgd_ref, address);
- if (pud_none(*pud_ref))
- return -1;
- if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
- BUG();
- pmd = pmd_offset(pud, address);
- pmd_ref = pmd_offset(pud_ref, address);
- if (pmd_none(*pmd_ref))
- return -1;
- if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
- BUG();
- pte_ref = pte_offset_kernel(pmd_ref, address);
- if (!pte_present(*pte_ref))
- return -1;
- pte = pte_offset_kernel(pmd, address);
- /* Don't use pte_page here, because the mappings can point
- outside mem_map, and the NUMA hash lookup cannot handle
- that. */
- if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
- BUG();
return 0;
-#endif
}
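+
+/*
+ * Editor's sketch: the same decision table in user space (hypothetical
+ * flag values, illustration only). Nonzero means the access must
+ * fault: a write needs VM_WRITE; a read that trapped with the
+ * protection bit set is always bad; a not-present read needs at least
+ * one of read/exec/write in the VMA.
+ */
+#include <stdio.h>
+
+#define TOY_PF_PROT  0x1
+#define TOY_VM_READ  0x1
+#define TOY_VM_WRITE 0x2
+#define TOY_VM_EXEC  0x4
+
+static int toy_access_error(unsigned long ec, int write, unsigned vm)
+{
+	if (write)
+		return !(vm & TOY_VM_WRITE);
+	if (ec & TOY_PF_PROT)
+		return 1;
+	return !(vm & (TOY_VM_READ | TOY_VM_EXEC | TOY_VM_WRITE));
+}
+
+int main(void)
+{
+	printf("%d\n", toy_access_error(0, 1, TOY_VM_READ));	/* 1 */
+	printf("%d\n", toy_access_error(0, 0, TOY_VM_READ));	/* 0 */
+	return 0;
+}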
-int show_unhandled_signals = 1;
+static int fault_in_kernel_space(unsigned long address)
+{
+ return address >= TASK_SIZE_MAX;
+}
/*
* This routine handles page faults. It determines the address,
* and the problem, and then passes it off to one of the appropriate
* routines.
*/
-#ifdef CONFIG_X86_64
-asmlinkage
-#endif
-void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
+dotraplinkage void __kprobes
+do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
- struct task_struct *tsk;
- struct mm_struct *mm;
struct vm_area_struct *vma;
+ struct task_struct *tsk;
unsigned long address;
- int write, si_code;
+ struct mm_struct *mm;
+ int write;
int fault;
-#ifdef CONFIG_X86_64
- unsigned long flags;
- int sig;
-#endif
/* Set the "privileged fault" bit to something sane. */
if (user_mode_vm(regs))
@@ -615,13 +999,12 @@ void __kprobes do_page_fault(struct pt_r
tsk = current;
mm = tsk->mm;
+
prefetchw(&mm->mmap_sem);
- /* get the address */
+ /* Get the faulting address: */
address = read_cr2();
- si_code = SEGV_MAPERR;
-
if (unlikely(kmmio_fault(regs, address)))
return;
@@ -638,328 +1021,158 @@ void __kprobes do_page_fault(struct pt_r
* (error_code & 4) == 0, and that the fault was not a
* protection error (error_code & 9) == 0.
*/
-#ifdef CONFIG_X86_32
- if (unlikely(address >= TASK_SIZE)) {
-#else
- if (unlikely(address >= TASK_SIZE64)) {
-#endif
+ if (unlikely(fault_in_kernel_space(address))) {
/* Faults in hypervisor area can never be patched up. */
#if defined(CONFIG_X86_XEN)
- if (address >= hypervisor_virt_start)
- goto bad_area_nosemaphore;
+ if (address >= hypervisor_virt_start) {
#elif defined(CONFIG_X86_64_XEN)
if (address >= HYPERVISOR_VIRT_START
- && address < HYPERVISOR_VIRT_END)
- goto bad_area_nosemaphore;
+ && address < HYPERVISOR_VIRT_END) {
#endif
+ bad_area_nosemaphore(regs, error_code, address);
+ return;
+ }
+
if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
vmalloc_fault(address) >= 0)
return;
- /* Can handle a stale RO->RW TLB */
- if (spurious_fault(address, error_code))
+ /* Can handle a stale RO->RW TLB: */
+ if (spurious_fault(error_code, address))
return;
- /* kprobes don't want to hook the spurious faults. */
+ /* kprobes don't want to hook the spurious faults: */
if (notify_page_fault(regs))
return;
/*
* Don't take the mm semaphore here. If we fixup a prefetch
- * fault we could otherwise deadlock.
+ * fault we could otherwise deadlock:
*/
- goto bad_area_nosemaphore;
- }
+ bad_area_nosemaphore(regs, error_code, address);
- /* kprobes don't want to hook the spurious faults. */
- if (notify_page_fault(regs))
return;
+ }
+ /* kprobes don't want to hook the spurious faults: */
+ if (unlikely(notify_page_fault(regs)))
+ return;
/*
* It's safe to allow irq's after cr2 has been saved and the
* vmalloc fault has been handled.
*
* User-mode registers count as a user access even for any
- * potential system fault or CPU buglet.
+ * potential system fault or CPU buglet:
*/
if (user_mode_vm(regs)) {
local_irq_enable();
error_code |= PF_USER;
- } else if (regs->flags & X86_EFLAGS_IF)
- local_irq_enable();
+ } else {
+ if (regs->flags & X86_EFLAGS_IF)
+ local_irq_enable();
+ }
-#ifdef CONFIG_X86_64
if (unlikely(error_code & PF_RSVD))
- pgtable_bad(address, regs, error_code);
-#endif
+ pgtable_bad(regs, error_code, address);
/*
- * If we're in an interrupt, have no user context or are running in an
- * atomic region then we must not take the fault.
+ * If we're in an interrupt, have no user context or are running
+ * in an atomic region then we must not take the fault:
*/
- if (unlikely(in_atomic() || !mm))
- goto bad_area_nosemaphore;
+ if (unlikely(in_atomic() || !mm)) {
+ bad_area_nosemaphore(regs, error_code, address);
+ return;
+ }
/*
* When running in the kernel we expect faults to occur only to
- * addresses in user space. All other faults represent errors in the
- * kernel and should generate an OOPS. Unfortunately, in the case of an
- * erroneous fault occurring in a code path which already holds mmap_sem
- * we will deadlock attempting to validate the fault against the
- * address space. Luckily the kernel only validly references user
- * space from well defined areas of code, which are listed in the
- * exceptions table.
+ * addresses in user space. All other faults represent errors in
+ * the kernel and should generate an OOPS. Unfortunately, in the
+ * case of an erroneous fault occurring in a code path which already
+ * holds mmap_sem we will deadlock attempting to validate the fault
+ * against the address space. Luckily the kernel only validly
+ * references user space from well defined areas of code, which are
+ * listed in the exceptions table.
*
* As the vast majority of faults will be valid we will only perform
- * the source reference check when there is a possibility of a deadlock.
- * Attempt to lock the address space, if we cannot we then validate the
- * source. If this is invalid we can skip the address space check,
- * thus avoiding the deadlock.
+ * the source reference check when there is a possibility of a
+ * deadlock. Attempt to lock the address space, if we cannot we then
+ * validate the source. If this is invalid we can skip the address
+ * space check, thus avoiding the deadlock:
*/
- if (!down_read_trylock(&mm->mmap_sem)) {
+ if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
if ((error_code & PF_USER) == 0 &&
- !search_exception_tables(regs->ip))
- goto bad_area_nosemaphore;
+ !search_exception_tables(regs->ip)) {
+ bad_area_nosemaphore(regs, error_code, address);
+ return;
+ }
down_read(&mm->mmap_sem);
+ } else {
+ /*
+		 * The above down_read_trylock() might have succeeded, in
+ * which case we'll have missed the might_sleep() from
+ * down_read():
+ */
+ might_sleep();
}
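+
+	/*
+	 * Editor's note: the pattern above, sketched with pthreads
+	 * (illustration only; ip_in_exception_tables() is hypothetical):
+	 *
+	 *	if (pthread_rwlock_tryrdlock(&sem) != 0) {
+	 *		if (!ip_in_exception_tables())
+	 *			return fail_without_lock(); // would deadlock
+	 *		pthread_rwlock_rdlock(&sem);        // safe to sleep
+	 *	}
+	 *
+	 * Only block on mmap_sem when the faulting IP is provably in
+	 * code that cannot already hold it.
+	 */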
vma = find_vma(mm, address);
- if (!vma)
- goto bad_area;
- if (vma->vm_start <= address)
+ if (unlikely(!vma)) {
+ bad_area(regs, error_code, address);
+ return;
+ }
+ if (likely(vma->vm_start <= address))
goto good_area;
- if (!(vma->vm_flags & VM_GROWSDOWN))
- goto bad_area;
+ if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+ bad_area(regs, error_code, address);
+ return;
+ }
if (error_code & PF_USER) {
/*
* Accessing the stack below %sp is always a bug.
* The large cushion allows instructions like enter
- * and pusha to work. ("enter $65535,$31" pushes
+ * and pusha to work. ("enter $65535, $31" pushes
* 32 pointers and then decrements %sp by 65535.)
*/
- if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
- goto bad_area;
+ if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
+ bad_area(regs, error_code, address);
+ return;
+ }
}
- if (expand_stack(vma, address))
- goto bad_area;
-/*
- * Ok, we have a good vm_area for this memory access, so
- * we can handle it..
- */
+ if (unlikely(expand_stack(vma, address))) {
+ bad_area(regs, error_code, address);
+ return;
+ }
+
+ /*
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
good_area:
- si_code = SEGV_ACCERR;
- write = 0;
- switch (error_code & (PF_PROT|PF_WRITE)) {
- default: /* 3: write, present */
- /* fall through */
- case PF_WRITE: /* write, not present */
- if (!(vma->vm_flags & VM_WRITE))
- goto bad_area;
- write++;
- break;
- case PF_PROT: /* read, present */
- goto bad_area;
- case 0: /* read, not present */
- if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
- goto bad_area;
+ write = error_code & PF_WRITE;
+
+ if (unlikely(access_error(error_code, write, vma))) {
+ bad_area_access_error(regs, error_code, address);
+ return;
}
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
- * the fault.
+ * the fault:
*/
fault = handle_mm_fault(mm, vma, address, write);
+
if (unlikely(fault & VM_FAULT_ERROR)) {
- if (fault & VM_FAULT_OOM)
- goto out_of_memory;
- else if (fault & VM_FAULT_SIGBUS)
- goto do_sigbus;
- BUG();
+ mm_fault_error(regs, error_code, address, fault);
+ return;
}
+
if (fault & VM_FAULT_MAJOR)
tsk->maj_flt++;
else
tsk->min_flt++;
-#ifdef CONFIG_X86_32
- /*
- * Did it hit the DOS screen memory VA from vm86 mode?
- */
- if (v8086_mode(regs)) {
- unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
- if (bit < 32)
- tsk->thread.screen_bitmap |= 1 << bit;
- }
-#endif
- up_read(&mm->mmap_sem);
- return;
-
-/*
- * Something tried to access memory that isn't in our memory map..
- * Fix it, but check if it's kernel or user first..
- */
-bad_area:
- up_read(&mm->mmap_sem);
-
-bad_area_nosemaphore:
- /* User mode accesses just cause a SIGSEGV */
- if (error_code & PF_USER) {
- /*
- * It's possible to have interrupts off here.
- */
- local_irq_enable();
-
- /*
- * Valid to do another page fault here because this one came
- * from user space.
- */
- if (is_prefetch(regs, address, error_code))
- return;
-
- if (is_errata100(regs, address))
- return;
-
- if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
- printk_ratelimit()) {
- printk(
- "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
- task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
- tsk->comm, task_pid_nr(tsk), address,
- (void *) regs->ip, (void *) regs->sp, error_code);
- print_vma_addr(" in ", regs->ip);
- printk("\n");
- }
-
- tsk->thread.cr2 = address;
- /* Kernel addresses are always protection faults */
- tsk->thread.error_code = error_code | (address >= TASK_SIZE);
- tsk->thread.trap_no = 14;
- force_sig_info_fault(SIGSEGV, si_code, address, tsk);
- return;
- }
-
- if (is_f00f_bug(regs, address))
- return;
-
-no_context:
- /* Are we prepared to handle this kernel fault? */
- if (fixup_exception(regs))
- return;
-
- /*
- * X86_32
- * Valid to do another page fault here, because if this fault
- * had been triggered by is_prefetch fixup_exception would have
- * handled it.
- *
- * X86_64
- * Hall of shame of CPU/BIOS bugs.
- */
- if (is_prefetch(regs, address, error_code))
- return;
-
- if (is_errata93(regs, address))
- return;
-
-/*
- * Oops. The kernel tried to access some bad page. We'll have to
- * terminate things with extreme prejudice.
- */
-#ifdef CONFIG_X86_32
- bust_spinlocks(1);
-#else
- flags = oops_begin();
-#endif
-
- show_fault_oops(regs, error_code, address);
-
- tsk->thread.cr2 = address;
- tsk->thread.trap_no = 14;
- tsk->thread.error_code = error_code;
-
-#ifdef CONFIG_X86_32
- die("Oops", regs, error_code);
- bust_spinlocks(0);
- do_exit(SIGKILL);
-#else
- sig = SIGKILL;
- if (__die("Oops", regs, error_code))
- sig = 0;
- /* Executive summary in case the body of the oops scrolled away */
- printk(KERN_EMERG "CR2: %016lx\n", address);
- oops_end(flags, regs, sig);
-#endif
-
-out_of_memory:
- /*
- * We ran out of memory, call the OOM killer, and return the userspace
- * (which will retry the fault, or kill us if we got oom-killed).
- */
- up_read(&mm->mmap_sem);
- pagefault_out_of_memory();
- return;
+ check_v8086_mode(regs, address, tsk);
-do_sigbus:
up_read(&mm->mmap_sem);
-
- /* Kernel mode? Handle exceptions or die */
- if (!(error_code & PF_USER))
- goto no_context;
-#ifdef CONFIG_X86_32
- /* User space => ok to do another page fault */
- if (is_prefetch(regs, address, error_code))
- return;
-#endif
- tsk->thread.cr2 = address;
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = 14;
- force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
-}
-
-DEFINE_SPINLOCK(pgd_lock);
-LIST_HEAD(pgd_list);
-
-void vmalloc_sync_all(void)
-{
- unsigned long address;
-
-#ifdef CONFIG_X86_32
- if (SHARED_KERNEL_PMD)
- return;
-
- for (address = VMALLOC_START & PMD_MASK;
- address >= TASK_SIZE && address < FIXADDR_TOP;
- address += PMD_SIZE) {
- unsigned long flags;
- struct page *page;
-
- spin_lock_irqsave(&pgd_lock, flags);
- list_for_each_entry(page, &pgd_list, lru) {
- if (!vmalloc_sync_one(page_address(page),
- address))
- break;
- }
- spin_unlock_irqrestore(&pgd_lock, flags);
- }
-#else /* CONFIG_X86_64 */
- for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
- address += PGDIR_SIZE) {
- const pgd_t *pgd_ref = pgd_offset_k(address);
- unsigned long flags;
- struct page *page;
-
- if (pgd_none(*pgd_ref))
- continue;
- spin_lock_irqsave(&pgd_lock, flags);
- list_for_each_entry(page, &pgd_list, lru) {
- pgd_t *pgd;
- pgd = (pgd_t *)page_address(page) + pgd_index(address);
- if (pgd_none(*pgd))
- set_pgd(pgd, *pgd_ref);
- else
- BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
- }
- spin_unlock_irqrestore(&pgd_lock, flags);
- }
-#endif
}
--- head-2010-05-25.orig/arch/x86/mm/highmem_32-xen.c 2010-03-24 15:14:47.000000000 +0100
+++ head-2010-05-25/arch/x86/mm/highmem_32-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -1,5 +1,6 @@
#include <linux/highmem.h>
#include <linux/module.h>
+#include <linux/swap.h> /* for totalram_pages */
void *kmap(struct page *page)
{
@@ -18,49 +19,6 @@ void kunmap(struct page *page)
kunmap_high(page);
}
-static void debug_kmap_atomic_prot(enum km_type type)
-{
-#ifdef CONFIG_DEBUG_HIGHMEM
- static unsigned warn_count = 10;
-
- if (unlikely(warn_count == 0))
- return;
-
- if (unlikely(in_interrupt())) {
- if (in_irq()) {
- if (type != KM_IRQ0 && type != KM_IRQ1 &&
- type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
- type != KM_BOUNCE_READ) {
- WARN_ON(1);
- warn_count--;
- }
- } else if (!irqs_disabled()) { /* softirq */
- if (type != KM_IRQ0 && type != KM_IRQ1 &&
- type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
- type != KM_SKB_SUNRPC_DATA &&
- type != KM_SKB_DATA_SOFTIRQ &&
- type != KM_BOUNCE_READ) {
- WARN_ON(1);
- warn_count--;
- }
- }
- }
-
- if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
- type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) {
- if (!irqs_disabled()) {
- WARN_ON(1);
- warn_count--;
- }
- } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
- if (irq_count() == 0 && !irqs_disabled()) {
- WARN_ON(1);
- warn_count--;
- }
- }
-#endif
-}
-
/*
* kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
* no global lock is needed and because the kmap code must perform a global TLB
@@ -80,7 +38,7 @@ void *kmap_atomic_prot(struct page *page
if (!PageHighMem(page))
return page_address(page);
- debug_kmap_atomic_prot(type);
+ debug_kmap_atomic(type);
idx = type + KM_TYPE_NR*smp_processor_id();
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
@@ -120,22 +78,13 @@ void kunmap_atomic(void *kvaddr, enum km
pagefault_enable();
}
-/* This is the same as kmap_atomic() but can map memory that doesn't
+/*
+ * This is the same as kmap_atomic() but can map memory that doesn't
* have a struct page associated with it.
*/
void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
{
- enum fixed_addresses idx;
- unsigned long vaddr;
-
- pagefault_disable();
-
- idx = type + KM_TYPE_NR*smp_processor_id();
- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
- set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
- /*arch_flush_lazy_mmu_mode();*/
-
- return (void*) vaddr;
+ return kmap_atomic_prot_pfn(pfn, type, kmap_prot);
}
EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */
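+
+/*
+ * Editor's note: with the change above kmap_atomic_pfn() is a thin
+ * wrapper around kmap_atomic_prot_pfn() using the default kmap_prot.
+ * Typical usage (hypothetical caller, illustration only):
+ *
+ *	void *v = kmap_atomic_pfn(pfn, KM_USER0);
+ *	memcpy(buf, v, PAGE_SIZE);
+ *	kunmap_atomic(v, KM_USER0);
+ */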
@@ -206,3 +155,35 @@ EXPORT_SYMBOL(kmap_atomic_to_page);
#endif
EXPORT_SYMBOL(clear_highpage);
EXPORT_SYMBOL(copy_highpage);
+
+void __init set_highmem_pages_init(void)
+{
+ struct zone *zone;
+ int nid;
+
+ for_each_zone(zone) {
+ unsigned long zone_start_pfn, zone_end_pfn;
+
+ if (!is_highmem(zone))
+ continue;
+
+ zone_start_pfn = zone->zone_start_pfn;
+ zone_end_pfn = zone_start_pfn + zone->spanned_pages;
+
+ nid = zone_to_nid(zone);
+ printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
+ zone->name, nid, zone_start_pfn, zone_end_pfn);
+
+ add_highpages_with_active_regions(nid, zone_start_pfn,
+ zone_end_pfn);
+
+ /* XEN: init high-mem pages outside initial allocation. */
+ if (zone_start_pfn < xen_start_info->nr_pages)
+ zone_start_pfn = xen_start_info->nr_pages;
+ for (; zone_start_pfn < zone_end_pfn; zone_start_pfn++) {
+ ClearPageReserved(pfn_to_page(zone_start_pfn));
+ init_page_count(pfn_to_page(zone_start_pfn));
+ }
+ }
+ totalram_pages += totalhigh_pages;
+}
--- head-2010-05-25.orig/arch/x86/mm/hypervisor.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/mm/hypervisor.c 2010-03-24 15:25:06.000000000 +0100
@@ -36,6 +36,7 @@
#include <linux/vmalloc.h>
#include <asm/page.h>
#include <asm/pgtable.h>
+#include <asm/setup.h>
#include <asm/hypervisor.h>
#include <xen/balloon.h>
#include <xen/features.h>
@@ -47,6 +48,9 @@
EXPORT_SYMBOL(hypercall_page);
+shared_info_t *__read_mostly HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
+EXPORT_SYMBOL(HYPERVISOR_shared_info);
+
#define NR_MC BITS_PER_LONG
#define NR_MMU BITS_PER_LONG
#define NR_MMUEXT (BITS_PER_LONG / 4)
@@ -538,7 +542,7 @@ int xen_create_contiguous_region(
unsigned int level;
if (vstart < __START_KERNEL_map
- || vstart + (PAGE_SIZE << order) > (unsigned long)_end)
+ || vstart + (PAGE_SIZE << order) > _brk_end)
return -EINVAL;
ptep = lookup_address((unsigned long)__va(__pa(vstart)),
&level);
@@ -953,6 +957,6 @@ int write_ldt_entry(struct desc_struct *
int write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc,
int type)
{
- maddr_t mach_gp = virt_to_machine(gdt + entry);
+ maddr_t mach_gp = arbitrary_virt_to_machine(gdt + entry);
return HYPERVISOR_update_descriptor(mach_gp, *(const u64*)desc);
}
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ head-2010-05-25/arch/x86/mm/init-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -0,0 +1,459 @@
+#include <linux/ioport.h>
+#include <linux/swap.h>
+#include <linux/bootmem.h>
+
+#include <asm/cacheflush.h>
+#include <asm/e820.h>
+#include <asm/init.h>
+#include <asm/page.h>
+#include <asm/page_types.h>
+#include <asm/sections.h>
+#include <asm/setup.h>
+#include <asm/system.h>
+#include <asm/tlbflush.h>
+
+unsigned long __meminitdata e820_table_start;
+unsigned long __meminitdata e820_table_end;
+unsigned long __meminitdata e820_table_top;
+
+int after_bootmem;
+
+#if !defined(CONFIG_XEN)
+int direct_gbpages
+#ifdef CONFIG_DIRECT_GBPAGES
+ = 1
+#endif
+;
+#elif defined(CONFIG_X86_32)
+#define direct_gbpages 0
+extern unsigned long extend_init_mapping(unsigned long tables_space);
+#else
+extern void xen_finish_init_mapping(void);
+#endif
+
+static void __init find_early_table_space(unsigned long end, int use_pse,
+ int use_gbpages)
+{
+ unsigned long puds, pmds, ptes, tables;
+
+ puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
+ tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
+
+ if (use_gbpages) {
+ unsigned long extra;
+
+ extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
+ pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
+ } else
+ pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
+
+ tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
+
+ if (use_pse) {
+ unsigned long extra;
+
+ extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
+#ifdef CONFIG_X86_32
+ extra += PMD_SIZE;
+#endif
+ ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ } else
+ ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+ tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
+
+#ifdef CONFIG_X86_32
+ /* for fixmap */
+ tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
+#endif
+
+ /*
+ * RED-PEN putting page tables only on node 0 could
+ * cause a hotspot and fill up ZONE_DMA. The page tables
+ * need roughly 0.5KB per GB.
+ */
+#ifdef CONFIG_X86_32
+ e820_table_start = extend_init_mapping(tables);
+ e820_table_end = e820_table_start;
+#else /* CONFIG_X86_64 */
+ if (!e820_table_top) {
+ e820_table_start = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
+ xen_start_info->nr_pt_frames;
+ e820_table_end = e820_table_start;
+ } else {
+ /*
+ * [table_start, table_top) gets passed to reserve_early(),
+ * so we must not use table_end here, despite continuing
+ * to allocate from there. table_end possibly being below
+	 * table_start is, on the other hand, not a problem.
+ */
+ e820_table_start = e820_table_top;
+ }
+#endif
+ if (e820_table_start == -1UL)
+ panic("Cannot find space for the kernel page tables");
+
+ e820_table_top = e820_table_start + (tables >> PAGE_SHIFT);
+
+ printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
+ end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT);
+}
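+
+/*
+ * Editor's note: a user-space rerun of the sizing arithmetic above
+ * for the no-PSE, no-gbpages case (illustrative; assumes a 64-bit
+ * host). Mapping 4 GiB with 4k pages needs 4 PUD entries, 2048 PMD
+ * entries and 1M PTEs; rounded up to whole pages per level that is
+ * 4 KiB + 16 KiB + 8 MiB of page tables.
+ */
+#include <stdio.h>
+
+#define TOY_ROUND_PAGE(x) (((x) + 4095UL) & ~4095UL)
+
+int main(void)
+{
+	unsigned long end = 1UL << 32;			/* 4 GiB */
+	unsigned long puds = (end + (1UL << 30) - 1) >> 30;
+	unsigned long pmds = (end + (1UL << 21) - 1) >> 21;
+	unsigned long ptes = (end + (1UL << 12) - 1) >> 12;
+	unsigned long tables = TOY_ROUND_PAGE(puds * 8) +
+			       TOY_ROUND_PAGE(pmds * 8) +
+			       TOY_ROUND_PAGE(ptes * 8);
+
+	printf("%lu KiB of page tables\n", tables >> 10);	/* 8212 */
+	return 0;
+}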
+
+struct map_range {
+ unsigned long start;
+ unsigned long end;
+ unsigned page_size_mask;
+};
+
+#ifdef CONFIG_X86_32
+#define NR_RANGE_MR 3
+#else /* CONFIG_X86_64 */
+#define NR_RANGE_MR 5
+#endif
+
+static int __meminit save_mr(struct map_range *mr, int nr_range,
+ unsigned long start_pfn, unsigned long end_pfn,
+ unsigned long page_size_mask)
+{
+ if (start_pfn < end_pfn) {
+ if (nr_range >= NR_RANGE_MR)
+			panic("ran out of range for init_memory_mapping\n");
+ mr[nr_range].start = start_pfn<<PAGE_SHIFT;
+ mr[nr_range].end = end_pfn<<PAGE_SHIFT;
+ mr[nr_range].page_size_mask = page_size_mask;
+ nr_range++;
+ }
+
+ return nr_range;
+}
+
+#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
+static void __init init_gbpages(void)
+{
+ if (direct_gbpages && cpu_has_gbpages)
+ printk(KERN_INFO "Using GB pages for direct mapping\n");
+ else
+ direct_gbpages = 0;
+}
+#else
+static inline void init_gbpages(void)
+{
+}
+#endif
+
+/*
+ * Setup the direct mapping of the physical memory at PAGE_OFFSET.
+ * This runs before bootmem is initialized and gets pages directly from
+ * the physical memory. To access them they are temporarily mapped.
+ */
+unsigned long __init_refok init_memory_mapping(unsigned long start,
+ unsigned long end)
+{
+ unsigned long page_size_mask = 0;
+ unsigned long start_pfn, end_pfn;
+ unsigned long ret = 0;
+ unsigned long pos;
+
+ struct map_range mr[NR_RANGE_MR];
+ int nr_range, i;
+ int use_pse, use_gbpages;
+
+ printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
+
+ if (!after_bootmem)
+ init_gbpages();
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ /*
+ * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+ * This will simplify cpa(), which otherwise needs to support splitting
+ * large pages into small in interrupt context, etc.
+ */
+ use_pse = use_gbpages = 0;
+#else
+ use_pse = cpu_has_pse;
+ use_gbpages = direct_gbpages;
+#endif
+
+#ifdef CONFIG_X86_32
+#ifdef CONFIG_X86_PAE
+ set_nx();
+ if (nx_enabled)
+ printk(KERN_INFO "NX (Execute Disable) protection: active\n");
+#endif
+
+ /* Enable PSE if available */
+ if (cpu_has_pse)
+ set_in_cr4(X86_CR4_PSE);
+
+ /* Enable PGE if available */
+ if (cpu_has_pge) {
+ set_in_cr4(X86_CR4_PGE);
+ __supported_pte_mask |= _PAGE_GLOBAL;
+ }
+#endif
+
+ if (use_gbpages)
+ page_size_mask |= 1 << PG_LEVEL_1G;
+ if (use_pse)
+ page_size_mask |= 1 << PG_LEVEL_2M;
+
+ memset(mr, 0, sizeof(mr));
+ nr_range = 0;
+
+	/* head: use 4k pages if the start is not big-page aligned */
+ start_pfn = start >> PAGE_SHIFT;
+ pos = start_pfn << PAGE_SHIFT;
+#ifdef CONFIG_X86_32
+ /*
+ * Don't use a large page for the first 2/4MB of memory
+ * because there are often fixed size MTRRs in there
+ * and overlapping MTRRs into large pages can cause
+ * slowdowns.
+ */
+ if (pos == 0)
+ end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
+ else
+ end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+#else /* CONFIG_X86_64 */
+ end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+#endif
+ if (end_pfn > (end >> PAGE_SHIFT))
+ end_pfn = end >> PAGE_SHIFT;
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+ pos = end_pfn << PAGE_SHIFT;
+ }
+
+ /* big page (2M) range */
+ start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+#ifdef CONFIG_X86_32
+ end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+#else /* CONFIG_X86_64 */
+ end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
+ << (PUD_SHIFT - PAGE_SHIFT);
+ if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
+ end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
+#endif
+
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask & (1<<PG_LEVEL_2M));
+ pos = end_pfn << PAGE_SHIFT;
+ }
+
+#ifdef CONFIG_X86_64
+ /* big page (1G) range */
+ start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
+ << (PUD_SHIFT - PAGE_SHIFT);
+ end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask &
+ ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
+ pos = end_pfn << PAGE_SHIFT;
+ }
+
+	/* tail that is not big-page (1G) aligned: fall back to 2M pages */
+ start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+ end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask & (1<<PG_LEVEL_2M));
+ pos = end_pfn << PAGE_SHIFT;
+ }
+#endif
+
+	/* tail that is not big-page (2M) aligned: fall back to 4k pages */
+ start_pfn = pos>>PAGE_SHIFT;
+ end_pfn = end>>PAGE_SHIFT;
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+
+	/* try to merge contiguous ranges with the same page size */
+ for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
+ unsigned long old_start;
+ if (mr[i].end != mr[i+1].start ||
+ mr[i].page_size_mask != mr[i+1].page_size_mask)
+ continue;
+ /* move it */
+ old_start = mr[i].start;
+ memmove(&mr[i], &mr[i+1],
+ (nr_range - 1 - i) * sizeof(struct map_range));
+ mr[i--].start = old_start;
+ nr_range--;
+ }
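+
+	/*
+	 * Editor's note, worked example (illustrative, 64-bit with PSE
+	 * only): asked to map [0, 6 MiB + 8 KiB), the splitting above
+	 * emits one 2M-capable range [0, 6M) and a 4k tail
+	 * [6M, 6M + 8K); when neighbouring ranges end up with the same
+	 * page_size_mask, the merge loop just above folds them into one.
+	 */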
+
+ for (i = 0; i < nr_range; i++)
+ printk(KERN_DEBUG " %010lx - %010lx page %s\n",
+ mr[i].start, mr[i].end,
+ (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
+ (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
+
+ /*
+ * Find space for the kernel direct mapping tables.
+ *
+ * Later we should allocate these tables in the local node of the
+ * memory mapped. Unfortunately this is done currently before the
+ * nodes are discovered.
+ */
+ if (!after_bootmem)
+ find_early_table_space(end, use_pse, use_gbpages);
+
+#ifdef CONFIG_X86_32
+ for (i = 0; i < nr_range; i++)
+ kernel_physical_mapping_init(mr[i].start, mr[i].end,
+ mr[i].page_size_mask);
+ ret = end;
+#else /* CONFIG_X86_64 */
+#define addr_to_page(addr) \
+ ((unsigned long *) \
+ ((mfn_to_pfn(((addr) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT) \
+ << PAGE_SHIFT) + __START_KERNEL_map))
+
+ if (!start) {
+ unsigned long addr, va = __START_KERNEL_map;
+ unsigned long *page = (unsigned long *)init_level4_pgt;
+
+ /* Kill mapping of memory below _text. */
+ while (va < (unsigned long)&_text) {
+ if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
+ BUG();
+ va += PAGE_SIZE;
+ }
+
+ /* Blow away any spurious initial mappings. */
+ va = __START_KERNEL_map + (e820_table_start << PAGE_SHIFT);
+
+ addr = page[pgd_index(va)];
+ page = addr_to_page(addr);
+ addr = page[pud_index(va)];
+ page = addr_to_page(addr);
+ while (pmd_index(va) | pte_index(va)) {
+ if (pmd_none(*(pmd_t *)&page[pmd_index(va)]))
+ break;
+ if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
+ BUG();
+ va += PAGE_SIZE;
+ }
+ }
+
+ for (i = 0; i < nr_range; i++)
+ ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
+ mr[i].page_size_mask);
+#undef addr_to_page
+#endif
+
+#ifdef CONFIG_X86_32
+ early_ioremap_page_table_range_init();
+#endif
+
+#ifdef CONFIG_X86_64
+ BUG_ON(e820_table_end > e820_table_top);
+ if (!start)
+ xen_finish_init_mapping();
+ else
+#endif
+ if (e820_table_end < e820_table_top)
+ /* Disable the 'table_end' allocator. */
+ e820_table_top = e820_table_end;
+
+ __flush_tlb_all();
+
+ if (!after_bootmem && e820_table_top > e820_table_start)
+ reserve_early(e820_table_start << PAGE_SHIFT,
+ e820_table_top << PAGE_SHIFT, "PGTABLE");
+
+ if (!after_bootmem)
+ early_memtest(start, end);
+
+ return ret >> PAGE_SHIFT;
+}
+
+
+/*
+ * devmem_is_allowed() checks to see if /dev/mem access to a certain address
+ * is valid. The argument is a physical page number.
+ *
+ *
+ * On x86, access has to be given to the first megabyte of RAM because that area
+ * contains BIOS code and data regions used by X and dosemu and similar apps.
+ * Access has to be given to non-kernel-RAM areas as well; these contain the PCI
+ * mmio resources as well as potential BIOS/ACPI data regions.
+int devmem_is_allowed(unsigned long pagenr)
+{
+ if (pagenr <= 256)
+ return 1;
+ if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
+ return 0;
+ if (mfn_to_local_pfn(pagenr) >= max_pfn)
+ return 1;
+ return 0;
+}
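+
+/*
+ * Editor's note, worked example: page 16 (the 64 KiB BIOS data area)
+ * passes the pagenr <= 256 test and is always allowed; page 512
+ * (2 MiB) is allowed only when it is not exclusive IO memory and, in
+ * this Xen variant, when mfn_to_local_pfn() says it is not local RAM.
+ */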
+
+void free_init_pages(char *what, unsigned long begin, unsigned long end)
+{
+ unsigned long addr = begin;
+
+ if (addr >= end)
+ return;
+
+ /*
+	 * If debugging page accesses, then do not free this memory but
+	 * mark it not present - any buggy init-section access will
+ * create a kernel page fault:
+ */
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
+ begin, PAGE_ALIGN(end));
+ set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
+#else
+ /*
+ * We just marked the kernel text read only above, now that
+ * we are going to free part of that, we need to make that
+ * writeable first.
+ */
+ set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
+
+ printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
+
+ for (; addr < end; addr += PAGE_SIZE) {
+ ClearPageReserved(virt_to_page(addr));
+ init_page_count(virt_to_page(addr));
+ memset((void *)(addr & ~(PAGE_SIZE-1)),
+ POISON_FREE_INITMEM, PAGE_SIZE);
+#ifdef CONFIG_X86_64
+ if (addr >= __START_KERNEL_map) {
+ /* make_readonly() reports all kernel addresses. */
+ if (HYPERVISOR_update_va_mapping((unsigned long)__va(__pa(addr)),
+ pfn_pte(__pa(addr) >> PAGE_SHIFT,
+ PAGE_KERNEL),
+ 0))
+ BUG();
+ if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
+ BUG();
+ }
+#endif
+ free_page(addr);
+ totalram_pages++;
+ }
+#endif
+}
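+
+/*
+ * Editor's note: the CONFIG_X86_64 branch above is Xen-specific. The
+ * __START_KERNEL_map alias of the kernel image is read-only under
+ * Xen, so before a page is freed its low __va() alias is remapped
+ * read-write and the high alias is unmapped, both via
+ * HYPERVISOR_update_va_mapping().
+ */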
+
+void free_initmem(void)
+{
+ free_init_pages("unused kernel memory",
+ (unsigned long)(&__init_begin),
+ (unsigned long)(&__init_end));
+}
+
+#ifdef CONFIG_BLK_DEV_INITRD
+void free_initrd_mem(unsigned long start, unsigned long end)
+{
+ free_init_pages("initrd memory", start, end);
+}
+#endif
--- head-2010-05-25.orig/arch/x86/mm/init_32-xen.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/mm/init_32-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -52,9 +52,7 @@
#include <asm/swiotlb.h>
#include <asm/setup.h>
#include <asm/cacheflush.h>
-#include <asm/smp.h>
-
-unsigned int __VMALLOC_RESERVE = 128 << 20;
+#include <asm/init.h>
unsigned long max_low_pfn_mapped;
unsigned long max_pfn_mapped;
@@ -64,19 +62,14 @@ unsigned long highstart_pfn, highend_pfn
static noinline int do_test_wp_bit(void);
-
-static unsigned long __initdata table_start;
-static unsigned long __initdata table_end;
-static unsigned long __initdata table_top;
-
-static int __initdata after_init_bootmem;
+bool __read_mostly __vmalloc_start_set = false;
static __init void *alloc_low_page(void)
{
- unsigned long pfn = table_end++;
+ unsigned long pfn = e820_table_end++;
void *adr;
- if (pfn >= table_top)
+ if (pfn >= e820_table_top)
panic("alloc_low_page: ran out of memory");
adr = __va(pfn * PAGE_SIZE);
@@ -96,7 +89,7 @@ static pmd_t * __init one_md_table_init(
#ifdef CONFIG_X86_PAE
if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
- if (after_init_bootmem)
+ if (after_bootmem)
pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
else
pmd_table = (pmd_t *)alloc_low_page();
@@ -128,7 +121,7 @@ static pte_t * __init one_page_table_ini
#endif
pte_t *page_table = NULL;
- if (after_init_bootmem) {
+ if (after_bootmem) {
#ifdef CONFIG_DEBUG_PAGEALLOC
page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
#endif
@@ -148,6 +141,23 @@ static pte_t * __init one_page_table_ini
return pte_offset_kernel(pmd, 0);
}
+pmd_t * __init populate_extra_pmd(unsigned long vaddr)
+{
+ int pgd_idx = pgd_index(vaddr);
+ int pmd_idx = pmd_index(vaddr);
+
+ return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx;
+}
+
+pte_t * __init populate_extra_pte(unsigned long vaddr)
+{
+ int pte_idx = pte_index(vaddr);
+ pmd_t *pmd;
+
+ pmd = populate_extra_pmd(vaddr);
+ return one_page_table_init(pmd) + pte_idx;
+}
+
static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
unsigned long vaddr, pte_t *lastpte)
{
@@ -164,12 +174,12 @@ static pte_t *__init page_table_kmap_che
if (pmd_idx_kmap_begin != pmd_idx_kmap_end
&& (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
&& (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
- && ((__pa(pte) >> PAGE_SHIFT) < table_start
- || (__pa(pte) >> PAGE_SHIFT) >= table_end)) {
+ && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start
+ || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) {
pte_t *newpte;
int i;
- BUG_ON(after_init_bootmem);
+ BUG_ON(after_bootmem);
newpte = alloc_low_page();
for (i = 0; i < PTRS_PER_PTE; i++)
set_pte(newpte + i, pte[i]);
@@ -244,11 +254,14 @@ static inline int is_kernel_text(unsigne
* of max_low_pfn pages, by creating page tables starting from address
* PAGE_OFFSET:
*/
-static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
- unsigned long start_pfn,
- unsigned long end_pfn,
- int use_pse)
+unsigned long __init
+kernel_physical_mapping_init(unsigned long start,
+ unsigned long end,
+ unsigned long page_size_mask)
{
+ int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
+ unsigned long start_pfn, end_pfn;
+ pgd_t *pgd_base = swapper_pg_dir;
int pgd_idx, pmd_idx, pte_ofs;
unsigned long pfn;
pgd_t *pgd;
@@ -257,6 +270,9 @@ static void __init kernel_physical_mappi
unsigned pages_2m, pages_4k;
int mapping_iter;
+ start_pfn = start >> PAGE_SHIFT;
+ end_pfn = end >> PAGE_SHIFT;
+
/*
* First iteration will setup identity mapping using large/small pages
* based on use_pse, with other attributes same as set by
@@ -391,26 +407,6 @@ repeat:
mapping_iter = 2;
goto repeat;
}
-}
-
-/*
- * devmem_is_allowed() checks to see if /dev/mem access to a certain address
- * is valid. The argument is a physical page number.
- *
- *
- * On x86, access has to be given to the first megabyte of ram because that area
- * contains bios code and data regions used by X and dosemu and similar apps.
- * Access has to be given to non-kernel-ram areas as well, these contain the PCI
- * mmio resources as well as potential bios/acpi data regions.
- */
-int devmem_is_allowed(unsigned long pagenr)
-{
- if (pagenr <= 256)
- return 1;
- if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
- return 0;
- if (mfn_to_local_pfn(pagenr) >= max_pfn)
- return 1;
return 0;
}
@@ -506,30 +502,10 @@ void __init add_highpages_with_active_re
work_with_active_regions(nid, add_highpages_work_fn, &data);
}
-#ifndef CONFIG_NUMA
-static void __init set_highmem_pages_init(void)
-{
- int pfn;
-
- add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
-
- /* XEN: init high-mem pages outside initial allocation. */
- for (pfn = xen_start_info->nr_pages; pfn < highend_pfn; pfn++) {
- ClearPageReserved(pfn_to_page(pfn));
- init_page_count(pfn_to_page(pfn));
- }
-
- totalram_pages += totalhigh_pages;
-}
-#endif /* !CONFIG_NUMA */
-
#else
static inline void permanent_kmaps_init(pgd_t *pgd_base)
{
}
-static inline void set_highmem_pages_init(void)
-{
-}
#endif /* CONFIG_HIGHMEM */
pgd_t *swapper_pg_dir;
@@ -553,8 +529,9 @@ pgd_t *swapper_pg_dir;
* be partially populated, and so it avoids stomping on any existing
* mappings.
*/
-static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
+void __init early_ioremap_page_table_range_init(void)
{
+ pgd_t *pgd_base = swapper_pg_dir;
unsigned long vaddr, end;
/*
@@ -649,7 +626,7 @@ static int __init noexec_setup(char *str
}
early_param("noexec", noexec_setup);
-static void __init set_nx(void)
+void __init set_nx(void)
{
unsigned int v[4], l, h;
@@ -685,75 +662,97 @@ static int __init parse_highmem(char *ar
}
early_param("highmem", parse_highmem);
+#define MSG_HIGHMEM_TOO_BIG \
+ "highmem size (%luMB) is bigger than pages available (%luMB)!\n"
+
+#define MSG_LOWMEM_TOO_SMALL \
+ "highmem size (%luMB) results in <64MB lowmem, ignoring it!\n"
/*
- * Determine low and high memory ranges:
+ * All of RAM fits into lowmem - but if user wants highmem
+ * artificially via the highmem=x boot parameter then create
+ * it:
*/
-void __init find_low_pfn_range(void)
+void __init lowmem_pfn_init(void)
{
- /* it could update max_pfn */
-
/* max_low_pfn is 0, we already have early_res support */
-
max_low_pfn = max_pfn;
- if (max_low_pfn > MAXMEM_PFN) {
- if (highmem_pages == -1)
- highmem_pages = max_pfn - MAXMEM_PFN;
- if (highmem_pages + MAXMEM_PFN < max_pfn)
- max_pfn = MAXMEM_PFN + highmem_pages;
- if (highmem_pages + MAXMEM_PFN > max_pfn) {
- printk(KERN_WARNING "only %luMB highmem pages "
- "available, ignoring highmem size of %uMB.\n",
- pages_to_mb(max_pfn - MAXMEM_PFN),
+
+ if (highmem_pages == -1)
+ highmem_pages = 0;
+#ifdef CONFIG_HIGHMEM
+ if (highmem_pages >= max_pfn) {
+ printk(KERN_ERR MSG_HIGHMEM_TOO_BIG,
+ pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
+ highmem_pages = 0;
+ }
+ if (highmem_pages) {
+ if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE) {
+ printk(KERN_ERR MSG_LOWMEM_TOO_SMALL,
pages_to_mb(highmem_pages));
highmem_pages = 0;
}
- max_low_pfn = MAXMEM_PFN;
+ max_low_pfn -= highmem_pages;
+ }
+#else
+ if (highmem_pages)
+ printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
+#endif
+}
+
+#define MSG_HIGHMEM_TOO_SMALL \
+ "only %luMB highmem pages available, ignoring highmem size of %luMB!\n"
+
+#define MSG_HIGHMEM_TRIMMED \
+ "Warning: only 4GB will be used. Use a HIGHMEM64G enabled kernel!\n"
+/*
+ * We have more RAM than fits into lowmem - we try to put it into
+ * highmem, also taking the highmem=x boot parameter into account:
+ */
+void __init highmem_pfn_init(void)
+{
+ max_low_pfn = MAXMEM_PFN;
+
+ if (highmem_pages == -1)
+ highmem_pages = max_pfn - MAXMEM_PFN;
+
+ if (highmem_pages + MAXMEM_PFN < max_pfn)
+ max_pfn = MAXMEM_PFN + highmem_pages;
+
+ if (highmem_pages + MAXMEM_PFN > max_pfn) {
+ printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL,
+ pages_to_mb(max_pfn - MAXMEM_PFN),
+ pages_to_mb(highmem_pages));
+ highmem_pages = 0;
+ }
#ifndef CONFIG_HIGHMEM
- /* Maximum memory usable is what is directly addressable */
- printk(KERN_WARNING "Warning only %ldMB will be used.\n",
- MAXMEM>>20);
- if (max_pfn > MAX_NONPAE_PFN)
- printk(KERN_WARNING
- "Use a HIGHMEM64G enabled kernel.\n");
- else
- printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
- max_pfn = MAXMEM_PFN;
+ /* Maximum memory usable is what is directly addressable */
+ printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20);
+ if (max_pfn > MAX_NONPAE_PFN)
+ printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
+ else
+ printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
+ max_pfn = MAXMEM_PFN;
#else /* !CONFIG_HIGHMEM */
#ifndef CONFIG_HIGHMEM64G
- if (max_pfn > MAX_NONPAE_PFN) {
- max_pfn = MAX_NONPAE_PFN;
- printk(KERN_WARNING "Warning only 4GB will be used."
- "Use a HIGHMEM64G enabled kernel.\n");
- }
+ if (max_pfn > MAX_NONPAE_PFN) {
+ max_pfn = MAX_NONPAE_PFN;
+ printk(KERN_WARNING MSG_HIGHMEM_TRIMMED);
+ }
#endif /* !CONFIG_HIGHMEM64G */
#endif /* !CONFIG_HIGHMEM */
- } else {
- if (highmem_pages == -1)
- highmem_pages = 0;
-#ifdef CONFIG_HIGHMEM
- if (highmem_pages >= max_pfn) {
- printk(KERN_ERR "highmem size specified (%uMB) is "
- "bigger than pages available (%luMB)!.\n",
- pages_to_mb(highmem_pages),
- pages_to_mb(max_pfn));
- highmem_pages = 0;
- }
- if (highmem_pages) {
- if (max_low_pfn - highmem_pages <
- 64*1024*1024/PAGE_SIZE){
- printk(KERN_ERR "highmem size %uMB results in "
- "smaller than 64MB lowmem, ignoring it.\n"
- , pages_to_mb(highmem_pages));
- highmem_pages = 0;
- }
- max_low_pfn -= highmem_pages;
- }
-#else
- if (highmem_pages)
- printk(KERN_ERR "ignoring highmem size on non-highmem"
- " kernel!\n");
-#endif
- }
+}
+
+/*
+ * Determine low and high memory ranges:
+ */
+void __init find_low_pfn_range(void)
+{
+ /* it could update max_pfn */
+
+ if (max_pfn <= MAXMEM_PFN)
+ lowmem_pfn_init();
+ else
+ highmem_pfn_init();
}
#ifndef CONFIG_NEED_MULTIPLE_NODES
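
Reviewer note: after the split above, find_low_pfn_range() merely dispatches on whether all RAM fits below MAXMEM_PFN; lowmem_pfn_init() handles artificial highmem=, highmem_pfn_init() handles real highmem. A simplified userspace sketch of the resulting low/high split follows; the 896MiB constant is an assumed i386 default, and the 64MB lowmem floor plus all error reporting are omitted.

#include <stdio.h>

#define PAGE_SHIFT 12
#define MAXMEM_PFN (896UL << (20 - PAGE_SHIFT)) /* assumed 896MiB lowmem limit */

/* Model of the dispatch above: returns max_low_pfn, may trim *max_pfn. */
static unsigned long split_low_high(unsigned long *max_pfn,
				    long highmem_pages /* -1 = auto */)
{
	unsigned long max_low_pfn;

	if (*max_pfn <= MAXMEM_PFN) {		/* lowmem_pfn_init() path */
		max_low_pfn = *max_pfn;
		if (highmem_pages > 0 && max_low_pfn > (unsigned long)highmem_pages)
			max_low_pfn -= highmem_pages;	/* artificial highmem */
	} else {				/* highmem_pfn_init() path */
		max_low_pfn = MAXMEM_PFN;
		if (highmem_pages == -1)
			highmem_pages = *max_pfn - MAXMEM_PFN;
		if ((unsigned long)highmem_pages + MAXMEM_PFN < *max_pfn)
			*max_pfn = MAXMEM_PFN + highmem_pages;
	}
	return max_low_pfn;
}

int main(void)
{
	unsigned long max_pfn = 2UL << (30 - PAGE_SHIFT);	/* 2GiB box */
	unsigned long low = split_low_high(&max_pfn, -1);
	printf("low=%lu pfns, total=%lu pfns\n", low, max_pfn);
	return 0;
}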
@@ -779,6 +778,8 @@ void __init initmem_init(unsigned long s
#ifdef CONFIG_FLATMEM
max_mapnr = num_physpages;
#endif
+ __vmalloc_start_set = true;
+
printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
pages_to_mb(max_low_pfn));
@@ -800,40 +801,70 @@ static void __init zone_sizes_init(void)
free_area_init_nodes(max_zone_pfns);
}
+static unsigned long __init setup_node_bootmem(int nodeid,
+ unsigned long start_pfn,
+ unsigned long end_pfn,
+ unsigned long bootmap)
+{
+ unsigned long bootmap_size;
+
+ /* don't touch min_low_pfn */
+ bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
+ bootmap >> PAGE_SHIFT,
+ start_pfn, end_pfn);
+ printk(KERN_INFO " node %d low ram: %08lx - %08lx\n",
+ nodeid, start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
+ printk(KERN_INFO " node %d bootmap %08lx - %08lx\n",
+ nodeid, bootmap, bootmap + bootmap_size);
+ free_bootmem_with_active_regions(nodeid, end_pfn);
+ early_res_to_bootmem(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
+
+ return bootmap + bootmap_size;
+}
+
void __init setup_bootmem_allocator(void)
{
- int i;
+ int nodeid;
unsigned long bootmap_size, bootmap;
- unsigned long end_pfn = min(max_low_pfn, xen_start_info->nr_pages);
+ unsigned long end_xen_pfn = min(max_low_pfn, xen_start_info->nr_pages);
/*
* Initialize the boot-time allocator (with low memory only):
*/
- bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
- bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
- min(max_pfn_mapped, xen_start_info->nr_pages)<<PAGE_SHIFT,
+ bootmap_size = bootmem_bootmap_pages(end_xen_pfn)<<PAGE_SHIFT;
+ bootmap = find_e820_area(0, min(max_pfn_mapped,
+ xen_start_info->nr_pages)<<PAGE_SHIFT,
bootmap_size, PAGE_SIZE);
if (bootmap == -1L)
panic("Cannot find bootmem map of size %ld\n", bootmap_size);
reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
- /* don't touch min_low_pfn */
- bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
- min_low_pfn, end_pfn);
printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
max_pfn_mapped<<PAGE_SHIFT);
- printk(KERN_INFO " low ram: %08lx - %08lx\n",
- min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
- printk(KERN_INFO " bootmap %08lx - %08lx\n",
- bootmap, bootmap + bootmap_size);
- for_each_online_node(i)
- free_bootmem_with_active_regions(i, end_pfn);
- early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
+ printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
- after_init_bootmem = 1;
+ for_each_online_node(nodeid) {
+ unsigned long start_pfn, end_pfn;
+
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+ start_pfn = node_start_pfn[nodeid];
+ end_pfn = node_end_pfn[nodeid];
+ if (start_pfn > end_xen_pfn)
+ continue;
+ if (end_pfn > end_xen_pfn)
+ end_pfn = end_xen_pfn;
+#else
+ start_pfn = 0;
+ end_pfn = end_xen_pfn;
+#endif
+ bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn,
+ bootmap);
+ }
+
+ after_bootmem = 1;
}
-static unsigned long __init extend_init_mapping(unsigned long tables_space)
+unsigned long __init extend_init_mapping(unsigned long tables_space)
{
unsigned long start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT)
+ xen_start_info->nr_pt_frames;
@@ -885,133 +916,6 @@ static unsigned long __init extend_init_
return start_pfn;
}
-static void __init find_early_table_space(unsigned long end, int use_pse)
-{
- unsigned long puds, pmds, ptes, tables;
-
- puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
- tables = PAGE_ALIGN(puds * sizeof(pud_t));
-
- pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
- tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
-
- if (use_pse) {
- unsigned long extra;
-
- extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
- extra += PMD_SIZE;
- ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
- } else
- ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
-
- tables += PAGE_ALIGN(ptes * sizeof(pte_t));
-
- /* for fixmap */
- tables += PAGE_ALIGN(__end_of_fixed_addresses * sizeof(pte_t));
-
- table_start = extend_init_mapping(tables);
-
- table_end = table_start;
- table_top = table_start + (tables>>PAGE_SHIFT);
-
- printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
- end, table_start << PAGE_SHIFT,
- (table_start << PAGE_SHIFT) + tables);
-}
-
-unsigned long __init_refok init_memory_mapping(unsigned long start,
- unsigned long end)
-{
- pgd_t *pgd_base = swapper_pg_dir;
- unsigned long start_pfn, end_pfn;
- unsigned long big_page_start;
-#ifdef CONFIG_DEBUG_PAGEALLOC
- /*
- * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
- * This will simplify cpa(), which otherwise needs to support splitting
- * large pages into small in interrupt context, etc.
- */
- int use_pse = 0;
-#else
- int use_pse = cpu_has_pse;
-#endif
-
- /*
- * Find space for the kernel direct mapping tables.
- */
- if (!after_init_bootmem)
- find_early_table_space(end, use_pse);
-
-#ifdef CONFIG_X86_PAE
- set_nx();
- if (nx_enabled)
- printk(KERN_INFO "NX (Execute Disable) protection: active\n");
-#endif
-
- /* Enable PSE if available */
- if (cpu_has_pse)
- set_in_cr4(X86_CR4_PSE);
-
- /* Enable PGE if available */
- if (cpu_has_pge) {
- set_in_cr4(X86_CR4_PGE);
- __supported_pte_mask |= _PAGE_GLOBAL;
- }
-
- /*
- * Don't use a large page for the first 2/4MB of memory
- * because there are often fixed size MTRRs in there
- * and overlapping MTRRs into large pages can cause
- * slowdowns.
- */
- big_page_start = PMD_SIZE;
-
- if (start < big_page_start) {
- start_pfn = start >> PAGE_SHIFT;
- end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
- } else {
- /* head is not big page alignment ? */
- start_pfn = start >> PAGE_SHIFT;
- end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
- }
- if (start_pfn < end_pfn)
- kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
-
- /* big page range */
- start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
- if (start_pfn < (big_page_start >> PAGE_SHIFT))
- start_pfn = big_page_start >> PAGE_SHIFT;
- end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
- if (start_pfn < end_pfn)
- kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
- use_pse);
-
- /* tail is not big page alignment ? */
- start_pfn = end_pfn;
- if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
- end_pfn = end >> PAGE_SHIFT;
- if (start_pfn < end_pfn)
- kernel_physical_mapping_init(pgd_base, start_pfn,
- end_pfn, 0);
- }
-
- early_ioremap_page_table_range_init(pgd_base);
-
- __flush_tlb_all();
-
- if (!after_init_bootmem)
- reserve_early(table_start << PAGE_SHIFT,
- table_end << PAGE_SHIFT, "PGTABLE");
-
- if (!after_init_bootmem)
- early_memtest(start, end);
-
- return end >> PAGE_SHIFT;
-}
-
-
/*
* paging_init() sets up the page tables - note that the first 8MB are
* already mapped by head.S.
@@ -1215,17 +1119,47 @@ static noinline int do_test_wp_bit(void)
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);
+static int kernel_set_to_readonly;
+
+void set_kernel_text_rw(void)
+{
+ unsigned long start = PFN_ALIGN(_text);
+ unsigned long size = PFN_ALIGN(_etext) - start;
+
+ if (!kernel_set_to_readonly)
+ return;
+
+ pr_debug("Set kernel text: %lx - %lx for read write\n",
+ start, start+size);
+
+ set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
+}
+
+void set_kernel_text_ro(void)
+{
+ unsigned long start = PFN_ALIGN(_text);
+ unsigned long size = PFN_ALIGN(_etext) - start;
+
+ if (!kernel_set_to_readonly)
+ return;
+
+ pr_debug("Set kernel text: %lx - %lx for read only\n",
+ start, start+size);
+
+ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+}
+
void mark_rodata_ro(void)
{
unsigned long start = PFN_ALIGN(_text);
unsigned long size = PFN_ALIGN(_etext) - start;
-#ifndef CONFIG_DYNAMIC_FTRACE
- /* Dynamic tracing modifies the kernel text section */
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
printk(KERN_INFO "Write protecting the kernel text: %luk\n",
size >> 10);
+ kernel_set_to_readonly = 1;
+
#ifdef CONFIG_CPA_DEBUG
printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
start, start+size);
@@ -1234,7 +1168,6 @@ void mark_rodata_ro(void)
printk(KERN_INFO "Testing CPA: write protecting again\n");
set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
#endif
-#endif /* CONFIG_DYNAMIC_FTRACE */
start += size;
size = (unsigned long)__end_rodata - start;
@@ -1253,52 +1186,6 @@ void mark_rodata_ro(void)
}
#endif
-void free_init_pages(char *what, unsigned long begin, unsigned long end)
-{
-#ifdef CONFIG_DEBUG_PAGEALLOC
- /*
- * If debugging page accesses then do not free this memory but
- * mark them not present - any buggy init-section access will
- * create a kernel page fault:
- */
- printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
- begin, PAGE_ALIGN(end));
- set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
-#else
- unsigned long addr;
-
- /*
- * We just marked the kernel text read only above, now that
- * we are going to free part of that, we need to make that
- * writeable first.
- */
- set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
-
- for (addr = begin; addr < end; addr += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(addr));
- init_page_count(virt_to_page(addr));
- memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
- free_page(addr);
- totalram_pages++;
- }
- printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
-#endif
-}
-
-void free_initmem(void)
-{
- free_init_pages("unused kernel memory",
- (unsigned long)(&__init_begin),
- (unsigned long)(&__init_end));
-}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_init_pages("initrd memory", start, end);
-}
-#endif
-
int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
int flags)
{
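
Reviewer note: the rewritten setup_bootmem_allocator() above walks every online node through setup_node_bootmem(), but clips each node's range to the frames Xen actually populated (xen_start_info->nr_pages), since pages beyond that are not yet backed. A self-contained model of that clamp, with made-up pfn values:

#include <stdio.h>

struct node { unsigned long start_pfn, end_pfn; };

/* Model of the per-node clamp above: Xen only populated the first
 * nr_pages frames, so bootmem must not cover pfns beyond that. */
static void clamp_nodes(struct node *n, int nr, unsigned long end_xen_pfn)
{
	for (int i = 0; i < nr; i++) {
		if (n[i].start_pfn > end_xen_pfn) {	/* node entirely beyond limit */
			n[i].end_pfn = n[i].start_pfn;
			continue;
		}
		if (n[i].end_pfn > end_xen_pfn)
			n[i].end_pfn = end_xen_pfn;
	}
}

int main(void)
{
	struct node nodes[2] = { {0, 0x20000}, {0x20000, 0x40000} };
	clamp_nodes(nodes, 2, 0x28000);	/* guest given 640MiB of a 1GiB box */
	for (int i = 0; i < 2; i++)
		printf("node %d: %#lx-%#lx\n", i, nodes[i].start_pfn, nodes[i].end_pfn);
	return 0;
}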
--- head-2010-05-25.orig/arch/x86/mm/init_64-xen.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/mm/init_64-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -51,6 +51,8 @@
#include <asm/kdebug.h>
#include <asm/numa.h>
#include <asm/cacheflush.h>
+#include <asm/init.h>
+#include <asm/setup.h>
#include <xen/features.h>
@@ -67,8 +69,6 @@ unsigned int __kernel_page_user;
EXPORT_SYMBOL(__kernel_page_user);
#endif
-int after_bootmem;
-
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
extern pmd_t level2_fixmap_pgt[PTRS_PER_PMD];
@@ -127,12 +127,6 @@ void __meminit early_make_page_readonly(
}
#ifndef CONFIG_XEN
-int direct_gbpages
-#ifdef CONFIG_DIRECT_GBPAGES
- = 1
-#endif
-;
-
static int __init parse_direct_gbpages_off(char *arg)
{
direct_gbpages = 0;
@@ -154,14 +148,10 @@ early_param("gbpages", parse_direct_gbpa
* around without checking the pgd every time.
*/
-static unsigned long __meminitdata table_start;
-static unsigned long __meminitdata table_cur;
-static unsigned long __meminitdata table_top;
-
pteval_t __supported_pte_mask __read_mostly = ~0UL;
EXPORT_SYMBOL_GPL(__supported_pte_mask);
-static int do_not_nx __cpuinitdata;
+static int disable_nx __cpuinitdata;
/*
* noexec=on|off
@@ -176,9 +166,9 @@ static int __init nonx_setup(char *str)
return -EINVAL;
if (!strncmp(str, "on", 2)) {
__supported_pte_mask |= _PAGE_NX;
- do_not_nx = 0;
+ disable_nx = 0;
} else if (!strncmp(str, "off", 3)) {
- do_not_nx = 1;
+ disable_nx = 1;
__supported_pte_mask &= ~_PAGE_NX;
}
return 0;
@@ -190,7 +180,7 @@ void __cpuinit check_efer(void)
unsigned long efer;
rdmsrl(MSR_EFER, efer);
- if (!(efer & EFER_NX) || do_not_nx)
+ if (!(efer & EFER_NX) || disable_nx)
__supported_pte_mask &= ~_PAGE_NX;
}
@@ -224,9 +214,9 @@ static __ref void *spp_getpage(void)
if (after_bootmem)
ptr = (void *) get_zeroed_page(GFP_ATOMIC);
- else if (table_cur < table_top) {
- ptr = __va(table_cur << PAGE_SHIFT);
- table_cur++;
+ else if (e820_table_end < e820_table_top) {
+ ptr = __va(e820_table_end << PAGE_SHIFT);
+ e820_table_end++;
memset(ptr, 0, PAGE_SIZE);
} else
ptr = alloc_bootmem_pages(PAGE_SIZE);
@@ -241,36 +231,54 @@ static __ref void *spp_getpage(void)
return ptr;
}
-void
-set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
+static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
{
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
+ if (pgd_none(*pgd)) {
+ pud_t *pud = (pud_t *)spp_getpage();
+ make_page_readonly(pud, XENFEAT_writable_page_tables);
+ pgd_populate(&init_mm, pgd, pud);
+ if (pud != pud_offset(pgd, 0))
+ printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
+ pud, pud_offset(pgd, 0));
+ }
+ return pud_offset(pgd, vaddr);
+}
- pud = pud_page + pud_index(vaddr);
+static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
+{
if (pud_none(*pud)) {
- pmd = (pmd_t *) spp_getpage();
+ pmd_t *pmd = (pmd_t *) spp_getpage();
make_page_readonly(pmd, XENFEAT_writable_page_tables);
pud_populate(&init_mm, pud, pmd);
- if (pmd != pmd_offset(pud, 0)) {
+ if (pmd != pmd_offset(pud, 0))
printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
- pmd, pmd_offset(pud, 0));
- return;
- }
+ pmd, pmd_offset(pud, 0));
}
- pmd = pmd_offset(pud, vaddr);
+ return pmd_offset(pud, vaddr);
+}
+
+static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
+{
if (pmd_none(*pmd)) {
- pte = (pte_t *) spp_getpage();
+ pte_t *pte = (pte_t *) spp_getpage();
make_page_readonly(pte, XENFEAT_writable_page_tables);
pmd_populate_kernel(&init_mm, pmd, pte);
- if (pte != pte_offset_kernel(pmd, 0)) {
+ if (pte != pte_offset_kernel(pmd, 0))
printk(KERN_ERR "PAGETABLE BUG #02!\n");
- return;
- }
}
+ return pte_offset_kernel(pmd, vaddr);
+}
+
+void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
+{
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pud = pud_page + pud_index(vaddr);
+ pmd = fill_pmd(pud, vaddr);
+ pte = fill_pte(pmd, vaddr);
- pte = pte_offset_kernel(pmd, vaddr);
set_pte(pte, new_pte);
/*
@@ -280,8 +288,7 @@ set_pte_vaddr_pud(pud_t *pud_page, unsig
__flush_tlb_one(vaddr);
}
-void
-set_pte_vaddr(unsigned long vaddr, pte_t pteval)
+void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
pgd_t *pgd;
pud_t *pud_page;
@@ -298,6 +305,24 @@ set_pte_vaddr(unsigned long vaddr, pte_t
set_pte_vaddr_pud(pud_page, vaddr, pteval);
}
+pmd_t * __init populate_extra_pmd(unsigned long vaddr)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+
+ pgd = pgd_offset_k(vaddr);
+ pud = fill_pud(pgd, vaddr);
+ return fill_pmd(pud, vaddr);
+}
+
+pte_t * __init populate_extra_pte(unsigned long vaddr)
+{
+ pmd_t *pmd;
+
+ pmd = populate_extra_pmd(vaddr);
+ return fill_pte(pmd, vaddr);
+}
+
#ifndef CONFIG_XEN
/*
* Create large page table mappings for a range of physical addresses.
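
Reviewer note: fill_pud()/fill_pmd()/fill_pte() above all follow one pattern: if the intermediate level is missing (*_none()), allocate it via spp_getpage(), hook it in, then descend; populate_extra_pmd()/pte() are just compositions of these. A toy two-level userspace analogue, with calloc() standing in for spp_getpage() and an assumed 512 entries per level:

#include <stdio.h>
#include <stdlib.h>

#define ENTRIES 512	/* assumed entries per level, demo only */

typedef struct { void *slot[ENTRIES]; } level_t;

/* Model of the fill_*() pattern: allocate a missing intermediate
 * level on first use, then return the next level's slot array. */
static void **fill_level(level_t *dir, unsigned idx)
{
	if (!dir->slot[idx])			/* *_none() case */
		dir->slot[idx] = calloc(1, sizeof(level_t));	/* spp_getpage() stand-in */
	return ((level_t *)dir->slot[idx])->slot;
}

int main(void)
{
	level_t pgd = {0};
	void **pmd = fill_level(&pgd, 3);	/* like fill_pud() + fill_pmd() */
	pmd[7] = "present";			/* like set_pte() */
	printf("entry: %s\n", (char *)pmd[7]);
	return 0;
}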
@@ -380,9 +405,9 @@ static __ref void *alloc_low_page(unsign
return adr;
}
- BUG_ON(!table_cur);
- pfn = table_cur++;
- if (pfn >= table_top)
+ BUG_ON(!e820_table_end);
+ pfn = e820_table_end++;
+ if (pfn >= e820_table_top)
panic("alloc_low_page: ran out of memory");
adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
@@ -407,13 +432,13 @@ static inline int __meminit make_readonl
/* Make new page tables read-only on the first pass. */
if (!xen_feature(XENFEAT_writable_page_tables)
&& !max_pfn_mapped
- && (paddr >= (table_start << PAGE_SHIFT))
- && (paddr < (table_top << PAGE_SHIFT)))
+ && (paddr >= (e820_table_start << PAGE_SHIFT))
+ && (paddr < (e820_table_top << PAGE_SHIFT)))
readonly = 1;
/* Make old page tables read-only. */
if (!xen_feature(XENFEAT_writable_page_tables)
&& (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
- && (paddr < (table_cur << PAGE_SHIFT)))
+ && (paddr < (e820_table_end << PAGE_SHIFT)))
readonly = 1;
/*
@@ -422,7 +447,7 @@ static inline int __meminit make_readonl
* mappings. Exclude the vsyscall area here, allowing alternative
* instruction patching to work.
*/
- if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa_symbol(&_end))
+ if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa(_brk_end))
&& !(paddr >= __pa_symbol(&__vsyscall_0)
&& paddr < __pa_symbol(&__vsyscall_0) + PAGE_SIZE))
readonly = 1;
@@ -747,43 +772,9 @@ void __init xen_init_pt(void)
}
}
-static void __init find_early_table_space(unsigned long end, int use_pse,
- int use_gbpages)
-{
- unsigned long puds, pmds, ptes, tables;
-
- puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
- tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
- pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
- tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
-
- ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
- tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE);
-
- if (!table_top) {
- table_start = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
- xen_start_info->nr_pt_frames;
- table_cur = table_start;
- } else {
- /*
- * [table_start, table_top) gets passed to reserve_early(),
- * so we must not use table_cur here, despite continuing
- * to allocate from there. table_cur possibly being below
- * table_start is otoh not a problem.
- */
- table_start = table_top;
- }
- __flush_tlb_all();
-
- table_top = table_cur + (tables >> PAGE_SHIFT);
-
- printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
- end, table_cur << PAGE_SHIFT, table_top << PAGE_SHIFT);
-}
-
-static void __init xen_finish_init_mapping(void)
+void __init xen_finish_init_mapping(void)
{
- unsigned long i, start, end;
+ unsigned long start, end;
/* Re-vector virtual addresses pointing into the initial
mapping to the just-established permanent ones. */
@@ -801,49 +792,22 @@ static void __init xen_finish_init_mappi
__va(__pa(xen_start_info->mod_start));
/* Destroy the Xen-created mappings beyond the kernel image. */
- start = PAGE_ALIGN((unsigned long)_end);
- end = __START_KERNEL_map + (table_start << PAGE_SHIFT);
+ start = PAGE_ALIGN(_brk_end);
+ end = __START_KERNEL_map + (e820_table_start << PAGE_SHIFT);
for (; start < end; start += PAGE_SIZE)
if (HYPERVISOR_update_va_mapping(start, __pte_ma(0), 0))
BUG();
- /* Allocate pte's for initial fixmaps from 'table_cur' allocator. */
- start = table_top;
- WARN(table_cur != start, "start=%lx cur=%lx top=%lx\n",
- table_start, table_cur, start);
- table_top = ~0UL;
-
- /* Switch to the real shared_info page, and clear the dummy page. */
- set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
- HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
- memset(empty_zero_page, 0, sizeof(empty_zero_page));
-
- /* Set up mapping of lowest 1MB of physical memory. */
- for (i = 0; i < NR_FIX_ISAMAPS; i++)
- if (is_initial_xendomain())
- set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
- else
- __set_fixmap(FIX_ISAMAP_BEGIN - i,
- virt_to_mfn(empty_zero_page)
- << PAGE_SHIFT,
- PAGE_KERNEL_RO);
-
- table_top = max(table_cur, start);
+ WARN(e820_table_end != e820_table_top, "start=%lx cur=%lx top=%lx\n",
+ e820_table_start, e820_table_end, e820_table_top);
+ if (e820_table_end > e820_table_top)
+ e820_table_top = e820_table_end;
}
-static void __init init_gbpages(void)
-{
-#ifndef CONFIG_XEN
- if (direct_gbpages && cpu_has_gbpages)
- printk(KERN_INFO "Using GB pages for direct mapping\n");
- else
- direct_gbpages = 0;
-#endif
-}
-
-static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
- unsigned long end,
- unsigned long page_size_mask)
+unsigned long __init
+kernel_physical_mapping_init(unsigned long start,
+ unsigned long end,
+ unsigned long page_size_mask)
{
unsigned long next, last_map_addr = end;
@@ -887,207 +851,6 @@ static unsigned long __meminit kernel_ph
return last_map_addr;
}
-struct map_range {
- unsigned long start;
- unsigned long end;
- unsigned page_size_mask;
-};
-
-#define NR_RANGE_MR 5
-
-static int save_mr(struct map_range *mr, int nr_range,
- unsigned long start_pfn, unsigned long end_pfn,
- unsigned long page_size_mask)
-{
-
- if (start_pfn < end_pfn) {
- if (nr_range >= NR_RANGE_MR)
- panic("run out of range for init_memory_mapping\n");
- mr[nr_range].start = start_pfn<<PAGE_SHIFT;
- mr[nr_range].end = end_pfn<<PAGE_SHIFT;
- mr[nr_range].page_size_mask = page_size_mask;
- nr_range++;
- }
-
- return nr_range;
-}
-
-/*
- * Setup the direct mapping of the physical memory at PAGE_OFFSET.
- * This runs before bootmem is initialized and gets pages directly from
- * the physical memory. To access them they are temporarily mapped.
- */
-unsigned long __init_refok init_memory_mapping(unsigned long start,
- unsigned long end)
-{
- unsigned long last_map_addr = 0;
- unsigned long page_size_mask = 0;
- unsigned long start_pfn, end_pfn;
- unsigned long pos;
-
- struct map_range mr[NR_RANGE_MR];
- int nr_range, i;
- int use_pse, use_gbpages;
-
- printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
-
- /*
- * Find space for the kernel direct mapping tables.
- *
- * Later we should allocate these tables in the local node of the
- * memory mapped. Unfortunately this is done currently before the
- * nodes are discovered.
- */
- if (!after_bootmem)
- init_gbpages();
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
- /*
- * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
- * This will simplify cpa(), which otherwise needs to support splitting
- * large pages into small in interrupt context, etc.
- */
- use_pse = use_gbpages = 0;
-#else
- use_pse = cpu_has_pse;
- use_gbpages = direct_gbpages;
-#endif
-
- if (use_gbpages)
- page_size_mask |= 1 << PG_LEVEL_1G;
- if (use_pse)
- page_size_mask |= 1 << PG_LEVEL_2M;
-
- memset(mr, 0, sizeof(mr));
- nr_range = 0;
-
- /* head if not big page alignment ?*/
- start_pfn = start >> PAGE_SHIFT;
- pos = start_pfn << PAGE_SHIFT;
- end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
- if (end_pfn > (end >> PAGE_SHIFT))
- end_pfn = end >> PAGE_SHIFT;
- if (start_pfn < end_pfn) {
- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
- pos = end_pfn << PAGE_SHIFT;
- }
-
- /* big page (2M) range*/
- start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
- end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
- << (PUD_SHIFT - PAGE_SHIFT);
- if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
- end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
- if (start_pfn < end_pfn) {
- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
- page_size_mask & (1<<PG_LEVEL_2M));
- pos = end_pfn << PAGE_SHIFT;
- }
-
- /* big page (1G) range */
- start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
- << (PUD_SHIFT - PAGE_SHIFT);
- end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
- if (start_pfn < end_pfn) {
- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
- page_size_mask &
- ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
- pos = end_pfn << PAGE_SHIFT;
- }
-
- /* tail is not big page (1G) alignment */
- start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
- end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
- if (start_pfn < end_pfn) {
- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
- page_size_mask & (1<<PG_LEVEL_2M));
- pos = end_pfn << PAGE_SHIFT;
- }
-
- /* tail is not big page (2M) alignment */
- start_pfn = pos>>PAGE_SHIFT;
- end_pfn = end>>PAGE_SHIFT;
- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
-
- /* try to merge same page size and continuous */
- for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
- unsigned long old_start;
- if (mr[i].end != mr[i+1].start ||
- mr[i].page_size_mask != mr[i+1].page_size_mask)
- continue;
- /* move it */
- old_start = mr[i].start;
- memmove(&mr[i], &mr[i+1],
- (nr_range - 1 - i) * sizeof (struct map_range));
- mr[i--].start = old_start;
- nr_range--;
- }
-
- for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG " %010lx - %010lx page %s\n",
- mr[i].start, mr[i].end,
- (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
- (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
-
- if (!after_bootmem)
- find_early_table_space(end, use_pse, use_gbpages);
-
- if (!start) {
- unsigned long addr, va = __START_KERNEL_map;
- unsigned long *page = (unsigned long *)init_level4_pgt;
-
- /* Kill mapping of memory below _text. */
- while (va < (unsigned long)&_text) {
- if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
- BUG();
- va += PAGE_SIZE;
- }
-
- /* Blow away any spurious initial mappings. */
- va = __START_KERNEL_map + (table_start << PAGE_SHIFT);
- addr = page[pgd_index(va)];
- addr_to_page(addr, page);
- addr = page[pud_index(va)];
- addr_to_page(addr, page);
- while (pmd_index(va) | pte_index(va)) {
- if (pmd_none(*(pmd_t *)&page[pmd_index(va)]))
- break;
- if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
- BUG();
- va += PAGE_SIZE;
- }
- }
-
- for (i = 0; i < nr_range; i++)
- last_map_addr = kernel_physical_mapping_init(
- mr[i].start, mr[i].end,
- mr[i].page_size_mask);
-
- BUG_ON(table_cur > table_top);
- if (!start)
- xen_finish_init_mapping();
- else if (table_cur < table_top)
- /* Disable the 'table_cur' allocator. */
- table_top = table_cur;
-
- __flush_tlb_all();
-
- if (!after_bootmem && table_top > table_start)
- reserve_early(table_start << PAGE_SHIFT,
- table_top << PAGE_SHIFT, "PGTABLE");
-
- printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
- last_map_addr, end);
-
- if (!after_bootmem)
- early_memtest(start, end);
-
- return last_map_addr >> PAGE_SHIFT;
-}
-
#ifndef CONFIG_NUMA
void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
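
Reviewer note: the init_memory_mapping() body removed above split [start, end) into a 4k head, large-page body, and 4k tail before mapping each range (the logic now lives in the common mm init code). A compressed userspace sketch of that split, without the 1G ranges and the adjacent-range merge pass:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PMD_SHIFT  21
#define NR_RANGE_MR 5

struct map_range { unsigned long start, end; unsigned mask; };

/* save_mr() as removed above: record a range only if non-empty. */
static int save_mr(struct map_range *mr, int n,
		   unsigned long s_pfn, unsigned long e_pfn, unsigned mask)
{
	if (s_pfn < e_pfn) {
		mr[n].start = s_pfn << PAGE_SHIFT;
		mr[n].end   = e_pfn << PAGE_SHIFT;
		mr[n].mask  = mask;
		n++;
	}
	return n;
}

int main(void)
{
	/* Split [1MiB, 1GiB+12KiB): 4k head up to the next 2M boundary,
	 * 2M body, 4k tail past the last 2M boundary. */
	unsigned long start = 1UL << 20, end = (1UL << 30) + 3 * 4096;
	struct map_range mr[NR_RANGE_MR];
	int n = 0;

	unsigned long pos = start >> PAGE_SHIFT;
	unsigned long head_end = ((start + (1UL << PMD_SHIFT) - 1) >> PMD_SHIFT)
				 << (PMD_SHIFT - PAGE_SHIFT);
	n = save_mr(mr, n, pos, head_end, 0);			/* 4k head */
	unsigned long body_end = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
	n = save_mr(mr, n, head_end, body_end, 1 << 1);		/* 2M body */
	n = save_mr(mr, n, body_end, end >> PAGE_SHIFT, 0);	/* 4k tail */

	for (int i = 0; i < n; i++)
		printf("%#010lx-%#010lx %s\n", mr[i].start, mr[i].end,
		       mr[i].mask ? "2M" : "4k");
	return 0;
}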
@@ -1165,28 +928,6 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to
#endif /* CONFIG_MEMORY_HOTPLUG */
-/*
- * devmem_is_allowed() checks to see if /dev/mem access to a certain address
- * is valid. The argument is a physical page number.
- *
- *
- * On x86, access has to be given to the first megabyte of ram because that area
- * contains bios code and data regions used by X and dosemu and similar apps.
- * Access has to be given to non-kernel-ram areas as well, these contain the PCI
- * mmio resources as well as potential bios/acpi data regions.
- */
-int devmem_is_allowed(unsigned long pagenr)
-{
- if (pagenr <= 256)
- return 1;
- if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
- return 0;
- if (mfn_to_local_pfn(pagenr) >= max_pfn)
- return 1;
- return 0;
-}
-
-
static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
kcore_modules, kcore_vsyscall;
@@ -1243,56 +984,39 @@ void __init mem_init(void)
initsize >> 10);
}
-void free_init_pages(char *what, unsigned long begin, unsigned long end)
+#ifdef CONFIG_DEBUG_RODATA
+const int rodata_test_data = 0xC3;
+EXPORT_SYMBOL_GPL(rodata_test_data);
+
+static int kernel_set_to_readonly;
+
+void set_kernel_text_rw(void)
{
- unsigned long addr = begin;
+ unsigned long start = PFN_ALIGN(_stext);
+ unsigned long end = PFN_ALIGN(__start_rodata);
- if (addr >= end)
+ if (!kernel_set_to_readonly)
return;
- /*
- * If debugging page accesses then do not free this memory but
- * mark them not present - any buggy init-section access will
- * create a kernel page fault:
- */
-#ifdef CONFIG_DEBUG_PAGEALLOC
- printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
- begin, PAGE_ALIGN(end));
- set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
-#else
- printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
+ pr_debug("Set kernel text: %lx - %lx for read write\n",
+ start, end);
- for (; addr < end; addr += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(addr));
- init_page_count(virt_to_page(addr));
- memset((void *)(addr & ~(PAGE_SIZE-1)),
- POISON_FREE_INITMEM, PAGE_SIZE);
- if (addr >= __START_KERNEL_map) {
- /* make_readonly() reports all kernel addresses. */
- if (HYPERVISOR_update_va_mapping((unsigned long)__va(__pa(addr)),
- pfn_pte(__pa(addr) >> PAGE_SHIFT,
- PAGE_KERNEL),
- 0))
- BUG();
- if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
- BUG();
- }
- free_page(addr);
- totalram_pages++;
- }
-#endif
+ set_memory_rw(start, (end - start) >> PAGE_SHIFT);
}
-void free_initmem(void)
+void set_kernel_text_ro(void)
{
- free_init_pages("unused kernel memory",
- (unsigned long)(&__init_begin),
- (unsigned long)(&__init_end));
-}
+ unsigned long start = PFN_ALIGN(_stext);
+ unsigned long end = PFN_ALIGN(__start_rodata);
-#ifdef CONFIG_DEBUG_RODATA
-const int rodata_test_data = 0xC3;
-EXPORT_SYMBOL_GPL(rodata_test_data);
+ if (!kernel_set_to_readonly)
+ return;
+
+ pr_debug("Set kernel text: %lx - %lx for read only\n",
+ start, end);
+
+ set_memory_ro(start, (end - start) >> PAGE_SHIFT);
+}
void mark_rodata_ro(void)
{
@@ -1300,15 +1024,12 @@ void mark_rodata_ro(void)
unsigned long rodata_start =
((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
-#ifdef CONFIG_DYNAMIC_FTRACE
- /* Dynamic tracing modifies the kernel text section */
- start = rodata_start;
-#endif
-
printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
(end - start) >> 10);
set_memory_ro(start, (end - start) >> PAGE_SHIFT);
+ kernel_set_to_readonly = 1;
+
/*
* The rodata section (but not the kernel text!) should also be
* not-executable.
@@ -1328,13 +1049,6 @@ void mark_rodata_ro(void)
#endif
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_init_pages("initrd memory", start, end);
-}
-#endif
-
int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
int flags)
{
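
Reviewer note: set_kernel_text_rw()/set_kernel_text_ro() let ftrace patch the text section even after mark_rodata_ro(), which is why the CONFIG_DYNAMIC_FTRACE special case disappears from mark_rodata_ro() in both files. The guard is a plain latch: both helpers are no-ops until the rodata pass has run. A userspace model:

#include <stdio.h>
#include <stdbool.h>

static bool kernel_set_to_readonly;	/* latched once by mark_rodata_ro() */
static bool text_ro;

static void set_kernel_text_rw_model(void)
{
	if (!kernel_set_to_readonly)	/* nothing to undo before rodata pass */
		return;
	text_ro = false;		/* set_memory_rw() stand-in */
}

static void set_kernel_text_ro_model(void)
{
	if (!kernel_set_to_readonly)
		return;
	text_ro = true;			/* set_memory_ro() stand-in */
}

int main(void)
{
	set_kernel_text_rw_model();	/* early ftrace patching: no-op */
	kernel_set_to_readonly = true;	/* mark_rodata_ro() */
	text_ro = true;
	set_kernel_text_rw_model();	/* patch window opens */
	printf("text ro: %d\n", text_ro);
	set_kernel_text_ro_model();	/* window closes */
	printf("text ro: %d\n", text_ro);
	return 0;
}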
--- head-2010-05-25.orig/arch/x86/mm/iomap_32-xen.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/mm/iomap_32-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -20,10 +20,11 @@
#include <asm/pat.h>
#include <linux/bitops.h>
#include <linux/module.h>
+#include <linux/highmem.h>
int is_io_mapping_possible(resource_size_t base, unsigned long size)
{
-#ifndef CONFIG_X86_PAE
+#if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT)
/* There is no way to map greater than 1 << 32 address without PAE */
if (base + size > 0x100000000ULL)
return 0;
@@ -32,16 +33,28 @@ int is_io_mapping_possible(resource_size
}
EXPORT_SYMBOL_GPL(is_io_mapping_possible);
-/* Map 'mfn' using fixed map 'type' and protections 'prot'
- */
-void *
-iomap_atomic_prot_pfn(unsigned long mfn, enum km_type type, pgprot_t prot)
+void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
{
enum fixed_addresses idx;
unsigned long vaddr;
pagefault_disable();
+ debug_kmap_atomic(type);
+ idx = type + KM_TYPE_NR * smp_processor_id();
+ vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+ set_pte_at(&init_mm, vaddr, kmap_pte - idx, pfn_pte(pfn, prot));
+ /*arch_flush_lazy_mmu_mode();*/
+
+ return (void *)vaddr;
+}
+
+/*
+ * Map 'mfn' using fixed map 'type' and protections 'prot'
+ */
+void *
+iomap_atomic_prot_pfn(unsigned long mfn, enum km_type type, pgprot_t prot)
+{
/*
* For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS.
* PAGE_KERNEL_WC maps to PWT, which translates to uncached if the
@@ -51,13 +64,8 @@ iomap_atomic_prot_pfn(unsigned long mfn,
if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC))
prot = PAGE_KERNEL_UC_MINUS;
- idx = type + KM_TYPE_NR*smp_processor_id();
- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
pgprot_val(prot) |= _PAGE_IOMAP;
- set_pte_at(&init_mm, vaddr, kmap_pte-idx, pfn_pte_ma(mfn, prot));
- /*arch_flush_lazy_mmu_mode()*/;
-
- return (void*) vaddr;
+ return kmap_atomic_prot_pfn(mfn, type, prot);
}
EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn);
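
Reviewer note: the new kmap_atomic_prot_pfn() keeps only the per-CPU fixmap slot arithmetic, and iomap_atomic_prot_pfn() now merely fixes up the protection bits before delegating. A sketch of the slot arithmetic with assumed constants (KM_TYPE_NR, FIX_KMAP_BEGIN and FIXADDR_TOP below are illustrative values, not the real ones):

#include <stdio.h>

#define PAGE_SHIFT     12
#define KM_TYPE_NR     20		/* assumed slots per CPU */
#define FIX_KMAP_BEGIN 4		/* assumed first kmap fixmap index */
#define FIXADDR_TOP    0xfffff000UL	/* assumed */

static unsigned long fix_to_virt_model(unsigned long idx)
{
	return FIXADDR_TOP - (idx << PAGE_SHIFT);	/* __fix_to_virt() analogue */
}

/* Each CPU owns a contiguous band of KM_TYPE_NR slots, indexed by type. */
static unsigned long kmap_slot(unsigned type, unsigned cpu)
{
	unsigned long idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * cpu;
	return fix_to_virt_model(idx);
}

int main(void)
{
	printf("cpu0/type2 -> %#lx\n", kmap_slot(2, 0));
	printf("cpu1/type2 -> %#lx\n", kmap_slot(2, 1));
	return 0;
}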
--- head-2010-05-25.orig/arch/x86/mm/ioremap-xen.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/mm/ioremap-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -23,13 +23,17 @@
#include <asm/pgalloc.h>
#include <asm/pat.h>
-#ifdef CONFIG_X86_64
-
-static inline int phys_addr_valid(unsigned long addr)
+static inline int phys_addr_valid(resource_size_t addr)
{
- return addr < (1UL << boot_cpu_data.x86_phys_bits);
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+ return !(addr >> boot_cpu_data.x86_phys_bits);
+#else
+ return 1;
+#endif
}
+#ifdef CONFIG_X86_64
+
#define phys_base 0
unsigned long __phys_addr(unsigned long x)
@@ -41,8 +45,7 @@ unsigned long __phys_addr(unsigned long
} else {
VIRTUAL_BUG_ON(x < PAGE_OFFSET);
x -= PAGE_OFFSET;
- VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM :
- !phys_addr_valid(x));
+ VIRTUAL_BUG_ON(!phys_addr_valid(x));
}
return x;
}
@@ -59,10 +62,8 @@ bool __virt_addr_valid(unsigned long x)
if (x < PAGE_OFFSET)
return false;
x -= PAGE_OFFSET;
- if (system_state == SYSTEM_BOOTING ?
- x > MAXMEM : !phys_addr_valid(x)) {
+ if (!phys_addr_valid(x))
return false;
- }
}
return pfn_valid(x >> PAGE_SHIFT);
@@ -73,18 +74,12 @@ EXPORT_SYMBOL(__virt_addr_valid);
#else
-static inline int phys_addr_valid(unsigned long addr)
-{
- return 1;
-}
-
#ifdef CONFIG_DEBUG_VIRTUAL
unsigned long __phys_addr(unsigned long x)
{
- /* VMALLOC_* aren't constants; not available at the boot time */
+ /* VMALLOC_* aren't constants */
VIRTUAL_BUG_ON(x < PAGE_OFFSET);
- VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING &&
- is_vmalloc_addr((void *) x));
+ VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
return x - PAGE_OFFSET;
}
EXPORT_SYMBOL(__phys_addr);
@@ -94,7 +89,9 @@ bool __virt_addr_valid(unsigned long x)
{
if (x < PAGE_OFFSET)
return false;
- if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x))
+ if (__vmalloc_start_set && is_vmalloc_addr((void *) x))
+ return false;
+ if (x >= FIXADDR_START)
return false;
return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
}
@@ -462,16 +459,17 @@ static void __iomem *__ioremap_caller(re
return NULL;
area->phys_addr = phys_addr;
vaddr = (unsigned long) area->addr;
- if (__direct_remap_pfn_range(&init_mm, vaddr, PFN_DOWN(phys_addr),
- size, prot, domid)) {
+
+ if (kernel_map_sync_memtype(phys_addr, size, prot_val)) {
free_memtype(phys_addr, phys_addr + size);
free_vm_area(area);
return NULL;
}
- if (ioremap_change_attr(vaddr, size, prot_val) < 0) {
+ if (__direct_remap_pfn_range(&init_mm, vaddr, PFN_DOWN(phys_addr),
+ size, prot, domid)) {
free_memtype(phys_addr, phys_addr + size);
- vunmap(area->addr);
+ free_vm_area(area);
return NULL;
}
@@ -528,7 +526,7 @@ EXPORT_SYMBOL(ioremap_nocache);
*
* Must be freed with iounmap.
*/
-void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
+void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
{
if (pat_enabled)
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
@@ -558,7 +556,8 @@ static void __iomem *ioremap_default(res
* - UC_MINUS for non-WB-able memory with no other conflicting mappings
* - Inherit from confliting mappings otherwise
*/
- err = reserve_memtype(phys_addr, phys_addr + size, -1, &flags);
+ err = reserve_memtype(phys_addr, phys_addr + size,
+ _PAGE_CACHE_WB, &flags);
if (err < 0)
return NULL;
@@ -697,13 +696,19 @@ static inline pte_t * __init early_iorem
return &bm_pte[pte_index(addr)];
}
+static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
+
void __init early_ioremap_init(void)
{
pmd_t *pmd;
+ int i;
if (early_ioremap_debug)
printk(KERN_INFO "early_ioremap_init()\n");
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+ slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
+
pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
memset(bm_pte, 0, sizeof(bm_pte));
make_lowmem_page_readonly(bm_pte, XENFEAT_writable_page_tables);
@@ -734,7 +739,7 @@ void __init early_ioremap_reset(void)
}
static void __init __early_set_fixmap(enum fixed_addresses idx,
- unsigned long phys, pgprot_t flags)
+ phys_addr_t phys, pgprot_t flags)
{
unsigned long addr = __fix_to_virt(idx);
pte_t *pte;
@@ -753,7 +758,7 @@ static void __init __early_set_fixmap(en
}
static inline void __init early_set_fixmap(enum fixed_addresses idx,
- unsigned long phys, pgprot_t prot)
+ phys_addr_t phys, pgprot_t prot)
{
if (after_paging_init)
__set_fixmap(idx, phys, prot);
@@ -771,6 +776,7 @@ static inline void __init early_clear_fi
static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
+
static int __init check_early_ioremap_leak(void)
{
int count = 0;
@@ -792,9 +798,11 @@ static int __init check_early_ioremap_le
}
late_initcall(check_early_ioremap_leak);
-static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot)
+static void __init __iomem *
+__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
{
- unsigned long offset, last_addr;
+ unsigned long offset;
+ resource_size_t last_addr;
unsigned int nrpages;
enum fixed_addresses idx0, idx;
int i, slot;
@@ -810,15 +818,15 @@ static void __init __iomem *__early_iore
}
if (slot < 0) {
- printk(KERN_INFO "early_iomap(%08lx, %08lx) not found slot\n",
- phys_addr, size);
+ printk(KERN_INFO "early_iomap(%08llx, %08lx) not found slot\n",
+ (u64)phys_addr, size);
WARN_ON(1);
return NULL;
}
if (early_ioremap_debug) {
- printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ",
- phys_addr, size, slot);
+ printk(KERN_INFO "early_ioremap(%08llx, %08lx) [%d] => ",
+ (u64)phys_addr, size, slot);
dump_stack();
}
@@ -858,20 +866,28 @@ static void __init __iomem *__early_iore
--nrpages;
}
if (early_ioremap_debug)
- printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0));
+ printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]);
- prev_map[slot] = (void __iomem *)(offset + fix_to_virt(idx0));
+ prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
return prev_map[slot];
}
/* Remap an IO device */
-void __init __iomem *early_ioremap(unsigned long phys_addr, unsigned long size)
+void __init __iomem *
+early_ioremap(resource_size_t phys_addr, unsigned long size)
{
+ /*
+ * Don't remap the low PCI/ISA area, it's always mapped.
+ */
+ if (is_initial_xendomain() && is_ISA_range(phys_addr, phys_addr + size - 1))
+ return (__force void __iomem *)isa_bus_to_virt((unsigned long)phys_addr);
+
return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO);
}
/* Remap memory */
-void __init __iomem *early_memremap(unsigned long phys_addr, unsigned long size)
+void __init __iomem *
+early_memremap(resource_size_t phys_addr, unsigned long size)
{
return __early_ioremap(phys_to_machine(phys_addr), size, PAGE_KERNEL);
}
@@ -884,6 +900,15 @@ void __init early_iounmap(void __iomem *
enum fixed_addresses idx;
int i, slot;
+ /*
+ * early_ioremap special-cases the PCI/ISA range by not instantiating a
+ * vm_area and by simply returning an address into the kernel mapping
+ * of ISA space. So handle that here.
+ */
+ if ((unsigned long)addr >= fix_to_virt(FIX_ISAMAP_BEGIN)
+ && (unsigned long)addr < fix_to_virt(FIX_ISAMAP_END - 1))
+ return;
+
slot = -1;
for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
if (prev_map[i] == addr) {
@@ -928,8 +953,3 @@ void __init early_iounmap(void __iomem *
}
prev_map[slot] = NULL;
}
-
-void __this_fixmap_does_not_exist(void)
-{
- WARN_ON(1);
-}
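
Reviewer note: early_ioremap() bookkeeping now precomputes slot_virt[] once in early_ioremap_init() and hands out one of FIX_BTMAPS_SLOTS boot-time slots per mapping; early_iounmap() releases by pointer match against prev_map[]. A self-contained model of just that slot handling (the page-table writes are omitted and the base address is invented):

#include <stdio.h>
#include <stddef.h>

#define FIX_BTMAPS_SLOTS 4
#define NR_FIX_BTMAPS    64
#define PAGE_SIZE        4096UL

static unsigned long slot_virt[FIX_BTMAPS_SLOTS];
static void *prev_map[FIX_BTMAPS_SLOTS];

static void *early_ioremap_model(unsigned long phys, unsigned long size)
{
	unsigned long offset = phys & (PAGE_SIZE - 1);

	(void)size;	/* real code maps size/PAGE_SIZE pages into the slot */
	for (int i = 0; i < FIX_BTMAPS_SLOTS; i++)
		if (!prev_map[i])
			return prev_map[i] = (void *)(slot_virt[i] + offset);
	return NULL;	/* all boot-time slots busy */
}

static void early_iounmap_model(void *addr)
{
	for (int i = 0; i < FIX_BTMAPS_SLOTS; i++)
		if (prev_map[i] == addr)
			prev_map[i] = NULL;
}

int main(void)
{
	for (int i = 0; i < FIX_BTMAPS_SLOTS; i++)	/* early_ioremap_init() */
		slot_virt[i] = 0xffc00000UL - NR_FIX_BTMAPS * PAGE_SIZE * i;
	void *p = early_ioremap_model(0xfee00030UL, 4);
	printf("mapped at %p\n", p);
	early_iounmap_model(p);
	return 0;
}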
--- head-2010-05-25.orig/arch/x86/mm/pageattr-xen.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/mm/pageattr-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -16,6 +16,7 @@
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
+#include <asm/setup.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/proto.h>
@@ -33,6 +34,7 @@ struct cpa_data {
unsigned long pfn;
unsigned force_split : 1;
int curpage;
+ struct page **pages;
};
/*
@@ -45,6 +47,7 @@ static DEFINE_SPINLOCK(cpa_lock);
#define CPA_FLUSHTLB 1
#define CPA_ARRAY 2
+#define CPA_PAGES_ARRAY 4
#ifdef CONFIG_PROC_FS
static unsigned long direct_pages_count[PG_LEVEL_NUM];
@@ -95,7 +98,7 @@ static inline unsigned long highmap_star
static inline unsigned long highmap_end_pfn(void)
{
- return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
+ return __pa(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
}
#endif
@@ -150,7 +153,7 @@ static void __cpa_flush_all(void *arg)
*/
__flush_tlb_all();
- if (cache && boot_cpu_data.x86_model >= 4)
+ if (cache && boot_cpu_data.x86 >= 4)
wbinvd();
}
@@ -201,38 +204,41 @@ static void cpa_flush_range(unsigned lon
}
}
-static void cpa_flush_array(unsigned long *start, int numpages, int cache)
+static void cpa_flush_array(unsigned long *start, int numpages, int cache,
+ int in_flags, struct page **pages)
{
unsigned int i, level;
- unsigned long *addr;
+ unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
BUG_ON(irqs_disabled());
- on_each_cpu(__cpa_flush_range, NULL, 1);
+ on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1);
- if (!cache)
+ if (!cache || do_wbinvd)
return;
- /* 4M threshold */
- if (numpages >= 1024) {
- if (boot_cpu_data.x86_model >= 4)
- wbinvd();
- return;
- }
/*
* We only need to flush on one CPU,
* clflush is a MESI-coherent instruction that
* will cause all other CPUs to flush the same
* cachelines:
*/
- for (i = 0, addr = start; i < numpages; i++, addr++) {
- pte_t *pte = lookup_address(*addr, &level);
+ for (i = 0; i < numpages; i++) {
+ unsigned long addr;
+ pte_t *pte;
+
+ if (in_flags & CPA_PAGES_ARRAY)
+ addr = (unsigned long)page_address(pages[i]);
+ else
+ addr = start[i];
+
+ pte = lookup_address(addr, &level);
/*
* Only flush present addresses:
*/
if (pte && (__pte_val(*pte) & _PAGE_PRESENT))
- clflush_cache_range((void *) *addr, PAGE_SIZE);
+ clflush_cache_range((void *)addr, PAGE_SIZE);
}
}
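
Reviewer note: cpa_flush_array() above now always flushes the TLB on every CPU, then either issues a single wbinvd (>= 1024 pages, the 4M threshold, folded into __cpa_flush_all()) or clflushes each page whose PTE is still present. A userspace model of the policy, with printfs standing in for the cache operations:

#include <stdio.h>
#include <stdbool.h>

/* Model of the flush policy above: beyond 4MiB worth of 4k pages a
 * full wbinvd is cheaper than per-page clflush. */
static void cpa_flush_array_model(int numpages, bool cache)
{
	bool do_wbinvd = cache && numpages >= 1024;	/* 4M threshold */

	puts("flush TLB on all CPUs");			/* __cpa_flush_all() */
	if (do_wbinvd) {
		puts("wbinvd on all CPUs");
		return;
	}
	if (!cache)
		return;
	for (int i = 0; i < numpages; i++)
		printf("clflush page %d (if PTE present)\n", i);
}

int main(void)
{
	cpa_flush_array_model(3, true);
	cpa_flush_array_model(2048, true);
	return 0;
}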
@@ -498,6 +504,13 @@ static int split_large_page(pte_t *kpte,
pbase = (pte_t *)page_address(base);
paravirt_alloc_pte(&init_mm, page_to_pfn(base));
ref_prot = pte_pgprot(pte_clrhuge(*kpte));
+ /*
+ * If we ever want to utilize the PAT bit, we need to
+ * update this function to make sure it's converted from
+ * bit 12 to bit 7 when we cross from the 2MB level to
+ * the 4K level:
+ */
+ WARN_ON_ONCE(pgprot_val(ref_prot) & _PAGE_PAT_LARGE);
#ifdef CONFIG_X86_64
if (level == PG_LEVEL_1G) {
@@ -597,7 +610,9 @@ static int __change_page_attr(struct cpa
unsigned int level;
pte_t *kpte, old_pte;
- if (cpa->flags & CPA_ARRAY)
+ if (cpa->flags & CPA_PAGES_ARRAY)
+ address = (unsigned long)page_address(cpa->pages[cpa->curpage]);
+ else if (cpa->flags & CPA_ARRAY)
address = cpa->vaddr[cpa->curpage];
else
address = *cpa->vaddr;
@@ -701,7 +716,9 @@ static int cpa_process_alias(struct cpa_
* No need to redo, when the primary call touched the direct
* mapping already:
*/
- if (cpa->flags & CPA_ARRAY)
+ if (cpa->flags & CPA_PAGES_ARRAY)
+ vaddr = (unsigned long)page_address(cpa->pages[cpa->curpage]);
+ else if (cpa->flags & CPA_ARRAY)
vaddr = cpa->vaddr[cpa->curpage];
else
vaddr = *cpa->vaddr;
@@ -712,7 +729,7 @@ static int cpa_process_alias(struct cpa_
alias_cpa = *cpa;
temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
alias_cpa.vaddr = &temp_cpa_vaddr;
- alias_cpa.flags &= ~CPA_ARRAY;
+ alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
ret = __change_page_attr_set_clr(&alias_cpa, 0);
@@ -725,7 +742,7 @@ static int cpa_process_alias(struct cpa_
* No need to redo, when the primary call touched the high
* mapping already:
*/
- if (within(vaddr, (unsigned long) _text, (unsigned long) _end))
+ if (within(vaddr, (unsigned long) _text, _brk_end))
return 0;
/*
@@ -738,7 +755,7 @@ static int cpa_process_alias(struct cpa_
alias_cpa = *cpa;
temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map;
alias_cpa.vaddr = &temp_cpa_vaddr;
- alias_cpa.flags &= ~CPA_ARRAY;
+ alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
/*
* The high mapping range is imprecise, so ignore the return value.
@@ -759,7 +776,7 @@ static int __change_page_attr_set_clr(st
*/
cpa->numpages = numpages;
/* for array changes, we can't use large page */
- if (cpa->flags & CPA_ARRAY)
+ if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
cpa->numpages = 1;
if (!debug_pagealloc)
@@ -783,7 +800,7 @@ static int __change_page_attr_set_clr(st
*/
BUG_ON(cpa->numpages > numpages);
numpages -= cpa->numpages;
- if (cpa->flags & CPA_ARRAY)
+ if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
cpa->curpage++;
else
*cpa->vaddr += cpa->numpages * PAGE_SIZE;
@@ -800,7 +817,8 @@ static inline int cache_attr(pgprot_t at
static int change_page_attr_set_clr(unsigned long *addr, int numpages,
pgprot_t mask_set, pgprot_t mask_clr,
- int force_split, int array)
+ int force_split, int in_flag,
+ struct page **pages)
{
struct cpa_data cpa;
int ret, cache, checkalias;
@@ -815,15 +833,7 @@ static int change_page_attr_set_clr(unsi
return 0;
/* Ensure we are PAGE_SIZE aligned */
- if (!array) {
- if (*addr & ~PAGE_MASK) {
- *addr &= PAGE_MASK;
- /*
- * People should not be passing in unaligned addresses:
- */
- WARN_ON_ONCE(1);
- }
- } else {
+ if (in_flag & CPA_ARRAY) {
int i;
for (i = 0; i < numpages; i++) {
if (addr[i] & ~PAGE_MASK) {
@@ -831,6 +841,18 @@ static int change_page_attr_set_clr(unsi
WARN_ON_ONCE(1);
}
}
+ } else if (!(in_flag & CPA_PAGES_ARRAY)) {
+ /*
+ * in_flag of CPA_PAGES_ARRAY implies it is aligned.
+		 * No need to check in that case
+ */
+ if (*addr & ~PAGE_MASK) {
+ *addr &= PAGE_MASK;
+ /*
+ * People should not be passing in unaligned addresses:
+ */
+ WARN_ON_ONCE(1);
+ }
}
/* Must avoid aliasing mappings in the highmem code */
@@ -848,6 +870,7 @@ static int change_page_attr_set_clr(unsi
xen_multicall_flush(true);
cpa.vaddr = addr;
+ cpa.pages = pages;
cpa.numpages = numpages;
cpa.mask_set = mask_set;
cpa.mask_clr = mask_clr;
@@ -855,8 +878,8 @@ static int change_page_attr_set_clr(unsi
cpa.curpage = 0;
cpa.force_split = force_split;
- if (array)
- cpa.flags |= CPA_ARRAY;
+ if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY))
+ cpa.flags |= in_flag;
/* No alias checking for _NX bit modifications */
checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
@@ -882,9 +905,10 @@ static int change_page_attr_set_clr(unsi
* wbindv):
*/
if (!ret && cpu_has_clflush) {
- if (cpa.flags & CPA_ARRAY)
- cpa_flush_array(addr, numpages, cache);
- else
+ if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
+ cpa_flush_array(addr, numpages, cache,
+ cpa.flags, pages);
+ } else
cpa_flush_range(*addr, numpages, cache);
} else
cpa_flush_all(cache);
@@ -905,14 +929,28 @@ static inline int change_page_attr_set(u
pgprot_t mask, int array)
{
return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
- array);
+ (array ? CPA_ARRAY : 0), NULL);
}
static inline int change_page_attr_clear(unsigned long *addr, int numpages,
pgprot_t mask, int array)
{
return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
- array);
+ (array ? CPA_ARRAY : 0), NULL);
+}
+
+static inline int cpa_set_pages_array(struct page **pages, int numpages,
+ pgprot_t mask)
+{
+ return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0,
+ CPA_PAGES_ARRAY, pages);
+}
+
+static inline int cpa_clear_pages_array(struct page **pages, int numpages,
+ pgprot_t mask)
+{
+ return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0,
+ CPA_PAGES_ARRAY, pages);
}
#ifdef CONFIG_XEN
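
Reviewer note: CPA_PAGES_ARRAY adds a third way of naming the target: a struct page array rather than a vaddr array or a single vaddr. The address selection repeated in __change_page_attr() and cpa_process_alias() reduces to the helper modeled below (page_stub is an invented type standing in for struct page plus page_address()):

#include <stdio.h>

#define CPA_ARRAY       2
#define CPA_PAGES_ARRAY 4

struct page_stub { unsigned long virt; };

/* Model of the three-way address selection above. */
static unsigned long cpa_addr(unsigned flags, struct page_stub **pages,
			      unsigned long *vaddr, int curpage)
{
	if (flags & CPA_PAGES_ARRAY)
		return pages[curpage]->virt;	/* page_address() stand-in */
	if (flags & CPA_ARRAY)
		return vaddr[curpage];
	return *vaddr;
}

int main(void)
{
	struct page_stub a = { 0xc0001000 }, *pp[] = { &a };
	unsigned long v[] = { 0xc0002000 };
	printf("%#lx\n", cpa_addr(CPA_PAGES_ARRAY, pp, v, 0));
	printf("%#lx\n", cpa_addr(CPA_ARRAY, NULL, v, 0));
	return 0;
}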
@@ -971,71 +1009,94 @@ int _set_memory_uc(unsigned long addr, i
int set_memory_uc(unsigned long addr, int numpages)
{
+ int ret;
+
/*
* for now UC MINUS. see comments in ioremap_nocache()
*/
- if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
- _PAGE_CACHE_UC_MINUS, NULL))
- return -EINVAL;
+ ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
+ _PAGE_CACHE_UC_MINUS, NULL);
+ if (ret)
+ goto out_err;
+
+ ret = _set_memory_uc(addr, numpages);
+ if (ret)
+ goto out_free;
- return _set_memory_uc(addr, numpages);
+ return 0;
+
+out_free:
+ free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+out_err:
+ return ret;
}
EXPORT_SYMBOL(set_memory_uc);
int set_memory_array_uc(unsigned long *addr, int addrinarray)
{
- unsigned long start;
- unsigned long end;
- int i;
+ int i, j;
+ int ret;
+
/*
* for now UC MINUS. see comments in ioremap_nocache()
*/
for (i = 0; i < addrinarray; i++) {
- start = __pa(addr[i]);
- for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
- if (end != __pa(addr[i + 1]))
- break;
- i++;
- }
- if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
- goto out;
+ ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE,
+ _PAGE_CACHE_UC_MINUS, NULL);
+ if (ret)
+ goto out_free;
}
- return change_page_attr_set(addr, addrinarray,
+ ret = change_page_attr_set(addr, addrinarray,
__pgprot(_PAGE_CACHE_UC_MINUS), 1);
-out:
- for (i = 0; i < addrinarray; i++) {
- unsigned long tmp = __pa(addr[i]);
+ if (ret)
+ goto out_free;
- if (tmp == start)
- break;
- for (end = tmp + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
- if (end != __pa(addr[i + 1]))
- break;
- i++;
- }
- free_memtype(tmp, end);
- }
- return -EINVAL;
+ return 0;
+
+out_free:
+ for (j = 0; j < i; j++)
+ free_memtype(__pa(addr[j]), __pa(addr[j]) + PAGE_SIZE);
+
+ return ret;
}
EXPORT_SYMBOL(set_memory_array_uc);
int _set_memory_wc(unsigned long addr, int numpages)
{
- return change_page_attr_set(&addr, numpages,
+ int ret;
+ ret = change_page_attr_set(&addr, numpages,
+ __pgprot(_PAGE_CACHE_UC_MINUS), 0);
+
+ if (!ret) {
+ ret = change_page_attr_set(&addr, numpages,
__pgprot(_PAGE_CACHE_WC), 0);
+ }
+ return ret;
}
int set_memory_wc(unsigned long addr, int numpages)
{
+ int ret;
+
if (!pat_enabled)
return set_memory_uc(addr, numpages);
- if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
- _PAGE_CACHE_WC, NULL))
- return -EINVAL;
+ ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
+ _PAGE_CACHE_WC, NULL);
+ if (ret)
+ goto out_err;
- return _set_memory_wc(addr, numpages);
+ ret = _set_memory_wc(addr, numpages);
+ if (ret)
+ goto out_free;
+
+ return 0;
+
+out_free:
+ free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+out_err:
+ return ret;
}
EXPORT_SYMBOL(set_memory_wc);
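
The reworked set_memory_wc() now reserves the memtype first, only then changes
attributes, and unwinds the reservation on failure, so a caller sees either
full success or no side effects. A minimal caller sketch, with hypothetical
driver names (not part of this patch):

	/* Hypothetical sketch: flip a kernel buffer to write-combining. */
	#include <asm/cacheflush.h>

	static int mydrv_buf_wc(void *buf, int nr_pages)	/* assumed helper */
	{
		/* on error the memtype reservation is already released */
		return set_memory_wc((unsigned long)buf, nr_pages);
	}

	static void mydrv_buf_wb(void *buf, int nr_pages)
	{
		/* restores WB and frees the memtype reservation */
		set_memory_wb((unsigned long)buf, nr_pages);
	}
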
@@ -1047,29 +1108,31 @@ int _set_memory_wb(unsigned long addr, i
int set_memory_wb(unsigned long addr, int numpages)
{
- free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+ int ret;
- return _set_memory_wb(addr, numpages);
+ ret = _set_memory_wb(addr, numpages);
+ if (ret)
+ return ret;
+
+ free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+ return 0;
}
EXPORT_SYMBOL(set_memory_wb);
int set_memory_array_wb(unsigned long *addr, int addrinarray)
{
int i;
+ int ret;
- for (i = 0; i < addrinarray; i++) {
- unsigned long start = __pa(addr[i]);
- unsigned long end;
-
- for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
- if (end != __pa(addr[i + 1]))
- break;
- i++;
- }
- free_memtype(start, end);
- }
- return change_page_attr_clear(addr, addrinarray,
+ ret = change_page_attr_clear(addr, addrinarray,
__pgprot(_PAGE_CACHE_MASK), 1);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < addrinarray; i++)
+ free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE);
+
+ return 0;
}
EXPORT_SYMBOL(set_memory_array_wb);
@@ -1105,7 +1168,7 @@ int set_memory_np(unsigned long addr, in
int set_memory_4k(unsigned long addr, int numpages)
{
return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
- __pgprot(0), 1, 0);
+ __pgprot(0), 1, 0, NULL);
}
int set_pages_uc(struct page *page, int numpages)
@@ -1116,6 +1179,35 @@ int set_pages_uc(struct page *page, int
}
EXPORT_SYMBOL(set_pages_uc);
+int set_pages_array_uc(struct page **pages, int addrinarray)
+{
+ unsigned long start;
+ unsigned long end;
+ int i;
+ int free_idx;
+
+ for (i = 0; i < addrinarray; i++) {
+ start = (unsigned long)page_address(pages[i]);
+ end = start + PAGE_SIZE;
+ if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
+ goto err_out;
+ }
+
+ if (cpa_set_pages_array(pages, addrinarray,
+ __pgprot(_PAGE_CACHE_UC_MINUS)) == 0) {
+ return 0; /* Success */
+ }
+err_out:
+ free_idx = i;
+ for (i = 0; i < free_idx; i++) {
+ start = (unsigned long)page_address(pages[i]);
+ end = start + PAGE_SIZE;
+ free_memtype(start, end);
+ }
+ return -EINVAL;
+}
+EXPORT_SYMBOL(set_pages_array_uc);
+
int set_pages_wb(struct page *page, int numpages)
{
unsigned long addr = (unsigned long)page_address(page);
@@ -1124,6 +1216,28 @@ int set_pages_wb(struct page *page, int
}
EXPORT_SYMBOL(set_pages_wb);
+int set_pages_array_wb(struct page **pages, int addrinarray)
+{
+ int retval;
+ unsigned long start;
+ unsigned long end;
+ int i;
+
+ retval = cpa_clear_pages_array(pages, addrinarray,
+ __pgprot(_PAGE_CACHE_MASK));
+ if (retval)
+ return retval;
+
+ for (i = 0; i < addrinarray; i++) {
+ start = (unsigned long)page_address(pages[i]);
+ end = start + PAGE_SIZE;
+ free_memtype(start, end);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(set_pages_array_wb);
+
int set_pages_x(struct page *page, int numpages)
{
unsigned long addr = (unsigned long)page_address(page);
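
The new set_pages_array_uc()/set_pages_array_wb() pair batches the attribute
change for a whole page array into one CPA call instead of one call per page,
reserving and releasing the memtype per page around it. A usage sketch under
assumed names (the helpers and their caller are hypothetical):

	/* Hypothetical sketch: uncached page array for device access. */
	#include <linux/mm.h>
	#include <asm/cacheflush.h>

	static int mydrv_pin_uncached(struct page **pages, int n)
	{
		/* one CPA pass; partial memtype reservations are
		 * released internally on failure */
		return set_pages_array_uc(pages, n);
	}

	static void mydrv_unpin(struct page **pages, int n)
	{
		/* clear cache attributes, then drop the memtypes */
		WARN_ON(set_pages_array_wb(pages, n));
	}
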
--- head-2010-05-25.orig/arch/x86/mm/pat-xen.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/arch/x86/mm/pat-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -31,7 +31,7 @@
#ifdef CONFIG_X86_PAT
int __read_mostly pat_enabled = 1;
-void __cpuinit pat_disable(char *reason)
+static inline void pat_disable(const char *reason)
{
pat_enabled = 0;
printk(KERN_INFO "%s\n", reason);
@@ -43,6 +43,11 @@ static int __init nopat(char *str)
return 0;
}
early_param("nopat", nopat);
+#else
+static inline void pat_disable(const char *reason)
+{
+ (void)reason;
+}
#endif
@@ -79,16 +84,20 @@ void pat_init(void)
if (!pat_enabled)
return;
- /* Paranoia check. */
- if (!cpu_has_pat && boot_pat_state) {
- /*
- * If this happens we are on a secondary CPU, but
- * switched to PAT on the boot CPU. We have no way to
- * undo PAT.
- */
- printk(KERN_ERR "PAT enabled, "
- "but not supported by secondary CPU\n");
- BUG();
+ if (!cpu_has_pat) {
+ if (!boot_pat_state) {
+ pat_disable("PAT not supported by CPU.");
+ return;
+ } else {
+ /*
+ * If this happens we are on a secondary CPU, but
+ * switched to PAT on the boot CPU. We have no way to
+ * undo PAT.
+ */
+ printk(KERN_ERR "PAT enabled, "
+ "but not supported by secondary CPU\n");
+ BUG();
+ }
}
#ifndef CONFIG_XEN
@@ -195,10 +204,10 @@ static unsigned long pat_x_mtrr_type(u64
u8 mtrr_type;
mtrr_type = mtrr_type_lookup(start, end);
- if (mtrr_type == MTRR_TYPE_UNCACHABLE)
- return _PAGE_CACHE_UC;
- if (mtrr_type == MTRR_TYPE_WRCOMB)
- return _PAGE_CACHE_WC;
+ if (mtrr_type != MTRR_TYPE_WRBACK)
+ return _PAGE_CACHE_UC_MINUS;
+
+ return _PAGE_CACHE_WB;
}
return req_type;
@@ -371,23 +380,13 @@ int reserve_memtype(u64 start, u64 end,
return 0;
}
- if (req_type == -1) {
- /*
- * Call mtrr_lookup to get the type hint. This is an
- * optimization for /dev/mem mmap'ers into WB memory (BIOS
- * tools and ACPI tools). Use WB request for WB memory and use
- * UC_MINUS otherwise.
- */
- u8 mtrr_type = mtrr_type_lookup(start, end);
-
- if (mtrr_type == MTRR_TYPE_WRBACK)
- actual_type = _PAGE_CACHE_WB;
- else
- actual_type = _PAGE_CACHE_UC_MINUS;
- } else {
- actual_type = pat_x_mtrr_type(start, end,
- req_type & _PAGE_CACHE_MASK);
- }
+ /*
+ * Call mtrr_lookup to get the type hint. This is an
+ * optimization for /dev/mem mmap'ers into WB memory (BIOS
+ * tools and ACPI tools). Use WB request for WB memory and use
+ * UC_MINUS otherwise.
+ */
+ actual_type = pat_x_mtrr_type(start, end, req_type & _PAGE_CACHE_MASK);
if (new_type)
*new_type = actual_type;
@@ -565,9 +564,7 @@ static inline int range_is_allowed(unsig
int phys_mem_access_prot_allowed(struct file *file, unsigned long mfn,
unsigned long size, pgprot_t *vma_prot)
{
- u64 addr = (u64)mfn << PAGE_SHIFT;
- unsigned long flags = -1;
- int retval;
+ unsigned long flags = _PAGE_CACHE_WB;
if (!range_is_allowed(mfn, size))
return 0;
@@ -597,60 +594,21 @@ int phys_mem_access_prot_allowed(struct
#endif
#endif
- /*
- * With O_SYNC, we can only take UC_MINUS mapping. Fail if we cannot.
- *
- * Without O_SYNC, we want to get
- * - WB for WB-able memory and no other conflicting mappings
- * - UC_MINUS for non-WB-able memory with no other conflicting mappings
- * - Inherit from confliting mappings otherwise
- */
- if (flags != -1) {
- retval = reserve_memtype(addr, addr + size, flags, NULL);
- } else {
- retval = reserve_memtype(addr, addr + size, -1, &flags);
- }
-
- if (retval < 0)
- return 0;
-
- if (ioremap_check_change_attr(mfn, size, flags) < 0) {
- free_memtype(addr, addr + size);
- printk(KERN_INFO
- "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n",
- current->comm, current->pid,
- cattr_name(flags),
- addr, addr + size);
- return 0;
- }
-
*vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
flags);
return 1;
}
-void map_devmem(unsigned long mfn, unsigned long size, pgprot_t vma_prot)
-{
- unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
- u64 addr = (u64)mfn << PAGE_SHIFT;
- unsigned long flags;
-
- reserve_memtype(addr, addr + size, want_flags, &flags);
- if (flags != want_flags) {
- printk(KERN_INFO
- "%s:%d /dev/mem expected mapping type %s for %Lx-%Lx, got %s\n",
- current->comm, current->pid,
- cattr_name(want_flags),
- addr, (unsigned long long)(addr + size),
- cattr_name(flags));
- }
-}
-
-void unmap_devmem(unsigned long mfn, unsigned long size, pgprot_t vma_prot)
+/*
+ * Change the memory type for the physical address range in kernel identity
+ * mapping space if that range is a part of identity map.
+ */
+int kernel_map_sync_memtype(u64 ma, unsigned long size, unsigned long flags)
{
- u64 addr = (u64)mfn << PAGE_SHIFT;
+ if (!pat_enabled)
+ return 0;
- free_memtype(addr, addr + size);
+ return ioremap_check_change_attr(ma >> PAGE_SHIFT, size, flags);
}
#ifndef CONFIG_XEN
@@ -663,17 +621,18 @@ static int reserve_pfn_range(u64 paddr,
int strict_prot)
{
int is_ram = 0;
- int id_sz, ret;
- unsigned long flags;
+ int ret;
unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK);
+ unsigned long flags = want_flags;
is_ram = pat_pagerange_is_ram(paddr, paddr + size);
/*
- * reserve_pfn_range() doesn't support RAM pages.
+ * reserve_pfn_range() doesn't support RAM pages. Maintain the current
+ * behavior with RAM pages by returning success.
*/
if (is_ram != 0)
- return -EINVAL;
+ return 0;
ret = reserve_memtype(paddr, paddr + size, want_flags, &flags);
if (ret)
@@ -700,23 +659,8 @@ static int reserve_pfn_range(u64 paddr,
flags);
}
- /* Need to keep identity mapping in sync */
- if (paddr >= __pa(high_memory))
- return 0;
-
- id_sz = (__pa(high_memory) < paddr + size) ?
- __pa(high_memory) - paddr :
- size;
-
- if (ioremap_change_attr((unsigned long)__va(paddr), id_sz, flags) < 0) {
+ if (kernel_map_sync_memtype(paddr, size, flags) < 0) {
free_memtype(paddr, paddr + size);
- printk(KERN_ERR
- "%s:%d reserve_pfn_range ioremap_change_attr failed %s "
- "for %Lx-%Lx\n",
- current->comm, current->pid,
- cattr_name(flags),
- (unsigned long long)paddr,
- (unsigned long long)(paddr + size));
return -EINVAL;
}
return 0;
@@ -741,29 +685,28 @@ static void free_pfn_range(u64 paddr, un
*
* If the vma has a linear pfn mapping for the entire range, we get the prot
* from pte and reserve the entire vma range with single reserve_pfn_range call.
- * Otherwise, we reserve the entire vma range, my ging through the PTEs page
- * by page to get physical address and protection.
*/
int track_pfn_vma_copy(struct vm_area_struct *vma)
{
- int retval = 0;
- unsigned long i, j;
resource_size_t paddr;
unsigned long prot;
- unsigned long vma_start = vma->vm_start;
- unsigned long vma_end = vma->vm_end;
- unsigned long vma_size = vma_end - vma_start;
+ unsigned long vma_size = vma->vm_end - vma->vm_start;
pgprot_t pgprot;
if (!pat_enabled)
return 0;
+ /*
+ * For now, only handle remap_pfn_range() vmas where
+ * is_linear_pfn_mapping() == TRUE. Handling of
+ * vm_insert_pfn() is TBD.
+ */
if (is_linear_pfn_mapping(vma)) {
/*
* reserve the whole chunk covered by vma. We need the
* starting address and protection from pte.
*/
- if (follow_phys(vma, vma_start, 0, &prot, &paddr)) {
+ if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
WARN_ON_ONCE(1);
return -EINVAL;
}
@@ -771,28 +714,7 @@ int track_pfn_vma_copy(struct vm_area_st
return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
}
- /* reserve entire vma page by page, using pfn and prot from pte */
- for (i = 0; i < vma_size; i += PAGE_SIZE) {
- if (follow_phys(vma, vma_start + i, 0, &prot, &paddr))
- continue;
-
- pgprot = __pgprot(prot);
- retval = reserve_pfn_range(paddr, PAGE_SIZE, &pgprot, 1);
- if (retval)
- goto cleanup_ret;
- }
return 0;
-
-cleanup_ret:
- /* Reserve error: Cleanup partial reservation and return error */
- for (j = 0; j < i; j += PAGE_SIZE) {
- if (follow_phys(vma, vma_start + j, 0, &prot, &paddr))
- continue;
-
- free_pfn_range(paddr, PAGE_SIZE);
- }
-
- return retval;
}
/*
@@ -802,50 +724,28 @@ cleanup_ret:
* prot is passed in as a parameter for the new mapping. If the vma has a
* linear pfn mapping for the entire range reserve the entire vma range with
* single reserve_pfn_range call.
- * Otherwise, we look t the pfn and size and reserve only the specified range
- * page by page.
- *
- * Note that this function can be called with caller trying to map only a
- * subrange/page inside the vma.
*/
int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot,
unsigned long pfn, unsigned long size)
{
- int retval = 0;
- unsigned long i, j;
- resource_size_t base_paddr;
resource_size_t paddr;
- unsigned long vma_start = vma->vm_start;
- unsigned long vma_end = vma->vm_end;
- unsigned long vma_size = vma_end - vma_start;
+ unsigned long vma_size = vma->vm_end - vma->vm_start;
if (!pat_enabled)
return 0;
+ /*
+ * For now, only handle remap_pfn_range() vmas where
+ * is_linear_pfn_mapping() == TRUE. Handling of
+ * vm_insert_pfn() is TBD.
+ */
if (is_linear_pfn_mapping(vma)) {
/* reserve the whole chunk starting from vm_pgoff */
paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
return reserve_pfn_range(paddr, vma_size, prot, 0);
}
- /* reserve page by page using pfn and size */
- base_paddr = (resource_size_t)pfn << PAGE_SHIFT;
- for (i = 0; i < size; i += PAGE_SIZE) {
- paddr = base_paddr + i;
- retval = reserve_pfn_range(paddr, PAGE_SIZE, prot, 0);
- if (retval)
- goto cleanup_ret;
- }
return 0;
-
-cleanup_ret:
- /* Reserve error: Cleanup partial reservation and return error */
- for (j = 0; j < i; j += PAGE_SIZE) {
- paddr = base_paddr + j;
- free_pfn_range(paddr, PAGE_SIZE);
- }
-
- return retval;
}
/*
@@ -856,39 +756,23 @@ cleanup_ret:
void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
unsigned long size)
{
- unsigned long i;
resource_size_t paddr;
- unsigned long prot;
- unsigned long vma_start = vma->vm_start;
- unsigned long vma_end = vma->vm_end;
- unsigned long vma_size = vma_end - vma_start;
+ unsigned long vma_size = vma->vm_end - vma->vm_start;
if (!pat_enabled)
return;
+ /*
+ * For now, only handle remap_pfn_range() vmas where
+ * is_linear_pfn_mapping() == TRUE. Handling of
+ * vm_insert_pfn() is TBD.
+ */
if (is_linear_pfn_mapping(vma)) {
/* free the whole chunk starting from vm_pgoff */
paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
free_pfn_range(paddr, vma_size);
return;
}
-
- if (size != 0 && size != vma_size) {
- /* free page by page, using pfn and size */
- paddr = (resource_size_t)pfn << PAGE_SHIFT;
- for (i = 0; i < size; i += PAGE_SIZE) {
- paddr = paddr + i;
- free_pfn_range(paddr, PAGE_SIZE);
- }
- } else {
- /* free entire vma, page by page, using the pfn from pte */
- for (i = 0; i < vma_size; i += PAGE_SIZE) {
- if (follow_phys(vma, vma_start + i, 0, &prot, &paddr))
- continue;
-
- free_pfn_range(paddr, PAGE_SIZE);
- }
- }
}
#endif /* CONFIG_XEN */
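
track_pfn_vma_new(), track_pfn_vma_copy() and untrack_pfn_vma() are reduced to
the is_linear_pfn_mapping() case, i.e. whole-vma remap_pfn_range() mappings;
the per-page tracking via follow_phys() is dropped. A sketch of the mmap
handler shape that stays on this supported path (driver names are assumptions):

	/* Hypothetical sketch: a linear pfn mapping PAT can track as one range. */
	#include <linux/fs.h>
	#include <linux/mm.h>

	static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
	{
		unsigned long size = vma->vm_end - vma->vm_start;

		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
		/* covers the whole vma starting at vm_pgoff, which is exactly
		 * the case reserve_pfn_range() handles above */
		return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
				       size, vma->vm_page_prot);
	}
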
--- head-2010-05-25.orig/arch/x86/mm/pgtable-xen.c 2010-03-24 15:14:47.000000000 +0100
+++ head-2010-05-25/arch/x86/mm/pgtable-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -122,10 +122,6 @@ void __pud_free_tlb(struct mmu_gather *t
#endif /* PAGETABLE_LEVELS > 3 */
#endif /* PAGETABLE_LEVELS > 2 */
-#ifndef CONFIG_X86_64
-#define TASK_SIZE64 TASK_SIZE
-#endif
-
static void _pin_lock(struct mm_struct *mm, int lock) {
if (lock)
spin_lock(&mm->page_table_lock);
@@ -149,7 +145,7 @@ static void _pin_lock(struct mm_struct *
pgd_t *pgd = mm->pgd;
unsigned g;
- for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
+ for (g = 0; g <= ((TASK_SIZE_MAX-1) / PGDIR_SIZE); g++, pgd++) {
pud_t *pud;
unsigned u;
@@ -230,10 +226,10 @@ static void pgd_walk(pgd_t *pgd_base, pg
* Cannot iterate up to USER_PTRS_PER_PGD on x86-64 as these pagetables
* may not be the 'current' task's pagetables (e.g., current may be
* 32-bit, but the pagetables may be for a 64-bit task).
- * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
- * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
+ * Subtracting 1 from TASK_SIZE_MAX means the loop limit is correct
+ * regardless of whether TASK_SIZE_MAX is a multiple of PGDIR_SIZE.
*/
- for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
+ for (g = 0, seq = 0; g <= ((TASK_SIZE_MAX-1) / PGDIR_SIZE); g++, pgd++) {
if (pgd_none(*pgd))
continue;
pud = pud_offset(pgd, 0);
@@ -736,9 +732,26 @@ int ptep_clear_flush_young(struct vm_are
return young;
}
+/**
+ * reserve_top_address - reserves a hole in the top of kernel address space
+ * @reserve - size of hole to reserve
+ *
+ * Can be used to relocate the fixmap area and poke a hole in the top
+ * of kernel address space to make room for a hypervisor.
+ */
+void __init reserve_top_address(unsigned long reserve)
+{
+#ifdef CONFIG_X86_32
+ BUG_ON(fixmaps_set > 0);
+ printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
+ (int)-reserve);
+ __FIXADDR_TOP = -reserve - PAGE_SIZE;
+#endif
+}
+
int fixmaps_set;
-void xen_set_fixmap(enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
+void xen_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
{
unsigned long address = __fix_to_virt(idx);
pte_t pte;
@@ -757,6 +770,8 @@ void xen_set_fixmap(enum fixed_addresses
set_pte_vaddr_pud(level3_user_pgt, address, pte);
break;
case FIX_EARLYCON_MEM_BASE:
+ case FIX_SHARED_INFO:
+ case FIX_ISAMAP_END ... FIX_ISAMAP_BEGIN:
xen_l1_entry_update(level1_fixmap_pgt + pte_index(address),
pfn_pte_ma(phys >> PAGE_SHIFT, flags));
fixmaps_set++;
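
reserve_top_address() moves here and is compiled out on 64-bit by the
CONFIG_X86_32 guard; the BUG_ON(fixmaps_set > 0) means it must run before the
first fixmap is installed. A sketch of an early-boot call (the 256 MiB figure
is purely illustrative):

	/* Hypothetical sketch: poke a hypervisor hole before any fixmap is set. */
	static void __init myplat_reserve_hole(void)
	{
		/* lowers __FIXADDR_TOP by 256 MiB plus one guard page */
		reserve_top_address(256UL << 20);
	}
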
--- head-2010-05-25.orig/arch/x86/mm/pgtable_32-xen.c 2010-03-24 15:14:47.000000000 +0100
+++ head-2010-05-25/arch/x86/mm/pgtable_32-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -25,6 +25,8 @@
#include <xen/features.h>
#include <asm/hypervisor.h>
+unsigned int __VMALLOC_RESERVE = 128 << 20;
+
/*
* Associate a virtual page frame with a given physical page frame
* and protection flags for that frame.
@@ -54,7 +56,7 @@ void set_pte_vaddr(unsigned long vaddr,
}
pte = pte_offset_kernel(pmd, vaddr);
if (pte_val(pteval))
- set_pte_present(&init_mm, vaddr, pte, pteval);
+ set_pte_at(&init_mm, vaddr, pte, pteval);
else
pte_clear(&init_mm, vaddr, pte);
@@ -109,21 +111,6 @@ unsigned long hypervisor_virt_start = HY
unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - PAGE_SIZE);
EXPORT_SYMBOL(__FIXADDR_TOP);
-/**
- * reserve_top_address - reserves a hole in the top of kernel address space
- * @reserve - size of hole to reserve
- *
- * Can be used to relocate the fixmap area and poke a hole in the top
- * of kernel address space to make room for a hypervisor.
- */
-void __init reserve_top_address(unsigned long reserve)
-{
- BUG_ON(fixmaps_set > 0);
- printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
- (int)-reserve);
- __FIXADDR_TOP = -reserve - PAGE_SIZE;
-}
-
/*
* vmalloc=size forces the vmalloc area to be exactly 'size'
* bytes. This can be used to increase (or decrease) the
--- head-2010-05-25.orig/drivers/acpi/Makefile 2010-03-24 14:53:41.000000000 +0100
+++ head-2010-05-25/drivers/acpi/Makefile 2010-03-24 15:25:06.000000000 +0100
@@ -64,8 +64,6 @@ obj-$(CONFIG_ACPI_POWER_METER) += power_
processor-y := processor_driver.o processor_throttling.o
processor-y += processor_idle.o processor_thermal.o
processor-$(CONFIG_CPU_FREQ) += processor_perflib.o
-ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL
-processor-objs += processor_perflib.o processor_extcntl.o
-endif
+processor-$(CONFIG_PROCESSOR_EXTERNAL_CONTROL) += processor_perflib.o processor_extcntl.o
obj-$(CONFIG_ACPI_PROCESSOR_AGGREGATOR) += acpi_pad.o
--- head-2010-05-25.orig/drivers/acpi/acpica/hwsleep.c 2010-03-24 15:14:47.000000000 +0100
+++ head-2010-05-25/drivers/acpi/acpica/hwsleep.c 2010-03-24 15:25:06.000000000 +0100
@@ -394,7 +394,7 @@ acpi_status asmlinkage acpi_enter_sleep_
#else
/* PV ACPI just need check hypercall return value */
err = acpi_notify_hypervisor_state(sleep_state,
- PM1Acontrol, PM1Bcontrol);
+ pm1a_control, pm1b_control);
if (err) {
printk(KERN_ERR "ACPI: Hypervisor failure [%d]\n", err);
return_ACPI_STATUS(AE_ERROR);
--- head-2010-05-25.orig/drivers/acpi/processor_idle.c 2010-04-15 10:06:51.000000000 +0200
+++ head-2010-05-25/drivers/acpi/processor_idle.c 2010-04-15 10:06:59.000000000 +0200
@@ -606,7 +606,7 @@ static void acpi_processor_power_verify_
#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL
cx->latency_ticks = cx->latency;
#else
- cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency);
+ cx->latency_ticks = us_to_pm_timer_ticks(cx->latency);
#endif
/*
* On older chipsets, BM_RLD needs to be set
@@ -643,7 +643,7 @@ static int acpi_processor_power_verify(s
#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL
cx->latency_ticks = cx->latency; /* Normalize latency */
#else
- cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency);
+ cx->latency_ticks = us_to_pm_timer_ticks(cx->latency);
#endif
break;
--- head-2010-05-25.orig/drivers/oprofile/oprofile_files.c 2010-03-24 15:02:17.000000000 +0100
+++ head-2010-05-25/drivers/oprofile/oprofile_files.c 2010-03-24 15:25:06.000000000 +0100
@@ -172,6 +172,7 @@ static const struct file_operations dump
};
#ifdef CONFIG_XEN
+#include <linux/slab.h>
#define TMPBUFSIZE 512
--- head-2010-05-25.orig/drivers/pci/msi-xen.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/drivers/pci/msi-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -47,47 +47,50 @@ struct msi_pirq_entry {
/* Arch hooks */
-int __attribute__ ((weak))
-arch_msi_check_device(struct pci_dev *dev, int nvec, int type)
-{
- return 0;
-}
-
-#ifndef CONFIG_XEN
-int __attribute__ ((weak))
-arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *entry)
+#ifndef arch_msi_check_device
+int arch_msi_check_device(struct pci_dev *dev, int nvec, int type)
{
return 0;
}
+#endif
-int __attribute__ ((weak))
-arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+#ifndef arch_setup_msi_irqs
+int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
{
struct msi_desc *entry;
int ret;
+ /*
+ * If an architecture wants to support multiple MSI, it needs to
+ * override arch_setup_msi_irqs()
+ */
+ if (type == PCI_CAP_ID_MSI && nvec > 1)
+ return 1;
+
list_for_each_entry(entry, &dev->msi_list, list) {
ret = arch_setup_msi_irq(dev, entry);
- if (ret)
+ if (ret < 0)
return ret;
+ if (ret > 0)
+ return -ENOSPC;
}
return 0;
}
+#endif
-void __attribute__ ((weak)) arch_teardown_msi_irq(unsigned int irq)
-{
- return;
-}
-
-void __attribute__ ((weak))
-arch_teardown_msi_irqs(struct pci_dev *dev)
+#ifndef arch_teardown_msi_irqs
+void arch_teardown_msi_irqs(struct pci_dev *dev)
{
struct msi_desc *entry;
list_for_each_entry(entry, &dev->msi_list, list) {
- if (entry->irq != 0)
- arch_teardown_msi_irq(entry->irq);
+ int i, nvec;
+ if (entry->irq == 0)
+ continue;
+ nvec = 1 << entry->msi_attrib.multiple;
+ for (i = 0; i < nvec; i++)
+ arch_teardown_msi_irq(entry->irq + i);
}
}
#endif
@@ -347,13 +350,15 @@ EXPORT_SYMBOL_GPL(pci_restore_msi_state)
/**
* msi_capability_init - configure device's MSI capability structure
* @dev: pointer to the pci_dev data structure of MSI device function
+ * @nvec: number of interrupts to allocate
*
- * Setup the MSI capability structure of device function with a single
- * MSI irq, regardless of device function is capable of handling
- * multiple messages. A return of zero indicates the successful setup
- * of an entry zero with the new MSI irq or non-zero for otherwise.
- **/
-static int msi_capability_init(struct pci_dev *dev)
+ * Setup the MSI capability structure of the device with the requested
+ * number of interrupts. A return value of zero indicates the successful
+ * setup of an entry with the new MSI irq. A negative return value indicates
+ * an error, and a positive return value indicates the number of interrupts
+ * which could have been allocated.
+ */
+static int msi_capability_init(struct pci_dev *dev, int nvec)
{
int pos, pirq;
u16 control;
@@ -363,6 +368,7 @@ static int msi_capability_init(struct pc
pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
pci_read_config_word(dev, msi_control_reg(pos), &control);
+ WARN_ON(nvec > 1); /* XXX */
pirq = msi_map_vector(dev, 0, 0);
if (pirq < 0)
return -EBUSY;
@@ -496,22 +502,34 @@ static int pci_msi_check_device(struct p
}
/**
- * pci_enable_msi - configure device's MSI capability structure
- * @dev: pointer to the pci_dev data structure of MSI device function
+ * pci_enable_msi_block - configure device's MSI capability structure
+ * @dev: device to configure
+ * @nvec: number of interrupts to configure
*
- * Setup the MSI capability structure of device function with
- * a single MSI irq upon its software driver call to request for
- * MSI mode enabled on its hardware device function. A return of zero
- * indicates the successful setup of an entry zero with the new MSI
- * vector or non-zero for otherwise.
- **/
+ * Allocate IRQs for a device with the MSI capability.
+ * This function returns a negative errno if an error occurs. If it
+ * is unable to allocate the number of interrupts requested, it returns
+ * the number of interrupts it might be able to allocate. If it successfully
+ * allocates at least the number of interrupts requested, it returns 0 and
+ * updates the @dev's irq member to the lowest new interrupt number; the
+ * other interrupt numbers allocated to this device are consecutive.
+ */
extern int pci_frontend_enable_msi(struct pci_dev *dev);
-int pci_enable_msi(struct pci_dev* dev)
+int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec)
{
- int temp, status;
+ int temp, status, pos, maxvec;
+ u16 msgctl;
struct msi_dev_list *msi_dev_entry = get_msi_dev_pirq_list(dev);
- status = pci_msi_check_device(dev, 1, PCI_CAP_ID_MSI);
+ pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
+ if (!pos)
+ return -EINVAL;
+ pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl);
+ maxvec = 1 << ((msgctl & PCI_MSI_FLAGS_QMASK) >> 1);
+ if (nvec > maxvec)
+ return maxvec;
+
+ status = pci_msi_check_device(dev, nvec, PCI_CAP_ID_MSI);
if (status)
return status;
@@ -521,6 +539,7 @@ int pci_enable_msi(struct pci_dev* dev)
int ret;
temp = dev->irq;
+ WARN_ON(nvec > 1); /* XXX */
ret = pci_frontend_enable_msi(dev);
if (ret)
return ret;
@@ -535,23 +554,23 @@ int pci_enable_msi(struct pci_dev* dev)
temp = dev->irq;
- /* Check whether driver already requested for MSI-X irqs */
+ /* Check whether driver already requested MSI-X irqs */
if (dev->msix_enabled) {
dev_info(&dev->dev, "can't enable MSI "
"(MSI-X already enabled)\n");
return -EINVAL;
}
- status = msi_capability_init(dev);
+ status = msi_capability_init(dev, nvec);
if ( !status )
msi_dev_entry->default_irq = temp;
return status;
}
-EXPORT_SYMBOL(pci_enable_msi);
+EXPORT_SYMBOL(pci_enable_msi_block);
extern void pci_frontend_disable_msi(struct pci_dev* dev);
-void pci_msi_shutdown(struct pci_dev* dev)
+void pci_msi_shutdown(struct pci_dev *dev)
{
int pirq;
struct msi_dev_list *msi_dev_entry = get_msi_dev_pirq_list(dev);
@@ -579,6 +598,7 @@ void pci_msi_shutdown(struct pci_dev* de
pci_intx_for_msi(dev, 1);
dev->msi_enabled = 0;
}
+
void pci_disable_msi(struct pci_dev* dev)
{
pci_msi_shutdown(dev);
@@ -586,6 +606,23 @@ void pci_disable_msi(struct pci_dev* dev
EXPORT_SYMBOL(pci_disable_msi);
/**
+ * pci_msix_table_size - return the number of device's MSI-X table entries
+ * @dev: pointer to the pci_dev data structure of MSI-X device function
+ */
+int pci_msix_table_size(struct pci_dev *dev)
+{
+ int pos;
+ u16 control;
+
+ pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
+ if (!pos)
+ return 0;
+
+ pci_read_config_word(dev, msi_control_reg(pos), &control);
+ return multi_msix_capable(control);
+}
+
+/**
* pci_enable_msix - configure device's MSI-X capability structure
* @dev: pointer to the pci_dev data structure of MSI-X device function
* @entries: pointer to an array of MSI-X entries
@@ -604,9 +641,8 @@ extern int pci_frontend_enable_msix(stru
struct msix_entry *entries, int nvec);
int pci_enable_msix(struct pci_dev* dev, struct msix_entry *entries, int nvec)
{
- int status, pos, nr_entries;
+ int status, nr_entries;
int i, j, temp;
- u16 control;
struct msi_dev_list *msi_dev_entry = get_msi_dev_pirq_list(dev);
if (!entries)
@@ -653,9 +689,7 @@ int pci_enable_msix(struct pci_dev* dev,
if (status)
return status;
- pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
- pci_read_config_word(dev, msi_control_reg(pos), &control);
- nr_entries = multi_msix_capable(control);
+ nr_entries = pci_msix_table_size(dev);
if (nvec > nr_entries)
return -EINVAL;
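
pci_enable_msi_block() returns 0 on success, a negative errno on hard failure,
and a positive count when fewer vectors are available than requested, so
callers are expected to retry with the advertised count. A sketch of that loop
(device and function names assumed; this Xen variant still warns for nvec > 1):

	/* Hypothetical sketch: request up to 4 MSI vectors, take what we get. */
	#include <linux/pci.h>

	static int mydrv_setup_msi(struct pci_dev *pdev)
	{
		int rc, nvec = 4;

		while ((rc = pci_enable_msi_block(pdev, nvec)) > 0)
			nvec = rc;	/* retry with the supported count */
		if (rc < 0)
			return rc;	/* caller falls back to INTx */
		return nvec;	/* pdev->irq .. pdev->irq + nvec - 1 are allocated */
	}
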
--- head-2010-05-25.orig/drivers/xen/Kconfig 2010-03-24 15:18:46.000000000 +0100
+++ head-2010-05-25/drivers/xen/Kconfig 2010-03-24 15:25:06.000000000 +0100
@@ -14,7 +14,6 @@ menu "XEN"
config XEN_PRIVILEGED_GUEST
bool "Privileged Guest (domain 0)"
- select PCI_REASSIGN if PCI
help
Support for privileged operation (domain 0)
@@ -333,10 +332,6 @@ endmenu
config HAVE_IRQ_IGNORE_UNHANDLED
def_bool y
-config GENERIC_HARDIRQS_NO__DO_IRQ
- def_bool y
- depends on X86
-
config NO_IDLE_HZ
def_bool y
--- head-2010-05-25.orig/drivers/xen/char/mem.c 2010-03-24 15:12:46.000000000 +0100
+++ head-2010-05-25/drivers/xen/char/mem.c 2010-03-24 15:25:06.000000000 +0100
@@ -158,21 +158,7 @@ static ssize_t write_mem(struct file * f
}
#ifndef ARCH_HAS_DEV_MEM_MMAP_MEM
-static void mmap_mem_open(struct vm_area_struct *vma)
-{
- map_devmem(vma->vm_pgoff, vma->vm_end - vma->vm_start,
- vma->vm_page_prot);
-}
-
-static void mmap_mem_close(struct vm_area_struct *vma)
-{
- unmap_devmem(vma->vm_pgoff, vma->vm_end - vma->vm_start,
- vma->vm_page_prot);
-}
-
static struct vm_operations_struct mmap_mem_ops = {
- .open = mmap_mem_open,
- .close = mmap_mem_close,
#ifdef CONFIG_HAVE_IOREMAP_PROT
.access = generic_access_phys
#endif
--- head-2010-05-25.orig/drivers/xen/core/Makefile 2010-04-19 14:50:32.000000000 +0200
+++ head-2010-05-25/drivers/xen/core/Makefile 2010-04-19 14:52:49.000000000 +0200
@@ -10,5 +10,5 @@ obj-$(CONFIG_SYS_HYPERVISOR) += hypervis
obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o
obj-$(CONFIG_XEN_SMPBOOT) += smpboot.o
-obj-$(CONFIG_X86_SMP) += spinlock.o
+obj-$(CONFIG_SMP) += spinlock.o
obj-$(CONFIG_KEXEC) += machine_kexec.o
--- head-2010-05-25.orig/drivers/xen/core/evtchn.c 2010-04-23 15:19:25.000000000 +0200
+++ head-2010-05-25/drivers/xen/core/evtchn.c 2010-04-23 15:19:37.000000000 +0200
@@ -150,13 +150,15 @@ DEFINE_PER_CPU(int, ipi_to_irq[NR_IPIS])
#ifdef CONFIG_SMP
static u8 cpu_evtchn[NR_EVENT_CHANNELS];
-static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
+static DEFINE_PER_CPU(unsigned long[BITS_TO_LONGS(NR_EVENT_CHANNELS)],
+ cpu_evtchn_mask);
-static inline unsigned long active_evtchns(unsigned int cpu, shared_info_t *sh,
- unsigned int idx)
+static inline unsigned long active_evtchns(unsigned int idx)
{
+ shared_info_t *sh = HYPERVISOR_shared_info;
+
return (sh->evtchn_pending[idx] &
- cpu_evtchn_mask[cpu][idx] &
+ percpu_read(cpu_evtchn_mask[idx]) &
~sh->evtchn_mask[idx]);
}
@@ -168,10 +170,10 @@ static void bind_evtchn_to_cpu(unsigned
BUG_ON(!test_bit(chn, s->evtchn_mask));
if (irq != -1)
- irq_to_desc(irq)->affinity = cpumask_of_cpu(cpu);
+ cpumask_copy(irq_to_desc(irq)->affinity, cpumask_of(cpu));
- clear_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu_evtchn[chn]]);
- set_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu]);
+ clear_bit(chn, per_cpu(cpu_evtchn_mask, cpu_evtchn[chn]));
+ set_bit(chn, per_cpu(cpu_evtchn_mask, cpu));
cpu_evtchn[chn] = cpu;
}
@@ -184,11 +186,11 @@ static void init_evtchn_cpu_bindings(voi
struct irq_desc *desc = irq_to_desc(i);
if (desc)
- desc->affinity = cpumask_of_cpu(0);
+ cpumask_copy(desc->affinity, cpumask_of(0));
}
memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
- memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
+ memset(per_cpu(cpu_evtchn_mask, 0), ~0, sizeof(per_cpu(cpu_evtchn_mask, 0)));
}
static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
@@ -198,9 +200,10 @@ static inline unsigned int cpu_from_evtc
#else
-static inline unsigned long active_evtchns(unsigned int cpu, shared_info_t *sh,
- unsigned int idx)
+static inline unsigned long active_evtchns(unsigned int idx)
{
+ shared_info_t *sh = HYPERVISOR_shared_info;
+
return (sh->evtchn_pending[idx] & ~sh->evtchn_mask[idx]);
}
@@ -219,25 +222,15 @@ static inline unsigned int cpu_from_evtc
#endif
-/* Upcall to generic IRQ layer. */
#ifdef CONFIG_X86
-extern unsigned int do_IRQ(struct pt_regs *regs);
void __init xen_init_IRQ(void);
void __init init_IRQ(void)
{
irq_ctx_init(0);
xen_init_IRQ();
}
-#if defined (__i386__)
-static inline void exit_idle(void) {}
-#elif defined (__x86_64__)
#include <asm/idle.h>
#endif
-#define do_IRQ(irq, regs) do { \
- (regs)->orig_ax = ~(irq); \
- do_IRQ((regs)); \
-} while (0)
-#endif
/* Xen will never allocate port zero for any purpose. */
#define VALID_EVTCHN(chn) ((chn) != 0)
@@ -261,13 +254,12 @@ static DEFINE_PER_CPU(unsigned int, curr
/* NB. Interrupts are disabled on entry. */
asmlinkage void __irq_entry evtchn_do_upcall(struct pt_regs *regs)
{
+ struct pt_regs *old_regs = set_irq_regs(regs);
unsigned long l1, l2;
unsigned long masked_l1, masked_l2;
unsigned int l1i, l2i, start_l1i, start_l2i, port, count, i;
int irq;
- unsigned int cpu = smp_processor_id();
- shared_info_t *s = HYPERVISOR_shared_info;
- vcpu_info_t *vcpu_info = &s->vcpu_info[cpu];
+ vcpu_info_t *vcpu_info = current_vcpu_info();
exit_idle();
irq_enter();
@@ -277,7 +269,8 @@ asmlinkage void __irq_entry evtchn_do_up
vcpu_info->evtchn_upcall_pending = 0;
/* Nested invocations bail immediately. */
- if (unlikely(per_cpu(upcall_count, cpu)++))
+ percpu_add(upcall_count, 1);
+ if (unlikely(percpu_read(upcall_count) != 1))
break;
#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
@@ -286,8 +279,8 @@ asmlinkage void __irq_entry evtchn_do_up
#endif
l1 = xchg(&vcpu_info->evtchn_pending_sel, 0);
- start_l1i = l1i = per_cpu(current_l1i, cpu);
- start_l2i = per_cpu(current_l2i, cpu);
+ start_l1i = l1i = percpu_read(current_l1i);
+ start_l2i = percpu_read(current_l2i);
for (i = 0; l1 != 0; i++) {
masked_l1 = l1 & ((~0UL) << l1i);
@@ -298,7 +291,7 @@ asmlinkage void __irq_entry evtchn_do_up
}
l1i = __ffs(masked_l1);
- l2 = active_evtchns(cpu, s, l1i);
+ l2 = active_evtchns(l1i);
l2i = 0; /* usually scan entire word from start */
if (l1i == start_l1i) {
/* We scan the starting word in two parts. */
@@ -318,17 +311,18 @@ asmlinkage void __irq_entry evtchn_do_up
/* process port */
port = (l1i * BITS_PER_LONG) + l2i;
- if ((irq = evtchn_to_irq[port]) != -1)
- do_IRQ(irq, regs);
- else
+ if (unlikely((irq = evtchn_to_irq[port]) == -1))
evtchn_device_upcall(port);
+ else if (!handle_irq(irq, regs) && printk_ratelimit())
+ printk(KERN_EMERG "%s(%d): No handler for irq %d\n",
+ __func__, smp_processor_id(), irq);
l2i = (l2i + 1) % BITS_PER_LONG;
/* Next caller starts at last processed + 1 */
- per_cpu(current_l1i, cpu) =
- l2i ? l1i : (l1i + 1) % BITS_PER_LONG;
- per_cpu(current_l2i, cpu) = l2i;
+ percpu_write(current_l1i,
+ l2i ? l1i : (l1i + 1) % BITS_PER_LONG);
+ percpu_write(current_l2i, l2i);
} while (l2i != 0);
@@ -340,11 +334,12 @@ asmlinkage void __irq_entry evtchn_do_up
}
/* If there were nested callbacks then we have more to do. */
- count = per_cpu(upcall_count, cpu);
- per_cpu(upcall_count, cpu) = 0;
+ count = percpu_read(upcall_count);
+ percpu_write(upcall_count, 0);
} while (unlikely(count != 1));
irq_exit();
+ set_irq_regs(old_regs);
}
static struct irq_chip dynirq_chip;
@@ -551,7 +546,7 @@ static void unbind_from_irq(unsigned int
/* Zap stats across IRQ changes of use. */
for_each_possible_cpu(cpu)
-#ifdef CONFIG_SPARSE_IRQ
+#ifdef CONFIG_GENERIC_HARDIRQS
irq_to_desc(irq)->kstat_irqs[cpu] = 0;
#else
kstat_cpu(cpu).irqs[irq] = 0;
@@ -669,7 +664,8 @@ int bind_ipi_to_irqhandler(
if (irq < 0)
return irq;
- retval = request_irq(irq, handler, irqflags, devname, dev_id);
+ retval = request_irq(irq, handler, irqflags | IRQF_NO_SUSPEND,
+ devname, dev_id);
if (retval != 0) {
unbind_from_irq(irq);
return retval;
@@ -1134,7 +1130,7 @@ void irq_resume(void)
mask_evtchn(evtchn);
/* Check that no PIRQs are still bound. */
- for (irq = PIRQ_BASE; irq < (PIRQ_BASE + NR_PIRQS); irq++) {
+ for (irq = PIRQ_BASE; irq < (PIRQ_BASE + nr_pirqs); irq++) {
cfg = irq_cfg(irq);
BUG_ON(cfg && cfg->info != IRQ_UNBOUND);
}
@@ -1171,7 +1167,7 @@ int arch_init_chip_data(struct irq_desc
{
if (!desc->chip_data) {
/* By default all event channels notify CPU#0. */
- desc->affinity = cpumask_of_cpu(0);
+ cpumask_copy(desc->affinity, cpumask_of(0));
desc->chip_data = kzalloc(sizeof(struct irq_cfg), GFP_ATOMIC);
}
@@ -1185,11 +1181,44 @@ int arch_init_chip_data(struct irq_desc
#endif
#if defined(CONFIG_X86_IO_APIC)
+#ifdef CONFIG_SPARSE_IRQ
+int nr_pirqs = NR_PIRQS;
+EXPORT_SYMBOL_GPL(nr_pirqs);
+
+int __init arch_probe_nr_irqs(void)
+{
+ int nr_irqs_gsi, nr = acpi_probe_gsi();
+
+ if (nr <= NR_IRQS_LEGACY) {
+ /* for acpi=off or acpi not compiled in */
+ int idx;
+
+ for (nr = idx = 0; idx < nr_ioapics; idx++)
+ nr += io_apic_get_redir_entries(idx) + 1;
+ }
+ nr_irqs_gsi = max(nr, NR_IRQS_LEGACY);
+
+ nr = nr_irqs_gsi + 8 * nr_cpu_ids;
+#ifdef CONFIG_PCI_MSI
+ nr += nr_irqs_gsi * 16;
+#endif
+ if (nr_pirqs > nr) {
+ nr_pirqs = nr;
+ nr_irqs = nr + NR_DYNIRQS;
+ }
+
+ printk(KERN_DEBUG "nr_irqs_gsi=%d nr_pirqs=%d\n",
+ nr_irqs_gsi, nr_pirqs);
+
+ return 0;
+}
+#endif
+
int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
{
struct physdev_irq irq_op;
- if (irq < PIRQ_BASE || irq - PIRQ_BASE >= NR_PIRQS)
+ if (irq < PIRQ_BASE || irq - PIRQ_BASE >= nr_pirqs)
return -EINVAL;
if (cfg->vector)
@@ -1212,7 +1241,7 @@ int assign_irq_vector(int irq, struct ir
void evtchn_register_pirq(int irq)
{
- BUG_ON(irq < PIRQ_BASE || irq - PIRQ_BASE >= NR_PIRQS);
+ BUG_ON(irq < PIRQ_BASE || irq - PIRQ_BASE >= nr_pirqs);
if (identity_mapped_irq(irq) || type_from_irq(irq) != IRQT_UNBOUND)
return;
irq_cfg(irq)->info = mk_irq_info(IRQT_PIRQ, irq, 0);
@@ -1225,7 +1254,7 @@ int evtchn_map_pirq(int irq, int xen_pir
if (irq < 0) {
static DEFINE_SPINLOCK(irq_alloc_lock);
- irq = PIRQ_BASE + NR_PIRQS - 1;
+ irq = PIRQ_BASE + nr_pirqs - 1;
spin_lock(&irq_alloc_lock);
do {
struct irq_desc *desc;
@@ -1285,7 +1314,7 @@ void __init xen_init_IRQ(void)
init_evtchn_cpu_bindings();
pirq_needs_eoi = alloc_bootmem_pages(sizeof(unsigned long)
- * BITS_TO_LONGS(ALIGN(NR_PIRQS, PAGE_SIZE * 8)));
+ * BITS_TO_LONGS(ALIGN(nr_pirqs, PAGE_SIZE * 8)));
eoi_gmfn.gmfn = virt_to_machine(pirq_needs_eoi) >> PAGE_SHIFT;
if (HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn, &eoi_gmfn) == 0)
pirq_eoi_does_unmask = true;
@@ -1301,7 +1330,7 @@ void __init xen_init_IRQ(void)
handle_level_irq, "level");
}
- for (i = PIRQ_BASE; i < (PIRQ_BASE + NR_PIRQS); i++) {
+ for (i = PIRQ_BASE; i < (PIRQ_BASE + nr_pirqs); i++) {
#else
for (i = PIRQ_BASE; i < (PIRQ_BASE + NR_IRQS_LEGACY); i++) {
#endif
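
evtchn_do_upcall() now saves the previous irq regs pointer and dispatches
through the generic handle_irq() instead of the removed do_IRQ() wrapper.
Condensed to a single port, the dispatch shape the patch adopts looks like
this (a sketch, not a verbatim excerpt; in the real function
set_irq_regs()/irq_enter() bracket the whole scan loop):

	static void dispatch_port_sketch(unsigned int irq, struct pt_regs *regs)
	{
		struct pt_regs *old_regs = set_irq_regs(regs);

		irq_enter();
		if (!handle_irq(irq, regs) && printk_ratelimit())
			printk(KERN_EMERG "no handler for irq %d\n", irq);
		irq_exit();
		set_irq_regs(old_regs);
	}
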
--- head-2010-05-25.orig/drivers/xen/core/smpboot.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/drivers/xen/core/smpboot.c 2010-03-24 15:25:06.000000000 +0100
@@ -18,7 +18,6 @@
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <asm/desc.h>
-#include <asm/arch_hooks.h>
#include <asm/pgalloc.h>
#include <xen/evtchn.h>
#include <xen/interface/vcpu.h>
@@ -54,8 +53,8 @@ static char call1func_name[NR_CPUS][15];
#define set_cpu_to_apicid(cpu, apicid)
#endif
-DEFINE_PER_CPU(cpumask_t, cpu_sibling_map);
-DEFINE_PER_CPU(cpumask_t, cpu_core_map);
+DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
+DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
void __init prefill_possible_map(void)
{
@@ -88,8 +87,8 @@ set_cpu_sibling_map(unsigned int cpu)
cpu_data(cpu).phys_proc_id = cpu;
cpu_data(cpu).cpu_core_id = 0;
- per_cpu(cpu_sibling_map, cpu) = cpumask_of_cpu(cpu);
- per_cpu(cpu_core_map, cpu) = cpumask_of_cpu(cpu);
+ cpumask_copy(cpu_sibling_mask(cpu), cpumask_of(cpu));
+ cpumask_copy(cpu_core_mask(cpu), cpumask_of(cpu));
cpu_data(cpu).booted_cores = 1;
}
@@ -100,8 +99,8 @@ remove_siblinginfo(unsigned int cpu)
cpu_data(cpu).phys_proc_id = BAD_APICID;
cpu_data(cpu).cpu_core_id = BAD_APICID;
- cpus_clear(per_cpu(cpu_sibling_map, cpu));
- cpus_clear(per_cpu(cpu_core_map, cpu));
+ cpumask_clear(cpu_sibling_mask(cpu));
+ cpumask_clear(cpu_core_mask(cpu));
cpu_data(cpu).booted_cores = 0;
}
@@ -224,7 +223,7 @@ static void __cpuinit cpu_initialize_con
smp_trap_init(ctxt.trap_ctxt);
ctxt.ldt_ents = 0;
- ctxt.gdt_frames[0] = virt_to_mfn(get_cpu_gdt_table(cpu));
+ ctxt.gdt_frames[0] = arbitrary_virt_to_mfn(get_cpu_gdt_table(cpu));
ctxt.gdt_ents = GDT_SIZE / 8;
ctxt.user_regs.cs = __KERNEL_CS;
@@ -242,12 +241,13 @@ static void __cpuinit cpu_initialize_con
ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
ctxt.user_regs.fs = __KERNEL_PERCPU;
+ ctxt.user_regs.gs = __KERNEL_STACK_CANARY;
#else /* __x86_64__ */
ctxt.syscall_callback_eip = (unsigned long)system_call;
ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));
- ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu));
+ ctxt.gs_base_kernel = per_cpu_offset(cpu);
#endif
if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt))
@@ -275,8 +275,10 @@ void __init smp_prepare_cpus(unsigned in
current_thread_info()->cpu = 0;
for_each_possible_cpu (cpu) {
- cpus_clear(per_cpu(cpu_sibling_map, cpu));
- cpus_clear(per_cpu(cpu_core_map, cpu));
+ alloc_cpumask_var(&per_cpu(cpu_sibling_map, cpu), GFP_KERNEL);
+ alloc_cpumask_var(&per_cpu(cpu_core_map, cpu), GFP_KERNEL);
+ cpumask_clear(cpu_sibling_mask(cpu));
+ cpumask_clear(cpu_core_mask(cpu));
}
set_cpu_sibling_map(0);
@@ -303,9 +305,6 @@ void __init smp_prepare_cpus(unsigned in
if (IS_ERR(idle))
panic("failed fork for CPU %d", cpu);
-#ifdef __i386__
- init_gdt(cpu);
-#endif
gdt_addr = get_cpu_gdt_table(cpu);
make_page_readonly(gdt_addr, XENFEAT_writable_descriptor_tables);
@@ -319,12 +318,12 @@ void __init smp_prepare_cpus(unsigned in
set_cpu_to_apicid(cpu, apicid);
#ifdef __x86_64__
- cpu_pda(cpu)->pcurrent = idle;
- cpu_pda(cpu)->cpunumber = cpu;
clear_tsk_thread_flag(idle, TIF_FORK);
-#else
- per_cpu(current_task, cpu) = idle;
+ per_cpu(kernel_stack, cpu) =
+ (unsigned long)task_stack_page(idle) -
+ KERNEL_STACK_OFFSET + THREAD_SIZE;
#endif
+ per_cpu(current_task, cpu) = idle;
irq_ctx_init(cpu);
@@ -348,10 +347,7 @@ void __init smp_prepare_cpus(unsigned in
void __init smp_prepare_boot_cpu(void)
{
-#ifdef __i386__
- init_gdt(smp_processor_id());
-#endif
- switch_to_new_gdt();
+ switch_to_new_gdt(smp_processor_id());
prefill_possible_map();
}
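
cpu_sibling_map and cpu_core_map become cpumask_var_t, so they need allocation
before use and the cpumask_* accessors instead of the cpus_* ones. The
lifecycle, sketched with explicit error handling (the boot path above relies
on the GFP_KERNEL allocations succeeding):

	/* Sketch of the cpumask_var_t lifecycle the converted code follows. */
	#include <linux/cpumask.h>
	#include <linux/gfp.h>

	static int example_mask_use(unsigned int cpu)
	{
		cpumask_var_t mask;

		if (!alloc_cpumask_var(&mask, GFP_KERNEL))
			return -ENOMEM;	/* real storage only with CPUMASK_OFFSTACK */
		cpumask_copy(mask, cpumask_of(cpu));	/* was: mask = cpumask_of_cpu(cpu) */
		cpumask_clear(mask);			/* was: cpus_clear(mask) */
		free_cpumask_var(mask);
		return 0;
	}
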
--- head-2010-05-25.orig/drivers/xen/core/spinlock.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/drivers/xen/core/spinlock.c 2010-03-24 15:25:06.000000000 +0100
@@ -78,13 +78,13 @@ static unsigned int spin_adjust(struct s
unsigned int xen_spin_adjust(const raw_spinlock_t *lock, unsigned int token)
{
- return spin_adjust(x86_read_percpu(spinning), lock, token);
+ return spin_adjust(percpu_read(spinning), lock, token);
}
bool xen_spin_wait(raw_spinlock_t *lock, unsigned int *ptok,
unsigned int flags)
{
- int irq = x86_read_percpu(spinlock_irq);
+ int irq = percpu_read(spinlock_irq);
bool rc;
typeof(vcpu_info(0)->evtchn_upcall_mask) upcall_mask;
raw_rwlock_t *rm_lock;
@@ -97,9 +97,9 @@ bool xen_spin_wait(raw_spinlock_t *lock,
/* announce we're spinning */
spinning.ticket = *ptok >> TICKET_SHIFT;
spinning.lock = lock;
- spinning.prev = x86_read_percpu(spinning);
+ spinning.prev = percpu_read(spinning);
smp_wmb();
- x86_write_percpu(spinning, &spinning);
+ percpu_write(spinning, &spinning);
upcall_mask = current_vcpu_info()->evtchn_upcall_mask;
do {
@@ -184,7 +184,7 @@ bool xen_spin_wait(raw_spinlock_t *lock,
/* announce we're done */
other = spinning.prev;
- x86_write_percpu(spinning, other);
+ percpu_write(spinning, other);
rm_lock = &__get_cpu_var(spinning_rm_lock);
raw_local_irq_disable();
__raw_write_lock(rm_lock);
--- head-2010-05-25.orig/drivers/xen/netback/interface.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/drivers/xen/netback/interface.c 2010-03-24 15:25:06.000000000 +0100
@@ -121,7 +121,7 @@ static void netbk_get_drvinfo(struct net
struct ethtool_drvinfo *info)
{
strcpy(info->driver, "netbk");
- strcpy(info->bus_info, dev->dev.parent->bus_id);
+ strcpy(info->bus_info, dev_name(dev->dev.parent));
}
static const struct netif_stat {
--- head-2010-05-25.orig/drivers/xen/netback/netback.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/drivers/xen/netback/netback.c 2010-03-24 15:25:06.000000000 +0100
@@ -333,7 +333,7 @@ int netif_be_start_xmit(struct sk_buff *
*/
netif->tx_queue_timeout.data = (unsigned long)netif;
netif->tx_queue_timeout.function = tx_queue_callback;
- __mod_timer(&netif->tx_queue_timeout, jiffies + HZ/2);
+ mod_timer(&netif->tx_queue_timeout, jiffies + HZ/2);
}
}
@@ -354,7 +354,7 @@ static void xen_network_done_notify(void
static struct net_device *eth0_dev = NULL;
if (unlikely(eth0_dev == NULL))
eth0_dev = __dev_get_by_name(&init_net, "eth0");
- netif_rx_schedule(???);
+ napi_schedule(???);
}
/*
* Add following to poll() function in NAPI driver (Tigon3 is example):
@@ -1308,8 +1308,7 @@ static void net_tx_action(unsigned long
(unsigned long)netif;
netif->credit_timeout.function =
tx_credit_callback;
- __mod_timer(&netif->credit_timeout,
- next_credit);
+ mod_timer(&netif->credit_timeout, next_credit);
netif_put(netif);
continue;
}
--- head-2010-05-25.orig/drivers/xen/netfront/netfront.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/drivers/xen/netfront/netfront.c 2010-03-24 15:25:06.000000000 +0100
@@ -102,7 +102,7 @@ static const int MODPARM_rx_flip = 0;
static inline void dev_disable_gso_features(struct net_device *dev)
{
/* Turn off all GSO bits except ROBUST. */
- dev->features &= (1 << NETIF_F_GSO_SHIFT) - 1;
+ dev->features &= ~NETIF_F_GSO_MASK;
dev->features |= NETIF_F_GSO_ROBUST;
}
#elif defined(NETIF_F_TSO)
@@ -635,7 +635,7 @@ static int network_open(struct net_devic
if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx)){
netfront_accelerator_call_stop_napi_irq(np, dev);
- netif_rx_schedule(&np->napi);
+ napi_schedule(&np->napi);
}
}
spin_unlock_bh(&np->rx_lock);
@@ -707,7 +707,7 @@ static void rx_refill_timeout(unsigned l
netfront_accelerator_call_stop_napi_irq(np, dev);
- netif_rx_schedule(&np->napi);
+ napi_schedule(&np->napi);
}
static void network_alloc_rx_buffers(struct net_device *dev)
@@ -1064,7 +1064,7 @@ static irqreturn_t netif_int(int irq, vo
if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx)) {
netfront_accelerator_call_stop_napi_irq(np, dev);
- netif_rx_schedule(&np->napi);
+ napi_schedule(&np->napi);
}
}
@@ -1521,7 +1521,7 @@ err:
}
if (!more_to_do && !accel_more_to_do)
- __netif_rx_complete(napi);
+ __napi_complete(napi);
local_irq_restore(flags);
}
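
netif_rx_schedule()/__netif_rx_complete() are renamed to
napi_schedule()/__napi_complete() in 2.6.30; the arguments and the
schedule-then-poll contract are unchanged. For reference, a poll handler under
the renamed API (a sketch; the rx helpers named here are assumptions):

	/* Hypothetical sketch: NAPI poll loop with the 2.6.30 names. */
	static int mydrv_poll(struct napi_struct *napi, int budget)
	{
		int done = mydrv_process_rx(napi, budget);	/* assumed */

		if (done < budget) {
			napi_complete(napi);
			mydrv_enable_rx_irq(napi);		/* assumed */
		}
		return done;
	}

	/* interrupt handler side: if (work_pending) napi_schedule(&np->napi); */
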
--- head-2010-05-25.orig/drivers/xen/sfc_netfront/accel_msg.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/drivers/xen/sfc_netfront/accel_msg.c 2010-03-24 15:25:06.000000000 +0100
@@ -47,7 +47,7 @@ static void vnic_start_interrupts(netfro
netfront_accel_disable_net_interrupts(vnic);
vnic->irq_enabled = 0;
NETFRONT_ACCEL_STATS_OP(vnic->stats.poll_schedule_count++);
- netif_rx_schedule(&np->napi);
+ napi_schedule(&np->napi);
} else {
/*
* Nothing yet, make sure we get interrupts through
@@ -532,7 +532,7 @@ irqreturn_t netfront_accel_net_channel_i
vnic->stats.event_count_since_irq;
vnic->stats.event_count_since_irq = 0;
#endif
- netif_rx_schedule(&np->napi);
+ napi_schedule(&np->napi);
}
else {
spin_unlock_irqrestore(&vnic->irq_enabled_lock, flags);
--- head-2010-05-25.orig/drivers/xen/usbback/usbstub.c 2010-03-24 15:06:12.000000000 +0100
+++ head-2010-05-25/drivers/xen/usbback/usbstub.c 2010-03-24 15:25:06.000000000 +0100
@@ -188,7 +188,7 @@ static int usbstub_probe(struct usb_inte
const struct usb_device_id *id)
{
struct usb_device *udev = interface_to_usbdev(intf);
- char *busid = intf->dev.parent->bus_id;
+ const char *busid = dev_name(intf->dev.parent);
struct vusb_port_id *portid = NULL;
struct usbstub *stub = NULL;
usbif_t *usbif = NULL;
--- head-2010-05-25.orig/drivers/xen/usbfront/usbfront-dbg.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/drivers/xen/usbfront/usbfront-dbg.c 2010-03-24 15:25:06.000000000 +0100
@@ -64,7 +64,7 @@ static ssize_t show_statistics(struct de
"%s\n"
"xenhcd, hcd state %d\n",
hcd->self.controller->bus->name,
- hcd->self.controller->bus_id,
+ dev_name(hcd->self.controller),
hcd->product_desc,
hcd->state);
size -= temp;
--- head-2010-05-25.orig/drivers/xen/usbfront/xenbus.c 2010-04-15 09:53:49.000000000 +0200
+++ head-2010-05-25/drivers/xen/usbfront/xenbus.c 2010-03-24 15:25:06.000000000 +0100
@@ -252,10 +252,10 @@ static struct usb_hcd *create_hcd(struct
}
switch (usb_ver) {
case USB_VER_USB11:
- hcd = usb_create_hcd(&xen_usb11_hc_driver, &dev->dev, dev->dev.bus_id);
+ hcd = usb_create_hcd(&xen_usb11_hc_driver, &dev->dev, dev_name(&dev->dev));
break;
case USB_VER_USB20:
- hcd = usb_create_hcd(&xen_usb20_hc_driver, &dev->dev, dev->dev.bus_id);
+ hcd = usb_create_hcd(&xen_usb20_hc_driver, &dev->dev, dev_name(&dev->dev));
break;
default:
xenbus_dev_fatal(dev, err, "invalid usb-ver");
--- head-2010-05-25.orig/drivers/xen/xenbus/xenbus_probe.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/drivers/xen/xenbus/xenbus_probe.c 2010-03-24 15:25:06.000000000 +0100
@@ -230,7 +230,7 @@ static struct xen_bus_type xenbus_fronte
},
#if defined(CONFIG_XEN) || defined(MODULE)
.dev = {
- .bus_id = "xen",
+ .init_name = "xen",
},
#endif
};
--- head-2010-05-25.orig/drivers/xen/xenbus/xenbus_probe_backend.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/drivers/xen/xenbus/xenbus_probe_backend.c 2010-03-24 15:25:06.000000000 +0100
@@ -129,7 +129,7 @@ static struct xen_bus_type xenbus_backen
.dev_attrs = xenbus_backend_attrs,
},
.dev = {
- .bus_id = "xen-backend",
+ .init_name = "xen-backend",
},
};
--- head-2010-05-25.orig/include/linux/interrupt.h 2010-03-24 14:53:41.000000000 +0100
+++ head-2010-05-25/include/linux/interrupt.h 2010-03-24 15:25:06.000000000 +0100
@@ -52,6 +52,7 @@
* IRQF_ONESHOT - Interrupt is not reenabled after the hardirq handler finished.
* Used by threaded interrupts which need to keep the
* irq line disabled until the threaded handler has been run.
+ * IRQF_NO_SUSPEND - Prevent this interrupt from being disabled during suspend.
*/
#define IRQF_DISABLED 0x00000020
#define IRQF_SAMPLE_RANDOM 0x00000040
@@ -62,6 +63,7 @@
#define IRQF_NOBALANCING 0x00000800
#define IRQF_IRQPOLL 0x00001000
#define IRQF_ONESHOT 0x00002000
+#define IRQF_NO_SUSPEND 0x00008000
/*
* Bits used by threaded handlers:
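
IRQF_NO_SUSPEND feeds into __disable_irq() below: during the suspend pass an
action carrying it is skipped, exactly as IRQF_TIMER already was, which is
what the bind_ipi_to_irqhandler() change above relies on. Usage is just
another request_irq() flag; a minimal sketch (handler and cookie names
assumed):

	/* Hypothetical sketch: keep this IRQ enabled across suspend. */
	static int bind_wakeup_irq(unsigned int irq, void *dev_id)
	{
		return request_irq(irq, my_handler, IRQF_NO_SUSPEND,
				   "my-evtchn", dev_id);
	}
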
--- head-2010-05-25.orig/kernel/irq/manage.c 2010-05-25 09:12:09.000000000 +0200
+++ head-2010-05-25/kernel/irq/manage.c 2010-03-24 15:25:06.000000000 +0100
@@ -200,7 +200,8 @@ static inline int setup_affinity(unsigne
void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
{
if (suspend) {
- if (!desc->action || (desc->action->flags & IRQF_TIMER))
+ if (!desc->action ||
+ (desc->action->flags & (IRQF_TIMER | IRQF_NO_SUSPEND)))
return;
desc->status |= IRQ_SUSPENDED;
}
--- head-2010-05-25.orig/lib/swiotlb-xen.c 2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-25/lib/swiotlb-xen.c 2010-03-24 15:25:06.000000000 +0100
@@ -175,7 +175,7 @@ static void *swiotlb_bus_to_virt(dma_add
return phys_to_virt(swiotlb_bus_to_phys(address));
}
-int __weak swiotlb_arch_range_needs_mapping(void *ptr, size_t size)
+int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size)
{
return 0;
}
@@ -523,13 +523,13 @@ swiotlb_full(struct device *dev, size_t
* Once the device is given the dma address, the device owns this memory until
* either swiotlb_unmap_single or swiotlb_dma_sync_single is performed.
*/
-static dma_addr_t
-_swiotlb_map_single(struct device *hwdev, phys_addr_t paddr, size_t size,
- int dir, struct dma_attrs *attrs)
-{
- struct page *page = pfn_to_page(paddr >> PAGE_SHIFT);
- dma_addr_t dev_addr = gnttab_dma_map_page(page) +
- offset_in_page(paddr);
+dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size,
+ enum dma_data_direction dir,
+ struct dma_attrs *attrs)
+{
+ phys_addr_t phys = page_to_pseudophys(page) + offset;
+ dma_addr_t dev_addr = gnttab_dma_map_page(page) + offset;
void *map;
BUG_ON(dir == DMA_NONE);
@@ -539,44 +539,24 @@ _swiotlb_map_single(struct device *hwdev
* we can safely return the device addr and not worry about bounce
* buffering it.
*/
- if (!address_needs_mapping(hwdev, dev_addr, size) &&
- !range_needs_mapping(paddr, size))
+ if (!address_needs_mapping(dev, dev_addr, size) &&
+ !range_needs_mapping(phys, size))
return dev_addr;
/*
* Oh well, have to allocate and map a bounce buffer.
*/
gnttab_dma_unmap_page(dev_addr);
- map = map_single(hwdev, paddr, size, dir);
+ map = map_single(dev, phys, size, dir);
if (!map) {
- swiotlb_full(hwdev, size, dir, 1);
+ swiotlb_full(dev, size, dir, 1);
map = io_tlb_overflow_buffer;
}
- dev_addr = swiotlb_virt_to_bus(hwdev, map);
+ dev_addr = swiotlb_virt_to_bus(dev, map);
return dev_addr;
}
-
-dma_addr_t
-swiotlb_map_single_attrs(struct device *hwdev, void *ptr, size_t size,
- int dir, struct dma_attrs *attrs)
-{
- return _swiotlb_map_single(hwdev, virt_to_phys(ptr), size, dir, attrs);
-}
-EXPORT_SYMBOL(swiotlb_map_single_attrs);
-
-dma_addr_t
-swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir)
-{
- return _swiotlb_map_single(hwdev, virt_to_phys(ptr), size, dir, NULL);
-}
-EXPORT_SYMBOL(swiotlb_map_single);
-
-dma_addr_t
-swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size, int dir)
-{
- return _swiotlb_map_single(hwdev, paddr, size, dir, NULL);
-}
+EXPORT_SYMBOL_GPL(swiotlb_map_page);
/*
* Unmap a single streaming mode DMA translation. The dma_addr and size must
@@ -586,9 +566,9 @@ swiotlb_map_single_phys(struct device *h
* After this call, reads by the cpu to the buffer are guaranteed to see
* whatever the device wrote there.
*/
-void
-swiotlb_unmap_single_attrs(struct device *hwdev, dma_addr_t dev_addr,
- size_t size, int dir, struct dma_attrs *attrs)
+void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
+ size_t size, enum dma_data_direction dir,
+ struct dma_attrs *attrs)
{
char *dma_addr = swiotlb_bus_to_virt(dev_addr);
@@ -598,15 +578,7 @@ swiotlb_unmap_single_attrs(struct device
else
gnttab_dma_unmap_page(dev_addr);
}
-EXPORT_SYMBOL(swiotlb_unmap_single_attrs);
-
-void
-swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size,
- int dir)
-{
- return swiotlb_unmap_single_attrs(hwdev, dev_addr, size, dir, NULL);
-}
-EXPORT_SYMBOL(swiotlb_unmap_single);
+EXPORT_SYMBOL_GPL(swiotlb_unmap_page);
/*
* Make physical memory consistent for a single streaming mode DMA translation
@@ -620,7 +592,7 @@ EXPORT_SYMBOL(swiotlb_unmap_single);
*/
void
swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
- size_t size, int dir)
+ size_t size, enum dma_data_direction dir)
{
char *dma_addr = swiotlb_bus_to_virt(dev_addr);
@@ -632,7 +604,7 @@ EXPORT_SYMBOL(swiotlb_sync_single_for_cp
void
swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
- size_t size, int dir)
+ size_t size, enum dma_data_direction dir)
{
char *dma_addr = swiotlb_bus_to_virt(dev_addr);
@@ -644,7 +616,8 @@ EXPORT_SYMBOL(swiotlb_sync_single_for_de
void
swiotlb_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
- unsigned long offset, size_t size, int dir)
+ unsigned long offset, size_t size,
+ enum dma_data_direction dir)
{
char *dma_addr = swiotlb_bus_to_virt(dev_addr);
@@ -656,7 +629,8 @@ EXPORT_SYMBOL_GPL(swiotlb_sync_single_ra
void
swiotlb_sync_single_range_for_device(struct device *hwdev, dma_addr_t dev_addr,
- unsigned long offset, size_t size, int dir)
+ unsigned long offset, size_t size,
+ enum dma_data_direction dir)
{
char *dma_addr = swiotlb_bus_to_virt(dev_addr);
@@ -684,7 +658,7 @@ EXPORT_SYMBOL_GPL(swiotlb_sync_single_ra
*/
int
swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
- int dir, struct dma_attrs *attrs)
+ enum dma_data_direction dir, struct dma_attrs *attrs)
{
struct scatterlist *sg;
int i;
@@ -736,7 +710,7 @@ EXPORT_SYMBOL(swiotlb_map_sg);
*/
void
swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
- int nelems, int dir, struct dma_attrs *attrs)
+ int nelems, enum dma_data_direction dir, struct dma_attrs *attrs)
{
struct scatterlist *sg;
int i;
@@ -770,7 +744,7 @@ EXPORT_SYMBOL(swiotlb_unmap_sg);
*/
void
swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sgl,
- int nelems, int dir)
+ int nelems, enum dma_data_direction dir)
{
struct scatterlist *sg;
int i;
@@ -787,7 +761,7 @@ EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu);
void
swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sgl,
- int nelems, int dir)
+ int nelems, enum dma_data_direction dir)
{
struct scatterlist *sg;
int i;
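
The map_single/unmap_single entry points collapse into
swiotlb_map_page()/swiotlb_unmap_page() taking enum dma_data_direction,
matching the generic struct dma_map_ops interface this kernel release
standardizes on. A sketch of how an architecture might wire the converted
functions up (table name assumed, field list abbreviated):

	/* Hypothetical sketch: plugging the new entry points into dma_map_ops. */
	static struct dma_map_ops xen_swiotlb_dma_ops = {
		.map_page		= swiotlb_map_page,
		.unmap_page		= swiotlb_unmap_page,
		.map_sg			= swiotlb_map_sg_attrs,
		.unmap_sg		= swiotlb_unmap_sg_attrs,
		.sync_single_for_cpu	= swiotlb_sync_single_for_cpu,
		.sync_single_for_device	= swiotlb_sync_single_for_device,
		.sync_sg_for_cpu	= swiotlb_sync_sg_for_cpu,
		.sync_sg_for_device	= swiotlb_sync_sg_for_device,
	};
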
--- head-2010-05-25.orig/mm/page_alloc.c 2010-03-24 15:06:12.000000000 +0100
+++ head-2010-05-25/mm/page_alloc.c 2010-03-24 15:25:06.000000000 +0100
@@ -4685,11 +4685,9 @@ static void __setup_per_zone_wmarks(void
}
#ifdef CONFIG_XEN
- for_each_zone(zone) {
+ for_each_populated_zone(zone) {
unsigned int cpu;
- if (!populated_zone(zone))
- continue;
for_each_online_cpu(cpu) {
unsigned long high;